zhaoqingang
2025-02-18 c966f133bea46f5d1b804296b270955478fa0ac4
报表合并问题处理
4个文件已修改
1个文件已添加
88 ■■■■ 已修改文件
app/api/agent.py 2 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
app/api/excel.py 24 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
app/api/files.py 7 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
app/service/files.py 55 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
requirements.txt 补丁 | 查看 | 原始文档 | blame | 历史
app/api/agent.py
@@ -192,6 +192,8 @@
            for i in session.log_to_json().get("message", []):
                if i.get("role") == "user":
                    tmp_data["question"]=i.get("content")
                    if "upload_filenames" in i:
                        tmp_data["upload_filenames"] = i.get("upload_filenames")
                elif i.get("role") == "assistant":
                    if isinstance(i.get("content"), dict):
app/api/excel.py
@@ -1,10 +1,10 @@
import random
import string
from fastapi import APIRouter, File, UploadFile, Form, BackgroundTasks, Depends, Request
from fastapi import APIRouter, File, UploadFile, Form, BackgroundTasks, Depends, Request, WebSocket
from fastapi.responses import JSONResponse, FileResponse
from sqlalchemy.orm import Session
from starlette.websockets import WebSocket
# from starlette.websockets import WebSocket
from app.api import get_current_user, get_current_user_websocket, Response
from app.models import UserModel, AgentType
@@ -52,14 +52,15 @@
    return prefix + random_part
def db_create_session(db: Session, user_id: str):
def db_create_session(db: Session, user_id: str, message:str, upload_filenames: list):
    db_id = generate_db_id()
    session = SessionService(db).create_session(
        db_id,
        "合并Excel",
        message,
        "basic_excel_merge",
        AgentType.BASIC,
        int(user_id)
        int(user_id),
        {"role": "user", "content": message, "upload_filenames": upload_filenames}
    )
    return session
@@ -102,11 +103,12 @@
    user_excel = EXCEL_FILES_PATH
    create_dir_if_not_exists(user_source)
    create_dir_if_not_exists(user_excel)
    while True:
        data = await websocket.receive_text()
        # data = await websocket.receive_text()
        receive_message = await websocket.receive_json()
        try:
            if data == "\"合并Excel\"":
            if receive_message.get("message") == "合并Excel":
                upload_filenames = receive_message.get('upload_filenames', [])
                merge_file = run_conformity(user_source, user_excel)
                if merge_file is not None:
@@ -124,7 +126,7 @@
                        "type": "close",
                    })
                    # 创建会话记录
                    session = db_create_session(db, user_id)
                    session = db_create_session(db, user_id, receive_message.get("message"), upload_filenames)
                    # 更新会话记录
                    if session:
                        session_id = session.id
@@ -143,8 +145,8 @@
                    await websocket.send_json({"error": "合并失败", "type": "stream", "files": []})
                    await websocket.close()
            else:
                print(f"Received data: {data}")
                await websocket.send_json({"error": "未知指令", "data": str(data)})
                print(f"Received data: {receive_message.get('message')}")
                await websocket.send_json({"error": "未知指令", "data": str(receive_message.get('message'))})
                await websocket.close()
        except Exception as e:
            await websocket.send_json({"error": str(e)})
app/api/files.py
@@ -8,6 +8,7 @@
from starlette.responses import StreamingResponse
from werkzeug.utils import send_file
from Log import logger
from app.api import Response, get_current_user, ResponseList
from app.config.config import settings
from app.config.const import DOCUMENT_TO_REPORT, IMAGE_TO_TEXT, DOCUMENT_TO_REPORT_TITLE, DOCUMENT_IA_QUESTIONS, \
@@ -18,6 +19,7 @@
from app.models.user_model import UserModel
from app.service.basic import BasicService
from app.service.bisheng import BishengService
from app.service.files import read_file
from app.service.v2.api_token import DfTokenDao
from app.service.difyService import DifyService
from app.service.ragflow import RagflowService
@@ -134,6 +136,11 @@
                    return Response(code=400, msg=str(e))
                try:
                    file_upload = await dify_service.upload(token, f.filename, file_content, current_user.id)
                    try:
                        tokens = await read_file(file_content, f.filename, f.content_type)
                        file_upload["tokens"] = tokens
                    except Exception as e:
                        logger.error(e)
                    result.append(file_upload)
                except Exception as e:
                    raise HTTPException(status_code=500, detail=str(e))
app/service/files.py
New file
@@ -0,0 +1,55 @@
import fitz
import io
from docx import Document
from dashscope import get_tokenizer  # dashscope版本 >= 1.14.0
from app.service.auth import decode_access_token
async def get_str_token(input_str):
    """Count the tokens that ``input_str`` is split into.

    The text is encoded with the dashscope tokenizer obtained for the
    ``qwen-turbo`` model. (Per the original author's note, the tokenizer
    currently only supports the Tongyi Qianwen model family.)

    Args:
        input_str: Text to tokenize.

    Returns:
        The number of token ids produced by ``tokenizer.encode``.
    """
    # Split the string into tokens (as token ids) and report how many
    # there are; the ids themselves are not needed by callers.
    token_ids = get_tokenizer('qwen-turbo').encode(input_str)
    return len(token_ids)
async def read_pdf(pdf_stream):
    """Extract the text of every page of an in-memory PDF.

    Args:
        pdf_stream: PDF content handed to ``fitz.open`` as its ``stream``
            argument (presumably the raw file bytes -- confirm with caller).

    Returns:
        The text of all pages concatenated in page order; an empty string
        when the document yields no pages.
    """
    with fitz.open(stream=pdf_stream, filetype="pdf") as pdf_document:
        # Collect per-page text first, then join once.
        page_texts = [page.get_text() for page in pdf_document]
    return "".join(page_texts)
async def read_word(word_stream):
    """Extract the paragraph text of an in-memory Word (.docx) document.

    Only ``doc.paragraphs`` is read.

    Args:
        word_stream: Raw bytes of the .docx file; wrapped in ``io.BytesIO``
            so python-docx can open it as a file-like object.

    Returns:
        The text of all paragraphs, one per line. An empty string when the
        document has no paragraphs.
    """
    # Open the Word file from the in-memory byte stream with python-docx.
    doc = Document(io.BytesIO(word_stream))
    # Join paragraphs with a newline: concatenating them back-to-back would
    # fuse the last word of one paragraph with the first word of the next,
    # which distorts both the extracted text and any token count based on it.
    return "\n".join(para.text for para in doc.paragraphs)
async def read_file(file, filename, content_type):
    """Return the token count of an uploaded PDF or Word document.

    The file type is chosen from ``content_type`` first and, failing that,
    from the file name's extension. Any other file type is treated as
    having no text, so the token count of an empty string is returned.

    Args:
        file: File content, passed unchanged to ``read_pdf`` / ``read_word``.
        filename: Name of the uploaded file; may be ``None`` or empty.
        content_type: MIME type reported for the upload.

    Returns:
        The value returned by ``get_str_token`` for the extracted text.
    """
    docx_mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    # Normalise the name once: a missing name must not raise, and extensions
    # such as ".PDF" / ".Docx" must be recognised like their lowercase forms.
    lowered_name = (filename or "").lower()

    text = ""
    if content_type == "application/pdf" or lowered_name.endswith('.pdf'):
        # Extract the PDF content.
        text = await read_pdf(file)
    elif content_type == docx_mime or lowered_name.endswith('.docx'):
        text = await read_word(file)
    return await get_str_token(text)
requirements.txt
Binary files differ