zhaoqingang
2025-01-02 992d91359f4e4437ddba9843173254441c896918
报告生成文档清洗
1个文件已添加
5个文件已修改
108 ■■■■■ 已修改文件
app/api/chat.py 46 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
app/api/files.py 7 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
app/config/config.py 1 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
app/config/config.yaml 1 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
app/service/files.py 53 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
requirements.txt 补丁 | 查看 | 原始文档 | blame | 历史
app/api/chat.py
@@ -436,6 +436,10 @@
                        title_number = receive_message.get('title_number', 8)
                        title_style = receive_message.get('title_style', "")
                        title_query = receive_message.get('title_query', "")
                        is_clean = receive_message.get('is_clean', 0)
                        file_type = receive_message.get('file_type', 1)
                        max_token = receive_message.get('max_tokens', 100000)
                        tokens = receive_message.get('tokens', 0)
                        if upload_files:
                            title_query = "start"
                        # if not upload_files:
@@ -456,15 +460,27 @@
                        }
                        files = []
                        for file in upload_files:
                            if file_type == 1:
                            files.append({
                                "type": "document",
                                "transfer_method": "local_file",
                                "url": "",
                                "upload_file_id": file
                            })
                            else:
                                files.append({
                                    "type": "document",
                                    "transfer_method": "remote_url",
                                    "url": file,
                                    "upload_file_id": ""
                                })
                        inputs_list = []
                        token_list = []
                        if workflow_type == 1:
                            inputs["input_files"] = files
                        elif workflow_type == 2:
                            inputs_list.append(inputs)
                            token_list.append(token)
                        elif workflow_type == 2 and is_clean == 0:
                            inputs["file_list"] = files
                            inputs["Completion_of_main_indicators"] = title
                            inputs["sub_titles"] = sub_titles
@@ -472,6 +488,8 @@
                            if not token:
                                await websocket.send_json(
                                    {"message": "Invalid token document_to_report", "type": "error"})
                            inputs_list.append(inputs)
                            token_list.append(token)
                        elif workflow_type == 3:
                            inputs["file_list"] = files
                            inputs["number_of_title"] = title_number
@@ -480,9 +498,27 @@
                            if not token:
                                await websocket.send_json(
                                    {"message": "Invalid token document_to_title", "type": "error"})
                            # inputs_list.append(inputs)
                            # token_list.append(token)
                        elif workflow_type == 2 and is_clean == 1:
                            # inputs["input_files"] = files
                            inputs_list.append(inputs)
                            token_list.append(token)
                            inputs1 = {}
                            # inputs1["file_list"] = files
                            inputs1["Completion_of_main_indicators"] = title
                            inputs1["sub_titles"] = sub_titles
                            token = DfTokenDao(db).get_token_by_id(DOCUMENT_TO_REPORT_TITLE)
                            if not token:
                                await websocket.send_json(
                                    {"message": "Invalid token document_to_report", "type": "error"})
                            inputs_list.append(inputs1)
                            token_list.append(token)
                        complete_response = ""
                        if workflow_type == 1 or workflow_type == 2:
                            for inputs in inputs_list:
                                inputs["input_files"] = files
                            async for rag_response in dify_service.workflow(token, current_user.id, inputs):
                                # print(rag_response)
                                try:
@@ -527,7 +563,13 @@
                                                    download_url = outputs.get("download_url", "")
                                                else:
                                                    message = answer.get("error", "")
                                                    if download_url:
                                                        files = [{
                                                            "type": "document",
                                                            "transfer_method": "remote_url",
                                                            "url": download_url,
                                                            "upload_file_id": ""
                                                        }]
                                                result = {"message": message, "type": "message", "download_url": download_url}
                                                try:
                                                    SessionService(db).update_session(chat_id,
app/api/files.py
@@ -18,6 +18,7 @@
from app.service.bisheng import BishengService
from app.service.common.api_token import DfTokenDao
from app.service.difyService import DifyService
from app.service.files import read_file
from app.service.ragflow import RagflowService
from app.service.service_token import get_ragflow_token, get_bisheng_token
import urllib.parse
@@ -124,7 +125,11 @@
                except Exception as e:
                    return Response(code=400, msg=str(e))
                try:
                    file_upload = await dify_service.upload(token, f.filename, file_content, current_user.id)
                    filename = f.filename
                    file_upload = await dify_service.upload(token, filename, file_content, current_user.id)
                    print(file_upload)
                    tokens = await read_file(file_content, filename, f.content_type)
                    file_upload["tokens"] = tokens
                    result.append(file_upload)
                except Exception as e:
                    raise HTTPException(status_code=500, detail=str(e))
app/config/config.py
@@ -23,6 +23,7 @@
    dify_workflow_clean: str = ''
    dify_workflow_report: str = ''
    postgresql_database_url: str = ''
    max_report_tokens: int = 100000
    def __init__(self, **kwargs):
        # 替换配置中的IP地址
        host_ip = os.getenv('HOST_IP', '127.0.0.1')
app/config/config.yaml
@@ -21,3 +21,4 @@
postgresql_database_url: postgresql+asyncpg://kong:kongpass@192.168.20.119:5432/kong
dify_workflow_clean: app-OpF0drPu0XcgqcekQpT4FA8a
dify_workflow_report: app-0MAkdFWqh9zxwmU69O0BFU1s
max_report_tokens: 100000
app/service/files.py
New file
@@ -0,0 +1,53 @@
import fitz
import io
from docx import Document
from dashscope import get_tokenizer  # dashscope版本 >= 1.14.0
from app.service.auth import decode_access_token
async def get_str_token(input_str):
    """Return the number of tokens in ``input_str`` under the Qwen tokenizer.

    Args:
        input_str: Text to tokenize.

    Returns:
        The token count as an int; 0 when ``input_str`` is empty or ``None``.
    """
    # Nothing to count: return before loading the tokenizer. read_file hands
    # over an empty string when the file type is not one it can extract.
    if not input_str:
        return 0
    # Only the Tongyi Qianwen (Qwen) model family is supported by get_tokenizer
    # at the moment.
    tokenizer = get_tokenizer('qwen-turbo')
    # encode() splits the string into tokens and returns their token ids.
    return len(tokenizer.encode(input_str))
async def read_pdf(pdf_stream):
    """Extract the text of every page of an in-memory PDF.

    Args:
        pdf_stream: PDF content passed to ``fitz.open`` as its ``stream``
            argument.

    Returns:
        The text of all pages joined in page order with no added separator.
    """
    with fitz.open(stream=pdf_stream, filetype="pdf") as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)
async def read_word(word_stream):
    """Extract the paragraph text of an in-memory Word (.docx) document.

    Args:
        word_stream: Bytes-like content of the document.

    Returns:
        The text of all paragraphs joined in document order with no added
        separator.
    """
    # python-docx opens a file-like object, so wrap the content in BytesIO.
    document = Document(io.BytesIO(word_stream))
    # Collect the text of each paragraph.
    return "".join(paragraph.text for paragraph in document.paragraphs)
async def read_file(file, filename, content_type):
    """Count the tokens in the text of an uploaded PDF or Word document.

    The file kind is chosen from ``content_type`` or, failing that, from the
    extension of ``filename``. Any other kind of file contributes no text, so
    its token count is whatever ``get_str_token`` returns for an empty string.

    Args:
        file: Content of the uploaded file, forwarded to ``read_pdf`` or
            ``read_word``.
        filename: Name of the uploaded file; may be ``None`` or empty.
        content_type: MIME type reported for the upload; may be ``None``.

    Returns:
        The token count produced by ``get_str_token`` for the extracted text.
    """
    # Upload metadata can be missing, and extensions arrive in any case
    # (".PDF"), so compare on lower-cased, None-safe values.
    name = (filename or "").lower()
    # Drop MIME parameters such as "; charset=..." before comparing.
    mime = (content_type or "").split(";", 1)[0].strip().lower()

    text = ""
    if mime == "application/pdf" or name.endswith(".pdf"):
        # Extract the PDF content.
        text = await read_pdf(file)
    elif (
        mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        or name.endswith(".docx")
    ):
        text = await read_word(file)
    return await get_str_token(text)
requirements.txt
Binary files differ