zhaoqingang
2025-01-02 992d91359f4e4437ddba9843173254441c896918
报告生成文档清洗
5个文件已修改
1个文件已添加
264 ■■■■■ 已修改文件
app/api/chat.py 200 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
app/api/files.py 7 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
app/config/config.py 1 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
app/config/config.yaml 3 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
app/service/files.py 53 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
requirements.txt 补丁 | 查看 | 原始文档 | blame | 历史
app/api/chat.py
@@ -436,6 +436,10 @@
                        title_number = receive_message.get('title_number', 8)
                        title_style = receive_message.get('title_style', "")
                        title_query = receive_message.get('title_query', "")
                        is_clean = receive_message.get('is_clean', 0)
                        file_type = receive_message.get('file_type', 1)
                        max_token = receive_message.get('max_tokens', 100000)
                        tokens = receive_message.get('tokens', 0)
                        if upload_files:
                            title_query = "start"
                        # if not upload_files:
@@ -456,15 +460,27 @@
                        }
                        files = []
                        for file in upload_files:
                            files.append({
                                "type": "document",
                                "transfer_method": "local_file",
                                "url": "",
                                "upload_file_id": file
                            })
                            if file_type == 1:
                                files.append({
                                    "type": "document",
                                    "transfer_method": "local_file",
                                    "url": "",
                                    "upload_file_id": file
                                })
                            else:
                                files.append({
                                    "type": "document",
                                    "transfer_method": "remote_url",
                                    "url": file,
                                    "upload_file_id": ""
                                })
                        inputs_list = []
                        token_list = []
                        if workflow_type == 1:
                            inputs["input_files"] = files
                        elif workflow_type == 2:
                            inputs_list.append(inputs)
                            token_list.append(token)
                        elif workflow_type == 2 and is_clean == 0:
                            inputs["file_list"] = files
                            inputs["Completion_of_main_indicators"] = title
                            inputs["sub_titles"] = sub_titles
@@ -472,6 +488,8 @@
                            if not token:
                                await websocket.send_json(
                                    {"message": "Invalid token document_to_report", "type": "error"})
                            inputs_list.append(inputs)
                            token_list.append(token)
                        elif workflow_type == 3:
                            inputs["file_list"] = files
                            inputs["number_of_title"] = title_number
@@ -480,89 +498,113 @@
                            if not token:
                                await websocket.send_json(
                                    {"message": "Invalid token document_to_title", "type": "error"})
                            # inputs_list.append(inputs)
                            # token_list.append(token)
                        elif workflow_type == 2 and is_clean == 1:
                            # inputs["input_files"] = files
                            inputs_list.append(inputs)
                            token_list.append(token)
                            inputs1 = {}
                            # inputs1["file_list"] = files
                            inputs1["Completion_of_main_indicators"] = title
                            inputs1["sub_titles"] = sub_titles
                            token = DfTokenDao(db).get_token_by_id(DOCUMENT_TO_REPORT_TITLE)
                            if not token:
                                await websocket.send_json(
                                    {"message": "Invalid token document_to_report", "type": "error"})
                            inputs_list.append(inputs1)
                            token_list.append(token)
                        complete_response = ""
                        if workflow_type == 1 or workflow_type == 2:
                            async for rag_response in dify_service.workflow(token, current_user.id, inputs):
                                # print(rag_response)
                                try:
                                    if rag_response[:5] == "data:":
                                        # 如果是,则截取掉前5个字符,并去除首尾空白符
                                        complete_response = rag_response[5:].strip()
                                    elif "event: ping" in rag_response:
                                        continue
                                    else:
                                        # 否则,保持原样
                                        complete_response += rag_response
                            for inputs in inputs_list:
                                inputs["input_files"] = files
                                async for rag_response in dify_service.workflow(token, current_user.id, inputs):
                                    # print(rag_response)
                                    try:
                                        data = json.loads(complete_response)
                                        complete_response = ""
                                        if data.get("event") == "node_started" or data.get("event") == "node_finished":  # "event": "message_end"
                                            if "data" not in data or not data["data"]:  # 信息过滤
                                                logger.error("非法数据--------------------")
                                                logger.error(data)
                                                continue
                                            else:  # 正常输出
                                        if rag_response[:5] == "data:":
                                            # 如果是,则截取掉前5个字符,并去除首尾空白符
                                            complete_response = rag_response[5:].strip()
                                        elif "event: ping" in rag_response:
                                            continue
                                        else:
                                            # 否则,保持原样
                                            complete_response += rag_response
                                        try:
                                            data = json.loads(complete_response)
                                            complete_response = ""
                                            if data.get("event") == "node_started" or data.get("event") == "node_finished":  # "event": "message_end"
                                                if "data" not in data or not data["data"]:  # 信息过滤
                                                    logger.error("非法数据--------------------")
                                                    logger.error(data)
                                                    continue
                                                else:  # 正常输出
                                                    answer = data.get("data", "")
                                                    if isinstance(answer, str):
                                                        logger.error("----------------未知数据--------------------")
                                                        logger.error(data)
                                                        continue
                                                    elif isinstance(answer, dict):
                                                        message = answer.get("title", "")
                                                    result = {"message": message, "type": "system"}
                                            elif data.get("event") == "workflow_finished":
                                                answer = data.get("data", "")
                                                if isinstance(answer, str):
                                                    logger.error("----------------未知数据--------------------")
                                                    logger.error(data)
                                                    continue
                                                    result = {"message": "", "type": "close", "download_url": ""}
                                                elif isinstance(answer, dict):
                                                    message = answer.get("title", "")
                                                result = {"message": message, "type": "system"}
                                        elif data.get("event") == "workflow_finished":
                                            answer = data.get("data", "")
                                            if isinstance(answer, str):
                                                logger.error("----------------未知数据--------------------")
                                                logger.error(data)
                                                result = {"message": "", "type": "close", "download_url": ""}
                                            elif isinstance(answer, dict):
                                                download_url = ""
                                                outputs = answer.get("outputs", {})
                                                if outputs:
                                                    message = outputs.get("output", "")
                                                    download_url = outputs.get("download_url", "")
                                                else:
                                                    message = answer.get("error", "")
                                                result = {"message": message, "type": "message", "download_url": download_url}
                                                try:
                                                    SessionService(db).update_session(chat_id,
                                                                                      message={"role": "assistant",
                                                                                               "content": {
                                                                                                   "answer": message,
                                                                                                   "download_url": download_url}},
                                                                                      conversation_id=data.get(
                                                                                          "conversation_id"))
                                                except Exception as e:
                                                    logger.error("保存dify的会话异常!")
                                                    logger.error(e)
                                                try:
                                                    await websocket.send_json(result)
                                                except Exception as e:
                                                    logger.error(e)
                                                    logger.error("返回客户端消息异常!")
                                                result = {"message": "", "type": "close", "download_url": ""}
                                                    download_url = ""
                                                    outputs = answer.get("outputs", {})
                                                    if outputs:
                                                        message = outputs.get("output", "")
                                                        download_url = outputs.get("download_url", "")
                                                    else:
                                                        message = answer.get("error", "")
                                                    if download_url:
                                                        files = [{
                                                            "type": "document",
                                                            "transfer_method": "remote_url",
                                                            "url": download_url,
                                                            "upload_file_id": ""
                                                        }]
                                                    result = {"message": message, "type": "message", "download_url": download_url}
                                                    try:
                                                        SessionService(db).update_session(chat_id,
                                                                                          message={"role": "assistant",
                                                                                                   "content": {
                                                                                                       "answer": message,
                                                                                                       "download_url": download_url}},
                                                                                          conversation_id=data.get(
                                                                                              "conversation_id"))
                                                    except Exception as e:
                                                        logger.error("保存dify的会话异常!")
                                                        logger.error(e)
                                                    try:
                                                        await websocket.send_json(result)
                                                    except Exception as e:
                                                        logger.error(e)
                                                        logger.error("返回客户端消息异常!")
                                                    result = {"message": "", "type": "close", "download_url": ""}
                                        else:
                                            continue
                                        try:
                                            await websocket.send_json(result)
                                        except Exception  as e:
                                            logger.error(e)
                                            logger.error("返回客户端消息异常!")
                                        complete_response = ""
                                    except json.JSONDecodeError as e:
                                        print(f"Error decoding JSON: {e}")
                                        # print(f"Response text: {text}")
                                except Exception as e2:
                                    result = {"message": f"内部错误: {e2}", "type": "close"}
                                    await websocket.send_json(result)
                                    print(f"Error process message of ragflow: {e2}")
                                            else:
                                                continue
                                            try:
                                                await websocket.send_json(result)
                                            except Exception  as e:
                                                logger.error(e)
                                                logger.error("返回客户端消息异常!")
                                            complete_response = ""
                                        except json.JSONDecodeError as e:
                                            print(f"Error decoding JSON: {e}")
                                            # print(f"Response text: {text}")
                                    except Exception as e2:
                                        result = {"message": f"内部错误: {e2}", "type": "close"}
                                        await websocket.send_json(result)
                                        print(f"Error process message of ragflow: {e2}")
                        elif workflow_type == 3:
                            image_list = []
                            # print(inputs)
app/api/files.py
@@ -18,6 +18,7 @@
from app.service.bisheng import BishengService
from app.service.common.api_token import DfTokenDao
from app.service.difyService import DifyService
from app.service.files import read_file
from app.service.ragflow import RagflowService
from app.service.service_token import get_ragflow_token, get_bisheng_token
import urllib.parse
@@ -124,7 +125,11 @@
                except Exception as e:
                    return Response(code=400, msg=str(e))
                try:
                    file_upload = await dify_service.upload(token, f.filename, file_content, current_user.id)
                    filename = f.filename
                    file_upload = await dify_service.upload(token, filename, file_content, current_user.id)
                    print(file_upload)
                    tokens = await read_file(file_content, filename, f.content_type)
                    file_upload["tokens"] = tokens
                    result.append(file_upload)
                except Exception as e:
                    raise HTTPException(status_code=500, detail=str(e))
app/config/config.py
@@ -23,6 +23,7 @@
    dify_workflow_clean: str = ''
    dify_workflow_report: str = ''
    postgresql_database_url: str = ''
    max_report_tokens: int = 100000
    def __init__(self, **kwargs):
        # 替换配置中的IP地址
        host_ip = os.getenv('HOST_IP', '127.0.0.1')
app/config/config.yaml
@@ -20,4 +20,5 @@
dify_api_token: app-YmOAMDsPpDDlqryMHnc9TzTO
postgresql_database_url: postgresql+asyncpg://kong:kongpass@192.168.20.119:5432/kong
dify_workflow_clean: app-OpF0drPu0XcgqcekQpT4FA8a
dify_workflow_report: app-0MAkdFWqh9zxwmU69O0BFU1s
dify_workflow_report: app-0MAkdFWqh9zxwmU69O0BFU1s
max_report_tokens: 100000
app/service/files.py
New file
@@ -0,0 +1,53 @@
import fitz
import io
from docx import Document
from dashscope import get_tokenizer  # dashscope版本 >= 1.14.0
from app.service.auth import decode_access_token
async def get_str_token(input_str):
    """Return how many tokens ``input_str`` is split into.

    The string is encoded with the dashscope tokenizer obtained for the
    'qwen-turbo' model, and the length of the resulting token-id sequence
    is returned.
    """
    # Per the original author's note, the tokenizer currently supports only
    # the Tongyi Qianwen (Qwen) model series.
    qwen_tokenizer = get_tokenizer('qwen-turbo')
    token_ids = qwen_tokenizer.encode(input_str)
    return len(token_ids)
async def read_pdf(pdf_stream):
    """Return the text of every page of a PDF as one string.

    ``pdf_stream`` is passed to ``fitz.open`` as its ``stream`` argument with
    ``filetype="pdf"``. The text of each page is concatenated in iteration
    order, with no separator inserted between pages.
    """
    with fitz.open(stream=pdf_stream, filetype="pdf") as pdf_document:
        # Join while the document is still open; pages are read lazily.
        return "".join(page.get_text() for page in pdf_document)
async def read_word(word_stream):
    """Return the paragraph text of a Word document as one string.

    ``word_stream`` is wrapped in ``io.BytesIO`` and opened with python-docx's
    ``Document``. Only ``doc.paragraphs`` is read, and paragraph texts are
    concatenated with no separator between them.
    """
    buffer = io.BytesIO(word_stream)
    document = Document(buffer)
    return "".join(paragraph.text for paragraph in document.paragraphs)
async def read_file(file, filename, content_type):
    """Count the tokens in the text of an uploaded PDF or .docx file.

    Args:
        file: Raw file content, passed unchanged to ``read_pdf`` or
            ``read_word``.
        filename: Original file name. May be ``None`` or empty, in which
            case only ``content_type`` is used to pick a reader.
        content_type: MIME type reported for the upload.

    Returns:
        The result of ``get_str_token`` for the extracted text. A file that
        is neither PDF nor .docx contributes an empty string.
    """
    pdf_mime = "application/pdf"
    docx_mime = (
        "application/vnd.openxmlformats-officedocument"
        ".wordprocessingml.document"
    )
    # Normalise once: a missing filename must not raise AttributeError, and
    # upper-case extensions such as ".PDF" should still be recognised.
    lowered_name = (filename or "").lower()

    text = ""
    if content_type == pdf_mime or lowered_name.endswith(".pdf"):
        # Extract the PDF content.
        text = await read_pdf(file)
    elif content_type == docx_mime or lowered_name.endswith(".docx"):
        text = await read_word(file)
    return await get_str_token(text)
requirements.txt
Binary files differ