From 992d91359f4e4437ddba9843173254441c896918 Mon Sep 17 00:00:00 2001
From: zhaoqingang <zhaoqg0118@163.com>
Date: 星期四, 02 一月 2025 18:03:28 +0800
Subject: [PATCH] 报告生成文档清洗

---
 requirements.txt       |    0 
 app/config/config.py   |    1 
 app/service/files.py   |   53 ++++++++++
 app/api/chat.py        |  200 ++++++++++++++++++++++++---------------
 app/config/config.yaml |    3 
 app/api/files.py       |    7 +
 6 files changed, 183 insertions(+), 81 deletions(-)

diff --git a/app/api/chat.py b/app/api/chat.py
index 3c8e9c7..2993b2a 100644
--- a/app/api/chat.py
+++ b/app/api/chat.py
@@ -436,6 +436,10 @@
                         title_number = receive_message.get('title_number', 8)
                         title_style = receive_message.get('title_style', "")
                         title_query = receive_message.get('title_query', "")
+                        is_clean = receive_message.get('is_clean', 0)
+                        file_type = receive_message.get('file_type', 1)
+                        max_token = receive_message.get('max_tokens', 100000)
+                        tokens = receive_message.get('tokens', 0)
                         if upload_files:
                             title_query = "start"
                         # if not upload_files:
@@ -456,15 +460,27 @@
                         }
                         files = []
                         for file in upload_files:
-                            files.append({
-                                "type": "document",
-                                "transfer_method": "local_file",
-                                "url": "",
-                                "upload_file_id": file
-                            })
+                            if file_type == 1:
+                                files.append({
+                                    "type": "document",
+                                    "transfer_method": "local_file",
+                                    "url": "",
+                                    "upload_file_id": file
+                                })
+                            else:
+                                files.append({
+                                    "type": "document",
+                                    "transfer_method": "remote_url",
+                                    "url": file,
+                                    "upload_file_id": ""
+                                })
+                        inputs_list = []
+                        token_list = []
                         if workflow_type == 1:
                             inputs["input_files"] = files
-                        elif workflow_type == 2:
+                            inputs_list.append(inputs)
+                            token_list.append(token)
+                        elif workflow_type == 2 and is_clean == 0:
                             inputs["file_list"] = files
                             inputs["Completion_of_main_indicators"] = title
                             inputs["sub_titles"] = sub_titles
@@ -472,6 +488,8 @@
                             if not token:
                                 await websocket.send_json(
                                     {"message": "Invalid token document_to_report", "type": "error"})
+                            inputs_list.append(inputs)
+                            token_list.append(token)
                         elif workflow_type == 3:
                             inputs["file_list"] = files
                             inputs["number_of_title"] = title_number
@@ -480,89 +498,113 @@
                             if not token:
                                 await websocket.send_json(
                                     {"message": "Invalid token document_to_title", "type": "error"})
+                            # inputs_list.append(inputs)
+                            # token_list.append(token)
+                        elif workflow_type == 2 and is_clean == 1:
+                            # inputs["input_files"] = files
+                            inputs_list.append(inputs)
+                            token_list.append(token)
+                            inputs1 = {}
+                            # inputs1["file_list"] = files
+                            inputs1["Completion_of_main_indicators"] = title
+                            inputs1["sub_titles"] = sub_titles
+                            token = DfTokenDao(db).get_token_by_id(DOCUMENT_TO_REPORT_TITLE)
+                            if not token:
+                                await websocket.send_json(
+                                    {"message": "Invalid token document_to_report", "type": "error"})
+                            inputs_list.append(inputs1)
+                            token_list.append(token)
 
                         complete_response = ""
                         if workflow_type == 1 or workflow_type == 2:
-                            async for rag_response in dify_service.workflow(token, current_user.id, inputs):
-                                # print(rag_response)
-                                try:
-                                    if rag_response[:5] == "data:":
-                                        # 濡傛灉鏄紝鍒欐埅鍙栨帀鍓�5涓瓧绗︼紝骞跺幓闄ら灏剧┖鐧界
-                                        complete_response = rag_response[5:].strip()
-                                    elif "event: ping" in rag_response:
-                                        continue
-                                    else:
-                                        # 鍚﹀垯锛屼繚鎸佸師鏍�
-                                        complete_response += rag_response
+                            for inputs in inputs_list:
+                                inputs["input_files"] = files
+                                async for rag_response in dify_service.workflow(token, current_user.id, inputs):
+                                    # print(rag_response)
                                     try:
-                                        data = json.loads(complete_response)
-                                        complete_response = ""
-                                        if data.get("event") == "node_started" or data.get("event") == "node_finished":  # "event": "message_end"
-                                            if "data" not in data or not data["data"]:  # 淇℃伅杩囨护
-                                                logger.error("闈炴硶鏁版嵁--------------------")
-                                                logger.error(data)
-                                                continue
-                                            else:  # 姝ｅ父杈撳嚭
+                                        if rag_response[:5] == "data:":
+                                            # 濡傛灉鏄紝鍒欐埅鍙栨帀鍓�5涓瓧绗︼紝骞跺幓闄ら灏剧┖鐧界
+                                            complete_response = rag_response[5:].strip()
+                                        elif "event: ping" in rag_response:
+                                            continue
+                                        else:
+                                            # 鍚﹀垯锛屼繚鎸佸師鏍�
+                                            complete_response += rag_response
+                                        try:
+                                            data = json.loads(complete_response)
+                                            complete_response = ""
+                                            if data.get("event") == "node_started" or data.get("event") == "node_finished":  # "event": "message_end"
+                                                if "data" not in data or not data["data"]:  # 淇℃伅杩囨护
+                                                    logger.error("闈炴硶鏁版嵁--------------------")
+                                                    logger.error(data)
+                                                    continue
+                                                else:  # 姝ｅ父杈撳嚭
+                                                    answer = data.get("data", "")
+                                                    if isinstance(answer, str):
+                                                        logger.error("----------------鏈煡鏁版嵁--------------------")
+                                                        logger.error(data)
+                                                        continue
+                                                    elif isinstance(answer, dict):
+
+                                                        message = answer.get("title", "")
+
+                                                    result = {"message": message, "type": "system"}
+                                            elif data.get("event") == "workflow_finished":
                                                 answer = data.get("data", "")
                                                 if isinstance(answer, str):
                                                     logger.error("----------------鏈煡鏁版嵁--------------------")
                                                     logger.error(data)
-                                                    continue
+                                                    result = {"message": "", "type": "close", "download_url": ""}
                                                 elif isinstance(answer, dict):
-
-                                                    message = answer.get("title", "")
-
-                                                result = {"message": message, "type": "system"}
-                                        elif data.get("event") == "workflow_finished":
-                                            answer = data.get("data", "")
-                                            if isinstance(answer, str):
-                                                logger.error("----------------鏈煡鏁版嵁--------------------")
-                                                logger.error(data)
-                                                result = {"message": "", "type": "close", "download_url": ""}
-                                            elif isinstance(answer, dict):
-                                                download_url = ""
-                                                outputs = answer.get("outputs", {})
-                                                if outputs:
-                                                    message = outputs.get("output", "")
-                                                    download_url = outputs.get("download_url", "")
-                                                else:
-                                                    message = answer.get("error", "")
-
-                                                result = {"message": message, "type": "message", "download_url": download_url}
-                                                try:
-                                                    SessionService(db).update_session(chat_id,
-                                                                                      message={"role": "assistant",
-                                                                                               "content": {
-                                                                                                   "answer": message,
-                                                                                                   "download_url": download_url}},
-                                                                                      conversation_id=data.get(
-                                                                                          "conversation_id"))
-                                                except Exception as e:
-                                                    logger.error("淇濆瓨dify鐨勪細璇濆紓甯革紒")
-                                                    logger.error(e)
-                                                try:
-                                                    await websocket.send_json(result)
-                                                except Exception as e:
-                                                    logger.error(e)
-                                                    logger.error("杩斿洖瀹㈡埛绔秷鎭紓甯�!")
-                                                result = {"message": "", "type": "close", "download_url": ""}
+                                                    download_url = ""
+                                                    outputs = answer.get("outputs", {})
+                                                    if outputs:
+                                                        message = outputs.get("output", "")
+                                                        download_url = outputs.get("download_url", "")
+                                                    else:
+                                                        message = answer.get("error", "")
+                                                    if download_url:
+                                                        files = [{
+                                                            "type": "document",
+                                                            "transfer_method": "remote_url",
+                                                            "url": download_url,
+                                                            "upload_file_id": ""
+                                                        }]
+                                                    result = {"message": message, "type": "message", "download_url": download_url}
+                                                    try:
+                                                        SessionService(db).update_session(chat_id,
+                                                                                          message={"role": "assistant",
+                                                                                                   "content": {
+                                                                                                       "answer": message,
+                                                                                                       "download_url": download_url}},
+                                                                                          conversation_id=data.get(
+                                                                                              "conversation_id"))
+                                                    except Exception as e:
+                                                        logger.error("淇濆瓨dify鐨勪細璇濆紓甯革紒")
+                                                        logger.error(e)
+                                                    try:
+                                                        await websocket.send_json(result)
+                                                    except Exception as e:
+                                                        logger.error(e)
+                                                        logger.error("杩斿洖瀹㈡埛绔秷鎭紓甯�!")
+                                                    result = {"message": "", "type": "close", "download_url": ""}
 
 
-                                        else:
-                                            continue
-                                        try:
-                                            await websocket.send_json(result)
-                                        except Exception  as e:
-                                            logger.error(e)
-                                            logger.error("杩斿洖瀹㈡埛绔秷鎭紓甯�!")
-                                        complete_response = ""
-                                    except json.JSONDecodeError as e:
-                                        print(f"Error decoding JSON: {e}")
-                                        # print(f"Response text: {text}")
-                                except Exception as e2:
-                                    result = {"message": f"鍐呴儴閿欒锛� {e2}", "type": "close"}
-                                    await websocket.send_json(result)
-                                    print(f"Error process message of ragflow: {e2}")
+                                            else:
+                                                continue
+                                            try:
+                                                await websocket.send_json(result)
+                                            except Exception  as e:
+                                                logger.error(e)
+                                                logger.error("杩斿洖瀹㈡埛绔秷鎭紓甯�!")
+                                            complete_response = ""
+                                        except json.JSONDecodeError as e:
+                                            print(f"Error decoding JSON: {e}")
+                                            # print(f"Response text: {text}")
+                                    except Exception as e2:
+                                        result = {"message": f"鍐呴儴閿欒锛� {e2}", "type": "close"}
+                                        await websocket.send_json(result)
+                                        print(f"Error process message of ragflow: {e2}")
                         elif workflow_type == 3:
                             image_list = []
                             # print(inputs)
diff --git a/app/api/files.py b/app/api/files.py
index a5c2fd1..92ee599 100644
--- a/app/api/files.py
+++ b/app/api/files.py
@@ -18,6 +18,7 @@
 from app.service.bisheng import BishengService
 from app.service.common.api_token import DfTokenDao
 from app.service.difyService import DifyService
+from app.service.files import read_file
 from app.service.ragflow import RagflowService
 from app.service.service_token import get_ragflow_token, get_bisheng_token
 import urllib.parse
@@ -124,7 +125,11 @@
                 except Exception as e:
                     return Response(code=400, msg=str(e))
                 try:
-                    file_upload = await dify_service.upload(token, f.filename, file_content, current_user.id)
+                    filename = f.filename
+                    file_upload = await dify_service.upload(token, filename, file_content, current_user.id)
+                    print(file_upload)
+                    tokens = await read_file(file_content, filename, f.content_type)
+                    file_upload["tokens"] = tokens
                     result.append(file_upload)
                 except Exception as e:
                     raise HTTPException(status_code=500, detail=str(e))
diff --git a/app/config/config.py b/app/config/config.py
index c5a10a0..e925d4b 100644
--- a/app/config/config.py
+++ b/app/config/config.py
@@ -23,6 +23,7 @@
     dify_workflow_clean: str = ''
     dify_workflow_report: str = ''
     postgresql_database_url: str = ''
+    max_report_tokens: int = 100000
     def __init__(self, **kwargs):
         # 鏇挎崲閰嶇疆涓殑IP鍦板潃
         host_ip = os.getenv('HOST_IP', '127.0.0.1')
diff --git a/app/config/config.yaml b/app/config/config.yaml
index ddee042..462bc37 100644
--- a/app/config/config.yaml
+++ b/app/config/config.yaml
@@ -20,4 +20,5 @@
 dify_api_token: app-YmOAMDsPpDDlqryMHnc9TzTO
 postgresql_database_url: postgresql+asyncpg://kong:kongpass@192.168.20.119:5432/kong
 dify_workflow_clean: app-OpF0drPu0XcgqcekQpT4FA8a
-dify_workflow_report: app-0MAkdFWqh9zxwmU69O0BFU1s
\ No newline at end of file
+dify_workflow_report: app-0MAkdFWqh9zxwmU69O0BFU1s
+max_report_tokens: 100000
diff --git a/app/service/files.py b/app/service/files.py
new file mode 100644
index 0000000..ba9d85f
--- /dev/null
+++ b/app/service/files.py
@@ -0,0 +1,53 @@
+import fitz
+import io
+from docx import Document
+from dashscope import get_tokenizer  # requires dashscope version >= 1.14.0
+
+from app.service.auth import decode_access_token
+
+
+async def get_str_token(input_str):
+    """Return how many tokens ``input_str`` occupies for the qwen-turbo model.
+
+    The string is encoded with the dashscope tokenizer registered under the
+    name 'qwen-turbo' and the number of resulting token ids is returned; the
+    ids themselves are discarded.
+
+    The function is declared ``async`` so callers can ``await`` it, but it
+    contains no ``await`` of its own: the encoding runs synchronously inside
+    the call.
+    """
+    # A tokenizer object is requested on every call rather than shared.
+    token_ids = get_tokenizer('qwen-turbo').encode(input_str)
+    return len(token_ids)
+
+async def read_pdf(pdf_stream):
+    """Return the text of every page of the PDF in ``pdf_stream``, concatenated."""
+    # The data is handed to fitz through stream= with an explicit filetype;
+    # the with-block closes the document once the pages have been read.
+    with fitz.open(stream=pdf_stream, filetype="pdf") as pdf_document:
+        return "".join(page.get_text() for page in pdf_document)
+
+
+async def read_word(word_stream):
+    """Return the concatenated paragraph text of a Word (.docx) document.
+
+    ``word_stream`` holds the document content; it is wrapped in a BytesIO
+    so that python-docx can open it like a file. The texts of the entries
+    in ``Document.paragraphs`` are joined with no separator between them.
+    """
+    buffer = io.BytesIO(word_stream)
+    paragraphs = Document(buffer).paragraphs
+    return "".join(paragraph.text for paragraph in paragraphs)
+
+
+async def read_file(file, filename, content_type):
+    """Return the token count of an uploaded PDF/.docx (``file`` is its content); other types count as empty text."""
+    # The client-supplied name may be None or upper-cased; normalise it once.
+    name = (filename or "").lower()
+    text = ""
+    if content_type == "application/pdf" or name.endswith('.pdf'):
+        text = await read_pdf(file)
+    elif content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" or name.endswith('.docx'):
+        text = await read_word(file)
+    return await get_str_token(text)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 29ba9db..6c797ba 100644
--- a/requirements.txt
+++ b/requirements.txt
Binary files differ

--
Gitblit v1.8.0