From 992d91359f4e4437ddba9843173254441c896918 Mon Sep 17 00:00:00 2001
From: zhaoqingang <zhaoqg0118@163.com>
Date: 星期四, 02 一月 2025 18:03:28 +0800
Subject: [PATCH] 报告生成文档清洗
---
requirements.txt | 0
app/config/config.py | 1
app/service/files.py | 53 ++++++++++
app/api/chat.py | 200 ++++++++++++++++++++++++---------------
app/config/config.yaml | 3
app/api/files.py | 7 +
6 files changed, 183 insertions(+), 81 deletions(-)
diff --git a/app/api/chat.py b/app/api/chat.py
index 3c8e9c7..2993b2a 100644
--- a/app/api/chat.py
+++ b/app/api/chat.py
@@ -436,6 +436,10 @@
title_number = receive_message.get('title_number', 8)
title_style = receive_message.get('title_style', "")
title_query = receive_message.get('title_query', "")
+ is_clean = receive_message.get('is_clean', 0)
+ file_type = receive_message.get('file_type', 1)
+ max_token = receive_message.get('max_tokens', 100000)
+ tokens = receive_message.get('tokens', 0)
if upload_files:
title_query = "start"
# if not upload_files:
@@ -456,15 +460,27 @@
}
files = []
for file in upload_files:
- files.append({
- "type": "document",
- "transfer_method": "local_file",
- "url": "",
- "upload_file_id": file
- })
+ if file_type == 1:
+ files.append({
+ "type": "document",
+ "transfer_method": "local_file",
+ "url": "",
+ "upload_file_id": file
+ })
+ else:
+ files.append({
+ "type": "document",
+ "transfer_method": "remote_url",
+ "url": file,
+ "upload_file_id": ""
+ })
+ inputs_list = []
+ token_list = []
if workflow_type == 1:
inputs["input_files"] = files
- elif workflow_type == 2:
+ inputs_list.append(inputs)
+ token_list.append(token)
+ elif workflow_type == 2 and is_clean == 0:
inputs["file_list"] = files
inputs["Completion_of_main_indicators"] = title
inputs["sub_titles"] = sub_titles
@@ -472,6 +488,8 @@
if not token:
await websocket.send_json(
{"message": "Invalid token document_to_report", "type": "error"})
+ inputs_list.append(inputs)
+ token_list.append(token)
elif workflow_type == 3:
inputs["file_list"] = files
inputs["number_of_title"] = title_number
@@ -480,89 +498,113 @@
if not token:
await websocket.send_json(
{"message": "Invalid token document_to_title", "type": "error"})
+ # inputs_list.append(inputs)
+ # token_list.append(token)
+ elif workflow_type == 2 and is_clean == 1:
+ # inputs["input_files"] = files
+ inputs_list.append(inputs)
+ token_list.append(token)
+ inputs1 = {}
+ # inputs1["file_list"] = files
+ inputs1["Completion_of_main_indicators"] = title
+ inputs1["sub_titles"] = sub_titles
+ token = DfTokenDao(db).get_token_by_id(DOCUMENT_TO_REPORT_TITLE)
+ if not token:
+ await websocket.send_json(
+ {"message": "Invalid token document_to_report", "type": "error"})
+ inputs_list.append(inputs1)
+ token_list.append(token)
complete_response = ""
if workflow_type == 1 or workflow_type == 2:
- async for rag_response in dify_service.workflow(token, current_user.id, inputs):
- # print(rag_response)
- try:
- if rag_response[:5] == "data:":
- # 濡傛灉鏄紝鍒欐埅鍙栨帀鍓�5涓瓧绗︼紝骞跺幓闄ら灏剧┖鐧界
- complete_response = rag_response[5:].strip()
- elif "event: ping" in rag_response:
- continue
- else:
- # 鍚﹀垯锛屼繚鎸佸師鏍�
- complete_response += rag_response
+ for inputs in inputs_list:
+ inputs["input_files"] = files
+ async for rag_response in dify_service.workflow(token, current_user.id, inputs):
+ # print(rag_response)
try:
- data = json.loads(complete_response)
- complete_response = ""
- if data.get("event") == "node_started" or data.get("event") == "node_finished": # "event": "message_end"
- if "data" not in data or not data["data"]: # 淇℃伅杩囨护
- logger.error("闈炴硶鏁版嵁--------------------")
- logger.error(data)
- continue
- else: # 姝e父杈撳嚭
+ if rag_response[:5] == "data:":
+ # 濡傛灉鏄紝鍒欐埅鍙栨帀鍓�5涓瓧绗︼紝骞跺幓闄ら灏剧┖鐧界
+ complete_response = rag_response[5:].strip()
+ elif "event: ping" in rag_response:
+ continue
+ else:
+ # 鍚﹀垯锛屼繚鎸佸師鏍�
+ complete_response += rag_response
+ try:
+ data = json.loads(complete_response)
+ complete_response = ""
+ if data.get("event") == "node_started" or data.get("event") == "node_finished": # "event": "message_end"
+ if "data" not in data or not data["data"]: # 淇℃伅杩囨护
+ logger.error("闈炴硶鏁版嵁--------------------")
+ logger.error(data)
+ continue
+ else: # 姝e父杈撳嚭
+ answer = data.get("data", "")
+ if isinstance(answer, str):
+ logger.error("----------------鏈煡鏁版嵁--------------------")
+ logger.error(data)
+ continue
+ elif isinstance(answer, dict):
+
+ message = answer.get("title", "")
+
+ result = {"message": message, "type": "system"}
+ elif data.get("event") == "workflow_finished":
answer = data.get("data", "")
if isinstance(answer, str):
logger.error("----------------鏈煡鏁版嵁--------------------")
logger.error(data)
- continue
+ result = {"message": "", "type": "close", "download_url": ""}
elif isinstance(answer, dict):
-
- message = answer.get("title", "")
-
- result = {"message": message, "type": "system"}
- elif data.get("event") == "workflow_finished":
- answer = data.get("data", "")
- if isinstance(answer, str):
- logger.error("----------------鏈煡鏁版嵁--------------------")
- logger.error(data)
- result = {"message": "", "type": "close", "download_url": ""}
- elif isinstance(answer, dict):
- download_url = ""
- outputs = answer.get("outputs", {})
- if outputs:
- message = outputs.get("output", "")
- download_url = outputs.get("download_url", "")
- else:
- message = answer.get("error", "")
-
- result = {"message": message, "type": "message", "download_url": download_url}
- try:
- SessionService(db).update_session(chat_id,
- message={"role": "assistant",
- "content": {
- "answer": message,
- "download_url": download_url}},
- conversation_id=data.get(
- "conversation_id"))
- except Exception as e:
- logger.error("淇濆瓨dify鐨勪細璇濆紓甯革紒")
- logger.error(e)
- try:
- await websocket.send_json(result)
- except Exception as e:
- logger.error(e)
- logger.error("杩斿洖瀹㈡埛绔秷鎭紓甯�!")
- result = {"message": "", "type": "close", "download_url": ""}
+ download_url = ""
+ outputs = answer.get("outputs", {})
+ if outputs:
+ message = outputs.get("output", "")
+ download_url = outputs.get("download_url", "")
+ else:
+ message = answer.get("error", "")
+ if download_url:
+ files = [{
+ "type": "document",
+ "transfer_method": "remote_url",
+ "url": download_url,
+ "upload_file_id": ""
+ }]
+ result = {"message": message, "type": "message", "download_url": download_url}
+ try:
+ SessionService(db).update_session(chat_id,
+ message={"role": "assistant",
+ "content": {
+ "answer": message,
+ "download_url": download_url}},
+ conversation_id=data.get(
+ "conversation_id"))
+ except Exception as e:
+ logger.error("淇濆瓨dify鐨勪細璇濆紓甯革紒")
+ logger.error(e)
+ try:
+ await websocket.send_json(result)
+ except Exception as e:
+ logger.error(e)
+ logger.error("杩斿洖瀹㈡埛绔秷鎭紓甯�!")
+ result = {"message": "", "type": "close", "download_url": ""}
- else:
- continue
- try:
- await websocket.send_json(result)
- except Exception as e:
- logger.error(e)
- logger.error("杩斿洖瀹㈡埛绔秷鎭紓甯�!")
- complete_response = ""
- except json.JSONDecodeError as e:
- print(f"Error decoding JSON: {e}")
- # print(f"Response text: {text}")
- except Exception as e2:
- result = {"message": f"鍐呴儴閿欒锛� {e2}", "type": "close"}
- await websocket.send_json(result)
- print(f"Error process message of ragflow: {e2}")
+ else:
+ continue
+ try:
+ await websocket.send_json(result)
+ except Exception as e:
+ logger.error(e)
+ logger.error("杩斿洖瀹㈡埛绔秷鎭紓甯�!")
+ complete_response = ""
+ except json.JSONDecodeError as e:
+ print(f"Error decoding JSON: {e}")
+ # print(f"Response text: {text}")
+ except Exception as e2:
+ result = {"message": f"鍐呴儴閿欒锛� {e2}", "type": "close"}
+ await websocket.send_json(result)
+ print(f"Error process message of ragflow: {e2}")
elif workflow_type == 3:
image_list = []
# print(inputs)
diff --git a/app/api/files.py b/app/api/files.py
index a5c2fd1..92ee599 100644
--- a/app/api/files.py
+++ b/app/api/files.py
@@ -18,6 +18,7 @@
from app.service.bisheng import BishengService
from app.service.common.api_token import DfTokenDao
from app.service.difyService import DifyService
+from app.service.files import read_file
from app.service.ragflow import RagflowService
from app.service.service_token import get_ragflow_token, get_bisheng_token
import urllib.parse
@@ -124,7 +125,11 @@
except Exception as e:
return Response(code=400, msg=str(e))
try:
- file_upload = await dify_service.upload(token, f.filename, file_content, current_user.id)
+ filename = f.filename
+ file_upload = await dify_service.upload(token, filename, file_content, current_user.id)
+ print(file_upload)
+ tokens = await read_file(file_content, filename, f.content_type)
+ file_upload["tokens"] = tokens
result.append(file_upload)
except Exception as e:
raise HTTPException(status_code=500, detail=str(e))
diff --git a/app/config/config.py b/app/config/config.py
index c5a10a0..e925d4b 100644
--- a/app/config/config.py
+++ b/app/config/config.py
@@ -23,6 +23,7 @@
dify_workflow_clean: str = ''
dify_workflow_report: str = ''
postgresql_database_url: str = ''
+ max_report_tokens: int = 100000
def __init__(self, **kwargs):
# 鏇挎崲閰嶇疆涓殑IP鍦板潃
host_ip = os.getenv('HOST_IP', '127.0.0.1')
diff --git a/app/config/config.yaml b/app/config/config.yaml
index ddee042..462bc37 100644
--- a/app/config/config.yaml
+++ b/app/config/config.yaml
@@ -20,4 +20,5 @@
dify_api_token: app-YmOAMDsPpDDlqryMHnc9TzTO
postgresql_database_url: postgresql+asyncpg://kong:kongpass@192.168.20.119:5432/kong
dify_workflow_clean: app-OpF0drPu0XcgqcekQpT4FA8a
-dify_workflow_report: app-0MAkdFWqh9zxwmU69O0BFU1s
\ No newline at end of file
+dify_workflow_report: app-0MAkdFWqh9zxwmU69O0BFU1s
+max_report_tokens: 100000
diff --git a/app/service/files.py b/app/service/files.py
new file mode 100644
index 0000000..ba9d85f
--- /dev/null
+++ b/app/service/files.py
@@ -0,0 +1,53 @@
+import fitz
+import io
+from docx import Document
+from dashscope import get_tokenizer # dashscope鐗堟湰 >= 1.14.0
+
+from app.service.auth import decode_access_token
+
+
+async def get_str_token(input_str):
+ # 鑾峰彇tokenizer瀵硅薄锛岀洰鍓嶅彧鏀寔閫氫箟鍗冮棶绯诲垪妯″瀷
+ tokenizer = get_tokenizer('qwen-turbo')
+ # 灏嗗瓧绗︿覆鍒囧垎鎴恡oken骞惰浆鎹负token id
+ tokens = tokenizer.encode(input_str)
+ # print(f"缁忚繃鍒囧垎鍚庣殑token id涓猴細{tokens}銆�")
+ # # 缁忚繃鍒囧垎鍚庣殑token id涓猴細 [31935, 64559, 99320, 56007, 100629, 104795, 99788, 1773]
+ # print(f"缁忚繃鍒囧垎鍚庡叡鏈墈len(tokens)}涓猼oken")
+ # # 缁忚繃鍒囧垎鍚庡叡鏈�8涓猼oken
+ #
+ # # 灏唗oken id杞寲涓哄瓧绗︿覆骞舵墦鍗板嚭鏉�
+ # for i in range(len(tokens)):
+ # print(f"token id涓簕tokens[i]}瀵瑰簲鐨勫瓧绗︿覆涓猴細{tokenizer.decode(tokens[i])}")
+ return len(tokens)
+
+async def read_pdf(pdf_stream):
+ text = ""
+ with fitz.open(stream=pdf_stream, filetype="pdf") as pdf_document:
+ for page in pdf_document:
+ text += page.get_text()
+ return text
+
+
+async def read_word(word_stream):
+ # 浣跨敤 python-docx 鎵撳紑 Word 鏂囦欢娴�
+ doc = Document(io.BytesIO(word_stream))
+
+ # 鎻愬彇姣忎釜娈佃惤鐨勬枃鏈�
+ text = ""
+ for para in doc.paragraphs:
+ text += para.text
+
+ return text
+
+
+async def read_file(file, filename, content_type):
+ text = ""
+ if content_type == "application/pdf" or filename.endswith('.pdf'):
+
+ # 鎻愬彇 PDF 鍐呭
+ text = await read_pdf(file)
+ elif content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" or filename.endswith('.docx'):
+ text = await read_word(file)
+
+ return await get_str_token(text)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 29ba9db..6c797ba 100644
--- a/requirements.txt
+++ b/requirements.txt
Binary files differ
--
Gitblit v1.8.0