From 992d91359f4e4437ddba9843173254441c896918 Mon Sep 17 00:00:00 2001 From: zhaoqingang <zhaoqg0118@163.com> Date: 星期四, 02 一月 2025 18:03:28 +0800 Subject: [PATCH] 报告生成文档清洗 --- requirements.txt | 0 app/config/config.py | 1 app/service/files.py | 53 ++++++++++ app/api/chat.py | 200 ++++++++++++++++++++++++--------------- app/config/config.yaml | 3 app/api/files.py | 7 + 6 files changed, 183 insertions(+), 81 deletions(-) diff --git a/app/api/chat.py b/app/api/chat.py index 3c8e9c7..2993b2a 100644 --- a/app/api/chat.py +++ b/app/api/chat.py @@ -436,6 +436,10 @@ title_number = receive_message.get('title_number', 8) title_style = receive_message.get('title_style', "") title_query = receive_message.get('title_query', "") + is_clean = receive_message.get('is_clean', 0) + file_type = receive_message.get('file_type', 1) + max_token = receive_message.get('max_tokens', 100000) + tokens = receive_message.get('tokens', 0) if upload_files: title_query = "start" # if not upload_files: @@ -456,15 +460,27 @@ } files = [] for file in upload_files: - files.append({ - "type": "document", - "transfer_method": "local_file", - "url": "", - "upload_file_id": file - }) + if file_type == 1: + files.append({ + "type": "document", + "transfer_method": "local_file", + "url": "", + "upload_file_id": file + }) + else: + files.append({ + "type": "document", + "transfer_method": "remote_url", + "url": file, + "upload_file_id": "" + }) + inputs_list = [] + token_list = [] if workflow_type == 1: inputs["input_files"] = files - elif workflow_type == 2: + inputs_list.append(inputs) + token_list.append(token) + elif workflow_type == 2 and is_clean == 0: inputs["file_list"] = files inputs["Completion_of_main_indicators"] = title inputs["sub_titles"] = sub_titles @@ -472,6 +488,8 @@ if not token: await websocket.send_json( {"message": "Invalid token document_to_report", "type": "error"}) + inputs_list.append(inputs) + token_list.append(token) elif workflow_type == 3: inputs["file_list"] = files inputs["number_of_title"] = title_number @@ -480,89 +498,113 @@ if not token: await websocket.send_json( {"message": "Invalid token document_to_title", "type": "error"}) + # inputs_list.append(inputs) + # token_list.append(token) + elif workflow_type == 2 and is_clean == 1: + # inputs["input_files"] = files + inputs_list.append(inputs) + token_list.append(token) + inputs1 = {} + # inputs1["file_list"] = files + inputs1["Completion_of_main_indicators"] = title + inputs1["sub_titles"] = sub_titles + token = DfTokenDao(db).get_token_by_id(DOCUMENT_TO_REPORT_TITLE) + if not token: + await websocket.send_json( + {"message": "Invalid token document_to_report", "type": "error"}) + inputs_list.append(inputs1) + token_list.append(token) complete_response = "" if workflow_type == 1 or workflow_type == 2: - async for rag_response in dify_service.workflow(token, current_user.id, inputs): - # print(rag_response) - try: - if rag_response[:5] == "data:": - # 濡傛灉鏄紝鍒欐埅鍙栨帀鍓�5涓瓧绗︼紝骞跺幓闄ら灏剧┖鐧界 - complete_response = rag_response[5:].strip() - elif "event: ping" in rag_response: - continue - else: - # 鍚﹀垯锛屼繚鎸佸師鏍� - complete_response += rag_response + for inputs in inputs_list: + inputs["input_files"] = files + async for rag_response in dify_service.workflow(token, current_user.id, inputs): + # print(rag_response) try: - data = json.loads(complete_response) - complete_response = "" - if data.get("event") == "node_started" or data.get("event") == "node_finished": # "event": "message_end" - if "data" not in data or not data["data"]: # 淇℃伅杩囨护 - logger.error("闈炴硶鏁版嵁--------------------") - logger.error(data) - continue - else: # 姝e父杈撳嚭 + if rag_response[:5] == "data:": + # 濡傛灉鏄紝鍒欐埅鍙栨帀鍓�5涓瓧绗︼紝骞跺幓闄ら灏剧┖鐧界 + complete_response = rag_response[5:].strip() + elif "event: ping" in rag_response: + continue + else: + # 鍚﹀垯锛屼繚鎸佸師鏍� + complete_response += rag_response + try: + data = json.loads(complete_response) + complete_response = "" + if data.get("event") == "node_started" or data.get("event") == "node_finished": # "event": "message_end" + if "data" not in data or not data["data"]: # 淇℃伅杩囨护 + logger.error("闈炴硶鏁版嵁--------------------") + logger.error(data) + continue + else: # 姝e父杈撳嚭 + answer = data.get("data", "") + if isinstance(answer, str): + logger.error("----------------鏈煡鏁版嵁--------------------") + logger.error(data) + continue + elif isinstance(answer, dict): + + message = answer.get("title", "") + + result = {"message": message, "type": "system"} + elif data.get("event") == "workflow_finished": answer = data.get("data", "") if isinstance(answer, str): logger.error("----------------鏈煡鏁版嵁--------------------") logger.error(data) - continue + result = {"message": "", "type": "close", "download_url": ""} elif isinstance(answer, dict): - - message = answer.get("title", "") - - result = {"message": message, "type": "system"} - elif data.get("event") == "workflow_finished": - answer = data.get("data", "") - if isinstance(answer, str): - logger.error("----------------鏈煡鏁版嵁--------------------") - logger.error(data) - result = {"message": "", "type": "close", "download_url": ""} - elif isinstance(answer, dict): - download_url = "" - outputs = answer.get("outputs", {}) - if outputs: - message = outputs.get("output", "") - download_url = outputs.get("download_url", "") - else: - message = answer.get("error", "") - - result = {"message": message, "type": "message", "download_url": download_url} - try: - SessionService(db).update_session(chat_id, - message={"role": "assistant", - "content": { - "answer": message, - "download_url": download_url}}, - conversation_id=data.get( - "conversation_id")) - except Exception as e: - logger.error("淇濆瓨dify鐨勪細璇濆紓甯革紒") - logger.error(e) - try: - await websocket.send_json(result) - except Exception as e: - logger.error(e) - logger.error("杩斿洖瀹㈡埛绔秷鎭紓甯�!") - result = {"message": "", "type": "close", "download_url": ""} + download_url = "" + outputs = answer.get("outputs", {}) + if outputs: + message = outputs.get("output", "") + download_url = outputs.get("download_url", "") + else: + message = answer.get("error", "") + if download_url: + files = [{ + "type": "document", + "transfer_method": "remote_url", + "url": download_url, + "upload_file_id": "" + }] + result = {"message": message, "type": "message", "download_url": download_url} + try: + SessionService(db).update_session(chat_id, + message={"role": "assistant", + "content": { + "answer": message, + "download_url": download_url}}, + conversation_id=data.get( + "conversation_id")) + except Exception as e: + logger.error("淇濆瓨dify鐨勪細璇濆紓甯革紒") + logger.error(e) + try: + await websocket.send_json(result) + except Exception as e: + logger.error(e) + logger.error("杩斿洖瀹㈡埛绔秷鎭紓甯�!") + result = {"message": "", "type": "close", "download_url": ""} - else: - continue - try: - await websocket.send_json(result) - except Exception as e: - logger.error(e) - logger.error("杩斿洖瀹㈡埛绔秷鎭紓甯�!") - complete_response = "" - except json.JSONDecodeError as e: - print(f"Error decoding JSON: {e}") - # print(f"Response text: {text}") - except Exception as e2: - result = {"message": f"鍐呴儴閿欒锛� {e2}", "type": "close"} - await websocket.send_json(result) - print(f"Error process message of ragflow: {e2}") + else: + continue + try: + await websocket.send_json(result) + except Exception as e: + logger.error(e) + logger.error("杩斿洖瀹㈡埛绔秷鎭紓甯�!") + complete_response = "" + except json.JSONDecodeError as e: + print(f"Error decoding JSON: {e}") + # print(f"Response text: {text}") + except Exception as e2: + result = {"message": f"鍐呴儴閿欒锛� {e2}", "type": "close"} + await websocket.send_json(result) + print(f"Error process message of ragflow: {e2}") elif workflow_type == 3: image_list = [] # print(inputs) diff --git a/app/api/files.py b/app/api/files.py index a5c2fd1..92ee599 100644 --- a/app/api/files.py +++ b/app/api/files.py @@ -18,6 +18,7 @@ from app.service.bisheng import BishengService from app.service.common.api_token import DfTokenDao from app.service.difyService import DifyService +from app.service.files import read_file from app.service.ragflow import RagflowService from app.service.service_token import get_ragflow_token, get_bisheng_token import urllib.parse @@ -124,7 +125,11 @@ except Exception as e: return Response(code=400, msg=str(e)) try: - file_upload = await dify_service.upload(token, f.filename, file_content, current_user.id) + filename = f.filename + file_upload = await dify_service.upload(token, filename, file_content, current_user.id) + print(file_upload) + tokens = await read_file(file_content, filename, f.content_type) + file_upload["tokens"] = tokens result.append(file_upload) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) diff --git a/app/config/config.py b/app/config/config.py index c5a10a0..e925d4b 100644 --- a/app/config/config.py +++ b/app/config/config.py @@ -23,6 +23,7 @@ dify_workflow_clean: str = '' dify_workflow_report: str = '' postgresql_database_url: str = '' + max_report_tokens: int = 100000 def __init__(self, **kwargs): # 鏇挎崲閰嶇疆涓殑IP鍦板潃 host_ip = os.getenv('HOST_IP', '127.0.0.1') diff --git a/app/config/config.yaml b/app/config/config.yaml index ddee042..462bc37 100644 --- a/app/config/config.yaml +++ b/app/config/config.yaml @@ -20,4 +20,5 @@ dify_api_token: app-YmOAMDsPpDDlqryMHnc9TzTO postgresql_database_url: postgresql+asyncpg://kong:kongpass@192.168.20.119:5432/kong dify_workflow_clean: app-OpF0drPu0XcgqcekQpT4FA8a -dify_workflow_report: app-0MAkdFWqh9zxwmU69O0BFU1s \ No newline at end of file +dify_workflow_report: app-0MAkdFWqh9zxwmU69O0BFU1s +max_report_tokens: 100000 diff --git a/app/service/files.py b/app/service/files.py new file mode 100644 index 0000000..ba9d85f --- /dev/null +++ b/app/service/files.py @@ -0,0 +1,53 @@ +import fitz +import io +from docx import Document +from dashscope import get_tokenizer # dashscope鐗堟湰 >= 1.14.0 + +from app.service.auth import decode_access_token + + +async def get_str_token(input_str): + # 鑾峰彇tokenizer瀵硅薄锛岀洰鍓嶅彧鏀寔閫氫箟鍗冮棶绯诲垪妯″瀷 + tokenizer = get_tokenizer('qwen-turbo') + # 灏嗗瓧绗︿覆鍒囧垎鎴恡oken骞惰浆鎹负token id + tokens = tokenizer.encode(input_str) + # print(f"缁忚繃鍒囧垎鍚庣殑token id涓猴細{tokens}銆�") + # # 缁忚繃鍒囧垎鍚庣殑token id涓猴細 [31935, 64559, 99320, 56007, 100629, 104795, 99788, 1773] + # print(f"缁忚繃鍒囧垎鍚庡叡鏈墈len(tokens)}涓猼oken") + # # 缁忚繃鍒囧垎鍚庡叡鏈�8涓猼oken + # + # # 灏唗oken id杞寲涓哄瓧绗︿覆骞舵墦鍗板嚭鏉� + # for i in range(len(tokens)): + # print(f"token id涓簕tokens[i]}瀵瑰簲鐨勫瓧绗︿覆涓猴細{tokenizer.decode(tokens[i])}") + return len(tokens) + +async def read_pdf(pdf_stream): + text = "" + with fitz.open(stream=pdf_stream, filetype="pdf") as pdf_document: + for page in pdf_document: + text += page.get_text() + return text + + +async def read_word(word_stream): + # 浣跨敤 python-docx 鎵撳紑 Word 鏂囦欢娴� + doc = Document(io.BytesIO(word_stream)) + + # 鎻愬彇姣忎釜娈佃惤鐨勬枃鏈� + text = "" + for para in doc.paragraphs: + text += para.text + + return text + + +async def read_file(file, filename, content_type): + text = "" + if content_type == "application/pdf" or filename.endswith('.pdf'): + + # 鎻愬彇 PDF 鍐呭 + text = await read_pdf(file) + elif content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" or filename.endswith('.docx'): + text = await read_word(file) + + return await get_str_token(text) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 29ba9db..6c797ba 100644 --- a/requirements.txt +++ b/requirements.txt Binary files differ -- Gitblit v1.8.0