app/api/chat.py | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
app/api/files.py | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
app/config/config.py | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
app/config/config.yaml | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
app/service/files.py | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
requirements.txt | 补丁 | 查看 | 原始文档 | blame | 历史 |
app/api/chat.py
@@ -436,6 +436,10 @@ title_number = receive_message.get('title_number', 8) title_style = receive_message.get('title_style', "") title_query = receive_message.get('title_query', "") is_clean = receive_message.get('is_clean', 0) file_type = receive_message.get('file_type', 1) max_token = receive_message.get('max_tokens', 100000) tokens = receive_message.get('tokens', 0) if upload_files: title_query = "start" # if not upload_files: @@ -456,15 +460,27 @@ } files = [] for file in upload_files: if file_type == 1: files.append({ "type": "document", "transfer_method": "local_file", "url": "", "upload_file_id": file }) else: files.append({ "type": "document", "transfer_method": "remote_url", "url": file, "upload_file_id": "" }) inputs_list = [] token_list = [] if workflow_type == 1: inputs["input_files"] = files elif workflow_type == 2: inputs_list.append(inputs) token_list.append(token) elif workflow_type == 2 and is_clean == 0: inputs["file_list"] = files inputs["Completion_of_main_indicators"] = title inputs["sub_titles"] = sub_titles @@ -472,6 +488,8 @@ if not token: await websocket.send_json( {"message": "Invalid token document_to_report", "type": "error"}) inputs_list.append(inputs) token_list.append(token) elif workflow_type == 3: inputs["file_list"] = files inputs["number_of_title"] = title_number @@ -480,9 +498,27 @@ if not token: await websocket.send_json( {"message": "Invalid token document_to_title", "type": "error"}) # inputs_list.append(inputs) # token_list.append(token) elif workflow_type == 2 and is_clean == 1: # inputs["input_files"] = files inputs_list.append(inputs) token_list.append(token) inputs1 = {} # inputs1["file_list"] = files inputs1["Completion_of_main_indicators"] = title inputs1["sub_titles"] = sub_titles token = DfTokenDao(db).get_token_by_id(DOCUMENT_TO_REPORT_TITLE) if not token: await websocket.send_json( {"message": "Invalid token document_to_report", "type": "error"}) inputs_list.append(inputs1) token_list.append(token) complete_response 
= "" if workflow_type == 1 or workflow_type == 2: for inputs in inputs_list: inputs["input_files"] = files async for rag_response in dify_service.workflow(token, current_user.id, inputs): # print(rag_response) try: @@ -527,7 +563,13 @@ download_url = outputs.get("download_url", "") else: message = answer.get("error", "") if download_url: files = [{ "type": "document", "transfer_method": "remote_url", "url": download_url, "upload_file_id": "" }] result = {"message": message, "type": "message", "download_url": download_url} try: SessionService(db).update_session(chat_id, app/api/files.py
@@ -18,6 +18,7 @@ from app.service.bisheng import BishengService from app.service.common.api_token import DfTokenDao from app.service.difyService import DifyService from app.service.files import read_file from app.service.ragflow import RagflowService from app.service.service_token import get_ragflow_token, get_bisheng_token import urllib.parse @@ -124,7 +125,11 @@ except Exception as e: return Response(code=400, msg=str(e)) try: file_upload = await dify_service.upload(token, f.filename, file_content, current_user.id) filename = f.filename file_upload = await dify_service.upload(token, filename, file_content, current_user.id) print(file_upload) tokens = await read_file(file_content, filename, f.content_type) file_upload["tokens"] = tokens result.append(file_upload) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) app/config/config.py
@@ -23,6 +23,7 @@ dify_workflow_clean: str = '' dify_workflow_report: str = '' postgresql_database_url: str = '' max_report_tokens: int = 100000 def __init__(self, **kwargs): # 替换配置中的IP地址 host_ip = os.getenv('HOST_IP', '127.0.0.1') app/config/config.yaml
@@ -21,3 +21,4 @@ postgresql_database_url: postgresql+asyncpg://kong:kongpass@192.168.20.119:5432/kong dify_workflow_clean: app-OpF0drPu0XcgqcekQpT4FA8a dify_workflow_report: app-0MAkdFWqh9zxwmU69O0BFU1s max_report_tokens: 100000 app/service/files.py
import io
from functools import lru_cache

import fitz
from docx import Document
from dashscope import get_tokenizer  # requires dashscope >= 1.14.0
from app.service.auth import decode_access_token  # NOTE(review): unused in this module — confirm before removing


@lru_cache(maxsize=1)
def _qwen_tokenizer():
    """Return the shared qwen-turbo tokenizer (built once, then cached).

    ``get_tokenizer`` currently supports only the Tongyi Qianwen model
    family; the tokenizer is stateless, so one instance can safely be
    reused across requests instead of being rebuilt on every call.
    """
    return get_tokenizer('qwen-turbo')


async def get_str_token(input_str):
    """Count the qwen-turbo tokens in ``input_str``.

    :param input_str: text to tokenize.
    :return: number of token ids produced by the tokenizer.
    """
    tokens = _qwen_tokenizer().encode(input_str)
    return len(tokens)


async def read_pdf(pdf_stream):
    """Extract the plain text of every page of a PDF.

    :param pdf_stream: raw PDF bytes (anything ``fitz.open(stream=...)`` accepts).
    :return: the page texts concatenated in page order.
    """
    with fitz.open(stream=pdf_stream, filetype="pdf") as pdf_document:
        # "".join runs in linear time; += on str in a loop is quadratic.
        return "".join(page.get_text() for page in pdf_document)


async def read_word(word_stream):
    """Extract the plain text of every paragraph of a ``.docx`` document.

    :param word_stream: raw ``.docx`` bytes.
    :return: the paragraph texts concatenated in document order.
    """
    doc = Document(io.BytesIO(word_stream))
    return "".join(para.text for para in doc.paragraphs)


async def read_file(file, filename, content_type):
    """Return the qwen-turbo token count of an uploaded document.

    PDF and ``.docx`` inputs are recognized by MIME type or by file
    extension; any other type is treated as empty text and yields 0
    tokens (matching the caller's best-effort accounting).

    :param file: raw file bytes.
    :param filename: original file name, used as an extension fallback.
    :param content_type: MIME type reported with the upload.
    :return: token count of the extracted text.
    """
    text = ""
    if content_type == "application/pdf" or filename.endswith('.pdf'):
        text = await read_pdf(file)
    elif (content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
          or filename.endswith('.docx')):
        text = await read_word(file)
    return await get_str_token(text)

# (diff footer preserved from review page: requirements.txt — binary files differ)