From f6a0aa15269cae87737d1564ad7a33063f1d94de Mon Sep 17 00:00:00 2001 From: zhaoqingang <zhaoqg0118@163.com> Date: 星期二, 18 二月 2025 13:43:13 +0800 Subject: [PATCH] Merge branch 'master' of http://192.168.5.5:10010/r/rag-gateway into develop_2.1 --- requirements.txt | 0 app/service/files.py | 55 +++++++++++++++++++++++++++ app/api/excel.py | 24 ++++++----- app/api/agent.py | 2 + app/api/files.py | 7 +++ 5 files changed, 77 insertions(+), 11 deletions(-) diff --git a/app/api/agent.py b/app/api/agent.py index 0a6429e..d5c5ea8 100644 --- a/app/api/agent.py +++ b/app/api/agent.py @@ -192,6 +192,8 @@ for i in session.log_to_json().get("message", []): if i.get("role") == "user": tmp_data["question"]=i.get("content") + if "upload_filenames" in i: + tmp_data["upload_filenames"] = i.get("upload_filenames") elif i.get("role") == "assistant": if isinstance(i.get("content"), dict): diff --git a/app/api/excel.py b/app/api/excel.py index e8cc6ec..1c0ccf4 100644 --- a/app/api/excel.py +++ b/app/api/excel.py @@ -1,10 +1,10 @@ import random import string -from fastapi import APIRouter, File, UploadFile, Form, BackgroundTasks, Depends, Request +from fastapi import APIRouter, File, UploadFile, Form, BackgroundTasks, Depends, Request, WebSocket from fastapi.responses import JSONResponse, FileResponse from sqlalchemy.orm import Session -from starlette.websockets import WebSocket +# from starlette.websockets import WebSocket from app.api import get_current_user, get_current_user_websocket, Response from app.models import UserModel, AgentType @@ -52,14 +52,15 @@ return prefix + random_part -def db_create_session(db: Session, user_id: str): +def db_create_session(db: Session, user_id: str, message:str, upload_filenames: list): db_id = generate_db_id() session = SessionService(db).create_session( db_id, - "鍚堝苟Excel", + message, "basic_excel_merge", AgentType.BASIC, - int(user_id) + int(user_id), + {"role": "user", "content": message, "upload_filenames": upload_filenames} ) return session @@ -102,11 +103,12 @@ user_excel = EXCEL_FILES_PATH create_dir_if_not_exists(user_source) create_dir_if_not_exists(user_excel) - while True: - data = await websocket.receive_text() + # data = await websocket.receive_text()git + receive_message = await websocket.receive_json() try: - if data == "\"鍚堝苟Excel\"": + if receive_message.get("message") == "鍚堝苟Excel": + upload_filenames = receive_message.get('upload_filenames', []) merge_file = run_conformity(user_source, user_excel) if merge_file is not None: @@ -124,7 +126,7 @@ "type": "close", }) # 鍒涘缓浼氳瘽璁板綍 - session = db_create_session(db, user_id) + session = db_create_session(db, user_id, receive_message.get("message"), upload_filenames) # 鏇存柊浼氳瘽璁板綍 if session: session_id = session.id @@ -143,8 +145,8 @@ await websocket.send_json({"error": "鍚堝苟澶辫触", "type": "stream", "files": []}) await websocket.close() else: - print(f"Received data: {data}") - await websocket.send_json({"error": "鏈煡鎸囦护", "data": str(data)}) + print(f"Received data: {receive_message.get('message')}") + await websocket.send_json({"error": "鏈煡鎸囦护", "data": str(receive_message.get('message'))}) await websocket.close() except Exception as e: await websocket.send_json({"error": str(e)}) diff --git a/app/api/files.py b/app/api/files.py index 3a8491d..e00c43b 100644 --- a/app/api/files.py +++ b/app/api/files.py @@ -8,6 +8,7 @@ from starlette.responses import StreamingResponse from werkzeug.utils import send_file +from Log import logger from app.api import Response, get_current_user, ResponseList from app.config.config import settings from app.config.const import DOCUMENT_TO_REPORT, IMAGE_TO_TEXT, DOCUMENT_TO_REPORT_TITLE, DOCUMENT_IA_QUESTIONS, \ @@ -18,6 +19,7 @@ from app.models.user_model import UserModel from app.service.basic import BasicService from app.service.bisheng import BishengService +from app.service.files import read_file from app.service.v2.api_token import DfTokenDao from app.service.difyService import DifyService from app.service.ragflow import RagflowService @@ -134,6 +136,11 @@ return Response(code=400, msg=str(e)) try: file_upload = await dify_service.upload(token, f.filename, file_content, current_user.id) + try: + tokens = await read_file(file_content, f.filename, f.content_type) + file_upload["tokens"] = tokens + except Exception as e: + logger.error(e) result.append(file_upload) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) diff --git a/app/service/files.py b/app/service/files.py new file mode 100644 index 0000000..ad0e93e --- /dev/null +++ b/app/service/files.py @@ -0,0 +1,55 @@ +import fitz +import io +from docx import Document +from dashscope import get_tokenizer # dashscope鐗堟湰 >= 1.14.0 + +from app.service.auth import decode_access_token + + +async def get_str_token(input_str): + # 鑾峰彇tokenizer瀵硅薄锛岀洰鍓嶅彧鏀寔閫氫箟鍗冮棶绯诲垪妯″瀷 + tokenizer = get_tokenizer('qwen-turbo') + # 灏嗗瓧绗︿覆鍒囧垎鎴恡oken骞惰浆鎹负token id + tokens = tokenizer.encode(input_str) + # print(f"缁忚繃鍒囧垎鍚庣殑token id涓猴細{tokens}銆�") + # # 缁忚繃鍒囧垎鍚庣殑token id涓猴細 [31935, 64559, 99320, 56007, 100629, 104795, 99788, 1773] + # print(f"缁忚繃鍒囧垎鍚庡叡鏈墈len(tokens)}涓猼oken") + # # 缁忚繃鍒囧垎鍚庡叡鏈�8涓猼oken + # + # # 灏唗oken id杞寲涓哄瓧绗︿覆骞舵墦鍗板嚭鏉� + # for i in range(len(tokens)): + # print(f"token id涓簕tokens[i]}瀵瑰簲鐨勫瓧绗︿覆涓猴細{tokenizer.decode(tokens[i])}") + return len(tokens) + + +async def read_pdf(pdf_stream): + text = "" + with fitz.open(stream=pdf_stream, filetype="pdf") as pdf_document: + for page in pdf_document: + text += page.get_text() + return text + + +async def read_word(word_stream): + # 浣跨敤 python-docx 鎵撳紑 Word 鏂囦欢娴� + doc = Document(io.BytesIO(word_stream)) + + # 鎻愬彇姣忎釜娈佃惤鐨勬枃鏈� + text = "" + for para in doc.paragraphs: + text += para.text + + return text + + +async def read_file(file, filename, content_type): + text = "" + if content_type == "application/pdf" or filename.endswith('.pdf'): + + # 鎻愬彇 PDF 鍐呭 + text = await read_pdf(file) + elif content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" or filename.endswith( + '.docx'): + text = await read_word(file) + + return await get_str_token(text) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 9928452..4d645ea 100644 --- a/requirements.txt +++ b/requirements.txt Binary files differ -- Gitblit v1.8.0