rag-gateway.git

parent: c8da9695 | 补丁 | 提交 | show whitespace

zhaoqingang

2025-01-02 992d91359f4e4437ddba9843173254441c896918

报告生成文档清洗

1个文件已添加

5个文件已修改

	app/api/chat.py	46 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	app/api/files.py	7 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	app/config/config.py	1 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	app/config/config.yaml	1 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	app/service/files.py	53 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	requirements.txt	补丁 \| 查看 \| 原始文档 \| blame \| 历史

 app/api/chat.py

@@ -436,6 +436,10 @@
                        title_number = receive_message.get('title_number', 8)
                        title_style = receive_message.get('title_style', "")
                        title_query = receive_message.get('title_query', "")
                        is_clean = receive_message.get('is_clean', 0)
                        file_type = receive_message.get('file_type', 1)
                        max_token = receive_message.get('max_tokens', 100000)
                        tokens = receive_message.get('tokens', 0)
                        if upload_files:
                            title_query = "start"
                        # if not upload_files:
@@ -456,15 +460,27 @@
                        }
                        files = []
                        for file in upload_files:
                            if file_type == 1:
                            files.append({
                                "type": "document",
                                "transfer_method": "local_file",
                                "url": "",
                                "upload_file_id": file
                            })
                            else:
                                files.append({
                                    "type": "document",
                                    "transfer_method": "remote_url",
                                    "url": file,
                                    "upload_file_id": ""
                                })
                        inputs_list = []
                        token_list = []
                        if workflow_type == 1:
                            inputs["input_files"] = files
                        elif workflow_type == 2:
                            inputs_list.append(inputs)
                            token_list.append(token)
                        elif workflow_type == 2 and is_clean == 0:
                            inputs["file_list"] = files
                            inputs["Completion_of_main_indicators"] = title
                            inputs["sub_titles"] = sub_titles
@@ -472,6 +488,8 @@
                            if not token:
                                await websocket.send_json(
                                    {"message": "Invalid token document_to_report", "type": "error"})
                            inputs_list.append(inputs)
                            token_list.append(token)
                        elif workflow_type == 3:
                            inputs["file_list"] = files
                            inputs["number_of_title"] = title_number
@@ -480,9 +498,27 @@
                            if not token:
                                await websocket.send_json(
                                    {"message": "Invalid token document_to_title", "type": "error"})
                            # inputs_list.append(inputs)
                            # token_list.append(token)
                        elif workflow_type == 2 and is_clean == 1:
                            # inputs["input_files"] = files
                            inputs_list.append(inputs)
                            token_list.append(token)
                            inputs1 = {}
                            # inputs1["file_list"] = files
                            inputs1["Completion_of_main_indicators"] = title
                            inputs1["sub_titles"] = sub_titles
                            token = DfTokenDao(db).get_token_by_id(DOCUMENT_TO_REPORT_TITLE)
                            if not token:
                                await websocket.send_json(
                                    {"message": "Invalid token document_to_report", "type": "error"})
                            inputs_list.append(inputs1)
                            token_list.append(token)

                        complete_response = ""
                        if workflow_type == 1 or workflow_type == 2:
                            for inputs in inputs_list:
                                inputs["input_files"] = files
                            async for rag_response in dify_service.workflow(token, current_user.id, inputs):
                                # print(rag_response)
                                try:
@@ -527,7 +563,13 @@
                                                    download_url = outputs.get("download_url", "")
                                                else:
                                                    message = answer.get("error", "")

                                                    if download_url:
                                                        files = [{
                                                            "type": "document",
                                                            "transfer_method": "remote_url",
                                                            "url": download_url,
                                                            "upload_file_id": ""
                                                        }]
                                                result = {"message": message, "type": "message", "download_url": download_url}
                                                try:
                                                    SessionService(db).update_session(chat_id,

 app/api/files.py

@@ -18,6 +18,7 @@
from app.service.bisheng import BishengService
from app.service.common.api_token import DfTokenDao
from app.service.difyService import DifyService
from app.service.files import read_file
from app.service.ragflow import RagflowService
from app.service.service_token import get_ragflow_token, get_bisheng_token
import urllib.parse
@@ -124,7 +125,11 @@
                except Exception as e:
                    return Response(code=400, msg=str(e))
                try:
                    file_upload = await dify_service.upload(token, f.filename, file_content, current_user.id)
                    filename = f.filename
                    file_upload = await dify_service.upload(token, filename, file_content, current_user.id)
                    print(file_upload)
                    tokens = await read_file(file_content, filename, f.content_type)
                    file_upload["tokens"] = tokens
                    result.append(file_upload)
                except Exception as e:
                    raise HTTPException(status_code=500, detail=str(e))

 app/config/config.py

@@ -23,6 +23,7 @@
    dify_workflow_clean: str = ''
    dify_workflow_report: str = ''
    postgresql_database_url: str = ''
    max_report_tokens: int = 100000
    def __init__(self, **kwargs):
        # 替换配置中的IP地址
        host_ip = os.getenv('HOST_IP', '127.0.0.1')

 app/config/config.yaml

@@ -21,3 +21,4 @@
postgresql_database_url: postgresql+asyncpg://kong:kongpass@192.168.20.119:5432/kong
dify_workflow_clean: app-OpF0drPu0XcgqcekQpT4FA8a
dify_workflow_report: app-0MAkdFWqh9zxwmU69O0BFU1s
max_report_tokens: 100000

 app/service/files.py

New file
@@ -0,0 +1,53 @@
import fitz
import io
from docx import Document
from dashscope import get_tokenizer  # dashscope版本 >= 1.14.0

from app.service.auth import decode_access_token


async def get_str_token(input_str):
    """Return the number of tokens ``input_str`` encodes to.

    The text is encoded with the DashScope tokenizer for ``qwen-turbo``
    (the tokenizer currently only supports the Tongyi Qianwen / Qwen model
    family), and the length of the resulting token-id sequence is returned.

    Args:
        input_str: Text to tokenize.

    Returns:
        Count of token ids produced by the tokenizer.
    """
    return len(get_tokenizer('qwen-turbo').encode(input_str))

async def read_pdf(pdf_stream):
    """Extract the text of every page of a PDF and return it as one string.

    Args:
        pdf_stream: In-memory PDF content, passed straight to
            ``fitz.open(stream=...)`` (presumably raw bytes; confirm
            against the caller).

    Returns:
        The per-page text from ``page.get_text()``, concatenated in page
        order with no extra separator.
    """
    # The context manager closes the document once extraction is done.
    with fitz.open(stream=pdf_stream, filetype="pdf") as pdf_document:
        return "".join(page.get_text() for page in pdf_document)


async def read_word(word_stream):
    """Extract the paragraph text of a .docx document.

    Args:
        word_stream: Bytes-like content of the .docx file; it is wrapped in
            ``io.BytesIO`` so python-docx can read it as a file object.

    Returns:
        The text of every entry in ``doc.paragraphs``, one paragraph per
        line. Only ``doc.paragraphs`` is read; nothing else in the document
        is visited.
    """
    # Open the Word document from the in-memory stream with python-docx.
    doc = Document(io.BytesIO(word_stream))

    # Join with a newline so the last word of one paragraph does not fuse
    # with the first word of the next, which would distort token counts.
    return "\n".join(para.text for para in doc.paragraphs)


async def read_file(file, filename, content_type):
    """Count the tokens in the text of an uploaded PDF or .docx file.

    The format is chosen from the MIME type or, failing that, from the file
    extension. The extension is compared case-insensitively so names such as
    ``REPORT.PDF`` are recognised.

    Args:
        file: File content, handed unchanged to ``read_pdf`` / ``read_word``.
        filename: Original file name; ``None`` or empty is tolerated.
        content_type: MIME type reported for the upload; may be ``None``.

    Returns:
        Token count of the extracted text, or 0 when the format is not
        supported or no text was extracted.
    """
    # Normalise once: guards against a missing filename and mixed-case
    # extensions.
    lowered_name = (filename or "").lower()

    if content_type == "application/pdf" or lowered_name.endswith(".pdf"):
        # Extract the PDF content.
        text = await read_pdf(file)
    elif (
        content_type
        == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        or lowered_name.endswith(".docx")
    ):
        text = await read_word(file)
    else:
        text = ""

    if not text:
        # Nothing to tokenize: avoid loading the tokenizer just to count
        # zero tokens.
        return 0

    return await get_str_token(text)

 requirements.txt

Binary files differ

			@@ -436,6 +436,10 @@
			title_number = receive_message.get('title_number', 8)
			title_style = receive_message.get('title_style', "")
			title_query = receive_message.get('title_query', "")
			is_clean = receive_message.get('is_clean', 0)
			file_type = receive_message.get('file_type', 1)
			max_token = receive_message.get('max_tokens', 100000)
			tokens = receive_message.get('tokens', 0)
			if upload_files:
			title_query = "start"
			# if not upload_files:
			@@ -456,15 +460,27 @@
			}
			files = []
			for file in upload_files:
			if file_type == 1:
			files.append({
			"type": "document",
			"transfer_method": "local_file",
			"url": "",
			"upload_file_id": file
			})
			else:
			files.append({
			"type": "document",
			"transfer_method": "remote_url",
			"url": file,
			"upload_file_id": ""
			})
			inputs_list = []
			token_list = []
			if workflow_type == 1:
			inputs["input_files"] = files
			elif workflow_type == 2:
			inputs_list.append(inputs)
			token_list.append(token)
			elif workflow_type == 2 and is_clean == 0:
			inputs["file_list"] = files
			inputs["Completion_of_main_indicators"] = title
			inputs["sub_titles"] = sub_titles
			@@ -472,6 +488,8 @@
			if not token:
			await websocket.send_json(
			{"message": "Invalid token document_to_report", "type": "error"})
			inputs_list.append(inputs)
			token_list.append(token)
			elif workflow_type == 3:
			inputs["file_list"] = files
			inputs["number_of_title"] = title_number
			@@ -480,9 +498,27 @@
			if not token:
			await websocket.send_json(
			{"message": "Invalid token document_to_title", "type": "error"})
			# inputs_list.append(inputs)
			# token_list.append(token)
			elif workflow_type == 2 and is_clean == 1:
			# inputs["input_files"] = files
			inputs_list.append(inputs)
			token_list.append(token)
			inputs1 = {}
			# inputs1["file_list"] = files
			inputs1["Completion_of_main_indicators"] = title
			inputs1["sub_titles"] = sub_titles
			token = DfTokenDao(db).get_token_by_id(DOCUMENT_TO_REPORT_TITLE)
			if not token:
			await websocket.send_json(
			{"message": "Invalid token document_to_report", "type": "error"})
			inputs_list.append(inputs1)
			token_list.append(token)

			complete_response = ""
			if workflow_type == 1 or workflow_type == 2:
			for inputs in inputs_list:
			inputs["input_files"] = files
			async for rag_response in dify_service.workflow(token, current_user.id, inputs):
			# print(rag_response)
			try:
			@@ -527,7 +563,13 @@
			download_url = outputs.get("download_url", "")
			else:
			message = answer.get("error", "")

			if download_url:
			files = [{
			"type": "document",
			"transfer_method": "remote_url",
			"url": download_url,
			"upload_file_id": ""
			}]
			result = {"message": message, "type": "message", "download_url": download_url}
			try:
			SessionService(db).update_session(chat_id,

			@@ -18,6 +18,7 @@
			from app.service.bisheng import BishengService
			from app.service.common.api_token import DfTokenDao
			from app.service.difyService import DifyService
			from app.service.files import read_file
			from app.service.ragflow import RagflowService
			from app.service.service_token import get_ragflow_token, get_bisheng_token
			import urllib.parse
			@@ -124,7 +125,11 @@
			except Exception as e:
			return Response(code=400, msg=str(e))
			try:
			file_upload = await dify_service.upload(token, f.filename, file_content, current_user.id)
			filename = f.filename
			file_upload = await dify_service.upload(token, filename, file_content, current_user.id)
			print(file_upload)
			tokens = await read_file(file_content, filename, f.content_type)
			file_upload["tokens"] = tokens
			result.append(file_upload)
			except Exception as e:
			raise HTTPException(status_code=500, detail=str(e))

			@@ -23,6 +23,7 @@
			dify_workflow_clean: str = ''
			dify_workflow_report: str = ''
			postgresql_database_url: str = ''
			max_report_tokens: int = 100000
			def __init__(self, **kwargs):
			# 替换配置中的IP地址
			host_ip = os.getenv('HOST_IP', '127.0.0.1')

			@@ -21,3 +21,4 @@
			postgresql_database_url: postgresql+asyncpg://kong:kongpass@192.168.20.119:5432/kong
			dify_workflow_clean: app-OpF0drPu0XcgqcekQpT4FA8a
			dify_workflow_report: app-0MAkdFWqh9zxwmU69O0BFU1s
			max_report_tokens: 100000

New file
			@@ -0,0 +1,53 @@
			import fitz
			import io
			from docx import Document
			from dashscope import get_tokenizer # dashscope版本 >= 1.14.0

			from app.service.auth import decode_access_token


			async def get_str_token(input_str):
			# 获取tokenizer对象，目前只支持通义千问系列模型
			tokenizer = get_tokenizer('qwen-turbo')
			# 将字符串切分成token并转换为token id
			tokens = tokenizer.encode(input_str)
			# print(f"经过切分后的token id为：{tokens}。")
			# # 经过切分后的token id为： [31935, 64559, 99320, 56007, 100629, 104795, 99788, 1773]
			# print(f"经过切分后共有{len(tokens)}个token")
			# # 经过切分后共有8个token
			#
			# # 将token id转化为字符串并打印出来
			# for i in range(len(tokens)):
			# print(f"token id为{tokens[i]}对应的字符串为：{tokenizer.decode(tokens[i])}")
			return len(tokens)

			async def read_pdf(pdf_stream):
			text = ""
			with fitz.open(stream=pdf_stream, filetype="pdf") as pdf_document:
			for page in pdf_document:
			text += page.get_text()
			return text


			async def read_word(word_stream):
			# 使用 python-docx 打开 Word 文件流
			doc = Document(io.BytesIO(word_stream))

			# 提取每个段落的文本
			text = ""
			for para in doc.paragraphs:
			text += para.text

			return text


			async def read_file(file, filename, content_type):
			text = ""
			if content_type == "application/pdf" or filename.endswith('.pdf'):

			# 提取 PDF 内容
			text = await read_pdf(file)
			elif content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" or filename.endswith('.docx'):
			text = await read_word(file)

			return await get_str_token(text)