import io
import json

import fitz  # PyMuPDF
from dashscope import get_tokenizer  # requires dashscope >= 1.14.0
from docx import Document

from app.models import ComplexChatSessionDao
from app.service.auth import decode_access_token


async def get_str_token(input_str):
    # Get a tokenizer object; currently only the Tongyi Qianwen (Qwen) model
    # series is supported.
    tokenizer = get_tokenizer('qwen-turbo')
    # Split the string into tokens (token ids) and return the count.
    tokens = tokenizer.encode(input_str)
    return len(tokens)


async def read_pdf(pdf_stream):
    # Extract the plain text of every page in the PDF stream.
    text = ""
    with fitz.open(stream=pdf_stream, filetype="pdf") as pdf_document:
        for page in pdf_document:
            text += page.get_text()
    return text


async def read_word(word_stream):
    # Open the Word file stream with python-docx.
    doc = Document(io.BytesIO(word_stream))
    # Extract the text of every paragraph.
    text = ""
    for para in doc.paragraphs:
        text += para.text
    return text


async def read_file(file, filename, content_type):
    # Dispatch on MIME type or file extension, extract the text, and return
    # its token count.
    text = ""
    if content_type == "application/pdf" or filename.endswith('.pdf'):
        text = await read_pdf(file)
    elif (content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
          or filename.endswith('.docx')):
        text = await read_word(file)
    return await get_str_token(text)


async def service_chat_message(db, message_id: str):
    # Look up a chat session by id and return its title (the original query)
    # and content.
    message = await ComplexChatSessionDao(db).get_session_by_id(message_id)
    content = ""
    title = ""
    if message:
        content = message.content
        title = json.loads(message.query).get("query")
    return title, content


async def generate_word_document(title, content):
    doc = Document()
    # Add the title as a level-1 heading.
    doc.add_heading(title, level=1)
    # Split the content into paragraphs and write each into the document.
    for paragraph in content.split('\n'):
        doc.add_paragraph(paragraph)
    return doc
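
# A minimal usage sketch, guarded so it only runs when the module is executed
# directly. The file name "example.pdf" and the sample title/content are
# hypothetical placeholders; in the service, these values come from the upload
# endpoint and the chat-session database instead.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        # Count tokens for a hypothetical local PDF on disk.
        with open("example.pdf", "rb") as f:
            n_tokens = await read_file(f.read(), "example.pdf", "application/pdf")
        print(f"token count: {n_tokens}")

        # Build a Word document from plain text and save it to disk.
        doc = await generate_word_document(
            "Demo title", "First paragraph\nSecond paragraph"
        )
        doc.save("demo.docx")

    asyncio.run(_demo())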