import json
|
|
import fitz
|
import io
|
from docx import Document
|
from dashscope import get_tokenizer # dashscope版本 >= 1.14.0
|
|
from app.models import ComplexChatSessionDao
|
from app.service.auth import decode_access_token
|
|
|
async def get_str_token(input_str):
    """Return the number of tokens that ``input_str`` encodes to.

    The string is encoded with the tokenizer that ``get_tokenizer`` returns
    for ``'qwen-turbo'``, and the length of the resulting token-id sequence
    is returned.

    Note (carried over from the original comment): the tokenizer currently
    only supports the Tongyi Qianwen (Qwen) model family.
    """
    # Encode the text into token ids; only the number of ids is needed.
    token_ids = get_tokenizer('qwen-turbo').encode(input_str)
    return len(token_ids)
|
|
|
async def read_pdf(pdf_stream):
    """Extract the text of every page of a PDF.

    ``pdf_stream`` is passed to ``fitz.open(stream=..., filetype="pdf")``
    and the opened document is used as a context manager.

    Returns:
        The result of ``page.get_text()`` for each page, concatenated in
        page order with no separator between pages.
    """
    with fitz.open(stream=pdf_stream, filetype="pdf") as pdf_document:
        # Collect every page's text while the document is still open.
        page_texts = [page.get_text() for page in pdf_document]
    return "".join(page_texts)
|
|
|
async def read_word(word_stream):
    """Extract the paragraph text of a Word (.docx) document.

    Args:
        word_stream: Bytes of the .docx file. They are wrapped in
            ``io.BytesIO`` so ``Document`` can read them as a file-like
            object.

    Returns:
        The ``text`` of every entry in ``doc.paragraphs``, in order, joined
        with a newline between consecutive paragraphs. An empty string is
        returned when the document has no paragraphs.
    """
    # Open the Word file from the in-memory byte stream with python-docx.
    doc = Document(io.BytesIO(word_stream))

    # Join with "\n" rather than concatenating directly: without a separator
    # the last word of one paragraph fuses with the first word of the next,
    # which corrupts the extracted text (and any token count based on it).
    # "\n" is also the paragraph delimiter used by generate_word_document.
    return "\n".join(para.text for para in doc.paragraphs)
|
|
|
async def read_file(file, filename, content_type):
    """Count the tokens in the text of an uploaded PDF or .docx file.

    The file type is decided by ``content_type`` first and by the file-name
    extension as a fallback. PDF content is extracted with ``read_pdf`` and
    .docx content with ``read_word``. Any other type is treated as empty
    text.

    Args:
        file: File content, forwarded unchanged to ``read_pdf`` or
            ``read_word``.
        filename: Name of the uploaded file; may be ``None`` or empty.
        content_type: MIME type reported for the upload.

    Returns:
        The value of ``get_str_token`` for the extracted text (for an
        unsupported file type, the token count of the empty string).
    """
    # Normalise the name once: a missing filename must not raise
    # AttributeError, and extensions such as ".PDF" / ".DOCX" should be
    # recognised just like their lower-case forms.
    lowered_name = (filename or "").lower()

    if content_type == "application/pdf" or lowered_name.endswith('.pdf'):
        # Extract the PDF content.
        text = await read_pdf(file)
    elif (
        content_type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        or lowered_name.endswith('.docx')
    ):
        text = await read_word(file)
    else:
        # Unsupported type: nothing to extract.
        text = ""

    return await get_str_token(text)
|
|
|
async def service_chat_message(db, message_id: str):
    """Fetch a stored record and return its ``(title, content)`` pair.

    The record is loaded through
    ``ComplexChatSessionDao(db).get_session_by_id(message_id)``.

    Returns:
        A 2-tuple ``(title, content)``:
        * ``("", "")`` when the lookup returns a falsy value;
        * otherwise ``content`` is the record's ``content`` attribute and
          ``title`` is the ``"query"`` entry of the record's ``query``
          attribute after decoding it as JSON (``None`` if that key is
          absent).

    Raises:
        Whatever ``json.loads`` raises when ``query`` is not valid JSON
        input.
    """
    record = await ComplexChatSessionDao(db).get_session_by_id(message_id)
    if not record:
        # Nothing found: fall back to empty title and content.
        return "", ""

    body = record.content
    # NOTE(review): assumes record.query is a JSON object string - confirm.
    decoded_query = json.loads(record.query)
    heading = decoded_query.get("query")
    return heading, body
|
|
|
async def generate_word_document(title, content):
    """Build a Word document from a title and newline-separated content.

    Args:
        title: Text added as a level-1 heading.
        content: Body text; it is split on ``'\\n'`` and each piece becomes
            its own paragraph (empty pieces included).

    Returns:
        The populated ``Document`` object. Nothing is saved here.
    """
    word_doc = Document()

    # The heading goes in first so it precedes all body paragraphs.
    word_doc.add_heading(title, level=1)

    # One document paragraph per newline-delimited piece of the content.
    pieces = content.split('\n')
    for piece in pieces:
        word_doc.add_paragraph(piece)

    return word_doc
|