File size: 6,507 Bytes
36cac86 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 |
import gradio as gr
import os
import json
import shutil
from datetime import datetime
from retriever import retriever, reload_retriever
from generator import answer_query
from langchain_community.document_loaders import (
PyPDFLoader, TextLoader, CSVLoader, UnstructuredWordDocumentLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import html
# Path to the CSS file used to style the Gradio interface
CUSTOM_CSS_PATH = "gradio_theme.css"
# Upload registry: the JSON file it is persisted to, and the in-memory list
# of {"name", "timestamp"} entries that mirrors it
UPLOADED_FILES_JSON = "uploaded_files.json"
uploaded_files = []
def save_uploaded_files_to_json():
    """Persist the in-memory ``uploaded_files`` list to ``UPLOADED_FILES_JSON``.

    The JSON is written to a sibling ``.tmp`` file and then moved over the
    real file with ``os.replace``. Writing in place would truncate the
    registry first, so an interruption mid-write would leave behind a file
    that ``json.load`` can no longer parse.
    """
    tmp_path = f"{UPLOADED_FILES_JSON}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(uploaded_files, f, ensure_ascii=False, indent=2)
    # The rename only happens once the full document has been written.
    os.replace(tmp_path, UPLOADED_FILES_JSON)
def load_uploaded_files_from_json():
    """Load the upload registry from ``UPLOADED_FILES_JSON`` into ``uploaded_files``.

    A missing file, a file that is not valid JSON, or JSON whose top level is
    not a list all result in an empty registry instead of an exception. This
    function is called at module import, so raising here would stop the app
    from starting.
    """
    global uploaded_files
    try:
        with open(UPLOADED_FILES_JSON, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        # First run: nothing has been uploaded yet.
        loaded = []
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # Corrupt registry: report it and start over rather than crash.
        print(f"Ignoring unreadable {UPLOADED_FILES_JSON}: {exc}")
        loaded = []
    # Other code iterates and appends to the registry, so it must be a list.
    uploaded_files = loaded if isinstance(loaded, list) else []
def update_uploaded_files():
    """Render the upload registry as a Markdown string.

    Returns a placeholder sentence when the registry is empty, otherwise a
    heading followed by one bullet per entry showing its name and the first
    19 characters of its timestamp.
    """
    if uploaded_files:
        bullets = []
        for entry in uploaded_files:
            name = entry['name']
            stamp = entry['timestamp'][:19]
            bullets.append(f"- {name} (Uploaded: {stamp})")
        heading = "### 📚 Danh sách tài liệu đã xử lý:\n"
        return heading + "\n".join(bullets)
    return "_Chưa có tài liệu nào được tải lên._"
# Load the persisted upload registry once, when the module is first run
load_uploaded_files_from_json()
def process_document(file):
    """Index an uploaded document and record it in the upload registry.

    The document is loaded, split into chunks, embedded, and saved as the
    FAISS index in ``vectorstore``. The index is built from this file alone,
    so it replaces whatever was indexed before, while the registry keeps
    every earlier entry.

    Args:
        file: Value delivered by the ``gr.File`` input. Either an object
            whose ``name`` attribute is the file path, a plain path string,
            or ``None`` when no file was selected.

    Returns:
        A ``(status_message, registry_markdown)`` tuple for the two outputs.
    """
    if file is None:
        # The button was clicked without a file selected.
        return "⚠️ Vui lòng chọn tài liệu trước khi tải lên.", update_uploaded_files()

    file_path = getattr(file, "name", file)
    # Match extensions case-insensitively so e.g. "REPORT.PDF" is accepted.
    lowered = file_path.lower()
    try:
        if lowered.endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        elif lowered.endswith(".csv"):
            loader = CSVLoader(file_path)
        elif lowered.endswith(".txt"):
            # autodetect_encoding was added to fix a .txt loading error
            # (per the original author's note).
            loader = TextLoader(file_path, autodetect_encoding=True)
        elif lowered.endswith((".docx", ".doc")):
            loader = UnstructuredWordDocumentLoader(file_path)
        else:
            return "❌ Định dạng file không hỗ trợ.", update_uploaded_files()
        documents = loader.load()
    except Exception as e:
        # UI boundary: report any loader failure as a status message.
        return f"❌ Lỗi khi tải tài liệu: {e}", update_uploaded_files()

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = splitter.split_documents(documents)
    if not docs:
        return "⚠️ Không trích xuất được nội dung từ tài liệu.", update_uploaded_files()

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = FAISS.from_documents(docs, embeddings)

    # Remove the previous index only after the new one has been built, so a
    # rejected or unreadable upload never destroys the working index.
    if os.path.exists("vectorstore"):
        shutil.rmtree("vectorstore")
    db.save_local("vectorstore")
    reload_retriever()

    uploaded_files.append({"name": os.path.basename(file_path), "timestamp": datetime.now().isoformat()})
    save_uploaded_files_to_json()
    return f"✅ Đã xử lý {len(docs)} đoạn từ **{file_path}**", update_uploaded_files()
def delete_file(filename):
    """Remove every registry entry whose name equals ``filename``.

    Surrounding whitespace in ``filename`` is ignored. Only the registry and
    its JSON file are updated; the ``vectorstore`` index is not touched.
    Returns the refreshed registry Markdown.
    """
    global uploaded_files
    target = filename.strip()
    kept = []
    for entry in uploaded_files:
        if entry["name"] != target:
            kept.append(entry)
    uploaded_files = kept
    save_uploaded_files_to_json()
    return update_uploaded_files()
def clear_inputs():
    """Return two empty strings, one for each component being cleared."""
    blank = ""
    return blank, blank
def query_function(question, model_choice, temperature, include_sources):
    """Answer a question and optionally append the list of sources.

    Args:
        question: The user's question text.
        model_choice: Model name forwarded to ``answer_query`` as ``model``.
        temperature: Sampling temperature forwarded to ``answer_query``.
        include_sources: When true, append a de-duplicated, sorted list of
            source labels taken from the returned documents' metadata.

    Returns:
        The HTML-escaped answer as a string, followed by a
        "Nguồn tham khảo" list when sources are requested and available.
        A blank question returns a prompt to enter one, without calling the
        model.
    """
    if not question or not question.strip():
        return "⚠️ Vui lòng nhập câu hỏi."

    answer, docs = answer_query(question, model=model_choice, temperature=temperature)
    answer = html.escape(answer)
    if not (include_sources and docs):
        return answer

    unique_sources = set()
    for doc in docs:
        section = doc.metadata.get("section")
        if section:
            unique_sources.add(section.strip())
        else:
            # "or" also covers metadata that stores source as None.
            source = doc.metadata.get("source") or "Unknown"
            unique_sources.add(os.path.basename(source).strip())

    if unique_sources:
        # Labels come from document metadata, so escape them like the answer.
        sources_list = [f"- {html.escape(src)}" for src in sorted(unique_sources)]
        answer += "\n\n**Nguồn tham khảo:**\n" + "\n".join(sources_list)
    return answer
# Gradio interface
# Read the stylesheet and pass its text to Gradio. Newer Gradio releases
# document `css=` as CSS code rather than a file path, so passing the path
# itself can leave the theme unapplied.
if os.path.exists(CUSTOM_CSS_PATH):
    with open(CUSTOM_CSS_PATH, "r", encoding="utf-8") as css_file:
        custom_css = css_file.read()
else:
    custom_css = None

# NOTE(review): the nesting of the layout containers below was reconstructed;
# confirm it matches the intended layout.
with gr.Blocks(css=custom_css) as demo:
    with gr.Row():
        with gr.Column(scale=5):
            gr.Markdown("## 🔍 RAGFlow Enterprise Search\nTìm kiếm thông minh từ tài liệu nội bộ", elem_classes="container-box")
    with gr.Tabs():
        # Search tab: question, model settings, and the rendered answer.
        with gr.TabItem("🔍 Tìm kiếm"):
            with gr.Column(elem_classes="container-box"):
                question = gr.Textbox(lines=3, label="Câu hỏi")
                with gr.Row():
                    model_choice = gr.Dropdown(["Gemini Pro", "GPT-3.5", "GPT-4", "Claude"], value="Gemini Pro", label="Mô hình")
                    temperature = gr.Slider(0, 1, value=0.2, step=0.1, label="Temperature")
                    include_sources = gr.Checkbox(label="Hiển thị nguồn", value=True)
                with gr.Row():
                    search_btn = gr.Button("🔍 Tìm kiếm", variant="primary", elem_classes="button-primary")
                    clear_btn = gr.Button("🗑️ Xóa", variant="secondary", elem_classes="button-secondary")
                output = gr.Markdown(elem_classes="output-box")  # Show the result inside a styled box
            search_btn.click(query_function, inputs=[question, model_choice, temperature, include_sources], outputs=[output])
            clear_btn.click(clear_inputs, outputs=[question, output])
        # Document management tab: upload/index a file, remove registry entries.
        with gr.TabItem("📚 Quản lý tài liệu"):
            with gr.Column(elem_classes="container-box"):
                upload_file = gr.File(label="Tải lên tài liệu", file_types=[".pdf", ".docx", ".doc", ".csv", ".txt"])
                upload_btn = gr.Button("📄 Tải lên và xử lý", variant="primary")
                upload_status = gr.Textbox(label="Trạng thái", lines=3, interactive=False)
                uploaded_files_list = gr.Markdown(value=update_uploaded_files(), elem_classes="scroll-box")
            with gr.Column(elem_classes="container-box"):
                delete_filename = gr.Textbox(label="Tên file muốn xóa")
                delete_btn = gr.Button("🗑️ Xóa tài liệu", variant="secondary")
            upload_btn.click(process_document, inputs=[upload_file], outputs=[upload_status, uploaded_files_list])
            delete_btn.click(delete_file, inputs=[delete_filename], outputs=[uploaded_files_list])
# NOTE(review): share=True requests a public share link; confirm that is
# intended for an app serving internal documents.
demo.launch(share=True)
|