|
import gradio as gr
|
|
import os
|
|
import json
|
|
import shutil
|
|
from datetime import datetime
|
|
from retriever import retriever, reload_retriever
|
|
from generator import answer_query
|
|
from langchain_community.document_loaders import (
|
|
PyPDFLoader, TextLoader, CSVLoader, UnstructuredWordDocumentLoader
|
|
)
|
|
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
from langchain_huggingface import HuggingFaceEmbeddings
|
|
from langchain_community.vectorstores import FAISS
|
|
import html
|
|
|
|
|
|
# Stylesheet reference handed to gr.Blocks(css=...) when the UI is built.
CUSTOM_CSS_PATH = "gradio_theme.css"

# JSON manifest used to persist the upload history between runs.
UPLOADED_FILES_JSON = "uploaded_files.json"

# In-memory upload history; entries are dicts with "name" and "timestamp" keys.
uploaded_files = []
|
|
|
|
def save_uploaded_files_to_json():
    """Persist the in-memory upload history to the JSON manifest.

    The manifest is rewritten in full as UTF-8, keeping non-ASCII characters
    verbatim and using a two-space indent.

    Raises:
        TypeError: If an entry of ``uploaded_files`` is not JSON-serializable.
            The existing manifest is left untouched in that case.
        OSError: If the manifest cannot be opened or written.
    """
    # Serialize before opening: opening with "w" truncates the file, so a
    # serialization error raised afterwards would wipe the previous manifest.
    serialized = json.dumps(uploaded_files, ensure_ascii=False, indent=2)
    with open(UPLOADED_FILES_JSON, "w", encoding="utf-8") as manifest:
        manifest.write(serialized)
|
|
|
|
def load_uploaded_files_from_json():
    """Populate the module-level ``uploaded_files`` list from the JSON manifest.

    A missing manifest yields an empty history. An empty, corrupt or
    wrongly-shaped manifest is also treated as an empty history (with a console
    warning) instead of raising, because this function runs while the module is
    being loaded. Entries that are not dicts carrying string ``"name"`` and
    ``"timestamp"`` values are dropped, since the listing code indexes both.
    """
    global uploaded_files

    # EAFP: open directly rather than checking existence first, which avoids a
    # race between the check and the open.
    try:
        with open(UPLOADED_FILES_JSON, "r", encoding="utf-8") as manifest:
            loaded = json.load(manifest)
    except FileNotFoundError:
        uploaded_files = []
        return
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        print(f"Warning: ignoring unreadable {UPLOADED_FILES_JSON}: {exc}")
        uploaded_files = []
        return

    if not isinstance(loaded, list):
        print(f"Warning: ignoring {UPLOADED_FILES_JSON}: expected a JSON list")
        uploaded_files = []
        return

    uploaded_files = [
        entry
        for entry in loaded
        if isinstance(entry, dict)
        and isinstance(entry.get("name"), str)
        and isinstance(entry.get("timestamp"), str)
    ]
|
|
|
|
def update_uploaded_files():
    """Render the upload history as a Markdown string.

    Returns:
        A placeholder sentence when the history is empty; otherwise a heading
        followed by one bullet per entry showing the file name and the first
        19 characters of its timestamp.
    """
    if uploaded_files:
        bullets = []
        for entry in uploaded_files:
            # Keep only the leading 19 characters of the stored timestamp.
            uploaded_at = entry["timestamp"][:19]
            bullets.append(f"- {entry['name']} (Uploaded: {uploaded_at})")
        return "### 📚 Danh sách tài liệu đã xử lý:\n" + "\n".join(bullets)
    return "_Chưa có tài liệu nào được tải lên._"
|
|
|
|
|
|
# Restore the upload history from the JSON manifest at module load, so the
# listing rendered when the UI is built reflects earlier uploads.
load_uploaded_files_from_json()
|
|
|
|
def process_document(file):
    """Index an uploaded document into the local FAISS vectorstore.

    The file is loaded with the loader matching its extension, split into
    overlapping chunks, embedded, and saved to the ``vectorstore`` directory,
    replacing whatever index was there. On success the upload is recorded in
    the history manifest and the retriever is reloaded.

    Args:
        file: The value delivered by the upload component: either an object
            exposing the file path as ``.name`` or a plain path string. May be
            ``None`` when nothing was selected.

    Returns:
        A ``(status_message, uploaded_files_markdown)`` tuple.
    """
    if file is None:
        return "⚠️ Chưa chọn tài liệu để tải lên.", update_uploaded_files()

    # Accept both wrapper objects with a ``.name`` path and plain str paths.
    file_path = getattr(file, "name", file)
    display_name = os.path.basename(file_path)

    try:
        loader = _create_loader(file_path)
        if loader is None:
            return "❌ Định dạng file không hỗ trợ.", update_uploaded_files()
        documents = loader.load()
    except Exception as e:
        # UI boundary: report any loader failure to the user instead of raising.
        return f"❌ Lỗi khi tải tài liệu: {e}", update_uploaded_files()

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = splitter.split_documents(documents)
    if not docs:
        return "⚠️ Không trích xuất được nội dung từ tài liệu.", update_uploaded_files()

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = FAISS.from_documents(docs, embeddings)

    # Remove the previous index only once the new one has been built, so an
    # unsupported, unreadable or empty upload no longer destroys a working index.
    if os.path.exists("vectorstore"):
        shutil.rmtree("vectorstore")
    db.save_local("vectorstore")
    reload_retriever()

    # NOTE(review): the index is replaced on every upload while this history
    # keeps accumulating — confirm the listing is meant to be a log of uploads.
    uploaded_files.append({"name": display_name, "timestamp": datetime.now().isoformat()})
    save_uploaded_files_to_json()

    return f"✅ Đã xử lý {len(docs)} đoạn từ **{display_name}**", update_uploaded_files()


def _create_loader(file_path):
    """Return the document loader for ``file_path``'s extension, or None if unsupported."""
    lowered = file_path.lower()  # match extensions case-insensitively (.PDF, .Docx)
    if lowered.endswith(".pdf"):
        return PyPDFLoader(file_path)
    if lowered.endswith(".csv"):
        return CSVLoader(file_path)
    if lowered.endswith(".txt"):
        return TextLoader(file_path, autodetect_encoding=True)
    if lowered.endswith((".docx", ".doc")):
        return UnstructuredWordDocumentLoader(file_path)
    return None
|
|
|
|
def delete_file(filename):
    """Remove every history entry named ``filename`` and persist the result.

    Only the upload history is modified here; this function does not touch
    the vectorstore.

    Args:
        filename: Name to remove; surrounding whitespace is ignored.

    Returns:
        The refreshed Markdown listing of uploaded files.
    """
    global uploaded_files
    target = filename.strip()
    remaining = [entry for entry in uploaded_files if entry["name"] != target]
    uploaded_files = remaining
    save_uploaded_files_to_json()
    return update_uploaded_files()
|
|
|
|
def clear_inputs():
    """Return two empty strings, used to blank the question box and the answer pane."""
    blank = ""
    return blank, blank
|
|
|
|
def query_function(question, model_choice, temperature, include_sources):
    """Answer a question and format the result as Markdown.

    Args:
        question: The user's question text.
        model_choice: Model label forwarded to ``answer_query`` as ``model``.
        temperature: Sampling temperature forwarded to ``answer_query``.
        include_sources: When true, append a de-duplicated, sorted list of the
            sources of the retrieved documents.

    Returns:
        The HTML-escaped answer, optionally followed by a source list, or a
        prompt to enter a question when ``question`` is blank.
    """
    # Do not spend a model call on an empty or whitespace-only question.
    if not question or not question.strip():
        return "⚠️ Vui lòng nhập câu hỏi."

    answer, docs = answer_query(question, model=model_choice, temperature=temperature)
    answer = html.escape(answer)

    if include_sources and docs:
        labels = _collect_source_labels(docs)
        if labels:
            # Source labels come from document metadata (e.g. file names), so
            # they are escaped the same way as the answer text.
            bullets = "\n".join(f"- {html.escape(label)}" for label in labels)
            answer += "\n\n**Nguồn tham khảo:**\n" + bullets
    return answer


def _collect_source_labels(docs):
    """Return sorted, unique labels: each doc's section, else its source file name."""
    labels = set()
    for doc in docs:
        section = doc.metadata.get("section")
        section_label = str(section).strip() if section else ""
        if section_label:
            labels.add(section_label)
        else:
            # ``or`` also covers a "source" key that is present but None/empty.
            source = doc.metadata.get("source") or "Unknown"
            labels.add(os.path.basename(str(source)).strip())
    labels.discard("")  # a blank label would render as an empty bullet
    return sorted(labels)
|
|
|
|
|
|
# Build the two-tab interface: a search tab and a document-management tab.
# NOTE(review): the nesting of the layout containers below is reconstructed
# from statement order and grouping — confirm it matches the intended layout.
with gr.Blocks(css=CUSTOM_CSS_PATH) as demo:
    # Page header.
    with gr.Row():
        with gr.Column(scale=5):
            gr.Markdown(
                "## 🔍 RAGFlow Enterprise Search\nTìm kiếm thông minh từ tài liệu nội bộ",
                elem_classes="container-box",
            )

    with gr.Tabs():
        # --- Search tab: question, model settings, answer output. ---
        with gr.TabItem("🔍 Tìm kiếm"):
            with gr.Column(elem_classes="container-box"):
                question = gr.Textbox(lines=3, label="Câu hỏi")
                with gr.Row():
                    model_choice = gr.Dropdown(
                        ["Gemini Pro", "GPT-3.5", "GPT-4", "Claude"],
                        value="Gemini Pro",
                        label="Mô hình",
                    )
                    temperature = gr.Slider(0, 1, value=0.2, step=0.1, label="Temperature")
                    include_sources = gr.Checkbox(label="Hiển thị nguồn", value=True)
                with gr.Row():
                    search_btn = gr.Button(
                        "🔍 Tìm kiếm",
                        variant="primary",
                        elem_classes="button-primary",
                    )
                    clear_btn = gr.Button(
                        "🗑️ Xóa",
                        variant="secondary",
                        elem_classes="button-secondary",
                    )
                output = gr.Markdown(elem_classes="output-box")

            # Search renders the answer; clear blanks the question and the answer.
            search_btn.click(
                query_function,
                inputs=[question, model_choice, temperature, include_sources],
                outputs=[output],
            )
            clear_btn.click(clear_inputs, outputs=[question, output])

        # --- Document-management tab: upload, listing, deletion. ---
        with gr.TabItem("📚 Quản lý tài liệu"):
            with gr.Column(elem_classes="container-box"):
                upload_file = gr.File(
                    label="Tải lên tài liệu",
                    file_types=[".pdf", ".docx", ".doc", ".csv", ".txt"],
                )
                upload_btn = gr.Button("📄 Tải lên và xử lý", variant="primary")
                upload_status = gr.Textbox(label="Trạng thái", lines=3, interactive=False)
                # The initial listing is computed once, while the UI is being built.
                uploaded_files_list = gr.Markdown(
                    value=update_uploaded_files(),
                    elem_classes="scroll-box",
                )
            with gr.Column(elem_classes="container-box"):
                delete_filename = gr.Textbox(label="Tên file muốn xóa")
                delete_btn = gr.Button("🗑️ Xóa tài liệu", variant="secondary")

            # Both handlers refresh the listing; upload also reports a status line.
            upload_btn.click(
                process_document,
                inputs=[upload_file],
                outputs=[upload_status, uploaded_files_list],
            )
            delete_btn.click(
                delete_file,
                inputs=[delete_filename],
                outputs=[uploaded_files_list],
            )

# Start the app as soon as this module is executed.
# NOTE(review): share=True requests a public share link from Gradio — confirm
# that is intended for an app searching internal documents.
demo.launch(share=True)
|
|
|