import os import streamlit as st from datetime import datetime import re from werkzeug.utils import secure_filename import fitz # PyMuPDF import base64 from src.gpp import GPP, GPPConfig from src.qa import AnswerGenerator # --- Page Configuration --- st.set_page_config( page_title="Document Intelligence", page_icon="🤖", layout="wide" ) # --- Session State Initialization --- if 'chat_history' not in st.session_state: st.session_state.chat_history = [] if 'parsed_info' not in st.session_state: st.session_state.parsed_info = None # Will store {collection_name, layout_pdf, md_path, etc.} if "selected_chunks" not in st.session_state: st.session_state.selected_chunks = [] # --- Custom CSS for Messenger-like UI --- st.markdown( """ """, unsafe_allow_html=True ) # --- Left Sidebar: Instructions & Upload --- with st.sidebar: # App info section st.image("https://img.icons8.com/ios-filled/50/4A90E2/document.png", width=40) st.title("Document Intelligence") st.caption(f"Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M')}") with st.expander("How It Works", expanded=True): st.markdown("1. **Upload & Parse**: Select your PDF to begin.\n2. **Ask Questions**: Use the chat to query your document.\n3. **Get Answers**: The AI provides instant, evidence-backed responses.") st.markdown("---") # Upload section st.subheader("Upload Document") uploaded_file = st.file_uploader("Select a PDF", type=["pdf"], help="Upload a PDF file to analyze") if uploaded_file: filename = secure_filename(uploaded_file.name) # Sanitize filename to be a valid Chroma collection name collection_name = re.sub(r'[^a-zA-Z0-9_-]', '_', os.path.splitext(filename)[0]) if st.button("Parse Document", use_container_width=True, key="parse_button"): output_dir = os.path.join("./parsed", filename) os.makedirs(output_dir, exist_ok=True) pdf_path = os.path.join(output_dir, filename) with open(pdf_path, "wb") as f: f.write(uploaded_file.getbuffer()) with st.spinner("Processing document..."): try: gpp = GPP(GPPConfig()) parsed_info = gpp.run(pdf_path, output_dir, collection_name) st.session_state.parsed_info = parsed_info st.session_state.chat_history = [] st.session_state.selected_chunks = [] st.success("Document ready!") except Exception as e: st.error(f"Processing failed: {str(e)}") st.session_state.parsed_info = None # Display document preview if parsed if st.session_state.parsed_info: st.markdown("---") st.subheader("Document Preview") parsed = st.session_state.parsed_info # Layout PDF layout_pdf = parsed.get("layout_pdf") if layout_pdf and os.path.exists(layout_pdf): with st.expander("View Layout PDF", expanded=False): st.markdown(f"[Open in new tab]({layout_pdf})") doc = fitz.open(layout_pdf) thumb_width = 500 thumbs = [] for page_num in range(len(doc)): page = doc.load_page(page_num) pix = page.get_pixmap(matrix=fitz.Matrix(thumb_width / page.rect.width, thumb_width / page.rect.width)) img_bytes = pix.tobytes("png") b64 = base64.b64encode(img_bytes).decode("utf-8") thumbs.append((page_num, b64)) st.markdown("
{md_text[:3000]}{'...' if len(md_text)>3000 else ''}", unsafe_allow_html=True) except Exception as e: st.warning(f"Could not preview content: {str(e)}") st.markdown("---") st.subheader("Chat Controls") if st.button("Clear Chat", use_container_width=True): st.session_state.chat_history = [] st.session_state.selected_chunks = [] st.rerun() # --- Main Chat Area --- main_col, evidence_col = st.columns([2, 1]) with main_col: if not st.session_state.parsed_info: st.info("Please upload and parse a document to start the chat.") else: # Create a container for the chat window st.markdown("