Abhinav Gavireddi committed
Commit · 6c61722 · Parent(s): c613bb1

[fix]: optimized the entire pipeline

Files changed:
- app.py +222 -340
- requirements.txt +6 -5
- src/__init__.py +44 -20
- src/ghm.py +2 -2
- src/gpp.py +69 -152
- src/qa.py +75 -74
- src/retriever.py +48 -96
- src/utils.py +45 -11
app.py
CHANGED
@@ -3,250 +3,170 @@ import streamlit as st
 from datetime import datetime
 import re
 from werkzeug.utils import secure_filename

 from src.gpp import GPP, GPPConfig
 from src.qa import AnswerGenerator

-# Check if we need to modify the AnswerGenerator class to accept conversation context
-# If the original implementation doesn't support this, we'll create a wrapper
-
-class ContextAwareAnswerGenerator:
-    """Wrapper around AnswerGenerator to include conversation context"""
-
-    def __init__(self, chunks):
-        self.chunks = chunks
-        self.original_generator = AnswerGenerator(chunks)
-
-    def answer(self, question, conversation_context=None):
-        """
-        Generate answer with conversation context
-
-        Args:
-            chunks: Document chunks to search
-            question: Current question
-            conversation_context: List of previous Q&A for context
-
-        Returns:
-            answer, supporting_chunks
-        """
-        # If no conversation context or original implementation supports it directly
-        if conversation_context is None or len(conversation_context) <= 1:
-            return self.original_generator.answer(question)
-
-        # Otherwise, enhance the question with context
-        # Create a contextual prompt by summarizing previous exchanges
-        context_prompt = "Based on our conversation so far:\n"
-
-        # Include the last few exchanges (limiting to prevent context getting too large)
-        max_history = min(len(conversation_context) - 1, 4)  # Last 4 exchanges maximum
-        for i in range(max(0, len(conversation_context) - max_history - 1), len(conversation_context) - 1, 2):
-            if i < len(conversation_context) and i+1 < len(conversation_context):
-                user_q = conversation_context[i]["content"]
-                assistant_a = conversation_context[i+1]["content"]
-                context_prompt += f"You were asked: '{user_q}'\n"
-                context_prompt += f"You answered: '{assistant_a}'\n"
-
-        context_prompt += f"\nNow answer this follow-up question: {question}"
-
-        # Use the enhanced prompt
-        return self.original_generator.answer(context_prompt)
-
 # --- Page Configuration ---
 st.set_page_config(
-    page_title="Document Intelligence…
-    page_icon="…
     layout="wide"
 )

 # --- Session State Initialization ---
 if 'chat_history' not in st.session_state:
-    st.session_state.chat_history = []
-if 'parsed' not in st.session_state:
-    st.session_state.parsed = None
 if "selected_chunks" not in st.session_state:
     st.session_state.selected_chunks = []
-if "conversation_context" not in st.session_state:
-    st.session_state.conversation_context = []

-# --- Custom CSS for…
 st.markdown(
     """
     <style>
-    /* [old light-theme CSS block; most removed rules are truncated in the source. Recoverable
-       fragments include card styling (border-radius: 8px; padding: 20px; margin-bottom: 20px;
-       box-shadow: 0 1px 3px rgba(0,0,0,0.12), 0 1px 2px rgba(0,0,0,0.24);), a chat container
-       (display: flex; flex-direction: column; gap: 12px; margin-top: 20px; margin-bottom: 20px;),
-       and message bubbles (border-radius: 18px; color: white; padding: 12px; font-size: 14px;)] */
-
-    .assistant-message .message-content {
-        background-color: #f0f2f6;
-        color: #1e1e1e;
-        border-bottom-left-radius: 4px;
-    }
-
-    /* Empty chat placeholder */
-    .empty-chat-placeholder {
-        display: flex;
-        flex-direction: column;
-        align-items: center;
-        justify-content: center;
-        height: 300px;
-        background-color: #f8f9fa;
-        border-radius: 8px;
-        margin-bottom: 20px;
-        text-align: center;
-        color: #6c757d;
-    }
-
-    .empty-chat-icon {
-        font-size: 40px;
-        margin-bottom: 16px;
-        color: #adb5bd;
-    }
-
-    /* Message typing indicator */
-    .typing-indicator {
-        display: flex;
-        align-items: center;
-        margin-top: 8px;
-    }
-
-    .typing-indicator span {
-        height: 8px;
-        width: 8px;
-        background-color: #4361ee;
-        border-radius: 50%;
-        margin: 0 2px;
-        display: inline-block;
-        opacity: 0.7;
-    }
-
-    .typing-indicator span:nth-child(1) { animation: pulse 1s infinite; }
-    .typing-indicator span:nth-child(2) { animation: pulse 1s infinite 0.2s; }
-    .typing-indicator span:nth-child(3) { animation: pulse 1s infinite 0.4s; }
-
-    @keyframes pulse {
-        0% { transform: scale(1); opacity: 0.7; }
-        50% { transform: scale(1.2); opacity: 1; }
-        100% { transform: scale(1); opacity: 0.7; }
-    }
-
-    /* Info box */
-    .stAlert {
-        border-radius: 8px;
-    }
     </style>
     """, unsafe_allow_html=True
 )
@@ -256,17 +176,10 @@ with st.sidebar:
     # App info section
     st.image("https://img.icons8.com/ios-filled/50/4A90E2/document.png", width=40)
     st.title("Document Intelligence")
-    st.caption(f"Last updated: {datetime.now().strftime('%Y-%m-%d')}")

     with st.expander("How It Works", expanded=True):
-        st.markdown(
-            """
-            1. **Upload PDF**: Select and parse your document
-            2. **Ask Questions**: Type your query about the document
-            3. **Get Answers**: AI analyzes and responds with insights
-            4. **View Evidence**: See supporting chunks in the right sidebar
-            """
-        )

     st.markdown("---")
@@ -275,54 +188,54 @@ with st.sidebar:
     uploaded_file = st.file_uploader("Select a PDF", type=["pdf"], help="Upload a PDF file to analyze")

     if uploaded_file:
-        [old upload-and-parse block; roughly two dozen removed lines are truncated in the source]
-            except Exception as e:
-                st.error(f"Parsing failed: {str(e)}")
-                st.session_state.parsed = None
-        with col2:
-            if st.button("Clear", use_container_width=True, key="clear_button"):
-                st.session_state.parsed = None
-                st.session_state.selected_chunks = []
-                st.session_state.chat_history = []
-                st.session_state.conversation_context = []
-                st.experimental_rerun()
-        except Exception as e:
-            st.error(f"Upload error: {str(e)}")

     # Display document preview if parsed
-    if st.session_state.parsed:
     st.markdown("---")
     st.subheader("Document Preview")
-    parsed = st.session_state.parsed

     # Layout PDF
     layout_pdf = parsed.get("layout_pdf")
     if layout_pdf and os.path.exists(layout_pdf):
         with st.expander("View Layout PDF", expanded=False):
             st.markdown(f"[Open in new tab]({layout_pdf})")

     # Content preview
     md_path = parsed.get("md_path")
@@ -335,123 +248,92 @@ with st.sidebar:
     except Exception as e:
         st.warning(f"Could not preview content: {str(e)}")

-    [several removed lines truncated in the source]
-    st.title("Document Q&A")
-    st.markdown("</div>", unsafe_allow_html=True)

 else:
-    #…
-    st.markdown("<div class='…
-    question = st.text_input(
-        "Ask a question about your document:",
-        key="question_input",
-        placeholder="E.g., 'What are the key findings?' or 'Summarize the data'",
-        on_change=None  # Ensure the input field gets cleared naturally after submission
-    )
-
-    col_btn1, col_btn2 = st.columns([4, 1])
-    with col_btn1:
-        submit_button = st.button("Get Answer", use_container_width=True)
-    with col_btn2:
-        clear_chat = st.button("Clear Chat", use_container_width=True)
-
-    # Initialize chat history
-    if "chat_history" not in st.session_state:
-        st.session_state.chat_history = []
-
-    [old answer-handling block; most lines truncated in the source]
        st.session_state.chat_history.append({"role": "assistant", "content": answer})
-        # Store supporting chunks in session state for the right sidebar
        st.session_state.selected_chunks = supporting_chunks
-        question = ""
-    except Exception as e:
-        st.error(f"Failed to generate answer: {str(e)}")
-        st.session_state.selected_chunks = []
-
-    # Display chat history
-    st.markdown("<div class='chat-container'>", unsafe_allow_html=True)
-
-    if not st.session_state.chat_history:
-        # Show empty chat state with icon
-        st.markdown("""
-        <div class='empty-chat-placeholder'>
-            <div class='empty-chat-icon'>💬</div>
-            <p>Ask questions about your document to start a conversation</p>
-        </div>
-        """, unsafe_allow_html=True)
-    else:
-        for message in st.session_state.chat_history:
-            if message["role"] == "user":
-                st.markdown(f"""
-                <div class='chat-message user-message'>
-                    <div class='message-content'>
-                        <p>{message["content"]}</p>
-                    </div>
-                </div>
-                """, unsafe_allow_html=True)
-            else:
-                st.markdown(f"""
-                <div class='chat-message assistant-message'>
-                    <div class='message-content'>
-                        <p>{message["content"]}</p>
-                    </div>
-                </div>
-                """, unsafe_allow_html=True)
-    st.markdown("</div>", unsafe_allow_html=True)
-    st.markdown("</div>", unsafe_allow_html=True)

 # --- Supporting Evidence in the right column ---
 with evidence_col:
-    if st.session_state.parsed:
     st.markdown("### Supporting Evidence")

     if not st.session_state.selected_chunks:
         st.info("Evidence chunks will appear here after you ask a question.")
     else:
         for idx, chunk in enumerate(st.session_state.selected_chunks):
-            with st.expander(f"Evidence #{idx+1}", expanded=True):
-                st.markdown(f"**Type:** {chunk['type'].capitalize()}")
                st.markdown(chunk.get('narration', 'No narration available'))
-                # Display table if available
                if 'table_structure' in chunk:
-                    st.write("**Table Data:**")
                    st.dataframe(chunk['table_structure'], use_container_width=True)
-                # Display images if available
                for blk in chunk.get('blocks', []):
-                    if blk.get('type') == 'img_path' and 'images_dir' in st.session_state.…
-                        img_path = os.path.join(st.session_state.…
                        if os.path.exists(img_path):
                            st.image(img_path, use_column_width=True)

app.py (new version):
 from datetime import datetime
 import re
 from werkzeug.utils import secure_filename
+import fitz # PyMuPDF
+import base64

 from src.gpp import GPP, GPPConfig
 from src.qa import AnswerGenerator

 # --- Page Configuration ---
 st.set_page_config(
+    page_title="Document Intelligence",
+    page_icon="🤖",
     layout="wide"
 )

 # --- Session State Initialization ---
 if 'chat_history' not in st.session_state:
+    st.session_state.chat_history = []
+if 'parsed_info' not in st.session_state:
+    st.session_state.parsed_info = None  # Will store {collection_name, layout_pdf, md_path, etc.}
 if "selected_chunks" not in st.session_state:
     st.session_state.selected_chunks = []

+# --- Custom CSS for Messenger-like UI ---
 st.markdown(
     """
     <style>
+    /* Main app background */
+    .stApp { background-color: #121212; /* Dark background */ color: #EAEAEA; /* Light text */ }
+
+    /* Ensure all text in the main content area is light */
+    .st-emotion-cache-16txtl3, .st-emotion-cache-16txtl3 h1, .st-emotion-cache-16txtl3 h2, .st-emotion-cache-16txtl3 h3 { color: #EAEAEA; }
+
+    /* Sidebar adjustments */
+    .st-emotion-cache-16txtl3 { padding-top: 2rem; }
+
+    /* Main chat window container */
+    .chat-window { height: 75vh; background: #1E1E1E; /* Slightly lighter dark for chat window */ border-radius: 10px; box-shadow: 0 4px 8px rgba(0,0,0,0.4); display: flex; flex-direction: column; overflow: hidden; }
+
+    /* Chat message history */
+    .chat-history { flex-grow: 1; overflow-y: auto; padding: 20px; display: flex; flex-direction: column; gap: 15px; }
+
+    /* General message styling */
+    .message-row { display: flex; align-items: flex-end; gap: 10px; }
+
+    /* Assistant message alignment */
+    .assistant-row { justify-content: flex-start; }
+
+    /* User message alignment */
+    .user-row { justify-content: flex-end; }
+
+    /* Avatar styling */
+    .avatar { width: 40px; height: 40px; border-radius: 50%; display: flex; align-items: center; justify-content: center; font-size: 20px; background-color: #3A3B3C; /* Dark gray for avatar */ color: white; }
+
+    /* Chat bubble styling */
+    .message-bubble { max-width: 70%; padding: 10px 15px; border-radius: 18px; overflow-wrap: break-word; color: #EAEAEA; /* Light text for all bubbles */ }
+    .message-bubble p { margin: 0; }
+
+    /* Assistant bubble color */
+    .assistant-bubble { background-color: #3A3B3C; /* Dark gray for assistant */ }
+
+    /* User bubble color */
+    .user-bubble { background-color: #0084FF; color: white; /* White text for user bubble */ }
+
+    /* Chat input container */
+    .chat-input-container { padding: 15px 20px; background: #1E1E1E; /* Match chat window background */ border-top: 1px solid #3A3B3C; }
+
+    /* Input field styling */
+    .stTextInput>div>div>input { border-radius: 18px; border: 1px solid #555; background-color: #3A3B3C; /* Dark input field */ color: #EAEAEA; /* Light text in input */ padding: 10px 15px; }
+
+    /* Button styling */
+    .stButton>button { border-radius: 18px; border: none; background-color: #0084FF; color: white; height: 42px; }
+
+    /* Hide the default "Get Answer" header for a cleaner look */
+    .st-emotion-cache-16txtl3 > h1 { display: none; }
+
+    /* Empty chat placeholder */
+    .empty-chat-placeholder { flex-grow: 1; display: flex; flex-direction: column; justify-content: center; align-items: center; color: #A0A0A0; /* Lighter gray for placeholder text */ }
+    .empty-chat-placeholder .icon { font-size: 50px; margin-bottom: 10px; }
     </style>
     """, unsafe_allow_html=True
 )
...
     # App info section
     st.image("https://img.icons8.com/ios-filled/50/4A90E2/document.png", width=40)
     st.title("Document Intelligence")
+    st.caption(f"Last updated: {datetime.now().strftime('%Y-%m-%d %H:%M')}")

     with st.expander("How It Works", expanded=True):
+        st.markdown("1. **Upload & Parse**: Select your PDF to begin.\n2. **Ask Questions**: Use the chat to query your document.\n3. **Get Answers**: The AI provides instant, evidence-backed responses.")

     st.markdown("---")
...
     uploaded_file = st.file_uploader("Select a PDF", type=["pdf"], help="Upload a PDF file to analyze")

     if uploaded_file:
+        filename = secure_filename(uploaded_file.name)
+        # Sanitize filename to be a valid Chroma collection name
+        collection_name = re.sub(r'[^a-zA-Z0-9_-]', '_', os.path.splitext(filename)[0])
+
+        if st.button("Parse Document", use_container_width=True, key="parse_button"):
+            output_dir = os.path.join("./parsed", filename)
+            os.makedirs(output_dir, exist_ok=True)
+            pdf_path = os.path.join(output_dir, filename)
+
+            with open(pdf_path, "wb") as f:
+                f.write(uploaded_file.getbuffer())
+
+            with st.spinner("Processing document..."):
+                try:
+                    gpp = GPP(GPPConfig())
+                    parsed_info = gpp.run(pdf_path, output_dir, collection_name)
+                    st.session_state.parsed_info = parsed_info
+                    st.session_state.chat_history = []
+                    st.session_state.selected_chunks = []
+                    st.success("Document ready!")
+                except Exception as e:
+                    st.error(f"Processing failed: {str(e)}")
+                    st.session_state.parsed_info = None
+
     # Display document preview if parsed
+    if st.session_state.parsed_info:
         st.markdown("---")
         st.subheader("Document Preview")
+        parsed = st.session_state.parsed_info

         # Layout PDF
         layout_pdf = parsed.get("layout_pdf")
         if layout_pdf and os.path.exists(layout_pdf):
             with st.expander("View Layout PDF", expanded=False):
                 st.markdown(f"[Open in new tab]({layout_pdf})")
+                doc = fitz.open(layout_pdf)
+                thumb_width = 500
+                thumbs = []
+                for page_num in range(len(doc)):
+                    page = doc.load_page(page_num)
+                    pix = page.get_pixmap(matrix=fitz.Matrix(thumb_width / page.rect.width, thumb_width / page.rect.width))
+                    img_bytes = pix.tobytes("png")
+                    b64 = base64.b64encode(img_bytes).decode("utf-8")
+                    thumbs.append((page_num, b64))
+                st.markdown("<div style='overflow-x: auto; white-space: nowrap; border: 1px solid #eee; border-radius: 8px; padding: 8px; background: #fafbfc; max-width: 100%;'>", unsafe_allow_html=True)
+                for page_num, b64 in thumbs:
+                    st.markdown(f"<a href='{layout_pdf}#page={page_num+1}' target='_blank' style='display:inline-block;margin-right:8px;'><img src='data:image/png;base64,{b64}' width='{thumb_width}' style='border:1px solid #ccc;border-radius:4px;box-shadow:0 1px 2px #0001;'/></a>", unsafe_allow_html=True)
+                st.markdown("</div>", unsafe_allow_html=True)

         # Content preview
         md_path = parsed.get("md_path")
...
         except Exception as e:
             st.warning(f"Could not preview content: {str(e)}")

+    st.markdown("---")
+    st.subheader("Chat Controls")
+    if st.button("Clear Chat", use_container_width=True):
+        st.session_state.chat_history = []
+        st.session_state.selected_chunks = []
+        st.rerun()

+# --- Main Chat Area ---
+main_col, evidence_col = st.columns([2, 1])

+with main_col:
+    if not st.session_state.parsed_info:
+        st.info("Please upload and parse a document to start the chat.")
     else:
+        # Create a container for the chat window
+        st.markdown("<div class='chat-window'>", unsafe_allow_html=True)

+        # Display chat history
+        st.markdown("<div class='chat-history'>", unsafe_allow_html=True)
+        if not st.session_state.chat_history:
+            st.markdown("""
+            <div class='empty-chat-placeholder'>
+                <span class="icon">🤖</span>
+                <h3>Ask me anything about your document!</h3>
+            </div>
+            """, unsafe_allow_html=True)
+        else:
+            for message in st.session_state.chat_history:
+                if message["role"] == "user":
+                    st.markdown(f"""
+                    <div class="message-row user-row">
+                        <div class="message-bubble user-bubble">
+                            <p>{message["content"]}</p>
+                        </div>
+                    </div>
+                    """, unsafe_allow_html=True)
+                else:
+                    st.markdown(f"""
+                    <div class="message-row assistant-row">
+                        <div class="avatar">🤖</div>
+                        <div class="message-bubble assistant-bubble">
+                            <p>{message["content"]}</p>
+                        </div>
+                    </div>
+                    """, unsafe_allow_html=True)
+        st.markdown("</div>", unsafe_allow_html=True)  # Close chat-history

+        # Chat input bar
+        st.markdown("<div class='chat-input-container'>", unsafe_allow_html=True)
+        input_col, button_col = st.columns([4, 1])
+        with input_col:
+            question = st.text_input("Ask a question...", key="question_input", label_visibility="collapsed")
+        with button_col:
+            send_button = st.button("Send", use_container_width=True)
+
+        st.markdown("</div>", unsafe_allow_html=True)  # Close chat-input-container
+        st.markdown("</div>", unsafe_allow_html=True)  # Close chat-window
+
+        # --- Handle message sending ---
+        if send_button and question:
+            st.session_state.chat_history.append({"role": "user", "content": question})
+
+            with st.spinner("Thinking..."):
+                generator = AnswerGenerator(st.session_state.parsed_info['collection_name'])
+                answer, supporting_chunks = generator.answer(question)
                st.session_state.chat_history.append({"role": "assistant", "content": answer})
                st.session_state.selected_chunks = supporting_chunks
+
+            st.rerun()

 # --- Supporting Evidence in the right column ---
 with evidence_col:
+    if st.session_state.parsed_info:
         st.markdown("### Supporting Evidence")

         if not st.session_state.selected_chunks:
             st.info("Evidence chunks will appear here after you ask a question.")
         else:
             for idx, chunk in enumerate(st.session_state.selected_chunks):
+                with st.expander(f"Evidence Chunk #{idx+1}", expanded=True):
                    st.markdown(chunk.get('narration', 'No narration available'))
                    if 'table_structure' in chunk:
                        st.dataframe(chunk['table_structure'], use_container_width=True)
                    for blk in chunk.get('blocks', []):
+                        if blk.get('type') == 'img_path' and 'images_dir' in st.session_state.parsed_info:
+                            img_path = os.path.join(st.session_state.parsed_info['images_dir'], blk.get('img_path',''))
                            if os.path.exists(img_path):
                                st.image(img_path, use_column_width=True)
requirements.txt
CHANGED
@@ -1,8 +1,9 @@
 # Core
 streamlit>=1.25.0
-sentence-transformers>=2.2.2
-rank-bm25>=0.2.2
-hnswlib>=0.7.0
 huggingface-hub>=0.16.4
 langchain>=0.1.9
 langchain-openai>=0.1.9
@@ -21,7 +22,7 @@ scikit-learn>=1.0.2
 pdfminer.six>=20231228
 torch>=2.6.0
 torchvision
-matplotlib>=3.10
 ultralytics>=8.3.48
 rapid-table>=1.0.3,<2.0.0
 doclayout-yolo==0.0.2b1
@@ -30,7 +31,7 @@ PyYAML>=6.0.2,<7
 ftfy>=6.3.1,<7
 openai>=1.70.0,<2
 pydantic>=2.7.2,<2.11
-transformers>=4.49.0,<5.0.0
 gradio-pdf>=0.0.21
 shapely>=2.0.7,<3
 pyclipper>=1.3.0,<2

requirements.txt (new version):
 # Core
 streamlit>=1.25.0
+sentence-transformers>=2.2.2 # Re-enabled for local embeddings
+# rank-bm25>=0.2.2 - Replaced by ChromaDB
+# hnswlib>=0.7.0 - Replaced by ChromaDB
+chromadb>=0.4.18
 huggingface-hub>=0.16.4
 langchain>=0.1.9
 langchain-openai>=0.1.9
...
 pdfminer.six>=20231228
 torch>=2.6.0
 torchvision
+# matplotlib>=3.10 - Removed, not used in the app
 ultralytics>=8.3.48
 rapid-table>=1.0.3,<2.0.0
 doclayout-yolo==0.0.2b1
...
 ftfy>=6.3.1,<7
 openai>=1.70.0,<2
 pydantic>=2.7.2,<2.11
+# transformers>=4.49.0,<5.0.0 - Removed as reranker is disabled
 gradio-pdf>=0.0.21
 shapely>=2.0.7,<3
 pyclipper>=1.3.0,<2
src/__init__.py
CHANGED
@@ -2,6 +2,11 @@ import os
 from dotenv import load_dotenv
 import bleach
 from loguru import logger

 load_dotenv()

@@ -14,35 +19,54 @@ Central configuration for the entire Document Intelligence app.
 All modules import from here rather than hard-coding values.
 """

-… = os.getenv(
-    "OPENAI_EMBEDDING_MODEL", "text-embedding-ada-002"
-)
 class EmbeddingConfig:
-    PROVIDER = os.getenv("EMBEDDING_PROVIDER", '…
     TEXT_MODEL = os.getenv('TEXT_EMBED_MODEL', 'sentence-transformers/all-MiniLM-L6-v2')
-    META_MODEL = os.getenv('META_EMBED_MODEL', 'sentence-transformers/all-MiniLM-L6-v2')

 class RetrieverConfig:
-    TOP_K = int(os.getenv('RETRIEVER_TOP_K', …
-    DENSE_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
-    ANN_TOP = int(os.getenv('ANN_TOP', 50))
-
-class RerankerConfig:
-    @staticmethod
-    def get_device():
-        import torch
-        return 'cuda' if torch.cuda.is_available() else 'cpu'
-    MODEL_NAME = os.getenv('RERANKER_MODEL', 'BAAI/bge-reranker-v2-Gemma')
-    DEVICE = get_device()

 class GPPConfig:
     CHUNK_TOKEN_SIZE = int(os.getenv('CHUNK_TOKEN_SIZE', 256))
     DEDUP_SIM_THRESHOLD = float(os.getenv('DEDUP_SIM_THRESHOLD', 0.9))
     EXPANSION_SIM_THRESHOLD = float(os.getenv('EXPANSION_SIM_THRESHOLD', 0.85))
     COREF_CONTEXT_SIZE = int(os.getenv('COREF_CONTEXT_SIZE', 3))
-    [removed HNSW index settings; the lines are truncated in the source]

src/__init__.py (new version):
 from dotenv import load_dotenv
 import bleach
 from loguru import logger
+import streamlit as st
+from sentence_transformers import SentenceTransformer
+import torch
+import chromadb
+from src.utils import OpenAIEmbedder, LocalEmbedder

 load_dotenv()
...
 All modules import from here rather than hard-coding values.
 """

+# --- Embedding & ChromaDB Config ---
 class EmbeddingConfig:
+    PROVIDER = os.getenv("EMBEDDING_PROVIDER", 'local')
     TEXT_MODEL = os.getenv('TEXT_EMBED_MODEL', 'sentence-transformers/all-MiniLM-L6-v2')

+# --- Retriever Config for Low Latency ---
 class RetrieverConfig:
+    # Retrieve more chunks initially, let the final prompt handle trimming.
+    TOP_K = int(os.getenv('RETRIEVER_TOP_K', 5))

+# --- GPP Config ---
 class GPPConfig:
     CHUNK_TOKEN_SIZE = int(os.getenv('CHUNK_TOKEN_SIZE', 256))
     DEDUP_SIM_THRESHOLD = float(os.getenv('DEDUP_SIM_THRESHOLD', 0.9))
     EXPANSION_SIM_THRESHOLD = float(os.getenv('EXPANSION_SIM_THRESHOLD', 0.85))
     COREF_CONTEXT_SIZE = int(os.getenv('COREF_CONTEXT_SIZE', 3))

+# --- Centralized, Streamlit-cached Clients & Models ---
+@st.cache_resource(show_spinner="Connecting to ChromaDB...")
+def get_chroma_client():
+    """
+    Initializes a ChromaDB client.
+    Defaults to a serverless, persistent client, which is ideal for local
+    development and single-container deployments.
+    If CHROMA_HOST is set, it will attempt to connect to a standalone server.
+    """
+    chroma_host = os.getenv("CHROMA_HOST")
+
+    if chroma_host:
+        logger.info(f"Connecting to ChromaDB server at {chroma_host}...")
+        client = chromadb.HttpClient(
+            host=chroma_host,
+            port=int(os.getenv("CHROMA_PORT", "8000"))
+        )
+    else:
+        persist_directory = os.getenv("PERSIST_DIRECTORY", "./parsed/chroma_db")
+        logger.info(f"Using persistent ChromaDB at: {persist_directory}")
+        client = chromadb.PersistentClient(path=persist_directory)
+
+    return client
+
+@st.cache_resource(show_spinner="Loading embedding model...")
+def get_embedder():
+    if EmbeddingConfig.PROVIDER == "openai":
+        logger.info(f"Using OpenAI embedder with model: {EmbeddingConfig.TEXT_MODEL}")
+        return OpenAIEmbedder(model_name=EmbeddingConfig.TEXT_MODEL)
+    else:
+        logger.info(f"Using local embedder with model: {EmbeddingConfig.TEXT_MODEL}")
+        return LocalEmbedder(model_name=EmbeddingConfig.TEXT_MODEL)
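For orientation, a minimal sketch (not part of the commit) of how downstream modules are expected to consume these cached helpers; the collection name "my_doc" is a placeholder, and the query flow mirrors src/retriever.py further down:

    from src import get_chroma_client, get_embedder

    client = get_chroma_client()    # PersistentClient or HttpClient, cached by Streamlit
    embedder = get_embedder()       # LocalEmbedder or OpenAIEmbedder, cached by Streamlit
    collection = client.get_or_create_collection(name="my_doc")
    query_vec = embedder.embed(["What are the key findings?"])[0]
    hits = collection.query(query_embeddings=[query_vec], n_results=5)
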
src/ghm.py
CHANGED
@@ -33,8 +33,8 @@ if __name__ == '__main__':
     mineru_patterns = [
         # "models/Layout/LayoutLMv3/*",
         "models/Layout/YOLO/*",
-        "models/MFD/YOLO/*",
-        "models/MFR/unimernet_hf_small_2503/*",
+        # "models/MFD/YOLO/*",
+        # "models/MFR/unimernet_hf_small_2503/*",
         "models/OCR/paddleocr_torch/*",
         # "models/TabRec/TableMaster/*",
         # "models/TabRec/StructEqTable/*",
src/gpp.py
CHANGED
@@ -17,10 +17,10 @@ import os
 import json
 from typing import List, Dict, Any, Optional
 import re

-from src import EmbeddingConfig, GPPConfig
 from src.utils import OpenAIEmbedder, LLMClient
-from src import logger

 def parse_markdown_table(md: str) -> Optional[Dict[str, Any]]:
     """
@@ -49,21 +49,8 @@ def parse_markdown_table(md: str) -> Optional[Dict[str, Any]]:
 class GPP:
     def __init__(self, config: GPPConfig):
         self.config = config
-
-        # Embedding models
-        if EmbeddingConfig.PROVIDER == "openai":
-            self.text_embedder = OpenAIEmbedder(EmbeddingConfig.TEXT_MODEL)
-            self.meta_embedder = OpenAIEmbedder(EmbeddingConfig.META_MODEL)
-        else:
-            self.text_embedder = SentenceTransformer(
-                EmbeddingConfig.TEXT_MODEL, use_auth_token=True
-            )
-            self.meta_embedder = SentenceTransformer(
-                EmbeddingConfig.META_MODEL, use_auth_token=True
-            )
-
-        self.bm25 = None

     def parse_pdf(self, pdf_path: str, output_dir: str) -> Dict[str, Any]:
         """
@@ -168,27 +155,23 @@ class GPP:

     def deduplicate(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         try:
-            # Lazy import heavy libraries
-            import numpy as np
-            from sentence_transformers import SentenceTransformer
-
             narrations = [c.get("narration", "") for c in chunks]
-            [old embedding and keep-list computation; several lines truncated in the source]
-                if not any(
-                    (emb @ embs[j]).item()
-                    / (np.linalg.norm(emb) * np.linalg.norm(embs[j]) + 1e-8)
-                    > self.config.DEDUP_SIM_THRESHOLD
-                    for j in keep
-                ):
-                    keep.append(i)
-            deduped = [chunks[i] for i in keep]
-            logger.info(f"Deduplicated: {len(chunks)}→{len(deduped)}")
             return deduped
         except Exception as e:
             logger.error(f"Deduplication failed: {e}")
@@ -198,7 +181,7 @@ class GPP:
         for idx, c in enumerate(chunks):
             start = max(0, idx - self.config.COREF_CONTEXT_SIZE)
             ctx = "\n".join(chunks[i].get("narration", "") for i in range(start, idx))
-            prompt = f"Context:\n{ctx}\nRewrite pronouns in:\n{c.get('narration', '')}"
             try:
                 c["narration"] = LLMClient.generate(prompt)
             except Exception as e:
@@ -212,134 +195,68 @@ class GPP:
         for sec, items in sections.items():
             blob = "\n".join(i.get("narration", "") for i in items)
             try:
-                summ = LLMClient.generate(f"Summarize this section:\n{blob}")
                 for i in items:
                     i.setdefault("metadata", {})["section_summary"] = summ
             except Exception as e:
                 logger.error(f"Metadata summarization failed for section {sec}: {e}")

-    def …
-        """
-        Build BM25 index on token lists for sparse retrieval.
-        """
-        # Lazy import heavy libraries
-        from rank_bm25 import BM25Okapi
-
-        tokenized = [c["narration"].split() for c in chunks]
-        self.bm25 = BM25Okapi(tokenized)
-
-    def compute_and_store(self, chunks: List[Dict[str, Any]], output_dir: str) -> None:
        """
-        … and section_summary (meta_vec).
-        2. Build two HNSWlib indices (one for text_vecs, one for meta_vecs).
-        3. Save both indices to disk.
-        4. Dump human-readable chunk metadata (incl. section_summary)
-           for traceability in the UI.
        """
-        from sentence_transformers import SentenceTransformer
-
-        # --- 1. Prepare embedder ---
-        if EmbeddingConfig.PROVIDER.lower() == "openai":
-            embedder = OpenAIEmbedder(EmbeddingConfig.TEXT_MODEL)
-            embed_fn = embedder.embed
-        else:
-            st_model = SentenceTransformer(
-                EmbeddingConfig.TEXT_MODEL, use_auth_token=True
-            )
-            embed_fn = lambda texts: st_model.encode(
-                texts, show_progress_bar=False
-            ).tolist()
-
-        # Batch compute text & meta embeddings ---
-        narrations = [c["narration"] for c in chunks]
-        meta_texts = [c.get("section_summary", "") for c in chunks]
-        logger.info(
-            "computing_embeddings",
-            provider=EmbeddingConfig.PROVIDER,
-            num_chunks=len(chunks),
-        )
-
-        text_vecs = embed_fn(narrations)
-        meta_vecs = embed_fn(meta_texts)
-
-        text_matrix = np.vstack(text_vecs).astype(np.float32)
-        meta_matrix = np.vstack(meta_vecs).astype(np.float32)
-
-        # Build HNSW indices ---
-        dim = text_matrix.shape[1]
-        text_index = hnswlib.Index(space="cosine", dim=dim)
-        text_index.init_index(
-            max_elements=len(chunks),
-            ef_construction=GPPConfig.HNSW_EF_CONSTRUCTION,
-            M=GPPConfig.HNSW_M,
-        )
-        ids = [c["id"] for c in chunks]
-        text_index.add_items(text_matrix, ids)
-        text_index.set_ef(GPPConfig.HNSW_EF_SEARCH)
-        logger.info("text_hnsw_built", elements=len(chunks))
-
-        # Meta index (same dim)
-        meta_index = hnswlib.Index(space="cosine", dim=dim)
-        meta_index.init_index(
-            max_elements=len(chunks),
-            ef_construction=GPPConfig.HNSW_EF_CONSTRUCTION,
-            M=GPPConfig.HNSW_M,
-        )
-        meta_index.add_items(meta_matrix, ids)
-        meta_index.set_ef(GPPConfig.HNSW_EF_SEARCH)
-        logger.info("meta_hnsw_built", elements=len(chunks))
-
-        # Persist indices to disk ---
-        text_idx_path = os.path.join(output_dir, "hnsw_text_index.bin")
-        meta_idx_path = os.path.join(output_dir, "hnsw_meta_index.bin")
-        text_index.save_index(text_idx_path)
-        meta_index.save_index(meta_idx_path)
-        logger.info(
-            "hnsw_indices_saved", text_index=text_idx_path, meta_index=meta_idx_path
-        )
-
-        # Dump chunk metadata for UI traceability ---
-        meta_path = os.path.join(output_dir, "chunk_metadata.json")
-        metadata = {
-            str(c["id"]): {
-                "text": c.get("text", ""),
-                "narration": c["narration"],
-                "type": c.get("type", ""),
-                "section_summary": c.get("section_summary", ""),
-            }
-            for c in chunks
-        }
-        with open(meta_path, "w", encoding="utf-8") as f:
-            json.dump(metadata, f, ensure_ascii=False, indent=2)
-        logger.info("chunk_metadata_saved", path=meta_path)
-
-    def run(self, pdf_path: str, output_dir: str) -> Dict[str, Any]:
        """
-        Executes …
        """
-        …
-        blocks = …
        chunks = self.chunk_blocks(blocks)
-        # assigning IDs to chunks for traceability
        for idx, chunk in enumerate(chunks):
            chunk["id"] = idx
        self.narrate_multimodal(chunks)
-        [remaining removed pipeline steps; the lines are truncated in the source]

src/gpp.py (new version):
 import json
 from typing import List, Dict, Any, Optional
 import re
+import numpy as np

+from src import EmbeddingConfig, GPPConfig, logger, get_embedder, get_chroma_client
 from src.utils import OpenAIEmbedder, LLMClient

 def parse_markdown_table(md: str) -> Optional[Dict[str, Any]]:
     """
...
 class GPP:
     def __init__(self, config: GPPConfig):
         self.config = config
+        self.text_embedder = get_embedder()
+        self.chroma_client = get_chroma_client()

     def parse_pdf(self, pdf_path: str, output_dir: str) -> Dict[str, Any]:
         """
...
     def deduplicate(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
         try:
             narrations = [c.get("narration", "") for c in chunks]
+            embs = self.text_embedder.embed(narrations)
+
+            # Simple cosine similarity check
+            keep_indices = []
+            for i in range(len(embs)):
+                is_duplicate = False
+                for j_idx in keep_indices:
+                    sim = np.dot(embs[i], embs[j_idx]) / (np.linalg.norm(embs[i]) * np.linalg.norm(embs[j_idx]))
+                    if sim > self.config.DEDUP_SIM_THRESHOLD:
+                        is_duplicate = True
+                        break
+                if not is_duplicate:
+                    keep_indices.append(i)

+            deduped = [chunks[i] for i in keep_indices]
+            logger.info(f"Deduplicated: {len(chunks)} -> {len(deduped)}")
             return deduped
         except Exception as e:
             logger.error(f"Deduplication failed: {e}")
...
         for idx, c in enumerate(chunks):
             start = max(0, idx - self.config.COREF_CONTEXT_SIZE)
             ctx = "\n".join(chunks[i].get("narration", "") for i in range(start, idx))
+            prompt = f"Context:\n{ctx}\nRewrite pronouns in:\n{c.get('narration', '')}\n\n give only the rewritten text, no other text"
             try:
                 c["narration"] = LLMClient.generate(prompt)
             except Exception as e:
...
         for sec, items in sections.items():
             blob = "\n".join(i.get("narration", "") for i in items)
             try:
+                summ = LLMClient.generate(f"Summarize this section:\n{blob}\n\n give only the summarized text, no other text")
                 for i in items:
                     i.setdefault("metadata", {})["section_summary"] = summ
             except Exception as e:
                 logger.error(f"Metadata summarization failed for section {sec}: {e}")

+    def store_in_chroma(self, chunks: List[Dict[str, Any]], collection_name: str) -> None:
         """
+        Computes embeddings and stores the chunks in a ChromaDB collection.
         """
+        if not chunks:
+            logger.warning("No chunks to store in ChromaDB.")
+            return

+        collection = self.chroma_client.get_or_create_collection(name=collection_name)
+
+        # Prepare data for ChromaDB
+        documents = [c['narration'] for c in chunks]
+        metadatas = []
+        for chunk in chunks:
+            # metadata can only contain str, int, float, bool
+            meta = {k: v for k, v in chunk.items() if k not in ['narration', 'text', 'id'] and type(v) in [str, int, float, bool]}
+            meta['text'] = chunk.get('text', '')  # Add original text to metadata
+            metadatas.append(meta)
+
+        ids = [str(c['id']) for c in chunks]
+
+        logger.info(f"Storing {len(chunks)} chunks in ChromaDB collection '{collection_name}'...")
+        try:
+            collection.add(
+                ids=ids,
+                documents=documents,
+                metadatas=metadatas
             )
+            logger.info("Successfully stored chunks in ChromaDB.")
+        except Exception as e:
+            logger.error(f"Failed to store chunks in ChromaDB: {e}")
+            raise

+    def run(self, pdf_path: str, output_dir: str, collection_name: str) -> Dict[str, Any]:
         """
+        Executes a streamlined GPP: parse -> chunk -> narrate -> store.
+        Heavy enhancement steps are bypassed for maximum efficiency.
         """
+        parsed_output = self.parse_pdf(pdf_path, output_dir)
+        blocks = parsed_output.get("blocks", [])
+
         chunks = self.chunk_blocks(blocks)
         for idx, chunk in enumerate(chunks):
             chunk["id"] = idx
+
         self.narrate_multimodal(chunks)
+
+        # NOTE: Heavy enhancement steps are disabled for performance.
+        # To re-enable, uncomment the following lines:
+        # chunks = self.deduplicate(chunks)
+        # self.coref_resolution(chunks)
+        # self.metadata_summarization(chunks)
+
+        self.store_in_chroma(chunks, collection_name)
+
+        parsed_output["chunks"] = chunks
+        parsed_output["collection_name"] = collection_name
+        logger.info("GPP pipeline complete. Data stored in ChromaDB.")
+        return parsed_output
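As a quick illustration (not from the repo), the kind of chunk dict store_in_chroma above expects: 'narration' becomes the Chroma document, 'id' the document id, and only primitive-typed fields survive the metadata filter; any field names beyond those used in the code are hypothetical:

    chunk = {
        "id": 7,
        "type": "table",                                   # kept in metadata (str)
        "narration": "Quarterly revenue grew 12% in Q3.",  # stored as the Chroma document
        "text": "| Quarter | Revenue |",                   # re-added to metadata explicitly
        "blocks": [{"type": "img_path", "img_path": "fig1.png"}],  # dropped (not a primitive)
    }
    # resulting Chroma metadata: {"type": "table", "text": "| Quarter | Revenue |"}
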
src/qa.py
CHANGED
@@ -9,94 +9,95 @@ This module contains:
 Each component is modular and can be swapped or extended (e.g., add HyDE retriever).
 """
 import os
 from typing import List, Dict, Any, Tuple
-import streamlit as st

-from src import …
 from src.utils import LLMClient
-from src.retriever import Retriever

-class Reranker:
     """
-    …
     """
-    [removed reranker helper lines; truncated in the source]
-    def __init__(self, config: RerankerConfig):
-        try:
-            self.tokenizer, self.model = self.load_model_and_tokenizer(config.MODEL_NAME, config.DEVICE)
-        except Exception as e:
-            logger.error(f'Failed to load reranker model: {e}')
-            raise
-
-    def rerank(self, query: str, candidates: List[Dict[str, Any]], top_k: int) -> List[Dict[str, Any]]:
-        """Score each candidate and return top_k sorted by relevance."""
-        if not candidates:
-            logger.warning('No candidates provided to rerank.')
-            return []
-        try:
-            import torch
-            inputs = self.tokenizer(
-                [query] * len(candidates),
-                [c.get('narration', '') for c in candidates],
-                padding=True,
-                truncation=True,
-                return_tensors='pt'
-            ).to(RerankerConfig.DEVICE)
-            with torch.no_grad():
-                out = self.model(**inputs)
-
-            logits = out.logits
-            if logits.ndim == 2 and logits.shape[1] == 1:
-                logits = logits.squeeze(-1)  # only squeeze if it's (batch, 1)
-
-            return [
-                …
-            ]

-class AnswerGenerator:
-    """
-    Main interface: initializes Retriever + Reranker once, then
-    answers multiple questions without re-loading models each time.
-    """
-    def __init__(self, chunks: List[Dict[str, Any]]):
-        self.chunks = chunks
-        self.retriever = Retriever(chunks, RetrieverConfig)
-        self.reranker = Reranker(RerankerConfig)
-        self.top_k = RetrieverConfig.TOP_K // 2
-
-    def answer(
-        self, question: str
-    ) -> Tuple[str, List[Dict[str, Any]]]:
         candidates = self.retriever.retrieve(question)
-        [removed reranking step; the lines are truncated in the source]
         context = "\n\n".join(f"- {c['narration']}" for c in top_chunks)
         prompt = (
-            "You are a …
-            [removed prompt text; the lines are truncated in the source]
             "Answer:"
         )
-        [removed LLM call; truncated in the source]
         return answer, top_chunks

src/qa.py (new version):
 Each component is modular and can be swapped or extended (e.g., add HyDE retriever).
 """
 import os
+import random
 from typing import List, Dict, Any, Tuple

+from src import logger, RetrieverConfig
 from src.utils import LLMClient
+from src.retriever import Retriever

+class AnswerGenerator:
     """
+    Generates answers by retrieving documents from a vector store
+    and using them to build a context for an LLM.
+    This version is optimized for low latency by skipping the reranking step.
     """
+    def __init__(self, collection_name: str):
+        self.retriever = Retriever(collection_name, RetrieverConfig)
+        self.context_chunks_count = 5  # Use top 5 chunks for the final prompt
+        self.greetings = [
+            "Hello! I'm ready to answer your questions about the document. What would you like to know?",
+            "Hi there! How can I help you with your document today?",
+            "Hey! I've got the document open and I'm ready for your questions.",
+            "Greetings! Ask me anything about the document, and I'll do my best to find the answer for you."
+        ]

+    def _truncate_to_last_sentence(self, text: str) -> str:
+        """Finds the last period or newline and truncates the text to that point."""
+        # Find the last period
+        last_period = text.rfind('.')
+        # Find the last newline
+        last_newline = text.rfind('\n')
+        # Find the last of the two
+        last_marker = max(last_period, last_newline)

+        if last_marker != -1:
+            return text[:last_marker + 1].strip()
+
+        # If no sentence-ending punctuation, return the text as is (or a portion)
+        return text

+    def answer(self, question: str) -> Tuple[str, List[Dict[str, Any]]]:
+        """
+        Retrieves documents, builds a context, and generates an answer.
+        Handles simple greetings separately to improve user experience.
+        """
+        # Handle simple greetings to avoid a failed retrieval
+        normalized_question = question.lower().strip().rstrip('.,!')
+        greeting_triggers = ["hi", "hello", "hey", "hallo", "hola"]
+        if normalized_question in greeting_triggers:
+            return random.choice(self.greetings), []

+        # Retrieve candidate documents from the vector store
         candidates = self.retriever.retrieve(question)
+
+        if not candidates:
+            logger.warning("No candidates retrieved from vector store.")
+            return "The document does not contain information on this topic.", []
+
+        # Use the top N chunks for context, without reranking
+        top_chunks = candidates[:self.context_chunks_count]
+
         context = "\n\n".join(f"- {c['narration']}" for c in top_chunks)
+
+        # A more robust prompt that encourages a natural, conversational tone
         prompt = (
+            "You are a helpful and friendly AI assistant for document analysis. "
+            "Your user is asking a question about a document. "
+            "Based *only* on the context provided below, formulate a clear and conversational answer. "
+            "Adopt a helpful and slightly informal tone, as if you were a knowledgeable colleague.\n\n"
+            "CONTEXT:\n"
+            "---------------------\n"
+            f"{context}\n"
+            "---------------------\n\n"
+            "USER'S QUESTION: "
+            f'"{question}"\n\n'
+            "YOUR TASK:\n"
+            "1. Carefully read the provided context.\n"
+            "2. If the context contains the answer, explain it to the user in a natural, conversational way. Do not just repeat the text verbatim.\n"
+            "3. If the context does not contain the necessary information, respond with: "
+            "'I've checked the document, but I couldn't find any information on that topic.'\n"
+            "4. **Crucially, do not use any information outside of the provided context.**\n\n"
             "Answer:"
         )
+
+        answer, finish_reason = LLMClient.generate(prompt, max_tokens=256)
+
+        # Handle cases where the response might be cut off
+        if finish_reason == 'length':
+            logger.warning("LLM response was truncated due to token limit.")
+            truncated_answer = self._truncate_to_last_sentence(answer)
+            answer = truncated_answer + " ... (response shortened)"
+
         return answer, top_chunks
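A brief usage sketch (mirrors how app.py above wires it up; the collection name is whatever GPP.run stored, here assumed to be "my_doc"):

    from src.qa import AnswerGenerator

    generator = AnswerGenerator("my_doc")
    answer, supporting_chunks = generator.answer("What are the key findings?")
    print(answer)
    for chunk in supporting_chunks:  # each chunk carries 'id', 'narration' and any primitive metadata
        print(chunk["id"], chunk["narration"][:80])
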
src/retriever.py
CHANGED
@@ -1,110 +1,62 @@
 import os
 from typing import List, Dict, Any
-import …

-from src import RetrieverConfig, logger

 class Retriever:
     """
-    …
     """
-    [removed hybrid BM25 + HNSW retriever; some lines are truncated in the source]

-    def __init__(self, chunks, config):
-        # Lazy import heavy libraries
-        import numpy as np
-        import hnswlib
-        from rank_bm25 import BM25Okapi
-        self.chunks = chunks
-        try:
-            if not isinstance(chunks, list) or not all(isinstance(c, dict) for c in chunks):
-                logger.error("Chunks must be a list of dicts.")
-                raise ValueError("Chunks must be a list of dicts.")
-            corpus = [c.get('narration', '').split() for c in chunks]
-            self.bm25 = BM25Okapi(corpus)
-            self.embedder = self.load_embedder(config.DENSE_MODEL)
-            dim = len(self.embedder.encode(["test"])[0])
-            self.ann = hnswlib.Index(space='cosine', dim=dim)
-            self.ann.init_index(max_elements=len(chunks))
-            embeddings = self.embedder.encode([c.get('narration', '') for c in chunks])
-            self.ann.add_items(embeddings, ids=list(range(len(chunks))))
-            self.ann.set_ef(config.ANN_TOP)
-        except Exception as e:
-            logger.error(f"Retriever init failed: {e}")
-            self.bm25 = None
-            self.embedder = None
-            self.ann = None

-    def retrieve_sparse(self, query: str, top_k: int) -> List[Dict[str, Any]]:
        """
-        …
-        Args:
-            query (str): Query string.
-            top_k (int): Number of top chunks to return.
-
-        Returns:
-            List[Dict[str, Any]]: List of top chunks.
        """
-        if …
-            import numpy as np  # Ensure np is defined here
-            scores = self.bm25.get_scores(tokenized)
-            top_indices = np.argsort(scores)[::-1][:top_k]
-            return [self.chunks[i] for i in top_indices]
-        except Exception as e:
-            logger.error(f"Sparse retrieval failed: {e}")
            return []

-    def retrieve_dense(self, query: str, top_k: int) -> List[Dict[str, Any]]:
-        """
-        Retrieve chunks using dense retrieval.
-
-        Args:
-            query (str): Query string.
-            top_k (int): Number of top chunks to return.
-
-        Returns:
-            List[Dict[str, Any]]: List of top chunks.
-        """
-        if not self.ann or not self.embedder:
-            logger.error("Dense retriever not initialized.")
-            return []
         try:
-            [removed dense query and result-mapping code; truncated in the source]

-        if top_k is None:
-            top_k = RetrieverConfig.TOP_K
-        sparse = self.retrieve_sparse(query, top_k)
-        dense = self.retrieve_dense(query, top_k)
-        seen = set()
-        combined = []
-        for c in sparse + dense:
-            cid = id(c)
-            if cid not in seen:
-                seen.add(cid)
-                combined.append(c)
-        return combined

src/retriever.py (new version):
 import os
 from typing import List, Dict, Any
+import numpy as np

+from src import RetrieverConfig, logger, get_chroma_client, get_embedder

 class Retriever:
     """
+    Retrieves documents from a ChromaDB collection.
     """
+    def __init__(self, collection_name: str, config: RetrieverConfig):
+        self.collection_name = collection_name
+        self.config = config
+        self.client = get_chroma_client()
+        self.embedder = get_embedder()
+        self.collection = self.client.get_or_create_collection(name=self.collection_name)

+    def retrieve(self, query: str, top_k: int = None) -> List[Dict[str, Any]]:
         """
+        Embeds a query and retrieves the top_k most similar documents from ChromaDB.
         """
+        if top_k is None:
+            top_k = self.config.TOP_K
+
+        if self.collection.count() == 0:
+            logger.warning(f"Chroma collection '{self.collection_name}' is empty. Cannot retrieve.")
             return []

         try:
+            # 1. Embed the query
+            query_embedding = self.embedder.embed([query])[0]
+
+            # 2. Query ChromaDB
+            results = self.collection.query(
+                query_embeddings=[query_embedding],
+                n_results=top_k,
+                include=["metadatas", "documents"]
+            )
+
+            # 3. Format results into chunks
+            # Chroma returns lists of lists, so we access the first element.
+            if not results or not results.get('ids', [[]])[0]:
+                return []
+
+            ids = results['ids'][0]
+            documents = results['documents'][0]
+            metadatas = results['metadatas'][0]
+
+            retrieved_chunks = []
+            for i, doc_id in enumerate(ids):
+                chunk = {
+                    'id': doc_id,
+                    'narration': documents[i],
+                    **metadatas[i]  # Add all other metadata from Chroma
+                }
+                retrieved_chunks.append(chunk)
+
+            return retrieved_chunks
+
+        except Exception as e:
+            logger.error(f"ChromaDB retrieval failed for collection '{self.collection_name}': {e}")
+            return []
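A standalone usage sketch of the new Retriever (illustrative only; assumes a collection named "my_doc" that GPP.run has already populated):

    from src import RetrieverConfig
    from src.retriever import Retriever

    retriever = Retriever("my_doc", RetrieverConfig)
    for chunk in retriever.retrieve("summarize the revenue table", top_k=3):
        print(chunk["id"], chunk["narration"][:80])
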
src/utils.py
CHANGED
@@ -6,6 +6,7 @@ import openai
 from typing import List
 from openai import AzureOpenAI
 from langchain_openai import AzureOpenAIEmbeddings
 from src import logger

@@ -15,7 +16,7 @@ class LLMClient:
     Reads API key from environment and exposes `generate(prompt)`.
     """
     @staticmethod
-    def generate(prompt: str, model: str = None, max_tokens: int = 512, **kwargs) -> str:
         azure_api_key = os.getenv('AZURE_API_KEY')
         azure_endpoint = os.getenv('AZURE_ENDPOINT')
         azure_api_version = os.getenv('AZURE_API_VERSION')
@@ -39,24 +40,57 @@ class LLMClient:
                 **kwargs
             )
             text = resp.choices[0].message.content.strip()
-            return text
         except Exception as e:
             logger.error(f'LLM generation failed: {e}')
             raise

 class OpenAIEmbedder:
     """
-    Wrapper around OpenAI Embeddings
-    …
-    embs = embedder.embed([str1, str2, ...])
     """
     def __init__(self, model_name: str):
-        self.…
-        [removed initializer lines; truncated in the source]

     def embed(self, texts: List[str]) -> List[List[float]]:
-        [removed embed body; the lines are truncated in the source]

src/utils.py (new version):
 from typing import List
 from openai import AzureOpenAI
 from langchain_openai import AzureOpenAIEmbeddings
+from sentence_transformers import SentenceTransformer
 from src import logger
...
     Reads API key from environment and exposes `generate(prompt)`.
     """
     @staticmethod
+    def generate(prompt: str, model: str = None, max_tokens: int = 512, **kwargs) -> tuple[str, str]:
         azure_api_key = os.getenv('AZURE_API_KEY')
         azure_endpoint = os.getenv('AZURE_ENDPOINT')
         azure_api_version = os.getenv('AZURE_API_VERSION')
...
                 **kwargs
             )
             text = resp.choices[0].message.content.strip()
+            finish_reason = resp.choices[0].finish_reason
+            return text, finish_reason
         except Exception as e:
             logger.error(f'LLM generation failed: {e}')
             raise

+class LocalEmbedder:
+    """
+    Wrapper for a local SentenceTransformer model.
+    """
+    def __init__(self, model_name: str):
+        self.model = SentenceTransformer(model_name)
+        logger.info(f"Initialized local embedder with model: {model_name}")
+
+    def embed(self, texts: List[str]) -> List[List[float]]:
+        """Embeds a list of texts using the local SentenceTransformer model."""
+        try:
+            embeddings = self.model.encode(texts, show_progress_bar=False)
+            return embeddings.tolist()
+        except Exception as e:
+            logger.error(f"Local embedding failed: {e}")
+            raise
+
+
 class OpenAIEmbedder:
     """
+    Wrapper around OpenAI and Azure OpenAI Embeddings.
+    Automatically uses Azure credentials if available, otherwise falls back to OpenAI.
     """
     def __init__(self, model_name: str):
+        self.model_name = model_name
+        self.is_azure = os.getenv('AZURE_API_KEY') and os.getenv('AZURE_ENDPOINT')
+
+        if self.is_azure:
+            logger.info("Using Azure OpenAI for embeddings.")
+            self.embedder = AzureOpenAIEmbeddings(
+                model=self.model_name,
+                azure_deployment=os.getenv("AZURE_EMBEDDING_DEPLOYMENT"),  # Assumes a deployment name is set
+                api_version=os.getenv("AZURE_API_VERSION")
+            )
+        else:
+            logger.info("Using standard OpenAI for embeddings.")
+            # This part would need OPENAI_API_KEY to be set
+            from langchain_openai import OpenAIEmbeddings
+            self.embedder = OpenAIEmbeddings(model=self.model_name)

     def embed(self, texts: List[str]) -> List[List[float]]:
+        """Embeds a list of texts."""
+        try:
+            return self.embedder.embed_documents(texts)
+        except Exception as e:
+            logger.error(f"Embedding failed: {e}")
+            raise