Spaces:

Inferno-721
/

Sutra_AI

Sleeping

File size: 19,766 Bytes

0753d2e

--- File: /home/sk/Desktop/chat-with-pdf/app.py ---

import streamlit as st
import os
from utils.pdf_utils import PDFProcessor
from utils.embeddings_utils import EmbeddingsManager
from utils.qa_utils import QASystem
from dotenv import load_dotenv
import openai
import time

def initialize_session_state():
    """Seed ``st.session_state`` with every key the app reads later.

    Keys that already exist are left untouched. Missing keys receive
    ``None`` for the three helper-object slots, an empty set for
    ``processed_pdfs`` and an empty list for ``all_text_chunks``.
    """
    # Factories instead of ready-made values: each time a missing key is
    # filled in, a brand-new set/list is created for it.
    default_factories = {
        'pdf_processor': lambda: None,
        'embeddings_manager': lambda: None,
        'qa_system': lambda: None,
        'processed_pdfs': set,
        'all_text_chunks': list,
    }
    for key, make_default in default_factories.items():
        if key not in st.session_state:
            st.session_state[key] = make_default()

def main():
    """Render the "Chat with PDF" Streamlit page.

    Steps, in order: load environment variables, draw the title and sidebar,
    check that ``OPENAI_API_KEY`` is set, create the helper objects in the
    session if they are missing, ingest newly uploaded PDFs (chunk + embed),
    and answer the user's question from the most relevant chunks.

    Returns early (after showing an error) when the API key is missing or
    when embedding generation fails.
    """
    load_dotenv()
    st.set_page_config(page_title="Chat with PDF", layout="wide")
    st.title("📄💬 Chat with PDF")

    initialize_session_state()

    with st.sidebar:
        st.header("🔍 How to Use")
        st.markdown("""
        1. Upload PDF document(s)
        2. Ask questions about the content
        3. View answers and relevant context
        """)
        if 'total_tokens_used' in st.session_state:
            st.markdown("---")
            st.markdown("### 📊 Usage Statistics")
            st.markdown(f"Total tokens used: {st.session_state.get('total_tokens_used', 0)}")

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        st.error("OpenAI API key not found in .env file!")
        return

    openai.api_key = api_key

    # Helper objects are created once and then reused from the session.
    if not st.session_state['pdf_processor']:
        st.session_state['pdf_processor'] = PDFProcessor()
    if not st.session_state['embeddings_manager']:
        st.session_state['embeddings_manager'] = EmbeddingsManager(api_key)
    if not st.session_state['qa_system']:
        st.session_state['qa_system'] = QASystem(api_key)

    st.subheader("📤 Upload PDFs")
    uploaded_files = st.file_uploader(
        "Upload PDF documents",
        type=['pdf'],
        accept_multiple_files=True
    )

    if uploaded_files:
        new_files = [f for f in uploaded_files if f.name not in st.session_state['processed_pdfs']]
        if new_files:
            processor = st.session_state['pdf_processor']
            # Stage this batch locally and commit it to the session only once
            # embeddings exist. Otherwise a failed embedding call would leave
            # files marked as processed with chunks that were never indexed.
            staged_chunks = []
            staged_names = []
            with st.spinner("Processing PDFs..."):
                for pdf_file in new_files:
                    try:
                        pages = processor.extract_text(pdf_file)
                        file_chunks = []
                        for page_text in pages.values():
                            file_chunks.extend(processor.chunk_text(page_text))
                    except Exception as e:
                        # A file that fails part-way contributes nothing, so a
                        # later retry cannot duplicate its chunks.
                        st.error(f"Error processing {pdf_file.name}: {str(e)}")
                        continue
                    staged_chunks.extend(file_chunks)
                    staged_names.append(pdf_file.name)

                if staged_chunks:
                    with st.spinner("Generating embeddings..."):
                        candidate_chunks = st.session_state['all_text_chunks'] + staged_chunks
                        try:
                            st.session_state['embeddings_manager'].generate_embeddings(
                                candidate_chunks
                            )
                        except Exception as e:
                            st.error(f"Error generating embeddings: {str(e)}")
                            return
                        st.session_state['all_text_chunks'] = candidate_chunks
                        st.session_state['processed_pdfs'].update(staged_names)
                        st.success("✅ Documents processed!")
                elif staged_names:
                    # Parsed fine but yielded no text: remember the files so
                    # they are not re-parsed on every rerun, and skip the
                    # embedding call, which has nothing to embed.
                    st.session_state['processed_pdfs'].update(staged_names)
                    st.warning("No extractable text was found in the uploaded PDF(s).")

        if st.session_state['all_text_chunks']:
            st.write("---")
            st.subheader("❓ Ask Questions About Your Documents")
            question = st.text_input("Enter your question:")
            if question:
                try:
                    with st.spinner("Searching for relevant information..."):
                        relevant_chunks = st.session_state['embeddings_manager'].find_relevant_chunks(
                            question,
                            k=3
                        )
                        answer = st.session_state['qa_system'].generate_answer(
                            question,
                            relevant_chunks
                        )
                        st.markdown("### 🤖 Answer:")
                        st.write(answer)
                        with st.expander("🔍 View Source Context"):
                            for i, chunk in enumerate(relevant_chunks, 1):
                                st.markdown(f"**Context {i}:**")
                                st.write(chunk)
                                st.markdown("---")
                # The helpers use the openai>=1.0 client API
                # (openai.chat.completions / openai.embeddings), where the
                # exception classes live at the package top level and the old
                # ``openai.error`` module no longer exists.
                except openai.RateLimitError:
                    st.error("Rate limit exceeded. Please try again later.")
                except Exception as e:
                    st.error(f"Error: {str(e)}")

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


--- File: /home/sk/Desktop/chat-with-pdf/requirements.txt ---

streamlit
PyPDF2
openai
python-dotenv
faiss-cpu
numpy
pdf2image
Pillow

--- File: /home/sk/Desktop/chat-with-pdf/.env ---

# Secret redacted: API keys must never be committed. Set your own key locally
# and revoke the key that was previously published here.
OPENAI_API_KEY=

--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/app.py ---

import streamlit as st
import os
from utils.pdf_utils import PDFProcessor
from utils.embeddings_utils import EmbeddingsManager
from utils.qa_utils import QASystem
from dotenv import load_dotenv
import openai
import time

def initialize_session_state():
    """Seed ``st.session_state`` with every key the app reads later.

    Keys that already exist are left untouched. Missing keys receive
    ``None`` for the three helper-object slots, an empty set for
    ``processed_pdfs`` and an empty list for ``all_text_chunks``.
    """
    # Factories instead of ready-made values: each time a missing key is
    # filled in, a brand-new set/list is created for it.
    default_factories = {
        'pdf_processor': lambda: None,
        'embeddings_manager': lambda: None,
        'qa_system': lambda: None,
        'processed_pdfs': set,
        'all_text_chunks': list,
    }
    for key, make_default in default_factories.items():
        if key not in st.session_state:
            st.session_state[key] = make_default()

def main():
    """Render the "Chat with PDF" Streamlit page.

    Steps, in order: load environment variables, draw the title and sidebar,
    check that ``OPENAI_API_KEY`` is set, create the helper objects in the
    session if they are missing, ingest newly uploaded PDFs (chunk + embed),
    and answer the user's question from the most relevant chunks.

    Returns early (after showing an error) when the API key is missing or
    when embedding generation fails.
    """
    load_dotenv()
    st.set_page_config(page_title="Chat with PDF", layout="wide")
    st.title("📄💬 Chat with PDF")

    initialize_session_state()

    with st.sidebar:
        st.header("🔍 How to Use")
        st.markdown("""
        1. Upload PDF document(s)
        2. Ask questions about the content
        3. View answers and relevant context
        """)
        if 'total_tokens_used' in st.session_state:
            st.markdown("---")
            st.markdown("### 📊 Usage Statistics")
            st.markdown(f"Total tokens used: {st.session_state.get('total_tokens_used', 0)}")

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        st.error("OpenAI API key not found in .env file!")
        return

    openai.api_key = api_key

    # Helper objects are created once and then reused from the session.
    if not st.session_state['pdf_processor']:
        st.session_state['pdf_processor'] = PDFProcessor()
    if not st.session_state['embeddings_manager']:
        st.session_state['embeddings_manager'] = EmbeddingsManager(api_key)
    if not st.session_state['qa_system']:
        st.session_state['qa_system'] = QASystem(api_key)

    st.subheader("📤 Upload PDFs")
    uploaded_files = st.file_uploader(
        "Upload PDF documents",
        type=['pdf'],
        accept_multiple_files=True
    )

    if uploaded_files:
        new_files = [f for f in uploaded_files if f.name not in st.session_state['processed_pdfs']]
        if new_files:
            processor = st.session_state['pdf_processor']
            # Stage this batch locally and commit it to the session only once
            # embeddings exist. Otherwise a failed embedding call would leave
            # files marked as processed with chunks that were never indexed.
            staged_chunks = []
            staged_names = []
            with st.spinner("Processing PDFs..."):
                for pdf_file in new_files:
                    try:
                        pages = processor.extract_text(pdf_file)
                        file_chunks = []
                        for page_text in pages.values():
                            file_chunks.extend(processor.chunk_text(page_text))
                    except Exception as e:
                        # A file that fails part-way contributes nothing, so a
                        # later retry cannot duplicate its chunks.
                        st.error(f"Error processing {pdf_file.name}: {str(e)}")
                        continue
                    staged_chunks.extend(file_chunks)
                    staged_names.append(pdf_file.name)

                if staged_chunks:
                    with st.spinner("Generating embeddings..."):
                        candidate_chunks = st.session_state['all_text_chunks'] + staged_chunks
                        try:
                            st.session_state['embeddings_manager'].generate_embeddings(
                                candidate_chunks
                            )
                        except Exception as e:
                            st.error(f"Error generating embeddings: {str(e)}")
                            return
                        st.session_state['all_text_chunks'] = candidate_chunks
                        st.session_state['processed_pdfs'].update(staged_names)
                        st.success("✅ Documents processed!")
                elif staged_names:
                    # Parsed fine but yielded no text: remember the files so
                    # they are not re-parsed on every rerun, and skip the
                    # embedding call, which has nothing to embed.
                    st.session_state['processed_pdfs'].update(staged_names)
                    st.warning("No extractable text was found in the uploaded PDF(s).")

        if st.session_state['all_text_chunks']:
            st.write("---")
            st.subheader("❓ Ask Questions About Your Documents")
            question = st.text_input("Enter your question:")
            if question:
                try:
                    with st.spinner("Searching for relevant information..."):
                        relevant_chunks = st.session_state['embeddings_manager'].find_relevant_chunks(
                            question,
                            k=3
                        )
                        answer = st.session_state['qa_system'].generate_answer(
                            question,
                            relevant_chunks
                        )
                        st.markdown("### 🤖 Answer:")
                        st.write(answer)
                        with st.expander("🔍 View Source Context"):
                            for i, chunk in enumerate(relevant_chunks, 1):
                                st.markdown(f"**Context {i}:**")
                                st.write(chunk)
                                st.markdown("---")
                # The helpers use the openai>=1.0 client API
                # (openai.chat.completions / openai.embeddings), where the
                # exception classes live at the package top level and the old
                # ``openai.error`` module no longer exists.
                except openai.RateLimitError:
                    st.error("Rate limit exceeded. Please try again later.")
                except Exception as e:
                    st.error(f"Error: {str(e)}")

# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/requirements.txt ---

streamlit
PyPDF2
openai
python-dotenv
faiss-cpu
numpy
pdf2image
Pillow

--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/.gitattributes ---

*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text


--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/.env ---

# Secret redacted: API keys must never be committed. Set your own key locally
# and revoke the key that was previously published here.
OPENAI_API_KEY=

--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/qa_utils.py ---

import openai
from typing import List

class QASystem:
    """Answer questions from retrieved context via the OpenAI chat API."""

    def __init__(self, api_key: str):
        """Install *api_key* on the ``openai`` module (nothing is kept on the instance)."""
        openai.api_key = api_key

    def generate_answer(self, question: str, context: List[str]) -> str:
        """Ask the chat model to answer *question* using only *context*.

        Args:
            question: The user's question, inserted verbatim into the prompt.
            context: Text snippets; they are joined with single spaces.

        Returns:
            The message content of the first choice in the API response.
        """
        joined_context = ' '.join(context)
        prompt = f"""Based on the context provided below, answer the question.
        If the answer is not in the context, respond with "The answer is not in the provided context."

        Context:
        {joined_context}

        Question: {question}
        """

        system_message = {
            "role": "system",
            "content": "You are an assistant answering questions based on the provided context.",
        }
        user_message = {"role": "user", "content": prompt}

        completion = openai.chat.completions.create(
            model="gpt-4",
            messages=[system_message, user_message],
            temperature=0,
            max_tokens=500,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content


--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/embeddings_utils.py ---

import openai
import numpy as np
import faiss
from typing import List

class EmbeddingsManager:
    """Embed text chunks with the OpenAI API and search them with a FAISS index."""

    def __init__(self, api_key: str):
        """Store *api_key* and start with no index and no chunks."""
        self.api_key = api_key
        # FAISS index over the chunk embeddings; None until chunks are indexed.
        self.index = None
        # Chunks in the same order as their vectors were added to the index.
        self.chunks = []

    def generate_embeddings(self, text_chunks: List[str]):
        """Embed *text_chunks* and rebuild the search index from scratch.

        Empty strings are dropped before embedding. When nothing is left to
        embed, the index is cleared instead of raising.

        Args:
            text_chunks: Chunks to index; the manager keeps its own copy.

        Raises:
            Whatever the OpenAI client or FAISS raise; in that case the
            previously built index and chunk list are left unchanged.
        """
        # The embeddings endpoint rejects empty-string inputs, and an empty
        # chunk carries no information to retrieve anyway.
        usable_chunks = [chunk for chunk in text_chunks if chunk]
        if not usable_chunks:
            self.index = None
            self.chunks = []
            return

        batch_size = 10
        embeddings = []
        for start in range(0, len(usable_chunks), batch_size):
            batch = usable_chunks[start:start + batch_size]
            response = openai.embeddings.create(
                input=batch,
                model="text-embedding-ada-002"
            )
            embeddings.extend(item.embedding for item in response.data)

        embeddings_array = np.array(embeddings).astype('float32')
        new_index = faiss.IndexFlatL2(embeddings_array.shape[1])
        new_index.add(embeddings_array)

        # Publish index and chunks together, and keep a private copy of the
        # chunks so later mutation of the caller's list cannot put positions
        # out of step with the index.
        self.index = new_index
        self.chunks = usable_chunks

    def find_relevant_chunks(self, query: str, k: int = 3) -> List[str]:
        """Return up to *k* indexed chunks closest to *query*.

        Returns an empty list, without calling the API, when nothing has been
        indexed yet or when *k* is not positive.
        """
        if self.index is None or not self.chunks:
            return []
        k = min(k, len(self.chunks))
        if k <= 0:
            return []

        response = openai.embeddings.create(
            input=[query],
            model="text-embedding-ada-002"
        )
        query_vector = np.array([response.data[0].embedding]).astype('float32')

        _distances, neighbours = self.index.search(query_vector, k)

        # -1 entries are skipped, as in the original lookup.
        return [self.chunks[i] for i in neighbours[0] if i != -1]


--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/pdf_utils.py ---

import PyPDF2
from typing import List, Dict

class PDFProcessor:
    """Extract per-page text from PDFs and split text into word-aligned chunks."""

    def __init__(self):
        """Start with no extracted pages."""
        # Page index -> text of the most recently extracted PDF.
        self.pages = {}

    def extract_text(self, pdf_file) -> Dict[int, str]:
        """Extract the text of every page of *pdf_file*.

        Args:
            pdf_file: Anything ``PyPDF2.PdfReader`` accepts.

        Returns:
            A dict mapping the zero-based page index to that page's text
            (``""`` when the page yields no text). The same dict is also
            stored on ``self.pages``.
        """
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Build a fresh mapping per call. Updating the previous dict in place
        # would leave pages of an earlier, longer PDF in the result for a
        # later, shorter one.
        pages = {
            page_num: page.extract_text() or ""
            for page_num, page in enumerate(pdf_reader.pages)
        }
        self.pages = pages
        return pages

    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
        """Split *text* into space-joined chunks of whole words.

        Words are never split, so a single word longer than *chunk_size*
        becomes a chunk of its own. Empty or ``None`` text yields ``[]``.

        Args:
            text: The text to split on whitespace.
            chunk_size: Character budget per chunk.

        Returns:
            The list of non-empty chunks, in order.
        """
        if not text:
            return []

        chunks = []
        current_chunk = []
        current_size = 0

        for word in text.split():
            current_size += len(word) + 1  # +1 for space
            # Flush only when there is something to flush; otherwise an
            # oversized first word would produce an empty chunk.
            if current_size > chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_size = len(word)
            else:
                current_chunk.append(word)

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks


--- File: /home/sk/Desktop/chat-with-pdf/utils/qa_utils.py ---

import openai
from typing import List

class QASystem:
    """Answer questions from retrieved context via the OpenAI chat API."""

    def __init__(self, api_key: str):
        """Install *api_key* on the ``openai`` module (nothing is kept on the instance)."""
        openai.api_key = api_key

    def generate_answer(self, question: str, context: List[str]) -> str:
        """Ask the chat model to answer *question* using only *context*.

        Args:
            question: The user's question, inserted verbatim into the prompt.
            context: Text snippets; they are joined with single spaces.

        Returns:
            The message content of the first choice in the API response.
        """
        joined_context = ' '.join(context)
        prompt = f"""Based on the context provided below, answer the question.
        If the answer is not in the context, respond with "The answer is not in the provided context."

        Context:
        {joined_context}

        Question: {question}
        """

        system_message = {
            "role": "system",
            "content": "You are an assistant answering questions based on the provided context.",
        }
        user_message = {"role": "user", "content": prompt}

        completion = openai.chat.completions.create(
            model="gpt-4",
            messages=[system_message, user_message],
            temperature=0,
            max_tokens=500,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content


--- File: /home/sk/Desktop/chat-with-pdf/utils/embeddings_utils.py ---

import openai
import numpy as np
import faiss
from typing import List

class EmbeddingsManager:
    """Embed text chunks with the OpenAI API and search them with a FAISS index."""

    def __init__(self, api_key: str):
        """Store *api_key* and start with no index and no chunks."""
        self.api_key = api_key
        # FAISS index over the chunk embeddings; None until chunks are indexed.
        self.index = None
        # Chunks in the same order as their vectors were added to the index.
        self.chunks = []

    def generate_embeddings(self, text_chunks: List[str]):
        """Embed *text_chunks* and rebuild the search index from scratch.

        Empty strings are dropped before embedding. When nothing is left to
        embed, the index is cleared instead of raising.

        Args:
            text_chunks: Chunks to index; the manager keeps its own copy.

        Raises:
            Whatever the OpenAI client or FAISS raise; in that case the
            previously built index and chunk list are left unchanged.
        """
        # The embeddings endpoint rejects empty-string inputs, and an empty
        # chunk carries no information to retrieve anyway.
        usable_chunks = [chunk for chunk in text_chunks if chunk]
        if not usable_chunks:
            self.index = None
            self.chunks = []
            return

        batch_size = 10
        embeddings = []
        for start in range(0, len(usable_chunks), batch_size):
            batch = usable_chunks[start:start + batch_size]
            response = openai.embeddings.create(
                input=batch,
                model="text-embedding-ada-002"
            )
            embeddings.extend(item.embedding for item in response.data)

        embeddings_array = np.array(embeddings).astype('float32')
        new_index = faiss.IndexFlatL2(embeddings_array.shape[1])
        new_index.add(embeddings_array)

        # Publish index and chunks together, and keep a private copy of the
        # chunks so later mutation of the caller's list cannot put positions
        # out of step with the index.
        self.index = new_index
        self.chunks = usable_chunks

    def find_relevant_chunks(self, query: str, k: int = 3) -> List[str]:
        """Return up to *k* indexed chunks closest to *query*.

        Returns an empty list, without calling the API, when nothing has been
        indexed yet or when *k* is not positive.
        """
        if self.index is None or not self.chunks:
            return []
        k = min(k, len(self.chunks))
        if k <= 0:
            return []

        response = openai.embeddings.create(
            input=[query],
            model="text-embedding-ada-002"
        )
        query_vector = np.array([response.data[0].embedding]).astype('float32')

        _distances, neighbours = self.index.search(query_vector, k)

        # -1 entries are skipped, as in the original lookup.
        return [self.chunks[i] for i in neighbours[0] if i != -1]


--- File: /home/sk/Desktop/chat-with-pdf/utils/pdf_utils.py ---

import PyPDF2
from typing import List, Dict

class PDFProcessor:
    """Extract per-page text from PDFs and split text into word-aligned chunks."""

    def __init__(self):
        """Start with no extracted pages."""
        # Page index -> text of the most recently extracted PDF.
        self.pages = {}

    def extract_text(self, pdf_file) -> Dict[int, str]:
        """Extract the text of every page of *pdf_file*.

        Args:
            pdf_file: Anything ``PyPDF2.PdfReader`` accepts.

        Returns:
            A dict mapping the zero-based page index to that page's text
            (``""`` when the page yields no text). The same dict is also
            stored on ``self.pages``.
        """
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Build a fresh mapping per call. Updating the previous dict in place
        # would leave pages of an earlier, longer PDF in the result for a
        # later, shorter one.
        pages = {
            page_num: page.extract_text() or ""
            for page_num, page in enumerate(pdf_reader.pages)
        }
        self.pages = pages
        return pages

    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
        """Split *text* into space-joined chunks of whole words.

        Words are never split, so a single word longer than *chunk_size*
        becomes a chunk of its own. Empty or ``None`` text yields ``[]``.

        Args:
            text: The text to split on whitespace.
            chunk_size: Character budget per chunk.

        Returns:
            The list of non-empty chunks, in order.
        """
        if not text:
            return []

        chunks = []
        current_chunk = []
        current_size = 0

        for word in text.split():
            current_size += len(word) + 1  # +1 for space
            # Flush only when there is something to flush; otherwise an
            # oversized first word would produce an empty chunk.
            if current_size > chunk_size and current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_size = len(word)
            else:
                current_chunk.append(word)

        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks