--- File: /home/sk/Desktop/chat-with-pdf/app.py --- import streamlit as st import os from utils.pdf_utils import PDFProcessor from utils.embeddings_utils import EmbeddingsManager from utils.qa_utils import QASystem from dotenv import load_dotenv import openai import time def initialize_session_state(): if 'pdf_processor' not in st.session_state: st.session_state['pdf_processor'] = None if 'embeddings_manager' not in st.session_state: st.session_state['embeddings_manager'] = None if 'qa_system' not in st.session_state: st.session_state['qa_system'] = None if 'processed_pdfs' not in st.session_state: st.session_state['processed_pdfs'] = set() if 'all_text_chunks' not in st.session_state: st.session_state['all_text_chunks'] = [] def main(): load_dotenv() st.set_page_config(page_title="Chat with PDF", layout="wide") st.title("📄💬 Chat with PDF") initialize_session_state() with st.sidebar: st.header("🔍 How to Use") st.markdown(""" 1. Upload PDF document(s) 2. Ask questions about the content 3. View answers and relevant context """) if 'total_tokens_used' in st.session_state: st.markdown("---") st.markdown("### 📊 Usage Statistics") st.markdown(f"Total tokens used: {st.session_state.get('total_tokens_used', 0)}") api_key = os.getenv("OPENAI_API_KEY") if not api_key: st.error("OpenAI API key not found in .env file!") return openai.api_key = api_key if not st.session_state['pdf_processor']: st.session_state['pdf_processor'] = PDFProcessor() if not st.session_state['embeddings_manager']: st.session_state['embeddings_manager'] = EmbeddingsManager(api_key) if not st.session_state['qa_system']: st.session_state['qa_system'] = QASystem(api_key) st.subheader("📤 Upload PDFs") uploaded_files = st.file_uploader( "Upload PDF documents", type=['pdf'], accept_multiple_files=True ) if uploaded_files: new_files = [f for f in uploaded_files if f.name not in st.session_state['processed_pdfs']] if new_files: with st.spinner("Processing PDFs..."): for pdf_file in new_files: try: pages = st.session_state['pdf_processor'].extract_text(pdf_file) for page_text in pages.values(): chunks = st.session_state['pdf_processor'].chunk_text(page_text) st.session_state['all_text_chunks'].extend(chunks) st.session_state['processed_pdfs'].add(pdf_file.name) except Exception as e: st.error(f"Error processing {pdf_file.name}: {str(e)}") continue with st.spinner("Generating embeddings..."): try: st.session_state['embeddings_manager'].generate_embeddings( st.session_state['all_text_chunks'] ) st.success("✅ Documents processed!") except Exception as e: st.error(f"Error generating embeddings: {str(e)}") return if st.session_state['all_text_chunks']: st.write("---") st.subheader("❓ Ask Questions About Your Documents") question = st.text_input("Enter your question:") if question: try: with st.spinner("Searching for relevant information..."): relevant_chunks = st.session_state['embeddings_manager'].find_relevant_chunks( question, k=3 ) answer = st.session_state['qa_system'].generate_answer( question, relevant_chunks ) st.markdown("### 🤖 Answer:") st.write(answer) with st.expander("🔍 View Source Context"): for i, chunk in enumerate(relevant_chunks, 1): st.markdown(f"**Context {i}:**") st.write(chunk) st.markdown("---") except openai.error.RateLimitError: st.error("Rate limit exceeded. Please try again later.") except Exception as e: st.error(f"Error: {str(e)}") if __name__ == "__main__": main() --- File: /home/sk/Desktop/chat-with-pdf/requirements.txt --- streamlit PyPDF2 openai python-dotenv faiss-cpu numpy pdf2image Pillow --- File: /home/sk/Desktop/chat-with-pdf/.env --- OPENAI_API_KEY=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 --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/app.py --- import streamlit as st import os from utils.pdf_utils import PDFProcessor from utils.embeddings_utils import EmbeddingsManager from utils.qa_utils import QASystem from dotenv import load_dotenv import openai import time def initialize_session_state(): if 'pdf_processor' not in st.session_state: st.session_state['pdf_processor'] = None if 'embeddings_manager' not in st.session_state: st.session_state['embeddings_manager'] = None if 'qa_system' not in st.session_state: st.session_state['qa_system'] = None if 'processed_pdfs' not in st.session_state: st.session_state['processed_pdfs'] = set() if 'all_text_chunks' not in st.session_state: st.session_state['all_text_chunks'] = [] def main(): load_dotenv() st.set_page_config(page_title="Chat with PDF", layout="wide") st.title("📄💬 Chat with PDF") initialize_session_state() with st.sidebar: st.header("🔍 How to Use") st.markdown(""" 1. Upload PDF document(s) 2. Ask questions about the content 3. View answers and relevant context """) if 'total_tokens_used' in st.session_state: st.markdown("---") st.markdown("### 📊 Usage Statistics") st.markdown(f"Total tokens used: {st.session_state.get('total_tokens_used', 0)}") api_key = os.getenv("OPENAI_API_KEY") if not api_key: st.error("OpenAI API key not found in .env file!") return openai.api_key = api_key if not st.session_state['pdf_processor']: st.session_state['pdf_processor'] = PDFProcessor() if not st.session_state['embeddings_manager']: st.session_state['embeddings_manager'] = EmbeddingsManager(api_key) if not st.session_state['qa_system']: st.session_state['qa_system'] = QASystem(api_key) st.subheader("📤 Upload PDFs") uploaded_files = st.file_uploader( "Upload PDF documents", type=['pdf'], accept_multiple_files=True ) if uploaded_files: new_files = [f for f in uploaded_files if f.name not in st.session_state['processed_pdfs']] if new_files: with st.spinner("Processing PDFs..."): for pdf_file in new_files: try: pages = st.session_state['pdf_processor'].extract_text(pdf_file) for page_text in pages.values(): chunks = st.session_state['pdf_processor'].chunk_text(page_text) st.session_state['all_text_chunks'].extend(chunks) st.session_state['processed_pdfs'].add(pdf_file.name) except Exception as e: st.error(f"Error processing {pdf_file.name}: {str(e)}") continue with st.spinner("Generating embeddings..."): try: st.session_state['embeddings_manager'].generate_embeddings( st.session_state['all_text_chunks'] ) st.success("✅ Documents processed!") except Exception as e: st.error(f"Error generating embeddings: {str(e)}") return if st.session_state['all_text_chunks']: st.write("---") st.subheader("❓ Ask Questions About Your Documents") question = st.text_input("Enter your question:") if question: try: with st.spinner("Searching for relevant information..."): relevant_chunks = st.session_state['embeddings_manager'].find_relevant_chunks( question, k=3 ) answer = st.session_state['qa_system'].generate_answer( question, relevant_chunks ) st.markdown("### 🤖 Answer:") st.write(answer) with st.expander("🔍 View Source Context"): for i, chunk in enumerate(relevant_chunks, 1): st.markdown(f"**Context {i}:**") st.write(chunk) st.markdown("---") except openai.error.RateLimitError: st.error("Rate limit exceeded. Please try again later.") except Exception as e: st.error(f"Error: {str(e)}") if __name__ == "__main__": main() --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/requirements.txt --- streamlit PyPDF2 openai python-dotenv faiss-cpu numpy pdf2image Pillow --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/.gitattributes --- *.7z filter=lfs diff=lfs merge=lfs -text *.arrow filter=lfs diff=lfs merge=lfs -text *.bin filter=lfs diff=lfs merge=lfs -text *.bz2 filter=lfs diff=lfs merge=lfs -text *.ckpt filter=lfs diff=lfs merge=lfs -text *.ftz filter=lfs diff=lfs merge=lfs -text *.gz filter=lfs diff=lfs merge=lfs -text *.h5 filter=lfs diff=lfs merge=lfs -text *.joblib filter=lfs diff=lfs merge=lfs -text *.lfs.* filter=lfs diff=lfs merge=lfs -text *.mlmodel filter=lfs diff=lfs merge=lfs -text *.model filter=lfs diff=lfs merge=lfs -text *.msgpack filter=lfs diff=lfs merge=lfs -text *.npy filter=lfs diff=lfs merge=lfs -text *.npz filter=lfs diff=lfs merge=lfs -text *.onnx filter=lfs diff=lfs merge=lfs -text *.ot filter=lfs diff=lfs merge=lfs -text *.parquet filter=lfs diff=lfs merge=lfs -text *.pb filter=lfs diff=lfs merge=lfs -text *.pickle filter=lfs diff=lfs merge=lfs -text *.pkl filter=lfs diff=lfs merge=lfs -text *.pt filter=lfs diff=lfs merge=lfs -text *.pth filter=lfs diff=lfs merge=lfs -text *.rar filter=lfs diff=lfs merge=lfs -text *.safetensors filter=lfs diff=lfs merge=lfs -text saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.tar.* filter=lfs diff=lfs merge=lfs -text *.tar filter=lfs diff=lfs merge=lfs -text *.tflite filter=lfs diff=lfs merge=lfs -text *.tgz filter=lfs diff=lfs merge=lfs -text *.wasm filter=lfs diff=lfs merge=lfs -text *.xz filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/.env --- OPENAI_API_KEY=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 --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/qa_utils.py --- import openai from typing import List class QASystem: def __init__(self, api_key: str): openai.api_key = api_key def generate_answer(self, question: str, context: List[str]) -> str: prompt = f"""Based on the context provided below, answer the question. If the answer is not in the context, respond with "The answer is not in the provided context." Context: {' '.join(context)} Question: {question} """ response = openai.chat.completions.create( # Updated line model="gpt-4", messages=[ {"role": "system", "content": "You are an assistant answering questions based on the provided context."}, {"role": "user", "content": prompt} ], temperature=0, max_tokens=500 ) return response.choices[0].message.content --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/embeddings_utils.py --- import openai import numpy as np import faiss from typing import List class EmbeddingsManager: def __init__(self, api_key: str): self.api_key = api_key self.index = None self.chunks = [] def generate_embeddings(self, text_chunks: List[str]): """Generate embeddings for text chunks using OpenAI API.""" batch_size = 10 embeddings = [] for i in range(0, len(text_chunks), batch_size): batch = text_chunks[i:i + batch_size] response = openai.embeddings.create( input=batch, model="text-embedding-ada-002" ) # Access the embeddings using attributes batch_embeddings = [item.embedding for item in response.data] embeddings.extend(batch_embeddings) # Create FAISS index dimension = len(embeddings[0]) self.index = faiss.IndexFlatL2(dimension) embeddings_array = np.array(embeddings).astype('float32') self.index.add(embeddings_array) self.chunks = text_chunks def find_relevant_chunks(self, query: str, k: int = 3) -> List[str]: """Find most relevant text chunks for a given query.""" response = openai.embeddings.create( input=[query], model="text-embedding-ada-002" ) # Access the query embedding using attributes query_embedding = response.data[0].embedding D, I = self.index.search( np.array([query_embedding]).astype('float32'), k ) return [self.chunks[i] for i in I[0] if i != -1] --- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/pdf_utils.py --- import PyPDF2 from typing import List, Dict class PDFProcessor: def __init__(self): self.pages = {} def extract_text(self, pdf_file) -> Dict[int, str]: """Extract text from PDF and return a dictionary of page numbers and text.""" pdf_reader = PyPDF2.PdfReader(pdf_file) for page_num in range(len(pdf_reader.pages)): text = pdf_reader.pages[page_num].extract_text() self.pages[page_num] = text return self.pages def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]: """Split text into chunks of specified size.""" words = text.split() chunks = [] current_chunk = [] current_size = 0 for word in words: current_size += len(word) + 1 # +1 for space if current_size > chunk_size: chunks.append(' '.join(current_chunk)) current_chunk = [word] current_size = len(word) else: current_chunk.append(word) if current_chunk: chunks.append(' '.join(current_chunk)) return chunks --- File: /home/sk/Desktop/chat-with-pdf/utils/qa_utils.py --- import openai from typing import List class QASystem: def __init__(self, api_key: str): openai.api_key = api_key def generate_answer(self, question: str, context: List[str]) -> str: prompt = f"""Based on the context provided below, answer the question. If the answer is not in the context, respond with "The answer is not in the provided context." Context: {' '.join(context)} Question: {question} """ response = openai.chat.completions.create( # Updated line model="gpt-4", messages=[ {"role": "system", "content": "You are an assistant answering questions based on the provided context."}, {"role": "user", "content": prompt} ], temperature=0, max_tokens=500 ) return response.choices[0].message.content --- File: /home/sk/Desktop/chat-with-pdf/utils/embeddings_utils.py --- import openai import numpy as np import faiss from typing import List class EmbeddingsManager: def __init__(self, api_key: str): self.api_key = api_key self.index = None self.chunks = [] def generate_embeddings(self, text_chunks: List[str]): """Generate embeddings for text chunks using OpenAI API.""" batch_size = 10 embeddings = [] for i in range(0, len(text_chunks), batch_size): batch = text_chunks[i:i + batch_size] response = openai.embeddings.create( input=batch, model="text-embedding-ada-002" ) # Access the embeddings using attributes batch_embeddings = [item.embedding for item in response.data] embeddings.extend(batch_embeddings) # Create FAISS index dimension = len(embeddings[0]) self.index = faiss.IndexFlatL2(dimension) embeddings_array = np.array(embeddings).astype('float32') self.index.add(embeddings_array) self.chunks = text_chunks def find_relevant_chunks(self, query: str, k: int = 3) -> List[str]: """Find most relevant text chunks for a given query.""" response = openai.embeddings.create( input=[query], model="text-embedding-ada-002" ) # Access the query embedding using attributes query_embedding = response.data[0].embedding D, I = self.index.search( np.array([query_embedding]).astype('float32'), k ) return [self.chunks[i] for i in I[0] if i != -1] --- File: /home/sk/Desktop/chat-with-pdf/utils/pdf_utils.py --- import PyPDF2 from typing import List, Dict class PDFProcessor: def __init__(self): self.pages = {} def extract_text(self, pdf_file) -> Dict[int, str]: """Extract text from PDF and return a dictionary of page numbers and text.""" pdf_reader = PyPDF2.PdfReader(pdf_file) for page_num in range(len(pdf_reader.pages)): text = pdf_reader.pages[page_num].extract_text() self.pages[page_num] = text return self.pages def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]: """Split text into chunks of specified size.""" words = text.split() chunks = [] current_chunk = [] current_size = 0 for word in words: current_size += len(word) + 1 # +1 for space if current_size > chunk_size: chunks.append(' '.join(current_chunk)) current_chunk = [word] current_size = len(word) else: current_chunk.append(word) if current_chunk: chunks.append(' '.join(current_chunk)) return chunks