Spaces:
Sleeping
Sleeping
--- File: /home/sk/Desktop/chat-with-pdf/app.py --- | |
import streamlit as st | |
import os | |
from utils.pdf_utils import PDFProcessor | |
from utils.embeddings_utils import EmbeddingsManager | |
from utils.qa_utils import QASystem | |
from dotenv import load_dotenv | |
import openai | |
import time | |
def initialize_session_state():
    """Seed ``st.session_state`` with every key the app reads.

    Keys that already exist are left untouched, so values survive
    Streamlit reruns within the same session.
    """
    # Factories rather than shared instances: each session gets its own
    # fresh set/list instead of one mutable object reused across sessions.
    default_factories = {
        'pdf_processor': lambda: None,
        'embeddings_manager': lambda: None,
        'qa_system': lambda: None,
        'processed_pdfs': set,
        'all_text_chunks': list,
    }
    for state_key, make_default in default_factories.items():
        if state_key not in st.session_state:
            st.session_state[state_key] = make_default()
def _extract_file_chunks(pdf_file):
    """Return the text chunks of one uploaded PDF.

    Chunks are collected in the order ``extract_text`` yields its pages.
    Any exception raised by the PDF processor propagates to the caller.
    """
    processor = st.session_state['pdf_processor']
    file_chunks = []
    for page_text in processor.extract_text(pdf_file).values():
        if page_text:  # pages without extractable text contribute nothing
            file_chunks.extend(processor.chunk_text(page_text))
    return file_chunks


def _ingest_new_files(new_files):
    """Chunk newly uploaded PDFs and rebuild the embeddings index.

    Session state is only updated once a step has fully succeeded:
    a PDF that fails to parse contributes no partial chunks, and nothing
    is recorded at all when embedding generation fails (so the same
    upload is retried on the next rerun instead of leaving a stale index).

    Returns:
        False when embedding generation failed, True otherwise.
    """
    new_chunks = []
    ingested_names = []
    with st.spinner("Processing PDFs..."):
        for pdf_file in new_files:
            try:
                file_chunks = _extract_file_chunks(pdf_file)
            except Exception as e:
                st.error(f"Error processing {pdf_file.name}: {str(e)}")
                continue
            new_chunks.extend(file_chunks)
            ingested_names.append(pdf_file.name)

    if new_chunks:
        all_chunks = st.session_state['all_text_chunks'] + new_chunks
        with st.spinner("Generating embeddings..."):
            try:
                st.session_state['embeddings_manager'].generate_embeddings(all_chunks)
            except Exception as e:
                st.error(f"Error generating embeddings: {str(e)}")
                return False
        st.session_state['all_text_chunks'] = all_chunks
        st.success("β Documents processed!")
    elif ingested_names:
        # Nothing to embed (e.g. image-only PDFs); skip the API call entirely.
        st.warning("No extractable text was found in the uploaded PDFs.")

    st.session_state['processed_pdfs'].update(ingested_names)
    return True


def _answer_question(question):
    """Retrieve the most relevant chunks for ``question`` and render the answer."""
    try:
        with st.spinner("Searching for relevant information..."):
            relevant_chunks = st.session_state['embeddings_manager'].find_relevant_chunks(
                question,
                k=3
            )
            answer = st.session_state['qa_system'].generate_answer(
                question,
                relevant_chunks
            )
        st.markdown("### π€ Answer:")
        st.write(answer)
        with st.expander("π View Source Context"):
            for i, chunk in enumerate(relevant_chunks, 1):
                st.markdown(f"**Context {i}:**")
                st.write(chunk)
                st.markdown("---")
    # openai>=1.0 (required by the `openai.chat.completions` / `openai.embeddings`
    # calls used elsewhere in this project) exposes the error classes at package
    # level; the old `openai.error` module no longer exists there.
    except openai.RateLimitError:
        st.error("Rate limit exceeded. Please try again later.")
    except Exception as e:
        st.error(f"Error: {str(e)}")


def main():
    """Run the Streamlit "Chat with PDF" page.

    Loads the OpenAI key from the environment, lazily creates the helper
    objects kept in ``st.session_state``, ingests newly uploaded PDFs and
    answers questions against the chunks collected so far.
    """
    # NOTE(review): several UI labels below begin with characters that look
    # like mis-decoded emoji (e.g. "π", "β"). They are kept exactly as found;
    # restore the intended glyphs from the original file.
    load_dotenv()
    st.set_page_config(page_title="Chat with PDF", layout="wide")
    st.title("ππ¬ Chat with PDF")
    initialize_session_state()

    with st.sidebar:
        st.header("π How to Use")
        st.markdown(
            "\n"
            "1. Upload PDF document(s)\n"
            "2. Ask questions about the content\n"
            "3. View answers and relevant context\n"
        )
        if 'total_tokens_used' in st.session_state:
            st.markdown("---")
            st.markdown("### π Usage Statistics")
            st.markdown(f"Total tokens used: {st.session_state.get('total_tokens_used', 0)}")

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        st.error("OpenAI API key not found in .env file!")
        return
    openai.api_key = api_key

    # Helpers are created once per session and reused across reruns.
    if not st.session_state['pdf_processor']:
        st.session_state['pdf_processor'] = PDFProcessor()
    if not st.session_state['embeddings_manager']:
        st.session_state['embeddings_manager'] = EmbeddingsManager(api_key)
    if not st.session_state['qa_system']:
        st.session_state['qa_system'] = QASystem(api_key)

    st.subheader("π€ Upload PDFs")
    uploaded_files = st.file_uploader(
        "Upload PDF documents",
        type=['pdf'],
        accept_multiple_files=True
    )
    if uploaded_files:
        # Files are de-duplicated by name only.
        new_files = [f for f in uploaded_files if f.name not in st.session_state['processed_pdfs']]
        if new_files and not _ingest_new_files(new_files):
            return

    if st.session_state['all_text_chunks']:
        st.write("---")
        st.subheader("β Ask Questions About Your Documents")
        question = st.text_input("Enter your question:")
        if question:
            _answer_question(question)
# Script entry point: only start the app when this file is executed directly.
if __name__ == "__main__":
    main()
--- File: /home/sk/Desktop/chat-with-pdf/requirements.txt --- | |
streamlit | |
PyPDF2 | |
openai | |
python-dotenv | |
faiss-cpu | |
numpy | |
pdf2image | |
Pillow | |
--- File: /home/sk/Desktop/chat-with-pdf/.env --- | |
# Never commit real credentials. The key previously published here must be revoked and rotated.
OPENAI_API_KEY=<your-openai-api-key>
--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/app.py --- | |
import streamlit as st | |
import os | |
from utils.pdf_utils import PDFProcessor | |
from utils.embeddings_utils import EmbeddingsManager | |
from utils.qa_utils import QASystem | |
from dotenv import load_dotenv | |
import openai | |
import time | |
def initialize_session_state():
    """Seed ``st.session_state`` with every key the app reads.

    Keys that already exist are left untouched, so values survive
    Streamlit reruns within the same session.
    """
    # Factories rather than shared instances: each session gets its own
    # fresh set/list instead of one mutable object reused across sessions.
    default_factories = {
        'pdf_processor': lambda: None,
        'embeddings_manager': lambda: None,
        'qa_system': lambda: None,
        'processed_pdfs': set,
        'all_text_chunks': list,
    }
    for state_key, make_default in default_factories.items():
        if state_key not in st.session_state:
            st.session_state[state_key] = make_default()
def _extract_file_chunks(pdf_file):
    """Return the text chunks of one uploaded PDF.

    Chunks are collected in the order ``extract_text`` yields its pages.
    Any exception raised by the PDF processor propagates to the caller.
    """
    processor = st.session_state['pdf_processor']
    file_chunks = []
    for page_text in processor.extract_text(pdf_file).values():
        if page_text:  # pages without extractable text contribute nothing
            file_chunks.extend(processor.chunk_text(page_text))
    return file_chunks


def _ingest_new_files(new_files):
    """Chunk newly uploaded PDFs and rebuild the embeddings index.

    Session state is only updated once a step has fully succeeded:
    a PDF that fails to parse contributes no partial chunks, and nothing
    is recorded at all when embedding generation fails (so the same
    upload is retried on the next rerun instead of leaving a stale index).

    Returns:
        False when embedding generation failed, True otherwise.
    """
    new_chunks = []
    ingested_names = []
    with st.spinner("Processing PDFs..."):
        for pdf_file in new_files:
            try:
                file_chunks = _extract_file_chunks(pdf_file)
            except Exception as e:
                st.error(f"Error processing {pdf_file.name}: {str(e)}")
                continue
            new_chunks.extend(file_chunks)
            ingested_names.append(pdf_file.name)

    if new_chunks:
        all_chunks = st.session_state['all_text_chunks'] + new_chunks
        with st.spinner("Generating embeddings..."):
            try:
                st.session_state['embeddings_manager'].generate_embeddings(all_chunks)
            except Exception as e:
                st.error(f"Error generating embeddings: {str(e)}")
                return False
        st.session_state['all_text_chunks'] = all_chunks
        st.success("β Documents processed!")
    elif ingested_names:
        # Nothing to embed (e.g. image-only PDFs); skip the API call entirely.
        st.warning("No extractable text was found in the uploaded PDFs.")

    st.session_state['processed_pdfs'].update(ingested_names)
    return True


def _answer_question(question):
    """Retrieve the most relevant chunks for ``question`` and render the answer."""
    try:
        with st.spinner("Searching for relevant information..."):
            relevant_chunks = st.session_state['embeddings_manager'].find_relevant_chunks(
                question,
                k=3
            )
            answer = st.session_state['qa_system'].generate_answer(
                question,
                relevant_chunks
            )
        st.markdown("### π€ Answer:")
        st.write(answer)
        with st.expander("π View Source Context"):
            for i, chunk in enumerate(relevant_chunks, 1):
                st.markdown(f"**Context {i}:**")
                st.write(chunk)
                st.markdown("---")
    # openai>=1.0 (required by the `openai.chat.completions` / `openai.embeddings`
    # calls used elsewhere in this project) exposes the error classes at package
    # level; the old `openai.error` module no longer exists there.
    except openai.RateLimitError:
        st.error("Rate limit exceeded. Please try again later.")
    except Exception as e:
        st.error(f"Error: {str(e)}")


def main():
    """Run the Streamlit "Chat with PDF" page.

    Loads the OpenAI key from the environment, lazily creates the helper
    objects kept in ``st.session_state``, ingests newly uploaded PDFs and
    answers questions against the chunks collected so far.
    """
    # NOTE(review): several UI labels below begin with characters that look
    # like mis-decoded emoji (e.g. "π", "β"). They are kept exactly as found;
    # restore the intended glyphs from the original file.
    load_dotenv()
    st.set_page_config(page_title="Chat with PDF", layout="wide")
    st.title("ππ¬ Chat with PDF")
    initialize_session_state()

    with st.sidebar:
        st.header("π How to Use")
        st.markdown(
            "\n"
            "1. Upload PDF document(s)\n"
            "2. Ask questions about the content\n"
            "3. View answers and relevant context\n"
        )
        if 'total_tokens_used' in st.session_state:
            st.markdown("---")
            st.markdown("### π Usage Statistics")
            st.markdown(f"Total tokens used: {st.session_state.get('total_tokens_used', 0)}")

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        st.error("OpenAI API key not found in .env file!")
        return
    openai.api_key = api_key

    # Helpers are created once per session and reused across reruns.
    if not st.session_state['pdf_processor']:
        st.session_state['pdf_processor'] = PDFProcessor()
    if not st.session_state['embeddings_manager']:
        st.session_state['embeddings_manager'] = EmbeddingsManager(api_key)
    if not st.session_state['qa_system']:
        st.session_state['qa_system'] = QASystem(api_key)

    st.subheader("π€ Upload PDFs")
    uploaded_files = st.file_uploader(
        "Upload PDF documents",
        type=['pdf'],
        accept_multiple_files=True
    )
    if uploaded_files:
        # Files are de-duplicated by name only.
        new_files = [f for f in uploaded_files if f.name not in st.session_state['processed_pdfs']]
        if new_files and not _ingest_new_files(new_files):
            return

    if st.session_state['all_text_chunks']:
        st.write("---")
        st.subheader("β Ask Questions About Your Documents")
        question = st.text_input("Enter your question:")
        if question:
            _answer_question(question)
# Script entry point: only start the app when this file is executed directly.
if __name__ == "__main__":
    main()
--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/requirements.txt --- | |
streamlit | |
PyPDF2 | |
openai | |
python-dotenv | |
faiss-cpu | |
numpy | |
pdf2image | |
Pillow | |
--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/.gitattributes --- | |
*.7z filter=lfs diff=lfs merge=lfs -text | |
*.arrow filter=lfs diff=lfs merge=lfs -text | |
*.bin filter=lfs diff=lfs merge=lfs -text | |
*.bz2 filter=lfs diff=lfs merge=lfs -text | |
*.ckpt filter=lfs diff=lfs merge=lfs -text | |
*.ftz filter=lfs diff=lfs merge=lfs -text | |
*.gz filter=lfs diff=lfs merge=lfs -text | |
*.h5 filter=lfs diff=lfs merge=lfs -text | |
*.joblib filter=lfs diff=lfs merge=lfs -text | |
*.lfs.* filter=lfs diff=lfs merge=lfs -text | |
*.mlmodel filter=lfs diff=lfs merge=lfs -text | |
*.model filter=lfs diff=lfs merge=lfs -text | |
*.msgpack filter=lfs diff=lfs merge=lfs -text | |
*.npy filter=lfs diff=lfs merge=lfs -text | |
*.npz filter=lfs diff=lfs merge=lfs -text | |
*.onnx filter=lfs diff=lfs merge=lfs -text | |
*.ot filter=lfs diff=lfs merge=lfs -text | |
*.parquet filter=lfs diff=lfs merge=lfs -text | |
*.pb filter=lfs diff=lfs merge=lfs -text | |
*.pickle filter=lfs diff=lfs merge=lfs -text | |
*.pkl filter=lfs diff=lfs merge=lfs -text | |
*.pt filter=lfs diff=lfs merge=lfs -text | |
*.pth filter=lfs diff=lfs merge=lfs -text | |
*.rar filter=lfs diff=lfs merge=lfs -text | |
*.safetensors filter=lfs diff=lfs merge=lfs -text | |
saved_model/**/* filter=lfs diff=lfs merge=lfs -text | |
*.tar.* filter=lfs diff=lfs merge=lfs -text | |
*.tar filter=lfs diff=lfs merge=lfs -text | |
*.tflite filter=lfs diff=lfs merge=lfs -text | |
*.tgz filter=lfs diff=lfs merge=lfs -text | |
*.wasm filter=lfs diff=lfs merge=lfs -text | |
*.xz filter=lfs diff=lfs merge=lfs -text | |
*.zip filter=lfs diff=lfs merge=lfs -text | |
*.zst filter=lfs diff=lfs merge=lfs -text | |
*tfevents* filter=lfs diff=lfs merge=lfs -text | |
--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/.env --- | |
# Never commit real credentials. The key previously published here must be revoked and rotated.
OPENAI_API_KEY=<your-openai-api-key>
--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/qa_utils.py --- | |
import openai | |
from typing import List | |
class QASystem:
    """Answer questions from retrieved context passages via OpenAI chat completions."""

    # Reply used when there is nothing to answer from; identical to the
    # sentence the prompt instructs the model to produce in that situation.
    NO_ANSWER = "The answer is not in the provided context."

    def __init__(self, api_key: str, model: str = "gpt-4",
                 temperature: float = 0, max_tokens: int = 500):
        """Configure the module-level OpenAI client and the completion settings.

        Args:
            api_key: OpenAI API key, assigned to ``openai.api_key``.
            model: Chat model name passed to the completions call.
            temperature: Sampling temperature passed to the completions call.
            max_tokens: Upper bound on generated tokens per answer.
        """
        openai.api_key = api_key
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens

    @staticmethod
    def _build_prompt(question: str, context: List[str]) -> str:
        """Assemble the user prompt from the question and context passages."""
        return (
            "Based on the context provided below, answer the question.\n"
            'If the answer is not in the context, respond with '
            '"The answer is not in the provided context."\n'
            "Context:\n"
            f"{' '.join(context)}\n"
            f"Question: {question}\n"
        )

    def generate_answer(self, question: str, context: List[str]) -> str:
        """Return the model's answer to ``question`` based on ``context``.

        Args:
            question: The user's question.
            context: Text passages the answer must be grounded in.

        Returns:
            The answer text. When ``context`` holds no non-blank passage the
            fixed "not in the provided context" reply is returned without
            calling the API; an empty string is returned if the model
            produces no text content.
        """
        passages = [passage for passage in context if passage and passage.strip()]
        if not passages:
            return self.NO_ANSWER

        response = openai.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are an assistant answering questions based on the provided context."},
                {"role": "user", "content": self._build_prompt(question, passages)}
            ],
            temperature=self.temperature,
            max_tokens=self.max_tokens
        )
        # `content` is optional in the API response; keep the `-> str` contract.
        return response.choices[0].message.content or ""
--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/embeddings_utils.py --- | |
import openai | |
import numpy as np | |
import faiss | |
from typing import List | |
class EmbeddingsManager:
    """Embed text chunks with the OpenAI API and search them through a FAISS index."""

    EMBEDDING_MODEL = "text-embedding-ada-002"
    BATCH_SIZE = 10  # number of chunks sent per embeddings request

    def __init__(self, api_key: str):
        """Start with no index and no chunks.

        Args:
            api_key: OpenAI API key. It is only stored here; the methods of
                this class call the module-level ``openai`` client, which
                must already be configured.
        """
        self.api_key = api_key
        self.index = None   # FAISS index, or None until embeddings exist
        self.chunks = []    # chunk texts, position-aligned with the index rows

    def generate_embeddings(self, text_chunks: List[str]):
        """Generate embeddings for text chunks using OpenAI API.

        Blank chunks are dropped before the request (the embeddings endpoint
        rejects empty strings). The index and chunk list are replaced together
        and only after every batch succeeded, so a failed request leaves the
        previous state intact. With nothing to embed the manager is reset to
        its empty state instead of raising.

        Args:
            text_chunks: Chunk texts to index; the list itself is not retained.
        """
        chunks = [chunk for chunk in text_chunks if chunk and chunk.strip()]
        if not chunks:
            self.index = None
            self.chunks = []
            return

        embeddings = []
        for start in range(0, len(chunks), self.BATCH_SIZE):
            response = openai.embeddings.create(
                input=chunks[start:start + self.BATCH_SIZE],
                model=self.EMBEDDING_MODEL
            )
            embeddings.extend(item.embedding for item in response.data)

        # Create FAISS index
        embeddings_array = np.array(embeddings).astype('float32')
        index = faiss.IndexFlatL2(embeddings_array.shape[1])
        index.add(embeddings_array)
        self.index = index
        self.chunks = chunks

    def find_relevant_chunks(self, query: str, k: int = 3) -> List[str]:
        """Find most relevant text chunks for a given query.

        Args:
            query: Text to search for.
            k: Maximum number of chunks to return.

        Returns:
            Up to ``k`` chunk texts in the order returned by the index search.
            An empty list when nothing is indexed yet, the query is blank or
            ``k`` is not positive; no API request is made in those cases.
        """
        if self.index is None or not self.chunks or k <= 0:
            return []
        if not query or not query.strip():
            return []

        response = openai.embeddings.create(
            input=[query],
            model=self.EMBEDDING_MODEL
        )
        query_vector = np.array([response.data[0].embedding]).astype('float32')
        _distances, neighbours = self.index.search(
            query_vector,
            min(k, len(self.chunks))
        )
        # The search pads missing neighbours with -1; skip those.
        return [self.chunks[i] for i in neighbours[0] if i != -1]
--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/pdf_utils.py --- | |
import PyPDF2 | |
from typing import List, Dict | |
class PDFProcessor:
    """Extract per-page text from PDFs and split text into size-bounded chunks."""

    def __init__(self):
        # Page index -> text of the most recently extracted PDF.
        self.pages = {}

    def extract_text(self, pdf_file) -> Dict[int, str]:
        """Extract text from PDF and return a dictionary of page numbers and text.

        A fresh dictionary is built on every call so that pages of a
        previously processed (longer) document never leak into the result.

        Args:
            pdf_file: Anything ``PyPDF2.PdfReader`` accepts.

        Returns:
            Mapping of zero-based page index to that page's text; a page
            with no extractable text maps to an empty string.
        """
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        pages = {
            page_num: page.extract_text() or ""
            for page_num, page in enumerate(pdf_reader.pages)
        }
        self.pages = pages
        return pages

    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
        """Split text into chunks of specified size.

        Words are never split; whitespace runs collapse to single spaces.
        Every chunk is at most ``chunk_size`` characters long, except that a
        single word longer than ``chunk_size`` becomes a chunk on its own.

        Args:
            text: Text to split; ``None`` or blank text yields no chunks.
            chunk_size: Maximum chunk length in characters.

        Returns:
            The chunks in reading order; never contains empty strings.
        """
        chunks = []
        current_chunk = []
        current_size = 0  # length of ' '.join(current_chunk)
        for word in (text or "").split():
            # A separating space is only needed when the chunk already has a word.
            added = len(word) + (1 if current_chunk else 0)
            if current_chunk and current_size + added > chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_size = len(word)
            else:
                current_chunk.append(word)
                current_size += added
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks
--- File: /home/sk/Desktop/chat-with-pdf/utils/qa_utils.py --- | |
import openai | |
from typing import List | |
class QASystem:
    """Answer questions from retrieved context passages via OpenAI chat completions."""

    # Reply used when there is nothing to answer from; identical to the
    # sentence the prompt instructs the model to produce in that situation.
    NO_ANSWER = "The answer is not in the provided context."

    def __init__(self, api_key: str, model: str = "gpt-4",
                 temperature: float = 0, max_tokens: int = 500):
        """Configure the module-level OpenAI client and the completion settings.

        Args:
            api_key: OpenAI API key, assigned to ``openai.api_key``.
            model: Chat model name passed to the completions call.
            temperature: Sampling temperature passed to the completions call.
            max_tokens: Upper bound on generated tokens per answer.
        """
        openai.api_key = api_key
        self.model = model
        self.temperature = temperature
        self.max_tokens = max_tokens

    @staticmethod
    def _build_prompt(question: str, context: List[str]) -> str:
        """Assemble the user prompt from the question and context passages."""
        return (
            "Based on the context provided below, answer the question.\n"
            'If the answer is not in the context, respond with '
            '"The answer is not in the provided context."\n'
            "Context:\n"
            f"{' '.join(context)}\n"
            f"Question: {question}\n"
        )

    def generate_answer(self, question: str, context: List[str]) -> str:
        """Return the model's answer to ``question`` based on ``context``.

        Args:
            question: The user's question.
            context: Text passages the answer must be grounded in.

        Returns:
            The answer text. When ``context`` holds no non-blank passage the
            fixed "not in the provided context" reply is returned without
            calling the API; an empty string is returned if the model
            produces no text content.
        """
        passages = [passage for passage in context if passage and passage.strip()]
        if not passages:
            return self.NO_ANSWER

        response = openai.chat.completions.create(
            model=self.model,
            messages=[
                {"role": "system", "content": "You are an assistant answering questions based on the provided context."},
                {"role": "user", "content": self._build_prompt(question, passages)}
            ],
            temperature=self.temperature,
            max_tokens=self.max_tokens
        )
        # `content` is optional in the API response; keep the `-> str` contract.
        return response.choices[0].message.content or ""
--- File: /home/sk/Desktop/chat-with-pdf/utils/embeddings_utils.py --- | |
import openai | |
import numpy as np | |
import faiss | |
from typing import List | |
class EmbeddingsManager:
    """Embed text chunks with the OpenAI API and search them through a FAISS index."""

    EMBEDDING_MODEL = "text-embedding-ada-002"
    BATCH_SIZE = 10  # number of chunks sent per embeddings request

    def __init__(self, api_key: str):
        """Start with no index and no chunks.

        Args:
            api_key: OpenAI API key. It is only stored here; the methods of
                this class call the module-level ``openai`` client, which
                must already be configured.
        """
        self.api_key = api_key
        self.index = None   # FAISS index, or None until embeddings exist
        self.chunks = []    # chunk texts, position-aligned with the index rows

    def generate_embeddings(self, text_chunks: List[str]):
        """Generate embeddings for text chunks using OpenAI API.

        Blank chunks are dropped before the request (the embeddings endpoint
        rejects empty strings). The index and chunk list are replaced together
        and only after every batch succeeded, so a failed request leaves the
        previous state intact. With nothing to embed the manager is reset to
        its empty state instead of raising.

        Args:
            text_chunks: Chunk texts to index; the list itself is not retained.
        """
        chunks = [chunk for chunk in text_chunks if chunk and chunk.strip()]
        if not chunks:
            self.index = None
            self.chunks = []
            return

        embeddings = []
        for start in range(0, len(chunks), self.BATCH_SIZE):
            response = openai.embeddings.create(
                input=chunks[start:start + self.BATCH_SIZE],
                model=self.EMBEDDING_MODEL
            )
            embeddings.extend(item.embedding for item in response.data)

        # Create FAISS index
        embeddings_array = np.array(embeddings).astype('float32')
        index = faiss.IndexFlatL2(embeddings_array.shape[1])
        index.add(embeddings_array)
        self.index = index
        self.chunks = chunks

    def find_relevant_chunks(self, query: str, k: int = 3) -> List[str]:
        """Find most relevant text chunks for a given query.

        Args:
            query: Text to search for.
            k: Maximum number of chunks to return.

        Returns:
            Up to ``k`` chunk texts in the order returned by the index search.
            An empty list when nothing is indexed yet, the query is blank or
            ``k`` is not positive; no API request is made in those cases.
        """
        if self.index is None or not self.chunks or k <= 0:
            return []
        if not query or not query.strip():
            return []

        response = openai.embeddings.create(
            input=[query],
            model=self.EMBEDDING_MODEL
        )
        query_vector = np.array([response.data[0].embedding]).astype('float32')
        _distances, neighbours = self.index.search(
            query_vector,
            min(k, len(self.chunks))
        )
        # The search pads missing neighbours with -1; skip those.
        return [self.chunks[i] for i in neighbours[0] if i != -1]
--- File: /home/sk/Desktop/chat-with-pdf/utils/pdf_utils.py --- | |
import PyPDF2 | |
from typing import List, Dict | |
class PDFProcessor:
    """Extract per-page text from PDFs and split text into size-bounded chunks."""

    def __init__(self):
        # Page index -> text of the most recently extracted PDF.
        self.pages = {}

    def extract_text(self, pdf_file) -> Dict[int, str]:
        """Extract text from PDF and return a dictionary of page numbers and text.

        A fresh dictionary is built on every call so that pages of a
        previously processed (longer) document never leak into the result.

        Args:
            pdf_file: Anything ``PyPDF2.PdfReader`` accepts.

        Returns:
            Mapping of zero-based page index to that page's text; a page
            with no extractable text maps to an empty string.
        """
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        pages = {
            page_num: page.extract_text() or ""
            for page_num, page in enumerate(pdf_reader.pages)
        }
        self.pages = pages
        return pages

    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
        """Split text into chunks of specified size.

        Words are never split; whitespace runs collapse to single spaces.
        Every chunk is at most ``chunk_size`` characters long, except that a
        single word longer than ``chunk_size`` becomes a chunk on its own.

        Args:
            text: Text to split; ``None`` or blank text yields no chunks.
            chunk_size: Maximum chunk length in characters.

        Returns:
            The chunks in reading order; never contains empty strings.
        """
        chunks = []
        current_chunk = []
        current_size = 0  # length of ' '.join(current_chunk)
        for word in (text or "").split():
            # A separating space is only needed when the chunk already has a word.
            added = len(word) + (1 if current_chunk else 0)
            if current_chunk and current_size + added > chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_size = len(word)
            else:
                current_chunk.append(word)
                current_size += added
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks