Sutra_AI / extracted_text.txt
--- File: /home/sk/Desktop/chat-with-pdf/app.py ---
import streamlit as st
import os
from utils.pdf_utils import PDFProcessor
from utils.embeddings_utils import EmbeddingsManager
from utils.qa_utils import QASystem
from dotenv import load_dotenv
import openai


def initialize_session_state():
    if 'pdf_processor' not in st.session_state:
        st.session_state['pdf_processor'] = None
    if 'embeddings_manager' not in st.session_state:
        st.session_state['embeddings_manager'] = None
    if 'qa_system' not in st.session_state:
        st.session_state['qa_system'] = None
    if 'processed_pdfs' not in st.session_state:
        st.session_state['processed_pdfs'] = set()
    if 'all_text_chunks' not in st.session_state:
        st.session_state['all_text_chunks'] = []
def main():
    load_dotenv()
    st.set_page_config(page_title="Chat with PDF", layout="wide")
    st.title("📄💬 Chat with PDF")
    initialize_session_state()

    with st.sidebar:
        st.header("🔍 How to Use")
        st.markdown("""
        1. Upload PDF document(s)
        2. Ask questions about the content
        3. View answers and relevant context
        """)
        if 'total_tokens_used' in st.session_state:
            st.markdown("---")
            st.markdown("### 📊 Usage Statistics")
            st.markdown(f"Total tokens used: {st.session_state.get('total_tokens_used', 0)}")

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        st.error("OpenAI API key not found in .env file!")
        return
    openai.api_key = api_key

    if not st.session_state['pdf_processor']:
        st.session_state['pdf_processor'] = PDFProcessor()
    if not st.session_state['embeddings_manager']:
        st.session_state['embeddings_manager'] = EmbeddingsManager(api_key)
    if not st.session_state['qa_system']:
        st.session_state['qa_system'] = QASystem(api_key)

    st.subheader("📤 Upload PDFs")
    uploaded_files = st.file_uploader(
        "Upload PDF documents",
        type=['pdf'],
        accept_multiple_files=True
    )

    if uploaded_files:
        new_files = [f for f in uploaded_files if f.name not in st.session_state['processed_pdfs']]
        if new_files:
            with st.spinner("Processing PDFs..."):
                for pdf_file in new_files:
                    try:
                        pages = st.session_state['pdf_processor'].extract_text(pdf_file)
                        for page_text in pages.values():
                            chunks = st.session_state['pdf_processor'].chunk_text(page_text)
                            st.session_state['all_text_chunks'].extend(chunks)
                        st.session_state['processed_pdfs'].add(pdf_file.name)
                    except Exception as e:
                        st.error(f"Error processing {pdf_file.name}: {str(e)}")
                        continue
            with st.spinner("Generating embeddings..."):
                try:
                    st.session_state['embeddings_manager'].generate_embeddings(
                        st.session_state['all_text_chunks']
                    )
                    st.success("✅ Documents processed!")
                except Exception as e:
                    st.error(f"Error generating embeddings: {str(e)}")
                    return

    if st.session_state['all_text_chunks']:
        st.write("---")
        st.subheader("❓ Ask Questions About Your Documents")
        question = st.text_input("Enter your question:")
        if question:
            try:
                with st.spinner("Searching for relevant information..."):
                    relevant_chunks = st.session_state['embeddings_manager'].find_relevant_chunks(
                        question,
                        k=3
                    )
                    answer = st.session_state['qa_system'].generate_answer(
                        question,
                        relevant_chunks
                    )
                st.markdown("### 🤖 Answer:")
                st.write(answer)
                with st.expander("🔍 View Source Context"):
                    for i, chunk in enumerate(relevant_chunks, 1):
                        st.markdown(f"**Context {i}:**")
                        st.write(chunk)
                        st.markdown("---")
            except openai.RateLimitError:
                st.error("Rate limit exceeded. Please try again later.")
            except Exception as e:
                st.error(f"Error: {str(e)}")


if __name__ == "__main__":
    main()
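
For reference, a minimal end-to-end sketch of the same pipeline outside Streamlit (not part of the repo; the file name and question are hypothetical, and the utils modules shown later in this dump are assumed to be importable):

import os
from dotenv import load_dotenv
from utils.pdf_utils import PDFProcessor
from utils.embeddings_utils import EmbeddingsManager
from utils.qa_utils import QASystem

load_dotenv()
api_key = os.environ["OPENAI_API_KEY"]

# Extract and chunk one PDF (the path is hypothetical).
processor = PDFProcessor()
chunks = []
with open("example.pdf", "rb") as f:
    for page_text in processor.extract_text(f).values():
        chunks.extend(processor.chunk_text(page_text))

# Embed the chunks, retrieve the closest ones, and answer.
manager = EmbeddingsManager(api_key)
manager.generate_embeddings(chunks)
qa = QASystem(api_key)
question = "What is this document about?"
print(qa.generate_answer(question, manager.find_relevant_chunks(question, k=3)))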
--- File: /home/sk/Desktop/chat-with-pdf/requirements.txt ---
streamlit
PyPDF2
openai
python-dotenv
faiss-cpu
numpy
pdf2image
Pillow
--- File: /home/sk/Desktop/chat-with-pdf/.env ---
OPENAI_API_KEY=sk-proj-REDACTED
--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/app.py ---
(identical to /home/sk/Desktop/chat-with-pdf/app.py above)
--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/requirements.txt ---
(identical to /home/sk/Desktop/chat-with-pdf/requirements.txt above)
--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/.gitattributes ---
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/.env ---
(identical to /home/sk/Desktop/chat-with-pdf/.env above)
--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/qa_utils.py ---
import openai
from typing import List


class QASystem:
    def __init__(self, api_key: str):
        openai.api_key = api_key

    def generate_answer(self, question: str, context: List[str]) -> str:
        prompt = f"""Based on the context provided below, answer the question.
If the answer is not in the context, respond with "The answer is not in the provided context."

Context:
{' '.join(context)}

Question: {question}
"""
        response = openai.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are an assistant answering questions based on the provided context."},
                {"role": "user", "content": prompt}
            ],
            temperature=0,
            max_tokens=500
        )
        return response.choices[0].message.content
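
For reference, a minimal usage sketch of QASystem on its own (not part of the repo; the question and context strings are hypothetical, and OPENAI_API_KEY is assumed to be set in the environment):

import os
from utils.qa_utils import QASystem

# In the app, the context list comes from EmbeddingsManager.find_relevant_chunks().
qa = QASystem(os.environ["OPENAI_API_KEY"])
answer = qa.generate_answer(
    question="What is the refund policy?",
    context=["Refunds are issued within 30 days of purchase."],
)
print(answer)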
--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/embeddings_utils.py ---
import openai
import numpy as np
import faiss
from typing import List


class EmbeddingsManager:
    def __init__(self, api_key: str):
        openai.api_key = api_key
        self.index = None
        self.chunks = []

    def generate_embeddings(self, text_chunks: List[str]):
        """Generate embeddings for text chunks using the OpenAI API."""
        batch_size = 10
        embeddings = []
        for i in range(0, len(text_chunks), batch_size):
            batch = text_chunks[i:i + batch_size]
            response = openai.embeddings.create(
                input=batch,
                model="text-embedding-ada-002"
            )
            # Each item in response.data carries the embedding for one input chunk.
            batch_embeddings = [item.embedding for item in response.data]
            embeddings.extend(batch_embeddings)
        # Build an exact L2-distance FAISS index over all chunk embeddings.
        dimension = len(embeddings[0])
        self.index = faiss.IndexFlatL2(dimension)
        embeddings_array = np.array(embeddings).astype('float32')
        self.index.add(embeddings_array)
        self.chunks = text_chunks

    def find_relevant_chunks(self, query: str, k: int = 3) -> List[str]:
        """Find the most relevant text chunks for a given query."""
        response = openai.embeddings.create(
            input=[query],
            model="text-embedding-ada-002"
        )
        query_embedding = response.data[0].embedding
        # D holds distances, I the indices of the k nearest chunks (-1 pads misses).
        D, I = self.index.search(
            np.array([query_embedding]).astype('float32'),
            k
        )
        return [self.chunks[i] for i in I[0] if i != -1]
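
A minimal retrieval sketch (not part of the repo; the chunks and query are synthetic). Because the index is an exact IndexFlatL2, find_relevant_chunks returns the k stored chunks whose embeddings are nearest to the query embedding in Euclidean distance:

import os
from utils.embeddings_utils import EmbeddingsManager

# Synthetic chunks; in the app these come from PDFProcessor.chunk_text().
manager = EmbeddingsManager(os.environ["OPENAI_API_KEY"])
manager.generate_embeddings([
    "The warranty covers manufacturing defects for two years.",
    "Shipping normally takes three to five business days.",
    "Support is available by email around the clock.",
])
# Prints the two chunks closest to the query in L2 distance.
for chunk in manager.find_relevant_chunks("How long does delivery take?", k=2):
    print(chunk)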
--- File: /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/pdf_utils.py ---
import PyPDF2
from typing import List, Dict


class PDFProcessor:
    def __init__(self):
        self.pages = {}

    def extract_text(self, pdf_file) -> Dict[int, str]:
        """Extract text from a PDF and return a dictionary of page numbers and text."""
        self.pages = {}  # reset so pages from a previously processed PDF are not returned again
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page_num in range(len(pdf_reader.pages)):
            text = pdf_reader.pages[page_num].extract_text()
            self.pages[page_num] = text
        return self.pages

    def chunk_text(self, text: str, chunk_size: int = 1000) -> List[str]:
        """Split text into word-boundary chunks of roughly chunk_size characters."""
        words = text.split()
        chunks = []
        current_chunk = []
        current_size = 0
        for word in words:
            current_size += len(word) + 1  # +1 for the separating space
            if current_size > chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = [word]
                current_size = len(word)
            else:
                current_chunk.append(word)
        if current_chunk:
            chunks.append(' '.join(current_chunk))
        return chunks
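
A small sketch of the chunking behaviour (not part of the repo; the input is synthetic). chunk_text packs whole words greedily and starts a new chunk once the running character count would pass chunk_size, so words are never split and each chunk stays under chunk_size characters unless a single word is itself longer:

from utils.pdf_utils import PDFProcessor

processor = PDFProcessor()
# Synthetic input; in the app the text comes from processor.extract_text(pdf_file).
text = " ".join(f"word{i}" for i in range(300))
for chunk in processor.chunk_text(text, chunk_size=200):
    print(len(chunk), chunk[:40] + "...")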
--- File: /home/sk/Desktop/chat-with-pdf/utils/qa_utils.py ---
(identical to /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/qa_utils.py above)
--- File: /home/sk/Desktop/chat-with-pdf/utils/embeddings_utils.py ---
(identical to /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/embeddings_utils.py above)
--- File: /home/sk/Desktop/chat-with-pdf/utils/pdf_utils.py ---
(identical to /home/sk/Desktop/chat-with-pdf/Chat_with_PDF_Application/utils/pdf_utils.py above)