aquibmoin committed on
Commit
c18941c
·
verified ·
1 Parent(s): b691349

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -2
app.py CHANGED
@@ -56,34 +56,46 @@ def generate_chunk_id(pdf_file, chunk_text, chunk_idx):
56
 
57
  # Function to process PDFs and upsert embeddings to Pinecone
58
  def process_pdfs(pdf_files):
 
59
  for pdf_file in pdf_files:
 
 
 
60
  reader = PdfReader(pdf_file.name)
61
  pdf_text = "".join(page.extract_text() for page in reader.pages if page.extract_text())
 
 
62
 
63
  # Split text into smaller chunks
64
  chunks = [pdf_text[i:i+500] for i in range(0, len(pdf_text), 500)]
65
 
 
 
66
  # Generate embeddings in batches
67
  embeddings = encode_chunks_batch(chunks, batch_size=8)
68
 
 
 
69
  # Prepare data for Pinecone with unique IDs
70
  vectors = [
71
  (generate_chunk_id(pdf_file, chunk, idx), embedding.tolist(), {"text": chunk})
72
  for idx, (embedding, chunk) in enumerate(zip(embeddings, chunks))
73
  ]
74
 
 
 
75
  # Upsert embeddings into Pinecone
76
  index.upsert(vectors)
77
 
78
  # Fetch index stats
79
  stats = index.describe_index_stats()
80
 
81
- return f"Processed {len(pdf_files)} PDF(s) successfully and embeddings stored in Pinecone. Current Index Stats: {stats}"
82
 
83
  # Gradio Interface
84
  demo = gr.Interface(
85
  fn=process_pdfs,
86
- inputs=gr.Files(label="Upload PDFs", file_types=[".pdf"]),
87
  outputs="text",
88
  title="NASA Bi-encoder PDF Embedding & Pinecone Storage",
89
  description="Upload PDF files to generate embeddings with NASA Bi-encoder and store in Pinecone."
 
56
 
57
  # Function to process PDFs and upsert embeddings to Pinecone
58
  def process_pdfs(pdf_files):
59
+
60
  for pdf_file in pdf_files:
61
+
62
+ yield "Reading PDF..."
63
+
64
  reader = PdfReader(pdf_file.name)
65
  pdf_text = "".join(page.extract_text() for page in reader.pages if page.extract_text())
66
+
67
+ yield "Processing PDF..."
68
 
69
  # Split text into smaller chunks
70
  chunks = [pdf_text[i:i+500] for i in range(0, len(pdf_text), 500)]
71
 
72
+ yield "Generating Embeddings..."
73
+
74
  # Generate embeddings in batches
75
  embeddings = encode_chunks_batch(chunks, batch_size=8)
76
 
77
+ yield "Embeddings generated successfully...Preparing..."
78
+
79
  # Prepare data for Pinecone with unique IDs
80
  vectors = [
81
  (generate_chunk_id(pdf_file, chunk, idx), embedding.tolist(), {"text": chunk})
82
  for idx, (embedding, chunk) in enumerate(zip(embeddings, chunks))
83
  ]
84
 
85
+ yield "Pushing to Pinecone...Please wait"
86
+
87
  # Upsert embeddings into Pinecone
88
  index.upsert(vectors)
89
 
90
  # Fetch index stats
91
  stats = index.describe_index_stats()
92
 
93
+ yield f"Processed {len(pdf_files)} PDF(s) successfully and embeddings stored in Pinecone. Current Index Stats: {stats}"
94
 
95
  # Gradio Interface
96
  demo = gr.Interface(
97
  fn=process_pdfs,
98
+ inputs=gr.Files(label="Upload PDF", file_types=[".pdf"]),
99
  outputs="text",
100
  title="NASA Bi-encoder PDF Embedding & Pinecone Storage",
101
  description="Upload PDF files to generate embeddings with NASA Bi-encoder and store in Pinecone."