aquibmoin committed on
Commit
a612ef1
·
verified ·
1 Parent(s): cb841f5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +122 -0
app.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import faiss
3
+ import numpy as np
4
+ import os
5
+ from datasets import load_dataset, Dataset
6
+ from huggingface_hub import HfApi, hf_hub_download
7
+ from PyPDF2 import PdfReader
8
+ from transformers import AutoTokenizer, AutoModel
9
+ import torch
10
+
11
# Hugging Face dataset repo used by this app, and the file name of the
# serialized FAISS index inside it
HF_DATASET_NAME = "aquibmoin/SCDD-Embeddings"
INDEX_FILE = "faiss_index.faiss"

# Hub client used for file uploads
hf_api = HfApi()

# NASA bi-encoder: tokenizer and model are loaded once, at import time
bi_encoder_model_name = "nasa-impact/nasa-smd-ibm-st-v2"
bi_tokenizer = AutoTokenizer.from_pretrained(bi_encoder_model_name)
bi_model = AutoModel.from_pretrained(bi_encoder_model_name)
22
+
23
+ # Function to extract text from a PDF
24
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            open binary file object that can be used as a context manager.

    Returns:
        The text of all pages in order, each page followed by a newline.
        A page that yields no text contributes only its newline.
    """
    def _read_pages(source):
        # Build the result with a single join instead of growing a string
        # page by page; ``or ""`` guards against a page yielding None.
        reader = PdfReader(source)
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    # NOTE(review): depending on the Gradio version, an upload may arrive as
    # a path string rather than an open file object - confirm against the
    # deployed version. A plain string cannot be used in a ``with`` block,
    # so paths are handed to PdfReader directly.
    if isinstance(pdf_file, (str, os.PathLike)):
        return _read_pages(os.fspath(pdf_file))

    # File objects are closed once all pages have been read.
    with pdf_file as stream:
        return _read_pages(stream)
31
+
32
+ # Function to split text into chunks
33
def get_chunks(text, chunk_size=500):
    """Split *text* into consecutive, non-overlapping character chunks.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings covering *text* in order. Every chunk has
        ``chunk_size`` characters except possibly the last one; an empty
        *text* yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A zero step makes range() fail with an unrelated-looking message and a
    # negative step silently produces no chunks, so reject both up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
35
+
36
+ # Function to generate embeddings
37
def generate_embedding(text):
    """Embed *text* with the bi-encoder and scale the vector to unit length.

    Args:
        text: The string to embed; the tokenizer truncates it to 512 tokens.

    Returns:
        A NumPy array holding the mean of the model's ``last_hidden_state``
        over dimension 1, divided by its L2 norm. If the norm is zero the
        unscaled (all-zero) vector is returned instead.
    """
    inputs = bi_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():  # inference only, no gradients required
        outputs = bi_model(**inputs)
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    norm = np.linalg.norm(embedding)
    if norm == 0:
        # Dividing by a zero norm would turn every component into NaN, and
        # that NaN vector would then be written into the index and dataset.
        return embedding
    return embedding / norm  # Normalize for FAISS
43
+
44
+ # Function to load existing FAISS index from HF
45
def load_existing_faiss_index():
    """Fetch the FAISS index stored in the Hub dataset repo, or start a new one.

    Returns:
        The index read from ``INDEX_FILE`` in ``HF_DATASET_NAME`` when both
        the download and the read succeed; otherwise a new, empty
        ``faiss.IndexFlatIP`` with 768 dimensions.
    """
    try:
        index_path = hf_hub_download(repo_id=HF_DATASET_NAME, filename=INDEX_FILE, repo_type="dataset")
        index = faiss.read_index(index_path)
    except Exception as exc:
        # ``Exception`` instead of a bare ``except`` so that KeyboardInterrupt
        # and SystemExit are not swallowed.
        # NOTE(review): every failure ends up here, not only a missing file.
        # A transient network or auth error therefore also yields an empty
        # index - printing the reason makes that case visible in the logs.
        print("⚠️ No existing FAISS index found. Creating a new one.")
        print(f"   Reason: {exc!r}")
        return faiss.IndexFlatIP(768)

    print("✅ Loaded existing FAISS index.")
    return index
54
+
55
+ # Main function to process PDFs & update HF Dataset
56
def process_pdfs_and_store(pdf_files):
    """Chunk and embed uploaded PDFs, then update the index and dataset on the Hub.

    Args:
        pdf_files: One uploaded PDF or a list/tuple of them, as delivered by
            the Gradio file input.

    Returns:
        A status string: a success message with the total number of stored
        chunks, or a warning when there was nothing to process.
    """
    if not pdf_files:
        return "⚠️ No PDF files were provided; nothing to process."
    # A single upload may arrive without a surrounding list.
    if not isinstance(pdf_files, (list, tuple)):
        pdf_files = [pdf_files]

    new_chunks, new_embeddings, new_files = [], [], []
    for pdf_file in pdf_files:
        source_name = getattr(pdf_file, "name", str(pdf_file))
        chunks = get_chunks(extract_text_from_pdf(pdf_file))

        new_chunks.extend(chunks)
        new_embeddings.extend(generate_embedding(chunk) for chunk in chunks)
        # Exactly one source entry per chunk, so the three dataset columns
        # always have the same length no matter how many PDFs are uploaded.
        new_files.extend([source_name] * len(chunks))

    if not new_chunks:
        # Adding an empty array to the index would fail, so stop here
        # before anything is downloaded or uploaded.
        return "⚠️ No text could be extracted from the uploaded PDFs; nothing was stored."

    new_embeddings_np = np.array(new_embeddings, dtype=np.float32)

    index = load_existing_faiss_index()

    try:
        dataset = load_dataset(HF_DATASET_NAME, split="train")
        # list(...) so the columns can be concatenated with the new rows
        existing_chunks = list(dataset["chunk_text"])
        existing_embeddings = [np.array(emb) for emb in dataset["embedding"]]
        existing_files = list(dataset["source_file"])
    except Exception as exc:
        # NOTE(review): any load failure is treated as "no dataset yet", so
        # the push below then replaces the train split with the new rows only.
        print(f"⚠️ Could not load existing dataset, starting from empty: {exc!r}")
        existing_chunks, existing_embeddings, existing_files = [], [], []

    # Append new embeddings & chunks to the existing ones
    combined_chunks = existing_chunks + new_chunks
    combined_embeddings = existing_embeddings + list(new_embeddings_np)
    combined_files = existing_files + new_files

    # Update FAISS Index
    index.add(new_embeddings_np)

    # Save & Upload Updated FAISS Index
    faiss.write_index(index, INDEX_FILE)
    hf_api.upload_file(path_or_fileobj=INDEX_FILE, path_in_repo=INDEX_FILE, repo_id=HF_DATASET_NAME, repo_type="dataset")

    # Update & Push Dataset
    dataset_dict = {
        "chunk_text": combined_chunks,
        "embedding": [np.asarray(emb).tolist() for emb in combined_embeddings],
        "source_file": combined_files,
    }

    dataset = Dataset.from_dict(dataset_dict)
    dataset.push_to_hub(HF_DATASET_NAME, split="train")

    return f"✅ Successfully updated FAISS index & embeddings in {HF_DATASET_NAME}. Total Chunks: {len(combined_chunks)}."
104
+
105
+ # Gradio UI
106
# Build the upload form: a PDF file picker, a submit button and a status box.
with gr.Blocks() as demo:
    gr.Markdown("# 🚀 SCDD Embeddings Generator - Hugging Face Spaces")
    gr.Markdown("Upload PDFs to generate and store embeddings in a FAISS vector store on Hugging Face.")

    # ``file_count="multiple"`` is the gr.File option that enables uploading
    # several files; the handler then receives a list of uploads.
    pdf_input = gr.File(
        label="Upload PDFs (Up to 3)",
        file_types=[".pdf"],
        interactive=True,
        file_count="multiple",
    )
    submit_button = gr.Button("Generate & Store Embeddings")

    output_text = gr.Textbox(label="Status")

    # The handler's returned status string is shown in the textbox.
    submit_button.click(
        fn=process_pdfs_and_store,
        inputs=[pdf_input],
        outputs=[output_text],
    )

# Launch Gradio App
demo.launch(share=True)