Spaces:

aquibmoin
/

EM-GEN

Sleeping

App Files Files Community

aquibmoin committed on Mar 5

Commit

a612ef1

verified ·

1 Parent(s): cb841f5

Create app.py

Browse files

Files changed (1) hide show

app.py +122 -0

app.py ADDED Viewed

	@@ -0,0 +1,122 @@

import os

import faiss
import gradio as gr
import numpy as np
import torch
from datasets import load_dataset, Dataset
from huggingface_hub import HfApi, hf_hub_download
from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, AutoModel
# Hub dataset repo that holds the embeddings table, and the name of the
# FAISS index file stored alongside it.
HF_DATASET_NAME = "aquibmoin/SCDD-Embeddings"
INDEX_FILE = "faiss_index.faiss"

# NASA bi-encoder checkpoint used to embed text chunks; the model and its
# tokenizer are loaded once at import time and shared by every request.
bi_encoder_model_name = "nasa-impact/nasa-smd-ibm-st-v2"
bi_model = AutoModel.from_pretrained(bi_encoder_model_name)
bi_tokenizer = AutoTokenizer.from_pretrained(bi_encoder_model_name)

# Hugging Face Hub client used for uploading the index file.
hf_api = HfApi()
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, each page followed by a newline.

    Args:
        pdf_file: Either a filesystem path (``str`` / ``os.PathLike``) or an
            already-open binary file object that can be used as a context
            manager.

    Returns:
        The page texts concatenated in page order, with ``"\\n"`` appended
        after each page. A page with no extractable text contributes only
        the newline.
    """
    # Uploads may arrive as a path or as an open file object. Paths are opened
    # here; file objects are used as given. Either way the handle is closed
    # when the ``with`` block exits.
    if isinstance(pdf_file, (str, os.PathLike)):
        stream = open(pdf_file, "rb")
    else:
        stream = pdf_file
    with stream as f:
        reader = PdfReader(f)
        # Guard against a page yielding None instead of a string, and build
        # the result with a single join rather than repeated ``+=``.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
def get_chunks(text, chunk_size=500):
    """Split *text* into consecutive, non-overlapping character chunks.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk. Must be positive.

    Returns:
        A list of substrings of *text*, in order. Every chunk has exactly
        ``chunk_size`` characters except possibly the last, which holds the
        remainder. An empty *text* yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative. A negative step
            would otherwise produce an empty list and silently drop the text.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
def generate_embedding(text):
    """Embed *text* with the bi-encoder and return an L2-normalized vector.

    The text is tokenized (padded, truncated to at most 512 tokens) and run
    through ``bi_model`` with gradient tracking disabled. The last hidden
    state is averaged over dimension 1, squeezed, converted to a NumPy array
    and divided by its norm.

    Args:
        text: The text to embed.

    Returns:
        A NumPy array holding the normalized embedding.
    """
    encoded = bi_tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    with torch.no_grad():
        hidden_states = bi_model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1).squeeze().numpy()
    norm = np.linalg.norm(pooled)
    return pooled / norm  # Normalize for FAISS
def load_existing_faiss_index():
    """Fetch the FAISS index stored in the Hub dataset repo, or start a new one.

    Returns:
        The index read from ``INDEX_FILE`` in the ``HF_DATASET_NAME`` dataset
        repo when that file exists; otherwise a new, empty
        ``faiss.IndexFlatIP(768)``.

    Raises:
        Exception: Any download or read failure other than "file or repo not
            found" propagates. ``process_pdfs_and_store`` uploads the index it
            receives back to the repo, so treating a transient error as
            "no index" would replace the stored index with a fresh one.
    """
    try:
        index_path = hf_hub_download(repo_id=HF_DATASET_NAME, filename=INDEX_FILE, repo_type="dataset")
    except (EntryNotFoundError, RepositoryNotFoundError):
        print("⚠️ No existing FAISS index found. Creating a new one.")
        # NOTE(review): 768 is presumably the bi-encoder's embedding width — confirm.
        return faiss.IndexFlatIP(768)
    index = faiss.read_index(index_path)
    print("✅ Loaded existing FAISS index.")
    return index
def process_pdfs_and_store(pdf_files):
    """Embed uploaded PDFs and append them to the Hub dataset and FAISS index.

    For each PDF the text is extracted, split into chunks and embedded. The new
    embeddings are added to the FAISS index, which is written to ``INDEX_FILE``
    and uploaded to the ``HF_DATASET_NAME`` dataset repo. The existing
    ``train`` split (if any) is then extended with the new rows and pushed
    back to the same repo.

    Args:
        pdf_files: The uploaded file(s): a list/tuple of files, a single file,
            or ``None``/empty when nothing was uploaded. Each file is whatever
            ``extract_text_from_pdf`` accepts.

    Returns:
        A status message for display in the UI.

    Raises:
        Exception: Errors from loading the index or dataset other than
            "not found" propagate, so that existing remote data is not
            overwritten after a transient failure.
    """
    # Normalize the upload: nothing selected, a single file, or several files.
    if not pdf_files:
        return "⚠️ No PDF files were provided."
    if not isinstance(pdf_files, (list, tuple)):
        pdf_files = [pdf_files]

    new_chunks, new_embeddings, new_files = [], [], []
    for pdf_file in pdf_files:
        # Uploaded file objects expose ``.name``; fall back to the value
        # itself when a plain path was supplied.
        source_name = getattr(pdf_file, "name", None) or str(pdf_file)
        text = extract_text_from_pdf(pdf_file)
        # Whitespace-only chunks carry no content worth indexing.
        chunks = [chunk for chunk in get_chunks(text) if chunk.strip()]
        new_chunks.extend(chunks)
        new_embeddings.extend(generate_embedding(chunk) for chunk in chunks)
        # One source entry per chunk of *this* file keeps all columns aligned.
        new_files.extend([source_name] * len(chunks))

    if not new_chunks:
        # Adding an empty array to the index would fail; nothing to store.
        return "⚠️ No extractable text found in the uploaded PDFs; nothing was stored."

    index = load_existing_faiss_index()

    try:
        dataset = load_dataset(HF_DATASET_NAME, split="train")
    except FileNotFoundError:
        # No dataset (or no data files) on the Hub yet: start from scratch.
        existing_chunks, existing_embeddings, existing_files = [], [], []
    else:
        # Copy the columns into plain lists so they can be concatenated below.
        existing_chunks = list(dataset["chunk_text"])
        existing_embeddings = [np.asarray(emb).tolist() for emb in dataset["embedding"]]
        existing_files = list(dataset["source_file"])

    # FAISS expects a 2-D float32 matrix.
    new_matrix = np.asarray(new_embeddings, dtype="float32")

    # Append new embeddings & chunks to the existing ones
    combined_chunks = existing_chunks + new_chunks
    combined_embeddings = existing_embeddings + new_matrix.tolist()
    combined_files = existing_files + new_files

    # Update FAISS Index
    index.add(new_matrix)

    # Save & Upload Updated FAISS Index
    faiss.write_index(index, INDEX_FILE)
    hf_api.upload_file(path_or_fileobj=INDEX_FILE, path_in_repo=INDEX_FILE, repo_id=HF_DATASET_NAME, repo_type="dataset")

    # Update & Push Dataset
    dataset_dict = {
        "chunk_text": combined_chunks,
        "embedding": combined_embeddings,
        "source_file": combined_files,
    }
    dataset = Dataset.from_dict(dataset_dict)
    dataset.push_to_hub(HF_DATASET_NAME, split="train")
    return f"✅ Successfully updated FAISS index & embeddings in {HF_DATASET_NAME}. Total Chunks: {len(combined_chunks)}."
# Gradio UI: a PDF upload widget, a submit button and a status box wired to
# process_pdfs_and_store.
with gr.Blocks() as demo:
    gr.Markdown("# 🚀 SCDD Embeddings Generator - Hugging Face Spaces")
    gr.Markdown("Upload PDFs to generate and store embeddings in a FAISS vector store on Hugging Face.")
    # ``file_count="multiple"`` is the gr.File option for accepting several
    # files; the handler then receives a list of uploads.
    pdf_input = gr.File(label="Upload PDFs (Up to 3)", file_types=[".pdf"], interactive=True, file_count="multiple")
    submit_button = gr.Button("Generate & Store Embeddings")
    output_text = gr.Textbox(label="Status")
    submit_button.click(
        fn=process_pdfs_and_store,
        inputs=[pdf_input],
        outputs=[output_text]
    )
# Launch Gradio App
# NOTE(review): share=True is reportedly unsupported when running on Hugging Face Spaces — confirm.
demo.launch(share=True)