"""Gradio app: embed uploaded PDFs with the NASA bi-encoder and store them in Pinecone."""

import os

import gradio as gr
import numpy as np
import torch
from pinecone import AwsRegion, CloudProvider, Pinecone, ServerlessSpec, VectorType
from PyPDF2 import PdfReader
from transformers import AutoModel, AutoTokenizer

MODEL_NAME = "nasa-impact/nasa-smd-ibm-st-v2"
# Must match the vector size produced by MODEL_NAME; used as the index dimension.
EMBEDDING_DIM = 768
# Number of characters per text chunk.
CHUNK_SIZE = 500
# Token limit applied by the tokenizer; longer chunks are truncated.
MAX_TOKENS = 128
# Number of vectors sent per upsert request, so large PDFs do not produce
# one oversized request. NOTE(review): confirm against current Pinecone limits.
UPSERT_BATCH_SIZE = 100

# Load NASA-specific bi-encoder model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Initialize Pinecone client
pinecone_api_key = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

# Create Pinecone index if it doesn't exist
index_name = "scdd-index"
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIM,
        spec=ServerlessSpec(
            cloud=CloudProvider.AWS,
            region=AwsRegion.US_EAST_1,
        ),
        vector_type=VectorType.DENSE,
        metric="cosine",
    )

# Connect to the Pinecone index
index = pc.Index(index_name)


def encode_chunks_batch(chunks, batch_size=8):
    """Encode text chunks into L2-normalized embeddings, batch by batch.

    Args:
        chunks: List of strings to embed.
        batch_size: Number of chunks tokenized and run through the model
            per forward pass.

    Returns:
        A list with one 1-D numpy array per input chunk, in input order.
        An empty ``chunks`` list yields an empty list.
    """
    embeddings = []
    for start in range(0, len(chunks), batch_size):
        batch_chunks = chunks[start:start + batch_size]
        inputs = tokenizer(
            batch_chunks,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=MAX_TOKENS,
        )
        with torch.no_grad():
            output = model(**inputs)

        # Average only over real tokens. With padding=True shorter chunks are
        # padded to the longest one in the batch; a plain mean over dim=1
        # would mix padding positions into the embedding and make the result
        # depend on which other chunks share the batch.
        hidden = output.last_hidden_state
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        batch_embeddings = (hidden * mask).sum(dim=1) / token_counts

        # L2-normalize; normalize() clamps the denominator, so an all-zero
        # vector cannot turn into NaNs.
        batch_embeddings = torch.nn.functional.normalize(batch_embeddings, p=2, dim=1)
        embeddings.extend(batch_embeddings.cpu().numpy())
    return embeddings


def process_pdfs(pdf_files):
    """Extract text from PDFs, embed it in chunks, and upsert to Pinecone.

    Args:
        pdf_files: Uploaded files from the Gradio ``Files`` input. Each item
            may be an object exposing the path as ``.name`` or a plain path
            string. ``None`` is treated as no files.

    Returns:
        A status message for display in the Gradio text output.
    """
    pdf_files = pdf_files or []
    skipped = 0

    for pdf_file in pdf_files:
        # Accept both file wrappers (path in `.name`) and plain path strings.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        reader = PdfReader(pdf_path)

        # Extract each page once; pages with no text contribute nothing.
        page_texts = (page.extract_text() for page in reader.pages)
        pdf_text = "".join(text for text in page_texts if text)

        # Split text into smaller chunks
        chunks = [
            pdf_text[offset:offset + CHUNK_SIZE]
            for offset in range(0, len(pdf_text), CHUNK_SIZE)
        ]
        if not chunks:
            # Nothing extractable (e.g. no text layer); avoid an empty upsert.
            skipped += 1
            continue

        # Generate embeddings in batches
        embeddings = encode_chunks_batch(chunks, batch_size=8)

        # Prepare data for Pinecone: (id, values, metadata) tuples
        file_label = os.path.basename(pdf_path)
        vectors = [
            (f"{file_label}-chunk-{idx}", embedding.tolist(), {"text": chunk})
            for idx, (embedding, chunk) in enumerate(zip(embeddings, chunks))
        ]

        # Upsert embeddings into Pinecone in bounded batches
        for start in range(0, len(vectors), UPSERT_BATCH_SIZE):
            index.upsert(vectors=vectors[start:start + UPSERT_BATCH_SIZE])

    message = (
        f"Processed {len(pdf_files)} PDF(s) successfully and embeddings stored in Pinecone."
    )
    if skipped:
        message += f" Skipped {skipped} file(s) with no extractable text."
    return message


# Gradio Interface
demo = gr.Interface(
    fn=process_pdfs,
    inputs=gr.Files(label="Upload PDFs", file_types=[".pdf"]),
    outputs="text",
    title="NASA Bi-encoder PDF Embedding & Pinecone Storage",
    description="Upload PDF files to generate embeddings with NASA Bi-encoder and store in Pinecone.",
)

demo.launch()