import gradio as gr
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from PyPDF2 import PdfReader
from pinecone import Pinecone, ServerlessSpec, CloudProvider, AwsRegion, VectorType
import os

# Load the NASA-specific bi-encoder. The tokenizer and the encoder are both
# resolved from the same pretrained checkpoint name so they stay in sync.
MODEL_CHECKPOINT = "nasa-impact/nasa-smd-ibm-st-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModel.from_pretrained(MODEL_CHECKPOINT)

# Pinecone client, configured with the key read from the PINECONE_API_KEY
# environment variable (None when the variable is unset).
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key)

# Name of the index that receives the chunk embeddings. It is created only
# when the account does not already list an index with this name.
index_name = "scdd-index"
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(
        cloud=CloudProvider.AWS,
        region=AwsRegion.US_EAST_1,
    )
    # NOTE(review): 768 is presumably the encoder's embedding width -- confirm
    # against the model before changing either one.
    pc.create_index(
        name=index_name,
        dimension=768,
        metric="cosine",
        vector_type=VectorType.DENSE,
        spec=serverless_spec,
    )

# Handle used by the rest of the module to talk to the index
index = pc.Index(index_name)

# Function to encode text using bi-encoder in batches
def encode_chunks_batch(chunks, batch_size=8, max_length=128):
    """Embed text chunks with the module-level bi-encoder, one batch at a time.

    Each batch is tokenized (padded to its longest item and truncated to
    ``max_length`` tokens), passed through ``model`` without gradient
    tracking, mean-pooled over the non-padding tokens, and L2-normalised.

    Args:
        chunks: Sequence of strings to embed.
        batch_size: Number of chunks per forward pass; must be positive.
        max_length: Token limit handed to the tokenizer for truncation.

    Returns:
        A list holding one 1-D NumPy array per input chunk, in input order.
        An empty ``chunks`` yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        # A negative step would make range() empty and silently drop all input.
        raise ValueError("batch_size must be a positive integer")

    embeddings = []
    for start in range(0, len(chunks), batch_size):
        batch_chunks = chunks[start:start + batch_size]
        inputs = tokenizer(
            batch_chunks,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=max_length,
        )
        with torch.no_grad():
            output = model(**inputs)
            hidden = output.last_hidden_state

            # Weight each token by its attention mask so padding positions do
            # not leak into the average; otherwise a chunk's embedding would
            # depend on how long the other chunks in its batch are.
            mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            token_counts = mask.sum(dim=1).clamp(min=1e-9)
            pooled = (hidden * mask).sum(dim=1) / token_counts

            # Clamp the norm so an all-zero vector cannot produce NaNs.
            norms = pooled.norm(dim=1, keepdim=True).clamp(min=1e-12)
            batch_embeddings = pooled / norms
            embeddings.extend(batch_embeddings.cpu().numpy())
    return embeddings

# Number of characters per text chunk sent to the encoder
_CHUNK_SIZE = 500
# Number of vectors sent per upsert call, to keep each request bounded
_UPSERT_BATCH_SIZE = 100


# Function to process PDFs and upsert embeddings to Pinecone
def process_pdfs(pdf_files):
    """Extract text from uploaded PDFs, embed it, and upsert it into Pinecone.

    For every PDF the page texts are concatenated, cut into fixed-size
    character chunks, embedded with ``encode_chunks_batch`` and written to the
    module-level ``index``. Vector ids have the form
    ``"<file basename>-chunk-<n>"`` and the chunk text is stored as metadata
    under the ``"text"`` key.

    Args:
        pdf_files: Iterable of uploaded files, each either an object with a
            ``name`` attribute holding the path or a plain path string.
            ``None`` is treated as "no files".

    Returns:
        A status message stating how many PDFs were processed.
    """
    # The handler may be invoked with no upload at all; treat that as empty.
    pdf_files = pdf_files or []

    for pdf_file in pdf_files:
        # Works for file wrappers exposing ``.name`` as well as bare paths.
        pdf_path = getattr(pdf_file, "name", pdf_file)
        reader = PdfReader(pdf_path)

        # Extract each page exactly once; pages without text are dropped.
        page_texts = (page.extract_text() for page in reader.pages)
        pdf_text = "".join(text for text in page_texts if text)

        # Split text into smaller chunks
        chunks = [
            pdf_text[offset:offset + _CHUNK_SIZE]
            for offset in range(0, len(pdf_text), _CHUNK_SIZE)
        ]
        if not chunks:
            # Nothing extractable in this PDF, so there is nothing to
            # embed or upsert.
            continue

        # Generate embeddings in batches
        embeddings = encode_chunks_batch(chunks, batch_size=8)

        # Prepare data for Pinecone
        file_label = os.path.basename(pdf_path)
        vectors = [
            (f"{file_label}-chunk-{idx}", embedding.tolist(), {"text": chunk})
            for idx, (embedding, chunk) in enumerate(zip(embeddings, chunks))
        ]

        # Upsert in bounded batches so a large PDF does not turn into one
        # oversized request.
        for start in range(0, len(vectors), _UPSERT_BATCH_SIZE):
            index.upsert(vectors[start:start + _UPSERT_BATCH_SIZE])

    return f"Processed {len(pdf_files)} PDF(s) successfully and embeddings stored in Pinecone."

# Gradio Interface: a multi-file PDF upload wired to process_pdfs, with the
# returned status message shown as plain text.
pdf_upload = gr.Files(label="Upload PDFs", file_types=[".pdf"])
demo = gr.Interface(
    title="NASA Bi-encoder PDF Embedding & Pinecone Storage",
    description="Upload PDF files to generate embeddings with NASA Bi-encoder and store in Pinecone.",
    fn=process_pdfs,
    inputs=pdf_upload,
    outputs="text",
)

# Start the web UI as soon as the module is executed
demo.launch()