import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import bitsandbytes  # noqa: F401 -- fails fast if the 4-bit backend is missing

# Name of the pre-quantized 4-bit LLaMA checkpoint
model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"


@st.cache_resource  # Load once and reuse across Streamlit reruns instead of reloading every time
def load_model():
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Load the quantized LLaMA model in 4-bit precision
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_4bit=True,   # Enable 4-bit quantization
        device_map="auto",   # Automatically place weights on GPU/CPU
    )
    return tokenizer, model


tokenizer, model = load_model()

# Enable native 2x faster inference (if applicable)
# FastLanguageModel.for_inference(model)  # Uncomment if Unsloth's FastLanguageModel is available for your model

# Streamlit interface
st.title("Keyword Extractor using LLaMA 4-bit Model")

# Text input area for user input
user_input = st.text_area("Enter text for keyword extraction")

if user_input:
    # Prepare the prompt for keyword extraction
    prompt_template = (
        "Extract keywords and variables from the prompt:\n"
        "{}\n"
    )
    prompt = prompt_template.format(user_input)

    # Tokenize the input and move it to the same device as the model
    # (hard-coding "cuda" breaks on CPU-only machines)
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    # TextStreamer prints tokens to the server console as they are generated;
    # the Streamlit page only shows the final decoded text below
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Generate keywords and variables
    with torch.no_grad():
        output = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

    # Decode only the newly generated tokens, skipping the echoed prompt
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    # Display the result in the Streamlit app
    st.write("Extracted Keywords and Variables:")
    st.write(generated_text)
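
# --- Optional: explicit quantization config (a sketch, not required) ---
# Newer transformers releases prefer an explicit BitsAndBytesConfig over the
# bare load_in_4bit flag. Assuming a recent transformers version, the model
# above could instead be loaded like this:
#
# from transformers import BitsAndBytesConfig
#
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",             # NormalFloat4 quantization
#     bnb_4bit_compute_dtype=torch.float16,  # dtype used for compute during matmuls
# )
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
#     device_map="auto",
# )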
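
# --- Optional: chat-template prompting (a sketch) ---
# Llama 3.2 Instruct models are trained on a chat format, so extraction quality
# may improve if the prompt is built with the tokenizer's chat template (this
# assumes the tokenizer ships one, as the official Llama 3.2 repos do). The
# resulting BatchEncoding feeds into model.generate(**inputs) unchanged:
#
# messages = [
#     {"role": "user",
#      "content": f"Extract keywords and variables from the prompt:\n{user_input}"},
# ]
# inputs = tokenizer.apply_chat_template(
#     messages,
#     add_generation_prompt=True,  # append the assistant turn marker
#     return_dict=True,            # return input_ids plus attention_mask
#     return_tensors="pt",
# ).to(model.device)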