import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer
import bitsandbytes  # noqa: F401 -- fails fast if the 4-bit backend is missing

# Name of the pre-quantized 4-bit LLaMA checkpoint
model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"


@st.cache_resource  # Load once and reuse across Streamlit reruns instead of reloading every time
def load_model():
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Load the quantized LLaMA model in 4-bit precision
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_4bit=True,   # Enable 4-bit quantization
        device_map="auto",   # Automatically place weights on GPU/CPU
    )
    return tokenizer, model


tokenizer, model = load_model()

# Enable native 2x faster inference (if applicable)
# FastLanguageModel.for_inference(model)  # Uncomment if Unsloth's FastLanguageModel is available for your model

# Streamlit interface
st.title("Keyword Extractor using LLaMA 4-bit Model")

# Text input area for user input
user_input = st.text_area("Enter text for keyword extraction")

if user_input:
    # Prepare the prompt for keyword extraction
    prompt_template = (
        "Extract keywords and variables from the prompt:\n"
        "{}\n"
    )
    prompt = prompt_template.format(user_input)

    # Tokenize the input and move it to the same device as the model
    # (hard-coding "cuda" breaks on CPU-only machines)
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    # TextStreamer prints tokens to the server console as they are generated;
    # the Streamlit page only shows the final decoded text below
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Generate keywords and variables
    with torch.no_grad():
        output = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

    # Decode only the newly generated tokens, skipping the echoed prompt
    prompt_length = inputs["input_ids"].shape[1]
    generated_text = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    # Display the result in the Streamlit app
    st.write("Extracted Keywords and Variables:")
    st.write(generated_text)
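
# --- Optional: explicit quantization config (a sketch, not required) ---
# Newer transformers releases prefer an explicit BitsAndBytesConfig over the
# bare load_in_4bit flag. Assuming a recent transformers version, the model
# above could instead be loaded like this:
#
# from transformers import BitsAndBytesConfig
#
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",             # NormalFloat4 quantization
#     bnb_4bit_compute_dtype=torch.float16,  # dtype used for compute during matmuls
# )
# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     quantization_config=bnb_config,
#     device_map="auto",
# )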
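
# --- Optional: chat-template prompting (a sketch) ---
# Llama 3.2 Instruct models are trained on a chat format, so extraction quality
# may improve if the prompt is built with the tokenizer's chat template (this
# assumes the tokenizer ships one, as the official Llama 3.2 repos do). The
# resulting BatchEncoding feeds into model.generate(**inputs) unchanged:
#
# messages = [
#     {"role": "user",
#      "content": f"Extract keywords and variables from the prompt:\n{user_input}"},
# ]
# inputs = tokenizer.apply_chat_template(
#     messages,
#     add_generation_prompt=True,  # append the assistant turn marker
#     return_dict=True,            # return input_ids plus attention_mask
#     return_tensors="pt",
# ).to(model.device)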