import os
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint identifier and the folder handed to the loader for offloaded weights
MODEL_NAME = "UnfilteredAI/UNfilteredAI-1B"
OFFLOAD_DIR = "./offload"

# Create the offload folder up front; an already-existing folder is not an error
os.makedirs(OFFLOAD_DIR, exist_ok=True)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Half precision when CUDA is available, full precision otherwise
if torch.cuda.is_available():
    _weights_dtype = torch.float16
else:
    _weights_dtype = torch.float32

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    offload_folder=OFFLOAD_DIR,
    torch_dtype=_weights_dtype
)
# The model is only used for generation in this script, so put it in eval mode
model.eval()

# Style instructions placed in front of every user prompt by stream_generate.
# The joined text starts and ends with a newline (the empty first and last
# entries), exactly like a triple-quoted block would.
STYLE_PROMPT = "\n".join(
    (
        "",
        "You are a professional ghostwriter. Your writing is:",
        "- Thick with rich description",
        "- Heavy and raw in tone",
        "- Mature, triggering, and relatable",
        "- Explicit within acceptable content guidelines",
        "- Italicized for internal monologue or emotional emphasis",
        "- Structured and catchy",
        "",
    )
)

def stream_generate(prompt, temperature, max_tokens):
    """Generate a completion token by token, yielding the text produced so far.

    The user prompt is appended to ``STYLE_PROMPT`` and encoded with the
    module-level ``tokenizer``; the module-level ``model`` is then stepped one
    token at a time with its key/value cache enabled.

    Args:
        prompt: User text. ``None`` is treated as an empty string.
        temperature: Sampling temperature. Values > 0 sample from the
            temperature-scaled softmax distribution (so output is not
            deterministic); values <= 0 fall back to greedy (argmax) decoding.
        max_tokens: Upper bound on the number of new tokens. Converted with
            ``int()`` because UI sliders may deliver floats.

    Yields:
        str: The decoded, stripped text of all tokens generated so far
        (prompt excluded), once per generated token.
    """
    input_text = f"{STYLE_PROMPT}\n\nUser prompt:\n{(prompt or '').strip()}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
    prompt_length = input_ids.shape[-1]

    max_new_tokens = int(max_tokens)
    temperature = float(temperature)
    eos_token_id = tokenizer.eos_token_id

    output_ids = input_ids
    # The first forward pass must see the whole prompt so the cache holds its
    # context; every later pass only needs the newest token.
    step_input = input_ids
    past_key_values = None

    for _ in range(max_new_tokens):
        # no_grad is scoped to a single step so it is never held across a yield.
        with torch.no_grad():
            outputs = model(
                input_ids=step_input,
                past_key_values=past_key_values,
                use_cache=True,
            )
            next_token_logits = outputs.logits[:, -1, :]
            if temperature > 0:
                # Scale in float32: half-precision softmax/multinomial is
                # numerically fragile and not supported on every backend.
                probs = torch.softmax(next_token_logits.float() / temperature, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

        past_key_values = outputs.past_key_values
        next_token = next_token.to(output_ids.device)
        output_ids = torch.cat([output_ids, next_token], dim=-1)
        step_input = next_token

        # Decode only the newly generated ids; stripping the prompt by string
        # replacement breaks whenever the tokenizer does not round-trip the
        # prompt text exactly.
        generated = tokenizer.decode(output_ids[0, prompt_length:], skip_special_tokens=True)
        yield generated.strip()

        # Stop on end-of-sequence (compared by id) or on a paragraph-break token.
        # next_token holds exactly one id because a single sequence is decoded.
        if next_token.item() == eos_token_id or tokenizer.decode(next_token[0]) == "\n\n":
            break

# Gradio interface: prompt and sampling controls in the first row, the
# generated text in the second.
with gr.Blocks(title="🧠 HuggingChat Stream Writer") as demo:
    gr.Markdown("## ✍️ Real-Time HuggingChat-Style Generator")
    gr.Markdown("*Watch your story unfold word by word...*")

    with gr.Row():
        prompt = gr.Textbox(label="Prompt", lines=5, placeholder="Describe a rainy night and inner conflict...")
        temperature = gr.Slider(0.5, 1.5, value=0.9, step=0.1, label="Temperature")
        max_tokens = gr.Slider(50, 800, value=300, step=10, label="Max Tokens")

    with gr.Row():
        output = gr.Textbox(label="Generated Output (streaming)", lines=15)

    # stream_generate is a generator; each value it yields is sent to `output`.
    gr.Button("Generate").click(fn=stream_generate, inputs=[prompt, temperature, max_tokens], outputs=output)

# Generator handlers are served through Gradio's queue. Gradio 3.x refuses to
# run them unless the queue is enabled explicitly; on releases where the queue
# is on by default this call changes nothing.
demo.queue()
demo.launch()