"""Gradio demo that streams text from a causal language model token by token."""

import os

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Model configuration
MODEL_NAME = "UnfilteredAI/UNfilteredAI-1B"
OFFLOAD_DIR = "./offload"

# Ensure offload directory exists
os.makedirs(OFFLOAD_DIR, exist_ok=True)

# Load tokenizer and model; device_map="auto" lets weights spill over to
# OFFLOAD_DIR when they do not fit on the available device(s).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    offload_folder=OFFLOAD_DIR,
    # Half precision only when a GPU is present; CPU kernels expect float32.
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)
model.eval()

STYLE_PROMPT = """
You are a professional ghostwriter. Your writing is:
- Thick with rich description
- Heavy and raw in tone
- Mature, triggering, and relatable
- Explicit within acceptable content guidelines
- Italicized for internal monologue or emotional emphasis
- Structured and catchy
"""


def stream_generate(prompt, temperature, max_tokens):
    """Generate a continuation for *prompt*, yielding the text as it grows.

    Args:
        prompt: User text appended to ``STYLE_PROMPT``. ``None`` is treated
            as an empty string.
        temperature: Sampling temperature. Values > 0 sample from
            ``softmax(logits / temperature)``; values <= 0 select the most
            likely token (greedy decoding).
        max_tokens: Upper bound on the number of generated tokens. Coerced to
            ``int`` because UI sliders may deliver floats.

    Yields:
        The text generated so far (prompt excluded, surrounding whitespace
        stripped), once per generated token.
    """
    input_text = f"{STYLE_PROMPT}\n\nUser prompt:\n{(prompt or '').strip()}"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)
    prompt_len = input_ids.shape[-1]

    temperature = float(temperature)
    eos_token_id = tokenizer.eos_token_id

    output_ids = input_ids
    # The first forward pass must see the whole prompt so the KV cache is
    # populated; afterwards only the newest token is fed.
    step_input = input_ids
    past_key_values = None

    for _ in range(int(max_tokens)):
        with torch.no_grad():
            outputs = model(
                input_ids=step_input,
                past_key_values=past_key_values,
                use_cache=True,
            )
        past_key_values = outputs.past_key_values
        next_token_logits = outputs.logits[:, -1, :]

        if temperature > 0:
            # argmax is invariant to scaling, so temperature only matters when
            # sampling. Softmax in float32 avoids float16 overflow.
            probs = torch.softmax(next_token_logits.float() / temperature, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
        else:
            next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

        # Keep the running sequence on one device even if the logits came back
        # from a different one.
        next_token = next_token.to(output_ids.device)
        output_ids = torch.cat([output_ids, next_token], dim=-1)
        step_input = next_token

        # Decode only the newly generated tokens: decoding the full sequence
        # and string-replacing the prompt relies on an exact tokenizer
        # round-trip, which is not guaranteed.
        generated = tokenizer.decode(
            output_ids[0, prompt_len:], skip_special_tokens=True
        ).strip()
        yield generated

        hit_eos = eos_token_id is not None and next_token.item() == eos_token_id
        # A token that decodes to a blank line also ends generation.
        if hit_eos or tokenizer.decode(next_token[0]) == "\n\n":
            break


# Gradio interface
with gr.Blocks(title="🧠 HuggingChat Stream Writer") as demo:
    gr.Markdown("## ✍️ Real-Time HuggingChat-Style Generator")
    gr.Markdown("*Watch your story unfold word by word...*")

    with gr.Row():
        prompt = gr.Textbox(
            label="Prompt",
            lines=5,
            placeholder="Describe a rainy night and inner conflict...",
        )
        temperature = gr.Slider(0.5, 1.5, value=0.9, step=0.1, label="Temperature")
        max_tokens = gr.Slider(50, 800, value=300, step=10, label="Max Tokens")

    with gr.Row():
        output = gr.Textbox(label="Generated Output (streaming)", lines=15)

    gr.Button("Generate").click(
        fn=stream_generate,
        inputs=[prompt, temperature, max_tokens],
        outputs=output,
    )

# Generator handlers stream through the request queue; enabling it explicitly
# keeps streaming working on Gradio versions where it is not on by default.
demo.queue()
demo.launch()