import gradio as gr
import torch
from transformers import AutoModelForImageTextToText, AutoProcessor, TextIteratorStreamer
from peft import PeftModel
from transformers.image_utils import load_image
from threading import Thread
import time
import html


def progress_bar_html(label: str) -> str:
    """
    Returns an HTML snippet for a thin progress bar with a label.
    The progress bar is styled as a dark animated bar.
    """
    return f'''
<div style="display: flex; align-items: center; gap: 10px;">
    <span style="font-size: 14px;">{label}</span>
    <div style="width: 120px; height: 4px; background-color: #e0e0e0; border-radius: 2px; overflow: hidden;">
        <div style="width: 100%; height: 100%; background-color: #222; animation: loading 1.5s linear infinite;"></div>
    </div>
</div>
<style>
@keyframes loading {{
    0% {{ transform: translateX(-100%); }}
    100% {{ transform: translateX(100%); }}
}}
</style>
'''


model_name = "HuggingFaceTB/SmolVLM2-500M-Video-Instruct"
adapter_name = "xco2/smolvlm2-500M-illustration-description"

# Load the base model, attach the LoRA adapter, then merge it into the base weights for inference.
model = AutoModelForImageTextToText.from_pretrained(model_name)
model = PeftModel.from_pretrained(model, adapter_name)
processor = AutoProcessor.from_pretrained(model_name)
model = model.merge_and_unload().to(torch.float16).eval()
print(f"Successfully loaded the model: {model}")


def model_inference(input_dict, history):
    text = input_dict["text"]
    files = input_dict["files"]

    if len(files) > 1:
        images = [load_image(image) for image in files]
    elif len(files) == 1:
        images = [load_image(files[0])]
    else:
        images = []

    # Validate the query before starting generation.
    if text == "" and not images:
        raise gr.Error("Please input a query and optionally image(s).")
    if text == "" and images:
        raise gr.Error("Please input a text query along with the image(s).")

    # Build a chat-style message with the image(s) followed by the text prompt.
    messages = [
        {
            "role": "user",
            "content": [
                *[{"type": "image", "image": image} for image in images],
                {"type": "text", "text": text},
            ],
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device, dtype=model.dtype)

    # Run generation in a background thread and stream decoded tokens back to the UI.
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    yield progress_bar_html("Processing...")
    for new_text in streamer:
        buffer += html.escape(new_text)
        time.sleep(0.01)
        yield buffer


examples = [
    [{"text": "Write a descriptive caption for this image in a formal tone.", "files": ["example_images/阿能_129888755.jpg"]}],
    [{"text": "What are the characters wearing?", "files": ["example_images/阿能_129888755.jpg"]}],
]

demo = gr.ChatInterface(
    fn=model_inference,
    description="# **Smolvlm2-500M-illustration-description** \n (running on CPU)",
    examples=examples,
    fill_height=True,
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"]),
    stop_btn="Stop Generation",
    multimodal=True,
    cache_examples=False,
)

demo.launch(debug=True)