# Hugging Face Space app (runs on ZeroGPU hardware).
import gradio as gr
from datasets import load_dataset
import numpy as np
import torch
import random
import time
import spaces
from transformers import AutoProcessor, AutoModelForImageTextToText, AutoModelForVision2Seq
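# Load the daytime training split of the NuScenes-QA mini dataset (six camera views plus a question/answer pair per sample).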
dataset_vqa = load_dataset(
path="KevinNotSmile/nuscenes-qa-mini",
name="day",
split="train",
data_files="day-train/*.arrow",
)
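# Display names mapped to Hugging Face Hub repo ids for the SmolVLM (v1) and SmolVLM2 checkpoints.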
MODEL_VERSIONS = {
"SmolVLM-256M-Instruct": "HuggingFaceTB/SmolVLM-256M-Instruct",
"SmolVLM-500M-Instruct": "HuggingFaceTB/SmolVLM-500M-Instruct",
"SmolVLM-2.2B-Instruct": "HuggingFaceTB/SmolVLM-Instruct",
"SmolVLM2-256M-Instruct": "HuggingFaceTB/SmolVLM2-256M-Instruct",
"SmolVLM2-500M-Instruct": "HuggingFaceTB/SmolVLM2-500M-Instruct",
"SmolVLM2-2.2B-Instruct": "HuggingFaceTB/SmolVLM2-2.2B-Instruct",
}
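# SmolVLM (v1) checkpoints load with AutoModelForVision2Seq; SmolVLM2 checkpoints use AutoModelForImageTextToText.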
def load_model_and_processor(version):
model_name = MODEL_VERSIONS[version]
if version.startswith("SmolVLM-"):
model = AutoModelForVision2Seq.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
else:
model = AutoModelForImageTextToText.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained(model_name)
return model, processor
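# spaces.GPU requests a ZeroGPU slot for the duration of each call when the Space runs on ZeroGPU hardware.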
@spaces.GPU
def predict(model_version):
sample = random.choice(dataset_vqa)
model, processor = load_model_and_processor(model_version)
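    # Build a chat prompt with six image placeholders, one per camera view, followed by the question.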
messages = [
{
"role": "system",
"content": "You are analyzing real-time camera feed from a self-driving car's multi-camera setup. "
+ "The position of the cameras with respect to the car is: "
+ "CAM_FRONT_LEFT, CAM_FRONT, CAM_FRONT_RIGHT, CAM_BACK_LEFT, CAM_BACK, CAM_BACK_RIGHT. "
+ "Your task is to perform precise visual analysis and answer questions about the scene."
},
{
"role": "user",
"content": [
{"type": "image"},
{"type": "image"},
{"type": "image"},
{"type": "image"},
{"type": "image"},
{"type": "image"},
{"type": "text", "text": f"Answer the following question. {sample['question']}."},
],
},
{
"role": "assistant",
"content": "Answer: "
}
]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
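    # Camera frames in the same order as the image placeholders above.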
images = [
np.array(sample["CAM_FRONT_LEFT"]),
np.array(sample["CAM_FRONT"]),
np.array(sample["CAM_FRONT_RIGHT"]),
np.array(sample["CAM_BACK_LEFT"]),
np.array(sample["CAM_BACK"]),
np.array(sample["CAM_BACK_RIGHT"]),
]
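    # Preprocess the prompt and images, then move the tensors to the model's device in fp16.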
inputs = processor(text=prompt, images=images, return_tensors="pt").to(device=model.device).to(torch.float16)
start = time.time()
generated_ids = model.generate(**inputs, max_new_tokens=1000)
end = time.time()
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
predicted_answer = generated_text.split("Assistant: ")[-1].strip()
expected_answer = sample["answer"].strip()
question = sample["question"].strip()
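    # Score with an exact, case-insensitive string match against the ground-truth answer.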
is_correct = predicted_answer.lower() == expected_answer.lower()
inference_time = round(end - start, 2)
return (
images[0], images[1], images[2], images[3], images[4], images[5],
question, expected_answer, predicted_answer,
"β
Correct" if is_correct else "β Incorrect",
f"{inference_time:.2f} seconds"
)
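# Gradio UI: model selector, a predict button, six camera image panels, and the question/answer/result fields.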
theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="emerald")
with gr.Blocks(theme=theme, title="SmolVLM2 VQA Demo (NuScenes multimodal QA dataset)") as demo:
gr.Markdown("# SmolVLM2 VQA Demo (NuScenes multimodal QA dataset)")
gr.Markdown("This is a demo for the **SmolVLM-SmolVLM2 model family** on the **NuScenes multimodal QA dataset**.")
gr.Markdown("You can select different model versions and predict answers to questions based on the camera feed.")
gr.Markdown("[Check out the SmolVLM2 collection](https://huggingface.co/collections/HuggingFaceTB/smolvlm2-smallest-video-lm-ever-67ab6b5e84bf8aaa60cb17c7)")
gr.Markdown("[Check out the SmolVLM collection](https://huggingface.co/collections/HuggingFaceTB/smolvlm-6740bd584b2dcbf51ecb1f39)")
gr.Markdown("[Check out the NuScenes multimodal QA dataset](https://huggingface.co/datasets/KevinNotSmile/nuscenes-qa-mini)")
model_selector = gr.Dropdown(
choices=list(MODEL_VERSIONS.keys()),
value="2-2.2B",
label="Select Model Version"
)
predict_button = gr.Button("Predict on Random Sample")
with gr.Row():
cam_images_front = [
gr.Image(label=cam) for cam in [
"CAM_FRONT_LEFT", "CAM_FRONT", "CAM_FRONT_RIGHT"
]
]
with gr.Row():
cam_images_back = [
gr.Image(label=cam) for cam in [
"CAM_BACK_LEFT", "CAM_BACK", "CAM_BACK_RIGHT"
]
]
cam_images = cam_images_front + cam_images_back
question_text = gr.Textbox(label="Question")
expected_text = gr.Textbox(label="Expected Answer")
predicted_text = gr.Textbox(label="Predicted Answer")
correctness = gr.Textbox(label="Correct?")
timing = gr.Textbox(label="Inference Time")
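    # On click: run prediction on a random sample and populate every output widget.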
predict_button.click(
fn=predict,
inputs=[model_selector],
outputs=cam_images + [question_text, expected_text, predicted_text, correctness, timing]
)
demo.launch()