ariG23498's picture
ariG23498 HF Staff
Update app.py
ff9a584 verified
raw
history blame
2.11 kB
import gradio as gr
import spaces
from torchvision.transforms import v2
from torchcodec.decoders import VideoDecoder
from torchcodec.samplers import clips_at_random_indices
from transformers import VJEPA2VideoProcessor, VJEPA2ForVideoClassification
import torch
import torch.nn.functional as F
import numpy as np
# Load the processor and model once at import time so every request reuses
# the same weights instead of re-downloading / re-initialising them.
MODEL_ID = "ariG23498/vjepa2-vitl-fpc16-256-ssv2-uvf101"
processor = VJEPA2VideoProcessor.from_pretrained(MODEL_ID)
model = VJEPA2ForVideoClassification.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)

# Mapping from class index to human-readable label, as stored in the config.
id2label = model.config.id2label
# Order labels by class index rather than by dict insertion order, so that
# labels[i] names logit i even if the config was not written in index order.
labels = [id2label[class_idx] for class_idx in sorted(id2label)]
@spaces.GPU  # run on a ZeroGPU worker
def classify_video(video_path):
    """Classify a video and return the softmax confidence of every class.

    Args:
        video_path: Path of the video file handed over by the ``gr.Video``
            input. ``None`` (or empty) when the user submitted nothing.

    Returns:
        A dict mapping each class label to its probability as a Python float,
        which is the format ``gr.Label`` accepts.

    Raises:
        gr.Error: If no video was supplied or the video contains no frames.
    """
    if not video_path:
        raise gr.Error("Please upload a video before submitting.")

    # Decode and sample frames.
    decoder = VideoDecoder(video_path)
    num_frames = len(decoder)
    if num_frames == 0:
        raise gr.Error("The uploaded video does not contain any frames.")

    # Every second frame from the start of the video. A more elaborate
    # strategy (e.g. torchcodec.samplers.clips_at_random_indices) could be
    # substituted here.
    frame_idx = np.arange(0, model.config.frames_per_clip, 2)
    # Clamp to the last available frame so short videos repeat their final
    # frame instead of raising on an out-of-range index; the clip length
    # therefore stays the same for every input.
    frame_idx = np.minimum(frame_idx, num_frames - 1)
    clip = decoder.get_frames_at(indices=frame_idx.tolist()).data

    # Preprocess and move the inputs to the same device as the model.
    crop = v2.CenterCrop(
        (processor.crop_size["height"], processor.crop_size["width"])
    )
    inputs = processor(crop(clip), return_tensors="pt").to(model.device)

    # Inference. eval() is idempotent and keeps dropout-style layers disabled.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits[0]

    # Softmax in float32 (stable even if the model runs in half precision),
    # then a single transfer to host memory instead of one sync per class.
    probs = F.softmax(logits.float(), dim=-1).cpu().tolist()

    # Look labels up by class index so each probability is paired with the
    # label of its own logit; fall back to the index if a label is missing.
    return {
        id2label.get(class_idx, str(class_idx)): prob
        for class_idx, prob in enumerate(probs)
    }
# Assemble the Gradio UI: a single video upload feeding classify_video, with
# the returned label -> probability dict rendered by a Label component.
_video_input = gr.Video(label="Upload Video")
_probability_output = gr.Label(label="Class Probabilities")

demo = gr.Interface(
    title="UCF101 Video Classifier",
    description="Upload a video clip to get full softmax confidences over UCF101 classes.",
    fn=classify_video,
    inputs=_video_input,
    outputs=_probability_output,
    examples=["baby_crawling.mp4"],
)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()