feat: Update Modal app and .gitignore
Files changed:
- .gitignore (+1 -0)
- modal_whisper_app.py (+363 -11)
.gitignore
CHANGED
@@ -5,3 +5,4 @@ __pycache__/
 .env
 *.log
 .DS_Store
+__pycache__/
modal_whisper_app.py
CHANGED
@@ -3,25 +3,87 @@ import os
 import tempfile
 import io
 
+# Environment variable for model name, configurable in Modal UI or via .env
+# This will be used by both the pre-caching function and the runtime function
+WHISPER_MODEL_NAME = os.environ.get("HF_WHISPER_MODEL_NAME", "openai/whisper-large-v3")
+CAPTION_MODEL_NAME = "Neleac/SpaceTimeGPT"
+CAPTION_PROCESSOR_NAME = "MCG-NJU/videomae-base"
+CAPTION_TOKENIZER_NAME = "gpt2" # SpaceTimeGPT uses GPT-2 as decoder
+ACTION_MODEL_NAME = "MCG-NJU/videomae-base-finetuned-kinetics"
+ACTION_PROCESSOR_NAME = "MCG-NJU/videomae-base-finetuned-kinetics" # Often the same as model for VideoMAE
+
+# Initialize a Modal Dict for caching results
+# The key will be a hash of the video URL or video content
+video_analysis_cache = modal.Dict.from_name(
+    "video-analysis-cache", create_if_missing=True
+)
+
+def download_whisper_model():
+    import torch
+    from transformers import pipeline
+    print(f"Downloading and caching Whisper model: {WHISPER_MODEL_NAME}")
+    pipeline(
+        "automatic-speech-recognition",
+        model=WHISPER_MODEL_NAME,
+        torch_dtype=torch.float32,
+        device="cpu"
+    )
+    print(f"Whisper model {WHISPER_MODEL_NAME} cached successfully.")
+
+def download_caption_model():
+    import torch
+    from transformers import VisionEncoderDecoderModel, AutoImageProcessor, AutoTokenizer
+    print(f"Downloading and caching caption model: {CAPTION_MODEL_NAME}")
+    # Download image processor
+    AutoImageProcessor.from_pretrained(CAPTION_PROCESSOR_NAME)
+    print(f"Image processor {CAPTION_PROCESSOR_NAME} cached.")
+    # Download tokenizer
+    AutoTokenizer.from_pretrained(CAPTION_TOKENIZER_NAME)
+    print(f"Tokenizer {CAPTION_TOKENIZER_NAME} cached.")
+    # Download main model
+    VisionEncoderDecoderModel.from_pretrained(CAPTION_MODEL_NAME)
+    print(f"Caption model {CAPTION_MODEL_NAME} cached successfully.")
+
+def download_action_model():
+    from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
+    print(f"Downloading and caching action recognition model: {ACTION_MODEL_NAME}")
+    # Download image processor
+    VideoMAEImageProcessor.from_pretrained(ACTION_PROCESSOR_NAME)
+    print(f"Action model processor {ACTION_PROCESSOR_NAME} cached.")
+    # Download main model
+    VideoMAEForVideoClassification.from_pretrained(ACTION_MODEL_NAME)
+    print(f"Action model {ACTION_MODEL_NAME} cached successfully.")
+
 # Define the Modal image
 whisper_image = (
     modal.Image.debian_slim(python_version="3.10")
     .apt_install("ffmpeg")
-    .run_commands( …
+    .run_commands(
+        "echo 'Force reinstalling moviepy...'",
+        "pip install --force-reinstall moviepy",
+        "echo 'Checking moviepy installation...'",
+        "pip show moviepy || echo 'pip show moviepy failed'",
+        "echo 'Attempting to import moviepy.editor during build:'",
+        "python -c 'import moviepy; print(f\"moviepy module loaded from: {moviepy.__file__}\"); from moviepy.video.io.VideoFileClip import VideoFileClip; print(\"moviepy.video.io.VideoFileClip.VideoFileClip class import successful\")'"
+    ) # Force install moviepy and add diagnostics
     .pip_install(
         "transformers[torch]",
         "accelerate",
         "soundfile",
         "moviepy", # Essential for audio extraction from video
         "huggingface_hub",
-        "ffmpeg-python"
+        "ffmpeg-python",
+        "av", # For video frame extraction
+        "fastapi[standard]" # For web endpoints
     )
+    .run_function(download_whisper_model)
+    .run_function(download_caption_model)
+    .run_function(download_action_model) # This runs download_action_model during image build
 )
 
 app = modal.App(name="whisper-transcriber") # Changed from modal.Stub to modal.App
 
-
-MODEL_NAME = os.environ.get("HF_MODEL_NAME", "openai/whisper-base")
+
 
 # Hugging Face Token - retrieve from memory and set as Modal Secret
 # IMPORTANT: Create a Modal Secret named 'my-huggingface-secret' with your actual HF_TOKEN.
@@ -31,21 +93,22 @@ HF_TOKEN_SECRET = modal.Secret.from_name("my-huggingface-secret")
 @app.function(
     image=whisper_image,
     secrets=[HF_TOKEN_SECRET],
-    timeout=1200
+    timeout=1200,
+    gpu="any" # Request any available GPU
 )
 def transcribe_video_audio(video_bytes: bytes) -> str:
     # Imports moved inside the function to avoid local ModuleNotFoundError during `modal deploy`
-    from moviepy.editor import VideoFileClip
+    from moviepy.video.io.VideoFileClip import VideoFileClip # More specific import for moviepy 2.2.1
     import soundfile as sf
     import torch
-    from transformers import pipeline
+    from transformers import pipeline # This will now use the pre-cached model
     from huggingface_hub import login
 
     if not video_bytes:
         return "Error: No video data received."
 
     # Login to Hugging Face Hub using the token from Modal secrets
-    hf_token = os.environ.get("HF_TOKEN")
+    hf_token = os.environ.get("HF_TOKEN") # Standard key for Hugging Face token in Modal secrets if set as HF_TOKEN=...
     if hf_token:
         try:
             login(token=hf_token)
@@ -55,7 +118,7 @@ def transcribe_video_audio(video_bytes: bytes) -> str:
     else:
         print("HF_TOKEN secret not found. Proceeding without login (works for public models).")
 
-    print(f"Processing video for transcription using model: {MODEL_NAME}")
+    print(f"Processing video for transcription using model: {WHISPER_MODEL_NAME}")
 
     # Initialize pipeline inside the function.
     # For production/frequent use, consider @stub.cls to load the model once per container lifecycle.
@@ -66,7 +129,7 @@ def transcribe_video_audio(video_bytes: bytes) -> str:
 
     transcriber = pipeline(
         "automatic-speech-recognition",
-        model=MODEL_NAME,
+        model=WHISPER_MODEL_NAME,
         torch_dtype=torch_dtype,
         device=device_map,
     )
@@ -97,7 +160,7 @@ def transcribe_video_audio(video_bytes: bytes) -> str:
     print("Starting transcription...")
     # Pass audio as a dictionary for more control, or directly as numpy array
     # Adding chunk_length_s for handling long audio files better.
-    result = transcriber(audio_input.copy(), chunk_length_s=30, batch_size=8, return_timestamps=False)
+    result = transcriber(audio_input.copy(), chunk_length_s=30, batch_size=8, return_timestamps=False, generate_kwargs={"temperature": 0.2, "no_repeat_ngram_size": 3, "language": "en"})
     transcribed_text = result["text"]
 
     print(f"Transcription successful. Length: {len(transcribed_text)}")
@@ -160,3 +223,292 @@ def main():
 
 # Note: When deploying to Modal, Modal uses the `app.serve()` or `app.deploy()` mechanism.
 # The Gradio app will call the deployed Modal function via its HTTP endpoint.
+
+@app.function(
+    image=whisper_image,
+    secrets=[HF_TOKEN_SECRET],
+    timeout=900, # Potentially shorter if model is pre-loaded and efficient
+    gpu="any" # Request any available GPU
+)
+def generate_video_caption(video_bytes: bytes) -> str:
+    import torch
+    import av # PyAV for frame extraction
+    from transformers import VisionEncoderDecoderModel, AutoImageProcessor, AutoTokenizer
+    import tempfile
+    import os
+    import numpy as np
+
+    if not video_bytes:
+        return "Error: No video data received for captioning."
+
+    print(f"Starting video captioning with {CAPTION_MODEL_NAME}...")
+    video_path = None
+    try:
+        # 1. Load pre-cached model, processor, and tokenizer
+        # Ensure these names match what's used in download_caption_model
+        image_processor = AutoImageProcessor.from_pretrained(CAPTION_PROCESSOR_NAME)
+        tokenizer = AutoTokenizer.from_pretrained(CAPTION_TOKENIZER_NAME)
+        model = VisionEncoderDecoderModel.from_pretrained(CAPTION_MODEL_NAME)
+
+        device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        model.to(device)
+        print(f"Caption model loaded on device: {device}")
+
+        # 2. Save video_bytes to a temporary file to be read by PyAV
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video_file:
+            tmp_video_file.write(video_bytes)
+            video_path = tmp_video_file.name
+        print(f"Temporary video file for captioning saved: {video_path}")
+
+        # 3. Frame extraction using PyAV
+        container = av.open(video_path)
+        # Select 8 frames evenly spaced throughout the video
+        # Similar to the SpaceTimeGPT example
+        total_frames = container.streams.video[0].frames
+        num_frames_to_sample = 8
+        indices = np.linspace(0, total_frames - 1, num_frames_to_sample, dtype=int)
+
+        frames = []
+        container.seek(0) # Reset stream to the beginning
+        frame_idx = 0
+        target_idx_ptr = 0
+        for frame in container.decode(video=0):
+            if target_idx_ptr < len(indices) and frame_idx == indices[target_idx_ptr]:
+                frames.append(frame.to_image()) # Convert to PIL Image
+                target_idx_ptr += 1
+            frame_idx += 1
+            if len(frames) == num_frames_to_sample:
+                break
+        container.close()
+
+        if not frames:
+            print("No frames extracted, cannot generate caption.")
+            return "Error: Could not extract frames for captioning."
+        print(f"Extracted {len(frames)} frames for captioning.")
+
+        # 4. Generate caption
+        # The SpaceTimeGPT example doesn't use a specific prompt, it generates from frames directly
+        pixel_values = image_processor(images=frames, return_tensors="pt").pixel_values.to(device)
+        # The model card for Neleac/SpaceTimeGPT uses max_length=128, num_beams=5
+        generated_ids = model.generate(pixel_values, max_length=128, num_beams=5)
+        caption = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+
+        print(f"Generated caption: {caption}")
+        return caption
+
+    except Exception as e:
+        print(f"Error during video captioning: {e}")
+        import traceback
+        traceback.print_exc()
+        return f"Error: Video captioning failed. Details: {str(e)}"
+    finally:
+        if video_path and os.path.exists(video_path):
+            try:
+                os.remove(video_path)
+                print(f"Removed temporary video file for captioning: {video_path}")
+            except Exception as e_rm:
+                print(f"Error removing temporary captioning video file {video_path}: {e_rm}")
+
+@app.function(
+    image=whisper_image,
+    secrets=[HF_TOKEN_SECRET],
+    timeout=1800, # Increased timeout for combined processing
+    gpu="any"
+)
+@modal.concurrent(max_inputs=10) # Replaces allow_concurrent_inputs
+@modal.fastapi_endpoint(method="POST") # Replaces web_endpoint
+async def process_video_context(video_bytes: bytes, video_url: str = None):
+    import json
+    import hashlib
+
+    if not video_bytes:
+        return modal.Response(status_code=400, body=json.dumps({"error": "No video data provided."}))
+
+    # Generate a cache key
+    # If URL is provided, use it. Otherwise, hash the video content (can be slow for large videos).
+    cache_key = ""
+    if video_url:
+        cache_key = hashlib.sha256(video_url.encode()).hexdigest()
+    else:
+        # Hashing large video_bytes can be memory/CPU intensive. Consider alternatives if this is an issue.
+        # For now, let's proceed with hashing bytes if no URL.
+        cache_key = hashlib.sha256(video_bytes).hexdigest()
+
+    print(f"Generated cache key: {cache_key}")
+
+    # Check cache first
+    if cache_key in video_analysis_cache:
+        print(f"Cache hit for key: {cache_key}")
+        cached_result = video_analysis_cache[cache_key]
+        return modal.Response(status_code=200, body=json.dumps(cached_result))
+
+    print(f"Cache miss for key: {cache_key}. Processing video...")
+
+    results = {}
+    error_messages = []
+
+    # Call transcription and captioning in parallel
+    transcription_future = transcribe_video_audio.spawn(video_bytes)
+    caption_call = generate_video_caption.spawn(video_bytes)
+    action_call = generate_action_labels.spawn(video_bytes) # Placeholder for now
+
+    try:
+        transcription_result = await transcription_future
+        if transcription_result.startswith("Error:"):
+            error_messages.append(f"Transcription: {transcription_result}")
+            results["transcription"] = None
+        else:
+            results["transcription"] = transcription_result
+    except Exception as e:
+        print(f"Error in transcription task: {e}")
+        error_messages.append(f"Transcription: Failed with exception - {str(e)}")
+        results["transcription"] = None
+
+    try:
+        caption_result = await caption_call
+        if caption_result.startswith("Error:"):
+            error_messages.append(f"Captioning: {caption_result}")
+            results["video_caption"] = None
+        else:
+            results["video_caption"] = caption_result
+    except Exception as e:
+        print(f"Error in captioning task: {e}")
+        error_messages.append(f"Captioning: Failed with exception - {str(e)}")
+        results["video_caption"] = None
+
+    try:
+        action_result = await action_call # action_result is a dict from generate_action_labels
+        if action_result.get("error"):
+            error_messages.append(f"Action recognition: {action_result.get('error')}")
+            results["action_recognition"] = None
+        else:
+            results["action_recognition"] = action_result.get("actions", "No actions detected or error in result format")
+    except Exception as e:
+        print(f"Error in action recognition task: {e}")
+        import traceback
+        traceback.print_exc()
+        error_messages.append(f"Action recognition: Failed with exception - {str(e)}")
+        results["action_recognition"] = None
+
+    # TODO: Add calls for object detection here in the future
+    results["object_detection"] = "(Object detection/tracking not yet implemented)"
+
+    if error_messages:
+        results["processing_errors"] = error_messages
+        # Store partial results in cache even if there are errors
+        video_analysis_cache[cache_key] = results
+        return modal.Response(status_code=207, body=json.dumps(results)) # 207 Multi-Status
+
+    # Store successful full result in cache
+    video_analysis_cache[cache_key] = results
+    print(f"Successfully processed and cached results for key: {cache_key}")
+    return modal.Response(status_code=200, body=json.dumps(results))
+
+# Update local entrypoint to use the new main processing function if desired for testing
+# For now, keeping it as is to test transcription independently if needed.
+
+@app.function(
+    image=whisper_image,
+    secrets=[HF_TOKEN_SECRET],
+    timeout=700, # Increased timeout slightly for model loading and inference
+    gpu="any" # Requires GPU
+)
+def generate_action_labels(video_bytes: bytes) -> dict:
+    import torch
+    import av
+    import numpy as np
+    import tempfile
+    import os
+    from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
+    from huggingface_hub import login
+
+    if not video_bytes:
+        return {"actions": [], "error": "No video data received."}
+
+    hf_token = os.environ.get("HF_TOKEN")
+    if hf_token:
+        try:
+            login(token=hf_token)
+            print("Action Recognition: Successfully logged into Hugging Face Hub.")
+        except Exception as e:
+            print(f"Action Recognition: Hugging Face Hub login failed: {e}.")
+    else:
+        print("Action Recognition: HF_TOKEN secret not found. Proceeding without login.")
+
+    video_path = None
+    try:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Action Recognition: Loading model on device: {device}")
+
+        processor = VideoMAEImageProcessor.from_pretrained(ACTION_PROCESSOR_NAME)
+        model = VideoMAEForVideoClassification.from_pretrained(ACTION_MODEL_NAME)
+        model.to(device)
+        model.eval()
+        print(f"Action Recognition: Model {ACTION_MODEL_NAME} and processor loaded.")
+
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video_file:
+            tmp_video_file.write(video_bytes)
+            video_path = tmp_video_file.name
+
+        container = av.open(video_path)
+        stream = container.streams.video[0]
+
+        num_frames_to_extract = 16
+        total_frames = stream.frames
+        if total_frames == 0:
+            return {"actions": [], "error": "Video stream has no frames."}
+
+        # Ensure we don't try to select more frames than available, especially for very short videos
+        if total_frames < num_frames_to_extract:
+            print(f"Warning: Video has only {total_frames} frames, less than desired {num_frames_to_extract}. Using all available frames.")
+            num_frames_to_extract = total_frames
+            if num_frames_to_extract == 0: # Double check after adjustment
+                return {"actions": [], "error": "Video stream has no frames after adjustment."}
+
+        indices = np.linspace(0, total_frames - 1, num_frames_to_extract, dtype=int)
+
+        frames = []
+        container.seek(0) # Reset stream to the beginning before decoding specific frames
+        frame_idx_counter = 0
+        target_idx_ptr = 0
+        for frame in container.decode(video=0):
+            if target_idx_ptr < len(indices) and frame_idx_counter == indices[target_idx_ptr]:
+                frames.append(frame.to_image()) # Convert to PIL Image
+                target_idx_ptr += 1
+            frame_idx_counter += 1
+            if target_idx_ptr == len(indices):
+                break
+
+        container.close()
+
+        if not frames:
+            return {"actions": [], "error": "Could not extract frames from video."}
+
+        print(f"Action Recognition: Extracted {len(frames)} frames.")
+
+        # Process frames and predict
+        inputs = processor(frames, return_tensors="pt").to(device)
+
+        with torch.no_grad():
+            outputs = model(**inputs)
+            logits = outputs.logits
+
+        predicted_class_idx = logits.argmax(-1).item()
+        predicted_label = model.config.id2label[predicted_class_idx]
+
+        print(f"Action Recognition: Predicted action: {predicted_label}")
+        return {"actions": [predicted_label], "error": None}
+
+    except Exception as e:
+        print(f"Error during action recognition: {e}")
+        import traceback
+        traceback.print_exc()
+        return {"actions": [], "error": f"Action recognition failed: {str(e)}"}
+    finally:
+        if video_path and os.path.exists(video_path):
+            try:
+                os.remove(video_path)
+                print(f"Removed temporary video file for action recognition: {video_path}")
+            except Exception as e_rm:
+                print(f"Error removing temporary action recognition video file {video_path}: {e_rm}")
+
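
The note in the diff about the Gradio app calling the deployed functions over HTTP suggests a quick way to smoke-test the new functions after `modal deploy modal_whisper_app.py`. Below is a minimal sketch (not part of the commit) that invokes the deployed functions directly through the Modal client; it assumes a recent Modal SDK where `modal.Function.from_name` is available, and `sample.mp4` is a placeholder path for a local test clip.

# Hypothetical local smoke test for the deployed app (not part of this commit).
import modal

# Look up the functions deployed under the "whisper-transcriber" app.
transcribe = modal.Function.from_name("whisper-transcriber", "transcribe_video_audio")
caption = modal.Function.from_name("whisper-transcriber", "generate_video_caption")
actions = modal.Function.from_name("whisper-transcriber", "generate_action_labels")

with open("sample.mp4", "rb") as f:  # placeholder: any small local test clip
    video_bytes = f.read()

print(transcribe.remote(video_bytes))  # transcription text, or an "Error: ..." string
print(caption.remote(video_bytes))     # SpaceTimeGPT caption
print(actions.remote(video_bytes))     # {"actions": [...], "error": None} on success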
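
Because `process_video_context` memoizes results in the `video-analysis-cache` Dict, it can be handy to check from a local script whether a given video has already been analyzed. A small sketch, reusing the same SHA-256 key scheme as the endpoint (the file path is again a placeholder):

# Inspect the shared cache; key derivation mirrors process_video_context:
# sha256 of the URL when one is supplied, otherwise sha256 of the raw bytes.
import hashlib
import modal

cache = modal.Dict.from_name("video-analysis-cache", create_if_missing=True)

def cache_key_for(video_bytes: bytes = None, video_url: str = None) -> str:
    if video_url:
        return hashlib.sha256(video_url.encode()).hexdigest()
    return hashlib.sha256(video_bytes).hexdigest()

with open("sample.mp4", "rb") as f:  # placeholder path
    key = cache_key_for(video_bytes=f.read())

if key in cache:
    print("Cached analysis:", cache[key])
else:
    print("No cached analysis for this video yet.")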
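
Both `generate_video_caption` and `generate_action_labels` pick frames at evenly spaced indices with `np.linspace` before decoding. A tiny standalone illustration of that index selection (the frame count below is arbitrary):

import numpy as np

total_frames = 123        # arbitrary example clip length
num_frames_to_sample = 8  # same default as generate_video_caption
indices = np.linspace(0, total_frames - 1, num_frames_to_sample, dtype=int)
print(indices)  # [  0  17  34  52  69  87 104 122]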