jomasego committed
Commit 5d99cfb · 1 Parent(s): 0c1f962

feat: Update Modal app and .gitignore

Files changed (2):
  1. .gitignore  +1 -0
  2. modal_whisper_app.py  +363 -11
.gitignore CHANGED
@@ -5,3 +5,4 @@ __pycache__/
 .env
 *.log
 .DS_Store
+__pycache__/
modal_whisper_app.py CHANGED
@@ -3,25 +3,87 @@ import os
 import tempfile
 import io
 
+# Environment variable for model name, configurable in Modal UI or via .env
+# This will be used by both the pre-caching function and the runtime function
+WHISPER_MODEL_NAME = os.environ.get("HF_WHISPER_MODEL_NAME", "openai/whisper-large-v3")
+CAPTION_MODEL_NAME = "Neleac/SpaceTimeGPT"
+CAPTION_PROCESSOR_NAME = "MCG-NJU/videomae-base"
+CAPTION_TOKENIZER_NAME = "gpt2" # SpaceTimeGPT uses GPT-2 as decoder
+ACTION_MODEL_NAME = "MCG-NJU/videomae-base-finetuned-kinetics"
+ACTION_PROCESSOR_NAME = "MCG-NJU/videomae-base-finetuned-kinetics" # Often the same as model for VideoMAE
+
+# Initialize a Modal Dict for caching results
+# The key will be a hash of the video URL or video content
+video_analysis_cache = modal.Dict.from_name(
+    "video-analysis-cache", create_if_missing=True
+)
+
+def download_whisper_model():
+    import torch
+    from transformers import pipeline
+    print(f"Downloading and caching Whisper model: {WHISPER_MODEL_NAME}")
+    pipeline(
+        "automatic-speech-recognition",
+        model=WHISPER_MODEL_NAME,
+        torch_dtype=torch.float32,
+        device="cpu"
+    )
+    print(f"Whisper model {WHISPER_MODEL_NAME} cached successfully.")
+
+def download_caption_model():
+    import torch
+    from transformers import VisionEncoderDecoderModel, AutoImageProcessor, AutoTokenizer
+    print(f"Downloading and caching caption model: {CAPTION_MODEL_NAME}")
+    # Download image processor
+    AutoImageProcessor.from_pretrained(CAPTION_PROCESSOR_NAME)
+    print(f"Image processor {CAPTION_PROCESSOR_NAME} cached.")
+    # Download tokenizer
+    AutoTokenizer.from_pretrained(CAPTION_TOKENIZER_NAME)
+    print(f"Tokenizer {CAPTION_TOKENIZER_NAME} cached.")
+    # Download main model
+    VisionEncoderDecoderModel.from_pretrained(CAPTION_MODEL_NAME)
+    print(f"Caption model {CAPTION_MODEL_NAME} cached successfully.")
+
+def download_action_model():
+    from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
+    print(f"Downloading and caching action recognition model: {ACTION_MODEL_NAME}")
+    # Download image processor
+    VideoMAEImageProcessor.from_pretrained(ACTION_PROCESSOR_NAME)
+    print(f"Action model processor {ACTION_PROCESSOR_NAME} cached.")
+    # Download main model
+    VideoMAEForVideoClassification.from_pretrained(ACTION_MODEL_NAME)
+    print(f"Action model {ACTION_MODEL_NAME} cached successfully.")
+
 # Define the Modal image
 whisper_image = (
     modal.Image.debian_slim(python_version="3.10")
     .apt_install("ffmpeg")
-    .run_commands("pip install moviepy") # Force install moviepy
+    .run_commands(
+        "echo 'Force reinstalling moviepy...'",
+        "pip install --force-reinstall moviepy",
+        "echo 'Checking moviepy installation...'",
+        "pip show moviepy || echo 'pip show moviepy failed'",
+        "echo 'Attempting to import moviepy.editor during build:'",
+        "python -c 'import moviepy; print(f\"moviepy module loaded from: {moviepy.__file__}\"); from moviepy.video.io.VideoFileClip import VideoFileClip; print(\"moviepy.video.io.VideoFileClip.VideoFileClip class import successful\")'"
+    ) # Force install moviepy and add diagnostics
     .pip_install(
         "transformers[torch]",
         "accelerate",
         "soundfile",
         "moviepy", # Essential for audio extraction from video
         "huggingface_hub",
-        "ffmpeg-python"
+        "ffmpeg-python",
+        "av", # For video frame extraction
+        "fastapi[standard]" # For web endpoints
     )
+    .run_function(download_whisper_model)
+    .run_function(download_caption_model)
+    .run_function(download_action_model) # This runs download_action_model during image build
 )
 
 app = modal.App(name="whisper-transcriber") # Changed from modal.Stub to modal.App
 
-# Environment variable for model name, configurable in Modal UI or via .env
-MODEL_NAME = os.environ.get("HF_MODEL_NAME", "openai/whisper-base")
+
 
 # Hugging Face Token - retrieve from memory and set as Modal Secret
 # IMPORTANT: Create a Modal Secret named 'my-huggingface-secret' with your actual HF_TOKEN.
@@ -31,21 +93,22 @@ HF_TOKEN_SECRET = modal.Secret.from_name("my-huggingface-secret")
 @app.function(
     image=whisper_image,
     secrets=[HF_TOKEN_SECRET],
-    timeout=1200
+    timeout=1200,
+    gpu="any" # Request any available GPU
 )
 def transcribe_video_audio(video_bytes: bytes) -> str:
     # Imports moved inside the function to avoid local ModuleNotFoundError during `modal deploy`
-    from moviepy.editor import VideoFileClip
+    from moviepy.video.io.VideoFileClip import VideoFileClip # More specific import for moviepy 2.2.1
     import soundfile as sf
     import torch
-    from transformers import pipeline
+    from transformers import pipeline # This will now use the pre-cached model
     from huggingface_hub import login
 
     if not video_bytes:
         return "Error: No video data received."
 
     # Login to Hugging Face Hub using the token from Modal secrets
-    hf_token = os.environ.get("HF_TOKEN")
+    hf_token = os.environ.get("HF_TOKEN") # Standard key for Hugging Face token in Modal secrets if set as HF_TOKEN=...
     if hf_token:
         try:
             login(token=hf_token)
@@ -55,7 +118,7 @@ def transcribe_video_audio(video_bytes: bytes) -> str:
     else:
         print("HF_TOKEN secret not found. Proceeding without login (works for public models).")
 
-    print(f"Processing video for transcription using model: {MODEL_NAME}")
+    print(f"Processing video for transcription using model: {WHISPER_MODEL_NAME}")
 
     # Initialize pipeline inside the function.
     # For production/frequent use, consider @stub.cls to load the model once per container lifecycle.
@@ -66,7 +129,7 @@ def transcribe_video_audio(video_bytes: bytes) -> str:
 
     transcriber = pipeline(
         "automatic-speech-recognition",
-        model=MODEL_NAME,
+        model=WHISPER_MODEL_NAME,
         torch_dtype=torch_dtype,
         device=device_map,
     )
@@ -97,7 +160,7 @@ def transcribe_video_audio(video_bytes: bytes) -> str:
     print("Starting transcription...")
     # Pass audio as a dictionary for more control, or directly as numpy array
     # Adding chunk_length_s for handling long audio files better.
-    result = transcriber(audio_input.copy(), chunk_length_s=30, batch_size=8, return_timestamps=False)
+    result = transcriber(audio_input.copy(), chunk_length_s=30, batch_size=8, return_timestamps=False, generate_kwargs={"temperature": 0.2, "no_repeat_ngram_size": 3, "language": "en"})
     transcribed_text = result["text"]
 
     print(f"Transcription successful. Length: {len(transcribed_text)}")
@@ -160,3 +223,292 @@ def main():
 
 # Note: When deploying to Modal, Modal uses the `app.serve()` or `app.deploy()` mechanism.
 # The Gradio app will call the deployed Modal function via its HTTP endpoint.
+
+@app.function(
+    image=whisper_image,
+    secrets=[HF_TOKEN_SECRET],
+    timeout=900, # Potentially shorter if model is pre-loaded and efficient
+    gpu="any" # Request any available GPU
+)
+def generate_video_caption(video_bytes: bytes) -> str:
+    import torch
+    import av # PyAV for frame extraction
+    from transformers import VisionEncoderDecoderModel, AutoImageProcessor, AutoTokenizer
+    import tempfile
+    import os
+    import numpy as np
+
+    if not video_bytes:
+        return "Error: No video data received for captioning."
+
+    print(f"Starting video captioning with {CAPTION_MODEL_NAME}...")
+    video_path = None
+    try:
+        # 1. Load pre-cached model, processor, and tokenizer
+        # Ensure these names match what's used in download_caption_model
+        image_processor = AutoImageProcessor.from_pretrained(CAPTION_PROCESSOR_NAME)
+        tokenizer = AutoTokenizer.from_pretrained(CAPTION_TOKENIZER_NAME)
+        model = VisionEncoderDecoderModel.from_pretrained(CAPTION_MODEL_NAME)
+
+        device = "cuda:0" if torch.cuda.is_available() else "cpu"
+        model.to(device)
+        print(f"Caption model loaded on device: {device}")
+
+        # 2. Save video_bytes to a temporary file to be read by PyAV
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video_file:
+            tmp_video_file.write(video_bytes)
+            video_path = tmp_video_file.name
+        print(f"Temporary video file for captioning saved: {video_path}")
+
+        # 3. Frame extraction using PyAV
+        container = av.open(video_path)
+        # Select 8 frames evenly spaced throughout the video
+        # Similar to the SpaceTimeGPT example
+        total_frames = container.streams.video[0].frames
+        num_frames_to_sample = 8
+        indices = np.linspace(0, total_frames - 1, num_frames_to_sample, dtype=int)
+
+        frames = []
+        container.seek(0) # Reset stream to the beginning
+        frame_idx = 0
+        target_idx_ptr = 0
+        for frame in container.decode(video=0):
+            if target_idx_ptr < len(indices) and frame_idx == indices[target_idx_ptr]:
+                frames.append(frame.to_image()) # Convert to PIL Image
+                target_idx_ptr += 1
+            frame_idx += 1
+            if len(frames) == num_frames_to_sample:
+                break
+        container.close()
+
+        if not frames:
+            print("No frames extracted, cannot generate caption.")
+            return "Error: Could not extract frames for captioning."
+        print(f"Extracted {len(frames)} frames for captioning.")
+
+        # 4. Generate caption
+        # The SpaceTimeGPT example doesn't use a specific prompt, it generates from frames directly
+        pixel_values = image_processor(images=frames, return_tensors="pt").pixel_values.to(device)
+        # The model card for Neleac/SpaceTimeGPT uses max_length=128, num_beams=5
+        generated_ids = model.generate(pixel_values, max_length=128, num_beams=5)
+        caption = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+
+        print(f"Generated caption: {caption}")
+        return caption
+
+    except Exception as e:
+        print(f"Error during video captioning: {e}")
+        import traceback
+        traceback.print_exc()
+        return f"Error: Video captioning failed. Details: {str(e)}"
+    finally:
+        if video_path and os.path.exists(video_path):
+            try:
+                os.remove(video_path)
+                print(f"Removed temporary video file for captioning: {video_path}")
+            except Exception as e_rm:
+                print(f"Error removing temporary captioning video file {video_path}: {e_rm}")
+
+@app.function(
+    image=whisper_image,
+    secrets=[HF_TOKEN_SECRET],
+    timeout=1800, # Increased timeout for combined processing
+    gpu="any"
+)
+@modal.concurrent(max_inputs=10) # Replaces allow_concurrent_inputs
+@modal.fastapi_endpoint(method="POST") # Replaces web_endpoint
+async def process_video_context(video_bytes: bytes, video_url: str = None):
+    import json
+    import hashlib
+
+    if not video_bytes:
+        return modal.Response(status_code=400, body=json.dumps({"error": "No video data provided."}))
+
+    # Generate a cache key
+    # If URL is provided, use it. Otherwise, hash the video content (can be slow for large videos).
+    cache_key = ""
+    if video_url:
+        cache_key = hashlib.sha256(video_url.encode()).hexdigest()
+    else:
+        # Hashing large video_bytes can be memory/CPU intensive. Consider alternatives if this is an issue.
+        # For now, let's proceed with hashing bytes if no URL.
+        cache_key = hashlib.sha256(video_bytes).hexdigest()
+
+    print(f"Generated cache key: {cache_key}")
+
+    # Check cache first
+    if cache_key in video_analysis_cache:
+        print(f"Cache hit for key: {cache_key}")
+        cached_result = video_analysis_cache[cache_key]
+        return modal.Response(status_code=200, body=json.dumps(cached_result))
+
+    print(f"Cache miss for key: {cache_key}. Processing video...")
+
+    results = {}
+    error_messages = []
+
+    # Call transcription and captioning in parallel
+    transcription_future = transcribe_video_audio.spawn(video_bytes)
+    caption_call = generate_video_caption.spawn(video_bytes)
+    action_call = generate_action_labels.spawn(video_bytes) # Placeholder for now
+
+    try:
+        transcription_result = await transcription_future
+        if transcription_result.startswith("Error:"):
+            error_messages.append(f"Transcription: {transcription_result}")
+            results["transcription"] = None
+        else:
+            results["transcription"] = transcription_result
+    except Exception as e:
+        print(f"Error in transcription task: {e}")
+        error_messages.append(f"Transcription: Failed with exception - {str(e)}")
+        results["transcription"] = None
+
+    try:
+        caption_result = await caption_call
+        if caption_result.startswith("Error:"):
+            error_messages.append(f"Captioning: {caption_result}")
+            results["video_caption"] = None
+        else:
+            results["video_caption"] = caption_result
+    except Exception as e:
+        print(f"Error in captioning task: {e}")
+        error_messages.append(f"Captioning: Failed with exception - {str(e)}")
+        results["video_caption"] = None
+
+    try:
+        action_result = await action_call # action_result is a dict from generate_action_labels
+        if action_result.get("error"):
+            error_messages.append(f"Action recognition: {action_result.get('error')}")
+            results["action_recognition"] = None
+        else:
+            results["action_recognition"] = action_result.get("actions", "No actions detected or error in result format")
+    except Exception as e:
+        print(f"Error in action recognition task: {e}")
+        import traceback
+        traceback.print_exc()
+        error_messages.append(f"Action recognition: Failed with exception - {str(e)}")
+        results["action_recognition"] = None
+
+    # TODO: Add calls for object detection here in the future
+    results["object_detection"] = "(Object detection/tracking not yet implemented)"
+
+    if error_messages:
+        results["processing_errors"] = error_messages
+        # Store partial results in cache even if there are errors
+        video_analysis_cache[cache_key] = results
+        return modal.Response(status_code=207, body=json.dumps(results)) # 207 Multi-Status
+
+    # Store successful full result in cache
+    video_analysis_cache[cache_key] = results
+    print(f"Successfully processed and cached results for key: {cache_key}")
+    return modal.Response(status_code=200, body=json.dumps(results))
+
+# Update local entrypoint to use the new main processing function if desired for testing
+# For now, keeping it as is to test transcription independently if needed.
+
+@app.function(
+    image=whisper_image,
+    secrets=[HF_TOKEN_SECRET],
+    timeout=700, # Increased timeout slightly for model loading and inference
+    gpu="any" # Requires GPU
+)
+def generate_action_labels(video_bytes: bytes) -> dict:
+    import torch
+    import av
+    import numpy as np
+    import tempfile
+    import os
+    from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
+    from huggingface_hub import login
+
+    if not video_bytes:
+        return {"actions": [], "error": "No video data received."}
+
+    hf_token = os.environ.get("HF_TOKEN")
+    if hf_token:
+        try:
+            login(token=hf_token)
+            print("Action Recognition: Successfully logged into Hugging Face Hub.")
+        except Exception as e:
+            print(f"Action Recognition: Hugging Face Hub login failed: {e}.")
+    else:
+        print("Action Recognition: HF_TOKEN secret not found. Proceeding without login.")
+
+    video_path = None
+    try:
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Action Recognition: Loading model on device: {device}")
+
+        processor = VideoMAEImageProcessor.from_pretrained(ACTION_PROCESSOR_NAME)
+        model = VideoMAEForVideoClassification.from_pretrained(ACTION_MODEL_NAME)
+        model.to(device)
+        model.eval()
+        print(f"Action Recognition: Model {ACTION_MODEL_NAME} and processor loaded.")
+
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_video_file:
+            tmp_video_file.write(video_bytes)
+            video_path = tmp_video_file.name
+
+        container = av.open(video_path)
+        stream = container.streams.video[0]
+
+        num_frames_to_extract = 16
+        total_frames = stream.frames
+        if total_frames == 0:
+            return {"actions": [], "error": "Video stream has no frames."}
+
+        # Ensure we don't try to select more frames than available, especially for very short videos
+        if total_frames < num_frames_to_extract:
+            print(f"Warning: Video has only {total_frames} frames, less than desired {num_frames_to_extract}. Using all available frames.")
+            num_frames_to_extract = total_frames
+            if num_frames_to_extract == 0: # Double check after adjustment
+                return {"actions": [], "error": "Video stream has no frames after adjustment."}
+
+        indices = np.linspace(0, total_frames - 1, num_frames_to_extract, dtype=int)
+
+        frames = []
+        container.seek(0) # Reset stream to the beginning before decoding specific frames
+        frame_idx_counter = 0
+        target_idx_ptr = 0
+        for frame in container.decode(video=0):
+            if target_idx_ptr < len(indices) and frame_idx_counter == indices[target_idx_ptr]:
+                frames.append(frame.to_image()) # Convert to PIL Image
+                target_idx_ptr += 1
+            frame_idx_counter += 1
+            if target_idx_ptr == len(indices):
+                break
+
+        container.close()
+
+        if not frames:
+            return {"actions": [], "error": "Could not extract frames from video."}
+
+        print(f"Action Recognition: Extracted {len(frames)} frames.")
+
+        # Process frames and predict
+        inputs = processor(frames, return_tensors="pt").to(device)
+
+        with torch.no_grad():
+            outputs = model(**inputs)
+            logits = outputs.logits
+
+        predicted_class_idx = logits.argmax(-1).item()
+        predicted_label = model.config.id2label[predicted_class_idx]
+
+        print(f"Action Recognition: Predicted action: {predicted_label}")
+        return {"actions": [predicted_label], "error": None}
+
+    except Exception as e:
+        print(f"Error during action recognition: {e}")
+        import traceback
+        traceback.print_exc()
+        return {"actions": [], "error": f"Action recognition failed: {str(e)}"}
+    finally:
+        if video_path and os.path.exists(video_path):
+            try:
+                os.remove(video_path)
+                print(f"Removed temporary video file for action recognition: {video_path}")
+            except Exception as e_rm:
+                print(f"Error removing temporary action recognition video file {video_path}: {e_rm}")
+
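As a rough illustration of the note in the diff about the Gradio app calling the deployed Modal function, a minimal client-side sketch might look like the following. It is not part of this commit: the app name "whisper-transcriber" and function name "transcribe_video_audio" come from modal_whisper_app.py above, it assumes a recent `modal` client (older clients use modal.Function.lookup instead of from_name), and the local file path is hypothetical.

# Hypothetical client sketch: call the deployed transcription function remotely.
import modal

def request_transcription(video_bytes: bytes) -> str:
    # Look up the function deployed via `modal deploy modal_whisper_app.py`.
    transcribe = modal.Function.from_name("whisper-transcriber", "transcribe_video_audio")
    # .remote() runs the function in Modal's cloud and blocks until the result returns.
    return transcribe.remote(video_bytes)

if __name__ == "__main__":
    with open("sample.mp4", "rb") as f:  # hypothetical local test file
        print(request_transcription(f.read()))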