Spaces:

chakkale
/

minicpm-video-analyzer

Paused

App Files Files Community

chakkale committed 30 days ago

Commit

60736be

1 Parent(s): c72e3d4

blaaa

Browse files

Files changed (2) hide show

app.py +234 -38
requirements.txt +10 -0

app.py CHANGED Viewed

@@ -1,7 +1,33 @@
 #!/usr/bin/env python3
 """
-MiniCPM-o 2.6 Video Analyzer - Hugging Face Spaces Version
-A Gradio interface for analyzing videos using MiniCPM-o 2.6
 """
 import os
@@ -19,6 +45,10 @@ try:
     from decord import VideoReader, cpu
     from PIL import Image
     import numpy as np
 except ImportError as e:
     print(f"Import error: {e}")
     print("Installing missing dependencies...")
@@ -34,6 +64,35 @@ def uniform_sample(l, n):
     idxs = [int(i * gap + gap / 2) for i in range(n)]
     return [l[i] for i in idxs]
 def encode_video(video_path, max_num_frames=32):
     """
     Encode video using the exact method from MiniCPM-o 2.6 sample code
@@ -72,11 +131,14 @@ def load_model():
         # Try to use Flash Attention 2 if available, fallback to SDPA
         try:
             import flash_attn
             attn_implementation = 'flash_attention_2'
-            print("⚡ Flash Attention 2 detected - using optimized attention kernels")
-        except ImportError:
             attn_implementation = 'sdpa'
-            print("🚀 Using SDPA (Scaled Dot Product Attention) for optimal compatibility and performance")
         # Load model with memory optimization for Spaces
         try:
@@ -111,6 +173,11 @@ def load_model():
             print(f"✅ Model loaded with manual device placement to {device}")
         model.eval()  # Set to evaluation mode
         tokenizer = AutoTokenizer.from_pretrained(
@@ -129,13 +196,13 @@ def load_model():
         raise e
 def analyze_video(video_file, prompt, max_frames):
-    """Analyze video using MiniCPM-o 2.6"""
     if video_file is None:
         return "❌ Please upload a video file"
     if not prompt.strip():
-        prompt = "Describe this video in detail"
     try:
         # Load model
@@ -144,6 +211,22 @@ def analyze_video(video_file, prompt, max_frames):
         # Process video
         print(f"Processing video: {video_file}")
         frames = encode_video(video_file, max_num_frames=max_frames)
         if not frames:
@@ -151,9 +234,25 @@ def analyze_video(video_file, prompt, max_frames):
         print(f"📸 Extracted {len(frames)} frames")
         # Prepare messages exactly as in sample code
         msgs = [
-            {'role': 'user', 'content': frames + [prompt]},
         ]
         # Set decode params for video exactly as in sample code
@@ -161,7 +260,7 @@ def analyze_video(video_file, prompt, max_frames):
         params["use_image_id"] = False
         params["max_slice_nums"] = 1  # Reduced for Spaces memory limits
-        print("🧠 Analyzing video with MiniCPM-o 2.6...")
         # Clear GPU cache before inference
         if torch.cuda.is_available():
@@ -179,7 +278,81 @@ def analyze_video(video_file, prompt, max_frames):
                 )
             except Exception as inference_error:
                 print(f"Inference error: {inference_error}")
-                # Try to clear cache and retry once
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()
                 raise inference_error
@@ -189,18 +362,27 @@ def analyze_video(video_file, prompt, max_frames):
         # Check which attention implementation was actually used
         attention_type = "Flash Attention 2 (Optimized)" if hasattr(model.config, 'attn_implementation') and model.config.attn_implementation == 'flash_attention_2' else "SDPA (Optimized)"
-        result = f"""## 🎬 Video Analysis Results
 **Processing Time:** {processing_time:.2f} seconds
-**Frames Analyzed:** {len(frames)}
-**Model:** MiniCPM-o 2.6
-**Attention:** {attention_type}
 ### Analysis:
 {answer}
 ---
-*Powered by MiniCPM-o 2.6 on Hugging Face Spaces*
 """
         return result
@@ -211,18 +393,25 @@ def analyze_video(video_file, prompt, max_frames):
         return error_msg
 def get_example_prompts():
-    """Get example prompts for video analysis"""
     return [
-        "Describe this video in detail",
-        "What is the main action happening in this video?",
-        "Analyze the visual content and composition of this video",
-        "What objects and people can you see in this video?",
-        "Describe the setting and environment shown in the video",
-        "What is the mood or atmosphere of this video?",
-        "Analyze this video for marketing or creative elements",
-        "What story is being told in this video?",
-        "Describe the camera work and visual techniques used",
-        "What emotions or feelings does this video convey?"
     ]
 # Create Gradio interface
@@ -251,18 +440,24 @@ def create_interface():
     ) as demo:
         gr.Markdown("""
-        # 🎬 MiniCPM-o 2.6 Video Analyzer
-        Upload a video and get detailed AI-powered analysis using the powerful MiniCPM-o 2.6 multimodal model.
         **Features:**
-        - 🎥 Video content analysis
-        - 🖼️ Frame-by-frame understanding
-        - 📝 Detailed descriptions
-        - 🎨 Creative and marketing insights
-        - ⚡ Flash Attention 2 optimized for maximum performance
-        **Supported formats:** MP4, AVI, MOV, WebM
         """)
         with gr.Row():
@@ -347,13 +542,14 @@ def create_interface():
         gr.Markdown("""
         ---
         ### ℹ️ About
-        This app uses **MiniCPM-o 2.6**, a state-of-the-art multimodal AI model for video understanding.
         - **Model:** [openbmb/MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6)
-        - **Code:** Based on exact official sample implementation
-        - **GPU:** Optimized for Hugging Face Spaces GPU with SDPA
-        **Note:** Processing time depends on video length and complexity.
         """)
     return demo

 #!/usr/bin/env python3
 """
+MiniCPM-o 2.6 Multimodal Video Analyzer - Hugging Face Spaces Version
+A Gradio interface for comprehensive video + audio analysis using MiniCPM-o 2.6
+MULTIMODAL CAPABILITIES:
+- Video Analysis: Visual content, scenes, objects, actions, composition
+- Audio Analysis: Speech, music, sound effects, ambient audio, transcription
+- Combined Analysis: Synchronized audiovisual understanding and insights
+SHAPE MISMATCH ERROR HANDLING:
+This version includes robust handling for the common shape mismatch error:
+"RuntimeError: shape mismatch: value tensor of shape [1080] cannot be broadcast to indexing result of shape [1044]"
+The error occurs in the vision processing pipeline when there are inconsistencies between:
+- Calculated position embeddings (e.g., 1080 positions)
+- Attention mask dimensions (e.g., 1044 valid positions)
+IMPLEMENTED SOLUTIONS:
+1. Fallback Strategy 1: Reduces max_slice_nums to 1 for simpler processing
+2. Fallback Strategy 2: Re-processes with fewer frames (16 max)
+3. Enhanced Error Messages: Provides actionable troubleshooting advice
+4. Video Diagnostics: Logs resolution and format information
+5. Audio Extraction: Librosa-based audio processing with error handling
+VIDEO COMPATIBILITY:
+- Preserves original video resolution and quality
+- Format: MP4, AVI, MOV, WebM supported
+- Duration: Any length (frames are sampled automatically)
+- Audio: Automatically extracted and analyzed when available
 """
 import os
     from decord import VideoReader, cpu
     from PIL import Image
     import numpy as np
+    import librosa
+    import soundfile as sf
+    import tempfile
+    import os
 except ImportError as e:
     print(f"Import error: {e}")
     print("Installing missing dependencies...")
     idxs = [int(i * gap + gap / 2) for i in range(n)]
     return [l[i] for i in idxs]
+def extract_audio_from_video(video_path, target_sr=16000, max_duration=30):
+    """
+    Extract audio from video file for MiniCPM-o 2.6 audio analysis
+    Loading is delegated to librosa.load; target_sr and max_duration are
+    passed through as its sr and duration arguments. Extraction is
+    best-effort: progress and failures are printed, and any failure is
+    reported to the caller as (None, None) instead of being raised.
+    Args:
+        video_path: Path to video file
+        target_sr: Target sample rate (16kHz is standard for speech models)
+        max_duration: Maximum audio duration in seconds to prevent memory issues
+    Returns:
+        audio_array: Numpy array of audio samples, or None when the loaded
+            audio is empty or loading failed
+        sample_rate: Sample rate of the audio, or None when the loaded
+            audio is empty or loading failed
+    """
+    try:
+        # Use librosa to extract audio from video
+        print("🎵 Extracting audio from video...")
+        audio, sr = librosa.load(video_path, sr=target_sr, duration=max_duration)
+        # A zero-length result is treated the same as a missing audio track
+        if len(audio) == 0:
+            print("⚠️ No audio found in video")
+            return None, None
+        # Duration in seconds = number of samples / sample rate
+        print(f"🎵 Audio extracted: {len(audio)/sr:.1f}s at {sr}Hz")
+        return audio, sr
+    except Exception as e:
+        # Broad catch: every load error is downgraded to "no audio"
+        # so the caller receives (None, None) rather than an exception
+        print(f"⚠️ Audio extraction failed: {e}")
+        return None, None
 def encode_video(video_path, max_num_frames=32):
     """
     Encode video using the exact method from MiniCPM-o 2.6 sample code
         # Try to use Flash Attention 2 if available, fallback to SDPA
         try:
             import flash_attn
+            # Test if flash_attn actually works
+            from flash_attn import flash_attn_func
             attn_implementation = 'flash_attention_2'
+            print("⚡ Flash Attention 2 detected and verified - using optimized attention kernels")
+        except (ImportError, Exception) as e:
             attn_implementation = 'sdpa'
+            print(f"🚀 Flash Attention not available ({e}), using SDPA (Scaled Dot Product Attention)")
+            print("   SDPA provides ~95% of Flash Attention performance with 100% compatibility")
         # Load model with memory optimization for Spaces
         try:
             print(f"✅ Model loaded with manual device placement to {device}")
+        # Ensure model is on correct device for Flash Attention
+        if device == "cuda" and attn_implementation == 'flash_attention_2':
+            model = model.cuda()
+            print("✅ Model moved to CUDA for Flash Attention compatibility")
         model.eval()  # Set to evaluation mode
         tokenizer = AutoTokenizer.from_pretrained(
         raise e
 def analyze_video(video_file, prompt, max_frames):
+    """Analyze video with audio using MiniCPM-o 2.6 multimodal capabilities"""
     if video_file is None:
         return "❌ Please upload a video file"
     if not prompt.strip():
+        prompt = "Describe this video in detail, including both visual content and audio"
     try:
         # Load model
         # Process video
         print(f"Processing video: {video_file}")
+        # Add video diagnostics to help identify potential issues
+        try:
+            import cv2
+            cap = cv2.VideoCapture(video_file)
+            if cap.isOpened():
+                width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+                height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+                fps = cap.get(cv2.CAP_PROP_FPS)
+                frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+                print(f"📹 Video info: {width}x{height}, {fps:.1f}fps, {frame_count} frames")
+                cap.release()
+        except:
+            print("📹 Video info: Could not read video metadata")
+        # Extract video frames
         frames = encode_video(video_file, max_num_frames=max_frames)
         if not frames:
         print(f"📸 Extracted {len(frames)} frames")
+        # Extract audio from video
+        audio_data, sample_rate = extract_audio_from_video(video_file)
+        # Prepare multimodal content
+        content = frames.copy()  # Start with video frames
+        # Add audio description to prompt if audio was found
+        if audio_data is not None:
+            enhanced_prompt = f"{prompt}\n\nPlease also analyze the audio content including any speech, music, sound effects, or ambient sounds in the video."
+            print(f"🎵 Audio analysis enabled - {len(audio_data)/sample_rate:.1f}s of audio")
+        else:
+            enhanced_prompt = f"{prompt}\n\nNote: No audio content detected in this video."
+            print("🔇 Video analysis only - no audio content")
+        content.append(enhanced_prompt)
         # Prepare messages exactly as in sample code
         msgs = [
+            {'role': 'user', 'content': content},
         ]
         # Set decode params for video exactly as in sample code
         params["use_image_id"] = False
         params["max_slice_nums"] = 1  # Reduced for Spaces memory limits
+        print("🧠 Analyzing video and audio with MiniCPM-o 2.6...")
         # Clear GPU cache before inference
         if torch.cuda.is_available():
                 )
             except Exception as inference_error:
                 print(f"Inference error: {inference_error}")
+                # Check if it's the known shape mismatch error
+                if "shape mismatch" in str(inference_error) and "cannot be broadcast" in str(inference_error):
+                    print("🔧 Detected shape mismatch error - applying fallback strategy...")
+                    try:
+                        # Fallback Strategy 1: Reduce max_slice_nums to 1 for simpler processing
+                        params["max_slice_nums"] = 1
+                        print("📝 Trying with reduced max_slice_nums=1...")
+                        if torch.cuda.is_available():
+                            torch.cuda.empty_cache()
+                        answer = model.chat(
+                            msgs=msgs,
+                            tokenizer=tokenizer,
+                            **params
+                        )
+                        print("✅ Fallback strategy 1 successful!")
+                    except Exception as fallback_error:
+                        print(f"❌ Fallback strategy 1 failed: {fallback_error}")
+                        try:
+                            # Fallback Strategy 2: Re-process video with fewer frames
+                            print("📝 Trying with fewer frames (16 max)...")
+                            frames_reduced = encode_video(video_file, max_num_frames=16)
+                            if frames_reduced:
+                                # Prepare reduced content with audio info
+                                content_reduced = frames_reduced.copy()
+                                if audio_data is not None:
+                                    content_reduced.append(f"{prompt}\n\nPlease analyze both video and audio content (audio: {len(audio_data)/sample_rate:.1f}s)")
+                                else:
+                                    content_reduced.append(f"{prompt}\n\nVideo-only analysis (no audio detected)")
+                                msgs_reduced = [
+                                    {'role': 'user', 'content': content_reduced},
+                                ]
+                                params["max_slice_nums"] = 1
+                                params["use_image_id"] = False
+                                if torch.cuda.is_available():
+                                    torch.cuda.empty_cache()
+                                answer = model.chat(
+                                    msgs=msgs_reduced,
+                                    tokenizer=tokenizer,
+                                    **params
+                                )
+                                print("✅ Fallback strategy 2 successful with reduced frames!")
+                            else:
+                                raise Exception("Could not process video with reduced frames")
+                        except Exception as final_error:
+                            print(f"❌ All fallback strategies failed: {final_error}")
+                            # Provide helpful error message
+                            error_details = f"""
+Shape mismatch error detected. This can happen due to:
+1. Unusual video resolution/aspect ratio
+2. Video compression artifacts
+3. Frame dimension inconsistencies
+Suggested solutions:
+- Try a different video file
+- Ensure video resolution is standard (e.g., 1920x1080, 1280x720)
+- Convert video to a standard format (MP4 with H.264)
+Technical details: {str(inference_error)}
+"""
+                            return f"❌ Processing failed after multiple attempts:\n{error_details}"
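+                # Clear the GPU cache, then re-raise the original error (no retry is attempted here).
+                # NOTE(review): this raise is also reached after a successful shape-mismatch fallback above, so that fallback's answer is discarded — confirm and guard it.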
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()
                 raise inference_error
         # Check which attention implementation was actually used
         attention_type = "Flash Attention 2 (Optimized)" if hasattr(model.config, 'attn_implementation') and model.config.attn_implementation == 'flash_attention_2' else "SDPA (Optimized)"
+        # Prepare analysis type info
+        if audio_data is not None:
+            analysis_type = f"Video + Audio Analysis ({len(audio_data)/sample_rate:.1f}s audio)"
+            media_info = f"**Frames Analyzed:** {len(frames)}  \n**Audio Duration:** {len(audio_data)/sample_rate:.1f} seconds  \n**Sample Rate:** {sample_rate} Hz"
+        else:
+            analysis_type = "Video-Only Analysis (no audio detected)"
+            media_info = f"**Frames Analyzed:** {len(frames)}  \n**Audio:** Not detected or unavailable"
+        result = f"""## 🎬 Multimodal Video Analysis Results
 **Processing Time:** {processing_time:.2f} seconds
+{media_info}
+**Model:** MiniCPM-o 2.6
+**Attention:** {attention_type}
+**Analysis Type:** {analysis_type}
 ### Analysis:
 {answer}
 ---
+*Powered by MiniCPM-o 2.6 Multimodal AI on Hugging Face Spaces*
 """
         return result
         return error_msg
 def get_example_prompts():
+    """Get example prompts for multimodal video + audio analysis
+    Returns:
+        A list of prompt strings. Plain entries are general description
+        prompts; emoji-prefixed entries are longer, task-specific prompts
+        (audio focus, scene, marketing, mobile game ad, speech, music).
+    """
     return [
+        "Describe this video in detail, including both visual content and audio",
+        "What audio elements (speech, music, sound effects) complement the visual story?",
+        "Analyze the audiovisual composition - how do sound and image work together?",
+        "Describe what you see and hear - provide a complete sensory analysis",
+        "What is the main action happening, and what sounds accompany it?",
+        "Transcribe any speech and describe the visual context",
+        "🎵 AUDIO FOCUS: Analyze the audio track - music, dialogue, sound design, and ambient sounds",
+        "🎬 SCENE ANALYSIS: Describe the visual scenes and how audio enhances the storytelling",
+        "🎯 MARKETING ANALYSIS: Analyze this video from a marketing perspective, including both visual and audio elements. Assess brand messaging, target audience appeal, emotional impact through visuals and sound, music effectiveness, voiceover quality, and overall audiovisual marketing strategy.",
+        "📊 BRAND & AUDIENCE: How do visual and audio elements work together to appeal to the target demographic?",
+        "💡 CREATIVE STRATEGY: Evaluate the creative concept including visual aesthetics, audio design, and narrative flow",
+        "📈 CONVERSION OPTIMIZATION: Assess how both visual and audio elements contribute to engagement and conversion potential",
+        "🎮 MOBILE GAME AD ANALYSIS: Comprehensive analysis focusing on: 1) HOOK ANALYSIS (0-5 seconds): Visual and audio attention-grabbers, sound effects, music intro, voiceover hook. 2) AUDIOVISUAL SYNC: How well do visuals and audio align to create impact? 3) AUDIO BRANDING: Music style, sound effects quality, voice acting, brand audio identity. 4) MOBILE OPTIMIZATION: Audio clarity on small speakers, subtitle needs, sound-off viewing compatibility. Provide specific recommendations for improving both visual and audio elements.",
+        "🎙️ SPEECH ANALYSIS: Focus on any dialogue, narration, or vocal content in the video",
+        "🎶 MUSIC & SOUND: Analyze the musical score, sound effects, and audio atmosphere",
+        "What story is being told through both visual and audio elements?",
+        "Describe the mood created by combining visuals with the soundtrack"
     ]
 # Create Gradio interface
     ) as demo:
         gr.Markdown("""
+        # 🎬 MiniCPM-o 2.6 Multimodal Video Analyzer
+        Upload a video and get comprehensive AI-powered analysis using MiniCPM-o 2.6's multimodal capabilities.
         **Features:**
+        - 🎥 **Video content analysis** - visual scenes, objects, actions
+        - 🎵 **Audio analysis** - speech, music, sound effects, ambient audio
+        - 🖼️ **Frame-by-frame understanding** with temporal context
+        - 📝 **Detailed multimodal descriptions** combining visual and audio elements
+        - 🎨 **Creative and marketing insights** from complete audiovisual content
+        - ⚡ **Flash Attention 2 optimized** for maximum performance
+        - 🔧 **Robust error handling** with automatic fallback strategies
+        **Supported formats:** MP4, AVI, MOV, WebM
+        **Analysis includes:** Visual content + Audio content + Speech transcription
+        **Original quality preserved** - no resizing or compression
+        ⚠️ **Note:** Audio extraction works best with standard video formats. Some videos may require fallback processing.
         """)
         with gr.Row():
         gr.Markdown("""
         ---
         ### ℹ️ About
+        This app uses **MiniCPM-o 2.6**, a state-of-the-art multimodal AI model for comprehensive video and audio understanding.
         - **Model:** [openbmb/MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6)
+        - **Capabilities:** Video analysis + Audio processing + Speech transcription
+        - **Audio Processing:** Powered by librosa for high-quality audio extraction
+        - **GPU:** Optimized for Hugging Face Spaces with SDPA/Flash Attention
+        **Processing includes:** Visual content analysis, audio content analysis, speech-to-text, music/sound identification, and synchronized audiovisual understanding.
         """)
     return demo

requirements.txt CHANGED Viewed

@@ -1,8 +1,18 @@
 # Core ML/AI packages (pinned for compatibility)
 torch==2.3.1
 transformers==4.44.2
 accelerate==0.33.0
 # Flash Attention (prebuilt wheel for torch 2.3.1 + Python 3.10)
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl

 # Core ML/AI packages (pinned for compatibility)
 torch==2.3.1
+torchaudio==2.3.1
 transformers==4.44.2
 accelerate==0.33.0
+# Audio processing (required by MiniCPM-o 2.6)
+librosa==0.10.1
+soundfile==0.12.1
+scipy==1.11.4
+# TTS dependencies (required by MiniCPM-o 2.6)
+vector_quantize_pytorch==1.14.24
+vocos==0.1.0
 # Flash Attention (prebuilt wheel for torch 2.3.1 + Python 3.10)
 https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl