|
import gradio as gr
|
|
import torch
|
|
from transformers import AutoModel, AutoTokenizer
|
|
import cv2
|
|
import numpy as np
|
|
import tempfile
|
|
import os
|
|
import json
|
|
import time
|
|
from datetime import datetime
|
|
import ffmpeg
|
|
import soundfile as sf
|
|
from PIL import Image
|
|
import requests
|
|
import base64
|
|
import io
|
|
|
|
|
|
def load_model():
    """Load the MiniCPM-o 2.6 model and its tokenizer via ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` pair on success. If anything raises while
        loading, the error is printed and ``(None, None)`` is returned so the
        caller can decide how to degrade.
    """
    checkpoint = "openbmb/MiniCPM-o-2_6"

    try:
        # trust_remote_code=True is passed for both objects because the
        # checkpoint is loaded through the generic Auto* classes.
        loaded_model = AutoModel.from_pretrained(
            checkpoint,
            trust_remote_code=True,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True,
        )
        loaded_tokenizer = AutoTokenizer.from_pretrained(
            checkpoint,
            trust_remote_code=True,
        )
    except Exception as e:
        print(f"Error loading model: {e}")
        return None, None

    return loaded_model, loaded_tokenizer
|
|
|
|
|
|
# Load the model once at import time; the analysis functions below read the
# module-level ``model`` and ``tokenizer`` names.
print("Loading MiniCPM-o 2.6 model...")
model, tokenizer = load_model()
if model:
    print("Model loaded successfully!")
else:
    print("Failed to load model")
|
|
|
|
def extract_frames_from_video(video_path, max_frames=30):
    """Sample roughly one frame per second from a video file.

    Args:
        video_path: Path of the video to open with OpenCV.
        max_frames: Upper bound on the number of frames returned.

    Returns:
        ``(frames, timestamps)``: a list of RGB ``PIL.Image`` frames and a
        parallel list of integer timestamps in seconds. Both lists are empty
        when the video cannot be read, reports no usable frame rate, or any
        error occurs (the error is printed, never raised).
    """
    frames = []
    timestamps = []
    cap = None

    try:
        cap = cv2.VideoCapture(video_path)
        fps = cap.get(cv2.CAP_PROP_FPS)

        # Written as "not fps > 0" so that NaN is rejected as well as 0 and
        # negative values; without a frame rate we cannot sample per second.
        if not fps > 0:
            print(f"Error extracting frames: invalid frame rate {fps!r}")
            return [], []

        # Round instead of truncating so 29.97 fps samples every 30th frame,
        # and clamp to 1 so sub-1-fps videos do not cause a modulo by zero.
        frame_interval = max(1, round(fps))
        frame_count = 0

        while cap.isOpened() and len(frames) < max_frames:
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                # OpenCV decodes to BGR; PIL expects RGB.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame_rgb))
                # Derive the timestamp from the real frame position so it
                # stays correct when the interval had to be rounded/clamped.
                timestamps.append(round(frame_count / fps))

            frame_count += 1

        return frames, timestamps
    except Exception as e:
        print(f"Error extracting frames: {e}")
        return [], []
    finally:
        # Release the capture on every path, including errors mid-decode.
        if cap is not None:
            cap.release()
|
|
|
|
def extract_audio_from_video(video_path):
    """Extract a video's audio track into a mono, 16 kHz, 16-bit PCM WAV file.

    The WAV is written next to the input as ``<path without extension>_audio.wav``.

    Args:
        video_path: Path of the source video.

    Returns:
        The path of the WAV file, or ``None`` if extraction failed (the error
        is printed, never raised).
    """
    try:
        # Use splitext rather than str.replace('.mp4', ...): replace() leaves
        # the path unchanged for .mov/.webm/.MP4 inputs (making the output
        # collide with the input) and also rewrites '.mp4' inside directory
        # names.
        stem, _extension = os.path.splitext(video_path)
        audio_path = f"{stem}_audio.wav"

        stream = ffmpeg.input(video_path)
        stream = ffmpeg.output(stream, audio_path, acodec='pcm_s16le', ac=1, ar='16000')
        ffmpeg.run(stream, overwrite_output=True, quiet=True)

        return audio_path
    except Exception as e:
        print(f"Error extracting audio: {e}")
        return None
|
|
|
|
def analyze_multimodal_content(frames, timestamps, audio_path=None):
    """Ask MiniCPM-o for a marketing analysis of each sampled frame.

    Args:
        frames: Images to analyze; one ``model.chat`` call is made per image.
        timestamps: Timestamp in seconds paired with each frame.
        audio_path: Accepted for interface compatibility; it is not passed to
            the model by this function.

    Returns:
        A list of dicts with keys ``'frame'`` (1-based index), ``'timestamp'``
        (e.g. ``"3s"``) and ``'analysis'`` (model text, or an error message
        for that frame). If the model is not loaded, or something fails
        outside the per-frame handling, a plain error string is returned
        instead of a list.
    """
    if not (model and tokenizer):
        return "Model not loaded. Please check the model initialization."

    try:
        collected = []

        for index, (frame, timestamp) in enumerate(zip(frames, timestamps)):
            prompt = f"""You are an expert video narrative analyst specializing in marketing video analysis.

Analyze this frame (timestamp: {timestamp}s) and provide:

🎬 NARRATIVE ANALYSIS:
- What story moment is happening?
- What narrative function does this serve?
- How does this fit in the overall marketing flow?

🎨 VISUAL PSYCHOLOGY:
- What specific visual techniques are used?
- How do colors, composition, and lighting affect emotions?
- What psychological triggers are present?

🔗 MARKETING MECHANICS:
- How does this frame contribute to persuasion?
- What call-to-action elements are present?
- How does this build toward conversion?

Be specific and actionable in your analysis."""

            entry = {'frame': index + 1, 'timestamp': f"{timestamp}s"}

            # A failure on one frame is recorded in its entry so the
            # remaining frames are still analyzed.
            try:
                reply = model.chat(
                    image=frame,
                    msgs=[{'role': 'user', 'content': prompt}],
                    tokenizer=tokenizer,
                    sampling=True,
                    temperature=0.7,
                    max_new_tokens=500,
                )
                # chat() may hand back a tuple; the text is its first item.
                if isinstance(reply, tuple):
                    reply = reply[0]
                entry['analysis'] = reply
            except Exception as e:
                print(f"Error analyzing frame {index}: {e}")
                entry['analysis'] = f"Error analyzing frame: {str(e)}"

            collected.append(entry)

        return collected

    except Exception as e:
        return f"Error in multimodal analysis: {str(e)}"
|
|
|
|
def generate_comprehensive_summary(analysis_results):
    """Condense the per-frame analyses into one overall marketing summary.

    Args:
        analysis_results: Iterable of dicts with ``'frame'``, ``'timestamp'``
            and ``'analysis'`` keys, as produced by the per-frame analysis.

    Returns:
        The model's summary text, or an error string if the model is not
        loaded or the call fails.
    """
    if not (model and tokenizer):
        return "Model not loaded for summary generation."

    try:
        # One paragraph per frame, separated by blank lines.
        combined_analysis = "\n\n".join(
            f"Frame {result['frame']} ({result['timestamp']}): {result['analysis']}"
            for result in analysis_results
        )

        summary_prompt = f"""Based on the detailed frame-by-frame analysis below, provide a comprehensive marketing video analysis:

📖 STORY ARCHITECTURE:
- What is the overall narrative structure?
- How does the story progress from beginning to end?
- What transformation or journey is presented?

🎯 PERSUASION STRATEGY:
- What psychological principles are used?
- How does the video build toward conversion?
- What specific persuasion techniques are employed?

🎨 VISUAL STORYTELLING:
- How do visual elements support the narrative?
- What cinematic techniques enhance the message?
- How does the visual flow create emotional impact?

🚀 MARKETING EFFECTIVENESS:
- What makes this video compelling?
- How does it capture and maintain attention?
- What specific elements drive viewer action?

Frame Analysis:
{combined_analysis}

Provide specific, actionable insights in 300 words or less."""

        # Text-only request: no image accompanies the summary prompt.
        reply = model.chat(
            image=None,
            msgs=[{'role': 'user', 'content': summary_prompt}],
            tokenizer=tokenizer,
            sampling=True,
            temperature=0.3,
            max_new_tokens=600,
        )

        if isinstance(reply, tuple):
            return reply[0]
        return reply

    except Exception as e:
        return f"Error generating summary: {str(e)}"
|
|
|
|
def process_video_with_minicpm(video_file):
    """Run the whole analysis pipeline on an uploaded video.

    Args:
        video_file: The value handed over by the UI. Gradio's ``Video``
            component is documented to pass a filepath string; objects that
            expose the path as ``.name`` are accepted too.

    Returns:
        A ``(final_report, frame_analysis, comprehensive_summary)`` tuple of
        markdown strings. On any failure the first element carries the
        message and the other two are empty strings.
    """
    if video_file is None:
        return "Please upload a video file.", "", ""

    video_path = None
    audio_path = None

    try:
        start_time = time.time()

        # Resolve the path first: calling ``.name`` on a plain string raises
        # AttributeError, which previously turned every upload into an error.
        if isinstance(video_file, (str, os.PathLike)):
            video_path = os.fspath(video_file)
        else:
            video_path = video_file.name

        frames, timestamps = extract_frames_from_video(video_path)
        if not frames:
            return "Failed to extract frames from video.", "", ""

        audio_path = extract_audio_from_video(video_path)

        analysis_results = analyze_multimodal_content(frames, timestamps, audio_path)
        # A string here is an error message rather than a list of results.
        if isinstance(analysis_results, str):
            return analysis_results, "", ""

        comprehensive_summary = generate_comprehensive_summary(analysis_results)

        frame_analysis = "\n\n".join(
            f"🎬 **Frame {result['frame']} ({result['timestamp']})**\n{result['analysis']}"
            for result in analysis_results
        )

        processing_time = time.time() - start_time

        final_report = f"""
# 🎬 MiniCPM-o Video Analysis Report

**Analysis completed in {processing_time:.1f} seconds**
**Frames analyzed: {len(frames)}**
**Model: MiniCPM-o 2.6**

## 📊 Comprehensive Summary

{comprehensive_summary}

---

## 🎯 Technical Details

- **Processing Time**: {processing_time:.1f} seconds
- **Frames Extracted**: {len(frames)}
- **Audio Extracted**: {"Yes" if audio_path else "No"}
- **Model Used**: MiniCPM-o 2.6 (Multimodal)
- **Analysis Type**: Hybrid Audio-Visual

---

*Analysis powered by MiniCPM-o 2.6 - A GPT-4o Level MLLM*
"""

        return final_report, frame_analysis, comprehensive_summary

    except Exception as e:
        return f"Error processing video: {str(e)}", "", ""
    finally:
        # The extracted WAV is only needed during this call; remove it so
        # repeated uploads do not pile up files. Never touch the input video.
        if audio_path and audio_path != video_path:
            try:
                os.remove(audio_path)
            except OSError:
                # Best-effort cleanup: a leftover temp file must not turn a
                # finished analysis into a failure.
                pass
|
|
|
|
|
|
def create_interface():
    """Build the Gradio Blocks UI for the analyzer.

    The layout is a header, a row with the upload/controls column next to a
    tabbed results column, and a footer. The analyze button is wired to
    ``process_video_with_minicpm``.

    Returns:
        The constructed ``gr.Blocks`` app; launching it is left to the caller.
    """
    header_md = """
# 🎬 MiniCPM-o Video Analyzer

**Test MiniCPM-o 2.6 for advanced video analysis**

Upload a marketing video (up to 30 seconds) to get:
- 🎯 Frame-by-frame narrative analysis
- 🎨 Visual psychology insights
- 🚀 Marketing effectiveness analysis
- 📊 Comprehensive summary

*Powered by MiniCPM-o 2.6 - Local multimodal analysis*
"""

    tips_md = """
**Tips:**
- Upload videos up to 30 seconds for optimal analysis
- MP4 format recommended
- Include audio for comprehensive analysis
"""

    footer_md = """
## 🎯 What This Analysis Provides

- **Narrative Analysis**: Story structure and progression
- **Visual Psychology**: Color, composition, and emotional triggers
- **Marketing Mechanics**: Persuasion techniques and conversion strategies
- **Attention Engineering**: How the video captures and maintains viewer focus
- **Comparative Insights**: How this compares to your existing GPT-4o analysis
"""

    with gr.Blocks(title="MiniCPM-o Video Analyzer", theme=gr.themes.Soft()) as demo:
        gr.Markdown(header_md)

        with gr.Row():
            # Left column: upload, trigger button and usage tips.
            with gr.Column(scale=1):
                video_input = gr.Video(
                    label="Upload Marketing Video",
                    sources=["upload"],
                    include_audio=True,
                )
                analyze_btn = gr.Button(
                    "🚀 Analyze with MiniCPM-o",
                    variant="primary",
                    size="lg",
                )
                gr.Markdown(tips_md)

            # Right column: one tab per output of the analysis handler.
            with gr.Column(scale=2):
                with gr.Tabs():
                    with gr.TabItem("📊 Analysis Report"):
                        report_output = gr.Markdown(
                            label="Comprehensive Analysis Report",
                            value="Upload a video and click 'Analyze with MiniCPM-o' to get started.",
                        )

                    with gr.TabItem("🎬 Frame Analysis"):
                        frame_output = gr.Markdown(
                            label="Frame-by-Frame Analysis",
                            value="Detailed analysis of each frame will appear here.",
                        )

                    with gr.TabItem("📝 Summary"):
                        summary_output = gr.Markdown(
                            label="Executive Summary",
                            value="Marketing effectiveness summary will appear here.",
                        )

        # The handler's three return values map onto the three tabs in order.
        analyze_btn.click(
            fn=process_video_with_minicpm,
            inputs=[video_input],
            outputs=[report_output, frame_output, summary_output],
        )

        gr.Markdown(footer_md)

    return demo
|
|
|
|
|
|
if __name__ == "__main__":
    # Build the UI, then serve it on all interfaces at port 7860 with
    # Gradio's share option enabled.
    demo = create_interface()
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    demo.launch(**launch_options)