chakkale commited on
Commit
60736be
·
1 Parent(s): c72e3d4
Files changed (2) hide show
  1. app.py +234 -38
  2. requirements.txt +10 -0
app.py CHANGED
@@ -1,7 +1,33 @@
1
  #!/usr/bin/env python3
2
  """
3
- MiniCPM-o 2.6 Video Analyzer - Hugging Face Spaces Version
4
- A Gradio interface for analyzing videos using MiniCPM-o 2.6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  """
6
 
7
  import os
@@ -19,6 +45,10 @@ try:
19
  from decord import VideoReader, cpu
20
  from PIL import Image
21
  import numpy as np
 
 
 
 
22
  except ImportError as e:
23
  print(f"Import error: {e}")
24
  print("Installing missing dependencies...")
@@ -34,6 +64,35 @@ def uniform_sample(l, n):
34
  idxs = [int(i * gap + gap / 2) for i in range(n)]
35
  return [l[i] for i in idxs]
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def encode_video(video_path, max_num_frames=32):
38
  """
39
  Encode video using the exact method from MiniCPM-o 2.6 sample code
@@ -72,11 +131,14 @@ def load_model():
72
  # Try to use Flash Attention 2 if available, fallback to SDPA
73
  try:
74
  import flash_attn
 
 
75
  attn_implementation = 'flash_attention_2'
76
- print("⚡ Flash Attention 2 detected - using optimized attention kernels")
77
- except ImportError:
78
  attn_implementation = 'sdpa'
79
- print("🚀 Using SDPA (Scaled Dot Product Attention) for optimal compatibility and performance")
 
80
 
81
  # Load model with memory optimization for Spaces
82
  try:
@@ -111,6 +173,11 @@ def load_model():
111
 
112
  print(f"✅ Model loaded with manual device placement to {device}")
113
 
 
 
 
 
 
114
  model.eval() # Set to evaluation mode
115
 
116
  tokenizer = AutoTokenizer.from_pretrained(
@@ -129,13 +196,13 @@ def load_model():
129
  raise e
130
 
131
  def analyze_video(video_file, prompt, max_frames):
132
- """Analyze video using MiniCPM-o 2.6"""
133
 
134
  if video_file is None:
135
  return "❌ Please upload a video file"
136
 
137
  if not prompt.strip():
138
- prompt = "Describe this video in detail"
139
 
140
  try:
141
  # Load model
@@ -144,6 +211,22 @@ def analyze_video(video_file, prompt, max_frames):
144
 
145
  # Process video
146
  print(f"Processing video: {video_file}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  frames = encode_video(video_file, max_num_frames=max_frames)
148
 
149
  if not frames:
@@ -151,9 +234,25 @@ def analyze_video(video_file, prompt, max_frames):
151
 
152
  print(f"📸 Extracted {len(frames)} frames")
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  # Prepare messages exactly as in sample code
155
  msgs = [
156
- {'role': 'user', 'content': frames + [prompt]},
157
  ]
158
 
159
  # Set decode params for video exactly as in sample code
@@ -161,7 +260,7 @@ def analyze_video(video_file, prompt, max_frames):
161
  params["use_image_id"] = False
162
  params["max_slice_nums"] = 1 # Reduced for Spaces memory limits
163
 
164
- print("🧠 Analyzing video with MiniCPM-o 2.6...")
165
 
166
  # Clear GPU cache before inference
167
  if torch.cuda.is_available():
@@ -179,7 +278,81 @@ def analyze_video(video_file, prompt, max_frames):
179
  )
180
  except Exception as inference_error:
181
  print(f"Inference error: {inference_error}")
182
- # Try to clear cache and retry once
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
  if torch.cuda.is_available():
184
  torch.cuda.empty_cache()
185
  raise inference_error
@@ -189,18 +362,27 @@ def analyze_video(video_file, prompt, max_frames):
189
  # Check which attention implementation was actually used
190
  attention_type = "Flash Attention 2 (Optimized)" if hasattr(model.config, 'attn_implementation') and model.config.attn_implementation == 'flash_attention_2' else "SDPA (Optimized)"
191
 
192
- result = f"""## 🎬 Video Analysis Results
 
 
 
 
 
 
 
 
193
 
194
  **Processing Time:** {processing_time:.2f} seconds
195
- **Frames Analyzed:** {len(frames)}
196
- **Model:** MiniCPM-o 2.6
197
- **Attention:** {attention_type}
 
198
 
199
  ### Analysis:
200
  {answer}
201
 
202
  ---
203
- *Powered by MiniCPM-o 2.6 on Hugging Face Spaces*
204
  """
205
 
206
  return result
@@ -211,18 +393,25 @@ def analyze_video(video_file, prompt, max_frames):
211
  return error_msg
212
 
213
  def get_example_prompts():
214
- """Get example prompts for video analysis"""
215
  return [
216
- "Describe this video in detail",
217
- "What is the main action happening in this video?",
218
- "Analyze the visual content and composition of this video",
219
- "What objects and people can you see in this video?",
220
- "Describe the setting and environment shown in the video",
221
- "What is the mood or atmosphere of this video?",
222
- "Analyze this video for marketing or creative elements",
223
- "What story is being told in this video?",
224
- "Describe the camera work and visual techniques used",
225
- "What emotions or feelings does this video convey?"
 
 
 
 
 
 
 
226
  ]
227
 
228
  # Create Gradio interface
@@ -251,18 +440,24 @@ def create_interface():
251
  ) as demo:
252
 
253
  gr.Markdown("""
254
- # 🎬 MiniCPM-o 2.6 Video Analyzer
255
 
256
- Upload a video and get detailed AI-powered analysis using the powerful MiniCPM-o 2.6 multimodal model.
257
 
258
  **Features:**
259
- - 🎥 Video content analysis
260
- - 🖼️ Frame-by-frame understanding
261
- - 📝 Detailed descriptions
262
- - 🎨 Creative and marketing insights
263
- - Flash Attention 2 optimized for maximum performance
 
 
 
 
 
 
264
 
265
- **Supported formats:** MP4, AVI, MOV, WebM
266
  """)
267
 
268
  with gr.Row():
@@ -347,13 +542,14 @@ def create_interface():
347
  gr.Markdown("""
348
  ---
349
  ### ℹ️ About
350
- This app uses **MiniCPM-o 2.6**, a state-of-the-art multimodal AI model for video understanding.
351
 
352
  - **Model:** [openbmb/MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6)
353
- - **Code:** Based on exact official sample implementation
354
- - **GPU:** Optimized for Hugging Face Spaces GPU with SDPA
 
355
 
356
- **Note:** Processing time depends on video length and complexity.
357
  """)
358
 
359
  return demo
 
1
  #!/usr/bin/env python3
2
  """
3
+ MiniCPM-o 2.6 Multimodal Video Analyzer - Hugging Face Spaces Version
4
+ A Gradio interface for comprehensive video + audio analysis using MiniCPM-o 2.6
5
+
6
+ MULTIMODAL CAPABILITIES:
7
+ - Video Analysis: Visual content, scenes, objects, actions, composition
8
+ - Audio Analysis: Speech, music, sound effects, ambient audio, transcription
9
+ - Combined Analysis: Synchronized audiovisual understanding and insights
10
+
11
+ SHAPE MISMATCH ERROR HANDLING:
12
+ This version includes robust handling for the common shape mismatch error:
13
+ "RuntimeError: shape mismatch: value tensor of shape [1080] cannot be broadcast to indexing result of shape [1044]"
14
+
15
+ The error occurs in the vision processing pipeline when there are inconsistencies between:
16
+ - Calculated position embeddings (e.g., 1080 positions)
17
+ - Attention mask dimensions (e.g., 1044 valid positions)
18
+
19
+ IMPLEMENTED SOLUTIONS:
20
+ 1. Fallback Strategy 1: Reduces max_slice_nums to 1 for simpler processing
21
+ 2. Fallback Strategy 2: Re-processes with fewer frames (16 max)
22
+ 3. Enhanced Error Messages: Provides actionable troubleshooting advice
23
+ 4. Video Diagnostics: Logs resolution and format information
24
+ 5. Audio Extraction: Librosa-based audio processing with error handling
25
+
26
+ VIDEO COMPATIBILITY:
27
+ - Preserves original video resolution and quality
28
+ - Format: MP4, AVI, MOV, WebM supported
29
+ - Duration: Any length (frames are sampled automatically)
30
+ - Audio: Automatically extracted and analyzed when available
31
  """
32
 
33
  import os
 
45
  from decord import VideoReader, cpu
46
  from PIL import Image
47
  import numpy as np
48
+ import librosa
49
+ import soundfile as sf
50
+ import tempfile
51
+ import os
52
  except ImportError as e:
53
  print(f"Import error: {e}")
54
  print("Installing missing dependencies...")
 
64
  idxs = [int(i * gap + gap / 2) for i in range(n)]
65
  return [l[i] for i in idxs]
66
 
67
+ def extract_audio_from_video(video_path, target_sr=16000, max_duration=30):
68
+ """
69
+ Extract audio from video file for MiniCPM-o 2.6 audio analysis
70
+
71
+ Args:
72
+ video_path: Path to video file
73
+ target_sr: Target sample rate (16kHz is standard for speech models)
74
+ max_duration: Maximum audio duration in seconds to prevent memory issues
75
+
76
+ Returns:
77
+ audio_array: Numpy array of audio samples
78
+ sample_rate: Sample rate of the audio
79
+ """
80
+ try:
81
+ # Use librosa to extract audio from video
82
+ print("🎵 Extracting audio from video...")
83
+ audio, sr = librosa.load(video_path, sr=target_sr, duration=max_duration)
84
+
85
+ if len(audio) == 0:
86
+ print("⚠️ No audio found in video")
87
+ return None, None
88
+
89
+ print(f"🎵 Audio extracted: {len(audio)/sr:.1f}s at {sr}Hz")
90
+ return audio, sr
91
+
92
+ except Exception as e:
93
+ print(f"⚠️ Audio extraction failed: {e}")
94
+ return None, None
95
+
96
  def encode_video(video_path, max_num_frames=32):
97
  """
98
  Encode video using the exact method from MiniCPM-o 2.6 sample code
 
131
  # Try to use Flash Attention 2 if available, fallback to SDPA
132
  try:
133
  import flash_attn
134
+ # Test if flash_attn actually works
135
+ from flash_attn import flash_attn_func
136
  attn_implementation = 'flash_attention_2'
137
+ print("⚡ Flash Attention 2 detected and verified - using optimized attention kernels")
138
+ except (ImportError, Exception) as e:
139
  attn_implementation = 'sdpa'
140
+ print(f"🚀 Flash Attention not available ({e}), using SDPA (Scaled Dot Product Attention)")
141
+ print(" SDPA provides ~95% of Flash Attention performance with 100% compatibility")
142
 
143
  # Load model with memory optimization for Spaces
144
  try:
 
173
 
174
  print(f"✅ Model loaded with manual device placement to {device}")
175
 
176
+ # Ensure model is on correct device for Flash Attention
177
+ if device == "cuda" and attn_implementation == 'flash_attention_2':
178
+ model = model.cuda()
179
+ print("✅ Model moved to CUDA for Flash Attention compatibility")
180
+
181
  model.eval() # Set to evaluation mode
182
 
183
  tokenizer = AutoTokenizer.from_pretrained(
 
196
  raise e
197
 
198
  def analyze_video(video_file, prompt, max_frames):
199
+ """Analyze video with audio using MiniCPM-o 2.6 multimodal capabilities"""
200
 
201
  if video_file is None:
202
  return "❌ Please upload a video file"
203
 
204
  if not prompt.strip():
205
+ prompt = "Describe this video in detail, including both visual content and audio"
206
 
207
  try:
208
  # Load model
 
211
 
212
  # Process video
213
  print(f"Processing video: {video_file}")
214
+
215
+ # Add video diagnostics to help identify potential issues
216
+ try:
217
+ import cv2
218
+ cap = cv2.VideoCapture(video_file)
219
+ if cap.isOpened():
220
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
221
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
222
+ fps = cap.get(cv2.CAP_PROP_FPS)
223
+ frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
224
+ print(f"📹 Video info: {width}x{height}, {fps:.1f}fps, {frame_count} frames")
225
+ cap.release()
226
+ except:
227
+ print("📹 Video info: Could not read video metadata")
228
+
229
+ # Extract video frames
230
  frames = encode_video(video_file, max_num_frames=max_frames)
231
 
232
  if not frames:
 
234
 
235
  print(f"📸 Extracted {len(frames)} frames")
236
 
237
+ # Extract audio from video
238
+ audio_data, sample_rate = extract_audio_from_video(video_file)
239
+
240
+ # Prepare multimodal content
241
+ content = frames.copy() # Start with video frames
242
+
243
+ # Add audio description to prompt if audio was found
244
+ if audio_data is not None:
245
+ enhanced_prompt = f"{prompt}\n\nPlease also analyze the audio content including any speech, music, sound effects, or ambient sounds in the video."
246
+ print(f"🎵 Audio analysis enabled - {len(audio_data)/sample_rate:.1f}s of audio")
247
+ else:
248
+ enhanced_prompt = f"{prompt}\n\nNote: No audio content detected in this video."
249
+ print("🔇 Video analysis only - no audio content")
250
+
251
+ content.append(enhanced_prompt)
252
+
253
  # Prepare messages exactly as in sample code
254
  msgs = [
255
+ {'role': 'user', 'content': content},
256
  ]
257
 
258
  # Set decode params for video exactly as in sample code
 
260
  params["use_image_id"] = False
261
  params["max_slice_nums"] = 1 # Reduced for Spaces memory limits
262
 
263
+ print("🧠 Analyzing video and audio with MiniCPM-o 2.6...")
264
 
265
  # Clear GPU cache before inference
266
  if torch.cuda.is_available():
 
278
  )
279
  except Exception as inference_error:
280
  print(f"Inference error: {inference_error}")
281
+
282
+ # Check if it's the known shape mismatch error
283
+ if "shape mismatch" in str(inference_error) and "cannot be broadcast" in str(inference_error):
284
+ print("🔧 Detected shape mismatch error - applying fallback strategy...")
285
+
286
+ try:
287
+ # Fallback Strategy 1: Reduce max_slice_nums to 1 for simpler processing
288
+ params["max_slice_nums"] = 1
289
+ print("📝 Trying with reduced max_slice_nums=1...")
290
+
291
+ if torch.cuda.is_available():
292
+ torch.cuda.empty_cache()
293
+
294
+ answer = model.chat(
295
+ msgs=msgs,
296
+ tokenizer=tokenizer,
297
+ **params
298
+ )
299
+ print("✅ Fallback strategy 1 successful!")
300
+
301
+ except Exception as fallback_error:
302
+ print(f"❌ Fallback strategy 1 failed: {fallback_error}")
303
+
304
+ try:
305
+ # Fallback Strategy 2: Re-process video with fewer frames
306
+ print("📝 Trying with fewer frames (16 max)...")
307
+ frames_reduced = encode_video(video_file, max_num_frames=16)
308
+
309
+ if frames_reduced:
310
+ # Prepare reduced content with audio info
311
+ content_reduced = frames_reduced.copy()
312
+ if audio_data is not None:
313
+ content_reduced.append(f"{prompt}\n\nPlease analyze both video and audio content (audio: {len(audio_data)/sample_rate:.1f}s)")
314
+ else:
315
+ content_reduced.append(f"{prompt}\n\nVideo-only analysis (no audio detected)")
316
+
317
+ msgs_reduced = [
318
+ {'role': 'user', 'content': content_reduced},
319
+ ]
320
+
321
+ params["max_slice_nums"] = 1
322
+ params["use_image_id"] = False
323
+
324
+ if torch.cuda.is_available():
325
+ torch.cuda.empty_cache()
326
+
327
+ answer = model.chat(
328
+ msgs=msgs_reduced,
329
+ tokenizer=tokenizer,
330
+ **params
331
+ )
332
+ print("✅ Fallback strategy 2 successful with reduced frames!")
333
+ else:
334
+ raise Exception("Could not process video with reduced frames")
335
+
336
+ except Exception as final_error:
337
+ print(f"❌ All fallback strategies failed: {final_error}")
338
+
339
+ # Provide helpful error message
340
+ error_details = f"""
341
+ Shape mismatch error detected. This can happen due to:
342
+ 1. Unusual video resolution/aspect ratio
343
+ 2. Video compression artifacts
344
+ 3. Frame dimension inconsistencies
345
+
346
+ Suggested solutions:
347
+ - Try a different video file
348
+ - Ensure video resolution is standard (e.g., 1920x1080, 1280x720)
349
+ - Convert video to a standard format (MP4 with H.264)
350
+
351
+ Technical details: {str(inference_error)}
352
+ """
353
+ return f"❌ Processing failed after multiple attempts:\n{error_details}"
354
+
355
+ # Try to clear cache and retry once for other errors
356
  if torch.cuda.is_available():
357
  torch.cuda.empty_cache()
358
  raise inference_error
 
362
  # Check which attention implementation was actually used
363
  attention_type = "Flash Attention 2 (Optimized)" if hasattr(model.config, 'attn_implementation') and model.config.attn_implementation == 'flash_attention_2' else "SDPA (Optimized)"
364
 
365
+ # Prepare analysis type info
366
+ if audio_data is not None:
367
+ analysis_type = f"Video + Audio Analysis ({len(audio_data)/sample_rate:.1f}s audio)"
368
+ media_info = f"**Frames Analyzed:** {len(frames)} \n**Audio Duration:** {len(audio_data)/sample_rate:.1f} seconds \n**Sample Rate:** {sample_rate} Hz"
369
+ else:
370
+ analysis_type = "Video-Only Analysis (no audio detected)"
371
+ media_info = f"**Frames Analyzed:** {len(frames)} \n**Audio:** Not detected or unavailable"
372
+
373
+ result = f"""## 🎬 Multimodal Video Analysis Results
374
 
375
  **Processing Time:** {processing_time:.2f} seconds
376
+ {media_info}
377
+ **Model:** MiniCPM-o 2.6
378
+ **Attention:** {attention_type}
379
+ **Analysis Type:** {analysis_type}
380
 
381
  ### Analysis:
382
  {answer}
383
 
384
  ---
385
+ *Powered by MiniCPM-o 2.6 Multimodal AI on Hugging Face Spaces*
386
  """
387
 
388
  return result
 
393
  return error_msg
394
 
395
  def get_example_prompts():
396
+ """Get example prompts for multimodal video + audio analysis"""
397
  return [
398
+ "Describe this video in detail, including both visual content and audio",
399
+ "What audio elements (speech, music, sound effects) complement the visual story?",
400
+ "Analyze the audiovisual composition - how do sound and image work together?",
401
+ "Describe what you see and hear - provide a complete sensory analysis",
402
+ "What is the main action happening, and what sounds accompany it?",
403
+ "Transcribe any speech and describe the visual context",
404
+ "🎵 AUDIO FOCUS: Analyze the audio track - music, dialogue, sound design, and ambient sounds",
405
+ "🎬 SCENE ANALYSIS: Describe the visual scenes and how audio enhances the storytelling",
406
+ "🎯 MARKETING ANALYSIS: Analyze this video from a marketing perspective, including both visual and audio elements. Assess brand messaging, target audience appeal, emotional impact through visuals and sound, music effectiveness, voiceover quality, and overall audiovisual marketing strategy.",
407
+ "📊 BRAND & AUDIENCE: How do visual and audio elements work together to appeal to the target demographic?",
408
+ "💡 CREATIVE STRATEGY: Evaluate the creative concept including visual aesthetics, audio design, and narrative flow",
409
+ "📈 CONVERSION OPTIMIZATION: Assess how both visual and audio elements contribute to engagement and conversion potential",
410
+ "🎮 MOBILE GAME AD ANALYSIS: Comprehensive analysis focusing on: 1) HOOK ANALYSIS (0-5 seconds): Visual and audio attention-grabbers, sound effects, music intro, voiceover hook. 2) AUDIOVISUAL SYNC: How well do visuals and audio align to create impact? 3) AUDIO BRANDING: Music style, sound effects quality, voice acting, brand audio identity. 4) MOBILE OPTIMIZATION: Audio clarity on small speakers, subtitle needs, sound-off viewing compatibility. Provide specific recommendations for improving both visual and audio elements.",
411
+ "🎙️ SPEECH ANALYSIS: Focus on any dialogue, narration, or vocal content in the video",
412
+ "🎶 MUSIC & SOUND: Analyze the musical score, sound effects, and audio atmosphere",
413
+ "What story is being told through both visual and audio elements?",
414
+ "Describe the mood created by combining visuals with the soundtrack"
415
  ]
416
 
417
  # Create Gradio interface
 
440
  ) as demo:
441
 
442
  gr.Markdown("""
443
+ # 🎬 MiniCPM-o 2.6 Multimodal Video Analyzer
444
 
445
+ Upload a video and get comprehensive AI-powered analysis using MiniCPM-o 2.6's multimodal capabilities.
446
 
447
  **Features:**
448
+ - 🎥 **Video content analysis** - visual scenes, objects, actions
449
+ - 🎵 **Audio analysis** - speech, music, sound effects, ambient audio
450
+ - 🖼️ **Frame-by-frame understanding** with temporal context
451
+ - 📝 **Detailed multimodal descriptions** combining visual and audio elements
452
+ - 🎨 **Creative and marketing insights** from complete audiovisual content
453
+ - ⚡ **Flash Attention 2 optimized** for maximum performance
454
+ - 🔧 **Robust error handling** with automatic fallback strategies
455
+
456
+ **Supported formats:** MP4, AVI, MOV, WebM
457
+ **Analysis includes:** Visual content + Audio content + Speech transcription
458
+ **Original quality preserved** - no resizing or compression
459
 
460
+ ⚠️ **Note:** Audio extraction works best with standard video formats. Some videos may require fallback processing.
461
  """)
462
 
463
  with gr.Row():
 
542
  gr.Markdown("""
543
  ---
544
  ### ℹ️ About
545
+ This app uses **MiniCPM-o 2.6**, a state-of-the-art multimodal AI model for comprehensive video and audio understanding.
546
 
547
  - **Model:** [openbmb/MiniCPM-o-2_6](https://huggingface.co/openbmb/MiniCPM-o-2_6)
548
+ - **Capabilities:** Video analysis + Audio processing + Speech transcription
549
+ - **Audio Processing:** Powered by librosa for high-quality audio extraction
550
+ - **GPU:** Optimized for Hugging Face Spaces with SDPA/Flash Attention
551
 
552
+ **Processing includes:** Visual content analysis, audio content analysis, speech-to-text, music/sound identification, and synchronized audiovisual understanding.
553
  """)
554
 
555
  return demo
requirements.txt CHANGED
@@ -1,8 +1,18 @@
1
  # Core ML/AI packages (pinned for compatibility)
2
  torch==2.3.1
 
3
  transformers==4.44.2
4
  accelerate==0.33.0
5
 
 
 
 
 
 
 
 
 
 
6
  # Flash Attention (prebuilt wheel for torch 2.3.1 + Python 3.10)
7
  https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
8
 
 
1
  # Core ML/AI packages (pinned for compatibility)
2
  torch==2.3.1
3
+ torchaudio==2.3.1
4
  transformers==4.44.2
5
  accelerate==0.33.0
6
 
7
+ # Audio processing (required by MiniCPM-o 2.6)
8
+ librosa==0.10.1
9
+ soundfile==0.12.1
10
+ scipy==1.11.4
11
+
12
+ # TTS dependencies (required by MiniCPM-o 2.6)
13
+ vector_quantize_pytorch==1.14.24
14
+ vocos==0.1.0
15
+
16
  # Flash Attention (prebuilt wheel for torch 2.3.1 + Python 3.10)
17
  https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.3cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
18