muzzz committed
Commit 10b0de3 · 1 Parent(s): b0c020d
Files changed (5)
  1. .gitignore +146 -0
  2. asr.py +200 -0
  3. main.py +360 -0
  4. processing.py +123 -0
  5. pyproject.toml +14 -0
.gitignore ADDED
@@ -0,0 +1,146 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # uv stuff
+ .python-version
+ uv.lock
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath stuff
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+
+ # Project-specific ignores
+ # Modal deployment files
+ .modal/
+ modal_volumes/
+
+ # Gradio temporary files
+ gradio_cached_examples/
+ flagged/
+
+ # AI/ML artifacts
+ models/
+ data/
+ experiments/
+ wandb/
+ mlruns/
asr.py ADDED
@@ -0,0 +1,200 @@
+ import modal
+ import uuid
+
+ MODEL_NAME = "nvidia/parakeet-tdt-0.6b-v2"
+
+ def download_model():
+     try:
+         import nemo.collections.asr as nemo_asr  # type: ignore
+         nemo_asr.models.ASRModel.from_pretrained(MODEL_NAME)
+     except ImportError:
+         pass
+
+ asr_image = (
+     modal.Image.debian_slim(python_version="3.12")
+     .apt_install("git", "ffmpeg")
+     .pip_install(
+         "torch",
+         "librosa",
+         "omegaconf",
+         "lightning",
+         "cuda-python>=12.3",
+         "git+https://github.com/NVIDIA/multi-storage-client.git",
+         "nemo_toolkit[asr] @ git+https://github.com/NVIDIA/NeMo@main",
+         extra_options="-U",
+         gpu="A10G",
+     )
+     .run_function(
+         download_model,
+         gpu="A10G",
+     )
+ )
+
+ with asr_image.imports():
+     import nemo.collections.asr as nemo_asr  # type: ignore
+     from nemo.collections.asr.parts.submodules.rnnt_decoding import RNNTDecodingConfig  # type: ignore
+     from nemo.collections.asr.parts.utils.streaming_utils import BatchedFrameASRTDT  # type: ignore
+     from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_rnnt  # type: ignore
+     import math
+     import torch  # type: ignore
+     from omegaconf import OmegaConf  # type: ignore
+     import librosa  # type: ignore
+     import os
+
+ app = modal.App(name="clipscript-asr-service")
+
+ # This must be the same volume object used in processing.py
+ upload_volume = modal.Volume.from_name(
+     "clipscript-uploads", create_if_missing=True
+ )
+
+ @app.cls(
+     image=asr_image,
+     gpu="A10G",
+     scaledown_window=600,
+     volumes={"/data": upload_volume},  # Mount the shared volume
+ )
+ class ASR:
+     @modal.enter()
+     def startup(self):
+         print("loading model...")
+         self.model = nemo_asr.models.ASRModel.from_pretrained(MODEL_NAME)
+         print("model loaded.")
+
+         self.model.freeze()
+         torch.set_grad_enabled(False)
+
+         # Configure for buffered inference
+         model_cfg = self.model._cfg
+         OmegaConf.set_struct(model_cfg.preprocessor, False)
+         model_cfg.preprocessor.dither = 0.0
+         model_cfg.preprocessor.pad_to = 0
+         OmegaConf.set_struct(model_cfg.preprocessor, True)
+
+         # Setup decoding for TDT model
+         decoding_cfg = RNNTDecodingConfig()
+         decoding_cfg.strategy = "greedy"  # TDT requires greedy
+         decoding_cfg.preserve_alignments = True
+         decoding_cfg.fused_batch_size = -1
+
+         if hasattr(self.model, 'change_decoding_strategy'):
+             self.model.change_decoding_strategy(decoding_cfg)
+
+         # Calculate timing parameters
+         self.feature_stride = model_cfg.preprocessor['window_stride']
+         self.model_stride = 4  # TDT model stride
+         self.model_stride_in_secs = self.feature_stride * self.model_stride
+
+         # Buffered inference parameters
+         self.chunk_len_in_secs = 15.0
+         self.total_buffer_in_secs = 20.0
+         self.batch_size = 64
+         self.max_steps_per_timestep = 15
+
+         # Calculate chunk parameters
+         self.tokens_per_chunk = math.ceil(self.chunk_len_in_secs / self.model_stride_in_secs)
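+         # Worked example, assuming a typical 0.01 s preprocessor window_stride
+         # (an assumed value; the real one is read from the model config above):
+         #   model_stride_in_secs = 0.01 * 4 = 0.04 s per decoder timestep
+         #   tokens_per_chunk = ceil(15.0 / 0.04) = 375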
+
+         print("ASR setup complete with buffered inference support.")
+
+     def _get_audio_duration(self, audio_path: str) -> float:
+         try:
+             return librosa.get_duration(path=audio_path)
+         except Exception:
+             # Fallback: estimate from file size (rough approximation)
+             file_size = os.path.getsize(audio_path)
+             # Rough estimate: 16kHz, 16-bit mono = ~32KB per second
+             return file_size / 32000
+
+     def _simple_transcribe(self, audio_path: str) -> str:
+         print("Using simple transcription...")
+         output = self.model.transcribe([audio_path])
+
+         if not output or not hasattr(output[0], "text"):
+             return ""
+
+         return output[0].text
+
+     def _buffered_transcribe(self, audio_path: str) -> str:
+         print("Using buffered transcription...")
+
+         # Setup TDT frame processor
+         frame_asr = BatchedFrameASRTDT(
+             asr_model=self.model,
+             frame_len=self.chunk_len_in_secs,
+             total_buffer=self.total_buffer_in_secs,
+             batch_size=self.batch_size,
+             max_steps_per_timestep=self.max_steps_per_timestep,
+             stateful_decoding=False,
+         )
+
+         # Calculate delay for TDT: the chunk length plus half of the extra
+         # buffer, expressed in decoder timesteps
+         mid_delay = math.ceil((self.chunk_len_in_secs + (self.total_buffer_in_secs - self.chunk_len_in_secs) / 2) / self.model_stride_in_secs)
+
+         # Process with buffered inference
+         hyps = get_buffered_pred_feat_rnnt(
+             asr=frame_asr,
+             tokens_per_chunk=self.tokens_per_chunk,
+             delay=mid_delay,
+             model_stride_in_secs=self.model_stride_in_secs,
+             batch_size=self.batch_size,
+             manifest=None,
+             filepaths=[audio_path],
+             accelerator='gpu',
+         )
+
+         # Extract transcription
+         if hyps and len(hyps) > 0:
+             return hyps[0].text
+
+         return ""
+
+     @modal.method()
+     def transcribe(self, audio_filename: str | None = None, audio_bytes: bytes | None = None, use_buffered: bool | None = None) -> dict[str, str]:
+         audio_path = None
+         temp_audio_path = None
+         try:
+             if audio_filename:
+                 audio_path = f"/data/{audio_filename}"
+             elif audio_bytes:
+                 # When bytes are passed, they must be written to a file for librosa/NeMo to read.
+                 temp_audio_path = f"/tmp/input_{uuid.uuid4()}.wav"
+                 with open(temp_audio_path, "wb") as f:
+                     f.write(audio_bytes)
+                 audio_path = temp_audio_path
+             else:
+                 raise ValueError("Either 'audio_filename' or 'audio_bytes' must be provided.")
+
+             if not os.path.exists(audio_path):
+                 return {"text": "", "error": f"Audio file not found at path: {audio_path}"}
+
+             # Determine transcription method
+             if use_buffered is None:
+                 duration = self._get_audio_duration(audio_path)
+                 use_buffered = duration > 1800.0  # 30 minutes
+                 print(f"Audio duration: {duration:.1f}s, using {'buffered' if use_buffered else 'simple'} transcription")
+
+             if use_buffered:
+                 text = self._buffered_transcribe(audio_path)
+             else:
+                 text = self._simple_transcribe(audio_path)
+
+             print("transcription complete.")
+             return {"text": text, "error": ""}
+
+         except Exception as e:
+             print(f"Transcription error: {e}")
+             return {"text": "", "error": str(e)}
+         finally:
+             if temp_audio_path and os.path.exists(temp_audio_path):
+                 os.remove(temp_audio_path)
+
+     @modal.method()
+     def transcribe_simple(self, audio_filename: str | None = None, audio_bytes: bytes | None = None) -> dict[str, str]:
+         """Force simple transcription (for compatibility)"""
+         return self.transcribe(audio_filename=audio_filename, audio_bytes=audio_bytes, use_buffered=False)
+
+     @modal.method()
+     def transcribe_buffered(self, audio_filename: str | None = None, audio_bytes: bytes | None = None) -> dict[str, str]:
+         """Force buffered transcription"""
+         return self.transcribe(audio_filename=audio_filename, audio_bytes=audio_bytes, use_buffered=True)
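Once deployed, this class is looked up by name from the client side, which is exactly what main.py and processing.py do below. A minimal standalone sketch, assuming the app has been deployed and with `sample.wav` as a placeholder local file:

    import modal

    # Look up the deployed ASR class by app name and class name.
    ASR = modal.Cls.from_name("clipscript-asr-service", "ASR")

    with open("sample.wav", "rb") as f:  # placeholder file
        audio_bytes = f.read()

    # Instantiate the handle and invoke the method remotely.
    result = ASR().transcribe.remote(audio_bytes=audio_bytes)
    print(result["error"] or result["text"])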
main.py ADDED
@@ -0,0 +1,360 @@
+ from functools import wraps
+ import logging
+ import gradio as gr
+ import os
+ import modal
+ from openai import OpenAI
+ from dotenv import load_dotenv
+ import re
+ import time
+ import uuid
+ import yt_dlp
+ import tempfile
+ import shutil
+ from pathlib import Path
+
+ load_dotenv()
+
+
+ process_media_remotely = modal.Function.from_name("clipscript-processing-service", "process_media")
+ asr_handle = modal.Cls.from_name("clipscript-asr-service", "ASR")
+ upload_volume = modal.Volume.from_name("clipscript-uploads", create_if_missing=True)
+
+
+ llm = "deepseek/deepseek-r1-0528:free"
+ api_key = os.environ.get("OPENROUTER_API_KEY")
+
+
+ def retry_on_rate_limit(max_retries: int = 5, base_delay: float = 2.0):
+     """Decorator for exponential backoff on rate limits"""
+     def decorator(func):
+         @wraps(func)
+         def wrapper(*args, **kwargs):
+             delay = base_delay
+             for attempt in range(max_retries):
+                 try:
+                     return func(*args, **kwargs)
+                 except Exception as e:
+                     # Check for a 429 status code in the different places it can appear
+                     status_code = getattr(getattr(e, 'response', None), 'status_code', None)
+                     if status_code == 429 or '429' in str(e) or 'rate limit' in str(e).lower():
+                         logging.warning(f"Rate limit hit. Retrying in {delay:.1f} seconds...")
+                         time.sleep(delay)
+                         delay *= 2
+                     else:
+                         raise
+             raise Exception("Max retries exceeded due to rate limits or other persistent errors.")
+         return wrapper
+     return decorator
+
+
+ def extract_youtube_video_id(url: str) -> str | None:
+     """Extract the YouTube video ID from various YouTube URL formats."""
+     patterns = [
+         r'(?:youtube\.com\/watch\?v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/v\/)([^&\n?#]+)',
+         r'youtube\.com\/watch\?.*v=([^&\n?#]+)'
+     ]
+
+     for pattern in patterns:
+         match = re.search(pattern, url)
+         if match:
+             return match.group(1)
+     return None
+
+ def get_youtube_thumbnail_url(video_id: str) -> str:
+     """Get the high-quality thumbnail URL for a YouTube video."""
+     return f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg"
+
+ client = OpenAI(
+     base_url="https://openrouter.ai/api/v1",
+     api_key=api_key,
+ )
+
+ def download_and_convert_youtube_audio(url: str) -> str:
+     """
+     Downloads audio from a YouTube URL and converts it to a 16 kHz mono WAV file.
+     Uses a temporary directory for all intermediate files, ensuring cleanup.
+     Returns the path to the final temporary WAV file.
+     """
+     temp_dir = tempfile.mkdtemp()
+     try:
+         output_tmpl = os.path.join(temp_dir, "audio.%(ext)s")
+         ydl_opts = {
+             "format": "bestaudio/best",
+             "outtmpl": output_tmpl,
+             "postprocessors": [{
+                 'key': 'FFmpegExtractAudio',
+                 'preferredcodec': 'wav',
+             }],
+             'postprocessor_args': {
+                 'extractaudio': ['-ar', '16000', '-ac', '1']
+             },
+             "quiet": True,
+         }
+
+         with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+             ydl.download([url])
+
+         # Find the downloaded .wav file
+         downloaded_files = list(Path(temp_dir).glob("*.wav"))
+         if not downloaded_files:
+             raise FileNotFoundError("yt-dlp failed to create a WAV file. The video might be protected or unavailable.")
+
+         # Move the final file to a new temporary location so we can clean up the directory
+         source_path = downloaded_files[0]
+         fd, dest_path = tempfile.mkstemp(suffix=".wav")
+         os.close(fd)
+         shutil.move(source_path, dest_path)
+
+         return dest_path
+     finally:
+         shutil.rmtree(temp_dir)
+
+ def handle_transcription(file, url):
+     if not file and not (url and url.strip()):
+         gr.Warning("Please upload a file or enter a URL.")
+         return "Error: Please upload a file or enter a URL."
+
+     gr.Info("Starting secure transcription... This might take a moment.")
+
+     try:
+         result = None
+         if url and url.strip():
+             video_id = extract_youtube_video_id(url)
+             if video_id:
+                 converted_wav_path = None
+                 try:
+                     print(f"Detected YouTube URL. Processing locally: {url}")
+                     converted_wav_path = download_and_convert_youtube_audio(url)
+
+                     # Read audio bytes and call the ASR service
+                     with open(converted_wav_path, "rb") as f:
+                         audio_bytes = f.read()
+
+                     print("Sending audio bytes to ASR service.")
+                     result = asr_handle().transcribe.remote(audio_bytes=audio_bytes)
+                 finally:
+                     # Clean up the final temp file
+                     if converted_wav_path and os.path.exists(converted_wav_path):
+                         os.remove(converted_wav_path)
+
+             else:
+                 # Process other URLs remotely and securely.
+                 print(f"Sending URL to Modal for processing: {url}")
+                 result = process_media_remotely.remote(url=url)
+         elif file is not None:
+             # For file uploads:
+             # 1. Generate a unique ID for the file.
+             upload_id = f"upload-{uuid.uuid4()}"
+             print(f"Uploading file to Modal volume with ID: {upload_id}")
+
+             # 2. Upload the local file to the remote volume.
+             with upload_volume.batch_upload() as batch:
+                 batch.put_file(file, upload_id)
+
+             # 3. Trigger remote processing by passing the upload ID.
+             print(f"Sending upload ID to Modal for processing: {upload_id}")
+             result = process_media_remotely.remote(upload_id=upload_id)
+
+         if result.get("error"):
+             return f"Error from ASR service: {result['error']}"
+
+         return result["text"]
+
+     except Exception as e:
+         print(f"An error occurred: {e}")
+         return f"Error: {str(e)}"
+     finally:
+         # Gradio's gr.File widget creates a temporary file; clean it up whether or not transcription succeeded.
+         if file and os.path.exists(file):
+             os.remove(file)
+
+ def add_transcript_to_chat(transcript: str):
+     if transcript.startswith("Error"):
+         gr.Error("Transcription failed. Please check the logs.")
+         return []
+     gr.Info("Transcript ready! Generating blog post...")
+     # Return an empty list for display; the transcript is passed to the LLM separately
+     return []
+
+ def user_chat(user_message: str, history: list):
+     return "", history + [{"role": "user", "content": user_message}]
+
+ @retry_on_rate_limit(max_retries=3, base_delay=1.0)
+ def _stream_chat_response(history: list, system_prompt: str, transcript: str | None = None):
+     if not history and not transcript:
+         # Don't do anything if there's no history and no transcript
+         return
+
+     if transcript and transcript.startswith("Error"):
+         return
+
+     # Include the transcript as the first user message if provided, but don't display it
+     messages = [{"role": "system", "content": system_prompt}]
+     if transcript:
+         messages.append({"role": "user", "content": transcript})
+     messages.extend(history)
+
+     stream = client.chat.completions.create(
+         model=llm,
+         messages=messages,
+         stream=True
+     )
+
+     history.append({"role": "assistant", "content": ""})
+     response_content = ""
+     for chunk in stream:
+         content = chunk.choices[0].delta.content
+         if content:
+             response_content += content
+             history[-1]["content"] = response_content
+             yield history
+
+ def generate_blog_post(history: list, transcript: str, context: str):
+     system_prompt = """You are an expert blog writer and editor. Your task is to transform a raw video transcription into a well-structured, engaging, and publish-ready blog post in Markdown format.
+ Core Mandate: Erase the Video Origin
+ This is a critical function. The reader must not know the content came from a video.
+ Eliminate all video-specific language: Remove phrases like "in this video," "thanks for watching," "as you can see here," "welcome to the channel," etc.
+ Scrub all platform calls-to-action: No "like and subscribe," "hit the bell icon," or "comment below."
+ Remove sponsor messages and ads: Completely omit any sponsor mentions.
+ Rephrase visual references: Convert "look at this screen" to a description of the information itself (e.g., "The data reveals that...").
+ Content & Formatting Rules:
+ Title: Create a compelling, SEO-friendly H1 title.
+ Structure: Use ## for main headings and ### for subheadings to create a logical flow.
+ Readability: Use short paragraphs, bulleted/numbered lists, and bolding for key terms.
+ Refine Prose: Convert conversational speech into clean, professional writing.
+ Remove all filler words (um, uh, like, you know).
+ Fix grammar and consolidate rambling sentences.
+ Flow: Start with a strong introduction and end with a concise summary or conclusion.
+ Your output must be a complete, polished article in Markdown."""
+
+     # Combine the transcript with additional context if provided
+     full_transcript = transcript
+     if context and context.strip():
+         full_transcript = f"{transcript}\n\n--- Additional Context ---\n{context.strip()}\n\nThis is some additional context relevant to the transcription above."
+
+     yield from _stream_chat_response(history, system_prompt, full_transcript)
+
+ def bot_chat(history: list):
+     system_prompt = "You are a helpful assistant that helps refine a blog post created from an audio transcript. The user will provide instructions for changes and you will return only the updated blog post."
+     yield from _stream_chat_response(history, system_prompt)
+
+ def update_thumbnail_display(url: str):
+     """Update the thumbnail display when a YouTube URL is entered."""
+     if not url or not url.strip():
+         return gr.update(visible=False, value=None)
+
+     video_id = extract_youtube_video_id(url)
+     if video_id:
+         thumbnail_url = get_youtube_thumbnail_url(video_id)
+         return gr.update(visible=True, value=thumbnail_url)
+     else:
+         return gr.update(visible=False, value=None)
+
+ # Gradio interface
+ theme = gr.themes.Ocean()
+ with gr.Blocks(title="ClipScript", theme=theme) as demo:
+     gr.Markdown("# 🎬➡️📝 ClipScript: Video-to-Blog Transformer", elem_classes="hero-title")
+
+     gr.Markdown("### Upload an audio file, or provide a YouTube/direct URL *of any size*.")
+     with gr.Row():
+         # Column 1: file input, URL input, and thumbnail
+         with gr.Column(scale=1):
+             file_input = gr.File(label="Upload any audio file", type="filepath", height=200, file_types=["audio", ".webm", ".mp3", ".mp4", ".m4a", ".ogg", ".wav"])
+
+             with gr.Row():
+                 with gr.Column():
+                     url_input = gr.Textbox(
+                         label="YouTube (recommended) or direct audio URL",
+                         placeholder="youtube.com/watch?v=... OR xyz.com/audio.mp3",
+                         scale=2
+                     )
+
+                     # YouTube thumbnail display
+                     thumbnail_display = gr.Image(
+                         label="Thumbnail",
+                         visible=False,
+                         height=100,
+                         show_download_button=False,
+                         interactive=False,
+                         scale=2
+                     )
+
+         # Column 2: transcript view
+         with gr.Column(scale=2):
+             transcript_output = gr.Textbox(label="Transcription powered by Modal Labs", lines=12, interactive=True, show_copy_button=True)
+
+     transcribe_button = gr.Button("Blogify", variant="primary")
+
+     gr.Markdown("---")
+
+     # Additional context section
+     context_input = gr.Textbox(
+         label="Additional Context",
+         placeholder="Enter any additional context, code, articles, or any references that relate to the video content...",
+         lines=5,
+         interactive=True
+     )
+
+     chatbot = gr.Chatbot(
+         label="Blog Post", type="messages", height=500, show_copy_all_button=True, show_copy_button=True, show_share_button=True
+     )
+     chat_input = gr.Textbox(
+         label="Your message",
+         placeholder="Refine the blog post or ask for changes...",
+         container=False,
+     )
+     clear_button = gr.ClearButton([chat_input, chatbot])
+
+
+     # Event handlers to disable/enable inputs based on usage
+     def on_file_upload(file):
+         if file is not None:
+             return gr.update(interactive=False), gr.update(visible=False, value=None)
+         else:
+             return gr.update(interactive=True), gr.update(visible=False, value=None)
+
+     def on_url_change(url):
+         if url and url.strip():
+             thumbnail_update = update_thumbnail_display(url)
+             return gr.update(interactive=False), thumbnail_update
+         else:
+             return gr.update(interactive=True), gr.update(visible=False, value=None)
+
+     file_input.change(fn=on_file_upload, inputs=file_input, outputs=[url_input, thumbnail_display])
+     url_input.change(fn=on_url_change, inputs=url_input, outputs=[file_input, thumbnail_display])
+
+     # Chained events for blog generation
+     (
+         transcribe_button.click(
+             fn=handle_transcription,
+             inputs=[file_input, url_input],
+             outputs=transcript_output,
+         )
+         .then(
+             fn=lambda: gr.update(value=None, interactive=True),
+             outputs=file_input,
+             queue=False,
+         )
+         .then(
+             fn=add_transcript_to_chat,
+             inputs=transcript_output,
+             outputs=chatbot,
+             queue=False,
+         )
+         .then(fn=generate_blog_post, inputs=[chatbot, transcript_output, context_input], outputs=chatbot)
+     )
+
+     # Event handler for follow-up chat
+     chat_input.submit(
+         fn=user_chat,
+         inputs=[chat_input, chatbot],
+         outputs=[chat_input, chatbot],
+         queue=False,
+     ).then(fn=bot_chat, inputs=chatbot, outputs=chatbot)
+
+
+ if __name__ == "__main__":
+     demo.launch()
processing.py ADDED
@@ -0,0 +1,123 @@
+ import modal
+ import time
+ import uuid
+
+ sandbox_image = (
+     modal.Image.debian_slim()
+     .apt_install("ffmpeg")
+ )
+
+ app = modal.App(
+     "clipscript-processing-service",
+ )
+
+ asr_handle = modal.Cls.from_name("clipscript-asr-service", "ASR")
+
+ # A persistent, named volume to stage file uploads from the Gradio app.
+ upload_volume = modal.Volume.from_name(
+     "clipscript-uploads", create_if_missing=True
+ )
+
+ @app.function(
+     image=sandbox_image,
+     volumes={"/data": upload_volume},
+     cpu=2.0,
+     memory=4096,
+     timeout=7200,
+     retries=modal.Retries(
+         max_retries=3,
+         backoff_coefficient=2.0,
+         initial_delay=1.0,
+     ),
+ )
+ def process_media(url: str | None = None, upload_id: str | None = None):
+     """
+     Securely processes media from a URL or a file from the upload Volume using a Sandbox.
+
+     This function orchestrates a Sandbox to perform the download and conversion,
+     then passes the resulting audio bytes to the ASR service.
+     """
+     output_filename = f"processed-{uuid.uuid4()}.wav"
+     output_wav_path_in_sandbox = f"/tmp/{output_filename}"
+     audio_bytes = None
+
+     sb = None
+     try:
+         volumes = {"/data": upload_volume} if upload_id else {}
+
+         sb = modal.Sandbox.create(
+             image=sandbox_image,
+             volumes=volumes,
+         )
+
+         cmd = []
+         if url:
+             print(f"Sandbox: Downloading and converting from non-YouTube URL: {url}")
+             cmd = [
+                 'ffmpeg', '-i', url,
+                 '-ar', '16000', '-ac', '1', '-y', output_wav_path_in_sandbox
+             ]
+         elif upload_id:
+             print(f"Sandbox: Converting uploaded file: {upload_id}")
+             # Input path is on the mounted volume
+             uploaded_file_path_in_sandbox = f"/data/{upload_id}"
+             cmd = [
+                 'ffmpeg', '-i', uploaded_file_path_in_sandbox,
+                 '-ar', '16000', '-ac', '1', '-y', output_wav_path_in_sandbox
+             ]
+         else:
+             raise ValueError("Either 'url' or 'upload_id' must be provided.")
+
+         print("Sandbox: Executing ffmpeg...")
+         p = sb.exec(*cmd)
+         p.wait()
+
+         if p.returncode != 0:
+             stderr = p.stderr.read()
+             raise RuntimeError(f"ffmpeg execution failed: {stderr}")
+
+         print("Sandbox: Process complete. Reading WAV data from the sandbox's filesystem.")
+
+         # Read the file directly from the sandbox's filesystem.
+         with sb.open(output_wav_path_in_sandbox, "rb") as f:
+             audio_bytes = f.read()
+
+     except Exception as e:
+         print(f"Error during sandbox processing: {e}")
+         raise
+     finally:
+         if sb:
+             print("Terminating sandbox.")
+             sb.terminate()
+
+     if not audio_bytes:
+         raise RuntimeError("Processing failed to produce audio data.")
+
+     # If we processed a user upload, we can now clean up the original file.
+     if upload_id:
+         try:
+             print(f"Cleaning up original upload {upload_id} from volume.")
+             upload_volume.remove_file(upload_id)
+             upload_volume.commit()
+         except Exception as e:
+             # This is not a critical error, so we just warn.
+             print(f"Warning: Failed to clean up {upload_id} from volume: {e}")
+
+     print("Sending audio bytes to ASR service.")
+
+     # Retry the ASR service call with exponential backoff
+     max_asr_retries = 3
+     result = None
+     for attempt in range(max_asr_retries):
+         try:
+             # Instantiate the remote class handle (as in main.py) and pass the
+             # audio bytes directly to the ASR service
+             result = asr_handle().transcribe.remote(audio_bytes=audio_bytes)
+             break
+         except Exception as e:
+             if attempt == max_asr_retries - 1:
+                 raise
+             wait_time = 2 ** attempt
+             print(f"ASR service attempt {attempt + 1} failed: {e}. Retrying in {wait_time}s...")
+             time.sleep(wait_time)
+
+     return result
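For reference, the Gradio front end invokes this function by name. A minimal client-side sketch, assuming the app is deployed and using a placeholder URL:

    import modal

    # Look up the deployed function by app name and function name.
    process_media = modal.Function.from_name("clipscript-processing-service", "process_media")

    # Non-YouTube URLs are downloaded and converted in the sandbox, then transcribed.
    result = process_media.remote(url="https://example.com/audio.mp3")
    print(result["error"] or result["text"])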
pyproject.toml ADDED
@@ -0,0 +1,14 @@
+ [project]
+ name = "clipscript"
+ version = "0.1.0"
+ description = "Add your description here"
+ readme = "README.md"
+ requires-python = ">=3.13"
+ dependencies = [
+     "gradio>=5.33.1",
+     "gradio-client>=1.10.3",
+     "modal>=1.0.3",
+     "openai>=1.86.0",
+     "python-dotenv>=1.1.0",
+     "yt-dlp>=2025.6.9",
+ ]
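Note: main.py resolves both Modal apps by name at import time, so they presumably need to be deployed first (e.g. `modal deploy asr.py`, then `modal deploy processing.py`); after that, `python main.py` launches the Gradio front end locally, with `OPENROUTER_API_KEY` supplied via the environment or a `.env` file.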