Spaces: Build error
Update app.py
app.py CHANGED
@@ -30,17 +30,18 @@ except ImportError:
    print("SVC not available, using basic voice conversion")

class AICoverGenerator:
-    def __init__(self):
+    def \
+        __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.temp_dir = tempfile.mkdtemp()
        self.voice_models = {
            "drake": "Drake Style Voice",
-            "ariana": "Ariana Style Voice",
+            "ariana": "Ariana Style Voice",
            "weeknd": "The Weeknd Style Voice",
            "taylor": "Taylor Swift Style Voice",
            "custom": "Custom Voice Model"
        }
-
+
        # Initialize audio separation model
        if DEMUCS_AVAILABLE:
            try:
@@ -51,24 +52,24 @@ class AICoverGenerator:
                self.separation_model = None
        else:
            self.separation_model = None
-
+
    def separate_vocals(self, audio_path: str) -> Tuple[str, str]:
        """Separate vocals and instrumentals from audio"""
        try:
            # Load audio
            audio, sr = librosa.load(audio_path, sr=44100, mono=False)
-
+
            if self.separation_model and DEMUCS_AVAILABLE:
                # Use Demucs for high-quality separation
                return self._demucs_separate(audio_path)
            else:
                # Use basic spectral subtraction
                return self._basic_separate(audio, sr)
-
+
        except Exception as e:
            print(f"Error in vocal separation: {e}")
            return None, None
-
+
    def _demucs_separate(self, audio_path: str) -> Tuple[str, str]:
        """Use Demucs for audio separation"""
        try:
@@ -76,220 +77,223 @@ class AICoverGenerator:
            audio, sr = librosa.load(audio_path, sr=44100, mono=False)
            if audio.ndim == 1:
                audio = np.stack([audio, audio])
-
+
            # Convert to tensor
            audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
-
+
            # Apply separation
            with torch.no_grad():
                sources = apply_model(self.separation_model, audio_tensor)
-
+
            # Extract vocals and instrumental
            vocals = sources[0, 3].cpu().numpy()  # vocals channel
            instrumental = sources[0, 0].cpu().numpy()  # drums + bass + other
-
+
            # Save separated audio
            vocals_path = os.path.join(self.temp_dir, "vocals.wav")
            instrumental_path = os.path.join(self.temp_dir, "instrumental.wav")
-
+
            sf.write(vocals_path, vocals.T, 44100)
            sf.write(instrumental_path, instrumental.T, 44100)
-
+
            return vocals_path, instrumental_path
-
+
        except Exception as e:
            print(f"Demucs separation error: {e}")
            return self._basic_separate(audio, 44100)
-
+
    def _basic_separate(self, audio: np.ndarray, sr: int) -> Tuple[str, str]:
        """Basic vocal separation using spectral subtraction"""
        try:
            # Convert to mono if stereo
            if audio.ndim > 1:
                audio = librosa.to_mono(audio)
-
+
            # Compute STFT
            stft = librosa.stft(audio, n_fft=2048, hop_length=512)
            magnitude, phase = np.abs(stft), np.angle(stft)
-
+
            # Simple vocal isolation (center channel extraction)
            # This is a basic approach - real implementation would be more sophisticated
            vocal_mask = np.ones_like(magnitude)
            vocal_mask[:, :magnitude.shape[1]//4] *= 0.3  # Reduce low frequencies
            vocal_mask[:, 3*magnitude.shape[1]//4:] *= 0.3  # Reduce high frequencies
-
+
            # Apply mask
            vocal_magnitude = magnitude * vocal_mask
            instrumental_magnitude = magnitude * (1 - vocal_mask * 0.7)
-
+
            # Reconstruct audio
            vocal_stft = vocal_magnitude * np.exp(1j * phase)
            instrumental_stft = instrumental_magnitude * np.exp(1j * phase)
-
+
            vocals = librosa.istft(vocal_stft, hop_length=512)
            instrumental = librosa.istft(instrumental_stft, hop_length=512)
-
+
            # Save files
            vocals_path = os.path.join(self.temp_dir, "vocals.wav")
            instrumental_path = os.path.join(self.temp_dir, "instrumental.wav")
-
+
            sf.write(vocals_path, vocals, sr)
            sf.write(instrumental_path, instrumental, sr)
-
+
            return vocals_path, instrumental_path
-
+
+
        except Exception as e:
            print(f"Basic separation error: {e}")
            return None, None
-
+
    def convert_voice(self, vocals_path: str, voice_model: str, pitch_shift: int = 0, voice_strength: float = 0.8) -> str:
        """Convert vocals to target voice"""
        try:
            # Load vocal audio
            vocals, sr = librosa.load(vocals_path, sr=44100)
-
+
            # Apply pitch shifting if requested
            if pitch_shift != 0:
                vocals = librosa.effects.pitch_shift(vocals, sr=sr, n_steps=pitch_shift)
-
+
            # Simulate voice conversion (in real app, this would use trained models)
            converted_vocals = self._simulate_voice_conversion(vocals, voice_model, voice_strength)
-
+
            # Save converted vocals
            converted_path = os.path.join(self.temp_dir, "converted_vocals.wav")
            sf.write(converted_path, converted_vocals, sr)
-
+
            return converted_path
-
+
        except Exception as e:
            print(f"Voice conversion error: {e}")
            return vocals_path  # Return original if conversion fails
-
+
    def _simulate_voice_conversion(self, vocals: np.ndarray, voice_model: str, strength: float) -> np.ndarray:
-        """Simulate voice conversion (placeholder for actual model inference)"""
+        """Simulate voice conversion \
+        (placeholder for actual model inference)"""
        # This is a simplified simulation - real implementation would use trained models
-
+
        # Apply different effects based on voice model
        if voice_model == "drake":
            # Simulate Drake's voice characteristics
-            vocals = self._apply_voice_characteristics(vocals,
-
-
-
+            vocals = self._apply_voice_characteristics(vocals,
+                pitch_factor=0.85,
+                formant_shift=-0.1,
+                roughness=0.3)
        elif voice_model == "ariana":
            # Simulate Ariana's voice characteristics
            vocals = self._apply_voice_characteristics(vocals,
-
-
-
+                pitch_factor=1.2,
+                formant_shift=0.2,
+                breathiness=0.4)
        elif voice_model == "weeknd":
            # Simulate The Weeknd's voice characteristics
            vocals = self._apply_voice_characteristics(vocals,
-
-
-
+                pitch_factor=0.9,
+                formant_shift=-0.05,
+                reverb=0.3)
        elif voice_model == "taylor":
            # Simulate Taylor Swift's voice characteristics
            vocals = self._apply_voice_characteristics(vocals,
-
-
-
-
+                pitch_factor=1.1,
+                formant_shift=0.1,
+                clarity=0.8)
+
        # Blend with original based on strength
        return vocals * strength + vocals * (1 - strength) * 0.3
-
+
    def _apply_voice_characteristics(self, vocals: np.ndarray, **kwargs) -> np.ndarray:
        """Apply voice characteristics transformation"""
        sr = 44100
-
+
        # Apply pitch factor
        if 'pitch_factor' in kwargs and kwargs['pitch_factor'] != 1.0:
-            vocals = librosa.effects.pitch_shift(vocals, sr=sr,
-
-
+            vocals = librosa.effects.pitch_shift(vocals, sr=sr,
+                n_steps=12 * np.log2(kwargs['pitch_factor']))
+
        # Apply formant shifting (simplified)
        if 'formant_shift' in kwargs:
            # This is a simplified formant shift - real implementation would be more complex
            stft = librosa.stft(vocals)
            magnitude = np.abs(stft)
            phase = np.angle(stft)
-
+
            # Shift formants by stretching frequency axis
            shift_factor = 1 + kwargs['formant_shift']
            shifted_magnitude = np.zeros_like(magnitude)
-
+
            for i in range(magnitude.shape[0]):
                shifted_idx = int(i * shift_factor)
                if shifted_idx < magnitude.shape[0]:
                    shifted_magnitude[shifted_idx] = magnitude[i]
-
+
            shifted_stft = shifted_magnitude * np.exp(1j * phase)
            vocals = librosa.istft(shifted_stft)
-
+
        # Apply effects
        if 'roughness' in kwargs:
            # Add slight distortion for roughness
            vocals = np.tanh(vocals * (1 + kwargs['roughness']))
-
+
        if 'breathiness' in kwargs:
            # Add noise for breathiness
            noise = np.random.normal(0, 0.01, vocals.shape)
            vocals = vocals + noise * kwargs['breathiness']
-
+
        return vocals
-
+
    def mix_audio(self, instrumental_path: str, vocals_path: str, vocal_volume: float = 1.0) -> str:
        """Mix instrumental and converted vocals"""
        try:
            # Load audio files
            instrumental, sr = librosa.load(instrumental_path, sr=44100)
            vocals, _ = librosa.load(vocals_path, sr=44100)
-
+
            # Ensure same length
            min_len = min(len(instrumental), len(vocals))
            instrumental = instrumental[:min_len]
            vocals = vocals[:min_len]
-
+
            # Mix audio
            mixed = instrumental + vocals * vocal_volume
-
+
            # Normalize to prevent clipping
            max_amplitude = np.max(np.abs(mixed))
            if max_amplitude > 0.95:
                mixed = mixed / max_amplitude * 0.95
-
+
            # Save mixed audio
            output_path = os.path.join(self.temp_dir, "final_cover.wav")
            sf.write(output_path, mixed, sr)
-
+
            return output_path
-
+
        except Exception as e:
            print(f"Audio mixing error: {e}")
            return None
-
+
    def process_custom_voice(self, voice_samples: list) -> str:
        """Process custom voice samples for training"""
        if not voice_samples:
            return "No voice samples provided"
-
+
        try:
            # In a real implementation, this would train a voice model
            # For demo, we'll just validate the samples
            total_duration = 0
+
            for sample in voice_samples:
                if sample is not None:
                    audio, sr = librosa.load(sample, sr=44100)
                    duration = len(audio) / sr
                    total_duration += duration
-
+
            if total_duration < 30:
                return "Need at least 30 seconds of voice samples"
            elif total_duration > 300:
                return "Voice samples too long (max 5 minutes)"
            else:
-                return f"Custom voice model ready
+                return f"Custom voice model ready!\n({total_duration:.1f}s of training data)"
-
+
        except Exception as e:
            return f"Error processing voice samples: {e}"

@@ -304,48 +308,50 @@ def generate_cover(
    auto_tune: bool = False,
    output_format: str = "wav"
) -> Tuple[Optional[str], str]:
-    """Main function to generate AI cover"""
-
+    """Main \
+    function to generate AI cover"""
+
    if audio_file is None:
        return None, "Please upload an audio file"
-
+
    try:
        # Step 1: Separate vocals and instrumentals
        yield None, "🎵 Separating vocals and instrumentals..."
        vocals_path, instrumental_path = cover_generator.separate_vocals(audio_file.name)
-
+
        if vocals_path is None:
            return None, "❌ Failed to separate vocals"
-
+
        # Step 2: Convert vocals to target voice
        yield None, f"🎤 Converting vocals to {voice_model} style..."
        converted_vocals_path = cover_generator.convert_voice(
-            vocals_path,
-            voice_model,
-            pitch_shift,
+            vocals_path,
+            voice_model,
+            pitch_shift,
            voice_strength / 100
        )
-
+
        # Step 3: Apply auto-tune if requested
        if auto_tune:
            yield None, "🎼 Applying auto-tune..."
            # Auto-tune implementation would go here
            pass
-
+
        # Step 4: Mix final audio
        yield None, "🎧 Mixing final audio..."
        final_path = cover_generator.mix_audio(instrumental_path, converted_vocals_path)
-
+
        if final_path is None:
            return None, "❌ Failed to mix audio"
-
-        # Convert to requested format if needed
+
+        # Convert to requested \
+        format if needed
        if output_format != "wav":
            yield None, f"💾 Converting to {output_format.upper()}..."
            # Format conversion would go here
-
+
        return final_path, "✅ AI Cover generated successfully!"
-
+
    except Exception as e:
        return None, f"❌ Error: {str(e)}"

@@ -353,18 +359,14 @@ def process_voice_samples(voice_files) -> str:
    """Process uploaded voice samples for custom voice training"""
    if not voice_files:
        return "No voice samples uploaded"
-
+
    return cover_generator.process_custom_voice(voice_files)

# Create Gradio interface
def create_interface():
    with gr.Blocks(
        title="🎵 AI Cover Song Platform",
-        theme=gr.themes.Soft(
-            primary_hue="indigo",
-            secondary_hue="purple",
-            neutral_hue="slate"
-        ),
+        # Removed theme=gr.themes.Soft for compatibility with Gradio versions < 4.0.0
        css="""
        .gradio-container {
            font-family: 'Inter', sans-serif;
@@ -388,7 +390,7 @@ def create_interface():
        }
        """
    ) as app:
-
+
        # Header
        with gr.Row():
            gr.Markdown("""
@@ -402,7 +404,7 @@ def create_interface():
                </div>
            </div>
            """)
-
+
        # Step 1: Upload Audio
        with gr.Row():
            with gr.Column():
@@ -413,7 +415,7 @@ def create_interface():
                    format="wav"
                )
                gr.Markdown("*Supports MP3, WAV, FLAC files*")
-
+
        # Step 2: Voice Selection
        with gr.Row():
            with gr.Column():
@@ -424,7 +426,7 @@ def create_interface():
                    value="Drake Style Voice",
                    interactive=True
                )
-
+
                # Custom voice training section
                with gr.Accordion("🎙️ Train Custom Voice (Optional)", open=False):
                    voice_samples = gr.File(
@@ -434,18 +436,18 @@ def create_interface():
                    )
                    train_btn = gr.Button("Train Custom Voice", variant="secondary")
                    training_status = gr.Textbox(label="Training Status", interactive=False)
-
+
                    train_btn.click(
                        process_voice_samples,
                        inputs=[voice_samples],
                        outputs=[training_status]
                    )
-
+
        # Step 3: Audio Settings
        with gr.Row():
            with gr.Column():
                gr.Markdown("## ⚙️ Step 3: Audio Settings")
-
+
                with gr.Row():
                    pitch_shift = gr.Slider(
                        minimum=-12,
@@ -461,7 +463,7 @@ def create_interface():
                        step=5,
                        label="Voice Strength (%)"
                    )
-
+
                with gr.Row():
                    auto_tune = gr.Checkbox(label="Apply Auto-tune", value=False)
                    output_format = gr.Dropdown(
@@ -469,7 +471,7 @@ def create_interface():
                        label="Output Format",
                        value="wav"
                    )
-
+
        # Step 4: Generate Cover
        with gr.Row():
            with gr.Column():
@@ -479,33 +481,35 @@ def create_interface():
                    variant="primary",
                    size="lg"
                )
-
+
                progress_text = gr.Textbox(
                    label="Progress",
                    value="Ready to generate cover...",
                    interactive=False
                )
-
+
        # Results
        with gr.Row():
            with gr.Column():
                gr.Markdown("## 🎉 Results")
-
+
                with gr.Row():
                    original_audio = gr.Audio(label="Original Song", interactive=False)
                    cover_audio = gr.Audio(label="AI Cover", interactive=False)
-
+
        # Legal Notice
        with gr.Row():
            gr.Markdown("""
-            <div style="background: rgba(255, 193, 7, 0.1); border: 1px solid rgba(255, 193, 7, 0.3); border-radius: 10px; padding: 1rem; margin: 1rem 0;">
+            <div style="background: rgba(255, 193, 7, 0.1);
+            border: 1px solid rgba(255, 193, 7, 0.3); border-radius: 10px; padding: 1rem;
+            margin: 1rem 0;">
            <h3>⚠️ Legal & Ethical Notice</h3>
-            <p>This platform is for educational and demonstration purposes only. Voice cloning technology should be used responsibly.
-            Always obtain proper consent before cloning someone's voice. Do not use this tool to create misleading or harmful content.
+            <p>This platform is for educational and demonstration purposes only. Voice cloning technology should be used responsibly.
+            Always obtain proper consent before cloning someone's voice. Do not use this tool to create misleading or harmful content.
            Respect copyright laws and artist rights.</p>
            </div>
            """)
-
+
        # Event handlers
        generate_btn.click(
            generate_cover,
@@ -519,14 +523,14 @@ def create_interface():
            ],
            outputs=[cover_audio, progress_text]
        )
-
+
        # Update original audio when file is uploaded
        audio_input.change(
            lambda x: x,
            inputs=[audio_input],
            outputs=[original_audio]
        )
-
+
        return app

# Launch the app
@@ -537,4 +541,4 @@ if __name__ == "__main__":
        server_port=7860,
        share=True,
        show_error=True
-    )
+    )
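
The `_apply_voice_characteristics` helper in the diff converts a multiplicative `pitch_factor` into semitones for `librosa.effects.pitch_shift` via `n_steps = 12 * log2(pitch_factor)`. A quick check of the factors used in the diff; the values below are computed here for illustration, not printed by the app:

import numpy as np

# n_steps = 12 * log2(pitch_factor): semitone offsets for the factors in the diff.
for factor in (0.85, 0.9, 1.1, 1.2):
    print(f"pitch_factor={factor:<4} -> {12 * np.log2(factor):+.2f} semitones")
# pitch_factor=0.85 -> -2.81 semitones
# pitch_factor=0.9  -> -1.82 semitones
# pitch_factor=1.1  -> +1.65 semitones
# pitch_factor=1.2  -> +3.16 semitones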