🎵 AI Cover Song Platform
Transform any song with AI voice synthesis
import gradio as gr import torch import librosa import numpy as np import soundfile as sf import os import tempfile from pathlib import Path import json from typing import Tuple, Optional import subprocess import shutil import warnings warnings.filterwarnings("ignore") # NLTK download for 'punkt' tokenizer data import nltk try: nltk.data.find('tokenizers/punkt') except nltk.downloader.DownloadError: nltk.download('punkt') # Import audio processing libraries try: from demucs.pretrained import get_model from demucs.apply import apply_model DEMUCS_AVAILABLE = True except ImportError: DEMUCS_AVAILABLE = False print("Demucs not available, using basic separation") try: import so_vits_svc_fork as svc SVC_AVAILABLE = True except ImportError: SVC_AVAILABLE = False print("SVC not available, using basic voice conversion") class AICoverGenerator: def \ __init__(self): self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.temp_dir = tempfile.mkdtemp() self.voice_models = { "drake": "Drake Style Voice", "ariana": "Ariana Style Voice", "weeknd": "The Weeknd Style Voice", "taylor": "Taylor Swift Style Voice", "custom": "Custom Voice Model" } # Initialize audio separation model if DEMUCS_AVAILABLE: try: self.separation_model = get_model('htdemucs') self.separation_model.to(self.device) except Exception as e: print(f"Error loading Demucs: {e}") self.separation_model = None else: self.separation_model = None def separate_vocals(self, audio_path: str) -> Tuple[str, str]: """Separate vocals and instrumentals from audio""" try: # Load audio audio, sr = librosa.load(audio_path, sr=44100, mono=False) if self.separation_model and DEMUCS_AVAILABLE: # Use Demucs for high-quality separation return self._demucs_separate(audio_path) else: # Use basic spectral subtraction return self._basic_separate(audio, sr) except Exception as e: print(f"Error in vocal separation: {e}") return None, None def _demucs_separate(self, audio_path: str) -> Tuple[str, str]: """Use Demucs for audio separation""" try: # Load audio for Demucs audio, sr = librosa.load(audio_path, sr=44100, mono=False) if audio.ndim == 1: audio = np.stack([audio, audio]) # Convert to tensor audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(self.device) # Apply separation with torch.no_grad(): sources = apply_model(self.separation_model, audio_tensor) # Extract vocals and instrumental vocals = sources[0, 3].cpu().numpy() # vocals channel instrumental = sources[0, 0].cpu().numpy() # drums + bass + other # Save separated audio vocals_path = os.path.join(self.temp_dir, "vocals.wav") instrumental_path = os.path.join(self.temp_dir, "instrumental.wav") sf.write(vocals_path, vocals.T, 44100) sf.write(instrumental_path, instrumental.T, 44100) return vocals_path, instrumental_path except Exception as e: print(f"Demucs separation error: {e}") return self._basic_separate(audio, 44100) def _basic_separate(self, audio: np.ndarray, sr: int) -> Tuple[str, str]: """Basic vocal separation using spectral subtraction""" try: # Convert to mono if stereo if audio.ndim > 1: audio = librosa.to_mono(audio) # Compute STFT stft = librosa.stft(audio, n_fft=2048, hop_length=512) magnitude, phase = np.abs(stft), np.angle(stft) # Simple vocal isolation (center channel extraction) # This is a basic approach - real implementation would be more sophisticated vocal_mask = np.ones_like(magnitude) vocal_mask[:, :magnitude.shape[1]//4] *= 0.3 # Reduce low frequencies vocal_mask[:, 3*magnitude.shape[1]//4:] *= 0.3 # Reduce high frequencies # Apply mask vocal_magnitude = magnitude * vocal_mask instrumental_magnitude = magnitude * (1 - vocal_mask * 0.7) # Reconstruct audio vocal_stft = vocal_magnitude * np.exp(1j * phase) instrumental_stft = instrumental_magnitude * np.exp(1j * phase) vocals = librosa.istft(vocal_stft, hop_length=512) instrumental = librosa.istft(instrumental_stft, hop_length=512) # Save files vocals_path = os.path.join(self.temp_dir, "vocals.wav") instrumental_path = os.path.join(self.temp_dir, "instrumental.wav") sf.write(vocals_path, vocals, sr) sf.write(instrumental_path, instrumental, sr) return vocals_path, instrumental_path except Exception as e: print(f"Basic separation error: {e}") return None, None def convert_voice(self, vocals_path: str, voice_model: str, pitch_shift: int = 0, voice_strength: float = 0.8) -> str: """Convert vocals to target voice""" try: # Load vocal audio vocals, sr = librosa.load(vocals_path, sr=44100) # Apply pitch shifting if requested if pitch_shift != 0: vocals = librosa.effects.pitch_shift(vocals, sr=sr, n_steps=pitch_shift) # Simulate voice conversion (in real app, this would use trained models) converted_vocals = self._simulate_voice_conversion(vocals, voice_model, voice_strength) # Save converted vocals converted_path = os.path.join(self.temp_dir, "converted_vocals.wav") sf.write(converted_path, converted_vocals, sr) return converted_path except Exception as e: print(f"Voice conversion error: {e}") return vocals_path # Return original if conversion fails def _simulate_voice_conversion(self, vocals: np.ndarray, voice_model: str, strength: float) -> np.ndarray: """Simulate voice conversion \ (placeholder for actual model inference)""" # This is a simplified simulation - real implementation would use trained models # Apply different effects based on voice model if voice_model == "drake": # Simulate Drake's voice characteristics vocals = self._apply_voice_characteristics(vocals, pitch_factor=0.85, formant_shift=-0.1, roughness=0.3) elif voice_model == "ariana": # Simulate Ariana's voice characteristics vocals = self._apply_voice_characteristics(vocals, pitch_factor=1.2, formant_shift=0.2, breathiness=0.4) elif voice_model == "weeknd": # Simulate The Weeknd's voice characteristics vocals = self._apply_voice_characteristics(vocals, pitch_factor=0.9, formant_shift=-0.05, reverb=0.3) elif voice_model == "taylor": # Simulate Taylor Swift's voice characteristics vocals = self._apply_voice_characteristics(vocals, pitch_factor=1.1, formant_shift=0.1, clarity=0.8) # Blend with original based on strength return vocals * strength + vocals * (1 - strength) * 0.3 def _apply_voice_characteristics(self, vocals: np.ndarray, **kwargs) -> np.ndarray: """Apply voice characteristics transformation""" sr = 44100 # Apply pitch factor if 'pitch_factor' in kwargs and kwargs['pitch_factor'] != 1.0: vocals = librosa.effects.pitch_shift(vocals, sr=sr, n_steps=12 * np.log2(kwargs['pitch_factor'])) # Apply formant shifting (simplified) if 'formant_shift' in kwargs: # This is a simplified formant shift - real implementation would be more complex stft = librosa.stft(vocals) magnitude = np.abs(stft) phase = np.angle(stft) # Shift formants by stretching frequency axis shift_factor = 1 + kwargs['formant_shift'] shifted_magnitude = np.zeros_like(magnitude) for i in range(magnitude.shape[0]): shifted_idx = int(i * shift_factor) if shifted_idx < magnitude.shape[0]: shifted_magnitude[shifted_idx] = magnitude[i] shifted_stft = shifted_magnitude * np.exp(1j * phase) vocals = librosa.istft(shifted_stft) # Apply effects if 'roughness' in kwargs: # Add slight distortion for roughness vocals = np.tanh(vocals * (1 + kwargs['roughness'])) if 'breathiness' in kwargs: # Add noise for breathiness noise = np.random.normal(0, 0.01, vocals.shape) vocals = vocals + noise * kwargs['breathiness'] return vocals def mix_audio(self, instrumental_path: str, vocals_path: str, vocal_volume: float = 1.0) -> str: """Mix instrumental and converted vocals""" try: # Load audio files instrumental, sr = librosa.load(instrumental_path, sr=44100) vocals, _ = librosa.load(vocals_path, sr=44100) # Ensure same length min_len = min(len(instrumental), len(vocals)) instrumental = instrumental[:min_len] vocals = vocals[:min_len] # Mix audio mixed = instrumental + vocals * vocal_volume # Normalize to prevent clipping max_amplitude = np.max(np.abs(mixed)) if max_amplitude > 0.95: mixed = mixed / max_amplitude * 0.95 # Save mixed audio output_path = os.path.join(self.temp_dir, "final_cover.wav") sf.write(output_path, mixed, sr) return output_path except Exception as e: print(f"Audio mixing error: {e}") return None def process_custom_voice(self, voice_samples: list) -> str: """Process custom voice samples for training""" if not voice_samples: return "No voice samples provided" try: # In a real implementation, this would train a voice model # For demo, we'll just validate the samples total_duration = 0 for sample in voice_samples: if sample is not None: audio, sr = librosa.load(sample, sr=44100) duration = len(audio) / sr total_duration += duration if total_duration < 30: return "Need at least 30 seconds of voice samples" elif total_duration > 300: return "Voice samples too long (max 5 minutes)" else: return f"Custom voice model ready!\n({total_duration:.1f}s of training data)" except Exception as e: return f"Error processing voice samples: {e}" # Initialize the AI Cover Generator cover_generator = AICoverGenerator() def generate_cover( audio_file, voice_model: str, pitch_shift: int = 0, voice_strength: float = 80, auto_tune: bool = False, output_format: str = "wav" ) -> Tuple[Optional[str], str]: """Main \ function to generate AI cover""" if audio_file is None: return None, "Please upload an audio file" try: # Step 1: Separate vocals and instrumentals yield None, "🎵 Separating vocals and instrumentals..." vocals_path, instrumental_path = cover_generator.separate_vocals(audio_file.name) if vocals_path is None: return None, "❌ Failed to separate vocals" # Step 2: Convert vocals to target voice yield None, f"🎤 Converting vocals to {voice_model} style..." converted_vocals_path = cover_generator.convert_voice( vocals_path, voice_model, pitch_shift, voice_strength / 100 ) # Step 3: Apply auto-tune if requested if auto_tune: yield None, "🎼 Applying auto-tune..." # Auto-tune implementation would go here pass # Step 4: Mix final audio yield None, "🎧 Mixing final audio..." final_path = cover_generator.mix_audio(instrumental_path, converted_vocals_path) if final_path is None: return None, "❌ Failed to mix audio" # Convert to requested \ format if needed if output_format != "wav": yield None, f"💾 Converting to {output_format.upper()}..." # Format conversion would go here return final_path, "✅ AI Cover generated successfully!" except Exception as e: return None, f"❌ Error: {str(e)}" def process_voice_samples(voice_files) -> str: """Process uploaded voice samples for custom voice training""" if not voice_files: return "No voice samples uploaded" return cover_generator.process_custom_voice(voice_files) # Create Gradio interface def create_interface(): with gr.Blocks( title="🎵 AI Cover Song Platform", # Removed theme=gr.themes.Soft for compatibility with Gradio versions < 4.0.0 (as per requirements.txt change) css=""" .gradio-container { font-family: 'Inter', sans-serif; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); } .main-header { text-align: center; padding: 2rem; background: rgba(255, 255, 255, 0.1); backdrop-filter: blur(10px); border-radius: 20px; margin: 1rem; } .step-container { background: rgba(255, 255, 255, 0.05); backdrop-filter: blur(10px); border-radius: 15px; padding: 1.5rem; margin: 1rem 0; border: 1px solid rgba(255, 255, 255, 0.1); } """ ) as app: # Header with gr.Row(): gr.Markdown("""
Transform any song with AI voice synthesis
This platform is for educational and demonstration purposes only. Voice cloning technology should be used responsibly. Always obtain proper consent before cloning someone's voice. Do not use this tool to create misleading or harmful content. Respect copyright laws and artist rights.