Spaces: Build error
Update app.py
app.py CHANGED
@@ -30,17 +30,18 @@ except ImportError:
    print("SVC not available, using basic voice conversion")

class AICoverGenerator:
-    def __init__(self):
+    def \
+        __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.temp_dir = tempfile.mkdtemp()
        self.voice_models = {
            "drake": "Drake Style Voice",
-            "ariana": "Ariana Style Voice",
+            "ariana": "Ariana Style Voice",
            "weeknd": "The Weeknd Style Voice",
            "taylor": "Taylor Swift Style Voice",
            "custom": "Custom Voice Model"
        }
-
+
        # Initialize audio separation model
        if DEMUCS_AVAILABLE:
            try:
@@ -51,24 +52,24 @@ class AICoverGenerator:
                self.separation_model = None
        else:
            self.separation_model = None
-
+
    def separate_vocals(self, audio_path: str) -> Tuple[str, str]:
        """Separate vocals and instrumentals from audio"""
        try:
            # Load audio
            audio, sr = librosa.load(audio_path, sr=44100, mono=False)
-
+
            if self.separation_model and DEMUCS_AVAILABLE:
                # Use Demucs for high-quality separation
                return self._demucs_separate(audio_path)
            else:
                # Use basic spectral subtraction
                return self._basic_separate(audio, sr)
-
+
        except Exception as e:
            print(f"Error in vocal separation: {e}")
            return None, None
-
+
    def _demucs_separate(self, audio_path: str) -> Tuple[str, str]:
        """Use Demucs for audio separation"""
        try:
@@ -76,220 +77,223 @@ class AICoverGenerator:
            audio, sr = librosa.load(audio_path, sr=44100, mono=False)
            if audio.ndim == 1:
                audio = np.stack([audio, audio])
-
+
            # Convert to tensor
            audio_tensor = torch.from_numpy(audio).float().unsqueeze(0).to(self.device)
-
+
            # Apply separation
            with torch.no_grad():
                sources = apply_model(self.separation_model, audio_tensor)
-
+
            # Extract vocals and instrumental
            vocals = sources[0, 3].cpu().numpy()  # vocals channel
            instrumental = sources[0, 0].cpu().numpy()  # drums + bass + other
-
+
            # Save separated audio
            vocals_path = os.path.join(self.temp_dir, "vocals.wav")
            instrumental_path = os.path.join(self.temp_dir, "instrumental.wav")
-
+
            sf.write(vocals_path, vocals.T, 44100)
            sf.write(instrumental_path, instrumental.T, 44100)
-
+
            return vocals_path, instrumental_path
-
+
        except Exception as e:
            print(f"Demucs separation error: {e}")
            return self._basic_separate(audio, 44100)
-
+
    def _basic_separate(self, audio: np.ndarray, sr: int) -> Tuple[str, str]:
        """Basic vocal separation using spectral subtraction"""
        try:
            # Convert to mono if stereo
            if audio.ndim > 1:
                audio = librosa.to_mono(audio)
-
+
            # Compute STFT
            stft = librosa.stft(audio, n_fft=2048, hop_length=512)
            magnitude, phase = np.abs(stft), np.angle(stft)
-
+
            # Simple vocal isolation (center channel extraction)
            # This is a basic approach - real implementation would be more sophisticated
            vocal_mask = np.ones_like(magnitude)
            vocal_mask[:, :magnitude.shape[1]//4] *= 0.3  # Reduce low frequencies
            vocal_mask[:, 3*magnitude.shape[1]//4:] *= 0.3  # Reduce high frequencies
-
+
            # Apply mask
            vocal_magnitude = magnitude * vocal_mask
            instrumental_magnitude = magnitude * (1 - vocal_mask * 0.7)
-
+
            # Reconstruct audio
            vocal_stft = vocal_magnitude * np.exp(1j * phase)
            instrumental_stft = instrumental_magnitude * np.exp(1j * phase)
-
+
            vocals = librosa.istft(vocal_stft, hop_length=512)
            instrumental = librosa.istft(instrumental_stft, hop_length=512)
-
+
            # Save files
            vocals_path = os.path.join(self.temp_dir, "vocals.wav")
            instrumental_path = os.path.join(self.temp_dir, "instrumental.wav")
-
+
            sf.write(vocals_path, vocals, sr)
            sf.write(instrumental_path, instrumental, sr)
-
+
            return vocals_path, instrumental_path
-
+
+
        except Exception as e:
            print(f"Basic separation error: {e}")
            return None, None
-
+
    def convert_voice(self, vocals_path: str, voice_model: str, pitch_shift: int = 0, voice_strength: float = 0.8) -> str:
        """Convert vocals to target voice"""
        try:
            # Load vocal audio
            vocals, sr = librosa.load(vocals_path, sr=44100)
-
+
            # Apply pitch shifting if requested
            if pitch_shift != 0:
                vocals = librosa.effects.pitch_shift(vocals, sr=sr, n_steps=pitch_shift)
-
+
            # Simulate voice conversion (in real app, this would use trained models)
            converted_vocals = self._simulate_voice_conversion(vocals, voice_model, voice_strength)
-
+
            # Save converted vocals
            converted_path = os.path.join(self.temp_dir, "converted_vocals.wav")
            sf.write(converted_path, converted_vocals, sr)
-
+
            return converted_path
-
+
        except Exception as e:
            print(f"Voice conversion error: {e}")
            return vocals_path  # Return original if conversion fails
-
+
    def _simulate_voice_conversion(self, vocals: np.ndarray, voice_model: str, strength: float) -> np.ndarray:
-        """Simulate voice conversion (placeholder for actual model inference)"""
+        """Simulate voice conversion \
+        (placeholder for actual model inference)"""
        # This is a simplified simulation - real implementation would use trained models
-
+
        # Apply different effects based on voice model
        if voice_model == "drake":
            # Simulate Drake's voice characteristics
-            vocals = self._apply_voice_characteristics(vocals,
-
-
-
+            vocals = self._apply_voice_characteristics(vocals,
+                pitch_factor=0.85,
+                formant_shift=-0.1,
+                roughness=0.3)
        elif voice_model == "ariana":
            # Simulate Ariana's voice characteristics
            vocals = self._apply_voice_characteristics(vocals,
-
-
-
+                pitch_factor=1.2,
+                formant_shift=0.2,
+                breathiness=0.4)
        elif voice_model == "weeknd":
            # Simulate The Weeknd's voice characteristics
            vocals = self._apply_voice_characteristics(vocals,
-
-
-
+                pitch_factor=0.9,
+                formant_shift=-0.05,
+                reverb=0.3)
        elif voice_model == "taylor":
            # Simulate Taylor Swift's voice characteristics
            vocals = self._apply_voice_characteristics(vocals,
-
-
-
-
+                pitch_factor=1.1,
+                formant_shift=0.1,
+                clarity=0.8)
+
        # Blend with original based on strength
        return vocals * strength + vocals * (1 - strength) * 0.3
-
+
    def _apply_voice_characteristics(self, vocals: np.ndarray, **kwargs) -> np.ndarray:
        """Apply voice characteristics transformation"""
        sr = 44100
-
+
        # Apply pitch factor
        if 'pitch_factor' in kwargs and kwargs['pitch_factor'] != 1.0:
-            vocals = librosa.effects.pitch_shift(vocals, sr=sr,
-
-
+            vocals = librosa.effects.pitch_shift(vocals, sr=sr,
+                n_steps=12 * np.log2(kwargs['pitch_factor']))
+
        # Apply formant shifting (simplified)
        if 'formant_shift' in kwargs:
            # This is a simplified formant shift - real implementation would be more complex
            stft = librosa.stft(vocals)
            magnitude = np.abs(stft)
            phase = np.angle(stft)
-
+
            # Shift formants by stretching frequency axis
            shift_factor = 1 + kwargs['formant_shift']
            shifted_magnitude = np.zeros_like(magnitude)
-
+
            for i in range(magnitude.shape[0]):
                shifted_idx = int(i * shift_factor)
                if shifted_idx < magnitude.shape[0]:
                    shifted_magnitude[shifted_idx] = magnitude[i]
-
+
            shifted_stft = shifted_magnitude * np.exp(1j * phase)
            vocals = librosa.istft(shifted_stft)
-
+
        # Apply effects
        if 'roughness' in kwargs:
            # Add slight distortion for roughness
            vocals = np.tanh(vocals * (1 + kwargs['roughness']))
-
+
        if 'breathiness' in kwargs:
            # Add noise for breathiness
            noise = np.random.normal(0, 0.01, vocals.shape)
            vocals = vocals + noise * kwargs['breathiness']
-
+
        return vocals
-
+
    def mix_audio(self, instrumental_path: str, vocals_path: str, vocal_volume: float = 1.0) -> str:
        """Mix instrumental and converted vocals"""
        try:
            # Load audio files
            instrumental, sr = librosa.load(instrumental_path, sr=44100)
            vocals, _ = librosa.load(vocals_path, sr=44100)
-
+
            # Ensure same length
            min_len = min(len(instrumental), len(vocals))
            instrumental = instrumental[:min_len]
            vocals = vocals[:min_len]
-
+
            # Mix audio
            mixed = instrumental + vocals * vocal_volume
-
+
            # Normalize to prevent clipping
            max_amplitude = np.max(np.abs(mixed))
            if max_amplitude > 0.95:
                mixed = mixed / max_amplitude * 0.95
-
+
            # Save mixed audio
            output_path = os.path.join(self.temp_dir, "final_cover.wav")
            sf.write(output_path, mixed, sr)
-
+
            return output_path
-
+
        except Exception as e:
            print(f"Audio mixing error: {e}")
            return None
-
+
    def process_custom_voice(self, voice_samples: list) -> str:
        """Process custom voice samples for training"""
        if not voice_samples:
            return "No voice samples provided"
-
+
        try:
            # In a real implementation, this would train a voice model
            # For demo, we'll just validate the samples
            total_duration = 0
+
            for sample in voice_samples:
                if sample is not None:
                    audio, sr = librosa.load(sample, sr=44100)
                    duration = len(audio) / sr
                    total_duration += duration
-
+
            if total_duration < 30:
                return "Need at least 30 seconds of voice samples"
            elif total_duration > 300:
                return "Voice samples too long (max 5 minutes)"
            else:
-                return f"Custom voice model ready
+                return f"Custom voice model ready!\n({total_duration:.1f}s of training data)"
-
+
        except Exception as e:
            return f"Error processing voice samples: {e}"

@@ -304,48 +308,50 @@ def generate_cover(
    auto_tune: bool = False,
    output_format: str = "wav"
) -> Tuple[Optional[str], str]:
-    """Main function to generate AI cover"""
-
+    """Main \
+    function to generate AI cover"""
+
    if audio_file is None:
        return None, "Please upload an audio file"
-
+
    try:
        # Step 1: Separate vocals and instrumentals
        yield None, "🎵 Separating vocals and instrumentals..."
        vocals_path, instrumental_path = cover_generator.separate_vocals(audio_file.name)
-
+
        if vocals_path is None:
            return None, "❌ Failed to separate vocals"
-
+
        # Step 2: Convert vocals to target voice
        yield None, f"🎤 Converting vocals to {voice_model} style..."
        converted_vocals_path = cover_generator.convert_voice(
-            vocals_path,
-            voice_model,
-            pitch_shift,
+            vocals_path,
+            voice_model,
+            pitch_shift,
            voice_strength / 100
        )
-
+
        # Step 3: Apply auto-tune if requested
        if auto_tune:
            yield None, "🎼 Applying auto-tune..."
            # Auto-tune implementation would go here
            pass
-
+
        # Step 4: Mix final audio
        yield None, "🎧 Mixing final audio..."
        final_path = cover_generator.mix_audio(instrumental_path, converted_vocals_path)
-
+
        if final_path is None:
            return None, "❌ Failed to mix audio"
-
-        # Convert to requested format if needed
+
+        # Convert to requested \
+        format if needed
        if output_format != "wav":
            yield None, f"💾 Converting to {output_format.upper()}..."
            # Format conversion would go here
-
+
        return final_path, "✅ AI Cover generated successfully!"
-
+
    except Exception as e:
        return None, f"❌ Error: {str(e)}"

@@ -353,18 +359,14 @@ def process_voice_samples(voice_files) -> str:
    """Process uploaded voice samples for custom voice training"""
    if not voice_files:
        return "No voice samples uploaded"
-
+
    return cover_generator.process_custom_voice(voice_files)

# Create Gradio interface
def create_interface():
    with gr.Blocks(
        title="🎵 AI Cover Song Platform",
-        theme=gr.themes.Soft(
-            primary_hue="indigo",
-            secondary_hue="purple",
-            neutral_hue="slate"
-        ),
+        # Removed theme=gr.themes.Soft for compatibility with Gradio versions < 4.0.0
        css="""
        .gradio-container {
            font-family: 'Inter', sans-serif;
@@ -388,7 +390,7 @@ def create_interface():
        }
        """
    ) as app:
-
+
        # Header
        with gr.Row():
            gr.Markdown("""
@@ -402,7 +404,7 @@ def create_interface():
                </div>
            </div>
            """)
-
+
        # Step 1: Upload Audio
        with gr.Row():
            with gr.Column():
@@ -413,7 +415,7 @@ def create_interface():
                    format="wav"
                )
                gr.Markdown("*Supports MP3, WAV, FLAC files*")
-
+
        # Step 2: Voice Selection
        with gr.Row():
            with gr.Column():
@@ -424,7 +426,7 @@ def create_interface():
                    value="Drake Style Voice",
                    interactive=True
                )
-
+
                # Custom voice training section
                with gr.Accordion("🎙️ Train Custom Voice (Optional)", open=False):
                    voice_samples = gr.File(
@@ -434,18 +436,18 @@ def create_interface():
                    )
                    train_btn = gr.Button("Train Custom Voice", variant="secondary")
                    training_status = gr.Textbox(label="Training Status", interactive=False)
-
+
                    train_btn.click(
                        process_voice_samples,
                        inputs=[voice_samples],
                        outputs=[training_status]
                    )
-
+
        # Step 3: Audio Settings
        with gr.Row():
            with gr.Column():
                gr.Markdown("## ⚙️ Step 3: Audio Settings")
-
+
                with gr.Row():
                    pitch_shift = gr.Slider(
                        minimum=-12,
@@ -461,7 +463,7 @@ def create_interface():
                        step=5,
                        label="Voice Strength (%)"
                    )
-
+
                with gr.Row():
                    auto_tune = gr.Checkbox(label="Apply Auto-tune", value=False)
                    output_format = gr.Dropdown(
@@ -469,7 +471,7 @@ def create_interface():
                        label="Output Format",
                        value="wav"
                    )
-
+
        # Step 4: Generate Cover
        with gr.Row():
            with gr.Column():
@@ -479,33 +481,35 @@ def create_interface():
                    variant="primary",
                    size="lg"
                )
-
+
                progress_text = gr.Textbox(
                    label="Progress",
                    value="Ready to generate cover...",
                    interactive=False
                )
-
+
        # Results
        with gr.Row():
            with gr.Column():
                gr.Markdown("## 🎉 Results")
-
+
                with gr.Row():
                    original_audio = gr.Audio(label="Original Song", interactive=False)
                    cover_audio = gr.Audio(label="AI Cover", interactive=False)
-
+
        # Legal Notice
        with gr.Row():
            gr.Markdown("""
-            <div style="background: rgba(255, 193, 7, 0.1); border: 1px solid rgba(255, 193, 7, 0.3); border-radius: 10px; padding: 1rem; margin: 1rem 0;">
+            <div style="background: rgba(255, 193, 7, 0.1);
+            border: 1px solid rgba(255, 193, 7, 0.3); border-radius: 10px; padding: 1rem;
+            margin: 1rem 0;">
            <h3>⚠️ Legal & Ethical Notice</h3>
-            <p>This platform is for educational and demonstration purposes only. Voice cloning technology should be used responsibly.
-            Always obtain proper consent before cloning someone's voice. Do not use this tool to create misleading or harmful content.
+            <p>This platform is for educational and demonstration purposes only. Voice cloning technology should be used responsibly.
+            Always obtain proper consent before cloning someone's voice. Do not use this tool to create misleading or harmful content.
            Respect copyright laws and artist rights.</p>
            </div>
            """)
-
+
        # Event handlers
        generate_btn.click(
            generate_cover,
@@ -519,14 +523,14 @@ def create_interface():
            ],
            outputs=[cover_audio, progress_text]
        )
-
+
        # Update original audio when file is uploaded
        audio_input.change(
            lambda x: x,
            inputs=[audio_input],
            outputs=[original_audio]
        )
-
+
        return app

# Launch the app
@@ -537,4 +541,4 @@ if __name__ == "__main__":
        server_port=7860,
        share=True,
        show_error=True
-    )
+    )
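
The `_apply_voice_characteristics` helper in the diff converts a multiplicative `pitch_factor` into semitones for `librosa.effects.pitch_shift` via `n_steps = 12 * log2(pitch_factor)`. A quick check of the factors used in the diff; the values below are computed here for illustration, not printed by the app:

import numpy as np

# n_steps = 12 * log2(pitch_factor): semitone offsets for the factors in the diff.
for factor in (0.85, 0.9, 1.1, 1.2):
    print(f"pitch_factor={factor:<4} -> {12 * np.log2(factor):+.2f} semitones")
# pitch_factor=0.85 -> -2.81 semitones
# pitch_factor=0.9  -> -1.82 semitones
# pitch_factor=1.1  -> +1.65 semitones
# pitch_factor=1.2  -> +3.16 semitones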