Spaces:

declare-lab
/

JAM

Running on Zero

App Files Files Community

renhang committed on 16 days ago

Commit

bc3ffb2

1 Parent(s): 072d1f9

update space

Browse files

Files changed (3) hide show

app.py +181 -20
jam_infer.yaml +2 -1
utils.py +184 -0

app.py CHANGED Viewed

@@ -6,9 +6,39 @@ import tempfile
 import requests
 import subprocess
 from pathlib import Path
 from model import Jamify
-from utils import json_to_text, text_to_json
 def download_resources():
     """Download examples data from GitHub repository if not already present"""
@@ -36,10 +66,17 @@ print("Jamify model ready.")
 gr.set_static_paths(paths=[Path.cwd().absolute()])
 @spaces.GPU(duration=100)
-def generate_song(reference_audio, lyrics_text, style_prompt, duration):
     # We need to save the uploaded files to temporary paths to pass to the model
     reference_audio = reference_audio not in ("", None) and reference_audio or None
     # Convert text format to JSON and save to temporary file
     lyrics_json = text_to_json(lyrics_text)
@@ -77,6 +114,7 @@ def load_examples():
             audio_path = example.get('audio_path', '')
             lrc_path = example.get('lrc_path', '')
             duration = example.get('duration', 120)
             # Load lyrics and convert to text format (pre-computed/cached)
             lyrics_text = ""
@@ -93,26 +131,38 @@ def load_examples():
                 'id': example_id,
                 'audio_path': audio_path if os.path.exists(audio_path) else None,
                 'lyrics_text': lyrics_text,
-                'duration': duration
             })
     print(f"Loaded {len(examples)} cached examples")
     return examples
-def load_example(example_idx, examples):
     """Load a specific example and return its data"""
     if 0 <= example_idx < len(examples):
         example = examples[example_idx]
         return (
             example['audio_path'],
-            example['lyrics_text'],
-            example['duration']
         )
-    return None, "", 120
 def clear_form():
     """Clear all form inputs to allow user to create their own song"""
-    return None, "", 120  # audio, lyrics, duration
 def update_button_styles(selected_idx, total_examples):
     """Update button styles to highlight the selected example"""
@@ -138,6 +188,10 @@ examples = load_examples()
 default_audio = examples[0]['audio_path'] if examples else None
 default_lyrics = examples[0]['lyrics_text'] if examples else ""
 default_duration = examples[0]['duration'] if examples else 120
 # Gradio interface
 with gr.Blocks() as demo:
@@ -147,6 +201,10 @@ with gr.Blocks() as demo:
     # State to track selected example (-1 means "Make Your Own" is selected, 0 is first example)
     selected_example = gr.State(0 if examples else -1)
     # Sample buttons section
     if examples:
         gr.Markdown("### Sample Examples")
@@ -178,6 +236,26 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             gr.Markdown("### Inputs")
             lyrics_text = gr.Textbox(
                 label="Lyrics",
                 lines=10,
@@ -186,48 +264,131 @@ with gr.Blocks() as demo:
             )
             duration_slider = gr.Slider(minimum=120, maximum=230, value=default_duration, step=1, label="Duration (seconds)")
             with gr.Tab("Style from Audio"):
                 reference_audio = gr.File(label="Reference Audio (.mp3, .wav)", type="filepath", value=default_audio)
-            with gr.Tab("Style from Text"):
-                style_prompt = gr.Textbox(label="Style Prompt", lines=3, placeholder="e.g., A high-energy electronic dance track with a strong bassline and euphoric synths.")
             generate_button = gr.Button("Generate Song", variant="primary")
-        with gr.Column():
             gr.Markdown("### Output")
             output_audio = gr.Audio(label="Generated Song")
     generate_button.click(
         fn=generate_song,
-        inputs=[reference_audio, lyrics_text, style_prompt, duration_slider],
         outputs=output_audio,
         api_name="generate_song"
     )
     # Connect example buttons to load data and update selection
     if examples:
-        def load_example_and_update_selection(idx):
             """Load example data and update button selection state"""
-            audio, lyrics, duration = load_example(idx, examples)
             button_updates = update_button_styles(idx, len(examples))
-            return [audio, lyrics, duration, idx] + button_updates
         def clear_form_and_update_selection():
             """Clear form and update button selection state"""
-            audio, lyrics, duration = clear_form()
             button_updates = update_button_styles(-1, len(examples))
-            return [audio, lyrics, duration, -1] + button_updates
         for i, button in enumerate(example_buttons):
             button.click(
-                fn=lambda idx=i: load_example_and_update_selection(idx),
-                outputs=[reference_audio, lyrics_text, duration_slider, selected_example] + example_buttons + [make_your_own_button]
             )
         # Connect "Make Your Own" button to clear form and update selection
         make_your_own_button.click(
             fn=clear_form_and_update_selection,
-            outputs=[reference_audio, lyrics_text, duration_slider, selected_example] + example_buttons + [make_your_own_button]
         )
 # Create necessary temporary directories for Gradio

 import requests
 import subprocess
 from pathlib import Path
+import torchaudio
 from model import Jamify
+from utils import json_to_text, text_to_json, convert_text_time_to_beats, convert_text_beats_to_time, convert_text_beats_to_time_with_regrouping, text_to_words, beats_to_text_with_regrouping, round_to_quarter_beats
def crop_audio_to_30_seconds(audio_path):
    """Crop audio to its first 30 seconds and save it to a temporary WAV file.

    Args:
        audio_path: Path of the source audio file. Falsy values and paths
            that do not exist are treated as "no audio".

    Returns:
        Path of a newly created temporary ``.wav`` file holding at most the
        first 30 seconds of the input, or ``None`` when there is no input or
        when loading/saving fails. The file is created with ``delete=False``,
        so the caller is responsible for removing it.
    """
    if not audio_path or not os.path.exists(audio_path):
        return None
    temp_path = None
    try:
        waveform, sample_rate = torchaudio.load(audio_path)
        # Slicing past the end is a no-op, so clips shorter than 30 s are
        # kept whole without a separate length check.
        cropped_waveform = waveform[:, :sample_rate * 30]
        # Only reserve a unique file name here; the handle is closed before
        # torchaudio writes to the path.
        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as temp_file:
            temp_path = temp_file.name
        torchaudio.save(temp_path, cropped_waveform, sample_rate)
        return temp_path
    except Exception as e:
        print(f"Error processing audio: {e}")
        # The temp file was created with delete=False; remove it so a failed
        # save does not leave an empty/partial file behind.
        if temp_path is not None:
            try:
                os.remove(temp_path)
            except OSError:
                pass
        return None
 def download_resources():
     """Download examples data from GitHub repository if not already present"""
 gr.set_static_paths(paths=[Path.cwd().absolute()])
 @spaces.GPU(duration=100)
+def generate_song(reference_audio, lyrics_text, duration, mode="time", bpm=120, style_prompt=None):
     # We need to save the uploaded files to temporary paths to pass to the model
     reference_audio = reference_audio not in ("", None) and reference_audio or None
+    # Convert beats to time format if in beats mode
+    if mode == "beats" and lyrics_text:
+        try:
+            lyrics_text = convert_text_beats_to_time(lyrics_text, bpm)
+        except Exception as e:
+            print(f"Error converting beats to time: {e}")
     # Convert text format to JSON and save to temporary file
     lyrics_json = text_to_json(lyrics_text)
             audio_path = example.get('audio_path', '')
             lrc_path = example.get('lrc_path', '')
             duration = example.get('duration', 120)
+            bpm = example.get('bpm', 120.0)  # Read BPM from input.json, default to 120
             # Load lyrics and convert to text format (pre-computed/cached)
             lyrics_text = ""
                 'id': example_id,
                 'audio_path': audio_path if os.path.exists(audio_path) else None,
                 'lyrics_text': lyrics_text,
+                'duration': duration,
+                'bpm': bpm
             })
     print(f"Loaded {len(examples)} cached examples")
     return examples
def load_example(example_idx, examples, mode="time"):
    """Fetch one cached example as ``(audio_path, lyrics_text, duration, bpm)``.

    Args:
        example_idx: Index into ``examples``.
        examples: List of example dicts with ``audio_path``, ``lyrics_text``
            and ``duration`` keys, plus an optional ``bpm`` key.
        mode: ``"time"`` returns the stored lyrics unchanged; ``"beats"``
            converts non-empty lyrics to the beats format using the
            example's BPM.

    Returns:
        The example's data, or ``(None, "", 120, 120.0)`` when the index is
        out of range. If the beats conversion raises, the error is printed
        and the stored (time-based) lyrics are returned instead.
    """
    # Guard clause: anything outside [0, len) yields the blank defaults.
    if not 0 <= example_idx < len(examples):
        return None, "", 120, 120.0

    entry = examples[example_idx]
    lyrics = entry['lyrics_text']
    tempo = entry.get('bpm', 120.0)  # examples without a BPM fall back to 120

    if mode == "beats" and lyrics:
        try:
            lyrics = beats_to_text_with_regrouping(lyrics, tempo, round_to_quarters=True)
        except Exception as e:
            # Best effort: keep the original lyrics when conversion fails.
            print(f"Error converting to beats format: {e}")

    return entry['audio_path'], lyrics, entry['duration'], tempo
 def clear_form():
     """Clear all form inputs to allow user to create their own song"""
+    return None, "", 120, 120.0  # audio, lyrics, duration, bpm
 def update_button_styles(selected_idx, total_examples):
     """Update button styles to highlight the selected example"""
 default_audio = examples[0]['audio_path'] if examples else None
 default_lyrics = examples[0]['lyrics_text'] if examples else ""
 default_duration = examples[0]['duration'] if examples else 120
+default_bpm = examples[0]['bpm'] if examples else 120.0
+# Create cropped version of default audio for display
+default_audio_display = crop_audio_to_30_seconds(default_audio) if default_audio else None
 # Gradio interface
 with gr.Blocks() as demo:
     # State to track selected example (-1 means "Make Your Own" is selected, 0 is first example)
     selected_example = gr.State(0 if examples else -1)
+    # States for mode and BPM
+    input_mode = gr.State("time")
+    current_bpm = gr.State(default_bpm)
     # Sample buttons section
     if examples:
         gr.Markdown("### Sample Examples")
     with gr.Row():
         with gr.Column():
             gr.Markdown("### Inputs")
+            # Mode switcher
+            mode_radio = gr.Radio(
+                choices=["Time Mode", "Beats Mode"],
+                value="Time Mode",
+                label="Input Format",
+                info="Choose how to specify timing: seconds or musical beats"
+            )
+            # BPM input (initially hidden)
+            bpm_input = gr.Number(
+                label="BPM (Beats Per Minute)",
+                value=default_bpm,
+                minimum=60,
+                maximum=200,
+                step=1,
+                visible=False,
+                info="Tempo for converting beats to time"
+            )
             lyrics_text = gr.Textbox(
                 label="Lyrics",
                 lines=10,
             )
             duration_slider = gr.Slider(minimum=120, maximum=230, value=default_duration, step=1, label="Duration (seconds)")
+        with gr.Column():
+            gr.Markdown("### Style & Generation")
             with gr.Tab("Style from Audio"):
                 reference_audio = gr.File(label="Reference Audio (.mp3, .wav)", type="filepath", value=default_audio)
+                reference_audio_display = gr.Audio(
+                    label="Reference Audio Preview (First 30 seconds)",
+                    value=default_audio_display,
+                    visible=default_audio_display is not None
+                )
             generate_button = gr.Button("Generate Song", variant="primary")
             gr.Markdown("### Output")
             output_audio = gr.Audio(label="Generated Song")
+    # Mode switching functions
+    def switch_mode(mode_choice, current_lyrics, current_bpm_val):
+        """Handle switching between time and beats mode"""
+        mode = "beats" if mode_choice == "Beats Mode" else "time"
+        # Update BPM input visibility
+        bpm_visible = (mode == "beats")
+        # Update lyrics placeholder and convert existing text
+        if mode == "time":
+            placeholder = "Enter lyrics with timestamps: word[start_time:end_time] word[start_time:end_time]...\n\nExample: Hello[0.0:1.2] world[1.5:2.8] this[3.0:3.8] is[4.2:4.6] my[5.0:5.8] song[6.2:7.0]\n\nFormat: Each word followed by [start_seconds:end_seconds] in brackets\nTimestamps should be in seconds with up to 2 decimal places"
+            label = "Lyrics"
+            # Convert from beats to time if there's content
+            converted_lyrics = current_lyrics
+            if current_lyrics.strip():
+                try:
+                    converted_lyrics = convert_text_beats_to_time_with_regrouping(current_lyrics, current_bpm_val)
+                except Exception as e:
+                    print(f"Error converting beats to time: {e}")
+        else:
+            placeholder = "Enter lyrics with beat timestamps: word[start_beat:end_beat] word[start_beat:end_beat]...\n\nExample: Hello[0:1] world[1.5:2.75] this[3:3.75] is[4.25:4.5] my[5:5.75] song[6.25:7]\n\nFormat: Each word followed by [start_beat:end_beat] in brackets\nBeats are in quarter notes (1 beat = quarter note, 0.25 = sixteenth note)"
+            label = "Lyrics (Beats Format)"
+            # Convert from time to beats if there's content
+            converted_lyrics = current_lyrics
+            if current_lyrics.strip():
+                try:
+                    converted_lyrics = beats_to_text_with_regrouping(current_lyrics, current_bpm_val, round_to_quarters=True)
+                except Exception as e:
+                    print(f"Error converting time to beats: {e}")
+        return (
+            gr.update(visible=bpm_visible),  # bpm_input visibility
+            gr.update(placeholder=placeholder, label=label, value=converted_lyrics),  # lyrics_text
+            mode  # input_mode state
+        )
+    def update_bpm_state(bpm_val):
+        """Update the BPM state"""
+        return bpm_val
+    def update_reference_audio_display(audio_file):
+        """Process and display the cropped reference audio"""
+        if audio_file is None:
+            return gr.update(visible=False, value=None)
+        cropped_path = crop_audio_to_30_seconds(audio_file)
+        if cropped_path:
+            return gr.update(visible=True, value=cropped_path)
+        else:
+            return gr.update(visible=False, value=None)
+    # Connect mode switching
+    mode_radio.change(
+        fn=switch_mode,
+        inputs=[mode_radio, lyrics_text, current_bpm],
+        outputs=[bpm_input, lyrics_text, input_mode]
+    )
+    # Connect BPM changes
+    bpm_input.change(
+        fn=update_bpm_state,
+        inputs=[bpm_input],
+        outputs=[current_bpm]
+    )
+    # Connect reference audio file changes to display
+    reference_audio.change(
+        fn=update_reference_audio_display,
+        inputs=[reference_audio],
+        outputs=[reference_audio_display]
+    )
     generate_button.click(
         fn=generate_song,
+        inputs=[reference_audio, lyrics_text, duration_slider, input_mode, current_bpm],
         outputs=output_audio,
         api_name="generate_song"
     )
     # Connect example buttons to load data and update selection
     if examples:
+        def load_example_and_update_selection(idx, current_mode):
             """Load example data and update button selection state"""
+            mode = "beats" if current_mode == "Beats Mode" else "time"
+            audio, lyrics, duration, bpm = load_example(idx, examples, mode)
             button_updates = update_button_styles(idx, len(examples))
+            audio_display_update = update_reference_audio_display(audio)
+            return [audio, lyrics, duration, bpm, idx, audio_display_update] + button_updates
         def clear_form_and_update_selection():
             """Clear form and update button selection state"""
+            audio, lyrics, duration, bpm = clear_form()
             button_updates = update_button_styles(-1, len(examples))
+            audio_display_update = update_reference_audio_display(audio)
+            return [audio, lyrics, duration, bpm, -1, audio_display_update] + button_updates
         for i, button in enumerate(example_buttons):
             button.click(
+                fn=lambda current_mode, idx=i: load_example_and_update_selection(idx, current_mode),
+                inputs=[mode_radio],
+                outputs=[reference_audio, lyrics_text, duration_slider, current_bpm, selected_example, reference_audio_display] + example_buttons + [make_your_own_button]
             )
         # Connect "Make Your Own" button to clear form and update selection
         make_your_own_button.click(
             fn=clear_form_and_update_selection,
+            outputs=[reference_audio, lyrics_text, duration_slider, current_bpm, selected_example, reference_audio_display] + example_buttons + [make_your_own_button]
         )
 # Create necessary temporary directories for Gradio

jam_infer.yaml CHANGED Viewed

@@ -23,9 +23,10 @@ evaluation:
     cfg_range:
       - 0.05
       - 1
     dual_cfg:
       - 4.7
-      - 2.5
     steps: 50
 model:

     cfg_range:
       - 0.05
       - 1
+    fix_dual_cfg: true
     dual_cfg:
       - 4.7
+      - 2.6
     steps: 50
 model:

utils.py CHANGED Viewed

@@ -141,6 +141,190 @@ def json_to_text(json_data: dict) -> str:
     return '\n\n'.join(segment_lines)
 def text_to_json(text: str) -> dict:
     """
     Convert text format to JSON structure expected by the model.

     return '\n\n'.join(segment_lines)
def round_to_quarter_beats(beat_position: float) -> float:
    """Snap a beat position to the nearest multiple of 0.25 (for sample display).

    Ties are resolved by Python's built-in ``round`` applied to the position
    scaled by four.
    """
    quarter_steps = round(beat_position * 4)
    return quarter_steps / 4
def beats_to_seconds(beat_position: float, bpm: float) -> float:
    """Convert a beat position to a time in seconds.

    Args:
        beat_position: Position measured in beats.
        bpm: Tempo in beats per minute; must be a positive number.

    Returns:
        ``beat_position * 60 / bpm``.

    Raises:
        ValueError: If ``bpm`` is ``None``, zero, negative or NaN. Without
            this check a zero tempo surfaces as a bare ``ZeroDivisionError``
            and a negative tempo silently produces negative timestamps.
    """
    # `not bpm > 0` (rather than `bpm <= 0`) also rejects NaN.
    if bpm is None or not bpm > 0:
        raise ValueError(f"bpm must be a positive number, got {bpm!r}")
    return (beat_position * 60.0) / bpm
def seconds_to_beats(time_seconds: float, bpm: float) -> float:
    """Convert a time in seconds to a beat position.

    Args:
        time_seconds: Time measured in seconds.
        bpm: Tempo in beats per minute; must be a positive number.

    Returns:
        ``time_seconds * bpm / 60``.

    Raises:
        ValueError: If ``bpm`` is ``None``, zero, negative or NaN. A zero
            tempo would otherwise map every timestamp to beat 0 and a
            negative tempo would yield negative beats, silently destroying
            the timing information.
    """
    # `not bpm > 0` (rather than `bpm <= 0`) also rejects NaN.
    if bpm is None or not bpm > 0:
        raise ValueError(f"bpm must be a positive number, got {bpm!r}")
    return (time_seconds * bpm) / 60.0
def convert_text_time_to_beats(text: str, bpm: float, round_to_quarters: bool = False) -> str:
    """
    Rewrite time-stamped lyrics as beat-stamped lyrics on a single line.
    Args:
        text: String in format "word[start_sec:end_sec] ..."
        bpm: Beats per minute for conversion
        round_to_quarters: If True, snap beats to quarter-beat steps (for sample display)
    Returns:
        String in format "word[start_beat:end_beat] ...", words joined by
        single spaces; "" when the input is empty or whitespace only
    """
    if not text.strip():
        return ""

    def _beat_label(seconds: float) -> str:
        # Seconds -> beats, optional quantisation, then at most two decimals
        # with trailing zeros (and a dangling ".") stripped.
        beat = seconds_to_beats(seconds, bpm)
        if round_to_quarters:
            beat = round_to_quarter_beats(beat)
        return f"{beat:.2f}".rstrip('0').rstrip('.')

    return " ".join(
        f"{entry['word']}[{_beat_label(entry['start'])}:{_beat_label(entry['end'])}]"
        for entry in text_to_words(text)
    )
def beats_to_text_with_regrouping(text: str, bpm: float, round_to_quarters: bool = False) -> str:
    """
    Convert time-stamped lyrics to beat-stamped lyrics grouped into segments.
    Args:
        text: String in format "word[start_sec:end_sec] ..."
        bpm: Beats per minute for conversion
        round_to_quarters: If True, snap beats to quarter-beat steps (for sample display)
    Returns:
        Beat-format text with one line per segment, segments separated by a
        blank line; "" when the input is empty or whitespace only
    """
    if not text.strip():
        return ""

    def _as_beat(seconds: float) -> float:
        beat = seconds_to_beats(seconds, bpm)
        return round_to_quarter_beats(beat) if round_to_quarters else beat

    # Re-express every parsed word on the beat grid.
    beat_words = []
    for parsed in text_to_words(text):
        start_beat, end_beat = _as_beat(parsed['start']), _as_beat(parsed['end'])
        beat_words.append({
            'word': parsed['word'],
            'start': start_beat,
            'end': end_beat
        })

    # Same regrouping helper as time mode, but the limits are in beats:
    # 20 beats max per segment, split on a 2-beat gap.
    segments = regroup_words(beat_words, max_len=20, gap=2.0)

    segment_lines = []
    for seg in segments:
        # A word belongs to the segment when it starts inside
        # [seg start, seg end) or when it spans the segment's start.
        # NOTE(review): a word with start == end == seg['end'] (possible
        # after quarter rounding) matches neither test — confirm against
        # regroup_words that such words cannot be dropped.
        members = [
            candidate for candidate in beat_words
            if seg['start'] <= candidate['start'] < seg['end'] or (
                candidate['start'] <= seg['start'] < candidate['end']
            )
        ]
        if members:
            # words_to_text renders the dicts as word[beat:beat]
            segment_lines.append(words_to_text(members))
    return '\n\n'.join(segment_lines)
def convert_text_beats_to_time(text: str, bpm: float) -> str:
    """
    Rewrite beat-stamped lyrics as time-stamped lyrics on a single line.
    Args:
        text: String in format "word[start_beat:end_beat] ..."
        bpm: Beats per minute for conversion
    Returns:
        String in format "word[start_sec:end_sec] ...", words joined by
        single spaces; "" when the input is empty or whitespace only
    """
    if not text.strip():
        return ""

    def _time_label(beat: float) -> str:
        # At most two decimals; trailing zeros and a dangling "." are removed.
        return f"{beats_to_seconds(beat, bpm):.2f}".rstrip('0').rstrip('.')

    # The beats format uses the same word[a:b] pattern as the time format,
    # so the same parser is reused.
    return " ".join(
        f"{entry['word']}[{_time_label(entry['start'])}:{_time_label(entry['end'])}]"
        for entry in text_to_words(text)
    )
def convert_text_beats_to_time_with_regrouping(text: str, bpm: float) -> str:
    """
    Convert beat-stamped lyrics to time-stamped lyrics line by line.
    Args:
        text: String in format "word[start_beat:end_beat] ..." (can be multi-line)
        bpm: Beats per minute for conversion
    Returns:
        String in format "word[start_sec:end_sec] ..." joined with newlines.
        Blank input lines are kept as empty lines; a non-blank line that
        yields no parsed words is omitted. "" when the whole input is empty
        or whitespace only
    """
    if not text.strip():
        return ""

    def _time_label(beat: float) -> str:
        # At most two decimals; trailing zeros and a dangling "." are removed.
        return f"{beats_to_seconds(beat, bpm):.2f}".rstrip('0').rstrip('.')

    converted_lines = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if not stripped:
            # Keep blank lines so the segment breaks survive the conversion.
            converted_lines.append("")
            continue
        rendered = [
            f"{entry['word']}[{_time_label(entry['start'])}:{_time_label(entry['end'])}]"
            for entry in text_to_words(stripped)
        ]
        if rendered:
            converted_lines.append(" ".join(rendered))
    return "\n".join(converted_lines)
 def text_to_json(text: str) -> dict:
     """
     Convert text format to JSON structure expected by the model.