"""Gradio app: transcribe piano audio to MIDI with Bytedance's model.

An uploaded recording is transcribed to a MIDI file, which is then
rendered back to FLAC via FluidSynth so the result can be auditioned
directly in the browser.
"""
import os
from pathlib import Path

import gradio as gr
import librosa
import spaces
from midi2audio import FluidSynth

from pytorch.inference import PianoTranscription
from utils import config

# from synthviz import create_video  # TODO enable video rendering

# All generated MIDI/FLAC files go here (keeps the CWD clean and makes
# cleanup easy); created once at startup.
RESULTS_DIR = Path("results")
RESULTS_DIR.mkdir(exist_ok=True)

# Initialize the transcription model once at import time — loading the
# checkpoint is expensive, so it must not happen per request.
transcriptor = PianoTranscription("Note_pedal")

# FluidSynth renders the transcribed MIDI back to audio for preview.
soundfont_path = "soundfont/MuseScore_General.sf3"
fs = FluidSynth(soundfont_path)


@spaces.GPU
def transcribe_and_visualize(audio_file):
    """Transcribe a piano recording to MIDI and render a FLAC preview.

    Args:
        audio_file: Filesystem path of the uploaded audio file (Gradio
            ``type="filepath"`` input).

    Returns:
        Tuple ``(flac_path, midi_path)`` matching the two Gradio outputs.
    """
    # Derive output names from the upload's basename, placed in RESULTS_DIR
    # rather than the CWD.
    base_name = Path(audio_file).stem
    midi_path = str(RESULTS_DIR / f"{base_name}_transcription.mid")
    flac_path = str(RESULTS_DIR / f"{base_name}_transcription.flac")
    # video_path = str(RESULTS_DIR / f"{base_name}_output.mp4")  # TODO enable video rendering

    # Load at the model's expected sample rate. librosa.core.load is
    # deprecated; the top-level librosa.load is the supported entry point.
    audio, _ = librosa.load(audio_file, sr=config.sample_rate)
    transcriptor.transcribe(audio, midi_path)

    # create_video(input_midi=midi_path, video_filename=video_path)  # TODO enable video rendering

    # Render the MIDI through the soundfont so users can listen in-browser.
    fs.midi_to_audio(midi_path, flac_path)

    return flac_path, midi_path


# Build the Gradio interface: one audio input, a playable rendering plus
# the downloadable MIDI file as outputs.
iface = gr.Interface(
    fn=transcribe_and_visualize,
    inputs=gr.Audio(type="filepath", label="Upload Piano Audio"),
    # outputs=gr.Video(label="Transcription Visualization"),  # TODO enable video rendering
    outputs=[gr.Audio(label="MIDI transcription"), gr.File(label="MIDI file")],
    title="MOZART - AI Piano Transcriber",
    description="Gradio-based piano transcriber, using Bytedance's Piano Transcription AI model. Upload a piano audio file to transcribe it into a MIDI file. Open in a piano roll app like Synthesia to see the magic.",
)

# Launch the interface
iface.launch()