# Prediction interface for Cog ⚙️
# Reference: https://github.com/replicate/cog/blob/main/docs/python.md

import os
from pathlib import Path

import cog
import librosa

# model repo: https://github.com/bytedance/piano_transcription
# package repo: https://github.com/qiuqiangkong/piano_transcription_inference
from piano_transcription_inference import PianoTranscription, sample_rate
from synthviz import create_video


# adapted from example:
# https://github.com/minzwon/sota-music-tagging-models/blob/master/predict.py
class Predictor(cog.Predictor):
    """Cog predictor that transcribes an audio file to MIDI and renders a video of it."""

    # Transcription model wrapper; assigned in setup().
    transcriptor: PianoTranscription

    def setup(self):
        """Build the transcription model from the local checkpoint file."""
        self.transcriptor = PianoTranscription(
            device="cuda",
            checkpoint_path="./model.pth",
        )

    @cog.input("audio_input", type=Path, help="Input audio file")
    def predict(self, audio_input):
        """Transcribe ``audio_input`` and return the path of the rendered video.

        The audio is loaded at the package's ``sample_rate``, transcribed to
        an intermediate MIDI file, and that MIDI file is passed to
        ``create_video``, which is given ``output.mp4`` in the current working
        directory as its output filename.

        Args:
            audio_input: Path to the input audio file.

        Returns:
            A ``Path`` pointing at the generated video file.
        """
        # Both artifacts use fixed names; the MIDI path is relative to the
        # working directory, the video path is made absolute.
        midi_path = "transcription.mid"
        video_filename = str(Path.cwd() / "output.mp4")

        # librosa returns (samples, sample_rate); only the samples are needed.
        waveform = librosa.core.load(str(audio_input), sr=sample_rate)[0]

        # Transcribe audio into the intermediate MIDI file.
        self.transcriptor.transcribe(waveform, midi_path)

        # 'Visualization' output option: render the MIDI file to video.
        create_video(input_midi=midi_path, video_filename=video_filename)

        print(
            f"Created video of size {os.path.getsize(video_filename)} bytes at path {video_filename}"
        )

        # Return path to video
        return Path(video_filename)