Upload 9 files
- .gitattributes +1 -35
- .gitignore +10 -0
- README.md +157 -13
- app.py +53 -7
- cog.yaml +37 -0
- predict.py +42 -0
- requirements.lock +114 -0
- requirements.txt +9 -0
- runme.sh +42 -0
.gitattributes
CHANGED
@@ -1,35 +1 @@
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+soundfont/MuseScore_General.sf3 filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,10 @@
+*
+!*/
+!*.*
+*.hdf5
+*.pyc
+__pycache__
+results/
+video_frames/
+model.pth
+
README.md
CHANGED
@@ -1,13 +1,157 @@
+
+# Piano transcription
+
+Piano transcription is the task of transcribing piano recordings into MIDI files. This repo is the PyTorch implementation of our proposed high-resolution piano transcription system [1].
+
+<a href="https://replicate.com/replicate/piano-transcription"><img src="https://replicate.com/replicate/piano-transcription/badge"></a>
+
+## Demos
+Here is a demo of our piano transcription system: https://www.youtube.com/watch?v=5U-WL0QvKCg
+
+[Demo and Docker image on Replicate](https://replicate.ai/bytedance/piano-transcription)
+
+## Environments
+This codebase is developed with Python 3.7 and PyTorch 1.4.0 (it should work with other versions, but has not been fully tested).
+
+Install dependencies:
+```
+pip install -r requirements.txt
+```
+
+## Piano transcription using pretrained model
+The easiest way to transcribe a new piano recording is to install the piano_transcription_inference package (https://github.com/qiuqiangkong/piano_transcription_inference) with pip as follows:
+
+```
+pip install piano_transcription_inference
+```
+
+Then, execute the following commands to transcribe this [audio](resources/cut_liszt.mp3).
+
+```
+from piano_transcription_inference import PianoTranscription, sample_rate, load_audio
+
+# Load audio
+(audio, _) = load_audio('resources/cut_liszt.mp3', sr=sample_rate, mono=True)
+
+# Transcriptor
+transcriptor = PianoTranscription(device='cuda')  # 'cuda' | 'cpu'
+
+# Transcribe and write out to MIDI file
+transcribed_dict = transcriptor.transcribe(audio, 'cut_liszt.mid')
+```
+
+## Train a piano transcription system from scratch
+
+This section provides instructions for users who would like to train a piano transcription system from scratch.
+
+### 0. Prepare data
+We use the MAESTRO dataset V2.0.0 [1] to train the piano transcription system. MAESTRO consists of over 200 hours of virtuosic piano performances captured with fine alignment (~3 ms) between note labels and audio waveforms. The MAESTRO dataset can be downloaded from https://magenta.tensorflow.org/datasets/maestro.
+
+Statistics of MAESTRO V2.0.0 [[ref]](https://magenta.tensorflow.org/datasets/maestro#v200):
+
+| Split      | Performances | Duration (hours) | Size (GB) | Notes (millions) |
+|------------|--------------|------------------|-----------|------------------|
+| Train      | 967          | 161.3            | 97.7      | 5.73             |
+| Validation | 137          | 19.4             | 11.8      | 0.64             |
+| Test       | 178          | 20.5             | 12.4      | 0.76             |
+| **Total**  | **1282**     | **201.2**        | **121.8** | **7.13**         |
+
+After downloading, the dataset looks like:
+
+<pre>
+dataset_root
+├── 2004
+│    └── (264 files)
+├── 2006
+│    └── (230 files)
+├── 2008
+│    └── (294 files)
+├── 2009
+│    └── (250 files)
+├── 2011
+│    └── (326 files)
+├── 2013
+│    └── (254 files)
+├── 2014
+│    └── (210 files)
+├── 2015
+│    └── (258 files)
+├── 2017
+│    └── (280 files)
+├── 2018
+│    └── (198 files)
+├── LICENSE
+├── maestro-v2.0.0.csv
+├── maestro-v2.0.0.json
+└── README
+</pre>
+
+### 1. Train
+
+Execute the commands line by line in runme.sh, including:
+
+1) Configure the dataset path and your workspace.
+2) Pack audio recordings to hdf5 files.
+3) Train the piano note transcription system.
+4) Train the piano pedal transcription system.
+5) Combine the piano note and piano pedal transcription systems.
+6) Evaluate.
+
+All training steps are described in runme.sh. It is worth looking into runme.sh to see how the piano transcription system is trained. In total, 29 GB of GPU memory is required with a batch size of 12. Users may consider reducing the batch size or using multiple GPU cards to train this system.
+
+## Results
+The training uses a single Tesla-V100-PCIE-32GB card. The system is trained for 300k iterations, which takes about one week. The training log looks like:
+
+<pre>
+Namespace(augmentation='none', batch_size=12, cuda=True, early_stop=300000, filename='main', learning_rate=0.0005, loss_type='regress_onset_offset_frame_velocity_bce', max_note_shift=0, mini_data=False, mode='train', model_type='Regress_onset_offset_frame_velocity_CRNN', reduce_iteration=10000, resume_iteration=0, workspace='.../workspaces/piano_transcription')
+Using GPU.
+train segments: 571589
+Evaluate train segments: 571589
+Evaluate validation segments: 68646
+Evaluate test segments: 71959
+------------------------------------
+Iteration: 0
+Train statistics: {'frame_ap': 0.0613, 'reg_onset_mae': 0.514, 'reg_offset_mae': 0.482, 'velocity_mae': 0.1362}
+Validation statistics: {'frame_ap': 0.0605, 'reg_onset_mae': 0.5143, 'reg_offset_mae': 0.4819, 'velocity_mae': 0.133}
+Test statistics: {'frame_ap': 0.0601, 'reg_onset_mae': 0.5139, 'reg_offset_mae': 0.4821, 'velocity_mae': 0.1283}
+Dump statistics to .../workspaces/piano_transcription/statistics/main/Regress_onset_offset_frame_velocity_CRNN/loss_type=regress_onset_offset_frame_velocity_bce/augmentation=none/batch_size=12/statistics.pkl
+Dump statistics to .../workspaces/piano_transcription/statistics/main/Regress_onset_offset_frame_velocity_CRNN/loss_type=regress_onset_offset_frame_velocity_bce/augmentation=none/batch_size=12/statistics_2020-04-28_00-22-33.pickle
+Train time: 5.498 s, validate time: 92.863 s
+Model saved to .../workspaces/piano_transcription/checkpoints/main/Regress_onset_offset_frame_velocity_CRNN/loss_type=regress_onset_offset_frame_velocity_bce/augmentation=none/batch_size=12/0_iterations.pth
+------------------------------------
+...
+------------------------------------
+Iteration: 300000
+Train statistics: {'frame_ap': 0.9439, 'reg_onset_mae': 0.091, 'reg_offset_mae': 0.127, 'velocity_mae': 0.0241}
+Validation statistics: {'frame_ap': 0.9245, 'reg_onset_mae': 0.0985, 'reg_offset_mae': 0.1327, 'velocity_mae': 0.0265}
+Test statistics: {'frame_ap': 0.9285, 'reg_onset_mae': 0.097, 'reg_offset_mae': 0.1353, 'velocity_mae': 0.027}
+Dump statistics to .../workspaces/piano_transcription/statistics/main/Regress_onset_offset_frame_velocity_CRNN/loss_type=regress_onset_offset_frame_velocity_bce/augmentation=none/batch_size=12/statistics.pkl
+Dump statistics to .../workspaces/piano_transcription/statistics/main/Regress_onset_offset_frame_velocity_CRNN/loss_type=regress_onset_offset_frame_velocity_bce/augmentation=none/batch_size=12/statistics_2020-04-28_00-22-33.pickle
+Train time: 8953.815 s, validate time: 93.683 s
+Model saved to .../workspaces/piano_transcription/checkpoints/main/Regress_onset_offset_frame_velocity_CRNN/loss_type=regress_onset_offset_frame_velocity_bce/augmentation=none/batch_size=12/300000_iterations.pth
+</pre>
+
+## Visualization of piano transcription
+
+**Demo 1.** Lang Lang: Franz Liszt - Love Dream (Liebestraum) [[audio]](resources/cut_liszt.mp3) [[transcribed_midi]](resources/cut_liszt.mid)
+
+<img src="resources/cut_liszt.png">
+
+**Demo 2.** Andras Schiff: J.S.Bach - French Suites [[audio]](resources/cut_bach.mp3) [[transcribed_midi]](resources/cut_bach.mid)
+
+<img src="resources/cut_bach.png">
+
+## FAQs
+If you run into an out-of-GPU-memory error, try reducing the batch size.
+
+## LICENSE
+Apache 2.0
+
+## Applications
+We have built a large-scale classical piano MIDI dataset using our piano transcription system. See https://github.com/bytedance/GiantMIDI-Piano for details.
+
+## Contact
+Qiuqiang Kong, kongqiuqiang@bytedance.com
+
+## Cite
+[1] Qiuqiang Kong, Bochen Li, Xuchen Song, Yuan Wan, and Yuxuan Wang. "High-resolution Piano Transcription with Pedals by Regressing Onset and Offset Times." arXiv preprint arXiv:2010.01815 (2020). [[pdf]](https://arxiv.org/pdf/2010.01815.pdf)
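For context, here is a minimal sketch (not part of this commit) of how the pretrained-model usage above combines with the MIDI-to-audio rendering that the new app.py below performs. The output filenames are illustrative, and the soundfont path assumes the LFS file tracked in .gitattributes has been pulled:

```python
from piano_transcription_inference import PianoTranscription, sample_rate, load_audio
from midi2audio import FluidSynth

# Load the example recording bundled with the repo
(audio, _) = load_audio('resources/cut_liszt.mp3', sr=sample_rate, mono=True)

# Transcribe on CPU ('cuda' is faster if a GPU is available)
transcriptor = PianoTranscription(device='cpu')
transcriptor.transcribe(audio, 'cut_liszt.mid')

# Render the transcribed MIDI to audio with the bundled soundfont
fs = FluidSynth('soundfont/MuseScore_General.sf3')
fs.midi_to_audio('cut_liszt.mid', 'cut_liszt.flac')
```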
app.py
CHANGED
@@ -1,7 +1,53 @@
-import gradio as gr
+import gradio as gr
+import librosa
+import os
+from pathlib import Path
+from pytorch.inference import PianoTranscription
+from utils import config
+# from synthviz import create_video  # TODO enable video rendering
+from midi2audio import FluidSynth
+
+RESULTS_DIR = 'results'
+
+# Initialize the transcriptor
+transcriptor = PianoTranscription("Note_pedal")
+
+# Soundfont
+soundfont_path = "soundfont/MuseScore_General.sf3"
+fs = FluidSynth(soundfont_path)
+
+def transcribe_and_visualize(audio_file):
+    # Generate a unique filename for the MIDI and video outputs
+    # base_name = os.path.splitext(os.path.basename(audio_file.name))[0]
+    base_name = os.path.splitext(os.path.basename(audio_file))[0]
+    midi_filename = f"{RESULTS_DIR}/{base_name}_transcription.mid"
+    video_filename = f"{RESULTS_DIR}/{base_name}_output.mp4"
+    flac_filename = f"{RESULTS_DIR}/{base_name}_transcription.flac"
+
+    # Load and transcribe audio
+    audio, _ = librosa.core.load(audio_file, sr=config.sample_rate)
+    transcriptor.transcribe(audio, midi_filename)
+
+    # Create visualization video  # TODO enable video rendering
+    # create_video(input_midi=midi_filename, video_filename=video_filename)
+
+    # return video_filename
+
+    # Convert MIDI to FLAC
+    fs.midi_to_audio(midi_filename, flac_filename)
+
+    # Return midi
+    return flac_filename, midi_filename
+
+# Create Gradio interface
+iface = gr.Interface(
+    fn=transcribe_and_visualize,
+    inputs=gr.Audio(type="filepath", label="Upload Piano Audio"),
+    # outputs=gr.Video(label="Transcription Visualization"),
+    outputs=[gr.Audio(label="MIDI transcription"), gr.File(label="MIDI file")],
+    title="MOZART - AI Piano Transcriber",
+    description="Upload a piano audio file to transcribe it and visualize the result.",
+)
+
+# Launch the interface
+iface.launch()
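As an aside, a minimal sketch (not part of this commit) of calling the interface above from another process with gradio_client, which is pinned in requirements.lock; the URL and api_name are assumed Gradio defaults and may differ for a deployed Space:

```python
from gradio_client import Client, handle_file

# Assumes app.py is already running locally on the default Gradio port
client = Client("http://127.0.0.1:7860/")
flac_path, midi_path = client.predict(
    handle_file("resources/cut_liszt.mp3"),  # the gr.Audio filepath input
    api_name="/predict",
)
print(flac_path, midi_path)
```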
cog.yaml
ADDED
@@ -0,0 +1,37 @@
+# Configuration for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+
+build:
+  gpu: true
+
+  system_packages:
+    - "libgl1-mesa-glx"
+    - "libglib2.0-0"
+    - "libsndfile1-dev"
+    - "ffmpeg"
+    - "timidity"
+
+  python_version: "3.8"
+
+  python_packages:
+    - "torch==1.8.0"
+    - "torchvision==0.9.0"
+    - "piano_transcription_inference==0.0.5"
+    - "librosa==0.6.0"
+    - "h5py==2.10.0"
+    - "pandas==1.1.2"
+    - "librosa==0.6.0"
+    - "numba==0.48"
+    - "mido==1.2.9"
+    - "mir_eval==0.5"
+    - "matplotlib==3.0.3"
+    - "torchlibrosa==0.0.4"
+    - "sox==1.4.0"
+    - "tqdm==4.62.3"
+    - "pretty_midi==0.2.9"
+    - "synthviz==0.0.2"
+
+  run:
+    - "ffmpeg -version"
+
+predict: "predict.py:Predictor"
predict.py
ADDED
@@ -0,0 +1,42 @@
+# Prediction interface for Cog ⚙️
+# Reference: https://github.com/replicate/cog/blob/main/docs/python.md
+
+import os
+from pathlib import Path
+
+import cog
+import librosa
+
+# model repo: https://github.com/bytedance/piano_transcription
+# package repo: https://github.com/qiuqiangkong/piano_transcription_inference
+from piano_transcription_inference import PianoTranscription, sample_rate
+from synthviz import create_video
+
+# adapted from example: https://github.com/minzwon/sota-music-tagging-models/blob/master/predict.py
+
+
+class Predictor(cog.Predictor):
+    transcriptor: PianoTranscription
+
+    def setup(self):
+        self.transcriptor = PianoTranscription(
+            device="cuda", checkpoint_path="./model.pth"
+        )
+
+    @cog.input("audio_input", type=Path, help="Input audio file")
+    def predict(self, audio_input):
+        midi_intermediate_filename = "transcription.mid"
+        video_filename = os.path.join(Path.cwd(), "output.mp4")
+        audio, _ = librosa.core.load(str(audio_input), sr=sample_rate)
+        # Transcribe audio
+        self.transcriptor.transcribe(audio, midi_intermediate_filename)
+
+        # 'Visualization' output option
+        create_video(
+            input_midi=midi_intermediate_filename, video_filename=video_filename
+        )
+        print(
+            f"Created video of size {os.path.getsize(video_filename)} bytes at path {video_filename}"
+        )
+        # Return path to video
+        return Path(video_filename)
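For reference, a minimal sketch (not part of this commit) of the same pipeline that predict.py wraps, runnable without Cog; it assumes the pretrained checkpoint has been saved as ./model.pth (runme.sh downloads it under a different name) and that the ffmpeg/timidity packages listed in cog.yaml are installed for synthviz:

```python
import librosa
from piano_transcription_inference import PianoTranscription, sample_rate
from synthviz import create_video

# Transcribe the example recording with the combined note+pedal checkpoint
audio, _ = librosa.core.load("resources/cut_liszt.mp3", sr=sample_rate)
transcriptor = PianoTranscription(device="cuda", checkpoint_path="./model.pth")
transcriptor.transcribe(audio, "transcription.mid")

# Render a piano-roll visualization video of the transcription
create_video(input_midi="transcription.mid", video_filename="output.mp4")
```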
requirements.lock
ADDED
@@ -0,0 +1,114 @@
+h5py==3.11.0
+pandas==2.2.2
+librosa==0.10.2.post1
+numba==0.60.0
+mido==1.3.2
+mir-eval==0.7
+matplotlib==3.9.2
+torchlibrosa==0.1.0
+sox==1.5.0
+## The following requirements were added by pip freeze:
+aiofiles==23.2.1
+annotated-types==0.7.0
+anyio==4.4.0
+audioread==3.0.1
+certifi==2024.8.30
+cffi==1.17.1
+chardet==5.2.0
+charset-normalizer==3.3.2
+click==8.1.7
+contourpy==1.3.0
+cycler==0.12.1
+decorator==5.1.1
+exceptiongroup==1.2.2
+fastapi==0.114.2
+ffmpy==0.4.0
+filelock==3.16.0
+fluidsynth==0.2
+fonttools==4.53.1
+fsspec==2024.9.0
+future==1.0.0
+gradio==4.44.0
+gradio_client==1.3.0
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.2
+huggingface-hub==0.24.7
+idna==3.9
+importlib_resources==6.4.5
+Jinja2==3.1.4
+joblib==1.4.2
+jsonpickle==3.3.0
+kiwisolver==1.4.7
+lazy_loader==0.4
+llvmlite==0.43.0
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+mdurl==0.1.2
+midi2audio==0.1.1
+more-itertools==10.5.0
+mpmath==1.3.0
+msgpack==1.1.0
+music21==9.1.0
+networkx==3.3
+numpy==2.0.2
+nvidia-cublas-cu11==11.11.3.6
+nvidia-cuda-cupti-cu11==11.8.87
+nvidia-cuda-nvrtc-cu11==11.8.89
+nvidia-cuda-runtime-cu11==11.8.89
+nvidia-cudnn-cu11==9.1.0.70
+nvidia-cufft-cu11==10.9.0.58
+nvidia-curand-cu11==10.3.0.86
+nvidia-cusolver-cu11==11.4.1.48
+nvidia-cusparse-cu11==11.7.5.86
+nvidia-nccl-cu11==2.20.5
+nvidia-nvtx-cu11==11.8.86
+orjson==3.10.7
+packaging==23.2
+pillow==10.4.0
+platformdirs==4.3.3
+pooch==1.8.2
+pretty-midi==0.2.10
+pycparser==2.22
+pydantic==2.9.1
+pydantic_core==2.23.3
+pydub==0.25.1
+Pygments==2.18.0
+pyparsing==3.1.4
+python-dateutil==2.9.0.post0
+python-multipart==0.0.9
+pytz==2024.2
+PyYAML==6.0.2
+regex==2024.9.11
+requests==2.32.3
+resampy==0.4.3
+rich==13.8.1
+ruff==0.6.5
+safetensors==0.4.5
+scikit-learn==1.5.2
+scipy==1.14.1
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.16.0
+sniffio==1.3.1
+soundfile==0.12.1
+soxr==0.5.0.post1
+starlette==0.38.5
+sympy==1.13.2
+synthviz==0.0.2
+threadpoolctl==3.5.0
+tokenizers==0.19.1
+tomlkit==0.12.0
+torch==2.4.1+cu118
+torchaudio==2.4.1+cu118
+torchvision==0.19.1+cu118
+tqdm==4.66.5
+transformers==4.44.2
+triton==3.0.0
+typer==0.12.5
+typing_extensions==4.12.2
+tzdata==2024.1
+urllib3==2.2.3
+uvicorn==0.30.6
+webcolors==24.8.0
+websockets==12.0
requirements.txt
ADDED
@@ -0,0 +1,9 @@
+h5py==2.10.0
+pandas==1.1.2
+librosa==0.6.0
+numba==0.48
+mido==1.2.9
+mir_eval==0.5
+matplotlib==3.0.3
+torchlibrosa==0.0.4
+sox==1.4.0
runme.sh
ADDED
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# ============ Inference using pretrained model ============
+# Download checkpoint and inference
+CHECKPOINT_PATH="CRNN_note_F1=0.9677_pedal_F1=0.9186.pth"
+# wget -O $CHECKPOINT_PATH "https://zenodo.org/record/4034264/files/CRNN_note_F1%3D0.9677_pedal_F1%3D0.9186.pth?download=1"
+MODEL_TYPE="Note_pedal"
+# ORIGINAL
+# python3 pytorch/inference.py --model_type=$MODEL_TYPE --checkpoint_path=$CHECKPOINT_PATH --audio_path='resources/cut_liszt.mp3' --cuda
+
+python3 pytorch/inference.py --audio_path='resources/cut_liszt.mp3' --cuda
+
+# # ============ Train piano transcription system from scratch ============
+# # MAESTRO dataset directory. Users need to download MAESTRO dataset into this folder.
+# DATASET_DIR="./datasets/maestro/dataset_root"
+
+# # Modify to your workspace
+# WORKSPACE="./workspaces/piano_transcription"
+
+# # Pack audio files to hdf5 format for training
+# python3 utils/features.py pack_maestro_dataset_to_hdf5 --dataset_dir=$DATASET_DIR --workspace=$WORKSPACE
+
+# # --- 1. Train note transcription system ---
+# python3 pytorch/main.py train --workspace=$WORKSPACE --model_type='Regress_onset_offset_frame_velocity_CRNN' --loss_type='regress_onset_offset_frame_velocity_bce' --augmentation='none' --max_note_shift=0 --batch_size=12 --learning_rate=5e-4 --reduce_iteration=10000 --resume_iteration=0 --early_stop=300000 --cuda
+
+# # --- 2. Train pedal transcription system ---
+# python3 pytorch/main.py train --workspace=$WORKSPACE --model_type='Regress_pedal_CRNN' --loss_type='regress_pedal_bce' --augmentation='none' --max_note_shift=0 --batch_size=12 --learning_rate=5e-4 --reduce_iteration=10000 --resume_iteration=0 --early_stop=300000 --cuda
+
+# # --- 3. Combine the note and pedal models ---
+# # Users should copy and rename the following paths to their trained model paths
+# NOTE_CHECKPOINT_PATH="Regress_onset_offset_frame_velocity_CRNN_onset_F1=0.9677.pth"
+# PEDAL_CHECKPOINT_PATH="Regress_pedal_CRNN_onset_F1=0.9186.pth"
+# NOTE_PEDAL_CHECKPOINT_PATH="CRNN_note_F1=0.9677_pedal_F1=0.9186.pth"
+# python3 pytorch/combine_note_and_pedal_models.py --note_checkpoint_path=$NOTE_CHECKPOINT_PATH --pedal_checkpoint_path=$PEDAL_CHECKPOINT_PATH --output_checkpoint_path=$NOTE_PEDAL_CHECKPOINT_PATH
+
+# # ============ Evaluate (optional) ============
+# # Inference probability for evaluation
+# python3 pytorch/calculate_score_for_paper.py infer_prob --workspace=$WORKSPACE --model_type='Note_pedal' --checkpoint_path=$NOTE_PEDAL_CHECKPOINT_PATH --augmentation='none' --dataset='maestro' --split='test' --cuda
+
+# # Calculate metrics
+# python3 pytorch/calculate_score_for_paper.py calculate_metrics --workspace=$WORKSPACE --model_type='Note_pedal' --augmentation='aug' --dataset='maestro' --split='test'
+# python3 pytorch/calculate_score_for_paper.py calculate_metrics --workspace=$WORKSPACE --model_type='Note_pedal' --augmentation='aug' --dataset='maps' --split='test'