lllindsey0615 committed on
Commit a189727 · 1 Parent(s): 78b8054

initial commit

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .DS_Store +0 -0
  2. DEFAULT_HF_MODEL_REPO +1 -0
  3. DEFAULT_MODEL +1 -0
  4. LICENSE +21 -0
  5. README.md +243 -7
  6. TODOS +1 -0
  7. app.py +760 -0
  8. assets/.DS_Store +0 -0
  9. conf/c2f.yml +14 -0
  10. conf/generated/cat/c2f.yml +15 -0
  11. conf/generated/cat/coarse.yml +8 -0
  12. conf/generated/cat/interface.yml +6 -0
  13. conf/generated/cat10/c2f.yml +15 -0
  14. conf/generated/cat10/coarse.yml +8 -0
  15. conf/generated/cat10/interface.yml +6 -0
  16. conf/generated/ivo/c2f.yml +15 -0
  17. conf/generated/ivo/coarse.yml +8 -0
  18. conf/generated/ivo/interface.yml +6 -0
  19. conf/generated/lazaro-ros-sep/c2f.yml +15 -0
  20. conf/generated/lazaro-ros-sep/coarse.yml +8 -0
  21. conf/generated/lazaro-ros-sep/interface.yml +6 -0
  22. conf/generated/lazaro-ros/c2f.yml +15 -0
  23. conf/generated/lazaro-ros/coarse.yml +8 -0
  24. conf/generated/lazaro-ros/interface.yml +6 -0
  25. conf/generated/le-poisson-steve/c2f.yml +15 -0
  26. conf/generated/le-poisson-steve/coarse.yml +8 -0
  27. conf/generated/le-poisson-steve/interface.yml +6 -0
  28. conf/generated/march-31/c2f.yml +15 -0
  29. conf/generated/march-31/coarse.yml +8 -0
  30. conf/generated/march-31/interface.yml +6 -0
  31. conf/generated/sax-new/c2f.yml +15 -0
  32. conf/generated/sax-new/coarse.yml +8 -0
  33. conf/generated/sax-new/interface.yml +6 -0
  34. conf/generated/saxophone/c2f.yml +15 -0
  35. conf/generated/saxophone/coarse.yml +8 -0
  36. conf/generated/saxophone/interface.yml +6 -0
  37. conf/interface.yml +10 -0
  38. conf/lora/lora-s2s.yml +27 -0
  39. conf/lora/lora.yml +22 -0
  40. conf/salad_bowl.yml +0 -0
  41. conf/vampnet.yml +49 -0
  42. hello.py +48 -0
  43. requirements.txt +11 -0
  44. scratch/convert_to_wav.sh +1 -0
  45. scratch/rms_mask.txt +14 -0
  46. scratch/separate_folder.sh +1 -0
  47. scripts/exp/eval.py +110 -0
  48. scripts/exp/experiment.py +254 -0
  49. scripts/exp/export.py +75 -0
  50. scripts/exp/fine_tune.py +87 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
DEFAULT_HF_MODEL_REPO ADDED
@@ -0,0 +1 @@
+ hugggof/vampnet
DEFAULT_MODEL ADDED
@@ -0,0 +1 @@
+ default
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Hugo Flores García and Prem Seetharaman
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,249 @@
  ---
- title: Vampnet Music HARP V3
- emoji: 🐢
- colorFrom: red
- colorTo: yellow
+ title: salad bowl (vampnet)
+ emoji: 🥗
+ colorFrom: yellow
+ colorTo: green
  sdk: gradio
- sdk_version: 5.42.0
+ sdk_version: 5.23.2
+ python_version: 3.11
  app_file: app.py
  pinned: false
- short_description: Wrapped VampNet model for HARP3
+ license: cc-by-nc-4.0
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # VampNet
+
+ # Table of contents
+
+ - [setting up](#setting-up)
+ - [programmatic usage](#programmatic-usage)
+ - [launching the web app](#launching-the-web-app)
+ - [training / fine-tuning](#training--fine-tuning)
+ - [training a model](#training-a-model)
+ - [debugging training](#debugging-training)
+ - [fine-tuning](#fine-tuning)
+ - [exporting your model](#exporting-your-model)
+ - [unloop](#unloop)
+ - [token telephone](#token-telephone)
+ - [a note on argbind](#a-note-on-argbind)
+ - [take a look at the pretrained models](#take-a-look-at-the-pretrained-models)
+ - [licensing for pretrained models](#licensing-for-pretrained-models)
+
+ ## setting up
+
+ Python 3.9-3.11 works well (for example, using conda):
+ ```bash
+ conda create -n vampnet python=3.9
+ conda activate vampnet
+ ```
+
+ Install VampNet:
+
+ ```bash
+ git clone https://github.com/hugofloresgarcia/vampnet.git
+ pip install -e ./vampnet
+ ```
+
+ ## programmatic usage
+
+ Quick start!
+ ```python
+ import random
+ import vampnet
+ import audiotools as at
+
+ # load the default vampnet model
+ interface = vampnet.interface.Interface.default()
+
+ # list available finetuned models
+ finetuned_model_choices = interface.available_models()
+ print(f"available finetuned models: {finetuned_model_choices}")
+
+ # pick a random finetuned model
+ model_choice = random.choice(finetuned_model_choices)
+ print(f"choosing model: {model_choice}")
+
+ # load a finetuned model
+ interface.load_finetuned(model_choice)
+
+ # load an example audio file
+ signal = at.AudioSignal("assets/example.wav")
+
+ # get the tokens for the audio
+ codes = interface.encode(signal)
+
+ # build a mask for the audio
+ mask = interface.build_mask(
+     codes, signal,
+     periodic_prompt=7,
+     upper_codebook_mask=3,
+ )
+
+ # generate the output tokens
+ output_tokens = interface.vamp(
+     codes, mask, return_mask=False,
+     temperature=1.0,
+     typical_filtering=True,
+ )
+
+ # convert them to a signal
+ output_signal = interface.decode(output_tokens)
+
+ # save the output signal
+ output_signal.write("scratch/output.wav")
+ ```
+
+
+ # Launching the Web App
+ You can launch a gradio UI to play with vampnet.
+
+ ```bash
+ python app.py
+ ```
+
+ # Training / Fine-tuning
+
+ ## Training a model
+
+ To train a model, run the following script:
+
+ ```bash
+ python scripts/exp/train.py --args.load conf/vampnet.yml --save_path /path/to/checkpoints
+ ```
+
+ For multi-GPU training, use torchrun:
+
+ ```bash
+ torchrun --nproc_per_node gpu scripts/exp/train.py --args.load conf/vampnet.yml --save_path path/to/ckpt
+ ```
+
+ You can edit `conf/vampnet.yml` to change the dataset paths or any training hyperparameters.
+
+ For coarse2fine models, you can use `conf/c2f.yml` as a starting configuration.
+
+ See `python scripts/exp/train.py -h` for a list of options.
+
+ ## Debugging training
+
+ To debug training, it's easiest to run with 1 GPU and 0 workers:
+
+ ```bash
+ CUDA_VISIBLE_DEVICES=0 python -m pdb scripts/exp/train.py --args.load conf/vampnet.yml --save_path /path/to/checkpoints --num_workers 0
+ ```
+
+ # Fine-tuning
+
+ To fine-tune a model, use the script in `scripts/exp/fine_tune.py`.
+
+ For an audio folder:
+ ```bash
+ python scripts/exp/fine_tune.py /path/to/audio/folder <fine_tune_name>
+ ```
+
+ For multiple files:
+ ```bash
+ python scripts/exp/fine_tune.py "/path/to/audio1.mp3 /path/to/audio2/ /path/to/audio3.wav" <fine_tune_name>
+ ```
+
+ This creates configuration files for a fine-tuning training job. The save paths will be set to `runs/<fine_tune_name>/coarse` and `runs/<fine_tune_name>/c2f`.
+
+ Launch the coarse job:
+ ```bash
+ python scripts/exp/train.py --args.load conf/generated/<fine_tune_name>/coarse.yml
+ ```
+
+ This will save the coarse model to `runs/<fine_tune_name>/coarse/ckpt/best/`.
+
+ Launch the c2f job:
+ ```bash
+ python scripts/exp/train.py --args.load conf/generated/<fine_tune_name>/c2f.yml
+ ```
+
+ # Resuming a Training/Fine-tuning Job from a Checkpoint
+
+ To resume from a checkpoint, use the `--resume` flag and point `--save_path` at the checkpoint you want to resume from.
+ ```bash
+ python scripts/exp/train.py --args.load conf/generated/steve/coarse.yml --save_path runs/steve/coarse --resume
+ ```
+
+ # Exporting your model
+
+ Once your model has been fine-tuned, you can export it to a HuggingFace model.
+
+ You will need to export it to HuggingFace in order to use it in `app.py`.
+
+ **NOTE**: In order to export, you will need a [huggingface account](https://huggingface.co/).
+
+ Now, log in to huggingface using the command line:
+ ```bash
+ huggingface-cli login
+ ```
+
+ Replace the contents of the file named `./DEFAULT_HF_MODEL_REPO` with `<HUGGINGFACE_USERNAME>/vampnet`. A model repo will be created for you automatically by `export.py`. The default is `hugggof/vampnet`.
+
+ For example, if my username is `hugggof`, I would run the following command:
+ ```bash
+ echo 'hugggof/vampnet' > ./DEFAULT_HF_MODEL_REPO
+ ```
+
+ Now, run the following command to export your model (replace `<your_finetuned_model_name>` with the name of your model):
+
+ ```bash
+ python scripts/exp/export.py --name <your_finetuned_model_name> --model latest
+ ```
+
+ Once that's done, your model should appear in the list of available models in the gradio interface.
+ Simply run `python app.py` and select your model from the dropdown list.
+
+
+ # Unloop
+
+ Make sure you have Max installed on your laptop!
+
+ **NOTE**: To run unloop (with a GPU-powered server), you will need to install the vampnet repo on both your local machine and your GPU server.
+
+ ## start a vampnet gradio server
+
+ First, **on your GPU server**, run the gradio server:
+ ```bash
+ python app.py --args.load conf/interface.yml --Interface.device cuda
+ ```
+ This will run a vampnet gradio API on your GPU server. Copy the address. It will be something like `https://127.0.0.1:7860/`.
+
+ **IMPORTANT**: Make sure that this gradio port (by default `7860`) is forwarded to your local machine, where you have Max installed.
+
+ ## start the unloop gradio client
+ Now, **on your local machine**, run the unloop gradio client.
+ ```bash
+ cd unloop
+ pip install -r requirements.txt
+ python client.py --vampnet_url https://127.0.0.1:7860/ # replace with your gradio server address
+ ```
+ This will start a gradio client that connects to the gradio server running on your GPU server.
+
+ ## start the unloop Max patch
+ Now, open the unloop Max patch. It's located at `unloop/max/unloop.maxpat`.
+
+ In the tape controls, check the heartbeat (`<3`) to make sure the connection to the local gradio client is working.
+
+ have fun!
+
+ # Token Telephone
+
+ Instructions forthcoming, but the sauce is in `token_telephone/tt.py`.
+
+ ## A note on argbind
+ This repository relies on [argbind](https://github.com/pseeth/argbind) to manage CLIs and config files.
+ Config files are stored in the `conf/` folder.
+
+ ### Take a look at the pretrained models
+ All the pretrained models (trained by hugo) are stored here: https://huggingface.co/hugggof/vampnet
+
+ ### Licensing for Pretrained Models:
+ The weights for the models are licensed [`CC BY-NC-SA 4.0`](https://creativecommons.org/licenses/by-nc-sa/4.0/deed.ml). Likewise, any VampNet models fine-tuned on the pretrained models are also licensed [`CC BY-NC-SA 4.0`](https://creativecommons.org/licenses/by-nc-sa/4.0/deed.ml).
+
+ Download the pretrained models from [this link](https://zenodo.org/record/8136629). Then, extract the models to the `models/` folder.
+
+
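> Note on the `--args.load` flags used above: argbind maps dotted config keys (e.g. `VampNet.n_layers` in `conf/vampnet.yml`) onto the keyword arguments of bound functions and classes. The sketch below is illustrative only; the decorator pattern mirrors the one used in `scripts/exp/eval.py` in this commit, and `my_tool`, `n_layers`, and `conf/my.yml` are hypothetical names.

```python
import argbind

# Hypothetical sketch of the argbind pattern used throughout this repo:
# values from a YAML file passed via --args.load (or plain CLI flags)
# are bound to the decorated function's keyword arguments.
@argbind.bind(without_prefix=True)
def my_tool(n_layers: int = 20, save_path: str = "ckpt"):
    print(f"n_layers={n_layers}, save_path={save_path}")

if __name__ == "__main__":
    args = argbind.parse_args()  # e.g. python my_tool.py --args.load conf/my.yml
    with argbind.scope(args):
        my_tool()
```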
TODOS ADDED
@@ -0,0 +1 @@
+ [ ] add sketch2sound finetuning
app.py ADDED
@@ -0,0 +1,760 @@
+ import spaces
+ from pathlib import Path
+ import yaml
+ import time
+ import uuid
+
+ import numpy as np
+ import audiotools as at
+ import argbind
+ import shutil
+ import torch
+ from datetime import datetime
+ from pyharp.core import build_endpoint, ModelCard
+ from pyharp.labels import OutputLabel, LabelList
+ from pyharp.media.audio import save_audio
+
+ import gradio as gr
+ from vampnet.interface import Interface, signal_concat
+ from vampnet import mask as pmask
+
+ device="cpu"
+ print(f"using device {device}\n"*10)
+
+ interface = Interface.default()
+ init_model_choice = open("DEFAULT_MODEL").read().strip()
+
+ # load the init model
+ interface.load_finetuned(init_model_choice)
+
+ def to_output(sig):
+     return sig.sample_rate, sig.cpu().detach().numpy()[0][0]
+
+ MAX_DURATION_S = 10
+ def load_audio(file):
+     print(file)
+     if isinstance(file, str):
+         filepath = file
+     elif isinstance(file, tuple):
+         # not a file
+         sr, samples = file
+         samples = samples / np.iinfo(samples.dtype).max
+         return sr, samples
+     else:
+         filepath = file.name
+     sig = at.AudioSignal.salient_excerpt(
+         filepath, duration=MAX_DURATION_S
+     )
+     sig = at.AudioSignal(filepath)
+     return to_output(sig)
+
+
+ def load_example_audio():
+     return load_audio("./assets/example.wav")
+
+ from torch_pitch_shift import pitch_shift, get_fast_shifts
+ def shift_pitch(signal, interval: int):
+     signal.samples = pitch_shift(
+         signal.samples,
+         shift=interval,
+         sample_rate=signal.sample_rate
+     )
+     return signal
+
+
+ def onsets(sig: at.AudioSignal, hop_length: int):
+     assert sig.batch_size == 1, "batch size must be 1"
+     assert sig.num_channels == 1, "mono signals only"
+     import librosa
+     onset_frame_idxs = librosa.onset.onset_detect(
+         y=sig.samples[0][0].detach().cpu().numpy(), sr=sig.sample_rate,
+         hop_length=hop_length,
+         backtrack=True,
+     )
+     return onset_frame_idxs
+
+
+ @spaces.GPU
+ def new_vampnet_mask(self,
+         codes,
+         onset_idxs,
+         width: int = 5,
+         periodic_prompt=2,
+         upper_codebook_mask=1,
+         drop_amt: float = 0.1
+     ):
+     from vampnet.newmask import mask_and, mask_or, onset_mask, periodic_mask, drop_ones, codebook_mask
+     mask = mask_and(
+         periodic_mask(codes, periodic_prompt, 1, random_roll=False),
+         mask_or(  # this re-masks the onsets, according to a periodic schedule
+             onset_mask(onset_idxs, codes, width=width),
+             periodic_mask(codes, periodic_prompt, 1, random_roll=False),
+         )
+     ).int()
+     # make sure the onset idxs themselves are unmasked
+     # mask = 1 - mask
+     mask[:, :, onset_idxs] = 0
+     mask = mask.cpu()  # debug
+     mask = 1-drop_ones(1-mask, drop_amt)
+     mask = codebook_mask(mask, upper_codebook_mask)
+
+     # save mask as txt (ints)
+     np.savetxt("scratch/rms_mask.txt", mask[0].cpu().numpy(), fmt='%d')
+     mask = mask.to(self.device)
+     return mask[:, :, :]
+
+ @spaces.GPU
+ def mask_preview(periodic_p, n_mask_codebooks, onset_mask_width, dropout):
+     # make a mask preview
+     codes = torch.zeros((1, 14, 80)).to(device)
+     mask = interface.build_mask(
+         codes,
+         periodic_prompt=periodic_p,
+         # onset_mask_width=onset_mask_width,
+         _dropout=dropout,
+         upper_codebook_mask=n_mask_codebooks,
+     )
+     # mask = mask.cpu().numpy()
+     import matplotlib.pyplot as plt
+     plt.clf()
+     interface.visualize_codes(mask)
+     plt.title("mask preview")
+     plt.savefig("scratch/mask-prev.png")
+     return "scratch/mask-prev.png"
+
+
+ @spaces.GPU
+ def _vamp_internal(
+         seed, input_audio, model_choice,
+         pitch_shift_amt, periodic_p,
+         n_mask_codebooks, onset_mask_width,
+         dropout, sampletemp, typical_filtering,
+         typical_mass, typical_min_tokens, top_p,
+         sample_cutoff, stretch_factor, sampling_steps, beat_mask_ms, num_feedback_steps, api=False, harp=False
+     ):
+     if torch.cuda.is_available():
+         device = "cuda"
+     elif torch.backends.mps.is_available():
+         device = "mps"
+     else:
+         device = "cpu"
+
+     print("args!")
+     print(f"seed: {seed}")
+     print(f"input_audio: {input_audio}")
+     print(f"model_choice: {model_choice}")
+     print(f"pitch_shift_amt: {pitch_shift_amt}")
+     print(f"periodic_p: {periodic_p}")
+     print(f"n_mask_codebooks: {n_mask_codebooks}")
+     print(f"onset_mask_width: {onset_mask_width}")
+     print(f"dropout: {dropout}")
+     print(f"sampletemp: {sampletemp}")
+     print(f"typical_filtering: {typical_filtering}")
+     print(f"typical_mass: {typical_mass}")
+     print(f"typical_min_tokens: {typical_min_tokens}")
+     print(f"top_p: {top_p}")
+     print(f"sample_cutoff: {sample_cutoff}")
+     print(f"stretch_factor: {stretch_factor}")
+     print(f"sampling_steps: {sampling_steps}")
+     print(f"api: {api}")
+     print(f"beat_mask_ms: {beat_mask_ms}")
+     print(f"using device {interface.device}")
+     print(f"num feedback steps: {num_feedback_steps}")
+
+     t0 = time.time()
+     interface.to(device)
+     print(f"using device {interface.device}")
+     _seed = seed if seed > 0 else None
+     if _seed is None:
+         _seed = int(torch.randint(0, 2**32, (1,)).item())
+     at.util.seed(_seed)
+
+     if input_audio is None:
+         raise gr.Error("no input audio received!")
+     sr, input_audio = input_audio
+     input_audio = input_audio / np.iinfo(input_audio.dtype).max
+
+     sig = at.AudioSignal(input_audio, sr).to_mono()
+
+     loudness = sig.loudness()
+     sig = interface._preprocess(sig)
+
+     # reload the model if necessary
+     interface.load_finetuned(model_choice)
+
+     if pitch_shift_amt != 0:
+         sig = shift_pitch(sig, pitch_shift_amt)
+
+     codes = interface.encode(sig)
+
+     # mask = new_vampnet_mask(
+     #     interface,
+     #     codes,
+     #     onset_idxs=onsets(sig, hop_length=interface.codec.hop_length),
+     #     width=onset_mask_width,
+     #     periodic_prompt=periodic_p,
+     #     upper_codebook_mask=n_mask_codebooks,
+     #     drop_amt=dropout
+     # ).long()
+
+     mask = interface.build_mask(
+         codes,
+         sig=sig,
+         periodic_prompt=periodic_p,
+         onset_mask_width=onset_mask_width,
+         _dropout=dropout,
+         upper_codebook_mask=n_mask_codebooks,
+     )
+     if beat_mask_ms > 0:
+         # bm = pmask.mask_or(
+         #     pmask.periodic_mask(
+         #         codes, periodic_p, random_roll=False
+         #     ),
+         # )
+         mask = pmask.mask_and(
+             mask, interface.make_beat_mask(
+                 sig, after_beat_s=beat_mask_ms/1000.,
+             )
+         )
+         mask = pmask.codebook_mask(mask, n_mask_codebooks)
+     np.savetxt("scratch/rms_mask.txt", mask[0].cpu().numpy(), fmt='%d')
+
+     interface.set_chunk_size(10.0)
+
+     # lord help me
+     if top_p is not None:
+         if top_p > 0:
+             pass
+         else:
+             top_p = None
+
+     codes, mask_z = interface.vamp(
+         codes, mask,
+         batch_size=2,
+         feedback_steps=num_feedback_steps,
+         _sampling_steps=sampling_steps,
+         time_stretch_factor=stretch_factor,
+         return_mask=True,
+         temperature=sampletemp,
+         typical_filtering=typical_filtering,
+         typical_mass=typical_mass,
+         typical_min_tokens=typical_min_tokens,
+         top_p=top_p,
+         seed=_seed,
+         sample_cutoff=sample_cutoff,
+     )
+     print(f"vamp took {time.time() - t0} seconds")
+
+     sig = interface.decode(codes)
+     sig = sig.normalize(loudness)
+
+     import matplotlib.pyplot as plt
+     plt.clf()
+     # plt.imshow(mask_z[0].cpu().numpy(), aspect='auto
+     interface.visualize_codes(mask)
+     plt.title("actual mask")
+     plt.savefig("scratch/mask.png")
+     plt.clf()
+
+     if harp:
+         return sig
+
+     if not api:
+         return to_output(sig[0]), to_output(sig[1]), "scratch/mask.png"
+     else:
+         return to_output(sig[0]), to_output(sig[1])
+
+ @spaces.GPU
+ def vamp(input_audio,
+         sampletemp,
+         top_p,
+         periodic_p,
+         dropout,
+         stretch_factor,
+         onset_mask_width,
+         typical_filtering,
+         typical_mass,
+         typical_min_tokens,
+         seed,
+         model_choice,
+         n_mask_codebooks,
+         pitch_shift_amt,
+         sample_cutoff,
+         sampling_steps,
+         beat_mask_ms,
+         num_feedback_steps):
+     return _vamp_internal(
+         seed=seed,
+         input_audio=input_audio,
+         model_choice=model_choice,
+         pitch_shift_amt=pitch_shift_amt,
+         periodic_p=periodic_p,
+         n_mask_codebooks=n_mask_codebooks,
+         onset_mask_width=onset_mask_width,
+         dropout=dropout,
+         sampletemp=sampletemp,
+         typical_filtering=typical_filtering,
+         typical_mass=typical_mass,
+         typical_min_tokens=typical_min_tokens,
+         top_p=top_p,
+         sample_cutoff=sample_cutoff,
+         stretch_factor=stretch_factor,
+         sampling_steps=sampling_steps,
+         beat_mask_ms=beat_mask_ms,
+         num_feedback_steps=num_feedback_steps,
+         api=False,
+     )
+
+ @spaces.GPU
+ def api_vamp(input_audio,
+         sampletemp, top_p,
+         periodic_p,
+         dropout,
+         stretch_factor,
+         onset_mask_width,
+         typical_filtering,
+         typical_mass,
+         typical_min_tokens,
+         seed,
+         model_choice,
+         n_mask_codebooks,
+         pitch_shift_amt,
+         sample_cutoff,
+         sampling_steps,
+         beat_mask_ms, num_feedback_steps):
+     return _vamp_internal(
+         seed=seed,
+         input_audio=input_audio,
+         model_choice=model_choice,
+         pitch_shift_amt=pitch_shift_amt,
+         periodic_p=periodic_p,
+         n_mask_codebooks=n_mask_codebooks,
+         onset_mask_width=onset_mask_width,
+         dropout=dropout,
+         sampletemp=sampletemp,
+         typical_filtering=typical_filtering,
+         typical_mass=typical_mass,
+         typical_min_tokens=typical_min_tokens,
+         top_p=top_p,
+         sample_cutoff=sample_cutoff,
+         stretch_factor=stretch_factor,
+         sampling_steps=sampling_steps,
+         beat_mask_ms=beat_mask_ms,
+         num_feedback_steps=num_feedback_steps,
+         api=True,
+     )
+
+ @spaces.GPU
+ def harp_vamp(input_audio, sampletemp, periodic_p, dropout, n_mask_codebooks, model_choice, stretch_factor):
+     sig = at.AudioSignal(input_audio).to_mono()
+
+     input_audio = sig.cpu().detach().numpy()[0][0]
+     input_audio = input_audio * np.iinfo(np.int16).max
+     input_audio = input_audio.astype(np.int16)
+     input_audio = input_audio.reshape(1, -1)
+     input_audio = (sig.sample_rate, input_audio)
+
+     sig = _vamp_internal(
+         seed=0,
+         input_audio=input_audio,
+         model_choice=model_choice,
+         pitch_shift_amt=0,
+         periodic_p=int(periodic_p),
+         n_mask_codebooks=int(n_mask_codebooks),
+         onset_mask_width=0,
+         dropout=dropout,
+         sampletemp=sampletemp,
+         typical_filtering=False,
+         typical_mass=0.15,
+         typical_min_tokens=1,
+         top_p=None,
+         sample_cutoff=1.0,
+         stretch_factor=stretch_factor,
+         sampling_steps=36,
+         beat_mask_ms=int(0),
+         num_feedback_steps=1,
+         api=False,
+         harp=True,
+     )
+
+     ll = LabelList()
+     ll.append(OutputLabel(label='short label', t=0.0, description='longer description'))
+     return save_audio(sig.detach().cpu()), ll
+
+
+ with gr.Blocks() as demo:
+     with gr.Row():
+         with gr.Column():
+             manual_audio_upload = gr.File(
+                 label=f"upload some audio (will be randomly trimmed to max of 100s)",
+                 file_types=["audio"]
+             )
+             load_example_audio_button = gr.Button("or load example audio")
+
+             input_audio = gr.Audio(
+                 label="input audio",
+                 interactive=False,
+                 type="numpy",
+             )
+
+             # audio_mask = gr.Audio(
+             #     label="audio mask (listen to this to hear the mask hints)",
+             #     interactive=False,
+             #     type="numpy",
+             # )
+
+             # connect widgets
+             load_example_audio_button.click(
+                 fn=load_example_audio,
+                 inputs=[],
+                 outputs=[ input_audio]
+             )
+
+             manual_audio_upload.change(
+                 fn=load_audio,
+                 inputs=[manual_audio_upload],
+                 outputs=[ input_audio]
+             )
+
+         # mask settings
+         with gr.Column():
+             with gr.Accordion("manual controls", open=True):
+                 periodic_p = gr.Slider(
+                     label="periodic prompt",
+                     minimum=0,
+                     maximum=13,
+                     step=1,
+                     value=7,
+                 )
+
+                 onset_mask_width = gr.Slider(
+                     label="onset mask width (multiplies with the periodic mask, 1 step ~= 10milliseconds) does not affect mask preview",
+                     minimum=0,
+                     maximum=100,
+                     step=1,
+                     value=0, visible=True
+                 )
+
+                 beat_mask_ms = gr.Slider(
+                     label="beat mask width (milliseconds) does not affect mask preview",
+                     minimum=1,
+                     maximum=200,
+                     step=1,
+                     value=0,
+                     visible=True
+                 )
+
+                 n_mask_codebooks = gr.Slider(
+                     label="compression prompt ",
+                     value=3,
+                     minimum=1,
+                     maximum=14,
+                     step=1,
+                 )
+
+                 dropout = gr.Slider(
+                     label="mask dropout",
+                     minimum=0.0,
+                     maximum=1.0,
+                     step=0.01,
+                     value=0.0
+                 )
+
+                 num_feedback_steps = gr.Slider(
+                     label="feedback steps (token telephone) -- turn it up for better timbre/rhythm transfer quality, but it's slower!",
+                     minimum=1,
+                     maximum=8,
+                     step=1,
+                     value=1
+                 )
+
+                 preset_dropdown = gr.Dropdown(
+                     label="preset",
+                     choices=["timbre transfer", "small variation", "small variation (follow beat)", "medium variation", "medium variation (follow beat)", "large variation", "large variation (follow beat)", "unconditional"],
+                     value="medium variation"
+                 )
+                 def change_preset(preset_dropdown):
+                     if preset_dropdown == "timbre transfer":
+                         periodic_p = 2
+                         n_mask_codebooks = 1
+                         onset_mask_width = 0
+                         dropout = 0.0
+                         beat_mask_ms = 0
+                     elif preset_dropdown == "small variation":
+                         periodic_p = 5
+                         n_mask_codebooks = 4
+                         onset_mask_width = 0
+                         dropout = 0.0
+                         beat_mask_ms = 0
+                     elif preset_dropdown == "small variation (follow beat)":
+                         periodic_p = 7
+                         n_mask_codebooks = 4
+                         onset_mask_width = 0
+                         dropout = 0.0
+                         beat_mask_ms = 50
+                     elif preset_dropdown == "medium variation":
+                         periodic_p = 7
+                         n_mask_codebooks = 4
+                         onset_mask_width = 0
+                         dropout = 0.0
+                         beat_mask_ms = 0
+                     elif preset_dropdown == "medium variation (follow beat)":
+                         periodic_p = 13
+                         n_mask_codebooks = 4
+                         onset_mask_width = 0
+                         dropout = 0.0
+                         beat_mask_ms = 50
+                     elif preset_dropdown == "large variation":
+                         periodic_p = 13
+                         n_mask_codebooks = 4
+                         onset_mask_width = 0
+                         dropout = 0.2
+                         beat_mask_ms = 0
+                     elif preset_dropdown == "large variation (follow beat)":
+                         periodic_p = 0
+                         n_mask_codebooks = 4
+                         onset_mask_width = 0
+                         dropout = 0.0
+                         beat_mask_ms=80
+                     elif preset_dropdown == "unconditional":
+                         periodic_p=0
+                         n_mask_codebooks=1
+                         onset_mask_width=0
+                         dropout=0.0
+                     return periodic_p, n_mask_codebooks, onset_mask_width, dropout, beat_mask_ms
+                 preset_dropdown.change(
+                     fn=change_preset,
+                     inputs=[preset_dropdown],
+                     outputs=[periodic_p, n_mask_codebooks, onset_mask_width, dropout, beat_mask_ms]
+                 )
+                 # preset_dropdown.change(
+
+                 maskimg = gr.Image(
+                     label="mask image",
+                     interactive=False,
+                     type="filepath"
+                 )
+
+             with gr.Accordion("extras ", open=False):
+                 pitch_shift_amt = gr.Slider(
+                     label="pitch shift amount (semitones)",
+                     minimum=-12,
+                     maximum=12,
+                     step=1,
+                     value=0,
+                 )
+
+                 stretch_factor = gr.Slider(
+                     label="time stretch factor",
+                     minimum=0,
+                     maximum=8,
+                     step=1,
+                     value=1,
+                 )
+
+             with gr.Accordion("sampling settings", open=False):
+                 sampletemp = gr.Slider(
+                     label="sample temperature",
+                     minimum=0.1,
+                     maximum=10.0,
+                     value=1.0,
+                     step=0.001
+                 )
+
+                 top_p = gr.Slider(
+                     label="top p (0.0 = off)",
+                     minimum=0.0,
+                     maximum=1.0,
+                     value=0.0
+                 )
+                 typical_filtering = gr.Checkbox(
+                     label="typical filtering ",
+                     value=True
+                 )
+                 typical_mass = gr.Slider(
+                     label="typical mass (should probably stay between 0.1 and 0.5)",
+                     minimum=0.01,
+                     maximum=0.99,
+                     value=0.15
+                 )
+                 typical_min_tokens = gr.Slider(
+                     label="typical min tokens (should probably stay between 1 and 256)",
+                     minimum=1,
+                     maximum=256,
+                     step=1,
+                     value=64
+                 )
+                 sample_cutoff = gr.Slider(
+                     label="sample cutoff",
+                     minimum=0.0,
+                     maximum=0.9,
+                     value=1.0,
+                     step=0.01
+                 )
+                 sampling_steps = gr.Slider(
+                     label="sampling steps",
+                     minimum=1,
+                     maximum=128,
+                     step=1,
+                     value=36
+                 )
+
+             seed = gr.Number(
+                 label="seed (0 for random)",
+                 value=0,
+                 precision=0,
+             )
+
+         # mask settings
+         with gr.Column():
+
+             model_choice = gr.Dropdown(
+                 label="model choice",
+                 choices=list(interface.available_models()),
+                 value=init_model_choice,
+                 visible=True
+             )
+
+             vamp_button = gr.Button("generate (vamp)!!!")
+
+             audio_outs = []
+             use_as_input_btns = []
+             for i in range(2):
+                 with gr.Column():
+                     audio_outs.append(gr.Audio(
+                         label=f"output audio {i+1}",
+                         interactive=False,
+                         type="numpy"
+                     ))
+                     use_as_input_btns.append(
+                         gr.Button(f"use as input (feedback)")
+                     )
+
+             thank_you = gr.Markdown("")
+
+             # download all the outputs
+             # download = gr.File(type="filepath", label="download outputs")
+
+     # mask preview change
+     for widget in (
+         periodic_p, n_mask_codebooks,
+         onset_mask_width, dropout
+     ):
+         widget.change(
+             fn=mask_preview,
+             inputs=[periodic_p, n_mask_codebooks,
+                     onset_mask_width, dropout],
+             outputs=[maskimg]
+         )
+
+     _inputs = [
+         input_audio,
+         sampletemp,
+         top_p,
+         periodic_p,
+         dropout,
+         stretch_factor,
+         onset_mask_width,
+         typical_filtering,
+         typical_mass,
+         typical_min_tokens,
+         seed,
+         model_choice,
+         n_mask_codebooks,
+         pitch_shift_amt,
+         sample_cutoff,
+         sampling_steps,
+         beat_mask_ms,
+         num_feedback_steps
+     ]
+
+     # connect widgets
+     vamp_button.click(
+         fn=vamp,
+         inputs=_inputs,
+         outputs=[audio_outs[0], audio_outs[1], maskimg],
+     )
+
+     api_vamp_button = gr.Button("api vamp", visible=True)
+     api_vamp_button.click(
+         fn=api_vamp,
+         inputs=[input_audio,
+                 sampletemp, top_p,
+                 periodic_p,
+                 dropout,
+                 stretch_factor,
+                 onset_mask_width,
+                 typical_filtering,
+                 typical_mass,
+                 typical_min_tokens,
+                 seed,
+                 model_choice,
+                 n_mask_codebooks,
+                 pitch_shift_amt,
+                 sample_cutoff,
+                 sampling_steps,
+                 beat_mask_ms,
+                 num_feedback_steps
+         ],
+         outputs=[audio_outs[0], audio_outs[1]],
+         api_name="vamp"
+     )
+
+     #NEW: HARP endpoint (new PyHARP API)
+     harp_model_card = ModelCard(
+         name="vampnet",
+         description="generating audio by filling in the blanks.",
+         author="hugo flores garcía et al. (descript/northwestern)",
+         tags=["sound", "generation"]
+     )
+
+     harp_input_components = [
+         gr.Audio(type="filepath", label="Input Audio").harp_required(True),
+         gr.Slider(label="Sample Temperature", minimum=0.1, maximum=10.0, value=1.0, step=0.001),
+         gr.Slider(label="Periodic Prompt", minimum=0, maximum=13, step=1, value=7),
+         gr.Slider(label="Mask Dropout", minimum=0.0, maximum=1.0, step=0.01, value=0.0),
+         gr.Slider(label="Compression Prompt", value=3, minimum=1, maximum=14, step=1),
+         gr.Dropdown(label="Model Choice", choices=list(interface.available_models()), value=init_model_choice),
+         gr.Slider(label="Time Stretch Factor", minimum=0, maximum=8, step=1, value=1),
+     ]
+
+     harp_output_components = [
+         gr.Audio(type="filepath", label="Generated Audio"),
+         gr.JSON(label="Generated Labels"),
+     ]
+
+     harp_app = build_endpoint(
+         model_card=harp_model_card,
+         input_components=harp_input_components,
+         output_components=harp_output_components,
+         process_fn=harp_vamp
+     )
+
+     with gr.Row():
+         gr.Markdown("### VST / HARP Plugin Controls")
+         for comp in harp_app.values():
+             comp.render()
+
+ try:
+     demo.queue()
+     demo.launch(share=True)
+ except KeyboardInterrupt:
+     shutil.rmtree("gradio-outputs", ignore_errors=True)
+     raise
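> For reference, the presets in `change_preset` above only adjust the arguments that eventually reach `interface.build_mask`. Here is a minimal sketch of what the "timbre transfer" preset amounts to when expressed with the programmatic API from the README (the model name and example file path reuse values already in this repo; sampling settings are left at their defaults):

```python
import audiotools as at
import vampnet

# Sketch: the "timbre transfer" preset from app.py, expressed programmatically
# (periodic_prompt=2, upper_codebook_mask=1, no dropout, no beat mask).
interface = vampnet.interface.Interface.default()
interface.load_finetuned("default")

sig = at.AudioSignal("assets/example.wav")
codes = interface.encode(sig)
mask = interface.build_mask(
    codes, sig,
    periodic_prompt=2,       # preset value from change_preset()
    upper_codebook_mask=1,   # preset value from change_preset()
)
out = interface.decode(interface.vamp(codes, mask, return_mask=False))
out.write("scratch/timbre_transfer.wav")
```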
assets/.DS_Store ADDED
Binary file (6.15 kB).
 
conf/c2f.yml ADDED
@@ -0,0 +1,14 @@
+ $include:
+ - conf/vampnet.yml
+
+ VampNet.n_codebooks: 14
+ VampNet.n_conditioning_codebooks: 4
+
+ VampNet.embedding_dim: 1280
+ VampNet.n_layers: 16
+ VampNet.n_heads: 20
+
+ AudioDataset.duration: 3.0
+
+
+ AudioDataset.loudness_cutoff: -40.0
conf/generated/cat/c2f.yml ADDED
@@ -0,0 +1,15 @@
+ $include:
+ - conf/lora/lora.yml
+ AudioDataset.duration: 3.0
+ AudioDataset.loudness_cutoff: -40.0
+ VampNet.embedding_dim: 1280
+ VampNet.n_codebooks: 14
+ VampNet.n_conditioning_codebooks: 4
+ VampNet.n_heads: 20
+ VampNet.n_layers: 16
+ fine_tune: true
+ fine_tune_checkpoint: ./models/vampnet/c2f.pth
+ save_path: ./runs/cat/c2f
+ train/AudioLoader.sources: &id001
+ - scratch/cat-audio
+ val/AudioLoader.sources: *id001
conf/generated/cat/coarse.yml ADDED
@@ -0,0 +1,8 @@
+ $include:
+ - conf/lora/lora.yml
+ fine_tune: true
+ fine_tune_checkpoint: ./models/vampnet/coarse.pth
+ save_path: ./runs/cat/coarse
+ train/AudioLoader.sources: &id001
+ - scratch/cat-audio
+ val/AudioLoader.sources: *id001
conf/generated/cat/interface.yml ADDED
@@ -0,0 +1,6 @@
+ AudioLoader.sources:
+ - - scratch/cat-audio
+ Interface.coarse2fine_ckpt: ./runs/cat/c2f/latest/vampnet/weights.pth
+ Interface.coarse_ckpt: ./runs/cat/coarse/latest/vampnet/weights.pth
+ Interface.codec_ckpt: ./models/vampnet/codec.pth
+ Interface.wavebeat_ckpt: ./models/wavebeat.pth
conf/generated/cat10/c2f.yml ADDED
@@ -0,0 +1,15 @@
+ $include:
+ - conf/lora/lora.yml
+ AudioDataset.duration: 3.0
+ AudioDataset.loudness_cutoff: -40.0
+ VampNet.embedding_dim: 1280
+ VampNet.n_codebooks: 14
+ VampNet.n_conditioning_codebooks: 4
+ VampNet.n_heads: 20
+ VampNet.n_layers: 16
+ fine_tune: true
+ fine_tune_checkpoint: ./models/vampnet/c2f.pth
+ save_path: ./runs/cat10/c2f
+ train/AudioLoader.sources: &id001
+ - scratch/cat-audio-10s
+ val/AudioLoader.sources: *id001
conf/generated/cat10/coarse.yml ADDED
@@ -0,0 +1,8 @@
+ $include:
+ - conf/lora/lora.yml
+ fine_tune: true
+ fine_tune_checkpoint: ./models/vampnet/coarse.pth
+ save_path: ./runs/cat10/coarse
+ train/AudioLoader.sources: &id001
+ - scratch/cat-audio-10s
+ val/AudioLoader.sources: *id001
conf/generated/cat10/interface.yml ADDED
@@ -0,0 +1,6 @@
+ AudioLoader.sources:
+ - - scratch/cat-audio-10s
+ Interface.coarse2fine_ckpt: ./runs/cat10/c2f/latest/vampnet/weights.pth
+ Interface.coarse_ckpt: ./runs/cat10/coarse/latest/vampnet/weights.pth
+ Interface.codec_ckpt: ./models/vampnet/codec.pth
+ Interface.wavebeat_ckpt: ./models/wavebeat.pth
conf/generated/ivo/c2f.yml ADDED
@@ -0,0 +1,15 @@
+ $include:
+ - conf/lora/lora.yml
+ AudioDataset.duration: 3.0
+ AudioDataset.loudness_cutoff: -40.0
+ VampNet.embedding_dim: 1280
+ VampNet.n_codebooks: 14
+ VampNet.n_conditioning_codebooks: 4
+ VampNet.n_heads: 20
+ VampNet.n_layers: 16
+ fine_tune: true
+ fine_tune_checkpoint: ./models/vampnet/c2f.pth
+ save_path: ./runs/ivo/c2f
+ train/AudioLoader.sources: &id001
+ - ./scratch/miguel/ivo/separated
+ val/AudioLoader.sources: *id001
conf/generated/ivo/coarse.yml ADDED
@@ -0,0 +1,8 @@
+ $include:
+ - conf/lora/lora.yml
+ fine_tune: true
+ fine_tune_checkpoint: ./models/vampnet/coarse.pth
+ save_path: ./runs/ivo/coarse
+ train/AudioLoader.sources: &id001
+ - ./scratch/miguel/ivo/separated
+ val/AudioLoader.sources: *id001
conf/generated/ivo/interface.yml ADDED
@@ -0,0 +1,6 @@
+ AudioLoader.sources:
+ - - ./scratch/miguel/ivo/separated
+ Interface.coarse2fine_ckpt: ./runs/ivo/c2f/latest/vampnet/weights.pth
+ Interface.coarse_ckpt: ./runs/ivo/coarse/latest/vampnet/weights.pth
+ Interface.codec_ckpt: ./models/vampnet/codec.pth
+ Interface.wavebeat_ckpt: ./models/wavebeat.pth
conf/generated/lazaro-ros-sep/c2f.yml ADDED
@@ -0,0 +1,15 @@
+ $include:
+ - conf/lora/lora.yml
+ AudioDataset.duration: 3.0
+ AudioDataset.loudness_cutoff: -40.0
+ VampNet.embedding_dim: 1280
+ VampNet.n_codebooks: 14
+ VampNet.n_conditioning_codebooks: 4
+ VampNet.n_heads: 20
+ VampNet.n_layers: 16
+ fine_tune: true
+ fine_tune_checkpoint: ./models/vampnet/c2f.pth
+ save_path: ./runs/lazaro-ros-sep/c2f
+ train/AudioLoader.sources: &id001
+ - ./scratch/miguel/lazaro-ros/separated
+ val/AudioLoader.sources: *id001
conf/generated/lazaro-ros-sep/coarse.yml ADDED
@@ -0,0 +1,8 @@
+ $include:
+ - conf/lora/lora.yml
+ fine_tune: true
+ fine_tune_checkpoint: ./models/vampnet/coarse.pth
+ save_path: ./runs/lazaro-ros-sep/coarse
+ train/AudioLoader.sources: &id001
+ - ./scratch/miguel/lazaro-ros/separated
+ val/AudioLoader.sources: *id001
conf/generated/lazaro-ros-sep/interface.yml ADDED
@@ -0,0 +1,6 @@
+ AudioLoader.sources:
+ - - ./scratch/miguel/lazaro-ros/separated
+ Interface.coarse2fine_ckpt: ./runs/lazaro-ros-sep/c2f/latest/vampnet/weights.pth
+ Interface.coarse_ckpt: ./runs/lazaro-ros-sep/coarse/latest/vampnet/weights.pth
+ Interface.codec_ckpt: ./models/vampnet/codec.pth
+ Interface.wavebeat_ckpt: ./models/wavebeat.pth
conf/generated/lazaro-ros/c2f.yml ADDED
@@ -0,0 +1,15 @@
+ $include:
+ - conf/lora/lora.yml
+ AudioDataset.duration: 3.0
+ AudioDataset.loudness_cutoff: -40.0
+ VampNet.embedding_dim: 1280
+ VampNet.n_codebooks: 14
+ VampNet.n_conditioning_codebooks: 4
+ VampNet.n_heads: 20
+ VampNet.n_layers: 16
+ fine_tune: true
+ fine_tune_checkpoint: ./models/vampnet/c2f.pth
+ save_path: ./runs/lazaro-ros/c2f
+ train/AudioLoader.sources: &id001
+ - ./scratch/miguel/lazaro-ros
+ val/AudioLoader.sources: *id001
conf/generated/lazaro-ros/coarse.yml ADDED
@@ -0,0 +1,8 @@
+ $include:
+ - conf/lora/lora.yml
+ fine_tune: true
+ fine_tune_checkpoint: ./models/vampnet/coarse.pth
+ save_path: ./runs/lazaro-ros/coarse
+ train/AudioLoader.sources: &id001
+ - ./scratch/miguel/lazaro-ros
+ val/AudioLoader.sources: *id001
conf/generated/lazaro-ros/interface.yml ADDED
@@ -0,0 +1,6 @@
+ AudioLoader.sources:
+ - - ./scratch/miguel/lazaro-ros
+ Interface.coarse2fine_ckpt: ./runs/lazaro-ros/c2f/latest/vampnet/weights.pth
+ Interface.coarse_ckpt: ./runs/lazaro-ros/coarse/latest/vampnet/weights.pth
+ Interface.codec_ckpt: ./models/vampnet/codec.pth
+ Interface.wavebeat_ckpt: ./models/wavebeat.pth
conf/generated/le-poisson-steve/c2f.yml ADDED
@@ -0,0 +1,15 @@
+ $include:
+ - conf/lora/lora.yml
+ AudioDataset.duration: 3.0
+ AudioDataset.loudness_cutoff: -40.0
+ VampNet.embedding_dim: 1280
+ VampNet.n_codebooks: 14
+ VampNet.n_conditioning_codebooks: 4
+ VampNet.n_heads: 20
+ VampNet.n_layers: 16
+ fine_tune: true
+ fine_tune_checkpoint: ./models/vampnet/c2f.pth
+ save_path: ./runs/le-poisson-steve/c2f
+ train/AudioLoader.sources: &id001
+ - scratch/steve
+ val/AudioLoader.sources: *id001
conf/generated/le-poisson-steve/coarse.yml ADDED
@@ -0,0 +1,8 @@
+ $include:
+ - conf/lora/lora.yml
+ fine_tune: true
+ fine_tune_checkpoint: ./models/vampnet/coarse.pth
+ save_path: ./runs/le-poisson-steve/coarse
+ train/AudioLoader.sources: &id001
+ - scratch/steve
+ val/AudioLoader.sources: *id001
conf/generated/le-poisson-steve/interface.yml ADDED
@@ -0,0 +1,6 @@
+ AudioLoader.sources:
+ - - scratch/steve
+ Interface.coarse2fine_ckpt: ./runs/le-poisson-steve/c2f/latest/vampnet/weights.pth
+ Interface.coarse_ckpt: ./runs/le-poisson-steve/coarse/latest/vampnet/weights.pth
+ Interface.codec_ckpt: ./models/vampnet/codec.pth
+ Interface.wavebeat_ckpt: ./models/wavebeat.pth
conf/generated/march-31/c2f.yml ADDED
@@ -0,0 +1,15 @@
+ $include:
+ - conf/lora/lora.yml
+ AudioDataset.duration: 3.0
+ AudioDataset.loudness_cutoff: -40.0
+ VampNet.embedding_dim: 1280
+ VampNet.n_codebooks: 14
+ VampNet.n_conditioning_codebooks: 4
+ VampNet.n_heads: 20
+ VampNet.n_layers: 16
+ fine_tune: true
+ fine_tune_checkpoint: ./models/vampnet/c2f.pth
+ save_path: ./runs/march-31/c2f
+ train/AudioLoader.sources: &id001
+ - sound-journal-march-31
+ val/AudioLoader.sources: *id001
conf/generated/march-31/coarse.yml ADDED
@@ -0,0 +1,8 @@
+ $include:
+ - conf/lora/lora.yml
+ fine_tune: true
+ fine_tune_checkpoint: ./models/vampnet/coarse.pth
+ save_path: ./runs/march-31/coarse
+ train/AudioLoader.sources: &id001
+ - sound-journal-march-31
+ val/AudioLoader.sources: *id001
conf/generated/march-31/interface.yml ADDED
@@ -0,0 +1,6 @@
+ AudioLoader.sources:
+ - - sound-journal-march-31
+ Interface.coarse2fine_ckpt: ./runs/march-31/c2f/latest/vampnet/weights.pth
+ Interface.coarse_ckpt: ./runs/march-31/coarse/latest/vampnet/weights.pth
+ Interface.codec_ckpt: ./models/vampnet/codec.pth
+ Interface.wavebeat_ckpt: ./models/wavebeat.pth
conf/generated/sax-new/c2f.yml ADDED
@@ -0,0 +1,15 @@
+ $include:
+ - conf/lora/lora.yml
+ AudioDataset.duration: 3.0
+ AudioDataset.loudness_cutoff: -40.0
+ VampNet.embedding_dim: 1280
+ VampNet.n_codebooks: 14
+ VampNet.n_conditioning_codebooks: 4
+ VampNet.n_heads: 20
+ VampNet.n_layers: 16
+ fine_tune: true
+ fine_tune_checkpoint: ./models/vampnet/c2f.pth
+ save_path: ./runs/sax-new/c2f
+ train/AudioLoader.sources: &id001
+ - ./scratch/miguel/saxophone-new/
+ val/AudioLoader.sources: *id001
conf/generated/sax-new/coarse.yml ADDED
@@ -0,0 +1,8 @@
+ $include:
+ - conf/lora/lora.yml
+ fine_tune: true
+ fine_tune_checkpoint: ./models/vampnet/coarse.pth
+ save_path: ./runs/sax-new/coarse
+ train/AudioLoader.sources: &id001
+ - ./scratch/miguel/saxophone-new/
+ val/AudioLoader.sources: *id001
conf/generated/sax-new/interface.yml ADDED
@@ -0,0 +1,6 @@
+ AudioLoader.sources:
+ - - ./scratch/miguel/saxophone-new/
+ Interface.coarse2fine_ckpt: ./runs/sax-new/c2f/latest/vampnet/weights.pth
+ Interface.coarse_ckpt: ./runs/sax-new/coarse/latest/vampnet/weights.pth
+ Interface.codec_ckpt: ./models/vampnet/codec.pth
+ Interface.wavebeat_ckpt: ./models/wavebeat.pth
conf/generated/saxophone/c2f.yml ADDED
@@ -0,0 +1,15 @@
+ $include:
+ - conf/lora/lora.yml
+ AudioDataset.duration: 3.0
+ AudioDataset.loudness_cutoff: -40.0
+ VampNet.embedding_dim: 1280
+ VampNet.n_codebooks: 14
+ VampNet.n_conditioning_codebooks: 4
+ VampNet.n_heads: 20
+ VampNet.n_layers: 16
+ fine_tune: true
+ fine_tune_checkpoint: ./models/vampnet/c2f.pth
+ save_path: ./runs/saxophone/c2f
+ train/AudioLoader.sources: &id001
+ - scratch/sounds
+ val/AudioLoader.sources: *id001
conf/generated/saxophone/coarse.yml ADDED
@@ -0,0 +1,8 @@
+ $include:
+ - conf/lora/lora.yml
+ fine_tune: true
+ fine_tune_checkpoint: ./models/vampnet/coarse.pth
+ save_path: ./runs/saxophone/coarse
+ train/AudioLoader.sources: &id001
+ - scratch/sounds
+ val/AudioLoader.sources: *id001
conf/generated/saxophone/interface.yml ADDED
@@ -0,0 +1,6 @@
+ AudioLoader.sources:
+ - - scratch/sounds
+ Interface.coarse2fine_ckpt: ./runs/saxophone/c2f/latest/vampnet/weights.pth
+ Interface.coarse_ckpt: ./runs/saxophone/coarse/latest/vampnet/weights.pth
+ Interface.codec_ckpt: ./models/vampnet/codec.pth
+ Interface.wavebeat_ckpt: ./models/wavebeat.pth
conf/interface.yml ADDED
@@ -0,0 +1,10 @@
+ Interface.coarse_ckpt: ./models/vampnet/coarse.pth
+ Interface.coarse2fine_ckpt: ./models/vampnet/c2f.pth
+ Interface.codec_ckpt: ./models/vampnet/codec.pth
+ Interface.coarse_chunk_size_s: 10
+ Interface.coarse2fine_chunk_size_s: 3
+ Interface.wavebeat_ckpt: ./models/wavebeat.pth
+
+ # AudioLoader.sources:
+ # - /media/CHONK/null
+
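> These `Interface.*` keys are argbind bindings onto the `Interface` constructor, so loading this config is roughly equivalent to the sketch below. The keyword names come straight from the YAML above; treat the exact constructor signature as an assumption, not a verified API.

```python
from vampnet.interface import Interface

# Rough equivalent of conf/interface.yml (keyword arguments assumed from the
# argbind keys above, not checked against vampnet/interface.py).
interface = Interface(
    coarse_ckpt="./models/vampnet/coarse.pth",
    coarse2fine_ckpt="./models/vampnet/c2f.pth",
    codec_ckpt="./models/vampnet/codec.pth",
    wavebeat_ckpt="./models/wavebeat.pth",
    coarse_chunk_size_s=10,
    coarse2fine_chunk_size_s=3,
)
```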
conf/lora/lora-s2s.yml ADDED
@@ -0,0 +1,27 @@
+ $include:
+ - conf/vampnet.yml
+
+ fine_tune: True
+
+ train/AudioDataset.n_examples: 100000000
+ val/AudioDataset.n_examples: 500
+
+
+ NoamScheduler.warmup: 500
+
+ batch_size: 7
+ num_workers: 7
+ save_iters: [2000, 4000, 10000, 20000, 40000, 100000]
+ sample_freq: 2000
+ val_freq: 1000
+
+ AdamW.lr: 0.0001
+
+ # lets us organize sound classes into folders and choose from those sound classes uniformly
+ AudioDataset.without_replacement: False
+ num_iters: 500000
+
+
+ # control signals to use as conditioning.
+ Sketch2SoundController.ctrl_keys: ['rmsq16',]
+
conf/lora/lora.yml ADDED
@@ -0,0 +1,22 @@
+ $include:
+ - conf/vampnet.yml
+
+ fine_tune: True
+
+ train/AudioDataset.n_examples: 100000000
+ val/AudioDataset.n_examples: 500
+
+
+ NoamScheduler.warmup: 500
+
+ batch_size: 7
+ num_workers: 7
+ save_iters: [2000, 4000, 10000, 20000, 40000, 100000]
+ sample_freq: 2000
+ val_freq: 1000
+
+ AdamW.lr: 0.0001
+
+ # lets us organize sound classes into folders and choose from those sound classes uniformly
+ AudioDataset.without_replacement: False
+ num_iters: 500000
conf/salad_bowl.yml ADDED
File without changes
conf/vampnet.yml ADDED
@@ -0,0 +1,49 @@
+
+ codec_ckpt: ./models/vampnet/codec.pth
+ save_path: ckpt
+
+ num_iters: 1000000000
+ save_iters: [10000, 50000, 100000, 300000, 500000]
+ val_idx: [0,1,2,3,4,5,6,7,8,9]
+ sample_freq: 10000
+ val_freq: 1000
+
+ batch_size: 8
+ num_workers: 10
+
+ # Optimization
+ amp: false
+
+ CrossEntropyLoss.label_smoothing: 0.1
+
+ AdamW.lr: 0.001
+
+ NoamScheduler.factor: 2.0
+ NoamScheduler.warmup: 10000
+
+ VampNet.vocab_size: 1024
+ VampNet.n_codebooks: 4
+ VampNet.n_conditioning_codebooks: 0
+ VampNet.r_cond_dim: 0
+ VampNet.noise_mode: mask
+ VampNet.embedding_dim: 1280
+ VampNet.n_layers: 20
+ VampNet.n_heads: 20
+ VampNet.flash_attn: false
+ VampNet.dropout: 0.1
+
+ AudioLoader.relative_path: ""
+ AudioDataset.loudness_cutoff: -30.0
+ AudioDataset.without_replacement: true
+ AudioLoader.shuffle: true
+
+ AudioDataset.duration: 10.0
+
+ train/AudioDataset.n_examples: 10000000
+ train/AudioLoader.sources:
+ - /media/CHONK/hugo/spotdl/audio-train
+
+ val/AudioDataset.n_examples: 2000
+ val/AudioLoader.sources:
+ - /media/CHONK/hugo/spotdl/audio-val
+
hello.py ADDED
@@ -0,0 +1,48 @@
+ import random
+ import vampnet
+ import audiotools as at
+
+ # load the default vampnet model
+ interface = vampnet.interface.Interface.default()
+
+ # list available finetuned models
+ finetuned_model_choices = interface.available_models()
+ print(f"available finetuned models: {finetuned_model_choices}")
+
+ # pick a random finetuned model
+ model_choice = random.choice(finetuned_model_choices)
+ print(f"choosing model: {model_choice}")
+
+ # or pick a specific finetuned model
+ print(f"actually, forcing model: default")
+ model_choice = "default"
+
+ # load a finetuned model
+ interface.load_finetuned(model_choice)
+
+ # load an example audio file
+ signal = at.AudioSignal("assets/example.wav")
+
+ # get the tokens for the audio
+ codes = interface.encode(signal)
+
+ # build a mask for the audio
+ mask = interface.build_mask(
+     codes, signal,
+     periodic_prompt=13,
+     upper_codebook_mask=3,
+ )
+
+ # generate the output tokens
+ output_tokens = interface.vamp(
+     codes, mask, return_mask=False,
+     temperature=1.0,
+     typical_filtering=False,
+     debug=True
+ )
+
+ # convert them to a signal
+ output_signal = interface.decode(output_tokens)
+
+ # save the output signal
+ output_signal.write("scratch/output.wav")
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ torch
+ argbind>=0.3.2
+ numpy==1.23
+ loralib
+ wavebeat @ git+https://github.com/hugofloresgarcia/wavebeat
+ lac @ git+https://github.com/hugofloresgarcia/lac.git
+ descript-audiotools @ git+https://github.com/hugofloresgarcia/audiotools.git
+ -e git+https://github.com/audacitorch/pyharp.git@develop#egg=pyharp
+ torch_pitch_shift
+ gradio
+ pydantic==2.10.6
scratch/convert_to_wav.sh ADDED
@@ -0,0 +1 @@
+ for f in *.mp3; do ffmpeg -i "$f" "${f%.mp3}.wav"; done
scratch/rms_mask.txt ADDED
@@ -0,0 +1,14 @@
+ 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1
+ 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1
+ 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1
+ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
+ 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
scratch/separate_folder.sh ADDED
@@ -0,0 +1 @@
+ for f in *.mp3; do demucs "$f" --two-stems=vocals; done
scripts/exp/eval.py ADDED
@@ -0,0 +1,110 @@
+ from pathlib import Path
+ import os
+ from functools import partial
+
+ from frechet_audio_distance import FrechetAudioDistance
+ import pandas
+ import argbind
+ import torch
+ from tqdm import tqdm
+
+ import audiotools
+ from audiotools import AudioSignal
+
+ @argbind.bind(without_prefix=True)
+ def eval(
+     exp_dir: str = None,
+     baseline_key: str = "baseline",
+     audio_ext: str = ".wav",
+ ):
+     assert exp_dir is not None
+     exp_dir = Path(exp_dir)
+     assert exp_dir.exists(), f"exp_dir {exp_dir} does not exist"
+
+     # set up our metrics
+     # sisdr_loss = audiotools.metrics.distance.SISDRLoss()
+     # stft_loss = audiotools.metrics.spectral.MultiScaleSTFTLoss()
+     mel_loss = audiotools.metrics.spectral.MelSpectrogramLoss()
+     frechet = FrechetAudioDistance(
+         use_pca=False,
+         use_activation=False,
+         verbose=True,
+         audio_load_worker=4,
+     )
+     frechet.model.to("cuda" if torch.cuda.is_available() else "cpu")
+
+     # figure out what conditions we have
+     conditions = [d.name for d in exp_dir.iterdir() if d.is_dir()]
+
+     assert baseline_key in conditions, f"baseline_key {baseline_key} not found in {exp_dir}"
+     conditions.remove(baseline_key)
+
+     print(f"Found {len(conditions)} conditions in {exp_dir}")
+     print(f"conditions: {conditions}")
+
+     baseline_dir = exp_dir / baseline_key
+     baseline_files = sorted(list(baseline_dir.glob(f"*{audio_ext}")), key=lambda x: int(x.stem))
+
+     metrics = []
+     for condition in tqdm(conditions):
+         cond_dir = exp_dir / condition
+         cond_files = sorted(list(cond_dir.glob(f"*{audio_ext}")), key=lambda x: int(x.stem))
+
+         print(f"computing fad for {baseline_dir} and {cond_dir}")
+         frechet_score = frechet.score(baseline_dir, cond_dir)
+
+         # make sure we have the same number of files
+         num_files = min(len(baseline_files), len(cond_files))
+         baseline_files = baseline_files[:num_files]
+         cond_files = cond_files[:num_files]
+         assert len(list(baseline_files)) == len(list(cond_files)), f"number of files in {baseline_dir} and {cond_dir} do not match. {len(list(baseline_files))} vs {len(list(cond_files))}"
+
+         def process(baseline_file, cond_file):
+             # make sure the files match (same name)
+             assert baseline_file.stem == cond_file.stem, f"baseline file {baseline_file} and cond file {cond_file} do not match"
+
+             # load the files
+             baseline_sig = AudioSignal(str(baseline_file))
+             cond_sig = AudioSignal(str(cond_file))
+
+             cond_sig.resample(baseline_sig.sample_rate)
+             cond_sig.truncate_samples(baseline_sig.length)
+
+             # if our condition is inpainting, we need to trim the conditioning off
+             if "inpaint" in condition:
+                 ctx_amt = float(condition.split("_")[-1])
+                 ctx_samples = int(ctx_amt * baseline_sig.sample_rate)
+                 print(f"found inpainting condition. trimming off {ctx_samples} samples from {cond_file} and {baseline_file}")
+                 cond_sig.trim(ctx_samples, ctx_samples)
+                 baseline_sig.trim(ctx_samples, ctx_samples)
+
+             return {
+                 # "sisdr": -sisdr_loss(baseline_sig, cond_sig).item(),
+                 # "stft": stft_loss(baseline_sig, cond_sig).item(),
+                 "mel": mel_loss(baseline_sig, cond_sig).item(),
+                 "frechet": frechet_score,
+                 # "visqol": vsq,
+                 "condition": condition,
+                 "file": baseline_file.stem,
+             }
+
+         print(f"processing {len(baseline_files)} files in {baseline_dir} and {cond_dir}")
+         metrics.extend(tqdm(map(process, baseline_files, cond_files), total=len(baseline_files)))
+
+     metric_keys = [k for k in metrics[0].keys() if k not in ("condition", "file")]
+
+
+     for mk in metric_keys:
+         stat = pandas.DataFrame(metrics)
+         stat = stat.groupby(['condition'])[mk].agg(['mean', 'count', 'std'])
+         stat.to_csv(exp_dir / f"stats-{mk}.csv")
+
+     df = pandas.DataFrame(metrics)
+     df.to_csv(exp_dir / "metrics-all.csv", index=False)
+
+
+ if __name__ == "__main__":
+     args = argbind.parse_args()
+
+     with argbind.scope(args):
+         eval()
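eval.py expects exp_dir to contain one sub-folder per condition (including one named by baseline_key), each holding audio files whose stems are integers, which is the layout written by scripts/exp/experiment.py. A minimal pre-flight check, assuming a placeholder ./samples directory; not part of the commit:

    # minimal sketch: sanity-check the directory layout eval.py expects
    from pathlib import Path

    exp_dir = Path("./samples")  # placeholder experiment directory
    conditions = [d for d in exp_dir.iterdir() if d.is_dir()]
    assert any(d.name == "baseline" for d in conditions), "eval.py needs a 'baseline' folder"
    for d in conditions:
        wavs = sorted(d.glob("*.wav"), key=lambda p: int(p.stem))  # stems must be integers
        print(f"{d.name}: {len(wavs)} files")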
scripts/exp/experiment.py ADDED
@@ -0,0 +1,254 @@
+ from pathlib import Path
+ import random
+ from typing import List
+ import tempfile
+ import subprocess
+
+ import argbind
+ from tqdm import tqdm
+ import torch
+
+ from vampnet.interface import Interface
+ from vampnet import mask as pmask
+ import audiotools as at
+
+ Interface: Interface = argbind.bind(Interface)
+
+
+
+ def calculate_bitrate(
+     interface, num_codebooks,
+     downsample_factor
+ ):
+     bit_width = 10
+     sr = interface.codec.sample_rate
+     hop = interface.codec.hop_size
+     rate = (sr / hop) * ((bit_width * num_codebooks) / downsample_factor)
+     return rate
+
+ def baseline(sig, interface):
+     return interface.preprocess(sig)
+
+ def reconstructed(sig, interface):
+     return interface.decode(
+         interface.encode(sig)
+     )
+
+ def coarse2fine(sig, interface):
+     z = interface.encode(sig)
+     z = z[:, :interface.c2f.n_conditioning_codebooks, :]
+
+     z = interface.coarse_to_fine(z)
+     return interface.decode(z)
+
+ class CoarseCond:
+
+     def __init__(self, num_conditioning_codebooks, downsample_factor):
+         self.num_conditioning_codebooks = num_conditioning_codebooks
+         self.downsample_factor = downsample_factor
+
+     def __call__(self, sig, interface):
+         z = interface.encode(sig)
+         mask = pmask.full_mask(z)
+         mask = pmask.codebook_unmask(mask, self.num_conditioning_codebooks)
+         mask = pmask.periodic_mask(mask, self.downsample_factor)
+
+         zv = interface.coarse_vamp(z, mask)
+         zv = interface.coarse_to_fine(zv)
+         return interface.decode(zv)
+
+ def opus(sig, interface, bitrate=128):
+     sig = interface.preprocess(sig)
+
+     with tempfile.NamedTemporaryFile(suffix=".wav") as f:
+         sig.write(f.name)
+
+         opus_name = Path(f.name).with_suffix(".opus")
+         # convert to opus
+         cmd = [
+             "ffmpeg", "-y", "-i", f.name,
+             "-c:a", "libopus",
+             "-b:a", f"{bitrate}",
+             opus_name
+         ]
+         subprocess.run(cmd, check=True)
+
+         # convert back to wav
+         output_name = Path(f"{f.name}-opus").with_suffix(".wav")
+         cmd = [
+             "ffmpeg", "-y", "-i", opus_name,
+             output_name
+         ]
+
+         subprocess.run(cmd, check=True)
+
+         sig = at.AudioSignal(
+             output_name,
+             sample_rate=sig.sample_rate
+         )
+     return sig
+
+ def mask_ratio_1_step(ratio=1.0):
+     def wrapper(sig, interface):
+         z = interface.encode(sig)
+         mask = pmask.linear_random(z, ratio)
+         zv = interface.coarse_vamp(
+             z,
+             mask,
+             sampling_steps=1,
+         )
+
+         return interface.decode(zv)
+     return wrapper
+
+ def num_sampling_steps(num_steps=1):
+     def wrapper(sig, interface: Interface):
+         z = interface.encode(sig)
+         mask = pmask.periodic_mask(z, 16)
+         zv = interface.coarse_vamp(
+             z,
+             mask,
+             sampling_steps=num_steps,
+         )
+
+         zv = interface.coarse_to_fine(zv)
+         return interface.decode(zv)
+     return wrapper
+
+ def beat_mask(ctx_time):
+     def wrapper(sig, interface):
+         beat_mask = interface.make_beat_mask(
+             sig,
+             before_beat_s=ctx_time/2,
+             after_beat_s=ctx_time/2,
+             invert=True
+         )
+
+         z = interface.encode(sig)
+
+         zv = interface.coarse_vamp(
+             z, beat_mask
+         )
+
+         zv = interface.coarse_to_fine(zv)
+         return interface.decode(zv)
+     return wrapper
+
+ def inpaint(ctx_time):
+     def wrapper(sig, interface: Interface):
+         z = interface.encode(sig)
+         mask = pmask.inpaint(z, interface.s2t(ctx_time), interface.s2t(ctx_time))
+
+         zv = interface.coarse_vamp(z, mask)
+         zv = interface.coarse_to_fine(zv)
+
+         return interface.decode(zv)
+     return wrapper
+
+ def token_noise(noise_amt):
+     def wrapper(sig, interface: Interface):
+         z = interface.encode(sig)
+         mask = pmask.random(z, noise_amt)
+         z = torch.where(
+             mask,
+             torch.randint_like(z, 0, interface.coarse.vocab_size),
+             z
+         )
+         return interface.decode(z)
+     return wrapper
+
+ EXP_REGISTRY = {}
+
+ EXP_REGISTRY["gen-compression"] = {
+     "baseline": baseline,
+     "reconstructed": reconstructed,
+     "coarse2fine": coarse2fine,
+     **{
+         f"{n}_codebooks_downsampled_{x}x": CoarseCond(num_conditioning_codebooks=n, downsample_factor=x)
+         for (n, x) in (
+             (1, 1), # 1 codebook, no downsampling
+             (4, 4), # 4 codebooks, downsampled 4x
+             (4, 16), # 4 codebooks, downsampled 16x
+             (4, 32), # 4 codebooks, downsampled 32x
+         )
+     },
+     **{
+         f"token_noise_{x}": mask_ratio_1_step(ratio=x)
+         for x in [0.25, 0.5, 0.75]
+     },
+
+ }
+
+
+ EXP_REGISTRY["sampling-steps"] = {
+     # "codec": reconstructed,
+     **{f"steps_{n}": num_sampling_steps(n) for n in [1, 4, 12, 36, 64, 72]},
+ }
+
+
+ EXP_REGISTRY["musical-sampling"] = {
+     **{f"beat_mask_{t}": beat_mask(t) for t in [0.075]},
+     **{f"inpaint_{t}": inpaint(t) for t in [0.5, 1.0,]}, # multiply these by 2 (they go left and right)
+ }
+
+ @argbind.bind(without_prefix=True)
+ def main(
+     sources=[
+         "/media/CHONK/hugo/spotdl/val",
+     ],
+     output_dir: str = "./samples",
+     max_excerpts: int = 2000,
+     exp_type: str = "gen-compression",
+     seed: int = 0,
+     ext: List[str] = [".mp3"],
+ ):
+     at.util.seed(seed)
+     interface = Interface()
+
+     output_dir = Path(output_dir)
+     output_dir.mkdir(exist_ok=True, parents=True)
+
+     from audiotools.data.datasets import AudioLoader, AudioDataset
+
+     loader = AudioLoader(sources=sources, shuffle_state=seed, ext=ext)
+     dataset = AudioDataset(loader,
+         sample_rate=interface.codec.sample_rate,
+         duration=interface.coarse.chunk_size_s,
+         n_examples=max_excerpts,
+         without_replacement=True,
+     )
+
+     if exp_type in EXP_REGISTRY:
+         SAMPLE_CONDS = EXP_REGISTRY[exp_type]
+     else:
+         raise ValueError(f"Unknown exp_type {exp_type}")
+
+
+     indices = list(range(max_excerpts))
+     random.shuffle(indices)
+     for i in tqdm(indices):
+         # if all our files are already there, skip
+         done = []
+         for name in SAMPLE_CONDS:
+             o_dir = Path(output_dir) / name
+             done.append((o_dir / f"{i}.wav").exists())
+         if all(done):
+             continue
+
+         sig = dataset[i]["signal"]
+         results = {
+             name: cond(sig, interface).cpu()
+             for name, cond in SAMPLE_CONDS.items()
+         }
+
+         for name, sig in results.items():
+             o_dir = Path(output_dir) / name
+             o_dir.mkdir(exist_ok=True, parents=True)
+
+             sig.write(o_dir / f"{i}.wav")
+
+ if __name__ == "__main__":
+     args = argbind.parse_args()
+
+     with argbind.scope(args):
+         main()
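Each EXP_REGISTRY entry maps an output folder name to a callable that takes (sig, interface) and returns a resynthesized AudioSignal; experiment.py writes one wav per excerpt into each folder, and eval.py later compares every folder against the baseline. A hypothetical extra condition, sketched as it might be added inside experiment.py (the mask period and registry name are illustrative, not part of the commit):

    # hypothetical condition: keep every 8th timestep and regenerate the rest
    def periodic_8x(sig, interface):
        z = interface.encode(sig)
        mask = pmask.periodic_mask(z, 8)
        zv = interface.coarse_vamp(z, mask)
        zv = interface.coarse_to_fine(zv)
        return interface.decode(zv)

    EXP_REGISTRY["my-conditions"] = {"baseline": baseline, "periodic_8x": periodic_8x}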
scripts/exp/export.py ADDED
@@ -0,0 +1,75 @@
+ from pathlib import Path
+
+ import shutil
+ import argparse
+ from vampnet import DEFAULT_HF_MODEL_REPO
+ from huggingface_hub import create_repo, repo_exists, HfApi
+
+
+
+ parser = argparse.ArgumentParser(description="Export the fine-tuned model to the repo")
+ parser.add_argument(
+     "--name", type=str, default="lazaro-ros-sep",
+     help="name of the fine-tuned model to export"
+ )
+ parser.add_argument(
+     "--model", type=str, default="latest",
+     help="model version to export. check runs/<name> for available versions"
+ )
+ parser.add_argument(
+     "--repo", type=str, default=DEFAULT_HF_MODEL_REPO,
+     help="name of the repo to export to"
+ )
+
+ args = parser.parse_args()
+ name = args.name
+ version = args.model
+
+ ##
+ print(f"~~~~~~~~~~~ vampnet export! ~~~~~~~~~~~~")
+ print(f"exporting {name} version {version} to {args.repo}\n")
+
+ run_dir = Path(f"runs/{name}")
+ repo_dir = Path("models/vampnet")
+
+ # create our repo
+ new_repo = False
+ if not repo_exists(args.repo):
+     print(f"repo {args.repo} does not exist, creating it")
+     print(f"creating a repo at {args.repo}")
+     create_repo(args.repo)
+     new_repo = True
+
+ paths = []
+ for part in ("coarse", "c2f"):
+     outdir = repo_dir / "loras" / name
+     outdir.mkdir(parents=True, exist_ok=True)
+     outpath = outdir / f"{part}.pth"
+     path = run_dir / part / version / "vampnet" / "weights.pth"
+     # path.rename(outpath)
+     shutil.copy(path, outpath)
+     paths.append(outpath)
+     print(f"copied {path} to {outpath}")
+
+ print(f"uploading files to {args.repo}")
+ # upload files to the repo
+
+ # if it's a new repo, let's add the default models too
+ if new_repo:
+     paths.extend([repo_dir / "c2f.pth", repo_dir / "coarse.pth", repo_dir / "codec.pth", repo_dir / "wavebeat.pth"])
+
+ api = HfApi()
+
+ for path in paths:
+     path_in_repo = str(path.relative_to(repo_dir))
+     print(f"uploading {path} to {args.repo}/{path_in_repo}")
+     api.upload_file(
+         path_or_fileobj=path,
+         path_in_repo=path_in_repo,
+         repo_id=args.repo,
+         token=True,
+         commit_message=f"uploading {path_in_repo}",
+     )
+
+
+ print("done!!! >::0")
scripts/exp/fine_tune.py ADDED
@@ -0,0 +1,87 @@
+ import argbind
+ from pathlib import Path
+ import yaml
+ from typing import List
+
+
+
+
+ """example output: (yaml)
+
+ """
+
+ @argbind.bind(without_prefix=True, positional=True)
+ def fine_tune(audio_files_or_folders: List[str], name: str):
+
+     conf_dir = Path("conf")
+     assert conf_dir.exists(), "conf directory not found. are you in the vampnet directory?"
+
+     conf_dir = conf_dir / "generated"
+     conf_dir.mkdir(exist_ok=True)
+
+     finetune_dir = conf_dir / name
+     finetune_dir.mkdir(exist_ok=True)
+
+     finetune_c2f_conf = {
+         "$include": ["conf/lora/lora.yml"],
+         "fine_tune": True,
+         "train/AudioLoader.sources": audio_files_or_folders,
+         "val/AudioLoader.sources": audio_files_or_folders,
+         "VampNet.n_codebooks": 14,
+         "VampNet.n_conditioning_codebooks": 4,
+         "VampNet.embedding_dim": 1280,
+         "VampNet.n_layers": 16,
+         "VampNet.n_heads": 20,
+         "AudioDataset.duration": 3.0,
+         "AudioDataset.loudness_cutoff": -40.0,
+         "save_path": f"./runs/{name}/c2f",
+         "fine_tune_checkpoint": "./models/vampnet/c2f.pth"
+     }
+
+     finetune_coarse_conf = {
+         "$include": ["conf/lora/lora.yml"],
+         "fine_tune": True,
+         "train/AudioLoader.sources": audio_files_or_folders,
+         "val/AudioLoader.sources": audio_files_or_folders,
+         "save_path": f"./runs/{name}/coarse",
+         "fine_tune_checkpoint": "./models/vampnet/coarse.pth"
+     }
+
+     interface_conf = {
+         "Interface.coarse_ckpt": f"./runs/{name}/coarse/latest/vampnet/weights.pth",
+
+         "Interface.coarse2fine_ckpt": f"./runs/{name}/c2f/latest/vampnet/weights.pth",
+         "Interface.wavebeat_ckpt": "./models/wavebeat.pth",
+
+         "Interface.codec_ckpt": "./models/vampnet/codec.pth",
+         "AudioLoader.sources": [audio_files_or_folders],
+     }
+
+     # save the confs
+     with open(finetune_dir / "c2f.yml", "w") as f:
+         yaml.dump(finetune_c2f_conf, f)
+
+     with open(finetune_dir / "coarse.yml", "w") as f:
+         yaml.dump(finetune_coarse_conf, f)
+
+     with open(finetune_dir / "interface.yml", "w") as f:
+         yaml.dump(interface_conf, f)
+
+
+     # print(f"generated confs in {finetune_dir}.
+     # run training jobs with `python scripts/exp/train.py --args.load {finetune_dir}/<c2f/coarse>.yml` ")
+
+     print(f"generated confs in {finetune_dir}.")
+     print()
+     print(f"you'll need to run two training jobs, though they can run in parallel on separate GPUs.")
+     print(f"run the coarse job with \n\tpython scripts/exp/train.py --args.load {finetune_dir}/coarse.yml\n")
+     print(f"run the c2f job with \n\tpython scripts/exp/train.py --args.load {finetune_dir}/c2f.yml\n")
+ if __name__ == "__main__":
+     args = argbind.parse_args()
+
+     with argbind.scope(args):
+         fine_tune()
+
+
+
+
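Because fine_tune is bound with positional=True, the script is meant to be invoked as "python scripts/exp/fine_tune.py &lt;audio files or folders&gt; &lt;name&gt;", and it writes coarse.yml, c2f.yml, and interface.yml under conf/generated/&lt;name&gt;/. A minimal sketch of reading a generated config back, with a placeholder fine-tune name; not part of the commit:

    # minimal sketch: inspect a generated interface config ("my-instrument" is a placeholder)
    import yaml
    from pathlib import Path

    conf = yaml.safe_load((Path("conf/generated") / "my-instrument" / "interface.yml").read_text())
    print(conf["Interface.coarse_ckpt"])  # ./runs/my-instrument/coarse/latest/vampnet/weights.pth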