Examples
Create a tokenizer
A basic example showing how to create a tokenizer, with a selection of custom parameters.
from miditok import REMI, TokenizerConfig # here we choose to use REMI
# Custom tokenizer parameters, kept in a dict so they can be unpacked as
# keyword arguments of the configuration
TOKENIZER_PARAMS = {
    "pitch_range": (21, 109),
    "beat_res": {(0, 4): 8, (4, 12): 4},
    "num_velocities": 32,
    "special_tokens": ["PAD", "BOS", "EOS", "MASK"],
    "use_chords": True,
    "use_rests": False,
    "use_tempos": True,
    "use_time_signatures": False,
    "use_programs": False,
    "num_tempos": 32,  # number of tempo bins
    "tempo_range": (40, 250),  # (min, max)
}

# Build the configuration first, then the tokenizer from that configuration
config = TokenizerConfig(**TOKENIZER_PARAMS)
tokenizer = REMI(config)
MIDI - Tokens conversion
Here we convert a MIDI to tokens, then decode them back to a MIDI.
from pathlib import Path
# Tokenize a MIDI file: calling the tokenizer automatically detects Score
# objects, paths and tokens
midi_path = Path("to", "your_midi.mid")
tokens = tokenizer(midi_path)

# Decode the tokens back to a MIDI and save it
# (MidiTok can handle PyTorch/Numpy/Tensorflow tensors)
generated_midi = tokenizer(tokens)
decoded_path = Path("to", "decoded_midi.mid")
generated_midi.dump_midi(decoded_path)
Trains a tokenizer with BPE
Here we train the tokenizer with Byte Pair Encoding (BPE). BPE reduces the length of the token sequences, which in turn improves model efficiency, while also improving the quality of the results/model performance.
from miditok import REMI
from pathlib import Path
# Instantiate the tokenizer with the default parameters (constants.py)
tokenizer = REMI()

# List every .mid file found under the dataset directory
dataset_dir = Path("path", "to", "dataset")
midi_paths = list(dataset_dir.glob("**/*.mid"))

# Build the vocabulary with BPE
tokenizer.learn_bpe(vocab_size=30000, files_paths=midi_paths)
Creates a Dataset and collator for training
Creates a Dataset and a collator to be used with a PyTorch DataLoader to train a model
from miditok import REMI
from miditok.pytorch_data import DatasetTok, DataCollator
from torch.utils.data import DataLoader

# `tokenizer` and `Path` are the ones created/imported in the previous examples
midi_paths = list(Path("path", "to", "dataset").glob("**/*.mid"))

# Dataset built from the MIDI files, with bounds on the sequence lengths
dataset = DatasetTok(
    files_paths=midi_paths,
    min_seq_len=100,
    max_seq_len=1024,
    tokenizer=tokenizer,
)

# The collator receives the ids of the PAD, BOS and EOS special tokens
collator = DataCollator(
    tokenizer["PAD_None"], tokenizer["BOS_None"], tokenizer["EOS_None"]
)
data_loader = DataLoader(dataset=dataset, collate_fn=collator)

# Using the data loader in the training loop
for batch in data_loader:
    print("Train your model on this batch...")
Tokenize a dataset
Here we tokenize a whole dataset into JSON files storing the token ids. We also perform data augmentation on the pitch, velocity and duration dimensions.
from miditok import REMI
from miditok.data_augmentation import augment_midi_dataset
from pathlib import Path
# Instantiate the tokenizer with the default parameters (constants.py)
tokenizer = REMI()

# List every .mid file found under the dataset directory
dataset_dir = Path("path", "to", "dataset")
midi_paths = list(dataset_dir.glob("**/*.mid"))
# A validation method to discard MIDIs we do not want
# It can also be used for custom pre-processing, for instance if you want to merge
# some tracks before tokenizing a MIDI file
def midi_valid(midi) -> bool:
    """Return whether a MIDI only uses time signatures with 4 beats per bar.

    :param midi: object exposing ``time_signature_changes``, an iterable of
        items that each have a ``numerator`` attribute.
    :return: ``False`` as soon as one time signature has a numerator
        different from 4 (i.e. not 4/*), ``True`` otherwise — including when
        there is no time signature change at all.
    """
    return all(ts.numerator == 4 for ts in midi.time_signature_changes)
# Performs data augmentation on one pitch octave (up and down), velocities and
# durations
# Location handed to `out_path` for the augmented files
midi_aug_path = Path("to", "new", "location", "augmented")
augment_midi_dataset(
    midi_paths,
    pitch_offsets=[-12, 12],
    velocity_offsets=[-4, 5],
    duration_offsets=[-0.5, 1],
    out_path=midi_aug_path,
)
# Tokenize the listed files into the tokens directory.
# NOTE(review): `midi_valid` is passed positionally as the third argument —
# confirm it lands on the validation-callback parameter of your MidiTok version.
tokens_path = Path("path", "to", "tokens")
tokenizer.tokenize_midi_dataset(  # 2 velocity and 1 duration values
    midi_paths,
    tokens_path,
    midi_valid,
)