import os

import datasets
import h5py
import numpy as np
import pandas as pd
import torch
import torchaudio

from data_util.audioset_classes import as_strong_train_classes

## Transforms with a similar style to https://github.com/descriptinc/audiotools/blob/master/audiotools/data/transforms.py

logger = datasets.logging.get_logger(__name__)

def target_transform(sample):
    """Drop the weak-label fields ('labels' and 'label_ids') from the batch."""
    del sample["labels"]
    del sample["label_ids"]
    return sample

def strong_label_transform(sample, strong_label_encoder=None):
    assert strong_label_encoder is not None
    events = pd.DataFrame(sample['events'][0])
    events = events[events['event_label'].isin(set(as_strong_train_classes))]
    strong = strong_label_encoder.encode_strong_df(events).T
    sample["strong"] = [strong]
    sample["event_count"] = [strong.sum(1)]
    # encode ground truth events as string - we will use this for evaluation
    sample["gt_string"] = ["++".join([";;".join([str(e[0]), str(e[1]), e[2]]) for e in
                                      zip(sample['events'][0]['onset'], sample['events'][0]['offset'],
                                          sample['events'][0]['event_label'])])]
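    # Illustrative format (values are hypothetical): "0.0;;2.5;;Speech++1.3;;4.0;;Dog",
    # i.e. "onset;;offset;;label" per event, events joined by "++".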
    del sample['events']
    return sample

class AddPseudoLabelsTransform:
    def __init__(self, pseudo_labels_file):
        self.pseudo_labels_file = pseudo_labels_file
        if self.pseudo_labels_file is not None:
            # fetch dict of positions for each example
            self.ex2pseudo_idx = {}
            with h5py.File(self.pseudo_labels_file, "r") as f:
                for i, fname in enumerate(f["filenames"]):
                    self.ex2pseudo_idx[fname.decode("UTF-8")] = i
        self._opened_pseudo_hdf5 = None

    @property
    def pseudo_hdf5_file(self):
        # open the HDF5 file lazily on first access
        if self._opened_pseudo_hdf5 is None:
            self._opened_pseudo_hdf5 = h5py.File(self.pseudo_labels_file, "r")
        return self._opened_pseudo_hdf5

    def add_pseudo_label_transform(self, sample):
        # look up each clip by its filename without the ".mp3" suffix
        indices = [self.ex2pseudo_idx[fn.removesuffix(".mp3")] for fn in sample['filename']]
        pseudo_strong = [torch.from_numpy(np.stack(self.pseudo_hdf5_file["strong_logits"][index])).float()
                         for index in indices]
        pseudo_strong = [torch.sigmoid(pseudo_strong[i]) for i in range(len(pseudo_strong))]
        sample['pseudo_strong'] = pseudo_strong
        return sample
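
# Usage sketch (assumed wiring; path and filename are illustrative). The pseudo-label
# HDF5 file is expected to hold a "filenames" dataset and a parallel "strong_logits"
# dataset, as read above.
#
#     pseudo = AddPseudoLabelsTransform("pseudo_labels.hdf5")
#     batch = pseudo.add_pseudo_label_transform({"filename": ["Y-0Gj8-vB1q4.mp3"]})
#     # batch["pseudo_strong"] is a list of per-clip sigmoid probability tensors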

class SequentialTransform:
    """Apply a sequence of transforms to a batch."""

    def __init__(self, transforms):
        """
        Args:
            transforms: list of transforms to apply
        """
        self.transforms = transforms

    def append(self, transform):
        self.transforms.append(transform)

    def __call__(self, batch):
        for t in self.transforms:
            batch = t(batch)
        return batch
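
# Composition sketch (assumed wiring; the strong-label `encoder` object is hypothetical):
#
#     from functools import partial
#
#     transform = SequentialTransform([
#         Mp3DecodeTransform(sample_rate=32000, max_length=10.0),
#         partial(strong_label_transform, strong_label_encoder=encoder),
#         target_transform,
#     ])
#     dataset.set_transform(transform)  # e.g. on a `datasets.Dataset` with batched access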

class Mp3DecodeTransform:
    def __init__(
        self,
        mp3_bytes_key="mp3_bytes",
        audio_key="audio",
        sample_rate=32000,
        max_length=10.0,
        min_length=None,
        random_sample_crop=True,
        allow_resample=True,
        resampling_method="sinc_interp_kaiser",
        keep_mp3_bytes=False,
        debug_info_key=None,
    ):
        """Decode mp3 bytes to an audio waveform.

        Args:
            mp3_bytes_key (str, optional): The key holding the mp3 bytes in the input batch. Defaults to "mp3_bytes".
            audio_key (str, optional): The key under which the decoded audio is stored in the output batch. Defaults to "audio".
            sample_rate (int, optional): The expected output sample rate. Defaults to 32000.
            max_length (int, float, optional): The maximum output audio length, in seconds if float, otherwise in samples. Defaults to 10.0.
            min_length (int, float, optional): The minimum output audio length, in seconds if float, otherwise in samples; shorter waveforms are zero-padded. Defaults to max_length.
            random_sample_crop (bool, optional): Randomly crop the audio to max_length if it is longer; otherwise return the first crop. Defaults to True.
            allow_resample (bool, optional): Resample the signal if the sampling rates don't match. Defaults to True.
            resampling_method (str, optional): Resampling method passed to torchaudio.transforms.Resample. Defaults to "sinc_interp_kaiser".
            keep_mp3_bytes (bool, optional): Keep the original mp3 bytes in the output dict. Defaults to False.
            debug_info_key (str, optional): Batch key (e.g. a filename) used in warnings and error messages. Defaults to None.

        Raises:
            Exception: if minimp3py is not installed
        """
        self.mp3_bytes_key = mp3_bytes_key
        self.audio_key = audio_key
        self.sample_rate = sample_rate
        self.max_length = max_length
        if min_length is None:
            min_length = max_length
        self.min_length = min_length
        self.random_sample_crop = random_sample_crop
        self.allow_resample = allow_resample
        self.resampling_method = resampling_method
        self.keep_mp3_bytes = keep_mp3_bytes
        self.debug_info_key = debug_info_key
        self.resamplers_cache = {}
        try:
            import minimp3py  # noqa: F401
        except ImportError:
            raise Exception(
                "minimp3py is not installed, please install it using: `CFLAGS='-O3 -march=native' pip install https://github.com/f0k/minimp3py/archive/master.zip`"
            )
    def __call__(self, batch):
        import minimp3py

        data_list = batch[self.mp3_bytes_key]
        if self.debug_info_key is not None:
            file_name_list = batch[self.debug_info_key]
        else:
            file_name_list = range(len(data_list))
        audio_list = []
        for data, file_name in zip(data_list, file_name_list):
            try:
                duration, ch, sr = minimp3py.probe(data)
                if isinstance(self.max_length, float):
                    max_length = int(self.max_length * sr)
                else:
                    max_length = int(self.max_length * sr // self.sample_rate)
                offset = 0
                if self.random_sample_crop and duration > max_length:
                    max_offset = max(int(duration - max_length), 0) + 1
                    offset = torch.randint(max_offset, (1,)).item()
                waveform, _ = minimp3py.read(data, start=offset, length=max_length)
                waveform = waveform[:, 0]  # 0 for the first channel only
                if waveform.dtype != "float32":
                    raise RuntimeError("Unexpected wave type")
                waveform = torch.from_numpy(waveform)
                if len(waveform) == 0:
                    logger.warning(
                        f"Empty waveform for {file_name}, duration {duration}, offset {offset}, max_length {max_length}, sr {sr}, ch {ch}"
                    )
                elif sr != self.sample_rate:
                    assert self.allow_resample, f"Unexpected sample rate {sr} instead of {self.sample_rate} at {file_name}"
                    if self.resamplers_cache.get(sr) is None:
                        self.resamplers_cache[sr] = torchaudio.transforms.Resample(
                            sr,
                            self.sample_rate,
                            resampling_method=self.resampling_method,
                        )
                    waveform = self.resamplers_cache[sr](waveform)
                min_length = self.min_length
                if isinstance(self.min_length, float):
                    min_length = int(self.min_length * self.sample_rate)
                if min_length is not None and len(waveform) < min_length:
                    # zero-pad short clips up to the minimum length
                    waveform = torch.concatenate(
                        (
                            waveform,
                            torch.zeros(
                                min_length - len(waveform),
                                dtype=waveform.dtype,
                                device=waveform.device,
                            ),
                        ),
                        dim=0,
                    )
                audio_list.append(waveform)
            except Exception as e:
                print(f"Error decoding {file_name}: {e}")
                raise e
        batch[self.audio_key] = audio_list
        batch["sampling_rate"] = [self.sample_rate] * len(audio_list)
        if not self.keep_mp3_bytes:
            del batch[self.mp3_bytes_key]
        return batch
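
# Standalone usage sketch for Mp3DecodeTransform (requires minimp3py; the mp3 path and
# filename below are illustrative, not part of the repository):
if __name__ == "__main__":
    decode = Mp3DecodeTransform(sample_rate=32000, max_length=10.0, debug_info_key="filename")
    with open("example.mp3", "rb") as fh:  # hypothetical local mp3 file
        batch = {"mp3_bytes": [fh.read()], "filename": ["example.mp3"]}
    batch = decode(batch)
    # batch["audio"][0] is a mono float32 tensor at 32 kHz, cropped to at most
    # `max_length` seconds and zero-padded to at least `min_length`.
    print(batch["audio"][0].shape, batch["sampling_rate"][0])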