FlexSED / src /prepare_clap.py
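"""Precompute CLAP text embeddings for sound-event labels.

For every unique event label in the AudioSet-strong label_to_id.csv, this
script encodes the prompt "The sound of <label>" with the LAION CLAP text
encoder (laion/clap-htsat-unfused) and saves the resulting embedding as a
.pt file in the output folder, one file per event label.
"""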
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, ClapTextModelWithProjection

if __name__ == '__main__':
    # Load the pretrained CLAP text encoder and its tokenizer
    model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

    # Path to the input CSV file mapping event labels to ids
    input_csv_path = '/home/user/SSD/Dataset/Audioset_SL/no_rule_all/label_to_id.csv'
    output_path = 'clap_embedding/'  # Replace with your desired output folder path

    # Create the output folder if it doesn't exist
    os.makedirs(output_path, exist_ok=True)

    # Read the CSV file and collect the unique event labels
    df = pd.read_csv(input_csv_path)
    events = df['label'].unique()

    with torch.no_grad():  # Disable gradient computation for inference
        # Encode each event label with the CLAP text encoder
        for event in events:
            text = event.replace('_', ' ')  # Replace underscores with spaces
            text = f'The sound of {text}'
            print(text)
            inputs = tokenizer([text], padding=True, return_tensors="pt")
            outputs = model(**inputs)
            text_embeds = outputs.text_embeds

            # Save the embedding to a .pt file named after the event label
            output_file = os.path.join(output_path, f"{event}.pt")
            torch.save(text_embeds, output_file)

    print("Embedding extraction and saving complete!")