|
|
import os |
|
|
import pandas as pd |
|
|
import torch |
|
|
from transformers import AutoTokenizer, ClapTextModelWithProjection |
|
|
|
|
|
if __name__ == '__main__':
    # Extract CLAP text embeddings for every unique AudioSet label in the
    # input CSV and save each one as an individual .pt tensor file.

    # Pretrained CLAP text encoder with projection head (outputs text_embeds).
    model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
    model.eval()  # inference mode: disable dropout / use running BN stats
    tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

    # CSV is expected to contain a 'label' column (one event label per row).
    input_csv_path = '/home/user/SSD/Dataset/Audioset_SL/no_rule_all/label_to_id.csv'
    output_path = 'clap_embedding/'

    os.makedirs(output_path, exist_ok=True)

    df = pd.read_csv(input_csv_path)

    # dropna(): empty/missing label cells would otherwise surface as NaN in
    # unique() and crash on .replace() below.
    events = df['label'].dropna().unique()

    with torch.no_grad():  # pure feature extraction — no gradients needed
        for event in events:
            # Turn the label into a natural-language prompt for the text encoder.
            text = f"The sound of {event.replace('_', ' ')}"
            print(text)

            inputs = tokenizer([text], padding=True, return_tensors="pt")
            outputs = model(**inputs)
            # text_embeds: projected text embedding, shape (1, projection_dim)
            text_embeds = outputs.text_embeds

            # Sanitize the label for use as a filename: a label containing a
            # path separator (e.g. "a/b") would otherwise point into a
            # non-existent subdirectory and make torch.save fail.
            safe_name = str(event).replace(os.sep, '_')
            output_file = os.path.join(output_path, f"{safe_name}.pt")
            torch.save(text_embeds, output_file)

    print("Embedding extraction and saving complete!")
|
|
|