Spaces:
Runtime error
Runtime error
Add annotations
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- app.py +98 -0
- configs.py +40 -0
- costum_datasets.py +67 -0
- datasets/train2014/COCO_train2014_000000000009.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000025.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000030.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000034.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000036.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000049.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000061.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000064.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000071.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000072.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000077.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000078.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000081.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000086.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000089.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000092.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000094.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000109.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000110.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000113.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000127.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000138.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000142.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000144.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000149.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000151.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000154.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000165.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000194.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000201.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000247.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000250.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000260.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000263.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000307.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000308.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000309.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000312.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000315.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000321.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000322.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000326.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000332.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000349.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000368.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000370.jpg +0 -0
- datasets/train2014/COCO_train2014_000000000382.jpg +0 -0
app.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Construct pairs of text and image
|
| 2 |
+
from configs import CFG
|
| 3 |
+
from costum_datasets import make_pairs
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
from text_image_audio import OneEncoder
|
| 7 |
+
import torch
|
| 8 |
+
|
| 9 |
+
import gradio as gr
|
| 10 |
+
|
| 11 |
+
import torchaudio
|
| 12 |
+
|
| 13 |
+
# Construct pairs of text and image
|
| 14 |
+
training_pairs = make_pairs(CFG.train_annotation_file, CFG.image_dir, 5) # 413.915 -> 82.783 images
|
| 15 |
+
|
| 16 |
+
# Sorted according images
|
| 17 |
+
training_pairs = sorted(training_pairs, key=lambda x: x[0])
|
| 18 |
+
|
| 19 |
+
coco_images, coco_captions = zip(*training_pairs)
|
| 20 |
+
|
| 21 |
+
# Take unique images
|
| 22 |
+
unique_images = set()
|
| 23 |
+
unique_pairs = [(item[0], item[1]) for item in training_pairs if item[0] not in unique_images
|
| 24 |
+
and not unique_images.add(item[0])]
|
| 25 |
+
coco_images, _ = zip(*unique_pairs)
|
| 26 |
+
|
| 27 |
+
# Load model
|
| 28 |
+
model = OneEncoder.from_pretrained("bilalfaye/OneEncoder-text-image-audio")
|
| 29 |
+
|
| 30 |
+
# Load coco image features
|
| 31 |
+
coco_image_features = torch.load("image_embeddings_best.pt", map_location=CFG.device)
|
| 32 |
+
coco_image_features = coco_image_features[:3000]
|
| 33 |
+
|
def text_image(query):
    """Retrieve the top-9 COCO images matching a text *query*.

    Returns the path of the plot file that image_retrieval produces
    (presumably it saves the plotted grid to img.png when plot=True —
    TODO confirm against OneEncoder).
    """
    retrieval_kwargs = dict(
        image_paths=coco_images,
        image_embeddings=coco_image_features,
        n=9,
        plot=True,
        temperature=0.0,
    )
    model.text_image_encoder.image_retrieval(query, **retrieval_kwargs)
    return "img.png"
def audio_image(query):
    """Retrieve the top-9 COCO images matching a spoken *query*.

    *query* is an audio file path (as supplied by the Gradio Audio input).
    Returns the path of the plot file written by image_retrieval
    (presumably img.png when plot=True — TODO confirm against OneEncoder).
    """
    # torchaudio returns a (channels, samples) tensor plus its sample rate.
    waveform, sample_rate = torchaudio.load(query)

    # Downmix stereo (or any multi-channel) audio to mono by averaging channels.
    if waveform.shape[0] > 1:
        mono_audio = waveform.mean(dim=0, keepdim=True)
    else:
        mono_audio = waveform

    # The audio encoder expects 16000 Hz input; resample only when necessary.
    if sample_rate != 16000:
        mono_audio = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(mono_audio)

    # Drop the channel dimension and hand a numpy array to the processor.
    audio_encoding = model.process_audio([mono_audio.squeeze(0).numpy()])

    model.image_retrieval(
        audio_encoding,
        image_paths=coco_images,
        image_embeddings=coco_image_features,
        n=9,
        plot=True,
        temperature=0.0,
        display_audio=False,
    )

    return "img.png"
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# Updated Gradio Interface
|
| 78 |
+
iface = gr.TabbedInterface(
|
| 79 |
+
[
|
| 80 |
+
gr.Interface(
|
| 81 |
+
fn=text_image,
|
| 82 |
+
inputs=gr.Textbox(label="Text Query"),
|
| 83 |
+
outputs="image",
|
| 84 |
+
title="Retrieve images using text as query",
|
| 85 |
+
description="Implementation of OneEncoder using one layer on UP for light demo, Only coco train dataset is used in this example (3000 images)."
|
| 86 |
+
),
|
| 87 |
+
gr.Interface(
|
| 88 |
+
fn=audio_image,
|
| 89 |
+
inputs=gr.Audio(sources=["upload", "microphone"], type="filepath", label="Provide Audio Query"),
|
| 90 |
+
outputs="image",
|
| 91 |
+
title="Retrieve images using audio as query",
|
| 92 |
+
description="Implementation of OneEncoder using one layer on UP for light demo, Only coco train dataset is used in this example (3000 images)."
|
| 93 |
+
)
|
| 94 |
+
],
|
| 95 |
+
tab_names=["Text - Image", "Audio - Image"]
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
iface.launch(debug=True, share=True)
|
configs.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import torch
import os

################################################## PARAMETERS ##########################################################


class CFG:
    """Static configuration for the OneEncoder demo.

    Groups tokenization limits, model hyper-parameters, per-component
    learning rates, training settings, COCO dataset paths and pretrained
    backbone names in one place.
    """

    # Text / batching
    max_length = 128                    # max token length for text inputs
    batch_size = 32
    num_workers = 4                     # DataLoader workers

    # Model architecture
    projection_dim = 768                # shared embedding dimension
    dropout_rate = 0.1
    num_head = 4                        # attention heads
    num_layers = 1                      # one layer on UP for the light demo

    # Per-component learning rates
    image_encoder_lr = 1e-4
    radio_encoder_lr = 1e-5
    video_encoder_lr = 1e-4
    text_encoder_lr = 1e-5
    audio_encoder_lr = 1e-5
    modality_token_encoder_lr = 1e-3
    universal_projection_lr = 1e-3
    lr = 1e-3                           # default / fallback learning rate

    # Optimization / scheduling
    weight_decay = 1e-3
    patience = 10                       # epochs before LR reduction
    factor = 0.8                        # LR reduction factor
    token_size = 1
    epochs = 100

    # Data
    image_size = 224
    device = "cpu"
    data_directory = "datasets"
    train_annotation_file = os.path.join(data_directory, "annotations", "captions_train2014.json")
    val_annotation_file = os.path.join(data_directory, "annotations", "captions_val2014.json")
    image_dir = os.path.join(data_directory, "train2014")
    image_dir_val = os.path.join(data_directory, "val2014")

    # Pretrained backbone identifiers
    bert_name = "bert-base-uncased"
    vit_name = "vit_base_patch16_224"
    audio_name = "facebook/wav2vec2-base-960h"
    radio_name = "microsoft/rad-dino"
    video_name = "MCG-NJU/videomae-base"

    # Audio
    sample_rate = 16000
costum_datasets.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from configs import CFG
|
| 2 |
+
import os
|
| 3 |
+
import requests
|
| 4 |
+
import zipfile
|
| 5 |
+
from pycocotools.coco import COCO
|
| 6 |
+
import torch
|
| 7 |
+
import cv2
|
| 8 |
+
import albumentations as A
|
| 9 |
+
import soundfile as sf
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
|
# Load Coco dataset
def _download_and_extract(url, zip_path, extract_dir):
    """Download *url* to *zip_path*, extract the archive into *extract_dir*, delete the zip."""
    # The original code wrote the zip into directories it never created;
    # make sure the parent directory exists before opening the file.
    os.makedirs(os.path.dirname(zip_path), exist_ok=True)
    response = requests.get(url, stream=True)
    # Fail fast on HTTP errors instead of saving and unzipping an error page.
    response.raise_for_status()
    with open(zip_path, "wb") as f:
        # Stream in 8 KB chunks to keep memory bounded for these large files.
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_dir)  # Extract all contents to the specified directory
    os.remove(zip_path)


def download_dataset(data_dir="../datasets"):
    """Download the COCO 2014 captions and train/val images into *data_dir*.

    Creates ``annotations/``, ``train2014/`` and ``val2014/`` under *data_dir*
    (the archives themselves contain those directory names, so everything is
    extracted into *data_dir* directly).
    """
    # Annotations (train + val captions come in a single archive).
    _download_and_extract(
        "http://images.cocodataset.org/annotations/annotations_trainval2014.zip",
        os.path.join(data_dir, "annotations", "annotations.zip"),
        data_dir,
    )

    # Train images.
    _download_and_extract(
        "http://images.cocodataset.org/zips/train2014.zip",
        os.path.join(data_dir, "train2014", "train2014.zip"),
        data_dir,
    )

    # Val images.
    _download_and_extract(
        "http://images.cocodataset.org/zips/val2014.zip",
        os.path.join(data_dir, "val2014", "val2014.zip"),
        data_dir,
    )
def make_pairs(annotation_json_files, image_dir, max_captions=3):
    """Return (image_path, caption) pairs for every file in *image_dir*.

    NOTE(review): placeholder implementation — every image is paired with the
    constant caption "an image" rather than captions parsed from
    *annotation_json_files*; *max_captions* is currently unused. TODO: parse
    the COCO annotation JSON (e.g. with pycocotools) for real captions.
    """
    # Bug fix: the original listed and joined against *annotation_json_files*
    # (a JSON file path per the caller in app.py), which makes os.listdir fail;
    # the images live in *image_dir*.
    images = os.listdir(image_dir)
    image_caption = [(os.path.join(image_dir, image), "an image") for image in images]
    return image_caption
datasets/train2014/COCO_train2014_000000000009.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000025.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000030.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000034.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000036.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000049.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000061.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000064.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000071.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000072.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000077.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000078.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000081.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000086.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000089.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000092.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000094.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000109.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000110.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000113.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000127.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000138.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000142.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000144.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000149.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000151.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000154.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000165.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000194.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000201.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000247.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000250.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000260.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000263.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000307.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000308.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000309.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000312.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000315.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000321.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000322.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000326.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000332.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000349.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000368.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000370.jpg
ADDED
|
datasets/train2014/COCO_train2014_000000000382.jpg
ADDED
|