Spaces:
Running
Running
"""Download the first images listed in an Open Images TSV file.

The script reads ``open-images-dataset-train0.tsv``, prints a preview of the
parsed table, and saves up to ``max_images`` images as ``<ImageID>.jpg`` in a
single output folder. Failures are reported per image and never abort the
remaining downloads.
"""

import os

import pandas as pd
import requests

file_path = "open-images-dataset-train0.tsv"

# Read the TSV file, skipping the first row and naming the columns explicitly.
df = pd.read_csv(
    file_path,
    sep="\t",
    engine="python",
    skiprows=1,
    names=["ImageURL", "Subset", "ImageID"],
)

# Print the first few rows so the parse can be verified by eye.
print("First few rows of the cleaned dataset:")
print(df.head())

# Use one fixed folder (the 'Subset' column contains numbers, not real
# categories, so it is not used to build sub-folders).
output_folder = "open_images_v7/dataset"
os.makedirs(output_folder, exist_ok=True)

# Limit downloads to the first 100 rows of the table.
max_images = 100


def _safe_file_stem(image_id):
    """Return *image_id* as text that is safe to use as a single file name.

    Path separators are replaced with "_" so an ID read from the TSV can never
    point into another (possibly non-existent) directory.
    NOTE(review): the ID column looks like it may hold base64 text, which can
    contain "/" - confirm against the actual data.
    """
    stem = str(image_id)
    for separator in ("/", os.sep, os.altsep):
        if separator:  # os.altsep is None on POSIX
            stem = stem.replace(separator, "_")
    return stem


# Select rows by position so the limit does not depend on the index labels.
batch = df.head(max_images)

for image_url, image_id in zip(batch["ImageURL"], batch["ImageID"]):
    # Every image is stored with a ".jpg" extension.
    file_name = f"{_safe_file_stem(image_id)}.jpg"
    image_path = os.path.join(output_folder, file_name)

    try:
        response = requests.get(image_url, timeout=10)
        if response.status_code == 200:
            with open(image_path, "wb") as f:
                f.write(response.content)
            print(f"✅ Downloaded: {file_name}")
        else:
            print(f"❌ Failed: {image_id}")
    except Exception as e:
        # Deliberately broad: this is a best-effort bulk download, so one bad
        # row (network error, malformed URL, unwritable path) is reported and
        # skipped instead of stopping the whole run.
        print(f"❌ Error downloading {image_id}: {e}")