Spaces:

open-source-metrics
/

repository-statistics

Runtime error

repository-statistics / python-app /bootstrap_dataset.py

Initial commit

38e70c4 about 3 years ago

1.44 kB

	# Script to bootstrap a dataset.
	# Bootstrapping a dataset requires first running an SQL query in BigQuery and pasting its results here
	# (they are saved as a tab-separated file under csv_files/).
	# A dataset will then be created on the Hub with the initial data. This script goes step by step through the creation.

	import os
	import sys
	from pathlib import Path
	from datasets import Dataset
	from huggingface_hub import dataset_info

	# Define the library name you'd like to work on.
	from huggingface_hub.utils import RepositoryNotFoundError

	# Ask which library to bootstrap. Surrounding whitespace is stripped because the
	# name is used verbatim in the CSV file name and in the Hub repository id.
	library_name = input("Library name: ").strip()
	if not library_name:
	    sys.exit("No library name given; aborting.")

	current_dir = Path(__file__).parent
	csv_dir = current_dir / 'csv_files'
	csv_path = csv_dir / f'{library_name}.csv'
	repo_id = f'open-source-metrics/{library_name}-pip-installs'

	# The directory may not exist on a fresh checkout; create it so that both the
	# existence check and the write below cannot fail with FileNotFoundError.
	csv_dir.mkdir(parents=True, exist_ok=True)

	# Step 1: obtain the raw data, unless a previous run already stored it.
	if not csv_path.exists():
	    query = (current_dir / 'query.sql').read_text().replace("<PROJECT_NAME>", library_name)

	    print("Open the following link: https://console.cloud.google.com/bigquery?project=huggingface-ml\n")
	    print(f"Run the following query:\n\n{query}\n\n")
	    print("Please paste the results here (Ctrl+D once pasted): ")
	    csv_values = sys.stdin.read()

	    # Refuse to store an empty paste: an empty file would make the load below
	    # fail and would also cause every later run to skip this step.
	    if not csv_values.strip():
	        sys.exit("No data was pasted; nothing was written.")

	    csv_path.write_text(csv_values, encoding="utf-8")

	# Step 2: check whether the target dataset repository is already on the Hub.
	# Only "repository not found" is handled; any other error propagates.
	try:
	    dataset_info(repo_id)
	except RepositoryNotFoundError:
	    dataset_exists = False
	else:
	    dataset_exists = True

	if dataset_exists:
	    print(f"Dataset {repo_id} already exists on the Hub; pushing will update it.")

	# Step 3: load the stored file (tab-delimited despite the .csv suffix) and push
	# it to the Hub as a private dataset.
	dataset = Dataset.from_csv(str(csv_path), delimiter='\t')
	dataset.push_to_hub(repo_id, private=True)
	print("Dataset is pushed to Hub.")