import collections
import os
from datetime import datetime, timedelta
import json
from http.server import SimpleHTTPRequestHandler, ThreadingHTTPServer
from urllib.parse import parse_qs, urlparse
from huggingface_hub import list_datasets, HfFolder
from datasets import load_dataset, DatasetDict, Dataset
import numpy as np

HF_TOKEN = os.environ['HF_TOKEN']
# NOTE: the original code also called huggingface_hub.set_access_token(HF_TOKEN), but that
# helper is no longer available in recent huggingface_hub releases; persisting the token
# with HfFolder.save_token is enough for load_dataset to authenticate.
HfFolder.save_token(HF_TOKEN)
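
# The raw metrics live in Hugging Face datasets; each repository is a split of the
# corresponding dataset. Stars and issues are sorted by date, pip downloads by day.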
datasets = {
    "stars": load_dataset("open-source-metrics/stars").sort('dates'),
    "issues": load_dataset("open-source-metrics/issues").sort('dates'),
    "pip": load_dataset("open-source-metrics/pip").sort('day'),
}

external_datasets = {
    "stars": load_dataset("open-source-metrics/stars-external").sort('dates'),
    "issues": load_dataset("open-source-metrics/issues-external").sort('dates'),
    "pip": load_dataset("open-source-metrics/pip-external").sort('day')
}
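
# `val` is a module-level counter shared by the mapping functions below: `_range` tags every
# row with a cumulative index (how many events came before it) and the week it falls in
# (weeks since roughly the Unix epoch).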
val = 0


def _range(e):
    global val
    e['range'] = val
    val += 1

    current_date = datetime.strptime(e['dates'], "%Y-%m-%dT%H:%M:%SZ")
    first_date = datetime.fromtimestamp(1)
    week = abs(current_date - first_date).days // 7
    e['week'] = week
    return e
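

# `_ignore_org_members` keeps a parallel counter that only advances for events whose author
# is not an organization member, so org-internal activity can be excluded from the charts.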
def _ignore_org_members(e):
    global val
    e['range_non_org'] = val

    if e['type']['authorAssociation'] != 'MEMBER':
        val += 1

    return e
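

# Annotate every split of the stars/issues datasets with the cumulative counters, resetting
# the global counter between splits so each repository starts from zero.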
stars = {}
for k, v in datasets['stars'].items():
    stars[k] = v.map(_range)
    val = 0

stars_external = {}
for k, v in external_datasets['stars'].items():
    stars_external[k] = v.map(_range)
    val = 0

issues = {}
for k, v in datasets['issues'].items():
    issues[k] = v.map(_range)
    val = 0
    issues[k] = issues[k].map(_ignore_org_members)
    val = 0

issues_external = {}
for k, v in external_datasets['issues'].items():
    issues_external[k] = v.map(_range)
    val = 0
    issues_external[k] = issues_external[k].map(_ignore_org_members)
    val = 0

datasets['stars'] = DatasetDict(**stars)
datasets['issues'] = DatasetDict(**issues)
external_datasets['stars'] = DatasetDict(**stars_external)
external_datasets['issues'] = DatasetDict(**issues_external)
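

# `link_values` forward-fills missing per-library values so that every date in the merged
# series has an entry for every requested library.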
def link_values(library_names, returned_values):
    previous_values = {library_name: None for library_name in library_names}
    for library_name in library_names:
        for i in returned_values.keys():
            if library_name not in returned_values[i]:
                returned_values[i][library_name] = previous_values[library_name]
            else:
                previous_values[library_name] = returned_values[i][library_name]
    return returned_values
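

# Simple moving average over a window of N points; not referenced by the handlers below,
# kept as a smoothing utility.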
def running_mean(x, N, total_length=-1):
    cumsum = np.cumsum(np.insert(x, 0, 0))
    to_pad = max(total_length - len(cumsum), 0)
    return np.pad(cumsum[N:] - cumsum[:-N], (to_pad, 0)) / float(N)
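

# Query strings look like `?input=transformers,datasets&options=1,2`; both parameters are
# required and are split into lists here.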
def parse_name_and_options(path):
    url = urlparse(path)
    query = parse_qs(url.query)
    library_names = query.get("input", None)[0]
    library_names = library_names.split(',')
    options = query.get("options", None)[0]
    options = options.split(',')
    return library_names, options
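

# The handler serves the static frontend (index.html and its assets) plus a small JSON API
# used by the charts: /initialize, /retrievePipInstalls, /retrieveStars and /retrieveIssues.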
class RequestHandler(SimpleHTTPRequestHandler):
    def do_GET(self):
        print(self.path)

        if self.path == "/":
            self.path = "index.html"
            return SimpleHTTPRequestHandler.do_GET(self)
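
        # /initialize lists the available splits (one per repository) and warns about
        # datasets that are missing some of them.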
        if self.path.startswith("/initialize"):
            dataset_keys = {k: set(v.keys()) for k, v in datasets.items()}
            dataset_with_most_splits = max([d for d in dataset_keys.values()], key=len)

            external_dataset_keys = {k: set(v.keys()) for k, v in external_datasets.items()}
            external_dataset_with_most_splits = max([d for d in external_dataset_keys.values()], key=len)

            warnings = []

            for k, v in dataset_keys.items():
                if len(v) < len(dataset_with_most_splits):
                    warnings.append(
                        f"The {k} dataset does not contain all splits. Missing: {dataset_with_most_splits - v}."
                        f"\nSelecting that split to show the pip install numbers will not work."
                    )

            for k, v in external_dataset_keys.items():
                if len(v) < len(external_dataset_with_most_splits):
                    warnings.append(
                        f"The {k} dataset does not contain all splits. Missing: {external_dataset_with_most_splits - v}."
                        f"\nSelecting that split to show the pip install numbers will not work."
                    )

            dataset_with_most_splits = list(dataset_with_most_splits)
            dataset_with_most_splits.sort()
            external_dataset_with_most_splits = list(external_dataset_with_most_splits)
            external_dataset_with_most_splits.sort()

            return self.response({
                'internal': dataset_with_most_splits,
                'external': external_dataset_with_most_splits,
                'warnings': warnings
            })
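
        # /retrievePipInstalls returns daily pip download counts; option "1" sums the
        # selected libraries into a single "Cumulated" series.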
        if self.path.startswith("/retrievePipInstalls"):
            errors = []
            library_names, options = parse_name_and_options(self.path)

            if '1' in options:
                returned_values = {}
                for library_name in library_names:
                    if library_name in datasets['pip']:
                        ds = datasets['pip'][library_name]
                    elif library_name in external_datasets['pip']:
                        ds = external_datasets['pip'][library_name]
                    else:
                        errors.append(f"No {library_name} found in internal or external datasets.")
                        # Skip unknown libraries instead of iterating over a missing dataset.
                        continue

                    for i in ds:
                        if i['day'] in returned_values:
                            returned_values[i['day']]['Cumulated'] += i['num_downloads']
                        else:
                            returned_values[i['day']] = {'Cumulated': i['num_downloads']}

                library_names = ['Cumulated']
            else:
                returned_values = {}
                for library_name in library_names:
                    if library_name in datasets['pip']:
                        ds = datasets['pip'][library_name]
                    elif library_name in external_datasets['pip']:
                        ds = external_datasets['pip'][library_name]
                    else:
                        errors.append(f"No {library_name} found in internal or external datasets for pip.")
                        return self.response({'errors': errors})

                    for i in ds:
                        if i['day'] in returned_values:
                            returned_values[i['day']][library_name] = i['num_downloads']
                        else:
                            returned_values[i['day']] = {library_name: i['num_downloads']}

            for library_name in library_names:
                for i in returned_values.keys():
                    if library_name not in returned_values[i]:
                        returned_values[i][library_name] = None

            returned_values = collections.OrderedDict(sorted(returned_values.items()))
            output = {l: [k[l] for k in returned_values.values()] for l in library_names}
            output['day'] = list(returned_values.keys())
            return self.response(output)
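
        # /retrieveStars returns the cumulative star count per library; option "1" switches
        # to week-over-week deltas instead of the running total.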
        if self.path.startswith("/retrieveStars"):
            errors = []
            library_names, options = parse_name_and_options(self.path)
            returned_values = {}

            dataset_dict = datasets['stars']
            external_dataset_dict = external_datasets['stars']
            week_over_week = '1' in options

            for library_name in library_names:
                if library_name in dataset_dict:
                    dataset = dataset_dict[library_name]
                elif library_name in external_dataset_dict:
                    dataset = external_dataset_dict[library_name]
                else:
                    errors.append(f"No {library_name} found in internal or external datasets for stars.")
                    return self.response({'errors': errors})

                last_value = 0
                last_week = dataset[0]['week']
                for i in dataset:
                    if week_over_week and last_week == i['week']:
                        continue

                    if i['dates'] in returned_values:
                        returned_values[i['dates']][library_name] = i['range'] - last_value
                    else:
                        returned_values[i['dates']] = {library_name: i['range'] - last_value}

                    last_value = i['range'] if week_over_week else 0
                    last_week = i['week']

            returned_values = collections.OrderedDict(sorted(returned_values.items()))
            returned_values = link_values(library_names, returned_values)

            output = {l: [k[l] for k in returned_values.values()][::-1] for l in library_names}
            output['day'] = list(returned_values.keys())[::-1]

            # Trim down to a smaller number of points.
            output = {k: [v for i, v in enumerate(value) if i % max(1, int(len(value) / 100)) == 0] for k, value in output.items()}

            return self.response(output)
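
        # /retrieveIssues works like /retrieveStars but on issue counts; option "1" excludes
        # issues opened by organization members and option "2" switches to week-over-week deltas.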
        if self.path.startswith("/retrieveIssues"):
            errors = []
            library_names, options = parse_name_and_options(self.path)
            exclude_org_members = '1' in options
            week_over_week = '2' in options

            returned_values = {}

            dataset_dict = datasets['issues']
            external_dataset_dict = external_datasets['issues']
            range_id = 'range' if not exclude_org_members else 'range_non_org'

            for library_name in library_names:
                if library_name in dataset_dict:
                    dataset = dataset_dict[library_name]
                elif library_name in external_dataset_dict:
                    dataset = external_dataset_dict[library_name]
                else:
                    errors.append(f"No {library_name} found in internal or external datasets for issues.")
                    return self.response({'errors': errors})

                last_value = 0
                last_week = dataset[0]['week']
                for i in dataset:
                    if week_over_week and last_week == i['week']:
                        continue

                    if i['dates'] in returned_values:
                        returned_values[i['dates']][library_name] = i[range_id] - last_value
                    else:
                        returned_values[i['dates']] = {library_name: i[range_id] - last_value}

                    last_value = i[range_id] if week_over_week else 0
                    last_week = i['week']

            returned_values = collections.OrderedDict(sorted(returned_values.items()))
            returned_values = link_values(library_names, returned_values)

            output = {l: [k[l] for k in returned_values.values()][::-1] for l in library_names}
            output['day'] = list(returned_values.keys())[::-1]

            # Trim down to a smaller number of points.
            output = {k: [v for i, v in enumerate(value) if i % max(1, int(len(value) / 100)) == 0] for k, value in output.items()}

            return self.response(output)
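
        # Anything else falls through to the default static file handler.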
        return SimpleHTTPRequestHandler.do_GET(self)

    def response(self, output):
        self.send_response(200)
        self.send_header("Content-Type", "application/json")
        self.end_headers()
        self.wfile.write(json.dumps(output).encode("utf-8"))
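

# Hugging Face Spaces expect the app to listen on port 7860.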
server = ThreadingHTTPServer(("", 7860), RequestHandler)
print("Running on port 7860")
server.serve_forever()