Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
·
bae8f11
1
Parent(s):
b5f94b5
refactor
Browse files
- prep_viewer_data.py +3 -0
prep_viewer_data.py
CHANGED
|
@@ -7,6 +7,7 @@ from huggingface_hub import list_datasets
|
|
| 7 |
from tqdm import tqdm
|
| 8 |
from tqdm.asyncio import tqdm_asyncio
|
| 9 |
|
|
|
|
| 10 |
# Initialize the HTTP client
|
| 11 |
client = httpx.AsyncClient(timeout=60, http2=True)
|
| 12 |
|
|
@@ -115,6 +116,8 @@ async def prep_data(sample_size=200_000, min_likes=1):
|
|
| 115 |
df = pl.read_parquet(
|
| 116 |
"hf://datasets/davanstrien/dataset-viewer-descriptions-processed/data/train-00000-of-00001.parquet"
|
| 117 |
)
|
|
|
|
|
|
|
| 118 |
in_train_or_test = set(df["dataset_id"].unique().to_list())
|
| 119 |
|
| 120 |
# Get all datasets
|
|
|
|
| 7 |
from tqdm import tqdm
|
| 8 |
from tqdm.asyncio import tqdm_asyncio
|
| 9 |
|
| 10 |
+
|
| 11 |
# Initialize the HTTP client
|
| 12 |
client = httpx.AsyncClient(timeout=60, http2=True)
|
| 13 |
|
|
|
|
| 116 |
df = pl.read_parquet(
|
| 117 |
"hf://datasets/davanstrien/dataset-viewer-descriptions-processed/data/train-00000-of-00001.parquet"
|
| 118 |
)
|
| 119 |
+
# Remove datasets that are already in the train or test set. TODO: drop this filter later, once the model works okay.
|
| 120 |
+
|
| 121 |
in_train_or_test = set(df["dataset_id"].unique().to_list())
|
| 122 |
|
| 123 |
# Get all datasets
|