Minor details
- app.py +47 -24
- utils/notebook_utils.py +1 -1
app.py
CHANGED
@@ -14,8 +14,8 @@ from utils.notebook_utils import (
 from dotenv import load_dotenv
 import os
 
-#
-#
+# TODOs:
+# Improve UI code preview
 # Add template for training
 
 load_dotenv()
@@ -76,22 +76,12 @@ def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
         rows = content["rows"]
         rows = [row["row"] for row in rows]
         first_rows_df = pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)
-
-        features_dict = {feature["name"]: feature["type"] for feature in features}
-        return features_dict, first_rows_df
+        return first_rows_df
     except Exception as e:
         logging.error(f"Error fetching first rows: {e}")
         raise
 
 
-def generate_eda_cells(dataset_id):
-    yield from generate_cells(dataset_id, eda_cells, "eda")
-
-
-def generate_rag_cells(dataset_id):
-    yield from generate_cells(dataset_id, rag_cells, "rag")
-
-
 def longest_string_column(df):
     longest_col = None
     max_length = 0
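For context, after this hunk get_first_rows_as_df returns only the DataFrame built from the dataset viewer's first-rows response, not the features dict. A minimal sketch of the post-change behavior; the endpoint URL and the requests call are assumptions about what the helper does, while the "rows"/"row" keys and the DataFrame construction match the lines shown in the diff:

import pandas as pd
import requests

def first_rows_df(dataset: str, config: str, split: str, limit: int) -> pd.DataFrame:
    # Assumed endpoint: the public dataset viewer "first-rows" API.
    resp = requests.get(
        "https://datasets-server.huggingface.co/first-rows",
        params={"dataset": dataset, "config": config, "split": split},
        timeout=30,
    )
    content = resp.json()
    # Same reshaping as in the diff: unwrap each row, shuffle, keep `limit` rows.
    rows = [row["row"] for row in content["rows"]]
    return pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)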
@@ -105,6 +95,14 @@ def longest_string_column(df):
     return longest_col
 
 
+def generate_eda_cells(dataset_id):
+    yield from generate_cells(dataset_id, eda_cells, "eda")
+
+
+def generate_rag_cells(dataset_id):
+    yield from generate_cells(dataset_id, rag_cells, "rag")
+
+
 def generate_embedding_cells(dataset_id):
     yield from generate_cells(dataset_id, embeggins_cells, "embeddings")
 
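These thin wrappers only pin the template set and notebook type; because they use yield from, each wrapper is itself a generator and re-yields every intermediate value that generate_cells streams. A small stand-alone illustration of that delegation (the names below are illustrative, not from the app):

def inner(n):
    for i in range(n):
        yield f"step {i}"

def wrapper():
    # yield from re-yields everything inner() produces, so wrapper()
    # streams the same sequence of values.
    yield from inner(3)

print(list(wrapper()))  # ['step 0', 'step 1', 'step 2']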
@@ -131,6 +129,7 @@ def _push_to_hub(
 
 
 def generate_cells(dataset_id, cells, notebook_type="eda"):
+    logging.info(f"Generating notebook for dataset {dataset_id}")
    try:
         libraries = get_compatible_libraries(dataset_id)
     except Exception as err:
@@ -139,23 +138,22 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
         return []
 
     if not libraries:
-
-
-        return
-
+        logging.error(f"Dataset not compatible with pandas library - not libraries")
+        yield "", "## ❌ This dataset is not compatible with pandas library ❌"
+        return
     pandas_library = next(
         (lib for lib in libraries.get("libraries", []) if lib["library"] == "pandas"),
         None,
     )
     if not pandas_library:
-
-
-
+        logging.error("Dataset not compatible with pandas library - not pandas library")
+        yield "", "## ❌ This dataset is not compatible with pandas library ❌"
+        return
     first_config_loading_code = pandas_library["loading_codes"][0]
     first_code = first_config_loading_code["code"]
     first_config = first_config_loading_code["config_name"]
     first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
-
+    df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
 
     longest_col = longest_string_column(df)
     html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
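The block above relies on the compatible-libraries metadata: it picks the pandas entry, takes its first loading code, and reads the config name and split list from it. A hedged sketch of that lookup; the endpoint URL and the requests usage are assumptions about what get_compatible_libraries does, but the field names (libraries, library, loading_codes, config_name, arguments.splits, code) are exactly the ones the diff reads:

import requests

resp = requests.get(
    "https://datasets-server.huggingface.co/compatible-libraries",  # assumed endpoint
    params={"dataset": "asoria/some-dataset"},  # hypothetical dataset id
    timeout=30,
)
libraries = resp.json()

# Same selection logic as in the diff: keep only the pandas loading snippets.
pandas_library = next(
    (lib for lib in libraries.get("libraries", []) if lib["library"] == "pandas"),
    None,
)
if pandas_library:
    loading = pandas_library["loading_codes"][0]
    print(loading["config_name"], list(loading["arguments"]["splits"]), loading["code"])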
@@ -163,17 +161,39 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
     replacements = [dataset_id, first_code, html_code, longest_col]
     has_numeric_columns = len(df.select_dtypes(include=["number"]).columns) > 0
     has_categoric_columns = len(df.select_dtypes(include=["object"]).columns) > 0
+
+    if notebook_type in ("rag", "embeddings") and not has_categoric_columns:
+        logging.error(
+            "Dataset does not have categorical columns, which are required for RAG generation."
+        )
+        yield (
+            "",
+            "## ❌ This dataset does not have categorical columns, which are required for Embeddings/RAG generation ❌",
+        )
+        return
+    if notebook_type == "eda" and not (has_categoric_columns or has_numeric_columns):
+        logging.error(
+            "Dataset does not have categorical or numeric columns, which are required for EDA generation."
+        )
+        yield (
+            "",
+            "## ❌ This dataset does not have categorical or numeric columns, which are required for EDA generation ❌",
+        )
+        return
+
     cells = replace_wildcards(
         cells, wildcards, replacements, has_numeric_columns, has_categoric_columns
     )
     generated_text = ""
     # Show only the first 40 lines, would like to have a scroll in gr.Code https://github.com/gradio-app/gradio/issues/9192
     for cell in cells:
-
+        if cell["cell_type"] == "markdown":
+            continue
+        generated_text += cell["source"] + "\n\n"
         yield generated_text, ""
         if generated_text.count("\n") > 38:
             generated_text += (
-                f"## See more lines available in the generated notebook
+                f"## See more lines available in the generated notebook 🤗 ......"
             )
             yield generated_text, ""
             break
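The loop above is what makes the code preview stream: generate_cells is a generator that repeatedly yields a (generated_text, message) pair, and Gradio pushes each yield into the two output components while the notebook is still being assembled. A minimal, self-contained sketch of that pattern; the layout and the fake generator are illustrative, not the app's actual UI:

import gradio as gr

def fake_generate(dataset_id):
    text = ""
    for chunk in ["import pandas as pd", f"# notebook for {dataset_id}"]:
        text += chunk + "\n\n"
        yield text, ""                      # partial preview, no status yet
    yield text, "## ✅ notebook ready"      # final update carries the status

with gr.Blocks() as demo:
    dataset_box = gr.Textbox(label="Dataset ID")
    generate_btn = gr.Button("Generate")
    code_preview = gr.Code(language="python", label="Preview")
    status = gr.Markdown()
    # A generator bound to a click event streams one UI update per yield.
    generate_btn.click(fake_generate, inputs=dataset_box, outputs=[code_preview, status])

# demo.launch()  # uncomment to try the sketch locally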
@@ -181,7 +201,10 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
     create_notebook_file(cells, notebook_name=notebook_name)
     _push_to_hub(dataset_id, notebook_name)
     notebook_link = f"https://colab.research.google.com/#fileId=https%3A//huggingface.co/datasets/asoria/dataset-notebook-creator-content/blob/main/{notebook_name}"
-    yield
+    yield (
+        generated_text,
+        f"## ✅ Here you have the [generated notebook]({notebook_link}) ✅",
+    )
 
 
 with gr.Blocks(fill_height=True, fill_width=True) as demo:
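The final yield replaces the bare yield with a success message that opens the pushed notebook through Colab's #fileId= deep link, with the scheme of the repo URL percent-encoded. A small sketch of how such a link can be assembled; the notebook file name below is made up, the repo is the one hard-coded in the diff:

from urllib.parse import quote

notebook_name = "asoria-example-eda.ipynb"  # hypothetical file name
repo_file = (
    "https://huggingface.co/datasets/asoria/dataset-notebook-creator-content/"
    f"blob/main/{notebook_name}"
)
# quote() keeps "/" by default but encodes ":", producing the same
# "https%3A//huggingface.co/..." shape used in notebook_link above.
colab_link = f"https://colab.research.google.com/#fileId={quote(repo_file)}"
print(colab_link)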
utils/notebook_utils.py
CHANGED
@@ -12,7 +12,7 @@ def replace_wildcards(
             continue
         if "type" in tmp and tmp["type"] == "categoric" and not has_categoric_columns:
             continue
-        tmp_text = tmp["source"]
+        tmp_text = tmp["source"].strip()
         for wildcard, replacement in zip(wildcards, replacements):
             tmp_text = tmp_text.replace(wildcard, replacement)
         new_templates.append({"cell_type": tmp["cell_type"], "source": tmp_text})
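The .strip() now runs before the wildcard loop, so template cells keep no leading or trailing whitespace around the substituted code. A small stand-alone illustration of the substitution pattern used by replace_wildcards; the wildcard tokens and the template cell below are invented for the example, the real templates live elsewhere in the repo:

# Hypothetical wildcard tokens and replacements, for illustration only.
wildcards = ["<dataset_id>", "<first_code>"]
replacements = [
    "asoria/some-dataset",
    "df = pd.read_parquet('hf://datasets/asoria/some-dataset/data.parquet')",
]

template = {"cell_type": "code", "source": "  # Load <dataset_id>\n<first_code>  "}

# Same mechanics as the diff: strip the template, then replace each wildcard in order.
text = template["source"].strip()
for wildcard, replacement in zip(wildcards, replacements):
    text = text.replace(wildcard, replacement)
print(text)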