Adding basic SFT template

Files changed:
- app.py +61 -59
- notebooks/eda.json +1 -0
- notebooks/embeddings.json +1 -0
- notebooks/rag.json +1 -0
- notebooks/sft.json +56 -0
- utils/api_utils.py +33 -0
app.py
CHANGED

@@ -2,20 +2,22 @@ import gradio as gr
 from gradio_huggingfacehub_search import HuggingfaceHubSearch
 import nbformat as nbf
 from huggingface_hub import HfApi
-from httpx import Client
 import logging
-import pandas as pd
 from utils.notebook_utils import (
     replace_wildcards,
     load_json_files_from_folder,
 )
+from utils.api_utils import get_compatible_libraries, get_first_rows, get_splits
 from dotenv import load_dotenv
 import os
 from nbconvert import HTMLExporter
 import uuid
+import pandas as pd

 load_dotenv()

+URL = "https://huggingface.co/spaces/asoria/auto-notebook-creator"
+
 HF_TOKEN = os.getenv("HF_TOKEN")
 assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"

@@ -25,12 +27,6 @@ assert (
 ), "You need to set NOTEBOOKS_REPOSITORY in your environment variables"


-URL = "https://huggingface.co/spaces/asoria/auto-notebook-creator"
-BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
-HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
-
-client = Client(headers=HEADERS)
-
 logging.basicConfig(level=logging.INFO)

 # TODO: Validate notebook templates format
@@ -39,18 +35,6 @@ notebook_templates = load_json_files_from_folder(folder_path)
 logging.info(f"Available notebooks {notebook_templates.keys()}")


-def get_compatible_libraries(dataset: str):
-    try:
-        response = client.get(
-            f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
-        )
-        response.raise_for_status()
-        return response.json()
-    except Exception as e:
-        logging.error(f"Error fetching compatible libraries: {e}")
-        raise
-
-
 def create_notebook_file(cells, notebook_name):
     nb = nbf.v4.new_notebook()
     nb["cells"] = [
@@ -72,22 +56,6 @@ def create_notebook_file(cells, notebook_name):
     return html_data


-def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
-    try:
-        resp = client.get(
-            f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}"
-        )
-        resp.raise_for_status()
-        content = resp.json()
-        rows = content["rows"]
-        rows = [row["row"] for row in rows]
-        first_rows_df = pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)
-        return first_rows_df
-    except Exception as e:
-        logging.error(f"Error fetching first rows: {e}")
-        raise
-
-
 def longest_string_column(df):
     longest_col = None
     max_length = 0
@@ -127,34 +95,62 @@ def generate_cells(dataset_id, notebook_title):
     cells = notebook_templates[notebook_title]["notebook_template"]
     notebook_type = notebook_templates[notebook_title]["notebook_type"]
     dataset_types = notebook_templates[notebook_title]["dataset_types"]
-
+    compatible_library = notebook_templates[notebook_title]["compatible_library"]
     try:
         libraries = get_compatible_libraries(dataset_id)
+        if not libraries:
+            logging.error(
+                f"Dataset not compatible with any loading library (pandas/datasets)"
+            )
+            return (
+                "",
+                "## ❌ This dataset is not compatible with pandas or datasets libraries ❌",
+            )
+
+        library_code = next(
+            (
+                lib
+                for lib in libraries.get("libraries", [])
+                if lib["library"] == compatible_library
+            ),
+            None,
+        )
+        if not library_code:
+            logging.error(f"Dataset not compatible with {compatible_library} library")
+            return (
+                "",
+                f"## ❌ This dataset is not compatible with '{compatible_library}' library ❌",
+            )
+        first_config_loading_code = library_code["loading_codes"][0]
+        first_code = first_config_loading_code["code"]
+        first_config = first_config_loading_code["config_name"]
+        first_split = get_splits(dataset_id, first_config)[0]["split"]
+        first_rows = get_first_rows(dataset_id, first_config, first_split)
     except Exception as err:
         gr.Error("Unable to retrieve dataset info from HF Hub.")
         logging.error(f"Failed to fetch compatible libraries: {err}")
-        return "", "## ❌ This dataset is not accessible from the Hub ❌"
-
-    if not libraries:
-        logging.error(f"Dataset not compatible with pandas library - not libraries")
-        return "", "## ❌ This dataset is not compatible with pandas library ❌"
-    pandas_library = next(
-        (lib for lib in libraries.get("libraries", []) if lib["library"] == "pandas"),
-        None,
-    )
-    if not pandas_library:
-        logging.error("Dataset not compatible with pandas library - not pandas library")
-        return "", "## ❌ This dataset is not compatible with pandas library ❌"
-    first_config_loading_code = pandas_library["loading_codes"][0]
-    first_code = first_config_loading_code["code"]
-    first_config = first_config_loading_code["config_name"]
-    first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
-    df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
+        return "", f"## ❌ This dataset is not accessible from the Hub {err}❌"
+
+    df = pd.DataFrame.from_dict(first_rows).sample(frac=1).head(3)

     longest_col = longest_string_column(df)
     html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
-    wildcards = [
-
+    wildcards = [
+        "{dataset_name}",
+        "{first_code}",
+        "{html_code}",
+        "{longest_col}",
+        "{first_config}",
+        "{first_split}",
+    ]
+    replacements = [
+        dataset_id,
+        first_code,
+        html_code,
+        longest_col,
+        first_config,
+        first_split,
+    ]
     has_numeric_columns = len(df.select_dtypes(include=["number"]).columns) > 0
     has_categoric_columns = len(df.select_dtypes(include=["object"]).columns) > 0

@@ -196,8 +192,12 @@ css = """

 with gr.Blocks(css=css) as demo:
     gr.Markdown("# 🤖 Dataset notebook creator 🕵️")
-    gr.Markdown(
-
+    gr.Markdown(
+        f"[}-blue.svg)]({URL}/tree/main/notebooks)"
+    )
+    gr.Markdown(
+        f"[]({URL}/blob/main/CONTRIBUTING.md)"
+    )
     text_input = gr.Textbox(label="Suggested notebook type", visible=False)

     gr.Markdown("## 1. Select and preview a dataset from Huggingface Hub")
@@ -259,6 +259,8 @@
         outputs=[code_component, go_to_notebook],
     )

-    gr.Markdown(
+    gr.Markdown(
+        "🚧 Note: Some code may not be compatible with datasets that contain binary data or complex structures. 🚧"
+    )

 demo.launch()
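Taken together, the app.py changes replace the hard-coded pandas path with a per-template library lookup. A condensed sketch of the new flow in generate_cells, using the names from the diff above (error handling and the downstream cell assembly omitted):

    # Each template now declares which loading library its generated code needs.
    compatible_library = notebook_templates[notebook_title]["compatible_library"]

    # Ask the datasets-server which libraries can load the dataset, then pick
    # the loading snippet matching the template's declared library.
    libraries = get_compatible_libraries(dataset_id)
    library_code = next(
        (lib for lib in libraries.get("libraries", []) if lib["library"] == compatible_library),
        None,
    )
    first_config_loading_code = library_code["loading_codes"][0]
    first_config = first_config_loading_code["config_name"]
    first_split = get_splits(dataset_id, first_config)[0]["split"]

    # Fetch a few raw rows and build the preview frame locally (this replaces
    # the removed get_first_rows_as_df helper).
    first_rows = get_first_rows(dataset_id, first_config, first_split)
    df = pd.DataFrame.from_dict(first_rows).sample(frac=1).head(3)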
notebooks/eda.json
CHANGED

@@ -2,6 +2,7 @@
     "notebook_title": "Exploratory data analysis (EDA)",
     "notebook_type": "eda",
     "dataset_types": ["numeric", "text"],
+    "compatible_library": "pandas",
     "notebook_template": [
         {
             "cell_type": "markdown",
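Each template JSON now carries a compatible_library field, which generate_cells matches against the library names returned by the datasets-server compatible-libraries endpoint. Once loaded via load_json_files_from_folder, a template dict has roughly this shape (an illustrative sketch, not a full template):

    template = {
        "notebook_title": "Exploratory data analysis (EDA)",
        "notebook_type": "eda",
        "dataset_types": ["numeric", "text"],
        "compatible_library": "pandas",  # matched against lib["library"] in generate_cells
        "notebook_template": [...],      # list of markdown/code cell dicts with wildcards
    }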
notebooks/embeddings.json
CHANGED

@@ -2,6 +2,7 @@
     "notebook_title": "Text Embeddings",
     "notebook_type": "embeddings",
     "dataset_types": ["text"],
+    "compatible_library": "pandas",
     "notebook_template": [
         {
             "cell_type": "markdown",
notebooks/rag.json
CHANGED

@@ -2,6 +2,7 @@
     "notebook_title": "Retrieval-augmented generation (RAG)",
     "notebook_type": "rag",
     "dataset_types": ["text"],
+    "compatible_library": "pandas",
     "notebook_template": [
         {
             "cell_type": "markdown",
notebooks/sft.json
ADDED

@@ -0,0 +1,56 @@
+{
+    "notebook_title": "Supervised fine-tuning (SFT)",
+    "notebook_type": "sft",
+    "dataset_types": ["text"],
+    "compatible_library": "datasets",
+    "notebook_template": [
+        {
+            "cell_type": "markdown",
+            "source": "---\n# **Supervised fine-tuning Notebook for {dataset_name} dataset**\n---"
+        },
+        {
+            "cell_type": "markdown",
+            "source": "## 1. Setup necessary libraries and load the dataset"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Install and import necessary libraries.\n!pip install trl datasets transformers bitsandbytes"
+        },
+        {
+            "cell_type": "code",
+            "source": "from datasets import load_dataset\nfrom trl import SFTTrainer\nfrom transformers import TrainingArguments"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Load the dataset\ndataset = load_dataset('{dataset_name}', name='{first_config}', split='{first_split}')\ndataset"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Specify the column name that will be used for training\ndataset_text_field = '{longest_col}'"
+        },
+        {
+            "cell_type": "markdown",
+            "source": "## 2. Configure SFT trainer"
+        },
+        {
+            "cell_type": "code",
+            "source": "model_name = 'facebook/opt-350m'\noutput_model_name = f'{model_name}-{dataset_name}'.replace('/', '-')\n\ntrainer = SFTTrainer(\n    model = model_name,\n    train_dataset=dataset,\n    dataset_text_field=dataset_text_field,\n    max_seq_length=512,\n    args=TrainingArguments(\n        per_device_train_batch_size = 1, #Batch size per GPU for training\n        gradient_accumulation_steps = 4,\n        max_steps = 100, #Total number of training steps.(Overrides epochs)\n        learning_rate = 2e-4,\n        fp16 = True,\n        logging_steps=20,\n        output_dir = output_model_name,\n        optim = 'paged_adamw_8bit' #Optimizer to use\n    )\n)"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Start training\ntrainer.train()"
+        },
+        {
+            "cell_type": "markdown",
+            "source": "## 3. Push model to hub"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Authenticate to the Hugging Face Hub\nfrom huggingface_hub import notebook_login\nnotebook_login()"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Push the model to Hugging Face Hub\ntrainer.push_to_hub()"
+        }
+    ]
+}
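To see what this template produces, here is roughly how its key cells render after replace_wildcards, assuming a hypothetical text dataset (dataset_name=user/my-dataset, first_config=default, first_split=train, longest_col=text; the real values come from the datasets-server lookups in app.py):

    # Rendered "load the dataset" cell (hypothetical wildcard values)
    from datasets import load_dataset
    dataset = load_dataset('user/my-dataset', name='default', split='train')

    # Rendered "column for training" cell
    dataset_text_field = 'text'

    # The trainer cell then fine-tunes facebook/opt-350m on that column for
    # 100 steps and pushes the result as 'facebook-opt-350m-user-my-dataset'.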
utils/api_utils.py
ADDED

@@ -0,0 +1,33 @@
+from httpx import Client
+
+BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
+HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
+
+client = Client(headers=HEADERS)
+
+
+def get_compatible_libraries(dataset: str):
+    response = client.get(
+        f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
+    )
+    response.raise_for_status()
+    return response.json()
+
+
+def get_first_rows(dataset: str, config: str, split: str):
+    resp = client.get(
+        f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}"
+    )
+    resp.raise_for_status()
+    content = resp.json()
+    rows = content["rows"]
+    return [row["row"] for row in rows]
+
+
+def get_splits(dataset: str, config: str):
+    resp = client.get(
+        f"{BASE_DATASETS_SERVER_URL}/splits?dataset={dataset}&config={config}"
+    )
+    resp.raise_for_status()
+    content = resp.json()
+    return content["splits"]
|