Spaces:

asoria
/

auto-notebook-creator

Running

App Files Files Community

asoria committed on Sep 18, 2024

Commit

fb1a11c

1 Parent(s): ab6348d

Adding contribute tutorial

Browse files

Files changed (7) hide show

CONTRIBUTING.md +43 -0
app.py +30 -45
notebooks/eda.json +80 -0
notebooks/embeddings.json +67 -0
notebooks/finetuning.json +6 -0
notebooks/rag.json +6 -0
utils/notebook_utils.py +19 -1

CONTRIBUTING.md ADDED Viewed

	@@ -0,0 +1,43 @@

+# How to contribute
+Thanks for your interest in contributing! 🙌 This project helps users quickly create notebooks to showcase how they use datasets. The generated code can be added to their repositories or used in research projects.
+## Ways to Contribute
+There are a few ways you can help:
+- 💡**Share ideas**: Got a cool feature in mind? Let us know!
+- 🐞**Report bugs**: If something isn’t working, we’d love to fix it.
+- 🚀**Suggest improvements**: Any suggestions to make the tool better are welcome.
+- 📓**Add new notebook types**: This is one of the most exciting ways to contribute!
+## For Ideas, Bugs, or Suggestions:
+- Start a new discussion [here](https://huggingface.co/spaces/asoria/auto-notebook-creator/discussions/new).
+- Tell me what’s on your mind and include any details that might help.
+## For Adding New Notebook Types:
+- Open a pull request (PR) [here](https://huggingface.co/spaces/asoria/auto-notebook-creator/discussions?new_pr=true).
+- Add a new `.json` file in the notebooks folder. There’s a sample file you can copy and tweak.
+- Submit your PR! 🎉
+## Running the Space Application
+To run the Space locally, follow these steps:
+1. Set Required Environment Variables:
+- `NOTEBOOKS_REPOSITORY`: The name of the repository where the generated notebooks will be stored. Ensure that you have **write** permissions for this repository. For example, I use the [asoria/dataset-notebook-creator-content](https://huggingface.co/datasets/asoria/dataset-notebook-creator-content) repository.
+- `HF_TOKEN`: Your Hugging Face token, used for authentication to push changes to the repository.
+Example setup:
+```bash
+export HF_TOKEN=your_huggingface_token
+export NOTEBOOKS_REPOSITORY=your_repository_name
+```
+2. Execute the following command to start the application:
+```bash
+python app.py
+```
+I am excited to see what you come up with. Thanks for helping make this project even better! 💖

app.py CHANGED Viewed

@@ -6,25 +6,23 @@ from httpx import Client
 import logging
 import pandas as pd
 from utils.notebook_utils import (
-    eda_cells,
     replace_wildcards,
-    rag_cells,
-    embeggins_cells,
 )
 from dotenv import load_dotenv
 import os
 from nbconvert import HTMLExporter
-# TODOs:
-# Improve UI code preview
-# Add template for training
 load_dotenv()
 HF_TOKEN = os.getenv("HF_TOKEN")
 assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
-NOTEBOOKS_REPOSITORY = "asoria/dataset-notebook-creator-content"
 BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
 HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
@@ -96,18 +94,6 @@ def longest_string_column(df):
     return longest_col
-def generate_eda_cells(dataset_id):
-    return generate_cells(dataset_id, eda_cells, "eda")
-def generate_rag_cells(dataset_id):
-    return generate_cells(dataset_id, rag_cells, "rag")
-def generate_embedding_cells(dataset_id):
-    return generate_cells(dataset_id, embeggins_cells, "embeddings")
 def _push_to_hub(
     dataset_id,
     notebook_file,
@@ -129,8 +115,15 @@ def _push_to_hub(
         raise
-def generate_cells(dataset_id, cells, notebook_type="eda"):
-    logging.info(f"Generating notebook for dataset {dataset_id}")
     try:
         libraries = get_compatible_libraries(dataset_id)
     except Exception as err:
@@ -161,6 +154,7 @@ def generate_cells(dataset_id, cells, notebook_type="eda"):
     has_numeric_columns = len(df.select_dtypes(include=["number"]).columns) > 0
     has_categoric_columns = len(df.select_dtypes(include=["object"]).columns) > 0
     if notebook_type in ("rag", "embeddings") and not has_categoric_columns:
         logging.error(
             "Dataset does not have categorical columns, which are required for RAG generation."
@@ -250,34 +244,25 @@ with gr.Blocks(
     gr.Markdown("## 2. Select the type of notebook you want to generate")
     with gr.Row():
-        with gr.Column():
-            generate_eda_btn = gr.Button("EDA", size="sm")
-        with gr.Column():
-            generate_embedding_btn = gr.Button("Embeddings", size="sm")
-        with gr.Column():
-            generate_rag_btn = gr.Button("RAG", size="sm")
-        with gr.Column():
-            generate_training_btn = gr.Button("Training", interactive=False, size="sm")
     gr.Markdown("## 3. Notebook code result")
     code_component = gr.HTML(elem_id="box")
     go_to_notebook = gr.Markdown("", visible=True)
-    generate_eda_btn.click(
-        generate_eda_cells,
-        inputs=[dataset_name],
-        outputs=[code_component, go_to_notebook],
-    )
-    generate_embedding_btn.click(
-        generate_embedding_cells,
-        inputs=[dataset_name],
-        outputs=[code_component, go_to_notebook],
-    )
-    generate_rag_btn.click(
-        generate_rag_cells,
-        inputs=[dataset_name],
         outputs=[code_component, go_to_notebook],
     )

 import logging
 import pandas as pd
 from utils.notebook_utils import (
     replace_wildcards,
+    load_json_files_from_folder,
 )
 from dotenv import load_dotenv
 import os
 from nbconvert import HTMLExporter
 load_dotenv()
 HF_TOKEN = os.getenv("HF_TOKEN")
 assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
+NOTEBOOKS_REPOSITORY = os.getenv("NOTEBOOKS_REPOSITORY")
+assert (
+    NOTEBOOKS_REPOSITORY is not None
+), "You need to set NOTEBOOKS_REPOSITORY in your environment variables"
 BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
 HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
     return longest_col
 def _push_to_hub(
     dataset_id,
     notebook_file,
         raise
+folder_path = "notebooks"
+notebook_templates = load_json_files_from_folder(folder_path)
+logging.info(f"Available notebooks {notebook_templates.keys()}")
+def generate_cells(dataset_id, notebook_title):
+    logging.info(f"Generating {notebook_title} notebook for dataset {dataset_id}")
+    cells = notebook_templates[notebook_title]["notebook_template"]
+    notebook_type = notebook_templates[notebook_title]["notebook_type"]
     try:
         libraries = get_compatible_libraries(dataset_id)
     except Exception as err:
     has_numeric_columns = len(df.select_dtypes(include=["number"]).columns) > 0
     has_categoric_columns = len(df.select_dtypes(include=["object"]).columns) > 0
+    # TODO: Validate by notebook type
     if notebook_type in ("rag", "embeddings") and not has_categoric_columns:
         logging.error(
             "Dataset does not have categorical columns, which are required for RAG generation."
     gr.Markdown("## 2. Select the type of notebook you want to generate")
     with gr.Row():
+        notebook_type = gr.Dropdown(
+            choices=notebook_templates.keys(), label="Notebook type"
+        )
+        generate_button = gr.Button("Generate Notebook", variant="primary")
+        contribute_btn = gr.Button(
+            "Or Contribute",
+            visible=True,
+            variant="secondary",
+            size="sm",
+            link="https://huggingface.co/spaces/asoria/auto-notebook-creator/blob/main/CONTRIBUTING.md",
+        )
     gr.Markdown("## 3. Notebook code result")
     code_component = gr.HTML(elem_id="box")
     go_to_notebook = gr.Markdown("", visible=True)
+    generate_button.click(
+        generate_cells,
+        inputs=[dataset_name, notebook_type],
         outputs=[code_component, go_to_notebook],
     )

notebooks/eda.json ADDED Viewed

	@@ -0,0 +1,80 @@

+{
+    "notebook_title": "Exploratory data analysis (EDA)",
+    "notebook_type": "eda",
+    "dataset_type": "numeric",
+    "notebook_template": [
+        {
+            "cell_type": "markdown",
+            "source": "\n---\n# **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**\n---\n"
+        },
+        {
+            "cell_type": "markdown",
+            "source": "## 1. Setup necessary libraries and load the dataset"
+        },
+        {
+            "cell_type": "code",
+            "source": "\n# Install and import necessary libraries.\n!pip install pandas matplotlib seaborn\n"
+        },
+        {
+            "cell_type": "code",
+            "source": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n"
+        },
+        {
+            "cell_type": "code",
+            "source": "\n# Load the dataset as a DataFrame\n{first_code}\n"
+        },
+        {
+            "cell_type": "markdown",
+            "source": "## 2. Understanding the Dataset"
+        },
+        {
+            "cell_type": "code",
+            "source": "\n# First rows of the dataset and info\nprint(df.head())\nprint(df.info())\n"
+        },
+        {
+            "cell_type": "code",
+            "source": "\n# Check for missing values\nprint(df.isnull().sum())\n"
+        },
+        {
+            "cell_type": "code",
+            "source": "\n# Identify data types of each column\nprint(df.dtypes)\n"
+        },
+        {
+            "cell_type": "code",
+            "source": "\n# Detect duplicated rows\nprint(df.duplicated().sum())\n"
+        },
+        {
+            "cell_type": "code",
+            "source": "\n# Generate descriptive statistics\nprint(df.describe())\n"
+        },
+        {
+            "type": "categoric",
+            "cell_type": "code",
+            "source": "\n# Unique values in categorical columns\ndf.select_dtypes(include=['object']).nunique()\n"
+        },
+        {
+            "cell_type": "markdown",
+            "source": "## 3. Data Visualization"
+        },
+        {
+            "type": "numeric",
+            "cell_type": "code",
+            "source": "\n# Correlation matrix for numerical columns\ncorr_matrix = df.corr(numeric_only=True)\nplt.figure(figsize=(10, 8))\nsns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)\nplt.title('Correlation Matrix')\nplt.show()\n"
+        },
+        {
+            "type": "numeric",
+            "cell_type": "code",
+            "source": "\n# Distribution plots for numerical columns\nfor column in df.select_dtypes(include=['int64', 'float64']).columns:\n    plt.figure(figsize=(8, 4))\n    sns.histplot(df[column], kde=True)\n    plt.title(f'Distribution of {column}')\n    plt.xlabel(column)\n    plt.ylabel('Frequency')\n    plt.show()\n"
+        },
+        {
+            "type": "categoric",
+            "cell_type": "code",
+            "source": "\n# Count plots for categorical columns\nfor column in df.select_dtypes(include=['object']).columns:\n    plt.figure(figsize=(8, 4))\n    sns.countplot(x=column, data=df)\n    plt.title(f'Count Plot of {column}')\n    plt.xlabel(column)\n    plt.ylabel('Count')\n    plt.show()\n"
+        },
+        {
+            "type": "numeric",
+            "cell_type": "code",
+            "source": "\n# Box plots for detecting outliers in numerical columns\nfor column in df.select_dtypes(include=['int64', 'float64']).columns:\n    plt.figure(figsize=(8, 4))\n    sns.boxplot(df[column])\n    plt.title(f'Box Plot of {column}')\n    plt.xlabel(column)\n    plt.show()\n"
+        }
+    ]
+}

notebooks/embeddings.json ADDED Viewed

	@@ -0,0 +1,67 @@

+{
+    "notebook_title": "Text Embeddings",
+    "notebook_type": "embeddings",
+    "dataset_type": "text",
+    "notebook_template": [
+        {
+            "cell_type": "markdown",
+            "source": "---\n# **Embeddings Notebook for {dataset_name} dataset**\n---"
+        },
+        {
+            "cell_type": "markdown",
+            "source": "## 1. Setup necessary libraries and load the dataset"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Install and import necessary libraries.\n!pip install pandas sentence-transformers faiss-cpu "
+        },
+        {
+            "cell_type": "code",
+            "source": "from sentence_transformers import SentenceTransformer\nimport faiss"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Load the dataset as a DataFrame\n{first_code}"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Specify the column name that contains the text data to generate embeddings\ncolumn_to_generate_embeddings = '{longest_col}'"
+        },
+        {
+            "cell_type": "markdown",
+            "source": "## 2. Loading embedding model and creating FAISS index"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Remove duplicate entries based on the specified column\ndf = df.drop_duplicates(subset=column_to_generate_embeddings)"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Convert the column data to a list of text entries\ntext_list = df[column_to_generate_embeddings].tolist()"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Specify the embedding model you want to use\nmodel = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')"
+        },
+        {
+            "cell_type": "code",
+            "source": "vectors = model.encode(text_list)\nvector_dimension = vectors.shape[1]\n\n# Initialize the FAISS index with the appropriate dimension (384 for this model)\nindex = faiss.IndexFlatL2(vector_dimension)\n\n# Encode the text list into embeddings and add them to the FAISS index\nindex.add(vectors)"
+        },
+        {
+            "cell_type": "markdown",
+            "source": "## 3. Perform a text search"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Specify the text you want to search for in the list\ntext_to_search = text_list[0]\nprint(f\"Text to search: {text_to_search}\")"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Generate the embedding for the search query\nquery_embedding = model.encode([text_to_search])"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)\nD, I = index.search(query_embedding, k=10)\n\n# Print the similar documents\nprint(f\"Similar documents: {[text_list[i] for i in I[0]]}\")"
+        }
+    ]
+}

notebooks/finetuning.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+    "notebook_title": "Supervised fine-tuning (SFT)",
+    "notebook_type": "sft",
+    "dataset_type": "numeric",
+    "notebook_template": []
+}

notebooks/rag.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+    "notebook_title": "Retrieval-augmented generation (RAG)",
+    "notebook_type": "rag",
+    "dataset_type": "text",
+    "notebook_template": []
+}

utils/notebook_utils.py CHANGED Viewed

@@ -1,3 +1,7 @@
 def replace_wildcards(
     templates, wildcards, replacements, has_numeric_columns, has_categoric_columns
 ):
@@ -20,7 +24,7 @@ def replace_wildcards(
     return new_templates
-embeggins_cells = [
     {
         "cell_type": "markdown",
         "source": """
@@ -475,3 +479,17 @@ def generate_rag_system_prompt():
     Use the provided code to load the dataset; do not use any other method.
     """

+import os
+import json
 def replace_wildcards(
     templates, wildcards, replacements, has_numeric_columns, has_categoric_columns
 ):
     return new_templates
+embeddings_cells = [
     {
         "cell_type": "markdown",
         "source": """
     Use the provided code to load the dataset; do not use any other method.
     """
def load_json_files_from_folder(folder_path):
    """Load every ``.json`` file found directly inside *folder_path*.

    Each file is parsed as JSON and stored under the value of its
    ``"notebook_title"`` key. Files are read in sorted filename order so the
    resulting mapping has the same key order on every run and platform.

    Args:
        folder_path: Directory to scan (not recursive). Entries whose name
            does not end in ``.json`` are ignored.

    Returns:
        dict mapping each file's ``"notebook_title"`` to its parsed content.
        If two files share a title, the one that sorts last wins.

    Raises:
        OSError: If the folder or one of its files cannot be read.
        json.JSONDecodeError: If a file is not valid JSON.
        KeyError: If a file has no ``"notebook_title"`` key.
    """
    components = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # insertion order of the result deterministic.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(folder_path, filename)
        # Read as UTF-8 explicitly instead of relying on the platform's
        # default locale encoding, which may not handle non-ASCII text.
        with open(file_path, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)
        components[data["notebook_title"]] = data
    return components