Spaces:

asoria
/

auto-notebook-creator

Running

App Files Community

asoria committed on Sep 18, 2024

Commit

0b212ec

1 Parent(s): 1c042c7

Adding eda and rag as templates

Browse files

Files changed (5) hide show

app.py +8 -6
notebooks/eda.json +14 -14
notebooks/finetuning.json +0 -6
notebooks/rag.json +82 -1
utils/notebook_utils.py +0 -457

app.py CHANGED Viewed

@@ -32,6 +32,11 @@ client = Client(headers=HEADERS)
 logging.basicConfig(level=logging.INFO)
 def get_compatible_libraries(dataset: str):
     try:
@@ -116,11 +121,6 @@ def _push_to_hub(
         raise
-folder_path = "notebooks"
-notebook_templates = load_json_files_from_folder(folder_path)
-logging.info(f"Available notebooks {notebook_templates.keys()}")
 def generate_cells(dataset_id, notebook_title):
     logging.info(f"Generating {notebook_title} notebook for dataset {dataset_id}")
     cells = notebook_templates[notebook_title]["notebook_template"]
@@ -248,7 +248,9 @@ with gr.Blocks(
     gr.Markdown("## 2. Select the type of notebook you want to generate")
     with gr.Row():
         notebook_type = gr.Dropdown(
-            choices=notebook_templates.keys(), label="Notebook type"
         )
         generate_button = gr.Button("Generate Notebook", variant="primary")
         contribute_btn = gr.Button(

 logging.basicConfig(level=logging.INFO)
+# TODO: Validate notebook templates format
+folder_path = "notebooks"
+notebook_templates = load_json_files_from_folder(folder_path)
+logging.info(f"Available notebooks {notebook_templates.keys()}")
 def get_compatible_libraries(dataset: str):
     try:
         raise
 def generate_cells(dataset_id, notebook_title):
     logging.info(f"Generating {notebook_title} notebook for dataset {dataset_id}")
     cells = notebook_templates[notebook_title]["notebook_template"]
     gr.Markdown("## 2. Select the type of notebook you want to generate")
     with gr.Row():
         notebook_type = gr.Dropdown(
+            choices=notebook_templates.keys(),
+            label="Notebook type",
+            value="Text Embeddings",
         )
         generate_button = gr.Button("Generate Notebook", variant="primary")
         contribute_btn = gr.Button(

notebooks/eda.json CHANGED Viewed

@@ -5,7 +5,7 @@
     "notebook_template": [
         {
             "cell_type": "markdown",
-            "source": "\n---\n# **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**\n---\n"
         },
         {
             "cell_type": "markdown",
@@ -13,15 +13,15 @@
         },
         {
             "cell_type": "code",
-            "source": "\n# Install and import necessary libraries.\n!pip install pandas matplotlib seaborn\n"
         },
         {
             "cell_type": "code",
-            "source": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n"
         },
         {
             "cell_type": "code",
-            "source": "\n# Load the dataset as a DataFrame\n{first_code}\n"
         },
         {
             "cell_type": "markdown",
@@ -29,28 +29,28 @@
         },
         {
             "cell_type": "code",
-            "source": "\n# First rows of the dataset and info\nprint(df.head())\nprint(df.info())\n"
         },
         {
             "cell_type": "code",
-            "source": "\n# Check for missing values\nprint(df.isnull().sum())\n"
         },
         {
             "cell_type": "code",
-            "source": "\n# Identify data types of each column\nprint(df.dtypes)\n"
         },
         {
             "cell_type": "code",
-            "source": "\n# Detect duplicated rows\nprint(df.duplicated().sum())\n"
         },
         {
             "cell_type": "code",
-            "source": "\n# Generate descriptive statistics\nprint(df.describe())\n"
         },
         {
             "type": "categoric",
             "cell_type": "code",
-            "source": "\n# Unique values in categorical columns\ndf.select_dtypes(include=['object']).nunique()\n"
         },
         {
             "cell_type": "markdown",
@@ -59,22 +59,22 @@
         {
             "type": "numeric",
             "cell_type": "code",
-            "source": "\n# Correlation matrix for numerical columns\ncorr_matrix = df.corr(numeric_only=True)\nplt.figure(figsize=(10, 8))\nsns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)\nplt.title('Correlation Matrix')\nplt.show()\n"
         },
         {
             "type": "numeric",
             "cell_type": "code",
-            "source": "\n# Distribution plots for numerical columns\nfor column in df.select_dtypes(include=['int64', 'float64']).columns:\n    plt.figure(figsize=(8, 4))\n    sns.histplot(df[column], kde=True)\n    plt.title(f'Distribution of {column}')\n    plt.xlabel(column)\n    plt.ylabel('Frequency')\n    plt.show()\n"
         },
         {
             "type": "categoric",
             "cell_type": "code",
-            "source": "\n# Count plots for categorical columns\nfor column in df.select_dtypes(include=['object']).columns:\n    plt.figure(figsize=(8, 4))\n    sns.countplot(x=column, data=df)\n    plt.title(f'Count Plot of {column}')\n    plt.xlabel(column)\n    plt.ylabel('Count')\n    plt.show()\n"
         },
         {
             "type": "numeric",
             "cell_type": "code",
-            "source": "\n# Box plots for detecting outliers in numerical columns\nfor column in df.select_dtypes(include=['int64', 'float64']).columns:\n    plt.figure(figsize=(8, 4))\n    sns.boxplot(df[column])\n    plt.title(f'Box Plot of {column}')\n    plt.xlabel(column)\n    plt.show()\n"
         }
     ]
 }

     "notebook_template": [
         {
             "cell_type": "markdown",
+            "source": "---\n# **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**\n---"
         },
         {
             "cell_type": "markdown",
         },
         {
             "cell_type": "code",
+            "source": "# Install and import necessary libraries.\n!pip install pandas matplotlib seaborn"
         },
         {
             "cell_type": "code",
+            "source": "import matplotlib.pyplot as plt\nimport seaborn as sns"
         },
         {
             "cell_type": "code",
+            "source": "# Load the dataset as a DataFrame\n{first_code}"
         },
         {
             "cell_type": "markdown",
         },
         {
             "cell_type": "code",
+            "source": "# First rows of the dataset and info\nprint(df.head())\nprint(df.info())"
         },
         {
             "cell_type": "code",
+            "source": "# Check for missing values\nprint(df.isnull().sum())"
         },
         {
             "cell_type": "code",
+            "source": "# Identify data types of each column\nprint(df.dtypes)"
         },
         {
             "cell_type": "code",
+            "source": "# Detect duplicated rows\nprint(df.duplicated().sum())"
         },
         {
             "cell_type": "code",
+            "source": "# Generate descriptive statistics\nprint(df.describe())"
         },
         {
             "type": "categoric",
             "cell_type": "code",
+            "source": "# Unique values in categorical columns\ndf.select_dtypes(include=['object']).nunique()"
         },
         {
             "cell_type": "markdown",
         {
             "type": "numeric",
             "cell_type": "code",
+            "source": "# Correlation matrix for numerical columns\ncorr_matrix = df.corr(numeric_only=True)\nplt.figure(figsize=(10, 8))\nsns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)\nplt.title('Correlation Matrix')\nplt.show()"
         },
         {
             "type": "numeric",
             "cell_type": "code",
+            "source": "# Distribution plots for numerical columns\nfor column in df.select_dtypes(include=['int64', 'float64']).columns:\n   plt.figure(figsize=(8, 4))\n   sns.histplot(df[column], kde=True)\n   plt.title(f'Distribution of {column}')\n   plt.xlabel(column)\n   plt.ylabel('Frequency')\n   plt.show()"
         },
         {
             "type": "categoric",
             "cell_type": "code",
+            "source": "# Count plots for categorical columns\nfor column in df.select_dtypes(include=['object']).columns:\n   plt.figure(figsize=(8, 4))\n   sns.countplot(x=column, data=df)\n   plt.title(f'Count Plot of {column}')\n   plt.xlabel(column)\n   plt.ylabel('Count')\n   plt.show()"
         },
         {
             "type": "numeric",
             "cell_type": "code",
+            "source": "# Box plots for detecting outliers in numerical columns\nfor column in df.select_dtypes(include=['int64', 'float64']).columns:\n   plt.figure(figsize=(8, 4))\n   sns.boxplot(df[column])\n   plt.title(f'Box Plot of {column}')\n   plt.xlabel(column)\n   plt.show()"
         }
     ]
 }

notebooks/finetuning.json DELETED Viewed

@@ -1,6 +0,0 @@
-{
-    "notebook_title": "Supervised fine-tuning (SFT)",
-    "notebook_type": "sft",
-    "dataset_type": "numeric",
-    "notebook_template": []
-}

notebooks/rag.json CHANGED Viewed

@@ -2,5 +2,86 @@
     "notebook_title": "Retrieval-augmented generation (RAG)",
     "notebook_type": "rag",
     "dataset_type": "text",
-    "notebook_template": []
 }

     "notebook_title": "Retrieval-augmented generation (RAG)",
     "notebook_type": "rag",
     "dataset_type": "text",
+    "notebook_template": [
+        {
+            "cell_type": "markdown",
+            "source": "---\n# **Retrieval-Augmented Generation Notebook for {dataset_name} dataset**\n---"
+        },
+        {
+            "cell_type": "markdown",
+            "source": "## 1. Setup necessary libraries and load the dataset"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Install and import necessary libraries.\n!pip install pandas sentence-transformers faiss-cpu transformers torch huggingface_hub"
+        },
+        {
+            "cell_type": "code",
+            "source": "from sentence_transformers import SentenceTransformer\nfrom transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\nfrom huggingface_hub import InferenceClient\nimport pandas as pd\nimport faiss\nimport torch"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Load the dataset as a DataFrame\n{first_code}"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Specify the column name that contains the text data to generate embeddings\ncolumn_to_generate_embeddings = '{longest_col}'"
+        },
+        {
+            "cell_type": "markdown",
+            "source": "## 2. Loading embedding model and creating FAISS index"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Remove duplicate entries based on the specified column\ndf = df.drop_duplicates(subset=column_to_generate_embeddings)"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Convert the column data to a list of text entries\ntext_list = df[column_to_generate_embeddings].tolist()"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Specify the embedding model you want to use\nmodel = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')"
+        },
+        {
+            "cell_type": "code",
+            "source": "vectors = model.encode(text_list)\nvector_dimension = vectors.shape[1]\n\n# Initialize the FAISS index with the appropriate dimension (384 for this model)\nindex = faiss.IndexFlatL2(vector_dimension)\n\n# Encode the text list into embeddings and add them to the FAISS index\nindex.add(vectors)"
+        },
+        {
+            "cell_type": "markdown",
+            "source": "## 3. Perform a text search"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Specify the text you want to search for in the list\nquery = \"How to cook sushi?\"\n\n# Generate the embedding for the search query\nquery_embedding = model.encode([query])"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)\nD, I = index.search(query_embedding, k=10)\n\n# Print the similar documents found\nprint(f\"Similar documents: {[text_list[i] for i in I[0]]}\")"
+        },
+        {
+            "cell_type": "markdown",
+            "source": "## 4. Load pipeline and perform inference locally"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Adjust model name as needed\ncheckpoint = 'HuggingFaceTB/SmolLM-1.7B-Instruct'\n\ndevice = \"cuda\" if torch.cuda.is_available() else \"cpu\" # for GPU usage or \"cpu\" for CPU usage\n\ntokenizer = AutoTokenizer.from_pretrained(checkpoint)\nmodel = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)\n\ngenerator = pipeline(\"text-generation\", model=model, tokenizer=tokenizer, device=0 if device == \"cuda\" else -1)"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query\nselected_elements = [text_list[i] for i in I[0].tolist()]\ncontext = ','.join(selected_elements)\nmessages = [\n    {\n        \"role\": \"system\",\n        \"content\": f\"You are an intelligent assistant tasked with providing accurate and concise answers based on the following context. Use the information retrieved to construct your response. Context: {context}\",\n    },\n    {\"role\": \"user\", \"content\": query},\n]"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Send the prompt to the pipeline and show the answer\noutput = generator(messages)\nprint(\"Generated result:\")\nprint(output[0]['generated_text'][-1]['content']) # Print the assistant's response content"
+        },
+        {
+            "cell_type": "markdown",
+            "source": "## 5. Alternatively call the inference client"
+        },
+        {
+            "cell_type": "code",
+            "source": "# Adjust model name as needed\ncheckpoint = \"meta-llama/Meta-Llama-3-8B-Instruct\"\n\n# Change here your Hugging Face API token\ntoken = \"hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\" \n\ninference_client = InferenceClient(checkpoint, token=token)\noutput = inference_client.chat_completion(messages=messages, stream=False)\nprint(\"Generated result:\")\nprint(output.choices[0].message.content)"
+        }
+    ]
 }

utils/notebook_utils.py CHANGED Viewed

@@ -24,463 +24,6 @@ def replace_wildcards(
     return new_templates
-embeddings_cells = [
-    {
-        "cell_type": "markdown",
-        "source": """
----
-# **Embeddings Notebook for {dataset_name} dataset**
----
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 1. Setup necessary libraries and load the dataset",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Install and import necessary libraries.
-!pip install pandas sentence-transformers faiss-cpu
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-import pandas as pd
-from sentence_transformers import SentenceTransformer
-import faiss
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Load the dataset as a DataFrame
-{first_code}
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Specify the column name that contains the text data to generate embeddings
-column_to_generate_embeddings = '{longest_col}'
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 2. Loading embedding model and creating FAISS index",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Remove duplicate entries based on the specified column
-df = df.drop_duplicates(subset=column_to_generate_embeddings)
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Convert the column data to a list of text entries
-text_list = df[column_to_generate_embeddings].tolist()
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Specify the embedding model you want to use
-model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-vectors = model.encode(text_list)
-vector_dimension = vectors.shape[1]
-# Initialize the FAISS index with the appropriate dimension (384 for this model)
-index = faiss.IndexFlatL2(vector_dimension)
-# Encode the text list into embeddings and add them to the FAISS index
-index.add(vectors)
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 3. Perform a text search",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Specify the text you want to search for in the list
-text_to_search = text_list[0]
-print(f"Text to search: {text_to_search}")
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Generate the embedding for the search query
-query_embedding = model.encode([text_to_search])
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)
-D, I = index.search(query_embedding, k=10)
-# Print the similar documents found
-print(f"Similar documents: {[text_list[i] for i in I[0]]}")
-""",
-    },
-]
-eda_cells = [
-    {
-        "cell_type": "markdown",
-        "source": """
----
-# **Exploratory Data Analysis (EDA) Notebook for {dataset_name} dataset**
----
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 1. Setup necessary libraries and load the dataset",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Install and import necessary libraries.
-!pip install pandas matplotlib seaborn
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-import pandas as pd
-import matplotlib.pyplot as plt
-import seaborn as sns
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Load the dataset as a DataFrame
-{first_code}
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 2. Understanding the Dataset",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# First rows of the dataset and info
-print(df.head())
-print(df.info())
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Check for missing values
-print(df.isnull().sum())
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Identify data types of each column
-print(df.dtypes)
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Detect duplicated rows
-print(df.duplicated().sum())
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Generate descriptive statistics
-print(df.describe())
-""",
-    },
-    {
-        "type": "categoric",
-        "cell_type": "code",
-        "source": """
-# Unique values in categorical columns
-df.select_dtypes(include=['object']).nunique()
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 3. Data Visualization",
-    },
-    {
-        "type": "numeric",
-        "cell_type": "code",
-        "source": """
-# Correlation matrix for numerical columns
-corr_matrix = df.corr(numeric_only=True)
-plt.figure(figsize=(10, 8))
-sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True)
-plt.title('Correlation Matrix')
-plt.show()
-""",
-    },
-    {
-        "type": "numeric",
-        "cell_type": "code",
-        "source": """
-# Distribution plots for numerical columns
-for column in df.select_dtypes(include=['int64', 'float64']).columns:
-    plt.figure(figsize=(8, 4))
-    sns.histplot(df[column], kde=True)
-    plt.title(f'Distribution of {column}')
-    plt.xlabel(column)
-    plt.ylabel('Frequency')
-    plt.show()
-""",
-    },
-    {
-        "type": "categoric",
-        "cell_type": "code",
-        "source": """
-# Count plots for categorical columns
-for column in df.select_dtypes(include=['object']).columns:
-    plt.figure(figsize=(8, 4))
-    sns.countplot(x=column, data=df)
-    plt.title(f'Count Plot of {column}')
-    plt.xlabel(column)
-    plt.ylabel('Count')
-    plt.show()
-""",
-    },
-    {
-        "type": "numeric",
-        "cell_type": "code",
-        "source": """
-# Box plots for detecting outliers in numerical columns
-for column in df.select_dtypes(include=['int64', 'float64']).columns:
-    plt.figure(figsize=(8, 4))
-    sns.boxplot(df[column])
-    plt.title(f'Box Plot of {column}')
-    plt.xlabel(column)
-    plt.show()
-""",
-    },
-]
-rag_cells = [
-    {
-        "cell_type": "markdown",
-        "source": """
----
-# **Retrieval-Augmented Generation Notebook for {dataset_name} dataset**
----
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 1. Setup necessary libraries and load the dataset",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Install and import necessary libraries.
-!pip install pandas sentence-transformers faiss-cpu transformers torch huggingface_hub
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-from sentence_transformers import SentenceTransformer
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
-from huggingface_hub import InferenceClient
-import pandas as pd
-import faiss
-import torch
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Load the dataset as a DataFrame
-{first_code}
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Specify the column name that contains the text data to generate embeddings
-column_to_generate_embeddings = '{longest_col}'
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 2. Loading embedding model and creating FAISS index",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Remove duplicate entries based on the specified column
-df = df.drop_duplicates(subset=column_to_generate_embeddings)
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Convert the column data to a list of text entries
-text_list = df[column_to_generate_embeddings].tolist()
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Specify the embedding model you want to use
-model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-vectors = model.encode(text_list)
-vector_dimension = vectors.shape[1]
-# Initialize the FAISS index with the appropriate dimension (384 for this model)
-index = faiss.IndexFlatL2(vector_dimension)
-# Encode the text list into embeddings and add them to the FAISS index
-index.add(vectors)
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 3. Perform a text search",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Specify the text you want to search for in the list
-query = "How to cook sushi?"
-# Generate the embedding for the search query
-query_embedding = model.encode([query])
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Perform the search to find the 'k' nearest neighbors (adjust 'k' as needed)
-D, I = index.search(query_embedding, k=10)
-# Print the similar documents found
-print(f"Similar documents: {[text_list[i] for i in I[0]]}")
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 4. Load pipeline and perform inference locally",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Adjust model name as needed
-checkpoint = 'HuggingFaceTB/SmolLM-1.7B-Instruct'
-device = "cuda" if torch.cuda.is_available() else "cpu" # for GPU usage or "cpu" for CPU usage
-tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
-generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if device == "cuda" else -1)
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query
-selected_elements = [text_list[i] for i in I[0].tolist()]
-context = ','.join(selected_elements)
-messages = [
-    {
-        "role": "system",
-        "content": f"You are an intelligent assistant tasked with providing accurate and concise answers based on the following context. Use the information retrieved to construct your response. Context: {context}",
-    },
-    {"role": "user", "content": query},
-]
-""",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Send the prompt to the pipeline and show the answer
-output = generator(messages)
-print("Generated result:")
-print(output[0]['generated_text'][-1]['content']) # Print the assistant's response content
-""",
-    },
-    {
-        "cell_type": "markdown",
-        "source": "## 5. Alternatively call the inference client",
-    },
-    {
-        "cell_type": "code",
-        "source": """
-# Adjust model name as needed
-checkpoint = "meta-llama/Meta-Llama-3-8B-Instruct"
-# Change here your Hugging Face API token
-token = "hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
-inference_client = InferenceClient(checkpoint, token=token)
-output = inference_client.chat_completion(messages=messages, stream=False)
-print("Generated result:")
-print(output.choices[0].message.content)
-""",
-    },
-]
-def generate_rag_system_prompt():
-    """
-    1. Install necessary libraries.
-    2. Import libraries.
-    3. Load the dataset as a DataFrame using the provided code.
-    4. Select the column for generating embeddings.
-    5. Remove duplicate data.
-    6. Convert the selected column to a list.
-    7. Load the sentence-transformers model.
-    8. Create a FAISS index.
-    9. Encode a query sample.
-    10. Search for similar documents using the FAISS index.
-    11. Load the 'HuggingFaceH4/zephyr-7b-beta' model from the transformers library and create a pipeline.
-    12. Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query.
-    13. Send the prompt to the pipeline and display the answer.
-    Ensure the notebook is well-organized with explanations for each step.
-    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
-    The user will provide the dataset information in the following format:
-    ## Columns and Data Types
-    ## Sample Data
-    ## Loading Data code
-    Use the provided code to load the dataset; do not use any other method.
-    """
 def load_json_files_from_folder(folder_path):
     components = {}

     return new_templates
 def load_json_files_from_folder(folder_path):
     components = {}