Spaces:

MilesCranmer
/

PySR

Running

App Files Files Community

MilesCranmer committed on Apr 1, 2024

Commit

9fa2182

unverified ·

1 Parent(s): 7f0b93d

Refactor GUI to multiple files

Browse files

Files changed (4) hide show

gui/app.py +4 -250
gui/data.py +22 -0
gui/plots.py +84 -0
gui/processing.py +150 -0

gui/app.py CHANGED Viewed

@@ -1,184 +1,8 @@
-import multiprocessing as mp
-import os
-import tempfile
-import time
-from pathlib import Path
 import gradio as gr
-import numpy as np
-import pandas as pd
-from matplotlib import pyplot as plt
-plt.ioff()
-plt.rcParams["font.family"] = [
-    "IBM Plex Mono",
-    # Fallback fonts:
-    "DejaVu Sans Mono",
-    "Courier New",
-    "monospace",
-]
-empty_df = lambda: pd.DataFrame(
-    {
-        "equation": [],
-        "loss": [],
-        "complexity": [],
-    }
-)
-test_equations = ["sin(2*x)/x + 0.1*x"]
-def generate_data(s: str, num_points: int, noise_level: float, data_seed: int):
-    rstate = np.random.RandomState(data_seed)
-    x = rstate.uniform(-10, 10, num_points)
-    for k, v in {
-        "sin": "np.sin",
-        "cos": "np.cos",
-        "exp": "np.exp",
-        "log": "np.log",
-        "tan": "np.tan",
-        "^": "**",
-    }.items():
-        s = s.replace(k, v)
-    y = eval(s)
-    noise = rstate.normal(0, noise_level, y.shape)
-    y_noisy = y + noise
-    return pd.DataFrame({"x": x}), y_noisy
-def _greet_dispatch(
-    file_input,
-    force_run,
-    test_equation,
-    num_points,
-    noise_level,
-    data_seed,
-    niterations,
-    maxsize,
-    binary_operators,
-    unary_operators,
-    plot_update_delay,
-    parsimony,
-    populations,
-    population_size,
-    ncycles_per_iteration,
-    elementwise_loss,
-    adaptive_parsimony_scaling,
-    optimizer_algorithm,
-    optimizer_iterations,
-    batching,
-    batch_size,
-):
-    """Load data, then spawn a process to run the greet function."""
-    if file_input is not None:
-        # Look at some statistics of the file:
-        df = pd.read_csv(file_input)
-        if len(df) == 0:
-            return (
-                empty_df(),
-                "The file is empty!",
-            )
-        if len(df.columns) == 1:
-            return (
-                empty_df(),
-                "The file has only one column!",
-            )
-        if len(df) > 10_000 and not force_run:
-            return (
-                empty_df(),
-                "You have uploaded a file with more than 10,000 rows. "
-                "This will take very long to run. "
-                "Please upload a subsample of the data, "
-                "or check the box 'Ignore Warnings'.",
-            )
-        col_to_fit = df.columns[-1]
-        y = np.array(df[col_to_fit])
-        X = df.drop([col_to_fit], axis=1)
-    else:
-        X, y = generate_data(test_equation, num_points, noise_level, data_seed)
-    with tempfile.TemporaryDirectory() as tmpdirname:
-        base = Path(tmpdirname)
-        equation_file = base / "hall_of_fame.csv"
-        equation_file_bkup = base / "hall_of_fame.csv.bkup"
-        process = mp.Process(
-            target=greet,
-            kwargs=dict(
-                X=X,
-                y=y,
-                niterations=niterations,
-                maxsize=maxsize,
-                binary_operators=binary_operators,
-                unary_operators=unary_operators,
-                equation_file=equation_file,
-                parsimony=parsimony,
-                populations=populations,
-                population_size=population_size,
-                ncycles_per_iteration=ncycles_per_iteration,
-                elementwise_loss=elementwise_loss,
-                adaptive_parsimony_scaling=adaptive_parsimony_scaling,
-                optimizer_algorithm=optimizer_algorithm,
-                optimizer_iterations=optimizer_iterations,
-                batching=batching,
-                batch_size=batch_size,
-            ),
-        )
-        process.start()
-        last_yield_time = None
-        while process.is_alive():
-            if equation_file_bkup.exists():
-                try:
-                    # First, copy the file to a the copy file
-                    equation_file_copy = base / "hall_of_fame_copy.csv"
-                    os.system(f"cp {equation_file_bkup} {equation_file_copy}")
-                    equations = pd.read_csv(equation_file_copy)
-                    # Ensure it is pareto dominated, with more complex expressions
-                    # having higher loss. Otherwise remove those rows.
-                    # TODO: Not sure why this occurs; could be the result of a late copy?
-                    equations.sort_values("Complexity", ascending=True, inplace=True)
-                    equations.reset_index(inplace=True)
-                    bad_idx = []
-                    min_loss = None
-                    for i in equations.index:
-                        if min_loss is None or equations.loc[i, "Loss"] < min_loss:
-                            min_loss = float(equations.loc[i, "Loss"])
-                        else:
-                            bad_idx.append(i)
-                    equations.drop(index=bad_idx, inplace=True)
-                    while (
-                        last_yield_time is not None
-                        and time.time() - last_yield_time < plot_update_delay
-                    ):
-                        time.sleep(0.1)
-                    yield equations[["Complexity", "Loss", "Equation"]]
-                    last_yield_time = time.time()
-                except pd.errors.EmptyDataError:
-                    pass
-        process.join()
-def greet(
-    *,
-    X,
-    y,
-    **pysr_kwargs,
-):
-    import pysr
-    model = pysr.PySRRegressor(
-        progress=False,
-        timeout_in_seconds=1000,
-        **pysr_kwargs,
-    )
-    model.fit(X, y)
-    return 0
 def _data_layout():
@@ -372,7 +196,7 @@ def main():
                 blocks["run"] = gr.Button()
         blocks["run"].click(
-            _greet_dispatch,
             inputs=[
                 blocks[k]
                 for k in [
@@ -423,75 +247,5 @@ def main():
     demo.launch(debug=True)
-def replot_pareto(df, maxsize):
-    fig, ax = plt.subplots(figsize=(6, 6), dpi=100)
-    if len(df) == 0 or "Equation" not in df.columns:
-        return fig
-    # Plotting the data
-    ax.loglog(
-        df["Complexity"],
-        df["Loss"],
-        marker="o",
-        linestyle="-",
-        color="#333f48",
-        linewidth=1.5,
-        markersize=6,
-    )
-    # Set the axis limits
-    ax.set_xlim(0.5, maxsize + 1)
-    ytop = 2 ** (np.ceil(np.log2(df["Loss"].max())))
-    ybottom = 2 ** (np.floor(np.log2(df["Loss"].min() + 1e-20)))
-    ax.set_ylim(ybottom, ytop)
-    ax.grid(True, which="both", ls="--", linewidth=0.5, color="gray", alpha=0.5)
-    ax.spines["top"].set_visible(False)
-    ax.spines["right"].set_visible(False)
-    # Range-frame the plot
-    for direction in ["bottom", "left"]:
-        ax.spines[direction].set_position(("outward", 10))
-    # Delete far ticks
-    ax.tick_params(axis="both", which="major", labelsize=10, direction="out", length=5)
-    ax.tick_params(axis="both", which="minor", labelsize=8, direction="out", length=3)
-    ax.set_xlabel("Complexity")
-    ax.set_ylabel("Loss")
-    fig.tight_layout(pad=2)
-    return fig
-def replot(test_equation, num_points, noise_level, data_seed):
-    X, y = generate_data(test_equation, num_points, noise_level, data_seed)
-    x = X["x"]
-    plt.rcParams["font.family"] = "IBM Plex Mono"
-    fig, ax = plt.subplots(figsize=(6, 6), dpi=100)
-    ax.scatter(x, y, alpha=0.7, edgecolors="w", s=50)
-    ax.grid(True, which="both", ls="--", linewidth=0.5, color="gray", alpha=0.5)
-    ax.spines["top"].set_visible(False)
-    ax.spines["right"].set_visible(False)
-    # Range-frame the plot
-    for direction in ["bottom", "left"]:
-        ax.spines[direction].set_position(("outward", 10))
-    # Delete far ticks
-    ax.tick_params(axis="both", which="major", labelsize=10, direction="out", length=5)
-    ax.tick_params(axis="both", which="minor", labelsize=8, direction="out", length=3)
-    ax.set_xlabel("x")
-    ax.set_ylabel("y")
-    fig.tight_layout(pad=2)
-    return fig
 if __name__ == "__main__":
     main()

 import gradio as gr
+from .data import test_equations
+from .plots import replot, replot_pareto
+from .processing import process
 def _data_layout():
                 blocks["run"] = gr.Button()
         blocks["run"].click(
+            process,
             inputs=[
                 blocks[k]
                 for k in [
     demo.launch(debug=True)
 if __name__ == "__main__":
     main()

gui/data.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import numpy as np
+import pandas as pd
+test_equations = ["sin(2*x)/x + 0.1*x"]
+def generate_data(s: str, num_points: int, noise_level: float, data_seed: int):
+    rstate = np.random.RandomState(data_seed)
+    x = rstate.uniform(-10, 10, num_points)
+    for k, v in {
+        "sin": "np.sin",
+        "cos": "np.cos",
+        "exp": "np.exp",
+        "log": "np.log",
+        "tan": "np.tan",
+        "^": "**",
+    }.items():
+        s = s.replace(k, v)
+    y = eval(s)
+    noise = rstate.normal(0, noise_level, y.shape)
+    y_noisy = y + noise
+    return pd.DataFrame({"x": x}), y_noisy

gui/plots.py ADDED Viewed

	@@ -0,0 +1,84 @@

+import numpy as np
+import pandas as pd
+from matplotlib import pyplot as plt
+plt.ioff()
+plt.rcParams["font.family"] = [
+    "IBM Plex Mono",
+    # Fallback fonts:
+    "DejaVu Sans Mono",
+    "Courier New",
+    "monospace",
+]
+from .data import generate_data
+def replot_pareto(df: pd.DataFrame, maxsize: int):
+    fig, ax = plt.subplots(figsize=(6, 6), dpi=100)
+    if len(df) == 0 or "Equation" not in df.columns:
+        return fig
+    # Plotting the data
+    ax.loglog(
+        df["Complexity"],
+        df["Loss"],
+        marker="o",
+        linestyle="-",
+        color="#333f48",
+        linewidth=1.5,
+        markersize=6,
+    )
+    # Set the axis limits
+    ax.set_xlim(0.5, maxsize + 1)
+    ytop = 2 ** (np.ceil(np.log2(df["Loss"].max())))
+    ybottom = 2 ** (np.floor(np.log2(df["Loss"].min() + 1e-20)))
+    ax.set_ylim(ybottom, ytop)
+    ax.grid(True, which="both", ls="--", linewidth=0.5, color="gray", alpha=0.5)
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)
+    # Range-frame the plot
+    for direction in ["bottom", "left"]:
+        ax.spines[direction].set_position(("outward", 10))
+    # Delete far ticks
+    ax.tick_params(axis="both", which="major", labelsize=10, direction="out", length=5)
+    ax.tick_params(axis="both", which="minor", labelsize=8, direction="out", length=3)
+    ax.set_xlabel("Complexity")
+    ax.set_ylabel("Loss")
+    fig.tight_layout(pad=2)
+    return fig
+def replot(test_equation, num_points, noise_level, data_seed):
+    X, y = generate_data(test_equation, num_points, noise_level, data_seed)
+    x = X["x"]
+    plt.rcParams["font.family"] = "IBM Plex Mono"
+    fig, ax = plt.subplots(figsize=(6, 6), dpi=100)
+    ax.scatter(x, y, alpha=0.7, edgecolors="w", s=50)
+    ax.grid(True, which="both", ls="--", linewidth=0.5, color="gray", alpha=0.5)
+    ax.spines["top"].set_visible(False)
+    ax.spines["right"].set_visible(False)
+    # Range-frame the plot
+    for direction in ["bottom", "left"]:
+        ax.spines[direction].set_position(("outward", 10))
+    # Delete far ticks
+    ax.tick_params(axis="both", which="major", labelsize=10, direction="out", length=5)
+    ax.tick_params(axis="both", which="minor", labelsize=8, direction="out", length=3)
+    ax.set_xlabel("x")
+    ax.set_ylabel("y")
+    fig.tight_layout(pad=2)
+    return fig

gui/processing.py ADDED Viewed

	@@ -0,0 +1,150 @@

+import multiprocessing as mp
+import os
+import tempfile
+import time
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from .data import generate_data
+EMPTY_DF = lambda: pd.DataFrame(
+    {
+        "Equation": [],
+        "Loss": [],
+        "Complexity": [],
+    }
+)
+def process(
+    file_input,
+    force_run,
+    test_equation,
+    num_points,
+    noise_level,
+    data_seed,
+    niterations,
+    maxsize,
+    binary_operators,
+    unary_operators,
+    plot_update_delay,
+    parsimony,
+    populations,
+    population_size,
+    ncycles_per_iteration,
+    elementwise_loss,
+    adaptive_parsimony_scaling,
+    optimizer_algorithm,
+    optimizer_iterations,
+    batching,
+    batch_size,
+):
+    """Load data, then spawn a process to run the pysr_fit function."""
+    if file_input is not None:
+        # Look at some statistics of the file:
+        df = pd.read_csv(file_input)
+        if len(df) == 0:
+            return (
+                EMPTY_DF(),
+                "The file is empty!",
+            )
+        if len(df.columns) == 1:
+            return (
+                EMPTY_DF(),
+                "The file has only one column!",
+            )
+        if len(df) > 10_000 and not force_run:
+            return (
+                EMPTY_DF(),
+                "You have uploaded a file with more than 10,000 rows. "
+                "This will take very long to run. "
+                "Please upload a subsample of the data, "
+                "or check the box 'Ignore Warnings'.",
+            )
+        col_to_fit = df.columns[-1]
+        y = np.array(df[col_to_fit])
+        X = df.drop([col_to_fit], axis=1)
+    else:
+        X, y = generate_data(test_equation, num_points, noise_level, data_seed)
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        base = Path(tmpdirname)
+        equation_file = base / "hall_of_fame.csv"
+        equation_file_bkup = base / "hall_of_fame.csv.bkup"
+        process = mp.Process(
+            target=pysr_fit,
+            kwargs=dict(
+                X=X,
+                y=y,
+                niterations=niterations,
+                maxsize=maxsize,
+                binary_operators=binary_operators,
+                unary_operators=unary_operators,
+                equation_file=equation_file,
+                parsimony=parsimony,
+                populations=populations,
+                population_size=population_size,
+                ncycles_per_iteration=ncycles_per_iteration,
+                elementwise_loss=elementwise_loss,
+                adaptive_parsimony_scaling=adaptive_parsimony_scaling,
+                optimizer_algorithm=optimizer_algorithm,
+                optimizer_iterations=optimizer_iterations,
+                batching=batching,
+                batch_size=batch_size,
+            ),
+        )
+        process.start()
+        last_yield_time = None
+        while process.is_alive():
+            if equation_file_bkup.exists():
+                try:
+                    # First, copy the backup file to a separate copy file
+                    equation_file_copy = base / "hall_of_fame_copy.csv"
+                    os.system(f"cp {equation_file_bkup} {equation_file_copy}")
+                    equations = pd.read_csv(equation_file_copy)
+                    # Keep only the Pareto front: sorted by complexity, each more complex
+                    # expression must have strictly lower loss. Otherwise remove those rows.
+                    # TODO: Not sure why this occurs; could be the result of a late copy?
+                    equations.sort_values("Complexity", ascending=True, inplace=True)
+                    equations.reset_index(inplace=True)
+                    bad_idx = []
+                    min_loss = None
+                    for i in equations.index:
+                        if min_loss is None or equations.loc[i, "Loss"] < min_loss:
+                            min_loss = float(equations.loc[i, "Loss"])
+                        else:
+                            bad_idx.append(i)
+                    equations.drop(index=bad_idx, inplace=True)
+                    while (
+                        last_yield_time is not None
+                        and time.time() - last_yield_time < plot_update_delay
+                    ):
+                        time.sleep(0.1)
+                    yield equations[["Complexity", "Loss", "Equation"]]
+                    last_yield_time = time.time()
+                except pd.errors.EmptyDataError:
+                    pass
+        process.join()
+def pysr_fit(
+    *,
+    X,
+    y,
+    **pysr_kwargs,
+):
+    import pysr
+    model = pysr.PySRRegressor(
+        progress=False,
+        timeout_in_seconds=1000,
+        **pysr_kwargs,
+    )
+    model.fit(X, y)