Spaces:

MilesCranmer
/

PySR

Running

deepsource-autofix[bot] committed on Jun 7, 2021

Commit

7d4300a

unverified ·

1 Parent(s): 2b07f83

Format code with black

This commit fixes the style issues introduced in 2b07f83 according to the output
from black.

Details: https://deepsource.io/gh/MilesCranmer/PySR/transform/e437d614-031f-4537-a160-7682b887fb55/

Files changed (10) hide show

benchmarks/hyperparamopt.py +72 -65
example.py +15 -8
pysr/export_jax.py +8 -4
pysr/export_torch.py +32 -16
pysr/feynman_problems.py +68 -35
pysr/sr.py +508 -326
setup.py +3 -9
test/test.py +75 -54
test/test_jax.py +25 -14
test/test_torch.py +26 -15

benchmarks/hyperparamopt.py CHANGED Viewed

@@ -10,6 +10,7 @@ import time
 import contextlib
 import numpy as np
 @contextlib.contextmanager
 def temp_seed(seed):
     state = np.random.get_state()
@@ -20,11 +21,12 @@ def temp_seed(seed):
         np.random.set_state(state)
-#Change the following code to your file
 ################################################################################
-TRIALS_FOLDER = 'trials'
 NUMBER_TRIALS_PER_RUN = 1
 def run_trial(args):
     """Evaluate the model loss using the hyperparams in args
@@ -34,29 +36,29 @@ def run_trial(args):
     """
     print("Running on", args)
-    args['niterations'] = 100
-    args['npop'] = 100
-    args['ncyclesperiteration'] = 1000
-    args['topn'] = 10
-    args['parsimony'] = 0.0
-    args['useFrequency'] = True
-    args['annealing'] = True
-    if args['npop'] < 20 or args['ncyclesperiteration'] < 3:
         print("Bad parameters")
-        return {'status': 'ok', 'loss': np.inf}
-    args['weightDoNothing'] = 1.0
     ntrials = 3
     with temp_seed(0):
-        X = np.random.randn(100, 10)*3
     eval_str = [
-    "np.sign(X[:, 2])*np.abs(X[:, 2])**2.5 + 5*np.cos(X[:, 3]) - 5",
-    "np.exp(X[:, 0]/2) + 12.0 + np.log(np.abs(X[:, 0])*10 + 1)",
-    "(np.exp(X[:, 3]) + 3)/(np.abs(X[:, 1]) + np.cos(X[:, 0]) + 1.1)",
-    "X[:, 0] * np.sin(2*np.pi * (X[:, 1] * X[:, 2] - X[:, 3] / X[:, 4])) + 3.0"
     ]
     print(f"Starting", str(args))
@@ -67,51 +69,50 @@ def run_trial(args):
             for j in range(ntrials):
                 print(f"Starting trial {j}")
                 y = eval(eval_str[i])
-                trial = pysr.pysr(X, y,
                     procs=4,
                     populations=20,
                     binary_operators=["plus", "mult", "pow", "div"],
                     unary_operators=["cos", "exp", "sin", "logm", "abs"],
                     maxsize=25,
-                    constraints={'pow': (-1, 1)},
-                    **args)
-                if len(trial) == 0: raise ValueError
                 trials.append(
-                        np.min(trial['MSE'])**0.5 / np.std(eval(eval_str[i-1]))
                 )
                 print(f"Test {i} trial {j} with", str(args), f"got {trials[-1]}")
     except ValueError:
         print(f"Broken", str(args))
-        return {
-            'status': 'ok', # or 'fail' if nan loss
-            'loss': np.inf
-        }
     loss = np.average(trials)
     print(f"Finished with {loss}", str(args))
-    return {
-        'status': 'ok', # or 'fail' if nan loss
-        'loss': loss
-    }
 space = {
-    'alpha': hp.lognormal('alpha', np.log(10.0), 1.0),
-    'fractionReplacedHof': hp.lognormal('fractionReplacedHof', np.log(0.1), 1.0),
-    'fractionReplaced': hp.lognormal('fractionReplaced', np.log(0.1), 1.0),
-    'perturbationFactor': hp.lognormal('perturbationFactor', np.log(1.0), 1.0),
-    'weightMutateConstant': hp.lognormal('weightMutateConstant', np.log(4.0), 1.0),
-    'weightMutateOperator': hp.lognormal('weightMutateOperator', np.log(0.5), 1.0),
-    'weightAddNode': hp.lognormal('weightAddNode', np.log(0.5), 1.0),
-    'weightInsertNode': hp.lognormal('weightInsertNode', np.log(0.5), 1.0),
-    'weightDeleteNode': hp.lognormal('weightDeleteNode', np.log(0.5), 1.0),
-    'weightSimplify': hp.lognormal('weightSimplify', np.log(0.05), 1.0),
-    'weightRandomize': hp.lognormal('weightRandomize', np.log(0.25), 1.0),
 }
 ################################################################################
 def merge_trials(trials1, trials2_slice):
     """Merge two hyperopt trials objects
@@ -123,24 +124,23 @@ def merge_trials(trials1, trials2_slice):
     """
     max_tid = 0
     if len(trials1.trials) > 0:
-        max_tid = max([trial['tid'] for trial in trials1.trials])
     for trial in trials2_slice:
-        tid = trial['tid'] + max_tid + 1
         hyperopt_trial = Trials().new_trial_docs(
-                tids=[None],
-                specs=[None],
-                results=[None],
-                miscs=[None])
         hyperopt_trial[0] = trial
-        hyperopt_trial[0]['tid'] = tid
-        hyperopt_trial[0]['misc']['tid'] = tid
-        for key in hyperopt_trial[0]['misc']['idxs'].keys():
-            hyperopt_trial[0]['misc']['idxs'][key] = [tid]
-        trials1.insert_trial_docs(hyperopt_trial)
         trials1.refresh()
     return trials1
 loaded_fnames = []
 trials = None
 # Run new hyperparameter trials until killed
@@ -149,15 +149,16 @@ while True:
     # Load up all runs:
     import glob
-    path = TRIALS_FOLDER + '/*.pkl'
     for fname in glob.glob(path):
         if fname in loaded_fnames:
             continue
-        trials_obj = pkl.load(open(fname, 'rb'))
-        n_trials = trials_obj['n']
-        trials_obj = trials_obj['trials']
-        if len(loaded_fnames) == 0:
             trials = trials_obj
         else:
             print("Merging trials")
@@ -171,23 +172,29 @@ while True:
     n = NUMBER_TRIALS_PER_RUN
     try:
-        best = fmin(run_trial,
             space=space,
             algo=tpe.suggest,
             max_evals=n + len(trials.trials),
             trials=trials,
             verbose=1,
-            rstate=np.random.RandomState(np.random.randint(1,10**6))
-            )
     except hyperopt.exceptions.AllTrialsFailed:
         continue
-    print('current best', best)
     hyperopt_trial = Trials()
     # Merge with empty trials dataset:
     save_trials = merge_trials(hyperopt_trial, trials.trials[-n:])
-    new_fname = TRIALS_FOLDER + '/' + str(np.random.randint(0, sys.maxsize)) + str(time.time()) + '.pkl'
-    pkl.dump({'trials': save_trials, 'n': n}, open(new_fname, 'wb'))
     loaded_fnames.append(new_fname)

 import contextlib
 import numpy as np
 @contextlib.contextmanager
 def temp_seed(seed):
     state = np.random.get_state()
         np.random.set_state(state)
+# Change the following code to your file
 ################################################################################
+TRIALS_FOLDER = "trials"
 NUMBER_TRIALS_PER_RUN = 1
 def run_trial(args):
     """Evaluate the model loss using the hyperparams in args
     """
     print("Running on", args)
+    args["niterations"] = 100
+    args["npop"] = 100
+    args["ncyclesperiteration"] = 1000
+    args["topn"] = 10
+    args["parsimony"] = 0.0
+    args["useFrequency"] = True
+    args["annealing"] = True
+    if args["npop"] < 20 or args["ncyclesperiteration"] < 3:
         print("Bad parameters")
+        return {"status": "ok", "loss": np.inf}
+    args["weightDoNothing"] = 1.0
     ntrials = 3
     with temp_seed(0):
+        X = np.random.randn(100, 10) * 3
     eval_str = [
+        "np.sign(X[:, 2])*np.abs(X[:, 2])**2.5 + 5*np.cos(X[:, 3]) - 5",
+        "np.exp(X[:, 0]/2) + 12.0 + np.log(np.abs(X[:, 0])*10 + 1)",
+        "(np.exp(X[:, 3]) + 3)/(np.abs(X[:, 1]) + np.cos(X[:, 0]) + 1.1)",
+        "X[:, 0] * np.sin(2*np.pi * (X[:, 1] * X[:, 2] - X[:, 3] / X[:, 4])) + 3.0",
     ]
     print(f"Starting", str(args))
             for j in range(ntrials):
                 print(f"Starting trial {j}")
                 y = eval(eval_str[i])
+                trial = pysr.pysr(
+                    X,
+                    y,
                     procs=4,
                     populations=20,
                     binary_operators=["plus", "mult", "pow", "div"],
                     unary_operators=["cos", "exp", "sin", "logm", "abs"],
                     maxsize=25,
+                    constraints={"pow": (-1, 1)},
+                    **args,
+                )
+                if len(trial) == 0:
+                    raise ValueError
                 trials.append(
+                    np.min(trial["MSE"]) ** 0.5 / np.std(eval(eval_str[i - 1]))
                 )
                 print(f"Test {i} trial {j} with", str(args), f"got {trials[-1]}")
     except ValueError:
         print(f"Broken", str(args))
+        return {"status": "ok", "loss": np.inf}  # or 'fail' if nan loss
     loss = np.average(trials)
     print(f"Finished with {loss}", str(args))
+    return {"status": "ok", "loss": loss}  # or 'fail' if nan loss
 space = {
+    "alpha": hp.lognormal("alpha", np.log(10.0), 1.0),
+    "fractionReplacedHof": hp.lognormal("fractionReplacedHof", np.log(0.1), 1.0),
+    "fractionReplaced": hp.lognormal("fractionReplaced", np.log(0.1), 1.0),
+    "perturbationFactor": hp.lognormal("perturbationFactor", np.log(1.0), 1.0),
+    "weightMutateConstant": hp.lognormal("weightMutateConstant", np.log(4.0), 1.0),
+    "weightMutateOperator": hp.lognormal("weightMutateOperator", np.log(0.5), 1.0),
+    "weightAddNode": hp.lognormal("weightAddNode", np.log(0.5), 1.0),
+    "weightInsertNode": hp.lognormal("weightInsertNode", np.log(0.5), 1.0),
+    "weightDeleteNode": hp.lognormal("weightDeleteNode", np.log(0.5), 1.0),
+    "weightSimplify": hp.lognormal("weightSimplify", np.log(0.05), 1.0),
+    "weightRandomize": hp.lognormal("weightRandomize", np.log(0.25), 1.0),
 }
 ################################################################################
 def merge_trials(trials1, trials2_slice):
     """Merge two hyperopt trials objects
     """
     max_tid = 0
     if len(trials1.trials) > 0:
+        max_tid = max([trial["tid"] for trial in trials1.trials])
     for trial in trials2_slice:
+        tid = trial["tid"] + max_tid + 1
         hyperopt_trial = Trials().new_trial_docs(
+            tids=[None], specs=[None], results=[None], miscs=[None]
+        )
         hyperopt_trial[0] = trial
+        hyperopt_trial[0]["tid"] = tid
+        hyperopt_trial[0]["misc"]["tid"] = tid
+        for key in hyperopt_trial[0]["misc"]["idxs"].keys():
+            hyperopt_trial[0]["misc"]["idxs"][key] = [tid]
+        trials1.insert_trial_docs(hyperopt_trial)
         trials1.refresh()
     return trials1
 loaded_fnames = []
 trials = None
 # Run new hyperparameter trials until killed
     # Load up all runs:
     import glob
+    path = TRIALS_FOLDER + "/*.pkl"
     for fname in glob.glob(path):
         if fname in loaded_fnames:
             continue
+        trials_obj = pkl.load(open(fname, "rb"))
+        n_trials = trials_obj["n"]
+        trials_obj = trials_obj["trials"]
+        if len(loaded_fnames) == 0:
             trials = trials_obj
         else:
             print("Merging trials")
     n = NUMBER_TRIALS_PER_RUN
     try:
+        best = fmin(
+            run_trial,
             space=space,
             algo=tpe.suggest,
             max_evals=n + len(trials.trials),
             trials=trials,
             verbose=1,
+            rstate=np.random.RandomState(np.random.randint(1, 10 ** 6)),
+        )
     except hyperopt.exceptions.AllTrialsFailed:
         continue
+    print("current best", best)
     hyperopt_trial = Trials()
     # Merge with empty trials dataset:
     save_trials = merge_trials(hyperopt_trial, trials.trials[-n:])
+    new_fname = (
+        TRIALS_FOLDER
+        + "/"
+        + str(np.random.randint(0, sys.maxsize))
+        + str(time.time())
+        + ".pkl"
+    )
+    pkl.dump({"trials": save_trials, "n": n}, open(new_fname, "wb"))
     loaded_fnames.append(new_fname)

example.py CHANGED Viewed

@@ -2,18 +2,25 @@ import numpy as np
 from pysr import pysr, best
 # Dataset
-X = 2*np.random.randn(100, 5)
-y = 2*np.cos(X[:, 3]) + X[:, 0]**2 - 2
 # Learn equations
-equations = pysr(X, y, niterations=5,
     binary_operators=["plus", "mult"],
     unary_operators=[
-      "cos", "exp", "sin", #Pre-defined library of operators (see https://pysr.readthedocs.io/en/latest/docs/operators/)
-      "inv(x) = 1/x"],
-    loss='loss(x, y) = abs(x - y)', # Custom loss function
-    julia_project="../SymbolicRegression.jl") # Define your own operator! (Julia syntax)
-...# (you can use ctl-c to exit early)
 print(best(equations))

 from pysr import pysr, best
 # Dataset
+X = 2 * np.random.randn(100, 5)
+y = 2 * np.cos(X[:, 3]) + X[:, 0] ** 2 - 2
 # Learn equations
+equations = pysr(
+    X,
+    y,
+    niterations=5,
     binary_operators=["plus", "mult"],
     unary_operators=[
+        "cos",
+        "exp",
+        "sin",  # Pre-defined library of operators (see https://pysr.readthedocs.io/en/latest/docs/operators/)
+        "inv(x) = 1/x",
+    ],
+    loss="loss(x, y) = abs(x - y)",  # Custom loss function
+    julia_project="../SymbolicRegression.jl",
+)  # Define your own operator! (Julia syntax)
+...  # (you can use ctl-c to exit early)
 print(best(equations))

pysr/export_jax.py CHANGED Viewed

@@ -58,14 +58,16 @@ def sympy2jaxtext(expr, parameters, symbols_in):
     elif issubclass(expr.func, sympy.Integer):
         return f"{int(expr)}"
     elif issubclass(expr.func, sympy.Symbol):
-        return f"X[:, {[i for i in range(len(symbols_in)) if symbols_in[i] == expr][0]}]"
     else:
         _func = _jnp_func_lookup[expr.func]
         args = [sympy2jaxtext(arg, parameters, symbols_in) for arg in expr.args]
         if _func == MUL:
-            return ' * '.join(['(' + arg + ')' for arg in args])
         elif _func == ADD:
-            return ' + '.join(['(' + arg + ')' for arg in args])
         else:
             return f'{_func}({", ".join(args)})'
@@ -75,6 +77,7 @@ jax = None
 jnp = None
 jsp = None
 def _initialize_jax():
     global jax_initialized
     global jax
@@ -85,6 +88,7 @@ def _initialize_jax():
         import jax as _jax
         from jax import numpy as _jnp
         from jax.scipy import special as _jsp
         jax = _jax
         jnp = _jnp
         jsp = _jsp
@@ -169,7 +173,7 @@ def sympy2jax(expression, symbols_in, selection=None):
     parameters = []
     functional_form_text = sympy2jaxtext(expression, parameters, symbols_in)
-    hash_string = 'A_' + str(abs(hash(str(expression) + str(symbols_in))))
     text = f"def {hash_string}(X, parameters):\n"
     if selection is not None:
         # Impose the feature selection:

     elif issubclass(expr.func, sympy.Integer):
         return f"{int(expr)}"
     elif issubclass(expr.func, sympy.Symbol):
+        return (
+            f"X[:, {[i for i in range(len(symbols_in)) if symbols_in[i] == expr][0]}]"
+        )
     else:
         _func = _jnp_func_lookup[expr.func]
         args = [sympy2jaxtext(arg, parameters, symbols_in) for arg in expr.args]
         if _func == MUL:
+            return " * ".join(["(" + arg + ")" for arg in args])
         elif _func == ADD:
+            return " + ".join(["(" + arg + ")" for arg in args])
         else:
             return f'{_func}({", ".join(args)})'
 jnp = None
 jsp = None
 def _initialize_jax():
     global jax_initialized
     global jax
         import jax as _jax
         from jax import numpy as _jnp
         from jax.scipy import special as _jsp
         jax = _jax
         jnp = _jnp
         jsp = _jsp
     parameters = []
     functional_form_text = sympy2jaxtext(expression, parameters, symbols_in)
+    hash_string = "A_" + str(abs(hash(str(expression) + str(symbols_in))))
     text = f"def {hash_string}(X, parameters):\n"
     if selection is not None:
         # Impose the feature selection:

pysr/export_torch.py CHANGED Viewed

@@ -7,17 +7,21 @@ import collections as co
 import functools as ft
 import sympy
 def _reduce(fn):
     def fn_(*args):
         return ft.reduce(fn, args)
     return fn_
 torch_initialized = False
 torch = None
 _global_func_lookup = None
 _Node = None
 SingleSymPyModule = None
 def _initialize_torch():
     global torch_initialized
     global torch
@@ -29,6 +33,7 @@ def _initialize_torch():
     # but still allow this module to be loaded in __init__
     if not torch_initialized:
         import torch as _torch
         torch = _torch
         _global_func_lookup = {
@@ -85,6 +90,7 @@ def _initialize_torch():
         class _Node(torch.nn.Module):
             """SympyTorch code from https://github.com/patrick-kidger/sympytorch"""
             def __init__(self, *, expr, _memodict, _func_lookup, **kwargs):
                 super().__init__(**kwargs)
@@ -95,9 +101,13 @@ def _initialize_torch():
                     self._torch_func = lambda: self._value
                     self._args = ()
                 elif issubclass(expr.func, sympy.UnevaluatedExpr):
-                    if len(expr.args) != 1 or not issubclass(expr.args[0].func, sympy.Float):
-                        raise ValueError("UnevaluatedExpr should only be used to wrap floats.")
-                    self.register_buffer('_value', torch.tensor(float(expr.args[0])))
                     self._torch_func = lambda: self._value
                     self._args = ()
                 elif issubclass(expr.func, sympy.Integer):
@@ -117,7 +127,12 @@ def _initialize_torch():
                         try:
                             arg_ = _memodict[arg]
                         except KeyError:
-                            arg_ = type(self)(expr=arg, _memodict=_memodict, _func_lookup=_func_lookup, **kwargs)
                             _memodict[arg] = arg_
                         args.append(arg_)
                     self._args = torch.nn.ModuleList(args)
@@ -133,19 +148,22 @@ def _initialize_torch():
                     args.append(arg_)
                 return self._torch_func(*args)
         class SingleSymPyModule(torch.nn.Module):
             """SympyTorch code from https://github.com/patrick-kidger/sympytorch"""
-            def __init__(self, expression, symbols_in,
-                         selection=None, extra_funcs=None, **kwargs):
                 super().__init__(**kwargs)
                 if extra_funcs is None:
                     extra_funcs = {}
                 _func_lookup = co.ChainMap(_global_func_lookup, extra_funcs)
                 _memodict = {}
-                self._node = _Node(expr=expression, _memodict=_memodict, _func_lookup=_func_lookup)
                 self._expression_string = str(expression)
                 self._selection = selection
                 self.symbols_in = [str(symbol) for symbol in symbols_in]
@@ -156,13 +174,11 @@ def _initialize_torch():
             def forward(self, X):
                 if self._selection is not None:
                     X = X[:, self._selection]
-                symbols = {symbol: X[:, i]
-                           for i, symbol in enumerate(self.symbols_in)}
                 return self._node(symbols)
-def sympy2torch(expression, symbols_in,
-                selection=None, extra_torch_mappings=None):
     """Returns a module for a given sympy expression with trainable parameters;
     This function will assume the input to the module is a matrix X, where
@@ -172,6 +188,6 @@ def sympy2torch(expression, symbols_in,
     _initialize_torch()
-    return SingleSymPyModule(expression, symbols_in,
-                             selection=selection,
-                             extra_funcs=extra_torch_mappings)

 import functools as ft
 import sympy
 def _reduce(fn):
     def fn_(*args):
         return ft.reduce(fn, args)
     return fn_
 torch_initialized = False
 torch = None
 _global_func_lookup = None
 _Node = None
 SingleSymPyModule = None
 def _initialize_torch():
     global torch_initialized
     global torch
     # but still allow this module to be loaded in __init__
     if not torch_initialized:
         import torch as _torch
         torch = _torch
         _global_func_lookup = {
         class _Node(torch.nn.Module):
             """SympyTorch code from https://github.com/patrick-kidger/sympytorch"""
             def __init__(self, *, expr, _memodict, _func_lookup, **kwargs):
                 super().__init__(**kwargs)
                     self._torch_func = lambda: self._value
                     self._args = ()
                 elif issubclass(expr.func, sympy.UnevaluatedExpr):
+                    if len(expr.args) != 1 or not issubclass(
+                        expr.args[0].func, sympy.Float
+                    ):
+                        raise ValueError(
+                            "UnevaluatedExpr should only be used to wrap floats."
+                        )
+                    self.register_buffer("_value", torch.tensor(float(expr.args[0])))
                     self._torch_func = lambda: self._value
                     self._args = ()
                 elif issubclass(expr.func, sympy.Integer):
                         try:
                             arg_ = _memodict[arg]
                         except KeyError:
+                            arg_ = type(self)(
+                                expr=arg,
+                                _memodict=_memodict,
+                                _func_lookup=_func_lookup,
+                                **kwargs,
+                            )
                             _memodict[arg] = arg_
                         args.append(arg_)
                     self._args = torch.nn.ModuleList(args)
                     args.append(arg_)
                 return self._torch_func(*args)
         class SingleSymPyModule(torch.nn.Module):
             """SympyTorch code from https://github.com/patrick-kidger/sympytorch"""
+            def __init__(
+                self, expression, symbols_in, selection=None, extra_funcs=None, **kwargs
+            ):
                 super().__init__(**kwargs)
                 if extra_funcs is None:
                     extra_funcs = {}
                 _func_lookup = co.ChainMap(_global_func_lookup, extra_funcs)
                 _memodict = {}
+                self._node = _Node(
+                    expr=expression, _memodict=_memodict, _func_lookup=_func_lookup
+                )
                 self._expression_string = str(expression)
                 self._selection = selection
                 self.symbols_in = [str(symbol) for symbol in symbols_in]
             def forward(self, X):
                 if self._selection is not None:
                     X = X[:, self._selection]
+                symbols = {symbol: X[:, i] for i, symbol in enumerate(self.symbols_in)}
                 return self._node(symbols)
+def sympy2torch(expression, symbols_in, selection=None, extra_torch_mappings=None):
     """Returns a module for a given sympy expression with trainable parameters;
     This function will assume the input to the module is a matrix X, where
     _initialize_torch()
+    return SingleSymPyModule(
+        expression, symbols_in, selection=selection, extra_funcs=extra_torch_mappings
+    )

pysr/feynman_problems.py CHANGED Viewed

@@ -7,6 +7,7 @@ from pathlib import Path
 PKG_DIR = Path(__file__).parents[1]
 FEYNMAN_DATASET = PKG_DIR / "datasets" / "FeynmanEquations.csv"
 class Problem:
     """
     Problem API to work with PySR.
@@ -15,6 +16,7 @@ class Problem:
     Should be able to call pysr(problem.X, problem.y, var_names=problem.var_names) and have it work
     """
     def __init__(self, X, y, form=None, variable_names=None):
         self.X = X
         self.y = y
@@ -27,34 +29,39 @@ class FeynmanProblem(Problem):
     Stores the data for the problems from the 100 Feynman Equations on Physics.
     This is the benchmark used in the AI Feynman Paper
     """
     def __init__(self, row, gen=False, dp=500):
         """
         row: a row read as a dict from the FeynmanEquations dataset provided in the datasets folder of the repo
         gen: If true the problem will have dp X and y values randomly generated else they will be None
         """
-        self.eq_id      = row['Filename']
-        self.n_vars     = int(row['# variables'])
-        super(FeynmanProblem, self).__init__(None, None, form=row['Formula'],
-                                             variable_names=[row[f'v{i + 1}_name'] for i in range(self.n_vars)])
-        self.low        = [float(row[f'v{i+1}_low'])   for i in range(self.n_vars)]
-        self.high       = [float(row[f'v{i+1}_high'])  for i in range(self.n_vars)]
-        self.dp         = dp
         if gen:
             self.X = np.random.uniform(0.01, 25, size=(self.dp, self.n_vars))
             d = {}
             for var in range(len(self.variable_names)):
                 d[self.variable_names[var]] = self.X[:, var]
-            d['exp'] = np.exp
-            d['sqrt'] = np.sqrt
-            d['pi'] = np.pi
-            d['cos'] = np.cos
-            d['sin'] = np.sin
-            d['tan'] = np.tan
-            d['tanh'] = np.tanh
-            d['ln']   = np.log
-            d['log'] = np.log # Quite sure the Feynman dataset has no base 10 logs
-            d['arcsin'] = np.arcsin
-            self.y = eval(self.form,d)
         return
     def __str__(self):
@@ -77,7 +84,8 @@ class FeynmanProblem(Problem):
             for i, row in enumerate(reader):
                 if ind > first:
                     break
-                if row['Filename'] == '': continue
                 try:
                     p = FeynmanProblem(row, gen=gen, dp=dp)
                     ret.append(p)
@@ -93,18 +101,34 @@ def run_on_problem(problem, verbosity=0, multiprocessing=True):
     Takes in a problem and returns a tuple: (equations, best predicted equation, actual equation)
     """
     from time import time
     starting = time()
-    equations = pysr(problem.X, problem.y, variable_names=problem.variable_names, verbosity=verbosity,)
-    timing = time()-starting
     others = {"time": timing, "problem": problem}
     if not multiprocessing:
-        others['equations'] = equations
     return str(best(equations)), problem.form, others
-def do_feynman_experiments_parallel(first=100, verbosity=0, dp=500, output_file_path="FeynmanExperiment.csv", data_dir=FEYNMAN_DATASET):
     import multiprocessing as mp
     from tqdm import tqdm
-    problems = FeynmanProblem.mk_problems(first=first, gen=True, dp=dp, data_dir=data_dir)
     ids = []
     predictions = []
     true_equations = []
@@ -117,22 +141,31 @@ def do_feynman_experiments_parallel(first=100, verbosity=0, dp=500, output_file_
             pbar.update()
     for res in results:
         prediction, true_equation, others = res
-        problem = others['problem']
         ids.append(problem.eq_id)
         predictions.append(prediction)
         true_equations.append(true_equation)
-        time_takens.append(others['time'])
-    with open(output_file_path, 'a') as f:
-        writer = csv.writer(f, delimiter=',')
-        writer.writerow(['ID', 'Predicted', 'True', 'Time'])
         for i in range(len(ids)):
             writer.writerow([ids[i], predictions[i], true_equations[i], time_takens[i]])
     return
-def do_feynman_experiments(first=100, verbosity=0, dp=500, output_file_path="FeynmanExperiment.csv", data_dir=FEYNMAN_DATASET):
     from tqdm import tqdm
-    problems = FeynmanProblem.mk_problems(first=first, gen=True, dp=dp, data_dir=data_dir)
     indx = range(len(problems))
     ids = []
     predictions = []
@@ -143,10 +176,10 @@ def do_feynman_experiments(first=100, verbosity=0, dp=500, output_file_path="Fey
         ids.append(problem.eq_id)
         predictions.append(prediction)
         true_equations.append(true_equation)
-        time_takens.append(others['time'])
-    with open(output_file_path, 'a') as f:
-        writer = csv.writer(f, delimiter=',')
-        writer.writerow(['ID', 'Predicted', 'True', 'Time'])
         for i in range(len(ids)):
             writer.writerow([ids[i], predictions[i], true_equations[i], time_takens[i]])
     return

 PKG_DIR = Path(__file__).parents[1]
 FEYNMAN_DATASET = PKG_DIR / "datasets" / "FeynmanEquations.csv"
 class Problem:
     """
     Problem API to work with PySR.
     Should be able to call pysr(problem.X, problem.y, var_names=problem.var_names) and have it work
     """
     def __init__(self, X, y, form=None, variable_names=None):
         self.X = X
         self.y = y
     Stores the data for the problems from the 100 Feynman Equations on Physics.
     This is the benchmark used in the AI Feynman Paper
     """
     def __init__(self, row, gen=False, dp=500):
         """
         row: a row read as a dict from the FeynmanEquations dataset provided in the datasets folder of the repo
         gen: If true the problem will have dp X and y values randomly generated else they will be None
         """
+        self.eq_id = row["Filename"]
+        self.n_vars = int(row["# variables"])
+        super(FeynmanProblem, self).__init__(
+            None,
+            None,
+            form=row["Formula"],
+            variable_names=[row[f"v{i + 1}_name"] for i in range(self.n_vars)],
+        )
+        self.low = [float(row[f"v{i+1}_low"]) for i in range(self.n_vars)]
+        self.high = [float(row[f"v{i+1}_high"]) for i in range(self.n_vars)]
+        self.dp = dp
         if gen:
             self.X = np.random.uniform(0.01, 25, size=(self.dp, self.n_vars))
             d = {}
             for var in range(len(self.variable_names)):
                 d[self.variable_names[var]] = self.X[:, var]
+            d["exp"] = np.exp
+            d["sqrt"] = np.sqrt
+            d["pi"] = np.pi
+            d["cos"] = np.cos
+            d["sin"] = np.sin
+            d["tan"] = np.tan
+            d["tanh"] = np.tanh
+            d["ln"] = np.log
+            d["log"] = np.log  # Quite sure the Feynman dataset has no base 10 logs
+            d["arcsin"] = np.arcsin
+            self.y = eval(self.form, d)
         return
     def __str__(self):
             for i, row in enumerate(reader):
                 if ind > first:
                     break
+                if row["Filename"] == "":
+                    continue
                 try:
                     p = FeynmanProblem(row, gen=gen, dp=dp)
                     ret.append(p)
     Takes in a problem and returns a tuple: (equations, best predicted equation, actual equation)
     """
     from time import time
     starting = time()
+    equations = pysr(
+        problem.X,
+        problem.y,
+        variable_names=problem.variable_names,
+        verbosity=verbosity,
+    )
+    timing = time() - starting
     others = {"time": timing, "problem": problem}
     if not multiprocessing:
+        others["equations"] = equations
     return str(best(equations)), problem.form, others
+def do_feynman_experiments_parallel(
+    first=100,
+    verbosity=0,
+    dp=500,
+    output_file_path="FeynmanExperiment.csv",
+    data_dir=FEYNMAN_DATASET,
+):
     import multiprocessing as mp
     from tqdm import tqdm
+    problems = FeynmanProblem.mk_problems(
+        first=first, gen=True, dp=dp, data_dir=data_dir
+    )
     ids = []
     predictions = []
     true_equations = []
             pbar.update()
     for res in results:
         prediction, true_equation, others = res
+        problem = others["problem"]
         ids.append(problem.eq_id)
         predictions.append(prediction)
         true_equations.append(true_equation)
+        time_takens.append(others["time"])
+    with open(output_file_path, "a") as f:
+        writer = csv.writer(f, delimiter=",")
+        writer.writerow(["ID", "Predicted", "True", "Time"])
         for i in range(len(ids)):
             writer.writerow([ids[i], predictions[i], true_equations[i], time_takens[i]])
     return
+def do_feynman_experiments(
+    first=100,
+    verbosity=0,
+    dp=500,
+    output_file_path="FeynmanExperiment.csv",
+    data_dir=FEYNMAN_DATASET,
+):
     from tqdm import tqdm
+    problems = FeynmanProblem.mk_problems(
+        first=first, gen=True, dp=dp, data_dir=data_dir
+    )
     indx = range(len(problems))
     ids = []
     predictions = []
         ids.append(problem.eq_id)
         predictions.append(prediction)
         true_equations.append(true_equation)
+        time_takens.append(others["time"])
+    with open(output_file_path, "a") as f:
+        writer = csv.writer(f, delimiter=",")
+        writer.writerow(["ID", "Predicted", "True", "Time"])
         for i in range(len(ids)):
             writer.writerow([ids[i], predictions[i], true_equations[i], time_takens[i]])
     return

pysr/sr.py CHANGED Viewed

@@ -15,7 +15,7 @@ from datetime import datetime
 import warnings
 global_state = dict(
-    equation_file='hall_of_fame.csv',
     n_features=None,
     variable_names=[],
     extra_sympy_mappings={},
@@ -25,108 +25,112 @@ global_state = dict(
     output_torch_format=False,
     multioutput=False,
     nout=1,
-    selection=None
 )
 sympy_mappings = {
-    'div':  lambda x, y : x/y,
-    'mult': lambda x, y : x*y,
-    'sqrt_abs':lambda x    : sympy.sqrt(abs(x)),
-    'square':lambda x   : x**2,
-    'cube': lambda x    : x**3,
-    'plus': lambda x, y : x + y,
-    'sub':  lambda x, y : x - y,
-    'neg':  lambda x    : -x,
-    'pow':  lambda x, y : abs(x)**y,
-    'cos':  lambda x    : sympy.cos(x),
-    'sin':  lambda x    : sympy.sin(x),
-    'tan':  lambda x    : sympy.tan(x),
-    'cosh': lambda x    : sympy.cosh(x),
-    'sinh': lambda x    : sympy.sinh(x),
-    'tanh': lambda x    : sympy.tanh(x),
-    'exp':  lambda x    : sympy.exp(x),
-    'acos': lambda x    : sympy.acos(x),
-    'asin': lambda x    : sympy.asin(x),
-    'atan': lambda x    : sympy.atan(x),
-    'acosh':lambda x    : sympy.acosh(abs(x) + 1),
-    'acosh_abs':lambda x : sympy.acosh(abs(x) + 1),
-    'asinh':lambda x    : sympy.asinh(x),
-    'atanh':lambda x    : sympy.atanh(sympy.Mod(x+1, 2)-1),
-    'atanh_clip':lambda x : sympy.atanh(sympy.Mod(x+1, 2)-1),
-    'abs':  lambda x    : abs(x),
-    'mod':  lambda x, y : sympy.Mod(x, y),
-    'erf':  lambda x    : sympy.erf(x),
-    'erfc': lambda x    : sympy.erfc(x),
-    'log_abs': lambda x : sympy.log(abs(x)),
-    'log10_abs':lambda x : sympy.log(abs(x), 10),
-    'log2_abs': lambda x : sympy.log(abs(x), 2),
-    'log1p_abs': lambda x : sympy.log(abs(x) + 1),
-    'floor': lambda x   : sympy.floor(x),
-    'ceil': lambda x    : sympy.ceil(x),
-    'sign': lambda x    : sympy.sign(x),
-    'gamma': lambda x   : sympy.gamma(x),
 }
-def pysr(X, y, weights=None,
-         binary_operators=None,
-         unary_operators=None,
-         procs=4,
-         loss='L2DistLoss()',
-         populations=20,
-         niterations=100,
-         ncyclesperiteration=300,
-         alpha=0.1,
-         annealing=False,
-         fractionReplaced=0.10,
-         fractionReplacedHof=0.10,
-         npop=1000,
-         parsimony=1e-4,
-         migration=True,
-         hofMigration=True,
-         shouldOptimizeConstants=True,
-         topn=10,
-         weightAddNode=1,
-         weightInsertNode=3,
-         weightDeleteNode=3,
-         weightDoNothing=1,
-         weightMutateConstant=10,
-         weightMutateOperator=1,
-         weightRandomize=1,
-         weightSimplify=0.01,
-         perturbationFactor=1.0,
-         timeout=None,
-         extra_sympy_mappings=None,
-         extra_torch_mappings=None,
-         extra_jax_mappings=None,
-         equation_file=None,
-         verbosity=1e9,
-         progress=None,
-         maxsize=20,
-         fast_cycle=False,
-         maxdepth=None,
-         variable_names=None,
-         batching=False,
-         batchSize=50,
-         select_k_features=None,
-         warmupMaxsizeBy=0.0,
-         constraints=None,
-         useFrequency=True,
-         tempdir=None,
-         delete_tempfiles=True,
-         julia_optimization=3,
-         julia_project=None,
-         user_input=True,
-         update=True,
-         temp_equation_file=False,
-         output_jax_format=False,
-         output_torch_format=False,
-         optimizer_algorithm="BFGS",
-         optimizer_nrestarts=3,
-         optimize_probability=1.0,
-         optimizer_iterations=10,
-         tournament_selection_n=10,
-         tournament_selection_p=1.0
-         ):
     """Run symbolic regression to fit f(X[i, :]) ~ y[i] for all i.
     Note: most default parameters have been tuned over several example
     equations, but you should adjust `niterations`,
@@ -244,7 +248,7 @@ def pysr(X, y, weights=None,
     :type: pd.DataFrame/list
     """
     if binary_operators is None:
-        binary_operators = '+ * - /'.split(' ')
     if unary_operators is None:
         unary_operators = []
     if extra_sympy_mappings is None:
@@ -255,16 +259,18 @@ def pysr(X, y, weights=None,
         constraints = {}
     if progress is not None:
-        if progress and ('buffer' not in sys.stdout.__dir__()):
-            warnings.warn("Note: it looks like you are running in Jupyter. The progress bar will be turned off.")
             progress = False
     else:
-        if 'buffer' in sys.stdout.__dir__():
             progress = True
         else:
             progress = False
-    assert optimizer_algorithm in ['NelderMead', 'BFGS']
     assert tournament_selection_n < npop
     if isinstance(X, pd.DataFrame):
@@ -275,25 +281,34 @@ def pysr(X, y, weights=None,
         X = X[:, None]
     if len(variable_names) == 0:
-        variable_names = [f'x{i}' for i in range(X.shape[1])]
-    use_custom_variable_names = (len(variable_names) != 0)
-    _check_assertions(X, binary_operators, unary_operators,
-                     use_custom_variable_names, variable_names, weights, y)
     _check_for_julia_installation()
     if len(X) > 10000 and not batching:
-        warnings.warn("Note: you are running with more than 10,000 datapoints. You should consider turning on batching (https://pysr.readthedocs.io/en/latest/docs/options/#batching). You should also reconsider if you need that many datapoints. Unless you have a large amount of noise (in which case you should smooth your dataset first), generally < 10,000 datapoints is enough to find a functional form with symbolic regression. More datapoints will lower the search speed.")
     if maxsize > 40:
-        warnings.warn("Note: Using a large maxsize for the equation search will be exponentially slower and use significant memory. You should consider turning `useFrequency` to False, and perhaps use `warmupMaxsizeBy`.")
     X, variable_names, selection = _handle_feature_selection(
-            X, select_k_features,
-            use_custom_variable_names, variable_names, y
-        )
     if maxdepth is None:
         maxdepth = maxsize
@@ -312,81 +327,102 @@ def pysr(X, y, weights=None,
     else:
         raise NotImplementedError("y shape not supported!")
-    kwargs = dict(X=X, y=y, weights=weights,
-                 alpha=alpha, annealing=annealing, batchSize=batchSize,
-                 batching=batching, binary_operators=binary_operators,
-                 fast_cycle=fast_cycle,
-                 fractionReplaced=fractionReplaced,
-                 ncyclesperiteration=ncyclesperiteration,
-                 niterations=niterations, npop=npop, topn=topn,
-                 verbosity=verbosity, progress=progress, update=update,
-                 julia_optimization=julia_optimization, timeout=timeout,
-                 fractionReplacedHof=fractionReplacedHof,
-                 hofMigration=hofMigration, maxdepth=maxdepth,
-                 maxsize=maxsize, migration=migration,
-                 optimizer_algorithm=optimizer_algorithm,
-                 optimizer_nrestarts=optimizer_nrestarts,
-                 optimize_probability=optimize_probability,
-                 optimizer_iterations=optimizer_iterations,
-                 parsimony=parsimony, perturbationFactor=perturbationFactor,
-                 populations=populations, procs=procs,
-                 shouldOptimizeConstants=shouldOptimizeConstants,
-                 unary_operators=unary_operators, useFrequency=useFrequency,
-                 use_custom_variable_names=use_custom_variable_names,
-                 variable_names=variable_names, warmupMaxsizeBy=warmupMaxsizeBy,
-                 weightAddNode=weightAddNode,
-                 weightDeleteNode=weightDeleteNode,
-                 weightDoNothing=weightDoNothing,
-                 weightInsertNode=weightInsertNode,
-                 weightMutateConstant=weightMutateConstant,
-                 weightMutateOperator=weightMutateOperator,
-                 weightRandomize=weightRandomize,
-                 weightSimplify=weightSimplify,
-                 constraints=constraints,
-                 extra_sympy_mappings=extra_sympy_mappings,
-                 extra_jax_mappings=extra_jax_mappings,
-                 extra_torch_mappings=extra_torch_mappings,
-                 julia_project=julia_project, loss=loss,
-                 output_jax_format=output_jax_format,
-                 output_torch_format=output_torch_format,
-                 selection=selection,
-                 multioutput=multioutput, nout=nout,
-                 tournament_selection_n=tournament_selection_n,
-                 tournament_selection_p=tournament_selection_p)
     kwargs = {**_set_paths(tempdir), **kwargs}
     if temp_equation_file:
-        equation_file = kwargs['tmpdir'] / f'hall_of_fame.csv'
     elif equation_file is None:
         date_time = datetime.now().strftime("%Y-%m-%d_%H%M%S.%f")[:-3]
-        equation_file = 'hall_of_fame_' + date_time + '.csv'
     kwargs = {**dict(equation_file=equation_file), **kwargs}
-    pkg_directory = kwargs['pkg_directory']
     manifest_file = None
-    if kwargs['julia_project'] is not None:
-        manifest_filepath = Path(kwargs['julia_project']) / 'Manifest.toml'
     else:
-        manifest_filepath = pkg_directory / 'Manifest.toml'
-    kwargs['need_install'] = False
     if not (manifest_filepath).is_file():
-        kwargs['need_install'] = (not user_input) or _yesno("I will install Julia packages using PySR's Project.toml file. OK?")
-        if kwargs['need_install']:
             print("OK. I will install at launch.")
             assert update
-    kwargs['def_hyperparams'] = _create_inline_operators(**kwargs)
     _handle_constraints(**kwargs)
-    kwargs['constraints_str'] = _make_constraints_str(**kwargs)
-    kwargs['def_hyperparams'] = _make_hyperparams_julia_str(**kwargs)
-    kwargs['def_datasets'] = _make_datasets_julia_str(**kwargs)
     _create_julia_files(**kwargs)
     _final_pysr_process(**kwargs)
@@ -395,7 +431,7 @@ def pysr(X, y, weights=None,
     equations = get_hof(**kwargs)
     if delete_tempfiles:
-        shutil.rmtree(kwargs['tmpdir'])
     return equations
@@ -403,7 +439,7 @@ def pysr(X, y, weights=None,
 def _set_globals(X, **kwargs):
     global global_state
-    global_state['n_features'] = X.shape[1]
     for key, value in kwargs.items():
         if key in global_state:
             global_state[key] = value
@@ -411,34 +447,37 @@ def _set_globals(X, **kwargs):
 def _final_pysr_process(julia_optimization, runfile_filename, timeout, **kwargs):
     command = [
-        f'julia', f'-O{julia_optimization:d}',
         str(runfile_filename),
     ]
     if timeout is not None:
-        command = [f'timeout', f'{timeout}'] + command
     _cmd_runner(command, **kwargs)
 def _cmd_runner(command, progress, **kwargs):
-    if kwargs['verbosity'] > 0:
-        print("Running on", ' '.join(command))
     process = subprocess.Popen(command, stdout=subprocess.PIPE, bufsize=-1)
     try:
         while True:
             line = process.stdout.readline()
-            if not line: break
-            decoded_line = line.decode('utf-8')
             if progress:
-                decoded_line = (decoded_line
-                                    .replace('\\033[K',  '\033[K')
-                                    .replace('\\033[1A', '\033[1A')
-                                    .replace('\\033[1B', '\033[1B')
-                                    .replace('\\r',      '\r')
-                                    .encode(sys.stdout.encoding, errors='replace')
-                                    )
                 sys.stdout.buffer.write(decoded_line)
                 sys.stdout.flush()
             else:
-                print(decoded_line, end='')
         process.stdout.close()
         process.wait()
@@ -446,62 +485,94 @@ def _cmd_runner(command, progress, **kwargs):
         print("Killing process... will return when done.")
         process.kill()
-def _create_julia_files(dataset_filename, def_datasets,  hyperparam_filename, def_hyperparams,
-                        fractionReplaced, ncyclesperiteration, niterations, npop,
-                        runfile_filename, topn, verbosity, julia_project, procs, weights,
-                        X, variable_names, pkg_directory, need_install, update, **kwargs):
-    with open(hyperparam_filename, 'w') as f:
         print(def_hyperparams, file=f)
-    with open(dataset_filename, 'w') as f:
         print(def_datasets, file=f)
-    with open(runfile_filename, 'w') as f:
         if julia_project is None:
             julia_project = pkg_directory
         else:
             julia_project = Path(julia_project)
-        print(f'import Pkg', file=f)
         print(f'Pkg.activate("{_escape_filename(julia_project)}")', file=f)
         if need_install:
-            print(f'Pkg.instantiate()', file=f)
-            print(f'Pkg.update()', file=f)
-            print(f'Pkg.precompile()', file=f)
         elif update:
-            print(f'Pkg.update()', file=f)
-        print(f'using SymbolicRegression', file=f)
         print(f'include("{_escape_filename(hyperparam_filename)}")', file=f)
         print(f'include("{_escape_filename(dataset_filename)}")', file=f)
         if len(variable_names) == 0:
             varMap = "[" + ",".join([f'"x{i}"' for i in range(X.shape[1])]) + "]"
         else:
-            varMap = "[" + ",".join(['"' + vname + '"' for vname in variable_names]) + "]"
         if weights is not None:
-            print(f'EquationSearch(X, y, weights=weights, niterations={niterations:d}, varMap={varMap}, options=options, numprocs={procs})', file=f)
         else:
-            print(f'EquationSearch(X, y, niterations={niterations:d}, varMap={varMap}, options=options, numprocs={procs})', file=f)
-def _make_datasets_julia_str(X, X_filename, weights, weights_filename, y, y_filename,
-                            multioutput, **kwargs):
     def_datasets = """using DelimitedFiles"""
-    np.savetxt(X_filename, X.astype(np.float32), delimiter=',')
     if multioutput:
-        np.savetxt(y_filename, y.astype(np.float32), delimiter=',')
     else:
-        np.savetxt(y_filename, y.reshape(-1, 1).astype(np.float32), delimiter=',')
     if weights is not None:
         if multioutput:
-            np.savetxt(weights_filename, weights.astype(np.float32), delimiter=',')
         else:
-            np.savetxt(weights_filename, weights.reshape(-1, 1).astype(np.float32), delimiter=',')
     def_datasets += f"""
 X = copy(transpose(readdlm("{_escape_filename(X_filename)}", ',', Float32, '\\n')))"""
     if multioutput:
-        def_datasets+= f"""
 y = copy(transpose(readdlm("{_escape_filename(y_filename)}", ',', Float32, '\\n')))"""
     else:
-        def_datasets+= f"""
 y = readdlm("{_escape_filename(y_filename)}", ',', Float32, '\\n')[:, 1]"""
     if weights is not None:
@@ -513,30 +584,69 @@ weights = copy(transpose(readdlm("{_escape_filename(weights_filename)}", ',', Fl
 weights = readdlm("{_escape_filename(weights_filename)}", ',', Float32, '\\n')[:, 1]"""
     return def_datasets
-def _make_hyperparams_julia_str(X, alpha, annealing, batchSize, batching, binary_operators, constraints_str,
-                               def_hyperparams, equation_file, fast_cycle, fractionReplacedHof, hofMigration,
-                               maxdepth, maxsize, migration,
-                               optimizer_algorithm, optimizer_nrestarts,
-                               optimize_probability, optimizer_iterations, npop,
-                               parsimony, perturbationFactor, populations, procs, shouldOptimizeConstants,
-                               unary_operators, useFrequency, use_custom_variable_names,
-                               variable_names, warmupMaxsizeBy, weightAddNode,
-                               ncyclesperiteration, fractionReplaced, topn, verbosity, progress, loss,
-                               weightDeleteNode, weightDoNothing, weightInsertNode, weightMutateConstant,
-                               weightMutateOperator, weightRandomize, weightSimplify, weights,
-                               tournament_selection_n, tournament_selection_p,
-                               **kwargs):
     try:
         term_width = shutil.get_terminal_size().columns
     except:
-        _, term_width = subprocess.check_output(['stty', 'size']).split()
     def tuple_fix(ops):
         if len(ops) > 1:
-            return ', '.join(ops)
         elif len(ops) == 0:
-            return ''
         else:
-            return ops[0] + ','
     def_hyperparams += f"""\n
 plus=(+)
@@ -606,7 +716,7 @@ progress={'true' if progress else 'false'},
 terminal_width={term_width:d}
 """
-    def_hyperparams += '\n)'
     return def_hyperparams
@@ -639,16 +749,20 @@ def _handle_constraints(binary_operators, constraints, unary_operators, **kwargs
     for op in binary_operators:
         if op not in constraints:
             constraints[op] = (-1, -1)
-        if op in ['plus', 'sub']:
             if constraints[op][0] != constraints[op][1]:
                 raise NotImplementedError(
-                    "You need equal constraints on both sides for - and *, due to simplification strategies.")
-        elif op == 'mult':
             # Make sure the complex expression is in the left side.
             if constraints[op][0] == -1:
                 continue
             elif constraints[op][1] == -1 or constraints[op][0] < constraints[op][1]:
-                constraints[op][0], constraints[op][1] = constraints[op][1], constraints[op][0]
 def _create_inline_operators(binary_operators, unary_operators, **kwargs):
@@ -656,27 +770,33 @@ def _create_inline_operators(binary_operators, unary_operators, **kwargs):
     for op_list in [binary_operators, unary_operators]:
         for i in range(len(op_list)):
             op = op_list[i]
-            is_user_defined_operator = '(' in op
             if is_user_defined_operator:
                 def_hyperparams += op + "\n"
                 # Cut off from the first non-alphanumeric char:
                 first_non_char = [
-                    j for j in range(len(op))
-                    if not (op[j].isalpha() or op[j].isdigit())][0]
                 function_name = op[:first_non_char]
                 op_list[i] = function_name
     return def_hyperparams
-def _handle_feature_selection(X, select_k_features, use_custom_variable_names, variable_names, y):
     if select_k_features is not None:
         selection = run_feature_selection(X, y, select_k_features)
         print(f"Using features {selection}")
         X = X[:, selection]
         if use_custom_variable_names:
-            variable_names = [variable_names[selection[i]] for i in range(len(selection))]
     else:
         selection = None
     return X, variable_names, selection
@@ -687,22 +807,34 @@ def _set_paths(tempdir):
     pkg_directory = Path(__file__).parents[1]
     default_project_file = pkg_directory / "Project.toml"
     tmpdir = Path(tempfile.mkdtemp(dir=tempdir))
-    hyperparam_filename = tmpdir / f'hyperparams.jl'
-    dataset_filename = tmpdir / f'dataset.jl'
-    runfile_filename = tmpdir / f'runfile.jl'
     X_filename = tmpdir / "X.csv"
     y_filename = tmpdir / "y.csv"
     weights_filename = tmpdir / "weights.csv"
-    return dict(pkg_directory=pkg_directory,
-	    default_project_file=default_project_file,
-	    X_filename=X_filename,
-            dataset_filename=dataset_filename,
-            hyperparam_filename=hyperparam_filename,
-            runfile_filename=runfile_filename, tmpdir=tmpdir,
-            weights_filename=weights_filename, y_filename=y_filename)
-def _check_assertions(X, binary_operators, unary_operators, use_custom_variable_names, variable_names, weights, y):
     # Check for potential errors before they happen
     assert len(unary_operators) + len(binary_operators) > 0
     assert len(X.shape) == 2
@@ -714,76 +846,108 @@ def _check_assertions(X, binary_operators, unary_operators, use_custom_variable_
     if use_custom_variable_names:
         assert len(variable_names) == X.shape[1]
 def _check_for_julia_installation():
     try:
         process = subprocess.Popen(["julia", "-v"], stdout=subprocess.PIPE, bufsize=-1)
         while True:
             line = process.stdout.readline()
-            if not line: break
         process.stdout.close()
         process.wait()
     except FileNotFoundError:
         import os
-        raise RuntimeError(f"Your current $PATH is: {os.environ['PATH']}\nPySR could not start julia. Make sure julia is installed and on your $PATH.")
     process.kill()
 def run_feature_selection(X, y, select_k_features):
     """Use a gradient boosting tree regressor as a proxy for finding
-        the k most important features in X, returning indices for those
-        features as output."""
     from sklearn.ensemble import RandomForestRegressor
     from sklearn.feature_selection import SelectFromModel, SelectKBest
     clf = RandomForestRegressor(n_estimators=100, max_depth=3, random_state=0)
     clf.fit(X, y)
-    selector = SelectFromModel(clf, threshold=-np.inf,
-            max_features=select_k_features, prefit=True)
     return selector.get_support(indices=True)
-def get_hof(equation_file=None, n_features=None, variable_names=None,
-            output_jax_format=None, output_torch_format=None,
-            selection=None, extra_sympy_mappings=None,
-            extra_jax_mappings=None, extra_torch_mappings=None,
-            multioutput=None, nout=None, **kwargs):
     """Get the equations from a hall of fame file. If no arguments
     entered, the ones used previously from a call to PySR will be used."""
     global global_state
-    if equation_file is None: equation_file = global_state['equation_file']
-    if n_features is None: n_features = global_state['n_features']
-    if variable_names is None: variable_names = global_state['variable_names']
-    if extra_sympy_mappings is None: extra_sympy_mappings = global_state['extra_sympy_mappings']
-    if extra_jax_mappings is None: extra_jax_mappings = global_state['extra_jax_mappings']
-    if extra_torch_mappings is None: extra_torch_mappings = global_state['extra_torch_mappings']
-    if output_torch_format is None: output_torch_format = global_state['output_torch_format']
-    if output_jax_format is None: output_jax_format = global_state['output_jax_format']
-    if multioutput is None: multioutput = global_state['multioutput']
-    if nout is None: nout = global_state['nout']
-    if selection is None: selection = global_state['selection']
-    global_state['selection'] = selection
-    global_state['equation_file'] = equation_file
-    global_state['n_features'] = n_features
-    global_state['variable_names'] = variable_names
-    global_state['extra_sympy_mappings'] = extra_sympy_mappings
-    global_state['extra_jax_mappings'] = extra_jax_mappings
-    global_state['extra_torch_mappings'] = extra_torch_mappings
-    global_state['output_torch_format'] = output_torch_format
-    global_state['output_jax_format'] = output_jax_format
-    global_state['multioutput'] = multioutput
-    global_state['nout'] = nout
-    global_state['selection'] = selection
     try:
         if multioutput:
-            all_outputs = [pd.read_csv(str(equation_file) + f'.out{i}' + '.bkup', sep="|") for i in range(1, nout+1)]
         else:
-            all_outputs = [pd.read_csv(str(equation_file) + '.bkup', sep="|")]
     except FileNotFoundError:
-        raise RuntimeError("Couldn't find equation file! The equation search likely exited before a single iteration completed.")
     ret_outputs = []
@@ -798,19 +962,16 @@ def get_hof(equation_file=None, n_features=None, variable_names=None,
             jax_format = []
         if output_torch_format:
             torch_format = []
-        use_custom_variable_names = (len(variable_names) != 0)
-        local_sympy_mappings = {
-                **extra_sympy_mappings,
-                **sympy_mappings
-        }
         if use_custom_variable_names:
             sympy_symbols = [sympy.Symbol(variable_names[i]) for i in range(n_features)]
         else:
-            sympy_symbols = [sympy.Symbol('x%d'%i) for i in range(n_features)]
         for i in range(len(output)):
-            eqn = sympify(output.loc[i, 'Equation'], locals=local_sympy_mappings)
             sympy_format.append(eqn)
             # Numpy:
@@ -819,37 +980,46 @@ def get_hof(equation_file=None, n_features=None, variable_names=None,
             # JAX:
             if output_jax_format:
                 from .export_jax import sympy2jax
                 func, params = sympy2jax(eqn, sympy_symbols, selection)
-                jax_format.append({'callable': func, 'parameters': params})
             # Torch:
             if output_torch_format:
                 from .export_torch import sympy2torch
                 module = sympy2torch(eqn, sympy_symbols, selection=selection)
                 torch_format.append(module)
-            curMSE = output.loc[i, 'MSE']
-            curComplexity = output.loc[i, 'Complexity']
             if lastMSE is None:
                 cur_score = 0.0
             else:
-                cur_score = - np.log(curMSE/lastMSE)/(curComplexity - lastComplexity)
             scores.append(cur_score)
             lastMSE = curMSE
             lastComplexity = curComplexity
-        output['score'] = np.array(scores)
-        output['sympy_format'] = sympy_format
-        output['lambda_format'] = lambda_format
-        output_cols = ['Complexity', 'MSE', 'score', 'Equation', 'sympy_format', 'lambda_format']
         if output_jax_format:
-            output_cols += ['jax_format']
-            output['jax_format'] = jax_format
         if output_torch_format:
-            output_cols += ['torch_format']
-            output['torch_format'] = torch_format
         ret_outputs.append(output[output_cols])
@@ -858,67 +1028,80 @@ def get_hof(equation_file=None, n_features=None, variable_names=None,
     else:
         return ret_outputs[0]
 def best_row(equations=None):
     """Return the best row of a hall of fame file using the score column.
     By default this uses the last equation file.
     """
-    if equations is None: equations = get_hof()
     if isinstance(equations, list):
-        return [eq.iloc[np.argmax(eq['score'])] for eq in equations]
     else:
-        return equations.iloc[np.argmax(equations['score'])]
 def best_tex(equations=None):
     """Return the equation with the best score, in latex format
     By default this uses the last equation file.
     """
-    if equations is None: equations = get_hof()
     if isinstance(equations, list):
-        return [sympy.latex(best_row(eq)['sympy_format'].simplify()) for eq in equations]
     else:
-        return sympy.latex(best_row(equations)['sympy_format'].simplify())
 def best(equations=None):
     """Return the equation with the best score, in sympy format.
     By default this uses the last equation file.
     """
-    if equations is None: equations = get_hof()
     if isinstance(equations, list):
-        return [best_row(eq)['sympy_format'].simplify() for eq in equations]
     else:
-        return best_row(equations)['sympy_format'].simplify()
 def best_callable(equations=None):
     """Return the equation with the best score, in callable format.
     By default this uses the last equation file.
     """
-    if equations is None: equations = get_hof()
     if isinstance(equations, list):
-        return [best_row(eq)['lambda_format'] for eq in equations]
     else:
-        return best_row(equations)['lambda_format']
 def _escape_filename(filename):
     """Turns a file into a string representation with correctly escaped backslashes"""
     repr = str(filename)
-    repr = repr.replace('\\', '\\\\')
     return repr
 # https://gist.github.com/garrettdreyfus/8153571
 def _yesno(question):
     """Simple Yes/No Function."""
-    prompt = f'{question} (y/n): '
     ans = input(prompt).strip().lower()
-    if ans not in ['y', 'n']:
-        print(f'{ans} is invalid, please try again...')
         return _yesno(question)
-    if ans == 'y':
         return True
     return False
 class CallableEquation(object):
     """Simple wrapper for numpy lambda functions built with sympy"""
     def __init__(self, sympy_symbols, eqn, selection=None):
         self._sympy = eqn
         self._sympy_symbols = sympy_symbols
@@ -933,4 +1116,3 @@ class CallableEquation(object):
             return self._lambda(*X[:, self._selection].T)
         else:
             return self._lambda(*X.T)

 import warnings
 global_state = dict(
+    equation_file="hall_of_fame.csv",
     n_features=None,
     variable_names=[],
     extra_sympy_mappings={},
     output_torch_format=False,
     multioutput=False,
     nout=1,
+    selection=None,
 )
 sympy_mappings = {
+    "div": lambda x, y: x / y,
+    "mult": lambda x, y: x * y,
+    "sqrt_abs": lambda x: sympy.sqrt(abs(x)),
+    "square": lambda x: x ** 2,
+    "cube": lambda x: x ** 3,
+    "plus": lambda x, y: x + y,
+    "sub": lambda x, y: x - y,
+    "neg": lambda x: -x,
+    "pow": lambda x, y: abs(x) ** y,
+    "cos": lambda x: sympy.cos(x),
+    "sin": lambda x: sympy.sin(x),
+    "tan": lambda x: sympy.tan(x),
+    "cosh": lambda x: sympy.cosh(x),
+    "sinh": lambda x: sympy.sinh(x),
+    "tanh": lambda x: sympy.tanh(x),
+    "exp": lambda x: sympy.exp(x),
+    "acos": lambda x: sympy.acos(x),
+    "asin": lambda x: sympy.asin(x),
+    "atan": lambda x: sympy.atan(x),
+    "acosh": lambda x: sympy.acosh(abs(x) + 1),
+    "acosh_abs": lambda x: sympy.acosh(abs(x) + 1),
+    "asinh": lambda x: sympy.asinh(x),
+    "atanh": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - 1),
+    "atanh_clip": lambda x: sympy.atanh(sympy.Mod(x + 1, 2) - 1),
+    "abs": lambda x: abs(x),
+    "mod": lambda x, y: sympy.Mod(x, y),
+    "erf": lambda x: sympy.erf(x),
+    "erfc": lambda x: sympy.erfc(x),
+    "log_abs": lambda x: sympy.log(abs(x)),
+    "log10_abs": lambda x: sympy.log(abs(x), 10),
+    "log2_abs": lambda x: sympy.log(abs(x), 2),
+    "log1p_abs": lambda x: sympy.log(abs(x) + 1),
+    "floor": lambda x: sympy.floor(x),
+    "ceil": lambda x: sympy.ceil(x),
+    "sign": lambda x: sympy.sign(x),
+    "gamma": lambda x: sympy.gamma(x),
 }
+def pysr(
+    X,
+    y,
+    weights=None,
+    binary_operators=None,
+    unary_operators=None,
+    procs=4,
+    loss="L2DistLoss()",
+    populations=20,
+    niterations=100,
+    ncyclesperiteration=300,
+    alpha=0.1,
+    annealing=False,
+    fractionReplaced=0.10,
+    fractionReplacedHof=0.10,
+    npop=1000,
+    parsimony=1e-4,
+    migration=True,
+    hofMigration=True,
+    shouldOptimizeConstants=True,
+    topn=10,
+    weightAddNode=1,
+    weightInsertNode=3,
+    weightDeleteNode=3,
+    weightDoNothing=1,
+    weightMutateConstant=10,
+    weightMutateOperator=1,
+    weightRandomize=1,
+    weightSimplify=0.01,
+    perturbationFactor=1.0,
+    timeout=None,
+    extra_sympy_mappings=None,
+    extra_torch_mappings=None,
+    extra_jax_mappings=None,
+    equation_file=None,
+    verbosity=1e9,
+    progress=None,
+    maxsize=20,
+    fast_cycle=False,
+    maxdepth=None,
+    variable_names=None,
+    batching=False,
+    batchSize=50,
+    select_k_features=None,
+    warmupMaxsizeBy=0.0,
+    constraints=None,
+    useFrequency=True,
+    tempdir=None,
+    delete_tempfiles=True,
+    julia_optimization=3,
+    julia_project=None,
+    user_input=True,
+    update=True,
+    temp_equation_file=False,
+    output_jax_format=False,
+    output_torch_format=False,
+    optimizer_algorithm="BFGS",
+    optimizer_nrestarts=3,
+    optimize_probability=1.0,
+    optimizer_iterations=10,
+    tournament_selection_n=10,
+    tournament_selection_p=1.0,
+):
     """Run symbolic regression to fit f(X[i, :]) ~ y[i] for all i.
     Note: most default parameters have been tuned over several example
     equations, but you should adjust `niterations`,
     :type: pd.DataFrame/list
     """
     if binary_operators is None:
+        binary_operators = "+ * - /".split(" ")
     if unary_operators is None:
         unary_operators = []
     if extra_sympy_mappings is None:
         constraints = {}
     if progress is not None:
+        if progress and ("buffer" not in sys.stdout.__dir__()):
+            warnings.warn(
+                "Note: it looks like you are running in Jupyter. The progress bar will be turned off."
+            )
             progress = False
     else:
+        if "buffer" in sys.stdout.__dir__():
             progress = True
         else:
             progress = False
+    assert optimizer_algorithm in ["NelderMead", "BFGS"]
     assert tournament_selection_n < npop
     if isinstance(X, pd.DataFrame):
         X = X[:, None]
     if len(variable_names) == 0:
+        variable_names = [f"x{i}" for i in range(X.shape[1])]
+    use_custom_variable_names = len(variable_names) != 0
+    _check_assertions(
+        X,
+        binary_operators,
+        unary_operators,
+        use_custom_variable_names,
+        variable_names,
+        weights,
+        y,
+    )
     _check_for_julia_installation()
     if len(X) > 10000 and not batching:
+        warnings.warn(
+            "Note: you are running with more than 10,000 datapoints. You should consider turning on batching (https://pysr.readthedocs.io/en/latest/docs/options/#batching). You should also reconsider if you need that many datapoints. Unless you have a large amount of noise (in which case you should smooth your dataset first), generally < 10,000 datapoints is enough to find a functional form with symbolic regression. More datapoints will lower the search speed."
+        )
     if maxsize > 40:
+        warnings.warn(
+            "Note: Using a large maxsize for the equation search will be exponentially slower and use significant memory. You should consider turning `useFrequency` to False, and perhaps use `warmupMaxsizeBy`."
+        )
     X, variable_names, selection = _handle_feature_selection(
+        X, select_k_features, use_custom_variable_names, variable_names, y
+    )
     if maxdepth is None:
         maxdepth = maxsize
     else:
         raise NotImplementedError("y shape not supported!")
+    kwargs = dict(
+        X=X,
+        y=y,
+        weights=weights,
+        alpha=alpha,
+        annealing=annealing,
+        batchSize=batchSize,
+        batching=batching,
+        binary_operators=binary_operators,
+        fast_cycle=fast_cycle,
+        fractionReplaced=fractionReplaced,
+        ncyclesperiteration=ncyclesperiteration,
+        niterations=niterations,
+        npop=npop,
+        topn=topn,
+        verbosity=verbosity,
+        progress=progress,
+        update=update,
+        julia_optimization=julia_optimization,
+        timeout=timeout,
+        fractionReplacedHof=fractionReplacedHof,
+        hofMigration=hofMigration,
+        maxdepth=maxdepth,
+        maxsize=maxsize,
+        migration=migration,
+        optimizer_algorithm=optimizer_algorithm,
+        optimizer_nrestarts=optimizer_nrestarts,
+        optimize_probability=optimize_probability,
+        optimizer_iterations=optimizer_iterations,
+        parsimony=parsimony,
+        perturbationFactor=perturbationFactor,
+        populations=populations,
+        procs=procs,
+        shouldOptimizeConstants=shouldOptimizeConstants,
+        unary_operators=unary_operators,
+        useFrequency=useFrequency,
+        use_custom_variable_names=use_custom_variable_names,
+        variable_names=variable_names,
+        warmupMaxsizeBy=warmupMaxsizeBy,
+        weightAddNode=weightAddNode,
+        weightDeleteNode=weightDeleteNode,
+        weightDoNothing=weightDoNothing,
+        weightInsertNode=weightInsertNode,
+        weightMutateConstant=weightMutateConstant,
+        weightMutateOperator=weightMutateOperator,
+        weightRandomize=weightRandomize,
+        weightSimplify=weightSimplify,
+        constraints=constraints,
+        extra_sympy_mappings=extra_sympy_mappings,
+        extra_jax_mappings=extra_jax_mappings,
+        extra_torch_mappings=extra_torch_mappings,
+        julia_project=julia_project,
+        loss=loss,
+        output_jax_format=output_jax_format,
+        output_torch_format=output_torch_format,
+        selection=selection,
+        multioutput=multioutput,
+        nout=nout,
+        tournament_selection_n=tournament_selection_n,
+        tournament_selection_p=tournament_selection_p,
+    )
     kwargs = {**_set_paths(tempdir), **kwargs}
     if temp_equation_file:
+        equation_file = kwargs["tmpdir"] / f"hall_of_fame.csv"
     elif equation_file is None:
         date_time = datetime.now().strftime("%Y-%m-%d_%H%M%S.%f")[:-3]
+        equation_file = "hall_of_fame_" + date_time + ".csv"
     kwargs = {**dict(equation_file=equation_file), **kwargs}
+    pkg_directory = kwargs["pkg_directory"]
     manifest_file = None
+    if kwargs["julia_project"] is not None:
+        manifest_filepath = Path(kwargs["julia_project"]) / "Manifest.toml"
     else:
+        manifest_filepath = pkg_directory / "Manifest.toml"
+    kwargs["need_install"] = False
     if not (manifest_filepath).is_file():
+        kwargs["need_install"] = (not user_input) or _yesno(
+            "I will install Julia packages using PySR's Project.toml file. OK?"
+        )
+        if kwargs["need_install"]:
             print("OK. I will install at launch.")
             assert update
+    kwargs["def_hyperparams"] = _create_inline_operators(**kwargs)
     _handle_constraints(**kwargs)
+    kwargs["constraints_str"] = _make_constraints_str(**kwargs)
+    kwargs["def_hyperparams"] = _make_hyperparams_julia_str(**kwargs)
+    kwargs["def_datasets"] = _make_datasets_julia_str(**kwargs)
     _create_julia_files(**kwargs)
     _final_pysr_process(**kwargs)
     equations = get_hof(**kwargs)
     if delete_tempfiles:
+        shutil.rmtree(kwargs["tmpdir"])
     return equations
 def _set_globals(X, **kwargs):
     global global_state
+    global_state["n_features"] = X.shape[1]
     for key, value in kwargs.items():
         if key in global_state:
             global_state[key] = value
 def _final_pysr_process(julia_optimization, runfile_filename, timeout, **kwargs):
     command = [
+        f"julia",
+        f"-O{julia_optimization:d}",
         str(runfile_filename),
     ]
     if timeout is not None:
+        command = [f"timeout", f"{timeout}"] + command
     _cmd_runner(command, **kwargs)
 def _cmd_runner(command, progress, **kwargs):
+    if kwargs["verbosity"] > 0:
+        print("Running on", " ".join(command))
     process = subprocess.Popen(command, stdout=subprocess.PIPE, bufsize=-1)
     try:
         while True:
             line = process.stdout.readline()
+            if not line:
+                break
+            decoded_line = line.decode("utf-8")
             if progress:
+                decoded_line = (
+                    decoded_line.replace("\\033[K", "\033[K")
+                    .replace("\\033[1A", "\033[1A")
+                    .replace("\\033[1B", "\033[1B")
+                    .replace("\\r", "\r")
+                    .encode(sys.stdout.encoding, errors="replace")
+                )
                 sys.stdout.buffer.write(decoded_line)
                 sys.stdout.flush()
             else:
+                print(decoded_line, end="")
         process.stdout.close()
         process.wait()
         print("Killing process... will return when done.")
         process.kill()
+def _create_julia_files(
+    dataset_filename,
+    def_datasets,
+    hyperparam_filename,
+    def_hyperparams,
+    fractionReplaced,
+    ncyclesperiteration,
+    niterations,
+    npop,
+    runfile_filename,
+    topn,
+    verbosity,
+    julia_project,
+    procs,
+    weights,
+    X,
+    variable_names,
+    pkg_directory,
+    need_install,
+    update,
+    **kwargs,
+):
+    with open(hyperparam_filename, "w") as f:
         print(def_hyperparams, file=f)
+    with open(dataset_filename, "w") as f:
         print(def_datasets, file=f)
+    with open(runfile_filename, "w") as f:
         if julia_project is None:
             julia_project = pkg_directory
         else:
             julia_project = Path(julia_project)
+        print(f"import Pkg", file=f)
         print(f'Pkg.activate("{_escape_filename(julia_project)}")', file=f)
         if need_install:
+            print(f"Pkg.instantiate()", file=f)
+            print(f"Pkg.update()", file=f)
+            print(f"Pkg.precompile()", file=f)
         elif update:
+            print(f"Pkg.update()", file=f)
+        print(f"using SymbolicRegression", file=f)
         print(f'include("{_escape_filename(hyperparam_filename)}")', file=f)
         print(f'include("{_escape_filename(dataset_filename)}")', file=f)
         if len(variable_names) == 0:
             varMap = "[" + ",".join([f'"x{i}"' for i in range(X.shape[1])]) + "]"
         else:
+            varMap = (
+                "[" + ",".join(['"' + vname + '"' for vname in variable_names]) + "]"
+            )
         if weights is not None:
+            print(
+                f"EquationSearch(X, y, weights=weights, niterations={niterations:d}, varMap={varMap}, options=options, numprocs={procs})",
+                file=f,
+            )
         else:
+            print(
+                f"EquationSearch(X, y, niterations={niterations:d}, varMap={varMap}, options=options, numprocs={procs})",
+                file=f,
+            )
+def _make_datasets_julia_str(
+    X, X_filename, weights, weights_filename, y, y_filename, multioutput, **kwargs
+):
     def_datasets = """using DelimitedFiles"""
+    np.savetxt(X_filename, X.astype(np.float32), delimiter=",")
     if multioutput:
+        np.savetxt(y_filename, y.astype(np.float32), delimiter=",")
     else:
+        np.savetxt(y_filename, y.reshape(-1, 1).astype(np.float32), delimiter=",")
     if weights is not None:
         if multioutput:
+            np.savetxt(weights_filename, weights.astype(np.float32), delimiter=",")
         else:
+            np.savetxt(
+                weights_filename,
+                weights.reshape(-1, 1).astype(np.float32),
+                delimiter=",",
+            )
     def_datasets += f"""
 X = copy(transpose(readdlm("{_escape_filename(X_filename)}", ',', Float32, '\\n')))"""
     if multioutput:
+        def_datasets += f"""
 y = copy(transpose(readdlm("{_escape_filename(y_filename)}", ',', Float32, '\\n')))"""
     else:
+        def_datasets += f"""
 y = readdlm("{_escape_filename(y_filename)}", ',', Float32, '\\n')[:, 1]"""
     if weights is not None:
 weights = readdlm("{_escape_filename(weights_filename)}", ',', Float32, '\\n')[:, 1]"""
     return def_datasets
+def _make_hyperparams_julia_str(
+    X,
+    alpha,
+    annealing,
+    batchSize,
+    batching,
+    binary_operators,
+    constraints_str,
+    def_hyperparams,
+    equation_file,
+    fast_cycle,
+    fractionReplacedHof,
+    hofMigration,
+    maxdepth,
+    maxsize,
+    migration,
+    optimizer_algorithm,
+    optimizer_nrestarts,
+    optimize_probability,
+    optimizer_iterations,
+    npop,
+    parsimony,
+    perturbationFactor,
+    populations,
+    procs,
+    shouldOptimizeConstants,
+    unary_operators,
+    useFrequency,
+    use_custom_variable_names,
+    variable_names,
+    warmupMaxsizeBy,
+    weightAddNode,
+    ncyclesperiteration,
+    fractionReplaced,
+    topn,
+    verbosity,
+    progress,
+    loss,
+    weightDeleteNode,
+    weightDoNothing,
+    weightInsertNode,
+    weightMutateConstant,
+    weightMutateOperator,
+    weightRandomize,
+    weightSimplify,
+    weights,
+    tournament_selection_n,
+    tournament_selection_p,
+    **kwargs,
+):
     try:
         term_width = shutil.get_terminal_size().columns
     except:
+        _, term_width = subprocess.check_output(["stty", "size"]).split()
     def tuple_fix(ops):
         if len(ops) > 1:
+            return ", ".join(ops)
         elif len(ops) == 0:
+            return ""
         else:
+            return ops[0] + ","
     def_hyperparams += f"""\n
 plus=(+)
 terminal_width={term_width:d}
 """
+    def_hyperparams += "\n)"
     return def_hyperparams
     for op in binary_operators:
         if op not in constraints:
             constraints[op] = (-1, -1)
+        if op in ["plus", "sub"]:
             if constraints[op][0] != constraints[op][1]:
                 raise NotImplementedError(
+                    "You need equal constraints on both sides for - and *, due to simplification strategies."
+                )
+        elif op == "mult":
             # Make sure the complex expression is in the left side.
             if constraints[op][0] == -1:
                 continue
             elif constraints[op][1] == -1 or constraints[op][0] < constraints[op][1]:
+                constraints[op][0], constraints[op][1] = (
+                    constraints[op][1],
+                    constraints[op][0],
+                )
 def _create_inline_operators(binary_operators, unary_operators, **kwargs):
     for op_list in [binary_operators, unary_operators]:
         for i in range(len(op_list)):
             op = op_list[i]
+            is_user_defined_operator = "(" in op
             if is_user_defined_operator:
                 def_hyperparams += op + "\n"
                 # Cut off from the first non-alphanumeric char:
                 first_non_char = [
+                    j
+                    for j in range(len(op))
+                    if not (op[j].isalpha() or op[j].isdigit())
+                ][0]
                 function_name = op[:first_non_char]
                 op_list[i] = function_name
     return def_hyperparams
+def _handle_feature_selection(
+    X, select_k_features, use_custom_variable_names, variable_names, y
+):
     if select_k_features is not None:
         selection = run_feature_selection(X, y, select_k_features)
         print(f"Using features {selection}")
         X = X[:, selection]
         if use_custom_variable_names:
+            variable_names = [
+                variable_names[selection[i]] for i in range(len(selection))
+            ]
     else:
         selection = None
     return X, variable_names, selection
     pkg_directory = Path(__file__).parents[1]
     default_project_file = pkg_directory / "Project.toml"
     tmpdir = Path(tempfile.mkdtemp(dir=tempdir))
+    hyperparam_filename = tmpdir / f"hyperparams.jl"
+    dataset_filename = tmpdir / f"dataset.jl"
+    runfile_filename = tmpdir / f"runfile.jl"
     X_filename = tmpdir / "X.csv"
     y_filename = tmpdir / "y.csv"
     weights_filename = tmpdir / "weights.csv"
+    return dict(
+        pkg_directory=pkg_directory,
+        default_project_file=default_project_file,
+        X_filename=X_filename,
+        dataset_filename=dataset_filename,
+        hyperparam_filename=hyperparam_filename,
+        runfile_filename=runfile_filename,
+        tmpdir=tmpdir,
+        weights_filename=weights_filename,
+        y_filename=y_filename,
+    )
+def _check_assertions(
+    X,
+    binary_operators,
+    unary_operators,
+    use_custom_variable_names,
+    variable_names,
+    weights,
+    y,
+):
     # Check for potential errors before they happen
     assert len(unary_operators) + len(binary_operators) > 0
     assert len(X.shape) == 2
     if use_custom_variable_names:
         assert len(variable_names) == X.shape[1]
 def _check_for_julia_installation():
     try:
         process = subprocess.Popen(["julia", "-v"], stdout=subprocess.PIPE, bufsize=-1)
         while True:
             line = process.stdout.readline()
+            if not line:
+                break
         process.stdout.close()
         process.wait()
     except FileNotFoundError:
         import os
+        raise RuntimeError(
+            f"Your current $PATH is: {os.environ['PATH']}\nPySR could not start julia. Make sure julia is installed and on your $PATH."
+        )
     process.kill()
 def run_feature_selection(X, y, select_k_features):
     """Use a gradient boosting tree regressor as a proxy for finding
+    the k most important features in X, returning indices for those
+    features as output."""
     from sklearn.ensemble import RandomForestRegressor
     from sklearn.feature_selection import SelectFromModel, SelectKBest
     clf = RandomForestRegressor(n_estimators=100, max_depth=3, random_state=0)
     clf.fit(X, y)
+    selector = SelectFromModel(
+        clf, threshold=-np.inf, max_features=select_k_features, prefit=True
+    )
     return selector.get_support(indices=True)
+def get_hof(
+    equation_file=None,
+    n_features=None,
+    variable_names=None,
+    output_jax_format=None,
+    output_torch_format=None,
+    selection=None,
+    extra_sympy_mappings=None,
+    extra_jax_mappings=None,
+    extra_torch_mappings=None,
+    multioutput=None,
+    nout=None,
+    **kwargs,
+):
     """Get the equations from a hall of fame file. If no arguments
     entered, the ones used previously from a call to PySR will be used."""
     global global_state
+    if equation_file is None:
+        equation_file = global_state["equation_file"]
+    if n_features is None:
+        n_features = global_state["n_features"]
+    if variable_names is None:
+        variable_names = global_state["variable_names"]
+    if extra_sympy_mappings is None:
+        extra_sympy_mappings = global_state["extra_sympy_mappings"]
+    if extra_jax_mappings is None:
+        extra_jax_mappings = global_state["extra_jax_mappings"]
+    if extra_torch_mappings is None:
+        extra_torch_mappings = global_state["extra_torch_mappings"]
+    if output_torch_format is None:
+        output_torch_format = global_state["output_torch_format"]
+    if output_jax_format is None:
+        output_jax_format = global_state["output_jax_format"]
+    if multioutput is None:
+        multioutput = global_state["multioutput"]
+    if nout is None:
+        nout = global_state["nout"]
+    if selection is None:
+        selection = global_state["selection"]
+    global_state["selection"] = selection
+    global_state["equation_file"] = equation_file
+    global_state["n_features"] = n_features
+    global_state["variable_names"] = variable_names
+    global_state["extra_sympy_mappings"] = extra_sympy_mappings
+    global_state["extra_jax_mappings"] = extra_jax_mappings
+    global_state["extra_torch_mappings"] = extra_torch_mappings
+    global_state["output_torch_format"] = output_torch_format
+    global_state["output_jax_format"] = output_jax_format
+    global_state["multioutput"] = multioutput
+    global_state["nout"] = nout
+    global_state["selection"] = selection
     try:
         if multioutput:
+            all_outputs = [
+                pd.read_csv(str(equation_file) + f".out{i}" + ".bkup", sep="|")
+                for i in range(1, nout + 1)
+            ]
         else:
+            all_outputs = [pd.read_csv(str(equation_file) + ".bkup", sep="|")]
     except FileNotFoundError:
+        raise RuntimeError(
+            "Couldn't find equation file! The equation search likely exited before a single iteration completed."
+        )
     ret_outputs = []
             jax_format = []
         if output_torch_format:
             torch_format = []
+        use_custom_variable_names = len(variable_names) != 0
+        local_sympy_mappings = {**extra_sympy_mappings, **sympy_mappings}
         if use_custom_variable_names:
             sympy_symbols = [sympy.Symbol(variable_names[i]) for i in range(n_features)]
         else:
+            sympy_symbols = [sympy.Symbol("x%d" % i) for i in range(n_features)]
         for i in range(len(output)):
+            eqn = sympify(output.loc[i, "Equation"], locals=local_sympy_mappings)
             sympy_format.append(eqn)
             # Numpy:
             # JAX:
             if output_jax_format:
                 from .export_jax import sympy2jax
                 func, params = sympy2jax(eqn, sympy_symbols, selection)
+                jax_format.append({"callable": func, "parameters": params})
             # Torch:
             if output_torch_format:
                 from .export_torch import sympy2torch
                 module = sympy2torch(eqn, sympy_symbols, selection=selection)
                 torch_format.append(module)
+            curMSE = output.loc[i, "MSE"]
+            curComplexity = output.loc[i, "Complexity"]
             if lastMSE is None:
                 cur_score = 0.0
             else:
+                cur_score = -np.log(curMSE / lastMSE) / (curComplexity - lastComplexity)
             scores.append(cur_score)
             lastMSE = curMSE
             lastComplexity = curComplexity
+        output["score"] = np.array(scores)
+        output["sympy_format"] = sympy_format
+        output["lambda_format"] = lambda_format
+        output_cols = [
+            "Complexity",
+            "MSE",
+            "score",
+            "Equation",
+            "sympy_format",
+            "lambda_format",
+        ]
         if output_jax_format:
+            output_cols += ["jax_format"]
+            output["jax_format"] = jax_format
         if output_torch_format:
+            output_cols += ["torch_format"]
+            output["torch_format"] = torch_format
         ret_outputs.append(output[output_cols])
     else:
         return ret_outputs[0]
 def best_row(equations=None):
     """Return the best row of a hall of fame file using the score column.
     By default this uses the last equation file.
     """
+    if equations is None:
+        equations = get_hof()
     if isinstance(equations, list):
+        return [eq.iloc[np.argmax(eq["score"])] for eq in equations]
     else:
+        return equations.iloc[np.argmax(equations["score"])]
 def best_tex(equations=None):
     """Return the equation with the best score, in latex format
     By default this uses the last equation file.
     """
+    if equations is None:
+        equations = get_hof()
     if isinstance(equations, list):
+        return [
+            sympy.latex(best_row(eq)["sympy_format"].simplify()) for eq in equations
+        ]
     else:
+        return sympy.latex(best_row(equations)["sympy_format"].simplify())
 def best(equations=None):
     """Return the equation with the best score, in sympy format.
     By default this uses the last equation file.
     """
+    if equations is None:
+        equations = get_hof()
     if isinstance(equations, list):
+        return [best_row(eq)["sympy_format"].simplify() for eq in equations]
     else:
+        return best_row(equations)["sympy_format"].simplify()
 def best_callable(equations=None):
     """Return the equation with the best score, in callable format.
     By default this uses the last equation file.
     """
+    if equations is None:
+        equations = get_hof()
     if isinstance(equations, list):
+        return [best_row(eq)["lambda_format"] for eq in equations]
     else:
+        return best_row(equations)["lambda_format"]
 def _escape_filename(filename):
     """Turns a file into a string representation with correctly escaped backslashes"""
     repr = str(filename)
+    repr = repr.replace("\\", "\\\\")
     return repr
 # https://gist.github.com/garrettdreyfus/8153571
 def _yesno(question):
     """Simple Yes/No Function."""
+    prompt = f"{question} (y/n): "
     ans = input(prompt).strip().lower()
+    if ans not in ["y", "n"]:
+        print(f"{ans} is invalid, please try again...")
         return _yesno(question)
+    if ans == "y":
         return True
     return False
 class CallableEquation(object):
     """Simple wrapper for numpy lambda functions built with sympy"""
     def __init__(self, sympy_symbols, eqn, selection=None):
         self._sympy = eqn
         self._sympy_symbols = sympy_symbols
             return self._lambda(*X[:, self._selection].T)
         else:
             return self._lambda(*X.T)

setup.py CHANGED Viewed

@@ -12,19 +12,13 @@ setuptools.setup(
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/MilesCranmer/pysr",
-    install_requires=[
-        "numpy",
-        "pandas",
-        "sympy"
-        ],
     packages=setuptools.find_packages(),
-    package_data={
-        'pysr': ['../Project.toml', '../datasets/*']
-    },
     include_package_data=False,
     classifiers=[
         "Programming Language :: Python :: 3",
         "Operating System :: OS Independent",
     ],
-    python_requires='>=3.7',
 )

     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/MilesCranmer/pysr",
+    install_requires=["numpy", "pandas", "sympy"],
     packages=setuptools.find_packages(),
+    package_data={"pysr": ["../Project.toml", "../datasets/*"]},
     include_package_data=False,
     classifiers=[
         "Programming Language :: Python :: 3",
         "Operating System :: OS Independent",
     ],
+    python_requires=">=3.7",
 )

test/test.py CHANGED Viewed

@@ -6,6 +6,7 @@ import sympy
 from sympy import lambdify
 import pandas as pd
 class TestPipeline(unittest.TestCase):
     def setUp(self):
         self.default_test_kwargs = dict(
@@ -17,86 +18,105 @@ class TestPipeline(unittest.TestCase):
         )
         np.random.seed(0)
         self.X = np.random.randn(100, 5)
     def test_linear_relation(self):
         y = self.X[:, 0]
         equations = pysr(self.X, y, **self.default_test_kwargs)
         print(equations)
-        self.assertLessEqual(equations.iloc[-1]['MSE'], 1e-4)
     def test_multioutput_custom_operator(self):
-        y = self.X[:, [0, 1]]**2
-        equations = pysr(self.X, y,
-                         unary_operators=["sq(x) = x^2"], binary_operators=["plus"],
-                         extra_sympy_mappings={'sq': lambda x: x**2},
-                         **self.default_test_kwargs,
-                         procs=0)
         print(equations)
-        self.assertLessEqual(equations[0].iloc[-1]['MSE'], 1e-4)
-        self.assertLessEqual(equations[1].iloc[-1]['MSE'], 1e-4)
     def test_multioutput_weighted_with_callable(self):
-        y = self.X[:, [0, 1]]**2
         w = np.random.rand(*y.shape)
         w[w < 0.5] = 0.0
         w[w >= 0.5] = 1.0
         # Double equation when weights are 0:
-        y += (1-w) * y
         # Thus, pysr needs to use the weights to find the right equation!
-        equations = pysr(self.X, y, weights=w,
-                         unary_operators=["sq(x) = x^2"], binary_operators=["plus"],
-                         extra_sympy_mappings={'sq': lambda x: x**2},
-                         **self.default_test_kwargs,
-                         procs=0)
         np.testing.assert_almost_equal(
-                best_callable()[0](self.X),
-                self.X[:, 0]**2,
-                decimal=4)
         np.testing.assert_almost_equal(
-                best_callable()[1](self.X),
-                self.X[:, 1]**2,
-                decimal=4)
     def test_empty_operators_single_input(self):
         X = np.random.randn(100, 1)
         y = X[:, 0] + 3.0
-        equations = pysr(X, y,
-                         unary_operators=[], binary_operators=["plus"],
-                         **self.default_test_kwargs)
-        self.assertLessEqual(equations.iloc[-1]['MSE'], 1e-4)
 class TestBest(unittest.TestCase):
     def setUp(self):
-        equations = pd.DataFrame({
-            'Equation': ['1.0', 'cos(x0)', 'square(cos(x0))'],
-            'MSE': [1.0, 0.1, 1e-5],
-            'Complexity': [1, 2, 3]
-            })
-        equations['Complexity MSE Equation'.split(' ')].to_csv(
-                'equation_file.csv.bkup', sep='|')
         self.equations = get_hof(
-                'equation_file.csv', n_features=2,
-                variables_names='x0 x1'.split(' '),
-                extra_sympy_mappings={}, output_jax_format=False,
-                multioutput=False, nout=1)
     def test_best(self):
-        self.assertEqual(best(self.equations), sympy.cos(sympy.Symbol('x0'))**2)
-        self.assertEqual(best(), sympy.cos(sympy.Symbol('x0'))**2)
     def test_best_tex(self):
-        self.assertEqual(best_tex(self.equations), '\\cos^{2}{\\left(x_{0} \\right)}')
-        self.assertEqual(best_tex(), '\\cos^{2}{\\left(x_{0} \\right)}')
     def test_best_lambda(self):
         X = np.random.randn(10, 2)
-        y = np.cos(X[:, 0])**2
         for f in [best_callable(), best_callable(self.equations)]:
             np.testing.assert_almost_equal(f(X), y, decimal=4)
@@ -107,22 +127,23 @@ class TestFeatureSelection(unittest.TestCase):
     def test_feature_selection(self):
         X = np.random.randn(20000, 5)
-        y = X[:, 2]**2 + X[:, 3]**2
         selected = run_feature_selection(X, y, select_k_features=2)
         self.assertEqual(sorted(selected), [2, 3])
     def test_feature_selection_handler(self):
         X = np.random.randn(20000, 5)
-        y = X[:, 2]**2 + X[:, 3]**2
-        var_names = [f'x{i}' for i in range(5)]
         selected_X, selected_var_names, selection = _handle_feature_selection(
-                X, select_k_features=2,
-                use_custom_variable_names=True,
-                variable_names=[f'x{i}' for i in range(5)],
-                y=y)
         self.assertTrue((2 in selection) and (3 in selection))
-        self.assertEqual(set(selected_var_names), set('x2 x3'.split(' ')))
         np.testing.assert_array_equal(
-                np.sort(selected_X, axis=1),
-                np.sort(X[:, [2, 3]], axis=1)
-            )

 from sympy import lambdify
 import pandas as pd
 class TestPipeline(unittest.TestCase):
     def setUp(self):
         self.default_test_kwargs = dict(
         )
         np.random.seed(0)
         self.X = np.random.randn(100, 5)
     def test_linear_relation(self):
         y = self.X[:, 0]
         equations = pysr(self.X, y, **self.default_test_kwargs)
         print(equations)
+        self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-4)
     def test_multioutput_custom_operator(self):
+        y = self.X[:, [0, 1]] ** 2
+        equations = pysr(
+            self.X,
+            y,
+            unary_operators=["sq(x) = x^2"],
+            binary_operators=["plus"],
+            extra_sympy_mappings={"sq": lambda x: x ** 2},
+            **self.default_test_kwargs,
+            procs=0,
+        )
         print(equations)
+        self.assertLessEqual(equations[0].iloc[-1]["MSE"], 1e-4)
+        self.assertLessEqual(equations[1].iloc[-1]["MSE"], 1e-4)
     def test_multioutput_weighted_with_callable(self):
+        y = self.X[:, [0, 1]] ** 2
         w = np.random.rand(*y.shape)
         w[w < 0.5] = 0.0
         w[w >= 0.5] = 1.0
         # Double equation when weights are 0:
+        y += (1 - w) * y
         # Thus, pysr needs to use the weights to find the right equation!
+        equations = pysr(
+            self.X,
+            y,
+            weights=w,
+            unary_operators=["sq(x) = x^2"],
+            binary_operators=["plus"],
+            extra_sympy_mappings={"sq": lambda x: x ** 2},
+            **self.default_test_kwargs,
+            procs=0,
+        )
         np.testing.assert_almost_equal(
+            best_callable()[0](self.X), self.X[:, 0] ** 2, decimal=4
+        )
         np.testing.assert_almost_equal(
+            best_callable()[1](self.X), self.X[:, 1] ** 2, decimal=4
+        )
     def test_empty_operators_single_input(self):
         X = np.random.randn(100, 1)
         y = X[:, 0] + 3.0
+        equations = pysr(
+            X,
+            y,
+            unary_operators=[],
+            binary_operators=["plus"],
+            **self.default_test_kwargs,
+        )
+        self.assertLessEqual(equations.iloc[-1]["MSE"], 1e-4)
 class TestBest(unittest.TestCase):
     def setUp(self):
+        equations = pd.DataFrame(
+            {
+                "Equation": ["1.0", "cos(x0)", "square(cos(x0))"],
+                "MSE": [1.0, 0.1, 1e-5],
+                "Complexity": [1, 2, 3],
+            }
+        )
+        equations["Complexity MSE Equation".split(" ")].to_csv(
+            "equation_file.csv.bkup", sep="|"
+        )
         self.equations = get_hof(
+            "equation_file.csv",
+            n_features=2,
+            variables_names="x0 x1".split(" "),
+            extra_sympy_mappings={},
+            output_jax_format=False,
+            multioutput=False,
+            nout=1,
+        )
     def test_best(self):
+        self.assertEqual(best(self.equations), sympy.cos(sympy.Symbol("x0")) ** 2)
+        self.assertEqual(best(), sympy.cos(sympy.Symbol("x0")) ** 2)
     def test_best_tex(self):
+        self.assertEqual(best_tex(self.equations), "\\cos^{2}{\\left(x_{0} \\right)}")
+        self.assertEqual(best_tex(), "\\cos^{2}{\\left(x_{0} \\right)}")
     def test_best_lambda(self):
         X = np.random.randn(10, 2)
+        y = np.cos(X[:, 0]) ** 2
         for f in [best_callable(), best_callable(self.equations)]:
             np.testing.assert_almost_equal(f(X), y, decimal=4)
     def test_feature_selection(self):
         X = np.random.randn(20000, 5)
+        y = X[:, 2] ** 2 + X[:, 3] ** 2
         selected = run_feature_selection(X, y, select_k_features=2)
         self.assertEqual(sorted(selected), [2, 3])
     def test_feature_selection_handler(self):
         X = np.random.randn(20000, 5)
+        y = X[:, 2] ** 2 + X[:, 3] ** 2
+        var_names = [f"x{i}" for i in range(5)]
         selected_X, selected_var_names, selection = _handle_feature_selection(
+            X,
+            select_k_features=2,
+            use_custom_variable_names=True,
+            variable_names=[f"x{i}" for i in range(5)],
+            y=y,
+        )
         self.assertTrue((2 in selection) and (3 in selection))
+        self.assertEqual(set(selected_var_names), set("x2 x3".split(" ")))
         np.testing.assert_array_equal(
+            np.sort(selected_X, axis=1), np.sort(X[:, [2, 3]], axis=1)
+        )

test/test_jax.py CHANGED Viewed

@@ -7,37 +7,48 @@ from jax import random
 from jax import grad
 import sympy
 class TestJAX(unittest.TestCase):
     def setUp(self):
         np.random.seed(0)
     def test_sympy2jax(self):
-        x, y, z = sympy.symbols('x y z')
         cosx = 1.0 * sympy.cos(x) + y
         key = random.PRNGKey(0)
         X = random.normal(key, (1000, 2))
         true = 1.0 * jnp.cos(X[:, 0]) + X[:, 1]
         f, params = sympy2jax(cosx, [x, y, z])
         self.assertTrue(jnp.all(jnp.isclose(f(X, params), true)).item())
     def test_pipeline(self):
         X = np.random.randn(100, 10)
-        equations = pd.DataFrame({
-            'Equation': ['1.0', 'cos(x0)', 'square(cos(x0))'],
-            'MSE': [1.0, 0.1, 1e-5],
-            'Complexity': [1, 2, 3]
-            })
-        equations['Complexity MSE Equation'.split(' ')].to_csv(
-                'equation_file.csv.bkup', sep='|')
         equations = get_hof(
-                'equation_file.csv', n_features=2, variables_names='x1 x2 x3'.split(' '),
-                extra_sympy_mappings={}, output_jax_format=True,
-                multioutput=False, nout=1, selection=[1, 2, 3])
         jformat = equations.iloc[-1].jax_format
         np.testing.assert_almost_equal(
-                np.array(jformat['callable'](jnp.array(X), jformat['parameters'])),
-                np.square(np.cos(X[:, 1])), # Select feature 1
-                decimal=4
         )

 from jax import grad
 import sympy
 class TestJAX(unittest.TestCase):
     def setUp(self):
         np.random.seed(0)
     def test_sympy2jax(self):
+        x, y, z = sympy.symbols("x y z")
         cosx = 1.0 * sympy.cos(x) + y
         key = random.PRNGKey(0)
         X = random.normal(key, (1000, 2))
         true = 1.0 * jnp.cos(X[:, 0]) + X[:, 1]
         f, params = sympy2jax(cosx, [x, y, z])
         self.assertTrue(jnp.all(jnp.isclose(f(X, params), true)).item())
     def test_pipeline(self):
         X = np.random.randn(100, 10)
+        equations = pd.DataFrame(
+            {
+                "Equation": ["1.0", "cos(x0)", "square(cos(x0))"],
+                "MSE": [1.0, 0.1, 1e-5],
+                "Complexity": [1, 2, 3],
+            }
+        )
+        equations["Complexity MSE Equation".split(" ")].to_csv(
+            "equation_file.csv.bkup", sep="|"
+        )
         equations = get_hof(
+            "equation_file.csv",
+            n_features=2,
+            variables_names="x1 x2 x3".split(" "),
+            extra_sympy_mappings={},
+            output_jax_format=True,
+            multioutput=False,
+            nout=1,
+            selection=[1, 2, 3],
+        )
         jformat = equations.iloc[-1].jax_format
         np.testing.assert_almost_equal(
+            np.array(jformat["callable"](jnp.array(X), jformat["parameters"])),
+            np.square(np.cos(X[:, 1])),  # Select feature 1
+            decimal=4,
         )

test/test_torch.py CHANGED Viewed

@@ -5,38 +5,49 @@ from pysr import sympy2torch, get_hof
 import torch
 import sympy
 class TestTorch(unittest.TestCase):
     def setUp(self):
         np.random.seed(0)
     def test_sympy2torch(self):
-        x, y, z = sympy.symbols('x y z')
         cosx = 1.0 * sympy.cos(x) + y
         X = torch.tensor(np.random.randn(1000, 3))
         true = 1.0 * torch.cos(X[:, 0]) + X[:, 1]
         torch_module = sympy2torch(cosx, [x, y, z])
         self.assertTrue(
-                np.all(np.isclose(torch_module(X).detach().numpy(), true.detach().numpy()))
         )
     def test_pipeline(self):
         X = np.random.randn(100, 10)
-        equations = pd.DataFrame({
-            'Equation': ['1.0', 'cos(x0)', 'square(cos(x0))'],
-            'MSE': [1.0, 0.1, 1e-5],
-            'Complexity': [1, 2, 3]
-            })
-        equations['Complexity MSE Equation'.split(' ')].to_csv(
-                'equation_file.csv.bkup', sep='|')
         equations = get_hof(
-                'equation_file.csv', n_features=2, variables_names='x1 x2 x3'.split(' '),
-                extra_sympy_mappings={}, output_torch_format=True,
-                multioutput=False, nout=1, selection=[1, 2, 3])
         tformat = equations.iloc[-1].torch_format
         np.testing.assert_almost_equal(
-                tformat(torch.tensor(X)).detach().numpy(),
-                np.square(np.cos(X[:, 1])), #Selection 1st feature
-                decimal=4
         )

 import torch
 import sympy
 class TestTorch(unittest.TestCase):
     def setUp(self):
         np.random.seed(0)
     def test_sympy2torch(self):
+        x, y, z = sympy.symbols("x y z")
         cosx = 1.0 * sympy.cos(x) + y
         X = torch.tensor(np.random.randn(1000, 3))
         true = 1.0 * torch.cos(X[:, 0]) + X[:, 1]
         torch_module = sympy2torch(cosx, [x, y, z])
         self.assertTrue(
+            np.all(np.isclose(torch_module(X).detach().numpy(), true.detach().numpy()))
         )
     def test_pipeline(self):
         X = np.random.randn(100, 10)
+        equations = pd.DataFrame(
+            {
+                "Equation": ["1.0", "cos(x0)", "square(cos(x0))"],
+                "MSE": [1.0, 0.1, 1e-5],
+                "Complexity": [1, 2, 3],
+            }
+        )
+        equations["Complexity MSE Equation".split(" ")].to_csv(
+            "equation_file.csv.bkup", sep="|"
+        )
         equations = get_hof(
+            "equation_file.csv",
+            n_features=2,
+            variables_names="x1 x2 x3".split(" "),
+            extra_sympy_mappings={},
+            output_torch_format=True,
+            multioutput=False,
+            nout=1,
+            selection=[1, 2, 3],
+        )
         tformat = equations.iloc[-1].torch_format
         np.testing.assert_almost_equal(
+            tformat(torch.tensor(X)).detach().numpy(),
+            np.square(np.cos(X[:, 1])),  # Selection 1st feature
+            decimal=4,
         )