Merge pull request #389 from MilesCranmer/backend-update-0.21.2
Files changed:
- docs/examples.md (+89 -1)
- docs/gen_param_docs.py (+1 -1)
- pysr/julia_helpers.py (+2 -2)
- {docs → pysr}/param_groupings.yml (+3 -0)
- pysr/sr.py (+176 -36)
- pysr/test/test.py (+167 -0)
- pysr/version.py (+2 -2)
- requirements.txt (+1 -1)
docs/examples.md
CHANGED
@@ -433,9 +433,97 @@ equal to:
 $\frac{x_0^2 x_1 - 2.0000073}{x_2^2 - 1.0000019}$, which
 is nearly the same as the true equation!
 
+## 10. Dimensional constraints
 
+One other feature we can exploit is dimensional analysis.
+Say that we know the physical units of each feature and output,
+and we want to find an expression that is dimensionally consistent.
 
-
+We can do this as follows, using `DynamicQuantities.jl` to assign units,
+passing a string specifying the units for each variable.
+First, let's make some data on Newton's law of gravitation, using
+astropy for units:
+
+```python
+import numpy as np
+from astropy import units as u, constants as const
+
+M = (np.random.rand(100) + 0.1) * const.M_sun
+m = 100 * (np.random.rand(100) + 0.1) * u.kg
+r = (np.random.rand(100) + 0.1) * const.R_earth
+G = const.G
+
+F = G * M * m / r**2
+```
+
+We can see the units of `F` with `F.unit`.
+
+Now, let's create our model.
+Since this data has such a large dynamic range,
+let's also create a custom loss function
+that looks at the error in log-space:
+
+```python
+loss = """function loss_fnc(prediction, target)
+    scatter_loss = abs(log((abs(prediction)+1e-20) / (abs(target)+1e-20)))
+    sign_loss = 10 * (sign(prediction) - sign(target))^2
+    return scatter_loss + sign_loss
+end
+"""
+```
+
+Now let's define our model:
+
+```python
+model = PySRRegressor(
+    binary_operators=["+", "-", "*", "/"],
+    unary_operators=["square"],
+    loss=loss,
+    complexity_of_constants=2,
+    maxsize=25,
+    niterations=100,
+    populations=50,
+    # Amount to penalize dimensional violations:
+    dimensional_constraint_penalty=10**5,
+)
+```
+
+and fit it, passing the unit information.
+To do this, we need to use the format of [DynamicQuantities.jl](https://symbolicml.org/DynamicQuantities.jl/dev/#Usage).
+
+```python
+# Get numerical arrays to fit:
+X = pd.DataFrame(dict(
+    M=M.value,
+    m=m.value,
+    r=r.value,
+))
+y = F.value
+
+model.fit(
+    X,
+    y,
+    X_units=["Constants.M_sun", "kg", "Constants.R_earth"],
+    y_units="kg * m / s^2"
+)
+```
+
+You can observe that all expressions with a loss under
+our penalty are dimensionally consistent!
+(The `"[⋅]"` indicates free units in a constant, which can cancel out other units in the expression.)
+For example,
+
+```julia
+"y[m s⁻² kg] = (M[kg] * 2.6353e-22[⋅])"
+```
+
+would indicate that the expression is dimensionally consistent, with
+a constant `"2.6353e-22[m s⁻²]"`.
+
+Note that this expression has a large dynamic range so may be difficult to find. Consider searching with a larger `niterations` if needed.
+
+
+## 11. Additional features
 
 For the many other features available in PySR, please
 read the [Options section](options.md).
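One way to sanity-check the `y_units="kg * m / s^2"` string used in the new example is to let astropy decompose the force's units itself. A minimal sketch, not part of the PR, assuming astropy is installed:

```python
import astropy.units as u
from astropy import constants as const

# F = G * M * m / r^2, decomposed into SI base units:
F_unit = (const.G * const.M_sun * (1.0 * u.kg) / const.R_earth**2).si.unit
print(F_unit)                          # kg m / s2
assert F_unit == u.kg * u.m / u.s**2   # i.e. "kg * m / s^2"
```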
docs/gen_param_docs.py
CHANGED
@@ -53,7 +53,7 @@ def str_param_groups(param_groupings, params, cur_heading=2):
 if __name__ == "__main__":
     # This is the path to the param_groupings.yml file
     # relative to the current file.
-    path = "param_groupings.yml"
+    path = "../pysr/param_groupings.yml"
     with open(path, "r") as f:
         param_groupings = safe_load(f)
 
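Note the new path is still resolved relative to the directory the script is run from. If that ever proves fragile, the same `__file__`-based trick the new test below uses would work here as well; a sketch, not part of this PR:

```python
from pathlib import Path

from yaml import safe_load

# Resolve relative to this script's own location instead of the cwd
# (gen_param_docs.py lives in docs/, so parent.parent is the repo root):
path = Path(__file__).parent.parent / "pysr" / "param_groupings.yml"
with open(path, "r") as f:
    param_groupings = safe_load(f)
```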
pysr/julia_helpers.py
CHANGED
@@ -259,6 +259,7 @@ def init_julia(julia_project=None, quiet=False, julia_kwargs=None, return_aux=False):
 
 def _add_sr_to_julia_project(Main, io_arg):
     Main.eval("using Pkg")
+    Main.eval("Pkg.Registry.update()")
     Main.sr_spec = Main.PackageSpec(
         name="SymbolicRegression",
         url="https://github.com/MilesCranmer/SymbolicRegression.jl",
@@ -266,8 +267,7 @@ def _add_sr_to_julia_project(Main, io_arg):
     )
     Main.clustermanagers_spec = Main.PackageSpec(
         name="ClusterManagers",
-
-        rev="14e7302f068794099344d5d93f71979aaf4fbeb3",
+        version="0.4",
     )
     Main.eval(f"Pkg.add([sr_spec, clustermanagers_spec], {io_arg})")
 
{docs → pysr}/param_groupings.yml
RENAMED
@@ -13,6 +13,7 @@
 - loss
 - full_objective
 - model_selection
+- dimensional_constraint_penalty
 - Working with Complexities:
 - parsimony
 - constraints
@@ -72,12 +73,14 @@
 - fast_cycle
 - turbo
 - enable_autodiff
+- Determinism:
 - random_state
 - deterministic
 - warm_start
 - Monitoring:
 - verbosity
 - update_verbosity
+- print_precision
 - progress
 - Environment:
 - temp_equation_file
pysr/sr.py
CHANGED
@@ -167,6 +167,8 @@ def _check_assertions(
     variable_names,
     weights,
     y,
+    X_units,
+    y_units,
 ):
     # Check for potential errors before they happen
     assert len(X.shape) == 2
@@ -184,12 +186,30 @@ def _check_assertions(
                 f"Variable name {var_name} is already a function name."
             )
         # Check if alphanumeric only:
-        if not re.match(r"^[a-zA-Z0-9_]+$", var_name):
+        if not re.match(r"^[₀₁₂₃₄₅₆₇₈₉a-zA-Z0-9_]+$", var_name):
             raise ValueError(
                 f"Invalid variable name {var_name}. "
                 "Only alphanumeric characters, numbers, "
                 "and underscores are allowed."
             )
+    if X_units is not None and len(X_units) != X.shape[1]:
+        raise ValueError(
+            "The number of units in `X_units` must equal the number of features in `X`."
+        )
+    if y_units is not None:
+        good_y_units = False
+        if isinstance(y_units, list):
+            if len(y.shape) == 1:
+                good_y_units = len(y_units) == 1
+            else:
+                good_y_units = len(y_units) == y.shape[1]
+        else:
+            good_y_units = len(y.shape) == 1 or y.shape[1] == 1
+
+        if not good_y_units:
+            raise ValueError(
+                "The number of units in `y_units` must equal the number of output features in `y`."
+            )
 
 
 def best(*args, **kwargs):  # pragma: no cover
@@ -354,6 +374,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         You may pass a function with the same arguments as this (note
         that the name of the function doesn't matter). Here,
         both `prediction` and `dataset.y` are 1D arrays of length `dataset.n`.
+        If using `batching`, then you should add an
+        `idx` argument to the function, which is `nothing`
+        for non-batched, and a 1D array of indices for batched.
         Default is `None`.
     complexity_of_operators : dict[str, float]
         If you would like to use a complexity other than 1 for an
@@ -371,6 +394,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     parsimony : float
         Multiplicative factor for how much to punish complexity.
         Default is `0.0032`.
+    dimensional_constraint_penalty : float
+        Additive penalty for if dimensional analysis of an expression fails.
+        By default, this is `1000.0`.
     use_frequency : bool
         Whether to measure the frequency of complexities, and use that
         instead of parsimony to explore equation space. Will naturally
@@ -551,6 +577,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         What verbosity level to use for package updates.
         Will take value of `verbosity` if not given.
         Default is `None`.
+    print_precision : int
+        How many significant digits to print for floats. Default is `5`.
     progress : bool
         Whether to use a progress bar instead of printing to stdout.
         Default is `True`.
@@ -633,6 +661,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     feature_names_in_ : ndarray of shape (`n_features_in_`,)
         Names of features seen during :term:`fit`. Defined only when `X`
         has feature names that are all strings.
+    pretty_feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Pretty names of features, used only during printing.
+    X_units_ : list[str] of length n_features
+        Units of each variable in the training dataset, `X`.
+    y_units_ : str | list[str] of length n_out
+        Units of each variable in the training dataset, `y`.
     nout_ : int
         Number of output dimensions.
     selection_mask_ : list[int] of length `select_k_features`
@@ -712,6 +746,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         complexity_of_constants=1,
         complexity_of_variables=1,
         parsimony=0.0032,
+        dimensional_constraint_penalty=None,
         use_frequency=True,
         use_frequency_in_tournament=True,
         adaptive_parsimony_scaling=20.0,
@@ -758,6 +793,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         warm_start=False,
         verbosity=1e9,
         update_verbosity=None,
+        print_precision=5,
         progress=True,
         equation_file=None,
         temp_equation_file=False,
@@ -802,6 +838,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         self.complexity_of_constants = complexity_of_constants
         self.complexity_of_variables = complexity_of_variables
         self.parsimony = parsimony
+        self.dimensional_constraint_penalty = dimensional_constraint_penalty
         self.use_frequency = use_frequency
         self.use_frequency_in_tournament = use_frequency_in_tournament
         self.adaptive_parsimony_scaling = adaptive_parsimony_scaling
@@ -853,6 +890,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         # - Runtime user interface
         self.verbosity = verbosity
         self.update_verbosity = update_verbosity
+        self.print_precision = print_precision
         self.progress = progress
         # - Project management
         self.equation_file = equation_file
@@ -976,11 +1014,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
 
         # Else, we re-create it.
         print(
-            f"{
+            f"{pkl_filename} does not exist, "
             "so we must create the model from scratch."
         )
-        assert binary_operators is not None
-        assert unary_operators is not None
+        assert binary_operators is not None or unary_operators is not None
         assert n_features_in is not None
 
         # TODO: copy .bkup file if exists.
@@ -995,10 +1032,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         model.n_features_in_ = n_features_in
 
         if feature_names_in is None:
-            model.feature_names_in_ = [f"x{i}" for i in range(n_features_in)]
+            model.feature_names_in_ = np.array([f"x{i}" for i in range(n_features_in)])
+            model.pretty_feature_names_in_ = np.array(
+                [f"x{_subscriptify(i)}" for i in range(n_features_in)]
+            )
         else:
             assert len(feature_names_in) == n_features_in
             model.feature_names_in_ = feature_names_in
+            model.pretty_feature_names_in_ = None
 
         if selection_mask is None:
             model.selection_mask_ = np.ones(n_features_in, dtype=bool)
@@ -1318,7 +1359,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
 
         return packed_modified_params
 
-    def _validate_and_set_fit_params(
+    def _validate_and_set_fit_params(
+        self, X, y, Xresampled, weights, variable_names, X_units, y_units
+    ):
         """
         Validate the parameters passed to the :term`fit` method.
 
@@ -1340,6 +1383,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             for that particular element of y.
         variable_names : list[str] of length n_features
             Names of each variable in the training dataset, `X`.
+        X_units : list[str] of length n_features
+            Units of each variable in the training dataset, `X`.
+        y_units : str | list[str] of length n_out
+            Units of each variable in the training dataset, `y`.
 
         Returns
         -------
@@ -1351,6 +1398,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             Validated resampled training data used for denoising.
         variable_names_validated : list[str] of length n_features
             Validated list of variable names for each feature in `X`.
+        X_units : list[str] of length n_features
+            Validated units for `X`.
+        y_units : str | list[str] of length n_out
+            Validated units for `y`.
 
         """
         if isinstance(X, pd.DataFrame):
@@ -1361,7 +1412,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                 "Using DataFrame column names instead."
             )
 
-            if
+            if (
+                pd.api.types.is_object_dtype(X.columns)
+                and X.columns.str.contains(" ").any()
+            ):
                 X.columns = X.columns.str.replace(" ", "_")
                 warnings.warn(
                     "Spaces in DataFrame column names are not supported. "
@@ -1384,7 +1438,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             weights = check_array(weights, ensure_2d=False)
             check_consistent_length(weights, y)
         X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
-        self.feature_names_in_ = _check_feature_names_in(
+        self.feature_names_in_ = _check_feature_names_in(
+            self, variable_names, generate_names=False
+        )
+
+        if self.feature_names_in_ is None:
+            self.feature_names_in_ = np.array([f"x{i}" for i in range(X.shape[1])])
+            self.pretty_feature_names_in_ = np.array(
+                [f"x{_subscriptify(i)}" for i in range(X.shape[1])]
+            )
+        else:
+            self.pretty_feature_names_in_ = None
+
         variable_names = self.feature_names_in_
 
         # Handle multioutput data
@@ -1395,10 +1460,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         else:
             raise NotImplementedError("y shape not supported!")
 
-
+        self.X_units_ = copy.deepcopy(X_units)
+        self.y_units_ = copy.deepcopy(y_units)
+
+        return X, y, Xresampled, weights, variable_names, X_units, y_units
 
     def _pre_transform_training_data(
-        self, X, y, Xresampled, variable_names, random_state
+        self, X, y, Xresampled, variable_names, X_units, y_units, random_state
     ):
         """
         Transform the training data before fitting the symbolic regressor.
@@ -1418,6 +1486,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         variable_names : list[str]
             Names of each variable in the training dataset, `X`.
             Of length `n_features`.
+        X_units : list[str]
+            Units of each variable in the training dataset, `X`.
+        y_units : str | list[str]
+            Units of each variable in the training dataset, `y`.
         random_state : int | np.RandomState
             Pass an int for reproducible results across multiple function calls.
             See :term:`Glossary <random_state>`. Default is `None`.
@@ -1439,6 +1511,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         variable_names_transformed : list[str] of length n_features
             Names of each variable in the transformed dataset,
             `X_transformed`.
+        X_units_transformed : list[str] of length n_features
+            Units of each variable in the transformed dataset.
+        y_units_transformed : str | list[str] of length n_out
+            Units of each variable in the transformed dataset.
         """
         # Feature selection transformation
         if self.select_k_features:
@@ -1453,10 +1529,15 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             # Reduce variable_names to selection
             variable_names = [variable_names[i] for i in self.selection_mask_]
 
+            if X_units is not None:
+                X_units = [X_units[i] for i in self.selection_mask_]
+                self.X_units_ = copy.deepcopy(X_units)
+
             # Re-perform data validation and feature name updating
             X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
             # Update feature names with selected variable names
             self.feature_names_in_ = _check_feature_names_in(self, variable_names)
+            self.pretty_feature_names_in_ = None
             print(f"Using features {self.feature_names_in_}")
 
         # Denoising transformation
@@ -1476,7 +1557,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         else:
             X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state)
 
-        return X, y, variable_names
+        return X, y, variable_names, X_units, y_units
 
     def _run(self, X, y, mutated_params, weights, seed):
         """
@@ -1629,6 +1710,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             tournament_selection_n=self.tournament_selection_n,
             # These have the same name:
             parsimony=self.parsimony,
+            dimensional_constraint_penalty=self.dimensional_constraint_penalty,
             alpha=self.alpha,
             maxdepth=maxdepth,
             fast_cycle=self.fast_cycle,
@@ -1648,6 +1730,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             fraction_replaced=self.fraction_replaced,
             topn=self.topn,
             verbosity=self.verbosity,
+            print_precision=self.print_precision,
             optimizer_algorithm=self.optimizer_algorithm,
             optimizer_nrestarts=self.optimizer_nrestarts,
             optimizer_probability=self.optimize_probability,
@@ -1699,6 +1782,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             None if parallelism in ["serial", "multithreading"] else int(self.procs)
         )
 
+        y_variable_names = None
+        if len(y.shape) > 1:
+            # We set these manually so that they respect Python's 0 indexing
+            # (by default Julia will use y1, y2...)
+            y_variable_names = [f"y{_subscriptify(i)}" for i in range(y.shape[1])]
+
         # Call to Julia backend.
         # See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/SymbolicRegression.jl
         self.raw_julia_state_ = SymbolicRegression.equation_search(
@@ -1706,7 +1795,15 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             Main.y,
             weights=Main.weights,
             niterations=int(self.niterations),
-            variable_names=
+            variable_names=(
+                self.pretty_feature_names_in_.tolist()
+                if hasattr(self, "pretty_feature_names_in_")
+                and self.pretty_feature_names_in_ is not None
+                else self.feature_names_in_.tolist()
+            ),
+            y_variable_names=y_variable_names,
+            X_units=self.X_units_,
+            y_units=self.y_units_,
             options=options,
             numprocs=cprocs,
             parallelism=parallelism,
@@ -1732,6 +1829,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         Xresampled=None,
         weights=None,
         variable_names=None,
+        X_units=None,
+        y_units=None,
     ):
         """
         Search for equations to fit the dataset and store them in `self.equations_`.
@@ -1759,6 +1858,15 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             instead of `variable_names`. Cannot contain spaces or special
             characters. Avoid variable names which are also
             function names in `sympy`, such as "N".
+        X_units : list[str]
+            A list of units for each variable in `X`. Each unit should be
+            a string representing a Julia expression. See DynamicQuantities.jl
+            https://symbolicml.org/DynamicQuantities.jl/dev/units/ for more
+            information.
+        y_units : str | list[str]
+            Similar to `X_units`, but as a unit for the target variable, `y`.
+            If `y` is a matrix, a list of units should be passed. If `X_units`
+            is given but `y_units` is not, then `y_units` will be arbitrary.
 
         Returns
         -------
@@ -1780,6 +1888,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         self.nout_ = 1
         self.selection_mask_ = None
         self.raw_julia_state_ = None
+        self.X_units_ = None
+        self.y_units_ = None
 
         random_state = check_random_state(self.random_state)  # For np random
         seed = random_state.get_state()[1][0]  # For julia random
@@ -1788,8 +1898,16 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
 
         mutated_params = self._validate_and_set_init_params()
 
-
-        X,
+        (
+            X,
+            y,
+            Xresampled,
+            weights,
+            variable_names,
+            X_units,
+            y_units,
+        ) = self._validate_and_set_fit_params(
+            X, y, Xresampled, weights, variable_names, X_units, y_units
         )
 
         if X.shape[0] > 10000 and not self.batching:
@@ -1804,8 +1922,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             )
 
         # Pre transformations (feature selection and denoising)
-        X, y, variable_names = self._pre_transform_training_data(
-            X, y, Xresampled, variable_names, random_state
+        X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
+            X, y, Xresampled, variable_names, X_units, y_units, random_state
         )
 
         # Warn about large feature counts (still warn if feature count is large
@@ -1834,6 +1952,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             variable_names,
             weights,
             y,
+            X_units,
+            y_units,
         )
 
         # Initially, just save model parameters, so that
@@ -2072,17 +2192,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                     with open(cur_filename, "r") as f:
                         buf = f.read()
                     buf = _preprocess_julia_floats(buf)
-
-
-                    # Rename Complexity column to complexity:
-                    df.rename(
-                        columns={
-                            "Complexity": "complexity",
-                            "Loss": "loss",
-                            "Equation": "equation",
-                        },
-                        inplace=True,
-                    )
+
+                    df = self._postprocess_dataframe(pd.read_csv(StringIO(buf)))
 
                     all_outputs.append(df)
             else:
@@ -2092,15 +2203,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                 with open(filename, "r") as f:
                     buf = f.read()
                 buf = _preprocess_julia_floats(buf)
-                all_outputs = [pd.read_csv(StringIO(buf))]
-                all_outputs[-1].rename(
-                    columns={
-                        "Complexity": "complexity",
-                        "Loss": "loss",
-                        "Equation": "equation",
-                    },
-                    inplace=True,
-                )
+                all_outputs = [self._postprocess_dataframe(pd.read_csv(StringIO(buf)))]
 
             except FileNotFoundError:
                 raise RuntimeError(
@@ -2109,6 +2212,35 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                 )
         return all_outputs
 
+    def _postprocess_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
+        df = df.rename(
+            columns={
+                "Complexity": "complexity",
+                "Loss": "loss",
+                "Equation": "equation",
+            },
+        )
+        # Regexp replace x₁₂₃ to x123 in `equation`:
+        if (
+            hasattr(self, "pretty_feature_names_in_")
+            and self.pretty_feature_names_in_ is not None
+        ):
+            # df["equation"] = df["equation"].apply(_undo_subscriptify_full)
+            for pname, name in zip(
+                self.pretty_feature_names_in_, self.feature_names_in_
+            ):
+                df["equation"] = df["equation"].apply(
+                    lambda s: re.sub(
+                        r"\b" + f"({pname})" + r"\b",
+                        name,
+                        s,
+                    )
+                    if isinstance(s, str)
+                    else s
+                )
+
+        return df
+
     def get_hof(self):
         """Get the equations from a hall of fame file.
 
@@ -2409,3 +2541,11 @@ def _preprocess_julia_floats(s: str) -> str:
     s = _apply_regexp_im_sci(s)
     s = _apply_regexp_sci(s)
     return s
+
+
+def _subscriptify(i: int) -> str:
+    """Converts integer to subscript text form.
+
+    For example, 123 -> "₁₂₃".
+    """
+    return "".join([chr(0x2080 + int(c)) for c in str(i)])
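To see how the two new helpers in this file fit together: `_subscriptify` builds the pretty `x₀, x₁, …` names handed to the Julia backend, and `_postprocess_dataframe` maps them back to plain `x0, x1, …` in the returned equations. A self-contained sketch of that round trip (illustration only; the logic is copied from the diff rather than called through the PySR API):

```python
import re

def _subscriptify(i: int) -> str:
    # Convert an integer to subscript form, e.g. 123 -> "₁₂₃".
    return "".join([chr(0x2080 + int(c)) for c in str(i)])

pretty = [f"x{_subscriptify(i)}" for i in range(3)]  # ['x₀', 'x₁', 'x₂']
equation = "(x₀ * 2.0) + x₂"  # the form the backend prints

# The same word-boundary substitution used by _postprocess_dataframe:
for pname, name in zip(pretty, ["x0", "x1", "x2"]):
    equation = re.sub(r"\b" + f"({pname})" + r"\b", name, equation)

print(equation)  # (x0 * 2.0) + x2
```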
pysr/test/test.py
CHANGED
@@ -19,6 +19,7 @@ from ..sr import (
     _handle_feature_selection,
     _csv_filename_to_pkl_filename,
     idx_model_selection,
+    _check_assertions,
 )
 from ..export_latex import to_latex
 
@@ -711,6 +712,26 @@ class TestMiscellaneous(unittest.TestCase):
         # If any checks failed don't let the test pass.
         self.assertEqual(len(exception_messages), 0)
 
+    def test_param_groupings(self):
+        """Test that param_groupings are complete"""
+        param_groupings_file = Path(__file__).parent.parent / "param_groupings.yml"
+        # Read the file, discarding lines ending in ":",
+        # and removing leading "\s*-\s*":
+        params = []
+        with open(param_groupings_file, "r") as f:
+            for line in f.readlines():
+                if line.strip().endswith(":"):
+                    continue
+                if line.strip().startswith("-"):
+                    params.append(line.strip()[1:].strip())
+
+        regressor_params = [
+            p for p in DEFAULT_PARAMS.keys() if p not in ["self", "kwargs"]
+        ]
+
+        # Check the sets are equal:
+        self.assertSetEqual(set(params), set(regressor_params))
+
 
 TRUE_PREAMBLE = "\n".join(
     [
@@ -906,6 +927,151 @@ class TestLaTeXTable(unittest.TestCase):
         self.assertEqual(latex_table_str, true_latex_table_str)
 
 
+class TestDimensionalConstraints(unittest.TestCase):
+    def setUp(self):
+        self.default_test_kwargs = dict(
+            progress=False,
+            model_selection="accuracy",
+            niterations=DEFAULT_NITERATIONS * 2,
+            populations=DEFAULT_POPULATIONS * 2,
+            temp_equation_file=True,
+        )
+        self.rstate = np.random.RandomState(0)
+        self.X = self.rstate.randn(100, 5)
+
+    def test_dimensional_constraints(self):
+        y = np.cos(self.X[:, [0, 1]])
+        model = PySRRegressor(
+            binary_operators=[
+                "my_add(x, y) = x + y",
+                "my_sub(x, y) = x - y",
+                "my_mul(x, y) = x * y",
+            ],
+            unary_operators=["my_cos(x) = cos(x)"],
+            **self.default_test_kwargs,
+            early_stop_condition=1e-8,
+            select_k_features=3,
+            extra_sympy_mappings={
+                "my_cos": sympy.cos,
+                "my_add": lambda x, y: x + y,
+                "my_sub": lambda x, y: x - y,
+                "my_mul": lambda x, y: x * y,
+            },
+        )
+        model.fit(self.X, y, X_units=["m", "m", "m", "m", "m"], y_units=["m", "m"])
+
+        # The best expression should have complexity larger than just 2:
+        for i in range(2):
+            self.assertGreater(model.get_best()[i]["complexity"], 2)
+            self.assertLess(model.get_best()[i]["loss"], 1e-6)
+            self.assertGreater(
+                model.equations_[i].query("complexity <= 2").loss.min(), 1e-6
+            )
+
+    def test_unit_checks(self):
+        """This just checks the number of units passed"""
+        use_custom_variable_names = False
+        variable_names = None
+        weights = None
+        args = (use_custom_variable_names, variable_names, weights)
+        valid_units = [
+            (np.ones((10, 2)), np.ones(10), ["m/s", "s"], "m"),
+            (np.ones((10, 1)), np.ones(10), ["m/s"], None),
+            (np.ones((10, 1)), np.ones(10), None, "m/s"),
+            (np.ones((10, 1)), np.ones(10), None, ["m/s"]),
+            (np.ones((10, 1)), np.ones((10, 1)), None, ["m/s"]),
+            (np.ones((10, 1)), np.ones((10, 2)), None, ["m/s", ""]),
+        ]
+        for X, y, X_units, y_units in valid_units:
+            _check_assertions(
+                X,
+                *args,
+                y,
+                X_units,
+                y_units,
+            )
+        invalid_units = [
+            (np.ones((10, 2)), np.ones(10), ["m/s", "s", "s^2"], None),
+            (np.ones((10, 2)), np.ones(10), ["m/s", "s", "s^2"], "m"),
+            (np.ones((10, 2)), np.ones((10, 2)), ["m/s", "s"], ["m"]),
+            (np.ones((10, 1)), np.ones((10, 1)), "m/s", ["m"]),
+        ]
+        for X, y, X_units, y_units in invalid_units:
+            with self.assertRaises(ValueError):
+                _check_assertions(
+                    X,
+                    *args,
+                    y,
+                    X_units,
+                    y_units,
+                )
+
+    def test_unit_propagation(self):
+        """Check that units are propagated correctly.
+
+        This also tests that variables have the correct names.
+        """
+        X = np.ones((100, 3))
+        y = np.ones((100, 1))
+        temp_dir = Path(tempfile.mkdtemp())
+        equation_file = str(temp_dir / "equation_file.csv")
+        model = PySRRegressor(
+            binary_operators=["+", "*"],
+            early_stop_condition="(l, c) -> l < 1e-6 && c == 3",
+            progress=False,
+            model_selection="accuracy",
+            niterations=DEFAULT_NITERATIONS * 2,
+            populations=DEFAULT_POPULATIONS * 2,
+            complexity_of_constants=10,
+            weight_mutate_constant=0.0,
+            should_optimize_constants=False,
+            multithreading=False,
+            deterministic=True,
+            procs=0,
+            random_state=0,
+            equation_file=equation_file,
+            warm_start=True,
+        )
+        model.fit(
+            X,
+            y,
+            X_units=["m", "s", "A"],
+            y_units=["m*A"],
+        )
+        best = model.get_best()
+        self.assertIn("x0", best["equation"])
+        self.assertNotIn("x1", best["equation"])
+        self.assertIn("x2", best["equation"])
+        self.assertEqual(best["complexity"], 3)
+        self.assertEqual(model.equations_.iloc[0].complexity, 1)
+        self.assertGreater(model.equations_.iloc[0].loss, 1e-6)
+
+        # With pkl file:
+        pkl_file = str(temp_dir / "equation_file.pkl")
+        model2 = PySRRegressor.from_file(pkl_file)
+        best2 = model2.get_best()
+        self.assertIn("x0", best2["equation"])
+
+        # From csv file alone (we need to delete pkl file:)
+        # First, we delete the pkl file:
+        os.remove(pkl_file)
+        model3 = PySRRegressor.from_file(
+            equation_file, binary_operators=["+", "*"], n_features_in=X.shape[1]
+        )
+        best3 = model3.get_best()
+        self.assertIn("x0", best3["equation"])
+
+        # Try warm start, but with no units provided (should
+        # be a different dataset, and thus different result):
+        model.fit(X, y)
+        model.early_stop_condition = "(l, c) -> l < 1e-6 && c == 1"
+        self.assertEqual(model.equations_.iloc[0].complexity, 1)
+        self.assertLess(model.equations_.iloc[0].loss, 1e-6)
+
+
+# TODO: Determine desired behavior if second .fit() call does not have units
+
+
 def runtests():
     """Run all tests in test.py."""
     suite = unittest.TestSuite()
@@ -916,6 +1082,7 @@ def runtests():
         TestFeatureSelection,
         TestMiscellaneous,
         TestLaTeXTable,
+        TestDimensionalConstraints,
     ]
     for test_case in test_cases:
         tests = loader.loadTestsFromTestCase(test_case)
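To run just the new test case rather than the whole suite, standard unittest loading works; a sketch, assuming PySR and its Julia backend are installed so that `pysr.test.test` imports cleanly:

```python
import unittest

from pysr.test.test import TestDimensionalConstraints

# Load and run only the dimensional-constraint tests:
suite = unittest.TestLoader().loadTestsFromTestCase(TestDimensionalConstraints)
unittest.TextTestRunner(verbosity=2).run(suite)
```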
pysr/version.py
CHANGED
@@ -1,2 +1,2 @@
-__version__ = "0.
-__symbolic_regression_jl_version__ = "0.
+__version__ = "0.15.0"
+__symbolic_regression_jl_version__ = "0.21.3"
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
 sympy
-pandas
+pandas>=0.21.0
 numpy
 scikit_learn>=1.0.0
 julia>=0.6.0