Merge pull request #389 from MilesCranmer/backend-update-0.21.2
Files changed:
- docs/examples.md (+89 -1)
- docs/gen_param_docs.py (+1 -1)
- pysr/julia_helpers.py (+2 -2)
- {docs → pysr}/param_groupings.yml (+3 -0)
- pysr/sr.py (+176 -36)
- pysr/test/test.py (+167 -0)
- pysr/version.py (+2 -2)
- requirements.txt (+1 -1)
docs/examples.md
CHANGED
@@ -433,9 +433,97 @@ equal to:
 $\frac{x_0^2 x_1 - 2.0000073}{x_2^2 - 1.0000019}$, which
 is nearly the same as the true equation!
 
+## 10. Dimensional constraints
 
+One other feature we can exploit is dimensional analysis.
+Say that we know the physical units of each feature and output,
+and we want to find an expression that is dimensionally consistent.
 
-
+We can do this as follows, using `DynamicQuantities.jl` to assign units,
+passing a string specifying the units for each variable.
+First, let's make some data on Newton's law of gravitation, using
+astropy for units:
+
+```python
+import numpy as np
+from astropy import units as u, constants as const
+
+M = (np.random.rand(100) + 0.1) * const.M_sun
+m = 100 * (np.random.rand(100) + 0.1) * u.kg
+r = (np.random.rand(100) + 0.1) * const.R_earth
+G = const.G
+
+F = G * M * m / r**2
+```
+
+We can see the units of `F` with `F.unit`.
+
+Now, let's create our model.
+Since this data has such a large dynamic range,
+let's also create a custom loss function
+that looks at the error in log-space:
+
+```python
+loss = """function loss_fnc(prediction, target)
+    scatter_loss = abs(log((abs(prediction)+1e-20) / (abs(target)+1e-20)))
+    sign_loss = 10 * (sign(prediction) - sign(target))^2
+    return scatter_loss + sign_loss
+end
+"""
+```
+
+Now let's define our model:
+
+```python
+model = PySRRegressor(
+    binary_operators=["+", "-", "*", "/"],
+    unary_operators=["square"],
+    loss=loss,
+    complexity_of_constants=2,
+    maxsize=25,
+    niterations=100,
+    populations=50,
+    # Amount to penalize dimensional violations:
+    dimensional_constraint_penalty=10**5,
+)
+```
+
+and fit it, passing the unit information.
+To do this, we need to use the format of [DynamicQuantities.jl](https://symbolicml.org/DynamicQuantities.jl/dev/#Usage).
+
+```python
+# Get numerical arrays to fit:
+X = pd.DataFrame(dict(
+    M=M.value,
+    m=m.value,
+    r=r.value,
+))
+y = F.value
+
+model.fit(
+    X,
+    y,
+    X_units=["Constants.M_sun", "kg", "Constants.R_earth"],
+    y_units="kg * m / s^2"
+)
+```
+
+You can observe that all expressions with a loss under
+our penalty are dimensionally consistent!
+(The `"[⋅]"` indicates free units in a constant, which can cancel out other units in the expression.)
+For example,
+
+```julia
+"y[m s⁻² kg] = (M[kg] * 2.6353e-22[⋅])"
+```
+
+would indicate that the expression is dimensionally consistent, with
+a constant `"2.6353e-22[m s⁻²]"`.
+
+Note that this expression has a large dynamic range so may be difficult to find. Consider searching with a larger `niterations` if needed.
+
+
+## 11. Additional features
 
 For the many other features available in PySR, please
 read the [Options section](options.md).
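One way to sanity-check the `y_units="kg * m / s^2"` string used in the new example is to let astropy decompose the force's units itself. A minimal sketch, not part of the PR, assuming astropy is installed:

```python
import astropy.units as u
from astropy import constants as const

# F = G * M * m / r^2, decomposed into SI base units:
F_unit = (const.G * const.M_sun * (1.0 * u.kg) / const.R_earth**2).si.unit
print(F_unit)                          # kg m / s2
assert F_unit == u.kg * u.m / u.s**2   # i.e. "kg * m / s^2"
```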
docs/gen_param_docs.py
CHANGED
@@ -53,7 +53,7 @@ def str_param_groups(param_groupings, params, cur_heading=2):
 if __name__ == "__main__":
     # This is the path to the param_groupings.yml file
     # relative to the current file.
-    path = "param_groupings.yml"
+    path = "../pysr/param_groupings.yml"
     with open(path, "r") as f:
         param_groupings = safe_load(f)
 
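Note the new path is still resolved relative to the directory the script is run from. If that ever proves fragile, the same `__file__`-based trick the new test below uses would work here as well; a sketch, not part of this PR:

```python
from pathlib import Path

from yaml import safe_load

# Resolve relative to this script's own location instead of the cwd
# (gen_param_docs.py lives in docs/, so parent.parent is the repo root):
path = Path(__file__).parent.parent / "pysr" / "param_groupings.yml"
with open(path, "r") as f:
    param_groupings = safe_load(f)
```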
pysr/julia_helpers.py
CHANGED
@@ -259,6 +259,7 @@ def init_julia(julia_project=None, quiet=False, julia_kwargs=None, return_aux=False):
 
 def _add_sr_to_julia_project(Main, io_arg):
     Main.eval("using Pkg")
+    Main.eval("Pkg.Registry.update()")
     Main.sr_spec = Main.PackageSpec(
         name="SymbolicRegression",
         url="https://github.com/MilesCranmer/SymbolicRegression.jl",
@@ -266,8 +267,7 @@ def _add_sr_to_julia_project(Main, io_arg):
     )
     Main.clustermanagers_spec = Main.PackageSpec(
         name="ClusterManagers",
-
-        rev="14e7302f068794099344d5d93f71979aaf4fbeb3",
+        version="0.4",
     )
     Main.eval(f"Pkg.add([sr_spec, clustermanagers_spec], {io_arg})")
 
{docs → pysr}/param_groupings.yml
RENAMED
@@ -13,6 +13,7 @@
 - loss
 - full_objective
 - model_selection
+- dimensional_constraint_penalty
 - Working with Complexities:
 - parsimony
 - constraints
@@ -72,12 +73,14 @@
 - fast_cycle
 - turbo
 - enable_autodiff
+- Determinism:
 - random_state
 - deterministic
 - warm_start
 - Monitoring:
 - verbosity
 - update_verbosity
+- print_precision
 - progress
 - Environment:
 - temp_equation_file
pysr/sr.py
CHANGED
@@ -167,6 +167,8 @@ def _check_assertions(
     variable_names,
     weights,
     y,
+    X_units,
+    y_units,
 ):
     # Check for potential errors before they happen
     assert len(X.shape) == 2
@@ -184,12 +186,30 @@ def _check_assertions(
                 f"Variable name {var_name} is already a function name."
             )
         # Check if alphanumeric only:
-        if not re.match(r"^[a-zA-Z0-9_]+$", var_name):
+        if not re.match(r"^[₀₁₂₃₄₅₆₇₈₉a-zA-Z0-9_]+$", var_name):
             raise ValueError(
                 f"Invalid variable name {var_name}. "
                 "Only alphanumeric characters, numbers, "
                 "and underscores are allowed."
             )
+    if X_units is not None and len(X_units) != X.shape[1]:
+        raise ValueError(
+            "The number of units in `X_units` must equal the number of features in `X`."
+        )
+    if y_units is not None:
+        good_y_units = False
+        if isinstance(y_units, list):
+            if len(y.shape) == 1:
+                good_y_units = len(y_units) == 1
+            else:
+                good_y_units = len(y_units) == y.shape[1]
+        else:
+            good_y_units = len(y.shape) == 1 or y.shape[1] == 1
+
+        if not good_y_units:
+            raise ValueError(
+                "The number of units in `y_units` must equal the number of output features in `y`."
+            )
 
 
 def best(*args, **kwargs):  # pragma: no cover
@@ -354,6 +374,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         You may pass a function with the same arguments as this (note
         that the name of the function doesn't matter). Here,
         both `prediction` and `dataset.y` are 1D arrays of length `dataset.n`.
+        If using `batching`, then you should add an
+        `idx` argument to the function, which is `nothing`
+        for non-batched, and a 1D array of indices for batched.
         Default is `None`.
     complexity_of_operators : dict[str, float]
         If you would like to use a complexity other than 1 for an
@@ -371,6 +394,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     parsimony : float
         Multiplicative factor for how much to punish complexity.
         Default is `0.0032`.
+    dimensional_constraint_penalty : float
+        Additive penalty for if dimensional analysis of an expression fails.
+        By default, this is `1000.0`.
     use_frequency : bool
         Whether to measure the frequency of complexities, and use that
         instead of parsimony to explore equation space. Will naturally
@@ -551,6 +577,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         What verbosity level to use for package updates.
         Will take value of `verbosity` if not given.
         Default is `None`.
+    print_precision : int
+        How many significant digits to print for floats. Default is `5`.
     progress : bool
         Whether to use a progress bar instead of printing to stdout.
         Default is `True`.
@@ -633,6 +661,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     feature_names_in_ : ndarray of shape (`n_features_in_`,)
         Names of features seen during :term:`fit`. Defined only when `X`
         has feature names that are all strings.
+    pretty_feature_names_in_ : ndarray of shape (`n_features_in_`,)
+        Pretty names of features, used only during printing.
+    X_units_ : list[str] of length n_features
+        Units of each variable in the training dataset, `X`.
+    y_units_ : str | list[str] of length n_out
+        Units of each variable in the training dataset, `y`.
     nout_ : int
         Number of output dimensions.
     selection_mask_ : list[int] of length `select_k_features`
@@ -712,6 +746,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         complexity_of_constants=1,
         complexity_of_variables=1,
         parsimony=0.0032,
+        dimensional_constraint_penalty=None,
         use_frequency=True,
         use_frequency_in_tournament=True,
         adaptive_parsimony_scaling=20.0,
@@ -758,6 +793,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         warm_start=False,
         verbosity=1e9,
         update_verbosity=None,
+        print_precision=5,
         progress=True,
         equation_file=None,
         temp_equation_file=False,
@@ -802,6 +838,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         self.complexity_of_constants = complexity_of_constants
         self.complexity_of_variables = complexity_of_variables
         self.parsimony = parsimony
+        self.dimensional_constraint_penalty = dimensional_constraint_penalty
         self.use_frequency = use_frequency
         self.use_frequency_in_tournament = use_frequency_in_tournament
         self.adaptive_parsimony_scaling = adaptive_parsimony_scaling
@@ -853,6 +890,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         # - Runtime user interface
         self.verbosity = verbosity
         self.update_verbosity = update_verbosity
+        self.print_precision = print_precision
         self.progress = progress
         # - Project management
         self.equation_file = equation_file
@@ -976,11 +1014,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
 
         # Else, we re-create it.
         print(
-            f"{
+            f"{pkl_filename} does not exist, "
             "so we must create the model from scratch."
         )
-        assert binary_operators is not None
-        assert unary_operators is not None
+        assert binary_operators is not None or unary_operators is not None
         assert n_features_in is not None
 
         # TODO: copy .bkup file if exists.
@@ -995,10 +1032,14 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         model.n_features_in_ = n_features_in
 
         if feature_names_in is None:
-            model.feature_names_in_ = [f"x{i}" for i in range(n_features_in)]
+            model.feature_names_in_ = np.array([f"x{i}" for i in range(n_features_in)])
+            model.pretty_feature_names_in_ = np.array(
+                [f"x{_subscriptify(i)}" for i in range(n_features_in)]
+            )
         else:
             assert len(feature_names_in) == n_features_in
             model.feature_names_in_ = feature_names_in
+            model.pretty_feature_names_in_ = None
 
         if selection_mask is None:
             model.selection_mask_ = np.ones(n_features_in, dtype=bool)
@@ -1318,7 +1359,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
 
         return packed_modified_params
 
-    def _validate_and_set_fit_params(
+    def _validate_and_set_fit_params(
+        self, X, y, Xresampled, weights, variable_names, X_units, y_units
+    ):
         """
         Validate the parameters passed to the :term`fit` method.
 
@@ -1340,6 +1383,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             for that particular element of y.
         variable_names : list[str] of length n_features
             Names of each variable in the training dataset, `X`.
+        X_units : list[str] of length n_features
+            Units of each variable in the training dataset, `X`.
+        y_units : str | list[str] of length n_out
+            Units of each variable in the training dataset, `y`.
 
         Returns
         -------
@@ -1351,6 +1398,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             Validated resampled training data used for denoising.
         variable_names_validated : list[str] of length n_features
             Validated list of variable names for each feature in `X`.
+        X_units : list[str] of length n_features
+            Validated units for `X`.
+        y_units : str | list[str] of length n_out
+            Validated units for `y`.
 
         """
         if isinstance(X, pd.DataFrame):
@@ -1361,7 +1412,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                 "Using DataFrame column names instead."
             )
 
-            if
+            if (
+                pd.api.types.is_object_dtype(X.columns)
+                and X.columns.str.contains(" ").any()
+            ):
                 X.columns = X.columns.str.replace(" ", "_")
                 warnings.warn(
                     "Spaces in DataFrame column names are not supported. "
@@ -1384,7 +1438,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             weights = check_array(weights, ensure_2d=False)
             check_consistent_length(weights, y)
         X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
-        self.feature_names_in_ = _check_feature_names_in(
+        self.feature_names_in_ = _check_feature_names_in(
+            self, variable_names, generate_names=False
+        )
+
+        if self.feature_names_in_ is None:
+            self.feature_names_in_ = np.array([f"x{i}" for i in range(X.shape[1])])
+            self.pretty_feature_names_in_ = np.array(
+                [f"x{_subscriptify(i)}" for i in range(X.shape[1])]
+            )
+        else:
+            self.pretty_feature_names_in_ = None
+
         variable_names = self.feature_names_in_
 
         # Handle multioutput data
@@ -1395,10 +1460,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         else:
             raise NotImplementedError("y shape not supported!")
 
-
+        self.X_units_ = copy.deepcopy(X_units)
+        self.y_units_ = copy.deepcopy(y_units)
+
+        return X, y, Xresampled, weights, variable_names, X_units, y_units
 
     def _pre_transform_training_data(
-        self, X, y, Xresampled, variable_names, random_state
+        self, X, y, Xresampled, variable_names, X_units, y_units, random_state
     ):
         """
         Transform the training data before fitting the symbolic regressor.
@@ -1418,6 +1486,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         variable_names : list[str]
             Names of each variable in the training dataset, `X`.
             Of length `n_features`.
+        X_units : list[str]
+            Units of each variable in the training dataset, `X`.
+        y_units : str | list[str]
+            Units of each variable in the training dataset, `y`.
         random_state : int | np.RandomState
             Pass an int for reproducible results across multiple function calls.
             See :term:`Glossary <random_state>`. Default is `None`.
@@ -1439,6 +1511,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         variable_names_transformed : list[str] of length n_features
             Names of each variable in the transformed dataset,
             `X_transformed`.
+        X_units_transformed : list[str] of length n_features
+            Units of each variable in the transformed dataset.
+        y_units_transformed : str | list[str] of length n_out
+            Units of each variable in the transformed dataset.
         """
         # Feature selection transformation
         if self.select_k_features:
@@ -1453,10 +1529,15 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             # Reduce variable_names to selection
             variable_names = [variable_names[i] for i in self.selection_mask_]
 
+            if X_units is not None:
+                X_units = [X_units[i] for i in self.selection_mask_]
+                self.X_units_ = copy.deepcopy(X_units)
+
             # Re-perform data validation and feature name updating
             X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
             # Update feature names with selected variable names
             self.feature_names_in_ = _check_feature_names_in(self, variable_names)
+            self.pretty_feature_names_in_ = None
             print(f"Using features {self.feature_names_in_}")
 
         # Denoising transformation
@@ -1476,7 +1557,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         else:
             X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state)
 
-        return X, y, variable_names
+        return X, y, variable_names, X_units, y_units
 
     def _run(self, X, y, mutated_params, weights, seed):
         """
@@ -1629,6 +1710,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             tournament_selection_n=self.tournament_selection_n,
             # These have the same name:
             parsimony=self.parsimony,
+            dimensional_constraint_penalty=self.dimensional_constraint_penalty,
             alpha=self.alpha,
             maxdepth=maxdepth,
             fast_cycle=self.fast_cycle,
@@ -1648,6 +1730,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             fraction_replaced=self.fraction_replaced,
             topn=self.topn,
             verbosity=self.verbosity,
+            print_precision=self.print_precision,
             optimizer_algorithm=self.optimizer_algorithm,
             optimizer_nrestarts=self.optimizer_nrestarts,
             optimizer_probability=self.optimize_probability,
@@ -1699,6 +1782,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             None if parallelism in ["serial", "multithreading"] else int(self.procs)
         )
 
+        y_variable_names = None
+        if len(y.shape) > 1:
+            # We set these manually so that they respect Python's 0 indexing
+            # (by default Julia will use y1, y2...)
+            y_variable_names = [f"y{_subscriptify(i)}" for i in range(y.shape[1])]
+
         # Call to Julia backend.
         # See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/SymbolicRegression.jl
         self.raw_julia_state_ = SymbolicRegression.equation_search(
@@ -1706,7 +1795,15 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             Main.y,
             weights=Main.weights,
             niterations=int(self.niterations),
-            variable_names=
+            variable_names=(
+                self.pretty_feature_names_in_.tolist()
+                if hasattr(self, "pretty_feature_names_in_")
+                and self.pretty_feature_names_in_ is not None
+                else self.feature_names_in_.tolist()
+            ),
+            y_variable_names=y_variable_names,
+            X_units=self.X_units_,
+            y_units=self.y_units_,
             options=options,
             numprocs=cprocs,
             parallelism=parallelism,
@@ -1732,6 +1829,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         Xresampled=None,
         weights=None,
         variable_names=None,
+        X_units=None,
+        y_units=None,
     ):
         """
         Search for equations to fit the dataset and store them in `self.equations_`.
@@ -1759,6 +1858,15 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             instead of `variable_names`. Cannot contain spaces or special
             characters. Avoid variable names which are also
             function names in `sympy`, such as "N".
+        X_units : list[str]
+            A list of units for each variable in `X`. Each unit should be
+            a string representing a Julia expression. See DynamicQuantities.jl
+            https://symbolicml.org/DynamicQuantities.jl/dev/units/ for more
+            information.
+        y_units : str | list[str]
+            Similar to `X_units`, but as a unit for the target variable, `y`.
+            If `y` is a matrix, a list of units should be passed. If `X_units`
+            is given but `y_units` is not, then `y_units` will be arbitrary.
 
         Returns
         -------
@@ -1780,6 +1888,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         self.nout_ = 1
         self.selection_mask_ = None
         self.raw_julia_state_ = None
+        self.X_units_ = None
+        self.y_units_ = None
 
         random_state = check_random_state(self.random_state)  # For np random
         seed = random_state.get_state()[1][0]  # For julia random
@@ -1788,8 +1898,16 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
 
         mutated_params = self._validate_and_set_init_params()
 
-
-        X,
+        (
+            X,
+            y,
+            Xresampled,
+            weights,
+            variable_names,
+            X_units,
+            y_units,
+        ) = self._validate_and_set_fit_params(
+            X, y, Xresampled, weights, variable_names, X_units, y_units
         )
 
         if X.shape[0] > 10000 and not self.batching:
@@ -1804,8 +1922,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             )
 
         # Pre transformations (feature selection and denoising)
-        X, y, variable_names = self._pre_transform_training_data(
-            X, y, Xresampled, variable_names, random_state
+        X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
+            X, y, Xresampled, variable_names, X_units, y_units, random_state
         )
 
         # Warn about large feature counts (still warn if feature count is large
@@ -1834,6 +1952,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             variable_names,
             weights,
             y,
+            X_units,
+            y_units,
         )
 
         # Initially, just save model parameters, so that
@@ -2072,17 +2192,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                     with open(cur_filename, "r") as f:
                         buf = f.read()
                     buf = _preprocess_julia_floats(buf)
-
-
-                    # Rename Complexity column to complexity:
-                    df.rename(
-                        columns={
-                            "Complexity": "complexity",
-                            "Loss": "loss",
-                            "Equation": "equation",
-                        },
-                        inplace=True,
-                    )
+
+                    df = self._postprocess_dataframe(pd.read_csv(StringIO(buf)))
 
                     all_outputs.append(df)
             else:
@@ -2092,15 +2203,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                 with open(filename, "r") as f:
                     buf = f.read()
                 buf = _preprocess_julia_floats(buf)
-                all_outputs = [pd.read_csv(StringIO(buf))]
-                all_outputs[-1].rename(
-                    columns={
-                        "Complexity": "complexity",
-                        "Loss": "loss",
-                        "Equation": "equation",
-                    },
-                    inplace=True,
-                )
+                all_outputs = [self._postprocess_dataframe(pd.read_csv(StringIO(buf)))]
 
             except FileNotFoundError:
                 raise RuntimeError(
@@ -2109,6 +2212,35 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
                 )
         return all_outputs
 
+    def _postprocess_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
+        df = df.rename(
+            columns={
+                "Complexity": "complexity",
+                "Loss": "loss",
+                "Equation": "equation",
+            },
+        )
+        # Regexp replace x₁₂₃ to x123 in `equation`:
+        if (
+            hasattr(self, "pretty_feature_names_in_")
+            and self.pretty_feature_names_in_ is not None
+        ):
+            # df["equation"] = df["equation"].apply(_undo_subscriptify_full)
+            for pname, name in zip(
+                self.pretty_feature_names_in_, self.feature_names_in_
+            ):
+                df["equation"] = df["equation"].apply(
+                    lambda s: re.sub(
+                        r"\b" + f"({pname})" + r"\b",
+                        name,
+                        s,
+                    )
+                    if isinstance(s, str)
+                    else s
+                )
+
+        return df
+
     def get_hof(self):
         """Get the equations from a hall of fame file.
 
@@ -2409,3 +2541,11 @@ def _preprocess_julia_floats(s: str) -> str:
     s = _apply_regexp_im_sci(s)
     s = _apply_regexp_sci(s)
     return s
+
+
+def _subscriptify(i: int) -> str:
+    """Converts integer to subscript text form.
+
+    For example, 123 -> "₁₂₃".
+    """
+    return "".join([chr(0x2080 + int(c)) for c in str(i)])
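To see how the two new helpers in this file fit together: `_subscriptify` builds the pretty `x₀, x₁, …` names handed to the Julia backend, and `_postprocess_dataframe` maps them back to plain `x0, x1, …` in the returned equations. A self-contained sketch of that round trip (illustration only; the logic is copied from the diff rather than called through the PySR API):

```python
import re

def _subscriptify(i: int) -> str:
    # Convert an integer to subscript form, e.g. 123 -> "₁₂₃".
    return "".join([chr(0x2080 + int(c)) for c in str(i)])

pretty = [f"x{_subscriptify(i)}" for i in range(3)]  # ['x₀', 'x₁', 'x₂']
equation = "(x₀ * 2.0) + x₂"  # the form the backend prints

# The same word-boundary substitution used by _postprocess_dataframe:
for pname, name in zip(pretty, ["x0", "x1", "x2"]):
    equation = re.sub(r"\b" + f"({pname})" + r"\b", name, equation)

print(equation)  # (x0 * 2.0) + x2
```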
pysr/test/test.py
CHANGED
@@ -19,6 +19,7 @@ from ..sr import (
     _handle_feature_selection,
     _csv_filename_to_pkl_filename,
     idx_model_selection,
+    _check_assertions,
 )
 from ..export_latex import to_latex
 
@@ -711,6 +712,26 @@ class TestMiscellaneous(unittest.TestCase):
         # If any checks failed don't let the test pass.
         self.assertEqual(len(exception_messages), 0)
 
+    def test_param_groupings(self):
+        """Test that param_groupings are complete"""
+        param_groupings_file = Path(__file__).parent.parent / "param_groupings.yml"
+        # Read the file, discarding lines ending in ":",
+        # and removing leading "\s*-\s*":
+        params = []
+        with open(param_groupings_file, "r") as f:
+            for line in f.readlines():
+                if line.strip().endswith(":"):
+                    continue
+                if line.strip().startswith("-"):
+                    params.append(line.strip()[1:].strip())
+
+        regressor_params = [
+            p for p in DEFAULT_PARAMS.keys() if p not in ["self", "kwargs"]
+        ]
+
+        # Check the sets are equal:
+        self.assertSetEqual(set(params), set(regressor_params))
+
 
 TRUE_PREAMBLE = "\n".join(
     [
@@ -906,6 +927,151 @@ class TestLaTeXTable(unittest.TestCase):
         self.assertEqual(latex_table_str, true_latex_table_str)
 
 
+class TestDimensionalConstraints(unittest.TestCase):
+    def setUp(self):
+        self.default_test_kwargs = dict(
+            progress=False,
+            model_selection="accuracy",
+            niterations=DEFAULT_NITERATIONS * 2,
+            populations=DEFAULT_POPULATIONS * 2,
+            temp_equation_file=True,
+        )
+        self.rstate = np.random.RandomState(0)
+        self.X = self.rstate.randn(100, 5)
+
+    def test_dimensional_constraints(self):
+        y = np.cos(self.X[:, [0, 1]])
+        model = PySRRegressor(
+            binary_operators=[
+                "my_add(x, y) = x + y",
+                "my_sub(x, y) = x - y",
+                "my_mul(x, y) = x * y",
+            ],
+            unary_operators=["my_cos(x) = cos(x)"],
+            **self.default_test_kwargs,
+            early_stop_condition=1e-8,
+            select_k_features=3,
+            extra_sympy_mappings={
+                "my_cos": sympy.cos,
+                "my_add": lambda x, y: x + y,
+                "my_sub": lambda x, y: x - y,
+                "my_mul": lambda x, y: x * y,
+            },
+        )
+        model.fit(self.X, y, X_units=["m", "m", "m", "m", "m"], y_units=["m", "m"])
+
+        # The best expression should have complexity larger than just 2:
+        for i in range(2):
+            self.assertGreater(model.get_best()[i]["complexity"], 2)
+            self.assertLess(model.get_best()[i]["loss"], 1e-6)
+            self.assertGreater(
+                model.equations_[i].query("complexity <= 2").loss.min(), 1e-6
+            )
+
+    def test_unit_checks(self):
+        """This just checks the number of units passed"""
+        use_custom_variable_names = False
+        variable_names = None
+        weights = None
+        args = (use_custom_variable_names, variable_names, weights)
+        valid_units = [
+            (np.ones((10, 2)), np.ones(10), ["m/s", "s"], "m"),
+            (np.ones((10, 1)), np.ones(10), ["m/s"], None),
+            (np.ones((10, 1)), np.ones(10), None, "m/s"),
+            (np.ones((10, 1)), np.ones(10), None, ["m/s"]),
+            (np.ones((10, 1)), np.ones((10, 1)), None, ["m/s"]),
+            (np.ones((10, 1)), np.ones((10, 2)), None, ["m/s", ""]),
+        ]
+        for X, y, X_units, y_units in valid_units:
+            _check_assertions(
+                X,
+                *args,
+                y,
+                X_units,
+                y_units,
+            )
+        invalid_units = [
+            (np.ones((10, 2)), np.ones(10), ["m/s", "s", "s^2"], None),
+            (np.ones((10, 2)), np.ones(10), ["m/s", "s", "s^2"], "m"),
+            (np.ones((10, 2)), np.ones((10, 2)), ["m/s", "s"], ["m"]),
+            (np.ones((10, 1)), np.ones((10, 1)), "m/s", ["m"]),
+        ]
+        for X, y, X_units, y_units in invalid_units:
+            with self.assertRaises(ValueError):
+                _check_assertions(
+                    X,
+                    *args,
+                    y,
+                    X_units,
+                    y_units,
+                )
+
+    def test_unit_propagation(self):
+        """Check that units are propagated correctly.
+
+        This also tests that variables have the correct names.
+        """
+        X = np.ones((100, 3))
+        y = np.ones((100, 1))
+        temp_dir = Path(tempfile.mkdtemp())
+        equation_file = str(temp_dir / "equation_file.csv")
+        model = PySRRegressor(
+            binary_operators=["+", "*"],
+            early_stop_condition="(l, c) -> l < 1e-6 && c == 3",
+            progress=False,
+            model_selection="accuracy",
+            niterations=DEFAULT_NITERATIONS * 2,
+            populations=DEFAULT_POPULATIONS * 2,
+            complexity_of_constants=10,
+            weight_mutate_constant=0.0,
+            should_optimize_constants=False,
+            multithreading=False,
+            deterministic=True,
+            procs=0,
+            random_state=0,
+            equation_file=equation_file,
+            warm_start=True,
+        )
+        model.fit(
+            X,
+            y,
+            X_units=["m", "s", "A"],
+            y_units=["m*A"],
+        )
+        best = model.get_best()
+        self.assertIn("x0", best["equation"])
+        self.assertNotIn("x1", best["equation"])
+        self.assertIn("x2", best["equation"])
+        self.assertEqual(best["complexity"], 3)
+        self.assertEqual(model.equations_.iloc[0].complexity, 1)
+        self.assertGreater(model.equations_.iloc[0].loss, 1e-6)
+
+        # With pkl file:
+        pkl_file = str(temp_dir / "equation_file.pkl")
+        model2 = PySRRegressor.from_file(pkl_file)
+        best2 = model2.get_best()
+        self.assertIn("x0", best2["equation"])
+
+        # From csv file alone (we need to delete pkl file:)
+        # First, we delete the pkl file:
+        os.remove(pkl_file)
+        model3 = PySRRegressor.from_file(
+            equation_file, binary_operators=["+", "*"], n_features_in=X.shape[1]
+        )
+        best3 = model3.get_best()
+        self.assertIn("x0", best3["equation"])
+
+        # Try warm start, but with no units provided (should
+        # be a different dataset, and thus different result):
+        model.fit(X, y)
+        model.early_stop_condition = "(l, c) -> l < 1e-6 && c == 1"
+        self.assertEqual(model.equations_.iloc[0].complexity, 1)
+        self.assertLess(model.equations_.iloc[0].loss, 1e-6)
+
+
+# TODO: Determine desired behavior if second .fit() call does not have units
+
+
 def runtests():
     """Run all tests in test.py."""
     suite = unittest.TestSuite()
@@ -916,6 +1082,7 @@ def runtests():
         TestFeatureSelection,
         TestMiscellaneous,
         TestLaTeXTable,
+        TestDimensionalConstraints,
     ]
     for test_case in test_cases:
         tests = loader.loadTestsFromTestCase(test_case)
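To run just the new test case rather than the whole suite, standard unittest loading works; a sketch, assuming PySR and its Julia backend are installed so that `pysr.test.test` imports cleanly:

```python
import unittest

from pysr.test.test import TestDimensionalConstraints

# Load and run only the dimensional-constraint tests:
suite = unittest.TestLoader().loadTestsFromTestCase(TestDimensionalConstraints)
unittest.TextTestRunner(verbosity=2).run(suite)
```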
pysr/version.py
CHANGED
@@ -1,2 +1,2 @@
-__version__ = "0.
-__symbolic_regression_jl_version__ = "0.
+__version__ = "0.15.0"
+__symbolic_regression_jl_version__ = "0.21.3"
requirements.txt
CHANGED
@@ -1,5 +1,5 @@
 sympy
-pandas
+pandas>=0.21.0
 numpy
 scikit_learn>=1.0.0
 julia>=0.6.0