Commit 3ef5500 · committed by tttc3
Parent(s): a62a370

Added control of random_state for numpy and julia

Files changed: pysr/sr.py (+44 −18)
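This commit wires one seed through both halves of a fit: `check_random_state` normalizes the user-facing `random_state` into a NumPy `RandomState` used for feature selection and denoising, and an integer derived from it is forwarded to the Julia backend as `seed`. A minimal usage sketch, assuming the `PySRRegressor` API as it stands after this commit (toy data; `niterations` kept small only to make the example cheap, and full determinism of the Julia search may also depend on its parallelism settings, which this commit does not touch):

    import numpy as np
    from pysr import PySRRegressor

    rng = np.random.RandomState(0)
    X = rng.randn(100, 2)
    y = X[:, 0] ** 2 - 1.5 * X[:, 1]

    # random_state seeds the numpy-side steps (feature selection,
    # denoising) and is also turned into an integer seed for the
    # Julia equation search.
    model = PySRRegressor(niterations=5, random_state=0)
    model.fit(X, y)
    print(model.sympy())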
pysr/sr.py
CHANGED
@@ -2,6 +2,7 @@ import os
 import sys
 import numpy as np
 import pandas as pd
+from sklearn.utils import check_array, check_random_state
 import sympy
 from sympy import sympify
 import re
@@ -172,6 +173,10 @@ def best_callable(*args, **kwargs): # pragma: no cover
     )


+# Class validation constants
+VALID_OPTIMIZER_ALGORITHMS = ["NelderMead", "BFGS"]
+
+
 class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
     """
     High-performance symbolic regression.
@@ -422,6 +427,10 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
         What precision to use for the data. By default this is 32
         (float32), but you can select 64 or 16 as well.

+    random_state : int, Numpy RandomState instance or None, default=None
+        Pass an int for reproducible results across multiple function calls.
+        See :term:`Glossary <random_state>`.
+
     verbosity : int, default=1e9
         What verbosity level to use. 0 means minimal print statements.

@@ -566,9 +575,6 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
     array([-1.15907818, -1.15907818, -1.15907818, -1.15907818, -1.15907818])
     """

-    # Class validation constants
-    VALID_OPTIMIZER_ALGORITHMS = ["NelderMead", "BFGS"]
-
     def __init__(
         self,
         model_selection="best",
@@ -626,6 +632,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
         batch_size=50,
         fast_cycle=False,
         precision=32,
+        random_state=None,
         verbosity=1e9,
         update_verbosity=None,
         progress=True,
@@ -709,6 +716,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
         self.batch_size = batch_size
         self.fast_cycle = fast_cycle
         self.precision = precision
+        self.random_state = random_state
         # Additional runtime parameters
         # - Runtime user interface
         self.verbosity = verbosity
@@ -940,9 +948,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
         )

         # NotImplementedError - Values that could be supported at a later time
-        if self.optimizer_algorithm not in self.VALID_OPTIMIZER_ALGORITHMS:
+        if self.optimizer_algorithm not in VALID_OPTIMIZER_ALGORITHMS:
             raise NotImplementedError(
-                f"PySR currently only supports the following optimizer algorithms: {self.VALID_OPTIMIZER_ALGORITHMS}"
+                f"PySR currently only supports the following optimizer algorithms: {VALID_OPTIMIZER_ALGORITHMS}"
             )

         if isinstance(X, pd.DataFrame):
@@ -988,7 +996,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):

         return X, y, Xresampled, variable_names

-    def _pre_transform_training_data(self, X, y, Xresampled, variable_names):
+    def _pre_transform_training_data(
+        self, X, y, Xresampled, variable_names, random_state
+    ):
         """
         Transforms the training data before fitting the symbolic regressor.

@@ -1009,6 +1019,10 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
         variable_names : list[str] of length n_features
             Names of each variable in the training dataset, `X`.

+        random_state : int, Numpy RandomState instance or None, default=None
+            Pass an int for reproducible results across multiple function calls.
+            See :term:`Glossary <random_state>`.
+
         Returns
         -------
         X_transformed : ndarray of shape (n_samples, n_features)
@@ -1031,7 +1045,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
         """
         # Feature selection transformation
         if self.select_k_features:
-            self.selection_mask_ = run_feature_selection(X, y, self.select_k_features)
+            self.selection_mask_ = run_feature_selection(
+                X, y, self.select_k_features, random_state=random_state
+            )
             X = X[:, self.selection_mask_]

         if Xresampled is not None:
@@ -1051,7 +1067,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
         if self.nout_ > 1:
             y = np.stack(
                 [
-                    _denoise(X, y[:, i], Xresampled=Xresampled)[1]
+                    _denoise(
+                        X, y[:, i], Xresampled=Xresampled, random_state=random_state
+                    )[1]
                     for i in range(self.nout_)
                 ],
                 axis=1,
@@ -1059,11 +1077,11 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
             if Xresampled is not None:
                 X = Xresampled
         else:
-            X, y = _denoise(X, y, Xresampled=Xresampled)
+            X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state)

         return X, y, variable_names

-    def _run(self, X, y, weights):
+    def _run(self, X, y, weights, seed):
         """
         Run the symbolic regression fitting process on the julia backend.

@@ -1245,7 +1263,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
         ]

         # Call to Julia backend.
-        # See https://github.com/
+        # See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl
         options = Main.Options(
             binary_operators=Main.eval(str(tuple(binary_operators)).replace("'", "")),
             unary_operators=Main.eval(str(tuple(unary_operators)).replace("'", "")),
@@ -1294,6 +1312,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
             skip_mutation_failures=self.skip_mutation_failures,
             max_evals=self.max_evals,
             earlyStopCondition=self.early_stop_condition,
+            seed=seed,
         )

         # Convert data to desired precision
@@ -1316,7 +1335,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
         cprocs = 0 if multithreading else self.procs

         # Call to Julia backend.
-        # See https://github.com/
+        # See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/SymbolicRegression.jl
         self.raw_julia_state_ = Main.EquationSearch(
             Main.X,
             Main.y,
@@ -1390,6 +1409,9 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
         self.selection_mask_ = None
         self.raw_julia_state_ = None

+        random_state = check_random_state(self.random_state)  # For np random
+        seed = random_state.get_state()[1][0]  # For julia random
+
         self._setup_equation_file()

         # Parameter input validation (for parameters defined in __init__)
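A note on the seed derivation added above: `RandomState.get_state()` returns the tuple `('MT19937', keys, pos, has_gauss, cached_gaussian)`, where `keys` is the 624-word uint32 Mersenne Twister state vector, so `get_state()[1][0]` takes the first word of that vector as a single integer the Julia backend can accept. A minimal standalone sketch of the same derivation (pure NumPy/scikit-learn; nothing here beyond what the diff uses):

    import numpy as np
    from sklearn.utils import check_random_state

    # check_random_state accepts None, an int, or a RandomState
    # and always returns a RandomState instance.
    random_state = check_random_state(0)

    # First word of the Mersenne Twister state vector.
    seed = int(random_state.get_state()[1][0])

    # The derivation is deterministic for a given input, so the
    # Julia side sees the same seed on every identically-seeded call.
    assert seed == int(check_random_state(0).get_state()[1][0])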
@@ -1410,7 +1432,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):

         # Pre transformations (feature selection and denoising)
         X, y, variable_names = self._pre_transform_training_data(
-            X, y, Xresampled, variable_names
+            X, y, Xresampled, variable_names, random_state
         )

         # Warn about large feature counts (still warn if feature count is large
@@ -1443,7 +1465,7 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):

         # Fitting procedure
         if not from_equation_file:
-            self._run(X=X, y=y, weights=weights)
+            self._run(X=X, y=y, weights=weights, seed=seed)
         else:
             self.equations_ = self.get_hof()

@@ -1790,13 +1812,15 @@ class PySRRegressor(BaseEstimator, RegressorMixin, MultiOutputMixin):
         return ret_outputs[0]


-def _denoise(X, y, Xresampled=None):
+def _denoise(X, y, Xresampled=None, random_state=None):
     """Denoise the dataset using a Gaussian process"""
     from sklearn.gaussian_process import GaussianProcessRegressor
     from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel

     gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel()
-    gpr = GaussianProcessRegressor(kernel=gp_kernel, n_restarts_optimizer=50)
+    gpr = GaussianProcessRegressor(
+        kernel=gp_kernel, n_restarts_optimizer=50, random_state=random_state
+    )
     gpr.fit(X, y)
     if Xresampled is not None:
         return Xresampled, gpr.predict(Xresampled)
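Why `_denoise` needs the seed at all: with `n_restarts_optimizer=50`, `GaussianProcessRegressor` samples 50 random initial values for the kernel hyperparameters, so two fits on identical data can converge to different kernels unless `random_state` pins those draws. A self-contained sketch of the pattern (synthetic data and a reduced restart count, purely for illustration):

    import numpy as np
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel

    rng = np.random.RandomState(0)
    X = rng.uniform(-1, 1, size=(50, 2))
    y = np.cos(3 * X[:, 0]) + 0.1 * rng.randn(50)  # noisy target

    def denoised(seed):
        kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel()
        gpr = GaussianProcessRegressor(
            kernel=kernel, n_restarts_optimizer=5, random_state=seed
        )
        return gpr.fit(X, y).predict(X)  # GP posterior mean = denoised target

    # Same seed -> same restart draws -> identical denoised targets.
    assert np.allclose(denoised(0), denoised(0))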
@@ -1816,7 +1840,7 @@ def _handle_feature_selection(X, select_k_features, y, variable_names):
     return X, selection


-def run_feature_selection(X, y, select_k_features):
+def run_feature_selection(X, y, select_k_features, random_state=None):
     """
     Use a gradient boosting tree regressor as a proxy for finding
     the k most important features in X, returning indices for those
@@ -1825,7 +1849,9 @@ def run_feature_selection(X, y, select_k_features):
     from sklearn.ensemble import RandomForestRegressor
     from sklearn.feature_selection import SelectFromModel

-    clf = RandomForestRegressor(n_estimators=100, max_depth=3)
+    clf = RandomForestRegressor(
+        n_estimators=100, max_depth=3, random_state=random_state
+    )
     clf.fit(X, y)
     selector = SelectFromModel(
         clf, threshold=-np.inf, max_features=select_k_features, prefit=True
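`RandomForestRegressor` is the remaining stochastic piece: its bootstrap resampling and per-split feature subsampling both draw from `random_state`, so without it the selected feature mask can differ between runs. A small sketch of the selection logic above (synthetic data; the helper name `select_features` is illustrative, not part of the diff):

    import numpy as np
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.feature_selection import SelectFromModel

    def select_features(X, y, k, random_state=None):
        # Mirror of run_feature_selection: forest importances rank the
        # features, SelectFromModel keeps the top k.
        clf = RandomForestRegressor(
            n_estimators=100, max_depth=3, random_state=random_state
        ).fit(X, y)
        selector = SelectFromModel(
            clf, threshold=-np.inf, max_features=k, prefit=True
        )
        return selector.get_support(indices=True)

    rng = np.random.RandomState(0)
    X = rng.randn(200, 5)
    y = 2.0 * X[:, 1] - X[:, 3]  # only features 1 and 3 matter

    # Fixed random_state -> identical mask on every call.
    m1 = select_features(X, y, 2, random_state=0)
    m2 = select_features(X, y, 2, random_state=0)
    assert (m1 == m2).all()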