refactor: more type declarations
pysr/sr.py  CHANGED  (+51 -22)
@@ -21,9 +21,12 @@ else:
 
 import numpy as np
 import pandas as pd
+from numpy import ndarray
+from numpy.typing import NDArray
 from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
 from sklearn.utils import check_array, check_consistent_length, check_random_state
-from sklearn.utils.validation import _check_feature_names_in
+from sklearn.utils.validation import _check_feature_names_in  # type: ignore
+from sklearn.utils.validation import check_is_fitted
 
 from .denoising import denoise, multi_denoise
 from .deprecated import DEPRECATED_KWARGS
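
The new `numpy.typing.NDArray` import lets the declarations below parameterize arrays by dtype (e.g. `NDArray[np.bool_]`, `NDArray[np.uint8]`). A small self-contained sketch of what that annotation style expresses; the helper name is illustrative and not part of PySR:

import numpy as np
from numpy.typing import NDArray


def pack_mask(mask: NDArray[np.bool_]) -> NDArray[np.uint8]:
    """Convert a boolean feature mask to a uint8 array (e.g. for serialization)."""
    return mask.astype(np.uint8)


mask: NDArray[np.bool_] = np.array([True, False, True])
print(pack_mask(mask))  # [1 0 1]
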
@@ -179,6 +182,21 @@ VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"]
 
 
 class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
+    equations_: Optional[Union[pd.DataFrame, List[pd.DataFrame]]]
+    n_features_in_: int
+    feature_names_in_: ArrayLike[str]
+    display_feature_names_in_: ArrayLike[str]
+    X_units_: Optional[ArrayLike[str]]
+    y_units_: Optional[Union[str, ArrayLike[str]]]
+    nout_: int
+    selection_mask_: Optional[NDArray[np.bool_]]
+    tempdir_: Path
+    equation_file_: Union[str, Path]
+    julia_state_stream_: Optional[NDArray[np.uint8]]
+    julia_options_stream_: Optional[NDArray[np.uint8]]
+    equation_file_contents_: Optional[List[pd.DataFrame]]
+    show_pickle_warnings_: bool
+
     """
     High-performance symbolic regression algorithm.
 
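
These are class-level attribute annotations: they declare the types of attributes that are only assigned during `fit`, without creating class attributes at runtime (a bare annotation stores no value in the class namespace). A minimal sketch of that behavior, using hypothetical attribute names rather than PySR's:

from typing import Optional

import numpy as np
import pandas as pd


class Model:
    # Declared but not assigned: these names live only in __annotations__,
    # so instances raise AttributeError until fit() assigns them.
    equations_: Optional[pd.DataFrame]
    nout_: int

    def fit(self, X: np.ndarray, y: np.ndarray) -> "Model":
        self.nout_ = 1 if y.ndim == 1 else y.shape[1]
        self.equations_ = pd.DataFrame({"loss": [0.0]})
        return self


print(Model.__annotations__)       # {'equations_': ..., 'nout_': <class 'int'>}
print(hasattr(Model, "nout_"))     # False: an annotation alone stores no value
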
@@ -603,22 +621,17 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         Units of each variable in the training dataset, `y`.
     nout_ : int
         Number of output dimensions.
-    selection_mask_ :
-
-        `select_k_features` is set.
+    selection_mask_ : ndarray of shape (`n_features_in_`,)
+        Mask of which features of `X` to use when `select_k_features` is set.
     tempdir_ : Path
         Path to the temporary equations directory.
-    equation_file_ : str
+    equation_file_ : Union[str, Path]
         Output equation file name produced by the julia backend.
     julia_state_stream_ : ndarray
         The serialized state for the julia SymbolicRegression.jl backend (after fitting),
         stored as an array of uint8, produced by Julia's Serialization.serialize function.
-    julia_state_
-        The deserialized state.
     julia_options_stream_ : ndarray
         The serialized julia options, stored as an array of uint8,
-    julia_options_
-        The deserialized julia options.
     equation_file_contents_ : list[pandas.DataFrame]
         Contents of the equation file output by the Julia backend.
     show_pickle_warnings_ : bool
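
The attribute docstring now describes `selection_mask_` as a boolean array over the input features, matching the `NDArray[np.bool_]` declaration above. A generic numpy illustration of how such a mask selects columns (this is not PySR's internal feature-selection code):

import numpy as np

X = np.random.rand(100, 5)

# Boolean mask over the 5 input features: keep columns 0, 2, and 3.
selection_mask = np.array([True, False, True, True, False])

X_selected = X[:, selection_mask]
print(X_selected.shape)            # (100, 3)

# Recover the selected column indices if needed.
print(np.where(selection_mask)[0])  # [0 2 3]
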
@@ -926,7 +939,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             Names of the features passed to the model.
             Not needed if loading from a pickle file.
         selection_mask : list[bool]
-            If using select_k_features
+            If using `select_k_features`, you must pass `model.selection_mask_` here.
             Not needed if loading from a pickle file.
         nout : int
             Number of outputs of the model.
@@ -1124,10 +1137,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
 
     @property
     def julia_options_(self):
+        """The deserialized julia options."""
         return jl_deserialize(self.julia_options_stream_)
 
     @property
    def julia_state_(self):
+        """The deserialized state."""
         return jl_deserialize(self.julia_state_stream_)
 
     @property
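
The one-line descriptions removed from the class docstring now sit directly on the properties. A rough sketch of the same store-serialized, deserialize-on-access pattern, using `pickle` as a stand-in for PySR's Julia-side serialization helpers (names and behavior here are illustrative assumptions, not PySR's actual implementation):

import pickle
from typing import Any, Optional

import numpy as np


class StateHolder:
    # The serialized state is kept as a uint8 array so it survives pickling
    # of the estimator itself; the live object is rebuilt on demand.
    state_stream_: Optional[np.ndarray] = None

    def store_state(self, state: Any) -> None:
        self.state_stream_ = np.frombuffer(pickle.dumps(state), dtype=np.uint8)

    @property
    def state_(self) -> Optional[Any]:
        """The deserialized state."""
        if self.state_stream_ is None:
            return None
        return pickle.loads(self.state_stream_.tobytes())


holder = StateHolder()
holder.store_state({"niterations": 40})
print(holder.state_)  # {'niterations': 40}
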
@@ -1140,7 +1155,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         )
         return self.julia_state_
 
-    def get_best(self, index=None):
+    def get_best(self, index=None) -> Union[pd.Series, List[pd.Series]]:
         """
         Get best equation using `model_selection`.
 
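
The new return annotation reflects that `get_best` returns a single row of the equations table (a `pandas.Series`) for single-output models, and one row per output otherwise. A short usage sketch; the `equation` column follows PySR's documented equations table, but treat the exact fields as assumptions:

import numpy as np
from pysr import PySRRegressor

X = np.random.rand(100, 2)
y = 2.0 * X[:, 0] + np.cos(X[:, 1])

model = PySRRegressor(
    niterations=5, binary_operators=["+", "*"], unary_operators=["cos"]
)
model.fit(X, y)

best = model.get_best()    # pandas Series for single-output y
print(best["equation"])    # string form of the selected equation
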
@@ -1316,7 +1331,15 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
 
     def _validate_and_set_fit_params(
         self, X, y, Xresampled, weights, variable_names, X_units, y_units
-    ):
+    ) -> Tuple[
+        ndarray,
+        ndarray,
+        Optional[ndarray],
+        Optional[ndarray],
+        ndarray,
+        Optional[ArrayLike[str]],
+        Optional[Union[str, ArrayLike[str]]],
+    ]:
         """
         Validate the parameters passed to the :term`fit` method.
 
@@ -1336,7 +1359,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             Weight array of the same shape as `y`.
             Each element is how to weight the mean-square-error loss
             for that particular element of y.
-        variable_names :
+        variable_names : ndarray of length n_features
             Names of each variable in the training dataset, `X`.
         X_units : list[str] of length n_features
             Units of each variable in the training dataset, `X`.
@@ -1392,7 +1415,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         if weights is not None:
             weights = check_array(weights, ensure_2d=False)
             check_consistent_length(weights, y)
-        X, y = self.
+        X, y = self._validate_data_X_y(X, y)
         self.feature_names_in_ = _safe_check_feature_names_in(
             self, variable_names, generate_names=False
         )
@@ -1402,10 +1425,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             self.display_feature_names_in_ = np.array(
                 [f"x{_subscriptify(i)}" for i in range(X.shape[1])]
             )
+            variable_names = self.feature_names_in_
         else:
             self.display_feature_names_in_ = self.feature_names_in_
-
-        variable_names = self.feature_names_in_
+            variable_names = self.feature_names_in_
 
         # Handle multioutput data
         if len(y.shape) == 1 or (len(y.shape) == 2 and y.shape[1] == 1):
@@ -1420,6 +1443,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
 
         return X, y, Xresampled, weights, variable_names, X_units, y_units
 
+    def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
+        return self._validate_data(X=X, y=y, reset=True, multi_output=True)  # type: ignore
+
+    def _validate_data_X(self, X) -> Tuple[ndarray]:
+        return self._validate_data(X=X, reset=False)  # type: ignore
+
     def _pre_transform_training_data(
         self, X, y, Xresampled, variable_names, X_units, y_units, random_state
     ):
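
These thin wrappers centralize the two ways the estimator calls scikit-learn's private `BaseEstimator._validate_data`: with `reset=True` during `fit` (which records `n_features_in_` and feature names) and `reset=False` at predict time (which checks new input against what was recorded). A minimal sketch of that contract on a toy estimator; `_validate_data` is a private scikit-learn API, so the exact signature may vary between versions:

import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin


class MeanRegressor(RegressorMixin, BaseEstimator):
    """Toy estimator showing reset=True in fit vs. reset=False in predict."""

    def fit(self, X, y):
        # reset=True: (re)learns n_features_in_ and feature names from X.
        X, y = self._validate_data(X=X, y=y, reset=True, multi_output=False)
        self.mean_ = float(np.mean(y))
        return self

    def predict(self, X):
        # reset=False: validates X against the n_features_in_ seen in fit,
        # raising a ValueError on a feature-count mismatch.
        X = self._validate_data(X=X, reset=False)
        return np.full(X.shape[0], self.mean_)


reg = MeanRegressor().fit(np.random.rand(10, 3), np.random.rand(10))
print(reg.n_features_in_)              # 3
print(reg.predict(np.random.rand(4, 3)))
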
@@ -1489,7 +1518,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             self.X_units_ = copy.deepcopy(X_units)
 
             # Re-perform data validation and feature name updating
-            X, y = self.
+            X, y = self._validate_data_X_y(X, y)
             # Update feature names with selected variable names
             self.feature_names_in_ = _check_feature_names_in(self, variable_names)
             self.display_feature_names_in_ = self.feature_names_in_
@@ -1506,7 +1535,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
 
         return X, y, variable_names, X_units, y_units
 
-    def _run(self, X, y, mutated_params, weights, seed):
+    def _run(self, X, y, mutated_params, weights, seed: int):
         """
         Run the symbolic regression fitting process on the julia backend.
 
@@ -1784,9 +1813,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         y,
         Xresampled=None,
         weights=None,
-        variable_names: Optional[
-        X_units: Optional[
-        y_units: Optional[
+        variable_names: Optional[ArrayLike[str]] = None,
+        X_units: Optional[ArrayLike[str]] = None,
+        y_units: Optional[Union[str, ArrayLike[str]]] = None,
     ) -> "PySRRegressor":
         """
         Search for equations to fit the dataset and store them in `self.equations_`.
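
The `fit` signature now annotates the optional `variable_names`, `X_units`, and `y_units` keyword arguments. A hedged usage sketch of passing them; the unit strings follow PySR's dimensional-analysis documentation, but treat the specific unit spellings as assumptions:

import numpy as np
from pysr import PySRRegressor

X = np.random.rand(100, 2)
y = 3.0 * X[:, 0] / (X[:, 1] + 0.5)

model = PySRRegressor(niterations=5, binary_operators=["+", "*", "/"])
model.fit(
    X,
    y,
    variable_names=["distance", "time"],
    X_units=["m", "s"],
    y_units="m/s",
)
print(model.get_best()["equation"])
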
@@ -2003,7 +2032,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
             # reordered/reindexed to match those of the transformed (denoised and
             # feature selected) X in fit.
             X = X.reindex(columns=self.feature_names_in_)
-        X = self.
+        X = self._validate_data_X(X)
 
         try:
             if isinstance(best_equation, list):
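
Reindexing the DataFrame columns to `feature_names_in_` before validation guarantees that predict sees columns in the same order as fit, regardless of how the caller ordered them. A generic pandas illustration of that step:

import pandas as pd

feature_names_in_ = ["x0", "x1", "x2"]

# Caller passes the columns in a different order than was used in fit.
X_new = pd.DataFrame({"x2": [3.0], "x0": [1.0], "x1": [2.0]})

# Reindex restores the fitted order (missing columns would become NaN).
X_new = X_new.reindex(columns=feature_names_in_)
print(list(X_new.columns))  # ['x0', 'x1', 'x2']
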