Spaces:
Running
Running
Enable dimensional constraints
Browse files- pysr/sr.py +85 -8
pysr/sr.py
CHANGED
|
@@ -167,6 +167,8 @@ def _check_assertions(
|
|
| 167 |
variable_names,
|
| 168 |
weights,
|
| 169 |
y,
|
|
|
|
|
|
|
| 170 |
):
|
| 171 |
# Check for potential errors before they happen
|
| 172 |
assert len(X.shape) == 2
|
|
@@ -190,6 +192,24 @@ def _check_assertions(
|
|
| 190 |
"Only alphanumeric characters, numbers, "
|
| 191 |
"and underscores are allowed."
|
| 192 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
|
| 195 |
def best(*args, **kwargs): # pragma: no cover
|
|
@@ -635,6 +655,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 635 |
has feature names that are all strings.
|
| 636 |
pretty_feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
| 637 |
Pretty names of features, used only during printing.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 638 |
nout_ : int
|
| 639 |
Number of output dimensions.
|
| 640 |
selection_mask_ : list[int] of length `select_k_features`
|
|
@@ -1324,7 +1348,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1324 |
|
| 1325 |
return packed_modified_params
|
| 1326 |
|
| 1327 |
-
def _validate_and_set_fit_params(
|
|
|
|
|
|
|
| 1328 |
"""
|
| 1329 |
Validate the parameters passed to the :term:`fit` method.
|
| 1330 |
|
|
@@ -1346,6 +1372,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1346 |
for that particular element of y.
|
| 1347 |
variable_names : list[str] of length n_features
|
| 1348 |
Names of each variable in the training dataset, `X`.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1349 |
|
| 1350 |
Returns
|
| 1351 |
-------
|
|
@@ -1357,6 +1387,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1357 |
Validated resampled training data used for denoising.
|
| 1358 |
variable_names_validated : list[str] of length n_features
|
| 1359 |
Validated list of variable names for each feature in `X`.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1360 |
|
| 1361 |
"""
|
| 1362 |
if isinstance(X, pd.DataFrame):
|
|
@@ -1415,10 +1449,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1415 |
else:
|
| 1416 |
raise NotImplementedError("y shape not supported!")
|
| 1417 |
|
| 1418 |
-
|
|
|
|
|
|
|
|
|
|
| 1419 |
|
| 1420 |
def _pre_transform_training_data(
|
| 1421 |
-
self, X, y, Xresampled, variable_names, random_state
|
| 1422 |
):
|
| 1423 |
"""
|
| 1424 |
Transform the training data before fitting the symbolic regressor.
|
|
@@ -1438,6 +1475,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1438 |
variable_names : list[str]
|
| 1439 |
Names of each variable in the training dataset, `X`.
|
| 1440 |
Of length `n_features`.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1441 |
random_state : int | np.RandomState
|
| 1442 |
Pass an int for reproducible results across multiple function calls.
|
| 1443 |
See :term:`Glossary <random_state>`. Default is `None`.
|
|
@@ -1459,6 +1500,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1459 |
variable_names_transformed : list[str] of length n_features
|
| 1460 |
Names of each variable in the transformed dataset,
|
| 1461 |
`X_transformed`.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1462 |
"""
|
| 1463 |
# Feature selection transformation
|
| 1464 |
if self.select_k_features:
|
|
@@ -1473,6 +1518,13 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1473 |
# Reduce variable_names to selection
|
| 1474 |
variable_names = [variable_names[i] for i in self.selection_mask_]
|
| 1475 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1476 |
# Re-perform data validation and feature name updating
|
| 1477 |
X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
|
| 1478 |
# Update feature names with selected variable names
|
|
@@ -1497,7 +1549,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1497 |
else:
|
| 1498 |
X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state)
|
| 1499 |
|
| 1500 |
-
return X, y, variable_names
|
| 1501 |
|
| 1502 |
def _run(self, X, y, mutated_params, weights, seed):
|
| 1503 |
"""
|
|
@@ -1733,6 +1785,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1733 |
and self.pretty_feature_names_in_ is not None
|
| 1734 |
else self.feature_names_in_.tolist()
|
| 1735 |
),
|
|
|
|
|
|
|
| 1736 |
options=options,
|
| 1737 |
numprocs=cprocs,
|
| 1738 |
parallelism=parallelism,
|
|
@@ -1758,6 +1812,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1758 |
Xresampled=None,
|
| 1759 |
weights=None,
|
| 1760 |
variable_names=None,
|
|
|
|
|
|
|
| 1761 |
):
|
| 1762 |
"""
|
| 1763 |
Search for equations to fit the dataset and store them in `self.equations_`.
|
|
@@ -1785,6 +1841,15 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1785 |
instead of `variable_names`. Cannot contain spaces or special
|
| 1786 |
characters. Avoid variable names which are also
|
| 1787 |
function names in `sympy`, such as "N".
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1788 |
|
| 1789 |
Returns
|
| 1790 |
-------
|
|
@@ -1806,6 +1871,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1806 |
self.nout_ = 1
|
| 1807 |
self.selection_mask_ = None
|
| 1808 |
self.raw_julia_state_ = None
|
|
|
|
|
|
|
| 1809 |
|
| 1810 |
random_state = check_random_state(self.random_state) # For np random
|
| 1811 |
seed = random_state.get_state()[1][0] # For julia random
|
|
@@ -1814,8 +1881,16 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1814 |
|
| 1815 |
mutated_params = self._validate_and_set_init_params()
|
| 1816 |
|
| 1817 |
-
|
| 1818 |
-
X,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1819 |
)
|
| 1820 |
|
| 1821 |
if X.shape[0] > 10000 and not self.batching:
|
|
@@ -1830,8 +1905,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1830 |
)
|
| 1831 |
|
| 1832 |
# Pre transformations (feature selection and denoising)
|
| 1833 |
-
X, y, variable_names = self._pre_transform_training_data(
|
| 1834 |
-
X, y, Xresampled, variable_names, random_state
|
| 1835 |
)
|
| 1836 |
|
| 1837 |
# Warn about large feature counts (still warn if feature count is large
|
|
@@ -1860,6 +1935,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1860 |
variable_names,
|
| 1861 |
weights,
|
| 1862 |
y,
|
|
|
|
|
|
|
| 1863 |
)
|
| 1864 |
|
| 1865 |
# Initially, just save model parameters, so that
|
|
|
|
| 167 |
variable_names,
|
| 168 |
weights,
|
| 169 |
y,
|
| 170 |
+
X_units,
|
| 171 |
+
y_units,
|
| 172 |
):
|
| 173 |
# Check for potential errors before they happen
|
| 174 |
assert len(X.shape) == 2
|
|
|
|
| 192 |
"Only alphanumeric characters, numbers, "
|
| 193 |
"and underscores are allowed."
|
| 194 |
)
|
| 195 |
+
if X_units is not None and len(X_units) != X.shape[1]:
|
| 196 |
+
raise ValueError(
|
| 197 |
+
"The number of units in `X_units` must equal the number of features in `X`."
|
| 198 |
+
)
|
| 199 |
+
if y_units is not None:
|
| 200 |
+
good_y_units = False
|
| 201 |
+
if isinstance(y_units, list):
|
| 202 |
+
if len(y.shape) == 1:
|
| 203 |
+
good_y_units = len(y_units) == 1
|
| 204 |
+
else:
|
| 205 |
+
good_y_units = len(y_units) == y.shape[1]
|
| 206 |
+
else:
|
| 207 |
+
good_y_units = len(y.shape) == 1 or y.shape[1] == 1
|
| 208 |
+
|
| 209 |
+
if not good_y_units:
|
| 210 |
+
raise ValueError(
|
| 211 |
+
"The number of units in `y_units` must equal the number of output features in `y`."
|
| 212 |
+
)
|
| 213 |
|
| 214 |
|
| 215 |
def best(*args, **kwargs): # pragma: no cover
|
|
|
|
| 655 |
has feature names that are all strings.
|
| 656 |
pretty_feature_names_in_ : ndarray of shape (`n_features_in_`,)
|
| 657 |
Pretty names of features, used only during printing.
|
| 658 |
+
X_units_ : list[str] of length n_features
|
| 659 |
+
Units of each variable in the training dataset, `X`.
|
| 660 |
+
y_units_ : str | list[str] of length n_out
|
| 661 |
+
Units of each target variable in the training dataset, `y`.
|
| 662 |
nout_ : int
|
| 663 |
Number of output dimensions.
|
| 664 |
selection_mask_ : list[int] of length `select_k_features`
|
|
|
|
| 1348 |
|
| 1349 |
return packed_modified_params
|
| 1350 |
|
| 1351 |
+
def _validate_and_set_fit_params(
|
| 1352 |
+
self, X, y, Xresampled, weights, variable_names, X_units, y_units
|
| 1353 |
+
):
|
| 1354 |
"""
|
| 1355 |
Validate the parameters passed to the :term:`fit` method.
|
| 1356 |
|
|
|
|
| 1372 |
for that particular element of y.
|
| 1373 |
variable_names : list[str] of length n_features
|
| 1374 |
Names of each variable in the training dataset, `X`.
|
| 1375 |
+
X_units : list[str] of length n_features
|
| 1376 |
+
Units of each variable in the training dataset, `X`.
|
| 1377 |
+
y_units : str | list[str] of length n_out
|
| 1378 |
+
Units of each target variable in the training dataset, `y`.
|
| 1379 |
|
| 1380 |
Returns
|
| 1381 |
-------
|
|
|
|
| 1387 |
Validated resampled training data used for denoising.
|
| 1388 |
variable_names_validated : list[str] of length n_features
|
| 1389 |
Validated list of variable names for each feature in `X`.
|
| 1390 |
+
X_units : list[str] of length n_features
|
| 1391 |
+
Validated units for `X`.
|
| 1392 |
+
y_units : str | list[str] of length n_out
|
| 1393 |
+
Validated units for `y`.
|
| 1394 |
|
| 1395 |
"""
|
| 1396 |
if isinstance(X, pd.DataFrame):
|
|
|
|
| 1449 |
else:
|
| 1450 |
raise NotImplementedError("y shape not supported!")
|
| 1451 |
|
| 1452 |
+
self.X_units_ = copy.deepcopy(X_units)
|
| 1453 |
+
self.y_units_ = copy.deepcopy(y_units)
|
| 1454 |
+
|
| 1455 |
+
return X, y, Xresampled, weights, variable_names, X_units, y_units
|
| 1456 |
|
| 1457 |
def _pre_transform_training_data(
|
| 1458 |
+
self, X, y, Xresampled, variable_names, X_units, y_units, random_state
|
| 1459 |
):
|
| 1460 |
"""
|
| 1461 |
Transform the training data before fitting the symbolic regressor.
|
|
|
|
| 1475 |
variable_names : list[str]
|
| 1476 |
Names of each variable in the training dataset, `X`.
|
| 1477 |
Of length `n_features`.
|
| 1478 |
+
X_units : list[str]
|
| 1479 |
+
Units of each variable in the training dataset, `X`.
|
| 1480 |
+
y_units : str | list[str]
|
| 1481 |
+
Units of each target variable in the training dataset, `y`.
|
| 1482 |
random_state : int | np.RandomState
|
| 1483 |
Pass an int for reproducible results across multiple function calls.
|
| 1484 |
See :term:`Glossary <random_state>`. Default is `None`.
|
|
|
|
| 1500 |
variable_names_transformed : list[str] of length n_features
|
| 1501 |
Names of each variable in the transformed dataset,
|
| 1502 |
`X_transformed`.
|
| 1503 |
+
X_units_transformed : list[str] of length n_features
|
| 1504 |
+
Units of each variable in the transformed dataset, `X_transformed`.
|
| 1505 |
+
y_units_transformed : str | list[str] of length n_out
|
| 1506 |
+
Units of each target variable in the transformed dataset.
|
| 1507 |
"""
|
| 1508 |
# Feature selection transformation
|
| 1509 |
if self.select_k_features:
|
|
|
|
| 1518 |
# Reduce variable_names to selection
|
| 1519 |
variable_names = [variable_names[i] for i in self.selection_mask_]
|
| 1520 |
|
| 1521 |
+
if X_units is not None:
|
| 1522 |
+
X_units = [X_units[i] for i in self.selection_mask_]
|
| 1523 |
+
self.X_units_ = copy.deepcopy(X_units)
|
| 1524 |
+
if y_units is not None:
|
| 1525 |
+
y_units = [y_units[i] for i in self.selection_mask_]
|
| 1526 |
+
self.y_units_ = copy.deepcopy(y_units)
|
| 1527 |
+
|
| 1528 |
# Re-perform data validation and feature name updating
|
| 1529 |
X, y = self._validate_data(X=X, y=y, reset=True, multi_output=True)
|
| 1530 |
# Update feature names with selected variable names
|
|
|
|
| 1549 |
else:
|
| 1550 |
X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state)
|
| 1551 |
|
| 1552 |
+
return X, y, variable_names, X_units, y_units
|
| 1553 |
|
| 1554 |
def _run(self, X, y, mutated_params, weights, seed):
|
| 1555 |
"""
|
|
|
|
| 1785 |
and self.pretty_feature_names_in_ is not None
|
| 1786 |
else self.feature_names_in_.tolist()
|
| 1787 |
),
|
| 1788 |
+
X_units=self.X_units_,
|
| 1789 |
+
y_units=self.y_units_,
|
| 1790 |
options=options,
|
| 1791 |
numprocs=cprocs,
|
| 1792 |
parallelism=parallelism,
|
|
|
|
| 1812 |
Xresampled=None,
|
| 1813 |
weights=None,
|
| 1814 |
variable_names=None,
|
| 1815 |
+
X_units=None,
|
| 1816 |
+
y_units=None,
|
| 1817 |
):
|
| 1818 |
"""
|
| 1819 |
Search for equations to fit the dataset and store them in `self.equations_`.
|
|
|
|
| 1841 |
instead of `variable_names`. Cannot contain spaces or special
|
| 1842 |
characters. Avoid variable names which are also
|
| 1843 |
function names in `sympy`, such as "N".
|
| 1844 |
+
X_units : list[str]
|
| 1845 |
+
A list of units for each variable in `X`. Each unit should be
|
| 1846 |
+
a string representing a Julia expression. See DynamicQuantities.jl
|
| 1847 |
+
https://symbolicml.org/DynamicQuantities.jl/dev/units/ for more
|
| 1848 |
+
information.
|
| 1849 |
+
y_units : str | list[str]
|
| 1850 |
+
Similar to `X_units`, but as a unit for the target variable, `y`.
|
| 1851 |
+
If `y` is a matrix, a list of units should be passed. If `X_units`
|
| 1852 |
+
is given but `y_units` is not, then `y_units` will be arbitrary.
|
| 1853 |
|
| 1854 |
Returns
|
| 1855 |
-------
|
|
|
|
| 1871 |
self.nout_ = 1
|
| 1872 |
self.selection_mask_ = None
|
| 1873 |
self.raw_julia_state_ = None
|
| 1874 |
+
self.X_units_ = None
|
| 1875 |
+
self.y_units_ = None
|
| 1876 |
|
| 1877 |
random_state = check_random_state(self.random_state) # For np random
|
| 1878 |
seed = random_state.get_state()[1][0] # For julia random
|
|
|
|
| 1881 |
|
| 1882 |
mutated_params = self._validate_and_set_init_params()
|
| 1883 |
|
| 1884 |
+
(
|
| 1885 |
+
X,
|
| 1886 |
+
y,
|
| 1887 |
+
Xresampled,
|
| 1888 |
+
weights,
|
| 1889 |
+
variable_names,
|
| 1890 |
+
X_units,
|
| 1891 |
+
y_units,
|
| 1892 |
+
) = self._validate_and_set_fit_params(
|
| 1893 |
+
X, y, Xresampled, weights, variable_names, X_units, y_units
|
| 1894 |
)
|
| 1895 |
|
| 1896 |
if X.shape[0] > 10000 and not self.batching:
|
|
|
|
| 1905 |
)
|
| 1906 |
|
| 1907 |
# Pre transformations (feature selection and denoising)
|
| 1908 |
+
X, y, variable_names, X_units, y_units = self._pre_transform_training_data(
|
| 1909 |
+
X, y, Xresampled, variable_names, X_units, y_units, random_state
|
| 1910 |
)
|
| 1911 |
|
| 1912 |
# Warn about large feature counts (still warn if feature count is large
|
|
|
|
| 1935 |
variable_names,
|
| 1936 |
weights,
|
| 1937 |
y,
|
| 1938 |
+
X_units,
|
| 1939 |
+
y_units,
|
| 1940 |
)
|
| 1941 |
|
| 1942 |
# Initially, just save model parameters, so that
|