Spaces:
Running
Running
fix: variety of typing information
Browse files
- pysr/denoising.py +17 -4
- pysr/feature_selection.py +19 -3
- pysr/julia_helpers.py +4 -0
- pysr/julia_import.py +3 -2
- pysr/sr.py +41 -26
pysr/denoising.py
CHANGED
|
@@ -1,9 +1,17 @@
|
|
| 1 |
"""Functions for denoising data during preprocessing."""
|
| 2 |
|
|
|
|
|
|
|
| 3 |
import numpy as np
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
-
def denoise(X, y, Xresampled=None, random_state=None):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
"""Denoise the dataset using a Gaussian process."""
|
| 8 |
from sklearn.gaussian_process import GaussianProcessRegressor
|
| 9 |
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
|
|
@@ -15,12 +23,17 @@ def denoise(X, y, Xresampled=None, random_state=None):
|
|
| 15 |
gpr.fit(X, y)
|
| 16 |
|
| 17 |
if Xresampled is not None:
|
| 18 |
-
return Xresampled, gpr.predict(Xresampled)
|
| 19 |
|
| 20 |
-
return X, gpr.predict(X)
|
| 21 |
|
| 22 |
|
| 23 |
-
def multi_denoise(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
"""Perform `denoise` along each column of `y` independently."""
|
| 25 |
y = np.stack(
|
| 26 |
[
|
|
|
|
| 1 |
"""Functions for denoising data during preprocessing."""
|
| 2 |
|
| 3 |
+
from typing import Optional, Tuple, cast
|
| 4 |
+
|
| 5 |
import numpy as np
|
| 6 |
+
from numpy import ndarray
|
| 7 |
|
| 8 |
|
| 9 |
+
def denoise(
|
| 10 |
+
X: ndarray,
|
| 11 |
+
y: ndarray,
|
| 12 |
+
Xresampled: Optional[ndarray] = None,
|
| 13 |
+
random_state: Optional[np.random.RandomState] = None,
|
| 14 |
+
) -> Tuple[ndarray, ndarray]:
|
| 15 |
"""Denoise the dataset using a Gaussian process."""
|
| 16 |
from sklearn.gaussian_process import GaussianProcessRegressor
|
| 17 |
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
|
|
|
|
| 23 |
gpr.fit(X, y)
|
| 24 |
|
| 25 |
if Xresampled is not None:
|
| 26 |
+
return Xresampled, cast(ndarray, gpr.predict(Xresampled))
|
| 27 |
|
| 28 |
+
return X, cast(ndarray, gpr.predict(X))
|
| 29 |
|
| 30 |
|
| 31 |
+
def multi_denoise(
|
| 32 |
+
X: ndarray,
|
| 33 |
+
y: ndarray,
|
| 34 |
+
Xresampled: Optional[ndarray] = None,
|
| 35 |
+
random_state: Optional[np.random.RandomState] = None,
|
| 36 |
+
):
|
| 37 |
"""Perform `denoise` along each column of `y` independently."""
|
| 38 |
y = np.stack(
|
| 39 |
[
|
pysr/feature_selection.py
CHANGED
|
@@ -1,9 +1,20 @@
|
|
| 1 |
"""Functions for doing feature selection during preprocessing."""
|
| 2 |
|
|
|
|
|
|
|
| 3 |
import numpy as np
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
-
def run_feature_selection(X, y, select_k_features, random_state=None):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
"""
|
| 8 |
Find most important features.
|
| 9 |
|
|
@@ -21,11 +32,16 @@ def run_feature_selection(X, y, select_k_features, random_state=None):
|
|
| 21 |
selector = SelectFromModel(
|
| 22 |
clf, threshold=-np.inf, max_features=select_k_features, prefit=True
|
| 23 |
)
|
| 24 |
-
return selector.get_support(indices=True)
|
| 25 |
|
| 26 |
|
| 27 |
# Function has not been removed only due to usage in module tests
|
| 28 |
-
def _handle_feature_selection(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
if select_k_features is not None:
|
| 30 |
selection = run_feature_selection(X, y, select_k_features)
|
| 31 |
print(f"Using features {[variable_names[i] for i in selection]}")
|
|
|
|
| 1 |
"""Functions for doing feature selection during preprocessing."""
|
| 2 |
|
| 3 |
+
from typing import Optional, cast
|
| 4 |
+
|
| 5 |
import numpy as np
|
| 6 |
+
from numpy import ndarray
|
| 7 |
+
from numpy.typing import NDArray
|
| 8 |
+
|
| 9 |
+
from .utils import ArrayLike
|
| 10 |
|
| 11 |
|
| 12 |
+
def run_feature_selection(
|
| 13 |
+
X: ndarray,
|
| 14 |
+
y: ndarray,
|
| 15 |
+
select_k_features: int,
|
| 16 |
+
random_state: Optional[np.random.RandomState] = None,
|
| 17 |
+
) -> NDArray[np.intp]:
|
| 18 |
"""
|
| 19 |
Find most important features.
|
| 20 |
|
|
|
|
| 32 |
selector = SelectFromModel(
|
| 33 |
clf, threshold=-np.inf, max_features=select_k_features, prefit=True
|
| 34 |
)
|
| 35 |
+
return cast(NDArray[np.intp], selector.get_support(indices=True))
|
| 36 |
|
| 37 |
|
| 38 |
# Function has not been removed only due to usage in module tests
|
| 39 |
+
def _handle_feature_selection(
|
| 40 |
+
X: ndarray,
|
| 41 |
+
select_k_features: Optional[int],
|
| 42 |
+
y: ndarray,
|
| 43 |
+
variable_names: ArrayLike[str],
|
| 44 |
+
):
|
| 45 |
if select_k_features is not None:
|
| 46 |
selection = run_feature_selection(X, y, select_k_features)
|
| 47 |
print(f"Using features {[variable_names[i] for i in selection]}")
|
pysr/julia_helpers.py
CHANGED
|
@@ -1,11 +1,15 @@
|
|
| 1 |
"""Functions for initializing the Julia environment and installing deps."""
|
| 2 |
|
|
|
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
from juliacall import convert as jl_convert # type: ignore
|
| 5 |
|
| 6 |
from .deprecated import init_julia, install
|
| 7 |
from .julia_import import jl
|
| 8 |
|
|
|
|
|
|
|
| 9 |
jl.seval("using Serialization: Serialization")
|
| 10 |
jl.seval("using PythonCall: PythonCall")
|
| 11 |
|
|
|
|
| 1 |
"""Functions for initializing the Julia environment and installing deps."""
|
| 2 |
|
| 3 |
+
from typing import Any, Callable, cast
|
| 4 |
+
|
| 5 |
import numpy as np
|
| 6 |
from juliacall import convert as jl_convert # type: ignore
|
| 7 |
|
| 8 |
from .deprecated import init_julia, install
|
| 9 |
from .julia_import import jl
|
| 10 |
|
| 11 |
+
jl_convert = cast(Callable[[Any, Any], Any], jl_convert)
|
| 12 |
+
|
| 13 |
jl.seval("using Serialization: Serialization")
|
| 14 |
jl.seval("using PythonCall: PythonCall")
|
| 15 |
|
pysr/julia_import.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
| 1 |
import os
|
| 2 |
import sys
|
| 3 |
import warnings
|
| 4 |
-
from
|
|
|
|
| 5 |
|
| 6 |
# Check if JuliaCall is already loaded, and if so, warn the user
|
| 7 |
# about the relevant environment variables. If not loaded,
|
|
@@ -43,7 +44,7 @@ if autoload_extensions is not None:
|
|
| 43 |
|
| 44 |
from juliacall import Main as jl # type: ignore
|
| 45 |
|
| 46 |
-
jl
|
| 47 |
|
| 48 |
|
| 49 |
jl_version = (jl.VERSION.major, jl.VERSION.minor, jl.VERSION.patch)
|
|
|
|
| 1 |
import os
|
| 2 |
import sys
|
| 3 |
import warnings
|
| 4 |
+
from types import ModuleType
|
| 5 |
+
from typing import cast
|
| 6 |
|
| 7 |
# Check if JuliaCall is already loaded, and if so, warn the user
|
| 8 |
# about the relevant environment variables. If not loaded,
|
|
|
|
| 44 |
|
| 45 |
from juliacall import Main as jl # type: ignore
|
| 46 |
|
| 47 |
+
jl = cast(ModuleType, jl)
|
| 48 |
|
| 49 |
|
| 50 |
jl_version = (jl.VERSION.major, jl.VERSION.minor, jl.VERSION.patch)
|
pysr/sr.py
CHANGED
|
@@ -679,7 +679,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 679 |
X_units_: Optional[ArrayLike[str]]
|
| 680 |
y_units_: Optional[Union[str, ArrayLike[str]]]
|
| 681 |
nout_: int
|
| 682 |
-
selection_mask_: Optional[NDArray[np.
|
| 683 |
tempdir_: Path
|
| 684 |
equation_file_: Union[str, Path]
|
| 685 |
julia_state_stream_: Optional[NDArray[np.uint8]]
|
|
@@ -921,12 +921,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 921 |
cls,
|
| 922 |
equation_file,
|
| 923 |
*,
|
| 924 |
-
binary_operators=None,
|
| 925 |
-
unary_operators=None,
|
| 926 |
-
n_features_in=None,
|
| 927 |
-
feature_names_in=None,
|
| 928 |
-
selection_mask=None,
|
| 929 |
-
nout=1,
|
| 930 |
**pysr_kwargs,
|
| 931 |
):
|
| 932 |
"""
|
|
@@ -949,7 +949,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 949 |
feature_names_in : list[str]
|
| 950 |
Names of the features passed to the model.
|
| 951 |
Not needed if loading from a pickle file.
|
| 952 |
-
selection_mask :
|
| 953 |
If using `select_k_features`, you must pass `model.selection_mask_` here.
|
| 954 |
Not needed if loading from a pickle file.
|
| 955 |
nout : int
|
|
@@ -1021,7 +1021,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1021 |
model.display_feature_names_in_ = feature_names_in
|
| 1022 |
|
| 1023 |
if selection_mask is None:
|
| 1024 |
-
model.selection_mask_ = np.
|
| 1025 |
else:
|
| 1026 |
model.selection_mask_ = selection_mask
|
| 1027 |
|
|
@@ -1197,19 +1197,22 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1197 |
), "With multiple output features, index must be a list."
|
| 1198 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
| 1199 |
elif isinstance(self.equations_, pd.DataFrame):
|
| 1200 |
-
return self.equations_.iloc[index]
|
| 1201 |
else:
|
| 1202 |
raise ValueError("No equations have been generated yet.")
|
| 1203 |
|
| 1204 |
if isinstance(self.equations_, list):
|
| 1205 |
return [
|
| 1206 |
-
eq.loc[idx_model_selection(eq, self.model_selection)]
|
| 1207 |
for eq in self.equations_
|
| 1208 |
]
|
| 1209 |
elif isinstance(self.equations_, pd.DataFrame):
|
| 1210 |
-
return
|
| 1211 |
-
|
| 1212 |
-
|
|
|
|
|
|
|
|
|
|
| 1213 |
else:
|
| 1214 |
raise ValueError("No equations have been generated yet.")
|
| 1215 |
|
|
@@ -1351,7 +1354,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1351 |
ndarray,
|
| 1352 |
Optional[ndarray],
|
| 1353 |
Optional[ndarray],
|
| 1354 |
-
|
| 1355 |
Optional[ArrayLike[str]],
|
| 1356 |
Optional[Union[str, ArrayLike[str]]],
|
| 1357 |
]:
|
|
@@ -1459,13 +1462,22 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1459 |
return X, y, Xresampled, weights, variable_names, X_units, y_units
|
| 1460 |
|
| 1461 |
def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
|
| 1462 |
-
|
|
|
|
| 1463 |
|
| 1464 |
def _validate_data_X(self, X) -> Tuple[ndarray]:
|
| 1465 |
-
|
|
|
|
| 1466 |
|
| 1467 |
def _pre_transform_training_data(
|
| 1468 |
-
self,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1469 |
):
|
| 1470 |
"""
|
| 1471 |
Transform the training data before fitting the symbolic regressor.
|
|
@@ -1474,12 +1486,12 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1474 |
|
| 1475 |
Parameters
|
| 1476 |
----------
|
| 1477 |
-
X : ndarray
|
| 1478 |
Training data of shape (n_samples, n_features).
|
| 1479 |
-
y : ndarray
|
| 1480 |
Target values of shape (n_samples,) or (n_samples, n_targets).
|
| 1481 |
Will be cast to X's dtype if necessary.
|
| 1482 |
-
Xresampled : ndarray |
|
| 1483 |
Resampled training data, of shape `(n_resampled, n_features)`,
|
| 1484 |
used for denoising.
|
| 1485 |
variable_names : list[str]
|
|
@@ -1517,24 +1529,27 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
|
|
| 1517 |
"""
|
| 1518 |
# Feature selection transformation
|
| 1519 |
if self.select_k_features:
|
| 1520 |
-
|
| 1521 |
X, y, self.select_k_features, random_state=random_state
|
| 1522 |
)
|
| 1523 |
-
X = X[:,
|
| 1524 |
|
| 1525 |
if Xresampled is not None:
|
| 1526 |
-
Xresampled = Xresampled[:,
|
| 1527 |
|
| 1528 |
# Reduce variable_names to selection
|
| 1529 |
-
variable_names =
|
|
|
|
|
|
|
| 1530 |
|
| 1531 |
if X_units is not None:
|
| 1532 |
-
X_units = [X_units[i] for i in
|
| 1533 |
self.X_units_ = copy.deepcopy(X_units)
|
| 1534 |
|
| 1535 |
# Re-perform data validation and feature name updating
|
| 1536 |
X, y = self._validate_data_X_y(X, y)
|
| 1537 |
# Update feature names with selected variable names
|
|
|
|
| 1538 |
self.feature_names_in_ = _check_feature_names_in(self, variable_names)
|
| 1539 |
self.display_feature_names_in_ = self.feature_names_in_
|
| 1540 |
print(f"Using features {self.feature_names_in_}")
|
|
|
|
| 679 |
X_units_: Optional[ArrayLike[str]]
|
| 680 |
y_units_: Optional[Union[str, ArrayLike[str]]]
|
| 681 |
nout_: int
|
| 682 |
+
selection_mask_: Optional[NDArray[np.intp]]
|
| 683 |
tempdir_: Path
|
| 684 |
equation_file_: Union[str, Path]
|
| 685 |
julia_state_stream_: Optional[NDArray[np.uint8]]
|
|
|
|
| 921 |
cls,
|
| 922 |
equation_file,
|
| 923 |
*,
|
| 924 |
+
binary_operators: Optional[List[str]] = None,
|
| 925 |
+
unary_operators: Optional[List[str]] = None,
|
| 926 |
+
n_features_in: Optional[int] = None,
|
| 927 |
+
feature_names_in: Optional[ArrayLike[str]] = None,
|
| 928 |
+
selection_mask: Optional[NDArray[np.intp]] = None,
|
| 929 |
+
nout: int = 1,
|
| 930 |
**pysr_kwargs,
|
| 931 |
):
|
| 932 |
"""
|
|
|
|
| 949 |
feature_names_in : list[str]
|
| 950 |
Names of the features passed to the model.
|
| 951 |
Not needed if loading from a pickle file.
|
| 952 |
+
selection_mask : NDArray[np.intp]
|
| 953 |
If using `select_k_features`, you must pass `model.selection_mask_` here.
|
| 954 |
Not needed if loading from a pickle file.
|
| 955 |
nout : int
|
|
|
|
| 1021 |
model.display_feature_names_in_ = feature_names_in
|
| 1022 |
|
| 1023 |
if selection_mask is None:
|
| 1024 |
+
model.selection_mask_ = np.arange(n_features_in, dtype=np.intp)
|
| 1025 |
else:
|
| 1026 |
model.selection_mask_ = selection_mask
|
| 1027 |
|
|
|
|
| 1197 |
), "With multiple output features, index must be a list."
|
| 1198 |
return [eq.iloc[i] for eq, i in zip(self.equations_, index)]
|
| 1199 |
elif isinstance(self.equations_, pd.DataFrame):
|
| 1200 |
+
return cast(pd.Series, self.equations_.iloc[index])
|
| 1201 |
else:
|
| 1202 |
raise ValueError("No equations have been generated yet.")
|
| 1203 |
|
| 1204 |
if isinstance(self.equations_, list):
|
| 1205 |
return [
|
| 1206 |
+
cast(pd.Series, eq.loc[idx_model_selection(eq, self.model_selection)])
|
| 1207 |
for eq in self.equations_
|
| 1208 |
]
|
| 1209 |
elif isinstance(self.equations_, pd.DataFrame):
|
| 1210 |
+
return cast(
|
| 1211 |
+
pd.Series,
|
| 1212 |
+
self.equations_.loc[
|
| 1213 |
+
idx_model_selection(self.equations_, self.model_selection)
|
| 1214 |
+
],
|
| 1215 |
+
)
|
| 1216 |
else:
|
| 1217 |
raise ValueError("No equations have been generated yet.")
|
| 1218 |
|
|
|
|
| 1354 |
ndarray,
|
| 1355 |
Optional[ndarray],
|
| 1356 |
Optional[ndarray],
|
| 1357 |
+
ArrayLike[str],
|
| 1358 |
Optional[ArrayLike[str]],
|
| 1359 |
Optional[Union[str, ArrayLike[str]]],
|
| 1360 |
]:
|
|
|
|
| 1462 |
return X, y, Xresampled, weights, variable_names, X_units, y_units
|
| 1463 |
|
| 1464 |
def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
|
| 1465 |
+
raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True) # type: ignore
|
| 1466 |
+
return cast(Tuple[ndarray, ndarray], raw_out)
|
| 1467 |
|
| 1468 |
def _validate_data_X(self, X) -> Tuple[ndarray]:
|
| 1469 |
+
raw_out = self._validate_data(X=X, reset=False) # type: ignore
|
| 1470 |
+
return cast(Tuple[ndarray], raw_out)
|
| 1471 |
|
| 1472 |
def _pre_transform_training_data(
|
| 1473 |
+
self,
|
| 1474 |
+
X: ndarray,
|
| 1475 |
+
y: ndarray,
|
| 1476 |
+
Xresampled: Union[ndarray, None],
|
| 1477 |
+
variable_names: ArrayLike[str],
|
| 1478 |
+
X_units: Union[ArrayLike[str], None],
|
| 1479 |
+
y_units: Union[ArrayLike[str], str, None],
|
| 1480 |
+
random_state: np.random.RandomState,
|
| 1481 |
):
|
| 1482 |
"""
|
| 1483 |
Transform the training data before fitting the symbolic regressor.
|
|
|
|
| 1486 |
|
| 1487 |
Parameters
|
| 1488 |
----------
|
| 1489 |
+
X : ndarray
|
| 1490 |
Training data of shape (n_samples, n_features).
|
| 1491 |
+
y : ndarray
|
| 1492 |
Target values of shape (n_samples,) or (n_samples, n_targets).
|
| 1493 |
Will be cast to X's dtype if necessary.
|
| 1494 |
+
Xresampled : ndarray | None
|
| 1495 |
Resampled training data, of shape `(n_resampled, n_features)`,
|
| 1496 |
used for denoising.
|
| 1497 |
variable_names : list[str]
|
|
|
|
| 1529 |
"""
|
| 1530 |
# Feature selection transformation
|
| 1531 |
if self.select_k_features:
|
| 1532 |
+
selection_mask = run_feature_selection(
|
| 1533 |
X, y, self.select_k_features, random_state=random_state
|
| 1534 |
)
|
| 1535 |
+
X = X[:, selection_mask]
|
| 1536 |
|
| 1537 |
if Xresampled is not None:
|
| 1538 |
+
Xresampled = Xresampled[:, selection_mask]
|
| 1539 |
|
| 1540 |
# Reduce variable_names to selection
|
| 1541 |
+
variable_names = cast(
|
| 1542 |
+
ArrayLike[str], [variable_names[i] for i in selection_mask]
|
| 1543 |
+
)
|
| 1544 |
|
| 1545 |
if X_units is not None:
|
| 1546 |
+
X_units = cast(ArrayLike[str], [X_units[i] for i in selection_mask])
|
| 1547 |
self.X_units_ = copy.deepcopy(X_units)
|
| 1548 |
|
| 1549 |
# Re-perform data validation and feature name updating
|
| 1550 |
X, y = self._validate_data_X_y(X, y)
|
| 1551 |
# Update feature names with selected variable names
|
| 1552 |
+
self.selection_mask_ = selection_mask
|
| 1553 |
self.feature_names_in_ = _check_feature_names_in(self, variable_names)
|
| 1554 |
self.display_feature_names_in_ = self.feature_names_in_
|
| 1555 |
print(f"Using features {self.feature_names_in_}")
|