Commit · c822df8
1 Parent(s): 12e6d5e

Move feature selection functionality to separate file

- pysr/feature_selection.py  +35 -0
- pysr/sr.py  +1 -33
- pysr/test/test.py  +3 -8
pysr/feature_selection.py
ADDED
@@ -0,0 +1,35 @@
+"""Functions for doing feature selection during preprocessing."""
+import numpy as np
+
+
+def run_feature_selection(X, y, select_k_features, random_state=None) -> np.ndarray:
+    """
+    Find most important features.
+
+    Uses a gradient boosting tree regressor as a proxy for finding
+    the k most important features in X, returning indices for those
+    features as output.
+    """
+    from sklearn.ensemble import RandomForestRegressor
+    from sklearn.feature_selection import SelectFromModel
+
+    clf = RandomForestRegressor(
+        n_estimators=100, max_depth=3, random_state=random_state
+    )
+    clf.fit(X, y)
+    selector = SelectFromModel(
+        clf, threshold=-np.inf, max_features=select_k_features, prefit=True
+    )
+    return selector.get_support(indices=True)
+
+
+# Function has not been removed only due to usage in module tests
+def _handle_feature_selection(X, select_k_features, y, variable_names):
+    if select_k_features is not None:
+        selection = run_feature_selection(X, y, select_k_features)
+        print(f"Using features {[variable_names[i] for i in selection]}")
+        X = X[:, selection]
+    else:
+        selection = None
+
+    return X, selection
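
A minimal usage sketch of the relocated helpers (not part of the commit; the toy data, the choice of k=2, and the variable names below are illustrative assumptions):

import numpy as np
from pysr.feature_selection import _handle_feature_selection, run_feature_selection

# Toy data: y depends only on the first and third of five features.
rng = np.random.RandomState(0)
X = rng.randn(100, 5)
y = 2.0 * X[:, 0] - 3.0 * X[:, 2]

# Indices of the k most important features, per the random-forest proxy.
selected = run_feature_selection(X, y, select_k_features=2, random_state=0)
print(selected)  # likely [0 2] on this synthetic data

# The wrapper additionally prints the chosen variable names and slices X.
X_sel, selection = _handle_feature_selection(
    X, 2, y, variable_names=["x0", "x1", "x2", "x3", "x4"]
)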
pysr/sr.py
CHANGED
@@ -25,6 +25,7 @@ from .export_latex import sympy2latex, sympy2latextable, sympy2multilatextable
 from .export_numpy import sympy2numpy
 from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
 from .export_torch import sympy2torch
+from .feature_selection import run_feature_selection
 from .julia_helpers import (
     _escape_filename,
     _load_backend,
@@ -2385,36 +2386,3 @@ def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int:
             f"{model_selection} is not a valid model selection strategy."
         )
     return chosen_idx
-
-
-# Function has not been removed only due to usage in module tests
-def _handle_feature_selection(X, select_k_features, y, variable_names):
-    if select_k_features is not None:
-        selection = run_feature_selection(X, y, select_k_features)
-        print(f"Using features {[variable_names[i] for i in selection]}")
-        X = X[:, selection]
-
-    else:
-        selection = None
-    return X, selection
-
-
-def run_feature_selection(X, y, select_k_features, random_state=None):
-    """
-    Find most important features.
-
-    Uses a gradient boosting tree regressor as a proxy for finding
-    the k most important features in X, returning indices for those
-    features as output.
-    """
-    from sklearn.ensemble import RandomForestRegressor
-    from sklearn.feature_selection import SelectFromModel
-
-    clf = RandomForestRegressor(
-        n_estimators=100, max_depth=3, random_state=random_state
-    )
-    clf.fit(X, y)
-    selector = SelectFromModel(
-        clf, threshold=-np.inf, max_features=select_k_features, prefit=True
-    )
-    return selector.get_support(indices=True)
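
For context, run_feature_selection is what backs the regressor's select_k_features option, which is why sr.py now imports it from the new module. A hedged end-to-end sketch, assuming typical PySRRegressor settings (the operators, iteration count, and data here are illustrative, not taken from the commit):

import numpy as np
from pysr import PySRRegressor

rng = np.random.RandomState(0)
X = rng.randn(200, 5)
y = X[:, 0] ** 2 - X[:, 3]

# select_k_features asks PySR to pre-filter the inputs with
# run_feature_selection() before handing them to the Julia backend.
model = PySRRegressor(
    niterations=5,
    select_k_features=2,
    binary_operators=["+", "-", "*"],
    unary_operators=["square"],
)
# model.fit(X, y)  # requires a working Julia / SymbolicRegression.jl install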
pysr/test/test.py
CHANGED
@@ -14,14 +14,9 @@ from sklearn.utils.estimator_checks import check_estimator
 
 from .. import PySRRegressor, julia_helpers
 from ..export_latex import sympy2latex
-from ..sr import (
-    _check_assertions,
-    _csv_filename_to_pkl_filename,
-    _handle_feature_selection,
-    _process_constraints,
-    idx_model_selection,
-    run_feature_selection,
-)
+from ..feature_selection import _handle_feature_selection, run_feature_selection
+from ..sr import _check_assertions, _process_constraints, idx_model_selection
+from ..utils import _csv_filename_to_pkl_filename
 
 DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters
 DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default