Merge pull request #428 from MilesCranmer/refactor-utils
Files changed (12):
- .github/workflows/CI.yml (+26 -0)
- mypy.ini (+8 -0)
- pysr/__init__.py (+2 -1)
- pysr/denoising.py (+35 -0)
- pysr/deprecated.py (+54 -0)
- pysr/export_latex.py (+13 -11)
- pysr/export_sympy.py (+2 -2)
- pysr/feature_selection.py (+35 -0)
- pysr/feynman_problems.py (+1 -1)
- pysr/sr.py (+19 -159)
- pysr/test/test.py (+3 -8)
- pysr/utils.py (+55 -0)
.github/workflows/CI.yml (CHANGED)

@@ -143,3 +143,29 @@ jobs:
       run: |
         pip install coveralls
         coveralls --finish
+
+  types:
+    name: Check types
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -l {0}
+    strategy:
+      matrix:
+        python-version: ['3.10']
+
+    steps:
+      - uses: actions/checkout@v3
+      - name: "Set up Python"
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: pip
+      - name: "Install PySR and all dependencies"
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements.txt
+          pip install mypy jax jaxlib torch
+          python setup.py install
+      - name: "Run mypy"
+        run: mypy --install-types --non-interactive pysr

mypy.ini (ADDED)

@@ -0,0 +1,8 @@
+[mypy]
+warn_return_any = True
+
+[mypy-sklearn.*]
+ignore_missing_imports = True
+
+[mypy-julia.*]
+ignore_missing_imports = True
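
Together, the new `types` CI job and mypy.ini turn on static type checking for the package. The sketch below is an illustration, not part of the PR: the commented shell lines mirror the workflow commands for running the same check locally, and the toy function is a hypothetical example of the kind of issue `warn_return_any = True` reports (returning a value typed as `Any` from a function annotated with a concrete return type).

    # Local reproduction of the CI check (same commands as the workflow above):
    #   pip install mypy jax jaxlib torch
    #   mypy --install-types --non-interactive pysr

    # Hypothetical snippet that `warn_return_any = True` would flag:
    from typing import Any, Dict

    def first_loss(row: Dict[str, Any]) -> float:
        # row["loss"] is typed as `Any`; mypy reports returning Any from a
        # function declared to return `float`.
        return row["loss"]
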
pysr/__init__.py (CHANGED)

@@ -1,9 +1,10 @@
 from . import sklearn_monkeypatch
+from .deprecated import best, best_callable, best_row, best_tex, pysr
 from .export_jax import sympy2jax
 from .export_torch import sympy2torch
 from .feynman_problems import FeynmanProblem, Problem
 from .julia_helpers import install
-from .sr import PySRRegressor, best, best_callable, best_row, best_tex, pysr
+from .sr import PySRRegressor
 from .version import __version__
 
 __all__ = [
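
Because `best`, `best_callable`, `best_row`, `best_tex`, and `pysr` are now re-exported from `pysr.deprecated`, the top-level import surface is unchanged. A minimal sketch of the behavior downstream code should see (not part of the diff):

    from pysr import PySRRegressor, best, pysr  # same top-level names as before

    try:
        best()  # defined in pysr/deprecated.py; raises immediately
    except NotImplementedError as err:
        print(err)  # message points to the PySRRegressor interface instead
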
pysr/denoising.py (ADDED)

@@ -0,0 +1,35 @@
+"""Functions for denoising data during preprocessing."""
+import numpy as np
+
+
+def denoise(X, y, Xresampled=None, random_state=None):
+    """Denoise the dataset using a Gaussian process."""
+    from sklearn.gaussian_process import GaussianProcessRegressor
+    from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
+
+    gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel()
+    gpr = GaussianProcessRegressor(
+        kernel=gp_kernel, n_restarts_optimizer=50, random_state=random_state
+    )
+    gpr.fit(X, y)
+
+    if Xresampled is not None:
+        return Xresampled, gpr.predict(Xresampled)
+
+    return X, gpr.predict(X)
+
+
+def multi_denoise(X, y, Xresampled=None, random_state=None):
+    """Perform `denoise` along each column of `y` independently."""
+    y = np.stack(
+        [
+            denoise(X, y[:, i], Xresampled=Xresampled, random_state=random_state)[1]
+            for i in range(y.shape[1])
+        ],
+        axis=1,
+    )
+
+    if Xresampled is not None:
+        return Xresampled, y
+
+    return X, y
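
These helpers relocate the Gaussian-process denoising previously inlined in `sr.py`. A usage sketch on synthetic data (illustrative only; the array shapes and seeds are assumptions, not from the PR):

    import numpy as np
    from pysr.denoising import denoise, multi_denoise

    rng = np.random.RandomState(0)
    X = rng.randn(100, 2)
    y = X[:, 0] ** 2 + 0.1 * rng.randn(100)   # one noisy target
    Y = np.stack([y, -y], axis=1)             # two noisy targets

    X_out, y_smoothed = denoise(X, y, random_state=0)        # GP-smoothed copy of y
    X_out, Y_smoothed = multi_denoise(X, Y, random_state=0)  # denoise each column of Y
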
pysr/deprecated.py (CHANGED)

@@ -1,4 +1,58 @@
 """Various functions to deprecate features."""
+import warnings
+
+
+def pysr(X, y, weights=None, **kwargs):  # pragma: no cover
+    from .sr import PySRRegressor
+
+    warnings.warn(
+        "Calling `pysr` is deprecated. "
+        "Please use `model = PySRRegressor(**params); "
+        "model.fit(X, y)` going forward.",
+        FutureWarning,
+    )
+    model = PySRRegressor(**kwargs)
+    model.fit(X, y, weights=weights)
+    return model.equations_
+
+
+def best(*args, **kwargs):  # pragma: no cover
+    raise NotImplementedError(
+        "`best` has been deprecated. "
+        "Please use the `PySRRegressor` interface. "
+        "After fitting, you can return `.sympy()` "
+        "to get the sympy representation "
+        "of the best equation."
+    )
+
+
+def best_row(*args, **kwargs):  # pragma: no cover
+    raise NotImplementedError(
+        "`best_row` has been deprecated. "
+        "Please use the `PySRRegressor` interface. "
+        "After fitting, you can run `print(model)` to view the best equation, "
+        "or "
+        "`model.get_best()` to return the best equation's "
+        "row in `model.equations_`."
+    )
+
+
+def best_tex(*args, **kwargs):  # pragma: no cover
+    raise NotImplementedError(
+        "`best_tex` has been deprecated. "
+        "Please use the `PySRRegressor` interface. "
+        "After fitting, you can return `.latex()` to "
+        "get the sympy representation "
+        "of the best equation."
+    )
+
+
+def best_callable(*args, **kwargs):  # pragma: no cover
+    raise NotImplementedError(
+        "`best_callable` has been deprecated. Please use the `PySRRegressor` "
+        "interface. After fitting, you can use "
+        "`.predict(X)` to use the best callable."
+    )
 
 
 def make_deprecated_kwargs_for_pysr_regressor():
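
The shim keeps the old functional entry point working while steering users to the estimator API. A sketch of the two call styles (the data and `niterations` value are illustrative):

    import numpy as np

    X = np.random.randn(100, 2)
    y = 2.5 * X[:, 0] - 0.5

    # Old style: still callable, but emits a FutureWarning and returns the
    # fitted model's `equations_` DataFrame.
    from pysr import pysr
    equations = pysr(X, y, niterations=5)

    # New style, as the warning text recommends:
    from pysr import PySRRegressor
    model = PySRRegressor(niterations=5)
    model.fit(X, y)
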
pysr/export_latex.py (CHANGED)

@@ -1,5 +1,5 @@
 """Functions to help export PySR equations to LaTeX."""
-from typing import List
+from typing import List, Optional, Tuple
 
 import pandas as pd
 import sympy
@@ -19,14 +19,16 @@ class PreciseLatexPrinter(LatexPrinter):
         return super()._print_Float(reduced_float)
 
 
-def sympy2latex(expr, prec=3, full_prec=True, **settings):
+def sympy2latex(expr, prec=3, full_prec=True, **settings) -> str:
     """Convert sympy expression to LaTeX with custom precision."""
     settings["full_prec"] = full_prec
     printer = PreciseLatexPrinter(settings=settings, prec=prec)
     return printer.doprint(expr)
 
 
-def generate_table_environment(columns=["equation", "complexity", "loss"]):
+def generate_table_environment(
+    columns: List[str] = ["equation", "complexity", "loss"]
+) -> Tuple[str, str]:
     margins = "c" * len(columns)
     column_map = {
         "complexity": "Complexity",
@@ -58,12 +60,12 @@ def generate_table_environment(columns=["equation", "complexity", "loss"]):
 
 def sympy2latextable(
     equations: pd.DataFrame,
-    indices: List[int] = None,
+    indices: Optional[List[int]] = None,
     precision: int = 3,
-    columns=["equation", "complexity", "loss", "score"],
+    columns: List[str] = ["equation", "complexity", "loss", "score"],
     max_equation_length: int = 50,
     output_variable_name: str = "y",
-):
+) -> str:
     """Generate a booktabs-style LaTeX table for a single set of equations."""
     assert isinstance(equations, pd.DataFrame)
 
@@ -71,7 +73,7 @@ def sympy2latextable(
     latex_table_content = []
 
     if indices is None:
-        indices =
+        indices = list(equations.index)
 
     for i in indices:
         latex_equation = sympy2latex(
@@ -126,11 +128,11 @@ def sympy2latextable(
 
 def sympy2multilatextable(
     equations: List[pd.DataFrame],
-    indices: List[List[int]] = None,
+    indices: Optional[List[List[int]]] = None,
     precision: int = 3,
-    columns=["equation", "complexity", "loss", "score"],
-    output_variable_names: str = None,
-):
+    columns: List[str] = ["equation", "complexity", "loss", "score"],
+    output_variable_names: Optional[List[str]] = None,
+) -> str:
     """Generate multiple latex tables for a list of equation sets."""
     # TODO: Let user specify custom output variable
 
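
The changes here are type annotations plus the explicit `indices = list(equations.index)` default; behavior is otherwise untouched. A small sketch of the precision-aware printer (the expression is chosen for illustration):

    import sympy
    from pysr.export_latex import sympy2latex

    x0 = sympy.Symbol("x0")
    expr = 3.14159265 * sympy.cos(x0) + 1.23456789

    # Floats are reduced to roughly 3 significant figures by PreciseLatexPrinter:
    print(sympy2latex(expr, prec=3))
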
pysr/export_sympy.py (CHANGED)

@@ -51,14 +51,14 @@ sympy_mappings = {
 
 
 def create_sympy_symbols(
-    feature_names_in:
+    feature_names_in: List[str],
 ) -> List[sympy.Symbol]:
     return [sympy.Symbol(variable) for variable in feature_names_in]
 
 
 def pysr2sympy(
     equation: str, *, extra_sympy_mappings: Optional[Dict[str, Callable]] = None
-)
+):
     local_sympy_mappings = {
         **(extra_sympy_mappings if extra_sympy_mappings else {}),
         **sympy_mappings,
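
A brief sketch of the two annotated helpers (the equation string is illustrative and assumes `cos` resolves through the module's `sympy_mappings`):

    from pysr.export_sympy import create_sympy_symbols, pysr2sympy

    symbols = create_sympy_symbols(["x0", "x1"])  # -> [Symbol("x0"), Symbol("x1")]
    expr = pysr2sympy("2.5 * cos(x0) + x1")       # parsed into a sympy expression
    print(expr)
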
pysr/feature_selection.py (ADDED)

@@ -0,0 +1,35 @@
+"""Functions for doing feature selection during preprocessing."""
+import numpy as np
+
+
+def run_feature_selection(X, y, select_k_features, random_state=None):
+    """
+    Find most important features.
+
+    Uses a gradient boosting tree regressor as a proxy for finding
+    the k most important features in X, returning indices for those
+    features as output.
+    """
+    from sklearn.ensemble import RandomForestRegressor
+    from sklearn.feature_selection import SelectFromModel
+
+    clf = RandomForestRegressor(
+        n_estimators=100, max_depth=3, random_state=random_state
+    )
+    clf.fit(X, y)
+    selector = SelectFromModel(
+        clf, threshold=-np.inf, max_features=select_k_features, prefit=True
+    )
+    return selector.get_support(indices=True)
+
+
+# Function has not been removed only due to usage in module tests
+def _handle_feature_selection(X, select_k_features, y, variable_names):
+    if select_k_features is not None:
+        selection = run_feature_selection(X, y, select_k_features)
+        print(f"Using features {[variable_names[i] for i in selection]}")
+        X = X[:, selection]
+    else:
+        selection = None
+
+    return X, selection
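
A usage sketch for the relocated selector (synthetic data; only column 0 is informative, so it should be the one returned):

    import numpy as np
    from pysr.feature_selection import run_feature_selection

    rng = np.random.RandomState(0)
    X = rng.randn(200, 5)
    y = 3.0 * X[:, 0] + 0.01 * rng.randn(200)

    selected = run_feature_selection(X, y, select_k_features=1, random_state=0)
    print(selected)  # expected: array([0])
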
pysr/feynman_problems.py (CHANGED)

@@ -4,7 +4,7 @@ from pathlib import Path
 
 import numpy as np
 
-from .sr import best, pysr
+from .deprecated import best, pysr
 
 PKG_DIR = Path(__file__).parents[1]
 FEYNMAN_DATASET = PKG_DIR / "datasets" / "FeynmanEquations.csv"
pysr/sr.py (CHANGED)

@@ -11,6 +11,7 @@ from datetime import datetime
 from io import StringIO
 from multiprocessing import cpu_count
 from pathlib import Path
+from typing import List, Optional
 
 import numpy as np
 import pandas as pd
@@ -18,12 +19,14 @@ from sklearn.base import BaseEstimator, MultiOutputMixin, RegressorMixin
 from sklearn.utils import check_array, check_consistent_length, check_random_state
 from sklearn.utils.validation import _check_feature_names_in, check_is_fitted
 
+from .denoising import denoise, multi_denoise
 from .deprecated import make_deprecated_kwargs_for_pysr_regressor
 from .export_jax import sympy2jax
 from .export_latex import sympy2latex, sympy2latextable, sympy2multilatextable
 from .export_numpy import sympy2numpy
 from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
 from .export_torch import sympy2torch
+from .feature_selection import run_feature_selection
 from .julia_helpers import (
     _escape_filename,
     _load_backend,
@@ -33,23 +36,18 @@ from .julia_helpers import (
     init_julia,
     is_julia_version_greater_eq,
 )
+from .utils import (
+    _csv_filename_to_pkl_filename,
+    _preprocess_julia_floats,
+    _safe_check_feature_names_in,
+    _subscriptify,
+)
 
 Main = None  # TODO: Rename to more descriptive name like "julia_runtime"
 
 already_ran = False
 
 
-def pysr(X, y, weights=None, **kwargs):  # pragma: no cover
-    warnings.warn(
-        "Calling `pysr` is deprecated. "
-        "Please use `model = PySRRegressor(**params); model.fit(X, y)` going forward.",
-        FutureWarning,
-    )
-    model = PySRRegressor(**kwargs)
-    model.fit(X, y, weights=weights)
-    return model.equations_
-
-
 def _process_constraints(binary_operators, unary_operators, constraints):
     constraints = constraints.copy()
     for op in unary_operators:
@@ -172,37 +170,6 @@ def _check_assertions(
     )
 
 
-def best(*args, **kwargs):  # pragma: no cover
-    raise NotImplementedError(
-        "`best` has been deprecated. Please use the `PySRRegressor` interface. "
-        "After fitting, you can return `.sympy()` to get the sympy representation "
-        "of the best equation."
-    )
-
-
-def best_row(*args, **kwargs):  # pragma: no cover
-    raise NotImplementedError(
-        "`best_row` has been deprecated. Please use the `PySRRegressor` interface. "
-        "After fitting, you can run `print(model)` to view the best equation, or "
-        "`model.get_best()` to return the best equation's row in `model.equations_`."
-    )
-
-
-def best_tex(*args, **kwargs):  # pragma: no cover
-    raise NotImplementedError(
-        "`best_tex` has been deprecated. Please use the `PySRRegressor` interface. "
-        "After fitting, you can return `.latex()` to get the sympy representation "
-        "of the best equation."
-    )
-
-
-def best_callable(*args, **kwargs):  # pragma: no cover
-    raise NotImplementedError(
-        "`best_callable` has been deprecated. Please use the `PySRRegressor` "
-        "interface. After fitting, you can use `.predict(X)` to use the best callable."
-    )
-
-
 # Class validation constants
 VALID_OPTIMIZER_ALGORITHMS = ["NelderMead", "BFGS"]
 
@@ -945,10 +912,8 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         model : PySRRegressor
             The model with fitted equations.
         """
-
-
-        else:
-            pkl_filename = equation_file
+
+        pkl_filename = _csv_filename_to_pkl_filename(equation_file)
 
         # Try to load model from <equation_file>.pkl
         print(f"Checking if {pkl_filename} exists...")
@@ -1502,19 +1467,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         # Denoising transformation
        if self.denoise:
             if self.nout_ > 1:
-                y = np.stack(
-                    [
-                        _denoise(
-                            X, y[:, i], Xresampled=Xresampled, random_state=random_state
-                        )[1]
-                        for i in range(self.nout_)
-                    ],
-                    axis=1,
+                X, y = multi_denoise(
+                    X, y, Xresampled=Xresampled, random_state=random_state
                 )
-                if Xresampled is not None:
-                    X = Xresampled
             else:
-                X, y = _denoise(X, y, Xresampled=Xresampled, random_state=random_state)
+                X, y = denoise(X, y, Xresampled=Xresampled, random_state=random_state)
 
         return X, y, variable_names, X_units, y_units
 
@@ -1783,10 +1740,10 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
         y,
         Xresampled=None,
         weights=None,
-        variable_names=None,
-        X_units=None,
-        y_units=None,
-    ):
+        variable_names: Optional[List[str]] = None,
+        X_units: Optional[List[str]] = None,
+        y_units: Optional[List[str]] = None,
+    ) -> "PySRRegressor":
         """
         Search for equations to fit the dataset and store them in `self.equations_`.
 
@@ -2373,7 +2330,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
     return "\n".join(preamble_string + [table_string])
 
 
-def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int:
+def idx_model_selection(equations: pd.DataFrame, model_selection: str):
     """Select an expression and return its index."""
     if model_selection == "accuracy":
         chosen_idx = equations["loss"].idxmin()
@@ -2388,100 +2345,3 @@ def idx_model_selection(equations: pd.DataFrame, model_selection: str) -> int:
         f"{model_selection} is not a valid model selection strategy."
     )
     return chosen_idx
-
-
-def _denoise(X, y, Xresampled=None, random_state=None):
-    """Denoise the dataset using a Gaussian process."""
-    from sklearn.gaussian_process import GaussianProcessRegressor
-    from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
-
-    gp_kernel = RBF(np.ones(X.shape[1])) + WhiteKernel(1e-1) + ConstantKernel()
-    gpr = GaussianProcessRegressor(
-        kernel=gp_kernel, n_restarts_optimizer=50, random_state=random_state
-    )
-    gpr.fit(X, y)
-    if Xresampled is not None:
-        return Xresampled, gpr.predict(Xresampled)
-
-    return X, gpr.predict(X)
-
-
-# Function has not been removed only due to usage in module tests
-def _handle_feature_selection(X, select_k_features, y, variable_names):
-    if select_k_features is not None:
-        selection = run_feature_selection(X, y, select_k_features)
-        print(f"Using features {[variable_names[i] for i in selection]}")
-        X = X[:, selection]
-
-    else:
-        selection = None
-    return X, selection
-
-
-def run_feature_selection(X, y, select_k_features, random_state=None):
-    """
-    Find most important features.
-
-    Uses a gradient boosting tree regressor as a proxy for finding
-    the k most important features in X, returning indices for those
-    features as output.
-    """
-    from sklearn.ensemble import RandomForestRegressor
-    from sklearn.feature_selection import SelectFromModel
-
-    clf = RandomForestRegressor(
-        n_estimators=100, max_depth=3, random_state=random_state
-    )
-    clf.fit(X, y)
-    selector = SelectFromModel(
-        clf, threshold=-np.inf, max_features=select_k_features, prefit=True
-    )
-    return selector.get_support(indices=True)
-
-
-def _csv_filename_to_pkl_filename(csv_filename) -> str:
-    # Assume that the csv filename is of the form "foo.csv"
-    assert str(csv_filename).endswith(".csv")
-
-    dirname = str(os.path.dirname(csv_filename))
-    basename = str(os.path.basename(csv_filename))
-    base = str(os.path.splitext(basename)[0])
-
-    pkl_basename = base + ".pkl"
-
-    return os.path.join(dirname, pkl_basename)
-
-
-_regexp_im = re.compile(r"\b(\d+\.\d+)im\b")
-_regexp_im_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)im\b")
-_regexp_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)\b")
-
-_apply_regexp_im = lambda x: _regexp_im.sub(r"\1j", x)
-_apply_regexp_im_sci = lambda x: _regexp_im_sci.sub(r"\1e\2j", x)
-_apply_regexp_sci = lambda x: _regexp_sci.sub(r"\1e\2", x)
-
-
-def _preprocess_julia_floats(s: str) -> str:
-    if isinstance(s, str):
-        s = _apply_regexp_im(s)
-        s = _apply_regexp_im_sci(s)
-        s = _apply_regexp_sci(s)
-    return s
-
-
-def _subscriptify(i: int) -> str:
-    """Converts integer to subscript text form.
-
-    For example, 123 -> "₁₂₃".
-    """
-    return "".join([chr(0x2080 + int(c)) for c in str(i)])
-
-
-def _safe_check_feature_names_in(self, variable_names, generate_names=True):
-    """_check_feature_names_in with compat for old versions."""
-    try:
-        return _check_feature_names_in(
-            self, variable_names, generate_names=generate_names
-        )
-    except TypeError:
-        return _check_feature_names_in(self, variable_names)
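
Net effect on `sr.py`: the deprecated functional API and the generic helpers are gone, preprocessing now delegates to `denoise`/`multi_denoise` and `run_feature_selection`, and `from_file` builds its checkpoint path through `_csv_filename_to_pkl_filename`. Since that helper now passes `.pkl` paths through unchanged, either file name should resolve to the same pickle; a hedged sketch (file names are hypothetical):

    from pysr import PySRRegressor

    # Both calls should look for the same "hall_of_fame.pkl" checkpoint,
    # because .pkl paths are returned untouched by the helper.
    model_a = PySRRegressor.from_file("hall_of_fame.csv")
    model_b = PySRRegressor.from_file("hall_of_fame.pkl")
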
pysr/test/test.py (CHANGED)

@@ -14,14 +14,9 @@ from sklearn.utils.estimator_checks import check_estimator
 
 from .. import PySRRegressor, julia_helpers
 from ..export_latex import sympy2latex
-from ..sr import (
-    _check_assertions,
-    _csv_filename_to_pkl_filename,
-    _handle_feature_selection,
-    _process_constraints,
-    idx_model_selection,
-    run_feature_selection,
-)
+from ..feature_selection import _handle_feature_selection, run_feature_selection
+from ..sr import _check_assertions, _process_constraints, idx_model_selection
+from ..utils import _csv_filename_to_pkl_filename
 
 DEFAULT_PARAMS = inspect.signature(PySRRegressor.__init__).parameters
 DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default
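
The test module only needed its imports redirected to the helpers' new homes. A minimal smoke check that the new layout resolves (no model fitting involved):

    from pysr.feature_selection import _handle_feature_selection, run_feature_selection
    from pysr.sr import _check_assertions, _process_constraints, idx_model_selection
    from pysr.utils import _csv_filename_to_pkl_filename

    print(_csv_filename_to_pkl_filename("outputs/equations.csv"))  # outputs/equations.pkl
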
pysr/utils.py (ADDED)

@@ -0,0 +1,55 @@
+import os
+import re
+
+from sklearn.utils.validation import _check_feature_names_in
+
+
+def _csv_filename_to_pkl_filename(csv_filename: str) -> str:
+    if os.path.splitext(csv_filename)[1] == ".pkl":
+        return csv_filename
+
+    # Assume that the csv filename is of the form "foo.csv"
+    assert str(csv_filename).endswith(".csv")
+
+    dirname = str(os.path.dirname(csv_filename))
+    basename = str(os.path.basename(csv_filename))
+    base = str(os.path.splitext(basename)[0])
+
+    pkl_basename = base + ".pkl"
+
+    return os.path.join(dirname, pkl_basename)
+
+
+_regexp_im = re.compile(r"\b(\d+\.\d+)im\b")
+_regexp_im_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)im\b")
+_regexp_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)\b")
+
+_apply_regexp_im = lambda x: _regexp_im.sub(r"\1j", x)
+_apply_regexp_im_sci = lambda x: _regexp_im_sci.sub(r"\1e\2j", x)
+_apply_regexp_sci = lambda x: _regexp_sci.sub(r"\1e\2", x)
+
+
+def _preprocess_julia_floats(s: str) -> str:
+    if isinstance(s, str):
+        s = _apply_regexp_im(s)
+        s = _apply_regexp_im_sci(s)
+        s = _apply_regexp_sci(s)
+    return s
+
+
+def _safe_check_feature_names_in(self, variable_names, generate_names=True):
+    """_check_feature_names_in with compat for old versions."""
+    try:
+        return _check_feature_names_in(
+            self, variable_names, generate_names=generate_names
+        )
+    except TypeError:
+        return _check_feature_names_in(self, variable_names)
+
+
+def _subscriptify(i: int) -> str:
+    """Converts integer to subscript text form.
+
+    For example, 123 -> "₁₂₃".
+    """
+    return "".join([chr(0x2080 + int(c)) for c in str(i)])