Spaces:
Sleeping
Sleeping
Commit
·
bf37f2a
1
Parent(s):
ac2e8e0
Allow early quit + can pretty-print best equation
Browse files- pysr/__init__.py +1 -1
- pysr/sr.py +103 -34
pysr/__init__.py
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
from .sr import pysr
|
|
|
|
| 1 |
+
from .sr import pysr, get_hof, best, best_tex, best_function
|
pysr/sr.py
CHANGED
|
@@ -6,13 +6,19 @@ import numpy as np
|
|
| 6 |
import pandas as pd
|
| 7 |
import sympy
|
| 8 |
from sympy import sympify, Symbol, lambdify
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
sympy_mappings = {
|
| 11 |
'div': lambda x, y : x/y,
|
| 12 |
'mult': lambda x, y : x*y,
|
| 13 |
'plus': lambda x, y : x + y,
|
| 14 |
'neg': lambda x : -x,
|
| 15 |
-
'pow': lambda x, y : sympy.sign(x)*
|
| 16 |
'cos': lambda x : sympy.cos(x),
|
| 17 |
'sin': lambda x : sympy.sin(x),
|
| 18 |
'tan': lambda x : sympy.tan(x),
|
|
@@ -26,13 +32,13 @@ sympy_mappings = {
|
|
| 26 |
'acosh':lambda x : sympy.acosh(x),
|
| 27 |
'asinh':lambda x : sympy.asinh(x),
|
| 28 |
'atanh':lambda x : sympy.atanh(x),
|
| 29 |
-
'abs': lambda x :
|
| 30 |
'mod': lambda x, y : sympy.Mod(x, y),
|
| 31 |
'erf': lambda x : sympy.erf(x),
|
| 32 |
'erfc': lambda x : sympy.erfc(x),
|
| 33 |
-
'logm': lambda x : sympy.log(
|
| 34 |
-
'logm10':lambda x : sympy.log10(
|
| 35 |
-
'logm2': lambda x : sympy.log2(
|
| 36 |
'log1p': lambda x : sympy.log(x + 1),
|
| 37 |
'floor': lambda x : sympy.floor(x),
|
| 38 |
'ceil': lambda x : sympy.ceil(x),
|
|
@@ -189,11 +195,6 @@ def pysr(X=None, y=None, weights=None,
|
|
| 189 |
if populations is None:
|
| 190 |
populations = procs
|
| 191 |
|
| 192 |
-
local_sympy_mappings = {
|
| 193 |
-
**extra_sympy_mappings,
|
| 194 |
-
**sympy_mappings
|
| 195 |
-
}
|
| 196 |
-
|
| 197 |
rand_string = f'{"".join([str(np.random.rand())[2] for i in range(20)])}'
|
| 198 |
|
| 199 |
if isinstance(binary_operators, str): binary_operators = [binary_operators]
|
|
@@ -302,17 +303,64 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
|
|
| 302 |
|
| 303 |
|
| 304 |
command = [
|
| 305 |
-
f'julia -O{julia_optimization:d}',
|
| 306 |
-
f'-p {procs}',
|
| 307 |
f'/tmp/.runfile_{rand_string}.jl',
|
| 308 |
]
|
| 309 |
if timeout is not None:
|
| 310 |
-
command = [f'timeout {timeout}'] + command
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
try:
|
| 315 |
-
output = pd.read_csv(equation_file, sep="|")
|
| 316 |
except FileNotFoundError:
|
| 317 |
print("Couldn't find equation file!")
|
| 318 |
return pd.DataFrame()
|
|
@@ -322,10 +370,17 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
|
|
| 322 |
lastComplexity = 0
|
| 323 |
sympy_format = []
|
| 324 |
lambda_format = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
if use_custom_variable_names:
|
| 326 |
-
sympy_symbols = [sympy.Symbol(variable_names[i]) for i in range(
|
| 327 |
else:
|
| 328 |
-
sympy_symbols = [sympy.Symbol('x%d'%i) for i in range(
|
|
|
|
| 329 |
for i in range(len(output)):
|
| 330 |
eqn = sympify(output.loc[i, 'Equation'], locals=local_sympy_mappings)
|
| 331 |
sympy_format.append(eqn)
|
|
@@ -342,25 +397,39 @@ const varMap = {'["' + '", "'.join(variable_names) + '"]'}"""
|
|
| 342 |
lastMSE = curMSE
|
| 343 |
lastComplexity = curComplexity
|
| 344 |
|
| 345 |
-
|
| 346 |
output['score'] = np.array(scores)
|
| 347 |
output['sympy_format'] = sympy_format
|
| 348 |
output['lambda_format'] = lambda_format
|
| 349 |
-
return output[['Complexity', 'MSE', 'score', 'Equation', 'sympy_format', 'lambda_format']]
|
| 350 |
-
|
| 351 |
|
| 352 |
-
|
| 353 |
-
"""Use a gradient boosting tree regressor as a proxy for finding
|
| 354 |
-
the k most important features in X, returning indices for those
|
| 355 |
-
features as output."""
|
| 356 |
-
|
| 357 |
-
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
|
| 358 |
-
from sklearn.feature_selection import SelectFromModel, SelectKBest
|
| 359 |
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
|
| 366 |
|
|
|
|
| 6 |
import pandas as pd
|
| 7 |
import sympy
|
| 8 |
from sympy import sympify, Symbol, lambdify
|
| 9 |
+
import subprocess
|
| 10 |
+
|
| 11 |
+
global_equation_file = 'hall_of_fame.csv'
|
| 12 |
+
global_n_features = None
|
| 13 |
+
global_variable_names = []
|
| 14 |
+
global_extra_sympy_mappings = {}
|
| 15 |
|
| 16 |
sympy_mappings = {
|
| 17 |
'div': lambda x, y : x/y,
|
| 18 |
'mult': lambda x, y : x*y,
|
| 19 |
'plus': lambda x, y : x + y,
|
| 20 |
'neg': lambda x : -x,
|
| 21 |
+
'pow': lambda x, y : sympy.sign(x)*abs(x)**y,
|
| 22 |
'cos': lambda x : sympy.cos(x),
|
| 23 |
'sin': lambda x : sympy.sin(x),
|
| 24 |
'tan': lambda x : sympy.tan(x),
|
|
|
|
| 32 |
'acosh':lambda x : sympy.acosh(x),
|
| 33 |
'asinh':lambda x : sympy.asinh(x),
|
| 34 |
'atanh':lambda x : sympy.atanh(x),
|
| 35 |
+
'abs': lambda x : abs(x),
|
| 36 |
'mod': lambda x, y : sympy.Mod(x, y),
|
| 37 |
'erf': lambda x : sympy.erf(x),
|
| 38 |
'erfc': lambda x : sympy.erfc(x),
|
| 39 |
+
'logm': lambda x : sympy.log(abs(x)),
|
| 40 |
+
'logm10':lambda x : sympy.log10(abs(x)),
|
| 41 |
+
'logm2': lambda x : sympy.log2(abs(x)),
|
| 42 |
'log1p': lambda x : sympy.log(x + 1),
|
| 43 |
'floor': lambda x : sympy.floor(x),
|
| 44 |
'ceil': lambda x : sympy.ceil(x),
|
|
|
|
| 195 |
if populations is None:
|
| 196 |
populations = procs
|
| 197 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
rand_string = f'{"".join([str(np.random.rand())[2] for i in range(20)])}'
|
| 199 |
|
| 200 |
if isinstance(binary_operators, str): binary_operators = [binary_operators]
|
|
|
|
| 303 |
|
| 304 |
|
| 305 |
command = [
|
| 306 |
+
f'julia', f'-O{julia_optimization:d}',
|
| 307 |
+
f'-p', f'{procs}',
|
| 308 |
f'/tmp/.runfile_{rand_string}.jl',
|
| 309 |
]
|
| 310 |
if timeout is not None:
|
| 311 |
+
command = [f'timeout', f'{timeout}'] + command
|
| 312 |
+
|
| 313 |
+
global global_n_features
|
| 314 |
+
global global_equation_file
|
| 315 |
+
global global_variable_names
|
| 316 |
+
global global_extra_sympy_mappings
|
| 317 |
+
|
| 318 |
+
global_n_features = X.shape[1]
|
| 319 |
+
global_equation_file = equation_file
|
| 320 |
+
global_variable_names = variable_names
|
| 321 |
+
global_extra_sympy_mappings = extra_sympy_mappings
|
| 322 |
+
|
| 323 |
+
print("Running on", ' '.join(command))
|
| 324 |
+
process = subprocess.Popen(command)
|
| 325 |
+
while True:
|
| 326 |
+
try:
|
| 327 |
+
process.wait()
|
| 328 |
+
except KeyboardInterrupt:
|
| 329 |
+
process.kill()
|
| 330 |
+
|
| 331 |
+
return get_hof()
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
def run_feature_selection(X, y, select_k_features):
    """Pick the k most important features of X using gradient boosting.

    A GradientBoostingRegressor (100 depth-1 trees, fixed random_state) is
    fitted on (X, y) and its feature importances are used as a proxy for
    feature relevance.

    Args:
        X: feature matrix passed straight to ``clf.fit``
            (presumably shaped (n_samples, n_features) -- confirm at caller).
        y: regression targets passed straight to ``clf.fit``.
        select_k_features: maximum number of features to keep.

    Returns:
        Integer indices of the selected features, as produced by
        ``SelectFromModel.get_support(indices=True)``.
    """
    # Imported lazily so scikit-learn is only required when feature
    # selection is actually requested.
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.feature_selection import SelectFromModel

    # The loss is left at its default (squared error): the explicit
    # loss='ls' spelling is rejected by current scikit-learn releases,
    # while the default means the same thing on old and new versions.
    clf = GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=1,
        random_state=0,
    )
    clf.fit(X, y)

    # threshold=-inf disables the importance cut-off, so the number of
    # selected features is governed by max_features alone.
    selector = SelectFromModel(
        clf,
        threshold=-np.inf,
        max_features=select_k_features,
        prefit=True,
    )
    return selector.get_support(indices=True)
|
| 347 |
+
|
| 348 |
+
def get_hof(equation_file=None, n_features=None, variable_names=None, extra_sympy_mappings=None):
|
| 349 |
+
"""Get the equations from a hall of fame file. If no arguments
|
| 350 |
+
entered, the ones used previously from a call to PySR will be used."""
|
| 351 |
+
|
| 352 |
+
global global_n_features
|
| 353 |
+
global global_equation_file
|
| 354 |
+
global global_variable_names
|
| 355 |
+
global global_extra_sympy_mappings
|
| 356 |
+
|
| 357 |
+
if equation_file is None: equation_file = global_equation_file
|
| 358 |
+
if n_features is None: n_features = global_n_features
|
| 359 |
+
if variable_names is None: variable_names = global_variable_names
|
| 360 |
+
if extra_sympy_mappings is None: extra_sympy_mappings = global_extra_sympy_mappings
|
| 361 |
+
|
| 362 |
try:
|
| 363 |
+
output = pd.read_csv(equation_file + '.bkup', sep="|")
|
| 364 |
except FileNotFoundError:
|
| 365 |
print("Couldn't find equation file!")
|
| 366 |
return pd.DataFrame()
|
|
|
|
| 370 |
lastComplexity = 0
|
| 371 |
sympy_format = []
|
| 372 |
lambda_format = []
|
| 373 |
+
use_custom_variable_names = (len(variable_names) != 0)
|
| 374 |
+
local_sympy_mappings = {
|
| 375 |
+
**extra_sympy_mappings,
|
| 376 |
+
**sympy_mappings
|
| 377 |
+
}
|
| 378 |
+
|
| 379 |
if use_custom_variable_names:
|
| 380 |
+
sympy_symbols = [sympy.Symbol(variable_names[i]) for i in range(n_features)]
|
| 381 |
else:
|
| 382 |
+
sympy_symbols = [sympy.Symbol('x%d'%i) for i in range(n_features)]
|
| 383 |
+
|
| 384 |
for i in range(len(output)):
|
| 385 |
eqn = sympify(output.loc[i, 'Equation'], locals=local_sympy_mappings)
|
| 386 |
sympy_format.append(eqn)
|
|
|
|
| 397 |
lastMSE = curMSE
|
| 398 |
lastComplexity = curComplexity
|
| 399 |
|
|
|
|
| 400 |
output['score'] = np.array(scores)
|
| 401 |
output['sympy_format'] = sympy_format
|
| 402 |
output['lambda_format'] = lambda_format
|
|
|
|
|
|
|
| 403 |
|
| 404 |
+
return output[['Complexity', 'MSE', 'score', 'Equation', 'sympy_format', 'lambda_format']]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
|
| 406 |
+
def best_row(equations=None):
    """Return the row of a hall of fame table that has the highest score.

    Args:
        equations: DataFrame with a 'score' column, such as the one
            returned by get_hof(). If None, get_hof() is called with its
            default arguments.

    Returns:
        The row (a pandas Series) whose 'score' value is the largest. On
        ties the first such row is returned.
    """
    if equations is None:
        equations = get_hof()
    # Take the argmax over the raw values so the result is always a
    # position usable with .iloc, whatever index labels the frame carries.
    best_idx = int(np.argmax(np.asarray(equations['score'])))
    return equations.iloc[best_idx]
|
| 411 |
+
|
| 412 |
+
def best_tex(equations=None):
    """Return the best-scoring equation rendered as a LaTeX string.

    Args:
        equations: hall of fame DataFrame; when None it is loaded with
            get_hof().

    Returns:
        The LaTeX produced by sympy.latex for the simplified
        'sympy_format' entry of the best row.
    """
    if equations is None:
        equations = get_hof()
    top_expression = best_row(equations)['sympy_format']
    simplified = top_expression.simplify()
    return sympy.latex(simplified)
|
| 417 |
+
|
| 418 |
+
def best(equations=None):
    """Return the equation with the best score, as a simplified sympy expression.

    Args:
        equations: hall of fame DataFrame; when None it is loaded with
            get_hof().

    Returns:
        The 'sympy_format' entry of the best row after calling
        ``.simplify()`` on it.
    """
    if equations is None:
        equations = get_hof()
    best_sympy = best_row(equations)['sympy_format']
    return best_sympy.simplify()
|
| 429 |
+
|
| 430 |
+
def best_function(equations=None):
    """Return the best-scoring equation in callable form.

    Args:
        equations: hall of fame DataFrame; when None it is loaded with
            get_hof().

    Returns:
        The 'lambda_format' entry of the best row.
    """
    hall_of_fame = get_hof() if equations is None else equations
    winner = best_row(hall_of_fame)
    return winner['lambda_format']
|
| 434 |
|
| 435 |
|