Automated Equation Discovery With The Bayesian Machine Scientist¶
In this tutorial we will demonstrate how to autonomously recover equations from data using the Bayesian Machine Scientist. We will follow the sci-kit learn workflow.
Note: this tutorial requires Python 3.10 to run successfully.
Continuous Model Recovery¶
- Install relevant subpackages from AutoRA
In [ ]:
Copied!
# fix default prior and release new package version
!pip install -q "autora[theorist-bms]"
# fix default prior and release new package version
!pip install -q "autora[theorist-bms]"
- Import relevant modules from AutoRA
In [ ]:
Copied!
from autora.theorist.bms import BMSRegressor
from autora.experiment_runner.synthetic.psychophysics.weber_fechner_law import weber_fechner_law
import numpy as np
from sklearn.base import BaseEstimator
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from autora.theorist.bms import BMSRegressor
from autora.experiment_runner.synthetic.psychophysics.weber_fechner_law import weber_fechner_law
import numpy as np
from sklearn.base import BaseEstimator
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
- Import your data (Here we create noisy synthetic data as a demonstration)
In [ ]:
Copied!
constant = 3.0
# synthetic experiment from autora inventory
synthetic_runner = weber_fechner_law(constant=constant)
# experiment meta data:
synthetic_runner.variables
constant = 3.0
# synthetic experiment from autora inventory
synthetic_runner = weber_fechner_law(constant=constant)
# experiment meta data:
synthetic_runner.variables
In [ ]:
Copied!
# independent variables
ivs = [iv.name for iv in synthetic_runner.variables.independent_variables]
# dependent variable
dvs = [dv.name for dv in synthetic_runner.variables.dependent_variables]
ivs, dvs
# independent variables
ivs = [iv.name for iv in synthetic_runner.variables.independent_variables]
# dependent variable
dvs = [dv.name for dv in synthetic_runner.variables.dependent_variables]
ivs, dvs
In [ ]:
Copied!
# experimental data
conditions = synthetic_runner.domain()
experiment_data = synthetic_runner.run(conditions, added_noise=0.01)
experiment_data
# experimental data
conditions = synthetic_runner.domain()
experiment_data = synthetic_runner.run(conditions, added_noise=0.01)
experiment_data
In [ ]:
Copied!
# observations
observations = experiment_data[dvs]
# split into train and test datasets
conditions_train, conditions_test, observations_train, observations_test = train_test_split(conditions, observations)
# observations
observations = experiment_data[dvs]
# split into train and test datasets
conditions_train, conditions_test, observations_train, observations_test = train_test_split(conditions, observations)
Comparing Models¶
We will repeat these steps for each model 4. Initialize Model 5. Fit Model to the Data 6. Plot the Results
Some Functions you can use to plot¶
In [ ]:
Copied!
def present_results(model, x_test, y_test, arg='default', model_name=None, variable_names=None, select_indices=None, figsize=None, *args):
compare_results(models=[model], x_test=x_test, y_test=y_test, arg=arg, model_names=[model_name], variable_names=variable_names, select_indices=select_indices, figsize=figsize, *args)
def present_results(model, x_test, y_test, arg='default', model_name=None, variable_names=None, select_indices=None, figsize=None, *args):
compare_results(models=[model], x_test=x_test, y_test=y_test, arg=arg, model_names=[model_name], variable_names=variable_names, select_indices=select_indices, figsize=figsize, *args)
In [ ]:
Copied!
def compare_results(models, x_test, y_test, arg='default', model_names=None, variable_names=None, observation_name=None, select_indices=None, figsize=None, *args):
if model_names is None or model_names == [None]:
names = ['Model '+str(i+1) for i in range(len(models))]
else:
names = model_names
if len(x_test.shape) == 1:
x_test = x_test.reshape(1, -1)
num_var = x_test.shape[1]
if variable_names is None:
var_names = ['Variable '+str(i+1) for i in range(num_var)]
else:
var_names = variable_names
if observation_name is None:
obs_label = 'Observations'
else:
obs_label = observation_name
match arg:
case 'default':
for i, model in enumerate(models):
print(model)
synthetic_runner.plotter(model)
case '2d':
if figsize is None:
size = (8,3)
else:
assert len(figsize) == 2 and isinstance(figsize, tuple), 'incorrect format for figure shape\nshould be tuple of form (i,j)'
size = figsize
for i, model in enumerate(models):
fig = plt.figure(figsize=size)
axes = []
y_predict = model.predict(x_test)
for j in range(num_var):
axes.append(fig.add_subplot(1, num_var, j+1))
axes[j].set_xlabel(var_names[j])
axes[j].set_ylabel(obs_label)
axes[j].set_title(names[i]+' fit on '+var_names[j])
axes[j].scatter(x_test[:,j], y_test, label='Ground Truth', alpha=0.5)
axes[j].scatter(x_test[:,j], y_predict, label='Predicted', alpha=0.5)
axes[j].legend()
for arg in args:
assert isinstance(arg, str), 'arguments must be in the form of a string'
try:
exec('axes[j].'+arg)
except:
raise RuntimeError(f'argument "{arg}" could not be executed')
fig.tight_layout()
plt.show()
case '3d':
if figsize is None:
size = (15,5)
else:
assert len(figsize) == 2 and isinstance(figsize, tuple), 'incorrect format for figure shape\nshould be tuple of form (i,j)'
size = figsize
axes = []
fig = plt.figure(figsize=size)
if select_indices is None:
idx = (0,1)
else:
len(select_indices) == 2 and isinstance(select_indices, tuple), 'incorrect format for select_indices\nshould be tuple of form (i,j)'
idx = select_indices
for i, model in enumerate(models):
y_predict = model.predict(x_test)
ax = fig.add_subplot(1, 3, i+1, projection='3d')
ax.w_xaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.w_yaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.w_zaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
axes.append(ax)
axes[i].set_xlabel(var_names[idx[0]])
axes[i].set_ylabel(var_names[idx[1]])
axes[i].set_zlabel(obs_label)
axes[i].scatter(x_test[:, idx[0]], x_test[:, idx[1]], y_test, s=1, label='Ground Truth')
axes[i].scatter(x_test[:, idx[0]], x_test[:, idx[1]], y_predict, s=1, label='Predicted')
axes[i].set_title(names[i])
axes[i].legend()
axes[i].set_facecolor('white')
for arg in args:
assert isinstance(arg, str), 'arguments must be in the form of a string'
try:
exec('axes[j].'+arg)
except:
raise RuntimeError(f'argument "{arg}" could not be executed')
fig.tight_layout()
plt.show()
case 'choice':
for model in models:
y_pred = np.where(model.predict(x_test) > 0.5, 1, 0)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cmd = ConfusionMatrixDisplay(cm)
cmd.plot()
plt.show()
def compare_results(models, x_test, y_test, arg='default', model_names=None, variable_names=None, observation_name=None, select_indices=None, figsize=None, *args):
if model_names is None or model_names == [None]:
names = ['Model '+str(i+1) for i in range(len(models))]
else:
names = model_names
if len(x_test.shape) == 1:
x_test = x_test.reshape(1, -1)
num_var = x_test.shape[1]
if variable_names is None:
var_names = ['Variable '+str(i+1) for i in range(num_var)]
else:
var_names = variable_names
if observation_name is None:
obs_label = 'Observations'
else:
obs_label = observation_name
match arg:
case 'default':
for i, model in enumerate(models):
print(model)
synthetic_runner.plotter(model)
case '2d':
if figsize is None:
size = (8,3)
else:
assert len(figsize) == 2 and isinstance(figsize, tuple), 'incorrect format for figure shape\nshould be tuple of form (i,j)'
size = figsize
for i, model in enumerate(models):
fig = plt.figure(figsize=size)
axes = []
y_predict = model.predict(x_test)
for j in range(num_var):
axes.append(fig.add_subplot(1, num_var, j+1))
axes[j].set_xlabel(var_names[j])
axes[j].set_ylabel(obs_label)
axes[j].set_title(names[i]+' fit on '+var_names[j])
axes[j].scatter(x_test[:,j], y_test, label='Ground Truth', alpha=0.5)
axes[j].scatter(x_test[:,j], y_predict, label='Predicted', alpha=0.5)
axes[j].legend()
for arg in args:
assert isinstance(arg, str), 'arguments must be in the form of a string'
try:
exec('axes[j].'+arg)
except:
raise RuntimeError(f'argument "{arg}" could not be executed')
fig.tight_layout()
plt.show()
case '3d':
if figsize is None:
size = (15,5)
else:
assert len(figsize) == 2 and isinstance(figsize, tuple), 'incorrect format for figure shape\nshould be tuple of form (i,j)'
size = figsize
axes = []
fig = plt.figure(figsize=size)
if select_indices is None:
idx = (0,1)
else:
len(select_indices) == 2 and isinstance(select_indices, tuple), 'incorrect format for select_indices\nshould be tuple of form (i,j)'
idx = select_indices
for i, model in enumerate(models):
y_predict = model.predict(x_test)
ax = fig.add_subplot(1, 3, i+1, projection='3d')
ax.w_xaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.w_yaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
ax.w_zaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
axes.append(ax)
axes[i].set_xlabel(var_names[idx[0]])
axes[i].set_ylabel(var_names[idx[1]])
axes[i].set_zlabel(obs_label)
axes[i].scatter(x_test[:, idx[0]], x_test[:, idx[1]], y_test, s=1, label='Ground Truth')
axes[i].scatter(x_test[:, idx[0]], x_test[:, idx[1]], y_predict, s=1, label='Predicted')
axes[i].set_title(names[i])
axes[i].legend()
axes[i].set_facecolor('white')
for arg in args:
assert isinstance(arg, str), 'arguments must be in the form of a string'
try:
exec('axes[j].'+arg)
except:
raise RuntimeError(f'argument "{arg}" could not be executed')
fig.tight_layout()
plt.show()
case 'choice':
for model in models:
y_pred = np.where(model.predict(x_test) > 0.5, 1, 0)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cmd = ConfusionMatrixDisplay(cm)
cmd.plot()
plt.show()
1. Polynomial Regressor¶
make model if needed
In [ ]:
Copied!
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
class PolynomialRegressor:
"""
This theorist fits a polynomial function to the data.
"""
def __init__(self, degree: int = 3):
self.poly = PolynomialFeatures(degree=degree, include_bias=False)
self.model = LinearRegression()
def fit(self, x, y):
features = self.poly.fit_transform(x, y)
self.model.fit(features, y)
return self
def predict(self, x):
features = self.poly.fit_transform(x)
return self.model.predict(features)
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
class PolynomialRegressor:
"""
This theorist fits a polynomial function to the data.
"""
def __init__(self, degree: int = 3):
self.poly = PolynomialFeatures(degree=degree, include_bias=False)
self.model = LinearRegression()
def fit(self, x, y):
features = self.poly.fit_transform(x, y)
self.model.fit(features, y)
return self
def predict(self, x):
features = self.poly.fit_transform(x)
return self.model.predict(features)
- Initialize the model
In [ ]:
Copied!
np.random.seed(0)
poly_model = PolynomialRegressor(degree=3)
np.random.seed(0)
poly_model = PolynomialRegressor(degree=3)
- Fit Model to the Data
In [ ]:
Copied!
poly_model.fit(conditions_train, observations_train)
poly_model.fit(conditions_train, observations_train)
- Plot the Results
In [ ]:
Copied!
present_results(model=poly_model, x_test=conditions_test, y_test=observations_test)
present_results(model=poly_model, x_test=conditions_test, y_test=observations_test)
In [ ]:
Copied!
present_results(model=poly_model, x_test=conditions_test, y_test=observations_test, arg='2d')
present_results(model=poly_model, x_test=conditions_test, y_test=observations_test, arg='2d')
In [ ]:
Copied!
present_results(model=poly_model, x_test=conditions_test, y_test=observations_test, arg='3d')
present_results(model=poly_model, x_test=conditions_test, y_test=observations_test, arg='3d')
2. Neural Network¶
For this section, we are using torch:
In [ ]:
Copied!
!pip install -q torch
!pip install -q torch
- Initialize Model
In [ ]:
Copied!
from sklearn.neural_network import MLPRegressor
import torch
nn_model = MLPRegressor(random_state=1, max_iter=500)
from sklearn.neural_network import MLPRegressor
import torch
nn_model = MLPRegressor(random_state=1, max_iter=500)
- Fit Model to the Data
In [ ]:
Copied!
np.random.seed(0)
torch.manual_seed(0)
nn_model.fit(conditions_train, observations_train)
np.random.seed(0)
torch.manual_seed(0)
nn_model.fit(conditions_train, observations_train)
- Plot the Results
In [ ]:
Copied!
synthetic_runner.plotter(nn_model)
synthetic_runner.plotter(nn_model)
3. Bayesian Machine Scientist¶
- Initialize Model
In [ ]:
Copied!
bms_model = BMSRegressor(epochs=1500)
bms_model = BMSRegressor(epochs=1500)
- Fit Model to Data
In [ ]:
Copied!
bms_model.fit(conditions_train, observations_train, seed=0)
bms_model.fit(conditions_train, observations_train, seed=0)
- Plot the Results
In [ ]:
Copied!
synthetic_runner.plotter(bms_model)
synthetic_runner.plotter(bms_model)
In [ ]:
Copied!
print(bms_model)
print(bms_model)
Summary - Model Comparison¶
In [ ]:
Copied!
models = [poly_model, nn_model, bms_model]
names =['poly_model', 'nn_model', 'bms_model']
compare_results(models=models, x_test=conditions_test, y_test=observations_test, model_names=names, arg='2d')
models = [poly_model, nn_model, bms_model]
names =['poly_model', 'nn_model', 'bms_model']
compare_results(models=models, x_test=conditions_test, y_test=observations_test, model_names=names, arg='2d')
In [ ]:
Copied!
compare_results(models=models, x_test=conditions_test, y_test=observations_test, model_names=names, arg='3d')
compare_results(models=models, x_test=conditions_test, y_test=observations_test, model_names=names, arg='3d')
Choice Model Recovery¶
- Import relevant modules from AutoRA
In [ ]:
Copied!
from autora.experiment_runner.synthetic.psychology.luce_choice_ratio import luce_choice_ratio
from autora.experiment_runner.synthetic.psychology.luce_choice_ratio import luce_choice_ratio
- Import your data (Here we create noisy synthetic data as a demonstration)
In [ ]:
Copied!
# experimental parameter to recover
focus = 0.8
# synthetic experiment from autora inventory
synthetic_runner = luce_choice_ratio(focus=focus)
# variables
ivs = [iv.name for iv in synthetic_runner.variables.independent_variables]
dvs = [dv.name for dv in synthetic_runner.variables.dependent_variables]
# experimental data
conditions = synthetic_runner.domain()
experiment_data = synthetic_runner.run(conditions, added_noise=0.01)
observations = experiment_data[dvs]
# set probabilities to choice values
observations = np.where(observations < 0.5, 0, 1)
# split into train and test datasets
conditions_train, conditions_test, observations_train, observations_test = train_test_split(conditions, observations)
# experimental parameter to recover
focus = 0.8
# synthetic experiment from autora inventory
synthetic_runner = luce_choice_ratio(focus=focus)
# variables
ivs = [iv.name for iv in synthetic_runner.variables.independent_variables]
dvs = [dv.name for dv in synthetic_runner.variables.dependent_variables]
# experimental data
conditions = synthetic_runner.domain()
experiment_data = synthetic_runner.run(conditions, added_noise=0.01)
observations = experiment_data[dvs]
# set probabilities to choice values
observations = np.where(observations < 0.5, 0, 1)
# split into train and test datasets
conditions_train, conditions_test, observations_train, observations_test = train_test_split(conditions, observations)
- Initialize Model
- Fit Model to the Data
- Plot the Results
Comparing Models¶
In [ ]:
Copied!
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
Logistic Regressor¶
- Initialize Model
In [ ]:
Copied!
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression()
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression()
- Fit Model to the Data
In [ ]:
Copied!
log_model.fit(conditions_train, observations_train)
# log_model.predict(conditions_test)
log_model.fit(conditions_train, observations_train)
# log_model.predict(conditions_test)
- Plot the Results
In [ ]:
Copied!
present_results(model=log_model, x_test=conditions_test, y_test=observations_test, arg='choice')
present_results(model=log_model, x_test=conditions_test, y_test=observations_test, arg='choice')
Neural Network Classifier¶
- Initialize Model
In [ ]:
Copied!
from sklearn.neural_network import MLPClassifier
nn_class_model = MLPClassifier(max_iter=3000, activation='logistic')
from sklearn.neural_network import MLPClassifier
nn_class_model = MLPClassifier(max_iter=3000, activation='logistic')
- Fit Model to the Data
In [ ]:
Copied!
nn_class_model.fit(conditions_train, observations_train)
nn_class_model.fit(conditions_train, observations_train)
- Plot the Results
In [ ]:
Copied!
present_results(model=nn_class_model, x_test=conditions_test, y_test=observations_test, arg='choice')
present_results(model=nn_class_model, x_test=conditions_test, y_test=observations_test, arg='choice')
Bayesian Machine Scientist¶
- Initialize Model
In [ ]:
Copied!
bms_model = BMSRegressor(epochs=1500)
bms_model = BMSRegressor(epochs=1500)
- Fit Model to the Data
In [ ]:
Copied!
bms_model.fit(conditions_train, observations_train, seed=0)
bms_model.fit(conditions_train, observations_train, seed=0)
- Plot the Results
In [ ]:
Copied!
print(bms_model)
present_results(model=bms_model, x_test=conditions_test, y_test=observations_test, arg='choice')
print(bms_model)
present_results(model=bms_model, x_test=conditions_test, y_test=observations_test, arg='choice')
Summary - Model Comparison¶
In [ ]:
Copied!
present_results(model=log_model, x_test=conditions_test, y_test=observations_test, arg='choice')
present_results(model=nn_class_model, x_test=conditions_test, y_test=observations_test, arg='choice')
present_results(model=bms_model, x_test=conditions_test, y_test=observations_test, arg='choice')
print(bms_model)
present_results(model=log_model, x_test=conditions_test, y_test=observations_test, arg='choice')
present_results(model=nn_class_model, x_test=conditions_test, y_test=observations_test, arg='choice')
present_results(model=bms_model, x_test=conditions_test, y_test=observations_test, arg='choice')
print(bms_model)
Exercises¶
FYI: AutoRA Toolkit Pre-release for integrating symbolic regression into any other modeling method of your choice¶
Exercise 1: Try it yourself!¶
- Install packages
In [ ]:
Copied!
# Install package that you will need here
# Install package that you will need here
- Import packages
In [ ]:
Copied!
# Import packages you will need here
import pandas
# Import packages you will need here
import pandas
- Load Data
In [ ]:
Copied!
## Uncomment the next lines and input the file location or replace with a line of your own
# FILE_LOCATION = ''
# df = pd.read_csv(FILE_LOCATION)
## Extract your conditions and observations from your loaded data
# conditions =
# observations =
## Run this line and make sure you train with the train datasets and test with the test datasets
## This ensures that your model hasn't overfit
# conditions_train, conditions_test, observations_train, observations_test = train_test_split(conditions, observations)
## Uncomment the next lines and input the file location or replace with a line of your own
# FILE_LOCATION = ''
# df = pd.read_csv(FILE_LOCATION)
## Extract your conditions and observations from your loaded data
# conditions =
# observations =
## Run this line and make sure you train with the train datasets and test with the test datasets
## This ensures that your model hasn't overfit
# conditions_train, conditions_test, observations_train, observations_test = train_test_split(conditions, observations)
- Initialize Model
In [ ]:
Copied!
# We will use BMS for this exercise. Feel free to initialize another model to compare it to
# Minimum recommended number of epochs is 1500
bms_model = BMSRegressor(epochs=1500)
# We will use BMS for this exercise. Feel free to initialize another model to compare it to
# Minimum recommended number of epochs is 1500
bms_model = BMSRegressor(epochs=1500)
- Fit model to the Data
In [ ]:
Copied!
## Uncomment this next line after you have completed step 3
# bms_model.fit(conditions_train, observations_train)
## Uncomment this next line after you have completed step 3
# bms_model.fit(conditions_train, observations_train)
- Plot the results
In [ ]:
Copied!
## Uncomment this next line after you have completed step 5
# plot_type = "default", "2d", "3d", or "choice"
# present_results(bms_model, conditions_test, observations_test, arg=plot_type)
## You can uncomment these lines if you would like to compare BMS to a model of your choice
# your_models_name =
# your_model =
#
# models = [bms_model, your_model]
# model_names = ["bms_model", your_models_name]
# compare results(models=models, x_test=conditions_test, y_test=observations_test, model_names=model_names, arg=plot_type)
## Uncomment this next line after you have completed step 5
# plot_type = "default", "2d", "3d", or "choice"
# present_results(bms_model, conditions_test, observations_test, arg=plot_type)
## You can uncomment these lines if you would like to compare BMS to a model of your choice
# your_models_name =
# your_model =
#
# models = [bms_model, your_model]
# model_names = ["bms_model", your_models_name]
# compare results(models=models, x_test=conditions_test, y_test=observations_test, model_names=model_names, arg=plot_type)
Exercise 2: Using the AutoRA closed loop¶
See https://autoresearch.github.io/autora/tutorials/ for a tutorial on closed-loop experimentation.