Automated Equation Discovery With The Bayesian Machine Scientist¶

In this tutorial we will demonstrate how to autonomously recover equations from data using the Bayesian Machine Scientist. We will follow the sci-kit learn workflow.

Note: this tutorial requires Python 3.10 to run successfully.

Continuous Model Recovery¶

Install relevant subpackages from AutoRA

In [ ]:

Copied!

# fix default prior and release new package version
!pip install -q "autora[theorist-bms]"
# fix default prior and release new package version
!pip install -q "autora[theorist-bms]"

Import relevant modules from AutoRA

In [ ]:

Copied!





from autora.theorist.bms import BMSRegressor
from autora.experiment_runner.synthetic.psychophysics.weber_fechner_law import weber_fechner_law
import numpy as np
from sklearn.base import BaseEstimator
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from autora.theorist.bms import BMSRegressor
from autora.experiment_runner.synthetic.psychophysics.weber_fechner_law import weber_fechner_law
import numpy as np
from sklearn.base import BaseEstimator
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

Import your data (Here we create noisy synthetic data as a demonstration)

In [ ]:

Copied!

constant = 3.0

# synthetic experiment from autora inventory
synthetic_runner = weber_fechner_law(constant=constant)

# experiment meta data:
synthetic_runner.variables
constant = 3.0

# synthetic experiment from autora inventory
synthetic_runner = weber_fechner_law(constant=constant)

# experiment meta data:
synthetic_runner.variables

In [ ]:

Copied!

# independent variables
ivs = [iv.name for iv in synthetic_runner.variables.independent_variables]

# dependent variable
dvs = [dv.name for dv in synthetic_runner.variables.dependent_variables]

ivs, dvs
# independent variables
ivs = [iv.name for iv in synthetic_runner.variables.independent_variables]

# dependent variable
dvs = [dv.name for dv in synthetic_runner.variables.dependent_variables]

ivs, dvs

In [ ]:

Copied!

# experimental data
conditions = synthetic_runner.domain()
experiment_data = synthetic_runner.run(conditions, added_noise=0.01)

experiment_data
# experimental data
conditions = synthetic_runner.domain()
experiment_data = synthetic_runner.run(conditions, added_noise=0.01)

experiment_data

In [ ]:

Copied!

# observations
observations = experiment_data[dvs]

# split into train and test datasets
conditions_train, conditions_test, observations_train, observations_test = train_test_split(conditions, observations)
# observations
observations = experiment_data[dvs]

# split into train and test datasets
conditions_train, conditions_test, observations_train, observations_test = train_test_split(conditions, observations)

Comparing Models¶

We will repeat these steps for each model 4. Initialize Model 5. Fit Model to the Data 6. Plot the Results

Some Functions you can use to plot¶

In [ ]:

Copied!

def present_results(model, x_test, y_test, arg='default', model_name=None, variable_names=None, select_indices=None, figsize=None, *args):
  compare_results(models=[model], x_test=x_test, y_test=y_test, arg=arg, model_names=[model_name], variable_names=variable_names, select_indices=select_indices, figsize=figsize, *args)
def present_results(model, x_test, y_test, arg='default', model_name=None, variable_names=None, select_indices=None, figsize=None, *args):
  compare_results(models=[model], x_test=x_test, y_test=y_test, arg=arg, model_names=[model_name], variable_names=variable_names, select_indices=select_indices, figsize=figsize, *args)

In [ ]:

Copied!





def compare_results(models, x_test, y_test, arg='default', model_names=None, variable_names=None, observation_name=None, select_indices=None, figsize=None, *args):
  if model_names is None or model_names == [None]:
    names = ['Model '+str(i+1) for i in range(len(models))]
  else:
    names = model_names
  if len(x_test.shape) == 1:
    x_test = x_test.reshape(1, -1)
  num_var = x_test.shape[1]
  if variable_names is None:
    var_names = ['Variable '+str(i+1) for i in range(num_var)]
  else:
    var_names = variable_names
  if observation_name is None:
    obs_label = 'Observations'
  else:
    obs_label = observation_name
  match arg:
    case 'default':
      for i, model in enumerate(models):
        print(model)
        synthetic_runner.plotter(model)
    case '2d':
      if figsize is None:
        size = (8,3)
      else:
        assert len(figsize) == 2 and isinstance(figsize, tuple), 'incorrect format for figure shape\nshould be tuple of form (i,j)'
        size = figsize
      for i, model in enumerate(models):
        fig = plt.figure(figsize=size)
        axes = []
        y_predict = model.predict(x_test)
        for j in range(num_var):
          axes.append(fig.add_subplot(1, num_var, j+1))
          axes[j].set_xlabel(var_names[j])
          axes[j].set_ylabel(obs_label)
          axes[j].set_title(names[i]+' fit on '+var_names[j])
          axes[j].scatter(x_test[:,j], y_test, label='Ground Truth', alpha=0.5)
          axes[j].scatter(x_test[:,j], y_predict, label='Predicted', alpha=0.5)
          axes[j].legend()
          for arg in args:
            assert isinstance(arg, str), 'arguments must be in the form of a string'
            try:
              exec('axes[j].'+arg)
            except:
              raise RuntimeError(f'argument "{arg}" could not be executed')

      fig.tight_layout()
      plt.show()
    case '3d':
      if figsize is None:
        size = (15,5)
      else:
        assert len(figsize) == 2 and isinstance(figsize, tuple), 'incorrect format for figure shape\nshould be tuple of form (i,j)'
        size = figsize
      axes = []
      fig = plt.figure(figsize=size)
      if select_indices is None:
        idx = (0,1)
      else:
        len(select_indices) == 2 and isinstance(select_indices, tuple), 'incorrect format for select_indices\nshould be tuple of form (i,j)'
        idx = select_indices
      for i, model in enumerate(models):
        y_predict = model.predict(x_test)
        ax = fig.add_subplot(1, 3, i+1, projection='3d')
        ax.w_xaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
        ax.w_yaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
        ax.w_zaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
        axes.append(ax)
        axes[i].set_xlabel(var_names[idx[0]])
        axes[i].set_ylabel(var_names[idx[1]])
        axes[i].set_zlabel(obs_label)
        axes[i].scatter(x_test[:, idx[0]], x_test[:, idx[1]], y_test, s=1, label='Ground Truth')
        axes[i].scatter(x_test[:, idx[0]], x_test[:, idx[1]], y_predict, s=1, label='Predicted')
        axes[i].set_title(names[i])
        axes[i].legend()
        axes[i].set_facecolor('white')
        for arg in args:
            assert isinstance(arg, str), 'arguments must be in the form of a string'
            try:
              exec('axes[j].'+arg)
            except:
              raise RuntimeError(f'argument "{arg}" could not be executed')
      fig.tight_layout()
      plt.show()
    case 'choice':
      for model in models:
        y_pred = np.where(model.predict(x_test) > 0.5, 1, 0)
        cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
        cmd = ConfusionMatrixDisplay(cm)
        cmd.plot()
        plt.show()
def compare_results(models, x_test, y_test, arg='default', model_names=None, variable_names=None, observation_name=None, select_indices=None, figsize=None, *args):
  if model_names is None or model_names == [None]:
    names = ['Model '+str(i+1) for i in range(len(models))]
  else:
    names = model_names
  if len(x_test.shape) == 1:
    x_test = x_test.reshape(1, -1)
  num_var = x_test.shape[1]
  if variable_names is None:
    var_names = ['Variable '+str(i+1) for i in range(num_var)]
  else:
    var_names = variable_names
  if observation_name is None:
    obs_label = 'Observations'
  else:
    obs_label = observation_name
  match arg:
    case 'default':
      for i, model in enumerate(models):
        print(model)
        synthetic_runner.plotter(model)
    case '2d':
      if figsize is None:
        size = (8,3)
      else:
        assert len(figsize) == 2 and isinstance(figsize, tuple), 'incorrect format for figure shape\nshould be tuple of form (i,j)'
        size = figsize
      for i, model in enumerate(models):
        fig = plt.figure(figsize=size)
        axes = []
        y_predict = model.predict(x_test)
        for j in range(num_var):
          axes.append(fig.add_subplot(1, num_var, j+1))
          axes[j].set_xlabel(var_names[j])
          axes[j].set_ylabel(obs_label)
          axes[j].set_title(names[i]+' fit on '+var_names[j])
          axes[j].scatter(x_test[:,j], y_test, label='Ground Truth', alpha=0.5)
          axes[j].scatter(x_test[:,j], y_predict, label='Predicted', alpha=0.5)
          axes[j].legend()
          for arg in args:
            assert isinstance(arg, str), 'arguments must be in the form of a string'
            try:
              exec('axes[j].'+arg)
            except:
              raise RuntimeError(f'argument "{arg}" could not be executed')

      fig.tight_layout()
      plt.show()
    case '3d':
      if figsize is None:
        size = (15,5)
      else:
        assert len(figsize) == 2 and isinstance(figsize, tuple), 'incorrect format for figure shape\nshould be tuple of form (i,j)'
        size = figsize
      axes = []
      fig = plt.figure(figsize=size)
      if select_indices is None:
        idx = (0,1)
      else:
        len(select_indices) == 2 and isinstance(select_indices, tuple), 'incorrect format for select_indices\nshould be tuple of form (i,j)'
        idx = select_indices
      for i, model in enumerate(models):
        y_predict = model.predict(x_test)
        ax = fig.add_subplot(1, 3, i+1, projection='3d')
        ax.w_xaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
        ax.w_yaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
        ax.w_zaxis.set_pane_color((1.0, 1.0, 1.0, 0.0))
        axes.append(ax)
        axes[i].set_xlabel(var_names[idx[0]])
        axes[i].set_ylabel(var_names[idx[1]])
        axes[i].set_zlabel(obs_label)
        axes[i].scatter(x_test[:, idx[0]], x_test[:, idx[1]], y_test, s=1, label='Ground Truth')
        axes[i].scatter(x_test[:, idx[0]], x_test[:, idx[1]], y_predict, s=1, label='Predicted')
        axes[i].set_title(names[i])
        axes[i].legend()
        axes[i].set_facecolor('white')
        for arg in args:
            assert isinstance(arg, str), 'arguments must be in the form of a string'
            try:
              exec('axes[j].'+arg)
            except:
              raise RuntimeError(f'argument "{arg}" could not be executed')
      fig.tight_layout()
      plt.show()
    case 'choice':
      for model in models:
        y_pred = np.where(model.predict(x_test) > 0.5, 1, 0)
        cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
        cmd = ConfusionMatrixDisplay(cm)
        cmd.plot()
        plt.show()

1. Polynomial Regressor¶

make model if needed

In [ ]:

Copied!





from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression


class PolynomialRegressor:
    """
    This theorist fits a polynomial function to the data.
    """

    def __init__(self, degree: int = 3):
      self.poly = PolynomialFeatures(degree=degree, include_bias=False)
      self.model = LinearRegression()

    def fit(self, x, y):
      features = self.poly.fit_transform(x, y)
      self.model.fit(features, y)
      return self

    def predict(self, x):
      features = self.poly.fit_transform(x)
      return self.model.predict(features)
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression


class PolynomialRegressor:
    """
    This theorist fits a polynomial function to the data.
    """

    def __init__(self, degree: int = 3):
      self.poly = PolynomialFeatures(degree=degree, include_bias=False)
      self.model = LinearRegression()

    def fit(self, x, y):
      features = self.poly.fit_transform(x, y)
      self.model.fit(features, y)
      return self

    def predict(self, x):
      features = self.poly.fit_transform(x)
      return self.model.predict(features)

Initialize the model

In [ ]:

Copied!

np.random.seed(0)
poly_model = PolynomialRegressor(degree=3)
np.random.seed(0)
poly_model = PolynomialRegressor(degree=3)

Fit Model to the Data

In [ ]:

Copied!

poly_model.fit(conditions_train, observations_train)
poly_model.fit(conditions_train, observations_train)

Plot the Results

In [ ]:

Copied!

present_results(model=poly_model, x_test=conditions_test, y_test=observations_test)
present_results(model=poly_model, x_test=conditions_test, y_test=observations_test)

In [ ]:

Copied!

present_results(model=poly_model, x_test=conditions_test, y_test=observations_test, arg='2d')
present_results(model=poly_model, x_test=conditions_test, y_test=observations_test, arg='2d')

In [ ]:

Copied!

present_results(model=poly_model, x_test=conditions_test, y_test=observations_test, arg='3d')
present_results(model=poly_model, x_test=conditions_test, y_test=observations_test, arg='3d')

Summary - Polynomial Linear Regressor¶

Expressivity: Low

Interpretability: High

2. Neural Network¶

For this section, we are using torch:

In [ ]:

Copied!

!pip install -q torch
!pip install -q torch

Initialize Model

In [ ]:

Copied!

from sklearn.neural_network import MLPRegressor
import torch
nn_model = MLPRegressor(random_state=1, max_iter=500)
from sklearn.neural_network import MLPRegressor
import torch
nn_model = MLPRegressor(random_state=1, max_iter=500)

Fit Model to the Data

In [ ]:

Copied!

np.random.seed(0)
torch.manual_seed(0)
nn_model.fit(conditions_train, observations_train)
np.random.seed(0)
torch.manual_seed(0)
nn_model.fit(conditions_train, observations_train)

Plot the Results

In [ ]:

Copied!

synthetic_runner.plotter(nn_model)
synthetic_runner.plotter(nn_model)

Summary - Neural Network Regressor¶

Expressivity: High

Interpretability: Low

3. Bayesian Machine Scientist¶

Initialize Model

In [ ]:

Copied!

bms_model = BMSRegressor(epochs=1500)
bms_model = BMSRegressor(epochs=1500)

Fit Model to Data

In [ ]:

Copied!

bms_model.fit(conditions_train, observations_train, seed=0)
bms_model.fit(conditions_train, observations_train, seed=0)

Plot the Results

In [ ]:

Copied!

synthetic_runner.plotter(bms_model)
synthetic_runner.plotter(bms_model)

In [ ]:

Copied!

print(bms_model)
print(bms_model)

Summary - BMS Regressor¶

Expressiveness: High

Interpretability: High

Summary - Model Comparison¶

In [ ]:

Copied!

models = [poly_model, nn_model, bms_model]
names =['poly_model', 'nn_model', 'bms_model']
compare_results(models=models, x_test=conditions_test, y_test=observations_test, model_names=names, arg='2d')
models = [poly_model, nn_model, bms_model]
names =['poly_model', 'nn_model', 'bms_model']
compare_results(models=models, x_test=conditions_test, y_test=observations_test, model_names=names, arg='2d')

In [ ]:

Copied!

compare_results(models=models, x_test=conditions_test, y_test=observations_test, model_names=names, arg='3d')
compare_results(models=models, x_test=conditions_test, y_test=observations_test, model_names=names, arg='3d')

Choice Model Recovery¶

Import relevant modules from AutoRA

In [ ]:

Copied!

from autora.experiment_runner.synthetic.psychology.luce_choice_ratio import luce_choice_ratio
from autora.experiment_runner.synthetic.psychology.luce_choice_ratio import luce_choice_ratio

Import your data (Here we create noisy synthetic data as a demonstration)

In [ ]:

Copied!





# experimental parameter to recover
focus = 0.8

# synthetic experiment from autora inventory
synthetic_runner = luce_choice_ratio(focus=focus)

# variables
ivs = [iv.name for iv in synthetic_runner.variables.independent_variables]
dvs = [dv.name for dv in synthetic_runner.variables.dependent_variables]


# experimental data
conditions = synthetic_runner.domain()
experiment_data = synthetic_runner.run(conditions, added_noise=0.01)
observations = experiment_data[dvs]

# set probabilities to choice values
observations = np.where(observations < 0.5, 0, 1)

# split into train and test datasets
conditions_train, conditions_test, observations_train, observations_test = train_test_split(conditions, observations)
# experimental parameter to recover
focus = 0.8

# synthetic experiment from autora inventory
synthetic_runner = luce_choice_ratio(focus=focus)

# variables
ivs = [iv.name for iv in synthetic_runner.variables.independent_variables]
dvs = [dv.name for dv in synthetic_runner.variables.dependent_variables]


# experimental data
conditions = synthetic_runner.domain()
experiment_data = synthetic_runner.run(conditions, added_noise=0.01)
observations = experiment_data[dvs]

# set probabilities to choice values
observations = np.where(observations < 0.5, 0, 1)

# split into train and test datasets
conditions_train, conditions_test, observations_train, observations_test = train_test_split(conditions, observations)

Initialize Model
Fit Model to the Data
Plot the Results

Comparing Models¶

In [ ]:

Copied!

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

Logistic Regressor¶

Initialize Model

In [ ]:

Copied!

from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression()
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression()

Fit Model to the Data

In [ ]:

Copied!

log_model.fit(conditions_train, observations_train)
# log_model.predict(conditions_test)
log_model.fit(conditions_train, observations_train)
# log_model.predict(conditions_test)

Plot the Results

In [ ]:

Copied!

present_results(model=log_model, x_test=conditions_test, y_test=observations_test, arg='choice')
present_results(model=log_model, x_test=conditions_test, y_test=observations_test, arg='choice')

Summary - Logisitic Regressor¶

Expressiveness: Low

Interpretability: Medium

Neural Network Classifier¶

Initialize Model

In [ ]:

Copied!

from sklearn.neural_network import MLPClassifier

nn_class_model = MLPClassifier(max_iter=3000, activation='logistic')
from sklearn.neural_network import MLPClassifier

nn_class_model = MLPClassifier(max_iter=3000, activation='logistic')

Fit Model to the Data

In [ ]:

Copied!

nn_class_model.fit(conditions_train, observations_train)
nn_class_model.fit(conditions_train, observations_train)

Plot the Results

In [ ]:

Copied!

present_results(model=nn_class_model, x_test=conditions_test, y_test=observations_test, arg='choice')
present_results(model=nn_class_model, x_test=conditions_test, y_test=observations_test, arg='choice')

Summary - Neural Network Regressor¶

Expressiveness: High

Interpretability: Low

Bayesian Machine Scientist¶

Initialize Model

In [ ]:

Copied!

bms_model = BMSRegressor(epochs=1500)
bms_model = BMSRegressor(epochs=1500)

Fit Model to the Data

In [ ]:

Copied!

bms_model.fit(conditions_train, observations_train, seed=0)
bms_model.fit(conditions_train, observations_train, seed=0)

Plot the Results

In [ ]:

Copied!

print(bms_model)
present_results(model=bms_model, x_test=conditions_test, y_test=observations_test, arg='choice')
print(bms_model)
present_results(model=bms_model, x_test=conditions_test, y_test=observations_test, arg='choice')

Summary - BMS Regressor¶

Expressiveness: High

Interpretability: High

Summary - Model Comparison¶

In [ ]:

Copied!





present_results(model=log_model, x_test=conditions_test, y_test=observations_test, arg='choice')
present_results(model=nn_class_model, x_test=conditions_test, y_test=observations_test, arg='choice')
present_results(model=bms_model, x_test=conditions_test, y_test=observations_test, arg='choice')
print(bms_model)
present_results(model=log_model, x_test=conditions_test, y_test=observations_test, arg='choice')
present_results(model=nn_class_model, x_test=conditions_test, y_test=observations_test, arg='choice')
present_results(model=bms_model, x_test=conditions_test, y_test=observations_test, arg='choice')
print(bms_model)

Exercises¶

FYI: AutoRA Toolkit Pre-release for integrating symbolic regression into any other modeling method of your choice¶

Exercise 1: Try it yourself!¶

Install packages

In [ ]:

Copied!

# Install package that you will need here
# Install package that you will need here

Import packages

In [ ]:

Copied!

# Import packages you will need here
import pandas
# Import packages you will need here
import pandas

Load Data

In [ ]:

Copied!





## Uncomment the next lines and input the file location or replace with a line of your own
# FILE_LOCATION = ''
# df = pd.read_csv(FILE_LOCATION)

## Extract your conditions and observations from your loaded data
# conditions =
# observations =

## Run this line and make sure you train with the train datasets and test with the test datasets
## This ensures that your model hasn't overfit
# conditions_train, conditions_test, observations_train, observations_test = train_test_split(conditions, observations)
## Uncomment the next lines and input the file location or replace with a line of your own
# FILE_LOCATION = ''
# df = pd.read_csv(FILE_LOCATION)

## Extract your conditions and observations from your loaded data
# conditions =
# observations =

## Run this line and make sure you train with the train datasets and test with the test datasets
## This ensures that your model hasn't overfit
# conditions_train, conditions_test, observations_train, observations_test = train_test_split(conditions, observations)

Initialize Model

In [ ]:

Copied!

# We will use BMS for this exercise. Feel free to initialize another model to compare it to
# Minimum recommended number of epochs is 1500
bms_model = BMSRegressor(epochs=1500)
# We will use BMS for this exercise. Feel free to initialize another model to compare it to
# Minimum recommended number of epochs is 1500
bms_model = BMSRegressor(epochs=1500)

Fit model to the Data

In [ ]:

Copied!

## Uncomment this next line after you have completed step 3
# bms_model.fit(conditions_train, observations_train)
## Uncomment this next line after you have completed step 3
# bms_model.fit(conditions_train, observations_train)

Plot the results

In [ ]:

Copied!





## Uncomment this next line after you have completed step 5
# plot_type = "default", "2d", "3d", or "choice"
# present_results(bms_model, conditions_test, observations_test, arg=plot_type)

## You can uncomment these lines if you would like to compare BMS to a model of your choice
# your_models_name =
# your_model =
#
# models = [bms_model, your_model]
# model_names = ["bms_model", your_models_name]
# compare results(models=models, x_test=conditions_test, y_test=observations_test, model_names=model_names, arg=plot_type)
## Uncomment this next line after you have completed step 5
# plot_type = "default", "2d", "3d", or "choice"
# present_results(bms_model, conditions_test, observations_test, arg=plot_type)

## You can uncomment these lines if you would like to compare BMS to a model of your choice
# your_models_name =
# your_model =
#
# models = [bms_model, your_model]
# model_names = ["bms_model", your_models_name]
# compare results(models=models, x_test=conditions_test, y_test=observations_test, model_names=model_names, arg=plot_type)

Exercise 2: Using the AutoRA closed loop¶

See https://autoresearch.github.io/autora/tutorials/ for a tutorial on closed-loop experimentation.