Toy Example - Deep prior#

import tensorflow_probability as tfp
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns
import pandas as pd
import numpy as np
import elicit as el

from bayesflow.inference_networks import InvertibleNetwork
from elicit.extras import utils

tfd = tfp.distributions

The Model#

Generative model#

\[\begin{align*} (\beta_0, \beta_1, \sigma) &\sim p_\lambda(\boldsymbol{\theta}) \\ \mu &= \beta_0 + \beta_1X \\ y_{pred} &\sim \text{Normal}(\mu, \sigma) \end{align*}\]

Implementation#

Predictor#

# create a predictor with N equidistant values (0, ..., N-1)
# standardize the predictor
# select the 5th, 25th, 50th, 75th, and 95th quantile of the std. predictor for querying the expert
def std_predictor(N, quantiles):
    X = tf.cast(np.arange(N), tf.float32)
    X_std = (X-tf.reduce_mean(X))/tf.math.reduce_std(X)
    X_sel = tfp.stats.percentile(X_std, quantiles)
    return X_sel
    
std_predictor(N=200, quantiles=[5,25,50,75,95])
<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([-1.5502049 , -0.85737586,  0.00866036,  0.85737586,  1.5502049 ],
      dtype=float32)>

Generative model#

class ToyModel:
    def __call__(self, prior_samples, design_matrix, **kwargs):
        B = prior_samples.shape[0]
        S = prior_samples.shape[1]

        # preprocess shape of design matrix
        X = tf.broadcast_to(design_matrix[None, None,:],
                           (B,S,len(design_matrix)))
        # linear predictor (= mu)
        epred = tf.add(prior_samples[:, :, 0][:,:,None],
                       tf.multiply(prior_samples[:, :, 1][:,:,None], X)
                       )
        # data-generating model
        likelihood = tfd.Normal(
            loc=epred, scale=tf.expand_dims(prior_samples[:, :, -1], -1)
        )
        # sample from the prior predictive distribution
        ypred = likelihood.sample()
        
        # selected observations
        y_X0, y_X1, y_X2, y_X3, y_X4 = (ypred[:,:,0], ypred[:,:,1], ypred[:,:,2],
                                        ypred[:,:,3], ypred[:,:,4])

        # log R2 (log for numerical stability)
        log_R2 = utils.log_R2(ypred, epred)

        # correlation between parameters 
        cor = utils.pearson_correlation(prior_samples)

        return dict(
            likelihood=likelihood,
            ypred=ypred, epred=epred,
            prior_samples=prior_samples,
            y_X0=y_X0, y_X1=y_X1, y_X2=y_X2,
            y_X3=y_X3, y_X4=y_X4,
            log_R2=log_R2,
            cor=cor
        )
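
The helpers utils.log_R2 and utils.pearson_correlation summarize each batch of prior samples; the actual implementations live in elicit.extras.utils. Below is a minimal sketch of the log \(R^2\) computation, assuming it follows \(\log R^2 = \log \text{Var}(\mu) - \log \text{Var}(y)\) per prior sample:

# minimal sketch of the log R2 helper (illustration only; see elicit.extras.utils)
def log_R2_sketch(ypred, epred):
    # variances across the selected observations, per (batch, prior sample)
    var_epred = tf.math.reduce_variance(epred, axis=-1)
    var_ypred = tf.math.reduce_variance(ypred, axis=-1)
    # log R2 = log Var(mu) - log Var(y)
    return tf.math.log(var_epred) - tf.math.log(var_ypred)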

Model input for elicit method#

# specify the model
model=el.model(
        obj=ToyModel,
        design_matrix=std_predictor(N=200, quantiles=[5,25,50,75,95])
        )

Model parameters#

  • intercept parameter \(\beta_0\)

  • slope parameter \(\beta_1\)

  • error term \(\sigma\)

The hyperparameters \(\lambda\) to be learned are the weights of the deep neural networks within the normalizing flow architecture.
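
Schematically, the deep prior represents the joint prior through an invertible network \(f_\lambda\) that transforms samples from a simple base distribution (here a standard normal, see the network specification below). This is a sketch of the idea rather than the package's exact formulation:

\[\begin{align*} \mathbf{z} &\sim \text{Normal}(\mathbf{0}, \mathbf{I}_3) \\ \boldsymbol{\theta} = (\beta_0, \beta_1, \sigma) &= f_\lambda(\mathbf{z}) \end{align*}\]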

Parameter input for elicit method#

parameters=[
        el.parameter(name="beta0"),
        el.parameter(name="beta1"),
        el.parameter(name="sigma")
    ]

Target quantities and elicitation techniques#

Target quantities

  • query the expert about prior predictions \(y \mid X_{i}\), where \(X_i\) is the 5th, 25th, 50th, 75th, or 95th quantile of the standardized predictor

  • \(R^2 = \frac{\text{Var}(\mu)}{\text{Var}(y)}\) (we use \(\log R^2\) for numerical stability)

  • correlation between model parameters

Elicitation technique

  • query each prior prediction via quantile-based elicitation, i.e., \(Q_p(y \mid X_i)\) for \(p=5, 25, 50, 75, 95\) (see the sketch after this list)

  • query \(\log R^2\) via quantile-based elicitation, i.e., \(Q_p(\log R^2)\) for \(p=5, 25, 50, 75, 95\)

  • regarding the correlation structure, we assume independence between the model parameters (thus, \(\boldsymbol{\rho}_\boldsymbol{\theta}=\mathbf{0}\))
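
Quantile-based elicitation reduces each simulated target quantity to a small set of quantiles. Conceptually this amounts to the following reduction; the actual computation is handled by el.queries.quantiles, and the code below is only an illustration with dummy data:

# illustration only: reduce simulated samples of one target quantity
# to the five elicited quantiles (el.queries.quantiles handles this internally)
dummy_target = tf.random.normal((4, 200))  # shape: (batches, prior samples)
elicited_quantiles = tf.transpose(
    tfp.stats.percentile(dummy_target, q=[5, 25, 50, 75, 95], axis=-1)
)  # shape: (batches, 5)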

Importance of elicited statistics in loss

  • all elicited statistics should have equal importance (weight=1.0)

  • for computing the discrepancy between expert-elicited statistics and model simulations, we use the Maximum Mean Discrepancy with an energy kernel (see the loss sketch below)
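
Schematically, the training objective combines the per-target discrepancies into a weighted sum (a sketch of the loss structure under the stated weighting, not the package's exact formula):

\[\begin{align*} \mathcal{L}(\lambda) = \sum_{i} w_i \, D_i\left(\hat{t}_i^{\textrm{expert}}, t_i(\lambda)\right) \end{align*}\]

Here, \(D_i\) is the energy-kernel MMD² for the quantile-based targets and the L2 loss for the correlation target, each with weight \(w_i = 1.0\).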

Targets input for elicit method#

targets=[
        el.target(
            name=f"y_X{i}",
            query=el.queries.quantiles((5, 25, 50, 75, 95)),
            loss=el.losses.MMD2(kernel="energy"),
            weight=1.0
        ) for i in range(5)
        ]+[
        el.target(
            name="log_R2",
            query=el.queries.quantiles((5, 25, 50, 75, 95)),
            loss=el.losses.MMD2(kernel="energy"),
            weight=1.0
        ),
        el.target(
            name="cor",
            query=el.queries.identity(),
            loss=el.losses.L2,
            weight=1.0
        )
    ]

Expert elicitation#

  • instead of querying a “real” expert, we define a ground truth (i.e., an oracle) and simulate the oracle-elicited statistics

Expert input for elicit method (here: oracle)#

# specify ground truth
ground_truth = {
    "beta0": tfd.Normal(loc=5, scale=1),
    "beta1": tfd.Normal(loc=2, scale=1),
    "sigma": tfd.HalfNormal(scale=10.0),
}

# define oracle
expert=el.expert.simulator(
        ground_truth = ground_truth,
        num_samples = 10_000
    )
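
Conceptually, the oracle draws joint samples from the ground-truth priors and pushes them through the same generative model and elicitation queries as the simulations. The details are handled internally by el.expert.simulator; the snippet below only illustrates the first step:

# illustration only: joint samples from the ground-truth priors
# (el.expert.simulator handles this and the subsequent elicitation internally)
true_prior_samples = tf.stack(
    [ground_truth[n].sample(10_000) for n in ["beta0", "beta1", "sigma"]],
    axis=-1,
)[None, ...]  # add a batch dimension -> shape (1, 10000, 3)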

Normalizing Flow#

The joint prior is parameterized as a normalizing flow: a BayesFlow InvertibleNetwork with three affine coupling layers (two dense layers of 128 units per coupling block, no dropout, fixed permutations) that transforms a standard normal base distribution into the joint distribution of the three model parameters.

network=el.networks.NF(
        inference_network=InvertibleNetwork,
        network_specs=dict(
            num_params=3,
            num_coupling_layers=3,
            coupling_design="affine",
            coupling_settings={
                "dropout": False,
                "dense_args": {
                    "units": 128,
                    "activation": "relu",
                    "kernel_regularizer": None,
                },
                "num_dense": 2,
            },
            permutation="fixed"
        ),
        base_distribution=el.networks.base_normal
    )

Training: Learn prior distributions based on expert data#

All inputs for elicit method

elicit = el.Elicit(
    model=model,
    parameters=parameters,
    targets=targets,
    expert=expert,
    optimizer=el.optimizer(
        optimizer=tf.keras.optimizers.Adam,
        learning_rate=0.001,
        clipnorm=1.0
        ),
    trainer=el.trainer(
        method="deep_prior",
        name="toy",
        seed=3,
        epochs=200
    ),
    network=network
)

# run method
hist = elicit.fit(save_dir=None)
Training
100%|██████████| 200/200 [05:40<00:00,  1.70s/it]

Results#

Convergence#

def plot_convergence(hist):
    _, axs = plt.subplots(1,3, constrained_layout=True, figsize=(7,3), sharex=True)
    axs[0].plot(hist["loss"])
    for i,k in enumerate(hist["hyperparameter"]):
        axs[i+1].plot(hist["hyperparameter"][k], label=[r"$\beta_0$", r"$\beta_1$", r"$\sigma$"])
        axs[i+1].legend(handlelength=0.5, fontsize="small", frameon=False, ncol=3, columnspacing=0.4)
        axs[i+1].set_title(f"Hyperparameter {k}", size="small")
    axs[0].set_title("Total loss", size="medium")
    axs[0].set_ylabel(r"$\mathcal{L}(\lambda)$")
    axs[1].set_ylabel(r"$\lambda$")
    [axs[i].set_xlabel("epochs") for i in range(2)]
    plt.suptitle("Convergence", ha="left", x=0)
    plt.show()

plot_convergence(hist)
[Figure: convergence of the total loss and the hyperparameters over epochs]

Expert expectations#

def plot_expert_data(res):
    _, axs = plt.subplots(2,3, constrained_layout=True, figsize=(6,4), sharex=True, sharey=True)
    for j,key in enumerate(res["expert_elicited_statistics"]):
        if key=="identity_cor":
            continue
        if j<3:
            axs[0,j].axline((0,0), slope=1, color="darkgrey", linestyle="dashed", lw=1)
            axs[0,j].plot(res["expert_elicited_statistics"][key][0,:], 
                         tf.reduce_mean(res["elicited_statistics"][key],0), "o", ms=5, color="black")
            axs[0,j].set_title(key)
            axs[0,j].set_xlabel("expert")
        else:
            axs[1,j-3].axline((0,0), slope=1, color="darkgrey", linestyle="dashed", lw=1)
            axs[1,j-3].plot(res["expert_elicited_statistics"][key][0,:], 
                         tf.reduce_mean(res["elicited_statistics"][key],0), "o", ms=5, color="black")
            axs[1,j-3].set_title(key)
            axs[1,j-3].set_xlabel("expert")
    [axs[i,0].set_ylabel("model") for i in range(2)]
    plt.show()

plot_expert_data(elicit.results)
[Figure: expert-elicited statistics vs. model-based elicited statistics for each target quantity]

Learned joint prior#

# extract number of samples from simulated priors 
num_samples = elicit.trainer["num_samples"]

# create pandas dataframes out of trained priors and oracle priors
model_priors_df = pd.DataFrame(elicit.results["prior_samples"][0,:,:], 
                               columns = ["b0", "b1", "sigma"])
oracle_priors_df = pd.DataFrame(elicit.results["expert_prior_samples"][0,:num_samples,:],
                                columns = ["b0", "b1", "sigma"])
# add label to sub-dataframes
model_priors_df["type"]="trained"
oracle_priors_df["type"]="true"

# concatenate dataframes and ensure sigma is positive
df = pd.concat([oracle_priors_df, model_priors_df])
df["sigma"] = abs(df["sigma"])

# plot joint prior distribution
sns.pairplot(df, diag_kind="hist", hue="type", kind="kde", corner=True)
plt.show()
[Figure: pairplot of the learned (trained) vs. true (oracle) joint prior]

Add-on: Model averaging#

ToDo