# Copyright 2020 The PyMC Developers
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import aesara.tensor as at
import numpy as np
from aeppl.logprob import _logprob
from aesara.tensor.random.op import RandomVariable, default_supp_shape_from_params
from pandas import DataFrame, Series
from pymc.distributions.distribution import NoDistribution, _get_moment
__all__ = ["BART"]
class BARTRV(RandomVariable):
    """Base class for BART (Bayesian Additive Regression Trees) random variables.

    Concrete subclasses are generated dynamically by ``BART.__new__`` via
    ``type(...)``, attaching the data (``X``, ``Y``) and the hyperparameters
    as class attributes.
    """

    name = "BART"
    ndim_supp = 1  # vector-valued support: one value per observation
    # Expected ndims of the parameters: X (matrix), Y (vector),
    # m, alpha, k (scalars), split_prior (vector).
    ndims_params = [2, 1, 0, 0, 0, 1]
    dtype = "floatX"
    _print_name = ("BART", "\\operatorname{BART}")
    all_trees = None  # presumably filled in by the sampler with fitted trees — confirm

    def _shape_from_params(self, dist_params, rep_param_idx=1, param_shapes=None):
        # Delegate to aesara's default: the support shape follows the
        # parameter at ``rep_param_idx`` (Y, index 1).
        return default_supp_shape_from_params(
            self.ndim_supp, dist_params, rep_param_idx, param_shapes
        )

    def _infer_shape(cls, size, dist_params, param_shapes=None):
        # NOTE(review): the first argument is the instance despite being
        # named ``cls``.  The output has one entry per row of X.
        dist_shape = (cls.X.shape[0],)
        return dist_shape

    @classmethod
    def rng_fn(cls, rng=None, *args, **kwargs):
        # ``rng`` is accepted for RandomVariable API compatibility but is not
        # used: the draw is simply the mean of Y broadcast to Y's shape.
        # The previous default, ``np.random.default_rng()``, was created once
        # at definition time and shared across calls; ``None`` avoids that
        # eagerly-evaluated default without changing behavior.
        return np.full_like(cls.Y, cls.Y.mean())
bart = BARTRV()
class BART(NoDistribution):
    r"""
    Bayesian Additive Regression Tree distribution.

    Distribution representing a sum over trees.

    Parameters
    ----------
    X : array-like
        The covariate matrix.
    Y : array-like
        The response vector.
    m : int
        Number of trees.
    alpha : float
        Control the prior probability over the depth of the trees. Even when it can take values in
        the interval (0, 1), it is recommended to be in the interval (0, 0.5].
    k : float
        Scale parameter for the values of the leaf nodes. Defaults to 2. Recommended to be between 1
        and 3.
    split_prior : array-like
        Each element of split_prior should be in the [0, 1] interval and the elements should sum to
        1. Otherwise they will be normalized.
        Defaults to None, i.e. all covariates have the same prior probability to be selected.
    """

    def __new__(
        cls,
        name,
        X,
        Y,
        m=50,
        alpha=0.25,
        k=2,
        split_prior=None,
        **kwargs,
    ):
        # Coerce pandas objects to float numpy arrays.
        X, Y = preprocess_XY(X, Y)

        # Build a per-model RandomVariable subclass that carries the data and
        # the hyperparameters as class attributes.
        bart_op = type(
            f"BART_{name}",
            (BARTRV,),
            dict(
                name="BART",
                inplace=False,
                initval=Y.mean(),
                X=X,
                Y=Y,
                m=m,
                alpha=alpha,
                k=k,
                split_prior=split_prior,
            ),
        )()

        NoDistribution.register(BARTRV)

        # Register the moment dispatcher for BARTRV so PyMC can compute
        # starting values for this variable.
        @_get_moment.register(BARTRV)
        def get_moment(rv, size, *rv_inputs):
            return cls.get_moment(rv, size, *rv_inputs)

        cls.rv_op = bart_op
        params = [X, Y, m, alpha, k]
        return super().__new__(cls, name, *params, **kwargs)

    @classmethod
    def dist(cls, *params, **kwargs):
        # Forward the collected parameters as a single tuple, matching the
        # ``NoDistribution.dist`` calling convention.
        return super().dist(params, **kwargs)

    def logp(x, *inputs):
        """Calculate log probability.

        Parameters
        ----------
        x: numeric, TensorVariable
            Value for which log-probability is calculated.

        Returns
        -------
        TensorVariable
        """
        # The variable itself contributes zero log-probability; presumably the
        # BART likelihood is handled by a specialized sampler — confirm.
        return at.zeros_like(x)

    @classmethod
    def get_moment(cls, rv, size, *rv_inputs):
        # Initial moment: the mean of Y broadcast to the requested size.
        mean = at.fill(size, rv.Y.mean())
        return mean
def preprocess_XY(X, Y):
    """Convert the covariates ``X`` and the response ``Y`` to float arrays.

    Parameters
    ----------
    X : array-like or pandas Series/DataFrame
        Covariate matrix.
    Y : array-like or pandas Series/DataFrame
        Response vector.

    Returns
    -------
    tuple
        ``(X, Y)`` as float numpy arrays (pandas inputs are converted via
        ``to_numpy``; other inputs must support ``astype``).
    """
    if isinstance(Y, (Series, DataFrame)):
        Y = Y.to_numpy()
    if isinstance(X, (Series, DataFrame)):
        X = X.to_numpy()
    Y = Y.astype(float)
    X = X.astype(float)
    return X, Y
@_logprob.register(BARTRV)
def logp(op, value_var, *dist_params, **kwargs):
    """Dispatch aeppl log-probability computation for BART variables to ``BART.logp``."""
    # Drop the three leading entries of ``dist_params`` (presumably the
    # rng/size/dtype inputs — confirm against aeppl's calling convention)
    # and forward only the trailing distribution parameters.
    value = value_var[0]
    return BART.logp(value, *dist_params[3:])