Source code for pymc_marketing.clv.models.gamma_gamma

#   Copyright 2022 - 2025 The PyMC Labs Developers
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
"""Gamma-Gamma Model for expected future monetary value."""

import numpy as np
import pandas
import pymc as pm
import pytensor.tensor as pt
import xarray
from pymc.util import RandomState

from pymc_marketing.clv.models import CLVModel
from pymc_marketing.clv.utils import customer_lifetime_value, to_xarray
from pymc_marketing.model_config import ModelConfig
from pymc_marketing.prior import Prior



[docs]
class BaseGammaGammaModel(CLVModel):
    """Base class for Gamma-Gamma models."""


[docs]
    def distribution_customer_spend(
        self,
        data: pandas.DataFrame,
        random_seed: RandomState | None = None,
    ) -> xarray.DataArray:
        """Posterior distribution of mean spend values for each customer.

        Parameters
        ----------
        data : ~pandas.DataFrame
            DataFrame containing the following columns:

            * `customer_id`: Unique customer identifier
            * `frequency`: Number of purchases
            * `monetary_value`: Mean spend values for each customer

        random_seed : ~RandomState, optional
            Optional random seed to fix sampling results.

        """
        x = data["frequency"]
        z_mean = data["monetary_value"]

        coords = {"customer_id": np.unique(data["customer_id"])}
        with pm.Model(coords=coords):
            p = pm.HalfFlat("p")
            q = pm.HalfFlat("q")
            v = pm.HalfFlat("v")

            # Eq 5 from [1], p.3
            nu = pm.Gamma("nu", p * x + q, v + x * z_mean, dims=("customer_id",))
            pm.Deterministic("mean_spend", p / nu, dims=("customer_id",))

            return pm.sample_posterior_predictive(
                self.idata,
                var_names=["nu", "mean_spend"],
                random_seed=random_seed,
            ).posterior_predictive["mean_spend"]



[docs]
    def expected_customer_spend(
        self,
        data: pandas.DataFrame,
    ) -> xarray.DataArray:
        """Compute the expected future mean spend value per customer.

        The computations are based on Eq 5 from [1], p.3.

        Adapted from: https://github.com/CamDavidsonPilon/lifetimes/blob/aae339c5437ec31717309ba0ec394427e19753c4/lifetimes/fitters/gamma_gamma_fitter.py#L117

        data : ~pandas.DataFrame
            DataFrame containing the following columns:

            * `customer_id`: Unique customer identifier
            * `frequency`: Number of transactions observed for each customer
            * `monetary_value`: Mean transaction value of repeat purchases for each customer

        References
        ----------
        .. [1] Fader, P. S., & Hardie, B. G. (2013). "The Gamma-Gamma model of monetary
               value". February, 2, 1-9. https://www.brucehardie.com/notes/025/gamma_gamma.pdf

        """
        mean_transaction_value, frequency = to_xarray(
            data["customer_id"],
            data["monetary_value"],
            data["frequency"],
        )
        posterior = self.fit_result

        p = posterior["p"]
        q = posterior["q"]
        v = posterior["v"]

        individual_weight = p * frequency / (p * frequency + q - 1)
        population_mean = v * p / (q - 1)
        return (
            1 - individual_weight
        ) * population_mean + individual_weight * mean_transaction_value



[docs]
    def distribution_new_customer_spend(
        self, n: int = 1, random_seed: RandomState | None = None
    ) -> xarray.DataArray:
        """Posterior distribution of mean spend values for new customers.

        Parameters
        ----------
        n : int, optional
            Number of posterior distributions to generate. This can usually be left at the default value of 1.

        random_seed : ~RandomState, optional
            Optional random seed to fix sampling results.

        """
        coords = {"new_customer_id": range(n)}
        with pm.Model(coords=coords):
            p = pm.HalfFlat("p")
            q = pm.HalfFlat("q")
            v = pm.HalfFlat("v")

            nu = pm.Gamma("nu", q, v, dims=("new_customer_id",))
            pm.Deterministic("mean_spend", p / nu, dims=("new_customer_id",))

            return pm.sample_posterior_predictive(
                self.idata,
                var_names=["nu", "mean_spend"],
                random_seed=random_seed,
            ).posterior_predictive["mean_spend"]



[docs]
    def expected_new_customer_spend(self) -> xarray.DataArray:
        """Compute the expected mean spend value for a new customer."""
        posterior = self.fit_result
        p_mean = posterior["p"]
        q_mean = posterior["q"]
        v_mean = posterior["v"]

        # Closed form solution to the posterior of nu
        # Eq 3 from [1], p.3
        mean_spend = p_mean * v_mean / (q_mean - 1)
        # TODO: We could also provide the variance
        # var_spend = (p_mean ** 2 * v_mean ** 2) / ((q_mean - 1) ** 2 * (q_mean - 2))

        return mean_spend



[docs]
    def expected_customer_lifetime_value(
        self,
        transaction_model: CLVModel,
        data: pandas.DataFrame,
        future_t: int = 12,
        discount_rate: float = 0.00,
        time_unit: str = "D",
    ) -> xarray.DataArray:
        """Compute the average lifetime value for a group of one or more customers.

        In addition, it applies a discount rate for net present value estimations.

        Note `future_t` is measured in months regardless of `time_unit` specified.

        Adapted from lifetimes package
        https://github.com/CamDavidsonPilon/lifetimes/blob/41e394923ad72b17b5da93e88cfabab43f51abe2/lifetimes/fitters/gamma_gamma_fitter.py#L246

        Parameters
        ----------
        transaction_model : ~CLVModel
            Predictive model for future transactions. `BetaGeoModel` and `ParetoNBDModel` are currently supported.
        data : ~pandas.DataFrame
            DataFrame containing the following columns:

            * `customer_id`: Unique customer identifier
            * `frequency`: Number of repeat purchases observed for each customer
            * `recency`: Time between the first and the last purchase
            * `T`: Time between the first purchase and the end of the observation period
            * `monetary_value`: Mean spend values of repeat purchases for each customer
        future_t : int, optional
            The lifetime expected for the user in months. Default: 12
        discount_rate : float, optional
            The monthly adjusted discount rate. Default: 0.00
        time_unit : string, optional
            Unit of time of the purchase history. Defaults to "D" for daily.
            Other options are "W" (weekly), "M" (monthly), and "H" (hourly).
            Example: If your dataset contains information about weekly purchases,
            you should use "W".

        Returns
        -------
        xarray
            DataArray containing estimated customer lifetime values

        """
        # Use Gamma-Gamma estimates for the expected_spend values
        predicted_monetary_value = self.expected_customer_spend(data=data)
        data.loc[:, "future_spend"] = predicted_monetary_value.mean(
            ("chain", "draw")
        ).copy()

        return customer_lifetime_value(
            transaction_model=transaction_model,
            data=data,
            future_t=future_t,
            discount_rate=discount_rate,
            time_unit=time_unit,
        )





[docs]
class GammaGammaModel(BaseGammaGammaModel):
    """Gamma-Gamma Model for expected future monetary value.

    The Gamma-Gamma model assumes expected future spend follows a Gamma distribution,
    and the scale of this distribution is also Gamma-distributed.

    This model is conditioned on the mean value of repeat transactions for each customer, and is based
    on [1]_, [2]_. Data must be summarized by *frequency* and *monetary_value* for each customer,
    using `clv.rfm_summary()` or equivalent.

    See `GammaGammaModelIndividual` for an equivalent model conditioned
    on individual transaction values.

    Parameters
    ----------
    data : ~pandas.DataFrame
        DataFrame containing the following columns:

        * `customer_id`: Unique customer identifier
        * `monetary_value`: Mean transaction value of repeat purchases for each customer
        * `frequency`: Number of repeat transactions observed for each customer
    model_config : dict, optional
        Dictionary of model prior parameters. If not provided, the model will use default priors specified in the
        `default_model_config` class attribute.
    sampler_config : dict, optional
        Dictionary of sampler parameters. Defaults to *None*.

    Examples
    --------

    .. code-block:: python

        import pymc as pm
        from pymc_marketing.clv import GammaGammaModel

        model = GammaGammaModel(
            data=pandas.DataFrame(
                {
                    "customer_id": [0, 1, 2, 3, ...],
                    "monetary_value": [23.5, 19.3, 11.2, 100.5, ...],
                    "frequency": [6, 8, 2, 1, ...],
                }
            ),
            model_config={
                "p": {"dist": "HalfNormal", kwargs: {}},
                "q": {"dist": "HalfStudentT", kwargs: {"nu": 4, "sigma": 10}},
                "v": {"dist": "HalfCauchy", kwargs: {"beta": 1}},
            },
            sampler_config={
                "draws": 1000,
                "tune": 1000,
                "chains": 2,
                "cores": 2,
                "nuts_kwargs": {"target_accept": 0.95},
            },
        )

        model.fit()
        print(model.fit_summary())

        # Predict spend of customers for which we know transaction history, conditioned on data.
        expected_customer_spend = (
            model.expected_customer_spend(
                data=pandas.DataFrame(
                    {
                        "customer_id": [0, 1, 2, 3, ...],
                        "monetary_value": [23.5, 19.3, 11.2, 100.5, ...],
                        "frequency": [6, 8, 2, 1, ...],
                    }
                ),
            ),
        )
        print(expected_customer_spend.mean("customer_id"))

        # Predict spend of 10 new customers, conditioned on data
        new_customer_spend = model.expected_new_customer_spend(n=10)
        print(new_customer_spend.mean("new_customer_id"))

    References
    ----------
    .. [1] Fader, P. S., & Hardie, B. G. (2013). "The Gamma-Gamma model of monetary
           value". https://www.brucehardie.com/notes/025/gamma_gamma.pdf
    .. [2] Peter S. Fader, Bruce G. S. Hardie, and Ka Lok Lee (2005), “RFM and CLV:
           Using iso-value curves for customer base analysis”, Journal of Marketing
           Research, 42 (November), 415-430.
           https://journals.sagepub.com/doi/pdf/10.1509/jmkr.2005.42.4.415

    """

    _model_type = "Gamma-Gamma Model (Mean Transactions)"


[docs]
    def __init__(
        self,
        data: pandas.DataFrame,
        model_config: dict | None = None,
        sampler_config: dict | None = None,
    ):
        self._validate_cols(
            data,
            required_cols=["customer_id", "monetary_value", "frequency"],
            must_be_unique=["customer_id"],
        )
        super().__init__(
            data=data, model_config=model_config, sampler_config=sampler_config
        )


    @property
    def default_model_config(self) -> ModelConfig:
        """Default model configuration."""
        return {
            "p": Prior("HalfFlat"),
            "q": Prior("HalfFlat"),
            "v": Prior("HalfFlat"),
        }


[docs]
    def build_model(self) -> None:  # type: ignore[override]
        """Build the model."""
        z_mean = pt.as_tensor_variable(self.data["monetary_value"])
        x = pt.as_tensor_variable(self.data["frequency"])

        coords = {"customer_id": self.data["customer_id"]}
        with pm.Model(coords=coords) as self.model:
            p = self.model_config["p"].create_variable("p")
            q = self.model_config["q"].create_variable("q")
            v = self.model_config["v"].create_variable("v")

            # Likelihood for mean_spend, marginalizing over nu
            # Eq 1a from [1], p.2
            pm.Potential(
                "likelihood",
                (
                    pt.gammaln(p * x + q)
                    - pt.gammaln(p * x)
                    - pt.gammaln(q)
                    + q * pt.log(v)
                    + (p * x - 1) * pt.log(z_mean)
                    + (p * x) * pt.log(x)
                    - (p * x + q) * pt.log(x * z_mean + v)
                ),
            )




# TODO: This model requires further evaluation and reference in a notebook

[docs]
class GammaGammaModelIndividual(BaseGammaGammaModel):
    """Gamma-Gamma Model for expected future monetary value.

    The Gamma-Gamma model assumes expected future spend follows a Gamma distribution,
    and the scale of this distribution is also Gamma-distributed.

    This model is conditioned on the spend values of each purchase for each customer,
    and is based on [1]_, [2]_.

    See `GammaGammaModel` for an equivalent model conditioned on mean transaction values
    of repeat purchases for the customer population.

    Parameters
    ----------
    data : ~pandas.DataFrame
        Dataframe containing the following columns:

        * `customer_id`: Unique customer identifier
        * `individual_transaction_value`: Monetary value of each purchase for each customer
    model_config : dict, optional
        Dictionary of model prior parameters. If not provided, the model will use default priors specified in the
        `default_model_config` class attribute.
    sampler_config : dict, optional
        Dictionary of sampler parameters. Defaults to *None*.


    Examples
    --------

    .. code-block:: python

        import pymc as pm
        from pymc_marketing.clv import GammaGammaModelIndividual

        model = GammaGammaModelIndividual(
            data=pandas.DataFrame(
                {
                "customer_id": [0, 0, 0, 1, 1, 2, ...],
                "individual_transaction_value": [5.3. 5.7, 6.9, 13.5, 0.3, 19.2 ...],
                }
            ),
            model_config={
                "p": {dist: 'HalfNorm', kwargs: {}},
                "q": {dist: 'HalfStudentT', kwargs: {"nu": 4, "sigma": 10}},
                "v": {dist: 'HalfCauchy', kwargs: {}},
            },
            sampler_config={
                "draws": 1000,
                "tune": 1000,
                "chains": 2,
                "cores": 2,
                "nuts_kwargs": {"target_accept": 0.95},
            },
        )

        model.fit()
        print(model.fit_summary())

        # Predict spend of customers for which we know transaction history,
        # conditioned on data. May include customers not included in fitting
        expected_customer_spend = model.expected_customer_spend(
            data=pandas.DataFrame(
                {
                "customer_id": [0, 0, 0, 1, 1, 2, ...],
                "individual_transaction_value": [5.3. 5.7, 6.9, 13.5, 0.3, 19.2 ...],
                }
            ),
        )
        print(expected_customer_spend.mean("customer_id"))

        # Predict spend of 10 new customers, conditioned on data
        new_customer_spend = model.expected_new_customer_spend(n=10)
        print(new_customer_spend.mean("new_customer_id"))

    References
    ----------
    .. [1] Fader, P. S., & Hardie, B. G. (2013). "The Gamma-Gamma model of monetary
           value". http://www.brucehardie.com/notes/025/gamma_gamma.pdf
    .. [2] Peter S. Fader, Bruce G. S. Hardie, and Ka Lok Lee (2005), “RFM and CLV:
           Using iso-value curves for customer base analysis”, Journal of Marketing
           Research, 42 (November), 415-430.
           https://journals.sagepub.com/doi/pdf/10.1509/jmkr.2005.42.4.415

    """

    _model_type = "Gamma-Gamma Model (Individual Transactions)"


[docs]
    def __init__(
        self,
        data: pandas.DataFrame,
        model_config: dict | None = None,
        sampler_config: dict | None = None,
    ):
        self._validate_cols(
            data, required_cols=["customer_id", "individual_transaction_value"]
        )
        super().__init__(
            data=data, model_config=model_config, sampler_config=sampler_config
        )


    @property
    def default_model_config(self) -> dict:
        """Default model configuration."""
        return {
            "p": Prior("HalfFlat"),
            "q": Prior("HalfFlat"),
            "v": Prior("HalfFlat"),
        }


[docs]
    def build_model(self) -> None:  # type: ignore[override]
        """Build the model."""
        z = self.data["individual_transaction_value"]

        coords = {
            "customer_id": np.unique(self.data["customer_id"]),
            "obs": range(self.data.shape[0]),
        }
        with pm.Model(coords=coords) as self.model:
            p = self.model_config["p"].create_variable("p")
            q = self.model_config["q"].create_variable("q")
            v = self.model_config["v"].create_variable("v")

            nu = pm.Gamma("nu", q, v, dims=("customer_id",))
            pm.Gamma(
                "spend", p, nu[self.data["customer_id"]], observed=z, dims=("obs",)
            )