Source code for pymc_marketing.clv.models.gamma_gamma

#   Copyright 2022 - 2025 The PyMC Labs Developers
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
"""Gamma-Gamma Model for expected future monetary value."""

import numpy as np
import pandas
import pymc as pm
import pytensor.tensor as pt
import xarray
from pymc.util import RandomState

from pymc_marketing.clv.models import CLVModel
from pymc_marketing.clv.utils import customer_lifetime_value, to_xarray
from pymc_marketing.model_config import ModelConfig
from pymc_marketing.prior import Prior


class BaseGammaGammaModel(CLVModel):
    """Base class for Gamma-Gamma models.

    Holds the spend-prediction methods shared by the Gamma-Gamma variants.
    Every method reads the fitted ``p``, ``q`` and ``v`` parameters from
    ``self.idata`` (sampling methods) or ``self.fit_result`` (closed-form
    methods).
    """

    def distribution_customer_spend(
        self,
        data: pandas.DataFrame,
        random_seed: RandomState | None = None,
    ) -> xarray.DataArray:
        """Posterior distribution of mean spend values for each customer.

        Parameters
        ----------
        data : ~pandas.DataFrame
            DataFrame containing the following columns:

            * `customer_id`: Unique customer identifier
            * `frequency`: Number of purchases
            * `monetary_value`: Mean spend values for each customer
        random_seed : ~RandomState, optional
            Optional random seed to fix sampling results.

        Returns
        -------
        xarray.DataArray
            The ``mean_spend`` variable of the posterior predictive group,
            with one entry per row of `data` along ``customer_id``.
        """
        x = data["frequency"]
        z_mean = data["monetary_value"]

        # Label the dimension with the ids in row order.  `x` and `z_mean` are
        # in row order as well, so a sorted label set (np.unique) would attach
        # results to the wrong customer whenever `data` is not sorted by id.
        coords = {"customer_id": data["customer_id"]}
        with pm.Model(coords=coords):
            # Placeholders only: their values are expected to come from
            # `self.idata` during posterior predictive sampling.
            p = pm.HalfFlat("p")
            q = pm.HalfFlat("q")
            v = pm.HalfFlat("v")

            # Eq 5 from [1], p.3
            nu = pm.Gamma("nu", p * x + q, v + x * z_mean, dims=("customer_id",))
            pm.Deterministic("mean_spend", p / nu, dims=("customer_id",))

            return pm.sample_posterior_predictive(
                self.idata,
                var_names=["nu", "mean_spend"],
                random_seed=random_seed,
            ).posterior_predictive["mean_spend"]

    def expected_customer_spend(
        self,
        data: pandas.DataFrame,
    ) -> xarray.DataArray:
        """Compute the expected future mean spend value per customer.

        The computations are based on Eq 5 from [1], p.3.

        Adapted from: https://github.com/CamDavidsonPilon/lifetimes/blob/aae339c5437ec31717309ba0ec394427e19753c4/lifetimes/fitters/gamma_gamma_fitter.py#L117

        Parameters
        ----------
        data : ~pandas.DataFrame
            DataFrame containing the following columns:

            * `customer_id`: Unique customer identifier
            * `frequency`: Number of transactions observed for each customer
            * `monetary_value`: Mean transaction value of repeat purchases for each customer

        Returns
        -------
        xarray.DataArray
            Weighted average of the population mean spend and each customer's
            observed mean transaction value.

        References
        ----------
        .. [1] Fader, P. S., & Hardie, B. G. (2013). "The Gamma-Gamma model of
               monetary value". February, 2, 1-9.
               https://www.brucehardie.com/notes/025/gamma_gamma.pdf
        """
        mean_transaction_value, frequency = to_xarray(
            data["customer_id"],
            data["monetary_value"],
            data["frequency"],
        )
        posterior = self.fit_result

        p = posterior["p"]
        q = posterior["q"]
        v = posterior["v"]

        # The weight on the customer's own history grows with `frequency`.
        individual_weight = p * frequency / (p * frequency + q - 1)
        population_mean = v * p / (q - 1)
        return (
            1 - individual_weight
        ) * population_mean + individual_weight * mean_transaction_value

    def distribution_new_customer_spend(
        self, n: int = 1, random_seed: RandomState | None = None
    ) -> xarray.DataArray:
        """Posterior distribution of mean spend values for new customers.

        Parameters
        ----------
        n : int, optional
            Number of posterior distributions to generate. This can usually be
            left at the default value of 1.
        random_seed : ~RandomState, optional
            Optional random seed to fix sampling results.

        Returns
        -------
        xarray.DataArray
            The ``mean_spend`` variable of the posterior predictive group,
            with `n` entries along ``new_customer_id``.
        """
        coords = {"new_customer_id": range(n)}
        with pm.Model(coords=coords):
            # Placeholders only: their values are expected to come from
            # `self.idata` during posterior predictive sampling.
            p = pm.HalfFlat("p")
            q = pm.HalfFlat("q")
            v = pm.HalfFlat("v")

            nu = pm.Gamma("nu", q, v, dims=("new_customer_id",))
            pm.Deterministic("mean_spend", p / nu, dims=("new_customer_id",))

            return pm.sample_posterior_predictive(
                self.idata,
                var_names=["nu", "mean_spend"],
                random_seed=random_seed,
            ).posterior_predictive["mean_spend"]

    def expected_new_customer_spend(self) -> xarray.DataArray:
        """Compute the expected mean spend value for a new customer.

        Returns
        -------
        xarray.DataArray
            ``p * v / (q - 1)`` evaluated on the fitted parameters.
        """
        posterior = self.fit_result

        p_mean = posterior["p"]
        q_mean = posterior["q"]
        v_mean = posterior["v"]

        # Closed form solution to the posterior of nu
        # Eq 3 from [1], p.3
        mean_spend = p_mean * v_mean / (q_mean - 1)
        # TODO: We could also provide the variance
        # var_spend = (p_mean ** 2 * v_mean ** 2) / ((q_mean - 1) ** 2 * (q_mean - 2))

        return mean_spend

    def expected_customer_lifetime_value(
        self,
        transaction_model: CLVModel,
        data: pandas.DataFrame,
        future_t: int = 12,
        discount_rate: float = 0.00,
        time_unit: str = "D",
    ) -> xarray.DataArray:
        """Compute the average lifetime value for a group of one or more customers.

        In addition, it applies a discount rate for net present value estimations.
        Note `future_t` is measured in months regardless of `time_unit` specified.

        Adapted from lifetimes package
        https://github.com/CamDavidsonPilon/lifetimes/blob/41e394923ad72b17b5da93e88cfabab43f51abe2/lifetimes/fitters/gamma_gamma_fitter.py#L246

        Parameters
        ----------
        transaction_model : ~CLVModel
            Predictive model for future transactions. `BetaGeoModel` and `ParetoNBDModel` are currently supported.
        data : ~pandas.DataFrame
            DataFrame containing the following columns:

            * `customer_id`: Unique customer identifier
            * `frequency`: Number of repeat purchases observed for each customer
            * `recency`: Time between the first and the last purchase
            * `T`: Time between the first purchase and the end of the observation period
            * `monetary_value`: Mean spend values of repeat purchases for each customer

            The frame is not modified; the computation works on a copy.
        future_t : int, optional
            The lifetime expected for the user in months. Default: 12
        discount_rate : float, optional
            The monthly adjusted discount rate. Default: 0.00
        time_unit : string, optional
            Unit of time of the purchase history. Defaults to "D" for daily.
            Other options are "W" (weekly), "M" (monthly), and "H" (hourly).
            Example: If your dataset contains information about weekly purchases,
            you should use "W".

        Returns
        -------
        xarray DataArray containing estimated customer lifetime values
        """
        # Use Gamma-Gamma estimates for the expected_spend values
        predicted_monetary_value = self.expected_customer_spend(data=data)

        # Work on a copy so the caller's DataFrame does not silently gain a
        # `future_spend` column as a side effect of this call.
        data = data.copy()
        data.loc[:, "future_spend"] = predicted_monetary_value.mean(
            ("chain", "draw")
        ).copy()

        return customer_lifetime_value(
            transaction_model=transaction_model,
            data=data,
            future_t=future_t,
            discount_rate=discount_rate,
            time_unit=time_unit,
        )
class GammaGammaModel(BaseGammaGammaModel):
    """Gamma-Gamma Model for expected future monetary value.

    The Gamma-Gamma model assumes expected future spend follows a Gamma distribution,
    and the scale of this distribution is also Gamma-distributed.

    This model is conditioned on the mean value of repeat transactions for each customer, and is based
    on [1]_, [2]_. Data must be summarized by *frequency* and *monetary_value* for each customer,
    using `clv.rfm_summary()` or equivalent.

    See `GammaGammaModelIndividual` for an equivalent model conditioned on individual transaction values.

    Parameters
    ----------
    data : ~pandas.DataFrame
        DataFrame containing the following columns:

        * `customer_id`: Unique customer identifier
        * `monetary_value`: Mean transaction value of repeat purchases for each customer
        * `frequency`: Number of repeat transactions observed for each customer
    model_config : dict, optional
        Dictionary of model prior parameters. If not provided, the model will use default priors specified in the
        `default_model_config` class attribute.
    sampler_config : dict, optional
        Dictionary of sampler parameters. Defaults to *None*.

    Examples
    --------
    .. code-block:: python

        import pandas

        from pymc_marketing.clv import GammaGammaModel

        model = GammaGammaModel(
            data=pandas.DataFrame(
                {
                    "customer_id": [0, 1, 2, 3, ...],
                    "monetary_value": [23.5, 19.3, 11.2, 100.5, ...],
                    "frequency": [6, 8, 2, 1, ...],
                }
            ),
            model_config={
                "p": {"dist": "HalfNormal", "kwargs": {}},
                "q": {"dist": "HalfStudentT", "kwargs": {"nu": 4, "sigma": 10}},
                "v": {"dist": "HalfCauchy", "kwargs": {"beta": 1}},
            },
            sampler_config={
                "draws": 1000,
                "tune": 1000,
                "chains": 2,
                "cores": 2,
                "nuts_kwargs": {"target_accept": 0.95},
            },
        )

        model.fit()
        print(model.fit_summary())

        # Predict spend of customers for which we know transaction history,
        # conditioned on data.
        expected_customer_spend = model.expected_customer_spend(
            data=pandas.DataFrame(
                {
                    "customer_id": [0, 1, 2, 3, ...],
                    "monetary_value": [23.5, 19.3, 11.2, 100.5, ...],
                    "frequency": [6, 8, 2, 1, ...],
                }
            ),
        )
        print(expected_customer_spend.mean("customer_id"))

        # Predict spend of 10 new customers, conditioned on data
        new_customer_spend = model.distribution_new_customer_spend(n=10)
        print(new_customer_spend.mean("new_customer_id"))

    References
    ----------
    .. [1] Fader, P. S., & Hardie, B. G. (2013). "The Gamma-Gamma model of monetary value".
           https://www.brucehardie.com/notes/025/gamma_gamma.pdf
    .. [2] Peter S. Fader, Bruce G. S. Hardie, and Ka Lok Lee (2005), “RFM and CLV:
           Using iso-value curves for customer base analysis”, Journal of Marketing
           Research, 42 (November), 415-430.
           https://journals.sagepub.com/doi/pdf/10.1509/jmkr.2005.42.4.415
    """

    _model_type = "Gamma-Gamma Model (Mean Transactions)"

    def __init__(
        self,
        data: pandas.DataFrame,
        model_config: dict | None = None,
        sampler_config: dict | None = None,
    ):
        """Validate the summary columns and initialise the base model.

        Parameters
        ----------
        data : ~pandas.DataFrame
            Must contain `customer_id`, `monetary_value` and `frequency`;
            `customer_id` is validated as unique.
        model_config : dict, optional
            Prior configuration forwarded to the parent constructor.
        sampler_config : dict, optional
            Sampler configuration forwarded to the parent constructor.
        """
        self._validate_cols(
            data,
            required_cols=["customer_id", "monetary_value", "frequency"],
            must_be_unique=["customer_id"],
        )
        super().__init__(
            data=data, model_config=model_config, sampler_config=sampler_config
        )

    @property
    def default_model_config(self) -> ModelConfig:
        """Default model configuration: a `HalfFlat` prior on `p`, `q` and `v`."""
        return {
            "p": Prior("HalfFlat"),
            "q": Prior("HalfFlat"),
            "v": Prior("HalfFlat"),
        }

    def build_model(self) -> None:  # type: ignore[override]
        """Build the model.

        Creates ``self.model`` with the priors from ``self.model_config`` and a
        `pm.Potential` holding the log-likelihood of the observed mean spend
        with the per-customer `nu` marginalised out.
        """
        z_mean = pt.as_tensor_variable(self.data["monetary_value"])
        x = pt.as_tensor_variable(self.data["frequency"])

        coords = {"customer_id": self.data["customer_id"]}
        with pm.Model(coords=coords) as self.model:
            p = self.model_config["p"].create_variable("p")
            q = self.model_config["q"].create_variable("q")
            v = self.model_config["v"].create_variable("v")

            # Likelihood for mean_spend, marginalizing over nu
            # Eq 1a from [1], p.2
            pm.Potential(
                "likelihood",
                (
                    pt.gammaln(p * x + q)
                    - pt.gammaln(p * x)
                    - pt.gammaln(q)
                    + q * pt.log(v)
                    + (p * x - 1) * pt.log(z_mean)
                    + (p * x) * pt.log(x)
                    - (p * x + q) * pt.log(x * z_mean + v)
                ),
            )
# TODO: This model requires further evaluation and reference in a notebook
class GammaGammaModelIndividual(BaseGammaGammaModel):
    """Gamma-Gamma Model for expected future monetary value.

    The Gamma-Gamma model assumes expected future spend follows a Gamma distribution,
    and the scale of this distribution is also Gamma-distributed.

    This model is conditioned on the spend values of each purchase for each customer, and is based on [1]_, [2]_.

    See `GammaGammaModel` for an equivalent model conditioned on mean transaction values of
    repeat purchases for the customer population.

    Parameters
    ----------
    data : ~pandas.DataFrame
        Dataframe containing the following columns:

        * `customer_id`: Unique customer identifier
        * `individual_transaction_value`: Monetary value of each purchase for each customer
    model_config : dict, optional
        Dictionary of model prior parameters. If not provided, the model will use default priors specified in the
        `default_model_config` class attribute.
    sampler_config : dict, optional
        Dictionary of sampler parameters. Defaults to *None*.

    Examples
    --------
    .. code-block:: python

        import pandas

        from pymc_marketing.clv import GammaGammaModelIndividual

        model = GammaGammaModelIndividual(
            data=pandas.DataFrame(
                {
                    "customer_id": [0, 0, 0, 1, 1, 2, ...],
                    "individual_transaction_value": [5.3, 5.7, 6.9, 13.5, 0.3, 19.2, ...],
                }
            ),
            model_config={
                "p": {"dist": "HalfNormal", "kwargs": {}},
                "q": {"dist": "HalfStudentT", "kwargs": {"nu": 4, "sigma": 10}},
                "v": {"dist": "HalfCauchy", "kwargs": {}},
            },
            sampler_config={
                "draws": 1000,
                "tune": 1000,
                "chains": 2,
                "cores": 2,
                "nuts_kwargs": {"target_accept": 0.95},
            },
        )

        model.fit()
        print(model.fit_summary())

        # Predict spend of customers for which we know transaction history,
        # conditioned on data. May include customers not included in fitting.
        # The prediction methods expect per-customer summary columns.
        expected_customer_spend = model.expected_customer_spend(
            data=pandas.DataFrame(
                {
                    "customer_id": [0, 1, 2, ...],
                    "monetary_value": [5.97, 6.9, 19.2, ...],
                    "frequency": [3, 2, 1, ...],
                }
            ),
        )
        print(expected_customer_spend.mean("customer_id"))

        # Predict spend of 10 new customers, conditioned on data
        new_customer_spend = model.distribution_new_customer_spend(n=10)
        print(new_customer_spend.mean("new_customer_id"))

    References
    ----------
    .. [1] Fader, P. S., & Hardie, B. G. (2013). "The Gamma-Gamma model of monetary value".
           http://www.brucehardie.com/notes/025/gamma_gamma.pdf
    .. [2] Peter S. Fader, Bruce G. S. Hardie, and Ka Lok Lee (2005), “RFM and CLV:
           Using iso-value curves for customer base analysis”, Journal of Marketing
           Research, 42 (November), 415-430.
           https://journals.sagepub.com/doi/pdf/10.1509/jmkr.2005.42.4.415
    """

    _model_type = "Gamma-Gamma Model (Individual Transactions)"

    def __init__(
        self,
        data: pandas.DataFrame,
        model_config: dict | None = None,
        sampler_config: dict | None = None,
    ):
        """Validate the transaction columns and initialise the base model.

        Parameters
        ----------
        data : ~pandas.DataFrame
            Must contain `customer_id` and `individual_transaction_value`.
        model_config : dict, optional
            Prior configuration forwarded to the parent constructor.
        sampler_config : dict, optional
            Sampler configuration forwarded to the parent constructor.
        """
        self._validate_cols(
            data, required_cols=["customer_id", "individual_transaction_value"]
        )
        super().__init__(
            data=data, model_config=model_config, sampler_config=sampler_config
        )

    @property
    def default_model_config(self) -> ModelConfig:
        """Default model configuration: a `HalfFlat` prior on `p`, `q` and `v`."""
        return {
            "p": Prior("HalfFlat"),
            "q": Prior("HalfFlat"),
            "v": Prior("HalfFlat"),
        }

    def build_model(self) -> None:  # type: ignore[override]
        """Build the model.

        Creates ``self.model`` with one `nu` per distinct customer and a Gamma
        likelihood over every observed transaction value.
        """
        z = self.data["individual_transaction_value"]

        # `customer_index[i]` is the position of row i's customer within the
        # sorted unique ids.  Indexing `nu` with these codes, rather than with
        # the raw ids, is identical when ids are exactly 0..K-1 and stays
        # correct for string, non-contiguous or otherwise arbitrary ids.
        customer_ids, customer_index = np.unique(
            self.data["customer_id"], return_inverse=True
        )

        coords = {
            "customer_id": customer_ids,
            "obs": range(self.data.shape[0]),
        }
        with pm.Model(coords=coords) as self.model:
            p = self.model_config["p"].create_variable("p")
            q = self.model_config["q"].create_variable("q")
            v = self.model_config["v"].create_variable("v")

            nu = pm.Gamma("nu", q, v, dims=("customer_id",))
            pm.Gamma("spend", p, nu[customer_index], observed=z, dims=("obs",))