Source code for pymc_marketing.mmm.builders.yaml

#   Copyright 2022 - 2025 The PyMC Labs Developers
#
#   Licensed under the Apache License, Version 2.0 (the "License");
#   you may not use this file except in compliance with the License.
#   You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.
"""Builder for MMM projects."""

from __future__ import annotations

import os
import warnings
from collections.abc import Mapping
from pathlib import Path
from typing import Any

import pandas as pd
import yaml  # type: ignore

from pymc_marketing.mmm.builders.factories import build
from pymc_marketing.mmm.multidimensional import MMM
from pymc_marketing.utils import from_netcdf


def _load_df(path: str | Path) -> pd.DataFrame:
    """
    Read a DataFrame from *path* based on extension.

    Currently supports: .parquet, .csv, .txt
    """
    path = Path(path)
    if path.suffix == ".parquet":
        return pd.read_parquet(path)
    if path.suffix in {".csv", ".txt"}:
        return pd.read_csv(path)
    raise ValueError(f"Unrecognised tabular format: {path}")



[docs]
def build_mmm_from_yaml(
    config_path: str | Path,
    *,
    X: pd.DataFrame | None = None,
    y: pd.DataFrame | pd.Series | None = None,
) -> MMM:
    """
    Build an MMM model from *config_path*.

    The configuration keys:

    - `model` (required): MMM initialization parameters
    - `effects` (optional): list of additive effects in the model
    - `data` (optional): paths to X and y data
    - `original_scale_vars` (optional): list of original scale variables
    - `idata_path` (optional): path to inference data

    Parameters
    ----------
    config_path : str | Path
        YAML file with model configuration.
    X : pandas.DataFrame, optional
        Pre-loaded covariate matrix.  If omitted, the loader tries to read it
        from a path in the YAML under `data.X_path`.
    y : pandas.DataFrame | pandas.Series, optional
        Pre-loaded target vector.  If omitted, the loader tries to read it
        from a path in the YAML under `data.y_path`.

    Returns
    -------
    model : MMM
    """
    cfg: Mapping[str, Any] = yaml.safe_load(Path(config_path).read_text())

    # 1 ─────────────────────────────────── shell (no effects yet)
    model_config = cfg["model"]["kwargs"]  # Get model kwargs

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=DeprecationWarning)
        model = build(cfg["model"])

    # 2 ──────────────────────────────── resolve covariates / target
    data_cfg: Mapping[str, Any] = cfg.get("data", {})
    if X is None:
        if "X_path" not in data_cfg:
            raise ValueError("X not provided and no `data.X_path` found in YAML.")
        X = _load_df(data_cfg["X_path"])
    if y is None:
        if "y_path" not in data_cfg:
            raise ValueError("y not provided and no `data.y_path` found in YAML.")
        y = _load_df(data_cfg["y_path"])

    # Convert date column after loading data
    date_column = model_config.get("date_column")
    if date_column:
        date_col_in_X = date_column in X.columns

        if date_column in X.columns:
            X[date_column] = pd.to_datetime(X[date_column])

        if not date_col_in_X:
            raise ValueError(
                f"Date column '{date_column}' specified in config not found in either X or y data."
            )

    # 3 ───────────────────────────────────── effects (preserve order)
    # Build and append each effect
    for eff_spec in cfg.get("effects", []):
        effect = build(eff_spec)
        model.mu_effects.append(effect)

    # 4 ───────────────────────────────────────────── build PyMC graph
    model.build_model(X, y)  # this **must** precede any idata loading

    # 5 ───────────────────────── add original scale contribution variables
    original_scale_vars = cfg.get("original_scale_vars", [])
    if original_scale_vars:
        model.add_original_scale_contribution_variable(var=original_scale_vars)

    # 6 ──────────────────────────────────────────── attach inference data
    if (idata_fp := cfg.get("idata_path")) is not None:
        idata_path = Path(idata_fp)
        if os.path.exists(idata_path):
            model.idata = from_netcdf(idata_path)

    return model