import json
import logging
from functools import lru_cache
from pathlib import Path
from typing import Any, Dict, List, Tuple

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from pandas.api.types import is_bool_dtype, is_float_dtype, is_integer_dtype

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Paths & constants
# ---------------------------------------------------------------------------

try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    # Fallback for environments where __file__ is not defined (e.g. some REPLs / notebooks).
    BASE_DIR = Path.cwd()

A_MODEL_PATH = BASE_DIR / "xgboost_model_A.joblib"
A_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_A.json"

B_MODEL_PATH = BASE_DIR / "xgboost_model_B.joblib"
B_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_B.json"

T_MODEL_PATH = BASE_DIR / "xgboost_model_T.joblib"
T_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_T.json"


# ---------------------------------------------------------------------------
# Loaders
# ---------------------------------------------------------------------------

def _load_category_orders(path: Path) -> Dict[str, Any]:
    """Load category orders JSON from disk."""
    with open(path, "r") as f:
        return json.load(f)
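
# Illustrative shape of a category-orders file (column names and values
# here are hypothetical placeholders; the real files are written out at
# training time):
#
#     {
#         "employment_status": ["employed", "self-employed", "unemployed"],
#         "rating_band": ["-4.0", "-2.0", "0.0", "2.0"]
#     }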


@lru_cache(maxsize=1)
def _load_a_model():
    """Load and cache model A."""
    logger.info("Loading model A from %s", A_MODEL_PATH)
    return joblib.load(A_MODEL_PATH)


@lru_cache(maxsize=1)
def _load_b_model():
    """Load and cache model B."""
    logger.info("Loading model B from %s", B_MODEL_PATH)
    return joblib.load(B_MODEL_PATH)


@lru_cache(maxsize=1)
def _load_t_model():
    """Load and cache model T."""
    logger.info("Loading model T from %s", T_MODEL_PATH)
    return joblib.load(T_MODEL_PATH)


@lru_cache(maxsize=None)
def _load_category_orders_cached(path: Path) -> Dict[str, Any]:
    """Cache category orders per path to avoid disk I/O on each scoring call."""
    logger.info("Loading category orders from %s", path)
    return _load_category_orders(path)


def _get_expected_features(model: Any, df: pd.DataFrame) -> List[str]:
    """
    Get the expected feature names from the model.

    If the model has no 'feature_names' attribute, fall back to the
    DataFrame's column order. This is a defensive measure; ideally,
    feature names should always be stored with the model.
    """
    feature_names = getattr(model, "feature_names", None)
    if feature_names is None:
        logger.warning(
            "Model has no attribute 'feature_names'; using DataFrame column order."
        )
        feature_names = list(df.columns)
    return list(feature_names)


# ---------------------------------------------------------------------------
# Preprocessing helpers
# ---------------------------------------------------------------------------

MISSING_SENTINELS = [None, "", "null", np.nan, "nan", " "]


def _to_string_category(series: pd.Series) -> pd.Series:
    """
    Force a categorical series whose categories are strings (not floats),
    backed by NumPy object dtype (not pandas StringDtype) for XGBoost
    compatibility.
    """
    s = series.replace(MISSING_SENTINELS, np.nan)
    # Use classic Python strings (object dtype), not pandas' StringDtype,
    # so that XGBoost's numpy-based dtype checks work correctly. Masking
    # with the pre-cast NaNs prevents astype(str) from leaving the literal
    # string "nan" behind.
    s = s.astype(str).where(s.notna())
    return s.astype("category")
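
# Illustrative behavior (hypothetical values):
#     _to_string_category(pd.Series([1.0, "null", "x"]))
#     -> string categories {"1.0", "x"}, with "null" normalized to NaN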


def _sanitize_expected_feature_dtypes(
    df: pd.DataFrame,
    expected_features: List[str],
    categorical_feature_names: List[str],
) -> pd.DataFrame:
    """
    XGBoost's DMatrix does NOT allow object dtype.

    For each expected feature column:
      - If the dtype is numeric or bool, keep it as-is.
      - If the feature is a known training-time categorical, leave it
        untouched (the _prepare_* helpers already built it).
      - Anything else (object, string, unexpected categorical) is coerced
        to numeric, with unparseable values becoming NaN.

    This mirrors the safety checks needed to satisfy the XGBoost 3.x
    pandas backend (`enable_categorical=True`).
    """
    df = df.copy()
    categorical_set = set(categorical_feature_names)

    for col in expected_features:
        if col not in df.columns:
            # Ensure the column exists so downstream checks don't fail here.
            df[col] = np.nan

        dtype = df[col].dtype

        # If this feature is known to be categorical from training-time
        # category_orders, assume _prepare_* already produced a proper
        # pandas Categorical with the training categories and leave it
        # untouched so that category codes match training.
        if col in categorical_set:
            continue

        # For non-categorical features, XGBoost expects numeric or bool.
        if is_bool_dtype(dtype) or is_integer_dtype(dtype) or is_float_dtype(dtype):
            continue

        # Anything else (object, string, unexpected categorical) -> numeric coercion.
        df[col] = pd.to_numeric(df[col], errors="coerce")

    return df
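
# Illustrative coercion behavior (values are hypothetical): an object column
# like ["1", "2.5", "oops"] is coerced by pd.to_numeric(..., errors="coerce")
# to [1.0, 2.5, NaN], which DMatrix accepts as float.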


def _prepare_a(df: pd.DataFrame, category_orders: Dict[str, List[Any]]) -> pd.DataFrame:
    """
    Prepare features for model A.

    For each column with category orders:
      - If all category labels are numeric-like, map incoming values onto
        the canonical (training-time) labels via float comparison.
      - Otherwise, treat values as plain string categories.

    Either way, the column becomes an ordered Categorical over the
    training categories.
    """
    df = df.copy()
    for col, raw_categories in category_orders.items():
        if col not in df.columns:
            df[col] = np.nan

        # Avoid inplace replace on a column selection: under pandas
        # copy-on-write it may not write through to the DataFrame.
        df[col] = df[col].replace(MISSING_SENTINELS, np.nan)

        # Detect whether category labels are numeric-like and, if so,
        # map numeric values onto the canonical string labels used
        # during training (e.g. -4 -> "-4.0").
        numeric_like = True
        numeric_label_map: Dict[float, str] = {}
        for v in raw_categories:
            try:
                numeric_label_map[float(v)] = str(v)
            except (TypeError, ValueError):
                numeric_like = False
                break

        if numeric_like:
            def _map_value(val: Any) -> Any:
                if pd.isna(val):
                    return np.nan
                try:
                    key = float(val)
                except (TypeError, ValueError):
                    return np.nan
                return numeric_label_map.get(key, np.nan)

            df[col] = df[col].map(_map_value)
            df[col] = pd.Categorical(df[col], categories=raw_categories, ordered=True)
        else:
            # Pure string categories: coerce to plain strings.
            df[col] = df[col].astype(str)
            df[col] = pd.Categorical(df[col], categories=raw_categories, ordered=True)

    return df
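
# Illustrative example (hypothetical categories): with training categories
# ["-4.0", "0.0", "4.0"], raw inputs -4, "0", and 4.0 all land on the
# canonical labels "-4.0", "0.0", and "4.0"; an unseen value such as 7
# becomes NaN.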


def _prepare_with_lower(df: pd.DataFrame, category_orders: Dict[str, List[Any]]) -> pd.DataFrame:
    """
    Shared preparation logic for models B and T, where categorical
    values are lowercased strings.
    """
    df = df.copy()
    for col, raw_categories in category_orders.items():
        if col not in df.columns:
            df[col] = np.nan

        # Normalize missing-like representations (non-inplace: see _prepare_a).
        df[col] = df[col].replace(MISSING_SENTINELS, np.nan)

        # Detect whether category labels are numeric-like and, if so,
        # map numeric values onto the canonical string labels used
        # during training (e.g. -4 -> "-4.0"). Otherwise, treat
        # them as lowercased string categories.
        numeric_like = True
        numeric_label_map: Dict[float, str] = {}
        for v in raw_categories:
            try:
                numeric_label_map[float(v)] = str(v)
            except (TypeError, ValueError):
                numeric_like = False
                break

        if numeric_like:
            def _map_value(val: Any) -> Any:
                if pd.isna(val):
                    return np.nan
                try:
                    key = float(val)
                except (TypeError, ValueError):
                    return np.nan
                return numeric_label_map.get(key, np.nan)

            df[col] = df[col].map(_map_value)
            df[col] = pd.Categorical(df[col], categories=raw_categories, ordered=True)
        else:
            # String categories: lower-cased string representation.
            df[col] = df[col].astype(str).str.lower()
            df[col] = pd.Categorical(df[col], categories=raw_categories, ordered=True)

    return df


def _prepare_b(df: pd.DataFrame, category_orders: Dict[str, List[Any]]) -> pd.DataFrame:
    """Prepare features for model B (lowercased categorical values)."""
    return _prepare_with_lower(df, category_orders)


def _prepare_t(df: pd.DataFrame, category_orders: Dict[str, List[Any]]) -> pd.DataFrame:
    """Prepare features for model T (lowercased categorical values)."""
    return _prepare_with_lower(df, category_orders)


# ---------------------------------------------------------------------------
# Per-model processing functions
# ---------------------------------------------------------------------------

def processing_a(input_data: pd.DataFrame) -> float:
    """Run model A on input_data and return the first prediction as a float."""
    df = pd.DataFrame(input_data)
    if df.empty:
        raise ValueError("Input DataFrame for model A is empty.")

    model = _load_a_model()
    category_orders = _load_category_orders_cached(A_CATEGORY_ORDERS_PATH)
    df = _prepare_a(df, category_orders)

    expected_features = _get_expected_features(model, df)
    df = _sanitize_expected_feature_dtypes(df, expected_features, list(category_orders.keys()))

    # Defensive check; _sanitize_expected_feature_dtypes already adds any
    # missing expected features as NaN columns.
    missing_features = set(expected_features) - set(df.columns)
    if missing_features:
        raise KeyError(
            f"Missing expected features for model A: {sorted(missing_features)}"
        )

    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
    predictions = model.predict(dmatrix)

    if len(predictions) == 0:
        raise RuntimeError("Model A returned no predictions.")

    return float(predictions[0])


def processing_b(input_data: pd.DataFrame) -> float:
    """Run model B on input_data and return the first prediction as a float."""
    df = pd.DataFrame(input_data)
    if df.empty:
        raise ValueError("Input DataFrame for model B is empty.")

    model = _load_b_model()
    category_orders = _load_category_orders_cached(B_CATEGORY_ORDERS_PATH)
    df = _prepare_b(df, category_orders)

    expected_features = _get_expected_features(model, df)
    df = _sanitize_expected_feature_dtypes(df, expected_features, list(category_orders.keys()))

    missing_features = set(expected_features) - set(df.columns)
    if missing_features:
        raise KeyError(
            f"Missing expected features for model B: {sorted(missing_features)}"
        )

    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
    predictions = model.predict(dmatrix)

    if len(predictions) == 0:
        raise RuntimeError("Model B returned no predictions.")

    return float(predictions[0])


def processing_t(input_data: pd.DataFrame) -> float:
    """Run model T on input_data and return the first prediction as a float."""
    df = pd.DataFrame(input_data)
    if df.empty:
        raise ValueError("Input DataFrame for model T is empty.")

    model = _load_t_model()
    category_orders = _load_category_orders_cached(T_CATEGORY_ORDERS_PATH)
    df = _prepare_t(df, category_orders)

    expected_features = _get_expected_features(model, df)
    df = _sanitize_expected_feature_dtypes(df, expected_features, list(category_orders.keys()))

    missing_features = set(expected_features) - set(df.columns)
    if missing_features:
        raise KeyError(
            f"Missing expected features for model T: {sorted(missing_features)}"
        )

    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
    predictions = model.predict(dmatrix)

    if len(predictions) == 0:
        raise RuntimeError("Model T returned no predictions.")

    return float(predictions[0])


def processing_all(
    df_a: pd.DataFrame,
    df_b: pd.DataFrame,
    df_t: pd.DataFrame,
) -> Tuple[float, float, float]:
    """Convenience function to run all three models and return their predictions."""
    return (
        processing_a(df_a),
        processing_b(df_b),
        processing_t(df_t),
    )


# ---------------------------------------------------------------------------
# Main entrypoint for batch-style input
# ---------------------------------------------------------------------------

def __main__(results: List[Dict[str, Any]]) -> Dict[str, float]:
    """
    Main entrypoint for processing a list of results dicts.

    Expected shape of each element in `results`:

        {
            "model_a_features": { ... feature_name: value ... },
            "model_b_features": { ... feature_name: value ... },
            "model_t_features": { ... feature_name: value ... },
        }
    """
    logger.info("Data received in processing block: %s", results)

    df = pd.DataFrame(results)
    if df.empty:
        raise ValueError("Input results list is empty.")

    required = {"model_a_features", "model_b_features", "model_t_features"}
    if not required.issubset(df.columns):
        missing = required - set(df.columns)
        raise KeyError(f"Missing expected keys in results: {sorted(missing)}")

    # Each cell of these columns is expected to be a dict-like object.
    df_a = pd.DataFrame(list(df["model_a_features"]))
    df_b = pd.DataFrame(list(df["model_b_features"]))
    df_t = pd.DataFrame(list(df["model_t_features"]))

    pd_a, pd_b, pd_t = processing_all(df_a, df_b, df_t)
    return {
        "pd_a": pd_a,
        "pd_b": pd_b,
        "pd_t": pd_t,
    }
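

if __name__ == "__main__":
    # Minimal smoke-test sketch. Feature names and values here are
    # hypothetical placeholders; real inputs must carry the features each
    # model was trained on, and the model/JSON artifacts must exist on disk.
    logging.basicConfig(level=logging.INFO)
    sample = [
        {
            "model_a_features": {"feature_1": 1.0, "feature_2": "b"},
            "model_b_features": {"feature_1": 2.5},
            "model_t_features": {"feature_1": 0},
        }
    ]
    print(__main__(sample))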