Compare commits
1 commit
main...pd-v3-proc
| Author | SHA1 | Date |
|---|---|---|
| | 90479dfe13 | |
README.md (12 lines changed)
@@ -1 +1,11 @@
-**Hello world!!!**
+# PD V3 Processing
+
+- **Inputs:** Treated feature dictionaries per model from pre-processing.
+- **Outputs:** Raw and isotonic PD scores for models A/B plus model T probability.
+- **Artifacts:** Model binaries located under `models/` (XGBoost + isotonic joblib files).
+- **Tests:** `python -m unittest sequence-3.pd_v3_processing.test_block`.
+- **Signature:** Sequence-3 convention: `__main__` must keep an explicit typed parameter list covering every input (int/float/str) and build the record from those args before scoring; keep aligned with the block schemas.
+
+## Schema notes
+
+- `request_schema.json` and `response_schema.json` for this block are frozen. They describe arrays of `{name, value}` dicts (no nested dict-of-dicts), so that structure must be preserved; do not switch these schemas to dict-of-dicts or object-of-dicts even as the block code evolves. Arrays-of-dicts are still allowed where schematically appropriate.
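**The frozen shape, concretely.** A minimal sketch of the array-of-`{name, value}` layout the schema note protects; the field names here are hypothetical examples, not the block's real schema fields:

```python
# Hypothetical payload in the frozen array-of-{name, value} layout.
# "PER201" / "G300S" are illustrative feature names only.
payload = [
    {"name": "PER201", "value": 1.0},
    {"name": "G300S", "value": "1.0"},
]

# The forbidden dict-of-dicts equivalent would look like:
# {"PER201": {"value": 1.0}, "G300S": {"value": "1.0"}}
```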
__init__.py (new file, 1 line)
@@ -0,0 +1 @@
+__all__ = ["__main__"]
block.py (435 lines changed)
@@ -1,21 +1,422 @@
-@flowx_block
-def example_function(request: dict) -> dict:
-    # Processing logic here...
-    return {
-        "meta_info": [
-            {
-                "name": "created_date",
-                "type": "string",
-                "value": "2024-11-05"
-            }
-        ],
-        "fields": [
-            {
-                "name": "",
-                "type": "",
-                "value": ""
-            }
-        ]
-    }
+import json
+import logging
+from functools import lru_cache
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+
+import joblib
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+from pandas.api.types import (
+    is_bool_dtype,
+    is_categorical_dtype,
+    is_float_dtype,
+    is_integer_dtype,
+    is_object_dtype,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Paths & constants
+# ---------------------------------------------------------------------------
+
+try:
+    BASE_DIR = Path(__file__).resolve().parent
+except NameError:
+    # Fallback for environments where __file__ is not defined (e.g. some REPLs / notebooks)
+    BASE_DIR = Path.cwd()
+
+A_MODEL_PATH = BASE_DIR / "xgboost_model_A.joblib"
+A_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_A.json"
+
+B_MODEL_PATH = BASE_DIR / "xgboost_model_B.joblib"
+B_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_B.json"
+
+T_MODEL_PATH = BASE_DIR / "xgboost_model_T.joblib"
+T_CATEGORY_ORDERS_PATH = BASE_DIR / "category_orders_train_T.json"
+
+
+# ---------------------------------------------------------------------------
+# Loaders
+# ---------------------------------------------------------------------------
+
+def _load_category_orders(path: Path) -> Dict[str, Any]:
+    """Load category orders JSON from disk."""
+    with open(path, "r") as f:
+        return json.load(f)
+
+
+@lru_cache(maxsize=1)
+def _load_a_model():
+    """Load and cache model A."""
+    logger.info("Loading model A from %s", A_MODEL_PATH)
+    return joblib.load(A_MODEL_PATH)
+
+
+@lru_cache(maxsize=1)
+def _load_b_model():
+    """Load and cache model B."""
+    logger.info("Loading model B from %s", B_MODEL_PATH)
+    return joblib.load(B_MODEL_PATH)
+
+
+@lru_cache(maxsize=1)
+def _load_t_model():
+    """Load and cache model T."""
+    logger.info("Loading model T from %s", T_MODEL_PATH)
+    return joblib.load(T_MODEL_PATH)
+
+
+@lru_cache(maxsize=None)
+def _load_category_orders_cached(path: Path) -> Dict[str, Any]:
+    """Cache category orders per path to avoid disk I/O on each scoring."""
+    logger.info("Loading category orders from %s", path)
+    return _load_category_orders(path)
+
+
+def _get_expected_features(model: Any, df: pd.DataFrame) -> List[str]:
+    """
+    Get the expected feature names from the model.
+
+    If the model has no 'feature_names' attribute, fall back to df columns.
+    This is a defensive measure; ideally, feature names should always
+    be stored with the model.
+    """
+    feature_names = getattr(model, "feature_names", None)
+    if feature_names is None:
+        logger.warning(
+            "Model has no attribute 'feature_names'; using DataFrame columns order."
+        )
+        feature_names = list(df.columns)
+    return list(feature_names)
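**Caching behavior.** Each `@lru_cache(maxsize=1)` loader reads its joblib file once per process; subsequent calls return the same cached object. A minimal sketch, assuming the model artifacts sit next to `block.py`:

```python
m1 = _load_a_model()  # first call: joblib.load() hits the disk
m2 = _load_a_model()  # second call: served from the lru_cache
assert m1 is m2       # same cached object, no re-load

# _load_category_orders_cached is keyed by Path (hashable), so each of
# the three category-orders files is likewise read at most once.
```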
+
+
+# ---------------------------------------------------------------------------
+# Preprocessing helpers
+# ---------------------------------------------------------------------------
+
+MISSING_SENTINELS = [None, "", "null", np.nan, "nan", " "]
+
+
+def _to_string_category(series: pd.Series) -> pd.Series:
+    """
+    Force a categorical series whose categories are strings (not floats),
+    backed by NumPy object dtype (not pandas StringDtype) for XGBoost
+    compatibility.
+    """
+    s = series.copy()
+    s = s.replace(MISSING_SENTINELS, np.nan)
+    # Use classic Python strings (object dtype), not pandas' StringDtype,
+    # so that XGBoost's numpy-based dtype checks work correctly.
+    s = s.astype(str)
+    return s.astype("category")
+
+
+def _sanitize_expected_feature_dtypes(
+    df: pd.DataFrame,
+    expected_features: List[str],
+    categorical_feature_names: List[str],
+) -> pd.DataFrame:
+    """
+    XGBoost DMatrix does NOT allow object dtype.
+
+    For each expected feature column:
+    - If it is a known categorical (listed in the training-time category
+      orders), leave it untouched so its category codes match training.
+    - If dtype is numeric or bool, keep as-is.
+    - Anything else (object, string, unexpected categorical) is coerced
+      to numeric, with unparseable values becoming NaN.
+
+    This mirrors the safety checks needed to satisfy the
+    XGBoost 3.x pandas backend (`enable_categorical=True`).
+    """
+    df = df.copy()
+    categorical_set = set(categorical_feature_names)
+
+    for col in expected_features:
+        if col not in df.columns:
+            # Ensure column exists so downstream checks don't fail here.
+            df[col] = np.nan
+
+        dtype = df[col].dtype
+
+        # If this feature is known to be categorical from training-time
+        # category_orders, assume _prepare_* already produced a proper
+        # pandas Categorical with the training categories and leave it
+        # untouched so that category codes match training.
+        if col in categorical_set:
+            continue
+
+        # For non-categorical features, XGBoost expects numeric or bool.
+        if is_bool_dtype(dtype) or is_integer_dtype(dtype) or is_float_dtype(dtype):
+            continue
+
+        # Anything else (object, string, unexpected categorical) -> numeric coercion.
+        df[col] = pd.to_numeric(df[col], errors="coerce")
+
+    return df
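**What sanitization does in practice.** A small sketch, assuming `block.py`'s helpers are importable; `FI36SD` and `CV25` are feature names borrowed from the test payload further down:

```python
import pandas as pd

df = pd.DataFrame({"FI36SD": ["999", "oops"], "CV25": [0.0, 1.0]})
clean = _sanitize_expected_feature_dtypes(df, ["FI36SD", "CV25"], [])

print(clean["FI36SD"].tolist())  # [999.0, nan] -- object column coerced to numeric
print(clean["CV25"].dtype)       # float64 -- already numeric, left as-is
```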
+
+
+def _prepare_a(df: pd.DataFrame, category_orders: Dict[str, List[Any]]) -> pd.DataFrame:
+    """
+    Prepare features for model A.
+
+    For each column with category orders:
+    - If all category labels are numeric-like, map incoming values onto
+      the canonical string labels used during training (e.g. -4 -> "-4.0")
+      and build an ordered categorical over those labels.
+    - Otherwise, treat values as plain string categories.
+    """
+    df = df.copy()
+    for col, raw_categories in category_orders.items():
+        if col not in df.columns:
+            df[col] = np.nan
+
+        # Normalize missing-like representations.
+        df[col] = df[col].replace(MISSING_SENTINELS, np.nan)
+
+        # Detect whether category labels are numeric-like and, if so,
+        # map numeric values onto the canonical string labels used
+        # during training (e.g. -4 -> "-4.0").
+        numeric_like = True
+        numeric_label_map: Dict[float, str] = {}
+        for v in raw_categories:
+            try:
+                numeric_label_map[float(v)] = str(v)
+            except (TypeError, ValueError):
+                numeric_like = False
+                break
+
+        if numeric_like:
+            def _map_value(val: Any) -> Any:
+                if pd.isna(val):
+                    return np.nan
+                try:
+                    key = float(val)
+                except (TypeError, ValueError):
+                    return np.nan
+                return numeric_label_map.get(key, np.nan)
+
+            df[col] = df[col].map(_map_value)
+            df[col] = pd.Categorical(df[col], categories=raw_categories, ordered=True)
+        else:
+            # Pure string categories: coerce to plain strings
+            df[col] = df[col].astype(str)
+            df[col] = pd.Categorical(df[col], categories=raw_categories, ordered=True)
+
+    return df
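**Numeric-label mapping, concretely.** A minimal sketch assuming `block.py`'s helpers are importable; the `G300S` labels are the ones committed in `category_orders_train_A.json` below:

```python
import pandas as pd

orders = {"G300S": ["-1.0", "-2.0", "-4.0", "0.0", "1.0", "10.0",
                    "2.0", "3.0", "4.0", "5.0", "9.0"]}
df = pd.DataFrame({"G300S": [1, -4, "bogus"]})
out = _prepare_a(df, orders)

# 1 -> "1.0" and -4 -> "-4.0"; "bogus" cannot be float()ed and becomes NaN.
print(out["G300S"].tolist())        # ['1.0', '-4.0', nan]
print(out["G300S"].cat.categories)  # the training label order, preserved
```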
+
+
+def _prepare_with_lower(df: pd.DataFrame, category_orders: Dict[str, List[Any]]) -> pd.DataFrame:
+    """
+    Shared preparation logic for models B and T, whose categorical
+    values are lowercased strings.
+    """
+    df = df.copy()
+    for col, raw_categories in category_orders.items():
+        if col not in df.columns:
+            df[col] = np.nan
+
+        # Normalize missing-like representations.
+        df[col] = df[col].replace(MISSING_SENTINELS, np.nan)
+
+        # Detect whether category labels are numeric-like and, if so,
+        # map numeric values onto the canonical string labels used
+        # during training (e.g. -4 -> "-4.0"). Otherwise, treat
+        # them as lowercased string categories.
+        numeric_like = True
+        numeric_label_map: Dict[float, str] = {}
+        for v in raw_categories:
+            try:
+                numeric_label_map[float(v)] = str(v)
+            except (TypeError, ValueError):
+                numeric_like = False
+                break
+
+        if numeric_like:
+            def _map_value(val: Any) -> Any:
+                if pd.isna(val):
+                    return np.nan
+                try:
+                    key = float(val)
+                except (TypeError, ValueError):
+                    return np.nan
+                return numeric_label_map.get(key, np.nan)
+
+            df[col] = df[col].map(_map_value)
+            df[col] = pd.Categorical(df[col], categories=raw_categories, ordered=True)
+        else:
+            # String categories: lower-case string representation
+            df[col] = df[col].astype(str).str.lower()
+            df[col] = pd.Categorical(df[col], categories=raw_categories, ordered=True)
+
+    return df
+
+
+def _prepare_b(df: pd.DataFrame, category_orders: Dict[str, List[Any]]) -> pd.DataFrame:
+    """Prepare features for model B (lowercased categorical values)."""
+    return _prepare_with_lower(df, category_orders)
+
+
+def _prepare_t(df: pd.DataFrame, category_orders: Dict[str, List[Any]]) -> pd.DataFrame:
+    """Prepare features for model T (lowercased categorical values)."""
+    return _prepare_with_lower(df, category_orders)
+
+
+# ---------------------------------------------------------------------------
+# Per-model processing functions
+# ---------------------------------------------------------------------------
+
+def processing_a(input_data: pd.DataFrame) -> float:
+    """Run model A on input_data and return the first prediction as float."""
+    df = pd.DataFrame(input_data)
+    if df.empty:
+        raise ValueError("Input DataFrame for model A is empty.")
+
+    model = _load_a_model()
+    category_orders = _load_category_orders_cached(A_CATEGORY_ORDERS_PATH)
+    df = _prepare_a(df, category_orders)
+
+    expected_features = _get_expected_features(model, df)
+    df = _sanitize_expected_feature_dtypes(df, expected_features, list(category_orders.keys()))
+    # Ensure all expected features exist in df
+    missing_features = set(expected_features) - set(df.columns)
+    if missing_features:
+        raise KeyError(
+            f"Missing expected features for model A: {sorted(missing_features)}"
+        )
+
+    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
+    predictions = model.predict(dmatrix)
+
+    if len(predictions) == 0:
+        raise RuntimeError("Model A returned no predictions.")
+
+    return float(predictions[0])
+
+
+def processing_b(input_data: pd.DataFrame) -> float:
+    """Run model B on input_data and return the first prediction as float."""
+    df = pd.DataFrame(input_data)
+    if df.empty:
+        raise ValueError("Input DataFrame for model B is empty.")
+
+    model = _load_b_model()
+    category_orders = _load_category_orders_cached(B_CATEGORY_ORDERS_PATH)
+    df = _prepare_b(df, category_orders)
+
+    expected_features = _get_expected_features(model, df)
+    df = _sanitize_expected_feature_dtypes(df, expected_features, list(category_orders.keys()))
+    missing_features = set(expected_features) - set(df.columns)
+    if missing_features:
+        raise KeyError(
+            f"Missing expected features for model B: {sorted(missing_features)}"
+        )
+
+    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
+    predictions = model.predict(dmatrix)
+
+    if len(predictions) == 0:
+        raise RuntimeError("Model B returned no predictions.")
+
+    return float(predictions[0])
+
+
+def processing_t(input_data: pd.DataFrame) -> float:
+    """Run model T on input_data and return the first prediction as float."""
+    df = pd.DataFrame(input_data)
+    if df.empty:
+        raise ValueError("Input DataFrame for model T is empty.")
+
+    model = _load_t_model()
+    category_orders = _load_category_orders_cached(T_CATEGORY_ORDERS_PATH)
+    df = _prepare_t(df, category_orders)
+
+    expected_features = _get_expected_features(model, df)
+    df = _sanitize_expected_feature_dtypes(df, expected_features, list(category_orders.keys()))
+    missing_features = set(expected_features) - set(df.columns)
+    if missing_features:
+        raise KeyError(
+            f"Missing expected features for model T: {sorted(missing_features)}"
+        )
+
+    dmatrix = xgb.DMatrix(df[expected_features], enable_categorical=True, missing=np.nan)
+    predictions = model.predict(dmatrix)
+
+    if len(predictions) == 0:
+        raise RuntimeError("Model T returned no predictions.")
+
+    return float(predictions[0])
+
+
+def processing_all(
+    df_a: pd.DataFrame,
+    df_b: pd.DataFrame,
+    df_t: pd.DataFrame,
+) -> Tuple[float, float, float]:
+    """Convenience function to run all three models and return their predictions."""
+    return (
+        processing_a(df_a),
+        processing_b(df_b),
+        processing_t(df_t),
+    )
+
+
+# ---------------------------------------------------------------------------
+# Main entrypoint for batch-style input
+# ---------------------------------------------------------------------------
+
+def __main__(results: List[Dict[str, Any]]) -> Dict[str, float]:
+    """
+    Main entrypoint for processing a list of results dicts.
+
+    Expected shape of each element in `results`:
+        {
+            "model_a_features": { ... feature_name: value ... },
+            "model_b_features": { ... feature_name: value ... },
+            "model_t_features": { ... feature_name: value ... },
+        }
+
+    Returns a dict with keys "pd_a", "pd_b" and "pd_t".
+    """
+    logger.info("Data received in processing block: %s", results)
+
+    df = pd.DataFrame(results)
+    if df.empty:
+        raise ValueError("Input results list is empty.")
+
+    if not {"model_a_features", "model_b_features", "model_t_features"}.issubset(df.columns):
+        missing = {
+            "model_a_features",
+            "model_b_features",
+            "model_t_features",
+        } - set(df.columns)
+        raise KeyError(
+            f"Missing expected keys in results: {sorted(missing)}"
+        )
+
+    # Each cell of these columns is expected to be a dict-like object
+    df_a = pd.DataFrame(list(df["model_a_features"]))
+    df_b = pd.DataFrame(list(df["model_b_features"]))
+    df_t = pd.DataFrame(list(df["model_t_features"]))
+
+    pd_a, pd_b, pd_t = processing_all(df_a, df_b, df_t)
+    return {
+        "pd_a": pd_a,
+        "pd_b": pd_b,
+        "pd_t": pd_t,
+    }
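**Invocation sketch.** How a caller drives the block end to end; the payload below is abbreviated and hypothetical (a real call must supply every feature each model expects, as in `test_block.py` further down):

```python
# Abbreviated, hypothetical payload -- real feature dicts are much larger.
results = [{
    "model_a_features": {"PER201": 1.0, "G300S": 1},            # ...plus ~60 more
    "model_b_features": {"PER201": 1.0, "RVLR14": "RTRRRRRR"},  # ...plus ~90 more
    "model_t_features": {"PER201": 1.0, "TRV06": 8},            # ...plus ~30 more
}]

scores = __main__(results)
# scores == {"pd_a": <float>, "pd_b": <float>, "pd_t": <float>},
# matching response_schema.json below.
```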
category_orders_train_A.json (new file, 15 lines)
@@ -0,0 +1,15 @@
+{
+  "G300S": [
+    "-1.0",
+    "-2.0",
+    "-4.0",
+    "0.0",
+    "1.0",
+    "10.0",
+    "2.0",
+    "3.0",
+    "4.0",
+    "5.0",
+    "9.0"
+  ]
+}
category_orders_train_B.json (new file, 2728 lines)
File diff suppressed because it is too large

category_orders_train_T.json (new file, 47386 lines)
File diff suppressed because it is too large
request_schema.json
@@ -1 +1,11 @@
-{}
+{
+  "$schema": "http://json-schema.org/draft-07/schema",
+  "type": "object",
+  "properties": {
+    "results": {
+      "type": ["array", "null"],
+      "items": {"type": "object"}
+    }
+  },
+  "required": []
+}
requirements.txt
@@ -1 +1,4 @@
-{}
+joblib==1.5.2
+numpy==2.2.6
+pandas==2.2.3
+xgboost==3.1.1
response_schema.json
@@ -1 +1,11 @@
-{}
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "type": "object",
+  "properties": {
+    "pd_a": { "type": "number" },
+    "pd_b": { "type": "number" },
+    "pd_t": { "type": "number" }
+  },
+  "required": ["pd_a", "pd_b", "pd_t"],
+  "additionalProperties": false
+}
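**Checking output against the contract.** A minimal sketch using the third-party `jsonschema` package (an assumed extra dependency; it is not pinned in requirements.txt):

```python
import json
from jsonschema import validate, ValidationError  # assumed extra dependency

with open("response_schema.json") as f:
    schema = json.load(f)

validate({"pd_a": 0.12, "pd_b": 0.08, "pd_t": 0.55}, schema)  # passes

try:
    validate({"pd_a": 0.12}, schema)
except ValidationError as err:
    print(err.message)  # "'pd_b' is a required property"
```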
test_block.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+import unittest
+
+from block import __main__
+# from reference import __main__
+
+
+data = [{'model_a_features': {'AEPMAG05': 297, 'RET201': None, 'PER201': 1.0, 'PER202': 1.0, 'PER222': 1.0, 'PER225': 1.51, 'PER235': 1949, 'CTM18': None, 'SC20S': None, 'AT36SD': 6, 'FI36SD': 999, 'G250BD': 1, 'G250CD': 1, 'US36SD': 999, 'CV13': 3.0, 'CV25': 0.0, 'CV26': 3.0, 'AT01S': 36, 'AT104S': 8, 'FI02S': 1, 'FI20S': 119, 'FI35S': 1225, 'G051S': 3, 'G205S': 611, 'G210S': None, 'G218A': 0, 'G225S': 606, 'G230S': None, 'G234S': None, 'G300S': 1, 'IN02S': 1, 'IN12S': 1, 'OF20S': None, 'RT20S': 114, 'INAP01': 175, 'G106S': 220, 'US02S': 1, 'US20S': 119, 'US24S': 1, 'US28S': 1449, 'US32S': 1225, 'US35S': 1225, 'US36S': None, 'SE20S': None, 'US51A': 12, 'G205B': 611, 'INST_TRD': 28, 'RTL_TRD': 8, 'AGG402': 210, 'AGG403': 205, 'AGG423': 1344, 'AGG424': 484, 'AGG903': 1, 'TRV03': 3, 'TRV04': 5, 'BALMAG01': 188, 'score_results': 603, 'PER201_unk': 0, 'G225S_unk': 0, 'SC20S_unk': 1, 'RET201_unk': 1, 'US24S_unk': 0}, 'model_b_features': {'UTLMAG01': 112, 'AEPMAG04': 286, 'PER201': 1.0, 'PER203': 1.31, 'PER222': 1.0, 'PER223': 1.31, 'PER224': 1.15, 'PER225': 1.51, 'PER235': 1949, 'CTM23': None, 'CT321': 65, 'CTC20': None, 'CTA17': 24, 'CTA18': 5, 'SC21S': None, 'SCC92': None, 'SCBALM01': None, 'AT36SD': 6, 'FI36SD': 999, 'RE36SD': 6, 'SE36SD': None, 'US36SD': 999, 'LQA232YR': 100.0, 'LQR325YR': -255.0, 'RLE902': None, 'CV25': 0.0, 'CV26': 3.0, 'RVDEXQ2': 9, 'AT01S': 36, 'AT104S': 8, 'AU20S': None, 'BI21S': None, 'BR33S': 1148, 'CO06S': None, 'FI02S': 1, 'FI03S': 1, 'FI20S': 119, 'FI32S': 1225, 'FI33S': 1225, 'FI34S': 85, 'FI35S': 1225, 'FI101S': 1225, 'FR21S': None, 'FR32S': None, 'G020S': 11, 'G102S': None, 'G205S': 611, 'G210S': None, 'G213A': None, 'G225S': 606, 'G234S': None, 'G301S': 1, 'G990S': None, 'IN02S': 1, 'IN12S': 1, 'MT21S': None, 'OF09S': None, 'OF21S': None, 'OF29S': None, 'OF35S': None, 'RE32S': 2384, 'RT36S': 6, 'ST01S': 0, 'INAP01': 175, 'G106S': 220, 'S204S': None, 'US02S': 1, 'US03S': 1, 'US12S': 1, 'US20S': 119, 'US24S': 1, 'US30S': 100.0, 'US34S': 85, 'SE20S': None, 'SE21S': None, 'SE34S': None, 'SE36S': None, 'JT20S': None, 'JT33S': None, 'JT70S': None, 'G404S': None, 'G405S': None, 'G406S': None, 'G407S': None, 'G416S': None, 'G417S': None, 'US51A': 12, 'INST_TRD': 28, 'NOMT_TRD': 36, 'AGG512': 1169, 'AGG516': 1193, 'AGG902': 2, 'AGG903': 1, 'TRV03': 3, 'TRV04': 5, 'TRV06': 8, 'BALMAG01': 188, 'RVLR14': 'RTRRRRRR', 'PAYMNT06': 1.31, 'PAYMNT07': 1.0, 'score_results': 603}, 'model_t_features': {'PDMAG01': 310, 'AEPMAG05': 297, 'AUT201': None, 'PER201': 1.0, 'PER203': 1.31, 'PER204': 1.15, 'PER205': 1.48, 'PER223': 1.31, 'PER225': 1.51, 'PER253': 54, 'CTA17': 24, 'SE21CD': None, 'RLE907': None, 'CV26': 3.0, 'AT35B': 1037, 'FI28S': 1449, 'FI32S': 1225, 'INAP01': 175, 'US01S': 26, 'US28S': 1449, 'US34S': 85, 'US101S': 1225, 'SE02S': None, 'SE06S': None, 'SE09S': None, 'SE20S': None, 'TRV06': 8, 'TRV10': 12, 'PAYMNT06': 1.31, 'AEPMAG05_unk': 0, 'PER201_unk': 0}}]
+
+
+class TestBlock(unittest.TestCase):
+    def test_main_returns_scores(self):
+        block_result = __main__(data)
+        print(block_result)
+        self.assertIsInstance(block_result, dict)
+        self.assertIn("pd_a", block_result)
+        self.assertIn("pd_b", block_result)
+        self.assertIn("pd_t", block_result)
+
+
+if __name__ == "__main__":  # pragma: no cover
+    unittest.main()
xgboost_model_A.joblib (new file, binary)
Binary file not shown.

xgboost_model_B.joblib (new file, binary)
Binary file not shown.

xgboost_model_T.joblib (new file, binary)
Binary file not shown.