import functools
import json
import logging
import math

import joblib
import pandas as pd
import xgboost as xgb

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Artifact locations are resolved against the current working directory.
# NOTE(review): presumably the block runner sets the working directory to the
# block's folder -- confirm before switching to __file__-relative paths.
_CATEGORY_ORDERS_PATH = "./category_orders_train.json"
_MODEL_PATH = "./xgboost_model.joblib"

# Columns handed to XGBoost as pandas categoricals; every other column is
# coerced to a numeric dtype where possible.
_CATEGORICAL_COLUMNS = ("browser_os", "ip_address_risk_level")

# Maps each categorical column name to its list of category labels.
with open(_CATEGORY_ORDERS_PATH, "r", encoding="utf-8") as f:
    category_orders = json.load(f)


@functools.lru_cache(maxsize=1)
def _load_model():
    """Load the serialized model once and reuse it on subsequent calls.

    The category orders are already read a single time at import; caching the
    model keeps the two artifacts consistent and avoids re-reading and
    unpickling the file on every scoring request.
    """
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code.
    # Only ever point _MODEL_PATH at a trusted artifact shipped with the block.
    # NOTE(review): the cached model object is shared by all callers -- confirm
    # the runner does not score concurrently from multiple threads.
    return joblib.load(_MODEL_PATH)


def _prepare_features(input_data: dict) -> pd.DataFrame:
    """Build the single-row feature frame with the dtypes the model expects."""
    df = pd.DataFrame(input_data, index=[0])

    # Normalise categorical values: lower-case them, map the various "missing"
    # spellings to the literal "none", then apply the loaded category lists.
    # Values that are not in a column's category list become NaN.
    for col in _CATEGORICAL_COLUMNS:
        if col in df.columns:
            df[col] = df[col].str.lower().replace([None, "", "null", math.nan], "none")
            df[col] = pd.Categorical(df[col], categories=category_orders.get(col, []))

    # Coerce the remaining columns to numeric dtypes (a None input becomes NaN).
    # pd.to_numeric(errors="ignore") is deprecated as of pandas 2.2, so its
    # "leave the column unchanged on failure" behaviour is reproduced
    # explicitly here.
    for col in df.columns:
        if col in _CATEGORICAL_COLUMNS:
            continue
        try:
            df[col] = pd.to_numeric(df[col])
        except (ValueError, TypeError):
            logger.warning(
                "Column %r could not be converted to numeric; leaving it unchanged",
                col,
            )

    return df


def __main__(user_age: int, persona_entity_confidence_score: float, persona_selfie_similarity_score_right: float,
             persona_selfie_similarity_score_left: float, persona_hesitation_percentage: float,
             persona_hesitation_count: float, device_id_age_max: int, selfie_consistency_score_avg: float,
             device_consistency: int, selfie_consistency_score: float, global_fs_ls: int, inquiry_frequency: int,
             confidence_score_min: float, contract_date_fs_sub: int, browser_os: str, user_city_ip_match: int,
             device_id_age_avg: float, persona_distraction_events: float, sub_fs_ls: int, device_id_age_min: int,
             confidence_score_max: float, persona_phone_risk_score: float, ip_address_risk_level: str,
             login_frequency: float, suspect_score: int, confidence_score: float, name_consistency: int,
             ip_location_consistency: int) -> dict:
    """Score one record with the XGBoost model and return the prediction.

    Each parameter is one model feature and becomes a column of the same name
    in a one-row DataFrame. ``browser_os`` and ``ip_address_risk_level`` are
    treated as categoricals; all other features are converted to numeric
    dtypes where possible. A value of ``None`` is accepted for any feature
    and ends up as the "none" category (categoricals) or NaN (numerics).

    Returns:
        A dict with the single key ``'probability'`` holding the first element
        of the model's prediction, converted to a Python ``float``.
    """
    input_data = {
        "user_age": user_age,
        "persona_entity_confidence_score": persona_entity_confidence_score,
        "persona_selfie_similarity_score_right": persona_selfie_similarity_score_right,
        "persona_selfie_similarity_score_left": persona_selfie_similarity_score_left,
        "persona_hesitation_percentage": persona_hesitation_percentage,
        "persona_hesitation_count": persona_hesitation_count,
        "device_id_age_max": device_id_age_max,
        "selfie_consistency_score_avg": selfie_consistency_score_avg,
        "device_consistency": device_consistency,
        "selfie_consistency_score": selfie_consistency_score,
        "global_fs_ls": global_fs_ls,
        "inquiry_frequency": inquiry_frequency,
        "confidence_score_min": confidence_score_min,
        "contract_date_fs_sub": contract_date_fs_sub,
        "browser_os": browser_os,
        "user_city_ip_match": user_city_ip_match,
        "device_id_age_avg": device_id_age_avg,
        "persona_distraction_events": persona_distraction_events,
        "sub_fs_ls": sub_fs_ls,
        "device_id_age_min": device_id_age_min,
        "confidence_score_max": confidence_score_max,
        "persona_phone_risk_score": persona_phone_risk_score,
        "ip_address_risk_level": ip_address_risk_level,
        "login_frequency": login_frequency,
        "suspect_score": suspect_score,
        "confidence_score": confidence_score,
        "name_consistency": name_consistency,
        "ip_location_consistency": ip_location_consistency,
    }

    model = _load_model()
    df = _prepare_features(input_data)

    # Select and order the columns by the feature names stored on the model.
    model_feature_names = model.feature_names
    dmatrix = xgb.DMatrix(df[model_feature_names], enable_categorical=True)

    prediction = model.predict(dmatrix)[0]

    logger.info("Fraud V1 Predicted Score: %s", prediction)

    return {'probability': float(prediction)}
"type": ["number", "null"], + "description": "Persona hesitation count" + }, + "device_id_age_max": { + "type": ["integer", "null"], + "description": "This calculates the maximum device age for a user and loan, similar to min and avg logic but for the max value." + }, + "selfie_consistency_score_avg": { + "type": ["number", "null"], + "description": "Average selfie consistency score for the user's persona activity" + }, + "device_consistency": { + "type": ["integer", "null"], + "description": "Number of distinct devices associated with a user and loan" + }, + "selfie_consistency_score": { + "type": ["number", "null"], + "description": "Average similarity score between left and right selfie" + }, + "global_fs_ls": { + "type": ["integer", "null"], + "description": "Days between the first and last global appearance of the device" + }, + "inquiry_frequency": { + "type": ["integer", "null"], + "description": "Number of inquiries made by the user regarding the loan" + }, + "confidence_score_min": { + "type": ["number", "null"], + "description": "The minimum recorded confidence score for the user and loan during the timeframe." + }, + "contract_date_fs_sub": { + "type": ["integer", "null"], + "description": "Days between the first subscription appearance and contract date" + }, + "browser_os": { + "type": ["string", "null"], + "description": "Browser OS" + }, + "user_city_ip_match": { + "type": ["integer", "null"], + "description": "Checks if the user's city matches the IP city" + }, + "device_id_age_avg": { + "type": ["number", "null"], + "description": "This calculates the rolling average of device_id_age for a user and loan. If no previous rows, the current value is returned." 
+ }, + "persona_distraction_events": { + "type": ["number", "null"], + "description": "Persona distraction events" + }, + "sub_fs_ls": { + "type": ["integer", "null"], + "description": "Days between the first and last subscription activity" + }, + "device_id_age_min": { + "type": ["integer", "null"], + "description": "This calculates the minimum device age for a user and loan, falling back to the current device age if no prior values exist." + }, + "confidence_score_max": { + "type": ["number", "null"], + "description": "The maximum confidence score recorded for the user and loan combination." + }, + "persona_phone_risk_score": { + "type": ["number", "null"], + "description": "Risk associated with the phone number. The risk score ranges from 0 to 100. The higher the risk score, the higher the risk level." + }, + "ip_address_risk_level": { + "type": ["string", "null"], + "description": "Checks if the IP country code matches the persona country code" + }, + "login_frequency": { + "type": ["number", "null"], + "description": "This counts the number of times the user logs in based on the inquiry_updated_at timestamp, providing insights into the user's login behavior throughout the loan process." 
+ }, + "suspect_score": { + "type": ["integer", "null"], + "description": "Suspect score" + }, + "confidence_score": { + "type": ["number", "null"], + "description": "Confidence score" + }, + "name_consistency": { + "type": ["integer", "null"], + "description": "Checks if the first name in the persona matches the user-provided first name" + }, + "ip_location_consistency": { + "type": ["integer", "null"], + "description": "Number of distinct IP locations for a user and loan" + } + }, + "required": [] +} diff --git a/requirements.txt b/requirements.txt index 0967ef4..c4c4de5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,5 @@ -{} +jsonschema==4.23.0 +xgboost==1.7.5 +joblib==1.3.2 +pandas==2.2.2 +numpy==1.23.5 \ No newline at end of file diff --git a/response_schema.json b/response_schema.json index 0967ef4..b1ad4f0 100644 --- a/response_schema.json +++ b/response_schema.json @@ -1 +1,10 @@ -{} +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "probability": { + "type": "number", + "description": "Fraud Model predicted score." + } + } +} \ No newline at end of file diff --git a/xgboost_model.joblib b/xgboost_model.joblib new file mode 100644 index 0000000..aa7938d Binary files /dev/null and b/xgboost_model.joblib differ