From d20e15fd4da59f3c99dfedae530bee866d90ce93 Mon Sep 17 00:00:00 2001 From: Admin User Date: Wed, 5 Feb 2025 19:10:31 +0000 Subject: [PATCH] ETD Pre-preprocessing block --- README.md | 12 ++++- block.py | 103 ++++++++++++++++++++++++++++++------- request_schema.json | 120 ++++++++++++++++++++++++++++++++++++++++++- requirements.txt | 2 +- response_schema.json | 119 +++++++++++++++++++++++++++++++++++++++++- test_block.py | 24 +++++++++ 6 files changed, 357 insertions(+), 23 deletions(-) create mode 100644 test_block.py diff --git a/README.md b/README.md index 59a3efc..c1dca4f 100644 --- a/README.md +++ b/README.md @@ -1 +1,11 @@ -**Hello world!!!** +## Overview +This block (`block.py`) is responsible for preparing and validating inputs for the model. It performs data cleansing and returns a normalized output dictionary. + +## Key Inputs & Outputs +- **Request**: Refer to `request_schema.json` for detailed input fields and validation rules. +- **Response**: Refer to `response_schema.json` for the returned structure and data types. + +## Implementation Details +- All core logic resides in `block.py` within the `__main__` function. +- Example usage and validation are demonstrated in `test_block.py`. + diff --git a/block.py b/block.py index 3b227f9..96e3e51 100644 --- a/block.py +++ b/block.py @@ -1,21 +1,86 @@ -@flowx_block -def example_function(request: dict) -> dict: +import logging +import pandas as pd +import math +import json - # Processing logic here... 
from typing import Optional

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
)
logger = logging.getLogger(__name__)

# The entry point has a dunder name, which ``from block import *`` would
# otherwise skip (names starting with "_" are not star-exported).
__all__ = ["__main__"]

# The only categorical features; every other request field is numeric.
_TEXT_FEATURES = frozenset({"browser_os", "ip_address_risk_level"})


def __main__(
    user_age: Optional[int] = None,
    persona_entity_confidence_score: Optional[float] = None,
    persona_selfie_similarity_score_right: Optional[float] = None,
    persona_selfie_similarity_score_left: Optional[float] = None,
    persona_hesitation_percentage: Optional[float] = None,
    persona_hesitation_count: Optional[float] = None,
    device_id_age_max: Optional[int] = None,
    selfie_consistency_score_avg: Optional[float] = None,
    device_consistency: Optional[int] = None,
    selfie_consistency_score: Optional[float] = None,
    global_fs_ls: Optional[int] = None,
    inquiry_frequency: Optional[int] = None,
    confidence_score_min: Optional[float] = None,
    contract_date_fs_sub: Optional[int] = None,
    browser_os: Optional[str] = None,
    user_city_ip_match: Optional[int] = None,
    device_id_age_avg: Optional[float] = None,
    persona_distraction_events: Optional[float] = None,
    sub_fs_ls: Optional[int] = None,
    device_id_age_min: Optional[int] = None,
    confidence_score_max: Optional[float] = None,
    persona_phone_risk_score: Optional[float] = None,
    ip_address_risk_level: Optional[str] = None,
    login_frequency: Optional[float] = None,
    suspect_score: Optional[int] = None,
    confidence_score: Optional[float] = None,
    name_consistency: Optional[int] = None,
    ip_location_consistency: Optional[int] = None,
) -> dict:
    """Normalise one raw fraud-model request into a flat feature dict.

    Every parameter is optional and defaults to ``None``: the request
    schema shipped with this block lists no required fields and allows
    null for each of them, so an omitted field is treated exactly like an
    explicit null instead of raising ``TypeError``.

    Processing:
      * Numeric fields (everything except ``browser_os`` and
        ``ip_address_risk_level``) go through
        ``pd.to_numeric(errors="coerce")``; values that cannot be parsed
        become missing and are returned as ``None``.
      * The two text fields are cast to ``str`` and lower-cased.
        NOTE(review): with the pinned pandas 2.2.2 a null text field comes
        out as the literal string ``"none"`` -- test_block.py asserts
        this, so it is kept as-is; confirm before upgrading pandas.

    Returns:
        dict mapping each parameter name (in signature order) to its
        normalised value.

    Side effects:
        Logs the full result at INFO level.
    """
    # Single source of truth for field names and output ordering.
    input_data = {
        "user_age": user_age,
        "persona_entity_confidence_score": persona_entity_confidence_score,
        "persona_selfie_similarity_score_right": persona_selfie_similarity_score_right,
        "persona_selfie_similarity_score_left": persona_selfie_similarity_score_left,
        "persona_hesitation_percentage": persona_hesitation_percentage,
        "persona_hesitation_count": persona_hesitation_count,
        "device_id_age_max": device_id_age_max,
        "selfie_consistency_score_avg": selfie_consistency_score_avg,
        "device_consistency": device_consistency,
        "selfie_consistency_score": selfie_consistency_score,
        "global_fs_ls": global_fs_ls,
        "inquiry_frequency": inquiry_frequency,
        "confidence_score_min": confidence_score_min,
        "contract_date_fs_sub": contract_date_fs_sub,
        "browser_os": browser_os,
        "user_city_ip_match": user_city_ip_match,
        "device_id_age_avg": device_id_age_avg,
        "persona_distraction_events": persona_distraction_events,
        "sub_fs_ls": sub_fs_ls,
        "device_id_age_min": device_id_age_min,
        "confidence_score_max": confidence_score_max,
        "persona_phone_risk_score": persona_phone_risk_score,
        "ip_address_risk_level": ip_address_risk_level,
        "login_frequency": login_frequency,
        "suspect_score": suspect_score,
        "confidence_score": confidence_score,
        "name_consistency": name_consistency,
        "ip_location_consistency": ip_location_consistency,
    }

    # A one-row frame lets pandas do the per-column coercion.
    df = pd.DataFrame(input_data, index=[0])

    for column in input_data:
        if column in _TEXT_FEATURES:
            df[column] = df[column].astype(str).str.lower()
        else:
            df[column] = pd.to_numeric(df[column], errors="coerce")

    # Replace missing values with None so the result serialises as null.
    row = df.iloc[0]
    output_data = row.where(pd.notnull(row), None).to_dict()

    # Lazy %-formatting: the message is only rendered if INFO is enabled.
    logger.info("Fraud V1 Pre processed data: %s", output_data)

    return output_data
], - "fields": [ - { - "name": "", - "type": "", - "value": "" - } - ] - } diff --git a/request_schema.json b/request_schema.json index 0967ef4..8116680 100644 --- a/request_schema.json +++ b/request_schema.json @@ -1 +1,119 @@ -{} +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "user_age": { + "type": ["integer", "null"], + "description": "Age of the user at the contract date, based on birthdate and contract date" + }, + "persona_entity_confidence_score": { + "type": ["number", "null"], + "description": "Based on confidence reasons assign a score between 0 and 100" + }, + "persona_selfie_similarity_score_right": { + "type": ["number", "null"], + "description": "Similarity score from the right side selfie" + }, + "persona_selfie_similarity_score_left": { + "type": ["number", "null"], + "description": "Similarity score from the left side selfie" + }, + "persona_hesitation_percentage": { + "type": ["number", "null"], + "description": "Percentage of time in the flow where the customer did not enter inputs" + }, + "persona_hesitation_count": { + "type": ["number", "null"], + "description": "Persona hesitation count" + }, + "device_id_age_max": { + "type": ["integer", "null"], + "description": "This calculates the maximum device age for a user and loan, similar to min and avg logic but for the max value." 
+ }, + "selfie_consistency_score_avg": { + "type": ["number", "null"], + "description": "Average selfie consistency score for the user's persona activity" + }, + "device_consistency": { + "type": ["integer", "null"], + "description": "Number of distinct devices associated with a user and loan" + }, + "selfie_consistency_score": { + "type": ["number", "null"], + "description": "Average similarity score between left and right selfie" + }, + "global_fs_ls": { + "type": ["integer", "null"], + "description": "Days between the first and last global appearance of the device" + }, + "inquiry_frequency": { + "type": ["integer", "null"], + "description": "Number of inquiries made by the user regarding the loan" + }, + "confidence_score_min": { + "type": ["number", "null"], + "description": "The minimum recorded confidence score for the user and loan during the timeframe." + }, + "contract_date_fs_sub": { + "type": ["integer", "null"], + "description": "Days between the first subscription appearance and contract date" + }, + "browser_os": { + "type": ["string", "null"], + "description": "Browser OS" + }, + "user_city_ip_match": { + "type": ["integer", "null"], + "description": "Checks if the user's city matches the IP city" + }, + "device_id_age_avg": { + "type": ["number", "null"], + "description": "This calculates the rolling average of device_id_age for a user and loan. If no previous rows, the current value is returned." + }, + "persona_distraction_events": { + "type": ["number", "null"], + "description": "Persona distraction events" + }, + "sub_fs_ls": { + "type": ["integer", "null"], + "description": "Days between the first and last subscription activity" + }, + "device_id_age_min": { + "type": ["integer", "null"], + "description": "This calculates the minimum device age for a user and loan, falling back to the current device age if no prior values exist." 
+ }, + "confidence_score_max": { + "type": ["number", "null"], + "description": "The maximum confidence score recorded for the user and loan combination." + }, + "persona_phone_risk_score": { + "type": ["number", "null"], + "description": "Risk associated with the phone number. The risk score ranges from 0 to 100. The higher the risk score, the higher the risk level." + }, + "ip_address_risk_level": { + "type": ["string", "null"], + "description": "Checks if the IP country code matches the persona country code" + }, + "login_frequency": { + "type": ["number", "null"], + "description": "This counts the number of times the user logs in based on the inquiry_updated_at timestamp, providing insights into the user's login behavior throughout the loan process." + }, + "suspect_score": { + "type": ["integer", "null"], + "description": "Suspect score" + }, + "confidence_score": { + "type": ["number", "null"], + "description": "Confidence score" + }, + "name_consistency": { + "type": ["integer", "null"], + "description": "Checks if the first name in the persona matches the user-provided first name" + }, + "ip_location_consistency": { + "type": ["integer", "null"], + "description": "Number of distinct IP locations for a user and loan" + } + }, + "required": [] +} diff --git a/requirements.txt b/requirements.txt index 0967ef4..1ce7bd9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -{} +pandas==2.2.2 \ No newline at end of file diff --git a/response_schema.json b/response_schema.json index 0967ef4..ce5c18b 100644 --- a/response_schema.json +++ b/response_schema.json @@ -1 +1,118 @@ -{} +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "user_age": { + "type": ["integer", "null"], + "description": "Age of the user at the contract date, based on birthdate and contract date" + }, + "persona_entity_confidence_score": { + "type": ["number", "null"], + "description": "Based on confidence reasons assign a score between 
0 and 100" + }, + "persona_selfie_similarity_score_right": { + "type": ["number", "null"], + "description": "Similarity score from the right side selfie" + }, + "persona_selfie_similarity_score_left": { + "type": ["number", "null"], + "description": "Similarity score from the left side selfie" + }, + "persona_hesitation_percentage": { + "type": ["number", "null"], + "description": "Percentage of time in the flow where the customer did not enter inputs" + }, + "persona_hesitation_count": { + "type": ["number", "null"], + "description": "Persona hesitation count" + }, + "device_id_age_max": { + "type": ["integer", "null"], + "description": "This calculates the maximum device age for a user and loan, similar to min and avg logic but for the max value." + }, + "selfie_consistency_score_avg": { + "type": ["number", "null"], + "description": "Average selfie consistency score for the user's persona activity" + }, + "device_consistency": { + "type": ["integer", "null"], + "description": "Number of distinct devices associated with a user and loan" + }, + "selfie_consistency_score": { + "type": ["number", "null"], + "description": "Average similarity score between left and right selfie" + }, + "global_fs_ls": { + "type": ["integer", "null"], + "description": "Days between the first and last global appearance of the device" + }, + "inquiry_frequency": { + "type": ["integer", "null"], + "description": "Number of inquiries made by the user regarding the loan" + }, + "confidence_score_min": { + "type": ["number", "null"], + "description": "The minimum recorded confidence score for the user and loan during the timeframe." 
+ }, + "contract_date_fs_sub": { + "type": ["integer", "null"], + "description": "Days between the first subscription appearance and contract date" + }, + "browser_os": { + "type": ["string", "null"], + "description": "Browser OS" + }, + "user_city_ip_match": { + "type": ["integer", "null"], + "description": "Checks if the user's city matches the IP city" + }, + "device_id_age_avg": { + "type": ["number", "null"], + "description": "This calculates the rolling average of device_id_age for a user and loan. If no previous rows, the current value is returned." + }, + "persona_distraction_events": { + "type": ["number", "null"], + "description": "Persona distraction events" + }, + "sub_fs_ls": { + "type": ["integer", "null"], + "description": "Days between the first and last subscription activity" + }, + "device_id_age_min": { + "type": ["integer", "null"], + "description": "This calculates the minimum device age for a user and loan, falling back to the current device age if no prior values exist." + }, + "confidence_score_max": { + "type": ["number", "null"], + "description": "The maximum confidence score recorded for the user and loan combination." + }, + "persona_phone_risk_score": { + "type": ["number", "null"], + "description": "Risk associated with the phone number. The risk score ranges from 0 to 100. The higher the risk score, the higher the risk level." + }, + "ip_address_risk_level": { + "type": ["string", "null"], + "description": "Checks if the IP country code matches the persona country code" + }, + "login_frequency": { + "type": ["number", "null"], + "description": "This counts the number of times the user logs in based on the inquiry_updated_at timestamp, providing insights into the user's login behavior throughout the loan process." 
+ }, + "suspect_score": { + "type": ["integer", "null"], + "description": "Suspect score" + }, + "confidence_score": { + "type": ["number", "null"], + "description": "Confidence score" + }, + "name_consistency": { + "type": ["integer", "null"], + "description": "Checks if the first name in the persona matches the user-provided first name" + }, + "ip_location_consistency": { + "type": ["integer", "null"], + "description": "Number of distinct IP locations for a user and loan" + } + } +} diff --git a/test_block.py b/test_block.py new file mode 100644 index 0000000..17735ca --- /dev/null +++ b/test_block.py @@ -0,0 +1,24 @@ +import unittest +import pandas as pd +from block import __main__ + +class TestBlock(unittest.TestCase): + + def test_main_success(self): + result = __main__(user_age = 43,persona_entity_confidence_score = None,persona_selfie_similarity_score_right = None,persona_selfie_similarity_score_left = None,persona_hesitation_percentage = None,persona_hesitation_count = None,device_id_age_max = 181.0,selfie_consistency_score_avg = None,device_consistency = 1,selfie_consistency_score = None,global_fs_ls = 146.0,inquiry_frequency = 0,confidence_score_min = 1.0,contract_date_fs_sub = 181.0,browser_os = 'Android',user_city_ip_match = 1.0,device_id_age_avg = 181.0,persona_distraction_events = None,sub_fs_ls = 146.0,device_id_age_min = 181.0,confidence_score_max = 1.0,persona_phone_risk_score = None,ip_address_risk_level = None,login_frequency = 0,suspect_score = 0.0,confidence_score = 1.0,name_consistency = None,ip_location_consistency = 1) + + expected_result = 
{"user_age":43,"persona_entity_confidence_score":None,"persona_selfie_similarity_score_right":None,"persona_selfie_similarity_score_left":None,"persona_hesitation_percentage":None,"persona_hesitation_count":None,"device_id_age_max":181.0,"selfie_consistency_score_avg":None,"device_consistency":1,"selfie_consistency_score":None,"global_fs_ls":146.0,"inquiry_frequency":0,"confidence_score_min":1.0,"contract_date_fs_sub":181.0,"browser_os":"android","user_city_ip_match":1.0,"device_id_age_avg":181.0,"persona_distraction_events":None,"sub_fs_ls":146.0,"device_id_age_min":181.0,"confidence_score_max":1.0,"persona_phone_risk_score":None,"ip_address_risk_level":"none","login_frequency":0,"suspect_score":0.0,"confidence_score":1.0,"name_consistency":None,"ip_location_consistency":1} + for key, expected_value in expected_result.items(): + if isinstance(expected_value, float): + self.assertAlmostEqual(result[key], expected_value, places=6, msg=f"Mismatch for {key}") + elif expected_value is None: + self.assertTrue(pd.isna(result[key]), msg=f"Mismatch for {key}") + else: + self.assertEqual(result[key], expected_value, msg=f"Mismatch for {key}") + + # def test_main_invalid_input(self): + # with self.assertRaises(TypeError): + # __main__(user_age = '43',persona_entity_confidence_score = None,persona_selfie_similarity_score_right = None,persona_selfie_similarity_score_left = None,persona_hesitation_percentage = None,persona_hesitation_count = None,device_id_age_max = 181.0,selfie_consistency_score_avg = None,device_consistency = 1,selfie_consistency_score = None,global_fs_ls = 146.0,inquiry_frequency = 0,confidence_score_min = 1.0,contract_date_fs_sub = 181.0,browser_os = 'Android',user_city_ip_match = 1.0,device_id_age_avg = 181.0,persona_distraction_events = None,sub_fs_ls = 146.0,device_id_age_min = 181.0,confidence_score_max = 1.0,persona_phone_risk_score = None,ip_address_risk_level = None,login_frequency = 0,suspect_score = 0.0,confidence_score = 1.0,name_consistency = 
None,ip_location_consistency = 1) # Invalid input type (string) + +if __name__ == "__main__": + unittest.main()