From d20e15fd4da59f3c99dfedae530bee866d90ce93 Mon Sep 17 00:00:00 2001
From: Admin User <kiwi_admin_user@centurion.com>
Date: Wed, 5 Feb 2025 19:10:31 +0000
Subject: [PATCH] ETD Pre-preprocessing block

---
 README.md            |  12 ++++-
 block.py             | 103 ++++++++++++++++++++++++++++++-------
 request_schema.json  | 120 ++++++++++++++++++++++++++++++++++++++++++-
 requirements.txt     |   2 +-
 response_schema.json | 119 +++++++++++++++++++++++++++++++++++++++++-
 test_block.py        |  24 +++++++++
 6 files changed, 357 insertions(+), 23 deletions(-)
 create mode 100644 test_block.py

diff --git a/README.md b/README.md
index 59a3efc..c1dca4f 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,11 @@
-**Hello world!!!**
+## Overview
+This block (`block.py`) prepares the inputs for the model. It cleanses the data by coercing numeric fields to numbers (values that cannot be parsed become null) and lower-casing string fields, then returns the normalized values as a dictionary.
+
+## Key Inputs & Outputs
+- **Request**: Refer to `request_schema.json` for detailed input fields and validation rules.  
+- **Response**: Refer to `response_schema.json` for the returned structure and data types.
+
+## Implementation Details
+- All core logic resides in `block.py` within the `__main__` function.
+- Example usage and validation are demonstrated in `test_block.py`.
+
diff --git a/block.py b/block.py
index 3b227f9..96e3e51 100644
--- a/block.py
+++ b/block.py
@@ -1,21 +1,86 @@
-@flowx_block
-def example_function(request: dict) -> dict:
+import logging
+import pandas as pd
+import math
+import json
 
-  # Processing logic here...
+# Configure logging at import time (INFO level, timestamped format) and create this module's logger.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s - %(message)s",
+)
+logger = logging.getLogger(__name__)
+
+# NOTE(review): a disabled json.load of category_orders_train.json from a developer-local absolute
+# path stood here; nothing in this module reads category_orders, so the load stays switched off.
+
+def __main__(user_age: int,persona_entity_confidence_score: float,persona_selfie_similarity_score_right: float,
+             persona_selfie_similarity_score_left: float,persona_hesitation_percentage: float,
+             persona_hesitation_count: float,device_id_age_max: int,selfie_consistency_score_avg: float,
+             device_consistency: int,selfie_consistency_score: float,global_fs_ls: int,inquiry_frequency: int,
+             confidence_score_min: float,contract_date_fs_sub: int,browser_os: str,user_city_ip_match: int,
+             device_id_age_avg: float,persona_distraction_events: float,sub_fs_ls: int,device_id_age_min: int,
+             confidence_score_max: float,persona_phone_risk_score: float,ip_address_risk_level: str,
+             login_frequency: float,suspect_score: int,confidence_score: float,name_consistency: int,
+             ip_location_consistency: int) ->dict:
+    # Clean one record of model features: coerce numeric fields, lower-case text fields, return a plain dict.
+    dtypes = {  # per-field kind; 'int' and 'float' are handled identically below (no integer cast is applied)
+        'user_age': 'int',
+        'persona_entity_confidence_score': 'float',
+        'persona_selfie_similarity_score_right': 'float',
+        'persona_selfie_similarity_score_left': 'float',
+        'persona_hesitation_percentage': 'float',
+        'persona_hesitation_count': 'float',
+        'device_id_age_max': 'int',
+        'selfie_consistency_score_avg': 'float',
+        'device_consistency': 'int',
+        'selfie_consistency_score': 'float',
+        'global_fs_ls': 'int',
+        'inquiry_frequency': 'int',
+        'confidence_score_min': 'float',
+        'contract_date_fs_sub': 'int',
+        'browser_os': 'string',
+        'user_city_ip_match': 'int',
+        'device_id_age_avg': 'float',
+        'persona_distraction_events': 'float',
+        'sub_fs_ls': 'int',
+        'device_id_age_min': 'int',
+        'confidence_score_max': 'float',
+        'persona_phone_risk_score': 'float',
+        'ip_address_risk_level': 'string',
+        'login_frequency': 'float',
+        'suspect_score': 'int',
+        'confidence_score': 'float',
+        'name_consistency': 'int',
+        'ip_location_consistency': 'int'
+    }
+    # Gather the keyword arguments into a column -> value mapping so they can form a one-row DataFrame.
+    input_data = {"user_age" : user_age,"persona_entity_confidence_score" : persona_entity_confidence_score,
+        "persona_selfie_similarity_score_right" : persona_selfie_similarity_score_right,
+        "persona_selfie_similarity_score_left" : persona_selfie_similarity_score_left,
+        "persona_hesitation_percentage" : persona_hesitation_percentage,
+        "persona_hesitation_count" : persona_hesitation_count,"device_id_age_max" : device_id_age_max,
+        "selfie_consistency_score_avg" : selfie_consistency_score_avg,"device_consistency" : device_consistency,
+        "selfie_consistency_score" : selfie_consistency_score,"global_fs_ls" : global_fs_ls,
+        "inquiry_frequency" : inquiry_frequency,"confidence_score_min" : confidence_score_min,
+        "contract_date_fs_sub" : contract_date_fs_sub,"browser_os" : browser_os,
+        "user_city_ip_match" : user_city_ip_match,"device_id_age_avg" : device_id_age_avg,
+        "persona_distraction_events" : persona_distraction_events,"sub_fs_ls" : sub_fs_ls,
+        "device_id_age_min" : device_id_age_min,"confidence_score_max" : confidence_score_max,
+        "persona_phone_risk_score" : persona_phone_risk_score,"ip_address_risk_level" : ip_address_risk_level,
+        "login_frequency" : login_frequency,"suspect_score" : suspect_score,"confidence_score" : confidence_score,
+        "name_consistency" : name_consistency,"ip_location_consistency" : ip_location_consistency}
+    # index=[False] only supplies the single row's label; the row is read back by position (iloc[0]) below.
+    df = pd.DataFrame(input_data, index=[False])
+    # Numeric fields: values that cannot be parsed become NaN (errors='coerce'). Other fields: str() + lower().
+    for column, dtype in dtypes.items():
+        if dtype == 'int' or dtype == 'float':
+            df[column] = pd.to_numeric(df[column], errors='coerce')
+        else:
+            df[column] = df[column].astype(str).str.lower()  # NOTE: a None input becomes the text "none" here
+    # Take the single row, replace entries that are null with None, and convert it to a dict.
+    output_data = df.iloc[0].where(pd.notnull(df.iloc[0]), None).to_dict()
+    # The complete cleaned record is written to the log at INFO level.
+    logger.info(f"Fraud V1 Pre processed data: {output_data}")
+
+    return output_data
 
-  return {
-    "meta_info": [
-      {
-        "name": "created_date",
-        "type": "string",
-        "value": "2024-11-05"
-      }
-    ],
-    "fields": [
-      {
-        "name": "",
-        "type": "",
-        "value": ""
-      }
-    ]
-  }
diff --git a/request_schema.json b/request_schema.json
index 0967ef4..8116680 100644
--- a/request_schema.json
+++ b/request_schema.json
@@ -1 +1,119 @@
-{}
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "type": "object",
+  "properties": {
+    "user_age": {
+      "type": ["integer", "null"],
+      "description": "Age of the user at the contract date, based on birthdate and contract date"
+    },
+    "persona_entity_confidence_score": {
+      "type": ["number", "null"],
+      "description": "Based on confidence reasons assign a score between 0 and 100"
+    },
+    "persona_selfie_similarity_score_right": {
+      "type": ["number", "null"],
+      "description": "Similarity score from the right side selfie"
+    },
+    "persona_selfie_similarity_score_left": {
+      "type": ["number", "null"],
+      "description": "Similarity score from the left side selfie"
+    },
+    "persona_hesitation_percentage": {
+      "type": ["number", "null"],
+      "description": "Percentage of time in the flow where the customer did not enter inputs"
+    },
+    "persona_hesitation_count": {
+      "type": ["number", "null"],
+      "description": "Persona hesitation count"
+    },
+    "device_id_age_max": {
+      "type": ["integer", "null"],
+      "description": "This calculates the maximum device age for a user and loan, similar to min and avg logic but for the max value."
+    },
+    "selfie_consistency_score_avg": {
+      "type": ["number", "null"],
+      "description": "Average selfie consistency score for the user's persona activity"
+    },
+    "device_consistency": {
+      "type": ["integer", "null"],
+      "description": "Number of distinct devices associated with a user and loan"
+    },
+    "selfie_consistency_score": {
+      "type": ["number", "null"],
+      "description": "Average similarity score between left and right selfie"
+    },
+    "global_fs_ls": {
+      "type": ["integer", "null"],
+      "description": "Days between the first and last global appearance of the device"
+    },
+    "inquiry_frequency": {
+      "type": ["integer", "null"],
+      "description": "Number of inquiries made by the user regarding the loan"
+    },
+    "confidence_score_min": {
+      "type": ["number", "null"],
+      "description": "The minimum recorded confidence score for the user and loan during the timeframe."
+    },
+    "contract_date_fs_sub": {
+      "type": ["integer", "null"],
+      "description": "Days between the first subscription appearance and contract date"
+    },
+    "browser_os": {
+      "type": ["string", "null"],
+      "description": "Browser OS"
+    },
+    "user_city_ip_match": {
+      "type": ["integer", "null"],
+      "description": "Checks if the user's city matches the IP city"
+    },
+    "device_id_age_avg": {
+      "type": ["number", "null"],
+      "description": "This calculates the rolling average of device_id_age for a user and loan. If no previous rows, the current value is returned."
+    },
+    "persona_distraction_events": {
+      "type": ["number", "null"],
+      "description": "Persona distraction events"
+    },
+    "sub_fs_ls": {
+      "type": ["integer", "null"],
+      "description": "Days between the first and last subscription activity"
+    },
+    "device_id_age_min": {
+      "type": ["integer", "null"],
+      "description": "This calculates the minimum device age for a user and loan, falling back to the current device age if no prior values exist."
+    },
+    "confidence_score_max": {
+      "type": ["number", "null"],
+      "description": "The maximum confidence score recorded for the user and loan combination."
+    },
+    "persona_phone_risk_score": {
+      "type": ["number", "null"],
+      "description": "Risk associated with the phone number. The risk score ranges from 0 to 100. The higher the risk score, the higher the risk level."
+    },
+    "ip_address_risk_level": {
+      "type": ["string", "null"],
+      "description": "Checks if the IP country code matches the persona country code"
+    },
+    "login_frequency": {
+      "type": ["number", "null"],
+      "description": "This counts the number of times the user logs in based on the inquiry_updated_at timestamp, providing insights into the user's login behavior throughout the loan process."
+    },
+    "suspect_score": {
+      "type": ["integer", "null"],
+      "description": "Suspect score"
+    },
+    "confidence_score": {
+      "type": ["number", "null"],
+      "description": "Confidence score"
+    },
+    "name_consistency": {
+      "type": ["integer", "null"],
+      "description": "Checks if the first name in the persona matches the user-provided first name"
+    },
+    "ip_location_consistency": {
+      "type": ["integer", "null"],
+      "description": "Number of distinct IP locations for a user and loan"
+    }
+  },
+  "required": []
+}
diff --git a/requirements.txt b/requirements.txt
index 0967ef4..1ce7bd9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
-{}
+pandas==2.2.2
\ No newline at end of file
diff --git a/response_schema.json b/response_schema.json
index 0967ef4..ce5c18b 100644
--- a/response_schema.json
+++ b/response_schema.json
@@ -1 +1,118 @@
-{}
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "type": "object",
+  "properties": {
+    "user_age": {
+      "type": ["integer", "null"],
+      "description": "Age of the user at the contract date, based on birthdate and contract date"
+    },
+    "persona_entity_confidence_score": {
+      "type": ["number", "null"],
+      "description": "Based on confidence reasons assign a score between 0 and 100"
+    },
+    "persona_selfie_similarity_score_right": {
+      "type": ["number", "null"],
+      "description": "Similarity score from the right side selfie"
+    },
+    "persona_selfie_similarity_score_left": {
+      "type": ["number", "null"],
+      "description": "Similarity score from the left side selfie"
+    },
+    "persona_hesitation_percentage": {
+      "type": ["number", "null"],
+      "description": "Percentage of time in the flow where the customer did not enter inputs"
+    },
+    "persona_hesitation_count": {
+      "type": ["number", "null"],
+      "description": "Persona hesitation count"
+    },
+    "device_id_age_max": {
+      "type": ["integer", "null"],
+      "description": "This calculates the maximum device age for a user and loan, similar to min and avg logic but for the max value."
+    },
+    "selfie_consistency_score_avg": {
+      "type": ["number", "null"],
+      "description": "Average selfie consistency score for the user's persona activity"
+    },
+    "device_consistency": {
+      "type": ["integer", "null"],
+      "description": "Number of distinct devices associated with a user and loan"
+    },
+    "selfie_consistency_score": {
+      "type": ["number", "null"],
+      "description": "Average similarity score between left and right selfie"
+    },
+    "global_fs_ls": {
+      "type": ["integer", "null"],
+      "description": "Days between the first and last global appearance of the device"
+    },
+    "inquiry_frequency": {
+      "type": ["integer", "null"],
+      "description": "Number of inquiries made by the user regarding the loan"
+    },
+    "confidence_score_min": {
+      "type": ["number", "null"],
+      "description": "The minimum recorded confidence score for the user and loan during the timeframe."
+    },
+    "contract_date_fs_sub": {
+      "type": ["integer", "null"],
+      "description": "Days between the first subscription appearance and contract date"
+    },
+    "browser_os": {
+      "type": ["string", "null"],
+      "description": "Browser OS"
+    },
+    "user_city_ip_match": {
+      "type": ["integer", "null"],
+      "description": "Checks if the user's city matches the IP city"
+    },
+    "device_id_age_avg": {
+      "type": ["number", "null"],
+      "description": "This calculates the rolling average of device_id_age for a user and loan. If no previous rows, the current value is returned."
+    },
+    "persona_distraction_events": {
+      "type": ["number", "null"],
+      "description": "Persona distraction events"
+    },
+    "sub_fs_ls": {
+      "type": ["integer", "null"],
+      "description": "Days between the first and last subscription activity"
+    },
+    "device_id_age_min": {
+      "type": ["integer", "null"],
+      "description": "This calculates the minimum device age for a user and loan, falling back to the current device age if no prior values exist."
+    },
+    "confidence_score_max": {
+      "type": ["number", "null"],
+      "description": "The maximum confidence score recorded for the user and loan combination."
+    },
+    "persona_phone_risk_score": {
+      "type": ["number", "null"],
+      "description": "Risk associated with the phone number. The risk score ranges from 0 to 100. The higher the risk score, the higher the risk level."
+    },
+    "ip_address_risk_level": {
+      "type": ["string", "null"],
+      "description": "Checks if the IP country code matches the persona country code"
+    },
+    "login_frequency": {
+      "type": ["number", "null"],
+      "description": "This counts the number of times the user logs in based on the inquiry_updated_at timestamp, providing insights into the user's login behavior throughout the loan process."
+    },
+    "suspect_score": {
+      "type": ["integer", "null"],
+      "description": "Suspect score"
+    },
+    "confidence_score": {
+      "type": ["number", "null"],
+      "description": "Confidence score"
+    },
+    "name_consistency": {
+      "type": ["integer", "null"],
+      "description": "Checks if the first name in the persona matches the user-provided first name"
+    },
+    "ip_location_consistency": {
+      "type": ["integer", "null"],
+      "description": "Number of distinct IP locations for a user and loan"
+    }
+  }
+}
diff --git a/test_block.py b/test_block.py
new file mode 100644
index 0000000..17735ca
--- /dev/null
+++ b/test_block.py
@@ -0,0 +1,24 @@
+import unittest
+import pandas as pd
+from block import __main__
+
+class TestBlock(unittest.TestCase):
+    # Exercises block.__main__ with one representative record and compares the result field by field.
+    def test_main_success(self):
+        result = __main__(user_age = 43,persona_entity_confidence_score = None,persona_selfie_similarity_score_right = None,persona_selfie_similarity_score_left = None,persona_hesitation_percentage = None,persona_hesitation_count = None,device_id_age_max = 181.0,selfie_consistency_score_avg = None,device_consistency = 1,selfie_consistency_score = None,global_fs_ls = 146.0,inquiry_frequency = 0,confidence_score_min = 1.0,contract_date_fs_sub = 181.0,browser_os = 'Android',user_city_ip_match = 1.0,device_id_age_avg = 181.0,persona_distraction_events = None,sub_fs_ls = 146.0,device_id_age_min = 181.0,confidence_score_max = 1.0,persona_phone_risk_score = None,ip_address_risk_level = None,login_frequency = 0,suspect_score = 0.0,confidence_score = 1.0,name_consistency = None,ip_location_consistency = 1)
+        # Expected: numbers unchanged, browser_os lower-cased, and the None string field rendered as "none".
+        expected_result = {"user_age":43,"persona_entity_confidence_score":None,"persona_selfie_similarity_score_right":None,"persona_selfie_similarity_score_left":None,"persona_hesitation_percentage":None,"persona_hesitation_count":None,"device_id_age_max":181.0,"selfie_consistency_score_avg":None,"device_consistency":1,"selfie_consistency_score":None,"global_fs_ls":146.0,"inquiry_frequency":0,"confidence_score_min":1.0,"contract_date_fs_sub":181.0,"browser_os":"android","user_city_ip_match":1.0,"device_id_age_avg":181.0,"persona_distraction_events":None,"sub_fs_ls":146.0,"device_id_age_min":181.0,"confidence_score_max":1.0,"persona_phone_risk_score":None,"ip_address_risk_level":"none","login_frequency":0,"suspect_score":0.0,"confidence_score":1.0,"name_consistency":None,"ip_location_consistency":1}
+        for key, expected_value in expected_result.items():
+            if isinstance(expected_value, float):  # floats: compare to 6 decimal places
+                self.assertAlmostEqual(result[key], expected_value, places=6, msg=f"Mismatch for {key}")
+            elif expected_value is None:  # missing values: accept either None or NaN via pd.isna
+                self.assertTrue(pd.isna(result[key]), msg=f"Mismatch for {key}")
+            else:  # ints and strings: exact equality
+                self.assertEqual(result[key], expected_value, msg=f"Mismatch for {key}")
+    # NOTE(review): the test below is disabled; __main__ coerces via pd.to_numeric instead of type-checking, so presumably no TypeError is raised - confirm intent.
+    # def test_main_invalid_input(self):
+    #     with self.assertRaises(TypeError):
+    #         __main__(user_age = '43',persona_entity_confidence_score = None,persona_selfie_similarity_score_right = None,persona_selfie_similarity_score_left = None,persona_hesitation_percentage = None,persona_hesitation_count = None,device_id_age_max = 181.0,selfie_consistency_score_avg = None,device_consistency = 1,selfie_consistency_score = None,global_fs_ls = 146.0,inquiry_frequency = 0,confidence_score_min = 1.0,contract_date_fs_sub = 181.0,browser_os = 'Android',user_city_ip_match = 1.0,device_id_age_avg = 181.0,persona_distraction_events = None,sub_fs_ls = 146.0,device_id_age_min = 181.0,confidence_score_max = 1.0,persona_phone_risk_score = None,ip_address_risk_level = None,login_frequency = 0,suspect_score = 0.0,confidence_score = 1.0,name_consistency = None,ip_location_consistency = 1)  # Invalid input type (string)
+
+if __name__ == "__main__":  # run the test suite when this file is executed directly
+    unittest.main()