[add]: code refactoring and cleanup

This commit is contained in:
VivianDee
2025-09-07 23:07:40 +01:00
parent 2cc3d70f4f
commit 6de9583aaf
25 changed files with 86 additions and 60 deletions
+44
View File
@@ -0,0 +1,44 @@
from django.conf import settings
import httpx
import json
from app.config import SIMBRELLA_BASE_URL, SIMBRELLA_ENDPOINT_RAC_CHECKS
from app.utils.logger import logger
class SimbrellaIntegration:
    """HTTP client wrapper for the Simbrella RAC-check endpoint."""

    # Both values come from app.config; the request URL is "<BASE_URL>/<ENDPOINT_RAC_CHECKS>".
    BASE_URL = SIMBRELLA_BASE_URL
    ENDPOINT_RAC_CHECKS = SIMBRELLA_ENDPOINT_RAC_CHECKS

    @staticmethod
    def rac_check(customer_id, account_id, transaction_id):
        """
        Call the RACCheck endpoint.

        Args:
            customer_id: Sent as ``customerId``.
            account_id: Sent as ``accountId``.
            transaction_id: Stringified and sent as both ``transactionId``
                and ``fbnTransactionId``.

        Returns:
            The ``httpx`` response object. The status code is not checked
            here, so non-2xx responses are returned to the caller as-is.

        Raises:
            Exception: If the HTTP call itself fails; the original error is
                attached as ``__cause__``.
        """
        url = f"{SimbrellaIntegration.BASE_URL}/{SimbrellaIntegration.ENDPOINT_RAC_CHECKS}"
        # No exc_info here: there is no active exception, so asking for one
        # only appends a misleading "NoneType: None" to the log record.
        logger.info(f"Contacting Rack Checks EndPoint: {str(url)}")

        payload = {
            "customerId": customer_id,
            "accountId": account_id,
            "transactionId": str(transaction_id),
            "fbnTransactionId": str(transaction_id),
            "countryCode": "NG",
            "channel": "USSD"
        }
        headers = {
            "Content-Type": "application/json",
            "x-api-key": f"{settings.VALID_API_KEY}",
            "App-Id": f"{settings.VALID_APP_ID}",
        }

        try:
            response = httpx.post(url, json=payload, headers=headers, timeout=10.0)
        except Exception as e:
            logger.error(f"RACCheck API call failed: {str(e)}", exc_info=True)
            # Chain the cause so the underlying transport error stays visible.
            raise Exception(f"RACCheck API call failed: {str(e)}") from e

        logger.info(f"This is Response: {str(response)}")
        return response
@@ -0,0 +1,36 @@
import time
import threading
import requests
from ...config import SALARY_DETECT_URL, SALARY_DETECT_HEADERS, get_random_salary_payload
from app.utils.logger import logger
class SalaryDetect:
    """Background worker that periodically POSTs a salary payload to SALARY_DETECT_URL."""

    # Pause between two detection rounds, in seconds.
    POLL_INTERVAL_SECONDS = 120
    # Upper bound for a single POST so a stalled server cannot hang the worker.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self):
        self._running = False
        self._thread = None
        # Set by stop(); lets the worker wake up from its pause immediately.
        self._stop_event = threading.Event()

    def _run(self):
        """Worker loop: post one payload, log the outcome, wait, repeat until stopped."""
        while not self._stop_event.is_set():
            logger.info(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Detecting salary...")
            try:
                payload = get_random_salary_payload()
                response = requests.post(
                    SALARY_DETECT_URL,
                    headers=SALARY_DETECT_HEADERS,
                    json=payload,
                    timeout=self.REQUEST_TIMEOUT_SECONDS,
                )
                logger.info(f"POST {SALARY_DETECT_URL} status: {response.status_code}, response: {response.text}")
            except Exception as e:
                # Best effort: a failed round is logged and the loop carries on.
                logger.error(f"Error during POST: {e}")
            logger.info(f"[{time.strftime('%Y-%m-%d %H:%M:%S')}] Salary detection complete")
            # Interruptible replacement for time.sleep(): returns as soon as
            # stop() sets the event instead of blocking for the full interval.
            self._stop_event.wait(self.POLL_INTERVAL_SECONDS)

    def start(self):
        """Start the daemon worker thread; does nothing if it is already running."""
        if not self._running:
            self._stop_event.clear()
            self._running = True
            self._thread = threading.Thread(target=self._run, daemon=True)
            self._thread.start()

    def stop(self):
        """Signal the worker to finish and wait for its thread to exit."""
        self._running = False
        self._stop_event.set()
        if self._thread:
            self._thread.join()
            self._thread = None
+24
View File
@@ -0,0 +1,24 @@
"""
Salary Analytics Package.

A package for analyzing and predicting salary patterns from transaction data.
"""
# The docstring must be the first statement in the module; placed after the
# imports it is just a discarded string expression and never becomes __doc__.
from .main import SalaryAnalyticsPipeline
from .data_loader import DataLoader
from .keyword_analyzer import KeywordAnalyzer
from .consistent_amount_analyzer import ConsistentAmountAnalyzer
from .transaction_type_analyzer import TransactionTypeAnalyzer
from .salary_earner_analyzer import SalaryEarnerAnalyzer
from .salary_predictor import SalaryPredictor

__version__ = "0.1.0"

# Names re-exported as the package's public API.
__all__ = [
    "SalaryAnalyticsPipeline",
    "DataLoader",
    "KeywordAnalyzer",
    "ConsistentAmountAnalyzer",
    "TransactionTypeAnalyzer",
    "SalaryEarnerAnalyzer",
    "SalaryPredictor",
]
@@ -0,0 +1,64 @@
"""
Consistent amount transaction analysis module.
"""
import pandas as pd
from .config import MODEL_CONFIG
class ConsistentAmountAnalyzer:
    """Flag accounts whose 'C'-initiated transaction amounts vary little.

    Variation is measured with the coefficient of variation (population
    standard deviation divided by the mean) of the ``amount`` column, computed
    per ``accountid`` over rows where ``initiated_by == 'C'``.
    """

    def __init__(self, df):
        self.df = df
        # Copy of df carrying the 'is_consistent_amount' column; filled lazily.
        self.const_df = None

    def calculate_coefficient_of_variation(self, group):
        """Calculate coefficient of variation for a group of transactions.

        Only rows with ``initiated_by == 'C'`` are considered. Returns NaN when
        the mean is zero (and, through NaN propagation, when no such row exists).
        """
        amounts = group[group['initiated_by'] == 'C']['amount']
        mean = amounts.mean()
        std = amounts.std(ddof=0)
        if mean == 0:
            return float('nan')
        return std / mean

    def flag_consistent_amounts(self, group, cv_threshold=None):
        """Flag accounts with low variance in transaction amounts.

        Returns a boolean Series aligned with ``group.index`` in which every
        row carries the same verdict for the whole group.
        """
        if cv_threshold is None:
            cv_threshold = MODEL_CONFIG['cv_threshold']
        cv = self.calculate_coefficient_of_variation(group)
        # An undefined CV (NaN) never counts as consistent.
        is_consistent = bool(cv <= cv_threshold) if not pd.isna(cv) else False
        return pd.Series(
            [is_consistent] * len(group),
            index=group.index,
            name='is_consistent_amount'
        )

    def identify_consistent_amount_accounts(self, cv_threshold=None):
        """Identify accounts with consistent transaction amounts.

        Args:
            cv_threshold: Maximum coefficient of variation for an account to
                count as consistent; defaults to ``MODEL_CONFIG['cv_threshold']``.

        Returns:
            A copy of the input DataFrame with a boolean
            ``is_consistent_amount`` column set on every row of each
            qualifying account.
        """
        if cv_threshold is None:
            cv_threshold = MODEL_CONFIG['cv_threshold']

        self.const_df = self.df.copy()

        # Per-account statistics computed in one vectorised pass. This avoids
        # groupby.apply, whose result changes shape when only a single account
        # is present and which fails on an empty frame.
        credit_rows = self.const_df.loc[self.const_df['initiated_by'] == 'C']
        amounts_by_account = credit_rows.groupby('accountid')['amount']
        means = amounts_by_account.mean()
        stds = amounts_by_account.std(ddof=0)

        # CV is undefined for a zero mean; NaN compares False against the threshold.
        cv = (stds / means).where(means != 0)
        consistent_accounts = cv.index[(cv <= cv_threshold).to_numpy()]

        # Accounts without any 'C' row are absent from cv and therefore False.
        self.const_df['is_consistent_amount'] = self.const_df['accountid'].isin(consistent_accounts)
        return self.const_df

    def get_consistent_amount_data(self):
        """Get transactions identified as having consistent amounts.

        Runs the identification with the default threshold first if it has
        not been run yet. Only 'C'-initiated rows are returned.
        """
        if self.const_df is None:
            self.identify_consistent_amount_accounts()
        return self.const_df[
            (self.const_df['is_consistent_amount']) &
            (self.const_df['initiated_by'] == 'C')
        ]
+169
View File
@@ -0,0 +1,169 @@
"""
Data loading and preprocessing module.
"""
from sqlalchemy import create_engine, text
import pandas as pd
from datetime import datetime
import logging
import os
from .config import DB_CONFIG, TABLE_NAME
from app.utils.logger import logger
class DataLoader:
    """Load transaction data from PostgreSQL or a CSV file into a DataFrame.

    Both sources are read in chunks of ``chunk_size`` rows; every chunk gets
    its date columns parsed, the ``d1``..``d4`` columns renamed, and rows
    containing any missing value dropped before the chunks are concatenated.
    """

    # Source column names -> names used by the rest of the package.
    _COLUMN_RENAMES = {
        'd1': 'trx_type',
        'd2': 'trx_subtype',
        'd3': 'initiated_by',
        'd4': 'customer_id',
    }
    _DATE_COLUMNS = ('trx_start_date', 'trx_end_date')

    def __init__(self):
        self.engine = None
        self.df = None
        self.chunk_size = 10000  # Load 10,000 rows at a time

    def connect(self):
        """Establish database connection.

        Returns:
            True when the engine connects and TABLE_NAME exists, otherwise
            False. ``self.engine`` is only assigned on success, so a later
            ``load_from_db`` retries the connection after a failure.
        """
        # Local import keeps this fix self-contained within the class.
        from urllib.parse import quote

        try:
            logger.info("Attempting to connect to database...")
            # Percent-encode credentials: characters such as '@', '/' or ':'
            # in a password would otherwise corrupt the connection URL.
            user = quote(str(DB_CONFIG['user']), safe='')
            password = quote(str(DB_CONFIG['password']), safe='')
            database_url = (
                f"postgresql://{user}:{password}"
                f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['name']}"
            )
            engine = create_engine(database_url)

            with engine.connect() as conn:
                # First check if table exists (name passed as a bound parameter).
                check_table = text(
                    "SELECT EXISTS (SELECT FROM information_schema.tables "
                    "WHERE table_name = :table_name)"
                )
                table_exists = conn.execute(check_table, {"table_name": TABLE_NAME}).scalar()

                if not table_exists:
                    logger.error(f"Table {TABLE_NAME} does not exist in the database")
                    engine.dispose()
                    return False

                # Get row count. Identifiers cannot be bound, so TABLE_NAME is
                # interpolated; it comes from project config, not user input.
                count_query = text(f"SELECT COUNT(*) FROM {TABLE_NAME}")
                row_count = conn.execute(count_query).scalar()
                logger.info(f"Table {TABLE_NAME} exists with {row_count} rows")

                # Get version (result unused; kept as a final round-trip check).
                conn.execute(text("SELECT version();"))

            self.engine = engine
            logger.info("Connected successfully to database!")
            return True
        except Exception as e:
            logger.error(f"Error connecting to database: {str(e)}")
            return False

    def _preprocess_chunk(self, chunk):
        """Parse the date columns, rename d1..d4, and drop rows with any NaN."""
        for column in self._DATE_COLUMNS:
            chunk[column] = pd.to_datetime(chunk[column])
        # rename() ignores keys that are not present, so this is a no-op for
        # sources that already use the final column names.
        chunk = chunk.rename(columns=self._COLUMN_RENAMES)
        return chunk.dropna()

    def _combine_and_validate(self, chunks, loaded_message_suffix):
        """Concatenate chunks into self.df and log basic dataset facts.

        Returns None, leaving self.df untouched, when no chunk was read.
        """
        if not chunks:
            # pd.concat([]) raises an opaque "No objects to concatenate".
            logger.error("No data was read from the source")
            return None

        self.df = pd.concat(chunks, ignore_index=True)
        logger.info(f"Successfully loaded {len(self.df)} rows {loaded_message_suffix}")

        # Basic data validation
        logger.info("Performing data validation...")
        logger.info(f"Columns in dataset: {self.df.columns.tolist()}")
        logger.info(f"Data types:\n{self.df.dtypes}")
        logger.info(f"Missing values:\n{self.df.isnull().sum()}")
        return self.df

    def load_from_csv(self, file_path):
        """Load data from a CSV file.

        Returns the loaded DataFrame, or None if the file is missing, empty,
        or cannot be parsed.
        """
        try:
            logger.info(f"Loading data from CSV file: {file_path}")
            if not os.path.exists(file_path):
                logger.error(f"CSV file not found: {file_path}")
                return None

            # Load data in chunks
            chunks = [
                self._preprocess_chunk(raw_chunk)
                for raw_chunk in pd.read_csv(file_path, chunksize=self.chunk_size)
            ]
            return self._combine_and_validate(chunks, "from CSV")
        except Exception as e:
            logger.error(f"Error loading data from CSV: {str(e)}")
            return None

    def load_from_db(self):
        """Load and preprocess transaction data from database in chunks.

        Returns the loaded DataFrame, or None if connecting or reading fails.
        """
        if not self.engine:
            logger.info("No database connection. Attempting to connect...")
            if not self.connect():
                logger.error("Failed to establish database connection")
                return None

        try:
            logger.info(f"Loading data from table: {TABLE_NAME}")

            # First get total count
            with self.engine.connect() as conn:
                count_query = text(f"SELECT COUNT(*) FROM {TABLE_NAME}")
                total_rows = conn.execute(count_query).scalar()
                logger.info(f"Total rows to process: {total_rows}")

            # Stream a single result set in chunks. Paging with LIMIT/OFFSET
            # and no ORDER BY gives no stable row order between queries, so
            # rows could be skipped or read twice.
            query = f"SELECT * FROM {TABLE_NAME}"
            chunks = []
            rows_read = 0
            for raw_chunk in pd.read_sql(query, self.engine, chunksize=self.chunk_size):
                logger.info(f"Loading chunk starting at offset {rows_read}")
                rows_read += len(raw_chunk)
                chunks.append(self._preprocess_chunk(raw_chunk))

            return self._combine_and_validate(chunks, "of data")
        except Exception as e:
            logger.error(f"Error loading data: {str(e)}")
            return None

    def load_data(self, source='db', file_path=None):
        """Load data from either database or CSV file.

        Args:
            source: ``'db'`` or ``'csv'``.
            file_path: Required when ``source`` is ``'csv'``.

        Returns:
            The loaded DataFrame, or None on invalid arguments or load failure.
        """
        if source == 'db':
            return self.load_from_db()
        if source == 'csv':
            if not file_path:
                logger.error("File path must be provided when loading from CSV")
                return None
            return self.load_from_csv(file_path)

        logger.error(f"Invalid source: {source}. Must be 'db' or 'csv'")
        return None

    def get_data(self):
        """Get the loaded DataFrame (None, with a warning, if nothing was loaded)."""
        if self.df is None:
            logger.warning("No data loaded. Call load_data() first.")
        return self.df
@@ -0,0 +1,47 @@
"""
Keyword-based salary transaction analysis module.
"""
import re
import pandas as pd
from .config import SALARY_KEYWORDS
class KeywordAnalyzer:
    """Flag transactions whose description looks salary-related."""

    # Abbreviated or full month name followed by a 2-4 digit year, e.g.
    # "JAN 24" or "march2024". Matched case-insensitively.
    _MONTH_YEAR_PATTERNS = (
        r"\b(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)\s?\d{2,4}\b",
        r"\b(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)\s?\d{2,4}\b",
    )

    def __init__(self, df, keywords=None):
        """
        Args:
            df: Transactions with a ``description`` column (and
                ``initiated_by`` for ``get_salary_related_data``).
            keywords: Optional iterable of keywords to search for; when
                omitted, ``SALARY_KEYWORDS`` from the package config is used.
        """
        self.df = df
        self.desc_df = None
        self.keywords = keywords

    def _build_pattern(self):
        """Return one regex alternation covering keywords and month-year forms."""
        keywords = SALARY_KEYWORDS if self.keywords is None else self.keywords
        alternatives = list(self._MONTH_YEAR_PATTERNS)
        escaped_keywords = [re.escape(keyword) for keyword in keywords if keyword]
        # With no keywords the group would be empty ("\b(?:)\b") and match at
        # every word boundary, flagging every description; leave it out.
        if escaped_keywords:
            alternatives.insert(0, r'\b(?:' + '|'.join(escaped_keywords) + r')\b')
        return '|'.join(alternatives)

    def identify_salary_transactions(self):
        """
        Identifies potential salary-related transactions based on keywords
        and month-year patterns in the 'description' column.

        Adds a boolean ``is_salary_related`` column to ``self.df`` in place
        (missing descriptions are False), stores a copy in ``self.desc_df``
        and returns ``self.df``.
        """
        # Matching is case-insensitive on the raw text. Lowercasing the text
        # while keeping uppercase month names in the pattern meant the
        # month-year alternatives could never match.
        self.df['is_salary_related'] = self.df['description'].str.contains(
            self._build_pattern(),
            flags=re.IGNORECASE,
            na=False,
            regex=True
        )

        self.desc_df = self.df.copy()
        return self.df

    def get_salary_related_data(self):
        """Get transactions identified as salary-related.

        Runs the identification first if needed; only rows with
        ``initiated_by == 'C'`` are returned.
        """
        if self.desc_df is None:
            self.identify_salary_transactions()
        return self.desc_df[
            self.desc_df['is_salary_related'] &
            (self.desc_df['initiated_by'] == 'C')
        ]
+153
View File
@@ -0,0 +1,153 @@
"""
Main module for running the salary analytics pipeline.
"""
import logging
from .data_loader import DataLoader
from .keyword_analyzer import KeywordAnalyzer
from .consistent_amount_analyzer import ConsistentAmountAnalyzer
from .transaction_type_analyzer import TransactionTypeAnalyzer
from .salary_earner_analyzer import SalaryEarnerAnalyzer
from .salary_predictor import SalaryPredictor
from app.utils.logger import logger
class SalaryAnalyticsPipeline:
    """Run data loading, the three flagging analyses, reporting and model training in order."""

    def __init__(self):
        logger.info("Initializing SalaryAnalyticsPipeline")
        self.data_loader = None
        self.df = None
        self.keyword_analyzer = None
        self.consistent_amount_analyzer = None
        self.transaction_type_analyzer = None
        self.salary_earner_analyzer = None
        self.salary_predictor = None

    def _require_data(self):
        """Raise ValueError (after logging) when no data has been loaded yet."""
        if self.df is None:
            logger.error("Data not loaded. Call load_data() first.")
            raise ValueError("Data not loaded. Call load_data() first.")

    def load_data(self, source='db', file_path=None):
        """Load and preprocess the transaction data.

        Returns True when a DataFrame was loaded, False otherwise.
        """
        logger.info("Starting data loading process")
        self.data_loader = DataLoader()
        self.df = self.data_loader.load_data(source=source, file_path=file_path)
        if self.df is not None:
            logger.info(f"Successfully loaded data with {len(self.df)} rows")
        else:
            logger.error("Failed to load data")
        return self.df is not None

    def run_keyword_analysis(self):
        """Run keyword-based salary transaction analysis.

        Returns the matching rows and sets ``is_salary_related`` on ``self.df``.

        Raises:
            ValueError: If data has not been loaded.
        """
        self._require_data()

        logger.info("Starting keyword analysis")
        self.keyword_analyzer = KeywordAnalyzer(self.df)
        self.keyword_analyzer.identify_salary_transactions()
        keyword_data = self.keyword_analyzer.get_salary_related_data()

        # Update main DataFrame with keyword analysis results
        self.df['is_salary_related'] = self.df.index.isin(keyword_data.index)
        return keyword_data

    def run_consistent_amount_analysis(self):
        """Run consistent amount transaction analysis.

        Returns the matching rows and sets ``is_consistent_amount`` on ``self.df``.

        Raises:
            ValueError: If data has not been loaded.
        """
        self._require_data()

        logger.info("Starting consistent amount analysis")
        self.consistent_amount_analyzer = ConsistentAmountAnalyzer(self.df)
        self.consistent_amount_analyzer.identify_consistent_amount_accounts()
        consistent_data = self.consistent_amount_analyzer.get_consistent_amount_data()

        # Update main DataFrame with consistent amount analysis results
        self.df['is_consistent_amount'] = self.df.index.isin(consistent_data.index)
        return consistent_data

    def run_transaction_type_analysis(self):
        """Run transaction type analysis.

        Returns the matching rows and sets ``is_salary_type`` on ``self.df``.

        Raises:
            ValueError: If data has not been loaded.
        """
        self._require_data()

        logger.info("Starting transaction type analysis")
        self.transaction_type_analyzer = TransactionTypeAnalyzer(self.df)
        self.transaction_type_analyzer.flag_salary_type_transactions()
        type_data = self.transaction_type_analyzer.get_salary_type_data()

        # Update main DataFrame with transaction type analysis results
        self.df['is_salary_type'] = self.df.index.isin(type_data.index)
        return type_data

    def generate_salary_earner_reports(self):
        """Generate salary earner reports.

        Raises:
            ValueError: If data has not been loaded or any of the three
                analysis flag columns is missing.
        """
        self._require_data()

        # Ensure all analysis flags are present
        required_columns = ['is_salary_related', 'is_consistent_amount', 'is_salary_type']
        missing_columns = [col for col in required_columns if col not in self.df.columns]
        if missing_columns:
            logger.error(f"Missing required columns: {missing_columns}")
            raise ValueError(f"Missing required columns: {missing_columns}. Run all analyses first.")

        logger.info("Starting salary earner report generation")
        self.salary_earner_analyzer = SalaryEarnerAnalyzer(self.df)
        return self.salary_earner_analyzer.generate_reports()

    def train_salary_prediction_models(self):
        """Train salary prediction models.

        Generates the salary earner reports first when they do not exist yet,
        because the account lists used for training come from them.

        Raises:
            ValueError: If data has not been loaded.
        """
        self._require_data()

        logger.info("Starting model training")
        self.salary_predictor = SalaryPredictor(self.df)

        # Get accounts from the salary earner analyzer
        if self.salary_earner_analyzer is None:
            logger.info("Salary earner analyzer not initialized. Generating reports first.")
            self.generate_salary_earner_reports()

        consistent_accounts = self.salary_earner_analyzer.final_table['accountid'].unique()
        inconsistent_accounts = self.salary_earner_analyzer.likely_salary_earner['accountid'].unique()

        self.salary_predictor.train_and_evaluate(consistent_accounts, inconsistent_accounts)

    def run_full_pipeline(self, source='db', file_path=None):
        """Run the complete salary analytics pipeline.

        Returns True on success. Load failures and any exception raised by a
        later stage are logged and reported as False rather than propagated.
        """
        logger.info("Starting full pipeline execution")

        if not self.load_data(source=source, file_path=file_path):
            logger.error("Failed to load data. Exiting pipeline.")
            return False

        try:
            logger.info("Running keyword analysis...")
            self.run_keyword_analysis()

            logger.info("Running consistent amount analysis...")
            self.run_consistent_amount_analysis()

            logger.info("Running transaction type analysis...")
            self.run_transaction_type_analysis()

            logger.info("Generating salary earner reports...")
            self.generate_salary_earner_reports()

            logger.info("Training salary prediction models...")
            self.train_salary_prediction_models()

            logger.info("Pipeline completed successfully!")
            return True
        except Exception as e:
            # The exception is swallowed here, so the traceback in the log is
            # the only record of which stage failed and why.
            logger.error(f"Pipeline failed: {str(e)}", exc_info=True)
            return False
def main():
    """Main function to run the salary analytics pipeline.

    Returns:
        The boolean result of ``run_full_pipeline()`` with its default
        arguments.
    """
    pipeline = SalaryAnalyticsPipeline()
    return pipeline.run_full_pipeline()


if __name__ == "__main__":
    # Propagate failure to the shell; previously the process exited 0 even
    # when the pipeline reported False.
    raise SystemExit(0 if main() else 1)
@@ -0,0 +1,169 @@
"""
Salary earner analysis and report generation module.
"""
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn3
from datetime import datetime, timedelta
from .config import MODEL_CONFIG, OUTPUT_PATHS
from app.utils.logger import logger
class SalaryEarnerAnalyzer:
    """Build per-account salary earner tables from the three hypothesis flags."""

    # Column order of the tables returned by generate_salary_earners_table.
    # Declared explicitly so an empty result still carries every column.
    _TABLE_COLUMNS = [
        'accountid',
        'num_months',
        'least_inflow_6m',
        'avg_monthly_salary',
        'estimated_next_amount',
        'estimated_next_date',
        '45daysalary',
        '2monthssalary',
    ]

    def __init__(self, df):
        self.df = df
        self.final_table = None
        self.likely_salary_earner = None
        self.high_earner_details = None

    def filter_venn_section(self, **kwargs):
        """Filter accounts based on specified combinations of hypothesis flags.

        Args:
            **kwargs: Required value for any of ``is_salary_related``,
                ``is_consistent_amount`` and ``is_salary_type``.

        Returns:
            The 'C'-initiated rows matching every given flag value, without
            rows that lack ``accountid``, ``trx_start_date`` or ``amount``.

        Raises:
            ValueError: If an unknown flag name is passed.
        """
        valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}
        invalid_keys = set(kwargs.keys()) - valid_columns
        if invalid_keys:
            raise ValueError(f"Invalid keys: {invalid_keys}. Valid keys are {valid_columns}.")

        df1 = self.df[self.df['initiated_by'] == 'C'].copy()

        condition = pd.Series(True, index=df1.index)
        for key, value in kwargs.items():
            condition &= (df1[key] == value)

        filtered_df = df1[condition]

        # Drop any rows with NaN values in critical columns
        critical_cols = ['accountid', 'trx_start_date', 'amount']
        return filtered_df.dropna(subset=critical_cols)

    def plot_hypothesis_overlap(self, hypothesis1_df, hypothesis3_df, hypothesis4_df, account_col='accountid'):
        """Plot and save Venn diagram showing overlap between hypotheses.

        The three frames supply the account sets for the salary-description,
        consistent-amount and transaction-type flags respectively; the figure
        is written to ``OUTPUT_PATHS['hypothesis_overlap_plot']``.
        """
        set2 = set(hypothesis3_df[account_col][hypothesis3_df['is_consistent_amount']])
        set3 = set(hypothesis1_df[account_col][hypothesis1_df['is_salary_related']])
        set4 = set(hypothesis4_df[account_col][hypothesis4_df['is_salary_type']])

        plt.figure(figsize=(10, 10))
        venn3([set2, set3, set4], set_labels=('Consistent Amount',
                                              'Salary Description', 'Transaction Type'))
        plt.title('Overlap Between Hypotheses')
        plt.savefig(OUTPUT_PATHS['hypothesis_overlap_plot'])
        plt.close()

    def _summarize_account(self, accountid, group, now):
        """Compute one table row for a single account's transactions."""
        amounts = group['amount']
        # Intervals are only meaningful between chronologically adjacent
        # transactions, so sort before diffing.
        dates = group['trx_start_date'].sort_values()

        # Count of transactions in the group (not distinct calendar months).
        num_months = len(group)

        # Smallest amount received in the last 180 days, 0 if none.
        last_6_months = group[group['trx_start_date'] >= (now - timedelta(days=180))]
        least_inflow = 0 if last_6_months.empty else last_6_months['amount'].min()

        avg_salary = amounts.mean() if amounts.notna().any() else 0

        median_interval = dates.diff().dt.days.median()
        if pd.isna(median_interval):
            median_interval = 30  # Default to 30 days if no interval data

        last_date = dates.max()
        next_date = last_date + timedelta(days=float(median_interval))

        days_since_last = (now - last_date).days
        recent_count = len(group[group['trx_start_date'] >= (now - timedelta(days=60))])

        return {
            'accountid': accountid,
            'num_months': num_months,
            'least_inflow_6m': least_inflow,
            'avg_monthly_salary': avg_salary,
            'estimated_next_amount': avg_salary,
            'estimated_next_date': next_date,
            '45daysalary': days_since_last <= 45,
            '2monthssalary': recent_count >= 2,
        }

    def generate_salary_earners_table(self, all_three_hypotheses):
        """Generate a table of salary earners with their metrics.

        Args:
            all_three_hypotheses: Transactions with ``accountid``,
                ``trx_start_date`` and ``amount`` columns.

        Returns:
            One row per account with the columns listed in ``_TABLE_COLUMNS``.
            An empty input yields an empty table that still has those columns.
        """
        # One reference time for every account keeps the windows consistent.
        now = datetime.now()

        results = [
            self._summarize_account(accountid, group, now)
            for accountid, group in all_three_hypotheses.groupby('accountid')
            if not group.empty
        ]

        final_df = pd.DataFrame(results, columns=self._TABLE_COLUMNS)

        # Drop rows where all numeric columns are NaN
        numeric_cols = ['num_months', 'least_inflow_6m', 'avg_monthly_salary', 'estimated_next_amount']
        return final_df.dropna(subset=numeric_cols, how='all')

    def analyze_salary_earners(self, final_df):
        """Analyze salary earners and identify high earners.

        Returns:
            ``(details, count)`` where ``details`` holds ``accountid`` and
            ``least_inflow_6m`` for accounts whose estimated next amount is at
            least ``MODEL_CONFIG['high_earner_threshold']``.
        """
        high_earners = final_df[final_df['estimated_next_amount'] >= MODEL_CONFIG['high_earner_threshold']].copy()
        high_earner_details = high_earners[['accountid', 'least_inflow_6m']].reset_index(drop=True)
        return high_earner_details, len(high_earners)

    def generate_reports(self):
        """Generate all salary earner reports.

        Builds the verified table (all three flags), the likely table (salary
        type plus exactly one of the other two flags) and the high earner
        details, saves the Venn plot and the three CSV files to the
        ``OUTPUT_PATHS`` locations, and returns the results in a dict.
        """
        # Get accounts flagged by all three hypotheses
        all_three_hypotheses = self.filter_venn_section(
            is_salary_related=True,
            is_consistent_amount=True,
            is_salary_type=True
        )

        # Generate final table
        self.final_table = self.generate_salary_earners_table(all_three_hypotheses)
        logger.info(f"Found {self.final_table['accountid'].nunique()} verified salary earners")

        # Generate likely salary earner table
        green_section = self.filter_venn_section(
            is_salary_related=True,
            is_consistent_amount=False,
            is_salary_type=True
        )
        yellow_section = self.filter_venn_section(
            is_salary_related=False,
            is_consistent_amount=True,
            is_salary_type=True
        )

        likely_rows = pd.concat([yellow_section, green_section])
        likely_rows = likely_rows.drop_duplicates(subset=['id'])
        self.likely_salary_earner = self.generate_salary_earners_table(likely_rows)
        logger.info(f"Found {self.likely_salary_earner['accountid'].nunique()} likely salary earners")

        # Analyze high earners
        self.high_earner_details, total_high_earners = self.analyze_salary_earners(self.final_table)
        logger.info(f"\nTotal High Earners: {total_high_earners}")

        # Plot hypothesis overlap
        self.plot_hypothesis_overlap(
            self.df[self.df['is_salary_related']],
            self.df[self.df['is_consistent_amount']],
            self.df[self.df['is_salary_type']]
        )

        # Save reports
        self.high_earner_details.to_csv(OUTPUT_PATHS['high_earner_details'], index=False)
        self.likely_salary_earner.to_csv(OUTPUT_PATHS['likely_salary_earner'], index=False)
        self.final_table.to_csv(OUTPUT_PATHS['final_table'], index=False)

        return {
            'final_table': self.final_table,
            'likely_salary_earner': self.likely_salary_earner,
            'high_earner_details': self.high_earner_details,
            'total_high_earners': total_high_earners
        }
+171
View File
@@ -0,0 +1,171 @@
"""
Salary prediction module using machine learning.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from joblib import dump
from .config import OUTPUT_PATHS
class SalaryPredictor:
    """Train Random Forest models that predict an account's next monthly amount."""

    def __init__(self, df):
        self.df = df
        self.model_cons = None
        self.model_incons = None
        self.scaler_cons = None
        self.scaler_incons = None

    def add_feature_engineering(self, df):
        """Engineer features for salary prediction.

        Adds ``month``, ``month_seq``, one-hot ``trx_type_*`` columns and
        3-transaction rolling sum/mean of ``amount`` per account. ``month``
        and ``month_seq`` are also written onto the passed-in frame.

        Returns:
            The augmented frame, sorted by ``accountid`` and ``trx_start_date``.
        """
        dates = df['trx_start_date']
        df['month'] = dates.dt.month

        # Chronological month number within each account (1 = first month seen).
        # Year is included so the same calendar month in different years stays
        # distinct; a global ngroup() over (account, month) was neither
        # per-account nor ordered in time. Rows without a date get 0.
        absolute_month = dates.dt.year * 12 + dates.dt.month
        df['month_seq'] = (
            absolute_month.groupby(df['accountid'])
            .rank(method='dense')
            .fillna(0)
            .astype(int)
        )

        # Categorical encoding
        encoder = OneHotEncoder(sparse_output=False)
        encoded_trx_type = encoder.fit_transform(df[['trx_type']])
        # Reuse df's index: concat(axis=1) aligns on index labels, and df is
        # usually a filtered slice whose labels are not 0..n-1.
        encoded_df = pd.DataFrame(
            encoded_trx_type,
            columns=encoder.get_feature_names_out(['trx_type']),
            index=df.index,
        )
        df = pd.concat([df, encoded_df], axis=1)

        # Rolling statistics
        df = df.sort_values(['accountid', 'trx_start_date'])
        rolling_amounts = df.groupby('accountid')['amount'].rolling(window=3, min_periods=1)
        df['rolling_sum_3m'] = rolling_amounts.sum().reset_index(0, drop=True)
        df['rolling_avg_3m'] = rolling_amounts.mean().reset_index(0, drop=True)

        return df

    def prepare_data(self, df_transactions, accounts):
        """Prepare data for training and testing.

        For every account with at least 12 months of data, the five months
        before month index 5, 6 and 7 form training samples and the five
        months before index 8..11 form test samples; the target is that
        month's mean amount.

        Returns:
            ``(X_train, y_train, X_test, y_test)`` as numpy arrays, all empty
            when no account qualifies.
        """
        df_filtered = df_transactions[df_transactions['accountid'].isin(accounts)].copy()
        print(f"Filtered data for {len(accounts)} accounts.")
        print(f"Total transactions: {len(df_filtered)}")

        # Drop unnecessary columns; tolerate ones that are already absent.
        df_filtered = df_filtered.drop(['description', 'id', 'customer_id',
                                        'trx_end_date', 'is_salary_related',
                                        'is_consistent_amount', 'is_salary_type'],
                                       axis=1, errors='ignore')

        # Add feature engineering
        df_filtered = self.add_feature_engineering(df_filtered)

        # Aggregate monthly data
        agg_funcs = {
            'amount': 'mean',
            'rolling_sum_3m': 'last',
            'rolling_avg_3m': 'last',
            'month': 'first'
        }
        encoded_cols = [col for col in df_filtered.columns if col.startswith('trx_type_')]
        agg_funcs.update({col: 'sum' for col in encoded_cols})

        monthly_data = df_filtered.groupby(['accountid', 'month_seq']).agg(agg_funcs).reset_index()

        # Filter accounts with at least 12 months
        account_month_counts = monthly_data.groupby('accountid')['month_seq'].max()
        valid_accounts = account_month_counts[account_month_counts >= 12].index
        monthly_data = monthly_data[monthly_data['accountid'].isin(valid_accounts)]

        # Create sequences
        X_train, y_train, X_test, y_test = [], [], [], []
        # NOTE(review): 'accountid' is fed to the model as a feature, which
        # presumably requires numeric ids - confirm this is intended.
        feature_cols = ['accountid', 'amount', 'rolling_sum_3m', 'rolling_avg_3m',
                        'month'] + encoded_cols

        for account in valid_accounts:
            account_data = monthly_data[monthly_data['accountid'] == account].sort_values('month_seq')

            if len(account_data) < 12:
                print(f"Skipping account {account} due to insufficient data (less than 12 months).")
                continue

            for t in range(5, 8):
                X_train.append(account_data.iloc[t-5:t][feature_cols].values.flatten())
                y_train.append(account_data['amount'].iloc[t])

            for t in range(8, 12):
                X_test.append(account_data.iloc[t-5:t][feature_cols].values.flatten())
                y_test.append(account_data['amount'].iloc[t])

        return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)

    def train_model(self, X_train, y_train, X_test, y_test):
        """Train and evaluate a Random Forest model.

        Features are standardised with a scaler fitted on the training set.
        MAE, RMSE and R-squared on the test set are printed.

        Returns:
            ``(model, scaler)``.
        """
        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Train model
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train_scaled, y_train)

        # Evaluate
        y_pred = model.predict(X_test_scaled)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R-squared: {r2:.2f}")

        return model, scaler

    def plot_predictions(self, y_test, y_pred, title, output_path):
        """Plot actual vs predicted values and save to file."""
        plt.figure(figsize=(10, 5))
        plt.scatter(y_test, y_pred, alpha=0.5)
        plt.xlabel("Actual Salary")
        plt.ylabel("Predicted Salary")
        plt.title(title)
        # Dashed diagonal marks where predicted equals actual.
        low, high = min(y_test), max(y_test)
        plt.plot([low, high], [low, high], 'r--')
        plt.savefig(output_path)
        plt.close()

    def train_and_evaluate(self, consistent_accounts, inconsistent_accounts):
        """Train and evaluate models for both consistent and inconsistent salary earners.

        For each account list with enough data, a model and scaler are
        trained, dumped to the matching ``OUTPUT_PATHS`` entries and an
        actual-vs-predicted plot is saved.
        """
        # Train model for consistent salary earners
        X_train_cons, y_train_cons, X_test_cons, y_test_cons = self.prepare_data(self.df, consistent_accounts)
        if len(X_train_cons) > 0:
            self.model_cons, self.scaler_cons = self.train_model(X_train_cons, y_train_cons, X_test_cons, y_test_cons)
            print("Model trained for consistent salary earners.")

            # Save model and scaler
            dump(self.model_cons, OUTPUT_PATHS['consistent_model'])
            dump(self.scaler_cons, OUTPUT_PATHS['consistent_scaler'])
            print("Saved consistent salary earner model and scaler.")

            # Plot predictions
            X_test_cons_scaled = self.scaler_cons.transform(X_test_cons)
            y_pred = self.model_cons.predict(X_test_cons_scaled)
            self.plot_predictions(
                y_test_cons,
                y_pred,
                "Actual vs. Predicted Salary (Consistent Earners)",
                OUTPUT_PATHS['consistent_earners_plot']
            )
        else:
            print("No accounts with sufficient data for consistent salary earners.")

        # Train model for inconsistent salary earners
        X_train_incons, y_train_incons, X_test_incons, y_test_incons = self.prepare_data(self.df, inconsistent_accounts)
        if len(X_train_incons) > 0:
            print("\nTraining model for inconsistent salary earners...")
            self.model_incons, self.scaler_incons = self.train_model(X_train_incons, y_train_incons, X_test_incons, y_test_incons)

            # Save model and scaler
            dump(self.model_incons, OUTPUT_PATHS['inconsistent_model'])
            dump(self.scaler_incons, OUTPUT_PATHS['inconsistent_scaler'])
            print("Saved inconsistent salary earner model and scaler.")

            # Plot predictions
            X_test_incons_scaled = self.scaler_incons.transform(X_test_incons)
            y_pred = self.model_incons.predict(X_test_incons_scaled)
            self.plot_predictions(
                y_test_incons,
                y_pred,
                "Actual vs. Predicted Salary (Inconsistent Earners)",
                OUTPUT_PATHS['inconsistent_earners_plot']
            )
        else:
            print("No accounts with sufficient data for inconsistent salary earners.")
@@ -0,0 +1,43 @@
"""
Transaction type analysis module.
"""
import pandas as pd
from .config import MODEL_CONFIG
class TransactionTypeAnalyzer:
    """Flag transactions whose type, subtype, initiator and amount fit the salary criteria."""

    def __init__(self, df):
        self.df = df
        # Snapshot of df taken after flagging; None until flagging has run.
        self.trx_df = None

    def flag_salary_type_transactions(self):
        """Add a boolean ``is_salary_type`` column to the DataFrame in place.

        A row qualifies when its type is 'T' or 'C', its subtype is one of
        'BI', 'I', 'BS', 'CI', it was initiated by 'C' and its amount is
        positive. A copy of the flagged frame is kept in ``self.trx_df``.

        Returns:
            The same DataFrame object that was passed to the constructor.
        """
        frame = self.df
        type_matches = frame['trx_type'].isin(['T', 'C'])
        subtype_matches = frame['trx_subtype'].isin(['BI', 'I', 'BS', 'CI'])
        customer_initiated = frame['initiated_by'] == 'C'
        positive_amount = frame['amount'] > 0

        frame['is_salary_type'] = (
            type_matches & subtype_matches & customer_initiated & positive_amount
        )
        self.trx_df = frame.copy()
        return frame

    def is_salary_earner_by_type(self, group, min_transactions=None, threshold=None):
        """Decide whether a group of transactions looks like a salary earner's.

        Args:
            group: Transactions carrying an ``is_salary_type`` column.
            min_transactions: Minimum group size; defaults to
                ``MODEL_CONFIG['min_transactions']``.
            threshold: Minimum share of flagged rows; defaults to
                ``MODEL_CONFIG['threshold']``.

        Returns:
            False for groups smaller than ``min_transactions``; otherwise
            whether the share of flagged rows reaches ``threshold``.
        """
        required_count = MODEL_CONFIG['min_transactions'] if min_transactions is None else min_transactions
        required_ratio = MODEL_CONFIG['threshold'] if threshold is None else threshold

        if len(group) >= required_count:
            return group['is_salary_type'].mean() >= required_ratio
        return False

    def get_salary_type_data(self):
        """Return the rows flagged as salary type, running the flagging first if needed."""
        if self.trx_df is None:
            self.flag_salary_type_transactions()
        snapshot = self.trx_df
        return snapshot[snapshot['is_salary_type']]