Added new salary-related terms and improved image outputs in salary.ipynb

This commit is contained in:
2025-04-28 19:44:40 +01:00
parent 8207d8f1ff
commit 591d4611b6
27 changed files with 1782 additions and 12 deletions
+6
View File
@@ -0,0 +1,6 @@
"""
Salary Analytics Package

A package for analyzing and predicting salary patterns from transaction data.
"""
# Package version string.
# NOTE(review): the FastAPI application in this package declares
# version="1.0.0"; confirm which of the two numbers is authoritative.
__version__ = "0.1.0"
Binary file not shown.
Binary file not shown.
Binary file not shown.
+212
View File
@@ -0,0 +1,212 @@
"""
FastAPI application for salary analytics.
"""
from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi.responses import FileResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Optional, Dict
import os
import socket
import logging
from .main import SalaryAnalyticsPipeline
from .config import OUTPUT_PATHS
# Configure logging
# Root logger at INFO; every record carries timestamp, logger name and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# The ASGI application object; every endpoint below registers itself on it.
app = FastAPI(
    title="Salary Analytics API",
    description="API for analyzing and predicting salary patterns from transaction data",
    version="1.0.0"
)

# Add CORS middleware
# NOTE(review): wildcard origins combined with allow_credentials=True is a
# very permissive policy -- confirm it is intended outside development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)

# Global pipeline instance
# Assigned by startup_event(); it is None until that hook has run.
pipeline = None
class AnalysisResponse(BaseModel):
    """Response model for analysis endpoints.

    Only ``message`` is required; ``data`` and ``file_path`` default to
    ``None`` and are omitted by endpoints that have nothing to report.
    """
    # Human-readable outcome of the request.
    message: str
    # Optional summary payload (the endpoints in this module put counts here).
    data: Optional[Dict] = None
    # Optional path of a generated file; no endpoint in this module sets it.
    file_path: Optional[str] = None
@app.on_event("startup")
async def startup_event():
    """Initialize the global pipeline and load its data on startup.

    Raises:
        RuntimeError: if the pipeline reports that data loading failed.
        Exception: any other error raised while building the pipeline is
            logged and re-raised so the application does not start in a
            half-initialised state.
    """
    global pipeline
    try:
        logger.info("Initializing pipeline...")
        pipeline = SalaryAnalyticsPipeline()
        if not pipeline.load_data():
            logger.error("Failed to load data during startup")
            raise RuntimeError("Failed to load data during startup")

        # Print network information. This is purely informational, so a host
        # name that cannot be resolved must not prevent the service starting.
        hostname = socket.gethostname()
        logger.info(f"Server running on hostname: {hostname}")
        try:
            ip_address = socket.gethostbyname(hostname)
        except OSError as exc:
            ip_address = None
            logger.warning(f"Could not resolve IP address for {hostname}: {exc}")
        else:
            logger.info(f"Server IP address: {ip_address}")

        # NOTE(review): port 8000 is assumed by these messages; the real port
        # is chosen by whatever command launches the server.
        logger.info("Server is accessible at:")
        logger.info("- http://localhost:8000")
        logger.info("- http://127.0.0.1:8000")
        if ip_address is not None:
            logger.info(f"- http://{ip_address}:8000")
        logger.info("Pipeline initialized successfully")
    except Exception as e:
        logger.error(f"Error during startup: {str(e)}")
        raise
@app.get("/")
async def root():
    """Return the static welcome payload for the API root."""
    logger.info("Root endpoint accessed")
    welcome = {"message": "Welcome to Salary Analytics API"}
    return welcome
@app.get("/health")
async def health_check():
    """Report liveness; always answers with a fixed "healthy" status."""
    logger.info("Health check endpoint accessed")
    status_payload = {"status": "healthy"}
    return status_payload
@app.post("/analyze/keyword", response_model=AnalysisResponse)
async def analyze_keyword():
    """Run the keyword hypothesis and report how many rows matched.

    Any exception raised by the pipeline is logged and converted into an
    HTTP 500 whose detail is the exception text.
    """
    try:
        logger.info("Starting keyword analysis...")
        matches = pipeline.run_keyword_analysis()
        match_count = len(matches)
        logger.info(f"Keyword analysis completed. Found {match_count} matches")
        summary = {"count": match_count}
        return AnalysisResponse(
            message="Keyword analysis completed successfully",
            data=summary,
        )
    except Exception as e:
        reason = str(e)
        logger.error(f"Error in keyword analysis: {reason}")
        raise HTTPException(status_code=500, detail=reason)
@app.post("/analyze/consistent-amount", response_model=AnalysisResponse)
async def analyze_consistent_amount():
    """Run the consistent-amount hypothesis and report the match count.

    Any exception raised by the pipeline is logged and converted into an
    HTTP 500 whose detail is the exception text.
    """
    try:
        logger.info("Starting consistent amount analysis...")
        matches = pipeline.run_consistent_amount_analysis()
        match_count = len(matches)
        logger.info(f"Consistent amount analysis completed. Found {match_count} matches")
        summary = {"count": match_count}
        return AnalysisResponse(
            message="Consistent amount analysis completed successfully",
            data=summary,
        )
    except Exception as e:
        reason = str(e)
        logger.error(f"Error in consistent amount analysis: {reason}")
        raise HTTPException(status_code=500, detail=reason)
@app.post("/analyze/transaction-type", response_model=AnalysisResponse)
async def analyze_transaction_type():
    """Run the transaction-type hypothesis and report the match count.

    Any exception raised by the pipeline is logged and converted into an
    HTTP 500 whose detail is the exception text.
    """
    try:
        logger.info("Starting transaction type analysis...")
        matches = pipeline.run_transaction_type_analysis()
        match_count = len(matches)
        logger.info(f"Transaction type analysis completed. Found {match_count} matches")
        summary = {"count": match_count}
        return AnalysisResponse(
            message="Transaction type analysis completed successfully",
            data=summary,
        )
    except Exception as e:
        reason = str(e)
        logger.error(f"Error in transaction type analysis: {reason}")
        raise HTTPException(status_code=500, detail=reason)
@app.post("/generate/reports", response_model=AnalysisResponse)
async def generate_reports(background_tasks: BackgroundTasks):
    """Build the salary-earner reports and return their headline counts.

    ``background_tasks`` is accepted but not used by this handler. Any
    exception is logged and converted into an HTTP 500.
    """
    try:
        logger.info("Starting report generation...")
        reports = pipeline.generate_salary_earner_reports()
        logger.info("Reports generated successfully")
        summary = {
            "verified_salary_earners": len(reports['final_table']),
            "likely_salary_earners": len(reports['likely_salary_earner']),
            "high_earners": reports['total_high_earners'],
        }
        return AnalysisResponse(
            message="Reports generated successfully",
            data=summary,
        )
    except Exception as e:
        reason = str(e)
        logger.error(f"Error in report generation: {reason}")
        raise HTTPException(status_code=500, detail=reason)
@app.post("/train/models", response_model=AnalysisResponse)
async def train_models():
    """Train the salary prediction models via the global pipeline.

    Returns a message-only response on success; any exception is logged and
    converted into an HTTP 500.
    """
    try:
        logger.info("Starting model training...")
        pipeline.train_salary_prediction_models()
        logger.info("Models trained successfully")
        outcome = AnalysisResponse(message="Models trained successfully")
        return outcome
    except Exception as e:
        reason = str(e)
        logger.error(f"Error in model training: {reason}")
        raise HTTPException(status_code=500, detail=reason)
@app.get("/download/{report_type}")
async def download_report(report_type: str):
    """Download a generated report or plot.

    Args:
        report_type: one of ``high_earners``, ``likely_earners``,
            ``final_table``, ``consistent_plot``, ``inconsistent_plot`` or
            ``hypothesis_plot``.

    Returns:
        A ``FileResponse`` streaming the file as ``application/octet-stream``.

    Raises:
        HTTPException: 404 when the report type is unknown or its file has
            not been generated yet; 500 for any unexpected error.
    """
    try:
        logger.info(f"Attempting to download report: {report_type}")
        file_paths = {
            "high_earners": OUTPUT_PATHS["high_earner_details"],
            "likely_earners": OUTPUT_PATHS["likely_salary_earner"],
            "final_table": OUTPUT_PATHS["final_table"],
            "consistent_plot": OUTPUT_PATHS["consistent_earners_plot"],
            "inconsistent_plot": OUTPUT_PATHS["inconsistent_earners_plot"],
            "hypothesis_plot": OUTPUT_PATHS["hypothesis_overlap_plot"]
        }
        if report_type not in file_paths:
            logger.error(f"Report type not found: {report_type}")
            raise HTTPException(status_code=404, detail="Report type not found")

        file_path = file_paths[report_type]
        if not os.path.exists(file_path):
            logger.error(f"Report file not found: {file_path}")
            raise HTTPException(status_code=404, detail="Report file not found")

        logger.info(f"Successfully found report file: {file_path}")
        return FileResponse(
            path=file_path,
            filename=os.path.basename(file_path),
            media_type="application/octet-stream"
        )
    except HTTPException:
        # HTTPException is itself an Exception: without this clause the
        # deliberate 404s above would be caught below and re-sent as 500s.
        raise
    except Exception as e:
        logger.error(f"Error downloading report: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
@app.post("/run/pipeline", response_model=AnalysisResponse)
async def run_full_pipeline():
    """Run the complete salary analytics pipeline.

    Returns:
        A message-only ``AnalysisResponse`` when the pipeline succeeds.

    Raises:
        HTTPException: 500 with detail ``"Pipeline failed"`` when the pipeline
            reports failure, or 500 with the exception text when it raises.
    """
    # Only the pipeline call is guarded: keeping the "Pipeline failed"
    # HTTPException outside the try stops the catch-all from re-wrapping it
    # and mangling its detail text.
    try:
        logger.info("Starting full pipeline...")
        success = pipeline.run_full_pipeline()
    except Exception as e:
        logger.error(f"Error in pipeline: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))

    if not success:
        logger.error("Pipeline failed")
        raise HTTPException(status_code=500, detail="Pipeline failed")

    logger.info("Pipeline completed successfully")
    return AnalysisResponse(
        message="Pipeline completed successfully"
    )
+61
View File
@@ -0,0 +1,61 @@
"""
Configuration settings for the salary analytics package.
"""
import os
# Base directories
# BASE_DIR is the directory that contains this module's own directory.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
OUTPUT_DIR = os.path.join(BASE_DIR, "output")
PLOTS_DIR, CSV_DIR = (os.path.join(OUTPUT_DIR, leaf) for leaf in ("plots", "csv"))

# Create directories if they don't exist
for _directory in (OUTPUT_DIR, PLOTS_DIR, CSV_DIR):
    os.makedirs(_directory, exist_ok=True)
del _directory  # keep the loop variable out of the module namespace
# Database Configuration
# Every value can be overridden with an environment variable so credentials
# do not have to live in source control; the literals are the fallbacks used
# when the variable is unset. All values, including the port, are strings.
# NOTE(review): the fallback password is still committed here -- consider
# dropping it once every deployment sets SALARY_DB_PASSWORD.
DB_CONFIG = {
    "user": os.environ.get("SALARY_DB_USER", "salaryloan"),
    "password": os.environ.get("SALARY_DB_PASSWORD", "salaryloan"),
    "name": os.environ.get("SALARY_DB_NAME", "salaryloan"),
    "port": os.environ.get("SALARY_DB_PORT", "10532"),
    "host": os.environ.get("SALARY_DB_HOST", "dev-data.simbrellang.net"),
}
# Table Configuration
# Name of the source table; the data loader interpolates it into its SQL.
TABLE_NAME = "customer_account_transaction_hx"

# Salary Keywords
# The keyword analyzer lower-cases and regex-escapes each entry and matches
# it between word boundaries against the transaction description.
# NOTE(review): very short entries such as "sal" match any standalone token
# with that spelling -- confirm the false-positive rate is acceptable.
SALARY_KEYWORDS = [
    "salary", "payroll", "income", "wage", "wages",
    "earnings", "earning", "monthly pay", "net pay", "gross pay", "compensation",
    "monthlypay", "netpay", "grosspay",
    "remuneration", "stipend", "allowance", "bonus", "commission",
    "pension", "retirement", "dividend", "benefits", "reimbursement",
    "overtime", "incentive", "paycheck", "paycheque", "salary advance",
    "monthly income", "income tax refund", "employer deposit",
    "payroll deposit", "salary credit", "income credit", "salary transfer",
    "income transfer", "salary received", "income received", "hr deposit",
    "company deposit", "employer payment", "employee payment",
    "sal",
]
# Model Configuration
MODEL_CONFIG = {
    # Largest coefficient of variation (std / mean of amounts) at which the
    # consistent-amount analyzer still flags an account as consistent.
    "cv_threshold": 0.10,
    # Default minimum group size used by the transaction-type analyzer's
    # is_salary_earner_by_type().
    "min_transactions": 3,
    # Default minimum share of salary-type rows for that same check.
    "threshold": 0.7,
    # Accounts whose estimated next amount is at or above this value are
    # reported as high earners.
    "high_earner_threshold": 10000
}

# File Paths
# Destinations of the generated CSV reports and PNG plots; the API download
# endpoint serves files from these same paths.
OUTPUT_PATHS = {
    "high_earner_details": os.path.join(CSV_DIR, "high_earner_details.csv"),
    "likely_salary_earner": os.path.join(CSV_DIR, "likely_salary_earner.csv"),
    "final_table": os.path.join(CSV_DIR, "final_table.csv"),
    "consistent_earners_plot": os.path.join(PLOTS_DIR, "consistent_earners_predictions.png"),
    "inconsistent_earners_plot": os.path.join(PLOTS_DIR, "inconsistent_earners_predictions.png"),
    "hypothesis_overlap_plot": os.path.join(PLOTS_DIR, "hypothesis_overlap.png")
}
@@ -0,0 +1,58 @@
"""
Consistent amount transaction analysis module.
"""
import pandas as pd
from .config import MODEL_CONFIG
class ConsistentAmountAnalyzer:
    """Flag accounts whose ``initiated_by == 'C'`` amounts vary little.

    The analyzer adds a boolean ``is_consistent_amount`` column to the frame
    it was given (in place, so other holders of the same frame see it) and
    keeps a snapshot of the flagged frame in ``const_df``.
    """

    def __init__(self, df):
        # Transactions; must provide 'accountid', 'initiated_by' and 'amount'.
        self.df = df
        # Snapshot taken by identify_consistent_amount_accounts().
        self.const_df = None

    def calculate_coefficient_of_variation(self, group):
        """Calculate coefficient of variation for a group of transactions.

        Only rows with ``initiated_by == 'C'`` are considered. Returns
        population std divided by mean, or NaN when the mean is zero (or when
        there are no such rows, because mean and std are then NaN).
        """
        amounts = group[group['initiated_by'] == 'C']['amount']
        mean = amounts.mean()
        std = amounts.std(ddof=0)
        if mean == 0:
            return float('nan')
        return std / mean

    def flag_consistent_amounts(self, group, cv_threshold=None):
        """Flag accounts with low variance in transaction amounts.

        Returns a boolean Series aligned with ``group.index`` in which every
        row carries the same verdict: True when the group's coefficient of
        variation is defined and ``<= cv_threshold``.
        """
        if cv_threshold is None:
            cv_threshold = MODEL_CONFIG['cv_threshold']
        cv = self.calculate_coefficient_of_variation(group)
        is_consistent = bool(cv <= cv_threshold) if not pd.isna(cv) else False
        return pd.Series(
            [is_consistent] * len(group),
            index=group.index,
            name='is_consistent_amount',
            dtype=bool,
        )

    def identify_consistent_amount_accounts(self, cv_threshold=None):
        """Identify accounts with consistent transaction amounts.

        Adds ``is_consistent_amount`` to ``self.df`` and returns the frame.
        Rows whose ``accountid`` is missing are flagged False.
        """
        if cv_threshold is None:
            cv_threshold = MODEL_CONFIG['cv_threshold']

        # Decide once per account, then broadcast with isin(). The earlier
        # code replaced self.df with the Series produced by groupby.apply,
        # which lost every other column and made the later
        # ['is_consistent_amount'] lookup fail.
        consistent_accounts = [
            accountid
            for accountid, group in self.df.groupby('accountid')
            if self.flag_consistent_amounts(group, cv_threshold).any()
        ]
        self.df['is_consistent_amount'] = self.df['accountid'].isin(consistent_accounts)
        self.const_df = self.df.copy()
        return self.df

    def get_consistent_amount_data(self):
        """Get transactions identified as having consistent amounts.

        Runs the identification first when it has not happened yet, then
        returns the ``initiated_by == 'C'`` rows of flagged accounts.
        """
        if self.const_df is None:
            self.identify_consistent_amount_accounts()
        flagged = self.const_df['is_consistent_amount']
        is_c_initiated = self.const_df['initiated_by'] == 'C'
        return self.const_df[flagged & is_c_initiated]
+113
View File
@@ -0,0 +1,113 @@
"""
Data loading and preprocessing module.
"""
from sqlalchemy import create_engine, text
import pandas as pd
from datetime import datetime
import logging
from .config import DB_CONFIG, TABLE_NAME
logger = logging.getLogger(__name__)
class DataLoader:
    """Load the configured transaction table into a pandas DataFrame."""

    def __init__(self):
        self.engine = None       # SQLAlchemy engine, set by connect()
        self.df = None           # loaded data, set by load_data()
        self.chunk_size = 10000  # Load 10,000 rows at a time

    def connect(self):
        """Establish database connection.

        Returns True when the server is reachable and the configured table
        exists; otherwise logs the reason and returns False. ``self.engine``
        is only assigned on success, so a failed attempt is retried by the
        next load_data() call instead of reusing a dead engine.
        """
        try:
            logger.info("Attempting to connect to database...")
            DATABASE_URL = f"postgresql://{DB_CONFIG['user']}:{DB_CONFIG['password']}@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['name']}"
            engine = create_engine(DATABASE_URL)
            with engine.connect() as conn:
                # First check if table exists (bound parameter, not string-built)
                check_table = text(
                    "SELECT EXISTS (SELECT FROM information_schema.tables "
                    "WHERE table_name = :table_name)"
                )
                table_exists = conn.execute(check_table, {"table_name": TABLE_NAME}).scalar()
                if not table_exists:
                    logger.error(f"Table {TABLE_NAME} does not exist in the database")
                    return False

                # Get row count
                count_query = text(f"SELECT COUNT(*) FROM {TABLE_NAME}")
                row_count = conn.execute(count_query).scalar()
                logger.info(f"Table {TABLE_NAME} exists with {row_count} rows")

                # Get version
                version = conn.execute(text("SELECT version();")).scalar()
                logger.info(f"Database version: {version}")

            self.engine = engine
            logger.info("Connected successfully to database!")
            return True
        except Exception as e:
            logger.error(f"Error connecting to database: {str(e)}")
            return False

    @staticmethod
    def _preprocess_chunk(chunk):
        """Parse the two date columns and give the d1..d4 columns real names."""
        chunk['trx_start_date'] = pd.to_datetime(chunk['trx_start_date'])
        chunk['trx_end_date'] = pd.to_datetime(chunk['trx_end_date'])
        return chunk.rename(columns={
            'd1': 'trx_type',
            'd2': 'trx_subtype',
            'd3': 'initiated_by',
            'd4': 'customer_id'
        })

    def load_data(self):
        """Load and preprocess transaction data in chunks.

        Returns the combined DataFrame, or None when the connection cannot be
        established, the table yields no rows, or any error occurs (the error
        is logged).
        """
        if not self.engine:
            logger.info("No database connection. Attempting to connect...")
            if not self.connect():
                logger.error("Failed to establish database connection")
                return None
        try:
            logger.info(f"Loading data from table: {TABLE_NAME}")
            # First get total count
            with self.engine.connect() as conn:
                count_query = text(f"SELECT COUNT(*) FROM {TABLE_NAME}")
                total_rows = conn.execute(count_query).scalar()
            logger.info(f"Total rows to process: {total_rows}")

            # Load data in chunks from ONE streamed query. Paging with
            # LIMIT/OFFSET but no ORDER BY lets the server return rows in a
            # different order per page, which can duplicate or skip rows.
            chunks = []
            rows_loaded = 0
            select_all = text(f"SELECT * FROM {TABLE_NAME}")
            with self.engine.connect().execution_options(stream_results=True) as conn:
                for chunk in pd.read_sql(select_all, conn, chunksize=self.chunk_size):
                    if chunk.empty:
                        continue
                    logger.info(f"Loading chunk starting at offset {rows_loaded}")
                    chunks.append(self._preprocess_chunk(chunk))
                    rows_loaded += len(chunk)

            if not chunks:
                logger.error(f"Error loading data: table {TABLE_NAME} returned no rows")
                return None

            # Combine all chunks
            self.df = pd.concat(chunks, ignore_index=True)
            logger.info(f"Successfully loaded {len(self.df)} rows of data")

            # Basic data validation
            logger.info("Performing data validation...")
            logger.info(f"Columns in dataset: {self.df.columns.tolist()}")
            logger.info(f"Data types:\n{self.df.dtypes}")
            logger.info(f"Missing values:\n{self.df.isnull().sum()}")
            return self.df
        except Exception as e:
            logger.error(f"Error loading data: {str(e)}")
            return None

    def get_data(self):
        """Get the loaded DataFrame (None, with a warning, if nothing is loaded)."""
        if self.df is None:
            logger.warning("No data loaded. Call load_data() first.")
        return self.df
+47
View File
@@ -0,0 +1,47 @@
"""
Keyword-based salary transaction analysis module.
"""
import re
import pandas as pd
from .config import SALARY_KEYWORDS
class KeywordAnalyzer:
    """Flag transactions whose description looks salary-related."""

    def __init__(self, df):
        # Transactions; must provide 'description' and 'initiated_by'.
        self.df = df
        # Snapshot taken by identify_salary_transactions().
        self.desc_df = None

    def identify_salary_transactions(self, keywords=None):
        """
        Identifies potential salary-related transactions based on keywords
        and month-year patterns in the 'description' column.

        Args:
            keywords: optional iterable of keywords/phrases to look for;
                defaults to ``SALARY_KEYWORDS`` from the package config.

        Returns:
            ``self.df`` with a boolean ``is_salary_related`` column added in
            place. Missing descriptions are flagged False.
        """
        if keywords is None:
            keywords = SALARY_KEYWORDS

        month_year_patterns = [
            r"\b(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)\s?\d{2,4}\b",
            r"\b(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)\s?\d{2,4}\b"
        ]
        alternatives = list(month_year_patterns)

        escaped_keywords = [re.escape(keyword.lower()) for keyword in keywords]
        if escaped_keywords:
            # An empty alternation would match every description, so the
            # keyword branch is only added when there is something to match.
            alternatives.insert(0, r'\b(?:' + '|'.join(escaped_keywords) + r')\b')
        combined_pattern = '|'.join(alternatives)

        # Match case-insensitively. The month patterns are written in upper
        # case, so testing them against lower-cased text (as was done before)
        # meant they could never match.
        self.df['is_salary_related'] = self.df['description'].str.contains(
            combined_pattern,
            flags=re.IGNORECASE,
            na=False,
            regex=True
        )
        self.desc_df = self.df.copy()
        return self.df

    def get_salary_related_data(self):
        """Get transactions identified as salary-related.

        Runs the identification first when it has not happened yet, then
        returns the flagged rows whose ``initiated_by`` is ``'C'``.
        """
        if self.desc_df is None:
            self.identify_salary_transactions()
        flagged = self.desc_df['is_salary_related']
        is_c_initiated = self.desc_df['initiated_by'] == 'C'
        return self.desc_df[flagged & is_c_initiated]
+134
View File
@@ -0,0 +1,134 @@
"""
Main module for running the salary analytics pipeline.
"""
import logging
from .data_loader import DataLoader
from .keyword_analyzer import KeywordAnalyzer
from .consistent_amount_analyzer import ConsistentAmountAnalyzer
from .transaction_type_analyzer import TransactionTypeAnalyzer
from .salary_earner_analyzer import SalaryEarnerAnalyzer
from .salary_predictor import SalaryPredictor
logger = logging.getLogger(__name__)
class SalaryAnalyticsPipeline:
    """Orchestrate loading, the three hypothesis analyses, reports and training.

    All analyzers receive the same DataFrame object (``self.df``); the
    keyword and transaction-type analyzers add their flag columns to it in
    place, and the report and training stages read those columns.
    """

    def __init__(self):
        logger.info("Initializing SalaryAnalyticsPipeline")
        self.data_loader = None
        self.df = None
        self.keyword_analyzer = None
        self.consistent_amount_analyzer = None
        self.transaction_type_analyzer = None
        self.salary_earner_analyzer = None
        self.salary_predictor = None

    def _require_data(self):
        """Raise ValueError (after logging) when no data has been loaded."""
        if self.df is None:
            logger.error("Data not loaded. Call load_data() first.")
            raise ValueError("Data not loaded. Call load_data() first.")

    def _ensure_hypothesis_flags(self):
        """Run any hypothesis analysis whose flag column is still missing."""
        steps = (
            ('is_salary_related', self.run_keyword_analysis),
            ('is_consistent_amount', self.run_consistent_amount_analysis),
            ('is_salary_type', self.run_transaction_type_analysis),
        )
        for column, run_step in steps:
            if column not in self.df.columns:
                logger.info(f"Column {column} missing. Running its analysis first.")
                run_step()

    def load_data(self):
        """Load and preprocess the transaction data.

        Returns True when a DataFrame was loaded, False otherwise.
        """
        logger.info("Starting data loading process")
        self.data_loader = DataLoader()
        self.df = self.data_loader.load_data()
        if self.df is not None:
            logger.info(f"Successfully loaded data with {len(self.df)} rows")
        else:
            logger.error("Failed to load data")
        return self.df is not None

    def run_keyword_analysis(self):
        """Run keyword-based salary transaction analysis.

        Raises ValueError when data has not been loaded.
        """
        self._require_data()
        logger.info("Starting keyword analysis")
        self.keyword_analyzer = KeywordAnalyzer(self.df)
        self.keyword_analyzer.identify_salary_transactions()
        return self.keyword_analyzer.get_salary_related_data()

    def run_consistent_amount_analysis(self):
        """Run consistent amount transaction analysis.

        Raises ValueError when data has not been loaded.
        """
        self._require_data()
        logger.info("Starting consistent amount analysis")
        self.consistent_amount_analyzer = ConsistentAmountAnalyzer(self.df)
        self.consistent_amount_analyzer.identify_consistent_amount_accounts()
        return self.consistent_amount_analyzer.get_consistent_amount_data()

    def run_transaction_type_analysis(self):
        """Run transaction type analysis.

        Raises ValueError when data has not been loaded.
        """
        self._require_data()
        logger.info("Starting transaction type analysis")
        self.transaction_type_analyzer = TransactionTypeAnalyzer(self.df)
        self.transaction_type_analyzer.flag_salary_type_transactions()
        return self.transaction_type_analyzer.get_salary_type_data()

    def generate_salary_earner_reports(self):
        """Generate salary earner reports.

        The report stage filters on all three hypothesis flags, so any
        analysis that has not run yet is executed first instead of letting
        the report fail on a missing column.

        Raises ValueError when data has not been loaded.
        """
        self._require_data()
        self._ensure_hypothesis_flags()
        logger.info("Starting salary earner report generation")
        self.salary_earner_analyzer = SalaryEarnerAnalyzer(self.df)
        return self.salary_earner_analyzer.generate_reports()

    def train_salary_prediction_models(self):
        """Train salary prediction models.

        Uses the accounts of the report stage's ``final_table`` as the
        consistent group and those of ``likely_salary_earner`` as the
        inconsistent group; the reports are generated first when needed.

        Raises ValueError when data has not been loaded.
        """
        self._require_data()
        logger.info("Starting model training")
        self.salary_predictor = SalaryPredictor(self.df)

        # Get accounts from the salary earner analyzer
        if self.salary_earner_analyzer is None:
            logger.info("Salary earner analyzer not initialized. Generating reports first.")
            self.generate_salary_earner_reports()

        consistent_accounts = self.salary_earner_analyzer.final_table['accountid'].unique()
        inconsistent_accounts = self.salary_earner_analyzer.likely_salary_earner['accountid'].unique()
        self.salary_predictor.train_and_evaluate(consistent_accounts, inconsistent_accounts)

    def run_full_pipeline(self):
        """Run the complete salary analytics pipeline.

        Returns True on success and False when loading or any stage fails;
        stage failures are logged with their traceback rather than raised.
        """
        logger.info("Starting full pipeline execution")
        if not self.load_data():
            logger.error("Failed to load data. Exiting pipeline.")
            return False
        try:
            logger.info("Running keyword analysis...")
            self.run_keyword_analysis()
            logger.info("Running consistent amount analysis...")
            self.run_consistent_amount_analysis()
            logger.info("Running transaction type analysis...")
            self.run_transaction_type_analysis()
            logger.info("Generating salary earner reports...")
            self.generate_salary_earner_reports()
            logger.info("Training salary prediction models...")
            self.train_salary_prediction_models()
            logger.info("Pipeline completed successfully!")
            return True
        except Exception as e:
            # logger.exception keeps the traceback, which the boolean return
            # value would otherwise discard.
            logger.exception(f"Pipeline failed: {str(e)}")
            return False
def main():
    """Build a fresh pipeline and execute every stage once."""
    SalaryAnalyticsPipeline().run_full_pipeline()
if __name__ == "__main__":
main()
+145
View File
@@ -0,0 +1,145 @@
"""
Salary earner analysis and report generation module.
"""
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn3
from datetime import datetime, timedelta
from .config import MODEL_CONFIG, OUTPUT_PATHS
class SalaryEarnerAnalyzer:
    """Combine the three hypothesis flags into salary-earner reports.

    Expects ``df`` to already carry the boolean columns
    ``is_salary_related``, ``is_consistent_amount`` and ``is_salary_type``.
    """

    # Column order of the tables built by generate_salary_earners_table();
    # also guarantees an empty result still has these columns.
    _TABLE_COLUMNS = [
        'accountid',
        'num_months',
        'least_inflow_6m',
        'avg_monthly_salary',
        'estimated_next_amount',
        'estimated_next_date',
        '45daysalary',
        '2monthssalary',
    ]

    def __init__(self, df):
        self.df = df
        self.final_table = None
        self.likely_salary_earner = None
        self.high_earner_details = None

    def filter_venn_section(self, **kwargs):
        """Filter accounts based on specified combinations of hypothesis flags.

        Each keyword names one flag column and the value it must equal; only
        rows with ``initiated_by == 'C'`` are considered.

        Raises:
            ValueError: if a keyword is not one of the three flag columns.
        """
        valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}
        invalid_keys = set(kwargs.keys()) - valid_columns
        if invalid_keys:
            raise ValueError(f"Invalid keys: {invalid_keys}. Valid keys are {valid_columns}.")

        df1 = self.df[self.df['initiated_by'] == 'C']
        condition = pd.Series(True, index=df1.index, dtype=bool)
        for key, value in kwargs.items():
            condition &= (df1[key] == value)
        return df1[condition]

    def plot_hypothesis_overlap(self, hypothesis1_df, hypothesis3_df, hypothesis4_df, account_col='accountid'):
        """Plot and save Venn diagram showing overlap between hypotheses.

        ``hypothesis1_df`` supplies the salary-description accounts,
        ``hypothesis3_df`` the consistent-amount accounts and
        ``hypothesis4_df`` the transaction-type accounts. The figure is
        written to ``OUTPUT_PATHS['hypothesis_overlap_plot']``.
        """
        set2 = set(hypothesis3_df[account_col][hypothesis3_df['is_consistent_amount']])
        set3 = set(hypothesis1_df[account_col][hypothesis1_df['is_salary_related']])
        set4 = set(hypothesis4_df[account_col][hypothesis4_df['is_salary_type']])

        plt.figure(figsize=(10, 10))
        venn3([set2, set3, set4], set_labels=('Consistent Amount',
                                              'Salary Description', 'Transaction Type'))
        plt.title('Overlap Between Hypotheses')
        plt.savefig(OUTPUT_PATHS['hypothesis_overlap_plot'])
        plt.close()

    def generate_salary_earners_table(self, all_three_hypotheses):
        """Generate a table of salary earners with their metrics.

        One row per account. Rows with any missing metric are dropped, which
        removes accounts with no transaction in the last 180 days and
        accounts with a single transaction (no interval to project from).
        An empty input yields an empty table that still has every column.
        """
        now = datetime.now()
        six_months_ago = now - timedelta(days=180)
        two_months_ago = now - timedelta(days=60)

        results = []
        for accountid, group in all_three_hypotheses.groupby('accountid'):
            # Work on a date-ordered copy: diff() is only a real "days since
            # previous transaction" on sorted data, and a copy avoids writing
            # into the groupby slice.
            group = group.sort_values('trx_start_date')
            dates = group['trx_start_date']
            amounts = group['amount']

            # Calculate required metrics
            # NOTE(review): num_months is the number of transactions in the
            # group; it equals months only if there is one row per month.
            num_months = len(group)
            least_inflow = amounts[dates >= six_months_ago].min()
            avg_salary = amounts.mean()

            # Project the next date from the median gap between transactions.
            median_interval = dates.diff().dt.days.median()
            last_date = dates.max()
            if pd.isna(median_interval):
                # timedelta(days=NaN) raises; NaT lets dropna() discard the row.
                next_date = pd.NaT
            else:
                next_date = last_date + timedelta(days=float(median_interval))
            next_amount = avg_salary

            # Boolean flags
            days_since_last = (now - last_date).days
            has_45d = days_since_last <= 45
            has_2m = int((dates >= two_months_ago).sum()) >= 2

            results.append({
                'accountid': accountid,
                'num_months': num_months,
                'least_inflow_6m': least_inflow,
                'avg_monthly_salary': avg_salary,
                'estimated_next_amount': next_amount,
                'estimated_next_date': next_date,
                '45daysalary': has_45d,
                '2monthssalary': has_2m
            })

        final_df = pd.DataFrame(results, columns=self._TABLE_COLUMNS)
        return final_df.dropna()

    def analyze_salary_earners(self, final_df):
        """Analyze salary earners and identify high earners.

        Returns ``(details, count)``: the ``accountid`` / ``least_inflow_6m``
        columns of accounts whose estimated next amount is at or above
        ``MODEL_CONFIG['high_earner_threshold']``, and how many there are.
        """
        is_high = final_df['estimated_next_amount'] >= MODEL_CONFIG['high_earner_threshold']
        high_earners = final_df[is_high]
        count_high = len(high_earners)
        high_earner_details = high_earners[['accountid', 'least_inflow_6m']].reset_index(drop=True)
        return high_earner_details, count_high

    def generate_reports(self):
        """Generate all salary earner reports.

        Builds the verified table (all three flags), the likely table (salary
        type plus exactly one of the other two flags), the high-earner
        details and the overlap plot, writes the three CSV files, and returns
        the tables plus the high-earner count in a dict.
        """
        # Get accounts flagged by all three hypotheses
        all_three_hypotheses = self.filter_venn_section(
            is_salary_related=True,
            is_consistent_amount=True,
            is_salary_type=True
        )

        # Generate final table
        self.final_table = self.generate_salary_earners_table(all_three_hypotheses)
        print(f"Found {self.final_table['accountid'].nunique()} verified salary earners")

        # Generate likely salary earner table
        green_section = self.filter_venn_section(
            is_salary_related=True,
            is_consistent_amount=False,
            is_salary_type=True
        )
        yellow_section = self.filter_venn_section(
            is_salary_related=False,
            is_consistent_amount=True,
            is_salary_type=True
        )
        likely_rows = pd.concat([yellow_section, green_section])
        likely_rows = likely_rows.drop_duplicates(subset=['id'])
        self.likely_salary_earner = self.generate_salary_earners_table(likely_rows)
        print(f"Found {self.likely_salary_earner['accountid'].nunique()} likely salary earners")

        # Analyze high earners
        self.high_earner_details, total_high_earners = self.analyze_salary_earners(self.final_table)
        print(f"\nTotal High Earners: {total_high_earners}")

        # Plot hypothesis overlap
        self.plot_hypothesis_overlap(
            self.df[self.df['is_salary_related']],
            self.df[self.df['is_consistent_amount']],
            self.df[self.df['is_salary_type']]
        )

        # Save reports
        self.high_earner_details.to_csv(OUTPUT_PATHS['high_earner_details'], index=False)
        self.likely_salary_earner.to_csv(OUTPUT_PATHS['likely_salary_earner'], index=False)
        self.final_table.to_csv(OUTPUT_PATHS['final_table'], index=False)

        return {
            'final_table': self.final_table,
            'likely_salary_earner': self.likely_salary_earner,
            'high_earner_details': self.high_earner_details,
            'total_high_earners': total_high_earners
        }
+160
View File
@@ -0,0 +1,160 @@
"""
Salary prediction module using machine learning.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from .config import OUTPUT_PATHS
class SalaryPredictor:
    """Train Random Forest models that predict an account's next amount.

    Two independent models are kept: one for the "consistent" account group
    and one for the "inconsistent" group passed to train_and_evaluate().
    """

    def __init__(self, df):
        self.df = df
        self.model_cons = None
        self.model_incons = None
        self.scaler_cons = None
        self.scaler_incons = None

    def add_feature_engineering(self, df):
        """Engineer features for salary prediction.

        Adds ``month``, ``month_seq``, one-hot ``trx_type_*`` columns and the
        3-row rolling sum/mean of ``amount`` per account. ``month`` and
        ``month_seq`` are written onto the passed frame; the returned frame
        is a new object sorted by account and date.
        """
        df['month'] = df['trx_start_date'].dt.month
        # NOTE(review): ngroup() numbers (accountid, calendar month) groups
        # across the whole frame, and the same month of different years
        # shares a group -- confirm this is the intended sequence index.
        df['month_seq'] = df.groupby(['accountid', 'month']).ngroup() + 1

        # Categorical encoding
        encoder = OneHotEncoder(sparse_output=False)
        encoded_trx_type = encoder.fit_transform(df[['trx_type']])
        # Reuse df's index: the encoder returns a bare array, and a default
        # RangeIndex would misalign the columns with the filtered frame
        # during concat (wrong rows get the encodings, NaN rows appear).
        encoded_df = pd.DataFrame(
            encoded_trx_type,
            columns=encoder.get_feature_names_out(['trx_type']),
            index=df.index,
        )
        df = pd.concat([df, encoded_df], axis=1)

        # Rolling statistics
        df = df.sort_values(['accountid', 'trx_start_date'])
        rolling_amount = df.groupby('accountid')['amount'].rolling(window=3, min_periods=1)
        df['rolling_sum_3m'] = rolling_amount.sum().reset_index(0, drop=True)
        df['rolling_avg_3m'] = rolling_amount.mean().reset_index(0, drop=True)
        return df

    def prepare_data(self, df_transactions, accounts):
        """Prepare data for training and testing.

        Keeps the transactions of ``accounts``, aggregates them per
        ``(accountid, month_seq)`` and, for accounts with at least 12
        aggregated rows, builds flattened 5-row windows: targets at positions
        5-7 go to the training set and positions 8-11 to the test set.

        Returns:
            ``(X_train, y_train, X_test, y_test)`` as NumPy arrays (empty
            when no account qualifies).
        """
        df_filtered = df_transactions[df_transactions['accountid'].isin(accounts)].copy()
        print(f"Filtered data for {len(accounts)} accounts.")
        print(f"Total transactions: {len(df_filtered)}")

        # Drop unnecessary columns
        df_filtered = df_filtered.drop(['description', 'id', 'customer_id',
                                        'trx_end_date', 'is_salary_related',
                                        'is_consistent_amount', 'is_salary_type'], axis=1)

        # Add feature engineering
        df_filtered = self.add_feature_engineering(df_filtered)

        # Aggregate monthly data
        agg_funcs = {
            'amount': 'mean',
            'rolling_sum_3m': 'last',
            'rolling_avg_3m': 'last',
            'month': 'first'
        }
        encoded_cols = [col for col in df_filtered.columns if col.startswith('trx_type_')]
        for col in encoded_cols:
            agg_funcs[col] = 'sum'
        monthly_data = df_filtered.groupby(['accountid', 'month_seq']).agg(agg_funcs).reset_index()

        # Filter accounts with at least 12 months
        account_month_counts = monthly_data.groupby('accountid')['month_seq'].max()
        valid_accounts = account_month_counts[account_month_counts >= 12].index
        monthly_data = monthly_data[monthly_data['accountid'].isin(valid_accounts)]

        # Create sequences
        X_train, y_train, X_test, y_test = [], [], [], []
        # NOTE(review): 'accountid' is part of the feature vector; the scaler
        # will fail on non-numeric ids -- confirm ids are numeric and that
        # using them as a feature is intended.
        feature_cols = ['accountid', 'amount', 'rolling_sum_3m', 'rolling_avg_3m',
                        'month'] + encoded_cols
        for account in valid_accounts:
            account_data = monthly_data[monthly_data['accountid'] == account].sort_values('month_seq')
            if len(account_data) >= 12:
                for t in range(5, 8):
                    X_train.append(account_data.iloc[t-5:t][feature_cols].values.flatten())
                    y_train.append(account_data['amount'].iloc[t])
                for t in range(8, 12):
                    X_test.append(account_data.iloc[t-5:t][feature_cols].values.flatten())
                    y_test.append(account_data['amount'].iloc[t])
            else:
                print(f"Skipping account {account} due to insufficient data (less than 12 months).")

        return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)

    def train_model(self, X_train, y_train, X_test, y_test):
        """Train and evaluate a Random Forest model.

        Features are standardised with a scaler fitted on the training set.
        Prints MAE, RMSE and R-squared on the test set and returns
        ``(model, scaler)``.
        """
        # Scale features
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Train model
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train_scaled, y_train)

        # Evaluate
        y_pred = model.predict(X_test_scaled)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R-squared: {r2:.2f}")
        return model, scaler

    def plot_predictions(self, y_test, y_pred, title, output_path):
        """Plot actual vs predicted values and save to file.

        The dashed red diagonal marks where predictions equal actual values.
        """
        plt.figure(figsize=(10, 5))
        plt.scatter(y_test, y_pred, alpha=0.5)
        plt.xlabel("Actual Salary")
        plt.ylabel("Predicted Salary")
        plt.title(title)
        plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--')
        plt.savefig(output_path)
        plt.close()

    def train_and_evaluate(self, consistent_accounts, inconsistent_accounts):
        """Train and evaluate models for both consistent and inconsistent salary earners.

        For each group with usable training data, the fitted model and scaler
        are stored on the instance and an actual-vs-predicted plot is saved
        to the matching ``OUTPUT_PATHS`` entry; otherwise a notice is printed.
        """
        # Train model for consistent salary earners
        X_train_cons, y_train_cons, X_test_cons, y_test_cons = self.prepare_data(self.df, consistent_accounts)
        if len(X_train_cons) > 0:
            self.model_cons, self.scaler_cons = self.train_model(X_train_cons, y_train_cons, X_test_cons, y_test_cons)
            print("Model trained for consistent salary earners.")

            # Plot predictions
            X_test_cons_scaled = self.scaler_cons.transform(X_test_cons)
            y_pred = self.model_cons.predict(X_test_cons_scaled)
            self.plot_predictions(
                y_test_cons,
                y_pred,
                "Actual vs. Predicted Salary (Consistent Earners)",
                OUTPUT_PATHS['consistent_earners_plot']
            )
        else:
            print("No accounts with sufficient data for consistent salary earners.")

        # Train model for inconsistent salary earners
        X_train_incons, y_train_incons, X_test_incons, y_test_incons = self.prepare_data(self.df, inconsistent_accounts)
        if len(X_train_incons) > 0:
            print("\nTraining model for inconsistent salary earners...")
            self.model_incons, self.scaler_incons = self.train_model(X_train_incons, y_train_incons, X_test_incons, y_test_incons)

            # Plot predictions
            X_test_incons_scaled = self.scaler_incons.transform(X_test_incons)
            y_pred = self.model_incons.predict(X_test_incons_scaled)
            self.plot_predictions(
                y_test_incons,
                y_pred,
                "Actual vs. Predicted Salary (Inconsistent Earners)",
                OUTPUT_PATHS['inconsistent_earners_plot']
            )
        else:
            print("No accounts with sufficient data for inconsistent salary earners.")
@@ -0,0 +1,43 @@
"""
Transaction type analysis module.
"""
import pandas as pd
from .config import MODEL_CONFIG
class TransactionTypeAnalyzer:
    """Flag transactions whose type/subtype combination matches salary criteria."""

    def __init__(self, df):
        self.df = df
        self.trx_df = None

    def flag_salary_type_transactions(self):
        """Add the boolean ``is_salary_type`` column to ``self.df``.

        A row is flagged when its type is 'T' or 'C', its subtype is one of
        'BI', 'I', 'BS' or 'CI', it has ``initiated_by == 'C'`` and its
        amount is positive. A snapshot of the flagged frame is kept in
        ``trx_df`` and the (mutated) frame is returned.
        """
        frame = self.df
        type_ok = frame['trx_type'].isin(['T', 'C'])
        subtype_ok = frame['trx_subtype'].isin(['BI', 'I', 'BS', 'CI'])
        initiator_ok = frame['initiated_by'] == 'C'
        positive_amount = frame['amount'] > 0

        frame['is_salary_type'] = type_ok & subtype_ok & initiator_ok & positive_amount
        self.trx_df = frame.copy()
        return frame

    def is_salary_earner_by_type(self, group, min_transactions=None, threshold=None):
        """Decide whether a group of transactions looks like a salary earner.

        False when the group has fewer than ``min_transactions`` rows;
        otherwise True when the share of ``is_salary_type`` rows reaches
        ``threshold``. Both limits default to the values in ``MODEL_CONFIG``.
        """
        min_transactions = MODEL_CONFIG['min_transactions'] if min_transactions is None else min_transactions
        threshold = MODEL_CONFIG['threshold'] if threshold is None else threshold

        if len(group) < min_transactions:
            return False
        salary_share = group['is_salary_type'].mean()
        return salary_share >= threshold

    def get_salary_type_data(self):
        """Return the rows flagged as salary type, flagging first if needed."""
        if self.trx_df is None:
            self.flag_salary_type_transactions()
        snapshot = self.trx_df
        return snapshot[snapshot['is_salary_type']]