Added new salary-related terms and improved image outputs in salary.ipynb

This commit is contained in:
2025-04-28 19:44:40 +01:00
parent 8207d8f1ff
commit 591d4611b6
27 changed files with 1782 additions and 12 deletions
+45
View File
@@ -0,0 +1,45 @@
# Git
.git
.gitignore
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# Virtual Environment
venv/
ENV/
# IDE
.idea/
.vscode/
*.swp
*.swo
# Output
output/
# Logs
*.log
# Local development
.env
.env.local
+21
View File
@@ -0,0 +1,21 @@
FROM python:3.11-slim
WORKDIR /app
# libpq-dev supplies the PostgreSQL client library; skip recommended extras and
# remove the apt lists so they do not end up in the image layer.
RUN apt-get update && apt-get install -y --no-install-recommends libpq-dev && rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
# --no-cache-dir keeps pip's download cache out of the image.
RUN pip install --no-cache-dir -r requirements.txt
COPY salary_analytics/ ./salary_analytics/
RUN mkdir -p output/csv output/plots
ENV PYTHONPATH=/app
ENV HOST=0.0.0.0
ENV PORT=8000
EXPOSE 8000
# Bind to 0.0.0.0 so the server is reachable from outside the container.
# --reload is a development-only file watcher and is deliberately not used here.
CMD ["uvicorn", "salary_analytics.api:app", "--host", "0.0.0.0", "--port", "8000"]
+140
View File
@@ -0,0 +1,140 @@
# Salary Analytics
A comprehensive salary analytics system that analyzes transaction data to identify salary earners, predict future salaries, and generate detailed reports.
## Features
- **Transaction Analysis**
- Keyword-based salary transaction identification
- Consistent amount transaction analysis
- Transaction type analysis
- Hypothesis overlap visualization
- **Salary Earner Classification**
- Verified salary earners identification
- Likely salary earners identification
- High earner detection
- Salary pattern analysis
- **Machine Learning**
- Salary prediction models
- Separate models for consistent and inconsistent earners
- Feature engineering
- Model evaluation metrics
- **Reporting**
- CSV reports generation
- Visualization plots
- High earner details
- Salary earner statistics
## Architecture
The project is organized into the following modules:
```
salary_analytics/
├── __init__.py
├── config.py # Configuration settings
├── data_loader.py # Database connection and data loading
├── keyword_analyzer.py # Keyword-based analysis
├── consistent_amount_analyzer.py # Consistent amount analysis
├── transaction_type_analyzer.py # Transaction type analysis
├── salary_earner_analyzer.py # Salary earner analysis
├── salary_predictor.py # Machine learning models
├── main.py # Main pipeline
└── api.py # FastAPI endpoints
```
## Configuration
The system can be configured through environment variables or the `config.py` file:
```python
# Database Configuration
DB_CONFIG = {
"user": "db_user",
"password": "your_secure_password",
"name": "salary_db",
"port": "5432",
"host": "localhost"
}
# Model Configuration
MODEL_CONFIG = {
"cv_threshold": 0.10,
"min_transactions": 3,
"threshold": 0.7,
"high_earner_threshold": 10000
}
```
## Usage
### Using the API
1. Start the API server:
```bash
uvicorn salary_analytics.api:app --reload
```
2. Access the API documentation:
- Swagger UI: http://localhost:8000/docs
- ReDoc: http://localhost:8000/redoc
### API Endpoints
1. **Basic Endpoints**
- `GET /`: Welcome message
- `GET /health`: Health check
2. **Analysis Endpoints**
- `POST /analyze/keyword`: Run keyword analysis
- `POST /analyze/consistent-amount`: Run consistent amount analysis
- `POST /analyze/transaction-type`: Run transaction type analysis
3. **Report Generation**
- `POST /generate/reports`: Generate all reports
- `GET /download/{report_type}`: Download specific reports
- Available types:
- `high_earners`: High earner details
- `likely_earners`: Likely salary earners
- `final_table`: Final analysis table
- `consistent_plot`: Consistent earners plot
- `inconsistent_plot`: Inconsistent earners plot
- `hypothesis_plot`: Hypothesis overlap plot
4. **Model Training**
- `POST /train/models`: Train prediction models
5. **Pipeline**
- `POST /run/pipeline`: Run complete pipeline
## Docker Deployment
1. Build the Docker image:
```bash
docker-compose build
```
2. Run the container:
```bash
docker-compose up
```
The API will be available at http://localhost:8000
## Output Structure
```
output/
├── csv/
│ ├── high_earner_details.csv
│ ├── likely_salary_earner.csv
│ └── final_table.csv
└── plots/
├── consistent_earners_predictions.png
├── inconsistent_earners_predictions.png
└── hypothesis_overlap.png
```
+22
View File
@@ -0,0 +1,22 @@
version: '3.8'
services:
api:
build: .
ports:
- "8000:8000"
volumes:
- ./output:/app/output
environment:
- DB_USER=salaryloan
- DB_PASSWORD=salaryloan
- DB_NAME=salaryloan
- DB_PORT=10532
- DB_HOST=dev-data.simbrellang.net
restart: unless-stopped
networks:
- salary_network
networks:
salary_network:
driver: bridge
+13
View File
@@ -0,0 +1,13 @@
sqlalchemy
pandas
numpy
matplotlib
seaborn
matplotlib-venn
wordcloud
scikit-learn
psycopg2-binary
fastapi>=0.68.0
uvicorn>=0.15.0
pydantic>=1.8.0
python-multipart>=0.0.5
+13 -12
View File
File diff suppressed because one or more lines are too long
+549
View File
@@ -0,0 +1,549 @@
from readline import redisplay
from sqlalchemy import create_engine, text
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import re
import seaborn as sns
from matplotlib_venn import venn3, venn2
from wordcloud import WordCloud
from datetime import datetime, timedelta
import os

# Connection settings. Each value can be overridden by an environment variable
# of the same name; the literals are only fallbacks for local development.
DB_USER = os.environ.get("DB_USER", "salaryloan")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "salaryloan")
DB_NAME = os.environ.get("DB_NAME", "salaryloan")
DB_PORT = os.environ.get("DB_PORT", "10532")
DB_HOST = os.environ.get("DB_HOST", "dev-data.simbrellang.net")
DATABASE_URL = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(DATABASE_URL)
# Probe the connection up front so a bad configuration is reported clearly.
try:
    with engine.connect() as conn:
        result = conn.execute(text("SELECT version();"))
    print("Connected successfully!")
except Exception as e:
    print("Error connecting to database:", e)
    # Everything below needs the database, so stop here instead of failing
    # later inside read_sql with a less helpful traceback.
    raise
table_name = "customer_account_transaction_hx"
# Load the full transaction history table into memory.
df = pd.read_sql(f"SELECT * FROM {table_name}", engine)
df.head(10)
# Parse the transaction start/end columns into datetime values.
for date_col in ('trx_start_date', 'trx_end_date'):
    df[date_col] = pd.to_datetime(df[date_col])
# Replace the generic d1..d4 column names with descriptive ones.
column_names = {
    'd1': 'trx_type',
    'd2': 'trx_subtype',
    'd3': 'initiated_by',
    'd4': 'customer_id',
}
df = df.rename(columns=column_names)
# Salary/income vocabulary passed to identify_salary_transactions(). Each entry
# is regex-escaped and matched as a whole word (or phrase) against the
# lower-cased transaction description, so the short "sal" entry only matches
# when it stands alone, not inside a longer word.
keywords = [
"salary", "payroll", "income", "wage", "wages",
"earnings", "earning", "monthly pay", "net pay", "gross pay", "compensation",
"monthlypay", "netpay", "grosspay",
"remuneration", "stipend", "allowance", "bonus", "commission",
"pension", "retirement", "dividend", "benefits", "reimbursement",
"overtime", "incentive", "paycheck", "paycheque", "salary advance",
"monthly income", "income tax refund", "employer deposit",
"payroll deposit", "salary credit", "income credit", "salary transfer",
"income transfer", "salary received", "income received", "hr deposit",
"company deposit", "employer payment", "employee payment",
"sal",
]
def identify_salary_transactions(df, keywords):
    """
    Flag transactions whose description looks salary-related.

    A description is flagged when it contains one of the keywords as a whole
    word/phrase, or a month name (abbreviated or full) followed by a 2-4 digit
    year such as "JAN 2024" or "march24". Matching is case-insensitive.

    Args:
        df (pd.DataFrame): Transaction data with a 'description' column.
        keywords (list): Salary/income-related keywords to search for. May be
            empty, in which case only the month-year patterns are used.

    Returns:
        pd.DataFrame: The same DataFrame object, with a boolean
            'is_salary_related' column added in place. Missing descriptions
            are flagged False.
    """
    month_year_patterns = [
        r"\b(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)\s?\d{2,4}\b",
        r"\b(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)\s?\d{2,4}\b"
    ]
    pattern_parts = []
    escaped_keywords = [re.escape(keyword) for keyword in keywords]
    # An empty alternation would match at every word boundary, so the keyword
    # group is only added when there is at least one keyword.
    if escaped_keywords:
        pattern_parts.append(r'\b(?:' + '|'.join(escaped_keywords) + r')\b')
    pattern_parts.extend(month_year_patterns)
    combined_pattern = '|'.join(pattern_parts)
    # case=False is required: the month patterns are written in upper case, so
    # matching them against lower-cased text would never succeed.
    df['is_salary_related'] = df['description'].str.contains(
        combined_pattern,
        case=False,
        na=False,
        regex=True
    )
    return df
desc_df = identify_salary_transactions(df, keywords)
# Keep only the flagged transactions that were initiated by 'C'.
salary_flag_mask = desc_df['is_salary_related'] == True
initiated_by_c_mask = desc_df['initiated_by'] == 'C'
desc_data = desc_df[salary_flag_mask & initiated_by_c_mask]
desc_data.head()
def calculate_coefficient_of_variation(group):
    """
    Compute the coefficient of variation of the 'C'-initiated amounts.

    Args:
        group (pd.DataFrame): Transactions with 'initiated_by' and 'amount'
            columns (one account's rows when used per group).

    Returns:
        float: Population standard deviation divided by the mean of the
            amounts where 'initiated_by' is 'C'. NaN when the mean is zero.
    """
    credit_rows = group.loc[group['initiated_by'] == 'C']
    credit_amounts = credit_rows['amount']
    average = credit_amounts.mean()
    spread = credit_amounts.std(ddof=0)
    # Guard the division: a zero mean has no meaningful relative spread.
    if average == 0:
        return float('nan')
    return spread / average
def flag_consistent_amounts(group, cv_threshold=0.10):
    """
    Build a per-row flag telling whether the group's 'C' amounts are consistent.

    Args:
        group (pd.DataFrame): Transactions for a single account.
        cv_threshold (float): Largest coefficient of variation still treated
            as consistent (default: 0.10).

    Returns:
        pd.Series: Series named 'is_consistent_amount', indexed like `group`,
            holding the same value for every row. The value is False when the
            coefficient of variation is NaN.
    """
    credit_rows = group[group['initiated_by'] == 'C']
    variation = calculate_coefficient_of_variation(credit_rows)
    if pd.isna(variation):
        consistent = False
    else:
        consistent = variation <= cv_threshold
    # Every row of the account receives the account-level verdict.
    repeated = [consistent] * len(group)
    return pd.Series(repeated, index=group.index, name='is_consistent_amount')
def identify_consistent_amount_accounts(df, cv_threshold=0.10):
    """
    Apply flag_consistent_amounts to every account in the data.

    Args:
        df (pd.DataFrame): Transaction data with columns
            ['accountid', 'amount', 'initiated_by'].
        cv_threshold (float): Maximum allowed coefficient of variation
            (default: 0.10).

    Returns:
        The per-account flags produced by flag_consistent_amounts, combined by
        groupby/apply with the 'accountid' level dropped from the index. The
        input frame itself is not returned.
    """
    def _flag_account(account_rows):
        return flag_consistent_amounts(account_rows, cv_threshold)

    per_account_flags = df.groupby('accountid').apply(_flag_account)
    # Drop the group key level so the result lines up with the original index.
    return per_account_flags.reset_index(level=0, drop=True)
consistency_flags = identify_consistent_amount_accounts(df, cv_threshold=0.10)
# Attach the flags to the transactions by index.
const_df = df.merge(consistency_flags, left_index=True, right_index=True)
df['is_consistent_amount'] = const_df['is_consistent_amount']
# Consistent-amount transactions initiated by 'C'.
consistent_mask = const_df['is_consistent_amount']
credit_mask = const_df['initiated_by'] == 'C'
const_data = const_df[consistent_mask & credit_mask]
def flag_salary_type_transactions(df):
    """
    Mark transactions whose type, subtype, initiator and amount fit the salary rule.

    A row is flagged when trx_type is 'T' or 'C', trx_subtype is one of
    'BI', 'I', 'BS', 'CI', initiated_by is 'C', and amount is positive.

    Args:
        df (pd.DataFrame): Transaction data with columns
            ['trx_type', 'trx_subtype', 'initiated_by', 'amount'].

    Returns:
        pd.DataFrame: The same DataFrame object with a boolean
            'is_salary_type' column added in place.
    """
    type_ok = df['trx_type'].isin(['T', 'C'])
    subtype_ok = df['trx_subtype'].isin(['BI', 'I', 'BS', 'CI'])
    from_c = df['initiated_by'] == 'C'
    positive_amount = df['amount'] > 0
    df['is_salary_type'] = type_ok & subtype_ok & from_c & positive_amount
    return df
def is_salary_earner_by_type(group, min_transactions=3, threshold=0.7):
    """
    Decide whether an account qualifies as a salary earner by transaction type.

    Args:
        group (pd.DataFrame): Transactions for a single account, including the
            boolean 'is_salary_type' column.
        min_transactions (int): Minimum number of rows required (default: 3).
        threshold (float): Minimum share of salary-type rows (default: 0.7).

    Returns:
        bool: False when the group has fewer than `min_transactions` rows;
            otherwise whether the share of salary-type rows reaches `threshold`.
    """
    has_enough_rows = len(group) >= min_transactions
    if not has_enough_rows:
        return False
    # The mean of a boolean column is the fraction of True values.
    salary_share = group['is_salary_type'].mean()
    return salary_share >= threshold
trx_df = flag_salary_type_transactions(df)
# Keep only the rows that satisfy the transaction-type rule.
salary_type_mask = trx_df['is_salary_type']
trx_data = trx_df.loc[salary_type_mask]
def plot_hypothesis_overlap(hypothesis1_df,
                            hypothesis3_df, hypothesis4_df,
                            account_col='accountid'):
    """
    Draw a three-set Venn diagram of the accounts flagged by each hypothesis.

    Args:
        hypothesis1_df (pd.DataFrame): Rows carrying 'is_salary_related'.
        hypothesis3_df (pd.DataFrame): Rows carrying 'is_consistent_amount'.
        hypothesis4_df (pd.DataFrame): Rows carrying 'is_salary_type'.
        account_col (str): Account identifier column.

    The figure is shown with plt.show(); nothing is returned.
    """
    def _flagged_accounts(frame, flag_col):
        # Distinct account ids among the rows whose flag is True.
        return set(frame[account_col][frame[flag_col]])

    consistent_accounts = _flagged_accounts(hypothesis3_df, 'is_consistent_amount')
    described_accounts = _flagged_accounts(hypothesis1_df, 'is_salary_related')
    typed_accounts = _flagged_accounts(hypothesis4_df, 'is_salary_type')

    plt.figure(figsize=(10, 10))
    labels = ('Consistent Amount', 'Salary Description', 'Transaction Type')
    venn3([consistent_accounts, described_accounts, typed_accounts], set_labels=labels)
    plt.title('Overlap Between Hypotheses')
    plt.show()
# Show the overlap diagram: the keyword matches, consistent-amount matches and
# transaction-type matches are passed as hypotheses 1, 3 and 4 respectively.
plot_hypothesis_overlap(desc_data, const_data, trx_data)
def filter_venn_section(df, **kwargs):
    """
    Select the 'C'-initiated rows that match a combination of hypothesis flags.

    Args:
        df (pd.DataFrame): Transactions with 'initiated_by' and the flag
            columns 'is_salary_related', 'is_consistent_amount', 'is_salary_type'.
        **kwargs: Required value for each flag, e.g.
            is_salary_related=True, is_consistent_amount=False.

    Returns:
        pd.DataFrame: Rows initiated by 'C' whose flags equal the requested values.

    Raises:
        ValueError: If a keyword is not one of the three flag column names.
    """
    allowed_flags = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}
    credit_rows = df[df['initiated_by'] == 'C']
    unknown = set(kwargs.keys()) - allowed_flags
    if unknown:
        raise ValueError(f"Invalid keys: {unknown}. Valid keys are {allowed_flags}.")
    # Start from "keep everything" and narrow it once per requested flag.
    keep = pd.Series([True] * len(credit_rows), index=credit_rows.index)
    for flag, wanted in kwargs.items():
        keep = keep & (credit_rows[flag] == wanted)
    return credit_rows[keep]
# Keyword and transaction-type hypotheses hold, consistent-amount does not.
green_section = filter_venn_section(
    df,
    is_salary_related=True,
    is_consistent_amount=False,
    is_salary_type=True
)
# readline.redisplay() accepts no arguments and cannot render a DataFrame, so
# the preview is printed instead.
print(green_section.head(10))
# Consistent-amount and transaction-type hypotheses hold, keyword does not.
yellow_section = filter_venn_section(
    df,
    is_salary_related=False,
    is_consistent_amount=True,
    is_salary_type=True
)
# Get accounts flagged by all three hypotheses
all_three_hypotheses = filter_venn_section(
    df,
    is_salary_related=True,
    is_consistent_amount=True,
    is_salary_type=True
)
def generate_salary_earners_table(all_three_hypotheses):
    """
    Summarise each account's transactions into one salary-earner row.

    Args:
        all_three_hypotheses (pd.DataFrame): Transactions with columns
            ['accountid', 'trx_start_date', 'amount']; 'trx_start_date' must
            be a datetime column.

    Returns:
        pd.DataFrame: One row per account with the columns
            accountid, num_months (number of transactions),
            least_inflow_6m (smallest amount in the last 180 days),
            avg_monthly_salary, estimated_next_amount (the average amount),
            estimated_next_date (last date plus the median gap in days),
            45daysalary (last transaction within 45 days) and
            2monthssalary (at least two transactions in the last 60 days).
            Rows containing any missing value are dropped, which removes
            accounts with a single transaction or none in the last 180 days.
            An empty input yields an empty frame that still has these columns.
    """
    columns = [
        'accountid', 'num_months', 'least_inflow_6m', 'avg_monthly_salary',
        'estimated_next_amount', 'estimated_next_date',
        '45daysalary', '2monthssalary',
    ]
    # Take "now" once so every account is measured against the same instant.
    now = datetime.now()
    six_months_ago = now - timedelta(days=180)
    two_months_ago = now - timedelta(days=60)

    results = []
    for accountid, group in all_three_hypotheses.groupby('accountid'):
        # Gaps between consecutive transactions only make sense in date order.
        group = group.sort_values('trx_start_date')
        dates = group['trx_start_date']
        amounts = group['amount']

        least_inflow = amounts[dates >= six_months_ago].min()
        avg_salary = amounts.mean()

        # With a single transaction the median gap is NaN. to_timedelta maps
        # that to NaT (timedelta(days=nan) would raise), and the row is then
        # removed by dropna() below.
        median_interval = dates.diff().dt.days.median()
        last_date = dates.max()
        next_date = last_date + pd.to_timedelta(median_interval, unit='D')

        days_since_last = (now - last_date).days
        results.append({
            'accountid': accountid,
            'num_months': len(group),
            'least_inflow_6m': least_inflow,
            'avg_monthly_salary': avg_salary,
            'estimated_next_amount': avg_salary,
            'estimated_next_date': next_date,
            '45daysalary': days_since_last <= 45,
            '2monthssalary': int((dates >= two_months_ago).sum()) >= 2
        })
    # Passing the column list keeps the schema stable even with no rows.
    final_df = pd.DataFrame(results, columns=columns)
    return final_df.dropna()
final_table = generate_salary_earners_table(all_three_hypotheses)
# Display results
verified_count = final_table['accountid'].nunique()
print(f"Found {verified_count} verified salary earners")
# Likely earners: the two partial-overlap sections, de-duplicated by row id.
likely_rows = pd.concat([yellow_section, green_section]).drop_duplicates(subset=['id'])
likely_salary_earner = generate_salary_earners_table(likely_rows)
# Display results
likely_count = likely_salary_earner['accountid'].nunique()
print(f"Found {likely_count} likely salary earners")
def analyze_salary_earners(final_df, high_earner_threshold=10000):
    """
    Pick out the high earners from a salary-earner table.

    Args:
        final_df (pd.DataFrame): Table with columns
            ['accountid', 'least_inflow_6m', 'estimated_next_amount'].
        high_earner_threshold (float): Minimum 'estimated_next_amount' for an
            account to count as a high earner (default: 10000).

    Returns:
        tuple: (high_earner_details, count_high) where high_earner_details is
            a DataFrame with columns ['accountid', 'least_inflow_6m'] and a
            fresh 0..n-1 index, and count_high is the number of high earners.
            The input frame is not modified.
    """
    is_high = final_df['estimated_next_amount'] >= high_earner_threshold
    high_earners = final_df[is_high]
    count_high = len(high_earners)
    high_earner_details = high_earners[['accountid', 'least_inflow_6m']].reset_index(drop=True)
    return high_earner_details, count_high
high_earner_details_df, total_high_earners = analyze_salary_earners(final_table)
print(f"\nTotal High Earners: {total_high_earners}")
# Write the three result tables to CSV files in the working directory.
csv_exports = (
    (high_earner_details_df, 'high_earner_details.csv'),
    (likely_salary_earner, 'likely_salary_earner.csv'),
    (final_table, 'final_table.csv'),
)
for frame, csv_name in csv_exports:
    frame.to_csv(csv_name, index=False)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def add_feature_engineering(df):
    """
    Add model features to a transaction DataFrame.

    Adds 'month', 'month_seq', one-hot columns for 'trx_type' (prefixed
    'trx_type_'), and per-account rolling sum/mean of 'amount' over the last
    three transactions ('rolling_sum_3m', 'rolling_avg_3m').

    Args:
        df (pd.DataFrame): Transactions with columns
            ['accountid', 'trx_start_date', 'trx_type', 'amount'];
            'trx_start_date' must be a datetime column. The 'month' and
            'month_seq' columns are written onto this frame in place.

    Returns:
        pd.DataFrame: A new frame, sorted by account and date, containing the
            original columns plus the engineered features.
    """
    df['month'] = df['trx_start_date'].dt.month
    # NOTE(review): ngroup() numbers (accountid, month) pairs across the whole
    # frame and ignores the year, so this is not a per-account chronological
    # month counter - confirm this is the intended meaning of "month_seq".
    df['month_seq'] = df.groupby(['accountid', 'month']).ngroup() + 1
    # Categorical encoding: one-hot encode trx_type
    encoder = OneHotEncoder(sparse_output=False)
    encoded_trx_type = encoder.fit_transform(df[['trx_type']])
    # Reuse df's index: the default RangeIndex would not line up with a
    # filtered frame, and concat would then produce misaligned NaN rows.
    encoded_df = pd.DataFrame(
        encoded_trx_type,
        columns=encoder.get_feature_names_out(['trx_type']),
        index=df.index,
    )
    df = pd.concat([df, encoded_df], axis=1)
    # Rolling statistics: sort by account and date
    df = df.sort_values(['accountid', 'trx_start_date'])
    # The window counts rows (transactions), not calendar months.
    df['rolling_sum_3m'] = df.groupby('accountid')['amount'].rolling(
        window=3, min_periods=1).sum().reset_index(0, drop=True)
    df['rolling_avg_3m'] = df.groupby('accountid')['amount'].rolling(
        window=3, min_periods=1).mean().reset_index(0, drop=True)
    return df
def prepare_data(df_transactions, accounts):
    """
    Build sliding-window training and testing arrays for the given accounts.

    The transactions of the selected accounts are feature-engineered and
    aggregated per (accountid, month_seq). For every account that passes the
    12-period checks, each window of five consecutive periods is flattened
    into one feature vector whose target is the following period's amount:
    targets at positions 5-7 go to the training set, positions 8-11 to the
    test set.

    Args:
        df_transactions (pd.DataFrame): All transaction data, including the
            hypothesis flag columns that are dropped here.
        accounts (list): Account IDs to include.

    Returns:
        tuple: (X_train, y_train, X_test, y_test) as NumPy arrays.
    """
    selected = df_transactions[df_transactions['accountid'].isin(accounts)].copy()
    print(f"Filtered data for {len(accounts)} accounts.")
    print(f"Total transactions: {len(selected)}")

    # Columns that are not used as model inputs.
    unused_columns = ['description', 'id', 'customer_id',
                      'trx_end_date', 'is_salary_related',
                      'is_consistent_amount', 'is_salary_type']
    selected = selected.drop(columns=unused_columns)

    selected = add_feature_engineering(selected)

    # Per-period aggregation: mean amount, latest rolling stats, first month,
    # and the count of each one-hot encoded transaction type.
    encoded_cols = [name for name in selected.columns if name.startswith('trx_type_')]
    aggregations = {
        'amount': 'mean',
        'rolling_sum_3m': 'last',
        'rolling_avg_3m': 'last',
        'month': 'first'
    }
    aggregations.update({name: 'sum' for name in encoded_cols})
    monthly = selected.groupby(['accountid', 'month_seq']).agg(aggregations).reset_index()

    # Keep accounts whose largest month_seq is at least 12.
    last_seq = monthly.groupby('accountid')['month_seq'].max()
    valid_accounts = last_seq[last_seq >= 12].index
    monthly = monthly[monthly['accountid'].isin(valid_accounts)]

    feature_cols = ['accountid', 'amount', 'rolling_sum_3m', 'rolling_avg_3m',
                    'month'] + encoded_cols
    train_X, train_y, test_X, test_y = [], [], [], []
    for account in valid_accounts:
        history = monthly[monthly['accountid'] == account].sort_values('month_seq')
        # A large month_seq does not guarantee 12 rows, so check the row count too.
        if len(history) < 12:
            print(f"Skipping account {account} due to insufficient data (less than 12 months).")
            continue
        splits = (
            (train_X, train_y, range(5, 8)),
            (test_X, test_y, range(8, 12)),
        )
        for features_out, targets_out, positions in splits:
            for t in positions:
                window = history.iloc[t - 5:t]
                features_out.append(window[feature_cols].values.flatten())
                targets_out.append(history['amount'].iloc[t])
    return np.array(train_X), np.array(train_y), np.array(test_X), np.array(test_y)
def train_model(X_train, y_train, X_test, y_test):
    """
    Fit a scaled Random Forest regressor and print its test metrics.

    The scaler is fitted on the training features only and then applied to
    the test features. MAE, RMSE and R-squared on the test set are printed.

    Args:
        X_train (np.ndarray): Training features.
        y_train (np.ndarray): Training targets.
        X_test (np.ndarray): Testing features.
        y_test (np.ndarray): Testing targets.

    Returns:
        tuple: (model, scaler) - the fitted regressor and the fitted scaler.
    """
    scaler = StandardScaler()
    train_features = scaler.fit_transform(X_train)
    test_features = scaler.transform(X_test)

    # Fixed random_state keeps the fitted forest reproducible.
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(train_features, y_train)

    predictions = model.predict(test_features)
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R-squared: {r2:.2f}")
    return model, scaler
# Train and evaluate the model for the verified (all-three-hypotheses) earners.
consistent_accounts = final_table['accountid'].unique()
X_train_cons, y_train_cons, X_test_cons, y_test_cons = prepare_data(df, consistent_accounts)
if len(X_train_cons) > 0:
    model_cons, scaler_cons = train_model(X_train_cons, y_train_cons, X_test_cons, y_test_cons)
    print("Model trained for consistent salary earners.")
    # Plot only when a model was trained; without one, model_cons and
    # scaler_cons do not exist and the lines below would raise NameError.
    X_test_cons_scaled = scaler_cons.transform(X_test_cons)
    y_pred = model_cons.predict(X_test_cons_scaled)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_cons, y_pred, alpha=0.5)
    plt.xlabel("Actual Salary")
    plt.ylabel("Predicted Salary")
    plt.title("Actual vs. Predicted Salary")
    # Dashed diagonal marks perfect predictions.
    plt.plot([min(y_test_cons), max(y_test_cons)], [min(y_test_cons), max(y_test_cons)], 'r--')
    plt.show()
else:
    print("No accounts with sufficient data for consistent salary earners.")
# Train and evaluate the model for the likely (partial-overlap) earners.
inconsistent_accounts = likely_salary_earner['accountid'].unique()
X_train_incons, y_train_incons, X_test_incons, y_test_incons = prepare_data(df, inconsistent_accounts)
if len(X_train_incons) > 0:
    print("\nTraining model for inconsistent salary earners...")
    model_incons, scaler_incons = train_model(X_train_incons, y_train_incons, X_test_incons, y_test_incons)
    # Plot only when a model was trained; without one, model_incons and
    # scaler_incons do not exist and the lines below would raise NameError.
    X_test_incons_scaled = scaler_incons.transform(X_test_incons)
    y_pred = model_incons.predict(X_test_incons_scaled)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_incons, y_pred, alpha=0.5)
    plt.xlabel("Actual Salary")
    plt.ylabel("Predicted Salary")
    plt.title("Actual vs. Predicted Salary")
    # Dashed diagonal marks perfect predictions.
    plt.plot([min(y_test_incons), max(y_test_incons)], [min(y_test_incons), max(y_test_incons)], 'r--')
    plt.show()
else:
    print("No accounts with sufficient data for inconsistent salary earners.")
+6
View File
@@ -0,0 +1,6 @@
"""
Salary Analytics Package
A package for analyzing and predicting salary patterns from transaction data.
"""
__version__ = "0.1.0"
Binary file not shown.
Binary file not shown.
Binary file not shown.
+212
View File
@@ -0,0 +1,212 @@
"""
FastAPI application for salary analytics.
"""
from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi.responses import FileResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Optional, Dict
import os
import socket
import logging
from .main import SalaryAnalyticsPipeline
from .config import OUTPUT_PATHS
# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="Salary Analytics API",
    description="API for analyzing and predicting salary patterns from transaction data",
    version="1.0.0"
)

# Add CORS middleware.
# Credentials stay disabled because origins, methods and headers are all
# wildcards: the FastAPI CORS documentation requires explicit lists for all
# three when allow_credentials is True. To support cookies or auth headers
# from browsers, list the trusted origins explicitly and re-enable it.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins
    allow_credentials=False,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)

# Global pipeline instance; assigned by the startup handler.
pipeline = None
class AnalysisResponse(BaseModel):
    """Response model for analysis endpoints."""
    # Human-readable status message; the only required field.
    message: str
    # Optional result payload, e.g. the counts returned by the endpoints.
    data: Optional[Dict] = None
    # Optional file path; defaults to None.
    file_path: Optional[str] = None
@app.on_event("startup")
async def startup_event():
    """
    Initialize the global pipeline on application startup.

    Creates the SalaryAnalyticsPipeline, loads its data and logs where the
    server can be reached.

    Raises:
        Exception: If the data cannot be loaded or the pipeline cannot be
            created; the error is logged and re-raised.
    """
    global pipeline
    try:
        logger.info("Initializing pipeline...")
        pipeline = SalaryAnalyticsPipeline()
        if not pipeline.load_data():
            logger.error("Failed to load data during startup")
            raise Exception("Failed to load data during startup")
        # Print network information
        hostname = socket.gethostname()
        logger.info(f"Server running on hostname: {hostname}")
        logger.info("Server is accessible at:")
        logger.info("- http://localhost:8000")
        logger.info("- http://127.0.0.1:8000")
        # The address lookup is informational only. A hostname that does not
        # resolve must not prevent the API from starting.
        try:
            ip_address = socket.gethostbyname(hostname)
        except OSError as lookup_error:
            logger.warning(f"Could not resolve server IP address: {lookup_error}")
        else:
            logger.info(f"Server IP address: {ip_address}")
            logger.info(f"- http://{ip_address}:8000")
        logger.info("Pipeline initialized successfully")
    except Exception as e:
        logger.error(f"Error during startup: {str(e)}")
        raise
@app.get("/")
async def root():
    """Return the API welcome message."""
    logger.info("Root endpoint accessed")
    welcome = {"message": "Welcome to Salary Analytics API"}
    return welcome
@app.get("/health")
async def health_check():
    """Report that the service is up."""
    logger.info("Health check endpoint accessed")
    status_payload = {"status": "healthy"}
    return status_payload
@app.post("/analyze/keyword", response_model=AnalysisResponse)
async def analyze_keyword():
    """
    Run the keyword-based analysis and return the number of matches.

    Raises:
        HTTPException: Status 500 with the error text if the analysis fails.
    """
    try:
        logger.info("Starting keyword analysis...")
        matches = pipeline.run_keyword_analysis()
        match_count = len(matches)
        logger.info(f"Keyword analysis completed. Found {match_count} matches")
        response = AnalysisResponse(
            message="Keyword analysis completed successfully",
            data={"count": match_count}
        )
    except Exception as e:
        logger.error(f"Error in keyword analysis: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
    return response
@app.post("/analyze/consistent-amount", response_model=AnalysisResponse)
async def analyze_consistent_amount():
    """
    Run the consistent-amount analysis and return the number of matches.

    Raises:
        HTTPException: Status 500 with the error text if the analysis fails.
    """
    try:
        logger.info("Starting consistent amount analysis...")
        matches = pipeline.run_consistent_amount_analysis()
        match_count = len(matches)
        logger.info(f"Consistent amount analysis completed. Found {match_count} matches")
        response = AnalysisResponse(
            message="Consistent amount analysis completed successfully",
            data={"count": match_count}
        )
    except Exception as e:
        logger.error(f"Error in consistent amount analysis: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
    return response
@app.post("/analyze/transaction-type", response_model=AnalysisResponse)
async def analyze_transaction_type():
    """
    Run the transaction-type analysis and return the number of matches.

    Raises:
        HTTPException: Status 500 with the error text if the analysis fails.
    """
    try:
        logger.info("Starting transaction type analysis...")
        matches = pipeline.run_transaction_type_analysis()
        match_count = len(matches)
        logger.info(f"Transaction type analysis completed. Found {match_count} matches")
        response = AnalysisResponse(
            message="Transaction type analysis completed successfully",
            data={"count": match_count}
        )
    except Exception as e:
        logger.error(f"Error in transaction type analysis: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
    return response
@app.post("/generate/reports", response_model=AnalysisResponse)
async def generate_reports(background_tasks: BackgroundTasks):
    """
    Generate the salary earner reports and return their headline counts.

    Args:
        background_tasks: Injected by FastAPI; not used by this handler.

    Raises:
        HTTPException: Status 500 with the error text if generation fails.
    """
    try:
        logger.info("Starting report generation...")
        reports = pipeline.generate_salary_earner_reports()
        logger.info("Reports generated successfully")
        # Summarise each report by its size rather than returning its rows.
        summary = {
            "verified_salary_earners": len(reports['final_table']),
            "likely_salary_earners": len(reports['likely_salary_earner']),
            "high_earners": reports['total_high_earners']
        }
        return AnalysisResponse(
            message="Reports generated successfully",
            data=summary
        )
    except Exception as e:
        logger.error(f"Error in report generation: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
@app.post("/train/models", response_model=AnalysisResponse)
async def train_models():
    """
    Train the salary prediction models.

    Raises:
        HTTPException: Status 500 with the error text if training fails.
    """
    try:
        logger.info("Starting model training...")
        pipeline.train_salary_prediction_models()
        logger.info("Models trained successfully")
        response = AnalysisResponse(message="Models trained successfully")
    except Exception as e:
        logger.error(f"Error in model training: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
    return response
@app.get("/download/{report_type}")
async def download_report(report_type: str):
    """
    Download a previously generated report file.

    Args:
        report_type: One of 'high_earners', 'likely_earners', 'final_table',
            'consistent_plot', 'inconsistent_plot' or 'hypothesis_plot'.

    Returns:
        FileResponse: The report file as an octet-stream attachment.

    Raises:
        HTTPException: Status 404 if the report type is unknown or its file
            does not exist; status 500 for any unexpected error.
    """
    try:
        logger.info(f"Attempting to download report: {report_type}")
        file_paths = {
            "high_earners": OUTPUT_PATHS["high_earner_details"],
            "likely_earners": OUTPUT_PATHS["likely_salary_earner"],
            "final_table": OUTPUT_PATHS["final_table"],
            "consistent_plot": OUTPUT_PATHS["consistent_earners_plot"],
            "inconsistent_plot": OUTPUT_PATHS["inconsistent_earners_plot"],
            "hypothesis_plot": OUTPUT_PATHS["hypothesis_overlap_plot"]
        }
        if report_type not in file_paths:
            logger.error(f"Report type not found: {report_type}")
            raise HTTPException(status_code=404, detail="Report type not found")
        file_path = file_paths[report_type]
        if not os.path.exists(file_path):
            logger.error(f"Report file not found: {file_path}")
            raise HTTPException(status_code=404, detail="Report file not found")
        logger.info(f"Successfully found report file: {file_path}")
        return FileResponse(
            path=file_path,
            filename=os.path.basename(file_path),
            media_type="application/octet-stream"
        )
    except HTTPException:
        # Let the deliberate 404 responses through; the generic handler below
        # would otherwise turn them into 500 errors.
        raise
    except Exception as e:
        logger.error(f"Error downloading report: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
@app.post("/run/pipeline", response_model=AnalysisResponse)
async def run_full_pipeline():
    """
    Run the complete salary analytics pipeline.

    Raises:
        HTTPException: Status 500 with detail "Pipeline failed" when the
            pipeline reports failure, or with the error text when it raises.
    """
    try:
        logger.info("Starting full pipeline...")
        success = pipeline.run_full_pipeline()
        if not success:
            logger.error("Pipeline failed")
            raise HTTPException(status_code=500, detail="Pipeline failed")
        logger.info("Pipeline completed successfully")
        return AnalysisResponse(
            message="Pipeline completed successfully"
        )
    except HTTPException:
        # Already a well-formed API error; re-wrapping it would change the
        # detail text sent to the client.
        raise
    except Exception as e:
        logger.error(f"Error in pipeline: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
+61
View File
@@ -0,0 +1,61 @@
"""
Configuration settings for the salary analytics package.
"""
import os
# Base directories: BASE_DIR is the parent of the directory holding this file.
_package_dir = os.path.dirname(os.path.abspath(__file__))
BASE_DIR = os.path.dirname(_package_dir)
OUTPUT_DIR = os.path.join(BASE_DIR, "output")
PLOTS_DIR = os.path.join(OUTPUT_DIR, "plots")
CSV_DIR = os.path.join(OUTPUT_DIR, "csv")
# Create the output directories at import time if they don't exist yet.
for _directory in (OUTPUT_DIR, PLOTS_DIR, CSV_DIR):
    os.makedirs(_directory, exist_ok=True)
# Database Configuration
# Each setting is read from the matching DB_* environment variable (the ones
# docker-compose passes to the container); the literals are fallbacks used
# when a variable is not set.
DB_CONFIG = {
    "user": os.environ.get("DB_USER", "salaryloan"),
    "password": os.environ.get("DB_PASSWORD", "salaryloan"),
    "name": os.environ.get("DB_NAME", "salaryloan"),
    "port": os.environ.get("DB_PORT", "10532"),
    "host": os.environ.get("DB_HOST", "dev-data.simbrellang.net")
}
# Table Configuration
TABLE_NAME = "customer_account_transaction_hx"
# Salary Keywords
# Words and phrases that mark a transaction description as salary/income
# related. Presumably consumed by the keyword analyzer module - confirm the
# matching rules (case handling, whole-word matching) against that module.
SALARY_KEYWORDS = [
"salary", "payroll", "income", "wage", "wages",
"earnings", "earning", "monthly pay", "net pay", "gross pay", "compensation",
"monthlypay", "netpay", "grosspay",
"remuneration", "stipend", "allowance", "bonus", "commission",
"pension", "retirement", "dividend", "benefits", "reimbursement",
"overtime", "incentive", "paycheck", "paycheque", "salary advance",
"monthly income", "income tax refund", "employer deposit",
"payroll deposit", "salary credit", "income credit", "salary transfer",
"income transfer", "salary received", "income received", "hr deposit",
"company deposit", "employer payment", "employee payment",
"sal",
]
# Model Configuration
# Tunable thresholds shared by the analysis modules.
MODEL_CONFIG = {
# Default ceiling for the coefficient of variation used by
# ConsistentAmountAnalyzer when no explicit cv_threshold is passed.
"cv_threshold": 0.10,
# Presumably the minimum transactions per account for the transaction-type
# check - TODO confirm against the module that reads it.
"min_transactions": 3,
# Presumably the minimum share of salary-type transactions - TODO confirm.
"threshold": 0.7,
# Presumably the amount from which an account counts as a high earner -
# TODO confirm.
"high_earner_threshold": 10000
}
# File Paths
# Maps each report key to the absolute path of its output file: CSV reports
# live under CSV_DIR, plot images under PLOTS_DIR.
OUTPUT_PATHS = {}
for _key, _csv_name in (
    ("high_earner_details", "high_earner_details.csv"),
    ("likely_salary_earner", "likely_salary_earner.csv"),
    ("final_table", "final_table.csv"),
):
    OUTPUT_PATHS[_key] = os.path.join(CSV_DIR, _csv_name)
for _key, _plot_name in (
    ("consistent_earners_plot", "consistent_earners_predictions.png"),
    ("inconsistent_earners_plot", "inconsistent_earners_predictions.png"),
    ("hypothesis_overlap_plot", "hypothesis_overlap.png"),
):
    OUTPUT_PATHS[_key] = os.path.join(PLOTS_DIR, _plot_name)
@@ -0,0 +1,58 @@
"""
Consistent amount transaction analysis module.
"""
import pandas as pd
from .config import MODEL_CONFIG
class ConsistentAmountAnalyzer:
    """Flag accounts whose 'C'-initiated transaction amounts vary very little.

    The analyzer adds a boolean ``is_consistent_amount`` column to the frame it
    was given (in place), so other components holding the same frame see it.
    """

    def __init__(self, df):
        # Transaction frame; must contain 'accountid', 'initiated_by', 'amount'.
        self.df = df
        # Snapshot of the flagged frame, filled by
        # identify_consistent_amount_accounts().
        self.const_df = None

    def calculate_coefficient_of_variation(self, group):
        """Return std/mean of the amounts of rows with ``initiated_by == 'C'``.

        The population standard deviation (``ddof=0``) is used. NaN is
        returned when the mean is zero or when there are no matching rows.
        """
        amounts = group.loc[group['initiated_by'] == 'C', 'amount']
        mean = amounts.mean()
        if mean == 0:
            return float('nan')
        return amounts.std(ddof=0) / mean

    def _is_consistent(self, group, cv_threshold):
        """Return True when the group's coefficient of variation is within the threshold."""
        cv = self.calculate_coefficient_of_variation(group)
        # An undefined CV (no matching rows, zero mean) never counts as consistent.
        return (not pd.isna(cv)) and bool(cv <= cv_threshold)

    def flag_consistent_amounts(self, group, cv_threshold=None):
        """Return a boolean Series (one value per row of ``group``).

        Every row carries the same value: whether the group as a whole has a
        coefficient of variation at or below ``cv_threshold`` (defaults to
        ``MODEL_CONFIG['cv_threshold']``).
        """
        if cv_threshold is None:
            cv_threshold = MODEL_CONFIG['cv_threshold']
        is_consistent = self._is_consistent(group, cv_threshold)
        return pd.Series(
            [is_consistent] * len(group),
            index=group.index,
            name='is_consistent_amount'
        )

    def identify_consistent_amount_accounts(self, cv_threshold=None):
        """Add ``is_consistent_amount`` to the frame and return the frame.

        Previously the frame itself was replaced by the Series of flags, which
        dropped every other column. The flag is now stored as a column,
        written in place.
        """
        if cv_threshold is None:
            cv_threshold = MODEL_CONFIG['cv_threshold']

        flags_by_account = {
            account: self._is_consistent(group, cv_threshold)
            for account, group in self.df.groupby('accountid')
        }
        # Rows whose account id is missing are not part of any group and are
        # therefore flagged False.
        self.df['is_consistent_amount'] = pd.Series(
            [flags_by_account.get(account, False) for account in self.df['accountid']],
            index=self.df.index,
            dtype=bool,
        )
        self.const_df = self.df.copy()
        return self.df

    def get_consistent_amount_data(self):
        """Return the 'C'-initiated rows of accounts flagged as consistent."""
        if self.const_df is None:
            self.identify_consistent_amount_accounts()
        flagged = self.const_df['is_consistent_amount']
        initiated_by_c = self.const_df['initiated_by'] == 'C'
        return self.const_df[flagged & initiated_by_c]
+113
View File
@@ -0,0 +1,113 @@
"""
Data loading and preprocessing module.
"""
from sqlalchemy import create_engine, text
import pandas as pd
from datetime import datetime
import logging
from .config import DB_CONFIG, TABLE_NAME
logger = logging.getLogger(__name__)


class DataLoader:
    """Load the transaction table into a pandas DataFrame."""

    # Generic column names in the table -> names used by the analyzers.
    COLUMN_RENAMES = {
        'd1': 'trx_type',
        'd2': 'trx_subtype',
        'd3': 'initiated_by',
        'd4': 'customer_id',
    }

    def __init__(self):
        self.engine = None
        self.df = None
        self.chunk_size = 10000  # Load 10,000 rows at a time

    def connect(self):
        """Create the engine and verify that the configured table exists.

        Returns:
            True when the connection works and the table is present,
            False otherwise. On failure ``self.engine`` is left as None so
            that a later ``load_data()`` retries instead of using a
            half-initialized engine.
        """
        # Local import: only needed here, keeps the module import block untouched.
        from urllib.parse import quote_plus

        engine = None
        try:
            logger.info("Attempting to connect to database...")
            # Quote the credentials so special characters cannot corrupt the URL.
            database_url = (
                f"postgresql://{quote_plus(str(DB_CONFIG['user']))}:"
                f"{quote_plus(str(DB_CONFIG['password']))}"
                f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['name']}"
            )
            engine = create_engine(database_url)

            with engine.connect() as conn:
                # First check if table exists (bound parameter, not string-built SQL).
                check_table = text(
                    "SELECT EXISTS (SELECT FROM information_schema.tables "
                    "WHERE table_name = :table_name)"
                )
                table_exists = conn.execute(check_table, {"table_name": TABLE_NAME}).scalar()

                if table_exists:
                    # Identifiers cannot be bound; TABLE_NAME is a package constant.
                    count_query = text(f"SELECT COUNT(*) FROM {TABLE_NAME}")
                    row_count = conn.execute(count_query).scalar()
                    logger.info(f"Table {TABLE_NAME} exists with {row_count} rows")

                    version = conn.execute(text("SELECT version();")).scalar()
                    logger.info("Database version: %s", version)
                    logger.info("Connected successfully to database!")
                    self.engine = engine
                    return True

                logger.error(f"Table {TABLE_NAME} does not exist in the database")
        except Exception as e:
            logger.error(f"Error connecting to database: {str(e)}")

        # Failure path: release the pool and do not keep an unusable engine.
        if engine is not None:
            engine.dispose()
        self.engine = None
        return False

    def _preprocess_chunk(self, chunk):
        """Parse the two date columns and apply COLUMN_RENAMES to one chunk."""
        chunk['trx_start_date'] = pd.to_datetime(chunk['trx_start_date'])
        chunk['trx_end_date'] = pd.to_datetime(chunk['trx_end_date'])
        return chunk.rename(columns=self.COLUMN_RENAMES)

    def load_data(self, table_name=None):
        """Load and preprocess transaction data in chunks.

        The table is read with a single query that pandas streams in
        ``self.chunk_size`` row chunks. The previous LIMIT/OFFSET paging had
        no ORDER BY, so the database was free to return rows in a different
        order for each page, which can duplicate or skip rows.

        Args:
            table_name: Table to read; defaults to ``TABLE_NAME``. It is
                interpolated into the SQL text, so it must be a plain
                (optionally schema-qualified) identifier.

        Returns:
            The combined DataFrame, or None when the connection fails, the
            table is empty, or loading raises.

        Raises:
            ValueError: if ``table_name`` is not a plain identifier.
        """
        import re

        table = TABLE_NAME if table_name is None else table_name
        if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)?", str(table)):
            raise ValueError(f"Invalid table name: {table!r}")

        if not self.engine:
            logger.info("No database connection. Attempting to connect...")
            if not self.connect():
                logger.error("Failed to establish database connection")
                return None

        try:
            logger.info(f"Loading data from table: {table}")
            query = f"SELECT * FROM {table}"

            chunks = []
            rows_loaded = 0
            for chunk in pd.read_sql(query, self.engine, chunksize=self.chunk_size):
                # Some pandas versions yield one empty frame for an empty result.
                if chunk.empty:
                    continue
                chunks.append(self._preprocess_chunk(chunk))
                rows_loaded += len(chunk)
                logger.info("Loaded %d rows so far", rows_loaded)

            if not chunks:
                logger.error(f"Table {table} returned no rows")
                return None

            # Combine all chunks
            self.df = pd.concat(chunks, ignore_index=True)
            logger.info(f"Successfully loaded {len(self.df)} rows of data")

            # Basic data validation
            logger.info("Performing data validation...")
            logger.info(f"Columns in dataset: {self.df.columns.tolist()}")
            logger.info(f"Data types:\n{self.df.dtypes}")
            logger.info(f"Missing values:\n{self.df.isnull().sum()}")

            return self.df
        except Exception as e:
            # Keep the best-effort contract (return None) but record the traceback.
            logger.exception("Error loading data: %s", e)
            return None

    def get_data(self):
        """Return the loaded DataFrame, or None (with a warning) if nothing was loaded."""
        if self.df is None:
            logger.warning("No data loaded. Call load_data() first.")
        return self.df
+47
View File
@@ -0,0 +1,47 @@
"""
Keyword-based salary transaction analysis module.
"""
import re
import pandas as pd
from .config import SALARY_KEYWORDS
class KeywordAnalyzer:
    """Flag transactions whose description looks salary related."""

    # Month name (abbreviated or full) followed by a 2-4 digit year,
    # e.g. "JAN 24", "feb2024", "March 2023".
    MONTH_YEAR_PATTERNS = (
        r"\b(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)\s?\d{2,4}\b",
        r"\b(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)\s?\d{2,4}\b",
    )

    def __init__(self, df):
        # Transaction frame; must contain 'description' and 'initiated_by'.
        self.df = df
        # Snapshot of the flagged frame, filled by identify_salary_transactions().
        self.desc_df = None

    def identify_salary_transactions(self, keywords=None):
        """
        Identifies potential salary-related transactions based on keywords
        and month-year patterns in the 'description' column.

        Matching is case-insensitive. Previously the description was
        lower-cased while the month patterns were upper-case, so the
        month-year patterns could never match.

        Args:
            keywords: Iterable of literal keywords matched as whole words;
                defaults to ``SALARY_KEYWORDS``.

        Returns:
            The frame, with a boolean ``is_salary_related`` column added in
            place. Missing descriptions are flagged False.
        """
        if keywords is None:
            keywords = SALARY_KEYWORDS

        escaped_keywords = [re.escape(keyword) for keyword in keywords]
        alternatives = list(self.MONTH_YEAR_PATTERNS)
        # An empty alternation "(?:)" would match every description, so the
        # keyword branch is only added when there is at least one keyword.
        if escaped_keywords:
            alternatives.insert(0, r'\b(?:' + '|'.join(escaped_keywords) + r')\b')
        combined_pattern = '|'.join(alternatives)

        self.df['is_salary_related'] = self.df['description'].str.contains(
            combined_pattern,
            flags=re.IGNORECASE,
            na=False,
            regex=True
        )

        self.desc_df = self.df.copy()
        return self.df

    def get_salary_related_data(self):
        """Return the salary-related rows whose ``initiated_by`` is 'C'."""
        if self.desc_df is None:
            self.identify_salary_transactions()
        salary_related = self.desc_df['is_salary_related']
        initiated_by_c = self.desc_df['initiated_by'] == 'C'
        return self.desc_df[salary_related & initiated_by_c]
+134
View File
@@ -0,0 +1,134 @@
"""
Main module for running the salary analytics pipeline.
"""
import logging
from .data_loader import DataLoader
from .keyword_analyzer import KeywordAnalyzer
from .consistent_amount_analyzer import ConsistentAmountAnalyzer
from .transaction_type_analyzer import TransactionTypeAnalyzer
from .salary_earner_analyzer import SalaryEarnerAnalyzer
from .salary_predictor import SalaryPredictor
logger = logging.getLogger(__name__)


class SalaryAnalyticsPipeline:
    """Orchestrates loading, the three analyses, reporting and model training."""

    def __init__(self):
        logger.info("Initializing SalaryAnalyticsPipeline")
        self.data_loader = None
        self.df = None
        self.keyword_analyzer = None
        self.consistent_amount_analyzer = None
        self.transaction_type_analyzer = None
        self.salary_earner_analyzer = None
        self.salary_predictor = None

    def _require_data(self):
        """Log and raise ValueError when no data frame has been loaded yet."""
        if self.df is not None:
            return
        message = "Data not loaded. Call load_data() first."
        logger.error(message)
        raise ValueError(message)

    def load_data(self):
        """Fetch the transaction data through a new DataLoader; return True on success."""
        logger.info("Starting data loading process")
        self.data_loader = DataLoader()
        self.df = self.data_loader.load_data()
        loaded = self.df is not None
        if loaded:
            logger.info(f"Successfully loaded data with {len(self.df)} rows")
        else:
            logger.error("Failed to load data")
        return loaded

    def run_keyword_analysis(self):
        """Flag salary-related descriptions and return the matching rows."""
        self._require_data()
        logger.info("Starting keyword analysis")
        analyzer = KeywordAnalyzer(self.df)
        self.keyword_analyzer = analyzer
        analyzer.identify_salary_transactions()
        return analyzer.get_salary_related_data()

    def run_consistent_amount_analysis(self):
        """Flag consistent-amount accounts and return the matching rows."""
        self._require_data()
        logger.info("Starting consistent amount analysis")
        analyzer = ConsistentAmountAnalyzer(self.df)
        self.consistent_amount_analyzer = analyzer
        analyzer.identify_consistent_amount_accounts()
        return analyzer.get_consistent_amount_data()

    def run_transaction_type_analysis(self):
        """Flag salary-type transactions and return the matching rows."""
        self._require_data()
        logger.info("Starting transaction type analysis")
        analyzer = TransactionTypeAnalyzer(self.df)
        self.transaction_type_analyzer = analyzer
        analyzer.flag_salary_type_transactions()
        return analyzer.get_salary_type_data()

    def generate_salary_earner_reports(self):
        """Build the salary earner reports and return what the analyzer produced."""
        self._require_data()
        logger.info("Starting salary earner report generation")
        self.salary_earner_analyzer = SalaryEarnerAnalyzer(self.df)
        return self.salary_earner_analyzer.generate_reports()

    def train_salary_prediction_models(self):
        """Train the prediction models on the accounts found by the report step."""
        self._require_data()
        logger.info("Starting model training")
        self.salary_predictor = SalaryPredictor(self.df)

        # The account lists come from the report step; run it on demand.
        if self.salary_earner_analyzer is None:
            logger.info("Salary earner analyzer not initialized. Generating reports first.")
            self.generate_salary_earner_reports()

        earner_analyzer = self.salary_earner_analyzer
        consistent_accounts = earner_analyzer.final_table['accountid'].unique()
        inconsistent_accounts = earner_analyzer.likely_salary_earner['accountid'].unique()
        self.salary_predictor.train_and_evaluate(consistent_accounts, inconsistent_accounts)

    def run_full_pipeline(self):
        """Run every stage in order; return True on success, False on any failure."""
        logger.info("Starting full pipeline execution")

        if not self.load_data():
            logger.error("Failed to load data. Exiting pipeline.")
            return False

        stages = (
            ("Running keyword analysis...", self.run_keyword_analysis),
            ("Running consistent amount analysis...", self.run_consistent_amount_analysis),
            ("Running transaction type analysis...", self.run_transaction_type_analysis),
            ("Generating salary earner reports...", self.generate_salary_earner_reports),
            ("Training salary prediction models...", self.train_salary_prediction_models),
        )
        try:
            for announcement, stage in stages:
                logger.info(announcement)
                stage()
        except Exception as e:
            logger.error(f"Pipeline failed: {str(e)}")
            return False

        logger.info("Pipeline completed successfully!")
        return True
def main():
    """Run the salary analytics pipeline as a script.

    Configures root logging at INFO so the pipeline's progress messages are
    visible; ``basicConfig`` does nothing if logging is already configured.

    Returns:
        True when the pipeline completed, False otherwise.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    )
    pipeline = SalaryAnalyticsPipeline()
    return pipeline.run_full_pipeline()


if __name__ == "__main__":
    # Report failure to the calling shell instead of always exiting with 0.
    raise SystemExit(0 if main() else 1)
+145
View File
@@ -0,0 +1,145 @@
"""
Salary earner analysis and report generation module.
"""
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn3
from datetime import datetime, timedelta
from .config import MODEL_CONFIG, OUTPUT_PATHS
class SalaryEarnerAnalyzer:
    """Combine the three hypothesis flags into salary earner reports.

    The frame is expected to carry the boolean columns ``is_salary_related``,
    ``is_consistent_amount`` and ``is_salary_type`` in addition to the
    transaction columns used below.
    """

    # Column order of the table built by generate_salary_earners_table().
    TABLE_COLUMNS = [
        'accountid', 'num_months', 'least_inflow_6m', 'avg_monthly_salary',
        'estimated_next_amount', 'estimated_next_date',
        '45daysalary', '2monthssalary',
    ]

    def __init__(self, df):
        self.df = df
        self.final_table = None
        self.likely_salary_earner = None
        self.high_earner_details = None

    def filter_venn_section(self, **kwargs):
        """Return the ``initiated_by == 'C'`` rows matching every given flag value.

        Args:
            **kwargs: ``flag_name=bool`` pairs; allowed names are
                ``is_salary_related``, ``is_consistent_amount`` and
                ``is_salary_type``.

        Raises:
            ValueError: if any other keyword is passed.
        """
        valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}
        invalid_keys = set(kwargs.keys()) - valid_columns
        if invalid_keys:
            raise ValueError(f"Invalid keys: {invalid_keys}. Valid keys are {valid_columns}.")

        df1 = self.df[self.df['initiated_by'] == 'C']
        condition = pd.Series(True, index=df1.index)
        for key, value in kwargs.items():
            condition &= (df1[key] == value)

        return df1[condition]

    def plot_hypothesis_overlap(self, hypothesis1_df, hypothesis3_df, hypothesis4_df, account_col='accountid'):
        """Plot and save a Venn diagram of the accounts flagged by each hypothesis."""
        set2 = set(hypothesis3_df[account_col][hypothesis3_df['is_consistent_amount']])
        set3 = set(hypothesis1_df[account_col][hypothesis1_df['is_salary_related']])
        set4 = set(hypothesis4_df[account_col][hypothesis4_df['is_salary_type']])

        plt.figure(figsize=(10, 10))
        try:
            venn3([set2, set3, set4], set_labels=('Consistent Amount',
                                                  'Salary Description', 'Transaction Type'))
            plt.title('Overlap Between Hypotheses')
            plt.savefig(OUTPUT_PATHS['hypothesis_overlap_plot'])
        finally:
            # Always release the figure, even if drawing or saving fails.
            plt.close()

    def generate_salary_earners_table(self, all_three_hypotheses, as_of=None):
        """Build one summary row per account from its transactions.

        Args:
            all_three_hypotheses: Transactions with 'accountid',
                'trx_start_date' (datetime) and 'amount'.
            as_of: Reference "now" for the recency windows; defaults to the
                current time. Read once so all windows use the same instant.

        Returns:
            DataFrame with ``TABLE_COLUMNS``. Rows with any missing value are
            dropped: accounts with no transaction in the last 180 days, and
            accounts with a single transaction (no interval to project the
            next date from). The latter previously raised, because a NaN
            median was passed to ``timedelta``.
        """
        now = pd.Timestamp(datetime.now() if as_of is None else as_of)
        six_months_ago = now - timedelta(days=180)
        two_months_ago = now - timedelta(days=60)

        results = []
        for accountid, group in all_three_hypotheses.groupby('accountid'):
            # Intervals are only meaningful in chronological order.
            dates = group['trx_start_date'].sort_values()
            amounts = group['amount']

            # NOTE: this counts transactions, not distinct calendar months.
            num_months = len(group)
            least_inflow = amounts[group['trx_start_date'] >= six_months_ago].min()
            avg_salary = amounts.mean()

            median_interval = dates.diff().dt.days.median()
            last_date = dates.max()
            if pd.isna(median_interval) or pd.isna(last_date):
                next_date = pd.NaT
            else:
                next_date = last_date + pd.Timedelta(days=float(median_interval))

            # Boolean flags
            days_since_last = (now - last_date).days
            has_45d = days_since_last <= 45
            has_2m = int((dates >= two_months_ago).sum()) >= 2

            results.append({
                'accountid': accountid,
                'num_months': num_months,
                'least_inflow_6m': least_inflow,
                'avg_monthly_salary': avg_salary,
                'estimated_next_amount': avg_salary,
                'estimated_next_date': next_date,
                '45daysalary': has_45d,
                '2monthssalary': has_2m
            })

        # Explicit columns keep the schema stable when there are no rows.
        final_df = pd.DataFrame(results, columns=self.TABLE_COLUMNS)
        return final_df.dropna()

    def analyze_salary_earners(self, final_df, high_earner_threshold=None):
        """Return ``(details, count)`` for accounts at or above the high-earner threshold.

        Args:
            final_df: Table from generate_salary_earners_table().
            high_earner_threshold: Minimum ``estimated_next_amount``;
                defaults to ``MODEL_CONFIG['high_earner_threshold']``.
        """
        if high_earner_threshold is None:
            high_earner_threshold = MODEL_CONFIG['high_earner_threshold']

        high_earners = final_df[final_df['estimated_next_amount'] >= high_earner_threshold]
        count_high = len(high_earners)
        high_earner_details = high_earners[['accountid', 'least_inflow_6m']].reset_index(drop=True)
        return high_earner_details, count_high

    def generate_reports(self):
        """Generate all salary earner reports, save them, and return them in a dict."""
        # Get accounts flagged by all three hypotheses
        all_three_hypotheses = self.filter_venn_section(
            is_salary_related=True,
            is_consistent_amount=True,
            is_salary_type=True
        )

        # Generate final table
        self.final_table = self.generate_salary_earners_table(all_three_hypotheses)
        print(f"Found {self.final_table['accountid'].nunique()} verified salary earners")

        # Generate likely salary earner table: salary type plus exactly one
        # of the other two hypotheses.
        green_section = self.filter_venn_section(
            is_salary_related=True,
            is_consistent_amount=False,
            is_salary_type=True
        )
        yellow_section = self.filter_venn_section(
            is_salary_related=False,
            is_consistent_amount=True,
            is_salary_type=True
        )

        likely_transactions = pd.concat([yellow_section, green_section])
        likely_transactions = likely_transactions.drop_duplicates(subset=['id'])
        self.likely_salary_earner = self.generate_salary_earners_table(likely_transactions)
        print(f"Found {self.likely_salary_earner['accountid'].nunique()} likely salary earners")

        # Analyze high earners
        self.high_earner_details, total_high_earners = self.analyze_salary_earners(self.final_table)
        print(f"\nTotal High Earners: {total_high_earners}")

        # Plot hypothesis overlap
        self.plot_hypothesis_overlap(
            self.df[self.df['is_salary_related']],
            self.df[self.df['is_consistent_amount']],
            self.df[self.df['is_salary_type']]
        )

        # Save reports
        self.high_earner_details.to_csv(OUTPUT_PATHS['high_earner_details'], index=False)
        self.likely_salary_earner.to_csv(OUTPUT_PATHS['likely_salary_earner'], index=False)
        self.final_table.to_csv(OUTPUT_PATHS['final_table'], index=False)

        return {
            'final_table': self.final_table,
            'likely_salary_earner': self.likely_salary_earner,
            'high_earner_details': self.high_earner_details,
            'total_high_earners': total_high_earners
        }
+160
View File
@@ -0,0 +1,160 @@
"""
Salary prediction module using machine learning.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from .config import OUTPUT_PATHS
class SalaryPredictor:
    """Train Random Forest models that predict an account's next monthly amount."""

    # Number of preceding months used as input for one prediction.
    WINDOW = 5
    # An account needs this many distinct months to be used at all.
    MIN_MONTHS = 12
    # Columns dropped before feature engineering (ignored when absent).
    UNUSED_COLUMNS = ['description', 'id', 'customer_id', 'trx_end_date',
                      'is_salary_related', 'is_consistent_amount', 'is_salary_type']

    def __init__(self, df):
        self.df = df
        self.model_cons = None
        self.model_incons = None
        self.scaler_cons = None
        self.scaler_incons = None

    def add_feature_engineering(self, df):
        """Return a copy of ``df`` with the engineered feature columns added.

        Added columns:
            month: calendar month of ``trx_start_date``.
            month_seq: 1-based chronological index of the (year, month)
                within each account. (``ngroup()`` was used before, which
                numbers groups across all accounts and ignores the year.)
            trx_type_<value>: one-hot columns for ``trx_type``; rows with a
                missing ``trx_type`` get zeros in every one of them.
            rolling_sum_3m / rolling_avg_3m: per-account rolling sum/mean of
                ``amount`` over the last 3 transactions in date order.

        The input frame is not modified; the result has a fresh 0..n-1 index
        so that every derived column aligns row by row.
        """
        df = df.sort_values(['accountid', 'trx_start_date']).reset_index(drop=True)

        dates = df['trx_start_date']
        df['month'] = dates.dt.month
        absolute_month = dates.dt.year * 12 + dates.dt.month
        df['month_seq'] = absolute_month.groupby(df['accountid']).rank(method='dense')

        # Categorical encoding. get_dummies keeps df's index, so the columns
        # cannot become misaligned when they are concatenated.
        encoded_df = pd.get_dummies(df['trx_type'], prefix='trx_type', dtype=float)
        df = pd.concat([df, encoded_df], axis=1)

        # Rolling statistics
        amounts_by_account = df.groupby('accountid')['amount']
        df['rolling_sum_3m'] = amounts_by_account.rolling(
            window=3, min_periods=1).sum().reset_index(level=0, drop=True)
        df['rolling_avg_3m'] = amounts_by_account.rolling(
            window=3, min_periods=1).mean().reset_index(level=0, drop=True)

        return df

    def prepare_data(self, df_transactions, accounts):
        """Build train/test arrays for the given accounts.

        Transactions are aggregated per account and month. For every account
        with at least ``MIN_MONTHS`` months, each sample is the flattened
        features of ``WINDOW`` consecutive months and the target is the next
        month's mean amount. Months 6-8 give training samples, months 9-12
        give test samples.

        Returns:
            ``(X_train, y_train, X_test, y_test)`` as numpy arrays; all four
            are empty when no account qualifies.
        """
        df_filtered = df_transactions[df_transactions['accountid'].isin(accounts)].copy()
        print(f"Filtered data for {len(accounts)} accounts.")
        print(f"Total transactions: {len(df_filtered)}")

        if df_filtered.empty:
            return np.array([]), np.array([]), np.array([]), np.array([])

        # Drop unnecessary columns; not every caller has all of them.
        df_filtered = df_filtered.drop(columns=self.UNUSED_COLUMNS, errors='ignore')

        # Add feature engineering
        df_filtered = self.add_feature_engineering(df_filtered)

        # Aggregate monthly data
        agg_funcs = {
            'amount': 'mean',
            'rolling_sum_3m': 'last',
            'rolling_avg_3m': 'last',
            'month': 'first'
        }
        encoded_cols = [col for col in df_filtered.columns if str(col).startswith('trx_type_')]
        for col in encoded_cols:
            agg_funcs[col] = 'sum'

        monthly_data = df_filtered.groupby(['accountid', 'month_seq']).agg(agg_funcs).reset_index()

        # Filter accounts with at least 12 months (month_seq is per account,
        # so its maximum is the number of distinct months).
        account_month_counts = monthly_data.groupby('accountid')['month_seq'].max()
        valid_accounts = account_month_counts[account_month_counts >= self.MIN_MONTHS].index

        # Create sequences
        # NOTE(review): 'accountid' is part of the feature vector, as before;
        # it must be numeric for scaling — confirm this is intended.
        feature_cols = ['accountid', 'amount', 'rolling_sum_3m', 'rolling_avg_3m',
                        'month'] + encoded_cols
        X_train, y_train, X_test, y_test = [], [], [], []

        for account in valid_accounts:
            account_data = monthly_data[monthly_data['accountid'] == account].sort_values('month_seq')
            features = account_data[feature_cols]
            targets = account_data['amount']
            for t in range(self.WINDOW, self.MIN_MONTHS):
                window = features.iloc[t - self.WINDOW:t].values.flatten()
                if t < 8:
                    X_train.append(window)
                    y_train.append(targets.iloc[t])
                else:
                    X_test.append(window)
                    y_test.append(targets.iloc[t])

        return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)

    def train_model(self, X_train, y_train, X_test, y_test):
        """Fit a scaler and a Random Forest, print test metrics, return ``(model, scaler)``."""
        # Scale features (the scaler is fitted on the training data only)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Train model
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train_scaled, y_train)

        # Evaluate
        y_pred = model.predict(X_test_scaled)
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R-squared: {r2:.2f}")
        return model, scaler

    def plot_predictions(self, y_test, y_pred, title, output_path):
        """Save a scatter plot of actual vs. predicted values to ``output_path``."""
        plt.figure(figsize=(10, 5))
        try:
            plt.scatter(y_test, y_pred, alpha=0.5)
            plt.xlabel("Actual Salary")
            plt.ylabel("Predicted Salary")
            plt.title(title)
            # Diagonal reference line: a perfect prediction lies on it.
            lowest, highest = min(y_test), max(y_test)
            plt.plot([lowest, highest], [lowest, highest], 'r--')
            plt.savefig(output_path)
        finally:
            # Always release the figure, even if drawing or saving fails.
            plt.close()

    def train_and_evaluate(self, consistent_accounts, inconsistent_accounts):
        """Train, evaluate and plot one model per account group.

        The fitted models and scalers are stored on ``model_cons`` /
        ``scaler_cons`` and ``model_incons`` / ``scaler_incons``. A group
        without any training sample is skipped with a message.
        """
        # Train model for consistent salary earners
        X_train_cons, y_train_cons, X_test_cons, y_test_cons = self.prepare_data(self.df, consistent_accounts)
        if len(X_train_cons) > 0:
            self.model_cons, self.scaler_cons = self.train_model(X_train_cons, y_train_cons, X_test_cons, y_test_cons)
            print("Model trained for consistent salary earners.")

            # Plot predictions
            X_test_cons_scaled = self.scaler_cons.transform(X_test_cons)
            y_pred = self.model_cons.predict(X_test_cons_scaled)
            self.plot_predictions(
                y_test_cons,
                y_pred,
                "Actual vs. Predicted Salary (Consistent Earners)",
                OUTPUT_PATHS['consistent_earners_plot']
            )
        else:
            print("No accounts with sufficient data for consistent salary earners.")

        # Train model for inconsistent salary earners
        X_train_incons, y_train_incons, X_test_incons, y_test_incons = self.prepare_data(self.df, inconsistent_accounts)
        if len(X_train_incons) > 0:
            print("\nTraining model for inconsistent salary earners...")
            self.model_incons, self.scaler_incons = self.train_model(X_train_incons, y_train_incons, X_test_incons, y_test_incons)

            # Plot predictions
            X_test_incons_scaled = self.scaler_incons.transform(X_test_incons)
            y_pred = self.model_incons.predict(X_test_incons_scaled)
            self.plot_predictions(
                y_test_incons,
                y_pred,
                "Actual vs. Predicted Salary (Inconsistent Earners)",
                OUTPUT_PATHS['inconsistent_earners_plot']
            )
        else:
            print("No accounts with sufficient data for inconsistent salary earners.")
@@ -0,0 +1,43 @@
"""
Transaction type analysis module.
"""
import pandas as pd
from .config import MODEL_CONFIG
class TransactionTypeAnalyzer:
    """Flags transactions whose type, subtype, initiator and amount fit the salary rule."""

    def __init__(self, df):
        self.df = df
        self.trx_df = None

    def flag_salary_type_transactions(self):
        """Add a boolean ``is_salary_type`` column in place and return the frame.

        A row is flagged when its type is 'T' or 'C', its subtype is one of
        'BI', 'I', 'BS', 'CI', ``initiated_by`` is 'C' and the amount is
        positive.
        """
        frame = self.df
        type_matches = frame['trx_type'].isin(['T', 'C'])
        subtype_matches = frame['trx_subtype'].isin(['BI', 'I', 'BS', 'CI'])
        initiated_by_c = frame['initiated_by'] == 'C'
        positive_amount = frame['amount'] > 0

        frame['is_salary_type'] = type_matches & subtype_matches & initiated_by_c & positive_amount
        self.trx_df = frame.copy()
        return frame

    def is_salary_earner_by_type(self, group, min_transactions=None, threshold=None):
        """Tell whether enough of a group's transactions are salary type.

        Returns False for groups smaller than ``min_transactions``; otherwise
        whether the share of flagged rows reaches ``threshold``. Both limits
        default to their ``MODEL_CONFIG`` entries.
        """
        minimum_rows = MODEL_CONFIG['min_transactions'] if min_transactions is None else min_transactions
        required_share = MODEL_CONFIG['threshold'] if threshold is None else threshold

        if len(group) < minimum_rows:
            return False
        flagged_share = group['is_salary_type'].mean()
        return flagged_share >= required_share

    def get_salary_type_data(self):
        """Return the rows flagged as salary type, flagging them first if needed."""
        if self.trx_df is None:
            self.flag_salary_type_transactions()
        snapshot = self.trx_df
        return snapshot[snapshot['is_salary_type']]