# File: AnalysisTesting/salary.py (Python, 550 lines, 20 KiB)

from readline import redisplay
from sqlalchemy import create_engine, text
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import re
import seaborn as sns
from matplotlib_venn import venn3, venn2
from wordcloud import WordCloud
from datetime import datetime, timedelta
import os

# Connection settings for the transaction database.
# Each value may be overridden with a SALARYLOAN_DB_* environment variable;
# the literals below are only the fallbacks this script previously hard-coded.
# NOTE(review): the fallback password is committed in source control --
# prefer supplying it through the environment and rotating the credential.
DB_USER = os.environ.get("SALARYLOAN_DB_USER", "salaryloan")
DB_PASSWORD = os.environ.get("SALARYLOAN_DB_PASSWORD", "salaryloan")
DB_NAME = os.environ.get("SALARYLOAN_DB_NAME", "salaryloan")
DB_PORT = os.environ.get("SALARYLOAN_DB_PORT", "10532")
DB_HOST = os.environ.get("SALARYLOAN_DB_HOST", "dev-data.simbrellang.net")
DATABASE_URL = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(DATABASE_URL)

# Connectivity smoke test: a failure is reported but deliberately not fatal,
# so the script keeps its original best-effort behaviour.
try:
    with engine.connect() as conn:
        result = conn.execute(text("SELECT version();"))
        print("Connected successfully!")
except Exception as e:
    print("Error connecting to database:", e)

# Load the full transaction history table into memory.
# table_name is a constant defined here, so the f-string SQL is not built
# from external input.
table_name = "customer_account_transaction_hx"
df = pd.read_sql(f"SELECT * FROM {table_name}", engine)
df.head(10)
# Parse both transaction timestamp columns into pandas datetimes.
for date_col in ('trx_start_date', 'trx_end_date'):
    df[date_col] = pd.to_datetime(df[date_col])

# Replace the generic d1..d4 column names with descriptive ones.
column_names = {
    'd1': 'trx_type',
    'd2': 'trx_subtype',
    'd3': 'initiated_by',
    'd4': 'customer_id',
}
df = df.rename(columns=column_names)
# Salary/income-related terms searched for (as whole words) in transaction
# descriptions by identify_salary_transactions.
keywords = [
    # single-word income terms
    "salary", "payroll", "income",
    "wage", "wages",
    "earnings", "earning",
    # "pay" variants, spaced and unspaced
    "monthly pay", "net pay", "gross pay",
    "compensation",
    "monthlypay", "netpay", "grosspay",
    # other kinds of remuneration
    "remuneration", "stipend", "allowance",
    "bonus", "commission",
    "pension", "retirement", "dividend",
    "benefits", "reimbursement",
    "overtime", "incentive",
    "paycheck", "paycheque",
    # multi-word phrases
    "salary advance",
    "monthly income",
    "income tax refund",
    "employer deposit",
    "payroll deposit",
    "salary credit",
    "income credit",
    "salary transfer",
    "income transfer",
    "salary received",
    "income received",
    "hr deposit",
    "company deposit",
    "employer payment",
    "employee payment",
    # abbreviation
    "sal",
]
def identify_salary_transactions(df, keywords):
    """
    Flag transactions whose description looks salary-related.

    A description is flagged when it contains one of ``keywords`` as a whole
    word, or a month name/abbreviation followed by a 2-4 digit number
    (e.g. "JAN 2024", "march24"). Matching is case-insensitive.

    Args:
        df (pd.DataFrame): Transaction data with a 'description' column.
            The frame is modified in place.
        keywords (list): Salary/income-related keywords to search for. May be
            empty, in which case only the month-year patterns are used.

    Returns:
        pd.DataFrame: The same DataFrame object, with a boolean
        'is_salary_related' column added. Missing descriptions are False.
    """
    month_year_patterns = [
        r"\b(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)\s?\d{2,4}\b",
        r"\b(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)\s?\d{2,4}\b"
    ]
    alternatives = []
    escaped_keywords = [re.escape(keyword) for keyword in keywords]
    if escaped_keywords:
        # An empty alternation would match every description, so the keyword
        # branch is only added when there is at least one keyword.
        alternatives.append(r'\b(?:' + '|'.join(escaped_keywords) + r')\b')
    alternatives.extend(month_year_patterns)
    combined_pattern = '|'.join(alternatives)

    # case=False instead of lower-casing the text: the month patterns are
    # written in upper case and could never match a lower-cased description.
    df['is_salary_related'] = df['description'].str.contains(
        combined_pattern,
        case=False,
        na=False,
        regex=True
    )
    return df
# Flag every transaction by description, then keep only the flagged rows
# whose 'initiated_by' value is 'C'.
desc_df = identify_salary_transactions(df, keywords)
flagged_by_description = desc_df['is_salary_related'].eq(True)
initiated_by_c = desc_df['initiated_by'].eq('C')
desc_data = desc_df[flagged_by_description & initiated_by_c]
desc_data.head()
def calculate_coefficient_of_variation(group):
    """
    Compute the coefficient of variation (CV) of the 'amount' values of the
    rows in ``group`` whose 'initiated_by' is 'C'.

    Args:
        group (pd.DataFrame): Transactions with 'initiated_by' and 'amount'
            columns (callers pass the rows of a single account).

    Returns:
        float: Population standard deviation divided by the mean. NaN when
        the mean is exactly zero (and also when there are no matching rows,
        since both statistics are then NaN).
    """
    c_amounts = group.loc[group['initiated_by'] == 'C', 'amount']
    average = c_amounts.mean()
    if average == 0:
        # Division by zero is undefined; report "no CV" instead.
        return float('nan')
    # ddof=0 -> population (not sample) standard deviation.
    spread = c_amounts.std(ddof=0)
    return spread / average
def flag_consistent_amounts(group, cv_threshold=0.10):
    """
    Decide whether one account's 'C'-initiated amounts are consistent and
    broadcast that decision to every row of the account.

    Args:
        group (pd.DataFrame): Transactions for a single account.
        cv_threshold (float): Largest coefficient of variation still counted
            as consistent (default: 0.10).

    Returns:
        pd.Series: Series named 'is_consistent_amount', indexed like
        ``group``, holding the same boolean for every row. False when the CV
        is undefined (NaN).
    """
    # Restrict to 'C'-initiated rows before measuring the spread.
    c_rows = group[group['initiated_by'] == 'C']
    cv = calculate_coefficient_of_variation(c_rows)
    if pd.isna(cv):
        is_consistent = False
    else:
        is_consistent = cv <= cv_threshold
    row_count = len(group)
    return pd.Series(
        [is_consistent] * row_count,
        index=group.index,
        name='is_consistent_amount'
    )
def identify_consistent_amount_accounts(df, cv_threshold=0.10):
    """
    Flag every transaction that belongs to an account whose 'C'-initiated
    amounts vary little.

    For each account the coefficient of variation (population std / mean) of
    the amounts of its rows with initiated_by == 'C' is computed; the account
    is consistent when that CV is defined and <= ``cv_threshold``. Accounts
    with no 'C' rows, or with a zero mean, are not consistent.

    This is computed with vectorised group aggregations rather than
    ``groupby.apply`` returning one Series per group, because the latter
    yields a wide DataFrame instead of a Series when there is a single
    account.

    Args:
        df (pd.DataFrame): Transaction data with columns
            ['accountid', 'amount', 'initiated_by'].
        cv_threshold (float): Maximum allowed CV (default: 0.10).

    Returns:
        pd.Series: Boolean Series named 'is_consistent_amount', indexed by
        the original row labels of ``df``. Rows whose 'accountid' is missing
        are omitted. (Callers merge this Series back onto ``df`` by index.)
    """
    c_rows = df.loc[df['initiated_by'] == 'C']
    amounts_by_account = c_rows.groupby('accountid')['amount']
    # std / mean is NaN or inf when the mean is zero; both compare False
    # against the threshold, so such accounts are never flagged.
    cv = amounts_by_account.std(ddof=0) / amounts_by_account.mean()
    consistent_accounts = cv[cv <= cv_threshold].index

    has_account = df['accountid'].notna()
    flags = df.loc[has_account, 'accountid'].isin(consistent_accounts)
    return flags.rename('is_consistent_amount')
# Per-row consistency flags, joined back onto the transactions by row index.
consistency_flags = identify_consistent_amount_accounts(df, cv_threshold=0.10)
const_df = df.merge(consistency_flags, left_index=True, right_index=True)

# Keep the flag on the main frame as well (aligned on index).
df['is_consistent_amount'] = const_df['is_consistent_amount']

# Rows of consistent accounts that were initiated by 'C'.
in_consistent_account = const_df['is_consistent_amount']
initiated_by_c = const_df['initiated_by'] == 'C'
const_data = const_df[in_consistent_account & initiated_by_c]
def flag_salary_type_transactions(df):
    """
    Mark transactions whose type, subtype, initiator and amount all satisfy
    the salary criteria.

    A row is flagged when trx_type is 'T' or 'C', trx_subtype is one of
    'BI', 'I', 'BS', 'CI', initiated_by is 'C' and amount is positive.

    Args:
        df (pd.DataFrame): Transaction data with columns
            ['trx_type', 'trx_subtype', 'initiated_by', 'amount'].
            Modified in place.

    Returns:
        pd.DataFrame: The same DataFrame, with a boolean 'is_salary_type'
        column added.
    """
    type_ok = df['trx_type'].isin(['T', 'C'])
    subtype_ok = df['trx_subtype'].isin(['BI', 'I', 'BS', 'CI'])
    initiator_ok = df['initiated_by'] == 'C'
    positive_amount = df['amount'] > 0
    df['is_salary_type'] = type_ok & subtype_ok & initiator_ok & positive_amount
    return df
def is_salary_earner_by_type(group, min_transactions=3, threshold=0.7):
    """
    Judge whether an account looks like a salary earner from the share of
    its transactions flagged as salary-type.

    Args:
        group (pd.DataFrame): Transactions for a single account, with a
            boolean 'is_salary_type' column.
        min_transactions (int): Fewer rows than this never qualifies
            (default: 3).
        threshold (float): Minimum fraction of salary-type rows
            (default: 0.7).

    Returns:
        bool: True if the account meets both criteria, False otherwise.
    """
    too_few_rows = len(group) < min_transactions
    if too_few_rows:
        return False
    # The mean of a boolean column is the fraction of True values.
    salary_share = group['is_salary_type'].mean()
    return salary_share >= threshold
# Add the 'is_salary_type' flag, then keep only the flagged rows.
trx_df = flag_salary_type_transactions(df)
trx_data = trx_df.loc[trx_df['is_salary_type']]
def plot_hypothesis_overlap(hypothesis1_df,
                            hypothesis3_df, hypothesis4_df,
                            account_col='accountid'):
    """
    Draw a three-set Venn diagram of the accounts flagged by each hypothesis.

    Args:
        hypothesis1_df (pd.DataFrame): Rows with a boolean
            'is_salary_related' column (salary description hypothesis).
        hypothesis3_df (pd.DataFrame): Rows with a boolean
            'is_consistent_amount' column (consistent amount hypothesis).
        hypothesis4_df (pd.DataFrame): Rows with a boolean 'is_salary_type'
            column (transaction type hypothesis).
        account_col (str): Account identifier column.
    """
    def flagged_accounts(frame, flag_col):
        # Distinct account ids among the rows where the flag is True.
        return set(frame.loc[frame[flag_col], account_col])

    consistent_accounts = flagged_accounts(hypothesis3_df, 'is_consistent_amount')
    description_accounts = flagged_accounts(hypothesis1_df, 'is_salary_related')
    type_accounts = flagged_accounts(hypothesis4_df, 'is_salary_type')

    plt.figure(figsize=(10, 10))
    # Label order must follow the order of the sets.
    venn3(
        [consistent_accounts, description_accounts, type_accounts],
        set_labels=('Consistent Amount',
                    'Salary Description', 'Transaction Type'),
    )
    plt.title('Overlap Between Hypotheses')
    plt.show()
# Visualise how the three per-hypothesis account sets overlap.
plot_hypothesis_overlap(
    hypothesis1_df=desc_data,
    hypothesis3_df=const_data,
    hypothesis4_df=trx_data,
)
def filter_venn_section(df, **kwargs):
    """
    Select the 'C'-initiated rows that match a given combination of
    hypothesis flags (one region of the Venn diagram).

    Args:
        df (pd.DataFrame): DataFrame with 'initiated_by' and the flag columns
            ['is_salary_related', 'is_consistent_amount', 'is_salary_type'].
        **kwargs: Required value for each flag that should be constrained,
            e.g. is_salary_related=True, is_consistent_amount=False.
            Flags that are not given are left unconstrained.

    Returns:
        pd.DataFrame: The rows with initiated_by == 'C' whose flags equal the
        requested values.

    Raises:
        ValueError: If a keyword is not one of the three flag columns.
    """
    valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}
    c_rows = df.loc[df['initiated_by'] == 'C']

    invalid_keys = set(kwargs) - valid_columns
    if invalid_keys:
        raise ValueError(f"Invalid keys: {invalid_keys}. Valid keys are {valid_columns}.")

    # Start from "keep everything" and narrow once per requested flag.
    keep = pd.Series(True, index=c_rows.index)
    for flag_col, wanted in kwargs.items():
        keep = keep & (c_rows[flag_col] == wanted)
    return c_rows[keep]
# Salary description and salary type, but amounts are not consistent.
green_section = filter_venn_section(
    df,
    is_salary_related=True,
    is_consistent_amount=False,
    is_salary_type=True
)
# readline.redisplay() accepts no arguments and only refreshes the terminal
# input line, so calling it with a DataFrame raises TypeError; print the
# preview instead.
print(green_section.head(10))

# Consistent amounts and salary type, but no salary-like description.
yellow_section = filter_venn_section(
    df,
    is_salary_related=False,
    is_consistent_amount=True,
    is_salary_type=True
)

# Get accounts flagged by all three hypotheses
all_three_hypotheses = filter_venn_section(
    df,
    is_salary_related=True,
    is_consistent_amount=True,
    is_salary_type=True
)
def generate_salary_earners_table(all_three_hypotheses, now=None):
    """
    Build a per-account summary of salary-like transactions.

    Args:
        all_three_hypotheses (pd.DataFrame): Transactions with columns
            ['accountid', 'amount', 'trx_start_date'] (datetime).
        now (datetime, optional): Reference "current" time used for the
            recency windows. Defaults to ``datetime.now()``; pass a fixed
            value for reproducible results.

    Returns:
        pd.DataFrame: One row per account with columns
            accountid, num_months (number of transactions in the group),
            least_inflow_6m (smallest amount in the last 180 days),
            avg_monthly_salary (mean amount), estimated_next_amount (same
            mean), estimated_next_date (last date + median gap in days),
            45daysalary (last transaction within 45 days) and 2monthssalary
            (at least two transactions in the last 60 days).
        Rows containing any missing value are dropped, e.g. accounts with no
        transaction in the last 180 days or with a single transaction (no
        interval to estimate the next date from). The columns are present
        even when no account qualifies.
    """
    columns = [
        'accountid', 'num_months', 'least_inflow_6m', 'avg_monthly_salary',
        'estimated_next_amount', 'estimated_next_date',
        '45daysalary', '2monthssalary',
    ]
    # Take "now" once so every account is measured against the same instant.
    now = datetime.now() if now is None else now
    six_months_ago = now - timedelta(days=180)
    two_months_ago = now - timedelta(days=60)

    results = []
    for accountid, group in all_three_hypotheses.groupby('accountid'):
        amounts = group['amount']
        # Gaps between consecutive transactions are only meaningful in
        # chronological order.
        dates = group['trx_start_date'].sort_values()

        num_months = len(group)
        recent_amounts = group.loc[group['trx_start_date'] >= six_months_ago, 'amount']
        least_inflow = recent_amounts.min()
        avg_salary = amounts.mean()

        # Estimated next salary: last date plus the median gap (in days).
        median_interval = dates.diff().dt.days.median()
        last_date = dates.max()
        if pd.isna(median_interval):
            # A single transaction has no gap; timedelta(days=NaN) would
            # raise, so record a missing date (dropped below).
            next_date = pd.NaT
        else:
            next_date = last_date + timedelta(days=float(median_interval))
        next_amount = avg_salary

        # Boolean recency flags
        days_since_last = (now - last_date).days
        has_45d = days_since_last <= 45
        has_2m = int((group['trx_start_date'] >= two_months_ago).sum()) >= 2

        results.append({
            'accountid': accountid,
            'num_months': num_months,
            'least_inflow_6m': least_inflow,
            'avg_monthly_salary': avg_salary,
            'estimated_next_amount': next_amount,
            'estimated_next_date': next_date,
            '45daysalary': has_45d,
            '2monthssalary': has_2m
        })

    # Explicit columns keep the schema stable for an empty result.
    final_df = pd.DataFrame(results, columns=columns)
    final_df = final_df.dropna()
    return final_df
# Accounts flagged by all three hypotheses.
final_table = generate_salary_earners_table(all_three_hypotheses)
verified_count = final_table['accountid'].nunique()
print(f"Found {verified_count} verified salary earners")

# Accounts in the yellow or green section; a transaction id is kept once.
likely_salary_earner = generate_salary_earners_table(
    pd.concat([yellow_section, green_section]).drop_duplicates(subset=['id'])
)
likely_count = likely_salary_earner['accountid'].nunique()
print(f"Found {likely_count} likely salary earners")
def analyze_salary_earners(final_df, min_salary=10000):
    """
    Pick out the high earners from a salary-earner summary table.

    Args:
        final_df (pd.DataFrame): Summary with columns
            ['accountid', 'least_inflow_6m', 'estimated_next_amount'].
        min_salary (float): Minimum 'estimated_next_amount' (inclusive) for
            an account to count as a high earner (default: 10000).

    Returns:
        tuple: (high_earner_details, count_high) where high_earner_details is
        a DataFrame with columns ['accountid', 'least_inflow_6m'] and a fresh
        0..n-1 index, and count_high is the number of high earners.
    """
    is_high = final_df['estimated_next_amount'] >= min_salary
    # Selecting the two columns directly avoids writing into a filtered
    # slice (the previous self-assignment there changed nothing).
    high_earner_details = final_df.loc[
        is_high, ['accountid', 'least_inflow_6m']
    ].reset_index(drop=True)
    count_high = len(high_earner_details)
    return high_earner_details, count_high
# High earners among the verified salary earners.
high_earner_details_df, total_high_earners = analyze_salary_earners(final_table)
print(f"\nTotal High Earners: {total_high_earners}")

# Write each result table to its CSV file (no index column).
csv_exports = (
    (high_earner_details_df, 'high_earner_details.csv'),
    (likely_salary_earner, 'likely_salary_earner.csv'),
    (final_table, 'final_table.csv'),
)
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
def add_feature_engineering(df):
    """
    Add the features used by the salary prediction model.

    Added columns:
        month: calendar month (1-12) of 'trx_start_date'.
        month_seq: 1-based chronological index of the transaction's calendar
            month (year and month) within its account; months without
            transactions are not counted. 0 when the date is missing.
        trx_type_*: one-hot encoding of 'trx_type'.
        rolling_sum_3m / rolling_avg_3m: sum and mean of 'amount' over the
            current and up to two preceding transactions of the same account
            in date order (the window counts rows, not calendar months).

    Args:
        df (pd.DataFrame): Transaction data with columns
            ['accountid', 'trx_start_date', 'trx_type', 'amount'].
            It is not modified.

    Returns:
        pd.DataFrame: A copy of the input with the engineered features
        added, sorted by account and date.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()
    start = df['trx_start_date']
    df['month'] = start.dt.month

    # Number each account's months independently and across years. A global
    # group number over (account, month-of-year) would neither start at 1
    # for every account nor tell January 2023 from January 2024.
    year_month = start.dt.year * 12 + start.dt.month
    df['month_seq'] = (
        year_month.groupby(df['accountid'])
        .rank(method='dense')
        .fillna(0)
        .astype(int)
    )

    # Categorical encoding: one-hot encode trx_type
    encoder = OneHotEncoder(sparse_output=False)
    encoded_trx_type = encoder.fit_transform(df[['trx_type']])
    encoded_df = pd.DataFrame(
        encoded_trx_type,
        columns=encoder.get_feature_names_out(['trx_type']),
        # Reuse the row labels: concat aligns on the index, and a fresh
        # 0..n-1 index would misalign with an already filtered frame.
        index=df.index,
    )
    df = pd.concat([df, encoded_df], axis=1)

    # Rolling statistics: sort by account and date
    df = df.sort_values(['accountid', 'trx_start_date'])
    amount_by_account = df.groupby('accountid')['amount']
    df['rolling_sum_3m'] = amount_by_account.rolling(
        window=3, min_periods=1).sum().reset_index(0, drop=True)
    df['rolling_avg_3m'] = amount_by_account.rolling(
        window=3, min_periods=1).mean().reset_index(0, drop=True)
    return df
def prepare_data(df_transactions, accounts, *, window=5, n_train=3, n_test=4):
    """
    Build sliding-window training and testing sets for salary prediction.

    Transactions of the given accounts are feature-engineered and aggregated
    to one row per (account, month_seq). For every account with enough
    monthly rows, each sample is the flattened feature rows of ``window``
    consecutive months and its target is the mean amount of the following
    month. The first ``n_train`` targets go to the training set and the next
    ``n_test`` to the testing set; later months are not used.

    Args:
        df_transactions (pd.DataFrame): All transaction data.
        accounts (list): Account IDs to include.
        window (int): Number of preceding months per sample (default: 5).
        n_train (int): Training samples per account (default: 3).
        n_test (int): Testing samples per account (default: 4).

    Returns:
        tuple: (X_train, y_train, X_test, y_test) as NumPy arrays. The arrays
        are empty when no account has ``window + n_train + n_test`` monthly
        rows (12 with the defaults).

    Raises:
        ValueError: If ``window`` is < 1 or a sample count is negative.
    """
    if window < 1 or n_train < 0 or n_test < 0:
        raise ValueError("window must be >= 1 and n_train/n_test must be >= 0")

    df_filtered = df_transactions[df_transactions['accountid'].isin(accounts)].copy()
    print(f"Filtered data for {len(accounts)} accounts.")
    print(f"Total transactions: {len(df_filtered)}")

    # Drop unnecessary columns; tolerate ones that were never added so the
    # function also works on a frame without the hypothesis flags.
    df_filtered = df_filtered.drop(
        columns=['description', 'id', 'customer_id',
                 'trx_end_date', 'is_salary_related',
                 'is_consistent_amount', 'is_salary_type'],
        errors='ignore',
    )

    # Add feature engineering
    df_filtered = add_feature_engineering(df_filtered)

    # Aggregate monthly data with new features
    agg_funcs = {
        'amount': 'mean',
        'rolling_sum_3m': 'last',
        'rolling_avg_3m': 'last',
        'month': 'first'
    }
    encoded_cols = [col for col in df_filtered.columns if col.startswith('trx_type_')]
    agg_funcs.update(dict.fromkeys(encoded_cols, 'sum'))
    monthly_data = df_filtered.groupby(['accountid', 'month_seq']).agg(agg_funcs).reset_index()

    # NOTE(review): 'accountid' is itself fed to the model as a feature;
    # confirm this is intended (and numeric) before relying on the model.
    feature_cols = ['accountid', 'amount', 'rolling_sum_3m', 'rolling_avg_3m',
                    'month'] + encoded_cols

    first_test_target = window + n_train
    required_months = first_test_target + n_test

    # Create training and testing sequences
    X_train, y_train, X_test, y_test = [], [], [], []
    for account, account_data in monthly_data.groupby('accountid'):
        # Count the monthly rows themselves; the largest month_seq value is
        # not a reliable measure of how many months an account has.
        if len(account_data) < required_months:
            print(f"Skipping account {account} due to insufficient data (less than {required_months} months).")
            continue
        account_data = account_data.sort_values('month_seq')
        features = account_data[feature_cols]
        targets = account_data['amount']
        for t in range(window, first_test_target):
            X_train.append(features.iloc[t - window:t].values.flatten())
            y_train.append(targets.iloc[t])
        for t in range(first_test_target, required_months):
            X_test.append(features.iloc[t - window:t].values.flatten())
            y_test.append(targets.iloc[t])
    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)
def train_model(X_train, y_train, X_test, y_test):
    """
    Fit a Random Forest regressor on standardised features and report its
    test-set error.

    The scaler is fitted on the training features only and then applied to
    the test features. MAE, RMSE and R-squared on the test set are printed.

    Args:
        X_train (np.ndarray): Training data features.
        y_train (np.ndarray): Training data target (salary).
        X_test (np.ndarray): Testing data features.
        y_test (np.ndarray): Testing data target (salary).

    Returns:
        tuple: (model, scaler) -- the fitted regressor and the fitted
        StandardScaler.
    """
    # Standardise features using training-set statistics only.
    scaler = StandardScaler()
    train_features = scaler.fit_transform(X_train)
    test_features = scaler.transform(X_test)

    # Fixed random_state keeps the forest reproducible between runs.
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(train_features, y_train)

    # Evaluate on the held-out samples.
    predictions = model.predict(test_features)
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R-squared: {r2:.2f}")
    return model, scaler
def _plot_actual_vs_predicted(y_true, y_hat):
    """Scatter predicted against actual salaries with a y = x reference line."""
    plt.figure(figsize=(10, 5))
    plt.scatter(y_true, y_hat, alpha=0.5)
    plt.xlabel("Actual Salary")
    plt.ylabel("Predicted Salary")
    plt.title("Actual vs. Predicted Salary")
    # Dashed diagonal: points on it are perfect predictions.
    plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], 'r--')
    plt.show()


# --- Model for accounts flagged by all three hypotheses ---
consistent_accounts = final_table['accountid'].unique()
X_train_cons, y_train_cons, X_test_cons, y_test_cons = prepare_data(df, consistent_accounts)
if len(X_train_cons) > 0:
    model_cons, scaler_cons = train_model(X_train_cons, y_train_cons, X_test_cons, y_test_cons)
    print("Model trained for consistent salary earners.")
    # Predict and plot only when a model exists; doing this unconditionally
    # raised NameError whenever no account had enough data.
    X_test_cons_scaled = scaler_cons.transform(X_test_cons)
    y_pred = model_cons.predict(X_test_cons_scaled)
    _plot_actual_vs_predicted(y_test_cons, y_pred)
else:
    print("No accounts with sufficient data for consistent salary earners.")

# --- Model for the "likely" salary earners ---
inconsistent_accounts = likely_salary_earner['accountid'].unique()
X_train_incons, y_train_incons, X_test_incons, y_test_incons = prepare_data(df, inconsistent_accounts)
if len(X_train_incons) > 0:
    print("\nTraining model for inconsistent salary earners...")
    model_incons, scaler_incons = train_model(X_train_incons, y_train_incons, X_test_incons, y_test_incons)
    X_test_incons_scaled = scaler_incons.transform(X_test_incons)
    y_pred = model_incons.predict(X_test_incons_scaled)
    _plot_actual_vs_predicted(y_test_incons, y_pred)
else:
    print("No accounts with sufficient data for inconsistent salary earners.")