Added new salary-related terms and improved image outputs in salary.ipynb
This commit is contained in:
@@ -0,0 +1,145 @@
|
||||
"""
|
||||
Salary earner analysis and report generation module.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib_venn import venn3
|
||||
from datetime import datetime, timedelta
|
||||
from .config import MODEL_CONFIG, OUTPUT_PATHS
|
||||
|
||||
class SalaryEarnerAnalyzer:
    """Derive salary-earner tables and reports from a flagged transaction frame.

    The input frame is read for these columns: ``initiated_by``,
    ``accountid``, ``id``, ``amount``, ``trx_start_date`` and the three
    hypothesis flags ``is_salary_related``, ``is_consistent_amount`` and
    ``is_salary_type``.

    NOTE(review): ``trx_start_date`` is compared against a naive
    ``datetime.now()``, so it is presumably a naive datetime column -
    confirm against the loader.
    """

    # Column order of the table built by ``generate_salary_earners_table``.
    # Declared once so that an empty result still carries every column.
    _TABLE_COLUMNS = (
        'accountid',
        'num_months',
        'least_inflow_6m',
        'avg_monthly_salary',
        'estimated_next_amount',
        'estimated_next_date',
        '45daysalary',
        '2monthssalary',
    )

    def __init__(self, df):
        """Store the transaction frame; report attributes start as ``None``."""
        self.df = df
        self.final_table = None
        self.likely_salary_earner = None
        self.high_earner_details = None

    def filter_venn_section(self, **kwargs):
        """Filter rows by an exact combination of hypothesis flags.

        Only rows whose ``initiated_by`` equals ``'C'`` are considered.

        Args:
            **kwargs: Mapping of flag column name to the required value.
                Allowed names are ``is_salary_related``,
                ``is_consistent_amount`` and ``is_salary_type``.

        Returns:
            The subset of ``self.df`` matching every requested flag value.

        Raises:
            ValueError: If any keyword is not one of the allowed flag names.
        """
        valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}

        # Validate first so a bad call fails before any filtering work.
        invalid_keys = set(kwargs.keys()) - valid_columns
        if invalid_keys:
            raise ValueError(f"Invalid keys: {invalid_keys}. Valid keys are {valid_columns}.")

        candidates = self.df[self.df['initiated_by'] == 'C']

        # An explicit bool dtype keeps the mask usable even with zero rows.
        mask = pd.Series(True, index=candidates.index, dtype=bool)
        for column, expected in kwargs.items():
            mask &= candidates[column] == expected

        return candidates[mask]

    def plot_hypothesis_overlap(self, hypothesis1_df, hypothesis3_df, hypothesis4_df, account_col='accountid'):
        """Plot and save a Venn diagram of the accounts flagged by each hypothesis.

        Args:
            hypothesis1_df: Frame with an ``is_salary_related`` flag column.
            hypothesis3_df: Frame with an ``is_consistent_amount`` flag column.
            hypothesis4_df: Frame with an ``is_salary_type`` flag column.
            account_col: Name of the account identifier column.

        The figure is written to ``OUTPUT_PATHS['hypothesis_overlap_plot']``.
        """
        consistent_accounts = set(
            hypothesis3_df.loc[hypothesis3_df['is_consistent_amount'], account_col]
        )
        description_accounts = set(
            hypothesis1_df.loc[hypothesis1_df['is_salary_related'], account_col]
        )
        type_accounts = set(
            hypothesis4_df.loc[hypothesis4_df['is_salary_type'], account_col]
        )

        fig = plt.figure(figsize=(10, 10))
        try:
            venn3(
                [consistent_accounts, description_accounts, type_accounts],
                set_labels=('Consistent Amount', 'Salary Description', 'Transaction Type'),
            )
            plt.title('Overlap Between Hypotheses')
            plt.savefig(OUTPUT_PATHS['hypothesis_overlap_plot'])
        finally:
            # Always release the figure, even if drawing or saving fails.
            plt.close(fig)

    def generate_salary_earners_table(self, all_three_hypotheses):
        """Build one row of salary metrics per account.

        Args:
            all_three_hypotheses: Transaction rows with ``accountid``,
                ``amount`` and ``trx_start_date`` columns.

        Returns:
            A DataFrame with the columns in ``_TABLE_COLUMNS``. Rows with
            any missing metric are dropped; that covers accounts with no
            transaction in the last 180 days (no ``least_inflow_6m``) and
            accounts with a single transaction (no interval from which to
            estimate the next date). An empty input yields an empty frame
            that still has every column.
        """
        # One reference time so every cutoff in a run is mutually consistent.
        now = datetime.now()
        six_months_ago = now - timedelta(days=180)
        two_months_ago = now - timedelta(days=60)

        results = []
        for accountid, group in all_three_hypotheses.groupby('accountid'):
            # Sort so consecutive differences are real gaps between
            # successive transactions, whatever the input row order.
            dates = group['trx_start_date'].sort_values()
            amounts = group['amount']

            # NOTE: this is the number of transactions for the account; it
            # equals the number of months only if there is one per month.
            num_months = len(group)
            least_inflow = amounts[group['trx_start_date'] >= six_months_ago].min()
            avg_salary = amounts.mean()

            # Median gap in days; NaN when the account has one transaction.
            median_interval = dates.diff().dt.days.median()
            last_date = dates.max()
            if pd.isna(median_interval):
                # timedelta(days=NaN) raises, so mark the estimate missing.
                next_date = pd.NaT
            else:
                next_date = last_date + timedelta(days=float(median_interval))

            days_since_last = (now - last_date).days
            has_45d = bool(days_since_last <= 45)
            has_2m = bool((dates >= two_months_ago).sum() >= 2)

            results.append({
                'accountid': accountid,
                'num_months': num_months,
                'least_inflow_6m': least_inflow,
                'avg_monthly_salary': avg_salary,
                'estimated_next_amount': avg_salary,
                'estimated_next_date': next_date,
                '45daysalary': has_45d,
                '2monthssalary': has_2m,
            })

        final_df = pd.DataFrame(results, columns=list(self._TABLE_COLUMNS))
        return final_df.dropna()

    def analyze_salary_earners(self, final_df):
        """Select accounts whose estimated next amount meets the threshold.

        The threshold is ``MODEL_CONFIG['high_earner_threshold']``.

        Args:
            final_df: Table produced by ``generate_salary_earners_table``.

        Returns:
            A tuple ``(high_earner_details, count_high)``: a frame with
            ``accountid`` and ``least_inflow_6m`` for each high earner
            (index reset), and the number of such rows.
        """
        threshold = MODEL_CONFIG['high_earner_threshold']
        high_earners = final_df.loc[final_df['estimated_next_amount'] >= threshold]

        high_earner_details = high_earners[['accountid', 'least_inflow_6m']].reset_index(drop=True)
        return high_earner_details, len(high_earners)

    def generate_reports(self):
        """Generate all salary earner reports and write them to disk.

        Populates ``final_table``, ``likely_salary_earner`` and
        ``high_earner_details``, saves the overlap plot, and writes three
        CSV files to the paths in ``OUTPUT_PATHS``.

        Returns:
            A dict with keys ``final_table``, ``likely_salary_earner``,
            ``high_earner_details`` and ``total_high_earners``.
        """
        # Accounts flagged by all three hypotheses.
        all_three_hypotheses = self.filter_venn_section(
            is_salary_related=True,
            is_consistent_amount=True,
            is_salary_type=True,
        )
        self.final_table = self.generate_salary_earners_table(all_three_hypotheses)
        print(f"Found {self.final_table['accountid'].nunique()} verified salary earners")

        # Likely earners: salary type plus exactly one of the other two flags.
        green_section = self.filter_venn_section(
            is_salary_related=True,
            is_consistent_amount=False,
            is_salary_type=True,
        )
        yellow_section = self.filter_venn_section(
            is_salary_related=False,
            is_consistent_amount=True,
            is_salary_type=True,
        )
        likely_rows = pd.concat([yellow_section, green_section])
        likely_rows = likely_rows.drop_duplicates(subset=['id'])
        self.likely_salary_earner = self.generate_salary_earners_table(likely_rows)
        print(f"Found {self.likely_salary_earner['accountid'].nunique()} likely salary earners")

        # High earners among the verified salary earners.
        self.high_earner_details, total_high_earners = self.analyze_salary_earners(self.final_table)
        print(f"\nTotal High Earners: {total_high_earners}")

        # Overlap plot across the three hypotheses.
        self.plot_hypothesis_overlap(
            self.df[self.df['is_salary_related']],
            self.df[self.df['is_consistent_amount']],
            self.df[self.df['is_salary_type']],
        )

        # Persist the reports.
        self.high_earner_details.to_csv(OUTPUT_PATHS['high_earner_details'], index=False)
        self.likely_salary_earner.to_csv(OUTPUT_PATHS['likely_salary_earner'], index=False)
        self.final_table.to_csv(OUTPUT_PATHS['final_table'], index=False)

        return {
            'final_table': self.final_table,
            'likely_salary_earner': self.likely_salary_earner,
            'high_earner_details': self.high_earner_details,
            'total_high_earners': total_high_earners,
        }
|
||||
Reference in New Issue
Block a user