Files
AnalysisTesting/salary_analytics/salary_earner_analyzer.py
T
salakojoshua1234_gmail.com 9c429caa56 Implement streaming pipeline endpoint for batch processing
- Added `/run/streaming-pipeline` endpoint to process data in batches from either a database or CSV file.
- Introduced `BatchResponse` model for structured responses.
- Updated README with new endpoint details, including parameters and example usage.
- Enhanced error handling and logging during batch processing.
- Ensured data preprocessing and NaN handling in analysis functions.
2025-05-02 14:25:31 +01:00

168 lines
7.0 KiB
Python

"""
Salary earner analysis and report generation module.
"""
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn3
from datetime import datetime, timedelta
from .config import MODEL_CONFIG, OUTPUT_PATHS
class SalaryEarnerAnalyzer:
    """Build salary-earner tables and reports from a flagged transaction frame.

    The frame passed to the constructor is read for the columns
    ``initiated_by``, ``accountid``, ``trx_start_date``, ``amount``, ``id``
    and the three hypothesis flags ``is_salary_related``,
    ``is_consistent_amount`` and ``is_salary_type``.
    """

    # Columns of the per-account summary table, in output order. Passing them
    # explicitly keeps the table well-formed even when no account qualifies.
    SUMMARY_COLUMNS = [
        'accountid',
        'num_months',
        'least_inflow_6m',
        'avg_monthly_salary',
        'estimated_next_amount',
        'estimated_next_date',
        '45daysalary',
        '2monthssalary',
    ]

    # Interval (in days) assumed between payments when an account has too few
    # transactions to measure one.
    DEFAULT_INTERVAL_DAYS = 30

    def __init__(self, df):
        """Store the transaction frame and reset all report attributes."""
        self.df = df
        self.final_table = None
        self.likely_salary_earner = None
        self.high_earner_details = None

    def filter_venn_section(self, **kwargs):
        """Filter transactions on a combination of hypothesis flags.

        Only rows whose ``initiated_by`` equals ``'C'`` are considered. Each
        keyword names a flag column and the value that column must equal.
        Rows missing ``accountid``, ``trx_start_date`` or ``amount`` are
        dropped from the result.

        Args:
            **kwargs: Any of ``is_salary_related``, ``is_consistent_amount``
                and ``is_salary_type`` mapped to the required value.

        Returns:
            pandas.DataFrame: The matching rows (a new frame).

        Raises:
            ValueError: If a keyword is not one of the three flag columns.
        """
        valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}
        invalid_keys = set(kwargs) - valid_columns
        if invalid_keys:
            raise ValueError(f"Invalid keys: {invalid_keys}. Valid keys are {valid_columns}.")

        df1 = self.df[self.df['initiated_by'] == 'C'].copy()
        condition = pd.Series(True, index=df1.index)
        for key, value in kwargs.items():
            # A NaN flag compares unequal to any value, so it never matches.
            condition &= (df1[key] == value)

        # Drop any rows with NaN values in critical columns
        critical_cols = ['accountid', 'trx_start_date', 'amount']
        return df1[condition].dropna(subset=critical_cols)

    @staticmethod
    def _flagged_accounts(frame, flag_col, account_col):
        """Return the set of account ids whose ``flag_col`` is True."""
        # ``.eq(True)`` treats NaN as not flagged instead of raising on a
        # non-boolean mask.
        return set(frame.loc[frame[flag_col].eq(True), account_col])

    def plot_hypothesis_overlap(self, hypothesis1_df, hypothesis3_df, hypothesis4_df, account_col='accountid'):
        """Plot and save a Venn diagram of the accounts flagged by each hypothesis.

        Args:
            hypothesis1_df: Frame carrying the ``is_salary_related`` flag.
            hypothesis3_df: Frame carrying the ``is_consistent_amount`` flag.
            hypothesis4_df: Frame carrying the ``is_salary_type`` flag.
            account_col: Column holding the account identifier.

        The figure is written to ``OUTPUT_PATHS['hypothesis_overlap_plot']``.
        """
        set2 = self._flagged_accounts(hypothesis3_df, 'is_consistent_amount', account_col)
        set3 = self._flagged_accounts(hypothesis1_df, 'is_salary_related', account_col)
        set4 = self._flagged_accounts(hypothesis4_df, 'is_salary_type', account_col)

        fig = plt.figure(figsize=(10, 10))
        try:
            venn3([set2, set3, set4], set_labels=('Consistent Amount',
                                                  'Salary Description', 'Transaction Type'))
            plt.title('Overlap Between Hypotheses')
            plt.savefig(OUTPUT_PATHS['hypothesis_overlap_plot'])
        finally:
            # Release the figure even when drawing or saving fails.
            plt.close(fig)

    def generate_salary_earners_table(self, all_three_hypotheses):
        """Generate a per-account table of salary metrics.

        Args:
            all_three_hypotheses: Transactions with ``accountid``,
                ``trx_start_date`` and ``amount`` columns. Dates that are not
                already datetimes are parsed with ``pandas.to_datetime``; the
                caller's frame is not modified.
                NOTE(review): dates are compared against the naive
                ``datetime.now()``, so they are presumably timezone-naive -
                confirm against the data source.

        Returns:
            pandas.DataFrame: One row per account with the columns listed in
            ``SUMMARY_COLUMNS``. The columns are present even when the input
            has no rows.
        """
        frame = all_three_hypotheses
        if not pd.api.types.is_datetime64_any_dtype(frame['trx_start_date']):
            # ``assign`` returns a new frame, leaving the input untouched.
            frame = frame.assign(trx_start_date=pd.to_datetime(frame['trx_start_date']))

        # Take "now" once so every window below uses the same reference time.
        now = datetime.now()
        six_months_ago = now - timedelta(days=180)
        two_months_ago = now - timedelta(days=60)

        results = []
        for accountid, group in frame.groupby('accountid'):
            # Empty groups can appear for unobserved categories of a
            # categorical key.
            if group.empty:
                continue

            amounts = group['amount']
            # Sort so consecutive differences are real gaps between payments,
            # whatever order the rows arrived in.
            dates = group['trx_start_date'].sort_values()

            # NOTE: this counts transactions in the group, not distinct
            # calendar months.
            num_months = len(group)

            recent_amounts = amounts[group['trx_start_date'] >= six_months_ago]
            least_inflow = 0 if recent_amounts.empty else recent_amounts.min()

            avg_salary = amounts.mean() if amounts.notna().any() else 0

            median_interval = dates.diff().dt.days.median()
            if pd.isna(median_interval):
                # Single transaction: no interval to measure.
                median_interval = self.DEFAULT_INTERVAL_DAYS

            last_date = dates.max()
            next_date = last_date + timedelta(days=median_interval)

            days_since_last = (now - last_date).days
            has_45d = days_since_last <= 45
            has_2m = bool((group['trx_start_date'] >= two_months_ago).sum() >= 2)

            results.append({
                'accountid': accountid,
                'num_months': num_months,
                'least_inflow_6m': least_inflow,
                'avg_monthly_salary': avg_salary,
                'estimated_next_amount': avg_salary,
                'estimated_next_date': next_date,
                '45daysalary': has_45d,
                '2monthssalary': has_2m
            })

        final_df = pd.DataFrame(results, columns=self.SUMMARY_COLUMNS)
        # Drop rows where all numeric columns are NaN
        numeric_cols = ['num_months', 'least_inflow_6m', 'avg_monthly_salary', 'estimated_next_amount']
        return final_df.dropna(subset=numeric_cols, how='all')

    def analyze_salary_earners(self, final_df):
        """Select the high earners from a salary-earner table.

        An account is a high earner when its ``estimated_next_amount`` is at
        least ``MODEL_CONFIG['high_earner_threshold']``.

        Returns:
            tuple: ``(details, count)`` where ``details`` holds the
            ``accountid`` and ``least_inflow_6m`` of each high earner and
            ``count`` is the number of high earners.
        """
        threshold = MODEL_CONFIG['high_earner_threshold']
        high_earners = final_df[final_df['estimated_next_amount'] >= threshold].copy()
        high_earner_details = high_earners[['accountid', 'least_inflow_6m']].reset_index(drop=True)
        return high_earner_details, len(high_earners)

    def generate_reports(self):
        """Generate all salary earner reports and write them to disk.

        Builds the verified table (all three flags set), the likely table
        (salary type plus exactly one of the other two flags), the high-earner
        details and the hypothesis overlap plot, then saves the three tables
        as CSV files at the locations given by ``OUTPUT_PATHS``.

        Returns:
            dict: The keys ``final_table``, ``likely_salary_earner``,
            ``high_earner_details`` and ``total_high_earners``.
        """
        # Get accounts flagged by all three hypotheses
        all_three_hypotheses = self.filter_venn_section(
            is_salary_related=True,
            is_consistent_amount=True,
            is_salary_type=True
        )
        self.final_table = self.generate_salary_earners_table(all_three_hypotheses)
        print(f"Found {self.final_table['accountid'].nunique()} verified salary earners")

        # Generate likely salary earner table
        green_section = self.filter_venn_section(
            is_salary_related=True,
            is_consistent_amount=False,
            is_salary_type=True
        )
        yellow_section = self.filter_venn_section(
            is_salary_related=False,
            is_consistent_amount=True,
            is_salary_type=True
        )
        likely_rows = pd.concat([yellow_section, green_section])
        likely_rows = likely_rows.drop_duplicates(subset=['id'])
        self.likely_salary_earner = self.generate_salary_earners_table(likely_rows)
        print(f"Found {self.likely_salary_earner['accountid'].nunique()} likely salary earners")

        # Analyze high earners
        self.high_earner_details, total_high_earners = self.analyze_salary_earners(self.final_table)
        print(f"\nTotal High Earners: {total_high_earners}")

        # Plot hypothesis overlap; ``.eq(True)`` keeps rows with a NaN flag
        # from breaking the boolean mask.
        self.plot_hypothesis_overlap(
            self.df[self.df['is_salary_related'].eq(True)],
            self.df[self.df['is_consistent_amount'].eq(True)],
            self.df[self.df['is_salary_type'].eq(True)]
        )

        # Save reports
        self.high_earner_details.to_csv(OUTPUT_PATHS['high_earner_details'], index=False)
        self.likely_salary_earner.to_csv(OUTPUT_PATHS['likely_salary_earner'], index=False)
        self.final_table.to_csv(OUTPUT_PATHS['final_table'], index=False)

        return {
            'final_table': self.final_table,
            'likely_salary_earner': self.likely_salary_earner,
            'high_earner_details': self.high_earner_details,
            'total_high_earners': total_high_earners
        }