"""
Salary earner analysis and report generation module.
"""

from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import pandas as pd
from matplotlib_venn import venn3

from .config import MODEL_CONFIG, OUTPUT_PATHS


class SalaryEarnerAnalyzer:
    """Build salary-earner tables, a hypothesis-overlap plot and CSV reports.

    The analyzer works on a transaction-level DataFrame. The methods below
    read these columns from it: ``initiated_by``, ``accountid``, ``id``,
    ``amount``, ``trx_start_date`` and the three boolean hypothesis flags
    ``is_salary_related``, ``is_consistent_amount`` and ``is_salary_type``.
    """

    # Column order of the table built by ``generate_salary_earners_table``.
    # Declared explicitly so that an empty result still carries its columns.
    _TABLE_COLUMNS = (
        'accountid',
        'num_months',
        'least_inflow_6m',
        'avg_monthly_salary',
        'estimated_next_amount',
        'estimated_next_date',
        '45daysalary',
        '2monthssalary',
    )

    def __init__(self, df):
        """Store the transaction frame and initialise the report slots.

        Args:
            df: Transaction-level DataFrame to analyse.
        """
        self.df = df
        self.final_table = None
        self.likely_salary_earner = None
        self.high_earner_details = None

    def filter_venn_section(self, **kwargs):
        """Filter transactions by a combination of hypothesis flags.

        Only rows whose ``initiated_by`` equals ``'C'`` are considered.

        Args:
            **kwargs: Mapping of flag column name to the required value.
                Allowed keys are ``is_salary_related``,
                ``is_consistent_amount`` and ``is_salary_type``.

        Returns:
            The rows of ``self.df`` with ``initiated_by == 'C'`` that match
            every requested flag value.

        Raises:
            ValueError: If any keyword is not one of the allowed flag names.
        """
        valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}

        # Validate before touching the data so a bad call fails fast.
        invalid_keys = set(kwargs.keys()) - valid_columns
        if invalid_keys:
            raise ValueError(f"Invalid keys: {invalid_keys}. Valid keys are {valid_columns}.")

        df1 = self.df[self.df['initiated_by'] == 'C']

        condition = pd.Series(True, index=df1.index)
        for key, value in kwargs.items():
            condition &= (df1[key] == value)

        return df1[condition]

    def plot_hypothesis_overlap(self, hypothesis1_df, hypothesis3_df, hypothesis4_df,
                                account_col='accountid'):
        """Plot and save a Venn diagram of the accounts flagged by each hypothesis.

        The figure is written to ``OUTPUT_PATHS['hypothesis_overlap_plot']``.

        Args:
            hypothesis1_df: Frame providing ``is_salary_related``.
            hypothesis3_df: Frame providing ``is_consistent_amount``.
            hypothesis4_df: Frame providing ``is_salary_type``.
            account_col: Name of the account identifier column.
        """
        consistent_accounts = set(
            hypothesis3_df.loc[hypothesis3_df['is_consistent_amount'], account_col]
        )
        description_accounts = set(
            hypothesis1_df.loc[hypothesis1_df['is_salary_related'], account_col]
        )
        type_accounts = set(
            hypothesis4_df.loc[hypothesis4_df['is_salary_type'], account_col]
        )

        fig = plt.figure(figsize=(10, 10))
        try:
            venn3(
                [consistent_accounts, description_accounts, type_accounts],
                set_labels=('Consistent Amount', 'Salary Description', 'Transaction Type'),
            )
            plt.title('Overlap Between Hypotheses')
            plt.savefig(OUTPUT_PATHS['hypothesis_overlap_plot'])
        finally:
            # Always release the figure, even if drawing or saving fails.
            plt.close(fig)

    def generate_salary_earners_table(self, all_three_hypotheses, as_of=None):
        """Generate a per-account table of salary metrics.

        Args:
            all_three_hypotheses: Transaction rows to summarise. Must contain
                ``accountid``, ``amount`` and ``trx_start_date``.
            as_of: Reference "now" used for the 180/60/45-day windows.
                Defaults to ``datetime.now()`` evaluated once per call.
                # assumes trx_start_date is timezone-naive like the default
                # reference time — TODO confirm against the loader.

        Returns:
            DataFrame with one row per account and the columns listed in
            ``_TABLE_COLUMNS``. Rows containing any missing value are
            dropped, which removes accounts with no transaction in the last
            180 days and accounts with a single transaction (no interval to
            project the next date from). An empty input yields an empty
            frame that still has all columns.
        """
        now = datetime.now() if as_of is None else as_of
        six_months_ago = now - timedelta(days=180)
        two_months_ago = now - timedelta(days=60)

        results = []
        for accountid, group in all_three_hypotheses.groupby('accountid'):
            # Sort so that consecutive differences are real gaps between
            # successive transactions regardless of input row order.
            dates = group['trx_start_date'].sort_values()

            # NOTE(review): this counts transaction rows, not distinct
            # calendar months — confirm that is the intended meaning.
            num_months = len(group)

            recent_mask = group['trx_start_date'] >= six_months_ago
            least_inflow = group.loc[recent_mask, 'amount'].min()
            avg_salary = group['amount'].mean()

            # Median gap (in days) between consecutive transactions; NaN when
            # the account has only one transaction.
            median_interval = dates.diff().dt.days.median()
            last_date = dates.max()
            if pd.isna(median_interval):
                # No interval available: leave the projection missing so the
                # final dropna() removes the row instead of raising.
                next_date = pd.NaT
            else:
                next_date = last_date + pd.Timedelta(days=median_interval)
            next_amount = avg_salary

            # Boolean recency flags.
            days_since_last = (now - last_date).days
            has_45d = days_since_last <= 45
            has_2m = int((dates >= two_months_ago).sum()) >= 2

            results.append({
                'accountid': accountid,
                'num_months': num_months,
                'least_inflow_6m': least_inflow,
                'avg_monthly_salary': avg_salary,
                'estimated_next_amount': next_amount,
                'estimated_next_date': next_date,
                '45daysalary': has_45d,
                '2monthssalary': has_2m,
            })

        final_df = pd.DataFrame(results, columns=list(self._TABLE_COLUMNS))
        final_df = final_df.dropna()

        return final_df

    def analyze_salary_earners(self, final_df):
        """Identify high earners in a salary-earner table.

        An account is a high earner when its ``estimated_next_amount`` is at
        least ``MODEL_CONFIG['high_earner_threshold']``.

        Args:
            final_df: Table produced by ``generate_salary_earners_table``.

        Returns:
            Tuple ``(high_earner_details, count_high)`` where the first item
            holds ``accountid`` and ``least_inflow_6m`` of the high earners
            (index reset) and the second is their count.
        """
        threshold = MODEL_CONFIG['high_earner_threshold']
        high_earners = final_df[final_df['estimated_next_amount'] >= threshold].copy()

        high_earner_details = high_earners[['accountid', 'least_inflow_6m']].reset_index(drop=True)
        count_high = len(high_earners)

        return high_earner_details, count_high

    def generate_reports(self):
        """Generate all salary earner reports and write them to disk.

        Builds the verified table (all three flags true), the likely table
        (salary type plus exactly one of the other two flags), the high
        earner details and the overlap plot, then saves the three tables as
        CSV files at the paths given in ``OUTPUT_PATHS``.

        Returns:
            Dict with keys ``final_table``, ``likely_salary_earner``,
            ``high_earner_details`` and ``total_high_earners``.
        """
        # Get accounts flagged by all three hypotheses
        all_three_hypotheses = self.filter_venn_section(
            is_salary_related=True,
            is_consistent_amount=True,
            is_salary_type=True
        )

        # Generate final table
        self.final_table = self.generate_salary_earners_table(all_three_hypotheses)
        print(f"Found {self.final_table['accountid'].nunique()} verified salary earners")

        # Generate likely salary earner table
        green_section = self.filter_venn_section(
            is_salary_related=True,
            is_consistent_amount=False,
            is_salary_type=True
        )
        yellow_section = self.filter_venn_section(
            is_salary_related=False,
            is_consistent_amount=True,
            is_salary_type=True
        )

        self.likely_salary_earner = pd.concat([yellow_section, green_section])
        self.likely_salary_earner = self.likely_salary_earner.drop_duplicates(subset=['id'])
        self.likely_salary_earner = self.generate_salary_earners_table(self.likely_salary_earner)
        print(f"Found {self.likely_salary_earner['accountid'].nunique()} likely salary earners")

        # Analyze high earners
        self.high_earner_details, total_high_earners = self.analyze_salary_earners(self.final_table)
        print(f"\nTotal High Earners: {total_high_earners}")

        # Plot hypothesis overlap
        # NOTE(review): unlike the tables above, these frames are not
        # restricted to initiated_by == 'C' — confirm this is intended.
        self.plot_hypothesis_overlap(
            self.df[self.df['is_salary_related']],
            self.df[self.df['is_consistent_amount']],
            self.df[self.df['is_salary_type']]
        )

        # Save reports
        self.high_earner_details.to_csv(OUTPUT_PATHS['high_earner_details'], index=False)
        self.likely_salary_earner.to_csv(OUTPUT_PATHS['likely_salary_earner'], index=False)
        self.final_table.to_csv(OUTPUT_PATHS['final_table'], index=False)

        return {
            'final_table': self.final_table,
            'likely_salary_earner': self.likely_salary_earner,
            'high_earner_details': self.high_earner_details,
            'total_high_earners': total_high_earners
        }