""" Salary earner analysis and report generation module. """ import pandas as pd import matplotlib.pyplot as plt from matplotlib_venn import venn3 from datetime import datetime, timedelta import logging from .config import MODEL_CONFIG, OUTPUT_PATHS # Configure logging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) logger = logging.getLogger(__name__) class SalaryEarnerAnalyzer: def __init__(self, df): self.df = df self.final_table = None self.likely_salary_earner = None self.high_earner_details = None def filter_venn_section(self, **kwargs): """Filter accounts based on specified combinations of hypothesis flags.""" valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'} df1 = self.df[self.df['initiated_by'] == 'C'].copy() invalid_keys = set(kwargs.keys()) - valid_columns if invalid_keys: raise ValueError(f"Invalid keys: {invalid_keys}. Valid keys are {valid_columns}.") condition = pd.Series([True] * len(df1), index=df1.index) for key, value in kwargs.items(): condition &= (df1[key] == value) filtered_df = df1[condition] # Drop any rows with NaN values in critical columns critical_cols = ['accountid', 'trx_start_date', 'amount'] filtered_df = filtered_df.dropna(subset=critical_cols) return filtered_df def plot_hypothesis_overlap(self, hypothesis1_df, hypothesis3_df, hypothesis4_df, account_col='accountid'): """Plot and save Venn diagram showing overlap between hypotheses.""" set2 = set(hypothesis3_df[account_col][hypothesis3_df['is_consistent_amount']]) set3 = set(hypothesis1_df[account_col][hypothesis1_df['is_salary_related']]) set4 = set(hypothesis4_df[account_col][hypothesis4_df['is_salary_type']]) plt.figure(figsize=(10, 10)) venn3([set2, set3, set4], set_labels=('Consistent Amount', 'Salary Description', 'Transaction Type')) plt.title('Overlap Between Hypotheses') plt.savefig(OUTPUT_PATHS['hypothesis_overlap_plot']) plt.close() def generate_salary_earners_table(self, all_three_hypotheses): """Generate a table of salary earners with their metrics.""" results = [] for accountid, group in all_three_hypotheses.groupby('accountid'): # Skip if group is empty if group.empty: continue # Calculate required metrics num_months = len(group) # Handle last 6 months calculation last_6_months = group[group['trx_start_date'] >= (datetime.now() - timedelta(days=180))] if last_6_months.empty: least_inflow = 0 else: least_inflow = last_6_months['amount'].min() # Handle average salary calculation if group['amount'].notna().any(): avg_salary = group['amount'].mean() else: avg_salary = 0 # Calculate days_since_last_trx with NaN handling group['days_since_last_trx'] = group['trx_start_date'].diff().dt.days median_interval = group['days_since_last_trx'].median() if pd.isna(median_interval): median_interval = 30 # Default to 30 days if no interval data last_date = group['trx_start_date'].max() next_date = last_date + timedelta(days=median_interval) next_amount = avg_salary # Boolean flags with NaN handling days_since_last = (datetime.now() - last_date).days has_45d = days_since_last <= 45 has_2m = len(group[group['trx_start_date'] >= (datetime.now() - timedelta(days=60))]) >= 2 results.append({ 'accountid': accountid, 'num_months': num_months, 'least_inflow_6m': least_inflow, 'avg_monthly_salary': avg_salary, 'estimated_next_amount': next_amount, 'estimated_next_date': next_date, '45daysalary': has_45d, '2monthssalary': has_2m }) final_df = pd.DataFrame(results) # Drop rows where all numeric columns are NaN numeric_cols = ['num_months', 'least_inflow_6m', 'avg_monthly_salary', 'estimated_next_amount'] final_df = final_df.dropna(subset=numeric_cols, how='all') return final_df def analyze_salary_earners(self, final_df): """Analyze salary earners and identify high earners.""" high_earners = final_df[final_df['estimated_next_amount'] >= MODEL_CONFIG['high_earner_threshold']].copy() high_earner_details = high_earners[['accountid', 'least_inflow_6m']].reset_index(drop=True) count_high = len(high_earners) return high_earner_details, count_high def generate_reports(self): """Generate all salary earner reports.""" # Get accounts flagged by all three hypotheses all_three_hypotheses = self.filter_venn_section( is_salary_related=True, is_consistent_amount=True, is_salary_type=True ) # Generate final table self.final_table = self.generate_salary_earners_table(all_three_hypotheses) logger.info(f"Found {self.final_table['accountid'].nunique()} verified salary earners") # Generate likely salary earner table green_section = self.filter_venn_section( is_salary_related=True, is_consistent_amount=False, is_salary_type=True ) yellow_section = self.filter_venn_section( is_salary_related=False, is_consistent_amount=True, is_salary_type=True ) self.likely_salary_earner = pd.concat([yellow_section, green_section]) self.likely_salary_earner = self.likely_salary_earner.drop_duplicates(subset=['id']) self.likely_salary_earner = self.generate_salary_earners_table(self.likely_salary_earner) logger.info(f"Found {self.likely_salary_earner['accountid'].nunique()} likely salary earners") # Analyze high earners self.high_earner_details, total_high_earners = self.analyze_salary_earners(self.final_table) logger.info(f"\nTotal High Earners: {total_high_earners}") # Plot hypothesis overlap self.plot_hypothesis_overlap( self.df[self.df['is_salary_related']], self.df[self.df['is_consistent_amount']], self.df[self.df['is_salary_type']] ) # Save reports self.high_earner_details.to_csv(OUTPUT_PATHS['high_earner_details'], index=False) self.likely_salary_earner.to_csv(OUTPUT_PATHS['likely_salary_earner'], index=False) self.final_table.to_csv(OUTPUT_PATHS['final_table'], index=False) return { 'final_table': self.final_table, 'likely_salary_earner': self.likely_salary_earner, 'high_earner_details': self.high_earner_details, 'total_high_earners': total_high_earners }