# AnalysisTesting/salary_analytics/salary_earner_analyzer.py

"""
Salary earner analysis and report generation module.
"""

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib_venn import venn3
from datetime import datetime, timedelta
from .config import MODEL_CONFIG, OUTPUT_PATHS

class SalaryEarnerAnalyzer:
    """Build salary-earner summary tables and reports from flagged transactions.

    The input frame is expected to carry the columns this class reads:
    ``initiated_by``, ``accountid``, ``trx_start_date``, ``amount``, ``id`` and
    the boolean hypothesis flags ``is_salary_related``,
    ``is_consistent_amount`` and ``is_salary_type``.
    """

    # Column order of the per-account summary table. Declared explicitly so an
    # empty result still has the columns that downstream code indexes into.
    _TABLE_COLUMNS = (
        'accountid',
        'num_months',
        'least_inflow_6m',
        'avg_monthly_salary',
        'estimated_next_amount',
        'estimated_next_date',
        '45daysalary',
        '2monthssalary',
    )

    def __init__(self, df):
        """Store the transaction frame and initialise the report slots.

        Args:
            df: Transaction-level ``pandas.DataFrame`` to analyse.
        """
        self.df = df
        # Populated by generate_reports().
        self.final_table = None
        self.likely_salary_earner = None
        self.high_earner_details = None

    def filter_venn_section(self, **kwargs):
        """Return transactions matching a combination of hypothesis flags.

        Only rows whose ``initiated_by`` equals ``'C'`` are considered. Rows
        with a missing ``accountid``, ``trx_start_date`` or ``amount`` are
        dropped from the result.

        Args:
            **kwargs: Required value for each flag, keyed by flag name. Allowed
                keys are ``is_salary_related``, ``is_consistent_amount`` and
                ``is_salary_type``. Flags that are not given are not filtered.

        Returns:
            A new ``pandas.DataFrame`` with the matching rows.

        Raises:
            ValueError: If any keyword is not one of the allowed flag names.
        """
        valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}

        # Validate before touching the data so bad calls fail cheaply.
        invalid_keys = set(kwargs.keys()) - valid_columns
        if invalid_keys:
            raise ValueError(f"Invalid keys: {invalid_keys}. Valid keys are {valid_columns}.")

        mask = self.df['initiated_by'] == 'C'
        for key, value in kwargs.items():
            mask = mask & (self.df[key] == value)

        # Boolean indexing and dropna both return new objects, so the caller
        # never receives a view onto self.df.
        critical_cols = ['accountid', 'trx_start_date', 'amount']
        return self.df[mask].dropna(subset=critical_cols)

    def plot_hypothesis_overlap(self, hypothesis1_df, hypothesis3_df, hypothesis4_df, account_col='accountid'):
        """Plot and save a Venn diagram of the accounts flagged by each hypothesis.

        The figure is written to ``OUTPUT_PATHS['hypothesis_overlap_plot']``.

        Args:
            hypothesis1_df: Frame with a boolean ``is_salary_related`` column.
            hypothesis3_df: Frame with a boolean ``is_consistent_amount`` column.
            hypothesis4_df: Frame with a boolean ``is_salary_type`` column.
            account_col: Name of the account identifier column in all frames.
        """
        consistent_accounts = set(hypothesis3_df[account_col][hypothesis3_df['is_consistent_amount']])
        described_accounts = set(hypothesis1_df[account_col][hypothesis1_df['is_salary_related']])
        typed_accounts = set(hypothesis4_df[account_col][hypothesis4_df['is_salary_type']])

        fig = plt.figure(figsize=(10, 10))
        try:
            venn3(
                [consistent_accounts, described_accounts, typed_accounts],
                set_labels=('Consistent Amount', 'Salary Description', 'Transaction Type'),
            )
            plt.title('Overlap Between Hypotheses')
            plt.savefig(OUTPUT_PATHS['hypothesis_overlap_plot'])
        finally:
            # Release the figure even if drawing or saving fails.
            plt.close(fig)

    def generate_salary_earners_table(self, all_three_hypotheses, reference_date=None):
        """Summarise salary transactions into one row per account.

        Args:
            all_three_hypotheses: Transaction frame with ``accountid``,
                ``trx_start_date`` (datetime-like) and ``amount`` columns.
            reference_date: Point in time that "now" is measured from. Defaults
                to ``datetime.now()`` taken once per call, so every account is
                evaluated against the same instant.

        Returns:
            ``pandas.DataFrame`` with columns ``accountid``, ``num_months``,
            ``least_inflow_6m``, ``avg_monthly_salary``,
            ``estimated_next_amount``, ``estimated_next_date``, ``45daysalary``
            and ``2monthssalary``. The frame has these columns even when no
            account is present in the input.
        """
        now = datetime.now() if reference_date is None else reference_date
        six_month_cutoff = now - timedelta(days=180)
        two_month_cutoff = now - timedelta(days=60)

        results = []
        for accountid, group in all_three_hypotheses.groupby('accountid'):
            if group.empty:
                continue

            # Intervals between payments are only meaningful in chronological
            # order; sort_values also yields a fresh frame, so the caller's
            # data is never modified.
            ordered = group.sort_values('trx_start_date')
            dates = ordered['trx_start_date']
            amounts = ordered['amount']

            # NOTE(review): this counts transactions, not distinct calendar
            # months - confirm that is the intended meaning of "num_months".
            num_months = len(ordered)

            # Smallest inflow in the 180 days before `now`; 0 when there was none.
            recent_amounts = amounts[dates >= six_month_cutoff]
            least_inflow = 0 if recent_amounts.empty else recent_amounts.min()

            avg_salary = amounts.mean() if amounts.notna().any() else 0

            # Median gap (in days) between consecutive payments. A single
            # transaction has no gap, so fall back to 30 days.
            median_interval = dates.diff().dt.days.median()
            if pd.isna(median_interval):
                median_interval = 30

            last_date = dates.max()
            next_date = last_date + timedelta(days=float(median_interval))

            # Recency flags: a payment within 45 days, and at least two
            # payments within 60 days, of `now`.
            has_45d = (now - last_date).days <= 45
            has_2m = int((dates >= two_month_cutoff).sum()) >= 2

            results.append({
                'accountid': accountid,
                'num_months': num_months,
                'least_inflow_6m': least_inflow,
                'avg_monthly_salary': avg_salary,
                'estimated_next_amount': avg_salary,
                'estimated_next_date': next_date,
                '45daysalary': has_45d,
                '2monthssalary': has_2m,
            })

        # Explicit columns keep the schema stable when `results` is empty;
        # without them the dropna below raises KeyError on a column-less frame.
        final_df = pd.DataFrame(results, columns=list(self._TABLE_COLUMNS))

        # Drop rows where all numeric columns are NaN.
        numeric_cols = ['num_months', 'least_inflow_6m', 'avg_monthly_salary', 'estimated_next_amount']
        return final_df.dropna(subset=numeric_cols, how='all')

    def analyze_salary_earners(self, final_df):
        """Select accounts whose estimated next salary meets the high-earner threshold.

        The threshold is read from ``MODEL_CONFIG['high_earner_threshold']``.

        Args:
            final_df: Table produced by ``generate_salary_earners_table``.

        Returns:
            Tuple ``(high_earner_details, count_high)``: a frame with the
            ``accountid`` and ``least_inflow_6m`` of each high earner (index
            reset), and the number of high earners.
        """
        is_high = final_df['estimated_next_amount'] >= MODEL_CONFIG['high_earner_threshold']
        high_earners = final_df[is_high].copy()
        high_earner_details = high_earners[['accountid', 'least_inflow_6m']].reset_index(drop=True)
        return high_earner_details, len(high_earners)

    def generate_reports(self):
        """Generate all salary earner reports, save them, and return them.

        Builds the verified table (all three flags true) and the likely table
        (``is_salary_type`` true plus exactly one of the other two flags),
        extracts the high earners from the verified table, saves the Venn plot
        and writes the three tables as CSV to the paths in ``OUTPUT_PATHS``.

        Returns:
            Dict with keys ``final_table``, ``likely_salary_earner``,
            ``high_earner_details`` and ``total_high_earners``.
        """
        # Accounts flagged by all three hypotheses.
        all_three_hypotheses = self.filter_venn_section(
            is_salary_related=True,
            is_consistent_amount=True,
            is_salary_type=True,
        )
        self.final_table = self.generate_salary_earners_table(all_three_hypotheses)
        print(f"Found {self.final_table['accountid'].nunique()} verified salary earners")

        # Likely salary earners: salary type plus exactly one other flag.
        green_section = self.filter_venn_section(
            is_salary_related=True,
            is_consistent_amount=False,
            is_salary_type=True,
        )
        yellow_section = self.filter_venn_section(
            is_salary_related=False,
            is_consistent_amount=True,
            is_salary_type=True,
        )
        likely_transactions = pd.concat([yellow_section, green_section])
        likely_transactions = likely_transactions.drop_duplicates(subset=['id'])
        self.likely_salary_earner = self.generate_salary_earners_table(likely_transactions)
        print(f"Found {self.likely_salary_earner['accountid'].nunique()} likely salary earners")

        # High earners among the verified salary earners.
        self.high_earner_details, total_high_earners = self.analyze_salary_earners(self.final_table)
        print(f"\nTotal High Earners: {total_high_earners}")

        # Venn diagram of the three hypotheses over the full frame.
        self.plot_hypothesis_overlap(
            self.df[self.df['is_salary_related']],
            self.df[self.df['is_consistent_amount']],
            self.df[self.df['is_salary_type']],
        )

        # Persist the reports.
        self.high_earner_details.to_csv(OUTPUT_PATHS['high_earner_details'], index=False)
        self.likely_salary_earner.to_csv(OUTPUT_PATHS['likely_salary_earner'], index=False)
        self.final_table.to_csv(OUTPUT_PATHS['final_table'], index=False)

        return {
            'final_table': self.final_table,
            'likely_salary_earner': self.likely_salary_earner,
            'high_earner_details': self.high_earner_details,
            'total_high_earners': total_high_earners,
        }