1a4e539626
- Introduced `DatabaseOperations` class for managing batch results in the database. - Added functionality to create a batch results table and save batch processing results. - Updated API endpoints to log execution time and handle batch processing errors more effectively. - Improved response handling in analysis endpoints and added batch metadata to results. - Suppressed warnings and improved logging throughout the application.
176 lines
7.2 KiB
Python
176 lines
7.2 KiB
Python
"""
|
|
Salary earner analysis and report generation module.
|
|
"""
|
|
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
from matplotlib_venn import venn3
|
|
from datetime import datetime, timedelta
|
|
import logging
|
|
from .config import MODEL_CONFIG, OUTPUT_PATHS
|
|
|
|
# Logging setup: records are emitted at INFO and above, formatted as
# "<timestamp> - <logger name> - <level> - <message>".
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
|
|
class SalaryEarnerAnalyzer:
    """Build salary-earner tables and reports from a flagged transaction frame.

    The frame passed to the constructor is read by the methods below through
    these columns: ``initiated_by``, ``accountid``, ``trx_start_date``,
    ``amount``, ``id`` and the three boolean hypothesis flags
    ``is_salary_related``, ``is_consistent_amount`` and ``is_salary_type``.
    ``trx_start_date`` is compared against ``datetime.now()``, so it is
    assumed to hold naive datetimes -- TODO confirm against the loader.

    Attributes:
        df: The transaction frame supplied by the caller.
        final_table: Per-account summary of rows matching all three flags
            (set by ``generate_reports``).
        likely_salary_earner: Per-account summary of rows matching the
            transaction-type flag plus exactly one of the other two flags
            (set by ``generate_reports``).
        high_earner_details: ``accountid`` / ``least_inflow_6m`` of accounts
            at or above the configured threshold (set by
            ``generate_reports``).
    """

    # Column order of the per-account summary table. Declared once so that an
    # empty result still has the schema downstream code indexes into.
    _SUMMARY_COLUMNS = (
        'accountid',
        'num_months',
        'least_inflow_6m',
        'avg_monthly_salary',
        'estimated_next_amount',
        'estimated_next_date',
        '45daysalary',
        '2monthssalary',
    )

    # Interval (in days) assumed when an account has fewer than two dated
    # rows, so no gap between payments can be measured.
    _DEFAULT_INTERVAL_DAYS = 30

    def __init__(self, df):
        self.df = df
        self.final_table = None
        self.likely_salary_earner = None
        self.high_earner_details = None

    def filter_venn_section(self, **kwargs):
        """Return rows with ``initiated_by == 'C'`` matching the given flags.

        Args:
            **kwargs: Any of ``is_salary_related``, ``is_consistent_amount``,
                ``is_salary_type`` mapped to the value the column must equal.
                Flags that are not passed are not filtered on.

        Returns:
            A new DataFrame of matching rows, excluding rows with a missing
            ``accountid``, ``trx_start_date`` or ``amount``.

        Raises:
            ValueError: If a keyword is not one of the three flag columns.
        """
        valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}

        # Validate before touching the data so a bad call fails fast.
        invalid_keys = set(kwargs) - valid_columns
        if invalid_keys:
            raise ValueError(f"Invalid keys: {invalid_keys}. Valid keys are {valid_columns}.")

        candidate_rows = self.df[self.df['initiated_by'] == 'C']

        mask = pd.Series(True, index=candidate_rows.index, dtype=bool)
        for column, wanted in kwargs.items():
            mask &= candidate_rows[column] == wanted

        # Rows lacking any of these cannot be grouped, dated or averaged.
        critical_cols = ['accountid', 'trx_start_date', 'amount']
        return candidate_rows[mask].dropna(subset=critical_cols)

    def plot_hypothesis_overlap(self, hypothesis1_df, hypothesis3_df, hypothesis4_df, account_col='accountid'):
        """Save a three-set Venn diagram of accounts flagged by each hypothesis.

        Args:
            hypothesis1_df: Frame with an ``is_salary_related`` boolean column.
            hypothesis3_df: Frame with an ``is_consistent_amount`` boolean column.
            hypothesis4_df: Frame with an ``is_salary_type`` boolean column.
            account_col: Column holding the account identifier in all frames.

        The figure is written to ``OUTPUT_PATHS['hypothesis_overlap_plot']``.
        """
        consistent_accounts = set(
            hypothesis3_df.loc[hypothesis3_df['is_consistent_amount'], account_col]
        )
        description_accounts = set(
            hypothesis1_df.loc[hypothesis1_df['is_salary_related'], account_col]
        )
        type_accounts = set(
            hypothesis4_df.loc[hypothesis4_df['is_salary_type'], account_col]
        )

        fig = plt.figure(figsize=(10, 10))
        try:
            venn3(
                [consistent_accounts, description_accounts, type_accounts],
                set_labels=('Consistent Amount', 'Salary Description', 'Transaction Type'),
            )
            plt.title('Overlap Between Hypotheses')
            plt.savefig(OUTPUT_PATHS['hypothesis_overlap_plot'])
        finally:
            # Always release the figure, even if drawing or saving fails.
            plt.close(fig)

    def generate_salary_earners_table(self, all_three_hypotheses):
        """Summarise transactions into one row per account.

        Args:
            all_three_hypotheses: Frame with ``accountid``, ``trx_start_date``
                and ``amount`` columns. It is not modified.

        Returns:
            A DataFrame with the columns in ``_SUMMARY_COLUMNS``:

            * ``num_months``: number of rows for the account (presumably one
              salary row per month -- verify against the upstream flags).
            * ``least_inflow_6m``: smallest amount in the last 180 days,
              ``0`` if the account has no row in that window.
            * ``avg_monthly_salary`` / ``estimated_next_amount``: mean amount,
              ``0`` if every amount is missing.
            * ``estimated_next_date``: latest date plus the median gap in days
              between consecutive dates (30 if no gap can be measured).
            * ``45daysalary``: latest row is at most 45 days old.
            * ``2monthssalary``: at least two rows in the last 60 days.

            An empty input yields an empty frame that still has these columns.
        """
        # One reference time for the whole table keeps every account's
        # windows consistent with each other.
        now = datetime.now()
        six_month_cutoff = now - timedelta(days=180)
        two_month_cutoff = now - timedelta(days=60)

        results = []
        for accountid, group in all_three_hypotheses.groupby('accountid'):
            if group.empty:
                continue

            amounts = group['amount']
            trx_dates = group['trx_start_date']

            recent_amounts = amounts[trx_dates >= six_month_cutoff]
            least_inflow = 0 if recent_amounts.empty else recent_amounts.min()

            avg_salary = amounts.mean() if amounts.notna().any() else 0

            # Gaps are only meaningful between chronologically adjacent
            # payments, so sort first; work on a separate Series instead of
            # writing a column onto the groupby slice.
            median_interval = trx_dates.sort_values().diff().dt.days.median()
            if pd.isna(median_interval):
                median_interval = self._DEFAULT_INTERVAL_DAYS

            last_date = trx_dates.max()
            next_date = last_date + timedelta(days=float(median_interval))

            days_since_last = (now - last_date).days
            recent_count = int((trx_dates >= two_month_cutoff).sum())

            results.append({
                'accountid': accountid,
                'num_months': len(group),
                'least_inflow_6m': least_inflow,
                'avg_monthly_salary': avg_salary,
                'estimated_next_amount': avg_salary,
                'estimated_next_date': next_date,
                '45daysalary': days_since_last <= 45,
                '2monthssalary': recent_count >= 2,
            })

        # Passing the columns explicitly keeps the schema when ``results`` is
        # empty; otherwise the dropna below raises KeyError.
        final_df = pd.DataFrame(results, columns=list(self._SUMMARY_COLUMNS))

        # Drop rows where all numeric columns are NaN
        numeric_cols = ['num_months', 'least_inflow_6m', 'avg_monthly_salary', 'estimated_next_amount']
        return final_df.dropna(subset=numeric_cols, how='all')

    def analyze_salary_earners(self, final_df):
        """Select accounts whose estimated next amount meets the threshold.

        Args:
            final_df: Summary table as produced by
                ``generate_salary_earners_table``.

        Returns:
            A ``(details, count)`` tuple: ``details`` holds ``accountid`` and
            ``least_inflow_6m`` for accounts with ``estimated_next_amount`` at
            or above ``MODEL_CONFIG['high_earner_threshold']``; ``count`` is
            the number of such accounts.
        """
        threshold = MODEL_CONFIG['high_earner_threshold']
        high_earners = final_df[final_df['estimated_next_amount'] >= threshold].copy()
        high_earner_details = high_earners[['accountid', 'least_inflow_6m']].reset_index(drop=True)
        return high_earner_details, len(high_earners)

    def generate_reports(self):
        """Build all tables, save the plot and CSV reports, and return results.

        Writes the Venn diagram and three CSV files to the locations given in
        ``OUTPUT_PATHS`` and stores the tables on the instance.

        Returns:
            A dict with keys ``final_table``, ``likely_salary_earner``,
            ``high_earner_details`` and ``total_high_earners``.
        """
        # Accounts flagged by all three hypotheses.
        all_three_hypotheses = self.filter_venn_section(
            is_salary_related=True,
            is_consistent_amount=True,
            is_salary_type=True,
        )
        self.final_table = self.generate_salary_earners_table(all_three_hypotheses)
        logger.info(
            "Found %s verified salary earners",
            self.final_table['accountid'].nunique(),
        )

        # Likely salary earners: transaction type matches, plus exactly one
        # of the description / consistent-amount flags.
        green_section = self.filter_venn_section(
            is_salary_related=True,
            is_consistent_amount=False,
            is_salary_type=True,
        )
        yellow_section = self.filter_venn_section(
            is_salary_related=False,
            is_consistent_amount=True,
            is_salary_type=True,
        )
        likely_rows = pd.concat([yellow_section, green_section])
        likely_rows = likely_rows.drop_duplicates(subset=['id'])
        self.likely_salary_earner = self.generate_salary_earners_table(likely_rows)
        logger.info(
            "Found %s likely salary earners",
            self.likely_salary_earner['accountid'].nunique(),
        )

        # High earners among the verified salary earners.
        self.high_earner_details, total_high_earners = self.analyze_salary_earners(self.final_table)
        logger.info("\nTotal High Earners: %s", total_high_earners)

        # Venn diagram over the whole frame (not restricted by initiator).
        self.plot_hypothesis_overlap(
            self.df[self.df['is_salary_related']],
            self.df[self.df['is_consistent_amount']],
            self.df[self.df['is_salary_type']],
        )

        # Save reports
        self.high_earner_details.to_csv(OUTPUT_PATHS['high_earner_details'], index=False)
        self.likely_salary_earner.to_csv(OUTPUT_PATHS['likely_salary_earner'], index=False)
        self.final_table.to_csv(OUTPUT_PATHS['final_table'], index=False)

        return {
            'final_table': self.final_table,
            'likely_salary_earner': self.likely_salary_earner,
            'high_earner_details': self.high_earner_details,
            'total_high_earners': total_high_earners,
        }