"""Salary-earner detection and salary prediction over account transactions.

Reconstructed from the (newline-collapsed) patch 7e7094f by Joshua Salako,
dated 2025-04-28, which removed ``salary.py``.

The pipeline flags credit transactions with three independent hypotheses
(salary-like description, consistent amounts per account, salary-like
transaction type), builds per-account salary summaries, exports them to CSV,
and trains Random Forest models that predict the next monthly amount.

Heavy third-party packages (SQLAlchemy, matplotlib, matplotlib_venn,
scikit-learn) are imported inside the functions that need them so the pure
pandas helpers can be imported and tested without them.
"""
import os
import re
import warnings
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

# NOTE(review): these credentials were committed in source. The literals are
# kept only as backward-compatible defaults; prefer the environment variables
# and rotate the password.
DB_USER = os.environ.get("DB_USER", "salaryloan")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "salaryloan")
DB_NAME = os.environ.get("DB_NAME", "salaryloan")
DB_PORT = os.environ.get("DB_PORT", "10532")
DB_HOST = os.environ.get("DB_HOST", "dev-data.simbrellang.net")
DATABASE_URL = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

table_name = "customer_account_transaction_hx"

keywords = [
    "salary", "payroll", "income", "wage", "wages",
    "earnings", "earning", "monthly pay", "net pay", "gross pay", "compensation",
    "monthlypay", "netpay", "grosspay",
    "remuneration", "stipend", "allowance", "bonus", "commission",
    "pension", "retirement", "dividend", "benefits", "reimbursement",
    "overtime", "incentive", "paycheck", "paycheque", "salary advance",
    "monthly income", "income tax refund", "employer deposit",
    "payroll deposit", "salary credit", "income credit", "salary transfer",
    "income transfer", "salary received", "income received", "hr deposit",
    "company deposit", "employer payment", "employee payment",
    "sal",
]

# Month name (abbreviated or full) optionally followed by a space and a
# 2-4 digit year, e.g. "JAN 2024" or "march24".
_MONTH_YEAR_PATTERNS = (
    r"\b(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)\s?\d{2,4}\b",
    r"\b(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER"
    r"|OCTOBER|NOVEMBER|DECEMBER)\s?\d{2,4}\b",
)

_SALARY_TABLE_COLUMNS = [
    'accountid', 'num_months', 'least_inflow_6m', 'avg_monthly_salary',
    'estimated_next_amount', 'estimated_next_date', '45daysalary',
    '2monthssalary',
]


def load_transactions(table_name=table_name, database_url=DATABASE_URL):
    """Load the transaction table and normalise its columns.

    Args:
        table_name (str): Table to read in full.
        database_url (str): SQLAlchemy connection URL.

    Returns:
        pd.DataFrame: Transactions with datetime ``trx_start_date`` /
        ``trx_end_date`` and columns ``d1``..``d4`` renamed to ``trx_type``,
        ``trx_subtype``, ``initiated_by`` and ``customer_id``.

    Raises:
        Exception: Whatever the connectivity probe raises; it is reported and
        re-raised because nothing downstream can work without the database.
    """
    from sqlalchemy import create_engine, text

    engine = create_engine(database_url)
    try:
        with engine.connect() as conn:
            conn.execute(text("SELECT version();"))
            print("Connected successfully!")
    except Exception as e:
        print("Error connecting to database:", e)
        raise

    df = pd.read_sql(f"SELECT * FROM {table_name}", engine)
    df['trx_start_date'] = pd.to_datetime(df['trx_start_date'])
    df['trx_end_date'] = pd.to_datetime(df['trx_end_date'])
    return df.rename(columns={'d1': 'trx_type', 'd2': 'trx_subtype',
                              'd3': 'initiated_by', 'd4': 'customer_id'})


def identify_salary_transactions(df, keywords):
    """Flag transactions whose description looks salary related.

    A description matches when it contains one of ``keywords`` as a whole
    word, or a month name followed by a year. Matching is case-insensitive,
    so upper-case month patterns also hit mixed-case descriptions.

    Args:
        df (pd.DataFrame): Transactions with a 'description' column.
        keywords (list): Salary/income keywords to search for.

    Returns:
        pd.DataFrame: The same ``df`` object, with a boolean
        'is_salary_related' column added in place. Missing descriptions are
        flagged False.
    """
    alternatives = list(_MONTH_YEAR_PATTERNS)
    if keywords:
        # Guard the empty list: r'\b(?:)\b' would match at every word boundary.
        escaped = '|'.join(re.escape(keyword) for keyword in keywords)
        alternatives.insert(0, r'\b(?:' + escaped + r')\b')

    df['is_salary_related'] = df['description'].str.contains(
        '|'.join(alternatives),
        case=False,
        na=False,
        regex=True,
    )
    return df


def calculate_coefficient_of_variation(group):
    """Return std / mean of the amounts initiated by 'C' in ``group``.

    Args:
        group (pd.DataFrame): Transactions with 'initiated_by' and 'amount'.

    Returns:
        float: Population (ddof=0) coefficient of variation; NaN when the
        mean is zero or there are no matching rows.
    """
    amounts = group.loc[group['initiated_by'] == 'C', 'amount']
    mean = amounts.mean()
    if mean == 0:
        return float('nan')
    return amounts.std(ddof=0) / mean


def flag_consistent_amounts(group, cv_threshold=0.10):
    """Flag every row of one account as consistent or not.

    Args:
        group (pd.DataFrame): Transactions for a single account.
        cv_threshold (float): Maximum CV still considered consistent.

    Returns:
        pd.Series: Boolean series named 'is_consistent_amount' on
        ``group.index``; the same value is repeated for every row.
    """
    cv = calculate_coefficient_of_variation(group)
    is_consistent = bool(cv <= cv_threshold) if not pd.isna(cv) else False
    return pd.Series(is_consistent, index=group.index,
                     name='is_consistent_amount')


def identify_consistent_amount_accounts(df, cv_threshold=0.10):
    """Flag rows belonging to accounts with low-variance 'C' amounts.

    The per-account CV is computed in one vectorised pass instead of
    ``groupby.apply``, whose result shape varies with the data.

    Args:
        df (pd.DataFrame): Transactions with 'accountid', 'amount' and
            'initiated_by'.
        cv_threshold (float): Maximum allowed CV (default: 0.10).

    Returns:
        pd.Series: Boolean series named 'is_consistent_amount', indexed like
        ``df``. Accounts without 'C' rows, or with a zero mean, are False.
    """
    credit_amounts = df.loc[df['initiated_by'] == 'C'].groupby('accountid')['amount']
    mean = credit_amounts.mean()
    cv = (credit_amounts.std(ddof=0) / mean).where(mean != 0)
    consistent_accounts = cv.index[cv <= cv_threshold]  # NaN compares False
    return df['accountid'].isin(consistent_accounts).rename('is_consistent_amount')


def flag_salary_type_transactions(df):
    """Flag rows matching the salary type/subtype/initiator criteria.

    Args:
        df (pd.DataFrame): Transactions with 'trx_type', 'trx_subtype',
            'initiated_by' and 'amount'.

    Returns:
        pd.DataFrame: The same ``df`` object with a boolean 'is_salary_type'
        column added in place.
    """
    df['is_salary_type'] = (
        df['trx_type'].isin(['T', 'C'])
        & df['trx_subtype'].isin(['BI', 'I', 'BS', 'CI'])
        & (df['initiated_by'] == 'C')
        & (df['amount'] > 0)
    )
    return df


def is_salary_earner_by_type(group, min_transactions=3, threshold=0.7):
    """Decide whether an account looks like a salary earner by type ratio.

    Args:
        group (pd.DataFrame): One account's rows with 'is_salary_type'.
        min_transactions (int): Minimum number of rows required.
        threshold (float): Minimum share of salary-type rows.

    Returns:
        bool: True when both criteria are met.
    """
    if len(group) < min_transactions:
        return False
    return bool(group['is_salary_type'].mean() >= threshold)


def plot_hypothesis_overlap(hypothesis1_df,
                            hypothesis3_df, hypothesis4_df,
                            account_col='accountid'):
    """Plot a three-set Venn diagram of the accounts each hypothesis flags.

    Args:
        hypothesis1_df (pd.DataFrame): Rows with 'is_salary_related'.
        hypothesis3_df (pd.DataFrame): Rows with 'is_consistent_amount'.
        hypothesis4_df (pd.DataFrame): Rows with 'is_salary_type'.
        account_col (str): Account identifier column.
    """
    import matplotlib.pyplot as plt
    from matplotlib_venn import venn3

    consistent = set(hypothesis3_df.loc[hypothesis3_df['is_consistent_amount'], account_col])
    described = set(hypothesis1_df.loc[hypothesis1_df['is_salary_related'], account_col])
    typed = set(hypothesis4_df.loc[hypothesis4_df['is_salary_type'], account_col])

    plt.figure(figsize=(10, 10))
    venn3([consistent, described, typed],
          set_labels=('Consistent Amount', 'Salary Description', 'Transaction Type'))
    plt.title('Overlap Between Hypotheses')
    plt.show()


def filter_venn_section(df, **kwargs):
    """Select 'C'-initiated rows matching a combination of hypothesis flags.

    Args:
        df (pd.DataFrame): Rows with 'initiated_by' and the flag columns.
        **kwargs: Desired value per flag, e.g. ``is_salary_related=True``.

    Returns:
        pd.DataFrame: Rows initiated by 'C' whose flags equal the requested
        values.

    Raises:
        ValueError: If a keyword is not one of the three flag columns.
    """
    valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}
    invalid_keys = set(kwargs.keys()) - valid_columns
    if invalid_keys:
        raise ValueError(f"Invalid keys: {invalid_keys}. Valid keys are {valid_columns}.")

    df1 = df[df['initiated_by'] == 'C']
    condition = pd.Series(True, index=df1.index)
    for key, value in kwargs.items():
        condition &= (df1[key] == value)
    return df1[condition]


def generate_salary_earners_table(all_three_hypotheses, as_of=None):
    """Summarise salary behaviour per account.

    Args:
        all_three_hypotheses (pd.DataFrame): Rows with 'accountid', 'amount'
            and datetime 'trx_start_date'.
        as_of (datetime | None): Reference "now" for the recency windows;
            defaults to ``datetime.now()``. Taken once so every account is
            measured against the same instant.

    Returns:
        pd.DataFrame: One row per account with the columns 'accountid',
        'num_months' (distinct calendar months with a transaction),
        'least_inflow_6m', 'avg_monthly_salary', 'estimated_next_amount',
        'estimated_next_date', '45daysalary' and '2monthssalary'. Rows with
        any missing value are dropped, which removes accounts with a single
        transaction (no interval to extrapolate) or no inflow in the last
        180 days.
    """
    now = datetime.now() if as_of is None else as_of
    six_months_ago = now - timedelta(days=180)
    sixty_days_ago = now - timedelta(days=60)

    results = []
    for accountid, group in all_three_hypotheses.groupby('accountid'):
        # Sort so consecutive differences are real, non-negative intervals.
        dates = group['trx_start_date'].sort_values()
        avg_salary = group['amount'].mean()
        least_inflow = group.loc[group['trx_start_date'] >= six_months_ago, 'amount'].min()

        median_interval = dates.diff().dt.days.median()
        last_date = dates.max()
        if pd.notna(median_interval):
            next_date = last_date + timedelta(days=float(median_interval))
        else:
            # timedelta(days=NaN) raises; mark as missing so dropna removes it.
            next_date = pd.NaT

        results.append({
            'accountid': accountid,
            'num_months': dates.dt.to_period('M').nunique(),
            'least_inflow_6m': least_inflow,
            'avg_monthly_salary': avg_salary,
            'estimated_next_amount': avg_salary,
            'estimated_next_date': next_date,
            '45daysalary': (now - last_date).days <= 45,
            '2monthssalary': int((dates >= sixty_days_ago).sum()) >= 2,
        })

    # Explicit columns keep the schema stable when no account qualifies.
    final_df = pd.DataFrame(results, columns=_SALARY_TABLE_COLUMNS)
    return final_df.dropna()


def analyze_salary_earners(final_df, min_salary=10000):
    """Pick out the high earners from a salary-earner table.

    Args:
        final_df (pd.DataFrame): Output of ``generate_salary_earners_table``.
        min_salary (float): Minimum 'estimated_next_amount' to count as a
            high earner (default: 10000).

    Returns:
        tuple: ``(details, count)`` where ``details`` holds 'accountid' and
        'least_inflow_6m' of the high earners and ``count`` is their number.
    """
    high_earners = final_df[final_df['estimated_next_amount'] >= min_salary]
    details = high_earners[['accountid', 'least_inflow_6m']].reset_index(drop=True)
    return details, len(high_earners)


def add_feature_engineering(df):
    """Add calendar, one-hot and rolling features for salary prediction.

    Args:
        df (pd.DataFrame): Transactions with 'accountid', 'amount',
            'trx_type' and a non-null datetime 'trx_start_date'.

    Returns:
        pd.DataFrame: A copy sorted by account and date with added columns:
        'month' (1-12), 'month_seq' (1-based rank of the calendar month
        within the account), one float 'trx_type_<value>' column per
        transaction type, and 'rolling_sum_3m' / 'rolling_avg_3m'.
    """
    df = df.copy()
    dates = df['trx_start_date']
    df['month'] = dates.dt.month

    # Rank year-months per account; ngroup() would number groups globally.
    year_month = dates.dt.year * 12 + dates.dt.month
    df['month_seq'] = (
        year_month.groupby(df['accountid']).rank(method='dense').astype(int)
    )

    # get_dummies keeps df's index, so the concat stays row-aligned.
    encoded = pd.get_dummies(df['trx_type'], prefix='trx_type', dtype=float)
    df = pd.concat([df, encoded], axis=1)

    df = df.sort_values(['accountid', 'trx_start_date'])
    # NOTE(review): the window spans the last 3 transactions, not 3 months,
    # despite the "_3m" suffix.
    rolling = df.groupby('accountid')['amount'].rolling(window=3, min_periods=1)
    df['rolling_sum_3m'] = rolling.sum().reset_index(level=0, drop=True)
    df['rolling_avg_3m'] = rolling.mean().reset_index(level=0, drop=True)
    return df


def prepare_data(df_transactions, accounts, window=5, train_end=8, test_end=12):
    """Build sliding-window train/test sets of monthly features.

    Args:
        df_transactions (pd.DataFrame): All transactions.
        accounts (list): Account IDs to include.
        window (int): Number of preceding months used as features.
        train_end (int): Target months ``window..train_end-1`` go to training.
        test_end (int): Target months ``train_end..test_end-1`` go to
            testing; also the minimum number of months an account needs.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)`` as NumPy arrays. Each
        X row is the flattened feature matrix of ``window`` consecutive
        months; y is the mean amount of the following month.
    """
    df_filtered = df_transactions[df_transactions['accountid'].isin(accounts)].copy()
    print(f"Filtered data for {len(accounts)} accounts.")
    print(f"Total transactions: {len(df_filtered)}")

    df_filtered = df_filtered.drop(
        columns=['description', 'id', 'customer_id', 'trx_end_date',
                 'is_salary_related', 'is_consistent_amount', 'is_salary_type'],
        errors='ignore',
    )
    df_filtered = add_feature_engineering(df_filtered)

    encoded_cols = [col for col in df_filtered.columns if col.startswith('trx_type_')]
    agg_funcs = {
        'amount': 'mean',
        'rolling_sum_3m': 'last',
        'rolling_avg_3m': 'last',
        'month': 'first',
    }
    agg_funcs.update({col: 'sum' for col in encoded_cols})
    monthly_data = (
        df_filtered.groupby(['accountid', 'month_seq']).agg(agg_funcs).reset_index()
    )

    # The account id is an identifier, not a predictor, so it is not a feature.
    feature_cols = ['amount', 'rolling_sum_3m', 'rolling_avg_3m', 'month'] + encoded_cols

    X_train, y_train, X_test, y_test = [], [], [], []
    skipped = 0
    for _, account_data in monthly_data.groupby('accountid'):
        if len(account_data) < test_end:
            skipped += 1
            continue
        account_data = account_data.sort_values('month_seq')
        features = account_data[feature_cols].to_numpy(dtype=float)
        targets = account_data['amount'].to_numpy()
        for t in range(window, train_end):
            X_train.append(features[t - window:t].ravel())
            y_train.append(targets[t])
        for t in range(train_end, test_end):
            X_test.append(features[t - window:t].ravel())
            y_test.append(targets[t])

    if skipped:
        print(f"Skipped {skipped} accounts with fewer than {test_end} months of data.")
    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)


def train_model(X_train, y_train, X_test, y_test):
    """Train and evaluate a Random Forest regressor on scaled features.

    Prints MAE, RMSE and R-squared measured on the test set.

    Args:
        X_train (np.ndarray): Training features.
        y_train (np.ndarray): Training targets.
        X_test (np.ndarray): Test features.
        y_test (np.ndarray): Test targets.

    Returns:
        tuple: ``(model, scaler)`` - the fitted regressor and the
        StandardScaler fitted on ``X_train``.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from sklearn.preprocessing import StandardScaler

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R-squared: {r2:.2f}")
    return model, scaler


def _plot_actual_vs_predicted(y_true, y_pred):
    """Scatter predicted against actual salary with a y = x reference line."""
    import matplotlib.pyplot as plt

    low, high = min(y_true), max(y_true)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_true, y_pred, alpha=0.5)
    plt.xlabel("Actual Salary")
    plt.ylabel("Predicted Salary")
    plt.title("Actual vs. Predicted Salary")
    plt.plot([low, high], [low, high], 'r--')
    plt.show()


def _train_and_plot(df, accounts, label):
    """Train a model for ``accounts`` and plot it, if enough data exists.

    Returns ``(model, scaler)``, or None when no training rows were produced.
    Evaluation only runs after a successful fit, so no unfitted model or
    scaler is ever used.
    """
    X_train, y_train, X_test, y_test = prepare_data(df, accounts)
    if len(X_train) == 0:
        print(f"No accounts with sufficient data for {label} salary earners.")
        return None

    print(f"\nTraining model for {label} salary earners...")
    model, scaler = train_model(X_train, y_train, X_test, y_test)
    print(f"Model trained for {label} salary earners.")
    _plot_actual_vs_predicted(y_test, model.predict(scaler.transform(X_test)))
    return model, scaler


def main():
    """Run the full pipeline: load, flag, summarise, export and model."""
    warnings.filterwarnings('ignore')

    df = load_transactions()

    # Each step adds its flag column to df in place.
    df = identify_salary_transactions(df, keywords)
    df['is_consistent_amount'] = identify_consistent_amount_accounts(df, cv_threshold=0.10)
    df = flag_salary_type_transactions(df)

    credit = df['initiated_by'] == 'C'
    desc_data = df[df['is_salary_related'] & credit]
    const_data = df[df['is_consistent_amount'] & credit]
    trx_data = df[df['is_salary_type']]
    plot_hypothesis_overlap(desc_data, const_data, trx_data)

    green_section = filter_venn_section(
        df, is_salary_related=True, is_consistent_amount=False, is_salary_type=True)
    print(green_section.head(10))
    yellow_section = filter_venn_section(
        df, is_salary_related=False, is_consistent_amount=True, is_salary_type=True)
    all_three_hypotheses = filter_venn_section(
        df, is_salary_related=True, is_consistent_amount=True, is_salary_type=True)

    final_table = generate_salary_earners_table(all_three_hypotheses)
    print(f"Found {final_table['accountid'].nunique()} verified salary earners")

    likely_rows = pd.concat([yellow_section, green_section]).drop_duplicates(subset=['id'])
    likely_salary_earner = generate_salary_earners_table(likely_rows)
    print(f"Found {likely_salary_earner['accountid'].nunique()} likely salary earners")

    high_earner_details_df, total_high_earners = analyze_salary_earners(final_table)
    print(f"\nTotal High Earners: {total_high_earners}")

    high_earner_details_df.to_csv('high_earner_details.csv', index=False)
    likely_salary_earner.to_csv('likely_salary_earner.csv', index=False)
    final_table.to_csv('final_table.csv', index=False)

    _train_and_plot(df, final_table['accountid'].unique(), "consistent")
    _train_and_plot(df, likely_salary_earner['accountid'].unique(), "inconsistent")


if __name__ == "__main__":
    main()