"""Salary-earner detection and salary prediction pipeline.

The script loads account transactions from PostgreSQL, flags transactions
with three independent heuristics (salary-like description, consistent credit
amounts per account, salary-like transaction type), summarises the accounts
selected by those heuristics, exports the summaries to CSV and finally trains
Random Forest regressors on monthly aggregates of the selected accounts.
"""

import os
import re
import warnings
from datetime import datetime, timedelta
from urllib.parse import quote_plus

# Kept from the original script: silence library warnings for the whole run.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib_venn import venn3, venn2
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sqlalchemy import create_engine, text
from wordcloud import WordCloud

try:
    # NOTE(review): no longer used. readline.redisplay() takes no arguments,
    # so the original `redisplay(dataframe)` call could only raise TypeError;
    # this looks like an accidental auto-import. Kept so nothing that may rely
    # on the name breaks.
    from readline import redisplay
except ImportError:  # readline is not available on every platform
    redisplay = None


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Connection settings can be overridden through the environment; the defaults
# are the values the script has always used.
DB_USER = os.environ.get("DB_USER", "salaryloan")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "salaryloan")
DB_NAME = os.environ.get("DB_NAME", "salaryloan")
DB_PORT = os.environ.get("DB_PORT", "10532")
DB_HOST = os.environ.get("DB_HOST", "dev-data.simbrellang.net")

# User and password are URL-quoted so special characters cannot corrupt the URL.
DATABASE_URL = (
    f"postgresql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)

table_name = "customer_account_transaction_hx"

keywords = [
    "salary", "payroll", "income", "wage", "wages", "earnings", "earning",
    "monthly pay", "net pay", "gross pay", "compensation", "monthlypay",
    "netpay", "grosspay", "remuneration", "stipend", "allowance", "bonus",
    "commission", "pension", "retirement", "dividend", "benefits",
    "reimbursement", "overtime", "incentive", "paycheck", "paycheque",
    "salary advance", "monthly income", "income tax refund",
    "employer deposit", "payroll deposit", "salary credit", "income credit",
    "salary transfer", "income transfer", "salary received",
    "income received", "hr deposit", "company deposit", "employer payment",
    "employee payment", "sal",
]

# "JAN 24", "JAN2024", "JANUARY 2024", ... (matched case-insensitively).
MONTH_YEAR_PATTERNS = [
    r"\b(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)\s?\d{2,4}\b",
    r"\b(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER"
    r"|OCTOBER|NOVEMBER|DECEMBER)\s?\d{2,4}\b",
]

# Columns of the per-account summary table, fixed so that an empty result
# still exposes every column.
SALARY_TABLE_COLUMNS = [
    'accountid',
    'num_months',
    'least_inflow_6m',
    'avg_monthly_salary',
    'estimated_next_amount',
    'estimated_next_date',
    '45daysalary',
    '2monthssalary',
]

# Sliding-window layout used by prepare_data(): the previous WINDOW_MONTHS
# months predict the amount of month t (0-based position in the sequence).
MIN_MONTHS = 12
WINDOW_MONTHS = 5
TRAIN_TARGET_MONTHS = range(5, 8)
TEST_TARGET_MONTHS = range(8, 12)


# ---------------------------------------------------------------------------
# Hypothesis 1: salary-like description
# ---------------------------------------------------------------------------

def identify_salary_transactions(df, keywords):
    """
    Flag transactions whose description looks salary related.

    A description matches when it contains one of the keywords as a whole
    word/phrase or a month-year token such as "JAN 2024". Matching is
    case-insensitive.

    Args:
        df (pd.DataFrame): Transaction data with a 'description' column.
        keywords (list): Salary/income keywords to search for.

    Returns:
        pd.DataFrame: The same DataFrame object, with a boolean
            'is_salary_related' column added in place.
    """
    alternatives = list(MONTH_YEAR_PATTERNS)
    escaped_keywords = [re.escape(keyword) for keyword in keywords]
    if escaped_keywords:
        # Skipped for an empty keyword list: r'\b(?:)\b' would match at every
        # word boundary and flag every description.
        alternatives.insert(0, r'\b(?:' + '|'.join(escaped_keywords) + r')\b')
    combined_pattern = '|'.join(alternatives)

    # case=False is required: the month patterns are upper case, so matching
    # them against lower-cased text (as before) could never succeed.
    df['is_salary_related'] = df['description'].str.contains(
        combined_pattern, case=False, na=False, regex=True
    )
    return df


# ---------------------------------------------------------------------------
# Hypothesis 3: consistent credit amounts
# ---------------------------------------------------------------------------

def calculate_coefficient_of_variation(group):
    """
    Compute the coefficient of variation (CV) of the 'C'-initiated amounts.

    Args:
        group (pd.DataFrame): Transactions of a single account.

    Returns:
        float: Population std / mean of the amounts where
            'initiated_by' == 'C'. NaN when there are no such transactions
            or their mean is zero.
    """
    amounts = group.loc[group['initiated_by'] == 'C', 'amount']
    if amounts.empty:
        return float('nan')
    mean = amounts.mean()
    if mean == 0:
        return float('nan')
    return amounts.std(ddof=0) / mean


def flag_consistent_amounts(group, cv_threshold=0.10):
    """
    Flag every transaction of an account whose 'C' amounts vary little.

    Args:
        group (pd.DataFrame): Transactions of a single account.
        cv_threshold (float): Maximum CV still considered consistent.

    Returns:
        pd.Series: Boolean Series named 'is_consistent_amount', indexed like
            `group`, holding the same value for every row.
    """
    # NOTE(review): an account with a single 'C' transaction has CV 0 and is
    # therefore flagged as consistent - confirm that this is intended.
    cv = calculate_coefficient_of_variation(group)
    is_consistent = bool(pd.notna(cv) and cv <= cv_threshold)
    return pd.Series(
        [is_consistent] * len(group),
        index=group.index,
        name='is_consistent_amount',
    )


def identify_consistent_amount_accounts(df, cv_threshold=0.10):
    """
    Compute the consistent-amount flag for every transaction.

    Args:
        df (pd.DataFrame): Transaction data with columns
            ['accountid', 'amount', 'initiated_by'].
        cv_threshold (float): Maximum allowed CV (default: 0.10).

    Returns:
        pd.Series: Boolean Series named 'is_consistent_amount', indexed by the
            original row labels. Rows whose 'accountid' is missing are absent.
    """
    # Explicit concatenation instead of groupby.apply: apply changes the shape
    # of its result (wide DataFrame) when there is only one account.
    per_account_flags = [
        flag_consistent_amounts(group, cv_threshold)
        for _, group in df.groupby('accountid')
    ]
    if not per_account_flags:
        return pd.Series(
            [], index=df.index[:0], dtype=bool, name='is_consistent_amount'
        )
    return pd.concat(per_account_flags).rename('is_consistent_amount')


# ---------------------------------------------------------------------------
# Hypothesis 4: salary-like transaction type
# ---------------------------------------------------------------------------

def flag_salary_type_transactions(df):
    """
    Flag transactions matching the salary type/subtype/initiator criteria.

    Args:
        df (pd.DataFrame): Transaction data with columns
            ['trx_type', 'trx_subtype', 'initiated_by', 'amount'].

    Returns:
        pd.DataFrame: The same DataFrame object, with a boolean
            'is_salary_type' column added in place.
    """
    df['is_salary_type'] = (
        df['trx_type'].isin(['T', 'C'])
        & df['trx_subtype'].isin(['BI', 'I', 'BS', 'CI'])
        & (df['initiated_by'] == 'C')
        & (df['amount'] > 0)
    )
    return df


def is_salary_earner_by_type(group, min_transactions=3, threshold=0.7):
    """
    Decide whether an account looks like a salary earner by transaction type.

    Args:
        group (pd.DataFrame): Transactions of a single account, including the
            'is_salary_type' column.
        min_transactions (int): Minimum number of transactions required.
        threshold (float): Minimum share of salary-type transactions.

    Returns:
        bool: True if the account meets both criteria, False otherwise.
    """
    if len(group) < min_transactions:
        return False
    return bool(group['is_salary_type'].mean() >= threshold)


# ---------------------------------------------------------------------------
# Hypothesis overlap
# ---------------------------------------------------------------------------

def plot_hypothesis_overlap(hypothesis1_df, hypothesis3_df, hypothesis4_df,
                            account_col='accountid'):
    """
    Plot a Venn diagram of the accounts selected by each hypothesis.

    Args:
        hypothesis1_df (pd.DataFrame): Rows with an 'is_salary_related' flag.
        hypothesis3_df (pd.DataFrame): Rows with an 'is_consistent_amount' flag.
        hypothesis4_df (pd.DataFrame): Rows with an 'is_salary_type' flag.
        account_col (str): Account identifier column.
    """
    consistent_accounts = set(
        hypothesis3_df.loc[hypothesis3_df['is_consistent_amount'], account_col]
    )
    description_accounts = set(
        hypothesis1_df.loc[hypothesis1_df['is_salary_related'], account_col]
    )
    type_accounts = set(
        hypothesis4_df.loc[hypothesis4_df['is_salary_type'], account_col]
    )

    plt.figure(figsize=(10, 10))
    venn3(
        [consistent_accounts, description_accounts, type_accounts],
        set_labels=('Consistent Amount', 'Salary Description', 'Transaction Type'),
    )
    plt.title('Overlap Between Hypotheses')
    plt.show()


def filter_venn_section(df, **kwargs):
    """
    Select the 'C'-initiated transactions matching a combination of flags.

    Args:
        df (pd.DataFrame): DataFrame with 'initiated_by' and the flag columns
            ['is_salary_related', 'is_consistent_amount', 'is_salary_type'].
        **kwargs: Desired value per flag column, for example
            is_salary_related=True, is_consistent_amount=False.

    Returns:
        pd.DataFrame: Rows matching every requested flag value.

    Raises:
        ValueError: If a keyword is not one of the three flag columns.
    """
    valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}
    invalid_keys = set(kwargs.keys()) - valid_columns
    if invalid_keys:
        raise ValueError(f"Invalid keys: {invalid_keys}. Valid keys are {valid_columns}.")

    df1 = df[df['initiated_by'] == 'C']
    condition = pd.Series(True, index=df1.index)
    for key, value in kwargs.items():
        condition &= (df1[key] == value)
    return df1[condition]


# ---------------------------------------------------------------------------
# Per-account summary tables
# ---------------------------------------------------------------------------

def generate_salary_earners_table(all_three_hypotheses, as_of=None):
    """
    Summarise the selected transactions into one row per account.

    Args:
        all_three_hypotheses (pd.DataFrame): Selected transactions with
            columns ['accountid', 'trx_start_date', 'amount'].
        as_of (datetime-like, optional): Reference "now" for the recency
            metrics. Defaults to the current local time.

    Returns:
        pd.DataFrame: One row per account with the columns listed in
            SALARY_TABLE_COLUMNS. Accounts with an undefined metric (for
            example no inflow in the last 180 days, or a single transaction
            and hence no interval to extrapolate) are dropped.
    """
    # One reference timestamp so every recency metric uses the same "now".
    now = pd.Timestamp.now() if as_of is None else pd.Timestamp(as_of)
    six_months_ago = now - pd.Timedelta(days=180)
    sixty_days_ago = now - pd.Timedelta(days=60)

    results = []
    for accountid, group in all_three_hypotheses.groupby('accountid'):
        # Intervals are only meaningful on chronologically ordered dates.
        dates = group['trx_start_date'].sort_values()
        amounts = group['amount']

        # Distinct calendar months with a selected transaction.
        num_months = dates.dt.to_period('M').nunique()
        least_inflow = amounts[group['trx_start_date'] >= six_months_ago].min()
        avg_salary = amounts.mean()

        # Next salary date: last date plus the median gap between transactions.
        median_interval = dates.diff().dt.days.median()
        last_date = dates.max()
        if pd.isna(median_interval) or pd.isna(last_date):
            # Single transaction (or no valid dates): nothing to extrapolate;
            # the row is removed by dropna() below.
            next_date = pd.NaT
        else:
            next_date = last_date + pd.Timedelta(days=float(median_interval))

        days_since_last = (now - last_date).days
        has_45d = bool(days_since_last <= 45)
        has_2m = bool((group['trx_start_date'] >= sixty_days_ago).sum() >= 2)

        results.append({
            'accountid': accountid,
            'num_months': num_months,
            'least_inflow_6m': least_inflow,
            'avg_monthly_salary': avg_salary,
            'estimated_next_amount': avg_salary,
            'estimated_next_date': next_date,
            '45daysalary': has_45d,
            '2monthssalary': has_2m,
        })

    final_df = pd.DataFrame(results, columns=SALARY_TABLE_COLUMNS)
    return final_df.dropna()


def analyze_salary_earners(final_df, min_salary=10000):
    """
    Select the high earners from a salary-earner summary table.

    Args:
        final_df (pd.DataFrame): Output of generate_salary_earners_table().
        min_salary (float): Minimum 'estimated_next_amount' to count as a
            high earner (default: 10000).

    Returns:
        tuple: (details, count), where details is a DataFrame with columns
            ['accountid', 'least_inflow_6m'] and count is the number of high
            earners.
    """
    high_earners = final_df[final_df['estimated_next_amount'] >= min_salary]
    high_earner_details = (
        high_earners[['accountid', 'least_inflow_6m']].reset_index(drop=True)
    )
    return high_earner_details, len(high_earners)


# ---------------------------------------------------------------------------
# Salary prediction model
# ---------------------------------------------------------------------------

def add_feature_engineering(df):
    """
    Add the features used by the salary prediction model.

    Adds 'month' (calendar month), 'month_seq' (1-based chronological
    position of the transaction's year-month within its account), one-hot
    columns for 'trx_type', and 'rolling_sum_3m' / 'rolling_avg_3m'.

    Args:
        df (pd.DataFrame): Transaction data with columns
            ['accountid', 'trx_start_date', 'trx_type', 'amount'].

    Returns:
        pd.DataFrame: A copy of the input, sorted by account and date, with
            the engineered columns added.
    """
    df = df.copy()
    dates = df['trx_start_date']
    df['month'] = dates.dt.month

    # Year-aware month index ranked per account. The previous
    # groupby(...).ngroup() numbered groups globally across accounts and
    # merged the same calendar month of different years.
    month_index = dates.dt.year * 12 + dates.dt.month
    df['month_seq'] = (
        month_index.groupby(df['accountid']).rank(method='dense').astype('Int64')
    )

    # Categorical encoding: one-hot encode trx_type. The encoded frame must
    # reuse df's index, otherwise concat aligns on mismatched labels.
    encoder = OneHotEncoder(sparse_output=False)
    encoded_trx_type = encoder.fit_transform(df[['trx_type']])
    encoded_df = pd.DataFrame(
        encoded_trx_type,
        columns=encoder.get_feature_names_out(['trx_type']),
        index=df.index,
    )
    df = pd.concat([df, encoded_df], axis=1)

    # Rolling statistics over the last 3 transactions of each account.
    # NOTE(review): despite the "_3m" suffix the window counts transactions,
    # not months.
    df = df.sort_values(['accountid', 'trx_start_date'])
    rolling_amounts = df.groupby('accountid')['amount'].rolling(window=3, min_periods=1)
    df['rolling_sum_3m'] = rolling_amounts.sum().reset_index(level=0, drop=True)
    df['rolling_avg_3m'] = rolling_amounts.mean().reset_index(level=0, drop=True)
    return df


def prepare_data(df_transactions, accounts):
    """
    Build sliding-window training and testing sets for the given accounts.

    Transactions of the selected accounts are feature-engineered and
    aggregated per account and month. For every account with at least
    MIN_MONTHS months, the features of the previous WINDOW_MONTHS months
    predict the mean amount of month t; months 5-7 feed the training set and
    months 8-11 the testing set (0-based positions).

    Args:
        df_transactions (pd.DataFrame): All transaction data.
        accounts (list): Account IDs to include.

    Returns:
        tuple: (X_train, y_train, X_test, y_test) as NumPy arrays; all empty
            when no account has enough data.
    """
    df_filtered = df_transactions[df_transactions['accountid'].isin(accounts)].copy()
    print(f"Filtered data for {len(accounts)} accounts.")
    print(f"Total transactions: {len(df_filtered)}")

    X_train, y_train, X_test, y_test = [], [], [], []
    if df_filtered.empty:
        # OneHotEncoder cannot be fitted on zero rows.
        return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)

    # Drop columns that are not used as features.
    df_filtered = df_filtered.drop(
        columns=['description', 'id', 'customer_id', 'trx_end_date',
                 'is_salary_related', 'is_consistent_amount', 'is_salary_type'],
        errors='ignore',
    )
    df_filtered = add_feature_engineering(df_filtered)

    # Aggregate monthly data with the new features.
    encoded_cols = [col for col in df_filtered.columns if col.startswith('trx_type_')]
    agg_funcs = {
        'amount': 'mean',
        'rolling_sum_3m': 'last',
        'rolling_avg_3m': 'last',
        'month': 'first',
    }
    agg_funcs.update(dict.fromkeys(encoded_cols, 'sum'))
    monthly_data = (
        df_filtered.groupby(['accountid', 'month_seq']).agg(agg_funcs).reset_index()
    )

    # 'accountid' is an identifier, not a predictor, so it is not a feature.
    feature_cols = ['amount', 'rolling_sum_3m', 'rolling_avg_3m', 'month'] + encoded_cols

    skipped = 0
    for _, account_data in monthly_data.groupby('accountid'):
        if len(account_data) < MIN_MONTHS:
            skipped += 1
            continue
        account_data = account_data.sort_values('month_seq')
        features = account_data[feature_cols].to_numpy(dtype=float)
        targets = account_data['amount'].to_numpy()

        for t in TRAIN_TARGET_MONTHS:
            X_train.append(features[t - WINDOW_MONTHS:t].ravel())
            y_train.append(targets[t])
        for t in TEST_TARGET_MONTHS:
            X_test.append(features[t - WINDOW_MONTHS:t].ravel())
            y_test.append(targets[t])

    if skipped:
        print(f"Skipped {skipped} accounts due to insufficient data "
              f"(less than {MIN_MONTHS} months).")

    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)


def train_model(X_train, y_train, X_test, y_test):
    """
    Train and evaluate a Random Forest Regressor for salary prediction.

    Features are standardised with a scaler fitted on the training set; MAE,
    RMSE and R-squared on the test set are printed.

    Args:
        X_train (np.ndarray): Training features.
        y_train (np.ndarray): Training targets.
        X_test (np.ndarray): Testing features.
        y_test (np.ndarray): Testing targets.

    Returns:
        tuple: (model, scaler) - the fitted regressor and the fitted scaler.
    """
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R-squared: {r2:.2f}")

    return model, scaler


def plot_actual_vs_predicted(y_true, y_pred):
    """
    Scatter predicted against actual salaries with the identity line.

    Args:
        y_true (np.ndarray): Actual target values.
        y_pred (np.ndarray): Predicted values for the same samples.
    """
    plt.figure(figsize=(10, 5))
    plt.scatter(y_true, y_pred, alpha=0.5)
    plt.xlabel("Actual Salary")
    plt.ylabel("Predicted Salary")
    plt.title("Actual vs. Predicted Salary")
    bounds = [min(y_true), max(y_true)]
    plt.plot(bounds, bounds, 'r--')
    plt.show()


# ---------------------------------------------------------------------------
# Pipeline
# ---------------------------------------------------------------------------

engine = create_engine(DATABASE_URL)

try:
    with engine.connect() as conn:
        result = conn.execute(text("SELECT version();"))
    print("Connected successfully!")
except Exception as e:
    print("Error connecting to database:", e)
    # Nothing below can work without the database, so fail here with the
    # real cause instead of a later, less clear error.
    raise

df = pd.read_sql(f"SELECT * FROM {table_name}", engine)

# Change the date columns to datetime
df['trx_start_date'] = pd.to_datetime(df['trx_start_date'])
df['trx_end_date'] = pd.to_datetime(df['trx_end_date'])

# Rename columns
df = df.rename(columns={
    'd1': 'trx_type',
    'd2': 'trx_subtype',
    'd3': 'initiated_by',
    'd4': 'customer_id',
})

# Hypothesis 1: salary-like description (adds 'is_salary_related' to df).
desc_df = identify_salary_transactions(df, keywords)
desc_data = desc_df[desc_df['is_salary_related'] & (desc_df['initiated_by'] == 'C')]

# Hypothesis 3: consistent credit amounts (adds 'is_consistent_amount' to df).
consistent_flags = identify_consistent_amount_accounts(df, cv_threshold=0.10)
const_df = df.merge(consistent_flags, left_index=True, right_index=True)
# Rows without an account id carry no flag; treat them as not consistent.
df['is_consistent_amount'] = (
    consistent_flags.reindex(df.index, fill_value=False).astype(bool)
)
const_data = const_df[
    const_df['is_consistent_amount'] & (const_df['initiated_by'] == 'C')
]

# Hypothesis 4: salary-like transaction type (adds 'is_salary_type' to df).
trx_df = flag_salary_type_transactions(df)
trx_data = trx_df[trx_df['is_salary_type']]

plot_hypothesis_overlap(desc_data, const_data, trx_data)

green_section = filter_venn_section(
    df,
    is_salary_related=True,
    is_consistent_amount=False,
    is_salary_type=True,
)
print(green_section.head(10))

yellow_section = filter_venn_section(
    df,
    is_salary_related=False,
    is_consistent_amount=True,
    is_salary_type=True,
)

# Get accounts flagged by all three hypotheses
all_three_hypotheses = filter_venn_section(
    df,
    is_salary_related=True,
    is_consistent_amount=True,
    is_salary_type=True,
)

final_table = generate_salary_earners_table(all_three_hypotheses)
print(f"Found {final_table['accountid'].nunique()} verified salary earners")

likely_transactions = pd.concat([yellow_section, green_section])
likely_transactions = likely_transactions.drop_duplicates(subset=['id'])
likely_salary_earner = generate_salary_earners_table(likely_transactions)
print(f"Found {likely_salary_earner['accountid'].nunique()} likely salary earners")

high_earner_details_df, total_high_earners = analyze_salary_earners(final_table)
print(f"\nTotal High Earners: {total_high_earners}")

high_earner_details_df.to_csv('high_earner_details.csv', index=False)
likely_salary_earner.to_csv('likely_salary_earner.csv', index=False)
final_table.to_csv('final_table.csv', index=False)

# Model for accounts confirmed by all three hypotheses.
consistent_accounts = final_table['accountid'].unique()
X_train_cons, y_train_cons, X_test_cons, y_test_cons = prepare_data(df, consistent_accounts)

if len(X_train_cons) > 0:
    model_cons, scaler_cons = train_model(X_train_cons, y_train_cons, X_test_cons, y_test_cons)
    print("Model trained for consistent salary earners.")
    # Evaluation plot lives inside the branch: model and scaler only exist
    # when there was data to train on.
    X_test_cons_scaled = scaler_cons.transform(X_test_cons)
    y_pred = model_cons.predict(X_test_cons_scaled)
    plot_actual_vs_predicted(y_test_cons, y_pred)
else:
    print("No accounts with sufficient data for consistent salary earners.")

# Model for accounts selected by only part of the hypotheses.
inconsistent_accounts = likely_salary_earner['accountid'].unique()
X_train_incons, y_train_incons, X_test_incons, y_test_incons = prepare_data(df, inconsistent_accounts)

if len(X_train_incons) > 0:
    print("\nTraining model for inconsistent salary earners...")
    model_incons, scaler_incons = train_model(X_train_incons, y_train_incons, X_test_incons, y_test_incons)
    X_test_incons_scaled = scaler_incons.transform(X_test_incons)
    y_pred = model_incons.predict(X_test_incons_scaled)
    plot_actual_vs_predicted(y_test_incons, y_pred)
else:
    print("No accounts with sufficient data for inconsistent salary earners.")