# AnalysisTesting/salary.py

from readline import redisplay
from sqlalchemy import create_engine, text
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import re
import seaborn as sns
from matplotlib_venn import venn3, venn2
from wordcloud import WordCloud
from datetime import datetime, timedelta

# Database connection settings.
# Each value can be overridden with an environment variable of the same name;
# the literals are kept only as development fallbacks so existing runs behave
# as before.
# NOTE(review): real credentials should not live in source control.
import os
from urllib.parse import quote_plus

DB_USER = os.environ.get("DB_USER", "salaryloan")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "salaryloan")
DB_NAME = os.environ.get("DB_NAME", "salaryloan")
DB_PORT = os.environ.get("DB_PORT", "10532")
DB_HOST = os.environ.get("DB_HOST", "dev-data.simbrellang.net")
# User and password are URL-quoted so characters such as '@' or ':' coming
# from the environment cannot corrupt the connection URL.
DATABASE_URL = (
    f"postgresql://{quote_plus(DB_USER)}:{quote_plus(DB_PASSWORD)}"
    f"@{DB_HOST}:{DB_PORT}/{DB_NAME}"
)

engine = create_engine(DATABASE_URL)

# Connectivity smoke test: report the outcome but do not abort the script.
try:
    with engine.connect() as conn:
        conn.execute(text("SELECT version();"))
        print("Connected successfully!")
except Exception as e:
    print("Error connecting to database:", e)


# Table that is loaded in full below.
table_name = "customer_account_transaction_hx"

# Read every row of the table into a DataFrame; all later steps work on `df`.
df = pd.read_sql(f"SELECT * FROM {table_name}", engine)
df.head(10)  # preview only: the result is discarded when run as a script


# Convert the date columns to datetime
df['trx_start_date'] = pd.to_datetime(df['trx_start_date'])
df['trx_end_date'] = pd.to_datetime(df['trx_end_date'])

# Rename the generic d1-d4 columns to the names used by the rest of the script
df = df.rename(columns={'d1': 'trx_type', 'd2': 'trx_subtype',
                        'd3': 'initiated_by', 'd4': 'customer_id'})


# Salary/income-related terms searched for in transaction descriptions.
# They are passed to identify_salary_transactions(), which escapes each entry
# and matches it between word boundaries.
keywords = [
    "salary", "payroll", "income", "wage", "wages",
    "earnings", "earning", "monthly pay", "net pay", "gross pay", "compensation",
    "monthlypay", "netpay", "grosspay",
    "remuneration", "stipend", "allowance", "bonus", "commission",
    "pension", "retirement", "dividend", "benefits", "reimbursement",
    "overtime", "incentive", "paycheck", "paycheque", "salary advance",
    "monthly income", "income tax refund", "employer deposit",
    "payroll deposit", "salary credit", "income credit", "salary transfer",
    "income transfer", "salary received", "income received", "hr deposit",
    "company deposit", "employer payment", "employee payment",
    "sal",
]


def identify_salary_transactions(df, keywords):
    """
    Flags transactions whose description looks salary-related.

    A description is flagged when it contains one of the keywords as a whole
    word, or a month name/abbreviation followed by a 2-4 digit year
    (e.g. "JAN 2024", "february24"). Matching is case-insensitive.

    Args:
        df (pd.DataFrame): Transaction data with a 'description' column.
        keywords (list): Salary/income-related keywords to search for.

    Returns:
        pd.DataFrame: The same DataFrame object, with a boolean
                      'is_salary_related' column added in place. Missing
                      descriptions are flagged False.
    """
    month_year_patterns = [
        r"\b(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)\s?\d{2,4}\b",
        r"\b(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)\s?\d{2,4}\b"
    ]

    pattern_parts = []
    escaped_keywords = [re.escape(keyword) for keyword in keywords]
    # Skip the keyword alternative when there are no keywords: an empty group
    # between word boundaries would match almost every description.
    if escaped_keywords:
        pattern_parts.append(r'\b(?:' + '|'.join(escaped_keywords) + r')\b')
    pattern_parts.extend(month_year_patterns)
    combined_pattern = '|'.join(pattern_parts)

    # case=False is required: the month patterns are upper-case, so matching
    # them against lower-cased text (as before) could never succeed.
    df['is_salary_related'] = df['description'].str.contains(
        combined_pattern,
        case=False,
        na=False,
        regex=True
    )

    return df


# Flag salary-like descriptions. The function adds the column to `df` itself
# and returns that same object, so `desc_df` and `df` are one frame.
desc_df = identify_salary_transactions(df, keywords)
# Keep only the flagged transactions whose 'initiated_by' is 'C'.
desc_data = desc_df[(desc_df['is_salary_related'] == True) & (desc_df['initiated_by'] == 'C')]
desc_data.head()  # preview only: the result is discarded when run as a script

def calculate_coefficient_of_variation(group):
    """
    Computes the coefficient of variation (population std / mean) of the
    'amount' values on rows where 'initiated_by' equals 'C'.

    Args:
        group (pd.DataFrame): Transactions with 'initiated_by' and 'amount' columns.

    Returns:
        float: std / mean using the population standard deviation (ddof=0).
               NaN when the mean is exactly zero or when no rows qualify.
    """
    credit_amounts = group.loc[group['initiated_by'] == 'C', 'amount']
    average = credit_amounts.mean()

    # A zero mean would divide by zero; report "undefined" instead.
    if average == 0:
        return float('nan')

    spread = credit_amounts.std(ddof=0)
    return spread / average

def flag_consistent_amounts(group, cv_threshold=0.10):
    """
    Decides whether one account's 'C'-initiated amounts are consistent and
    broadcasts that single decision to every row of the account.

    Args:
        group (pd.DataFrame): Transactions for a single account.
        cv_threshold (float): Largest coefficient of variation still treated
                              as consistent (default: 0.10).

    Returns:
        pd.Series: One value per row of `group` (same index), named
                   'is_consistent_amount'. All values are identical.
    """
    # Only rows initiated by 'C' take part in the variation measure.
    credit_rows = group[group['initiated_by'] == 'C']
    variation = calculate_coefficient_of_variation(credit_rows)

    # An undefined CV (no qualifying rows or zero mean) is never consistent.
    if pd.isna(variation):
        consistent = False
    else:
        consistent = variation <= cv_threshold

    return pd.Series(
        [consistent] * len(group),
        index=group.index,
        name='is_consistent_amount'
    )

def identify_consistent_amount_accounts(df, cv_threshold=0.10):
    """
    Flags every transaction whose account has consistent 'C'-initiated amounts.

    An account is consistent when the coefficient of variation (population
    std / mean) of its 'amount' values on rows with 'initiated_by' == 'C' is
    defined and at most `cv_threshold`. Accounts with no such rows, or with a
    zero mean, are not consistent.

    The flag is computed with vectorised group statistics rather than
    `groupby.apply` with a Series-returning function: that form returns a
    wide DataFrame when there is only one account and fails on empty input.

    Args:
        df (pd.DataFrame): Transaction data with columns
                           ['accountid', 'amount', 'initiated_by'].
        cv_threshold (float): Maximum allowed CV (default: 0.10).

    Returns:
        pd.Series: Boolean Series named 'is_consistent_amount', indexed like
                   the rows of `df` that have a non-missing 'accountid'.
    """
    credits = df.loc[df['initiated_by'] == 'C', ['accountid', 'amount']]
    per_account = credits.groupby('accountid')['amount']
    mean = per_account.mean()
    std = per_account.std(ddof=0)

    # CV is undefined for a zero mean; NaN compares False against the threshold.
    cv = (std / mean).where(mean != 0)
    consistent_accounts = cv.index[(cv <= cv_threshold).to_numpy()]

    # Rows without an account id belong to no group and are left out.
    keyed = df.loc[df['accountid'].notna(), 'accountid']
    return keyed.isin(consistent_accounts).rename('is_consistent_amount')


# Per-transaction consistency flag, computed per account.
const_df = identify_consistent_amount_accounts(df, cv_threshold=0.10)
# Join the flag back onto the transactions by index label.
const_df = df.merge(const_df, left_index=True, right_index=True)
# Copy the flag onto the main frame as well (assignment aligns on the index).
df['is_consistent_amount'] = const_df['is_consistent_amount']
# Flagged transactions whose 'initiated_by' is 'C'.
const_data = const_df[
    (const_df['is_consistent_amount']) &
    (const_df['initiated_by']=='C')
]


def flag_salary_type_transactions(df):
    """
    Marks transactions whose type, subtype, initiator and amount all match
    the salary criteria.

    A row is flagged when 'trx_type' is 'T' or 'C', 'trx_subtype' is one of
    'BI', 'I', 'BS', 'CI', 'initiated_by' is 'C' and 'amount' is positive.

    Args:
        df (pd.DataFrame): Transaction data with columns
                           ['trx_type', 'trx_subtype', 'initiated_by', 'amount'].

    Returns:
        pd.DataFrame: The same DataFrame object with a boolean
                      'is_salary_type' column added in place.
    """
    type_matches = df['trx_type'].isin(['T', 'C'])
    subtype_matches = df['trx_subtype'].isin(['BI', 'I', 'BS', 'CI'])
    initiated_by_c = df['initiated_by'] == 'C'
    positive_amount = df['amount'] > 0

    df['is_salary_type'] = (
        type_matches & subtype_matches & initiated_by_c & positive_amount
    )
    return df


def is_salary_earner_by_type(group, min_transactions=3, threshold=0.7):
    """
    Tells whether an account looks like a salary earner from the share of
    its transactions that carry the 'is_salary_type' flag.

    Args:
        group (pd.DataFrame): Transactions for a single account, with an
                              'is_salary_type' column.
        min_transactions (int): Fewer rows than this never qualify (default: 3).
        threshold (float): Minimum share of flagged rows (default: 0.7).

    Returns:
        bool: True when there are enough rows and the flagged share reaches
              the threshold, False otherwise.
    """
    has_enough_history = len(group) >= min_transactions
    if has_enough_history:
        flagged_share = group['is_salary_type'].mean()
        return flagged_share >= threshold
    return False

# Add the 'is_salary_type' flag (written onto `df` itself), then keep the flagged rows.
trx_df = flag_salary_type_transactions(df)
trx_data = trx_df[trx_df['is_salary_type']]


def plot_hypothesis_overlap(hypothesis1_df,
                            hypothesis3_df, hypothesis4_df,
                            account_col='accountid'):
    """
    Draws a three-set Venn diagram of the accounts flagged by each hypothesis.

    Args:
        hypothesis1_df (pd.DataFrame): Frame with an 'is_salary_related' flag
                                       (salary-like description).
        hypothesis3_df (pd.DataFrame): Frame with an 'is_consistent_amount' flag.
        hypothesis4_df (pd.DataFrame): Frame with an 'is_salary_type' flag.
        account_col (str): Account identifier column.
    """
    def flagged_accounts(frame, flag_col):
        # Distinct account ids on the rows where the flag is set.
        return set(frame[account_col][frame[flag_col]])

    consistent = flagged_accounts(hypothesis3_df, 'is_consistent_amount')
    described = flagged_accounts(hypothesis1_df, 'is_salary_related')
    typed = flagged_accounts(hypothesis4_df, 'is_salary_type')

    labels = ('Consistent Amount', 'Salary Description', 'Transaction Type')

    plt.figure(figsize=(10, 10))
    venn3([consistent, described, typed], set_labels=labels)
    plt.title('Overlap Between Hypotheses')
    plt.show()

# Plot how the three flagged subsets built above overlap by account.
plot_hypothesis_overlap(desc_data, const_data, trx_data)

def filter_venn_section(df, **kwargs):
    """
    Selects the 'C'-initiated transactions that sit in one Venn section.

    Args:
        df (pd.DataFrame): Frame with 'initiated_by' plus the flag columns
                           ['is_salary_related', 'is_consistent_amount', 'is_salary_type'].
        **kwargs: Required value for each flag that should be constrained,
                  e.g. is_salary_related=True, is_consistent_amount=False.
                  Flags that are not named are left unconstrained.

    Returns:
        pd.DataFrame: Rows with 'initiated_by' == 'C' that match every
                      requested flag value.

    Raises:
        ValueError: If a keyword is not one of the three flag columns.
    """
    valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}
    credit_rows = df[df['initiated_by'] == 'C']

    unknown = set(kwargs) - valid_columns
    if unknown:
        raise ValueError(f"Invalid keys: {unknown}. Valid keys are {valid_columns}.")

    # Start from "everything matches" and narrow down one flag at a time.
    mask = pd.Series([True] * len(credit_rows), index=credit_rows.index)
    for flag, wanted in kwargs.items():
        mask = mask & (credit_rows[flag] == wanted)

    return credit_rows[mask]


# Green section: salary-like description and salary-type transaction,
# but not a consistent amount.
green_section = filter_venn_section(
    df,
    is_salary_related=True,
    is_consistent_amount=False,
    is_salary_type=True
)

# Preview the section. `readline.redisplay` takes no arguments and refreshes
# the terminal line buffer, so calling it with a frame raised TypeError;
# print the preview instead.
print(green_section.head(10))

# Yellow section: consistent amount and salary-type transaction,
# but no salary-like description.
yellow_section = filter_venn_section(
    df,
    is_salary_related=False,
    is_consistent_amount=True,
    is_salary_type=True
)


# Get accounts flagged by all three hypotheses
all_three_hypotheses = filter_venn_section(
    df,
    is_salary_related=True,
    is_consistent_amount=True,
    is_salary_type=True
)


def generate_salary_earners_table(all_three_hypotheses, now=None):
    """
    Builds a one-row-per-account summary of the given salary transactions.

    Args:
        all_three_hypotheses (pd.DataFrame): Transactions with columns
            ['accountid', 'trx_start_date', 'amount'].
        now: Reference point in time for the "recent" windows (anything
             accepted by pd.Timestamp). Defaults to the current local time.

    Returns:
        pd.DataFrame: Columns
            - accountid
            - num_months: number of transactions of the account
            - least_inflow_6m: smallest amount in the last 180 days
            - avg_monthly_salary: mean amount over all transactions
            - estimated_next_amount: same value as avg_monthly_salary
            - estimated_next_date: last date plus the median gap between
              consecutive transactions
            - 45daysalary: last transaction is at most 45 days old
            - 2monthssalary: at least two transactions in the last 60 days
        Rows with any missing value are dropped, i.e. accounts with a single
        transaction (no gap to estimate from) or with nothing in the last
        180 days. The columns are present even when no row remains.
    """
    columns = [
        'accountid', 'num_months', 'least_inflow_6m', 'avg_monthly_salary',
        'estimated_next_amount', 'estimated_next_date',
        '45daysalary', '2monthssalary',
    ]
    # Fix the reference time once so every window uses the same instant.
    now = pd.Timestamp.now() if now is None else pd.Timestamp(now)
    six_months_ago = now - pd.Timedelta(days=180)
    two_months_ago = now - pd.Timedelta(days=60)

    results = []
    for accountid, group in all_three_hypotheses.groupby('accountid'):
        # Gaps between transactions only make sense in chronological order.
        group = group.sort_values('trx_start_date')
        dates = group['trx_start_date']
        amounts = group['amount']

        least_inflow = amounts[dates >= six_months_ago].min()
        avg_salary = amounts.mean()

        # Estimated next salary date. to_timedelta maps a NaN median (single
        # transaction) to NaT, whereas timedelta(days=nan) raises ValueError.
        median_interval = dates.diff().dt.days.median()
        last_date = dates.max()
        next_date = last_date + pd.to_timedelta(median_interval, unit='D')

        # Boolean recency flags
        days_since_last = (now - last_date).days
        has_45d = days_since_last <= 45
        has_2m = int((dates >= two_months_ago).sum()) >= 2

        results.append({
            'accountid': accountid,
            'num_months': len(group),
            'least_inflow_6m': least_inflow,
            'avg_monthly_salary': avg_salary,
            'estimated_next_amount': avg_salary,
            'estimated_next_date': next_date,
            '45daysalary': has_45d,
            '2monthssalary': has_2m
        })

    # Explicit columns keep the schema stable when `results` is empty.
    final_df = pd.DataFrame(results, columns=columns)
    return final_df.dropna()

# Summary table for accounts whose transactions satisfy all three hypotheses.
final_table = generate_salary_earners_table(all_three_hypotheses)

# Display results
print(f"Found {final_table['accountid'].nunique()} verified salary earners")

# "Likely" earners: salary-type transactions that satisfy exactly one of the
# other two hypotheses (the yellow and green sections), de-duplicated on the
# 'id' column before being summarised.
likely_salary_earner = pd.concat([yellow_section, green_section])
likely_salary_earner = likely_salary_earner.drop_duplicates(subset=['id'])
likely_salary_earner = generate_salary_earners_table(likely_salary_earner)

# Display results
print(f"Found {likely_salary_earner['accountid'].nunique()} likely salary earners")


def analyze_salary_earners(final_df, min_amount=10000):
    """
    Picks out the high earners from a salary-earner summary table.

    Args:
        final_df (pd.DataFrame): Table with columns
            ['accountid', 'estimated_next_amount', 'least_inflow_6m'].
        min_amount (float): Smallest 'estimated_next_amount' that counts as a
                            high earner (default: 10000).

    Returns:
        tuple: (high_earner_details, count_high) where high_earner_details is
               a DataFrame with columns ['accountid', 'least_inflow_6m'] and a
               fresh 0..n-1 index, and count_high is its number of rows.
    """
    is_high = final_df['estimated_next_amount'] >= min_amount
    # Select rows and columns in one .loc call; the former self-assignment on
    # a filtered slice changed nothing and only triggered SettingWithCopyWarning.
    high_earner_details = final_df.loc[
        is_high, ['accountid', 'least_inflow_6m']
    ].reset_index(drop=True)

    return high_earner_details, len(high_earner_details)

# High earners among the accounts that satisfy all three hypotheses.
high_earner_details_df, total_high_earners = analyze_salary_earners(final_table)

print(f"\nTotal High Earners: {total_high_earners}")


# Write the three result tables as CSV files (relative paths, no index column).
high_earner_details_df.to_csv('high_earner_details.csv', index=False)
likely_salary_earner.to_csv('likely_salary_earner.csv', index=False)
final_table.to_csv('final_table.csv', index=False)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def add_feature_engineering(df):
    """
    Returns a copy of the transactions with extra features for salary prediction.

    Added columns:
        - month: calendar month of 'trx_start_date'.
        - month_seq: 1-based position of the transaction's year-month among
          the year-months seen for that account (0 when the date is missing).
        - trx_type_<value>: one-hot (0.0/1.0) encoding of 'trx_type'.
        - rolling_sum_3m / rolling_avg_3m: sum and mean of 'amount' over the
          account's last three transactions (fewer at the start), in date order.

    Args:
        df (pd.DataFrame): Transaction data with columns
            ['accountid', 'trx_start_date', 'trx_type', 'amount'].

    Returns:
        pd.DataFrame: New frame sorted by account and date; the input is not
                      modified.
    """
    # Work on a copy so the caller's frame does not gain columns.
    df = df.copy()

    dates = df['trx_start_date']
    df['month'] = dates.dt.month

    # Number the months per account. A global group number (ngroup) keeps
    # counting across accounts, and month-of-year alone merges e.g. Jan 2023
    # with Jan 2024, so rank a year-aware month index within each account.
    month_index = dates.dt.year * 12 + dates.dt.month
    df['month_seq'] = (
        month_index.groupby(df['accountid'])
        .rank(method='dense')
        .fillna(0)
        .astype(int)
    )

    # Categorical encoding: one-hot encode trx_type. get_dummies keeps the
    # frame's index, so the concat below stays row-aligned even when the index
    # is not 0..n-1 (a freshly indexed encoder output would misalign).
    # Missing trx_type values get all-zero indicators.
    encoded_df = pd.get_dummies(df['trx_type'], prefix='trx_type', dtype=float)
    df = pd.concat([df, encoded_df], axis=1)

    # Rolling statistics: sort by account and date
    df = df.sort_values(['accountid', 'trx_start_date'])
    rolling = df.groupby('accountid')['amount'].rolling(window=3, min_periods=1)
    # Dropping the account level leaves the original row labels for alignment.
    df['rolling_sum_3m'] = rolling.sum().reset_index(0, drop=True)
    df['rolling_avg_3m'] = rolling.mean().reset_index(0, drop=True)

    return df

def prepare_data(df_transactions, accounts):
    """
    Builds sliding-window training and test sets for the salary model.

    Steps: keep the transactions of the requested accounts, drop the columns
    that are not used as features, add engineered features, aggregate per
    (account, month_seq), keep accounts whose highest month_seq is at least
    12, then cut five-row windows that predict the 'amount' of the next row.
    Targets at positions 5-7 go to the training set and 8-11 to the test set.

    Args:
        df_transactions (pd.DataFrame): All transaction data.
        accounts (list): Account IDs to include.

    Returns:
        tuple: (X_train, y_train, X_test, y_test) as NumPy arrays. Each X row
               is the flattened feature values of five consecutive rows.
    """
    selected = df_transactions[df_transactions['accountid'].isin(accounts)].copy()
    print(f"Filtered data for {len(accounts)} accounts.")
    print(f"Total transactions: {len(selected)}")

    # Columns that play no part in the features
    unused_columns = [
        'description', 'id', 'customer_id', 'trx_end_date',
        'is_salary_related', 'is_consistent_amount', 'is_salary_type',
    ]
    selected = selected.drop(unused_columns, axis=1)

    selected = add_feature_engineering(selected)

    # One aggregated row per (account, month_seq); one-hot columns are summed.
    encoded_cols = [name for name in selected.columns if name.startswith('trx_type_')]
    agg_funcs = {
        'amount': 'mean',
        'rolling_sum_3m': 'last',
        'rolling_avg_3m': 'last',
        'month': 'first',
        **{name: 'sum' for name in encoded_cols},
    }
    monthly_data = (
        selected.groupby(['accountid', 'month_seq']).agg(agg_funcs).reset_index()
    )

    # Keep accounts whose highest month_seq reaches 12
    highest_seq = monthly_data.groupby('accountid')['month_seq'].max()
    valid_accounts = highest_seq[highest_seq >= 12].index
    monthly_data = monthly_data[monthly_data['accountid'].isin(valid_accounts)]

    # NOTE(review): 'accountid' is itself part of the feature vector - confirm
    # that this is intended.
    feature_cols = ['accountid', 'amount', 'rolling_sum_3m', 'rolling_avg_3m',
                    'month'] + encoded_cols

    X_train, y_train, X_test, y_test = [], [], [], []
    # (target positions, feature sink, target sink) for each split
    splits = (
        (range(5, 8), X_train, y_train),
        (range(8, 12), X_test, y_test),
    )

    for account in valid_accounts:
        history = monthly_data[monthly_data['accountid'] == account].sort_values('month_seq')

        # The windows below index up to row 11, so 12 rows are required.
        if len(history) < 12:
            print(f"Skipping account {account} due to insufficient data (less than 12 months).")
            continue

        for positions, features, targets in splits:
            for t in positions:
                window = history.iloc[t-5:t][feature_cols]
                features.append(window.values.flatten())
                targets.append(history['amount'].iloc[t])

    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)


def train_model(X_train, y_train, X_test, y_test):
    """
    Fits a Random Forest Regressor on standardised features and prints its
    test-set MAE, RMSE and R-squared.

    The scaler is fitted on the training features only and then applied to
    the test features.

    Args:
        X_train (np.ndarray): Training features.
        y_train (np.ndarray): Training targets.
        X_test (np.ndarray): Test features.
        y_test (np.ndarray): Test targets.

    Returns:
        tuple: (model, scaler) - the fitted regressor and the fitted scaler.
    """
    # Standardise features using training-set statistics
    scaler = StandardScaler()
    scaler.fit(X_train)
    train_features = scaler.transform(X_train)
    test_features = scaler.transform(X_test)

    # Fit the forest (fixed seed for reproducible results)
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(train_features, y_train)

    # Report test-set metrics
    predictions = model.predict(test_features)
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R-squared: {r2:.2f}")

    return model, scaler

def _plot_actual_vs_predicted(model, scaler, X_test, y_test):
    """Scatter-plot predicted against actual values; return the predictions."""
    predictions = model.predict(scaler.transform(X_test))

    plt.figure(figsize=(10, 5))
    plt.scatter(y_test, predictions, alpha=0.5)
    plt.xlabel("Actual Salary")
    plt.ylabel("Predicted Salary")
    plt.title("Actual vs. Predicted Salary")
    # Dashed diagonal marks where prediction equals the actual value.
    plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--')
    plt.show()
    return predictions


# Model for the accounts that satisfy all three hypotheses.
consistent_accounts = final_table['accountid'].unique()
X_train_cons, y_train_cons, X_test_cons, y_test_cons = prepare_data(df, consistent_accounts)
if len(X_train_cons) > 0:
    model_cons, scaler_cons = train_model(X_train_cons, y_train_cons, X_test_cons, y_test_cons)
    print("Model trained for consistent salary earners.")
    # Plot only when a model exists: previously this ran unconditionally and
    # raised NameError on `scaler_cons` when no account had enough data.
    y_pred = _plot_actual_vs_predicted(model_cons, scaler_cons, X_test_cons, y_test_cons)
else:
    print("No accounts with sufficient data for consistent salary earners.")


# Model for the "likely" salary earners.
inconsistent_accounts = likely_salary_earner['accountid'].unique()
X_train_incons, y_train_incons, X_test_incons, y_test_incons = prepare_data(df, inconsistent_accounts)
if len(X_train_incons) > 0:
    print("\nTraining model for inconsistent salary earners...")
    model_incons, scaler_incons = train_model(X_train_incons, y_train_incons, X_test_incons, y_test_incons)
    # Same guard as above for `scaler_incons`.
    y_pred = _plot_actual_vs_predicted(model_incons, scaler_incons, X_test_incons, y_test_incons)
else:
    print("No accounts with sufficient data for inconsistent salary earners.")