Remove salary.py file, eliminating all salary transaction analysis and related functions.

2025-04-28 19:45:19 +01:00
parent 591d4611b6
commit 7e7094f0fd
1 changed files with 0 additions and 549 deletions
@@ -1,549 +0,0 @@
 from readline import redisplay
 from sqlalchemy import create_engine, text
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import warnings
 warnings.filterwarnings('ignore')
 import re
 import seaborn as sns
 from matplotlib_venn import venn3, venn2
 from wordcloud import WordCloud
 from datetime import datetime, timedelta
 import os
 # Connection settings for the transaction-history database. Each value can be
 # overridden with a SALARYLOAN_DB_* environment variable so credentials do not
 # have to live in source control; the literals are only fallbacks.
 DB_USER = os.environ.get("SALARYLOAN_DB_USER", "salaryloan")
 DB_PASSWORD = os.environ.get("SALARYLOAN_DB_PASSWORD", "salaryloan")
 DB_NAME = os.environ.get("SALARYLOAN_DB_NAME", "salaryloan")
 DB_PORT = os.environ.get("SALARYLOAN_DB_PORT", "10532")
 DB_HOST = os.environ.get("SALARYLOAN_DB_HOST", "dev-data.simbrellang.net")
 DATABASE_URL = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
 engine = create_engine(DATABASE_URL)
 # Connectivity probe: run a trivial query and report the outcome. A failure is
 # only reported here; the table load below will raise if the database is
 # really unreachable.
 try:
    with engine.connect() as conn:
        conn.execute(text("SELECT version();"))
        print("Connected successfully!")
 except Exception as e:
    print("Error connecting to database:", e)
 # Load the full transaction-history table into memory.
 table_name = "customer_account_transaction_hx"
 df = pd.read_sql(f"SELECT * FROM {table_name}", engine)
 # Parse both transaction date columns into pandas datetimes.
 for date_col in ('trx_start_date', 'trx_end_date'):
    df[date_col] = pd.to_datetime(df[date_col])
 # Give the generic d1-d4 columns descriptive names.
 column_names = {
    'd1': 'trx_type',
    'd2': 'trx_subtype',
    'd3': 'initiated_by',
    'd4': 'customer_id',
 }
 df = df.rename(columns=column_names)
 # Words and phrases that hint at salary/income in a transaction description.
 # identify_salary_transactions() lower-cases and regex-escapes each entry and
 # matches it between word boundaries.
 keywords = [
    # Core pay terms
    "salary",
    "payroll",
    "income",
    "wage",
    "wages",
    "earnings",
    "earning",
    "monthly pay",
    "net pay",
    "gross pay",
    "compensation",
    # Same phrases written without a space
    "monthlypay",
    "netpay",
    "grosspay",
    # Other kinds of remuneration
    "remuneration",
    "stipend",
    "allowance",
    "bonus",
    "commission",
    "pension",
    "retirement",
    "dividend",
    "benefits",
    "reimbursement",
    "overtime",
    "incentive",
    "paycheck",
    "paycheque",
    "salary advance",
    # Multi-word deposit / transfer phrases
    "monthly income",
    "income tax refund",
    "employer deposit",
    "payroll deposit",
    "salary credit",
    "income credit",
    "salary transfer",
    "income transfer",
    "salary received",
    "income received",
    "hr deposit",
    "company deposit",
    "employer payment",
    "employee payment",
    # Short form; only matches as a standalone word because of the boundaries
    "sal",
 ]
 def identify_salary_transactions(df, keywords):
    """
    Flags transactions whose 'description' looks salary-related.
    A description is flagged when it contains any keyword as a whole word, or
    a month name/abbreviation followed by a 2-4 digit number (e.g. "JAN 2024",
    "march24"). Matching is case-insensitive; missing descriptions are never
    flagged.
    Args:
        df (pd.DataFrame): Transaction data with a 'description' column.
        keywords (list): Salary/income-related keywords to search for. May be
            empty, in which case only the month-year patterns are used.
    Returns:
        pd.DataFrame: The same DataFrame object (modified in place) with a
                      boolean 'is_salary_related' column added.
    """
    month_year_patterns = [
        r"\b(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)\s?\d{2,4}\b",
        r"\b(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)\s?\d{2,4}\b"
    ]
    alternatives = []
    escaped_keywords = [re.escape(keyword.lower()) for keyword in keywords]
    if escaped_keywords:
        # An empty alternation would match at every word boundary, so the
        # keyword branch is only added when there is something to match.
        alternatives.append(r'\b(?:' + '|'.join(escaped_keywords) + r')\b')
    alternatives.extend(month_year_patterns)
    combined_pattern = '|'.join(alternatives)
    # The month patterns are upper-case and the keywords lower-case, so the
    # whole pattern is matched case-insensitively against the raw text.
    df['is_salary_related'] = df['description'].str.contains(
        combined_pattern,
        flags=re.IGNORECASE,
        na=False,
        regex=True
    )
    return df
 # Run the description-based hypothesis, then keep only flagged rows whose
 # 'initiated_by' value is 'C'.
 desc_df = identify_salary_transactions(df, keywords)
 salary_described = desc_df['is_salary_related'].eq(True)
 initiated_by_c = desc_df['initiated_by'].eq('C')
 desc_data = desc_df[salary_described & initiated_by_c]
 desc_data.head()
 def calculate_coefficient_of_variation(group):
    """
    Computes the coefficient of variation (population std / mean) of the
    'amount' values on rows whose 'initiated_by' is 'C'.
    Args:
        group (pd.DataFrame): Transactions with 'initiated_by' and 'amount' columns
            (typically the rows of one account).
    Returns:
        float: std / mean of the selected amounts; NaN when the mean is zero
               or when no rows qualify.
    """
    c_amounts = group.loc[group['initiated_by'] == 'C', 'amount']
    average = c_amounts.mean()
    # A zero mean would divide by zero, so the CV is reported as undefined.
    if average == 0:
        return float('nan')
    # ddof=0: population standard deviation.
    spread = c_amounts.std(ddof=0)
    return spread / average
 def flag_consistent_amounts(group, cv_threshold=0.10):
    """
    Labels every row of one account with whether that account's
    'C'-initiated amounts vary little (coefficient of variation at or below
    the threshold).
    Args:
        group (pd.DataFrame): Transactions for a single account.
        cv_threshold (float): Largest CV still considered consistent (default: 0.10).
    Returns:
        pd.Series: One boolean per row of `group`, sharing its index and
                   named 'is_consistent_amount'.
    """
    # The CV is measured on 'C'-initiated rows only; the verdict is then
    # repeated for every row of the group.
    c_initiated = group.loc[group['initiated_by'] == 'C']
    cv = calculate_coefficient_of_variation(c_initiated)
    if pd.isna(cv):
        # An undefined CV never counts as consistent.
        is_consistent = False
    else:
        is_consistent = cv <= cv_threshold
    flags = [is_consistent] * len(group)
    return pd.Series(flags, index=group.index, name='is_consistent_amount')
 def identify_consistent_amount_accounts(df, cv_threshold=0.10):
    """
    Flags, per transaction, whether its account has consistent 'C'-initiated
    amounts.
    Args:
        df (pd.DataFrame): Transaction data with columns ['accountid', 'amount', 'initiated_by'].
        cv_threshold (float): Maximum allowed CV (default: 0.10).
    Returns:
        pd.Series: Boolean flags named 'is_consistent_amount', labelled with
                   the original row index of `df` and ordered account by
                   account. Rows whose 'accountid' is missing are absent
                   because groupby drops them.
    """
    # Collect the per-account results explicitly instead of relying on
    # groupby.apply: apply changes the shape of its result when every group
    # returns a Series with the same index (e.g. a single account).
    per_account_flags = [
        flag_consistent_amounts(group, cv_threshold)
        for _, group in df.groupby('accountid')
    ]
    if not per_account_flags:
        # No accounts at all: return an empty, correctly named boolean Series.
        return pd.Series(dtype=bool, name='is_consistent_amount')
    return pd.concat(per_account_flags)
 # Compute the per-row consistency flags and attach them to the transactions by
 # index (inner merge, so only rows that received a flag are kept in const_df).
 consistency_flags = identify_consistent_amount_accounts(df, cv_threshold=0.10)
 const_df = df.merge(consistency_flags, left_index=True, right_index=True)
 # Mirror the flag onto the main frame as well (aligned on the index).
 df['is_consistent_amount'] = const_df['is_consistent_amount']
 # Rows of consistent accounts whose 'initiated_by' is 'C'.
 initiated_by_c = const_df['initiated_by'] == 'C'
 const_data = const_df[(const_df['is_consistent_amount']) & initiated_by_c]
 def flag_salary_type_transactions(df):
    """
    Marks transactions whose type, subtype, initiator and amount all match the
    salary criteria.
    A row qualifies when trx_type is 'T' or 'C', trx_subtype is one of
    'BI', 'I', 'BS', 'CI', initiated_by is 'C' and amount is positive.
    Args:
        df (pd.DataFrame): Transaction data with columns ['trx_type', 'trx_subtype', 'initiated_by', 'amount'].
    Returns:
        pd.DataFrame: The same DataFrame object with a boolean 'is_salary_type' column added.
    """
    type_matches = df['trx_type'].isin(['T', 'C'])
    subtype_matches = df['trx_subtype'].isin(['BI', 'I', 'BS', 'CI'])
    initiated_by_c = df['initiated_by'] == 'C'
    positive_amount = df['amount'] > 0
    df['is_salary_type'] = type_matches & subtype_matches & initiated_by_c & positive_amount
    return df
 def is_salary_earner_by_type(group, min_transactions=3, threshold=0.7):
    """
    Decides whether an account looks like a salary earner from the share of
    its transactions flagged as salary-type.
    Args:
        group (pd.DataFrame): Transactions for a single account, with an 'is_salary_type' column.
        min_transactions (int): Fewest transactions needed to be considered (default: 3).
        threshold (float): Minimum share of salary-type transactions (default: 0.7).
    Returns:
        bool: True when the account has enough transactions and the share of
              salary-type ones reaches the threshold, False otherwise.
    """
    if len(group) >= min_transactions:
        # The mean of a boolean column is the fraction of True values.
        salary_share = group['is_salary_type'].mean()
        return salary_share >= threshold
    return False
 # Run the transaction-type hypothesis and keep only the rows it flags.
 trx_df = flag_salary_type_transactions(df)
 salary_type_mask = trx_df['is_salary_type']
 trx_data = trx_df[salary_type_mask]
 def plot_hypothesis_overlap(hypothesis1_df,
                            hypothesis3_df, hypothesis4_df,
                            account_col='accountid'):
    """
    Draws a three-circle Venn diagram of the accounts flagged by each hypothesis.
    Args:
        hypothesis1_df (pd.DataFrame): Rows with a boolean 'is_salary_related' column.
        hypothesis3_df (pd.DataFrame): Rows with a boolean 'is_consistent_amount' column.
        hypothesis4_df (pd.DataFrame): Rows with a boolean 'is_salary_type' column.
        account_col (str): Account identifier column.
    """
    # Distinct accounts flagged by each hypothesis.
    consistent_accounts = set(
        hypothesis3_df.loc[hypothesis3_df['is_consistent_amount'], account_col]
    )
    described_accounts = set(
        hypothesis1_df.loc[hypothesis1_df['is_salary_related'], account_col]
    )
    typed_accounts = set(
        hypothesis4_df.loc[hypothesis4_df['is_salary_type'], account_col]
    )
    # Circle order must match the label order.
    circle_labels = ('Consistent Amount',
                     'Salary Description', 'Transaction Type')
    plt.figure(figsize=(10, 10))
    venn3([consistent_accounts, described_accounts, typed_accounts],
          set_labels=circle_labels)
    plt.title('Overlap Between Hypotheses')
    plt.show()
 # Visualise how the description, consistency and type hypotheses overlap.
 plot_hypothesis_overlap(
    hypothesis1_df=desc_data,
    hypothesis3_df=const_data,
    hypothesis4_df=trx_data,
 )
 def filter_venn_section(df, **kwargs):
    """
    Selects the 'C'-initiated rows whose hypothesis flags equal the requested values.
    Args:
        df (pd.DataFrame): DataFrame with columns ['initiated_by', 'is_salary_related',
            'is_consistent_amount', 'is_salary_type'].
        **kwargs: Desired value for each hypothesis flag, e.g.
                  is_salary_related=True, is_consistent_amount=False.
                  Flags that are not named are not constrained.
    Returns:
        pd.DataFrame: Rows matching every requested flag value.
    Raises:
        ValueError: If a keyword is not one of the three hypothesis flags.
    """
    allowed_flags = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}
    c_rows = df[df['initiated_by'] == 'C']
    unknown_flags = set(kwargs) - allowed_flags
    if unknown_flags:
        raise ValueError(f"Invalid keys: {unknown_flags}. Valid keys are {allowed_flags}.")
    # Start from "everything matches" and narrow down one flag at a time.
    keep = pd.Series([True] * len(c_rows), index=c_rows.index)
    for flag_name, wanted in kwargs.items():
        keep = keep & (c_rows[flag_name] == wanted)
    return c_rows[keep]
 # "Green" section: salary description and salary type, but amounts are not consistent.
 green_section = filter_venn_section(
    df,
    is_salary_related=True,
    is_consistent_amount=False,
    is_salary_type=True
 )
 # Preview the first rows; a plain print works in a script as well as a notebook.
 print(green_section.head(10))
 # "Yellow" section: consistent amounts and salary type, but no salary description.
 yellow_section = filter_venn_section(
    df,
    is_salary_related=False,
    is_consistent_amount=True,
    is_salary_type=True
 )
 # Get accounts flagged by all three hypotheses
 all_three_hypotheses = filter_venn_section(
    df,
    is_salary_related=True,
    is_consistent_amount=True,
    is_salary_type=True
 )
 def generate_salary_earners_table(all_three_hypotheses):
    """
    Builds one summary row per account from its flagged transactions.
    Args:
        all_three_hypotheses (pd.DataFrame): Transactions with columns
            ['accountid', 'trx_start_date', 'amount'].
    Returns:
        pd.DataFrame: One row per account with columns
            'accountid', 'num_months' (number of transactions in the group),
            'least_inflow_6m' (smallest amount in the last 180 days),
            'avg_monthly_salary' (mean amount), 'estimated_next_amount'
            (same as the mean), 'estimated_next_date' (last date plus the
            median gap between consecutive transactions), '45daysalary'
            (last transaction within 45 days) and '2monthssalary' (at least
            two transactions within 60 days).
            Accounts with any missing value are dropped, e.g. a single
            transaction (no gap to measure) or nothing in the last 180 days.
            An empty input yields an empty frame that still has these columns.
    """
    columns = [
        'accountid', 'num_months', 'least_inflow_6m', 'avg_monthly_salary',
        'estimated_next_amount', 'estimated_next_date', '45daysalary',
        '2monthssalary',
    ]
    # Take "now" once so every account is measured against the same instant.
    now = datetime.now()
    six_months_ago = now - timedelta(days=180)
    two_months_ago = now - timedelta(days=60)
    results = []
    for accountid, group in all_three_hypotheses.groupby('accountid'):
        dates = group['trx_start_date']
        avg_salary = group['amount'].mean()
        least_inflow = group.loc[dates >= six_months_ago, 'amount'].min()
        # Gaps must be measured between chronologically adjacent transactions.
        median_interval = dates.sort_values().diff().dt.days.median()
        last_date = dates.max()
        # to_timedelta turns a NaN gap into NaT (instead of raising), so such
        # accounts are removed by dropna() below.
        next_date = last_date + pd.to_timedelta(median_interval, unit='D')
        days_since_last = (now - last_date).days
        results.append({
            'accountid': accountid,
            'num_months': len(group),
            'least_inflow_6m': least_inflow,
            'avg_monthly_salary': avg_salary,
            'estimated_next_amount': avg_salary,
            'estimated_next_date': next_date,
            '45daysalary': days_since_last <= 45,
            '2monthssalary': bool((dates >= two_months_ago).sum() >= 2)
        })
    # Passing the columns keeps the schema stable even when there are no rows.
    final_df = pd.DataFrame(results, columns=columns)
    return final_df.dropna()
 # Accounts confirmed by all three hypotheses.
 final_table = generate_salary_earners_table(all_three_hypotheses)
 verified_count = final_table['accountid'].nunique()
 print(f"Found {verified_count} verified salary earners")
 # Accounts confirmed by two hypotheses (yellow and green Venn sections),
 # de-duplicated on the transaction 'id' before being summarised.
 likely_salary_earner = (
    pd.concat([yellow_section, green_section])
    .drop_duplicates(subset=['id'])
 )
 likely_salary_earner = generate_salary_earners_table(likely_salary_earner)
 likely_count = likely_salary_earner['accountid'].nunique()
 print(f"Found {likely_count} likely salary earners")
 def analyze_salary_earners(final_df, min_amount=10000):
    """
    Picks out high earners: accounts whose estimated next salary is at least
    `min_amount`.
    Args:
        final_df (pd.DataFrame): Salary earner table with columns
            ['accountid', 'least_inflow_6m', 'estimated_next_amount'].
        min_amount (float): Smallest 'estimated_next_amount' that counts as a
            high earner (default: 10000).
    Returns:
        tuple: (high_earner_details, count_high) where high_earner_details is a
               DataFrame with columns ['accountid', 'least_inflow_6m'] and a
               fresh 0-based index, and count_high is its number of rows.
    """
    is_high_earner = final_df['estimated_next_amount'] >= min_amount
    high_earner_details = (
        final_df.loc[is_high_earner, ['accountid', 'least_inflow_6m']]
        .reset_index(drop=True)
    )
    return high_earner_details, len(high_earner_details)
 # Summarise the high earners among the verified salary earners.
 high_earner_details_df, total_high_earners = analyze_salary_earners(final_table)
 print(f"\nTotal High Earners: {total_high_earners}")
 # Write the three result tables to CSV files in the working directory.
 for frame, csv_path in (
    (high_earner_details_df, 'high_earner_details.csv'),
    (likely_salary_earner, 'likely_salary_earner.csv'),
    (final_table, 'final_table.csv'),
 ):
    frame.to_csv(csv_path, index=False)
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.preprocessing import StandardScaler, OneHotEncoder
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
 def add_feature_engineering(df):
    """
    Adds model features to a copy of the transaction data.
    Added columns:
        month: calendar month of 'trx_start_date'.
        month_seq: per-account running number (starting at 1) of the distinct
            calendar year-months in which the account has transactions, in
            chronological order; 0 for rows without a usable date or account.
        trx_type_<value>: one-hot encoding of 'trx_type'.
        rolling_sum_3m / rolling_avg_3m: sum and mean of 'amount' over the
            current and up to two preceding transactions of the same account
            (a 3-row window, not a calendar window).
    Args:
        df (pd.DataFrame): Transaction data with columns
            ['accountid', 'trx_start_date', 'trx_type', 'amount'].
    Returns:
        pd.DataFrame: A new DataFrame sorted by account and date with the
                      features added; the input is left untouched.
    """
    # Work on a copy so the caller's frame does not gain columns as a side effect.
    df = df.copy()
    dates = df['trx_start_date']
    df['month'] = dates.dt.month
    # Number months per account and across years, so the sequence is
    # chronological and restarts at 1 for every account.
    year_month = dates.dt.year * 12 + dates.dt.month
    df['month_seq'] = (
        year_month.groupby(df['accountid'])
        .rank(method='dense')
        .fillna(0)
        .astype(int)
    )
    # Categorical encoding: one-hot encode trx_type
    encoder = OneHotEncoder(sparse_output=False)
    encoded_trx_type = encoder.fit_transform(df[['trx_type']])
    # Reuse df's index so the encoded rows line up with their transactions
    # when the frame was filtered and no longer has a 0..n-1 index.
    encoded_df = pd.DataFrame(
        encoded_trx_type,
        columns=encoder.get_feature_names_out(['trx_type']),
        index=df.index,
    )
    df = pd.concat([df, encoded_df], axis=1)
    # Rolling statistics: sort by account and date
    df = df.sort_values(['accountid', 'trx_start_date'])
    rolling_amounts = df.groupby('accountid')['amount'].rolling(window=3,
                                                                min_periods=1)
    # Dropping the account level leaves the original row labels, so the
    # assignment aligns each statistic with its transaction.
    df['rolling_sum_3m'] = rolling_amounts.sum().reset_index(0, drop=True)
    df['rolling_avg_3m'] = rolling_amounts.mean().reset_index(0, drop=True)
    return df
 def prepare_data(df_transactions, accounts):
    """
    Builds sliding-window training and testing arrays for salary prediction.
    The transactions of the given accounts are feature-engineered and
    aggregated per (accountid, month_seq). For each account that passes the
    12-month checks, every sample is the flattened feature rows of 5
    consecutive months and the target is the following month's mean amount:
    targets at positions 5-7 go to the training set, positions 8-11 to the
    testing set.
    Args:
        df_transactions (pd.DataFrame): All transaction data, including the
            hypothesis flag columns that are dropped here.
        accounts (list): Account IDs to include.
    Returns:
        tuple: NumPy arrays (X_train, y_train, X_test, y_test).
    """
    selected = df_transactions[df_transactions['accountid'].isin(accounts)].copy()
    print(f"Filtered data for {len(accounts)} accounts.")
    print(f"Total transactions: {len(selected)}")
    # Columns that are not used as model inputs.
    unused_columns = ['description', 'id', 'customer_id',
                      'trx_end_date', 'is_salary_related',
                      'is_consistent_amount', 'is_salary_type']
    selected = add_feature_engineering(selected.drop(columns=unused_columns))
    # One aggregated row per account and month; the one-hot columns are summed.
    encoded_cols = [name for name in selected.columns if name.startswith('trx_type_')]
    agg_funcs = {
        'amount': 'mean',
        'rolling_sum_3m': 'last',
        'rolling_avg_3m': 'last',
        'month': 'first',
        **{name: 'sum' for name in encoded_cols},
    }
    monthly_data = selected.groupby(['accountid', 'month_seq']).agg(agg_funcs).reset_index()
    # Keep accounts whose highest month_seq reaches 12.
    highest_seq = monthly_data.groupby('accountid')['month_seq'].max()
    valid_accounts = highest_seq[highest_seq >= 12].index
    monthly_data = monthly_data[monthly_data['accountid'].isin(valid_accounts)]
    feature_cols = ['accountid', 'amount', 'rolling_sum_3m', 'rolling_avg_3m',
                    'month', *encoded_cols]
    X_train, y_train, X_test, y_test = [], [], [], []
    for account in valid_accounts:
        history = monthly_data[monthly_data['accountid'] == account].sort_values('month_seq')
        # The windows below index rows 0-11, so 12 monthly rows are required.
        if len(history) < 12:
            print(f"Skipping account {account} due to insufficient data (less than 12 months).")
            continue
        for t in range(5, 12):
            window = history.iloc[t - 5:t][feature_cols].values.flatten()
            target = history['amount'].iloc[t]
            if t < 8:
                X_train.append(window)
                y_train.append(target)
            else:
                X_test.append(window)
                y_test.append(target)
    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)
 def train_model(X_train, y_train, X_test, y_test):
    """
    Fits a Random Forest regressor on standardised features and reports its
    test-set error.
    The scaler is fitted on the training features only and then applied to the
    test features. MAE, RMSE and R-squared on the test set are printed.
    Args:
        X_train (np.ndarray): Training data features.
        y_train (np.ndarray): Training data target (salary).
        X_test (np.ndarray): Testing data features.
        y_test (np.ndarray): Testing data target (salary).
    Returns:
        tuple: (model, scaler) - the fitted regressor and the fitted scaler.
    """
    # Standardise features using statistics from the training set.
    scaler = StandardScaler()
    train_features = scaler.fit_transform(X_train)
    test_features = scaler.transform(X_test)
    # Fixed random_state keeps the fitted forest reproducible.
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(train_features, y_train)
    # Score the predictions on the held-out samples.
    predictions = model.predict(test_features)
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R-squared: {r2:.2f}")
    return model, scaler
 # Model for the verified (consistent) salary earners.
 consistent_accounts = final_table['accountid'].unique()
 X_train_cons, y_train_cons, X_test_cons, y_test_cons = prepare_data(df, consistent_accounts)
 if len(X_train_cons) > 0:
    model_cons, scaler_cons = train_model(X_train_cons, y_train_cons, X_test_cons, y_test_cons)
    print("Model trained for consistent salary earners.")
    # Evaluation plot: only possible when a model was actually trained, so it
    # lives inside this branch.
    X_test_cons_scaled = scaler_cons.transform(X_test_cons)
    y_pred = model_cons.predict(X_test_cons_scaled)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_cons, y_pred, alpha=0.5)
    plt.xlabel("Actual Salary")
    plt.ylabel("Predicted Salary")
    plt.title("Actual vs. Predicted Salary")
    # Dashed diagonal marks perfect predictions.
    actual_range_cons = [min(y_test_cons), max(y_test_cons)]
    plt.plot(actual_range_cons, actual_range_cons, 'r--')
    plt.show()
 else:
    print("No accounts with sufficient data for consistent salary earners.")
 # Model for the likely (inconsistent) salary earners.
 inconsistent_accounts = likely_salary_earner['accountid'].unique()
 X_train_incons, y_train_incons, X_test_incons, y_test_incons = prepare_data(df, inconsistent_accounts)
 if len(X_train_incons) > 0:
    print("\nTraining model for inconsistent salary earners...")
    model_incons, scaler_incons = train_model(X_train_incons, y_train_incons, X_test_incons, y_test_incons)
    # Evaluation plot: only possible when a model was actually trained, so it
    # lives inside this branch.
    X_test_incons_scaled = scaler_incons.transform(X_test_incons)
    y_pred = model_incons.predict(X_test_incons_scaled)
    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_incons, y_pred, alpha=0.5)
    plt.xlabel("Actual Salary")
    plt.ylabel("Predicted Salary")
    plt.title("Actual vs. Predicted Salary")
    # Dashed diagonal marks perfect predictions.
    actual_range_incons = [min(y_test_incons), max(y_test_incons)]
    plt.plot(actual_range_incons, actual_range_incons, 'r--')
    plt.show()
 else:
    print("No accounts with sufficient data for inconsistent salary earners.")