Remove salary.py file, eliminating all salary transaction analysis and related functions.
This commit is contained in:
@@ -1,549 +0,0 @@
|
|||||||
from readline import redisplay
|
|
||||||
from sqlalchemy import create_engine, text
|
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
import warnings
|
|
||||||
warnings.filterwarnings('ignore')
|
|
||||||
import re
|
|
||||||
import seaborn as sns
|
|
||||||
from matplotlib_venn import venn3, venn2
|
|
||||||
from wordcloud import WordCloud
|
|
||||||
from datetime import datetime, timedelta
|
|
||||||
|
|
||||||
import os  # needed only for the environment overrides below

# Connection settings. Each value may be overridden by an environment variable
# of the same name; the literals are the fallbacks that were previously
# hard-coded here.
# NOTE(review): the fallback credentials are still committed to source
# control -- consider removing the defaults once the environment is set up.
DB_USER = os.environ.get("DB_USER", "salaryloan")
DB_PASSWORD = os.environ.get("DB_PASSWORD", "salaryloan")
DB_NAME = os.environ.get("DB_NAME", "salaryloan")
DB_PORT = os.environ.get("DB_PORT", "10532")
DB_HOST = os.environ.get("DB_HOST", "dev-data.simbrellang.net")
DATABASE_URL = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"

engine = create_engine(DATABASE_URL)

# Smoke-test the connection. A failure is reported but not re-raised, so the
# script continues exactly as before (the read below will then fail loudly).
try:
    with engine.connect() as conn:
        conn.execute(text("SELECT version();"))
        print("Connected successfully!")
except Exception as e:
    print("Error connecting to database:", e)

table_name = "customer_account_transaction_hx"

# Load the whole transaction table into memory.
df = pd.read_sql(f"SELECT * FROM {table_name}", engine)
df.head(10)
|
|
||||||
|
|
||||||
|
|
||||||
# Parse both transaction timestamp columns into pandas datetimes.
for _date_col in ('trx_start_date', 'trx_end_date'):
    df[_date_col] = pd.to_datetime(df[_date_col])

# Give the generic d1..d4 columns descriptive names.
_descriptive_names = {
    'd1': 'trx_type',
    'd2': 'trx_subtype',
    'd3': 'initiated_by',
    'd4': 'customer_id',
}
df = df.rename(columns=_descriptive_names)
|
|
||||||
|
|
||||||
|
|
||||||
# Words and phrases searched for in transaction descriptions when looking for
# salary/income-like credits. Order is preserved from the original list.
keywords = [
    # core terms
    "salary",
    "payroll",
    "income",
    "wage",
    "wages",
    "earnings",
    "earning",
    # pay phrases, spaced and unspaced
    "monthly pay",
    "net pay",
    "gross pay",
    "compensation",
    "monthlypay",
    "netpay",
    "grosspay",
    # other kinds of remuneration
    "remuneration",
    "stipend",
    "allowance",
    "bonus",
    "commission",
    "pension",
    "retirement",
    "dividend",
    "benefits",
    "reimbursement",
    "overtime",
    "incentive",
    "paycheck",
    "paycheque",
    "salary advance",
    # multi-word deposit / transfer phrases
    "monthly income",
    "income tax refund",
    "employer deposit",
    "payroll deposit",
    "salary credit",
    "income credit",
    "salary transfer",
    "income transfer",
    "salary received",
    "income received",
    "hr deposit",
    "company deposit",
    "employer payment",
    "employee payment",
    # abbreviation
    "sal",
]
|
|
||||||
|
|
||||||
|
|
||||||
def identify_salary_transactions(df, keywords):
    """
    Flag transactions whose description looks salary-related.

    A description is flagged when it contains one of ``keywords`` as a whole
    word/phrase, or a month name (full or three-letter) followed by a 2-4
    digit year, e.g. ``"JAN 2024"`` or ``"march23"``. Matching is
    case-insensitive.

    Args:
        df (pd.DataFrame): Transaction data with a 'description' column.
        keywords (list): Salary/income-related keywords to search for. An
            empty list means only the month-year patterns are used.

    Returns:
        pd.DataFrame: The same DataFrame object, modified in place with an
        added boolean 'is_salary_related' column. Missing descriptions are
        flagged False.
    """
    month_year_patterns = [
        r"\b(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)\s?\d{2,4}\b",
        r"\b(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)\s?\d{2,4}\b",
    ]

    alternatives = []
    if keywords:
        # Guarded: with no keywords the group would be empty and the
        # resulting r'\b(?:)\b' matches almost every description.
        escaped_keywords = [re.escape(keyword.lower()) for keyword in keywords]
        alternatives.append(r'\b(?:' + '|'.join(escaped_keywords) + r')\b')
    alternatives.extend(month_year_patterns)
    combined_pattern = '|'.join(alternatives)

    # The month patterns are written in upper case, so the match must be
    # case-insensitive; lower-casing the text first (as was done before) made
    # those patterns unmatchable.
    df['is_salary_related'] = df['description'].str.contains(
        combined_pattern,
        flags=re.IGNORECASE,
        na=False,
        regex=True,
    )

    return df
|
|
||||||
|
|
||||||
|
|
||||||
# Flag every transaction by description, then keep the flagged rows whose
# 'initiated_by' value is 'C'.
desc_df = identify_salary_transactions(df, keywords)
_desc_flagged = desc_df['is_salary_related'] == True
_desc_from_c = desc_df['initiated_by'] == 'C'
desc_data = desc_df[_desc_flagged & _desc_from_c]
desc_data.head()
|
|
||||||
|
|
||||||
def calculate_coefficient_of_variation(group):
    """
    Calculate the coefficient of variation (CV) of the amounts of the
    transactions in ``group`` where 'initiated_by' is 'C'.

    The CV is the population standard deviation (ddof=0) divided by the
    absolute value of the mean, so it is never negative. Dividing by the
    signed mean would give a negative CV for a negative mean, which would
    pass any "CV <= threshold" check.

    Args:
        group (pd.DataFrame): Transactions with 'initiated_by' and 'amount'
            columns (typically the rows of a single account).

    Returns:
        float: The coefficient of variation, or NaN when there are no
        matching transactions or their mean is zero.
    """
    amounts = group.loc[group['initiated_by'] == 'C', 'amount']
    if amounts.empty:
        return float('nan')

    mean = amounts.mean()
    if pd.isna(mean) or mean == 0:
        return float('nan')

    std = amounts.std(ddof=0)
    return std / abs(mean)
|
|
||||||
|
|
||||||
def flag_consistent_amounts(group, cv_threshold=0.10):
    """
    Flag every transaction of an account whose 'C'-initiated amounts vary little.

    The account is consistent when the magnitude of the coefficient of
    variation of its 'C'-initiated amounts is at most ``cv_threshold``. The
    magnitude is used so that a negative CV (negative mean amount) is not
    treated as "below the threshold".

    Args:
        group (pd.DataFrame): Transactions for a single account.
        cv_threshold (float): Maximum allowed CV to flag as consistent (default: 0.10).

    Returns:
        pd.Series: One boolean per row of ``group`` (same index), named
        'is_consistent_amount'. All values are equal. They are False when the
        CV is NaN.
    """
    # calculate_coefficient_of_variation already restricts itself to the
    # rows where initiated_by == 'C', so the group is passed through as is.
    cv = calculate_coefficient_of_variation(group)
    is_consistent = bool(not pd.isna(cv) and abs(cv) <= cv_threshold)

    return pd.Series(
        [is_consistent] * len(group),
        index=group.index,
        name='is_consistent_amount',
    )
|
|
||||||
|
|
||||||
def identify_consistent_amount_accounts(df, cv_threshold=0.10):
    """
    Flag the transactions of accounts whose 'C'-initiated amounts are consistent.

    For each account the coefficient of variation (population std divided by
    the absolute mean) of the amounts of its 'C'-initiated transactions is
    computed. An account is consistent when that value is at most
    ``cv_threshold``. Accounts with no 'C'-initiated transactions, or with a
    zero mean, are not consistent.

    The statistics are computed with vectorised group-by aggregations rather
    than ``groupby(...).apply`` returning a Series per group, because the
    shape of that result depends on the number of groups.

    Args:
        df (pd.DataFrame): Transaction data with columns ['accountid', 'amount', 'initiated_by'].
        cv_threshold (float): Maximum allowed CV (default: 0.10).

    Returns:
        pd.Series: Boolean Series named 'is_consistent_amount' with one entry
        per row of ``df`` (same index, same order). ``df`` is not modified.
    """
    initiated_by_c = df.loc[df['initiated_by'] == 'C']
    amounts_by_account = initiated_by_c.groupby('accountid')['amount']

    mean = amounts_by_account.mean()
    std = amounts_by_account.std(ddof=0)

    # A zero mean yields inf or NaN here; both compare False against the
    # threshold, so such accounts are not flagged.
    cv = std / mean.abs()
    consistent_accounts = cv.index[cv <= cv_threshold]

    flags = df['accountid'].isin(consistent_accounts)
    return flags.rename('is_consistent_amount')
|
|
||||||
|
|
||||||
|
|
||||||
# Compute the per-row consistency flag and attach it by index, both to a
# merged copy (const_df) and to the working frame (df).
_consistency_flags = identify_consistent_amount_accounts(df, cv_threshold=0.10)
const_df = df.merge(_consistency_flags, left_index=True, right_index=True)
df['is_consistent_amount'] = const_df['is_consistent_amount']

# Rows of consistent accounts whose 'initiated_by' value is 'C'.
_const_from_c = const_df['initiated_by'] == 'C'
const_data = const_df[const_df['is_consistent_amount'] & _const_from_c]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def flag_salary_type_transactions(df, trx_types=('T', 'C'),
                                  trx_subtypes=('BI', 'I', 'BS', 'CI'),
                                  initiated_by='C'):
    """
    Flag transactions that match the salary criteria on type, subtype,
    initiator and amount.

    A row is flagged when its 'trx_type' is one of ``trx_types``, its
    'trx_subtype' is one of ``trx_subtypes``, its 'initiated_by' equals
    ``initiated_by`` and its 'amount' is strictly positive.

    Args:
        df (pd.DataFrame): Transaction data with columns ['trx_type', 'trx_subtype', 'initiated_by', 'amount'].
        trx_types (iterable): Accepted transaction type codes (default: 'T', 'C').
        trx_subtypes (iterable): Accepted transaction subtype codes
            (default: 'BI', 'I', 'BS', 'CI').
        initiated_by (str): Required value of the 'initiated_by' column (default: 'C').

    Returns:
        pd.DataFrame: The same DataFrame object, modified in place with an
        added boolean 'is_salary_type' column.
    """
    type_ok = df['trx_type'].isin(list(trx_types))
    subtype_ok = df['trx_subtype'].isin(list(trx_subtypes))
    initiator_ok = df['initiated_by'] == initiated_by
    positive_amount = df['amount'] > 0

    df['is_salary_type'] = type_ok & subtype_ok & initiator_ok & positive_amount
    return df
|
|
||||||
|
|
||||||
|
|
||||||
def is_salary_earner_by_type(group, min_transactions=3, threshold=0.7):
    """
    Decide whether an account looks like a salary earner's, based on the
    share of its transactions flagged as salary-type.

    Args:
        group (pd.DataFrame): Transactions for a single account, with a
            boolean 'is_salary_type' column.
        min_transactions (int): Minimum transactions required to qualify (default: 3).
        threshold (float): Minimum proportion of salary-type transactions (default: 0.7).

    Returns:
        bool: True if the account has at least ``min_transactions`` rows and
        the fraction of salary-type rows is at least ``threshold``, False
        otherwise.
    """
    if len(group) < min_transactions:
        return False

    valid_ratio = group['is_salary_type'].mean()
    # Cast so callers get a plain Python bool (as documented) rather than a
    # numpy boolean; a NaN ratio compares False.
    return bool(valid_ratio >= threshold)
|
|
||||||
|
|
||||||
# Add the type-based flag, then keep only the rows that satisfy it.
trx_df = flag_salary_type_transactions(df)
trx_data = trx_df.loc[trx_df['is_salary_type']]
|
|
||||||
|
|
||||||
|
|
||||||
def plot_hypothesis_overlap(hypothesis1_df,
                            hypothesis3_df, hypothesis4_df,
                            account_col='accountid'):
    """
    Plot a three-set Venn diagram of the accounts flagged by each hypothesis.

    Args:
        hypothesis1_df (pd.DataFrame): Rows with a boolean 'is_salary_related'
            column (description hypothesis).
        hypothesis3_df (pd.DataFrame): Rows with a boolean 'is_consistent_amount'
            column (consistent-amount hypothesis).
        hypothesis4_df (pd.DataFrame): Rows with a boolean 'is_salary_type'
            column (transaction-type hypothesis).
        account_col (str): Account identifier column.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    def _flagged_accounts(frame, flag_col):
        # Distinct account ids of the rows where the flag column is True.
        return set(frame.loc[frame[flag_col], account_col])

    consistent = _flagged_accounts(hypothesis3_df, 'is_consistent_amount')
    described = _flagged_accounts(hypothesis1_df, 'is_salary_related')
    typed = _flagged_accounts(hypothesis4_df, 'is_salary_type')

    plt.figure(figsize=(10, 10))
    venn3([consistent, described, typed],
          set_labels=('Consistent Amount', 'Salary Description', 'Transaction Type'))
    plt.title('Overlap Between Hypotheses')
    plt.show()
|
|
||||||
|
|
||||||
# Draw the Venn diagram for the description, consistent-amount and transaction-type subsets.
plot_hypothesis_overlap(desc_data, const_data, trx_data)
|
|
||||||
|
|
||||||
def filter_venn_section(df, **kwargs):
    """
    Select the 'C'-initiated rows that match a combination of hypothesis flags.

    Args:
        df (pd.DataFrame): DataFrame with an 'initiated_by' column and the flag
            columns ['is_salary_related', 'is_consistent_amount', 'is_salary_type'].
        **kwargs: Required value for each flag, keyed by flag column name.
            For example: is_salary_related=True, is_consistent_amount=False.
            Flags that are not given are not constrained.

    Returns:
        pd.DataFrame: The rows with initiated_by == 'C' whose flags equal the
        requested values.

    Raises:
        ValueError: If a keyword is not one of the three flag column names.
    """
    valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}
    initiated_by_c = df[df['initiated_by'] == 'C']

    invalid_keys = set(kwargs.keys()) - valid_columns
    if invalid_keys:
        raise ValueError(f"Invalid keys: {invalid_keys}. Valid keys are {valid_columns}.")

    # Start from "keep everything" and narrow it down one flag at a time.
    keep = pd.Series([True] * len(initiated_by_c), index=initiated_by_c.index)
    for flag_name, wanted in kwargs.items():
        keep &= (initiated_by_c[flag_name] == wanted)

    return initiated_by_c[keep]
|
|
||||||
|
|
||||||
|
|
||||||
# Description + transaction type agree, amounts are not consistent.
green_section = filter_venn_section(
    df,
    is_salary_related=True,
    is_consistent_amount=False,
    is_salary_type=True
)

# `redisplay` is imported from `readline`, where it accepts no arguments, so
# calling it with a DataFrame raises TypeError. Print the preview instead.
print(green_section.head(10))

# Consistent amounts + transaction type agree, description does not.
yellow_section = filter_venn_section(
    df,
    is_salary_related=False,
    is_consistent_amount=True,
    is_salary_type=True
)

# Get accounts flagged by all three hypotheses
all_three_hypotheses = filter_venn_section(
    df,
    is_salary_related=True,
    is_consistent_amount=True,
    is_salary_type=True
)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def generate_salary_earners_table(all_three_hypotheses, now=None):
    """
    Summarise each account's transactions into one row of salary metrics.

    Args:
        all_three_hypotheses (pd.DataFrame): Transactions with columns
            ['accountid', 'trx_start_date', 'amount'].
        now (datetime, optional): Reference "current" time for the recency
            windows. Defaults to ``datetime.now()``, evaluated once per call.

    Returns:
        pd.DataFrame: One row per account with the columns
        ['accountid', 'num_months', 'least_inflow_6m', 'avg_monthly_salary',
        'estimated_next_amount', 'estimated_next_date', '45daysalary',
        '2monthssalary']. Rows with any missing value are dropped. This
        removes accounts with no transaction in the last 180 days (no
        'least_inflow_6m') and accounts with a single transaction (no
        interval from which to estimate the next date). An empty input yields
        an empty frame that still has these columns.
    """
    columns = [
        'accountid', 'num_months', 'least_inflow_6m', 'avg_monthly_salary',
        'estimated_next_amount', 'estimated_next_date',
        '45daysalary', '2monthssalary',
    ]

    if now is None:
        now = datetime.now()
    six_months_ago = now - timedelta(days=180)
    two_months_ago = now - timedelta(days=60)

    results = []
    for accountid, group in all_three_hypotheses.groupby('accountid'):
        # Work on sorted dates so consecutive differences are real intervals.
        # Nothing is written back into `group`, which is a slice of the input.
        dates = group['trx_start_date'].sort_values()
        amounts = group['amount']

        # NOTE(review): this counts transactions, not distinct calendar
        # months -- confirm which one 'num_months' is meant to hold.
        num_months = len(group)
        least_inflow = amounts[group['trx_start_date'] >= six_months_ago].min()
        avg_salary = amounts.mean()

        # Estimated next salary: last date plus the median gap between
        # consecutive transactions. With a single transaction there is no
        # gap, so the date is left missing instead of feeding NaN to
        # timedelta (which raises ValueError).
        median_interval = dates.diff().dt.days.median()
        last_date = dates.max()
        if pd.isna(median_interval):
            next_date = pd.NaT
        else:
            next_date = last_date + timedelta(days=float(median_interval))

        days_since_last = (now - last_date).days

        results.append({
            'accountid': accountid,
            'num_months': num_months,
            'least_inflow_6m': least_inflow,
            'avg_monthly_salary': avg_salary,
            'estimated_next_amount': avg_salary,
            'estimated_next_date': next_date,
            '45daysalary': days_since_last <= 45,
            '2monthssalary': int((dates >= two_months_ago).sum()) >= 2,
        })

    final_df = pd.DataFrame(results, columns=columns)
    final_df = final_df.dropna()
    return final_df
|
|
||||||
|
|
||||||
# Summary table for the accounts flagged by all three hypotheses.
final_table = generate_salary_earners_table(all_three_hypotheses)

# Display results
print(f"Found {final_table['accountid'].nunique()} verified salary earners")

# Stack the yellow and green Venn sections, drop repeated transaction ids,
# and summarise the remainder the same way.
likely_salary_earner = generate_salary_earners_table(
    pd.concat([yellow_section, green_section]).drop_duplicates(subset=['id'])
)

# Display results
print(f"Found {likely_salary_earner['accountid'].nunique()} likely salary earners")
|
|
||||||
|
|
||||||
|
|
||||||
def analyze_salary_earners(final_df, min_amount=10000):
    """
    Pick out the high earners from a salary-earner table.

    A high earner is a row whose 'estimated_next_amount' is at least
    ``min_amount``.

    Args:
        final_df (pd.DataFrame): Salary earner table with columns
            ['accountid', 'estimated_next_amount', 'least_inflow_6m'].
        min_amount (float): Minimum estimated next amount to count as a high
            earner (default: 10000).

    Returns:
        tuple: ``(high_earner_details, count_high)``. The first item is a
        DataFrame with columns ['accountid', 'least_inflow_6m'] and a fresh
        0..n-1 index. The second is the number of high earners.
    """
    is_high = final_df['estimated_next_amount'] >= min_amount
    high_earner_details = (
        final_df.loc[is_high, ['accountid', 'least_inflow_6m']]
        .reset_index(drop=True)
    )
    return high_earner_details, len(high_earner_details)
|
|
||||||
|
|
||||||
high_earner_details_df, total_high_earners = analyze_salary_earners(final_table)

print(f"\nTotal High Earners: {total_high_earners}")

# Write the three result tables to CSV files in the working directory,
# without the index column.
for _frame, _csv_name in (
    (high_earner_details_df, 'high_earner_details.csv'),
    (likely_salary_earner, 'likely_salary_earner.csv'),
    (final_table, 'final_table.csv'),
):
    _frame.to_csv(_csv_name, index=False)
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
import matplotlib.pyplot as plt
|
|
||||||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
|
||||||
from sklearn.ensemble import RandomForestRegressor
|
|
||||||
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
|
|
||||||
|
|
||||||
def add_feature_engineering(df):
    """
    Add the features used for salary prediction to a transaction frame.

    Added columns:
        * 'month': calendar month (1-12) of 'trx_start_date'.
        * 'month_seq': per-account sequence number (1, 2, ...) of the
          distinct year-months in which the account has transactions. Rows
          with a missing date or account get 0.
        * 'trx_type_<value>': one-hot encoding (0.0 / 1.0) of 'trx_type'.
        * 'rolling_sum_3m' / 'rolling_avg_3m': sum / mean of 'amount' over
          the current and up to two preceding transactions of the same
          account, in date order. NOTE(review): the window is three
          transactions, not three calendar months.

    Args:
        df (pd.DataFrame): Transaction data with columns
            ['accountid', 'trx_start_date', 'trx_type', 'amount'].

    Returns:
        pd.DataFrame: A new frame (the input is left unchanged) with the
        columns above added, sorted by account and transaction date. The
        original index labels are preserved.
    """
    df = df.copy()
    dates = df['trx_start_date']

    df['month'] = dates.dt.month

    # Rank each account's distinct (year, month) pairs. A plain ngroup() over
    # (accountid, month) numbers groups across ALL accounts and ignores the
    # year, so it is not a per-account sequence.
    year_month = dates.dt.year * 12 + dates.dt.month
    df['month_seq'] = (
        year_month.groupby(df['accountid'])
        .rank(method='dense')
        .fillna(0)
        .astype(int)
    )

    # Categorical encoding: one-hot encode trx_type. get_dummies keeps the
    # frame's own index, so the concat cannot misalign rows when the index is
    # not 0..n-1. Missing trx_type values get all-zero indicator columns.
    encoded_df = pd.get_dummies(df['trx_type'], prefix='trx_type', dtype=float)
    df = pd.concat([df, encoded_df], axis=1)

    # Rolling statistics: sort by account and date
    df = df.sort_values(['accountid', 'trx_start_date'])
    amounts_by_account = df.groupby('accountid')['amount']
    df['rolling_sum_3m'] = amounts_by_account.transform(
        lambda s: s.rolling(window=3, min_periods=1).sum()
    )
    df['rolling_avg_3m'] = amounts_by_account.transform(
        lambda s: s.rolling(window=3, min_periods=1).mean()
    )

    return df
|
|
||||||
|
|
||||||
def prepare_data(df_transactions, accounts):
    """
    Build sliding-window training and testing sets for salary prediction.

    Steps: keep the transactions of ``accounts``; drop columns that are not
    used as features; add engineered features; aggregate to one row per
    (account, month_seq); keep accounts with at least 12 monthly rows; then,
    for each such account, use the 5 preceding monthly rows (flattened) as
    features and the following row's mean amount as target. Targets at
    positions 5-7 go to the training set, positions 8-11 to the testing set.

    Args:
        df_transactions (pd.DataFrame): All transaction data.
        accounts (list): Account IDs to include.

    Returns:
        tuple: ``(X_train, y_train, X_test, y_test)`` as NumPy arrays. All
        four are empty when no transaction matches ``accounts`` or when no
        account has enough monthly rows.
    """
    window = 5                    # monthly rows used as features per sample
    train_targets = range(5, 8)   # target row positions for training
    test_targets = range(8, 12)   # target row positions for testing
    min_months = 12               # rows needed so that position 11 exists

    df_filtered = df_transactions[df_transactions['accountid'].isin(accounts)].copy()
    print(f"Filtered data for {len(accounts)} accounts.")
    print(f"Total transactions: {len(df_filtered)}")

    if df_filtered.empty:
        # Nothing to engineer or aggregate; callers check len(X_train) > 0.
        return np.array([]), np.array([]), np.array([]), np.array([])

    # Drop unnecessary columns. errors='ignore' because the flag columns only
    # exist if the corresponding hypothesis step has been run on the frame.
    df_filtered = df_filtered.drop(
        columns=['description', 'id', 'customer_id', 'trx_end_date',
                 'is_salary_related', 'is_consistent_amount', 'is_salary_type'],
        errors='ignore',
    )

    # Add feature engineering
    df_filtered = add_feature_engineering(df_filtered)

    # Aggregate monthly data with new features
    encoded_cols = [col for col in df_filtered.columns if col.startswith('trx_type_')]
    agg_funcs = {
        'amount': 'mean',
        'rolling_sum_3m': 'last',
        'rolling_avg_3m': 'last',
        'month': 'first',
    }
    agg_funcs.update({col: 'sum' for col in encoded_cols})

    monthly_data = df_filtered.groupby(['accountid', 'month_seq']).agg(agg_funcs).reset_index()

    # Filter accounts with at least 12 months. This is a cheap pre-filter on
    # the largest sequence number; the row-count check in the loop below is
    # what actually guarantees 12 rows.
    account_month_counts = monthly_data.groupby('accountid')['month_seq'].max()
    valid_accounts = account_month_counts[account_month_counts >= min_months].index
    monthly_data = monthly_data[monthly_data['accountid'].isin(valid_accounts)]

    # Create training and testing sequences
    X_train, y_train, X_test, y_test = [], [], [], []
    # NOTE(review): 'accountid' is included as a feature; confirm that is
    # intended (it must be numeric for the scaler used downstream).
    feature_cols = ['accountid', 'amount', 'rolling_sum_3m', 'rolling_avg_3m',
                    'month'] + encoded_cols

    # A single groupby pass replaces re-filtering the whole frame per account.
    for account, account_data in monthly_data.groupby('accountid'):
        account_data = account_data.sort_values('month_seq')

        if len(account_data) < min_months:
            print(f"Skipping account {account} due to insufficient data (less than 12 months).")
            continue

        features = account_data[feature_cols]
        targets = account_data['amount']
        for t in train_targets:
            X_train.append(features.iloc[t - window:t].values.flatten())
            y_train.append(targets.iloc[t])
        for t in test_targets:
            X_test.append(features.iloc[t - window:t].values.flatten())
            y_test.append(targets.iloc[t])

    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)
|
|
||||||
|
|
||||||
|
|
||||||
def train_model(X_train, y_train, X_test, y_test):
    """
    Fit a Random Forest regressor on scaled features and report test metrics.

    The scaler is fitted on the training features only and then applied to
    the testing features. MAE, RMSE and R-squared on the testing set are
    printed.

    Args:
        X_train (np.ndarray): Training data features.
        y_train (np.ndarray): Training data target (salary).
        X_test (np.ndarray): Testing data features.
        y_test (np.ndarray): Testing data target (salary).

    Returns:
        tuple: ``(model, scaler)`` -- the fitted regressor and the fitted scaler.
    """
    # Standardise both feature sets with statistics from the training set.
    scaler = StandardScaler()
    train_features = scaler.fit_transform(X_train)
    test_features = scaler.transform(X_test)

    # 100 trees, fixed seed so repeated runs give the same forest.
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(train_features, y_train)

    # Score the held-out samples.
    predictions = model.predict(test_features)
    mae = mean_absolute_error(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)
    print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R-squared: {r2:.2f}")

    return model, scaler
|
|
||||||
|
|
||||||
consistent_accounts = final_table['accountid'].unique()
X_train_cons, y_train_cons, X_test_cons, y_test_cons = prepare_data(df, consistent_accounts)
if len(X_train_cons) > 0:
    model_cons, scaler_cons = train_model(X_train_cons, y_train_cons, X_test_cons, y_test_cons)
    print("Model trained for consistent salary earners.")

    # The prediction and plot live inside this branch: model_cons and
    # scaler_cons only exist when training actually happened, so running
    # them unconditionally raises NameError when there is no data.
    X_test_cons_scaled = scaler_cons.transform(X_test_cons)
    y_pred = model_cons.predict(X_test_cons_scaled)

    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_cons, y_pred, alpha=0.5)
    plt.xlabel("Actual Salary")
    plt.ylabel("Predicted Salary")
    plt.title("Actual vs. Predicted Salary")
    # Dashed red diagonal: where predicted == actual.
    plt.plot([min(y_test_cons), max(y_test_cons)], [min(y_test_cons), max(y_test_cons)], 'r--')
    plt.show()
else:
    print("No accounts with sufficient data for consistent salary earners.")
|
|
||||||
|
|
||||||
inconsistent_accounts = likely_salary_earner['accountid'].unique()
X_train_incons, y_train_incons, X_test_incons, y_test_incons = prepare_data(df, inconsistent_accounts)
if len(X_train_incons) > 0:
    print("\nTraining model for inconsistent salary earners...")
    model_incons, scaler_incons = train_model(X_train_incons, y_train_incons, X_test_incons, y_test_incons)

    # The prediction and plot live inside this branch: model_incons and
    # scaler_incons only exist when training actually happened, so running
    # them unconditionally raises NameError when there is no data.
    X_test_incons_scaled = scaler_incons.transform(X_test_incons)
    y_pred = model_incons.predict(X_test_incons_scaled)

    plt.figure(figsize=(10, 5))
    plt.scatter(y_test_incons, y_pred, alpha=0.5)
    plt.xlabel("Actual Salary")
    plt.ylabel("Predicted Salary")
    plt.title("Actual vs. Predicted Salary")
    # Dashed red diagonal: where predicted == actual.
    plt.plot([min(y_test_incons), max(y_test_incons)], [min(y_test_incons), max(y_test_incons)], 'r--')
    plt.show()
else:
    print("No accounts with sufficient data for inconsistent salary earners.")
|
|
||||||
|
|
||||||
Reference in New Issue
Block a user