Files
AnalysisTesting/salary_analytics/salary_predictor.py
T
salakojoshua1234_gmail.com 5767f55686 Update project structure and enhance model persistence
- Added new model and scaler files to .gitignore and output directory.
- Updated Dockerfile to create output/models directory.
- Revised README to include instructions for using a .env file for configuration.
- Enhanced config.py to load database credentials from environment variables.
- Implemented model saving functionality in salary_predictor.py for consistent and inconsistent earners.
2025-05-02 00:16:46 +01:00

171 lines
7.6 KiB
Python

"""
Salary prediction module using machine learning.
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from joblib import dump
from .config import OUTPUT_PATHS
class SalaryPredictor:
def __init__(self, df):
self.df = df
self.model_cons = None
self.model_incons = None
self.scaler_cons = None
self.scaler_incons = None
def add_feature_engineering(self, df):
"""Engineer features for salary prediction."""
df['month'] = df['trx_start_date'].dt.month
df['month_seq'] = df.groupby(['accountid', 'month']).ngroup() + 1
# Categorical encoding
encoder = OneHotEncoder(sparse_output=False)
encoded_trx_type = encoder.fit_transform(df[['trx_type']])
encoded_df = pd.DataFrame(encoded_trx_type, columns=encoder.get_feature_names_out(['trx_type']))
df = pd.concat([df, encoded_df], axis=1)
# Rolling statistics
df = df.sort_values(['accountid', 'trx_start_date'])
df['rolling_sum_3m'] = df.groupby('accountid')['amount'].rolling(window=3,
min_periods=1).sum().reset_index(0, drop=True)
df['rolling_avg_3m'] = df.groupby('accountid')['amount'].rolling(window=3,
min_periods=1).mean().reset_index(0, drop=True)
return df
def prepare_data(self, df_transactions, accounts):
"""Prepare data for training and testing."""
df_filtered = df_transactions[df_transactions['accountid'].isin(accounts)].copy()
print(f"Filtered data for {len(accounts)} accounts.")
print(f"Total transactions: {len(df_filtered)}")
# Drop unnecessary columns
df_filtered = df_filtered.drop(['description', 'id', 'customer_id',
'trx_end_date', 'is_salary_related',
'is_consistent_amount', 'is_salary_type'], axis=1)
# Add feature engineering
df_filtered = self.add_feature_engineering(df_filtered)
# Aggregate monthly data
agg_funcs = {
'amount': 'mean',
'rolling_sum_3m': 'last',
'rolling_avg_3m': 'last',
'month': 'first'
}
encoded_cols = [col for col in df_filtered.columns if col.startswith('trx_type_')]
for col in encoded_cols:
agg_funcs[col] = 'sum'
monthly_data = df_filtered.groupby(['accountid', 'month_seq']).agg(agg_funcs).reset_index()
# Filter accounts with at least 12 months
account_month_counts = monthly_data.groupby('accountid')['month_seq'].max()
valid_accounts = account_month_counts[account_month_counts >= 12].index
monthly_data = monthly_data[monthly_data['accountid'].isin(valid_accounts)]
# Create sequences
X_train, y_train, X_test, y_test = [], [], [], []
feature_cols = ['accountid', 'amount', 'rolling_sum_3m', 'rolling_avg_3m',
'month'] + encoded_cols
for account in valid_accounts:
account_data = monthly_data[monthly_data['accountid'] == account].sort_values('month_seq')
if len(account_data) >= 12:
for t in range(5, 8):
X_train.append(account_data.iloc[t-5:t][feature_cols].values.flatten())
y_train.append(account_data['amount'].iloc[t])
for t in range(8, 12):
X_test.append(account_data.iloc[t-5:t][feature_cols].values.flatten())
y_test.append(account_data['amount'].iloc[t])
else:
print(f"Skipping account {account} due to insufficient data (less than 12 months).")
return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)
def train_model(self, X_train, y_train, X_test, y_test):
"""Train and evaluate a Random Forest model."""
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)
# Evaluate
y_pred = model.predict(X_test_scaled)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R-squared: {r2:.2f}")
return model, scaler
def plot_predictions(self, y_test, y_pred, title, output_path):
"""Plot actual vs predicted values and save to file."""
plt.figure(figsize=(10, 5))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.xlabel("Actual Salary")
plt.ylabel("Predicted Salary")
plt.title(title)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--')
plt.savefig(output_path)
plt.close()
def train_and_evaluate(self, consistent_accounts, inconsistent_accounts):
"""Train and evaluate models for both consistent and inconsistent salary earners."""
# Train model for consistent salary earners
X_train_cons, y_train_cons, X_test_cons, y_test_cons = self.prepare_data(self.df, consistent_accounts)
if len(X_train_cons) > 0:
self.model_cons, self.scaler_cons = self.train_model(X_train_cons, y_train_cons, X_test_cons, y_test_cons)
print("Model trained for consistent salary earners.")
# Save model and scaler
dump(self.model_cons, OUTPUT_PATHS['consistent_model'])
dump(self.scaler_cons, OUTPUT_PATHS['consistent_scaler'])
print("Saved consistent salary earner model and scaler.")
# Plot predictions
X_test_cons_scaled = self.scaler_cons.transform(X_test_cons)
y_pred = self.model_cons.predict(X_test_cons_scaled)
self.plot_predictions(
y_test_cons,
y_pred,
"Actual vs. Predicted Salary (Consistent Earners)",
OUTPUT_PATHS['consistent_earners_plot']
)
else:
print("No accounts with sufficient data for consistent salary earners.")
# Train model for inconsistent salary earners
X_train_incons, y_train_incons, X_test_incons, y_test_incons = self.prepare_data(self.df, inconsistent_accounts)
if len(X_train_incons) > 0:
print("\nTraining model for inconsistent salary earners...")
self.model_incons, self.scaler_incons = self.train_model(X_train_incons, y_train_incons, X_test_incons, y_test_incons)
# Save model and scaler
dump(self.model_incons, OUTPUT_PATHS['inconsistent_model'])
dump(self.scaler_incons, OUTPUT_PATHS['inconsistent_scaler'])
print("Saved inconsistent salary earner model and scaler.")
# Plot predictions
X_test_incons_scaled = self.scaler_incons.transform(X_test_incons)
y_pred = self.model_incons.predict(X_test_incons_scaled)
self.plot_predictions(
y_test_incons,
y_pred,
"Actual vs. Predicted Salary (Inconsistent Earners)",
OUTPUT_PATHS['inconsistent_earners_plot']
)
else:
print("No accounts with sufficient data for inconsistent salary earners.")