Update project structure and enhance model persistence
- Added new model and scaler files to .gitignore and output directory.
- Updated Dockerfile to create output/models directory.
- Revised README to include instructions for using a .env file for configuration.
- Enhanced config.py to load database credentials from environment variables.
- Implemented model saving functionality in salary_predictor.py for consistent and inconsistent earners.
This commit is contained in:
@@ -0,0 +1,6 @@
|
|||||||
|
# Database Configuration
|
||||||
|
DB_USER=your_username
|
||||||
|
DB_PASSWORD=your_password
|
||||||
|
DB_NAME=your_database
|
||||||
|
DB_PORT=your_port
|
||||||
|
DB_HOST=your_host
|
||||||
@@ -4,3 +4,9 @@ output/csv/high_earner_details.csv
|
|||||||
output/csv/likely_salary_earner.csv
|
output/csv/likely_salary_earner.csv
|
||||||
output/plots/consistent_earners_predictions.png
|
output/plots/consistent_earners_predictions.png
|
||||||
output/plots/hypothesis_overlap.png
|
output/plots/hypothesis_overlap.png
|
||||||
|
output/plots/inconsistent_earners_predictions.png
|
||||||
|
output/models/consistent_model.joblib
|
||||||
|
output/models/inconsistent_model.joblib
|
||||||
|
output/models/consistent_scaler.joblib
|
||||||
|
output/models/inconsistent_scaler.joblib
|
||||||
|
.env
|
||||||
|
|||||||
+1
-2
@@ -9,7 +9,7 @@ RUN pip install -r requirements.txt
|
|||||||
|
|
||||||
COPY salary_analytics/ ./salary_analytics/
|
COPY salary_analytics/ ./salary_analytics/
|
||||||
|
|
||||||
RUN mkdir -p output/csv output/plots
|
RUN mkdir -p output/csv output/plots output/models
|
||||||
|
|
||||||
ENV PYTHONPATH=/app
|
ENV PYTHONPATH=/app
|
||||||
ENV HOST=0.0.0.0
|
ENV HOST=0.0.0.0
|
||||||
@@ -17,5 +17,4 @@ ENV PORT=8000
|
|||||||
|
|
||||||
EXPOSE 8000
|
EXPOSE 8000
|
||||||
|
|
||||||
# Use host 0.0.0.0 to allow external connections
|
|
||||||
CMD ["uvicorn", "salary_analytics.api:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
|
CMD ["uvicorn", "salary_analytics.api:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
|
||||||
@@ -21,6 +21,7 @@ A comprehensive salary analytics system that analyzes transaction data to identi
|
|||||||
- Separate models for consistent and inconsistent earners
|
- Separate models for consistent and inconsistent earners
|
||||||
- Feature engineering
|
- Feature engineering
|
||||||
- Model evaluation metrics
|
- Model evaluation metrics
|
||||||
|
- Model persistence (saved in output/models)
|
||||||
|
|
||||||
- **Reporting**
|
- **Reporting**
|
||||||
- CSV reports generation
|
- CSV reports generation
|
||||||
@@ -48,25 +49,20 @@ salary_analytics/
|
|||||||
|
|
||||||
## Configuration
|
## Configuration
|
||||||
|
|
||||||
The system can be configured through environment variables or the `config.py` file:
|
The system can be configured through environment variables using a `.env` file:
|
||||||
|
|
||||||
```python
|
1. Copy the example environment file:
|
||||||
# Database Configuration
|
```bash
|
||||||
DB_CONFIG = {
|
cp .env.example .env
|
||||||
"user": "db_user",
|
```
|
||||||
"password": "your_secure_password",
|
|
||||||
"name": "salary_db",
|
|
||||||
"port": "5432",
|
|
||||||
"host": "localhost"
|
|
||||||
}
|
|
||||||
|
|
||||||
# Model Configuration
|
2. Edit the `.env` file with your database credentials:
|
||||||
MODEL_CONFIG = {
|
```bash
|
||||||
"cv_threshold": 0.10,
|
DB_USER=your_username
|
||||||
"min_transactions": 3,
|
DB_PASSWORD=your_password
|
||||||
"threshold": 0.7,
|
DB_NAME=your_database
|
||||||
"high_earner_threshold": 10000
|
DB_PORT=your_port
|
||||||
}
|
DB_HOST=your_host
|
||||||
```
|
```
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
@@ -140,9 +136,15 @@ Note: All analysis endpoints require data to be loaded first. If you try to run
|
|||||||
docker-compose build
|
docker-compose build
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Run the container:
|
2. Run the container with environment variables:
|
||||||
```bash
|
```bash
|
||||||
docker-compose up
|
docker run -p 8000:8000 -v $(pwd)/output:/app/output \
|
||||||
|
-e DB_USER=your_username \
|
||||||
|
-e DB_PASSWORD=your_password \
|
||||||
|
-e DB_NAME=your_database \
|
||||||
|
-e DB_PORT=your_port \
|
||||||
|
-e DB_HOST=your_host \
|
||||||
|
salary-analytics
|
||||||
```
|
```
|
||||||
|
|
||||||
The API will be available at http://localhost:8000
|
The API will be available at http://localhost:8000
|
||||||
@@ -155,8 +157,13 @@ output/
|
|||||||
│ ├── high_earner_details.csv
|
│ ├── high_earner_details.csv
|
||||||
│ ├── likely_salary_earner.csv
|
│ ├── likely_salary_earner.csv
|
||||||
│ └── final_table.csv
|
│ └── final_table.csv
|
||||||
└── plots/
|
├── plots/
|
||||||
├── consistent_earners_predictions.png
|
│ ├── consistent_earners_predictions.png
|
||||||
├── inconsistent_earners_predictions.png
|
│ ├── inconsistent_earners_predictions.png
|
||||||
└── hypothesis_overlap.png
|
│ └── hypothesis_overlap.png
|
||||||
|
└── models/
|
||||||
|
├── consistent_model.joblib
|
||||||
|
├── inconsistent_model.joblib
|
||||||
|
├── consistent_scaler.joblib
|
||||||
|
└── inconsistent_scaler.joblib
|
||||||
```
|
```
|
||||||
@@ -1,5 +1,3 @@
|
|||||||
version: '3.8'
|
|
||||||
|
|
||||||
services:
|
services:
|
||||||
api:
|
api:
|
||||||
build: .
|
build: .
|
||||||
|
|||||||
+12
-10
@@ -1,13 +1,15 @@
|
|||||||
sqlalchemy
|
sqlalchemy>=2.0.0
|
||||||
pandas
|
pandas>=1.5.0
|
||||||
numpy
|
numpy>=1.21.0
|
||||||
matplotlib
|
matplotlib>=3.5.0
|
||||||
seaborn
|
seaborn>=0.12.0
|
||||||
matplotlib-venn
|
matplotlib-venn>=0.11.7
|
||||||
wordcloud
|
wordcloud>=1.8.0
|
||||||
scikit-learn
|
scikit-learn>=1.0.0
|
||||||
psycopg2-binary
|
psycopg2-binary>=2.9.0
|
||||||
fastapi>=0.68.0
|
fastapi>=0.68.0
|
||||||
uvicorn>=0.15.0
|
uvicorn>=0.15.0
|
||||||
pydantic>=1.8.0
|
pydantic>=1.8.0
|
||||||
python-multipart>=0.0.5
|
python-multipart>=0.0.5
|
||||||
|
python-dotenv>=0.19.0
|
||||||
|
joblib>=1.1.0
|
||||||
@@ -3,25 +3,31 @@ Configuration settings for the salary analytics package.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
# Load environment variables
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
# Base directories
|
# Base directories
|
||||||
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
OUTPUT_DIR = os.path.join(BASE_DIR, "output")
|
OUTPUT_DIR = os.path.join(BASE_DIR, "output")
|
||||||
PLOTS_DIR = os.path.join(OUTPUT_DIR, "plots")
|
PLOTS_DIR = os.path.join(OUTPUT_DIR, "plots")
|
||||||
CSV_DIR = os.path.join(OUTPUT_DIR, "csv")
|
CSV_DIR = os.path.join(OUTPUT_DIR, "csv")
|
||||||
|
MODEL_DIR = os.path.join(OUTPUT_DIR, "models")
|
||||||
|
|
||||||
# Create directories if they don't exist
|
# Create directories if they don't exist
|
||||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||||
os.makedirs(PLOTS_DIR, exist_ok=True)
|
os.makedirs(PLOTS_DIR, exist_ok=True)
|
||||||
os.makedirs(CSV_DIR, exist_ok=True)
|
os.makedirs(CSV_DIR, exist_ok=True)
|
||||||
|
os.makedirs(MODEL_DIR, exist_ok=True)
|
||||||
|
|
||||||
# Database Configuration
|
# Database Configuration
|
||||||
DB_CONFIG = {
|
DB_CONFIG = {
|
||||||
"user": "salaryloan",
|
"user": os.getenv("DB_USER", "salaryloan"), # Default value as fallback
|
||||||
"password": "salaryloan",
|
"password": os.getenv("DB_PASSWORD", "salaryloan"),
|
||||||
"name": "salaryloan",
|
"name": os.getenv("DB_NAME", "salaryloan"),
|
||||||
"port": "10532",
|
"port": os.getenv("DB_PORT", "10532"),
|
||||||
"host": "dev-data.simbrellang.net"
|
"host": os.getenv("DB_HOST", "dev-data.simbrellang.net")
|
||||||
}
|
}
|
||||||
|
|
||||||
# Table Configuration
|
# Table Configuration
|
||||||
@@ -57,5 +63,9 @@ OUTPUT_PATHS = {
|
|||||||
"final_table": os.path.join(CSV_DIR, "final_table.csv"),
|
"final_table": os.path.join(CSV_DIR, "final_table.csv"),
|
||||||
"consistent_earners_plot": os.path.join(PLOTS_DIR, "consistent_earners_predictions.png"),
|
"consistent_earners_plot": os.path.join(PLOTS_DIR, "consistent_earners_predictions.png"),
|
||||||
"inconsistent_earners_plot": os.path.join(PLOTS_DIR, "inconsistent_earners_predictions.png"),
|
"inconsistent_earners_plot": os.path.join(PLOTS_DIR, "inconsistent_earners_predictions.png"),
|
||||||
"hypothesis_overlap_plot": os.path.join(PLOTS_DIR, "hypothesis_overlap.png")
|
"hypothesis_overlap_plot": os.path.join(PLOTS_DIR, "hypothesis_overlap.png"),
|
||||||
|
"consistent_model": os.path.join(MODEL_DIR, "consistent_model.joblib"),
|
||||||
|
"inconsistent_model": os.path.join(MODEL_DIR, "inconsistent_model.joblib"),
|
||||||
|
"consistent_scaler": os.path.join(MODEL_DIR, "consistent_scaler.joblib"),
|
||||||
|
"inconsistent_scaler": os.path.join(MODEL_DIR, "inconsistent_scaler.joblib")
|
||||||
}
|
}
|
||||||
@@ -8,6 +8,7 @@ import matplotlib.pyplot as plt
|
|||||||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||||||
from sklearn.ensemble import RandomForestRegressor
|
from sklearn.ensemble import RandomForestRegressor
|
||||||
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
|
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
|
||||||
|
from joblib import dump
|
||||||
from .config import OUTPUT_PATHS
|
from .config import OUTPUT_PATHS
|
||||||
|
|
||||||
class SalaryPredictor:
|
class SalaryPredictor:
|
||||||
@@ -129,6 +130,11 @@ class SalaryPredictor:
|
|||||||
self.model_cons, self.scaler_cons = self.train_model(X_train_cons, y_train_cons, X_test_cons, y_test_cons)
|
self.model_cons, self.scaler_cons = self.train_model(X_train_cons, y_train_cons, X_test_cons, y_test_cons)
|
||||||
print("Model trained for consistent salary earners.")
|
print("Model trained for consistent salary earners.")
|
||||||
|
|
||||||
|
# Save model and scaler
|
||||||
|
dump(self.model_cons, OUTPUT_PATHS['consistent_model'])
|
||||||
|
dump(self.scaler_cons, OUTPUT_PATHS['consistent_scaler'])
|
||||||
|
print("Saved consistent salary earner model and scaler.")
|
||||||
|
|
||||||
# Plot predictions
|
# Plot predictions
|
||||||
X_test_cons_scaled = self.scaler_cons.transform(X_test_cons)
|
X_test_cons_scaled = self.scaler_cons.transform(X_test_cons)
|
||||||
y_pred = self.model_cons.predict(X_test_cons_scaled)
|
y_pred = self.model_cons.predict(X_test_cons_scaled)
|
||||||
@@ -147,6 +153,11 @@ class SalaryPredictor:
|
|||||||
print("\nTraining model for inconsistent salary earners...")
|
print("\nTraining model for inconsistent salary earners...")
|
||||||
self.model_incons, self.scaler_incons = self.train_model(X_train_incons, y_train_incons, X_test_incons, y_test_incons)
|
self.model_incons, self.scaler_incons = self.train_model(X_train_incons, y_train_incons, X_test_incons, y_test_incons)
|
||||||
|
|
||||||
|
# Save model and scaler
|
||||||
|
dump(self.model_incons, OUTPUT_PATHS['inconsistent_model'])
|
||||||
|
dump(self.scaler_incons, OUTPUT_PATHS['inconsistent_scaler'])
|
||||||
|
print("Saved inconsistent salary earner model and scaler.")
|
||||||
|
|
||||||
# Plot predictions
|
# Plot predictions
|
||||||
X_test_incons_scaled = self.scaler_incons.transform(X_test_incons)
|
X_test_incons_scaled = self.scaler_incons.transform(X_test_incons)
|
||||||
y_pred = self.model_incons.predict(X_test_incons_scaled)
|
y_pred = self.model_incons.predict(X_test_incons_scaled)
|
||||||
|
|||||||
Reference in New Issue
Block a user