From 5767f5568625b6fd079cb1e94cd0958155e2bf60 Mon Sep 17 00:00:00 2001 From: Joshua Salako Date: Fri, 2 May 2025 00:16:46 +0100 Subject: [PATCH] Update project structure and enhance model persistence - Added new model and scaler files to .gitignore and output directory. - Updated Dockerfile to create output/models directory. - Revised README to include instructions for using a .env file for configuration. - Enhanced config.py to load database credentials from environment variables. - Implemented model saving functionality in salary_predictor.py for consistent and inconsistent earners. --- .env.example | 6 ++++ .gitignore | 6 ++++ Dockerfile | 3 +- README.md | 53 ++++++++++++++++------------ docker-compose.yml | 2 -- requirements.txt | 22 ++++++------ salary_analytics/config.py | 22 ++++++++---- salary_analytics/salary_predictor.py | 11 ++++++ 8 files changed, 82 insertions(+), 43 deletions(-) create mode 100644 .env.example diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..1b92bce --- /dev/null +++ b/.env.example @@ -0,0 +1,6 @@ +# Database Configuration +DB_USER=your_username +DB_PASSWORD=your_password +DB_NAME=your_database +DB_PORT=your_port +DB_HOST=your_host \ No newline at end of file diff --git a/.gitignore b/.gitignore index 18dd2cb..80167a9 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,9 @@ output/csv/high_earner_details.csv output/csv/likely_salary_earner.csv output/plots/consistent_earners_predictions.png output/plots/hypothesis_overlap.png +output/plots/inconsistent_earners_predictions.png +output/models/consistent_model.joblib +output/models/inconsistent_model.joblib +output/models/consistent_scaler.joblib +output/models/inconsistent_scaler.joblib +.env diff --git a/Dockerfile b/Dockerfile index 1693c51..2109548 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ RUN pip install -r requirements.txt COPY salary_analytics/ ./salary_analytics/ -RUN mkdir -p output/csv output/plots +RUN mkdir -p output/csv output/plots output/models ENV PYTHONPATH=/app ENV HOST=0.0.0.0 @@ -17,5 +17,4 @@ ENV PORT=8000 EXPOSE 8000 -# Use host 0.0.0.0 to allow external connections CMD ["uvicorn", "salary_analytics.api:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] \ No newline at end of file diff --git a/README.md b/README.md index 65fbd4c..cbc96a5 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ A comprehensive salary analytics system that analyzes transaction data to identi - Separate models for consistent and inconsistent earners - Feature engineering - Model evaluation metrics + - Model persistence (saved in output/models) - **Reporting** - CSV reports generation @@ -48,25 +49,20 @@ salary_analytics/ ## Configuration -The system can be configured through environment variables or the `config.py` file: +The system can be configured through environment variables using a `.env` file: -```python -# Database Configuration -DB_CONFIG = { - "user": "db_user", - "password": "your_secure_password", - "name": "salary_db", - "port": "5432", - "host": "localhost" -} +1. Copy the example environment file: +```bash +cp .env.example .env +``` -# Model Configuration -MODEL_CONFIG = { - "cv_threshold": 0.10, - "min_transactions": 3, - "threshold": 0.7, - "high_earner_threshold": 10000 -} +2. Edit the `.env` file with your database credentials: +```bash +DB_USER=your_username +DB_PASSWORD=your_password +DB_NAME=your_database +DB_PORT=your_port +DB_HOST=your_host ``` ## Usage @@ -140,9 +136,15 @@ Note: All analysis endpoints require data to be loaded first. If you try to run docker-compose build ``` -2. Run the container: +2. Run the container with environment variables: ```bash -docker-compose up +docker run -v $(pwd)/output:/app/output \ + -e DB_USER=your_username \ + -e DB_PASSWORD=your_password \ + -e DB_NAME=your_database \ + -e DB_PORT=your_port \ + -e DB_HOST=your_host \ + salary-analytics ``` The API will be available at http://localhost:8000 @@ -155,8 +157,13 @@ output/ │ ├── high_earner_details.csv │ ├── likely_salary_earner.csv │ └── final_table.csv -└── plots/ - ├── consistent_earners_predictions.png - ├── inconsistent_earners_predictions.png - └── hypothesis_overlap.png +├── plots/ +│ ├── consistent_earners_predictions.png +│ ├── inconsistent_earners_predictions.png +│ └── hypothesis_overlap.png +└── models/ + ├── consistent_model.joblib + ├── inconsistent_model.joblib + ├── consistent_scaler.joblib + └── inconsistent_scaler.joblib ``` \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 02a9636..e7a7e2d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,3 @@ -version: '3.8' - services: api: build: . diff --git a/requirements.txt b/requirements.txt index 2d6c853..c7e2327 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,15 @@ -sqlalchemy -pandas -numpy -matplotlib -seaborn -matplotlib-venn -wordcloud -scikit-learn -psycopg2-binary +sqlalchemy>=2.0.0 +pandas>=1.5.0 +numpy>=1.21.0 +matplotlib>=3.5.0 +seaborn>=0.12.0 +matplotlib-venn>=0.11.7 +wordcloud>=1.8.0 +scikit-learn>=1.0.0 +psycopg2-binary>=2.9.0 fastapi>=0.68.0 uvicorn>=0.15.0 pydantic>=1.8.0 -python-multipart>=0.0.5 \ No newline at end of file +python-multipart>=0.0.5 +python-dotenv>=0.19.0 +joblib>=1.1.0 \ No newline at end of file diff --git a/salary_analytics/config.py b/salary_analytics/config.py index e2ef66e..6d1b0fc 100644 --- a/salary_analytics/config.py +++ b/salary_analytics/config.py @@ -3,25 +3,31 @@ Configuration settings for the salary analytics package. """ import os +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() # Base directories BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) OUTPUT_DIR = os.path.join(BASE_DIR, "output") PLOTS_DIR = os.path.join(OUTPUT_DIR, "plots") CSV_DIR = os.path.join(OUTPUT_DIR, "csv") +MODEL_DIR = os.path.join(OUTPUT_DIR, "models") # Create directories if they don't exist os.makedirs(OUTPUT_DIR, exist_ok=True) os.makedirs(PLOTS_DIR, exist_ok=True) os.makedirs(CSV_DIR, exist_ok=True) +os.makedirs(MODEL_DIR, exist_ok=True) # Database Configuration DB_CONFIG = { - "user": "salaryloan", - "password": "salaryloan", - "name": "salaryloan", - "port": "10532", - "host": "dev-data.simbrellang.net" + "user": os.getenv("DB_USER", "salaryloan"), # Default value as fallback + "password": os.getenv("DB_PASSWORD", "salaryloan"), + "name": os.getenv("DB_NAME", "salaryloan"), + "port": os.getenv("DB_PORT", "10532"), + "host": os.getenv("DB_HOST", "dev-data.simbrellang.net") } # Table Configuration @@ -57,5 +63,9 @@ OUTPUT_PATHS = { "final_table": os.path.join(CSV_DIR, "final_table.csv"), "consistent_earners_plot": os.path.join(PLOTS_DIR, "consistent_earners_predictions.png"), "inconsistent_earners_plot": os.path.join(PLOTS_DIR, "inconsistent_earners_predictions.png"), - "hypothesis_overlap_plot": os.path.join(PLOTS_DIR, "hypothesis_overlap.png") + "hypothesis_overlap_plot": os.path.join(PLOTS_DIR, "hypothesis_overlap.png"), + "consistent_model": os.path.join(MODEL_DIR, "consistent_model.joblib"), + "inconsistent_model": os.path.join(MODEL_DIR, "inconsistent_model.joblib"), + "consistent_scaler": os.path.join(MODEL_DIR, "consistent_scaler.joblib"), + "inconsistent_scaler": os.path.join(MODEL_DIR, "inconsistent_scaler.joblib") } \ No newline at end of file diff --git a/salary_analytics/salary_predictor.py b/salary_analytics/salary_predictor.py index 17fc7ae..b74dfd5 100644 --- a/salary_analytics/salary_predictor.py +++ b/salary_analytics/salary_predictor.py @@ -8,6 +8,7 @@ import matplotlib.pyplot as plt from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score +from joblib import dump from .config import OUTPUT_PATHS class SalaryPredictor: @@ -129,6 +130,11 @@ class SalaryPredictor: self.model_cons, self.scaler_cons = self.train_model(X_train_cons, y_train_cons, X_test_cons, y_test_cons) print("Model trained for consistent salary earners.") + # Save model and scaler + dump(self.model_cons, OUTPUT_PATHS['consistent_model']) + dump(self.scaler_cons, OUTPUT_PATHS['consistent_scaler']) + print("Saved consistent salary earner model and scaler.") + # Plot predictions X_test_cons_scaled = self.scaler_cons.transform(X_test_cons) y_pred = self.model_cons.predict(X_test_cons_scaled) @@ -147,6 +153,11 @@ class SalaryPredictor: print("\nTraining model for inconsistent salary earners...") self.model_incons, self.scaler_incons = self.train_model(X_train_incons, y_train_incons, X_test_incons, y_test_incons) + # Save model and scaler + dump(self.model_incons, OUTPUT_PATHS['inconsistent_model']) + dump(self.scaler_incons, OUTPUT_PATHS['inconsistent_scaler']) + print("Saved inconsistent salary earner model and scaler.") + # Plot predictions X_test_incons_scaled = self.scaler_incons.transform(X_test_incons) y_pred = self.model_incons.predict(X_test_incons_scaled)