Update project structure and enhance model persistence

- Added new model and scaler files to .gitignore and output directory.
- Updated Dockerfile to create output/models directory.
- Revised README to include instructions for using a .env file for configuration.
- Enhanced config.py to load database credentials from environment variables.
- Implemented model saving functionality in salary_predictor.py for consistent and inconsistent earners.
2025-05-02 00:16:46 +01:00
parent 8acfb436f3
commit 5767f55686
8 changed files with 82 additions and 43 deletions
@@ -0,0 +1,6 @@
 # Database Configuration
 DB_USER=your_username
 DB_PASSWORD=your_password
 DB_NAME=your_database
 DB_PORT=your_port
 DB_HOST=your_host 
@@ -4,3 +4,9 @@ output/csv/high_earner_details.csv
 output/csv/likely_salary_earner.csv
 output/plots/consistent_earners_predictions.png
 output/plots/hypothesis_overlap.png
 output/plots/inconsistent_earners_predictions.png
 output/models/consistent_model.joblib
 output/models/inconsistent_model.joblib
 output/models/consistent_scaler.joblib
 output/models/inconsistent_scaler.joblib
 .env
@@ -9,7 +9,7 @@ RUN pip install -r requirements.txt
 COPY salary_analytics/ ./salary_analytics/
-RUN mkdir -p output/csv output/plots
+RUN mkdir -p output/csv output/plots output/models
 ENV PYTHONPATH=/app
 ENV HOST=0.0.0.0
@@ -17,5 +17,4 @@ ENV PORT=8000
 EXPOSE 8000
 # Use host 0.0.0.0 to allow external connections
 CMD ["uvicorn", "salary_analytics.api:app", "--host", "0.0.0.0", "--port", "8000", "--reload"] 
@@ -21,6 +21,7 @@ A comprehensive salary analytics system that analyzes transaction data to identi
  - Separate models for consistent and inconsistent earners
  - Feature engineering
  - Model evaluation metrics
  - Model persistence (saved in output/models)
 - **Reporting**
  - CSV reports generation
@@ -48,25 +49,20 @@ salary_analytics/
 ## Configuration
-The system can be configured through environment variables or the `config.py` file:
+The system can be configured through environment variables using a `.env` file:
-```python
+1. Copy the example environment file:
-# Database Configuration
+```bash
-DB_CONFIG = {
+cp .env.example .env
-    "user": "db_user",
+```
    "password": "your_secure_password",
    "name": "salary_db",
    "port": "5432",
    "host": "localhost"
 }
-# Model Configuration
+2. Edit the `.env` file with your database credentials:
-MODEL_CONFIG = {
+```bash
-    "cv_threshold": 0.10,
+DB_USER=your_username
-    "min_transactions": 3,
+DB_PASSWORD=your_password
-    "threshold": 0.7,
+DB_NAME=your_database
-    "high_earner_threshold": 10000
+DB_PORT=your_port
-}
+DB_HOST=your_host
 ```
 ## Usage
@@ -140,9 +136,15 @@ Note: All analysis endpoints require data to be loaded first. If you try to run
 docker-compose build
 ```
-2. Run the container:
+2. Run the container with environment variables:
 ```bash
-docker-compose up
+docker run -p 8000:8000 -v "$(pwd)/output:/app/output" \
           -e DB_USER=your_username \
           -e DB_PASSWORD=your_password \
           -e DB_NAME=your_database \
           -e DB_PORT=your_port \
           -e DB_HOST=your_host \
           salary-analytics
 ```
 The API will be available at http://localhost:8000
@@ -155,8 +157,13 @@ output/
 │   ├── high_earner_details.csv
 │   ├── likely_salary_earner.csv
 │   └── final_table.csv
-└── plots/
+├── plots/
-    ├── consistent_earners_predictions.png
+│   ├── consistent_earners_predictions.png
-    ├── inconsistent_earners_predictions.png
+│   ├── inconsistent_earners_predictions.png
-    └── hypothesis_overlap.png
+│   └── hypothesis_overlap.png
 └── models/
    ├── consistent_model.joblib
    ├── inconsistent_model.joblib
    ├── consistent_scaler.joblib
    └── inconsistent_scaler.joblib
 ```
@@ -1,5 +1,3 @@
 version: '3.8'
 services:
  api:
    build: .
@@ -1,13 +1,15 @@
-sqlalchemy
+sqlalchemy>=2.0.0
-pandas
+pandas>=1.5.0
-numpy
+numpy>=1.21.0
-matplotlib
+matplotlib>=3.5.0
-seaborn
+seaborn>=0.12.0
-matplotlib-venn
+matplotlib-venn>=0.11.7
-wordcloud
+wordcloud>=1.8.0
-scikit-learn
+scikit-learn>=1.0.0
-psycopg2-binary
+psycopg2-binary>=2.9.0
 fastapi>=0.68.0
 uvicorn>=0.15.0
 pydantic>=1.8.0
-python-multipart>=0.0.5 
+python-multipart>=0.0.5 
 python-dotenv>=0.19.0
 joblib>=1.1.0
@@ -3,25 +3,31 @@ Configuration settings for the salary analytics package.
 """
 import os
 from dotenv import load_dotenv
 # Load environment variables
 load_dotenv()
 # Base directories
 BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 OUTPUT_DIR = os.path.join(BASE_DIR, "output")
 PLOTS_DIR = os.path.join(OUTPUT_DIR, "plots")
 CSV_DIR = os.path.join(OUTPUT_DIR, "csv")
 MODEL_DIR = os.path.join(OUTPUT_DIR, "models")
 # Create directories if they don't exist
 os.makedirs(OUTPUT_DIR, exist_ok=True)
 os.makedirs(PLOTS_DIR, exist_ok=True)
 os.makedirs(CSV_DIR, exist_ok=True)
 os.makedirs(MODEL_DIR, exist_ok=True)
 # Database Configuration
 DB_CONFIG = {
-    "user": "salaryloan",
+    "user": os.getenv("DB_USER", "salaryloan"),  # Default value as fallback
-    "password": "salaryloan",
+    "password": os.getenv("DB_PASSWORD", "salaryloan"),
-    "name": "salaryloan",
+    "name": os.getenv("DB_NAME", "salaryloan"),
-    "port": "10532",
+    "port": os.getenv("DB_PORT", "10532"),
-    "host": "dev-data.simbrellang.net"
+    "host": os.getenv("DB_HOST", "dev-data.simbrellang.net")
 }
 # Table Configuration
@@ -57,5 +63,9 @@ OUTPUT_PATHS = {
    "final_table": os.path.join(CSV_DIR, "final_table.csv"),
    "consistent_earners_plot": os.path.join(PLOTS_DIR, "consistent_earners_predictions.png"),
    "inconsistent_earners_plot": os.path.join(PLOTS_DIR, "inconsistent_earners_predictions.png"),
-    "hypothesis_overlap_plot": os.path.join(PLOTS_DIR, "hypothesis_overlap.png")
+    "hypothesis_overlap_plot": os.path.join(PLOTS_DIR, "hypothesis_overlap.png"),
    "consistent_model": os.path.join(MODEL_DIR, "consistent_model.joblib"),
    "inconsistent_model": os.path.join(MODEL_DIR, "inconsistent_model.joblib"),
    "consistent_scaler": os.path.join(MODEL_DIR, "consistent_scaler.joblib"),
    "inconsistent_scaler": os.path.join(MODEL_DIR, "inconsistent_scaler.joblib")
 } 
@@ -8,6 +8,7 @@ import matplotlib.pyplot as plt
 from sklearn.preprocessing import StandardScaler, OneHotEncoder
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
 from joblib import dump
 from .config import OUTPUT_PATHS
 class SalaryPredictor:
@@ -129,6 +130,11 @@ class SalaryPredictor:
            self.model_cons, self.scaler_cons = self.train_model(X_train_cons, y_train_cons, X_test_cons, y_test_cons)
            print("Model trained for consistent salary earners.")
            # Save model and scaler
            dump(self.model_cons, OUTPUT_PATHS['consistent_model'])
            dump(self.scaler_cons, OUTPUT_PATHS['consistent_scaler'])
            print("Saved consistent salary earner model and scaler.")
            # Plot predictions
            X_test_cons_scaled = self.scaler_cons.transform(X_test_cons)
            y_pred = self.model_cons.predict(X_test_cons_scaled)
@@ -147,6 +153,11 @@ class SalaryPredictor:
            print("\nTraining model for inconsistent salary earners...")
            self.model_incons, self.scaler_incons = self.train_model(X_train_incons, y_train_incons, X_test_incons, y_test_incons)
            # Save model and scaler
            dump(self.model_incons, OUTPUT_PATHS['inconsistent_model'])
            dump(self.scaler_incons, OUTPUT_PATHS['inconsistent_scaler'])
            print("Saved inconsistent salary earner model and scaler.")
            # Plot predictions
            X_test_incons_scaled = self.scaler_incons.transform(X_test_incons)
            y_pred = self.model_incons.predict(X_test_incons_scaled)