first commit
This commit is contained in:
@@ -0,0 +1,45 @@
|
||||
# Git
|
||||
.git
|
||||
.gitignore
|
||||
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
*.so
|
||||
.Python
|
||||
env/
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
|
||||
# Virtual Environment
|
||||
venv/
|
||||
ENV/
|
||||
|
||||
# IDE
|
||||
.idea/
|
||||
.vscode/
|
||||
*.swp
|
||||
*.swo
|
||||
|
||||
# Output
|
||||
output/
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
|
||||
# Local development
|
||||
.env
|
||||
.env.local
|
||||
@@ -0,0 +1,6 @@
|
||||
# Database Configuration
|
||||
DB_USER=your_username
|
||||
DB_PASSWORD=your_password
|
||||
DB_NAME=your_database
|
||||
DB_PORT=your_port
|
||||
DB_HOST=your_host
|
||||
+17
@@ -0,0 +1,17 @@
|
||||
transaction.csv
|
||||
output/csv/final_table.csv
|
||||
output/csv/high_earner_details.csv
|
||||
output/csv/likely_salary_earner.csv
|
||||
output/plots/consistent_earners_predictions.png
|
||||
output/plots/hypothesis_overlap.png
|
||||
output/plots/inconsistent_earners_predictions.png
|
||||
output/models/consistent_model.joblib
|
||||
output/models/inconsistent_model.joblib
|
||||
output/models/consistent_scaler.joblib
|
||||
output/models/inconsistent_scaler.joblib
|
||||
.env
|
||||
__pycache__
|
||||
__pycache__/*
|
||||
output/*
|
||||
.idea/*
|
||||
salary_analytics/__pycache__/*
|
||||
+20
@@ -0,0 +1,20 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
RUN apt-get update && apt-get install -y libpq-dev && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
COPY requirements.txt .
|
||||
RUN pip install -r requirements.txt
|
||||
|
||||
COPY salary_analytics/ ./salary_analytics/
|
||||
|
||||
RUN mkdir -p output/csv output/plots output/models
|
||||
|
||||
ENV PYTHONPATH=/app
|
||||
ENV HOST=0.0.0.0
|
||||
ENV PORT=8000
|
||||
|
||||
EXPOSE 8000
|
||||
|
||||
CMD ["uvicorn", "salary_analytics.api:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
|
||||
@@ -0,0 +1,201 @@
|
||||
# Salary Analytics
|
||||
|
||||
A comprehensive salary analytics system that analyzes transaction data to identify salary earners, predict future salaries, and generate detailed reports.
|
||||
|
||||
## Features
|
||||
|
||||
- **Transaction Analysis**
|
||||
- Keyword-based salary transaction identification
|
||||
- Consistent amount transaction analysis
|
||||
- Transaction type analysis
|
||||
- Hypothesis overlap visualization
|
||||
|
||||
- **Salary Earner Classification**
|
||||
- Verified salary earners identification
|
||||
- Likely salary earners identification
|
||||
- High earner detection
|
||||
- Salary pattern analysis
|
||||
|
||||
- **Machine Learning**
|
||||
- Salary prediction models
|
||||
- Separate models for consistent and inconsistent earners
|
||||
- Feature engineering
|
||||
- Model evaluation metrics
|
||||
- Model persistence (saved in output/models)
|
||||
|
||||
- **Reporting**
|
||||
- CSV reports generation
|
||||
- Visualization plots
|
||||
- High earner details
|
||||
- Salary earner statistics
|
||||
|
||||
## Architecture
|
||||
|
||||
The project is organized into the following modules:
|
||||
|
||||
```
|
||||
salary_analytics/
|
||||
├── __init__.py
|
||||
├── config.py # Configuration settings
|
||||
├── data_loader.py # Database connection and data loading
|
||||
├── keyword_analyzer.py # Keyword-based analysis
|
||||
├── consistent_amount_analyzer.py # Consistent amount analysis
|
||||
├── transaction_type_analyzer.py # Transaction type analysis
|
||||
├── salary_earner_analyzer.py # Salary earner analysis
|
||||
├── salary_predictor.py # Machine learning models
|
||||
├── main.py # Main pipeline
|
||||
└── api.py # FastAPI endpoints
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
The system can be configured through environment variables using a `.env` file:
|
||||
|
||||
1. Copy the example environment file:
|
||||
```bash
|
||||
cp .env.example .env
|
||||
```
|
||||
|
||||
2. Edit the `.env` file with your database credentials:
|
||||
```bash
|
||||
DB_USER=your_username
|
||||
DB_PASSWORD=your_password
|
||||
DB_NAME=your_database
|
||||
DB_PORT=your_port
|
||||
DB_HOST=your_host
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
### Using the API
|
||||
|
||||
1. Start the API server:
|
||||
```bash
|
||||
uvicorn salary_analytics.api:app --reload
|
||||
```
|
||||
|
||||
2. Access the API documentation:
|
||||
- Swagger UI: http://localhost:8000/docs
|
||||
- ReDoc: http://localhost:8000/redoc
|
||||
|
||||
### API Endpoints
|
||||
|
||||
1. **Basic Endpoints**
|
||||
- `GET /`: Welcome message
|
||||
- `GET /health`: Health check
|
||||
|
||||
2. **Data Loading**
|
||||
- `POST /load-data`: Load transaction data
|
||||
- Parameters:
|
||||
- `source`: Data source ('db' or 'csv')
|
||||
- `file`: CSV file (required if source is 'csv')
|
||||
- Example:
|
||||
```bash
|
||||
# Load from database
|
||||
curl -X POST "http://localhost:8000/load-data?source=db"
|
||||
|
||||
# Load from CSV
|
||||
curl -X POST "http://localhost:8000/load-data?source=csv" -F "file=@path/to/your/file.csv"
|
||||
```
|
||||
|
||||
3. **Analysis Endpoints**
|
||||
- `POST /analyze/keyword`: Run keyword analysis
|
||||
- `POST /analyze/consistent-amount`: Run consistent amount analysis
|
||||
- `POST /analyze/transaction-type`: Run transaction type analysis
|
||||
|
||||
4. **Report Generation**
|
||||
- `POST /generate/reports`: Generate all reports
|
||||
- `GET /download/{report_type}`: Download specific reports
|
||||
- Available types:
|
||||
- `high_earners`: High earner details
|
||||
- `likely_earners`: Likely salary earners
|
||||
- `final_table`: Final analysis table
|
||||
- `consistent_plot`: Consistent earners plot
|
||||
- `inconsistent_plot`: Inconsistent earners plot
|
||||
- `hypothesis_plot`: Hypothesis overlap plot
|
||||
|
||||
5. **Model Training**
|
||||
- `POST /train/models`: Train prediction models
|
||||
|
||||
6. **Pipeline**
|
||||
- `POST /run/pipeline`: Run complete pipeline
|
||||
- `POST /run/streaming-pipeline`: Run pipeline in batches
|
||||
- Parameters:
|
||||
- `source`: Data source ('db' or 'csv')
|
||||
- `file`: CSV file (required if source is 'csv')
|
||||
- `batch_size`: Number of rows to process in each batch (default: 10000)
|
||||
- Example:
|
||||
```bash
|
||||
# Run streaming pipeline from database
|
||||
curl -X POST "http://localhost:8000/run/streaming-pipeline?source=db&batch_size=5000"
|
||||
|
||||
# Run streaming pipeline from CSV
|
||||
curl -X POST "http://localhost:8000/run/streaming-pipeline?source=csv&batch_size=5000" -F "file=@path/to/your/file.csv"
|
||||
```
|
||||
- Response:
|
||||
```json
|
||||
[
|
||||
{
|
||||
"batch_number": 1,
|
||||
"total_batches": 10,
|
||||
"processed_rows": 5000,
|
||||
"results_path": "/app/output/csv/batch_results_20240315_123456/batch_1_results.csv",
|
||||
"message": "Successfully processed batch 1 of 10"
|
||||
},
|
||||
// ... more batch responses ...
|
||||
]
|
||||
```
|
||||
|
||||
### Workflow
|
||||
|
||||
1. Start the API server
|
||||
2. Load data using the `/load-data` endpoint
|
||||
3. Run any of the analysis endpoints
|
||||
4. Generate and download reports as needed
|
||||
|
||||
For large datasets, use the streaming pipeline endpoint:
|
||||
1. Start the API server
|
||||
2. Run the streaming pipeline with appropriate batch size
|
||||
3. Monitor batch processing progress
|
||||
4. Access results in the batch results directory
|
||||
|
||||
Note: All analysis endpoints require data to be loaded first. If you try to run any analysis without loading data, you'll receive a 400 error with a message to load data first.
|
||||
|
||||
## Docker Deployment
|
||||
|
||||
1. Build the Docker image:
|
||||
```bash
|
||||
docker-compose build
|
||||
```
|
||||
|
||||
2. Run the container with environment variables:
|
||||
```bash
|
||||
docker run -v $(pwd)/output:/app/output \
|
||||
-e DB_USER=your_username \
|
||||
-e DB_PASSWORD=your_password \
|
||||
-e DB_NAME=your_database \
|
||||
-e DB_PORT=your_port \
|
||||
-e DB_HOST=your_host \
|
||||
salary-analytics
|
||||
```
|
||||
|
||||
The API will be available at http://localhost:8000
|
||||
|
||||
## Output Structure
|
||||
|
||||
```
|
||||
output/
|
||||
├── csv/
|
||||
│ ├── high_earner_details.csv
|
||||
│ ├── likely_salary_earner.csv
|
||||
│ └── final_table.csv
|
||||
├── plots/
|
||||
│ ├── consistent_earners_predictions.png
|
||||
│ ├── inconsistent_earners_predictions.png
|
||||
│ └── hypothesis_overlap.png
|
||||
└── models/
|
||||
├── consistent_model.joblib
|
||||
├── inconsistent_model.joblib
|
||||
├── consistent_scaler.joblib
|
||||
└── inconsistent_scaler.joblib
|
||||
```
|
||||
Binary file not shown.
@@ -0,0 +1,20 @@
|
||||
services:
|
||||
digifi-analytics:
|
||||
build: .
|
||||
ports:
|
||||
- "8000:8000"
|
||||
volumes:
|
||||
- ./output:/app/output
|
||||
environment:
|
||||
- DB_USER=salaryloan
|
||||
- DB_PASSWORD=salaryloan
|
||||
- DB_NAME=salaryloan
|
||||
- DB_PORT=10532
|
||||
- DB_HOST=dev-data.simbrellang.net
|
||||
restart: unless-stopped
|
||||
networks:
|
||||
- salary_network
|
||||
|
||||
networks:
|
||||
salary_network:
|
||||
driver: bridge
|
||||
+384
@@ -0,0 +1,384 @@
|
||||
accountid,num_months,least_inflow_6m,avg_monthly_salary,estimated_next_amount,estimated_next_date,45daysalary,2monthssalary
|
||||
0171115023,14,4251.1458,4402.972435714286,4402.972435714286,2025-04-12,True,False
|
||||
0194052022,14,26589.45,27382.33928571429,27382.33928571429,2025-04-14,True,False
|
||||
0532452020,14,80000.0,83714.28571428571,83714.28571428571,2025-04-15,True,False
|
||||
0623122023,14,4100.3778,4297.567114285715,4297.567114285715,2025-04-16,True,False
|
||||
0676572026,14,42323.61,44590.94625,44590.94625,2025-04-15,True,False
|
||||
0761118025,14,10209.51,10727.278007142857,10727.278007142857,2025-04-09,True,False
|
||||
0820472015,14,202325.60880000002,210543.31560000003,210543.31560000003,2025-04-18,True,True
|
||||
0835181025,14,42886.261600000005,43769.9071,43769.9071,2025-04-10,True,False
|
||||
0835200018,14,40857.6708,42116.59062857144,42116.59062857144,2025-04-15,True,False
|
||||
0846827013,14,379.44,388.2085714285715,388.2085714285715,2025-04-11,True,False
|
||||
0852762012,14,20648.37,21385.811785714286,21385.811785714286,2025-04-12,True,False
|
||||
0853195013,14,9297.402399999999,9476.1986,9476.1986,2025-04-16,True,False
|
||||
0853687015,14,15977.2405,16463.111321428576,16463.111321428576,2025-04-17,True,False
|
||||
0856304027,14,77548.12,80927.00237142856,80927.00237142856,2025-04-16,True,False
|
||||
0875566019,14,5871.2424,6048.037514285715,6048.037514285715,2025-04-15,True,False
|
||||
0889240019,14,1150220.1988,1189265.8632571427,1189265.8632571427,2025-04-22,True,False
|
||||
0890507013,14,22330.07,23542.273800000003,23542.273800000003,2025-04-14,True,False
|
||||
0897678011,14,55141.2663,56632.604292857155,56632.604292857155,2025-04-19,True,False
|
||||
0903403013,14,67017.0855,69244.66896428574,69244.66896428574,2025-04-13,True,False
|
||||
0903407011,14,46434.82,49585.75421428571,49585.75421428571,2025-04-13,True,False
|
||||
0908634012,14,39312.76,41474.961800000005,41474.961800000005,2025-04-14,True,False
|
||||
0909149016,14,40457.1054,41572.96615714285,41572.96615714285,2025-04-17,True,False
|
||||
0910196014,14,40457.1054,41830.47248571429,41830.47248571429,2025-04-15,True,False
|
||||
0915538013,14,31310.0,32129.285714285714,32129.285714285714,2025-04-15,True,False
|
||||
0926111012,14,12120.0,12642.857142857143,12642.857142857143,2025-04-20,True,True
|
||||
0943467011,14,9421.1076,9678.406757142857,9678.406757142857,2025-04-09,True,False
|
||||
0950731013,14,24131.3997,23712.56365714286,23712.56365714286,2025-04-12,True,False
|
||||
0954497016,14,48274.38220000001,48046.672849999995,48046.672849999995,2025-04-12,True,False
|
||||
0961463013,14,45890.81,47628.104950000015,47628.104950000015,2025-04-20,True,False
|
||||
0966021014,14,41000.0,42786.42857142857,42786.42857142857,2025-04-17,True,False
|
||||
0977809014,14,12341.82,12800.230457142856,12800.230457142856,2025-04-11,True,False
|
||||
0983934012,14,22941.6304,23445.84205714286,23445.84205714286,2025-04-17,True,False
|
||||
0984548012,14,12621.74,13225.780414285715,13225.780414285715,2025-04-12,True,False
|
||||
1007718012,14,22418.0,23378.771428571432,23378.771428571432,2025-04-19,True,False
|
||||
1012475016,14,15642.5776,15900.422285714287,15900.422285714287,2025-04-14,True,False
|
||||
1024520010,14,20607.4643,21583.91416428571,21583.91416428571,2025-04-14,True,False
|
||||
1026391014,14,31812.27,32903.86750000001,32903.86750000001,2025-04-13,True,False
|
||||
1026407016,14,31305.5257,32545.348500000004,32545.348500000004,2025-04-10,True,False
|
||||
1048739025,14,22949.7957,23907.389721428575,23907.389721428575,2025-04-15,True,False
|
||||
1049872017,14,21443.806800000002,22044.473657142862,22044.473657142862,2025-04-20,True,False
|
||||
1050211014,14,48780.1,51706.90600000001,51706.90600000001,2025-04-17,True,False
|
||||
1050617018,14,30535.01,32149.003385714284,32149.003385714284,2025-04-18,True,False
|
||||
1050806027,14,12381.43,12965.125985714287,12965.125985714287,2025-04-13,True,False
|
||||
1055487016,14,50709.042400000006,52150.83902857143,52150.83902857143,2025-04-12,True,False
|
||||
1056604012,14,123586.78,128353.69865714286,128353.69865714286,2025-04-15,True,False
|
||||
1058148017,14,87361.58,91667.25787142858,91667.25787142858,2025-04-18,True,False
|
||||
1059568010,14,110597.92899999999,113413.71785714284,113413.71785714284,2025-04-20,True,False
|
||||
1062072018,14,15979.257000000001,16120.57015714286,16120.57015714286,2025-04-12,True,False
|
||||
1062596015,14,70018.654,72494.56114285714,72494.56114285714,2025-04-16,True,False
|
||||
1068775012,14,4652.6256,4840.178400000002,4840.178400000002,2025-04-14,True,False
|
||||
1071766010,14,11815.6264,12049.5992,12049.5992,2025-04-13,True,False
|
||||
1073657011,14,51075.2152,52736.78514285714,52736.78514285714,2025-04-11,True,False
|
||||
1074328022,14,23416.5174,24121.636621428577,24121.636621428577,2025-04-11,True,False
|
||||
1077617011,14,4960.04,5200.956228571428,5200.956228571428,2025-04-15,True,False
|
||||
1078726011,14,8161.355500000001,8478.805678571429,8478.805678571429,2025-04-16,True,False
|
||||
1078919011,14,19392.19,20334.09637142857,20334.09637142857,2025-04-12,True,False
|
||||
1084193010,14,7566.707200000001,7691.433142857144,7691.433142857144,2025-04-15,True,False
|
||||
1084222017,14,7658.759300000001,7945.827364285715,7945.827364285715,2025-04-11,True,False
|
||||
1090993013,14,7739.27,8098.593250000001,8098.593250000001,2025-04-14,True,False
|
||||
1092014011,14,14662.29,15343.039178571431,15343.039178571431,2025-04-18,True,False
|
||||
1092396010,14,93320.8589,96620.74782857145,96620.74782857145,2025-04-19,True,False
|
||||
1092753013,14,12456.44,12865.72302857143,12865.72302857143,2025-04-16,True,False
|
||||
1099421018,14,19668.6188,20364.114514285717,20364.114514285717,2025-04-18,True,False
|
||||
1099865016,14,23140.25,24743.538750000003,24743.538750000003,2025-04-20,True,False
|
||||
1100425013,14,23596.19,24961.398135714287,24961.398135714287,2025-04-16,True,False
|
||||
1101981019,14,15941.880400000002,16719.80808571429,16719.80808571429,2025-04-20,True,False
|
||||
1102352014,14,30702.0,31820.0,31820.0,2025-04-14,True,False
|
||||
1108793013,14,8138.6995,8364.460928571429,8364.460928571429,2025-04-15,True,False
|
||||
1110716019,14,7937.8936,8079.6417,8079.6417,2025-04-16,True,False
|
||||
1111025015,14,12812.961000000001,13356.651000000002,13356.651000000002,2025-04-17,True,False
|
||||
1112139010,14,6240.65,6543.767285714285,6543.767285714285,2025-04-16,True,False
|
||||
1112274016,14,9728.9159,10100.458657142857,10100.458657142857,2025-04-15,True,False
|
||||
1113984011,14,7638.5289999999995,7957.251214285714,7957.251214285714,2025-04-19,True,False
|
||||
1114046017,14,9856.06,10250.302399999999,10250.302399999999,2025-04-15,True,False
|
||||
1131035016,14,3831.085,3921.4157142857134,3921.4157142857134,2025-04-13,True,False
|
||||
1136038012,14,49953.71,52986.61382142857,52986.61382142857,2025-04-12,True,False
|
||||
1136840017,14,19030.68,19914.247285714286,19914.247285714286,2025-04-13,True,False
|
||||
1137979028,14,16475.6928,16810.283199999998,16810.283199999998,2025-04-10,True,False
|
||||
1144064018,14,34347.7368,34890.31780714286,34890.31780714286,2025-04-18,True,False
|
||||
1145715018,14,52231.6955,53709.25407142857,53709.25407142857,2025-04-20,True,False
|
||||
1148441013,14,71671.28,73616.64331428571,73616.64331428571,2025-04-17,True,False
|
||||
1149064013,14,13041.33,13730.657442857144,13730.657442857144,2025-04-12,True,False
|
||||
1153011012,14,66438.26,68953.42270000001,68953.42270000001,2025-04-15,True,False
|
||||
1155907018,14,78055.5169,80373.99760000002,80373.99760000002,2025-04-16,True,False
|
||||
1166621015,14,10458.3783,11131.442250000002,11131.442250000002,2025-04-15,True,False
|
||||
1166928013,14,117526.77,122899.42234285714,122899.42234285714,2025-04-13,True,False
|
||||
1175464018,14,21420.0,22185.0,22185.0,2025-04-16,True,False
|
||||
1175961012,14,21840.0,21945.0,21945.0,2025-04-13,True,False
|
||||
1181003016,14,53535.21,56785.56203571429,56785.56203571429,2025-04-19,True,False
|
||||
1181900012,14,348.45,362.4964285714286,362.4964285714286,2025-04-10,True,False
|
||||
1185534017,14,3883.652,4015.487428571429,4015.487428571429,2025-04-14,True,False
|
||||
1185747011,14,40857.6708,41973.53155714286,41973.53155714286,2025-04-18,True,False
|
||||
1190093026,14,28436.137000000002,29343.25371428571,29343.25371428571,2025-04-13,True,False
|
||||
1201866010,14,28981.34,30699.519442857145,30699.519442857145,2025-04-15,True,False
|
||||
1202335014,14,3358.0581,3472.051585714287,3472.051585714287,2025-04-15,True,False
|
||||
1214025019,14,11110.0,11526.42857142857,11526.42857142857,2025-04-17,True,False
|
||||
1214711015,14,40056.54,42259.64970000001,42259.64970000001,2025-04-18,True,False
|
||||
1216795013,14,3822.8008,3885.814,3885.814,2025-04-12,True,False
|
||||
1218765016,14,40056.54,41744.637042857146,41744.637042857146,2025-04-17,True,False
|
||||
1226527019,14,8968.5602,9198.682757142857,9198.682757142857,2025-04-19,True,False
|
||||
1226530017,14,11896.3153,12359.043264285716,12359.043264285716,2025-04-14,True,False
|
||||
1228194026,14,21420.0,22440.0,22440.0,2025-04-18,True,False
|
||||
1243004012,14,12207.18,12922.17197142857,12922.17197142857,2025-04-15,True,False
|
||||
1243592016,14,21000.0,22110.0,22110.0,2025-04-17,True,False
|
||||
1246671019,14,4519.9362,4659.205942857144,4659.205942857144,2025-04-16,True,False
|
||||
1250257010,14,8017.65,8372.717357142858,8372.717357142858,2025-04-14,True,False
|
||||
1254162017,14,17290.380800000003,17103.961600000002,17103.961600000002,2025-04-14,True,False
|
||||
1256499010,14,10000.0,10528.57142857143,10528.57142857143,2025-04-17,True,False
|
||||
1259062019,14,27510.5322,28281.13534285715,28281.13534285715,2025-04-08,True,False
|
||||
1261523014,14,54300.4875,54891.51321428572,54891.51321428572,2025-04-14,True,False
|
||||
1263325016,14,28431.6616,29557.667999999998,29557.667999999998,2025-04-13,True,False
|
||||
1270270019,14,40408.5546,42134.690335714295,42134.690335714295,2025-04-13,True,False
|
||||
1271085012,14,37000.0,38876.42857142857,38876.42857142857,2025-04-15,True,False
|
||||
1275044015,14,131415.45309999998,137549.41342857142,137549.41342857142,2025-04-16,True,False
|
||||
1279191010,14,37072.41,38604.11321428571,38604.11321428571,2025-04-18,True,False
|
||||
1285665028,14,27586.08,28551.592800000002,28551.592800000002,2025-04-20,True,False
|
||||
1291708012,14,113682.43,118473.33240714285,118473.33240714285,2025-04-20,True,False
|
||||
1291809012,14,30300.0,31864.285714285714,31864.285714285714,2025-04-14,True,False
|
||||
1291885010,14,14137.3122,14751.117071428578,14751.117071428578,2025-04-16,True,False
|
||||
1298114029,14,27598.57,29037.638292857147,29037.638292857147,2025-04-12,True,False
|
||||
1298415018,14,132727.26,139648.03855714286,139648.03855714286,2025-04-16,True,False
|
||||
1298420010,14,179163.11,189528.97564999998,189528.97564999998,2025-04-20,True,False
|
||||
1311857010,14,21944.381999999998,22743.477142857144,22743.477142857144,2025-04-15,True,False
|
||||
1313574018,28,10000.0,10489.285714285714,10489.285714285714,2025-03-28,True,True
|
||||
1317200010,14,8335.11,8728.0509,8728.0509,2025-04-17,True,False
|
||||
1331055016,14,102000.0,106000.0,106000.0,2025-04-14,True,False
|
||||
1333165010,14,21000.0,22125.0,22125.0,2025-04-17,True,False
|
||||
1342374016,14,45215.080200000004,46449.9458357143,46449.9458357143,2025-04-13,True,False
|
||||
1342804012,14,21290.5374,22103.613085714285,22103.613085714285,2025-04-17,True,False
|
||||
1343673022,14,21000.0,21795.0,21795.0,2025-04-19,True,False
|
||||
1345196019,14,34137.85,36088.58428571429,36088.58428571429,2025-04-19,True,False
|
||||
1350668019,14,5473.5637,5663.241649999999,5663.241649999999,2025-04-13,True,False
|
||||
1355277012,14,92034.91619999999,94226.22372857144,94226.22372857144,2025-04-17,True,False
|
||||
1356399027,14,40457.1054,42116.59062857144,42116.59062857144,2025-04-15,True,False
|
||||
1364716018,14,40457.1054,42145.202442857146,42145.202442857146,2025-04-15,True,False
|
||||
1365385015,14,8548.4438,8744.073928571426,8744.073928571426,2025-04-11,True,False
|
||||
1365921013,14,113682.43,118067.32372857141,118067.32372857141,2025-04-17,True,False
|
||||
1368292019,14,26473.8675,27353.83339285714,27353.83339285714,2025-04-20,True,False
|
||||
1375320015,14,64000.0,66925.71428571429,66925.71428571429,2025-04-18,True,False
|
||||
1377321016,14,67056.28,71079.65680000001,71079.65680000001,2025-04-17,True,False
|
||||
1383019015,14,40056.54,41744.63704285714,41744.63704285714,2025-04-15,True,False
|
||||
1384084016,14,11276.86,11913.197100000003,11913.197100000003,2025-04-16,True,False
|
||||
1395426014,14,52623.78,55217.380585714294,55217.380585714294,2025-04-19,True,False
|
||||
1396795012,14,37770.83,39929.16314285715,39929.16314285715,2025-04-16,True,False
|
||||
1399402010,14,13987.86,14597.331042857142,14597.331042857142,2025-04-14,True,False
|
||||
1408181013,14,9958.7111,10275.643207142859,10275.643207142859,2025-04-20,True,False
|
||||
1408676023,14,46018.778999999995,47146.69025,47146.69025,2025-04-16,True,False
|
||||
1410787014,14,25255.1775,25444.16182142857,25444.16182142857,2025-04-18,True,False
|
||||
1414502015,14,8270.08,8451.840000000002,8451.840000000002,2025-04-17,True,False
|
||||
1414536014,14,2224.02,2285.361428571429,2285.361428571429,2025-04-19,True,False
|
||||
1415176017,14,8031.52,8400.72,8400.72,2025-04-19,True,True
|
||||
1415211013,14,3929.3545,4084.9725,4084.9725,2025-04-20,True,False
|
||||
1417746018,14,4268.94,4485.436242857143,4485.436242857143,2025-04-09,True,False
|
||||
1428498012,14,14898.37,15558.154957142859,15558.154957142859,2025-04-14,True,False
|
||||
1436713013,14,33026.67,34583.641585714286,34583.641585714286,2025-04-13,True,False
|
||||
1443284010,14,22275.0,23516.035714285714,23516.035714285714,2025-04-14,True,False
|
||||
1443552014,14,22720.5,23372.839285714286,23372.839285714286,2025-04-16,True,False
|
||||
1444444015,14,23273.136,23859.853714285717,23859.853714285717,2025-04-19,True,False
|
||||
1452711015,14,8568.35,8923.3245,8923.3245,2025-04-17,True,False
|
||||
1457606015,14,6201.086899999999,6385.277599999999,6385.277599999999,2025-04-15,True,False
|
||||
1462318019,14,40857.6708,41801.86067142857,41801.86067142857,2025-04-18,True,False
|
||||
1465238011,14,15977.089,16519.451285714287,16519.451285714287,2025-04-12,True,False
|
||||
1465352016,14,23419.3326,24354.137892857147,24354.137892857147,2025-04-17,True,False
|
||||
1469561010,14,30370.065000000002,31317.813214285718,31317.813214285718,2025-04-16,True,False
|
||||
1472225019,14,6153.1155,6199.159221428571,6199.159221428571,2025-04-18,True,False
|
||||
1479990013,14,120868.53820000001,125484.45125714291,125484.45125714291,2025-04-20,True,False
|
||||
1486608013,14,14280.0,14710.0,14710.0,2025-04-15,True,False
|
||||
1501871011,14,10605.3,11090.113714285713,11090.113714285713,2025-04-18,True,False
|
||||
1511221015,14,134753.29,139180.89810000002,139180.89810000002,2025-04-13,True,False
|
||||
1511356014,14,4070.02,4314.221199999999,4314.221199999999,2025-04-18,True,False
|
||||
1512095013,14,3997.5432,4120.7168,4120.7168,2025-04-13,True,False
|
||||
1512253017,14,25946.4234,26691.383735714284,26691.383735714284,2025-04-10,True,False
|
||||
1512290014,14,76173.8251,77600.10337857142,77600.10337857142,2025-04-19,True,False
|
||||
1516029010,14,140222.5136,142630.18038571425,142630.18038571425,2025-04-15,True,False
|
||||
1522821017,14,45567.614499999996,47050.012142857144,47050.012142857144,2025-04-17,True,False
|
||||
1523440019,14,42496.2752,43818.64868571429,43818.64868571429,2025-04-17,True,False
|
||||
1530552019,14,9070.9918,9359.672585714286,9359.672585714286,2025-04-12,True,False
|
||||
1533377013,14,21840.0,22215.0,22215.0,2025-04-13,True,False
|
||||
1538119014,14,40056.54,41916.30792857143,41916.30792857143,2025-04-18,True,False
|
||||
1538899011,14,5923.854,6164.458714285714,6164.458714285714,2025-04-12,True,False
|
||||
1543039017,14,43414.55,45864.37103571429,45864.37103571429,2025-04-09,True,False
|
||||
1543041018,14,34915.95,36686.687464285715,36686.687464285715,2025-04-15,True,False
|
||||
1543044019,14,26606.08,27860.366628571435,27860.366628571435,2025-04-11,True,False
|
||||
1543345015,14,17942.2662,18437.137757142857,18437.137757142857,2025-04-15,True,False
|
||||
1543391012,14,13000.0,13631.42857142857,13631.42857142857,2025-04-22,True,False
|
||||
1547865012,14,57734.6278,57734.6278,57734.6278,2025-04-12,True,False
|
||||
1566054028,14,69690.3333,72499.63245,72499.63245,2025-04-15,True,False
|
||||
1567984016,14,67767.02519999999,70092.36429999999,70092.36429999999,2025-04-18,True,False
|
||||
1587630018,14,21000.0,22020.0,22020.0,2025-04-14,True,False
|
||||
1593487015,14,1380.0,1429.2857142857144,1429.2857142857144,2025-04-13,True,False
|
||||
1593684010,14,34093.7346,35096.491500000004,35096.491500000004,2025-04-19,True,False
|
||||
1602958019,14,6180.0,6342.857142857143,6342.857142857143,2025-04-18,True,False
|
||||
1609825013,14,33091.55,34864.31160714286,34864.31160714286,2025-04-12,True,False
|
||||
1613386012,14,11391.21,11903.81445,11903.81445,2025-04-11,True,False
|
||||
1616744013,14,6035.1439,6218.673735714286,6218.673735714286,2025-04-11,True,False
|
||||
1617111010,14,61800.0,63685.71428571428,63685.71428571428,2025-04-16,True,False
|
||||
1618434011,14,30000.0,31435.714285714286,31435.714285714286,2025-04-13,True,False
|
||||
1620472016,14,40056.54,42002.14337142858,42002.14337142858,2025-04-17,True,False
|
||||
1631034014,14,7227.3349,7427.815757142858,7427.815757142858,2025-04-15,True,False
|
||||
1635580014,14,50968.0946,53347.086285714286,53347.086285714286,2025-04-15,True,False
|
||||
1642340016,14,143009.0007,149886.37838571428,149886.37838571428,2025-04-13,True,False
|
||||
1642419011,14,66300.0,68758.92857142857,68758.92857142857,2025-04-18,True,False
|
||||
1643680010,14,318280.88,332830.8630857143,332830.8630857143,2025-04-19,True,False
|
||||
1648142016,14,25500.0,26321.428571428572,26321.428571428572,2025-04-14,True,False
|
||||
1650352010,14,44328.51,46861.56771428571,46861.56771428571,2025-04-15,True,False
|
||||
1664348010,14,46692.25,49093.565714285716,49093.565714285716,2025-04-17,True,False
|
||||
1668954015,14,30900.0,31757.14285714286,31757.14285714286,2025-04-15,True,False
|
||||
1671967011,14,42059.367000000006,42059.36700000001,42059.36700000001,2025-04-18,True,False
|
||||
1675953014,14,10775.38,11275.6655,11275.6655,2025-04-11,True,False
|
||||
1676417016,14,4306.02,4502.86662857143,4502.86662857143,2025-04-09,True,False
|
||||
1676704017,14,26252.26,27321.102014285716,27321.102014285716,2025-04-18,True,False
|
||||
1681835010,14,21000.0,21945.0,21945.0,2025-04-14,True,False
|
||||
1683968019,14,4534.4758,4739.713742857143,4739.713742857143,2025-04-15,True,False
|
||||
1684383017,14,22105.1832,23136.966857142856,23136.966857142856,2025-04-19,True,False
|
||||
1687874013,14,21210.0,22155.0,22155.0,2025-04-18,True,False
|
||||
1703943017,14,20200.0,21128.571428571428,21128.571428571428,2025-04-17,True,False
|
||||
1708145012,14,26019.916,26341.590785714292,26341.590785714292,2025-04-12,True,False
|
||||
1714166012,14,24532.1692,25144.622799999994,25144.622799999994,2025-04-19,True,False
|
||||
1732006017,14,28209.48,28693.84607142858,28693.84607142858,2025-04-15,True,False
|
||||
1732103019,14,14140.0,14600.0,14600.0,2025-04-14,True,False
|
||||
1734119012,14,7137.7458,7267.704757142857,7267.704757142857,2025-04-13,True,False
|
||||
1736676012,14,15000.0,15385.714285714286,15385.714285714286,2025-04-15,True,False
|
||||
1740753015,14,40056.54,41944.91974285715,41944.91974285715,2025-04-20,True,False
|
||||
1755401019,14,45450.0,46896.42857142857,46896.42857142857,2025-04-18,True,False
|
||||
1757447013,14,5287.37,5593.282121428572,5593.282121428572,2025-04-14,True,False
|
||||
1757605017,14,196316.2048,205895.99131428575,205895.99131428575,2025-04-11,True,False
|
||||
1759380013,14,3678.2628,3819.9325857142862,3819.9325857142862,2025-04-14,True,False
|
||||
1766882018,14,36501.42,38404.708328571425,38404.708328571425,2025-04-13,True,False
|
||||
1774623010,14,83294.7648,84610.54748571428,84610.54748571428,2025-04-15,True,False
|
||||
1776036014,14,363.19,380.5712357142857,380.5712357142857,2025-04-16,True,False
|
||||
1777475014,14,110687.73,116696.49248571429,116696.49248571429,2025-04-20,True,False
|
||||
1779276019,14,86907.5003,90656.69232142858,90656.69232142858,2025-04-17,True,False
|
||||
1785264017,14,110026.6326,114384.12299999998,114384.12299999998,2025-04-16,True,False
|
||||
1793393017,14,79427.03,83114.71353571431,83114.71353571431,2025-04-14,True,False
|
||||
1793495014,14,11193.92,11641.676800000001,11641.676800000001,2025-04-13,True,False
|
||||
1795523018,14,50000.0,52571.42857142857,52571.42857142857,2025-04-17,True,False
|
||||
1799104017,14,47500.0,49875.0,49875.0,2025-04-19,True,False
|
||||
1801542017,14,12504.129,12924.435857142857,12924.435857142857,2025-04-17,True,False
|
||||
1804659017,14,24533.601,25341.079464285714,25341.079464285714,2025-04-15,True,False
|
||||
1808809016,14,5002.1765,5207.357714285715,5207.357714285715,2025-04-15,True,False
|
||||
1814233016,14,28714.5828,29790.871971428565,29790.871971428565,2025-04-17,True,False
|
||||
1815571016,14,21000.0,21855.0,21855.0,2025-04-17,True,False
|
||||
1819235015,14,96150.0,100270.71428571429,100270.71428571429,2025-04-15,True,False
|
||||
1824974019,28,64750.0,67825.625,67825.625,2025-03-22,True,True
|
||||
1829881017,14,7864.28,8240.641971428571,8240.641971428571,2025-04-20,True,False
|
||||
1832273014,14,8880.394699999999,9244.654171428572,9244.654171428572,2025-04-15,True,False
|
||||
1832570012,14,10200.0,10307.142857142857,10307.142857142857,2025-04-19,True,False
|
||||
1833032010,14,32320.0,33165.71428571428,33165.71428571428,2025-04-16,True,False
|
||||
1839846011,14,61027.52820000001,62822.455500000025,62822.455500000025,2025-04-15,True,False
|
||||
1845307014,14,10100.0,10535.714285714286,10535.714285714286,2025-04-13,True,False
|
||||
1858230011,14,7780.03,8060.6392857142855,8060.6392857142855,2025-04-11,True,False
|
||||
1860928012,14,28840.0,29300.0,29300.0,2025-04-16,True,False
|
||||
1863257016,14,12260.0163,12667.527449999998,12667.527449999998,2025-04-14,True,False
|
||||
1867743014,14,2651.0480000000002,2806.6611428571427,2806.6611428571427,2025-04-19,True,False
|
||||
1874878019,14,21210.0,21870.0,21870.0,2025-04-14,True,False
|
||||
1890774014,14,6463.8182,6824.950899999999,6824.950899999999,2025-04-13,True,False
|
||||
1893248017,14,6961.23,7294.374578571428,7294.374578571428,2025-04-19,True,False
|
||||
1893502016,14,59180.3592,60962.40082857142,60962.40082857142,2025-04-15,True,False
|
||||
1904132012,14,27975.285,28641.363214285717,28641.363214285717,2025-04-10,True,False
|
||||
1905813013,14,26619.484800000002,27320.969142857135,27320.969142857135,2025-04-15,True,False
|
||||
1907631011,14,52231.8975,55002.33053571428,55002.33053571428,2025-04-17,True,False
|
||||
1908562013,14,24625.43,25891.880685714284,25891.880685714284,2025-04-11,True,False
|
||||
1908717019,14,30989.6604,31749.210900000005,31749.210900000005,2025-04-20,True,True
|
||||
1909045012,14,55935.69,58572.658242857135,58572.658242857135,2025-04-17,True,False
|
||||
1910312013,14,68931.05,72525.31189285716,72525.31189285716,2025-04-14,True,False
|
||||
1918274012,14,15952.640000000001,16395.15428571429,16395.15428571429,2025-04-20,True,False
|
||||
1924705014,14,101000.0,105214.28571428571,105214.28571428571,2025-04-19,True,False
|
||||
1931759013,14,68431.4078,70044.90839999999,70044.90839999999,2025-04-18,True,False
|
||||
1938793018,14,11589.75,12160.959107142859,12160.959107142859,2025-04-18,True,True
|
||||
1939714016,14,46692.25,49460.43339285714,49460.43339285714,2025-04-18,True,False
|
||||
1940684016,14,13218.0882,13838.264607142857,13838.264607142857,2025-04-10,True,False
|
||||
1944111019,14,23813.17,24816.72502142857,24816.72502142857,2025-04-11,True,False
|
||||
1944186013,14,20400.0,20885.714285714286,20885.714285714286,2025-04-17,True,False
|
||||
1944801013,14,8190.0,8505.9,8505.9,2025-04-13,True,False
|
||||
1946636017,14,42860.497800000005,42660.21510000001,42660.21510000001,2025-04-19,True,False
|
||||
1949225012,14,40056.54,41916.307928571434,41916.307928571434,2025-04-18,True,False
|
||||
1949359014,14,40857.6708,42402.70877142857,42402.70877142857,2025-04-15,True,False
|
||||
1951369012,14,3829.94,4013.2299857142857,4013.2299857142857,2025-04-15,True,False
|
||||
1954035015,14,51714.55,55075.99575000001,55075.99575000001,2025-04-18,True,False
|
||||
1955200012,14,18854.9931,19721.736064285717,19721.736064285717,2025-04-16,True,True
|
||||
1955976014,14,40857.6708,41716.02522857143,41716.02522857143,2025-04-15,True,False
|
||||
1957290019,14,21630.0,22200.0,22200.0,2025-04-15,True,False
|
||||
1957372012,14,28104.5,29429.426428571427,29429.426428571427,2025-04-17,True,False
|
||||
1957430013,14,28104.5,29128.30678571429,29128.30678571429,2025-04-16,True,False
|
||||
1957861013,14,56466.27,58644.2547,58644.2547,2025-04-21,True,False
|
||||
1959768013,14,5816.45,6115.581714285714,6115.581714285714,2025-04-11,True,False
|
||||
1972698016,14,34643.0,36480.5,36480.5,2025-04-18,True,False
|
||||
1972978011,14,5000.0,5282.142857142857,5282.142857142857,2025-04-12,True,False
|
||||
1974761013,14,45000.0,47282.142857142855,47282.142857142855,2025-04-19,True,False
|
||||
1985487014,14,57206.4,60078.857142857145,60078.857142857145,2025-04-15,True,False
|
||||
1987937018,14,41258.2362,42602.99147142858,42602.99147142858,2025-04-18,True,False
|
||||
1988801019,14,28804.31,29977.056907142858,29977.056907142858,2025-04-17,True,False
|
||||
1991880016,14,17159.613,17808.50592857143,17808.50592857143,2025-04-15,True,False
|
||||
1995835014,14,3518.4,3706.8857142857155,3706.8857142857155,2025-04-14,True,False
|
||||
1995942013,14,4740.48,4923.327085714285,4923.327085714285,2025-04-16,True,False
|
||||
1998110015,14,12953.25,13319.67857142857,13319.67857142857,2025-04-14,True,False
|
||||
1999144011,14,63053.17,66385.9804142857,66385.9804142857,2025-04-13,True,False
|
||||
2002086018,14,5707.4051,5845.934350000001,5845.934350000001,2025-04-12,True,False
|
||||
2002869019,14,4590.6,4833.246,4833.246,2025-04-15,True,False
|
||||
2002918010,14,4473.414,4633.178785714285,4633.178785714285,2025-04-14,True,False
|
||||
2002982017,14,3295.43,3476.6786499999994,3476.6786499999994,2025-04-15,True,False
|
||||
2006029015,14,3762.5126000000005,3900.8794000000007,3900.8794000000007,2025-04-15,True,False
|
||||
2010494016,14,40857.6708,41944.91974285715,41944.91974285715,2025-04-16,True,False
|
||||
2015047015,14,41258.2362,42087.97881428572,42087.97881428572,2025-04-17,True,False
|
||||
2015759017,14,47355.7,49926.438,49926.438,2025-04-20,True,False
|
||||
2020313017,14,5000.0,5253.571428571428,5253.571428571428,2025-04-11,True,False
|
||||
2034595018,14,153940.62,161417.73582857143,161417.73582857143,2025-04-19,True,False
|
||||
2039362015,14,100000.0,105071.42857142857,105071.42857142857,2025-04-18,True,False
|
||||
2043399010,14,41658.8016,42145.202442857146,42145.202442857146,2025-04-15,True,False
|
||||
2047786012,14,28150.93,29009.90428571428,29009.90428571428,2025-04-20,True,False
|
||||
2048850019,14,21000.0,21960.0,21960.0,2025-04-15,True,False
|
||||
2051114016,14,30357.600000000002,30816.299999999996,30816.299999999996,2025-04-16,True,False
|
||||
2055259010,14,22207.58,23127.608314285717,23127.608314285717,2025-04-15,True,False
|
||||
2059595017,14,13750.0,14683.035714285714,14683.035714285714,2025-04-12,True,False
|
||||
2059835014,14,66500.0,69920.0,69920.0,2025-04-20,True,False
|
||||
2062857010,14,20200.0,21057.14285714286,21057.14285714286,2025-04-20,True,True
|
||||
2066217012,14,57600.0,59739.42857142857,59739.42857142857,2025-04-16,True,False
|
||||
2076926027,14,30917.33,32220.274621428576,32220.274621428576,2025-04-10,True,False
|
||||
2077079016,14,44055.84,45786.60514285713,45786.60514285713,2025-04-15,True,False
|
||||
2083803018,14,39063.08,40374.48340000001,40374.48340000001,2025-04-16,True,False
|
||||
2084273016,14,15300.0,15975.0,15975.0,2025-04-17,True,False
|
||||
2085388018,14,300000.0,318214.28571428574,318214.28571428574,2025-04-16,True,True
|
||||
2088066012,14,33323.9804,34620.17482857143,34620.17482857143,2025-04-13,True,False
|
||||
2089870018,14,4242.0,4386.0,4386.0,2025-04-17,True,False
|
||||
2090266019,14,45982.0386,47237.85057857142,47237.85057857142,2025-04-15,True,False
|
||||
2090875017,14,49180.0462,49279.46677857143,49279.46677857143,2025-04-11,True,False
|
||||
2094484013,14,46363.3834,47970.035299999996,47970.035299999996,2025-04-17,True,False
|
||||
2095040012,14,81600.0,84000.0,84000.0,2025-04-18,True,False
|
||||
2101243011,14,12000.0,12677.142857142857,12677.142857142857,2025-04-16,True,False
|
||||
2101560013,14,12254.0573,12982.0211,12982.0211,2025-04-18,True,True
|
||||
2105827011,14,101000.0,105357.14285714286,105357.14285714286,2025-04-18,True,False
|
||||
2105903012,14,136350.0,142328.57142857142,142328.57142857142,2025-04-18,True,False
|
||||
2109081015,14,47626.095,49427.081785714276,49427.081785714276,2025-04-17,True,False
|
||||
2109496019,14,40457.1054,42288.26151428572,42288.26151428572,2025-04-21,True,False
|
||||
2113221014,14,66277.0701,68529.20354999999,68529.20354999999,2025-04-17,True,False
|
||||
2117066019,14,41658.8016,42173.814257142854,42173.814257142854,2025-04-19,True,False
|
||||
2117394015,14,4492.21,4752.116435714285,4752.116435714285,2025-04-09,True,False
|
||||
2119186010,14,57485.33,60564.901249999995,60564.901249999995,2025-04-16,True,False
|
||||
2122558013,14,8062.78,8385.2912,8385.2912,2025-04-13,True,False
|
||||
2123350018,14,37744.25,40143.70589285715,40143.70589285715,2025-04-11,True,False
|
||||
2124152013,14,41258.2362,42545.767842857145,42545.767842857145,2025-04-15,True,False
|
||||
2130118023,14,38787.71,41253.50013571428,41253.50013571428,2025-04-12,True,False
|
||||
2140539016,14,10819.872,10922.9184,10922.9184,2025-04-15,True,False
|
||||
2141435012,14,26000.0,26178.571428571428,26178.571428571428,2025-04-18,True,False
|
||||
2141748019,14,113300.0,115735.71428571429,115735.71428571429,2025-04-12,True,False
|
||||
2142999014,14,22500.0,23528.571428571428,23528.571428571428,2025-04-10,True,False
|
||||
2146179014,14,74375.087,76005.65821428572,76005.65821428572,2025-04-20,True,False
|
||||
2146523019,14,24240.0,25388.571428571428,25388.571428571428,2025-04-14,True,False
|
||||
2165506013,14,50762.0034,52788.217821428574,52788.217821428574,2025-04-16,True,False
|
||||
2165587016,14,5893.25,6137.398928571428,6137.398928571428,2025-04-17,True,False
|
||||
2168451011,14,40056.54,42173.81425714287,42173.81425714287,2025-04-19,True,False
|
||||
2168911018,14,34483.3594,36141.6822,36141.6822,2025-04-19,True,False
|
||||
2178112018,14,6801.7578,7082.782807142859,7082.782807142859,2025-04-13,True,False
|
||||
2180261010,14,41266.58,42930.08428571429,42930.08428571429,2025-04-20,True,False
|
||||
2180749017,14,18532.8,18761.914285714283,18761.914285714283,2025-04-19,True,False
|
||||
2182865010,14,38250.0,39750.0,39750.0,2025-04-17,True,False
|
||||
2197472016,14,25862.5145,27069.675714285717,27069.675714285717,2025-04-21,True,False
|
||||
2198899018,14,30641.945600000003,32202.214400000004,32202.214400000004,2025-04-18,True,False
|
||||
2214244019,14,50968.0946,52409.907742857155,52409.907742857155,2025-04-15,True,False
|
||||
2217047015,14,16160.0,16902.85714285714,16902.85714285714,2025-04-10,True,False
|
||||
2219928019,14,5452.0,5709.022857142858,5709.022857142858,2025-04-21,True,False
|
||||
2225726018,14,46800.0,47635.71428571428,47635.71428571428,2025-04-21,True,False
|
||||
2225954011,14,32640.0,33348.57142857143,33348.57142857143,2025-04-17,True,False
|
||||
2226559011,14,25500.0,26500.0,26500.0,2025-04-20,True,False
|
||||
2226613014,14,24239.99,25209.589600000003,25209.589600000003,2025-04-14,True,False
|
||||
2226717015,14,25750.0,26375.0,26375.0,2025-04-19,True,False
|
||||
2226808018,14,65233.75,68122.67321428572,68122.67321428572,2025-04-14,True,False
|
||||
2227088017,14,35000.0,36775.0,36775.0,2025-04-16,True,False
|
||||
2227138015,14,46800.0,47153.57142857143,47153.57142857143,2025-04-17,True,False
|
||||
2227213019,14,65233.75,67936.29107142858,67936.29107142858,2025-04-15,True,False
|
||||
2227418010,14,45900.0,47282.142857142855,47282.142857142855,2025-04-18,True,False
|
||||
2227600013,14,26520.0,27058.571428571428,27058.571428571428,2025-04-17,True,False
|
||||
2234193018,14,45000.0,46928.57142857143,46928.57142857143,2025-04-20,True,False
|
||||
2234892012,14,65886.09760000001,68821.6168,68821.6168,2025-04-14,True,False
|
||||
2242042016,14,4800.0,4964.571428571428,4964.571428571428,2025-04-14,True,False
|
||||
2242872011,14,30300.0,31542.85714285714,31542.85714285714,2025-04-21,True,False
|
||||
2244212015,14,35000.0,36775.0,36775.0,2025-04-17,True,False
|
||||
2253348011,14,25721.5286,26722.012385714283,26722.012385714283,2025-04-13,True,False
|
||||
2253502017,14,26780.0,27114.285714285714,27114.285714285714,2025-04-14,True,False
|
||||
2253954018,14,25391.834300000002,26594.983450000003,26594.983450000003,2025-04-17,True,False
|
||||
2267310019,14,21000.0,22245.0,22245.0,2025-04-13,True,False
|
||||
2271021019,14,52800.0,55779.42857142857,55779.42857142857,2025-04-17,True,False
|
||||
2272954015,14,7977.42,8329.365,8329.365,2025-04-19,True,False
|
||||
|
@@ -0,0 +1,308 @@
|
||||
accountid,least_inflow_6m
|
||||
0194052022,26589.45
|
||||
0532452020,80000.0
|
||||
0676572026,42323.61
|
||||
0761118025,10209.51
|
||||
0820472015,202325.60880000002
|
||||
0835181025,42886.261600000005
|
||||
0835200018,40857.6708
|
||||
0852762012,20648.37
|
||||
0853687015,15977.2405
|
||||
0856304027,77548.12
|
||||
0889240019,1150220.1988
|
||||
0890507013,22330.07
|
||||
0897678011,55141.2663
|
||||
0903403013,67017.0855
|
||||
0903407011,46434.82
|
||||
0908634012,39312.76
|
||||
0909149016,40457.1054
|
||||
0910196014,40457.1054
|
||||
0915538013,31310.0
|
||||
0926111012,12120.0
|
||||
0950731013,24131.3997
|
||||
0954497016,48274.38220000001
|
||||
0961463013,45890.81
|
||||
0966021014,41000.0
|
||||
0977809014,12341.82
|
||||
0983934012,22941.6304
|
||||
0984548012,12621.74
|
||||
1007718012,22418.0
|
||||
1012475016,15642.5776
|
||||
1024520010,20607.4643
|
||||
1026391014,31812.27
|
||||
1026407016,31305.5257
|
||||
1048739025,22949.7957
|
||||
1049872017,21443.806800000002
|
||||
1050211014,48780.1
|
||||
1050617018,30535.01
|
||||
1050806027,12381.43
|
||||
1055487016,50709.042400000006
|
||||
1056604012,123586.78
|
||||
1058148017,87361.58
|
||||
1059568010,110597.92899999999
|
||||
1062072018,15979.257000000001
|
||||
1062596015,70018.654
|
||||
1071766010,11815.6264
|
||||
1073657011,51075.2152
|
||||
1074328022,23416.5174
|
||||
1078919011,19392.19
|
||||
1092014011,14662.29
|
||||
1092396010,93320.8589
|
||||
1092753013,12456.44
|
||||
1099421018,19668.6188
|
||||
1099865016,23140.25
|
||||
1100425013,23596.19
|
||||
1101981019,15941.880400000002
|
||||
1102352014,30702.0
|
||||
1111025015,12812.961000000001
|
||||
1112274016,9728.9159
|
||||
1114046017,9856.06
|
||||
1136038012,49953.71
|
||||
1136840017,19030.68
|
||||
1137979028,16475.6928
|
||||
1144064018,34347.7368
|
||||
1145715018,52231.6955
|
||||
1148441013,71671.28
|
||||
1149064013,13041.33
|
||||
1153011012,66438.26
|
||||
1155907018,78055.5169
|
||||
1166621015,10458.3783
|
||||
1166928013,117526.77
|
||||
1175464018,21420.0
|
||||
1175961012,21840.0
|
||||
1181003016,53535.21
|
||||
1185747011,40857.6708
|
||||
1190093026,28436.137000000002
|
||||
1201866010,28981.34
|
||||
1214025019,11110.0
|
||||
1214711015,40056.54
|
||||
1218765016,40056.54
|
||||
1226530017,11896.3153
|
||||
1228194026,21420.0
|
||||
1243004012,12207.18
|
||||
1243592016,21000.0
|
||||
1254162017,17290.380800000003
|
||||
1256499010,10000.0
|
||||
1259062019,27510.5322
|
||||
1261523014,54300.4875
|
||||
1263325016,28431.6616
|
||||
1270270019,40408.5546
|
||||
1271085012,37000.0
|
||||
1275044015,131415.45309999998
|
||||
1279191010,37072.41
|
||||
1285665028,27586.08
|
||||
1291708012,113682.43
|
||||
1291809012,30300.0
|
||||
1291885010,14137.3122
|
||||
1298114029,27598.57
|
||||
1298415018,132727.26
|
||||
1298420010,179163.11
|
||||
1311857010,21944.381999999998
|
||||
1313574018,10000.0
|
||||
1331055016,102000.0
|
||||
1333165010,21000.0
|
||||
1342374016,45215.080200000004
|
||||
1342804012,21290.5374
|
||||
1343673022,21000.0
|
||||
1345196019,34137.85
|
||||
1355277012,92034.91619999999
|
||||
1356399027,40457.1054
|
||||
1364716018,40457.1054
|
||||
1365921013,113682.43
|
||||
1368292019,26473.8675
|
||||
1375320015,64000.0
|
||||
1377321016,67056.28
|
||||
1383019015,40056.54
|
||||
1384084016,11276.86
|
||||
1395426014,52623.78
|
||||
1396795012,37770.83
|
||||
1399402010,13987.86
|
||||
1408181013,9958.7111
|
||||
1408676023,46018.778999999995
|
||||
1410787014,25255.1775
|
||||
1428498012,14898.37
|
||||
1436713013,33026.67
|
||||
1443284010,22275.0
|
||||
1443552014,22720.5
|
||||
1444444015,23273.136
|
||||
1462318019,40857.6708
|
||||
1465238011,15977.089
|
||||
1465352016,23419.3326
|
||||
1469561010,30370.065000000002
|
||||
1479990013,120868.53820000001
|
||||
1486608013,14280.0
|
||||
1501871011,10605.3
|
||||
1511221015,134753.29
|
||||
1512253017,25946.4234
|
||||
1512290014,76173.8251
|
||||
1516029010,140222.5136
|
||||
1522821017,45567.614499999996
|
||||
1523440019,42496.2752
|
||||
1533377013,21840.0
|
||||
1538119014,40056.54
|
||||
1543039017,43414.55
|
||||
1543041018,34915.95
|
||||
1543044019,26606.08
|
||||
1543345015,17942.2662
|
||||
1543391012,13000.0
|
||||
1547865012,57734.6278
|
||||
1566054028,69690.3333
|
||||
1567984016,67767.02519999999
|
||||
1587630018,21000.0
|
||||
1593684010,34093.7346
|
||||
1609825013,33091.55
|
||||
1613386012,11391.21
|
||||
1617111010,61800.0
|
||||
1618434011,30000.0
|
||||
1620472016,40056.54
|
||||
1635580014,50968.0946
|
||||
1642340016,143009.0007
|
||||
1642419011,66300.0
|
||||
1643680010,318280.88
|
||||
1648142016,25500.0
|
||||
1650352010,44328.51
|
||||
1664348010,46692.25
|
||||
1668954015,30900.0
|
||||
1671967011,42059.367000000006
|
||||
1675953014,10775.38
|
||||
1676704017,26252.26
|
||||
1681835010,21000.0
|
||||
1684383017,22105.1832
|
||||
1687874013,21210.0
|
||||
1703943017,20200.0
|
||||
1708145012,26019.916
|
||||
1714166012,24532.1692
|
||||
1732006017,28209.48
|
||||
1732103019,14140.0
|
||||
1736676012,15000.0
|
||||
1740753015,40056.54
|
||||
1755401019,45450.0
|
||||
1757605017,196316.2048
|
||||
1766882018,36501.42
|
||||
1774623010,83294.7648
|
||||
1777475014,110687.73
|
||||
1779276019,86907.5003
|
||||
1785264017,110026.6326
|
||||
1793393017,79427.03
|
||||
1793495014,11193.92
|
||||
1795523018,50000.0
|
||||
1799104017,47500.0
|
||||
1801542017,12504.129
|
||||
1804659017,24533.601
|
||||
1814233016,28714.5828
|
||||
1815571016,21000.0
|
||||
1819235015,96150.0
|
||||
1824974019,64750.0
|
||||
1832570012,10200.0
|
||||
1833032010,32320.0
|
||||
1839846011,61027.52820000001
|
||||
1845307014,10100.0
|
||||
1860928012,28840.0
|
||||
1863257016,12260.0163
|
||||
1874878019,21210.0
|
||||
1893502016,59180.3592
|
||||
1904132012,27975.285
|
||||
1905813013,26619.484800000002
|
||||
1907631011,52231.8975
|
||||
1908562013,24625.43
|
||||
1908717019,30989.6604
|
||||
1909045012,55935.69
|
||||
1910312013,68931.05
|
||||
1918274012,15952.640000000001
|
||||
1924705014,101000.0
|
||||
1931759013,68431.4078
|
||||
1938793018,11589.75
|
||||
1939714016,46692.25
|
||||
1940684016,13218.0882
|
||||
1944111019,23813.17
|
||||
1944186013,20400.0
|
||||
1946636017,42860.497800000005
|
||||
1949225012,40056.54
|
||||
1949359014,40857.6708
|
||||
1954035015,51714.55
|
||||
1955200012,18854.9931
|
||||
1955976014,40857.6708
|
||||
1957290019,21630.0
|
||||
1957372012,28104.5
|
||||
1957430013,28104.5
|
||||
1957861013,56466.27
|
||||
1972698016,34643.0
|
||||
1974761013,45000.0
|
||||
1985487014,57206.4
|
||||
1987937018,41258.2362
|
||||
1988801019,28804.31
|
||||
1991880016,17159.613
|
||||
1998110015,12953.25
|
||||
1999144011,63053.17
|
||||
2010494016,40857.6708
|
||||
2015047015,41258.2362
|
||||
2015759017,47355.7
|
||||
2034595018,153940.62
|
||||
2039362015,100000.0
|
||||
2043399010,41658.8016
|
||||
2047786012,28150.93
|
||||
2048850019,21000.0
|
||||
2051114016,30357.600000000002
|
||||
2055259010,22207.58
|
||||
2059595017,13750.0
|
||||
2059835014,66500.0
|
||||
2062857010,20200.0
|
||||
2066217012,57600.0
|
||||
2076926027,30917.33
|
||||
2077079016,44055.84
|
||||
2083803018,39063.08
|
||||
2084273016,15300.0
|
||||
2085388018,300000.0
|
||||
2088066012,33323.9804
|
||||
2090266019,45982.0386
|
||||
2090875017,49180.0462
|
||||
2094484013,46363.3834
|
||||
2095040012,81600.0
|
||||
2101243011,12000.0
|
||||
2101560013,12254.0573
|
||||
2105827011,101000.0
|
||||
2105903012,136350.0
|
||||
2109081015,47626.095
|
||||
2109496019,40457.1054
|
||||
2113221014,66277.0701
|
||||
2117066019,41658.8016
|
||||
2119186010,57485.33
|
||||
2123350018,37744.25
|
||||
2124152013,41258.2362
|
||||
2130118023,38787.71
|
||||
2140539016,10819.872
|
||||
2141435012,26000.0
|
||||
2141748019,113300.0
|
||||
2142999014,22500.0
|
||||
2146179014,74375.087
|
||||
2146523019,24240.0
|
||||
2165506013,50762.0034
|
||||
2168451011,40056.54
|
||||
2168911018,34483.3594
|
||||
2180261010,41266.58
|
||||
2180749017,18532.8
|
||||
2182865010,38250.0
|
||||
2197472016,25862.5145
|
||||
2198899018,30641.945600000003
|
||||
2214244019,50968.0946
|
||||
2217047015,16160.0
|
||||
2225726018,46800.0
|
||||
2225954011,32640.0
|
||||
2226559011,25500.0
|
||||
2226613014,24239.99
|
||||
2226717015,25750.0
|
||||
2226808018,65233.75
|
||||
2227088017,35000.0
|
||||
2227138015,46800.0
|
||||
2227213019,65233.75
|
||||
2227418010,45900.0
|
||||
2227600013,26520.0
|
||||
2234193018,45000.0
|
||||
2234892012,65886.09760000001
|
||||
2242872011,30300.0
|
||||
2244212015,35000.0
|
||||
2253348011,25721.5286
|
||||
2253502017,26780.0
|
||||
2253954018,25391.834300000002
|
||||
2267310019,21000.0
|
||||
2271021019,52800.0
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,15 @@
|
||||
sqlalchemy>=2.0.0
|
||||
pandas>=1.5.0
|
||||
numpy>=1.21.0
|
||||
matplotlib>=3.5.0
|
||||
seaborn>=0.12.0
|
||||
matplotlib-venn>=0.11.7
|
||||
wordcloud>=1.8.0
|
||||
scikit-learn>=1.0.0
|
||||
psycopg2-binary>=2.9.0
|
||||
fastapi>=0.68.0
|
||||
uvicorn>=0.15.0
|
||||
pydantic>=1.8.0
|
||||
python-multipart>=0.0.5
|
||||
python-dotenv>=0.19.0
|
||||
joblib>=1.1.0
|
||||
+14199
File diff suppressed because one or more lines are too long
+7232
File diff suppressed because one or more lines are too long
@@ -0,0 +1,6 @@
|
||||
"""
|
||||
Salary Analytics Package
|
||||
A package for analyzing and predicting salary patterns from transaction data.
|
||||
"""
|
||||
|
||||
__version__ = "0.1.0"
|
||||
@@ -0,0 +1,598 @@
|
||||
"""
|
||||
FastAPI application for salary analytics.
|
||||
"""
|
||||
|
||||
from fastapi import FastAPI, HTTPException, BackgroundTasks, UploadFile, File, Depends
|
||||
from fastapi.responses import FileResponse
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional, Dict, List, Union
|
||||
import os
|
||||
import socket
|
||||
import logging
|
||||
import pandas as pd
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from sqlalchemy import text, Table, Column, Integer, String, Float, DateTime, MetaData
|
||||
import numpy as np
|
||||
import warnings
|
||||
import time
|
||||
from .main import SalaryAnalyticsPipeline
|
||||
from .config import OUTPUT_PATHS, TABLE_NAME, BATCH_RESULTS_TABLE
|
||||
from .data_loader import DataLoader
|
||||
from .salary_predictor import SalaryPredictor
|
||||
from .salary_earner_analyzer import SalaryEarnerAnalyzer
|
||||
from .db_operations import DatabaseOperations
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Suppress warnings
|
||||
warnings.filterwarnings('ignore', category=RuntimeWarning, module='numpy')
|
||||
pd.options.mode.chained_assignment = None
|
||||
|
||||
app = FastAPI(
|
||||
title="Salary Analytics API",
|
||||
description="API for analyzing and predicting salary patterns from transaction data",
|
||||
version="1.0.0"
|
||||
)
|
||||
|
||||
# Add CORS middleware
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"], # Allows all origins
|
||||
allow_credentials=True,
|
||||
allow_methods=["*"], # Allows all methods
|
||||
allow_headers=["*"], # Allows all headers
|
||||
)
|
||||
|
||||
# Global pipeline instance
|
||||
pipeline = SalaryAnalyticsPipeline()
|
||||
|
||||
# Global variables to store loaded data and models
|
||||
data_loader = None
|
||||
df = None
|
||||
salary_predictor = None
|
||||
salary_earner_analyzer = None
|
||||
|
||||
class AnalysisResponse(BaseModel):
|
||||
"""Response model for analysis endpoints."""
|
||||
message: str
|
||||
data: Optional[Dict] = None
|
||||
file_path: Optional[str] = None
|
||||
|
||||
class BatchResponse(BaseModel):
|
||||
"""Response model for batch processing."""
|
||||
batch_number: int
|
||||
total_batches: int
|
||||
processed_rows: int
|
||||
results_path: str
|
||||
message: str
|
||||
|
||||
def check_data_loaded():
|
||||
"""Check if data is loaded before running analytics."""
|
||||
if pipeline.df is None:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail="No data loaded. Please load data first using the /load-data endpoint."
|
||||
)
|
||||
|
||||
@app.on_event("startup")
|
||||
async def startup_event():
|
||||
"""Initialize the pipeline on startup."""
|
||||
try:
|
||||
logger.info("Initializing pipeline...")
|
||||
|
||||
# Print network information
|
||||
hostname = socket.gethostname()
|
||||
ip_address = socket.gethostbyname(hostname)
|
||||
logger.info(f"Server running on hostname: {hostname}")
|
||||
logger.info(f"Server IP address: {ip_address}")
|
||||
logger.info(f"Server is accessible at:")
|
||||
logger.info(f"- http://localhost:8000")
|
||||
logger.info(f"- http://127.0.0.1:8000")
|
||||
logger.info(f"- http://{ip_address}:8000")
|
||||
logger.info("Pipeline initialized successfully")
|
||||
except Exception as e:
|
||||
logger.error(f"Error during startup: {str(e)}")
|
||||
raise
|
||||
|
||||
@app.get("/")
|
||||
async def root():
|
||||
"""Root endpoint."""
|
||||
start_time = time.time()
|
||||
logger.info("Root endpoint accessed")
|
||||
response = {"message": "Welcome to Salary Analytics API"}
|
||||
logger.info(f"Root endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check():
|
||||
"""Health check endpoint."""
|
||||
start_time = time.time()
|
||||
logger.info("Health check endpoint accessed")
|
||||
response = {"status": "healthy"}
|
||||
logger.info(f"Health check completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
|
||||
@app.post("/analyze/keyword", response_model=AnalysisResponse)
|
||||
async def analyze_keyword():
|
||||
"""Run keyword-based salary transaction analysis."""
|
||||
start_time = time.time()
|
||||
try:
|
||||
check_data_loaded()
|
||||
logger.info("Starting keyword analysis...")
|
||||
data = pipeline.run_keyword_analysis()
|
||||
logger.info(f"Keyword analysis completed. Found {len(data)} matches")
|
||||
response = AnalysisResponse(
|
||||
message="Keyword analysis completed successfully",
|
||||
data={"count": len(data)}
|
||||
)
|
||||
logger.info(f"Keyword analysis endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(f"Error in keyword analysis: {str(e)}")
|
||||
logger.info(f"Keyword analysis endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.post("/analyze/consistent-amount", response_model=AnalysisResponse)
|
||||
async def analyze_consistent_amount():
|
||||
"""Run consistent amount transaction analysis."""
|
||||
start_time = time.time()
|
||||
try:
|
||||
check_data_loaded()
|
||||
logger.info("Starting consistent amount analysis...")
|
||||
data = pipeline.run_consistent_amount_analysis()
|
||||
logger.info(f"Consistent amount analysis completed. Found {len(data)} matches")
|
||||
response = AnalysisResponse(
|
||||
message="Consistent amount analysis completed successfully",
|
||||
data={"count": len(data)}
|
||||
)
|
||||
logger.info(f"Consistent amount analysis endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(f"Error in consistent amount analysis: {str(e)}")
|
||||
logger.info(f"Consistent amount analysis endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.post("/analyze/transaction-type", response_model=AnalysisResponse)
|
||||
async def analyze_transaction_type():
|
||||
"""Run transaction type analysis."""
|
||||
start_time = time.time()
|
||||
try:
|
||||
check_data_loaded()
|
||||
logger.info("Starting transaction type analysis...")
|
||||
data = pipeline.run_transaction_type_analysis()
|
||||
logger.info(f"Transaction type analysis completed. Found {len(data)} matches")
|
||||
response = AnalysisResponse(
|
||||
message="Transaction type analysis completed successfully",
|
||||
data={"count": len(data)}
|
||||
)
|
||||
logger.info(f"Transaction type analysis endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(f"Error in transaction type analysis: {str(e)}")
|
||||
logger.info(f"Transaction type analysis endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.post("/generate/reports", response_model=AnalysisResponse)
|
||||
async def generate_reports(background_tasks: BackgroundTasks):
|
||||
"""Generate salary earner reports."""
|
||||
start_time = time.time()
|
||||
try:
|
||||
check_data_loaded()
|
||||
logger.info("Starting report generation...")
|
||||
reports = pipeline.generate_salary_earner_reports()
|
||||
logger.info("Reports generated successfully")
|
||||
response = AnalysisResponse(
|
||||
message="Reports generated successfully",
|
||||
data={
|
||||
"verified_salary_earners": len(reports['final_table']),
|
||||
"likely_salary_earners": len(reports['likely_salary_earner']),
|
||||
"high_earners": reports['total_high_earners']
|
||||
}
|
||||
)
|
||||
logger.info(f"Report generation endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(f"Error in report generation: {str(e)}")
|
||||
logger.info(f"Report generation endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.post("/train/models", response_model=AnalysisResponse)
|
||||
async def train_models():
|
||||
"""Train salary prediction models."""
|
||||
start_time = time.time()
|
||||
try:
|
||||
check_data_loaded()
|
||||
logger.info("Starting model training...")
|
||||
pipeline.train_salary_prediction_models()
|
||||
logger.info("Models trained successfully")
|
||||
response = AnalysisResponse(
|
||||
message="Models trained successfully"
|
||||
)
|
||||
logger.info(f"Model training endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(f"Error in model training: {str(e)}")
|
||||
logger.info(f"Model training endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.get("/download/{report_type}")
|
||||
async def download_report(report_type: str):
|
||||
"""Download generated reports."""
|
||||
start_time = time.time()
|
||||
try:
|
||||
check_data_loaded()
|
||||
logger.info(f"Attempting to download report: {report_type}")
|
||||
file_paths = {
|
||||
"high_earners": OUTPUT_PATHS["high_earner_details"],
|
||||
"likely_earners": OUTPUT_PATHS["likely_salary_earner"],
|
||||
"final_table": OUTPUT_PATHS["final_table"],
|
||||
"consistent_plot": OUTPUT_PATHS["consistent_earners_plot"],
|
||||
"inconsistent_plot": OUTPUT_PATHS["inconsistent_earners_plot"],
|
||||
"hypothesis_plot": OUTPUT_PATHS["hypothesis_overlap_plot"]
|
||||
}
|
||||
|
||||
if report_type not in file_paths:
|
||||
logger.error(f"Report type not found: {report_type}")
|
||||
logger.info(f"Download endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=404, detail="Report type not found")
|
||||
|
||||
file_path = file_paths[report_type]
|
||||
if not os.path.exists(file_path):
|
||||
logger.error(f"Report file not found: {file_path}")
|
||||
logger.info(f"Download endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=404, detail="Report file not found")
|
||||
|
||||
logger.info(f"Successfully found report file: {file_path}")
|
||||
response = FileResponse(
|
||||
path=file_path,
|
||||
filename=os.path.basename(file_path),
|
||||
media_type="application/octet-stream"
|
||||
)
|
||||
logger.info(f"Download endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(f"Error downloading report: {str(e)}")
|
||||
logger.info(f"Download endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.post("/run/pipeline", response_model=AnalysisResponse)
|
||||
async def run_full_pipeline():
|
||||
"""Run the complete salary analytics pipeline."""
|
||||
start_time = time.time()
|
||||
try:
|
||||
check_data_loaded()
|
||||
logger.info("Starting full pipeline...")
|
||||
success = pipeline.run_full_pipeline()
|
||||
if not success:
|
||||
logger.error("Pipeline failed")
|
||||
logger.info(f"Full pipeline endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail="Pipeline failed")
|
||||
|
||||
logger.info("Pipeline completed successfully")
|
||||
response = AnalysisResponse(
|
||||
message="Pipeline completed successfully"
|
||||
)
|
||||
logger.info(f"Full pipeline endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(f"Error in pipeline: {str(e)}")
|
||||
logger.info(f"Full pipeline endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.post("/load-data")
|
||||
async def load_data(source: str = "db", file: Optional[UploadFile] = File(None)):
|
||||
"""
|
||||
Load data from either database or CSV file.
|
||||
|
||||
Args:
|
||||
source (str): Source of data ('db' or 'csv')
|
||||
file (UploadFile, optional): CSV file to load (required if source is 'csv')
|
||||
|
||||
Returns:
|
||||
dict: Status of data loading
|
||||
"""
|
||||
start_time = time.time()
|
||||
try:
|
||||
if source not in ['db', 'csv']:
|
||||
logger.error(f"Invalid source: {source}")
|
||||
logger.info(f"Load data endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=400, detail="Source must be either 'db' or 'csv'")
|
||||
|
||||
if source == 'csv' and not file:
|
||||
logger.error("No file provided for CSV source")
|
||||
logger.info(f"Load data endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=400, detail="File must be provided when loading from CSV")
|
||||
|
||||
if source == 'csv':
|
||||
# Save uploaded file temporarily
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.csv') as temp_file:
|
||||
content = await file.read()
|
||||
temp_file.write(content)
|
||||
temp_file_path = temp_file.name
|
||||
|
||||
try:
|
||||
success = pipeline.load_data(source='csv', file_path=temp_file_path)
|
||||
finally:
|
||||
# Clean up temporary file
|
||||
os.unlink(temp_file_path)
|
||||
else:
|
||||
success = pipeline.load_data(source='db')
|
||||
|
||||
if not success:
|
||||
logger.error("Failed to load data")
|
||||
logger.info(f"Load data endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail="Failed to load data")
|
||||
|
||||
response = {
|
||||
"status": "success",
|
||||
"message": f"Successfully loaded {len(pipeline.df)} rows of data",
|
||||
"columns": pipeline.df.columns.tolist(),
|
||||
"row_count": len(pipeline.df)
|
||||
}
|
||||
logger.info(f"Load data endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading data: {str(e)}")
|
||||
logger.info(f"Load data endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
async def get_file_if_csv(source: str, file: Optional[UploadFile] = File(None)):
|
||||
"""Dependency to handle file upload only when source is csv."""
|
||||
if source == 'csv' and not file:
|
||||
raise HTTPException(status_code=400, detail="File must be provided when loading from CSV")
|
||||
return file
|
||||
|
||||
@app.post("/run/streaming-pipeline", response_model=List[BatchResponse])
|
||||
async def run_streaming_pipeline(
|
||||
source: str = "db",
|
||||
batch_size: int = 10000,
|
||||
file: Optional[Union[UploadFile, str]] = File(None)
|
||||
):
|
||||
"""
|
||||
Run the complete salary analytics pipeline in batches.
|
||||
|
||||
Args:
|
||||
source (str): Source of data ('db' or 'csv')
|
||||
batch_size (int): Number of rows to process in each batch
|
||||
file (UploadFile, optional): CSV file to load (required if source is 'csv')
|
||||
|
||||
Returns:
|
||||
List[BatchResponse]: List of responses for each batch processed
|
||||
"""
|
||||
start_time = time.time()
|
||||
try:
|
||||
if source not in ['db', 'csv']:
|
||||
logger.error(f"Invalid source: {source}")
|
||||
logger.info(f"Streaming pipeline endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=400, detail="Source must be either 'db' or 'csv'")
|
||||
|
||||
if source == 'csv' and not file:
|
||||
logger.error("No file provided for CSV source")
|
||||
logger.info(f"Streaming pipeline endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=400, detail="File must be provided when loading from CSV")
|
||||
|
||||
# Initialize data loader
|
||||
data_loader = DataLoader()
|
||||
data_loader.chunk_size = batch_size
|
||||
|
||||
# Create output directory for batch results
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
batch_output_dir = os.path.join(os.path.dirname(OUTPUT_PATHS['final_table']), f"batch_results_{timestamp}")
|
||||
os.makedirs(batch_output_dir, exist_ok=True)
|
||||
|
||||
# Initialize database operations
|
||||
if not data_loader.connect():
|
||||
logger.error("Failed to connect to database")
|
||||
logger.info(f"Streaming pipeline endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail="Failed to connect to database")
|
||||
|
||||
db_ops = DatabaseOperations(data_loader.engine)
|
||||
if not db_ops.create_batch_results_table():
|
||||
logger.error("Failed to create batch results table")
|
||||
logger.info(f"Streaming pipeline endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail="Failed to create batch results table")
|
||||
|
||||
responses = []
|
||||
batch_number = 0
|
||||
batch_start_time = time.time()
|
||||
|
||||
def preprocess_chunk(chunk):
|
||||
"""Preprocess a chunk of data with the same logic as DataLoader."""
|
||||
# Convert dates
|
||||
chunk['trx_start_date'] = pd.to_datetime(chunk['trx_start_date'])
|
||||
chunk['trx_end_date'] = pd.to_datetime(chunk['trx_end_date'])
|
||||
|
||||
# Rename columns
|
||||
chunk = chunk.rename(columns={
|
||||
'd1': 'trx_type',
|
||||
'd2': 'trx_subtype',
|
||||
'd3': 'initiated_by',
|
||||
'd4': 'customer_id'
|
||||
})
|
||||
|
||||
chunk = chunk.dropna()
|
||||
|
||||
return chunk
|
||||
|
||||
if source == 'csv':
|
||||
# Save uploaded file temporarily
|
||||
with tempfile.NamedTemporaryFile(delete=False, suffix='.csv') as temp_file:
|
||||
content = await file.read()
|
||||
temp_file.write(content)
|
||||
temp_file_path = temp_file.name
|
||||
|
||||
try:
|
||||
# Process CSV in chunks
|
||||
for chunk in pd.read_csv(temp_file_path, chunksize=batch_size):
|
||||
batch_number += 1
|
||||
logger.info(f"Processing batch {batch_number}")
|
||||
|
||||
# Preprocess chunk
|
||||
chunk = preprocess_chunk(chunk)
|
||||
|
||||
# Run pipeline on chunk
|
||||
pipeline = SalaryAnalyticsPipeline()
|
||||
pipeline.df = chunk
|
||||
|
||||
try:
|
||||
batch_start_time = time.time()
|
||||
# Run analyses
|
||||
pipeline.run_keyword_analysis()
|
||||
pipeline.run_consistent_amount_analysis()
|
||||
pipeline.run_transaction_type_analysis()
|
||||
|
||||
# Generate reports
|
||||
reports = pipeline.generate_salary_earner_reports()
|
||||
|
||||
# Add batch metadata to results
|
||||
results_df = reports['final_table'].copy()
|
||||
results_df['batch_number'] = batch_number
|
||||
results_df['total_batches'] = -1 # Unknown for CSV
|
||||
results_df['processed_at'] = datetime.now()
|
||||
|
||||
# Save batch results to CSV
|
||||
batch_results_path = os.path.join(batch_output_dir, f"batch_{batch_number}_results.csv")
|
||||
results_df.to_csv(batch_results_path, index=False)
|
||||
|
||||
# Save to database
|
||||
db_ops.save_batch_to_db(
|
||||
batch_number=batch_number,
|
||||
total_batches=-1, # Unknown for CSV
|
||||
results_df=results_df,
|
||||
status="success"
|
||||
)
|
||||
|
||||
logger.info(f"Batch {batch_number} processed in {time.time() - batch_start_time:.2f} seconds")
|
||||
|
||||
responses.append(BatchResponse(
|
||||
batch_number=batch_number,
|
||||
total_batches=-1, # Unknown for CSV
|
||||
processed_rows=len(chunk),
|
||||
results_path=batch_results_path,
|
||||
message=f"Successfully processed batch {batch_number}"
|
||||
))
|
||||
except Exception as e:
|
||||
error_message = str(e)
|
||||
logger.error(f"Error processing batch {batch_number}: {error_message}")
|
||||
|
||||
# Save error to database
|
||||
db_ops.save_batch_to_db(
|
||||
batch_number=batch_number,
|
||||
total_batches=-1,
|
||||
results_df=pd.DataFrame(), # Empty DataFrame for error case
|
||||
status="error"
|
||||
)
|
||||
|
||||
responses.append(BatchResponse(
|
||||
batch_number=batch_number,
|
||||
total_batches=-1,
|
||||
processed_rows=len(chunk),
|
||||
results_path="",
|
||||
message=f"Error processing batch {batch_number}: {error_message}"
|
||||
))
|
||||
finally:
|
||||
# Clean up temporary file
|
||||
os.unlink(temp_file_path)
|
||||
else:
|
||||
# Process database in chunks
|
||||
if not data_loader.connect():
|
||||
raise HTTPException(status_code=500, detail="Failed to connect to database")
|
||||
|
||||
# Get total row count
|
||||
with data_loader.engine.connect() as conn:
|
||||
count_query = text(f"SELECT COUNT(*) FROM {TABLE_NAME}")
|
||||
total_rows = conn.execute(count_query).scalar()
|
||||
|
||||
total_batches = (total_rows + batch_size - 1) // batch_size
|
||||
offset = 0
|
||||
|
||||
while offset < total_rows:
|
||||
batch_number += 1
|
||||
logger.info(f"Processing batch {batch_number} of {total_batches}")
|
||||
|
||||
# Load chunk from database
|
||||
query = f"SELECT * FROM {TABLE_NAME} LIMIT {batch_size} OFFSET {offset}"
|
||||
chunk = pd.read_sql(query, data_loader.engine)
|
||||
|
||||
if chunk.empty:
|
||||
break
|
||||
|
||||
# Preprocess chunk
|
||||
chunk = preprocess_chunk(chunk)
|
||||
|
||||
# Run pipeline on chunk
|
||||
pipeline = SalaryAnalyticsPipeline()
|
||||
pipeline.df = chunk
|
||||
|
||||
try:
|
||||
batch_start_time = time.time()
|
||||
# Run analyses
|
||||
pipeline.run_keyword_analysis()
|
||||
pipeline.run_consistent_amount_analysis()
|
||||
pipeline.run_transaction_type_analysis()
|
||||
|
||||
# Generate reports
|
||||
reports = pipeline.generate_salary_earner_reports()
|
||||
|
||||
# Add batch metadata to results
|
||||
results_df = reports['final_table'].copy()
|
||||
results_df['batch_number'] = batch_number
|
||||
results_df['total_batches'] = total_batches
|
||||
results_df['processed_at'] = datetime.now()
|
||||
|
||||
# Save batch results to CSV
|
||||
batch_results_path = os.path.join(batch_output_dir, f"batch_{batch_number}_results.csv")
|
||||
results_df.to_csv(batch_results_path, index=False)
|
||||
|
||||
# Save to database
|
||||
db_ops.save_batch_to_db(
|
||||
batch_number=batch_number,
|
||||
total_batches=total_batches,
|
||||
results_df=results_df,
|
||||
status="success"
|
||||
)
|
||||
|
||||
logger.info(f"Batch {batch_number} of {total_batches} processed in {time.time() - batch_start_time:.2f} seconds")
|
||||
|
||||
responses.append(BatchResponse(
|
||||
batch_number=batch_number,
|
||||
total_batches=total_batches,
|
||||
processed_rows=len(chunk),
|
||||
results_path=batch_results_path,
|
||||
message=f"Successfully processed batch {batch_number} of {total_batches}"
|
||||
))
|
||||
except Exception as e:
|
||||
error_message = str(e)
|
||||
logger.error(f"Error processing batch {batch_number}: {error_message}")
|
||||
|
||||
# Save error to database
|
||||
db_ops.save_batch_to_db(
|
||||
batch_number=batch_number,
|
||||
total_batches=total_batches,
|
||||
results_df=pd.DataFrame(), # Empty DataFrame for error case
|
||||
status="error"
|
||||
)
|
||||
|
||||
responses.append(BatchResponse(
|
||||
batch_number=batch_number,
|
||||
total_batches=total_batches,
|
||||
processed_rows=len(chunk),
|
||||
results_path="",
|
||||
message=f"Error processing batch {batch_number}: {error_message}"
|
||||
))
|
||||
|
||||
offset += batch_size
|
||||
|
||||
logger.info(f"Streaming pipeline endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return responses
|
||||
except Exception as e:
|
||||
logger.error(f"Error in streaming pipeline: {str(e)}")
|
||||
logger.info(f"Streaming pipeline endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
@@ -0,0 +1,72 @@
|
||||
"""
|
||||
Configuration settings for the salary analytics package.
|
||||
"""
|
||||
|
||||
import os
|
||||
from dotenv import load_dotenv
|
||||
|
||||
# Load environment variables
|
||||
load_dotenv()
|
||||
|
||||
# Base directories
|
||||
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
OUTPUT_DIR = os.path.join(BASE_DIR, "output")
|
||||
PLOTS_DIR = os.path.join(OUTPUT_DIR, "plots")
|
||||
CSV_DIR = os.path.join(OUTPUT_DIR, "csv")
|
||||
MODEL_DIR = os.path.join(OUTPUT_DIR, "models")
|
||||
|
||||
# Create directories if they don't exist
|
||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
os.makedirs(PLOTS_DIR, exist_ok=True)
|
||||
os.makedirs(CSV_DIR, exist_ok=True)
|
||||
os.makedirs(MODEL_DIR, exist_ok=True)
|
||||
|
||||
# Database Configuration
|
||||
DB_CONFIG = {
|
||||
"user": os.getenv("DB_USER", "salaryloan"), # Default value as fallback
|
||||
"password": os.getenv("DB_PASSWORD", "salaryloan"),
|
||||
"name": os.getenv("DB_NAME", "salaryloan"),
|
||||
"port": os.getenv("DB_PORT", "10532"),
|
||||
"host": os.getenv("DB_HOST", "dev-data.simbrellang.net")
|
||||
}
|
||||
|
||||
# Table Configuration
|
||||
TABLE_NAME = "customer_account_transaction_hx"
|
||||
BATCH_RESULTS_TABLE = "salary_analytics_batch_results"
|
||||
|
||||
# Salary Keywords
|
||||
SALARY_KEYWORDS = [
|
||||
"salary", "payroll", "income", "wage", "wages",
|
||||
"earnings", "earning", "monthly pay", "net pay", "gross pay", "compensation",
|
||||
"monthlypay", "netpay", "grosspay",
|
||||
"remuneration", "stipend", "allowance", "bonus", "commission",
|
||||
"pension", "retirement", "dividend", "benefits", "reimbursement",
|
||||
"overtime", "incentive", "paycheck", "paycheque", "salary advance",
|
||||
"monthly income", "income tax refund", "employer deposit",
|
||||
"payroll deposit", "salary credit", "income credit", "salary transfer",
|
||||
"income transfer", "salary received", "income received", "hr deposit",
|
||||
"company deposit", "employer payment", "employee payment",
|
||||
"sal",
|
||||
]
|
||||
|
||||
# Model Configuration
|
||||
MODEL_CONFIG = {
|
||||
"cv_threshold": 0.10,
|
||||
"min_transactions": 3,
|
||||
"threshold": 0.7,
|
||||
"high_earner_threshold": 10000
|
||||
}
|
||||
|
||||
# File Paths
|
||||
OUTPUT_PATHS = {
|
||||
"high_earner_details": os.path.join(CSV_DIR, "high_earner_details.csv"),
|
||||
"likely_salary_earner": os.path.join(CSV_DIR, "likely_salary_earner.csv"),
|
||||
"final_table": os.path.join(CSV_DIR, "final_table.csv"),
|
||||
"consistent_earners_plot": os.path.join(PLOTS_DIR, "consistent_earners_predictions.png"),
|
||||
"inconsistent_earners_plot": os.path.join(PLOTS_DIR, "inconsistent_earners_predictions.png"),
|
||||
"hypothesis_overlap_plot": os.path.join(PLOTS_DIR, "hypothesis_overlap.png"),
|
||||
"consistent_model": os.path.join(MODEL_DIR, "consistent_model.joblib"),
|
||||
"inconsistent_model": os.path.join(MODEL_DIR, "inconsistent_model.joblib"),
|
||||
"consistent_scaler": os.path.join(MODEL_DIR, "consistent_scaler.joblib"),
|
||||
"inconsistent_scaler": os.path.join(MODEL_DIR, "inconsistent_scaler.joblib")
|
||||
}
|
||||
@@ -0,0 +1,64 @@
|
||||
"""
|
||||
Consistent amount transaction analysis module.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from .config import MODEL_CONFIG
|
||||
|
||||
class ConsistentAmountAnalyzer:
|
||||
def __init__(self, df):
|
||||
self.df = df
|
||||
self.const_df = None
|
||||
|
||||
def calculate_coefficient_of_variation(self, group):
|
||||
"""Calculate coefficient of variation for a group of transactions."""
|
||||
amounts = group[group['initiated_by'] == 'C']['amount']
|
||||
mean = amounts.mean()
|
||||
std = amounts.std(ddof=0)
|
||||
|
||||
if mean == 0:
|
||||
return float('nan')
|
||||
return std / mean
|
||||
|
||||
def flag_consistent_amounts(self, group, cv_threshold=None):
|
||||
"""Flag accounts with low variance in transaction amounts."""
|
||||
if cv_threshold is None:
|
||||
cv_threshold = MODEL_CONFIG['cv_threshold']
|
||||
|
||||
filtered_group = group[group['initiated_by'] == 'C']
|
||||
cv = self.calculate_coefficient_of_variation(filtered_group)
|
||||
is_consistent = cv <= cv_threshold if not pd.isna(cv) else False
|
||||
|
||||
return pd.Series(
|
||||
[is_consistent] * len(group),
|
||||
index=group.index,
|
||||
name='is_consistent_amount'
|
||||
)
|
||||
|
||||
def identify_consistent_amount_accounts(self, cv_threshold=None):
|
||||
"""Identify accounts with consistent transaction amounts."""
|
||||
if cv_threshold is None:
|
||||
cv_threshold = MODEL_CONFIG['cv_threshold']
|
||||
|
||||
# Create a copy of the original DataFrame
|
||||
self.const_df = self.df.copy()
|
||||
|
||||
# Calculate consistent amount flags
|
||||
consistent_flags = self.const_df.groupby('accountid').apply(
|
||||
lambda group: self.flag_consistent_amounts(group, cv_threshold)
|
||||
).reset_index(level=0, drop=True)
|
||||
|
||||
# Add the flags to the original DataFrame
|
||||
self.const_df['is_consistent_amount'] = consistent_flags
|
||||
|
||||
return self.const_df
|
||||
|
||||
def get_consistent_amount_data(self):
|
||||
"""Get transactions identified as having consistent amounts."""
|
||||
if self.const_df is None:
|
||||
self.identify_consistent_amount_accounts()
|
||||
|
||||
return self.const_df[
|
||||
(self.const_df['is_consistent_amount']) &
|
||||
(self.const_df['initiated_by'] == 'C')
|
||||
]
|
||||
@@ -0,0 +1,170 @@
|
||||
"""
|
||||
Data loading and preprocessing module.
|
||||
"""
|
||||
|
||||
from sqlalchemy import create_engine, text
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
import logging
|
||||
import os
|
||||
from .config import DB_CONFIG, TABLE_NAME
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DataLoader:
|
||||
def __init__(self):
|
||||
self.engine = None
|
||||
self.df = None
|
||||
self.chunk_size = 10000 # Load 10,000 rows at a time
|
||||
|
||||
def connect(self):
|
||||
"""Establish database connection."""
|
||||
try:
|
||||
logger.info("Attempting to connect to database...")
|
||||
DATABASE_URL = f"postgresql://{DB_CONFIG['user']}:{DB_CONFIG['password']}@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['name']}"
|
||||
self.engine = create_engine(DATABASE_URL)
|
||||
with self.engine.connect() as conn:
|
||||
# First check if table exists
|
||||
check_table = text(f"SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = '{TABLE_NAME}')")
|
||||
table_exists = conn.execute(check_table).scalar()
|
||||
|
||||
if not table_exists:
|
||||
logger.error(f"Table {TABLE_NAME} does not exist in the database")
|
||||
return False
|
||||
|
||||
# Get row count
|
||||
count_query = text(f"SELECT COUNT(*) FROM {TABLE_NAME}")
|
||||
row_count = conn.execute(count_query).scalar()
|
||||
logger.info(f"Table {TABLE_NAME} exists with {row_count} rows")
|
||||
|
||||
# Get version
|
||||
result = conn.execute(text("SELECT version();"))
|
||||
logger.info("Connected successfully to database!")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Error connecting to database: {str(e)}")
|
||||
return False
|
||||
|
||||
def load_from_csv(self, file_path):
|
||||
"""Load data from a CSV file."""
|
||||
try:
|
||||
logger.info(f"Loading data from CSV file: {file_path}")
|
||||
if not os.path.exists(file_path):
|
||||
logger.error(f"CSV file not found: {file_path}")
|
||||
return None
|
||||
|
||||
# Load data in chunks
|
||||
chunks = []
|
||||
for chunk in pd.read_csv(file_path, chunksize=self.chunk_size):
|
||||
# Preprocess chunk
|
||||
chunk['trx_start_date'] = pd.to_datetime(chunk['trx_start_date'])
|
||||
chunk['trx_end_date'] = pd.to_datetime(chunk['trx_end_date'])
|
||||
|
||||
# Rename columns if needed
|
||||
if 'd1' in chunk.columns:
|
||||
chunk = chunk.rename(columns={
|
||||
'd1': 'trx_type',
|
||||
'd2': 'trx_subtype',
|
||||
'd3': 'initiated_by',
|
||||
'd4': 'customer_id'
|
||||
})
|
||||
|
||||
chunk = chunk.dropna()
|
||||
chunks.append(chunk)
|
||||
|
||||
# Combine all chunks
|
||||
self.df = pd.concat(chunks, ignore_index=True)
|
||||
logger.info(f"Successfully loaded {len(self.df)} rows from CSV")
|
||||
|
||||
# Basic data validation
|
||||
logger.info("Performing data validation...")
|
||||
logger.info(f"Columns in dataset: {self.df.columns.tolist()}")
|
||||
logger.info(f"Data types:\n{self.df.dtypes}")
|
||||
logger.info(f"Missing values:\n{self.df.isnull().sum()}")
|
||||
|
||||
return self.df
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading data from CSV: {str(e)}")
|
||||
return None
|
||||
|
||||
def load_from_db(self):
|
||||
"""Load and preprocess transaction data from database in chunks."""
|
||||
if not self.engine:
|
||||
logger.info("No database connection. Attempting to connect...")
|
||||
if not self.connect():
|
||||
logger.error("Failed to establish database connection")
|
||||
return None
|
||||
|
||||
try:
|
||||
logger.info(f"Loading data from table: {TABLE_NAME}")
|
||||
|
||||
# First get total count
|
||||
with self.engine.connect() as conn:
|
||||
count_query = text(f"SELECT COUNT(*) FROM {TABLE_NAME}")
|
||||
total_rows = conn.execute(count_query).scalar()
|
||||
logger.info(f"Total rows to process: {total_rows}")
|
||||
|
||||
# Load data in chunks
|
||||
chunks = []
|
||||
offset = 0
|
||||
|
||||
while True:
|
||||
logger.info(f"Loading chunk starting at offset {offset}")
|
||||
query = f"SELECT * FROM {TABLE_NAME} LIMIT {self.chunk_size} OFFSET {offset}"
|
||||
chunk = pd.read_sql(query, self.engine)
|
||||
|
||||
if chunk.empty:
|
||||
break
|
||||
|
||||
# Preprocess chunk
|
||||
chunk['trx_start_date'] = pd.to_datetime(chunk['trx_start_date'])
|
||||
chunk['trx_end_date'] = pd.to_datetime(chunk['trx_end_date'])
|
||||
|
||||
# Rename columns
|
||||
chunk = chunk.rename(columns={
|
||||
'd1': 'trx_type',
|
||||
'd2': 'trx_subtype',
|
||||
'd3': 'initiated_by',
|
||||
'd4': 'customer_id'
|
||||
})
|
||||
|
||||
chunk = chunk.dropna()
|
||||
chunks.append(chunk)
|
||||
offset += self.chunk_size
|
||||
|
||||
if offset >= total_rows:
|
||||
break
|
||||
|
||||
# Combine all chunks
|
||||
self.df = pd.concat(chunks, ignore_index=True)
|
||||
logger.info(f"Successfully loaded {len(self.df)} rows of data")
|
||||
|
||||
# Basic data validation
|
||||
logger.info("Performing data validation...")
|
||||
logger.info(f"Columns in dataset: {self.df.columns.tolist()}")
|
||||
logger.info(f"Data types:\n{self.df.dtypes}")
|
||||
logger.info(f"Missing values:\n{self.df.isnull().sum()}")
|
||||
|
||||
return self.df
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading data: {str(e)}")
|
||||
return None
|
||||
|
||||
def load_data(self, source='db', file_path=None):
|
||||
"""Load data from either database or CSV file."""
|
||||
if source == 'db':
|
||||
return self.load_from_db()
|
||||
elif source == 'csv':
|
||||
if not file_path:
|
||||
logger.error("File path must be provided when loading from CSV")
|
||||
return None
|
||||
return self.load_from_csv(file_path)
|
||||
else:
|
||||
logger.error(f"Invalid source: {source}. Must be 'db' or 'csv'")
|
||||
return None
|
||||
|
||||
def get_data(self):
|
||||
"""Get the loaded DataFrame."""
|
||||
if self.df is None:
|
||||
logger.warning("No data loaded. Call load_data() first.")
|
||||
return self.df
|
||||
@@ -0,0 +1,137 @@
|
||||
"""
|
||||
Database operations module for salary analytics.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from sqlalchemy import text
|
||||
from .config import BATCH_RESULTS_TABLE
|
||||
from datetime import datetime
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class DatabaseOperations:
|
||||
def __init__(self, engine):
|
||||
"""Initialize with SQLAlchemy engine."""
|
||||
self.engine = engine
|
||||
|
||||
def create_batch_results_table(self):
|
||||
"""Create the batch results table if it doesn't exist."""
|
||||
try:
|
||||
with self.engine.connect() as conn:
|
||||
# Check if table exists
|
||||
check_table = text(f"SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = '{BATCH_RESULTS_TABLE}')")
|
||||
table_exists = conn.execute(check_table).scalar()
|
||||
|
||||
if not table_exists:
|
||||
# Create table
|
||||
create_table = text(f"""
|
||||
CREATE TABLE {BATCH_RESULTS_TABLE} (
|
||||
id SERIAL PRIMARY KEY,
|
||||
batch_number INTEGER,
|
||||
total_batches INTEGER,
|
||||
processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
||||
accountid TEXT,
|
||||
num_months INTEGER,
|
||||
least_inflow_6m DECIMAL,
|
||||
avg_monthly_salary DECIMAL,
|
||||
estimated_next_amount DECIMAL,
|
||||
estimated_next_date DATE,
|
||||
is_45day_salary BOOLEAN,
|
||||
is_2months_salary BOOLEAN,
|
||||
status TEXT
|
||||
)
|
||||
""")
|
||||
conn.execute(create_table)
|
||||
conn.commit()
|
||||
logger.info(f"Created table {BATCH_RESULTS_TABLE}")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Error creating batch results table: {str(e)}")
|
||||
return False
|
||||
|
||||
def save_batch_to_db(self, batch_number, total_batches, results_df, status="success"):
|
||||
"""Save batch processing results to database."""
|
||||
try:
|
||||
with self.engine.connect() as conn:
|
||||
# Add batch metadata to the DataFrame
|
||||
results_df['batch_number'] = batch_number
|
||||
results_df['total_batches'] = total_batches
|
||||
results_df['processed_at'] = datetime.now()
|
||||
|
||||
# Convert DataFrame to list of dictionaries
|
||||
records = results_df.to_dict('records')
|
||||
|
||||
# Insert each record
|
||||
for record in records:
|
||||
insert_query = text(f"""
|
||||
INSERT INTO {BATCH_RESULTS_TABLE}
|
||||
(batch_number, total_batches, processed_at, accountid, num_months,
|
||||
least_inflow_6m, avg_monthly_salary, estimated_next_amount,
|
||||
estimated_next_date, is_45day_salary, is_2months_salary, status)
|
||||
VALUES
|
||||
(:batch_number, :total_batches, :processed_at, :accountid, :num_months,
|
||||
:least_inflow_6m, :avg_monthly_salary, :estimated_next_amount,
|
||||
:estimated_next_date, :is_45day_salary, :is_2months_salary, :status)
|
||||
""")
|
||||
|
||||
# Convert boolean columns to proper format
|
||||
record['is_45day_salary'] = record.get('45daysalary', False)
|
||||
record['is_2months_salary'] = record.get('2monthssalary', False)
|
||||
|
||||
# Add status
|
||||
record['status'] = status
|
||||
|
||||
conn.execute(insert_query, record)
|
||||
|
||||
conn.commit()
|
||||
logger.info(f"Successfully saved batch {batch_number} results to database")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Error saving batch {batch_number} to database: {str(e)}")
|
||||
return False
|
||||
|
||||
def get_batch_status(self, batch_number):
|
||||
"""Get the status of a specific batch."""
|
||||
try:
|
||||
with self.engine.connect() as conn:
|
||||
query = text(f"""
|
||||
SELECT
|
||||
batch_number,
|
||||
total_batches,
|
||||
processed_at,
|
||||
COUNT(*) as total_records,
|
||||
SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) as successful_records,
|
||||
SUM(CASE WHEN status = 'error' THEN 1 ELSE 0 END) as failed_records
|
||||
FROM {BATCH_RESULTS_TABLE}
|
||||
WHERE batch_number = :batch_number
|
||||
GROUP BY batch_number, total_batches, processed_at
|
||||
ORDER BY processed_at DESC
|
||||
LIMIT 1
|
||||
""")
|
||||
result = conn.execute(query, {"batch_number": batch_number}).fetchone()
|
||||
return dict(result) if result else None
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting batch {batch_number} status: {str(e)}")
|
||||
return None
|
||||
|
||||
def get_all_batches(self):
|
||||
"""Get all batch processing results."""
|
||||
try:
|
||||
with self.engine.connect() as conn:
|
||||
query = text(f"""
|
||||
SELECT
|
||||
batch_number,
|
||||
total_batches,
|
||||
processed_at,
|
||||
COUNT(*) as total_records,
|
||||
SUM(CASE WHEN status = 'success' THEN 1 ELSE 0 END) as successful_records,
|
||||
SUM(CASE WHEN status = 'error' THEN 1 ELSE 0 END) as failed_records
|
||||
FROM {BATCH_RESULTS_TABLE}
|
||||
GROUP BY batch_number, total_batches, processed_at
|
||||
ORDER BY batch_number
|
||||
""")
|
||||
results = conn.execute(query).fetchall()
|
||||
return [dict(row) for row in results]
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting all batches: {str(e)}")
|
||||
return []
|
||||
@@ -0,0 +1,47 @@
|
||||
"""
|
||||
Keyword-based salary transaction analysis module.
|
||||
"""
|
||||
|
||||
import re
|
||||
import pandas as pd
|
||||
from .config import SALARY_KEYWORDS
|
||||
|
||||
class KeywordAnalyzer:
|
||||
def __init__(self, df):
|
||||
self.df = df
|
||||
self.desc_df = None
|
||||
|
||||
def identify_salary_transactions(self):
|
||||
"""
|
||||
Identifies potential salary-related transactions based on keywords
|
||||
and month-year patterns in the 'description' column.
|
||||
"""
|
||||
month_year_patterns = [
|
||||
r"\b(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)\s?\d{2,4}\b",
|
||||
r"\b(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)\s?\d{2,4}\b"
|
||||
]
|
||||
|
||||
escaped_keywords = [re.escape(keyword.lower()) for keyword in SALARY_KEYWORDS]
|
||||
combined_pattern = (
|
||||
r'\b(?:' + '|'.join(escaped_keywords) + r')\b|' +
|
||||
'|'.join(month_year_patterns)
|
||||
)
|
||||
|
||||
self.df['is_salary_related'] = self.df['description'].str.lower().str.contains(
|
||||
combined_pattern,
|
||||
na=False,
|
||||
regex=True
|
||||
)
|
||||
|
||||
self.desc_df = self.df.copy()
|
||||
return self.df
|
||||
|
||||
def get_salary_related_data(self):
|
||||
"""Get transactions identified as salary-related."""
|
||||
if self.desc_df is None:
|
||||
self.identify_salary_transactions()
|
||||
|
||||
return self.desc_df[
|
||||
(self.desc_df['is_salary_related'] == True) &
|
||||
(self.desc_df['initiated_by'] == 'C')
|
||||
]
|
||||
@@ -0,0 +1,154 @@
|
||||
"""
|
||||
Main module for running the salary analytics pipeline.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from .data_loader import DataLoader
|
||||
from .keyword_analyzer import KeywordAnalyzer
|
||||
from .consistent_amount_analyzer import ConsistentAmountAnalyzer
|
||||
from .transaction_type_analyzer import TransactionTypeAnalyzer
|
||||
from .salary_earner_analyzer import SalaryEarnerAnalyzer
|
||||
from .salary_predictor import SalaryPredictor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class SalaryAnalyticsPipeline:
|
||||
def __init__(self):
|
||||
logger.info("Initializing SalaryAnalyticsPipeline")
|
||||
self.data_loader = None
|
||||
self.df = None
|
||||
self.keyword_analyzer = None
|
||||
self.consistent_amount_analyzer = None
|
||||
self.transaction_type_analyzer = None
|
||||
self.salary_earner_analyzer = None
|
||||
self.salary_predictor = None
|
||||
|
||||
def load_data(self, source='db', file_path=None):
|
||||
"""Load and preprocess the transaction data."""
|
||||
logger.info("Starting data loading process")
|
||||
self.data_loader = DataLoader()
|
||||
self.df = self.data_loader.load_data(source=source, file_path=file_path)
|
||||
if self.df is not None:
|
||||
logger.info(f"Successfully loaded data with {len(self.df)} rows")
|
||||
else:
|
||||
logger.error("Failed to load data")
|
||||
return self.df is not None
|
||||
|
||||
def run_keyword_analysis(self):
|
||||
"""Run keyword-based salary transaction analysis."""
|
||||
if self.df is None:
|
||||
logger.error("Data not loaded. Call load_data() first.")
|
||||
raise ValueError("Data not loaded. Call load_data() first.")
|
||||
|
||||
logger.info("Starting keyword analysis")
|
||||
self.keyword_analyzer = KeywordAnalyzer(self.df)
|
||||
self.keyword_analyzer.identify_salary_transactions()
|
||||
keyword_data = self.keyword_analyzer.get_salary_related_data()
|
||||
|
||||
# Update main DataFrame with keyword analysis results
|
||||
self.df['is_salary_related'] = self.df.index.isin(keyword_data.index)
|
||||
return keyword_data
|
||||
|
||||
def run_consistent_amount_analysis(self):
|
||||
"""Run consistent amount transaction analysis."""
|
||||
if self.df is None:
|
||||
logger.error("Data not loaded. Call load_data() first.")
|
||||
raise ValueError("Data not loaded. Call load_data() first.")
|
||||
|
||||
logger.info("Starting consistent amount analysis")
|
||||
self.consistent_amount_analyzer = ConsistentAmountAnalyzer(self.df)
|
||||
self.consistent_amount_analyzer.identify_consistent_amount_accounts()
|
||||
consistent_data = self.consistent_amount_analyzer.get_consistent_amount_data()
|
||||
|
||||
# Update main DataFrame with consistent amount analysis results
|
||||
self.df['is_consistent_amount'] = self.df.index.isin(consistent_data.index)
|
||||
return consistent_data
|
||||
|
||||
def run_transaction_type_analysis(self):
|
||||
"""Run transaction type analysis."""
|
||||
if self.df is None:
|
||||
logger.error("Data not loaded. Call load_data() first.")
|
||||
raise ValueError("Data not loaded. Call load_data() first.")
|
||||
|
||||
logger.info("Starting transaction type analysis")
|
||||
self.transaction_type_analyzer = TransactionTypeAnalyzer(self.df)
|
||||
self.transaction_type_analyzer.flag_salary_type_transactions()
|
||||
type_data = self.transaction_type_analyzer.get_salary_type_data()
|
||||
|
||||
# Update main DataFrame with transaction type analysis results
|
||||
self.df['is_salary_type'] = self.df.index.isin(type_data.index)
|
||||
return type_data
|
||||
|
||||
def generate_salary_earner_reports(self):
|
||||
"""Generate salary earner reports."""
|
||||
if self.df is None:
|
||||
logger.error("Data not loaded. Call load_data() first.")
|
||||
raise ValueError("Data not loaded. Call load_data() first.")
|
||||
|
||||
# Ensure all analysis flags are present
|
||||
required_columns = ['is_salary_related', 'is_consistent_amount', 'is_salary_type']
|
||||
missing_columns = [col for col in required_columns if col not in self.df.columns]
|
||||
|
||||
if missing_columns:
|
||||
logger.error(f"Missing required columns: {missing_columns}")
|
||||
raise ValueError(f"Missing required columns: {missing_columns}. Run all analyses first.")
|
||||
|
||||
logger.info("Starting salary earner report generation")
|
||||
self.salary_earner_analyzer = SalaryEarnerAnalyzer(self.df)
|
||||
return self.salary_earner_analyzer.generate_reports()
|
||||
|
||||
def train_salary_prediction_models(self):
|
||||
"""Train salary prediction models."""
|
||||
if self.df is None:
|
||||
logger.error("Data not loaded. Call load_data() first.")
|
||||
raise ValueError("Data not loaded. Call load_data() first.")
|
||||
|
||||
logger.info("Starting model training")
|
||||
self.salary_predictor = SalaryPredictor(self.df)
|
||||
|
||||
# Get accounts from the salary earner analyzer
|
||||
if self.salary_earner_analyzer is None:
|
||||
logger.info("Salary earner analyzer not initialized. Generating reports first.")
|
||||
self.generate_salary_earner_reports()
|
||||
|
||||
consistent_accounts = self.salary_earner_analyzer.final_table['accountid'].unique()
|
||||
inconsistent_accounts = self.salary_earner_analyzer.likely_salary_earner['accountid'].unique()
|
||||
|
||||
self.salary_predictor.train_and_evaluate(consistent_accounts, inconsistent_accounts)
|
||||
|
||||
def run_full_pipeline(self, source='db', file_path=None):
|
||||
"""Run the complete salary analytics pipeline."""
|
||||
logger.info("Starting full pipeline execution")
|
||||
if not self.load_data(source=source, file_path=file_path):
|
||||
logger.error("Failed to load data. Exiting pipeline.")
|
||||
return False
|
||||
|
||||
try:
|
||||
logger.info("Running keyword analysis...")
|
||||
self.run_keyword_analysis()
|
||||
|
||||
logger.info("Running consistent amount analysis...")
|
||||
self.run_consistent_amount_analysis()
|
||||
|
||||
logger.info("Running transaction type analysis...")
|
||||
self.run_transaction_type_analysis()
|
||||
|
||||
logger.info("Generating salary earner reports...")
|
||||
self.generate_salary_earner_reports()
|
||||
|
||||
logger.info("Training salary prediction models...")
|
||||
self.train_salary_prediction_models()
|
||||
|
||||
logger.info("Pipeline completed successfully!")
|
||||
return True
|
||||
except Exception as e:
|
||||
logger.error(f"Pipeline failed: {str(e)}")
|
||||
return False
|
||||
|
||||
def main():
|
||||
"""Main function to run the salary analytics pipeline."""
|
||||
pipeline = SalaryAnalyticsPipeline()
|
||||
pipeline.run_full_pipeline()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,176 @@
|
||||
"""
|
||||
Salary earner analysis and report generation module.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib_venn import venn3
|
||||
from datetime import datetime, timedelta
|
||||
import logging
|
||||
from .config import MODEL_CONFIG, OUTPUT_PATHS
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class SalaryEarnerAnalyzer:
|
||||
def __init__(self, df):
|
||||
self.df = df
|
||||
self.final_table = None
|
||||
self.likely_salary_earner = None
|
||||
self.high_earner_details = None
|
||||
|
||||
def filter_venn_section(self, **kwargs):
|
||||
"""Filter accounts based on specified combinations of hypothesis flags."""
|
||||
valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}
|
||||
df1 = self.df[self.df['initiated_by'] == 'C'].copy()
|
||||
|
||||
invalid_keys = set(kwargs.keys()) - valid_columns
|
||||
if invalid_keys:
|
||||
raise ValueError(f"Invalid keys: {invalid_keys}. Valid keys are {valid_columns}.")
|
||||
|
||||
condition = pd.Series([True] * len(df1), index=df1.index)
|
||||
for key, value in kwargs.items():
|
||||
condition &= (df1[key] == value)
|
||||
|
||||
filtered_df = df1[condition]
|
||||
|
||||
# Drop any rows with NaN values in critical columns
|
||||
critical_cols = ['accountid', 'trx_start_date', 'amount']
|
||||
filtered_df = filtered_df.dropna(subset=critical_cols)
|
||||
|
||||
return filtered_df
|
||||
|
||||
def plot_hypothesis_overlap(self, hypothesis1_df, hypothesis3_df, hypothesis4_df, account_col='accountid'):
|
||||
"""Plot and save Venn diagram showing overlap between hypotheses."""
|
||||
set2 = set(hypothesis3_df[account_col][hypothesis3_df['is_consistent_amount']])
|
||||
set3 = set(hypothesis1_df[account_col][hypothesis1_df['is_salary_related']])
|
||||
set4 = set(hypothesis4_df[account_col][hypothesis4_df['is_salary_type']])
|
||||
|
||||
plt.figure(figsize=(10, 10))
|
||||
venn3([set2, set3, set4], set_labels=('Consistent Amount',
|
||||
'Salary Description', 'Transaction Type'))
|
||||
plt.title('Overlap Between Hypotheses')
|
||||
plt.savefig(OUTPUT_PATHS['hypothesis_overlap_plot'])
|
||||
plt.close()
|
||||
|
||||
def generate_salary_earners_table(self, all_three_hypotheses):
|
||||
"""Generate a table of salary earners with their metrics."""
|
||||
results = []
|
||||
for accountid, group in all_three_hypotheses.groupby('accountid'):
|
||||
# Skip if group is empty
|
||||
if group.empty:
|
||||
continue
|
||||
|
||||
# Calculate required metrics
|
||||
num_months = len(group)
|
||||
|
||||
# Handle last 6 months calculation
|
||||
last_6_months = group[group['trx_start_date'] >= (datetime.now() - timedelta(days=180))]
|
||||
if last_6_months.empty:
|
||||
least_inflow = 0
|
||||
else:
|
||||
least_inflow = last_6_months['amount'].min()
|
||||
|
||||
# Handle average salary calculation
|
||||
if group['amount'].notna().any():
|
||||
avg_salary = group['amount'].mean()
|
||||
else:
|
||||
avg_salary = 0
|
||||
|
||||
# Calculate days_since_last_trx with NaN handling
|
||||
group['days_since_last_trx'] = group['trx_start_date'].diff().dt.days
|
||||
median_interval = group['days_since_last_trx'].median()
|
||||
if pd.isna(median_interval):
|
||||
median_interval = 30 # Default to 30 days if no interval data
|
||||
|
||||
last_date = group['trx_start_date'].max()
|
||||
next_date = last_date + timedelta(days=median_interval)
|
||||
next_amount = avg_salary
|
||||
|
||||
# Boolean flags with NaN handling
|
||||
days_since_last = (datetime.now() - last_date).days
|
||||
has_45d = days_since_last <= 45
|
||||
has_2m = len(group[group['trx_start_date'] >= (datetime.now() - timedelta(days=60))]) >= 2
|
||||
|
||||
results.append({
|
||||
'accountid': accountid,
|
||||
'num_months': num_months,
|
||||
'least_inflow_6m': least_inflow,
|
||||
'avg_monthly_salary': avg_salary,
|
||||
'estimated_next_amount': next_amount,
|
||||
'estimated_next_date': next_date,
|
||||
'45daysalary': has_45d,
|
||||
'2monthssalary': has_2m
|
||||
})
|
||||
|
||||
final_df = pd.DataFrame(results)
|
||||
# Drop rows where all numeric columns are NaN
|
||||
numeric_cols = ['num_months', 'least_inflow_6m', 'avg_monthly_salary', 'estimated_next_amount']
|
||||
final_df = final_df.dropna(subset=numeric_cols, how='all')
|
||||
return final_df
|
||||
|
||||
def analyze_salary_earners(self, final_df):
|
||||
"""Analyze salary earners and identify high earners."""
|
||||
high_earners = final_df[final_df['estimated_next_amount'] >= MODEL_CONFIG['high_earner_threshold']].copy()
|
||||
high_earner_details = high_earners[['accountid', 'least_inflow_6m']].reset_index(drop=True)
|
||||
count_high = len(high_earners)
|
||||
|
||||
return high_earner_details, count_high
|
||||
|
||||
def generate_reports(self):
|
||||
"""Generate all salary earner reports."""
|
||||
# Get accounts flagged by all three hypotheses
|
||||
all_three_hypotheses = self.filter_venn_section(
|
||||
is_salary_related=True,
|
||||
is_consistent_amount=True,
|
||||
is_salary_type=True
|
||||
)
|
||||
|
||||
# Generate final table
|
||||
self.final_table = self.generate_salary_earners_table(all_three_hypotheses)
|
||||
logger.info(f"Found {self.final_table['accountid'].nunique()} verified salary earners")
|
||||
|
||||
# Generate likely salary earner table
|
||||
green_section = self.filter_venn_section(
|
||||
is_salary_related=True,
|
||||
is_consistent_amount=False,
|
||||
is_salary_type=True
|
||||
)
|
||||
|
||||
yellow_section = self.filter_venn_section(
|
||||
is_salary_related=False,
|
||||
is_consistent_amount=True,
|
||||
is_salary_type=True
|
||||
)
|
||||
|
||||
self.likely_salary_earner = pd.concat([yellow_section, green_section])
|
||||
self.likely_salary_earner = self.likely_salary_earner.drop_duplicates(subset=['id'])
|
||||
self.likely_salary_earner = self.generate_salary_earners_table(self.likely_salary_earner)
|
||||
logger.info(f"Found {self.likely_salary_earner['accountid'].nunique()} likely salary earners")
|
||||
|
||||
# Analyze high earners
|
||||
self.high_earner_details, total_high_earners = self.analyze_salary_earners(self.final_table)
|
||||
logger.info(f"\nTotal High Earners: {total_high_earners}")
|
||||
|
||||
# Plot hypothesis overlap
|
||||
self.plot_hypothesis_overlap(
|
||||
self.df[self.df['is_salary_related']],
|
||||
self.df[self.df['is_consistent_amount']],
|
||||
self.df[self.df['is_salary_type']]
|
||||
)
|
||||
|
||||
# Save reports
|
||||
self.high_earner_details.to_csv(OUTPUT_PATHS['high_earner_details'], index=False)
|
||||
self.likely_salary_earner.to_csv(OUTPUT_PATHS['likely_salary_earner'], index=False)
|
||||
self.final_table.to_csv(OUTPUT_PATHS['final_table'], index=False)
|
||||
|
||||
return {
|
||||
'final_table': self.final_table,
|
||||
'likely_salary_earner': self.likely_salary_earner,
|
||||
'high_earner_details': self.high_earner_details,
|
||||
'total_high_earners': total_high_earners
|
||||
}
|
||||
@@ -0,0 +1,171 @@
|
||||
"""
|
||||
Salary prediction module using machine learning.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.preprocessing import StandardScaler, OneHotEncoder
|
||||
from sklearn.ensemble import RandomForestRegressor
|
||||
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
|
||||
from joblib import dump
|
||||
from .config import OUTPUT_PATHS
|
||||
|
||||
class SalaryPredictor:
|
||||
def __init__(self, df):
|
||||
self.df = df
|
||||
self.model_cons = None
|
||||
self.model_incons = None
|
||||
self.scaler_cons = None
|
||||
self.scaler_incons = None
|
||||
|
||||
def add_feature_engineering(self, df):
|
||||
"""Engineer features for salary prediction."""
|
||||
df['month'] = df['trx_start_date'].dt.month
|
||||
df['month_seq'] = df.groupby(['accountid', 'month']).ngroup() + 1
|
||||
|
||||
# Categorical encoding
|
||||
encoder = OneHotEncoder(sparse_output=False)
|
||||
encoded_trx_type = encoder.fit_transform(df[['trx_type']])
|
||||
encoded_df = pd.DataFrame(encoded_trx_type, columns=encoder.get_feature_names_out(['trx_type']))
|
||||
df = pd.concat([df, encoded_df], axis=1)
|
||||
|
||||
# Rolling statistics
|
||||
df = df.sort_values(['accountid', 'trx_start_date'])
|
||||
df['rolling_sum_3m'] = df.groupby('accountid')['amount'].rolling(window=3,
|
||||
min_periods=1).sum().reset_index(0, drop=True)
|
||||
df['rolling_avg_3m'] = df.groupby('accountid')['amount'].rolling(window=3,
|
||||
min_periods=1).mean().reset_index(0, drop=True)
|
||||
|
||||
return df
|
||||
|
||||
def prepare_data(self, df_transactions, accounts):
|
||||
"""Prepare data for training and testing."""
|
||||
df_filtered = df_transactions[df_transactions['accountid'].isin(accounts)].copy()
|
||||
print(f"Filtered data for {len(accounts)} accounts.")
|
||||
print(f"Total transactions: {len(df_filtered)}")
|
||||
|
||||
# Drop unnecessary columns
|
||||
df_filtered = df_filtered.drop(['description', 'id', 'customer_id',
|
||||
'trx_end_date', 'is_salary_related',
|
||||
'is_consistent_amount', 'is_salary_type'], axis=1)
|
||||
|
||||
# Add feature engineering
|
||||
df_filtered = self.add_feature_engineering(df_filtered)
|
||||
|
||||
# Aggregate monthly data
|
||||
agg_funcs = {
|
||||
'amount': 'mean',
|
||||
'rolling_sum_3m': 'last',
|
||||
'rolling_avg_3m': 'last',
|
||||
'month': 'first'
|
||||
}
|
||||
encoded_cols = [col for col in df_filtered.columns if col.startswith('trx_type_')]
|
||||
for col in encoded_cols:
|
||||
agg_funcs[col] = 'sum'
|
||||
|
||||
monthly_data = df_filtered.groupby(['accountid', 'month_seq']).agg(agg_funcs).reset_index()
|
||||
|
||||
# Filter accounts with at least 12 months
|
||||
account_month_counts = monthly_data.groupby('accountid')['month_seq'].max()
|
||||
valid_accounts = account_month_counts[account_month_counts >= 12].index
|
||||
monthly_data = monthly_data[monthly_data['accountid'].isin(valid_accounts)]
|
||||
|
||||
# Create sequences
|
||||
X_train, y_train, X_test, y_test = [], [], [], []
|
||||
feature_cols = ['accountid', 'amount', 'rolling_sum_3m', 'rolling_avg_3m',
|
||||
'month'] + encoded_cols
|
||||
|
||||
for account in valid_accounts:
|
||||
account_data = monthly_data[monthly_data['accountid'] == account].sort_values('month_seq')
|
||||
|
||||
if len(account_data) >= 12:
|
||||
for t in range(5, 8):
|
||||
X_train.append(account_data.iloc[t-5:t][feature_cols].values.flatten())
|
||||
y_train.append(account_data['amount'].iloc[t])
|
||||
for t in range(8, 12):
|
||||
X_test.append(account_data.iloc[t-5:t][feature_cols].values.flatten())
|
||||
y_test.append(account_data['amount'].iloc[t])
|
||||
else:
|
||||
print(f"Skipping account {account} due to insufficient data (less than 12 months).")
|
||||
|
||||
return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)
|
||||
|
||||
def train_model(self, X_train, y_train, X_test, y_test):
|
||||
"""Train and evaluate a Random Forest model."""
|
||||
# Scale features
|
||||
scaler = StandardScaler()
|
||||
X_train_scaled = scaler.fit_transform(X_train)
|
||||
X_test_scaled = scaler.transform(X_test)
|
||||
|
||||
# Train model
|
||||
model = RandomForestRegressor(n_estimators=100, random_state=42)
|
||||
model.fit(X_train_scaled, y_train)
|
||||
|
||||
# Evaluate
|
||||
y_pred = model.predict(X_test_scaled)
|
||||
mae = mean_absolute_error(y_test, y_pred)
|
||||
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
|
||||
r2 = r2_score(y_test, y_pred)
|
||||
print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R-squared: {r2:.2f}")
|
||||
|
||||
return model, scaler
|
||||
|
||||
def plot_predictions(self, y_test, y_pred, title, output_path):
|
||||
"""Plot actual vs predicted values and save to file."""
|
||||
plt.figure(figsize=(10, 5))
|
||||
plt.scatter(y_test, y_pred, alpha=0.5)
|
||||
plt.xlabel("Actual Salary")
|
||||
plt.ylabel("Predicted Salary")
|
||||
plt.title(title)
|
||||
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--')
|
||||
plt.savefig(output_path)
|
||||
plt.close()
|
||||
|
||||
def train_and_evaluate(self, consistent_accounts, inconsistent_accounts):
|
||||
"""Train and evaluate models for both consistent and inconsistent salary earners."""
|
||||
# Train model for consistent salary earners
|
||||
X_train_cons, y_train_cons, X_test_cons, y_test_cons = self.prepare_data(self.df, consistent_accounts)
|
||||
if len(X_train_cons) > 0:
|
||||
self.model_cons, self.scaler_cons = self.train_model(X_train_cons, y_train_cons, X_test_cons, y_test_cons)
|
||||
print("Model trained for consistent salary earners.")
|
||||
|
||||
# Save model and scaler
|
||||
dump(self.model_cons, OUTPUT_PATHS['consistent_model'])
|
||||
dump(self.scaler_cons, OUTPUT_PATHS['consistent_scaler'])
|
||||
print("Saved consistent salary earner model and scaler.")
|
||||
|
||||
# Plot predictions
|
||||
X_test_cons_scaled = self.scaler_cons.transform(X_test_cons)
|
||||
y_pred = self.model_cons.predict(X_test_cons_scaled)
|
||||
self.plot_predictions(
|
||||
y_test_cons,
|
||||
y_pred,
|
||||
"Actual vs. Predicted Salary (Consistent Earners)",
|
||||
OUTPUT_PATHS['consistent_earners_plot']
|
||||
)
|
||||
else:
|
||||
print("No accounts with sufficient data for consistent salary earners.")
|
||||
|
||||
# Train model for inconsistent salary earners
|
||||
X_train_incons, y_train_incons, X_test_incons, y_test_incons = self.prepare_data(self.df, inconsistent_accounts)
|
||||
if len(X_train_incons) > 0:
|
||||
print("\nTraining model for inconsistent salary earners...")
|
||||
self.model_incons, self.scaler_incons = self.train_model(X_train_incons, y_train_incons, X_test_incons, y_test_incons)
|
||||
|
||||
# Save model and scaler
|
||||
dump(self.model_incons, OUTPUT_PATHS['inconsistent_model'])
|
||||
dump(self.scaler_incons, OUTPUT_PATHS['inconsistent_scaler'])
|
||||
print("Saved inconsistent salary earner model and scaler.")
|
||||
|
||||
# Plot predictions
|
||||
X_test_incons_scaled = self.scaler_incons.transform(X_test_incons)
|
||||
y_pred = self.model_incons.predict(X_test_incons_scaled)
|
||||
self.plot_predictions(
|
||||
y_test_incons,
|
||||
y_pred,
|
||||
"Actual vs. Predicted Salary (Inconsistent Earners)",
|
||||
OUTPUT_PATHS['inconsistent_earners_plot']
|
||||
)
|
||||
else:
|
||||
print("No accounts with sufficient data for inconsistent salary earners.")
|
||||
@@ -0,0 +1,43 @@
|
||||
"""
|
||||
Transaction type analysis module.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
from .config import MODEL_CONFIG
|
||||
|
||||
class TransactionTypeAnalyzer:
|
||||
def __init__(self, df):
|
||||
self.df = df
|
||||
self.trx_df = None
|
||||
|
||||
def flag_salary_type_transactions(self):
|
||||
"""Flag transactions that match salary criteria based on type and subtype."""
|
||||
self.df['is_salary_type'] = (
|
||||
((self.df['trx_type'] == 'T') | (self.df['trx_type'] == 'C')) &
|
||||
((self.df['trx_subtype'] == 'BI') | (self.df['trx_subtype'] == 'I') |
|
||||
(self.df['trx_subtype'] == 'BS') | (self.df['trx_subtype'] == 'CI')) &
|
||||
(self.df['initiated_by'] == 'C') &
|
||||
(self.df['amount'] > 0)
|
||||
)
|
||||
|
||||
self.trx_df = self.df.copy()
|
||||
return self.df
|
||||
|
||||
def is_salary_earner_by_type(self, group, min_transactions=None, threshold=None):
|
||||
"""Determine if an account likely belongs to a salary earner."""
|
||||
if min_transactions is None:
|
||||
min_transactions = MODEL_CONFIG['min_transactions']
|
||||
if threshold is None:
|
||||
threshold = MODEL_CONFIG['threshold']
|
||||
|
||||
if len(group) < min_transactions:
|
||||
return False
|
||||
valid_ratio = group['is_salary_type'].mean()
|
||||
return valid_ratio >= threshold
|
||||
|
||||
def get_salary_type_data(self):
|
||||
"""Get transactions identified as salary type."""
|
||||
if self.trx_df is None:
|
||||
self.flag_salary_type_transactions()
|
||||
|
||||
return self.trx_df[self.trx_df['is_salary_type']]
|
||||
Reference in New Issue
Block a user