Enhance salary analytics API with database operations and performance logging

- Introduced `DatabaseOperations` class for managing batch results in the database.
- Added functionality to create a batch results table and save batch processing results.
- Updated API endpoints to log execution time and handle batch processing errors more effectively.
- Improved response handling in analysis endpoints and added batch metadata to results.
- Suppressed NumPy `RuntimeWarning`s and pandas chained-assignment warnings, and improved logging throughout the application.
This commit is contained in:
+149
-20
@@ -13,12 +13,16 @@ import logging
|
||||
import pandas as pd
|
||||
import tempfile
|
||||
from datetime import datetime
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy import text, Table, Column, Integer, String, Float, DateTime, MetaData
|
||||
import numpy as np
|
||||
import warnings
|
||||
import time
|
||||
from .main import SalaryAnalyticsPipeline
|
||||
from .config import OUTPUT_PATHS, TABLE_NAME
|
||||
from .config import OUTPUT_PATHS, TABLE_NAME, BATCH_RESULTS_TABLE
|
||||
from .data_loader import DataLoader
|
||||
from .salary_predictor import SalaryPredictor
|
||||
from .salary_earner_analyzer import SalaryEarnerAnalyzer
|
||||
from .db_operations import DatabaseOperations
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
@@ -27,6 +31,10 @@ logging.basicConfig(
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Suppress warnings
|
||||
warnings.filterwarnings('ignore', category=RuntimeWarning, module='numpy')
|
||||
pd.options.mode.chained_assignment = None
|
||||
|
||||
app = FastAPI(
|
||||
title="Salary Analytics API",
|
||||
description="API for analyzing and predicting salary patterns from transaction data",
|
||||
@@ -96,72 +104,91 @@ async def startup_event():
|
||||
@app.get("/")
|
||||
async def root():
|
||||
"""Root endpoint."""
|
||||
start_time = time.time()
|
||||
logger.info("Root endpoint accessed")
|
||||
return {"message": "Welcome to Salary Analytics API"}
|
||||
response = {"message": "Welcome to Salary Analytics API"}
|
||||
logger.info(f"Root endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
|
||||
@app.get("/health")
|
||||
async def health_check():
|
||||
"""Health check endpoint."""
|
||||
start_time = time.time()
|
||||
logger.info("Health check endpoint accessed")
|
||||
return {"status": "healthy"}
|
||||
response = {"status": "healthy"}
|
||||
logger.info(f"Health check completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
|
||||
@app.post("/analyze/keyword", response_model=AnalysisResponse)
|
||||
async def analyze_keyword():
|
||||
"""Run keyword-based salary transaction analysis."""
|
||||
start_time = time.time()
|
||||
try:
|
||||
check_data_loaded()
|
||||
logger.info("Starting keyword analysis...")
|
||||
data = pipeline.run_keyword_analysis()
|
||||
logger.info(f"Keyword analysis completed. Found {len(data)} matches")
|
||||
return AnalysisResponse(
|
||||
response = AnalysisResponse(
|
||||
message="Keyword analysis completed successfully",
|
||||
data={"count": len(data)}
|
||||
)
|
||||
logger.info(f"Keyword analysis endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(f"Error in keyword analysis: {str(e)}")
|
||||
logger.info(f"Keyword analysis endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.post("/analyze/consistent-amount", response_model=AnalysisResponse)
|
||||
async def analyze_consistent_amount():
|
||||
"""Run consistent amount transaction analysis."""
|
||||
start_time = time.time()
|
||||
try:
|
||||
check_data_loaded()
|
||||
logger.info("Starting consistent amount analysis...")
|
||||
data = pipeline.run_consistent_amount_analysis()
|
||||
logger.info(f"Consistent amount analysis completed. Found {len(data)} matches")
|
||||
return AnalysisResponse(
|
||||
response = AnalysisResponse(
|
||||
message="Consistent amount analysis completed successfully",
|
||||
data={"count": len(data)}
|
||||
)
|
||||
logger.info(f"Consistent amount analysis endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(f"Error in consistent amount analysis: {str(e)}")
|
||||
logger.info(f"Consistent amount analysis endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.post("/analyze/transaction-type", response_model=AnalysisResponse)
|
||||
async def analyze_transaction_type():
|
||||
"""Run transaction type analysis."""
|
||||
start_time = time.time()
|
||||
try:
|
||||
check_data_loaded()
|
||||
logger.info("Starting transaction type analysis...")
|
||||
data = pipeline.run_transaction_type_analysis()
|
||||
logger.info(f"Transaction type analysis completed. Found {len(data)} matches")
|
||||
return AnalysisResponse(
|
||||
response = AnalysisResponse(
|
||||
message="Transaction type analysis completed successfully",
|
||||
data={"count": len(data)}
|
||||
)
|
||||
logger.info(f"Transaction type analysis endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(f"Error in transaction type analysis: {str(e)}")
|
||||
logger.info(f"Transaction type analysis endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.post("/generate/reports", response_model=AnalysisResponse)
|
||||
async def generate_reports(background_tasks: BackgroundTasks):
|
||||
"""Generate salary earner reports."""
|
||||
start_time = time.time()
|
||||
try:
|
||||
check_data_loaded()
|
||||
logger.info("Starting report generation...")
|
||||
reports = pipeline.generate_salary_earner_reports()
|
||||
logger.info("Reports generated successfully")
|
||||
return AnalysisResponse(
|
||||
response = AnalysisResponse(
|
||||
message="Reports generated successfully",
|
||||
data={
|
||||
"verified_salary_earners": len(reports['final_table']),
|
||||
@@ -169,28 +196,36 @@ async def generate_reports(background_tasks: BackgroundTasks):
|
||||
"high_earners": reports['total_high_earners']
|
||||
}
|
||||
)
|
||||
logger.info(f"Report generation endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(f"Error in report generation: {str(e)}")
|
||||
logger.info(f"Report generation endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.post("/train/models", response_model=AnalysisResponse)
|
||||
async def train_models():
|
||||
"""Train salary prediction models."""
|
||||
start_time = time.time()
|
||||
try:
|
||||
check_data_loaded()
|
||||
logger.info("Starting model training...")
|
||||
pipeline.train_salary_prediction_models()
|
||||
logger.info("Models trained successfully")
|
||||
return AnalysisResponse(
|
||||
response = AnalysisResponse(
|
||||
message="Models trained successfully"
|
||||
)
|
||||
logger.info(f"Model training endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(f"Error in model training: {str(e)}")
|
||||
logger.info(f"Model training endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.get("/download/{report_type}")
|
||||
async def download_report(report_type: str):
|
||||
"""Download generated reports."""
|
||||
start_time = time.time()
|
||||
try:
|
||||
check_data_loaded()
|
||||
logger.info(f"Attempting to download report: {report_type}")
|
||||
@@ -205,40 +240,50 @@ async def download_report(report_type: str):
|
||||
|
||||
if report_type not in file_paths:
|
||||
logger.error(f"Report type not found: {report_type}")
|
||||
logger.info(f"Download endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=404, detail="Report type not found")
|
||||
|
||||
file_path = file_paths[report_type]
|
||||
if not os.path.exists(file_path):
|
||||
logger.error(f"Report file not found: {file_path}")
|
||||
logger.info(f"Download endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=404, detail="Report file not found")
|
||||
|
||||
logger.info(f"Successfully found report file: {file_path}")
|
||||
return FileResponse(
|
||||
response = FileResponse(
|
||||
path=file_path,
|
||||
filename=os.path.basename(file_path),
|
||||
media_type="application/octet-stream"
|
||||
)
|
||||
logger.info(f"Download endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(f"Error downloading report: {str(e)}")
|
||||
logger.info(f"Download endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.post("/run/pipeline", response_model=AnalysisResponse)
|
||||
async def run_full_pipeline():
|
||||
"""Run the complete salary analytics pipeline."""
|
||||
start_time = time.time()
|
||||
try:
|
||||
check_data_loaded()
|
||||
logger.info("Starting full pipeline...")
|
||||
success = pipeline.run_full_pipeline()
|
||||
if not success:
|
||||
logger.error("Pipeline failed")
|
||||
logger.info(f"Full pipeline endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail="Pipeline failed")
|
||||
|
||||
logger.info("Pipeline completed successfully")
|
||||
return AnalysisResponse(
|
||||
response = AnalysisResponse(
|
||||
message="Pipeline completed successfully"
|
||||
)
|
||||
logger.info(f"Full pipeline endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(f"Error in pipeline: {str(e)}")
|
||||
logger.info(f"Full pipeline endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@app.post("/load-data")
|
||||
@@ -253,11 +298,16 @@ async def load_data(source: str = "db", file: Optional[UploadFile] = File(None))
|
||||
Returns:
|
||||
dict: Status of data loading
|
||||
"""
|
||||
start_time = time.time()
|
||||
try:
|
||||
if source not in ['db', 'csv']:
|
||||
logger.error(f"Invalid source: {source}")
|
||||
logger.info(f"Load data endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=400, detail="Source must be either 'db' or 'csv'")
|
||||
|
||||
if source == 'csv' and not file:
|
||||
logger.error("No file provided for CSV source")
|
||||
logger.info(f"Load data endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=400, detail="File must be provided when loading from CSV")
|
||||
|
||||
if source == 'csv':
|
||||
@@ -276,16 +326,21 @@ async def load_data(source: str = "db", file: Optional[UploadFile] = File(None))
|
||||
success = pipeline.load_data(source='db')
|
||||
|
||||
if not success:
|
||||
logger.error("Failed to load data")
|
||||
logger.info(f"Load data endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail="Failed to load data")
|
||||
|
||||
return {
|
||||
response = {
|
||||
"status": "success",
|
||||
"message": f"Successfully loaded {len(pipeline.df)} rows of data",
|
||||
"columns": pipeline.df.columns.tolist(),
|
||||
"row_count": len(pipeline.df)
|
||||
}
|
||||
logger.info(f"Load data endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return response
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading data: {str(e)}")
|
||||
logger.info(f"Load data endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
async def get_file_if_csv(source: str, file: Optional[UploadFile] = File(None)):
|
||||
@@ -311,11 +366,16 @@ async def run_streaming_pipeline(
|
||||
Returns:
|
||||
List[BatchResponse]: List of responses for each batch processed
|
||||
"""
|
||||
start_time = time.time()
|
||||
try:
|
||||
if source not in ['db', 'csv']:
|
||||
logger.error(f"Invalid source: {source}")
|
||||
logger.info(f"Streaming pipeline endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=400, detail="Source must be either 'db' or 'csv'")
|
||||
|
||||
if source == 'csv' and not file:
|
||||
logger.error("No file provided for CSV source")
|
||||
logger.info(f"Streaming pipeline endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=400, detail="File must be provided when loading from CSV")
|
||||
|
||||
# Initialize data loader
|
||||
@@ -327,8 +387,21 @@ async def run_streaming_pipeline(
|
||||
batch_output_dir = os.path.join(os.path.dirname(OUTPUT_PATHS['final_table']), f"batch_results_{timestamp}")
|
||||
os.makedirs(batch_output_dir, exist_ok=True)
|
||||
|
||||
# Initialize database operations
|
||||
if not data_loader.connect():
|
||||
logger.error("Failed to connect to database")
|
||||
logger.info(f"Streaming pipeline endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail="Failed to connect to database")
|
||||
|
||||
db_ops = DatabaseOperations(data_loader.engine)
|
||||
if not db_ops.create_batch_results_table():
|
||||
logger.error("Failed to create batch results table")
|
||||
logger.info(f"Streaming pipeline endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail="Failed to create batch results table")
|
||||
|
||||
responses = []
|
||||
batch_number = 0
|
||||
batch_start_time = time.time()
|
||||
|
||||
def preprocess_chunk(chunk):
|
||||
"""Preprocess a chunk of data with the same logic as DataLoader."""
|
||||
@@ -369,6 +442,7 @@ async def run_streaming_pipeline(
|
||||
pipeline.df = chunk
|
||||
|
||||
try:
|
||||
batch_start_time = time.time()
|
||||
# Run analyses
|
||||
pipeline.run_keyword_analysis()
|
||||
pipeline.run_consistent_amount_analysis()
|
||||
@@ -377,9 +451,25 @@ async def run_streaming_pipeline(
|
||||
# Generate reports
|
||||
reports = pipeline.generate_salary_earner_reports()
|
||||
|
||||
# Save batch results
|
||||
# Add batch metadata to results
|
||||
results_df = reports['final_table'].copy()
|
||||
results_df['batch_number'] = batch_number
|
||||
results_df['total_batches'] = -1 # Unknown for CSV
|
||||
results_df['processed_at'] = datetime.now()
|
||||
|
||||
# Save batch results to CSV
|
||||
batch_results_path = os.path.join(batch_output_dir, f"batch_{batch_number}_results.csv")
|
||||
reports['final_table'].to_csv(batch_results_path, index=False)
|
||||
results_df.to_csv(batch_results_path, index=False)
|
||||
|
||||
# Save to database
|
||||
db_ops.save_batch_to_db(
|
||||
batch_number=batch_number,
|
||||
total_batches=-1, # Unknown for CSV
|
||||
results_df=results_df,
|
||||
status="success"
|
||||
)
|
||||
|
||||
logger.info(f"Batch {batch_number} processed in {time.time() - batch_start_time:.2f} seconds")
|
||||
|
||||
responses.append(BatchResponse(
|
||||
batch_number=batch_number,
|
||||
@@ -389,13 +479,23 @@ async def run_streaming_pipeline(
|
||||
message=f"Successfully processed batch {batch_number}"
|
||||
))
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing batch {batch_number}: {str(e)}")
|
||||
error_message = str(e)
|
||||
logger.error(f"Error processing batch {batch_number}: {error_message}")
|
||||
|
||||
# Save error to database
|
||||
db_ops.save_batch_to_db(
|
||||
batch_number=batch_number,
|
||||
total_batches=-1,
|
||||
results_df=pd.DataFrame(), # Empty DataFrame for error case
|
||||
status="error"
|
||||
)
|
||||
|
||||
responses.append(BatchResponse(
|
||||
batch_number=batch_number,
|
||||
total_batches=-1,
|
||||
processed_rows=len(chunk),
|
||||
results_path="",
|
||||
message=f"Error processing batch {batch_number}: {str(e)}"
|
||||
message=f"Error processing batch {batch_number}: {error_message}"
|
||||
))
|
||||
finally:
|
||||
# Clean up temporary file
|
||||
@@ -432,6 +532,7 @@ async def run_streaming_pipeline(
|
||||
pipeline.df = chunk
|
||||
|
||||
try:
|
||||
batch_start_time = time.time()
|
||||
# Run analyses
|
||||
pipeline.run_keyword_analysis()
|
||||
pipeline.run_consistent_amount_analysis()
|
||||
@@ -440,9 +541,25 @@ async def run_streaming_pipeline(
|
||||
# Generate reports
|
||||
reports = pipeline.generate_salary_earner_reports()
|
||||
|
||||
# Save batch results
|
||||
# Add batch metadata to results
|
||||
results_df = reports['final_table'].copy()
|
||||
results_df['batch_number'] = batch_number
|
||||
results_df['total_batches'] = total_batches
|
||||
results_df['processed_at'] = datetime.now()
|
||||
|
||||
# Save batch results to CSV
|
||||
batch_results_path = os.path.join(batch_output_dir, f"batch_{batch_number}_results.csv")
|
||||
reports['final_table'].to_csv(batch_results_path, index=False)
|
||||
results_df.to_csv(batch_results_path, index=False)
|
||||
|
||||
# Save to database
|
||||
db_ops.save_batch_to_db(
|
||||
batch_number=batch_number,
|
||||
total_batches=total_batches,
|
||||
results_df=results_df,
|
||||
status="success"
|
||||
)
|
||||
|
||||
logger.info(f"Batch {batch_number} of {total_batches} processed in {time.time() - batch_start_time:.2f} seconds")
|
||||
|
||||
responses.append(BatchResponse(
|
||||
batch_number=batch_number,
|
||||
@@ -452,18 +569,30 @@ async def run_streaming_pipeline(
|
||||
message=f"Successfully processed batch {batch_number} of {total_batches}"
|
||||
))
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing batch {batch_number}: {str(e)}")
|
||||
error_message = str(e)
|
||||
logger.error(f"Error processing batch {batch_number}: {error_message}")
|
||||
|
||||
# Save error to database
|
||||
db_ops.save_batch_to_db(
|
||||
batch_number=batch_number,
|
||||
total_batches=total_batches,
|
||||
results_df=pd.DataFrame(), # Empty DataFrame for error case
|
||||
status="error"
|
||||
)
|
||||
|
||||
responses.append(BatchResponse(
|
||||
batch_number=batch_number,
|
||||
total_batches=total_batches,
|
||||
processed_rows=len(chunk),
|
||||
results_path="",
|
||||
message=f"Error processing batch {batch_number}: {str(e)}"
|
||||
message=f"Error processing batch {batch_number}: {error_message}"
|
||||
))
|
||||
|
||||
offset += batch_size
|
||||
|
||||
logger.info(f"Streaming pipeline endpoint completed in {time.time() - start_time:.2f} seconds")
|
||||
return responses
|
||||
except Exception as e:
|
||||
logger.error(f"Error in streaming pipeline: {str(e)}")
|
||||
logger.info(f"Streaming pipeline endpoint failed after {time.time() - start_time:.2f} seconds")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
Reference in New Issue
Block a user