Implement streaming pipeline endpoint for batch processing
- Added `/run/streaming-pipeline` endpoint to process data in batches from either a database or a CSV file.
- Introduced `BatchResponse` model for structured responses.
- Updated README with new endpoint details, including parameters and example usage.
- Enhanced error handling and logging during batch processing.
- Ensured data preprocessing and NaN handling in analysis functions.
This commit is contained in:
+180
-3
@@ -6,15 +6,16 @@ from fastapi import FastAPI, HTTPException, BackgroundTasks, UploadFile, File
|
||||
from fastapi.responses import FileResponse
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from pydantic import BaseModel
|
||||
from typing import Optional, Dict
|
||||
from typing import Optional, Dict, List
|
||||
import os
|
||||
import socket
|
||||
import logging
|
||||
import pandas as pd
|
||||
import tempfile
|
||||
|
||||
from datetime import datetime
|
||||
from sqlalchemy import text
|
||||
from .main import SalaryAnalyticsPipeline
|
||||
from .config import OUTPUT_PATHS
|
||||
from .config import OUTPUT_PATHS, TABLE_NAME
|
||||
from .data_loader import DataLoader
|
||||
from .salary_predictor import SalaryPredictor
|
||||
from .salary_earner_analyzer import SalaryEarnerAnalyzer
|
||||
@@ -56,6 +57,14 @@ class AnalysisResponse(BaseModel):
|
||||
data: Optional[Dict] = None
|
||||
file_path: Optional[str] = None
|
||||
|
||||
class BatchResponse(BaseModel):
    """Outcome of a single batch handled by the streaming pipeline endpoint."""

    # 1-based position of this batch within the run.
    batch_number: int
    # Number of batches expected for the run; the endpoint reports -1 when
    # the total is unknown (CSV input).
    total_batches: int
    # Row count of the batch after preprocessing.
    processed_rows: int
    # CSV file holding this batch's final table; the endpoint sets this to
    # an empty string when the batch failed.
    results_path: str
    # Human-readable success or error description for the batch.
    message: str
|
||||
|
||||
def check_data_loaded():
|
||||
"""Check if data is loaded before running analytics."""
|
||||
if pipeline.df is None:
|
||||
@@ -277,4 +286,172 @@ async def load_data(source: str = "db", file: UploadFile = None):
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Error loading data: {str(e)}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
# Number of upload bytes copied to disk per read, so that a large CSV upload
# is never held in memory in one piece.
_UPLOAD_COPY_BYTES = 1024 * 1024


def _preprocess_chunk(chunk):
    """Parse the date columns, rename d1-d4 and drop incomplete rows.

    The original author's note says this mirrors DataLoader's preprocessing;
    that cannot be confirmed from this block alone.

    Args:
        chunk (pd.DataFrame): Raw rows read from the CSV file or the database.

    Returns:
        pd.DataFrame: The cleaned chunk (rows containing any NaN are removed).
    """
    chunk['trx_start_date'] = pd.to_datetime(chunk['trx_start_date'])
    chunk['trx_end_date'] = pd.to_datetime(chunk['trx_end_date'])

    chunk = chunk.rename(columns={
        'd1': 'trx_type',
        'd2': 'trx_subtype',
        'd3': 'initiated_by',
        'd4': 'customer_id'
    })
    return chunk.dropna()


def _process_batch(chunk, batch_number, total_batches, batch_output_dir):
    """Run the analyses on one raw chunk and describe the outcome.

    Preprocessing and pipeline construction happen outside the per-batch
    ``try`` on purpose: a failure there (e.g. a missing column) propagates to
    the caller and aborts the run, exactly as before. Only failures of the
    analyses, report generation or result writing are turned into an error
    response so that the remaining batches can still be processed.

    Args:
        chunk (pd.DataFrame): Raw rows of this batch.
        batch_number (int): 1-based number of this batch.
        total_batches (int): Expected number of batches, or -1 if unknown.
        batch_output_dir (str): Directory that receives the batch result CSV.

    Returns:
        BatchResponse: Success or error description for this batch.
    """
    chunk = _preprocess_chunk(chunk)

    pipeline = SalaryAnalyticsPipeline()
    pipeline.df = chunk

    # Only mention the total in the success message when it is actually known.
    of_total = f" of {total_batches}" if total_batches >= 0 else ""

    try:
        pipeline.run_keyword_analysis()
        pipeline.run_consistent_amount_analysis()
        pipeline.run_transaction_type_analysis()

        reports = pipeline.generate_salary_earner_reports()

        batch_results_path = os.path.join(batch_output_dir, f"batch_{batch_number}_results.csv")
        reports['final_table'].to_csv(batch_results_path, index=False)

        return BatchResponse(
            batch_number=batch_number,
            total_batches=total_batches,
            processed_rows=len(chunk),
            results_path=batch_results_path,
            message=f"Successfully processed batch {batch_number}{of_total}"
        )
    except Exception as e:
        logger.error(f"Error processing batch {batch_number}: {str(e)}")
        return BatchResponse(
            batch_number=batch_number,
            total_batches=total_batches,
            processed_rows=len(chunk),
            results_path="",
            message=f"Error processing batch {batch_number}: {str(e)}"
        )


@app.post("/run/streaming-pipeline", response_model=List[BatchResponse])
async def run_streaming_pipeline(source: str = "db", file: UploadFile = None, batch_size: int = 10000):
    """
    Run the complete salary analytics pipeline in batches.

    Each batch is preprocessed, analysed and written to its own CSV file in a
    timestamped ``batch_results_*`` directory next to the configured final
    table output. A batch whose analysis fails is reported in the response
    list instead of aborting the whole run.

    Args:
        source (str): Source of data ('db' or 'csv')
        file (UploadFile): CSV file to load (required if source is 'csv')
        batch_size (int): Number of rows to process in each batch (must be > 0)

    Returns:
        List[BatchResponse]: List of responses for each batch processed

    Raises:
        HTTPException: 400 for an invalid ``source``, a missing ``file`` or a
            non-positive ``batch_size``; 500 if the database connection fails
            or an unexpected error occurs outside per-batch processing.
    """
    # Validate before entering the try block so that client errors are
    # reported as 400 rather than being re-wrapped as 500 below.
    if source not in ('db', 'csv'):
        raise HTTPException(status_code=400, detail="Source must be either 'db' or 'csv'")

    if source == 'csv' and not file:
        raise HTTPException(status_code=400, detail="File must be provided when loading from CSV")

    # A zero batch size divides by zero / is rejected by pandas, and a
    # negative one never advances the database offset towards the end.
    if batch_size <= 0:
        raise HTTPException(status_code=400, detail="batch_size must be a positive integer")

    try:
        # Initialize data loader
        data_loader = DataLoader()
        data_loader.chunk_size = batch_size

        # Create output directory for batch results
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        batch_output_dir = os.path.join(os.path.dirname(OUTPUT_PATHS['final_table']), f"batch_results_{timestamp}")
        os.makedirs(batch_output_dir, exist_ok=True)

        responses = []
        batch_number = 0

        if source == 'csv':
            # delete=False because pandas reopens the file by path; the
            # finally clause below is responsible for removing it.
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv')
            temp_file_path = temp_file.name
            try:
                # Spool the upload to disk piece by piece.
                with temp_file:
                    while True:
                        data = await file.read(_UPLOAD_COPY_BYTES)
                        if not data:
                            break
                        temp_file.write(data)

                # Process CSV in chunks; the total number of batches is not
                # known up front, which is reported as -1.
                for chunk in pd.read_csv(temp_file_path, chunksize=batch_size):
                    batch_number += 1
                    logger.info(f"Processing batch {batch_number}")
                    responses.append(_process_batch(chunk, batch_number, -1, batch_output_dir))
            finally:
                # Clean up temporary file, also when spooling or parsing failed
                os.unlink(temp_file_path)
        else:
            # Process database in chunks
            if not data_loader.connect():
                raise HTTPException(status_code=500, detail="Failed to connect to database")

            # Get total row count
            with data_loader.engine.connect() as conn:
                count_query = text(f"SELECT COUNT(*) FROM {TABLE_NAME}")
                total_rows = conn.execute(count_query).scalar()

            total_batches = (total_rows + batch_size - 1) // batch_size
            offset = 0

            while offset < total_rows:
                batch_number += 1
                logger.info(f"Processing batch {batch_number} of {total_batches}")

                # NOTE(review): LIMIT/OFFSET without ORDER BY does not
                # guarantee a stable row order between queries, so batches may
                # overlap or skip rows on some databases - consider ordering
                # by the table's key column.
                query = f"SELECT * FROM {TABLE_NAME} LIMIT {batch_size} OFFSET {offset}"
                chunk = pd.read_sql(query, data_loader.engine)

                # Rows may have been removed since the count was taken.
                if chunk.empty:
                    break

                responses.append(_process_batch(chunk, batch_number, total_batches, batch_output_dir))
                offset += batch_size

        return responses
    except HTTPException:
        # Keep the intended status code and detail instead of re-wrapping.
        raise
    except Exception as e:
        logger.error(f"Error in streaming pipeline: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
|
||||
Reference in New Issue
Block a user