Implement streaming pipeline endpoint for batch processing

- Added `/run/streaming-pipeline` endpoint to process data in batches from either a database or a CSV file.
- Introduced `BatchResponse` model for structured responses.
- Updated README with new endpoint details, including parameters and example usage.
- Enhanced error handling and logging during batch processing.
- Added data preprocessing and NaN handling to the analysis functions.
2025-05-02 14:25:31 +01:00
parent 5767f55686
commit 9c429caa56
10 changed files with 246 additions and 11 deletions
@@ -18,7 +18,7 @@ class SalaryEarnerAnalyzer:
    def filter_venn_section(self, **kwargs):
        """Filter accounts based on specified combinations of hypothesis flags."""
        valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}
-        df1 = self.df[self.df['initiated_by'] == 'C']
+        df1 = self.df[self.df['initiated_by'] == 'C'].copy()
        
        invalid_keys = set(kwargs.keys()) - valid_columns
        if invalid_keys:
@@ -28,7 +28,13 @@ class SalaryEarnerAnalyzer:
        for key, value in kwargs.items():
            condition &= (df1[key] == value)

-        return df1[condition]
+        filtered_df = df1[condition]
+        
+        # Drop any rows with NaN values in critical columns
+        critical_cols = ['accountid', 'trx_start_date', 'amount']
+        filtered_df = filtered_df.dropna(subset=critical_cols)
+        
+        return filtered_df

    def plot_hypothesis_overlap(self, hypothesis1_df, hypothesis3_df, hypothesis4_df, account_col='accountid'):
        """Plot and save Venn diagram showing overlap between hypotheses."""
@@ -47,21 +53,37 @@ class SalaryEarnerAnalyzer:
        """Generate a table of salary earners with their metrics."""
        results = []
        for accountid, group in all_three_hypotheses.groupby('accountid'):
+            # Skip if group is empty
+            if group.empty:
+                continue
+                
            # Calculate required metrics
            num_months = len(group)
+            
+            # Handle last 6 months calculation
            last_6_months = group[group['trx_start_date'] >= (datetime.now() - timedelta(days=180))]
-            least_inflow = last_6_months['amount'].min()
-            avg_salary = group['amount'].mean()
-
-            # Calculate days since last transaction
+            if last_6_months.empty:
+                least_inflow = 0
+            else:
+                least_inflow = last_6_months['amount'].min()
+            
+            # Handle average salary calculation
+            if group['amount'].notna().any():
+                avg_salary = group['amount'].mean()
+            else:
+                avg_salary = 0
+            
+            # Calculate days_since_last_trx with NaN handling
            group['days_since_last_trx'] = group['trx_start_date'].diff().dt.days
            median_interval = group['days_since_last_trx'].median()
+            if pd.isna(median_interval):
+                median_interval = 30  # Default to 30 days if no interval data

            last_date = group['trx_start_date'].max()
            next_date = last_date + timedelta(days=median_interval)
            next_amount = avg_salary

-            # Boolean flags
+            # Boolean flags with NaN handling
            days_since_last = (datetime.now() - last_date).days
            has_45d = days_since_last <= 45
            has_2m = len(group[group['trx_start_date'] >= (datetime.now() - timedelta(days=60))]) >= 2
@@ -78,7 +100,9 @@ class SalaryEarnerAnalyzer:
            })

        final_df = pd.DataFrame(results)
-        final_df = final_df.dropna()
+        # Drop rows where all numeric columns are NaN
+        numeric_cols = ['num_months', 'least_inflow_6m', 'avg_monthly_salary', 'estimated_next_amount']
+        final_df = final_df.dropna(subset=numeric_cols, how='all')
        return final_df

    def analyze_salary_earners(self, final_df):