Implement streaming pipeline endpoint for batch processing
- Added `/run/streaming-pipeline` endpoint to process data in batches from either a database or CSV file.
- Introduced `BatchResponse` model for structured responses.
- Updated README with new endpoint details, including parameters and example usage.
- Enhanced error handling and logging during batch processing.
- Ensured data preprocessing and NaN handling in analysis functions.
This commit is contained in:
@@ -18,7 +18,7 @@ class SalaryEarnerAnalyzer:
|
||||
def filter_venn_section(self, **kwargs):
|
||||
"""Filter accounts based on specified combinations of hypothesis flags."""
|
||||
valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}
|
||||
df1 = self.df[self.df['initiated_by'] == 'C']
|
||||
df1 = self.df[self.df['initiated_by'] == 'C'].copy()
|
||||
|
||||
invalid_keys = set(kwargs.keys()) - valid_columns
|
||||
if invalid_keys:
|
||||
@@ -28,7 +28,13 @@ class SalaryEarnerAnalyzer:
|
||||
for key, value in kwargs.items():
|
||||
condition &= (df1[key] == value)
|
||||
|
||||
return df1[condition]
|
||||
filtered_df = df1[condition]
|
||||
|
||||
# Drop any rows with NaN values in critical columns
|
||||
critical_cols = ['accountid', 'trx_start_date', 'amount']
|
||||
filtered_df = filtered_df.dropna(subset=critical_cols)
|
||||
|
||||
return filtered_df
|
||||
|
||||
def plot_hypothesis_overlap(self, hypothesis1_df, hypothesis3_df, hypothesis4_df, account_col='accountid'):
|
||||
"""Plot and save Venn diagram showing overlap between hypotheses."""
|
||||
@@ -47,21 +53,37 @@ class SalaryEarnerAnalyzer:
|
||||
"""Generate a table of salary earners with their metrics."""
|
||||
results = []
|
||||
for accountid, group in all_three_hypotheses.groupby('accountid'):
|
||||
# Skip if group is empty
|
||||
if group.empty:
|
||||
continue
|
||||
|
||||
# Calculate required metrics
|
||||
num_months = len(group)
|
||||
|
||||
# Handle last 6 months calculation
|
||||
last_6_months = group[group['trx_start_date'] >= (datetime.now() - timedelta(days=180))]
|
||||
least_inflow = last_6_months['amount'].min()
|
||||
avg_salary = group['amount'].mean()
|
||||
|
||||
# Calculate days since last transaction
|
||||
if last_6_months.empty:
|
||||
least_inflow = 0
|
||||
else:
|
||||
least_inflow = last_6_months['amount'].min()
|
||||
|
||||
# Handle average salary calculation
|
||||
if group['amount'].notna().any():
|
||||
avg_salary = group['amount'].mean()
|
||||
else:
|
||||
avg_salary = 0
|
||||
|
||||
# Calculate days_since_last_trx with NaN handling
|
||||
group['days_since_last_trx'] = group['trx_start_date'].diff().dt.days
|
||||
median_interval = group['days_since_last_trx'].median()
|
||||
if pd.isna(median_interval):
|
||||
median_interval = 30 # Default to 30 days if no interval data
|
||||
|
||||
last_date = group['trx_start_date'].max()
|
||||
next_date = last_date + timedelta(days=median_interval)
|
||||
next_amount = avg_salary
|
||||
|
||||
# Boolean flags
|
||||
# Boolean flags with NaN handling
|
||||
days_since_last = (datetime.now() - last_date).days
|
||||
has_45d = days_since_last <= 45
|
||||
has_2m = len(group[group['trx_start_date'] >= (datetime.now() - timedelta(days=60))]) >= 2
|
||||
@@ -78,7 +100,9 @@ class SalaryEarnerAnalyzer:
|
||||
})
|
||||
|
||||
final_df = pd.DataFrame(results)
|
||||
final_df = final_df.dropna()
|
||||
# Drop rows where all numeric columns are NaN
|
||||
numeric_cols = ['num_months', 'least_inflow_6m', 'avg_monthly_salary', 'estimated_next_amount']
|
||||
final_df = final_df.dropna(subset=numeric_cols, how='all')
|
||||
return final_df
|
||||
|
||||
def analyze_salary_earners(self, final_df):
|
||||
|
||||
Reference in New Issue
Block a user