Added new salary-related terms and improved image outputs in salary.ipynb

2025-04-28 19:44:40 +01:00
parent 8207d8f1ff
commit 591d4611b6
27 changed files with 1782 additions and 12 deletions
@@ -0,0 +1,45 @@
+# Git
+.git
+.gitignore
+
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual Environment
+venv/
+ENV/
+
+# IDE
+.idea/
+.vscode/
+*.swp
+*.swo
+
+# Output
+output/
+
+# Logs
+*.log
+
+# Local development
+.env
+.env.local 
@@ -0,0 +1,21 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+# Install the PostgreSQL client development package and drop the apt lists in
+# the same layer so they do not end up in the image.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends libpq-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Dependencies are installed before the source is copied so this layer is
+# reused until requirements.txt changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY salary_analytics/ ./salary_analytics/
+
+RUN mkdir -p output/csv output/plots
+
+ENV PYTHONPATH=/app
+ENV HOST=0.0.0.0
+ENV PORT=8000
+
+EXPOSE 8000
+
+# Use host 0.0.0.0 to allow external connections.
+# --reload is a development-only file watcher and is deliberately not used
+# in the image.
+CMD ["uvicorn", "salary_analytics.api:app", "--host", "0.0.0.0", "--port", "8000"]
@@ -0,0 +1,140 @@
+# Salary Analytics
+
+A comprehensive salary analytics system that analyzes transaction data to identify salary earners, predict future salaries, and generate detailed reports.
+
+## Features
+
+- **Transaction Analysis**
+  - Keyword-based salary transaction identification
+  - Consistent amount transaction analysis
+  - Transaction type analysis
+  - Hypothesis overlap visualization
+
+- **Salary Earner Classification**
+  - Verified salary earners identification
+  - Likely salary earners identification
+  - High earner detection
+  - Salary pattern analysis
+
+- **Machine Learning**
+  - Salary prediction models
+  - Separate models for consistent and inconsistent earners
+  - Feature engineering
+  - Model evaluation metrics
+
+- **Reporting**
+  - CSV reports generation
+  - Visualization plots
+  - High earner details
+  - Salary earner statistics
+
+## Architecture
+
+The project is organized into the following modules:
+
+```
+salary_analytics/
+├── __init__.py
+├── config.py           # Configuration settings
+├── data_loader.py      # Database connection and data loading
+├── keyword_analyzer.py # Keyword-based analysis
+├── consistent_amount_analyzer.py # Consistent amount analysis
+├── transaction_type_analyzer.py  # Transaction type analysis
+├── salary_earner_analyzer.py     # Salary earner analysis
+├── salary_predictor.py # Machine learning models
+├── main.py            # Main pipeline
+└── api.py             # FastAPI endpoints
+```
+
+
+## Configuration
+
+The system can be configured through environment variables or the `config.py` file:
+
+```python
+# Database Configuration
+DB_CONFIG = {
+    "user": "db_user",
+    "password": "your_secure_password",
+    "name": "salary_db",
+    "port": "5432",
+    "host": "localhost"
+}
+
+# Model Configuration
+MODEL_CONFIG = {
+    "cv_threshold": 0.10,
+    "min_transactions": 3,
+    "threshold": 0.7,
+    "high_earner_threshold": 10000
+}
+```
+
+## Usage
+
+### Using the API
+
+1. Start the API server:
+```bash
+uvicorn salary_analytics.api:app --reload
+```
+
+2. Access the API documentation:
+- Swagger UI: http://localhost:8000/docs
+- ReDoc: http://localhost:8000/redoc
+
+### API Endpoints
+
+1. **Basic Endpoints**
+   - `GET /`: Welcome message
+   - `GET /health`: Health check
+
+2. **Analysis Endpoints**
+   - `POST /analyze/keyword`: Run keyword analysis
+   - `POST /analyze/consistent-amount`: Run consistent amount analysis
+   - `POST /analyze/transaction-type`: Run transaction type analysis
+
+3. **Report Generation**
+   - `POST /generate/reports`: Generate all reports
+   - `GET /download/{report_type}`: Download specific reports
+     - Available types:
+       - `high_earners`: High earner details
+       - `likely_earners`: Likely salary earners
+       - `final_table`: Final analysis table
+       - `consistent_plot`: Consistent earners plot
+       - `inconsistent_plot`: Inconsistent earners plot
+       - `hypothesis_plot`: Hypothesis overlap plot
+
+4. **Model Training**
+   - `POST /train/models`: Train prediction models
+
+5. **Pipeline**
+   - `POST /run/pipeline`: Run complete pipeline
+
+## Docker Deployment
+
+1. Build the Docker image:
+```bash
+docker-compose build
+```
+
+2. Run the container:
+```bash
+docker-compose up
+```
+
+The API will be available at http://localhost:8000
+
+## Output Structure
+
+```
+output/
+├── csv/
+│   ├── high_earner_details.csv
+│   ├── likely_salary_earner.csv
+│   └── final_table.csv
+└── plots/
+    ├── consistent_earners_predictions.png
+    ├── inconsistent_earners_predictions.png
+    └── hypothesis_overlap.png
+```
@@ -0,0 +1,22 @@
+version: '3.8'
+
+services:
+  api:
+    build: .
+    ports:
+      - "8000:8000"
+    volumes:
+      # Reports written by the container are kept on the host.
+      - ./output:/app/output
+    environment:
+      # Each value can be overridden from the shell or an .env file; the
+      # text after ":-" is the fallback used when the variable is unset.
+      - DB_USER=${DB_USER:-salaryloan}
+      - DB_PASSWORD=${DB_PASSWORD:-salaryloan}
+      - DB_NAME=${DB_NAME:-salaryloan}
+      - DB_PORT=${DB_PORT:-10532}
+      - DB_HOST=${DB_HOST:-dev-data.simbrellang.net}
+    restart: unless-stopped
+    networks:
+      - salary_network
+
+networks:
+  salary_network:
+    driver: bridge
@@ -0,0 +1,13 @@
+sqlalchemy
+pandas
+numpy
+matplotlib
+seaborn
+matplotlib-venn
+wordcloud
+scikit-learn
+psycopg2-binary
+fastapi>=0.68.0
+uvicorn>=0.15.0
+pydantic>=1.8.0
+python-multipart>=0.0.5 
@@ -0,0 +1,549 @@
+from readline import redisplay
+from sqlalchemy import create_engine, text
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import warnings
+warnings.filterwarnings('ignore')
+import re
+import seaborn as sns
+from matplotlib_venn import venn3, venn2
+from wordcloud import WordCloud
+from datetime import datetime, timedelta
+
+DB_USER = "salaryloan"
+DB_PASSWORD = "salaryloan"
+DB_NAME = "salaryloan"
+DB_PORT = "10532"
+DB_HOST = "dev-data.simbrellang.net"
+DATABASE_URL = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
+
+engine = create_engine(DATABASE_URL)
+
+try:
+    with engine.connect() as conn:
+        result = conn.execute(text("SELECT version();"))
+        print("Connected successfully!")
+except Exception as e:
+    print("Error connecting to database:", e)
+
+
+table_name = "customer_account_transaction_hx"
+
+df = pd.read_sql(f"SELECT * FROM {table_name}", engine)
+df.head(10)
+
+
+# Change to date column to datetime
+df['trx_start_date'] = pd.to_datetime(df['trx_start_date'])
+df['trx_end_date'] = pd.to_datetime(df['trx_end_date'])
+
+# Rename columns
+df = df.rename(columns={'d1': 'trx_type', 'd2': 'trx_subtype',
+                        'd3': 'initiated_by', 'd4': 'customer_id'})
+
+
+# Terms passed to identify_salary_transactions() and matched against the
+# transaction 'description' column as whole words.
+keywords = [
+    "salary", "payroll", "income", "wage", "wages",
+    "earnings", "earning", "monthly pay", "net pay", "gross pay", "compensation",
+    "monthlypay", "netpay", "grosspay",
+    "remuneration", "stipend", "allowance", "bonus", "commission",
+    "pension", "retirement", "dividend", "benefits", "reimbursement",
+    "overtime", "incentive", "paycheck", "paycheque", "salary advance",
+    "monthly income", "income tax refund", "employer deposit",
+    "payroll deposit", "salary credit", "income credit", "salary transfer",
+    "income transfer", "salary received", "income received", "hr deposit",
+    "company deposit", "employer payment", "employee payment",
+    "sal",
+]
+
+
+def identify_salary_transactions(df, keywords):
+    """
+    Identifies potential salary-related transactions based on keywords
+    and month-year patterns in the 'description' column.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame containing transaction data.
+        keywords (list): A list of salary/income-related keywords to search for.
+
+    Returns:
+        pd.DataFrame: The input DataFrame with an added 'is_salary_related' column
+                      indicating potential salary transactions.
+    """
+    month_year_patterns = [
+        r"\b(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)\s?\d{2,4}\b",
+        r"\b(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)\s?\d{2,4}\b"
+    ]
+
+    escaped_keywords = [re.escape(keyword.lower()) for keyword in keywords]
+    combined_pattern = (
+        r'\b(?:' + '|'.join(escaped_keywords) + r')\b|' +
+        '|'.join(month_year_patterns)
+        )
+
+    df['is_salary_related'] = df['description'].str.lower().str.contains(
+        combined_pattern,
+        na=False,
+        regex=True
+    )
+
+    return df
+
+
+desc_df = identify_salary_transactions(df, keywords)
+desc_data = desc_df[(desc_df['is_salary_related'] == True) & (desc_df['initiated_by'] == 'C')]
+desc_data.head()
+
+def calculate_coefficient_of_variation(group):
+    """
+    Calculates the coefficient of variation (CV) for a group of transactions
+    where 'initiated_by' is 'C'.
+
+    Args:
+        group (pd.DataFrame): Transactions for a single account (grouped by 'accountid').
+
+    Returns:
+        float: Coefficient of variation (std / mean). Returns NaN if mean is zero.
+    """
+    amounts = group[group['initiated_by'] == 'C']['amount']
+    mean = amounts.mean()
+    std = amounts.std(ddof=0)
+
+    if mean == 0:
+        return float('nan')
+    return std / mean
+
+def flag_consistent_amounts(group, cv_threshold=0.10):
+    """
+    Flags accounts with low variance in transaction amounts where 'initiated_by' is 'C'.
+
+    Args:
+        group (pd.DataFrame): Transactions for a single account.
+        cv_threshold (float): Maximum allowed CV to flag as consistent (default: 0.10).
+
+    Returns:
+        pd.Series: Boolean series indicating if the transaction belongs to a consistent account.
+    """
+    # Filter for transactions initiated by 'C' before calculating CV
+    filtered_group = group[group['initiated_by'] == 'C']
+    cv = calculate_coefficient_of_variation(filtered_group)  # Pass filtered group
+    is_consistent = cv <= cv_threshold if not pd.isna(cv) else False
+
+    return pd.Series(
+        [is_consistent] * len(group),
+        index=group.index,
+        name='is_consistent_amount'
+    )
+
+def identify_consistent_amount_accounts(df, cv_threshold=0.10):
+    """
+    Identifies accounts with consistent transaction amounts where 'initiated_by' is 'C'.
+
+    Args:
+        df (pd.DataFrame): Transaction data with columns ['accountid', 'amount', 'initiated_by'].
+        cv_threshold (float): Maximum allowed CV (default: 0.10).
+
+    Returns:
+        pd.DataFrame: Input DataFrame with an added 'is_consistent_amount' column.
+    """
+    df = df.groupby('accountid').apply(
+        lambda group: flag_consistent_amounts(group, cv_threshold)
+    ).reset_index(level=0, drop=True)
+
+    return df
+
+
+const_df = identify_consistent_amount_accounts(df, cv_threshold=0.10)
+const_df = df.merge(const_df, left_index=True, right_index=True)
+df['is_consistent_amount'] = const_df['is_consistent_amount']
+const_data = const_df[
+    (const_df['is_consistent_amount']) &
+    (const_df['initiated_by']=='C')
+]
+
+
+
+def flag_salary_type_transactions(df):
+    """
+    Flags transactions that match the salary criteria based on type, subtype, and initiator.
+
+    Args:
+        df (pd.DataFrame): Transaction data with columns ['trx_type', 'trx_subtype', 'initiated_by', 'amount'].
+
+    Returns:
+        pd.DataFrame: Input DataFrame with an added 'is_salary_type' column.
+    """
+    df['is_salary_type'] = (
+        ((df['trx_type'] == 'T') | (df['trx_type'] == 'C')) &
+        ((df['trx_subtype'] == 'BI') |  (df['trx_subtype'] == 'I') |  (df['trx_subtype'] == 'BS') |  (df['trx_subtype'] == 'CI')) &
+        (df['initiated_by'] == 'C') &
+        (df['amount'] > 0)
+    )
+    return df
+
+
+def is_salary_earner_by_type(group, min_transactions=3, threshold=0.7):
+    """
+    Determines if an account likely belongs to a salary earner based on transaction type criteria.
+
+    Args:
+        group (pd.DataFrame): Transactions for a single account.
+        min_transactions (int): Minimum transactions required to qualify (default: 3).
+        threshold (float): Minimum proportion of salary-type transactions (default: 0.7).
+
+    Returns:
+        bool: True if the account meets the criteria, False otherwise.
+    """
+    if len(group) < min_transactions:
+        return False
+    valid_ratio = group['is_salary_type'].mean()
+    return valid_ratio >= threshold
+
+trx_df = flag_salary_type_transactions(df)
+trx_data = trx_df[trx_df['is_salary_type']]
+
+
+def plot_hypothesis_overlap(hypothesis1_df,
+                            hypothesis3_df, hypothesis4_df,
+                            account_col='accountid'):
+    """
+    Plots a three-set Venn diagram of the accounts flagged by each hypothesis.
+
+    Args:
+        hypothesis1_df (pd.DataFrame): Rows with a boolean 'is_salary_related'
+            column (description/keyword hypothesis).
+        hypothesis3_df (pd.DataFrame): Rows with a boolean 'is_consistent_amount'
+            column (consistent-amount hypothesis).
+        hypothesis4_df (pd.DataFrame): Rows with a boolean 'is_salary_type'
+            column (transaction-type hypothesis).
+        account_col (str): Account identifier column.
+
+    Returns:
+        None. The figure is shown with plt.show().
+    """
+    # Distinct account ids flagged by each hypothesis.
+    set2 = set(hypothesis3_df[account_col][hypothesis3_df['is_consistent_amount']])
+    set3 = set(hypothesis1_df[account_col][hypothesis1_df['is_salary_related']])
+    set4 = set(hypothesis4_df[account_col][hypothesis4_df['is_salary_type']])
+
+
+    plt.figure(figsize=(10, 10))
+    venn3([set2, set3, set4], set_labels=('Consistent Amount',
+                                                'Salary Description', 'Transaction Type'))
+    plt.title('Overlap Between Hypotheses')
+    plt.show()
+
+# Argument order: description rows, consistent-amount rows, type rows.
+plot_hypothesis_overlap(desc_data, const_data, trx_data)
+
+def filter_venn_section(df, **kwargs):
+    """
+    Filters accounts based on specified combinations of hypothesis flags.
+
+    Args:
+        df (pd.DataFrame): DataFrame with columns ['is_salary_related', 'is_consistent_amount', 'is_salary_type'].
+        **kwargs: Key-value pairs specifying the desired state of each hypothesis flag.
+                  For example: {'is_salary_related': True, 'is_consistent_amount': False}.
+
+    Returns:
+        pd.DataFrame: Filtered accounts matching the specified Venn section.
+    """
+    valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}
+    df1 = df[df['initiated_by']=='C']
+    invalid_keys = set(kwargs.keys()) - valid_columns
+    if invalid_keys:
+        raise ValueError(f"Invalid keys: {invalid_keys}. Valid keys are {valid_columns}.")
+
+    condition = pd.Series([True] * len(df1), index=df1.index)
+    for key, value in kwargs.items():
+        condition &= (df1[key] == value)
+
+    return df1[condition]
+
+
+green_section = filter_venn_section(
+    df,
+    is_salary_related=True,
+    is_consistent_amount=False,
+    is_salary_type=True
+)
+
+redisplay(green_section.head(10))
+
+yellow_section = filter_venn_section(
+    df,
+    is_salary_related=False,
+    is_consistent_amount=True,
+    is_salary_type=True
+)
+
+
+
+
+# Get accounts flagged by all three hypotheses
+all_three_hypotheses = filter_venn_section(
+    df,
+    is_salary_related=True,
+    is_consistent_amount=True,
+    is_salary_type=True
+)
+
+
+
+def generate_salary_earners_table(all_three_hypotheses):
+
+    results = []
+    for accountid, group in all_three_hypotheses.groupby('accountid'):
+        # Calculate required metrics
+        num_months = len(group)
+        last_6_months = group[group['trx_start_date'] >= (datetime.now() - timedelta(days=180))]
+        least_inflow = last_6_months['amount'].min()
+        avg_salary = group['amount'].mean()
+
+        # Estimated next salary
+        # Calculate days_since_last_trx within the loop
+        group['days_since_last_trx'] = group['trx_start_date'].diff().dt.days
+        median_interval = group['days_since_last_trx'].median()
+
+        last_date = group['trx_start_date'].max()
+        next_date = last_date + timedelta(days=median_interval)
+        next_amount = avg_salary
+
+        # Boolean flags
+        days_since_last = (datetime.now() - last_date).days
+        has_45d = days_since_last <= 45
+        has_2m = len(group[group['trx_start_date'] >= (datetime.now() - timedelta(days=60))]) >= 2
+
+        results.append({
+            'accountid': accountid,
+            'num_months': num_months,
+            'least_inflow_6m': least_inflow,
+            'avg_monthly_salary': avg_salary,
+            'estimated_next_amount': next_amount,
+            'estimated_next_date': next_date,
+            '45daysalary': has_45d,
+            '2monthssalary': has_2m
+        })
+
+    final_df = pd.DataFrame(results)
+    final_df = final_df.dropna()
+    return final_df
+
+final_table = generate_salary_earners_table(all_three_hypotheses)
+
+# Display results
+print(f"Found {final_table['accountid'].nunique()} verified salary earners")
+
+likely_salary_earner = pd.concat([yellow_section, green_section])
+likely_salary_earner = likely_salary_earner.drop_duplicates(subset=['id'])
+likely_salary_earner = generate_salary_earners_table(likely_salary_earner)
+
+# Display results
+print(f"Found {likely_salary_earner['accountid'].nunique()} likely salary earners")
+
+
+def analyze_salary_earners(final_df):
+    """
+    Analyzes salary earners and identifies high earners (>=10k predicted salary).
+
+    Args:
+        final_df (pd.DataFrame): DataFrame containing salary earner information.
+
+    Returns:
+        pd.DataFrame: DataFrame with high earner statistics, including count and minimum inflows.
+    """
+    high_earners = final_df[final_df['estimated_next_amount'] >= 10000]
+    high_earners['least_inflow_6m'] = high_earners['least_inflow_6m']
+    count_high = len(high_earners)
+
+    high_earner_details = high_earners[['accountid', 'least_inflow_6m']].reset_index(drop=True)
+
+    return high_earner_details, count_high
+
+high_earner_details_df, total_high_earners = analyze_salary_earners(final_table)
+
+print(f"\nTotal High Earners: {total_high_earners}")
+
+
+high_earner_details_df.to_csv('high_earner_details.csv', index=False)
+likely_salary_earner.to_csv('likely_salary_earner.csv', index=False)
+final_table.to_csv('final_table.csv', index=False)
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
+
+def add_feature_engineering(df):
+    """
+    Engineers new features to the input DataFrame for salary prediction.
+
+    Adds features like month, month sequence, one-hot encoded transaction type,
+    3-month rolling sum, and 3-month rolling average of transaction amounts.
+
+    Args:
+        df (pd.DataFrame): The input DataFrame containing transaction data.
+
+    Returns:
+        pd.DataFrame: The DataFrame with engineered features added.
+    """
+
+    df['month'] = df['trx_start_date'].dt.month
+    df['month_seq'] = df.groupby(['accountid', 'month']).ngroup() + 1
+
+    # Categorical encoding: one-hot encode trx_type
+    encoder = OneHotEncoder(sparse_output=False)
+    encoded_trx_type = encoder.fit_transform(df[['trx_type']])
+    encoded_df = pd.DataFrame(encoded_trx_type, columns=encoder.get_feature_names_out(['trx_type']))
+    df = pd.concat([df, encoded_df], axis=1)
+
+    # Rolling statistics: sort by account and date
+    df = df.sort_values(['accountid', 'trx_start_date'])
+    df['rolling_sum_3m'] = df.groupby('accountid')['amount'].rolling(window=3,
+                                                                     min_periods=1).sum().reset_index(0, drop=True)
+    df['rolling_avg_3m'] = df.groupby('accountid')['amount'].rolling(window=3,
+                                                                     min_periods=1).mean().reset_index(0, drop=True)
+
+    return df
+
+def prepare_data(df_transactions, accounts):
+    """
+    Prepares transaction data for training and testing a salary prediction model.
+
+    Filters transactions for the given accounts, drops unused columns, adds
+    engineered features, aggregates per (accountid, month_seq), keeps accounts
+    with at least 12 aggregated rows, and builds fixed-position sliding
+    windows: each sample is 5 consecutive aggregated rows flattened into one
+    vector, and the target is the 'amount' of the row that follows.
+
+    Args:
+        df_transactions (pd.DataFrame): The input DataFrame containing all transaction data.
+            Must contain every column listed in the drop() call below.
+        accounts (list): Account IDs to include in the data preparation.
+
+    Returns:
+        tuple: (X_train, y_train, X_test, y_test) as NumPy arrays. Per account,
+               targets at row positions 5-7 go to the training set and
+               positions 8-11 to the test set. All four arrays are empty when
+               no account qualifies.
+    """
+
+    df_filtered = df_transactions[df_transactions['accountid'].isin(accounts)].copy()
+    print(f"Filtered data for {len(accounts)} accounts.")
+    print(f"Total transactions: {len(df_filtered)}")
+
+    # Drop unnecessary columns
+    df_filtered = df_filtered.drop(['description', 'id', 'customer_id',
+                                    'trx_end_date', 'is_salary_related',
+                                    'is_consistent_amount', 'is_salary_type'], axis=1)
+
+    # Add feature engineering
+    df_filtered = add_feature_engineering(df_filtered)
+
+    # Aggregate monthly data with new features
+    agg_funcs = {
+        'amount': 'mean',
+        'rolling_sum_3m': 'last',
+        'rolling_avg_3m': 'last',
+        'month': 'first'
+    }
+    # One-hot columns are summed, giving a per-group count of each trx_type.
+    encoded_cols = [col for col in df_filtered.columns if col.startswith('trx_type_')]
+    for col in encoded_cols:
+        agg_funcs[col] = 'sum'
+
+    monthly_data = df_filtered.groupby(['accountid', 'month_seq']).agg(agg_funcs).reset_index()
+
+    # Filter accounts with at least 12 months
+    # NOTE(review): this tests the largest month_seq value, not the number of
+    # rows; the len() check inside the loop is what guarantees 12 rows.
+    account_month_counts = monthly_data.groupby('accountid')['month_seq'].max()
+    valid_accounts = account_month_counts[account_month_counts >= 12].index
+    monthly_data = monthly_data[monthly_data['accountid'].isin(valid_accounts)]
+
+    # Create training and testing sequences
+    X_train, y_train, X_test, y_test = [], [], [], []
+    # NOTE(review): 'accountid' is part of the feature vector -- confirm the
+    # identifier is meant to be a model input.
+    feature_cols = ['accountid', 'amount', 'rolling_sum_3m', 'rolling_avg_3m',
+                    'month'] + encoded_cols
+
+    for account in valid_accounts:
+        account_data = monthly_data[monthly_data['accountid'] == account].sort_values('month_seq')
+
+        # Check if account has enough data for training and testing sequences
+        if len(account_data) >= 12:  # Ensure at least 12 months of data
+            # Window = rows t-5 .. t-1, target = row t.
+            for t in range(5, 8):
+                X_train.append(account_data.iloc[t-5:t][feature_cols].values.flatten())
+                y_train.append(account_data['amount'].iloc[t])
+            for t in range(8, 12):
+                X_test.append(account_data.iloc[t-5:t][feature_cols].values.flatten())
+                y_test.append(account_data['amount'].iloc[t])
+        else:
+            print(f"Skipping account {account} due to insufficient data (less than 12 months).")
+
+    return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)
+
+
+def train_model(X_train, y_train, X_test, y_test):
+    """
+    Trains and evaluates a Random Forest Regressor for salary prediction.
+
+    Scales the input features using StandardScaler, trains the model,
+    predicts on the test set, and calculates evaluation metrics
+    (MAE, RMSE, R-squared).
+
+    Args:
+        X_train (np.ndarray): Training data features.
+        y_train (np.ndarray): Training data target (salary).
+        X_test (np.ndarray): Testing data features.
+        y_test (np.ndarray): Testing data target (salary).
+
+    Returns:
+        tuple: A tuple containing the trained model and the scaler object:
+               (model, scaler).
+    """
+
+    # Scale features
+    scaler = StandardScaler()
+    X_train_scaled = scaler.fit_transform(X_train)
+    X_test_scaled = scaler.transform(X_test)
+
+    # Train Random Forest model
+    model = RandomForestRegressor(n_estimators=100, random_state=42)
+    model.fit(X_train_scaled, y_train)
+
+    # Evaluate model
+    y_pred = model.predict(X_test_scaled)
+    mae = mean_absolute_error(y_test, y_pred)
+    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
+    r2 = r2_score(y_test, y_pred)
+    print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R-squared: {r2:.2f}")
+    return model, scaler
+
+consistent_accounts = final_table['accountid'].unique()
+X_train_cons, y_train_cons, X_test_cons, y_test_cons = prepare_data(df, consistent_accounts)
+if len(X_train_cons) > 0:
+    model_cons, scaler_cons = train_model(X_train_cons, y_train_cons, X_test_cons, y_test_cons)
+    print("Model trained for consistent salary earners.")
+else:
+    print("No accounts with sufficient data for consistent salary earners.")
+
+
+X_test_cons_scaled = scaler_cons.transform(X_test_cons)
+y_pred = model_cons.predict(X_test_cons_scaled)
+
+plt.figure(figsize=(10, 5))
+plt.scatter(y_test_cons, y_pred, alpha=0.5)
+plt.xlabel("Actual Salary")
+plt.ylabel("Predicted Salary")
+plt.title("Actual vs. Predicted Salary")
+plt.plot([min(y_test_cons), max(y_test_cons)], [min(y_test_cons), max(y_test_cons)], 'r--')
+plt.show()
+
+inconsistent_accounts = likely_salary_earner['accountid'].unique()
+X_train_incons, y_train_incons, X_test_incons, y_test_incons = prepare_data(df, inconsistent_accounts)
+if len(X_train_incons) > 0:
+    print("\nTraining model for inconsistent salary earners...")
+    model_incons, scaler_incons = train_model(X_train_incons, y_train_incons, X_test_incons, y_test_incons)
+else:
+    print("No accounts with sufficient data for inconsistent salary earners.")
+    
+
+X_test_incons_scaled = scaler_incons.transform(X_test_incons)
+y_pred = model_incons.predict(X_test_incons_scaled)
+
+plt.figure(figsize=(10, 5))
+plt.scatter(y_test_incons, y_pred, alpha=0.5)
+plt.xlabel("Actual Salary")
+plt.ylabel("Predicted Salary")
+plt.title("Actual vs. Predicted Salary")
+plt.plot([min(y_test_incons), max(y_test_incons)], [min(y_test_incons), max(y_test_incons)], 'r--')
+plt.show()
+
@@ -0,0 +1,6 @@
+"""
+Salary Analytics Package
+A package for analyzing and predicting salary patterns from transaction data.
+"""
+
+__version__ = "0.1.0" 
@@ -0,0 +1,212 @@
+"""
+FastAPI application for salary analytics.
+"""
+
+from fastapi import FastAPI, HTTPException, BackgroundTasks
+from fastapi.responses import FileResponse
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from typing import Optional, Dict
+import os
+import socket
+import logging
+
+from .main import SalaryAnalyticsPipeline
+from .config import OUTPUT_PATHS
+
+# Configure logging
+# basicConfig is applied at import time with an INFO threshold.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+app = FastAPI(
+    title="Salary Analytics API",
+    description="API for analyzing and predicting salary patterns from transaction data",
+    version="1.0.0"
+)
+
+# Add CORS middleware
+# NOTE(review): wildcard origins together with allow_credentials=True is very
+# permissive; confirm this is intended before exposing the API publicly.
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # Allows all origins
+    allow_credentials=True,
+    allow_methods=["*"],  # Allows all methods
+    allow_headers=["*"],  # Allows all headers
+)
+
+# Global pipeline instance
+# Assigned by startup_event(); remains None until startup has run.
+pipeline = None
+
+class AnalysisResponse(BaseModel):
+    """Response model for analysis endpoints."""
+    # Human-readable outcome of the request.
+    message: str
+    # Optional result payload, e.g. {"count": <int>} from the analysis endpoints.
+    data: Optional[Dict] = None
+    # Optional path to a generated file; not set by the endpoints in this module.
+    file_path: Optional[str] = None
+
+@app.on_event("startup")
+async def startup_event():
+    """Initialize the pipeline on startup."""
+    global pipeline
+    try:
+        logger.info("Initializing pipeline...")
+        pipeline = SalaryAnalyticsPipeline()
+        if not pipeline.load_data():
+            logger.error("Failed to load data during startup")
+            raise Exception("Failed to load data during startup")
+        
+        # Print network information
+        hostname = socket.gethostname()
+        ip_address = socket.gethostbyname(hostname)
+        logger.info(f"Server running on hostname: {hostname}")
+        logger.info(f"Server IP address: {ip_address}")
+        logger.info(f"Server is accessible at:")
+        logger.info(f"- http://localhost:8000")
+        logger.info(f"- http://127.0.0.1:8000")
+        logger.info(f"- http://{ip_address}:8000")
+        logger.info("Pipeline initialized successfully")
+    except Exception as e:
+        logger.error(f"Error during startup: {str(e)}")
+        raise
+
+@app.get("/")
+async def root():
+    """Root endpoint."""
+    logger.info("Root endpoint accessed")
+    return {"message": "Welcome to Salary Analytics API"}
+
+@app.get("/health")
+async def health_check():
+    """Health check endpoint."""
+    logger.info("Health check endpoint accessed")
+    return {"status": "healthy"}
+
+@app.post("/analyze/keyword", response_model=AnalysisResponse)
+async def analyze_keyword():
+    """Run keyword-based salary transaction analysis."""
+    try:
+        logger.info("Starting keyword analysis...")
+        data = pipeline.run_keyword_analysis()
+        logger.info(f"Keyword analysis completed. Found {len(data)} matches")
+        return AnalysisResponse(
+            message="Keyword analysis completed successfully",
+            data={"count": len(data)}
+        )
+    except Exception as e:
+        logger.error(f"Error in keyword analysis: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/analyze/consistent-amount", response_model=AnalysisResponse)
+async def analyze_consistent_amount():
+    """Run consistent amount transaction analysis."""
+    try:
+        logger.info("Starting consistent amount analysis...")
+        data = pipeline.run_consistent_amount_analysis()
+        logger.info(f"Consistent amount analysis completed. Found {len(data)} matches")
+        return AnalysisResponse(
+            message="Consistent amount analysis completed successfully",
+            data={"count": len(data)}
+        )
+    except Exception as e:
+        logger.error(f"Error in consistent amount analysis: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/analyze/transaction-type", response_model=AnalysisResponse)
+async def analyze_transaction_type():
+    """Run transaction type analysis."""
+    try:
+        logger.info("Starting transaction type analysis...")
+        data = pipeline.run_transaction_type_analysis()
+        logger.info(f"Transaction type analysis completed. Found {len(data)} matches")
+        return AnalysisResponse(
+            message="Transaction type analysis completed successfully",
+            data={"count": len(data)}
+        )
+    except Exception as e:
+        logger.error(f"Error in transaction type analysis: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/generate/reports", response_model=AnalysisResponse)
+async def generate_reports(background_tasks: BackgroundTasks):
+    """Generate salary earner reports."""
+    try:
+        logger.info("Starting report generation...")
+        reports = pipeline.generate_salary_earner_reports()
+        logger.info("Reports generated successfully")
+        return AnalysisResponse(
+            message="Reports generated successfully",
+            data={
+                "verified_salary_earners": len(reports['final_table']),
+                "likely_salary_earners": len(reports['likely_salary_earner']),
+                "high_earners": reports['total_high_earners']
+            }
+        )
+    except Exception as e:
+        logger.error(f"Error in report generation: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/train/models", response_model=AnalysisResponse)
+async def train_models():
+    """Train salary prediction models."""
+    try:
+        logger.info("Starting model training...")
+        pipeline.train_salary_prediction_models()
+        logger.info("Models trained successfully")
+        return AnalysisResponse(
+            message="Models trained successfully"
+        )
+    except Exception as e:
+        logger.error(f"Error in model training: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/download/{report_type}")
+async def download_report(report_type: str):
+    """Download generated reports."""
+    try:
+        logger.info(f"Attempting to download report: {report_type}")
+        file_paths = {
+            "high_earners": OUTPUT_PATHS["high_earner_details"],
+            "likely_earners": OUTPUT_PATHS["likely_salary_earner"],
+            "final_table": OUTPUT_PATHS["final_table"],
+            "consistent_plot": OUTPUT_PATHS["consistent_earners_plot"],
+            "inconsistent_plot": OUTPUT_PATHS["inconsistent_earners_plot"],
+            "hypothesis_plot": OUTPUT_PATHS["hypothesis_overlap_plot"]
+        }
+
+        if report_type not in file_paths:
+            logger.error(f"Report type not found: {report_type}")
+            raise HTTPException(status_code=404, detail="Report type not found")
+
+        file_path = file_paths[report_type]
+        if not os.path.exists(file_path):
+            logger.error(f"Report file not found: {file_path}")
+            raise HTTPException(status_code=404, detail="Report file not found")
+
+        logger.info(f"Successfully found report file: {file_path}")
+        return FileResponse(
+            path=file_path,
+            filename=os.path.basename(file_path),
+            media_type="application/octet-stream"
+        )
+    except Exception as e:
+        logger.error(f"Error downloading report: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/run/pipeline", response_model=AnalysisResponse)
+async def run_full_pipeline():
+    """Run the complete salary analytics pipeline."""
+    try:
+        logger.info("Starting full pipeline...")
+        success = pipeline.run_full_pipeline()
+        if not success:
+            logger.error("Pipeline failed")
+            raise HTTPException(status_code=500, detail="Pipeline failed")
+        
+        logger.info("Pipeline completed successfully")
+        return AnalysisResponse(
+            message="Pipeline completed successfully"
+        )
+    except Exception as e:
+        logger.error(f"Error in pipeline: {str(e)}")
+        raise HTTPException(status_code=500, detail=str(e)) 
@@ -0,0 +1,61 @@
+"""
+Configuration settings for the salary analytics package.
+"""
+
+import os
+
+# Base directories
+# BASE_DIR is the parent of the directory that contains this file; every
+# generated artefact is written somewhere beneath BASE_DIR/output.
+BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+OUTPUT_DIR = os.path.join(BASE_DIR, "output")
+PLOTS_DIR = os.path.join(OUTPUT_DIR, "plots")
+CSV_DIR = os.path.join(OUTPUT_DIR, "csv")
+
+# Create directories if they don't exist (this runs at import time).
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+os.makedirs(PLOTS_DIR, exist_ok=True)
+os.makedirs(CSV_DIR, exist_ok=True)
+
+# Database Configuration
+# Every value can be overridden through an environment variable so that real
+# credentials do not have to be committed; the literals are only fallbacks
+# that keep the previous behaviour when nothing is set.
+# NOTE(review): the fallback credentials below are still in source control —
+# rotate them and drop the defaults once deployments provide the variables.
+DB_CONFIG = {
+    "user": os.environ.get("DB_USER", "salaryloan"),
+    "password": os.environ.get("DB_PASSWORD", "salaryloan"),
+    "name": os.environ.get("DB_NAME", "salaryloan"),
+    "port": os.environ.get("DB_PORT", "10532"),
+    "host": os.environ.get("DB_HOST", "dev-data.simbrellang.net")
+}
+
+# Table Configuration
+TABLE_NAME = "customer_account_transaction_hx"
+
+# Salary Keywords
+# Matched case-insensitively as whole words/phrases against transaction
+# descriptions by the keyword analyzer.
+SALARY_KEYWORDS = [
+    "salary", "payroll", "income", "wage", "wages",
+    "earnings", "earning", "monthly pay", "net pay", "gross pay", "compensation",
+    "monthlypay", "netpay", "grosspay",
+    "remuneration", "stipend", "allowance", "bonus", "commission",
+    "pension", "retirement", "dividend", "benefits", "reimbursement",
+    "overtime", "incentive", "paycheck", "paycheque", "salary advance",
+    "monthly income", "income tax refund", "employer deposit",
+    "payroll deposit", "salary credit", "income credit", "salary transfer",
+    "income transfer", "salary received", "income received", "hr deposit",
+    "company deposit", "employer payment", "employee payment",
+    "sal",
+]
+
+# Model Configuration
+MODEL_CONFIG = {
+    # Maximum coefficient of variation for an account's amounts to count as
+    # "consistent".
+    "cv_threshold": 0.10,
+    # Minimum number of transactions before the type-based ratio is evaluated.
+    "min_transactions": 3,
+    # Minimum share of salary-type transactions for the type-based check.
+    "threshold": 0.7,
+    # Estimated next amount at or above which an account is a high earner.
+    "high_earner_threshold": 10000
+}
+
+# File Paths
+OUTPUT_PATHS = {
+    "high_earner_details": os.path.join(CSV_DIR, "high_earner_details.csv"),
+    "likely_salary_earner": os.path.join(CSV_DIR, "likely_salary_earner.csv"),
+    "final_table": os.path.join(CSV_DIR, "final_table.csv"),
+    "consistent_earners_plot": os.path.join(PLOTS_DIR, "consistent_earners_predictions.png"),
+    "inconsistent_earners_plot": os.path.join(PLOTS_DIR, "inconsistent_earners_predictions.png"),
+    "hypothesis_overlap_plot": os.path.join(PLOTS_DIR, "hypothesis_overlap.png")
+}
@@ -0,0 +1,58 @@
+"""
+Consistent amount transaction analysis module.
+"""
+
+import pandas as pd
+from .config import MODEL_CONFIG
+
+class ConsistentAmountAnalyzer:
+    """Flag accounts whose ``initiated_by == 'C'`` amounts vary little.
+
+    Consistency is measured with the coefficient of variation (population
+    standard deviation divided by the mean) of each account's amounts.
+    """
+
+    def __init__(self, df):
+        """Store the transaction DataFrame to analyse.
+
+        Args:
+            df: DataFrame with at least ``accountid``, ``initiated_by`` and
+                ``amount`` columns. It is modified in place by
+                ``identify_consistent_amount_accounts``.
+        """
+        self.df = df
+        self.const_df = None
+
+    def calculate_coefficient_of_variation(self, group):
+        """Calculate coefficient of variation for a group of transactions.
+
+        Only rows with ``initiated_by == 'C'`` contribute.
+
+        Returns:
+            ``std / mean`` (population std), or NaN when the mean is zero or
+            the group has no qualifying rows.
+        """
+        amounts = group.loc[group['initiated_by'] == 'C', 'amount']
+        mean = amounts.mean()
+
+        if mean == 0:
+            return float('nan')
+        return amounts.std(ddof=0) / mean
+
+    def flag_consistent_amounts(self, group, cv_threshold=None):
+        """Flag accounts with low variance in transaction amounts.
+
+        Args:
+            group: transactions belonging to a single account.
+            cv_threshold: maximum coefficient of variation; defaults to
+                ``MODEL_CONFIG['cv_threshold']``.
+
+        Returns:
+            Boolean Series named ``is_consistent_amount`` aligned with
+            ``group.index``; every row carries the account-level verdict.
+        """
+        if cv_threshold is None:
+            cv_threshold = MODEL_CONFIG['cv_threshold']
+
+        cv = self.calculate_coefficient_of_variation(group)
+        is_consistent = bool(not pd.isna(cv) and cv <= cv_threshold)
+
+        return pd.Series(
+            is_consistent,
+            index=group.index,
+            name='is_consistent_amount',
+            dtype=bool
+        )
+
+    def identify_consistent_amount_accounts(self, cv_threshold=None):
+        """Identify accounts with consistent transaction amounts.
+
+        Adds a boolean ``is_consistent_amount`` column to ``self.df`` in place
+        (every row of an account gets the same value) and keeps a copy in
+        ``self.const_df``.
+
+        Args:
+            cv_threshold: maximum coefficient of variation; defaults to
+                ``MODEL_CONFIG['cv_threshold']``.
+
+        Returns:
+            ``self.df`` with the new column.
+        """
+        if cv_threshold is None:
+            cv_threshold = MODEL_CONFIG['cv_threshold']
+
+        qualifying = self.df[self.df['initiated_by'] == 'C']
+        amounts_by_account = qualifying.groupby('accountid')['amount']
+        mean = amounts_by_account.mean()
+        std = amounts_by_account.std(ddof=0)
+
+        # A zero mean makes the ratio undefined; mask it to NaN so the
+        # comparison below is False, matching
+        # calculate_coefficient_of_variation.
+        cv = (std / mean).where(mean != 0)
+        consistent_accounts = cv[cv <= cv_threshold].index
+
+        # Assign a column instead of replacing self.df: the previous version
+        # overwrote the DataFrame with the flag Series, which broke
+        # get_consistent_amount_data and left the caller's DataFrame without
+        # the flag. Accounts with no qualifying rows are absent from
+        # ``consistent_accounts`` and therefore flagged False.
+        self.df['is_consistent_amount'] = self.df['accountid'].isin(consistent_accounts)
+
+        self.const_df = self.df.copy()
+        return self.df
+
+    def get_consistent_amount_data(self):
+        """Get transactions identified as having consistent amounts.
+
+        Runs the analysis with default settings if it has not run yet.
+
+        Returns:
+            Rows flagged ``is_consistent_amount`` with ``initiated_by == 'C'``.
+        """
+        if self.const_df is None:
+            self.identify_consistent_amount_accounts()
+
+        return self.const_df[
+            (self.const_df['is_consistent_amount']) &
+            (self.const_df['initiated_by'] == 'C')
+        ]
@@ -0,0 +1,113 @@
+"""
+Data loading and preprocessing module.
+"""
+
+from sqlalchemy import create_engine, text
+import pandas as pd
+from datetime import datetime
+import logging
+from .config import DB_CONFIG, TABLE_NAME
+
+logger = logging.getLogger(__name__)
+
+class DataLoader:
+    """Load the transaction table from PostgreSQL into a pandas DataFrame."""
+
+    def __init__(self):
+        self.engine = None
+        self.df = None
+        self.chunk_size = 10000  # Load 10,000 rows at a time
+
+    def connect(self):
+        """Establish database connection.
+
+        Verifies that ``TABLE_NAME`` exists and logs its row count.
+
+        Returns:
+            True when the connection works and the table exists, else False.
+            ``self.engine`` is only set on success so that a later call
+            retries instead of reusing a broken engine.
+        """
+        try:
+            logger.info("Attempting to connect to database...")
+            DATABASE_URL = f"postgresql://{DB_CONFIG['user']}:{DB_CONFIG['password']}@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['name']}"
+            engine = create_engine(DATABASE_URL)
+            with engine.connect() as conn:
+                # First check if table exists; the name is compared as a
+                # value here, so it is passed as a bound parameter.
+                check_table = text(
+                    "SELECT EXISTS (SELECT FROM information_schema.tables "
+                    "WHERE table_name = :table_name)"
+                )
+                table_exists = conn.execute(check_table, {"table_name": TABLE_NAME}).scalar()
+
+                if not table_exists:
+                    logger.error(f"Table {TABLE_NAME} does not exist in the database")
+                    return False
+
+                # Get row count (identifiers cannot be bound, and TABLE_NAME
+                # is a module constant rather than user input).
+                count_query = text(f"SELECT COUNT(*) FROM {TABLE_NAME}")
+                row_count = conn.execute(count_query).scalar()
+                logger.info(f"Table {TABLE_NAME} exists with {row_count} rows")
+
+                # Get version
+                version = conn.execute(text("SELECT version();")).scalar()
+                logger.debug(f"Database version: {version}")
+
+            self.engine = engine
+            logger.info("Connected successfully to database!")
+            return True
+        except Exception as e:
+            logger.exception(f"Error connecting to database: {str(e)}")
+            return False
+
+    @staticmethod
+    def _preprocess_chunk(chunk):
+        """Parse the date columns and give the ``d1``-``d4`` columns readable names."""
+        chunk['trx_start_date'] = pd.to_datetime(chunk['trx_start_date'])
+        chunk['trx_end_date'] = pd.to_datetime(chunk['trx_end_date'])
+
+        return chunk.rename(columns={
+            'd1': 'trx_type',
+            'd2': 'trx_subtype',
+            'd3': 'initiated_by',
+            'd4': 'customer_id'
+        })
+
+    def load_data(self):
+        """Load and preprocess transaction data in chunks.
+
+        Returns:
+            The combined DataFrame (also stored in ``self.df``), or None when
+            the connection fails, the table yields no rows, or loading raises.
+        """
+        if not self.engine:
+            logger.info("No database connection. Attempting to connect...")
+            if not self.connect():
+                logger.error("Failed to establish database connection")
+                return None
+
+        try:
+            logger.info(f"Loading data from table: {TABLE_NAME}")
+
+            chunks = []
+            with self.engine.connect() as conn:
+                # First get total count
+                count_query = text(f"SELECT COUNT(*) FROM {TABLE_NAME}")
+                total_rows = conn.execute(count_query).scalar()
+                logger.info(f"Total rows to process: {total_rows}")
+
+                # Read one result set in chunks. Paging with LIMIT/OFFSET and
+                # no ORDER BY has no guaranteed row order, so consecutive
+                # pages could repeat or skip rows.
+                query = text(f"SELECT * FROM {TABLE_NAME}")
+                loaded = 0
+                for chunk in pd.read_sql(query, conn, chunksize=self.chunk_size):
+                    if chunk.empty:
+                        continue
+                    loaded += len(chunk)
+                    logger.info(f"Loaded {loaded} rows so far")
+                    chunks.append(self._preprocess_chunk(chunk))
+
+            if not chunks:
+                logger.error(f"Table {TABLE_NAME} returned no rows")
+                return None
+
+            # Combine all chunks
+            self.df = pd.concat(chunks, ignore_index=True)
+            logger.info(f"Successfully loaded {len(self.df)} rows of data")
+
+            # Basic data validation
+            logger.info("Performing data validation...")
+            logger.info(f"Columns in dataset: {self.df.columns.tolist()}")
+            logger.info(f"Data types:\n{self.df.dtypes}")
+            logger.info(f"Missing values:\n{self.df.isnull().sum()}")
+
+            return self.df
+        except Exception as e:
+            logger.exception(f"Error loading data: {str(e)}")
+            return None
+
+    def get_data(self):
+        """Get the loaded DataFrame (None, with a warning, if nothing was loaded)."""
+        if self.df is None:
+            logger.warning("No data loaded. Call load_data() first.")
+        return self.df
@@ -0,0 +1,47 @@
+"""
+Keyword-based salary transaction analysis module.
+"""
+
+import re
+import pandas as pd
+from .config import SALARY_KEYWORDS
+
+class KeywordAnalyzer:
+    """Flag transactions whose description looks salary-related."""
+
+    def __init__(self, df):
+        """Store the transaction DataFrame.
+
+        Args:
+            df: DataFrame with ``description`` and ``initiated_by`` columns.
+                It is modified in place by ``identify_salary_transactions``.
+        """
+        self.df = df
+        self.desc_df = None
+
+    def identify_salary_transactions(self):
+        """
+        Identifies potential salary-related transactions based on keywords
+        and month-year patterns in the 'description' column.
+
+        Adds a boolean ``is_salary_related`` column to ``self.df`` in place
+        (missing descriptions are flagged False) and stores a copy in
+        ``self.desc_df``.
+
+        Returns:
+            ``self.df`` with the new column.
+        """
+        month_year_patterns = [
+            r"\b(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)\s?\d{2,4}\b",
+            r"\b(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)\s?\d{2,4}\b"
+        ]
+
+        escaped_keywords = [re.escape(keyword.lower()) for keyword in SALARY_KEYWORDS]
+        combined_pattern = (
+            r'\b(?:' + '|'.join(escaped_keywords) + r')\b|' +
+            '|'.join(month_year_patterns)
+        )
+
+        # Match case-insensitively on the raw text. The previous version
+        # lower-cased the description first, so the upper-case month patterns
+        # above could never match.
+        self.df['is_salary_related'] = self.df['description'].str.contains(
+            combined_pattern,
+            flags=re.IGNORECASE,
+            na=False,
+            regex=True
+        )
+
+        self.desc_df = self.df.copy()
+        return self.df
+
+    def get_salary_related_data(self):
+        """Get transactions identified as salary-related.
+
+        Runs the identification first if it has not run yet.
+
+        Returns:
+            Rows flagged ``is_salary_related`` with ``initiated_by == 'C'``.
+        """
+        if self.desc_df is None:
+            self.identify_salary_transactions()
+
+        return self.desc_df[
+            self.desc_df['is_salary_related'] &
+            (self.desc_df['initiated_by'] == 'C')
+        ]
@@ -0,0 +1,134 @@
+"""
+Main module for running the salary analytics pipeline.
+"""
+
+import logging
+from .data_loader import DataLoader
+from .keyword_analyzer import KeywordAnalyzer
+from .consistent_amount_analyzer import ConsistentAmountAnalyzer
+from .transaction_type_analyzer import TransactionTypeAnalyzer
+from .salary_earner_analyzer import SalaryEarnerAnalyzer
+from .salary_predictor import SalaryPredictor
+
+logger = logging.getLogger(__name__)
+
+class SalaryAnalyticsPipeline:
+    """Orchestrate data loading, the three analyses, reporting and training.
+
+    Every analyzer receives the same DataFrame object (``self.df``); the
+    analyzers add their flag columns to it, and the report and training steps
+    read those columns.
+    """
+
+    def __init__(self):
+        logger.info("Initializing SalaryAnalyticsPipeline")
+        self.data_loader = None
+        self.df = None
+        self.keyword_analyzer = None
+        self.consistent_amount_analyzer = None
+        self.transaction_type_analyzer = None
+        self.salary_earner_analyzer = None
+        self.salary_predictor = None
+
+    def _require_data(self):
+        """Raise ValueError when no data has been loaded yet."""
+        if self.df is None:
+            logger.error("Data not loaded. Call load_data() first.")
+            raise ValueError("Data not loaded. Call load_data() first.")
+
+    def load_data(self):
+        """Load and preprocess the transaction data.
+
+        Returns:
+            True when a DataFrame was loaded, False otherwise.
+        """
+        logger.info("Starting data loading process")
+        self.data_loader = DataLoader()
+        self.df = self.data_loader.load_data()
+        if self.df is not None:
+            logger.info(f"Successfully loaded data with {len(self.df)} rows")
+        else:
+            logger.error("Failed to load data")
+        return self.df is not None
+
+    def run_keyword_analysis(self):
+        """Run keyword-based salary transaction analysis.
+
+        Returns:
+            The salary-related transactions reported by the analyzer.
+
+        Raises:
+            ValueError: if data has not been loaded.
+        """
+        self._require_data()
+
+        logger.info("Starting keyword analysis")
+        self.keyword_analyzer = KeywordAnalyzer(self.df)
+        self.keyword_analyzer.identify_salary_transactions()
+        return self.keyword_analyzer.get_salary_related_data()
+
+    def run_consistent_amount_analysis(self):
+        """Run consistent amount transaction analysis.
+
+        Returns:
+            The consistent-amount transactions reported by the analyzer.
+
+        Raises:
+            ValueError: if data has not been loaded.
+        """
+        self._require_data()
+
+        logger.info("Starting consistent amount analysis")
+        self.consistent_amount_analyzer = ConsistentAmountAnalyzer(self.df)
+        self.consistent_amount_analyzer.identify_consistent_amount_accounts()
+        return self.consistent_amount_analyzer.get_consistent_amount_data()
+
+    def run_transaction_type_analysis(self):
+        """Run transaction type analysis.
+
+        Returns:
+            The salary-type transactions reported by the analyzer.
+
+        Raises:
+            ValueError: if data has not been loaded.
+        """
+        self._require_data()
+
+        logger.info("Starting transaction type analysis")
+        self.transaction_type_analyzer = TransactionTypeAnalyzer(self.df)
+        self.transaction_type_analyzer.flag_salary_type_transactions()
+        return self.transaction_type_analyzer.get_salary_type_data()
+
+    def generate_salary_earner_reports(self):
+        """Generate salary earner reports.
+
+        Returns:
+            Whatever ``SalaryEarnerAnalyzer.generate_reports`` returns.
+
+        Raises:
+            ValueError: if data has not been loaded.
+        """
+        self._require_data()
+
+        logger.info("Starting salary earner report generation")
+        self.salary_earner_analyzer = SalaryEarnerAnalyzer(self.df)
+        return self.salary_earner_analyzer.generate_reports()
+
+    def train_salary_prediction_models(self):
+        """Train salary prediction models.
+
+        Accounts in the analyzer's ``final_table`` feed the "consistent"
+        model and accounts in ``likely_salary_earner`` feed the
+        "inconsistent" model. Reports are generated first when that has not
+        happened yet.
+
+        Raises:
+            ValueError: if data has not been loaded.
+        """
+        self._require_data()
+
+        logger.info("Starting model training")
+        self.salary_predictor = SalaryPredictor(self.df)
+
+        # Get accounts from the salary earner analyzer
+        if self.salary_earner_analyzer is None:
+            logger.info("Salary earner analyzer not initialized. Generating reports first.")
+            self.generate_salary_earner_reports()
+
+        consistent_accounts = self.salary_earner_analyzer.final_table['accountid'].unique()
+        inconsistent_accounts = self.salary_earner_analyzer.likely_salary_earner['accountid'].unique()
+
+        self.salary_predictor.train_and_evaluate(consistent_accounts, inconsistent_accounts)
+
+    def run_full_pipeline(self):
+        """Run the complete salary analytics pipeline.
+
+        Returns:
+            True when every step completed, False when loading failed or any
+            step raised (the exception is logged with its traceback).
+        """
+        logger.info("Starting full pipeline execution")
+        if not self.load_data():
+            logger.error("Failed to load data. Exiting pipeline.")
+            return False
+
+        try:
+            logger.info("Running keyword analysis...")
+            self.run_keyword_analysis()
+
+            logger.info("Running consistent amount analysis...")
+            self.run_consistent_amount_analysis()
+
+            logger.info("Running transaction type analysis...")
+            self.run_transaction_type_analysis()
+
+            logger.info("Generating salary earner reports...")
+            self.generate_salary_earner_reports()
+
+            logger.info("Training salary prediction models...")
+            self.train_salary_prediction_models()
+
+            logger.info("Pipeline completed successfully!")
+            return True
+        except Exception as e:
+            # logger.exception records the traceback, which a plain error
+            # message would lose.
+            logger.exception(f"Pipeline failed: {str(e)}")
+            return False
+
+def main():
+    """Main function to run the salary analytics pipeline.
+
+    Returns:
+        Process exit code: 0 when the pipeline succeeded, 1 otherwise.
+    """
+    pipeline = SalaryAnalyticsPipeline()
+    return 0 if pipeline.run_full_pipeline() else 1
+
+if __name__ == "__main__":
+    # Propagate failure to the shell instead of always exiting with status 0.
+    raise SystemExit(main())
@@ -0,0 +1,145 @@
+"""
+Salary earner analysis and report generation module.
+"""
+
+import pandas as pd
+import matplotlib.pyplot as plt
+from matplotlib_venn import venn3
+from datetime import datetime, timedelta
+from .config import MODEL_CONFIG, OUTPUT_PATHS
+
+class SalaryEarnerAnalyzer:
+    """Combine the three hypothesis flags into salary-earner reports.
+
+    The methods read the columns ``accountid``, ``id``, ``initiated_by``,
+    ``amount``, ``trx_start_date`` and the boolean flags
+    ``is_salary_related``, ``is_consistent_amount`` and ``is_salary_type``
+    from the DataFrame given to the constructor.
+    """
+
+    # Column order of the per-account table. It is also used to build an
+    # empty table so that column lookups still work when no account qualifies.
+    _TABLE_COLUMNS = [
+        'accountid',
+        'num_months',
+        'least_inflow_6m',
+        'avg_monthly_salary',
+        'estimated_next_amount',
+        'estimated_next_date',
+        '45daysalary',
+        '2monthssalary',
+    ]
+
+    def __init__(self, df):
+        self.df = df
+        self.final_table = None
+        self.likely_salary_earner = None
+        self.high_earner_details = None
+
+    def filter_venn_section(self, **kwargs):
+        """Filter accounts based on specified combinations of hypothesis flags.
+
+        Args:
+            **kwargs: flag name -> required boolean value. Allowed names are
+                ``is_salary_related``, ``is_consistent_amount`` and
+                ``is_salary_type``.
+
+        Returns:
+            Rows with ``initiated_by == 'C'`` whose flags equal every
+            requested value.
+
+        Raises:
+            ValueError: if an unknown flag name is passed.
+        """
+        valid_columns = {'is_salary_related', 'is_consistent_amount', 'is_salary_type'}
+        invalid_keys = set(kwargs.keys()) - valid_columns
+        if invalid_keys:
+            raise ValueError(f"Invalid keys: {invalid_keys}. Valid keys are {valid_columns}.")
+
+        df1 = self.df[self.df['initiated_by'] == 'C']
+        condition = pd.Series(True, index=df1.index)
+        for key, value in kwargs.items():
+            condition &= (df1[key] == value)
+
+        return df1[condition]
+
+    def plot_hypothesis_overlap(self, hypothesis1_df, hypothesis3_df, hypothesis4_df, account_col='accountid'):
+        """Plot and save Venn diagram showing overlap between hypotheses.
+
+        Args:
+            hypothesis1_df: rows carrying ``is_salary_related``.
+            hypothesis3_df: rows carrying ``is_consistent_amount``.
+            hypothesis4_df: rows carrying ``is_salary_type``.
+            account_col: column holding the account identifier.
+
+        The figure is written to ``OUTPUT_PATHS['hypothesis_overlap_plot']``.
+        """
+        set2 = set(hypothesis3_df[account_col][hypothesis3_df['is_consistent_amount']])
+        set3 = set(hypothesis1_df[account_col][hypothesis1_df['is_salary_related']])
+        set4 = set(hypothesis4_df[account_col][hypothesis4_df['is_salary_type']])
+
+        plt.figure(figsize=(10, 10))
+        venn3([set2, set3, set4], set_labels=('Consistent Amount',
+                                            'Salary Description', 'Transaction Type'))
+        plt.title('Overlap Between Hypotheses')
+        plt.savefig(OUTPUT_PATHS['hypothesis_overlap_plot'])
+        plt.close()
+
+    def generate_salary_earners_table(self, all_three_hypotheses):
+        """Generate a table of salary earners with their metrics.
+
+        Args:
+            all_three_hypotheses: transactions to summarise; needs
+                ``accountid``, ``trx_start_date`` and ``amount``.
+
+        Returns:
+            One row per account with the columns in ``_TABLE_COLUMNS``. Rows
+            containing any missing value are dropped — for example accounts
+            with a single transaction (no interval to estimate the next date
+            from) or with no transaction in the last 180 days.
+        """
+        # Use a single reference time for every account.
+        now = datetime.now()
+        six_months_ago = now - timedelta(days=180)
+        two_months_ago = now - timedelta(days=60)
+
+        results = []
+        for accountid, group in all_three_hypotheses.groupby('accountid'):
+            # NOTE(review): despite the name this counts transactions, not
+            # distinct months — confirm which is intended.
+            num_months = len(group)
+            dates = group['trx_start_date']
+            least_inflow = group.loc[dates >= six_months_ago, 'amount'].min()
+            avg_salary = group['amount'].mean()
+
+            # Intervals between consecutive transactions. Sort first so the
+            # differences are chronological, and work on a separate Series
+            # instead of writing into the groupby slice.
+            median_interval = dates.sort_values().diff().dt.days.median()
+
+            last_date = dates.max()
+            if pd.isna(median_interval):
+                # timedelta(days=NaN) raises; leave the date missing so the
+                # row is removed by dropna() below.
+                next_date = pd.NaT
+            else:
+                next_date = last_date + timedelta(days=float(median_interval))
+            next_amount = avg_salary
+
+            # Boolean flags
+            days_since_last = (now - last_date).days
+            has_45d = days_since_last <= 45
+            has_2m = int((dates >= two_months_ago).sum()) >= 2
+
+            results.append({
+                'accountid': accountid,
+                'num_months': num_months,
+                'least_inflow_6m': least_inflow,
+                'avg_monthly_salary': avg_salary,
+                'estimated_next_amount': next_amount,
+                'estimated_next_date': next_date,
+                '45daysalary': has_45d,
+                '2monthssalary': has_2m
+            })
+
+        # Passing the columns explicitly keeps them present for empty input.
+        final_df = pd.DataFrame(results, columns=self._TABLE_COLUMNS)
+        final_df = final_df.dropna()
+        return final_df
+
+    def analyze_salary_earners(self, final_df):
+        """Analyze salary earners and identify high earners.
+
+        Args:
+            final_df: table produced by ``generate_salary_earners_table``.
+
+        Returns:
+            Tuple ``(details, count)``: ``details`` holds ``accountid`` and
+            ``least_inflow_6m`` for accounts whose ``estimated_next_amount``
+            is at least ``MODEL_CONFIG['high_earner_threshold']``; ``count``
+            is the number of such rows.
+        """
+        high_earners = final_df[final_df['estimated_next_amount'] >= MODEL_CONFIG['high_earner_threshold']]
+        count_high = len(high_earners)
+
+        high_earner_details = high_earners[['accountid', 'least_inflow_6m']].reset_index(drop=True)
+        return high_earner_details, count_high
+
+    def generate_reports(self):
+        """Generate all salary earner reports.
+
+        Builds the verified table (all three flags True), the likely table
+        (salary type plus exactly one of the other two flags), the high
+        earner details and the Venn plot, then writes the three tables to
+        the CSV paths in ``OUTPUT_PATHS``.
+
+        Returns:
+            Dict with ``final_table``, ``likely_salary_earner``,
+            ``high_earner_details`` and ``total_high_earners``.
+        """
+        # Get accounts flagged by all three hypotheses
+        all_three_hypotheses = self.filter_venn_section(
+            is_salary_related=True,
+            is_consistent_amount=True,
+            is_salary_type=True
+        )
+
+        # Generate final table
+        self.final_table = self.generate_salary_earners_table(all_three_hypotheses)
+        print(f"Found {self.final_table['accountid'].nunique()} verified salary earners")
+
+        # Generate likely salary earner table
+        green_section = self.filter_venn_section(
+            is_salary_related=True,
+            is_consistent_amount=False,
+            is_salary_type=True
+        )
+
+        yellow_section = self.filter_venn_section(
+            is_salary_related=False,
+            is_consistent_amount=True,
+            is_salary_type=True
+        )
+
+        self.likely_salary_earner = pd.concat([yellow_section, green_section])
+        self.likely_salary_earner = self.likely_salary_earner.drop_duplicates(subset=['id'])
+        self.likely_salary_earner = self.generate_salary_earners_table(self.likely_salary_earner)
+        print(f"Found {self.likely_salary_earner['accountid'].nunique()} likely salary earners")
+
+        # Analyze high earners
+        self.high_earner_details, total_high_earners = self.analyze_salary_earners(self.final_table)
+        print(f"\nTotal High Earners: {total_high_earners}")
+
+        # Plot hypothesis overlap
+        self.plot_hypothesis_overlap(
+            self.df[self.df['is_salary_related']],
+            self.df[self.df['is_consistent_amount']],
+            self.df[self.df['is_salary_type']]
+        )
+
+        # Save reports
+        self.high_earner_details.to_csv(OUTPUT_PATHS['high_earner_details'], index=False)
+        self.likely_salary_earner.to_csv(OUTPUT_PATHS['likely_salary_earner'], index=False)
+        self.final_table.to_csv(OUTPUT_PATHS['final_table'], index=False)
+
+        return {
+            'final_table': self.final_table,
+            'likely_salary_earner': self.likely_salary_earner,
+            'high_earner_details': self.high_earner_details,
+            'total_high_earners': total_high_earners
+        }
@@ -0,0 +1,160 @@
+"""
+Salary prediction module using machine learning.
+"""
+
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.preprocessing import StandardScaler, OneHotEncoder
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
+from .config import OUTPUT_PATHS
+
+class SalaryPredictor:
+    """Train Random Forest models that predict an account's monthly amount.
+
+    Two models are kept: one for the "consistent" accounts and one for the
+    "inconsistent" accounts passed to ``train_and_evaluate``.
+    """
+
+    def __init__(self, df):
+        self.df = df
+        self.model_cons = None
+        self.model_incons = None
+        self.scaler_cons = None
+        self.scaler_incons = None
+
+    def add_feature_engineering(self, df):
+        """Engineer features for salary prediction.
+
+        Adds ``month``, ``month_seq``, one-hot ``trx_type_*`` columns and
+        rolling sum/mean of ``amount``.
+
+        Args:
+            df: transactions with ``accountid``, ``trx_start_date``,
+                ``trx_type`` and ``amount``; ``month`` and ``month_seq`` are
+                added to it in place.
+
+        Returns:
+            A new DataFrame sorted by account and date with all features.
+        """
+        dates = df['trx_start_date']
+        df['month'] = dates.dt.month
+
+        # Chronological month number per account (1 = first month seen).
+        # The year is included so the same calendar month in different years
+        # is not merged, and the ranking is per account so the sequence does
+        # not keep counting across accounts. Rows without a date get 0.
+        month_index = dates.dt.year * 12 + dates.dt.month
+        df['month_seq'] = (
+            month_index.groupby(df['accountid'])
+            .rank(method='dense')
+            .fillna(0)
+            .astype(int)
+        )
+
+        # Categorical encoding
+        encoder = OneHotEncoder(sparse_output=False)
+        encoded_trx_type = encoder.fit_transform(df[['trx_type']])
+        # Reuse df's index: with a fresh RangeIndex the concat below would
+        # align on labels and misplace rows of the filtered frame.
+        encoded_df = pd.DataFrame(
+            encoded_trx_type,
+            columns=encoder.get_feature_names_out(['trx_type']),
+            index=df.index
+        )
+        df = pd.concat([df, encoded_df], axis=1)
+
+        # Rolling statistics
+        # NOTE(review): the window is the last 3 transactions of the account,
+        # not 3 calendar months, despite the "_3m" suffix.
+        df = df.sort_values(['accountid', 'trx_start_date'])
+        rolling = df.groupby('accountid')['amount'].rolling(window=3, min_periods=1)
+        df['rolling_sum_3m'] = rolling.sum().reset_index(0, drop=True)
+        df['rolling_avg_3m'] = rolling.mean().reset_index(0, drop=True)
+
+        return df
+
+    def prepare_data(self, df_transactions, accounts):
+        """Prepare data for training and testing.
+
+        Transactions are aggregated per account and ``month_seq``. For each
+        account with at least 12 aggregated months, windows of 5 consecutive
+        months predict the following month's mean amount: targets at
+        positions 5-7 form the training set and positions 8-11 the test set.
+
+        Args:
+            df_transactions: full transaction DataFrame.
+            accounts: account ids to include.
+
+        Returns:
+            Tuple ``(X_train, y_train, X_test, y_test)`` of numpy arrays; all
+            four are empty when no account qualifies.
+        """
+        df_filtered = df_transactions[df_transactions['accountid'].isin(accounts)].copy()
+        print(f"Filtered data for {len(accounts)} accounts.")
+        print(f"Total transactions: {len(df_filtered)}")
+
+        if df_filtered.empty:
+            # The encoder cannot be fitted on zero rows; report "no data"
+            # through empty arrays instead of raising.
+            return np.array([]), np.array([]), np.array([]), np.array([])
+
+        # Drop unnecessary columns (tolerate frames that lack some of them)
+        df_filtered = df_filtered.drop(['description', 'id', 'customer_id',
+                                    'trx_end_date', 'is_salary_related',
+                                    'is_consistent_amount', 'is_salary_type'],
+                                   axis=1, errors='ignore')
+
+        # Add feature engineering
+        df_filtered = self.add_feature_engineering(df_filtered)
+
+        # Aggregate monthly data
+        agg_funcs = {
+            'amount': 'mean',
+            'rolling_sum_3m': 'last',
+            'rolling_avg_3m': 'last',
+            'month': 'first'
+        }
+        encoded_cols = [col for col in df_filtered.columns if col.startswith('trx_type_')]
+        for col in encoded_cols:
+            agg_funcs[col] = 'sum'
+
+        monthly_data = df_filtered.groupby(['accountid', 'month_seq']).agg(agg_funcs).reset_index()
+
+        # Filter accounts with at least 12 months
+        account_month_counts = monthly_data.groupby('accountid')['month_seq'].max()
+        valid_accounts = account_month_counts[account_month_counts >= 12].index
+        monthly_data = monthly_data[monthly_data['accountid'].isin(valid_accounts)]
+
+        # Create sequences
+        X_train, y_train, X_test, y_test = [], [], [], []
+        # NOTE(review): 'accountid' is fed to the model as a feature; it is
+        # an identifier and scaling will fail if it is not numeric — confirm
+        # whether it should be part of the feature set.
+        feature_cols = ['accountid', 'amount', 'rolling_sum_3m', 'rolling_avg_3m',
+                    'month'] + encoded_cols
+
+        for account in valid_accounts:
+            account_data = monthly_data[monthly_data['accountid'] == account].sort_values('month_seq')
+
+            if len(account_data) >= 12:
+                for t in range(5, 8):
+                    X_train.append(account_data.iloc[t-5:t][feature_cols].values.flatten())
+                    y_train.append(account_data['amount'].iloc[t])
+                for t in range(8, 12):
+                    X_test.append(account_data.iloc[t-5:t][feature_cols].values.flatten())
+                    y_test.append(account_data['amount'].iloc[t])
+            else:
+                print(f"Skipping account {account} due to insufficient data (less than 12 months).")
+
+        return np.array(X_train), np.array(y_train), np.array(X_test), np.array(y_test)
+
+    def train_model(self, X_train, y_train, X_test, y_test):
+        """Train and evaluate a Random Forest model.
+
+        Features are standardised with a scaler fitted on the training set;
+        MAE, RMSE and R-squared on the test set are printed.
+
+        Returns:
+            Tuple ``(model, scaler)``.
+        """
+        # Scale features
+        scaler = StandardScaler()
+        X_train_scaled = scaler.fit_transform(X_train)
+        X_test_scaled = scaler.transform(X_test)
+
+        # Train model
+        model = RandomForestRegressor(n_estimators=100, random_state=42)
+        model.fit(X_train_scaled, y_train)
+
+        # Evaluate
+        y_pred = model.predict(X_test_scaled)
+        mae = mean_absolute_error(y_test, y_pred)
+        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
+        r2 = r2_score(y_test, y_pred)
+        print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}, R-squared: {r2:.2f}")
+
+        return model, scaler
+
+    def plot_predictions(self, y_test, y_pred, title, output_path):
+        """Plot actual vs predicted values and save to file.
+
+        The dashed red line marks where predicted equals actual.
+        """
+        plt.figure(figsize=(10, 5))
+        plt.scatter(y_test, y_pred, alpha=0.5)
+        plt.xlabel("Actual Salary")
+        plt.ylabel("Predicted Salary")
+        plt.title(title)
+        plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], 'r--')
+        plt.savefig(output_path)
+        plt.close()
+
+    def _plot_test_predictions(self, model, scaler, X_test, y_test, title, output_path):
+        """Predict on the test set with a fitted model/scaler and save the scatter plot."""
+        y_pred = model.predict(scaler.transform(X_test))
+        self.plot_predictions(y_test, y_pred, title, output_path)
+
+    def train_and_evaluate(self, consistent_accounts, inconsistent_accounts):
+        """Train and evaluate models for both consistent and inconsistent salary earners.
+
+        Fitted models and scalers are stored on the instance; each group is
+        skipped with a message when it yields no training samples.
+
+        Args:
+            consistent_accounts: account ids for the "consistent" model.
+            inconsistent_accounts: account ids for the "inconsistent" model.
+        """
+        # Train model for consistent salary earners
+        X_train_cons, y_train_cons, X_test_cons, y_test_cons = self.prepare_data(self.df, consistent_accounts)
+        if len(X_train_cons) > 0:
+            self.model_cons, self.scaler_cons = self.train_model(X_train_cons, y_train_cons, X_test_cons, y_test_cons)
+            print("Model trained for consistent salary earners.")
+
+            self._plot_test_predictions(
+                self.model_cons,
+                self.scaler_cons,
+                X_test_cons,
+                y_test_cons,
+                "Actual vs. Predicted Salary (Consistent Earners)",
+                OUTPUT_PATHS['consistent_earners_plot']
+            )
+        else:
+            print("No accounts with sufficient data for consistent salary earners.")
+
+        # Train model for inconsistent salary earners
+        X_train_incons, y_train_incons, X_test_incons, y_test_incons = self.prepare_data(self.df, inconsistent_accounts)
+        if len(X_train_incons) > 0:
+            print("\nTraining model for inconsistent salary earners...")
+            self.model_incons, self.scaler_incons = self.train_model(X_train_incons, y_train_incons, X_test_incons, y_test_incons)
+
+            self._plot_test_predictions(
+                self.model_incons,
+                self.scaler_incons,
+                X_test_incons,
+                y_test_incons,
+                "Actual vs. Predicted Salary (Inconsistent Earners)",
+                OUTPUT_PATHS['inconsistent_earners_plot']
+            )
+        else:
+            print("No accounts with sufficient data for inconsistent salary earners.")
@@ -0,0 +1,43 @@
+"""
+Transaction type analysis module.
+"""
+
+import pandas as pd
+from .config import MODEL_CONFIG
+
+class TransactionTypeAnalyzer:
+    """Flag transactions whose type, subtype, initiator and amount match the salary criteria."""
+
+    def __init__(self, df):
+        self.df = df
+        self.trx_df = None
+
+    def flag_salary_type_transactions(self):
+        """Add the boolean ``is_salary_type`` column to ``self.df``.
+
+        A row is flagged when ``trx_type`` is 'T' or 'C', ``trx_subtype`` is
+        one of 'BI', 'I', 'BS', 'CI', ``initiated_by`` is 'C' and ``amount``
+        is positive. A copy of the flagged frame is kept in ``self.trx_df``.
+
+        Returns:
+            ``self.df`` with the new column.
+        """
+        frame = self.df
+        type_ok = frame['trx_type'].isin(['T', 'C'])
+        subtype_ok = frame['trx_subtype'].isin(['BI', 'I', 'BS', 'CI'])
+        initiator_ok = frame['initiated_by'] == 'C'
+        amount_ok = frame['amount'] > 0
+
+        frame['is_salary_type'] = type_ok & subtype_ok & initiator_ok & amount_ok
+
+        self.trx_df = frame.copy()
+        return frame
+
+    def is_salary_earner_by_type(self, group, min_transactions=None, threshold=None):
+        """Decide whether a group of transactions looks like a salary earner.
+
+        Args:
+            group: transactions carrying ``is_salary_type``.
+            min_transactions: minimum number of rows required; defaults to
+                ``MODEL_CONFIG['min_transactions']``.
+            threshold: minimum share of flagged rows; defaults to
+                ``MODEL_CONFIG['threshold']``.
+
+        Returns:
+            False when the group is too small, otherwise whether the share
+            of flagged rows reaches the threshold.
+        """
+        required_rows = MODEL_CONFIG['min_transactions'] if min_transactions is None else min_transactions
+        required_ratio = MODEL_CONFIG['threshold'] if threshold is None else threshold
+
+        if len(group) >= required_rows:
+            return group['is_salary_type'].mean() >= required_ratio
+        return False
+
+    def get_salary_type_data(self):
+        """Return the rows flagged as salary type, flagging them first if needed."""
+        if self.trx_df is None:
+            self.flag_salary_type_transactions()
+
+        flagged = self.trx_df['is_salary_type']
+        return self.trx_df[flagged]