47 lines
1.4 KiB
Python
47 lines
1.4 KiB
Python
"""
|
|
Keyword-based salary transaction analysis module.
|
|
"""
|
|
|
|
import re
|
|
import pandas as pd
|
|
from .config import SALARY_KEYWORDS
|
|
|
|
class KeywordAnalyzer:
    """Flag salary-related rows of a transactions DataFrame.

    A row is flagged when its ``description`` contains one of the salary
    keywords as a whole word, or a month name (abbreviated or full) followed
    by a 2-4 digit number such as ``JAN 2024`` or ``march24``.

    Attributes:
        df: The DataFrame passed to the constructor. It is modified in place
            by ``identify_salary_transactions`` (a column is added).
        desc_df: Copy of ``df`` taken after the last identification run, or
            ``None`` if identification has not run yet.
    """

    # Month name, at most one whitespace character, then 2-4 digits.
    # Matching is done case-insensitively, so the upper-case spelling here
    # also covers "jan 24", "Jan2024", etc.
    _MONTH_YEAR_PATTERNS = (
        r"\b(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)\s?\d{2,4}\b",
        r"\b(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)\s?\d{2,4}\b",
    )

    def __init__(self, df):
        """Store the DataFrame to analyze; no analysis is performed yet."""
        self.df = df
        self.desc_df = None

    def identify_salary_transactions(self, keywords=None):
        """Add a boolean ``is_salary_related`` column to ``self.df``.

        Args:
            keywords: Optional iterable of keyword strings to look for.
                Defaults to the module-level ``SALARY_KEYWORDS``. Keywords
                are matched literally (regex metacharacters are escaped),
                case-insensitively, and only on word boundaries.

        Returns:
            ``self.df`` (the same object, now carrying the new column). A
            copy of it is also stored in ``self.desc_df``.

        Raises:
            KeyError: If ``self.df`` has no ``description`` column.
        """
        if keywords is None:
            keywords = SALARY_KEYWORDS

        # Drop empty keywords: an empty alternative would turn the keyword
        # group into a bare word-boundary test that matches nearly every
        # description.
        escaped_keywords = [re.escape(keyword) for keyword in keywords if keyword]

        alternatives = []
        if escaped_keywords:
            alternatives.append(r"\b(?:" + "|".join(escaped_keywords) + r")\b")
        alternatives.extend(self._MONTH_YEAR_PATTERNS)
        combined_pattern = "|".join(alternatives)

        # case=False instead of lower-casing the text: the month patterns
        # are written in upper case and would never match lower-cased input.
        # na=False makes missing descriptions count as "not salary-related".
        matches = self.df["description"].str.contains(
            combined_pattern,
            case=False,
            na=False,
            regex=True,
        )
        self.df["is_salary_related"] = matches.astype(bool)

        self.desc_df = self.df.copy()
        return self.df

    def get_salary_related_data(self):
        """Return flagged rows whose ``initiated_by`` value equals ``"C"``.

        Runs ``identify_salary_transactions`` first (with the default
        keywords) if it has not been run yet; otherwise the cached
        ``self.desc_df`` from the last run is used.

        Returns:
            A DataFrame containing the rows of ``self.desc_df`` where
            ``is_salary_related`` is true and ``initiated_by == "C"``.

        Raises:
            KeyError: If the ``initiated_by`` column is missing.
        """
        if self.desc_df is None:
            self.identify_salary_transactions()

        is_flagged = self.desc_df["is_salary_related"].astype(bool)
        initiated_by_c = self.desc_df["initiated_by"] == "C"
        return self.desc_df[is_flagged & initiated_by_c]