Added new salary-related terms and improved image outputs in salary.ipynb
This commit is contained in:
@@ -0,0 +1,47 @@
|
||||
"""
|
||||
Keyword-based salary transaction analysis module.
|
||||
"""
|
||||
|
||||
import re
|
||||
import pandas as pd
|
||||
from .config import SALARY_KEYWORDS
|
||||
|
||||
class KeywordAnalyzer:
    """Flag salary-related rows of a transactions DataFrame by description text.

    A row is considered salary-related when its 'description' contains one of
    the configured keywords as a whole word, or a month name followed by a
    2-4 digit number (for example "JAN 24" or "March2024"). Matching is
    case-insensitive.

    Attributes:
        df: The DataFrame passed to the constructor. It is modified in place
            by ``identify_salary_transactions`` (a column is added).
        desc_df: Copy of ``df`` taken right after the last identification
            run, or ``None`` if identification has not run yet.
        keywords: Tuple of keywords used for matching.
    """

    # Month names (abbreviated or full), an optional single whitespace
    # character, then a 2-4 digit number ending on a word boundary.
    _MONTH_YEAR_PATTERNS = (
        r"\b(?:JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)\s?\d{2,4}\b",
        r"\b(?:JANUARY|FEBRUARY|MARCH|APRIL|MAY|JUNE|JULY|AUGUST|SEPTEMBER|OCTOBER|NOVEMBER|DECEMBER)\s?\d{2,4}\b",
    )

    def __init__(self, df, keywords=None):
        """Store the DataFrame to analyze and the keywords to look for.

        Args:
            df: DataFrame with a 'description' column; an 'initiated_by'
                column is also required by ``get_salary_related_data``.
            keywords: Optional iterable of keyword strings. When omitted,
                the module-level ``SALARY_KEYWORDS`` is used.
        """
        self.df = df
        self.desc_df = None
        # The default is resolved only when no explicit keywords are given.
        self.keywords = tuple(SALARY_KEYWORDS if keywords is None else keywords)

    def identify_salary_transactions(self):
        """Add a boolean 'is_salary_related' column to ``self.df``.

        Identifies potential salary-related transactions based on keywords
        and month-year patterns in the 'description' column. Missing
        descriptions are treated as not salary-related.

        Returns:
            ``self.df`` with the 'is_salary_related' column set. A copy of
            it is also stored in ``self.desc_df``.
        """
        # Empty keywords are dropped: an empty alternative would match at
        # every word boundary and flag practically every row.
        escaped_keywords = [
            re.escape(keyword) for keyword in self.keywords if keyword
        ]

        alternatives = []
        if escaped_keywords:
            alternatives.append(r'\b(?:' + '|'.join(escaped_keywords) + r')\b')
        alternatives.extend(self._MONTH_YEAR_PATTERNS)
        combined_pattern = '|'.join(alternatives)

        # case=False instead of lower-casing the text: the month patterns are
        # written in upper case and would never match lower-cased input.
        self.df['is_salary_related'] = self.df['description'].str.contains(
            combined_pattern,
            case=False,
            na=False,
            regex=True,
        )

        self.desc_df = self.df.copy()
        return self.df

    def get_salary_related_data(self):
        """Get transactions identified as salary-related.

        Runs ``identify_salary_transactions`` first if it has not been run.

        Returns:
            Rows of ``self.desc_df`` where 'is_salary_related' is true and
            'initiated_by' equals 'C'.
        """
        if self.desc_df is None:
            self.identify_salary_transactions()

        # NOTE(review): the meaning of initiated_by == 'C' is not visible
        # here (presumably a credit/counterparty code) - confirm with the
        # data schema.
        salary_mask = self.desc_df['is_salary_related']
        initiator_mask = self.desc_df['initiated_by'] == 'C'
        return self.desc_df[salary_mask & initiator_mask]
|
||||
Reference in New Issue
Block a user