From 8ced36795a4371ec27d687e699df327b2b41c33e Mon Sep 17 00:00:00 2001 From: tayf <144544047+be-at@users.noreply.github.com> Date: Sun, 8 Mar 2026 19:55:55 +0200 Subject: [PATCH] feat: add wildcard pattern support for profanity words --- src/wiqaya/filter.py | 40 +++++++++++++++++++++++++--------------- tests/test_filter.py | 19 ++++++++++++++++++- 2 files changed, 43 insertions(+), 16 deletions(-) diff --git a/src/wiqaya/filter.py b/src/wiqaya/filter.py index c1d4e43..9680b4c 100644 --- a/src/wiqaya/filter.py +++ b/src/wiqaya/filter.py @@ -1,38 +1,48 @@ -from pathlib import Path +from pathlib import Path from .utils import remove_tashkeel - +import re DATA_DIR = Path(__file__).parent / "data" class Wiqaya: def __init__(self, lang: str): self.lang = lang - try: with open(f"{DATA_DIR}/{self.lang}.txt", "r", encoding="utf-8") as f: - self.WORDS = set(line.strip() for line in f) - + lines = [line.strip() for line in f if line.strip()] except FileNotFoundError: raise ValueError(f"Language '{self.lang}' not supported") + self.WORDS = set() + self._patterns = [] + + for entry in lines: + if "*" in entry: + # Convert wildcard to regex: *word* → .*word.*, word* → word.* + regex = re.escape(entry).replace(r"\*", ".*") + self._patterns.append(re.compile(f"^{regex}$")) + else: + self.WORDS.add(entry) + + def _matches_any_pattern(self, word: str) -> bool: + return any(p.match(word) for p in self._patterns) + + def _is_bad(self, word: str) -> bool: + return word in self.WORDS or self._matches_any_pattern(word) + def is_profane(self, text) -> bool: - words = self._process(text) - return any(word in self.WORDS for word in words) + return any(self._is_bad(w) for w in self._process(text)) def get_profane_words(self, text) -> list[str]: - words = self._process(text) - return [word for word in words if word in self.WORDS] + return [w for w in self._process(text) if self._is_bad(w)] def censor(self, text: str, char: str = "*") -> str: - words = self._process(text) - for word in words: - if word in self.WORDS: + for word in self._process(text): + if self._is_bad(word): text = text.replace(word, char * len(word)) return text def _process(self, text: str) -> list[str]: if self.lang == "ar": text = remove_tashkeel(text) - return text.lower().split() - - + return text.lower().split() \ No newline at end of file diff --git a/tests/test_filter.py b/tests/test_filter.py index 2156081..b6dfbea 100644 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -55,4 +55,21 @@ def test_get_profane_words_en(): def test_invalid_lang(): import pytest with pytest.raises(ValueError): - Wiqaya(lang="xx") \ No newline at end of file + Wiqaya(lang="xx") + + +def test_wildcard_support(): + w = Wiqaya(lang="en") + + # is_profane + assert w.is_profane("wwsfuck") == True + assert w.is_profane("fuckwedf") == True + assert w.is_profane("wd+wfucked+") == True + + + # get_profane_words + assert w.get_profane_words("hello fsdfuckwwq clean") == ["fsdfuckwwq"] + + # censor + assert w.censor("hello dsfuckw there") == "hello ******* there" + assert w.censor("dsfuckw ffdamn", char="#") == "####### ######" \ No newline at end of file