feat: add wildcard pattern support for profanity words

2026-03-08 19:55:55 +02:00 · 2026-03-08 19:55:55 +02:00 · 8ced36795a
commit 8ced36795a
parent 8caaaa6675
2 changed files with 43 additions and 16 deletions
--- a/src/wiqaya/filter.py
+++ b/src/wiqaya/filter.py
@ -1,32 +1,44 @@
 from pathlib import Path
 from .utils import remove_tashkeel
-
+import re
 DATA_DIR = Path(__file__).parent / "data"
 class Wiqaya:
    def __init__(self, lang: str):
        self.lang = lang
        try:
            with open(f"{DATA_DIR}/{self.lang}.txt", "r", encoding="utf-8") as f:
-                self.WORDS = set(line.strip() for line in f)
+                lines = [line.strip() for line in f if line.strip()]
        except FileNotFoundError:
            raise ValueError(f"Language '{self.lang}' not supported")
        self.WORDS = set()
        self._patterns = []
        for entry in lines:
            if "*" in entry:
                # Convert wildcard to regex: *word* → .*word.*, word* → word.*
                regex = re.escape(entry).replace(r"\*", ".*")
                self._patterns.append(re.compile(f"^{regex}$"))
            else:
                self.WORDS.add(entry)
    def _matches_any_pattern(self, word: str) -> bool:
        return any(p.match(word) for p in self._patterns)
    def _is_bad(self, word: str) -> bool:
        return word in self.WORDS or self._matches_any_pattern(word)
    def is_profane(self, text) -> bool:
-        words = self._process(text)
+        return any(self._is_bad(w) for w in self._process(text))
        return any(word in self.WORDS for word in words)
    def get_profane_words(self, text) -> list[str]:
-        words = self._process(text)
+        return [w for w in self._process(text) if self._is_bad(w)]
        return [word for word in words if word in self.WORDS]
    def censor(self, text: str, char: str = "*") -> str:
-        words = self._process(text)
+        for word in self._process(text):
-        for word in words:
+            if self._is_bad(word):
            if word in self.WORDS:
                text = text.replace(word, char * len(word))
        return text
@ -34,5 +46,3 @@ class Wiqaya:
        if self.lang == "ar":
            text = remove_tashkeel(text)
        return text.lower().split()
--- a/tests/test_filter.py
+++ b/tests/test_filter.py
@ -56,3 +56,20 @@ def test_invalid_lang():
    import pytest
    with pytest.raises(ValueError):
        Wiqaya(lang="xx")
 def test_wildcard_support():
    w = Wiqaya(lang="en")
    # is_profane
    assert w.is_profane("wwsfuck")    == True 
    assert w.is_profane("fuckwedf")   == True 
    assert w.is_profane("wd+wfucked+") == True
    # get_profane_words
    assert w.get_profane_words("hello fsdfuckwwq clean") == ["fsdfuckwwq"]
    # censor
    assert w.censor("hello dsfuckw there")    == "hello ******* there"
    assert w.censor("dsfuckw ffdamn", char="#") == "####### ######"