feat: add wildcard pattern support for profanity words

This commit is contained in:
tayf 2026-03-08 19:55:55 +02:00
parent 8caaaa6675
commit 8ced36795a
2 changed files with 43 additions and 16 deletions

View file

@ -1,32 +1,44 @@
from pathlib import Path from pathlib import Path
from .utils import remove_tashkeel from .utils import remove_tashkeel
import re
DATA_DIR = Path(__file__).parent / "data" DATA_DIR = Path(__file__).parent / "data"
class Wiqaya: class Wiqaya:
def __init__(self, lang: str):
    """Load the word list for *lang* and split it into literals and wildcards.

    Reads ``<DATA_DIR>/<lang>.txt`` as UTF-8, one entry per line; blank
    lines are ignored.  Entries containing ``*`` are compiled to anchored
    regular expressions (each ``*`` becomes ``.*``) and stored in
    ``self._patterns``; all other entries go into the ``self.WORDS`` set.

    Raises:
        ValueError: if no word-list file exists for *lang*.
    """
    self.lang = lang

    try:
        with open(f"{DATA_DIR}/{self.lang}.txt", "r", encoding="utf-8") as f:
            entries = [stripped for stripped in map(str.strip, f) if stripped]
    except FileNotFoundError:
        raise ValueError(f"Language '{self.lang}' not supported")

    def _compile_wildcard(entry):
        # Escape everything first so only the wildcard itself is special,
        # then turn the escaped asterisk back into "match anything".
        regex = re.escape(entry).replace(r"\*", ".*")
        return re.compile(f"^{regex}$")

    self.WORDS = {entry for entry in entries if "*" not in entry}
    self._patterns = [
        _compile_wildcard(entry) for entry in entries if "*" in entry
    ]
def _matches_any_pattern(self, word: str) -> bool:
return any(p.match(word) for p in self._patterns)
def _is_bad(self, word: str) -> bool:
return word in self.WORDS or self._matches_any_pattern(word)
def is_profane(self, text) -> bool:
    """Return True if any token produced by ``_process(text)`` is flagged."""
    for token in self._process(text):
        if self._is_bad(token):
            return True
    return False
def get_profane_words(self, text) -> list[str]:
    """Return the flagged tokens of *text*, in order of appearance.

    Tokens are those produced by ``_process(text)``; a token that occurs
    more than once is reported once per occurrence.
    """
    return list(filter(self._is_bad, self._process(text)))
def censor(self, text: str, char: str = "*") -> str:
    """Return *text* with every flagged token replaced by *char*.

    Each whitespace-delimited token of the original text is normalised
    through ``_process`` and, if flagged by ``_is_bad``, replaced by
    ``char`` repeated once per character of the *original* token.
    Whitespace between tokens is preserved unchanged.

    Args:
        text: the text to censor.
        char: the masking string repeated for each character of a flagged
            token (default ``"*"``).

    Returns:
        The censored text.
    """
    def _mask(match):
        token = match.group()
        # Decide on the normalised form, but mask the original token: the
        # previous implementation searched the raw text for the normalised
        # word, so a token whose normalised form differs from its original
        # (e.g. upper-case) was never censored, and a flagged word was also
        # masked inside unrelated longer tokens.
        if any(self._is_bad(word) for word in self._process(token)):
            return char * len(token)
        return token

    # \S+ selects whitespace-delimited runs, mirroring the whitespace split
    # that _process applies when tokenising.
    return re.sub(r"\S+", _mask, text)
@ -34,5 +46,3 @@ class Wiqaya:
if self.lang == "ar": if self.lang == "ar":
text = remove_tashkeel(text) text = remove_tashkeel(text)
return text.lower().split() return text.lower().split()

View file

@ -56,3 +56,20 @@ def test_invalid_lang():
import pytest import pytest
with pytest.raises(ValueError): with pytest.raises(ValueError):
Wiqaya(lang="xx") Wiqaya(lang="xx")
def test_wildcard_support():
    """Check that wildcard entries flag words that merely embed a listed term.

    NOTE(review): presumably the ``en`` word list contains wildcard entries
    covering the terms used below -- verify against the data file.
    """
    w = Wiqaya(lang="en")

    # is_profane: the term may appear at the end, start, or middle of a token.
    assert w.is_profane("wwsfuck")
    assert w.is_profane("fuckwedf")
    assert w.is_profane("wd+wfucked+")

    # get_profane_words: only the offending token is reported.
    assert w.get_profane_words("hello fsdfuckwwq clean") == ["fsdfuckwwq"]

    # censor: the whole matching token is masked, with a custom char if given.
    assert w.censor("hello dsfuckw there") == "hello ******* there"
    assert w.censor("dsfuckw ffdamn", char="#") == "####### ######"