feat: add wildcard pattern support for profanity words
This commit is contained in:
parent
8caaaa6675
commit
8ced36795a
2 changed files with 43 additions and 16 deletions
|
|
@ -1,38 +1,48 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from .utils import remove_tashkeel
|
from .utils import remove_tashkeel
|
||||||
|
import re
|
||||||
|
|
||||||
DATA_DIR = Path(__file__).parent / "data"
|
DATA_DIR = Path(__file__).parent / "data"
|
||||||
|
|
||||||
class Wiqaya:
    """Profanity filter supporting exact words and wildcard patterns.

    Word lists are loaded from ``data/<lang>.txt``. Entries containing
    ``*`` are treated as wildcard patterns (``*word*`` matches any token
    containing "word", ``word*`` matches any token starting with it);
    all other entries are matched exactly.
    """

    def __init__(self, lang: str):
        """Load the word list for *lang*.

        Args:
            lang: language code matching a ``data/<lang>.txt`` file.

        Raises:
            ValueError: if no word list exists for *lang*.
        """
        self.lang = lang

        try:
            with open(f"{DATA_DIR}/{self.lang}.txt", "r", encoding="utf-8") as f:
                # Drop blank lines so they don't become empty "words".
                lines = [line.strip() for line in f if line.strip()]
        except FileNotFoundError:
            # Suppress exception chaining: callers only care that the
            # language is unsupported, not about the missing file path.
            raise ValueError(f"Language '{self.lang}' not supported") from None

        # Exact words live in a set for O(1) membership tests; wildcard
        # entries are compiled once to fully-anchored regexes.
        self.WORDS = set()
        self._patterns = []
        for entry in lines:
            if "*" in entry:
                # Convert wildcard to regex: *word* → .*word.*, word* → word.*
                regex = re.escape(entry).replace(r"\*", ".*")
                self._patterns.append(re.compile(f"^{regex}$"))
            else:
                self.WORDS.add(entry)

    def _matches_any_pattern(self, word: str) -> bool:
        """Return True if *word* matches any compiled wildcard pattern."""
        return any(p.match(word) for p in self._patterns)

    def _is_bad(self, word: str) -> bool:
        """Return True if *word* is profane by exact or wildcard match."""
        return word in self.WORDS or self._matches_any_pattern(word)

    def is_profane(self, text: str) -> bool:
        """Return True if any word in *text* is profane."""
        return any(self._is_bad(w) for w in self._process(text))

    def get_profane_words(self, text: str) -> list[str]:
        """Return the profane words found in *text*, in order of appearance."""
        return [w for w in self._process(text) if self._is_bad(w)]

    def censor(self, text: str, char: str = "*") -> str:
        """Return *text* with every profane word masked by *char*.

        Replacement is case-insensitive: ``_process`` lowercases before
        matching, so a plain ``str.replace`` with the lowercased word
        would miss capitalized occurrences (e.g. "FUCK") — this was a
        bug in the previous implementation.
        """
        for word in self._process(text):
            if self._is_bad(word):
                # Lambda replacement keeps *char* literal (a plain
                # replacement string would interpret backslashes).
                text = re.sub(
                    re.escape(word),
                    lambda m: char * len(m.group()),
                    text,
                    flags=re.IGNORECASE,
                )
        return text

    def _process(self, text: str) -> list[str]:
        """Normalize *text* and split it into lowercase whitespace tokens."""
        if self.lang == "ar":
            # Diacritics (tashkeel) would defeat exact matching in Arabic.
            text = remove_tashkeel(text)
        return text.lower().split()
||||||
|
|
||||||
|
|
||||||
|
|
@ -55,4 +55,21 @@ def test_get_profane_words_en():
|
||||||
def test_invalid_lang():
    """Constructing Wiqaya with an unknown language code must raise ValueError."""
    try:
        Wiqaya(lang="xx")
    except ValueError:
        pass
    else:
        raise AssertionError("expected ValueError for unsupported language 'xx'")
|
||||||
|
|
||||||
|
|
||||||
|
def test_wildcard_support():
    """Wildcard entries in the word list must match prefix/suffix/infix forms.

    Uses bare asserts instead of ``== True`` (flake8 E712 / ruff idiom):
    comparing a bool to True with ``==`` is redundant and hides intent.
    """
    w = Wiqaya(lang="en")

    # is_profane: tokens containing the wildcard stem are flagged.
    assert w.is_profane("wwsfuck")
    assert w.is_profane("fuckwedf")
    assert w.is_profane("wd+wfucked+")

    # get_profane_words: only the matching token is returned.
    assert w.get_profane_words("hello fsdfuckwwq clean") == ["fsdfuckwwq"]

    # censor: matched tokens are masked character-for-character.
    assert w.censor("hello dsfuckw there") == "hello ******* there"
    assert w.censor("dsfuckw ffdamn", char="#") == "####### ######"
|
||||||
Loading…
Add table
Reference in a new issue