update
This commit is contained in:
parent
fbd872a40a
commit
c40cebcab6
3 changed files with 84 additions and 4 deletions
|
|
@ -22,6 +22,9 @@ w.get_profane_words("نص فيه حرامي و أطرش") # ['حرامي', 'أ
|
||||||
> [!NOTE]
|
> [!NOTE]
|
||||||
> تدعم المكتبة إزالة التشكيل تلقائياً عند استخدام اللغة العربية
|
> تدعم المكتبة إزالة التشكيل تلقائياً عند استخدام اللغة العربية
|
||||||
|
|
||||||
|
> [!TIP]
|
||||||
|
> يدعم المشروع النمط البديل (Wildcard) في قوائم الكلمات — استخدم `*` للتطابق مع أي تسلسل من الأحرف (مثال: `bad*` تطابق `badly`، و`*word*` تطابق أي كلمة تحتوي على `word`)
|
||||||
|
|
||||||
## اللغات المدعومة
|
## اللغات المدعومة
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,9 @@ w.get_profane_words("this is damn annoying") # ['damn']
|
||||||
> [!NOTE]
|
> [!NOTE]
|
||||||
> The library automatically removes Arabic diacritics (Tashkeel) when using Arabic language mode
|
> The library automatically removes Arabic diacritics (Tashkeel) when using Arabic language mode
|
||||||
|
|
||||||
|
> [!TIP]
|
||||||
|
> Wildcard patterns are supported in word lists — use `*` to match any sequence of characters (e.g., `bad*` matches `badly`, `*word*` matches anything containing `word`)
|
||||||
|
|
||||||
## Supported Languages
|
## Supported Languages
|
||||||
|
|
||||||
| Code | Language | Code | Language | Code | Language |
|
| Code | Language | Code | Language | Code | Language |
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,19 @@ DATA_DIR = Path(__file__).parent / "data"
|
||||||
|
|
||||||
class Wiqaya:
|
class Wiqaya:
|
||||||
def __init__(self, lang: str):
|
def __init__(self, lang: str):
|
||||||
|
"""
|
||||||
|
Initialize the Wiqaya profanity filter for a given language.
|
||||||
|
|
||||||
|
Loads the word list from a language-specific .txt file in the data directory.
|
||||||
|
Entries containing '*' are treated as wildcard patterns and compiled into
|
||||||
|
regex objects. Plain entries are stored in a set for O(1) lookup.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
lang (str): Language code (e.g., 'ar', 'en'). Must match a filename in data/.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If no word list file exists for the given language.
|
||||||
|
"""
|
||||||
self.lang = lang
|
self.lang = lang
|
||||||
try:
|
try:
|
||||||
with open(f"{DATA_DIR}/{self.lang}.txt", "r", encoding="utf-8") as f:
|
with open(f"{DATA_DIR}/{self.lang}.txt", "r", encoding="utf-8") as f:
|
||||||
|
|
@ -15,7 +28,6 @@ class Wiqaya:
|
||||||
|
|
||||||
self.WORDS = set()
|
self.WORDS = set()
|
||||||
self._patterns = []
|
self._patterns = []
|
||||||
|
|
||||||
for entry in lines:
|
for entry in lines:
|
||||||
if "*" in entry:
|
if "*" in entry:
|
||||||
# Convert wildcard to regex: *word* → .*word.*, word* → word.*
|
# Convert wildcard to regex: *word* → .*word.*, word* → word.*
|
||||||
|
|
@ -25,25 +37,87 @@ class Wiqaya:
|
||||||
self.WORDS.add(entry)
|
self.WORDS.add(entry)
|
||||||
|
|
||||||
def _matches_any_pattern(self, word: str) -> bool:
|
def _matches_any_pattern(self, word: str) -> bool:
|
||||||
|
"""
|
||||||
|
Check whether a word matches any of the compiled wildcard regex patterns.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
word (str): The word to test.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
bool: True if the word matches at least one pattern, False otherwise.
|
||||||
|
"""
|
||||||
return any(p.match(word) for p in self._patterns)
|
return any(p.match(word) for p in self._patterns)
|
||||||
|
|
||||||
def _is_bad(self, word: str) -> bool:
|
def _is_bad(self, word: str) -> bool:
|
||||||
|
"""
|
||||||
|
Determine if a single word is considered profane.
|
||||||
|
|
||||||
|
Checks both the exact-match word set and the wildcard pattern list.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
word (str): The word to check.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
bool: True if the word is profane, False otherwise.
|
||||||
|
"""
|
||||||
return word in self.WORDS or self._matches_any_pattern(word)
|
return word in self.WORDS or self._matches_any_pattern(word)
|
||||||
|
|
||||||
def is_profane(self, text) -> bool:
|
def is_profane(self, text: str) -> bool:
|
||||||
|
"""
|
||||||
|
Return True if the text contains at least one profane word.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text (str): The input text to scan.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
bool: True if any profane word is found, False otherwise.
|
||||||
|
"""
|
||||||
return any(self._is_bad(w) for w in self._process(text))
|
return any(self._is_bad(w) for w in self._process(text))
|
||||||
|
|
||||||
def get_profane_words(self, text) -> list[str]:
|
def get_profane_words(self, text: str) -> list[str]:
|
||||||
|
"""
|
||||||
|
Extract and return all profane words found in the text.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text (str): The input text to scan.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list[str]: A list of every word in the text that is considered profane.
|
||||||
|
"""
|
||||||
return [w for w in self._process(text) if self._is_bad(w)]
|
return [w for w in self._process(text) if self._is_bad(w)]
|
||||||
|
|
||||||
def censor(self, text: str, char: str = "*") -> str:
|
def censor(self, text: str, char: str = "*") -> str:
|
||||||
|
"""
|
||||||
|
Replace each profane word in the text with a repeated censor character.
|
||||||
|
|
||||||
|
The replacement preserves the original word's length (e.g., 'hell' → '****').
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text (str): The input text to censor.
|
||||||
|
char (str): The character used for censoring. Defaults to '*'.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The censored version of the input text.
|
||||||
|
"""
|
||||||
for word in self._process(text):
|
for word in self._process(text):
|
||||||
if self._is_bad(word):
|
if self._is_bad(word):
|
||||||
text = text.replace(word, char * len(word))
|
text = text.replace(word, char * len(word))
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def _process(self, text: str) -> list[str]:
|
def _process(self, text: str) -> list[str]:
|
||||||
# حذف التشكيل من الكلمات العربية لتجنب التحايل
|
"""
|
||||||
|
Normalize and tokenize the input text into a list of words.
|
||||||
|
|
||||||
|
For Arabic text, diacritics (tashkeel) are stripped first to prevent
|
||||||
|
users from bypassing the filter by adding vowel marks to profane words.
|
||||||
|
The text is then lowercased and split on whitespace.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text (str): The raw input text.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
list[str]: A list of normalized, lowercase tokens.
|
||||||
|
"""
|
||||||
if self.lang == "ar":
|
if self.lang == "ar":
|
||||||
text = remove_tashkeel(text)
|
text = remove_tashkeel(text)
|
||||||
return text.lower().split()
|
return text.lower().split()
|
||||||
Loading…
Add table
Reference in a new issue