From c40cebcab6e717ef176e1e45a0279968bdc6151b Mon Sep 17 00:00:00 2001 From: tayf <144544047+be-at@users.noreply.github.com> Date: Tue, 10 Mar 2026 13:56:23 +0200 Subject: [PATCH] update --- README-ar.md | 3 ++ README.md | 3 ++ src/wiqaya/filter.py | 82 +++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 84 insertions(+), 4 deletions(-) diff --git a/README-ar.md b/README-ar.md index 6b24540..eb6a879 100644 --- a/README-ar.md +++ b/README-ar.md @@ -22,6 +22,9 @@ w.get_profane_words("نص فيه حرامي و أطرش") # ['حرامي', 'أ > [!NOTE] > تدعم المكتبة إزالة التشكيل تلقائياً عند استخدام اللغة العربية +> [!TIP] +> يدعم المشروع النمط البديل (Wildcard) في قوائم الكلمات — استخدم `*` للتطابق مع أي تسلسل من الأحرف (مثال: `bad*` تطابق `badly`، و`*word*` تطابق أي كلمة تحتوي على `word`) + ## اللغات المدعومة diff --git a/README.md b/README.md index 34c8dd8..563387b 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,9 @@ w.get_profane_words("this is damn annoying") # ['damn'] > [!NOTE] > The library automatically removes Arabic diacritics (Tashkeel) when using Arabic language mode +> [!TIP] +> Wildcard patterns are supported in word lists — use `*` to match any sequence of characters (e.g., `bad*` matches `badly`, `*word*` matches anything containing `word`) + ## Supported Languages | Code | Language | Code | Language | Code | Language | diff --git a/src/wiqaya/filter.py b/src/wiqaya/filter.py index e7e3cbb..227d493 100644 --- a/src/wiqaya/filter.py +++ b/src/wiqaya/filter.py @@ -6,6 +6,19 @@ DATA_DIR = Path(__file__).parent / "data" class Wiqaya: def __init__(self, lang: str): + """ + Initialize the Wiqaya profanity filter for a given language. + + Loads the word list from a language-specific .txt file in the data directory. + Entries containing '*' are treated as wildcard patterns and compiled into + regex objects. Plain entries are stored in a set for O(1) lookup. + + Args: + lang (str): Language code (e.g., 'ar', 'en'). Must match a filename in data/. 
+ + Raises: + ValueError: If no word list file exists for the given language. + """ self.lang = lang try: with open(f"{DATA_DIR}/{self.lang}.txt", "r", encoding="utf-8") as f: @@ -15,7 +28,6 @@ class Wiqaya: self.WORDS = set() self._patterns = [] - for entry in lines: if "*" in entry: # Convert wildcard to regex: *word* → .*word.*, word* → word.* @@ -25,25 +37,87 @@ class Wiqaya: self.WORDS.add(entry) def _matches_any_pattern(self, word: str) -> bool: + """ + Check whether a word matches any of the compiled wildcard regex patterns. + + Args: + word (str): The word to test. + + Returns: + bool: True if the word matches at least one pattern, False otherwise. + """ return any(p.match(word) for p in self._patterns) def _is_bad(self, word: str) -> bool: + """ + Determine if a single word is considered profane. + + Checks both the exact-match word set and the wildcard pattern list. + + Args: + word (str): The word to check. + + Returns: + bool: True if the word is profane, False otherwise. + """ return word in self.WORDS or self._matches_any_pattern(word) - def is_profane(self, text) -> bool: + def is_profane(self, text: str) -> bool: + """ + Return True if the text contains at least one profane word. + + Args: + text (str): The input text to scan. + + Returns: + bool: True if any profane word is found, False otherwise. + """ return any(self._is_bad(w) for w in self._process(text)) - def get_profane_words(self, text) -> list[str]: + def get_profane_words(self, text: str) -> list[str]: + """ + Extract and return all profane words found in the text. + + Args: + text (str): The input text to scan. + + Returns: + list[str]: The normalized (lowercased) tokens of the text that are considered profane, in order of appearance, duplicates included. + """ return [w for w in self._process(text) if self._is_bad(w)] def censor(self, text: str, char: str = "*") -> str: + """ + Replace each profane word in the text with a repeated censor character. + + The replacement repeats `char` once per character of the token (e.g., 'hell' → '****'); tokens are normalized (lowercased) before matching and replaced via `str.replace`, so occurrences in a different letter case are left untouched and occurrences inside longer words are censored too.
+ + Args: + text (str): The input text to censor. + char (str): The character used for censoring. Defaults to '*'. + + Returns: + str: The censored version of the input text. + """ for word in self._process(text): if self._is_bad(word): text = text.replace(word, char * len(word)) return text def _process(self, text: str) -> list[str]: - # حذف التشكيل من الكلمات العربية لتجنب التحايل + """ + Normalize and tokenize the input text into a list of words. + + When the filter language is 'ar', diacritics (tashkeel) are stripped first to prevent + users from bypassing the filter by adding vowel marks to profane words. + The text is then lowercased and split on whitespace. + + Args: + text (str): The raw input text. + + Returns: + list[str]: A list of normalized, lowercase tokens. + """ if self.lang == "ar": text = remove_tashkeel(text) return text.lower().split() \ No newline at end of file