From c40cebcab6e717ef176e1e45a0279968bdc6151b Mon Sep 17 00:00:00 2001
From: tayf <144544047+be-at@users.noreply.github.com>
Date: Tue, 10 Mar 2026 13:56:23 +0200
Subject: [PATCH] Add docstrings, type hints, and wildcard README tips

---
 README-ar.md         |  3 ++
 README.md            |  3 ++
 src/wiqaya/filter.py | 82 +++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 84 insertions(+), 4 deletions(-)

diff --git a/README-ar.md b/README-ar.md
index 6b24540..eb6a879 100644
--- a/README-ar.md
+++ b/README-ar.md
@@ -22,6 +22,9 @@ w.get_profane_words("نص فيه حرامي و أطرش")   # ['حرامي', 'أ
 > [!NOTE]
 > تدعم المكتبة إزالة التشكيل تلقائياً عند استخدام اللغة العربية
 
+> [!TIP]
+> تدعم المكتبة أنماط أحرف البدل (Wildcard) في قوائم الكلمات — استخدم `*` لمطابقة أي تسلسل من الأحرف (مثال: `bad*` تطابق `badly`، و`*word*` تطابق أي كلمة تحتوي على `word`)
+
 ## اللغات المدعومة 
 
 
diff --git a/README.md b/README.md
index 34c8dd8..563387b 100644
--- a/README.md
+++ b/README.md
@@ -25,6 +25,9 @@ w.get_profane_words("this is damn annoying") # ['damn']
 > [!NOTE]
 > The library automatically removes Arabic diacritics (Tashkeel) when using Arabic language mode
 
+> [!TIP]
+> Wildcard patterns are supported in word lists — use `*` to match any sequence of characters (e.g., `bad*` matches `badly`, `*word*` matches any word containing `word`)
+
 ## Supported Languages
 
 | Code | Language | Code | Language | Code | Language |
diff --git a/src/wiqaya/filter.py b/src/wiqaya/filter.py
index e7e3cbb..227d493 100644
--- a/src/wiqaya/filter.py
+++ b/src/wiqaya/filter.py
@@ -6,6 +6,19 @@ DATA_DIR = Path(__file__).parent / "data"
 
 class Wiqaya:
     def __init__(self, lang: str):
+        """
+        Initialize the Wiqaya profanity filter for a given language.
+
+        Loads the word list from a language-specific .txt file in the data directory.
+        Entries containing '*' are treated as wildcard patterns and compiled into
+        regex objects. Plain entries are stored in a set for O(1) lookup.
+
+        Args:
+            lang (str): Language code (e.g., 'ar', 'en'). Must match a filename in data/.
+        
+        Raises:
+            ValueError: If no word list file exists for the given language.
+        """
         self.lang = lang
         try:
             with open(f"{DATA_DIR}/{self.lang}.txt", "r", encoding="utf-8") as f:
@@ -15,7 +28,6 @@ class Wiqaya:
 
         self.WORDS = set()
         self._patterns = []
-
         for entry in lines:
             if "*" in entry:
                 # Convert wildcard to regex: *word* → .*word.*, word* → word.*
@@ -25,25 +37,87 @@ class Wiqaya:
                 self.WORDS.add(entry)
 
     def _matches_any_pattern(self, word: str) -> bool:
+        """
+        Check whether a word matches any of the compiled wildcard regex patterns.
+
+        Args:
+            word (str): The word to test.
+
+        Returns:
+            bool: True if the word matches at least one pattern, False otherwise.
+        """
         return any(p.match(word) for p in self._patterns)
 
     def _is_bad(self, word: str) -> bool:
+        """
+        Determine if a single word is considered profane.
+
+        Checks both the exact-match word set and the wildcard pattern list.
+
+        Args:
+            word (str): The word to check.
+
+        Returns:
+            bool: True if the word is profane, False otherwise.
+        """
         return word in self.WORDS or self._matches_any_pattern(word)
 
-    def is_profane(self, text) -> bool:
+    def is_profane(self, text: str) -> bool:
+        """
+        Return True if the text contains at least one profane word.
+
+        Args:
+            text (str): The input text to scan.
+
+        Returns:
+            bool: True if any profane word is found, False otherwise.
+        """
         return any(self._is_bad(w) for w in self._process(text))
 
-    def get_profane_words(self, text) -> list[str]:
+    def get_profane_words(self, text: str) -> list[str]:
+        """
+        Extract and return all profane words found in the text.
+
+        Args:
+            text (str): The input text to scan.
+
+        Returns:
+            list[str]: The profane tokens in order of appearance, normalized (lowercased), with repeats kept.
+        """
         return [w for w in self._process(text) if self._is_bad(w)]
 
     def censor(self, text: str, char: str = "*") -> str:
+        """
+        Replace each profane word in the text with a repeated censor character.
+
+        Substring-replaces each normalized (lowercased) token with char * len(token), e.g. 'hell' → '****'; 'Hell' is kept.
+
+        Args:
+            text (str): The input text to censor.
+            char (str): The character used for censoring. Defaults to '*'.
+
+        Returns:
+            str: The censored version of the input text.
+        """
         for word in self._process(text):
             if self._is_bad(word):
                 text = text.replace(word, char * len(word))
         return text
 
     def _process(self, text: str) -> list[str]:
-        # حذف التشكيل من الكلمات العربية لتجنب التحايل
+        """
+        Normalize and tokenize the input text into a list of words.
+
+        When the filter language is 'ar', diacritics (tashkeel) are stripped first to prevent
+        users from bypassing the filter by adding vowel marks to profane words.
+        The text is then lowercased and split on whitespace.
+
+        Args:
+            text (str): The raw input text.
+
+        Returns:
+            list[str]: A list of normalized, lowercase tokens.
+        """
         if self.lang == "ar":
             text = remove_tashkeel(text)
         return text.lower().split()
\ No newline at end of file