From f9e47c91b4808452ca336bb9849bfb5a745649c7 Mon Sep 17 00:00:00 2001 From: tayf Date: Sun, 8 Mar 2026 09:10:10 +0200 Subject: [PATCH] create filter, utils --- README.md | 52 ++++++++++++++++++++++++++++++++++++++++++++ src/wiqaya/filter.py | 29 ++++++++++++++++++++++++ src/wiqaya/utils.py | 0 uv.lock | 8 +++++++ 4 files changed, 89 insertions(+) create mode 100644 src/wiqaya/filter.py create mode 100644 src/wiqaya/utils.py create mode 100644 uv.lock diff --git a/README.md b/README.md index e69de29..d243747 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,52 @@ +# Wiqaya + +A Python library for multilingual profanity detection and filtering. It identifies and censors offensive or abusive words across multiple languages. + +## Installation + +```bash +pip install wiqaya +``` + +## Usage + +```python +from wiqaya import Wiqaya + +w = Wiqaya(lang="ar") + +w.is_profane("هذا نص عادي") # False +w.is_profane("نص يحتوي شتيمة") # True + +w.get_profane_words("نص فيه كلمة سيئة") # ['كلمة سيئة'] +``` + +## Supported Languages + +| Code | Language | Code | Language | Code | Language | +|------|----------|------|----------|------|----------| +| af | Afrikaans | am | Amharic | ar | العربية | +| az | Azerbaijani | be | Belarusian | bg | Bulgarian | +| ca | Catalan | ceb | Cebuano | cs | Czech | +| cy | Welsh | da | Danish | de | German | +| dz | Dzongkha | el | Greek | en | English | +| eo | Esperanto | es | Spanish | et | Estonian | +| eu | Basque | fa | Persian | fi | Finnish | +| fil | Filipino | fr | French | gd | Scottish Gaelic | +| gl | Galician | hi | Hindi | hr | Croatian | +| hu | Hungarian | hy | Armenian | id | Indonesian | +| is | Icelandic | it | Italian | ja | Japanese | +| kab | Kabyle | kh | Khmer | ko | Korean | +| la | Latin | lt | Lithuanian | lv | Latvian | +| mi | Maori | mk | Macedonian | ml | Malayalam | +| mn | Mongolian | mr | Marathi | ms | Malay | +| mt | Maltese | my | Burmese | nl | Dutch | +| no | Norwegian | pih | Norfuk | piy | Picard | +| pl | 
from pathlib import Path

DATA_DIR = Path(__file__).parent.parent.parent / "data"


class Wiqaya:
    """Detect profane words and phrases in text using a per-language word list.

    The word list is read from ``<data_dir>/<lang>.txt`` (one entry per line).
    Entries and input text are compared case-insensitively, and entries made
    of several whitespace-separated words are matched against consecutive
    words of the text.

    Args:
        lang: Language code; selects the ``<lang>.txt`` word list.
        data_dir: Directory holding the word lists. Defaults to ``DATA_DIR``.

    Raises:
        ValueError: If no word list exists for ``lang`` (or ``lang`` is not a
            plain file-name component).
    """

    def __init__(self, lang: str, data_dir: "str | Path | None" = None):
        self.lang = lang
        base_dir = DATA_DIR if data_dir is None else Path(data_dir)

        # A language code must be a bare name: anything with path components
        # (e.g. "../secret") would read files outside the data directory.
        if not isinstance(lang, str) or Path(lang).name != lang:
            raise ValueError(f"Language '{self.lang}' not supported")

        try:
            # "utf-8-sig" reads plain UTF-8 too, but drops a leading BOM that
            # would otherwise become part of the first entry.
            with open(base_dir / f"{lang}.txt", encoding="utf-8-sig") as f:
                entries = {self._normalize(line) for line in f}
        except FileNotFoundError as err:
            raise ValueError(f"Language '{self.lang}' not supported") from err

        # Blank lines in the data file normalize to "" and are not real entries.
        entries.discard("")
        self.WORDS = entries
        # Longest entry measured in words; bounds the n-gram window below.
        self._max_tokens = max(
            (entry.count(" ") + 1 for entry in entries), default=1
        )

    @staticmethod
    def _normalize(text: str) -> str:
        """Case-fold *text* and collapse every whitespace run to one space."""
        return " ".join(text.casefold().split())

    def _iter_matches(self, text: str):
        """Yield each listed word/phrase found in *text*, in text order."""
        tokens = text.casefold().split()
        # Fall back to single-word matching if the instance was populated
        # without going through __init__.
        longest = getattr(self, "_max_tokens", 1)
        total = len(tokens)
        for start in range(total):
            for size in range(1, min(longest, total - start) + 1):
                candidate = " ".join(tokens[start:start + size])
                if candidate in self.WORDS:
                    yield candidate

    def is_profane(self, text: str) -> bool:
        """Return True if *text* contains at least one listed word or phrase."""
        return any(True for _ in self._iter_matches(text))

    def get_profane_words(self, text: str) -> list[str]:
        """Return every listed word or phrase found in *text*.

        Matches are case-folded, appear in the order they occur in the text,
        and are repeated if they occur more than once.
        """
        return list(self._iter_matches(text))


if __name__ == "__main__":
    # Manual smoke test; guarded so that importing the module has no side
    # effects and does not require the Arabic word list to be present.
    t = Wiqaya("ar").get_profane_words("اهلا بك يا بزاز")
    print(t)