create filter, utils
This commit is contained in:
parent
e25f01dcf6
commit
f9e47c91b4
4 changed files with 89 additions and 0 deletions
52
README.md
52
README.md
|
|
@ -0,0 +1,52 @@
|
||||||
|
# Wiqaya
|
||||||
|
|
||||||
|
A Python library for multilingual profanity detection and filtering. It identifies and censors offensive or abusive words across multiple languages.
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install wiqaya
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
```python
|
||||||
|
from wiqaya import Wiqaya
|
||||||
|
|
||||||
|
w = Wiqaya(lang="ar")
|
||||||
|
|
||||||
|
w.is_profane("هذا نص عادي") # False
|
||||||
|
w.is_profane("نص يحتوي شتيمة") # True
|
||||||
|
|
||||||
|
w.get_profane_words("نص فيه شتيمة") # ['شتيمة']
|
||||||
|
```
|
||||||
|
|
||||||
|
## Supported Languages
|
||||||
|
|
||||||
|
| Code | Language | Code | Language | Code | Language |
|
||||||
|
|------|----------|------|----------|------|----------|
|
||||||
|
| af | Afrikaans | am | Amharic | ar | Arabic (العربية) |
|
||||||
|
| az | Azerbaijani | be | Belarusian | bg | Bulgarian |
|
||||||
|
| ca | Catalan | ceb | Cebuano | cs | Czech |
|
||||||
|
| cy | Welsh | da | Danish | de | German |
|
||||||
|
| dz | Dzongkha | el | Greek | en | English |
|
||||||
|
| eo | Esperanto | es | Spanish | et | Estonian |
|
||||||
|
| eu | Basque | fa | Persian | fi | Finnish |
|
||||||
|
| fil | Filipino | fr | French | gd | Scottish Gaelic |
|
||||||
|
| gl | Galician | hi | Hindi | hr | Croatian |
|
||||||
|
| hu | Hungarian | hy | Armenian | id | Indonesian |
|
||||||
|
| is | Icelandic | it | Italian | ja | Japanese |
|
||||||
|
| kab | Kabyle | kh | Khmer | ko | Korean |
|
||||||
|
| la | Latin | lt | Lithuanian | lv | Latvian |
|
||||||
|
| mi | Maori | mk | Macedonian | ml | Malayalam |
|
||||||
|
| mn | Mongolian | mr | Marathi | ms | Malay |
|
||||||
|
| mt | Maltese | my | Burmese | nl | Dutch |
|
||||||
|
| no | Norwegian | pih | Norfuk | piy | Picard |
|
||||||
|
| pl | Polish | pt | Portuguese | ro | Romanian |
|
||||||
|
| rop | Kriol | ru | Russian | sk | Slovak |
|
||||||
|
| sl | Slovenian | sm | Samoan | sq | Albanian |
|
||||||
|
| sr | Serbian | sv | Swedish | ta | Tamil |
|
||||||
|
| te | Telugu | tet | Tetum | th | Thai |
|
||||||
|
| tlh | Klingon | to | Tongan | tr | Turkish |
|
||||||
|
| uk | Ukrainian | uz | Uzbek | vi | Vietnamese |
|
||||||
|
| yid | Yiddish | zh | Chinese | zu | Zulu |
|
||||||
29
src/wiqaya/filter.py
Normal file
29
src/wiqaya/filter.py
Normal file
|
|
@ -0,0 +1,29 @@
|
||||||
|
import unicodedata
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# Directory holding the per-language word lists, read as ``<lang>.txt``.
# Resolved three levels up from this file, which for ``src/wiqaya/filter.py``
# is the repository root.
# NOTE(review): this points outside the package directory — presumably the
# word lists are not found once the package is installed; confirm packaging.
DATA_DIR = Path(__file__).parent.parent.parent / "data"
|
||||||
|
|
||||||
|
class Wiqaya:
    """Detect profane words in text using a per-language word list.

    The word list is read from ``DATA_DIR/<lang>.txt`` (one entry per line)
    when the instance is created. Matching is done per whitespace-separated
    token, case-insensitively, ignoring punctuation attached to a token.
    """

    def __init__(self, lang: str):
        """Load the word list for *lang*.

        Args:
            lang: Language code; selects the ``<lang>.txt`` word-list file.

        Raises:
            ValueError: If no word-list file exists for *lang*.
        """
        self.lang = lang

        try:
            with open(f"{DATA_DIR}/{self.lang}.txt", "r", encoding="utf-8") as f:
                # Entries are lower-cased because input text is lower-cased
                # before lookup; blank lines are dropped so "" is never a word.
                self.WORDS = {
                    entry for line in f if (entry := line.strip().lower())
                }
        except FileNotFoundError as err:
            raise ValueError(f"Language '{self.lang}' not supported") from err

    @staticmethod
    def _strip_punctuation(token: str) -> str:
        """Return *token* without leading/trailing Unicode punctuation."""
        start, end = 0, len(token)
        while start < end and unicodedata.category(token[start]).startswith("P"):
            start += 1
        while end > start and unicodedata.category(token[end - 1]).startswith("P"):
            end -= 1
        return token[start:end]

    def _matches(self, text: str):
        """Yield, in order, the word-list entry matched by each token of *text*."""
        for token in text.lower().split():
            # Try the raw token first so entries that themselves contain
            # punctuation (e.g. "f*ck") keep matching exactly as written.
            if token in self.WORDS:
                yield token
                continue
            core = self._strip_punctuation(token)
            if core and core != token and core in self.WORDS:
                yield core

    def is_profane(self, text: str) -> bool:
        """Return True if any token of *text* is in the word list."""
        return any(True for _ in self._matches(text))

    def get_profane_words(self, text: str) -> list[str]:
        """Return the matched words of *text*, in order, duplicates included."""
        return list(self._matches(text))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke test. Guarded so that importing the library does not read
    # a word list and print to stdout as a side effect.
    t = Wiqaya("ar").get_profane_words("اهلا بك يا بزاز")
    print(t)
|
||||||
0
src/wiqaya/utils.py
Normal file
0
src/wiqaya/utils.py
Normal file
8
uv.lock
generated
Normal file
8
uv.lock
generated
Normal file
|
|
@ -0,0 +1,8 @@
|
||||||
|
version = 1
|
||||||
|
revision = 3
|
||||||
|
requires-python = ">=3.12"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "wiqaya"
|
||||||
|
version = "0.1.0"
|
||||||
|
source = { editable = "." }
|
||||||
Loading…
Add table
Reference in a new issue