create filter, utils
This commit is contained in:
parent
e25f01dcf6
commit
f9e47c91b4
4 changed files with 89 additions and 0 deletions
52
README.md
52
README.md
|
|
@ -0,0 +1,52 @@
|
|||
# Wiqaya
|
||||
|
||||
A Python library for multilingual profanity detection and filtering. It identifies and censors offensive or abusive words across multiple languages.
|
||||
|
||||
## Installation
|
||||
|
||||
```bash
|
||||
pip install wiqaya
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
```python
|
||||
from wiqaya import Wiqaya
|
||||
|
||||
w = Wiqaya(lang="ar")
|
||||
|
||||
w.is_profane("هذا نص عادي") # False
|
||||
w.is_profane("نص يحتوي شتيمة") # True
|
||||
|
||||
w.get_profane_words("نص فيه كلمة سيئة") # ['كلمة سيئة']
|
||||
```
|
||||
|
||||
## Supported Languages
|
||||
|
||||
| Code | Language | Code | Language | Code | Language |
|
||||
|------|----------|------|----------|------|----------|
|
||||
| af | Afrikaans | am | Amharic | ar | Arabic |
|
||||
| az | Azerbaijani | be | Belarusian | bg | Bulgarian |
|
||||
| ca | Catalan | ceb | Cebuano | cs | Czech |
|
||||
| cy | Welsh | da | Danish | de | German |
|
||||
| dz | Dzongkha | el | Greek | en | English |
|
||||
| eo | Esperanto | es | Spanish | et | Estonian |
|
||||
| eu | Basque | fa | Persian | fi | Finnish |
|
||||
| fil | Filipino | fr | French | gd | Scottish Gaelic |
|
||||
| gl | Galician | hi | Hindi | hr | Croatian |
|
||||
| hu | Hungarian | hy | Armenian | id | Indonesian |
|
||||
| is | Icelandic | it | Italian | ja | Japanese |
|
||||
| kab | Kabyle | kh | Khmer | ko | Korean |
|
||||
| la | Latin | lt | Lithuanian | lv | Latvian |
|
||||
| mi | Maori | mk | Macedonian | ml | Malayalam |
|
||||
| mn | Mongolian | mr | Marathi | ms | Malay |
|
||||
| mt | Maltese | my | Burmese | nl | Dutch |
|
||||
| no | Norwegian | pih | Norfuk | piy | Picard |
|
||||
| pl | Polish | pt | Portuguese | ro | Romanian |
|
||||
| rop | Kriol | ru | Russian | sk | Slovak |
|
||||
| sl | Slovenian | sm | Samoan | sq | Albanian |
|
||||
| sr | Serbian | sv | Swedish | ta | Tamil |
|
||||
| te | Telugu | tet | Tetum | th | Thai |
|
||||
| tlh | Klingon | to | Tongan | tr | Turkish |
|
||||
| uk | Ukrainian | uz | Uzbek | vi | Vietnamese |
|
||||
| yid | Yiddish | zh | Chinese | zu | Zulu |
|
||||
29
src/wiqaya/filter.py
Normal file
29
src/wiqaya/filter.py
Normal file
|
|
@ -0,0 +1,29 @@
|
|||
from pathlib import Path
|
||||
|
||||
|
||||
|
||||
# Directory that holds the per-language word lists; Wiqaya opens
# "<DATA_DIR>/<lang>.txt". It is located two directories above the one that
# contains this module.
# NOTE(review): that location is outside the package directory itself --
# confirm the data files are actually shipped with an installed distribution.
DATA_DIR = Path(__file__).parent.parent.parent / "data"
|
||||
|
||||
class Wiqaya:
    """Word-list based profanity detector for a single language.

    The word list is read once, at construction time, from
    ``<DATA_DIR>/<lang>.txt`` (one entry per line) and stored in
    ``self.WORDS``. Matching is case-insensitive and operates on
    whitespace-separated tokens only, so punctuation attached to a word and
    multi-word entries are not matched.
    """

    def __init__(self, lang: str):
        """Load the word list for *lang*.

        Args:
            lang: Language code; selects the ``<lang>.txt`` file in DATA_DIR.

        Raises:
            ValueError: If no word-list file exists for *lang*.
        """
        self.lang = lang

        try:
            with open(f"{DATA_DIR}/{self.lang}.txt", "r", encoding="utf-8") as f:
                # Entries are lower-cased because the input text is lower-cased
                # before lookup; otherwise an entry containing an uppercase
                # letter could never match. Blank lines are skipped.
                self.WORDS = {
                    word.lower() for line in f if (word := line.strip())
                }
        except FileNotFoundError as err:
            # Keep the original error as the explicit cause for debugging.
            raise ValueError(f"Language '{self.lang}' not supported") from err

    @staticmethod
    def _tokenize(text: str) -> list[str]:
        """Return the lower-cased, whitespace-separated tokens of *text*."""
        return text.lower().split()

    def is_profane(self, text: str) -> bool:
        """Return True if any token of *text* is in the word list."""
        return any(word in self.WORDS for word in self._tokenize(text))

    def get_profane_words(self, text: str) -> list[str]:
        """Return the tokens of *text* found in the word list.

        Tokens are returned lower-cased, in order of appearance, and
        repeated if they occur more than once.
        """
        return [word for word in self._tokenize(text) if word in self.WORDS]
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test. Guarded so that importing this module as a library
    # neither prints nor fails when the Arabic word list is unavailable.
    t = Wiqaya("ar").get_profane_words("اهلا بك يا بزاز")
    print(t)
|
||||
0
src/wiqaya/utils.py
Normal file
0
src/wiqaya/utils.py
Normal file
8
uv.lock
generated
Normal file
8
uv.lock
generated
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
version = 1
|
||||
revision = 3
|
||||
requires-python = ">=3.12"
|
||||
|
||||
[[package]]
|
||||
name = "wiqaya"
|
||||
version = "0.1.0"
|
||||
source = { editable = "." }
|
||||
Loading…
Add table
Reference in a new issue