Compare commits
No commits in common. "master" and "v0.2.1" have entirely different histories.
83 changed files with 47 additions and 171 deletions
23
.github/workflows/publish.yml
vendored
Normal file
23
.github/workflows/publish.yml
vendored
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
name: Publish to PyPI
|
||||||
|
|
||||||
|
on:
|
||||||
|
release:
|
||||||
|
types: [published]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
publish:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
environment: pypi
|
||||||
|
permissions:
|
||||||
|
id-token: write
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- uses: astral-sh/setup-uv@v5
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
run: uv build
|
||||||
|
|
||||||
|
- name: Publish
|
||||||
|
uses: pypa/gh-action-pypi-publish@release/v1
|
||||||
34
.github/workflows/release.yml
vendored
34
.github/workflows/release.yml
vendored
|
|
@ -1,34 +0,0 @@
|
||||||
name: Release
|
|
||||||
on:
|
|
||||||
workflow_dispatch:
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
release:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
environment: pypi
|
|
||||||
permissions:
|
|
||||||
id-token: write
|
|
||||||
contents: write
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v4
|
|
||||||
|
|
||||||
- uses: astral-sh/setup-uv@v5
|
|
||||||
|
|
||||||
- name: Get version from pyproject.toml
|
|
||||||
id: get_version
|
|
||||||
run: |
|
|
||||||
VERSION=$(python -c "import tomllib; data=tomllib.load(open('pyproject.toml','rb')); print(data['project']['version'])")
|
|
||||||
echo "version=$VERSION" >> $GITHUB_OUTPUT
|
|
||||||
|
|
||||||
- name: Build
|
|
||||||
run: uv build
|
|
||||||
|
|
||||||
- name: Create GitHub Release
|
|
||||||
uses: softprops/action-gh-release@v2
|
|
||||||
with:
|
|
||||||
tag_name: v${{ steps.get_version.outputs.version }}
|
|
||||||
name: v${{ steps.get_version.outputs.version }}
|
|
||||||
generate_release_notes: true
|
|
||||||
|
|
||||||
- name: Publish to PyPI
|
|
||||||
uses: pypa/gh-action-pypi-publish@release/v1
|
|
||||||
|
|
@ -20,10 +20,8 @@ w.get_profane_words("نص فيه حرامي و أطرش") # ['حرامي', 'أ
|
||||||
```
|
```
|
||||||
|
|
||||||
> [!NOTE]
|
> [!NOTE]
|
||||||
> تدعم المكتبة إزالة التشكيل تلقائياً عند استخدام اللغة العربية
|
> المكتبة تدعم إزالة التشكيل من الكلمات تلقائياً
|
||||||
|
|
||||||
> [!TIP]
|
|
||||||
> يدعم المشروع النمط البديل (Wildcard) في قوائم الكلمات — استخدم `*` للتطابق مع أي تسلسل من الأحرف (مثال: `bad*` تطابق `badly`، و`*word*` تطابق أي كلمة تحتوي على `word`)
|
|
||||||
|
|
||||||
## اللغات المدعومة
|
## اللغات المدعومة
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -22,11 +22,6 @@ w.is_profane("Hello World") # False
|
||||||
w.get_profane_words("this is damn annoying") # ['damn']
|
w.get_profane_words("this is damn annoying") # ['damn']
|
||||||
|
|
||||||
```
|
```
|
||||||
> [!NOTE]
|
|
||||||
> The library automatically removes Arabic diacritics (Tashkeel) when using Arabic language mode
|
|
||||||
|
|
||||||
> [!TIP]
|
|
||||||
> Wildcard patterns are supported in word lists — use `*` to match any sequence of characters (e.g., `bad*` matches `badly`, `*word*` matches anything containing `word`)
|
|
||||||
|
|
||||||
## Supported Languages
|
## Supported Languages
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -966,7 +966,7 @@ zwimel
|
||||||
ابو فص
|
ابو فص
|
||||||
ابو قرعة
|
ابو قرعة
|
||||||
اتن
|
اتن
|
||||||
*احا*
|
احا
|
||||||
احترم نفسك
|
احترم نفسك
|
||||||
احتلام
|
احتلام
|
||||||
احلي كث
|
احلي كث
|
||||||
|
|
@ -1275,11 +1275,10 @@ zwimel
|
||||||
نكت امه
|
نكت امه
|
||||||
نياكة
|
نياكة
|
||||||
نياكه
|
نياكه
|
||||||
*نيك*
|
نيك
|
||||||
واطي
|
واطي
|
||||||
وسخ
|
وسخ
|
||||||
ولد القحبة
|
ولد القحبة
|
||||||
ولد القحبه
|
ولد القحبه
|
||||||
يا هبيلة
|
يا هبيلة
|
||||||
يلعن
|
يلعن
|
||||||
*كس*
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
[project]
|
[project]
|
||||||
name = "wiqaya"
|
name = "wiqaya"
|
||||||
version = "0.2.5"
|
version = "0.2.0"
|
||||||
description = "A Python library for multilingual profanity detection and filtering. It identifies and censors offensive or abusive words across multiple languages."
|
description = "A Python library for multilingual profanity detection and filtering. It identifies and censors offensive or abusive words across multiple languages."
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
license = {text = "MIT"}
|
license = {text = "MIT"}
|
||||||
|
|
@ -18,6 +18,3 @@ build-backend = "uv_build"
|
||||||
dev = [
|
dev = [
|
||||||
"pytest>=9.0.2",
|
"pytest>=9.0.2",
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.setuptools.package-data]
|
|
||||||
wiqaya = ["data/*.txt"]
|
|
||||||
|
|
|
||||||
|
|
@ -1,123 +1,38 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from .utils import remove_tashkeel
|
from .utils import remove_tashkeel
|
||||||
import re
|
|
||||||
|
|
||||||
DATA_DIR = Path(__file__).parent / "data"
|
|
||||||
|
DATA_DIR = Path(__file__).parent.parent.parent / "data"
|
||||||
|
|
||||||
class Wiqaya:
|
class Wiqaya:
|
||||||
def __init__(self, lang: str):
|
def __init__(self, lang: str):
|
||||||
"""
|
|
||||||
Initialize the Wiqaya profanity filter for a given language.
|
|
||||||
|
|
||||||
Loads the word list from a language-specific .txt file in the data directory.
|
|
||||||
Entries containing '*' are treated as wildcard patterns and compiled into
|
|
||||||
regex objects. Plain entries are stored in a set for O(1) lookup.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
lang (str): Language code (e.g., 'ar', 'en'). Must match a filename in data/.
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
ValueError: If no word list file exists for the given language.
|
|
||||||
"""
|
|
||||||
self.lang = lang
|
self.lang = lang
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with open(f"{DATA_DIR}/{self.lang}.txt", "r", encoding="utf-8") as f:
|
with open(f"{DATA_DIR}/{self.lang}.txt", "r", encoding="utf-8") as f:
|
||||||
lines = [line.strip() for line in f if line.strip()]
|
self.WORDS = set(line.strip() for line in f)
|
||||||
|
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
raise ValueError(f"Language '{self.lang}' not supported")
|
raise ValueError(f"Language '{self.lang}' not supported")
|
||||||
|
|
||||||
self.WORDS = set()
|
def is_profane(self, text) -> bool:
|
||||||
self._patterns = []
|
words = self._process(text)
|
||||||
for entry in lines:
|
return any(word in self.WORDS for word in words)
|
||||||
if "*" in entry:
|
|
||||||
# Convert wildcard to regex: *word* → .*word.*, word* → word.*
|
|
||||||
regex = re.escape(entry).replace(r"\*", ".*")
|
|
||||||
self._patterns.append(re.compile(f"^{regex}$"))
|
|
||||||
else:
|
|
||||||
self.WORDS.add(entry)
|
|
||||||
|
|
||||||
def _matches_any_pattern(self, word: str) -> bool:
|
def get_profane_words(self, text) -> list[str]:
|
||||||
"""
|
words = self._process(text)
|
||||||
Check whether a word matches any of the compiled wildcard regex patterns.
|
return [word for word in words if word in self.WORDS]
|
||||||
|
|
||||||
Args:
|
|
||||||
word (str): The word to test.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
bool: True if the word matches at least one pattern, False otherwise.
|
|
||||||
"""
|
|
||||||
return any(p.match(word) for p in self._patterns)
|
|
||||||
|
|
||||||
def _is_bad(self, word: str) -> bool:
|
|
||||||
"""
|
|
||||||
Determine if a single word is considered profane.
|
|
||||||
|
|
||||||
Checks both the exact-match word set and the wildcard pattern list.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
word (str): The word to check.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
bool: True if the word is profane, False otherwise.
|
|
||||||
"""
|
|
||||||
return word in self.WORDS or self._matches_any_pattern(word)
|
|
||||||
|
|
||||||
def is_profane(self, text: str) -> bool:
|
|
||||||
"""
|
|
||||||
Return True if the text contains at least one profane word.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text (str): The input text to scan.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
bool: True if any profane word is found, False otherwise.
|
|
||||||
"""
|
|
||||||
return any(self._is_bad(w) for w in self._process(text))
|
|
||||||
|
|
||||||
def get_profane_words(self, text: str) -> list[str]:
|
|
||||||
"""
|
|
||||||
Extract and return all profane words found in the text.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text (str): The input text to scan.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
list[str]: A list of every word in the text that is considered profane.
|
|
||||||
"""
|
|
||||||
return [w for w in self._process(text) if self._is_bad(w)]
|
|
||||||
|
|
||||||
def censor(self, text: str, char: str = "*") -> str:
|
def censor(self, text: str, char: str = "*") -> str:
|
||||||
"""
|
words = self._process(text)
|
||||||
Replace each profane word in the text with a repeated censor character.
|
for word in words:
|
||||||
|
if word in self.WORDS:
|
||||||
The replacement preserves the original word's length (e.g., 'hell' → '****').
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text (str): The input text to censor.
|
|
||||||
char (str): The character used for censoring. Defaults to '*'.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
str: The censored version of the input text.
|
|
||||||
"""
|
|
||||||
for word in self._process(text):
|
|
||||||
if self._is_bad(word):
|
|
||||||
text = text.replace(word, char * len(word))
|
text = text.replace(word, char * len(word))
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def _process(self, text: str) -> list[str]:
|
def _process(self, text: str) -> list[str]:
|
||||||
"""
|
|
||||||
Normalize and tokenize the input text into a list of words.
|
|
||||||
|
|
||||||
For Arabic text, diacritics (tashkeel) are stripped first to prevent
|
|
||||||
users from bypassing the filter by adding vowel marks to profane words.
|
|
||||||
The text is then lowercased and split on whitespace.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
text (str): The raw input text.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
list[str]: A list of normalized, lowercase tokens.
|
|
||||||
"""
|
|
||||||
if self.lang == "ar":
|
if self.lang == "ar":
|
||||||
text = remove_tashkeel(text)
|
text = remove_tashkeel(text)
|
||||||
return text.lower().split()
|
return text.lower().split()
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -56,20 +56,3 @@ def test_invalid_lang():
|
||||||
import pytest
|
import pytest
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
Wiqaya(lang="xx")
|
Wiqaya(lang="xx")
|
||||||
|
|
||||||
|
|
||||||
def test_wildcard_support():
|
|
||||||
w = Wiqaya(lang="en")
|
|
||||||
|
|
||||||
# is_profane
|
|
||||||
assert w.is_profane("wwsfuck") == True
|
|
||||||
assert w.is_profane("fuckwedf") == True
|
|
||||||
assert w.is_profane("wd+wfucked+") == True
|
|
||||||
|
|
||||||
|
|
||||||
# get_profane_words
|
|
||||||
assert w.get_profane_words("hello fsdfuckwwq clean") == ["fsdfuckwwq"]
|
|
||||||
|
|
||||||
# censor
|
|
||||||
assert w.censor("hello dsfuckw there") == "hello ******* there"
|
|
||||||
assert w.censor("dsfuckw ffdamn", char="#") == "####### ######"
|
|
||||||
2
uv.lock
generated
2
uv.lock
generated
|
|
@ -65,7 +65,7 @@ wheels = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "wiqaya"
|
name = "wiqaya"
|
||||||
version = "0.2.5"
|
version = "0.2.0"
|
||||||
source = { editable = "." }
|
source = { editable = "." }
|
||||||
|
|
||||||
[package.dev-dependencies]
|
[package.dev-dependencies]
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue