Compare commits

...

18 commits

Author  SHA1  Message  Date
tayf  950e72e09c  ci(workflow): remove obsolete publish workflow  2026-03-19 17:09:12 +02:00
tayf  b73c2c6134  ci(workflow): add PyPI publish step to release workflow and remove separate publish workflow  2026-03-19 17:08:41 +02:00
tayf  6aebd9ff2b  ci(workflow): set release draft to false  2026-03-19 16:48:17 +02:00
tayf  c50f4b4324  ci(workflow): use release event for publishing  2026-03-19 16:42:05 +02:00
tayf  740fabbbe9  update workflow  2026-03-19 16:37:46 +02:00
tayf  dea61b52ea  update workflow  2026-03-19 16:24:08 +02:00
tayf  928b233817  chore: bump version to 0.2.5  2026-03-10 14:24:36 +02:00
tayf  c40cebcab6  update  2026-03-10 13:56:23 +02:00
tayf  fbd872a40a  update db  2026-03-10 13:52:44 +02:00
tayf  1d655d32ef  to v0.2.4  2026-03-08 20:12:23 +02:00
tayf  caa672219c  fix: add uv build step to publish workflow  2026-03-08 20:09:34 +02:00
tayf  c3bfc05755  update workflow  2026-03-08 20:06:20 +02:00
tayf  707abd92bc  bump to 0.2.3  2026-03-08 19:57:17 +02:00
tayf  8ced36795a  feat: add wildcard pattern support for profanity words  2026-03-08 19:55:55 +02:00
tayf  8caaaa6675  bump version to 0.2.2  2026-03-08 17:25:10 +02:00
tayf  7a8e0f2d84  update db  2026-03-08 17:21:31 +02:00
tayf  e594116166  update  2026-03-08 17:17:12 +02:00
tayf  9d9df30287  update  2026-03-08 17:07:12 +02:00
83 changed files with 171 additions and 51 deletions

.github/workflows/release.yml (new file, 34 lines)

@@ -0,0 +1,34 @@
+name: Release
+on:
+  workflow_dispatch:
+jobs:
+  release:
+    runs-on: ubuntu-latest
+    environment: pypi
+    permissions:
+      id-token: write
+      contents: write
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/setup-uv@v5
+      - name: Get version from pyproject.toml
+        id: get_version
+        run: |
+          VERSION=$(python -c "import tomllib; data=tomllib.load(open('pyproject.toml','rb')); print(data['project']['version'])")
+          echo "version=$VERSION" >> $GITHUB_OUTPUT
+      - name: Build
+        run: uv build
+      - name: Create GitHub Release
+        uses: softprops/action-gh-release@v2
+        with:
+          tag_name: v${{ steps.get_version.outputs.version }}
+          name: v${{ steps.get_version.outputs.version }}
+          generate_release_notes: true
+      - name: Publish to PyPI
+        uses: pypa/gh-action-pypi-publish@release/v1
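
The "Get version from pyproject.toml" step above shells out to Python's standard-library `tomllib` (available in Python 3.11+). A minimal local equivalent of that one-liner, shown purely for illustration and not part of this change set, can be used to check which tag the workflow would create:

```python
import tomllib

# Read the version the release workflow would tag (e.g. "0.2.5" becomes tag "v0.2.5").
with open("pyproject.toml", "rb") as f:
    data = tomllib.load(f)

version = data["project"]["version"]
print(f"v{version}")
```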


@@ -20,8 +20,10 @@ w.get_profane_words("نص فيه حرامي و أطرش") # ['حرامي', 'أ
 ```
 > [!NOTE]
-> المكتبة تدعم إزالة التشكيل من الكلمات تلقائياً
+> تدعم المكتبة إزالة التشكيل تلقائياً عند استخدام اللغة العربية
+> [!TIP]
+> يدعم المشروع النمط البديل (Wildcard) في قوائم الكلمات — استخدم `*` للتطابق مع أي تسلسل من الأحرف (مثال: `bad*` تطابق `badly`، و`*word*` تطابق أي كلمة تحتوي على `word`)
 ## اللغات المدعومة


@@ -22,6 +22,11 @@ w.is_profane("Hello World") # False
 w.get_profane_words("this is damn annoying") # ['damn']
 ```
 > [!NOTE]
 > The library automatically removes Arabic diacritics (Tashkeel) when using Arabic language mode
+> [!TIP]
+> Wildcard patterns are supported in word lists — use `*` to match any sequence of characters (e.g., `bad*` matches `badly`, `*word*` matches anything containing `word`)
 ## Supported Languages
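
The wildcard behaviour described in that TIP looks roughly like this through the public API. A brief sketch, assuming the import path `from wiqaya import Wiqaya` and a hypothetical word-list entry `bad*`; neither is shown in this diff:

```python
from wiqaya import Wiqaya  # assumed import path

w = Wiqaya(lang="en")

# With a hypothetical list entry "bad*", any token starting with "bad" is flagged:
w.is_profane("badly")               # True
w.get_profane_words("a badly day")  # ['badly']
w.censor("a badly day")             # 'a ***** day'
```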


@@ -1,27 +0,0 @@
-name: Publish to PyPI
-on:
-  release:
-    types: [published]
-jobs:
-  publish:
-    runs-on: ubuntu-latest
-    environment: pypi
-    permissions:
-      id-token: write
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.12"
-      - name: Build
-        run: |
-          pip install build
-          python -m build
-      - name: Publish
-        uses: pypa/gh-action-pypi-publish@release/v1


@@ -1,6 +1,6 @@
 [project]
 name = "wiqaya"
-version = "0.2.0"
+version = "0.2.5"
 description = "A Python library for multilingual profanity detection and filtering. It identifies and censors offensive or abusive words across multiple languages."
 readme = "README.md"
 license = {text = "MIT"}
@@ -18,3 +18,6 @@ build-backend = "uv_build"
 dev = [
     "pytest>=9.0.2",
 ]
+
+[tool.setuptools.package-data]
+wiqaya = ["data/*.txt"]


@@ -966,7 +966,7 @@ zwimel
 ابو فص
 ابو قرعة
 اتن
-احا
+*احا*
 احترم نفسك
 احتلام
 احلي كث
@@ -1275,10 +1275,11 @@ zwimel
 نكت امه
 نياكة
 نياكه
-نيك
+*نيك*
 واطي
 وسخ
 ولد القحبة
 ولد القحبه
 يا هبيلة
 يلعن
+*كس*


@@ -1,38 +1,123 @@
-from pathlib import Path
+from pathlib import Path
 from .utils import remove_tashkeel
+import re
-DATA_DIR = Path(__file__).parent.parent.parent / "data"
+DATA_DIR = Path(__file__).parent / "data"
 class Wiqaya:
     def __init__(self, lang: str):
-        self.lang = lang
+        """
+        Initialize the Wiqaya profanity filter for a given language.
+        Loads the word list from a language-specific .txt file in the data directory.
+        Entries containing '*' are treated as wildcard patterns and compiled into
+        regex objects. Plain entries are stored in a set for O(1) lookup.
+        Args:
+            lang (str): Language code (e.g., 'ar', 'en'). Must match a filename in data/.
+        Raises:
+            ValueError: If no word list file exists for the given language.
+        """
+        self.lang = lang
         try:
             with open(f"{DATA_DIR}/{self.lang}.txt", "r", encoding="utf-8") as f:
-                self.WORDS = set(line.strip() for line in f)
+                lines = [line.strip() for line in f if line.strip()]
         except FileNotFoundError:
             raise ValueError(f"Language '{self.lang}' not supported")
-    def is_profane(self, text) -> bool:
-        words = self._process(text)
-        return any(word in self.WORDS for word in words)
+        self.WORDS = set()
+        self._patterns = []
+        for entry in lines:
+            if "*" in entry:
+                # Convert wildcard to regex: *word* → .*word.*, word* → word.*
+                regex = re.escape(entry).replace(r"\*", ".*")
+                self._patterns.append(re.compile(f"^{regex}$"))
+            else:
+                self.WORDS.add(entry)
-    def get_profane_words(self, text) -> list[str]:
-        words = self._process(text)
-        return [word for word in words if word in self.WORDS]
+    def _matches_any_pattern(self, word: str) -> bool:
+        """
+        Check whether a word matches any of the compiled wildcard regex patterns.
+        Args:
+            word (str): The word to test.
+        Returns:
+            bool: True if the word matches at least one pattern, False otherwise.
+        """
+        return any(p.match(word) for p in self._patterns)
+    def _is_bad(self, word: str) -> bool:
+        """
+        Determine if a single word is considered profane.
+        Checks both the exact-match word set and the wildcard pattern list.
+        Args:
+            word (str): The word to check.
+        Returns:
+            bool: True if the word is profane, False otherwise.
+        """
+        return word in self.WORDS or self._matches_any_pattern(word)
+    def is_profane(self, text: str) -> bool:
+        """
+        Return True if the text contains at least one profane word.
+        Args:
+            text (str): The input text to scan.
+        Returns:
+            bool: True if any profane word is found, False otherwise.
+        """
+        return any(self._is_bad(w) for w in self._process(text))
+    def get_profane_words(self, text: str) -> list[str]:
+        """
+        Extract and return all profane words found in the text.
+        Args:
+            text (str): The input text to scan.
+        Returns:
+            list[str]: A list of every word in the text that is considered profane.
+        """
+        return [w for w in self._process(text) if self._is_bad(w)]
     def censor(self, text: str, char: str = "*") -> str:
-        words = self._process(text)
-        for word in words:
-            if word in self.WORDS:
+        """
+        Replace each profane word in the text with a repeated censor character.
+        The replacement preserves the original word's length (e.g., 'hell' → '****').
+        Args:
+            text (str): The input text to censor.
+            char (str): The character used for censoring. Defaults to '*'.
+        Returns:
+            str: The censored version of the input text.
+        """
+        for word in self._process(text):
+            if self._is_bad(word):
                 text = text.replace(word, char * len(word))
         return text
     def _process(self, text: str) -> list[str]:
+        """
+        Normalize and tokenize the input text into a list of words.
+        For Arabic text, diacritics (tashkeel) are stripped first to prevent
+        users from bypassing the filter by adding vowel marks to profane words.
+        The text is then lowercased and split on whitespace.
+        Args:
+            text (str): The raw input text.
+        Returns:
+            list[str]: A list of normalized, lowercase tokens.
+        """
         if self.lang == "ar":
             text = remove_tashkeel(text)
-        return text.lower().split()
+        return text.lower().split()
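
The wildcard handling added in `__init__` and `_matches_any_pattern` boils down to escaping the list entry and widening each `*` into `.*`. A standalone sketch of that conversion, with names that are illustrative rather than part of the library:

```python
import re

def compile_wildcard(entry: str) -> re.Pattern:
    # Escape regex metacharacters, then turn each literal "\*" back into ".*",
    # mirroring what the new __init__ does for entries containing "*".
    return re.compile("^" + re.escape(entry).replace(r"\*", ".*") + "$")

pattern = compile_wildcard("bad*")
print(bool(pattern.match("badly")))   # True  - "bad*" matches any word starting with "bad"
print(bool(pattern.match("rebad")))   # False - the pattern is anchored at the start
```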


@@ -55,4 +55,21 @@ def test_get_profane_words_en():
 def test_invalid_lang():
     import pytest
     with pytest.raises(ValueError):
-        Wiqaya(lang="xx")
+        Wiqaya(lang="xx")
+def test_wildcard_support():
+    w = Wiqaya(lang="en")
+    # is_profane
+    assert w.is_profane("wwsfuck") == True
+    assert w.is_profane("fuckwedf") == True
+    assert w.is_profane("wd+wfucked+") == True
+    # get_profane_words
+    assert w.get_profane_words("hello fsdfuckwwq clean") == ["fsdfuckwwq"]
+    # censor
+    assert w.censor("hello dsfuckw there") == "hello ******* there"
+    assert w.censor("dsfuckw ffdamn", char="#") == "####### ######"
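
The length-preserving behaviour that the `censor` docstring describes can be spot-checked in the same style. A small illustrative addition, not part of this change set, assuming `damn` is a plain entry in the English word list (as the README example suggests) and that the class is importable as `from wiqaya import Wiqaya`:

```python
from wiqaya import Wiqaya  # assumed import path

def test_censor_preserves_length():
    w = Wiqaya(lang="en")
    original = "this is damn annoying"
    censored = w.censor(original)
    # "damn" (4 letters) becomes exactly four censor characters,
    # so the overall string length is unchanged.
    assert censored == "this is **** annoying"
    assert len(censored) == len(original)
```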

uv.lock (generated, 2 lines changed)

@@ -65,7 +65,7 @@ wheels = [
 [[package]]
 name = "wiqaya"
-version = "0.2.0"
+version = "0.2.5"
 source = { editable = "." }
 [package.dev-dependencies]