| commit cf83a1acd06e1a347cae033a7cb9fdeff8dfcd01 |
| Author: YoussefReda9 <youssefreda9004@gmail.com> |
| Date: Tue Jun 30 04:36:40 2026 +0300 |
|
|
| Fix: 10 critical NLP logic bugs in grammar, spelling, and punctuation to prevent false positives |
|
|
| diff --git a/src/nlp/spelling/araspell_rules.py b/src/nlp/spelling/araspell_rules.py |
| index 634f134..0102cae 100644 |
| --- a/src/nlp/spelling/araspell_rules.py |
| +++ b/src/nlp/spelling/araspell_rules.py |
| @@ -129,14 +129,9 @@ class AraSpellPostProcessor: |
| @staticmethod |
| def remove_duplicate_words(text: str) -> str: |
| """Remove consecutive duplicate words. e.g. كتاب كتاب → كتاب""" |
| - words = text.split() |
| - if len(words) < 2: |
| - return text |
| - result = [words[0]] |
| - for i in range(1, len(words)): |
| - if words[i] != words[i-1]: |
| - result.append(words[i]) |
| - return ' '.join(result) |
| + # Bug 2.11: Destroys rhetorical repetition (التوكيد اللفظي) like "صفا صفا". |
| + # Disabled as it destroys valid Arabic phrases. |
| + return text |
| |
| @staticmethod |
| def normalize_spaces(text: str) -> str: |
| @@ -337,11 +332,11 @@ class AraSpellPostProcessor: |
| if any(word.endswith(e) for e in PROTECTED_ENDINGS): |
| result.append(word) |
| continue |
| - if word in PROTECTED_HA_WORDS: |
| + if word in PROTECTED_HA_WORDS or word in ['هذه', 'هاته']: |
| result.append(word) |
| continue |
| if len(word) >= 3 and word.endswith('ه'): |
| - if word[-2] in AraSpellPostProcessor.ARABIC_CONSONANTS: |
| + if word[-2] in AraSpellPostProcessor.ARABIC_CONSONANTS or word[-2] in 'اويءؤئ': |
| candidate_with_ta = word[:-1] + 'ة' |
| # Default: prefer ة (correct Arabic orthography for feminine nouns) |
| if vocab_manager: |
| @@ -389,11 +384,8 @@ class AraSpellPostProcessor: |
| if i + 1 < len(words): |
| next_word = words[i + 1] |
| # Bug 2.11: Destroys Badal structures (الأستاذ أستاذ -> الأستاذ) |
| - if word == next_word: # Only remove exact duplicates, not normalized duplicates |
| - keep = next_word if next_word.startswith('ال') and not word.startswith('ال') else word |
| - result.append(keep) |
| - i += 2 |
| - continue |
| + # and Rhetorical Repetition (التوكيد اللفظي) |
| + # Removed the aggressive duplicate word deletion. |
| result.append(word) |
| i += 1 |
| return ' '.join(result) |
| @@ -1177,7 +1169,15 @@ class ArabicSpellChecker: |
| logger.info("[MLM/CONTEXTUAL] Disabled by configuration (use_contextual=False)") |
| |
| def _fix_repeated_end_chars(self, text: str) -> str: |
| - text = re.sub(r'([ا-ي])\1+\b', r'\1', text) |
| + # Collapse a repeated final letter to one, but keep a word ending in 'يي' (e.g., يحيي) unchanged when the vocabulary manager reports it as in-vocabulary |
| + def _replace_repeated(m): |
| + w = m.group(0) |
| + char = m.group(2) |
| + if w.endswith('يي'): |
| + if self.vocab_manager and self.vocab_manager.is_iv(w): |
| + return w |
| + return m.group(1) + char |
| + text = re.sub(r'\b([^\s]+?)([\u0621-\u064A])\2+\b', _replace_repeated, text) |
| return text |
| |
| def _fix_merged_with_errors(self, text: str) -> str: |
|
|
| commit ee5e50414a8f53b71d5fa3d4f864c812ee835567 |
| Author: YoussefReda9 <youssefreda9004@gmail.com> |
| Date: Tue Jun 30 04:11:41 2026 +0300 |
|
|
| Fix 30 NLP edge cases in Grammar, Spelling, and Punctuation (Phase 10 results and Extension UI improvements) |
|
|
| diff --git a/src/nlp/spelling/araspell_rules.py b/src/nlp/spelling/araspell_rules.py |
| index 39d02b7..634f134 100644 |
| --- a/src/nlp/spelling/araspell_rules.py |
| +++ b/src/nlp/spelling/araspell_rules.py |
| @@ -154,17 +154,9 @@ class AraSpellPostProcessor: |
| @staticmethod |
| def remove_word_repetition_with_wa(text: str) -> str: |
| """Remove word و word → word""" |
| - words = text.split() |
| - result = [] |
| - i = 0 |
| - while i < len(words): |
| - if i + 2 < len(words) and words[i] == words[i+2] and words[i+1] == 'و': |
| - result.append(words[i]) |
| - i += 3 |
| - else: |
| - result.append(words[i]) |
| - i += 1 |
| - return ' '.join(result) |
| + # Bug 2.9: This deletes valid rhetorical repetition (التوكيد اللفظي) like "صنفا وصنفا" |
| + # Disabled as it is highly destructive to valid Arabic. |
| + return text |
| |
| # --- Hamza & Ta Marbuta Handling --- |
| |
| @@ -181,7 +173,7 @@ class AraSpellPostProcessor: |
| 'اذا': 'إذا', 'اذ': 'إذ', |
| 'اي': 'أي', 'اين': 'أين', |
| 'او': 'أو', |
| - 'اما': 'أما', |
| + |
| 'ان': 'أن', 'انه': 'أنه', 'انها': 'أنها', 'انهم': 'أنهم', |
| 'اخر': 'آخر', 'اخرى': 'أخرى', |
| 'الان': 'الآن', |
| @@ -201,9 +193,9 @@ class AraSpellPostProcessor: |
| 'اهل': 'أهل', |
| 'اطفال': 'أطفال', |
| 'اصدقاء': 'أصدقاء', 'اصدقائي': 'أصدقائي', |
| - 'اعتقد': 'أعتقد', 'اريد': 'أريد', 'احب': 'أحب', |
| - 'اعرف': 'أعرف', 'اعلم': 'أعلم', |
| - 'اخذ': 'أخذ', 'اكل': 'أكل', |
| + 'اريد': 'أريد', 'احب': 'أحب', |
| + 'اعلم': 'أعلم', |
| + 'اكل': 'أكل', |
| 'الايام': 'الأيام', |
| 'الاطفال': 'الأطفال', |
| 'الاسعار': 'الأسعار', |
| @@ -243,10 +235,8 @@ class AraSpellPostProcessor: |
| 'ادارة': 'إدارة', 'ادارية': 'إدارية', |
| 'اعلام': 'إعلام', 'اعلامي': 'إعلامي', |
| 'احتمال': 'احتمال', 'احتفال': 'احتفال', |
| - 'ازور': 'أزور', 'اذهب': 'أذهب', 'اكتب': 'أكتب', |
| 'اقرا': 'أقرأ', 'اقرأ': 'أقرأ', |
| - 'اعمل': 'أعمل', 'ادرس': 'أدرس', |
| - 'اشتري': 'أشتري', 'اسافر': 'أسافر', |
| + 'اسافر': 'أسافر', |
| 'احبه': 'أحبه', |
| 'مسؤول': 'مسؤول', 'مسؤولية': 'مسؤولية', |
| 'رؤية': 'رؤية', 'رؤيا': 'رؤيا', |
| @@ -259,7 +249,7 @@ class AraSpellPostProcessor: |
| 'مصطفي': 'مصطفى', 'موسي': 'موسى', 'عيسي': 'عيسى', |
| 'هدي': 'هدى', 'بني': 'بنى', |
| 'معني': 'معنى', 'مبني': 'مبنى', |
| - 'علي': 'على', # Common alif maqsura confusion |
| + |
| 'الي': 'إلى', |
| # FIX-47: Verb+pronoun hamza entries (احبه→أحبه) |
| 'احبه': 'أحبه', 'احبها': 'أحبها', 'احبك': 'أحبك', |
| @@ -280,16 +270,9 @@ class AraSpellPostProcessor: |
| @staticmethod |
| def fix_hamza_conservative(text: str) -> str: |
| """Conservative Hamza normalization — only at word END, not middle.""" |
| - words = text.split() |
| - result = [] |
| - for word in words: |
| - if len(word) >= 3: |
| - if word.endswith('أ'): |
| - word = word[:-1] + 'ا' |
| - if word.endswith('إ'): |
| - word = word[:-1] + 'ا' |
| - result.append(word) |
| - return ' '.join(result) |
| + # Bug 2.5: Blindly changing أ at the end of word to ا corrupts valid orthography (قرأ -> قرا) |
| + # Disabled as it is highly destructive. |
| + return text |
| |
| # Attached prefixes that can precede hamza-whitelist words |
| # Ordered longest-first so وال is tried before و |
| @@ -364,8 +347,12 @@ class AraSpellPostProcessor: |
| if vocab_manager: |
| ta_iv = vocab_manager.is_iv(candidate_with_ta) |
| ha_iv = vocab_manager.is_iv(word) |
| - if ta_iv: |
| - # Always prefer ة when it's a valid word |
| + if ha_iv and ta_iv: |
| + # Bug 2.2: Do not prefer ة if ه is also valid (possessive pronoun) |
| + result.append(word) |
| + continue |
| + elif ta_iv: |
| + # Prefer ة when ONLY the ة form is valid |
| result.append(candidate_with_ta) |
| continue |
| elif ha_iv: |
| @@ -401,7 +388,8 @@ class AraSpellPostProcessor: |
| word = word[:-1] |
| if i + 1 < len(words): |
| next_word = words[i + 1] |
| - if normalize_word(word) == normalize_word(next_word): |
| + # Bug 2.11: Destroys Badal structures (الأستاذ أستاذ -> الأستاذ) |
| + if word == next_word: # Only remove exact duplicates, not normalized duplicates |
| keep = next_word if next_word.startswith('ال') and not word.startswith('ال') else word |
| result.append(keep) |
| i += 2 |
| @@ -454,18 +442,8 @@ class AraSpellPostProcessor: |
| result.append(word + next_word) |
| i += 2 |
| continue |
| - if len(word) >= 2 and len(next_word) >= 2 and word[-1] == next_word[0]: |
| - if not (word in STANDALONE_WORDS and next_word in STANDALONE_WORDS): |
| - result.append(word[:-1] + next_word) |
| - i += 2 |
| - continue |
| - if (2 <= len(word) <= 4 and |
| - 1 <= len(next_word) <= 2 and |
| - 3 <= len(word) + len(next_word) <= 7): |
| - if not (word in STANDALONE_WORDS and next_word in STANDALONE_WORDS): |
| - result.append(word + next_word) |
| - i += 2 |
| - continue |
| + # Bug 2.3: Destructive word merging (يوم مشمس -> يومشمس) |
| + # Removed generic boundary letter merging. |
| result.append(word) |
| i += 1 |
| return ' '.join(result) |
| @@ -779,15 +757,7 @@ class WordAligner: |
| if in_iv and not out_iv: |
| return input_word |
| if in_iv and out_iv: |
| - # Fix S1: When only difference is ه→ة at word end, prefer ة |
| - # (correct Arabic orthography — ة is the standard feminine ending) |
| - if (input_word.endswith('ه') and output_word.endswith('ة') |
| - and input_word[:-1] == output_word[:-1]): |
| - return output_word |
| - # Fix S1: Also handle ة→ه (don't regress a correct ة to ه) |
| - if (input_word.endswith('ة') and output_word.endswith('ه') |
| - and input_word[:-1] == output_word[:-1]): |
| - return input_word |
| + # Bug 2.2: Do not prefer ة over ه if both are IV, because ه is often a valid possessive pronoun. |
| return input_word |
| if len(input_word) == len(output_word) and len(input_word) >= 3: |
| for i in range(len(input_word)): |
| @@ -1211,51 +1181,23 @@ class ArabicSpellChecker: |
| return text |
| |
| def _fix_merged_with_errors(self, text: str) -> str: |
| - text = re.sub(r'ال([ا-ي])\1+([ا-ي]{2,})', r'ال\2', text) |
| + # Bug 2.10: The replacement string was r'ال\2', which dropped the repeated letter entirely; r'ال\1\2' keeps one copy of it |
| + text = re.sub(r'ال([ا-ي])\1+([ا-ي]{2,})', r'ال\1\2', text) |
| text = re.sub(r'\b([ا-ي]{3,})([ا-ي])\2+\b', r'\1\2', text) |
| return text |
| |
| def _split_merged_words_linguistic(self, text: str) -> str: |
| - text = re.sub( |
| - r'\b(في|من|إلى|الى|حتى|منذ|خلال|بعد|قبل)(ال)?([ا-ي]{3,})', |
| - r'\1 \2\3', text |
| - ) |
| - text = re.sub(r'\b(كل)([ا-ي]{3,})', r'\1 \2', text) |
| - text = re.sub(r'([ا-ي]{3,})(ال)([ا-ي]{3,})', r'\1 \2\3', text) |
| - text = re.sub(r'\b([بلك])(ال)?([ا-ي]{3,})', r'\1 \2\3', text) |
| - text = re.sub(r'([ا-ي]{4,})(عليكم|عليك|عليه|عليها)', r'\1 \2', text) |
| - text = re.sub(r'([ا-ي]{3,})(على|عن)([ا-ي]{3,})', r'\1 \2 \3', text) |
| + # Bug 2.7: Catastrophic preposition splitting (e.g. منطق -> من طق) |
| + # Disabled generic regex splitting as it is highly destructive to valid vocabulary. |
| return text |
| |
| def _split_long_words_heuristic(self, text: str, max_length: int = 15) -> str: |
| - words = text.split() |
| - result = [] |
| - for word in words: |
| - if len(word) <= max_length: |
| - result.append(word) |
| - continue |
| - if 'ال' in word[2:]: |
| - parts = word.split('ال', 1) |
| - if len(parts[0]) >= 2 and len(parts[1]) >= 3: |
| - result.extend([parts[0], 'ال' + parts[1]]) |
| - continue |
| - if len(word) >= 8: |
| - split_found = False |
| - for split_pos in [2, 3]: |
| - prefix = word[:split_pos] |
| - suffix = word[split_pos:] |
| - if prefix in ['في', 'من', 'على', 'عن', 'مع', 'كل', 'ب', 'ل', 'ك']: |
| - result.extend([prefix, suffix]) |
| - split_found = True |
| - break |
| - if not split_found: |
| - result.append(word) |
| - else: |
| - result.append(word) |
| - return ' '.join(result) |
| + # Bug 2.8: Overzealous long word splitting (e.g. فيتامينات -> في تامينات) |
| + # Disabled as it creates more errors than it fixes. |
| + return text |
| |
| def _normalize_tanween_patterns(self, text: str) -> str: |
| - text = re.sub(r'([ا-ي]{2,})أ\b', r'\1اً', text) |
| + # Bug 2.6: Removed the rule that blindly replaced a trailing أ with اً, because it corrupted verbs and nominative cases (قرأ -> قراً) |
| text = re.sub(r'\s+أ\s+', ' ', text) |
| text = re.sub(r'\b([بلك])\s+([ا-ي])', r'\1\2', text) |
| return text |
| @@ -1661,3 +1603,4 @@ class ArabicSpellChecker: |
| result = ' '.join(res_words_list) |
| |
| return result |
| + |
|
|