bayan-api / diff.txt
youssefreda9's picture
HF Deploy: Fix syntax error with smart quotes in popup.js
fe1e225
Raw
History Blame Contribute Delete
14.4 kB
commit cf83a1acd06e1a347cae033a7cb9fdeff8dfcd01
Author: YoussefReda9 <youssefreda9004@gmail.com>
Date: Tue Jun 30 04:36:40 2026 +0300
Fix: 10 critical NLP logic bugs in grammar, spelling, and punctuation to prevent false positives
diff --git a/src/nlp/spelling/araspell_rules.py b/src/nlp/spelling/araspell_rules.py
index 634f134..0102cae 100644
--- a/src/nlp/spelling/araspell_rules.py
+++ b/src/nlp/spelling/araspell_rules.py
@@ -129,14 +129,9 @@ class AraSpellPostProcessor:
@staticmethod
def remove_duplicate_words(text: str) -> str:
"""Remove consecutive duplicate words. e.g. كتاب كتاب → كتاب"""
- words = text.split()
- if len(words) < 2:
- return text
- result = [words[0]]
- for i in range(1, len(words)):
- if words[i] != words[i-1]:
- result.append(words[i])
- return ' '.join(result)
+ # Bug 2.11: Destroys rhetorical repetition (التوكيد اللفظي) like "صفا صفا".
+ # Disabled as it destroys valid Arabic phrases.
+ return text
@staticmethod
def normalize_spaces(text: str) -> str:
@@ -337,11 +332,11 @@ class AraSpellPostProcessor:
if any(word.endswith(e) for e in PROTECTED_ENDINGS):
result.append(word)
continue
- if word in PROTECTED_HA_WORDS:
+ if word in PROTECTED_HA_WORDS or word in ['هذه', 'هاته']:
result.append(word)
continue
if len(word) >= 3 and word.endswith('ه'):
- if word[-2] in AraSpellPostProcessor.ARABIC_CONSONANTS:
+ if word[-2] in AraSpellPostProcessor.ARABIC_CONSONANTS or word[-2] in 'اويءؤئ':
candidate_with_ta = word[:-1] + 'ة'
# Default: prefer ة (correct Arabic orthography for feminine nouns)
if vocab_manager:
@@ -389,11 +384,8 @@ class AraSpellPostProcessor:
if i + 1 < len(words):
next_word = words[i + 1]
# Bug 2.11: Destroys Badal structures (الأستاذ أستاذ -> الأستاذ)
- if word == next_word: # Only remove exact duplicates, not normalized duplicates
- keep = next_word if next_word.startswith('ال') and not word.startswith('ال') else word
- result.append(keep)
- i += 2
- continue
+ # and Rhetorical Repetition (التوكيد اللفظي)
+ # Removed the aggressive duplicate word deletion.
result.append(word)
i += 1
return ' '.join(result)
@@ -1177,7 +1169,15 @@ class ArabicSpellChecker:
logger.info("[MLM/CONTEXTUAL] Disabled by configuration (use_contextual=False)")
def _fix_repeated_end_chars(self, text: str) -> str:
- text = re.sub(r'([ا-ي])\1+\b', r'\1', text)
+ # Keep a trailing doubled 'ي' (e.g., يحيي) when the whole word is in the vocabulary; otherwise collapse the repeated final letter to one
+ def _replace_repeated(m):
+ w = m.group(0)
+ char = m.group(2)
+ if w.endswith('يي'):
+ if self.vocab_manager and self.vocab_manager.is_iv(w):
+ return w
+ return m.group(1) + char
+ text = re.sub(r'\b([^\s]+?)([\u0621-\u064A])\2+\b', _replace_repeated, text)
return text
def _fix_merged_with_errors(self, text: str) -> str:
commit ee5e50414a8f53b71d5fa3d4f864c812ee835567
Author: YoussefReda9 <youssefreda9004@gmail.com>
Date: Tue Jun 30 04:11:41 2026 +0300
Fix 30 NLP edge cases in Grammar, Spelling, and Punctuation (Phase 10 results and Extension UI improvements)
diff --git a/src/nlp/spelling/araspell_rules.py b/src/nlp/spelling/araspell_rules.py
index 39d02b7..634f134 100644
--- a/src/nlp/spelling/araspell_rules.py
+++ b/src/nlp/spelling/araspell_rules.py
@@ -154,17 +154,9 @@ class AraSpellPostProcessor:
@staticmethod
def remove_word_repetition_with_wa(text: str) -> str:
"""Remove word و word → word"""
- words = text.split()
- result = []
- i = 0
- while i < len(words):
- if i + 2 < len(words) and words[i] == words[i+2] and words[i+1] == 'و':
- result.append(words[i])
- i += 3
- else:
- result.append(words[i])
- i += 1
- return ' '.join(result)
+ # Bug 2.9: This deletes valid rhetorical repetition (التوكيد اللفظي) of the form "word و word", e.g. "صنفا و صنفا"
+ # Disabled as it is highly destructive to valid Arabic.
+ return text
# --- Hamza & Ta Marbuta Handling ---
@@ -181,7 +173,7 @@ class AraSpellPostProcessor:
'اذا': 'إذا', 'اذ': 'إذ',
'اي': 'أي', 'اين': 'أين',
'او': 'أو',
- 'اما': 'أما',
+
'ان': 'أن', 'انه': 'أنه', 'انها': 'أنها', 'انهم': 'أنهم',
'اخر': 'آخر', 'اخرى': 'أخرى',
'الان': 'الآن',
@@ -201,9 +193,9 @@ class AraSpellPostProcessor:
'اهل': 'أهل',
'اطفال': 'أطفال',
'اصدقاء': 'أصدقاء', 'اصدقائي': 'أصدقائي',
- 'اعتقد': 'أعتقد', 'اريد': 'أريد', 'احب': 'أحب',
- 'اعرف': 'أعرف', 'اعلم': 'أعلم',
- 'اخذ': 'أخذ', 'اكل': 'أكل',
+ 'اريد': 'أريد', 'احب': 'أحب',
+ 'اعلم': 'أعلم',
+ 'اكل': 'أكل',
'الايام': 'الأيام',
'الاطفال': 'الأطفال',
'الاسعار': 'الأسعار',
@@ -243,10 +235,8 @@ class AraSpellPostProcessor:
'ادارة': 'إدارة', 'ادارية': 'إدارية',
'اعلام': 'إعلام', 'اعلامي': 'إعلامي',
'احتمال': 'احتمال', 'احتفال': 'احتفال',
- 'ازور': 'أزور', 'اذهب': 'أذهب', 'اكتب': 'أكتب',
'اقرا': 'أقرأ', 'اقرأ': 'أقرأ',
- 'اعمل': 'أعمل', 'ادرس': 'أدرس',
- 'اشتري': 'أشتري', 'اسافر': 'أسافر',
+ 'اسافر': 'أسافر',
'احبه': 'أحبه',
'مسؤول': 'مسؤول', 'مسؤولية': 'مسؤولية',
'رؤية': 'رؤية', 'رؤيا': 'رؤيا',
@@ -259,7 +249,7 @@ class AraSpellPostProcessor:
'مصطفي': 'مصطفى', 'موسي': 'موسى', 'عيسي': 'عيسى',
'هدي': 'هدى', 'بني': 'بنى',
'معني': 'معنى', 'مبني': 'مبنى',
- 'علي': 'على', # Common alif maqsura confusion
+
'الي': 'إلى',
# FIX-47: Verb+pronoun hamza entries (احبه→أحبه)
'احبه': 'أحبه', 'احبها': 'أحبها', 'احبك': 'أحبك',
@@ -280,16 +270,9 @@ class AraSpellPostProcessor:
@staticmethod
def fix_hamza_conservative(text: str) -> str:
"""Conservative Hamza normalization — only at word END, not middle."""
- words = text.split()
- result = []
- for word in words:
- if len(word) >= 3:
- if word.endswith('أ'):
- word = word[:-1] + 'ا'
- if word.endswith('إ'):
- word = word[:-1] + 'ا'
- result.append(word)
- return ' '.join(result)
+ # Bug 2.5: Blindly changing أ at the end of word to ا corrupts valid orthography (قرأ -> قرا)
+ # Disabled as it is highly destructive.
+ return text
# Attached prefixes that can precede hamza-whitelist words
# Ordered longest-first so وال is tried before و
@@ -364,8 +347,12 @@ class AraSpellPostProcessor:
if vocab_manager:
ta_iv = vocab_manager.is_iv(candidate_with_ta)
ha_iv = vocab_manager.is_iv(word)
- if ta_iv:
- # Always prefer ة when it's a valid word
+ if ha_iv and ta_iv:
+ # Bug 2.2: Do not prefer ة if ه is also valid (possessive pronoun)
+ result.append(word)
+ continue
+ elif ta_iv:
+ # Prefer ة when ONLY the ة form is valid
result.append(candidate_with_ta)
continue
elif ha_iv:
@@ -401,7 +388,8 @@ class AraSpellPostProcessor:
word = word[:-1]
if i + 1 < len(words):
next_word = words[i + 1]
- if normalize_word(word) == normalize_word(next_word):
+ # Bug 2.11: Destroys Badal structures (الأستاذ أستاذ -> الأستاذ)
+ if word == next_word: # Only remove exact duplicates, not normalized duplicates
keep = next_word if next_word.startswith('ال') and not word.startswith('ال') else word
result.append(keep)
i += 2
@@ -454,18 +442,8 @@ class AraSpellPostProcessor:
result.append(word + next_word)
i += 2
continue
- if len(word) >= 2 and len(next_word) >= 2 and word[-1] == next_word[0]:
- if not (word in STANDALONE_WORDS and next_word in STANDALONE_WORDS):
- result.append(word[:-1] + next_word)
- i += 2
- continue
- if (2 <= len(word) <= 4 and
- 1 <= len(next_word) <= 2 and
- 3 <= len(word) + len(next_word) <= 7):
- if not (word in STANDALONE_WORDS and next_word in STANDALONE_WORDS):
- result.append(word + next_word)
- i += 2
- continue
+ # Bug 2.3: Destructive word merging (يوم مشمس -> يومشمس)
+ # Removed generic boundary letter merging.
result.append(word)
i += 1
return ' '.join(result)
@@ -779,15 +757,7 @@ class WordAligner:
if in_iv and not out_iv:
return input_word
if in_iv and out_iv:
- # Fix S1: When only difference is ه→ة at word end, prefer ة
- # (correct Arabic orthography — ة is the standard feminine ending)
- if (input_word.endswith('ه') and output_word.endswith('ة')
- and input_word[:-1] == output_word[:-1]):
- return output_word
- # Fix S1: Also handle ة→ه (don't regress a correct ة to ه)
- if (input_word.endswith('ة') and output_word.endswith('ه')
- and input_word[:-1] == output_word[:-1]):
- return input_word
+ # Bug 2.2: Do not prefer ة over ه if both are IV, because ه is often a valid possessive pronoun.
return input_word
if len(input_word) == len(output_word) and len(input_word) >= 3:
for i in range(len(input_word)):
@@ -1211,51 +1181,23 @@ class ArabicSpellChecker:
return text
def _fix_merged_with_errors(self, text: str) -> str:
- text = re.sub(r'ال([ا-ي])\1+([ا-ي]{2,})', r'ال\2', text)
+ # Bug 2.10: The replacement used to be r'ال\2', which dropped the repeated letter entirely; r'ال\1\2' keeps one copy
+ text = re.sub(r'ال([ا-ي])\1+([ا-ي]{2,})', r'ال\1\2', text)
text = re.sub(r'\b([ا-ي]{3,})([ا-ي])\2+\b', r'\1\2', text)
return text
def _split_merged_words_linguistic(self, text: str) -> str:
- text = re.sub(
- r'\b(في|من|إلى|الى|حتى|منذ|خلال|بعد|قبل)(ال)?([ا-ي]{3,})',
- r'\1 \2\3', text
- )
- text = re.sub(r'\b(كل)([ا-ي]{3,})', r'\1 \2', text)
- text = re.sub(r'([ا-ي]{3,})(ال)([ا-ي]{3,})', r'\1 \2\3', text)
- text = re.sub(r'\b([بلك])(ال)?([ا-ي]{3,})', r'\1 \2\3', text)
- text = re.sub(r'([ا-ي]{4,})(عليكم|عليك|عليه|عليها)', r'\1 \2', text)
- text = re.sub(r'([ا-ي]{3,})(على|عن)([ا-ي]{3,})', r'\1 \2 \3', text)
+ # Bug 2.7: Catastrophic preposition splitting (e.g. منطق -> من طق)
+ # Disabled generic regex splitting as it is highly destructive to valid vocabulary.
return text
def _split_long_words_heuristic(self, text: str, max_length: int = 15) -> str:
- words = text.split()
- result = []
- for word in words:
- if len(word) <= max_length:
- result.append(word)
- continue
- if 'ال' in word[2:]:
- parts = word.split('ال', 1)
- if len(parts[0]) >= 2 and len(parts[1]) >= 3:
- result.extend([parts[0], 'ال' + parts[1]])
- continue
- if len(word) >= 8:
- split_found = False
- for split_pos in [2, 3]:
- prefix = word[:split_pos]
- suffix = word[split_pos:]
- if prefix in ['في', 'من', 'على', 'عن', 'مع', 'كل', 'ب', 'ل', 'ك']:
- result.extend([prefix, suffix])
- split_found = True
- break
- if not split_found:
- result.append(word)
- else:
- result.append(word)
- return ' '.join(result)
+ # Bug 2.8: Overzealous long word splitting (e.g. فيتامينات -> في تامينات)
+ # Disabled as it creates more errors than it fixes.
+ return text
def _normalize_tanween_patterns(self, text: str) -> str:
- text = re.sub(r'([ا-ي]{2,})أ\b', r'\1اً', text)
+ # Bug 2.6: Blind replacement of trailing أ with اً corrupts verbs and nominative cases (قرأ -> قراً)
text = re.sub(r'\s+أ\s+', ' ', text)
text = re.sub(r'\b([بلك])\s+([ا-ي])', r'\1\2', text)
return text
@@ -1661,3 +1603,4 @@ class ArabicSpellChecker:
result = ' '.join(res_words_list)
return result
+