preparing to read Arabic morpholex from Wiktionary

2026-05-27 08:58:55 -06:00 · 2023-09-12 12:08:31 +02:00
parent 3640421022
commit 6312624a5f
1 changed files with 97 additions and 0 deletions
--- a/src/arabic/wiktionary/read_wiktionary.py
+++ b/src/arabic/wiktionary/read_wiktionary.py
@@ -0,0 +1,97 @@
+import gzip
+import json
+
+# Default input file: a gzip-compressed wiktextract dump, presumably holding
+# one JSON object per line (that is how get_gzip_json reads its input).
+# The name is relative, so it is resolved against the working directory.
+WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'
+
+
+def get_gzip_json(file, sample=100000, langs=()):
+    """Print every ``sample``-th record of a gzipped JSON-lines file.
+
+    The file is streamed line by line.  Only lines whose 1-based number is
+    a multiple of ``sample`` are parsed; of those, the ones whose ``lang``
+    field is in ``langs`` are written to stdout, one record per line.
+
+    Args:
+        file: Path or binary file object accepted by ``gzip.open``.
+        sample: Sampling step; ``1`` inspects every line.
+        langs: Collection of accepted ``lang`` values.  The default is
+            empty, so nothing is printed unless languages are given.
+
+    The total number of lines read is reported on stderr, so it does not
+    end up inside record output that was redirected to a file.
+    """
+    import sys
+
+    n = 0
+    with gzip.open(file) as decompressed:
+        for n, line in enumerate(decompressed, start=1):
+            if n % sample != 0:
+                continue
+            obj = json.loads(line)
+            if obj.get('lang') in langs:
+                # Lines keep their trailing newline; strip it so print()
+                # does not emit a blank line after every record.
+                print(line.decode("utf-8").rstrip("\r\n"))
+    print(n, file=sys.stderr)
+
+
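+# To extract all Arabic entries, uncomment the call and run the command below:
+# get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic'])
+# python3 read_wiktionary.py >wikt_arabic.jsonl
+# Unicode range covered by the Buckwalter table below: U+0621-U+0671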
+
+# Buckwalter transliteration: maps an Arabic code point to its ASCII symbol.
+# https://en.wikipedia.org/wiki/Buckwalter_transliteration
+buckwalter_dict = {
+    0x621: "'",  # ء
+    0x622: '|',  # آ
+    0x623: '>',  # أ
+    0x624: '&',  # ؤ
+    0x625: '<',  # إ
+    0x626: '}',  # ئ
+    0x627: 'A',  # ا
+    0x628: 'b',  # ب
+    0x629: 'p',  # ة
+    0x62a: 't',  # ت
+    0x62b: 'v',  # ث
+    0x62c: 'j',  # ج
+    0x62d: 'H',  # ح
+    0x62e: 'x',  # خ
+    0x62f: 'd',  # د
+    0x630: '*',  # ذ
+    0x631: 'r',  # ر
+    0x632: 'z',  # ز
+    0x633: 's',  # س
+    0x634: '$',  # ش
+    0x635: 'S',  # ص
+    0x636: 'D',  # ض
+    0x637: 'T',  # ط
+    0x638: 'Z',  # ظ
+    0x639: 'E',  # ع
+    0x63a: 'g',  # غ
+    0x640: '_',  # ـ (tatweel)
+    0x641: 'f',  # ف
+    0x642: 'q',  # ق
+    0x643: 'k',  # ك
+    0x644: 'l',  # ل
+    0x645: 'm',  # م
+    0x646: 'n',  # ن
+    0x647: 'h',  # ه
+    0x648: 'w',  # و
+    0x649: 'Y',  # ى
+    0x64a: 'y',  # ي
+    0x64b: 'F',  # ً
+    0x64c: 'N',  # ٌ
+    0x64d: 'K',  # ٍ
+    0x64e: 'a',  # َ
+    0x64f: 'u',  # ُ
+    0x650: 'i',  # ِ
+    0x651: '~',  # ّ
+    0x652: 'o',  # ْ
+    0x670: '`',  # ٰ (superscript alef)
+    0x671: '{',  # ٱ
+}
+
+def to_buckwalter(s):
+    """Transliterate ``s`` character by character using ``buckwalter_dict``.
+
+    Any character without an entry in the table comes out as ``'?'``.
+    """
+    transliterated = [buckwalter_dict.get(ord(ch), '?') for ch in s]
+    return ''.join(transliterated)
+
+
+def is_arabic(s):
+    """Return True if ``s`` contains at least one Arabic-script character.
+
+    A character counts as Arabic when its code point lies in U+0621-U+0652
+    (hamza through sukun) or is U+0670 / U+0671.  The bounds start at
+    U+0621 rather than U+0626 so that forms made only of hamza-carrying
+    letters, such as "أ", are recognised.  Arabic-Indic digits and
+    punctuation are deliberately not counted.
+
+    Empty strings and None yield False.
+    """
+    if not s:
+        return False
+    return any(
+        0x0621 <= ord(c) <= 0x0652 or ord(c) in (0x0670, 0x0671) for c in s
+    )
+
+# Disabled draft (kept inside a bare string literal, so it never runs):
+# reads wikt_arabic.jsonl and, for each entry in the 'Arabic lemmas'
+# category, collects its part of speech, its non-romanized Arabic forms with
+# their tags, and its senses, then prints the part of speech and form count.
+"""
+with open('wikt_arabic.jsonl') as file:
+    for line in file:
+        obj = json.loads(line)
+        if 'Arabic lemmas' in obj.get('categories', []):
+            entry = {
+                'pos': obj['pos'],
+                'forms': {form['form']: form.get('tags', []) for
+                          form in obj.get('forms', []) if
+                          'romanization' not in form.get('tags', []) and
+                          is_arabic(form['form'])
+                          },
+                'senses': obj.get('senses', [])
+                }
+            entry['n_forms'] = len(entry['forms'])
+            print(entry['pos'], entry['n_forms'])
+#            print(json.dumps(entry, ensure_ascii=False))
+"""