gf-rgl/src/arabic/wiktionary/read_wiktionary.py

import gzip
import json

WIKTIONARY_DUMP = 'raw-wiktextract-data.json.gz'
FILTERED_WIKT = 'wikt_arabic.jsonl'


def get_gzip_json(file, sample=100000, langs=[]):
    with gzip.open(file) as decompressed:
        n = 0
        for line in decompressed:
            n += 1
            if n % sample == 0:
                obj = json.loads(line)
                if obj.get('lang', None) in langs:
                    print(line.decode("utf-8"))
        print(n)


# get_gzip_json(WIKTIONARY_DUMP, 1, ['Arabic'])
# python3 read_wiktionary.py >wikt_arabic.jsonl
# 621-671

# https://en.wikipedia.org/wiki/Buckwalter_transliteration
buckwalter_dict = {
  0x621: "'",  # ء
  0x622: '|',  # آ
  0x623: '>',  # أ
  0x624: '&',  # ؤ
  0x625: '<',  # إ
  0x626: '}',  # ئ
  0x627: 'A',  # ا
  0x628: 'b',  # ب
  0x629: 'p',  # ة
  0x62a: 't',  # ت
  0x62b: 'v',  # ث
  0x62c: 'j',  # ج
  0x62d: 'H',  # ح
  0x62e: 'x',  # خ
  0x62f: 'd',  # د
  0x630: '*',  # ذ
  0x631: 'r',  # ر
  0x632: 'z',  # ز
  0x633: 's',  # س
  0x634: '$',  # ش
  0x635: 'S',  # ص
  0x636: 'D',  # ض
  0x637: 'T',  # ط
  0x638: 'Z',  # ظ
  0x639: 'E',  # ع
  0x63a: 'g',  # غ
  0x641: 'f',  # ف
  0x642: 'q',  # ق
  0x643: 'k',  # ك
  0x644: 'l',  # ل
  0x645: 'm',  # م
  0x646: 'n',  # ن
  0x647: 'h',  # ه
  0x648: 'w',  # و
  0x649: 'Y',  # ى
  0x64a: 'y',  # ي
  0x64b: 'F',  # ً
  0x64c: 'N',  # ٌ
  0x64d: 'K',  # ٍ
  0x64e: 'a',  # َ
  0x64f: 'u',  # ُ
  0x650: 'i',  # ِ
  0x651: '~',  # ّ
  0x652: 'o',  # ْ
  0x670: '`',  # '
  0x671: '{'   # ٱ
  }

def to_buckwalter(s):
    return ''.join([buckwalter_dict.get(ord(c), '?') for c in s])


def unvocalize(s):
    return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a])

def is_arabic(s):
    return s and any(1574 <= ord(c) <= 1616 for c in s)


def gf_fun(s, pos):
    return ''.join(["'", s, "_", pos, "'"])


def forms_for_pos(obj):
    forms = {
        form['form']:
          form.get('tags', []) for
            form in obj.get('forms', []) if
               'romanization' not in form.get('tags', []) and
                   is_arabic(form['form'])
        }.items()
    if obj['pos'] == 'noun':
        lemma = [form[:-1] for form, descr in forms
                         if all([w in descr for w in ['construct', 'nominative', 'singular']])][:1]
        return {
            'gf_fun': gf_fun(lemma[0], 'N') if lemma else None,
            'gf_cat': 'N',
            'singular': lemma,
            'plural': [form[:-1] for form, descr in forms
                         if all([w in descr for w in ['construct', 'nominative', 'plural']])][:1],
            'gender': 'Fem' if 'Arabic feminine nouns' in obj['categories']
                            else ('Masc' if  'Arabic masculine nouns' in obj['categories']
                                else None)
            }
    elif obj['pos'] == 'verb':
        lemma = [form for form, descr in forms
                      if all([w in descr for
                              w in ["active", "indicative", "masculine", "past", "perfective", "singular", "third-person"]])][:1]
        return {
          'gf_fun': gf_fun(lemma[0], 'V') if lemma else None,
          'gf_cat': 'V',
          'perfect': lemma,
          'imperfect': [form for form, descr in forms
                      if all([w in descr for
                              w in ["active", "indicative", "masculine", "non-past", "imperfective", "singular", "third-person"]])][:1],
          'verbclass': max([n for n in ['I', 'II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','']
                            if n in ' '.join([c for c in obj['categories'] if c.endswith('verbs') and any([n in c for n in 'IVX'])])],
                           key=len)
          }
    elif obj['pos'] == 'adj':
        lemma = [form for form, descr in forms
                         if all([w in descr for w in ['indefinite', 'masculine', 'singular', 'informal']])][:1]
        return {
            'gf_fun': gf_fun(lemma[0], 'A') if lemma else None,
            'gf_cat': 'A',
            'masc_singular': lemma,
            'masc_plural': [form for form, descr in forms
                         if all([w in descr for w in ['indefinite', 'masculine', 'plural', 'informal']])][:1],
            'fem_singular': [form for form, descr in forms
                         if all([w in descr for w in ['indefinite', 'feminine', 'singular', 'informal']])][:1],
            'fem_plural': [form for form, descr in forms
                         if all([w in descr for w in ['indefinite', 'feminine', 'plural', 'informal']])][:1],
            }

    else:
        return {f: d for f, d in forms}


# "root": ["ش ر ح (š-r-ḥ)"]
def find_root(s):
    return ''.join([c for c in s if is_arabic(c)])

import sys
MODE = sys.argv[1]

if MODE == 'gf':
    print('abstract MorphoDictAraAbs = Cat ** {')

with open(FILTERED_WIKT) as file:
    seen_gf_funs = set()
    for line in file:
        obj = json.loads(line)
        if 'Arabic lemmas' in obj.get('categories', []):
            entry = {
                'pos': obj['pos'],
                'root': [find_root(t['expansion']) for t in obj.get('etymology_templates', []) if t.get('name', None) =='ar-root'][:1],
                'forms': forms_for_pos(obj),
                'senses': [sense['glosses'] for sense in obj.get('senses', [])
                           if 'glosses' in sense]
                }
#            entry['n_forms'] = len(entry['forms'])
#            print(entry['pos'], entry['n_forms'])
            if MODE == 'json':
                print(json.dumps(entry, ensure_ascii=False))

            if MODE == 'gf':

                if 'gf_fun' in entry['forms'] and entry['forms']['gf_fun']:
                    if entry['forms']['gf_fun'] not in seen_gf_funs:
                        print('fun', entry['forms']['gf_fun'], ':', entry['forms']['gf_cat'], ';', '--', entry['senses'])
                        seen_gf_funs.add(entry['forms']['gf_fun'])

                # to do: rename duplicate function names: of 13762 names, 12946 are unique

if MODE == 'gf':
    print('}')