mirror of
https://github.com/GrammaticalFramework/gf-rgl.git
synced 2026-05-27 08:58:55 -06:00
factored out arabic_utilities.py as a separate file
This commit is contained in:
169
src/arabic/wiktionary/arabic_utilities.py
Normal file
169
src/arabic/wiktionary/arabic_utilities.py
Normal file
@@ -0,0 +1,169 @@
|
|||||||
|
# utilities for Arabic script
|
||||||
|
# in the main mode, converts string literals in stdin 'to' or 'from' Buckwalter
|
||||||
|
# as specified by the command line argument:
|
||||||
|
#
|
||||||
|
# % python3 arabic_utilities.py to <MorphoDictAra.gf | python3 arabic_utilities.py from >b.tmp
|
||||||
|
# % diff MorphoDictAra.gf b.tmp
|
||||||
|
# %
|
||||||
|
|
||||||
|
def is_arabic(s):
    """Tell whether ``s`` has at least one character in U+0626..U+0650.

    A falsy ``s`` (``''`` or ``None``) is handed back unchanged rather than
    as ``False``; for any other input the result is a real ``bool``.
    """
    # NOTE(review): the window starts at U+0626 (1574) and ends at U+0650
    # (1616), so the hamza forms U+0621..U+0625 and shadda/sukun
    # (U+0651, U+0652) do not count as Arabic here -- confirm this is intended.
    if not s:
        return s
    for ch in s:
        if 1574 <= ord(ch) <= 1616:
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def get_arabic(s):
    """Return ``s`` reduced to the characters that ``is_arabic`` accepts."""
    return ''.join(filter(is_arabic, s))
|
||||||
|
|
||||||
|
|
||||||
|
def unvocalize(s):
    """Return ``s`` with every character outside U+0621..U+064A removed.

    The kept range ends just before fathatan (U+064B), so short vowels,
    tanwin, shadda and sukun are dropped, as is any non-Arabic character.
    """
    kept = []
    for ch in s:
        if 0x621 <= ord(ch) <= 0x64a:
            kept.append(ch)
    return ''.join(kept)
|
||||||
|
|
||||||
|
|
||||||
|
# Buckwalter transliteration table: Unicode code point -> ASCII symbol.
# https://en.wikipedia.org/wiki/Buckwalter_transliteration
buckwalter_dict = dict((
    (0x621, "'"),  # hamza
    (0x622, '|'),  # alef with madda above
    (0x623, '>'),  # alef with hamza above
    (0x624, '&'),  # waw with hamza above
    (0x625, '<'),  # alef with hamza below
    (0x626, '}'),  # yeh with hamza above
    (0x627, 'A'),  # alef
    (0x628, 'b'),  # beh
    (0x629, 'p'),  # teh marbuta
    (0x62a, 't'),  # teh
    (0x62b, 'v'),  # theh
    (0x62c, 'j'),  # jeem
    (0x62d, 'H'),  # hah
    (0x62e, 'x'),  # khah
    (0x62f, 'd'),  # dal
    (0x630, '*'),  # thal
    (0x631, 'r'),  # reh
    (0x632, 'z'),  # zain
    (0x633, 's'),  # seen
    (0x634, '$'),  # sheen
    (0x635, 'S'),  # sad
    (0x636, 'D'),  # dad
    (0x637, 'T'),  # tah
    (0x638, 'Z'),  # zah
    (0x639, 'E'),  # ain
    (0x63a, 'g'),  # ghain
    (0x641, 'f'),  # feh
    (0x642, 'q'),  # qaf
    (0x643, 'k'),  # kaf
    (0x644, 'l'),  # lam
    (0x645, 'm'),  # meem
    (0x646, 'n'),  # noon
    (0x647, 'h'),  # heh
    (0x648, 'w'),  # waw
    (0x649, 'Y'),  # alef maksura
    (0x64a, 'y'),  # yeh
    (0x64b, 'F'),  # fathatan
    (0x64c, 'N'),  # dammatan
    (0x64d, 'K'),  # kasratan
    (0x64e, 'a'),  # fatha
    (0x64f, 'u'),  # damma
    (0x650, 'i'),  # kasra
    (0x651, '~'),  # shadda
    (0x652, 'o'),  # sukun
    (0x670, '`'),  # superscript alef
    (0x671, '{'),  # alef wasla
))
|
||||||
|
|
||||||
|
|
||||||
|
# inverse table: Buckwalter symbol -> Arabic character
buckwalter_dict_rev = dict(zip(buckwalter_dict.values(), map(chr, buckwalter_dict.keys())))
|
||||||
|
|
||||||
|
# the three tanwin marks and the three short vowels, U+064B..U+0650
arabic_vowels = set(map(chr, range(0x64b, 0x651)))
|
||||||
|
|
||||||
|
# every code point from beh (U+0628) up to heh (U+0647): leaves out alif, waw
# and ya, but does take in teh marbuta (U+0629) and tatweel (U+0640)
sound_consonants = set(map(chr, range(0x628, 0x648)))
|
||||||
|
|
||||||
|
def to_buckwalter(s):
    """Transliterate ``s`` into Buckwalter symbols.

    Characters without an entry in ``buckwalter_dict`` are copied unchanged.
    """
    pieces = []
    for ch in s:
        pieces.append(buckwalter_dict.get(ord(ch), ch))
    return ''.join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
def from_buckwalter(s):
    """Turn Buckwalter symbols in ``s`` back into Arabic characters.

    Characters that are not keys of ``buckwalter_dict_rev`` are copied
    unchanged.
    """
    return ''.join(map(lambda ch: buckwalter_dict_rev.get(ch, ch), s))
|
||||||
|
|
||||||
|
|
||||||
|
def drop_final_vowel(s):
    """Return ``s`` without its last character when that is in ``arabic_vowels``.

    Any other string is returned unchanged. That includes the empty string,
    which used to raise ``IndexError`` on the ``s[-1]`` lookup.
    """
    if s and s[-1] in arabic_vowels:
        return s[:-1]
    return s
|
||||||
|
|
||||||
|
|
||||||
|
def normal(s):
    """Return the Unicode NFD normalization of ``s``."""
    # Imported locally: this module has no top-level ``import unicodedata``,
    # so the bare name would raise NameError when the function is called.
    import unicodedata

    return unicodedata.normalize('NFD', s)
|
||||||
|
|
||||||
|
# heuristic for finding the three radicals from certain forms
# works only for sound (strong) 3-radical roots, otherwise None
def get_sound_trigram_root(s):
    """Return the characters of ``s`` found in ``sound_consonants``, or None.

    The characters are joined in their order of appearance and returned only
    when there are exactly three of them.
    """
    radicals = ''.join(ch for ch in s if ch in sound_consonants)
    return radicals if len(radicals) == 3 else None
|
||||||
|
|
||||||
|
|
||||||
|
# reverse engineer fcl pattern from a given form, with a sound trigram root
# TODO: better use the given root of the lex entry
def get_sound_fcl_pattern(s):
    """Return ``s`` with its three radicals replaced by feh, ain and lam.

    Returns None when ``get_sound_trigram_root`` finds no root for ``s``.

    The former extra guard ``len([c in s for c in root]) == 3`` measured a
    list of three booleans and was therefore always true, so it has been
    removed; results are unchanged. The intended condition (each root letter
    occurs exactly once) is not needed for correctness: the root is made of
    all sound consonants of ``s`` in order, and each search below starts
    after the previous hit, so repeated radicals are located correctly.
    """
    root = get_sound_trigram_root(s)
    if not root:
        return None
    pattern = list(s)
    placeholders = (chr(0x641), chr(0x639), chr(0x644))  # feh, ain, lam
    start = 0
    for radical, placeholder in zip(root, placeholders):
        # always found: the radicals were collected from s in this order
        pos = s.find(radical, start)
        pattern[pos] = placeholder
        start = pos + 1
    return ''.join(pattern)
|
||||||
|
|
||||||
|
|
||||||
|
# Wikt uses vowel+shadda which is a Unicode normalization
# GF uses shadda+vowel which is linguistically correct
# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
# unicodedata.normalize does this wrong, as noted by Ariel Gutman
def reorder_shadda(s):
    """Rewrite every short-vowel + shadda pair in ``s`` as shadda + vowel.

    Only fatha, damma and kasra are handled, in that order, exactly as the
    earlier Buckwalter-based version did. Working directly on the Arabic
    code points avoids that version's round trip through Buckwalter, which
    turned any raw Buckwalter symbol in the input (Latin letters such as
    ``a`` or ``b``, or punctuation such as ``|`` or ``*``) into an Arabic
    character.
    """
    shadda = '\u0651'
    for vowel in ('\u064e', '\u064f', '\u0650'):  # fatha, damma, kasra
        s = s.replace(vowel + shadda, shadda + vowel)
    return s
|
||||||
|
|
||||||
|
|
||||||
|
# quote word forms but not parameters
def quote_if(s, cond=is_arabic, change=reorder_shadda):
    """Return ``change(s)`` wrapped in double quotes when ``cond(s)`` holds.

    When ``cond(s)`` is falsy, ``s`` is returned as it is and ``change`` is
    not called.
    """
    return '"' + change(s) + '"' if cond(s) else s
|
||||||
|
|
||||||
|
|
||||||
|
# for a string, change each string literal in "..." with a change function
# leaving other characters as they are; print the result to stdout
def change_literals(s, change):
    """Print ``s`` with the content of every ``"..."`` literal passed through ``change``.

    Text outside literals is printed unchanged and no newline is added.
    There is no escape handling: every ``"`` character toggles literal mode.
    If ``s`` holds an odd number of quotes, the text after the last quote is
    printed unchanged together with its opening quote; earlier versions
    silently dropped both.
    """
    # After splitting on '"', odd-indexed parts lie between an opening and a
    # closing quote; with an odd number of quotes the final part is
    # odd-indexed but has no closing quote.
    parts = s.split('"')
    unterminated = len(parts) % 2 == 0
    last = len(parts) - 1
    converted = []
    for index, part in enumerate(parts):
        inside_literal = index % 2 == 1 and not (unterminated and index == last)
        converted.append(change(part) if inside_literal else part)
    print('"'.join(converted), end='')
|
||||||
|
|
||||||
|
|
||||||
|
# convert literals in stdin 'to' or 'from' Buckwalter
if __name__ == '__main__':
    import sys

    converters = {'to': to_buckwalter, 'from': from_buckwalter}

    # A missing argument used to raise IndexError, and an unknown mode used
    # to consume stdin while printing nothing; both now stop with a usage
    # message on stderr and a non-zero exit status.
    if len(sys.argv) < 2 or sys.argv[1] not in converters:
        sys.exit('usage: python3 arabic_utilities.py (to|from) <input >output')

    convert = converters[sys.argv[1]]
    for line in sys.stdin:
        change_literals(line, convert)
|
||||||
|
|
||||||
|
|
||||||
@@ -3,7 +3,7 @@ import json
|
|||||||
import sys
|
import sys
|
||||||
import unicodedata
|
import unicodedata
|
||||||
import pgf
|
import pgf
|
||||||
|
from arabic_utilities import *
|
||||||
|
|
||||||
# data from https://kaikki.org/dictionary/rawdata.html
|
# data from https://kaikki.org/dictionary/rawdata.html
|
||||||
# thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
|
# thanks Tatu Ylonen: Wiktextract: Wiktionary as Machine-Readable Structured Data,
|
||||||
@@ -110,132 +110,6 @@ if MODE == 'error-analysis':
|
|||||||
for labverdict, n in sorted(list(evals.items())):
|
for labverdict, n in sorted(list(evals.items())):
|
||||||
print(labverdict, n)
|
print(labverdict, n)
|
||||||
|
|
||||||
# https://en.wikipedia.org/wiki/Buckwalter_transliteration
|
|
||||||
buckwalter_dict = {
|
|
||||||
0x621: "'", # ء
|
|
||||||
0x622: '|', # آ
|
|
||||||
0x623: '>', # أ
|
|
||||||
0x624: '&', # ؤ
|
|
||||||
0x625: '<', # إ
|
|
||||||
0x626: '}', # ئ
|
|
||||||
0x627: 'A', # ا
|
|
||||||
0x628: 'b', # ب
|
|
||||||
0x629: 'p', # ة
|
|
||||||
0x62a: 't', # ت
|
|
||||||
0x62b: 'v', # ث
|
|
||||||
0x62c: 'j', # ج
|
|
||||||
0x62d: 'H', # ح
|
|
||||||
0x62e: 'x', # خ
|
|
||||||
0x62f: 'd', # د
|
|
||||||
0x630: '*', # ذ
|
|
||||||
0x631: 'r', # ر
|
|
||||||
0x632: 'z', # ز
|
|
||||||
0x633: 's', # س
|
|
||||||
0x634: '$', # ش
|
|
||||||
0x635: 'S', # ص
|
|
||||||
0x636: 'D', # ض
|
|
||||||
0x637: 'T', # ط
|
|
||||||
0x638: 'Z', # ظ
|
|
||||||
0x639: 'E', # ع
|
|
||||||
0x63a: 'g', # غ
|
|
||||||
0x641: 'f', # ف
|
|
||||||
0x642: 'q', # ق
|
|
||||||
0x643: 'k', # ك
|
|
||||||
0x644: 'l', # ل
|
|
||||||
0x645: 'm', # م
|
|
||||||
0x646: 'n', # ن
|
|
||||||
0x647: 'h', # ه
|
|
||||||
0x648: 'w', # و
|
|
||||||
0x649: 'Y', # ى
|
|
||||||
0x64a: 'y', # ي
|
|
||||||
0x64b: 'F', # ً
|
|
||||||
0x64c: 'N', # ٌ
|
|
||||||
0x64d: 'K', # ٍ
|
|
||||||
0x64e: 'a', # َ
|
|
||||||
0x64f: 'u', # ُ
|
|
||||||
0x650: 'i', # ِ
|
|
||||||
0x651: '~', # ّ
|
|
||||||
0x652: 'o', # ْ
|
|
||||||
0x670: '`', # '
|
|
||||||
0x671: '{' # ٱ
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
buckwalter_dict_rev = {b: chr(a) for a, b in buckwalter_dict.items()}
|
|
||||||
|
|
||||||
arabic_vowels = {chr(c) for c in {0x64b, 0x64c, 0x64d, 0x64e, 0x64f, 0x650}}
|
|
||||||
|
|
||||||
sound_consonants = {chr(c) for c in range(0x628, 0x648)} # excluding alif, waw, ya
|
|
||||||
|
|
||||||
def to_buckwalter(s):
|
|
||||||
return ''.join([buckwalter_dict.get(ord(c), c) for c in s])
|
|
||||||
|
|
||||||
|
|
||||||
def from_buckwalter(s):
|
|
||||||
return ''.join([buckwalter_dict_rev.get(c, c) for c in s])
|
|
||||||
|
|
||||||
|
|
||||||
def unvocalize(s):
|
|
||||||
return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a])
|
|
||||||
|
|
||||||
|
|
||||||
def drop_final_vowel(s):
|
|
||||||
if s[-1] in arabic_vowels:
|
|
||||||
return s[:-1]
|
|
||||||
else:
|
|
||||||
return s
|
|
||||||
|
|
||||||
|
|
||||||
def is_arabic(s):
|
|
||||||
return s and any(1574 <= ord(c) <= 1616 for c in s)
|
|
||||||
|
|
||||||
def normal(s):
|
|
||||||
return unicodedata.normalize('NFD', s)
|
|
||||||
|
|
||||||
# heuristic for finding the three radicals from certain forms
|
|
||||||
# works only for sound (strong) 3-radical roots, otherwise None
|
|
||||||
def get_sound_trigram_root(s):
|
|
||||||
sounds = [c for c in s if c in sound_consonants]
|
|
||||||
if len(sounds) == 3:
|
|
||||||
return ''.join(sounds)
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
# reverse engineer fcl pattern from a given form, with a sound trigram root
|
|
||||||
# one more condition: each of the root letters occurs exactly ones
|
|
||||||
# TODO: better use the given root of the lex entry
|
|
||||||
def get_sound_fcl_pattern(s):
|
|
||||||
if root := get_sound_trigram_root(s):
|
|
||||||
if len([c in s for c in root]) == 3:
|
|
||||||
p = list(s)
|
|
||||||
r = s.find(root[0])
|
|
||||||
p[r] = chr(0x641)
|
|
||||||
r += s[r+1:].find(root[1]) + 1
|
|
||||||
p[r] = chr(0x639)
|
|
||||||
r += s[r+1:].find(root[2]) + 1
|
|
||||||
p[r] = chr(0x644)
|
|
||||||
p = ''.join(p)
|
|
||||||
## print('---PATT', s, root, p)
|
|
||||||
return p
|
|
||||||
|
|
||||||
|
|
||||||
# Wikt uses vowel+shadda which is a Unicode normalization
|
|
||||||
# GF uses shadda+vowel which is linguistically correct
|
|
||||||
# see https://stackoverflow.com/questions/58559390/in-unicode-should-u0651-arabic-shadda-be-before-or-after-kasra
|
|
||||||
# unicodedata.normalize does this wrong, as noted by Ariel Gutman
|
|
||||||
## todo: more direct implementation
|
|
||||||
def reorder_shadda(s):
|
|
||||||
return from_buckwalter(to_buckwalter(s).replace('a~', '~a').replace('u~', '~u').replace('i~', '~i'))
|
|
||||||
|
|
||||||
|
|
||||||
# quote word forms but not parameters
|
|
||||||
def quote_if(s, cond=is_arabic, change=reorder_shadda):
|
|
||||||
if cond(s):
|
|
||||||
return '"' + change(s) + '"'
|
|
||||||
else:
|
|
||||||
return s
|
|
||||||
|
|
||||||
|
|
||||||
# generate word_d_C functions starting with d=0, but show d only when >= 1
|
# generate word_d_C functions starting with d=0, but show d only when >= 1
|
||||||
def gf_fun(s, pos, disamb=0):
|
def gf_fun(s, pos, disamb=0):
|
||||||
|
|||||||
@@ -1,6 +1,8 @@
|
|||||||
import csv
|
import csv
|
||||||
import json
|
import json
|
||||||
|
|
||||||
|
from arabic_utilities import *
|
||||||
|
|
||||||
# to run: python3 to_wordnet.py >arabic-wn-morpho.jsonl
|
# to run: python3 to_wordnet.py >arabic-wn-morpho.jsonl
|
||||||
# the following are assumed
|
# the following are assumed
|
||||||
|
|
||||||
@@ -12,15 +14,6 @@ WN_TSV = 'ar2en_words_gf.csv' # Zarzoura
|
|||||||
# built as explained in ./read_wiktionary.py
|
# built as explained in ./read_wiktionary.py
|
||||||
MORPHO_GF = 'MorphoDictAraAbs.gf'
|
MORPHO_GF = 'MorphoDictAraAbs.gf'
|
||||||
|
|
||||||
def is_arabic(s):
|
|
||||||
return s and any(1574 <= ord(c) <= 1616 for c in s)
|
|
||||||
|
|
||||||
def get_arabic(s):
|
|
||||||
return ''.join([c for c in s if is_arabic(c)])
|
|
||||||
|
|
||||||
def unvocalize(s):
|
|
||||||
return ''.join([c for c in s if 0x621 <= ord(c) <= 0x64a])
|
|
||||||
|
|
||||||
|
|
||||||
# fun 'دُبُ_N' : N ; -- 10 [['bear']]
|
# fun 'دُبُ_N' : N ; -- 10 [['bear']]
|
||||||
funmap = {}
|
funmap = {}
|
||||||
|
|||||||
Reference in New Issue
Block a user