Files
gf-rgl/src/arabic/OrthoAra.gf
2018-11-13 17:01:35 +01:00

78 lines
2.5 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
resource OrthoAra = open Prelude, Predef in {
flags coding=utf8 ;
oper
-- Vowel diacritics recognised by this module: the three short-vowel marks
-- followed by the three tanwin (nunation) marks.
vow : pattern Str =
  #( "َ" | "ِ" | "ُ"
   | "ً" | "ٍ" | "ٌ" ) ;
-- The two letters this module treats as "weak": waw and ya.
weak : pattern Str =
  #( "و" | "ي" ) ;
-- "Sun letters": assimilate with def. article
sun : pattern Str =
  #( "ت" | "ث" | "د" | "ذ" | "ر" | "ز" | "س"
   | "ش" | "ص" | "ض" | "ط" | "ظ" | "ل" | "ن" ) ;  -- 14 single-letter alternatives
-- Shadda ordering, see https://www.unicode.org/L2/L2017/17253-arabic-ordering.pdf
-- Glue `suffix` onto `word`. When `word` ends in a vowel mark (#vow) and
-- `suffix` begins with a shadda, the shadda is moved in front of that vowel
-- mark; in every other situation the two strings are simply concatenated.
-- Disabled alternative kept from the original, for reference:
--   <x + "ّ", v@#vow + y> => x + v + "ّ" + y ;
fixShd : Str -> Str -> Str = \word,suffix ->
  case suffix of {
    "ّ" + rest =>
      case word of {
        stem + vowel@#vow => stem + "ّ" + vowel + rest ;
        _                 => word + suffix
      } ;
    _ => word + suffix
  } ;
-- IL: used so that patterns can be reused for weak verbs; might be strange/wrong.
-- Drops one sukun mark from the string: the text on either side of the matched
-- sukun is rejoined. A string containing no sukun is returned unchanged.
rmSukun : Str -> Str = \str ->
  case str of {
    front + "ْ" + rest => front + rest ;
    _                  => str
  } ;
-- Hamza
-- Characters that rectifyHmz treats as an unseated hamza to be rewritten.
-- The first alternative is the bare hamza letter; the second is the Arabic
-- question mark character.
-- NOTE(review): presumably the question mark is a placeholder spelling for
-- hamza used by callers - confirm against the lexicon.
hamza : pattern Str =
  #( "ء" | "؟" ) ;
-- Rewrite an unseated hamza (#hamza) inside `word` into a seated form.
-- The branches are tried in order; a word that matches none of them is
-- returned unchanged. Uses bHmz / tHmz to choose the seat, and dp / take
-- (opened from Predef) in the last hamza branch.
rectifyHmz : Str -> Str = \word ->
case word of {
-- Word start (optionally after the prefix "ال"): alif-hamza, with or without
-- a fatha, followed by a hamza becomes alif madda. In the first branch a
-- sukun directly after the hamza is dropped as well.
l@(""|"ال") + ("أ"|"أَ") + #hamza + "ْ" + tail => l + "آ" + tail;
l@(""|"ال") + ("أ"|"أَ") + #hamza + tail => l + "آ" + tail;
-- Word start (optionally after "ال"): hamza + fatha/damma is seated above
-- alif; hamza + kasra is seated below alif. The vowel mark is kept.
l@(""|"ال") + #hamza + v@("َ"|"ُ") + tail => l + "أ" + v + tail;
l@(""|"ال") + #hamza + v@("ِ") + tail => l + "إ" + v + tail;
-- Hamza between a preceding v1 (vowel mark, sukun, alif, ya or waw) and a
-- following v2 (vowel mark or sukun): the seat comes from bHmz v1 v2.
-- When v2 is a sukun it is not copied to the output.
head + v1@(#vow|"ْ"|"ا"|"ي"|"و")
+ #hamza + v2@(#vow|"ْ") + tail =>
case v2 of { "ْ" => head + v1 + bHmz v1 v2 + tail ; -- unsure about this /IL
_ => head + v1 + bHmz v1 v2 + v2 + tail } ;
-- NOTE(review): if `tail` above is allowed to match the empty string, the
-- previous branch already accepts every word this branch describes - confirm
-- that this branch is reachable.
head + v1@(#vow|"ْ"|"ا"|"ي"|"و") -- the same but it ends in vowel
+ #hamza + v2@(#vow|"ْ") =>
case v2 of { "ْ" => head + v1 + tHmz v1 ;
_ => head + v1 + tHmz v1 + v2 } ;
-- Word-final hamza with nothing after it: the seat comes from tHmz v1.
head + v1@(#vow|"ْ"|"ا"|"ي"|"و") -- the same but it ends without vowel
+ #hamza => head + v1 + tHmz v1 ;
-- Any other hamza: bHmz is applied to (dp 2 head) and (take 2 tail).
-- NOTE(review): the original remark reads "last head, take 1 tail", while the
-- code passes 2 to both dp and take; bHmz only has cases for single marks, so
-- two-character arguments would presumably fall through to its default -
-- confirm which length is intended.
head + #hamza + tail => head + (bHmz (dp 2 head) (take 2 tail)) + tail; --last head , take 1 tail
_ => word
};
-- Hamza at the beginning of a word (head).
-- Maps the accompanying vowel mark to the alif seat: kasra gives the
-- hamza-below-alif letter, any other argument gives hamza-above-alif.
hHmz : Str -> Str = \vowel ->
  case vowel of {
    "ِ" => "إ" ;
    _   => "أ"
  } ;
-- Hamza in the middle of a word (body).
-- Chooses the seat from the marks on either side of the hamza. Branch order
-- sets the priority: a kasra on either side wins, then a damma, then a
-- fatha; when none of these is present the bare hamza is returned.
bHmz : Str -> Str -> Str = \before,after ->
  case <before,after> of {
    <"ِ",_> => "ئ" ;
    <_,"ِ"> => "ئ" ;
    <"ُ",_> => "ؤ" ;
    <_,"ُ"> => "ؤ" ;
    <"َ",_> => "أ" ;
    <_,"َ"> => "أ" ;
    _       => "ء"
  } ;
-- Hamza carrier sequence: seat for a hamza chosen from the single mark or
-- letter `d` that precedes it.
--   kasra -> hamza on ya, damma -> hamza on waw, fatha -> hamza on alif,
--   sukun / alif / waw / ya -> bare hamza.
-- Any other argument also yields the bare hamza. rectifyHmz can pass every
-- #vow alternative here, including the tanwin marks, which previously had no
-- matching branch; the catch-all mirrors the default of bHmz.
tHmz : Str -> Str = \d ->
  case d of {
    "ِ" => "ئ" ;
    "ُ" => "ؤ" ;
    "َ" => "أ" ;
    "ْ"|"ا"|"و"|"ي" => "ء" ;
    _ => "ء"  -- fallback so the case expression is total
  } ;
}