From 6b07bc3df9764da7ce69889cde9bac061fe3929f Mon Sep 17 00:00:00 2001 From: aarne Date: Sat, 3 Dec 2011 18:17:36 +0000 Subject: [PATCH] refactored ThaiScript by introducing a datatype of Thai characters - script to pronunciation is now more reliable --- lib/src/thai/LexiconTha.gf | 2 +- lib/src/thai/ThaiScript.hs | 383 +++++++++++++++++++------------------ 2 files changed, 196 insertions(+), 189 deletions(-) diff --git a/lib/src/thai/LexiconTha.gf b/lib/src/thai/LexiconTha.gf index 58b18567b..3286f1efe 100644 --- a/lib/src/thai/LexiconTha.gf +++ b/lib/src/thai/LexiconTha.gf @@ -8,7 +8,7 @@ flags lin -- add_V3 = dirV3 (regV "add") toP ; - airplane_N = mkN (thword "เครึอง" "บิน") "ลำ" ; + airplane_N = mkN (thword "เครื่อง" "บิน") "ลำ" ; -- already_Adv = mkAdv "already" ; animal_N = animalN (thword "สัตว์") ; answer_V2S = mkV2S (regV "ขาน") [] ; ---- prep diff --git a/lib/src/thai/ThaiScript.hs b/lib/src/thai/ThaiScript.hs index 79100772d..01bb2a283 100644 --- a/lib/src/thai/ThaiScript.hs +++ b/lib/src/thai/ThaiScript.hs @@ -28,7 +28,7 @@ appThpron s = case s of mkThpron s = "(thpron \"" ++ s ++ "\" \"" ++ thai2pron s ++ "\")" -- heuristics for finding syllables -uniSyllables :: [Int] -> [[Int]] +uniSyllables :: [ThaiDat] -> [[ThaiDat]] uniSyllables = reverse . (syll [] []) where syll sys sy is = case is of c:cs | isPreVowel c -> new [] is @@ -40,23 +40,24 @@ uniSyllables = reverse . (syll [] []) where continue old = syll sys (sy ++ old) -isPreVowel :: Int -> Bool -isPreVowel i = 0xe40 <= i && i <= 0xe44 +isPreVowel :: ThaiDat -> Bool +isPreVowel i = Ce <= i && i <= CaE -isVowel :: Int -> Bool -isVowel i = 0xe30 <= i && i <= 0xe44 +isVowel :: ThaiDat -> Bool +isVowel i = CaP <= i && i <= CaE -isConsonant :: Int -> Bool -isConsonant i = 0xe01 <= i && i <= 0xe2e && i /= 0xe2d +isConsonant :: ThaiDat -> Bool +isConsonant i = Ck <= i && i <= Ch' && i /= CO -isMark :: Int -> Bool -isMark i = 0xe47 <= i && i <= 0xe4c +isMark :: ThaiDat -> Bool +isMark i = CS <= i && i <= CK -- the following functions involving pron (=pronunciation) work on syllables thai2pron = uni2pron . thai2uni trans2pron = uni2pron . trans2uni trans2thai = uni2thai . trans2uni +trans2dat = uni2dat . trans2uni thai2uni :: String -> [Int] thai2uni = map fromEnum @@ -65,77 +66,90 @@ uni2thai :: [Int] -> String uni2thai = map toEnum uni2pron :: [Int] -> String -uni2pron is = case getSyllable is of +uni2pron = dat2pron . uni2dat + +uni2dat = map (maybe CR dat . (\u -> Map.lookup u uniMap)) --- CR as exception value + +dat2pron :: [ThaiDat] -> String +dat2pron is = case getSyllable is of -- diph order as in Smyth, p. 15 - [0xe40] : cc : [] : d : cs | kill d -> prons cc ++ tone cc d cs "e" ++ endWith cs -- e-8 -> e - [0xe41] : cc : [] : d : cs | kill d -> prons cc ++ tone cc d cs "\x25b" ++ endWith cs -- ä-8 -> ä - [v] : cc : [] : d : cs -> prons cc ++ tone cc d cs (pron v) ++ endWith cs -- e/ä/o/ay/ay - [0xe40] : cc : [0xe22] : d : cs -> prons cc ++ tone cc d cs "\601\601y" ++ endWith cs -- e-y -> ööy - [0xe40] : cc : [0xe2d] : d : cs -> prons cc ++ tone cc d cs "\601\601" ++ endWith cs -- e-O -> öö - [0xe40] : cc : [0xe2d,0xe30] : d : cs -> prons cc ++ tone cc d cs "\601" ++ endWith cs -- e-Oa -> ö - [0xe40] : cc : [0xe30] : d : cs -> prons cc ++ tone cc d cs "e" ++ endWith cs -- e-a - [0xe40] : cc : [0xe32] : d : cs -> prons cc ++ tone cc d cs "aw" ++ endWith cs -- e-a - [0xe40] : cc : [0xe32,0xe30] : d : cs -> prons cc ++ tone cc d cs "\596" ++ endWith cs -- e-Aa -> O - [0xe40] : cc : [0xe34] : d : cs -> prons cc ++ tone cc d cs "\601\601" ++ endWith cs -- e-i -> öö - [0xe40] : cc : [0xe35,0xe22] : d : cs -> prons cc ++ tone cc d cs "ia" ++ endWith cs -- e-iiy-> ia - [0xe40] : cc : [0xe34,0xe22,0xe30] : d : cs -> prons cc ++ tone cc d cs "ia" ++ endWith cs -- e-iya-> ia - [0xe40] : cc : [0xe35,0xe2d] : d : cs -> prons cc ++ tone cc d cs "\x289a" ++ endWith cs -- e-iiO-> üa ---[0xe40] : cc : [0xe37,0xe2d] : d : cs -> prons cc ++ tone cc d cs "\649\649" ++ endWith cs - - [0xe41] : cc : [0xe30] : d : cs -> prons cc ++ tone cc d cs "\x25b" ++ endWith cs -- ä-a -> ä - [0xe42] : cc : [0xe30] : d : cs -> prons cc ++ tone cc d cs "o" ++ endWith cs -- o-a -> o - [] : [0xe2d] : v : d : cs -> tone [0xe2d] d cs (prons v) ++ endWith cs -- Ov -> v - [] : cc : [0xe31,0xe27] : d : cs -> prons cc ++ tone cc d cs "ua" ++ endWith cs -- Caw -> Cua - [] : bb : [] : cc : [] : d : cs -> prons bb ++ "a" ++ prons cc ++ tone cc [] cs "o" ++ endWith cs -- CaCoC - [] : bb : [] : cc : [] -> prons bb ++ "o" ++ endWith [cc] -- CoC - [] : cc : v : d : cs -> prons cc ++ tone cc d cs (prons v) ++ endWith cs -- Cv- (normal) + [Ce] : cc : [] : d : cs | brev d -> prons cc ++ tone cc d cs "e" ++ endWith cs -- e-8 -> e + [Ce'] : cc : [] : d : cs | brev d -> prons cc ++ tone cc d cs i_ae ++ endWith cs -- ä-8 -> ä + [v] :[CO]: [] : d : cs -> tone[CO]d cs (pron v) ++ endWith cs -- e/ä/o/ay/ay + [v] : cc : [] : d : cs -> prons cc ++ tone cc d cs (pron v) ++ endWith cs -- e/ä/o/ay/ay + [Ce] : cc : [Cy] : d : cs -> prons cc ++ tone cc d cs (i_ooe++"y")++ endWith cs -- e-y -> ööy + [Ce] : cc : [CO] : d : cs -> prons cc ++ tone cc d cs i_ooe ++ endWith cs -- e-O -> öö + [Ce] : cc : [CO,CaP] : d : cs -> prons cc ++ tone cc d cs i_oe ++ endWith cs -- e-Oa -> ö + [Ce] : cc : [CaP] : d : cs -> prons cc ++ tone cc d cs "e" ++ endWith cs -- e-a + [Ce] : cc : [CaL] : d : cs -> prons cc ++ tone cc d cs "aw" ++ endWith cs -- e-a + [Ce] : cc : [CaL,CaP] : d : cs -> prons cc ++ tone cc d cs i_O ++ endWith cs -- e-Aa -> O + [Ce] : cc : [Ci] : d : cs -> prons cc ++ tone cc d cs i_ooe ++ endWith cs -- e-i -> öö + [Ce] : cc : [CiL,Cy] : d : cs -> prons cc ++ tone cc d cs "ia" ++ endWith cs -- e-iiy-> ia + [Ce] : cc : [Ci,Cy,CaP] : d : cs -> prons cc ++ tone cc d cs "ia" ++ endWith cs -- e-iya-> ia + [Ce] : cc : [CiL,CO] : d : cs -> prons cc ++ tone cc d cs "ia" ++ endWith cs -- e-iiO-> üa + [Ce] : cc : [CvL,CO] : d : cs -> prons cc ++ tone cc d cs (i_ue++"a") ++ endWith cs -- e-vvO-> üa + [Ce'] : cc : [CaP] : d : cs -> prons cc ++ tone cc d cs i_ae ++ endWith cs -- ä-a -> ä + [CoL] : cc : [CaP] : d : cs -> prons cc ++ tone cc d cs "o" ++ endWith cs -- o-a -> o + [] :[CO]: v : d : cs -> tone[CO]d cs (prons v) ++ endWith cs -- Ov -> v + [] : cc : [Ca,Cw] : d : cs -> prons cc ++ tone cc d cs "ua" ++ endWith cs -- Caw -> Cua + [] : bb : [] : cc : []: d : cs -> prons bb ++ "a" ++ prons cc ++ tone cc [] cs "o" ++ endWith cs -- CaCoC + [] : bb : [] : cc : [] -> prons bb ++ "o" ++ endWith [cc] -- CoC + [] : cc : v : d : cs -> prons cc ++ tone cc d cs (prons v) ++ endWith cs -- Cv- (normal) _ -> prons is --- shouldn't happen where prons cc = case cc of - c:0xe4c:cs -> prons cs - 0xe2b:c:cs | isConsonant c -> concatMap pron (c:cs) ---- only some conss + c :CK:cs -> prons cs -- killer + Ch:c:cs | isConsonant c -> concatMap pron (c:cs) -- hC, ---- only some conss _ -> concatMap pron cc endWith ss = case ss of - (c:cs) -> encs c ++ uni2pron (concat cs) + (c:cs) -> encs c ++ dat2pron (concat cs) _ -> [] encs cs = case cs of [] -> [] _ -> prons (init cs) ++ enc (last cs) - enc c = lookThai [] pronunc_end c - pron c = lookThai [] pronunc c - kill d = elem 0xe47 d + enc c = lookThai [] enp c + pron c = lookThai [] pro c + brev d = elem CS d -- shortener -getSyllable :: [Int] -> [[Int]] -- (V?),(C|CC|hC),(V*),(D*),(C*),[[],C]? +getSyllable :: [ThaiDat] -> [[ThaiDat]] -- (V?),(C|CC|hC),(V*),(D*),(C*),[[],C]? getSyllable s = case s of v:cs | preVowel v -> [v]:getCons v cs [] -> [] c:_ -> []:getCons c s where getCons v s = case s of - b:c:cs | b == 0xe2b && isConsonant c -> [b,c]:getVow v cs -- hC - b:cs | b == 0xe2d -> [b] :getVow v cs -- O - b:c:cs | isConsonant b && (elem c [0xe23, 0xe25, 0xe27]) -> [b,c]:getVow v cs -- C(l|r|w) cluster - b:c:d:[] | isConsonant b && isConsonant c && isConsonant d -> [b] :[]:[c]:[]:[d]:[] -- CaCoC - b:c:[] | isConsonant b && isConsonant c -> [b] :[]:[c]:[] -- CoC - b:cs | isConsonant b -> [b] :getVow v cs -- C + b:c:cs | b == Ch && isConsonant c -> [b,c]:getVow v cs -- hC + b:cs | b == CO -> [b] :getVow v cs -- O + b:c:cs | isCluster b c -> [b,c]:getVow v cs -- C(l|r|w) cluster + b:cs | preVowel v -> [b] :getVow v cs + b:c:d:[] | all isConsonant [b,c,d] -> [b] :[]:[c]:[]:[d]:[] -- CaCoC + b:c:[] | all isConsonant [b,c] -> [b] :[]:[c]:[] -- CoC + b:cs | isConsonant b -> [b] :getVow v cs -- C _ -> [s] --- shouldn't happen ?? getVow v0 s = case span (\x -> inVow v0 x || diacritic x) s of (v,c:cs) -> let (d,w) = partition diacritic v in w:d:[c]:getSyllable cs - (v,_) -> let (d,w) = partition diacritic v in [w,d] + (v,_) -> let (d,w) = partition diacritic v in [w,d] inVow v0 x = inVowel x || case v0 of - 0xe40 -> elem x [0xe22] -- after e-, also y is a part of a vowel - _ -> False + Ce -> elem x [Cy] -- after e-, also y is a part of a vowel + _ -> False -inVowel v = (0xe30 <= v && v <= 0xe39) || v == 0xe2d -- infix vowels -preVowel v = 0xe40 <= v && v <= 0xe44 -- prefix vowels -diacritic x = 0xe47 <= x && x <= 0xe4d -- tones, killers +isCluster b c = + (pronTha c == "r" && elem (pronTha b) ["k","kh","p","ph","t"]) + || (pronTha c == "l" && elem (pronTha b) ["k","kh","p","ph"]) + || (pronTha c == "w" && elem (pronTha b) ["k","kh"]) -tone :: [Int] -> [Int] -> [[Int]] -> String -> String -tone cc@(c:_) d cs v = case (lookThai Low cclass c, isLive cs1, toneMark d) of +pronTha c = lookThai [] pro c + +inVowel v = (CaP <= v && v <= CuL) || v == CO -- infix vowels +preVowel v = Ce <= v && v <= CaE -- prefix vowels +diacritic x = CS <= x && x <= CK -- tones, killers + +tone :: [ThaiDat] -> [ThaiDat] -> [[ThaiDat]] -> String -> String +tone cc@(c:_) d cs v = case (lookThai Low ccl c, isLive cs1, toneMark d) of (_,_,3) -> high v (_,_,4) -> rising v (Low,_,1) -> falling v @@ -153,29 +167,29 @@ tone cc@(c:_) d cs v = case (lookThai Low cclass c, isLive cs1, toneMark d) of where cs1 = concat (take 1 cs) -toneMark :: [Int] -> Int +toneMark :: [ThaiDat] -> Int toneMark is = case is of - 0xe48:is -> 1 - 0xe49:is -> 2 - 0xe4a:is -> 3 - 0xe4b:is -> 4 - _:is -> toneMark is - _ -> 0 -- no tone mark in is + CT1:is -> 1 + CT2:is -> 2 + CT3:is -> 3 + CT4:is -> 4 + _ :is -> toneMark is + _ -> 0 -- no tone mark in is isLong :: String -> Bool isLong s = case s of - 'i':'a':_ -> True - 'u':'a':_ -> True - '\x289':a:_ -> True + 'i':'a':_ -> True + 'u':'a':_ -> True + '\x289':a:_ -> True -- uea c:d:_ | c == d -> True --- must be vowels _:cs -> isLong cs _ -> False -isLive :: [Int] -> Bool +isLive :: [ThaiDat] -> Bool isLive is = case is of - [i] -> lookThai False liveness i - [] -> True - _ -> False + [i] -> lookThai False liv i + [] -> True + _ -> False mid, high, low, falling, rising :: String -> String mid s = s @@ -188,15 +202,12 @@ accent a s = case s of c:cs -> c:a:cs _ -> s -lookThai :: a -> (ThaiChar -> a) -> Int -> a -lookThai v f i = maybe v f (Map.lookup i thaiMap) - trans2uni :: String -> [Int] trans2uni = map (\c -> maybe 0 id $ Map.lookup c trans) . unchar where - trans = Map.fromList [(translit c, unicode c) | c <- allThaiChars] + trans = Map.fromList [(tra c, uni c) | c <- allThaiChars] unchar :: String -> [String] unchar s = case s of @@ -209,102 +220,132 @@ unchar s = case s of _ -> [] -thaiMap :: Map.Map Int ThaiChar -thaiMap = Map.fromList [(unicode c,c) | c <- allThaiChars] +thaiMap :: Map.Map ThaiDat ThaiChar +thaiMap = Map.fromList [(dat c,c) | c <- allThaiChars] + +uniMap :: Map.Map Int ThaiChar +uniMap = Map.fromList [(uni c,c) | c <- allThaiChars] + +lookThai :: a -> (ThaiChar -> a) -> ThaiDat -> a +lookThai v f i = maybe v f (Map.lookup i thaiMap) data ThaiChar = TC { - unicode :: Int, - translit :: String, - cclass :: CClass, - liveness :: Bool, - pronunc :: String, - pronunc_end :: String + dat :: ThaiDat, + uni :: Int, + tra :: String, + ccl :: CClass, + liv :: Bool, + pro :: String, + enp :: String } deriving Show data CClass = Low | Mid | High deriving (Show, Eq) +-- IPA sounds +i_ng = "\331" +i_O = "\596" +i_ue = "\x289" +i_ae = "\x25b" +i_oe = "\601" +i_ooe = i_oe ++ i_oe +i_uue = i_ue ++ i_ue +i_aae = i_ae ++ i_ae +i_OO = i_O ++ i_O + allThaiChars :: [ThaiChar] allThaiChars = [ - TC {unicode = 3585, translit = "k", cclass = Mid, liveness = False, pronunc = "k", pronunc_end = "k"}, - TC {unicode = 3586, translit = "k1", cclass = High, liveness = False, pronunc = "kh", pronunc_end = "k"}, - TC {unicode = 3588, translit = "k2", cclass = Low, liveness = False, pronunc = "kh", pronunc_end = "k"}, - TC {unicode = 3590, translit = "k3", cclass = Low, liveness = False, pronunc = "kh", pronunc_end = "k"}, - TC {unicode = 3591, translit = "g", cclass = Low, liveness = True, pronunc = "\331", pronunc_end = "\331"}, - TC {unicode = 3592, translit = "c", cclass = Mid, liveness = False, pronunc = "c", pronunc_end = "t"}, - TC {unicode = 3593, translit = "c1", cclass = High, liveness = False, pronunc = "ch", pronunc_end = "t"}, - TC {unicode = 3594, translit = "c2", cclass = Low, liveness = False, pronunc = "ch", pronunc_end = "t"}, - TC {unicode = 3595, translit = "s'", cclass = Low, liveness = False, pronunc = "s", pronunc_end = "t"}, - TC {unicode = 3596, translit = "c3", cclass = Low, liveness = False, pronunc = "ch", pronunc_end = "t"}, - TC {unicode = 3597, translit = "y'", cclass = Low, liveness = False, pronunc = "y", pronunc_end = "n"}, - TC {unicode = 3598, translit = "d'", cclass = Mid, liveness = False, pronunc = "d", pronunc_end = "t"}, - TC {unicode = 3599, translit = "t'", cclass = Mid, liveness = False, pronunc = "t", pronunc_end = "t"}, - TC {unicode = 3600, translit = "t1", cclass = High, liveness = False, pronunc = "th", pronunc_end = "t"}, - TC {unicode = 3601, translit = "t2", cclass = Low, liveness = False, pronunc = "th", pronunc_end = "t"}, - TC {unicode = 3602, translit = "t3", cclass = Low, liveness = False, pronunc = "th", pronunc_end = "t"}, - TC {unicode = 3603, translit = "n'", cclass = Low, liveness = True, pronunc = "n", pronunc_end = "n"}, - TC {unicode = 3604, translit = "d", cclass = Mid, liveness = False, pronunc = "d", pronunc_end = "t"}, - TC {unicode = 3605, translit = "t", cclass = Mid, liveness = False, pronunc = "t", pronunc_end = "t"}, - TC {unicode = 3606, translit = "t4", cclass = High, liveness = False, pronunc = "th", pronunc_end = "t"}, - TC {unicode = 3607, translit = "t5", cclass = Low, liveness = False, pronunc = "th", pronunc_end = "t"}, - TC {unicode = 3608, translit = "t6", cclass = Low, liveness = False, pronunc = "th", pronunc_end = "t"}, - TC {unicode = 3609, translit = "n", cclass = Low, liveness = True, pronunc = "n", pronunc_end = "n"}, - TC {unicode = 3610, translit = "b", cclass = Mid, liveness = False, pronunc = "b", pronunc_end = "p"}, - TC {unicode = 3611, translit = "p", cclass = Mid, liveness = False, pronunc = "p", pronunc_end = "p"}, - TC {unicode = 3612, translit = "p1", cclass = High, liveness = False, pronunc = "ph", pronunc_end = "p"}, - TC {unicode = 3613, translit = "f", cclass = High, liveness = False, pronunc = "f", pronunc_end = "p"}, - TC {unicode = 3614, translit = "p2", cclass = Low, liveness = False, pronunc = "ph", pronunc_end = "p"}, - TC {unicode = 3615, translit = "f'", cclass = Low, liveness = False, pronunc = "f", pronunc_end = "p"}, - TC {unicode = 3616, translit = "p3", cclass = Low, liveness = False, pronunc = "ph", pronunc_end = "p"}, - TC {unicode = 3617, translit = "m", cclass = Low, liveness = True, pronunc = "m", pronunc_end = "m"}, - TC {unicode = 3618, translit = "y", cclass = Low, liveness = True, pronunc = "y", pronunc_end = "y"}, - TC {unicode = 3619, translit = "r", cclass = Low, liveness = True, pronunc = "r", pronunc_end = "n"}, - TC {unicode = 3621, translit = "l", cclass = Low, liveness = True, pronunc = "l", pronunc_end = "n"}, - TC {unicode = 3623, translit = "w", cclass = Low, liveness = True, pronunc = "w", pronunc_end = "w"}, - TC {unicode = 3624, translit = "s-", cclass = High, liveness = False, pronunc = "s", pronunc_end = "t"}, - TC {unicode = 3625, translit = "s.", cclass = High, liveness = False, pronunc = "s", pronunc_end = "t"}, - TC {unicode = 3626, translit = "s", cclass = High, liveness = False, pronunc = "s", pronunc_end = "t"}, - TC {unicode = 3627, translit = "h", cclass = High, liveness = True, pronunc = "h", pronunc_end = ""}, - TC {unicode = 3628, translit = "l'", cclass = Low, liveness = True, pronunc = "l", pronunc_end = "n"}, - TC {unicode = 3629, translit = "O", cclass = Mid, liveness = True, pronunc = "\596\596", pronunc_end = "\596\596"}, - TC {unicode = 3630, translit = "h'", cclass = Low, liveness = True, pronunc = "h", pronunc_end = ""}, + TC {dat = Ck, uni = 3585, tra = "k", ccl = Mid, liv = False, pro = "k", enp = "k"}, + TC {dat = Ck1, uni = 3586, tra = "k1", ccl = High, liv = False, pro = "kh", enp = "k"}, + TC {dat = Ck2, uni = 3588, tra = "k2", ccl = Low, liv = False, pro = "kh", enp = "k"}, + TC {dat = Ck3, uni = 3590, tra = "k3", ccl = Low, liv = False, pro = "kh", enp = "k"}, + TC {dat = Cg, uni = 3591, tra = "g", ccl = Low, liv = True, pro = i_ng, enp = i_ng}, + TC {dat = Cc, uni = 3592, tra = "c", ccl = Mid, liv = False, pro = "c", enp = "t"}, + TC {dat = Cc1, uni = 3593, tra = "c1", ccl = High, liv = False, pro = "ch", enp = "t"}, + TC {dat = Cc2, uni = 3594, tra = "c2", ccl = Low, liv = False, pro = "ch", enp = "t"}, + TC {dat = Cs', uni = 3595, tra = "s'", ccl = Low, liv = False, pro = "s", enp = "t"}, + TC {dat = Cc3, uni = 3596, tra = "c3", ccl = Low, liv = False, pro = "ch", enp = "t"}, + TC {dat = Cy', uni = 3597, tra = "y'", ccl = Low, liv = False, pro = "y", enp = "n"}, + TC {dat = Cd', uni = 3598, tra = "d'", ccl = Mid, liv = False, pro = "d", enp = "t"}, + TC {dat = Ct', uni = 3599, tra = "t'", ccl = Mid, liv = False, pro = "t", enp = "t"}, + TC {dat = Ct1, uni = 3600, tra = "t1", ccl = High, liv = False, pro = "th", enp = "t"}, + TC {dat = Ct2, uni = 3601, tra = "t2", ccl = Low, liv = False, pro = "th", enp = "t"}, + TC {dat = Ct3, uni = 3602, tra = "t3", ccl = Low, liv = False, pro = "th", enp = "t"}, + TC {dat = Cn', uni = 3603, tra = "n'", ccl = Low, liv = True, pro = "n", enp = "n"}, + TC {dat = Cd, uni = 3604, tra = "d", ccl = Mid, liv = False, pro = "d", enp = "t"}, + TC {dat = Ct, uni = 3605, tra = "t", ccl = Mid, liv = False, pro = "t", enp = "t"}, + TC {dat = Ct4, uni = 3606, tra = "t4", ccl = High, liv = False, pro = "th", enp = "t"}, + TC {dat = Ct5, uni = 3607, tra = "t5", ccl = Low, liv = False, pro = "th", enp = "t"}, + TC {dat = Ct6, uni = 3608, tra = "t6", ccl = Low, liv = False, pro = "th", enp = "t"}, + TC {dat = Cn, uni = 3609, tra = "n", ccl = Low, liv = True, pro = "n", enp = "n"}, + TC {dat = Cb, uni = 3610, tra = "b", ccl = Mid, liv = False, pro = "b", enp = "p"}, + TC {dat = Cp, uni = 3611, tra = "p", ccl = Mid, liv = False, pro = "p", enp = "p"}, + TC {dat = Cp1, uni = 3612, tra = "p1", ccl = High, liv = False, pro = "ph", enp = "p"}, + TC {dat = Cf, uni = 3613, tra = "f", ccl = High, liv = False, pro = "f", enp = "p"}, + TC {dat = Cp2, uni = 3614, tra = "p2", ccl = Low, liv = False, pro = "ph", enp = "p"}, + TC {dat = Cf', uni = 3615, tra = "f'", ccl = Low, liv = False, pro = "f", enp = "p"}, + TC {dat = Cp3, uni = 3616, tra = "p3", ccl = Low, liv = False, pro = "ph", enp = "p"}, + TC {dat = Cm, uni = 3617, tra = "m", ccl = Low, liv = True, pro = "m", enp = "m"}, + TC {dat = Cy, uni = 3618, tra = "y", ccl = Low, liv = True, pro = "y", enp = "y"}, + TC {dat = Cr, uni = 3619, tra = "r", ccl = Low, liv = True, pro = "r", enp = "n"}, + TC {dat = Cl, uni = 3621, tra = "l", ccl = Low, liv = True, pro = "l", enp = "n"}, + TC {dat = Cw, uni = 3623, tra = "w", ccl = Low, liv = True, pro = "w", enp = "w"}, + TC {dat = CsM, uni = 3624, tra = "s-", ccl = High, liv = False, pro = "s", enp = "t"}, + TC {dat = CsP, uni = 3625, tra = "s.", ccl = High, liv = False, pro = "s", enp = "t"}, + TC {dat = Cs, uni = 3626, tra = "s", ccl = High, liv = False, pro = "s", enp = "t"}, + TC {dat = Ch, uni = 3627, tra = "h", ccl = High, liv = True, pro = "h", enp = ""}, + TC {dat = Cl, uni = 3628, tra = "l'", ccl = Low, liv = True, pro = "l", enp = "n"}, + TC {dat = CO, uni = 3629, tra = "O", ccl = Mid, liv = True, pro = i_OO, enp = i_OO}, + TC {dat = Ch', uni = 3630, tra = "h'", ccl = Low, liv = True, pro = "h", enp = ""}, - TC {unicode = 3632, translit = "a.", cclass = Low, liveness = False, pronunc = "a", pronunc_end = "a"}, - TC {unicode = 3633, translit = "a", cclass = Low, liveness = False, pronunc = "a", pronunc_end = "a"}, - TC {unicode = 3634, translit = "a:", cclass = Low, liveness = True, pronunc = "aa", pronunc_end = "aa"}, - TC {unicode = 3635, translit = "a+", cclass = Low, liveness = True, pronunc = "am", pronunc_end = "am"}, - TC {unicode = 3636, translit = "i", cclass = Low, liveness = False, pronunc = "i", pronunc_end = "i"}, - TC {unicode = 3637, translit = "i:", cclass = Low, liveness = True, pronunc = "ii", pronunc_end = "ii"}, - TC {unicode = 3638, translit = "v", cclass = Low, liveness = False, pronunc = "\x289", pronunc_end = "\x289"}, - TC {unicode = 3639, translit = "v:", cclass = Low, liveness = True, pronunc = "\x289\x289", pronunc_end = "\x289\x289"}, - TC {unicode = 3640, translit = "u", cclass = Low, liveness = False, pronunc = "u", pronunc_end = "u"}, - TC {unicode = 3641, translit = "u:", cclass = Low, liveness = True, pronunc = "uu", pronunc_end = "uu"}, - TC {unicode = 3648, translit = "e", cclass = Low, liveness = True, pronunc = "ee", pronunc_end = "ee"}, - TC {unicode = 3649, translit = "e'", cclass = Low, liveness = True, pronunc = "\x25b\x25b", pronunc_end = "0x25b\x25b"}, - TC {unicode = 3650, translit = "o:", cclass = Low, liveness = True, pronunc = "oo", pronunc_end = "oo"}, - TC {unicode = 3651, translit = "a%", cclass = Low, liveness = True, pronunc = "ay", pronunc_end = "ay"}, - TC {unicode = 3652, translit = "a&", cclass = Low, liveness = True, pronunc = "ay", pronunc_end = "ay"}, - TC {unicode = 3653, translit = "L", cclass = Low, liveness = True, pronunc = "l", pronunc_end = "n"}, - TC {unicode = 3654, translit = "R", cclass = Low, liveness = True, pronunc = "r", pronunc_end = "n"}, - TC {unicode = 3655, translit = "S", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""}, - TC {unicode = 3656, translit = "T1", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""}, - TC {unicode = 3657, translit = "T2", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""}, - TC {unicode = 3658, translit = "T3", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""}, - TC {unicode = 3659, translit = "T4", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""}, - TC {unicode = 3660, translit = "K", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""}, - TC {unicode = 3664, translit = "N0", cclass = Low, liveness = False, pronunc = "0", pronunc_end = "0"}, - TC {unicode = 3665, translit = "N1", cclass = Low, liveness = False, pronunc = "1", pronunc_end = "1"}, - TC {unicode = 3666, translit = "N2", cclass = Low, liveness = False, pronunc = "2", pronunc_end = "2"}, - TC {unicode = 3667, translit = "N3", cclass = Low, liveness = False, pronunc = "3", pronunc_end = "3"}, - TC {unicode = 3668, translit = "N4", cclass = Low, liveness = False, pronunc = "4", pronunc_end = "4"}, - TC {unicode = 3669, translit = "N5", cclass = Low, liveness = False, pronunc = "5", pronunc_end = "5"}, - TC {unicode = 3670, translit = "N6", cclass = Low, liveness = False, pronunc = "6", pronunc_end = "6"}, - TC {unicode = 3671, translit = "N7", cclass = Low, liveness = False, pronunc = "7", pronunc_end = "7"}, - TC {unicode = 3672, translit = "N8", cclass = Low, liveness = False, pronunc = "8", pronunc_end = "8"}, - TC {unicode = 3673, translit = "N9", cclass = Low, liveness = False, pronunc = "9", pronunc_end = "9"} + TC {dat = CaP, uni = 3632, tra = "a.", ccl = Low, liv = False, pro = "a", enp = "a"}, + TC {dat = Ca, uni = 3633, tra = "a", ccl = Low, liv = False, pro = "a", enp = "a"}, + TC {dat = CaL, uni = 3634, tra = "a:", ccl = Low, liv = True, pro = "aa", enp = "aa"}, + TC {dat = Cam, uni = 3635, tra = "a+", ccl = Low, liv = True, pro = "am", enp = "am"}, + TC {dat = Ci, uni = 3636, tra = "i", ccl = Low, liv = False, pro = "i", enp = "i"}, + TC {dat = CiL, uni = 3637, tra = "i:", ccl = Low, liv = True, pro = "ii", enp = "ii"}, + TC {dat = Cv, uni = 3638, tra = "v", ccl = Low, liv = False, pro = i_ue, enp = i_ue}, + TC {dat = CvL, uni = 3639, tra = "v:", ccl = Low, liv = True, pro = i_uue, enp = i_uue}, + TC {dat = Cu, uni = 3640, tra = "u", ccl = Low, liv = False, pro = "u", enp = "u"}, + TC {dat = CuL, uni = 3641, tra = "u:", ccl = Low, liv = True, pro = "uu", enp = "uu"}, + TC {dat = Ce, uni = 3648, tra = "e", ccl = Low, liv = True, pro = "ee", enp = "ee"}, + TC {dat = Ce', uni = 3649, tra = "e'", ccl = Low, liv = True, pro = i_aae, enp = i_aae}, + TC {dat = CoL, uni = 3650, tra = "o:", ccl = Low, liv = True, pro = "oo", enp = "oo"}, + TC {dat = CaH, uni = 3651, tra = "a%", ccl = Low, liv = True, pro = "ay", enp = "ay"}, + TC {dat = CaE, uni = 3652, tra = "a&", ccl = Low, liv = True, pro = "ay", enp = "ay"}, + TC {dat = CL, uni = 3653, tra = "L", ccl = Low, liv = True, pro = "l", enp = "n"}, + TC {dat = CR, uni = 3654, tra = "R", ccl = Low, liv = True, pro = "r", enp = "n"}, + TC {dat = CS, uni = 3655, tra = "S", ccl = Low, liv = True, pro = "", enp = ""}, + TC {dat = CT1, uni = 3656, tra = "T1", ccl = Low, liv = True, pro = "", enp = ""}, + TC {dat = CT2, uni = 3657, tra = "T2", ccl = Low, liv = True, pro = "", enp = ""}, + TC {dat = CT3, uni = 3658, tra = "T3", ccl = Low, liv = True, pro = "", enp = ""}, + TC {dat = CT4, uni = 3659, tra = "T4", ccl = Low, liv = True, pro = "", enp = ""}, + TC {dat = CK, uni = 3660, tra = "K", ccl = Low, liv = True, pro = "", enp = ""}, + TC {dat = CN0, uni = 3664, tra = "N0", ccl = Low, liv = False, pro = "0", enp = "0"}, + TC {dat = CN1, uni = 3665, tra = "N1", ccl = Low, liv = False, pro = "1", enp = "1"}, + TC {dat = CN2, uni = 3666, tra = "N2", ccl = Low, liv = False, pro = "2", enp = "2"}, + TC {dat = CN3, uni = 3667, tra = "N3", ccl = Low, liv = False, pro = "3", enp = "3"}, + TC {dat = CN4, uni = 3668, tra = "N4", ccl = Low, liv = False, pro = "4", enp = "4"}, + TC {dat = CN5, uni = 3669, tra = "N5", ccl = Low, liv = False, pro = "5", enp = "5"}, + TC {dat = CN6, uni = 3670, tra = "N6", ccl = Low, liv = False, pro = "6", enp = "6"}, + TC {dat = CN7, uni = 3671, tra = "N7", ccl = Low, liv = False, pro = "7", enp = "7"}, + TC {dat = CN8, uni = 3672, tra = "N8", ccl = Low, liv = False, pro = "8", enp = "8"}, + TC {dat = CN9, uni = 3673, tra = "N9", ccl = Low, liv = False, pro = "9", enp = "9"} ] +data ThaiDat = + Ck | Ck1 | Ck2 | Ck3 | Cg | Cc | Cc1 | Cc2 | Cs' | + Cc3 | Cy' | Cd' | Ct' | Ct1 | Ct2 | Ct3 | Cn' | Cd | + Ct | Ct4 | Ct5 | Ct6 | Cn | Cb | Cp | Cp1 | Cf | + Cp2 | Cf' | Cp3 | Cm | Cy | Cr | Cl | Cw | CsM | + CsP | Cs | Ch | Cl' | CO | Ch' | CaP | Ca | CaL | + Cam | Ci | CiL | Cv | CvL | Cu | CuL | Ce | Ce' | + CoL | CaH | CaE | CL | CR | CS | CT1 | CT2 | CT3 | + CT4 | CK | CN0 | CN1 | CN2 | CN3 | CN4 | CN5 | CN6 | + CN7 | CN8 | CN9 + deriving (Eq,Ord,Show) + -- testing with Wikipedia Swadesh list testFile = "src/test.txt" @@ -337,37 +378,3 @@ tabs s = case break (=='\t') s of _ -> [s] -{- -uni2pron :: [Int] -> String -uni2pron is = case is of - 0xe40:c:0xe34 :cs -> pron c ++ tone c cs "\601\601" ++ uni2pron cs - 0xe40:c:0xe35:0xe22:cs -> pron c ++ tone c cs "iia" ++ uni2pron cs - 0xe40:c:0xe37:0xe2d:cs -> pron c ++ tone c cs "\649\649" ++ uni2pron cs - 0xe40:c:0xe47 :cs -> pron c ++ tone c cs "e" ++ uni2pron cs - 0xe41:c:0xe47 :cs -> pron c ++ tone c cs "\x25b" ++ uni2pron cs - - v:0xe2b:c:cs | bvow v && isConsonant c - -> pron c ++ tone 0xe2b cs (pron v) ++ uni2pron cs -- h- - v:b:c:cs | clust b c && bvow v -- kr- etc - -> pron b ++ pron c ++ tone b (c:cs) (pron v) ++ uni2pron cs - v:c:cs | bvow v -> pron c ++ tone c cs (pron v) ++ uni2pron cs -- e .. ay - - c:0xe31:0xe27:cs -> pron c ++ tone c cs "ua" ++ uni2pron cs - - 0xe2b:c:v:cs | isConsonant c && cvow v - -> pron c ++ tone 0xe2b cs (pron v) ++ uni2pron cs -- h- - b:c:v:cs | clust b c && cvow v -- kr- etc - -> pron b ++ pron c ++ tone b (c:cs) (pron v) ++ uni2pron cs - 0xe2d:v:cs | cvow v -> tone 0xe2d cs (pron v) ++ uni2pron cs -- O- - c:v:cs | cvow v -> pron c ++ tone c cs (pron v) ++ uni2pron cs -- a .. u: - - [c] -> enc c - c:cs -> pron c ++ uni2pron cs --- shouldn't happen if syllabified ?? - [] -> [] - where - enc c = lookThai [] pronunc_end c - pron c = lookThai [] pronunc c - cvow v = (0xe30 <= v && v <= 0xe39) || v == 0xe2d -- central vowels - bvow v = 0xe40 <= v && v <= 0xe44 -- begin vowels - clust b c = isConsonant b && (elem c [0xe23, 0xe25, 0xe27]) --}