forked from GitHub/gf-core
421 lines
18 KiB
Haskell
421 lines
18 KiB
Haskell
module Main where
|
|
|
|
import Data.Char
|
|
import Data.List
|
|
import qualified Data.Map as Map
|
|
import System
|
|
|
|
-- convert all files *Tha.gf into *ThP.gf with "t" changed to (thpron "t" "p")
|
|
|
|
main = allThpron
|
|
|
|
allThpron = do
|
|
System.system "ls *Tha*.gf ../api/*Tha*.gf >srcThai.txt"
|
|
files <- readFile "srcThai.txt" >>= return . lines
|
|
mapM_ fileThpron files
|
|
return ()
|
|
|
|
fileThpron file = do
|
|
s <- readFile file
|
|
let tgt = appThpron file
|
|
writeFile tgt (appThpron s)
|
|
putStrLn ("wrote " ++ tgt)
|
|
|
|
-- thb = IPA Thai; not used in ISO-639-3 like Thp
|
|
appThpron s = case s of
|
|
'"':cs -> let (w,_:rest) = break (=='"') cs in mkThpron w ++ appThpron rest
|
|
'T':'h':'a':'A':rest -> "Thb" ++ appThpron rest -- AllThaAbs
|
|
'T':'h':'a':c:rest | isAlpha c -> "Tha" ++ c : appThpron rest -- Thank
|
|
'T':'h':'a':rest -> "Thb" ++ appThpron rest
|
|
c:cs -> c:appThpron cs
|
|
_ -> s
|
|
|
|
mkThpron s = "(thpron \"" ++ s ++ "\" \"" ++ thai2pron s ++ "\")"
|
|
|
|
|
|
-- the following functions involving pron (=pronunciation) work on syllables
|
|
|
|
thai2pron = uni2pron . thai2uni
|
|
trans2pron = uni2pron . trans2uni
|
|
trans2thai = uni2thai . trans2uni
|
|
trans2dat = uni2dat . trans2uni
|
|
|
|
thai2uni :: String -> [Int]
|
|
thai2uni = map fromEnum
|
|
|
|
uni2thai :: [Int] -> String
|
|
uni2thai = map toEnum
|
|
|
|
uni2pron :: [Int] -> String
|
|
uni2pron = dat2pron . uni2dat
|
|
|
|
uni2dat = map (maybe CR dat . (\u -> Map.lookup u uniMap)) --- CR as exception value
|
|
|
|
--high = accent '\x301'
|
|
--low = accent '\x300'
|
|
--rising = accent '\x306'
|
|
--falling = accent '\x302'
|
|
|
|
dat2pron :: [ThaiDat] -> String
|
|
dat2pron is = case is of
|
|
|
|
-- exceptional words
|
|
[Ce,Ck1,CaL] -> "kha\x301w"
|
|
[Cc1,Ca,Cn] -> "cha\x301n"
|
|
[CaE,Ch,Cm] -> "m" ++ high "ay"
|
|
[Ct,CT2,CO,Cg] -> "t" ++ falling i_O ++ i_ng
|
|
[Ce,Cg,Ci,Cn] -> i_ng ++ i_oe ++ "n"
|
|
[CaE,Cd,CT2] -> "d" ++ falling "aay"
|
|
[Ce,Ck,CT2,CaL] -> "k" ++ falling "aaw"
|
|
[CaE,Cm,CT2] -> "m" ++ high "aay"
|
|
[CO,Cy,CaL,Ck] -> "y" ++ low "aak"
|
|
[CO,Cy,CT1,CaL] -> "y" ++ low "aa"
|
|
[CO,Cy,CT1,CaL,Cg] -> "y" ++ low "aa" ++ i_ng
|
|
[CO,Cy,CT1,CuL] -> "y" ++ low "uu"
|
|
[Cp3,Cr] -> "phan" --- not in Smyth
|
|
[Cp2,Cw,Ck] -> "phw" ++ falling "ak" -- not in Smyth
|
|
[Cc2,CvL,CT1,CO] -> "ch" ++ falling i_uue --- to get rid of final O
|
|
[CO,Cg,CT1,Cu,Cn] -> low "a" ++ i_ng ++ low "un" --- probably there is a rule for leading vowelless O
|
|
|
|
-- words following the rules (mostly from Smyth's Essential Grammar)
|
|
_ -> case getSyllable is of
|
|
[Ce] : cc : [] : d : cs | brev d -> prons cc ++ tone cc d cs "e" ++ endWith cs -- e-8 -> e
|
|
[Ce'] : cc : [] : d : cs | brev d -> prons cc ++ tone cc d cs i_ae ++ endWith cs -- ä-8 -> ä
|
|
[v] :[CO]: [] : d : cs -> tone[CO]d cs (pron v) ++ endWith cs -- e/ä/o/ay/ay
|
|
[v] : cc : [] : d : cs -> prons cc ++ tone cc d cs (pron v) ++ endWith cs -- e/ä/o/ay/ay
|
|
[Ce] : cc : [Cy] : d : cs -> prons cc ++ tone cc d cs (i_ooe++"y")++ endWith cs -- e-y -> ööy
|
|
[Ce] : cc : [CO] : d : cs -> prons cc ++ tone cc d cs i_ooe ++ endWith cs -- e-O -> öö
|
|
[Ce] : cc : [CO,CaP] : d : cs -> prons cc ++ tone cc d cs i_oe ++ endWith cs -- e-Oa -> ö
|
|
[Ce] : cc : [CaP] : d : cs -> prons cc ++ tone cc d cs "e" ++ endWith cs -- e-a
|
|
[Ce] : cc : [CaL] : d : cs -> prons cc ++ tone cc d cs "aw" ++ endWith cs -- e-a
|
|
[Ce] : cc : [CaL,CaP] : d : cs -> prons cc ++ tone cc d cs i_O ++ endWith cs -- e-Aa -> O
|
|
[Ce] : cc : [Ci] : d : cs -> prons cc ++ tone cc d cs i_ooe ++ endWith cs -- e-i -> öö
|
|
[Ce] : cc : [CiL,Cy] : d : cs -> prons cc ++ tone cc d cs "ia" ++ endWith cs -- e-iiy-> ia
|
|
[Ce] : cc : [Ci,Cy,CaP] : d : cs -> prons cc ++ tone cc d cs "ia" ++ endWith cs -- e-iya-> ia
|
|
[Ce] : cc : [CiL,CO] : d : cs -> prons cc ++ tone cc d cs "ia" ++ endWith cs -- e-iiO-> üa
|
|
[Ce] : cc : [CvL,CO] : d : cs -> prons cc ++ tone cc d cs (i_ue++"a") ++ endWith cs -- e-vvO-> üa
|
|
[Ce] : cc : [CvL,CO,Cy] : d : cs -> prons cc ++ tone cc d cs (i_ue++"ay")++ endWith cs -- e-vvOy-> üay
|
|
[Ce'] : cc : [CaP] : d : cs -> prons cc ++ tone cc d cs i_ae ++ endWith cs -- ä-a -> ä
|
|
[CoL] : cc : [CaP] : d : cs -> prons cc ++ tone cc d cs "o" ++ endWith cs -- o-a -> o
|
|
[] :[CO]: v : d : cs -> tone[CO]d cs (prons v) ++ endWith cs -- Ov -> v
|
|
[] : cc : [Ca,Cw] : d : cs -> prons cc ++ tone cc d cs "ua" ++ endWith cs -- Caw -> Cua
|
|
[] : cc : v :[CK]:cs -> endWith cs -- swas(di:K)
|
|
[] : bb : [] : cc : []: d : cs -> prons bb ++ "a" ++ prons cc ++ tone cc [] cs "o" ++ endWith (d:cs) -- CaCoC
|
|
[] : bb : [] : cc : [] -> prons bb ++ tone bb [] [cc] "o" ++ endWith [cc] -- CoC
|
|
[] : bb : [] : cc : v : d : cs -> prons bb ++ "a" ++ prons cc ++ tone cc [] cs (prons v) ++ endWith cs -- CaCvC
|
|
[] : cc : v : d : cs -> prons cc ++ tone cc d cs (prons v) ++ endWith cs -- Cv- (normal)
|
|
|
|
_ -> prons is --- shouldn't happen
|
|
|
|
where
|
|
prons cc = case cc of
|
|
_ :CK:cs -> prons cs -- killer
|
|
Ch:c:cs | isConsonant c -> concatMap pron (c:cs) -- hC, ---- only some conss
|
|
_ -> concatMap pron cc
|
|
endWith ss = case concat ss of
|
|
_:CK:cs -> dat2pron cs
|
|
c :cs -> enc c ++ dat2pron cs
|
|
_ -> []
|
|
|
|
enc c = lookThai [] enp c
|
|
pron c = lookThai [] pro c
|
|
brev d = elem CS d -- shortener
|
|
|
|
getSyllable :: [ThaiDat] -> [[ThaiDat]] -- (V?),(C|CC|hC),(V*),(D*),(C*),[[],C]?
|
|
getSyllable s = case s of
|
|
v:cs | isPreVowel v -> [v]:getCons v cs
|
|
[] -> []
|
|
c:_ -> []:getCons c s
|
|
where
|
|
getCons v s = case s of
|
|
Ch:c:cs | isConsonant c && isLow c -> let (cc:ccs) = getCons v (c:cs) in (Ch:cc):ccs -- hC
|
|
CO:cs -> [CO] :getVow v cs -- O
|
|
Cs:Cr:cs -> [Cs] :getVow v cs -- sr -> s
|
|
_:CK:[] -> []
|
|
b:Cr:Cr:[] | isConsonant b -> [b] :[Ca]:[]:[Cr]:[] -- Crr -> Can
|
|
b:Cr:Cr:[c] | all isConsonant [b,c] -> [b] :[Ca]:[]:[c]:[] -- CrrC -> CaC
|
|
b:c:cs | isCluster b c -> [b,c] :getVow v cs -- C(l|r|w) cluster
|
|
b:cs | isPreVowel v -> [b] :getVow v cs
|
|
b:c:Cw:Cy:[]| isConsonant b && isDiacritic c -> [b]:[Cu,Ca]:[c]:[Cy]:[] -- CTuay
|
|
b:Cw:Cy:[] | isConsonant b -> [b] :[Cu,Ca]:[]:[Cy]:[] -- Cuay
|
|
b:c:d:[] | all isConsonant [b,c,d] -> [b] :[]:[c]:[]:[d]:[] -- CaCoC
|
|
b:c:[] | all isConsonant [b,c] -> [b] :[]:[c]:[] -- CoC
|
|
b:c:cs | all isConsonant [b,c] -> [b] :[]:[c]:getVow c cs -- CaCvC
|
|
b:cs | isConsonant b -> [b] :getVow v cs -- C
|
|
_ -> [s] --- shouldn't happen ??
|
|
getVow v0 s = case span (\x -> inVow v0 x || isDiacritic x) s of
|
|
(v,c:cs) -> let (d,w) = partition isDiacritic v in w:d:[c]:getSyllable cs
|
|
(v,_) -> let (d,w) = partition isDiacritic v in [w,d]
|
|
inVow v0 x = isInVowel x || case v0 of
|
|
Ce -> elem x [Cy] -- after e-, also y is a part of a vowel
|
|
_ -> False --- elem x [Cw]
|
|
isLow c = lookThai Low ccl c == Low
|
|
|
|
isCluster b c =
|
|
(pronTha c == "r" && elem (pronTha b) ["k","kh","p","ph","t"])
|
|
|| (pronTha c == "ri" && elem (pronTha b) ["k","kh","p","ph","t"])
|
|
|| (pronTha c == "l" && elem (pronTha b) ["k","kh","p","ph"])
|
|
|| (pronTha c == "w" && elem (pronTha b) ["k","kh"])
|
|
|
|
pronTha c = lookThai [] pro c
|
|
|
|
isInVowel :: ThaiDat -> Bool
|
|
isInVowel v = (CaP <= v && v <= CuL) || v == CO -- infix vowels
|
|
|
|
isPreVowel :: ThaiDat -> Bool
|
|
isPreVowel i = Ce <= i && i <= CaE
|
|
|
|
isVowel :: ThaiDat -> Bool
|
|
isVowel i = CaP <= i && i <= CaE
|
|
|
|
isConsonant :: ThaiDat -> Bool
|
|
isConsonant i = Ck <= i && i <= Ch' && i /= CO
|
|
|
|
isDiacritic :: ThaiDat -> Bool
|
|
isDiacritic i = CS <= i && i <= CK
|
|
|
|
tone :: [ThaiDat] -> [ThaiDat] -> [[ThaiDat]] -> String -> String
|
|
tone cc@(c:_) d cs v = case (lookThai Low ccl c, isLive cs1, toneMark d) of
|
|
(_,_,3) -> high v
|
|
(_,_,4) -> rising v
|
|
(Low,_,1) -> falling v
|
|
(Low,_,2) -> high v
|
|
(Low,True,_) -> mid v
|
|
(Low,False,_) -> case isLong v of
|
|
False -> high v
|
|
True -> falling v
|
|
(_,_,1) -> low v
|
|
(_,_,2) -> falling v
|
|
(Mid,True,_) -> mid v
|
|
(Mid,False,_) -> low v
|
|
(High,True,_) -> rising v
|
|
(High,False,_) -> low v
|
|
where
|
|
cs1 = concat (take 1 cs)
|
|
tone _ _ _ v = mid v
|
|
|
|
toneMark :: [ThaiDat] -> Int
|
|
toneMark is = case is of
|
|
CT1:is -> 1
|
|
CT2:is -> 2
|
|
CT3:is -> 3
|
|
CT4:is -> 4
|
|
_ :is -> toneMark is
|
|
_ -> 0 -- no tone mark in is
|
|
|
|
isLong :: String -> Bool
|
|
isLong s = case s of
|
|
'i':'a':_ -> True
|
|
'u':'a':_ -> True
|
|
'\x289':a:_ -> True -- uea
|
|
c:d:_ | c == d -> True --- must be vowels
|
|
_:cs -> isLong cs
|
|
_ -> False
|
|
|
|
isLive :: [ThaiDat] -> Bool
|
|
isLive is = case is of
|
|
[i] -> lookThai False liv i
|
|
[] -> True
|
|
_ -> False
|
|
|
|
mid, high, low, falling, rising :: String -> String
|
|
mid s = s
|
|
high = accent '\x301'
|
|
low = accent '\x300'
|
|
rising = accent '\x306'
|
|
falling = accent '\x302'
|
|
|
|
accent a s = case s of
|
|
c:cs -> c:a:cs
|
|
_ -> s
|
|
|
|
trans2uni :: String -> [Int]
|
|
trans2uni =
|
|
map (\c -> maybe 0 id $ Map.lookup c trans) .
|
|
unchar
|
|
where
|
|
trans = Map.fromList [(tra c, uni c) | c <- allThaiChars]
|
|
|
|
unchar :: String -> [String]
|
|
unchar s = case s of
|
|
c:d:cs
|
|
| isAlpha d -> [c] : unchar (d:cs)
|
|
| isSpace d -> [c]:[d]: unchar cs
|
|
| otherwise -> let (ds,cs2) = break (\x -> isAlpha x || isSpace x) cs in
|
|
(c:d:ds) : unchar cs2
|
|
[_] -> [s]
|
|
_ -> []
|
|
|
|
|
|
thaiMap :: Map.Map ThaiDat ThaiChar
|
|
thaiMap = Map.fromList [(dat c,c) | c <- allThaiChars]
|
|
|
|
uniMap :: Map.Map Int ThaiChar
|
|
uniMap = Map.fromList [(uni c,c) | c <- allThaiChars]
|
|
|
|
lookThai :: a -> (ThaiChar -> a) -> ThaiDat -> a
|
|
lookThai v f i = maybe v f (Map.lookup i thaiMap)
|
|
|
|
data ThaiChar = TC {
|
|
dat :: ThaiDat,
|
|
uni :: Int,
|
|
tra :: String,
|
|
ccl :: CClass,
|
|
liv :: Bool,
|
|
pro :: String,
|
|
enp :: String
|
|
}
|
|
deriving Show
|
|
|
|
data CClass = Low | Mid | High
|
|
deriving (Show, Eq)
|
|
|
|
-- IPA sounds
|
|
i_ng = "\331"
|
|
i_O = "\596"
|
|
i_ue = "\x289"
|
|
i_ae = "\x25b"
|
|
i_oe = "\601"
|
|
i_ooe = i_oe ++ i_oe
|
|
i_uue = i_ue ++ i_ue
|
|
i_aae = i_ae ++ i_ae
|
|
i_OO = i_O ++ i_O
|
|
|
|
allThaiChars :: [ThaiChar]
|
|
allThaiChars = [
|
|
TC {dat = Ck, uni = 3585, tra = "k", ccl = Mid, liv = False, pro = "k", enp = "k"},
|
|
TC {dat = Ck1, uni = 3586, tra = "k1", ccl = High, liv = False, pro = "kh", enp = "k"},
|
|
TC {dat = Ck2, uni = 3588, tra = "k2", ccl = Low, liv = False, pro = "kh", enp = "k"},
|
|
TC {dat = Ck3, uni = 3590, tra = "k3", ccl = Low, liv = False, pro = "kh", enp = "k"},
|
|
TC {dat = Cg, uni = 3591, tra = "g", ccl = Low, liv = True, pro = i_ng, enp = i_ng},
|
|
TC {dat = Cc, uni = 3592, tra = "c", ccl = Mid, liv = False, pro = "c", enp = "t"},
|
|
TC {dat = Cc1, uni = 3593, tra = "c1", ccl = High, liv = False, pro = "ch", enp = "t"},
|
|
TC {dat = Cc2, uni = 3594, tra = "c2", ccl = Low, liv = False, pro = "ch", enp = "t"},
|
|
TC {dat = Cs', uni = 3595, tra = "s'", ccl = Low, liv = False, pro = "s", enp = "t"},
|
|
TC {dat = Cc3, uni = 3596, tra = "c3", ccl = Low, liv = False, pro = "ch", enp = "t"},
|
|
TC {dat = Cy', uni = 3597, tra = "y'", ccl = Low, liv = False, pro = "y", enp = "n"},
|
|
TC {dat = Cd', uni = 3598, tra = "d'", ccl = Mid, liv = False, pro = "d", enp = "t"},
|
|
TC {dat = Ct', uni = 3599, tra = "t'", ccl = Mid, liv = False, pro = "t", enp = "t"},
|
|
TC {dat = Ct1, uni = 3600, tra = "t1", ccl = High, liv = False, pro = "th", enp = "t"},
|
|
TC {dat = Ct2, uni = 3601, tra = "t2", ccl = Low, liv = False, pro = "th", enp = "t"},
|
|
TC {dat = Ct3, uni = 3602, tra = "t3", ccl = Low, liv = False, pro = "th", enp = "t"},
|
|
TC {dat = Cn', uni = 3603, tra = "n'", ccl = Low, liv = True, pro = "n", enp = "n"},
|
|
TC {dat = Cd, uni = 3604, tra = "d", ccl = Mid, liv = False, pro = "d", enp = "t"},
|
|
TC {dat = Ct, uni = 3605, tra = "t", ccl = Mid, liv = False, pro = "t", enp = "t"},
|
|
TC {dat = Ct4, uni = 3606, tra = "t4", ccl = High, liv = False, pro = "th", enp = "t"},
|
|
TC {dat = Ct5, uni = 3607, tra = "t5", ccl = Low, liv = False, pro = "th", enp = "t"},
|
|
TC {dat = Ct6, uni = 3608, tra = "t6", ccl = Low, liv = False, pro = "th", enp = "t"},
|
|
TC {dat = Cn, uni = 3609, tra = "n", ccl = Low, liv = True, pro = "n", enp = "n"},
|
|
TC {dat = Cb, uni = 3610, tra = "b", ccl = Mid, liv = False, pro = "b", enp = "p"},
|
|
TC {dat = Cp, uni = 3611, tra = "p", ccl = Mid, liv = False, pro = "p", enp = "p"},
|
|
TC {dat = Cp1, uni = 3612, tra = "p1", ccl = High, liv = False, pro = "ph", enp = "p"},
|
|
TC {dat = Cf, uni = 3613, tra = "f", ccl = High, liv = False, pro = "f", enp = "p"},
|
|
TC {dat = Cp2, uni = 3614, tra = "p2", ccl = Low, liv = False, pro = "ph", enp = "p"},
|
|
TC {dat = Cf', uni = 3615, tra = "f'", ccl = Low, liv = False, pro = "f", enp = "p"},
|
|
TC {dat = Cp3, uni = 3616, tra = "p3", ccl = Low, liv = False, pro = "ph", enp = "p"},
|
|
TC {dat = Cm, uni = 3617, tra = "m", ccl = Low, liv = True, pro = "m", enp = "m"},
|
|
TC {dat = Cy, uni = 3618, tra = "y", ccl = Low, liv = True, pro = "y", enp = "y"},
|
|
TC {dat = Cr, uni = 3619, tra = "r", ccl = Low, liv = True, pro = "r", enp = "n"},
|
|
TC {dat = Cl, uni = 3621, tra = "l", ccl = Low, liv = True, pro = "l", enp = "n"},
|
|
TC {dat = Cw, uni = 3623, tra = "w", ccl = Low, liv = True, pro = "w", enp = "w"},
|
|
TC {dat = CsM, uni = 3624, tra = "s-", ccl = High, liv = False, pro = "s", enp = "t"},
|
|
TC {dat = CsP, uni = 3625, tra = "s.", ccl = High, liv = False, pro = "s", enp = "t"},
|
|
TC {dat = Cs, uni = 3626, tra = "s", ccl = High, liv = False, pro = "s", enp = "t"},
|
|
TC {dat = Ch, uni = 3627, tra = "h", ccl = High, liv = True, pro = "h", enp = ""},
|
|
TC {dat = Cl, uni = 3628, tra = "l'", ccl = Low, liv = True, pro = "l", enp = "n"},
|
|
TC {dat = CO, uni = 3629, tra = "O", ccl = Mid, liv = True, pro = i_OO, enp = i_OO},
|
|
TC {dat = Ch', uni = 3630, tra = "h'", ccl = Low, liv = True, pro = "h", enp = ""},
|
|
|
|
TC {dat = CaP, uni = 3632, tra = "a.", ccl = Low, liv = False, pro = "a", enp = "a"},
|
|
TC {dat = Ca, uni = 3633, tra = "a", ccl = Low, liv = False, pro = "a", enp = "a"},
|
|
TC {dat = CaL, uni = 3634, tra = "a:", ccl = Low, liv = True, pro = "aa", enp = "aa"},
|
|
TC {dat = Cam, uni = 3635, tra = "a+", ccl = Low, liv = True, pro = "am", enp = "am"},
|
|
TC {dat = Ci, uni = 3636, tra = "i", ccl = Low, liv = False, pro = "i", enp = "i"},
|
|
TC {dat = CiL, uni = 3637, tra = "i:", ccl = Low, liv = True, pro = "ii", enp = "ii"},
|
|
TC {dat = Cv, uni = 3638, tra = "v", ccl = Low, liv = False, pro = i_ue, enp = i_ue},
|
|
TC {dat = CvL, uni = 3639, tra = "v:", ccl = Low, liv = True, pro = i_uue, enp = i_uue},
|
|
TC {dat = Cu, uni = 3640, tra = "u", ccl = Low, liv = False, pro = "u", enp = "u"},
|
|
TC {dat = CuL, uni = 3641, tra = "u:", ccl = Low, liv = True, pro = "uu", enp = "uu"},
|
|
TC {dat = Ce, uni = 3648, tra = "e", ccl = Low, liv = True, pro = "ee", enp = "ee"},
|
|
TC {dat = Ce', uni = 3649, tra = "e'", ccl = Low, liv = True, pro = i_aae, enp = i_aae},
|
|
TC {dat = CoL, uni = 3650, tra = "o:", ccl = Low, liv = True, pro = "oo", enp = "oo"},
|
|
TC {dat = CaH, uni = 3651, tra = "a%", ccl = Low, liv = True, pro = "ay", enp = "ay"},
|
|
TC {dat = CaE, uni = 3652, tra = "a&", ccl = Low, liv = True, pro = "ay", enp = "ay"},
|
|
TC {dat = CL, uni = 3653, tra = "L", ccl = Low, liv = True, pro = "li", enp = "n"},
|
|
TC {dat = CR, uni = 3654, tra = "R", ccl = Low, liv = True, pro = "ri", enp = "n"},
|
|
TC {dat = CS, uni = 3655, tra = "S", ccl = Low, liv = True, pro = "", enp = ""},
|
|
TC {dat = CT1, uni = 3656, tra = "T1", ccl = Low, liv = True, pro = "", enp = ""},
|
|
TC {dat = CT2, uni = 3657, tra = "T2", ccl = Low, liv = True, pro = "", enp = ""},
|
|
TC {dat = CT3, uni = 3658, tra = "T3", ccl = Low, liv = True, pro = "", enp = ""},
|
|
TC {dat = CT4, uni = 3659, tra = "T4", ccl = Low, liv = True, pro = "", enp = ""},
|
|
TC {dat = CK, uni = 3660, tra = "K", ccl = Low, liv = True, pro = "", enp = ""},
|
|
TC {dat = CN0, uni = 3664, tra = "N0", ccl = Low, liv = False, pro = "0", enp = "0"},
|
|
TC {dat = CN1, uni = 3665, tra = "N1", ccl = Low, liv = False, pro = "1", enp = "1"},
|
|
TC {dat = CN2, uni = 3666, tra = "N2", ccl = Low, liv = False, pro = "2", enp = "2"},
|
|
TC {dat = CN3, uni = 3667, tra = "N3", ccl = Low, liv = False, pro = "3", enp = "3"},
|
|
TC {dat = CN4, uni = 3668, tra = "N4", ccl = Low, liv = False, pro = "4", enp = "4"},
|
|
TC {dat = CN5, uni = 3669, tra = "N5", ccl = Low, liv = False, pro = "5", enp = "5"},
|
|
TC {dat = CN6, uni = 3670, tra = "N6", ccl = Low, liv = False, pro = "6", enp = "6"},
|
|
TC {dat = CN7, uni = 3671, tra = "N7", ccl = Low, liv = False, pro = "7", enp = "7"},
|
|
TC {dat = CN8, uni = 3672, tra = "N8", ccl = Low, liv = False, pro = "8", enp = "8"},
|
|
TC {dat = CN9, uni = 3673, tra = "N9", ccl = Low, liv = False, pro = "9", enp = "9"}
|
|
]
|
|
|
|
data ThaiDat =
|
|
Ck | Ck1 | Ck2 | Ck3 | Cg | Cc | Cc1 | Cc2 | Cs' |
|
|
Cc3 | Cy' | Cd' | Ct' | Ct1 | Ct2 | Ct3 | Cn' | Cd |
|
|
Ct | Ct4 | Ct5 | Ct6 | Cn | Cb | Cp | Cp1 | Cf |
|
|
Cp2 | Cf' | Cp3 | Cm | Cy | Cr | Cl | Cw | CsM |
|
|
CsP | Cs | Ch | Cl' | CO | Ch' | CaP | Ca | CaL |
|
|
Cam | Ci | CiL | Cv | CvL | Cu | CuL | Ce | Ce' |
|
|
CoL | CaH | CaE | CL | CR | CS | CT1 | CT2 | CT3 |
|
|
CT4 | CK | CN0 | CN1 | CN2 | CN3 | CN4 | CN5 | CN6 |
|
|
CN7 | CN8 | CN9
|
|
deriving (Eq,Ord,Show)
|
|
|
|
-- testing with Wikipedia Swadesh list
|
|
|
|
testFile = "src/test.txt"
|
|
resultFile = "src/results.txt"
|
|
|
|
test = do
|
|
s <- readFile testFile
|
|
writeFile resultFile []
|
|
mapM_ (testOne . tabs) $ lines s
|
|
|
|
testOne ws = case ws of
|
|
m:t:p:r:_ -> appendFile resultFile $ concat [mn,"\t",t,"\t",p,"\t",r,"\t",result,"\n"] where
|
|
result = unwords (intersperse "," (map thai2pron (filter (/=",") (words t))))
|
|
mn = if result == r
|
|
then m
|
|
else if result == p then (m ++ "+") else (m ++ "-")
|
|
_ -> return ()
|
|
|
|
testOneS ws = case ws of
|
|
m:t:p:r:_ -> appendFile resultFile $ concat [m,"\t",t,"\t",pn,"\t",r,"\n"] where
|
|
result = unwords (intersperse "," (map thai2pron (filter (/=",") (words t))))
|
|
pn = if m == "+"
|
|
then r
|
|
else p
|
|
_ -> return ()
|
|
|
|
tabs s = case break (=='\t') s of
|
|
([], _:ws) -> tabs ws
|
|
(w , _:ws) -> w:tabs ws
|
|
_ -> [s]
|
|
|
|
|
|
-- heuristics for finding syllables - unreliable, unfinished
|
|
uniSyllables :: [ThaiDat] -> [[ThaiDat]]
|
|
uniSyllables = reverse . (syll [] []) where
|
|
syll sys sy is = case is of
|
|
c:cs | isPreVowel c -> new [] is
|
|
c:d:cs | isConsonant c && isConsonant d -> new [c] (d:cs) ---- no consonant clusters
|
|
c:cs -> continue [c] cs ---- more rules to follow
|
|
_ -> sy:sys
|
|
where
|
|
new old = syll ((sy ++ old) : sys) []
|
|
continue old = syll sys (sy ++ old)
|
|
|
|
|