Thai tone rules, at least most of them

This commit is contained in:
aarne
2011-11-03 21:34:59 +00:00
parent 8e0fc2d964
commit 346e57707b

View File

@@ -3,6 +3,157 @@ module ThaiScript where
import Data.Char
import qualified Data.Map as Map
-- heuristics for finding syllables
uniSyllables :: [Int] -> [[Int]]
uniSyllables = reverse . (syll [] []) where
syll sys sy is = case is of
c:cs | isPreVowel c -> new [] is
c:d:cs | isConsonant c && isConsonant d -> new [c] (d:cs) ---- no consonant clusters
c:cs -> continue [c] cs ---- more rules to follow
_ -> sy:sys
where
new old = syll ((sy ++ old) : sys) []
continue old = syll sys (sy ++ old)
isPreVowel :: Int -> Bool
isPreVowel i = 0xe40 <= i && i <= 0xe44
isVowel :: Int -> Bool
isVowel i = 0xe30 <= i && i <= 0xe44
isConsonant :: Int -> Bool
isConsonant i = 0xe01 <= i && i <= 0xe2e && i /= 0xe2d
isMark :: Int -> Bool
isMark i = 0xe47 <= i && i <= 0xe4c
-- the following functions involving pron (=pronunciation) work on syllables
thai2pron = uni2pron . thai2uni
trans2pron = uni2pron . trans2uni
trans2thai = uni2thai . trans2uni
thai2uni :: String -> [Int]
thai2uni = map fromEnum
uni2thai :: [Int] -> String
uni2thai = map toEnum
uni2pron :: [Int] -> String
uni2pron is = case is of
0xe40:c:0xe35:0xe22:cs -> pron c ++ tone c cs "i:a" ++ uni2pron cs
0xe40:c:0xe37:0xe2d:cs -> pron c ++ tone c cs "ü:a" ++ uni2pron cs
0xe40:c:0xe32:cs -> pron c ++ tone c cs "ao" ++ uni2pron cs
0xe40:c:0xe34:cs -> pron c ++ tone c cs "ö:" ++ uni2pron cs
0xe40:c:0xe47:cs -> pron c ++ tone c cs "e" ++ uni2pron cs
0xe40:c:cs -> pron c ++ tone c cs "e:" ++ uni2pron cs
0xe41:c:0xe47:cs -> pron c ++ tone c cs "ä" ++ uni2pron cs
0xe41:c:cs -> pron c ++ tone c cs "ä:" ++ uni2pron cs
0xe42:c:cs -> pron c ++ tone c cs "o:" ++ uni2pron cs
0xe43:c:cs -> pron c ++ tone c cs "ai" ++ uni2pron cs
0xe44:c:cs -> pron c ++ tone c cs "ai" ++ uni2pron cs
c:0xe30:cs -> pron c ++ tone c cs "a" ++ uni2pron cs
c:0xe31:0xe27:cs -> pron c ++ tone c cs "u:a" ++ uni2pron cs
c:0xe31:cs -> pron c ++ tone c cs "a" ++ uni2pron cs
c:0xe32:cs -> pron c ++ tone c cs "a:" ++ uni2pron cs
c:0xe33:cs -> pron c ++ tone c cs "am" ++ uni2pron cs
c:0xe34:cs -> pron c ++ tone c cs "i" ++ uni2pron cs
c:0xe35:cs -> pron c ++ tone c cs "i:" ++ uni2pron cs
c:0xe36:cs -> pron c ++ tone c cs "ü" ++ uni2pron cs
c:0xe37:cs -> pron c ++ tone c cs "ü:" ++ uni2pron cs
c:0xe38:cs -> pron c ++ tone c cs "u" ++ uni2pron cs
c:0xe39:cs -> pron c ++ tone c cs "u:" ++ uni2pron cs
[c] -> enc c
c:cs -> pron c ++ uni2pron cs
[] -> []
where
enc c = lookThai [] pronunc_end c
pron c = lookThai [] pronunc c
tone :: Int -> [Int] -> String -> String
tone c cs v = case (lookThai Low cclass c, isLive cs, toneMark (c:cs)) of
(_,_,3) -> high v
(_,_,4) -> rising v
(Low,_,1) -> falling v
(Low,_,2) -> high v
(Low,True,_) -> mid v
(Low,False,_) -> case isLong v of
False -> high v
True -> falling v
(_,_,1) -> low v
(_,_,2) -> falling v
(Mid,True,_) -> mid v
(Mid,False,_) -> low v
(High,True,_) -> rising v
(High,False,_) -> low v
toneMark :: [Int] -> Int
toneMark is = case is of
0xe48:is -> 1
0xe49:is -> 2
0xe4a:is -> 3
0xe4b:is -> 4
_:is -> toneMark is
_ -> 0 -- no tone mark in is
isLong :: String -> Bool
isLong s = elem ':' s
isLive :: [Int] -> Bool
isLive is = case is of
[i] -> lookThai False liveness i
[] -> True
_ -> False
mid, high, low, falling, rising :: String -> String
mid s = s
high = toneMap "á" "é" "í" "ó" "ú" "ǘ" "ä'" "ö'"
low = toneMap "à" "è" "ì" "ò" "ù" "ǜ" "ä`" "ö`"
rising = toneMap "ã" "" "ĩ" "õ" "ũ" "ü~" "ä~" "ö~"
falling = toneMap "â" "ê" "î" "ô" "û" "ü^" "ä^" "ö^"
toneMap a e i o u ue ae oe s = case s of
'a':cs -> a++cs
'e':cs -> e++cs
'i':cs -> i++cs
'o':cs -> o++cs
'u':cs -> u++cs
'ü':cs -> ue++cs
'ä':cs -> ae++cs
'ö':cs -> oe++cs
_ -> s
lookThai :: a -> (ThaiChar -> a) -> Int -> a
lookThai v f i = maybe v f (Map.lookup i thaiMap)
trans2uni :: String -> [Int]
trans2uni =
map (\c -> maybe 0 id $ Map.lookup c trans) .
unchar
where
trans = Map.fromList [(translit c, unicode c) | c <- allThaiChars]
unchar :: String -> [String]
unchar s = case s of
c:d:cs
| isAlpha d -> [c] : unchar (d:cs)
| isSpace d -> [c]:[d]: unchar cs
| otherwise -> let (ds,cs2) = break (\x -> isAlpha x || isSpace x) cs in
(c:d:ds) : unchar cs2
[_] -> [s]
_ -> []
thaiMap :: Map.Map Int ThaiChar
thaiMap = Map.fromList [(unicode c,c) | c <- allThaiChars]
data ThaiChar = TC {
unicode :: Int,
translit :: String,
@@ -61,39 +212,39 @@ allThaiChars = [
TC {unicode = 3629, translit = "O", cclass = Mid, liveness = True, pronunc = "O", pronunc_end = "O"},
TC {unicode = 3630, translit = "h'", cclass = Low, liveness = True, pronunc = "h", pronunc_end = ""},
TC {unicode = 3632, translit = "a.", cclass = Low, liveness = False, pronunc = "a.", pronunc_end = "a."},
TC {unicode = 3633, translit = "a", cclass = Low, liveness = False, pronunc = "a", pronunc_end = "a"},
TC {unicode = 3634, translit = "a:", cclass = Low, liveness = False, pronunc = "a:", pronunc_end = "a:"},
TC {unicode = 3635, translit = "a+", cclass = Low, liveness = False, pronunc = "a+", pronunc_end = "a+"},
TC {unicode = 3636, translit = "i", cclass = Low, liveness = False, pronunc = "i", pronunc_end = "i"},
TC {unicode = 3637, translit = "i:", cclass = Low, liveness = False, pronunc = "i:", pronunc_end = "i:"},
TC {unicode = 3638, translit = "v", cclass = Low, liveness = False, pronunc = "v", pronunc_end = "v"},
TC {unicode = 3639, translit = "v:", cclass = Low, liveness = False, pronunc = "v:", pronunc_end = "v:"},
TC {unicode = 3640, translit = "u", cclass = Low, liveness = False, pronunc = "u", pronunc_end = "u"},
TC {unicode = 3641, translit = "u:", cclass = Low, liveness = False, pronunc = "u:", pronunc_end = "u:"},
TC {unicode = 3648, translit = "e", cclass = Low, liveness = False, pronunc = "e", pronunc_end = "e"},
TC {unicode = 3649, translit = "e'", cclass = Low, liveness = False, pronunc = "e'", pronunc_end = "e'"},
TC {unicode = 3650, translit = "o:", cclass = Low, liveness = False, pronunc = "o:", pronunc_end = "o:"},
TC {unicode = 3651, translit = "a%", cclass = Low, liveness = False, pronunc = "a%", pronunc_end = "a%"},
TC {unicode = 3652, translit = "a&", cclass = Low, liveness = False, pronunc = "a&", pronunc_end = "a&"},
TC {unicode = 3653, translit = "L", cclass = Low, liveness = False, pronunc = "L", pronunc_end = "L"},
TC {unicode = 3654, translit = "R", cclass = Low, liveness = False, pronunc = "R", pronunc_end = "R"},
TC {unicode = 3655, translit = "S", cclass = Low, liveness = False, pronunc = "S", pronunc_end = "S"},
TC {unicode = 3656, translit = "T1", cclass = Low, liveness = False, pronunc = "T1", pronunc_end = "T1"},
TC {unicode = 3657, translit = "T2", cclass = Low, liveness = False, pronunc = "T2", pronunc_end = "T2"},
TC {unicode = 3658, translit = "T3", cclass = Low, liveness = False, pronunc = "T3", pronunc_end = "T3"},
TC {unicode = 3659, translit = "T4", cclass = Low, liveness = False, pronunc = "T4", pronunc_end = "T4"},
TC {unicode = 3660, translit = "K", cclass = Low, liveness = False, pronunc = "K", pronunc_end = "K"},
TC {unicode = 3664, translit = "N0", cclass = Low, liveness = False, pronunc = "N0", pronunc_end = "N0"},
TC {unicode = 3665, translit = "N1", cclass = Low, liveness = False, pronunc = "N1", pronunc_end = "N1"},
TC {unicode = 3666, translit = "N2", cclass = Low, liveness = False, pronunc = "N2", pronunc_end = "N2"},
TC {unicode = 3667, translit = "N3", cclass = Low, liveness = False, pronunc = "N3", pronunc_end = "N3"},
TC {unicode = 3668, translit = "N4", cclass = Low, liveness = False, pronunc = "N4", pronunc_end = "N4"},
TC {unicode = 3669, translit = "N5", cclass = Low, liveness = False, pronunc = "N5", pronunc_end = "N5"},
TC {unicode = 3670, translit = "N6", cclass = Low, liveness = False, pronunc = "N6", pronunc_end = "N6"},
TC {unicode = 3671, translit = "N7", cclass = Low, liveness = False, pronunc = "N7", pronunc_end = "N7"},
TC {unicode = 3672, translit = "N8", cclass = Low, liveness = False, pronunc = "N8", pronunc_end = "N8"},
TC {unicode = 3673, translit = "N9", cclass = Low, liveness = False, pronunc = "N9", pronunc_end = "N9"}
TC {unicode = 3632, translit = "a.", cclass = Low, liveness = True, pronunc = "a", pronunc_end = "a"},
TC {unicode = 3633, translit = "a", cclass = Low, liveness = True, pronunc = "a", pronunc_end = "a"},
TC {unicode = 3634, translit = "a:", cclass = Low, liveness = True, pronunc = "a:", pronunc_end = "a:"},
TC {unicode = 3635, translit = "a+", cclass = Low, liveness = True, pronunc = "am", pronunc_end = "am"},
TC {unicode = 3636, translit = "i", cclass = Low, liveness = True, pronunc = "i", pronunc_end = "i"},
TC {unicode = 3637, translit = "i:", cclass = Low, liveness = True, pronunc = "i:", pronunc_end = "i:"},
TC {unicode = 3638, translit = "v", cclass = Low, liveness = True, pronunc = "ü", pronunc_end = "ü"},
TC {unicode = 3639, translit = "v:", cclass = Low, liveness = True, pronunc = "ü:", pronunc_end = "ü:"},
TC {unicode = 3640, translit = "u", cclass = Low, liveness = True, pronunc = "u", pronunc_end = "u"},
TC {unicode = 3641, translit = "u:", cclass = Low, liveness = True, pronunc = "u:", pronunc_end = "u:"},
TC {unicode = 3648, translit = "e", cclass = Low, liveness = True, pronunc = "e:", pronunc_end = "e:"},
TC {unicode = 3649, translit = "e'", cclass = Low, liveness = True, pronunc = "ä:", pronunc_end = "ä:"},
TC {unicode = 3650, translit = "o:", cclass = Low, liveness = True, pronunc = "o:", pronunc_end = "o:"},
TC {unicode = 3651, translit = "a%", cclass = Low, liveness = True, pronunc = "ai", pronunc_end = "ai"},
TC {unicode = 3652, translit = "a&", cclass = Low, liveness = True, pronunc = "ai", pronunc_end = "ai"},
TC {unicode = 3653, translit = "L", cclass = Low, liveness = True, pronunc = "l", pronunc_end = "n"},
TC {unicode = 3654, translit = "R", cclass = Low, liveness = True, pronunc = "r", pronunc_end = "n"},
TC {unicode = 3655, translit = "S", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""},
TC {unicode = 3656, translit = "T1", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""},
TC {unicode = 3657, translit = "T2", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""},
TC {unicode = 3658, translit = "T3", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""},
TC {unicode = 3659, translit = "T4", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""},
TC {unicode = 3660, translit = "K", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""},
TC {unicode = 3664, translit = "N0", cclass = Low, liveness = False, pronunc = "0", pronunc_end = "0"},
TC {unicode = 3665, translit = "N1", cclass = Low, liveness = False, pronunc = "1", pronunc_end = "1"},
TC {unicode = 3666, translit = "N2", cclass = Low, liveness = False, pronunc = "2", pronunc_end = "2"},
TC {unicode = 3667, translit = "N3", cclass = Low, liveness = False, pronunc = "3", pronunc_end = "3"},
TC {unicode = 3668, translit = "N4", cclass = Low, liveness = False, pronunc = "4", pronunc_end = "4"},
TC {unicode = 3669, translit = "N5", cclass = Low, liveness = False, pronunc = "5", pronunc_end = "5"},
TC {unicode = 3670, translit = "N6", cclass = Low, liveness = False, pronunc = "6", pronunc_end = "6"},
TC {unicode = 3671, translit = "N7", cclass = Low, liveness = False, pronunc = "7", pronunc_end = "7"},
TC {unicode = 3672, translit = "N8", cclass = Low, liveness = False, pronunc = "8", pronunc_end = "8"},
TC {unicode = 3673, translit = "N9", cclass = Low, liveness = False, pronunc = "9", pronunc_end = "9"}
]
@@ -139,15 +290,6 @@ mkTransliteration name ts us =
tzip ts us = [(t,u) | (t,u) <- zip ts us, t /= "-"]
uzip us ts = [(u,t) | (u,t) <- zip us ts, t /= "-"]
unchar :: String -> [String]
unchar s = case s of
c:d:cs
| isAlpha d -> [c] : unchar (d:cs)
| isSpace d -> [c]:[d]: unchar cs
| otherwise -> let (ds,cs2) = break (\x -> isAlpha x || isSpace x) cs in
(c:d:ds) : unchar cs2
[_] -> [s]
_ -> []
transThai :: Transliteration
transThai = mkTransliteration "Thai" allTrans allCodes where