mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-14 07:19:31 -06:00
Thai tone rules, at least most of them
This commit is contained in:
@@ -3,6 +3,157 @@ module ThaiScript where
|
||||
import Data.Char
|
||||
import qualified Data.Map as Map
|
||||
|
||||
|
||||
-- heuristics for finding syllables
|
||||
uniSyllables :: [Int] -> [[Int]]
|
||||
uniSyllables = reverse . (syll [] []) where
|
||||
syll sys sy is = case is of
|
||||
c:cs | isPreVowel c -> new [] is
|
||||
c:d:cs | isConsonant c && isConsonant d -> new [c] (d:cs) ---- no consonant clusters
|
||||
c:cs -> continue [c] cs ---- more rules to follow
|
||||
_ -> sy:sys
|
||||
where
|
||||
new old = syll ((sy ++ old) : sys) []
|
||||
continue old = syll sys (sy ++ old)
|
||||
|
||||
|
||||
isPreVowel :: Int -> Bool
|
||||
isPreVowel i = 0xe40 <= i && i <= 0xe44
|
||||
|
||||
isVowel :: Int -> Bool
|
||||
isVowel i = 0xe30 <= i && i <= 0xe44
|
||||
|
||||
isConsonant :: Int -> Bool
|
||||
isConsonant i = 0xe01 <= i && i <= 0xe2e && i /= 0xe2d
|
||||
|
||||
isMark :: Int -> Bool
|
||||
isMark i = 0xe47 <= i && i <= 0xe4c
|
||||
|
||||
|
||||
-- the following functions involving pron (=pronunciation) work on syllables
|
||||
thai2pron = uni2pron . thai2uni
|
||||
trans2pron = uni2pron . trans2uni
|
||||
trans2thai = uni2thai . trans2uni
|
||||
|
||||
thai2uni :: String -> [Int]
|
||||
thai2uni = map fromEnum
|
||||
|
||||
uni2thai :: [Int] -> String
|
||||
uni2thai = map toEnum
|
||||
|
||||
uni2pron :: [Int] -> String
|
||||
uni2pron is = case is of
|
||||
0xe40:c:0xe35:0xe22:cs -> pron c ++ tone c cs "i:a" ++ uni2pron cs
|
||||
0xe40:c:0xe37:0xe2d:cs -> pron c ++ tone c cs "ü:a" ++ uni2pron cs
|
||||
0xe40:c:0xe32:cs -> pron c ++ tone c cs "ao" ++ uni2pron cs
|
||||
0xe40:c:0xe34:cs -> pron c ++ tone c cs "ö:" ++ uni2pron cs
|
||||
0xe40:c:0xe47:cs -> pron c ++ tone c cs "e" ++ uni2pron cs
|
||||
0xe40:c:cs -> pron c ++ tone c cs "e:" ++ uni2pron cs
|
||||
|
||||
0xe41:c:0xe47:cs -> pron c ++ tone c cs "ä" ++ uni2pron cs
|
||||
0xe41:c:cs -> pron c ++ tone c cs "ä:" ++ uni2pron cs
|
||||
|
||||
0xe42:c:cs -> pron c ++ tone c cs "o:" ++ uni2pron cs
|
||||
0xe43:c:cs -> pron c ++ tone c cs "ai" ++ uni2pron cs
|
||||
0xe44:c:cs -> pron c ++ tone c cs "ai" ++ uni2pron cs
|
||||
|
||||
c:0xe30:cs -> pron c ++ tone c cs "a" ++ uni2pron cs
|
||||
c:0xe31:0xe27:cs -> pron c ++ tone c cs "u:a" ++ uni2pron cs
|
||||
c:0xe31:cs -> pron c ++ tone c cs "a" ++ uni2pron cs
|
||||
c:0xe32:cs -> pron c ++ tone c cs "a:" ++ uni2pron cs
|
||||
c:0xe33:cs -> pron c ++ tone c cs "am" ++ uni2pron cs
|
||||
c:0xe34:cs -> pron c ++ tone c cs "i" ++ uni2pron cs
|
||||
c:0xe35:cs -> pron c ++ tone c cs "i:" ++ uni2pron cs
|
||||
c:0xe36:cs -> pron c ++ tone c cs "ü" ++ uni2pron cs
|
||||
c:0xe37:cs -> pron c ++ tone c cs "ü:" ++ uni2pron cs
|
||||
c:0xe38:cs -> pron c ++ tone c cs "u" ++ uni2pron cs
|
||||
c:0xe39:cs -> pron c ++ tone c cs "u:" ++ uni2pron cs
|
||||
|
||||
[c] -> enc c
|
||||
c:cs -> pron c ++ uni2pron cs
|
||||
[] -> []
|
||||
where
|
||||
enc c = lookThai [] pronunc_end c
|
||||
pron c = lookThai [] pronunc c
|
||||
|
||||
tone :: Int -> [Int] -> String -> String
|
||||
tone c cs v = case (lookThai Low cclass c, isLive cs, toneMark (c:cs)) of
|
||||
(_,_,3) -> high v
|
||||
(_,_,4) -> rising v
|
||||
(Low,_,1) -> falling v
|
||||
(Low,_,2) -> high v
|
||||
(Low,True,_) -> mid v
|
||||
(Low,False,_) -> case isLong v of
|
||||
False -> high v
|
||||
True -> falling v
|
||||
(_,_,1) -> low v
|
||||
(_,_,2) -> falling v
|
||||
(Mid,True,_) -> mid v
|
||||
(Mid,False,_) -> low v
|
||||
(High,True,_) -> rising v
|
||||
(High,False,_) -> low v
|
||||
|
||||
toneMark :: [Int] -> Int
|
||||
toneMark is = case is of
|
||||
0xe48:is -> 1
|
||||
0xe49:is -> 2
|
||||
0xe4a:is -> 3
|
||||
0xe4b:is -> 4
|
||||
_:is -> toneMark is
|
||||
_ -> 0 -- no tone mark in is
|
||||
|
||||
isLong :: String -> Bool
|
||||
isLong s = elem ':' s
|
||||
|
||||
isLive :: [Int] -> Bool
|
||||
isLive is = case is of
|
||||
[i] -> lookThai False liveness i
|
||||
[] -> True
|
||||
_ -> False
|
||||
|
||||
mid, high, low, falling, rising :: String -> String
|
||||
mid s = s
|
||||
high = toneMap "á" "é" "í" "ó" "ú" "ǘ" "ä'" "ö'"
|
||||
low = toneMap "à" "è" "ì" "ò" "ù" "ǜ" "ä`" "ö`"
|
||||
rising = toneMap "ã" "ẽ" "ĩ" "õ" "ũ" "ü~" "ä~" "ö~"
|
||||
falling = toneMap "â" "ê" "î" "ô" "û" "ü^" "ä^" "ö^"
|
||||
|
||||
toneMap a e i o u ue ae oe s = case s of
|
||||
'a':cs -> a++cs
|
||||
'e':cs -> e++cs
|
||||
'i':cs -> i++cs
|
||||
'o':cs -> o++cs
|
||||
'u':cs -> u++cs
|
||||
'ü':cs -> ue++cs
|
||||
'ä':cs -> ae++cs
|
||||
'ö':cs -> oe++cs
|
||||
_ -> s
|
||||
|
||||
|
||||
lookThai :: a -> (ThaiChar -> a) -> Int -> a
|
||||
lookThai v f i = maybe v f (Map.lookup i thaiMap)
|
||||
|
||||
trans2uni :: String -> [Int]
|
||||
trans2uni =
|
||||
map (\c -> maybe 0 id $ Map.lookup c trans) .
|
||||
unchar
|
||||
where
|
||||
trans = Map.fromList [(translit c, unicode c) | c <- allThaiChars]
|
||||
|
||||
unchar :: String -> [String]
|
||||
unchar s = case s of
|
||||
c:d:cs
|
||||
| isAlpha d -> [c] : unchar (d:cs)
|
||||
| isSpace d -> [c]:[d]: unchar cs
|
||||
| otherwise -> let (ds,cs2) = break (\x -> isAlpha x || isSpace x) cs in
|
||||
(c:d:ds) : unchar cs2
|
||||
[_] -> [s]
|
||||
_ -> []
|
||||
|
||||
|
||||
thaiMap :: Map.Map Int ThaiChar
|
||||
thaiMap = Map.fromList [(unicode c,c) | c <- allThaiChars]
|
||||
|
||||
data ThaiChar = TC {
|
||||
unicode :: Int,
|
||||
translit :: String,
|
||||
@@ -61,39 +212,39 @@ allThaiChars = [
|
||||
TC {unicode = 3629, translit = "O", cclass = Mid, liveness = True, pronunc = "O", pronunc_end = "O"},
|
||||
TC {unicode = 3630, translit = "h'", cclass = Low, liveness = True, pronunc = "h", pronunc_end = ""},
|
||||
|
||||
TC {unicode = 3632, translit = "a.", cclass = Low, liveness = False, pronunc = "a.", pronunc_end = "a."},
|
||||
TC {unicode = 3633, translit = "a", cclass = Low, liveness = False, pronunc = "a", pronunc_end = "a"},
|
||||
TC {unicode = 3634, translit = "a:", cclass = Low, liveness = False, pronunc = "a:", pronunc_end = "a:"},
|
||||
TC {unicode = 3635, translit = "a+", cclass = Low, liveness = False, pronunc = "a+", pronunc_end = "a+"},
|
||||
TC {unicode = 3636, translit = "i", cclass = Low, liveness = False, pronunc = "i", pronunc_end = "i"},
|
||||
TC {unicode = 3637, translit = "i:", cclass = Low, liveness = False, pronunc = "i:", pronunc_end = "i:"},
|
||||
TC {unicode = 3638, translit = "v", cclass = Low, liveness = False, pronunc = "v", pronunc_end = "v"},
|
||||
TC {unicode = 3639, translit = "v:", cclass = Low, liveness = False, pronunc = "v:", pronunc_end = "v:"},
|
||||
TC {unicode = 3640, translit = "u", cclass = Low, liveness = False, pronunc = "u", pronunc_end = "u"},
|
||||
TC {unicode = 3641, translit = "u:", cclass = Low, liveness = False, pronunc = "u:", pronunc_end = "u:"},
|
||||
TC {unicode = 3648, translit = "e", cclass = Low, liveness = False, pronunc = "e", pronunc_end = "e"},
|
||||
TC {unicode = 3649, translit = "e'", cclass = Low, liveness = False, pronunc = "e'", pronunc_end = "e'"},
|
||||
TC {unicode = 3650, translit = "o:", cclass = Low, liveness = False, pronunc = "o:", pronunc_end = "o:"},
|
||||
TC {unicode = 3651, translit = "a%", cclass = Low, liveness = False, pronunc = "a%", pronunc_end = "a%"},
|
||||
TC {unicode = 3652, translit = "a&", cclass = Low, liveness = False, pronunc = "a&", pronunc_end = "a&"},
|
||||
TC {unicode = 3653, translit = "L", cclass = Low, liveness = False, pronunc = "L", pronunc_end = "L"},
|
||||
TC {unicode = 3654, translit = "R", cclass = Low, liveness = False, pronunc = "R", pronunc_end = "R"},
|
||||
TC {unicode = 3655, translit = "S", cclass = Low, liveness = False, pronunc = "S", pronunc_end = "S"},
|
||||
TC {unicode = 3656, translit = "T1", cclass = Low, liveness = False, pronunc = "T1", pronunc_end = "T1"},
|
||||
TC {unicode = 3657, translit = "T2", cclass = Low, liveness = False, pronunc = "T2", pronunc_end = "T2"},
|
||||
TC {unicode = 3658, translit = "T3", cclass = Low, liveness = False, pronunc = "T3", pronunc_end = "T3"},
|
||||
TC {unicode = 3659, translit = "T4", cclass = Low, liveness = False, pronunc = "T4", pronunc_end = "T4"},
|
||||
TC {unicode = 3660, translit = "K", cclass = Low, liveness = False, pronunc = "K", pronunc_end = "K"},
|
||||
TC {unicode = 3664, translit = "N0", cclass = Low, liveness = False, pronunc = "N0", pronunc_end = "N0"},
|
||||
TC {unicode = 3665, translit = "N1", cclass = Low, liveness = False, pronunc = "N1", pronunc_end = "N1"},
|
||||
TC {unicode = 3666, translit = "N2", cclass = Low, liveness = False, pronunc = "N2", pronunc_end = "N2"},
|
||||
TC {unicode = 3667, translit = "N3", cclass = Low, liveness = False, pronunc = "N3", pronunc_end = "N3"},
|
||||
TC {unicode = 3668, translit = "N4", cclass = Low, liveness = False, pronunc = "N4", pronunc_end = "N4"},
|
||||
TC {unicode = 3669, translit = "N5", cclass = Low, liveness = False, pronunc = "N5", pronunc_end = "N5"},
|
||||
TC {unicode = 3670, translit = "N6", cclass = Low, liveness = False, pronunc = "N6", pronunc_end = "N6"},
|
||||
TC {unicode = 3671, translit = "N7", cclass = Low, liveness = False, pronunc = "N7", pronunc_end = "N7"},
|
||||
TC {unicode = 3672, translit = "N8", cclass = Low, liveness = False, pronunc = "N8", pronunc_end = "N8"},
|
||||
TC {unicode = 3673, translit = "N9", cclass = Low, liveness = False, pronunc = "N9", pronunc_end = "N9"}
|
||||
TC {unicode = 3632, translit = "a.", cclass = Low, liveness = True, pronunc = "a", pronunc_end = "a"},
|
||||
TC {unicode = 3633, translit = "a", cclass = Low, liveness = True, pronunc = "a", pronunc_end = "a"},
|
||||
TC {unicode = 3634, translit = "a:", cclass = Low, liveness = True, pronunc = "a:", pronunc_end = "a:"},
|
||||
TC {unicode = 3635, translit = "a+", cclass = Low, liveness = True, pronunc = "am", pronunc_end = "am"},
|
||||
TC {unicode = 3636, translit = "i", cclass = Low, liveness = True, pronunc = "i", pronunc_end = "i"},
|
||||
TC {unicode = 3637, translit = "i:", cclass = Low, liveness = True, pronunc = "i:", pronunc_end = "i:"},
|
||||
TC {unicode = 3638, translit = "v", cclass = Low, liveness = True, pronunc = "ü", pronunc_end = "ü"},
|
||||
TC {unicode = 3639, translit = "v:", cclass = Low, liveness = True, pronunc = "ü:", pronunc_end = "ü:"},
|
||||
TC {unicode = 3640, translit = "u", cclass = Low, liveness = True, pronunc = "u", pronunc_end = "u"},
|
||||
TC {unicode = 3641, translit = "u:", cclass = Low, liveness = True, pronunc = "u:", pronunc_end = "u:"},
|
||||
TC {unicode = 3648, translit = "e", cclass = Low, liveness = True, pronunc = "e:", pronunc_end = "e:"},
|
||||
TC {unicode = 3649, translit = "e'", cclass = Low, liveness = True, pronunc = "ä:", pronunc_end = "ä:"},
|
||||
TC {unicode = 3650, translit = "o:", cclass = Low, liveness = True, pronunc = "o:", pronunc_end = "o:"},
|
||||
TC {unicode = 3651, translit = "a%", cclass = Low, liveness = True, pronunc = "ai", pronunc_end = "ai"},
|
||||
TC {unicode = 3652, translit = "a&", cclass = Low, liveness = True, pronunc = "ai", pronunc_end = "ai"},
|
||||
TC {unicode = 3653, translit = "L", cclass = Low, liveness = True, pronunc = "l", pronunc_end = "n"},
|
||||
TC {unicode = 3654, translit = "R", cclass = Low, liveness = True, pronunc = "r", pronunc_end = "n"},
|
||||
TC {unicode = 3655, translit = "S", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""},
|
||||
TC {unicode = 3656, translit = "T1", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""},
|
||||
TC {unicode = 3657, translit = "T2", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""},
|
||||
TC {unicode = 3658, translit = "T3", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""},
|
||||
TC {unicode = 3659, translit = "T4", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""},
|
||||
TC {unicode = 3660, translit = "K", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""},
|
||||
TC {unicode = 3664, translit = "N0", cclass = Low, liveness = False, pronunc = "0", pronunc_end = "0"},
|
||||
TC {unicode = 3665, translit = "N1", cclass = Low, liveness = False, pronunc = "1", pronunc_end = "1"},
|
||||
TC {unicode = 3666, translit = "N2", cclass = Low, liveness = False, pronunc = "2", pronunc_end = "2"},
|
||||
TC {unicode = 3667, translit = "N3", cclass = Low, liveness = False, pronunc = "3", pronunc_end = "3"},
|
||||
TC {unicode = 3668, translit = "N4", cclass = Low, liveness = False, pronunc = "4", pronunc_end = "4"},
|
||||
TC {unicode = 3669, translit = "N5", cclass = Low, liveness = False, pronunc = "5", pronunc_end = "5"},
|
||||
TC {unicode = 3670, translit = "N6", cclass = Low, liveness = False, pronunc = "6", pronunc_end = "6"},
|
||||
TC {unicode = 3671, translit = "N7", cclass = Low, liveness = False, pronunc = "7", pronunc_end = "7"},
|
||||
TC {unicode = 3672, translit = "N8", cclass = Low, liveness = False, pronunc = "8", pronunc_end = "8"},
|
||||
TC {unicode = 3673, translit = "N9", cclass = Low, liveness = False, pronunc = "9", pronunc_end = "9"}
|
||||
]
|
||||
|
||||
|
||||
@@ -139,15 +290,6 @@ mkTransliteration name ts us =
|
||||
tzip ts us = [(t,u) | (t,u) <- zip ts us, t /= "-"]
|
||||
uzip us ts = [(u,t) | (u,t) <- zip us ts, t /= "-"]
|
||||
|
||||
unchar :: String -> [String]
|
||||
unchar s = case s of
|
||||
c:d:cs
|
||||
| isAlpha d -> [c] : unchar (d:cs)
|
||||
| isSpace d -> [c]:[d]: unchar cs
|
||||
| otherwise -> let (ds,cs2) = break (\x -> isAlpha x || isSpace x) cs in
|
||||
(c:d:ds) : unchar cs2
|
||||
[_] -> [s]
|
||||
_ -> []
|
||||
|
||||
transThai :: Transliteration
|
||||
transThai = mkTransliteration "Thai" allTrans allCodes where
|
||||
|
||||
Reference in New Issue
Block a user