From 346e57707b4a13cb3565e7b9cbaf9404ce2bdc0f Mon Sep 17 00:00:00 2001 From: aarne Date: Thu, 3 Nov 2011 21:34:59 +0000 Subject: [PATCH] Thai tone rules, at least most of them --- lib/src/thai/ThaiScript.hs | 226 ++++++++++++++++++++++++++++++------- 1 file changed, 184 insertions(+), 42 deletions(-) diff --git a/lib/src/thai/ThaiScript.hs b/lib/src/thai/ThaiScript.hs index 121e7c9b8..d13feacf4 100644 --- a/lib/src/thai/ThaiScript.hs +++ b/lib/src/thai/ThaiScript.hs @@ -3,6 +3,157 @@ module ThaiScript where import Data.Char import qualified Data.Map as Map + +-- heuristics for finding syllables +uniSyllables :: [Int] -> [[Int]] +uniSyllables = reverse . (syll [] []) where + syll sys sy is = case is of + c:cs | isPreVowel c -> new [] is + c:d:cs | isConsonant c && isConsonant d -> new [c] (d:cs) ---- no consonant clusters + c:cs -> continue [c] cs ---- more rules to follow + _ -> sy:sys + where + new old = syll ((sy ++ old) : sys) [] + continue old = syll sys (sy ++ old) + + +isPreVowel :: Int -> Bool +isPreVowel i = 0xe40 <= i && i <= 0xe44 + +isVowel :: Int -> Bool +isVowel i = 0xe30 <= i && i <= 0xe44 + +isConsonant :: Int -> Bool +isConsonant i = 0xe01 <= i && i <= 0xe2e && i /= 0xe2d + +isMark :: Int -> Bool +isMark i = 0xe47 <= i && i <= 0xe4c + + +-- the following functions involving pron (=pronunciation) work on syllables +thai2pron = uni2pron . thai2uni +trans2pron = uni2pron . trans2uni +trans2thai = uni2thai . trans2uni + +thai2uni :: String -> [Int] +thai2uni = map fromEnum + +uni2thai :: [Int] -> String +uni2thai = map toEnum + +uni2pron :: [Int] -> String +uni2pron is = case is of + 0xe40:c:0xe35:0xe22:cs -> pron c ++ tone c cs "i:a" ++ uni2pron cs + 0xe40:c:0xe37:0xe2d:cs -> pron c ++ tone c cs "ü:a" ++ uni2pron cs + 0xe40:c:0xe32:cs -> pron c ++ tone c cs "ao" ++ uni2pron cs + 0xe40:c:0xe34:cs -> pron c ++ tone c cs "ö:" ++ uni2pron cs + 0xe40:c:0xe47:cs -> pron c ++ tone c cs "e" ++ uni2pron cs + 0xe40:c:cs -> pron c ++ tone c cs "e:" ++ uni2pron cs + + 0xe41:c:0xe47:cs -> pron c ++ tone c cs "ä" ++ uni2pron cs + 0xe41:c:cs -> pron c ++ tone c cs "ä:" ++ uni2pron cs + + 0xe42:c:cs -> pron c ++ tone c cs "o:" ++ uni2pron cs + 0xe43:c:cs -> pron c ++ tone c cs "ai" ++ uni2pron cs + 0xe44:c:cs -> pron c ++ tone c cs "ai" ++ uni2pron cs + + c:0xe30:cs -> pron c ++ tone c cs "a" ++ uni2pron cs + c:0xe31:0xe27:cs -> pron c ++ tone c cs "u:a" ++ uni2pron cs + c:0xe31:cs -> pron c ++ tone c cs "a" ++ uni2pron cs + c:0xe32:cs -> pron c ++ tone c cs "a:" ++ uni2pron cs + c:0xe33:cs -> pron c ++ tone c cs "am" ++ uni2pron cs + c:0xe34:cs -> pron c ++ tone c cs "i" ++ uni2pron cs + c:0xe35:cs -> pron c ++ tone c cs "i:" ++ uni2pron cs + c:0xe36:cs -> pron c ++ tone c cs "ü" ++ uni2pron cs + c:0xe37:cs -> pron c ++ tone c cs "ü:" ++ uni2pron cs + c:0xe38:cs -> pron c ++ tone c cs "u" ++ uni2pron cs + c:0xe39:cs -> pron c ++ tone c cs "u:" ++ uni2pron cs + + [c] -> enc c + c:cs -> pron c ++ uni2pron cs + [] -> [] + where + enc c = lookThai [] pronunc_end c + pron c = lookThai [] pronunc c + +tone :: Int -> [Int] -> String -> String +tone c cs v = case (lookThai Low cclass c, isLive cs, toneMark (c:cs)) of + (_,_,3) -> high v + (_,_,4) -> rising v + (Low,_,1) -> falling v + (Low,_,2) -> high v + (Low,True,_) -> mid v + (Low,False,_) -> case isLong v of + False -> high v + True -> falling v + (_,_,1) -> low v + (_,_,2) -> falling v + (Mid,True,_) -> mid v + (Mid,False,_) -> low v + (High,True,_) -> rising v + (High,False,_) -> low v + +toneMark :: [Int] -> Int +toneMark is = case is of + 0xe48:is -> 1 + 0xe49:is -> 2 + 0xe4a:is -> 3 + 0xe4b:is -> 4 + _:is -> toneMark is + _ -> 0 -- no tone mark in is + +isLong :: String -> Bool +isLong s = elem ':' s + +isLive :: [Int] -> Bool +isLive is = case is of + [i] -> lookThai False liveness i + [] -> True + _ -> False + +mid, high, low, falling, rising :: String -> String +mid s = s +high = toneMap "á" "é" "í" "ó" "ú" "ǘ" "ä'" "ö'" +low = toneMap "à" "è" "ì" "ò" "ù" "ǜ" "ä`" "ö`" +rising = toneMap "ã" "ẽ" "ĩ" "õ" "ũ" "ü~" "ä~" "ö~" +falling = toneMap "â" "ê" "î" "ô" "û" "ü^" "ä^" "ö^" + +toneMap a e i o u ue ae oe s = case s of + 'a':cs -> a++cs + 'e':cs -> e++cs + 'i':cs -> i++cs + 'o':cs -> o++cs + 'u':cs -> u++cs + 'ü':cs -> ue++cs + 'ä':cs -> ae++cs + 'ö':cs -> oe++cs + _ -> s + + +lookThai :: a -> (ThaiChar -> a) -> Int -> a +lookThai v f i = maybe v f (Map.lookup i thaiMap) + +trans2uni :: String -> [Int] +trans2uni = + map (\c -> maybe 0 id $ Map.lookup c trans) . + unchar + where + trans = Map.fromList [(translit c, unicode c) | c <- allThaiChars] + +unchar :: String -> [String] +unchar s = case s of + c:d:cs + | isAlpha d -> [c] : unchar (d:cs) + | isSpace d -> [c]:[d]: unchar cs + | otherwise -> let (ds,cs2) = break (\x -> isAlpha x || isSpace x) cs in + (c:d:ds) : unchar cs2 + [_] -> [s] + _ -> [] + + +thaiMap :: Map.Map Int ThaiChar +thaiMap = Map.fromList [(unicode c,c) | c <- allThaiChars] + data ThaiChar = TC { unicode :: Int, translit :: String, @@ -61,39 +212,39 @@ allThaiChars = [ TC {unicode = 3629, translit = "O", cclass = Mid, liveness = True, pronunc = "O", pronunc_end = "O"}, TC {unicode = 3630, translit = "h'", cclass = Low, liveness = True, pronunc = "h", pronunc_end = ""}, - TC {unicode = 3632, translit = "a.", cclass = Low, liveness = False, pronunc = "a.", pronunc_end = "a."}, - TC {unicode = 3633, translit = "a", cclass = Low, liveness = False, pronunc = "a", pronunc_end = "a"}, - TC {unicode = 3634, translit = "a:", cclass = Low, liveness = False, pronunc = "a:", pronunc_end = "a:"}, - TC {unicode = 3635, translit = "a+", cclass = Low, liveness = False, pronunc = "a+", pronunc_end = "a+"}, - TC {unicode = 3636, translit = "i", cclass = Low, liveness = False, pronunc = "i", pronunc_end = "i"}, - TC {unicode = 3637, translit = "i:", cclass = Low, liveness = False, pronunc = "i:", pronunc_end = "i:"}, - TC {unicode = 3638, translit = "v", cclass = Low, liveness = False, pronunc = "v", pronunc_end = "v"}, - TC {unicode = 3639, translit = "v:", cclass = Low, liveness = False, pronunc = "v:", pronunc_end = "v:"}, - TC {unicode = 3640, translit = "u", cclass = Low, liveness = False, pronunc = "u", pronunc_end = "u"}, - TC {unicode = 3641, translit = "u:", cclass = Low, liveness = False, pronunc = "u:", pronunc_end = "u:"}, - TC {unicode = 3648, translit = "e", cclass = Low, liveness = False, pronunc = "e", pronunc_end = "e"}, - TC {unicode = 3649, translit = "e'", cclass = Low, liveness = False, pronunc = "e'", pronunc_end = "e'"}, - TC {unicode = 3650, translit = "o:", cclass = Low, liveness = False, pronunc = "o:", pronunc_end = "o:"}, - TC {unicode = 3651, translit = "a%", cclass = Low, liveness = False, pronunc = "a%", pronunc_end = "a%"}, - TC {unicode = 3652, translit = "a&", cclass = Low, liveness = False, pronunc = "a&", pronunc_end = "a&"}, - TC {unicode = 3653, translit = "L", cclass = Low, liveness = False, pronunc = "L", pronunc_end = "L"}, - TC {unicode = 3654, translit = "R", cclass = Low, liveness = False, pronunc = "R", pronunc_end = "R"}, - TC {unicode = 3655, translit = "S", cclass = Low, liveness = False, pronunc = "S", pronunc_end = "S"}, - TC {unicode = 3656, translit = "T1", cclass = Low, liveness = False, pronunc = "T1", pronunc_end = "T1"}, - TC {unicode = 3657, translit = "T2", cclass = Low, liveness = False, pronunc = "T2", pronunc_end = "T2"}, - TC {unicode = 3658, translit = "T3", cclass = Low, liveness = False, pronunc = "T3", pronunc_end = "T3"}, - TC {unicode = 3659, translit = "T4", cclass = Low, liveness = False, pronunc = "T4", pronunc_end = "T4"}, - TC {unicode = 3660, translit = "K", cclass = Low, liveness = False, pronunc = "K", pronunc_end = "K"}, - TC {unicode = 3664, translit = "N0", cclass = Low, liveness = False, pronunc = "N0", pronunc_end = "N0"}, - TC {unicode = 3665, translit = "N1", cclass = Low, liveness = False, pronunc = "N1", pronunc_end = "N1"}, - TC {unicode = 3666, translit = "N2", cclass = Low, liveness = False, pronunc = "N2", pronunc_end = "N2"}, - TC {unicode = 3667, translit = "N3", cclass = Low, liveness = False, pronunc = "N3", pronunc_end = "N3"}, - TC {unicode = 3668, translit = "N4", cclass = Low, liveness = False, pronunc = "N4", pronunc_end = "N4"}, - TC {unicode = 3669, translit = "N5", cclass = Low, liveness = False, pronunc = "N5", pronunc_end = "N5"}, - TC {unicode = 3670, translit = "N6", cclass = Low, liveness = False, pronunc = "N6", pronunc_end = "N6"}, - TC {unicode = 3671, translit = "N7", cclass = Low, liveness = False, pronunc = "N7", pronunc_end = "N7"}, - TC {unicode = 3672, translit = "N8", cclass = Low, liveness = False, pronunc = "N8", pronunc_end = "N8"}, - TC {unicode = 3673, translit = "N9", cclass = Low, liveness = False, pronunc = "N9", pronunc_end = "N9"} + TC {unicode = 3632, translit = "a.", cclass = Low, liveness = True, pronunc = "a", pronunc_end = "a"}, + TC {unicode = 3633, translit = "a", cclass = Low, liveness = True, pronunc = "a", pronunc_end = "a"}, + TC {unicode = 3634, translit = "a:", cclass = Low, liveness = True, pronunc = "a:", pronunc_end = "a:"}, + TC {unicode = 3635, translit = "a+", cclass = Low, liveness = True, pronunc = "am", pronunc_end = "am"}, + TC {unicode = 3636, translit = "i", cclass = Low, liveness = True, pronunc = "i", pronunc_end = "i"}, + TC {unicode = 3637, translit = "i:", cclass = Low, liveness = True, pronunc = "i:", pronunc_end = "i:"}, + TC {unicode = 3638, translit = "v", cclass = Low, liveness = True, pronunc = "ü", pronunc_end = "ü"}, + TC {unicode = 3639, translit = "v:", cclass = Low, liveness = True, pronunc = "ü:", pronunc_end = "ü:"}, + TC {unicode = 3640, translit = "u", cclass = Low, liveness = True, pronunc = "u", pronunc_end = "u"}, + TC {unicode = 3641, translit = "u:", cclass = Low, liveness = True, pronunc = "u:", pronunc_end = "u:"}, + TC {unicode = 3648, translit = "e", cclass = Low, liveness = True, pronunc = "e:", pronunc_end = "e:"}, + TC {unicode = 3649, translit = "e'", cclass = Low, liveness = True, pronunc = "ä:", pronunc_end = "ä:"}, + TC {unicode = 3650, translit = "o:", cclass = Low, liveness = True, pronunc = "o:", pronunc_end = "o:"}, + TC {unicode = 3651, translit = "a%", cclass = Low, liveness = True, pronunc = "ai", pronunc_end = "ai"}, + TC {unicode = 3652, translit = "a&", cclass = Low, liveness = True, pronunc = "ai", pronunc_end = "ai"}, + TC {unicode = 3653, translit = "L", cclass = Low, liveness = True, pronunc = "l", pronunc_end = "n"}, + TC {unicode = 3654, translit = "R", cclass = Low, liveness = True, pronunc = "r", pronunc_end = "n"}, + TC {unicode = 3655, translit = "S", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""}, + TC {unicode = 3656, translit = "T1", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""}, + TC {unicode = 3657, translit = "T2", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""}, + TC {unicode = 3658, translit = "T3", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""}, + TC {unicode = 3659, translit = "T4", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""}, + TC {unicode = 3660, translit = "K", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""}, + TC {unicode = 3664, translit = "N0", cclass = Low, liveness = False, pronunc = "0", pronunc_end = "0"}, + TC {unicode = 3665, translit = "N1", cclass = Low, liveness = False, pronunc = "1", pronunc_end = "1"}, + TC {unicode = 3666, translit = "N2", cclass = Low, liveness = False, pronunc = "2", pronunc_end = "2"}, + TC {unicode = 3667, translit = "N3", cclass = Low, liveness = False, pronunc = "3", pronunc_end = "3"}, + TC {unicode = 3668, translit = "N4", cclass = Low, liveness = False, pronunc = "4", pronunc_end = "4"}, + TC {unicode = 3669, translit = "N5", cclass = Low, liveness = False, pronunc = "5", pronunc_end = "5"}, + TC {unicode = 3670, translit = "N6", cclass = Low, liveness = False, pronunc = "6", pronunc_end = "6"}, + TC {unicode = 3671, translit = "N7", cclass = Low, liveness = False, pronunc = "7", pronunc_end = "7"}, + TC {unicode = 3672, translit = "N8", cclass = Low, liveness = False, pronunc = "8", pronunc_end = "8"}, + TC {unicode = 3673, translit = "N9", cclass = Low, liveness = False, pronunc = "9", pronunc_end = "9"} ] @@ -139,15 +290,6 @@ mkTransliteration name ts us = tzip ts us = [(t,u) | (t,u) <- zip ts us, t /= "-"] uzip us ts = [(u,t) | (u,t) <- zip us ts, t /= "-"] -unchar :: String -> [String] -unchar s = case s of - c:d:cs - | isAlpha d -> [c] : unchar (d:cs) - | isSpace d -> [c]:[d]: unchar cs - | otherwise -> let (ds,cs2) = break (\x -> isAlpha x || isSpace x) cs in - (c:d:ds) : unchar cs2 - [_] -> [s] - _ -> [] transThai :: Transliteration transThai = mkTransliteration "Thai" allTrans allCodes where