mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-05-06 17:52:51 -06:00
GF/src is now for 2.9, and the new sources are in src-3.0 - keep it this way until the release of GF 3
This commit is contained in:
368
src-3.0/GF/Text/Thai.hs
Normal file
368
src-3.0/GF/Text/Thai.hs
Normal file
@@ -0,0 +1,368 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : Thai
|
||||
-- Maintainer : (Maintainer)
|
||||
-- Stability : (experimental)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
--
|
||||
-- Thai transliteration and other alphabet information.
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
-- AR 27/12/2006. Execute test2 to see the transliteration table.
|
||||
|
||||
module GF.Text.Thai (
|
||||
mkThai,mkThaiWord,mkThaiPron,mkThaiFake,thaiFile,thaiPronFile,thaiFakeFile
|
||||
) where
|
||||
|
||||
import qualified Data.Map as Map
|
||||
import Data.Char
|
||||
|
||||
-- for testing
|
||||
import GF.Text.UTF8
|
||||
import Data.List
|
||||
|
||||
import Debug.Trace
|
||||
|
||||
|
||||
mkThai :: String -> String
|
||||
mkThai = concat . map mkThaiWord . words
|
||||
mkThaiPron = unwords . map mkPronSyllable . words
|
||||
mkThaiFake = unwords . map (fakeEnglish . mkPronSyllable) . words
|
||||
|
||||
|
||||
type ThaiChar = Char
|
||||
|
||||
mkThaiWord :: String -> [ThaiChar]
|
||||
mkThaiWord = map (toEnum . mkThaiChar) . unchar . snd . pronAndOrth
|
||||
|
||||
mkThaiChar :: String -> Int
|
||||
mkThaiChar c = maybe 0 id $ Map.lookup c thaiMap
|
||||
|
||||
thaiMap :: Map.Map String Int
|
||||
thaiMap = Map.fromList $ zip allThaiTrans allThaiCodes
|
||||
|
||||
-- convert all string literals in a text
|
||||
|
||||
thaiStrings :: String -> String
|
||||
thaiStrings = convStrings mkThai
|
||||
|
||||
thaiPronStrings :: String -> String
|
||||
thaiPronStrings = convStrings mkThaiPron
|
||||
|
||||
convStrings conv s = case s of
|
||||
'"':cs -> let (t,_:r) = span (/='"') cs in
|
||||
'"': conv t ++ "\"" ++ convStrings conv r
|
||||
c:cs -> c : convStrings conv cs
|
||||
_ -> s
|
||||
|
||||
|
||||
-- each character is either [letter] or [letter+nonletter]
|
||||
|
||||
unchar :: String -> [String]
|
||||
unchar s = case s of
|
||||
c:d:cs
|
||||
| isAlpha d -> [c] : unchar (d:cs)
|
||||
| d == '?' -> unchar cs -- use "o?" to represent implicit 'o'
|
||||
| otherwise -> [c,d] : unchar cs
|
||||
[_] -> [s]
|
||||
_ -> []
|
||||
|
||||
-- you can prefix transliteration by irregular phonology in []
|
||||
|
||||
pronAndOrth :: String -> (Maybe String, String)
|
||||
pronAndOrth s = case s of
|
||||
'[':cs -> case span (/=']') cs of
|
||||
(p,_:o) -> (Just p,o)
|
||||
_ -> (Nothing,s)
|
||||
_ -> (Nothing,s)
|
||||
|
||||
allThaiTrans :: [String]
|
||||
allThaiTrans = words $
|
||||
"- k k1 - k2 - k3 g c c1 c2 s' c3 y' d' t' " ++
|
||||
"t1 t2 t3 n' d t t4 t5 t6 n b p p1 f p2 f' " ++
|
||||
"p3 m y r - l - w s- s. s h l' O h' - " ++
|
||||
"a. a a: a+ i i: v v: u u: - - - - - - " ++
|
||||
"e e' o: a% a& L R S T1 T2 T3 T4 K - - - " ++
|
||||
"N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 - - - - - - "
|
||||
|
||||
allThaiCodes :: [Int]
|
||||
allThaiCodes = [0x0e00 .. 0x0e7f]
|
||||
|
||||
|
||||
---------------------
|
||||
-- heuristic pronunciation of codes
|
||||
---------------------
|
||||
|
||||
-- fake English for TTS, a la Teach Yourself Thai
|
||||
|
||||
fakeEnglish :: String -> String
|
||||
fakeEnglish s = case s of
|
||||
'a':'a':cs -> "ah" ++ fakeEnglish cs
|
||||
'a':'y':cs -> "ai" ++ fakeEnglish cs
|
||||
'a' :cs -> "ah" ++ fakeEnglish cs
|
||||
'c':'h':cs -> "ch" ++ fakeEnglish cs
|
||||
'c' :cs -> "j" ++ fakeEnglish cs
|
||||
'e':'e':cs -> "aih" ++ fakeEnglish cs
|
||||
'g' :cs -> "ng" ++ fakeEnglish cs
|
||||
'i':'i':cs -> "ee" ++ fakeEnglish cs
|
||||
'k':'h':cs -> "k" ++ fakeEnglish cs
|
||||
'k' :cs -> "g" ++ fakeEnglish cs
|
||||
'O':'O':cs -> "or" ++ fakeEnglish cs
|
||||
'O' :cs -> "or" ++ fakeEnglish cs
|
||||
'o':'o':cs -> "or" ++ fakeEnglish cs
|
||||
'p':'h':cs -> "p" ++ fakeEnglish cs
|
||||
'p' :cs -> "b" ++ fakeEnglish cs
|
||||
't':'h':cs -> "t" ++ fakeEnglish cs
|
||||
't' :cs -> "d" ++ fakeEnglish cs
|
||||
'u':'u':cs -> "oo" ++ fakeEnglish cs
|
||||
'u' :cs -> "oo" ++ fakeEnglish cs
|
||||
'v':'v':cs -> "eu" ++ fakeEnglish cs
|
||||
'v' :cs -> "eu" ++ fakeEnglish cs
|
||||
'\228':'\228':cs -> "air" ++ fakeEnglish cs
|
||||
'\228' :cs -> "a" ++ fakeEnglish cs
|
||||
'\246':'\246':cs -> "er" ++ fakeEnglish cs
|
||||
'\246' :cs -> "er" ++ fakeEnglish cs
|
||||
c:cs | isTone c -> fakeEnglish cs
|
||||
c:cs -> c : fakeEnglish cs
|
||||
_ -> s
|
||||
where
|
||||
isTone = flip elem "'`^~"
|
||||
|
||||
|
||||
-- this works for one syllable
|
||||
|
||||
mkPronSyllable s = case fst $ pronAndOrth s of
|
||||
Just p -> p
|
||||
_ -> pronSyllable $ getSyllable $ map mkThaiChar $ unchar s
|
||||
|
||||
data Syllable = Syll {
|
||||
initv :: [Int],
|
||||
initc :: [Int],
|
||||
midv :: [Int],
|
||||
finalc :: [Int],
|
||||
finalv :: [Int],
|
||||
tone :: [Int],
|
||||
shorten :: Bool,
|
||||
kill :: Bool
|
||||
}
|
||||
deriving Show
|
||||
|
||||
data Tone = TMid | TLow | THigh | TRise | TFall
|
||||
deriving Show
|
||||
|
||||
data CClass = CLow | CMid | CHigh
|
||||
deriving Show
|
||||
|
||||
pronSyllable :: Syllable -> String
|
||||
pronSyllable s =
|
||||
initCons ++ tonem ++ vowel ++ finalCons
|
||||
where
|
||||
|
||||
vowel = case (initv s, midv s, finalv s, finalc s, shorten s, tone s) of
|
||||
([0x0e40],[0x0e35],_,[0x0e22],_,_) -> "ia" -- e-i:y
|
||||
([0x0e40],[0x0e2d,0x0e35],_,_,_,_) -> "va" -- e-i:O
|
||||
([0x0e40],[0x0e30,0x0e35],_,[0x0e22],_,_) -> "ia" -- e-i:ya.
|
||||
([0x0e40],[0x0e30,0x0e2d],_,_,_,_) -> "\246" -- e-Oa.
|
||||
([0x0e40],[0x0e30,0x0e32],_,_,_,_) -> "O" -- e-a:a. -- open o
|
||||
([0x0e40],[0x0e2d],_,_,_,_) -> "\246\246" -- e-O
|
||||
([0x0e40],[0x0e34],_,_,_,_) -> "\246\246" -- e-i
|
||||
([0x0e40],[0x0e30],_,_,_,_) -> "e" -- e-a.
|
||||
([0x0e40],[0x0e32],_,_,_,_) -> "aw" -- e-a:
|
||||
([0x0e40],[],[],[0x0e22],_,_) -> "\246\246y" -- e-y
|
||||
([0x0e40],[],[],_,True,_) -> "e"
|
||||
|
||||
([0x0e41],[0x0e30],_,_,_,_) -> "\228" -- ä-a.
|
||||
([0x0e41],[],[],_,True,_) -> "\228"
|
||||
|
||||
([0x0e42],[0x0e30],_,_,_,_) -> "o" -- o:-a.
|
||||
|
||||
([],[0x0e2d],_,[0x0e22],_,_) -> "OOy" -- Oy
|
||||
([],[0x0e2d],_,_,_,_) -> "OO" -- O
|
||||
|
||||
([],[],[],_,_,_) -> "o"
|
||||
|
||||
(i,m,f,_,_,_) -> concatMap pronThaiChar (reverse $ f ++ m ++ i) ----
|
||||
|
||||
initCons = concatMap pronThaiChar $ case (reverse $ initc s) of
|
||||
0x0e2b:cs@(_:_) -> cs -- high h
|
||||
0x0e2d:cs@(_:_) -> cs -- O
|
||||
cs -> cs
|
||||
|
||||
finalCons =
|
||||
let (c,cs) = splitAt 1 $ finalc s
|
||||
in
|
||||
case c of
|
||||
[] -> []
|
||||
[0x0e22] -> [] --- y
|
||||
[k] -> concatMap pronThaiChar (reverse cs) ++ finalThai k
|
||||
|
||||
iclass = case take 1 (reverse $ initc s) of
|
||||
[c] -> classThai c
|
||||
[] -> CMid -- O
|
||||
|
||||
isLong = not (shorten s) && case vowel of
|
||||
_:_:_ -> True ----
|
||||
_ -> False
|
||||
|
||||
isLive = case finalCons of
|
||||
c | elem c ["n","m","g"] -> True
|
||||
"" -> isLong
|
||||
_ -> False
|
||||
|
||||
tonem = case (iclass,isLive,isLong,tone s) of
|
||||
(_,_,_, [0x0e4a]) -> tHigh
|
||||
(_,_,_, [0x0e4b]) -> tRise
|
||||
(CLow,_,_,[0x0e49]) -> tRise
|
||||
(_,_,_, [0x0e49]) -> tFall
|
||||
(CLow,_,_,[0x0e48]) -> tFall
|
||||
(_, _,_,[0x0e48]) -> tLow
|
||||
(CHigh,True,_,_) -> tRise
|
||||
(_, True,_,_) -> tMid
|
||||
(CLow,False,False,_) -> tHigh
|
||||
(CLow,False,_,_) -> tFall
|
||||
_ -> tLow
|
||||
|
||||
(tMid,tHigh,tLow,tRise,tFall) = ("-","'","`","~","^")
|
||||
|
||||
isVowel c = 0x0e30 <= c && c <= 0x0e44 ----
|
||||
isCons c = 0x0e01 <= c && c <= 0x0e2f ----
|
||||
isTone c = 0x0e48 <= c && c <= 0x0e4b
|
||||
|
||||
getSyllable :: [Int] -> Syllable
|
||||
getSyllable = foldl get (Syll [] [] [] [] [] [] False False) where
|
||||
get syll c = case c of
|
||||
0x0e47 -> syll {shorten = True}
|
||||
0x0e4c -> syll {kill = True, finalc = tail (finalc syll)} --- always last
|
||||
0x0e2d
|
||||
| null (initc syll) -> syll {initc = [c]} -- "O"
|
||||
| otherwise -> syll {midv = c : midv syll}
|
||||
_
|
||||
| isVowel c -> if null (initc syll)
|
||||
then syll {initv = c : initv syll}
|
||||
else syll {midv = c : midv syll}
|
||||
| isCons c -> if null (initc syll) ||
|
||||
(null (midv syll) && isCluster (initc syll) c)
|
||||
then syll {initc = c : initc syll}
|
||||
else syll {finalc = c : finalc syll}
|
||||
| isTone c -> syll {tone = [c]}
|
||||
_ -> syll ---- check this
|
||||
|
||||
isCluster s c = length s == 1 && (c == 0x0e23 || s == [0x0e2b])
|
||||
|
||||
-- to test
|
||||
|
||||
test1 = testThai "k2wa:mrak"
|
||||
test2 = putStrLn $ thaiTable
|
||||
test3 = do
|
||||
writeFile "thai.txt" "Thai Character Coding in GF\nAR 2007\n"
|
||||
appendFile "thai.txt" thaiTable
|
||||
test4 = do
|
||||
writeFile "alphthai.txt" "Thai Characters by Pronunciation\nAR 2007\n"
|
||||
appendFile "alphthai.txt" thaiTableAlph
|
||||
|
||||
|
||||
testThai :: String -> IO ()
|
||||
testThai s = do
|
||||
putStrLn $ encodeUTF8 $ mkThai s
|
||||
putStrLn $ unwords $ map mkPronSyllable $ words s
|
||||
|
||||
testSyllable s =
|
||||
let y = getSyllable $ map mkThaiChar $ unchar s
|
||||
in
|
||||
putStrLn $ pronSyllable $ trace (show y) y
|
||||
|
||||
thaiFile :: FilePath -> Maybe FilePath -> IO ()
|
||||
thaiFile f mo = do
|
||||
s <- readFile f
|
||||
let put = maybe putStr writeFile mo
|
||||
put $ encodeUTF8 $ thaiStrings s
|
||||
|
||||
thaiPronFile :: FilePath -> Maybe FilePath -> IO ()
|
||||
thaiPronFile f mo = do
|
||||
s <- readFile f
|
||||
let put = maybe putStr writeFile mo
|
||||
put $ encodeUTF8 $ thaiPronStrings s
|
||||
|
||||
thaiFakeFile :: FilePath -> Maybe FilePath -> IO ()
|
||||
thaiFakeFile f mo = do
|
||||
s <- readFile f
|
||||
let put = maybe putStr writeFile mo
|
||||
put $ encodeUTF8 $ (convStrings mkThaiFake) s
|
||||
|
||||
finalThai c = maybe "" return (Map.lookup c thaiFinalMap)
|
||||
thaiFinalMap = Map.fromList $ zip allThaiCodes finals
|
||||
|
||||
classThai c = maybe CLow readClass (Map.lookup c thaiClassMap)
|
||||
thaiClassMap = Map.fromList $ zip allThaiCodes heights
|
||||
|
||||
readClass s = case s of
|
||||
'L' -> CLow
|
||||
'M' -> CMid
|
||||
'H' -> CHigh
|
||||
|
||||
|
||||
thaiTable :: String
|
||||
thaiTable = unlines $ ("\n|| hex | thai | trans | pron | fin | class |" ) : [
|
||||
"| " ++
|
||||
hex c ++ " | " ++
|
||||
encodeUTF8 (showThai s) ++ " | " ++
|
||||
s ++ " | " ++
|
||||
pronThai s ++ " | " ++
|
||||
[f] ++ " | " ++
|
||||
[q] ++ " | "
|
||||
|
|
||||
(c,q,f,s) <- zip4 allThaiCodes heights finals allThaiTrans
|
||||
]
|
||||
|
||||
thaiTableAlph :: String
|
||||
thaiTableAlph = unlines $ ("\n|| pron | thai | trans |" ) : [
|
||||
"| " ++ a ++
|
||||
" | " ++ unwords (map (encodeUTF8 . showThai) ss) ++
|
||||
" | " ++ unwords ss ++
|
||||
" |"
|
||||
|
|
||||
(a,ss) <- allProns
|
||||
]
|
||||
where
|
||||
prons = sort $ nub
|
||||
[p | s <- allThaiTrans, let p = pronThai s, not (null p),isAlpha (head p)]
|
||||
allProns =
|
||||
[(a,[s | s <- allThaiTrans, pronThai s == a]) | a <- prons]
|
||||
|
||||
showThai s = case s of
|
||||
"-" -> "-"
|
||||
--- v:_ | elem v "ivu" -> map (toEnum . mkThaiChar) ["O",s]
|
||||
_ -> [toEnum $ mkThaiChar s]
|
||||
|
||||
|
||||
pronThaiChar = pronThai . recodeThai
|
||||
|
||||
recodeThai c = allThaiTrans !! (c - 0x0e00)
|
||||
|
||||
pronThai s = case s of
|
||||
[c,p]
|
||||
| c == 'N' && isDigit p -> [p]
|
||||
| c == 'T' && isDigit p -> ['\'',p]
|
||||
| isDigit p -> c:"h"
|
||||
| p==':' -> c:[c]
|
||||
| elem p "%&" -> c:"y"
|
||||
| p=='+' -> c:"m"
|
||||
| s == "e'" -> "\228\228"
|
||||
| otherwise -> [c]
|
||||
"O" -> "O"
|
||||
"e" -> "ee"
|
||||
[c] | isUpper c -> ""
|
||||
_ -> s
|
||||
|
||||
hex = map hx . reverse . digs where
|
||||
digs 0 = [0]
|
||||
digs n = n `mod` 16 : digs (n `div` 16)
|
||||
hx d = "0123456789ABCDEF" !! d
|
||||
|
||||
heights :: String
|
||||
finals :: String
|
||||
heights =
|
||||
" MHHLLLLMHLLLLMMHLLLMMHLLLMMHHLLLLLL-L-LHHHHLML" ++ replicate 99 ' '
|
||||
finals =
|
||||
" kkkkkkgt-tt-ntttttntttttnpp--pppmyn-n-wttt-n--" ++ replicate 99 ' '
|
||||
Reference in New Issue
Block a user