some bug fixes in Thai pronunciation generation

2011-11-09 21:30:36 +00:00
parent 7afbad405d
commit 641aef304d
1 changed files with 29 additions and 181 deletions
@@ -55,39 +55,38 @@ uni2thai = map toEnum

 uni2pron :: [Int] -> String
 uni2pron is = case is of
-  0xe40:c:0xe35:0xe22:cs -> pron c ++ tone c cs "iia" ++ uni2pron cs
+  0xe40:c:0xe32      :cs -> pron c ++ tone c cs "aw"       ++ uni2pron cs
+  0xe40:c:0xe34      :cs -> pron c ++ tone c cs "\601\601" ++ uni2pron cs
+  0xe40:c:0xe35:0xe22:cs -> pron c ++ tone c cs "iia"      ++ uni2pron cs
  0xe40:c:0xe37:0xe2d:cs -> pron c ++ tone c cs "\649\649" ++ uni2pron cs
-  0xe40:c:0xe32:cs -> pron c ++ tone c cs "aw" ++ uni2pron cs
-  0xe40:c:0xe34:cs -> pron c ++ tone c cs "\601\601" ++ uni2pron cs
-  0xe40:c:0xe47:cs -> pron c ++ tone c cs "e" ++ uni2pron cs
-  0xe40:c:cs -> pron c ++ tone c cs "ee" ++ uni2pron cs
+  0xe40:c:0xe47      :cs -> pron c ++ tone c cs "e"        ++ uni2pron cs
+  0xe41:c:0xe47      :cs -> pron c ++ tone c cs "\x25b"    ++ uni2pron cs

-  0xe41:c:0xe47:cs -> pron c ++ tone c cs "\x25b" ++ uni2pron cs
-  0xe41:c:cs -> pron c ++ tone c cs "\x25b\x25b" ++ uni2pron cs
+  v:0xe2b:c:cs | isConsonant c && bvow v  
+                         -> pron c ++ tone 0xe2b cs (pron v) ++ uni2pron cs  -- h-
+  v:b:c:cs | clust b c && bvow v                                             -- kr- etc
+                         -> pron b ++ pron c ++ tone c cs (pron v) ++ uni2pron cs
+  v:c:cs | bvow v        -> pron c ++ tone c cs (pron v)   ++ uni2pron cs  -- e .. ay

-  0xe42:c:cs -> pron c ++ tone c cs "o:" ++ uni2pron cs
-  0xe43:c:cs -> pron c ++ tone c cs "ay" ++ uni2pron cs
-  0xe44:c:cs -> pron c ++ tone c cs "ay" ++ uni2pron cs
+  c:0xe31:0xe27:cs       -> pron c ++ tone c cs "uua"      ++ uni2pron cs

-  c:0xe30:cs -> pron c ++ tone c cs "a"  ++ uni2pron cs
-  c:0xe31:0xe27:cs -> pron c ++ tone c cs "uua" ++ uni2pron cs
-  c:0xe31:cs -> pron c ++ tone c cs "a"  ++ uni2pron cs
-  c:0xe32:cs -> pron c ++ tone c cs "aa" ++ uni2pron cs
-  c:0xe33:cs -> pron c ++ tone c cs "am" ++ uni2pron cs
-  c:0xe34:cs -> pron c ++ tone c cs "i"  ++ uni2pron cs
-  c:0xe35:cs -> pron c ++ tone c cs "ii" ++ uni2pron cs
-  c:0xe36:cs -> pron c ++ tone c cs "\649" ++ uni2pron cs
-  c:0xe37:cs -> pron c ++ tone c cs "\649\649" ++ uni2pron cs
-  c:0xe38:cs -> pron c ++ tone c cs "u"  ++ uni2pron cs
-  c:0xe39:cs -> pron c ++ tone c cs "uu" ++ uni2pron cs
+  0xe2b:c:v:cs | isConsonant c && cvow v  
+                         -> pron c ++ tone 0xe2b cs (pron v) ++ uni2pron cs  -- h-
+  b:c:v:cs | clust b c && cvow v                                             -- kr- etc
+                         -> pron b ++ pron c ++ tone c cs (pron v) ++ uni2pron cs
+  0xe2d:v:cs   | cvow v  ->           tone 0xe2d cs (pron v) ++ uni2pron cs  -- O-
+  c:v:cs       | cvow v  -> pron c ++ tone c     cs (pron v) ++ uni2pron cs  -- a .. u:

  [c] -> enc c
-  c:cs -> pron c ++ uni2pron cs
+  c:cs -> pron c ++ uni2pron cs  --- shouldn't happen if syllabified ??
  [] -> []
 where
   enc  c = lookThai [] pronunc_end c
   pron c = lookThai [] pronunc c
-
+   cvow v = (0xe30 <= v && v <= 0xe39) || v == 0xe2d -- central vowels
+   bvow v = 0xe40 <= v && v <= 0xe44  -- begin vowels
+   clust b c = isConsonant b && (elem c [0xe23, 0xe25])
+ 
 tone :: Int -> [Int] -> String -> String
 tone c cs v = case (lookThai Low cclass c, isLive cs, toneMark (c:cs)) of
  (_,_,3) -> high v
@@ -136,24 +135,6 @@ falling = accent '\x302'
 accent a s = case s of
  c:cs -> c:a:cs
  _ -> s
- 
-{-
-high = toneMap "á" "é" "í" "ó" "ú" "ǘ" "ä'" "ö'"
-low  = toneMap "à" "è" "ì" "ò" "ù" "ǜ" "ä`" "ö`"
-rising  = toneMap "ã" "ẽ" "ĩ" "õ" "ũ" "ü~" "ä~" "ö~"
-falling  = toneMap "â" "ê" "î" "ô" "û" "ü^" "ä^" "ö^"
-
-toneMap a e i o u ue ae oe s = case s of
-  'a':cs -> a++cs
-  'e':cs -> e++cs
-  'i':cs -> i++cs
-  'o':cs -> o++cs
-  'u':cs -> u++cs
-  'ü':cs -> ue++cs
-  'ä':cs -> ae++cs
-  'ö':cs -> oe++cs
-  _ -> s
-}

 lookThai :: a -> (ThaiChar -> a) -> Int -> a
 lookThai v f i = maybe v f (Map.lookup i thaiMap)
@@ -205,13 +186,13 @@ allThaiChars = [
  TC {unicode = 3595, translit = "s'", cclass = Low, liveness = False, pronunc = "s", pronunc_end = "t"},
  TC {unicode = 3596, translit = "c3", cclass = Low, liveness = False, pronunc = "ch", pronunc_end = "t"},
  TC {unicode = 3597, translit = "y'", cclass = Low, liveness = False, pronunc = "y", pronunc_end = "n"},
-  TC {unicode = 3598, translit = "d'", cclass = Mid, liveness = False, pronunc = "d", pronunc_end = "d'"},
-  TC {unicode = 3599, translit = "t'", cclass = Mid, liveness = False, pronunc = "t'", pronunc_end = "t'"},
+  TC {unicode = 3598, translit = "d'", cclass = Mid, liveness = False, pronunc = "d", pronunc_end = "t"},
+  TC {unicode = 3599, translit = "t'", cclass = Mid, liveness = False, pronunc = "t", pronunc_end = "t"},
  TC {unicode = 3600, translit = "t1", cclass = High, liveness = False, pronunc = "th", pronunc_end = "t"},
  TC {unicode = 3601, translit = "t2", cclass = Low, liveness = False, pronunc = "th", pronunc_end = "t"},
  TC {unicode = 3602, translit = "t3", cclass = Low, liveness = False, pronunc = "th", pronunc_end = "t"},
  TC {unicode = 3603, translit = "n'", cclass = Low, liveness = True, pronunc = "n", pronunc_end = "n"},
-  TC {unicode = 3604, translit = "d", cclass = Mid, liveness = False, pronunc = "d", pronunc_end = "d"},
+  TC {unicode = 3604, translit = "d", cclass = Mid, liveness = False, pronunc = "d", pronunc_end = "t"},
  TC {unicode = 3605, translit = "t", cclass = Mid, liveness = False, pronunc = "t", pronunc_end = "t"},
  TC {unicode = 3606, translit = "t4", cclass = High, liveness = False, pronunc = "th", pronunc_end = "t"},
  TC {unicode = 3607, translit = "t5", cclass = Low, liveness = False, pronunc = "th", pronunc_end = "t"},
@@ -229,8 +210,8 @@ allThaiChars = [
  TC {unicode = 3619, translit = "r", cclass = Low, liveness = True, pronunc = "r", pronunc_end = "n"},
  TC {unicode = 3621, translit = "l", cclass = Low, liveness = True, pronunc = "l", pronunc_end = "n"},
  TC {unicode = 3623, translit = "w", cclass = Low, liveness = True, pronunc = "w", pronunc_end = "w"},
-  TC {unicode = 3624, translit = "s-", cclass = High, liveness = False, pronunc = "sh", pronunc_end = "t"},
-  TC {unicode = 3625, translit = "s.", cclass = High, liveness = False, pronunc = "sh", pronunc_end = "t"},
+  TC {unicode = 3624, translit = "s-", cclass = High, liveness = False, pronunc = "s", pronunc_end = "t"},
+  TC {unicode = 3625, translit = "s.", cclass = High, liveness = False, pronunc = "s", pronunc_end = "t"},
  TC {unicode = 3626, translit = "s", cclass = High, liveness = False, pronunc = "s", pronunc_end = "t"},
  TC {unicode = 3627, translit = "h", cclass = High, liveness = True, pronunc = "h", pronunc_end = ""},
  TC {unicode = 3628, translit = "l'", cclass = Low, liveness = True, pronunc = "l", pronunc_end = "n"},
@@ -250,8 +231,8 @@ allThaiChars = [
  TC {unicode = 3648, translit = "e", cclass = Low, liveness = True, pronunc = "ee", pronunc_end = "ee"},
  TC {unicode = 3649, translit = "e'", cclass = Low, liveness = True, pronunc = "\x25b\x25b", pronunc_end = "\x25b\x25b"},  -- both forms are a doubled U+025B (open e)
  TC {unicode = 3650, translit = "o:", cclass = Low, liveness = True, pronunc = "oo", pronunc_end = "oo"},
-  TC {unicode = 3651, translit = "a%", cclass = Low, liveness = True, pronunc = "ai", pronunc_end = "ai"},
-  TC {unicode = 3652, translit = "a&", cclass = Low, liveness = True, pronunc = "ai", pronunc_end = "ai"},
+  TC {unicode = 3651, translit = "a%", cclass = Low, liveness = True, pronunc = "ay", pronunc_end = "ay"},
+  TC {unicode = 3652, translit = "a&", cclass = Low, liveness = True, pronunc = "ay", pronunc_end = "ay"},
  TC {unicode = 3653, translit = "L", cclass = Low, liveness = True, pronunc = "l", pronunc_end = "n"},
  TC {unicode = 3654, translit = "R", cclass = Low, liveness = True, pronunc = "r", pronunc_end = "n"},
  TC {unicode = 3655, translit = "S", cclass = Low, liveness = True, pronunc = "", pronunc_end = ""},
@@ -272,136 +253,3 @@ allThaiChars = [
  TC {unicode = 3673, translit = "N9", cclass = Low, liveness = False, pronunc = "9", pronunc_end = "9"}
 ]

-
-
--[TC u t Low False t t | 
- -- (u,t) <- Map.toList (trans_from_unicode transThai)]
-
-pronChar :: Int -> String 
-pronChar i = show i
-
-
-
-
-data Transliteration = Trans {
-  trans_to_unicode   :: Map.Map String Int,
-  trans_from_unicode :: Map.Map Int String,
-  invisible_chars    :: [String],
-  printname          :: String
-  }
-
-appTransToUnicode :: Transliteration -> String -> String
-appTransToUnicode trans = 
-  concat .
-  map (\c -> maybe c (return . toEnum) $
-             Map.lookup c (trans_to_unicode trans)
-      ) . 
-  filter (flip notElem (invisible_chars trans)) . 
-  unchar
-
-appTransFromUnicode :: Transliteration -> String -> String
-appTransFromUnicode trans = 
-  concat .
-  map (\c -> maybe [toEnum c] id $ 
-             Map.lookup c (trans_from_unicode trans)
-      ) . 
-  map fromEnum
-
-
-mkTransliteration :: String -> [String] -> [Int] -> Transliteration
-mkTransliteration name ts us = 
- Trans (Map.fromList (tzip ts us)) (Map.fromList (uzip us ts)) [] name
-  where
-    tzip ts us = [(t,u) | (t,u) <- zip ts us, t /= "-"]
-    uzip us ts = [(u,t) | (u,t) <- zip us ts, t /= "-"]
-
-
-transThai :: Transliteration
-transThai = mkTransliteration "Thai" allTrans allCodes where
-  allTrans = words $
-    "-  k  k1 -  k2 -  k3 g  c  c1 c2 s' c3 y' d' t' " ++
-    "t1 t2 t3 n' d  t  t4 t5 t6 n  b  p  p1 f  p2 f' " ++
-    "p3 m  y  r  -  l  -  w  s- s. s  h  l' O  h' -  " ++
-    "a. a  a: a+ i  i: v  v: u  u: -  -  -  -  -  -  " ++
-    "e  e' o: a% a& L  R  S  T1 T2 T3 T4 K  -  -  -  " ++
-    "N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 -  -  -  -  -  -  "
-  allCodes = [0x0e00 .. 0x0e7f]
-
-{-
-| e01 | ก | k | M
-| e02 | ข | k1 | H
-| e04 | ค | k2 |
-| e06 | ฆ | k3 |
-| e07 | ง | g |
-| e08 | จ | c | M
-| e09 | ฉ | c1 | H
-| e0a | ช | c2 |
-| e0b | ซ | s' |
-| e0c | ฌ | c3 |
-| e0d | ญ | y' |
-| e0e | ฎ | d' | M
-| e0f | ฏ | t' | M
-| e10 | ฐ | t1 | H
-| e11 | ฑ | t2 |
-| e12 | ฒ | t3 |
-| e13 | ณ | n' |
-| e14 | ด | d | M
-| e15 | ต | t | M
-| e16 | ถ | t4 | H
-| e17 | ท | t5 |
-| e18 | ธ | t6 |
-| e19 | น | n |
-| e1a | บ | b | M
-| e1b | ป | p | M
-| e1c | ผ | p1 | H
-| e1d | ฝ | f | H
-| e1e | พ | p2 |
-| e1f | ฟ | f' |
-| e20 | ภ | p3 |
-| e21 | ม | m |
-| e22 | ย | y |
-| e23 | ร | r |
-| e25 | ล | l |
-| e27 | ว | w |
-| e28 | ศ | s- | H
-| e29 | ษ | s. | H
-| e2a | ส | s | H
-| e2b | ห | h | H
-| e2c | ฬ | l' |
-| e2d | อ | O | M
-| e2e | ฮ | h' |
-
-| e30 | ะ | a. |
-| e31 | ั | a |
-| e32 | า | a: |
-| e33 | ำ | a+ |
-| e34 | ิ | i |
-| e35 | ี | i: |
-| e36 | ึ | v |
-| e37 | ื | v: |
-| e38 | ุ | u |
-| e39 | ู | u: |
-| e40 | เ | e |
-| e41 | แ | e' |
-| e42 | โ | o: |
-| e43 | ใ | a% |
-| e44 | ไ | a& |
-| e45 | ๅ | L |
-| e46 | ๆ | R |
-| e47 | ็ | S |
-| e48 | ่ | T1 |
-| e49 | ้ | T2 |
-| e4a | ๊ | T3 |
-| e4b | ๋ | T4 |
-| e4c | ์ | K |
-| e50 | ๐ | N0 |
-| e51 | ๑ | N1 |
-| e52 | ๒ | N2 |
-| e53 | ๓ | N3 |
-| e54 | ๔ | N4 |
-| e55 | ๕ | N5 |
-| e56 | ๖ | N6 |
-| e57 | ๗ | N7 |
-| e58 | ๘ | N8 |
-| e59 | ๙ | N9 |
-}