a script for generating Thai files with pronunciation

This commit is contained in:
aarne
2011-12-03 12:07:46 +00:00
parent 9e1a3a5d60
commit dfce0ef551
4 changed files with 57 additions and 34 deletions

View File

@@ -54,6 +54,7 @@ langsCoding = [
(("spanish", "Spa"),"Romance"),
(("swedish", "Swe"),"Scand"),
(("thai", "Tha"),""),
(("thai", "Thp"),""), -- Thai pronunciation
(("turkish", "Tur"),""),
(("urdu", "Urd"),"Hindustani")
]
@@ -66,7 +67,7 @@ langs = map fst langsCoding
langsLangAll = langs
-- languages that are almost complete and for which Lang is normally compiled
langsLang = langs `except` langsIncomplete ---- []
langsLang = langs `except` (langsIncomplete ++ ["Thp"])
-- languages that have notpresent marked
langsPresent = langsLang `except` ["Lav","Nep","Pes","Tha"]

View File

@@ -1,9 +1,4 @@
-- The only place where literal Thai strings are defined
-- (except for Lexicon and Structural).
-- Convert this into StringsThai by
-- gf
-- > rf -file=thai/src/StringsTha.gf | ps -env=quotes -to_thai | wf -file=thai/StringsTha.gf
-- สุคสันต์วันเกิด!
-- a repository of literal Thai strings
resource StringsTha = {
@@ -11,6 +6,9 @@ flags coding = utf8 ;
oper
-- if Thai is paired with Pronunciation, return the latter
thpron : Str -> Str -> Str = \t,p -> p ;
aphai_s = "อภัย" ; -- excuse2
baan_s = "บ้าน" ; -- house
biar_s = "เบียร์" ; -- beer
@@ -30,7 +28,7 @@ haa_s = "ห้า" ; -- five
hay_s = "ให้" ; -- give
hoog_s = "ห้อง" ; -- room
hok_s = "หก" ; -- six
jai_s = "ใj" ; -- understand2
jai_s = "ใจ" ; -- understand2; the leading vowel ใ needs its consonant จ ("jai")
kaaw_s = "เกา" ; -- nine
kam_s = "กำ" ; -- Progr1
kew_s = "แก้ว" ; -- glass (drink Classif)

View File

@@ -1,4 +1,4 @@
concrete TextTha of Text = CommonX ** {
concrete TextTha of Text = CommonX ** open ResTha in {
-- No punctuation - but make sure to leave spaces between sentences!

View File

@@ -1,37 +1,31 @@
module ThaiScript where
module Main where
import Data.Char
import Data.List
import qualified Data.Map as Map
import System
-- | Path of the tab-separated input read by 'test'.
testFile :: FilePath
testFile = "src/test.txt"

-- | Path of the report that 'test' truncates and the row checkers append to.
resultFile :: FilePath
resultFile = "src/results.txt"
-- | Program entry point: convert all files *Tha.gf into *Thp.gf, with every
-- string literal "t" changed to (thpron "t" "p"). All work is done by
-- 'allThpron'.
main :: IO ()
main = allThpron
-- | Run the pronunciation check over every line of 'testFile'.
--
-- 'resultFile' is first truncated to empty; each input line is then split
-- into fields with 'tabs' and handed to 'testOne', which appends one report
-- row per usable line.
test :: IO ()
test = do
  input <- readFile testFile
  writeFile resultFile ""
  mapM_ (\row -> testOne (tabs row)) (lines input)
-- | Generate the pronunciation variant of every Thai grammar file.
--
-- Shells out to @ls@ to write the names of the @*Tha*.gf@ files in the
-- current directory and in @../api@ to @srcThai.txt@, reads that listing back
-- and runs 'fileThpron' on each listed path. The exit status of the shell
-- command is discarded.
allThpron :: IO ()
allThpron = do
  _ <- System.system "ls *Tha*.gf ../api/*Tha*.gf >srcThai.txt"
  listing <- readFile "srcThai.txt"
  mapM_ fileThpron (lines listing)
-- | Check one row of the test table and append the outcome to 'resultFile'.
--
-- Only rows with at least four fields (@m@, @t@, @p@, @r@) are processed;
-- shorter rows are silently ignored. The @t@ field is split into words,
-- stand-alone commas are dropped, every remaining word is passed through
-- 'thai2pron', and the results are rejoined with @" , "@ separators.
--
-- The first output column is @m@ unchanged when the computed string equals
-- @r@, @m@ with a @+@ appended when it equals @p@, and @m@ with a @-@
-- appended otherwise. The computed string is written as a fifth column.
testOne :: [String] -> IO ()
testOne (m:t:p:r:_) =
  appendFile resultFile (intercalate "\t" [mark, t, p, r, result] ++ "\n")
  where
    result = unwords . intersperse "," . map thai2pron . filter (/= ",") $ words t
    mark
      | result == r = m
      | result == p = m ++ "+"
      | otherwise   = m ++ "-"
testOne _ = return ()
-- | Write the pronunciation variant of a single source file.
--
-- The target path is obtained by running 'appThpron' over the source path
-- itself, and the target contents by running 'appThpron' over the source
-- contents. A @wrote <target>@ line is printed once the file is written.
fileThpron :: FilePath -> IO ()
fileThpron file = do
  let target = appThpron file
  contents <- readFile file
  writeFile target (appThpron contents)
  putStrLn $ "wrote " ++ target
-- | Rewrite one row of the test table, choosing which reference string to
-- keep in the third column, and append it to 'resultFile'.
--
-- Only rows with at least four fields (@m@, @t@, @p@, @r@) are processed;
-- shorter rows are silently ignored. The third output column is @r@ when the
-- mark @m@ is exactly @"+"@ and @p@ otherwise; the other columns are copied
-- through unchanged.
--
-- The original also bound a @result@ transliteration of @t@ that was never
-- used; that dead binding has been removed.
testOneS :: [String] -> IO ()
testOneS (m:t:p:r:_) =
  appendFile resultFile $ concat [m, "\t", t, "\t", pn, "\t", r, "\n"]
  where
    pn
      | m == "+"  = r
      | otherwise = p
testOneS _ = return ()
-- | Rewrite GF source text (or a file name) from its Thai form to its
-- Thai-pronunciation form.
--
-- * Every double-quoted literal @"w"@ is replaced by the output of
--   'mkThpron' for @w@. Escaped quotes are not recognised: a literal ends at
--   the next @"@ character.
-- * Outside literals, every occurrence of the three characters @Tha@ becomes
--   @Thp@. This also applies inside longer words.
-- * All other characters are copied through unchanged.
--
-- An opening quote with no closing quote is copied through as-is and the text
-- after it is still scanned for @Tha@. The original used an irrefutable
-- pattern here and crashed on such input.
appThpron :: String -> String
appThpron s = case s of
  '"':cs -> case break (== '"') cs of
    (w, _:rest) -> mkThpron w ++ appThpron rest
    -- No closing quote: keep the stray quote rather than failing.
    (_, [])     -> '"' : appThpron cs
  'T':'h':'a':rest -> "Thp" ++ appThpron rest
  c:cs -> c : appThpron cs
  _ -> s
-- | Split a line into its tab-separated fields.
--
-- Empty fields that are followed by a tab are dropped, so leading tabs and
-- runs of consecutive tabs collapse. The text after the last tab is always
-- kept, even when it is empty, so the result is never an empty list.
tabs :: String -> [String]
tabs s
  | null rest  = [s]
  | null field = tabs (drop 1 rest)
  | otherwise  = field : tabs (drop 1 rest)
  where
    (field, rest) = break (== '\t') s
-- | Build the GF expression @(thpron "s" "p")@ for a Thai string @s@, where
-- @p@ is whatever 'thai2pron' returns for @s@. Neither string is escaped.
mkThpron :: String -> String
mkThpron s = concat ["(thpron \"", s, "\" \"", thai2pron s, "\")"]
-- heuristics for finding syllables
uniSyllables :: [Int] -> [[Int]]
@@ -311,6 +305,36 @@ allThaiChars = [
TC {unicode = 3673, translit = "N9", cclass = Low, liveness = False, pronunc = "9", pronunc_end = "9"}
]
-- testing with Wikipedia Swadesh list

-- | Path of the tab-separated input read by 'test'.
testFile :: FilePath
testFile = "src/test.txt"

-- | Path of the report that 'test' truncates and the row checkers append to.
resultFile :: FilePath
resultFile = "src/results.txt"
-- | Run the pronunciation check over every line of 'testFile'.
--
-- 'resultFile' is first truncated to empty; each input line is then split
-- into fields with 'tabs' and handed to 'testOne', which appends one report
-- row per usable line.
test :: IO ()
test = do
  input <- readFile testFile
  writeFile resultFile ""
  mapM_ (\row -> testOne (tabs row)) (lines input)
-- | Check one row of the test table and append the outcome to 'resultFile'.
--
-- Only rows with at least four fields (@m@, @t@, @p@, @r@) are processed;
-- shorter rows are silently ignored. The @t@ field is split into words,
-- stand-alone commas are dropped, every remaining word is passed through
-- 'thai2pron', and the results are rejoined with @" , "@ separators.
--
-- The first output column is @m@ unchanged when the computed string equals
-- @r@, @m@ with a @+@ appended when it equals @p@, and @m@ with a @-@
-- appended otherwise. The computed string is written as a fifth column.
testOne :: [String] -> IO ()
testOne (m:t:p:r:_) =
  appendFile resultFile (intercalate "\t" [mark, t, p, r, result] ++ "\n")
  where
    result = unwords . intersperse "," . map thai2pron . filter (/= ",") $ words t
    mark
      | result == r = m
      | result == p = m ++ "+"
      | otherwise   = m ++ "-"
testOne _ = return ()
-- | Rewrite one row of the test table, choosing which reference string to
-- keep in the third column, and append it to 'resultFile'.
--
-- Only rows with at least four fields (@m@, @t@, @p@, @r@) are processed;
-- shorter rows are silently ignored. The third output column is @r@ when the
-- mark @m@ is exactly @"+"@ and @p@ otherwise; the other columns are copied
-- through unchanged.
--
-- The original also bound a @result@ transliteration of @t@ that was never
-- used; that dead binding has been removed.
testOneS :: [String] -> IO ()
testOneS (m:t:p:r:_) =
  appendFile resultFile $ concat [m, "\t", t, "\t", pn, "\t", r, "\n"]
  where
    pn
      | m == "+"  = r
      | otherwise = p
testOneS _ = return ()
-- | Split a line into its tab-separated fields.
--
-- Empty fields that are followed by a tab are dropped, so leading tabs and
-- runs of consecutive tabs collapse. The text after the last tab is always
-- kept, even when it is empty, so the result is never an empty list.
tabs :: String -> [String]
tabs s
  | null rest  = [s]
  | null field = tabs (drop 1 rest)
  | otherwise  = field : tabs (drop 1 rest)
  where
    (field, rest) = break (== '\t') s
{-