removed src for 2.9

This commit is contained in:
aarne
2008-06-25 16:43:48 +00:00
parent 7e0c2c3656
commit 7d721eb16e
536 changed files with 0 additions and 127076 deletions

View File

@@ -1,63 +0,0 @@
----------------------------------------------------------------------
-- |
-- Module : Arabic
-- Maintainer : (Maintainer)
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:23:34 $
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.6 $
--
-- (Description of the module)
-----------------------------------------------------------------------------
module GF.Text.Arabic (mkArabic) where
mkArabic :: String -> String
mkArabic = unwords . (map mkArabicWord) . words
----mkArabic = reverse . unwords . (map mkArabicWord) . words
--- reverse : assumes everything's on same line
type ArabicChar = Char
mkArabicWord :: String -> [ArabicChar]
mkArabicWord = map mkArabicChar . getLetterPos
getLetterPos :: String -> [(Char,Int)]
getLetterPos [] = []
getLetterPos ('I':cs) = ('*',7) : getLetterPos cs -- 0xfe80
getLetterPos ('O':cs) = ('*',8) : getIn cs -- 0xfe8b
getLetterPos ('l':'a':cs) = ('*',5) : getLetterPos cs -- 0xfefb
getLetterPos [c] = [(c,1)] -- 1=isolated
getLetterPos (c:cs) | isReduced c = (c,1) : getLetterPos cs
getLetterPos (c:cs) = (c,3) : getIn cs -- 3=initial
getIn [] = []
getIn ('I':cs) = ('*',7) : getLetterPos cs -- 0xfe80
getIn ('O':cs) = ('*',9) : getIn cs -- 0xfe8c
getIn ('l':'a':cs) = ('*',6) : getLetterPos cs -- 0xfefc
getIn [c] = [(c,2)] -- 2=final
getIn (c:cs) | isReduced c = (c,2) : getLetterPos cs
getIn (c:cs) = (c,4) : getIn cs -- 4=medial
isReduced :: Char -> Bool
isReduced c = c `elem` "UuWiYOaAdVrzwj"
mkArabicChar ('*',p) | p > 4 && p < 10 =
(map toEnum [0xfefb,0xfefc,0xfe80,0xfe8b,0xfe8c]) !! (p-5)
mkArabicChar cp@(c,p) = case lookup c cc of Just c' -> (c' !! (p-1)) ; _ -> c
where
cc = mkArabicTab allArabicCodes allArabic
mkArabicTab (c:cs) as = (c,as1) : mkArabicTab cs as2 where
(as1,as2) = if isReduced c then splitAt 2 as else splitAt 4 as
mkArabicTab [] _ = []
allArabicCodes = "UuWiYOabAtvgHCdVrzscSDTZoxfqklmnhwjy"
allArabic :: String
allArabic = (map toEnum [0xfe81 .. 0xfef4]) -- I=0xfe80

View File

@@ -1,97 +0,0 @@
----------------------------------------------------------------------
-- |
-- Module : Devanagari
-- Maintainer : (Maintainer)
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:23:34 $
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.5 $
--
-- (Description of the module)
-----------------------------------------------------------------------------
module GF.Text.Devanagari (mkDevanagari) where
mkDevanagari :: String -> String
mkDevanagari = digraphWordToUnicode . adHocToDigraphWord
adHocToDigraphWord :: String -> [(Char, Char)]
adHocToDigraphWord str = case str of
[] -> []
'<' : cs -> ('\\', '<') : spoolMarkup cs
' ' : cs -> ('\\', ' ') : adHocToDigraphWord cs -- skip space
-- if c1 is a vowel
-- Two of the same vowel => lengthening
c1 : c2 : cs | c1 == c2 && isVowel c1 -> (cap c1, ':') : adHocToDigraphWord cs
-- digraphed or long vowel
c1 : c2 : cs | isVowel c1 && isVowel c2 -> (cap c1, cap c2) : adHocToDigraphWord cs
c1 : cs | isVowel c1 -> (' ', cap c1) : adHocToDigraphWord cs
-- c1 isn't a vowel
-- c1 : 'a' : [] -> [(' ', c1)] -- a inherent
-- c1 : c2 : [] | isVowel c2 -> (' ', c1) : [(' ', c2)]
-- c1 is aspirated
c1 : 'H' : c2 : c3 : cs | c2 == c3 && isVowel c2 ->
(c1, 'H') : (c2, ':') : adHocToDigraphWord cs
c1 : 'H' : c2 : c3 : cs | isVowel c2 && isVowel c3 ->
(c1, 'H') : (c2, c3) : adHocToDigraphWord cs
c1 : 'H' : 'a' : cs -> (c1, 'H') : adHocToDigraphWord cs -- a inherent
c1 : 'H' : c2 : cs | isVowel c2 -> (c1, 'H') : (' ', c2) : adHocToDigraphWord cs
-- not vowelless at EOW
c1 : 'H' : ' ' : cs -> (c1, 'H') : ('\\', ' ') : adHocToDigraphWord cs
c1 : 'H' : [] -> [(c1, 'H')]
c1 : 'H' : cs -> (c1, 'H') : (' ', '^') : adHocToDigraphWord cs -- vowelless
-- c1 unasp.
c1 : c2 : c3 : cs | c2 == c3 && isVowel c2 -> (' ', c1) : (c2, ':') : adHocToDigraphWord cs
c1 : c2 : c3 : cs | isVowel c2 && isVowel c3 -> (' ', c1) : (c2, c3) : adHocToDigraphWord cs
c1 : 'a' : cs -> (' ', c1) : adHocToDigraphWord cs -- a inherent
c1 : c2 : cs | isVowel c2 -> (' ', c1) : (' ', c2) : adHocToDigraphWord cs
-- not vowelless at EOW
c1 : ' ' : cs -> (' ', c1) : ('\\', ' '): adHocToDigraphWord cs
c1 : [] -> [(' ', c1)]
'M' : cs -> (' ', 'M') : adHocToDigraphWord cs -- vowelless but no vowelless sign for anusvara
c1 : cs -> (' ', c1) : (' ', '^') : adHocToDigraphWord cs -- vowelless
isVowel x = elem x "aeiou:"
cap :: Char -> Char
cap x = case x of
'a' -> 'A'
'e' -> 'E'
'i' -> 'I'
'o' -> 'O'
'u' -> 'U'
c -> c
spoolMarkup :: String -> [(Char, Char)]
spoolMarkup s = case s of
-- [] -> [] -- Shouldn't happen
'>' : cs -> ('\\', '>') : adHocToDigraphWord cs
c1 : cs -> ('\\', c1) : spoolMarkup cs
digraphWordToUnicode :: [(Char, Char)] -> String
digraphWordToUnicode = map digraphToUnicode
digraphToUnicode :: (Char, Char) -> Char
digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2
where
cc = zip allDevanagariCodes allDevanagari
digraphedDevanagari = " ~ M ;__ AA: II: UU:RoLoEvE~ EE:AvA~ OAU kkH ggHNG ccH jjH \241 TTH DDH N ttH ddH nn. ppH bbH m y rr. l LL. v \231 S s h____ .-Sa: ii: uu:ror:eve~ eaiava~ oau ^____OM | -dddu______ Q X G zD.RH fy.R:L:mrmR#I#d#0#1#2#3#4#5#6#7#8#9#o"
allDevanagariCodes :: [(Char, Char)]
allDevanagariCodes = mkPairs digraphedDevanagari
allDevanagari :: String
allDevanagari = (map toEnum [0x0901 .. 0x0970])
mkPairs :: String -> [(Char, Char)]
mkPairs str = case str of
[] -> []
c1 : c2 : cs -> (c1, c2) : mkPairs cs

View File

@@ -1,72 +0,0 @@
----------------------------------------------------------------------
-- |
-- Module : Ethiopic
-- Maintainer : HH
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:23:35 $
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.6 $
--
-- Ascii-Unicode decoding for Ethiopian.
-- Copyright (c) Harald Hammarström 2003 under Gnu General Public License
-----------------------------------------------------------------------------
module GF.Text.Ethiopic (mkEthiopic) where
mkEthiopic :: String -> String
mkEthiopic = digraphWordToUnicode . adHocToDigraphWord
-- mkEthiopic :: String -> String
-- mkEthiopic = reverse . unwords . (map (digraphWordToUnicode . adHocToDigraphWord)) . words
--- reverse : assumes everything's on same line
adHocToDigraphWord :: String -> [(Char, Int)]
adHocToDigraphWord str = case str of
[] -> []
'<' : cs -> ('<', -1) : spoolMarkup cs
c1 : cs | isVowel c1 -> (')', vowelOrder c1) : adHocToDigraphWord cs
-- c1 isn't a vowel
c1 : cs | not (elem c1 allEthiopicCodes) -> (c1, -1) : adHocToDigraphWord cs
c1 : c2 : cs | isVowel c2 -> (c1, vowelOrder c2) : adHocToDigraphWord cs
c1 : cs -> (c1, 5) : adHocToDigraphWord cs
spoolMarkup :: String -> [(Char, Int)]
spoolMarkup s = case s of
-- [] -> [] -- Shouldn't happen
'>' : cs -> ('>', -1) : adHocToDigraphWord cs
c1 : cs -> (c1, -1) : spoolMarkup cs
isVowel x = elem x "A\228ui\239aeoI"
vowelOrder :: Char -> Int
vowelOrder x = case x of
'A' -> 0
'\228' -> 0 -- ä
'u' -> 1
'i' -> 2
'a' -> 3
'e' -> 4
'I' -> 5
'\239' -> 5 -- ï
'o' -> 6
c -> 5 -- vowelless
digraphWordToUnicode = map digraphToUnicode
digraphToUnicode :: (Char, Int) -> Char
-- digraphToUnicode (c1, c2) = c1
digraphToUnicode (c1, -1) = c1
digraphToUnicode (c1, c2) = toEnum (0x1200 + c2 + 8*case lookup c1 cc of Just c' -> c')
where
cc = zip allEthiopicCodes allEthiopic
allEthiopic :: [Int]
allEthiopic = [0 .. 44] -- x 8
allEthiopicCodes = "hlHmLrs$KQ__bBtcxXnN)kW__w(zZyd_jgG_TCPSLfp"
-- Q = kW, X = xW, W = kW, G = gW

View File

@@ -1,99 +0,0 @@
----------------------------------------------------------------------
-- |
-- Module : ExtendedArabic
-- Maintainer : (Maintainer)
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:23:36 $
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.6 $
--
-- (Description of the module)
-----------------------------------------------------------------------------
module GF.Text.ExtendedArabic (mkArabic0600, mkExtendedArabic) where
mkArabic0600 :: String -> String
mkArabic0600 = digraphWordToUnicode . aarnesToDigraphWord
aarnesToDigraphWord :: String -> [(Char, Char)]
aarnesToDigraphWord str = case str of
[] -> []
'<' : cs -> ('\\', '<') : spoolMarkup2 cs
'v' : cs -> ('T', 'H') : aarnesToDigraphWord cs
'a' : cs -> (' ', 'A') : aarnesToDigraphWord cs
'o' : cs -> (' ', '3') : aarnesToDigraphWord cs
'O' : cs -> ('\'', 'i') : aarnesToDigraphWord cs
'u' : cs -> ('\'', 'A') : aarnesToDigraphWord cs
'C' : cs -> (' ', 'X') : aarnesToDigraphWord cs
'U' : cs -> ('~', 'A') : aarnesToDigraphWord cs
'A' : cs -> ('"', 't') : aarnesToDigraphWord cs
'c' : cs -> ('s', 'h') : aarnesToDigraphWord cs
c : cs -> (' ', c) : aarnesToDigraphWord cs
mkExtendedArabic :: String -> String
mkExtendedArabic = digraphWordToUnicode . adHocToDigraphWord
adHocToDigraphWord :: String -> [(Char, Char)]
adHocToDigraphWord str = case str of
[] -> []
'<' : cs -> ('\\', '<') : spoolMarkup cs
-- Sorani
'W' : cs -> (':', 'w') : adHocToDigraphWord cs -- ?? Will do
'E' : cs -> (' ', 'i') : adHocToDigraphWord cs -- ?? Letter missing!
'j' : cs -> ('d', 'j') : adHocToDigraphWord cs
'O' : cs -> ('v', 'w') : adHocToDigraphWord cs
'F' : cs -> (' ', 'v') : adHocToDigraphWord cs
'Z' : cs -> ('z', 'h') : adHocToDigraphWord cs
'I' : cs -> (' ', 'i') : adHocToDigraphWord cs -- ?? Letter missing!
'C' : cs -> ('c', 'h') : adHocToDigraphWord cs
-- Pashto
'e' : cs -> (':', 'y') : adHocToDigraphWord cs
'$' : cs -> ('3', 'H') : adHocToDigraphWord cs
'X' : cs -> ('s', '.') : adHocToDigraphWord cs
'G' : cs -> ('z', '.') : adHocToDigraphWord cs
'a' : cs -> (' ', 'A') : adHocToDigraphWord cs
'P' : cs -> ('\'', 'H') : adHocToDigraphWord cs
'R' : cs -> ('o', 'r') : adHocToDigraphWord cs
-- Shared
'A' : cs -> (' ', 'h') : adHocToDigraphWord cs -- ?? Maybe to "t or 0x06d5
'c' : cs -> ('s', 'h') : adHocToDigraphWord cs
c : cs -> (' ', c) : adHocToDigraphWord cs
-- Beginning 0x621 up and including 0x06d1
digraphedExtendedArabic = " '~A'A'w,A'i A b\"t tTHdj H X dDH r z ssh S D T Z 3GH__________ - f q k l m n h w i y&a&w&i/a/w/i/W/o/~/'/,/|/6/v_____________#0#1#2#3#4#5#6#7#8#9#%#,#'#*>b>q$|> A2'2,3'A'w'w&y'Tb:b:BoT3b p4b4B'H:H2H\"H3Hch4HTdod.dTD:d:D3d3D4dTrvror.rvRz.:rzh4zs.+s*S:S3S3T33>ff.f: v4f.q3q-k~kok.k3k3K gog:g:G3Gvl.l3l3L:n>nTnon3n?h4H't>Y\"Yow-wvwww|w^w:w3w>y/yvy.w:y3y____ -ae"
digraphWordToUnicode = map digraphToUnicode
digraphToUnicode :: (Char, Char) -> Char
digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2
where
cc = zip allExtendedArabicCodes allExtendedArabic
allExtendedArabicCodes :: [(Char, Char)]
allExtendedArabicCodes = mkPairs digraphedExtendedArabic
allExtendedArabic :: String
allExtendedArabic = (map toEnum [0x0621 .. 0x06d1])
mkPairs :: String -> [(Char, Char)]
mkPairs str = case str of
[] -> []
c1 : c2 : cs -> (c1, c2) : mkPairs cs
spoolMarkup :: String -> [(Char, Char)]
spoolMarkup s = case s of
[] -> [] -- Shouldn't happen
'>' : cs -> ('\\', '>') : adHocToDigraphWord cs
c1 : cs -> ('\\', c1) : spoolMarkup cs
spoolMarkup2 :: String -> [(Char, Char)]
spoolMarkup2 s = case s of
[] -> [] -- Shouldn't happen
'>' : cs -> ('\\', '>') : aarnesToDigraphWord cs
c1 : cs -> ('\\', c1) : spoolMarkup2 cs

View File

@@ -1,37 +0,0 @@
----------------------------------------------------------------------
-- |
-- Module : ExtraDiacritics
-- Maintainer : (Maintainer)
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:23:36 $
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.5 $
--
-- (Description of the module)
-----------------------------------------------------------------------------
module GF.Text.ExtraDiacritics (mkExtraDiacritics) where
mkExtraDiacritics :: String -> String
mkExtraDiacritics = mkExtraDiacriticsWord
mkExtraDiacriticsWord :: String -> String
mkExtraDiacriticsWord str = case str of
[] -> []
'<' : cs -> '<' : spoolMarkup cs
--
'/' : cs -> toEnum 0x0301 : mkExtraDiacriticsWord cs
'~' : cs -> toEnum 0x0306 : mkExtraDiacriticsWord cs
':' : cs -> toEnum 0x0304 : mkExtraDiacriticsWord cs -- some of these could be put in LatinA
'.' : cs -> toEnum 0x0323 : mkExtraDiacriticsWord cs
'i' : '-' : cs -> toEnum 0x0268 : mkExtraDiacriticsWord cs -- in IPA extensions
-- Default
c : cs -> c : mkExtraDiacriticsWord cs
spoolMarkup :: String -> String
spoolMarkup s = case s of
[] -> [] -- Shouldn't happen
'>' : cs -> '>' : mkExtraDiacriticsWord cs
c1 : cs -> c1 : spoolMarkup cs

View File

@@ -1,172 +0,0 @@
----------------------------------------------------------------------
-- |
-- Module : Greek
-- Maintainer : (Maintainer)
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:23:37 $
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.5 $
--
-- (Description of the module)
-----------------------------------------------------------------------------
module GF.Text.Greek (mkGreek) where
mkGreek :: String -> String
mkGreek = unwords . (map mkGreekWord) . mkGravis . words
--- TODO : optimize character formation by factorizing the case expressions
type GreekChar = Char
mkGreekWord :: String -> [GreekChar]
mkGreekWord = map (toEnum . mkGreekChar) . mkGreekSpec
mkGravis :: [String] -> [String]
mkGravis [] = []
mkGravis [w] = [w]
mkGravis (w1:w2:ws)
| stressed w2 = mkG w1 : mkGravis (w2:ws)
| otherwise = w1 : w2 : mkGravis ws
where
stressed w = any (`elem` "'~`") w
mkG :: String -> String
mkG w = let (w1,w2) = span (/='\'') w in
case w2 of
'\'':v:cs | not (any isVowel cs) -> w1 ++ "`" ++ [v] ++ cs
'\'':'!':v:cs | not (any isVowel cs) -> w1 ++ "`!" ++ [v] ++ cs
_ -> w
isVowel c = elem c "aehiouw"
mkGreekSpec :: String -> [(Char,Int)]
mkGreekSpec str = case str of
[] -> []
'(' :'\'': '!' : c : cs -> (c,25) : mkGreekSpec cs
'(' :'~' : '!' : c : cs -> (c,27) : mkGreekSpec cs
'(' :'`' : '!' : c : cs -> (c,23) : mkGreekSpec cs
'(' : '!' : c : cs -> (c,21) : mkGreekSpec cs
')' :'\'': '!' : c : cs -> (c,24) : mkGreekSpec cs
')' :'~' : '!' : c : cs -> (c,26) : mkGreekSpec cs
')' :'`' : '!' : c : cs -> (c,22) : mkGreekSpec cs
')' : '!' : c : cs -> (c,20) : mkGreekSpec cs
'\'': '!' : c : cs -> (c,30) : mkGreekSpec cs
'~' : '!' : c : cs -> (c,31) : mkGreekSpec cs
'`' : '!' : c : cs -> (c,32) : mkGreekSpec cs
'!' : c : cs -> (c,33) : mkGreekSpec cs
'(' :'\'': c : cs -> (c,5) : mkGreekSpec cs
'(' :'~' : c : cs -> (c,7) : mkGreekSpec cs
'(' :'`' : c : cs -> (c,3) : mkGreekSpec cs
'(' : c : cs -> (c,1) : mkGreekSpec cs
')' :'\'': c : cs -> (c,4) : mkGreekSpec cs
')' :'~' : c : cs -> (c,6) : mkGreekSpec cs
')' :'`' : c : cs -> (c,2) : mkGreekSpec cs
')' : c : cs -> (c,0) : mkGreekSpec cs
'\'': c : cs -> (c,10) : mkGreekSpec cs
'~' : c : cs -> (c,11) : mkGreekSpec cs
'`' : c : cs -> (c,12) : mkGreekSpec cs
c : cs -> (c,-1) : mkGreekSpec cs
mkGreekChar (c,-1) = case lookup c cc of Just c' -> c' ; _ -> fromEnum c
where
cc = zip "abgdezhqiklmnxoprjstyfcuw" allGreekMin
mkGreekChar (c,n) = case (c,n) of
('a',10) -> 0x03ac
('a',11) -> 0x1fb6
('a',12) -> 0x1f70
('a',30) -> 0x1fb4
('a',31) -> 0x1fb7
('a',32) -> 0x1fb2
('a',33) -> 0x1fb3
('a',n) | n >19 -> 0x1f80 + n - 20
('a',n) -> 0x1f00 + n
('e',10) -> 0x03ad -- '
-- ('e',11) -> 0x1fb6 -- ~ can't happen
('e',12) -> 0x1f72 -- `
('e',n) -> 0x1f10 + n
('h',10) -> 0x03ae -- '
('h',11) -> 0x1fc6 -- ~
('h',12) -> 0x1f74 -- `
('h',30) -> 0x1fc4
('h',31) -> 0x1fc7
('h',32) -> 0x1fc2
('h',33) -> 0x1fc3
('h',n) | n >19 -> 0x1f90 + n - 20
('h',n) -> 0x1f20 + n
('i',10) -> 0x03af -- '
('i',11) -> 0x1fd6 -- ~
('i',12) -> 0x1f76 -- `
('i',n) -> 0x1f30 + n
('o',10) -> 0x03cc -- '
-- ('o',11) -> 0x1fb6 -- ~ can't happen
('o',12) -> 0x1f78 -- `
('o',n) -> 0x1f40 + n
('y',10) -> 0x03cd -- '
('y',11) -> 0x1fe6 -- ~
('y',12) -> 0x1f7a -- `
('y',n) -> 0x1f50 + n
('w',10) -> 0x03ce -- '
('w',11) -> 0x1ff6 -- ~
('w',12) -> 0x1f7c -- `
('w',30) -> 0x1ff4
('w',31) -> 0x1ff7
('w',32) -> 0x1ff2
('w',33) -> 0x1ff3
('w',n) | n >19 -> 0x1fa0 + n - 20
('w',n) -> 0x1f60 + n
('r',1) -> 0x1fe5
_ -> mkGreekChar (c,-1) --- should not happen
allGreekMin :: [Int]
allGreekMin = [0x03b1 .. 0x03c9]
{-
encoding of Greek writing. Those hard to guess are marked with ---
maj min
A a Alpha 0391 03b1
B b Beta 0392 03b2
G g Gamma 0393 03b3
D d Delta 0394 03b4
E e Epsilon 0395 03b5
Z z Zeta 0396 03b6
H h Eta --- 0397 03b7
Q q Theta --- 0398 03b8
I i Iota 0399 03b9
K k Kappa 039a 03ba
L l Lambda 039b 03bb
M m My 039c 03bc
N n Ny 039d 03bd
X x Xi 039e 03be
O o Omikron 039f 03bf
P p Pi 03a0 03c0
R r Rho 03a1 03c1
j Sigma --- 03c2
S s Sigma 03a3 03c3
T t Tau 03a4 03c4
Y y Ypsilon 03a5 03c5
F f Phi 03a6 03c6
C c Khi --- 03a7 03c7
U u Psi 03a8 03c8
W w Omega --- 03a9 03c9
( spiritus asper
) spiritus lenis
! iota subscriptum
' acutus
~ circumflexus
` gravis
-}

View File

@@ -1,53 +0,0 @@
----------------------------------------------------------------------
-- |
-- Module : Hebrew
-- Maintainer : (Maintainer)
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:23:37 $
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.8 $
--
-- (Description of the module)
-----------------------------------------------------------------------------
module GF.Text.Hebrew (mkHebrew) where
mkHebrew :: String -> String
mkHebrew = mkHebrewWord
----mkHebrew = reverse . mkHebrewWord
--- reverse : assumes everything's on same line
type HebrewChar = Char
-- HH 031103 added code for spooling the markup
-- removed reverse, words, unwords
-- (seemed obsolete and come out wrong on the screen)
-- AR 26/1/2004 put reverse back - needed in Fudgets (but not in Java?)
mkHebrewWord :: String -> [HebrewChar]
-- mkHebrewWord = map mkHebrewChar
mkHebrewWord s = case s of
[] -> []
'<' : cs -> '<' : spoolMarkup cs
' ' : cs -> ' ' : mkHebrewWord cs
c1 : cs -> mkHebrewChar c1 : mkHebrewWord cs
spoolMarkup :: String -> String
spoolMarkup s = case s of
[] -> [] -- Shouldn't happen
'>' : cs -> '>' : mkHebrewWord cs
c1 : cs -> c1 : spoolMarkup cs
mkHebrewChar c = case lookup c cc of Just c' -> c' ; _ -> c
where
cc = zip allHebrewCodes allHebrew
allHebrewCodes = "-abgdhwzHTyKklMmNnSoPpCcqrst"
allHebrew :: String
allHebrew = (map toEnum (0x05be : [0x05d0 .. 0x05ea]))

View File

@@ -1,95 +0,0 @@
----------------------------------------------------------------------
-- |
-- Module : Hiragana
-- Maintainer : (Maintainer)
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:23:38 $
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.5 $
--
-- (Description of the module)
-----------------------------------------------------------------------------
module GF.Text.Hiragana (mkJapanese) where
-- long vowel romaaji must be ei, ou not ee, oo
mkJapanese :: String -> String
mkJapanese = digraphWordToUnicode . romaajiToDigraphWord
romaajiToDigraphWord :: String -> [(Char, Char)]
romaajiToDigraphWord str = case str of
[] -> []
'<' : cs -> ('\\', '<') : spoolMarkup cs
' ' : cs -> ('\\', ' ') : romaajiToDigraphWord cs
c1 : cs | isVowel c1 -> (' ', cap c1) : romaajiToDigraphWord cs
-- The combinations
c1 : 'y' : c2 : cs -> (c1, 'i') : ('y', cap c2) : romaajiToDigraphWord cs
's' : 'h' : 'a' : cs -> ('S', 'i') : ('y', 'A') : romaajiToDigraphWord cs
'c' : 'h' : 'a' : cs -> ('C', 'i') : ('y', 'A') : romaajiToDigraphWord cs
'j' : 'a' : cs -> ('j', 'i') : ('y', 'A') : romaajiToDigraphWord cs
's' : 'h' : 'u' : cs -> ('S', 'i') : ('y', 'U') : romaajiToDigraphWord cs
'c' : 'h' : 'u' : cs -> ('C', 'i') : ('y', 'U') : romaajiToDigraphWord cs
'j' : 'u' : cs -> ('j', 'i') : ('y', 'U') : romaajiToDigraphWord cs
's' : 'h' : 'o' : cs -> ('S', 'i') : ('y', 'O') : romaajiToDigraphWord cs
'c' : 'h' : 'o' : cs -> ('C', 'i') : ('y', 'O') : romaajiToDigraphWord cs
'j' : 'o' : cs -> ('j', 'i') : ('y', 'O') : romaajiToDigraphWord cs
'd' : 'z' : c3 : cs -> ('D', c3) : romaajiToDigraphWord cs
't' : 's' : c3 : cs -> ('T', c3) : romaajiToDigraphWord cs
'c' : 'h' : c3 : cs -> ('C', c3) : romaajiToDigraphWord cs
's' : 'h' : c3 : cs -> ('S', c3) : romaajiToDigraphWord cs
'z' : 'h' : c3 : cs -> ('Z', c3) : romaajiToDigraphWord cs
c1 : ' ' : cs -> (' ', c1) : ('\\', ' ') : romaajiToDigraphWord cs -- n
c1 : [] -> [(' ', c1)] -- n
c1 : c2 : cs | isVowel c2 -> (c1, c2) : romaajiToDigraphWord cs
c1 : c2 : cs | c1 == c2 -> ('T', 'U') : romaajiToDigraphWord (c2 : cs) -- double cons
c1 : cs -> (' ', c1) : romaajiToDigraphWord cs -- n
isVowel x = elem x "aeiou"
cap :: Char -> Char
cap x = case x of
'a' -> 'A'
'e' -> 'E'
'i' -> 'I'
'o' -> 'O'
'u' -> 'U'
c -> c
spoolMarkup :: String -> [(Char, Char)]
spoolMarkup s = case s of
-- [] -> [] -- Shouldn't happen
'>' : cs -> ('\\', '>') : romaajiToDigraphWord cs
c1 : cs -> ('\\', c1) : spoolMarkup cs
digraphWordToUnicode :: [(Char, Char)] -> String
digraphWordToUnicode = map digraphToUnicode
digraphToUnicode :: (Char, Char) -> Char
digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2
where
cc = zip allHiraganaCodes allHiragana
allHiraganaCodes :: [(Char, Char)]
allHiraganaCodes = mkPairs digraphedHiragana
allHiragana :: String
allHiragana = (map toEnum [0x3041 .. 0x309f])
mkPairs :: String -> [(Char, Char)]
mkPairs str = case str of
[] -> []
c1 : c2 : cs -> (c1, c2) : mkPairs cs
digraphedHiragana = " a A i I u U e E o OkagakigikugukegekogosazaSiZisuzusezesozotadaCijiTUTuDutedetodonaninunenohabapahibipihubupuhebepehobopomamimumemoyAyayUyuyOyorarirurerowaWawiwewo nvukAkE____<< o>>o >'> b"

View File

@@ -1,69 +0,0 @@
----------------------------------------------------------------------
-- |
-- Module : LatinASupplement
-- Maintainer : (Maintainer)
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:23:39 $
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.5 $
--
-- (Description of the module)
-----------------------------------------------------------------------------
module GF.Text.LatinASupplement (mkLatinASupplement) where
mkLatinASupplement :: String -> String
mkLatinASupplement = mkLatinASupplementWord
mkLatinASupplementWord :: String -> String
mkLatinASupplementWord str = case str of
[] -> []
'<' : cs -> '<' : spoolMarkup cs
-- Romanian & partly Turkish
's' : ',' : cs -> toEnum 0x015f : mkLatinASupplementWord cs
'a' : '%' : cs -> toEnum 0x0103 : mkLatinASupplementWord cs
-- Slavic and more
'c' : '^' : cs -> toEnum 0x010d : mkLatinASupplementWord cs
's' : '^' : cs -> toEnum 0x0161 : mkLatinASupplementWord cs
'c' : '\'' : cs -> toEnum 0x0107 : mkLatinASupplementWord cs
'z' : '^' : cs -> toEnum 0x017e : mkLatinASupplementWord cs
-- Turkish
'g' : '%' : cs -> toEnum 0x011f : mkLatinASupplementWord cs
'I' : cs -> toEnum 0x0131 : mkLatinASupplementWord cs
'c' : ',' : cs -> toEnum 0x00e7 : mkLatinASupplementWord cs
-- Polish
'e' : ',' : cs -> toEnum 0x0119 : mkLatinASupplementWord cs
'a' : ',' : cs -> toEnum 0x0105 : mkLatinASupplementWord cs
'l' : '/' : cs -> toEnum 0x0142 : mkLatinASupplementWord cs
'z' : '.' : cs -> toEnum 0x017c : mkLatinASupplementWord cs
'n' : '\'' : cs -> toEnum 0x0144 : mkLatinASupplementWord cs
's' : '\'' : cs -> toEnum 0x015b : mkLatinASupplementWord cs
-- 'c' : '\'' : cs -> toEnum 0x0107 : mkLatinASupplementWord cs
-- Hungarian
'o' : '%' : cs -> toEnum 0x0151 : mkLatinASupplementWord cs
'u' : '%' : cs -> toEnum 0x0171 : mkLatinASupplementWord cs
-- Mongolian
'j' : '^' : cs -> toEnum 0x0135 : mkLatinASupplementWord cs
-- Khowar (actually in Combining diacritical marks not Latin-A Suppl.)
'o' : '.' : cs -> 'o' : (toEnum 0x0323 : mkLatinASupplementWord cs)
-- Length bars over vowels e.g korean
'a' : ':' : cs -> toEnum 0x0101 : mkLatinASupplementWord cs
'e' : ':' : cs -> toEnum 0x0113 : mkLatinASupplementWord cs
'i' : ':' : cs -> toEnum 0x012b : mkLatinASupplementWord cs
'o' : ':' : cs -> toEnum 0x014d : mkLatinASupplementWord cs
'u' : ':' : cs -> toEnum 0x016b : mkLatinASupplementWord cs
-- Default
c : cs -> c : mkLatinASupplementWord cs
spoolMarkup :: String -> String
spoolMarkup s = case s of
[] -> [] -- Shouldn't happen
'>' : cs -> '>' : mkLatinASupplementWord cs
c1 : cs -> c1 : spoolMarkup cs

View File

@@ -1,47 +0,0 @@
----------------------------------------------------------------------
-- |
-- Maintainer : (Maintainer)
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:23:39 $
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.6 $
--
-- (Description of the module)
-----------------------------------------------------------------------------
module GF.Text.OCSCyrillic (mkOCSCyrillic) where
mkOCSCyrillic :: String -> String
mkOCSCyrillic = mkOCSCyrillicWord
mkOCSCyrillicWord :: String -> String
mkOCSCyrillicWord str = case str of
[] -> []
' ' : cs -> ' ' : mkOCSCyrillicWord cs
'<' : cs -> '<' : spoolMarkup cs
'\228' : cs -> toEnum 0x0463 : mkOCSCyrillicWord cs -- ä
'j' : 'e' : '~' : cs -> toEnum 0x0469 : mkOCSCyrillicWord cs
'j' : 'o' : '~' : cs -> toEnum 0x046d : mkOCSCyrillicWord cs
'j' : 'e' : cs -> toEnum 0x0465 : mkOCSCyrillicWord cs
'e' : '~' : cs -> toEnum 0x0467 : mkOCSCyrillicWord cs
'o' : '~' : cs -> toEnum 0x046b : mkOCSCyrillicWord cs
'j' : 'u' : cs -> toEnum 0x044e : mkOCSCyrillicWord cs
'j' : 'a' : cs -> toEnum 0x044f : mkOCSCyrillicWord cs
'u' : cs -> toEnum 0x0479 : mkOCSCyrillicWord cs
c : cs -> (mkOCSCyrillicChar c) : mkOCSCyrillicWord cs
spoolMarkup :: String -> String
spoolMarkup s = case s of
[] -> [] -- Shouldn't happen
'>' : cs -> '>' : mkOCSCyrillicWord cs
c1 : cs -> c1 : spoolMarkup cs
mkOCSCyrillicChar :: Char -> Char
mkOCSCyrillicChar c = case lookup c cc of Just c' -> c' ; _ -> c
where
cc = zip "abvgdeZziJklmnoprstYfxCqwWUyIE" allOCSCyrillic
allOCSCyrillic :: String
allOCSCyrillic = (map toEnum [0x0430 .. 0x044e])

View File

@@ -1,56 +0,0 @@
----------------------------------------------------------------------
-- |
-- Module : Russian
-- Maintainer : (Maintainer)
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:23:40 $
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.5 $
--
-- (Description of the module)
-----------------------------------------------------------------------------
module GF.Text.Russian (mkRussian, mkRusKOI8) where
-- | an ad hoc ASCII encoding. Delimiters: @\/_ _\/@
mkRussian :: String -> String
mkRussian = unwords . (map mkRussianWord) . words
-- | the KOI8 encoding, incomplete. Delimiters: @\/* *\/@
mkRusKOI8 :: String -> String
mkRusKOI8 = unwords . (map mkRussianKOI8) . words
type RussianChar = Char
mkRussianWord :: String -> [RussianChar]
mkRussianWord = map (mkRussianChar allRussianCodes)
mkRussianKOI8 :: String -> [RussianChar]
mkRussianKOI8 = map (mkRussianChar allRussianKOI8)
mkRussianChar chars c = case lookup c cc of Just c' -> c' ; _ -> c
where
cc = zip chars allRussian
allRussianCodes :: [Char]
allRussianCodes =
-- changed to Ints to work with Haskell compilers e.g. GHC 6.5 CVS
-- which expect source files to be in UTF-8
-- /bringert 2006-05-19
-- "ÅåABVGDEXZIJKLMNOPRSTUFHCQW£}!*ÖYÄabvgdexzijklmnoprstufhcqw#01'öyä"
map toEnum [197,229,65,66,86,71,68,69,88,90,73,74,75,76,77,78,79,80,82,83,84,85,70,72,67,81,87,163,125,33,42,214,89,196,97,98,118,103,100,101,120,122,105,106,107,108,109,110,111,112,114,115,116,117,102,104,99,113,119,35,48,49,39,246,121,228]
allRussianKOI8 :: [Char]
allRussianKOI8 =
-- changed to Ints to work with Haskell compilers e.g. GHC 6.5 CVS
-- which expect source files to be in UTF-8
-- /bringert 2006-05-19
-- "^@áâ÷çäåöúéêëìíîïðòóôõæèãþûýøùÿüàñÁÂ×ÇÄÅÖÚÉÊËÌÍÎÏÐÒÓÔÕÆÈÃÞÛÝØÙßÜÀÑ"
map toEnum [94,64,225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,242,243,244,245,230,232,227,254,251,253,248,249,255,252,224,241,193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,210,211,212,213,198,200,195,222,219,221,216,217,223,220,192,209]
allRussian :: String
allRussian = (map toEnum (0x0401:0x0451:[0x0410 .. 0x044f])) -- Ëë in odd places

View File

@@ -1,77 +0,0 @@
----------------------------------------------------------------------
-- |
-- Module : Tamil
-- Maintainer : (Maintainer)
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:23:40 $
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.5 $
--
-- (Description of the module)
-----------------------------------------------------------------------------
module GF.Text.Tamil (mkTamil) where
mkTamil :: String -> String
mkTamil = digraphWordToUnicode . adHocToDigraphWord
adHocToDigraphWord :: String -> [(Char, Char)]
adHocToDigraphWord str = case str of
[] -> []
'<' : cs -> ('\\', '<') : spoolMarkup cs
' ' : cs -> ('\\', ' ') : adHocToDigraphWord cs -- skip space
-- if c1 is a vowel
-- Two of the same vowel => lengthening
c1 : c2 : cs | c1 == c2 && isVowel c1 -> (cap c1, ':') : adHocToDigraphWord cs
-- digraphed or long vowel
c1 : c2 : cs | isVowel c1 && isVowel c2 -> (cap c1, cap c2) : adHocToDigraphWord cs
c1 : cs | isVowel c1 -> (' ', cap c1) : adHocToDigraphWord cs
-- c1 isn't a vowel
c1 : c2 : c3 : cs | c2 == c3 && isVowel c2 -> (' ', c1) : (c2, ':') : adHocToDigraphWord cs
c1 : c2 : c3 : cs | isVowel c2 && isVowel c3 -> (' ', c1) : (c2, c3) : adHocToDigraphWord cs
c1 : 'a' : cs -> (' ', c1) : adHocToDigraphWord cs -- a inherent
c1 : c2 : cs | isVowel c2 -> (' ', c1) : (' ', c2) : adHocToDigraphWord cs
c1 : cs -> (' ', c1) : (' ', '.') : adHocToDigraphWord cs -- vowelless
isVowel x = elem x "aeiou:"
cap :: Char -> Char
cap x = case x of
'a' -> 'A'
'e' -> 'E'
'i' -> 'I'
'o' -> 'O'
'u' -> 'U'
c -> c
spoolMarkup :: String -> [(Char, Char)]
spoolMarkup s = case s of
-- [] -> [] -- Shouldn't happen
'>' : cs -> ('\\', '>') : adHocToDigraphWord cs
c1 : cs -> ('\\', c1) : spoolMarkup cs
digraphWordToUnicode :: [(Char, Char)] -> String
digraphWordToUnicode = map digraphToUnicode
digraphToUnicode :: (Char, Char) -> Char
digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2
where
cc = zip allTamilCodes allTamil
mkPairs :: String -> [(Char, Char)]
mkPairs str = case str of
[] -> []
c1 : c2 : cs -> (c1, c2) : mkPairs cs
allTamilCodes :: [(Char, Char)]
allTamilCodes = mkPairs digraphedTamil
allTamil :: String
allTamil = (map toEnum [0x0b85 .. 0x0bfa])
digraphedTamil = " AA: II: UU:______ EE:AI__ OO:AU k______ G c__ j__ \241 T______ N t______ V n p______ m y r l L M v__ s S h________a: ii: uu:______ ee:ai__ oo:au .__________________ :______________________________#1#2#3#4#5#6#7#8#9^1^2^3=d=m=y=d=c==ru##"

View File

@@ -1,149 +0,0 @@
----------------------------------------------------------------------
-- |
-- Module : Text
-- Maintainer : AR
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/06/23 14:32:44 $
-- > CVS $Author: aarne $
-- > CVS $Revision: 1.10 $
--
-- elementary text postprocessing. AR 21\/11\/2001.
--
-- This is very primitive indeed. The functions should work on
-- token lists and not on strings. AR 5\/12\/2002
--
-- XML hack 14\/8\/2004; not in use yet
-----------------------------------------------------------------------------
module GF.Text.Text (untokWithXML,
exceptXML,
formatAsTextLit,
formatAsCodeLit,
formatAsText,
formatAsHTML,
formatAsLatex,
formatAsCode,
performBinds,
performBindsFinnish,
unStringLit,
concatRemSpace
) where
import GF.Data.Operations
import Data.Char
-- | does not apply untokenizer within XML tags --- heuristic "< "
-- this function is applied from top level...
untokWithXML :: (String -> String) -> String -> String
untokWithXML unt s = case s of
'<':cs@(c:_) | isAlpha c -> '<':beg ++ ">" ++ unto (drop 1 rest) where
(beg,rest) = span (/='>') cs
'<':cs -> '<':unto cs ---
[] -> []
_ -> unt beg ++ unto rest where
(beg,rest) = span (/='<') s
where
unto = untokWithXML unt
-- | ... whereas this one is embedded on a branch
exceptXML :: (String -> String) -> String -> String
exceptXML unt s = '<':beg ++ ">" ++ unt (drop 1 rest) where
(beg,rest) = span (/='>') s
formatAsTextLit :: String -> String
formatAsTextLit = formatAsText . unwords . map unStringLit . words
--- hope that there will be deforestation...
formatAsCodeLit :: String -> String
formatAsCodeLit = formatAsCode . unwords . map unStringLit . words
formatAsText,formatAsHTML,formatAsLatex :: String -> String
formatAsText = formatAsTextGen (const False) (=="&-")
formatAsHTML = formatAsTextGen (\s -> take 1 s == "<" || last s == '>') (const False)
formatAsLatex = formatAsTextGen ((=="\\") . take 1) (const False)
formatAsTextGen :: (String -> Bool) -> (String -> Bool) -> String -> String
formatAsTextGen tag para = unwords . format . cap . words where
format ws = case ws of
w : ww | capit w -> format $ (cap ww)
w : c : ww | major c -> format $ (w ++ c) :(cap ww)
w : c : ww | minor c -> format $ (w ++ c) : ww
p : c : ww | openp p -> format $ (p ++ c) :ww
p : c : ww | spanish p -> format $ (p ++ concat (cap [c])) :ww
c : ww | para c -> "\n\n" : format ww
w : ww -> w : format ww
[] -> []
cap (p:ww) | tag p = p : cap ww
cap ((c:cs):ww) = (toUpper c : cs) : ww
cap [] = []
capit = (=="&|")
major = flip elem (map singleton ".!?")
minor = flip elem (map singleton ",:;)")
openp = all (flip elem "(")
spanish = all (flip elem "\161\191")
formatAsCode :: String -> String
formatAsCode = rend 0 . words where
-- render from BNF Converter
rend i ss = case ss of
"[" :ts -> cons "[" $ rend i ts
"(" :ts -> cons "(" $ rend i ts
"{" :ts -> cons "{" $ new (i+1) $ rend (i+1) ts
"}" : ";":ts -> new (i-1) $ space "}" $ cons ";" $ new (i-1) $ rend (i-1) ts
"}" :ts -> new (i-1) $ cons "}" $ new (i-1) $ rend (i-1) ts
";" :ts -> cons ";" $ new i $ rend i ts
t : "," :ts -> cons t $ space "," $ rend i ts
t : ")" :ts -> cons t $ cons ")" $ rend i ts
t : "]" :ts -> cons t $ cons "]" $ rend i ts
t :ts -> space t $ rend i ts
_ -> ""
cons s t = s ++ t
new i s = '\n' : replicate (2*i) ' ' ++ dropWhile isSpace s
space t s = if null s then t else t ++ " " ++ s
performBinds :: String -> String
performBinds = performBindsOpt (\x y -> y)
-- The function defines an effect of the former on the latter part,
-- such as in vowel harmony. It is triggered by the binder token "&*"
performBindsOpt :: (String -> String -> String) -> String -> String
performBindsOpt harm = unwords . format . words where
format ws = case ws of
w : "&+" : u : ws -> format ((w ++ u) : ws)
w : "&*" : u : ws -> format ((w ++ harm w u) : ws)
w : ws -> w : format ws
[] -> []
-- unlexer for Finnish particles
-- Notice: left associativity crucial for "tie &* ko &* han" --> "tieköhän"
performBindsFinnish :: String -> String
performBindsFinnish = performBindsOpt vowelHarmony where
vowelHarmony w p = if any (flip elem "aouAOU") w then p else map toFront p
toFront c = case c of
'A' -> '\196'
'O' -> '\214'
'a' -> '\228'
'o' -> '\246'
_ -> c
unStringLit :: String -> String
unStringLit s = case s of
c : cs | strlim c && strlim (last cs) -> init cs
_ -> s
where
strlim = (=='\'')
concatRemSpace :: String -> String
concatRemSpace = concat . words
{-
concatRemSpace s = case s of
'<':cs -> exceptXML concatRemSpace cs
c : cs | isSpace c -> concatRemSpace cs
c :cs -> c : concatRemSpace cs
_ -> s
-}

View File

@@ -1,368 +0,0 @@
----------------------------------------------------------------------
-- |
-- Module : Thai
-- Maintainer : (Maintainer)
-- Stability : (experimental)
-- Portability : (portable)
--
--
-- Thai transliteration and other alphabet information.
-----------------------------------------------------------------------------
-- AR 27/12/2006. Execute test2 to see the transliteration table.
module GF.Text.Thai (
mkThai,mkThaiWord,mkThaiPron,mkThaiFake,thaiFile,thaiPronFile,thaiFakeFile
) where
import qualified Data.Map as Map
import Data.Char
-- for testing
import GF.Text.UTF8
import Data.List
import Debug.Trace
mkThai :: String -> String
mkThai = concat . map mkThaiWord . words
mkThaiPron = unwords . map mkPronSyllable . words
mkThaiFake = unwords . map (fakeEnglish . mkPronSyllable) . words
type ThaiChar = Char
mkThaiWord :: String -> [ThaiChar]
mkThaiWord = map (toEnum . mkThaiChar) . unchar . snd . pronAndOrth
mkThaiChar :: String -> Int
mkThaiChar c = maybe 0 id $ Map.lookup c thaiMap
thaiMap :: Map.Map String Int
thaiMap = Map.fromList $ zip allThaiTrans allThaiCodes
-- convert all string literals in a text
thaiStrings :: String -> String
thaiStrings = convStrings mkThai
thaiPronStrings :: String -> String
thaiPronStrings = convStrings mkThaiPron
convStrings conv s = case s of
'"':cs -> let (t,_:r) = span (/='"') cs in
'"': conv t ++ "\"" ++ convStrings conv r
c:cs -> c : convStrings conv cs
_ -> s
-- each character is either [letter] or [letter+nonletter]
unchar :: String -> [String]
unchar s = case s of
c:d:cs
| isAlpha d -> [c] : unchar (d:cs)
| d == '?' -> unchar cs -- use "o?" to represent implicit 'o'
| otherwise -> [c,d] : unchar cs
[_] -> [s]
_ -> []
-- you can prefix transliteration by irregular phonology in []
pronAndOrth :: String -> (Maybe String, String)
pronAndOrth s = case s of
'[':cs -> case span (/=']') cs of
(p,_:o) -> (Just p,o)
_ -> (Nothing,s)
_ -> (Nothing,s)
allThaiTrans :: [String]
allThaiTrans = words $
"- k k1 - k2 - k3 g c c1 c2 s' c3 y' d' t' " ++
"t1 t2 t3 n' d t t4 t5 t6 n b p p1 f p2 f' " ++
"p3 m y r - l - w s- s. s h l' O h' - " ++
"a. a a: a+ i i: v v: u u: - - - - - - " ++
"e e' o: a% a& L R S T1 T2 T3 T4 K - - - " ++
"N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 - - - - - - "
allThaiCodes :: [Int]
allThaiCodes = [0x0e00 .. 0x0e7f]
---------------------
-- heuristic pronunciation of codes
---------------------
-- fake English for TTS, a la Teach Yourself Thai
fakeEnglish :: String -> String
fakeEnglish s = case s of
'a':'a':cs -> "ah" ++ fakeEnglish cs
'a':'y':cs -> "ai" ++ fakeEnglish cs
'a' :cs -> "ah" ++ fakeEnglish cs
'c':'h':cs -> "ch" ++ fakeEnglish cs
'c' :cs -> "j" ++ fakeEnglish cs
'e':'e':cs -> "aih" ++ fakeEnglish cs
'g' :cs -> "ng" ++ fakeEnglish cs
'i':'i':cs -> "ee" ++ fakeEnglish cs
'k':'h':cs -> "k" ++ fakeEnglish cs
'k' :cs -> "g" ++ fakeEnglish cs
'O':'O':cs -> "or" ++ fakeEnglish cs
'O' :cs -> "or" ++ fakeEnglish cs
'o':'o':cs -> "or" ++ fakeEnglish cs
'p':'h':cs -> "p" ++ fakeEnglish cs
'p' :cs -> "b" ++ fakeEnglish cs
't':'h':cs -> "t" ++ fakeEnglish cs
't' :cs -> "d" ++ fakeEnglish cs
'u':'u':cs -> "oo" ++ fakeEnglish cs
'u' :cs -> "oo" ++ fakeEnglish cs
'v':'v':cs -> "eu" ++ fakeEnglish cs
'v' :cs -> "eu" ++ fakeEnglish cs
'\228':'\228':cs -> "air" ++ fakeEnglish cs
'\228' :cs -> "a" ++ fakeEnglish cs
'\246':'\246':cs -> "er" ++ fakeEnglish cs
'\246' :cs -> "er" ++ fakeEnglish cs
c:cs | isTone c -> fakeEnglish cs
c:cs -> c : fakeEnglish cs
_ -> s
where
isTone = flip elem "'`^~"
-- this works for one syllable
mkPronSyllable s = case fst $ pronAndOrth s of
Just p -> p
_ -> pronSyllable $ getSyllable $ map mkThaiChar $ unchar s
data Syllable = Syll {
initv :: [Int],
initc :: [Int],
midv :: [Int],
finalc :: [Int],
finalv :: [Int],
tone :: [Int],
shorten :: Bool,
kill :: Bool
}
deriving Show
data Tone = TMid | TLow | THigh | TRise | TFall
deriving Show
data CClass = CLow | CMid | CHigh
deriving Show
pronSyllable :: Syllable -> String
pronSyllable s =
initCons ++ tonem ++ vowel ++ finalCons
where
vowel = case (initv s, midv s, finalv s, finalc s, shorten s, tone s) of
([0x0e40],[0x0e35],_,[0x0e22],_,_) -> "ia" -- e-i:y
([0x0e40],[0x0e2d,0x0e35],_,_,_,_) -> "va" -- e-i:O
([0x0e40],[0x0e30,0x0e35],_,[0x0e22],_,_) -> "ia" -- e-i:ya.
([0x0e40],[0x0e30,0x0e2d],_,_,_,_) -> "\246" -- e-Oa.
([0x0e40],[0x0e30,0x0e32],_,_,_,_) -> "O" -- e-a:a. -- open o
([0x0e40],[0x0e2d],_,_,_,_) -> "\246\246" -- e-O
([0x0e40],[0x0e34],_,_,_,_) -> "\246\246" -- e-i
([0x0e40],[0x0e30],_,_,_,_) -> "e" -- e-a.
([0x0e40],[0x0e32],_,_,_,_) -> "aw" -- e-a:
([0x0e40],[],[],[0x0e22],_,_) -> "\246\246y" -- e-y
([0x0e40],[],[],_,True,_) -> "e"
([0x0e41],[0x0e30],_,_,_,_) -> "\228" -- ä-a.
([0x0e41],[],[],_,True,_) -> "\228"
([0x0e42],[0x0e30],_,_,_,_) -> "o" -- o:-a.
([],[0x0e2d],_,[0x0e22],_,_) -> "OOy" -- Oy
([],[0x0e2d],_,_,_,_) -> "OO" -- O
([],[],[],_,_,_) -> "o"
(i,m,f,_,_,_) -> concatMap pronThaiChar (reverse $ f ++ m ++ i) ----
initCons = concatMap pronThaiChar $ case (reverse $ initc s) of
0x0e2b:cs@(_:_) -> cs -- high h
0x0e2d:cs@(_:_) -> cs -- O
cs -> cs
finalCons =
let (c,cs) = splitAt 1 $ finalc s
in
case c of
[] -> []
[0x0e22] -> [] --- y
[k] -> concatMap pronThaiChar (reverse cs) ++ finalThai k
iclass = case take 1 (reverse $ initc s) of
[c] -> classThai c
[] -> CMid -- O
isLong = not (shorten s) && case vowel of
_:_:_ -> True ----
_ -> False
isLive = case finalCons of
c | elem c ["n","m","g"] -> True
"" -> isLong
_ -> False
tonem = case (iclass,isLive,isLong,tone s) of
(_,_,_, [0x0e4a]) -> tHigh
(_,_,_, [0x0e4b]) -> tRise
(CLow,_,_,[0x0e49]) -> tRise
(_,_,_, [0x0e49]) -> tFall
(CLow,_,_,[0x0e48]) -> tFall
(_, _,_,[0x0e48]) -> tLow
(CHigh,True,_,_) -> tRise
(_, True,_,_) -> tMid
(CLow,False,False,_) -> tHigh
(CLow,False,_,_) -> tFall
_ -> tLow
(tMid,tHigh,tLow,tRise,tFall) = ("-","'","`","~","^")
isVowel c = 0x0e30 <= c && c <= 0x0e44 ----
isCons c = 0x0e01 <= c && c <= 0x0e2f ----
isTone c = 0x0e48 <= c && c <= 0x0e4b
getSyllable :: [Int] -> Syllable
getSyllable = foldl get (Syll [] [] [] [] [] [] False False) where
get syll c = case c of
0x0e47 -> syll {shorten = True}
0x0e4c -> syll {kill = True, finalc = tail (finalc syll)} --- always last
0x0e2d
| null (initc syll) -> syll {initc = [c]} -- "O"
| otherwise -> syll {midv = c : midv syll}
_
| isVowel c -> if null (initc syll)
then syll {initv = c : initv syll}
else syll {midv = c : midv syll}
| isCons c -> if null (initc syll) ||
(null (midv syll) && isCluster (initc syll) c)
then syll {initc = c : initc syll}
else syll {finalc = c : finalc syll}
| isTone c -> syll {tone = [c]}
_ -> syll ---- check this
isCluster s c = length s == 1 && (c == 0x0e23 || s == [0x0e2b])
-- to test
test1 = testThai "k2wa:mrak"
test2 = putStrLn $ thaiTable
test3 = do
writeFile "thai.txt" "Thai Character Coding in GF\nAR 2007\n"
appendFile "thai.txt" thaiTable
test4 = do
writeFile "alphthai.txt" "Thai Characters by Pronunciation\nAR 2007\n"
appendFile "alphthai.txt" thaiTableAlph
testThai :: String -> IO ()
testThai s = do
putStrLn $ encodeUTF8 $ mkThai s
putStrLn $ unwords $ map mkPronSyllable $ words s
testSyllable s =
let y = getSyllable $ map mkThaiChar $ unchar s
in
putStrLn $ pronSyllable $ trace (show y) y
thaiFile :: FilePath -> Maybe FilePath -> IO ()
thaiFile f mo = do
s <- readFile f
let put = maybe putStr writeFile mo
put $ encodeUTF8 $ thaiStrings s
thaiPronFile :: FilePath -> Maybe FilePath -> IO ()
thaiPronFile f mo = do
s <- readFile f
let put = maybe putStr writeFile mo
put $ encodeUTF8 $ thaiPronStrings s
thaiFakeFile :: FilePath -> Maybe FilePath -> IO ()
thaiFakeFile f mo = do
s <- readFile f
let put = maybe putStr writeFile mo
put $ encodeUTF8 $ (convStrings mkThaiFake) s
finalThai c = maybe "" return (Map.lookup c thaiFinalMap)
thaiFinalMap = Map.fromList $ zip allThaiCodes finals
classThai c = maybe CLow readClass (Map.lookup c thaiClassMap)
thaiClassMap = Map.fromList $ zip allThaiCodes heights
readClass s = case s of
'L' -> CLow
'M' -> CMid
'H' -> CHigh
thaiTable :: String
thaiTable = unlines $ ("\n|| hex | thai | trans | pron | fin | class |" ) : [
"| " ++
hex c ++ " | " ++
encodeUTF8 (showThai s) ++ " | " ++
s ++ " | " ++
pronThai s ++ " | " ++
[f] ++ " | " ++
[q] ++ " | "
|
(c,q,f,s) <- zip4 allThaiCodes heights finals allThaiTrans
]
thaiTableAlph :: String
thaiTableAlph = unlines $ ("\n|| pron | thai | trans |" ) : [
"| " ++ a ++
" | " ++ unwords (map (encodeUTF8 . showThai) ss) ++
" | " ++ unwords ss ++
" |"
|
(a,ss) <- allProns
]
where
prons = sort $ nub
[p | s <- allThaiTrans, let p = pronThai s, not (null p),isAlpha (head p)]
allProns =
[(a,[s | s <- allThaiTrans, pronThai s == a]) | a <- prons]
showThai s = case s of
"-" -> "-"
--- v:_ | elem v "ivu" -> map (toEnum . mkThaiChar) ["O",s]
_ -> [toEnum $ mkThaiChar s]
pronThaiChar = pronThai . recodeThai
recodeThai c = allThaiTrans !! (c - 0x0e00)
pronThai s = case s of
[c,p]
| c == 'N' && isDigit p -> [p]
| c == 'T' && isDigit p -> ['\'',p]
| isDigit p -> c:"h"
| p==':' -> c:[c]
| elem p "%&" -> c:"y"
| p=='+' -> c:"m"
| s == "e'" -> "\228\228"
| otherwise -> [c]
"O" -> "O"
"e" -> "ee"
[c] | isUpper c -> ""
_ -> s
hex = map hx . reverse . digs where
digs 0 = [0]
digs n = n `mod` 16 : digs (n `div` 16)
hx d = "0123456789ABCDEF" !! d
heights :: String
finals :: String
heights =
" MHHLLLLMHLLLLMMHLLLMMHLLLMMHHLLLLLL-L-LHHHHLML" ++ replicate 99 ' '
finals =
" kkkkkkgt-tt-ntttttntttttnpp--pppmyn-n-wttt-n--" ++ replicate 99 ' '

View File

@@ -1,48 +0,0 @@
----------------------------------------------------------------------
-- |
-- Module : UTF8
-- Maintainer : AR
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:23:42 $
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.5 $
--
-- From the Char module supplied with HBC.
-- code by Thomas Hallgren (Jul 10 1999)
-----------------------------------------------------------------------------
module GF.Text.UTF8 (decodeUTF8, encodeUTF8) where
-- | Take a Unicode string and encode it as a string
-- with the UTF8 method.
decodeUTF8 :: String -> String
decodeUTF8 "" = ""
decodeUTF8 (c:cs) | c < '\x80' = c : decodeUTF8 cs
decodeUTF8 (c:c':cs) | '\xc0' <= c && c <= '\xdf' &&
'\x80' <= c' && c' <= '\xbf' =
toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : decodeUTF8 cs
decodeUTF8 (c:c':c'':cs) | '\xe0' <= c && c <= '\xef' &&
'\x80' <= c' && c' <= '\xbf' &&
'\x80' <= c'' && c'' <= '\xbf' =
toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : decodeUTF8 cs
decodeUTF8 s = s ---- AR workaround 22/6/2006
----decodeUTF8 _ = error "UniChar.decodeUTF8: bad data"
encodeUTF8 :: String -> String
encodeUTF8 "" = ""
encodeUTF8 (c:cs) =
if c > '\x0000' && c < '\x0080' then
c : encodeUTF8 cs
else if c < toEnum 0x0800 then
let i = fromEnum c
in toEnum (0xc0 + i `div` 0x40) :
toEnum (0x80 + i `mod` 0x40) :
encodeUTF8 cs
else
let i = fromEnum c
in toEnum (0xe0 + i `div` 0x1000) :
toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) :
toEnum (0x80 + i `mod` 0x40) :
encodeUTF8 cs

View File

@@ -1,69 +0,0 @@
----------------------------------------------------------------------
-- |
-- Module : Unicode
-- Maintainer : AR
-- Stability : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/04/21 16:23:42 $
-- > CVS $Author: bringert $
-- > CVS $Revision: 1.12 $
--
-- ad hoc Unicode conversions from different alphabets.
-- AR 12\/4\/2000, 18\/9\/2001, 30\/5\/2002, 26\/1\/2004
-----------------------------------------------------------------------------
module GF.Text.Unicode (mkUnicode, treat) where
import GF.Text.Greek (mkGreek)
import GF.Text.Arabic (mkArabic)
import GF.Text.Hebrew (mkHebrew)
import GF.Text.Russian (mkRussian, mkRusKOI8)
import GF.Text.Ethiopic (mkEthiopic)
import GF.Text.Tamil (mkTamil)
import GF.Text.OCSCyrillic (mkOCSCyrillic)
import GF.Text.LatinASupplement (mkLatinASupplement)
import GF.Text.Devanagari (mkDevanagari)
import GF.Text.Hiragana (mkJapanese)
import GF.Text.ExtendedArabic (mkArabic0600)
import GF.Text.ExtendedArabic (mkExtendedArabic)
import GF.Text.ExtraDiacritics (mkExtraDiacritics)
import Data.Char
mkUnicode :: String -> String
mkUnicode s = case s of
'/':'/':cs -> treat [] mkGreek unic ++ mkUnicode rest
'/':'+':cs -> mkHebrew unic ++ mkUnicode rest
'/':'-':cs -> mkArabic unic ++ mkUnicode rest
'/':'_':cs -> treat [] mkRussian unic ++ mkUnicode rest
'/':'*':cs -> mkRusKOI8 unic ++ mkUnicode rest
'/':'E':cs -> mkEthiopic unic ++ mkUnicode rest
'/':'T':cs -> mkTamil unic ++ mkUnicode rest
'/':'C':cs -> mkOCSCyrillic unic ++ mkUnicode rest
'/':'&':cs -> mkDevanagari unic ++ mkUnicode rest
'/':'L':cs -> mkLatinASupplement unic ++ mkUnicode rest
'/':'J':cs -> mkJapanese unic ++ mkUnicode rest
'/':'6':cs -> mkArabic0600 unic ++ mkUnicode rest
'/':'A':cs -> mkExtendedArabic unic ++ mkUnicode rest
'/':'X':cs -> mkExtraDiacritics unic ++ mkUnicode rest
c:cs -> c:mkUnicode cs
_ -> s
where
(unic,rest) = remClosing [] $ dropWhile isSpace $ drop 2 s
remClosing u s = case s of
c:'/':s | elem c "/+-_*ETC&LJ6AX" -> (reverse u, s) --- end need not match
c:cs -> remClosing (c:u) cs
_ -> (reverse u,[]) -- forgiving missing end
-- | don't convert XML tags --- assumes \<\> always means XML tags
treat :: String -> (String -> String) -> String -> String
treat old mk s = case s of
'<':cs -> mk (reverse old) ++ '<':noTreat cs
c:cs -> treat (c:old) mk cs
_ -> mk (reverse old)
where
noTreat s = case s of
'>':cs -> '>' : treat [] mk cs
c:cs -> c : noTreat cs
_ -> s