mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-21 10:49:33 -06:00
removed src for 2.9
This commit is contained in:
@@ -1,63 +0,0 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : Arabic
|
||||
-- Maintainer : (Maintainer)
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- > CVS $Date: 2005/04/21 16:23:34 $
|
||||
-- > CVS $Author: bringert $
|
||||
-- > CVS $Revision: 1.6 $
|
||||
--
|
||||
-- (Description of the module)
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.Arabic (mkArabic) where
|
||||
|
||||
mkArabic :: String -> String
|
||||
mkArabic = unwords . (map mkArabicWord) . words
|
||||
----mkArabic = reverse . unwords . (map mkArabicWord) . words
|
||||
--- reverse : assumes everything's on same line
|
||||
|
||||
type ArabicChar = Char
|
||||
|
||||
mkArabicWord :: String -> [ArabicChar]
|
||||
mkArabicWord = map mkArabicChar . getLetterPos
|
||||
|
||||
getLetterPos :: String -> [(Char,Int)]
|
||||
getLetterPos [] = []
|
||||
getLetterPos ('I':cs) = ('*',7) : getLetterPos cs -- 0xfe80
|
||||
getLetterPos ('O':cs) = ('*',8) : getIn cs -- 0xfe8b
|
||||
getLetterPos ('l':'a':cs) = ('*',5) : getLetterPos cs -- 0xfefb
|
||||
getLetterPos [c] = [(c,1)] -- 1=isolated
|
||||
getLetterPos (c:cs) | isReduced c = (c,1) : getLetterPos cs
|
||||
getLetterPos (c:cs) = (c,3) : getIn cs -- 3=initial
|
||||
|
||||
|
||||
getIn [] = []
|
||||
getIn ('I':cs) = ('*',7) : getLetterPos cs -- 0xfe80
|
||||
getIn ('O':cs) = ('*',9) : getIn cs -- 0xfe8c
|
||||
getIn ('l':'a':cs) = ('*',6) : getLetterPos cs -- 0xfefc
|
||||
getIn [c] = [(c,2)] -- 2=final
|
||||
getIn (c:cs) | isReduced c = (c,2) : getLetterPos cs
|
||||
getIn (c:cs) = (c,4) : getIn cs -- 4=medial
|
||||
|
||||
isReduced :: Char -> Bool
|
||||
isReduced c = c `elem` "UuWiYOaAdVrzwj"
|
||||
|
||||
mkArabicChar ('*',p) | p > 4 && p < 10 =
|
||||
(map toEnum [0xfefb,0xfefc,0xfe80,0xfe8b,0xfe8c]) !! (p-5)
|
||||
mkArabicChar cp@(c,p) = case lookup c cc of Just c' -> (c' !! (p-1)) ; _ -> c
|
||||
where
|
||||
cc = mkArabicTab allArabicCodes allArabic
|
||||
|
||||
mkArabicTab (c:cs) as = (c,as1) : mkArabicTab cs as2 where
|
||||
(as1,as2) = if isReduced c then splitAt 2 as else splitAt 4 as
|
||||
mkArabicTab [] _ = []
|
||||
|
||||
allArabicCodes = "UuWiYOabAtvgHCdVrzscSDTZoxfqklmnhwjy"
|
||||
|
||||
allArabic :: String
|
||||
allArabic = (map toEnum [0xfe81 .. 0xfef4]) -- I=0xfe80
|
||||
|
||||
|
||||
@@ -1,97 +0,0 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : Devanagari
|
||||
-- Maintainer : (Maintainer)
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- > CVS $Date: 2005/04/21 16:23:34 $
|
||||
-- > CVS $Author: bringert $
|
||||
-- > CVS $Revision: 1.5 $
|
||||
--
|
||||
-- (Description of the module)
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.Devanagari (mkDevanagari) where
|
||||
|
||||
mkDevanagari :: String -> String
|
||||
mkDevanagari = digraphWordToUnicode . adHocToDigraphWord
|
||||
|
||||
adHocToDigraphWord :: String -> [(Char, Char)]
|
||||
adHocToDigraphWord str = case str of
|
||||
[] -> []
|
||||
'<' : cs -> ('\\', '<') : spoolMarkup cs
|
||||
' ' : cs -> ('\\', ' ') : adHocToDigraphWord cs -- skip space
|
||||
|
||||
-- if c1 is a vowel
|
||||
-- Two of the same vowel => lengthening
|
||||
c1 : c2 : cs | c1 == c2 && isVowel c1 -> (cap c1, ':') : adHocToDigraphWord cs
|
||||
-- digraphed or long vowel
|
||||
c1 : c2 : cs | isVowel c1 && isVowel c2 -> (cap c1, cap c2) : adHocToDigraphWord cs
|
||||
c1 : cs | isVowel c1 -> (' ', cap c1) : adHocToDigraphWord cs
|
||||
|
||||
-- c1 isn't a vowel
|
||||
-- c1 : 'a' : [] -> [(' ', c1)] -- a inherent
|
||||
-- c1 : c2 : [] | isVowel c2 -> (' ', c1) : [(' ', c2)]
|
||||
|
||||
-- c1 is aspirated
|
||||
c1 : 'H' : c2 : c3 : cs | c2 == c3 && isVowel c2 ->
|
||||
(c1, 'H') : (c2, ':') : adHocToDigraphWord cs
|
||||
c1 : 'H' : c2 : c3 : cs | isVowel c2 && isVowel c3 ->
|
||||
(c1, 'H') : (c2, c3) : adHocToDigraphWord cs
|
||||
c1 : 'H' : 'a' : cs -> (c1, 'H') : adHocToDigraphWord cs -- a inherent
|
||||
c1 : 'H' : c2 : cs | isVowel c2 -> (c1, 'H') : (' ', c2) : adHocToDigraphWord cs
|
||||
-- not vowelless at EOW
|
||||
c1 : 'H' : ' ' : cs -> (c1, 'H') : ('\\', ' ') : adHocToDigraphWord cs
|
||||
c1 : 'H' : [] -> [(c1, 'H')]
|
||||
c1 : 'H' : cs -> (c1, 'H') : (' ', '^') : adHocToDigraphWord cs -- vowelless
|
||||
|
||||
-- c1 unasp.
|
||||
c1 : c2 : c3 : cs | c2 == c3 && isVowel c2 -> (' ', c1) : (c2, ':') : adHocToDigraphWord cs
|
||||
c1 : c2 : c3 : cs | isVowel c2 && isVowel c3 -> (' ', c1) : (c2, c3) : adHocToDigraphWord cs
|
||||
c1 : 'a' : cs -> (' ', c1) : adHocToDigraphWord cs -- a inherent
|
||||
c1 : c2 : cs | isVowel c2 -> (' ', c1) : (' ', c2) : adHocToDigraphWord cs
|
||||
-- not vowelless at EOW
|
||||
c1 : ' ' : cs -> (' ', c1) : ('\\', ' '): adHocToDigraphWord cs
|
||||
c1 : [] -> [(' ', c1)]
|
||||
'M' : cs -> (' ', 'M') : adHocToDigraphWord cs -- vowelless but no vowelless sign for anusvara
|
||||
c1 : cs -> (' ', c1) : (' ', '^') : adHocToDigraphWord cs -- vowelless
|
||||
|
||||
isVowel x = elem x "aeiou:"
|
||||
cap :: Char -> Char
|
||||
cap x = case x of
|
||||
'a' -> 'A'
|
||||
'e' -> 'E'
|
||||
'i' -> 'I'
|
||||
'o' -> 'O'
|
||||
'u' -> 'U'
|
||||
c -> c
|
||||
|
||||
spoolMarkup :: String -> [(Char, Char)]
|
||||
spoolMarkup s = case s of
|
||||
-- [] -> [] -- Shouldn't happen
|
||||
'>' : cs -> ('\\', '>') : adHocToDigraphWord cs
|
||||
c1 : cs -> ('\\', c1) : spoolMarkup cs
|
||||
|
||||
|
||||
digraphWordToUnicode :: [(Char, Char)] -> String
|
||||
digraphWordToUnicode = map digraphToUnicode
|
||||
|
||||
digraphToUnicode :: (Char, Char) -> Char
|
||||
digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2
|
||||
where
|
||||
cc = zip allDevanagariCodes allDevanagari
|
||||
|
||||
digraphedDevanagari = " ~ M ;__ AA: II: UU:RoLoEvE~ EE:AvA~ OAU kkH ggHNG ccH jjH \241 TTH DDH N ttH ddH nn. ppH bbH m y rr. l LL. v \231 S s h____ .-Sa: ii: uu:ror:eve~ eaiava~ oau ^____OM | -dddu______ Q X G zD.RH fy.R:L:mrmR#I#d#0#1#2#3#4#5#6#7#8#9#o"
|
||||
|
||||
allDevanagariCodes :: [(Char, Char)]
|
||||
allDevanagariCodes = mkPairs digraphedDevanagari
|
||||
|
||||
allDevanagari :: String
|
||||
allDevanagari = (map toEnum [0x0901 .. 0x0970])
|
||||
|
||||
mkPairs :: String -> [(Char, Char)]
|
||||
mkPairs str = case str of
|
||||
[] -> []
|
||||
c1 : c2 : cs -> (c1, c2) : mkPairs cs
|
||||
|
||||
@@ -1,72 +0,0 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : Ethiopic
|
||||
-- Maintainer : HH
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- > CVS $Date: 2005/04/21 16:23:35 $
|
||||
-- > CVS $Author: bringert $
|
||||
-- > CVS $Revision: 1.6 $
|
||||
--
|
||||
-- Ascii-Unicode decoding for Ethiopian.
|
||||
-- Copyright (c) Harald Hammarström 2003 under Gnu General Public License
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.Ethiopic (mkEthiopic) where
|
||||
|
||||
mkEthiopic :: String -> String
|
||||
mkEthiopic = digraphWordToUnicode . adHocToDigraphWord
|
||||
|
||||
-- mkEthiopic :: String -> String
|
||||
-- mkEthiopic = reverse . unwords . (map (digraphWordToUnicode . adHocToDigraphWord)) . words
|
||||
--- reverse : assumes everything's on same line
|
||||
|
||||
adHocToDigraphWord :: String -> [(Char, Int)]
|
||||
adHocToDigraphWord str = case str of
|
||||
[] -> []
|
||||
'<' : cs -> ('<', -1) : spoolMarkup cs
|
||||
c1 : cs | isVowel c1 -> (')', vowelOrder c1) : adHocToDigraphWord cs
|
||||
-- c1 isn't a vowel
|
||||
c1 : cs | not (elem c1 allEthiopicCodes) -> (c1, -1) : adHocToDigraphWord cs
|
||||
c1 : c2 : cs | isVowel c2 -> (c1, vowelOrder c2) : adHocToDigraphWord cs
|
||||
c1 : cs -> (c1, 5) : adHocToDigraphWord cs
|
||||
|
||||
spoolMarkup :: String -> [(Char, Int)]
|
||||
spoolMarkup s = case s of
|
||||
-- [] -> [] -- Shouldn't happen
|
||||
'>' : cs -> ('>', -1) : adHocToDigraphWord cs
|
||||
c1 : cs -> (c1, -1) : spoolMarkup cs
|
||||
|
||||
isVowel x = elem x "A\228ui\239aeoI"
|
||||
|
||||
vowelOrder :: Char -> Int
|
||||
vowelOrder x = case x of
|
||||
'A' -> 0
|
||||
'\228' -> 0 -- ä
|
||||
'u' -> 1
|
||||
'i' -> 2
|
||||
'a' -> 3
|
||||
'e' -> 4
|
||||
'I' -> 5
|
||||
'\239' -> 5 -- ï
|
||||
'o' -> 6
|
||||
c -> 5 -- vowelless
|
||||
|
||||
digraphWordToUnicode = map digraphToUnicode
|
||||
|
||||
digraphToUnicode :: (Char, Int) -> Char
|
||||
-- digraphToUnicode (c1, c2) = c1
|
||||
|
||||
digraphToUnicode (c1, -1) = c1
|
||||
digraphToUnicode (c1, c2) = toEnum (0x1200 + c2 + 8*case lookup c1 cc of Just c' -> c')
|
||||
where
|
||||
cc = zip allEthiopicCodes allEthiopic
|
||||
|
||||
allEthiopic :: [Int]
|
||||
allEthiopic = [0 .. 44] -- x 8
|
||||
|
||||
allEthiopicCodes = "hlHmLrs$KQ__bBtcxXnN)kW__w(zZyd_jgG_TCPSLfp"
|
||||
|
||||
-- Q = kW, X = xW, W = kW, G = gW
|
||||
|
||||
@@ -1,99 +0,0 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : ExtendedArabic
|
||||
-- Maintainer : (Maintainer)
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- > CVS $Date: 2005/04/21 16:23:36 $
|
||||
-- > CVS $Author: bringert $
|
||||
-- > CVS $Revision: 1.6 $
|
||||
--
|
||||
-- (Description of the module)
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.ExtendedArabic (mkArabic0600, mkExtendedArabic) where
|
||||
|
||||
mkArabic0600 :: String -> String
|
||||
mkArabic0600 = digraphWordToUnicode . aarnesToDigraphWord
|
||||
|
||||
aarnesToDigraphWord :: String -> [(Char, Char)]
|
||||
aarnesToDigraphWord str = case str of
|
||||
[] -> []
|
||||
'<' : cs -> ('\\', '<') : spoolMarkup2 cs
|
||||
|
||||
'v' : cs -> ('T', 'H') : aarnesToDigraphWord cs
|
||||
'a' : cs -> (' ', 'A') : aarnesToDigraphWord cs
|
||||
'o' : cs -> (' ', '3') : aarnesToDigraphWord cs
|
||||
'O' : cs -> ('\'', 'i') : aarnesToDigraphWord cs
|
||||
|
||||
'u' : cs -> ('\'', 'A') : aarnesToDigraphWord cs
|
||||
'C' : cs -> (' ', 'X') : aarnesToDigraphWord cs
|
||||
|
||||
'U' : cs -> ('~', 'A') : aarnesToDigraphWord cs
|
||||
'A' : cs -> ('"', 't') : aarnesToDigraphWord cs
|
||||
'c' : cs -> ('s', 'h') : aarnesToDigraphWord cs
|
||||
c : cs -> (' ', c) : aarnesToDigraphWord cs
|
||||
|
||||
mkExtendedArabic :: String -> String
|
||||
mkExtendedArabic = digraphWordToUnicode . adHocToDigraphWord
|
||||
|
||||
adHocToDigraphWord :: String -> [(Char, Char)]
|
||||
adHocToDigraphWord str = case str of
|
||||
[] -> []
|
||||
'<' : cs -> ('\\', '<') : spoolMarkup cs
|
||||
-- Sorani
|
||||
'W' : cs -> (':', 'w') : adHocToDigraphWord cs -- ?? Will do
|
||||
'E' : cs -> (' ', 'i') : adHocToDigraphWord cs -- ?? Letter missing!
|
||||
'j' : cs -> ('d', 'j') : adHocToDigraphWord cs
|
||||
'O' : cs -> ('v', 'w') : adHocToDigraphWord cs
|
||||
'F' : cs -> (' ', 'v') : adHocToDigraphWord cs
|
||||
'Z' : cs -> ('z', 'h') : adHocToDigraphWord cs
|
||||
'I' : cs -> (' ', 'i') : adHocToDigraphWord cs -- ?? Letter missing!
|
||||
'C' : cs -> ('c', 'h') : adHocToDigraphWord cs
|
||||
-- Pashto
|
||||
'e' : cs -> (':', 'y') : adHocToDigraphWord cs
|
||||
'$' : cs -> ('3', 'H') : adHocToDigraphWord cs
|
||||
'X' : cs -> ('s', '.') : adHocToDigraphWord cs
|
||||
'G' : cs -> ('z', '.') : adHocToDigraphWord cs
|
||||
'a' : cs -> (' ', 'A') : adHocToDigraphWord cs
|
||||
'P' : cs -> ('\'', 'H') : adHocToDigraphWord cs
|
||||
'R' : cs -> ('o', 'r') : adHocToDigraphWord cs
|
||||
-- Shared
|
||||
'A' : cs -> (' ', 'h') : adHocToDigraphWord cs -- ?? Maybe to "t or 0x06d5
|
||||
'c' : cs -> ('s', 'h') : adHocToDigraphWord cs
|
||||
c : cs -> (' ', c) : adHocToDigraphWord cs
|
||||
|
||||
|
||||
-- Beginning 0x621 up and including 0x06d1
|
||||
digraphedExtendedArabic = " '~A'A'w,A'i A b\"t tTHdj H X dDH r z ssh S D T Z 3GH__________ - f q k l m n h w i y&a&w&i/a/w/i/W/o/~/'/,/|/6/v_____________#0#1#2#3#4#5#6#7#8#9#%#,#'#*>b>q$|> A2'2,3'A'w'w&y'Tb:b:BoT3b p4b4B'H:H2H\"H3Hch4HTdod.dTD:d:D3d3D4dTrvror.rvRz.:rzh4zs.+s*S:S3S3T33>ff.f: v4f.q3q-k~kok.k3k3K gog:g:G3Gvl.l3l3L:n>nTnon3n?h4H't>Y\"Yow-wvwww|w^w:w3w>y/yvy.w:y3y____ -ae"
|
||||
|
||||
digraphWordToUnicode = map digraphToUnicode
|
||||
|
||||
digraphToUnicode :: (Char, Char) -> Char
|
||||
digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2
|
||||
where
|
||||
cc = zip allExtendedArabicCodes allExtendedArabic
|
||||
|
||||
allExtendedArabicCodes :: [(Char, Char)]
|
||||
allExtendedArabicCodes = mkPairs digraphedExtendedArabic
|
||||
|
||||
allExtendedArabic :: String
|
||||
allExtendedArabic = (map toEnum [0x0621 .. 0x06d1])
|
||||
|
||||
mkPairs :: String -> [(Char, Char)]
|
||||
mkPairs str = case str of
|
||||
[] -> []
|
||||
c1 : c2 : cs -> (c1, c2) : mkPairs cs
|
||||
|
||||
spoolMarkup :: String -> [(Char, Char)]
|
||||
spoolMarkup s = case s of
|
||||
[] -> [] -- Shouldn't happen
|
||||
'>' : cs -> ('\\', '>') : adHocToDigraphWord cs
|
||||
c1 : cs -> ('\\', c1) : spoolMarkup cs
|
||||
|
||||
spoolMarkup2 :: String -> [(Char, Char)]
|
||||
spoolMarkup2 s = case s of
|
||||
[] -> [] -- Shouldn't happen
|
||||
'>' : cs -> ('\\', '>') : aarnesToDigraphWord cs
|
||||
c1 : cs -> ('\\', c1) : spoolMarkup2 cs
|
||||
@@ -1,37 +0,0 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : ExtraDiacritics
|
||||
-- Maintainer : (Maintainer)
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- > CVS $Date: 2005/04/21 16:23:36 $
|
||||
-- > CVS $Author: bringert $
|
||||
-- > CVS $Revision: 1.5 $
|
||||
--
|
||||
-- (Description of the module)
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.ExtraDiacritics (mkExtraDiacritics) where
|
||||
|
||||
mkExtraDiacritics :: String -> String
|
||||
mkExtraDiacritics = mkExtraDiacriticsWord
|
||||
|
||||
mkExtraDiacriticsWord :: String -> String
|
||||
mkExtraDiacriticsWord str = case str of
|
||||
[] -> []
|
||||
'<' : cs -> '<' : spoolMarkup cs
|
||||
--
|
||||
'/' : cs -> toEnum 0x0301 : mkExtraDiacriticsWord cs
|
||||
'~' : cs -> toEnum 0x0306 : mkExtraDiacriticsWord cs
|
||||
':' : cs -> toEnum 0x0304 : mkExtraDiacriticsWord cs -- some of these could be put in LatinA
|
||||
'.' : cs -> toEnum 0x0323 : mkExtraDiacriticsWord cs
|
||||
'i' : '-' : cs -> toEnum 0x0268 : mkExtraDiacriticsWord cs -- in IPA extensions
|
||||
-- Default
|
||||
c : cs -> c : mkExtraDiacriticsWord cs
|
||||
|
||||
spoolMarkup :: String -> String
|
||||
spoolMarkup s = case s of
|
||||
[] -> [] -- Shouldn't happen
|
||||
'>' : cs -> '>' : mkExtraDiacriticsWord cs
|
||||
c1 : cs -> c1 : spoolMarkup cs
|
||||
@@ -1,172 +0,0 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : Greek
|
||||
-- Maintainer : (Maintainer)
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- > CVS $Date: 2005/04/21 16:23:37 $
|
||||
-- > CVS $Author: bringert $
|
||||
-- > CVS $Revision: 1.5 $
|
||||
--
|
||||
-- (Description of the module)
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.Greek (mkGreek) where
|
||||
|
||||
mkGreek :: String -> String
|
||||
mkGreek = unwords . (map mkGreekWord) . mkGravis . words
|
||||
|
||||
--- TODO : optimize character formation by factorizing the case expressions
|
||||
|
||||
type GreekChar = Char
|
||||
|
||||
mkGreekWord :: String -> [GreekChar]
|
||||
mkGreekWord = map (toEnum . mkGreekChar) . mkGreekSpec
|
||||
|
||||
mkGravis :: [String] -> [String]
|
||||
mkGravis [] = []
|
||||
mkGravis [w] = [w]
|
||||
mkGravis (w1:w2:ws)
|
||||
| stressed w2 = mkG w1 : mkGravis (w2:ws)
|
||||
| otherwise = w1 : w2 : mkGravis ws
|
||||
where
|
||||
stressed w = any (`elem` "'~`") w
|
||||
mkG :: String -> String
|
||||
mkG w = let (w1,w2) = span (/='\'') w in
|
||||
case w2 of
|
||||
'\'':v:cs | not (any isVowel cs) -> w1 ++ "`" ++ [v] ++ cs
|
||||
'\'':'!':v:cs | not (any isVowel cs) -> w1 ++ "`!" ++ [v] ++ cs
|
||||
_ -> w
|
||||
isVowel c = elem c "aehiouw"
|
||||
|
||||
mkGreekSpec :: String -> [(Char,Int)]
|
||||
mkGreekSpec str = case str of
|
||||
[] -> []
|
||||
'(' :'\'': '!' : c : cs -> (c,25) : mkGreekSpec cs
|
||||
'(' :'~' : '!' : c : cs -> (c,27) : mkGreekSpec cs
|
||||
'(' :'`' : '!' : c : cs -> (c,23) : mkGreekSpec cs
|
||||
'(' : '!' : c : cs -> (c,21) : mkGreekSpec cs
|
||||
')' :'\'': '!' : c : cs -> (c,24) : mkGreekSpec cs
|
||||
')' :'~' : '!' : c : cs -> (c,26) : mkGreekSpec cs
|
||||
')' :'`' : '!' : c : cs -> (c,22) : mkGreekSpec cs
|
||||
')' : '!' : c : cs -> (c,20) : mkGreekSpec cs
|
||||
'\'': '!' : c : cs -> (c,30) : mkGreekSpec cs
|
||||
'~' : '!' : c : cs -> (c,31) : mkGreekSpec cs
|
||||
'`' : '!' : c : cs -> (c,32) : mkGreekSpec cs
|
||||
'!' : c : cs -> (c,33) : mkGreekSpec cs
|
||||
'(' :'\'': c : cs -> (c,5) : mkGreekSpec cs
|
||||
'(' :'~' : c : cs -> (c,7) : mkGreekSpec cs
|
||||
'(' :'`' : c : cs -> (c,3) : mkGreekSpec cs
|
||||
'(' : c : cs -> (c,1) : mkGreekSpec cs
|
||||
')' :'\'': c : cs -> (c,4) : mkGreekSpec cs
|
||||
')' :'~' : c : cs -> (c,6) : mkGreekSpec cs
|
||||
')' :'`' : c : cs -> (c,2) : mkGreekSpec cs
|
||||
')' : c : cs -> (c,0) : mkGreekSpec cs
|
||||
'\'': c : cs -> (c,10) : mkGreekSpec cs
|
||||
'~' : c : cs -> (c,11) : mkGreekSpec cs
|
||||
'`' : c : cs -> (c,12) : mkGreekSpec cs
|
||||
c : cs -> (c,-1) : mkGreekSpec cs
|
||||
|
||||
mkGreekChar (c,-1) = case lookup c cc of Just c' -> c' ; _ -> fromEnum c
|
||||
where
|
||||
cc = zip "abgdezhqiklmnxoprjstyfcuw" allGreekMin
|
||||
mkGreekChar (c,n) = case (c,n) of
|
||||
('a',10) -> 0x03ac
|
||||
('a',11) -> 0x1fb6
|
||||
('a',12) -> 0x1f70
|
||||
('a',30) -> 0x1fb4
|
||||
('a',31) -> 0x1fb7
|
||||
('a',32) -> 0x1fb2
|
||||
('a',33) -> 0x1fb3
|
||||
('a',n) | n >19 -> 0x1f80 + n - 20
|
||||
('a',n) -> 0x1f00 + n
|
||||
('e',10) -> 0x03ad -- '
|
||||
-- ('e',11) -> 0x1fb6 -- ~ can't happen
|
||||
('e',12) -> 0x1f72 -- `
|
||||
('e',n) -> 0x1f10 + n
|
||||
('h',10) -> 0x03ae -- '
|
||||
('h',11) -> 0x1fc6 -- ~
|
||||
('h',12) -> 0x1f74 -- `
|
||||
|
||||
('h',30) -> 0x1fc4
|
||||
('h',31) -> 0x1fc7
|
||||
('h',32) -> 0x1fc2
|
||||
('h',33) -> 0x1fc3
|
||||
('h',n) | n >19 -> 0x1f90 + n - 20
|
||||
|
||||
('h',n) -> 0x1f20 + n
|
||||
('i',10) -> 0x03af -- '
|
||||
('i',11) -> 0x1fd6 -- ~
|
||||
('i',12) -> 0x1f76 -- `
|
||||
('i',n) -> 0x1f30 + n
|
||||
('o',10) -> 0x03cc -- '
|
||||
-- ('o',11) -> 0x1fb6 -- ~ can't happen
|
||||
('o',12) -> 0x1f78 -- `
|
||||
('o',n) -> 0x1f40 + n
|
||||
('y',10) -> 0x03cd -- '
|
||||
('y',11) -> 0x1fe6 -- ~
|
||||
('y',12) -> 0x1f7a -- `
|
||||
('y',n) -> 0x1f50 + n
|
||||
('w',10) -> 0x03ce -- '
|
||||
('w',11) -> 0x1ff6 -- ~
|
||||
('w',12) -> 0x1f7c -- `
|
||||
|
||||
('w',30) -> 0x1ff4
|
||||
('w',31) -> 0x1ff7
|
||||
('w',32) -> 0x1ff2
|
||||
('w',33) -> 0x1ff3
|
||||
('w',n) | n >19 -> 0x1fa0 + n - 20
|
||||
|
||||
('w',n) -> 0x1f60 + n
|
||||
('r',1) -> 0x1fe5
|
||||
_ -> mkGreekChar (c,-1) --- should not happen
|
||||
|
||||
allGreekMin :: [Int]
|
||||
allGreekMin = [0x03b1 .. 0x03c9]
|
||||
|
||||
|
||||
{-
|
||||
encoding of Greek writing. Those hard to guess are marked with ---
|
||||
|
||||
maj min
|
||||
A a Alpha 0391 03b1
|
||||
B b Beta 0392 03b2
|
||||
G g Gamma 0393 03b3
|
||||
D d Delta 0394 03b4
|
||||
E e Epsilon 0395 03b5
|
||||
Z z Zeta 0396 03b6
|
||||
H h Eta --- 0397 03b7
|
||||
Q q Theta --- 0398 03b8
|
||||
I i Iota 0399 03b9
|
||||
K k Kappa 039a 03ba
|
||||
L l Lambda 039b 03bb
|
||||
M m My 039c 03bc
|
||||
N n Ny 039d 03bd
|
||||
X x Xi 039e 03be
|
||||
O o Omikron 039f 03bf
|
||||
P p Pi 03a0 03c0
|
||||
R r Rho 03a1 03c1
|
||||
j Sigma --- 03c2
|
||||
S s Sigma 03a3 03c3
|
||||
T t Tau 03a4 03c4
|
||||
Y y Ypsilon 03a5 03c5
|
||||
F f Phi 03a6 03c6
|
||||
C c Khi --- 03a7 03c7
|
||||
U u Psi 03a8 03c8
|
||||
W w Omega --- 03a9 03c9
|
||||
|
||||
( spiritus asper
|
||||
) spiritus lenis
|
||||
! iota subscriptum
|
||||
|
||||
' acutus
|
||||
~ circumflexus
|
||||
` gravis
|
||||
|
||||
-}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,53 +0,0 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : Hebrew
|
||||
-- Maintainer : (Maintainer)
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- > CVS $Date: 2005/04/21 16:23:37 $
|
||||
-- > CVS $Author: bringert $
|
||||
-- > CVS $Revision: 1.8 $
|
||||
--
|
||||
-- (Description of the module)
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.Hebrew (mkHebrew) where
|
||||
|
||||
mkHebrew :: String -> String
|
||||
mkHebrew = mkHebrewWord
|
||||
----mkHebrew = reverse . mkHebrewWord
|
||||
--- reverse : assumes everything's on same line
|
||||
|
||||
type HebrewChar = Char
|
||||
|
||||
-- HH 031103 added code for spooling the markup
|
||||
-- removed reverse, words, unwords
|
||||
-- (seemed obsolete and come out wrong on the screen)
|
||||
-- AR 26/1/2004 put reverse back - needed in Fudgets (but not in Java?)
|
||||
|
||||
mkHebrewWord :: String -> [HebrewChar]
|
||||
-- mkHebrewWord = map mkHebrewChar
|
||||
|
||||
mkHebrewWord s = case s of
|
||||
[] -> []
|
||||
'<' : cs -> '<' : spoolMarkup cs
|
||||
' ' : cs -> ' ' : mkHebrewWord cs
|
||||
c1 : cs -> mkHebrewChar c1 : mkHebrewWord cs
|
||||
|
||||
spoolMarkup :: String -> String
|
||||
spoolMarkup s = case s of
|
||||
[] -> [] -- Shouldn't happen
|
||||
'>' : cs -> '>' : mkHebrewWord cs
|
||||
c1 : cs -> c1 : spoolMarkup cs
|
||||
|
||||
mkHebrewChar c = case lookup c cc of Just c' -> c' ; _ -> c
|
||||
where
|
||||
cc = zip allHebrewCodes allHebrew
|
||||
|
||||
allHebrewCodes = "-abgdhwzHTyKklMmNnSoPpCcqrst"
|
||||
|
||||
allHebrew :: String
|
||||
allHebrew = (map toEnum (0x05be : [0x05d0 .. 0x05ea]))
|
||||
|
||||
|
||||
@@ -1,95 +0,0 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : Hiragana
|
||||
-- Maintainer : (Maintainer)
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- > CVS $Date: 2005/04/21 16:23:38 $
|
||||
-- > CVS $Author: bringert $
|
||||
-- > CVS $Revision: 1.5 $
|
||||
--
|
||||
-- (Description of the module)
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.Hiragana (mkJapanese) where
|
||||
|
||||
-- long vowel romaaji must be ei, ou not ee, oo
|
||||
|
||||
mkJapanese :: String -> String
|
||||
mkJapanese = digraphWordToUnicode . romaajiToDigraphWord
|
||||
|
||||
romaajiToDigraphWord :: String -> [(Char, Char)]
|
||||
romaajiToDigraphWord str = case str of
|
||||
[] -> []
|
||||
'<' : cs -> ('\\', '<') : spoolMarkup cs
|
||||
' ' : cs -> ('\\', ' ') : romaajiToDigraphWord cs
|
||||
|
||||
c1 : cs | isVowel c1 -> (' ', cap c1) : romaajiToDigraphWord cs
|
||||
|
||||
-- The combinations
|
||||
c1 : 'y' : c2 : cs -> (c1, 'i') : ('y', cap c2) : romaajiToDigraphWord cs
|
||||
|
||||
's' : 'h' : 'a' : cs -> ('S', 'i') : ('y', 'A') : romaajiToDigraphWord cs
|
||||
'c' : 'h' : 'a' : cs -> ('C', 'i') : ('y', 'A') : romaajiToDigraphWord cs
|
||||
'j' : 'a' : cs -> ('j', 'i') : ('y', 'A') : romaajiToDigraphWord cs
|
||||
|
||||
's' : 'h' : 'u' : cs -> ('S', 'i') : ('y', 'U') : romaajiToDigraphWord cs
|
||||
'c' : 'h' : 'u' : cs -> ('C', 'i') : ('y', 'U') : romaajiToDigraphWord cs
|
||||
'j' : 'u' : cs -> ('j', 'i') : ('y', 'U') : romaajiToDigraphWord cs
|
||||
|
||||
's' : 'h' : 'o' : cs -> ('S', 'i') : ('y', 'O') : romaajiToDigraphWord cs
|
||||
'c' : 'h' : 'o' : cs -> ('C', 'i') : ('y', 'O') : romaajiToDigraphWord cs
|
||||
'j' : 'o' : cs -> ('j', 'i') : ('y', 'O') : romaajiToDigraphWord cs
|
||||
|
||||
'd' : 'z' : c3 : cs -> ('D', c3) : romaajiToDigraphWord cs
|
||||
't' : 's' : c3 : cs -> ('T', c3) : romaajiToDigraphWord cs
|
||||
'c' : 'h' : c3 : cs -> ('C', c3) : romaajiToDigraphWord cs
|
||||
's' : 'h' : c3 : cs -> ('S', c3) : romaajiToDigraphWord cs
|
||||
'z' : 'h' : c3 : cs -> ('Z', c3) : romaajiToDigraphWord cs
|
||||
|
||||
c1 : ' ' : cs -> (' ', c1) : ('\\', ' ') : romaajiToDigraphWord cs -- n
|
||||
c1 : [] -> [(' ', c1)] -- n
|
||||
|
||||
c1 : c2 : cs | isVowel c2 -> (c1, c2) : romaajiToDigraphWord cs
|
||||
c1 : c2 : cs | c1 == c2 -> ('T', 'U') : romaajiToDigraphWord (c2 : cs) -- double cons
|
||||
c1 : cs -> (' ', c1) : romaajiToDigraphWord cs -- n
|
||||
|
||||
isVowel x = elem x "aeiou"
|
||||
cap :: Char -> Char
|
||||
cap x = case x of
|
||||
'a' -> 'A'
|
||||
'e' -> 'E'
|
||||
'i' -> 'I'
|
||||
'o' -> 'O'
|
||||
'u' -> 'U'
|
||||
c -> c
|
||||
|
||||
spoolMarkup :: String -> [(Char, Char)]
|
||||
spoolMarkup s = case s of
|
||||
-- [] -> [] -- Shouldn't happen
|
||||
'>' : cs -> ('\\', '>') : romaajiToDigraphWord cs
|
||||
c1 : cs -> ('\\', c1) : spoolMarkup cs
|
||||
|
||||
digraphWordToUnicode :: [(Char, Char)] -> String
|
||||
digraphWordToUnicode = map digraphToUnicode
|
||||
|
||||
digraphToUnicode :: (Char, Char) -> Char
|
||||
digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2
|
||||
where
|
||||
cc = zip allHiraganaCodes allHiragana
|
||||
|
||||
allHiraganaCodes :: [(Char, Char)]
|
||||
allHiraganaCodes = mkPairs digraphedHiragana
|
||||
|
||||
allHiragana :: String
|
||||
allHiragana = (map toEnum [0x3041 .. 0x309f])
|
||||
|
||||
mkPairs :: String -> [(Char, Char)]
|
||||
mkPairs str = case str of
|
||||
[] -> []
|
||||
c1 : c2 : cs -> (c1, c2) : mkPairs cs
|
||||
|
||||
digraphedHiragana = " a A i I u U e E o OkagakigikugukegekogosazaSiZisuzusezesozotadaCijiTUTuDutedetodonaninunenohabapahibipihubupuhebepehobopomamimumemoyAyayUyuyOyorarirurerowaWawiwewo nvukAkE____<< o>>o >'> b"
|
||||
|
||||
|
||||
@@ -1,69 +0,0 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : LatinASupplement
|
||||
-- Maintainer : (Maintainer)
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- > CVS $Date: 2005/04/21 16:23:39 $
|
||||
-- > CVS $Author: bringert $
|
||||
-- > CVS $Revision: 1.5 $
|
||||
--
|
||||
-- (Description of the module)
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.LatinASupplement (mkLatinASupplement) where
|
||||
|
||||
mkLatinASupplement :: String -> String
|
||||
mkLatinASupplement = mkLatinASupplementWord
|
||||
|
||||
mkLatinASupplementWord :: String -> String
|
||||
mkLatinASupplementWord str = case str of
|
||||
[] -> []
|
||||
'<' : cs -> '<' : spoolMarkup cs
|
||||
-- Romanian & partly Turkish
|
||||
's' : ',' : cs -> toEnum 0x015f : mkLatinASupplementWord cs
|
||||
'a' : '%' : cs -> toEnum 0x0103 : mkLatinASupplementWord cs
|
||||
-- Slavic and more
|
||||
'c' : '^' : cs -> toEnum 0x010d : mkLatinASupplementWord cs
|
||||
's' : '^' : cs -> toEnum 0x0161 : mkLatinASupplementWord cs
|
||||
'c' : '\'' : cs -> toEnum 0x0107 : mkLatinASupplementWord cs
|
||||
'z' : '^' : cs -> toEnum 0x017e : mkLatinASupplementWord cs
|
||||
-- Turkish
|
||||
'g' : '%' : cs -> toEnum 0x011f : mkLatinASupplementWord cs
|
||||
'I' : cs -> toEnum 0x0131 : mkLatinASupplementWord cs
|
||||
'c' : ',' : cs -> toEnum 0x00e7 : mkLatinASupplementWord cs
|
||||
-- Polish
|
||||
'e' : ',' : cs -> toEnum 0x0119 : mkLatinASupplementWord cs
|
||||
'a' : ',' : cs -> toEnum 0x0105 : mkLatinASupplementWord cs
|
||||
'l' : '/' : cs -> toEnum 0x0142 : mkLatinASupplementWord cs
|
||||
'z' : '.' : cs -> toEnum 0x017c : mkLatinASupplementWord cs
|
||||
'n' : '\'' : cs -> toEnum 0x0144 : mkLatinASupplementWord cs
|
||||
's' : '\'' : cs -> toEnum 0x015b : mkLatinASupplementWord cs
|
||||
-- 'c' : '\'' : cs -> toEnum 0x0107 : mkLatinASupplementWord cs
|
||||
|
||||
-- Hungarian
|
||||
'o' : '%' : cs -> toEnum 0x0151 : mkLatinASupplementWord cs
|
||||
'u' : '%' : cs -> toEnum 0x0171 : mkLatinASupplementWord cs
|
||||
|
||||
-- Mongolian
|
||||
'j' : '^' : cs -> toEnum 0x0135 : mkLatinASupplementWord cs
|
||||
|
||||
-- Khowar (actually in Combining diacritical marks not Latin-A Suppl.)
|
||||
'o' : '.' : cs -> 'o' : (toEnum 0x0323 : mkLatinASupplementWord cs)
|
||||
|
||||
-- Length bars over vowels e.g korean
|
||||
'a' : ':' : cs -> toEnum 0x0101 : mkLatinASupplementWord cs
|
||||
'e' : ':' : cs -> toEnum 0x0113 : mkLatinASupplementWord cs
|
||||
'i' : ':' : cs -> toEnum 0x012b : mkLatinASupplementWord cs
|
||||
'o' : ':' : cs -> toEnum 0x014d : mkLatinASupplementWord cs
|
||||
'u' : ':' : cs -> toEnum 0x016b : mkLatinASupplementWord cs
|
||||
|
||||
-- Default
|
||||
c : cs -> c : mkLatinASupplementWord cs
|
||||
|
||||
spoolMarkup :: String -> String
|
||||
spoolMarkup s = case s of
|
||||
[] -> [] -- Shouldn't happen
|
||||
'>' : cs -> '>' : mkLatinASupplementWord cs
|
||||
c1 : cs -> c1 : spoolMarkup cs
|
||||
@@ -1,47 +0,0 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Maintainer : (Maintainer)
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- > CVS $Date: 2005/04/21 16:23:39 $
|
||||
-- > CVS $Author: bringert $
|
||||
-- > CVS $Revision: 1.6 $
|
||||
--
|
||||
-- (Description of the module)
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.OCSCyrillic (mkOCSCyrillic) where
|
||||
|
||||
mkOCSCyrillic :: String -> String
|
||||
mkOCSCyrillic = mkOCSCyrillicWord
|
||||
|
||||
mkOCSCyrillicWord :: String -> String
|
||||
mkOCSCyrillicWord str = case str of
|
||||
[] -> []
|
||||
' ' : cs -> ' ' : mkOCSCyrillicWord cs
|
||||
'<' : cs -> '<' : spoolMarkup cs
|
||||
'\228' : cs -> toEnum 0x0463 : mkOCSCyrillicWord cs -- ä
|
||||
'j' : 'e' : '~' : cs -> toEnum 0x0469 : mkOCSCyrillicWord cs
|
||||
'j' : 'o' : '~' : cs -> toEnum 0x046d : mkOCSCyrillicWord cs
|
||||
'j' : 'e' : cs -> toEnum 0x0465 : mkOCSCyrillicWord cs
|
||||
'e' : '~' : cs -> toEnum 0x0467 : mkOCSCyrillicWord cs
|
||||
'o' : '~' : cs -> toEnum 0x046b : mkOCSCyrillicWord cs
|
||||
'j' : 'u' : cs -> toEnum 0x044e : mkOCSCyrillicWord cs
|
||||
'j' : 'a' : cs -> toEnum 0x044f : mkOCSCyrillicWord cs
|
||||
'u' : cs -> toEnum 0x0479 : mkOCSCyrillicWord cs
|
||||
c : cs -> (mkOCSCyrillicChar c) : mkOCSCyrillicWord cs
|
||||
|
||||
spoolMarkup :: String -> String
|
||||
spoolMarkup s = case s of
|
||||
[] -> [] -- Shouldn't happen
|
||||
'>' : cs -> '>' : mkOCSCyrillicWord cs
|
||||
c1 : cs -> c1 : spoolMarkup cs
|
||||
|
||||
mkOCSCyrillicChar :: Char -> Char
|
||||
mkOCSCyrillicChar c = case lookup c cc of Just c' -> c' ; _ -> c
|
||||
where
|
||||
cc = zip "abvgdeZziJklmnoprstYfxCqwWUyIE" allOCSCyrillic
|
||||
|
||||
allOCSCyrillic :: String
|
||||
allOCSCyrillic = (map toEnum [0x0430 .. 0x044e])
|
||||
@@ -1,56 +0,0 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : Russian
|
||||
-- Maintainer : (Maintainer)
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- > CVS $Date: 2005/04/21 16:23:40 $
|
||||
-- > CVS $Author: bringert $
|
||||
-- > CVS $Revision: 1.5 $
|
||||
--
|
||||
-- (Description of the module)
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.Russian (mkRussian, mkRusKOI8) where
|
||||
|
||||
-- | an ad hoc ASCII encoding. Delimiters: @\/_ _\/@
|
||||
mkRussian :: String -> String
|
||||
mkRussian = unwords . (map mkRussianWord) . words
|
||||
|
||||
-- | the KOI8 encoding, incomplete. Delimiters: @\/* *\/@
|
||||
mkRusKOI8 :: String -> String
|
||||
mkRusKOI8 = unwords . (map mkRussianKOI8) . words
|
||||
|
||||
type RussianChar = Char
|
||||
|
||||
mkRussianWord :: String -> [RussianChar]
|
||||
mkRussianWord = map (mkRussianChar allRussianCodes)
|
||||
|
||||
mkRussianKOI8 :: String -> [RussianChar]
|
||||
mkRussianKOI8 = map (mkRussianChar allRussianKOI8)
|
||||
|
||||
mkRussianChar chars c = case lookup c cc of Just c' -> c' ; _ -> c
|
||||
where
|
||||
cc = zip chars allRussian
|
||||
|
||||
allRussianCodes :: [Char]
|
||||
allRussianCodes =
|
||||
-- changed to Ints to work with Haskell compilers e.g. GHC 6.5 CVS
|
||||
-- which expect source files to be in UTF-8
|
||||
-- /bringert 2006-05-19
|
||||
-- "ÅåABVGDEXZIJKLMNOPRSTUFHCQW£}!*ÖYÄabvgdexzijklmnoprstufhcqw#01'öyä"
|
||||
map toEnum [197,229,65,66,86,71,68,69,88,90,73,74,75,76,77,78,79,80,82,83,84,85,70,72,67,81,87,163,125,33,42,214,89,196,97,98,118,103,100,101,120,122,105,106,107,108,109,110,111,112,114,115,116,117,102,104,99,113,119,35,48,49,39,246,121,228]
|
||||
|
||||
allRussianKOI8 :: [Char]
|
||||
allRussianKOI8 =
|
||||
-- changed to Ints to work with Haskell compilers e.g. GHC 6.5 CVS
|
||||
-- which expect source files to be in UTF-8
|
||||
-- /bringert 2006-05-19
|
||||
-- "^@áâ÷çäåöúéêëìíîïðòóôõæèãþûýøùÿüàñÁÂ×ÇÄÅÖÚÉÊËÌÍÎÏÐÒÓÔÕÆÈÃÞÛÝØÙßÜÀÑ"
|
||||
map toEnum [94,64,225,226,247,231,228,229,246,250,233,234,235,236,237,238,239,240,242,243,244,245,230,232,227,254,251,253,248,249,255,252,224,241,193,194,215,199,196,197,214,218,201,202,203,204,205,206,207,208,210,211,212,213,198,200,195,222,219,221,216,217,223,220,192,209]
|
||||
|
||||
allRussian :: String
|
||||
allRussian = (map toEnum (0x0401:0x0451:[0x0410 .. 0x044f])) -- Ëë in odd places
|
||||
|
||||
|
||||
@@ -1,77 +0,0 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : Tamil
|
||||
-- Maintainer : (Maintainer)
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- > CVS $Date: 2005/04/21 16:23:40 $
|
||||
-- > CVS $Author: bringert $
|
||||
-- > CVS $Revision: 1.5 $
|
||||
--
|
||||
-- (Description of the module)
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.Tamil (mkTamil) where
|
||||
|
||||
mkTamil :: String -> String
|
||||
mkTamil = digraphWordToUnicode . adHocToDigraphWord
|
||||
|
||||
adHocToDigraphWord :: String -> [(Char, Char)]
|
||||
adHocToDigraphWord str = case str of
|
||||
[] -> []
|
||||
'<' : cs -> ('\\', '<') : spoolMarkup cs
|
||||
' ' : cs -> ('\\', ' ') : adHocToDigraphWord cs -- skip space
|
||||
|
||||
-- if c1 is a vowel
|
||||
-- Two of the same vowel => lengthening
|
||||
c1 : c2 : cs | c1 == c2 && isVowel c1 -> (cap c1, ':') : adHocToDigraphWord cs
|
||||
-- digraphed or long vowel
|
||||
c1 : c2 : cs | isVowel c1 && isVowel c2 -> (cap c1, cap c2) : adHocToDigraphWord cs
|
||||
c1 : cs | isVowel c1 -> (' ', cap c1) : adHocToDigraphWord cs
|
||||
|
||||
-- c1 isn't a vowel
|
||||
c1 : c2 : c3 : cs | c2 == c3 && isVowel c2 -> (' ', c1) : (c2, ':') : adHocToDigraphWord cs
|
||||
c1 : c2 : c3 : cs | isVowel c2 && isVowel c3 -> (' ', c1) : (c2, c3) : adHocToDigraphWord cs
|
||||
c1 : 'a' : cs -> (' ', c1) : adHocToDigraphWord cs -- a inherent
|
||||
c1 : c2 : cs | isVowel c2 -> (' ', c1) : (' ', c2) : adHocToDigraphWord cs
|
||||
|
||||
c1 : cs -> (' ', c1) : (' ', '.') : adHocToDigraphWord cs -- vowelless
|
||||
|
||||
isVowel x = elem x "aeiou:"
|
||||
cap :: Char -> Char
|
||||
cap x = case x of
|
||||
'a' -> 'A'
|
||||
'e' -> 'E'
|
||||
'i' -> 'I'
|
||||
'o' -> 'O'
|
||||
'u' -> 'U'
|
||||
c -> c
|
||||
|
||||
spoolMarkup :: String -> [(Char, Char)]
|
||||
spoolMarkup s = case s of
|
||||
-- [] -> [] -- Shouldn't happen
|
||||
'>' : cs -> ('\\', '>') : adHocToDigraphWord cs
|
||||
c1 : cs -> ('\\', c1) : spoolMarkup cs
|
||||
|
||||
digraphWordToUnicode :: [(Char, Char)] -> String
|
||||
digraphWordToUnicode = map digraphToUnicode
|
||||
|
||||
digraphToUnicode :: (Char, Char) -> Char
|
||||
digraphToUnicode (c1, c2) = case lookup (c1, c2) cc of Just c' -> c' ; _ -> c2
|
||||
where
|
||||
cc = zip allTamilCodes allTamil
|
||||
|
||||
mkPairs :: String -> [(Char, Char)]
|
||||
mkPairs str = case str of
|
||||
[] -> []
|
||||
c1 : c2 : cs -> (c1, c2) : mkPairs cs
|
||||
|
||||
allTamilCodes :: [(Char, Char)]
|
||||
allTamilCodes = mkPairs digraphedTamil
|
||||
|
||||
allTamil :: String
|
||||
allTamil = (map toEnum [0x0b85 .. 0x0bfa])
|
||||
|
||||
digraphedTamil = " AA: II: UU:______ EE:AI__ OO:AU k______ G c__ j__ \241 T______ N t______ V n p______ m y r l L M v__ s S h________a: ii: uu:______ ee:ai__ oo:au .__________________ :______________________________#1#2#3#4#5#6#7#8#9^1^2^3=d=m=y=d=c==ru##"
|
||||
|
||||
@@ -1,149 +0,0 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : Text
|
||||
-- Maintainer : AR
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- > CVS $Date: 2005/06/23 14:32:44 $
|
||||
-- > CVS $Author: aarne $
|
||||
-- > CVS $Revision: 1.10 $
|
||||
--
|
||||
-- elementary text postprocessing. AR 21\/11\/2001.
|
||||
--
|
||||
-- This is very primitive indeed. The functions should work on
|
||||
-- token lists and not on strings. AR 5\/12\/2002
|
||||
--
|
||||
-- XML hack 14\/8\/2004; not in use yet
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.Text (untokWithXML,
|
||||
exceptXML,
|
||||
formatAsTextLit,
|
||||
formatAsCodeLit,
|
||||
formatAsText,
|
||||
formatAsHTML,
|
||||
formatAsLatex,
|
||||
formatAsCode,
|
||||
performBinds,
|
||||
performBindsFinnish,
|
||||
unStringLit,
|
||||
concatRemSpace
|
||||
) where
|
||||
|
||||
import GF.Data.Operations
|
||||
import Data.Char
|
||||
|
||||
-- | does not apply untokenizer within XML tags --- heuristic "< "
|
||||
-- this function is applied from top level...
|
||||
untokWithXML :: (String -> String) -> String -> String
|
||||
untokWithXML unt s = case s of
|
||||
'<':cs@(c:_) | isAlpha c -> '<':beg ++ ">" ++ unto (drop 1 rest) where
|
||||
(beg,rest) = span (/='>') cs
|
||||
'<':cs -> '<':unto cs ---
|
||||
[] -> []
|
||||
_ -> unt beg ++ unto rest where
|
||||
(beg,rest) = span (/='<') s
|
||||
where
|
||||
unto = untokWithXML unt
|
||||
|
||||
-- | ... whereas this one is embedded on a branch
|
||||
exceptXML :: (String -> String) -> String -> String
|
||||
exceptXML unt s = '<':beg ++ ">" ++ unt (drop 1 rest) where
|
||||
(beg,rest) = span (/='>') s
|
||||
|
||||
formatAsTextLit :: String -> String
|
||||
formatAsTextLit = formatAsText . unwords . map unStringLit . words
|
||||
--- hope that there will be deforestation...
|
||||
|
||||
formatAsCodeLit :: String -> String
|
||||
formatAsCodeLit = formatAsCode . unwords . map unStringLit . words
|
||||
|
||||
formatAsText,formatAsHTML,formatAsLatex :: String -> String
|
||||
formatAsText = formatAsTextGen (const False) (=="&-")
|
||||
formatAsHTML = formatAsTextGen (\s -> take 1 s == "<" || last s == '>') (const False)
|
||||
formatAsLatex = formatAsTextGen ((=="\\") . take 1) (const False)
|
||||
|
||||
formatAsTextGen :: (String -> Bool) -> (String -> Bool) -> String -> String
|
||||
formatAsTextGen tag para = unwords . format . cap . words where
|
||||
format ws = case ws of
|
||||
w : ww | capit w -> format $ (cap ww)
|
||||
w : c : ww | major c -> format $ (w ++ c) :(cap ww)
|
||||
w : c : ww | minor c -> format $ (w ++ c) : ww
|
||||
p : c : ww | openp p -> format $ (p ++ c) :ww
|
||||
p : c : ww | spanish p -> format $ (p ++ concat (cap [c])) :ww
|
||||
c : ww | para c -> "\n\n" : format ww
|
||||
w : ww -> w : format ww
|
||||
[] -> []
|
||||
cap (p:ww) | tag p = p : cap ww
|
||||
cap ((c:cs):ww) = (toUpper c : cs) : ww
|
||||
cap [] = []
|
||||
capit = (=="&|")
|
||||
major = flip elem (map singleton ".!?")
|
||||
minor = flip elem (map singleton ",:;)")
|
||||
openp = all (flip elem "(")
|
||||
spanish = all (flip elem "\161\191")
|
||||
|
||||
formatAsCode :: String -> String
|
||||
formatAsCode = rend 0 . words where
|
||||
-- render from BNF Converter
|
||||
rend i ss = case ss of
|
||||
"[" :ts -> cons "[" $ rend i ts
|
||||
"(" :ts -> cons "(" $ rend i ts
|
||||
"{" :ts -> cons "{" $ new (i+1) $ rend (i+1) ts
|
||||
"}" : ";":ts -> new (i-1) $ space "}" $ cons ";" $ new (i-1) $ rend (i-1) ts
|
||||
"}" :ts -> new (i-1) $ cons "}" $ new (i-1) $ rend (i-1) ts
|
||||
";" :ts -> cons ";" $ new i $ rend i ts
|
||||
t : "," :ts -> cons t $ space "," $ rend i ts
|
||||
t : ")" :ts -> cons t $ cons ")" $ rend i ts
|
||||
t : "]" :ts -> cons t $ cons "]" $ rend i ts
|
||||
t :ts -> space t $ rend i ts
|
||||
_ -> ""
|
||||
cons s t = s ++ t
|
||||
new i s = '\n' : replicate (2*i) ' ' ++ dropWhile isSpace s
|
||||
space t s = if null s then t else t ++ " " ++ s
|
||||
|
||||
performBinds :: String -> String
|
||||
performBinds = performBindsOpt (\x y -> y)
|
||||
|
||||
|
||||
-- The function defines an effect of the former on the latter part,
|
||||
-- such as in vowel harmony. It is triggered by the binder token "&*"
|
||||
|
||||
performBindsOpt :: (String -> String -> String) -> String -> String
|
||||
performBindsOpt harm = unwords . format . words where
|
||||
format ws = case ws of
|
||||
w : "&+" : u : ws -> format ((w ++ u) : ws)
|
||||
w : "&*" : u : ws -> format ((w ++ harm w u) : ws)
|
||||
w : ws -> w : format ws
|
||||
[] -> []
|
||||
|
||||
-- unlexer for Finnish particles
|
||||
-- Notice: left associativity crucial for "tie &* ko &* han" --> "tieköhän"
|
||||
|
||||
performBindsFinnish :: String -> String
|
||||
performBindsFinnish = performBindsOpt vowelHarmony where
|
||||
vowelHarmony w p = if any (flip elem "aouAOU") w then p else map toFront p
|
||||
toFront c = case c of
|
||||
'A' -> '\196'
|
||||
'O' -> '\214'
|
||||
'a' -> '\228'
|
||||
'o' -> '\246'
|
||||
_ -> c
|
||||
|
||||
unStringLit :: String -> String
|
||||
unStringLit s = case s of
|
||||
c : cs | strlim c && strlim (last cs) -> init cs
|
||||
_ -> s
|
||||
where
|
||||
strlim = (=='\'')
|
||||
|
||||
concatRemSpace :: String -> String
|
||||
concatRemSpace = concat . words
|
||||
{-
|
||||
concatRemSpace s = case s of
|
||||
'<':cs -> exceptXML concatRemSpace cs
|
||||
c : cs | isSpace c -> concatRemSpace cs
|
||||
c :cs -> c : concatRemSpace cs
|
||||
_ -> s
|
||||
-}
|
||||
@@ -1,368 +0,0 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : Thai
|
||||
-- Maintainer : (Maintainer)
|
||||
-- Stability : (experimental)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
--
|
||||
-- Thai transliteration and other alphabet information.
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
-- AR 27/12/2006. Execute test2 to see the transliteration table.
|
||||
|
||||
module GF.Text.Thai (
|
||||
mkThai,mkThaiWord,mkThaiPron,mkThaiFake,thaiFile,thaiPronFile,thaiFakeFile
|
||||
) where
|
||||
|
||||
import qualified Data.Map as Map
|
||||
import Data.Char
|
||||
|
||||
-- for testing
|
||||
import GF.Text.UTF8
|
||||
import Data.List
|
||||
|
||||
import Debug.Trace
|
||||
|
||||
|
||||
mkThai :: String -> String
|
||||
mkThai = concat . map mkThaiWord . words
|
||||
mkThaiPron = unwords . map mkPronSyllable . words
|
||||
mkThaiFake = unwords . map (fakeEnglish . mkPronSyllable) . words
|
||||
|
||||
|
||||
type ThaiChar = Char
|
||||
|
||||
mkThaiWord :: String -> [ThaiChar]
|
||||
mkThaiWord = map (toEnum . mkThaiChar) . unchar . snd . pronAndOrth
|
||||
|
||||
mkThaiChar :: String -> Int
|
||||
mkThaiChar c = maybe 0 id $ Map.lookup c thaiMap
|
||||
|
||||
thaiMap :: Map.Map String Int
|
||||
thaiMap = Map.fromList $ zip allThaiTrans allThaiCodes
|
||||
|
||||
-- convert all string literals in a text
|
||||
|
||||
thaiStrings :: String -> String
|
||||
thaiStrings = convStrings mkThai
|
||||
|
||||
thaiPronStrings :: String -> String
|
||||
thaiPronStrings = convStrings mkThaiPron
|
||||
|
||||
convStrings conv s = case s of
|
||||
'"':cs -> let (t,_:r) = span (/='"') cs in
|
||||
'"': conv t ++ "\"" ++ convStrings conv r
|
||||
c:cs -> c : convStrings conv cs
|
||||
_ -> s
|
||||
|
||||
|
||||
-- each character is either [letter] or [letter+nonletter]
|
||||
|
||||
unchar :: String -> [String]
|
||||
unchar s = case s of
|
||||
c:d:cs
|
||||
| isAlpha d -> [c] : unchar (d:cs)
|
||||
| d == '?' -> unchar cs -- use "o?" to represent implicit 'o'
|
||||
| otherwise -> [c,d] : unchar cs
|
||||
[_] -> [s]
|
||||
_ -> []
|
||||
|
||||
-- you can prefix transliteration by irregular phonology in []
|
||||
|
||||
pronAndOrth :: String -> (Maybe String, String)
|
||||
pronAndOrth s = case s of
|
||||
'[':cs -> case span (/=']') cs of
|
||||
(p,_:o) -> (Just p,o)
|
||||
_ -> (Nothing,s)
|
||||
_ -> (Nothing,s)
|
||||
|
||||
allThaiTrans :: [String]
|
||||
allThaiTrans = words $
|
||||
"- k k1 - k2 - k3 g c c1 c2 s' c3 y' d' t' " ++
|
||||
"t1 t2 t3 n' d t t4 t5 t6 n b p p1 f p2 f' " ++
|
||||
"p3 m y r - l - w s- s. s h l' O h' - " ++
|
||||
"a. a a: a+ i i: v v: u u: - - - - - - " ++
|
||||
"e e' o: a% a& L R S T1 T2 T3 T4 K - - - " ++
|
||||
"N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 - - - - - - "
|
||||
|
||||
allThaiCodes :: [Int]
|
||||
allThaiCodes = [0x0e00 .. 0x0e7f]
|
||||
|
||||
|
||||
---------------------
|
||||
-- heuristic pronunciation of codes
|
||||
---------------------
|
||||
|
||||
-- fake English for TTS, a la Teach Yourself Thai
|
||||
|
||||
fakeEnglish :: String -> String
|
||||
fakeEnglish s = case s of
|
||||
'a':'a':cs -> "ah" ++ fakeEnglish cs
|
||||
'a':'y':cs -> "ai" ++ fakeEnglish cs
|
||||
'a' :cs -> "ah" ++ fakeEnglish cs
|
||||
'c':'h':cs -> "ch" ++ fakeEnglish cs
|
||||
'c' :cs -> "j" ++ fakeEnglish cs
|
||||
'e':'e':cs -> "aih" ++ fakeEnglish cs
|
||||
'g' :cs -> "ng" ++ fakeEnglish cs
|
||||
'i':'i':cs -> "ee" ++ fakeEnglish cs
|
||||
'k':'h':cs -> "k" ++ fakeEnglish cs
|
||||
'k' :cs -> "g" ++ fakeEnglish cs
|
||||
'O':'O':cs -> "or" ++ fakeEnglish cs
|
||||
'O' :cs -> "or" ++ fakeEnglish cs
|
||||
'o':'o':cs -> "or" ++ fakeEnglish cs
|
||||
'p':'h':cs -> "p" ++ fakeEnglish cs
|
||||
'p' :cs -> "b" ++ fakeEnglish cs
|
||||
't':'h':cs -> "t" ++ fakeEnglish cs
|
||||
't' :cs -> "d" ++ fakeEnglish cs
|
||||
'u':'u':cs -> "oo" ++ fakeEnglish cs
|
||||
'u' :cs -> "oo" ++ fakeEnglish cs
|
||||
'v':'v':cs -> "eu" ++ fakeEnglish cs
|
||||
'v' :cs -> "eu" ++ fakeEnglish cs
|
||||
'\228':'\228':cs -> "air" ++ fakeEnglish cs
|
||||
'\228' :cs -> "a" ++ fakeEnglish cs
|
||||
'\246':'\246':cs -> "er" ++ fakeEnglish cs
|
||||
'\246' :cs -> "er" ++ fakeEnglish cs
|
||||
c:cs | isTone c -> fakeEnglish cs
|
||||
c:cs -> c : fakeEnglish cs
|
||||
_ -> s
|
||||
where
|
||||
isTone = flip elem "'`^~"
|
||||
|
||||
|
||||
-- this works for one syllable
|
||||
|
||||
mkPronSyllable s = case fst $ pronAndOrth s of
|
||||
Just p -> p
|
||||
_ -> pronSyllable $ getSyllable $ map mkThaiChar $ unchar s
|
||||
|
||||
data Syllable = Syll {
|
||||
initv :: [Int],
|
||||
initc :: [Int],
|
||||
midv :: [Int],
|
||||
finalc :: [Int],
|
||||
finalv :: [Int],
|
||||
tone :: [Int],
|
||||
shorten :: Bool,
|
||||
kill :: Bool
|
||||
}
|
||||
deriving Show
|
||||
|
||||
data Tone = TMid | TLow | THigh | TRise | TFall
|
||||
deriving Show
|
||||
|
||||
data CClass = CLow | CMid | CHigh
|
||||
deriving Show
|
||||
|
||||
pronSyllable :: Syllable -> String
|
||||
pronSyllable s =
|
||||
initCons ++ tonem ++ vowel ++ finalCons
|
||||
where
|
||||
|
||||
vowel = case (initv s, midv s, finalv s, finalc s, shorten s, tone s) of
|
||||
([0x0e40],[0x0e35],_,[0x0e22],_,_) -> "ia" -- e-i:y
|
||||
([0x0e40],[0x0e2d,0x0e35],_,_,_,_) -> "va" -- e-i:O
|
||||
([0x0e40],[0x0e30,0x0e35],_,[0x0e22],_,_) -> "ia" -- e-i:ya.
|
||||
([0x0e40],[0x0e30,0x0e2d],_,_,_,_) -> "\246" -- e-Oa.
|
||||
([0x0e40],[0x0e30,0x0e32],_,_,_,_) -> "O" -- e-a:a. -- open o
|
||||
([0x0e40],[0x0e2d],_,_,_,_) -> "\246\246" -- e-O
|
||||
([0x0e40],[0x0e34],_,_,_,_) -> "\246\246" -- e-i
|
||||
([0x0e40],[0x0e30],_,_,_,_) -> "e" -- e-a.
|
||||
([0x0e40],[0x0e32],_,_,_,_) -> "aw" -- e-a:
|
||||
([0x0e40],[],[],[0x0e22],_,_) -> "\246\246y" -- e-y
|
||||
([0x0e40],[],[],_,True,_) -> "e"
|
||||
|
||||
([0x0e41],[0x0e30],_,_,_,_) -> "\228" -- ä-a.
|
||||
([0x0e41],[],[],_,True,_) -> "\228"
|
||||
|
||||
([0x0e42],[0x0e30],_,_,_,_) -> "o" -- o:-a.
|
||||
|
||||
([],[0x0e2d],_,[0x0e22],_,_) -> "OOy" -- Oy
|
||||
([],[0x0e2d],_,_,_,_) -> "OO" -- O
|
||||
|
||||
([],[],[],_,_,_) -> "o"
|
||||
|
||||
(i,m,f,_,_,_) -> concatMap pronThaiChar (reverse $ f ++ m ++ i) ----
|
||||
|
||||
initCons = concatMap pronThaiChar $ case (reverse $ initc s) of
|
||||
0x0e2b:cs@(_:_) -> cs -- high h
|
||||
0x0e2d:cs@(_:_) -> cs -- O
|
||||
cs -> cs
|
||||
|
||||
finalCons =
|
||||
let (c,cs) = splitAt 1 $ finalc s
|
||||
in
|
||||
case c of
|
||||
[] -> []
|
||||
[0x0e22] -> [] --- y
|
||||
[k] -> concatMap pronThaiChar (reverse cs) ++ finalThai k
|
||||
|
||||
iclass = case take 1 (reverse $ initc s) of
|
||||
[c] -> classThai c
|
||||
[] -> CMid -- O
|
||||
|
||||
isLong = not (shorten s) && case vowel of
|
||||
_:_:_ -> True ----
|
||||
_ -> False
|
||||
|
||||
isLive = case finalCons of
|
||||
c | elem c ["n","m","g"] -> True
|
||||
"" -> isLong
|
||||
_ -> False
|
||||
|
||||
tonem = case (iclass,isLive,isLong,tone s) of
|
||||
(_,_,_, [0x0e4a]) -> tHigh
|
||||
(_,_,_, [0x0e4b]) -> tRise
|
||||
(CLow,_,_,[0x0e49]) -> tRise
|
||||
(_,_,_, [0x0e49]) -> tFall
|
||||
(CLow,_,_,[0x0e48]) -> tFall
|
||||
(_, _,_,[0x0e48]) -> tLow
|
||||
(CHigh,True,_,_) -> tRise
|
||||
(_, True,_,_) -> tMid
|
||||
(CLow,False,False,_) -> tHigh
|
||||
(CLow,False,_,_) -> tFall
|
||||
_ -> tLow
|
||||
|
||||
(tMid,tHigh,tLow,tRise,tFall) = ("-","'","`","~","^")
|
||||
|
||||
isVowel c = 0x0e30 <= c && c <= 0x0e44 ----
|
||||
isCons c = 0x0e01 <= c && c <= 0x0e2f ----
|
||||
isTone c = 0x0e48 <= c && c <= 0x0e4b
|
||||
|
||||
getSyllable :: [Int] -> Syllable
|
||||
getSyllable = foldl get (Syll [] [] [] [] [] [] False False) where
|
||||
get syll c = case c of
|
||||
0x0e47 -> syll {shorten = True}
|
||||
0x0e4c -> syll {kill = True, finalc = tail (finalc syll)} --- always last
|
||||
0x0e2d
|
||||
| null (initc syll) -> syll {initc = [c]} -- "O"
|
||||
| otherwise -> syll {midv = c : midv syll}
|
||||
_
|
||||
| isVowel c -> if null (initc syll)
|
||||
then syll {initv = c : initv syll}
|
||||
else syll {midv = c : midv syll}
|
||||
| isCons c -> if null (initc syll) ||
|
||||
(null (midv syll) && isCluster (initc syll) c)
|
||||
then syll {initc = c : initc syll}
|
||||
else syll {finalc = c : finalc syll}
|
||||
| isTone c -> syll {tone = [c]}
|
||||
_ -> syll ---- check this
|
||||
|
||||
isCluster s c = length s == 1 && (c == 0x0e23 || s == [0x0e2b])
|
||||
|
||||
-- to test
|
||||
|
||||
test1 = testThai "k2wa:mrak"
|
||||
test2 = putStrLn $ thaiTable
|
||||
test3 = do
|
||||
writeFile "thai.txt" "Thai Character Coding in GF\nAR 2007\n"
|
||||
appendFile "thai.txt" thaiTable
|
||||
test4 = do
|
||||
writeFile "alphthai.txt" "Thai Characters by Pronunciation\nAR 2007\n"
|
||||
appendFile "alphthai.txt" thaiTableAlph
|
||||
|
||||
|
||||
testThai :: String -> IO ()
|
||||
testThai s = do
|
||||
putStrLn $ encodeUTF8 $ mkThai s
|
||||
putStrLn $ unwords $ map mkPronSyllable $ words s
|
||||
|
||||
testSyllable s =
|
||||
let y = getSyllable $ map mkThaiChar $ unchar s
|
||||
in
|
||||
putStrLn $ pronSyllable $ trace (show y) y
|
||||
|
||||
thaiFile :: FilePath -> Maybe FilePath -> IO ()
|
||||
thaiFile f mo = do
|
||||
s <- readFile f
|
||||
let put = maybe putStr writeFile mo
|
||||
put $ encodeUTF8 $ thaiStrings s
|
||||
|
||||
thaiPronFile :: FilePath -> Maybe FilePath -> IO ()
|
||||
thaiPronFile f mo = do
|
||||
s <- readFile f
|
||||
let put = maybe putStr writeFile mo
|
||||
put $ encodeUTF8 $ thaiPronStrings s
|
||||
|
||||
thaiFakeFile :: FilePath -> Maybe FilePath -> IO ()
|
||||
thaiFakeFile f mo = do
|
||||
s <- readFile f
|
||||
let put = maybe putStr writeFile mo
|
||||
put $ encodeUTF8 $ (convStrings mkThaiFake) s
|
||||
|
||||
finalThai c = maybe "" return (Map.lookup c thaiFinalMap)
|
||||
thaiFinalMap = Map.fromList $ zip allThaiCodes finals
|
||||
|
||||
classThai c = maybe CLow readClass (Map.lookup c thaiClassMap)
|
||||
thaiClassMap = Map.fromList $ zip allThaiCodes heights
|
||||
|
||||
readClass s = case s of
|
||||
'L' -> CLow
|
||||
'M' -> CMid
|
||||
'H' -> CHigh
|
||||
|
||||
|
||||
thaiTable :: String
|
||||
thaiTable = unlines $ ("\n|| hex | thai | trans | pron | fin | class |" ) : [
|
||||
"| " ++
|
||||
hex c ++ " | " ++
|
||||
encodeUTF8 (showThai s) ++ " | " ++
|
||||
s ++ " | " ++
|
||||
pronThai s ++ " | " ++
|
||||
[f] ++ " | " ++
|
||||
[q] ++ " | "
|
||||
|
|
||||
(c,q,f,s) <- zip4 allThaiCodes heights finals allThaiTrans
|
||||
]
|
||||
|
||||
thaiTableAlph :: String
|
||||
thaiTableAlph = unlines $ ("\n|| pron | thai | trans |" ) : [
|
||||
"| " ++ a ++
|
||||
" | " ++ unwords (map (encodeUTF8 . showThai) ss) ++
|
||||
" | " ++ unwords ss ++
|
||||
" |"
|
||||
|
|
||||
(a,ss) <- allProns
|
||||
]
|
||||
where
|
||||
prons = sort $ nub
|
||||
[p | s <- allThaiTrans, let p = pronThai s, not (null p),isAlpha (head p)]
|
||||
allProns =
|
||||
[(a,[s | s <- allThaiTrans, pronThai s == a]) | a <- prons]
|
||||
|
||||
showThai s = case s of
|
||||
"-" -> "-"
|
||||
--- v:_ | elem v "ivu" -> map (toEnum . mkThaiChar) ["O",s]
|
||||
_ -> [toEnum $ mkThaiChar s]
|
||||
|
||||
|
||||
pronThaiChar = pronThai . recodeThai
|
||||
|
||||
recodeThai c = allThaiTrans !! (c - 0x0e00)
|
||||
|
||||
pronThai s = case s of
|
||||
[c,p]
|
||||
| c == 'N' && isDigit p -> [p]
|
||||
| c == 'T' && isDigit p -> ['\'',p]
|
||||
| isDigit p -> c:"h"
|
||||
| p==':' -> c:[c]
|
||||
| elem p "%&" -> c:"y"
|
||||
| p=='+' -> c:"m"
|
||||
| s == "e'" -> "\228\228"
|
||||
| otherwise -> [c]
|
||||
"O" -> "O"
|
||||
"e" -> "ee"
|
||||
[c] | isUpper c -> ""
|
||||
_ -> s
|
||||
|
||||
hex = map hx . reverse . digs where
|
||||
digs 0 = [0]
|
||||
digs n = n `mod` 16 : digs (n `div` 16)
|
||||
hx d = "0123456789ABCDEF" !! d
|
||||
|
||||
heights :: String
|
||||
finals :: String
|
||||
heights =
|
||||
" MHHLLLLMHLLLLMMHLLLMMHLLLMMHHLLLLLL-L-LHHHHLML" ++ replicate 99 ' '
|
||||
finals =
|
||||
" kkkkkkgt-tt-ntttttntttttnpp--pppmyn-n-wttt-n--" ++ replicate 99 ' '
|
||||
@@ -1,48 +0,0 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : UTF8
|
||||
-- Maintainer : AR
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- > CVS $Date: 2005/04/21 16:23:42 $
|
||||
-- > CVS $Author: bringert $
|
||||
-- > CVS $Revision: 1.5 $
|
||||
--
|
||||
-- From the Char module supplied with HBC.
|
||||
-- code by Thomas Hallgren (Jul 10 1999)
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.UTF8 (decodeUTF8, encodeUTF8) where
|
||||
|
||||
-- | Take a Unicode string and encode it as a string
|
||||
-- with the UTF8 method.
|
||||
decodeUTF8 :: String -> String
|
||||
decodeUTF8 "" = ""
|
||||
decodeUTF8 (c:cs) | c < '\x80' = c : decodeUTF8 cs
|
||||
decodeUTF8 (c:c':cs) | '\xc0' <= c && c <= '\xdf' &&
|
||||
'\x80' <= c' && c' <= '\xbf' =
|
||||
toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : decodeUTF8 cs
|
||||
decodeUTF8 (c:c':c'':cs) | '\xe0' <= c && c <= '\xef' &&
|
||||
'\x80' <= c' && c' <= '\xbf' &&
|
||||
'\x80' <= c'' && c'' <= '\xbf' =
|
||||
toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : decodeUTF8 cs
|
||||
decodeUTF8 s = s ---- AR workaround 22/6/2006
|
||||
----decodeUTF8 _ = error "UniChar.decodeUTF8: bad data"
|
||||
|
||||
encodeUTF8 :: String -> String
|
||||
encodeUTF8 "" = ""
|
||||
encodeUTF8 (c:cs) =
|
||||
if c > '\x0000' && c < '\x0080' then
|
||||
c : encodeUTF8 cs
|
||||
else if c < toEnum 0x0800 then
|
||||
let i = fromEnum c
|
||||
in toEnum (0xc0 + i `div` 0x40) :
|
||||
toEnum (0x80 + i `mod` 0x40) :
|
||||
encodeUTF8 cs
|
||||
else
|
||||
let i = fromEnum c
|
||||
in toEnum (0xe0 + i `div` 0x1000) :
|
||||
toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) :
|
||||
toEnum (0x80 + i `mod` 0x40) :
|
||||
encodeUTF8 cs
|
||||
@@ -1,69 +0,0 @@
|
||||
----------------------------------------------------------------------
|
||||
-- |
|
||||
-- Module : Unicode
|
||||
-- Maintainer : AR
|
||||
-- Stability : (stable)
|
||||
-- Portability : (portable)
|
||||
--
|
||||
-- > CVS $Date: 2005/04/21 16:23:42 $
|
||||
-- > CVS $Author: bringert $
|
||||
-- > CVS $Revision: 1.12 $
|
||||
--
|
||||
-- ad hoc Unicode conversions from different alphabets.
|
||||
-- AR 12\/4\/2000, 18\/9\/2001, 30\/5\/2002, 26\/1\/2004
|
||||
-----------------------------------------------------------------------------
|
||||
|
||||
module GF.Text.Unicode (mkUnicode, treat) where
|
||||
|
||||
import GF.Text.Greek (mkGreek)
|
||||
import GF.Text.Arabic (mkArabic)
|
||||
import GF.Text.Hebrew (mkHebrew)
|
||||
import GF.Text.Russian (mkRussian, mkRusKOI8)
|
||||
import GF.Text.Ethiopic (mkEthiopic)
|
||||
import GF.Text.Tamil (mkTamil)
|
||||
import GF.Text.OCSCyrillic (mkOCSCyrillic)
|
||||
import GF.Text.LatinASupplement (mkLatinASupplement)
|
||||
import GF.Text.Devanagari (mkDevanagari)
|
||||
import GF.Text.Hiragana (mkJapanese)
|
||||
import GF.Text.ExtendedArabic (mkArabic0600)
|
||||
import GF.Text.ExtendedArabic (mkExtendedArabic)
|
||||
import GF.Text.ExtraDiacritics (mkExtraDiacritics)
|
||||
|
||||
import Data.Char
|
||||
|
||||
mkUnicode :: String -> String
|
||||
mkUnicode s = case s of
|
||||
'/':'/':cs -> treat [] mkGreek unic ++ mkUnicode rest
|
||||
'/':'+':cs -> mkHebrew unic ++ mkUnicode rest
|
||||
'/':'-':cs -> mkArabic unic ++ mkUnicode rest
|
||||
'/':'_':cs -> treat [] mkRussian unic ++ mkUnicode rest
|
||||
'/':'*':cs -> mkRusKOI8 unic ++ mkUnicode rest
|
||||
'/':'E':cs -> mkEthiopic unic ++ mkUnicode rest
|
||||
'/':'T':cs -> mkTamil unic ++ mkUnicode rest
|
||||
'/':'C':cs -> mkOCSCyrillic unic ++ mkUnicode rest
|
||||
'/':'&':cs -> mkDevanagari unic ++ mkUnicode rest
|
||||
'/':'L':cs -> mkLatinASupplement unic ++ mkUnicode rest
|
||||
'/':'J':cs -> mkJapanese unic ++ mkUnicode rest
|
||||
'/':'6':cs -> mkArabic0600 unic ++ mkUnicode rest
|
||||
'/':'A':cs -> mkExtendedArabic unic ++ mkUnicode rest
|
||||
'/':'X':cs -> mkExtraDiacritics unic ++ mkUnicode rest
|
||||
c:cs -> c:mkUnicode cs
|
||||
_ -> s
|
||||
where
|
||||
(unic,rest) = remClosing [] $ dropWhile isSpace $ drop 2 s
|
||||
remClosing u s = case s of
|
||||
c:'/':s | elem c "/+-_*ETC&LJ6AX" -> (reverse u, s) --- end need not match
|
||||
c:cs -> remClosing (c:u) cs
|
||||
_ -> (reverse u,[]) -- forgiving missing end
|
||||
|
||||
-- | don't convert XML tags --- assumes \<\> always means XML tags
|
||||
treat :: String -> (String -> String) -> String -> String
|
||||
treat old mk s = case s of
|
||||
'<':cs -> mk (reverse old) ++ '<':noTreat cs
|
||||
c:cs -> treat (c:old) mk cs
|
||||
_ -> mk (reverse old)
|
||||
where
|
||||
noTreat s = case s of
|
||||
'>':cs -> '>' : treat [] mk cs
|
||||
c:cs -> c : noTreat cs
|
||||
_ -> s
|
||||
Reference in New Issue
Block a user