Founding the newly structured GF2.0 cvs archive.

This commit is contained in:
aarne
2003-09-22 13:16:55 +00:00
commit b1402e8bd6
162 changed files with 25569 additions and 0 deletions

48
src/GF/Text/Arabic.hs Normal file
View File

@@ -0,0 +1,48 @@
module Arabic where
mkArabic :: String -> String
mkArabic = reverse . unwords . (map mkArabicWord) . words
--- reverse : assumes everything's on same line
type ArabicChar = Char
mkArabicWord :: String -> [ArabicChar]
mkArabicWord = map mkArabicChar . getLetterPos
getLetterPos :: String -> [(Char,Int)]
getLetterPos [] = []
getLetterPos ('I':cs) = ('*',7) : getLetterPos cs -- 0xfe80
getLetterPos ('O':cs) = ('*',8) : getIn cs -- 0xfe8b
getLetterPos ('l':'a':cs) = ('*',5) : getLetterPos cs -- 0xfefb
getLetterPos [c] = [(c,1)] -- 1=isolated
getLetterPos (c:cs) | isReduced c = (c,1) : getLetterPos cs
getLetterPos (c:cs) = (c,3) : getIn cs -- 3=initial
getIn [] = []
getIn ('I':cs) = ('*',7) : getLetterPos cs -- 0xfe80
getIn ('O':cs) = ('*',9) : getIn cs -- 0xfe8c
getIn ('l':'a':cs) = ('*',6) : getLetterPos cs -- 0xfefc
getIn [c] = [(c,2)] -- 2=final
getIn (c:cs) | isReduced c = (c,2) : getLetterPos cs
getIn (c:cs) = (c,4) : getIn cs -- 4=medial
isReduced :: Char -> Bool
isReduced c = c `elem` "UuWiYOaAdVrzwj"
mkArabicChar ('*',p) | p > 4 && p < 10 =
(map toEnum [0xfefb,0xfefc,0xfe80,0xfe8b,0xfe8c]) !! (p-5)
mkArabicChar cp@(c,p) = case lookup c cc of Just c' -> (c' !! (p-1)) ; _ -> c
where
cc = mkArabicTab allArabicCodes allArabic
mkArabicTab (c:cs) as = (c,as1) : mkArabicTab cs as2 where
(as1,as2) = if isReduced c then splitAt 2 as else splitAt 4 as
mkArabicTab [] _ = []
allArabicCodes = "UuWiYOabAtvgHCdVrzscSDTZoxfqklmnhwjy"
allArabic :: String
allArabic = (map toEnum [0xfe81 .. 0xfef4]) -- I=0xfe80

158
src/GF/Text/Greek.hs Normal file
View File

@@ -0,0 +1,158 @@
module Greek where
mkGreek :: String -> String
mkGreek = unwords . (map mkGreekWord) . mkGravis . words
--- TODO : optimize character formation by factorizing the case expressions
type GreekChar = Char
mkGreekWord :: String -> [GreekChar]
mkGreekWord = map (toEnum . mkGreekChar) . mkGreekSpec
mkGravis :: [String] -> [String]
mkGravis [] = []
mkGravis [w] = [w]
mkGravis (w1:w2:ws)
| stressed w2 = mkG w1 : mkGravis (w2:ws)
| otherwise = w1 : w2 : mkGravis ws
where
stressed w = any (`elem` "'~`") w
mkG :: String -> String
mkG w = let (w1,w2) = span (/='\'') w in
case w2 of
'\'':v:cs | not (any isVowel cs) -> w1 ++ "`" ++ [v] ++ cs
'\'':'!':v:cs | not (any isVowel cs) -> w1 ++ "`!" ++ [v] ++ cs
_ -> w
isVowel c = elem c "aehiouw"
mkGreekSpec :: String -> [(Char,Int)]
mkGreekSpec str = case str of
[] -> []
'(' :'\'': '!' : c : cs -> (c,25) : mkGreekSpec cs
'(' :'~' : '!' : c : cs -> (c,27) : mkGreekSpec cs
'(' :'`' : '!' : c : cs -> (c,23) : mkGreekSpec cs
'(' : '!' : c : cs -> (c,21) : mkGreekSpec cs
')' :'\'': '!' : c : cs -> (c,24) : mkGreekSpec cs
')' :'~' : '!' : c : cs -> (c,26) : mkGreekSpec cs
')' :'`' : '!' : c : cs -> (c,22) : mkGreekSpec cs
')' : '!' : c : cs -> (c,20) : mkGreekSpec cs
'\'': '!' : c : cs -> (c,30) : mkGreekSpec cs
'~' : '!' : c : cs -> (c,31) : mkGreekSpec cs
'`' : '!' : c : cs -> (c,32) : mkGreekSpec cs
'!' : c : cs -> (c,33) : mkGreekSpec cs
'(' :'\'': c : cs -> (c,5) : mkGreekSpec cs
'(' :'~' : c : cs -> (c,7) : mkGreekSpec cs
'(' :'`' : c : cs -> (c,3) : mkGreekSpec cs
'(' : c : cs -> (c,1) : mkGreekSpec cs
')' :'\'': c : cs -> (c,4) : mkGreekSpec cs
')' :'~' : c : cs -> (c,6) : mkGreekSpec cs
')' :'`' : c : cs -> (c,2) : mkGreekSpec cs
')' : c : cs -> (c,0) : mkGreekSpec cs
'\'': c : cs -> (c,10) : mkGreekSpec cs
'~' : c : cs -> (c,11) : mkGreekSpec cs
'`' : c : cs -> (c,12) : mkGreekSpec cs
c : cs -> (c,-1) : mkGreekSpec cs
mkGreekChar (c,-1) = case lookup c cc of Just c' -> c' ; _ -> fromEnum c
where
cc = zip "abgdezhqiklmnxoprjstyfcuw" allGreekMin
mkGreekChar (c,n) = case (c,n) of
('a',10) -> 0x03ac
('a',11) -> 0x1fb6
('a',12) -> 0x1f70
('a',30) -> 0x1fb4
('a',31) -> 0x1fb7
('a',32) -> 0x1fb2
('a',33) -> 0x1fb3
('a',n) | n >19 -> 0x1f80 + n - 20
('a',n) -> 0x1f00 + n
('e',10) -> 0x03ad -- '
-- ('e',11) -> 0x1fb6 -- ~ can't happen
('e',12) -> 0x1f72 -- `
('e',n) -> 0x1f10 + n
('h',10) -> 0x03ae -- '
('h',11) -> 0x1fc6 -- ~
('h',12) -> 0x1f74 -- `
('h',30) -> 0x1fc4
('h',31) -> 0x1fc7
('h',32) -> 0x1fc2
('h',33) -> 0x1fc3
('h',n) | n >19 -> 0x1f90 + n - 20
('h',n) -> 0x1f20 + n
('i',10) -> 0x03af -- '
('i',11) -> 0x1fd6 -- ~
('i',12) -> 0x1f76 -- `
('i',n) -> 0x1f30 + n
('o',10) -> 0x03cc -- '
-- ('o',11) -> 0x1fb6 -- ~ can't happen
('o',12) -> 0x1f78 -- `
('o',n) -> 0x1f40 + n
('y',10) -> 0x03cd -- '
('y',11) -> 0x1fe6 -- ~
('y',12) -> 0x1f7a -- `
('y',n) -> 0x1f50 + n
('w',10) -> 0x03ce -- '
('w',11) -> 0x1ff6 -- ~
('w',12) -> 0x1f7c -- `
('w',30) -> 0x1ff4
('w',31) -> 0x1ff7
('w',32) -> 0x1ff2
('w',33) -> 0x1ff3
('w',n) | n >19 -> 0x1fa0 + n - 20
('w',n) -> 0x1f60 + n
('r',1) -> 0x1fe5
_ -> mkGreekChar (c,-1) --- should not happen
allGreekMin :: [Int]
allGreekMin = [0x03b1 .. 0x03c9]
{-
encoding of Greek writing. Those hard to guess are marked with ---
maj min
A a Alpha 0391 03b1
B b Beta 0392 03b2
G g Gamma 0393 03b3
D d Delta 0394 03b4
E e Epsilon 0395 03b5
Z z Zeta 0396 03b6
H h Eta --- 0397 03b7
Q q Theta --- 0398 03b8
I i Iota 0399 03b9
K k Kappa 039a 03ba
L l Lambda 039b 03bb
M m My 039c 03bc
N n Ny 039d 03bd
X x Xi 039e 03be
O o Omikron 039f 03bf
P p Pi 03a0 03c0
R r Rho 03a1 03c1
j Sigma --- 03c2
S s Sigma 03a3 03c3
T t Tau 03a4 03c4
Y y Ypsilon 03a5 03c5
F f Phi 03a6 03c6
C c Khi --- 03a7 03c7
U u Psi 03a8 03c8
W w Omega --- 03a9 03c9
( spiritus asper
) spiritus lenis
! iota subscriptum
' acutus
~ circumflexus
` gravis
-}

21
src/GF/Text/Hebrew.hs Normal file
View File

@@ -0,0 +1,21 @@
module Hebrew where
mkHebrew :: String -> String
mkHebrew = reverse . unwords . (map mkHebrewWord) . words
--- reverse : assumes everything's on same line
type HebrewChar = Char
mkHebrewWord :: String -> [HebrewChar]
mkHebrewWord = map mkHebrewChar
mkHebrewChar c = case lookup c cc of Just c' -> c' ; _ -> c
where
cc = zip allHebrewCodes allHebrew
allHebrewCodes = "-abgdhwzHTyKklMmNnSoPpCcqrst"
allHebrew :: String
allHebrew = (map toEnum (0x05be : [0x05d0 .. 0x05ea]))

31
src/GF/Text/Russian.hs Normal file
View File

@@ -0,0 +1,31 @@
module Russian where
-- an ad hoc ASCII encoding. Delimiters: /_ _/
mkRussian :: String -> String
mkRussian = unwords . (map mkRussianWord) . words
-- the KOI8 encoding, incomplete. Delimiters: /* */
mkRusKOI8 :: String -> String
mkRusKOI8 = unwords . (map mkRussianKOI8) . words
type RussianChar = Char
mkRussianWord :: String -> [RussianChar]
mkRussianWord = map (mkRussianChar allRussianCodes)
mkRussianKOI8 :: String -> [RussianChar]
mkRussianKOI8 = map (mkRussianChar allRussianKOI8)
mkRussianChar chars c = case lookup c cc of Just c' -> c' ; _ -> c
where
cc = zip chars allRussian
allRussianCodes =
"ÅåABVGDEXZIJKLMNOPRSTUFHCQW£}!*ÖYÄabvgdexzijklmnoprstufhcqw#01'öyä"
allRussianKOI8 =
"^@áâ÷çäåöúéêëìíîïðòóôõæèãþûýøùÿüàñÁÂ×ÇÄÅÖÚÉÊËÌÍÎÏÐÒÓÔÕÆÈÃÞÛÝØÙßÜÀÑ"
allRussian :: String
allRussian = (map toEnum (0x0401:0x0451:[0x0410 .. 0x044f])) -- Ëë in odd places

56
src/GF/Text/Text.hs Normal file
View File

@@ -0,0 +1,56 @@
module Text where
import Operations
import Char
-- elementary text postprocessing. AR 21/11/2001
-- This is very primitive indeed. The functions should work on
-- token lists and not on strings. AR 5/12/2002
formatAsTextLit :: String -> String
formatAsTextLit = formatAsText . unwords . map unStringLit . words
--- hope that there will be deforestation...
formatAsCodeLit :: String -> String
formatAsCodeLit = formatAsCode . unwords . map unStringLit . words
formatAsText :: String -> String
formatAsText = unwords . format . cap . words where
format ws = case ws of
w : c : ww | major c -> (w ++ c) : format (cap ww)
w : c : ww | minor c -> (w ++ c) : format ww
c : ww | para c -> "\n\n" : format ww
w : ww -> w : format ww
[] -> []
cap (p:(c:cs):ww) | para p = p : (toUpper c : cs) : ww
cap ((c:cs):ww) = (toUpper c : cs) : ww
cap [] = []
major = flip elem (map singleton ".!?")
minor = flip elem (map singleton ",:;")
para = (=="<p>")
formatAsCode :: String -> String
formatAsCode = unwords . format . words where
format ws = case ws of
p : w : ww | parB p -> format ((p ++ w') : ww') where (w':ww') = format (w:ww)
w : p : ww | par p -> format ((w ++ p') : ww') where (p':ww') = format (p:ww)
w : ww -> w : format ww
[] -> []
parB = flip elem (map singleton "([{")
parE = flip elem (map singleton "}])")
par t = parB t || parE t
performBinds :: String -> String
performBinds = unwords . format . words where
format ws = case ws of
w : "&+" : u : ws -> format ((w ++ u) : ws)
w : ws -> w : format ws
[] -> []
unStringLit :: String -> String
unStringLit s = case s of
c : cs | strlim c && strlim (last cs) -> init cs
_ -> s
where
strlim = (=='\'')

35
src/GF/Text/UTF8.hs Normal file
View File

@@ -0,0 +1,35 @@
module UTF8 where
-- From the Char module supplied with HBC.
-- code by Thomas Hallgren (Jul 10 1999)
-- Take a Unicode string and encode it as a string
-- with the UTF8 method.
decodeUTF8 :: String -> String
decodeUTF8 "" = ""
decodeUTF8 (c:cs) | c < '\x80' = c : decodeUTF8 cs
decodeUTF8 (c:c':cs) | '\xc0' <= c && c <= '\xdf' &&
'\x80' <= c' && c' <= '\xbf' =
toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : decodeUTF8 cs
decodeUTF8 (c:c':c'':cs) | '\xe0' <= c && c <= '\xef' &&
'\x80' <= c' && c' <= '\xbf' &&
'\x80' <= c'' && c'' <= '\xbf' =
toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : decodeUTF8 cs
decodeUTF8 _ = error "UniChar.decodeUTF8: bad data"
encodeUTF8 :: String -> String
encodeUTF8 "" = ""
encodeUTF8 (c:cs) =
if c > '\x0000' && c < '\x0080' then
c : encodeUTF8 cs
else if c < toEnum 0x0800 then
let i = fromEnum c
in toEnum (0xc0 + i `div` 0x40) :
toEnum (0x80 + i `mod` 0x40) :
encodeUTF8 cs
else
let i = fromEnum c
in toEnum (0xe0 + i `div` 0x1000) :
toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) :
toEnum (0x80 + i `mod` 0x40) :
encodeUTF8 cs

24
src/GF/Text/Unicode.hs Normal file
View File

@@ -0,0 +1,24 @@
module Unicode where
import Greek (mkGreek)
import Arabic (mkArabic)
import Hebrew (mkHebrew)
import Russian (mkRussian, mkRusKOI8)
-- ad hoc Unicode conversions from different alphabets
-- AR 12/4/2000, 18/9/2001, 30/5/2002
mkUnicode s = case s of
'/':'/':cs -> mkGreek (remClosing cs)
'/':'+':cs -> mkHebrew (remClosing cs)
'/':'-':cs -> mkArabic (remClosing cs)
'/':'_':cs -> mkRussian (remClosing cs)
'/':'*':cs -> mkRusKOI8 (remClosing cs)
_ -> s
remClosing cs
| lcs > 1 && last cs == '/' = take (lcs-2) cs
| otherwise = cs
where lcs = length cs