mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-28 22:12:51 -06:00
Founding the newly structured GF2.0 cvs archive.
This commit is contained in:
48
src/GF/Text/Arabic.hs
Normal file
48
src/GF/Text/Arabic.hs
Normal file
@@ -0,0 +1,48 @@
|
||||
module Arabic where
|
||||
|
||||
mkArabic :: String -> String
|
||||
mkArabic = reverse . unwords . (map mkArabicWord) . words
|
||||
--- reverse : assumes everything's on same line
|
||||
|
||||
type ArabicChar = Char
|
||||
|
||||
mkArabicWord :: String -> [ArabicChar]
|
||||
mkArabicWord = map mkArabicChar . getLetterPos
|
||||
|
||||
getLetterPos :: String -> [(Char,Int)]
|
||||
getLetterPos [] = []
|
||||
getLetterPos ('I':cs) = ('*',7) : getLetterPos cs -- 0xfe80
|
||||
getLetterPos ('O':cs) = ('*',8) : getIn cs -- 0xfe8b
|
||||
getLetterPos ('l':'a':cs) = ('*',5) : getLetterPos cs -- 0xfefb
|
||||
getLetterPos [c] = [(c,1)] -- 1=isolated
|
||||
getLetterPos (c:cs) | isReduced c = (c,1) : getLetterPos cs
|
||||
getLetterPos (c:cs) = (c,3) : getIn cs -- 3=initial
|
||||
|
||||
|
||||
getIn [] = []
|
||||
getIn ('I':cs) = ('*',7) : getLetterPos cs -- 0xfe80
|
||||
getIn ('O':cs) = ('*',9) : getIn cs -- 0xfe8c
|
||||
getIn ('l':'a':cs) = ('*',6) : getLetterPos cs -- 0xfefc
|
||||
getIn [c] = [(c,2)] -- 2=final
|
||||
getIn (c:cs) | isReduced c = (c,2) : getLetterPos cs
|
||||
getIn (c:cs) = (c,4) : getIn cs -- 4=medial
|
||||
|
||||
isReduced :: Char -> Bool
|
||||
isReduced c = c `elem` "UuWiYOaAdVrzwj"
|
||||
|
||||
mkArabicChar ('*',p) | p > 4 && p < 10 =
|
||||
(map toEnum [0xfefb,0xfefc,0xfe80,0xfe8b,0xfe8c]) !! (p-5)
|
||||
mkArabicChar cp@(c,p) = case lookup c cc of Just c' -> (c' !! (p-1)) ; _ -> c
|
||||
where
|
||||
cc = mkArabicTab allArabicCodes allArabic
|
||||
|
||||
mkArabicTab (c:cs) as = (c,as1) : mkArabicTab cs as2 where
|
||||
(as1,as2) = if isReduced c then splitAt 2 as else splitAt 4 as
|
||||
mkArabicTab [] _ = []
|
||||
|
||||
allArabicCodes = "UuWiYOabAtvgHCdVrzscSDTZoxfqklmnhwjy"
|
||||
|
||||
allArabic :: String
|
||||
allArabic = (map toEnum [0xfe81 .. 0xfef4]) -- I=0xfe80
|
||||
|
||||
|
||||
158
src/GF/Text/Greek.hs
Normal file
158
src/GF/Text/Greek.hs
Normal file
@@ -0,0 +1,158 @@
|
||||
module Greek where
|
||||
|
||||
mkGreek :: String -> String
|
||||
mkGreek = unwords . (map mkGreekWord) . mkGravis . words
|
||||
|
||||
--- TODO : optimize character formation by factorizing the case expressions
|
||||
|
||||
type GreekChar = Char
|
||||
|
||||
mkGreekWord :: String -> [GreekChar]
|
||||
mkGreekWord = map (toEnum . mkGreekChar) . mkGreekSpec
|
||||
|
||||
mkGravis :: [String] -> [String]
|
||||
mkGravis [] = []
|
||||
mkGravis [w] = [w]
|
||||
mkGravis (w1:w2:ws)
|
||||
| stressed w2 = mkG w1 : mkGravis (w2:ws)
|
||||
| otherwise = w1 : w2 : mkGravis ws
|
||||
where
|
||||
stressed w = any (`elem` "'~`") w
|
||||
mkG :: String -> String
|
||||
mkG w = let (w1,w2) = span (/='\'') w in
|
||||
case w2 of
|
||||
'\'':v:cs | not (any isVowel cs) -> w1 ++ "`" ++ [v] ++ cs
|
||||
'\'':'!':v:cs | not (any isVowel cs) -> w1 ++ "`!" ++ [v] ++ cs
|
||||
_ -> w
|
||||
isVowel c = elem c "aehiouw"
|
||||
|
||||
mkGreekSpec :: String -> [(Char,Int)]
|
||||
mkGreekSpec str = case str of
|
||||
[] -> []
|
||||
'(' :'\'': '!' : c : cs -> (c,25) : mkGreekSpec cs
|
||||
'(' :'~' : '!' : c : cs -> (c,27) : mkGreekSpec cs
|
||||
'(' :'`' : '!' : c : cs -> (c,23) : mkGreekSpec cs
|
||||
'(' : '!' : c : cs -> (c,21) : mkGreekSpec cs
|
||||
')' :'\'': '!' : c : cs -> (c,24) : mkGreekSpec cs
|
||||
')' :'~' : '!' : c : cs -> (c,26) : mkGreekSpec cs
|
||||
')' :'`' : '!' : c : cs -> (c,22) : mkGreekSpec cs
|
||||
')' : '!' : c : cs -> (c,20) : mkGreekSpec cs
|
||||
'\'': '!' : c : cs -> (c,30) : mkGreekSpec cs
|
||||
'~' : '!' : c : cs -> (c,31) : mkGreekSpec cs
|
||||
'`' : '!' : c : cs -> (c,32) : mkGreekSpec cs
|
||||
'!' : c : cs -> (c,33) : mkGreekSpec cs
|
||||
'(' :'\'': c : cs -> (c,5) : mkGreekSpec cs
|
||||
'(' :'~' : c : cs -> (c,7) : mkGreekSpec cs
|
||||
'(' :'`' : c : cs -> (c,3) : mkGreekSpec cs
|
||||
'(' : c : cs -> (c,1) : mkGreekSpec cs
|
||||
')' :'\'': c : cs -> (c,4) : mkGreekSpec cs
|
||||
')' :'~' : c : cs -> (c,6) : mkGreekSpec cs
|
||||
')' :'`' : c : cs -> (c,2) : mkGreekSpec cs
|
||||
')' : c : cs -> (c,0) : mkGreekSpec cs
|
||||
'\'': c : cs -> (c,10) : mkGreekSpec cs
|
||||
'~' : c : cs -> (c,11) : mkGreekSpec cs
|
||||
'`' : c : cs -> (c,12) : mkGreekSpec cs
|
||||
c : cs -> (c,-1) : mkGreekSpec cs
|
||||
|
||||
mkGreekChar (c,-1) = case lookup c cc of Just c' -> c' ; _ -> fromEnum c
|
||||
where
|
||||
cc = zip "abgdezhqiklmnxoprjstyfcuw" allGreekMin
|
||||
mkGreekChar (c,n) = case (c,n) of
|
||||
('a',10) -> 0x03ac
|
||||
('a',11) -> 0x1fb6
|
||||
('a',12) -> 0x1f70
|
||||
('a',30) -> 0x1fb4
|
||||
('a',31) -> 0x1fb7
|
||||
('a',32) -> 0x1fb2
|
||||
('a',33) -> 0x1fb3
|
||||
('a',n) | n >19 -> 0x1f80 + n - 20
|
||||
('a',n) -> 0x1f00 + n
|
||||
('e',10) -> 0x03ad -- '
|
||||
-- ('e',11) -> 0x1fb6 -- ~ can't happen
|
||||
('e',12) -> 0x1f72 -- `
|
||||
('e',n) -> 0x1f10 + n
|
||||
('h',10) -> 0x03ae -- '
|
||||
('h',11) -> 0x1fc6 -- ~
|
||||
('h',12) -> 0x1f74 -- `
|
||||
|
||||
('h',30) -> 0x1fc4
|
||||
('h',31) -> 0x1fc7
|
||||
('h',32) -> 0x1fc2
|
||||
('h',33) -> 0x1fc3
|
||||
('h',n) | n >19 -> 0x1f90 + n - 20
|
||||
|
||||
('h',n) -> 0x1f20 + n
|
||||
('i',10) -> 0x03af -- '
|
||||
('i',11) -> 0x1fd6 -- ~
|
||||
('i',12) -> 0x1f76 -- `
|
||||
('i',n) -> 0x1f30 + n
|
||||
('o',10) -> 0x03cc -- '
|
||||
-- ('o',11) -> 0x1fb6 -- ~ can't happen
|
||||
('o',12) -> 0x1f78 -- `
|
||||
('o',n) -> 0x1f40 + n
|
||||
('y',10) -> 0x03cd -- '
|
||||
('y',11) -> 0x1fe6 -- ~
|
||||
('y',12) -> 0x1f7a -- `
|
||||
('y',n) -> 0x1f50 + n
|
||||
('w',10) -> 0x03ce -- '
|
||||
('w',11) -> 0x1ff6 -- ~
|
||||
('w',12) -> 0x1f7c -- `
|
||||
|
||||
('w',30) -> 0x1ff4
|
||||
('w',31) -> 0x1ff7
|
||||
('w',32) -> 0x1ff2
|
||||
('w',33) -> 0x1ff3
|
||||
('w',n) | n >19 -> 0x1fa0 + n - 20
|
||||
|
||||
('w',n) -> 0x1f60 + n
|
||||
('r',1) -> 0x1fe5
|
||||
_ -> mkGreekChar (c,-1) --- should not happen
|
||||
|
||||
allGreekMin :: [Int]
|
||||
allGreekMin = [0x03b1 .. 0x03c9]
|
||||
|
||||
|
||||
{-
|
||||
encoding of Greek writing. Those hard to guess are marked with ---
|
||||
|
||||
maj min
|
||||
A a Alpha 0391 03b1
|
||||
B b Beta 0392 03b2
|
||||
G g Gamma 0393 03b3
|
||||
D d Delta 0394 03b4
|
||||
E e Epsilon 0395 03b5
|
||||
Z z Zeta 0396 03b6
|
||||
H h Eta --- 0397 03b7
|
||||
Q q Theta --- 0398 03b8
|
||||
I i Iota 0399 03b9
|
||||
K k Kappa 039a 03ba
|
||||
L l Lambda 039b 03bb
|
||||
M m My 039c 03bc
|
||||
N n Ny 039d 03bd
|
||||
X x Xi 039e 03be
|
||||
O o Omikron 039f 03bf
|
||||
P p Pi 03a0 03c0
|
||||
R r Rho 03a1 03c1
|
||||
j Sigma --- 03c2
|
||||
S s Sigma 03a3 03c3
|
||||
T t Tau 03a4 03c4
|
||||
Y y Ypsilon 03a5 03c5
|
||||
F f Phi 03a6 03c6
|
||||
C c Khi --- 03a7 03c7
|
||||
U u Psi 03a8 03c8
|
||||
W w Omega --- 03a9 03c9
|
||||
|
||||
( spiritus asper
|
||||
) spiritus lenis
|
||||
! iota subscriptum
|
||||
|
||||
' acutus
|
||||
~ circumflexus
|
||||
` gravis
|
||||
|
||||
-}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
21
src/GF/Text/Hebrew.hs
Normal file
21
src/GF/Text/Hebrew.hs
Normal file
@@ -0,0 +1,21 @@
|
||||
module Hebrew where
|
||||
|
||||
mkHebrew :: String -> String
|
||||
mkHebrew = reverse . unwords . (map mkHebrewWord) . words
|
||||
--- reverse : assumes everything's on same line
|
||||
|
||||
type HebrewChar = Char
|
||||
|
||||
mkHebrewWord :: String -> [HebrewChar]
|
||||
mkHebrewWord = map mkHebrewChar
|
||||
|
||||
mkHebrewChar c = case lookup c cc of Just c' -> c' ; _ -> c
|
||||
where
|
||||
cc = zip allHebrewCodes allHebrew
|
||||
|
||||
allHebrewCodes = "-abgdhwzHTyKklMmNnSoPpCcqrst"
|
||||
|
||||
allHebrew :: String
|
||||
allHebrew = (map toEnum (0x05be : [0x05d0 .. 0x05ea]))
|
||||
|
||||
|
||||
31
src/GF/Text/Russian.hs
Normal file
31
src/GF/Text/Russian.hs
Normal file
@@ -0,0 +1,31 @@
|
||||
module Russian where
|
||||
|
||||
-- an ad hoc ASCII encoding. Delimiters: /_ _/
|
||||
mkRussian :: String -> String
|
||||
mkRussian = unwords . (map mkRussianWord) . words
|
||||
|
||||
-- the KOI8 encoding, incomplete. Delimiters: /* */
|
||||
mkRusKOI8 :: String -> String
|
||||
mkRusKOI8 = unwords . (map mkRussianKOI8) . words
|
||||
|
||||
type RussianChar = Char
|
||||
|
||||
mkRussianWord :: String -> [RussianChar]
|
||||
mkRussianWord = map (mkRussianChar allRussianCodes)
|
||||
|
||||
mkRussianKOI8 :: String -> [RussianChar]
|
||||
mkRussianKOI8 = map (mkRussianChar allRussianKOI8)
|
||||
|
||||
mkRussianChar chars c = case lookup c cc of Just c' -> c' ; _ -> c
|
||||
where
|
||||
cc = zip chars allRussian
|
||||
|
||||
allRussianCodes =
|
||||
"ÅåABVGDEXZIJKLMNOPRSTUFHCQW£}!*ÖYÄabvgdexzijklmnoprstufhcqw#01'öyä"
|
||||
allRussianKOI8 =
|
||||
"^@áâ÷çäåöúéêëìíîïðòóôõæèãþûýøùÿüàñÁÂ×ÇÄÅÖÚÉÊËÌÍÎÏÐÒÓÔÕÆÈÃÞÛÝØÙßÜÀÑ"
|
||||
|
||||
allRussian :: String
|
||||
allRussian = (map toEnum (0x0401:0x0451:[0x0410 .. 0x044f])) -- Ëë in odd places
|
||||
|
||||
|
||||
56
src/GF/Text/Text.hs
Normal file
56
src/GF/Text/Text.hs
Normal file
@@ -0,0 +1,56 @@
|
||||
module Text where
|
||||
|
||||
import Operations
|
||||
import Char
|
||||
|
||||
-- elementary text postprocessing. AR 21/11/2001
|
||||
-- This is very primitive indeed. The functions should work on
|
||||
-- token lists and not on strings. AR 5/12/2002
|
||||
|
||||
|
||||
formatAsTextLit :: String -> String
|
||||
formatAsTextLit = formatAsText . unwords . map unStringLit . words
|
||||
--- hope that there will be deforestation...
|
||||
|
||||
formatAsCodeLit :: String -> String
|
||||
formatAsCodeLit = formatAsCode . unwords . map unStringLit . words
|
||||
|
||||
formatAsText :: String -> String
|
||||
formatAsText = unwords . format . cap . words where
|
||||
format ws = case ws of
|
||||
w : c : ww | major c -> (w ++ c) : format (cap ww)
|
||||
w : c : ww | minor c -> (w ++ c) : format ww
|
||||
c : ww | para c -> "\n\n" : format ww
|
||||
w : ww -> w : format ww
|
||||
[] -> []
|
||||
cap (p:(c:cs):ww) | para p = p : (toUpper c : cs) : ww
|
||||
cap ((c:cs):ww) = (toUpper c : cs) : ww
|
||||
cap [] = []
|
||||
major = flip elem (map singleton ".!?")
|
||||
minor = flip elem (map singleton ",:;")
|
||||
para = (=="<p>")
|
||||
|
||||
formatAsCode :: String -> String
|
||||
formatAsCode = unwords . format . words where
|
||||
format ws = case ws of
|
||||
p : w : ww | parB p -> format ((p ++ w') : ww') where (w':ww') = format (w:ww)
|
||||
w : p : ww | par p -> format ((w ++ p') : ww') where (p':ww') = format (p:ww)
|
||||
w : ww -> w : format ww
|
||||
[] -> []
|
||||
parB = flip elem (map singleton "([{")
|
||||
parE = flip elem (map singleton "}])")
|
||||
par t = parB t || parE t
|
||||
|
||||
performBinds :: String -> String
|
||||
performBinds = unwords . format . words where
|
||||
format ws = case ws of
|
||||
w : "&+" : u : ws -> format ((w ++ u) : ws)
|
||||
w : ws -> w : format ws
|
||||
[] -> []
|
||||
|
||||
unStringLit :: String -> String
|
||||
unStringLit s = case s of
|
||||
c : cs | strlim c && strlim (last cs) -> init cs
|
||||
_ -> s
|
||||
where
|
||||
strlim = (=='\'')
|
||||
35
src/GF/Text/UTF8.hs
Normal file
35
src/GF/Text/UTF8.hs
Normal file
@@ -0,0 +1,35 @@
|
||||
module UTF8 where
|
||||
|
||||
-- From the Char module supplied with HBC.
|
||||
-- code by Thomas Hallgren (Jul 10 1999)
|
||||
|
||||
-- Take a Unicode string and encode it as a string
|
||||
-- with the UTF8 method.
|
||||
decodeUTF8 :: String -> String
|
||||
decodeUTF8 "" = ""
|
||||
decodeUTF8 (c:cs) | c < '\x80' = c : decodeUTF8 cs
|
||||
decodeUTF8 (c:c':cs) | '\xc0' <= c && c <= '\xdf' &&
|
||||
'\x80' <= c' && c' <= '\xbf' =
|
||||
toEnum ((fromEnum c `mod` 0x20) * 0x40 + fromEnum c' `mod` 0x40) : decodeUTF8 cs
|
||||
decodeUTF8 (c:c':c'':cs) | '\xe0' <= c && c <= '\xef' &&
|
||||
'\x80' <= c' && c' <= '\xbf' &&
|
||||
'\x80' <= c'' && c'' <= '\xbf' =
|
||||
toEnum ((fromEnum c `mod` 0x10 * 0x1000) + (fromEnum c' `mod` 0x40) * 0x40 + fromEnum c'' `mod` 0x40) : decodeUTF8 cs
|
||||
decodeUTF8 _ = error "UniChar.decodeUTF8: bad data"
|
||||
|
||||
encodeUTF8 :: String -> String
|
||||
encodeUTF8 "" = ""
|
||||
encodeUTF8 (c:cs) =
|
||||
if c > '\x0000' && c < '\x0080' then
|
||||
c : encodeUTF8 cs
|
||||
else if c < toEnum 0x0800 then
|
||||
let i = fromEnum c
|
||||
in toEnum (0xc0 + i `div` 0x40) :
|
||||
toEnum (0x80 + i `mod` 0x40) :
|
||||
encodeUTF8 cs
|
||||
else
|
||||
let i = fromEnum c
|
||||
in toEnum (0xe0 + i `div` 0x1000) :
|
||||
toEnum (0x80 + (i `mod` 0x1000) `div` 0x40) :
|
||||
toEnum (0x80 + i `mod` 0x40) :
|
||||
encodeUTF8 cs
|
||||
24
src/GF/Text/Unicode.hs
Normal file
24
src/GF/Text/Unicode.hs
Normal file
@@ -0,0 +1,24 @@
|
||||
module Unicode where
|
||||
|
||||
import Greek (mkGreek)
|
||||
import Arabic (mkArabic)
|
||||
import Hebrew (mkHebrew)
|
||||
import Russian (mkRussian, mkRusKOI8)
|
||||
|
||||
-- ad hoc Unicode conversions from different alphabets
|
||||
|
||||
-- AR 12/4/2000, 18/9/2001, 30/5/2002
|
||||
|
||||
mkUnicode s = case s of
|
||||
'/':'/':cs -> mkGreek (remClosing cs)
|
||||
'/':'+':cs -> mkHebrew (remClosing cs)
|
||||
'/':'-':cs -> mkArabic (remClosing cs)
|
||||
'/':'_':cs -> mkRussian (remClosing cs)
|
||||
'/':'*':cs -> mkRusKOI8 (remClosing cs)
|
||||
_ -> s
|
||||
|
||||
remClosing cs
|
||||
| lcs > 1 && last cs == '/' = take (lcs-2) cs
|
||||
| otherwise = cs
|
||||
where lcs = length cs
|
||||
|
||||
Reference in New Issue
Block a user