1
0
forked from GitHub/gf-core

reintroduce the compiler API

This commit is contained in:
Krasimir Angelov
2024-01-18 20:58:10 +01:00
parent 282c6fc50f
commit a82095d117
138 changed files with 84 additions and 342 deletions

View File

@@ -0,0 +1,40 @@
module GF.Text.Clitics (getClitics,getCliticsText) where
import Data.List
-- AR 6/2/2011
-- Analyse word as stem+clitic whenever
-- (1) clitic is in clitic list
-- (2) either
-- (a) stem is in Lexicon
-- (b) stem can be analysed as stem0+clitic0
--
-- Examples:
-- Italian amarmi = amar+mi
-- Finnish autossanikohan = autossa+ni+kohan
--
-- The analysis gives all results, including the case where the whole word is in Lexicon.
--
-- The clitics in the list are expected to be reversed.
getClitics :: (String -> Bool) -> [String] -> String -> [[String]]
getClitics isLex rclitics = map (reverse . map reverse) . clits . reverse where
clits rword = ifLex rword [rclit:more |
rclit <- rclitics, stem <- splits rclit rword, more <- clits stem]
splits c = maybe [] return . stripPrefix c
ifLex w ws = if isLex (reverse w) then [w] : ws else ws
getCliticsText :: (String -> Bool) -> [String] -> [String] -> [String]
getCliticsText isLex rclitics =
map unwords . sequence . map (map render . getClitics isLex rclitics)
where
render = unwords . intersperse "&+"
-- example
--getClitics1 = getClitics exlex1 exclits1
--exlex1 = flip elem ["auto", "naise", "rahan","maa","maahan","maahankaan"]
--exclits1 = map reverse ["ni","ko","han","pas","nsa","kin","kaan"]

View File

@@ -0,0 +1,75 @@
module GF.Text.Coding where
import qualified Data.ByteString as BS
import Data.ByteString.Internal
import GHC.IO
import GHC.IO.Buffer
import GHC.IO.Encoding
import GHC.IO.Exception
import Control.Monad
encodeUnicode :: TextEncoding -> String -> ByteString
encodeUnicode enc s =
unsafePerformIO $ do
let len = length s
cbuf0 <- newCharBuffer (len*4) ReadBuffer
foldM (\i c -> writeCharBuf (bufRaw cbuf0) i c) 0 s
let cbuf = cbuf0{bufR=len}
case enc of
TextEncoding {mkTextEncoder=mk} -> do encoder <- mk
bss <- translate (encode encoder) cbuf
close encoder
return (BS.concat bss)
where
translate cod cbuf
| i < w = do bbuf <- newByteBuffer 128 WriteBuffer
(_,cbuf,bbuf) <- cod cbuf bbuf
if isEmptyBuffer bbuf
then ioe_invalidCharacter1
else do let bs = PS (bufRaw bbuf) (bufL bbuf) (bufR bbuf-bufL bbuf)
bss <- translate cod cbuf
return (bs:bss)
| otherwise = return []
where
i = bufL cbuf
w = bufR cbuf
decodeUnicode :: TextEncoding -> ByteString -> String
decodeUnicode enc bs = unsafePerformIO $ decodeUnicodeIO enc bs
decodeUnicodeIO enc (PS fptr l len) = do
let bbuf = (emptyBuffer fptr len ReadBuffer) { bufL=l, bufR=l+len }
cbuf <- newCharBuffer 128 WriteBuffer
case enc of
TextEncoding {mkTextDecoder=mk} -> do decoder <- mk
s <- translate (encode decoder) bbuf cbuf
close decoder
return s
where
translate cod bbuf cbuf
| i < w = do
(_,bbuf,cbuf) <- cod bbuf cbuf
if isEmptyBuffer cbuf
then ioe_invalidCharacter2
else unpack cod bbuf cbuf
| otherwise = return []
where
i = bufL bbuf
w = bufR bbuf
unpack cod bbuf cbuf
| i < w = do (c,i') <- readCharBuf (bufRaw cbuf) i
cs <- unpack cod bbuf cbuf{bufL=i'}
return (c:cs)
| otherwise = translate cod bbuf cbuf{bufL=0,bufR=0}
where
i = bufL cbuf
w = bufR cbuf
ioe_invalidCharacter1 = ioException
(IOError Nothing InvalidArgument ""
("invalid byte sequence for this encoding") Nothing Nothing)
ioe_invalidCharacter2 = ioException
(IOError Nothing InvalidArgument ""
("invalid byte sequence for this decoding") Nothing Nothing)

View File

@@ -0,0 +1,448 @@
-- | Lexers and unlexers - they work on space-separated word strings
module GF.Text.Lexing (stringOp,opInEnv,bindTok) where
import GF.Text.Transliterations
import Data.Char (isSpace,toUpper,toLower)
import Data.List (intersperse)
stringOp :: (String -> Bool) -> String -> Maybe (String -> String)
stringOp good name = case name of
"chars" -> Just $ appLexer (filter (not . all isSpace) . map return)
"lextext" -> Just $ appLexer (lexText good)
"lexcode" -> Just $ appLexer lexCode
"lexmixed" -> Just $ appLexer (lexMixed good)
"lexgreek" -> Just $ appLexer lexAGreek
"lexgreek2" -> Just $ appLexer lexAGreek2
"words" -> Just $ appLexer words
"unchars" -> Just $ appUnlexer concat
"unlextext" -> Just $ appUnlexer (unlexText . unquote . bindTok)
"unlexcode" -> Just $ appUnlexer unlexCode
"unlexmixed" -> Just $ appUnlexer (unlexMixed good . unquote . bindTok)
"unlexgreek" -> Just $ appUnlexer unlexAGreek
"unlexnone" -> Just id
"unlexid" -> Just id
"unwords" -> Just $ appUnlexer unwords
"to_html" -> Just wrapHTML
_ -> transliterate name
-- perform op in environments beg--end, t.ex. between "--"
--- suboptimal implementation
opInEnv :: String -> String -> (String -> String) -> (String -> String)
opInEnv beg end op = concat . altern False . chop (lbeg, beg) [] where
chop mk@(lg, mark) s0 s =
let (tag,rest) = splitAt lg s in
if tag==mark then (reverse s0) : mark : chop (switch mk) [] rest
else case s of
c:cs -> chop mk (c:s0) cs
[] -> [reverse s0]
switch (lg,mark) = if mark==beg then (lend,end) else (lbeg,beg)
(lbeg,lend) = (length beg, length end)
altern m ts = case ts of
t:ws | not m && t==beg -> t : altern True ws
t:ws | m && t==end -> t : altern False ws
t:ws -> (if m then op t else t) : altern m ws
[] -> []
appLexer :: (String -> [String]) -> String -> String
appLexer f = unwords . filter (not . null) . f
appUnlexer :: ([String] -> String) -> String -> String
----appUnlexer f = unlines . map (f . words) . lines
appUnlexer f = f . words
wrapHTML :: String -> String
wrapHTML = unlines . tag . intersperse "<br>" . lines where
tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"]
-- * Text lexing
-- | Text lexing with standard word capitalization of the first word of every sentence
lexText :: (String -> Bool) -> String -> [String]
lexText good = lexText' (uncapitInit good)
-- | Text lexing with custom treatment of the first word of every sentence.
lexText' :: (String->String) -> String -> [String]
lexText' uncap1 = uncap . lext where
lext s = case s of
c:cs | isMajorPunct c -> [c] : uncap (lext cs)
c:cs | isMinorPunct c -> [c] : lext cs
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
_ -> [s]
uncap s = case s of
w:ws -> uncap1 w:ws
_ -> s
unlexText :: [String] -> String
unlexText = capitInit . unlext where
unlext s = case s of
w:[] -> w
w:[c]:[] | isPunct c -> w ++ [c]
w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
w:ws -> w ++ " " ++ unlext ws
_ -> []
-- | Bind tokens separated by Prelude.BIND, i.e. &+
bindTok :: [String] -> [String]
bindTok ws = case ws of
w1:"&+":w2:ws -> bindTok ((w1++w2):ws)
"&+":ws -> bindTok ws
"&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws)
"&|":ws -> bindTok ws
w:ws -> w:bindTok ws
[] -> []
-- * Code lexing
-- | Haskell lexer, usable for much code
lexCode :: String -> [String]
lexCode ss = case lex ss of
[(w@(_:_),ws)] -> w : lexCode ws
_ -> []
-- * Ancient Greek lexing
lexTextAGreek :: String -> [String]
lexTextAGreek s = lext s where
lext s = case s of
c:cs | isAGreekPunct c -> [c] : (lext cs)
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s
in w : lext cs
[] -> []
-- Philological greek text may use vowel length indicators. Then '.' is not a sentence
-- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c.
lexTextAGreek2 :: String -> [String]
lexTextAGreek2 s = lext s where
lext s = case s of
c:cs | isAGreekPunct c -> [c] : (lext cs)
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s
in case cs of
'.':'.':d:ds | isSpace d
-> (w++['.']) : lext ('.':d:ds)
'.':d:ds | isAGreekPunct d || isSpace d
-> (w++['.']) : lext (d:ds)
'.':d:ds | not (isSpace d)
-> case lext (d:ds) of
e:es -> (w++['.']++e) : es
es -> (w++['.']) : es
'.':[] -> (w++['.']) : []
_ -> w : lext cs
[] -> []
unlexTextAGreek :: [String] -> String
unlexTextAGreek = unlext where
unlext s = case s of
w:[] -> w
w:[c]:[] | isAGreekPunct c -> w ++ [c]
w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs
w:ws -> w ++ " " ++ unlext ws
[] -> []
isAGreekPunct = flip elem ".,;··" -- colon: first version · not in charset,
-- second version · = 00B7 standard code point
-- * Text lexing and unlexing for Ancient Greek:
-- 1. no capitalization of initial word,
-- 2. grave/acute accent switch on final syllables of words not followed by punctuation,
-- 3. accent move from/to support word to/from following clitic words (iterated).
lexAGreek :: String -> [String]
lexAGreek = fromAGreek . lexTextAGreek
lexAGreek2 :: String -> [String]
lexAGreek2 = fromAGreek . lexTextAGreek2
unlexAGreek :: [String] -> String
unlexAGreek = unlexTextAGreek . toAGreek
-- Note: unlexAGreek does not glue punctuation with the previous word, so that short
-- vowel indication (like a.) differs from sentence end (a .).
-- | normalize = change grave accent on sentence internal words to acute,
-- and shift inherited acutes to the following enclitic (where they are
-- visible only as shown in the list of enclitics above)
normalize :: String -> String
normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek)
fromAGreek :: [String] -> [String]
fromAGreek s = case s of
w:[]:vs -> w:[]:(fromAGreek vs)
w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs)
w:v:vs | wasEnclitic v && wasEnclitic w ->
getEnclitic w : fromAGreek (v:vs)
w:v:vs | wasEnclitic v && wasProclitic w -> -- "ei)' tines*"
getProclitic w : getEnclitic v : fromAGreek vs
w:v:vs | wasEnclitic v && (hasEndCircum w ||
(hasEndAcute w && hasSingleAccent w)) ->
w : getEnclitic v : fromAGreek vs -- ok "sofoi' tines*"
w:v:vs | wasEnclitic v && hasPrefinalAcute w ->
w : getEnclitic v : fromAGreek vs
w:v:vs | wasEnclitic v && hasEndAcute w -> -- ok "a)'nvrwpoi' tines*"
dropLastAccent w : getEnclitic v : fromAGreek vs
w:v:vs | wasEnclitic w ->
getEnclitic w : fromAGreek (v:vs)
w:ws -> (toAcute w) : (fromAGreek ws)
ws -> ws
-- | de-normalize = change acute accent of end syllables in sentence internal
-- (non-enclitic) words to grave accent, and move accents of enclitics to the
-- previous word to produce ordinary ancient greek
denormalize :: String -> String
denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek)
toAGreek :: [String] -> [String]
toAGreek s = case s of
w:[]:vs -> w:[]:(toAGreek vs)
w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs) -- w:[] for following -to_ancientgreek
w:v:vs | isEnclitic v && isEnclitic w ->
addAcute w : toAGreek (dropAccent v:vs) -- BR 11 Anm.2
w:v:vs | isEnclitic v && isProclitic w -> -- BR 11 a.beta
addAcute w: (toAGreek (dropAccent v:vs))
w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) ->
w:(toAGreek (dropAccent v:vs)) -- BR 11 a.alpha,beta
w:v:vs | isEnclitic v && hasPrefinalAcute w ->
w:v: toAGreek vs -- bisyllabic v keeps its accent BR 11 b.
w:v:vs | isEnclitic v ->
(addAcute w):(toAGreek (dropAccent v:vs)) -- BR 11 a.gamma
w:v:vs | isEnclitic w -> w:(toAGreek (v:vs))
w:ws -> (toGrave w) : (toAGreek ws)
ws -> ws
-- | Change accent on the final syllable of a word
toGrave :: String -> String
toGrave = reverse . grave . reverse where
grave s = case s of
'\'':cs -> '`':cs
c:cs | isAGreekVowel c -> c:cs
c:cs -> c: grave cs
_ -> s
toAcute :: String -> String
toAcute = reverse . acute . reverse where
acute s = case s of
'`':cs -> '\'':cs
c:cs | isAGreekVowel c -> c:cs
c:cs -> c: acute cs
_ -> s
isAGreekVowel = flip elem "aeioyhw"
-- | Accent moves for enclitics and proclitics (atona)
enclitics = [
"moy","moi","me", -- personal pronouns
"soy","soi","se",
"oy(","oi(","e(",
"tis*","ti","tina'", -- indefinite pronoun
"tino's*","tini'",
"tine's*","tina's*",
"tinw~n","tisi'","tisi'n",
"poy","poi", -- indefinite adverbs
"pove'n","pws*",
"ph|","pote'",
"ge","te","toi", -- particles
"nyn","per","pw"
-- suffix -"de"
-- praes.indik. of fhmi', ei)mi' (except fh's*, ei)~)
] -- and more, BR 11
proclitics = [
"o(","h(","oi(","ai(", -- articles
"e)n","ei)s*","e)x","e)k", -- prepositions
"ei)","w(s*", -- conjunctions
"oy)","oy)k","oy)c" -- negation
]
isEnclitic = flip elem enclitics
isProclitic = flip elem proclitics
-- Check if a word is an enclitic or accented enclitic and extract the enclitic
wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics)
++ (map dropAccent (filter hasAccent enclitics))
accented = (filter hasAccent enclitics)
++ map addAcute (filter (not . hasAccent) enclitics)
in flip elem (accented ++ unaccented)
wasProclitic = flip elem (map addAcute proclitics)
getEnclitic =
let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics))
++ (map addAcute (filter (not . hasAccent) enclitics)))
(enclitics ++ (filter hasAccent enclitics)
++ (filter (not . hasAccent) enclitics))
find = \v -> lookup v pairs
in \v -> case (find v) of
Just x -> x
_ -> v
getProclitic =
let pairs = zip (map addAcute proclitics) proclitics
find = \v -> lookup v pairs
in \v -> case (find v) of
Just x -> x
_ -> v
-- | Accent manipulation
dropAccent = reverse . drop . reverse where
drop s = case s of
[] -> []
'\'':cs -> cs
'`':cs -> cs
'~':cs -> cs
c:cs -> c:drop cs
dropLastAccent = reverse . drop . reverse where
drop s = case s of
[] -> []
'\'':cs -> cs
'`':cs -> cs
'~':cs -> cs
c:cs -> c:drop cs
addAcute :: String -> String
addAcute = reverse . acu . reverse where
acu w = case w of
c:cs | c == '\'' -> c:cs
c:cs | c == '(' -> '\'':c:cs
c:cs | c == ')' -> '\'':c:cs
c:cs | isAGreekVowel c -> '\'':c:cs
c:cs -> c : acu cs
_ -> w
-- | Accent checking on end syllables
hasEndAcute = find . reverse where
find s = case s of
[] -> False
'\'':cs -> True
'`':cs -> False
'~':cs -> False
c:cs | isAGreekVowel c -> False
_:cs -> find cs
hasEndCircum = find . reverse where
find s = case s of
[] -> False
'\'':cs -> False
'`':cs -> False
'~':cs -> True
c:cs | isAGreekVowel c -> False
_:cs -> find cs
hasPrefinalAcute = find . reverse where
find s = case s of
[] -> False
'\'':cs -> False -- final acute
'`':cs -> False
'~':cs -> False
c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs
c:cs | isAGreekVowel c -> findNext cs
_:cs -> find cs where
findNext s = case s of
[] -> False
'\'':cs -> True -- prefinal acute
'`':cs -> False
'~':cs -> False
c:cs | isAGreekVowel c -> False
_:cs -> findNext cs where
hasSingleAccent v =
hasAccent v && not (hasAccent (dropLastAccent v))
hasAccent v = case v of
[] -> False
c:cs -> elem c ['\'','`','~'] || hasAccent cs
{- Tests:
-- denormalization. Examples in BR 11 work:
-}
enclitics_expls = -- normalized
"sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n": -- a.alpha
"sofo's tis*":"sofoi' tine's*": -- a.beta
"ei) tis*":"ei) tine's*":
"a)'nvrwpos* tis*":"a)'nvrwpoi tine's*": -- a.gamma
"doy~los* tis*":"doy~loi tine's*":
"lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n": -- b.
"ei) poy tis* tina' i)'doi": -- Anm. 2.
[]
unlexCode :: [String] -> String
unlexCode s = case s of
w:[] -> w
[c]:cs | isParen c -> [c] ++ unlexCode cs
w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
w:ws -> w ++ " " ++ unlexCode ws
_ -> []
-- | LaTeX lexer in the math mode: \ should not be separated from the next word
lexLatexCode :: String -> [String]
lexLatexCode = restoreBackslash . lexCode where --- quick hack: postprocess Haskell's lex
restoreBackslash ws = case ws of
"\\":w:ww -> ("\\" ++ w) : restoreBackslash ww
w:ww -> w:restoreBackslash ww
_ -> ws
-- * Mixed lexing
-- | LaTeX style lexer, with "math" environment using Code between $...$
lexMixed :: (String -> Bool) -> String -> [String]
lexMixed good = concat . alternate False [] where
alternate env t s = case s of
'$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs
'\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs
c:cs -> alternate env (c:t) cs
_ -> [lex env (reverse t)]
lex env = if env then lexLatexCode else lexText good
unlexMixed :: (String -> Bool) -> [String] -> String
unlexMixed good = capitInit . concat . alternate False where
alternate env s = case s of
_:_ -> case break (flip elem ["$","\\[","\\]","\\(","\\)"]) s of
(t,[]) -> unlex env t : []
(t,c:m) -> unlex env t : sep env c m : alternate (not env) m
_ -> []
unlex env = if env then unlexCode else (uncapitInit good . unlexText)
sep env c m = case (m,env) of
([p]:_,True) | isPunct p -> c -- closing $ glued to next punct
(_, True) -> c ++ " " -- closing $ otherwise separated by space from what follows
_ -> " " ++ c -- put space before opening $
-- * Additional lexing uitilties
-- | Capitalize first letter
capitInit s = case s of
c:cs -> toUpper c : cs
_ -> s
-- | Uncapitalize first letter
uncapitInit good s =
case s of
c:cs | not (good s) -> toLower c : cs
_ -> s
-- | Unquote each string wrapped in double quotes
unquote = map unq where
unq s = case s of
'"':cs@(_:_) | last cs == '"' -> init cs
_ -> s
isPunct = flip elem ".?!,:;"
isMajorPunct = flip elem ".?!"
isMinorPunct = flip elem ",:;"
isParen = flip elem "()[]{}"
isClosing = flip elem ")]}"

View File

@@ -0,0 +1,48 @@
-- | Pretty printing with class
module GF.Text.Pretty(module GF.Text.Pretty,module PP) where
import qualified Text.PrettyPrint as PP
import Text.PrettyPrint as PP(Doc,Style(..),Mode(..),style,empty,isEmpty)
class Pretty a where
pp :: a -> Doc
ppList :: [a] -> Doc
ppList = fsep . map pp -- hmm
instance Pretty Doc where pp = id
instance Pretty Int where pp = PP.int
instance Pretty Integer where pp = PP.integer
instance Pretty Float where pp = PP.float
instance Pretty Double where pp = PP.double
instance Pretty Char where pp = PP.char; ppList = PP.text
instance Pretty a => Pretty [a] where
pp = ppList
ppList = fsep . map pp -- hmm
render x = PP.render (pp x)
render80 x = renderStyle style{lineLength=80,ribbonsPerLine=1} x
renderStyle s x = PP.renderStyle s (pp x)
infixl 5 $$,$+$
infixl 6 <>,<+>
x $$ y = pp x PP.$$ pp y
x $+$ y = pp x PP.$+$ pp y
x <+> y = pp x PP.<+> pp y
x <> y = pp x PP.<> pp y
braces x = PP.braces (pp x)
brackets x = PP.brackets (pp x)
cat xs = PP.cat (map pp xs)
doubleQuotes x = PP.doubleQuotes (pp x)
fcat xs = PP.fcat (map pp xs)
fsep xs = PP.fsep (map pp xs)
hang x d y = PP.hang (pp x) d (pp y)
hcat xs = PP.hcat (map pp xs)
hsep xs = PP.hsep (map pp xs)
nest d x = PP.nest d (pp x)
parens x = PP.parens (pp x)
punctuate x ys = PP.punctuate (pp x) (map pp ys)
quotes x = PP.quotes (pp x)
sep xs = PP.sep (map pp xs)
vcat xs = PP.vcat (map pp xs)

View File

@@ -0,0 +1,331 @@
module GF.Text.Transliterations (
transliterate,
transliterateWithFile,
transliteration,
characterTable,
transliterationPrintNames
) where
import Data.Char
import Numeric
import qualified Data.Map as Map
-- transliterations between ASCII and a Unicode character set
-- current transliterations: devanagari, thai
-- to add a new one: define the Unicode range and the corresponding ASCII strings,
-- which may be one or more characters long
-- conventions to be followed:
-- each character is either [letter] or [letter+nonletters]
-- when using a sparse range of unicodes, mark missing codes as "-" in transliterations
-- characters can be invisible: ignored in translation to unicode
transliterate :: String -> Maybe (String -> String)
transliterate s = case s of
'f':'r':'o':'m':'_':t -> fmap appTransFromUnicode $ transliteration t
't':'o':'_':t -> fmap appTransToUnicode $ transliteration t
_ -> Nothing
transliterateWithFile :: String -> String -> Bool -> (String -> String)
transliterateWithFile name src isFrom =
(if isFrom then appTransFromUnicode else appTransToUnicode) (getTransliterationFile name src)
transliteration :: String -> Maybe Transliteration
transliteration s = Map.lookup s allTransliterations
allTransliterations = Map.fromList [
("amharic",transAmharic),
("ancientgreek", transAncientGreek),
("arabic", transArabic),
("devanagari", transDevanagari),
("greek", transGreek),
("hebrew", transHebrew),
("persian", transPersian),
("sanskrit", transSanskrit),
("sindhi", transSindhi),
("nepali", transNepali),
("telugu", transTelugu),
("thai", transThai),
("urdu", transUrdu)
]
-- used in command options and help
transliterationPrintNames = [(t,printname p) | (t,p) <- Map.toList allTransliterations]
characterTable :: Transliteration -> String
characterTable = unlines . map prOne . Map.assocs . trans_from_unicode where
prOne (i,s) = unwords ["|", showHex i "", "|", [toEnum i], "|", s, "|"]
data Transliteration = Trans {
trans_to_unicode :: Map.Map String Int,
trans_from_unicode :: Map.Map Int String,
invisible_chars :: [String],
printname :: String
}
appTransToUnicode :: Transliteration -> String -> String
appTransToUnicode trans =
concat .
map (\c -> maybe c (return . toEnum) $
Map.lookup c (trans_to_unicode trans)
) .
filter (flip notElem (invisible_chars trans)) .
unchar
appTransFromUnicode :: Transliteration -> String -> String
appTransFromUnicode trans =
concat .
map (\c -> maybe [toEnum c] id $
Map.lookup c (trans_from_unicode trans)
) .
map fromEnum
mkTransliteration :: String -> [String] -> [Int] -> Transliteration
mkTransliteration name ts us =
Trans (Map.fromList (tzip ts us)) (Map.fromList (uzip us ts)) [] name
where
tzip ts us = [(t,u) | (t,u) <- zip ts us, t /= "-"]
uzip us ts = [(u,t) | (u,t) <- zip us ts, t /= "-"]
getTransliterationFile :: String -> String -> Transliteration
getTransliterationFile name = uncurry (mkTransliteration name) . codes
where
codes = unzip . map (mkOne . words) . filter (not . all isSpace) . lines
mkOne ws = case ws of
[c]:t:_ -> (t,fromEnum c) -- ä a:
u:t:_ -> (t,read u) -- 228 a: OR 0xe4
_ -> error $ "not a valid transliteration:" ++ unwords ws
unchar :: String -> [String]
unchar s = case s of
c:d:cs
| isAlpha d -> [c] : unchar (d:cs)
| isSpace d -> [c]:[d]: unchar cs
| otherwise -> let (ds,cs2) = break (\x -> isAlpha x || isSpace x) cs in
(c:d:ds) : unchar cs2
[_] -> [s]
_ -> []
transThai :: Transliteration
transThai = mkTransliteration "Thai" allTrans allCodes where
allTrans = words $
"- k k1 - k2 - k3 g c c1 c2 s' c3 y' d' t' " ++
"t1 t2 t3 n' d t t4 t5 t6 n b p p1 f p2 f' " ++
"p3 m y r - l - w s- s. s h l' O h' - " ++
"a. a a: a+ i i: v v: u u: - - - - - - " ++
"e e' o: a% a& L R S T1 T2 T3 T4 K - - - " ++
"N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 - - - - - - "
allCodes = [0x0e00 .. 0x0e7f]
transDevanagari :: Transliteration
transDevanagari =
(mkTransliteration "Devanagari"
allTransUrduHindi allCodes){invisible_chars = ["a"]} where
allCodes = [0x0900 .. 0x095f] ++ [0x0966 .. 0x096f]
allTransUrduHindi = words $
"- n~ m. h. - A A: I I: U U: r.- l.- - - E: " ++
"E+ - - O: O+ k k' g g' n- c c' j j' n* T " ++
"T' D D' N t t' d d' n - p p' b b' m y " ++
"r - l L - v s* S s h - - X~ - a: i " ++
"i: u u: r. l. - - e: e+ - - o: o+ X, - - " ++
"- - - - - - - - q x g. z R R' f - " ++
"N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 "
transUrdu :: Transliteration
transUrdu =
(mkTransliteration "Urdu" allTrans allCodes) where
allCodes = [0x0622 .. 0x062f] ++ [0x0630 .. 0x063a] ++ [0x0641,0x0642] ++ [0x06A9] ++ [0x0644 .. 0x0648] ++
[0x0654,0x0658,0x0679,0x067e,0x0686,0x0688,0x0691,0x0698,0x06af,0x06c1,0x06c3,0x06cc,0x06ba,0x06be,0x06d2] ++
[0x06f0 .. 0x06f9] ++ [0x061f,0x06D4]
allTrans = words $
"A - w^ - y^ a b - t C j H K d " ++ -- 0622 - 062f
"Z r z s X S Z- t- z- e G " ++ -- 0630 - 063a
"f q k l m n - w " ++ -- 0641, 0642, 0643 - 0648
"$ n- T p c D R x g h t: y N h' E " ++ -- 0654,658,679,67e,686,688,698,6af,6c1,6c3,6cc,6ba,6be,6d2
"N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 " ++ "? ."
transSindhi :: Transliteration
transSindhi =
(mkTransliteration "Sindhi" allTrans allCodes) where
allCodes = [0x062e] ++ [0x0627 .. 0x062f] ++ [0x0630 .. 0x063a] ++ [0x0641 .. 0x0648] ++
[0x067a,0x067b,0x067d,0x067e,0x067f] ++ [0x0680 .. 0x068f] ++
[0x0699,0x0918,0x06a6,0x061d,0x06a9,0x06af,0x06b3,0x06bb,0x06be,0x06f6,0x064a,0x06b1, 0x06aa, 0x06fd, 0x06fe] ++
[0x06f0 .. 0x06f9] ++ [0x061f,0x06D4]
allTrans = words $
"K a b - t C j H - d " ++ -- 0626 - 062f
"Z r z s X S Z- t- z- e G " ++ -- 0630 - 063a
"f q - L m n - W " ++ -- 0641 - 0648
"T! B T p T' " ++ -- 067a,067b,067d,067e,067f
"B' - - Y' J' - c c' - - d! - d' D - D' " ++ -- 0680 - 068f
"R - F' - k' g G' t' h' e' y c! k A M " ++ -- 0699, 0918, 06a6, 061d, 06a9,06af,06b3,06bb,06be,06f6,06cc,06b1
"N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 " ++ "? ."
transArabic :: Transliteration
transArabic = mkTransliteration "Arabic" allTrans allCodes where
allTrans = words $
" V A: A? w? A- y? A b t. t v g H K d " ++ -- 0621 - 062f
"W r z s C S D T Z c G " ++ -- 0630 - 063a
" f q k l m n h w y. y a. u. i. a u " ++ -- 0641 - 064f
"i v2 o a: V+ V- i: a+ " ++ -- 0650 - 0657
"A* q?" -- 0671 (used by AED)
allCodes = [0x0621..0x062f] ++ [0x0630..0x063a] ++
[0x0641..0x064f] ++ [0x0650..0x0657] ++ [0x0671,0x061f]
transPersian :: Transliteration
transPersian = (mkTransliteration "Persian/Farsi" allTrans allCodes)
{invisible_chars = ["a","u","i"]} where
allTrans = words $
" V A: A? w? A- y? A b t. t t- j H K d " ++ -- 0621 - 062f
"W r z s C S D T Z c G " ++ -- 0630 - 063a
" f q - l m n h v - y. a. u. i. a u " ++ -- 0640 - 064f
"i v2 o a: V+ V- i: a+ " ++ -- 0650 - 0657
"p c^ J k g y q? Z0"
allCodes = [0x0621..0x062f] ++ [0x0630..0x063a] ++
[0x0641..0x064f] ++ [0x0650..0x0657] ++
[0x067e,0x0686,0x0698,0x06a9,0x06af,0x06cc,0x061f,0x200c]
transNepali :: Transliteration
transNepali = mkTransliteration "Nepali" allTrans allCodes where
allTrans = words $
"z+ z= " ++
"- V M h: - H A i: I: f F Z - - - e: " ++
"E: - - O W k K g G n: C c j J Y q " ++
"Q x X N t T d D n - p P b B m y " ++
"r - l L - v S z s h - - ~ ` a i " ++
"I u U R - - - e E - - o w x: - - " ++
"O: - _ - - - - - - - - - - - - - " ++
"- - - - . > 0 1 2 3 4 5 6 7 8 9 " ++
"- - - - - - - - - - - - - - - - "
allCodes = [0x200c,0x200d] ++ [0x0900 .. 0x097f]
transHebrew :: Transliteration
transHebrew = mkTransliteration "unvocalized Hebrew" allTrans allCodes where
allTrans = words $
"A b g d h w z H T y K k l M m N " ++
"n S O P p Z. Z q r s t - - - - - " ++
"w2 w3 y2 g1 g2"
allCodes = [0x05d0..0x05f4]
transTelugu :: Transliteration
transTelugu = mkTransliteration "Telugu" allTrans allCodes where
allTrans = words $
"- c1 c2 c3 - A A: I I: U U: R_ L_ - E E: " ++
"A' - O O: A_ k k. g g. n. c c. j j. n' T " ++
"T. d d. N t t. d d. n - p p. b b. m y " ++
"r R l L - v s' S s h - - - c5 a: i " ++
"i: u u: r_ r. - e e: a' - o o: a_ c6 - - " ++
"- - - - - c7 c8 z Z - - - - - - - " ++
"R+ L+ l+ l* - - n0 n1 n2 n3 n4 n5 n6 n7 n8 n9 "
allCodes = [0x0c00 .. 0x0c7f]
transGreek :: Transliteration
transGreek = mkTransliteration "modern Greek" allTrans allCodes where
allTrans = words $
"- - - - - - A' - E' H' I' - O' - Y' W' " ++
"i= A B G D E Z H V I K L M N X O " ++
"P R - S T Y F C Q W I- Y- a' e' h' i' " ++
"y= a b g d e z h v i k l m n x o " ++
"p r s* s t y f c q w i- y- o' y' w' - "
allCodes = [0x0380 .. 0x03cf]
transAncientGreek :: Transliteration
transAncientGreek = mkTransliteration "ancient Greek" allTrans allCodes where
allTrans = words $
-- "- - - - - - - c: - - - - - - - - " ++ -- standard code point for colon: 00B7
"- - - - - - - - - - - - - - - - " ++
"i= A B G D E Z H V I K L M N X O " ++
"P R - S T Y F C Q W I- Y- - - - - " ++
"y= a b g d e z h v i k l m n x o " ++
"p r s* s t y f c q w i- y- - - - - " ++
"a) a( a)` a(` a)' a(' a)~ a(~ A) A( A)` A(` A)' A(' A)~ A(~ " ++ -- 1f00-1f09,1f0a-1f0f
"e) e( e)` e(` e)' e(' - - E) E( E)` E(` E)' E(' - - " ++
"h) h( h)` h(` h)' h(' h)~ h(~ H) H( H)` H(` H)' H(' H)~ H(~ " ++
"i) i( i)` i(` i)' i(' i)~ i(~ I) I( I)` I(` I)' I(' I)~ I(~ " ++
"o) o( o)` o(` o)' o(' - - O) O( O)` O(` O)' O(' - - " ++
"y) y( y)` y(` y)' y(' y)~ y(~ - Y( - Y(` - Y(' - Y(~ " ++
"w) w( w)` w(` w)' w(' w)~ w(~ W) W( W)` W(` W)' W(' W)~ W(~ " ++
"a` a' e` e' h` h' i` i' o` o' y` y' w` w' - - " ++
"a|) a|( a|)` a|(` a|)' a|(' a|)~ a|(~ - - - - - - - - " ++ -- 1f80-
"h|) h|( h|)` h|(` h|)' h|(' h|)~ h|(~ - - - - - - - - " ++ -- 1f90-
"w|) w|( w|)` w|(` w|)' w|(' w|)~ w|(~ - - - - - - - - " ++ -- 1fa0-
"a. a_ a|` a| a|' - a~ a|~ - - - - - - - - " ++ -- 1fb0-
"- - h|` h| h|' - h~ h|~ - - - - - - - - " ++ -- 1fc0-
"i. i_ i=` i=' - - i~ i=~ - - - - - - - - " ++ -- 1fd0-
"y. y_ y=` y=' r) r( y~ y=~ - - - - - - - - " ++ -- 1fe0-
"- - w|` w| w|' - w~ w|~ - - - - - - - - " ++ -- 1ff0-
-- HL, Private Use Area Code Points (New Athena Unicode, Cardo, ALPHABETUM, Antioch)
-- see: http://apagreekkeys.org/technicalDetails.html
-- GreekKeys Support by Donald Mastronarde
"- - - - - - - - - e. o. R) Y) Y)` Y)' Y)~ " ++ -- e1a0-e1af
"e~ e)~ e(~ e_ e_' e_` e_) e_( e_)` e_(` e_)' e_(' E)~ E(~ E_ E. " ++ -- e1b0-e1bf
"o~ o)~ o(~ o_ o_' o_` o_) o_( o_)` o_(` o_)' o_(' O)~ O(~ O_ O. " ++ -- e1c0-e1cf
"a_` - a_~ a_)` a_(` a_)~ a_(~ - a.` a.) a.)` a.(' a.(` - - - " ++ -- eaf0-eaff
"a_' - - - a_) a_( - a_)' - a_(' a.' a.( a.)' - - - " ++ -- eb00-eb0f
"e_)~ e_(~ - - - - - e_~ - - - - - - - - " ++ -- eb20-eb2f
"- - - - - - i_~ - i_` i_' - - i_) i_)' i_( i_(' " ++ -- eb30-eb3f
"i.' i.) i.)' i.( i.` i.)` - i.(' i.(` - - - - - - - " ++ -- eb40-eb4f
"- - - - i_)` i_(` - i_)~ i_(~ - o_~ o_)~ o_(~ - - - " ++ -- eb50-eb5f
"y_` " ++ -- eb6f
"y_~ y_)` - - - y_(` - y_)~ y_(~ - y_' - - y_) y_( y_)' " ++ -- eb70-eb7f
"y_(' y.' y.( y.` y.) y.)' - - y.)` y.(' y.(` - - - - - " -- eb80-eb8f
allCodes = -- [0x00B0 .. 0x00Bf]
[0x0380 .. 0x03cf] ++ [0x1f00 .. 0x1fff]
++ [0xe1a0 .. 0xe1af]
++ [0xe1b0 .. 0xe1bf]
++ [0xe1c0 .. 0xe1cf]
++ [0xeaf0 .. 0xeaff]
++ [0xeb00 .. 0xeb0f]
++ [0xeb20 .. 0xeb2f]
++ [0xeb30 .. 0xeb3f]
++ [0xeb40 .. 0xeb4f]
++ [0xeb50 .. 0xeb5f] ++ [0xeb6f]
++ [0xeb70 .. 0xeb7f]
++ [0xeb80 .. 0xeb8f]
transAmharic :: Transliteration
transAmharic = mkTransliteration "Amharic" allTrans allCodes where
allTrans = words $
" h. h- h' h( h) h h? h* l. l- l' l( l) l l? l* "++
" H. H- H' H( H) H H? H* m. m- m' m( m) m m? m* "++
" s. s- s' s( s) s s? s* r. r- r' r( r) r r? r* "++
" - - - - - - - - x. x- x' x( x) x x? x* "++
" q. q- q' q( q) q q? q* - - - - - - - - "++
" - - - - - - - - - - - - - - - - "++
" b. b- b' b( b) b b? b* v. v- v' v( v) v v? v* "++
" t. t- t' t( t) t t? t* c. c- c' c( c) c c? c* "++
" X. X- X' X( X) X X? - - - - X* - - - - "++
" n. n- n' n( n) n n? n* N. N- N' N( N) N N? N* "++
" a u i A E e o e* k. k- k' k( k) k k? - "++
" - - - k* - - - - - - - - - - - - "++
" - - - - - - - - w. w- w' w( w) w w? w* "++
" - - - - - - - - z. z- z' z( z) z z? z* "++
" Z. Z- Z' Z( Z) Z Z? Z* y. y- y' y( y) y y? y* "++
" d. d- d' d( d) d d? d* - - - - - - - - "++
" j. j- j' j( j) j j? j* g. g- g' g( g) g g? - "++
" - - - g* - - - - - - - - - - - - "++
" T. T- T' T( T) T T? T* C. C- C' C( C) C C? C* "++
" P. P- P' P( P) P P? P* S. S- S' S( S) S S? S* "++
" - - - - - - - - f. f- f' f( f) f f? f*"++
" p. p- p' p( p) p p? p*"
allCodes = [0x1200..0x1357]
-- by Prasad 31/5/2013
transSanskrit :: Transliteration
transSanskrit = (mkTransliteration "Sanskrit" allTrans allCodes) {invisible_chars = ["a"]} where
allTrans = words $
"- n~ m. h. - A A: I I: U U: R. L. - - E: " ++
"E+ - O O: O+ k k' g g' n- c c' j j' n* T " ++
"T' D D' N t t' d d' n - p p' b b' m y " ++
"r - l L - v s* S s h - - - v- a: i " ++
"i: u u: r. r.: - e e: e+ - o o: o+ a_ - - " ++
"o~ - - - - - - - q x G z R R' f - " ++
"R.: L.: l. l.: p, p. N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 "
allCodes = [0x0900 .. 0x097f]