reintroduce the compiler API

2024-01-18 20:58:10 +01:00
parent 282c6fc50f
commit a82095d117
138 changed files with 84 additions and 342 deletions
--- a/src/compiler/api/GF/Text/Clitics.hs
+++ b/src/compiler/api/GF/Text/Clitics.hs
@@ -0,0 +1,40 @@
+module GF.Text.Clitics (getClitics,getCliticsText) where
+
+import Data.List
+
+-- AR 6/2/2011
+-- Analyse word as stem+clitic whenever 
+--   (1) clitic is in clitic list
+--   (2) either 
+--      (a) stem is in Lexicon
+--      (b) stem can be analysed as stem0+clitic0
+-- 
+-- Examples: 
+--   Italian amarmi = amar+mi
+--   Finnish autossanikohan = autossa+ni+kohan
+--
+-- The analysis gives all results, including the case where the whole word is in Lexicon.
+-- 
+-- The clitics in the list are expected to be reversed.
+
+getClitics :: (String -> Bool) -> [String] -> String -> [[String]]
+getClitics isLex rclitics = map (reverse . map reverse) . clits . reverse where
+  clits rword = ifLex rword [rclit:more | 
+                  rclit <- rclitics, stem <- splits rclit rword, more <- clits stem]
+  splits c = maybe [] return . stripPrefix c
+
+  ifLex w ws = if isLex (reverse w) then [w] : ws else ws
+
+
+getCliticsText :: (String -> Bool) -> [String] -> [String] -> [String]
+getCliticsText isLex rclitics = 
+  map unwords . sequence . map (map render . getClitics isLex rclitics) 
+ where
+  render = unwords . intersperse "&+"
+
+
+-- example
+
+--getClitics1 = getClitics exlex1 exclits1
+--exlex1   = flip elem ["auto", "naise", "rahan","maa","maahan","maahankaan"]
+--exclits1 = map reverse ["ni","ko","han","pas","nsa","kin","kaan"]
--- a/src/compiler/api/GF/Text/Coding.hs
+++ b/src/compiler/api/GF/Text/Coding.hs
@@ -0,0 +1,75 @@
+module GF.Text.Coding where
+
+import qualified Data.ByteString as BS
+import Data.ByteString.Internal
+import GHC.IO
+import GHC.IO.Buffer
+import GHC.IO.Encoding
+import GHC.IO.Exception
+import Control.Monad
+
+encodeUnicode :: TextEncoding -> String -> ByteString
+encodeUnicode enc s =
+  unsafePerformIO $ do
+    let len = length s
+    cbuf0 <- newCharBuffer (len*4) ReadBuffer
+    foldM (\i c -> writeCharBuf (bufRaw cbuf0) i c) 0 s
+    let cbuf = cbuf0{bufR=len}
+    case enc of
+      TextEncoding {mkTextEncoder=mk} -> do encoder <- mk
+                                            bss <- translate (encode encoder) cbuf
+                                            close encoder
+                                            return (BS.concat bss)
+  where
+    translate cod cbuf
+      | i < w     = do bbuf <- newByteBuffer 128 WriteBuffer
+                       (_,cbuf,bbuf) <- cod cbuf bbuf
+                       if isEmptyBuffer bbuf
+                         then ioe_invalidCharacter1
+                         else do let bs = PS (bufRaw bbuf) (bufL bbuf) (bufR bbuf-bufL bbuf)
+                                 bss <- translate cod cbuf
+                                 return (bs:bss)
+      | otherwise = return []
+      where
+        i = bufL cbuf
+        w = bufR cbuf
+
+decodeUnicode :: TextEncoding -> ByteString -> String
+decodeUnicode enc bs = unsafePerformIO $ decodeUnicodeIO enc bs
+
+decodeUnicodeIO enc (PS fptr l len) = do
+    let bbuf = (emptyBuffer fptr len ReadBuffer) { bufL=l, bufR=l+len }
+    cbuf <- newCharBuffer 128 WriteBuffer
+    case enc of
+      TextEncoding {mkTextDecoder=mk} -> do decoder <- mk
+                                            s <- translate (encode decoder) bbuf cbuf
+                                            close decoder
+                                            return s
+  where
+    translate cod bbuf cbuf
+      | i < w     = do
+                       (_,bbuf,cbuf) <- cod bbuf cbuf
+                       if isEmptyBuffer cbuf
+                         then ioe_invalidCharacter2
+                         else unpack cod bbuf cbuf
+      | otherwise = return []
+      where
+        i = bufL bbuf
+        w = bufR bbuf
+                                            
+    unpack cod bbuf cbuf
+      | i < w     = do (c,i') <- readCharBuf (bufRaw cbuf) i
+                       cs <- unpack cod bbuf cbuf{bufL=i'}
+                       return (c:cs)
+      | otherwise = translate cod bbuf cbuf{bufL=0,bufR=0}
+      where
+        i = bufL cbuf
+        w = bufR cbuf
+
+ioe_invalidCharacter1 = ioException
+   (IOError Nothing InvalidArgument ""
+        ("invalid byte sequence for this encoding") Nothing Nothing)
+
+ioe_invalidCharacter2 = ioException
+   (IOError Nothing InvalidArgument ""
+        ("invalid byte sequence for this decoding") Nothing Nothing)
--- a/src/compiler/api/GF/Text/Lexing.hs
+++ b/src/compiler/api/GF/Text/Lexing.hs
@@ -0,0 +1,448 @@
+-- | Lexers and unlexers - they work on space-separated word strings
+module GF.Text.Lexing (stringOp,opInEnv,bindTok) where
+
+import GF.Text.Transliterations
+
+import Data.Char (isSpace,toUpper,toLower)
+import Data.List (intersperse)
+
+stringOp :: (String -> Bool) -> String -> Maybe (String -> String)
+stringOp good name = case name of
+  "chars"      -> Just $ appLexer (filter (not . all isSpace) . map return)
+  "lextext"    -> Just $ appLexer (lexText good)
+  "lexcode"    -> Just $ appLexer lexCode
+  "lexmixed"   -> Just $ appLexer (lexMixed good)
+  "lexgreek"   -> Just $ appLexer lexAGreek
+  "lexgreek2"  -> Just $ appLexer lexAGreek2
+  "words"      -> Just $ appLexer words
+  "unchars"    -> Just $ appUnlexer concat
+  "unlextext"  -> Just $ appUnlexer (unlexText . unquote . bindTok)
+  "unlexcode"  -> Just $ appUnlexer unlexCode
+  "unlexmixed" -> Just $ appUnlexer (unlexMixed good . unquote . bindTok)
+  "unlexgreek" -> Just $ appUnlexer unlexAGreek
+  "unlexnone"  -> Just id
+  "unlexid"    -> Just id
+  "unwords"    -> Just $ appUnlexer unwords
+  "to_html"    -> Just wrapHTML
+  _            -> transliterate name
+
+-- perform op in environments beg--end, t.ex. between "--"
+--- suboptimal implementation
+opInEnv :: String -> String -> (String -> String) -> (String -> String)
+opInEnv beg end op = concat . altern False . chop (lbeg, beg) [] where
+  chop mk@(lg, mark) s0 s = 
+    let (tag,rest) = splitAt lg s in
+    if tag==mark then (reverse s0) : mark : chop (switch mk) [] rest 
+      else case s of
+        c:cs -> chop mk (c:s0) cs
+        [] -> [reverse s0]
+  switch (lg,mark) = if mark==beg then (lend,end) else (lbeg,beg)
+  (lbeg,lend) = (length beg, length end)
+  altern m ts = case ts of
+    t:ws | not m && t==beg -> t : altern True ws
+    t:ws | m     && t==end -> t : altern False ws
+    t:ws -> (if m then op t else t) : altern m ws
+    [] -> []
+
+appLexer :: (String -> [String]) -> String -> String
+appLexer f = unwords . filter (not . null) . f
+
+appUnlexer :: ([String] -> String) -> String -> String
+----appUnlexer f = unlines . map (f . words) . lines
+appUnlexer f = f . words
+
+wrapHTML :: String -> String
+wrapHTML = unlines . tag . intersperse "<br>" . lines where
+  tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"]
+
+
+-- * Text lexing
+-- | Text lexing with standard word capitalization of the first word of every sentence
+lexText :: (String -> Bool) -> String -> [String]
+lexText good = lexText' (uncapitInit good)
+
+-- | Text lexing with custom treatment of the first word of every sentence.
+lexText' :: (String->String) -> String -> [String]
+lexText' uncap1 = uncap . lext where
+  lext s = case s of
+    c:cs | isMajorPunct c -> [c] : uncap (lext cs)
+    c:cs | isMinorPunct c -> [c] : lext cs
+    c:cs | isSpace c ->       lext cs
+    _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
+    _ -> [s]
+  uncap s = case s of
+    w:ws -> uncap1 w:ws
+    _ -> s
+
+unlexText :: [String] -> String
+unlexText = capitInit . unlext where
+  unlext s = case s of
+    w:[] -> w
+    w:[c]:[] | isPunct c -> w ++ [c]
+    w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
+    w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
+    w:ws -> w ++ " " ++ unlext ws
+    _ -> []
+
+-- | Bind tokens separated by Prelude.BIND, i.e. &+
+bindTok :: [String] -> [String]
+bindTok ws = case ws of
+               w1:"&+":w2:ws -> bindTok ((w1++w2):ws)
+               "&+":ws       -> bindTok ws
+               "&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws)
+               "&|":ws       -> bindTok ws
+               w:ws          -> w:bindTok ws
+               []            -> []
+
+-- * Code lexing
+
+-- | Haskell lexer, usable for much code
+lexCode :: String -> [String]
+lexCode ss = case lex ss of
+  [(w@(_:_),ws)] -> w : lexCode ws
+  _ -> []
+  
+
+-- * Ancient Greek lexing
+
+lexTextAGreek :: String -> [String]
+lexTextAGreek s = lext s where
+  lext s = case s of
+    c:cs | isAGreekPunct c -> [c] : (lext cs)
+    c:cs | isSpace c -> lext cs
+    _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s 
+           in w : lext cs
+    [] -> []
+
+-- Philological greek text may use vowel length indicators. Then '.' is not a sentence 
+-- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c.
+
+lexTextAGreek2 :: String -> [String]
+lexTextAGreek2 s = lext s where
+  lext s = case s of
+    c:cs | isAGreekPunct c -> [c] : (lext cs)
+    c:cs | isSpace c -> lext cs
+    _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s 
+           in case cs of 
+                '.':'.':d:ds | isSpace d 
+                  -> (w++['.']) : lext ('.':d:ds)
+                '.':d:ds | isAGreekPunct d || isSpace d 
+                  -> (w++['.']) : lext (d:ds)
+                '.':d:ds | not (isSpace d) 
+                  -> case lext (d:ds) of
+                       e:es -> (w++['.']++e) : es
+                       es -> (w++['.']) : es 
+                '.':[] -> (w++['.']) : []
+                _ -> w : lext cs      
+    [] -> []
+
+unlexTextAGreek :: [String] -> String
+unlexTextAGreek = unlext where
+  unlext s = case s of
+    w:[] -> w
+    w:[c]:[] | isAGreekPunct c -> w ++ [c]
+    w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs
+    w:ws -> w ++ " " ++ unlext ws
+    [] -> []
+
+isAGreekPunct = flip elem ".,;··"  -- colon: first version · not in charset,
+                                   -- second version · = 00B7 standard code point
+
+-- * Text lexing and unlexing for Ancient Greek: 
+--   1. no capitalization of initial word, 
+--   2. grave/acute accent switch on final syllables of words not followed by punctuation, 
+--   3. accent move from/to support word to/from following clitic words (iterated).
+
+lexAGreek :: String -> [String]
+lexAGreek = fromAGreek . lexTextAGreek
+
+lexAGreek2 :: String -> [String]
+lexAGreek2 = fromAGreek . lexTextAGreek2
+
+unlexAGreek :: [String] -> String
+unlexAGreek = unlexTextAGreek . toAGreek
+
+-- Note: unlexAGreek does not glue punctuation with the previous word, so that short
+-- vowel indication (like a.) differs from sentence end (a .). 
+
+-- | normalize = change grave accent on sentence internal words to acute, 
+-- and shift inherited acutes to the following enclitic (where they are
+-- visible only as shown in the list of enclitics above)
+
+normalize :: String -> String
+normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek) 
+
+fromAGreek :: [String] -> [String]
+fromAGreek s = case s of
+  w:[]:vs -> w:[]:(fromAGreek vs)
+  w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs)
+  w:v:vs | wasEnclitic v && wasEnclitic w ->
+    getEnclitic w : fromAGreek (v:vs)
+  w:v:vs | wasEnclitic v && wasProclitic w ->  -- "ei)' tines*"
+    getProclitic w : getEnclitic v : fromAGreek vs
+  w:v:vs | wasEnclitic v && (hasEndCircum w || 
+    (hasEndAcute w && hasSingleAccent w)) ->  
+    w : getEnclitic v : fromAGreek vs        -- ok "sofoi' tines*"
+  w:v:vs | wasEnclitic v && hasPrefinalAcute w ->
+    w : getEnclitic v : fromAGreek vs
+  w:v:vs | wasEnclitic v && hasEndAcute w ->  -- ok "a)'nvrwpoi' tines*"
+    dropLastAccent w : getEnclitic v : fromAGreek vs
+  w:v:vs | wasEnclitic w ->
+    getEnclitic w : fromAGreek (v:vs)
+  w:ws -> (toAcute w) : (fromAGreek ws)
+  ws -> ws 
+
+-- | de-normalize = change acute accent of end syllables in sentence internal 
+--  (non-enclitic) words to grave accent, and move accents of enclitics to the 
+--  previous word to produce ordinary ancient greek
+
+denormalize :: String -> String
+denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek) 
+
+toAGreek :: [String] -> [String]
+toAGreek s = case s of
+  w:[]:vs -> w:[]:(toAGreek vs)
+  w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs)  -- w:[] for following -to_ancientgreek
+  w:v:vs | isEnclitic v && isEnclitic w -> 
+    addAcute w : toAGreek (dropAccent v:vs)    -- BR 11 Anm.2
+  w:v:vs | isEnclitic v && isProclitic w ->    -- BR 11 a.beta
+    addAcute w: (toAGreek (dropAccent v:vs))
+  w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) -> 
+    w:(toAGreek (dropAccent v:vs))             -- BR 11 a.alpha,beta
+  w:v:vs | isEnclitic v && hasPrefinalAcute w -> 
+    w:v: toAGreek vs   -- bisyllabic v keeps its accent  BR 11 b.
+  w:v:vs | isEnclitic v -> 
+    (addAcute w):(toAGreek (dropAccent v:vs))  -- BR 11 a.gamma
+  w:v:vs | isEnclitic w -> w:(toAGreek (v:vs))
+  w:ws -> (toGrave w) : (toAGreek ws)
+  ws -> ws 
+
+-- | Change accent on the final syllable of a word
+
+toGrave :: String -> String 
+toGrave = reverse . grave . reverse where
+  grave s = case s of
+    '\'':cs -> '`':cs 
+    c:cs | isAGreekVowel c -> c:cs 
+    c:cs -> c: grave cs
+    _ -> s
+
+toAcute :: String -> String 
+toAcute = reverse . acute . reverse where
+  acute s = case s of
+    '`':cs -> '\'':cs
+    c:cs | isAGreekVowel c -> c:cs 
+    c:cs -> c: acute cs
+    _ -> s
+
+isAGreekVowel = flip elem "aeioyhw"
+
+-- | Accent moves for enclitics and proclitics (atona)
+
+enclitics = [
+  "moy","moi","me",     -- personal pronouns
+  "soy","soi","se",
+  "oy(","oi(","e(",
+  "tis*","ti","tina'",  -- indefinite pronoun 
+  "tino's*","tini'",
+  "tine's*","tina's*",
+  "tinw~n","tisi'","tisi'n",
+  "poy","poi",          -- indefinite adverbs
+  "pove'n","pws*",
+  "ph|","pote'",
+  "ge","te","toi",      -- particles
+  "nyn","per","pw" 
+   -- suffix -"de"
+   -- praes.indik. of fhmi', ei)mi' (except fh's*, ei)~)
+  ] -- and more, BR 11
+
+proclitics = [
+  "o(","h(","oi(","ai(",     -- articles
+  "e)n","ei)s*","e)x","e)k", -- prepositions
+  "ei)","w(s*",              -- conjunctions
+  "oy)","oy)k","oy)c"        -- negation
+  ]
+
+isEnclitic = flip elem enclitics
+isProclitic = flip elem proclitics
+
+-- Check if a word is an enclitic or accented enclitic and extract the enclitic
+
+wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics) 
+                    ++ (map dropAccent (filter hasAccent enclitics))
+                  accented = (filter hasAccent enclitics) 
+                    ++ map addAcute (filter (not . hasAccent) enclitics) 
+              in flip elem (accented ++ unaccented)
+
+wasProclitic = flip elem (map addAcute proclitics)
+
+getEnclitic = 
+  let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics))
+                   ++ (map addAcute (filter (not . hasAccent) enclitics)))
+                  (enclitics ++ (filter hasAccent enclitics)
+                   ++ (filter (not . hasAccent) enclitics))
+      find = \v -> lookup v pairs
+  in \v -> case (find v) of 
+    Just x -> x 
+    _ -> v
+getProclitic = 
+  let pairs = zip (map addAcute proclitics) proclitics 
+      find = \v -> lookup v pairs
+  in \v -> case (find v) of 
+    Just x -> x 
+    _ -> v
+
+-- | Accent manipulation
+
+dropAccent = reverse . drop . reverse where 
+  drop s = case s of
+    [] -> []
+    '\'':cs -> cs
+    '`':cs -> cs
+    '~':cs -> cs
+    c:cs -> c:drop cs
+
+dropLastAccent = reverse . drop . reverse where 
+  drop s = case s of
+    [] -> []
+    '\'':cs -> cs
+    '`':cs -> cs
+    '~':cs -> cs
+    c:cs -> c:drop cs
+
+addAcute :: String -> String
+addAcute = reverse . acu . reverse where
+  acu w = case w of 
+    c:cs | c == '\'' -> c:cs
+    c:cs | c == '(' -> '\'':c:cs
+    c:cs | c == ')' -> '\'':c:cs
+    c:cs | isAGreekVowel c -> '\'':c:cs
+    c:cs -> c : acu cs
+    _ -> w
+
+-- | Accent checking on end syllables
+
+hasEndAcute = find . reverse where
+  find s = case s of
+    [] -> False
+    '\'':cs -> True
+    '`':cs -> False
+    '~':cs -> False
+    c:cs | isAGreekVowel c -> False
+    _:cs -> find cs
+
+hasEndCircum = find . reverse where
+  find s = case s of
+    [] -> False
+    '\'':cs -> False
+    '`':cs -> False
+    '~':cs -> True
+    c:cs | isAGreekVowel c -> False
+    _:cs -> find cs
+
+hasPrefinalAcute = find . reverse where
+  find s = case s of
+    [] -> False
+    '\'':cs -> False  -- final acute
+    '`':cs -> False
+    '~':cs -> False
+    c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs
+    c:cs | isAGreekVowel c -> findNext cs
+    _:cs -> find cs where
+  findNext s = case s of 
+    [] -> False
+    '\'':cs -> True  -- prefinal acute
+    '`':cs -> False
+    '~':cs -> False
+    c:cs | isAGreekVowel c -> False
+    _:cs -> findNext cs where
+    
+hasSingleAccent v = 
+  hasAccent v && not (hasAccent (dropLastAccent v))
+
+hasAccent v = case v of 
+  [] -> False
+  c:cs -> elem c ['\'','`','~'] || hasAccent cs
+
+{- Tests: 
+
+-- denormalization. Examples in BR 11 work: 
+-}
+enclitics_expls = -- normalized
+  "sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n":  -- a.alpha
+  "sofo's tis*":"sofoi' tine's*":                 -- a.beta
+  "ei) tis*":"ei) tine's*":
+  "a)'nvrwpos* tis*":"a)'nvrwpoi tine's*":        -- a.gamma
+  "doy~los* tis*":"doy~loi tine's*":
+  "lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n":  -- b.
+  "ei) poy tis* tina' i)'doi":                    -- Anm. 2.
+  [] 
+
+
+unlexCode :: [String] -> String
+unlexCode s = case s of
+  w:[] -> w
+  [c]:cs | isParen c -> [c] ++ unlexCode cs
+  w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
+  w:ws -> w ++ " " ++ unlexCode ws
+  _ -> []
+
+
+-- | LaTeX lexer in the math mode: \ should not be separated from the next word
+
+lexLatexCode :: String -> [String]
+lexLatexCode = restoreBackslash . lexCode where --- quick hack: postprocess Haskell's lex
+  restoreBackslash ws = case ws of
+    "\\":w:ww -> ("\\" ++ w) : restoreBackslash ww
+    w:ww -> w:restoreBackslash ww
+    _ -> ws
+
+-- * Mixed lexing
+
+-- | LaTeX style lexer, with "math" environment using Code between $...$
+lexMixed :: (String -> Bool) -> String -> [String]
+lexMixed good = concat . alternate False [] where
+  alternate env t s = case s of
+    '$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs
+    '\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs
+    c:cs -> alternate env (c:t) cs
+    _ -> [lex env (reverse t)]
+  lex env = if env then lexLatexCode else lexText good
+
+unlexMixed :: (String -> Bool) -> [String] -> String
+unlexMixed good = capitInit . concat . alternate False where
+  alternate env s = case s of
+    _:_ -> case break (flip elem ["$","\\[","\\]","\\(","\\)"]) s of
+      (t,[])  -> unlex env t : []
+      (t,c:m) -> unlex env t : sep env c m : alternate (not env) m
+    _ -> []
+  unlex env = if env then unlexCode else (uncapitInit good . unlexText)
+  sep env c m = case (m,env) of
+    ([p]:_,True) | isPunct p -> c   -- closing $ glued to next punct 
+    (_,  True) -> c ++ " "   -- closing $ otherwise separated by space from what follows
+    _ -> " " ++ c   -- put space before opening $
+
+-- * Additional lexing uitilties
+
+-- | Capitalize first letter
+capitInit s = case s of
+  c:cs -> toUpper c : cs
+  _ -> s
+
+-- | Uncapitalize first letter
+uncapitInit good s = 
+  case s of
+    c:cs | not (good s) -> toLower c : cs
+    _                   -> s
+
+-- | Unquote each string wrapped in double quotes
+unquote = map unq where 
+  unq s = case s of
+    '"':cs@(_:_) | last cs == '"' -> init cs
+    _ -> s
+
+isPunct = flip elem ".?!,:;"
+isMajorPunct = flip elem ".?!"
+isMinorPunct = flip elem ",:;"
+isParen = flip elem "()[]{}"
+isClosing = flip elem ")]}"
--- a/src/compiler/api/GF/Text/Pretty.hs
+++ b/src/compiler/api/GF/Text/Pretty.hs
@@ -0,0 +1,48 @@
+-- | Pretty printing with class
+module GF.Text.Pretty(module GF.Text.Pretty,module PP) where
+import qualified Text.PrettyPrint as PP
+import Text.PrettyPrint as PP(Doc,Style(..),Mode(..),style,empty,isEmpty)
+
+class Pretty a where
+  pp :: a -> Doc
+  ppList :: [a] -> Doc
+  ppList = fsep . map pp -- hmm
+
+instance Pretty Doc where pp = id
+instance Pretty Int where pp = PP.int
+instance Pretty Integer where pp = PP.integer
+instance Pretty Float where pp = PP.float
+instance Pretty Double where pp = PP.double
+instance Pretty Char where pp = PP.char; ppList = PP.text
+
+instance Pretty a => Pretty [a] where
+  pp = ppList
+  ppList = fsep . map pp -- hmm
+
+render x = PP.render (pp x)
+render80 x = renderStyle style{lineLength=80,ribbonsPerLine=1} x
+renderStyle s x = PP.renderStyle s (pp x)
+
+infixl 5 $$,$+$
+infixl 6 <>,<+>
+
+x $$ y = pp x PP.$$ pp y
+x $+$ y = pp x PP.$+$ pp y
+x <+> y = pp x PP.<+> pp y
+x <> y = pp x PP.<> pp y
+
+braces x = PP.braces (pp x)
+brackets x = PP.brackets (pp x)
+cat xs = PP.cat (map pp xs)
+doubleQuotes x = PP.doubleQuotes (pp x)
+fcat xs = PP.fcat (map pp xs)
+fsep xs = PP.fsep (map pp xs)
+hang x d y = PP.hang (pp x) d (pp y)
+hcat xs = PP.hcat (map pp xs)
+hsep xs = PP.hsep (map pp xs)
+nest d x = PP.nest d (pp x)
+parens x = PP.parens (pp x)
+punctuate x ys = PP.punctuate (pp x) (map pp ys)
+quotes x = PP.quotes (pp x)
+sep xs = PP.sep (map pp xs)
+vcat xs = PP.vcat (map pp xs)
--- a/src/compiler/api/GF/Text/Transliterations.hs
+++ b/src/compiler/api/GF/Text/Transliterations.hs
@@ -0,0 +1,331 @@
+module GF.Text.Transliterations (
+  transliterate,
+  transliterateWithFile,
+  transliteration,
+  characterTable,
+  transliterationPrintNames
+  ) where
+
+import Data.Char
+import Numeric
+import qualified Data.Map as Map
+
+-- transliterations between ASCII and a Unicode character set
+
+-- current transliterations: devanagari, thai
+
+-- to add a new one: define the Unicode range and the corresponding ASCII strings,
+-- which may be one or more characters long
+
+-- conventions to be followed:
+--   each character is either [letter] or [letter+nonletters]
+--   when using a sparse range of unicodes, mark missing codes as "-" in transliterations
+--   characters can be invisible: ignored in translation to unicode
+
+transliterate :: String -> Maybe (String -> String)
+transliterate s = case s of
+  'f':'r':'o':'m':'_':t -> fmap appTransFromUnicode $ transliteration t
+  't':'o':'_':t -> fmap appTransToUnicode $ transliteration t
+  _ -> Nothing
+
+transliterateWithFile :: String -> String -> Bool -> (String -> String)
+transliterateWithFile name src isFrom =
+  (if isFrom then appTransFromUnicode else appTransToUnicode) (getTransliterationFile name src)
+
+transliteration :: String -> Maybe Transliteration
+transliteration s = Map.lookup s allTransliterations
+
+allTransliterations = Map.fromList [
+  ("amharic",transAmharic),
+  ("ancientgreek", transAncientGreek),
+  ("arabic", transArabic),
+  ("devanagari", transDevanagari),
+  ("greek", transGreek),
+  ("hebrew", transHebrew),
+  ("persian", transPersian),
+  ("sanskrit", transSanskrit),
+  ("sindhi", transSindhi),
+  ("nepali", transNepali),
+  ("telugu", transTelugu),
+  ("thai", transThai),
+  ("urdu", transUrdu)
+ ]
+
+-- used in command options and help
+transliterationPrintNames = [(t,printname p) | (t,p) <- Map.toList allTransliterations]
+
+characterTable :: Transliteration -> String
+characterTable = unlines . map prOne . Map.assocs . trans_from_unicode where
+  prOne (i,s) = unwords ["|", showHex i "", "|", [toEnum i], "|", s, "|"]
+
+data Transliteration = Trans {
+  trans_to_unicode   :: Map.Map String Int,
+  trans_from_unicode :: Map.Map Int String,
+  invisible_chars    :: [String],
+  printname          :: String
+  }
+
+appTransToUnicode :: Transliteration -> String -> String
+appTransToUnicode trans =
+  concat .
+  map (\c -> maybe c (return . toEnum) $
+             Map.lookup c (trans_to_unicode trans)
+      ) .
+  filter (flip notElem (invisible_chars trans)) .
+  unchar
+
+appTransFromUnicode :: Transliteration -> String -> String
+appTransFromUnicode trans =
+  concat .
+  map (\c -> maybe [toEnum c] id $
+             Map.lookup c (trans_from_unicode trans)
+      ) .
+  map fromEnum
+
+
+mkTransliteration :: String -> [String] -> [Int] -> Transliteration
+mkTransliteration name ts us =
+ Trans (Map.fromList (tzip ts us)) (Map.fromList (uzip us ts)) [] name
+  where
+    tzip ts us = [(t,u) | (t,u) <- zip ts us, t /= "-"]
+    uzip us ts = [(u,t) | (u,t) <- zip us ts, t /= "-"]
+
+getTransliterationFile :: String -> String -> Transliteration
+getTransliterationFile name = uncurry (mkTransliteration name) . codes
+ where
+  codes = unzip . map (mkOne . words) . filter (not . all isSpace) . lines
+  mkOne ws = case ws of
+    [c]:t:_ -> (t,fromEnum c)  -- ä a:
+    u:t:_   -> (t,read u)      -- 228 a: OR 0xe4
+    _ -> error $ "not a valid transliteration:" ++ unwords ws
+
+unchar :: String -> [String]
+unchar s = case s of
+  c:d:cs
+   | isAlpha d -> [c]    : unchar (d:cs)
+   | isSpace d -> [c]:[d]: unchar cs
+   | otherwise -> let (ds,cs2) = break (\x -> isAlpha x || isSpace x) cs in
+                  (c:d:ds) : unchar cs2
+  [_]          -> [s]
+  _            -> []
+
+transThai :: Transliteration
+transThai = mkTransliteration "Thai" allTrans allCodes where
+  allTrans = words $
+    "-  k  k1 -  k2 -  k3 g  c  c1 c2 s' c3 y' d' t' " ++
+    "t1 t2 t3 n' d  t  t4 t5 t6 n  b  p  p1 f  p2 f' " ++
+    "p3 m  y  r  -  l  -  w  s- s. s  h  l' O  h' -  " ++
+    "a. a  a: a+ i  i: v  v: u  u: -  -  -  -  -  -  " ++
+    "e  e' o: a% a& L  R  S  T1 T2 T3 T4 K  -  -  -  " ++
+    "N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 -  -  -  -  -  -  "
+  allCodes = [0x0e00 .. 0x0e7f]
+
+transDevanagari :: Transliteration
+transDevanagari =
+  (mkTransliteration "Devanagari"
+    allTransUrduHindi allCodes){invisible_chars = ["a"]} where
+      allCodes = [0x0900 .. 0x095f] ++ [0x0966 .. 0x096f]
+
+allTransUrduHindi = words $
+    "-  n~ m. h.  -  A A: I I: U U: r.- l.-  -  -  E: " ++
+    "E+ -  -  O: O+ k  k'  g  g'  n- c  c'  j  j'  n* T " ++
+    "T' D D' N t  t'  d  d'  n  -  p  p'  b  b'  m  y  " ++
+    "r  -  l  L  -  v  s*  S  s  h  -  -  X~ -  a:  i  " ++
+    "i:  u  u:  r.  l.  -  -  e:  e+  -  -  o:  o+  X,  -  -  " ++
+    "-  -  -  -  -  -  -  -  q  x  g.  z  R R'  f  -  " ++
+    "N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 "
+
+
+transUrdu :: Transliteration
+transUrdu =
+  (mkTransliteration "Urdu" allTrans allCodes) where
+    allCodes = [0x0622 .. 0x062f] ++ [0x0630 .. 0x063a] ++ [0x0641,0x0642] ++ [0x06A9] ++ [0x0644 .. 0x0648] ++
+               [0x0654,0x0658,0x0679,0x067e,0x0686,0x0688,0x0691,0x0698,0x06af,0x06c1,0x06c3,0x06cc,0x06ba,0x06be,0x06d2] ++
+               [0x06f0 .. 0x06f9] ++ [0x061f,0x06D4]
+    allTrans = words $
+      "A - w^ - y^ a b - t C j H K d " ++  -- 0622 - 062f
+      "Z r z s X S Z- t- z- e G "   ++  -- 0630 - 063a
+      "f q k l m n - w "    ++  -- 0641, 0642, 0643 - 0648
+      "$ n- T p c D R x g h t: y N h' E " ++  -- 0654,658,679,67e,686,688,698,6af,6c1,6c3,6cc,6ba,6be,6d2
+      "N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 " ++ "? ."
+
+transSindhi :: Transliteration
+transSindhi =
+  (mkTransliteration "Sindhi" allTrans allCodes) where
+    allCodes = [0x062e] ++ [0x0627 .. 0x062f] ++ [0x0630 .. 0x063a] ++ [0x0641 .. 0x0648] ++
+               [0x067a,0x067b,0x067d,0x067e,0x067f] ++ [0x0680 .. 0x068f] ++
+               [0x0699,0x0918,0x06a6,0x061d,0x06a9,0x06af,0x06b3,0x06bb,0x06be,0x06f6,0x064a,0x06b1, 0x06aa, 0x06fd, 0x06fe] ++
+               [0x06f0 .. 0x06f9] ++ [0x061f,0x06D4]
+    allTrans = words $
+      "K a b - t C j H - d " ++  -- 0626 - 062f
+      "Z r z s X S Z- t- z- e G "   ++  -- 0630 - 063a
+      "f q - L m n - W "    ++  -- 0641 - 0648
+      "T! B T p T' " ++  -- 067a,067b,067d,067e,067f
+      "B' - - Y' J' - c c' - - d! - d' D - D' " ++  -- 0680 - 068f
+      "R - F' - k' g G' t' h' e' y c! k A M " ++  -- 0699, 0918, 06a6, 061d, 06a9,06af,06b3,06bb,06be,06f6,06cc,06b1
+      "N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 " ++ "? ."
+
+
+
+transArabic :: Transliteration
+transArabic = mkTransliteration "Arabic" allTrans allCodes where
+  allTrans = words $
+    "   V  A: A? w? A- y? A  b  t. t  v  g  H  K  d " ++  -- 0621 - 062f
+    "W  r  z  s  C S  D  T  Z  c  G "                 ++  -- 0630 - 063a
+    "   f  q  k  l  m  n  h  w  y. y a. u. i. a  u "  ++  -- 0641 - 064f
+    "i  v2 o  a: V+ V- i: a+ "                        ++  -- 0650 - 0657
+    "A*  q?"                                              -- 0671 (used by AED)
+  allCodes = [0x0621..0x062f] ++ [0x0630..0x063a] ++
+             [0x0641..0x064f] ++ [0x0650..0x0657] ++ [0x0671,0x061f]
+
+transPersian :: Transliteration
+transPersian = (mkTransliteration "Persian/Farsi" allTrans allCodes)
+    {invisible_chars = ["a","u","i"]} where
+  allTrans = words $
+    "   V  A: A? w? A- y? A  b  t. t  t-  j  H  K  d " ++  -- 0621 - 062f
+    "W  r  z  s  C  S  D  T  Z  c  G "                 ++  -- 0630 - 063a
+    "   f  q  -  l  m  n  h  v  -  y. a. u. i. a  u "   ++  -- 0640 - 064f
+    "i  v2 o  a: V+ V- i: a+ " ++                          -- 0650 - 0657
+    "p  c^ J  k  g  y q? Z0"
+  allCodes = [0x0621..0x062f] ++ [0x0630..0x063a] ++
+             [0x0641..0x064f] ++ [0x0650..0x0657] ++
+             [0x067e,0x0686,0x0698,0x06a9,0x06af,0x06cc,0x061f,0x200c]
+
+transNepali :: Transliteration
+transNepali = mkTransliteration "Nepali" allTrans allCodes where
+  allTrans = words $
+    "z+  z= " ++
+    "-  V  M  h: -  H  A  i: I: f  F  Z  -  -  -  e: " ++
+    "E: -  -  O  W  k  K  g  G  n: C  c  j  J  Y  q  " ++
+    "Q  x  X  N  t  T  d  D  n  -  p  P  b  B  m  y  " ++
+    "r  -  l  L  -  v  S  z  s  h  -  -  ~  `  a  i  " ++
+    "I  u  U  R  -  -  -  e  E  -  -  o  w  x: -  -  " ++
+    "O: -  _  -  -  -  -  -  -  -  -  -  -  -  -  -  " ++
+    "-  -  -  -  .  >  0  1  2  3  4  5  6  7  8  9  " ++
+    "-  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  "
+  allCodes = [0x200c,0x200d] ++ [0x0900 .. 0x097f]
+
+
+transHebrew :: Transliteration
+transHebrew = mkTransliteration "unvocalized Hebrew" allTrans allCodes where
+  allTrans = words $
+    "A  b  g  d  h  w  z  H  T  y  K  k  l  M  m  N " ++
+    "n  S  O  P  p  Z. Z  q  r  s  t  -  -  -  -  - " ++
+    "w2 w3 y2 g1 g2"
+  allCodes = [0x05d0..0x05f4]
+
+transTelugu :: Transliteration
+transTelugu = mkTransliteration "Telugu" allTrans allCodes where
+  allTrans = words $
+    "-  c1 c2 c3 -  A  A: I  I: U  U: R_ L_ -  E  E: " ++
+    "A' -  O  O: A_ k  k. g  g. n. c  c. j  j. n' T  " ++
+    "T. d  d. N  t  t. d  d. n  -  p  p. b  b. m  y  " ++
+    "r  R  l  L  -  v  s' S  s  h  -  -  -  c5 a: i  " ++
+    "i: u  u: r_ r. -  e  e: a' -  o  o: a_ c6 -  -  " ++
+    "-  -  -  -  -  c7 c8 z  Z  -  -  -  -  -  -  -  " ++
+    "R+ L+ l+ l* -  -  n0 n1 n2 n3 n4 n5 n6 n7 n8 n9 "
+  allCodes = [0x0c00 .. 0x0c7f]
+
+transGreek :: Transliteration
+transGreek = mkTransliteration "modern Greek" allTrans allCodes where
+  allTrans = words $
+    "-  -  -  -  -  -  A' -  E' H' I' -  O' -  Y' W' " ++
+    "i= A  B  G  D  E  Z  H  V  I  K  L  M  N  X  O  " ++
+    "P  R  -  S  T  Y  F  C  Q  W  I- Y- a' e' h' i' " ++
+    "y= a  b  g  d  e  z  h  v  i  k  l  m  n  x  o  " ++
+    "p  r  s* s  t  y  f  c  q  w  i- y- o' y' w' -  "
+  allCodes = [0x0380 .. 0x03cf]
+
+transAncientGreek :: Transliteration
+transAncientGreek = mkTransliteration "ancient Greek" allTrans allCodes where
+ allTrans = words $
+--   "-  -  -  -  -  -  -  c: -  -  -  -  -  -  -  - " ++  -- standard code point for colon: 00B7
+   "-  -  -  -  -  -  -  -  -  -  -  -  -  -  -  - " ++
+   "i= A  B  G  D  E  Z  H  V  I  K  L  M  N  X  O  " ++
+   "P  R  -  S  T  Y  F  C  Q  W  I- Y- -  -  -  -  " ++
+   "y= a  b  g  d  e  z  h  v  i  k  l  m  n  x  o  " ++
+   "p  r  s* s  t  y  f  c  q  w  i- y- -  -  -  -  " ++
+   "a)  a(  a)` a(` a)' a(' a)~ a(~ A)  A(  A)` A(` A)' A(' A)~ A(~ " ++  -- 1f00-1f09,1f0a-1f0f
+   "e)  e(  e)` e(` e)' e(' -   -   E)  E(  E)` E(` E)' E(' -   -   " ++
+   "h)  h(  h)` h(` h)' h(' h)~ h(~ H)  H(  H)` H(` H)' H(' H)~ H(~ " ++
+   "i)  i(  i)` i(` i)' i(' i)~ i(~ I)  I(  I)` I(` I)' I(' I)~ I(~ " ++
+   "o)  o(  o)` o(` o)' o(' -   -   O)  O(  O)` O(` O)' O(' -   -   " ++
+   "y)  y(  y)` y(` y)' y(' y)~ y(~ -   Y(  -   Y(` -   Y(' -   Y(~ " ++
+   "w)  w(  w)` w(` w)' w(' w)~ w(~ W)  W(  W)` W(` W)' W(' W)~ W(~ " ++
+   "a`  a'  e`  e'  h`  h'  i`  i'  o`  o'  y`  y'  w`  w'  -   -   " ++
+   "a|) a|( a|)` a|(` a|)' a|(' a|)~ a|(~ - - - - - - - - " ++ -- 1f80-
+   "h|) h|( h|)` h|(` h|)' h|(' h|)~ h|(~ - - - - - - - - " ++ -- 1f90-
+   "w|) w|( w|)` w|(` w|)' w|(' w|)~ w|(~ - - - - - - - - " ++ -- 1fa0-
+   "a.  a_  a|` a|  a|'  -  a~ a|~ - - - - - - - - " ++ -- 1fb0-
+   "-  -  h|` h|  h|'  -  h~ h|~ - - - - - - - - " ++ -- 1fc0-
+   "i. i_ i=` i=' -    -  i~ i=~ - - - - - - - - " ++ -- 1fd0-
+   "y. y_ y=` y=' r)   r( y~ y=~ - - - - - - - - " ++ -- 1fe0-
+   "-  -  w|` w|  w|'  -  w~ w|~ - - - - - - - - " ++ -- 1ff0-
+   -- HL, Private Use Area Code Points (New Athena Unicode, Cardo, ALPHABETUM, Antioch)
+   -- see: http://apagreekkeys.org/technicalDetails.html
+   --      GreekKeys Support by Donald Mastronarde
+   "- - - - - - - - - e. o. R) Y) Y)` Y)' Y)~ "    ++ -- e1a0-e1af
+   "e~ e)~ e(~ e_ e_' e_` e_) e_( e_)` e_(` e_)' e_(' E)~ E(~ E_ E. " ++ -- e1b0-e1bf
+   "o~ o)~ o(~ o_ o_' o_` o_) o_( o_)` o_(` o_)' o_(' O)~ O(~ O_ O. " ++ -- e1c0-e1cf
+   "a_` - a_~ a_)` a_(` a_)~ a_(~ - a.` a.) a.)` a.(' a.(` - - - "    ++ -- eaf0-eaff
+   "a_' - - - a_) a_( - a_)' - a_(' a.' a.( a.)' - - - "              ++ -- eb00-eb0f
+   "e_)~ e_(~ - - - - - e_~ - - - - - - - - "                         ++ -- eb20-eb2f
+   "- - - - - - i_~ - i_` i_' - - i_) i_)' i_( i_(' "                 ++ -- eb30-eb3f
+   "i.' i.) i.)' i.( i.` i.)` - i.(' i.(` - - - - - - - "             ++ -- eb40-eb4f
+   "- - - - i_)` i_(` - i_)~ i_(~ - o_~ o_)~ o_(~ - - - "             ++ -- eb50-eb5f
+   "y_` " ++ -- eb6f
+   "y_~ y_)` - - - y_(` - y_)~ y_(~ - y_' - - y_) y_( y_)' "  ++         -- eb70-eb7f
+   "y_(' y.' y.( y.` y.) y.)' - - y.)` y.(' y.(` - - - - - "             -- eb80-eb8f
+ allCodes =  -- [0x00B0 .. 0x00Bf]
+             [0x0380 .. 0x03cf] ++ [0x1f00 .. 0x1fff]
+          ++ [0xe1a0 .. 0xe1af]
+          ++ [0xe1b0 .. 0xe1bf]
+          ++ [0xe1c0 .. 0xe1cf]
+          ++ [0xeaf0 .. 0xeaff]
+          ++ [0xeb00 .. 0xeb0f]
+          ++ [0xeb20 .. 0xeb2f]
+          ++ [0xeb30 .. 0xeb3f]
+          ++ [0xeb40 .. 0xeb4f]
+          ++ [0xeb50 .. 0xeb5f] ++ [0xeb6f]
+          ++ [0xeb70 .. 0xeb7f]
+          ++ [0xeb80 .. 0xeb8f]
+
+transAmharic :: Transliteration
+transAmharic = mkTransliteration "Amharic" allTrans allCodes where
+  allTrans = words $
+    " h.  h-  h'  h(  h)  h  h?  h*  l.  l-  l'  l(  l)  l  l?  l*  "++
+    " H.  H-  H'  H(  H)  H  H?  H*  m.  m-  m'  m(  m)  m  m?  m*  "++
+    " s.  s-  s'  s(  s)  s  s?  s*  r.  r-  r'  r(  r)  r  r?  r* "++
+    " -   -   -   -   -  -   -  -   x.  x-  x'  x(  x)  x  x?   x* "++
+    " q.  q-  q'  q(  q)  q  q?  q*  -   -   -   -   -   -  -   - "++
+    " -   -   -   -   -   -  -   -   -   -   -   -   -   -  -   - "++
+    " b.  b-  b'  b(  b)  b  b?  b*  v.  v-  v'  v(  v)  v  v?  v* "++
+    " t.  t-  t'  t(  t)  t  t?  t*  c.  c-  c'  c(  c)  c  c?  c* "++
+    " X.  X-  X'  X(  X)  X  X?  -   -   -   -   X*  -   -  -   - "++
+    " n.  n-  n'  n(  n)  n  n?  n*  N.  N-  N'  N(  N)  N  N?  N* "++
+    " a   u   i   A   E   e  o   e*  k.  k-  k'  k(  k)  k  k?  - "++
+    " -   -   -   k*  -   -  -   -   -   -   -   -   -   -  -   - "++
+    " -   -   -   -   -   -  -   -   w.  w-  w'  w(  w)  w  w?  w* "++
+    " -   -   -   -   -   -  -   -   z.  z-  z'  z(  z)  z  z?  z* "++
+    " Z.  Z-  Z'  Z(  Z)  Z  Z?  Z*  y.  y-  y'  y(  y)  y  y?  y* "++
+    " d.  d-  d'  d(  d)  d  d?  d*  -   -   -   -   -   -  -   - "++
+    " j.  j-  j'  j(  j)  j  j?  j*  g.  g-  g'  g(  g)  g  g?  - "++
+    " -   -   -   g*  -   -  -   -   -   -   -   -   -   -  -   - "++
+    " T.  T-  T'  T(  T)  T  T?  T*  C.  C-  C'  C(  C)  C  C?  C* "++
+    " P.  P-  P'  P(  P)  P  P?  P*  S.  S-  S'  S(  S)  S  S?  S* "++
+    " -   -   -   -   -   -  -   -   f.  f-  f'  f(  f)  f  f?  f*"++
+    " p.  p-  p'  p(  p)  p  p?  p*"
+  allCodes = [0x1200..0x1357]
+
+-- by Prasad 31/5/2013
+transSanskrit :: Transliteration
+transSanskrit = (mkTransliteration "Sanskrit" allTrans allCodes) {invisible_chars = ["a"]} where
+  allTrans = words $
+    "-  n~ m. h. - A A: I I: U U: R. L. - - E: " ++
+    "E+ - O O: O+ k k' g g' n- c c' j j' n* T " ++
+    "T' D D' N t t' d d' n - p p' b b' m y " ++
+    "r - l L - v s* S s h - - - v- a: i " ++
+    "i: u u: r. r.: - e e: e+ - o o: o+ a_ - - " ++
+    "o~  -  -  -  -  - - -  q x G  z  R  R'  f  -  " ++
+    "R.: L.: l. l.: p, p.  N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 "
+  allCodes = [0x0900 .. 0x097f]