gf-core/src/compiler/GF/Text/Lexing.hs

-- | Lexers and unlexers - they work on space-separated word strings
module GF.Text.Lexing (stringOp,opInEnv) where

import GF.Text.Transliterations

import Data.Char (isSpace,toUpper,toLower)
import Data.List (intersperse)

stringOp :: (String -> Bool) -> String -> Maybe (String -> String)
stringOp good name = case name of
  "chars"      -> Just $ appLexer (filter (not . all isSpace) . map return)
  "lextext"    -> Just $ appLexer (lexText good)
  "lexcode"    -> Just $ appLexer lexCode
  "lexmixed"   -> Just $ appLexer (lexMixed good)
  "lexgreek"   -> Just $ appLexer lexAGreek
  "lexgreek2"  -> Just $ appLexer lexAGreek2
  "words"      -> Just $ appLexer words
  "bind"       -> Just $ appUnlexer (unwords . bindTok)
  "unchars"    -> Just $ appUnlexer concat
  "unlextext"  -> Just $ appUnlexer (unlexText . unquote . bindTok)
  "unlexcode"  -> Just $ appUnlexer unlexCode
  "unlexmixed" -> Just $ appUnlexer (unlexMixed good . unquote . bindTok)
  "unlexgreek" -> Just $ appUnlexer unlexAGreek
  "unlexnone"  -> Just id
  "unlexid"    -> Just id
  "unwords"    -> Just $ appUnlexer unwords
  "to_html"    -> Just wrapHTML
  _            -> transliterate name

-- perform op in environments beg--end, t.ex. between "--"
--- suboptimal implementation
opInEnv :: String -> String -> (String -> String) -> (String -> String)
opInEnv beg end op = concat . altern False . chop (lbeg, beg) [] where
  chop mk@(lg, mark) s0 s =
    let (tag,rest) = splitAt lg s in
    if tag==mark then (reverse s0) : mark : chop (switch mk) [] rest
      else case s of
        c:cs -> chop mk (c:s0) cs
        [] -> [reverse s0]
  switch (lg,mark) = if mark==beg then (lend,end) else (lbeg,beg)
  (lbeg,lend) = (length beg, length end)
  altern m ts = case ts of
    t:ws | not m && t==beg -> t : altern True ws
    t:ws | m     && t==end -> t : altern False ws
    t:ws -> (if m then op t else t) : altern m ws
    [] -> []

appLexer :: (String -> [String]) -> String -> String
appLexer f = unwords . filter (not . null) . f

appUnlexer :: ([String] -> String) -> String -> String
----appUnlexer f = unlines . map (f . words) . lines
appUnlexer f = f . words

wrapHTML :: String -> String
wrapHTML = unlines . tag . intersperse "<br>" . lines where
  tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"]


-- * Text lexing
-- | Text lexing with standard word capitalization of the first word of every sentence
lexText :: (String -> Bool) -> String -> [String]
lexText good = lexText' (uncapitInit good)

-- | Text lexing with custom treatment of the first word of every sentence.
lexText' :: (String->String) -> String -> [String]
lexText' uncap1 = uncap . lext where
  lext s = case s of
    c:cs | isMajorPunct c -> [c] : uncap (lext cs)
    c:cs | isMinorPunct c -> [c] : lext cs
    c:cs | isSpace c ->       lext cs
    _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
    _ -> [s]
  uncap s = case s of
    w:ws -> uncap1 w:ws
    _ -> s

unlexText :: [String] -> String
unlexText = capitInit . unlext where
  unlext s = case s of
    w:[] -> w
    w:[c]:[] | isPunct c -> w ++ [c]
    w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
    w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
    w:ws -> w ++ " " ++ unlext ws
    _ -> []

-- | Bind tokens separated by Prelude.BIND, i.e. &+
bindTok :: [String] -> [String]
bindTok ws = case ws of
               w1:"&+":w2:ws -> bindTok ((w1++w2):ws)
               "&+":ws       -> bindTok ws
               "&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws)
               "&|":ws       -> bindTok ws
               w:ws          -> w:bindTok ws
               []            -> []

-- * Code lexing

-- | Haskell lexer, usable for much code
lexCode :: String -> [String]
lexCode ss = case lex ss of
  [(w@(_:_),ws)] -> w : lexCode ws
  _ -> []


-- * Ancient Greek lexing

lexTextAGreek :: String -> [String]
lexTextAGreek s = lext s where
  lext s = case s of
    c:cs | isAGreekPunct c -> [c] : (lext cs)
    c:cs | isSpace c -> lext cs
    _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s
           in w : lext cs
    [] -> []

-- Philological greek text may use vowel length indicators. Then '.' is not a sentence
-- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c.

lexTextAGreek2 :: String -> [String]
lexTextAGreek2 s = lext s where
  lext s = case s of
    c:cs | isAGreekPunct c -> [c] : (lext cs)
    c:cs | isSpace c -> lext cs
    _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s
           in case cs of
                '.':'.':d:ds | isSpace d
                  -> (w++['.']) : lext ('.':d:ds)
                '.':d:ds | isAGreekPunct d || isSpace d
                  -> (w++['.']) : lext (d:ds)
                '.':d:ds | not (isSpace d)
                  -> case lext (d:ds) of
                       e:es -> (w++['.']++e) : es
                       es -> (w++['.']) : es
                '.':[] -> (w++['.']) : []
                _ -> w : lext cs
    [] -> []

unlexTextAGreek :: [String] -> String
unlexTextAGreek = unlext where
  unlext s = case s of
    w:[] -> w
    w:[c]:[] | isAGreekPunct c -> w ++ [c]
    w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs
    w:ws -> w ++ " " ++ unlext ws
    [] -> []

isAGreekPunct = flip elem ".,;··"  -- colon: first version · not in charset,
                                   -- second version · = 00B7 standard code point

-- * Text lexing and unlexing for Ancient Greek:
--   1. no capitalization of initial word,
--   2. grave/acute accent switch on final syllables of words not followed by punctuation,
--   3. accent move from/to support word to/from following clitic words (iterated).

lexAGreek :: String -> [String]
lexAGreek = fromAGreek . lexTextAGreek

lexAGreek2 :: String -> [String]
lexAGreek2 = fromAGreek . lexTextAGreek2

unlexAGreek :: [String] -> String
unlexAGreek = unlexTextAGreek . toAGreek

-- Note: unlexAGreek does not glue punctuation with the previous word, so that short
-- vowel indication (like a.) differs from sentence end (a .).

-- | normalize = change grave accent on sentence internal words to acute,
-- and shift inherited acutes to the following enclitic (where they are
-- visible only as shown in the list of enclitics above)

normalize :: String -> String
normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek)

fromAGreek :: [String] -> [String]
fromAGreek s = case s of
  w:[]:vs -> w:[]:(fromAGreek vs)
  w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs)
  w:v:vs | wasEnclitic v && wasEnclitic w ->
    getEnclitic w : fromAGreek (v:vs)
  w:v:vs | wasEnclitic v && wasProclitic w ->  -- "ei)' tines*"
    getProclitic w : getEnclitic v : fromAGreek vs
  w:v:vs | wasEnclitic v && (hasEndCircum w ||
    (hasEndAcute w && hasSingleAccent w)) ->
    w : getEnclitic v : fromAGreek vs        -- ok "sofoi' tines*"
  w:v:vs | wasEnclitic v && hasPrefinalAcute w ->
    w : getEnclitic v : fromAGreek vs
  w:v:vs | wasEnclitic v && hasEndAcute w ->  -- ok "a)'nvrwpoi' tines*"
    dropLastAccent w : getEnclitic v : fromAGreek vs
  w:v:vs | wasEnclitic w ->
    getEnclitic w : fromAGreek (v:vs)
  w:ws -> (toAcute w) : (fromAGreek ws)
  ws -> ws

-- | de-normalize = change acute accent of end syllables in sentence internal
--  (non-enclitic) words to grave accent, and move accents of enclitics to the
--  previous word to produce ordinary ancient greek

denormalize :: String -> String
denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek)

toAGreek :: [String] -> [String]
toAGreek s = case s of
  w:[]:vs -> w:[]:(toAGreek vs)
  w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs)  -- w:[] for following -to_ancientgreek
  w:v:vs | isEnclitic v && isEnclitic w ->
    addAcute w : toAGreek (dropAccent v:vs)    -- BR 11 Anm.2
  w:v:vs | isEnclitic v && isProclitic w ->    -- BR 11 a.beta
    addAcute w: (toAGreek (dropAccent v:vs))
  w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) ->
    w:(toAGreek (dropAccent v:vs))             -- BR 11 a.alpha,beta
  w:v:vs | isEnclitic v && hasPrefinalAcute w ->
    w:v: toAGreek vs   -- bisyllabic v keeps its accent  BR 11 b.
  w:v:vs | isEnclitic v ->
    (addAcute w):(toAGreek (dropAccent v:vs))  -- BR 11 a.gamma
  w:v:vs | isEnclitic w -> w:(toAGreek (v:vs))
  w:ws -> (toGrave w) : (toAGreek ws)
  ws -> ws

-- | Change accent on the final syllable of a word

toGrave :: String -> String
toGrave = reverse . grave . reverse where
  grave s = case s of
    '\'':cs -> '`':cs
    c:cs | isAGreekVowel c -> c:cs
    c:cs -> c: grave cs
    _ -> s

toAcute :: String -> String
toAcute = reverse . acute . reverse where
  acute s = case s of
    '`':cs -> '\'':cs
    c:cs | isAGreekVowel c -> c:cs
    c:cs -> c: acute cs
    _ -> s

isAGreekVowel = flip elem "aeioyhw"

-- | Accent moves for enclitics and proclitics (atona)

enclitics = [
  "moy","moi","me",     -- personal pronouns
  "soy","soi","se",
  "oy(","oi(","e(",
  "tis*","ti","tina'",  -- indefinite pronoun
  "tino's*","tini'",
  "tine's*","tina's*",
  "tinw~n","tisi'","tisi'n",
  "poy","poi",          -- indefinite adverbs
  "pove'n","pws*",
  "ph|","pote'",
  "ge","te","toi",      -- particles
  "nyn","per","pw"
   -- suffix -"de"
   -- praes.indik. of fhmi', ei)mi' (except fh's*, ei)~)
  ] -- and more, BR 11

proclitics = [
  "o(","h(","oi(","ai(",     -- articles
  "e)n","ei)s*","e)x","e)k", -- prepositions
  "ei)","w(s*",              -- conjunctions
  "oy)","oy)k","oy)c"        -- negation
  ]

isEnclitic = flip elem enclitics
isProclitic = flip elem proclitics

-- Check if a word is an enclitic or accented enclitic and extract the enclitic

wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics)
                    ++ (map dropAccent (filter hasAccent enclitics))
                  accented = (filter hasAccent enclitics)
                    ++ map addAcute (filter (not . hasAccent) enclitics)
              in flip elem (accented ++ unaccented)

wasProclitic = flip elem (map addAcute proclitics)

getEnclitic =
  let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics))
                   ++ (map addAcute (filter (not . hasAccent) enclitics)))
                  (enclitics ++ (filter hasAccent enclitics)
                   ++ (filter (not . hasAccent) enclitics))
      find = \v -> lookup v pairs
  in \v -> case (find v) of
    Just x -> x
    _ -> v
getProclitic =
  let pairs = zip (map addAcute proclitics) proclitics
      find = \v -> lookup v pairs
  in \v -> case (find v) of
    Just x -> x
    _ -> v

-- | Accent manipulation

dropAccent = reverse . drop . reverse where
  drop s = case s of
    [] -> []
    '\'':cs -> cs
    '`':cs -> cs
    '~':cs -> cs
    c:cs -> c:drop cs

dropLastAccent = reverse . drop . reverse where
  drop s = case s of
    [] -> []
    '\'':cs -> cs
    '`':cs -> cs
    '~':cs -> cs
    c:cs -> c:drop cs

addAcute :: String -> String
addAcute = reverse . acu . reverse where
  acu w = case w of
    c:cs | c == '\'' -> c:cs
    c:cs | c == '(' -> '\'':c:cs
    c:cs | c == ')' -> '\'':c:cs
    c:cs | isAGreekVowel c -> '\'':c:cs
    c:cs -> c : acu cs
    _ -> w

-- | Accent checking on end syllables

hasEndAcute = find . reverse where
  find s = case s of
    [] -> False
    '\'':cs -> True
    '`':cs -> False
    '~':cs -> False
    c:cs | isAGreekVowel c -> False
    _:cs -> find cs

hasEndCircum = find . reverse where
  find s = case s of
    [] -> False
    '\'':cs -> False
    '`':cs -> False
    '~':cs -> True
    c:cs | isAGreekVowel c -> False
    _:cs -> find cs

hasPrefinalAcute = find . reverse where
  find s = case s of
    [] -> False
    '\'':cs -> False  -- final acute
    '`':cs -> False
    '~':cs -> False
    c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs
    c:cs | isAGreekVowel c -> findNext cs
    _:cs -> find cs where
  findNext s = case s of
    [] -> False
    '\'':cs -> True  -- prefinal acute
    '`':cs -> False
    '~':cs -> False
    c:cs | isAGreekVowel c -> False
    _:cs -> findNext cs where

hasSingleAccent v =
  hasAccent v && not (hasAccent (dropLastAccent v))

hasAccent v = case v of
  [] -> False
  c:cs -> elem c ['\'','`','~'] || hasAccent cs

{- Tests:

-- denormalization. Examples in BR 11 work:
-}
enclitics_expls = -- normalized
  "sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n":  -- a.alpha
  "sofo's tis*":"sofoi' tine's*":                 -- a.beta
  "ei) tis*":"ei) tine's*":
  "a)'nvrwpos* tis*":"a)'nvrwpoi tine's*":        -- a.gamma
  "doy~los* tis*":"doy~loi tine's*":
  "lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n":  -- b.
  "ei) poy tis* tina' i)'doi":                    -- Anm. 2.
  []


unlexCode :: [String] -> String
unlexCode s = case s of
  w:[] -> w
  [c]:cs | isParen c -> [c] ++ unlexCode cs
  w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
  w:ws -> w ++ " " ++ unlexCode ws
  _ -> []


-- | LaTeX lexer in the math mode: \ should not be separated from the next word

lexLatexCode :: String -> [String]
lexLatexCode = restoreBackslash . lexCode where --- quick hack: postprocess Haskell's lex
  restoreBackslash ws = case ws of
    "\\":w:ww -> ("\\" ++ w) : restoreBackslash ww
    w:ww -> w:restoreBackslash ww
    _ -> ws

-- * Mixed lexing

-- | LaTeX style lexer, with "math" environment using Code between $...$
lexMixed :: (String -> Bool) -> String -> [String]
lexMixed good = concat . alternate False [] where
  alternate env t s = case s of
    '$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs
    '\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs
    c:cs -> alternate env (c:t) cs
    _ -> [lex env (reverse t)]
  lex env = if env then lexLatexCode else lexText good

unlexMixed :: (String -> Bool) -> [String] -> String
unlexMixed good = capitInit . concat . alternate False where
  alternate env s = case s of
    _:_ -> case break (flip elem ["$","\\[","\\]","\\(","\\)"]) s of
      (t,[])  -> unlex env t : []
      (t,c:m) -> unlex env t : sep env c m : alternate (not env) m
    _ -> []
  unlex env = if env then unlexCode else (uncapitInit good . unlexText)
  sep env c m = case (m,env) of
    ([p]:_,True) | isPunct p -> c   -- closing $ glued to next punct
    (_,  True) -> c ++ " "   -- closing $ otherwise separated by space from what follows
    _ -> " " ++ c   -- put space before opening $

-- * Additional lexing uitilties

-- | Capitalize first letter
capitInit s = case s of
  c:cs -> toUpper c : cs
  _ -> s

-- | Uncapitalize first letter
uncapitInit good s =
  case s of
    c:cs | not (good s) -> toLower c : cs
    _                   -> s

-- | Unquote each string wrapped in double quotes
unquote = map unq where
  unq s = case s of
    '"':cs@(_:_) | last cs == '"' -> init cs
    _ -> s

isPunct = flip elem ".?!,:;"
isMajorPunct = flip elem ".?!"
isMinorPunct = flip elem ",:;"
isParen = flip elem "()[]{}"
isClosing = flip elem ")]}"