eliminate modules PGF.Lexing, PGF.LexingAGreek. Make PGF.Utilities an internal module in the runtime. These are not really part of the core runtime.

2026-06-17 05:50:12 -06:00 · 2017-09-04 11:43:37 +02:00
parent cae52bb9af
commit 1f908fa7bf
12 changed files with 453 additions and 561 deletions
--- a/gf.cabal
+++ b/gf.cabal
@@ -109,9 +109,6 @@ Library
  exposed-modules: 
    PGF
    PGF.Internal
    PGF.Lexing
    PGF.LexingAGreek
    PGF.Utilities
    PGF.Haskell
  other-modules:
@@ -137,6 +134,7 @@ Library
    PGF.VisualizeTree
    PGF.ByteCode
    PGF.OldBinary
    PGF.Utilities
  if flag(c-runtime)
    exposed-modules: PGF2
--- a/src/compiler/GF/Command/CommonCommands.hs
+++ b/src/compiler/GF/Command/CommonCommands.hs
@@ -219,7 +219,7 @@ commonCommands = fmap (mapCommandExec liftSIO) $ Map.fromList [
 stringOps menv opts s = foldr (menvop . app) s (reverse opts)
  where
-    app f = maybe id id (stringOp f)
+    app f = maybe id id (stringOp (const False) f)
    menvop op = maybe op (\ (b,e) -> opInEnv b e op) menv
 envFlag fs =
--- a/src/compiler/GF/Data/Utilities.hs
+++ b/src/compiler/GF/Data/Utilities.hs
@@ -12,12 +12,12 @@
 -----------------------------------------------------------------------------
-module GF.Data.Utilities(module GF.Data.Utilities, module PGF.Utilities) where
+module GF.Data.Utilities(module GF.Data.Utilities) where
 import Data.Maybe
 import Data.List
 import Control.Monad (MonadPlus(..),liftM,when)
-import PGF.Utilities
+import qualified Data.Set as Set
 -- * functions on lists
@@ -190,3 +190,18 @@ joinS glue = concatS . intersperse (showString glue)
 -- | Like 'Data.List.nub', but O(n log n) instead of O(n^2), since it uses a set to lookup previous things.
 --   The result list is stable (the elements are returned in the order they occur), and lazy.
 --   Requires that the list elements can be compared by Ord.
 --   Code ruthlessly taken from <http://hpaste.org/54411>
 nub' :: Ord a => [a] -> [a]
 nub' = loop Set.empty
    where loop _    []            = []
          loop seen (x : xs)
              | Set.member x seen = loop seen xs
              | otherwise         = x : loop (Set.insert x seen) xs
 -- | Replace all occurences of an element by another element.
 replace :: Eq a => a -> a -> [a] -> [a]
 replace x y = map (\z -> if z == x then y else z)
--- a/src/compiler/GF/Text/Lexing.hs
+++ b/src/compiler/GF/Text/Lexing.hs
@@ -2,30 +2,30 @@
 module GF.Text.Lexing (stringOp,opInEnv) where
 import GF.Text.Transliterations
 import PGF.Lexing
 import PGF.LexingAGreek(lexAGreek,unlexAGreek,lexAGreek2) -- HL 20.2.2016
-import Data.Char (isSpace)
+import Data.Char (isSpace,toUpper,toLower)
 import Data.List (intersperse)
-stringOp :: String -> Maybe (String -> String)
+stringOp :: (String -> Bool) -> String -> Maybe (String -> String)
-stringOp name = case name of
+stringOp good name = case name of
  "chars"      -> Just $ appLexer (filter (not . all isSpace) . map return)
-  "lextext"    -> Just $ appLexer lexText
+  "lextext"    -> Just $ appLexer (lexText good)
  "lexcode"    -> Just $ appLexer lexCode
-  "lexmixed"   -> Just $ appLexer lexMixed
+  "lexmixed"   -> Just $ appLexer (lexMixed good)
  "lexgreek"   -> Just $ appLexer lexAGreek
  "lexgreek2"  -> Just $ appLexer lexAGreek2
  "words"      -> Just $ appLexer words
  "bind"       -> Just $ appUnlexer (unwords . bindTok)
  "unchars"    -> Just $ appUnlexer concat
-  "unlextext"  -> Just $ appUnlexer (unlexText . unquote)
+  "unlextext"  -> Just $ appUnlexer (unlexText . unquote . bindTok)
  "unlexcode"  -> Just $ appUnlexer unlexCode
-  "unlexmixed" -> Just $ appUnlexer (unlexMixed . unquote)
+  "unlexmixed" -> Just $ appUnlexer (unlexMixed good . unquote . bindTok)
  "unlexgreek" -> Just $ appUnlexer unlexAGreek
  "unlexnone"  -> Just id
  "unlexid"    -> Just id
  "unwords"    -> Just $ appUnlexer unwords
  "to_html"    -> Just wrapHTML
-  _ -> transliterate name
+  _            -> transliterate name
 -- perform op in environments beg--end, t.ex. between "--"
 --- suboptimal implementation
@@ -55,3 +55,395 @@ appUnlexer f = f . words
 wrapHTML :: String -> String
 wrapHTML = unlines . tag . intersperse "<br>" . lines where
  tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"]
 -- * Text lexing
 -- | Text lexing with standard word capitalization of the first word of every sentence
 lexText :: (String -> Bool) -> String -> [String]
 lexText good = lexText' (uncapitInit good)
 -- | Text lexing with custom treatment of the first word of every sentence.
 lexText' :: (String->String) -> String -> [String]
 lexText' uncap1 = uncap . lext where
  lext s = case s of
    c:cs | isMajorPunct c -> [c] : uncap (lext cs)
    c:cs | isMinorPunct c -> [c] : lext cs
    c:cs | isSpace c ->       lext cs
    _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
    _ -> [s]
  uncap s = case s of
    w:ws -> uncap1 w:ws
    _ -> s
 unlexText :: [String] -> String
 unlexText = capitInit . unlext where
  unlext s = case s of
    w:[] -> w
    w:[c]:[] | isPunct c -> w ++ [c]
    w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
    w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
    w:ws -> w ++ " " ++ unlext ws
    _ -> []
 -- | Bind tokens separated by Prelude.BIND, i.e. &+
 bindTok :: [String] -> [String]
 bindTok ws = case ws of
               w1:"&+":w2:ws -> bindTok ((w1++w2):ws)
               "&+":ws       -> bindTok ws
               "&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws)
               "&|":ws       -> bindTok ws
               w:ws          -> w:bindTok ws
               []            -> []
 -- * Code lexing
 -- | Haskell lexer, usable for much code
 lexCode :: String -> [String]
 lexCode ss = case lex ss of
  [(w@(_:_),ws)] -> w : lexCode ws
  _ -> []
 -- * Ancient Greek lexing
 lexTextAGreek :: String -> [String]
 lexTextAGreek s = lext s where
  lext s = case s of
    c:cs | isAGreekPunct c -> [c] : (lext cs)
    c:cs | isSpace c -> lext cs
    _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s 
           in w : lext cs
    [] -> []
 -- Philological greek text may use vowel length indicators. Then '.' is not a sentence 
 -- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c.
 lexTextAGreek2 :: String -> [String]
 lexTextAGreek2 s = lext s where
  lext s = case s of
    c:cs | isAGreekPunct c -> [c] : (lext cs)
    c:cs | isSpace c -> lext cs
    _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s 
           in case cs of 
                '.':'.':d:ds | isSpace d 
                  -> (w++['.']) : lext ('.':d:ds)
                '.':d:ds | isAGreekPunct d || isSpace d 
                  -> (w++['.']) : lext (d:ds)
                '.':d:ds | not (isSpace d) 
                  -> case lext (d:ds) of
                       e:es -> (w++['.']++e) : es
                       es -> (w++['.']) : es 
                '.':[] -> (w++['.']) : []
                _ -> w : lext cs      
    [] -> []
 unlexTextAGreek :: [String] -> String
 unlexTextAGreek = unlext where
  unlext s = case s of
    w:[] -> w
    w:[c]:[] | isAGreekPunct c -> w ++ [c]
    w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs
    w:ws -> w ++ " " ++ unlext ws
    [] -> []
 isAGreekPunct = flip elem ".,;··"  -- colon: first version · not in charset,
                                   -- second version · = 00B7 standard code point
 -- * Text lexing and unlexing for Ancient Greek: 
 --   1. no capitalization of initial word, 
 --   2. grave/acute accent switch on final syllables of words not followed by punctuation, 
 --   3. accent move from/to support word to/from following clitic words (iterated).
 lexAGreek :: String -> [String]
 lexAGreek = fromAGreek . lexTextAGreek
 lexAGreek2 :: String -> [String]
 lexAGreek2 = fromAGreek . lexTextAGreek2
 unlexAGreek :: [String] -> String
 unlexAGreek = unlexTextAGreek . toAGreek
 -- Note: unlexAGreek does not glue punctuation with the previous word, so that short
 -- vowel indication (like a.) differs from sentence end (a .). 
 -- | normalize = change grave accent on sentence internal words to acute, 
 -- and shift inherited acutes to the following enclitic (where they are
 -- visible only as shown in the list of enclitics above)
 normalize :: String -> String
 normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek) 
 fromAGreek :: [String] -> [String]
 fromAGreek s = case s of
  w:[]:vs -> w:[]:(fromAGreek vs)
  w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs)
  w:v:vs | wasEnclitic v && wasEnclitic w ->
    getEnclitic w : fromAGreek (v:vs)
  w:v:vs | wasEnclitic v && wasProclitic w ->  -- "ei)' tines*"
    getProclitic w : getEnclitic v : fromAGreek vs
  w:v:vs | wasEnclitic v && (hasEndCircum w || 
    (hasEndAcute w && hasSingleAccent w)) ->  
    w : getEnclitic v : fromAGreek vs        -- ok "sofoi' tines*"
  w:v:vs | wasEnclitic v && hasPrefinalAcute w ->
    w : getEnclitic v : fromAGreek vs
  w:v:vs | wasEnclitic v && hasEndAcute w ->  -- ok "a)'nvrwpoi' tines*"
    dropLastAccent w : getEnclitic v : fromAGreek vs
  w:v:vs | wasEnclitic w ->
    getEnclitic w : fromAGreek (v:vs)
  w:ws -> (toAcute w) : (fromAGreek ws)
  ws -> ws 
 -- | de-normalize = change acute accent of end syllables in sentence internal 
 --  (non-enclitic) words to grave accent, and move accents of enclitics to the 
 --  previous word to produce ordinary ancient greek
 denormalize :: String -> String
 denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek) 
 toAGreek :: [String] -> [String]
 toAGreek s = case s of
  w:[]:vs -> w:[]:(toAGreek vs)
  w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs)  -- w:[] for following -to_ancientgreek
  w:v:vs | isEnclitic v && isEnclitic w -> 
    addAcute w : toAGreek (dropAccent v:vs)    -- BR 11 Anm.2
  w:v:vs | isEnclitic v && isProclitic w ->    -- BR 11 a.beta
    addAcute w: (toAGreek (dropAccent v:vs))
  w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) -> 
    w:(toAGreek (dropAccent v:vs))             -- BR 11 a.alpha,beta
  w:v:vs | isEnclitic v && hasPrefinalAcute w -> 
    w:v: toAGreek vs   -- bisyllabic v keeps its accent  BR 11 b.
  w:v:vs | isEnclitic v -> 
    (addAcute w):(toAGreek (dropAccent v:vs))  -- BR 11 a.gamma
  w:v:vs | isEnclitic w -> w:(toAGreek (v:vs))
  w:ws -> (toGrave w) : (toAGreek ws)
  ws -> ws 
 -- | Change accent on the final syllable of a word
 toGrave :: String -> String 
 toGrave = reverse . grave . reverse where
  grave s = case s of
    '\'':cs -> '`':cs 
    c:cs | isAGreekVowel c -> c:cs 
    c:cs -> c: grave cs
    _ -> s
 toAcute :: String -> String 
 toAcute = reverse . acute . reverse where
  acute s = case s of
    '`':cs -> '\'':cs
    c:cs | isAGreekVowel c -> c:cs 
    c:cs -> c: acute cs
    _ -> s
 isAGreekVowel = flip elem "aeioyhw"
 -- | Accent moves for enclitics and proclitics (atona)
 enclitics = [
  "moy","moi","me",     -- personal pronouns
  "soy","soi","se",
  "oy(","oi(","e(",
  "tis*","ti","tina'",  -- indefinite pronoun 
  "tino's*","tini'",
  "tine's*","tina's*",
  "tinw~n","tisi'","tisi'n",
  "poy","poi",          -- indefinite adverbs
  "pove'n","pws*",
  "ph|","pote'",
  "ge","te","toi",      -- particles
  "nyn","per","pw" 
   -- suffix -"de"
   -- praes.indik. of fhmi', ei)mi' (except fh's*, ei)~)
  ] -- and more, BR 11
 proclitics = [
  "o(","h(","oi(","ai(",     -- articles
  "e)n","ei)s*","e)x","e)k", -- prepositions
  "ei)","w(s*",              -- conjunctions
  "oy)","oy)k","oy)c"        -- negation
  ]
 isEnclitic = flip elem enclitics
 isProclitic = flip elem proclitics
 -- Check if a word is an enclitic or accented enclitic and extract the enclitic
 wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics) 
                    ++ (map dropAccent (filter hasAccent enclitics))
                  accented = (filter hasAccent enclitics) 
                    ++ map addAcute (filter (not . hasAccent) enclitics) 
              in flip elem (accented ++ unaccented)
 wasProclitic = flip elem (map addAcute proclitics)
 getEnclitic = 
  let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics))
                   ++ (map addAcute (filter (not . hasAccent) enclitics)))
                  (enclitics ++ (filter hasAccent enclitics)
                   ++ (filter (not . hasAccent) enclitics))
      find = \v -> lookup v pairs
  in \v -> case (find v) of 
    Just x -> x 
    _ -> v
 getProclitic = 
  let pairs = zip (map addAcute proclitics) proclitics 
      find = \v -> lookup v pairs
  in \v -> case (find v) of 
    Just x -> x 
    _ -> v
 -- | Accent manipulation
 dropAccent = reverse . drop . reverse where 
  drop s = case s of
    [] -> []
    '\'':cs -> cs
    '`':cs -> cs
    '~':cs -> cs
    c:cs -> c:drop cs
 dropLastAccent = reverse . drop . reverse where 
  drop s = case s of
    [] -> []
    '\'':cs -> cs
    '`':cs -> cs
    '~':cs -> cs
    c:cs -> c:drop cs
 addAcute :: String -> String
 addAcute = reverse . acu . reverse where
  acu w = case w of 
    c:cs | c == '\'' -> c:cs
    c:cs | c == '(' -> '\'':c:cs
    c:cs | c == ')' -> '\'':c:cs
    c:cs | isAGreekVowel c -> '\'':c:cs
    c:cs -> c : acu cs
    _ -> w
 -- | Accent checking on end syllables
 hasEndAcute = find . reverse where
  find s = case s of
    [] -> False
    '\'':cs -> True
    '`':cs -> False
    '~':cs -> False
    c:cs | isAGreekVowel c -> False
    _:cs -> find cs
 hasEndCircum = find . reverse where
  find s = case s of
    [] -> False
    '\'':cs -> False
    '`':cs -> False
    '~':cs -> True
    c:cs | isAGreekVowel c -> False
    _:cs -> find cs
 hasPrefinalAcute = find . reverse where
  find s = case s of
    [] -> False
    '\'':cs -> False  -- final acute
    '`':cs -> False
    '~':cs -> False
    c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs
    c:cs | isAGreekVowel c -> findNext cs
    _:cs -> find cs where
  findNext s = case s of 
    [] -> False
    '\'':cs -> True  -- prefinal acute
    '`':cs -> False
    '~':cs -> False
    c:cs | isAGreekVowel c -> False
    _:cs -> findNext cs where
 hasSingleAccent v = 
  hasAccent v && not (hasAccent (dropLastAccent v))
 hasAccent v = case v of 
  [] -> False
  c:cs -> elem c ['\'','`','~'] || hasAccent cs
 {- Tests: 
 -- denormalization. Examples in BR 11 work: 
 -}
 enclitics_expls = -- normalized
  "sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n":  -- a.alpha
  "sofo's tis*":"sofoi' tine's*":                 -- a.beta
  "ei) tis*":"ei) tine's*":
  "a)'nvrwpos* tis*":"a)'nvrwpoi tine's*":        -- a.gamma
  "doy~los* tis*":"doy~loi tine's*":
  "lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n":  -- b.
  "ei) poy tis* tina' i)'doi":                    -- Anm. 2.
  [] 
 unlexCode :: [String] -> String
 unlexCode s = case s of
  w:[] -> w
  [c]:cs | isParen c -> [c] ++ unlexCode cs
  w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
  w:ws -> w ++ " " ++ unlexCode ws
  _ -> []
 -- | LaTeX lexer in the math mode: \ should not be separated from the next word
 lexLatexCode :: String -> [String]
 lexLatexCode = restoreBackslash . lexCode where --- quick hack: postprocess Haskell's lex
  restoreBackslash ws = case ws of
    "\\":w:ww -> ("\\" ++ w) : restoreBackslash ww
    w:ww -> w:restoreBackslash ww
    _ -> ws
 -- * Mixed lexing
 -- | LaTeX style lexer, with "math" environment using Code between $...$
 lexMixed :: (String -> Bool) -> String -> [String]
 lexMixed good = concat . alternate False [] where
  alternate env t s = case s of
    '$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs
    '\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs
    c:cs -> alternate env (c:t) cs
    _ -> [lex env (reverse t)]
  lex env = if env then lexLatexCode else lexText good
 unlexMixed :: (String -> Bool) -> [String] -> String
 unlexMixed good = capitInit . concat . alternate False where
  alternate env s = case s of
    _:_ -> case break (flip elem ["$","\\[","\\]","\\(","\\)"]) s of
      (t,[])  -> unlex env t : []
      (t,c:m) -> unlex env t : sep env c m : alternate (not env) m
    _ -> []
  unlex env = if env then unlexCode else (uncapitInit good . unlexText)
  sep env c m = case (m,env) of
    ([p]:_,True) | isPunct p -> c   -- closing $ glued to next punct 
    (_,  True) -> c ++ " "   -- closing $ otherwise separated by space from what follows
    _ -> " " ++ c   -- put space before opening $
 -- * Additional lexing uitilties
 -- | Capitalize first letter
 capitInit s = case s of
  c:cs -> toUpper c : cs
  _ -> s
 -- | Uncapitalize first letter
 uncapitInit good s = 
  case s of
    c:cs | not (good s) -> toLower c : cs
    _                   -> s
 -- | Unquote each string wrapped in double quotes
 unquote = map unq where 
  unq s = case s of
    '"':cs@(_:_) | last cs == '"' -> init cs
    _ -> s
 isPunct = flip elem ".?!,:;"
 isMajorPunct = flip elem ".?!"
 isMinorPunct = flip elem ",:;"
 isParen = flip elem "()[]{}"
 isClosing = flip elem ")]}"
--- a/src/runtime/haskell-bind/PGF.hs
+++ b/src/runtime/haskell-bind/PGF.hs
@@ -0,0 +1,3 @@
 module PGF(module PGF2) where
 import PGF2
--- a/src/runtime/haskell-bind/PGF/Internal.hs
+++ b/src/runtime/haskell-bind/PGF/Internal.hs
@@ -0,0 +1 @@
 module PGF.Internal where
--- a/src/runtime/haskell-bind/examples/pgf-hsbind-trans.hs
+++ b/src/runtime/haskell-bind/examples/pgf-hsbind-trans.hs
@@ -1,61 +0,0 @@
 -- | pgf-hsbind-trans: A simple batch translator to illustrate the use of the Haskell binding
 --   to the C implementation of the PGF run-time system.
 --
 -- AR April 2015 modified from pgf-shell
 import PGF2
 import PGF.Lexing (lexText')
 import Data.Char(isSpace,toLower)
 import Data.List (nub)
 import System.Environment
 import qualified Data.Map as Map
 maxNumTrees :: Int
 maxNumTrees = 1
 maxNumVariants :: Int
 maxNumVariants = 1
 main = getPGF =<< getArgs
 getPGF args = case args of
  [path,from,to,cat,mxt,mxv] -> pgfTrans from to (Just cat) (read mxt, read mxv) =<< readPGF path
  [path,from,to]        -> pgfTrans from to Nothing (maxNumTrees,maxNumVariants) =<< readPGF path
  _                     -> putStrLn "Usage: pgf-hsbind-trans <path to pgf> <from-lang> <to-lang> [<cat> <#trees> <#variants>]"
 pgfTrans from to mcat mx pgf = do
  cfrom <- getConcr' pgf from
  cto   <- getConcr' pgf to
  let cat = maybe (startCat pgf) id mcat
  interact (unlines . map (translates pgf cfrom cto cat mx) . lines)
 getConcr' pgf lang =
    maybe (fail $ "Concrete syntax not found: "++show lang) return $
    Map.lookup lang (languages pgf)
 linearizeAndShow gr mxv (t,p) = [show t]++take mxv (nub (map unstar (linearizeAll gr t)))++[show p]
  where
    unstar s = case s of
      '*':' ':cs -> cs
      _ -> s
 translates pgf cfrom cto cat (mxt,mxv) s0 = 
  let s1 = lextext cfrom s0
      (s,p) = case reverse s1 of c:_ | elem c ".?!" -> (init s1,[c]) ; _ -> (s1,[]) -- separate final punctuation
  in
  case cparse pgf cfrom cat s of
    Left tok -> unlines [s,"Parse error: "++tok]
    Right ts -> unlines $ (("> "++ s):) $ take mxt $ map ((++p) . unlines . linearizeAndShow cto mxv) ts -- append punctuation
 cparse pgf concr cat input = parseWithHeuristics concr cat input (-1) callbacks where
  callbacks = maybe [] cb $ lookup "App" literalCallbacks
  cb fs = [(cat,f pgf ("TranslateEng",concr) input)|(cat,f)<-fs]
 lextext cnc = unwords . lexText' (\w -> case lookupMorpho cnc w of
  _:_ -> w
  _ -> case lookupMorpho cnc (uncapitInit w) of 
    [] -> w 
    _ -> uncapitInit w
  )
 where uncapitInit (c:cs) = toLower c : cs
--- a/src/runtime/haskell-bind/pgf2.cabal
+++ b/src/runtime/haskell-bind/pgf2.cabal
@@ -14,7 +14,9 @@ extra-source-files:  README
 cabal-version:       >=1.10
 library
-  exposed-modules:     PGF2, SG
+  exposed-modules:     PGF2, SG,
                       -- backwards compatibility API:
                       PGF, PGF.Internal
  other-modules:       PGF2.FFI, PGF2.Expr, PGF2.Type, SG.FFI
  build-depends:       base >=4.3, bytestring >=0.9,
                       containers, pretty
--- a/src/runtime/haskell/PGF/Lexing.hs
+++ b/src/runtime/haskell/PGF/Lexing.hs
@@ -1,115 +0,0 @@
 module PGF.Lexing where
 import Data.Char(isSpace,toLower,toUpper)
 -- * Text lexing
 -- | Text lexing with standard word capitalization of the first word of every sentence
 lexText :: String -> [String]
 lexText = lexText' uncapitInit
 -- | Text lexing with custom treatment of the first word of every sentence.
 lexText' :: (String->String) -> String -> [String]
 lexText' uncap1 = uncap . lext where
  lext s = case s of
    c:cs | isMajorPunct c -> [c] : uncap (lext cs)
    c:cs | isMinorPunct c -> [c] : lext cs
    c:cs | isSpace c ->       lext cs
    _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
    _ -> [s]
  uncap s = case s of
    w:ws -> uncap1 w:ws
    _ -> s
 unlexText :: [String] -> String
 unlexText = capitInit . unlext where
  unlext s = case s of
    w:[] -> w
    w:[c]:[] | isPunct c -> w ++ [c]
    w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
    w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
    w:ws -> w ++ " " ++ unlext ws
    _ -> []
 -- | Bind tokens separated by Prelude.BIND, i.e. &+
 bindTok :: [String] -> [String]
 bindTok ws = case ws of
               w1:"&+":w2:ws -> bindTok ((w1++w2):ws)
               "&+":ws       -> bindTok ws
               "&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws)
               "&|":ws       -> bindTok ws
               w:ws          -> w:bindTok ws
               []            -> []
 -- * Code lexing
 -- | Haskell lexer, usable for much code
 lexCode :: String -> [String]
 lexCode ss = case lex ss of
  [(w@(_:_),ws)] -> w : lexCode ws
  _ -> []
 unlexCode :: [String] -> String
 unlexCode s = case s of
  w:[] -> w
  [c]:cs | isParen c -> [c] ++ unlexCode cs
  w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
  w:ws -> w ++ " " ++ unlexCode ws
  _ -> []
 -- | LaTeX lexer in the math mode: \ should not be separated from the next word
 lexLatexCode :: String -> [String]
 lexLatexCode = restoreBackslash . lexCode where --- quick hack: postprocess Haskell's lex
  restoreBackslash ws = case ws of
    "\\":w:ww -> ("\\" ++ w) : restoreBackslash ww
    w:ww -> w:restoreBackslash ww
    _ -> ws
 -- * Mixed lexing
 -- | LaTeX style lexer, with "math" environment using Code between $...$
 lexMixed :: String -> [String]
 lexMixed = concat . alternate False [] where
  alternate env t s = case s of
    '$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs
    '\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs
    c:cs -> alternate env (c:t) cs
    _ -> [lex env (reverse t)]
  lex env = if env then lexLatexCode else lexText
 unlexMixed :: [String] -> String
 unlexMixed = capitInit . concat . alternate False where
  alternate env s = case s of
    _:_ -> case break (flip elem ["$","\\[","\\]","\\(","\\)"]) s of
      (t,[])  -> unlex env t : []
      (t,c:m) -> unlex env t : sep env c m : alternate (not env) m
    _ -> []
  unlex env = if env then unlexCode else (uncapitInit . unlexText)
  sep env c m = case (m,env) of
    ([p]:_,True) | isPunct p -> c   -- closing $ glued to next punct 
    (_,  True) -> c ++ " "   -- closing $ otherwise separated by space from what follows
    _ -> " " ++ c   -- put space before opening $
 -- * Additional lexing uitilties
 -- | Capitalize first letter
 capitInit s = case s of
  c:cs -> toUpper c : cs
  _ -> s
 -- | Uncapitalize first letter
 uncapitInit s = case s of
  c:cs -> toLower c : cs
  _ -> s
 -- | Unquote each string wrapped in double quotes
 unquote = map unq where 
  unq s = case s of
    '"':cs@(_:_) | last cs == '"' -> init cs
    _ -> s
 isPunct = flip elem ".?!,:;"
 isMajorPunct = flip elem ".?!"
 isMinorPunct = flip elem ",:;"
 isParen = flip elem "()[]{}"
 isClosing = flip elem ")]}"
--- a/src/runtime/haskell/PGF/LexingAGreek.hs
+++ b/src/runtime/haskell/PGF/LexingAGreek.hs
@@ -1,310 +0,0 @@
 module PGF.LexingAGreek where                      -- HL 2a1.2.2016
 import Data.Char(isSpace)
 -- * Text lexing without word capitalization of the first word of every sentence.
 -- Greek sentences in (transliterated) texts don't start with capital character. 
 -- Ordinary greek text does not have vowel length indicators. We then use '.' as
 -- a sentence separator.
 lexTextAGreek :: String -> [String]
 lexTextAGreek s = lext s where
  lext s = case s of
    c:cs | isAGreekPunct c -> [c] : (lext cs)
    c:cs | isSpace c -> lext cs
    _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s 
           in w : lext cs
    [] -> []
 -- Philological greek text may use vowel length indicators. Then '.' is not a sentence 
 -- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c.
 lexTextAGreek2 :: String -> [String]
 lexTextAGreek2 s = lext s where
  lext s = case s of
    c:cs | isAGreekPunct c -> [c] : (lext cs)
    c:cs | isSpace c -> lext cs
    _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s 
           in case cs of 
                '.':'.':d:ds | isSpace d 
                  -> (w++['.']) : lext ('.':d:ds)
                '.':d:ds | isAGreekPunct d || isSpace d 
                  -> (w++['.']) : lext (d:ds)
                '.':d:ds | not (isSpace d) 
                  -> case lext (d:ds) of
                       e:es -> (w++['.']++e) : es
                       es -> (w++['.']) : es 
                '.':[] -> (w++['.']) : []
                _ -> w : lext cs      
    [] -> []
 unlexTextAGreek :: [String] -> String
 unlexTextAGreek = unlext where
  unlext s = case s of
    w:[] -> w
    w:[c]:[] | isAGreekPunct c -> w ++ [c]
    w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs
    w:ws -> w ++ " " ++ unlext ws
    [] -> []
 isAGreekPunct = flip elem ".,;··"  -- colon: first version · not in charset,
                                   -- second version · = 00B7 standard code point
 -- * Text lexing and unlexing for Ancient Greek: 
 --   1. no capitalization of initial word, 
 --   2. grave/acute accent switch on final syllables of words not followed by punctuation, 
 --   3. accent move from/to support word to/from following clitic words (iterated).
 lexAGreek :: String -> [String]
 lexAGreek = fromAGreek . lexTextAGreek
 lexAGreek2 :: String -> [String]
 lexAGreek2 = fromAGreek . lexTextAGreek2
 unlexAGreek :: [String] -> String
 unlexAGreek = unlexTextAGreek . toAGreek
 -- Note: unlexAGreek does not glue punctuation with the previous word, so that short
 -- vowel indication (like a.) differs from sentence end (a .). 
 -- | normalize = change grave accent on sentence internal words to acute, 
 -- and shift inherited acutes to the following enclitic (where they are
 -- visible only as shown in the list of enclitics above)
 normalize :: String -> String
 normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek) 
 fromAGreek :: [String] -> [String]
 fromAGreek s = case s of
  w:[]:vs -> w:[]:(fromAGreek vs)
  w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs)
  w:v:vs | wasEnclitic v && wasEnclitic w ->
    getEnclitic w : fromAGreek (v:vs)
  w:v:vs | wasEnclitic v && wasProclitic w ->  -- "ei)' tines*"
    getProclitic w : getEnclitic v : fromAGreek vs
  w:v:vs | wasEnclitic v && (hasEndCircum w || 
    (hasEndAcute w && hasSingleAccent w)) ->  
    w : getEnclitic v : fromAGreek vs        -- ok "sofoi' tines*"
  w:v:vs | wasEnclitic v && hasPrefinalAcute w ->
    w : getEnclitic v : fromAGreek vs
  w:v:vs | wasEnclitic v && hasEndAcute w ->  -- ok "a)'nvrwpoi' tines*"
    dropLastAccent w : getEnclitic v : fromAGreek vs
  w:v:vs | wasEnclitic w ->
    getEnclitic w : fromAGreek (v:vs)
  w:ws -> (toAcute w) : (fromAGreek ws)
  ws -> ws 
 -- | de-normalize = change acute accent of end syllables in sentence internal 
 --  (non-enclitic) words to grave accent, and move accents of enclitics to the 
 --  previous word to produce ordinary ancient greek
 denormalize :: String -> String
 denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek) 
 toAGreek :: [String] -> [String]
 toAGreek s = case s of
  w:[]:vs -> w:[]:(toAGreek vs)
  w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs)  -- w:[] for following -to_ancientgreek
  w:v:vs | isEnclitic v && isEnclitic w -> 
    addAcute w : toAGreek (dropAccent v:vs)    -- BR 11 Anm.2
  w:v:vs | isEnclitic v && isProclitic w ->    -- BR 11 a.beta
    addAcute w: (toAGreek (dropAccent v:vs))
  w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) -> 
    w:(toAGreek (dropAccent v:vs))             -- BR 11 a.alpha,beta
  w:v:vs | isEnclitic v && hasPrefinalAcute w -> 
    w:v: toAGreek vs   -- bisyllabic v keeps its accent  BR 11 b.
  w:v:vs | isEnclitic v -> 
    (addAcute w):(toAGreek (dropAccent v:vs))  -- BR 11 a.gamma
  w:v:vs | isEnclitic w -> w:(toAGreek (v:vs))
  w:ws -> (toGrave w) : (toAGreek ws)
  ws -> ws 
 -- | Change accent on the final syllable of a word
 toGrave :: String -> String 
 toGrave = reverse . grave . reverse where
  grave s = case s of
    '\'':cs -> '`':cs 
    c:cs | isAGreekVowel c -> c:cs 
    c:cs -> c: grave cs
    _ -> s
 toAcute :: String -> String 
 toAcute = reverse . acute . reverse where
  acute s = case s of
    '`':cs -> '\'':cs
    c:cs | isAGreekVowel c -> c:cs 
    c:cs -> c: acute cs
    _ -> s
 isAGreekVowel = flip elem "aeioyhw"
 -- | Accent moves for enclitics and proclitics (atona)
 enclitics = [
  "moy","moi","me",     -- personal pronouns
  "soy","soi","se",
  "oy(","oi(","e(",
  "tis*","ti","tina'",  -- indefinite pronoun 
  "tino's*","tini'",
  "tine's*","tina's*",
  "tinw~n","tisi'","tisi'n",
  "poy","poi",          -- indefinite adverbs
  "pove'n","pws*",
  "ph|","pote'",
  "ge","te","toi",      -- particles
  "nyn","per","pw" 
   -- suffix -"de"
   -- praes.indik. of fhmi', ei)mi' (except fh's*, ei)~)
  ] -- and more, BR 11
 proclitics = [
  "o(","h(","oi(","ai(",     -- articles
  "e)n","ei)s*","e)x","e)k", -- prepositions
  "ei)","w(s*",              -- conjunctions
  "oy)","oy)k","oy)c"        -- negation
  ]
 isEnclitic = flip elem enclitics
 isProclitic = flip elem proclitics
 -- Check if a word is an enclitic or accented enclitic and extract the enclitic
 wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics) 
                    ++ (map dropAccent (filter hasAccent enclitics))
                  accented = (filter hasAccent enclitics) 
                    ++ map addAcute (filter (not . hasAccent) enclitics) 
              in flip elem (accented ++ unaccented)
 wasProclitic = flip elem (map addAcute proclitics)
 getEnclitic = 
  let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics))
                   ++ (map addAcute (filter (not . hasAccent) enclitics)))
                  (enclitics ++ (filter hasAccent enclitics)
                   ++ (filter (not . hasAccent) enclitics))
      find = \v -> lookup v pairs
  in \v -> case (find v) of 
    Just x -> x 
    _ -> v
 getProclitic = 
  let pairs = zip (map addAcute proclitics) proclitics 
      find = \v -> lookup v pairs
  in \v -> case (find v) of 
    Just x -> x 
    _ -> v
 -- | Accent manipulation
 dropAccent = reverse . drop . reverse where 
  drop s = case s of
    [] -> []
    '\'':cs -> cs
    '`':cs -> cs
    '~':cs -> cs
    c:cs -> c:drop cs
 dropLastAccent = reverse . drop . reverse where 
  drop s = case s of
    [] -> []
    '\'':cs -> cs
    '`':cs -> cs
    '~':cs -> cs
    c:cs -> c:drop cs
 addAcute :: String -> String
 addAcute = reverse . acu . reverse where
  acu w = case w of 
    c:cs | c == '\'' -> c:cs
    c:cs | c == '(' -> '\'':c:cs
    c:cs | c == ')' -> '\'':c:cs
    c:cs | isAGreekVowel c -> '\'':c:cs
    c:cs -> c : acu cs
    _ -> w
 -- | Accent checking on end syllables
 hasEndAcute = find . reverse where
  find s = case s of
    [] -> False
    '\'':cs -> True
    '`':cs -> False
    '~':cs -> False
    c:cs | isAGreekVowel c -> False
    _:cs -> find cs
 hasEndCircum = find . reverse where
  find s = case s of
    [] -> False
    '\'':cs -> False
    '`':cs -> False
    '~':cs -> True
    c:cs | isAGreekVowel c -> False
    _:cs -> find cs
 hasPrefinalAcute = find . reverse where
  find s = case s of
    [] -> False
    '\'':cs -> False  -- final acute
    '`':cs -> False
    '~':cs -> False
    c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs
    c:cs | isAGreekVowel c -> findNext cs
    _:cs -> find cs where
  findNext s = case s of 
    [] -> False
    '\'':cs -> True  -- prefinal acute
    '`':cs -> False
    '~':cs -> False
    c:cs | isAGreekVowel c -> False
    _:cs -> findNext cs where
 hasSingleAccent v = 
  hasAccent v && not (hasAccent (dropLastAccent v))
 hasAccent v = case v of 
  [] -> False
  c:cs -> elem c ['\'','`','~'] || hasAccent cs
 {- Tests: 
 -- denormalization. Examples in BR 11 work: 
 -}
 enclitics_expls = -- normalized
  "sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n":  -- a.alpha
  "sofo's tis*":"sofoi' tine's*":                 -- a.beta
  "ei) tis*":"ei) tine's*":
  "a)'nvrwpos* tis*":"a)'nvrwpoi tine's*":        -- a.gamma
  "doy~los* tis*":"doy~loi tine's*":
  "lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n":  -- b.
  "ei) poy tis* tina' i)'doi":                    -- Anm. 2.
  [] 
 {-
 test = map denormalize enclitics_expls
 *PGF.LexingAGreek> test
  ["sofw~n tis*","sofw~n tines*","sof~n tinwn",
  "sofo's tis*","sofoi' tines*",
  "ei)' tis*","ei)' tines*",
  "a)'nvrwpo's* tis*","a)'nvrwpoi' tines*",
  "doy~lo's* tis*","doy~loi' tines*",
  "lo'gos* tis* ","lo'goi tine's*","lo'gwn tinw~n ",
  "ei)' poy' ti's* tina i)'doi"]
 -- normalization:
 *PGF.LexingAGreek> map normalize test
  ["sofw~n tis*","sofw~n tine's*","sof~n tinw~n",
  "sofo's tis*","sofoi' tine's*",
  "ei) tis*","ei) tine's*",
  "a)'nvrwpos* tis*","a)'nvrwpoi tine's*",
  "doy~los* tis*","doy~loi tine's*",
  "lo'gos* tis*","lo'goi tine's*","lo'gwn tinw~n",
  "ei) poy tis* tina' i)'doi"]
 *PGF.LexingAGreek> map (normalize . denormalize) enclitics_expls == enclitics_expls
 True
 *PGF.LexingAGreek> map (denormalize . normalize) test == test
 True
 -}
--- a/src/runtime/haskell/pgf.cabal
+++ b/src/runtime/haskell/pgf.cabal
@@ -50,9 +50,6 @@ Library
  exposed-modules: 
    PGF
    PGF.Internal
    PGF.Lexing
    PGF.LexingAGreek
    PGF.Utilities
    PGF.Haskell
  other-modules:
@@ -78,3 +75,4 @@ Library
    PGF.VisualizeTree
    PGF.ByteCode
    PGF.OldBinary
    PGF.Utilities
--- a/src/server/PGFService.hs
+++ b/src/server/PGFService.hs
@@ -4,8 +4,8 @@ module PGFService(cgiMain,cgiMain',getPath,
                  Caches,pgfCache,newPGFCache,flushPGFCache,listPGFCache) where
 import PGF (PGF,Labels,CncLabels)
 import GF.Text.Lexing
 import qualified PGF
 import PGF.Lexing
 import Cache
 import CGIUtils(outputJSONP,outputPlain,outputHTML,outputText,
                outputBinary,outputBinary',
@@ -272,8 +272,11 @@ cpgfMain qsem command (t,(pgf,pc)) =
            maybe (Left ("["++w++"]")) Right $
            msum [parse1 w,parse1 ow,morph w,morph ow]
          where
-            ow = if w==lw then capitInit w else lw
+            ow = case w of
-            lw = uncapitInit w
+                   c:cs | isLower c -> toUpper c : cs
                        | isUpper c -> toLower c : cs
                   s                -> s
            parse1 = either (const Nothing) (fmap fst . listToMaybe) .
                     C.parse concr cat
            morph w = listToMaybe
@@ -293,7 +296,7 @@ cpgfMain qsem command (t,(pgf,pc)) =
    from1 = maybe (missing "from") return =<< from'
    from' = getLang "from"
-    to = (,) # getLangs "to" % unlexerC
+    to = (,) # getLangs "to" % unlexer (const False)
    getLangs = getLangs' readLang
    getLang = getLang' readLang
@@ -308,8 +311,7 @@ cpgfMain qsem command (t,(pgf,pc)) =
              let t = C.readExpr s
              maybe (badRequest "bad tree" s) return t
-    --c_lexer concr = lexer
+    c_lexer concr = lexer (not . null . C.lookupMorpho concr)
    c_lexer concr = ilexer (not . null . C.lookupMorpho concr)
 --------------------------------------------------------------------------------
@@ -338,62 +340,29 @@ instance ToATree C.Expr where
 --------------------------------------------------------------------------------
 -- * Lexing
 -- | Lexers with a text lexer that tries to be a more clever with the first word
 ilexer good = lexer' uncap
  where
    uncap s = case span isUpper s of
                ([c],r) | not (good s) -> toLower c:r
                _ -> s
 -- | Standard lexers
-lexer = lexer' uncapitInit
+lexer good = maybe (return id) lexerfun =<< getInput "lexer" 
 lexer' uncap = maybe (return id) lexerfun =<< getInput "lexer" 
  where
    lexerfun name =
-       case name of
+      case stringOp good ("lex"++name) of
-         "text"  -> return (unwords . lexText' uncap)
+        Just fn -> return fn
-         "code"  -> return (unwords . lexCode)
+        Nothing -> badRequest "Unknown lexer" name
         "mixed" -> return (unwords . lexMixed)
         _ -> badRequest "Unknown lexer" name
 type Unlexer = String->String
 -- | Unlexing for the C runtime system, &+ is already applied
-unlexerC :: CGI Unlexer
+unlexer :: (String -> Bool) -> CGI Unlexer
-unlexerC = maybe (return id) unlexerfun =<< getInput "unlexer"
+unlexer good = maybe (return id) unlexerfun =<< getInput "unlexer"
  where
    unlexerfun name =
-       case name of
+       case stringOp good ("unlex"++name) of
-         "text"  -> return (unlexText' . words)
+         Just fn -> return (fn . cleanMarker)
-         "code"  -> return (unlexCode . words)
+         Nothing -> badRequest "Unknown unlexer" name
-         "mixed" -> return (unlexMixed . words)
+    
-         "none"  -> return id
+    cleanMarker ('+':cs) = cs
-         "id"    -> return id
+    cleanMarker ('*':cs) = cs
-         _ -> badRequest "Unknown lexer" name
+    cleanMarker cs       = cs
 -- | Unlex text, skipping the quality marker used by the App grammar
 unlexText' ("+":ws) = "+ "++unlexText ws
 unlexText' ("*":ws) = "* "++unlexText ws
 unlexText' ws       = unlexText ws
 -- | Unlexing for the Haskell run-time, applying the &+ operator first
 unlexerH :: CGI Unlexer
 unlexerH = maybe (return doBind) unlexerfun =<< getInput "unlexer"
  where
    unlexerfun name =
       case name of
         "text"  -> return (unlexText' . bind)
         "code"  -> return (unlexCode . bind)
         "mixed" -> return (unlexMixed . bind)
         "none"  -> return id
         "id"    -> return id
         "bind"  -> return doBind
         _ -> badRequest "Unknown lexer" name
    doBind = unwords . bind
    bind = bindTok . words
 --------------------------------------------------------------------------------
 -- * Haskell run-time functionality
@@ -431,8 +400,8 @@ pgfMain lcs@(alc,clc) path command tpgf@(t,pgf) =
               inp <- textInput
               return (fr,lex inp)
-    mlexer Nothing = lexer
+    mlexer Nothing     = lexer (const False)
-    mlexer (Just lang) = ilexer (PGF.isInMorpho morpho)
+    mlexer (Just lang) = lexer (PGF.isInMorpho morpho)
      where morpho = PGF.buildMorpho pgf lang
    tree :: CGI PGF.Tree
@@ -489,7 +458,7 @@ pgfMain lcs@(alc,clc) path command tpgf@(t,pgf) =
    from = getLang "from"
    to1 = maybe (missing "to") return =<< getLang "to"
-    to = (,) # getLangs "to" % unlexerH
+    to = (,) # getLangs "to" % unlexer (const False)
    getLangs = getLangs' readLang
    getLang = getLang' readLang
		`@@ -0,0 +1,3 @@`
							`module PGF(module PGF2) where`

							`import PGF2`