eliminate modules PGF.Lexing, PGF.LexingAGreek. Make PGF.Utilities an internal module in the runtime. These are not really part of the core runtime.

2026-07-10 15:42:46 -06:00 · 2017-09-04 11:43:37 +02:00
parent cae52bb9af
commit 1f908fa7bf
12 changed files with 453 additions and 561 deletions
@@ -109,9 +109,6 @@ Library
  exposed-modules: 
    PGF
    PGF.Internal
-    PGF.Lexing
-    PGF.LexingAGreek
-    PGF.Utilities
    PGF.Haskell

  other-modules:
@@ -137,6 +134,7 @@ Library
    PGF.VisualizeTree
    PGF.ByteCode
    PGF.OldBinary
+    PGF.Utilities

  if flag(c-runtime)
    exposed-modules: PGF2
@@ -219,7 +219,7 @@ commonCommands = fmap (mapCommandExec liftSIO) $ Map.fromList [

 stringOps menv opts s = foldr (menvop . app) s (reverse opts)
  where
-    app f = maybe id id (stringOp f)
+    app f = maybe id id (stringOp (const False) f)
    menvop op = maybe op (\ (b,e) -> opInEnv b e op) menv

 envFlag fs =
@@ -12,12 +12,12 @@
 -----------------------------------------------------------------------------


-module GF.Data.Utilities(module GF.Data.Utilities, module PGF.Utilities) where
+module GF.Data.Utilities(module GF.Data.Utilities) where

 import Data.Maybe
 import Data.List
 import Control.Monad (MonadPlus(..),liftM,when)
-import PGF.Utilities
+import qualified Data.Set as Set

 -- * functions on lists

@@ -190,3 +190,18 @@ joinS glue = concatS . intersperse (showString glue)



+-- | Like 'Data.List.nub', but O(n log n) instead of O(n^2), since it uses a set to lookup previous things.
+--   The result list is stable (the elements are returned in the order they occur), and lazy.
+--   Requires that the list elements can be compared by Ord.
+--   Code ruthlessly taken from <http://hpaste.org/54411>
+nub' :: Ord a => [a] -> [a]
+nub' = loop Set.empty
+    where loop _    []            = []
+          loop seen (x : xs)
+              | Set.member x seen = loop seen xs
+              | otherwise         = x : loop (Set.insert x seen) xs
+
+
+-- | Replace all occurences of an element by another element.
+replace :: Eq a => a -> a -> [a] -> [a]
+replace x y = map (\z -> if z == x then y else z)
@@ -2,30 +2,30 @@
 module GF.Text.Lexing (stringOp,opInEnv) where

 import GF.Text.Transliterations
-import PGF.Lexing
-import PGF.LexingAGreek(lexAGreek,unlexAGreek,lexAGreek2) -- HL 20.2.2016

-import Data.Char (isSpace)
+import Data.Char (isSpace,toUpper,toLower)
 import Data.List (intersperse)

-stringOp :: String -> Maybe (String -> String)
-stringOp name = case name of
+stringOp :: (String -> Bool) -> String -> Maybe (String -> String)
+stringOp good name = case name of
  "chars"      -> Just $ appLexer (filter (not . all isSpace) . map return)
-  "lextext"    -> Just $ appLexer lexText
+  "lextext"    -> Just $ appLexer (lexText good)
  "lexcode"    -> Just $ appLexer lexCode
-  "lexmixed"   -> Just $ appLexer lexMixed
+  "lexmixed"   -> Just $ appLexer (lexMixed good)
  "lexgreek"   -> Just $ appLexer lexAGreek
  "lexgreek2"  -> Just $ appLexer lexAGreek2
  "words"      -> Just $ appLexer words
  "bind"       -> Just $ appUnlexer (unwords . bindTok)
  "unchars"    -> Just $ appUnlexer concat
-  "unlextext"  -> Just $ appUnlexer (unlexText . unquote)
+  "unlextext"  -> Just $ appUnlexer (unlexText . unquote . bindTok)
  "unlexcode"  -> Just $ appUnlexer unlexCode
-  "unlexmixed" -> Just $ appUnlexer (unlexMixed . unquote)
+  "unlexmixed" -> Just $ appUnlexer (unlexMixed good . unquote . bindTok)
  "unlexgreek" -> Just $ appUnlexer unlexAGreek
+  "unlexnone"  -> Just id
+  "unlexid"    -> Just id
  "unwords"    -> Just $ appUnlexer unwords
  "to_html"    -> Just wrapHTML
-  _ -> transliterate name
+  _            -> transliterate name

 -- perform op in environments beg--end, t.ex. between "--"
 --- suboptimal implementation
@@ -55,3 +55,395 @@ appUnlexer f = f . words
 wrapHTML :: String -> String
 wrapHTML = unlines . tag . intersperse "<br>" . lines where
  tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"]
+
+
+-- * Text lexing
+-- | Text lexing with standard word capitalization of the first word of every sentence
+lexText :: (String -> Bool) -> String -> [String]
+lexText good = lexText' (uncapitInit good)
+
+-- | Text lexing with custom treatment of the first word of every sentence.
+lexText' :: (String->String) -> String -> [String]
+lexText' uncap1 = uncap . lext where
+  lext s = case s of
+    c:cs | isMajorPunct c -> [c] : uncap (lext cs)
+    c:cs | isMinorPunct c -> [c] : lext cs
+    c:cs | isSpace c ->       lext cs
+    _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
+    _ -> [s]
+  uncap s = case s of
+    w:ws -> uncap1 w:ws
+    _ -> s
+
+unlexText :: [String] -> String
+unlexText = capitInit . unlext where
+  unlext s = case s of
+    w:[] -> w
+    w:[c]:[] | isPunct c -> w ++ [c]
+    w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
+    w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
+    w:ws -> w ++ " " ++ unlext ws
+    _ -> []
+
+-- | Bind tokens separated by Prelude.BIND, i.e. &+
+bindTok :: [String] -> [String]
+bindTok ws = case ws of
+               w1:"&+":w2:ws -> bindTok ((w1++w2):ws)
+               "&+":ws       -> bindTok ws
+               "&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws)
+               "&|":ws       -> bindTok ws
+               w:ws          -> w:bindTok ws
+               []            -> []
+
+-- * Code lexing
+
+-- | Haskell lexer, usable for much code
+lexCode :: String -> [String]
+lexCode ss = case lex ss of
+  [(w@(_:_),ws)] -> w : lexCode ws
+  _ -> []
+  
+
+-- * Ancient Greek lexing
+
+lexTextAGreek :: String -> [String]
+lexTextAGreek s = lext s where
+  lext s = case s of
+    c:cs | isAGreekPunct c -> [c] : (lext cs)
+    c:cs | isSpace c -> lext cs
+    _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s 
+           in w : lext cs
+    [] -> []
+
+-- Philological greek text may use vowel length indicators. Then '.' is not a sentence 
+-- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c.
+
+lexTextAGreek2 :: String -> [String]
+lexTextAGreek2 s = lext s where
+  lext s = case s of
+    c:cs | isAGreekPunct c -> [c] : (lext cs)
+    c:cs | isSpace c -> lext cs
+    _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s 
+           in case cs of 
+                '.':'.':d:ds | isSpace d 
+                  -> (w++['.']) : lext ('.':d:ds)
+                '.':d:ds | isAGreekPunct d || isSpace d 
+                  -> (w++['.']) : lext (d:ds)
+                '.':d:ds | not (isSpace d) 
+                  -> case lext (d:ds) of
+                       e:es -> (w++['.']++e) : es
+                       es -> (w++['.']) : es 
+                '.':[] -> (w++['.']) : []
+                _ -> w : lext cs      
+    [] -> []
+
+unlexTextAGreek :: [String] -> String
+unlexTextAGreek = unlext where
+  unlext s = case s of
+    w:[] -> w
+    w:[c]:[] | isAGreekPunct c -> w ++ [c]
+    w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs
+    w:ws -> w ++ " " ++ unlext ws
+    [] -> []
+
+isAGreekPunct = flip elem ".,;··"  -- colon: first version · not in charset,
+                                   -- second version · = 00B7 standard code point
+
+-- * Text lexing and unlexing for Ancient Greek: 
+--   1. no capitalization of initial word, 
+--   2. grave/acute accent switch on final syllables of words not followed by punctuation, 
+--   3. accent move from/to support word to/from following clitic words (iterated).
+
+lexAGreek :: String -> [String]
+lexAGreek = fromAGreek . lexTextAGreek
+
+lexAGreek2 :: String -> [String]
+lexAGreek2 = fromAGreek . lexTextAGreek2
+
+unlexAGreek :: [String] -> String
+unlexAGreek = unlexTextAGreek . toAGreek
+
+-- Note: unlexAGreek does not glue punctuation with the previous word, so that short
+-- vowel indication (like a.) differs from sentence end (a .). 
+
+-- | normalize = change grave accent on sentence internal words to acute, 
+-- and shift inherited acutes to the following enclitic (where they are
+-- visible only as shown in the list of enclitics above)
+
+normalize :: String -> String
+normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek) 
+
+fromAGreek :: [String] -> [String]
+fromAGreek s = case s of
+  w:[]:vs -> w:[]:(fromAGreek vs)
+  w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs)
+  w:v:vs | wasEnclitic v && wasEnclitic w ->
+    getEnclitic w : fromAGreek (v:vs)
+  w:v:vs | wasEnclitic v && wasProclitic w ->  -- "ei)' tines*"
+    getProclitic w : getEnclitic v : fromAGreek vs
+  w:v:vs | wasEnclitic v && (hasEndCircum w || 
+    (hasEndAcute w && hasSingleAccent w)) ->  
+    w : getEnclitic v : fromAGreek vs        -- ok "sofoi' tines*"
+  w:v:vs | wasEnclitic v && hasPrefinalAcute w ->
+    w : getEnclitic v : fromAGreek vs
+  w:v:vs | wasEnclitic v && hasEndAcute w ->  -- ok "a)'nvrwpoi' tines*"
+    dropLastAccent w : getEnclitic v : fromAGreek vs
+  w:v:vs | wasEnclitic w ->
+    getEnclitic w : fromAGreek (v:vs)
+  w:ws -> (toAcute w) : (fromAGreek ws)
+  ws -> ws 
+
+-- | de-normalize = change acute accent of end syllables in sentence internal 
+--  (non-enclitic) words to grave accent, and move accents of enclitics to the 
+--  previous word to produce ordinary ancient greek
+
+denormalize :: String -> String
+denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek) 
+
+toAGreek :: [String] -> [String]
+toAGreek s = case s of
+  w:[]:vs -> w:[]:(toAGreek vs)
+  w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs)  -- w:[] for following -to_ancientgreek
+  w:v:vs | isEnclitic v && isEnclitic w -> 
+    addAcute w : toAGreek (dropAccent v:vs)    -- BR 11 Anm.2
+  w:v:vs | isEnclitic v && isProclitic w ->    -- BR 11 a.beta
+    addAcute w: (toAGreek (dropAccent v:vs))
+  w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) -> 
+    w:(toAGreek (dropAccent v:vs))             -- BR 11 a.alpha,beta
+  w:v:vs | isEnclitic v && hasPrefinalAcute w -> 
+    w:v: toAGreek vs   -- bisyllabic v keeps its accent  BR 11 b.
+  w:v:vs | isEnclitic v -> 
+    (addAcute w):(toAGreek (dropAccent v:vs))  -- BR 11 a.gamma
+  w:v:vs | isEnclitic w -> w:(toAGreek (v:vs))
+  w:ws -> (toGrave w) : (toAGreek ws)
+  ws -> ws 
+
+-- | Change accent on the final syllable of a word
+
+toGrave :: String -> String 
+toGrave = reverse . grave . reverse where
+  grave s = case s of
+    '\'':cs -> '`':cs 
+    c:cs | isAGreekVowel c -> c:cs 
+    c:cs -> c: grave cs
+    _ -> s
+
+toAcute :: String -> String 
+toAcute = reverse . acute . reverse where
+  acute s = case s of
+    '`':cs -> '\'':cs
+    c:cs | isAGreekVowel c -> c:cs 
+    c:cs -> c: acute cs
+    _ -> s
+
+isAGreekVowel = flip elem "aeioyhw"
+
+-- | Accent moves for enclitics and proclitics (atona)
+
+enclitics = [
+  "moy","moi","me",     -- personal pronouns
+  "soy","soi","se",
+  "oy(","oi(","e(",
+  "tis*","ti","tina'",  -- indefinite pronoun 
+  "tino's*","tini'",
+  "tine's*","tina's*",
+  "tinw~n","tisi'","tisi'n",
+  "poy","poi",          -- indefinite adverbs
+  "pove'n","pws*",
+  "ph|","pote'",
+  "ge","te","toi",      -- particles
+  "nyn","per","pw" 
+   -- suffix -"de"
+   -- praes.indik. of fhmi', ei)mi' (except fh's*, ei)~)
+  ] -- and more, BR 11
+
+proclitics = [
+  "o(","h(","oi(","ai(",     -- articles
+  "e)n","ei)s*","e)x","e)k", -- prepositions
+  "ei)","w(s*",              -- conjunctions
+  "oy)","oy)k","oy)c"        -- negation
+  ]
+
+isEnclitic = flip elem enclitics
+isProclitic = flip elem proclitics
+
+-- Check if a word is an enclitic or accented enclitic and extract the enclitic
+
+wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics) 
+                    ++ (map dropAccent (filter hasAccent enclitics))
+                  accented = (filter hasAccent enclitics) 
+                    ++ map addAcute (filter (not . hasAccent) enclitics) 
+              in flip elem (accented ++ unaccented)
+
+wasProclitic = flip elem (map addAcute proclitics)
+
+getEnclitic = 
+  let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics))
+                   ++ (map addAcute (filter (not . hasAccent) enclitics)))
+                  (enclitics ++ (filter hasAccent enclitics)
+                   ++ (filter (not . hasAccent) enclitics))
+      find = \v -> lookup v pairs
+  in \v -> case (find v) of 
+    Just x -> x 
+    _ -> v
+getProclitic = 
+  let pairs = zip (map addAcute proclitics) proclitics 
+      find = \v -> lookup v pairs
+  in \v -> case (find v) of 
+    Just x -> x 
+    _ -> v
+
+-- | Accent manipulation
+
+dropAccent = reverse . drop . reverse where 
+  drop s = case s of
+    [] -> []
+    '\'':cs -> cs
+    '`':cs -> cs
+    '~':cs -> cs
+    c:cs -> c:drop cs
+
+dropLastAccent = reverse . drop . reverse where 
+  drop s = case s of
+    [] -> []
+    '\'':cs -> cs
+    '`':cs -> cs
+    '~':cs -> cs
+    c:cs -> c:drop cs
+
+addAcute :: String -> String
+addAcute = reverse . acu . reverse where
+  acu w = case w of 
+    c:cs | c == '\'' -> c:cs
+    c:cs | c == '(' -> '\'':c:cs
+    c:cs | c == ')' -> '\'':c:cs
+    c:cs | isAGreekVowel c -> '\'':c:cs
+    c:cs -> c : acu cs
+    _ -> w
+
+-- | Accent checking on end syllables
+
+hasEndAcute = find . reverse where
+  find s = case s of
+    [] -> False
+    '\'':cs -> True
+    '`':cs -> False
+    '~':cs -> False
+    c:cs | isAGreekVowel c -> False
+    _:cs -> find cs
+
+hasEndCircum = find . reverse where
+  find s = case s of
+    [] -> False
+    '\'':cs -> False
+    '`':cs -> False
+    '~':cs -> True
+    c:cs | isAGreekVowel c -> False
+    _:cs -> find cs
+
+hasPrefinalAcute = find . reverse where
+  find s = case s of
+    [] -> False
+    '\'':cs -> False  -- final acute
+    '`':cs -> False
+    '~':cs -> False
+    c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs
+    c:cs | isAGreekVowel c -> findNext cs
+    _:cs -> find cs where
+  findNext s = case s of 
+    [] -> False
+    '\'':cs -> True  -- prefinal acute
+    '`':cs -> False
+    '~':cs -> False
+    c:cs | isAGreekVowel c -> False
+    _:cs -> findNext cs where
+    
+hasSingleAccent v = 
+  hasAccent v && not (hasAccent (dropLastAccent v))
+
+hasAccent v = case v of 
+  [] -> False
+  c:cs -> elem c ['\'','`','~'] || hasAccent cs
+
+{- Tests: 
+
+-- denormalization. Examples in BR 11 work: 
+-}
+enclitics_expls = -- normalized
+  "sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n":  -- a.alpha
+  "sofo's tis*":"sofoi' tine's*":                 -- a.beta
+  "ei) tis*":"ei) tine's*":
+  "a)'nvrwpos* tis*":"a)'nvrwpoi tine's*":        -- a.gamma
+  "doy~los* tis*":"doy~loi tine's*":
+  "lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n":  -- b.
+  "ei) poy tis* tina' i)'doi":                    -- Anm. 2.
+  [] 
+
+
+unlexCode :: [String] -> String
+unlexCode s = case s of
+  w:[] -> w
+  [c]:cs | isParen c -> [c] ++ unlexCode cs
+  w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
+  w:ws -> w ++ " " ++ unlexCode ws
+  _ -> []
+
+
+-- | LaTeX lexer in the math mode: \ should not be separated from the next word
+
+lexLatexCode :: String -> [String]
+lexLatexCode = restoreBackslash . lexCode where --- quick hack: postprocess Haskell's lex
+  restoreBackslash ws = case ws of
+    "\\":w:ww -> ("\\" ++ w) : restoreBackslash ww
+    w:ww -> w:restoreBackslash ww
+    _ -> ws
+
+-- * Mixed lexing
+
+-- | LaTeX style lexer, with "math" environment using Code between $...$
+lexMixed :: (String -> Bool) -> String -> [String]
+lexMixed good = concat . alternate False [] where
+  alternate env t s = case s of
+    '$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs
+    '\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs
+    c:cs -> alternate env (c:t) cs
+    _ -> [lex env (reverse t)]
+  lex env = if env then lexLatexCode else lexText good
+
+unlexMixed :: (String -> Bool) -> [String] -> String
+unlexMixed good = capitInit . concat . alternate False where
+  alternate env s = case s of
+    _:_ -> case break (flip elem ["$","\\[","\\]","\\(","\\)"]) s of
+      (t,[])  -> unlex env t : []
+      (t,c:m) -> unlex env t : sep env c m : alternate (not env) m
+    _ -> []
+  unlex env = if env then unlexCode else (uncapitInit good . unlexText)
+  sep env c m = case (m,env) of
+    ([p]:_,True) | isPunct p -> c   -- closing $ glued to next punct 
+    (_,  True) -> c ++ " "   -- closing $ otherwise separated by space from what follows
+    _ -> " " ++ c   -- put space before opening $
+
+-- * Additional lexing uitilties
+
+-- | Capitalize first letter
+capitInit s = case s of
+  c:cs -> toUpper c : cs
+  _ -> s
+
+-- | Uncapitalize first letter
+uncapitInit good s = 
+  case s of
+    c:cs | not (good s) -> toLower c : cs
+    _                   -> s
+
+-- | Unquote each string wrapped in double quotes
+unquote = map unq where 
+  unq s = case s of
+    '"':cs@(_:_) | last cs == '"' -> init cs
+    _ -> s
+
+isPunct = flip elem ".?!,:;"
+isMajorPunct = flip elem ".?!"
+isMinorPunct = flip elem ",:;"
+isParen = flip elem "()[]{}"
+isClosing = flip elem ")]}"
@@ -0,0 +1,3 @@
+module PGF(module PGF2) where
+
+import PGF2
@@ -0,0 +1 @@
+module PGF.Internal where
@@ -1,61 +0,0 @@
-- | pgf-hsbind-trans: A simple batch translator to illustrate the use of the Haskell binding
--   to the C implementation of the PGF run-time system.
--
-- AR April 2015 modified from pgf-shell
-
-import PGF2
-import PGF.Lexing (lexText')
-
-import Data.Char(isSpace,toLower)
-import Data.List (nub)
-import System.Environment
-import qualified Data.Map as Map
-
-maxNumTrees :: Int
-maxNumTrees = 1
-
-maxNumVariants :: Int
-maxNumVariants = 1
-
-main = getPGF =<< getArgs
-
-getPGF args = case args of
-  [path,from,to,cat,mxt,mxv] -> pgfTrans from to (Just cat) (read mxt, read mxv) =<< readPGF path
-  [path,from,to]        -> pgfTrans from to Nothing (maxNumTrees,maxNumVariants) =<< readPGF path
-  _                     -> putStrLn "Usage: pgf-hsbind-trans <path to pgf> <from-lang> <to-lang> [<cat> <#trees> <#variants>]"
-
-pgfTrans from to mcat mx pgf = do
-  cfrom <- getConcr' pgf from
-  cto   <- getConcr' pgf to
-  let cat = maybe (startCat pgf) id mcat
-  interact (unlines . map (translates pgf cfrom cto cat mx) . lines)
-
-getConcr' pgf lang =
-    maybe (fail $ "Concrete syntax not found: "++show lang) return $
-    Map.lookup lang (languages pgf)
-
-linearizeAndShow gr mxv (t,p) = [show t]++take mxv (nub (map unstar (linearizeAll gr t)))++[show p]
-  where
-    unstar s = case s of
-      '*':' ':cs -> cs
-      _ -> s
-
-translates pgf cfrom cto cat (mxt,mxv) s0 = 
-  let s1 = lextext cfrom s0
-      (s,p) = case reverse s1 of c:_ | elem c ".?!" -> (init s1,[c]) ; _ -> (s1,[]) -- separate final punctuation
-  in
-  case cparse pgf cfrom cat s of
-    Left tok -> unlines [s,"Parse error: "++tok]
-    Right ts -> unlines $ (("> "++ s):) $ take mxt $ map ((++p) . unlines . linearizeAndShow cto mxv) ts -- append punctuation
-
-cparse pgf concr cat input = parseWithHeuristics concr cat input (-1) callbacks where
-  callbacks = maybe [] cb $ lookup "App" literalCallbacks
-  cb fs = [(cat,f pgf ("TranslateEng",concr) input)|(cat,f)<-fs]
-
-lextext cnc = unwords . lexText' (\w -> case lookupMorpho cnc w of
-  _:_ -> w
-  _ -> case lookupMorpho cnc (uncapitInit w) of 
-    [] -> w 
-    _ -> uncapitInit w
-  )
- where uncapitInit (c:cs) = toLower c : cs
@@ -14,7 +14,9 @@ extra-source-files:  README
 cabal-version:       >=1.10

 library
-  exposed-modules:     PGF2, SG
+  exposed-modules:     PGF2, SG,
+                       -- backwards compatibility API:
+                       PGF, PGF.Internal
  other-modules:       PGF2.FFI, PGF2.Expr, PGF2.Type, SG.FFI
  build-depends:       base >=4.3, bytestring >=0.9,
                       containers, pretty
@@ -1,115 +0,0 @@
-module PGF.Lexing where
-import Data.Char(isSpace,toLower,toUpper)
-
-- * Text lexing
-- | Text lexing with standard word capitalization of the first word of every sentence
-lexText :: String -> [String]
-lexText = lexText' uncapitInit
-
-- | Text lexing with custom treatment of the first word of every sentence.
-lexText' :: (String->String) -> String -> [String]
-lexText' uncap1 = uncap . lext where
-  lext s = case s of
-    c:cs | isMajorPunct c -> [c] : uncap (lext cs)
-    c:cs | isMinorPunct c -> [c] : lext cs
-    c:cs | isSpace c ->       lext cs
-    _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
-    _ -> [s]
-  uncap s = case s of
-    w:ws -> uncap1 w:ws
-    _ -> s
-
-unlexText :: [String] -> String
-unlexText = capitInit . unlext where
-  unlext s = case s of
-    w:[] -> w
-    w:[c]:[] | isPunct c -> w ++ [c]
-    w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
-    w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
-    w:ws -> w ++ " " ++ unlext ws
-    _ -> []
-
-- | Bind tokens separated by Prelude.BIND, i.e. &+
-bindTok :: [String] -> [String]
-bindTok ws = case ws of
-               w1:"&+":w2:ws -> bindTok ((w1++w2):ws)
-               "&+":ws       -> bindTok ws
-               "&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws)
-               "&|":ws       -> bindTok ws
-               w:ws          -> w:bindTok ws
-               []            -> []
-
-- * Code lexing
-
-- | Haskell lexer, usable for much code
-lexCode :: String -> [String]
-lexCode ss = case lex ss of
-  [(w@(_:_),ws)] -> w : lexCode ws
-  _ -> []
-
-unlexCode :: [String] -> String
-unlexCode s = case s of
-  w:[] -> w
-  [c]:cs | isParen c -> [c] ++ unlexCode cs
-  w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
-  w:ws -> w ++ " " ++ unlexCode ws
-  _ -> []
-
-
-- | LaTeX lexer in the math mode: \ should not be separated from the next word
-
-lexLatexCode :: String -> [String]
-lexLatexCode = restoreBackslash . lexCode where --- quick hack: postprocess Haskell's lex
-  restoreBackslash ws = case ws of
-    "\\":w:ww -> ("\\" ++ w) : restoreBackslash ww
-    w:ww -> w:restoreBackslash ww
-    _ -> ws
-
-- * Mixed lexing
-
-- | LaTeX style lexer, with "math" environment using Code between $...$
-lexMixed :: String -> [String]
-lexMixed = concat . alternate False [] where
-  alternate env t s = case s of
-    '$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs
-    '\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs
-    c:cs -> alternate env (c:t) cs
-    _ -> [lex env (reverse t)]
-  lex env = if env then lexLatexCode else lexText
-
-unlexMixed :: [String] -> String
-unlexMixed = capitInit . concat . alternate False where
-  alternate env s = case s of
-    _:_ -> case break (flip elem ["$","\\[","\\]","\\(","\\)"]) s of
-      (t,[])  -> unlex env t : []
-      (t,c:m) -> unlex env t : sep env c m : alternate (not env) m
-    _ -> []
-  unlex env = if env then unlexCode else (uncapitInit . unlexText)
-  sep env c m = case (m,env) of
-    ([p]:_,True) | isPunct p -> c   -- closing $ glued to next punct 
-    (_,  True) -> c ++ " "   -- closing $ otherwise separated by space from what follows
-    _ -> " " ++ c   -- put space before opening $
-
-- * Additional lexing uitilties
-
-- | Capitalize first letter
-capitInit s = case s of
-  c:cs -> toUpper c : cs
-  _ -> s
-
-- | Uncapitalize first letter
-uncapitInit s = case s of
-  c:cs -> toLower c : cs
-  _ -> s
-
-- | Unquote each string wrapped in double quotes
-unquote = map unq where 
-  unq s = case s of
-    '"':cs@(_:_) | last cs == '"' -> init cs
-    _ -> s
-
-isPunct = flip elem ".?!,:;"
-isMajorPunct = flip elem ".?!"
-isMinorPunct = flip elem ",:;"
-isParen = flip elem "()[]{}"
-isClosing = flip elem ")]}"
@@ -1,310 +0,0 @@
-module PGF.LexingAGreek where                      -- HL 2a1.2.2016
-import Data.Char(isSpace)
-
-- * Text lexing without word capitalization of the first word of every sentence.
-- Greek sentences in (transliterated) texts don't start with capital character. 
-
-- Ordinary greek text does not have vowel length indicators. We then use '.' as
-- a sentence separator.
-
-lexTextAGreek :: String -> [String]
-lexTextAGreek s = lext s where
-  lext s = case s of
-    c:cs | isAGreekPunct c -> [c] : (lext cs)
-    c:cs | isSpace c -> lext cs
-    _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s 
-           in w : lext cs
-    [] -> []
-
-- Philological greek text may use vowel length indicators. Then '.' is not a sentence 
-- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c.
-
-lexTextAGreek2 :: String -> [String]
-lexTextAGreek2 s = lext s where
-  lext s = case s of
-    c:cs | isAGreekPunct c -> [c] : (lext cs)
-    c:cs | isSpace c -> lext cs
-    _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s 
-           in case cs of 
-                '.':'.':d:ds | isSpace d 
-                  -> (w++['.']) : lext ('.':d:ds)
-                '.':d:ds | isAGreekPunct d || isSpace d 
-                  -> (w++['.']) : lext (d:ds)
-                '.':d:ds | not (isSpace d) 
-                  -> case lext (d:ds) of
-                       e:es -> (w++['.']++e) : es
-                       es -> (w++['.']) : es 
-                '.':[] -> (w++['.']) : []
-                _ -> w : lext cs      
-    [] -> []
-
-unlexTextAGreek :: [String] -> String
-unlexTextAGreek = unlext where
-  unlext s = case s of
-    w:[] -> w
-    w:[c]:[] | isAGreekPunct c -> w ++ [c]
-    w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs
-    w:ws -> w ++ " " ++ unlext ws
-    [] -> []
-
-isAGreekPunct = flip elem ".,;··"  -- colon: first version · not in charset,
-                                   -- second version · = 00B7 standard code point
-
-- * Text lexing and unlexing for Ancient Greek: 
--   1. no capitalization of initial word, 
--   2. grave/acute accent switch on final syllables of words not followed by punctuation, 
--   3. accent move from/to support word to/from following clitic words (iterated).
-
-lexAGreek :: String -> [String]
-lexAGreek = fromAGreek . lexTextAGreek
-
-lexAGreek2 :: String -> [String]
-lexAGreek2 = fromAGreek . lexTextAGreek2
-
-unlexAGreek :: [String] -> String
-unlexAGreek = unlexTextAGreek . toAGreek
-
-- Note: unlexAGreek does not glue punctuation with the previous word, so that short
-- vowel indication (like a.) differs from sentence end (a .). 
-
-- | normalize = change grave accent on sentence internal words to acute, 
-- and shift inherited acutes to the following enclitic (where they are
-- visible only as shown in the list of enclitics above)
-
-normalize :: String -> String
-normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek) 
-
-fromAGreek :: [String] -> [String]
-fromAGreek s = case s of
-  w:[]:vs -> w:[]:(fromAGreek vs)
-  w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs)
-  w:v:vs | wasEnclitic v && wasEnclitic w ->
-    getEnclitic w : fromAGreek (v:vs)
-  w:v:vs | wasEnclitic v && wasProclitic w ->  -- "ei)' tines*"
-    getProclitic w : getEnclitic v : fromAGreek vs
-  w:v:vs | wasEnclitic v && (hasEndCircum w || 
-    (hasEndAcute w && hasSingleAccent w)) ->  
-    w : getEnclitic v : fromAGreek vs        -- ok "sofoi' tines*"
-  w:v:vs | wasEnclitic v && hasPrefinalAcute w ->
-    w : getEnclitic v : fromAGreek vs
-  w:v:vs | wasEnclitic v && hasEndAcute w ->  -- ok "a)'nvrwpoi' tines*"
-    dropLastAccent w : getEnclitic v : fromAGreek vs
-  w:v:vs | wasEnclitic w ->
-    getEnclitic w : fromAGreek (v:vs)
-  w:ws -> (toAcute w) : (fromAGreek ws)
-  ws -> ws 
-
-- | de-normalize = change acute accent of end syllables in sentence internal 
--  (non-enclitic) words to grave accent, and move accents of enclitics to the 
--  previous word to produce ordinary ancient greek
-
-denormalize :: String -> String
-denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek) 
-
-toAGreek :: [String] -> [String]
-toAGreek s = case s of
-  w:[]:vs -> w:[]:(toAGreek vs)
-  w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs)  -- w:[] for following -to_ancientgreek
-  w:v:vs | isEnclitic v && isEnclitic w -> 
-    addAcute w : toAGreek (dropAccent v:vs)    -- BR 11 Anm.2
-  w:v:vs | isEnclitic v && isProclitic w ->    -- BR 11 a.beta
-    addAcute w: (toAGreek (dropAccent v:vs))
-  w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) -> 
-    w:(toAGreek (dropAccent v:vs))             -- BR 11 a.alpha,beta
-  w:v:vs | isEnclitic v && hasPrefinalAcute w -> 
-    w:v: toAGreek vs   -- bisyllabic v keeps its accent  BR 11 b.
-  w:v:vs | isEnclitic v -> 
-    (addAcute w):(toAGreek (dropAccent v:vs))  -- BR 11 a.gamma
-  w:v:vs | isEnclitic w -> w:(toAGreek (v:vs))
-  w:ws -> (toGrave w) : (toAGreek ws)
-  ws -> ws 
-
-- | Change accent on the final syllable of a word
-
-toGrave :: String -> String 
-toGrave = reverse . grave . reverse where
-  grave s = case s of
-    '\'':cs -> '`':cs 
-    c:cs | isAGreekVowel c -> c:cs 
-    c:cs -> c: grave cs
-    _ -> s
-
-toAcute :: String -> String 
-toAcute = reverse . acute . reverse where
-  acute s = case s of
-    '`':cs -> '\'':cs
-    c:cs | isAGreekVowel c -> c:cs 
-    c:cs -> c: acute cs
-    _ -> s
-
-isAGreekVowel = flip elem "aeioyhw"
-
-- | Accent moves for enclitics and proclitics (atona)
-
-enclitics = [
-  "moy","moi","me",     -- personal pronouns
-  "soy","soi","se",
-  "oy(","oi(","e(",
-  "tis*","ti","tina'",  -- indefinite pronoun 
-  "tino's*","tini'",
-  "tine's*","tina's*",
-  "tinw~n","tisi'","tisi'n",
-  "poy","poi",          -- indefinite adverbs
-  "pove'n","pws*",
-  "ph|","pote'",
-  "ge","te","toi",      -- particles
-  "nyn","per","pw" 
-   -- suffix -"de"
-   -- praes.indik. of fhmi', ei)mi' (except fh's*, ei)~)
-  ] -- and more, BR 11
-
-proclitics = [
-  "o(","h(","oi(","ai(",     -- articles
-  "e)n","ei)s*","e)x","e)k", -- prepositions
-  "ei)","w(s*",              -- conjunctions
-  "oy)","oy)k","oy)c"        -- negation
-  ]
-
-isEnclitic = flip elem enclitics
-isProclitic = flip elem proclitics
-
-- Check if a word is an enclitic or accented enclitic and extract the enclitic
-
-wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics) 
-                    ++ (map dropAccent (filter hasAccent enclitics))
-                  accented = (filter hasAccent enclitics) 
-                    ++ map addAcute (filter (not . hasAccent) enclitics) 
-              in flip elem (accented ++ unaccented)
-
-wasProclitic = flip elem (map addAcute proclitics)
-
-getEnclitic = 
-  let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics))
-                   ++ (map addAcute (filter (not . hasAccent) enclitics)))
-                  (enclitics ++ (filter hasAccent enclitics)
-                   ++ (filter (not . hasAccent) enclitics))
-      find = \v -> lookup v pairs
-  in \v -> case (find v) of 
-    Just x -> x 
-    _ -> v
-getProclitic = 
-  let pairs = zip (map addAcute proclitics) proclitics 
-      find = \v -> lookup v pairs
-  in \v -> case (find v) of 
-    Just x -> x 
-    _ -> v
-
-- | Accent manipulation
-
-dropAccent = reverse . drop . reverse where 
-  drop s = case s of
-    [] -> []
-    '\'':cs -> cs
-    '`':cs -> cs
-    '~':cs -> cs
-    c:cs -> c:drop cs
-
-dropLastAccent = reverse . drop . reverse where 
-  drop s = case s of
-    [] -> []
-    '\'':cs -> cs
-    '`':cs -> cs
-    '~':cs -> cs
-    c:cs -> c:drop cs
-
-addAcute :: String -> String
-addAcute = reverse . acu . reverse where
-  acu w = case w of 
-    c:cs | c == '\'' -> c:cs
-    c:cs | c == '(' -> '\'':c:cs
-    c:cs | c == ')' -> '\'':c:cs
-    c:cs | isAGreekVowel c -> '\'':c:cs
-    c:cs -> c : acu cs
-    _ -> w
-
-- | Accent checking on end syllables
-
-hasEndAcute = find . reverse where
-  find s = case s of
-    [] -> False
-    '\'':cs -> True
-    '`':cs -> False
-    '~':cs -> False
-    c:cs | isAGreekVowel c -> False
-    _:cs -> find cs
-
-hasEndCircum = find . reverse where
-  find s = case s of
-    [] -> False
-    '\'':cs -> False
-    '`':cs -> False
-    '~':cs -> True
-    c:cs | isAGreekVowel c -> False
-    _:cs -> find cs
-
-hasPrefinalAcute = find . reverse where
-  find s = case s of
-    [] -> False
-    '\'':cs -> False  -- final acute
-    '`':cs -> False
-    '~':cs -> False
-    c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs
-    c:cs | isAGreekVowel c -> findNext cs
-    _:cs -> find cs where
-  findNext s = case s of 
-    [] -> False
-    '\'':cs -> True  -- prefinal acute
-    '`':cs -> False
-    '~':cs -> False
-    c:cs | isAGreekVowel c -> False
-    _:cs -> findNext cs where
-    
-hasSingleAccent v = 
-  hasAccent v && not (hasAccent (dropLastAccent v))
-
-hasAccent v = case v of 
-  [] -> False
-  c:cs -> elem c ['\'','`','~'] || hasAccent cs
-
-{- Tests: 
-
-- denormalization. Examples in BR 11 work: 
-}
-enclitics_expls = -- normalized
-  "sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n":  -- a.alpha
-  "sofo's tis*":"sofoi' tine's*":                 -- a.beta
-  "ei) tis*":"ei) tine's*":
-  "a)'nvrwpos* tis*":"a)'nvrwpoi tine's*":        -- a.gamma
-  "doy~los* tis*":"doy~loi tine's*":
-  "lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n":  -- b.
-  "ei) poy tis* tina' i)'doi":                    -- Anm. 2.
-  [] 
-{-
-test = map denormalize enclitics_expls
-
-*PGF.LexingAGreek> test
-  ["sofw~n tis*","sofw~n tines*","sof~n tinwn",
-  "sofo's tis*","sofoi' tines*",
-  "ei)' tis*","ei)' tines*",
-  "a)'nvrwpo's* tis*","a)'nvrwpoi' tines*",
-  "doy~lo's* tis*","doy~loi' tines*",
-  "lo'gos* tis* ","lo'goi tine's*","lo'gwn tinw~n ",
-  "ei)' poy' ti's* tina i)'doi"]
-
-- normalization:
-
-*PGF.LexingAGreek> map normalize test
-  ["sofw~n tis*","sofw~n tine's*","sof~n tinw~n",
-  "sofo's tis*","sofoi' tine's*",
-  "ei) tis*","ei) tine's*",
-  "a)'nvrwpos* tis*","a)'nvrwpoi tine's*",
-  "doy~los* tis*","doy~loi tine's*",
-  "lo'gos* tis*","lo'goi tine's*","lo'gwn tinw~n",
-  "ei) poy tis* tina' i)'doi"]
-
-*PGF.LexingAGreek> map (normalize . denormalize) enclitics_expls == enclitics_expls
-True
-*PGF.LexingAGreek> map (denormalize . normalize) test == test
-True
-
-}
@@ -50,9 +50,6 @@ Library
  exposed-modules: 
    PGF
    PGF.Internal
-    PGF.Lexing
-    PGF.LexingAGreek
-    PGF.Utilities
    PGF.Haskell

  other-modules:
@@ -78,3 +75,4 @@ Library
    PGF.VisualizeTree
    PGF.ByteCode
    PGF.OldBinary
+    PGF.Utilities
@@ -4,8 +4,8 @@ module PGFService(cgiMain,cgiMain',getPath,
                  Caches,pgfCache,newPGFCache,flushPGFCache,listPGFCache) where

 import PGF (PGF,Labels,CncLabels)
+import GF.Text.Lexing
 import qualified PGF
-import PGF.Lexing
 import Cache
 import CGIUtils(outputJSONP,outputPlain,outputHTML,outputText,
                outputBinary,outputBinary',
@@ -272,8 +272,11 @@ cpgfMain qsem command (t,(pgf,pc)) =
            maybe (Left ("["++w++"]")) Right $
            msum [parse1 w,parse1 ow,morph w,morph ow]
          where
-            ow = if w==lw then capitInit w else lw
-            lw = uncapitInit w
+            ow = case w of
+                   c:cs | isLower c -> toUpper c : cs
+                        | isUpper c -> toLower c : cs
+                   s                -> s
+
            parse1 = either (const Nothing) (fmap fst . listToMaybe) .
                     C.parse concr cat
            morph w = listToMaybe
@@ -293,7 +296,7 @@ cpgfMain qsem command (t,(pgf,pc)) =
    from1 = maybe (missing "from") return =<< from'
    from' = getLang "from"

-    to = (,) # getLangs "to" % unlexerC
+    to = (,) # getLangs "to" % unlexer (const False)

    getLangs = getLangs' readLang
    getLang = getLang' readLang
@@ -308,8 +311,7 @@ cpgfMain qsem command (t,(pgf,pc)) =
              let t = C.readExpr s
              maybe (badRequest "bad tree" s) return t

-    --c_lexer concr = lexer
-    c_lexer concr = ilexer (not . null . C.lookupMorpho concr)
+    c_lexer concr = lexer (not . null . C.lookupMorpho concr)

 --------------------------------------------------------------------------------

@@ -338,62 +340,29 @@ instance ToATree C.Expr where
 --------------------------------------------------------------------------------
 -- * Lexing

-- | Lexers with a text lexer that tries to be a more clever with the first word
-ilexer good = lexer' uncap
-  where
-    uncap s = case span isUpper s of
-                ([c],r) | not (good s) -> toLower c:r
-                _ -> s
-
 -- | Standard lexers
-lexer = lexer' uncapitInit
-
-lexer' uncap = maybe (return id) lexerfun =<< getInput "lexer" 
+lexer good = maybe (return id) lexerfun =<< getInput "lexer" 
  where
    lexerfun name =
-       case name of
-         "text"  -> return (unwords . lexText' uncap)
-         "code"  -> return (unwords . lexCode)
-         "mixed" -> return (unwords . lexMixed)
-         _ -> badRequest "Unknown lexer" name
+      case stringOp good ("lex"++name) of
+        Just fn -> return fn
+        Nothing -> badRequest "Unknown lexer" name


 type Unlexer = String->String

 -- | Unlexing for the C runtime system, &+ is already applied
-unlexerC :: CGI Unlexer
-unlexerC = maybe (return id) unlexerfun =<< getInput "unlexer"
+unlexer :: (String -> Bool) -> CGI Unlexer
+unlexer good = maybe (return id) unlexerfun =<< getInput "unlexer"
  where
    unlexerfun name =
-       case name of
-         "text"  -> return (unlexText' . words)
-         "code"  -> return (unlexCode . words)
-         "mixed" -> return (unlexMixed . words)
-         "none"  -> return id
-         "id"    -> return id
-         _ -> badRequest "Unknown lexer" name
-
-- | Unlex text, skipping the quality marker used by the App grammar
-unlexText' ("+":ws) = "+ "++unlexText ws
-unlexText' ("*":ws) = "* "++unlexText ws
-unlexText' ws       = unlexText ws
-
-- | Unlexing for the Haskell run-time, applying the &+ operator first
-unlexerH :: CGI Unlexer
-unlexerH = maybe (return doBind) unlexerfun =<< getInput "unlexer"
-  where
-    unlexerfun name =
-       case name of
-         "text"  -> return (unlexText' . bind)
-         "code"  -> return (unlexCode . bind)
-         "mixed" -> return (unlexMixed . bind)
-         "none"  -> return id
-         "id"    -> return id
-         "bind"  -> return doBind
-         _ -> badRequest "Unknown lexer" name
-
-    doBind = unwords . bind
-    bind = bindTok . words
+       case stringOp good ("unlex"++name) of
+         Just fn -> return (fn . cleanMarker)
+         Nothing -> badRequest "Unknown unlexer" name
+    
+    cleanMarker ('+':cs) = cs
+    cleanMarker ('*':cs) = cs
+    cleanMarker cs       = cs

 --------------------------------------------------------------------------------
 -- * Haskell run-time functionality
@@ -431,8 +400,8 @@ pgfMain lcs@(alc,clc) path command tpgf@(t,pgf) =
               inp <- textInput
               return (fr,lex inp)

-    mlexer Nothing = lexer
-    mlexer (Just lang) = ilexer (PGF.isInMorpho morpho)
+    mlexer Nothing     = lexer (const False)
+    mlexer (Just lang) = lexer (PGF.isInMorpho morpho)
      where morpho = PGF.buildMorpho pgf lang

    tree :: CGI PGF.Tree
@@ -489,7 +458,7 @@ pgfMain lcs@(alc,clc) path command tpgf@(t,pgf) =
    from = getLang "from"

    to1 = maybe (missing "to") return =<< getLang "to"
-    to = (,) # getLangs "to" % unlexerH
+    to = (,) # getLangs "to" % unlexer (const False)

    getLangs = getLangs' readLang
    getLang = getLang' readLang