From 1f908fa7bf65f51540ccb2b70ca2bd00d9b3dedf Mon Sep 17 00:00:00 2001 From: Krasimir Angelov Date: Mon, 4 Sep 2017 11:43:37 +0200 Subject: [PATCH] eliminate modules PGF.Lexing, PGF.LexingAGreek. Make PGF.Utilities an internal module in the runtime. These are not really part of the core runtime. --- gf.cabal | 4 +- src/compiler/GF/Command/CommonCommands.hs | 2 +- src/compiler/GF/Data/Utilities.hs | 19 +- src/compiler/GF/Text/Lexing.hs | 412 +++++++++++++++++- src/runtime/haskell-bind/PGF.hs | 3 + src/runtime/haskell-bind/PGF/Internal.hs | 1 + .../haskell-bind/examples/pgf-hsbind-trans.hs | 61 --- src/runtime/haskell-bind/pgf2.cabal | 4 +- src/runtime/haskell/PGF/Lexing.hs | 115 ----- src/runtime/haskell/PGF/LexingAGreek.hs | 310 ------------- src/runtime/haskell/pgf.cabal | 4 +- src/server/PGFService.hs | 79 +--- 12 files changed, 453 insertions(+), 561 deletions(-) create mode 100644 src/runtime/haskell-bind/PGF.hs create mode 100644 src/runtime/haskell-bind/PGF/Internal.hs delete mode 100644 src/runtime/haskell-bind/examples/pgf-hsbind-trans.hs delete mode 100644 src/runtime/haskell/PGF/Lexing.hs delete mode 100644 src/runtime/haskell/PGF/LexingAGreek.hs diff --git a/gf.cabal b/gf.cabal index 5ace12c34..c9f02c324 100644 --- a/gf.cabal +++ b/gf.cabal @@ -109,9 +109,6 @@ Library exposed-modules: PGF PGF.Internal - PGF.Lexing - PGF.LexingAGreek - PGF.Utilities PGF.Haskell other-modules: @@ -137,6 +134,7 @@ Library PGF.VisualizeTree PGF.ByteCode PGF.OldBinary + PGF.Utilities if flag(c-runtime) exposed-modules: PGF2 diff --git a/src/compiler/GF/Command/CommonCommands.hs b/src/compiler/GF/Command/CommonCommands.hs index 0cafad531..69ccaf325 100644 --- a/src/compiler/GF/Command/CommonCommands.hs +++ b/src/compiler/GF/Command/CommonCommands.hs @@ -219,7 +219,7 @@ commonCommands = fmap (mapCommandExec liftSIO) $ Map.fromList [ stringOps menv opts s = foldr (menvop . app) s (reverse opts) where - app f = maybe id id (stringOp f) + app f = maybe id id (stringOp (const False) f) menvop op = maybe op (\ (b,e) -> opInEnv b e op) menv envFlag fs = diff --git a/src/compiler/GF/Data/Utilities.hs b/src/compiler/GF/Data/Utilities.hs index eac315508..29ed329dc 100644 --- a/src/compiler/GF/Data/Utilities.hs +++ b/src/compiler/GF/Data/Utilities.hs @@ -12,12 +12,12 @@ ----------------------------------------------------------------------------- -module GF.Data.Utilities(module GF.Data.Utilities, module PGF.Utilities) where +module GF.Data.Utilities(module GF.Data.Utilities) where import Data.Maybe import Data.List import Control.Monad (MonadPlus(..),liftM,when) -import PGF.Utilities +import qualified Data.Set as Set -- * functions on lists @@ -190,3 +190,18 @@ joinS glue = concatS . intersperse (showString glue) +-- | Like 'Data.List.nub', but O(n log n) instead of O(n^2), since it uses a set to lookup previous things. +-- The result list is stable (the elements are returned in the order they occur), and lazy. +-- Requires that the list elements can be compared by Ord. +-- Code ruthlessly taken from +nub' :: Ord a => [a] -> [a] +nub' = loop Set.empty + where loop _ [] = [] + loop seen (x : xs) + | Set.member x seen = loop seen xs + | otherwise = x : loop (Set.insert x seen) xs + + +-- | Replace all occurences of an element by another element. +replace :: Eq a => a -> a -> [a] -> [a] +replace x y = map (\z -> if z == x then y else z) diff --git a/src/compiler/GF/Text/Lexing.hs b/src/compiler/GF/Text/Lexing.hs index 782e6ea9a..7195daacd 100644 --- a/src/compiler/GF/Text/Lexing.hs +++ b/src/compiler/GF/Text/Lexing.hs @@ -2,30 +2,30 @@ module GF.Text.Lexing (stringOp,opInEnv) where import GF.Text.Transliterations -import PGF.Lexing -import PGF.LexingAGreek(lexAGreek,unlexAGreek,lexAGreek2) -- HL 20.2.2016 -import Data.Char (isSpace) +import Data.Char (isSpace,toUpper,toLower) import Data.List (intersperse) -stringOp :: String -> Maybe (String -> String) -stringOp name = case name of +stringOp :: (String -> Bool) -> String -> Maybe (String -> String) +stringOp good name = case name of "chars" -> Just $ appLexer (filter (not . all isSpace) . map return) - "lextext" -> Just $ appLexer lexText + "lextext" -> Just $ appLexer (lexText good) "lexcode" -> Just $ appLexer lexCode - "lexmixed" -> Just $ appLexer lexMixed + "lexmixed" -> Just $ appLexer (lexMixed good) "lexgreek" -> Just $ appLexer lexAGreek "lexgreek2" -> Just $ appLexer lexAGreek2 "words" -> Just $ appLexer words "bind" -> Just $ appUnlexer (unwords . bindTok) "unchars" -> Just $ appUnlexer concat - "unlextext" -> Just $ appUnlexer (unlexText . unquote) + "unlextext" -> Just $ appUnlexer (unlexText . unquote . bindTok) "unlexcode" -> Just $ appUnlexer unlexCode - "unlexmixed" -> Just $ appUnlexer (unlexMixed . unquote) + "unlexmixed" -> Just $ appUnlexer (unlexMixed good . unquote . bindTok) "unlexgreek" -> Just $ appUnlexer unlexAGreek + "unlexnone" -> Just id + "unlexid" -> Just id "unwords" -> Just $ appUnlexer unwords "to_html" -> Just wrapHTML - _ -> transliterate name + _ -> transliterate name -- perform op in environments beg--end, t.ex. between "--" --- suboptimal implementation @@ -55,3 +55,395 @@ appUnlexer f = f . words wrapHTML :: String -> String wrapHTML = unlines . tag . intersperse "
" . lines where tag ss = "":"":"":"":"" : ss ++ ["",""] + + +-- * Text lexing +-- | Text lexing with standard word capitalization of the first word of every sentence +lexText :: (String -> Bool) -> String -> [String] +lexText good = lexText' (uncapitInit good) + +-- | Text lexing with custom treatment of the first word of every sentence. +lexText' :: (String->String) -> String -> [String] +lexText' uncap1 = uncap . lext where + lext s = case s of + c:cs | isMajorPunct c -> [c] : uncap (lext cs) + c:cs | isMinorPunct c -> [c] : lext cs + c:cs | isSpace c -> lext cs + _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs + _ -> [s] + uncap s = case s of + w:ws -> uncap1 w:ws + _ -> s + +unlexText :: [String] -> String +unlexText = capitInit . unlext where + unlext s = case s of + w:[] -> w + w:[c]:[] | isPunct c -> w ++ [c] + w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs) + w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs + w:ws -> w ++ " " ++ unlext ws + _ -> [] + +-- | Bind tokens separated by Prelude.BIND, i.e. &+ +bindTok :: [String] -> [String] +bindTok ws = case ws of + w1:"&+":w2:ws -> bindTok ((w1++w2):ws) + "&+":ws -> bindTok ws + "&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws) + "&|":ws -> bindTok ws + w:ws -> w:bindTok ws + [] -> [] + +-- * Code lexing + +-- | Haskell lexer, usable for much code +lexCode :: String -> [String] +lexCode ss = case lex ss of + [(w@(_:_),ws)] -> w : lexCode ws + _ -> [] + + +-- * Ancient Greek lexing + +lexTextAGreek :: String -> [String] +lexTextAGreek s = lext s where + lext s = case s of + c:cs | isAGreekPunct c -> [c] : (lext cs) + c:cs | isSpace c -> lext cs + _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s + in w : lext cs + [] -> [] + +-- Philological greek text may use vowel length indicators. Then '.' is not a sentence +-- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c. + +lexTextAGreek2 :: String -> [String] +lexTextAGreek2 s = lext s where + lext s = case s of + c:cs | isAGreekPunct c -> [c] : (lext cs) + c:cs | isSpace c -> lext cs + _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s + in case cs of + '.':'.':d:ds | isSpace d + -> (w++['.']) : lext ('.':d:ds) + '.':d:ds | isAGreekPunct d || isSpace d + -> (w++['.']) : lext (d:ds) + '.':d:ds | not (isSpace d) + -> case lext (d:ds) of + e:es -> (w++['.']++e) : es + es -> (w++['.']) : es + '.':[] -> (w++['.']) : [] + _ -> w : lext cs + [] -> [] + +unlexTextAGreek :: [String] -> String +unlexTextAGreek = unlext where + unlext s = case s of + w:[] -> w + w:[c]:[] | isAGreekPunct c -> w ++ [c] + w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs + w:ws -> w ++ " " ++ unlext ws + [] -> [] + +isAGreekPunct = flip elem ".,;··" -- colon: first version · not in charset, + -- second version · = 00B7 standard code point + +-- * Text lexing and unlexing for Ancient Greek: +-- 1. no capitalization of initial word, +-- 2. grave/acute accent switch on final syllables of words not followed by punctuation, +-- 3. accent move from/to support word to/from following clitic words (iterated). + +lexAGreek :: String -> [String] +lexAGreek = fromAGreek . lexTextAGreek + +lexAGreek2 :: String -> [String] +lexAGreek2 = fromAGreek . lexTextAGreek2 + +unlexAGreek :: [String] -> String +unlexAGreek = unlexTextAGreek . toAGreek + +-- Note: unlexAGreek does not glue punctuation with the previous word, so that short +-- vowel indication (like a.) differs from sentence end (a .). + +-- | normalize = change grave accent on sentence internal words to acute, +-- and shift inherited acutes to the following enclitic (where they are +-- visible only as shown in the list of enclitics above) + +normalize :: String -> String +normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek) + +fromAGreek :: [String] -> [String] +fromAGreek s = case s of + w:[]:vs -> w:[]:(fromAGreek vs) + w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs) + w:v:vs | wasEnclitic v && wasEnclitic w -> + getEnclitic w : fromAGreek (v:vs) + w:v:vs | wasEnclitic v && wasProclitic w -> -- "ei)' tines*" + getProclitic w : getEnclitic v : fromAGreek vs + w:v:vs | wasEnclitic v && (hasEndCircum w || + (hasEndAcute w && hasSingleAccent w)) -> + w : getEnclitic v : fromAGreek vs -- ok "sofoi' tines*" + w:v:vs | wasEnclitic v && hasPrefinalAcute w -> + w : getEnclitic v : fromAGreek vs + w:v:vs | wasEnclitic v && hasEndAcute w -> -- ok "a)'nvrwpoi' tines*" + dropLastAccent w : getEnclitic v : fromAGreek vs + w:v:vs | wasEnclitic w -> + getEnclitic w : fromAGreek (v:vs) + w:ws -> (toAcute w) : (fromAGreek ws) + ws -> ws + +-- | de-normalize = change acute accent of end syllables in sentence internal +-- (non-enclitic) words to grave accent, and move accents of enclitics to the +-- previous word to produce ordinary ancient greek + +denormalize :: String -> String +denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek) + +toAGreek :: [String] -> [String] +toAGreek s = case s of + w:[]:vs -> w:[]:(toAGreek vs) + w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs) -- w:[] for following -to_ancientgreek + w:v:vs | isEnclitic v && isEnclitic w -> + addAcute w : toAGreek (dropAccent v:vs) -- BR 11 Anm.2 + w:v:vs | isEnclitic v && isProclitic w -> -- BR 11 a.beta + addAcute w: (toAGreek (dropAccent v:vs)) + w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) -> + w:(toAGreek (dropAccent v:vs)) -- BR 11 a.alpha,beta + w:v:vs | isEnclitic v && hasPrefinalAcute w -> + w:v: toAGreek vs -- bisyllabic v keeps its accent BR 11 b. + w:v:vs | isEnclitic v -> + (addAcute w):(toAGreek (dropAccent v:vs)) -- BR 11 a.gamma + w:v:vs | isEnclitic w -> w:(toAGreek (v:vs)) + w:ws -> (toGrave w) : (toAGreek ws) + ws -> ws + +-- | Change accent on the final syllable of a word + +toGrave :: String -> String +toGrave = reverse . grave . reverse where + grave s = case s of + '\'':cs -> '`':cs + c:cs | isAGreekVowel c -> c:cs + c:cs -> c: grave cs + _ -> s + +toAcute :: String -> String +toAcute = reverse . acute . reverse where + acute s = case s of + '`':cs -> '\'':cs + c:cs | isAGreekVowel c -> c:cs + c:cs -> c: acute cs + _ -> s + +isAGreekVowel = flip elem "aeioyhw" + +-- | Accent moves for enclitics and proclitics (atona) + +enclitics = [ + "moy","moi","me", -- personal pronouns + "soy","soi","se", + "oy(","oi(","e(", + "tis*","ti","tina'", -- indefinite pronoun + "tino's*","tini'", + "tine's*","tina's*", + "tinw~n","tisi'","tisi'n", + "poy","poi", -- indefinite adverbs + "pove'n","pws*", + "ph|","pote'", + "ge","te","toi", -- particles + "nyn","per","pw" + -- suffix -"de" + -- praes.indik. of fhmi', ei)mi' (except fh's*, ei)~) + ] -- and more, BR 11 + +proclitics = [ + "o(","h(","oi(","ai(", -- articles + "e)n","ei)s*","e)x","e)k", -- prepositions + "ei)","w(s*", -- conjunctions + "oy)","oy)k","oy)c" -- negation + ] + +isEnclitic = flip elem enclitics +isProclitic = flip elem proclitics + +-- Check if a word is an enclitic or accented enclitic and extract the enclitic + +wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics) + ++ (map dropAccent (filter hasAccent enclitics)) + accented = (filter hasAccent enclitics) + ++ map addAcute (filter (not . hasAccent) enclitics) + in flip elem (accented ++ unaccented) + +wasProclitic = flip elem (map addAcute proclitics) + +getEnclitic = + let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics)) + ++ (map addAcute (filter (not . hasAccent) enclitics))) + (enclitics ++ (filter hasAccent enclitics) + ++ (filter (not . hasAccent) enclitics)) + find = \v -> lookup v pairs + in \v -> case (find v) of + Just x -> x + _ -> v +getProclitic = + let pairs = zip (map addAcute proclitics) proclitics + find = \v -> lookup v pairs + in \v -> case (find v) of + Just x -> x + _ -> v + +-- | Accent manipulation + +dropAccent = reverse . drop . reverse where + drop s = case s of + [] -> [] + '\'':cs -> cs + '`':cs -> cs + '~':cs -> cs + c:cs -> c:drop cs + +dropLastAccent = reverse . drop . reverse where + drop s = case s of + [] -> [] + '\'':cs -> cs + '`':cs -> cs + '~':cs -> cs + c:cs -> c:drop cs + +addAcute :: String -> String +addAcute = reverse . acu . reverse where + acu w = case w of + c:cs | c == '\'' -> c:cs + c:cs | c == '(' -> '\'':c:cs + c:cs | c == ')' -> '\'':c:cs + c:cs | isAGreekVowel c -> '\'':c:cs + c:cs -> c : acu cs + _ -> w + +-- | Accent checking on end syllables + +hasEndAcute = find . reverse where + find s = case s of + [] -> False + '\'':cs -> True + '`':cs -> False + '~':cs -> False + c:cs | isAGreekVowel c -> False + _:cs -> find cs + +hasEndCircum = find . reverse where + find s = case s of + [] -> False + '\'':cs -> False + '`':cs -> False + '~':cs -> True + c:cs | isAGreekVowel c -> False + _:cs -> find cs + +hasPrefinalAcute = find . reverse where + find s = case s of + [] -> False + '\'':cs -> False -- final acute + '`':cs -> False + '~':cs -> False + c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs + c:cs | isAGreekVowel c -> findNext cs + _:cs -> find cs where + findNext s = case s of + [] -> False + '\'':cs -> True -- prefinal acute + '`':cs -> False + '~':cs -> False + c:cs | isAGreekVowel c -> False + _:cs -> findNext cs where + +hasSingleAccent v = + hasAccent v && not (hasAccent (dropLastAccent v)) + +hasAccent v = case v of + [] -> False + c:cs -> elem c ['\'','`','~'] || hasAccent cs + +{- Tests: + +-- denormalization. Examples in BR 11 work: +-} +enclitics_expls = -- normalized + "sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n": -- a.alpha + "sofo's tis*":"sofoi' tine's*": -- a.beta + "ei) tis*":"ei) tine's*": + "a)'nvrwpos* tis*":"a)'nvrwpoi tine's*": -- a.gamma + "doy~los* tis*":"doy~loi tine's*": + "lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n": -- b. + "ei) poy tis* tina' i)'doi": -- Anm. 2. + [] + + +unlexCode :: [String] -> String +unlexCode s = case s of + w:[] -> w + [c]:cs | isParen c -> [c] ++ unlexCode cs + w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs + w:ws -> w ++ " " ++ unlexCode ws + _ -> [] + + +-- | LaTeX lexer in the math mode: \ should not be separated from the next word + +lexLatexCode :: String -> [String] +lexLatexCode = restoreBackslash . lexCode where --- quick hack: postprocess Haskell's lex + restoreBackslash ws = case ws of + "\\":w:ww -> ("\\" ++ w) : restoreBackslash ww + w:ww -> w:restoreBackslash ww + _ -> ws + +-- * Mixed lexing + +-- | LaTeX style lexer, with "math" environment using Code between $...$ +lexMixed :: (String -> Bool) -> String -> [String] +lexMixed good = concat . alternate False [] where + alternate env t s = case s of + '$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs + '\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs + c:cs -> alternate env (c:t) cs + _ -> [lex env (reverse t)] + lex env = if env then lexLatexCode else lexText good + +unlexMixed :: (String -> Bool) -> [String] -> String +unlexMixed good = capitInit . concat . alternate False where + alternate env s = case s of + _:_ -> case break (flip elem ["$","\\[","\\]","\\(","\\)"]) s of + (t,[]) -> unlex env t : [] + (t,c:m) -> unlex env t : sep env c m : alternate (not env) m + _ -> [] + unlex env = if env then unlexCode else (uncapitInit good . unlexText) + sep env c m = case (m,env) of + ([p]:_,True) | isPunct p -> c -- closing $ glued to next punct + (_, True) -> c ++ " " -- closing $ otherwise separated by space from what follows + _ -> " " ++ c -- put space before opening $ + +-- * Additional lexing uitilties + +-- | Capitalize first letter +capitInit s = case s of + c:cs -> toUpper c : cs + _ -> s + +-- | Uncapitalize first letter +uncapitInit good s = + case s of + c:cs | not (good s) -> toLower c : cs + _ -> s + +-- | Unquote each string wrapped in double quotes +unquote = map unq where + unq s = case s of + '"':cs@(_:_) | last cs == '"' -> init cs + _ -> s + +isPunct = flip elem ".?!,:;" +isMajorPunct = flip elem ".?!" +isMinorPunct = flip elem ",:;" +isParen = flip elem "()[]{}" +isClosing = flip elem ")]}" diff --git a/src/runtime/haskell-bind/PGF.hs b/src/runtime/haskell-bind/PGF.hs new file mode 100644 index 000000000..8aeca7ab8 --- /dev/null +++ b/src/runtime/haskell-bind/PGF.hs @@ -0,0 +1,3 @@ +module PGF(module PGF2) where + +import PGF2 diff --git a/src/runtime/haskell-bind/PGF/Internal.hs b/src/runtime/haskell-bind/PGF/Internal.hs new file mode 100644 index 000000000..e8193b788 --- /dev/null +++ b/src/runtime/haskell-bind/PGF/Internal.hs @@ -0,0 +1 @@ +module PGF.Internal where diff --git a/src/runtime/haskell-bind/examples/pgf-hsbind-trans.hs b/src/runtime/haskell-bind/examples/pgf-hsbind-trans.hs deleted file mode 100644 index 96808f906..000000000 --- a/src/runtime/haskell-bind/examples/pgf-hsbind-trans.hs +++ /dev/null @@ -1,61 +0,0 @@ --- | pgf-hsbind-trans: A simple batch translator to illustrate the use of the Haskell binding --- to the C implementation of the PGF run-time system. --- --- AR April 2015 modified from pgf-shell - -import PGF2 -import PGF.Lexing (lexText') - -import Data.Char(isSpace,toLower) -import Data.List (nub) -import System.Environment -import qualified Data.Map as Map - -maxNumTrees :: Int -maxNumTrees = 1 - -maxNumVariants :: Int -maxNumVariants = 1 - -main = getPGF =<< getArgs - -getPGF args = case args of - [path,from,to,cat,mxt,mxv] -> pgfTrans from to (Just cat) (read mxt, read mxv) =<< readPGF path - [path,from,to] -> pgfTrans from to Nothing (maxNumTrees,maxNumVariants) =<< readPGF path - _ -> putStrLn "Usage: pgf-hsbind-trans [ <#trees> <#variants>]" - -pgfTrans from to mcat mx pgf = do - cfrom <- getConcr' pgf from - cto <- getConcr' pgf to - let cat = maybe (startCat pgf) id mcat - interact (unlines . map (translates pgf cfrom cto cat mx) . lines) - -getConcr' pgf lang = - maybe (fail $ "Concrete syntax not found: "++show lang) return $ - Map.lookup lang (languages pgf) - -linearizeAndShow gr mxv (t,p) = [show t]++take mxv (nub (map unstar (linearizeAll gr t)))++[show p] - where - unstar s = case s of - '*':' ':cs -> cs - _ -> s - -translates pgf cfrom cto cat (mxt,mxv) s0 = - let s1 = lextext cfrom s0 - (s,p) = case reverse s1 of c:_ | elem c ".?!" -> (init s1,[c]) ; _ -> (s1,[]) -- separate final punctuation - in - case cparse pgf cfrom cat s of - Left tok -> unlines [s,"Parse error: "++tok] - Right ts -> unlines $ (("> "++ s):) $ take mxt $ map ((++p) . unlines . linearizeAndShow cto mxv) ts -- append punctuation - -cparse pgf concr cat input = parseWithHeuristics concr cat input (-1) callbacks where - callbacks = maybe [] cb $ lookup "App" literalCallbacks - cb fs = [(cat,f pgf ("TranslateEng",concr) input)|(cat,f)<-fs] - -lextext cnc = unwords . lexText' (\w -> case lookupMorpho cnc w of - _:_ -> w - _ -> case lookupMorpho cnc (uncapitInit w) of - [] -> w - _ -> uncapitInit w - ) - where uncapitInit (c:cs) = toLower c : cs diff --git a/src/runtime/haskell-bind/pgf2.cabal b/src/runtime/haskell-bind/pgf2.cabal index df1cc2b1a..8f29ea969 100644 --- a/src/runtime/haskell-bind/pgf2.cabal +++ b/src/runtime/haskell-bind/pgf2.cabal @@ -14,7 +14,9 @@ extra-source-files: README cabal-version: >=1.10 library - exposed-modules: PGF2, SG + exposed-modules: PGF2, SG, + -- backwards compatibility API: + PGF, PGF.Internal other-modules: PGF2.FFI, PGF2.Expr, PGF2.Type, SG.FFI build-depends: base >=4.3, bytestring >=0.9, containers, pretty diff --git a/src/runtime/haskell/PGF/Lexing.hs b/src/runtime/haskell/PGF/Lexing.hs deleted file mode 100644 index 4dc352792..000000000 --- a/src/runtime/haskell/PGF/Lexing.hs +++ /dev/null @@ -1,115 +0,0 @@ -module PGF.Lexing where -import Data.Char(isSpace,toLower,toUpper) - --- * Text lexing --- | Text lexing with standard word capitalization of the first word of every sentence -lexText :: String -> [String] -lexText = lexText' uncapitInit - --- | Text lexing with custom treatment of the first word of every sentence. -lexText' :: (String->String) -> String -> [String] -lexText' uncap1 = uncap . lext where - lext s = case s of - c:cs | isMajorPunct c -> [c] : uncap (lext cs) - c:cs | isMinorPunct c -> [c] : lext cs - c:cs | isSpace c -> lext cs - _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs - _ -> [s] - uncap s = case s of - w:ws -> uncap1 w:ws - _ -> s - -unlexText :: [String] -> String -unlexText = capitInit . unlext where - unlext s = case s of - w:[] -> w - w:[c]:[] | isPunct c -> w ++ [c] - w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs) - w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs - w:ws -> w ++ " " ++ unlext ws - _ -> [] - --- | Bind tokens separated by Prelude.BIND, i.e. &+ -bindTok :: [String] -> [String] -bindTok ws = case ws of - w1:"&+":w2:ws -> bindTok ((w1++w2):ws) - "&+":ws -> bindTok ws - "&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws) - "&|":ws -> bindTok ws - w:ws -> w:bindTok ws - [] -> [] - --- * Code lexing - --- | Haskell lexer, usable for much code -lexCode :: String -> [String] -lexCode ss = case lex ss of - [(w@(_:_),ws)] -> w : lexCode ws - _ -> [] - -unlexCode :: [String] -> String -unlexCode s = case s of - w:[] -> w - [c]:cs | isParen c -> [c] ++ unlexCode cs - w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs - w:ws -> w ++ " " ++ unlexCode ws - _ -> [] - - --- | LaTeX lexer in the math mode: \ should not be separated from the next word - -lexLatexCode :: String -> [String] -lexLatexCode = restoreBackslash . lexCode where --- quick hack: postprocess Haskell's lex - restoreBackslash ws = case ws of - "\\":w:ww -> ("\\" ++ w) : restoreBackslash ww - w:ww -> w:restoreBackslash ww - _ -> ws - --- * Mixed lexing - --- | LaTeX style lexer, with "math" environment using Code between $...$ -lexMixed :: String -> [String] -lexMixed = concat . alternate False [] where - alternate env t s = case s of - '$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs - '\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs - c:cs -> alternate env (c:t) cs - _ -> [lex env (reverse t)] - lex env = if env then lexLatexCode else lexText - -unlexMixed :: [String] -> String -unlexMixed = capitInit . concat . alternate False where - alternate env s = case s of - _:_ -> case break (flip elem ["$","\\[","\\]","\\(","\\)"]) s of - (t,[]) -> unlex env t : [] - (t,c:m) -> unlex env t : sep env c m : alternate (not env) m - _ -> [] - unlex env = if env then unlexCode else (uncapitInit . unlexText) - sep env c m = case (m,env) of - ([p]:_,True) | isPunct p -> c -- closing $ glued to next punct - (_, True) -> c ++ " " -- closing $ otherwise separated by space from what follows - _ -> " " ++ c -- put space before opening $ - --- * Additional lexing uitilties - --- | Capitalize first letter -capitInit s = case s of - c:cs -> toUpper c : cs - _ -> s - --- | Uncapitalize first letter -uncapitInit s = case s of - c:cs -> toLower c : cs - _ -> s - --- | Unquote each string wrapped in double quotes -unquote = map unq where - unq s = case s of - '"':cs@(_:_) | last cs == '"' -> init cs - _ -> s - -isPunct = flip elem ".?!,:;" -isMajorPunct = flip elem ".?!" -isMinorPunct = flip elem ",:;" -isParen = flip elem "()[]{}" -isClosing = flip elem ")]}" diff --git a/src/runtime/haskell/PGF/LexingAGreek.hs b/src/runtime/haskell/PGF/LexingAGreek.hs deleted file mode 100644 index a8ed19238..000000000 --- a/src/runtime/haskell/PGF/LexingAGreek.hs +++ /dev/null @@ -1,310 +0,0 @@ -module PGF.LexingAGreek where -- HL 2a1.2.2016 -import Data.Char(isSpace) - --- * Text lexing without word capitalization of the first word of every sentence. --- Greek sentences in (transliterated) texts don't start with capital character. - --- Ordinary greek text does not have vowel length indicators. We then use '.' as --- a sentence separator. - -lexTextAGreek :: String -> [String] -lexTextAGreek s = lext s where - lext s = case s of - c:cs | isAGreekPunct c -> [c] : (lext cs) - c:cs | isSpace c -> lext cs - _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s - in w : lext cs - [] -> [] - --- Philological greek text may use vowel length indicators. Then '.' is not a sentence --- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c. - -lexTextAGreek2 :: String -> [String] -lexTextAGreek2 s = lext s where - lext s = case s of - c:cs | isAGreekPunct c -> [c] : (lext cs) - c:cs | isSpace c -> lext cs - _:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s - in case cs of - '.':'.':d:ds | isSpace d - -> (w++['.']) : lext ('.':d:ds) - '.':d:ds | isAGreekPunct d || isSpace d - -> (w++['.']) : lext (d:ds) - '.':d:ds | not (isSpace d) - -> case lext (d:ds) of - e:es -> (w++['.']++e) : es - es -> (w++['.']) : es - '.':[] -> (w++['.']) : [] - _ -> w : lext cs - [] -> [] - -unlexTextAGreek :: [String] -> String -unlexTextAGreek = unlext where - unlext s = case s of - w:[] -> w - w:[c]:[] | isAGreekPunct c -> w ++ [c] - w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs - w:ws -> w ++ " " ++ unlext ws - [] -> [] - -isAGreekPunct = flip elem ".,;··" -- colon: first version · not in charset, - -- second version · = 00B7 standard code point - --- * Text lexing and unlexing for Ancient Greek: --- 1. no capitalization of initial word, --- 2. grave/acute accent switch on final syllables of words not followed by punctuation, --- 3. accent move from/to support word to/from following clitic words (iterated). - -lexAGreek :: String -> [String] -lexAGreek = fromAGreek . lexTextAGreek - -lexAGreek2 :: String -> [String] -lexAGreek2 = fromAGreek . lexTextAGreek2 - -unlexAGreek :: [String] -> String -unlexAGreek = unlexTextAGreek . toAGreek - --- Note: unlexAGreek does not glue punctuation with the previous word, so that short --- vowel indication (like a.) differs from sentence end (a .). - --- | normalize = change grave accent on sentence internal words to acute, --- and shift inherited acutes to the following enclitic (where they are --- visible only as shown in the list of enclitics above) - -normalize :: String -> String -normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek) - -fromAGreek :: [String] -> [String] -fromAGreek s = case s of - w:[]:vs -> w:[]:(fromAGreek vs) - w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs) - w:v:vs | wasEnclitic v && wasEnclitic w -> - getEnclitic w : fromAGreek (v:vs) - w:v:vs | wasEnclitic v && wasProclitic w -> -- "ei)' tines*" - getProclitic w : getEnclitic v : fromAGreek vs - w:v:vs | wasEnclitic v && (hasEndCircum w || - (hasEndAcute w && hasSingleAccent w)) -> - w : getEnclitic v : fromAGreek vs -- ok "sofoi' tines*" - w:v:vs | wasEnclitic v && hasPrefinalAcute w -> - w : getEnclitic v : fromAGreek vs - w:v:vs | wasEnclitic v && hasEndAcute w -> -- ok "a)'nvrwpoi' tines*" - dropLastAccent w : getEnclitic v : fromAGreek vs - w:v:vs | wasEnclitic w -> - getEnclitic w : fromAGreek (v:vs) - w:ws -> (toAcute w) : (fromAGreek ws) - ws -> ws - --- | de-normalize = change acute accent of end syllables in sentence internal --- (non-enclitic) words to grave accent, and move accents of enclitics to the --- previous word to produce ordinary ancient greek - -denormalize :: String -> String -denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek) - -toAGreek :: [String] -> [String] -toAGreek s = case s of - w:[]:vs -> w:[]:(toAGreek vs) - w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs) -- w:[] for following -to_ancientgreek - w:v:vs | isEnclitic v && isEnclitic w -> - addAcute w : toAGreek (dropAccent v:vs) -- BR 11 Anm.2 - w:v:vs | isEnclitic v && isProclitic w -> -- BR 11 a.beta - addAcute w: (toAGreek (dropAccent v:vs)) - w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) -> - w:(toAGreek (dropAccent v:vs)) -- BR 11 a.alpha,beta - w:v:vs | isEnclitic v && hasPrefinalAcute w -> - w:v: toAGreek vs -- bisyllabic v keeps its accent BR 11 b. - w:v:vs | isEnclitic v -> - (addAcute w):(toAGreek (dropAccent v:vs)) -- BR 11 a.gamma - w:v:vs | isEnclitic w -> w:(toAGreek (v:vs)) - w:ws -> (toGrave w) : (toAGreek ws) - ws -> ws - --- | Change accent on the final syllable of a word - -toGrave :: String -> String -toGrave = reverse . grave . reverse where - grave s = case s of - '\'':cs -> '`':cs - c:cs | isAGreekVowel c -> c:cs - c:cs -> c: grave cs - _ -> s - -toAcute :: String -> String -toAcute = reverse . acute . reverse where - acute s = case s of - '`':cs -> '\'':cs - c:cs | isAGreekVowel c -> c:cs - c:cs -> c: acute cs - _ -> s - -isAGreekVowel = flip elem "aeioyhw" - --- | Accent moves for enclitics and proclitics (atona) - -enclitics = [ - "moy","moi","me", -- personal pronouns - "soy","soi","se", - "oy(","oi(","e(", - "tis*","ti","tina'", -- indefinite pronoun - "tino's*","tini'", - "tine's*","tina's*", - "tinw~n","tisi'","tisi'n", - "poy","poi", -- indefinite adverbs - "pove'n","pws*", - "ph|","pote'", - "ge","te","toi", -- particles - "nyn","per","pw" - -- suffix -"de" - -- praes.indik. of fhmi', ei)mi' (except fh's*, ei)~) - ] -- and more, BR 11 - -proclitics = [ - "o(","h(","oi(","ai(", -- articles - "e)n","ei)s*","e)x","e)k", -- prepositions - "ei)","w(s*", -- conjunctions - "oy)","oy)k","oy)c" -- negation - ] - -isEnclitic = flip elem enclitics -isProclitic = flip elem proclitics - --- Check if a word is an enclitic or accented enclitic and extract the enclitic - -wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics) - ++ (map dropAccent (filter hasAccent enclitics)) - accented = (filter hasAccent enclitics) - ++ map addAcute (filter (not . hasAccent) enclitics) - in flip elem (accented ++ unaccented) - -wasProclitic = flip elem (map addAcute proclitics) - -getEnclitic = - let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics)) - ++ (map addAcute (filter (not . hasAccent) enclitics))) - (enclitics ++ (filter hasAccent enclitics) - ++ (filter (not . hasAccent) enclitics)) - find = \v -> lookup v pairs - in \v -> case (find v) of - Just x -> x - _ -> v -getProclitic = - let pairs = zip (map addAcute proclitics) proclitics - find = \v -> lookup v pairs - in \v -> case (find v) of - Just x -> x - _ -> v - --- | Accent manipulation - -dropAccent = reverse . drop . reverse where - drop s = case s of - [] -> [] - '\'':cs -> cs - '`':cs -> cs - '~':cs -> cs - c:cs -> c:drop cs - -dropLastAccent = reverse . drop . reverse where - drop s = case s of - [] -> [] - '\'':cs -> cs - '`':cs -> cs - '~':cs -> cs - c:cs -> c:drop cs - -addAcute :: String -> String -addAcute = reverse . acu . reverse where - acu w = case w of - c:cs | c == '\'' -> c:cs - c:cs | c == '(' -> '\'':c:cs - c:cs | c == ')' -> '\'':c:cs - c:cs | isAGreekVowel c -> '\'':c:cs - c:cs -> c : acu cs - _ -> w - --- | Accent checking on end syllables - -hasEndAcute = find . reverse where - find s = case s of - [] -> False - '\'':cs -> True - '`':cs -> False - '~':cs -> False - c:cs | isAGreekVowel c -> False - _:cs -> find cs - -hasEndCircum = find . reverse where - find s = case s of - [] -> False - '\'':cs -> False - '`':cs -> False - '~':cs -> True - c:cs | isAGreekVowel c -> False - _:cs -> find cs - -hasPrefinalAcute = find . reverse where - find s = case s of - [] -> False - '\'':cs -> False -- final acute - '`':cs -> False - '~':cs -> False - c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs - c:cs | isAGreekVowel c -> findNext cs - _:cs -> find cs where - findNext s = case s of - [] -> False - '\'':cs -> True -- prefinal acute - '`':cs -> False - '~':cs -> False - c:cs | isAGreekVowel c -> False - _:cs -> findNext cs where - -hasSingleAccent v = - hasAccent v && not (hasAccent (dropLastAccent v)) - -hasAccent v = case v of - [] -> False - c:cs -> elem c ['\'','`','~'] || hasAccent cs - -{- Tests: - --- denormalization. Examples in BR 11 work: --} -enclitics_expls = -- normalized - "sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n": -- a.alpha - "sofo's tis*":"sofoi' tine's*": -- a.beta - "ei) tis*":"ei) tine's*": - "a)'nvrwpos* tis*":"a)'nvrwpoi tine's*": -- a.gamma - "doy~los* tis*":"doy~loi tine's*": - "lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n": -- b. - "ei) poy tis* tina' i)'doi": -- Anm. 2. - [] -{- -test = map denormalize enclitics_expls - -*PGF.LexingAGreek> test - ["sofw~n tis*","sofw~n tines*","sof~n tinwn", - "sofo's tis*","sofoi' tines*", - "ei)' tis*","ei)' tines*", - "a)'nvrwpo's* tis*","a)'nvrwpoi' tines*", - "doy~lo's* tis*","doy~loi' tines*", - "lo'gos* tis* ","lo'goi tine's*","lo'gwn tinw~n ", - "ei)' poy' ti's* tina i)'doi"] - --- normalization: - -*PGF.LexingAGreek> map normalize test - ["sofw~n tis*","sofw~n tine's*","sof~n tinw~n", - "sofo's tis*","sofoi' tine's*", - "ei) tis*","ei) tine's*", - "a)'nvrwpos* tis*","a)'nvrwpoi tine's*", - "doy~los* tis*","doy~loi tine's*", - "lo'gos* tis*","lo'goi tine's*","lo'gwn tinw~n", - "ei) poy tis* tina' i)'doi"] - -*PGF.LexingAGreek> map (normalize . denormalize) enclitics_expls == enclitics_expls -True -*PGF.LexingAGreek> map (denormalize . normalize) test == test -True - --} diff --git a/src/runtime/haskell/pgf.cabal b/src/runtime/haskell/pgf.cabal index 58f362d1b..35e2a84e1 100644 --- a/src/runtime/haskell/pgf.cabal +++ b/src/runtime/haskell/pgf.cabal @@ -50,9 +50,6 @@ Library exposed-modules: PGF PGF.Internal - PGF.Lexing - PGF.LexingAGreek - PGF.Utilities PGF.Haskell other-modules: @@ -78,3 +75,4 @@ Library PGF.VisualizeTree PGF.ByteCode PGF.OldBinary + PGF.Utilities diff --git a/src/server/PGFService.hs b/src/server/PGFService.hs index 9d8511915..b1020b4b8 100644 --- a/src/server/PGFService.hs +++ b/src/server/PGFService.hs @@ -4,8 +4,8 @@ module PGFService(cgiMain,cgiMain',getPath, Caches,pgfCache,newPGFCache,flushPGFCache,listPGFCache) where import PGF (PGF,Labels,CncLabels) +import GF.Text.Lexing import qualified PGF -import PGF.Lexing import Cache import CGIUtils(outputJSONP,outputPlain,outputHTML,outputText, outputBinary,outputBinary', @@ -272,8 +272,11 @@ cpgfMain qsem command (t,(pgf,pc)) = maybe (Left ("["++w++"]")) Right $ msum [parse1 w,parse1 ow,morph w,morph ow] where - ow = if w==lw then capitInit w else lw - lw = uncapitInit w + ow = case w of + c:cs | isLower c -> toUpper c : cs + | isUpper c -> toLower c : cs + s -> s + parse1 = either (const Nothing) (fmap fst . listToMaybe) . C.parse concr cat morph w = listToMaybe @@ -293,7 +296,7 @@ cpgfMain qsem command (t,(pgf,pc)) = from1 = maybe (missing "from") return =<< from' from' = getLang "from" - to = (,) # getLangs "to" % unlexerC + to = (,) # getLangs "to" % unlexer (const False) getLangs = getLangs' readLang getLang = getLang' readLang @@ -308,8 +311,7 @@ cpgfMain qsem command (t,(pgf,pc)) = let t = C.readExpr s maybe (badRequest "bad tree" s) return t - --c_lexer concr = lexer - c_lexer concr = ilexer (not . null . C.lookupMorpho concr) + c_lexer concr = lexer (not . null . C.lookupMorpho concr) -------------------------------------------------------------------------------- @@ -338,62 +340,29 @@ instance ToATree C.Expr where -------------------------------------------------------------------------------- -- * Lexing --- | Lexers with a text lexer that tries to be a more clever with the first word -ilexer good = lexer' uncap - where - uncap s = case span isUpper s of - ([c],r) | not (good s) -> toLower c:r - _ -> s - -- | Standard lexers -lexer = lexer' uncapitInit - -lexer' uncap = maybe (return id) lexerfun =<< getInput "lexer" +lexer good = maybe (return id) lexerfun =<< getInput "lexer" where lexerfun name = - case name of - "text" -> return (unwords . lexText' uncap) - "code" -> return (unwords . lexCode) - "mixed" -> return (unwords . lexMixed) - _ -> badRequest "Unknown lexer" name + case stringOp good ("lex"++name) of + Just fn -> return fn + Nothing -> badRequest "Unknown lexer" name type Unlexer = String->String -- | Unlexing for the C runtime system, &+ is already applied -unlexerC :: CGI Unlexer -unlexerC = maybe (return id) unlexerfun =<< getInput "unlexer" +unlexer :: (String -> Bool) -> CGI Unlexer +unlexer good = maybe (return id) unlexerfun =<< getInput "unlexer" where unlexerfun name = - case name of - "text" -> return (unlexText' . words) - "code" -> return (unlexCode . words) - "mixed" -> return (unlexMixed . words) - "none" -> return id - "id" -> return id - _ -> badRequest "Unknown lexer" name - --- | Unlex text, skipping the quality marker used by the App grammar -unlexText' ("+":ws) = "+ "++unlexText ws -unlexText' ("*":ws) = "* "++unlexText ws -unlexText' ws = unlexText ws - --- | Unlexing for the Haskell run-time, applying the &+ operator first -unlexerH :: CGI Unlexer -unlexerH = maybe (return doBind) unlexerfun =<< getInput "unlexer" - where - unlexerfun name = - case name of - "text" -> return (unlexText' . bind) - "code" -> return (unlexCode . bind) - "mixed" -> return (unlexMixed . bind) - "none" -> return id - "id" -> return id - "bind" -> return doBind - _ -> badRequest "Unknown lexer" name - - doBind = unwords . bind - bind = bindTok . words + case stringOp good ("unlex"++name) of + Just fn -> return (fn . cleanMarker) + Nothing -> badRequest "Unknown unlexer" name + + cleanMarker ('+':cs) = cs + cleanMarker ('*':cs) = cs + cleanMarker cs = cs -------------------------------------------------------------------------------- -- * Haskell run-time functionality @@ -431,8 +400,8 @@ pgfMain lcs@(alc,clc) path command tpgf@(t,pgf) = inp <- textInput return (fr,lex inp) - mlexer Nothing = lexer - mlexer (Just lang) = ilexer (PGF.isInMorpho morpho) + mlexer Nothing = lexer (const False) + mlexer (Just lang) = lexer (PGF.isInMorpho morpho) where morpho = PGF.buildMorpho pgf lang tree :: CGI PGF.Tree @@ -489,7 +458,7 @@ pgfMain lcs@(alc,clc) path command tpgf@(t,pgf) = from = getLang "from" to1 = maybe (missing "to") return =<< getLang "to" - to = (,) # getLangs "to" % unlexerH + to = (,) # getLangs "to" % unlexer (const False) getLangs = getLangs' readLang getLang = getLang' readLang