eliminate modules PGF.Lexing, PGF.LexingAGreek. Make PGF.Utilities an internal module in the runtime. These are not really part of the core runtime.

This commit is contained in:
Krasimir Angelov
2017-09-04 11:43:37 +02:00
parent cae52bb9af
commit 1f908fa7bf
12 changed files with 453 additions and 561 deletions

View File

@@ -109,9 +109,6 @@ Library
exposed-modules: exposed-modules:
PGF PGF
PGF.Internal PGF.Internal
PGF.Lexing
PGF.LexingAGreek
PGF.Utilities
PGF.Haskell PGF.Haskell
other-modules: other-modules:
@@ -137,6 +134,7 @@ Library
PGF.VisualizeTree PGF.VisualizeTree
PGF.ByteCode PGF.ByteCode
PGF.OldBinary PGF.OldBinary
PGF.Utilities
if flag(c-runtime) if flag(c-runtime)
exposed-modules: PGF2 exposed-modules: PGF2

View File

@@ -219,7 +219,7 @@ commonCommands = fmap (mapCommandExec liftSIO) $ Map.fromList [
stringOps menv opts s = foldr (menvop . app) s (reverse opts) stringOps menv opts s = foldr (menvop . app) s (reverse opts)
where where
app f = maybe id id (stringOp f) app f = maybe id id (stringOp (const False) f)
menvop op = maybe op (\ (b,e) -> opInEnv b e op) menv menvop op = maybe op (\ (b,e) -> opInEnv b e op) menv
envFlag fs = envFlag fs =

View File

@@ -12,12 +12,12 @@
----------------------------------------------------------------------------- -----------------------------------------------------------------------------
module GF.Data.Utilities(module GF.Data.Utilities, module PGF.Utilities) where module GF.Data.Utilities(module GF.Data.Utilities) where
import Data.Maybe import Data.Maybe
import Data.List import Data.List
import Control.Monad (MonadPlus(..),liftM,when) import Control.Monad (MonadPlus(..),liftM,when)
import PGF.Utilities import qualified Data.Set as Set
-- * functions on lists -- * functions on lists
@@ -190,3 +190,18 @@ joinS glue = concatS . intersperse (showString glue)
-- | Like 'Data.List.nub', but O(n log n) instead of O(n^2), since it uses a set to lookup previous things.
-- The result list is stable (the elements are returned in the order they occur), and lazy.
-- Requires that the list elements can be compared by Ord.
-- Code ruthlessly taken from <http://hpaste.org/54411>
nub' :: Ord a => [a] -> [a]
nub' = loop Set.empty
where loop _ [] = []
loop seen (x : xs)
| Set.member x seen = loop seen xs
| otherwise = x : loop (Set.insert x seen) xs
-- | Replace all occurences of an element by another element.
replace :: Eq a => a -> a -> [a] -> [a]
replace x y = map (\z -> if z == x then y else z)

View File

@@ -2,30 +2,30 @@
module GF.Text.Lexing (stringOp,opInEnv) where module GF.Text.Lexing (stringOp,opInEnv) where
import GF.Text.Transliterations import GF.Text.Transliterations
import PGF.Lexing
import PGF.LexingAGreek(lexAGreek,unlexAGreek,lexAGreek2) -- HL 20.2.2016
import Data.Char (isSpace) import Data.Char (isSpace,toUpper,toLower)
import Data.List (intersperse) import Data.List (intersperse)
stringOp :: String -> Maybe (String -> String) stringOp :: (String -> Bool) -> String -> Maybe (String -> String)
stringOp name = case name of stringOp good name = case name of
"chars" -> Just $ appLexer (filter (not . all isSpace) . map return) "chars" -> Just $ appLexer (filter (not . all isSpace) . map return)
"lextext" -> Just $ appLexer lexText "lextext" -> Just $ appLexer (lexText good)
"lexcode" -> Just $ appLexer lexCode "lexcode" -> Just $ appLexer lexCode
"lexmixed" -> Just $ appLexer lexMixed "lexmixed" -> Just $ appLexer (lexMixed good)
"lexgreek" -> Just $ appLexer lexAGreek "lexgreek" -> Just $ appLexer lexAGreek
"lexgreek2" -> Just $ appLexer lexAGreek2 "lexgreek2" -> Just $ appLexer lexAGreek2
"words" -> Just $ appLexer words "words" -> Just $ appLexer words
"bind" -> Just $ appUnlexer (unwords . bindTok) "bind" -> Just $ appUnlexer (unwords . bindTok)
"unchars" -> Just $ appUnlexer concat "unchars" -> Just $ appUnlexer concat
"unlextext" -> Just $ appUnlexer (unlexText . unquote) "unlextext" -> Just $ appUnlexer (unlexText . unquote . bindTok)
"unlexcode" -> Just $ appUnlexer unlexCode "unlexcode" -> Just $ appUnlexer unlexCode
"unlexmixed" -> Just $ appUnlexer (unlexMixed . unquote) "unlexmixed" -> Just $ appUnlexer (unlexMixed good . unquote . bindTok)
"unlexgreek" -> Just $ appUnlexer unlexAGreek "unlexgreek" -> Just $ appUnlexer unlexAGreek
"unlexnone" -> Just id
"unlexid" -> Just id
"unwords" -> Just $ appUnlexer unwords "unwords" -> Just $ appUnlexer unwords
"to_html" -> Just wrapHTML "to_html" -> Just wrapHTML
_ -> transliterate name _ -> transliterate name
-- perform op in environments beg--end, t.ex. between "--" -- perform op in environments beg--end, t.ex. between "--"
--- suboptimal implementation --- suboptimal implementation
@@ -55,3 +55,395 @@ appUnlexer f = f . words
wrapHTML :: String -> String wrapHTML :: String -> String
wrapHTML = unlines . tag . intersperse "<br>" . lines where wrapHTML = unlines . tag . intersperse "<br>" . lines where
tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"] tag ss = "<html>":"<head>":"<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />":"</head>":"<body>" : ss ++ ["</body>","</html>"]
-- * Text lexing
-- | Text lexing with standard word capitalization of the first word of every sentence
lexText :: (String -> Bool) -> String -> [String]
lexText good = lexText' (uncapitInit good)
-- | Text lexing with custom treatment of the first word of every sentence.
lexText' :: (String->String) -> String -> [String]
lexText' uncap1 = uncap . lext where
lext s = case s of
c:cs | isMajorPunct c -> [c] : uncap (lext cs)
c:cs | isMinorPunct c -> [c] : lext cs
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
_ -> [s]
uncap s = case s of
w:ws -> uncap1 w:ws
_ -> s
unlexText :: [String] -> String
unlexText = capitInit . unlext where
unlext s = case s of
w:[] -> w
w:[c]:[] | isPunct c -> w ++ [c]
w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
w:ws -> w ++ " " ++ unlext ws
_ -> []
-- | Bind tokens separated by Prelude.BIND, i.e. &+
bindTok :: [String] -> [String]
bindTok ws = case ws of
w1:"&+":w2:ws -> bindTok ((w1++w2):ws)
"&+":ws -> bindTok ws
"&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws)
"&|":ws -> bindTok ws
w:ws -> w:bindTok ws
[] -> []
-- * Code lexing
-- | Haskell lexer, usable for much code
lexCode :: String -> [String]
lexCode ss = case lex ss of
[(w@(_:_),ws)] -> w : lexCode ws
_ -> []
-- * Ancient Greek lexing
lexTextAGreek :: String -> [String]
lexTextAGreek s = lext s where
lext s = case s of
c:cs | isAGreekPunct c -> [c] : (lext cs)
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s
in w : lext cs
[] -> []
-- Philological greek text may use vowel length indicators. Then '.' is not a sentence
-- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c.
lexTextAGreek2 :: String -> [String]
lexTextAGreek2 s = lext s where
lext s = case s of
c:cs | isAGreekPunct c -> [c] : (lext cs)
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s
in case cs of
'.':'.':d:ds | isSpace d
-> (w++['.']) : lext ('.':d:ds)
'.':d:ds | isAGreekPunct d || isSpace d
-> (w++['.']) : lext (d:ds)
'.':d:ds | not (isSpace d)
-> case lext (d:ds) of
e:es -> (w++['.']++e) : es
es -> (w++['.']) : es
'.':[] -> (w++['.']) : []
_ -> w : lext cs
[] -> []
unlexTextAGreek :: [String] -> String
unlexTextAGreek = unlext where
unlext s = case s of
w:[] -> w
w:[c]:[] | isAGreekPunct c -> w ++ [c]
w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs
w:ws -> w ++ " " ++ unlext ws
[] -> []
isAGreekPunct = flip elem ".,;··" -- colon: first version · not in charset,
-- second version · = 00B7 standard code point
-- * Text lexing and unlexing for Ancient Greek:
-- 1. no capitalization of initial word,
-- 2. grave/acute accent switch on final syllables of words not followed by punctuation,
-- 3. accent move from/to support word to/from following clitic words (iterated).
lexAGreek :: String -> [String]
lexAGreek = fromAGreek . lexTextAGreek
lexAGreek2 :: String -> [String]
lexAGreek2 = fromAGreek . lexTextAGreek2
unlexAGreek :: [String] -> String
unlexAGreek = unlexTextAGreek . toAGreek
-- Note: unlexAGreek does not glue punctuation with the previous word, so that short
-- vowel indication (like a.) differs from sentence end (a .).
-- | normalize = change grave accent on sentence internal words to acute,
-- and shift inherited acutes to the following enclitic (where they are
-- visible only as shown in the list of enclitics above)
normalize :: String -> String
normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek)
fromAGreek :: [String] -> [String]
fromAGreek s = case s of
w:[]:vs -> w:[]:(fromAGreek vs)
w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs)
w:v:vs | wasEnclitic v && wasEnclitic w ->
getEnclitic w : fromAGreek (v:vs)
w:v:vs | wasEnclitic v && wasProclitic w -> -- "ei)' tines*"
getProclitic w : getEnclitic v : fromAGreek vs
w:v:vs | wasEnclitic v && (hasEndCircum w ||
(hasEndAcute w && hasSingleAccent w)) ->
w : getEnclitic v : fromAGreek vs -- ok "sofoi' tines*"
w:v:vs | wasEnclitic v && hasPrefinalAcute w ->
w : getEnclitic v : fromAGreek vs
w:v:vs | wasEnclitic v && hasEndAcute w -> -- ok "a)'nvrwpoi' tines*"
dropLastAccent w : getEnclitic v : fromAGreek vs
w:v:vs | wasEnclitic w ->
getEnclitic w : fromAGreek (v:vs)
w:ws -> (toAcute w) : (fromAGreek ws)
ws -> ws
-- | de-normalize = change acute accent of end syllables in sentence internal
-- (non-enclitic) words to grave accent, and move accents of enclitics to the
-- previous word to produce ordinary ancient greek
denormalize :: String -> String
denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek)
toAGreek :: [String] -> [String]
toAGreek s = case s of
w:[]:vs -> w:[]:(toAGreek vs)
w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs) -- w:[] for following -to_ancientgreek
w:v:vs | isEnclitic v && isEnclitic w ->
addAcute w : toAGreek (dropAccent v:vs) -- BR 11 Anm.2
w:v:vs | isEnclitic v && isProclitic w -> -- BR 11 a.beta
addAcute w: (toAGreek (dropAccent v:vs))
w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) ->
w:(toAGreek (dropAccent v:vs)) -- BR 11 a.alpha,beta
w:v:vs | isEnclitic v && hasPrefinalAcute w ->
w:v: toAGreek vs -- bisyllabic v keeps its accent BR 11 b.
w:v:vs | isEnclitic v ->
(addAcute w):(toAGreek (dropAccent v:vs)) -- BR 11 a.gamma
w:v:vs | isEnclitic w -> w:(toAGreek (v:vs))
w:ws -> (toGrave w) : (toAGreek ws)
ws -> ws
-- | Change accent on the final syllable of a word
toGrave :: String -> String
toGrave = reverse . grave . reverse where
grave s = case s of
'\'':cs -> '`':cs
c:cs | isAGreekVowel c -> c:cs
c:cs -> c: grave cs
_ -> s
toAcute :: String -> String
toAcute = reverse . acute . reverse where
acute s = case s of
'`':cs -> '\'':cs
c:cs | isAGreekVowel c -> c:cs
c:cs -> c: acute cs
_ -> s
isAGreekVowel = flip elem "aeioyhw"
-- | Accent moves for enclitics and proclitics (atona)
enclitics = [
"moy","moi","me", -- personal pronouns
"soy","soi","se",
"oy(","oi(","e(",
"tis*","ti","tina'", -- indefinite pronoun
"tino's*","tini'",
"tine's*","tina's*",
"tinw~n","tisi'","tisi'n",
"poy","poi", -- indefinite adverbs
"pove'n","pws*",
"ph|","pote'",
"ge","te","toi", -- particles
"nyn","per","pw"
-- suffix -"de"
-- praes.indik. of fhmi', ei)mi' (except fh's*, ei)~)
] -- and more, BR 11
proclitics = [
"o(","h(","oi(","ai(", -- articles
"e)n","ei)s*","e)x","e)k", -- prepositions
"ei)","w(s*", -- conjunctions
"oy)","oy)k","oy)c" -- negation
]
isEnclitic = flip elem enclitics
isProclitic = flip elem proclitics
-- Check if a word is an enclitic or accented enclitic and extract the enclitic
wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics)
++ (map dropAccent (filter hasAccent enclitics))
accented = (filter hasAccent enclitics)
++ map addAcute (filter (not . hasAccent) enclitics)
in flip elem (accented ++ unaccented)
wasProclitic = flip elem (map addAcute proclitics)
getEnclitic =
let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics))
++ (map addAcute (filter (not . hasAccent) enclitics)))
(enclitics ++ (filter hasAccent enclitics)
++ (filter (not . hasAccent) enclitics))
find = \v -> lookup v pairs
in \v -> case (find v) of
Just x -> x
_ -> v
getProclitic =
let pairs = zip (map addAcute proclitics) proclitics
find = \v -> lookup v pairs
in \v -> case (find v) of
Just x -> x
_ -> v
-- | Accent manipulation
dropAccent = reverse . drop . reverse where
drop s = case s of
[] -> []
'\'':cs -> cs
'`':cs -> cs
'~':cs -> cs
c:cs -> c:drop cs
dropLastAccent = reverse . drop . reverse where
drop s = case s of
[] -> []
'\'':cs -> cs
'`':cs -> cs
'~':cs -> cs
c:cs -> c:drop cs
addAcute :: String -> String
addAcute = reverse . acu . reverse where
acu w = case w of
c:cs | c == '\'' -> c:cs
c:cs | c == '(' -> '\'':c:cs
c:cs | c == ')' -> '\'':c:cs
c:cs | isAGreekVowel c -> '\'':c:cs
c:cs -> c : acu cs
_ -> w
-- | Accent checking on end syllables
hasEndAcute = find . reverse where
find s = case s of
[] -> False
'\'':cs -> True
'`':cs -> False
'~':cs -> False
c:cs | isAGreekVowel c -> False
_:cs -> find cs
hasEndCircum = find . reverse where
find s = case s of
[] -> False
'\'':cs -> False
'`':cs -> False
'~':cs -> True
c:cs | isAGreekVowel c -> False
_:cs -> find cs
hasPrefinalAcute = find . reverse where
find s = case s of
[] -> False
'\'':cs -> False -- final acute
'`':cs -> False
'~':cs -> False
c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs
c:cs | isAGreekVowel c -> findNext cs
_:cs -> find cs where
findNext s = case s of
[] -> False
'\'':cs -> True -- prefinal acute
'`':cs -> False
'~':cs -> False
c:cs | isAGreekVowel c -> False
_:cs -> findNext cs where
hasSingleAccent v =
hasAccent v && not (hasAccent (dropLastAccent v))
hasAccent v = case v of
[] -> False
c:cs -> elem c ['\'','`','~'] || hasAccent cs
{- Tests:
-- denormalization. Examples in BR 11 work:
-}
enclitics_expls = -- normalized
"sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n": -- a.alpha
"sofo's tis*":"sofoi' tine's*": -- a.beta
"ei) tis*":"ei) tine's*":
"a)'nvrwpos* tis*":"a)'nvrwpoi tine's*": -- a.gamma
"doy~los* tis*":"doy~loi tine's*":
"lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n": -- b.
"ei) poy tis* tina' i)'doi": -- Anm. 2.
[]
unlexCode :: [String] -> String
unlexCode s = case s of
w:[] -> w
[c]:cs | isParen c -> [c] ++ unlexCode cs
w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
w:ws -> w ++ " " ++ unlexCode ws
_ -> []
-- | LaTeX lexer in the math mode: \ should not be separated from the next word
lexLatexCode :: String -> [String]
lexLatexCode = restoreBackslash . lexCode where --- quick hack: postprocess Haskell's lex
restoreBackslash ws = case ws of
"\\":w:ww -> ("\\" ++ w) : restoreBackslash ww
w:ww -> w:restoreBackslash ww
_ -> ws
-- * Mixed lexing
-- | LaTeX style lexer, with "math" environment using Code between $...$
lexMixed :: (String -> Bool) -> String -> [String]
lexMixed good = concat . alternate False [] where
alternate env t s = case s of
'$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs
'\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs
c:cs -> alternate env (c:t) cs
_ -> [lex env (reverse t)]
lex env = if env then lexLatexCode else lexText good
unlexMixed :: (String -> Bool) -> [String] -> String
unlexMixed good = capitInit . concat . alternate False where
alternate env s = case s of
_:_ -> case break (flip elem ["$","\\[","\\]","\\(","\\)"]) s of
(t,[]) -> unlex env t : []
(t,c:m) -> unlex env t : sep env c m : alternate (not env) m
_ -> []
unlex env = if env then unlexCode else (uncapitInit good . unlexText)
sep env c m = case (m,env) of
([p]:_,True) | isPunct p -> c -- closing $ glued to next punct
(_, True) -> c ++ " " -- closing $ otherwise separated by space from what follows
_ -> " " ++ c -- put space before opening $
-- * Additional lexing uitilties
-- | Capitalize first letter
capitInit s = case s of
c:cs -> toUpper c : cs
_ -> s
-- | Uncapitalize first letter
uncapitInit good s =
case s of
c:cs | not (good s) -> toLower c : cs
_ -> s
-- | Unquote each string wrapped in double quotes
unquote = map unq where
unq s = case s of
'"':cs@(_:_) | last cs == '"' -> init cs
_ -> s
isPunct = flip elem ".?!,:;"
isMajorPunct = flip elem ".?!"
isMinorPunct = flip elem ",:;"
isParen = flip elem "()[]{}"
isClosing = flip elem ")]}"

View File

@@ -0,0 +1,3 @@
module PGF(module PGF2) where
import PGF2

View File

@@ -0,0 +1 @@
module PGF.Internal where

View File

@@ -1,61 +0,0 @@
-- | pgf-hsbind-trans: A simple batch translator to illustrate the use of the Haskell binding
-- to the C implementation of the PGF run-time system.
--
-- AR April 2015 modified from pgf-shell
import PGF2
import PGF.Lexing (lexText')
import Data.Char(isSpace,toLower)
import Data.List (nub)
import System.Environment
import qualified Data.Map as Map
maxNumTrees :: Int
maxNumTrees = 1
maxNumVariants :: Int
maxNumVariants = 1
main = getPGF =<< getArgs
getPGF args = case args of
[path,from,to,cat,mxt,mxv] -> pgfTrans from to (Just cat) (read mxt, read mxv) =<< readPGF path
[path,from,to] -> pgfTrans from to Nothing (maxNumTrees,maxNumVariants) =<< readPGF path
_ -> putStrLn "Usage: pgf-hsbind-trans <path to pgf> <from-lang> <to-lang> [<cat> <#trees> <#variants>]"
pgfTrans from to mcat mx pgf = do
cfrom <- getConcr' pgf from
cto <- getConcr' pgf to
let cat = maybe (startCat pgf) id mcat
interact (unlines . map (translates pgf cfrom cto cat mx) . lines)
getConcr' pgf lang =
maybe (fail $ "Concrete syntax not found: "++show lang) return $
Map.lookup lang (languages pgf)
linearizeAndShow gr mxv (t,p) = [show t]++take mxv (nub (map unstar (linearizeAll gr t)))++[show p]
where
unstar s = case s of
'*':' ':cs -> cs
_ -> s
translates pgf cfrom cto cat (mxt,mxv) s0 =
let s1 = lextext cfrom s0
(s,p) = case reverse s1 of c:_ | elem c ".?!" -> (init s1,[c]) ; _ -> (s1,[]) -- separate final punctuation
in
case cparse pgf cfrom cat s of
Left tok -> unlines [s,"Parse error: "++tok]
Right ts -> unlines $ (("> "++ s):) $ take mxt $ map ((++p) . unlines . linearizeAndShow cto mxv) ts -- append punctuation
cparse pgf concr cat input = parseWithHeuristics concr cat input (-1) callbacks where
callbacks = maybe [] cb $ lookup "App" literalCallbacks
cb fs = [(cat,f pgf ("TranslateEng",concr) input)|(cat,f)<-fs]
lextext cnc = unwords . lexText' (\w -> case lookupMorpho cnc w of
_:_ -> w
_ -> case lookupMorpho cnc (uncapitInit w) of
[] -> w
_ -> uncapitInit w
)
where uncapitInit (c:cs) = toLower c : cs

View File

@@ -14,7 +14,9 @@ extra-source-files: README
cabal-version: >=1.10 cabal-version: >=1.10
library library
exposed-modules: PGF2, SG exposed-modules: PGF2, SG,
-- backwards compatibility API:
PGF, PGF.Internal
other-modules: PGF2.FFI, PGF2.Expr, PGF2.Type, SG.FFI other-modules: PGF2.FFI, PGF2.Expr, PGF2.Type, SG.FFI
build-depends: base >=4.3, bytestring >=0.9, build-depends: base >=4.3, bytestring >=0.9,
containers, pretty containers, pretty

View File

@@ -1,115 +0,0 @@
module PGF.Lexing where
import Data.Char(isSpace,toLower,toUpper)
-- * Text lexing
-- | Text lexing with standard word capitalization of the first word of every sentence
lexText :: String -> [String]
lexText = lexText' uncapitInit
-- | Text lexing with custom treatment of the first word of every sentence.
lexText' :: (String->String) -> String -> [String]
lexText' uncap1 = uncap . lext where
lext s = case s of
c:cs | isMajorPunct c -> [c] : uncap (lext cs)
c:cs | isMinorPunct c -> [c] : lext cs
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
_ -> [s]
uncap s = case s of
w:ws -> uncap1 w:ws
_ -> s
unlexText :: [String] -> String
unlexText = capitInit . unlext where
unlext s = case s of
w:[] -> w
w:[c]:[] | isPunct c -> w ++ [c]
w:[c]:cs | isMajorPunct c -> w ++ [c] ++ " " ++ capitInit (unlext cs)
w:[c]:cs | isMinorPunct c -> w ++ [c] ++ " " ++ unlext cs
w:ws -> w ++ " " ++ unlext ws
_ -> []
-- | Bind tokens separated by Prelude.BIND, i.e. &+
bindTok :: [String] -> [String]
bindTok ws = case ws of
w1:"&+":w2:ws -> bindTok ((w1++w2):ws)
"&+":ws -> bindTok ws
"&|":(c:cs):ws-> bindTok ((toUpper c:cs) : ws)
"&|":ws -> bindTok ws
w:ws -> w:bindTok ws
[] -> []
-- * Code lexing
-- | Haskell lexer, usable for much code
lexCode :: String -> [String]
lexCode ss = case lex ss of
[(w@(_:_),ws)] -> w : lexCode ws
_ -> []
unlexCode :: [String] -> String
unlexCode s = case s of
w:[] -> w
[c]:cs | isParen c -> [c] ++ unlexCode cs
w:cs@([c]:_) | isClosing c -> w ++ unlexCode cs
w:ws -> w ++ " " ++ unlexCode ws
_ -> []
-- | LaTeX lexer in the math mode: \ should not be separated from the next word
lexLatexCode :: String -> [String]
lexLatexCode = restoreBackslash . lexCode where --- quick hack: postprocess Haskell's lex
restoreBackslash ws = case ws of
"\\":w:ww -> ("\\" ++ w) : restoreBackslash ww
w:ww -> w:restoreBackslash ww
_ -> ws
-- * Mixed lexing
-- | LaTeX style lexer, with "math" environment using Code between $...$
lexMixed :: String -> [String]
lexMixed = concat . alternate False [] where
alternate env t s = case s of
'$':cs -> lex env (reverse t) : ["$"] : alternate (not env) [] cs
'\\':c:cs | elem c "()[]" -> lex env (reverse t) : [['\\',c]] : alternate (not env) [] cs
c:cs -> alternate env (c:t) cs
_ -> [lex env (reverse t)]
lex env = if env then lexLatexCode else lexText
unlexMixed :: [String] -> String
unlexMixed = capitInit . concat . alternate False where
alternate env s = case s of
_:_ -> case break (flip elem ["$","\\[","\\]","\\(","\\)"]) s of
(t,[]) -> unlex env t : []
(t,c:m) -> unlex env t : sep env c m : alternate (not env) m
_ -> []
unlex env = if env then unlexCode else (uncapitInit . unlexText)
sep env c m = case (m,env) of
([p]:_,True) | isPunct p -> c -- closing $ glued to next punct
(_, True) -> c ++ " " -- closing $ otherwise separated by space from what follows
_ -> " " ++ c -- put space before opening $
-- * Additional lexing uitilties
-- | Capitalize first letter
capitInit s = case s of
c:cs -> toUpper c : cs
_ -> s
-- | Uncapitalize first letter
uncapitInit s = case s of
c:cs -> toLower c : cs
_ -> s
-- | Unquote each string wrapped in double quotes
unquote = map unq where
unq s = case s of
'"':cs@(_:_) | last cs == '"' -> init cs
_ -> s
isPunct = flip elem ".?!,:;"
isMajorPunct = flip elem ".?!"
isMinorPunct = flip elem ",:;"
isParen = flip elem "()[]{}"
isClosing = flip elem ")]}"

View File

@@ -1,310 +0,0 @@
module PGF.LexingAGreek where -- HL 2a1.2.2016
import Data.Char(isSpace)
-- * Text lexing without word capitalization of the first word of every sentence.
-- Greek sentences in (transliterated) texts don't start with capital character.
-- Ordinary greek text does not have vowel length indicators. We then use '.' as
-- a sentence separator.
lexTextAGreek :: String -> [String]
lexTextAGreek s = lext s where
lext s = case s of
c:cs | isAGreekPunct c -> [c] : (lext cs)
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s
in w : lext cs
[] -> []
-- Philological greek text may use vowel length indicators. Then '.' is not a sentence
-- separator, nor is 'v. ' for vowel v. Sentence ends at 'v..' or 'c. ' with non-vowel c.
lexTextAGreek2 :: String -> [String]
lexTextAGreek2 s = lext s where
lext s = case s of
c:cs | isAGreekPunct c -> [c] : (lext cs)
c:cs | isSpace c -> lext cs
_:_ -> let (w,cs) = break (\x -> isSpace x || isAGreekPunct x) s
in case cs of
'.':'.':d:ds | isSpace d
-> (w++['.']) : lext ('.':d:ds)
'.':d:ds | isAGreekPunct d || isSpace d
-> (w++['.']) : lext (d:ds)
'.':d:ds | not (isSpace d)
-> case lext (d:ds) of
e:es -> (w++['.']++e) : es
es -> (w++['.']) : es
'.':[] -> (w++['.']) : []
_ -> w : lext cs
[] -> []
unlexTextAGreek :: [String] -> String
unlexTextAGreek = unlext where
unlext s = case s of
w:[] -> w
w:[c]:[] | isAGreekPunct c -> w ++ [c]
w:[c]:cs | isAGreekPunct c -> w ++ [c] ++ " " ++ unlext cs
w:ws -> w ++ " " ++ unlext ws
[] -> []
isAGreekPunct = flip elem ".,;··" -- colon: first version · not in charset,
-- second version · = 00B7 standard code point
-- * Text lexing and unlexing for Ancient Greek:
-- 1. no capitalization of initial word,
-- 2. grave/acute accent switch on final syllables of words not followed by punctuation,
-- 3. accent move from/to support word to/from following clitic words (iterated).
lexAGreek :: String -> [String]
lexAGreek = fromAGreek . lexTextAGreek
lexAGreek2 :: String -> [String]
lexAGreek2 = fromAGreek . lexTextAGreek2
unlexAGreek :: [String] -> String
unlexAGreek = unlexTextAGreek . toAGreek
-- Note: unlexAGreek does not glue punctuation with the previous word, so that short
-- vowel indication (like a.) differs from sentence end (a .).
-- | normalize = change grave accent on sentence internal words to acute,
-- and shift inherited acutes to the following enclitic (where they are
-- visible only as shown in the list of enclitics above)
normalize :: String -> String
normalize = (unlexTextAGreek . fromAGreek . lexTextAGreek)
fromAGreek :: [String] -> [String]
fromAGreek s = case s of
w:[]:vs -> w:[]:(fromAGreek vs)
w:(v:vs) | isAGreekPunct (head v) -> w:v:(fromAGreek vs)
w:v:vs | wasEnclitic v && wasEnclitic w ->
getEnclitic w : fromAGreek (v:vs)
w:v:vs | wasEnclitic v && wasProclitic w -> -- "ei)' tines*"
getProclitic w : getEnclitic v : fromAGreek vs
w:v:vs | wasEnclitic v && (hasEndCircum w ||
(hasEndAcute w && hasSingleAccent w)) ->
w : getEnclitic v : fromAGreek vs -- ok "sofoi' tines*"
w:v:vs | wasEnclitic v && hasPrefinalAcute w ->
w : getEnclitic v : fromAGreek vs
w:v:vs | wasEnclitic v && hasEndAcute w -> -- ok "a)'nvrwpoi' tines*"
dropLastAccent w : getEnclitic v : fromAGreek vs
w:v:vs | wasEnclitic w ->
getEnclitic w : fromAGreek (v:vs)
w:ws -> (toAcute w) : (fromAGreek ws)
ws -> ws
-- | de-normalize = change acute accent of end syllables in sentence internal
-- (non-enclitic) words to grave accent, and move accents of enclitics to the
-- previous word to produce ordinary ancient greek
denormalize :: String -> String
denormalize = (unlexTextAGreek . toAGreek . lexTextAGreek)
toAGreek :: [String] -> [String]
toAGreek s = case s of
w:[]:vs -> w:[]:(toAGreek vs)
w:v:vs | isAGreekPunct (head v) -> w:[]:v:(toAGreek vs) -- w:[] for following -to_ancientgreek
w:v:vs | isEnclitic v && isEnclitic w ->
addAcute w : toAGreek (dropAccent v:vs) -- BR 11 Anm.2
w:v:vs | isEnclitic v && isProclitic w -> -- BR 11 a.beta
addAcute w: (toAGreek (dropAccent v:vs))
w:v:vs | isEnclitic v && (hasEndCircum w || hasEndAcute w) ->
w:(toAGreek (dropAccent v:vs)) -- BR 11 a.alpha,beta
w:v:vs | isEnclitic v && hasPrefinalAcute w ->
w:v: toAGreek vs -- bisyllabic v keeps its accent BR 11 b.
w:v:vs | isEnclitic v ->
(addAcute w):(toAGreek (dropAccent v:vs)) -- BR 11 a.gamma
w:v:vs | isEnclitic w -> w:(toAGreek (v:vs))
w:ws -> (toGrave w) : (toAGreek ws)
ws -> ws
-- | Change accent on the final syllable of a word
toGrave :: String -> String
toGrave = reverse . grave . reverse where
grave s = case s of
'\'':cs -> '`':cs
c:cs | isAGreekVowel c -> c:cs
c:cs -> c: grave cs
_ -> s
toAcute :: String -> String
toAcute = reverse . acute . reverse where
acute s = case s of
'`':cs -> '\'':cs
c:cs | isAGreekVowel c -> c:cs
c:cs -> c: acute cs
_ -> s
isAGreekVowel = flip elem "aeioyhw"
-- | Accent moves for enclitics and proclitics (atona)
enclitics = [
"moy","moi","me", -- personal pronouns
"soy","soi","se",
"oy(","oi(","e(",
"tis*","ti","tina'", -- indefinite pronoun
"tino's*","tini'",
"tine's*","tina's*",
"tinw~n","tisi'","tisi'n",
"poy","poi", -- indefinite adverbs
"pove'n","pws*",
"ph|","pote'",
"ge","te","toi", -- particles
"nyn","per","pw"
-- suffix -"de"
-- praes.indik. of fhmi', ei)mi' (except fh's*, ei)~)
] -- and more, BR 11
proclitics = [
"o(","h(","oi(","ai(", -- articles
"e)n","ei)s*","e)x","e)k", -- prepositions
"ei)","w(s*", -- conjunctions
"oy)","oy)k","oy)c" -- negation
]
isEnclitic = flip elem enclitics
isProclitic = flip elem proclitics
-- Check if a word is an enclitic or accented enclitic and extract the enclitic
wasEnclitic = let unaccented = (filter (not . hasAccent) enclitics)
++ (map dropAccent (filter hasAccent enclitics))
accented = (filter hasAccent enclitics)
++ map addAcute (filter (not . hasAccent) enclitics)
in flip elem (accented ++ unaccented)
wasProclitic = flip elem (map addAcute proclitics)
getEnclitic =
let pairs = zip (enclitics ++ (map dropAccent (filter hasAccent enclitics))
++ (map addAcute (filter (not . hasAccent) enclitics)))
(enclitics ++ (filter hasAccent enclitics)
++ (filter (not . hasAccent) enclitics))
find = \v -> lookup v pairs
in \v -> case (find v) of
Just x -> x
_ -> v
getProclitic =
let pairs = zip (map addAcute proclitics) proclitics
find = \v -> lookup v pairs
in \v -> case (find v) of
Just x -> x
_ -> v
-- | Accent manipulation
dropAccent = reverse . drop . reverse where
drop s = case s of
[] -> []
'\'':cs -> cs
'`':cs -> cs
'~':cs -> cs
c:cs -> c:drop cs
dropLastAccent = reverse . drop . reverse where
drop s = case s of
[] -> []
'\'':cs -> cs
'`':cs -> cs
'~':cs -> cs
c:cs -> c:drop cs
addAcute :: String -> String
addAcute = reverse . acu . reverse where
acu w = case w of
c:cs | c == '\'' -> c:cs
c:cs | c == '(' -> '\'':c:cs
c:cs | c == ')' -> '\'':c:cs
c:cs | isAGreekVowel c -> '\'':c:cs
c:cs -> c : acu cs
_ -> w
-- | Accent checking on end syllables
hasEndAcute = find . reverse where
find s = case s of
[] -> False
'\'':cs -> True
'`':cs -> False
'~':cs -> False
c:cs | isAGreekVowel c -> False
_:cs -> find cs
hasEndCircum = find . reverse where
find s = case s of
[] -> False
'\'':cs -> False
'`':cs -> False
'~':cs -> True
c:cs | isAGreekVowel c -> False
_:cs -> find cs
hasPrefinalAcute = find . reverse where
find s = case s of
[] -> False
'\'':cs -> False -- final acute
'`':cs -> False
'~':cs -> False
c:d:cs | isAGreekVowel c && isAGreekVowel d -> findNext cs
c:cs | isAGreekVowel c -> findNext cs
_:cs -> find cs where
findNext s = case s of
[] -> False
'\'':cs -> True -- prefinal acute
'`':cs -> False
'~':cs -> False
c:cs | isAGreekVowel c -> False
_:cs -> findNext cs where
hasSingleAccent v =
hasAccent v && not (hasAccent (dropLastAccent v))
hasAccent v = case v of
[] -> False
c:cs -> elem c ['\'','`','~'] || hasAccent cs
{- Tests:
-- denormalization. Examples in BR 11 work:
-}
enclitics_expls = -- normalized
"sofw~n tis*":"sofw~n tine's*":"sof~n tinw~n": -- a.alpha
"sofo's tis*":"sofoi' tine's*": -- a.beta
"ei) tis*":"ei) tine's*":
"a)'nvrwpos* tis*":"a)'nvrwpoi tine's*": -- a.gamma
"doy~los* tis*":"doy~loi tine's*":
"lo'gos* tis*":"lo'goi tine's*":"lo'gwn tinw~n": -- b.
"ei) poy tis* tina' i)'doi": -- Anm. 2.
[]
{-
test = map denormalize enclitics_expls
*PGF.LexingAGreek> test
["sofw~n tis*","sofw~n tines*","sof~n tinwn",
"sofo's tis*","sofoi' tines*",
"ei)' tis*","ei)' tines*",
"a)'nvrwpo's* tis*","a)'nvrwpoi' tines*",
"doy~lo's* tis*","doy~loi' tines*",
"lo'gos* tis* ","lo'goi tine's*","lo'gwn tinw~n ",
"ei)' poy' ti's* tina i)'doi"]
-- normalization:
*PGF.LexingAGreek> map normalize test
["sofw~n tis*","sofw~n tine's*","sof~n tinw~n",
"sofo's tis*","sofoi' tine's*",
"ei) tis*","ei) tine's*",
"a)'nvrwpos* tis*","a)'nvrwpoi tine's*",
"doy~los* tis*","doy~loi tine's*",
"lo'gos* tis*","lo'goi tine's*","lo'gwn tinw~n",
"ei) poy tis* tina' i)'doi"]
*PGF.LexingAGreek> map (normalize . denormalize) enclitics_expls == enclitics_expls
True
*PGF.LexingAGreek> map (denormalize . normalize) test == test
True
-}

View File

@@ -50,9 +50,6 @@ Library
exposed-modules: exposed-modules:
PGF PGF
PGF.Internal PGF.Internal
PGF.Lexing
PGF.LexingAGreek
PGF.Utilities
PGF.Haskell PGF.Haskell
other-modules: other-modules:
@@ -78,3 +75,4 @@ Library
PGF.VisualizeTree PGF.VisualizeTree
PGF.ByteCode PGF.ByteCode
PGF.OldBinary PGF.OldBinary
PGF.Utilities

View File

@@ -4,8 +4,8 @@ module PGFService(cgiMain,cgiMain',getPath,
Caches,pgfCache,newPGFCache,flushPGFCache,listPGFCache) where Caches,pgfCache,newPGFCache,flushPGFCache,listPGFCache) where
import PGF (PGF,Labels,CncLabels) import PGF (PGF,Labels,CncLabels)
import GF.Text.Lexing
import qualified PGF import qualified PGF
import PGF.Lexing
import Cache import Cache
import CGIUtils(outputJSONP,outputPlain,outputHTML,outputText, import CGIUtils(outputJSONP,outputPlain,outputHTML,outputText,
outputBinary,outputBinary', outputBinary,outputBinary',
@@ -272,8 +272,11 @@ cpgfMain qsem command (t,(pgf,pc)) =
maybe (Left ("["++w++"]")) Right $ maybe (Left ("["++w++"]")) Right $
msum [parse1 w,parse1 ow,morph w,morph ow] msum [parse1 w,parse1 ow,morph w,morph ow]
where where
ow = if w==lw then capitInit w else lw ow = case w of
lw = uncapitInit w c:cs | isLower c -> toUpper c : cs
| isUpper c -> toLower c : cs
s -> s
parse1 = either (const Nothing) (fmap fst . listToMaybe) . parse1 = either (const Nothing) (fmap fst . listToMaybe) .
C.parse concr cat C.parse concr cat
morph w = listToMaybe morph w = listToMaybe
@@ -293,7 +296,7 @@ cpgfMain qsem command (t,(pgf,pc)) =
from1 = maybe (missing "from") return =<< from' from1 = maybe (missing "from") return =<< from'
from' = getLang "from" from' = getLang "from"
to = (,) # getLangs "to" % unlexerC to = (,) # getLangs "to" % unlexer (const False)
getLangs = getLangs' readLang getLangs = getLangs' readLang
getLang = getLang' readLang getLang = getLang' readLang
@@ -308,8 +311,7 @@ cpgfMain qsem command (t,(pgf,pc)) =
let t = C.readExpr s let t = C.readExpr s
maybe (badRequest "bad tree" s) return t maybe (badRequest "bad tree" s) return t
--c_lexer concr = lexer c_lexer concr = lexer (not . null . C.lookupMorpho concr)
c_lexer concr = ilexer (not . null . C.lookupMorpho concr)
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
@@ -338,62 +340,29 @@ instance ToATree C.Expr where
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
-- * Lexing -- * Lexing
-- | Lexers with a text lexer that tries to be a more clever with the first word
ilexer good = lexer' uncap
where
uncap s = case span isUpper s of
([c],r) | not (good s) -> toLower c:r
_ -> s
-- | Standard lexers -- | Standard lexers
lexer = lexer' uncapitInit lexer good = maybe (return id) lexerfun =<< getInput "lexer"
lexer' uncap = maybe (return id) lexerfun =<< getInput "lexer"
where where
lexerfun name = lexerfun name =
case name of case stringOp good ("lex"++name) of
"text" -> return (unwords . lexText' uncap) Just fn -> return fn
"code" -> return (unwords . lexCode) Nothing -> badRequest "Unknown lexer" name
"mixed" -> return (unwords . lexMixed)
_ -> badRequest "Unknown lexer" name
type Unlexer = String->String type Unlexer = String->String
-- | Unlexing for the C runtime system, &+ is already applied -- | Unlexing for the C runtime system, &+ is already applied
unlexerC :: CGI Unlexer unlexer :: (String -> Bool) -> CGI Unlexer
unlexerC = maybe (return id) unlexerfun =<< getInput "unlexer" unlexer good = maybe (return id) unlexerfun =<< getInput "unlexer"
where where
unlexerfun name = unlexerfun name =
case name of case stringOp good ("unlex"++name) of
"text" -> return (unlexText' . words) Just fn -> return (fn . cleanMarker)
"code" -> return (unlexCode . words) Nothing -> badRequest "Unknown unlexer" name
"mixed" -> return (unlexMixed . words)
"none" -> return id cleanMarker ('+':cs) = cs
"id" -> return id cleanMarker ('*':cs) = cs
_ -> badRequest "Unknown lexer" name cleanMarker cs = cs
-- | Unlex text, skipping the quality marker used by the App grammar
unlexText' ("+":ws) = "+ "++unlexText ws
unlexText' ("*":ws) = "* "++unlexText ws
unlexText' ws = unlexText ws
-- | Unlexing for the Haskell run-time, applying the &+ operator first
unlexerH :: CGI Unlexer
unlexerH = maybe (return doBind) unlexerfun =<< getInput "unlexer"
where
unlexerfun name =
case name of
"text" -> return (unlexText' . bind)
"code" -> return (unlexCode . bind)
"mixed" -> return (unlexMixed . bind)
"none" -> return id
"id" -> return id
"bind" -> return doBind
_ -> badRequest "Unknown lexer" name
doBind = unwords . bind
bind = bindTok . words
-------------------------------------------------------------------------------- --------------------------------------------------------------------------------
-- * Haskell run-time functionality -- * Haskell run-time functionality
@@ -431,8 +400,8 @@ pgfMain lcs@(alc,clc) path command tpgf@(t,pgf) =
inp <- textInput inp <- textInput
return (fr,lex inp) return (fr,lex inp)
mlexer Nothing = lexer mlexer Nothing = lexer (const False)
mlexer (Just lang) = ilexer (PGF.isInMorpho morpho) mlexer (Just lang) = lexer (PGF.isInMorpho morpho)
where morpho = PGF.buildMorpho pgf lang where morpho = PGF.buildMorpho pgf lang
tree :: CGI PGF.Tree tree :: CGI PGF.Tree
@@ -489,7 +458,7 @@ pgfMain lcs@(alc,clc) path command tpgf@(t,pgf) =
from = getLang "from" from = getLang "from"
to1 = maybe (missing "to") return =<< getLang "to" to1 = maybe (missing "to") return =<< getLang "to"
to = (,) # getLangs "to" % unlexerH to = (,) # getLangs "to" % unlexer (const False)
getLangs = getLangs' readLang getLangs = getLangs' readLang
getLang = getLang' readLang getLang = getLang' readLang