mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-23 19:42:50 -06:00
PGF Service: a bit more clever lexer=text
Only change the first word to lowercase if the original input is not found in the grammar's morphology. This allows parsing of sentenses starting with "I" in English, nouns in German and proper names in other languages, but it can make the wrong choice for multi-words.
This commit is contained in:
@@ -2,8 +2,13 @@ module PGF.Lexing where
|
|||||||
import Data.Char(isSpace,toLower,toUpper)
|
import Data.Char(isSpace,toLower,toUpper)
|
||||||
|
|
||||||
-- * Text lexing
|
-- * Text lexing
|
||||||
|
-- | Text lexing with standard word capitalization of the first word of every sentence
|
||||||
lexText :: String -> [String]
|
lexText :: String -> [String]
|
||||||
lexText = uncap . lext where
|
lexText = lexText' uncapitInit
|
||||||
|
|
||||||
|
-- | Text lexing with custom treatment of the first word of every sentence.
|
||||||
|
lexText' :: (String->String) -> String -> [String]
|
||||||
|
lexText' uncap1 = uncap . lext where
|
||||||
lext s = case s of
|
lext s = case s of
|
||||||
c:cs | isMajorPunct c -> [c] : uncap (lext cs)
|
c:cs | isMajorPunct c -> [c] : uncap (lext cs)
|
||||||
c:cs | isMinorPunct c -> [c] : lext cs
|
c:cs | isMinorPunct c -> [c] : lext cs
|
||||||
@@ -11,7 +16,7 @@ lexText = uncap . lext where
|
|||||||
_:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
|
_:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs
|
||||||
_ -> [s]
|
_ -> [s]
|
||||||
uncap s = case s of
|
uncap s = case s of
|
||||||
(c:cs):ws -> (toLower c : cs):ws
|
w:ws -> uncap1 w:ws
|
||||||
_ -> s
|
_ -> s
|
||||||
|
|
||||||
unlexText :: [String] -> String
|
unlexText :: [String] -> String
|
||||||
@@ -78,6 +83,11 @@ capitInit s = case s of
|
|||||||
c:cs -> toUpper c : cs
|
c:cs -> toUpper c : cs
|
||||||
_ -> s
|
_ -> s
|
||||||
|
|
||||||
|
-- | Uncapitalize first letter
|
||||||
|
uncapitInit s = case s of
|
||||||
|
c:cs -> toLower c : cs
|
||||||
|
_ -> s
|
||||||
|
|
||||||
-- | Unquote each string wrapped in double quotes
|
-- | Unquote each string wrapped in double quotes
|
||||||
unquote = map unq where
|
unquote = map unq where
|
||||||
unq s = case s of
|
unq s = case s of
|
||||||
|
|||||||
@@ -78,8 +78,11 @@ cgiMain' cache path =
|
|||||||
(getInput "command")
|
(getInput "command")
|
||||||
case command of
|
case command of
|
||||||
"download" -> outputBinary =<< liftIO (BS.readFile path)
|
"download" -> outputBinary =<< liftIO (BS.readFile path)
|
||||||
|
'c':'-':_ ->
|
||||||
#ifdef C_RUNTIME
|
#ifdef C_RUNTIME
|
||||||
'c':'-':_ -> cpgfMain command =<< liftIO (readCache (snd cache) path)
|
cpgfMain command =<< liftIO (readCache (snd cache) path)
|
||||||
|
#else
|
||||||
|
serverError "Server configured without C run-time support" ""
|
||||||
#endif
|
#endif
|
||||||
_ -> pgfMain command =<< liftIO (readCache (fst cache) path)
|
_ -> pgfMain command =<< liftIO (readCache (fst cache) path)
|
||||||
|
|
||||||
@@ -90,9 +93,9 @@ cgiMain' cache path =
|
|||||||
cpgfMain :: String -> (C.PGF,MVar ParseCache) -> CGI CGIResult
|
cpgfMain :: String -> (C.PGF,MVar ParseCache) -> CGI CGIResult
|
||||||
cpgfMain command (pgf,pc) =
|
cpgfMain command (pgf,pc) =
|
||||||
case command of
|
case command of
|
||||||
"c-parse" -> out =<< join (parse#lexer%input%from%start%limit%trie)
|
"c-parse" -> out =<< join (parse # input % from % start % limit % trie)
|
||||||
"c-linearize" -> out =<< lin # tree % to
|
"c-linearize" -> out =<< lin # tree % to
|
||||||
"c-translate" -> out =<< join (trans#lexer%input%from%to%start%limit%trie)
|
"c-translate" -> out =<< join (trans#input%from%to%start%limit%trie)
|
||||||
"c-flush" -> out =<< flush
|
"c-flush" -> out =<< flush
|
||||||
"c-grammar" -> out grammar
|
"c-grammar" -> out grammar
|
||||||
_ -> badRequest "Unknown command" command
|
_ -> badRequest "Unknown command" command
|
||||||
@@ -108,8 +111,9 @@ cpgfMain command (pgf,pc) =
|
|||||||
where
|
where
|
||||||
languages = [makeObj ["name".= l] | (l,_)<-Map.toList (C.languages pgf)]
|
languages = [makeObj ["name".= l] | (l,_)<-Map.toList (C.languages pgf)]
|
||||||
|
|
||||||
parse lexer input (from,concr) start mlimit trie =
|
parse input (from,concr) start mlimit trie =
|
||||||
do r <- parse' (lexer input) (from,concr) start mlimit
|
do lex <- c_lexer concr
|
||||||
|
r <- parse' (from,concr) start mlimit (lex input)
|
||||||
return $ showJSON [makeObj ("from".=from:jsonParseResult r)]
|
return $ showJSON [makeObj ("from".=from:jsonParseResult r)]
|
||||||
|
|
||||||
jsonParseResult = either bad good
|
jsonParseResult = either bad good
|
||||||
@@ -118,7 +122,7 @@ cpgfMain command (pgf,pc) =
|
|||||||
good trees = "trees".=map tp trees :[] -- :addTrie trie trees
|
good trees = "trees".=map tp trees :[] -- :addTrie trie trees
|
||||||
tp (tree,prob) = makeObj ["tree".=tree,"prob".=prob]
|
tp (tree,prob) = makeObj ["tree".=tree,"prob".=prob]
|
||||||
|
|
||||||
parse' input (from,concr) start mlimit =
|
parse' (from,concr) start mlimit input =
|
||||||
liftIO $ do t <- getCurrentTime
|
liftIO $ do t <- getCurrentTime
|
||||||
fmap (maybe id take mlimit . drop start)
|
fmap (maybe id take mlimit . drop start)
|
||||||
# modifyMVar pc (parse'' t)
|
# modifyMVar pc (parse'' t)
|
||||||
@@ -136,8 +140,9 @@ cpgfMain command (pgf,pc) =
|
|||||||
lin tree tos = showJSON (lin' tree tos)
|
lin tree tos = showJSON (lin' tree tos)
|
||||||
lin' tree tos = [makeObj ["to".=to,"text".=C.linearize c tree]|(to,c)<-tos]
|
lin' tree tos = [makeObj ["to".=to,"text".=C.linearize c tree]|(to,c)<-tos]
|
||||||
|
|
||||||
trans lexer input (from,concr) tos start mlimit trie =
|
trans input (from,concr) tos start mlimit trie =
|
||||||
do parses <- parse' (lexer input) (from,concr) start mlimit
|
do lex <- c_lexer concr
|
||||||
|
parses <- parse' (from,concr) start mlimit (lex input)
|
||||||
return $
|
return $
|
||||||
showJSON [ makeObj ["from".=from,
|
showJSON [ makeObj ["from".=from,
|
||||||
"translations".= jsonParses parses]]
|
"translations".= jsonParses parses]]
|
||||||
@@ -167,6 +172,11 @@ cpgfMain command (pgf,pc) =
|
|||||||
let t = C.readExpr s
|
let t = C.readExpr s
|
||||||
maybe (badRequest "bad tree" s) return t
|
maybe (badRequest "bad tree" s) return t
|
||||||
|
|
||||||
|
--c_lexer concr = lexer
|
||||||
|
c_lexer concr = ilexer (not . null . C.lookupMorpho concr)
|
||||||
|
|
||||||
|
--------------------------------------------------------------------------------
|
||||||
|
|
||||||
{-
|
{-
|
||||||
instance JSON C.CId where
|
instance JSON C.CId where
|
||||||
readJSON x = readJSON x >>= maybe (fail "Bad language.") return . C.readCId
|
readJSON x = readJSON x >>= maybe (fail "Bad language.") return . C.readCId
|
||||||
@@ -178,14 +188,27 @@ instance JSON C.Expr where
|
|||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
lexer = maybe (return id) lexerfun =<< getInput "lexer"
|
--------------------------------------------------------------------------------
|
||||||
|
-- * Lexing
|
||||||
|
|
||||||
|
-- | Lexers with a text lexer that tries to be a more clever with the first word
|
||||||
|
ilexer good = lexer' uncap
|
||||||
|
where
|
||||||
|
uncap s = if good s
|
||||||
|
then s
|
||||||
|
else uncapitInit s
|
||||||
|
|
||||||
|
-- | Standard lexers
|
||||||
|
lexer = lexer' uncapitInit
|
||||||
|
|
||||||
|
lexer' uncap = maybe (return id) lexerfun =<< getInput "lexer"
|
||||||
where
|
where
|
||||||
lexerfun name =
|
lexerfun name =
|
||||||
case name of
|
case name of
|
||||||
"text" -> return (unwords . lexText)
|
"text" -> return (unwords . lexText' uncap)
|
||||||
"code" -> return (unwords . lexCode)
|
"code" -> return (unwords . lexCode)
|
||||||
"mixed" -> return (unwords . lexMixed)
|
"mixed" -> return (unwords . lexMixed)
|
||||||
_ -> throwCGIError 400 "Unknown lexer" ["Unknown lexer: "++name]
|
_ -> badRequest "Unknown lexer" name
|
||||||
|
|
||||||
--------------------------------------------------------------------------------
|
--------------------------------------------------------------------------------
|
||||||
-- * Haskell run-time functionality
|
-- * Haskell run-time functionality
|
||||||
@@ -193,15 +216,15 @@ lexer = maybe (return id) lexerfun =<< getInput "lexer"
|
|||||||
pgfMain :: String -> PGF -> CGI CGIResult
|
pgfMain :: String -> PGF -> CGI CGIResult
|
||||||
pgfMain command pgf =
|
pgfMain command pgf =
|
||||||
case command of
|
case command of
|
||||||
"parse" -> out =<< parse#lexer%input%cat%from%limit%trie
|
"parse" -> out =<< join (parse#input%cat%from%limit%trie)
|
||||||
"complete" -> out =<< doComplete pgf # input % cat % from % limit
|
"complete" -> out =<< doComplete pgf # input % cat % from % limit
|
||||||
"linearize" -> out =<< doLinearize pgf # tree % to
|
"linearize" -> out =<< doLinearize pgf # tree % to
|
||||||
"linearizeAll" -> out =<< doLinearizes pgf # tree % to
|
"linearizeAll" -> out =<< doLinearizes pgf # tree % to
|
||||||
"linearizeTable" -> out =<< doLinearizeTabular pgf # tree % to
|
"linearizeTable" -> out =<< doLinearizeTabular pgf # tree % to
|
||||||
"random" -> cat >>= \c -> depth >>= \dp -> limit >>= \l -> to >>= \to -> liftIO (doRandom pgf c dp l to) >>= out
|
"random" -> cat >>= \c -> depth >>= \dp -> limit >>= \l -> to >>= \to -> liftIO (doRandom pgf c dp l to) >>= out
|
||||||
"generate" -> out =<< doGenerate pgf # cat % depth % limit % to
|
"generate" -> out =<< doGenerate pgf # cat % depth % limit % to
|
||||||
"translate" -> out =<< trans#lexer%input%cat%from%to%limit%trie
|
"translate" -> out =<< join (trans#input%cat%from%to%limit%trie)
|
||||||
"translategroup" -> out =<< transgroup#lexer%input%cat%from%to%limit
|
"translategroup" -> out =<< join (transgroup#input%cat%from%to%limit)
|
||||||
"grammar" -> out =<< doGrammar pgf # requestAcceptLanguage
|
"grammar" -> out =<< doGrammar pgf # requestAcceptLanguage
|
||||||
"abstrtree" -> outputGraphviz =<< abstrTree pgf # graphvizOptions % tree
|
"abstrtree" -> outputGraphviz =<< abstrTree pgf # graphvizOptions % tree
|
||||||
"alignment" -> outputGraphviz =<< alignment pgf # tree % to
|
"alignment" -> outputGraphviz =<< alignment pgf # tree % to
|
||||||
@@ -213,20 +236,31 @@ pgfMain command pgf =
|
|||||||
"browse" -> join $ doBrowse pgf # optId % cssClass % href % format "html" % getIncludePrintNames
|
"browse" -> join $ doBrowse pgf # optId % cssClass % href % format "html" % getIncludePrintNames
|
||||||
"external" -> do cmd <- getInput "external"
|
"external" -> do cmd <- getInput "external"
|
||||||
doExternal cmd =<< input
|
doExternal cmd =<< input
|
||||||
_ -> throwCGIError 400 "Unknown command" ["Unknown command: " ++ show command]
|
_ -> badRequest "Unknown command" command
|
||||||
where
|
where
|
||||||
parse lexer input = doParse pgf (lexer input)
|
parse input cat from limit trie =
|
||||||
trans lexer input = doTranslate pgf (lexer input)
|
do lex <- mlexer from
|
||||||
transgroup lexer input = doTranslateGroup pgf (lexer input)
|
return (doParse pgf (lex input) cat from limit trie)
|
||||||
|
trans input cat from to limit trie =
|
||||||
|
do lex <- mlexer from
|
||||||
|
return (doTranslate pgf (lex input) cat from to limit trie)
|
||||||
|
transgroup input cat from to limit =
|
||||||
|
do lex <- mlexer from
|
||||||
|
return (doTranslateGroup pgf (lex input) cat from to limit)
|
||||||
|
|
||||||
|
-- mlexer _ = lexer
|
||||||
|
mlexer (Just lang) = ilexer (PGF.isInMorpho morpho)
|
||||||
|
where morpho = PGF.buildMorpho pgf lang
|
||||||
|
|
||||||
tree :: CGI PGF.Tree
|
tree :: CGI PGF.Tree
|
||||||
tree = do ms <- getInput "tree"
|
tree = do ms <- getInput "tree"
|
||||||
s <- maybe (throwCGIError 400 "No tree given" ["No tree given"]) return ms
|
s <- maybe (badRequest "No tree given" "") return ms
|
||||||
t <- maybe (throwCGIError 400 "Bad tree" ["tree: " ++ s]) return (PGF.readExpr s)
|
t <- maybe (badRequest "Bad tree" s) return (PGF.readExpr s)
|
||||||
t <- either (\err -> throwCGIError 400 "Type incorrect tree"
|
t <- either (\err -> badRequest "Type incorrect tree"
|
||||||
["tree: " ++ PGF.showExpr [] t
|
(unlines $
|
||||||
,render (PP.text "error:" <+> PGF.ppTcError err)
|
[PGF.showExpr [] t
|
||||||
])
|
,render (PP.text "error:" <+> PGF.ppTcError err)
|
||||||
|
]))
|
||||||
(return . fst)
|
(return . fst)
|
||||||
(PGF.inferExpr pgf t)
|
(PGF.inferExpr pgf t)
|
||||||
return t
|
return t
|
||||||
@@ -237,14 +271,14 @@ pgfMain command pgf =
|
|||||||
case mcat of
|
case mcat of
|
||||||
Nothing -> return Nothing
|
Nothing -> return Nothing
|
||||||
Just cat -> case PGF.readType cat of
|
Just cat -> case PGF.readType cat of
|
||||||
Nothing -> throwCGIError 400 "Bad category" ["Bad category: " ++ cat]
|
Nothing -> badRequest "Bad category" cat
|
||||||
Just typ -> return $ Just typ -- typecheck the category
|
Just typ -> return $ Just typ -- typecheck the category
|
||||||
|
|
||||||
optId :: CGI (Maybe PGF.CId)
|
optId :: CGI (Maybe PGF.CId)
|
||||||
optId = maybe (return Nothing) rd =<< getInput "id"
|
optId = maybe (return Nothing) rd =<< getInput "id"
|
||||||
where
|
where
|
||||||
rd = maybe err (return . Just) . PGF.readCId
|
rd = maybe err (return . Just) . PGF.readCId
|
||||||
err = throwCGIError 400 "Bad identifier" []
|
err = badRequest "Bad identifier" []
|
||||||
|
|
||||||
cssClass, href :: CGI (Maybe String)
|
cssClass, href :: CGI (Maybe String)
|
||||||
cssClass = getInput "css-class"
|
cssClass = getInput "css-class"
|
||||||
@@ -276,9 +310,9 @@ pgfMain command pgf =
|
|||||||
readLang :: String -> CGI PGF.Language
|
readLang :: String -> CGI PGF.Language
|
||||||
readLang l =
|
readLang l =
|
||||||
case PGF.readLanguage l of
|
case PGF.readLanguage l of
|
||||||
Nothing -> throwCGIError 400 "Bad language" ["Bad language: " ++ l]
|
Nothing -> badRequest "Bad language" l
|
||||||
Just lang | lang `elem` PGF.languages pgf -> return lang
|
Just lang | lang `elem` PGF.languages pgf -> return lang
|
||||||
| otherwise -> throwCGIError 400 "Unknown language" ["Unknown language: " ++ l]
|
| otherwise -> badRequest "Unknown language" l
|
||||||
|
|
||||||
-- * Request parameter access and related auxiliary functions
|
-- * Request parameter access and related auxiliary functions
|
||||||
|
|
||||||
@@ -316,15 +350,18 @@ toBool s = s `elem` ["","yes","true","True"]
|
|||||||
missing = badRequest "Missing parameter"
|
missing = badRequest "Missing parameter"
|
||||||
errorMissingId = badRequest "Missing identifier" ""
|
errorMissingId = badRequest "Missing identifier" ""
|
||||||
|
|
||||||
badRequest msg extra =
|
badRequest = throw 400
|
||||||
throwCGIError 400 msg [msg ++(if null extra then "" else ": "++extra)]
|
serverError = throw 500
|
||||||
|
|
||||||
|
throw code msg extra =
|
||||||
|
throwCGIError code msg [msg ++(if null extra then "" else ": "++extra)]
|
||||||
|
|
||||||
format def = maybe def id # getInput "format"
|
format def = maybe def id # getInput "format"
|
||||||
|
|
||||||
-- * Request implementations
|
-- * Request implementations
|
||||||
|
|
||||||
-- Hook for simple extensions of the PGF service
|
-- Hook for simple extensions of the PGF service
|
||||||
doExternal Nothing input = throwCGIError 400 "Unknown external command" ["Unknown external command"]
|
doExternal Nothing input = badRequest "Unknown external command" ""
|
||||||
doExternal (Just cmd) input =
|
doExternal (Just cmd) input =
|
||||||
do liftIO $ logError ("External command: "++cmd)
|
do liftIO $ logError ("External command: "++cmd)
|
||||||
cmds <- liftIO $ (fmap lines $ readFile "external_services")
|
cmds <- liftIO $ (fmap lines $ readFile "external_services")
|
||||||
@@ -332,7 +369,7 @@ doExternal (Just cmd) input =
|
|||||||
liftIO $ logError ("External services: "++show cmds)
|
liftIO $ logError ("External services: "++show cmds)
|
||||||
if cmd `elem` cmds then ok else err
|
if cmd `elem` cmds then ok else err
|
||||||
where
|
where
|
||||||
err = throwCGIError 400 "Unknown external command" ["Unknown external command: "++cmd]
|
err = badRequest "Unknown external command" cmd
|
||||||
ok =
|
ok =
|
||||||
do let tmpfile1 = "external_input.txt"
|
do let tmpfile1 = "external_input.txt"
|
||||||
tmpfile2 = "external_output.txt"
|
tmpfile2 = "external_output.txt"
|
||||||
|
|||||||
Reference in New Issue
Block a user