From 1602eab5b6bb4aa2648281619968633e1de46edf Mon Sep 17 00:00:00 2001 From: hallgren Date: Wed, 9 Apr 2014 14:13:18 +0000 Subject: [PATCH] PGF Service: a bit more clever lexer=text Only change the first word to lowercase if the original input is not found in the grammar's morphology. This allows parsing of sentenses starting with "I" in English, nouns in German and proper names in other languages, but it can make the wrong choice for multi-words. --- src/runtime/haskell/PGF/Lexing.hs | 14 +++- src/server/PGFService.hs | 103 ++++++++++++++++++++---------- 2 files changed, 82 insertions(+), 35 deletions(-) diff --git a/src/runtime/haskell/PGF/Lexing.hs b/src/runtime/haskell/PGF/Lexing.hs index 808a2af6f..10d8332f7 100644 --- a/src/runtime/haskell/PGF/Lexing.hs +++ b/src/runtime/haskell/PGF/Lexing.hs @@ -2,8 +2,13 @@ module PGF.Lexing where import Data.Char(isSpace,toLower,toUpper) -- * Text lexing +-- | Text lexing with standard word capitalization of the first word of every sentence lexText :: String -> [String] -lexText = uncap . lext where +lexText = lexText' uncapitInit + +-- | Text lexing with custom treatment of the first word of every sentence. +lexText' :: (String->String) -> String -> [String] +lexText' uncap1 = uncap . lext where lext s = case s of c:cs | isMajorPunct c -> [c] : uncap (lext cs) c:cs | isMinorPunct c -> [c] : lext cs @@ -11,7 +16,7 @@ lexText = uncap . lext where _:_ -> let (w,cs) = break (\x -> isSpace x || isPunct x) s in w : lext cs _ -> [s] uncap s = case s of - (c:cs):ws -> (toLower c : cs):ws + w:ws -> uncap1 w:ws _ -> s unlexText :: [String] -> String @@ -78,6 +83,11 @@ capitInit s = case s of c:cs -> toUpper c : cs _ -> s +-- | Uncapitalize first letter +uncapitInit s = case s of + c:cs -> toLower c : cs + _ -> s + -- | Unquote each string wrapped in double quotes unquote = map unq where unq s = case s of diff --git a/src/server/PGFService.hs b/src/server/PGFService.hs index 9364475d4..24547bfd0 100644 --- a/src/server/PGFService.hs +++ b/src/server/PGFService.hs @@ -78,8 +78,11 @@ cgiMain' cache path = (getInput "command") case command of "download" -> outputBinary =<< liftIO (BS.readFile path) + 'c':'-':_ -> #ifdef C_RUNTIME - 'c':'-':_ -> cpgfMain command =<< liftIO (readCache (snd cache) path) + cpgfMain command =<< liftIO (readCache (snd cache) path) +#else + serverError "Server configured without C run-time support" "" #endif _ -> pgfMain command =<< liftIO (readCache (fst cache) path) @@ -90,9 +93,9 @@ cgiMain' cache path = cpgfMain :: String -> (C.PGF,MVar ParseCache) -> CGI CGIResult cpgfMain command (pgf,pc) = case command of - "c-parse" -> out =<< join (parse#lexer%input%from%start%limit%trie) + "c-parse" -> out =<< join (parse # input % from % start % limit % trie) "c-linearize" -> out =<< lin # tree % to - "c-translate" -> out =<< join (trans#lexer%input%from%to%start%limit%trie) + "c-translate" -> out =<< join (trans#input%from%to%start%limit%trie) "c-flush" -> out =<< flush "c-grammar" -> out grammar _ -> badRequest "Unknown command" command @@ -108,8 +111,9 @@ cpgfMain command (pgf,pc) = where languages = [makeObj ["name".= l] | (l,_)<-Map.toList (C.languages pgf)] - parse lexer input (from,concr) start mlimit trie = - do r <- parse' (lexer input) (from,concr) start mlimit + parse input (from,concr) start mlimit trie = + do lex <- c_lexer concr + r <- parse' (from,concr) start mlimit (lex input) return $ showJSON [makeObj ("from".=from:jsonParseResult r)] jsonParseResult = either bad good @@ -118,7 +122,7 @@ cpgfMain command (pgf,pc) = good trees = "trees".=map tp trees :[] -- :addTrie trie trees tp (tree,prob) = makeObj ["tree".=tree,"prob".=prob] - parse' input (from,concr) start mlimit = + parse' (from,concr) start mlimit input = liftIO $ do t <- getCurrentTime fmap (maybe id take mlimit . drop start) # modifyMVar pc (parse'' t) @@ -136,8 +140,9 @@ cpgfMain command (pgf,pc) = lin tree tos = showJSON (lin' tree tos) lin' tree tos = [makeObj ["to".=to,"text".=C.linearize c tree]|(to,c)<-tos] - trans lexer input (from,concr) tos start mlimit trie = - do parses <- parse' (lexer input) (from,concr) start mlimit + trans input (from,concr) tos start mlimit trie = + do lex <- c_lexer concr + parses <- parse' (from,concr) start mlimit (lex input) return $ showJSON [ makeObj ["from".=from, "translations".= jsonParses parses]] @@ -167,6 +172,11 @@ cpgfMain command (pgf,pc) = let t = C.readExpr s maybe (badRequest "bad tree" s) return t + --c_lexer concr = lexer + c_lexer concr = ilexer (not . null . C.lookupMorpho concr) + +-------------------------------------------------------------------------------- + {- instance JSON C.CId where readJSON x = readJSON x >>= maybe (fail "Bad language.") return . C.readCId @@ -178,14 +188,27 @@ instance JSON C.Expr where #endif -lexer = maybe (return id) lexerfun =<< getInput "lexer" +-------------------------------------------------------------------------------- +-- * Lexing + +-- | Lexers with a text lexer that tries to be a more clever with the first word +ilexer good = lexer' uncap + where + uncap s = if good s + then s + else uncapitInit s + +-- | Standard lexers +lexer = lexer' uncapitInit + +lexer' uncap = maybe (return id) lexerfun =<< getInput "lexer" where lexerfun name = case name of - "text" -> return (unwords . lexText) - "code" -> return (unwords . lexCode) + "text" -> return (unwords . lexText' uncap) + "code" -> return (unwords . lexCode) "mixed" -> return (unwords . lexMixed) - _ -> throwCGIError 400 "Unknown lexer" ["Unknown lexer: "++name] + _ -> badRequest "Unknown lexer" name -------------------------------------------------------------------------------- -- * Haskell run-time functionality @@ -193,15 +216,15 @@ lexer = maybe (return id) lexerfun =<< getInput "lexer" pgfMain :: String -> PGF -> CGI CGIResult pgfMain command pgf = case command of - "parse" -> out =<< parse#lexer%input%cat%from%limit%trie + "parse" -> out =<< join (parse#input%cat%from%limit%trie) "complete" -> out =<< doComplete pgf # input % cat % from % limit "linearize" -> out =<< doLinearize pgf # tree % to "linearizeAll" -> out =<< doLinearizes pgf # tree % to "linearizeTable" -> out =<< doLinearizeTabular pgf # tree % to "random" -> cat >>= \c -> depth >>= \dp -> limit >>= \l -> to >>= \to -> liftIO (doRandom pgf c dp l to) >>= out "generate" -> out =<< doGenerate pgf # cat % depth % limit % to - "translate" -> out =<< trans#lexer%input%cat%from%to%limit%trie - "translategroup" -> out =<< transgroup#lexer%input%cat%from%to%limit + "translate" -> out =<< join (trans#input%cat%from%to%limit%trie) + "translategroup" -> out =<< join (transgroup#input%cat%from%to%limit) "grammar" -> out =<< doGrammar pgf # requestAcceptLanguage "abstrtree" -> outputGraphviz =<< abstrTree pgf # graphvizOptions % tree "alignment" -> outputGraphviz =<< alignment pgf # tree % to @@ -213,20 +236,31 @@ pgfMain command pgf = "browse" -> join $ doBrowse pgf # optId % cssClass % href % format "html" % getIncludePrintNames "external" -> do cmd <- getInput "external" doExternal cmd =<< input - _ -> throwCGIError 400 "Unknown command" ["Unknown command: " ++ show command] + _ -> badRequest "Unknown command" command where - parse lexer input = doParse pgf (lexer input) - trans lexer input = doTranslate pgf (lexer input) - transgroup lexer input = doTranslateGroup pgf (lexer input) + parse input cat from limit trie = + do lex <- mlexer from + return (doParse pgf (lex input) cat from limit trie) + trans input cat from to limit trie = + do lex <- mlexer from + return (doTranslate pgf (lex input) cat from to limit trie) + transgroup input cat from to limit = + do lex <- mlexer from + return (doTranslateGroup pgf (lex input) cat from to limit) + +-- mlexer _ = lexer + mlexer (Just lang) = ilexer (PGF.isInMorpho morpho) + where morpho = PGF.buildMorpho pgf lang tree :: CGI PGF.Tree tree = do ms <- getInput "tree" - s <- maybe (throwCGIError 400 "No tree given" ["No tree given"]) return ms - t <- maybe (throwCGIError 400 "Bad tree" ["tree: " ++ s]) return (PGF.readExpr s) - t <- either (\err -> throwCGIError 400 "Type incorrect tree" - ["tree: " ++ PGF.showExpr [] t - ,render (PP.text "error:" <+> PGF.ppTcError err) - ]) + s <- maybe (badRequest "No tree given" "") return ms + t <- maybe (badRequest "Bad tree" s) return (PGF.readExpr s) + t <- either (\err -> badRequest "Type incorrect tree" + (unlines $ + [PGF.showExpr [] t + ,render (PP.text "error:" <+> PGF.ppTcError err) + ])) (return . fst) (PGF.inferExpr pgf t) return t @@ -237,14 +271,14 @@ pgfMain command pgf = case mcat of Nothing -> return Nothing Just cat -> case PGF.readType cat of - Nothing -> throwCGIError 400 "Bad category" ["Bad category: " ++ cat] + Nothing -> badRequest "Bad category" cat Just typ -> return $ Just typ -- typecheck the category optId :: CGI (Maybe PGF.CId) optId = maybe (return Nothing) rd =<< getInput "id" where rd = maybe err (return . Just) . PGF.readCId - err = throwCGIError 400 "Bad identifier" [] + err = badRequest "Bad identifier" [] cssClass, href :: CGI (Maybe String) cssClass = getInput "css-class" @@ -276,9 +310,9 @@ pgfMain command pgf = readLang :: String -> CGI PGF.Language readLang l = case PGF.readLanguage l of - Nothing -> throwCGIError 400 "Bad language" ["Bad language: " ++ l] + Nothing -> badRequest "Bad language" l Just lang | lang `elem` PGF.languages pgf -> return lang - | otherwise -> throwCGIError 400 "Unknown language" ["Unknown language: " ++ l] + | otherwise -> badRequest "Unknown language" l -- * Request parameter access and related auxiliary functions @@ -316,15 +350,18 @@ toBool s = s `elem` ["","yes","true","True"] missing = badRequest "Missing parameter" errorMissingId = badRequest "Missing identifier" "" -badRequest msg extra = - throwCGIError 400 msg [msg ++(if null extra then "" else ": "++extra)] +badRequest = throw 400 +serverError = throw 500 + +throw code msg extra = + throwCGIError code msg [msg ++(if null extra then "" else ": "++extra)] format def = maybe def id # getInput "format" -- * Request implementations -- Hook for simple extensions of the PGF service -doExternal Nothing input = throwCGIError 400 "Unknown external command" ["Unknown external command"] +doExternal Nothing input = badRequest "Unknown external command" "" doExternal (Just cmd) input = do liftIO $ logError ("External command: "++cmd) cmds <- liftIO $ (fmap lines $ readFile "external_services") @@ -332,7 +369,7 @@ doExternal (Just cmd) input = liftIO $ logError ("External services: "++show cmds) if cmd `elem` cmds then ok else err where - err = throwCGIError 400 "Unknown external command" ["Unknown external command: "++cmd] + err = badRequest "Unknown external command" cmd ok = do let tmpfile1 = "external_input.txt" tmpfile2 = "external_output.txt"