modernized parser in EmbedAPI

This commit is contained in:
aarne
2007-09-05 14:05:42 +00:00
parent 22b4069803
commit f357082c27
15 changed files with 335 additions and 87 deletions

View File

@@ -15,7 +15,7 @@
module GF.Embed.EmbedAPI where
import GF.Compile.ShellState (ShellState,grammar2shellState,canModules,stateGrammarOfLang,abstract,grammar,firstStateGrammar,allLanguages,allCategories,stateOptions)
import GF.Compile.ShellState (ShellState,grammar2shellState,canModules,stateGrammarOfLang,abstract,grammar,firstStateGrammar,allLanguages,allCategories,stateOptions,firstAbsCat)
import GF.UseGrammar.Linear (linTree2string)
import GF.UseGrammar.GetTree (string2tree)
import GF.Embed.EmbedParsing (parseString)
@@ -53,8 +53,11 @@ file2grammar :: FilePath -> IO MultiGrammar
-- Type signatures of the embedded-grammar API.
-- NOTE(review): linearizeAll and parseAll are each listed twice below with
-- identical types; this looks like the removed and added sides of the diff
-- hunk shown without +/- markers -- confirm against the real file.

-- Linearize a tree in one language.
linearize :: MultiGrammar -> Language -> Tree -> String
-- Parse a string in one language, starting from the given category.
parse :: MultiGrammar -> Language -> Category -> String -> [Tree]
linearizeAll :: MultiGrammar -> Tree -> [String]
parseAll :: MultiGrammar -> Category -> String -> [[Tree]]
-- Linearize a tree once per language of the grammar.
linearizeAll :: MultiGrammar -> Tree -> [String]
-- Like linearizeAll, but each string is paired with its Language.
linearizeAllLang :: MultiGrammar -> Tree -> [(Language,String)]
-- Parse a string with several languages; one list of trees per language.
parseAll :: MultiGrammar -> Category -> String -> [[Tree]]
-- Like parseAll, but each list of trees is paired with its Language.
parseAllLang :: MultiGrammar -> Category -> String -> [(Language,[Tree])]
-- Read a tree from its string representation.
readTree :: MultiGrammar -> String -> Tree
-- Render a tree as a string.
showTree :: Tree -> String
@@ -62,6 +65,8 @@ showTree :: Tree -> String
-- Names of the languages in the grammar.
languages :: MultiGrammar -> [Language]
-- Names of the categories in the grammar.
categories :: MultiGrammar -> [Category]
-- A single category taken from the grammar (presumably its start category,
-- judging by the name).
startCat :: MultiGrammar -> Category
---------------------------------------------------
-- Implementation
---------------------------------------------------
@@ -84,15 +89,19 @@ linearize mgr lang =
-- Parse a string in language `lang` from category `cat`: run parseString,
-- pass its result through `errVal []` (presumably yielding [] when parsing
-- fails -- TODO confirm), and convert every resulting tree with tree2exp.
parse mgr lang cat =
map tree2exp .
errVal [] .
-- NOTE(review): the next two lines differ only in the options argument
-- (noOptions vs. the state grammar's own options). They look like the
-- removed and added sides of the diff; only one can belong to the real
-- definition.
parseString noOptions sgr cfcat
parseString (stateOptions sgr) sgr cfcat
where
-- State grammar selected for the requested language.
sgr = stateGrammarOfLang mgr (zIdent lang)
-- Category converted to a CF category, qualified by the abstract syntax name.
cfcat = string2CFCat abs cat
-- Printed name of the abstract syntax; calls `error` if `abstract mgr` is Nothing.
abs = maybe (error "no abstract syntax") prIdent $ abstract mgr
-- NOTE(review): two definitions of linearizeAll follow; they look like the
-- removed (list comprehension) and added (via linearizeAllLang) sides of the
-- diff. Both produce one linearization per entry of `languages mgr`.
linearizeAll mgr t = [linearize mgr lang t | lang <- languages mgr]
-- Drop the language tags from linearizeAllLang, keeping only the strings.
linearizeAll mgr = map snd . linearizeAllLang mgr
-- Linearize the tree in every language, pairing each language with its result.
linearizeAllLang mgr t = [(lang,linearize mgr lang t) | lang <- languages mgr]
-- NOTE(review): two definitions of parseAll follow; they look like the
-- removed and added sides of the diff. They are not equivalent: the first
-- keeps one (possibly empty) result list per language, while the second goes
-- through parseAllLang, which omits languages whose result list is empty.
parseAll mgr cat s = [parse mgr lang cat s | lang <- languages mgr]
-- Drop the language tags from parseAllLang, keeping only the tree lists.
parseAll mgr cat = map snd . parseAllLang mgr cat
-- Parse the string in every language and keep only the languages that
-- produced at least one tree, each paired with its trees.
parseAllLang mgr cat s =
[(lang,ts) | lang <- languages mgr, let ts = parse mgr lang cat s, not (null ts)]
-- Read a tree from a string using the first state grammar of `mgr`
-- (string2tree), then convert the result with tree2exp.
readTree mgr s = tree2exp $ string2tree (firstStateGrammar mgr) s
@@ -101,3 +110,5 @@ showTree t = prt_ t
-- Printed names (prt_) of all languages reported by allLanguages.
languages mgr = [prt_ l | l <- allLanguages mgr]
-- Printed names of all categories; allCategories yields pairs, of which only
-- the second component is used.
categories mgr = [prt_ c | (_,c) <- allCategories mgr]
-- Printed name of the second component of firstAbsCat, computed with
-- noOptions on the first state grammar.
startCat = prt_ . snd . firstAbsCat noOptions . firstStateGrammar

View File

@@ -33,8 +33,7 @@ import GF.Infra.Option
import GF.Compile.ShellState
import GF.Embed.EmbedCustom
import GF.CF.PPrCF (prCFTree)
import qualified GF.OldParsing.ParseCF as PCFOld -- OBSOLETE
import qualified GF.Parsing.GFC as New
-- import qualified GF.Parsing.GFC as New
@@ -55,83 +54,12 @@ parseStringMsg os sg cat s = do
return (ts,unlines ss)
-- Parse a string in the Check monad: tokenize it, run a context-free parser,
-- and hand the tokens and parser to tokens2trms.
-- NOTE(review): a second, shorter parseStringC with the same arguments
-- appears later in this diff; this longer body is presumably the removed
-- side -- confirm.
parseStringC :: Options -> StateGrammar -> CFCat -> String -> Check [Tree]
parseStringC opts0 sg cat s
-- The only guard is `otherwise`, so this branch is always taken.
| otherwise = do
-- Caller-supplied options combined with the grammar's own options.
let opts = unionOptions opts0 $ stateOptions sg
-- NOTE(review): cf and gr are bound here but not referenced in the visible
-- body (the parser line calls `stateCF sg` again directly).
cf = stateCF sg
gr = stateGrammarST sg
cn = cncId sg
-- Tokenizer chosen from the options, or the default for this grammar.
tok = customOrDefault opts useTokenizer customTokenizer sg
-- Parser is hard-wired to PCFOld.parse with "ibn"; the option-driven lookup
-- is left commented out at the end of the line.
parser = PCFOld.parse "ibn" (stateCF sg) cat -- customOrDefault opts useParser customParser sg cat
tokens2trms opts sg cn parser (tok s)
-- Run the given CF parser on the token list and pass its parse trees and its
-- info string on to trees2trms.
tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree]
tokens2trms opts sg cn parser toks = trees2trms opts sg cn toks trees info
where result = parser toks
-- Second component of the parser result; used by trees2trms as a message
-- string in its verbose warning.
info = snd result
trees = {- nub $ -} cfParseResults result -- peb 25/5-04: removed nub (O(n^2))
-- Convert raw context-free parse trees (ts0) into annotated trees inside the
-- Check monad, reporting problems as warnings or failures.
--   opts : options controlling raw / verbose / forgiving behaviour and limits
--   sg   : state grammar providing the grammar used by annotate
--   cn   : identifier passed to allLinsOfTree (bound from cncId by the caller)
--   as   : the input token list
--   ts0  : parse trees from the CF parser
--   info : parser message, shown only in verbose mode
trees2trms :: Options -> StateGrammar -> Ident -> [CFTok] -> [CFTree] -> String -> Check [Tree]
trees2trms opts sg cn as ts0 info = do
ts <- case () of
-- No CF parse at all: warn and continue with no trees.
_ | null ts0 -> checkWarn "No success in cf parsing" >> return []
-- rawParse option set: convert the trees directly with cf2trm0.
_ | raw -> do
ts1 <- return (map cf2trm0 ts0) ----- should not need annot
-- Two alternatives handed to `checks` (presumably tried in order -- TODO
-- confirm): annotate every tree, or warn with the printed raw CF trees and
-- return no trees.
checks [
mapM (checkErr . (annotate gr) . trExp) ts1 ---- complicated, often fails
,checkWarn (unlines ("Raw CF trees:":(map prCFTree ts0))) >> return []
]
-- Default path: post-process, annotate, then filter by linearization.
_ -> do
-- Consider at most `num` raw parses (999999 unless flagRawtrees is set).
let num = optIntOrN opts flagRawtrees 999999
let (ts01,rest) = splitAt num ts0
if null rest then return ()
else checkWarn ("Warning: only" +++ show num +++ "raw parses out of" +++
show (length ts0) +++
"considered; use -rawtrees=<Int> to see more"
)
(ts1,ss) <- checkErr $ mapErrN 1 postParse ts01
-- Fail with the collected message when post-processing left no trees.
if null ts1 then raise ss else return ()
ts2 <- mapM (checkErr . annotate gr . refreshMetas [] . trExp) ts1 ----
-- With forgiveParse set, the linearization check below is skipped.
if forgive then return ts2 else do
-- Keep only trees that have some linearization whose tokens are compatible
-- with the input tokens.
let tsss = [(t, allLinsOfTree gr cn t) | t <- ts2]
ps = [t | (t,ss) <- tsss,
any (compatToks as) (map str2cftoks ss)]
if null ps
then raise $ "Failure in morphology." ++
if verb
then "\nPossible corrections: " +++++
unlines (nub (map sstr (concatMap snd tsss)))
else ""
else return ps
-- In verbose mode, report the token list, unknown tokens and the parser info.
if verb
then checkWarn $ " the token list" +++ show as ++++ unknown as +++++ info
else return ()
-- Remove duplicate trees with nub, then apply optIntOrAll with flagNumber
-- (presumably a limit on the number of results -- TODO confirm).
return $ optIntOrAll opts flagNumber $ nub ts
where
gr = stateGrammarST sg
raw = oElem rawParse opts
verb = oElem beVerbose opts
forgive = oElem forgiveParse opts
-- Describe which TS tokens of the input match none of the grammar's words.
unknown ts = case filter noMatch [t | t@(TS _) <- ts] of
[] -> "where all words are known"
us -> "with the unknown tokens" +++ show us --- needs to be fixed for literals
terminals = map TS $ stateGrammarWords sg
noMatch t = all (not . compatTok t) terminals
--- too much type checking in building term info? return FullTerm to save work?
-- | raw parsing: so simple it is for a context-free CF grammar
--
-- Recursively converts a CF parse tree into an expression: the function
-- stored at the node becomes an atom, applied to the converted subtrees.
cf2trm0 :: CFTree -> C.Exp
cf2trm0 (CFTree (fun, (_, trees))) = mkAppAtom (cffun2trm fun) (map cf2trm0 trees)
where
-- Take the first component of the CFFun pair; the second is ignored.
cffun2trm (CFFun (fun,_)) = fun
-- Left-nested application of a head expression to a list of arguments.
mkApp = foldl C.EApp
mkAppAtom a = mkApp (C.EAtom a)
-- Parse a string in the Check monad: tokenize it, call New.parse with a fixed
-- algorithm and strategy, then annotate every resulting tree.
-- NOTE(review): this shorter body follows a longer parseStringC with the same
-- arguments in the diff and is presumably the added side -- confirm.
parseStringC opts0 sg cat s = do
-- Caller-supplied options combined with the grammar's own options.
let opts = unionOptions opts0 $ stateOptions sg
algorithm = "f" -- default algorithm: FCFG
strategy = "bottomup"
-- Tokenizer chosen from the options, or the default for this grammar.
tokenizer = customOrDefault opts useTokenizer customTokenizer sg
toks = tokenizer s
ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks
-- Annotate each tree (after refreshMetas []) and combine the per-tree
-- results with allChecks before lifting into Check.
checkErr $ allChecks $ map (annotate (stateGrammarST sg) . refreshMetas []) ts