From 265f29afb69e1ad8ef14f927dd24c2b6f4e464f4 Mon Sep 17 00:00:00 2001 From: aarne Date: Mon, 11 Jun 2007 07:49:30 +0000 Subject: [PATCH] initial check for unknown words in parsing --- examples/regulus/{ => toy0}/Toy0.gf | 0 examples/regulus/{ => toy0}/Toy0Eng.gf | 0 examples/regulus/{ => toy0}/Toy0Fin.gf | 0 examples/regulus/{ => toy0}/Toy0Fre.gf | 0 examples/regulus/{ => toy0}/Toy0Ger.gf | 0 examples/regulus/{ => toy0}/Toy0I.gf | 0 examples/regulus/{ => toy0}/Toy0Swe.gf | 0 examples/regulus/{ => toy0}/Toy0_eng.gf | 0 src/GF/API.hs | 19 +++++++++++++++---- src/GF/Shell.hs | 2 +- src/GF/UseGrammar/Parsing.hs | 21 +++++++++++++++++---- 11 files changed, 33 insertions(+), 9 deletions(-) rename examples/regulus/{ => toy0}/Toy0.gf (100%) rename examples/regulus/{ => toy0}/Toy0Eng.gf (100%) rename examples/regulus/{ => toy0}/Toy0Fin.gf (100%) rename examples/regulus/{ => toy0}/Toy0Fre.gf (100%) rename examples/regulus/{ => toy0}/Toy0Ger.gf (100%) rename examples/regulus/{ => toy0}/Toy0I.gf (100%) rename examples/regulus/{ => toy0}/Toy0Swe.gf (100%) rename examples/regulus/{ => toy0}/Toy0_eng.gf (100%) diff --git a/examples/regulus/Toy0.gf b/examples/regulus/toy0/Toy0.gf similarity index 100% rename from examples/regulus/Toy0.gf rename to examples/regulus/toy0/Toy0.gf diff --git a/examples/regulus/Toy0Eng.gf b/examples/regulus/toy0/Toy0Eng.gf similarity index 100% rename from examples/regulus/Toy0Eng.gf rename to examples/regulus/toy0/Toy0Eng.gf diff --git a/examples/regulus/Toy0Fin.gf b/examples/regulus/toy0/Toy0Fin.gf similarity index 100% rename from examples/regulus/Toy0Fin.gf rename to examples/regulus/toy0/Toy0Fin.gf diff --git a/examples/regulus/Toy0Fre.gf b/examples/regulus/toy0/Toy0Fre.gf similarity index 100% rename from examples/regulus/Toy0Fre.gf rename to examples/regulus/toy0/Toy0Fre.gf diff --git a/examples/regulus/Toy0Ger.gf b/examples/regulus/toy0/Toy0Ger.gf similarity index 100% rename from examples/regulus/Toy0Ger.gf rename to examples/regulus/toy0/Toy0Ger.gf diff --git a/examples/regulus/Toy0I.gf b/examples/regulus/toy0/Toy0I.gf similarity index 100% rename from examples/regulus/Toy0I.gf rename to examples/regulus/toy0/Toy0I.gf diff --git a/examples/regulus/Toy0Swe.gf b/examples/regulus/toy0/Toy0Swe.gf similarity index 100% rename from examples/regulus/Toy0Swe.gf rename to examples/regulus/toy0/Toy0Swe.gf diff --git a/examples/regulus/Toy0_eng.gf b/examples/regulus/toy0/Toy0_eng.gf similarity index 100% rename from examples/regulus/Toy0_eng.gf rename to examples/regulus/toy0/Toy0_eng.gf diff --git a/src/GF/API.hs b/src/GF/API.hs index 762fa372f..3efd81472 100644 --- a/src/GF/API.hs +++ b/src/GF/API.hs @@ -75,6 +75,7 @@ import GF.Infra.UseIO import GF.Data.Zipper import Data.List (nub) +import Data.Char (toLower) import Data.Maybe (fromMaybe) import Control.Monad (liftM) import System (system) @@ -314,9 +315,16 @@ morphoAnalyse opts gr mo = morpho gr isKnownWord :: GFGrammar -> String -> Bool -isKnownWord gr s = case morphoAnalyse (options [beShort]) gr s of - a@(_:_:_) -> last (init a) /= '*' -- [word *] - _ -> False +isKnownWord gr s = GF.UseGrammar.Morphology.isKnownWord (morpho gr) s + +unknownTokens :: GFGrammar -> [CFTok] -> [String] +unknownTokens gr ts = + [w | TC w <- ts, unk w && unk (uncap w)] ++ [w | TS w <- ts, unk w] + where + unk w = not $ GF.API.isKnownWord gr w + uncap (c:cs) = toLower c : cs + uncap s = s + {- prExpXML :: StateGrammar -> Term -> [String] @@ -397,8 +405,11 @@ optTransfer opts g = case getOptVal opts transferFun of _ -> id -} +optTokenizerResult :: Options -> GFGrammar -> String -> [[CFTok]] +optTokenizerResult opts gr = customOrDefault opts useTokenizer customTokenizer gr + optTokenizer :: Options -> GFGrammar -> String -> String -optTokenizer opts gr = show . customOrDefault opts useTokenizer customTokenizer gr +optTokenizer opts gr = show . optTokenizerResult opts gr -- performs UTF8 if the language does not have flag coding=utf8; replaces name*U diff --git a/src/GF/Shell.hs b/src/GF/Shell.hs index 29a4b6c23..dd8267a91 100644 --- a/src/GF/Shell.hs +++ b/src/GF/Shell.hs @@ -254,7 +254,7 @@ execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case com | otherwise -> parse $ prCommandArg a where parse x = do - warnDiscont opts + warnDiscont opts let p = optParseArgErrMsg opts gro x case p of Ok (ts,msg) diff --git a/src/GF/UseGrammar/Parsing.hs b/src/GF/UseGrammar/Parsing.hs index 65ed26863..599268b1d 100644 --- a/src/GF/UseGrammar/Parsing.hs +++ b/src/GF/UseGrammar/Parsing.hs @@ -29,6 +29,7 @@ import GF.Grammar.TypeCheck import GF.Grammar.Values --import CFMethod import GF.UseGrammar.Tokenize +import GF.UseGrammar.Morphology (isKnownWord) import GF.CF.Profile import GF.Infra.Option import GF.UseGrammar.Custom @@ -41,6 +42,7 @@ import qualified GF.Parsing.GFC as New import GF.Data.Operations import Data.List (nub,sortBy) +import Data.Char (toLower) import Control.Monad (liftM) -- AR 26/1/2000 -- 8/4 -- 28/1/2001 -- 9/12/2002 @@ -82,10 +84,21 @@ parseStringC opts0 sg cat s toks = case tokenizer s of t:_ -> t _ -> [] ---- no support for undet. tok. - ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks - ts' <- checkErr $ - allChecks $ map (annotate (stateGrammarST sg) . refreshMetas []) ts - return $ optIntOrAll opts flagNumber ts' + unknowns = + [w | TC w <- toks, unk w && unk (uncap w)] ++ [w | TS w <- toks, unk w] + where + unk w = not $ isKnownWord (morpho sg) w + uncap (c:cs) = toLower c : cs + uncap s = s + + case unknowns of + _:_ -> fail $ "Unknown words:" +++ unwords unknowns + _ -> do + + ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks + ts' <- checkErr $ + allChecks $ map (annotate (stateGrammarST sg) . refreshMetas []) ts + return $ optIntOrAll opts flagNumber ts' tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree]