initial check for unknown words in parsing

This commit is contained in:
aarne
2007-06-11 07:49:30 +00:00
parent 84c08b1401
commit 265f29afb6
11 changed files with 33 additions and 9 deletions

View File

@@ -75,6 +75,7 @@ import GF.Infra.UseIO
import GF.Data.Zipper import GF.Data.Zipper
import Data.List (nub) import Data.List (nub)
import Data.Char (toLower)
import Data.Maybe (fromMaybe) import Data.Maybe (fromMaybe)
import Control.Monad (liftM) import Control.Monad (liftM)
import System (system) import System (system)
@@ -314,9 +315,16 @@ morphoAnalyse opts gr
mo = morpho gr mo = morpho gr
isKnownWord :: GFGrammar -> String -> Bool isKnownWord :: GFGrammar -> String -> Bool
isKnownWord gr s = case morphoAnalyse (options [beShort]) gr s of isKnownWord gr s = GF.UseGrammar.Morphology.isKnownWord (morpho gr) s
a@(_:_:_) -> last (init a) /= '*' -- [word *]
_ -> False unknownTokens :: GFGrammar -> [CFTok] -> [String]
unknownTokens gr ts =
[w | TC w <- ts, unk w && unk (uncap w)] ++ [w | TS w <- ts, unk w]
where
unk w = not $ GF.API.isKnownWord gr w
uncap (c:cs) = toLower c : cs
uncap s = s
{- {-
prExpXML :: StateGrammar -> Term -> [String] prExpXML :: StateGrammar -> Term -> [String]
@@ -397,8 +405,11 @@ optTransfer opts g = case getOptVal opts transferFun of
_ -> id _ -> id
-} -}
optTokenizerResult :: Options -> GFGrammar -> String -> [[CFTok]]
optTokenizerResult opts gr = customOrDefault opts useTokenizer customTokenizer gr
optTokenizer :: Options -> GFGrammar -> String -> String optTokenizer :: Options -> GFGrammar -> String -> String
optTokenizer opts gr = show . customOrDefault opts useTokenizer customTokenizer gr optTokenizer opts gr = show . optTokenizerResult opts gr
-- performs UTF8 if the language does not have flag coding=utf8; replaces name*U -- performs UTF8 if the language does not have flag coding=utf8; replaces name*U

View File

@@ -254,7 +254,7 @@ execC co@(comm, opts0) sa@(sh@(st,(h,_,_,_)),a) = checkOptions st co >> case com
| otherwise -> parse $ prCommandArg a | otherwise -> parse $ prCommandArg a
where where
parse x = do parse x = do
warnDiscont opts warnDiscont opts
let p = optParseArgErrMsg opts gro x let p = optParseArgErrMsg opts gro x
case p of case p of
Ok (ts,msg) Ok (ts,msg)

View File

@@ -29,6 +29,7 @@ import GF.Grammar.TypeCheck
import GF.Grammar.Values import GF.Grammar.Values
--import CFMethod --import CFMethod
import GF.UseGrammar.Tokenize import GF.UseGrammar.Tokenize
import GF.UseGrammar.Morphology (isKnownWord)
import GF.CF.Profile import GF.CF.Profile
import GF.Infra.Option import GF.Infra.Option
import GF.UseGrammar.Custom import GF.UseGrammar.Custom
@@ -41,6 +42,7 @@ import qualified GF.Parsing.GFC as New
import GF.Data.Operations import GF.Data.Operations
import Data.List (nub,sortBy) import Data.List (nub,sortBy)
import Data.Char (toLower)
import Control.Monad (liftM) import Control.Monad (liftM)
-- AR 26/1/2000 -- 8/4 -- 28/1/2001 -- 9/12/2002 -- AR 26/1/2000 -- 8/4 -- 28/1/2001 -- 9/12/2002
@@ -82,10 +84,21 @@ parseStringC opts0 sg cat s
toks = case tokenizer s of toks = case tokenizer s of
t:_ -> t t:_ -> t
_ -> [] ---- no support for undet. tok. _ -> [] ---- no support for undet. tok.
ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks unknowns =
ts' <- checkErr $ [w | TC w <- toks, unk w && unk (uncap w)] ++ [w | TS w <- toks, unk w]
allChecks $ map (annotate (stateGrammarST sg) . refreshMetas []) ts where
return $ optIntOrAll opts flagNumber ts' unk w = not $ isKnownWord (morpho sg) w
uncap (c:cs) = toLower c : cs
uncap s = s
case unknowns of
_:_ -> fail $ "Unknown words:" +++ unwords unknowns
_ -> do
ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks
ts' <- checkErr $
allChecks $ map (annotate (stateGrammarST sg) . refreshMetas []) ts
return $ optIntOrAll opts flagNumber ts'
tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree] tokens2trms :: Options ->StateGrammar ->Ident -> CFParser -> [CFTok] -> Check [Tree]