mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-05-21 00:52:51 -06:00
initial check for unknown words in parsing
This commit is contained in:
@@ -75,6 +75,7 @@ import GF.Infra.UseIO
|
|||||||
import GF.Data.Zipper
|
import GF.Data.Zipper
|
||||||
|
|
||||||
import Data.List (nub)
|
import Data.List (nub)
|
||||||
|
import Data.Char (toLower)
|
||||||
import Data.Maybe (fromMaybe)
|
import Data.Maybe (fromMaybe)
|
||||||
import Control.Monad (liftM)
|
import Control.Monad (liftM)
|
||||||
import System (system)
|
import System (system)
|
||||||
@@ -314,9 +315,16 @@ morphoAnalyse opts gr
|
|||||||
mo = morpho gr
|
mo = morpho gr
|
||||||
|
|
||||||
isKnownWord :: GFGrammar -> String -> Bool
|
isKnownWord :: GFGrammar -> String -> Bool
|
||||||
isKnownWord gr s = case morphoAnalyse (options [beShort]) gr s of
|
isKnownWord gr s = GF.UseGrammar.Morphology.isKnownWord (morpho gr) s
|
||||||
a@(_:_:_) -> last (init a) /= '*' -- [word *]
|
|
||||||
_ -> False
|
unknownTokens :: GFGrammar -> [CFTok] -> [String]
|
||||||
|
unknownTokens gr ts =
|
||||||
|
[w | TC w <- ts, unk w && unk (uncap w)] ++ [w | TS w <- ts, unk w]
|
||||||
|
where
|
||||||
|
unk w = not $ GF.API.isKnownWord gr w
|
||||||
|
uncap (c:cs) = toLower c : cs
|
||||||
|
uncap s = s
|
||||||
|
|
||||||
|
|
||||||
{-
|
{-
|
||||||
prExpXML :: StateGrammar -> Term -> [String]
|
prExpXML :: StateGrammar -> Term -> [String]
|
||||||
@@ -397,8 +405,11 @@ optTransfer opts g = case getOptVal opts transferFun of
|
|||||||
_ -> id
|
_ -> id
|
||||||
-}
|
-}
|
||||||
|
|
||||||
|
optTokenizerResult :: Options -> GFGrammar -> String -> [[CFTok]]
|
||||||
|
optTokenizerResult opts gr = customOrDefault opts useTokenizer customTokenizer gr
|
||||||
|
|
||||||
optTokenizer :: Options -> GFGrammar -> String -> String
|
optTokenizer :: Options -> GFGrammar -> String -> String
|
||||||
optTokenizer opts gr = show . customOrDefault opts useTokenizer customTokenizer gr
|
optTokenizer opts gr = show . optTokenizerResult opts gr
|
||||||
|
|
||||||
-- performs UTF8 if the language does not have flag coding=utf8; replaces name*U
|
-- performs UTF8 if the language does not have flag coding=utf8; replaces name*U
|
||||||
|
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ import GF.Grammar.TypeCheck
|
|||||||
import GF.Grammar.Values
|
import GF.Grammar.Values
|
||||||
--import CFMethod
|
--import CFMethod
|
||||||
import GF.UseGrammar.Tokenize
|
import GF.UseGrammar.Tokenize
|
||||||
|
import GF.UseGrammar.Morphology (isKnownWord)
|
||||||
import GF.CF.Profile
|
import GF.CF.Profile
|
||||||
import GF.Infra.Option
|
import GF.Infra.Option
|
||||||
import GF.UseGrammar.Custom
|
import GF.UseGrammar.Custom
|
||||||
@@ -41,6 +42,7 @@ import qualified GF.Parsing.GFC as New
|
|||||||
import GF.Data.Operations
|
import GF.Data.Operations
|
||||||
|
|
||||||
import Data.List (nub,sortBy)
|
import Data.List (nub,sortBy)
|
||||||
|
import Data.Char (toLower)
|
||||||
import Control.Monad (liftM)
|
import Control.Monad (liftM)
|
||||||
|
|
||||||
-- AR 26/1/2000 -- 8/4 -- 28/1/2001 -- 9/12/2002
|
-- AR 26/1/2000 -- 8/4 -- 28/1/2001 -- 9/12/2002
|
||||||
@@ -82,6 +84,17 @@ parseStringC opts0 sg cat s
|
|||||||
toks = case tokenizer s of
|
toks = case tokenizer s of
|
||||||
t:_ -> t
|
t:_ -> t
|
||||||
_ -> [] ---- no support for undet. tok.
|
_ -> [] ---- no support for undet. tok.
|
||||||
|
unknowns =
|
||||||
|
[w | TC w <- toks, unk w && unk (uncap w)] ++ [w | TS w <- toks, unk w]
|
||||||
|
where
|
||||||
|
unk w = not $ isKnownWord (morpho sg) w
|
||||||
|
uncap (c:cs) = toLower c : cs
|
||||||
|
uncap s = s
|
||||||
|
|
||||||
|
case unknowns of
|
||||||
|
_:_ -> fail $ "Unknown words:" +++ unwords unknowns
|
||||||
|
_ -> do
|
||||||
|
|
||||||
ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks
|
ts <- checkErr $ New.parse algorithm strategy (pInfo sg) (absId sg) cat toks
|
||||||
ts' <- checkErr $
|
ts' <- checkErr $
|
||||||
allChecks $ map (annotate (stateGrammarST sg) . refreshMetas []) ts
|
allChecks $ map (annotate (stateGrammarST sg) . refreshMetas []) ts
|
||||||
|
|||||||
Reference in New Issue
Block a user