From af4bf660024928da20b3a1e004d347d6bc0647c4 Mon Sep 17 00:00:00 2001 From: aarne Date: Tue, 18 Nov 2003 15:30:08 +0000 Subject: [PATCH] Using trie more. --- src/GF/CF/CanonToCF.hs | 17 ++++---- src/GF/Canon/CMacros.hs | 2 +- src/GF/Canon/Look.hs | 5 ++- src/GF/Compile/ShellState.hs | 2 +- src/GF/Data/Glue.hs | 11 +++--- src/GF/Data/Trie2.hs | 69 +++++++++++++++++++++++++++++---- src/GF/UseGrammar/Custom.hs | 1 + src/GF/UseGrammar/Morphology.hs | 56 ++++++++------------------ src/Today.hs | 2 +- 9 files changed, 99 insertions(+), 66 deletions(-) diff --git a/src/GF/CF/CanonToCF.hs b/src/GF/CF/CanonToCF.hs index 0950d6244..430ccbbac 100644 --- a/src/GF/CF/CanonToCF.hs +++ b/src/GF/CF/CanonToCF.hs @@ -12,6 +12,7 @@ import qualified Modules as M import CF import CFIdent import Morphology +import Trie2 import List (nub,partition) import Monad @@ -152,28 +153,26 @@ mkCFPredef :: Options -> [CFRuleGroup] -> ([CFRuleGroup],CFPredef) mkCFPredef opts rules = (ruls, \s -> preds0 s ++ look s) where (ruls,preds) = if oElem lexerByNeed opts -- option -cflexer then predefLexer rules - else (rules,NT) + else (rules,emptyTrie) preds0 s = [(cat, metaCFFun) | TM _ _ <- [s], cat <- cats] ++ [(cat, varCFFun x) | TV x <- [s], cat <- cats] ++ [(cfCatString, stringCFFun t) | TL t <- [s]] ++ [(cfCatInt, intCFFun t) | TI t <- [s]] cats = map fst rules - look s = errVal [] $ liftM concat $ - mapM (flip justLookupTree preds . tS) $ wordsCFTok s --- for TC tokens + look = concatMap snd . map (trieLookup preds) . wordsCFTok --- for TC tokens ---- TODO: use trie instead of bintree; integrate with morphology -predefLexer :: [CFRuleGroup] -> ([CFRuleGroup],BinTree (CFTok,[(CFCat, CFFun)])) -predefLexer groups = (reverse ruls, sorted2tree $ sortAssocs preds) where +--- TODO: integrate with morphology +--- predefLexer :: [CFRuleGroup] -> ([CFRuleGroup],BinTree (CFTok,[(CFCat, CFFun)])) +predefLexer groups = (reverse ruls, tcompile preds) where (ruls,preds) = foldr mkOne ([],[]) groups mkOne group@(cat,rules) (rs,ps) = (rule:rs,pre ++ ps) where (rule,pre) = case partition isLexical rules of ([],_) -> (group,[]) - (ls,rest) -> ((cat,rest), concatMap mkLexRule ls) --- useLexRule cat : rest + (ls,rest) -> ((cat,rest), concatMap mkLexRule ls) isLexical (f,(c,its)) = case its of [CFTerm (RegAlts ws)] -> True _ -> False --- useLexRule cat = (dummyCFFun,(cat,[CFNonterm (lexCFCat cat)])) -- not needed mkLexRule r = case r of - (fun,(cat,[CFTerm (RegAlts ws)])) -> [(tS w, (cat,fun)) | w <- ws] + (fun,(cat,[CFTerm (RegAlts ws)])) -> [(w, [(cat,fun)]) | w <- ws] _ -> [] diff --git a/src/GF/Canon/CMacros.hs b/src/GF/Canon/CMacros.hs index 647cf9600..85a465871 100644 --- a/src/GF/Canon/CMacros.hs +++ b/src/GF/Canon/CMacros.hs @@ -161,7 +161,7 @@ wordsInTerm trm = filter (not . null) $ case trm of T _ cs -> concat [wo t | Cas _ t <- cs] C s t -> wo s ++ wo t FV ts -> concatMap wo ts - K (KP ss vs) -> ss ++ concat [s ++ t | Var s t <- vs] + K (KP ss vs) -> ss ++ concat [s | Var s _ <- vs] P t _ -> wo t --- not needed ? _ -> [] where wo = wordsInTerm diff --git a/src/GF/Canon/Look.hs b/src/GF/Canon/Look.hs index 1f55e4cdb..228a43f3c 100644 --- a/src/GF/Canon/Look.hs +++ b/src/GF/Canon/Look.hs @@ -166,4 +166,7 @@ ccompute cnc = comp [] noVar v = case v of LI _ -> False R rs -> all noVar [t | Ass _ t <- rs] - _ -> True --- other cases? + Con _ ts -> all noVar ts + FV ts -> all noVar ts + S x y -> noVar x && noVar y + _ -> True --- other cases that can be values to pattern match? diff --git a/src/GF/Compile/ShellState.hs b/src/GF/Compile/ShellState.hs index 17ea2cc9a..138630c3a 100644 --- a/src/GF/Compile/ShellState.hs +++ b/src/GF/Compile/ShellState.hs @@ -88,7 +88,7 @@ stateGrammarST = grammar stateCF = cf stateMorpho = morpho stateOptions = loptions -stateGrammarWords = map fst . tree2list . stateMorpho +stateGrammarWords = allMorphoWords . stateMorpho cncModuleIdST = stateGrammarST diff --git a/src/GF/Data/Glue.hs b/src/GF/Data/Glue.hs index 247224075..a533a6c2f 100644 --- a/src/GF/Data/Glue.hs +++ b/src/GF/Data/Glue.hs @@ -1,19 +1,18 @@ module Glue where -import Trie +import Trie2 import Operations import List -------- AR 8/11/2003, using Markus Forsberg's implementation of Huet's unglue -tcompileSimple :: [String] -> Trie -tcompileSimple ss = tcompile [(s,[(atWP,s)]) | s <- ss] - -decomposeSimple :: Trie -> String -> Err [String] +decomposeSimple :: Trie Char a -> [Char] -> Err [[Char]] decomposeSimple t s = do let ss = map (decompose t) $ words s if any null ss then Bad "unknown word in input" else return $ concat [intersperse "&+" ws | ws <- ss] -exTrie = tcompileSimple $ words "ett två tre tjugo trettio hundra tusen" +exTrie = tcompile (zip ws ws) where + ws = words "ett två tre tjugo trettio hundra tusen" + diff --git a/src/GF/Data/Trie2.hs b/src/GF/Data/Trie2.hs index e15c53e8c..2ccc05a9f 100644 --- a/src/GF/Data/Trie2.hs +++ b/src/GF/Data/Trie2.hs @@ -8,20 +8,25 @@ module Trie2 ( tcompile, collapse, Trie, - trieLookup + trieLookup, + decompose, + --- Attr, atW, atP, atWP, + emptyTrie ) where import Map +import List newtype TrieT a b = TrieT ([(a,TrieT a b)],[b]) newtype Trie a b = Trie (Map a (Trie a b), [b]) -emptyTrie = TrieT ([],[]) +emptyTrieT = TrieT ([],[]) +emptyTrie = Trie (empty,[]) -optimize :: Ord a => TrieT a b -> Trie a b +optimize :: (Ord a,Eq b) => TrieT a b -> Trie a b optimize (TrieT (xs,res)) = Trie ([(c,optimize t) | (c,t) <- xs] |->+ empty, - res) + nub res) --- nub by AR collapse :: Ord a => Trie a b -> [([a],[b])] collapse trie = collapse' trie [] @@ -31,8 +36,8 @@ collapse trie = collapse' trie [] collapse' (Trie (map,[])) s = concat [ collapse' trie (c:s) | (c,trie) <- flatten map] -tcompile :: Ord a => [([a],[b])] -> Trie a b -tcompile xs = optimize $ build xs emptyTrie +tcompile :: (Ord a,Eq b) => [([a],[b])] -> Trie a b +tcompile xs = optimize $ build xs emptyTrieT build :: Ord a => [([a],[b])] -> TrieT a b -> TrieT a b build [] trie = trie @@ -41,7 +46,7 @@ build (x:xs) trie = build xs (insert x trie) insert ([],ys) (TrieT (xs,res)) = TrieT (xs,ys ++ res) insert ((s:ss),ys) (TrieT (xs,res)) = case (span (\(s',_) -> s' /= s) xs) of - (xs,[]) -> TrieT (((s,(insert (ss,ys) emptyTrie)):xs),res) + (xs,[]) -> TrieT (((s,(insert (ss,ys) emptyTrieT)):xs),res) (xs,(y,trie):zs) -> TrieT (xs ++ ((y,insert (ss,ys) trie):zs),res) trieLookup :: Ord a => Trie a b -> [a] -> ([a],[b]) @@ -53,3 +58,53 @@ apply (Trie (map,_)) (s:ss) inp = case map ! s of Just trie -> apply trie ss inp Nothing -> (inp,[]) + +----------------------------- +-- from Trie for strings; simplified for GF by making binding always possible (AR) + +decompose :: Ord a => Trie a b -> [a] -> [[a]] +decompose trie sentence = backtrack [(sentence,[])] trie + +react :: Ord a => [a] -> [[a]] -> [([a],[[a]])] -> + [a] -> Trie a b -> Trie a b -> [[a]] +-- String -> [String] -> [(String,[String])] -> String -> Trie -> Trie -> [String] +react input output back occ (Trie (arcs,res)) init = + case res of -- Accept = non-empty res. + [] -> continue back + _ -> let pushout = (occ:output) + in case input of + [] -> reverse $ map reverse pushout + _ -> let pushback = ((input,pushout):back) + in continue pushback + where continue cont = case input of + [] -> backtrack cont init + (l:rest) -> case arcs ! l of + Just trie -> + react rest output cont (l:occ) trie init + Nothing -> backtrack cont init + +backtrack :: Ord a => [([a],[[a]])] -> Trie a b -> [[a]] +backtrack [] _ = [] +backtrack ((input,output):back) trie + = react input output back [] trie trie + + +{- so this is not needed from the original +type Attr = Int + +atW, atP, atWP :: Attr +(atW,atP,atWP) = (0,1,2) + +decompose :: Ord a => Trie a (Int,b) -> [a] -> [[a]] +decompose trie sentence = legal trie $ backtrack [(sentence,[])] trie + +-- The function legal checks if the decomposition is in fact a possible one. + +legal :: Ord a => Trie a (Int,b) -> [[a]] -> [[a]] +legal _ [] = [] +legal trie input = if (test (map ((map fst).snd.(trieLookup trie)) input)) then input else [] + where + test [] = False + test [xs] = elem atW xs || elem atWP xs + test (xs:xss) = (elem atP xs || elem atWP xs) && test xss +-} diff --git a/src/GF/UseGrammar/Custom.hs b/src/GF/UseGrammar/Custom.hs index 10446413a..64cb29680 100644 --- a/src/GF/UseGrammar/Custom.hs +++ b/src/GF/UseGrammar/Custom.hs @@ -147,6 +147,7 @@ customGrammarPrinter = ,(strCI "cf", prCF . stateCF) ,(strCI "lbnf", prLBNF . stateCF) ,(strCI "morpho", prMorpho . stateMorpho) + ,(strCI "fullform",prFullForm . stateMorpho) ,(strCI "opts", prOpts . stateOptions) ,(strCI "words", unwords . stateGrammarWords) {- ---- diff --git a/src/GF/UseGrammar/Morphology.hs b/src/GF/UseGrammar/Morphology.hs index c8f00615a..9d9371c3c 100644 --- a/src/GF/UseGrammar/Morphology.hs +++ b/src/GF/UseGrammar/Morphology.hs @@ -15,40 +15,33 @@ import Glue import Char import List (sortBy, intersperse) import Monad (liftM) +import Trie2 -- construct a morphological analyser from a GF grammar. AR 11/4/2001 --- we have found the binary search tree sorted by word forms more efficient +-- we first found the binary search tree sorted by word forms more efficient -- than a trie, at least for grammars with 7000 word forms +-- (18/11/2003) but this may change since we have to use a trie +-- for decompositions and also want to use it in the parser -type Morpho = BinTree (String,[String]) +type Morpho = Trie Char String -emptyMorpho = NT +emptyMorpho = emptyTrie --- with literals appMorpho :: Morpho -> String -> (String,[String]) -appMorpho m s = (s, ps ++ ms) where - ms = case lookupTree id s m of - Ok vs -> vs - _ -> [] - ps = [] ---- case lookupLiteral s of - ---- Ok (t,_) -> [tagPrt t] - ---- _ -> [] +appMorpho = appMorphoOnly +---- add lookup for literals -- without literals appMorphoOnly :: Morpho -> String -> (String,[String]) -appMorphoOnly m s = (s, ms) where - ms = case lookupTree id s m of - Ok vs -> vs - _ -> [] +appMorphoOnly m s = trieLookup m s -- recognize word, exluding literals isKnownWord :: Morpho -> String -> Bool isKnownWord mo = not . null . snd . appMorphoOnly mo mkMorpho :: CanonGrammar -> Ident -> Morpho ----- mkMorpho gr = emptyMorpho ---- -mkMorpho gr a = mkMorphoTree $ concat $ map mkOne $ allItems where +mkMorpho gr a = tcompile $ concatMap mkOne $ allItems where mkOne (Left (fun,c)) = map (prOne fun c) $ allLins fun mkOne (Right (fun,_)) = map (prSyn fun) $ allSyns fun @@ -58,14 +51,14 @@ mkMorpho gr a = mkMorphoTree $ concat $ map mkOne $ allItems where ts <- allLinsOfFun gr (CIQ a f) ss <- mapM (mapPairsM (mapPairsM (return . wordsInTerm))) ts return [(p,s) | (p,fs) <- concat $ map snd $ concat ss, s <- fs] - prOne (_,f) c (ps,s) = (s, prt f +++ tagPrt c ++ concat (map prt_ ps)) + prOne (_,f) c (ps,s) = (s, [prt f +++ tagPrt c +++ unwords (map prt_ ps)]) -- gather syncategorematic words allSyns fun@(m,f) = errVal [] $ do tss <- allLinsOfFun gr (CIQ a f) let ss = [s | ts <- tss, (_,fs) <- ts, (_,s) <- fs] return $ concat $ map wordsInTerm ss - prSyn f s = (s, "+" ++ tagPrt f) + prSyn f s = (s, ["+" ++ tagPrt f]) -- all words, Left from lexical rules and Right syncategorematic allItems = [lexRole t (f,c) | (f,c,t) <- allFuns] where @@ -77,7 +70,7 @@ mkMorpho gr a = mkMorphoTree $ concat $ map mkOne $ allItems where -- printing full-form lexicon and results prMorpho :: Morpho -> String -prMorpho = unlines . map prMorphoAnalysis . tree2list +prMorpho = unlines . map prMorphoAnalysis . collapse prMorphoAnalysis :: (String,[String]) -> String prMorphoAnalysis (w,fs) = unlines (w:fs) @@ -92,7 +85,7 @@ tagPrt (m,c) = "+" ++ prt c --- module name -- print all words recognized allMorphoWords :: Morpho -> [String] -allMorphoWords = map fst . tree2list +allMorphoWords = map fst . collapse -- analyse running text and show results either in short form or on separate lines morphoTextShort mo = unwords . map (prMorphoAnalysisShort . appMorpho mo) . words @@ -100,7 +93,7 @@ morphoText mo = unlines . map (('\n':) . prMorphoAnalysis . appMorpho mo) . word -- format used in the Italian Verb Engine prFullForm :: Morpho -> String -prFullForm = unlines . map prOne . tree2list where +prFullForm = unlines . map prOne . collapse where prOne (s,ps) = s ++ " : " ++ unwords (intersperse "/" ps) -- using Huet's unglueing method to find word boundaries @@ -109,21 +102,4 @@ prFullForm = unlines . map prOne . tree2list where ---- Moreover, we should specify the cases in which this happens - not all words decomposeWords :: Morpho -> String -> [String] -decomposeWords mo s = errVal (words s) $ - decomposeSimple (tcompileSimple (map fst $ tree2list mo)) s - --- auxiliaries - -mkMorphoTree :: (Ord a, Eq b) => [(a,b)] -> BinTree (a,[b]) -mkMorphoTree = sorted2tree . sortAssocs - -sortAssocs :: (Ord a, Eq b) => [(a,b)] -> [(a,[b])] -sortAssocs = arrange . sortBy (\ (x,_) (y,_) -> compare x y) where - arrange ((x,v):xvs) = arr x [v] xvs - arrange [] = [] - arr y vs xs = case xs of - (x,v):xvs -> if x==y then arr y vvs xvs else (y,vs) : arr x [v] xvs - where vvs = if elem v vs then vs else (v:vs) - _ -> [(y,vs)] - - +decomposeWords mo s = errVal (words s) $ decomposeSimple mo s diff --git a/src/Today.hs b/src/Today.hs index 7d6ec50b2..00d8f282c 100644 --- a/src/Today.hs +++ b/src/Today.hs @@ -1 +1 @@ -module Today where today = "Mon Nov 17 17:04:28 CET 2003" +module Today where today = "Tue Nov 18 17:18:44 CET 2003"