Using trie more.

This commit is contained in:
aarne
2003-11-18 15:30:08 +00:00
parent 8ecf475d5a
commit af4bf66002
9 changed files with 99 additions and 66 deletions

View File

@@ -12,6 +12,7 @@ import qualified Modules as M
import CF import CF
import CFIdent import CFIdent
import Morphology import Morphology
import Trie2
import List (nub,partition) import List (nub,partition)
import Monad import Monad
@@ -152,28 +153,26 @@ mkCFPredef :: Options -> [CFRuleGroup] -> ([CFRuleGroup],CFPredef)
mkCFPredef opts rules = (ruls, \s -> preds0 s ++ look s) where mkCFPredef opts rules = (ruls, \s -> preds0 s ++ look s) where
(ruls,preds) = if oElem lexerByNeed opts -- option -cflexer (ruls,preds) = if oElem lexerByNeed opts -- option -cflexer
then predefLexer rules then predefLexer rules
else (rules,NT) else (rules,emptyTrie)
preds0 s = preds0 s =
[(cat, metaCFFun) | TM _ _ <- [s], cat <- cats] ++ [(cat, metaCFFun) | TM _ _ <- [s], cat <- cats] ++
[(cat, varCFFun x) | TV x <- [s], cat <- cats] ++ [(cat, varCFFun x) | TV x <- [s], cat <- cats] ++
[(cfCatString, stringCFFun t) | TL t <- [s]] ++ [(cfCatString, stringCFFun t) | TL t <- [s]] ++
[(cfCatInt, intCFFun t) | TI t <- [s]] [(cfCatInt, intCFFun t) | TI t <- [s]]
cats = map fst rules cats = map fst rules
look s = errVal [] $ liftM concat $ look = concatMap snd . map (trieLookup preds) . wordsCFTok --- for TC tokens
mapM (flip justLookupTree preds . tS) $ wordsCFTok s --- for TC tokens
--- TODO: use trie instead of bintree; integrate with morphology --- TODO: integrate with morphology
predefLexer :: [CFRuleGroup] -> ([CFRuleGroup],BinTree (CFTok,[(CFCat, CFFun)])) --- predefLexer :: [CFRuleGroup] -> ([CFRuleGroup],BinTree (CFTok,[(CFCat, CFFun)]))
predefLexer groups = (reverse ruls, sorted2tree $ sortAssocs preds) where predefLexer groups = (reverse ruls, tcompile preds) where
(ruls,preds) = foldr mkOne ([],[]) groups (ruls,preds) = foldr mkOne ([],[]) groups
mkOne group@(cat,rules) (rs,ps) = (rule:rs,pre ++ ps) where mkOne group@(cat,rules) (rs,ps) = (rule:rs,pre ++ ps) where
(rule,pre) = case partition isLexical rules of (rule,pre) = case partition isLexical rules of
([],_) -> (group,[]) ([],_) -> (group,[])
(ls,rest) -> ((cat,rest), concatMap mkLexRule ls) --- useLexRule cat : rest (ls,rest) -> ((cat,rest), concatMap mkLexRule ls)
isLexical (f,(c,its)) = case its of isLexical (f,(c,its)) = case its of
[CFTerm (RegAlts ws)] -> True [CFTerm (RegAlts ws)] -> True
_ -> False _ -> False
-- useLexRule cat = (dummyCFFun,(cat,[CFNonterm (lexCFCat cat)])) -- not needed
mkLexRule r = case r of mkLexRule r = case r of
(fun,(cat,[CFTerm (RegAlts ws)])) -> [(tS w, (cat,fun)) | w <- ws] (fun,(cat,[CFTerm (RegAlts ws)])) -> [(w, [(cat,fun)]) | w <- ws]
_ -> [] _ -> []

View File

@@ -161,7 +161,7 @@ wordsInTerm trm = filter (not . null) $ case trm of
T _ cs -> concat [wo t | Cas _ t <- cs] T _ cs -> concat [wo t | Cas _ t <- cs]
C s t -> wo s ++ wo t C s t -> wo s ++ wo t
FV ts -> concatMap wo ts FV ts -> concatMap wo ts
K (KP ss vs) -> ss ++ concat [s ++ t | Var s t <- vs] K (KP ss vs) -> ss ++ concat [s | Var s _ <- vs]
P t _ -> wo t --- not needed ? P t _ -> wo t --- not needed ?
_ -> [] _ -> []
where wo = wordsInTerm where wo = wordsInTerm

View File

@@ -166,4 +166,7 @@ ccompute cnc = comp []
noVar v = case v of noVar v = case v of
LI _ -> False LI _ -> False
R rs -> all noVar [t | Ass _ t <- rs] R rs -> all noVar [t | Ass _ t <- rs]
_ -> True --- other cases? Con _ ts -> all noVar ts
FV ts -> all noVar ts
S x y -> noVar x && noVar y
_ -> True --- other cases that can be values to pattern match?

View File

@@ -88,7 +88,7 @@ stateGrammarST = grammar
stateCF = cf stateCF = cf
stateMorpho = morpho stateMorpho = morpho
stateOptions = loptions stateOptions = loptions
stateGrammarWords = map fst . tree2list . stateMorpho stateGrammarWords = allMorphoWords . stateMorpho
cncModuleIdST = stateGrammarST cncModuleIdST = stateGrammarST

View File

@@ -1,19 +1,18 @@
module Glue where module Glue where
import Trie import Trie2
import Operations import Operations
import List import List
-------- AR 8/11/2003, using Markus Forsberg's implementation of Huet's unglue -------- AR 8/11/2003, using Markus Forsberg's implementation of Huet's unglue
tcompileSimple :: [String] -> Trie decomposeSimple :: Trie Char a -> [Char] -> Err [[Char]]
tcompileSimple ss = tcompile [(s,[(atWP,s)]) | s <- ss]
decomposeSimple :: Trie -> String -> Err [String]
decomposeSimple t s = do decomposeSimple t s = do
let ss = map (decompose t) $ words s let ss = map (decompose t) $ words s
if any null ss if any null ss
then Bad "unknown word in input" then Bad "unknown word in input"
else return $ concat [intersperse "&+" ws | ws <- ss] else return $ concat [intersperse "&+" ws | ws <- ss]
exTrie = tcompileSimple $ words "ett två tre tjugo trettio hundra tusen" exTrie = tcompile (zip ws ws) where
ws = words "ett två tre tjugo trettio hundra tusen"

View File

@@ -8,20 +8,25 @@ module Trie2 (
tcompile, tcompile,
collapse, collapse,
Trie, Trie,
trieLookup trieLookup,
decompose,
--- Attr, atW, atP, atWP,
emptyTrie
) where ) where
import Map import Map
import List
newtype TrieT a b = TrieT ([(a,TrieT a b)],[b]) newtype TrieT a b = TrieT ([(a,TrieT a b)],[b])
newtype Trie a b = Trie (Map a (Trie a b), [b]) newtype Trie a b = Trie (Map a (Trie a b), [b])
emptyTrie = TrieT ([],[]) emptyTrieT = TrieT ([],[])
emptyTrie = Trie (empty,[])
optimize :: Ord a => TrieT a b -> Trie a b optimize :: (Ord a,Eq b) => TrieT a b -> Trie a b
optimize (TrieT (xs,res)) = Trie ([(c,optimize t) | (c,t) <- xs] |->+ empty, optimize (TrieT (xs,res)) = Trie ([(c,optimize t) | (c,t) <- xs] |->+ empty,
res) nub res) --- nub by AR
collapse :: Ord a => Trie a b -> [([a],[b])] collapse :: Ord a => Trie a b -> [([a],[b])]
collapse trie = collapse' trie [] collapse trie = collapse' trie []
@@ -31,8 +36,8 @@ collapse trie = collapse' trie []
collapse' (Trie (map,[])) s collapse' (Trie (map,[])) s
= concat [ collapse' trie (c:s) | (c,trie) <- flatten map] = concat [ collapse' trie (c:s) | (c,trie) <- flatten map]
tcompile :: Ord a => [([a],[b])] -> Trie a b tcompile :: (Ord a,Eq b) => [([a],[b])] -> Trie a b
tcompile xs = optimize $ build xs emptyTrie tcompile xs = optimize $ build xs emptyTrieT
build :: Ord a => [([a],[b])] -> TrieT a b -> TrieT a b build :: Ord a => [([a],[b])] -> TrieT a b -> TrieT a b
build [] trie = trie build [] trie = trie
@@ -41,7 +46,7 @@ build (x:xs) trie = build xs (insert x trie)
insert ([],ys) (TrieT (xs,res)) = TrieT (xs,ys ++ res) insert ([],ys) (TrieT (xs,res)) = TrieT (xs,ys ++ res)
insert ((s:ss),ys) (TrieT (xs,res)) insert ((s:ss),ys) (TrieT (xs,res))
= case (span (\(s',_) -> s' /= s) xs) of = case (span (\(s',_) -> s' /= s) xs) of
(xs,[]) -> TrieT (((s,(insert (ss,ys) emptyTrie)):xs),res) (xs,[]) -> TrieT (((s,(insert (ss,ys) emptyTrieT)):xs),res)
(xs,(y,trie):zs) -> TrieT (xs ++ ((y,insert (ss,ys) trie):zs),res) (xs,(y,trie):zs) -> TrieT (xs ++ ((y,insert (ss,ys) trie):zs),res)
trieLookup :: Ord a => Trie a b -> [a] -> ([a],[b]) trieLookup :: Ord a => Trie a b -> [a] -> ([a],[b])
@@ -53,3 +58,53 @@ apply (Trie (map,_)) (s:ss) inp
= case map ! s of = case map ! s of
Just trie -> apply trie ss inp Just trie -> apply trie ss inp
Nothing -> (inp,[]) Nothing -> (inp,[])
-----------------------------
-- from Trie for strings; simplified for GF by making binding always possible (AR)
-- Split a sequence into consecutive pieces, each of which is accepted by the
-- trie (i.e. ends at a node with a non-empty result list). The search itself
-- is carried out by backtrack/react; the result is [] when they find no
-- complete segmentation.
decompose :: Ord a => Trie a b -> [a] -> [[a]]
decompose trie sentence = backtrack start trie
  where
    -- one pending state: the whole sentence unread, no pieces emitted yet
    start = [(sentence, [])]
-- One step of the backtracking segmentation search used by decompose.
--
-- Arguments, in order:
--   input  : the symbols not yet consumed
--   output : the pieces completed so far; newest first, each piece stored
--            reversed (the final answer applies reverse . map reverse)
--   back   : stack of saved (input, output) states to resume from on failure
--   occ    : the piece currently being read, stored reversed (see l:occ)
--   Trie (arcs,res) : the current trie node (outgoing arcs, results)
--   init   : the root trie, handed to backtrack so a new piece starts there
--
-- Returns the first complete segmentation found, or [] when every saved
-- state has been exhausted (that [] comes from backtrack).
react :: Ord a => [a] -> [[a]] -> [([a],[[a]])] ->
[a] -> Trie a b -> Trie a b -> [[a]]
-- String -> [String] -> [(String,[String])] -> String -> Trie -> Trie -> [String]
react input output back occ (Trie (arcs,res)) init =
case res of -- Accept = non-empty res.
-- not an accepting node: keep reading with the backtrack stack unchanged
[] -> continue back
-- accepting node: occ is a complete piece, so close it off
_ -> let pushout = (occ:output)
in case input of
-- nothing left to read: success, undo the reversed accumulation
[] -> reverse $ map reverse pushout
-- more input: save "piece ends here" as a fallback state, then first try
-- to extend the current piece further (so longer pieces are tried first)
_ -> let pushback = ((input,pushout):back)
in continue pushback
-- continue: try to consume one more symbol of the current piece, falling
-- back to the saved states in cont when that is impossible
where continue cont = case input of
-- input ended in the middle of a piece: resume a saved state
[] -> backtrack cont init
-- NOTE(review): (!) is presumably the lookup of the project's Map module,
-- returning Maybe as the Just/Nothing alternatives suggest -- confirm
(l:rest) -> case arcs ! l of
Just trie ->
react rest output cont (l:occ) trie init
-- no arc for this symbol: resume a saved state
Nothing -> backtrack cont init
-- Resume the segmentation search from the most recently saved state.
-- Each saved state pairs the unread input with the pieces emitted so far;
-- reading restarts at the root of the trie with an empty current piece.
-- With no saved states left the search has failed and the result is [].
backtrack :: Ord a => [([a],[[a]])] -> Trie a b -> [[a]]
backtrack pending root = case pending of
  [] -> []
  (unread, emitted) : older -> react unread emitted older [] root root
{- so this is not needed from the original
type Attr = Int
atW, atP, atWP :: Attr
(atW,atP,atWP) = (0,1,2)
decompose :: Ord a => Trie a (Int,b) -> [a] -> [[a]]
decompose trie sentence = legal trie $ backtrack [(sentence,[])] trie
-- The function legal checks if the decomposition is in fact a possible one.
legal :: Ord a => Trie a (Int,b) -> [[a]] -> [[a]]
legal _ [] = []
legal trie input = if (test (map ((map fst).snd.(trieLookup trie)) input)) then input else []
where
test [] = False
test [xs] = elem atW xs || elem atWP xs
test (xs:xss) = (elem atP xs || elem atWP xs) && test xss
-}

View File

@@ -147,6 +147,7 @@ customGrammarPrinter =
,(strCI "cf", prCF . stateCF) ,(strCI "cf", prCF . stateCF)
,(strCI "lbnf", prLBNF . stateCF) ,(strCI "lbnf", prLBNF . stateCF)
,(strCI "morpho", prMorpho . stateMorpho) ,(strCI "morpho", prMorpho . stateMorpho)
,(strCI "fullform",prFullForm . stateMorpho)
,(strCI "opts", prOpts . stateOptions) ,(strCI "opts", prOpts . stateOptions)
,(strCI "words", unwords . stateGrammarWords) ,(strCI "words", unwords . stateGrammarWords)
{- ---- {- ----

View File

@@ -15,40 +15,33 @@ import Glue
import Char import Char
import List (sortBy, intersperse) import List (sortBy, intersperse)
import Monad (liftM) import Monad (liftM)
import Trie2
-- construct a morphological analyser from a GF grammar. AR 11/4/2001 -- construct a morphological analyser from a GF grammar. AR 11/4/2001
-- we have found the binary search tree sorted by word forms more efficient -- we first found the binary search tree sorted by word forms more efficient
-- than a trie, at least for grammars with 7000 word forms -- than a trie, at least for grammars with 7000 word forms
-- (18/11/2003) but this may change since we have to use a trie
-- for decompositions and also want to use it in the parser
type Morpho = BinTree (String,[String]) type Morpho = Trie Char String
emptyMorpho = NT emptyMorpho = emptyTrie
-- with literals
appMorpho :: Morpho -> String -> (String,[String]) appMorpho :: Morpho -> String -> (String,[String])
appMorpho m s = (s, ps ++ ms) where appMorpho = appMorphoOnly
ms = case lookupTree id s m of ---- add lookup for literals
Ok vs -> vs
_ -> []
ps = [] ---- case lookupLiteral s of
---- Ok (t,_) -> [tagPrt t]
---- _ -> []
-- without literals -- without literals
appMorphoOnly :: Morpho -> String -> (String,[String]) appMorphoOnly :: Morpho -> String -> (String,[String])
appMorphoOnly m s = (s, ms) where appMorphoOnly m s = trieLookup m s
ms = case lookupTree id s m of
Ok vs -> vs
_ -> []
-- recognize word, excluding literals -- recognize word, excluding literals
isKnownWord :: Morpho -> String -> Bool isKnownWord :: Morpho -> String -> Bool
isKnownWord mo = not . null . snd . appMorphoOnly mo isKnownWord mo = not . null . snd . appMorphoOnly mo
mkMorpho :: CanonGrammar -> Ident -> Morpho mkMorpho :: CanonGrammar -> Ident -> Morpho
---- mkMorpho gr = emptyMorpho ---- mkMorpho gr a = tcompile $ concatMap mkOne $ allItems where
mkMorpho gr a = mkMorphoTree $ concat $ map mkOne $ allItems where
mkOne (Left (fun,c)) = map (prOne fun c) $ allLins fun mkOne (Left (fun,c)) = map (prOne fun c) $ allLins fun
mkOne (Right (fun,_)) = map (prSyn fun) $ allSyns fun mkOne (Right (fun,_)) = map (prSyn fun) $ allSyns fun
@@ -58,14 +51,14 @@ mkMorpho gr a = mkMorphoTree $ concat $ map mkOne $ allItems where
ts <- allLinsOfFun gr (CIQ a f) ts <- allLinsOfFun gr (CIQ a f)
ss <- mapM (mapPairsM (mapPairsM (return . wordsInTerm))) ts ss <- mapM (mapPairsM (mapPairsM (return . wordsInTerm))) ts
return [(p,s) | (p,fs) <- concat $ map snd $ concat ss, s <- fs] return [(p,s) | (p,fs) <- concat $ map snd $ concat ss, s <- fs]
prOne (_,f) c (ps,s) = (s, prt f +++ tagPrt c ++ concat (map prt_ ps)) prOne (_,f) c (ps,s) = (s, [prt f +++ tagPrt c +++ unwords (map prt_ ps)])
-- gather syncategorematic words -- gather syncategorematic words
allSyns fun@(m,f) = errVal [] $ do allSyns fun@(m,f) = errVal [] $ do
tss <- allLinsOfFun gr (CIQ a f) tss <- allLinsOfFun gr (CIQ a f)
let ss = [s | ts <- tss, (_,fs) <- ts, (_,s) <- fs] let ss = [s | ts <- tss, (_,fs) <- ts, (_,s) <- fs]
return $ concat $ map wordsInTerm ss return $ concat $ map wordsInTerm ss
prSyn f s = (s, "+<syncategorematic>" ++ tagPrt f) prSyn f s = (s, ["+<syncategorematic>" ++ tagPrt f])
-- all words, Left from lexical rules and Right syncategorematic -- all words, Left from lexical rules and Right syncategorematic
allItems = [lexRole t (f,c) | (f,c,t) <- allFuns] where allItems = [lexRole t (f,c) | (f,c,t) <- allFuns] where
@@ -77,7 +70,7 @@ mkMorpho gr a = mkMorphoTree $ concat $ map mkOne $ allItems where
-- printing full-form lexicon and results -- printing full-form lexicon and results
prMorpho :: Morpho -> String prMorpho :: Morpho -> String
prMorpho = unlines . map prMorphoAnalysis . tree2list prMorpho = unlines . map prMorphoAnalysis . collapse
prMorphoAnalysis :: (String,[String]) -> String prMorphoAnalysis :: (String,[String]) -> String
prMorphoAnalysis (w,fs) = unlines (w:fs) prMorphoAnalysis (w,fs) = unlines (w:fs)
@@ -92,7 +85,7 @@ tagPrt (m,c) = "+" ++ prt c --- module name
-- print all words recognized -- print all words recognized
allMorphoWords :: Morpho -> [String] allMorphoWords :: Morpho -> [String]
allMorphoWords = map fst . tree2list allMorphoWords = map fst . collapse
-- analyse running text and show results either in short form or on separate lines -- analyse running text and show results either in short form or on separate lines
morphoTextShort mo = unwords . map (prMorphoAnalysisShort . appMorpho mo) . words morphoTextShort mo = unwords . map (prMorphoAnalysisShort . appMorpho mo) . words
@@ -100,7 +93,7 @@ morphoText mo = unlines . map (('\n':) . prMorphoAnalysis . appMorpho mo) . word
-- format used in the Italian Verb Engine -- format used in the Italian Verb Engine
prFullForm :: Morpho -> String prFullForm :: Morpho -> String
prFullForm = unlines . map prOne . tree2list where prFullForm = unlines . map prOne . collapse where
prOne (s,ps) = s ++ " : " ++ unwords (intersperse "/" ps) prOne (s,ps) = s ++ " : " ++ unwords (intersperse "/" ps)
-- using Huet's unglueing method to find word boundaries -- using Huet's unglueing method to find word boundaries
@@ -109,21 +102,4 @@ prFullForm = unlines . map prOne . tree2list where
---- Moreover, we should specify the cases in which this happens - not all words ---- Moreover, we should specify the cases in which this happens - not all words
decomposeWords :: Morpho -> String -> [String] decomposeWords :: Morpho -> String -> [String]
decomposeWords mo s = errVal (words s) $ decomposeWords mo s = errVal (words s) $ decomposeSimple mo s
decomposeSimple (tcompileSimple (map fst $ tree2list mo)) s
-- auxiliaries
mkMorphoTree :: (Ord a, Eq b) => [(a,b)] -> BinTree (a,[b])
mkMorphoTree = sorted2tree . sortAssocs
sortAssocs :: (Ord a, Eq b) => [(a,b)] -> [(a,[b])]
sortAssocs = arrange . sortBy (\ (x,_) (y,_) -> compare x y) where
arrange ((x,v):xvs) = arr x [v] xvs
arrange [] = []
arr y vs xs = case xs of
(x,v):xvs -> if x==y then arr y vvs xvs else (y,vs) : arr x [v] xvs
where vvs = if elem v vs then vs else (v:vs)
_ -> [(y,vs)]

View File

@@ -1 +1 @@
module Today where today = "Mon Nov 17 17:04:28 CET 2003" module Today where today = "Tue Nov 18 17:18:44 CET 2003"