1
0
forked from GitHub/gf-core

the content of ParseEngAbs3.probs is now merged with ParseEngAbs.probs. The later is now retrained. Once the grammar is compiled with the .probs file now it doesn't need anything more to do robust parsing. The robustness itself is controlled by the flags 'heuristic_search_factor', 'meta_prob' and 'meta_token_prob' in ParseEngAbs.gf

This commit is contained in:
kr.angelov
2013-11-06 10:21:46 +00:00
parent 84ef5fa5fa
commit 2483dc7728
27 changed files with 65052 additions and 65106 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -1,31 +0,0 @@
Phr * 1e-5
Phr A 4.276438451840496e-3
Phr AP 2.2906320842384386e-3
Phr AdA 2.9594729770522466e-6
Phr AdV 3.5217728426921733e-3
Phr Adv 4.47205961562365e-2
Phr CN 5.918945954104493e-6
Phr Cl 0.23274183333431983
Phr ClSlash 2.0716310839365724e-5
Phr Conj 1.2255177597973352e-2
Phr Det 1.8052785160018703e-4
Phr IAdv 2.9594729770522466e-6
Phr IP 2.9594729770522466e-6
Phr N 8.452254822461217e-3
Phr NP 7.473557108950038e-2
Phr Num 1.1837891908208986e-5
Phr Ord 1.7460890564608255e-4
Phr Predet 2.6635256793470218e-5
Phr Prep 7.56145345636849e-3
Phr Pron 2.3675783816417973e-5
Phr QS 4.143262167873145e-5
Phr Quant 4.03080219474516e-3
Phr RP 6.392461630432853e-4
Phr RS 1.021018177083025e-3
Phr S 5.920425690593019e-2
Phr Subj 6.392461630432853e-4
Phr V 1.071033270395208e-2
Phr V2 1.2784923260865705e-3
Phr VPS 4.751729811955087e-2
Phr VS 1.4797364885261233e-5
Phr _ 1.1965149246222233e-9

View File

@@ -3,32 +3,31 @@ import qualified Data.Map as Map
import Data.Maybe
import Data.List
grammar_name = "ParseEngAbs.pgf"
treebank_name = "log4.txt"
chunk_cats = map mkCId
["A", "AP", "AdA", "AdV", "Adv", "CN", "Cl", "ClSlash", "Conj", "Det",
"IAdv", "IP", "N", "NP", "Num", "Ord", "Predet", "Prep", "Pron", "QS",
"Quant", "RP", "RS", "S", "Subj", "V", "V2", "VPS", "VS"]
main = do
pgf <- readPGF "ParseEngAbs.pgf"
ls <- fmap (filterExprs . zip [1..] . lines) $ readFile "log4.txt"
pgf <- readPGF grammar_name
ls <- fmap (filterExprs . zip [1..] . lines) $ readFile treebank_name
putStrLn ""
putStrLn ("trees: "++show (length ls))
let stats = foldl' (collectStats pgf)
let (_,cat,_) = unType (startCat pgf)
stats = foldl' (collectStats pgf)
(initStats pgf)
[(n,fromMaybe (error l) (readExpr (toQ l)),Just (mkCId "Phr"),Nothing) | (n,l) <- ls]
[(n,fromMaybe (error l) (readExpr (toQ l)),Just cat) | (n,l) <- ls]
putStrLn ("coverage: "++show (coverage stats))
putStrLn ("Writing ParseEngAbs.probs...")
writeFile "ParseEngAbs.probs" (unlines [show f ++ "\t" ++ show p | (f,p) <- uprobs pgf stats])
putStrLn ("Writing ParseEngAbs2.probs...")
writeFile "ParseEngAbs2.probs" (unlines [show cat1 ++ "\t" ++ show cat2 ++ "\t" ++ show p | (cat1,cat2,p) <- mprobs pgf stats])
putStrLn ("Writing global.probs...")
writeFile "global.probs" (unlines [show f ++ "\t" ++ show p | (f,p) <- gprobs pgf stats])
putStrLn ("Writing categories.probs...")
writeFile "categories.probs" (unlines [show f ++ "\t" ++ show p | (f,p) <- cprobs pgf stats])
writeFile "ParseEngAbs.probs" (unlines [show f ++ "\t" ++ show p | (f,p) <- probs pgf stats])
where
toQ [] = []
toQ ('[':cs) = let (xs,']':ys) = break (==']') cs
in toQ ('?' : ys)
toQ ('?':cs) = 'Q' : toQ cs
toQ (c:cs) = c : toQ cs
@@ -42,31 +41,26 @@ initStats pgf =
(Map.fromListWith (+)
([(f,1) | f <- functions pgf] ++
[(cat pgf f,1) | f <- functions pgf])
,Map.empty
,0
)
collectStats pgf (ustats,bstats,count) (n,e,mb_cat1,mb_cat2) =
collectStats pgf (ustats,count) (n,e,mb_cat1) =
case unApp e of
Just (f,args) -> let fcat2 = cat2 pgf f n e
fcat = fromMaybe (cat2 pgf f n e) mb_cat1
fcat = fromMaybe fcat2 mb_cat1
cf = fromMaybe 0 (Map.lookup f ustats)
cc = fromMaybe 0 (Map.lookup fcat ustats)
in if isJust mb_cat1 && f /= mkCId "Q" && fcat /= fcat2
then error (show n ++ ": " ++ showExpr [] e)
else
cf `seq` cc `seq` bstats `seq` count `seq`
cf `seq` cc `seq` count `seq`
foldl' (collectStats pgf)
(Map.insert f (cf+1) (Map.insert fcat (cc+1) ustats)
,(if null args
then Map.insertWith (+) (fcat,wildCId) 1
else id)
(maybe bstats (\cat2 -> Map.insertWith (+) (cat2,fcat) 1 bstats) mb_cat2)
,count+1
)
(zipWith3 (\e mb_cat1 mb_cat2 -> (n,e,mb_cat1,mb_cat2)) args (argCats f) (repeat (Just fcat)))
(zipWith (\e mb_cat1 -> (n,e,mb_cat1)) args (argCats f))
Nothing -> case unStr e of
Just _ -> (ustats,bstats,count+1)
Just _ -> (ustats,count+1)
Nothing -> error ("collectStats ("++showExpr [] e++")")
where
argCats f =
@@ -75,44 +69,25 @@ collectStats pgf (ustats,bstats,count) (n,e,mb_cat1,mb_cat2) =
in map tyCat arg_tys
Nothing -> repeat Nothing
coverage (ustats,bstats,count) =
let c = fromMaybe 0 (Map.lookup (mkCId "Q") ustats)
in (fromIntegral (count - c) / fromIntegral count) * 100
coverage (stats,gcount) =
let c = fromMaybe 0 (Map.lookup (mkCId "Q") stats)
in (fromIntegral (gcount - c) / fromIntegral gcount) * 100
uprobs pgf (ustats,bstats,count) =
[toProb f (cat pgf f) | f <- functions pgf]
probs pgf (stats,gcount) =
[toFProb f (cat pgf f) | f <- functions pgf] ++
[toCProb c | c <- chunk_cats]
where
toProb f cat =
let count = fromMaybe 0 (Map.lookup f ustats)
cat_mass = fromMaybe 0 (Map.lookup cat ustats)
toFProb f cat =
let count = fromMaybe 0 (Map.lookup f stats)
cat_mass = fromMaybe 0 (Map.lookup cat stats)
in (f, fromIntegral count / fromIntegral cat_mass :: Double)
mprobs pgf (ustats,bstats,count) =
concat [toProb cat | cat <- categories pgf]
where
toProb cat =
let mass = sum [count | ((cat1,cat2),count) <- Map.toList bstats, cat1==cat]
cat_count = fromMaybe 0 (Map.lookup cat ustats)
fun_count = sum [fromMaybe 0 (Map.lookup f ustats) | f <- functionsByCat pgf cat]
in (cat,mkCId "*",if cat_count == 0 then 0 else fromIntegral (cat_count - fun_count) / fromIntegral cat_count) :
[(cat1,cat2,fromIntegral count / fromIntegral mass)
| ((cat1,cat2),count) <- Map.toList bstats, cat1==cat]
toCProb c =
let ccount = fromMaybe 0 (Map.lookup c stats)
in (c, fromIntegral ccount / fromIntegral chunk_mass :: Double)
gprobs pgf (ustats,bstats,count) =
sortBy (\x y -> compare (snd y) (snd x)) [toProb f | f <- functions pgf]
where
toProb f =
let fcount = fromMaybe 0 (Map.lookup f ustats)
in (f, fromIntegral fcount / fromIntegral count :: Double)
cprobs pgf (ustats,bstats,count) =
sortBy (\x y -> compare (snd y) (snd x)) [toProb c | c <- categories pgf]
where
mass = sum [fromMaybe 0 (Map.lookup c ustats) | c <- categories pgf]
toProb c =
let fcount = fromMaybe 0 (Map.lookup c ustats)
in (c, fromIntegral fcount / fromIntegral mass :: Double)
chunk_mass =
sum [fromMaybe 0 (Map.lookup c stats) | c <- chunk_cats]
cat pgf f =
case fmap unType (functionType pgf f) of