forked from GitHub/gf-core
the content of ParseEngAbs3.probs is now merged with ParseEngAbs.probs. The later is now retrained. Once the grammar is compiled with the .probs file now it doesn't need anything more to do robust parsing. The robustness itself is controlled by the flags 'heuristic_search_factor', 'meta_prob' and 'meta_token_prob' in ParseEngAbs.gf
This commit is contained in:
File diff suppressed because it is too large
Load Diff
@@ -1,31 +0,0 @@
|
||||
Phr * 1e-5
|
||||
Phr A 4.276438451840496e-3
|
||||
Phr AP 2.2906320842384386e-3
|
||||
Phr AdA 2.9594729770522466e-6
|
||||
Phr AdV 3.5217728426921733e-3
|
||||
Phr Adv 4.47205961562365e-2
|
||||
Phr CN 5.918945954104493e-6
|
||||
Phr Cl 0.23274183333431983
|
||||
Phr ClSlash 2.0716310839365724e-5
|
||||
Phr Conj 1.2255177597973352e-2
|
||||
Phr Det 1.8052785160018703e-4
|
||||
Phr IAdv 2.9594729770522466e-6
|
||||
Phr IP 2.9594729770522466e-6
|
||||
Phr N 8.452254822461217e-3
|
||||
Phr NP 7.473557108950038e-2
|
||||
Phr Num 1.1837891908208986e-5
|
||||
Phr Ord 1.7460890564608255e-4
|
||||
Phr Predet 2.6635256793470218e-5
|
||||
Phr Prep 7.56145345636849e-3
|
||||
Phr Pron 2.3675783816417973e-5
|
||||
Phr QS 4.143262167873145e-5
|
||||
Phr Quant 4.03080219474516e-3
|
||||
Phr RP 6.392461630432853e-4
|
||||
Phr RS 1.021018177083025e-3
|
||||
Phr S 5.920425690593019e-2
|
||||
Phr Subj 6.392461630432853e-4
|
||||
Phr V 1.071033270395208e-2
|
||||
Phr V2 1.2784923260865705e-3
|
||||
Phr VPS 4.751729811955087e-2
|
||||
Phr VS 1.4797364885261233e-5
|
||||
Phr _ 1.1965149246222233e-9
|
||||
@@ -3,32 +3,31 @@ import qualified Data.Map as Map
|
||||
import Data.Maybe
|
||||
import Data.List
|
||||
|
||||
grammar_name = "ParseEngAbs.pgf"
|
||||
|
||||
treebank_name = "log4.txt"
|
||||
|
||||
chunk_cats = map mkCId
|
||||
["A", "AP", "AdA", "AdV", "Adv", "CN", "Cl", "ClSlash", "Conj", "Det",
|
||||
"IAdv", "IP", "N", "NP", "Num", "Ord", "Predet", "Prep", "Pron", "QS",
|
||||
"Quant", "RP", "RS", "S", "Subj", "V", "V2", "VPS", "VS"]
|
||||
|
||||
main = do
|
||||
pgf <- readPGF "ParseEngAbs.pgf"
|
||||
ls <- fmap (filterExprs . zip [1..] . lines) $ readFile "log4.txt"
|
||||
pgf <- readPGF grammar_name
|
||||
ls <- fmap (filterExprs . zip [1..] . lines) $ readFile treebank_name
|
||||
putStrLn ""
|
||||
putStrLn ("trees: "++show (length ls))
|
||||
let stats = foldl' (collectStats pgf)
|
||||
let (_,cat,_) = unType (startCat pgf)
|
||||
stats = foldl' (collectStats pgf)
|
||||
(initStats pgf)
|
||||
[(n,fromMaybe (error l) (readExpr (toQ l)),Just (mkCId "Phr"),Nothing) | (n,l) <- ls]
|
||||
[(n,fromMaybe (error l) (readExpr (toQ l)),Just cat) | (n,l) <- ls]
|
||||
|
||||
putStrLn ("coverage: "++show (coverage stats))
|
||||
|
||||
putStrLn ("Writing ParseEngAbs.probs...")
|
||||
writeFile "ParseEngAbs.probs" (unlines [show f ++ "\t" ++ show p | (f,p) <- uprobs pgf stats])
|
||||
|
||||
putStrLn ("Writing ParseEngAbs2.probs...")
|
||||
writeFile "ParseEngAbs2.probs" (unlines [show cat1 ++ "\t" ++ show cat2 ++ "\t" ++ show p | (cat1,cat2,p) <- mprobs pgf stats])
|
||||
|
||||
putStrLn ("Writing global.probs...")
|
||||
writeFile "global.probs" (unlines [show f ++ "\t" ++ show p | (f,p) <- gprobs pgf stats])
|
||||
|
||||
putStrLn ("Writing categories.probs...")
|
||||
writeFile "categories.probs" (unlines [show f ++ "\t" ++ show p | (f,p) <- cprobs pgf stats])
|
||||
writeFile "ParseEngAbs.probs" (unlines [show f ++ "\t" ++ show p | (f,p) <- probs pgf stats])
|
||||
where
|
||||
toQ [] = []
|
||||
toQ ('[':cs) = let (xs,']':ys) = break (==']') cs
|
||||
in toQ ('?' : ys)
|
||||
toQ ('?':cs) = 'Q' : toQ cs
|
||||
toQ (c:cs) = c : toQ cs
|
||||
|
||||
@@ -42,31 +41,26 @@ initStats pgf =
|
||||
(Map.fromListWith (+)
|
||||
([(f,1) | f <- functions pgf] ++
|
||||
[(cat pgf f,1) | f <- functions pgf])
|
||||
,Map.empty
|
||||
,0
|
||||
)
|
||||
|
||||
collectStats pgf (ustats,bstats,count) (n,e,mb_cat1,mb_cat2) =
|
||||
collectStats pgf (ustats,count) (n,e,mb_cat1) =
|
||||
case unApp e of
|
||||
Just (f,args) -> let fcat2 = cat2 pgf f n e
|
||||
fcat = fromMaybe (cat2 pgf f n e) mb_cat1
|
||||
fcat = fromMaybe fcat2 mb_cat1
|
||||
cf = fromMaybe 0 (Map.lookup f ustats)
|
||||
cc = fromMaybe 0 (Map.lookup fcat ustats)
|
||||
in if isJust mb_cat1 && f /= mkCId "Q" && fcat /= fcat2
|
||||
then error (show n ++ ": " ++ showExpr [] e)
|
||||
else
|
||||
cf `seq` cc `seq` bstats `seq` count `seq`
|
||||
cf `seq` cc `seq` count `seq`
|
||||
foldl' (collectStats pgf)
|
||||
(Map.insert f (cf+1) (Map.insert fcat (cc+1) ustats)
|
||||
,(if null args
|
||||
then Map.insertWith (+) (fcat,wildCId) 1
|
||||
else id)
|
||||
(maybe bstats (\cat2 -> Map.insertWith (+) (cat2,fcat) 1 bstats) mb_cat2)
|
||||
,count+1
|
||||
)
|
||||
(zipWith3 (\e mb_cat1 mb_cat2 -> (n,e,mb_cat1,mb_cat2)) args (argCats f) (repeat (Just fcat)))
|
||||
(zipWith (\e mb_cat1 -> (n,e,mb_cat1)) args (argCats f))
|
||||
Nothing -> case unStr e of
|
||||
Just _ -> (ustats,bstats,count+1)
|
||||
Just _ -> (ustats,count+1)
|
||||
Nothing -> error ("collectStats ("++showExpr [] e++")")
|
||||
where
|
||||
argCats f =
|
||||
@@ -75,44 +69,25 @@ collectStats pgf (ustats,bstats,count) (n,e,mb_cat1,mb_cat2) =
|
||||
in map tyCat arg_tys
|
||||
Nothing -> repeat Nothing
|
||||
|
||||
coverage (ustats,bstats,count) =
|
||||
let c = fromMaybe 0 (Map.lookup (mkCId "Q") ustats)
|
||||
in (fromIntegral (count - c) / fromIntegral count) * 100
|
||||
coverage (stats,gcount) =
|
||||
let c = fromMaybe 0 (Map.lookup (mkCId "Q") stats)
|
||||
in (fromIntegral (gcount - c) / fromIntegral gcount) * 100
|
||||
|
||||
uprobs pgf (ustats,bstats,count) =
|
||||
[toProb f (cat pgf f) | f <- functions pgf]
|
||||
probs pgf (stats,gcount) =
|
||||
[toFProb f (cat pgf f) | f <- functions pgf] ++
|
||||
[toCProb c | c <- chunk_cats]
|
||||
where
|
||||
toProb f cat =
|
||||
let count = fromMaybe 0 (Map.lookup f ustats)
|
||||
cat_mass = fromMaybe 0 (Map.lookup cat ustats)
|
||||
toFProb f cat =
|
||||
let count = fromMaybe 0 (Map.lookup f stats)
|
||||
cat_mass = fromMaybe 0 (Map.lookup cat stats)
|
||||
in (f, fromIntegral count / fromIntegral cat_mass :: Double)
|
||||
|
||||
mprobs pgf (ustats,bstats,count) =
|
||||
concat [toProb cat | cat <- categories pgf]
|
||||
where
|
||||
toProb cat =
|
||||
let mass = sum [count | ((cat1,cat2),count) <- Map.toList bstats, cat1==cat]
|
||||
cat_count = fromMaybe 0 (Map.lookup cat ustats)
|
||||
fun_count = sum [fromMaybe 0 (Map.lookup f ustats) | f <- functionsByCat pgf cat]
|
||||
in (cat,mkCId "*",if cat_count == 0 then 0 else fromIntegral (cat_count - fun_count) / fromIntegral cat_count) :
|
||||
[(cat1,cat2,fromIntegral count / fromIntegral mass)
|
||||
| ((cat1,cat2),count) <- Map.toList bstats, cat1==cat]
|
||||
toCProb c =
|
||||
let ccount = fromMaybe 0 (Map.lookup c stats)
|
||||
in (c, fromIntegral ccount / fromIntegral chunk_mass :: Double)
|
||||
|
||||
gprobs pgf (ustats,bstats,count) =
|
||||
sortBy (\x y -> compare (snd y) (snd x)) [toProb f | f <- functions pgf]
|
||||
where
|
||||
toProb f =
|
||||
let fcount = fromMaybe 0 (Map.lookup f ustats)
|
||||
in (f, fromIntegral fcount / fromIntegral count :: Double)
|
||||
|
||||
cprobs pgf (ustats,bstats,count) =
|
||||
sortBy (\x y -> compare (snd y) (snd x)) [toProb c | c <- categories pgf]
|
||||
where
|
||||
mass = sum [fromMaybe 0 (Map.lookup c ustats) | c <- categories pgf]
|
||||
|
||||
toProb c =
|
||||
let fcount = fromMaybe 0 (Map.lookup c ustats)
|
||||
in (c, fromIntegral fcount / fromIntegral mass :: Double)
|
||||
chunk_mass =
|
||||
sum [fromMaybe 0 (Map.lookup c stats) | c <- chunk_cats]
|
||||
|
||||
cat pgf f =
|
||||
case fmap unType (functionType pgf f) of
|
||||
|
||||
Reference in New Issue
Block a user