1
0
forked from GitHub/gf-core

added simple script for estimating the coverage on the PennTreebank

This commit is contained in:
kr.angelov
2013-03-28 09:15:38 +00:00
parent 17fc938c20
commit 74a16273b9

View File

@@ -0,0 +1,45 @@
import PGF
import Data.Maybe
main = do
ls <- fmap (filterExprs . lines) $ readFile "log4.txt"
let (c,m1,m2) = foldl counts (0,0,0) (map (\l -> fromMaybe (error l) (readExpr (f l))) ls)
print (length ls)
print ((c / (c+m1+m2))*100)
print (((c+m2) / (c+m1+m2))*100)
meta_dist [length [l | l <- ls, length [c | c <- l, c == '?'] == n] | n <- [0..27]]
meta_dist [length [l | l <- ls, length [x | x <- (zip l (tail l)), x == ('(','?')] == n] | n <- [0..27]]
cs <- fmap (map (length . words) . lines) $ readFile "wsj.eng"
print (average [fromIntegral c / fromIntegral (max n 1) | (c,l) <- zip cs ls, let n = length [c | c <- l, c == '?']])
print (average [fromIntegral c / fromIntegral (max n 1) | (c,l) <- zip cs ls, let n = length [x | x <- (zip l (tail l)), x == ('(','?')]])
average xs = sum xs / fromIntegral (length xs)
filterExprs [] = []
filterExprs (l:ls)
| null l = filterExprs ls
| elem (head l) "+#*" = drop 2 l : filterExprs ls
| otherwise = filterExprs ls
f [] = []
f ('[':cs) = let (xs,']':ys) = break (==']') cs
in f ('?' : ys)
f ('?':cs) = 'Q' : f cs
f (c:cs) = c : f cs
counts (c,m1,m2) e = c `seq` m1 `seq` m2 `seq`
case unApp e of
Just (f,es) | f == mkCId "Q" -> if null es
then foldl counts (c,m1,m2+1) es
else foldl counts (c,m1+1,m2) es
| otherwise -> foldl counts (c+1,m1,m2) es
Nothing -> case unStr e of
Just _ -> (c+1,m1,m2)
Nothing -> error ("counts ("++show e++")")
meta_dist cs = do
print cs
let cnt = fromIntegral (sum cs)
avg = fromIntegral (sum [n*c | (n,c) <- zip [0..] cs]) / cnt
dev = sqrt (sum [((fromIntegral n-avg) ^ 2)*fromIntegral c | (n,c) <- zip [0..] cs] / cnt)
print (avg,dev)