added classify.hs for building dictionaries from WordNet

This commit is contained in:
krasimir
2016-10-29 09:29:24 +00:00
parent a533fc70d3
commit 2e66e6bfac

View File

@@ -0,0 +1,275 @@
import Text.EditDistance
import Data.List
import qualified Data.Set as Set
import qualified Data.Map.Strict as Map
import Data.Maybe(maybe,mapMaybe,fromMaybe)
import Data.Char
import Data.IORef
import Debug.Trace
import System.Random
import System.Random.Shuffle
import System.Environment
import Translit
import Control.Monad(forM_)
train = False
useTenFold = False
main = do
[src,dst] <- getArgs
src <- fmap (toEntries src) $ readFile ("data/wn-data-"++src++".tab")
dst <- fmap (toEntries dst) $ readFile ("data/wn-data-"++dst++".tab")
gen <- newStdGen
let wps = addCounts (join src dst)
rps <- if train
then fmap (Set.fromList . map (toReferencePair . tsv) . lines) $ readFile "data/fiwn-transls.tsv"
else return Set.empty
e2f <- fmap (Map.fromList . map (toAlignmentPair . words) . lines) $ readFile "data/europarl-v7.fi-en.lemma.intersect.lex.e2f"
let features = addFeatures rps wps
res <- newIORef Map.empty
(if useTenFold
then forM_ (tenfold gen features)
else (>>=) (return (features,features))) $ \(evalData,trainData) -> do
tbl <- if train
then do let stats = Map.fromListWith add2 [((crank,drank),(if cls then (1,0) else (0,1))) | (_,_,_,_,_,crank,drank,cls) <- trainData]
where
add2 (x1,y1) (x2,y2) = (x1+x2,y1+y2)
tbl = [[maybe 0 (\(c1,c2) -> fromIntegral c1 / fromIntegral (c1+c2)) (Map.lookup (rank,dist) stats) | dist <- [1..40]] | rank <- [1..7]]
writeFile "stats.tsv" (unlines [untsv [show crank,show drank,show c1,show c2] | ((crank,drank),(c1,c2)) <- Map.toList stats])
writeFile "table.tsv" (unlines (map (untsv . map show) tbl))
return tbl
else do fmap (map (map read . tsv) . lines) $ readFile "table.tsv"
g <- newStdGen
-- let predictions = randomChoice g evalData
let predictions = classify tbl evalData
--let predictions = alignmentChoice e2f evalData
writeFile "predictions.tsv" (unlines [untsv [sense_id,
lemma1,
lemma2,
show c,show d,
show crank,show drank,
show cls,show pred]
| (sense_id,lemma1,lemma2,c,d,crank,drank,cls,pred) <- predictions])
let result0 = Map.fromListWith (+) [((cls,pred),1) | (_,_,_,_,_,_,_,cls,pred) <- predictions]
total = length predictions
result = Map.map (\c -> fromIntegral c / fromIntegral total) result0
sum_result <- readIORef res
let sum_result' = Map.fromList [let k = (cls,pred) in (k,fromMaybe 0 (Map.lookup k result)+fromMaybe 0 (Map.lookup k sum_result)) | cls <- [True,False], pred <- [True,False]]
writeIORef res sum_result'
result <- readIORef res
writeFile ("result.tsv") (unlines [untsv ([show cls,show pred,show (c/(if useTenFold then 10 else 1))]) | ((cls,pred),c) <- Map.toList result])
toEntries lng =
foldr addElem Map.empty .
mapMaybe (toEntry lng . tsv) .
tail .
lines
where
toEntry lng [sense_id,rel,w]
| rel == "lemma" || rel == lng++":lemma" = Just (sense_id,w)
toEntry _ _ = Nothing
addElem (k,a) =
Map.alter (\mb_as -> Just (a:fromMaybe [] mb_as)) k
join src dst =
[(sense_id,x,y) | (sense_id,xs) <- Map.toList src,
x <- xs,
y <- fromMaybe [] (Map.lookup sense_id dst)]
addCounts src_dst =
let cmap = Map.fromListWith (+) [((x,y),1) | (sense_id,x,y) <- src_dst]
cdmap = Map.mapWithKey (\(x,y) c -> (c,dist x y)) cmap
in [(sense_id,x,y,c,d) | (sense_id,x,y) <- src_dst, let (c,d) = fromMaybe (0,0) (Map.lookup (x,y) cdmap)]
where
dist x y = levenshteinDistance defaultEditCosts (map toLower x) (map toLower y)
toReferencePair (fis:fi:ens:en:_) = (conv ens,en,fi)
where
conv s = drop 8 s ++ ['-',s !! 7]
toAlignmentPair (eng:fin:_:prob:_) = ((mapEng eng,mapFin fin),read prob :: Double)
where
mapEng w =
init w ++ ((:[]) $
case last w of
'n' -> 'n'
'v' -> 'v'
'j' -> 'a'
'r' -> 'r'
c -> c)
mapFin w =
init w ++ ((:[]) $
case last w of
'n' -> 'n'
'v' -> 'v'
'j' -> 'a'
'a' -> 'a'
'r' -> 'r'
c -> c)
tsv :: String -> [String]
tsv "" = []
tsv cs =
let (x,cs1) = break (=='\t') cs
in x : if null cs1 then [] else tsv (tail cs1)
untsv :: [String] -> String
untsv = intercalate "\t"
addFeatures ts ps =
let (xs,ys) = takeSynset ps
(cds,xs') = mapAccumL (addValues cds) (Set.empty,Set.empty) xs
in if null xs
then []
else xs' ++ addFeatures ts ys
where
takeSynset [] = ([],[])
takeSynset (p:ps) = let sense_id = get_sense_id p
(ps1,ps2) = break (\p1 -> get_sense_id p1 /= sense_id) ps
in (p : ps1, ps2)
where
get_sense_id (sense_id,_,_,_,_) = sense_id
addValues cds (cs,ds) (sense_id,lemma1,lemma2,c,d) =
let cls = Set.member (sense_id,lemma1,lemma2) ts
cs' = Set.insert c cs
ds' = Set.insert d ds
crank = findIndex 1 c (Set.toDescList (fst cds))
drank = findIndex 1 d (Set.toAscList (snd cds))
in ((cs',ds'),(sense_id,lemma1,lemma2,c,d,crank,drank,cls))
where
findIndex i x [] = i
findIndex i x (y:ys)
| x == y = i
| otherwise = findIndex (i+1) x ys
tenfold gen ps =
let synsets = takeSynset ps
len = length synsets
len10 = len `div` 10
in splitData len10 [] (shuffle' synsets len gen)
where
takeSynset [] = []
takeSynset (p:ps) = let sense_id = get_sense_id p
(ps1,ps2) = break (\p1 -> get_sense_id p1 /= sense_id) ps
in (p : ps1) : takeSynset ps2
where
get_sense_id (sense_id,_,_,_,_,_,_,_) = sense_id
splitData len10 zs ps =
let (xs,ys) = splitAt len10 ps
in if null ys
then []
else (concat xs,concat (zs++ys)) : splitData len10 (xs++zs) ys
--classify :: [[Double]] -> (String,Int,String,Int,String,Int,Int,Int,Int,Bool) -> (String,Int,String,Int,String,Int,Int,Int,Int,Bool,Bool)
{-classify tbl (sense_id,lemma_id1,lemma1,lemma_id2,lemma2,c,d,crank,drank,cls)
| tbl !! (crank-1) !! (drank-1) > 0.5 = (sense_id,lemma_id1,lemma1,lemma_id2,lemma2,c,d,crank,drank,cls,True)
| otherwise = (sense_id,lemma_id1,lemma1,lemma_id2,lemma2,c,d,crank,drank,cls,False)
-}
classify tbl ps =
let (xs,ys) = takeSynset ps
xs' = sortBy descProb (map pairProb xs)
(ids,sel1) = pick1 ([],[]) xs'
sel2 = pick2 ids xs'
sel = sel1++sel2
in if null xs
then []
else map (annotate sel) xs ++ classify tbl ys
where
takeSynset [] = ([],[])
takeSynset (p:ps) = let sense_id = get_sense_id p
(ps1,ps2) = break (\p1 -> get_sense_id p1 /= sense_id) ps
in (p : ps1, ps2)
where
get_sense_id (sense_id,_,_,_,_,_,_,_) = sense_id
pairProb x@(sense_id,lemma1,lemma2,c,d,crank,drank,cls) = (lemma1,lemma2,tbl !! (crank-1) !! (drank-1))
descProb (_,_,p1) (_,_,p2) = compare p2 p1
pick1 ids [] = (ids,[])
pick1 ids@(ids1,ids2) ((lemma1,lemma2,prob):xs)
| not (elem lemma1 ids1 || elem lemma2 ids2) = let (ids',xs') = pick1 (lemma1:ids1,lemma2:ids2) xs
in (ids',(lemma1,lemma2) : xs')
| otherwise = pick1 ids xs
pick2 ids [] = []
pick2 ids@(ids1,ids2) ((lemma1,lemma2,prob):xs)
| not (elem lemma1 ids1) = (lemma1,lemma2) : pick2 (lemma1:ids1,lemma2:ids2) xs
| not (elem lemma2 ids2) = (lemma1,lemma2) : pick2 (lemma1:ids1,lemma2:ids2) xs
| otherwise = pick2 ids xs
annotate sel (sense_id,lemma1,lemma2,c,d,crank,drank,cls) =
(sense_id,lemma1,lemma2,c,d,crank,drank,cls,elem (lemma1,lemma2) sel)
randomChoice g ps =
let (xs,ys) = takeSynset ps
(g',xs') = mapAccumL pairProb g xs
sel = pick [] (sortBy descProb xs')
in if null xs
then []
else map (annotate sel) xs ++ randomChoice g' ys
where
takeSynset [] = ([],[])
takeSynset (p:ps) = let sense_id = get_sense_id p
(ps1,ps2) = break (\p1 -> get_sense_id p1 /= sense_id) ps
in (p : ps1, ps2)
where
get_sense_id (sense_id,_,_,_,_,_,_,_) = sense_id
pairProb g x@(sense_id,lemma1,lemma2,c,d,crank,drank,cls) =
let (prob,g') = randomR (0.0,1.0::Double) g
in (g',(lemma1,lemma2,prob))
descProb (_,_,p1) (_,_,p2) = compare p2 p1
pick ids [] = []
pick ids ((lemma1,lemma2,prob):xs)
| not (elem lemma1 ids) = (lemma1,lemma2) : pick (lemma1:lemma2:ids) xs
| not (elem lemma2 ids) = (lemma1,lemma2) : pick (lemma1:lemma2:ids) xs
| otherwise = pick ids xs
annotate sel (sense_id,lemma1,lemma2,c,d,crank,drank,cls) =
(sense_id,lemma1,lemma2,c,d,crank,drank,cls,elem (lemma1,lemma2) sel)
alignmentChoice e2f ps =
let (xs,ys) = takeSynset ps
xs' = map pairProb xs
sel = pick ([],[]) (sortBy descProb xs')
in if null xs
then []
else map (annotate sel) xs ++ alignmentChoice e2f ys
where
takeSynset [] = ([],[])
takeSynset (p:ps) = let sense_id = get_sense_id p
(ps1,ps2) = break (\p1 -> get_sense_id p1 /= sense_id) ps
in (p : ps1, ps2)
where
get_sense_id (sense_id,_,_,_,_,_,_,_) = sense_id
pairProb x@(sense_id,lemma1,lemma2,c,d,crank,drank,cls) =
let prob = fromMaybe 0 (Map.lookup (lemma1++"|"++[sense_id!!9],lemma2++"|"++[sense_id!!9]) e2f)
in (lemma1,lemma2,prob)
descProb (_,_,p1) (_,_,p2) = compare p2 p1
pick ids [] = []
pick ids@(ids1,ids2) ((lemma1,lemma2,prob):xs)
| not (elem lemma1 ids1) = (lemma1,lemma2) : pick (lemma1:ids1,lemma2:ids2) xs
| not (elem lemma2 ids2) = (lemma1,lemma2) : pick (lemma1:ids1,lemma2:ids2) xs
| otherwise = pick ids xs
annotate sel (sense_id,lemma1,lemma2,c,d,crank,drank,cls) =
(sense_id,lemma1,lemma2,c,d,crank,drank,cls,elem (lemma1,lemma2) sel)