mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-09 04:59:31 -06:00
added classify.hs for building dictionaries from WordNet
This commit is contained in:
275
lib/src/translator/classify.hs
Normal file
275
lib/src/translator/classify.hs
Normal file
@@ -0,0 +1,275 @@
|
||||
import Text.EditDistance
|
||||
import Data.List
|
||||
import qualified Data.Set as Set
|
||||
import qualified Data.Map.Strict as Map
|
||||
import Data.Maybe(maybe,mapMaybe,fromMaybe)
|
||||
import Data.Char
|
||||
import Data.IORef
|
||||
import Debug.Trace
|
||||
import System.Random
|
||||
import System.Random.Shuffle
|
||||
import System.Environment
|
||||
import Translit
|
||||
import Control.Monad(forM_)
|
||||
|
||||
train = False
|
||||
useTenFold = False
|
||||
|
||||
main = do
|
||||
[src,dst] <- getArgs
|
||||
src <- fmap (toEntries src) $ readFile ("data/wn-data-"++src++".tab")
|
||||
dst <- fmap (toEntries dst) $ readFile ("data/wn-data-"++dst++".tab")
|
||||
gen <- newStdGen
|
||||
let wps = addCounts (join src dst)
|
||||
|
||||
rps <- if train
|
||||
then fmap (Set.fromList . map (toReferencePair . tsv) . lines) $ readFile "data/fiwn-transls.tsv"
|
||||
else return Set.empty
|
||||
e2f <- fmap (Map.fromList . map (toAlignmentPair . words) . lines) $ readFile "data/europarl-v7.fi-en.lemma.intersect.lex.e2f"
|
||||
|
||||
let features = addFeatures rps wps
|
||||
|
||||
res <- newIORef Map.empty
|
||||
(if useTenFold
|
||||
then forM_ (tenfold gen features)
|
||||
else (>>=) (return (features,features))) $ \(evalData,trainData) -> do
|
||||
tbl <- if train
|
||||
then do let stats = Map.fromListWith add2 [((crank,drank),(if cls then (1,0) else (0,1))) | (_,_,_,_,_,crank,drank,cls) <- trainData]
|
||||
where
|
||||
add2 (x1,y1) (x2,y2) = (x1+x2,y1+y2)
|
||||
tbl = [[maybe 0 (\(c1,c2) -> fromIntegral c1 / fromIntegral (c1+c2)) (Map.lookup (rank,dist) stats) | dist <- [1..40]] | rank <- [1..7]]
|
||||
writeFile "stats.tsv" (unlines [untsv [show crank,show drank,show c1,show c2] | ((crank,drank),(c1,c2)) <- Map.toList stats])
|
||||
writeFile "table.tsv" (unlines (map (untsv . map show) tbl))
|
||||
return tbl
|
||||
else do fmap (map (map read . tsv) . lines) $ readFile "table.tsv"
|
||||
|
||||
g <- newStdGen
|
||||
-- let predictions = randomChoice g evalData
|
||||
let predictions = classify tbl evalData
|
||||
--let predictions = alignmentChoice e2f evalData
|
||||
writeFile "predictions.tsv" (unlines [untsv [sense_id,
|
||||
lemma1,
|
||||
lemma2,
|
||||
show c,show d,
|
||||
show crank,show drank,
|
||||
show cls,show pred]
|
||||
| (sense_id,lemma1,lemma2,c,d,crank,drank,cls,pred) <- predictions])
|
||||
|
||||
let result0 = Map.fromListWith (+) [((cls,pred),1) | (_,_,_,_,_,_,_,cls,pred) <- predictions]
|
||||
total = length predictions
|
||||
result = Map.map (\c -> fromIntegral c / fromIntegral total) result0
|
||||
|
||||
sum_result <- readIORef res
|
||||
let sum_result' = Map.fromList [let k = (cls,pred) in (k,fromMaybe 0 (Map.lookup k result)+fromMaybe 0 (Map.lookup k sum_result)) | cls <- [True,False], pred <- [True,False]]
|
||||
writeIORef res sum_result'
|
||||
|
||||
result <- readIORef res
|
||||
writeFile ("result.tsv") (unlines [untsv ([show cls,show pred,show (c/(if useTenFold then 10 else 1))]) | ((cls,pred),c) <- Map.toList result])
|
||||
|
||||
toEntries lng =
|
||||
foldr addElem Map.empty .
|
||||
mapMaybe (toEntry lng . tsv) .
|
||||
tail .
|
||||
lines
|
||||
where
|
||||
toEntry lng [sense_id,rel,w]
|
||||
| rel == "lemma" || rel == lng++":lemma" = Just (sense_id,w)
|
||||
toEntry _ _ = Nothing
|
||||
|
||||
addElem (k,a) =
|
||||
Map.alter (\mb_as -> Just (a:fromMaybe [] mb_as)) k
|
||||
|
||||
join src dst =
|
||||
[(sense_id,x,y) | (sense_id,xs) <- Map.toList src,
|
||||
x <- xs,
|
||||
y <- fromMaybe [] (Map.lookup sense_id dst)]
|
||||
|
||||
addCounts src_dst =
|
||||
let cmap = Map.fromListWith (+) [((x,y),1) | (sense_id,x,y) <- src_dst]
|
||||
cdmap = Map.mapWithKey (\(x,y) c -> (c,dist x y)) cmap
|
||||
in [(sense_id,x,y,c,d) | (sense_id,x,y) <- src_dst, let (c,d) = fromMaybe (0,0) (Map.lookup (x,y) cdmap)]
|
||||
where
|
||||
dist x y = levenshteinDistance defaultEditCosts (map toLower x) (map toLower y)
|
||||
|
||||
|
||||
toReferencePair (fis:fi:ens:en:_) = (conv ens,en,fi)
|
||||
where
|
||||
conv s = drop 8 s ++ ['-',s !! 7]
|
||||
|
||||
toAlignmentPair (eng:fin:_:prob:_) = ((mapEng eng,mapFin fin),read prob :: Double)
|
||||
where
|
||||
mapEng w =
|
||||
init w ++ ((:[]) $
|
||||
case last w of
|
||||
'n' -> 'n'
|
||||
'v' -> 'v'
|
||||
'j' -> 'a'
|
||||
'r' -> 'r'
|
||||
c -> c)
|
||||
|
||||
mapFin w =
|
||||
init w ++ ((:[]) $
|
||||
case last w of
|
||||
'n' -> 'n'
|
||||
'v' -> 'v'
|
||||
'j' -> 'a'
|
||||
'a' -> 'a'
|
||||
'r' -> 'r'
|
||||
c -> c)
|
||||
|
||||
tsv :: String -> [String]
|
||||
tsv "" = []
|
||||
tsv cs =
|
||||
let (x,cs1) = break (=='\t') cs
|
||||
in x : if null cs1 then [] else tsv (tail cs1)
|
||||
|
||||
untsv :: [String] -> String
|
||||
untsv = intercalate "\t"
|
||||
|
||||
addFeatures ts ps =
|
||||
let (xs,ys) = takeSynset ps
|
||||
(cds,xs') = mapAccumL (addValues cds) (Set.empty,Set.empty) xs
|
||||
in if null xs
|
||||
then []
|
||||
else xs' ++ addFeatures ts ys
|
||||
where
|
||||
takeSynset [] = ([],[])
|
||||
takeSynset (p:ps) = let sense_id = get_sense_id p
|
||||
(ps1,ps2) = break (\p1 -> get_sense_id p1 /= sense_id) ps
|
||||
in (p : ps1, ps2)
|
||||
where
|
||||
get_sense_id (sense_id,_,_,_,_) = sense_id
|
||||
|
||||
addValues cds (cs,ds) (sense_id,lemma1,lemma2,c,d) =
|
||||
let cls = Set.member (sense_id,lemma1,lemma2) ts
|
||||
cs' = Set.insert c cs
|
||||
ds' = Set.insert d ds
|
||||
crank = findIndex 1 c (Set.toDescList (fst cds))
|
||||
drank = findIndex 1 d (Set.toAscList (snd cds))
|
||||
in ((cs',ds'),(sense_id,lemma1,lemma2,c,d,crank,drank,cls))
|
||||
where
|
||||
findIndex i x [] = i
|
||||
findIndex i x (y:ys)
|
||||
| x == y = i
|
||||
| otherwise = findIndex (i+1) x ys
|
||||
|
||||
tenfold gen ps =
|
||||
let synsets = takeSynset ps
|
||||
len = length synsets
|
||||
len10 = len `div` 10
|
||||
in splitData len10 [] (shuffle' synsets len gen)
|
||||
where
|
||||
takeSynset [] = []
|
||||
takeSynset (p:ps) = let sense_id = get_sense_id p
|
||||
(ps1,ps2) = break (\p1 -> get_sense_id p1 /= sense_id) ps
|
||||
in (p : ps1) : takeSynset ps2
|
||||
where
|
||||
get_sense_id (sense_id,_,_,_,_,_,_,_) = sense_id
|
||||
|
||||
splitData len10 zs ps =
|
||||
let (xs,ys) = splitAt len10 ps
|
||||
in if null ys
|
||||
then []
|
||||
else (concat xs,concat (zs++ys)) : splitData len10 (xs++zs) ys
|
||||
|
||||
--classify :: [[Double]] -> (String,Int,String,Int,String,Int,Int,Int,Int,Bool) -> (String,Int,String,Int,String,Int,Int,Int,Int,Bool,Bool)
|
||||
{-classify tbl (sense_id,lemma_id1,lemma1,lemma_id2,lemma2,c,d,crank,drank,cls)
|
||||
| tbl !! (crank-1) !! (drank-1) > 0.5 = (sense_id,lemma_id1,lemma1,lemma_id2,lemma2,c,d,crank,drank,cls,True)
|
||||
| otherwise = (sense_id,lemma_id1,lemma1,lemma_id2,lemma2,c,d,crank,drank,cls,False)
|
||||
-}
|
||||
classify tbl ps =
|
||||
let (xs,ys) = takeSynset ps
|
||||
xs' = sortBy descProb (map pairProb xs)
|
||||
(ids,sel1) = pick1 ([],[]) xs'
|
||||
sel2 = pick2 ids xs'
|
||||
sel = sel1++sel2
|
||||
in if null xs
|
||||
then []
|
||||
else map (annotate sel) xs ++ classify tbl ys
|
||||
where
|
||||
takeSynset [] = ([],[])
|
||||
takeSynset (p:ps) = let sense_id = get_sense_id p
|
||||
(ps1,ps2) = break (\p1 -> get_sense_id p1 /= sense_id) ps
|
||||
in (p : ps1, ps2)
|
||||
where
|
||||
get_sense_id (sense_id,_,_,_,_,_,_,_) = sense_id
|
||||
|
||||
pairProb x@(sense_id,lemma1,lemma2,c,d,crank,drank,cls) = (lemma1,lemma2,tbl !! (crank-1) !! (drank-1))
|
||||
|
||||
descProb (_,_,p1) (_,_,p2) = compare p2 p1
|
||||
|
||||
pick1 ids [] = (ids,[])
|
||||
pick1 ids@(ids1,ids2) ((lemma1,lemma2,prob):xs)
|
||||
| not (elem lemma1 ids1 || elem lemma2 ids2) = let (ids',xs') = pick1 (lemma1:ids1,lemma2:ids2) xs
|
||||
in (ids',(lemma1,lemma2) : xs')
|
||||
| otherwise = pick1 ids xs
|
||||
|
||||
pick2 ids [] = []
|
||||
pick2 ids@(ids1,ids2) ((lemma1,lemma2,prob):xs)
|
||||
| not (elem lemma1 ids1) = (lemma1,lemma2) : pick2 (lemma1:ids1,lemma2:ids2) xs
|
||||
| not (elem lemma2 ids2) = (lemma1,lemma2) : pick2 (lemma1:ids1,lemma2:ids2) xs
|
||||
| otherwise = pick2 ids xs
|
||||
|
||||
annotate sel (sense_id,lemma1,lemma2,c,d,crank,drank,cls) =
|
||||
(sense_id,lemma1,lemma2,c,d,crank,drank,cls,elem (lemma1,lemma2) sel)
|
||||
|
||||
randomChoice g ps =
|
||||
let (xs,ys) = takeSynset ps
|
||||
(g',xs') = mapAccumL pairProb g xs
|
||||
sel = pick [] (sortBy descProb xs')
|
||||
in if null xs
|
||||
then []
|
||||
else map (annotate sel) xs ++ randomChoice g' ys
|
||||
where
|
||||
takeSynset [] = ([],[])
|
||||
takeSynset (p:ps) = let sense_id = get_sense_id p
|
||||
(ps1,ps2) = break (\p1 -> get_sense_id p1 /= sense_id) ps
|
||||
in (p : ps1, ps2)
|
||||
where
|
||||
get_sense_id (sense_id,_,_,_,_,_,_,_) = sense_id
|
||||
|
||||
pairProb g x@(sense_id,lemma1,lemma2,c,d,crank,drank,cls) =
|
||||
let (prob,g') = randomR (0.0,1.0::Double) g
|
||||
in (g',(lemma1,lemma2,prob))
|
||||
|
||||
descProb (_,_,p1) (_,_,p2) = compare p2 p1
|
||||
|
||||
pick ids [] = []
|
||||
pick ids ((lemma1,lemma2,prob):xs)
|
||||
| not (elem lemma1 ids) = (lemma1,lemma2) : pick (lemma1:lemma2:ids) xs
|
||||
| not (elem lemma2 ids) = (lemma1,lemma2) : pick (lemma1:lemma2:ids) xs
|
||||
| otherwise = pick ids xs
|
||||
|
||||
annotate sel (sense_id,lemma1,lemma2,c,d,crank,drank,cls) =
|
||||
(sense_id,lemma1,lemma2,c,d,crank,drank,cls,elem (lemma1,lemma2) sel)
|
||||
|
||||
|
||||
alignmentChoice e2f ps =
|
||||
let (xs,ys) = takeSynset ps
|
||||
xs' = map pairProb xs
|
||||
sel = pick ([],[]) (sortBy descProb xs')
|
||||
in if null xs
|
||||
then []
|
||||
else map (annotate sel) xs ++ alignmentChoice e2f ys
|
||||
where
|
||||
takeSynset [] = ([],[])
|
||||
takeSynset (p:ps) = let sense_id = get_sense_id p
|
||||
(ps1,ps2) = break (\p1 -> get_sense_id p1 /= sense_id) ps
|
||||
in (p : ps1, ps2)
|
||||
where
|
||||
get_sense_id (sense_id,_,_,_,_,_,_,_) = sense_id
|
||||
|
||||
pairProb x@(sense_id,lemma1,lemma2,c,d,crank,drank,cls) =
|
||||
let prob = fromMaybe 0 (Map.lookup (lemma1++"|"++[sense_id!!9],lemma2++"|"++[sense_id!!9]) e2f)
|
||||
in (lemma1,lemma2,prob)
|
||||
|
||||
descProb (_,_,p1) (_,_,p2) = compare p2 p1
|
||||
|
||||
pick ids [] = []
|
||||
pick ids@(ids1,ids2) ((lemma1,lemma2,prob):xs)
|
||||
| not (elem lemma1 ids1) = (lemma1,lemma2) : pick (lemma1:ids1,lemma2:ids2) xs
|
||||
| not (elem lemma2 ids2) = (lemma1,lemma2) : pick (lemma1:ids1,lemma2:ids2) xs
|
||||
| otherwise = pick ids xs
|
||||
|
||||
annotate sel (sense_id,lemma1,lemma2,c,d,crank,drank,cls) =
|
||||
(sense_id,lemma1,lemma2,c,d,crank,drank,cls,elem (lemma1,lemma2) sel)
|
||||
Reference in New Issue
Block a user