mirror of
https://github.com/GrammaticalFramework/gf-rgl.git
synced 2026-06-13 00:50:11 -06:00
117 lines
4.1 KiB
Haskell
117 lines
4.1 KiB
Haskell
import Data.Map
|
|
import Data.List
|
|
import Data.Char
|
|
import System
|
|
|
|
-- a script for extracting an English-Finnish translation dictionary from
|
|
-- (1) Eng-Fin wordnet links
|
|
-- (2) Fin frequency dictionary
|
|
-- (3) Fin KOTUS morpho wordlist
|
|
-- usage: runghc FreqFin.hs, which produces DictEngFin.gf (with appropriate files in place)
|
|
-- AR 23/5/2012
|
|
|
|
main = do
|
|
lexicon <- readFile "../../abstract/Lexicon.gf" >>= return . getLexicon
|
|
freqs <- readFile "taajuus.txt" >>= return . getFreqMap
|
|
morpho <- readFile "../DictFin.gf" >>= return . getMorphoMap
|
|
transV <- readFile "Ven_fi.txt" >>= return . getTransDict "V" freqs morpho
|
|
transV2 <- readFile "V2en_fi.txt" >>= return . getTransDict "V2" freqs morpho
|
|
transA <- readFile "Aen_fi.txt" >>= return . getTransDict "A" freqs morpho
|
|
transN <- readFile "Nen_fi.txt" >>= return . getTransDict "N" freqs morpho
|
|
transAdv <- readFile "Adven_fi.txt" >>= return . getTransDict "Adv" freqs morpho
|
|
let cnc = sort $ lmap mkLin $ Data.List.filter (notLex lexicon) $ transV ++ transV2 ++ transA ++ transN ++ transAdv
|
|
system $ "cp dictBegin dictEngFin"
|
|
mapM_ (appendFile "dictEngFin") cnc
|
|
system $ "cat dictEngFin dictEnd >DictEngFin.gf"
|
|
|
|
getFreqMap = fromList . lmap (getFreq . words) . lines
|
|
|
|
lmap = Prelude.map
|
|
mlookup = Data.Map.lookup
|
|
lnull = Prelude.null
|
|
|
|
|
|
type FreqMap = Map Word (Rank,Cat)
|
|
type Rank = Int
|
|
type Cat = String
|
|
type Word = String
|
|
type Lin = String
|
|
|
|
getFreq :: [String] -> (Word,(Rank,Cat))
|
|
getFreq ws = case ws of
|
|
n:a:r:w:c:_ -> (w,(read n,c))
|
|
|
|
|
|
-- trusted lexicon overrides all other decisions
|
|
getLexicon :: String -> MorphoMap
|
|
getLexicon = fromList . concat . lmap (getLex . words) . lines where
|
|
getLex ws = case ws of
|
|
fun:":":cat:_ -> [(takeWhile (/='_') fun, (cat,fun))]
|
|
_ -> []
|
|
|
|
notLex :: MorphoMap -> (Word,(Cat,[(Word,(Rank,Lin))])) -> Bool
|
|
notLex morpho (word,(cat,_)) = case mlookup word morpho of
|
|
Just (c,l) -> False --- | c == cat -> False --- love_V2/love_N
|
|
_ -> True
|
|
|
|
type MorphoMap = Map Word (Cat,Lin)
|
|
|
|
getMorphoMap = fromList . concat . lmap (getMorpho . words) . lines
|
|
|
|
getMorpho ws = case ws of
|
|
"lin":w:_:vs -> [(fst (wordcat w), (snd (wordcat w), unwords (init vs)))]
|
|
_ -> []
|
|
where
|
|
wordcat w = let (wd,c) = break (=='_') w in (wd, init (tail c))
|
|
|
|
type TransDict = [(Word,(Cat,[(Word,(Rank,Lin))]))]
|
|
|
|
getTransDict :: Cat -> FreqMap -> MorphoMap -> String -> TransDict
|
|
getTransDict cat freqs morpho = lmap getOne . lmap (lmap words) . stanzas . lines
|
|
where
|
|
getOne ls@((w:_):_) = (w,(cat, sortTrans cat [getRank vs | _:vs <- ls]))
|
|
getRank (v:[]) = case (mlookup v freqs, mlookup v morpho) of
|
|
(Just (i,c), Just (k,l)) | compatCat cat c && compatCat cat k -> (v, (i, linK l))
|
|
(Just (i,c), _) | compatCat cat c -> (v, (i, lin ("\"" ++ v ++ "\"")))
|
|
(_, Just (c,l)) | compatCat cat c -> (v, (morphoRank, linK l))
|
|
_ | all isLetter (take 1 v) && notVerb cat -> (v,(guessRank,lin ("\"" ++ v ++ "\"")))
|
|
_ -> (v,(noRank,lin v))
|
|
getRank vs = (unwords vs, (compRank,lin (unwords vs)))
|
|
|
|
lin l = "mk" ++ cat ++ " " ++ l
|
|
linK l = lin ("(lin " ++ catK ++ "K " ++ l ++ ")")
|
|
catK = case cat of
|
|
"V2" -> "V"
|
|
_ -> cat
|
|
notVerb cat = take 1 cat /= "V" -- can produce non-verbs
|
|
|
|
sortTrans :: Cat -> [(Word,(Rank,Lin))] -> [(Word,(Rank,Lin))]
|
|
sortTrans cat = chooseBest . sortBy (\ (_,(r,_)) (_,(s,_)) -> compare r s) where
|
|
chooseBest = take 1 ----
|
|
|
|
compatCat cat c = case cat of
|
|
"V2" -> c == "V"
|
|
_ -> c == cat
|
|
|
|
morphoRank, guessRank, noRank, compRank :: Int
|
|
morphoRank = 10000
|
|
guessRank = 20000
|
|
compRank = 30000
|
|
noRank = 40000
|
|
|
|
mkLin :: (Word,(Cat,[(Word,(Rank,Lin))])) -> String
|
|
mkLin (word,(cat,ws)) = unwords $ [keyw,fun,"=",lin,"; --",rank,"\n"] where
|
|
fun = lmap clean word ++ "_" ++ cat
|
|
(keyw,lin,rank) = case ws of
|
|
(w,(r,l)):_ | r < guessRank -> ("lin", l,show r)
|
|
(w,(r,_)):_ -> ("-- lin", "\"" ++ w ++ "\"",show r) -- non-wordnet or many-word
|
|
clean c = case c of
|
|
'-' -> '_'
|
|
_ -> c
|
|
|
|
stanzas :: [String] -> [[String]]
|
|
stanzas ls = case ls of
|
|
[]:ls2 -> stanzas ls2
|
|
_:_ -> let (ls1,ls2) = span (not . lnull) ls in ls1 : stanzas ls2
|
|
[] -> []
|