diff --git a/lib/src/swedish/DictEngSwe.gf b/lib/src/swedish/DictEngSwe.gf index d288e5b13..7dc5a18a9 100644 --- a/lib/src/swedish/DictEngSwe.gf +++ b/lib/src/swedish/DictEngSwe.gf @@ -1,9 +1,10 @@ ---# -path=.:alltenses +--# -path=.:../english:folketslexikon -concrete DictEngSwe of DictEngAbs = CatSwe ** open ParadigmsSwe, +concrete DictEngSwe of DictEngAbs = CatSwe, FolketsLexikon ** open (S = StructuralSwe), - (L = LexiconSwe), - MorphoSwe, ParadigmsSwe, Prelude in { + (L = LexiconSwe) +--, MorphoSwe, ParadigmsSwe, Prelude + in { flags coding=utf8 ; diff --git a/lib/src/swedish/folketslexikon/ImportFolketsLexikon.hs b/lib/src/swedish/folketslexikon/ImportFolketsLexikon.hs new file mode 100644 index 000000000..bece6c965 --- /dev/null +++ b/lib/src/swedish/folketslexikon/ImportFolketsLexikon.hs @@ -0,0 +1,141 @@ +-- A script for adding missing words to DictEngSwe by taking them from +-- from DictSwe, guided by translations in Folkets Lexikon, +-- http://folkets-lexikon.csc.kth.se/folkets/om.html + +-- Run it from lib/src/swedish/folketslexikon with +RTS -K32M -M1500M + +-- TODO: alo try Lexin + +import Prelude hiding (lookup) +import Data.List hiding (lookup) +import System.Directory(doesFileExist) +import System.Cmd(system) +import System.Exit +import Control.Monad(unless) +import Data.Maybe(fromJust) +import qualified Data.Map as M +import Text.XML.Light +import PGF + +main = do prepare + create + +--- Source and target names ---------------------------------------------------- + +folkets_root = "http://folkets-lexikon.csc.kth.se/folkets/" +folkets_en_sv = "folkets_en_sv_public.xml" +folkets_sv_en = "folkets_sv_en_public.xml" + +name = "FolketsLexikon" +target = name++".gf" + +--- Prepare -------------------------------------------------------------------- + +prepare = + do writeFile target dummy + download folkets_root folkets_en_sv + download folkets_root folkets_sv_en + makePGF pre "../../english/" "DictEngAbs.pgf" "DictEng.gf" + gfmake engswe "../" "DictEngSwe.gf" + makePGF swe "../" "DictSweAbs.pgf" "DictSwe.gf" + where + download url file = ifMissing file $ curl (url++file) + makePGF imps dir pgf src = ifMissing (dir++pgf) (gfmake imps dir src) + + curl url = shell $ ["curl -O",url] + gfmake imps dir src = shell $ ["cd",dir,"&&","gf",opts,imp imps,src] + + opts = "-s -make -no-pmcfg -gfo-dir ../../alltenses" + imp dirs = unwords ["-i ../"++dir|dir<-dirs] + pre = ["prelude","abstract","common"] + swe = pre++["scandinavian"] + engswe = swe++["english"] + + ifMissing path cmd = do e <- doesFileExist path + unless e cmd + + shell ws = do let cmd = unwords ws + putStrLn cmd + e <- system cmd + case e of + ExitSuccess -> return () + _ -> fail "command failed" + +--- Create --------------------------------------------------------------------- + +create = + do eng <- readPGF "../../english/DictEngAbs.pgf" + lexicon1 <- fromFolketsEnSv `fmap` readFile folkets_en_sv + lexicon2 <- fromFolketsSvEn `fmap` readFile folkets_sv_en + let lexicon = M.unionWith (++) lexicon1 lexicon2 + missing <- missingWords `fmap` readPGF "../DictEngAbs.pgf" + saldo <- dict2map `fmap` readPGF "../DictSweAbs.pgf" + let entries = map (translate lexicon eng saldo) missing + putStrLn $ "Looking for "++show (length missing)++" missing words." + writeFile target . concWrap $ map toGF entries + let histo = M.toList $ M.fromListWith (+) [(length ts,1)|(_,ts)<-entries] + putStrLn $ "Found "++show (sum [n|(a,n)<-histo,a>0])++" words." + mapM_ print histo + +lang1 = head . languages + +-- Finding functions by parsing is too slow. Linearize all functions instead. +dict2map :: PGF -> M.Map String [CId] +dict2map pgf = fromList [(w,[f])|f<-functions pgf,w<-lins pgf f] + +lin dict f = linearize dict (lang1 dict) (mkApp f []) + +lins dict f = nub . filter (/="") . map snd . concat . + tabularLinearizes dict (lang1 dict) $ mkApp f [] + +missingWords pgf = missingLins pgf (lang1 pgf) + +fromFolketsEnSv :: String -> M.Map String [String] +fromFolketsEnSv = + fmap nub . fromList . + map entry . findElements (unqual "word") . fromJust . parseXMLDoc + where + entry w = (value w,map value (findChildren (unqual "translation") w)) + value = fromJust . findAttr (unqual "value") + + +fromFolketsSvEn :: String -> M.Map String [String] +fromFolketsSvEn = + fmap nub . fromList . + concatMap entry . findElements (unqual "word") . fromJust . parseXMLDoc + where + entry w = [(en,[sv])|let sv=value w, + en<-map value (findChildren (unqual "translation") w)] + value = fromJust . findAttr (unqual "value") + +lookup m = maybe [] id . M.lookup m +fromList xs = M.fromListWith (++) xs + +translate lexicon eng saldo eabs = + (eabs,nub [sabs|let en=lin eng eabs, + sv<-lookup en lexicon, + sabs<-lookup sv saldo, + sameCat eabs sabs]) + +sameCat f1 f2 = suffix f1==suffix f2 + where suffix = takeWhile (/='_') . reverse . show + +toGF (eabs,[]) = "--"++show eabs++"\n" +toGF (eabs,s:ss) = " "++en++" = "++show s++";" + ++" -- "++show (1+length ss) + ++"\n" + ++unlines [indent++"--"++show s|s<-{-take 10-} ss] + where + en = show eabs + indent = replicate (length en+3) ' ' + +concWrap ws = unlines header++concat ws++unlines footer + +header = ["concrete "++name++" of DictEngAbs = CatSwe ** open DictSwe in {", + "", + "flags coding=utf8 ;", + "", + "lin"] +footer = ["}"] + +dummy = unlines (init header++footer) diff --git a/lib/src/swedish/folketslexikon/Makefile b/lib/src/swedish/folketslexikon/Makefile new file mode 100644 index 000000000..5a79633ae --- /dev/null +++ b/lib/src/swedish/folketslexikon/Makefile @@ -0,0 +1,7 @@ + +FolketsLexikon.gf: + ghc --make -O ImportFolketsLexikon.hs -rtsopts + ./ImportFolketsLexikon +RTS -K32M -M1500M + +clean: + rm -rf *.o *.hi *.gfo *.xml FolketsLexikon.gf ImportFolketsLexikon