Adding a script to import Folkets Lexikon

In order to compile DictEngSwe.gf, you must now first run this script to create
FolketsLexikon.gf.

	cd lib/src/swedish/folketslexikon
	make

This will automatically download and convert Folkets Lexikon.
If the download fails, a dummy version of FolketsLexikon.gf will remain,
allowing DictEngSwe.gf to be compiled anyway.
This commit is contained in:
hallgren
2013-04-10 16:25:48 +00:00
parent 45197acd2d
commit 6689677165
3 changed files with 153 additions and 4 deletions

View File

@@ -1,9 +1,10 @@
--# -path=.:alltenses
--# -path=.:../english:folketslexikon
concrete DictEngSwe of DictEngAbs = CatSwe ** open ParadigmsSwe,
concrete DictEngSwe of DictEngAbs = CatSwe, FolketsLexikon ** open
(S = StructuralSwe),
(L = LexiconSwe),
MorphoSwe, ParadigmsSwe, Prelude in {
(L = LexiconSwe)
--, MorphoSwe, ParadigmsSwe, Prelude
in {
flags coding=utf8 ;

View File

@@ -0,0 +1,141 @@
-- A script for adding missing words to DictEngSwe by taking them from
-- DictSwe, guided by translations in Folkets Lexikon,
-- http://folkets-lexikon.csc.kth.se/folkets/om.html
-- Run it from lib/src/swedish/folketslexikon with +RTS -K32M -M1500M
-- TODO: also try Lexin
import Prelude hiding (lookup)
import Data.List hiding (lookup)
import System.Directory(doesFileExist)
import System.Cmd(system)
import System.Exit
import Control.Monad(unless)
import Data.Maybe(fromJust)
import qualified Data.Map as M
import Text.XML.Light
import PGF
-- | Fetch the source data and dictionaries, then generate FolketsLexikon.gf.
main = prepare >> create
--- Source and target names ----------------------------------------------------
-- Where Folkets Lexikon is published, and the two XML dumps we download.
folkets_root  = "http://folkets-lexikon.csc.kth.se/folkets/"
folkets_en_sv = "folkets_en_sv_public.xml"
folkets_sv_en = "folkets_sv_en_public.xml"

-- Name of the generated GF concrete syntax and the file it is written to.
name   = "FolketsLexikon"
target = name ++ ".gf"
--- Prepare --------------------------------------------------------------------
-- | Download the Folkets Lexikon XML dumps and build the PGF dictionaries
-- that 'create' reads.  A dummy FolketsLexikon.gf is written first, so the
-- grammar still compiles even if a later step fails.  Downloads and PGF
-- builds are skipped when their output file already exists.
prepare = do
    writeFile target dummy
    mapM_ (download folkets_root) [folkets_en_sv, folkets_sv_en]
    makePGF pre    "../../english/" "DictEngAbs.pgf" "DictEng.gf"
    gfmake  engswe "../"            "DictEngSwe.gf"
    makePGF swe    "../"            "DictSweAbs.pgf" "DictSwe.gf"
  where
    -- Fetch one file from the lexicon site, unless it is already here.
    download base fname = ifMissing fname $ curl (base ++ fname)
    -- Compile src to a PGF in dir, unless the PGF already exists.
    makePGF libs dir pgf src = ifMissing (dir ++ pgf) (gfmake libs dir src)
    curl url = shell ["curl -O", url]
    -- Run gf in dir with the given GF library search path.
    gfmake libs dir src = shell ["cd", dir, "&&", "gf", opts, imp libs, src]
    opts = "-s -make -no-pmcfg -gfo-dir ../../alltenses"
    imp libs = unwords ["-i ../" ++ lib | lib <- libs]
    -- GF library directories, from most basic to most specific.
    pre    = ["prelude","abstract","common"]
    swe    = pre ++ ["scandinavian"]
    engswe = swe ++ ["english"]
-- | Run the action only when the given file does not already exist.
ifMissing path act = do
    present <- doesFileExist path
    unless present act
-- | Echo a shell command (assembled from words) and run it.  Aborts with an
-- error naming the command and its exit code when it fails — the original
-- bare "command failed" message gave no clue which of the many commands
-- run by 'prepare' went wrong.
shell ws = do let cmd = unwords ws
              putStrLn cmd
              status <- system cmd
              case status of
                ExitSuccess   -> return ()
                ExitFailure n ->
                  fail ("command failed (exit code "++show n++"): "++cmd)
--- Create ---------------------------------------------------------------------
-- | Generate FolketsLexikon.gf: for every abstract function missing a Swedish
-- linearization, look up its English form in the merged Folkets lexicon and
-- pick SALDO (DictSwe) entries of the same category.  Prints a histogram of
-- how many candidates each missing word got.
create =
do eng <- readPGF "../../english/DictEngAbs.pgf"
-- Both dump directions are folded into one English -> Swedish multimap.
lexicon1 <- fromFolketsEnSv `fmap` readFile folkets_en_sv
lexicon2 <- fromFolketsSvEn `fmap` readFile folkets_sv_en
let lexicon = M.unionWith (++) lexicon1 lexicon2
missing <- missingWords `fmap` readPGF "../DictEngAbs.pgf"
-- saldo: every Swedish word form -> abstract functions producing it.
saldo <- dict2map `fmap` readPGF "../DictSweAbs.pgf"
let entries = map (translate lexicon eng saldo) missing
putStrLn $ "Looking for "++show (length missing)++" missing words."
writeFile target . concWrap $ map toGF entries
-- Histogram: candidate-count -> number of missing words with that count.
let histo = M.toList $ M.fromListWith (+) [(length ts,1)|(_,ts)<-entries]
putStrLn $ "Found "++show (sum [n|(a,n)<-histo,a>0])++" words."
mapM_ print histo
lang1 = head . languages
-- Finding functions by parsing is too slow. Linearize all functions instead.
-- | Invert a dictionary PGF: map every word form it can produce back to the
-- abstract functions producing it (lists merged by 'fromList').
dict2map :: PGF -> M.Map String [CId]
dict2map pgf = fromList [(w,[f])|f<-functions pgf,w<-lins pgf f]
-- | Default linearization of a zero-argument abstract function.
lin dict f = linearize dict (lang1 dict) (mkApp f [])
-- | All distinct, non-empty forms in f's full inflection table.
lins dict f = nub . filter (/="") . map snd . concat .
tabularLinearizes dict (lang1 dict) $ mkApp f []
-- | Abstract functions lacking a linearization in the concrete syntax.
missingWords pgf = missingLins pgf (lang1 pgf)
-- | Parse the English->Swedish Folkets XML dump into a map from an English
-- word to its Swedish translations (duplicates removed per key).  The input
-- comes from a network download, so 'fromJust' is replaced by errors that
-- say which stage of the parse failed instead of a bare pattern-match crash.
fromFolketsEnSv :: String -> M.Map String [String]
fromFolketsEnSv =
    fmap nub . fromList .
    map entry . findElements (unqual "word") . parseDoc
  where
    parseDoc = maybe (error "fromFolketsEnSv: malformed XML") id
             . parseXMLDoc
    entry w  = (value w,map value (findChildren (unqual "translation") w))
    value    = maybe (error "fromFolketsEnSv: element lacks value attribute") id
             . findAttr (unqual "value")
-- | Parse the Swedish->English dump, but keyed the same way as the
-- English->Swedish map: each English translation points back to the single
-- Swedish headword it translates.
fromFolketsSvEn :: String -> M.Map String [String]
fromFolketsSvEn raw =
    fmap nub . fromList $ concatMap entry wordElems
  where
    wordElems = findElements (unqual "word") (fromJust (parseXMLDoc raw))
    entry w   = [ (en,[sv])
                | let sv = value w
                , en <- map value (findChildren (unqual "translation") w) ]
    value     = fromJust . findAttr (unqual "value")
-- | Total lookup in a multimap: an absent key yields the empty list.
-- (The original 'maybe [] id . M.lookup m' hand-rolled M.findWithDefault
-- and named the key 'm', suggesting it was the map.)
lookup k = M.findWithDefault [] k
-- | Build a multimap, concatenating the value lists of duplicate keys.
fromList xs = M.fromListWith (++) xs
-- | For one missing English abstract function, collect the distinct SALDO
-- abstract functions of the same category reachable through any of its
-- Folkets translations.
translate lexicon eng saldo eabs = (eabs, nub candidates)
  where
    candidates = [ sabs
                 | sv   <- lookup (lin eng eabs) lexicon
                 , sabs <- lookup sv saldo
                 , sameCat eabs sabs ]
-- | Do two abstract function names end in the same category suffix
-- (the part after the last underscore, e.g. cat_N / hund_N)?
sameCat f1 f2 = catOf f1 == catOf f2
  where
    catOf = takeWhile (/= '_') . reverse . show
-- | Render one dictionary entry as GF source.  No candidates: a comment
-- naming the still-missing function.  Otherwise: a lin rule using the first
-- candidate, the candidate count, and the alternatives as aligned comments.
toGF (eabs, [])            = "--" ++ show eabs ++ "\n"
toGF (eabs, best : others) =
    " " ++ en ++ " = " ++ show best ++ ";"
    ++ " -- " ++ show (1 + length others)
    ++ "\n"
    ++ unlines [pad ++ "--" ++ show alt | alt <- {-take 10-} others]
  where
    en  = show eabs
    pad = replicate (length en + 3) ' '
-- | Wrap the generated entry lines in a complete GF concrete module.
concWrap entries = unlines header ++ concat entries ++ unlines footer

-- | Opening lines of the generated module, up to and including "lin".
header =
  [ "concrete " ++ name ++ " of DictEngAbs = CatSwe ** open DictSwe in {"
  , ""
  , "flags coding=utf8 ;"
  , ""
  , "lin" ]

-- | Closing line of the module.
footer = ["}"]

-- | An empty but compilable module: the header minus the trailing "lin",
-- plus the closing brace.  'prepare' writes this first as a fallback.
dummy = unlines (init header ++ footer)

View File

@@ -0,0 +1,7 @@
# Build FolketsLexikon.gf by compiling and running the import script.
# The RTS flags match the memory requirements noted in ImportFolketsLexikon.hs.
FolketsLexikon.gf:
	ghc --make -O ImportFolketsLexikon.hs -rtsopts
	./ImportFolketsLexikon +RTS -K32M -M1500M

# Declare clean phony so a stray file named "clean" cannot shadow the target.
.PHONY: clean
# Remove build products, downloaded XML dumps and the generated lexicon.
clean:
	rm -rf *.o *.hi *.gfo *.xml FolketsLexikon.gf ImportFolketsLexikon