mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-20 02:09:32 -06:00
Adding a script to import Folkets Lexikon
In order to compile DictEngSwe.gf, you must now first run this script to create FolketsLexikon.gf. cd lib/src/swedish/folketslexikon make This will automatically download and convert Folkets Lexikon. If the download fails, a dummy version of FolketsLexikon.gf will remain, allowing DictEngSwe.gf to be compiled anyway.
This commit is contained in:
@@ -1,9 +1,10 @@
|
||||
--# -path=.:alltenses
|
||||
--# -path=.:../english:folketslexikon
|
||||
|
||||
concrete DictEngSwe of DictEngAbs = CatSwe ** open ParadigmsSwe,
|
||||
concrete DictEngSwe of DictEngAbs = CatSwe, FolketsLexikon ** open
|
||||
(S = StructuralSwe),
|
||||
(L = LexiconSwe),
|
||||
MorphoSwe, ParadigmsSwe, Prelude in {
|
||||
(L = LexiconSwe)
|
||||
--, MorphoSwe, ParadigmsSwe, Prelude
|
||||
in {
|
||||
|
||||
flags coding=utf8 ;
|
||||
|
||||
|
||||
141
lib/src/swedish/folketslexikon/ImportFolketsLexikon.hs
Normal file
141
lib/src/swedish/folketslexikon/ImportFolketsLexikon.hs
Normal file
@@ -0,0 +1,141 @@
|
||||
-- A script for adding missing words to DictEngSwe by taking them from
|
||||
-- from DictSwe, guided by translations in Folkets Lexikon,
|
||||
-- http://folkets-lexikon.csc.kth.se/folkets/om.html
|
||||
|
||||
-- Run it from lib/src/swedish/folketslexikon with +RTS -K32M -M1500M
|
||||
|
||||
-- TODO: alo try Lexin
|
||||
|
||||
import Prelude hiding (lookup)
|
||||
import Data.List hiding (lookup)
|
||||
import System.Directory(doesFileExist)
|
||||
import System.Cmd(system)
|
||||
import System.Exit
|
||||
import Control.Monad(unless)
|
||||
import Data.Maybe(fromJust)
|
||||
import qualified Data.Map as M
|
||||
import Text.XML.Light
|
||||
import PGF
|
||||
|
||||
main = do prepare
|
||||
create
|
||||
|
||||
--- Source and target names ----------------------------------------------------
|
||||
|
||||
folkets_root = "http://folkets-lexikon.csc.kth.se/folkets/"
|
||||
folkets_en_sv = "folkets_en_sv_public.xml"
|
||||
folkets_sv_en = "folkets_sv_en_public.xml"
|
||||
|
||||
name = "FolketsLexikon"
|
||||
target = name++".gf"
|
||||
|
||||
--- Prepare --------------------------------------------------------------------
|
||||
|
||||
prepare =
|
||||
do writeFile target dummy
|
||||
download folkets_root folkets_en_sv
|
||||
download folkets_root folkets_sv_en
|
||||
makePGF pre "../../english/" "DictEngAbs.pgf" "DictEng.gf"
|
||||
gfmake engswe "../" "DictEngSwe.gf"
|
||||
makePGF swe "../" "DictSweAbs.pgf" "DictSwe.gf"
|
||||
where
|
||||
download url file = ifMissing file $ curl (url++file)
|
||||
makePGF imps dir pgf src = ifMissing (dir++pgf) (gfmake imps dir src)
|
||||
|
||||
curl url = shell $ ["curl -O",url]
|
||||
gfmake imps dir src = shell $ ["cd",dir,"&&","gf",opts,imp imps,src]
|
||||
|
||||
opts = "-s -make -no-pmcfg -gfo-dir ../../alltenses"
|
||||
imp dirs = unwords ["-i ../"++dir|dir<-dirs]
|
||||
pre = ["prelude","abstract","common"]
|
||||
swe = pre++["scandinavian"]
|
||||
engswe = swe++["english"]
|
||||
|
||||
ifMissing path cmd = do e <- doesFileExist path
|
||||
unless e cmd
|
||||
|
||||
shell ws = do let cmd = unwords ws
|
||||
putStrLn cmd
|
||||
e <- system cmd
|
||||
case e of
|
||||
ExitSuccess -> return ()
|
||||
_ -> fail "command failed"
|
||||
|
||||
--- Create ---------------------------------------------------------------------
|
||||
|
||||
create =
|
||||
do eng <- readPGF "../../english/DictEngAbs.pgf"
|
||||
lexicon1 <- fromFolketsEnSv `fmap` readFile folkets_en_sv
|
||||
lexicon2 <- fromFolketsSvEn `fmap` readFile folkets_sv_en
|
||||
let lexicon = M.unionWith (++) lexicon1 lexicon2
|
||||
missing <- missingWords `fmap` readPGF "../DictEngAbs.pgf"
|
||||
saldo <- dict2map `fmap` readPGF "../DictSweAbs.pgf"
|
||||
let entries = map (translate lexicon eng saldo) missing
|
||||
putStrLn $ "Looking for "++show (length missing)++" missing words."
|
||||
writeFile target . concWrap $ map toGF entries
|
||||
let histo = M.toList $ M.fromListWith (+) [(length ts,1)|(_,ts)<-entries]
|
||||
putStrLn $ "Found "++show (sum [n|(a,n)<-histo,a>0])++" words."
|
||||
mapM_ print histo
|
||||
|
||||
lang1 = head . languages
|
||||
|
||||
-- Finding functions by parsing is too slow. Linearize all functions instead.
|
||||
dict2map :: PGF -> M.Map String [CId]
|
||||
dict2map pgf = fromList [(w,[f])|f<-functions pgf,w<-lins pgf f]
|
||||
|
||||
lin dict f = linearize dict (lang1 dict) (mkApp f [])
|
||||
|
||||
lins dict f = nub . filter (/="") . map snd . concat .
|
||||
tabularLinearizes dict (lang1 dict) $ mkApp f []
|
||||
|
||||
missingWords pgf = missingLins pgf (lang1 pgf)
|
||||
|
||||
fromFolketsEnSv :: String -> M.Map String [String]
|
||||
fromFolketsEnSv =
|
||||
fmap nub . fromList .
|
||||
map entry . findElements (unqual "word") . fromJust . parseXMLDoc
|
||||
where
|
||||
entry w = (value w,map value (findChildren (unqual "translation") w))
|
||||
value = fromJust . findAttr (unqual "value")
|
||||
|
||||
|
||||
fromFolketsSvEn :: String -> M.Map String [String]
|
||||
fromFolketsSvEn =
|
||||
fmap nub . fromList .
|
||||
concatMap entry . findElements (unqual "word") . fromJust . parseXMLDoc
|
||||
where
|
||||
entry w = [(en,[sv])|let sv=value w,
|
||||
en<-map value (findChildren (unqual "translation") w)]
|
||||
value = fromJust . findAttr (unqual "value")
|
||||
|
||||
lookup m = maybe [] id . M.lookup m
|
||||
fromList xs = M.fromListWith (++) xs
|
||||
|
||||
translate lexicon eng saldo eabs =
|
||||
(eabs,nub [sabs|let en=lin eng eabs,
|
||||
sv<-lookup en lexicon,
|
||||
sabs<-lookup sv saldo,
|
||||
sameCat eabs sabs])
|
||||
|
||||
sameCat f1 f2 = suffix f1==suffix f2
|
||||
where suffix = takeWhile (/='_') . reverse . show
|
||||
|
||||
toGF (eabs,[]) = "--"++show eabs++"\n"
|
||||
toGF (eabs,s:ss) = " "++en++" = "++show s++";"
|
||||
++" -- "++show (1+length ss)
|
||||
++"\n"
|
||||
++unlines [indent++"--"++show s|s<-{-take 10-} ss]
|
||||
where
|
||||
en = show eabs
|
||||
indent = replicate (length en+3) ' '
|
||||
|
||||
concWrap ws = unlines header++concat ws++unlines footer
|
||||
|
||||
header = ["concrete "++name++" of DictEngAbs = CatSwe ** open DictSwe in {",
|
||||
"",
|
||||
"flags coding=utf8 ;",
|
||||
"",
|
||||
"lin"]
|
||||
footer = ["}"]
|
||||
|
||||
dummy = unlines (init header++footer)
|
||||
7
lib/src/swedish/folketslexikon/Makefile
Normal file
7
lib/src/swedish/folketslexikon/Makefile
Normal file
@@ -0,0 +1,7 @@
|
||||
|
||||
FolketsLexikon.gf:
|
||||
ghc --make -O ImportFolketsLexikon.hs -rtsopts
|
||||
./ImportFolketsLexikon +RTS -K32M -M1500M
|
||||
|
||||
clean:
|
||||
rm -rf *.o *.hi *.gfo *.xml FolketsLexikon.gf ImportFolketsLexikon
|
||||
Reference in New Issue
Block a user