Import Folkets Lexikon: report the number of words found

And some other minor tweaks.
This commit is contained in:
hallgren
2013-04-11 08:54:39 +00:00
parent 2d3e242ab3
commit 1ad1d427a4

View File

@@ -8,7 +8,7 @@
import Prelude hiding (lookup) import Prelude hiding (lookup)
import Data.List hiding (lookup) import Data.List hiding (lookup)
import System.Directory(doesFileExist) import System.Directory(doesFileExist,renameFile)
import System.Cmd(system) import System.Cmd(system)
import System.Exit import System.Exit
import Control.Monad(unless) import Control.Monad(unless)
@@ -42,7 +42,7 @@ prepare =
download url file = ifMissing file $ curl (url++file) download url file = ifMissing file $ curl (url++file)
makePGF imps dir pgf src = ifMissing (dir++pgf) (gfmake imps dir src) makePGF imps dir pgf src = ifMissing (dir++pgf) (gfmake imps dir src)
curl url = shell $ ["curl -O",url] curl url = shell $ ["curl -OL --compressed",url]
gfmake imps dir src = shell $ ["cd",dir,"&&","gf",opts,imp imps,src] gfmake imps dir src = shell $ ["cd",dir,"&&","gf",opts,imp imps,src]
opts = "-s -make -no-pmcfg -gfo-dir ../../alltenses" opts = "-s -make -no-pmcfg -gfo-dir ../../alltenses"
@@ -64,17 +64,19 @@ prepare =
--- Create --------------------------------------------------------------------- --- Create ---------------------------------------------------------------------
create = create =
do eng <- readPGF "../../english/DictEngAbs.pgf" do lexicon1 <- fromFolketsEnSv `fmap` readFile folkets_en_sv
lexicon1 <- fromFolketsEnSv `fmap` readFile folkets_en_sv
lexicon2 <- fromFolketsSvEn `fmap` readFile folkets_sv_en lexicon2 <- fromFolketsSvEn `fmap` readFile folkets_sv_en
let lexicon = M.unionWith (++) lexicon1 lexicon2 let lexicon = M.unionWith (++) lexicon1 lexicon2
missing <- missingWords `fmap` readPGF "../DictEngAbs.pgf" missing <- missingWords `fmap` readPGF "../DictEngAbs.pgf"
putStrLn $ "Looking for "++show (length missing)++" missing words."
eng <- readPGF "../../english/DictEngAbs.pgf"
saldo <- dict2map `fmap` readPGF "../DictSweAbs.pgf" saldo <- dict2map `fmap` readPGF "../DictSweAbs.pgf"
let entries = map (translate lexicon eng saldo) missing let entries = map (translate lexicon eng saldo) missing
putStrLn $ "Looking for "++show (length missing)++" missing words." histo = M.toList $ M.fromListWith (+) [(length ts,1)|(_,ts)<-entries]
writeFile target . concWrap $ map toGF entries tmp = target++".tmp"
let histo = M.toList $ M.fromListWith (+) [(length ts,1)|(_,ts)<-entries]
putStrLn $ "Found "++show (sum [n|(a,n)<-histo,a>0])++" words." putStrLn $ "Found "++show (sum [n|(a,n)<-histo,a>0])++" words."
writeFile tmp . concWrap $ map toGF entries
renameFile tmp target
mapM_ print histo mapM_ print histo
lang1 = head . languages lang1 = head . languages
@@ -92,24 +94,23 @@ missingWords pgf = missingLins pgf (lang1 pgf)
fromFolketsEnSv :: String -> M.Map String [String] fromFolketsEnSv :: String -> M.Map String [String]
fromFolketsEnSv = fromFolketsEnSv =
fmap nub . fromList . fromList .
map entry . findElements (unqual "word") . fromJust . parseXMLDoc map entry . findElements (unqual "word") . fromJust . parseXMLDoc
where where
entry w = (value w,map value (findChildren (unqual "translation") w)) entry w = (value w,map value (findChildren (unqual "translation") w))
value = fromJust . findAttr (unqual "value")
fromFolketsSvEn :: String -> M.Map String [String] fromFolketsSvEn :: String -> M.Map String [String]
fromFolketsSvEn = fromFolketsSvEn =
fmap nub . fromList . fromList .
concatMap entry . findElements (unqual "word") . fromJust . parseXMLDoc concatMap entry . findElements (unqual "word") . fromJust . parseXMLDoc
where where
entry w = [(en,[sv])|let sv=value w, entry w = [(en,[sv])|let sv=value w,
en<-map value (findChildren (unqual "translation") w)] en<-map value (findChildren (unqual "translation") w)]
value = fromJust . findAttr (unqual "value")
value = fromJust . findAttr (unqual "value")
lookup m = maybe [] id . M.lookup m lookup m = maybe [] id . M.lookup m
fromList xs = M.fromListWith (++) xs fromList xs = fmap nub $ M.fromListWith (++) xs
translate lexicon eng saldo eabs = translate lexicon eng saldo eabs =
(eabs,nub [sabs|let en=lin eng eabs, (eabs,nub [sabs|let en=lin eng eabs,