mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-16 16:29:32 -06:00
script for building DictLat from William Whitaker's Words. Contains 35288 lemmas.
This commit is contained in:
60
lib/src/latin/dict/MkLatin.hs
Normal file
60
lib/src/latin/dict/MkLatin.hs
Normal file
@@ -0,0 +1,60 @@
|
||||
import Data.Char
|
||||
|
||||
main = mkLatin
|
||||
|
||||
mkLatin = do
|
||||
ws <- readFile "DICTPAGE.RAW" >>= return . lines
|
||||
let fcs = mkDict ws
|
||||
let fs = [unwords [status st ++ "fun",f,":",cat,";","--",eng] | [st, f,cat,_,eng] <- fcs]
|
||||
let ls = [unwords [status st ++ "lin",f,"=",lat,";"] | [st, f,_,lat,_] <- fcs]
|
||||
|
||||
writeFile "DictLatAbs.gf" "abstract DictLatAbs = Cat ** {"
|
||||
appendFile "DictLatAbs.gf" "\n-- extracted from http://archives.nd.edu/whitaker/dictpage.htm\n"
|
||||
appendFile "DictLatAbs.gf" (unlines fs)
|
||||
appendFile "DictLatAbs.gf" "}"
|
||||
|
||||
writeFile "DictLat.gf" "concrete DictLat of DictLatAbs = CatLat ** open ParadigmsLat in {"
|
||||
appendFile "DictLat.gf" "\n-- extracted from http://archives.nd.edu/whitaker/dictpage.htm\n"
|
||||
appendFile "DictLat.gf" (unlines ls)
|
||||
appendFile "DictLat.gf" "}"
|
||||
|
||||
--- putStrLn $ unlines ls
|
||||
|
||||
mkDict :: [String] -> [[String]] -- fun, cat, lat, eng
|
||||
mkDict = map mkOne . zip [10001 ..] . map cleanUp
|
||||
where
|
||||
cleanUp s = let (lat,eng) = break (=='[') s in
|
||||
(words (filter (\c -> c==' ' || isLetter c) lat), eng)
|
||||
mkOne (i,(lws,eng)) = addId i (mkLat lws) ++ [eng]
|
||||
mkLat lws = case lws of
|
||||
x:y:"N":_:g:_ -> f [x, "N", lin "mkN" [show x,show y, (snd (gender g))]] where f = if fst (gender g) then ok else todo
|
||||
x:y:"N": g:_ -> f [x, "N", lin "mkN" [show x,show y, (snd (gender g))]] where f = if fst (gender g) then ok else todo
|
||||
x:"gen":z:"ADJ":_ -> ok [x, "A", lin "mkA" [show x]]
|
||||
x:y:z:"ADJ":_ -> ok [x, "A", lin "mkA" [show x]]
|
||||
x:"ADV":_ -> ok [x, "Adv", lin "mkAdv" [show x]]
|
||||
x:y:z:u:"V":_:"INTRANS":_ -> okv [y, "V", lin "mkV" [show y,show x,show z,show u]]
|
||||
x:y:z:u:"V":_:"TRANS":_ -> okv [y, "V2", lin "mkV2" ["(" ++ lin "mkV" [show y,show x,show z,show u] ++ ")"]]
|
||||
x:y:z:u:"V":_:"DEP":_ -> todo [y, "V", lin "depV" ["(" ++ lin "mkV" [show y,show x,show z,show u] ++ ")"]]
|
||||
x:y:z:u:"V":_ -> okv [y, "V", lin "mkV" [show y,show x,show z,show u]]
|
||||
|
||||
_ -> todo ["TODO","",unwords lws]
|
||||
|
||||
addId i (st:f:c:rest) = st:(f ++ "_" ++ show i ++ "_" ++ c):c:rest
|
||||
|
||||
gender g = case g of
|
||||
"M" -> (True, "masculine")
|
||||
"F" -> (True, "feminine")
|
||||
"N" -> (True, "neuter")
|
||||
"C" -> (True, "masculine {-C-}")
|
||||
_ -> (False, g ++ "{-??-}")
|
||||
|
||||
fun x c = x ++ "_" ++ c
|
||||
lin f xs = unwords (f:xs)
|
||||
todo xs = "1":xs
|
||||
ok xs = "0":xs
|
||||
okv r@(f:_) = if elem (take 3 (reverse f)) ["era","ere","eri"] then ok r else todo r
|
||||
|
||||
status st = case st of
|
||||
"0" -> ""
|
||||
_ -> "-- "
|
||||
|
||||
33
lib/src/latin/dict/README
Normal file
33
lib/src/latin/dict/README
Normal file
@@ -0,0 +1,33 @@
|
||||
Aarne Ranta 2017-01-19
|
||||
|
||||
A script for building a Latin dictionary from William Whitaker's Words
|
||||
|
||||
Source: http://archives.nd.edu/whitaker/dictpage.zip
|
||||
|
||||
See also: https://en.wikipedia.org/wiki/William_Whitaker's_Words
|
||||
|
||||
License: BSD for the script. Unknown for the source, therefore not redistributed here.
|
||||
|
||||
Usage:
|
||||
|
||||
runghc MkLatin.hs
|
||||
|
||||
Output:
|
||||
|
||||
DictLatAbs.gf
|
||||
DictLat.gf
|
||||
|
||||
Uses Herbert Lange's RGL implementation of Latin (darcs version of 2017-01-19 with ParadigmsLat.mkAdv added by AR)
|
||||
|
||||
Coverage of first version: 90%; 3938 missing out of 39227 entries. Marked with leading "--"
|
||||
Reasons for uncovered entries
|
||||
- uncommon category (e.g. structural words)
|
||||
- missing case in ParadigmsLat.gf, e.g. deponent verbs
|
||||
|
||||
Matching for adjectives (A) uncertain, because we only use ParadigmsLat.gf.mkA : Str -> A
|
||||
|
||||
Verb subcat information is uncertain, too. It is based on the src's TRANS and INTRANS flags.
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user