forked from GitHub/gf-core
improved generation of BNC dictionaries ; now called TopDictionary for clarity
This commit is contained in:
@@ -1,27 +1,80 @@
|
||||
import qualified Data.Map
|
||||
import Data.List
|
||||
|
||||
langs = words "Bul Chi Eng Fin Fre Ger Hin Ita Spa Swe"
|
||||
|
||||
createAllConcretes = do
|
||||
createAbstract
|
||||
mapM_ createConcrete langs
|
||||
|
||||
createAbstract = do
|
||||
bnc <- readFile "bnc-to-check.txt" >>= return . words -- list of BNC funs
|
||||
writeFile (gfFile "todo/tmp/TopDictionary" "") $
|
||||
unlines $ ["abstract TopDictionary = Cat **{"] ++
|
||||
[unwords ("fun":f:":": snd (splitFun f) :[";"]) | f <- bnc] ++ ["}"] -- print inspectable file, to todo/tmp/
|
||||
|
||||
createConcrete lang = do
|
||||
bnc <- readFile "bnc-to-check.txt" >>= return . words -- list of BNC funs
|
||||
dict <- readFile (gfFile "Dictionary" lang) >>= return . lines -- current lang lexicon
|
||||
let header = getHeader dict
|
||||
let dictmap = Data.Map.fromList [(f,unwords ws) | "lin":f:"=":ws <- map words dict]
|
||||
let bncdict = [(f,maybe "variants{} ;" id $ Data.Map.lookup f dictmap) | f <- bnc] -- current lang for BNC
|
||||
writeFile (gfFile "todo/tmp/Dictionary" lang) $
|
||||
unlines $ header ++ [unwords ("lin":f:"=":[ws]) | (f,ws) <- bncdict] ++ ["}"] -- print inspectable file, to todo/tmp/
|
||||
let dictmap = Data.Map.fromList [(f,unwords ws) | "lin":f:"=":ws <- map words dict] -- lin rules to a map
|
||||
let bncdict = [(f,lookupFun f dictmap) | f <- bnc] -- current lang for BNC
|
||||
writeFile (gfFile "todo/tmp/TopDictionary" lang) $
|
||||
unlines $ toTop header ++ [unwords ("lin":f:"=":[ws]) | (f,ws) <- bncdict] ++ ["}"] -- print inspectable file, to todo/tmp/
|
||||
|
||||
gfFile body lang = body ++ lang ++ ".gf"
|
||||
|
||||
mergeDict lang = do
|
||||
old <- readFile (gfFile "Dictionary" lang) >>= return . lines -- read old lexicon
|
||||
new <- readFile (gfFile "todo/Dictionary" lang) >>= return . lines -- read corrected and new words
|
||||
new <- readFile (gfFile "todo/TopDictionary" lang) >>= return . lines -- read corrected and new words
|
||||
let header = getHeader new
|
||||
let oldmap = Data.Map.fromList [(f,unwords ws) | "lin":f:"=":ws <- map words old]
|
||||
let newlist = [(f,unwords (takeWhile (/= "--") ws)) | "lin":f:"=":ws <- map words new] -- drop comments from corrected words
|
||||
let newmap = foldr (uncurry Data.Map.insert) oldmap newlist -- insert corrected words
|
||||
writeFile (gfFile "tmp/Dictionary" lang) $
|
||||
unlines $ header ++ [unwords ("lin":f:"=":[ws]) | (f,ws) <- Data.Map.assocs newmap] ++ ["}"] -- print revised file to tmp/
|
||||
unlines $ fromTop header ++ [unwords ("lin":f:"=":[ws]) | (f,ws) <- Data.Map.assocs newmap] ++ ["}"] -- print revised file to tmp/
|
||||
|
||||
-- get the part of Dict before the first lin rule
|
||||
getHeader = takeWhile ((/= "lin") . take 3)
|
||||
|
||||
toTop = map tt where
|
||||
tt s = case s of
|
||||
'D':'i':'c':'t':cs -> "TopDict" ++ tt cs
|
||||
c:cs -> c : tt cs
|
||||
_ -> s
|
||||
|
||||
fromTop = map tt where
|
||||
tt s = case s of
|
||||
'T':'o':'p':'D':'i':'c':'t':cs -> "Dict" ++ tt cs
|
||||
c:cs -> c : tt cs
|
||||
_ -> s
|
||||
|
||||
-- try to find lin rules by searching first literally, then subcategories in priority order
|
||||
|
||||
lookupFun f dictmap = case look f of
|
||||
Just rule -> rule
|
||||
_ -> case [r | Just r <- map look (subCats f), head (words r) `notElem` ["variants","variants{}"]] of
|
||||
rule:_ -> "variants{}; -- " ++ rule -- cannot return it as such, as probably type incorrect
|
||||
_ -> "variants{} ; -- "
|
||||
where
|
||||
look = flip Data.Map.lookup dictmap
|
||||
|
||||
subCats f = case splitFun f of
|
||||
(fun,cat) -> case cat of
|
||||
"V" -> [fun ++ c | c <- words "V2 V3 VS VQ VA VV V2S V2Q V2A V2V"]
|
||||
"V2" -> [fun ++ c | c <- words "V3 V2S V2Q V2A V2V V VS VQ VA VV"]
|
||||
"VS" -> [fun ++ c | c <- words "VQ V2S V2Q V2 V V2A V2V V3 VA VV"]
|
||||
"VQ" -> [fun ++ c | c <- words "VS V2Q V2S V2 V V2A V2V V3 VA VV"]
|
||||
"VA" -> [fun ++ c | c <- words "V V2A V2 V3 VS VQ VV V2S V2Q V2V"]
|
||||
"VV" -> [fun ++ c | c <- words "V2V V V2 V3 VS VQ VV V2S V2Q V2V"]
|
||||
"V3" -> [fun ++ c | c <- words "V2 V2S V2Q V2A V2V V VS VQ VA VV"]
|
||||
"V2S" -> [fun ++ c | c <- words "VS VQ V2Q V2A V2V V2 V3 VA V VV"]
|
||||
"V2Q" -> [fun ++ c | c <- words "VQ VS V2S V2A V2V V2 V3 VA V VV"]
|
||||
"V2A" -> [fun ++ c | c <- words "VA V2 V3 V VS VQ VV V2S V2Q V2V"]
|
||||
"V2V" -> [fun ++ c | c <- words "VV V2 V2 V VS VQ VV V2S V2Q V2V"]
|
||||
"Adv" -> [fun ++ c | c <- words "AdV Prep"]
|
||||
"AdV" -> [fun ++ c | c <- words "Adv Prep"]
|
||||
_ -> []
|
||||
|
||||
splitFun f = case span (/='_') (reverse f) of (tac,nuf) -> (reverse nuf, reverse tac)
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,8 +1,12 @@
|
||||
abstract TopDict = Cat ** {
|
||||
abstract TopDictionary = Cat **{
|
||||
fun of_Prep : Prep ;
|
||||
fun and_Conj : Conj ;
|
||||
fun in_Prep : Prep ;
|
||||
fun have_VV : VV ;
|
||||
fun have_VS : VS ;
|
||||
fun have_V2V : V2V ;
|
||||
fun have_V2 : V2 ;
|
||||
fun have_V : V ;
|
||||
fun it_Pron : Pron ;
|
||||
fun to_Prep : Prep ;
|
||||
fun for_Prep : Prep ;
|
||||
@@ -7846,4 +7850,4 @@ fun wildly_Adv : Adv ;
|
||||
fun reformer_N : N ;
|
||||
fun quantum_N : N ;
|
||||
fun considering_Subj : Subj ;
|
||||
}
|
||||
}
|
||||
+718
-698
File diff suppressed because it is too large
Load Diff
+855
-851
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
+775
-776
File diff suppressed because it is too large
Load Diff
+2724
-2722
File diff suppressed because it is too large
Load Diff
+793
-787
File diff suppressed because it is too large
Load Diff
+1192
-1184
File diff suppressed because it is too large
Load Diff
+2839
-2839
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
+452
-451
File diff suppressed because it is too large
Load Diff
@@ -14,11 +14,11 @@
|
||||
<H2>Call for contributions: the generic translation dictionaries of GF</H2>
|
||||
|
||||
<P>
|
||||
<B>Wanted</B>: manual checking of TopDict???.gf files in
|
||||
<B>Wanted</B>: manual checking of TopDictionary???.gf files in
|
||||
<A HREF="http://www.grammaticalframework.org/lib/src/translator/todo">this directory</A>.
|
||||
</P>
|
||||
<P>
|
||||
<B>Abstract syntax</B>: <A HREF="./TopDict.gf">TopDict</A>, the top-7000 English words from British National Corpus, as sorted by frequency
|
||||
<B>Abstract syntax</B>: <A HREF="./TopDictionary.gf">TopDictionary</A>, the top-7000 English words from British National Corpus, as sorted by frequency
|
||||
<A HREF="http://www.kilgarriff.co.uk/BNClists/lemma.num">here</A>.
|
||||
</P>
|
||||
<P>
|
||||
@@ -63,6 +63,9 @@ Follow these steps for your language. For instance, ToCheckFre.gf, with Fre subs
|
||||
A reasonable batch of revisions is 500 words or more, which should be doable in less than 2 hours. To avoid conflicts and overlapping work,
|
||||
don't spend more than one day on a batch of work.
|
||||
</P>
|
||||
<P>
|
||||
The already split senses are explained <A HREF="../senses-in-Dictionary.txt">here</A>.
|
||||
</P>
|
||||
|
||||
<H2>Guidelines</H2>
|
||||
|
||||
|
||||
@@ -4,10 +4,10 @@ April 2014
|
||||
|
||||
==Call for contributions: the generic translation dictionaries of GF==
|
||||
|
||||
**Wanted**: manual checking of TopDict???.gf files in
|
||||
**Wanted**: manual checking of TopDictionary???.gf files in
|
||||
[this directory http://www.grammaticalframework.org/lib/src/translator/todo].
|
||||
|
||||
**Abstract syntax**: [TopDict ./TopDict.gf], the top-7000 English words from British National Corpus, as sorted by frequency
|
||||
**Abstract syntax**: [TopDictionary ./TopDictionary.gf], the top-7000 English words from British National Corpus, as sorted by frequency
|
||||
[here http://www.kilgarriff.co.uk/BNClists/lemma.num].
|
||||
|
||||
**Usage**: part of the general translation dictionaries, used for instance in the
|
||||
@@ -41,7 +41,7 @@ Follow these steps for your language. For instance, ToCheckFre.gf, with Fre subs
|
||||
A reasonable batch of revisions is 500 words or more, which should be doable in less than 2 hours. To avoid conflicts and overlapping work,
|
||||
don't spend more than one day on a batch of work.
|
||||
|
||||
|
||||
The already split senses are explained [here ../senses-in-Dictionary.txt].
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user