mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-23 11:42:49 -06:00
improved generation of BNC dictionaries ; now called TopDictionary for clarity
This commit is contained in:
@@ -1,27 +1,80 @@
|
||||
import qualified Data.Map
|
||||
import Data.List
|
||||
|
||||
-- Language codes for which concrete dictionaries are generated; each code is
-- appended to a file base name by gfFile (e.g. "Eng" gives "DictionaryEng.gf").
langs = ["Bul", "Chi", "Eng", "Fin", "Fre", "Ger", "Hin", "Ita", "Spa", "Swe"]
|
||||
|
||||
-- Regenerate all inspectable files: first the abstract syntax (createAbstract),
-- then one concrete syntax per language code in langs, in list order.
createAllConcretes =
  createAbstract >> mapM_ createConcrete langs
|
||||
|
||||
-- Write the abstract syntax todo/tmp/TopDictionary.gf.
--
-- Reads the whitespace-separated function names from "bnc-to-check.txt" and
-- emits one "fun <name> : <cat> ;" line per name, where <cat> is the part of
-- the name after its last underscore (second component of splitFun).
createAbstract = do
  funs <- fmap words (readFile "bnc-to-check.txt")   -- list of BNC funs
  let funDecl f = unwords ["fun", f, ":", snd (splitFun f), ";"]
      moduleLines = "abstract TopDictionary = Cat **{" : map funDecl funs ++ ["}"]
  -- print inspectable file, to todo/tmp/
  writeFile (gfFile "todo/tmp/TopDictionary" "") (unlines moduleLines)
|
||||
|
||||
-- Write the inspectable concrete syntax todo/tmp/TopDictionary<lang>.gf.
--
-- Inputs read:
--   * "bnc-to-check.txt"     : whitespace-separated BNC function names
--   * Dictionary<lang>.gf    : the current lexicon of the language
--
-- The header (everything before the first lin rule) is taken from the current
-- lexicon with every "Dict" renamed to "TopDict" (toTop), so that the module
-- name agrees with the abstract syntax written by createAbstract. For each BNC
-- function one "lin <fun> = <rule>" line is emitted, with <rule> chosen by
-- lookupFun (literal match first, then a commented suggestion from a related
-- category, else an empty "variants{}" placeholder).
--
-- Only the TopDictionary file is written; the earlier output
-- todo/tmp/Dictionary<lang>.gf is no longer produced.
createConcrete lang = do
  bnc  <- readFile "bnc-to-check.txt" >>= return . words           -- list of BNC funs
  dict <- readFile (gfFile "Dictionary" lang) >>= return . lines   -- current lang lexicon
  let header  = getHeader dict
      -- lin rules to a map: function name -> right-hand side of its rule
      dictmap = Data.Map.fromList [(f, unwords ws) | "lin":f:"=":ws <- map words dict]
      -- current lang for BNC
      bncdict = [(f, lookupFun f dictmap) | f <- bnc]
  -- print inspectable file, to todo/tmp/
  writeFile (gfFile "todo/tmp/TopDictionary" lang) $
    unlines $ toTop header ++ [unwords ["lin", f, "=", ws] | (f, ws) <- bncdict] ++ ["}"]
|
||||
|
||||
-- Build a GF file name: base name, then language code, then the ".gf"
-- extension. An empty language code yields "<body>.gf".
gfFile body lang = concat [body, lang, ".gf"]
|
||||
|
||||
-- Merge manually checked entries back into the full lexicon of a language.
--
-- Inputs read:
--   * Dictionary<lang>.gf          : the old, complete lexicon
--   * todo/TopDictionary<lang>.gf  : corrected and new words
--
-- Output written: tmp/Dictionary<lang>.gf, consisting of the header of the
-- corrected file with "TopDict" renamed back to "Dict" (fromTop), followed by
-- one lin rule per function (as listed by Data.Map.assocs) and a closing "}".
--
-- Rules from the corrected file replace rules of the same function in the old
-- lexicon; everything from a standalone "--" word onwards is dropped from a
-- corrected rule. If the corrected file lists a function more than once, the
-- first occurrence wins (the foldr inserts it last).
mergeDict lang = do
  old <- readFile (gfFile "Dictionary" lang) >>= return . lines          -- read old lexicon
  new <- readFile (gfFile "todo/TopDictionary" lang) >>= return . lines  -- read corrected and new words
  let header  = getHeader new
      oldmap  = Data.Map.fromList [(f, unwords ws) | "lin":f:"=":ws <- map words old]
      -- drop comments from corrected words
      newlist = [(f, unwords (takeWhile (/= "--") ws)) | "lin":f:"=":ws <- map words new]
      -- insert corrected words
      newmap  = foldr (uncurry Data.Map.insert) oldmap newlist
  -- print revised file to tmp/
  writeFile (gfFile "tmp/Dictionary" lang) $
    unlines $ fromTop header ++ [unwords ["lin", f, "=", ws] | (f, ws) <- Data.Map.assocs newmap] ++ ["}"]
|
||||
|
||||
-- Return the leading lines of a dictionary file, up to but not including the
-- first line whose first three characters are "lin" (i.e. the first lin rule).
-- If no such line exists, all lines are returned.
getHeader dictLines = fst (break startsWithLin dictLines)
  where
    startsWithLin l = take 3 l == "lin"
|
||||
|
||||
-- Rename every occurrence of "Dict" to "TopDict" in each of the given lines
-- (used on a Dictionary header to obtain a TopDictionary header). All other
-- characters are copied unchanged.
toTop headerLines = [rename l | l <- headerLines]
  where
    rename ('D':'i':'c':'t':rest) = "TopDict" ++ rename rest
    rename (ch:rest)              = ch : rename rest
    rename []                     = []
|
||||
|
||||
-- Rename every occurrence of "TopDict" back to "Dict" in each of the given
-- lines (the inverse renaming of toTop). All other characters are copied
-- unchanged.
fromTop headerLines = [rename l | l <- headerLines]
  where
    rename ('T':'o':'p':'D':'i':'c':'t':rest) = "Dict" ++ rename rest
    rename (ch:rest)                          = ch : rename rest
    rename []                                 = []
|
||||
|
||||
-- Choose the right-hand side to emit for function f, given the map from
-- function names to the right-hand sides of their lin rules.
--
--   1. If f itself is in the map, its rule is returned unchanged.
--   2. Otherwise the related functions listed by subCats f are tried in
--      priority order. The first one that has a rule whose first word is not
--      "variants" or "variants{}" is returned as a comment after an empty
--      "variants{};" (it cannot be returned as such, as it is probably type
--      incorrect for f's category).
--   3. Otherwise an empty "variants{} ; -- " placeholder is returned.
--
-- A rule with an empty right-hand side is treated as unusable in step 2
-- (previously it crashed on `head []`).
lookupFun f dictmap = case look f of
  Just rule -> rule
  _ -> case [r | Just r <- map look (subCats f), usable r] of
    rule:_ -> "variants{}; -- " ++ rule
    _      -> "variants{} ; -- "
 where
  look = flip Data.Map.lookup dictmap

  -- a rule is worth suggesting only if it has content other than empty variants
  usable r = case words r of
    w:_ -> w `notElem` ["variants","variants{}"]
    []  -> False
|
||||
|
||||
-- List the alternative function names to try when f itself has no lin rule:
-- the same stem (everything up to and including the last underscore, as
-- returned by splitFun) combined with related categories, in priority order.
--
-- Every verb category falls back on the ten other verb categories; Adv and AdV
-- fall back on each other and on Prep; any other category has no alternatives.
--
-- The "VV" and "V2V" lists previously contained their own category and
-- duplicates ("V2 V2" instead of "V2 V3"; "VV"/"V2V" repeated where the
-- parallel "VA"/"V2A" lists have the remaining categories), so V3, VA and V2A
-- were never tried for them.
subCats f = case splitFun f of
  (fun,cat) ->
    let alts cats = [fun ++ c | c <- words cats]
    in case cat of
      "V"   -> alts "V2 V3 VS VQ VA VV V2S V2Q V2A V2V"
      "V2"  -> alts "V3 V2S V2Q V2A V2V V VS VQ VA VV"
      "VS"  -> alts "VQ V2S V2Q V2 V V2A V2V V3 VA VV"
      "VQ"  -> alts "VS V2Q V2S V2 V V2A V2V V3 VA VV"
      "VA"  -> alts "V V2A V2 V3 VS VQ VV V2S V2Q V2V"
      "VV"  -> alts "V2V V V2 V3 VS VQ VA V2S V2Q V2A"
      "V3"  -> alts "V2 V2S V2Q V2A V2V V VS VQ VA VV"
      "V2S" -> alts "VS VQ V2Q V2A V2V V2 V3 VA V VV"
      "V2Q" -> alts "VQ VS V2S V2A V2V V2 V3 VA V VV"
      "V2A" -> alts "VA V2 V3 V VS VQ VV V2S V2Q V2V"
      "V2V" -> alts "VV V2 V3 V VS VQ VA V2S V2Q V2A"
      "Adv" -> alts "AdV Prep"
      "AdV" -> alts "Adv Prep"
      _     -> []
|
||||
|
||||
-- Split a function name at its last underscore into (stem, category).
-- The stem keeps the trailing underscore, so stem ++ category == f.
-- A name without any underscore yields ("", f).
splitFun f =
  case break (== '_') (reverse f) of
    (revCat, revStem) -> (reverse revStem, reverse revCat)
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,8 +1,12 @@
|
||||
abstract TopDict = Cat ** {
|
||||
abstract TopDictionary = Cat **{
|
||||
fun of_Prep : Prep ;
|
||||
fun and_Conj : Conj ;
|
||||
fun in_Prep : Prep ;
|
||||
fun have_VV : VV ;
|
||||
fun have_VS : VS ;
|
||||
fun have_V2V : V2V ;
|
||||
fun have_V2 : V2 ;
|
||||
fun have_V : V ;
|
||||
fun it_Pron : Pron ;
|
||||
fun to_Prep : Prep ;
|
||||
fun for_Prep : Prep ;
|
||||
@@ -7846,4 +7850,4 @@ fun wildly_Adv : Adv ;
|
||||
fun reformer_N : N ;
|
||||
fun quantum_N : N ;
|
||||
fun considering_Subj : Subj ;
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
7857
lib/src/translator/todo/TopDictionaryEng.gf
Normal file
7857
lib/src/translator/todo/TopDictionaryEng.gf
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
7862
lib/src/translator/todo/TopDictionarySpa.gf
Normal file
7862
lib/src/translator/todo/TopDictionarySpa.gf
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -14,11 +14,11 @@
|
||||
<H2>Call for contributions: the generic translation dictionaries of GF</H2>
|
||||
|
||||
<P>
|
||||
<B>Wanted</B>: manual checking of TopDict???.gf files in
|
||||
<B>Wanted</B>: manual checking of TopDictionary???.gf files in
|
||||
<A HREF="http://www.grammaticalframework.org/lib/src/translator/todo">this directory</A>.
|
||||
</P>
|
||||
<P>
|
||||
<B>Abstract syntax</B>: <A HREF="./TopDict.gf">TopDict</A>, the top-7000 English words from British National Corpus, as sorted by frequency
|
||||
<B>Abstract syntax</B>: <A HREF="./TopDictionary.gf">TopDictionary</A>, the top-7000 English words from British National Corpus, as sorted by frequency
|
||||
<A HREF="http://www.kilgarriff.co.uk/BNClists/lemma.num">here</A>.
|
||||
</P>
|
||||
<P>
|
||||
@@ -63,6 +63,9 @@ Follow these steps for your language. For instance, ToCheckFre.gf, with Fre subs
|
||||
A reasonable batch of revisions is 500 words or more, which should be doable in less than 2 hours. To avoid conflicts and overlapping work,
|
||||
don't spend more than one day on a batch of work.
|
||||
</P>
|
||||
<P>
|
||||
The already split senses are explained <A HREF="../senses-in-Dictionary.txt">here</A>.
|
||||
</P>
|
||||
|
||||
<H2>Guidelines</H2>
|
||||
|
||||
|
||||
@@ -4,10 +4,10 @@ April 2014
|
||||
|
||||
==Call for contributions: the generic translation dictionaries of GF==
|
||||
|
||||
**Wanted**: manual checking of TopDict???.gf files in
|
||||
**Wanted**: manual checking of TopDictionary???.gf files in
|
||||
[this directory http://www.grammaticalframework.org/lib/src/translator/todo].
|
||||
|
||||
**Abstract syntax**: [TopDict ./TopDict.gf], the top-7000 English words from British National Corpus, as sorted by frequency
|
||||
**Abstract syntax**: [TopDictionary ./TopDictionary.gf], the top-7000 English words from British National Corpus, as sorted by frequency
|
||||
[here http://www.kilgarriff.co.uk/BNClists/lemma.num].
|
||||
|
||||
**Usage**: part of the general translation dictionaries, used for instance in the
|
||||
@@ -41,7 +41,7 @@ Follow these steps for your language. For instance, ToCheckFre.gf, with Fre subs
|
||||
A reasonable batch of revisions is 500 words or more, which should be doable in less than 2 hours. To avoid conflicts and overlapping work,
|
||||
don't spend more than one day on a batch of work.
|
||||
|
||||
|
||||
The already split senses are explained [here ../senses-in-Dictionary.txt].
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user