a minimal-lexicon version of resource

This commit is contained in:
aarne
2006-03-02 17:20:46 +00:00
parent bf872d018e
commit 27467d32fd
13 changed files with 357 additions and 0 deletions

View File

@@ -0,0 +1,14 @@
abstract MinLexicon = Cat ** {
fun
add_V3 : V3 ;
apple_N : N ;
become_VA : VA ;
big_A : A ;
break_V2 : V2 ;
brother_N2 : N2 ;
paint_V2A : V2A ;
-- paris_PN : PN ;
sleep_V : V ;
wonder_VQ : VQ ;
}

View File

@@ -0,0 +1,2 @@
concrete MinLexiconEng of MinLexicon = MinLexiconI with
(Lexicon = LexiconEng) ;

View File

@@ -0,0 +1,2 @@
concrete MinLexiconFre of MinLexicon = MinLexiconI with
(Lexicon = LexiconFre) ;

View File

@@ -0,0 +1,17 @@
incomplete concrete MinLexiconI of MinLexicon = Cat ** open Lexicon in {
lin
add_V3 = add_V3 ;
apple_N = apple_N ;
become_VA = become_VA ;
big_A = big_A ;
break_V2 = break_V2 ;
brother_N2 = brother_N2 ;
paint_V2A = paint_V2A ;
-- paris_PN = paris_PN ;
say_VS = say_VS ;
sleep_V = sleep_V ;
wonder_VQ = wonder_VQ ;
} ;

View File

@@ -0,0 +1,29 @@
abstract MinStructural = Cat ** {
fun
-- This is an alphabetical list of structural words: just one from each category
-- but so that all variation is created, e.g. all persons for pronouns.
almost_AdN : AdN ;
always_AdV : AdV ;
and_Conj : Conj ;
but_PConj : PConj ;
can_VV : VV ;
either7or_DConj : DConj ;
he_Pron : Pron ;
here_Adv : Adv ;
how_IAdv : IAdv ;
i_Pron : Pron ;
if_Subj : Subj ;
in_Prep : Prep ;
that_NP : NP ;
we_Pron : Pron ;
whichPl_IDet : IDet ;
whichSg_IDet : IDet ;
whoPl_IP : IP ;
whoSg_IP : IP ;
youSg_Pron : Pron ;
youPl_Pron : Pron ;
}

View File

@@ -0,0 +1,2 @@
concrete MinStructuralEng of MinStructural = MinStructuralI with
(Structural = StructuralEng) ;

View File

@@ -0,0 +1,2 @@
concrete MinStructuralFre of MinStructural = MinStructuralI with
(Structural = StructuralFre) ;

View File

@@ -0,0 +1,23 @@
incomplete concrete MinStructuralI of MinStructural = open Structural in {
lin
almost_AdN = almost_AdN ;
always_AdV = always_AdV ;
and_Conj = and_Conj ;
but_PConj = but_PConj ;
can_VV = can_VV ;
either7or_DConj = either7or_DConj ;
he_Pron = he_Pron ;
here_Adv = here_Adv ;
how_IAdv = how_IAdv ;
i_Pron = i_Pron ;
if_Subj = if_Subj ;
in_Prep = in_Prep ;
that_NP = that_NP ;
we_Pron = we_Pron ;
whichPl_IDet = whichPl_IDet ;
whichSg_IDet = whichSg_IDet ;
whoPl_IP = whoPl_IP ;
whoSg_IP = whoSg_IP ;
youSg_Pron = youSg_Pron ;
youPl_Pron = youPl_Pron ;
} ;

View File

@@ -0,0 +1,21 @@
--1 Minimal Resource Grammar
-- This module defines the syntax with a minimal lexicon.
-- This is useful for treebank generation.
abstract Minimal =
Noun,
Verb,
Adjective,
Adverb,
Numeral,
Sentence,
Question,
Relative,
Conjunction,
Phrase,
Text,
MinStructural,
Idiom,
MinLexicon
** {} ;

View File

@@ -0,0 +1,22 @@
--# -path=.:../english:../abstract:../common:prelude
concrete MinimalEng of Minimal =
NounEng,
VerbEng,
AdjectiveEng,
AdverbEng,
NumeralEng,
SentenceEng,
QuestionEng,
RelativeEng,
ConjunctionEng,
PhraseEng,
TextX,
StructuralEng,
IdiomEng,
LexiconEng
** {
flags startcat = Phr ; -- unlexer = text ; lexer = text ;
} ;

View File

@@ -0,0 +1,22 @@
--# -path=.:../french:../romance:../abstract:../common:prelude
concrete MinimalFre of Minimal =
NounFre,
VerbFre,
AdjectiveFre,
AdverbFre,
NumeralFre,
SentenceFre,
QuestionFre,
RelativeFre,
ConjunctionFre,
PhraseFre,
TextX,
StructuralFre,
IdiomFre,
LexiconFre
** {
flags startcat = Phr ; -- unlexer = text ; lexer = text ;
} ;

View File

@@ -0,0 +1,143 @@
module MkLexicon where
import Char
allLines o f = do
s <- readFile f
mapM_ (putStrLn . o) (filter noComm (lines s))
-- discard comments and empty lines
noComm s = case s of
'-':'-':_ -> False
"" -> False
_ -> True
-- remove tailing comments
remTail s = case s of
'-':'-':_ -> []
c:cs -> c : remTail cs
_ -> s
-- postfix with category
postfix p s = takeWhile (not . isSpace) s ++ "_" ++ p
-- make fun rule
mkFun s =
let (w,p) = span (/='_') s in
" " ++ s ++ " : " ++ tail p ++ " ;"
-- make reused lin rule
mkRLin s =
let w = head (words s) in
" " ++ w ++ " = " ++ w ++ " ;"
-- make regular lin rule
mkLin s =
let (w,p) = span (/='_') s in
" " ++ s ++ " = " ++ lin (tail p) w ++ " ;"
where
lin cat w = case cat of
"V2" -> "dirV2 (regV" ++ " \"" ++ w ++ "\")"
'V':_ -> "mk" ++ cat ++ " (regV" ++ " \"" ++ w ++ "\")"
_ -> "reg" ++ cat ++ " \"" ++ w ++ "\""
-- normalize identifiers in Structural
mkIdent s = case words s of
w:ws -> if obsolete w then ""
else " " ++ (unwords $ mkId (update w) : ws)
_ -> s
where
mkId name@(c:cs) =
let
(x,y) = span isCat cs
in
toLower c : clean x ++ "_" ++ new y
isCat = flip notElem "PDNVCAIS"
clean x = case span isLower x of
(_,[]) -> x
(u,v) -> u ++ "8" ++ map toLower v
new y = case y of
"NumDet" -> "NDet"
_ -> y
obsolete w = elem w $ words "TheseNumNP ThoseNumNP NobodyNP NeitherNor NoDet AnyDet"
update w = case w of
"EitherOr" -> "EitherOrConjD"
"BothAnd" -> "BothAndConjD"
"PhrYes" -> "YesPhr"
"PhrNo" -> "NoPhr"
"WeNumNP" -> "WeNP"
"YeNumNP" -> "YeNP"
"HowManyDet" -> "HowManyIDet"
"MostsDet" -> "MostManyDet"
"WhichDet" -> "WhichOneIDet"
"WhichNDet" -> "WhichManyIDet"
"EverywhereNP" -> "EverywhereAdv"
"SomewhereNP" -> "SomewhereAdv"
"AgentPrep" -> "By8agentPrep"
_ -> w
-- massage French verbs 9/2/2005
freVerb s = case words s of
v:_ -> " " ++ v ++ " : " ++ cat v ++ " ;"
_ -> []
where
cat v = dropWhile (not . isUpper) v
-- Swedish verbs 17/2
sweVerb s = case words s of
('v':a:u:[]):verb:_ -> "fun " ++ verb ++ " : V ;\n" ++
"lin " ++ verb ++ " = " ++ infl a u verb ++ " ;"
_ -> []
where
infl a u verb =
let
(dne,geb) = span isConsonant $ tail $ reverse verb
(beg,voc,end) = (reverse (tail geb), head geb, reverse dne)
(pret,sup) = (beg++ [toLower a] ++end, beg++ [toLower u] ++ end ++"it")
in
unwords ["irregV", prQuot verb, prQuot pret, prQuot sup]
prQuot s = "\"" ++ s ++ "\""
isConsonant = not . isVowel
isVowel = flip elem "aeiouyäöå"
-- Norwegian 13/3
groupLines :: [String] -> [String]
groupLines ss = [unwords [a, b, c] | [a,_,b,c,_] <- grps ss] where
grps ls = let (g,rest) = splitAt 5 ls in g:grps rest
lin2fun s = case words s of
_:fun:_:_ -> " fun " ++ fun ++ " : " ++ cat fun ++ " ;"
_ -> s
where
cat fun = reverse (takeWhile (/='_') (reverse fun))
-- filter from a given file those lines whose first word is in a sought-set
allThose :: [String] -> [String] -> [String]
allThose soughts givens = concatMap seek soughts where
seek w = let s = [line | line <- givens, w':_ <- [words line], w == w']
in if null s then ["-- " ++ w] else s
-- do this with files
-- example: getAllThose "abstract/Mtmp" "english/BasicEng.gf"
getAllThose :: FilePath -> FilePath -> IO ()
getAllThose sought given = do
s <- readFile sought
gi <- readFile given
let so = [w | l <- lines s, w:_ <- [words l]]
mapM_ putStrLn $ allThose so $ lines gi

View File

@@ -0,0 +1,58 @@
module Tab where
import Data.Map
type Tree = String ----
type Bracket = [String] ---
type Tab = Map [String] [Tree]
-- build a Tab from a unilingual treebank file (not XML)
test file = do
tab <- tb2tab file
tst tab
where
tst tab = do
s <- getLine
-- let ts = analyse tab $ words s
let ts = Prelude.map unwords $ analyseParts tab $ words s
mapM_ putStrLn ts
tst tab
-- analyse whole inputs
analyse :: Tab -> [String] -> [Tree]
analyse tab s = maybe [] id $ Data.Map.lookup s tab
-- analyse parts of inputs
analyseParts :: Tab -> [String] -> [Bracket]
analyseParts tab = anap where
anap ws = case ws of
w:vs -> case results ws of
[] -> [w:res | res <- anap vs]
(ts,ws2):_ -> [t:res | t <- ts, res <- anap ws2]
_ -> [[]]
results ws = Prelude.filter (not . Prelude.null . fst)
[(ana ws1,ws2) | i <- [0..length ws], let (ws1,ws2) = splitAt i ws]
ana = analyse tab
tb2tab :: FilePath -> IO Tab
tb2tab file = do
ss <- readFile file >>= return . lines
let ps = pairs ss
return $ fromListWith (++) ps
pairs :: [String] -> [([String],[String])]
pairs xs = case xs of
x:y:ys -> (words y,[x]) : pairs ys
_ -> []
{-
ceci - that_NP
-}