remove the obsolete parse grammar

This commit is contained in:
kr.angelov
2011-09-23 09:42:22 +00:00
parent 1df6197c57
commit 56630bcbb6
18 changed files with 0 additions and 216124 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +0,0 @@
--# -path=.:oald:alltenses
-- Top-level English concrete syntax for EnglishAbs: everything from ParseEng
-- plus the OALD closed-class lexicon, minus the prepositions and conjunctions
-- listed below.
-- NOTE(review): the excluded words are presumably the ones that would clash
-- with entries already provided through ParseEng -- confirm.
concrete English of EnglishAbs =
  ParseEng,
  OaldStructuralEng - [
    above_Prep, after_Prep, and_Conj, before_Prep, behind_Prep,
    between_Prep, during_Prep, except_Prep, for_Prep, from_Prep,
    in_Prep, on_Prep, or_Conj, through_Prep, to_Prep,
    under_Prep, with_Prep, without_Prep
  ] ;
-- OaldEng - [everywhere_Adv,have_V2,here_Adv,quite_Adv,somewhere_Adv,there_Adv] ;

View File

@@ -1,5 +0,0 @@
-- Top-level abstract syntax: everything from ParseEngAbs plus the OALD
-- closed-class lexicon, minus the prepositions and conjunctions listed below.
-- The exclusion list mirrors the one in the concrete module English.
abstract EnglishAbs =
  ParseEngAbs,
  OaldStructural - [
    above_Prep, after_Prep, and_Conj, before_Prep, behind_Prep,
    between_Prep, during_Prep, except_Prep, for_Prep, from_Prep,
    in_Prep, on_Prep, or_Conj, through_Prep, to_Prep,
    under_Prep, with_Prep, without_Prep
  ] ;
-- Oald - [everywhere_Adv,have_V2,here_Adv,quite_Adv,somewhere_Adv,there_Adv] ;

View File

@@ -1,15 +0,0 @@
-- Abstract syntax that simply collects the listed grammar modules into one
-- module; it declares no functions of its own.
abstract Parse =
  Noun, Verb, Adjective, Adverb, Numeral,
  Sentence, Question, Relative, Conjunction,
  Phrase, Text,
  Structural, Idiom, Tense ;

View File

@@ -1,167 +0,0 @@
--# -path=.:oald:alltenses
-- Concrete English syntax for ParseEngAbs.
-- It inherits the resource-grammar modules listed below, leaves a few of their
-- functions out, and re-defines those functions with `variants` so that
-- several surface forms are accepted for the same abstract function. It then
-- adds the English-specific syntactic and lexical functions declared in
-- ParseEngAbs.
-- In the inheritance list, `M - [f, g]` inherits M without f and g, while
-- `M [f, g]` inherits only f and g from M.
concrete ParseEng of ParseEngAbs =
NounEng,
-- ComplVS is left out here and re-defined below with an extra variant.
VerbEng - [ComplVS],
AdjectiveEng,
AdverbEng,
NumeralEng,
SentenceEng, --- - [UseCl, UseQCl, UseRCl],
QuestionEng,
-- IdRP and RelSlash are left out here and re-defined below.
RelativeEng - [IdRP, RelSlash],
ConjunctionEng,
PhraseEng, --- - [UttImpSg, UttImpPl],
-- Pol, PNeg and PPos are excluded from TextX and TenseX and taken from
-- GrammarEng only.
TextX - [Pol,PNeg,PPos],
TenseX - [Pol,PNeg,PPos],
GrammarEng [Pol,PNeg,PPos],
-- everybody_NP, every_Det, only_Predet and somebody_NP are re-defined below.
StructuralEng - [above_Prep, everywhere_Adv, everybody_NP, every_Det, only_Predet, somebody_NP],
IdiomEng,
-- StrandRelSlash is excluded from the inherited functions but is still
-- referenced inside the RelSlash override below.
ExtraEng - [
UncNegCl, UncNegQCl, UncNegRCl, UncNegImpSg, UncNegImpPl,
StrandRelSlash,
that_RP
],
-- Only these categories and one example word for each are taken from the
-- resource lexicon.
LexiconEng [N3, distance_N3,
A2, married_A2,
VQ, wonder_VQ,
V2A, paint_V2A,
V2Q, ask_V2Q,
V2V, beg_V2V,
V2S, answer_V2S,
VA, become_VA],
OaldEng - [everywhere_Adv, here_Adv, quite_Adv, somewhere_Adv, there_Adv,have_V2]
** open ParadigmsEng, ResEng, MorphoEng, NounEng, ParamX, Prelude in {
flags startcat = Phr ; unlexer = text ; lexer = text ;
--
-- * Overridden things from the common API
--
-- Allow both "hope that he runs" and "hope he runs".
lin ComplVS v s = variants { VerbEng.ComplVS v s; ComplBareVS v s } ;
-- The block below is disabled code, kept for reference only.
{-
--- this can now be done by just using ExtraEng.UncNeg : Pol
-- Allow both contracted and uncontracted negated clauses.
lin UseCl t p cl =
case p.p of {
Pos => SentenceEng.UseCl t p cl;
Neg => variants { SentenceEng.UseCl t p cl; UncNegCl t p cl }
} ;
lin UseQCl t p cl =
case p.p of {
Pos => SentenceEng.UseQCl t p cl;
Neg => variants { SentenceEng.UseQCl t p cl; UncNegQCl t p cl }
} ;
lin UseRCl t p cl =
case p.p of {
Pos => SentenceEng.UseRCl t p cl;
Neg => variants { SentenceEng.UseRCl t p cl; UncNegRCl t p cl }
} ;
lin UttImpSg p i =
case p.p of {
CPos => PhraseEng.UttImpSg p i;
CNeg _ => variants { PhraseEng.UttImpSg p i ; UncNegImpSg p i }
} ;
lin UttImpPl p i =
case p.p of {
CPos => PhraseEng.UttImpPl p i;
CNeg _ => variants { PhraseEng.UttImpPl p i ; UncNegImpPl p i }
} ;
-}
-- Two different forms of relative clauses:
-- Pied piping: "at which we are looking".
-- Stranding: "that he looks at"
-- EmptyRelSlash is not used here, since it would give
-- a meta-variable for the RP.
lin RelSlash rp slash = variants { RelativeEng.RelSlash rp slash; StrandRelSlash rp slash } ;
-- Allow both "who"/"which" and "that"
-- The table is matched top to bottom: the genitive is always "whose"; the
-- remaining rows list the accepted alternatives for neuter, accusative,
-- nominative and prepositional uses. Alternatives marked "incorrect but
-- common" are accepted on purpose.
lin IdRP =
{ s = table {
RC _ (NCase Gen) => "whose" ;
RC Neutr _ => variants { "which"; "that"; {- for dictionary entries with the wrong gender -} "who" } ;
RC _ NPAcc => variants { "whom"; "that"; {- incorrect but common -} "who" } ;
RC _ (NCase Nom) => variants { "who" ; "that" } ;
RPrep _ => variants { "which"; "whom"; {- incorrect but common -} "who" }
} ;
a = RNoAg
} ;
-- Structural words with two interchangeable spellings each.
lin everybody_NP = variants { regNP "everybody" singular; regNP "everyone" singular } ;
lin somebody_NP = variants { regNP "somebody" singular; regNP "someone" singular } ;
lin every_Det = variants { mkDeterminer singular "every"; mkDeterminer singular "each" };
lin only_Predet = variants { ss "only"; ss "just" };
--
-- English-specific additions
--
-- Syntactic additions
lin
-- Present participle of the verb placed before the noun ("running man");
-- the gender is taken from the noun.
VerbCN v cn = {s = \\n,c => v.s ! VPresPart ++ cn.s ! n ! c; g = cn.g};
-- Numeral, then "of", then the noun phrase ("ten of the dogs"); agreement is
-- third person with the number of the numeral.
NumOfNP num np = {
s = \\c => num.s ! Nom ++ "of" ++ np.s ! c ;
a = agrP3 num.n
} ;
-- Comparison with a noun phrase ("more wine than the professor"): first part
-- of the comparative adverb, the singular noun, second part of the adverb,
-- then the noun phrase in the nominative.
CAdvNP ad cn np = {
s = \\c => ad.s ++ cn.s ! Sg ! npcase2case c ++ ad.p ++ np.s ! npNom ;
a = agrP3 Sg
} ;
-- Same as CAdvNP but compared against a slash clause ("more wine than the
-- professor drank"); the clause's c2 field is appended after it.
CAdvSSlash ad cn slash = {
s = \\c => ad.s ++ cn.s ! Sg ! npcase2case c ++ ad.p ++ slash.s ++ slash.c2;
a = agrP3 Sg
} ;
-- CompCN cn = { s = \\a => let n = (fromAgr a).n
-- in IndefArt.s ! False ! n ++ cn.s ! n ! Acc} ;
-- Lexical additions
lin
a8few_Det = mkDeterminer plural ["a few"];
another_Predet = ss "another" ;
any_Predet = ss "any" ;
anybody_NP = variants { regNP "anybody" singular; regNP "anyone" singular };
anything_NP = regNP "anything" singular;
both_Det = mkDeterminer plural "both";
either_Det = mkDeterminer singular "either" ;
exactly_AdN = ss "exactly" ;
most_Det = mkDeterminer plural "most";
neither_Det = mkDeterminer singular "neither" ;
only_AdV = mkAdV "only" ;
-- "should" as an auxiliary-type verb-phrase-taking verb. The infinitive and
-- past participle slots are filled with "ought to"; the present participle
-- slot is an empty variant list, i.e. no form is given for it.
should_VV = {
s = table {
VVF VInf => ["ought to"] ;
VVF VPres => "should" ;
VVF VPPart => ["ought to"] ;
VVF VPresPart => variants {} ; -- FIXME: "shoulding" ?
VVF VPast => ["should have"] ;
VVPastNeg => ["shouldn't have"] ;
VVPresNeg => "shouldn't"
} ;
typ = VVAux
} ;
several_Det = mkDeterminer plural "several" ;
} ;

View File

@@ -1,48 +0,0 @@
-- Abstract syntax of the English parsing grammar: the common Parse modules,
-- selected parts of ExtraEngAbs and Lexicon, the OALD open-class lexicon, and
-- a handful of English-specific functions declared in the body.
abstract ParseEngAbs =
  Parse - [above_Prep],
  ExtraEngAbs - [
    UncNegCl, UncNegQCl, UncNegRCl, UncNegImpSg, UncNegImpPl,
    StrandRelSlash,
    that_RP
  ],
  Lexicon [
    N3,  distance_N3,
    A2,  married_A2,
    VQ,  wonder_VQ,
    V2A, paint_V2A,
    V2Q, ask_V2Q,
    V2V, beg_V2V,
    V2S, answer_V2S,
    VA,  become_VA
  ],
  Oald - [everywhere_Adv, here_Adv, quite_Adv, somewhere_Adv, there_Adv, have_V2]
  ** {

  -- Syntactic additions
  fun
    VerbCN     : V -> CN -> CN ;              -- running man
    NumOfNP    : Num -> NP -> NP ;            -- ten of the dogs
    CAdvNP     : CAdv -> CN -> NP -> NP ;     -- more wine than the professor
    CAdvSSlash : CAdv -> CN -> SSlash -> NP ; -- more wine than the professor drank
    --CompCN   : CN -> Comp ;                 -- "(every man is) a dog", "(all men are) dogs"

  -- Lexical additions
  fun
    a8few_Det      : Det ;
    another_Predet : Predet ;
    any_Predet     : Predet ;
    anybody_NP     : NP ;
    anything_NP    : NP ;
    both_Det       : Det ;
    either_Det     : Det ;
    exactly_AdN    : AdN ;
    most_Det       : Det ;
    neither_Det    : Det ;
    only_AdV       : AdV ;
    should_VV      : VV ;
    several_Det    : Det ;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,184 +0,0 @@
-- English lexicon for GF, produced from:
-- Oxford advanced learner's dictionary of current English:
-- expanded 'computer usable' version compiled by Roger Mitton
-- The computer usable version is transcribed from:
-- Oxford advanced learner's dictionary of current English
-- A.S. Hornby ; with the assistance of A.P. Cowie [and] J. Windsor Lewis.
-- 3rd. ed., London : Oxford University Press, 1974.
-- Distributed as 'dict0710' by:
-- Oxford Text Archive
-- Oxford University Computing Services
-- 13 Banbury Road
-- Oxford
-- OX2 6NN
-- Under these conditions:
-- Freely available for non-commercial use provided that this header is
-- included in its entirety with any copy distributed.
--
-- GF version generated by asc2gf, Bjorn Bringert Nov 2008
-- based on asc2lex, Matthew Purver Nov 2001
-- http://www.stanford.edu/~mpurver/software.html
abstract OaldStructural = Cat ** {
fun abaft_Prep : Prep;
fun aboard_Prep : Prep;
fun about_Prep : Prep;
fun above_Prep : Prep;
fun according_as_Conj : Conj;
fun according_to_Prep : Prep;
fun across_Prep : Prep;
fun afore_Prep : Prep;
fun after_Conj : Conj;
fun after_Prep : Prep;
fun against_Prep : Prep;
fun agin_Prep : Prep;
fun albeit_Conj : Conj;
fun along_Prep : Prep;
fun alongside_Prep : Prep;
fun although_Conj : Conj;
fun amid_Prep : Prep;
fun amidst_Prep : Prep;
fun among_Prep : Prep;
fun amongst_Prep : Prep;
fun an_Conj : Conj;
fun and_Conj : Conj;
fun anent_Prep : Prep;
fun around_Prep : Prep;
fun as_Conj : Conj;
fun aslant_Prep : Prep;
fun astride_Prep : Prep;
fun at_Prep : Prep;
fun athwart_Prep : Prep;
fun bar_Prep : Prep;
fun barring_Prep : Prep;
fun because_Conj : Conj;
fun before_Conj : Conj;
fun before_Prep : Prep;
fun behind_Prep : Prep;
fun below_Prep : Prep;
fun beneath_Prep : Prep;
fun beside_Prep : Prep;
fun besides_Prep : Prep;
fun between_Prep : Prep;
fun betwixt_Prep : Prep;
fun beyond_Prep : Prep;
fun but_Conj : Conj;
fun but_Prep : Prep;
fun by_Prep : Prep;
fun circa_Prep : Prep;
fun concerning_Prep : Prep;
fun considering_Prep : Prep;
fun cos_Conj : Conj;
fun despite_Prep : Prep;
fun directly_Conj : Conj;
fun down_Prep : Prep;
fun during_Prep : Prep;
fun either_Conj : Conj;
fun ere_Prep : Prep;
fun except_Conj : Conj;
fun except_Prep : Prep;
fun excepting_Prep : Prep;
fun failing_Prep : Prep;
fun for_Conj : Conj;
fun for_Prep : Prep;
fun forasmuch_as_Conj : Conj;
fun from_Prep : Prep;
fun howbeit_Conj : Conj;
fun if_Conj : Conj;
fun immediately_Conj : Conj;
fun in_Prep : Prep;
fun inside_Prep : Prep;
fun instantly_Conj : Conj;
fun into_Prep : Prep;
fun less_Prep : Prep;
fun lest_Conj : Conj;
fun like_Conj : Conj;
fun like_Prep : Prep;
fun likewise_Conj : Conj;
fun mid_Prep : Prep;
fun midst_Prep : Prep;
fun minus_Prep : Prep;
fun near_Prep : Prep;
fun neath_Prep : Prep;
fun neither_Conj : Conj;
fun nevertheless_Conj : Conj;
fun next_Prep : Prep;
fun nigh_Prep : Prep;
fun nigher_Prep : Prep;
fun nighest_Prep : Prep;
fun nisi_Conj : Conj;
fun nor_Conj : Conj;
fun notwithstanding_Conj : Conj;
fun notwithstanding_Prep : Prep;
fun now_Conj : Conj;
fun o'er_Prep : Prep;
fun of_Prep : Prep;
fun off_Prep : Prep;
fun on_Prep : Prep;
fun on_to_Prep : Prep;
fun only_Conj : Conj;
fun onto_Prep : Prep;
fun or_Conj : Conj;
fun otherwise_Conj : Conj;
fun outside_Prep : Prep;
fun over_Prep : Prep;
fun past_Prep : Prep;
fun pending_Prep : Prep;
fun per_Prep : Prep;
fun plus_Prep : Prep;
fun provided_Conj : Conj;
fun providing_Conj : Conj;
fun qua_Conj : Conj;
fun qua_Prep : Prep;
fun re_Prep : Prep;
fun respecting_Prep : Prep;
fun round_Prep : Prep;
fun sans_Prep : Prep;
fun save_Prep : Prep;
fun saving_Prep : Prep;
fun since_Conj : Conj;
fun since_Prep : Prep;
fun so_Conj : Conj;
fun supposing_Conj : Conj;
fun than_Conj : Conj;
fun that_Conj : Conj;
fun tho'_Conj : Conj;
fun though_Conj : Conj;
fun thro'_Prep : Prep;
fun through_Prep : Prep;
fun throughout_Prep : Prep;
fun thru_Prep : Prep;
fun till_Conj : Conj;
fun till_Prep : Prep;
fun to_Prep : Prep;
fun touching_Prep : Prep;
fun toward_Prep : Prep;
fun towards_Prep : Prep;
fun tween_Prep : Prep;
fun twixt_Prep : Prep;
fun under_Prep : Prep;
fun underneath_Prep : Prep;
fun unless_Conj : Conj;
fun unlike_Prep : Prep;
fun until_Conj : Conj;
fun until_Prep : Prep;
fun unto_Prep : Prep;
fun up_Prep : Prep;
fun upon_Prep : Prep;
fun versus_Prep : Prep;
fun via_Prep : Prep;
fun vice_Prep : Prep;
fun vis_à_vis_Prep : Prep;
fun wanting_Prep : Prep;
fun when_Conj : Conj;
fun whencesoever_Conj : Conj;
fun whenever_Conj : Conj;
fun whereas_Conj : Conj;
fun whether_Conj : Conj;
fun while_Conj : Conj;
fun whilst_Conj : Conj;
fun with_Prep : Prep;
fun within_Prep : Prep;
fun without_Prep : Prep;
fun yet_Conj : Conj;
}

View File

@@ -1,185 +0,0 @@
-- English lexicon for GF, produced from:
-- Oxford advanced learner's dictionary of current English:
-- expanded 'computer usable' version compiled by Roger Mitton
-- The computer usable version is transcribed from:
-- Oxford advanced learner's dictionary of current English
-- A.S. Hornby ; with the assistance of A.P. Cowie [and] J. Windsor Lewis.
-- 3rd. ed., London : Oxford University Press, 1974.
-- Distributed as 'dict0710' by:
-- Oxford Text Archive
-- Oxford University Computing Services
-- 13 Banbury Road
-- Oxford
-- OX2 6NN
-- Under these conditions:
-- Freely available for non-commercial use provided that this header is
-- included in its entirety with any copy distributed.
--
-- GF version generated by asc2gf, Bjorn Bringert Nov 2008
-- based on asc2lex, Matthew Purver Nov 2001
-- http://www.stanford.edu/~mpurver/software.html
--# -path=.:alltenses
concrete OaldStructuralEng of OaldStructural = CatEng ** open ParadigmsEng in {
lin abaft_Prep = mkPrep "abaft";
lin aboard_Prep = mkPrep "aboard";
lin about_Prep = mkPrep "about";
lin above_Prep = mkPrep "above";
lin according_as_Conj = mkConj "according as";
lin according_to_Prep = mkPrep "according to";
lin across_Prep = mkPrep "across";
lin afore_Prep = mkPrep "afore";
lin after_Conj = mkConj "after";
lin after_Prep = mkPrep "after";
lin against_Prep = mkPrep "against";
lin agin_Prep = mkPrep "agin";
lin albeit_Conj = mkConj "albeit";
lin along_Prep = mkPrep "along";
lin alongside_Prep = mkPrep "alongside";
lin although_Conj = mkConj "although";
lin amid_Prep = mkPrep "amid";
lin amidst_Prep = mkPrep "amidst";
lin among_Prep = mkPrep "among";
lin amongst_Prep = mkPrep "amongst";
lin an_Conj = mkConj "an";
lin and_Conj = mkConj "and";
lin anent_Prep = mkPrep "anent";
lin around_Prep = mkPrep "around";
lin as_Conj = mkConj "as";
lin aslant_Prep = mkPrep "aslant";
lin astride_Prep = mkPrep "astride";
lin at_Prep = mkPrep "at";
lin athwart_Prep = mkPrep "athwart";
lin bar_Prep = mkPrep "bar";
lin barring_Prep = mkPrep "barring";
lin because_Conj = mkConj "because";
lin before_Conj = mkConj "before";
lin before_Prep = mkPrep "before";
lin behind_Prep = mkPrep "behind";
lin below_Prep = mkPrep "below";
lin beneath_Prep = mkPrep "beneath";
lin beside_Prep = mkPrep "beside";
lin besides_Prep = mkPrep "besides";
lin between_Prep = mkPrep "between";
lin betwixt_Prep = mkPrep "betwixt";
lin beyond_Prep = mkPrep "beyond";
lin but_Conj = mkConj "but";
lin but_Prep = mkPrep "but";
lin by_Prep = mkPrep "by";
lin circa_Prep = mkPrep "circa";
lin concerning_Prep = mkPrep "concerning";
lin considering_Prep = mkPrep "considering";
lin cos_Conj = mkConj "cos";
lin despite_Prep = mkPrep "despite";
lin directly_Conj = mkConj "directly";
lin down_Prep = mkPrep "down";
lin during_Prep = mkPrep "during";
lin either_Conj = mkConj "either";
lin ere_Prep = mkPrep "ere";
lin except_Conj = mkConj "except";
lin except_Prep = mkPrep "except";
lin excepting_Prep = mkPrep "excepting";
lin failing_Prep = mkPrep "failing";
lin for_Conj = mkConj "for";
lin for_Prep = mkPrep "for";
lin forasmuch_as_Conj = mkConj "forasmuch as";
lin from_Prep = mkPrep "from";
lin howbeit_Conj = mkConj "howbeit";
lin if_Conj = mkConj "if";
lin immediately_Conj = mkConj "immediately";
lin in_Prep = mkPrep "in";
lin inside_Prep = mkPrep "inside";
lin instantly_Conj = mkConj "instantly";
lin into_Prep = mkPrep "into";
lin less_Prep = mkPrep "less";
lin lest_Conj = mkConj "lest";
lin like_Conj = mkConj "like";
lin like_Prep = mkPrep "like";
lin likewise_Conj = mkConj "likewise";
lin mid_Prep = mkPrep "mid";
lin midst_Prep = mkPrep "midst";
lin minus_Prep = mkPrep "minus";
lin near_Prep = mkPrep "near";
lin neath_Prep = mkPrep "'neath";
lin neither_Conj = mkConj "neither";
lin nevertheless_Conj = mkConj "nevertheless";
lin next_Prep = mkPrep "next";
lin nigh_Prep = mkPrep "nigh";
lin nigher_Prep = mkPrep "nigher";
lin nighest_Prep = mkPrep "nighest";
lin nisi_Conj = mkConj "nisi";
lin nor_Conj = mkConj "nor";
lin notwithstanding_Conj = mkConj "notwithstanding";
lin notwithstanding_Prep = mkPrep "notwithstanding";
lin now_Conj = mkConj "now";
lin o'er_Prep = mkPrep "o'er";
lin of_Prep = mkPrep "of";
lin off_Prep = mkPrep "off";
lin on_Prep = mkPrep "on";
lin on_to_Prep = mkPrep "on to";
lin only_Conj = mkConj "only";
lin onto_Prep = mkPrep "onto";
lin or_Conj = mkConj "or";
lin otherwise_Conj = mkConj "otherwise";
lin outside_Prep = mkPrep "outside";
lin over_Prep = mkPrep "over";
lin past_Prep = mkPrep "past";
lin pending_Prep = mkPrep "pending";
lin per_Prep = mkPrep "per";
lin plus_Prep = mkPrep "plus";
lin provided_Conj = mkConj "provided";
lin providing_Conj = mkConj "providing";
lin qua_Conj = mkConj "qua";
lin qua_Prep = mkPrep "qua";
lin re_Prep = mkPrep "re";
lin respecting_Prep = mkPrep "respecting";
lin round_Prep = mkPrep "round";
lin sans_Prep = mkPrep "sans";
lin save_Prep = mkPrep "save";
lin saving_Prep = mkPrep "saving";
lin since_Conj = mkConj "since";
lin since_Prep = mkPrep "since";
lin so_Conj = mkConj "so";
lin supposing_Conj = mkConj "supposing";
lin than_Conj = mkConj "than";
lin that_Conj = mkConj "that";
lin tho'_Conj = mkConj "tho'";
lin though_Conj = mkConj "though";
lin thro'_Prep = mkPrep "thro'";
lin through_Prep = mkPrep "through";
lin throughout_Prep = mkPrep "throughout";
lin thru_Prep = mkPrep "thru";
lin till_Conj = mkConj "till";
lin till_Prep = mkPrep "till";
lin to_Prep = mkPrep "to";
lin touching_Prep = mkPrep "touching";
lin toward_Prep = mkPrep "toward";
lin towards_Prep = mkPrep "towards";
lin tween_Prep = mkPrep "'tween";
lin twixt_Prep = mkPrep "'twixt";
lin under_Prep = mkPrep "under";
lin underneath_Prep = mkPrep "underneath";
lin unless_Conj = mkConj "unless";
lin unlike_Prep = mkPrep "unlike";
lin until_Conj = mkConj "until";
lin until_Prep = mkPrep "until";
lin unto_Prep = mkPrep "unto";
lin up_Prep = mkPrep "up";
lin upon_Prep = mkPrep "upon";
lin versus_Prep = mkPrep "versus";
lin via_Prep = mkPrep "via";
lin vice_Prep = mkPrep "vice";
lin vis_à_vis_Prep = mkPrep "vis-à-vis";
lin wanting_Prep = mkPrep "wanting";
lin when_Conj = mkConj "when";
lin whencesoever_Conj = mkConj "whencesoever";
lin whenever_Conj = mkConj "whenever";
lin whereas_Conj = mkConj "whereas";
lin whether_Conj = mkConj "whether";
lin while_Conj = mkConj "while";
lin whilst_Conj = mkConj "whilst";
lin with_Prep = mkPrep "with";
lin within_Prep = mkPrep "within";
lin without_Prep = mkPrep "without";
lin yet_Conj = mkConj "yet";
}

View File

@@ -1,453 +0,0 @@
#! /usr/bin/perl -w
#
# Perl script to process OALD machine-readable ASCII file
# into a GF lexicon
#
# Usage: ./asc2gf < ascii_0710-1.txt
#
# Bjorn Bringert 2008,
# based on asc2lex by
# Matthew Purver, 11/2001
use strict;
# Names (without the "_V" suffix) of the verbs defined in IrregEng.gf.
# The main loop uses this to link OALD irregular verbs to those entries.
my %irregular_verbs = ();

# Collected lexicon, filled by add_word: GF function name => linearization.
my %words = ();

my $irreg_eng = "../../english/IrregEng.gf";
open(my $irreg_fh, '<', $irreg_eng)
    or die "Could not open $irreg_eng: $!\n";
while (my $irreg_line = <$irreg_fh>) {
    # Only accept a definition that starts the line (optionally preceded by
    # the "lin" keyword). Matching anywhere in the line would also pick up
    # commented-out definitions, and rewriting the line in place would leave
    # any text before the name inside the hash key.
    if ($irreg_line =~ /^\s*(?:lin\s+)?([a-z\d]+)_V\s*=/) {
        $irregular_verbs{$1} = 1;
    }
}
close($irreg_fh);

# Diagnostic only; sorted so that the output is the same on every run.
print "Known irregular verbs from $irreg_eng:\n";
print join(",", sort keys %irregular_verbs) . "\n";
# Discard input up to and including the line that closes the TEI header;
# the dictionary entries start after it.
while ( defined( my $header_line = <STDIN> ) ) {
    last if $header_line =~ m{</TEIHEADER>};
}
# Main pass over the dictionary body.
#
# Every input line that has the fixed-width OALD layout is turned into zero
# or more lexicon entries, registered through add_word:
#   verbs        -> <name>_V and/or <name>_V2
#   nouns        -> <name>_N or <name>_PN
#   adjectives   -> <name>_A
#   adverbs      -> <name>_Adv
#   prepositions -> <name>_Prep
#   conjunctions -> <name>_Conj
# Pronouns, determiners and the miscellaneous categories are recognised but
# produce no entry. Guessed irregular forms are flagged with a FIXME comment
# in the generated GF code.
LINE:
while ( my $line = <STDIN> ) {

    # remove SGML tags
    $line =~ s/<[^<>]+>//g;

    # Split the line into fields according to spec (line may be empty now).
    # The first field is the word and the third the comma-separated PoS
    # codes. The last field (subcategorisation) is not used by this script.
    next LINE unless $line =~ /^(.{23}).{23}(.{23}).{1}(.{58})$/;
    my ( $word, $pos ) = ( $1, $2 );

    # trim trailing white space
    for ( $word, $pos ) {
        s/\s*$//;
    }

    # make word lower-case
    $word =~ tr/A-Z/a-z/;

    # translate OALD diacritics
    $word =~ s/~n/ñ/g;
    $word =~ s/<c/ç/g;
    $word =~ s/"a/ä/g;
    $word =~ s/"o/ö/g;
    $word =~ s/"u/ü/g;
    $word =~ s/"i/ï/g;
    $word =~ s/\^a/â/g;
    $word =~ s/\^e/ê/g;
    $word =~ s/\^o/ô/g;
    $word =~ s/`a/à/g;
    $word =~ s/`e/è/g;
    $word =~ s/_e/é/g;

    # make legal identifier
    # Note: in theory this could cause clashes, but I don't think it does
    # with the OALD.
    my $name = $word;
    $name =~ s/ /_/g;     # space -> _
    $name =~ s/-/_/g;     # - -> _
    $name =~ s/\./_/g;    # . -> _
    $name =~ s/^'//;      # drop initial '

    # Each PoS entry starts with a category code followed by an inflection
    # code; any further characters are not used here.
    ENTRY:
    foreach my $entry ( split( /,/, $pos ) ) {
        my ( $pcode, $infl ) = split( //, $entry );

        # ---- verbs ------------------------------------------------------
        if ( $pcode =~ /^[GHIJ]/ ) {

            # Only root forms (inflection code is a digit) produce entries.
            # Inflected forms (codes a-d) are ignored.
            next ENTRY unless $infl =~ /^\d/;

            # work out the inflected forms from the inflection class
            my ( $vbz, $vbg, $vbd );
            if ( $infl == 0 ) {
                $vbz = "${word}s";
                $vbg = "${word}ing";
                $vbd = "${word}ed";
            }
            elsif ( $infl == 1 ) {
                $vbz = "${word}es";
                $vbg = "${word}ing";
                $vbd = "${word}ed";
            }
            elsif ( $infl == 2 ) {
                ( $vbz = $word ) =~ s/e$/es/;
                ( $vbg = $word ) =~ s/e$/ing/;
                ( $vbd = $word ) =~ s/e$/ed/;
            }
            elsif ( $infl == 3 ) {
                ( $vbz = $word ) =~ s/y$/ies/;
                ( $vbg = $word ) =~ s/y$/ying/;
                ( $vbd = $word ) =~ s/y$/ied/;
            }
            elsif ( $infl == 4 ) {
                # final letter is doubled before -ing / -ed
                $vbz = "${word}s";
                ( $vbg = $word ) =~ s/(\w)$/$1$1ing/;
                ( $vbd = $word ) =~ s/(\w)$/$1$1ed/;
            }
            elsif ( $infl == 5 ) {
                # irregular: placeholder forms, replaced below when the
                # verb (or a suffix of it) is known from IrregEng
                $vbz = 'IRREG';
                $vbg = 'IRREG';
                $vbd = 'IRREG';
            }

            my $lin = "mkV \"$word\" \"$vbz\" \"$vbd\" \"$vbd\" \"$vbg\"";

            # Try to use a verb from IrregEng: look for the longest suffix
            # of the word that is a known irregular verb. A proper suffix
            # is combined with the remaining prefix.
            if ( $infl == 5 ) {
                for my $i ( 0 .. length($word) - 2 ) {
                    my $suffix = substr( $word, $i );
                    next unless $irregular_verbs{$suffix};
                    if ( $i == 0 ) {
                        $lin = "IrregEng.${name}_V";
                    }
                    else {
                        my $prefix = substr( $word, 0, $i );
                        $lin = "mkV \"$prefix\" IrregEng.${suffix}_V";
                    }
                    last;
                }
            }

            if ( $pcode eq 'G' ) {
                #add_word("${name}_VX", "mkVX ($lin)");
                print STDERR "Ignoring anomalous verb: $name\n";
            }
            if ( $pcode eq 'I' || $pcode eq 'J' ) {
                add_word( "${name}_V", "$lin" );
            }
            if ( $pcode eq 'H' || $pcode eq 'J' ) {
                add_word( "${name}_V2", "mkV2 ($lin)" );
            }
        }

        # ---- nouns ------------------------------------------------------
        elsif ( $pcode =~ /^[KLMNY]/ ) {

            # Code N is always a proper noun; code Y is one when its
            # inflection code is ':', '=' or '~'. All other nouns become
            # common nouns.
            my $is_proper = ( $pcode =~ /^N/ )
                || ( $pcode =~ /^Y/ && $infl =~ /^[:=~]/ );

            # entries that are not singular forms get no entry of their own
            next ENTRY if $infl =~ /^j/;

            # Singular and plural forms for this entry. A local copy of the
            # word is used so that a plural-only noun does not change the
            # word seen by later PoS entries of the same dictionary line.
            my $sg = $word;
            my $pl = '-';
            if ( $infl eq '6' ) {
                $pl = "${word}s";
            }
            elsif ( $infl eq '7' ) {
                $pl = "${word}es";
            }
            elsif ( $infl eq '8' ) {
                ( $pl = $word ) =~ s/y$/ies/;
            }
            elsif ( $infl =~ /^[9k\]]/ ) {
                $pl = $word;
            }
            elsif ( $infl =~ /^i/ ) {
                # for irregulars, let's just make a guess and mark with '*'
                # this could be done better, as for verbs
                $pl = $word;
                ( $pl =~ s/^((wo)?m)an/$1en\*/ ) or
                ( $pl =~ s/man(-|$)/men$1\*/ ) or
                ( $pl =~ s/-in-law/s-in-law\*/ ) or
                ( $pl =~ s/um$/a\*/ ) or
                ( $pl =~ s/us$/i\*/ ) or
                ( $pl =~ s/a$/ae\*/ ) or
                ( $pl =~ s/on$/a\*/ ) or
                ( $pl =~ s/is$/es\*/ ) or
                ( $pl =~ s/o$/i\*/ ) or
                ( $pl =~ s/child$/children\*/ ) or
                ( $pl =~ s/oot$/eet\*/ ) or
                ( $pl =~ s/ooth$/eeth\*/ ) or
                ( $pl =~ s/([lm])ouse$/$1ice\*/ ) or
                ( $pl =~ s/f(e)?$/ves\*/ ) or
                ( $pl =~ s/[ei]x$/ices\*/ ) or
                ( $pl =~ s/eau$/eaux\*/ ) or
                ( $pl = 'IRREG' );
            }
            elsif ( $infl =~ /^\)/ ) {
                # plural-only: the listed word is the plural form
                $pl = $word;
                $sg = '-';
            }

            # flag anything a human should look at in the generated file
            my $comment = "";
            if ( $sg eq '-' ) {
                $comment .= " {- FIXME: no singular form -}";
            }
            if ( $pl eq '-' ) {
                $comment .= " {- FIXME: no plural form -}";
            }
            if ( $pl =~ s/\*$// ) {
                $comment .= " {- FIXME: guessed plural form -}";
            }

            if ($is_proper) {
                add_word( "${name}_PN", "mkPN \"$sg\"" );
            }
            else {
                add_word( "${name}_N", "mkN \"$sg\" \"$pl\"$comment" );
            }
        }

        # ---- adjectives -------------------------------------------------
        elsif ( $pcode =~ /^O/ ) {

            # only root forms produce an entry
            next ENTRY if $infl =~ /^[rs]/;

            # Comparative form; '-' means the adjective has none. It stays
            # empty for an inflection code not covered below.
            my $comp = '';
            if ( $infl =~ /^[Apqt]/ ) {
                $comp = '-';
            }
            elsif ( $infl =~ /^B/ ) {
                $comp = "${word}r";
            }
            elsif ( $infl =~ /^C/ ) {
                $comp = "${word}er";
            }
            elsif ( $infl =~ /^D/ ) {
                ( $comp = $word ) =~ s/y$/ier/;
            }
            elsif ( $infl =~ /^E/ ) {
                # for irregulars, let's just have a guess and mark with '*'
                # (there aren't very many of these)
                ( $comp = $word ) =~ s/(\w)$/$1$1er\*/;
            }

            # The '*' marker must not end up inside the GF string literal;
            # turn it into a FIXME comment, as is done for guessed plurals.
            my $comment = "";
            if ( $comp =~ s/\*$// ) {
                $comment = " {- FIXME: guessed comparative form -}";
            }

            if ( $comp eq '-' ) {
                add_word( "${name}_A", "compoundA (mkA \"$word\")" );
            }
            else {
                add_word( "${name}_A", "mkA \"$word\" \"$comp\"$comment" );
            }
        }

        # ---- adverbs ----------------------------------------------------
        elsif ( $pcode =~ /^P/ ) {
            add_word( "${name}_Adv", "mkAdv \"$word\"" );
        }

        # ---- pronouns: recognised, but no entry is generated --------------
        elsif ( $pcode =~ /^Q/ ) {
        }

        # ---- determiners: recognised, but no entry is generated -----------
        elsif ( $pcode =~ /^[RS]/ ) {
            #add_word("${name}_Det","mkDeterminer \"$word\"");
        }

        # ---- prepositions -------------------------------------------------
        elsif ( $pcode =~ /^T/ ) {
            add_word( "${name}_Prep", "mkPrep \"$word\"" );
        }

        # ---- conjunctions -------------------------------------------------
        elsif ( $pcode =~ /^V/ ) {
            add_word( "${name}_Conj", "mkConj \"$word\"" );
        }

        # ---- miscellaneous (codes U, W, X, Z): no entry is generated ------
        elsif ( $pcode =~ /^[UWXZ]/ ) {
        }
    }
}
# Write the collected lexicon as two abstract/concrete GF module pairs:
# the open-class lexicon (Oald, OaldEng) and the closed-class lexicon
# (OaldStructural, OaldStructuralEng).
my $absfile = "Oald.gf";
my $cncfile = "OaldEng.gf";
my $abs_structfile = "OaldStructural.gf";
my $cnc_structfile = "OaldStructuralEng.gf";

# A failure to create any output file is fatal: continuing would silently
# drop part of the lexicon.
open(my $abs_fh, '>', $absfile)
    or die "Could not open $absfile for writing: $!\n";
open(my $cnc_fh, '>', $cncfile)
    or die "Could not open $cncfile for writing: $!\n";
open(my $abs_struct_fh, '>', $abs_structfile)
    or die "Could not open $abs_structfile for writing: $!\n";
open(my $cnc_struct_fh, '>', $cnc_structfile)
    or die "Could not open $cnc_structfile for writing: $!\n";

# print a nice comment at the top
my $header = "-- English lexicon for GF, produced from:\n"
. "-- Oxford advanced learner's dictionary of current English:\n"
. "-- expanded 'computer usable' version compiled by Roger Mitton\n"
. "-- The computer usable version is transcribed from:\n"
. "-- Oxford advanced learner's dictionary of current English\n"
. "-- A.S. Hornby ; with the assistance of A.P. Cowie [and] J. Windsor Lewis.\n"
. "-- 3rd. ed., London : Oxford University Press, 1974.\n"
. "-- Distributed as 'dict0710' by:\n"
. "-- Oxford Text Archive\n"
. "-- Oxford University Computing Services\n"
. "-- 13 Banbury Road\n"
. "-- Oxford\n"
. "-- OX2 6NN\n"
. "-- Under these conditions:\n"
. "-- Freely available for non-commercial use provided that this header is\n"
. "-- included in its entirety with any copy distributed.\n"
. "--\n"
. "-- GF version generated by asc2gf, Bjorn Bringert Nov 2008\n"
. "-- based on asc2lex, Matthew Purver Nov 2001\n"
. "-- http://www.stanford.edu/~mpurver/software.html\n"
. "\n";

print $abs_fh $header;
print $abs_fh "abstract Oald = Cat ** {\n";
print $cnc_fh $header;
print $cnc_fh "--# -path=.:alltenses\n";
print $cnc_fh "concrete OaldEng of Oald = CatEng ** open ParadigmsEng, IrregEng in {\n";
print $abs_struct_fh $header;
print $abs_struct_fh "abstract OaldStructural = Cat ** {\n";
print $cnc_struct_fh $header;
print $cnc_struct_fh "--# -path=.:alltenses\n";
print $cnc_struct_fh "concrete OaldStructuralEng of OaldStructural = CatEng ** open ParadigmsEng in {\n";

# Categories that go to the open lexicon; every other category (Prep and
# Conj) goes to the structural lexicon. The set is spelled out explicitly:
# the previous test /^(A)|(N)|(V)|(V2)$/ was parsed as four separate
# alternatives, so Adv and PN were only accepted because they happen to
# start with "A" and contain "N".
my %open_class = map { $_ => 1 } qw(A Adv N PN V V2);

foreach my $name (sort (keys %words)) {
    # the category is the part after the last underscore, e.g. "V2"
    (my $cat = $name) =~ s/.*_([A-Z][A-Za-z\d]*)$/$1/;
    my $lin = $words{$name};
    if ( $open_class{$cat} ) {
        print $abs_fh "fun $name : $cat;\n";
        print $cnc_fh "lin $name = $lin;\n";
    } else {
        print $abs_struct_fh "fun $name : $cat;\n";
        print $cnc_struct_fh "lin $name = $lin;\n";
    }
}

print $abs_fh "}";
print $cnc_fh "}";
print $abs_struct_fh "}";
print $cnc_struct_fh "}";

# Buffered write errors only show up at close, so check it.
close($abs_struct_fh) or die "Could not write $abs_structfile: $!\n";
close($cnc_struct_fh) or die "Could not write $cnc_structfile: $!\n";
close($abs_fh) or die "Could not write $absfile: $!\n";
close($cnc_fh) or die "Could not write $cncfile: $!\n";

print "\nWrote open lexicon to $absfile and $cncfile\n";
print "Wrote closed lexicon to $abs_structfile and $cnc_structfile\n";
# Register one lexicon entry in %words.
#   $entry_name - GF function name, e.g. "cat_N"
#   $entry_lin  - GF linearization expression for that function
# The first definition of a name wins; a later definition with the same name
# is reported on STDERR and otherwise ignored.
sub add_word {
    my ($entry_name, $entry_lin) = @_;
    return $words{$entry_name} = $entry_lin
        unless exists $words{$entry_name};
    print STDERR "Duplicate word: $entry_name\n";
}

View File

@@ -1,12 +0,0 @@
This directory contains the Oxford Advanced Learner's Dictionary of Current English
(expanded computer-usable version), available from the Oxford Text Archive (http://ota.ahds.ac.uk).
It has a flat structure but contains part-of-speech, verb subcategorisation & pronunciation info.
Files:
ascii_0710-1.txt the original plain ASCII version of the OALD
ascii_0710-2.txt the information to go with it
asc2lex a Perl script to process ASCII -> Prolog
lexicon2.pl the resulting Prolog version, hand-corrected for irregulars etc.
Matthew Purver, Jan 2001

View File

@@ -1,320 +0,0 @@
#! /usr/bin/perl
#
# Perl script to process OALD machine-readable ASCII file
# into a Prolog-readable lexicon usable by SHARDS
#
# Usage: ./asc2lex < ascii_0710-1.txt [> OUTPUT.PL]
#
# Matthew Purver, 11/2001
# print a nice comment at the top
print "% Prolog lexicon for SHARDS, from OALD machine-readable dictionary\n";
print "% Produced by asc2lex, Matthew Purver 11/2001\n\n";
# skip header section
while ( <STDIN> ) {
last if /<\/TEIHEADER>/;
}
# read a line from stdin
#
# Main pass over the dictionary body.  Every line that matches the fixed-width
# record layout is broken into a headword, a comma-separated list of
# part-of-speech codes and a list of subcategorisation codes.  Each
# part-of-speech code yields one Prolog fact, which is appended to the array
# for its category (@verb, @noun, @adj, @adv, @pron, @det, @prep, @conj,
# @prefix, @interj, @partcl, @unknown) so that the facts can be printed grouped
# by category afterwards.  Inflected verb forms that appear as headwords of
# their own are collected in @vbz, @vbg, @vbd and @vbn for the later guessing
# of irregular verb forms.
while ( $line = <STDIN> ) {
    # remove SGML tags
    $line =~ s/<[^<>]+>//g;
    # split line into fields according to spec (line may be empty now):
    # 23 characters of headword, 23 skipped, 23 of part-of-speech codes,
    # 1 skipped, 58 of subcategorisation codes.  Lines of any other shape
    # are silently ignored.
    if ( $line =~ /^(.{23}).{23}(.{23}).{1}(.{58})$/ ) {
        # trim white space (the loop variable aliases $word, $pos, $cat in turn)
        for ( ( $word, $pos, $cat ) = ( $1, $2, $3 ) ) {
            s/\s*$//;
        }
        # make word lower-case atomic string
        $word =~ s/\'/\\\'/g;    # ' -> \'
        $word =~ s/\"/\\\"/g;    # " -> \"
        $word =~ tr/A-Z/a-z/;    # lower case
        # get PoS & subcat info
        @pos = split( /,/, $pos );
        $cat =~ s/,/\',\'/g;
        ( $cat = "\'$cat\'" ) unless ( $cat eq '' );
        # set up Prolog-style string & put into array
        foreach ( @pos ) {
            # a code is split into single characters; the first three are the
            # category letter, the inflection code and the frequency code
            ( $pcode, $infl, $freq ) = split(//);
            # for verbs, get inflected forms
            if ( $pcode =~ /^[GHIJ]/ ) {
                $pos = 'verb';
                $pcode =~ s/^G/unknown/;
                $pcode =~ s/^H/tran/;
                $pcode =~ s/^I/intran/;
                $pcode =~ s/^J/_/;
                # if this is a root form, work out the inflected forms
                if ( $infl =~ /^\d/ ) {
                    if ( $infl == 0 ) {
                        ( $vbz = $word ) =~ s/$/s/;
                        ( $vbg = $word ) =~ s/$/ing/;
                        ( $vbd = $word ) =~ s/$/ed/;
                    }
                    elsif ( $infl == 1 ) {
                        ( $vbz = $word ) =~ s/$/es/;
                        ( $vbg = $word ) =~ s/$/ing/;
                        ( $vbd = $word ) =~ s/$/ed/;
                    }
                    elsif ( $infl == 2 ) {
                        ( $vbz = $word ) =~ s/e$/es/;
                        ( $vbg = $word ) =~ s/e$/ing/;
                        ( $vbd = $word ) =~ s/e$/ed/;
                    }
                    elsif ( $infl == 3 ) {
                        ( $vbz = $word ) =~ s/y$/ies/;
                        ( $vbg = $word ) =~ s/y$/ying/;
                        ( $vbd = $word ) =~ s/y$/ied/;
                    }
                    elsif ( $infl == 4 ) {
                        # final letter is doubled before -ing / -ed
                        ( $vbz = $word ) =~ s/$/s/;
                        ( $vbg = $word ) =~ s/(\w)$/$1$1ing/;
                        ( $vbd = $word ) =~ s/(\w)$/$1$1ed/;
                    }
                    elsif ( $infl == 5 ) {
                        # for irregulars, just mark as such for now, we'll guess later
                        $vbz = 'IRREG';
                        $vbg = 'IRREG';
                        $vbd = 'IRREG';
                    }
                    # add the full spec to @verb array
                    # ($vbd is written twice: past tense and past participle)
                    push( @verb,
                        "$pos( \'$word\', \'$vbz\', \'$vbg\', \'$vbd\', \'$vbd\', $pcode, [$cat] ).\n" );
                }
                # if this is an inflected form, save for guessing irregulars later
                elsif ( $infl =~ /^a/ ) {
                    push( @vbz, $word );
                }
                elsif ( $infl =~ /^b/ ) {
                    push( @vbg, $word );
                }
                elsif ( $infl =~ /^c/ ) {
                    push( @vbd, $word );
                }
                elsif ( $infl =~ /^d/ ) {
                    push( @vbn, $word );
                }
            }
            # for nouns, get plural form
            elsif ( $pcode =~ /^[KLMNY]/ ) {
                $pos = 'noun';
                $pcode =~ s/^K/count/;
                $pcode =~ s/^L/mass/;
                $pcode =~ s/^M/both/;
                $pcode =~ s/^N/proper/;
                if ( $pcode =~ /^Y/ ) {
                    $pcode = 'count'  if $infl =~ /^[>\)\]]/;
                    $pcode = 'mass'   if $infl =~ /^\}/;
                    $pcode = 'proper' if $infl =~ /^[:=~]/;
                }
                # if this is a singular form, work out plural form
                unless ( $infl =~ /^j/ ) {
                    # Singular form written for this entry.  It is kept apart
                    # from $word so that a plural-only entry cannot overwrite
                    # the headword used by the remaining codes on this line.
                    my $sing = $word;
                    $pl = '-';
                    if ( $infl == 6 ) {
                        ( $pl = $word ) =~ s/$/s/;
                    }
                    elsif ( $infl == 7 ) {
                        ( $pl = $word ) =~ s/$/es/;
                    }
                    elsif ( $infl == 8 ) {
                        ( $pl = $word ) =~ s/y$/ies/;
                    }
                    elsif ( $infl =~ /^[9k\]]/ ) {
                        $pl = $word;
                    }
                    elsif ( $infl =~ /^i/ ) {
                        # for irregulars, let's just make a guess and mark with '*'
                        # this could be done better, as for verbs, but I can't be bothered now
                        # (the first rewrite rule that applies wins)
                        $pl = $word;
                        ( $pl =~ s/^((wo)?m)an/$1en\*/ ) or
                        ( $pl =~ s/man(-|$)/men$1\*/ ) or
                        ( $pl =~ s/-in-law/s-in-law\*/ ) or
                        ( $pl =~ s/um$/a\*/ ) or
                        ( $pl =~ s/us$/i\*/ ) or
                        ( $pl =~ s/a$/ae\*/ ) or
                        ( $pl =~ s/on$/a\*/ ) or
                        ( $pl =~ s/is$/es\*/ ) or
                        ( $pl =~ s/o$/i\*/ ) or
                        ( $pl =~ s/child$/children\*/ ) or
                        ( $pl =~ s/oot$/eet\*/ ) or
                        ( $pl =~ s/ooth$/eeth\*/ ) or
                        ( $pl =~ s/([lm])ouse$/$1ice\*/ ) or
                        ( $pl =~ s/f(e)?$/ves\*/ ) or
                        ( $pl =~ s/[ei]x$/ices\*/ ) or
                        ( $pl =~ s/eau$/eaux\*/ ) or
                        ( $pl = 'IRREG' );
                    }
                    # if plural-only, swap root form & plural
                    elsif ( $infl =~ /^\)/ ) {
                        $pl   = $word;
                        $sing = '-';
                    }
                    # and add full spec to @noun array
                    ( $infl =~ s/^[:l]/per/ ) or ( $infl =~ s/^[mn]/loc/ ) or ( $infl = '_' );
                    push( @noun, "$pos( \'$sing\', \'$pl\', $pcode, $infl ).\n" );
                }
            }
            # for adjectives, get comparative & superlative forms
            elsif ( $pcode =~ /^O/ ) {
                $pos = 'adj';
                # if this is root form, work out inflected forms
                unless ( $infl =~ /^[rs]/ ) {
                    if ( $infl =~ /^[Apqt]/ ) {
                        $comp = $sup = '-';
                    }
                    elsif ( $infl =~ /^B/ ) {
                        ( $comp = $word ) =~ s/$/r/;
                        ( $sup  = $word ) =~ s/$/st/;
                    }
                    elsif ( $infl =~ /^C/ ) {
                        ( $comp = $word ) =~ s/$/er/;
                        ( $sup  = $word ) =~ s/$/est/;
                    }
                    elsif ( $infl =~ /^D/ ) {
                        ( $comp = $word ) =~ s/y$/ier/;
                        ( $sup  = $word ) =~ s/y$/iest/;
                    }
                    elsif ( $infl =~ /^E/ ) {
                        # for irregulars, let's just have a guess and mark with '*'
                        # (there aren't very many of these)
                        ( $comp = $word ) =~ s/(\w)$/$1$1er\*/;
                        ( $sup  = $word ) =~ s/(\w)$/$1$1est\*/;
                    }
                    $infl =~ s/^[ABCDE]/normal/;
                    $infl =~ s/^p/pred/;
                    $infl =~ s/^q/attr/;
                    $infl =~ s/^t/affix/;
                    # and add full spec to @adj array
                    push( @adj, "$pos( \'$word\', \'$comp\', \'$sup\', $infl ).\n" );
                }
            }
            # for adverbs, just add all info to @adv array
            elsif ( $pcode =~ /^P/ ) {
                $pos = 'adv';
                $infl =~ s/^[u\+]/normal/;
                $infl =~ s/^w/whrel/;
                $infl =~ s/^v/whq/;
                push( @adv, "$pos( \'$word\', $infl ).\n" );
            }
            # for pronouns, work out some case/person info
            # (the substitution both tests for Q and resets $pcode to '_')
            elsif ( $pcode =~ s/^Q/_/ ) {
                $pos = 'pron';
                $infl =~ s/^x/normal/;
                $infl =~ s/^y/whq/;
                $infl =~ s/^z/whrel/;
                $class = '_';
                # reflexive pronouns
                if ( ( $word =~ /self$/ ) or
                     ( $word =~ /selves$/ ) ) {
                    $pcode = 'acc';
                }
                # accusative personal pronouns
                if ( ( $word =~ /^him/ ) or
                     ( $word =~ /^her/ ) or
                     ( $word =~ /^them/ ) or
                     ( $word eq 'us' ) or
                     ( $word eq 'thee' ) or
                     ( $word eq 'me' ) ) {
                    $pcode = 'acc';
                    $class = 'per';
                }
                # nominative personal pronouns
                if ( ( $word eq 'he' ) or
                     ( $word eq 'she' ) or
                     ( $word eq 'they' ) or
                     ( $word eq 'we' ) or
                     ( $word eq 'thou' ) or
                     ( $word eq 'i' ) ) {
                    $pcode = 'nom';
                    $class = 'per';
                }
                # other personal pronouns
                if ( ( $word =~ /.+one/ ) or
                     ( $word =~ /one.+/ ) or
                     ( $word =~ /body/ ) or
                     ( $word =~ /^you/ ) or
                     ( $word =~ /^who/ ) ) {
                    $class = 'per';
                }
                # non-personal pronouns
                if ( $word =~ /thing/ ) {
                    $class = 'nper';
                }
                # otherwise case/person info will be '_' (anon variable)
                # add full spec to @pron array
                push( @pron, "$pos( \'$word\', $pcode, $infl, $class ).\n" );
            }
            # for determiners, leave anon variable as placeholder for semantics
            elsif ( $pcode =~ /^[RS]/ ) {
                $pos = 'det';
                $pcode =~ s/^R/def/;
                $pcode =~ s/^S/indef/;
                push( @det, "$pos( \'$word\', $pcode, _ ).\n" );
            }
            # for prepositions - nothing to say
            elsif ( $pcode =~ s/^T/prep/ ) {
                $pos = 'prep';
                push( @prep, "$pos( \'$word\', $pcode ).\n" );
            }
            # for conjunctions - nothing to say
            elsif ( $pcode =~ s/^V/conj/ ) {
                $pos = 'conj';
                push( @conj, "$pos( \'$word\', $pcode ).\n" );
            }
            # for miscellaneous, leave '-' as placeholder for illocutionary info
            # (each substitution renames the code and selects the target array)
            elsif ( $pcode =~ /^[UWXZ]/ ) {
                $pos = 'misc';
                push( @prefix,  "$pos( \'$word\', $pcode, '-' ).\n" ) if ( $pcode =~ s/^U/prefix/ );
                push( @interj,  "$pos( \'$word\', $pcode, '-' ).\n" ) if ( $pcode =~ s/^W/interj/ );
                push( @partcl,  "$pos( \'$word\', $pcode, '-' ).\n" ) if ( $pcode =~ s/^X/partcl/ );
                push( @unknown, "$pos( \'$word\', $pcode, '-' ).\n" ) if ( $pcode =~ s/^Z/unknown/ );
            }
        }
    }
}
# now have a guess at irregular verb forms (marking the best guess with '*')
#
# Every verb fact whose inflections are still the IRREG placeholder gets its
# four placeholders replaced, in order, by the closest match found by
# findbest() among the separately listed inflected forms in @vbz, @vbg, @vbd
# and @vbn.  A '*' is inserted in front of the headword of such a fact.
# The loop variable aliases the elements of @verb, so the facts are rewritten
# in place.
foreach $verb ( @verb ) {
    if ( $verb =~ /verb\( \'([^\']+)\', \'IRREG/ ) {
        $word = $1;
        $vbz = findbest( $word, @vbz );
        $vbg = findbest( $word, @vbg );
        $vbd = findbest( $word, @vbd );
        $vbn = findbest( $word, @vbn );
        # \Q...\E: the headword is data, not a pattern; without quoting, a
        # headword holding a regex metacharacter or an escaped quote would
        # fail to match itself or break the pattern
        $verb =~ s/(\Q$word\E\', \')IRREG(\', \')IRREG(\', \')IRREG(\', \')IRREG/\*$1$vbz$2$vbg$3$vbd$4$vbn/;
    }
}
# Write out the collected facts one part of speech at a time, in a fixed
# order, with a single newline between consecutive groups and nothing after
# the last group.
print join( "\n",
    map { join( '', @$_ ) }
        \@verb,   \@noun,   \@adj,    \@adv,
        \@pron,   \@det,    \@prep,   \@conj,
        \@prefix, \@interj, \@partcl, \@unknown );
# find closest string match
# similarity measure is just the length of identical prefix
# prefer shorter strings in the case of equal similarity
#
# Arguments: $word  - the string to match against
#            @array - the candidate strings
# Returns the candidate sharing the longest prefix with $word; among
# candidates with an equally long shared prefix the shortest one wins, and
# the earliest one if they also have the same length.  Returns undef when
# @array is empty.  All working state is local to the call, so the result of
# an earlier call can never be returned by mistake.
sub findbest
{
    my ( $word, @array ) = @_;
    my $best;           # best candidate so far; undef until one has been seen
    my $bestlen = 0;    # once $best is set: length of its shared prefix, plus one
    foreach my $test ( @array ) {
        # shares at least as long a prefix as the current best and is shorter
        if ( defined( $best ) &&
             ( substr( $word, 0, $bestlen - 1 ) eq substr( $test, 0, $bestlen - 1 ) ) &&
             ( length( $test ) < length( $best ) ) ) {
            $best = $test;
        }
        # shares a longer prefix than the current best: take it and keep
        # extending the prefix for as long as the two strings agree
        while ( ( substr( $word, 0, $bestlen ) eq substr( $test, 0, $bestlen ) ) &&
                ( $bestlen <= length( $test ) ) ) {
            $bestlen++;
            $best = $test;
        }
    }
    return $best;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,39 +0,0 @@
youPl_Pron 0.04
youPol_Pron 0.04
UttS 0.6
UttQS 0.2
UttImp 0.1
NoPConj 0.8
NoVoc 0.98
PredVP 0.9
DetCN 0.8
UsePron 0.1
something_NP 0.01
somebody_NP 0.01
everything_NP 0.01
everybody_NP 0.01
SlashV2 0.8
UseV 0.4
ComplSlash 0.4
ComplVQ 0.02
ComplVS 0.02
ComplVA 0.02
DetQuant 0.8
ASimul 0.8
TFut 0.1
TCond 0.1
PPos 0.7
ApposCN 0.01
ExistNP 0.0001
UseCopula 0.01
ConjS 0.1