mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-23 03:32:51 -06:00
OALD: clean up asc2gf script.
This commit is contained in:
@@ -1,4 +1,4 @@
|
|||||||
#! /usr/bin/perl
|
#! /usr/bin/perl -w
|
||||||
#
|
#
|
||||||
# Perl script to process OALD machine-readable ASCII file
|
# Perl script to process OALD machine-readable ASCII file
|
||||||
# into a GF lexicon
|
# into a GF lexicon
|
||||||
@@ -9,11 +9,24 @@
|
|||||||
# based on asc2lex by
|
# based on asc2lex by
|
||||||
# Matthew Purver, 11/2001
|
# Matthew Purver, 11/2001
|
||||||
|
|
||||||
# The list of known irregular verbs is generated with this command
|
use strict;
|
||||||
# $ perl -e 'while (<>) { if (s/\s*([a-z\d]+)_V\s*=.*/"$1" => 1/) { chomp; push(@verbs,$_); } }; print "(" . join(",", @verbs) . ")\n";' english/IrregEng.gf
|
|
||||||
|
|
||||||
my %irregular_verbs = ("awake" => 1,"bear" => 1,"beat" => 1,"become" => 1,"begin" => 1,"bend" => 1,"beset" => 1,"bet" => 1,"bid" => 1,"bind" => 1,"bite" => 1,"bleed" => 1,"blow" => 1,"break" => 1,"breed" => 1,"bring" => 1,"broadcast" => 1,"build" => 1,"burn" => 1,"burst" => 1,"buy" => 1,"cast" => 1,"catch" => 1,"choose" => 1,"cling" => 1,"come" => 1,"cost" => 1,"creep" => 1,"cut" => 1,"deal" => 1,"dig" => 1,"dive" => 1,"do" => 1,"draw" => 1,"dream" => 1,"drive" => 1,"drink" => 1,"eat" => 1,"fall" => 1,"feed" => 1,"feel" => 1,"fight" => 1,"find" => 1,"fit" => 1,"flee" => 1,"fling" => 1,"fly" => 1,"forbid" => 1,"forget" => 1,"forgive" => 1,"forsake" => 1,"freeze" => 1,"get" => 1,"give" => 1,"go" => 1,"grind" => 1,"grow" => 1,"hang" => 1,"have" => 1,"hear" => 1,"hide" => 1,"hit" => 1,"hold" => 1,"hurt" => 1,"keep" => 1,"kneel" => 1,"knit" => 1,"know" => 1,"lay" => 1,"lead" => 1,"leap" => 1,"learn" => 1,"leave" => 1,"lend" => 1,"let" => 1,"lie" => 1,"light" => 1,"lose" => 1,"make" => 1,"mean" => 1,"meet" => 1,"misspell" => 1,"mistake" => 1,"mow" => 1,"overcome" => 1,"overdo" => 1,"overtake" => 1,"overthrow" => 1,"pay" => 1,"plead" => 1,"prove" => 1,"put" => 1,"quit" => 1,"read" => 1,"rid" => 1,"ride" => 1,"ring" => 1,"rise" => 1,"run" => 1,"saw" => 1,"say" => 1,"see" => 1,"seek" => 1,"sell" => 1,"send" => 1,"set" => 1,"sew" => 1,"shake" => 1,"shave" => 1,"shear" => 1,"shed" => 1,"shine" => 1,"shoe" => 1,"shoot" => 1,"show" => 1,"shrink" => 1,"shut" => 1,"sing" => 1,"sink" => 1,"sit" => 1,"sleep" => 1,"slay" => 1,"slide" => 1,"sling" => 1,"slit" => 1,"smite" => 1,"sow" => 1,"speak" => 1,"speed" => 1,"spend" => 1,"spill" => 1,"spin" => 1,"spit" => 1,"split" => 1,"spread" => 1,"spring" => 1,"stand" => 1,"steal" => 1,"stick" => 1,"sting" => 1,"stink" => 1,"stride" => 1,"strike" => 1,"string" => 1,"strive" => 1,"swear" => 1,"sweep" => 1,"swell" => 1,"swim" => 1,"swing" => 1,"take" => 1,"teach" => 1,"tear" => 1,"tell" => 1,"think" => 1,"thrive" => 1,"throw" => 1,"thrust" => 1,"tread" => 1,"understand" => 1,"uphold" => 1,"upset" => 1,"wake" => 1,"wear" => 1,"weave" => 1,"wed" => 1,"weep" => 1,"wind" => 1,"win" => 1,"withhold" => 1,"withstand" => 1,"wring" => 1,"write" => 1);
|
my %irregular_verbs = ();
|
||||||
|
my %words = ();
|
||||||
|
|
||||||
|
my $irreg_eng = "../../english/IrregEng.gf";
|
||||||
|
|
||||||
|
open(IRREG_ENG,"$irreg_eng") or die "Could not open $irreg_eng\n";
|
||||||
|
while (<IRREG_ENG>) {
|
||||||
|
if (s/\s*([a-z\d]+)_V\s*=.*/$1/) {
|
||||||
|
chomp;
|
||||||
|
$irregular_verbs{$_} = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
close IRREG_ENG;
|
||||||
|
|
||||||
|
print "Known irregular verbs from $irreg_eng:\n";
|
||||||
|
print join(",", keys %irregular_verbs) . "\n";
|
||||||
|
|
||||||
|
|
||||||
# skip header section
|
# skip header section
|
||||||
@@ -22,7 +35,7 @@ while ( <STDIN> ) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
# read a line from stdin
|
# read a line from stdin
|
||||||
while ( $line = <STDIN> ) {
|
while ( my $line = <STDIN> ) {
|
||||||
|
|
||||||
# remove SGML tags
|
# remove SGML tags
|
||||||
$line =~ s/<[^<>]+>//g;
|
$line =~ s/<[^<>]+>//g;
|
||||||
@@ -30,8 +43,10 @@ while ( $line = <STDIN> ) {
|
|||||||
# split line into fields according to spec (line may be empty now)
|
# split line into fields according to spec (line may be empty now)
|
||||||
if ( $line =~ /^(.{23}).{23}(.{23}).{1}(.{58})$/ ) {
|
if ( $line =~ /^(.{23}).{23}(.{23}).{1}(.{58})$/ ) {
|
||||||
|
|
||||||
|
my ( $word, $pos, $cat ) = ( $1, $2, $3 );
|
||||||
|
|
||||||
# trim white space
|
# trim white space
|
||||||
for ( ( $word, $pos, $cat ) = ( $1, $2, $3 ) ) {
|
for ( ( $word, $pos, $cat ) ) {
|
||||||
s/\s*$//;
|
s/\s*$//;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -55,7 +70,7 @@ while ( $line = <STDIN> ) {
|
|||||||
# make legal identifier
|
# make legal identifier
|
||||||
# Note: in theory this could cause clashes, but I don't think it does
|
# Note: in theory this could cause clashes, but I don't think it does
|
||||||
# with the OALD.
|
# with the OALD.
|
||||||
$name = $word;
|
my $name = $word;
|
||||||
$name =~ s/ /_/g; # space -> _
|
$name =~ s/ /_/g; # space -> _
|
||||||
$name =~ s/-/_/g; # - -> _
|
$name =~ s/-/_/g; # - -> _
|
||||||
$name =~ s/\./_/g; # . -> _
|
$name =~ s/\./_/g; # . -> _
|
||||||
@@ -63,16 +78,18 @@ while ( $line = <STDIN> ) {
|
|||||||
|
|
||||||
|
|
||||||
# get PoS & subcat info
|
# get PoS & subcat info
|
||||||
@pos = split( /,/, $pos );
|
my @pos = split( /,/, $pos );
|
||||||
$cat =~ s/,/\',\'/g;
|
$cat =~ s/,/\',\'/g;
|
||||||
( $cat = "\'$cat\'" ) unless ( $cat eq '' );
|
( $cat = "\'$cat\'" ) unless ( $cat eq '' );
|
||||||
|
|
||||||
foreach ( @pos ) {
|
foreach ( @pos ) {
|
||||||
( $pcode, $infl, $freq )=split(//);
|
my ( $pcode, $infl, $freq )=split(//);
|
||||||
|
|
||||||
# for verbs, get inflected forms
|
# for verbs, get inflected forms
|
||||||
if ( $pcode =~ /^[GHIJ]/ ) {
|
if ( $pcode =~ /^[GHIJ]/ ) {
|
||||||
$pos = 'verb';
|
$pos = 'verb';
|
||||||
|
my ($vbz, $vbg, $vbd);
|
||||||
|
|
||||||
# if this is a root form, work out the inflected forms
|
# if this is a root form, work out the inflected forms
|
||||||
if ( $infl =~ /^\d/ ) {
|
if ( $infl =~ /^\d/ ) {
|
||||||
if ( $infl == 0 ) {
|
if ( $infl == 0 ) {
|
||||||
@@ -107,7 +124,7 @@ while ( $line = <STDIN> ) {
|
|||||||
$vbd = 'IRREG';
|
$vbd = 'IRREG';
|
||||||
}
|
}
|
||||||
|
|
||||||
$lin = "mkV \"$word\" \"$vbz\" \"$vbd\" \"$vbd\" \"$vbg\"";
|
my $lin = "mkV \"$word\" \"$vbz\" \"$vbd\" \"$vbd\" \"$vbg\"";
|
||||||
|
|
||||||
# try to use a verb from IrregEng
|
# try to use a verb from IrregEng
|
||||||
if ( $infl == 5 ) {
|
if ( $infl == 5 ) {
|
||||||
@@ -138,16 +155,16 @@ while ( $line = <STDIN> ) {
|
|||||||
}
|
}
|
||||||
# if this is an inflected form, save for guessing irregulars later
|
# if this is an inflected form, save for guessing irregulars later
|
||||||
elsif ( $infl =~ /^a/ ) {
|
elsif ( $infl =~ /^a/ ) {
|
||||||
push( @vbz, $word );
|
#push( @vbz, $word );
|
||||||
}
|
}
|
||||||
elsif ( $infl =~ /^b/ ) {
|
elsif ( $infl =~ /^b/ ) {
|
||||||
push( @vbg, $word );
|
#push( @vbg, $word );
|
||||||
}
|
}
|
||||||
elsif ( $infl =~ /^c/ ) {
|
elsif ( $infl =~ /^c/ ) {
|
||||||
push( @vbd, $word );
|
#push( @vbd, $word );
|
||||||
}
|
}
|
||||||
elsif ( $infl =~ /^d/ ) {
|
elsif ( $infl =~ /^d/ ) {
|
||||||
push( @vbn, $word );
|
#push( @vbn, $word );
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
# for nouns, get plural form
|
# for nouns, get plural form
|
||||||
@@ -164,14 +181,14 @@ while ( $line = <STDIN> ) {
|
|||||||
}
|
}
|
||||||
# if this is a singular form, work out plural form
|
# if this is a singular form, work out plural form
|
||||||
unless ( $infl =~ /^j/ ) {
|
unless ( $infl =~ /^j/ ) {
|
||||||
$pl = '-';
|
my $pl = '-';
|
||||||
if ( $infl == 6 ) {
|
if ( $infl eq '6' ) {
|
||||||
( $pl = $word ) =~ s/$/s/;
|
( $pl = $word ) =~ s/$/s/;
|
||||||
}
|
}
|
||||||
elsif ( $infl == 7 ) {
|
elsif ( $infl eq '7' ) {
|
||||||
( $pl = $word ) =~ s/$/es/;
|
( $pl = $word ) =~ s/$/es/;
|
||||||
}
|
}
|
||||||
elsif ( $infl == 8 ) {
|
elsif ( $infl eq '8' ) {
|
||||||
( $pl = $word ) =~ s/y$/ies/;
|
( $pl = $word ) =~ s/y$/ies/;
|
||||||
}
|
}
|
||||||
elsif ( $infl =~ /^[9k\]]/ ) {
|
elsif ( $infl =~ /^[9k\]]/ ) {
|
||||||
@@ -229,6 +246,7 @@ while ( $line = <STDIN> ) {
|
|||||||
$pos = 'adj';
|
$pos = 'adj';
|
||||||
# if this is root form, work out inflected forms
|
# if this is root form, work out inflected forms
|
||||||
unless ( $infl =~ /^[rs]/ ) {
|
unless ( $infl =~ /^[rs]/ ) {
|
||||||
|
my ($comp, $sup);
|
||||||
if ( $infl =~ /^[Apqt]/ ) {
|
if ( $infl =~ /^[Apqt]/ ) {
|
||||||
$comp = $sup = '-';
|
$comp = $sup = '-';
|
||||||
}
|
}
|
||||||
@@ -276,7 +294,7 @@ while ( $line = <STDIN> ) {
|
|||||||
$infl =~ s/^x/normal/;
|
$infl =~ s/^x/normal/;
|
||||||
$infl =~ s/^y/whq/;
|
$infl =~ s/^y/whq/;
|
||||||
$infl =~ s/^z/whrel/;
|
$infl =~ s/^z/whrel/;
|
||||||
$class = '_';
|
my $class = '_';
|
||||||
# reflexive pronouns
|
# reflexive pronouns
|
||||||
if ( ( $word =~ /self$/ ) or
|
if ( ( $word =~ /self$/ ) or
|
||||||
( $word =~ /selves$/ ) ) {
|
( $word =~ /selves$/ ) ) {
|
||||||
@@ -347,8 +365,8 @@ while ( $line = <STDIN> ) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
$absfile = "Oald.gf";
|
my $absfile = "Oald.gf";
|
||||||
$cncfile = "OaldEng.gf";
|
my $cncfile = "OaldEng.gf";
|
||||||
|
|
||||||
open (ABS, '>', $absfile);
|
open (ABS, '>', $absfile);
|
||||||
open (CNC, '>', $cncfile);
|
open (CNC, '>', $cncfile);
|
||||||
@@ -356,8 +374,10 @@ open (CNC, '>', $cncfile);
|
|||||||
|
|
||||||
|
|
||||||
# print a nice comment at the top
|
# print a nice comment at the top
|
||||||
$header = "-- GF lexicon, from OALD machine-readable dictionary\n"
|
my $header = "-- English lexicon for GF, produced from OALD machine-readable dictionary.\n"
|
||||||
. "-- Produced by asc2gf, based on asc2lex, Matthew Purver 11/2001\n\n";
|
. "-- Generated by asc2gf, Bjorn Bringert Nov 2008\n"
|
||||||
|
. "-- based on asc2lex, Matthew Purver Nov 2001\n"
|
||||||
|
. "\n";
|
||||||
print ABS $header;
|
print ABS $header;
|
||||||
print CNC $header;
|
print CNC $header;
|
||||||
|
|
||||||
@@ -365,9 +385,9 @@ print ABS "abstract Oald = Cat ** {\n";
|
|||||||
print CNC "--# -path=.:alltenses\n";
|
print CNC "--# -path=.:alltenses\n";
|
||||||
print CNC "concrete OaldEng of Oald = CatEng ** open ParadigmsEng, IrregEng in {\n";
|
print CNC "concrete OaldEng of Oald = CatEng ** open ParadigmsEng, IrregEng in {\n";
|
||||||
|
|
||||||
foreach $name (sort (keys %words)) {
|
foreach my $name (sort (keys %words)) {
|
||||||
($cat = $name) =~ s/.*_([A-Z][A-Za-z\d]*)$/$1/;
|
(my $cat = $name) =~ s/.*_([A-Z][A-Za-z\d]*)$/$1/;
|
||||||
$lin = $words{$name};
|
my $lin = $words{$name};
|
||||||
print ABS "fun $name : $cat;\n";
|
print ABS "fun $name : $cat;\n";
|
||||||
print CNC "lin $name = $lin;\n";
|
print CNC "lin $name = $lin;\n";
|
||||||
}
|
}
|
||||||
@@ -380,7 +400,6 @@ close(CNC);
|
|||||||
|
|
||||||
print "\nWrote lexicon to $absfile and $cncfile\n";
|
print "\nWrote lexicon to $absfile and $cncfile\n";
|
||||||
|
|
||||||
exit 0;
|
|
||||||
|
|
||||||
|
|
||||||
sub add_word {
|
sub add_word {
|
||||||
@@ -391,52 +410,3 @@ sub add_word {
|
|||||||
$words{$name} = $lin;
|
$words{$name} = $lin;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# now have a guess at irregular verb forms (marking the best guess with '*')
|
|
||||||
foreach $verb ( @verb ) {
|
|
||||||
if ( $verb =~ /verb\( \'([^\']+)\', \'IRREG/ ) {
|
|
||||||
$word = $1;
|
|
||||||
$vbz = findbest( $word, @vbz );
|
|
||||||
$vbg = findbest( $word, @vbg );
|
|
||||||
$vbd = findbest( $word, @vbd );
|
|
||||||
$vbn = findbest( $word, @vbn );
|
|
||||||
$verb =~ s/($word\', \')IRREG(\', \')IRREG(\', \')IRREG(\', \')IRREG/\*$1$vbz$2$vbg$3$vbd$4$vbn/;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
# now print everything out (so we can group PoSs together)
|
|
||||||
print @verb, "\n", @noun, "\n", @adj, "\n", @adv;
|
|
||||||
print "\n", @pron, "\n", @det, "\n", @prep, "\n", @conj;
|
|
||||||
print "\n", @prefix, "\n", @interj, "\n", @partcl, "\n", @unknown;
|
|
||||||
|
|
||||||
|
|
||||||
# find closest string match
|
|
||||||
# similarity measure is just the length of identical prefix
|
|
||||||
# prefer shorter strings in the case of equal similarity
|
|
||||||
sub findbest
|
|
||||||
{
|
|
||||||
my ( $word, @array ) = @_;
|
|
||||||
|
|
||||||
$bestlen = 0;
|
|
||||||
foreach $test ( @array ) {
|
|
||||||
if ( ( substr( $word, 0, $bestlen-1 ) eq substr( $test, 0, $bestlen-1 ) ) &&
|
|
||||||
( length( $test ) < length( $best ) ) ) {
|
|
||||||
$best = $test;
|
|
||||||
}
|
|
||||||
while ( ( substr( $word, 0, $bestlen ) eq substr( $test, 0, $bestlen ) ) &&
|
|
||||||
( $bestlen <= length( $test ) ) ) {
|
|
||||||
$bestlen++;
|
|
||||||
$best = $test;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return $best;
|
|
||||||
}
|
|
||||||
|
|||||||
Reference in New Issue
Block a user