forked from GitHub/gf-core
First working version of OALD dictionary conversion. Still missing: irregular verbs, irregular nouns, better handling of singular- and plural-only nouns.
This commit is contained in:
@@ -28,15 +28,10 @@ while ( $line = <STDIN> ) {
|
|||||||
s/\s*$//;
|
s/\s*$//;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( $word =~ /^'/ ) {
|
|
||||||
print STDERR "Ignoring: \"$word\"\n";
|
|
||||||
next;
|
|
||||||
}
|
|
||||||
|
|
||||||
# make word lower-case
|
# make word lower-case
|
||||||
$word =~ tr/A-Z/a-z/; # lower case
|
$word =~ tr/A-Z/a-z/; # lower case
|
||||||
|
|
||||||
# move diacritics to the following letter
|
# translate OALD diacritics
|
||||||
$word =~ s/~n/ñ/g;
|
$word =~ s/~n/ñ/g;
|
||||||
$word =~ s/<c/ç/g;
|
$word =~ s/<c/ç/g;
|
||||||
$word =~ s/"a/ä/g;
|
$word =~ s/"a/ä/g;
|
||||||
@@ -51,10 +46,13 @@ while ( $line = <STDIN> ) {
|
|||||||
$word =~ s/_e/é/g;
|
$word =~ s/_e/é/g;
|
||||||
|
|
||||||
# make legal identifier
|
# make legal identifier
|
||||||
|
# Note: in theory this could cause clashes, but I don't think it does
|
||||||
|
# with the OALD.
|
||||||
$name = $word;
|
$name = $word;
|
||||||
$name =~ s/ /_/g; # space -> _
|
$name =~ s/ /_/g; # space -> _
|
||||||
$name =~ s/-/_/g; # - -> _
|
$name =~ s/-/_/g; # - -> _
|
||||||
$name =~ s/\./_/g; # . -> _
|
$name =~ s/\./_/g; # . -> _
|
||||||
|
$name =~ s/^'//; # drop initial '
|
||||||
|
|
||||||
|
|
||||||
# get PoS & subcat info
|
# get PoS & subcat info
|
||||||
@@ -62,7 +60,6 @@ while ( $line = <STDIN> ) {
|
|||||||
$cat =~ s/,/\',\'/g;
|
$cat =~ s/,/\',\'/g;
|
||||||
( $cat = "\'$cat\'" ) unless ( $cat eq '' );
|
( $cat = "\'$cat\'" ) unless ( $cat eq '' );
|
||||||
|
|
||||||
# set up Prolog-style string & put into array
|
|
||||||
foreach ( @pos ) {
|
foreach ( @pos ) {
|
||||||
( $pcode, $infl, $freq )=split(//);
|
( $pcode, $infl, $freq )=split(//);
|
||||||
|
|
||||||
@@ -106,7 +103,8 @@ while ( $line = <STDIN> ) {
|
|||||||
$lin = "mkV \"$word\" \"$vbz\" \"$vbd\" \"$vbd\" \"$vbg\"";
|
$lin = "mkV \"$word\" \"$vbz\" \"$vbd\" \"$vbd\" \"$vbg\"";
|
||||||
|
|
||||||
if ($pcode eq 'G') {
|
if ($pcode eq 'G') {
|
||||||
add_word("${name}_VX", "mkVX ($lin)");
|
#add_word("${name}_VX", "mkVX ($lin)");
|
||||||
|
print STDERR "Ignoring anomalous verb: $name\n";
|
||||||
}
|
}
|
||||||
if ($pcode eq 'I' || $pcode eq 'J') {
|
if ($pcode eq 'I' || $pcode eq 'J') {
|
||||||
add_word("${name}_V", "$lin");
|
add_word("${name}_V", "$lin");
|
||||||
@@ -184,7 +182,16 @@ while ( $line = <STDIN> ) {
|
|||||||
$word = '-';
|
$word = '-';
|
||||||
}
|
}
|
||||||
( $infl =~ s/^[:l]/per/ ) or ( $infl =~ s/^[mn]/loc/ ) or ( $infl = '_' );
|
( $infl =~ s/^[:l]/per/ ) or ( $infl =~ s/^[mn]/loc/ ) or ( $infl = '_' );
|
||||||
add_word("${name}_N", "mkN \"$word\" \"$pl\"");
|
|
||||||
|
if ( $pcode eq 'proper' ) {
|
||||||
|
add_word("${name}_PN", "mkPN \"$word\"");
|
||||||
|
} elsif ( $word eq '-' ) {
|
||||||
|
add_word("${name}_N", "mkN \"$word\" \"$pl\" {- FIXME: no singular form -}");
|
||||||
|
} elsif ( $pl eq '-' ) {
|
||||||
|
add_word("${name}_N", "mkN \"$word\" {- FIXME: no plural form -}");
|
||||||
|
} else {
|
||||||
|
add_word("${name}_N", "mkN \"$word\" \"$pl\"");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
# for adjectives, get comparative & superlative forms
|
# for adjectives, get comparative & superlative forms
|
||||||
@@ -218,10 +225,14 @@ while ( $line = <STDIN> ) {
|
|||||||
$infl =~ s/^q/attr/;
|
$infl =~ s/^q/attr/;
|
||||||
$infl =~ s/^t/affix/;
|
$infl =~ s/^t/affix/;
|
||||||
|
|
||||||
add_word("${name}_A", "mkA \"$word\" \"$comp\"");
|
if ( $comp eq '-' ) {
|
||||||
|
add_word("${name}_A", "compoundA (mkA \"$word\")");
|
||||||
|
} else {
|
||||||
|
add_word("${name}_A", "mkA \"$word\" \"$comp\"");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
# for adverbs, just add all info to @adv array
|
# adverb
|
||||||
elsif( $pcode =~ /^P/ ) {
|
elsif( $pcode =~ /^P/ ) {
|
||||||
$pos = 'adv';
|
$pos = 'adv';
|
||||||
$infl =~ s/^[u\+]/normal/;
|
$infl =~ s/^[u\+]/normal/;
|
||||||
@@ -229,7 +240,7 @@ while ( $line = <STDIN> ) {
|
|||||||
$infl =~ s/^v/whq/;
|
$infl =~ s/^v/whq/;
|
||||||
add_word("${name}_Adv", "mkAdv \"$word\"");
|
add_word("${name}_Adv", "mkAdv \"$word\"");
|
||||||
}
|
}
|
||||||
# for pronouns, work out some case/person info
|
# pronoun
|
||||||
elsif( $pcode =~ s/^Q/_/ ) {
|
elsif( $pcode =~ s/^Q/_/ ) {
|
||||||
$pos = 'pron';
|
$pos = 'pron';
|
||||||
$infl =~ s/^x/normal/;
|
$infl =~ s/^x/normal/;
|
||||||
@@ -282,17 +293,17 @@ while ( $line = <STDIN> ) {
|
|||||||
$pos = 'det';
|
$pos = 'det';
|
||||||
$pcode =~ s/^R/def/;
|
$pcode =~ s/^R/def/;
|
||||||
$pcode =~ s/^S/indef/;
|
$pcode =~ s/^S/indef/;
|
||||||
#push( @det, "$pos( \'$word\', $pcode, _ ).\n" );
|
#add_word("${name}_Det","mkDeterminer \"$word\"");
|
||||||
}
|
}
|
||||||
# for prepositions - nothing to say
|
# for prepositions - nothing to say
|
||||||
elsif( $pcode =~ s/^T/prep/ ) {
|
elsif( $pcode =~ s/^T/prep/ ) {
|
||||||
$pos = 'prep';
|
$pos = 'prep';
|
||||||
#push( @prep, "$pos( \'$word\', $pcode ).\n" );
|
add_word("${name}_Prep","mkPrep \"$word\"");
|
||||||
}
|
}
|
||||||
# for conjunctions - nothing to say
|
# for conjunctions - nothing to say
|
||||||
elsif( $pcode =~ s/^V/conj/ ) {
|
elsif( $pcode =~ s/^V/conj/ ) {
|
||||||
$pos = 'conj';
|
$pos = 'conj';
|
||||||
#push( @conj, "$pos( \'$word\', $pcode ).\n" );
|
add_word("${name}_Conj","mkConj \"$word\"");
|
||||||
}
|
}
|
||||||
# for miscellaneous, leave '-' as placeholder for illocutionary info
|
# for miscellaneous, leave '-' as placeholder for illocutionary info
|
||||||
elsif( $pcode =~ /^[UWXZ]/ ) {
|
elsif( $pcode =~ /^[UWXZ]/ ) {
|
||||||
@@ -325,7 +336,7 @@ print CNC "--# -path=.:alltenses\n";
|
|||||||
print CNC "concrete OaldEng of Oald = CatEng ** open ParadigmsEng in {\n";
|
print CNC "concrete OaldEng of Oald = CatEng ** open ParadigmsEng in {\n";
|
||||||
|
|
||||||
foreach $name (sort (keys %words)) {
|
foreach $name (sort (keys %words)) {
|
||||||
($cat = $name) =~ s/.*_([A-Z\d])$/$1/;
|
($cat = $name) =~ s/.*_([A-Z][A-Za-z\d]*)$/$1/;
|
||||||
$lin = $words{$name};
|
$lin = $words{$name};
|
||||||
print ABS "fun $name : $cat;\n";
|
print ABS "fun $name : $cat;\n";
|
||||||
print CNC "lin $name = $lin;\n";
|
print CNC "lin $name = $lin;\n";
|
||||||
|
|||||||
Reference in New Issue
Block a user