scattered DictEngFin improvements

This commit is contained in:
aarne
2013-04-02 06:32:52 +00:00
parent 9f5a5ec130
commit 3d236a077a
2 changed files with 69 additions and 24 deletions

View File

@@ -5624,8 +5624,7 @@ bitch_V = mkWV (k53A "valittaa") ;
bitchery_N = mkWN "narttumaisuus" ;
bitchy_A = mkWA (k15 "ilkeä") ;
bite_N = mkWN (k9 "pala") ;
bite_V = mkWV (k52A "pureutua") ;
--PLURNOUN --PREDEF bite_V2 = mkWV2 (k52A "pureutua") ;
bite_V = mkV "purra" ;
biter_N = mkWN (k12 "purija") ;
bitewing_N = mkWN (compoundN "hammas" (k5 "röntgen")) ;
biting_A = mkWA (k10 "pureva") ;
@@ -6350,14 +6349,14 @@ boreal_A = mkWA (compoundA "borea" (k38 "alinen")) ;
boredom_N = mkWN (k40 "ikävystyttävyys") ;
borer_N = mkWN (k34A "selkärangaton") ;
boric_A = mkWA "boori-" ;
boring_A = mkWA (k10 "rasittava") ;
boring_A = mkA "ikävystyttävä" ;
boring_N = mkWN (k39 "poraus") ;
boringness_N = mkWN (k40 "ikävystyttävyys") ;
bornite_N = mkWN (compoundN "kupari" (k1 "kiisu")) "kirjava" ;
boron_N = mkWN (k5 "boori") ;
boronic_A = mkWA "boori-" ;
borosilicate_N = mkWN (compoundN "boro" (k5A "silikaatti")) ;
borough_N = mkWN (k5A "kaupunki") ;
borough_N = mkN "kauppala" ;
borrelia_N = mkWN "Borrelia" ;
borrow_V = mkWV (k53A "ottaa") "lainaksi" ;
borrow_V2 = mkWV2 (k53A "ottaa") "lainaksi" ;
@@ -7824,12 +7823,12 @@ canary_wine_N = mkWN (kH1 "viini") "Kanarian" ;
canasta_N = mkWN (k9 "canasta") ;
canavanine_N = mkWN (compoundN "kanava" (k26 "niini")) ;
cancan_N = mkWN (k5 "cancan") ;
cancel_V = mkWV (k74 "kumota") ;
cancel_V = mkV "peruuttaa" ;
--MANUAL cancel_V2 = mkWV2 (k74 "kumota") ;
cancel_out_V2 = mkWV2 (k62 "neutraloida") ;
cancellate_A = mkWA (k38 "hohkainen") ;
cancellation_N = mkWN (k39 "peruutus") ;
cancer_N = mkWN (compoundN "Cancer-" (k1A "suku")) ;
cancer_N = mkN "syöpä" ;
cancerous_A = mkWA (k38 "syöpäinen") ;
cancerweed_N = mkWN (compoundN "lyyra" (k12 "salvia")) ;
cancroid_A = mkWA (compoundA "syöpä" (k38 "mäinen")) ;
@@ -11288,8 +11287,8 @@ conspire_V2V = mkWV2V (k67A "juonitella") ;
constable_N = mkWN (k6 "konstaapeli") ;
constabulary_N = mkWN (k6 "poliisi") ;
constancy_N = mkWN (k40 "muuttumattomuus") ;
constant_A = mkWA (k38 "uskollinen") ;
constant_N = mkWN (k48 "suure") "muuttumaton" ;
constant_A = mkA "jatkuva" ;
constant_N = mkN "vakio" ;
--PLURNOUN --POSTPONE constantan_N = mkWN (compoundN "konstanta" (k99 "ani")) ;
constellation_N = mkWN (k10 "asetelma") ;
consternation_N = mkWN (k39 "tyrmistys") ;
@@ -20230,9 +20229,9 @@ forge_V = mkWV (k73A "kekata") ;
forge_V2 = mkWV2 (k73A "kekata") ;
forger_N = mkWN (k10 "väärentäjä") ;
forgery_N = mkWN (k39 "väärennys") ;
forget_V = mkWV (k67 "olla") "muistamatta" ;
forget_V = mkV "unohtaa" ;
--PLURNOUN --PREDEF forget_V2 = mkWV2 (k67 "olla") "muistamatta" ;
forget_VS = mkWVS (k67 "olla") "muistamatta" ;
forget_VS = mkV "unohtaa" ;
forgetful_A = mkWA (compoundA "haja" (k38 "mielinen")) ;
forgetfulness_N = mkWN "huonomuistisuus" ;
forgettable_A = mkWA "helposti" "unohdettava" ;
@@ -20704,9 +20703,9 @@ fryer_N = mkWN (k4A "nuorikko") ;
frying_N = mkWN (compoundN "pai" (k38 "staminen")) ;
frying_pan_N = mkWN (compoundN "paistin" (k1 "pannu")) ;
fuchsia_N = mkWN "fuksia" ;
fuck_N = mkWN (k5 "seksi") ;
fuck_V = mkWV (k63 "saada") ;
fuck_V2 = mkWV2 (k63 "saada") ;
fuck_N = mkN "pano" ;
fuck_V = mkV "naida" ;
fuck_V2 = mkV2 "naida" partitive ;
fuck_all_N = mkWN (k99 "yhtään") "ei" ;
fucker_N = mkWN (k38 "paskiainen") ;
fucking_Adv = mkWAdv "vitun" ;
@@ -20810,17 +20809,17 @@ funnel_N = mkWN (compoundN "savu" (k10 "kanava")) ;
--PLURNOUN --POSTPONE funnel_V = mkWV (k54A "siirtää") "suppilon" "kautta" ;
--PLURNOUN --POSTPONE funnel_V2 = mkWV2 (k54A "siirtää") "suppilon" "kautta" ;
funniness_N = mkWN (k40 "hauskuus") ;
funny_A = mkWA (k38 "poikkeuksellinen") ;
funny_A = mkA "hauska" ;
funny_bone_N = mkWN (compoundN "kiukku" (k26 "suoni")) ;
funrun_N = mkWN (compoundN "hyväntekeväisyys" (k1 "juoksu")) ;
fur_N = mkWN (k9 "karva") ;
fur_N = mkN "turkis" ;
--PLURNOUN --POSTPONE furan_N = mkWN (compoundN "fura" (k99 "ani")) ;
furbelow_N = mkWN (k2 "röyhelö") ;
furbish_V2 = mkWV2 (k53A "kiillottaa") ;
furcation_N = mkWN (k10 "haarauma") ;
furcula_N = mkWN (compoundN "hanka" (k18 "luu")) ;
--PLURNOUN --POSTPONE furfural_N = mkWN (compoundN "furfura" (k99 "ali")) ;
furious_A = mkWA (k99 "suunniltaan") ;
furious_A = mkA "raivokas" ;
furl_V = mkWV (k61 "kääriä") ;
furl_V2 = mkWV2 (k61 "kääriä") ;
furlike_A = mkWA (compoundA "turkis" (k38 "mainen")) ;
@@ -23067,10 +23066,10 @@ haply_Adv = mkWAdv (k99 "sattumalta") ;
--MANUAL10_06 happen_V = mkWV (k65 "käydä") ;
happen_V2 = mkWV2 (k65 "käydä") ;
--MANUALVV happen_VV = mkWVV (k65 "käydä") ;
happening_N = mkWN (k39 "tapaus") ;
happiness_N = mkWN (k1 "ilo") ;
--MANUAL10 -- happy_A a NOT_IN_KOTUS
haptic_A = mkWA "tunto-" ;
happening_N = mkN "tapahtuma" ;
happiness_N = mkN "onnellisuus" ;
happy_A = mkA "onnellinen" ;
haptic_A = mkA "tuntoonperustuva" ;
haptoglobin_N = mkWN "haptoglobiini" ;
harakiri_N = mkWN (k5 "harakiri") ;
harangue_N = mkWN (k39 "vuodatus") ;
@@ -23520,7 +23519,7 @@ heliotropism_N = mkWN (compoundN "helio" (k5 "tropismi")) ;
heliport_N = mkWN (compoundN "helikopteri" (k10A "kenttä")) ;
helium_N = mkWN (k5 "helium") ;
helix_N = mkWN (k2 "kotilo") ;
hell_N = mkWN (compoundN "melu" (k38 "aminen")) ;
hell_N = mkN "helvetti" ;
hellbender_N = mkWN (compoundN "lieju" (k1 "piru")) ;
hellcat_N = mkWN (k10 "ämmä") "pahansisuinen" ;
hellebore_N = mkWN (compoundN "joulu" (k1 "ruusu")) ;
@@ -27918,7 +27917,7 @@ knothole_N = mkWN (compoundN "oksan" (k10A "reikä")) ;
knotty_A = mkWA (k38 "sotkuinen") ;
knout_N = mkWN (k10 "ruoska") ;
know_N = mkWN (k40 "tietoisuus") ;
know_V = mkWV (k54A "myöntää") ;
know_V = mkV "tietää" ;
--PLURNOUN --PREDEF know_V2 = mkWV2 (k54A "myöntää") ;
know_V2V = mkWV2V (k54A "myöntää") ;
--PLURNOUN --PREDEF know_VQ = mkWVQ (k54A "myöntää") ;
@@ -29569,7 +29568,7 @@ look_N = mkWN (k48 "katse") ;
--PLURNOUN --PREDEF look_V = mkWV (k53A "näyttää") "jltak" ;
look_V2 = mkWV2 (k53A "näyttää") "jltak" ;
look_V2V = mkWV2V (k53A "näyttää") "jltak" ;
look_VA = mkWVA (k53A "näyttää") "jltak" ;
look_VA = mkVA (mkV "näyttää") (mkPrep ablative) ;
--MANUALVV look_VV = mkWVV (k53A "näyttää") "jltak" ;
look_after_V2 = mkWV2 (k53A "pitää") "huolta" ;
--PLURNOUN --POSTPONE look_around_V = mkWV (k67 "katsella") "ympärilleen" "jssak" ;

View File

@@ -134,7 +134,7 @@ separate from "ole" ("ottamaan", not "otamaan") and from "ovat" (*"omaan").
Received a corrected corpus from Krasimir, with weekdays and months recognized. This changes 100 translations.
Now at version 13-eng-fin-wsj.txt, working with penn/wsj-3220/corr-wsj.full.
Dictionary revision: 368 words with 5--3 occurrences, 140 changed in 30 minutes. Effect on 425 translations.
Dictionary revision: 368 words with 5--4 occurrences, 150 changed in 30 minutes. Effect on 425 translations.
It feels that FiWN - or maybe the way we have used it - is not the optimal source, as the translations
we get are often unusual, and sometimes even strange words. For instance, pay_N = "liksa", a slang word.
Now at version 14. Work done:
@@ -143,6 +143,52 @@ Now at version 14. Work done:
- 10 hours fixing RGL
1/4
Calculation of returns
- 22403 lemma tokens
- 4333 lemma types
- 390 types with 10 occurrences or more
- 61 % of tokens covered by these
- Going down from 10: (k=occs, n=lemmas with k occs, k*n)
(9,58,522),
(8,52,416),
(7,87,609),
(6,118,708),
(5,169,845),
(4,200,800),
(3,388,1164),
(2,745,1490),
(1,2126,2126)
Thus by covering lemmas with >3 occurrences we now cover 79% of the tokens. >2 gives 84%, and >1 gives 91%. >1 means 51% of the lemma types.
That is, we need to revise about 2100 words to achieve 90% accuracy. Revision taking 1h/600 words (with 50% already OK)
means 3.5h of work. Maybe 8h of work for all 4333 lemmas.
Analysed the whole log4.txt. Statistics of types of metas:
NP 25369
A 12837
N 11191
S 3961
Quant -> N -> NP 3609
N -> NP 3193
Prep -> S -> Adv 2581
NP -> VP -> S 2184
AP 2176
NP -> VPSlash -> NP 1680
S -> NP -> VP -> S 1635
Etc. 14,718 different types. Many of those could be dealt with by padding with nullables and coercions.
Quant -> N -> NP ===> \quant, n -> DetCN (DetQuant q NumSg) (UseN n)
Also tried linearization by chunks, defined as maximal fun-headed subtrees. Quite similar to
smoothing with shorter n-grams, one could say. Long-distance agreement is lost, but the chunks make sense.