diff --git a/lib/src/finnish/stemmed/DictEngFin.gf b/lib/src/finnish/stemmed/DictEngFin.gf index 535826bad..5b0cf359f 100644 --- a/lib/src/finnish/stemmed/DictEngFin.gf +++ b/lib/src/finnish/stemmed/DictEngFin.gf @@ -5624,8 +5624,7 @@ bitch_V = mkWV (k53A "valittaa") ; bitchery_N = mkWN "narttumaisuus" ; bitchy_A = mkWA (k15 "ilkeä") ; bite_N = mkWN (k9 "pala") ; -bite_V = mkWV (k52A "pureutua") ; ---PLURNOUN --PREDEF bite_V2 = mkWV2 (k52A "pureutua") ; +bite_V = mkV "purra" ; biter_N = mkWN (k12 "purija") ; bitewing_N = mkWN (compoundN "hammas" (k5 "röntgen")) ; biting_A = mkWA (k10 "pureva") ; @@ -6350,14 +6349,14 @@ boreal_A = mkWA (compoundA "borea" (k38 "alinen")) ; boredom_N = mkWN (k40 "ikävystyttävyys") ; borer_N = mkWN (k34A "selkärangaton") ; boric_A = mkWA "boori-" ; -boring_A = mkWA (k10 "rasittava") ; +boring_A = mkA "ikävystyttävä" ; boring_N = mkWN (k39 "poraus") ; boringness_N = mkWN (k40 "ikävystyttävyys") ; bornite_N = mkWN (compoundN "kupari" (k1 "kiisu")) "kirjava" ; boron_N = mkWN (k5 "boori") ; boronic_A = mkWA "boori-" ; borosilicate_N = mkWN (compoundN "boro" (k5A "silikaatti")) ; -borough_N = mkWN (k5A "kaupunki") ; +borough_N = mkN "kauppala" ; borrelia_N = mkWN "Borrelia" ; borrow_V = mkWV (k53A "ottaa") "lainaksi" ; borrow_V2 = mkWV2 (k53A "ottaa") "lainaksi" ; @@ -7824,12 +7823,12 @@ canary_wine_N = mkWN (kH1 "viini") "Kanarian" ; canasta_N = mkWN (k9 "canasta") ; canavanine_N = mkWN (compoundN "kanava" (k26 "niini")) ; cancan_N = mkWN (k5 "cancan") ; -cancel_V = mkWV (k74 "kumota") ; +cancel_V = mkV "peruuttaa" ; --MANUAL cancel_V2 = mkWV2 (k74 "kumota") ; cancel_out_V2 = mkWV2 (k62 "neutraloida") ; cancellate_A = mkWA (k38 "hohkainen") ; cancellation_N = mkWN (k39 "peruutus") ; -cancer_N = mkWN (compoundN "Cancer-" (k1A "suku")) ; +cancer_N = mkN "syöpä" ; cancerous_A = mkWA (k38 "syöpäinen") ; cancerweed_N = mkWN (compoundN "lyyra" (k12 "salvia")) ; cancroid_A = mkWA (compoundA "syöpä" (k38 "mäinen")) ; @@ -11288,8 +11287,8 @@ 
conspire_V2V = mkWV2V (k67A "juonitella") ; constable_N = mkWN (k6 "konstaapeli") ; constabulary_N = mkWN (k6 "poliisi") ; constancy_N = mkWN (k40 "muuttumattomuus") ; -constant_A = mkWA (k38 "uskollinen") ; -constant_N = mkWN (k48 "suure") "muuttumaton" ; +constant_A = mkA "jatkuva" ; +constant_N = mkN "vakio" ; --PLURNOUN --POSTPONE constantan_N = mkWN (compoundN "konstanta" (k99 "ani")) ; constellation_N = mkWN (k10 "asetelma") ; consternation_N = mkWN (k39 "tyrmistys") ; @@ -20230,9 +20229,9 @@ forge_V = mkWV (k73A "kekata") ; forge_V2 = mkWV2 (k73A "kekata") ; forger_N = mkWN (k10 "väärentäjä") ; forgery_N = mkWN (k39 "väärennys") ; -forget_V = mkWV (k67 "olla") "muistamatta" ; +forget_V = mkV "unohtaa" ; --PLURNOUN --PREDEF forget_V2 = mkWV2 (k67 "olla") "muistamatta" ; -forget_VS = mkWVS (k67 "olla") "muistamatta" ; +forget_VS = mkV "unohtaa" ; forgetful_A = mkWA (compoundA "haja" (k38 "mielinen")) ; forgetfulness_N = mkWN "huonomuistisuus" ; forgettable_A = mkWA "helposti" "unohdettava" ; @@ -20704,9 +20703,9 @@ fryer_N = mkWN (k4A "nuorikko") ; frying_N = mkWN (compoundN "pai" (k38 "staminen")) ; frying_pan_N = mkWN (compoundN "paistin" (k1 "pannu")) ; fuchsia_N = mkWN "fuksia" ; -fuck_N = mkWN (k5 "seksi") ; -fuck_V = mkWV (k63 "saada") ; -fuck_V2 = mkWV2 (k63 "saada") ; +fuck_N = mkN "pano" ; +fuck_V = mkV "naida" ; +fuck_V2 = mkV2 "naida" partitive ; fuck_all_N = mkWN (k99 "yhtään") "ei" ; fucker_N = mkWN (k38 "paskiainen") ; fucking_Adv = mkWAdv "vitun" ; @@ -20810,17 +20809,17 @@ funnel_N = mkWN (compoundN "savu" (k10 "kanava")) ; --PLURNOUN --POSTPONE funnel_V = mkWV (k54A "siirtää") "suppilon" "kautta" ; --PLURNOUN --POSTPONE funnel_V2 = mkWV2 (k54A "siirtää") "suppilon" "kautta" ; funniness_N = mkWN (k40 "hauskuus") ; -funny_A = mkWA (k38 "poikkeuksellinen") ; +funny_A = mkA "hauska" ; funny_bone_N = mkWN (compoundN "kiukku" (k26 "suoni")) ; funrun_N = mkWN (compoundN "hyväntekeväisyys" (k1 "juoksu")) ; -fur_N = mkWN (k9 "karva") ; +fur_N = mkN 
"turkis" ; --PLURNOUN --POSTPONE furan_N = mkWN (compoundN "fura" (k99 "ani")) ; furbelow_N = mkWN (k2 "röyhelö") ; furbish_V2 = mkWV2 (k53A "kiillottaa") ; furcation_N = mkWN (k10 "haarauma") ; furcula_N = mkWN (compoundN "hanka" (k18 "luu")) ; --PLURNOUN --POSTPONE furfural_N = mkWN (compoundN "furfura" (k99 "ali")) ; -furious_A = mkWA (k99 "suunniltaan") ; +furious_A = mkA "raivokas" ; furl_V = mkWV (k61 "kääriä") ; furl_V2 = mkWV2 (k61 "kääriä") ; furlike_A = mkWA (compoundA "turkis" (k38 "mainen")) ; @@ -23067,10 +23066,10 @@ haply_Adv = mkWAdv (k99 "sattumalta") ; --MANUAL10_06 happen_V = mkWV (k65 "käydä") ; happen_V2 = mkWV2 (k65 "käydä") ; --MANUALVV happen_VV = mkWVV (k65 "käydä") ; -happening_N = mkWN (k39 "tapaus") ; -happiness_N = mkWN (k1 "ilo") ; ---MANUAL10 -- happy_A a NOT_IN_KOTUS -haptic_A = mkWA "tunto-" ; +happening_N = mkN "tapahtuma" ; +happiness_N = mkN "onnellisuus" ; +happy_A = mkA "onnellinen" ; +haptic_A = mkA "tuntoonperustuva" ; haptoglobin_N = mkWN "haptoglobiini" ; harakiri_N = mkWN (k5 "harakiri") ; harangue_N = mkWN (k39 "vuodatus") ; @@ -23520,7 +23519,7 @@ heliotropism_N = mkWN (compoundN "helio" (k5 "tropismi")) ; heliport_N = mkWN (compoundN "helikopteri" (k10A "kenttä")) ; helium_N = mkWN (k5 "helium") ; helix_N = mkWN (k2 "kotilo") ; -hell_N = mkWN (compoundN "melu" (k38 "aminen")) ; +hell_N = mkN "helvetti" ; hellbender_N = mkWN (compoundN "lieju" (k1 "piru")) ; hellcat_N = mkWN (k10 "ämmä") "pahansisuinen" ; hellebore_N = mkWN (compoundN "joulu" (k1 "ruusu")) ; @@ -27918,7 +27917,7 @@ knothole_N = mkWN (compoundN "oksan" (k10A "reikä")) ; knotty_A = mkWA (k38 "sotkuinen") ; knout_N = mkWN (k10 "ruoska") ; know_N = mkWN (k40 "tietoisuus") ; -know_V = mkWV (k54A "myöntää") ; +know_V = mkV "tietää" ; --PLURNOUN --PREDEF know_V2 = mkWV2 (k54A "myöntää") ; know_V2V = mkWV2V (k54A "myöntää") ; --PLURNOUN --PREDEF know_VQ = mkWVQ (k54A "myöntää") ; @@ -29569,7 +29568,7 @@ look_N = mkWN (k48 "katse") ; --PLURNOUN --PREDEF look_V = 
mkWV (k53A "näyttää") "jltak" ; look_V2 = mkWV2 (k53A "näyttää") "jltak" ; look_V2V = mkWV2V (k53A "näyttää") "jltak" ; -look_VA = mkWVA (k53A "näyttää") "jltak" ; +look_VA = mkVA (mkV "näyttää") (mkPrep ablative) ; --MANUALVV look_VV = mkWVV (k53A "näyttää") "jltak" ; look_after_V2 = mkWV2 (k53A "pitää") "huolta" ; --PLURNOUN --POSTPONE look_around_V = mkWV (k67 "katsella") "ympärilleen" "jssak" ; diff --git a/lib/src/finnish/stemmed/log.txt b/lib/src/finnish/stemmed/log.txt index c846dfb5b..0612f3a1e 100644 --- a/lib/src/finnish/stemmed/log.txt +++ b/lib/src/finnish/stemmed/log.txt @@ -134,7 +134,7 @@ separate from "ole" ("ottamaan", not "otamaan") and from "ovat" (*"omaan"). Received a corrected corpus from Krasimir, with weekdays and months recognized. This changes 100 translations. Now at version 13-eng-fin-wsj.txt, working with penn/wsj-3220/corr-wsj.full. -Dictionary revision: 368 words with 5--3 occurrences, 140 changed in 30 minutes. Effect on 425 translations. +Dictionary revision: 368 words with 5--4 occurrences, 150 changed in 30 minutes. Effect on 425 translations. It feels that FiWN - or maybe the method we have used it? - is not the optimal source, as the translations we get are often unusual translations, and even strange words. For instance, pay_N = "liksa", a slang word. Now at version 14. Work done: @@ -143,6 +143,52 @@ Now at version 14. Work done: - 10 hours fixing RGL +1/4 + +Calculation of returns +- 22403 lemma tokens +- 4333 lemma types +- 390 types with 10 occurrences or more +- 61 % of tokens covered by these +- Going down from 10: (k=occs, n=lemmas with k occs, k*n) + +(9,58,522), +(8,52,416), +(7,87,609), +(6,118,708), +(5,169,845), +(4,200,800), +(3,388,1164), +(2,745,1490), +(1,2126,2126) + +Thus by covering >3 we now cover 79%. >2 is 84%, and >1 is 91%. >1 means 51% of the lemmas. + +That is, we need to revise 2100 words to achieve 90% accuracy. Revision taking 1h/600 words (with 50% OK) +means 3.5h work. 
Maybe 8h work for all 4333 lemmas. + +Analysed the whole log4.txt. Statistics of types of metas: + +NP 25369 +A 12837 +N 11191 +S 3961 +Quant -> N -> NP 3609 +N -> NP 3193 +Prep -> S -> Adv 2581 +NP -> VP -> S 2184 +AP 2176 +NP -> VPSlash -> NP 1680 +S -> NP -> VP -> S 1635 + + +Etc. 14,718 different types. Many of those could be dealt with by padding with nullables and coercions. + + Quant -> N -> NP ===> \quant, n -> DetCN (DetQuant quant NumSg) (UseN n) + +Also tried linearization by chunks, defined as maximal fun-headed subtrees. Quite similar to +smoothing with shorter n-grams one could say. Long-distance agreements lost, but chunks make sense. +