mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-16 16:29:32 -06:00
scattered DictEngFin improvements
This commit is contained in:
@@ -5624,8 +5624,7 @@ bitch_V = mkWV (k53A "valittaa") ;
|
||||
bitchery_N = mkWN "narttumaisuus" ;
|
||||
bitchy_A = mkWA (k15 "ilkeä") ;
|
||||
bite_N = mkWN (k9 "pala") ;
|
||||
bite_V = mkWV (k52A "pureutua") ;
|
||||
--PLURNOUN --PREDEF bite_V2 = mkWV2 (k52A "pureutua") ;
|
||||
bite_V = mkV "purra" ;
|
||||
biter_N = mkWN (k12 "purija") ;
|
||||
bitewing_N = mkWN (compoundN "hammas" (k5 "röntgen")) ;
|
||||
biting_A = mkWA (k10 "pureva") ;
|
||||
@@ -6350,14 +6349,14 @@ boreal_A = mkWA (compoundA "borea" (k38 "alinen")) ;
|
||||
boredom_N = mkWN (k40 "ikävystyttävyys") ;
|
||||
borer_N = mkWN (k34A "selkärangaton") ;
|
||||
boric_A = mkWA "boori-" ;
|
||||
boring_A = mkWA (k10 "rasittava") ;
|
||||
boring_A = mkA "ikävystyttävä" ;
|
||||
boring_N = mkWN (k39 "poraus") ;
|
||||
boringness_N = mkWN (k40 "ikävystyttävyys") ;
|
||||
bornite_N = mkWN (compoundN "kupari" (k1 "kiisu")) "kirjava" ;
|
||||
boron_N = mkWN (k5 "boori") ;
|
||||
boronic_A = mkWA "boori-" ;
|
||||
borosilicate_N = mkWN (compoundN "boro" (k5A "silikaatti")) ;
|
||||
borough_N = mkWN (k5A "kaupunki") ;
|
||||
borough_N = mkN "kauppala" ;
|
||||
borrelia_N = mkWN "Borrelia" ;
|
||||
borrow_V = mkWV (k53A "ottaa") "lainaksi" ;
|
||||
borrow_V2 = mkWV2 (k53A "ottaa") "lainaksi" ;
|
||||
@@ -7824,12 +7823,12 @@ canary_wine_N = mkWN (kH1 "viini") "Kanarian" ;
|
||||
canasta_N = mkWN (k9 "canasta") ;
|
||||
canavanine_N = mkWN (compoundN "kanava" (k26 "niini")) ;
|
||||
cancan_N = mkWN (k5 "cancan") ;
|
||||
cancel_V = mkWV (k74 "kumota") ;
|
||||
cancel_V = mkV "peruuttaa" ;
|
||||
--MANUAL cancel_V2 = mkWV2 (k74 "kumota") ;
|
||||
cancel_out_V2 = mkWV2 (k62 "neutraloida") ;
|
||||
cancellate_A = mkWA (k38 "hohkainen") ;
|
||||
cancellation_N = mkWN (k39 "peruutus") ;
|
||||
cancer_N = mkWN (compoundN "Cancer-" (k1A "suku")) ;
|
||||
cancer_N = mkN "syöpä" ;
|
||||
cancerous_A = mkWA (k38 "syöpäinen") ;
|
||||
cancerweed_N = mkWN (compoundN "lyyra" (k12 "salvia")) ;
|
||||
cancroid_A = mkWA (compoundA "syöpä" (k38 "mäinen")) ;
|
||||
@@ -11288,8 +11287,8 @@ conspire_V2V = mkWV2V (k67A "juonitella") ;
|
||||
constable_N = mkWN (k6 "konstaapeli") ;
|
||||
constabulary_N = mkWN (k6 "poliisi") ;
|
||||
constancy_N = mkWN (k40 "muuttumattomuus") ;
|
||||
constant_A = mkWA (k38 "uskollinen") ;
|
||||
constant_N = mkWN (k48 "suure") "muuttumaton" ;
|
||||
constant_A = mkA "jatkuva" ;
|
||||
constant_N = mkN "vakio" ;
|
||||
--PLURNOUN --POSTPONE constantan_N = mkWN (compoundN "konstanta" (k99 "ani")) ;
|
||||
constellation_N = mkWN (k10 "asetelma") ;
|
||||
consternation_N = mkWN (k39 "tyrmistys") ;
|
||||
@@ -20230,9 +20229,9 @@ forge_V = mkWV (k73A "kekata") ;
|
||||
forge_V2 = mkWV2 (k73A "kekata") ;
|
||||
forger_N = mkWN (k10 "väärentäjä") ;
|
||||
forgery_N = mkWN (k39 "väärennys") ;
|
||||
forget_V = mkWV (k67 "olla") "muistamatta" ;
|
||||
forget_V = mkV "unohtaa" ;
|
||||
--PLURNOUN --PREDEF forget_V2 = mkWV2 (k67 "olla") "muistamatta" ;
|
||||
forget_VS = mkWVS (k67 "olla") "muistamatta" ;
|
||||
forget_VS = mkV "unohtaa" ;
|
||||
forgetful_A = mkWA (compoundA "haja" (k38 "mielinen")) ;
|
||||
forgetfulness_N = mkWN "huonomuistisuus" ;
|
||||
forgettable_A = mkWA "helposti" "unohdettava" ;
|
||||
@@ -20704,9 +20703,9 @@ fryer_N = mkWN (k4A "nuorikko") ;
|
||||
frying_N = mkWN (compoundN "pai" (k38 "staminen")) ;
|
||||
frying_pan_N = mkWN (compoundN "paistin" (k1 "pannu")) ;
|
||||
fuchsia_N = mkWN "fuksia" ;
|
||||
fuck_N = mkWN (k5 "seksi") ;
|
||||
fuck_V = mkWV (k63 "saada") ;
|
||||
fuck_V2 = mkWV2 (k63 "saada") ;
|
||||
fuck_N = mkN "pano" ;
|
||||
fuck_V = mkV "naida" ;
|
||||
fuck_V2 = mkV2 "naida" partitive ;
|
||||
fuck_all_N = mkWN (k99 "yhtään") "ei" ;
|
||||
fucker_N = mkWN (k38 "paskiainen") ;
|
||||
fucking_Adv = mkWAdv "vitun" ;
|
||||
@@ -20810,17 +20809,17 @@ funnel_N = mkWN (compoundN "savu" (k10 "kanava")) ;
|
||||
--PLURNOUN --POSTPONE funnel_V = mkWV (k54A "siirtää") "suppilon" "kautta" ;
|
||||
--PLURNOUN --POSTPONE funnel_V2 = mkWV2 (k54A "siirtää") "suppilon" "kautta" ;
|
||||
funniness_N = mkWN (k40 "hauskuus") ;
|
||||
funny_A = mkWA (k38 "poikkeuksellinen") ;
|
||||
funny_A = mkA "hauska" ;
|
||||
funny_bone_N = mkWN (compoundN "kiukku" (k26 "suoni")) ;
|
||||
funrun_N = mkWN (compoundN "hyväntekeväisyys" (k1 "juoksu")) ;
|
||||
fur_N = mkWN (k9 "karva") ;
|
||||
fur_N = mkN "turkis" ;
|
||||
--PLURNOUN --POSTPONE furan_N = mkWN (compoundN "fura" (k99 "ani")) ;
|
||||
furbelow_N = mkWN (k2 "röyhelö") ;
|
||||
furbish_V2 = mkWV2 (k53A "kiillottaa") ;
|
||||
furcation_N = mkWN (k10 "haarauma") ;
|
||||
furcula_N = mkWN (compoundN "hanka" (k18 "luu")) ;
|
||||
--PLURNOUN --POSTPONE furfural_N = mkWN (compoundN "furfura" (k99 "ali")) ;
|
||||
furious_A = mkWA (k99 "suunniltaan") ;
|
||||
furious_A = mkA "raivokas" ;
|
||||
furl_V = mkWV (k61 "kääriä") ;
|
||||
furl_V2 = mkWV2 (k61 "kääriä") ;
|
||||
furlike_A = mkWA (compoundA "turkis" (k38 "mainen")) ;
|
||||
@@ -23067,10 +23066,10 @@ haply_Adv = mkWAdv (k99 "sattumalta") ;
|
||||
--MANUAL10_06 happen_V = mkWV (k65 "käydä") ;
|
||||
happen_V2 = mkWV2 (k65 "käydä") ;
|
||||
--MANUALVV happen_VV = mkWVV (k65 "käydä") ;
|
||||
happening_N = mkWN (k39 "tapaus") ;
|
||||
happiness_N = mkWN (k1 "ilo") ;
|
||||
--MANUAL10 -- happy_A a NOT_IN_KOTUS
|
||||
haptic_A = mkWA "tunto-" ;
|
||||
happening_N = mkN "tapahtuma" ;
|
||||
happiness_N = mkN "onnellisuus" ;
|
||||
happy_A = mkA "onnellinen" ;
|
||||
haptic_A = mkA "tuntoonperustuva" ;
|
||||
haptoglobin_N = mkWN "haptoglobiini" ;
|
||||
harakiri_N = mkWN (k5 "harakiri") ;
|
||||
harangue_N = mkWN (k39 "vuodatus") ;
|
||||
@@ -23520,7 +23519,7 @@ heliotropism_N = mkWN (compoundN "helio" (k5 "tropismi")) ;
|
||||
heliport_N = mkWN (compoundN "helikopteri" (k10A "kenttä")) ;
|
||||
helium_N = mkWN (k5 "helium") ;
|
||||
helix_N = mkWN (k2 "kotilo") ;
|
||||
hell_N = mkWN (compoundN "melu" (k38 "aminen")) ;
|
||||
hell_N = mkN "helvetti" ;
|
||||
hellbender_N = mkWN (compoundN "lieju" (k1 "piru")) ;
|
||||
hellcat_N = mkWN (k10 "ämmä") "pahansisuinen" ;
|
||||
hellebore_N = mkWN (compoundN "joulu" (k1 "ruusu")) ;
|
||||
@@ -27918,7 +27917,7 @@ knothole_N = mkWN (compoundN "oksan" (k10A "reikä")) ;
|
||||
knotty_A = mkWA (k38 "sotkuinen") ;
|
||||
knout_N = mkWN (k10 "ruoska") ;
|
||||
know_N = mkWN (k40 "tietoisuus") ;
|
||||
know_V = mkWV (k54A "myöntää") ;
|
||||
know_V = mkV "tietää" ;
|
||||
--PLURNOUN --PREDEF know_V2 = mkWV2 (k54A "myöntää") ;
|
||||
know_V2V = mkWV2V (k54A "myöntää") ;
|
||||
--PLURNOUN --PREDEF know_VQ = mkWVQ (k54A "myöntää") ;
|
||||
@@ -29569,7 +29568,7 @@ look_N = mkWN (k48 "katse") ;
|
||||
--PLURNOUN --PREDEF look_V = mkWV (k53A "näyttää") "jltak" ;
|
||||
look_V2 = mkWV2 (k53A "näyttää") "jltak" ;
|
||||
look_V2V = mkWV2V (k53A "näyttää") "jltak" ;
|
||||
look_VA = mkWVA (k53A "näyttää") "jltak" ;
|
||||
look_VA = mkVA (mkV "näyttää") (mkPrep ablative) ;
|
||||
--MANUALVV look_VV = mkWVV (k53A "näyttää") "jltak" ;
|
||||
look_after_V2 = mkWV2 (k53A "pitää") "huolta" ;
|
||||
--PLURNOUN --POSTPONE look_around_V = mkWV (k67 "katsella") "ympärilleen" "jssak" ;
|
||||
|
||||
@@ -134,7 +134,7 @@ separate from "ole" ("ottamaan", not "otamaan") and from "ovat" (*"omaan").
|
||||
Received a corrected corpus from Krasimir, with weekdays and months recognized. This changes 100 translations.
|
||||
Now at version 13-eng-fin-wsj.txt, working with penn/wsj-3220/corr-wsj.full.
|
||||
|
||||
Dictionary revision: 368 words with 5--3 occurrences, 140 changed in 30 minutes. Effect on 425 translations.
|
||||
Dictionary revision: 368 words with 5--4 occurrences, 150 changed in 30 minutes. Effect on 425 translations.
|
||||
It seems that FiWN - or maybe the way we have used it - is not the optimal source, as the translations
|
||||
we get are often unusual translations, and even strange words. For instance, pay_N = "liksa", a slang word.
|
||||
Now at version 14. Work done:
|
||||
@@ -143,6 +143,52 @@ Now at version 14. Work done:
|
||||
- 10 hours fixing RGL
|
||||
|
||||
|
||||
1/4
|
||||
|
||||
Calculation of returns
|
||||
- 22403 lemma tokens
|
||||
- 4333 lemma types
|
||||
- 390 types with 10 occurrences or more
|
||||
- 61 % of tokens covered by these
|
||||
- Going down from 10: (k=occs, n=lemmas with k occs, k*n)
|
||||
|
||||
(9,58,522),
|
||||
(8,52,416),
|
||||
(7,87,609),
|
||||
(6,118,708),
|
||||
(5,169,845),
|
||||
(4,200,800),
|
||||
(3,388,1164),
|
||||
(2,745,1490),
|
||||
(1,2126,2126)
|
||||
|
||||
Thus by covering >3 we now cover 79%. >2 is 84%, and >1 is 91%. >1 means 51% of the lemmas.
|
||||
|
||||
That is, we need to revise 2100 words to achieve 90% accuracy. Revising at a rate of 600 words/h (with 50% OK)
|
||||
means 3.5h work. Maybe 8h work for all 4333 lemmas.
|
||||
|
||||
Analysed the whole log4.txt. Statistics of types of metas:
|
||||
|
||||
NP 25369
|
||||
A 12837
|
||||
N 11191
|
||||
S 3961
|
||||
Quant -> N -> NP 3609
|
||||
N -> NP 3193
|
||||
Prep -> S -> Adv 2581
|
||||
NP -> VP -> S 2184
|
||||
AP 2176
|
||||
NP -> VPSlash -> NP 1680
|
||||
S -> NP -> VP -> S 1635
|
||||
|
||||
|
||||
Etc. 14,718 different types. Many of those could be dealt with by padding with nullables and coercions.
|
||||
|
||||
Quant -> N -> NP ===> \quant, n -> DetCN (DetQuant q NumSg) (UseN n)
|
||||
|
||||
Also tried linearization by chunks, defined as maximal fun-headed subtrees. Quite similar to
|
||||
smoothing with shorter n-grams, one could say. Long-distance agreement is lost, but the chunks make sense.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user