mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-05-26 11:18:55 -06:00
scattered DictEngFin improvements
This commit is contained in:
@@ -5624,8 +5624,7 @@ bitch_V = mkWV (k53A "valittaa") ;
|
|||||||
bitchery_N = mkWN "narttumaisuus" ;
|
bitchery_N = mkWN "narttumaisuus" ;
|
||||||
bitchy_A = mkWA (k15 "ilkeä") ;
|
bitchy_A = mkWA (k15 "ilkeä") ;
|
||||||
bite_N = mkWN (k9 "pala") ;
|
bite_N = mkWN (k9 "pala") ;
|
||||||
bite_V = mkWV (k52A "pureutua") ;
|
bite_V = mkV "purra" ;
|
||||||
--PLURNOUN --PREDEF bite_V2 = mkWV2 (k52A "pureutua") ;
|
|
||||||
biter_N = mkWN (k12 "purija") ;
|
biter_N = mkWN (k12 "purija") ;
|
||||||
bitewing_N = mkWN (compoundN "hammas" (k5 "röntgen")) ;
|
bitewing_N = mkWN (compoundN "hammas" (k5 "röntgen")) ;
|
||||||
biting_A = mkWA (k10 "pureva") ;
|
biting_A = mkWA (k10 "pureva") ;
|
||||||
@@ -6350,14 +6349,14 @@ boreal_A = mkWA (compoundA "borea" (k38 "alinen")) ;
|
|||||||
boredom_N = mkWN (k40 "ikävystyttävyys") ;
|
boredom_N = mkWN (k40 "ikävystyttävyys") ;
|
||||||
borer_N = mkWN (k34A "selkärangaton") ;
|
borer_N = mkWN (k34A "selkärangaton") ;
|
||||||
boric_A = mkWA "boori-" ;
|
boric_A = mkWA "boori-" ;
|
||||||
boring_A = mkWA (k10 "rasittava") ;
|
boring_A = mkA "ikävystyttävä" ;
|
||||||
boring_N = mkWN (k39 "poraus") ;
|
boring_N = mkWN (k39 "poraus") ;
|
||||||
boringness_N = mkWN (k40 "ikävystyttävyys") ;
|
boringness_N = mkWN (k40 "ikävystyttävyys") ;
|
||||||
bornite_N = mkWN (compoundN "kupari" (k1 "kiisu")) "kirjava" ;
|
bornite_N = mkWN (compoundN "kupari" (k1 "kiisu")) "kirjava" ;
|
||||||
boron_N = mkWN (k5 "boori") ;
|
boron_N = mkWN (k5 "boori") ;
|
||||||
boronic_A = mkWA "boori-" ;
|
boronic_A = mkWA "boori-" ;
|
||||||
borosilicate_N = mkWN (compoundN "boro" (k5A "silikaatti")) ;
|
borosilicate_N = mkWN (compoundN "boro" (k5A "silikaatti")) ;
|
||||||
borough_N = mkWN (k5A "kaupunki") ;
|
borough_N = mkN "kauppala" ;
|
||||||
borrelia_N = mkWN "Borrelia" ;
|
borrelia_N = mkWN "Borrelia" ;
|
||||||
borrow_V = mkWV (k53A "ottaa") "lainaksi" ;
|
borrow_V = mkWV (k53A "ottaa") "lainaksi" ;
|
||||||
borrow_V2 = mkWV2 (k53A "ottaa") "lainaksi" ;
|
borrow_V2 = mkWV2 (k53A "ottaa") "lainaksi" ;
|
||||||
@@ -7824,12 +7823,12 @@ canary_wine_N = mkWN (kH1 "viini") "Kanarian" ;
|
|||||||
canasta_N = mkWN (k9 "canasta") ;
|
canasta_N = mkWN (k9 "canasta") ;
|
||||||
canavanine_N = mkWN (compoundN "kanava" (k26 "niini")) ;
|
canavanine_N = mkWN (compoundN "kanava" (k26 "niini")) ;
|
||||||
cancan_N = mkWN (k5 "cancan") ;
|
cancan_N = mkWN (k5 "cancan") ;
|
||||||
cancel_V = mkWV (k74 "kumota") ;
|
cancel_V = mkV "peruuttaa" ;
|
||||||
--MANUAL cancel_V2 = mkWV2 (k74 "kumota") ;
|
--MANUAL cancel_V2 = mkWV2 (k74 "kumota") ;
|
||||||
cancel_out_V2 = mkWV2 (k62 "neutraloida") ;
|
cancel_out_V2 = mkWV2 (k62 "neutraloida") ;
|
||||||
cancellate_A = mkWA (k38 "hohkainen") ;
|
cancellate_A = mkWA (k38 "hohkainen") ;
|
||||||
cancellation_N = mkWN (k39 "peruutus") ;
|
cancellation_N = mkWN (k39 "peruutus") ;
|
||||||
cancer_N = mkWN (compoundN "Cancer-" (k1A "suku")) ;
|
cancer_N = mkN "syöpä" ;
|
||||||
cancerous_A = mkWA (k38 "syöpäinen") ;
|
cancerous_A = mkWA (k38 "syöpäinen") ;
|
||||||
cancerweed_N = mkWN (compoundN "lyyra" (k12 "salvia")) ;
|
cancerweed_N = mkWN (compoundN "lyyra" (k12 "salvia")) ;
|
||||||
cancroid_A = mkWA (compoundA "syöpä" (k38 "mäinen")) ;
|
cancroid_A = mkWA (compoundA "syöpä" (k38 "mäinen")) ;
|
||||||
@@ -11288,8 +11287,8 @@ conspire_V2V = mkWV2V (k67A "juonitella") ;
|
|||||||
constable_N = mkWN (k6 "konstaapeli") ;
|
constable_N = mkWN (k6 "konstaapeli") ;
|
||||||
constabulary_N = mkWN (k6 "poliisi") ;
|
constabulary_N = mkWN (k6 "poliisi") ;
|
||||||
constancy_N = mkWN (k40 "muuttumattomuus") ;
|
constancy_N = mkWN (k40 "muuttumattomuus") ;
|
||||||
constant_A = mkWA (k38 "uskollinen") ;
|
constant_A = mkA "jatkuva" ;
|
||||||
constant_N = mkWN (k48 "suure") "muuttumaton" ;
|
constant_N = mkN "vakio" ;
|
||||||
--PLURNOUN --POSTPONE constantan_N = mkWN (compoundN "konstanta" (k99 "ani")) ;
|
--PLURNOUN --POSTPONE constantan_N = mkWN (compoundN "konstanta" (k99 "ani")) ;
|
||||||
constellation_N = mkWN (k10 "asetelma") ;
|
constellation_N = mkWN (k10 "asetelma") ;
|
||||||
consternation_N = mkWN (k39 "tyrmistys") ;
|
consternation_N = mkWN (k39 "tyrmistys") ;
|
||||||
@@ -20230,9 +20229,9 @@ forge_V = mkWV (k73A "kekata") ;
|
|||||||
forge_V2 = mkWV2 (k73A "kekata") ;
|
forge_V2 = mkWV2 (k73A "kekata") ;
|
||||||
forger_N = mkWN (k10 "väärentäjä") ;
|
forger_N = mkWN (k10 "väärentäjä") ;
|
||||||
forgery_N = mkWN (k39 "väärennys") ;
|
forgery_N = mkWN (k39 "väärennys") ;
|
||||||
forget_V = mkWV (k67 "olla") "muistamatta" ;
|
forget_V = mkV "unohtaa" ;
|
||||||
--PLURNOUN --PREDEF forget_V2 = mkWV2 (k67 "olla") "muistamatta" ;
|
--PLURNOUN --PREDEF forget_V2 = mkWV2 (k67 "olla") "muistamatta" ;
|
||||||
forget_VS = mkWVS (k67 "olla") "muistamatta" ;
|
forget_VS = mkV "unohtaa" ;
|
||||||
forgetful_A = mkWA (compoundA "haja" (k38 "mielinen")) ;
|
forgetful_A = mkWA (compoundA "haja" (k38 "mielinen")) ;
|
||||||
forgetfulness_N = mkWN "huonomuistisuus" ;
|
forgetfulness_N = mkWN "huonomuistisuus" ;
|
||||||
forgettable_A = mkWA "helposti" "unohdettava" ;
|
forgettable_A = mkWA "helposti" "unohdettava" ;
|
||||||
@@ -20704,9 +20703,9 @@ fryer_N = mkWN (k4A "nuorikko") ;
|
|||||||
frying_N = mkWN (compoundN "pai" (k38 "staminen")) ;
|
frying_N = mkWN (compoundN "pai" (k38 "staminen")) ;
|
||||||
frying_pan_N = mkWN (compoundN "paistin" (k1 "pannu")) ;
|
frying_pan_N = mkWN (compoundN "paistin" (k1 "pannu")) ;
|
||||||
fuchsia_N = mkWN "fuksia" ;
|
fuchsia_N = mkWN "fuksia" ;
|
||||||
fuck_N = mkWN (k5 "seksi") ;
|
fuck_N = mkN "pano" ;
|
||||||
fuck_V = mkWV (k63 "saada") ;
|
fuck_V = mkV "naida" ;
|
||||||
fuck_V2 = mkWV2 (k63 "saada") ;
|
fuck_V2 = mkV2 "naida" partitive ;
|
||||||
fuck_all_N = mkWN (k99 "yhtään") "ei" ;
|
fuck_all_N = mkWN (k99 "yhtään") "ei" ;
|
||||||
fucker_N = mkWN (k38 "paskiainen") ;
|
fucker_N = mkWN (k38 "paskiainen") ;
|
||||||
fucking_Adv = mkWAdv "vitun" ;
|
fucking_Adv = mkWAdv "vitun" ;
|
||||||
@@ -20810,17 +20809,17 @@ funnel_N = mkWN (compoundN "savu" (k10 "kanava")) ;
|
|||||||
--PLURNOUN --POSTPONE funnel_V = mkWV (k54A "siirtää") "suppilon" "kautta" ;
|
--PLURNOUN --POSTPONE funnel_V = mkWV (k54A "siirtää") "suppilon" "kautta" ;
|
||||||
--PLURNOUN --POSTPONE funnel_V2 = mkWV2 (k54A "siirtää") "suppilon" "kautta" ;
|
--PLURNOUN --POSTPONE funnel_V2 = mkWV2 (k54A "siirtää") "suppilon" "kautta" ;
|
||||||
funniness_N = mkWN (k40 "hauskuus") ;
|
funniness_N = mkWN (k40 "hauskuus") ;
|
||||||
funny_A = mkWA (k38 "poikkeuksellinen") ;
|
funny_A = mkA "hauska" ;
|
||||||
funny_bone_N = mkWN (compoundN "kiukku" (k26 "suoni")) ;
|
funny_bone_N = mkWN (compoundN "kiukku" (k26 "suoni")) ;
|
||||||
funrun_N = mkWN (compoundN "hyväntekeväisyys" (k1 "juoksu")) ;
|
funrun_N = mkWN (compoundN "hyväntekeväisyys" (k1 "juoksu")) ;
|
||||||
fur_N = mkWN (k9 "karva") ;
|
fur_N = mkN "turkis" ;
|
||||||
--PLURNOUN --POSTPONE furan_N = mkWN (compoundN "fura" (k99 "ani")) ;
|
--PLURNOUN --POSTPONE furan_N = mkWN (compoundN "fura" (k99 "ani")) ;
|
||||||
furbelow_N = mkWN (k2 "röyhelö") ;
|
furbelow_N = mkWN (k2 "röyhelö") ;
|
||||||
furbish_V2 = mkWV2 (k53A "kiillottaa") ;
|
furbish_V2 = mkWV2 (k53A "kiillottaa") ;
|
||||||
furcation_N = mkWN (k10 "haarauma") ;
|
furcation_N = mkWN (k10 "haarauma") ;
|
||||||
furcula_N = mkWN (compoundN "hanka" (k18 "luu")) ;
|
furcula_N = mkWN (compoundN "hanka" (k18 "luu")) ;
|
||||||
--PLURNOUN --POSTPONE furfural_N = mkWN (compoundN "furfura" (k99 "ali")) ;
|
--PLURNOUN --POSTPONE furfural_N = mkWN (compoundN "furfura" (k99 "ali")) ;
|
||||||
furious_A = mkWA (k99 "suunniltaan") ;
|
furious_A = mkA "raivokas" ;
|
||||||
furl_V = mkWV (k61 "kääriä") ;
|
furl_V = mkWV (k61 "kääriä") ;
|
||||||
furl_V2 = mkWV2 (k61 "kääriä") ;
|
furl_V2 = mkWV2 (k61 "kääriä") ;
|
||||||
furlike_A = mkWA (compoundA "turkis" (k38 "mainen")) ;
|
furlike_A = mkWA (compoundA "turkis" (k38 "mainen")) ;
|
||||||
@@ -23067,10 +23066,10 @@ haply_Adv = mkWAdv (k99 "sattumalta") ;
|
|||||||
--MANUAL10_06 happen_V = mkWV (k65 "käydä") ;
|
--MANUAL10_06 happen_V = mkWV (k65 "käydä") ;
|
||||||
happen_V2 = mkWV2 (k65 "käydä") ;
|
happen_V2 = mkWV2 (k65 "käydä") ;
|
||||||
--MANUALVV happen_VV = mkWVV (k65 "käydä") ;
|
--MANUALVV happen_VV = mkWVV (k65 "käydä") ;
|
||||||
happening_N = mkWN (k39 "tapaus") ;
|
happening_N = mkN "tapahtuma" ;
|
||||||
happiness_N = mkWN (k1 "ilo") ;
|
happiness_N = mkN "onnellisuus" ;
|
||||||
--MANUAL10 -- happy_A a NOT_IN_KOTUS
|
happy_A = mkA "onnellinen" ;
|
||||||
haptic_A = mkWA "tunto-" ;
|
haptic_A = mkA "tuntoonperustuva" ;
|
||||||
haptoglobin_N = mkWN "haptoglobiini" ;
|
haptoglobin_N = mkWN "haptoglobiini" ;
|
||||||
harakiri_N = mkWN (k5 "harakiri") ;
|
harakiri_N = mkWN (k5 "harakiri") ;
|
||||||
harangue_N = mkWN (k39 "vuodatus") ;
|
harangue_N = mkWN (k39 "vuodatus") ;
|
||||||
@@ -23520,7 +23519,7 @@ heliotropism_N = mkWN (compoundN "helio" (k5 "tropismi")) ;
|
|||||||
heliport_N = mkWN (compoundN "helikopteri" (k10A "kenttä")) ;
|
heliport_N = mkWN (compoundN "helikopteri" (k10A "kenttä")) ;
|
||||||
helium_N = mkWN (k5 "helium") ;
|
helium_N = mkWN (k5 "helium") ;
|
||||||
helix_N = mkWN (k2 "kotilo") ;
|
helix_N = mkWN (k2 "kotilo") ;
|
||||||
hell_N = mkWN (compoundN "melu" (k38 "aminen")) ;
|
hell_N = mkN "helvetti" ;
|
||||||
hellbender_N = mkWN (compoundN "lieju" (k1 "piru")) ;
|
hellbender_N = mkWN (compoundN "lieju" (k1 "piru")) ;
|
||||||
hellcat_N = mkWN (k10 "ämmä") "pahansisuinen" ;
|
hellcat_N = mkWN (k10 "ämmä") "pahansisuinen" ;
|
||||||
hellebore_N = mkWN (compoundN "joulu" (k1 "ruusu")) ;
|
hellebore_N = mkWN (compoundN "joulu" (k1 "ruusu")) ;
|
||||||
@@ -27918,7 +27917,7 @@ knothole_N = mkWN (compoundN "oksan" (k10A "reikä")) ;
|
|||||||
knotty_A = mkWA (k38 "sotkuinen") ;
|
knotty_A = mkWA (k38 "sotkuinen") ;
|
||||||
knout_N = mkWN (k10 "ruoska") ;
|
knout_N = mkWN (k10 "ruoska") ;
|
||||||
know_N = mkWN (k40 "tietoisuus") ;
|
know_N = mkWN (k40 "tietoisuus") ;
|
||||||
know_V = mkWV (k54A "myöntää") ;
|
know_V = mkV "tietää" ;
|
||||||
--PLURNOUN --PREDEF know_V2 = mkWV2 (k54A "myöntää") ;
|
--PLURNOUN --PREDEF know_V2 = mkWV2 (k54A "myöntää") ;
|
||||||
know_V2V = mkWV2V (k54A "myöntää") ;
|
know_V2V = mkWV2V (k54A "myöntää") ;
|
||||||
--PLURNOUN --PREDEF know_VQ = mkWVQ (k54A "myöntää") ;
|
--PLURNOUN --PREDEF know_VQ = mkWVQ (k54A "myöntää") ;
|
||||||
@@ -29569,7 +29568,7 @@ look_N = mkWN (k48 "katse") ;
|
|||||||
--PLURNOUN --PREDEF look_V = mkWV (k53A "näyttää") "jltak" ;
|
--PLURNOUN --PREDEF look_V = mkWV (k53A "näyttää") "jltak" ;
|
||||||
look_V2 = mkWV2 (k53A "näyttää") "jltak" ;
|
look_V2 = mkWV2 (k53A "näyttää") "jltak" ;
|
||||||
look_V2V = mkWV2V (k53A "näyttää") "jltak" ;
|
look_V2V = mkWV2V (k53A "näyttää") "jltak" ;
|
||||||
look_VA = mkWVA (k53A "näyttää") "jltak" ;
|
look_VA = mkVA (mkV "näyttää") (mkPrep ablative) ;
|
||||||
--MANUALVV look_VV = mkWVV (k53A "näyttää") "jltak" ;
|
--MANUALVV look_VV = mkWVV (k53A "näyttää") "jltak" ;
|
||||||
look_after_V2 = mkWV2 (k53A "pitää") "huolta" ;
|
look_after_V2 = mkWV2 (k53A "pitää") "huolta" ;
|
||||||
--PLURNOUN --POSTPONE look_around_V = mkWV (k67 "katsella") "ympärilleen" "jssak" ;
|
--PLURNOUN --POSTPONE look_around_V = mkWV (k67 "katsella") "ympärilleen" "jssak" ;
|
||||||
|
|||||||
@@ -134,7 +134,7 @@ separate from "ole" ("ottamaan", not "otamaan") and from "ovat" (*"omaan").
|
|||||||
Received a corrected corpus from Krasimir, with weekdays and months recognized. This changes 100 translations.
|
Received a corrected corpus from Krasimir, with weekdays and months recognized. This changes 100 translations.
|
||||||
Now at version 13-eng-fin-wsj.txt, working with penn/wsj-3220/corr-wsj.full.
|
Now at version 13-eng-fin-wsj.txt, working with penn/wsj-3220/corr-wsj.full.
|
||||||
|
|
||||||
Dictionary revision: 368 words with 5--3 occurrences, 140 changed in 30 minutes. Effect on 425 translations.
|
Dictionary revision: 368 words with 5--4 occurrences, 150 changed in 30 minutes. Effect on 425 translations.
|
||||||
It feels that FiWN - or maybe the way we have used it? - is not the optimal source, as the translations
|
It feels that FiWN - or maybe the way we have used it? - is not the optimal source, as the translations
|
||||||
we get are often unusual translations, and even strange words. For instance, pay_N = "liksa", a slang word.
|
we get are often unusual translations, and even strange words. For instance, pay_N = "liksa", a slang word.
|
||||||
Now at version 14. Work done:
|
Now at version 14. Work done:
|
||||||
@@ -143,6 +143,52 @@ Now at version 14. Work done:
|
|||||||
- 10 hours fixing RGL
|
- 10 hours fixing RGL
|
||||||
|
|
||||||
|
|
||||||
|
1/4
|
||||||
|
|
||||||
|
Calculation of returns
|
||||||
|
- 22403 lemma tokens
|
||||||
|
- 4333 lemma types
|
||||||
|
- 390 types with 10 occurrences or more
|
||||||
|
- 61 % of tokens covered by these
|
||||||
|
- Going down from 10: (k=occs, n=lemmas with k occs, k*n)
|
||||||
|
|
||||||
|
(9,58,522),
|
||||||
|
(8,52,416),
|
||||||
|
(7,87,609),
|
||||||
|
(6,118,708),
|
||||||
|
(5,169,845),
|
||||||
|
(4,200,800),
|
||||||
|
(3,388,1164),
|
||||||
|
(2,745,1490),
|
||||||
|
(1,2126,2126)
|
||||||
|
|
||||||
|
Thus by covering >3 we now cover 79%. >2 is 84%, and >1 is 91%. >1 means 51% of the lemmas.
|
||||||
|
|
||||||
|
That is, we need to revise 2100 words to achieve 90% accuracy. Revision taking 1h/600 words (with 50% OK)
|
||||||
|
means 3.5h work. Maybe 8h work for all 4333 lemmas.
|
||||||
|
|
||||||
|
Analysed the whole log4.txt. Statistics of types of metas:
|
||||||
|
|
||||||
|
NP 25369
|
||||||
|
A 12837
|
||||||
|
N 11191
|
||||||
|
S 3961
|
||||||
|
Quant -> N -> NP 3609
|
||||||
|
N -> NP 3193
|
||||||
|
Prep -> S -> Adv 2581
|
||||||
|
NP -> VP -> S 2184
|
||||||
|
AP 2176
|
||||||
|
NP -> VPSlash -> NP 1680
|
||||||
|
S -> NP -> VP -> S 1635
|
||||||
|
|
||||||
|
|
||||||
|
Etc. 14,718 different types. Many of those could be dealt with by padding with nullables and coercions.
|
||||||
|
|
||||||
|
Quant -> N -> NP ===> \quant, n -> DetCN (DetQuant q NumSg) (UseN n)
|
||||||
|
|
||||||
|
Also tried linearization by chunks, defined as maximal fun-headed subtrees. Quite similar to
|
||||||
|
smoothing with shorter n-grams one could say. Long-distance agreements lost, but chunks make sense.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user