From eb791244bd93821320f3fc0e1c4e2518d26202d8 Mon Sep 17 00:00:00 2001
From: aarne
Date: Fri, 29 Mar 2013 10:13:04 +0000
Subject: [PATCH] some new opers in ParadigmsFin, and 200 more words in DictEngFin: out of 3220 Penn trees now 2721 are completely translated (but mostly not so well...)

---
 lib/src/finnish/stemmed/DictEngFin.gf    | 243 ++++++++++++++++++++++-
 lib/src/finnish/stemmed/ElimPredef.hs    |   7 +-
 lib/src/finnish/stemmed/MkGFLex.hs       |  29 +++
 lib/src/finnish/stemmed/ParadigmsFin.gf  |  65 +++++-
 lib/src/finnish/stemmed/StructuralFin.gf |   2 +-
 5 files changed, 334 insertions(+), 12 deletions(-)
 create mode 100644 lib/src/finnish/stemmed/MkGFLex.hs

diff --git a/lib/src/finnish/stemmed/DictEngFin.gf b/lib/src/finnish/stemmed/DictEngFin.gf
index 0f67d7f34..ef88821f9 100644
--- a/lib/src/finnish/stemmed/DictEngFin.gf
+++ b/lib/src/finnish/stemmed/DictEngFin.gf
@@ -4,7 +4,7 @@ concrete DictEngFin of DictEngAbs = CatFin ** open
   ParadigmsFin,
   (S = StructuralFin), (L = LexiconFin),
 -- SyntaxFin,
-  MorphoFin, ParadigmsFin, (X = ConstructX), MakeStructuralFin, Kotus, WNKotus, StemFin, Prelude in {
+  MorphoFin, ParadigmsFin, Kotus, WNKotus, StemFin, Prelude in {
 
 flags coding=utf8 ;
 
@@ -59074,6 +59074,247 @@ zymotic_A = mkWA "tsymoosiin" "liittyvä" ;
 
 oper
   tehda_V = mkV (lin VK {s = c71 "tehdä"}) ;
+-- another batch of additions, 29/3/2013
+
+lin
+absent_Prep = mkPrep "poissa" elative ;
+across_Prep = mkPrep "poikki" ;
+alabama_PN = mkPN "Alabama" ;
+albania_PN = mkPN "Albania" ;
+along_Prep = mkPrep "pitkin" ;
+already_AdV = mkAdV "jo" ;
+america_PN = mkPN "Amerikka" ;
+amid_Prep = mkPrep "keskellä" ;
+amp_Conj = mkConj "&" ;
+amsterdam_PN = mkPN "Amsterdam" ;
+apart_from_Prep = mkPrep "lisäksi" ;
+area_1_N = mkN "ala" ;
+arizona_PN = mkPN "Arizona" ;
+around_Prep = mkPrep "ympärillä" ;
+as_Subj = mkSubj "kun" ;
+as_long_as_Subj = mkSubj "niin kauan kun" ;
+as_well_as_Conj = mkConj "yhtä hyvin kuin" ;
+athens_PN = mkPN "Ateena" ;
+australia_PN = mkPN "Australia" ;
+austria_PN = mkPN (mkN "Itä" (mkN "valta")) ;
+bare_A = mkA "paljas" ;
+bear_N = mkN "karhu" ;
+because_of_Prep = mkPrep "vuoksi" ;
+begin_VV = mkVV "alkaa" ;
+below_Prep = mkPrep "alapuolella" ;
+beneath_Prep = mkPrep "alapuolella" ;
+benefit_V = mkV "hyötyä" ;
+besides_Prep = mkPrep "rinnalla" ;
+beyond_Prep = mkPrep "saavuttamattomissa" ;
+bonn_PN = mkPN "Bonn" ;
+boston_PN = mkPN "Boston" ;
+brazil_PN = mkPN "Brasilia" ;
+brew_N = mkN "käymis_tuote" ;
+britain_PN = mkPN "Britannia" ;
+but_Prep = mkPrep "paitsi" nominative ;
+but_Subj = mkSubj "mutta" ;
+california_PN = mkPN "Kalifornia" ;
+care_N = mkN "hoito" ;
+chicago_PN = mkPN "Chicago" ;
+china_PN = mkPN "Kiina" ;
+close_A = mkA "läheinen" ;
+columbia_PN = mkPN "Kolumbia" ;
+comment_V = mkV "kommentoida" ;
+communist_A = mkA "kommunistinen" ;
+complication_N = mkN "komplikaatio" ;
+connecticut_PN = mkPN "Connecticut" ;
+crystalline_A = mkA "kiteinen" ;
+cut_N = mkN "leikkaus" ;
+dallas_PN = mkPN "Dallas" ;
+default_V = mkV "laiminlyödä" ;
+denizen_N = mkN "asukas" ;
+denver_PN = mkPN "Denver" ;
+differ_V = mkV "eritä" ;
+disagree_V = mkV vOlla "eri mieltä" ;
+discretionary_A = mkA "harkinnanvarainen" ;
+discriminatory_A = mkA "erotteleva" ;
+duty_free_A = mkA "verovapaa" ;
+editorial_A = mkA "toimituksellinen" ;
+else_Adv = mkAdv "muutoin" ;
+encouragement_N = mkN "rohkaisu" ;
+ensure_VS = mkVS "varmistaa" ;
+essential_A = mkA "olennainen" ;
+europe_PN = mkPN "Eurooppa" ;
+even_though_Subj = mkSubj "vaikka" ;
+expensive_A = mkA "kallis" ;
+fact_finding_A = mkA (mkN "tietoa" (mkN "etsivä")) ;
+far_reaching_A = mkA "kauaskantoinen" ;
+feel_VS = mkVS "tuntea" ;
+feel_V = mkV "tuntea" ;
+feude_V = mkV "vihoitella" ;
+finland_PN = mkPN "Suomi" ;
+first_rate_A = mkA "ensiluokkainen" ;
+florida_PN = mkPN "Florida" ;
+for_starters_Adv = mkAdv "alkajaisiksi" ;
+foreclose_V = mkV (mkV "sulkea") "markkinoilta" ;
+four_part_A = mkA "neljänkeskeinen" ;
+frankfurt_PN = mkPN "Frankfurt" ;
+free_A = mkA "vapaa" ;
+funeral_N = mkN "hautajaistilaisuus" ;
+further_AdV = mkAdV "edelleen" ;
+georgia_PN = mkPN "Georgia" ;
+great_A = mkA "suurenmoinen" ;
+greece_PN = mkPN "Kreikka" ;
+half_Predet = mkPredet "puoliksi" ;
+heavy_handed_A = mkA (mkN "raskas" (mkN "kätinen")) ;
+help_N = mkN "apu" ;
+high_priced_A = mkA "hintava" ;
+hollywood_PN = mkPN "Hollywood" ;
+honest_A = mkA "rehellinen" ;
+hong_kong_PN = mkPN "Hong Kong" ;
+hors_de_combat_A = mkA "haavoittunut" ;
+hungary_PN = mkPN "Unkari" ;
+hyperinflation_N = mkN "hyperinflaatio" ;
+in_addition_Adv = mkAdv "lisäksi" ;
+in_addition_to_Prep = mkPrep "lisäksi" ;
+in_front_of_Prep = mkPrep "edessä" ;
+india_PN = mkPN "Intia" ;
+indianapolis_PN = mkPN "Indianapolis" ;
+indonesia_PN = mkPN "Indonesia" ;
+inside_Prep = mkPrep "sisällä" ;
+instead_of_Prep = mkPrep "sijasta" ;
+intimate_A = mkA "läheinen" ;
+israel_PN = mkPN "Israel" ;
+japan_PN = mkPN "Japani" ;
+japaneseMasc_N = mkN "japanilainen" ;
+just_AdV = mkAdV "pelkästään" ;
+just_Predet = mkPredet "pelkästään" ;
+kansas_PN = mkPN "Kansas" ;
+key_A = mkA "ratkaiseva" ;
+late_A = mkA "myöhäinen" ;
+libya_PN = mkPN "Libya" ;
+lie_1_V = mkV "maata" ;
+lie_2_V = mkV "valehdella" ;
+los_angeles_PN = mkPN "Los Angeles" ;
+louisiana_PN = mkPN "Louisiana" ;
+low_A = mkA "matala" ;
+madrid_PN = mkPN "Madrid" ;
+make_it_V = mkV "onnistua" ;
+mandatory_A = mkA "pakollinen" ;
+marketplace_N = mkN "markkinapaikka" ;
+marriage_N = mkN "avioliitto" ;
+maturity_3_N = mkN "kypsyys" ;
+maybe_Adv = mkAdv "ehkä" ;
+media_N = mkN "media" ;
+mexico_PN = mkPN "Meksiko" ;
+miami_PN = mkPN "Miami" ;
+milan_PN = mkPN "Milano" ;
+minneapolis_PN = mkPN "Minneapolis" ;
+mississippi_PN = mkPN "Mississippi" ;
+mistrial_N = mkN "mistraali" ; --- FIXME: "mistraali" looks like the mistral wind, not a mistrial; needs a legal term
+mod_cons_N = mkN "mukavuus" ;
+more_than_AdN = mkAdN "yli" ;
+moscow_PN = mkPN "Moskova" ;
+namibia_PN = mkPN "Namibia" ;
+nearby_A = mkA "läheinen" ;
+neither7nor_DConj = mkConj "ei" "eikä" ;
+never_AdV = mkAdV "koskaan" ;
+nevertheless_Adv = mkAdv "kuitenkin" ;
+new_york_PN = mkPN "New York" ;
+nicaragua_PN = mkPN "Nicaragua" ;
+no_longer_AdV = mkAdV "enää" ;
+norway_PN = mkPN "Norja" ;
+often_AdV = mkAdV "usein" ;
+ohio_PN = mkPN "Ohio" ;
+oklahoma_PN = mkPN "Oklahoma" ;
+old_fashioned_A = mkA "vanhanaikainen" ;
+once_Subj = mkSubj "sitten kun" ;
+one_time_A = mkA "ainutkertainen" ;
+onto_Prep = mkPrep "päälle" ;
+optical_A = mkA "optinen" ;
+outside_Prep = mkPrep "ulkopuolella" ;
+painting_N = mkN "maalaus" ;
+pall_N = mkN "paariliina" ;
+panama_PN = mkPN "Panama" ;
+party_N = mkN "puolue" ;
+pend_V = mkV "riippua" ;
+pennsylvania_PN = mkPN "Pennsylvania" ;
+pent_up_A = mkA "patoutunut" ;
+people_N = mkN "kansa" ;
+per_Prep = mkPrep "per" nominative ;
+perhaps_Adv = mkAdv "ehkä" ;
+philippines_PN = mkPN "Filippiinit" ;
+pittsburgh_PN = mkPN "Pittsburgh" ;
+plastics_N = mkN "muovi" ;
+plus_Conj = mkConj "plus" ;
+poland_PN = mkPN "Puola" ;
+present_day_A = mkA "tämänhetkinen" ;
+pretoria_PN = mkPN "Pretoria" ;
+pretty_AdA = mkAdA "melkoisen" ;
+publishing_A = mkA "julkaiseva" ;
+quite_AdA = mkAdA "melko" ;
+quite_Predet = mkPredet "aika" ;
+rank_N = mkN "arvoaste" ;
+record_N = mkN "ennätys" ;
+regime_1_N = mkN "hallinto" ;
+representativeMasc_N = mkN "edustaja" ;
+researcherMasc_N = mkN "tutkija" ;
+resident_N = mkN "asukas" ;
+role_1_N = mkN "rooli" ;
+rome_PN = mkPN "Rooma" ;
+run_N = mkN "juoksu" ;
+san_antonio_PN = mkPN "San Antonio" ;
+see_VS = mkVS (mkV (mkV "pitää") "huolta") ; -- particle verb, built like foreclose_V
+shield_N = mkN "kilpi" ;
+sidney_PN = mkPN "Sidney" ;
+signale_VS = mkVS "viestittää" ;
+since_then_Adv = mkAdv "siitä lähtien" ;
+sincere_A = mkA "vilpitön" ;
+singapore_PN = mkPN "Singapore" ;
+site_N = mkN "sijaintipaikka" ;
+so_PConj = mkPConj "niinpä" ;
+so_Subj = mkSubj "niin että" ;
+so_called_A = mkA "niinsanottu" ;
+soft_A = mkA "pehmeä" ;
+somehow_AdV = mkAdV "jotenkin" ;
+soon_AdV = mkAdV "pian" ;
+soon_Adv = mkAdv "pian" ;
+spain_PN = mkPN "Espanja" ;
+stabilize_V = mkV "vakiinnuttaa" ;
+start_V = mkV "aloittaa" ;
+start_ing_VV = mkVV "alkaa" ;
+start_to_VV = mkVV "alkaa" ;
+stockholm_PN = mkPN "Tukholma" ;
+strike_N = mkN "lakko" ;
+such_Predet = mkPredet "sellainen" ;
+such_as_Prep = mkPrep "kuten" nominative ;
+sweden_PN = mkPN "Ruotsi" ;
+syndicate_N = mkN "syndikaatti" ;
+taipei_PN = mkPN "Taipei" ;
+taiwan_PN = mkPN "Taiwan" ;
+texas_PN = mkPN "Teksas" ;
+throughout_Prep = mkPrep "läpi koko" genitive ;
+tokyo_PN = mkPN "Tokio" ;
+toronto_PN = mkPN "Toronto" ;
+tough_A = mkA "tiukka" ;
+toward_Prep = mkPrep partitive "kohti" ;
+towards_Prep = mkPrep partitive "kohti" ;
+turkey_PN = mkPN "Turkki" ;
+typical_1_A = mkA "tyypillinen" ;
+typical_3_A = mkA "tyypillinen" ;
+unheard_of_A = mkA "ennenkuulumaton" ;
+unique_A = mkA "ainutlaatuinen" ;
+unit_3_N = mkN "yksikkö" ;
+universe_N = mkN "universumi" ;
+unless_Subj = mkSubj "ellei" ;
+unlike_Prep = mkPrep "erilainen kuin" nominative ;
+upon_Prep = mkPrep "päällä" ;
+vietnam_PN = mkPN "Vietnam" ;
+virginia_PN = mkPN "Virginia" ;
+washington_PN = mkPN "Washington" ;
+well_known_A = mkA "tunnettu" ;
+while_Subj = mkSubj "samaan aikaan kuin" ;
+white_collar_A = mkA "valkokauluksinen" ;
+whole_A = mkA "kokonainen" ;
+wyoming_PN = mkPN "Wyoming" ;
+yet_AdV = mkAdV "yhä" ;
+zurich_PN = mkPN "Zürich" ;
+
 
 }
 
diff --git a/lib/src/finnish/stemmed/ElimPredef.hs b/lib/src/finnish/stemmed/ElimPredef.hs
index 5b164a982..c5773fa98 100644
--- a/lib/src/finnish/stemmed/ElimPredef.hs
+++ b/lib/src/finnish/stemmed/ElimPredef.hs
@@ -2,16 +2,13 @@ import qualified Data.Set as S
 -- comment out words that are predefined in another lexicon -- runghc ElimPredef.hs >= return . S.fromList . map (head . words) . lines interact (unlines . map (elimPredef predefs) . lines)
diff --git a/lib/src/finnish/stemmed/MkGFLex.hs b/lib/src/finnish/stemmed/MkGFLex.hs
new file mode 100644
index 000000000..0e1b2a309
--- /dev/null
+++ b/lib/src/finnish/stemmed/MkGFLex.hs
@@ -0,0 +1,29 @@
+-- convert annotated word list to GF lexicon
+
+import Data.Char
+
+main = -- stdin-to-stdout filter: one GF rule per non-blank input line
+  interact (unlines . map (unwords . mkEntry) . filter (not . null) . map words . lines) -- blank lines are skipped: mkEntry has no case for []
+
+-- [bare_A] paljas
+
+mkEntry (fun_:trans) = [fun, "=", oper, args, ";"] where -- fun_: bracketed function name, trans: the remaining words of the line
+  fun = tail (init fun_) -- unbracket
+  (name,cat) = let (tac,eman) = span (/= '_') (reverse fun) in (reverse (tail eman),reverse tac) -- split at the last '_': cat is the suffix after it
+  oper = "mk" ++ cat
+  args = case cat of
+    'V':_ -> unwords (map quoteIf trans)
+    "Prep" -> unwords (map quoteIf trans)
+    _ | null trans -> quote (mkUpper name) -- no translation given: use the capitalized name itself
+    _ -> quote (unwords trans)
+
+quote s = "\"" ++ s ++ "\""
+
+-- [absent_Prep] poissa +elative
+quoteIf s = case s of -- a word with a leading '+' is emitted unquoted, without the '+'
+  '+':cs -> cs
+  _ -> quote s
+
+mkUpper w = case w of
+  c:cs -> toUpper c : cs
+  _ -> w
diff --git a/lib/src/finnish/stemmed/ParadigmsFin.gf b/lib/src/finnish/stemmed/ParadigmsFin.gf
index a3df1b4c7..924143b3a 100644
--- a/lib/src/finnish/stemmed/ParadigmsFin.gf
+++ b/lib/src/finnish/stemmed/ParadigmsFin.gf
@@ -69,6 +69,20 @@ oper
   postGenPrep : Str -> Prep ; -- genitive postposition, e.g. "takana"
   casePrep : Case -> Prep ; -- just case, e.g. adessive
 
+  mkPrep = overload {
+    mkPrep : Case -> Prep
+      = casePrep ;
+    mkPrep : Str -> Prep
+      = postGenPrep ;
+    mkPrep : Case -> Str -> Prep
+      = postPrep ;
+    mkPrep : Str -> Case -> Prep
+      = \s,c -> prePrep c s ;
+    } ;
+
+  accusative : Prep
+    = {c = NPAcc ; s = [] ; isPre = True ; lock_Prep = <>} ;
+
   NK : Type ; -- Noun from DictFin (Kotus)
   AK : Type ; -- Adjective from DictFin (Kotus)
   VK : Type ; -- Verb from DictFin (Kotus)
@@ -165,6 +179,7 @@ oper
     mkV : (huutaa,dan,taa,tavat,takaa,detaan,sin,si,sisi,tanut,dettu,tanee : Str) -> V ; -- worst-case verb
     mkV : VK -> V ; -- verb from DictFin (Kotus)
     mkV : V -> Str -> V ; -- hakata päälle (particle verb)
+--- mkV : Str -> V -> V ; -- laimin+lyödä (prefixed verb)
   } ;
 
 -- All the patterns above have $nominative$ as subject case.
@@ -176,6 +191,8 @@ oper
 
 
   vOlla : V ; -- the verb "be"
+  olla_V : V
+    = vOlla ;
 
 --3 Two-place verbs
 --
@@ -208,10 +225,23 @@ oper
 -- Verbs and adjectives can take complements such as sentences,
 -- questions, verb phrases, and adjectives.
 
+mkVV = overload { -- both variants delegate to mkVVf with infFirst
+  mkVV : Str -> VV -- e.g. "yrittää"
+    = \s -> mkVVf (mkV s) infFirst ; -- the verb is first built from the string with mkV
+  mkVV : V -> VV -- e.g. "alkaa"
+    = \v -> mkVVf v infFirst ;
+  } ;
+
+mkVS = overload { -- both variants tag a verb as VS with lin
+  mkVS : Str -> VS -- e.g. "väittää"
+    = \s -> lin VS (mk1V s) ; -- the verb is first built from the string with mk1V
+  mkVS : V -> VS -- e.g. "sanoa"
+    = \v -> lin VS v ;
+  } ;
+
   mkV0 : V -> V0 ; --%
-  mkVS : V -> VS ;
+
   mkV2S : V -> Prep -> V2S ; -- e.g. "sanoa" allative
-  mkVV : V -> VV ; -- e.g. "alkaa"
   mkVVf : V -> InfForm -> VV ; -- e.g. "ruveta" infIllat
   mkV2V : V -> Prep -> V2V ; -- e.g. "käskeä" genitive
   mkV2Vf : V -> Prep -> InfForm -> V2V ; -- e.g. "kieltää" partitive infElat
@@ -233,6 +263,32 @@ oper
   V0 : Type ; --%
   AS, A2S, AV, A2V : Type ; --%
 
+--2 Structural categories
+
+  mkAdV : Str -> AdV
+    = \s -> lin AdV (ss s) ; -- the string is wrapped with ss, then tagged as AdV by lin
+  mkAdA : Str -> AdA
+    = \s -> lin AdA (ss s) ; -- same construction, tagged as AdA
+  mkAdN : Str -> AdN
+    = \s -> lin AdN (ss s) ; -- same construction, tagged as AdN
+  mkPConj : Str -> PConj
+    = \s -> lin PConj (ss s) ; -- same construction, tagged as PConj
+  mkSubj : Str -> Subj
+    = \s -> lin Subj (ss s) ; -- same construction, tagged as Subj
+
+  mkPredet : Str -> Predet -- invariable Predet, such as "vain"
+    = \s -> lin Predet {s = \\_,_ => s} ; -- the table returns the same string for every pair of parameters
+
+  mkConj = overload {
+    mkConj : Str -> Conj
+      = \y -> {s1 = [] ; s2 = y ; n = Pl ; lock_Conj = <>} ; -- one string: s1 is empty, s2 is the string, number Pl
+    mkConj : Str -> Str -> Conj
+      = \x,y -> {s1 = x ; s2 = y ; n = Pl ; lock_Conj = <>} ; -- two strings fill s1 and s2, number Pl
+    mkConj : Str -> Str -> Number -> Conj
+      = \x,y,n -> {s1 = x ; s2 = y ; n = n ; lock_Conj = <>} ; -- as above, with the number given by the caller
+    } ;
+
+
 --.
 -- The definitions should not bother the user of the API. So they are
 -- hidden from the document.
@@ -538,7 +594,7 @@ oper
       huutaa,huudan,huutaa,huutavat,huutakaa,huudetaan,
       huusin,huusi,huusisi,huutanut,huudettu,huutanee : Str) -> V = mk12V ;
     mkV : (sana : VK) -> V = \w -> vforms2sverb w.s ** {sc = NPCase Nom ; lock_V = <> ; p = []} ;
-    mkV : V -> Str -> V = \w,p -> vforms2sverb w.s ** {sc = NPCase Nom ; lock_V = <> ; p = p} ;
+    mkV : V -> Str -> V = \w,p -> {s = w.s ; sc = w.sc ; lock_V = <> ; h = w.h ; p = p} ; -- particle verb: s, sc and h are copied from w unchanged, only p is replaced
   } ;
 
   mk1V : Str -> V = \s ->
@@ -645,8 +701,7 @@ oper
   dirV3 v p = mkV3 v accPrep (casePrep p) ;
   dirdirV3 v = dirV3 v allative ;
 
-  mkVS v = v ** {lock_VS = <>} ;
-  mkVV v = mkVVf v infFirst ;
+
   mkVVf v f = v ** {vi = f ; lock_VV = <>} ;
   mkVQ v = v ** {lock_VQ = <>} ;
 
diff --git a/lib/src/finnish/stemmed/StructuralFin.gf b/lib/src/finnish/stemmed/StructuralFin.gf
index a07158bd8..06e1e22f2 100644
--- a/lib/src/finnish/stemmed/StructuralFin.gf
+++ b/lib/src/finnish/stemmed/StructuralFin.gf
@@ -1,5 +1,5 @@
 concrete StructuralFin of Structural = CatFin **
-  open MorphoFin, ParadigmsFin, (X = ConstructX), MakeStructuralFin, StemFin, Prelude in {
+  open MorphoFin, ParadigmsFin, (X = ConstructX), StemFin, Prelude in {
 
   flags optimize=all ;