From bd60279dc0947a98757cb4747ae4e2f713fafce5 Mon Sep 17 00:00:00 2001 From: aarne Date: Thu, 22 Sep 2016 08:23:04 +0000 Subject: [PATCH] Omorfi tagging mostly complete for open classes in Fin, some closed classes and syncat words missing --- lib/src/finnish/ParadigmsFin.gf | 4 +- lib/src/finnish/StemFin.gf | 5 ++ lib/src/finnish/StructuralFin.gf | 66 ++++++++++++----------- lib/src/finnish/stemmed/StemFin.gf | 6 +++ lib/src/finnish/tagged/StemFin.gf | 6 +++ lib/src/finnish/tagged/TagFin.gf | 85 +++++++++++++++++------------- 6 files changed, 102 insertions(+), 70 deletions(-) diff --git a/lib/src/finnish/ParadigmsFin.gf b/lib/src/finnish/ParadigmsFin.gf index 5677dcaa8..1cb796bf0 100644 --- a/lib/src/finnish/ParadigmsFin.gf +++ b/lib/src/finnish/ParadigmsFin.gf @@ -766,8 +766,8 @@ mkVS = overload { dirV2 v = mk2V2 v accPrep ; mkAdv = overload { - mkAdv : Str -> Adv = \s -> {s = s ; lock_Adv = <>} ; - mkAdv : AdvK -> Adv = \s -> {s = s.s ; lock_Adv = <>} ; + mkAdv : Str -> Adv = \s -> {s = tagPOS "ADV" s ; lock_Adv = <>} ; + mkAdv : AdvK -> Adv = \s -> {s = tagPOS "ADV" s.s ; lock_Adv = <>} ; } ; mkV2 = overload { diff --git a/lib/src/finnish/StemFin.gf b/lib/src/finnish/StemFin.gf index 280be6e48..7f03f510f 100644 --- a/lib/src/finnish/StemFin.gf +++ b/lib/src/finnish/StemFin.gf @@ -4,6 +4,11 @@ resource StemFin = open MorphoFin, Prelude in { flags coding = utf8 ; +oper +-- other classes not treated below are POS tagged when the grammar is used with Omorfi + + tagPOS : Str -> Str -> Str = \_,s -> s ; + oper SNForm : Type = NForm ; SNoun : Type = Noun ; diff --git a/lib/src/finnish/StructuralFin.gf b/lib/src/finnish/StructuralFin.gf index c774b825b..cea1e4414 100644 --- a/lib/src/finnish/StructuralFin.gf +++ b/lib/src/finnish/StructuralFin.gf @@ -18,15 +18,15 @@ concrete StructuralFin of Structural = CatFin ** } } ; almost_AdA, almost_AdN = ss "melkein" ; - although_Subj = ss "vaikka" ; - always_AdV = ss "aina" ; - and_Conj = {s1 = [] ; s2 = "ja" ; n = Pl} ; - 
because_Subj = ss "koska" ; + although_Subj = ssp "CONJ" "vaikka" ; + always_AdV = ssp "ADV" "aina" ; + and_Conj = {s1 = [] ; s2 = tagPOS "CONJ" "ja" ; n = Pl} ; + because_Subj = ssp "CONJ" "koska" ; before_Prep = prePrep partitive "ennen" ; behind_Prep = postGenPrep "takana" ; between_Prep = postGenPrep "välissä" ; both7and_DConj = sd2 "sekä" "että" ** {n = Pl} ; - but_PConj = ss "mutta" ; + but_PConj = ssp "CONJ" "mutta" ; by8agent_Prep = postGenPrep "toimesta" ; by8means_Prep = casePrep adessive ; can8know_VV = mkVV (mkV "osata" "osasi") ; @@ -36,20 +36,20 @@ concrete StructuralFin of Structural = CatFin ** everybody_NP = lin NP (makeNP (((mkN "jokainen"))) Sg) ; every_Det = MorphoFin.mkDet Sg (snoun2nounBind (mkN "jokainen")) ; everything_NP = makeNP ((((mkN "kaikki" "kaiken" "kaikkena")))) Sg ; - everywhere_Adv = ss "kaikkialla" ; + everywhere_Adv = mkAdv "kaikkialla" ; few_Det = MorphoFin.mkDet Sg (snoun2nounBind (mkN "harva")) ; --- first_Ord = {s = \\n,c => (mkN "ensimmäinen").s ! NCase n c} ; for_Prep = casePrep allative ; from_Prep = casePrep elative ; he_Pron = mkPronoun "hän" "hänen" "häntä" "hänenä" "häneen" Sg P3 ; - here_Adv = ss "täällä" ; - here7to_Adv = ss "tänne" ; - here7from_Adv = ss "täältä" ; - how_IAdv = ss "miten" ; - how8much_IAdv = ss "kuinka paljon" ; + here_Adv = mkAdv "täällä" ; + here7to_Adv = mkAdv "tänne" ; + here7from_Adv = mkAdv "täältä" ; + how_IAdv = ssp "ADV" "miten" ; + how8much_IAdv = ssp "ADV" ("kuinka" ++ tagPOS "ADV" "paljon") ; how8many_IDet = {s = \\c => "kuinka" ++ (snoun2nounBind (mkN "moni" "monia")).s ! NCase Sg c ; n = Sg ; isNum = False} ; - if_Subj = ss "jos" ; + if_Subj = ssp "CONJ" "jos" ; in8front_Prep = postGenPrep "edessä" ; i_Pron = mkPronoun "minä" "minun" "minua" "minuna" "minuun" Sg P1 ; in_Prep = casePrep inessive ; @@ -65,18 +65,18 @@ concrete StructuralFin of Structural = CatFin ** most_Predet = {s = \\n,c => (nForms2N (dSuurin "useinta")).s ! 
NCase n (npform2case n c)} ; much_Det = MorphoFin.mkDet Sg (snoun2nounBind (exceptNomN (mkN "paljo") "paljon")) ** {isNum = True} ; --Harmony not relevant, it's just a CommonNoun must_VV = mkVV (caseV genitive (mkV "täytyä")) ; - no_Utt = ss "ei" ; + no_Utt = ssp "INTERJ" "ei" ; on_Prep = casePrep adessive ; --- one_Quant = MorphoFin.mkDet Sg DEPREC only_Predet = {s = \\_,_ => "vain"} ; - or_Conj = {s1 = [] ; s2 = "tai" ; n = Sg} ; - otherwise_PConj = ss "muuten" ; + or_Conj = {s1 = [] ; s2 = tagPOS "CONJ" "tai" ; n = Sg} ; + otherwise_PConj = ssp "ADV" "muuten" ; part_Prep = casePrep partitive ; please_Voc = ss ["ole hyvä"] ; --- number possess_Prep = casePrep genitive ; - quite_Adv = ss "melko" ; + quite_Adv = ssp "ADV" "melko" ; she_Pron = mkPronoun "hän" "hänen" "häntä" "hänenä" "häneen" Sg P3 ; - so_AdA = ss "niin" ; + so_AdA = ssp "ADV" "niin" ; somebody_NP = { s = \\c => jokuPron ! Sg ! npform2case Sg c ; a = agrP3 Sg ; @@ -97,7 +97,7 @@ concrete StructuralFin of Structural = CatFin ** a = agrP3 Sg ; isPron = False ; isNeg = False ; isNeg = False } ; - somewhere_Adv = ss "jossain" ; + somewhere_Adv = ssp "ADV" "jossain" ; that_Quant = heavyQuant { s1 = table (MorphoFin.Number) { Sg => table (MorphoFin.Case) { @@ -109,11 +109,11 @@ concrete StructuralFin of Structural = CatFin ** } ; s2 = \\_ => [] ; isNum,isPoss = False ; isDef = True ; isNeg = False } ; - that_Subj = ss "että" ; - there_Adv = ss "siellä" ; --- tuolla - there7to_Adv = ss "sinne" ; - there7from_Adv = ss "sieltä" ; - therefore_PConj = ss "siksi" ; + that_Subj = ssp "CONJ" "että" ; + there_Adv = ssp "ADV" "siellä" ; --- tuolla + there7to_Adv = ssp "ADV" "sinne" ; + there7from_Adv = ssp "ADV" "sieltä" ; + therefore_PConj = ssp "ADV" "siksi" ; they_Pron = mkPronoun "he" "heidän" "heitä" "heinä" "heihin" Pl P3 ; --- ne this_Quant = heavyQuant { s1 = table (MorphoFin.Number) { @@ -127,10 +127,10 @@ concrete StructuralFin of Structural = CatFin ** s2 = \\_ => [] ; isNum,isPoss = False ; isDef = True 
; isNeg = False } ; through_Prep = postGenPrep "kautta" ; - too_AdA = ss "liian" ; + too_AdA = ssp "ADV" "liian" ; to_Prep = casePrep illative ; --- allative under_Prep = postGenPrep "alla" ; - very_AdA = ss "erittäin" ; + very_AdA = ssp "ADV" "erittäin" ; want_VV = mkVV (mkV "tahtoa") ; we_Pron = mkPronoun "me" "meidän" "meitä" "meinä" "meihin" Pl P1 ; whatPl_IP = { @@ -141,9 +141,9 @@ concrete StructuralFin of Structural = CatFin ** s = \\c => mikaInt ! Sg ! npform2case Sg c ; n = Sg } ; - when_IAdv = ss "milloin" ; - when_Subj = ss "kun" ; - where_IAdv = ss "missä" ; + when_IAdv = ssp "ADV" "milloin" ; + when_Subj = ssp "CONJ" "kun" ; + where_IAdv = ssp "ADV" "missä" ; which_IQuant = { s = mikaInt } ; @@ -155,10 +155,10 @@ concrete StructuralFin of Structural = CatFin ** s = table {NPAcc => "ketkä" ; c => kukaInt ! Pl ! npform2case Pl c} ; n = Pl } ; - why_IAdv = ss "miksi" ; + why_IAdv = ssp "ADV" "miksi" ; without_Prep = prePrep partitive "ilman" ; with_Prep = postGenPrep "kanssa" ; - yes_Utt = ss "kyllä" ; + yes_Utt = ssp "INTERJ" "kyllä" ; youSg_Pron = mkPronoun "sinä" "sinun" "sinua" "sinuna" "sinuun" Sg P2 ; youPl_Pron = mkPronoun "te" "teidän" "teitä" "teinä" "teihin" Pl P2 ; youPol_Pron = @@ -300,8 +300,8 @@ lin isPron = False ; isNeg = True } ; - at_least_AdN = ss "vähintään" ; - at_most_AdN = ss "enintään" ; + at_least_AdN = ssp "ADV" "vähintään" ; + at_most_AdN = ssp "ADV" "enintään" ; as_CAdv = X.mkCAdv "yhtä" "kuin" ; @@ -311,5 +311,7 @@ lin lin language_title_Utt = ss "suomi" ; +oper + ssp : Str -> Str -> {s : Str} = \p,s -> ss (tagPOS p s) ; -- used in tagged/ for Omorfi, otherwise =ss } diff --git a/lib/src/finnish/stemmed/StemFin.gf b/lib/src/finnish/stemmed/StemFin.gf index 965314a40..c3b0e1c43 100644 --- a/lib/src/finnish/stemmed/StemFin.gf +++ b/lib/src/finnish/stemmed/StemFin.gf @@ -4,6 +4,12 @@ resource StemFin = open MorphoFin, Prelude in { flags coding = utf8 ; +oper +-- other classes not treated below are POS tagged when the grammar is 
used with Omorfi + + tagPOS : Str -> Str -> Str = \_,s -> s ; + + oper SNForm : Type = Predef.Ints 10 ; SNoun : Type = {s : SNForm => Str ; h : Harmony} ; diff --git a/lib/src/finnish/tagged/StemFin.gf b/lib/src/finnish/tagged/StemFin.gf index ea37fb44f..345497250 100644 --- a/lib/src/finnish/tagged/StemFin.gf +++ b/lib/src/finnish/tagged/StemFin.gf @@ -4,6 +4,11 @@ resource StemFin = open TagFin, MorphoFin, Prelude in { flags coding = utf8 ; +oper +-- other classes not treated below are POS tagged when the grammar is used with Omorfi + + tagPOS : Str -> Str -> Str = \p,s -> tagWord p s ; + oper SNForm : Type = Predef.Ints 0 ; --- not really needed SNoun : Type = {s : SNForm => Str ; h : Harmony} ; --- Harmony needed only for API compatibility @@ -71,6 +76,7 @@ oper snoun2compar : SNoun -> Str = \n -> n.s ! 0 ++ "?Comp" ; ---- TODO snoun2superl : SNoun -> Str = \n -> n.s ! 0 ++ "?Superl" ; ---- TODO + -- verbs oper diff --git a/lib/src/finnish/tagged/TagFin.gf b/lib/src/finnish/tagged/TagFin.gf index 7876084d7..8ef850c30 100644 --- a/lib/src/finnish/tagged/TagFin.gf +++ b/lib/src/finnish/tagged/TagFin.gf @@ -18,20 +18,24 @@ oper consTag : (_,_,_,_,_,_ : Str) -> Tag = \t,u,v,x,y,z -> t + "|" + u + "|" + v + "|" + x + "|" + y + "|" + z ; } ; - tagNForm : NForm -> Str = \nf -> case nf of { - NCase n c => consTag (tagCase c) (tagNumber n) ; - NComit => consTag (mkTag "Case" "Com") (tagNumber Pl) ; - NInstruct => consTag (mkTag "Case" "Ins") (tagNumber Pl) ; - NPossNom n => consTag (tagCase Nom) (tagNumber n) ; - NPossGen n => consTag (tagCase Gen) (tagNumber n) ; - NPossTransl n => consTag (tagCase Transl) (tagNumber n) ; - NPossIllat n => consTag (tagCase Illat) (tagNumber n) ; - NCompound => mkTag "Comp" ---- + pairTag : Tag -> Tag -> Tag * Tag = \t,u -> <t,u> ; -- pair of tags, projected below with .p1 and .p2 + + tagNForm : NForm -> Tag = \nf -> let ts = tagNForms nf in consTag ts.p1 ts.p2 ; + + tagNForms : NForm -> Tag * Tag = \nf -> case nf of { -- keep separate in order to squeeze in Degree of adjectives + NCase n c 
=> pairTag (tagCase c) (tagNumber n) ; + NComit => pairTag (mkTag "Case" "Com") (tagNumber Pl) ; + NInstruct => pairTag (mkTag "Case" "Ins") (tagNumber Pl) ; + NPossNom n => pairTag (tagCase Nom) (tagNumber n) ; + NPossGen n => pairTag (tagCase Gen) (tagNumber n) ; + NPossTransl n => pairTag (tagCase Transl) (tagNumber n) ; + NPossIllat n => pairTag (tagCase Illat) (tagNumber n) ; + NCompound => pairTag (mkTag "Form" "Comp") (tagNumber Sg) ---- TODO: how is this in UD? } ; - tagAForm : AForm -> Str = \af -> case af of { - AN nf => tagNForm nf ; - AAdv => adverbTag + tagDegreeAForm : Degree -> AForm -> Str = \d,af -> case af of { + AN nf => let ts = tagNForms nf in consTag ts.p1 (tagDegree d) ts.p2 ; + AAdv => consTag adverbTag (tagDegree d) ---- TODO: how is this in UD? } ; tagVForm : VForm -> Str = \vf -> case vf of { @@ -55,33 +59,43 @@ oper PassPotent False => consTag connegativeTag potentialTag finiteTag passiveTag ; PassImper True => consTag imperativeTag finiteTag passiveTag ; PassImper False => consTag connegativeTag imperativeTag finiteTag passiveTag ; - PastPartAct af => participleTag ++ activeTag ++ pastTag ++ tagAForm af ; - PastPartPass af => participleTag ++ activeTag ++ pastTag ++ tagAForm af ; - PresPartAct af => participleTag ++ activeTag ++ presentTag ++ tagAForm af ; - PresPartPass af => participleTag ++ activeTag ++ presentTag ++ tagAForm af ; - AgentPart af => participleTag ++ agentTag ++ tagAForm af + PastPartAct af => consTag (tagDegreeAForm Posit af) (tagPartForm "Past") participleTag activeTag ; + PastPartPass af => consTag (tagDegreeAForm Posit af) (tagPartForm "Past") participleTag passiveTag ; + PresPartAct af => consTag (tagDegreeAForm Posit af) (tagPartForm "Pres") participleTag activeTag ; + PresPartPass af => consTag (tagDegreeAForm Posit af) (tagPartForm "Pres") participleTag passiveTag ; + AgentPart af => consTag (tagDegreeAForm Posit af) (tagPartForm "Agt") participleTag activeTag } ; tagInfForm : InfForm -> Str = \vf -> case vf of 
{ - Inf1 => infinitiveTag ; - Inf1Long => infinitiveTag ; - Inf2Iness => infinitiveTag ; - Inf2Instr => infinitiveTag ; - Inf2InessPass => infinitiveTag ; - Inf3Iness => infinitiveTag ; - Inf3Elat => infinitiveTag ; - Inf3Illat => infinitiveTag ; - Inf3Adess => infinitiveTag ; - Inf3Abess => infinitiveTag ; - Inf3Instr => infinitiveTag ; - Inf3InstrPass => infinitiveTag ; - Inf4Nom => infinitiveTag ; - Inf4Part => infinitiveTag ; - Inf5 => infinitiveTag ; - InfPresPart => infinitiveTag ; - InfPresPartAgr => infinitiveTag + Inf1 => infinitiveTag "1" ; + Inf1Long => infinitiveTag "1" ; --- insert Person[psor]=3 when used with poss suff + Inf2Iness => infinitiveTag "Ine" "2" ; + Inf2Instr => infinitiveTag "Ins" "2" ; + Inf2InessPass => infinitiveTag "Ine" "2" "Pass" ; --- inessive (Ine) like Inf2Iness, not instructive + Inf3Iness => infinitiveTag "Ine" "3" ; + Inf3Elat => infinitiveTag "Ela" "3" ; + Inf3Illat => infinitiveTag "Ill" "3" ; + Inf3Adess => infinitiveTag "Ade" "3" ; + Inf3Abess => infinitiveTag "Abe" "3" ; + Inf3Instr => infinitiveTag "Ins" "3" ; + Inf3InstrPass => infinitiveTag "Ins" "3" "Pass" ; + Inf4Nom => infinitiveTag "Nom" "4" ; + Inf4Part => infinitiveTag "Par" "4" ; + Inf5 => infinitiveTag "5" ; ---- not in UD + InfPresPart => consTag (tagDegreeAForm Posit (AN (NCase Sg Nom))) (tagPartForm "Pres") participleTag activeTag ; + InfPresPartAgr => consTag (tagDegreeAForm Posit (AN (NCase Sg Nom))) (tagPartForm "Pres") participleTag activeTag --- poss to add } ; + infinitiveTag = overload { + infinitiveTag : Str -> Tag = \i -> + consTag (mkTag "InfForm" i) (tagNumber Sg) (mkTag "VerbForm" "Inf") activeTag ; --- UD wants voice and number + infinitiveTag : Str -> Str -> Tag = \c,i -> + consTag (mkTag "Case" c) (mkTag "InfForm" i) (tagNumber Sg) (mkTag "VerbForm" "Inf") activeTag ; + infinitiveTag : Str -> Str -> Str -> Tag = \c,i,v -> + consTag (mkTag "Case" c) (mkTag "InfForm" i) (tagNumber Sg) (mkTag "VerbForm" "Inf") (mkTag "Voice" v) ; + } ; + + tagPartForm : Str -> Tag = \pf -> mkTag "PartForm" pf ; 
nounTag = mkTag "NOUN" ; adjectiveTag = mkTag "ADJ" ; @@ -93,9 +107,8 @@ oper imperativeTag = mkTag "Mood" "Imp" ; indicativeTag = mkTag "Mood" "Ind" ; - participleTag = mkTag "Part" ; + participleTag = mkTag "VerbForm" "Part" ; agentTag = mkTag "Agent" ; - infinitiveTag = mkTag "Inf" ; finiteTag = mkTag "VerbForm" "Fin" ; connegativeTag = mkTag "Connegative" "Yes" ;