diff --git a/lib/resource/doc/ScanMod.dot b/lib/resource/doc/ScanMod.dot new file mode 100644 index 000000000..5f3ec23ba --- /dev/null +++ b/lib/resource/doc/ScanMod.dot @@ -0,0 +1,29 @@ +digraph { + +GrammarSwe [style = "solid", shape = "box", color = "green"]; +GrammarSwe -> GrammarScand [style = "solid"]; +GrammarSwe -> SyntaxSwe [style = "solid"]; + +LexiconSwe [style = "solid", shape = "box", color = "red"]; +LexiconSwe -> MorphoSwe [style = "solid"]; + +GrammarScand [style = "solid", shape = "box"]; +GrammarScand -> SyntaxScand [style = "solid"]; + +SyntaxScand [style = "solid", shape = "ellipse"]; +SyntaxScand -> TypesScand [style = "solid"]; + +SyntaxSwe [style = "solid", shape = "ellipse", color = "yellow"]; +SyntaxSwe -> SyntaxScand [style = "solid"]; +SyntaxSwe -> TypesSwe [style = "solid"]; +SyntaxSwe -> MorphoSwe [style = "solid"]; + +MorphoSwe [style = "solid", shape = "ellipse", color="red"]; +MorphoSwe -> TypesSwe [style = "solid"]; + +TypesSwe [style = "solid", shape = "ellipse", color = "yellow"]; +TypesSwe -> TypesScand [style = "solid"]; + +TypesScand [style = "solid", shape = "ellipse"]; + +} diff --git a/lib/resource/doc/gf-resource.html b/lib/resource/doc/gf-resource.html index 29a1ffde3..ae9eaa836 100644 --- a/lib/resource/doc/gf-resource.html +++ b/lib/resource/doc/gf-resource.html @@ -47,6 +47,77 @@ Thus not primarily another theoretical framework for linguists. + +
+ cat Prop ; Nat ; + fun Even : Nat -> Prop ; ++Concrete syntax: mapping from abstract syntax trees to strings in a language +(English, French, German, Swedish,...) +
+ lin Even x = {s = x.s ++ "is" ++ "even"} ;
+
+ lin Even x = {s = x.s ++ "est" ++ "pair"} ;
+
+ lin Even x = {s = x.s ++ "ist" ++ "gerade"} ;
+
+ lin Even x = {s = x.s ++ "är" ++ "jämnt"} ;
+
+We can translate between language via the abstract syntax.
+
++ +Is it really so simple? + + + +
+ +The previous multilingual grammar breaks these rules in many situations: +
+2 and 3 is even
+la somme de 3 et de 5 est pair
+wenn 2 ist gerade, dann 2+2 ist gerade
+om 2 är jämnt, 2+2 är jämnt
+
+
+
+
+ +Instead of just strings, we need
parameters, tables, +and record types. For instance, French: +
+ param Mod = Ind | Subj ;
+ param Gen = Masc | Fem ;
+
+ lincat Nat = {s : Str ; g : Gen} ;
+ lincat Prop = {s : Mod => Str} ;
+
+ lin Even x = {s =
+ table {
+ m => x.s ++
+ case m of {Ind => "est" ; Subj => "soit"} ++
+ case x.g of {Masc => "pair" ; Fem => "paire"}
+ }
+ } ;
+
+
+
+
-Which kind of programmer is easier to find? +Which kind of a programmer is easier to find?
Even x = - let jämn = case+of { - => "jämn" ; - => "jämnt" ; - => "jämna" + let jämn = case <x.n,x.g> of { + <Sg,Utr> => "jämn" ; + <Sg,Neutr> => "jämnt" ; + <Pl,_> => "jämna" } in {s = table { @@ -103,6 +174,7 @@ To use library functions for syntax and morphology:
-Currently no semantics,
-no language-specific structures if not
-necessary for expressivity.
+Currently,
+
(Warning for linguists:) theoretical innovation in -syntax +syntax (and it will all be hidden anyway!) diff --git a/lib/resource/doc/spraakdata2005.html b/lib/resource/doc/spraakdata2005.html index 1e4219c0c..356acc8a2 100644 --- a/lib/resource/doc/spraakdata2005.html +++ b/lib/resource/doc/spraakdata2005.html @@ -25,7 +25,7 @@ Aarne Ranta
@@ -33,6 +33,10 @@ Swedish morphology and lexicon in GF
+Syntax case study: Swedish determiners + +
+ Syntax case study: Swedish sentence structure
@@ -40,6 +44,357 @@ Syntax case study: Swedish sentence structure Danish and Norwegian through parametrization + +
+ +Designed to be nice for ordinary programmers to use. + +
+ +Mission: to make natural-language applications available for +ordinary programmers, in tasks like +
+ cat Prop ; Nat ; + fun Even : Nat -> Prop ; ++Concrete syntax: mapping from abstract syntax trees to strings in a language +(English, French, German, Swedish,...) +
+ lin Even x = {s = x.s ++ "is" ++ "even"} ;
+
+ lin Even x = {s = x.s ++ "est" ++ "pair"} ;
+
+ lin Even x = {s = x.s ++ "ist" ++ "gerade"} ;
+
+ lin Even x = {s = x.s ++ "är" ++ "jämnt"} ;
+
+We can translate between language via the abstract syntax.
+
++ +Is it really so simple? + + + +
+ +The previous multilingual grammar breaks these rules in many situations: +
+2 and 3 is even
+la somme de 3 et de 5 est pair
+wenn 2 ist gerade, dann 2+2 ist gerade
+om 2 är jämnt, 2+2 är jämnt
+
+
+
+
+ +Instead of just strings, we need
parameters, tables, +and record types. For instance, French: +
+ param Mod = Ind | Subj ;
+ param Gen = Masc | Fem ;
+
+ lincat Nat = {s : Str ; g : Gen} ;
+ lincat Prop = {s : Mod => Str} ;
+
+ lin Even x = {s =
+ table {
+ m => x.s ++
+ case m of {Ind => "est" ; Subj => "soit"} ++
+ case x.g of {Masc => "pair" ; Fem => "paire"}
+ }
+ } ;
+
+
+
+
+
++ +Which kind of a programmer is easier to find? +
+ +In main-stream programming, sorting algorithms are not +written by hand but taken from libraries. + +
+ +In the same way, we want to create grammar libraries that encapsulate +basic linguistic facts. + +
+ +Cf. the Java success story: the language is just a half of the +success - libraries are another half. + + + + +
+ Even x =
+ let jämn = case <x.n,x.g> of {
+ <Sg,Utr> => "jämn" ;
+ <Sg,Neutr> => "jämnt" ;
+ <Pl,_> => "jämna"
+ }
+ in
+ {s = table {
+ Main => x.s ! Nom ++ "är" ++ jämn ;
+ Inv => "är" ++ x.s ! Nom ++ jämn ;
+ Sub => x.s ! Nom ++ "är" ++ jämn
+ }
+ }
+
+To use library functions for syntax and morphology:
++ Even = predA (regA "jämn") ; ++ + + + +
+
+How do we organize and present the library?
+
+
+
+Where do we get the data from?
+
+
+ +Extra constraint: we want open-source free software. + + + + + + +
+ +Basic lexicon of structural, common, and irregular words + +
+ +Basic syntactic structures + +
+
+Currently,
+
+ +Semantic coverage: you can express whatever you want. + +
+ +Usability as library for non-linguists. + +
+ +(Bonus for linguists:) nice generalizations w.r.t. language +families, using the module system of GF. + + + + +
+ +Semantic correctness +
+ colourless green ideas sleep furiously + + the time is seventy past forty-two ++ +
+ +(Warning for linguists:) theoretical innovation in +syntax (and it will all be hidden anyway!) + + + + +
+ +But we do not try to give semantics once and +for all for the whole language. + +
+ +Instead, we expect semantics to be given in +application grammars built on semantic models +of different domains. + +
+ +Example application: number theory +
+ fun Even : Nat -> Prop ; -- a mathematical predicate + + lin Even = predA (regA "even") ; -- English translation + lin Even = predA (regA "pair") ; -- French translation + lin Even = predA (regA "jämn") ; -- Swedish translation ++How could the resource predict that just these +translations are correct in this domain? + + + + + + +
+ +
+ +
+ +
+ +
+ +Reservations: +
- mkN : (apa,apan,apor,aporna : Str) -> N ; - mk2N : (nyckel,nycklar : Str) -> N ; + mkN : (apa,apan,apor,aporna : Str) -> N ; + mk2N : (nyckel,nycklar : Str) -> N ; mkV : (supa,super,sup,söp,supit,supen : Str) -> V ; regV : (tala : Str) -> V ; @@ -345,9 +700,9 @@ function and a couple of "regular" patterns.Construction functions for subcategorization.
- mkV2 : V -> Preposition -> V2 ; - dirV2 : V -> V2 ; - mkV3 : V -> Preposition -> Preposition -> V3 ; + mkV2 : V -> Preposition -> V2 ; + dirV2 : V -> V2 ; + mkV3 : V -> Preposition -> Preposition -> V3 ;@@ -363,7 +718,7 @@ Idea: search for characteristic forms of paradigms in a corpus. For instance, if you find klocka and klockor, add
- klocka_N = decl1 "klocka" ; + klocka = decl1 "klocka" ;to the lexicon. @@ -392,7 +747,7 @@ In general, exclude stems shorter than 3 characters.
-It is necessary to check the results manually. +To guarantee quality, it is necessary to check the results manually. @@ -439,7 +794,7 @@ with compound analysis, 50,000 without -
-The relevant fragment consists of 4 categories and -3 functions +The relevant fragment consists of 5 categories and +4 functions
cat
N ; -- simple (lexical) common noun, e.g. "bil"
@@ -477,34 +832,113 @@ The relevant fragment consists of 4 categories and
UseN : N -> CN ;
UseA : A -> AP ;
DetCN : Det -> CN -> NP ;
- ModA : A -> CN -> CN ;
+ ModAP : AP -> CN -> CN ;
-+ +Complex common nouns have the following linearization type
- lincat
- CN = {s : Number => SpeciesP => Case => Str ; g : Gender ; isComplex : Bool} ;
- NP = {s : NPForm => Str ; g : Gender ; n : Number ; p : Person} ;
- Det = {s : Gender => Str ; n : Number ; b : SpeciesP} ;
- AP = {s : AdjFormPos => Case => Str} ;
+ CN = {
+ s : Number => SpeciesP => Case => Str ;
+ g : Gender ;
+ isComplex : Bool
+ } ;
-Here we use some new parameter types:
+Here we use a new parameter type,
- param - SpeciesP = IndefP | DefP Species ; - NPForm = PNom | PAcc | PGen GenNum ; - GenNum = ASg Gender | APl ; - AdjFormPos = Strong GenNum | Weak ; + SpeciesP = IndefP | DefP Species ; ++to distinguish between three forms: +
+ IndefP => "stor bil" + DefP Indef => "stora bil" + DefP Def => "stora bilen"+ + +
+ UseN hus =
+ {s = \\n,b,c => hus.s ! n ! unSpeciesP b ! c ;
+ g = hus.g ;
+ p = False
+ } ;
+
+ ModAP Stor Nybil =
+ {s = \\n, b, c =>
+ let
+ stor = Stor.s ! mkAdjForm (unSpeciesAdjP b) n Nybil.g ! Nom ;
+ nybil = Nybil.s ! n ! b ! c
+ in preOrPost God.p nybil god ;
+ g = Nybil.g ;
+ p = True
+ } ;
+
+
+
+
+
+
+ NP = {
+ s : NPForm => Str ;
+ g : Gender ;
+ n : Number ;
+ p : Person
+ } ;
+
+Since pronouns have special accusative and possessive forms,
+the case of noun phrases is richer than the case of nouns.
++ NPForm = PNom | PAcc | PGen GenNum ; + + GenNum = ASg Gender | APl ; ++ + + + +
+ Det = {
+ s : Gender => Str ;
+ n : Number ;
+ b : SpeciesP
+ } ;
+
+Some examples:
+
+ en_Det = {s = genForms "en" "ett" ; n = Sg ; b = IndefP} ;
+
+ denna_Det = {s = genForms "denna" "detta" ; n = Sg ; b = DefP Indef} ;
+
+ den_Det = {s = genForms "den" "det" ; n = Sg ; b = DefP Def} ;
+
+ dessa_Det = {s = \\ _ => "dessa" ; n = Pl ; b = DefP Indef} ;
+
+
@@ -528,11 +962,24 @@ Mutual agreement:
-
++ DefNP : CN -> NP ; ++So we can deal with the fact that only complex common nouns +get a determiner word. +
+ DefNP storbil = case storbil.isComplex of {
+ True => DetCN den_det storbil ;
+ False => DetCN empty_Det storbil
+ }
+ where
+ empty_Det = {s = \\_ => [] ; n = Sg ; b = DefP Def} ;
+
-
@@ -541,74 +988,473 @@ Mutual agreement:
-
-jag har inte sett dig idag
-dig jag har inte sett idag
-idag har jag inte sett dig
-inte har jag sett dig idag
-sett har jag inte dig idag (??)
-sett dig har jag inte idag
- +
+ jag har inte sett dig idag + dig jag har inte sett idag + idag har jag inte sett dig + inte har jag sett dig idag + *sett har jag inte dig idag + sett dig har jag inte idag +Rigid order in questions... -
- -har jag inte sett dig idag - -
+
+ har jag inte sett dig idag +... and in subordinate clauses -
- -jag inte har sett dig idag - -
+
+ jag inte har sett dig idag ++
+ +A sentence (Sats) consists +of different fields +
+ Nexus Field Content Field + ----------- ------------- + V1 N1 A1 V2 N2 A2 + har jag inte sett dig idag ++ + + + +
+ Fundament Nexus Field Content Field + --------- ----------- ------------- + V1 N1 A1 V2 N2 A2 + jag har _ inte sett dig idag + inte har jag _ sett dig idag + dig har jag inte sett _ idag + idag har jag inte sett dig _ ++The inverted clause has a rigid order +
+ V1 N1 A1 V2 N2 A2 + har jag inte sett dig idag ++The subordinate clause has another rigid order +
+ N1 A1 V1 V2 N2 A2 + jag inte har sett dig idag ++ +
+ Sats = {
+ s1 : Str ; -- V1
+ s2 : Str ; -- N1
+ s3 : Str ; -- A1
+ s4 : Str ; -- V2
+ s5 : Str ; -- N2
+ s6 : Str -- A2
+ } ;
+
+A "finished" sentence has three parameters,
+
+ S = {s : Order => Str} ;
+
+ Order = Main | Inv | Sub ;
+
+
+
+
+ SSats sats =
+ let
+ har = sats.s1 ;
+ jag = sats.s2 ;
+ inte = sats.s3 ;
+ sett = sats.s4 ;
+ dig = sats.s5 ;
+ idag = sats.s6
+ in {s = table {
+ Main => variants {
+ jag ++ har ++ inte ++ sett ++ dig ++ idag ;
+ inte ++ har ++ jag ++ sett ++ dig ++ idag ;
+ dig ++ har ++ jag ++ inte ++ sett ++ idag ;
+ idag ++ har ++ jag ++ inte ++ sett ++ dig
+ } ;
+ Inv => har ++ jag ++ inte ++ sett ++ dig ++ idag ;
+ Sub => jag ++ inte ++ har ++ sett ++ dig ++ idag
+ }
+ } ;
+
+
+
+
+
++ festat har jag igår + sova ska jag idag ++
+ sovit har jag idag + *sett har jag dig idag + sett dig har jag idag ++
+ du har sagt mig att han kommer idag + att han kommer idag har du sagt mig ++ +
+ +This means that we don't explicitly write records, but +use a handful of functions for writing records: +
+ mkSats : NounPhrase -> Verb -> Sats = \subj,verb ->
+ let
+ harsovit = verbSForm verb Act
+ in
+ {s1 = \\sf => (harsovit sf).fin ;
+ s2 = subj.s ! PNom ;
+ s3 = negation ;
+ s4 = \\sf => (harsovit sf).inf ++ verb.s1 ;
+ s5, s6, s7 = [] ;
+ e3,e4,e5,e6,e7 = False
+ } ;
+
+
+
+
+
+
+ insertObject : Sats -> Str -> Sats = \sats, obj ->
+ {s5 = sats.s5 ++ obj ;
+ s1 = sats.s1 ; s2 = sats.s2 ; s3 = sats.s3 ; s4 = sats.s4 ; s6 = sats.s6 ; s7 = sats.s7 ;
+ e5 = True ;
+ e3 = sats.e3 ; e4 = sats.e4 ; e6 = sats.e6 ; e7 = sats.e7
+ } ;
+
+ insertExtrapos : Sats -> Str -> Sats = ...
+
+ mkSatsObject : NounPhrase -> Verb -> Str -> Sats = \subj,verb,obj ->
+ insertObject (mkSats subj verb) obj ;
+
+ mkSatsCopula : NounPhrase -> Str -> Sats = \subj,obj ->
+ mkSatsObject subj (verbVara ** {s1 = []}) obj ;
+
+N.B. these would be nicer to define if GF had record field overwriting:
+
+ insertObject : Sats -> Str -> Sats = \sats, obj ->
+ sats ** {s5 = sats.s5 ++ obj ; e5 = True} ;
+
+
+ -- du sover + SatsV = mkSats ; + + -- du ser mig + SatsV2 subj verb obj = + mkSatsObject subj verb (verb.s2 ++ obj.s ! PAcc) ; + + -- du föredrar honom framför mig + SatsV3 subj verb obj1 obj2 = + mkSatsObject subj verb (verb.s2 ++ obj1.s ! PAcc ++ verb.s3 ++ obj2.s ! PAcc) ; + + -- du säger att det regnar + SatsVS subj verb sent = + insertExtrapos (mkSats subj verb) (optStr infinAtt ++ sent.s ! Sub) ; + + -- du undrar vem som kommer + SatsVQ subj verb quest = + insertExtrapos (mkSats subj verb) (quest.s ! IndirQ) ; ++ -
+ -- du berättade mig att det hade regnat + SatsV2S subj verb obj sent = + insertExtrapos + (mkSatsObject subj verb (verb.s2 ++ obj.s ! PAcc)) + (optStr infinAtt ++ sent.s ! Sub) ; + + -- du frågar mig om det regnar + SatsV2Q subj verb obj quest = + insertExtrapos + (mkSatsObject subj verb (verb.s2 ++ obj.s ! PAcc)) + (quest.s ! IndirQ) ; + + -- du är trött + SatsAP subj adj = + mkSatsCopula subj (adj.s ! predFormAdj subj.g subj.n ! Nom) ; +
+ +We have tried to add at least those patterns that are meaningful in +the language-independent API.
+ på torget har jag sett dig idag + idag har jag sett dig på torget + ? idag på torget har jag sett dig ++Interrogative and relative pronouns +
+ som jag har sett idag + Vem har du sett idag? + När och var har du sett henne? ++The resource grammar has an old treatment without topology: can we +make it nicer? + +
+ den stora bilen, denna stora bil + den store bil, denne store bil + den store bilen, denne store bilen ++
+ +Can we abstract away from the differences and build the three +grammars together without copy and paste? + + + + +
+ interface Agreement = {
+ param Agr ; Case ;
+ oper subject : Case
+ }
+
+ incomplete concrete PredAgr of Pred = {
+ lincat
+ NP = {s : Case => Str ; a : Agr} ;
+ VP = {s : Agr => Str} ;
+ lin
+ PredVP np vp = {s = np.s ! subject ++ vp.s ! np.a} ;
+ }
+
+ instance AgreementFin of Agreement = {
+ param Agr = {n : Number ; p : Person} ;
+ param Case = Nom | Gen | ... | Instr ; -- 14 values
+ oper subject = Nom ;
+ }
+
+ concrete PredFin of Pred = PredAgr with (Agreement = AgreementFin) ;
+
+
+
+
+
+
++ param + Gender ; + NounGender ; ++Swedish instance: +
+ Gender = Utr | Neutr ; + NounGender = NUtr Sex | Neutr ; ++Danish instance: +
+ Gender = Utr | Neutr ; + NounGender : Type = Gender ; ++Norwegian instance: +
+ Gender = Masc | Fem | Neutr ; + NounGender : Type = Gender ; ++ + + + +
+ oper + specDefPhrase : Species ; + verbVara, verbHava, verbSkola, verbFinnas : V ; + relPron : RP ; + comparÄn, infinAtt, negInte : Str ; ++Swedish instance: +
+ specDefPhrase = Def ; + verbVara = vara_V ; ... + relPron = relPronForms "som" "vars" ; + comparÄn = "än" ; ++Danish instance: +
+ specDefPhrase = Indef ; + verbVara = være_V ; ... + relPron = relPronForms "som" "hvis" ; + comparÄn = "end" ; ++Norwegian instance: +
+ specDefPhrase = Def ; + verbVara = være_V ; ... + relPron = relPronForms "som" "hvis" ; + comparÄn = "enn" ; ++ + + + + +
+ DefNP storbil = case storbil.isComplex of {
+ True => DetCN den_det storbil ;
+ False => DetCN empty_Det storbil
+ }
+ where
+ empty_Det = {s = \\_ => [] ; n = Sg ; b = DefP specDefPhrase} ;
+
+For denna, which is in the lexicon, we just have different entries
+
+ {s = genForms "denna" "detta" ; n = Sg ; b = DefP Indef} -- Swe
+ {s = genForms "denne" "dette" ; n = Sg ; b = DefP specDefPhrase} -- Dan, Nor
+
+
+
+
+
+
++ word + paradigm in Swedish ---> word + paradigm in Danish/Norwegian ++The word is transformed by "sound laws", the paradigm by a general correspondance. +Example: +
+ decl1 "jacka" ---> decl1 "jakke" ++This is computed to +
+ {s : SubstForm => Str = table {
+ SF Sg Indef Nom => "jakke" ;
+ SF Sg Indef Gen => "jakkes" ;
+ SF Sg Def Nom => "jakka" ;
+ SF Sg Def Gen => "jakkas" ;
+ SF Pl Indef Nom => "jakker" ;
+ SF Pl Indef Gen => "jakkers" ;
+ SF Pl Def Nom => "jakkene" ;
+ SF Pl Def Gen => "jakkenes"
+ } ;
+ g = Fem
+ }
+
+Notice: we do not need to assume translation equivalence.
+
+
+
+
+