diff --git a/examples/phrasebook/GreetingsSpa.gf b/examples/phrasebook/GreetingsSpa.gf index 6008688f6..673bac85e 100644 --- a/examples/phrasebook/GreetingsSpa.gf +++ b/examples/phrasebook/GreetingsSpa.gf @@ -11,6 +11,9 @@ lin GDamn = ss "joder" ; GExcuse = ss "perdón" ; GExcusePol = ss "perdone" ; + GCongratulations = ss "felicitaciones" ; + GGoodLuck = ss "buena suerte" ; + GHappyBirthday = ss "feliz cumpleaños" ; GGoodMorning, GGoodDay = ss "buenos días" ; GGoodEvening = ss "buenas tardes" ; GGoodNight = ss "buenas noches" ; diff --git a/examples/phrasebook/Implementation.html b/examples/phrasebook/Implementation.html index 41bab9f70..ff2275979 100644 --- a/examples/phrasebook/Implementation.html +++ b/examples/phrasebook/Implementation.html @@ -106,8 +106,10 @@ gfdoc - a rudimentary GF document generator. Too property = mkAP too_AdA (mkAP property) ; PropQuality property = mkAP property ; - ThePlace kind = placeNP the_Det kind ; - APlace kind = placeNP a_Det kind ; + ThePlace kind = let dd = if_then_else Det kind.isPl thePl_Det theSg_Det + in placeNP dd kind ; + APlace kind = let dd = if_then_else Det kind.isPl thePl_Det theSg_Det + in placeNP dd kind ; IMale, IFemale = mkPerson i_Pron ; YouFamMale, YouFamFemale = mkPerson youSg_Pron ; @@ -130,7 +132,11 @@ gfdoc - a rudimentary GF document generator. NNumeral n = mkCard <lin Numeral n : Numeral> ; - AHave p obj = mkCl p.name have_V2 obj ; + SHave p obj = mkS (mkCl p.name have_V2 obj) ; + SHaveNo p k = mkS negativePol (mkCl p.name have_V2 (mkNP aPl_Det k)) ; + SHaveNoMass p m = mkS negativePol (mkCl p.name have_V2 (mkNP m)) ; + QDoHave p obj = mkQS (mkQCl (mkCl p.name have_V2 obj)) ; + AHaveCurr p curr = mkCl p.name have_V2 (mkNP aPl_Det curr) ; ACitizen p n = mkCl p.name n ; ABePlace p place = mkCl p.name place.at ; @@ -166,12 +172,20 @@ These are used in Words for each language. 
} ; NPPlace : Type = {name : NP ; at : Adv ; to : Adv} ; - CNPlace : Type = {name : CN ; at : Prep ; to : Prep} ; + CNPlace : Type = {name : CN ; at : Prep ; to : Prep; isPl : Bool} ; mkCNPlace : CN -> Prep -> Prep -> CNPlace = \p,i,t -> { name = p ; at = i ; - to = t + to = t ; + isPl = False + } ; + + mkCNPlacePl : CN -> Prep -> Prep -> CNPlace = \p,i,t -> { + name = p ; + at = i ; + to = t ; + isPl = True } ; placeNP : Det -> CNPlace -> NPPlace = \det,kind -> @@ -344,7 +358,7 @@ Means of transportation Actions: the predication patterns are very often language-dependent.
-      AHasAge p num = mkCl p.name (mkNP (mkNP num L.year_N) (mkAdv "old"));
+      AHasAge p num = mkCl p.name (mkNP (mkNP num L.year_N) (ParadigmsEng.mkAdv "old"));
       AHasChildren p num = mkCl p.name have_V2 (mkNP num L.child_N) ;
       AHasRoom p num = mkCl p.name have_V2 
         (mkNP (mkNP a_Det (mkN "room")) (SyntaxEng.mkAdv for_Prep (mkNP num (mkN "person")))) ;
@@ -456,10 +470,10 @@ auxiliaries
         mkNPDay day (SyntaxEng.mkAdv on_Prep day) 
           (SyntaxEng.mkAdv on_Prep (mkNP a_Quant plNum (mkCN (mkN d)))) ;
   
-      mkCompoundPlace : Str -> Str -> Str -> {name : CN ; at : Prep ; to : Prep} = \comp, p, i ->
+      mkCompoundPlace : Str -> Str -> Str -> {name : CN ; at : Prep ; to : Prep; isPl : Bool} = \comp, p, i ->
        mkCNPlace (mkCN (P.mkN comp (mkN p))) (P.mkPrep i) to_Prep ;
   
-      mkPlace : Str -> Str -> {name : CN ; at : Prep ; to : Prep} = \p,i -> 
+      mkPlace : Str -> Str -> {name : CN ; at : Prep ; to : Prep; isPl : Bool} = \p,i -> 
         mkCNPlace (mkCN (mkN p)) (P.mkPrep i) to_Prep ;
   
       open_Adv = P.mkAdv "open" ;
diff --git a/examples/phrasebook/Makefile b/examples/phrasebook/Makefile
index f0dc1826d..4e36e2988 100644
--- a/examples/phrasebook/Makefile
+++ b/examples/phrasebook/Makefile
@@ -29,7 +29,7 @@ doc:
 	rm -f Ontology.gf
 	cat SentencesI.gf WordsEng.gf >Implementation.gf
 	gfdoc Implementation.gf
-	txt2tags -thtml phrasebook.txt
+	txt2tags -thtml --toc phrasebook.txt
 	rm -f Ontology.gf Implementation.gf
 
 upload:: Phrasebook.pgf
diff --git a/examples/phrasebook/Ontology.html b/examples/phrasebook/Ontology.html
index 0765ac4e0..48059049a 100644
--- a/examples/phrasebook/Ontology.html
+++ b/examples/phrasebook/Ontology.html
@@ -147,12 +147,16 @@ Determiners.
 Actions are typically language-dependent, not only lexically but also
 structurally. However, these ones are mostly functorial.
 
-      AHave     : Person -> Object      -> Action ;  -- you have pizzas
+      SHave       : Person -> Object      -> Sentence ;  -- you have beer
+      SHaveNo     : Person -> Kind        -> Sentence ;  -- you have no apples
+      SHaveNoMass : Person -> MassKind    -> Sentence ;  -- you have no beer
+      QDoHave     : Person -> Object      -> Question ;  -- do you have beer
+  
       AHaveCurr : Person -> Currency    -> Action ;  -- you have dollars
       ACitizen  : Person -> Citizenship -> Action ;  -- you are Swedish
       ABePlace  : Person -> Place       -> Action ;  -- you are in the bar
   
-      ByTransp : Transport -> ByTransport ;         -- by bus
+      ByTransp : Transport -> ByTransport ;          -- by bus
   
   }
 
diff --git a/examples/phrasebook/WordsFin.gf b/examples/phrasebook/WordsFin.gf index 29494ccb2..0e4e7d14c 100644 --- a/examples/phrasebook/WordsFin.gf +++ b/examples/phrasebook/WordsFin.gf @@ -208,7 +208,9 @@ concrete WordsFin of Words = SentencesFin ** mkQS (mkQCl (mkIP which_IDet trans.name) (mkVP (mkVP L.go_V) place.to)) ; IsTranspPlace trans place = - mkQS (mkQCl (E.AdvPredNP place.to L.go_V (E.PartCN (trans.name)))) ; + mkQS (mkQCl (mkCl (mkVP (mkVP (mkVP (mkV "päästä")) trans.by) place.to))) ; + -- pääseekö keskustaan bussilla + -- mkQS (mkQCl (E.AdvPredNP place.to L.go_V (E.PartCN (trans.name)))) ; -- meneekö keskustaan bussia -- modifiers of places diff --git a/examples/phrasebook/missing.txt b/examples/phrasebook/missing.txt index e05b4c3c2..88a998dfb 100644 --- a/examples/phrasebook/missing.txt +++ b/examples/phrasebook/missing.txt @@ -11,5 +11,5 @@ PhrasebookIta : PhrasebookNor : PhrasebookPol : PhrasebookRon : -PhrasebookSpa : GCongratulations GGoodLuck GHappyBirthday +PhrasebookSpa : PhrasebookSwe : diff --git a/examples/phrasebook/phrasebook.html b/examples/phrasebook/phrasebook.html index fae61468a..2d36e5fc0 100644 --- a/examples/phrasebook/phrasebook.html +++ b/examples/phrasebook/phrasebook.html @@ -2,6 +2,7 @@ + MOLTO Multilingual Phrasebook

MOLTO Multilingual Phrasebook

@@ -10,6 +11,25 @@ Showcase for project FP7-ICT-247914, Deliverable D10.2.
+

+
+

+ + +

+
+


@@ -18,6 +38,8 @@ Showcase for project FP7-ICT-247914, Deliverable D10.2. History

-

Acknowledgements

+ +

Effort and cost

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
LanguageGrammarian's language skillsGrammarian's GF skillsInformant used for developmentInformant used for testingUse of external toolsImpact of external toolsChanges on the resource grammarDevelopment time
Bulgarian######---?###
Catalan######---?##
Danish-###+++######
Dutch-###+++#####
English#####-+--_#
Finnish######---?###
French#####-+-?##
German####+++#######
Italian####---?####
Norwegian####+-+#####
Polish######+++####
Romanian######--+#######
Spanish###---?_##
Swedish#####-+-?-##
+ +

+Explanation on scores +

+ + + + + + + + + + + + + +

Example-based grammar writing prototype

+

+The figure presents the process of creating a Phrasebook using an example-based +approach for the language X, where X = {Danish, Dutch, German, Norwegian}. +

+

+ +

+ + +

+The time spent on preparing the configuration files for a grammar will not be needed +in the future, since the files are reusable for other applications. +The time for the second step can be saved if automatic tools, like Google translate, +are used. This is only possible for languages with a simpler morphology and syntax +and large corpora available. +Good results were obtained for German and Dutch with Google translate, but for +languages like Romanian or Polish, which are both complex and lack enough resources, +the results are discouraging. +

+

+If the statistical oracle works well, the only step where the presence of a human +translator is needed is the evaluation and feedback step. On average, 2 rounds of about 4 hours +each were needed for the languages for which we performed +the experiment. It is possible that more effort is needed for more complex languages. +

+ +

Conclusions (tentative)

+

+The grammarian need not be a native speaker of the language. +

+

+For many languages, the grammarian need not even know the language - native informants are +enough. +

+

+However, evaluation by native speakers is necessary. +

+

+Correct and idiomatic translations are possible. +

+

+A typical development time was 2-3 person working days per language. +

+

+Google translate helps in bootstrapping grammars, but must be checked. +

+ + +

+Resource grammars should give some more support +

+ + + +

Acknowledgements

The Phrasebook has been built in the MOLTO project funded by the European Commission.

The authors are grateful to their native speaker informants helping to bootstrap and evaluate -the grammars: Richard Bubel, Grégoire Détrez, Michal Palka, Willard Rafnsson,... +the grammars: +Richard Bubel, +Grégoire Détrez, +Karin Keijzer, +Michał Pałka, +Willard Rafnsson, +Nick Smallbone.

- + diff --git a/examples/phrasebook/phrasebook.txt b/examples/phrasebook/phrasebook.txt index 7226ae1b1..d7bfa162d 100644 --- a/examples/phrasebook/phrasebook.txt +++ b/examples/phrasebook/phrasebook.txt @@ -3,6 +3,8 @@ Krasimir Angelov, Olga Caprotti, Ramona Enache, Thomas Hallgren, Inari Listenmaa Showcase for project FP7-ICT-247914, Deliverable D10.2. +%!Encoding:utf-8 + %!postproc(html): #HR
%!postproc(html): #BSMALL %!postproc(html): #ESMALL @@ -14,6 +16,8 @@ Showcase for project FP7-ICT-247914, Deliverable D10.2. #BSMALL History +- 2 June. Version 1.0 released! +- 29 May. Link to Google translate with the current language pair and phrase. - 27 May. Polish added. - 26 May. Version 0.9: Catalan added, mass/count noun distinction to reduce overgeneration, @@ -46,24 +50,24 @@ History =Purpose= This phrasebook is a program for translating touristic phrases -between the 15 European languages included in the +between 14 European languages included in the [MOLTO http://www.molto-project.eu] project (Multilingual On-Line Translation): - Bulgarian, Catalan, Danish, Dutch, English, Finnish, French, German, Italian, Norwegian, - Polish, Romanian, Russian, Spanish, Swedish + Polish, Romanian, Spanish, Swedish It is implemented by using the GF programming language ([Grammatical Framework http://grammaticalframework.org]). -It is the first demo for the MOLTO project, released in the third month (by June 2010) -but to be updated in the course of the project. +It is the first demo for the MOLTO project, released in the third month (by June 2010). +The first version is a very small system, but it will be extended in the course of the project. 
-The phrasebook has the following requirements: +The phrasebook has the following requirement specification: - high quality: reliable translations to express yourself in any language - translation between all pairs of languages - runnable in web browsers -- runnable on mobile phones (also off-line: forthcoming for Android phones) +- runnable on mobile phones (forthcoming: Android phones) - easily extensible by new words (forthcoming: semi-automatic extensions by users) @@ -72,30 +76,57 @@ The source code resides in [``code.haskell.org/gf/examples/phrasebook/`` http://code.haskell.org/gf/examples/phrasebook/] -Current status (27 May 2010): -- small but useful coverage in abstract syntax -- reasonable implementations for all MOLTO languages except Russian -- works on web browsers calling a server -- web service not yet released, but preliminarily available in - http://www.grammaticalframework.org/demos/phrasebook/ - - =Points illustrated= -Interlingua-based translation. +Interlingua-based translation +- we translate meanings, rather than words -Incremental parsing. -The use of resource grammars and functors. +Incremental parsing +- the user is at every point guided by the list of possible next words -Example-based grammar writing and grammar induction from statistical models (Google). -Compile-time transfer: especially, in Action in Words. +The use of resource grammars and functors +- the translator was implemented on top of an earlier linguistic knowledge base, + the [GF Resource Grammar Library http://grammaticalframework.com/lib] -Quasi-incremental translation: many basic types are also used as phrases. -Disambiguation, esp. of politeness distinctions. 
+Example-based grammar writing and grammar induction from statistical models +([Google translate http://translate.google.com]) +- many of the grammars were created semi-automatically by generalization from + examples + + +Compile-time transfer: especially, in Action in Words +- the structural differences between languages are treated at compile time, + for maximal run-time efficiency + + +Quasi-incremental translation: many basic types are also used as phrases +- one can translate both words and complete sentences, and get intermediate results + + +Disambiguation, esp. of politeness distinctions +- if a phrase has many translations, each of them is shown and given an explanation + (currently just in English, later in any source language) + + +Fall-back to statistical translation +- currently just a link to Google translate (forthcoming: tailor-made statistical models) + + +Feed-back from users +- you are welcome to send comments, bug reports, and better translation suggestions! + + +The level of skills involved in grammar development +- testing different configurations (see table below) + + +Grammar testing +- use of treebanks with guided random generation for initial evaluation and regression testing + @@ -146,25 +177,15 @@ Here is the module structure as produced in GF by =To Do= -Improved translation interface -- a nicer way to show disambiguation (maybe hidden by default) - - -Complete the missing words and phrases - Disambiguation grammars for other languages than English Extend the abstract lexicon in ``Words`` by hand or (semi)automatically for - food stuff -- languages - places +- actions -Link to Google translate, for fall-back and for comparison - -Feedback facility in the UI - -Customizable distribution: make your own selection of the 2^15 language subsets +Customizable phone distribution: make your own selection of the 2^15 language subsets when downloading the phrasebook to a phone @@ -214,10 +235,151 @@ Here are the steps to follow for contributors: - 
Don't compromise quality to gain coverage: //non multa sed multum!// -==Acknowledgements== + +=Effort and cost= + +|| Language | Grammarian's language skills | Grammarian's GF skills | Informant used for development | Informant used for testing | Use of external tools | Impact of external tools | Changes on the resource grammar | Development time || +| Bulgarian | ### | ### | - | - | - | ? | # | ## | +| Catalan | ### | ### | - | - | - | ? | # | # | +| Danish | - | ### | + | + | + | ## | ## | ## | +| Dutch | - | ### | + | + | + | ## | # | ## | +| English | ## | ### | - | + | - | - | _ | # | +| Finnish | ### | ### | - | - | - | ? | # | ## | +| French | ## | ### | - | + | - | ? | # | # | +| German | # | ### | + | + | + | ## | ## | ### | +| Italian | ### | # | - | - | - | ? | ## | ## | +| Norwegian | # | ### | + | - | + | ## | # | ## | +| Polish | ### | ### | + | + | + | # | # | ## | +| Romanian | ### | ### | - | - | + | # | ### | ### | +| Spanish | ## | # | - | - | - | ? | _ | ## | +| Swedish | ## | ### | - | + | - | ? | - | ## | + + +Explanation on scores + +- Grammarian's language skills + - - : no skills + - # : passive knowledge + - ## : fluent non-native + - ### : native speaker + + +- Grammarian's GF skills + - - : no skills + - # : basic skills (2-day GF tutorial) + - ## : medium skills (previous experience of similar task) + - ### : advanced skills (resource grammar writer/substantial contributor) + + +- Informant used for development/Informant needed for testing/Use of external tools + - - : no + - + : yes + + +- Impact of external tools + - ? 
: not investigated + - - : no effect on the Phrasebook + - # : small impact (literal translation, simple idioms) + - ## : medium effect (translation of more forms of words, contextual preposition) + - ### : great effect (no extra work needed, translations are correct) + + +- Changes on the resource grammars + - - : no changes + - # : 1-3 minor changes + - ## : 4-10 minor changes, 1-3 medium changes + - ### : >10 changes of any kind + + +- Overall effort (including extra work on resource grammars) + - # : less than 8 person hours + - ## : 8-24 person hours + - ### : >24 person hours + + +=Example-based grammar writing prototype= + +The figure presents the process of creating a Phrasebook using an example-based +approach for the language X, where X = {Danish, Dutch, German, Norwegian}. + +[picpic.jpg] + +- the first step assumes an analysis of the resource grammar and extracts the necessary + information that functions that build new lexical entries would need. + A model is built so that the proper forms of the word can be rendered, + and additional information, such as gender, can be inferred. The script applies + these rules to each entry that we want to translate into the target language, and + one obtains a set of constructions. +- they are furthermore given to an external translator tool (Google translate) + or a native speaker for translation. One needs the configuration file even if the + translator is human, because formal knowledge of grammar is not assumed. +- the translations into the target language are further processed in order to + build the linearizations of the categories first, decoding the information received. + Furthermore, having the words in the lexicon, one can parse the translations of + functions with the GF parser and generalize from that. 
+- the resulting grammar is tested with the aid of a script that generates + constructions covering all the functions and categories from the grammar, along + with some other constructions that proved to be problematic in some language. + The result of the script contains for each construction in the target language + its English correspondent and the abstract syntax tree. A native speaker + evaluates the results and if corrections are needed, the algorithm runs again + with the new examples. Depending on the language skills of the grammar writer, + the changes can be made directly into the GF files, and the correct examples + given by the native informant are just kept for validating the results. + The algorithm is repeated as long as corrections are needed. + + +The time spent on preparing the configuration files for a grammar will not be needed +in the future, since the files are reusable for other applications. +The time for the second step can be saved if automatic tools, like Google translate, +are used. This is only possible for languages with a simpler morphology and syntax +and large corpora available. +Good results were obtained for German and Dutch with Google translate, but for +languages like Romanian or Polish, which are both complex and lack enough resources, +the results are discouraging. + +If the statistical oracle works well, the only step where the presence of a human +translator is needed is the evaluation and feedback step. On average, 2 rounds of about 4 hours +each were needed for the languages for which we performed +the experiment. It is possible that more effort is needed for more complex languages. + + +=Conclusions (tentative)= + +The grammarian need not be a native speaker of the language. + +For many languages, the grammarian need not even know the language - native informants are +enough. + +However, evaluation by native speakers is necessary. + +Correct and idiomatic translations are possible. 
+ +A typical development time was 2-3 person working days per language. + +Google translate helps in bootstrapping grammars, but must be checked. +- in particular, unreliable for morphologically rich languages + + +Resource grammars should give some more support +- higher-level access to constructions like negative expressions +- large-scale morphological lexica + + + + + + +=Acknowledgements= The Phrasebook has been built in the MOLTO project funded by the European Commission. The authors are grateful to their native speaker informants helping to bootstrap and evaluate -the grammars: Richard Bubel, Grégoire Détrez, Michal Palka, Willard Rafnsson,... +the grammars: +Richard Bubel, +Grégoire Détrez, +Karin Keijzer, +Michał Pałka, +Willard Rafnsson, +Nick Smallbone. diff --git a/examples/phrasebook/picpic.jpg b/examples/phrasebook/picpic.jpg new file mode 100644 index 000000000..aac20b611 Binary files /dev/null and b/examples/phrasebook/picpic.jpg differ