From 398212bd653bc8dfdd756934f41b5ed5053f7a22 Mon Sep 17 00:00:00 2001 From: leiss Date: Tue, 23 Feb 2016 16:30:39 +0000 Subject: [PATCH] add lexer and unlexer for Ancient Greek accent normalization --- gf.cabal | 2 ++ src/compiler/GF/Command/CommonCommands.hs | 14 +++++--- src/compiler/GF/Text/Lexing.hs | 4 +++ src/compiler/GF/Text/Transliterations.hs | 43 ++++++++++++++++++----- 4 files changed, 51 insertions(+), 12 deletions(-) diff --git a/gf.cabal b/gf.cabal index 308868911..5569e17af 100644 --- a/gf.cabal +++ b/gf.cabal @@ -105,8 +105,10 @@ Library PGF PGF.Internal PGF.Lexing + PGF.LexingAGreek PGF.Utilities PGF.Haskell + other-modules: PGF.Data PGF.Macros diff --git a/src/compiler/GF/Command/CommonCommands.hs b/src/compiler/GF/Command/CommonCommands.hs index 0037ba249..0dce8e894 100644 --- a/src/compiler/GF/Command/CommonCommands.hs +++ b/src/compiler/GF/Command/CommonCommands.hs @@ -101,13 +101,16 @@ commonCommands = fmap (mapCommandExec liftSIO) $ Map.fromList [ "To see transliteration tables, use command ut." ], examples = [ - mkEx "l (EAdd 3 4) | ps -code -- linearize code-like output", - mkEx "ps -lexer=code | p -cat=Exp -- parse code-like input", +-- mkEx "l (EAdd 3 4) | ps -code -- linearize code-like output", + mkEx "l (EAdd 3 4) | ps -unlexcode -- linearize code-like output", +-- mkEx "ps -lexer=code | p -cat=Exp -- parse code-like input", + mkEx "ps -lexcode | p -cat=Exp -- parse code-like input", mkEx "gr -cat=QCl | l | ps -bind -- linearization output from LangFin", mkEx "ps -to_devanagari \"A-p\" -- show Devanagari in UTF8 terminal", mkEx "rf -file=Hin.gf | ps -env=quotes -to_devanagari -- convert translit to UTF8", mkEx "rf -file=Ara.gf | ps -from_utf8 -env=quotes -from_arabic -- convert UTF8 to transliteration", - mkEx "ps -to=chinese.trans \"abc\" -- apply transliteration defined in file chinese.trans" + mkEx "ps -to=chinese.trans \"abc\" -- apply transliteration defined in file chinese.trans", + mkEx "ps -lexgreek \"a)gavoi` a)'nvrwpoi' tines*\" -- normalize ancient greek accentuation" ], exec = \opts x-> do let (os,fs) = optsAndFlags opts @@ -232,6 +235,8 @@ stringOpOptions = sort $ [ ("lextext","text-like lexer"), ("lexcode","code-like lexer"), ("lexmixed","mixture of text and code, as in LaTeX (code between $...$, \\(...)\\, \\[...\\])"), + ("lexgreek","lexer normalizing ancient Greek accentuation"), + ("lexgreek2","lexer normalizing ancient Greek accentuation for text with vowel length annotations"), ("to_cp1251","encode to cp1251 (Cyrillic used in Bulgarian resource)"), ("to_html","wrap in a html file with linebreaks"), ("to_utf8","encode to utf8 (default)"), @@ -239,6 +244,7 @@ stringOpOptions = sort $ [ ("unlexcode","code-like unlexer"), ("unlexmixed","mixture of text and code (code between $...$, \\(...)\\, \\[...\\])"), ("unchars","unlexer that puts no spaces between tokens"), + ("unlexgreek","unlexer de-normalizing ancient Greek accentuation"), ("unwords","unlexer that puts a single space between tokens (default)"), ("words","lexer that assumes tokens separated by spaces (default)") ] ++ @@ -262,4 +268,4 @@ trie = render . pptss . H.toTrie . map H.toATree -- ** Converting command input toString = unwords . toStrings -toLines = unlines . toStrings \ No newline at end of file +toLines = unlines . toStrings diff --git a/src/compiler/GF/Text/Lexing.hs b/src/compiler/GF/Text/Lexing.hs index bfe38ca83..782e6ea9a 100644 --- a/src/compiler/GF/Text/Lexing.hs +++ b/src/compiler/GF/Text/Lexing.hs @@ -3,6 +3,7 @@ module GF.Text.Lexing (stringOp,opInEnv) where import GF.Text.Transliterations import PGF.Lexing +import PGF.LexingAGreek(lexAGreek,unlexAGreek,lexAGreek2) -- HL 20.2.2016 import Data.Char (isSpace) import Data.List (intersperse) @@ -13,12 +14,15 @@ stringOp name = case name of "lextext" -> Just $ appLexer lexText "lexcode" -> Just $ appLexer lexCode "lexmixed" -> Just $ appLexer lexMixed + "lexgreek" -> Just $ appLexer lexAGreek + "lexgreek2" -> Just $ appLexer lexAGreek2 "words" -> Just $ appLexer words "bind" -> Just $ appUnlexer (unwords . bindTok) "unchars" -> Just $ appUnlexer concat "unlextext" -> Just $ appUnlexer (unlexText . unquote) "unlexcode" -> Just $ appUnlexer unlexCode "unlexmixed" -> Just $ appUnlexer (unlexMixed . unquote) + "unlexgreek" -> Just $ appUnlexer unlexAGreek "unwords" -> Just $ appUnlexer unwords "to_html" -> Just wrapHTML _ -> transliterate name diff --git a/src/compiler/GF/Text/Transliterations.hs b/src/compiler/GF/Text/Transliterations.hs index 022e8fda9..4c706bf73 100644 --- a/src/compiler/GF/Text/Transliterations.hs +++ b/src/compiler/GF/Text/Transliterations.hs @@ -239,12 +239,13 @@ transGreek = mkTransliteration "modern Greek" allTrans allCodes where transAncientGreek :: Transliteration transAncientGreek = mkTransliteration "ancient Greek" allTrans allCodes where allTrans = words $ +-- "- - - - - - - c: - - - - - - - - " ++ -- standard code point for colon: 00B7 "- - - - - - - - - - - - - - - - " ++ "i= A B G D E Z H V I K L M N X O " ++ "P R - S T Y F C Q W I- Y- - - - - " ++ "y= a b g d e z h v i k l m n x o " ++ "p r s* s t y f c q w i- y- - - - - " ++ - "a) a( a)` a(` a)' a(' a)~ a(~ A) A( A)` A(` A)' A(' A)~ A(~ " ++ + "a) a( a)` a(` a)' a(' a)~ a(~ A) A( A)` A(` A)' A(' A)~ A(~ " ++ -- 1f00-1f09,1f0a-1f0f "e) e( e)` e(` e)' e(' - - E) E( E)` E(` E)' E(' - - " ++ "h) h( h)` h(` h)' h(' h)~ h(~ H) H( H)` H(` H)' H(' H)~ H(~ " ++ "i) i( i)` i(` i)' i(' i)~ i(~ I) I( I)` I(` I)' I(' I)~ I(~ " ++ @@ -252,16 +253,42 @@ transAncientGreek = mkTransliteration "ancient Greek" allTrans allCodes where "y) y( y)` y(` y)' y(' y)~ y(~ - Y( - Y(` - Y(' - Y(~ " ++ "w) w( w)` w(` w)' w(' w)~ w(~ W) W( W)` W(` W)' W(' W)~ W(~ " ++ "a` a' e` e' h` h' i` i' o` o' y` y' w` w' - - " ++ - "a|) a|( a|)` a|(` a|)' a|(' a|)~ a|(~ - - - - - - - - " ++ -- 1f80- -- HL: a|) a|( for a|( a|) - "h|) h|( h|)` h|(` h|)' h|(' h|)~ h|(~ - - - - - - - - " ++ -- 1f90- -- HL: h|) h|( for h|( h|) - "w|) w|( w|)` w|(` w|)' w|(' w|)~ w|(~ - - - - - - - - " ++ -- 1fa0- -- HL: w|) w|( for w|( w|) + "a|) a|( a|)` a|(` a|)' a|(' a|)~ a|(~ - - - - - - - - " ++ -- 1f80- + "h|) h|( h|)` h|(` h|)' h|(' h|)~ h|(~ - - - - - - - - " ++ -- 1f90- + "w|) w|( w|)` w|(` w|)' w|(' w|)~ w|(~ - - - - - - - - " ++ -- 1fa0- "a. a_ a|` a| a|' - a~ a|~ - - - - - - - - " ++ -- 1fb0- "- - h|` h| h|' - h~ h|~ - - - - - - - - " ++ -- 1fc0- "i. i_ i=` i=' - - i~ i=~ - - - - - - - - " ++ -- 1fd0- - "y. y_ y=` y=' r) r( y~ y=~ - - - - - - - - " ++ -- 1fe0- -- HL: y=~ for y|~ - "- - w|` w| w|' - w~ w|~ - - - - - - - - " -- 1ff0- - allCodes = [0x0380 .. 0x03cf] ++ [0x1f00 .. 0x1fff] - + "y. y_ y=` y=' r) r( y~ y=~ - - - - - - - - " ++ -- 1fe0- + "- - w|` w| w|' - w~ w|~ - - - - - - - - " ++ -- 1ff0- + -- HL, Private Use Area Code Points (New Athena Unicode, Cardo, ALPHABETUM, Antioch) + -- see: http://apagreekkeys.org/technicalDetails.html + -- GreekKeys Support by Donald Mastronarde + "- - - - - - - - - e. o. R) Y) Y)` Y)' Y)~ " ++ -- e1a0-e1af + "e~ e)~ e(~ e_ e_' e_` e_) e_( e_)` e_(` e_)' e_(' E)~ E(~ E_ E. " ++ -- e1b0-e1bf + "o~ o)~ o(~ o_ o_' o_` o_) o_( o_)` o_(` o_)' o_(' O)~ O(~ O_ O. " ++ -- e1c0-e1cf + "a_` - a_~ a_)` a_(` a_)~ a_(~ - a.` a.) a.)` a.(' a.(` - - - " ++ -- eaf0-eaff + "a_' - - - a_) a_( - a_)' - a_(' a.' a.( a.)' - - - " ++ -- eb00-eb0f + "e_)~ e_(~ - - - - - e_~ - - - - - - - - " ++ -- eb20-eb2f + "- - - - - - i_~ - i_` i_' - - i_) i_)' i_( i_(' " ++ -- eb30-eb3f + "i.' i.) i.)' i.( i.` i.)` - i.(' i.(` - - - - - - - " ++ -- eb40-eb4f + "- - - - i_)` i_(` - i_)~ i_(~ - o_~ o_)~ o_(~ - - - " ++ -- eb50-eb5f + "y_` " ++ -- eb6f + "y_~ y_)` - - - y_(` - y_)~ y_(~ - y_' - - y_) y_( y_)' " ++ -- eb70-eb7f + "y_(' y.' y.( y.` y.) y.)' - - y.)` y.(' y.(` - - - - - " -- eb80-eb8f + allCodes = -- [0x00B0 .. 0x00Bf] + [0x0380 .. 0x03cf] ++ [0x1f00 .. 0x1fff] + ++ [0xe1a0 .. 0xe1af] + ++ [0xe1b0 .. 0xe1bf] + ++ [0xe1c0 .. 0xe1cf] + ++ [0xeaf0 .. 0xeaff] + ++ [0xeb00 .. 0xeb0f] + ++ [0xeb20 .. 0xeb2f] + ++ [0xeb30 .. 0xeb3f] + ++ [0xeb40 .. 0xeb4f] + ++ [0xeb50 .. 0xeb5f] ++ [0xeb6f] + ++ [0xeb70 .. 0xeb7f] + ++ [0xeb80 .. 0xeb8f] transAmharic :: Transliteration transAmharic = mkTransliteration "Amharic" allTrans allCodes where