add lexer and unlexer for Ancient Greek accent normalization

2016-02-23 16:30:39 +00:00
parent 8548979d1c
commit df2901c9c0
4 changed files with 51 additions and 12 deletions
--- a/gf.cabal
+++ b/gf.cabal
@@ -105,8 +105,10 @@ Library
    PGF
    PGF.Internal
    PGF.Lexing
+    PGF.LexingAGreek
    PGF.Utilities
    PGF.Haskell
+
  other-modules:
    PGF.Data
    PGF.Macros
--- a/src/compiler/GF/Command/CommonCommands.hs
+++ b/src/compiler/GF/Command/CommonCommands.hs
@@ -101,13 +101,16 @@ commonCommands = fmap (mapCommandExec liftSIO) $ Map.fromList [
       "To see transliteration tables, use command ut."
       ],
     examples = [
-       mkEx "l (EAdd 3 4) | ps -code         -- linearize code-like output",
-       mkEx "ps -lexer=code | p -cat=Exp     -- parse code-like input",
+--       mkEx "l (EAdd 3 4) | ps -code         -- linearize code-like output",
+       mkEx "l (EAdd 3 4) | ps -unlexcode    -- linearize code-like output",
+--       mkEx "ps -lexer=code | p -cat=Exp     -- parse code-like input",
+       mkEx "ps -lexcode | p -cat=Exp        -- parse code-like input",
       mkEx "gr -cat=QCl | l | ps -bind      -- linearization output from LangFin",
       mkEx "ps -to_devanagari \"A-p\"         -- show Devanagari in UTF8 terminal",
       mkEx "rf -file=Hin.gf | ps -env=quotes -to_devanagari -- convert translit to UTF8",
       mkEx "rf -file=Ara.gf | ps -from_utf8 -env=quotes -from_arabic -- convert UTF8 to transliteration",
-       mkEx "ps -to=chinese.trans \"abc\"      -- apply transliteration defined in file chinese.trans"
+       mkEx "ps -to=chinese.trans \"abc\"      -- apply transliteration defined in file chinese.trans",
+       mkEx "ps -lexgreek \"a)gavoi` a)'nvrwpoi' tines*\" -- normalize ancient greek accentuation"
       ],
     exec = \opts x-> do
               let (os,fs) = optsAndFlags opts
@@ -232,6 +235,8 @@ stringOpOptions = sort $ [
       ("lextext","text-like lexer"),
       ("lexcode","code-like lexer"),
       ("lexmixed","mixture of text and code, as in LaTeX (code between $...$, \\(...)\\, \\[...\\])"),
+       ("lexgreek","lexer normalizing ancient Greek accentuation"),
+       ("lexgreek2","lexer normalizing ancient Greek accentuation for text with vowel length annotations"),
       ("to_cp1251","encode to cp1251 (Cyrillic used in Bulgarian resource)"),
       ("to_html","wrap in a html file with linebreaks"),
       ("to_utf8","encode to utf8 (default)"),
@@ -239,6 +244,7 @@ stringOpOptions = sort $ [
       ("unlexcode","code-like unlexer"),
       ("unlexmixed","mixture of text and code (code between $...$, \\(...)\\, \\[...\\])"),
       ("unchars","unlexer that puts no spaces between tokens"),
+       ("unlexgreek","unlexer de-normalizing ancient Greek accentuation"),
       ("unwords","unlexer that puts a single space between tokens (default)"),
       ("words","lexer that assumes tokens separated by spaces (default)")
       ] ++
@@ -262,4 +268,4 @@ trie = render . pptss . H.toTrie . map H.toATree

 -- ** Converting command input
 toString  = unwords . toStrings
-toLines = unlines . toStrings
+toLines = unlines . toStrings
--- a/src/compiler/GF/Text/Lexing.hs
+++ b/src/compiler/GF/Text/Lexing.hs
@@ -3,6 +3,7 @@ module GF.Text.Lexing (stringOp,opInEnv) where

 import GF.Text.Transliterations
 import PGF.Lexing
+import PGF.LexingAGreek(lexAGreek,unlexAGreek,lexAGreek2) -- HL 20.2.2016

 import Data.Char (isSpace)
 import Data.List (intersperse)
@@ -13,12 +14,15 @@ stringOp name = case name of
  "lextext"    -> Just $ appLexer lexText
  "lexcode"    -> Just $ appLexer lexCode
  "lexmixed"   -> Just $ appLexer lexMixed
+  "lexgreek"   -> Just $ appLexer lexAGreek
+  "lexgreek2"  -> Just $ appLexer lexAGreek2
  "words"      -> Just $ appLexer words
  "bind"       -> Just $ appUnlexer (unwords . bindTok)
  "unchars"    -> Just $ appUnlexer concat
  "unlextext"  -> Just $ appUnlexer (unlexText . unquote)
  "unlexcode"  -> Just $ appUnlexer unlexCode
  "unlexmixed" -> Just $ appUnlexer (unlexMixed . unquote)
+  "unlexgreek" -> Just $ appUnlexer unlexAGreek
  "unwords"    -> Just $ appUnlexer unwords
  "to_html"    -> Just wrapHTML
  _ -> transliterate name
--- a/src/compiler/GF/Text/Transliterations.hs
+++ b/src/compiler/GF/Text/Transliterations.hs
@@ -239,12 +239,13 @@ transGreek = mkTransliteration "modern Greek" allTrans allCodes where
 transAncientGreek :: Transliteration
 transAncientGreek = mkTransliteration "ancient Greek" allTrans allCodes where
 allTrans = words $
+--   "-  -  -  -  -  -  -  c: -  -  -  -  -  -  -  - " ++  -- standard code point for colon: 00B7
   "-  -  -  -  -  -  -  -  -  -  -  -  -  -  -  - " ++
   "i= A  B  G  D  E  Z  H  V  I  K  L  M  N  X  O  " ++
   "P  R  -  S  T  Y  F  C  Q  W  I- Y- -  -  -  -  " ++
   "y= a  b  g  d  e  z  h  v  i  k  l  m  n  x  o  " ++
   "p  r  s* s  t  y  f  c  q  w  i- y- -  -  -  -  " ++
-   "a)  a(  a)` a(` a)' a(' a)~ a(~ A)  A(  A)` A(` A)' A(' A)~ A(~ " ++
+   "a)  a(  a)` a(` a)' a(' a)~ a(~ A)  A(  A)` A(` A)' A(' A)~ A(~ " ++  -- 1f00-1f09,1f0a-1f0f
   "e)  e(  e)` e(` e)' e(' -   -   E)  E(  E)` E(` E)' E(' -   -   " ++
   "h)  h(  h)` h(` h)' h(' h)~ h(~ H)  H(  H)` H(` H)' H(' H)~ H(~ " ++
   "i)  i(  i)` i(` i)' i(' i)~ i(~ I)  I(  I)` I(` I)' I(' I)~ I(~ " ++
@@ -252,16 +253,42 @@ transAncientGreek = mkTransliteration "ancient Greek" allTrans allCodes where
   "y)  y(  y)` y(` y)' y(' y)~ y(~ -   Y(  -   Y(` -   Y(' -   Y(~ " ++
   "w)  w(  w)` w(` w)' w(' w)~ w(~ W)  W(  W)` W(` W)' W(' W)~ W(~ " ++
   "a`  a'  e`  e'  h`  h'  i`  i'  o`  o'  y`  y'  w`  w'  -   -   " ++
-   "a|) a|( a|)` a|(` a|)' a|(' a|)~ a|(~ - - - - - - - - " ++ -- 1f80-  -- HL: a|) a|( for a|( a|)
-   "h|) h|( h|)` h|(` h|)' h|(' h|)~ h|(~ - - - - - - - - " ++ -- 1f90-  -- HL: h|) h|( for h|( h|)
-   "w|) w|( w|)` w|(` w|)' w|(' w|)~ w|(~ - - - - - - - - " ++ -- 1fa0-  -- HL: w|) w|( for w|( w|)
+   "a|) a|( a|)` a|(` a|)' a|(' a|)~ a|(~ - - - - - - - - " ++ -- 1f80-  
+   "h|) h|( h|)` h|(` h|)' h|(' h|)~ h|(~ - - - - - - - - " ++ -- 1f90-  
+   "w|) w|( w|)` w|(` w|)' w|(' w|)~ w|(~ - - - - - - - - " ++ -- 1fa0-  
   "a.  a_  a|` a|  a|'  -  a~ a|~ - - - - - - - - " ++ -- 1fb0-
   "-  -  h|` h|  h|'  -  h~ h|~ - - - - - - - - " ++ -- 1fc0-
   "i. i_ i=` i=' -    -  i~ i=~ - - - - - - - - " ++ -- 1fd0-
-   "y. y_ y=` y=' r)   r( y~ y=~ - - - - - - - - " ++ -- 1fe0-   -- HL: y=~ for y|~
-   "-  -  w|` w|  w|'  -  w~ w|~ - - - - - - - - "    -- 1ff0-
- allCodes = [0x0380 .. 0x03cf] ++ [0x1f00 .. 0x1fff]
-
+   "y. y_ y=` y=' r)   r( y~ y=~ - - - - - - - - " ++ -- 1fe0-   
+   "-  -  w|` w|  w|'  -  w~ w|~ - - - - - - - - " ++ -- 1ff0-
+   -- HL, Private Use Area Code Points (New Athena Unicode, Cardo, ALPHABETUM, Antioch)
+   -- see: http://apagreekkeys.org/technicalDetails.html
+   --      GreekKeys Support by Donald Mastronarde
+   "- - - - - - - - - e. o. R) Y) Y)` Y)' Y)~ "    ++ -- e1a0-e1af  
+   "e~ e)~ e(~ e_ e_' e_` e_) e_( e_)` e_(` e_)' e_(' E)~ E(~ E_ E. " ++ -- e1b0-e1bf
+   "o~ o)~ o(~ o_ o_' o_` o_) o_( o_)` o_(` o_)' o_(' O)~ O(~ O_ O. " ++ -- e1c0-e1cf
+   "a_` - a_~ a_)` a_(` a_)~ a_(~ - a.` a.) a.)` a.(' a.(` - - - "    ++ -- eaf0-eaff  
+   "a_' - - - a_) a_( - a_)' - a_(' a.' a.( a.)' - - - "              ++ -- eb00-eb0f  
+   "e_)~ e_(~ - - - - - e_~ - - - - - - - - "                         ++ -- eb20-eb2f
+   "- - - - - - i_~ - i_` i_' - - i_) i_)' i_( i_(' "                 ++ -- eb30-eb3f   
+   "i.' i.) i.)' i.( i.` i.)` - i.(' i.(` - - - - - - - "             ++ -- eb40-eb4f
+   "- - - - i_)` i_(` - i_)~ i_(~ - o_~ o_)~ o_(~ - - - "             ++ -- eb50-eb5f
+   "y_` " ++ -- eb6f
+   "y_~ y_)` - - - y_(` - y_)~ y_(~ - y_' - - y_) y_( y_)' "  ++         -- eb70-eb7f
+   "y_(' y.' y.( y.` y.) y.)' - - y.)` y.(' y.(` - - - - - "             -- eb80-eb8f
+ allCodes =  -- [0x00B0 .. 0x00Bf] 
+             [0x0380 .. 0x03cf] ++ [0x1f00 .. 0x1fff] 
+          ++ [0xe1a0 .. 0xe1af] 
+          ++ [0xe1b0 .. 0xe1bf]
+          ++ [0xe1c0 .. 0xe1cf]
+          ++ [0xeaf0 .. 0xeaff]
+          ++ [0xeb00 .. 0xeb0f]
+          ++ [0xeb20 .. 0xeb2f]
+          ++ [0xeb30 .. 0xeb3f]
+          ++ [0xeb40 .. 0xeb4f]
+          ++ [0xeb50 .. 0xeb5f] ++ [0xeb6f]
+          ++ [0xeb70 .. 0xeb7f]
+          ++ [0xeb80 .. 0xeb8f]
 
 transAmharic :: Transliteration
 transAmharic = mkTransliteration "Amharic" allTrans allCodes where