forked from GitHub/gf-core
add lexer and unlexer for Ancient Greek accent normalization
This commit is contained in:
2
gf.cabal
2
gf.cabal
@@ -105,8 +105,10 @@ Library
|
||||
PGF
|
||||
PGF.Internal
|
||||
PGF.Lexing
|
||||
PGF.LexingAGreek
|
||||
PGF.Utilities
|
||||
PGF.Haskell
|
||||
|
||||
other-modules:
|
||||
PGF.Data
|
||||
PGF.Macros
|
||||
|
||||
@@ -101,13 +101,16 @@ commonCommands = fmap (mapCommandExec liftSIO) $ Map.fromList [
|
||||
"To see transliteration tables, use command ut."
|
||||
],
|
||||
examples = [
|
||||
mkEx "l (EAdd 3 4) | ps -code -- linearize code-like output",
|
||||
mkEx "ps -lexer=code | p -cat=Exp -- parse code-like input",
|
||||
-- mkEx "l (EAdd 3 4) | ps -code -- linearize code-like output",
|
||||
mkEx "l (EAdd 3 4) | ps -unlexcode -- linearize code-like output",
|
||||
-- mkEx "ps -lexer=code | p -cat=Exp -- parse code-like input",
|
||||
mkEx "ps -lexcode | p -cat=Exp -- parse code-like input",
|
||||
mkEx "gr -cat=QCl | l | ps -bind -- linearization output from LangFin",
|
||||
mkEx "ps -to_devanagari \"A-p\" -- show Devanagari in UTF8 terminal",
|
||||
mkEx "rf -file=Hin.gf | ps -env=quotes -to_devanagari -- convert translit to UTF8",
|
||||
mkEx "rf -file=Ara.gf | ps -from_utf8 -env=quotes -from_arabic -- convert UTF8 to transliteration",
|
||||
mkEx "ps -to=chinese.trans \"abc\" -- apply transliteration defined in file chinese.trans"
|
||||
mkEx "ps -to=chinese.trans \"abc\" -- apply transliteration defined in file chinese.trans",
|
||||
mkEx "ps -lexgreek \"a)gavoi` a)'nvrwpoi' tines*\" -- normalize ancient greek accentuation"
|
||||
],
|
||||
exec = \opts x-> do
|
||||
let (os,fs) = optsAndFlags opts
|
||||
@@ -232,6 +235,8 @@ stringOpOptions = sort $ [
|
||||
("lextext","text-like lexer"),
|
||||
("lexcode","code-like lexer"),
|
||||
("lexmixed","mixture of text and code, as in LaTeX (code between $...$, \\(...)\\, \\[...\\])"),
|
||||
("lexgreek","lexer normalizing ancient Greek accentuation"),
|
||||
("lexgreek2","lexer normalizing ancient Greek accentuation for text with vowel length annotations"),
|
||||
("to_cp1251","encode to cp1251 (Cyrillic used in Bulgarian resource)"),
|
||||
("to_html","wrap in a html file with linebreaks"),
|
||||
("to_utf8","encode to utf8 (default)"),
|
||||
@@ -239,6 +244,7 @@ stringOpOptions = sort $ [
|
||||
("unlexcode","code-like unlexer"),
|
||||
("unlexmixed","mixture of text and code (code between $...$, \\(...)\\, \\[...\\])"),
|
||||
("unchars","unlexer that puts no spaces between tokens"),
|
||||
("unlexgreek","unlexer de-normalizing ancient Greek accentuation"),
|
||||
("unwords","unlexer that puts a single space between tokens (default)"),
|
||||
("words","lexer that assumes tokens separated by spaces (default)")
|
||||
] ++
|
||||
@@ -262,4 +268,4 @@ trie = render . pptss . H.toTrie . map H.toATree
|
||||
|
||||
-- ** Converting command input
|
||||
toString = unwords . toStrings
|
||||
toLines = unlines . toStrings
|
||||
toLines = unlines . toStrings
|
||||
|
||||
@@ -3,6 +3,7 @@ module GF.Text.Lexing (stringOp,opInEnv) where
|
||||
|
||||
import GF.Text.Transliterations
|
||||
import PGF.Lexing
|
||||
import PGF.LexingAGreek(lexAGreek,unlexAGreek,lexAGreek2) -- HL 20.2.2016
|
||||
|
||||
import Data.Char (isSpace)
|
||||
import Data.List (intersperse)
|
||||
@@ -13,12 +14,15 @@ stringOp name = case name of
|
||||
"lextext" -> Just $ appLexer lexText
|
||||
"lexcode" -> Just $ appLexer lexCode
|
||||
"lexmixed" -> Just $ appLexer lexMixed
|
||||
"lexgreek" -> Just $ appLexer lexAGreek
|
||||
"lexgreek2" -> Just $ appLexer lexAGreek2
|
||||
"words" -> Just $ appLexer words
|
||||
"bind" -> Just $ appUnlexer (unwords . bindTok)
|
||||
"unchars" -> Just $ appUnlexer concat
|
||||
"unlextext" -> Just $ appUnlexer (unlexText . unquote)
|
||||
"unlexcode" -> Just $ appUnlexer unlexCode
|
||||
"unlexmixed" -> Just $ appUnlexer (unlexMixed . unquote)
|
||||
"unlexgreek" -> Just $ appUnlexer unlexAGreek
|
||||
"unwords" -> Just $ appUnlexer unwords
|
||||
"to_html" -> Just wrapHTML
|
||||
_ -> transliterate name
|
||||
|
||||
@@ -239,12 +239,13 @@ transGreek = mkTransliteration "modern Greek" allTrans allCodes where
|
||||
transAncientGreek :: Transliteration
|
||||
transAncientGreek = mkTransliteration "ancient Greek" allTrans allCodes where
|
||||
allTrans = words $
|
||||
-- "- - - - - - - c: - - - - - - - - " ++ -- standard code point for colon: 00B7
|
||||
"- - - - - - - - - - - - - - - - " ++
|
||||
"i= A B G D E Z H V I K L M N X O " ++
|
||||
"P R - S T Y F C Q W I- Y- - - - - " ++
|
||||
"y= a b g d e z h v i k l m n x o " ++
|
||||
"p r s* s t y f c q w i- y- - - - - " ++
|
||||
"a) a( a)` a(` a)' a(' a)~ a(~ A) A( A)` A(` A)' A(' A)~ A(~ " ++
|
||||
"a) a( a)` a(` a)' a(' a)~ a(~ A) A( A)` A(` A)' A(' A)~ A(~ " ++ -- 1f00-1f09,1f0a-1f0f
|
||||
"e) e( e)` e(` e)' e(' - - E) E( E)` E(` E)' E(' - - " ++
|
||||
"h) h( h)` h(` h)' h(' h)~ h(~ H) H( H)` H(` H)' H(' H)~ H(~ " ++
|
||||
"i) i( i)` i(` i)' i(' i)~ i(~ I) I( I)` I(` I)' I(' I)~ I(~ " ++
|
||||
@@ -252,16 +253,42 @@ transAncientGreek = mkTransliteration "ancient Greek" allTrans allCodes where
|
||||
"y) y( y)` y(` y)' y(' y)~ y(~ - Y( - Y(` - Y(' - Y(~ " ++
|
||||
"w) w( w)` w(` w)' w(' w)~ w(~ W) W( W)` W(` W)' W(' W)~ W(~ " ++
|
||||
"a` a' e` e' h` h' i` i' o` o' y` y' w` w' - - " ++
|
||||
"a|) a|( a|)` a|(` a|)' a|(' a|)~ a|(~ - - - - - - - - " ++ -- 1f80- -- HL: a|) a|( for a|( a|)
|
||||
"h|) h|( h|)` h|(` h|)' h|(' h|)~ h|(~ - - - - - - - - " ++ -- 1f90- -- HL: h|) h|( for h|( h|)
|
||||
"w|) w|( w|)` w|(` w|)' w|(' w|)~ w|(~ - - - - - - - - " ++ -- 1fa0- -- HL: w|) w|( for w|( w|)
|
||||
"a|) a|( a|)` a|(` a|)' a|(' a|)~ a|(~ - - - - - - - - " ++ -- 1f80-
|
||||
"h|) h|( h|)` h|(` h|)' h|(' h|)~ h|(~ - - - - - - - - " ++ -- 1f90-
|
||||
"w|) w|( w|)` w|(` w|)' w|(' w|)~ w|(~ - - - - - - - - " ++ -- 1fa0-
|
||||
"a. a_ a|` a| a|' - a~ a|~ - - - - - - - - " ++ -- 1fb0-
|
||||
"- - h|` h| h|' - h~ h|~ - - - - - - - - " ++ -- 1fc0-
|
||||
"i. i_ i=` i=' - - i~ i=~ - - - - - - - - " ++ -- 1fd0-
|
||||
"y. y_ y=` y=' r) r( y~ y=~ - - - - - - - - " ++ -- 1fe0- -- HL: y=~ for y|~
|
||||
"- - w|` w| w|' - w~ w|~ - - - - - - - - " -- 1ff0-
|
||||
allCodes = [0x0380 .. 0x03cf] ++ [0x1f00 .. 0x1fff]
|
||||
|
||||
"y. y_ y=` y=' r) r( y~ y=~ - - - - - - - - " ++ -- 1fe0-
|
||||
"- - w|` w| w|' - w~ w|~ - - - - - - - - " ++ -- 1ff0-
|
||||
-- HL, Private Use Area Code Points (New Athena Unicode, Cardo, ALPHABETUM, Antioch)
|
||||
-- see: http://apagreekkeys.org/technicalDetails.html
|
||||
-- GreekKeys Support by Donald Mastronarde
|
||||
"- - - - - - - - - e. o. R) Y) Y)` Y)' Y)~ " ++ -- e1a0-e1af
|
||||
"e~ e)~ e(~ e_ e_' e_` e_) e_( e_)` e_(` e_)' e_(' E)~ E(~ E_ E. " ++ -- e1b0-e1bf
|
||||
"o~ o)~ o(~ o_ o_' o_` o_) o_( o_)` o_(` o_)' o_(' O)~ O(~ O_ O. " ++ -- e1c0-e1cf
|
||||
"a_` - a_~ a_)` a_(` a_)~ a_(~ - a.` a.) a.)` a.(' a.(` - - - " ++ -- eaf0-eaff
|
||||
"a_' - - - a_) a_( - a_)' - a_(' a.' a.( a.)' - - - " ++ -- eb00-eb0f
|
||||
"e_)~ e_(~ - - - - - e_~ - - - - - - - - " ++ -- eb20-eb2f
|
||||
"- - - - - - i_~ - i_` i_' - - i_) i_)' i_( i_(' " ++ -- eb30-eb3f
|
||||
"i.' i.) i.)' i.( i.` i.)` - i.(' i.(` - - - - - - - " ++ -- eb40-eb4f
|
||||
"- - - - i_)` i_(` - i_)~ i_(~ - o_~ o_)~ o_(~ - - - " ++ -- eb50-eb5f
|
||||
"y_` " ++ -- eb6f
|
||||
"y_~ y_)` - - - y_(` - y_)~ y_(~ - y_' - - y_) y_( y_)' " ++ -- eb70-eb7f
|
||||
"y_(' y.' y.( y.` y.) y.)' - - y.)` y.(' y.(` - - - - - " -- eb80-eb8f
|
||||
allCodes = -- [0x00B0 .. 0x00Bf]
|
||||
[0x0380 .. 0x03cf] ++ [0x1f00 .. 0x1fff]
|
||||
++ [0xe1a0 .. 0xe1af]
|
||||
++ [0xe1b0 .. 0xe1bf]
|
||||
++ [0xe1c0 .. 0xe1cf]
|
||||
++ [0xeaf0 .. 0xeaff]
|
||||
++ [0xeb00 .. 0xeb0f]
|
||||
++ [0xeb20 .. 0xeb2f]
|
||||
++ [0xeb30 .. 0xeb3f]
|
||||
++ [0xeb40 .. 0xeb4f]
|
||||
++ [0xeb50 .. 0xeb5f] ++ [0xeb6f]
|
||||
++ [0xeb70 .. 0xeb7f]
|
||||
++ [0xeb80 .. 0xeb8f]
|
||||
|
||||
transAmharic :: Transliteration
|
||||
transAmharic = mkTransliteration "Amharic" allTrans allCodes where
|
||||
|
||||
Reference in New Issue
Block a user