forked from GitHub/gf-core
commands for displaying transliteration tables
This commit is contained in:
18
index-3.html
18
index-3.html
@@ -94,14 +94,11 @@ is available for
|
|||||||
<P>
|
<P>
|
||||||
GF was born in 1998 at Xerox Research Centre Europe, Grenoble in the project
|
GF was born in 1998 at Xerox Research Centre Europe, Grenoble in the project
|
||||||
Multilingual Document Authoring. At Xerox, it was used for prototypes including
|
Multilingual Document Authoring. At Xerox, it was used for prototypes including
|
||||||
|
a restaurant phrase book in 6 languages,
|
||||||
|
a database query system in 7 languages,
|
||||||
|
a formalization of an alarm system instructions with translations to 5 languages, and
|
||||||
|
an authoring system for medical drug descriptions in 2 languages.
|
||||||
</P>
|
</P>
|
||||||
<UL>
|
|
||||||
<LI>restaurant phrase book in 6 languages
|
|
||||||
<LI>database queries in 7 languages
|
|
||||||
<LI>alarm system instructions in 5 languages
|
|
||||||
<LI>medical drug descriptions in 2 languages
|
|
||||||
</UL>
|
|
||||||
|
|
||||||
<P>
|
<P>
|
||||||
Later projects using GF and involving third parties include, in chronological order,
|
Later projects using GF and involving third parties include, in chronological order,
|
||||||
</P>
|
</P>
|
||||||
@@ -155,7 +152,7 @@ applications, libraries are a way to cope with thousands of details involved in
|
|||||||
syntax, lexicon, and inflection. The <A HREF="lib/">GF resource grammar library</A> has
|
syntax, lexicon, and inflection. The <A HREF="lib/">GF resource grammar library</A> has
|
||||||
support for an increasing number of languages, currently including
|
support for an increasing number of languages, currently including
|
||||||
</P>
|
</P>
|
||||||
<UL>
|
<OL>
|
||||||
<LI>Arabic (partial)
|
<LI>Arabic (partial)
|
||||||
<LI>Bulgarian
|
<LI>Bulgarian
|
||||||
<LI>Catalan (partial)
|
<LI>Catalan (partial)
|
||||||
@@ -164,14 +161,15 @@ support for an increasing number of languages, currently including
|
|||||||
<LI>Finnish
|
<LI>Finnish
|
||||||
<LI>French
|
<LI>French
|
||||||
<LI>German
|
<LI>German
|
||||||
<LI>Hindi/Urdu (partial)
|
<LI>Hindi/Urdu (fragments)
|
||||||
<LI><A HREF="http://www.interlingua.com/">Interlingua</A>
|
<LI><A HREF="http://www.interlingua.com/">Interlingua</A>
|
||||||
<LI>Italian
|
<LI>Italian
|
||||||
<LI>Norwegian bokmål
|
<LI>Norwegian bokmål
|
||||||
<LI>Russian
|
<LI>Russian
|
||||||
<LI>Spanish
|
<LI>Spanish
|
||||||
<LI>Swedish
|
<LI>Swedish
|
||||||
</UL>
|
<LI>Thai (fragments)
|
||||||
|
</OL>
|
||||||
|
|
||||||
<P>
|
<P>
|
||||||
Adding a language to the resource library takes 3 to 9
|
Adding a language to the resource library takes 3 to 9
|
||||||
|
|||||||
32
index-3.txt
32
index-3.txt
@@ -131,26 +131,28 @@ Libraries are at the heart of modern software engineering. In natural language
|
|||||||
applications, libraries are a way to cope with thousands of details involved in
|
applications, libraries are a way to cope with thousands of details involved in
|
||||||
syntax, lexicon, and inflection. The [GF resource grammar library lib/] has
|
syntax, lexicon, and inflection. The [GF resource grammar library lib/] has
|
||||||
support for an increasing number of languages, currently including
|
support for an increasing number of languages, currently including
|
||||||
- Arabic (partial)
|
+ Arabic (partial)
|
||||||
- Bulgarian
|
+ Bulgarian
|
||||||
- Catalan (partial)
|
+ Catalan (partial)
|
||||||
- Danish
|
+ Danish
|
||||||
- English
|
+ English
|
||||||
- Finnish
|
+ Finnish
|
||||||
- French
|
+ French
|
||||||
- German
|
+ German
|
||||||
- Hindi/Urdu (partial)
|
+ Hindi/Urdu (fragments)
|
||||||
- [Interlingua http://www.interlingua.com/]
|
+ [Interlingua http://www.interlingua.com/]
|
||||||
- Italian
|
+ Italian
|
||||||
- Norwegian bokmål
|
+ Norwegian bokmål
|
||||||
- Russian
|
+ Russian
|
||||||
- Spanish
|
+ Spanish
|
||||||
- Swedish
|
+ Swedish
|
||||||
|
+ Thai (fragments)
|
||||||
|
|
||||||
|
|
||||||
Adding a language to the resource library takes 3 to 9
|
Adding a language to the resource library takes 3 to 9
|
||||||
months - [contributions doc/projects.html]
|
months - [contributions doc/projects.html]
|
||||||
are welcome!
|
are welcome!
|
||||||
|
|
||||||
|
% [doc/10lang-small.png]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ import GF.Data.ErrM ----
|
|||||||
import PGF.ExprSyntax (readExp)
|
import PGF.ExprSyntax (readExp)
|
||||||
import GF.Command.Abstract
|
import GF.Command.Abstract
|
||||||
import GF.Text.Lexing
|
import GF.Text.Lexing
|
||||||
|
import GF.Text.Transliterations
|
||||||
|
|
||||||
import GF.Data.Operations
|
import GF.Data.Operations
|
||||||
|
|
||||||
@@ -301,21 +302,27 @@ allCommands pgf = Map.fromList [
|
|||||||
"string processing functions in the order given in the command line",
|
"string processing functions in the order given in the command line",
|
||||||
"option list. Thus 'ps -f -g s' returns g (f s). Typical string processors",
|
"option list. Thus 'ps -f -g s' returns g (f s). Typical string processors",
|
||||||
"are lexers and unlexers, but also character encoding conversions are possible.",
|
"are lexers and unlexers, but also character encoding conversions are possible.",
|
||||||
"The unlexers preserve the division of their input to lines."
|
"The unlexers preserve the division of their input to lines.",
|
||||||
|
"To see transliteration tables, use command ut."
|
||||||
],
|
],
|
||||||
examples = [
|
examples = [
|
||||||
"l (EAdd 3 4) | ps -code -- linearize code-like output",
|
"l (EAdd 3 4) | ps -code -- linearize code-like output",
|
||||||
"ps -lexer=code | p -cat=Exp -- parse code-like input",
|
"ps -lexer=code | p -cat=Exp -- parse code-like input",
|
||||||
"gr -cat=QCl | l | ps -bind -to_utf8 -- linearization output from LangFin",
|
"gr -cat=QCl | l | ps -bind -to_utf8 -- linearization output from LangFin",
|
||||||
"ps -from_utf8 \"jag ?r h?r\" | p -- parser in LangSwe in UYF8 terminal"
|
"ps -from_utf8 \"jag ?r h?r\" | p -- parser in LangSwe in UTF8 terminal",
|
||||||
|
"ps -to_devanagari -to_utf8 \"A-p\" -- show Devanagari in UTF8 terminal"
|
||||||
],
|
],
|
||||||
exec = \opts -> return . fromString . stringOps opts . toString,
|
exec = \opts -> return . fromString . stringOps opts . toString,
|
||||||
options = [
|
options = [
|
||||||
("bind","bind tokens separated by Prelude.BIND, i.e. &+"),
|
("bind","bind tokens separated by Prelude.BIND, i.e. &+"),
|
||||||
|
("from_devanagari","from unicode to GF Devanagari transliteration"),
|
||||||
|
("from_thai","from unicode to GF Thai transliteration"),
|
||||||
("from_utf8","decode from utf8"),
|
("from_utf8","decode from utf8"),
|
||||||
("lextext","text-like lexer"),
|
("lextext","text-like lexer"),
|
||||||
("lexcode","code-like lexer"),
|
("lexcode","code-like lexer"),
|
||||||
("lexmixed","mixture of text and code (code between $...$)"),
|
("lexmixed","mixture of text and code (code between $...$)"),
|
||||||
|
("to_devanagari","from GF Devanagari transliteration to unicode"),
|
||||||
|
("to_thai","from GF Thai transliteration to unicode"),
|
||||||
("to_utf8","encode to utf8"),
|
("to_utf8","encode to utf8"),
|
||||||
("unlextext","text-like unlexer"),
|
("unlextext","text-like unlexer"),
|
||||||
("unlexcode","code-like unlexer"),
|
("unlexcode","code-like unlexer"),
|
||||||
@@ -370,6 +377,18 @@ allCommands pgf = Map.fromList [
|
|||||||
("number","the maximum number of questions")
|
("number","the maximum number of questions")
|
||||||
]
|
]
|
||||||
}),
|
}),
|
||||||
|
("ut", emptyCommandInfo {
|
||||||
|
longname = "unicode_table",
|
||||||
|
synopsis = "show a transliteration table for a unicode character set",
|
||||||
|
exec = \opts arg -> do
|
||||||
|
let t = concatMap prOpt (take 1 opts)
|
||||||
|
let out = maybe "no such transliteration" characterTable $ transliteration t
|
||||||
|
return $ fromString out,
|
||||||
|
options = [
|
||||||
|
("devanagari","Devanagari"),
|
||||||
|
("thai", "Thai")
|
||||||
|
]
|
||||||
|
}),
|
||||||
("wf", emptyCommandInfo {
|
("wf", emptyCommandInfo {
|
||||||
longname = "write_file",
|
longname = "write_file",
|
||||||
synopsis = "send string or tree to a file",
|
synopsis = "send string or tree to a file",
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
module GF.Text.Lexing (stringOp) where
|
module GF.Text.Lexing (stringOp) where
|
||||||
|
|
||||||
|
import GF.Text.Transliterations
|
||||||
import GF.Text.UTF8
|
import GF.Text.UTF8
|
||||||
|
|
||||||
import Data.Char
|
import Data.Char
|
||||||
@@ -19,7 +20,7 @@ stringOp name = case name of
|
|||||||
"unwords" -> Just $ appUnlexer unwords
|
"unwords" -> Just $ appUnlexer unwords
|
||||||
"to_utf8" -> Just encodeUTF8
|
"to_utf8" -> Just encodeUTF8
|
||||||
"from_utf8" -> Just decodeUTF8
|
"from_utf8" -> Just decodeUTF8
|
||||||
_ -> Nothing
|
_ -> transliterate name
|
||||||
|
|
||||||
appLexer :: (String -> [String]) -> String -> String
|
appLexer :: (String -> [String]) -> String -> String
|
||||||
appLexer f = unwords . filter (not . null) . f
|
appLexer f = unwords . filter (not . null) . f
|
||||||
|
|||||||
87
src-3.0/GF/Text/Transliterations.hs
Normal file
87
src-3.0/GF/Text/Transliterations.hs
Normal file
@@ -0,0 +1,87 @@
|
|||||||
|
module GF.Text.Transliterations (transliterate,transliteration,characterTable) where
|
||||||
|
|
||||||
|
import GF.Text.UTF8
|
||||||
|
|
||||||
|
import Data.Char
|
||||||
|
import qualified Data.Map as Map
|
||||||
|
|
||||||
|
transliterate :: String -> Maybe (String -> String)
|
||||||
|
transliterate s = case s of
|
||||||
|
'f':'r':'o':'m':'_':t -> fmap appTransFromUnicode $ transliteration t
|
||||||
|
't':'o':'_':t -> fmap appTransToUnicode $ transliteration t
|
||||||
|
_ -> Nothing
|
||||||
|
|
||||||
|
transliteration :: String -> Maybe Transliteration
|
||||||
|
transliteration s = case s of
|
||||||
|
"devanagari" -> Just transDevanagari
|
||||||
|
"thai" -> Just transThai
|
||||||
|
_ -> Nothing
|
||||||
|
|
||||||
|
characterTable :: Transliteration -> String
|
||||||
|
characterTable = unlines . map prOne . Map.assocs . trans_from_unicode where
|
||||||
|
prOne (i,s) = unwords ["|", show i, "|", encodeUTF8 [toEnum i], "|", s, "|"]
|
||||||
|
|
||||||
|
data Transliteration = Trans {
|
||||||
|
trans_to_unicode :: Map.Map String Int,
|
||||||
|
trans_from_unicode :: Map.Map Int String
|
||||||
|
}
|
||||||
|
|
||||||
|
appTransToUnicode :: Transliteration -> String -> String
|
||||||
|
appTransToUnicode trans =
|
||||||
|
concat .
|
||||||
|
map (\c -> maybe c (return . toEnum) $
|
||||||
|
Map.lookup c (trans_to_unicode trans)
|
||||||
|
) .
|
||||||
|
unchar
|
||||||
|
|
||||||
|
appTransFromUnicode :: Transliteration -> String -> String
|
||||||
|
appTransFromUnicode trans =
|
||||||
|
concat .
|
||||||
|
map (maybe "?" id .
|
||||||
|
flip Map.lookup (trans_from_unicode trans)
|
||||||
|
) .
|
||||||
|
map fromEnum
|
||||||
|
|
||||||
|
|
||||||
|
-- conventions:
|
||||||
|
-- each character is either [letter] or [letter+nonletter]
|
||||||
|
-- when using a sparse range of unicodes, mark missing codes as "-" in transliterations
|
||||||
|
|
||||||
|
mkTransliteration :: [String] -> [Int] -> Transliteration
|
||||||
|
mkTransliteration ts us = Trans (Map.fromList (tzip ts us)) (Map.fromList (uzip us ts))
|
||||||
|
where
|
||||||
|
tzip ts us = [(t,u) | (t,u) <- zip ts us, t /= "-"]
|
||||||
|
uzip us ts = [(u,t) | (u,t) <- zip us ts, t /= "-"]
|
||||||
|
|
||||||
|
|
||||||
|
unchar :: String -> [String]
|
||||||
|
unchar s = case s of
|
||||||
|
c:d:cs
|
||||||
|
| isAlpha d -> [c] : unchar (d:cs)
|
||||||
|
| isSpace d -> [c] : unchar cs
|
||||||
|
| otherwise -> [c,d] : unchar cs
|
||||||
|
[_] -> [s]
|
||||||
|
_ -> []
|
||||||
|
|
||||||
|
transThai :: Transliteration
|
||||||
|
transThai = mkTransliteration allTrans allCodes where
|
||||||
|
allTrans = words $
|
||||||
|
"- k k1 - k2 - k3 g c c1 c2 s' c3 y' d' t' " ++
|
||||||
|
"t1 t2 t3 n' d t t4 t5 t6 n b p p1 f p2 f' " ++
|
||||||
|
"p3 m y r - l - w s- s. s h l' O h' - " ++
|
||||||
|
"a. a a: a+ i i: v v: u u: - - - - - - " ++
|
||||||
|
"e e' o: a% a& L R S T1 T2 T3 T4 K - - - " ++
|
||||||
|
"N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 - - - - - - "
|
||||||
|
allCodes = [0x0e00 .. 0x0e7f]
|
||||||
|
|
||||||
|
transDevanagari :: Transliteration
|
||||||
|
transDevanagari = mkTransliteration allTrans allCodes where
|
||||||
|
allTrans = words $
|
||||||
|
"~ * - - " ++
|
||||||
|
"a- A- i- I- u- U- R- - - - e- E- - - o- O- " ++
|
||||||
|
"k K g G N: c C j J n: t. T. d. D. n. t " ++
|
||||||
|
"T d D n - p P b B m y r - l - - v " ++
|
||||||
|
"S s. s h - - r. - A i I u U R - - " ++
|
||||||
|
"- e E o O "
|
||||||
|
allCodes = [0x0901 .. 0x094c]
|
||||||
|
|
||||||
Reference in New Issue
Block a user