forked from GitHub/gf-core
commands for displaying transliteration tables
This commit is contained in:
18
index-3.html
18
index-3.html
@@ -94,14 +94,11 @@ is available for
|
||||
<P>
|
||||
GF was born in 1998 at Xerox Research Centre Europe, Grenoble in the project
|
||||
Multilingual Document Authoring. At Xerox, it was used for prototypes including
|
||||
a restaurant phrase book in 6 languages,
|
||||
a database query system in 7 languages,
|
||||
a formalization of an alarm system instructions with translations to 5 languages, and
|
||||
an authoring system for medical drug descriptions in 2 languages.
|
||||
</P>
|
||||
<UL>
|
||||
<LI>restaurant phrase book in 6 languages
|
||||
<LI>database queries in 7 languages
|
||||
<LI>alarm system instructions in 5 languages
|
||||
<LI>medical drug descriptions in 2 languages
|
||||
</UL>
|
||||
|
||||
<P>
|
||||
Later projects using GF and involving third parties include, in chronological order,
|
||||
</P>
|
||||
@@ -155,7 +152,7 @@ applications, libraries are a way to cope with thousands of details involved in
|
||||
syntax, lexicon, and inflection. The <A HREF="lib/">GF resource grammar library</A> has
|
||||
support for an increasing number of languages, currently including
|
||||
</P>
|
||||
<UL>
|
||||
<OL>
|
||||
<LI>Arabic (partial)
|
||||
<LI>Bulgarian
|
||||
<LI>Catalan (partial)
|
||||
@@ -164,14 +161,15 @@ support for an increasing number of languages, currently including
|
||||
<LI>Finnish
|
||||
<LI>French
|
||||
<LI>German
|
||||
<LI>Hindi/Urdu (partial)
|
||||
<LI>Hindi/Urdu (fragments)
|
||||
<LI><A HREF="http://www.interlingua.com/">Interlingua</A>
|
||||
<LI>Italian
|
||||
<LI>Norwegian bokmål
|
||||
<LI>Russian
|
||||
<LI>Spanish
|
||||
<LI>Swedish
|
||||
</UL>
|
||||
<LI>Thai (fragments)
|
||||
</OL>
|
||||
|
||||
<P>
|
||||
Adding a language to the resource library takes 3 to 9
|
||||
|
||||
32
index-3.txt
32
index-3.txt
@@ -131,26 +131,28 @@ Libraries are at the heart of modern software engineering. In natural language
|
||||
applications, libraries are a way to cope with thousands of details involved in
|
||||
syntax, lexicon, and inflection. The [GF resource grammar library lib/] has
|
||||
support for an increasing number of languages, currently including
|
||||
- Arabic (partial)
|
||||
- Bulgarian
|
||||
- Catalan (partial)
|
||||
- Danish
|
||||
- English
|
||||
- Finnish
|
||||
- French
|
||||
- German
|
||||
- Hindi/Urdu (partial)
|
||||
- [Interlingua http://www.interlingua.com/]
|
||||
- Italian
|
||||
- Norwegian bokmål
|
||||
- Russian
|
||||
- Spanish
|
||||
- Swedish
|
||||
+ Arabic (partial)
|
||||
+ Bulgarian
|
||||
+ Catalan (partial)
|
||||
+ Danish
|
||||
+ English
|
||||
+ Finnish
|
||||
+ French
|
||||
+ German
|
||||
+ Hindi/Urdu (fragments)
|
||||
+ [Interlingua http://www.interlingua.com/]
|
||||
+ Italian
|
||||
+ Norwegian bokmål
|
||||
+ Russian
|
||||
+ Spanish
|
||||
+ Swedish
|
||||
+ Thai (fragments)
|
||||
|
||||
|
||||
Adding a language to the resource library takes 3 to 9
|
||||
months - [contributions doc/projects.html]
|
||||
are welcome!
|
||||
|
||||
% [doc/10lang-small.png]
|
||||
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ import GF.Data.ErrM ----
|
||||
import PGF.ExprSyntax (readExp)
|
||||
import GF.Command.Abstract
|
||||
import GF.Text.Lexing
|
||||
import GF.Text.Transliterations
|
||||
|
||||
import GF.Data.Operations
|
||||
|
||||
@@ -301,21 +302,27 @@ allCommands pgf = Map.fromList [
|
||||
"string processing functions in the order given in the command line",
|
||||
"option list. Thus 'ps -f -g s' returns g (f s). Typical string processors",
|
||||
"are lexers and unlexers, but also character encoding conversions are possible.",
|
||||
"The unlexers preserve the division of their input to lines."
|
||||
"The unlexers preserve the division of their input to lines.",
|
||||
"To see transliteration tables, use command ut."
|
||||
],
|
||||
examples = [
|
||||
"l (EAdd 3 4) | ps -code -- linearize code-like output",
|
||||
"ps -lexer=code | p -cat=Exp -- parse code-like input",
|
||||
"gr -cat=QCl | l | ps -bind -to_utf8 -- linearization output from LangFin",
|
||||
"ps -from_utf8 \"jag ?r h?r\" | p -- parser in LangSwe in UYF8 terminal"
|
||||
"ps -from_utf8 \"jag ?r h?r\" | p -- parser in LangSwe in UTF8 terminal",
|
||||
"ps -to_devanagari -to_utf8 \"A-p\" -- show Devanagari in UTF8 terminal"
|
||||
],
|
||||
exec = \opts -> return . fromString . stringOps opts . toString,
|
||||
options = [
|
||||
("bind","bind tokens separated by Prelude.BIND, i.e. &+"),
|
||||
("from_devanagari","from unicode to GF Devanagari transliteration"),
|
||||
("from_thai","from unicode to GF Thai transliteration"),
|
||||
("from_utf8","decode from utf8"),
|
||||
("lextext","text-like lexer"),
|
||||
("lexcode","code-like lexer"),
|
||||
("lexmixed","mixture of text and code (code between $...$)"),
|
||||
("to_devanagari","from GF Devanagari transliteration to unicode"),
|
||||
("to_thai","from GF Thai transliteration to unicode"),
|
||||
("to_utf8","encode to utf8"),
|
||||
("unlextext","text-like unlexer"),
|
||||
("unlexcode","code-like unlexer"),
|
||||
@@ -370,6 +377,18 @@ allCommands pgf = Map.fromList [
|
||||
("number","the maximum number of questions")
|
||||
]
|
||||
}),
|
||||
("ut", emptyCommandInfo {
|
||||
longname = "unicode_table",
|
||||
synopsis = "show a transliteration table for a unicode character set",
|
||||
exec = \opts arg -> do
|
||||
let t = concatMap prOpt (take 1 opts)
|
||||
let out = maybe "no such transliteration" characterTable $ transliteration t
|
||||
return $ fromString out,
|
||||
options = [
|
||||
("devanagari","Devanagari"),
|
||||
("thai", "Thai")
|
||||
]
|
||||
}),
|
||||
("wf", emptyCommandInfo {
|
||||
longname = "write_file",
|
||||
synopsis = "send string or tree to a file",
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
module GF.Text.Lexing (stringOp) where
|
||||
|
||||
import GF.Text.Transliterations
|
||||
import GF.Text.UTF8
|
||||
|
||||
import Data.Char
|
||||
@@ -19,7 +20,7 @@ stringOp name = case name of
|
||||
"unwords" -> Just $ appUnlexer unwords
|
||||
"to_utf8" -> Just encodeUTF8
|
||||
"from_utf8" -> Just decodeUTF8
|
||||
_ -> Nothing
|
||||
_ -> transliterate name
|
||||
|
||||
-- | Lift a tokenizer to a string operation: run the tokenizer on the input,
-- throw away empty tokens, and join what is left with single spaces.
appLexer :: (String -> [String]) -> String -> String
appLexer tokenize input =
  unwords [tok | tok <- tokenize input, not (null tok)]
|
||||
|
||||
87
src-3.0/GF/Text/Transliterations.hs
Normal file
87
src-3.0/GF/Text/Transliterations.hs
Normal file
@@ -0,0 +1,87 @@
|
||||
module GF.Text.Transliterations (transliterate,transliteration,characterTable) where
|
||||
|
||||
import GF.Text.UTF8
|
||||
|
||||
import Data.Char
|
||||
import qualified Data.Map as Map
|
||||
|
||||
-- | Resolve a string-operation name to a transliteration function.
--
-- A name of the form @from_X@ gives the conversion from Unicode to the
-- transliteration of script @X@; @to_X@ gives the opposite direction.
-- The result is 'Nothing' when the prefix is neither of these or when
-- 'transliteration' does not know the script name.
transliterate :: String -> Maybe (String -> String)
transliterate ('f':'r':'o':'m':'_':script) =
  fmap appTransFromUnicode (transliteration script)
transliterate ('t':'o':'_':script) =
  fmap appTransToUnicode (transliteration script)
transliterate _ = Nothing
|
||||
|
||||
-- | Find the transliteration table registered under a script name.
-- Returns 'Nothing' for any name other than the ones listed here.
transliteration :: String -> Maybe Transliteration
transliteration name = lookup name knownScripts
  where
    knownScripts =
      [ ("devanagari", transDevanagari)
      , ("thai", transThai)
      ]
|
||||
|
||||
-- | Render a transliteration as a text table, one line per entry of its
-- Unicode-to-transliteration map (in the key order given by 'Map.assocs').
-- Each line shows the numeric code, the character itself passed through
-- 'encodeUTF8', and the transliteration, separated by @|@.
characterTable :: Transliteration -> String
characterTable table =
  unlines [row code token | (code, token) <- Map.assocs (trans_from_unicode table)]
  where
    row code token =
      unwords ["|", show code, "|", encodeUTF8 [toEnum code], "|", token, "|"]
|
||||
|
||||
-- | A two-way table between transliteration tokens and Unicode code points.
-- Both maps are built from the same pairs by 'mkTransliteration'.
data Transliteration = Trans
  { trans_to_unicode   :: Map.Map String Int  -- ^ transliteration token to code point
  , trans_from_unicode :: Map.Map Int String  -- ^ code point to transliteration token
  }
|
||||
|
||||
-- | Convert transliterated text to Unicode.
--
-- The input is split into tokens by 'unchar'; each token found in the
-- table is replaced by the single character with the mapped code, and
-- every other token is copied to the output unchanged.
appTransToUnicode :: Transliteration -> String -> String
appTransToUnicode trans = concatMap render . unchar
  where
    table = trans_to_unicode trans
    render token = maybe token (\code -> [toEnum code]) (Map.lookup token table)
|
||||
|
||||
-- | Convert Unicode text to its transliteration.
--
-- Each character whose code has an entry in the table is replaced by its
-- transliteration token. Characters without an entry (spaces, newlines,
-- punctuation, text in another script) are copied through unchanged, so
-- the word and line structure of the input survives the conversion; this
-- mirrors 'appTransToUnicode', which also leaves unknown tokens alone.
-- Previously every unmapped character, including blanks, became @"?"@.
appTransFromUnicode :: Transliteration -> String -> String
appTransFromUnicode trans = concatMap render
  where
    table = trans_from_unicode trans
    -- Fall back to the character itself when the table has no entry.
    render ch = Map.findWithDefault [ch] (fromEnum ch) table
|
||||
|
||||
|
||||
-- conventions:
|
||||
-- each transliteration token is either a single letter or a letter followed by one non-letter (e.g. k or k1)
|
||||
-- when using a sparse range of unicodes, mark missing codes as "-" in transliterations
|
||||
|
||||
-- | Build a two-way transliteration table from a list of tokens and a list
-- of Unicode codes, paired by position. Positions whose token is @"-"@ are
-- left out of both maps; if the lists differ in length the surplus of the
-- longer one is ignored.
mkTransliteration :: [String] -> [Int] -> Transliteration
mkTransliteration ts us =
  Trans (Map.fromList pairs) (Map.fromList [(code, token) | (token, code) <- pairs])
  where
    -- Only positions that carry a real token take part in the mapping.
    pairs = filter ((/= "-") . fst) (zip ts us)
|
||||
|
||||
|
||||
-- | Split transliterated text into lookup tokens.
--
-- A token is one character, optionally followed by a single character that
-- is neither a letter nor whitespace (for example @k@, @k1@, @s'@).
-- Whitespace that follows a token is emitted as a token of its own, so
-- that @concat (unchar s) == s@ and word and line boundaries are preserved.
-- (The whitespace character used to be dropped, which glued words together.)
unchar :: String -> [String]
unchar (c:d:rest)
  | isAlpha d = [c] : unchar (d:rest)    -- next letter starts a new token
  | isSpace d = [c] : [d] : unchar rest  -- keep the separator as its own token
  | otherwise = [c, d] : unchar rest     -- letter plus its modifier
unchar [c] = [[c]]
unchar [] = []
|
||||
|
||||
-- | Transliteration table for Thai.
--
-- The tokens are paired by position with the codes starting at 0x0e00;
-- @-@ marks a code without a transliteration. The code range is longer
-- than the token list, and the surplus codes are ignored by the pairing
-- in 'mkTransliteration'.
transThai :: Transliteration
transThai = mkTransliteration tokens codePoints
  where
    codePoints = [0x0e00 .. 0x0e7f]
    tokens = words (concat rows)
    rows =
      [ "- k k1 - k2 - k3 g c c1 c2 s' c3 y' d' t' "
      , "t1 t2 t3 n' d t t4 t5 t6 n b p p1 f p2 f' "
      , "p3 m y r - l - w s- s. s h l' O h' - "
      , "a. a a: a+ i i: v v: u u: - - - - - - "
      , "e e' o: a% a& L R S T1 T2 T3 T4 K - - - "
      , "N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 - - - - - - "
      ]
|
||||
|
||||
-- | Transliteration table for Devanagari.
--
-- The tokens are paired by position with the codes starting at 0x0901;
-- @-@ marks a code without a transliteration. Codes left over after the
-- last token are ignored by the pairing in 'mkTransliteration'.
transDevanagari :: Transliteration
transDevanagari = mkTransliteration tokens codePoints
  where
    codePoints = [0x0901 .. 0x094c]
    tokens = words (concat rows)
    rows =
      [ "~ * - - "
      , "a- A- i- I- u- U- R- - - - e- E- - - o- O- "
      , "k K g G N: c C j J n: t. T. d. D. n. t "
      , "T d D n - p P b B m y r - l - - v "
      , "S s. s h - - r. - A i I u U R - - "
      , "- e E o O "
      ]
|
||||
|
||||
Reference in New Issue
Block a user