1
0
forked from GitHub/gf-core

commands for displaying transliteration tables

This commit is contained in:
aarne
2008-06-15 15:24:11 +00:00
parent 8c3111e36a
commit c2bbdc8a16
5 changed files with 135 additions and 28 deletions

View File

@@ -94,14 +94,11 @@ is available for
<P>
GF was born in 1998 at Xerox Research Centre Europe, Grenoble in the project
Multilingual Document Authoring. At Xerox, it was used for prototypes including
a restaurant phrase book in 6 languages,
a database query system in 7 languages,
a formalization of an alarm system instructions with translations to 5 languages, and
an authoring system for medical drug descriptions in 2 languages.
</P>
<UL>
<LI>restaurant phrase book in 6 languages
<LI>database queries in 7 languages
<LI>alarm system instructions in 5 languages
<LI>medical drug descriptions in 2 languages
</UL>
<P>
Later projects using GF and involving third parties include, in chronological order,
</P>
@@ -155,7 +152,7 @@ applications, libraries are a way to cope with thousands of details involved in
syntax, lexicon, and inflection. The <A HREF="lib/">GF resource grammar library</A> has
support for an increasing number of languages, currently including
</P>
<UL>
<OL>
<LI>Arabic (partial)
<LI>Bulgarian
<LI>Catalan (partial)
@@ -164,14 +161,15 @@ support for an increasing number of languages, currently including
<LI>Finnish
<LI>French
<LI>German
<LI>Hindi/Urdu (partial)
<LI>Hindi/Urdu (fragments)
<LI><A HREF="http://www.interlingua.com/">Interlingua</A>
<LI>Italian
<LI>Norwegian bokmål
<LI>Russian
<LI>Spanish
<LI>Swedish
</UL>
<LI>Thai (fragments)
</OL>
<P>
Adding a language to the resource library takes 3 to 9

View File

@@ -131,26 +131,28 @@ Libraries are at the heart of modern software engineering. In natural language
applications, libraries are a way to cope with thousands of details involved in
syntax, lexicon, and inflection. The [GF resource grammar library lib/] has
support for an increasing number of languages, currently including
- Arabic (partial)
- Bulgarian
- Catalan (partial)
- Danish
- English
- Finnish
- French
- German
- Hindi/Urdu (partial)
- [Interlingua http://www.interlingua.com/]
- Italian
- Norwegian bokmål
- Russian
- Spanish
- Swedish
+ Arabic (partial)
+ Bulgarian
+ Catalan (partial)
+ Danish
+ English
+ Finnish
+ French
+ German
+ Hindi/Urdu (fragments)
+ [Interlingua http://www.interlingua.com/]
+ Italian
+ Norwegian bokmål
+ Russian
+ Spanish
+ Swedish
+ Thai (fragments)
Adding a language to the resource library takes 3 to 9
months - [contributions doc/projects.html]
are welcome!
% [doc/10lang-small.png]

View File

@@ -22,6 +22,7 @@ import GF.Data.ErrM ----
import PGF.ExprSyntax (readExp)
import GF.Command.Abstract
import GF.Text.Lexing
import GF.Text.Transliterations
import GF.Data.Operations
@@ -301,21 +302,27 @@ allCommands pgf = Map.fromList [
"string processing functions in the order given in the command line",
"option list. Thus 'ps -f -g s' returns g (f s). Typical string processors",
"are lexers and unlexers, but also character encoding conversions are possible.",
"The unlexers preserve the division of their input to lines."
"The unlexers preserve the division of their input to lines.",
"To see transliteration tables, use command ut."
],
examples = [
"l (EAdd 3 4) | ps -code -- linearize code-like output",
"ps -lexer=code | p -cat=Exp -- parse code-like input",
"gr -cat=QCl | l | ps -bind -to_utf8 -- linearization output from LangFin",
"ps -from_utf8 \"jag ?r h?r\" | p -- parser in LangSwe in UYF8 terminal"
"ps -from_utf8 \"jag ?r h?r\" | p -- parser in LangSwe in UTF8 terminal",
"ps -to_devanagari -to_utf8 \"A-p\" -- show Devanagari in UTF8 terminal"
],
exec = \opts -> return . fromString . stringOps opts . toString,
options = [
("bind","bind tokens separated by Prelude.BIND, i.e. &+"),
("from_devanagari","from unicode to GF Devanagari transliteration"),
("from_thai","from unicode to GF Thai transliteration"),
("from_utf8","decode from utf8"),
("lextext","text-like lexer"),
("lexcode","code-like lexer"),
("lexmixed","mixture of text and code (code between $...$)"),
("to_devanagari","from GF Devanagari transliteration to unicode"),
("to_thai","from GF Thai transliteration to unicode"),
("to_utf8","encode to utf8"),
("unlextext","text-like unlexer"),
("unlexcode","code-like unlexer"),
@@ -370,6 +377,18 @@ allCommands pgf = Map.fromList [
("number","the maximum number of questions")
]
}),
("ut", emptyCommandInfo {
longname = "unicode_table",
synopsis = "show a transliteration table for a unicode character set",
exec = \opts arg -> do
let t = concatMap prOpt (take 1 opts)
let out = maybe "no such transliteration" characterTable $ transliteration t
return $ fromString out,
options = [
("devanagari","Devanagari"),
("thai", "Thai")
]
}),
("wf", emptyCommandInfo {
longname = "write_file",
synopsis = "send string or tree to a file",

View File

@@ -1,5 +1,6 @@
module GF.Text.Lexing (stringOp) where
import GF.Text.Transliterations
import GF.Text.UTF8
import Data.Char
@@ -19,7 +20,7 @@ stringOp name = case name of
"unwords" -> Just $ appUnlexer unwords
"to_utf8" -> Just encodeUTF8
"from_utf8" -> Just decodeUTF8
_ -> Nothing
_ -> transliterate name
appLexer :: (String -> [String]) -> String -> String
appLexer f = unwords . filter (not . null) . f

View File

@@ -0,0 +1,87 @@
module GF.Text.Transliterations (transliterate,transliteration,characterTable) where
import GF.Text.UTF8
import Data.Char
import qualified Data.Map as Map
-- | Resolve a string-operation name of the form @from_NAME@ or @to_NAME@
-- to the matching conversion function.
--
-- @from_NAME@ yields the conversion from Unicode text to the GF
-- transliteration, @to_NAME@ the conversion from the GF transliteration to
-- Unicode; @NAME@ is resolved with 'transliteration'. Any other shape of
-- name, or an unknown @NAME@, gives 'Nothing'.
transliterate :: String -> Maybe (String -> String)
transliterate opName =
  case break (== '_') opName of
    ("from", '_' : scheme) -> appTransFromUnicode `fmap` transliteration scheme
    ("to", '_' : scheme)   -> appTransToUnicode `fmap` transliteration scheme
    _                      -> Nothing
-- | Look up a transliteration scheme by name.
--
-- The known names are @"devanagari"@ and @"thai"@; every other name gives
-- 'Nothing'.
transliteration :: String -> Maybe Transliteration
transliteration name = lookup name knownSchemes
  where
    knownSchemes =
      [ ("devanagari", transDevanagari)
      , ("thai", transThai)
      ]
-- | Render a transliteration as a text table, one line per Unicode code
-- point in ascending order of the code.
--
-- Each line has the shape @| code | character | transliteration |@, where
-- the code is shown as a decimal number and the character is passed through
-- 'encodeUTF8'.
characterTable :: Transliteration -> String
characterTable table =
  unlines [ row code translit | (code, translit) <- Map.toAscList codeMap ]
  where
    codeMap = trans_from_unicode table
    row code translit =
      unwords ["|", show code, "|", encodeUTF8 [toEnum code], "|", translit, "|"]
-- | A two-way table between GF transliteration tokens and Unicode code
-- points.
data Transliteration = Trans
  { trans_to_unicode   :: Map.Map String Int
    -- ^ from transliteration token to Unicode code point
  , trans_from_unicode :: Map.Map Int String
    -- ^ from Unicode code point to transliteration token
  }
-- | Convert text written in a GF transliteration to Unicode.
--
-- The input is cut into tokens by 'unchar'. A token found in
-- 'trans_to_unicode' is replaced by the single character with that code
-- point; any other token is copied to the output unchanged.
appTransToUnicode :: Transliteration -> String -> String
appTransToUnicode trans input = concatMap convert (unchar input)
  where
    codes = trans_to_unicode trans
    convert token = case Map.lookup token codes of
      Just code -> [toEnum code]
      Nothing   -> token
-- | Convert Unicode text to a GF transliteration.
--
-- Every input character is replaced by the token stored for its code point
-- in 'trans_from_unicode'. A character whose code point has no entry is
-- replaced by @"?"@.
--
-- NOTE(review): characters without an entry are not passed through, so any
-- whitespace or punctuation missing from the table also comes out as "?".
-- Confirm that this is intended.
appTransFromUnicode :: Transliteration -> String -> String
appTransFromUnicode trans input =
  concat [ Map.findWithDefault "?" (fromEnum ch) tokens | ch <- input ]
  where
    tokens = trans_from_unicode trans
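-- Conventions for the transliteration tables:
-- * each transliterated character is either a single letter or a letter
--   followed by one non-letter, i.e. [letter] or [letter+nonletter]
-- * when the range of Unicode code points is sparse, mark each missing
--   code point as "-" in the list of transliterations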
-- | Build a 'Transliteration' from a list of transliteration tokens and the
-- list of Unicode code points they stand for, paired position by position.
--
-- Positions whose token is @"-"@ are left out of both maps. The pairing is
-- done with 'zip', so if the two lists differ in length the surplus of the
-- longer one is silently ignored.
mkTransliteration :: [String] -> [Int] -> Transliteration
mkTransliteration ts us = Trans toUnicode fromUnicode
  where
    pairs       = filter ((/= "-") . fst) (zip ts us)
    toUnicode   = Map.fromList pairs
    fromUnicode = Map.fromList [ (code, token) | (token, code) <- pairs ]
-- | Split a transliterated string into the tokens that are looked up in a
-- transliteration table.
--
-- Looking at a character and its successor:
--
-- * successor is a letter: the character is a token by itself and the
--   successor starts the next token;
-- * successor is whitespace: the character is a token by itself and the
--   whitespace character is emitted as a token of its own;
-- * any other successor: the two characters together form one token.
--
-- A final single character is a token by itself; the empty string has no
-- tokens. Whitespace is always kept, so @concat . unchar@ gives back the
-- input and word and line breaks survive transliteration.
unchar :: String -> [String]
unchar s = case s of
  c : d : cs
    | isAlpha d -> [c] : unchar (d : cs)
    -- keep the whitespace: it is not part of any token, but dropping it
    -- would glue the neighbouring words together
    | isSpace d -> [c] : [d] : unchar cs
    | otherwise -> [c, d] : unchar cs
  [_] -> [s]
  _   -> []
-- | Transliteration table for Thai.
--
-- The tokens are listed in code-point order starting at 0x0e00, sixteen per
-- row; @"-"@ marks a code point without a transliteration. Only 96 tokens
-- are listed for the 128 code points of the range, so the code points from
-- 0x0e60 upwards get no entry.
transThai :: Transliteration
transThai = mkTransliteration tokens codePoints
  where
    codePoints = [0x0e00 .. 0x0e7f]
    tokens = concatMap words
      [ "- k k1 - k2 - k3 g c c1 c2 s' c3 y' d' t'"       -- 0x0e00 - 0x0e0f
      , "t1 t2 t3 n' d t t4 t5 t6 n b p p1 f p2 f'"       -- 0x0e10 - 0x0e1f
      , "p3 m y r - l - w s- s. s h l' O h' -"            -- 0x0e20 - 0x0e2f
      , "a. a a: a+ i i: v v: u u: - - - - - -"           -- 0x0e30 - 0x0e3f
      , "e e' o: a% a& L R S T1 T2 T3 T4 K - - -"         -- 0x0e40 - 0x0e4f
      , "N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 - - - - - -"       -- 0x0e50 - 0x0e5f
      ]
-- | Transliteration table for Devanagari.
--
-- The tokens are listed in code-point order for the range 0x0901 .. 0x094c;
-- @"-"@ marks a code point without a transliteration. There must be exactly
-- one token per code point (76 in total): 'mkTransliteration' pairs the two
-- lists by position, so a missing placeholder shifts every later token onto
-- the wrong character.
transDevanagari :: Transliteration
transDevanagari = mkTransliteration allTrans allCodes where
  allTrans = words $
    "~ * - - " ++                                              -- 0x0901 - 0x0904
    "a- A- i- I- u- U- R- - - - e- E- - - o- O- " ++           -- 0x0905 - 0x0914
    "k K g G N: c C j J n: t. T. d. D. n. t " ++               -- 0x0915 - 0x0924
    "T d D n - p P b B m y r - l - - v " ++                    -- 0x0925 - 0x0935
    "S s. s h - - r. - A i I u U R - - " ++                    -- 0x0936 - 0x0945
    -- 0x0949 and 0x094a have no transliteration and need their own
    -- placeholders, so that "o" and "O" land on the vowel signs 0x094b and
    -- 0x094c, parallel to "o-" and "O-" at 0x0913 and 0x0914 above.
    "- e E - - o O "                                           -- 0x0946 - 0x094c
  allCodes = [0x0901 .. 0x094c]