diff --git a/index-3.html b/index-3.html index be60ae150..e0d14657c 100644 --- a/index-3.html +++ b/index-3.html @@ -94,14 +94,11 @@ is available for
GF was born in 1998 at Xerox Research Centre Europe, Grenoble in the project Multilingual Document Authoring. At Xerox, it was used for prototypes including +a restaurant phrase book in 6 languages, +a database query system in 7 languages, +a formalization of an alarm system instructions with translations to 5 languages, and +an authoring system for medical drug descriptions in 2 languages.
-Later projects using GF and involving third parties include, in chronological order,
@@ -155,7 +152,7 @@ applications, libraries are a way to cope with thousands of details involved in syntax, lexicon, and inflection. The GF resource grammar library has support for an increasing number of languages, currently including -Adding a language to the resource library takes 3 to 9 diff --git a/index-3.txt b/index-3.txt index bcab59461..4d4138aa5 100644 --- a/index-3.txt +++ b/index-3.txt @@ -131,26 +131,28 @@ Libraries are at the heart of modern software engineering. In natural language applications, libraries are a way to cope with thousands of details involved in syntax, lexicon, and inflection. The [GF resource grammar library lib/] has support for an increasing number of languages, currently including -- Arabic (partial) -- Bulgarian -- Catalan (partial) -- Danish -- English -- Finnish -- French -- German -- Hindi/Urdu (partial) -- [Interlingua http://www.interlingua.com/] -- Italian -- Norwegian bokmål -- Russian -- Spanish -- Swedish ++ Arabic (partial) ++ Bulgarian ++ Catalan (partial) ++ Danish ++ English ++ Finnish ++ French ++ German ++ Hindi/Urdu (fragments) ++ [Interlingua http://www.interlingua.com/] ++ Italian ++ Norwegian bokmål ++ Russian ++ Spanish ++ Swedish ++ Thai (fragments) Adding a language to the resource library takes 3 to 9 months - [contributions doc/projects.html] are welcome! +% [doc/10lang-small.png] diff --git a/src-3.0/GF/Command/Commands.hs b/src-3.0/GF/Command/Commands.hs index cb002f5d7..f71e1611c 100644 --- a/src-3.0/GF/Command/Commands.hs +++ b/src-3.0/GF/Command/Commands.hs @@ -22,6 +22,7 @@ import GF.Data.ErrM ---- import PGF.ExprSyntax (readExp) import GF.Command.Abstract import GF.Text.Lexing +import GF.Text.Transliterations import GF.Data.Operations @@ -301,21 +302,27 @@ allCommands pgf = Map.fromList [ "string processing functions in the order given in the command line", "option list. Thus 'ps -f -g s' returns g (f s). Typical string processors", "are lexers and unlexers, but also character encoding conversions are possible.", - "The unlexers preserve the division of their input to lines." + "The unlexers preserve the division of their input to lines.", + "To see transliteration tables, use command ut." ], examples = [ "l (EAdd 3 4) | ps -code -- linearize code-like output", "ps -lexer=code | p -cat=Exp -- parse code-like input", "gr -cat=QCl | l | ps -bind -to_utf8 -- linearization output from LangFin", - "ps -from_utf8 \"jag ?r h?r\" | p -- parser in LangSwe in UYF8 terminal" + "ps -from_utf8 \"jag ?r h?r\" | p -- parser in LangSwe in UTF8 terminal", + "ps -to_devanagari -to_utf8 \"A-p\" -- show Devanagari in UTF8 terminal" ], exec = \opts -> return . fromString . stringOps opts . toString, options = [ ("bind","bind tokens separated by Prelude.BIND, i.e. &+"), + ("from_devanagari","from unicode to GF Devanagari transliteration"), + ("from_thai","from unicode to GF Thai transliteration"), ("from_utf8","decode from utf8"), ("lextext","text-like lexer"), ("lexcode","code-like lexer"), ("lexmixed","mixture of text and code (code between $...$)"), + ("to_devanagari","from GF Devanagari transliteration to unicode"), + ("to_thai","from GF Thai transliteration to unicode"), ("to_utf8","encode to utf8"), ("unlextext","text-like unlexer"), ("unlexcode","code-like unlexer"), @@ -370,6 +377,18 @@ allCommands pgf = Map.fromList [ ("number","the maximum number of questions") ] }), + ("ut", emptyCommandInfo { + longname = "unicode_table", + synopsis = "show a transliteration table for a unicode character set", + exec = \opts arg -> do + let t = concatMap prOpt (take 1 opts) + let out = maybe "no such transliteration" characterTable $ transliteration t + return $ fromString out, + options = [ + ("devanagari","Devanagari"), + ("thai", "Thai") + ] + }), ("wf", emptyCommandInfo { longname = "write_file", synopsis = "send string or tree to a file", diff --git a/src-3.0/GF/Text/Lexing.hs b/src-3.0/GF/Text/Lexing.hs index 5ad2a69b7..16391d183 100644 --- a/src-3.0/GF/Text/Lexing.hs +++ b/src-3.0/GF/Text/Lexing.hs @@ -1,5 +1,6 @@ module GF.Text.Lexing (stringOp) where +import GF.Text.Transliterations import GF.Text.UTF8 import Data.Char @@ -19,7 +20,7 @@ stringOp name = case name of "unwords" -> Just $ appUnlexer unwords "to_utf8" -> Just encodeUTF8 "from_utf8" -> Just decodeUTF8 - _ -> Nothing + _ -> transliterate name appLexer :: (String -> [String]) -> String -> String appLexer f = unwords . filter (not . null) . f diff --git a/src-3.0/GF/Text/Transliterations.hs b/src-3.0/GF/Text/Transliterations.hs new file mode 100644 index 000000000..28f653dcf --- /dev/null +++ b/src-3.0/GF/Text/Transliterations.hs @@ -0,0 +1,87 @@ +module GF.Text.Transliterations (transliterate,transliteration,characterTable) where + +import GF.Text.UTF8 + +import Data.Char +import qualified Data.Map as Map + +transliterate :: String -> Maybe (String -> String) +transliterate s = case s of + 'f':'r':'o':'m':'_':t -> fmap appTransFromUnicode $ transliteration t + 't':'o':'_':t -> fmap appTransToUnicode $ transliteration t + _ -> Nothing + +transliteration :: String -> Maybe Transliteration +transliteration s = case s of + "devanagari" -> Just transDevanagari + "thai" -> Just transThai + _ -> Nothing + +characterTable :: Transliteration -> String +characterTable = unlines . map prOne . Map.assocs . trans_from_unicode where + prOne (i,s) = unwords ["|", show i, "|", encodeUTF8 [toEnum i], "|", s, "|"] + +data Transliteration = Trans { + trans_to_unicode :: Map.Map String Int, + trans_from_unicode :: Map.Map Int String + } + +appTransToUnicode :: Transliteration -> String -> String +appTransToUnicode trans = + concat . + map (\c -> maybe c (return . toEnum) $ + Map.lookup c (trans_to_unicode trans) + ) . + unchar + +appTransFromUnicode :: Transliteration -> String -> String +appTransFromUnicode trans = + concat . + map (maybe "?" id . + flip Map.lookup (trans_from_unicode trans) + ) . + map fromEnum + + +-- conventions: +-- each character is either [letter] or [letter+nonletter] +-- when using a sparse range of unicodes, mark missing codes as "-" in transliterations + +mkTransliteration :: [String] -> [Int] -> Transliteration +mkTransliteration ts us = Trans (Map.fromList (tzip ts us)) (Map.fromList (uzip us ts)) + where + tzip ts us = [(t,u) | (t,u) <- zip ts us, t /= "-"] + uzip us ts = [(u,t) | (u,t) <- zip us ts, t /= "-"] + + +unchar :: String -> [String] +unchar s = case s of + c:d:cs + | isAlpha d -> [c] : unchar (d:cs) + | isSpace d -> [c] : unchar cs + | otherwise -> [c,d] : unchar cs + [_] -> [s] + _ -> [] + +transThai :: Transliteration +transThai = mkTransliteration allTrans allCodes where + allTrans = words $ + "- k k1 - k2 - k3 g c c1 c2 s' c3 y' d' t' " ++ + "t1 t2 t3 n' d t t4 t5 t6 n b p p1 f p2 f' " ++ + "p3 m y r - l - w s- s. s h l' O h' - " ++ + "a. a a: a+ i i: v v: u u: - - - - - - " ++ + "e e' o: a% a& L R S T1 T2 T3 T4 K - - - " ++ + "N0 N1 N2 N3 N4 N5 N6 N7 N8 N9 - - - - - - " + allCodes = [0x0e00 .. 0x0e7f] + +transDevanagari :: Transliteration +transDevanagari = mkTransliteration allTrans allCodes where + allTrans = words $ + "~ * - - " ++ + "a- A- i- I- u- U- R- - - - e- E- - - o- O- " ++ + "k K g G N: c C j J n: t. T. d. D. n. t " ++ + "T d D n - p P b B m y r - l - - v " ++ + "S s. s h - - r. - A i I u U R - - " ++ + "- e E o O " + allCodes = [0x0901 .. 0x094c] +