gf-core/src-3.0/GF/Text/Text.hs

----------------------------------------------------------------------
-- |
-- Module      : Text
-- Maintainer  : AR
-- Stability   : (stable)
-- Portability : (portable)
--
-- > CVS $Date: 2005/06/23 14:32:44 $
-- > CVS $Author: aarne $
-- > CVS $Revision: 1.10 $
--
-- elementary text postprocessing. AR 21\/11\/2001.
--
-- This is very primitive indeed. The functions should work on
-- token lists and not on strings. AR 5\/12\/2002
--
-- XML hack 14\/8\/2004; not in use yet
-----------------------------------------------------------------------------

module GF.Text.Text (untokWithXML,
	     exceptXML,
	     formatAsTextLit,
	     formatAsCodeLit,
	     formatAsText,
	     formatAsHTML,
	     formatAsLatex,
	     formatAsCode,
	     performBinds,
	     performBindsFinnish,
	     unStringLit,
	     concatRemSpace
	    ) where

import GF.Data.Operations
import Data.Char

-- | does not apply untokenizer within XML tags --- heuristic "< "
-- this function is applied from top level...
untokWithXML :: (String -> String) -> String -> String
untokWithXML unt s = case s of
  '<':cs@(c:_) | isAlpha c -> '<':beg ++ ">" ++ unto (drop 1 rest) where
                  (beg,rest) = span (/='>') cs
  '<':cs -> '<':unto cs ---
  [] -> []
  _ -> unt beg ++ unto rest where
               (beg,rest) = span (/='<') s
 where
   unto = untokWithXML unt

-- | ... whereas this one is embedded on a branch
exceptXML :: (String -> String) -> String -> String
exceptXML unt s = '<':beg ++ ">" ++ unt (drop 1 rest) where
  (beg,rest) = span (/='>') s

formatAsTextLit :: String -> String
formatAsTextLit = formatAsText . unwords . map unStringLit . words
--- hope that there will be deforestation...

formatAsCodeLit :: String -> String
formatAsCodeLit = formatAsCode . unwords . map unStringLit . words

formatAsText,formatAsHTML,formatAsLatex :: String -> String
formatAsText  = formatAsTextGen (const False) (=="&-")
formatAsHTML  = formatAsTextGen (\s -> take 1 s == "<" || last s == '>') (const False)
formatAsLatex = formatAsTextGen ((=="\\") . take 1)  (const False)

formatAsTextGen :: (String -> Bool) -> (String -> Bool) -> String -> String
formatAsTextGen tag para = unwords . format . cap . words where
  format ws = case ws of
    w :     ww | capit w -> format $ (cap ww)
    w : c : ww | major c -> format $ (w ++ c) :(cap ww)
    w : c : ww | minor c -> format $ (w ++ c) :    ww
    p : c : ww | openp p -> format $ (p ++ c) :ww
    p : c : ww | spanish p -> format $ (p ++ concat (cap [c])) :ww
    c     : ww | para  c -> "\n\n"        : format ww
    w     : ww           -> w             : format ww
    [] -> []
  cap (p:ww) | tag p = p : cap ww
  cap ((c:cs):ww) = (toUpper c : cs) : ww
  cap [] = []
  capit = (=="&|")
  major = flip elem (map singleton ".!?")
  minor = flip elem (map singleton ",:;)")
  openp = all (flip elem "(")
  spanish = all (flip elem "\161\191")

formatAsCode :: String -> String
formatAsCode = rend 0 . words where
  -- render from BNF Converter
  rend i ss = case ss of
    "["      :ts -> cons "["  $ rend i ts
    "("      :ts -> cons "("  $ rend i ts
    "{"      :ts -> cons "{"  $ new (i+1) $ rend (i+1) ts
    "}" : ";":ts -> new (i-1) $ space "}" $ cons ";" $ new (i-1) $ rend (i-1) ts
    "}"      :ts -> new (i-1) $ cons "}" $ new (i-1) $ rend (i-1) ts
    ";"      :ts -> cons ";"  $ new i $ rend i ts
    t  : "," :ts -> cons t    $ space "," $ rend i ts
    t  : ")" :ts -> cons t    $ cons ")"  $ rend i ts
    t  : "]" :ts -> cons t    $ cons "]"  $ rend i ts
    t        :ts -> space t   $ rend i ts
    _            -> ""
  cons s t  = s ++ t
  new i s   = '\n' : replicate (2*i) ' ' ++ dropWhile isSpace s
  space t s = if null s then t else t ++ " " ++ s

performBinds :: String -> String
performBinds = performBindsOpt (\x y -> y)


-- The function defines an effect of the former on the latter part,
-- such as in vowel harmony. It is triggered by the binder token "&*"

performBindsOpt :: (String -> String -> String) -> String -> String
performBindsOpt harm = unwords . format . words where
  format ws = case ws of
    w : "&+" : u : ws -> format ((w ++        u) : ws)
    w : "&*" : u : ws -> format ((w ++ harm w u) : ws)
    w : ws            -> w : format ws
    []                -> []

-- unlexer for Finnish particles
-- Notice: left associativity crucial for "tie &* ko &* han" --> "tieköhän"

performBindsFinnish :: String -> String
performBindsFinnish = performBindsOpt vowelHarmony where
  vowelHarmony w p = if any (flip elem "aouAOU") w then p else map toFront p
  toFront c = case c of
    'A' -> '\196'
    'O' -> '\214'
    'a' -> '\228'
    'o' -> '\246'
    _ -> c

unStringLit :: String -> String
unStringLit s = case s of
  c : cs | strlim c && strlim (last cs) -> init cs
  _ -> s
 where
   strlim = (=='\'')

concatRemSpace :: String -> String
concatRemSpace = concat . words
{-
concatRemSpace s = case s of
  '<':cs -> exceptXML concatRemSpace cs
  c : cs | isSpace c -> concatRemSpace cs
  c :cs -> c : concatRemSpace cs
  _ -> s
-}