From 17163ae88163587f2c9615898a5789aaf3bee298 Mon Sep 17 00:00:00 2001 From: krasimir Date: Thu, 26 Jan 2017 10:31:43 +0000 Subject: [PATCH] copy the types for BracketedString from the Haskell runtime to the Haskell bindings --- src/runtime/haskell-bind/PGF2.hsc | 40 +++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/src/runtime/haskell-bind/PGF2.hsc b/src/runtime/haskell-bind/PGF2.hsc index 1f8d07c12..5d0484c1e 100644 --- a/src/runtime/haskell-bind/PGF2.hsc +++ b/src/runtime/haskell-bind/PGF2.hsc @@ -44,6 +44,8 @@ module PGF2 (-- * PGF ConcName,Concr,languages, -- ** Linearization linearize,linearizeAll, + FId, LIndex, BracketedString(..), showBracketedString, flattenBracketedString, + alignWords, -- ** Parsing parse, parseWithHeuristics, @@ -65,6 +67,7 @@ import Prelude hiding (fromEnum) import Control.Exception(Exception,throwIO) import Control.Monad(forM_) import System.IO.Unsafe(unsafePerformIO,unsafeInterleaveIO) +import Text.PrettyPrint import PGF2.Expr import PGF2.FFI @@ -541,6 +544,43 @@ linearizeAll lang e = unsafePerformIO $ else do gu_pool_free pl throwIO (PGFError "The abstract tree cannot be linearized") +type FId = Int +type LIndex = Int + +-- | BracketedString represents a sentence that is linearized +-- as usual but we also want to retain the ''brackets'' that +-- mark the beginning and the end of each constituent. +data BracketedString + = Leaf String -- ^ this is the leaf i.e. a single token + | Bracket CId {-# UNPACK #-} !FId {-# UNPACK #-} !LIndex CId [BracketedString] + -- ^ this is a bracket. The 'CId' is the category of + -- the phrase. The 'FId' is a unique identifier for + -- every phrase in the sentence. For context-free grammars + -- i.e. without discontinuous constituents this identifier + -- is also unique for every bracket. When there are discontinuous + -- phrases then the identifiers are unique for every phrase but + -- not for every bracket since the bracket represents a constituent. 
+ -- The different constituents could still be distinguished by using + -- the constituent index i.e. 'LIndex'. If the grammar is reduplicating + -- then the constituent indices will be the same for all brackets + -- that represent the same constituent. + -- The second 'CId' is the name of the abstract function that generated + -- this phrase. + +-- | Renders the bracketed string as a string where +-- the brackets are shown as @(S ...)@ where +-- @S@ is the category. +showBracketedString :: BracketedString -> String +showBracketedString = render . ppBracketedString + +ppBracketedString (Leaf t) = text t +ppBracketedString (Bracket cat fid index _ bss) = parens (ppCId cat <> colon <> int fid <+> hsep (map ppBracketedString bss)) + +-- | Extracts the sequence of tokens from the bracketed string +flattenBracketedString :: BracketedString -> [String] +flattenBracketedString (Leaf w) = [w] +flattenBracketedString (Bracket _ _ _ _ bss) = concatMap flattenBracketedString bss + alignWords :: Concr -> Expr -> [(String, [Int])] alignWords lang e = unsafePerformIO $ withGuPool $ \pl ->