{-# LANGUAGE ExistentialQuantification, DeriveDataTypeable, ScopedTypeVariables #-} ------------------------------------------------- -- | -- Module : PGF2 -- Maintainer : Krasimir Angelov -- Stability : stable -- Portability : portable -- -- This module is an Application Programming Interface to -- load and interpret grammars compiled in the Portable Grammar Format (PGF). -- The PGF format is produced as the final output from the GF compiler. -- The API is meant to be used for embedding GF grammars in Haskell -- programs ------------------------------------------------- #include #include #include #include module PGF2 (-- * PGF PGF,readPGF,showPGF, -- * Abstract syntax AbsName,abstractName, -- ** Categories Cat,categories,categoryContext,categoryProbability, -- ** Functions Fun, functions, functionsByCat, functionType, functionIsDataCon, hasLinearization, -- ** Expressions Expr,showExpr,readExpr,pExpr,pIdent, mkAbs,unAbs, mkApp,unApp,unapply, mkStr,unStr, mkInt,unInt, mkFloat,unFloat, mkMeta,unMeta, exprHash, exprSize, exprFunctions, exprSubstitute, treeProbability, -- ** Types Type, Hypo, BindType(..), startCat, readType, showType, showContext, mkType, unType, -- ** Type checking checkExpr, inferExpr, checkType, -- ** Computing compute, -- * Concrete syntax ConcName,Concr,languages,concreteName,languageCode, -- ** Linearization linearize,linearizeAll,tabularLinearize,tabularLinearizeAll,bracketedLinearize, FId, LIndex, BracketedString(..), showBracketedString, flattenBracketedString, printName, alignWords, -- ** Parsing ParseOutput(..), parse, parseWithHeuristics, complete, -- ** Sentence Lookup lookupSentence, -- ** Generation generateAll, -- ** Morphological Analysis MorphoAnalysis, lookupMorpho, fullFormLexicon, -- ** Visualizations GraphvizOptions(..), graphvizDefaults, graphvizAbstractTree, graphvizParseTree, graphvizDependencyTree, conlls2latexDoc, getCncDepLabels, graphvizWordAlignment, -- * Exceptions PGFError(..), -- * Grammar specific callbacks LiteralCallback,literalCallbacks ) where import Prelude hiding (fromEnum) import Control.Exception(Exception,throwIO) import Control.Monad(forM_) import System.IO.Unsafe(unsafePerformIO,unsafeInterleaveIO) import Text.PrettyPrint import PGF2.Expr import PGF2.Type import PGF2.FFI import Foreign hiding ( Pool, newPool, unsafePerformIO ) import Foreign.C import Data.Typeable import qualified Data.Map as Map import Data.IORef import Data.Char(isUpper,isSpace) import Data.List(isSuffixOf,maximumBy,nub,mapAccumL,intersperse,groupBy,find) import Data.Maybe(fromMaybe) import Data.Function(on) ----------------------------------------------------------------------- -- Functions that take a PGF. -- PGF has many Concrs. -- -- A Concr retains its PGF in a field in order to retain a reference to -- the foreign pointer in case if the application still has a reference -- to Concr but has lost its reference to PGF. type AbsName = String -- ^ Name of abstract syntax type ConcName = String -- ^ Name of concrete syntax -- | Reads file in Portable Grammar Format and produces -- 'PGF' structure. The file is usually produced with: -- -- > $ gf -make readPGF :: FilePath -> IO PGF readPGF fpath = do pool <- gu_new_pool pgf <- withCString fpath $ \c_fpath -> withGuPool $ \tmpPl -> do exn <- gu_new_exn tmpPl pgf <- pgf_read c_fpath pool exn failed <- gu_exn_is_raised exn if failed then do is_errno <- gu_exn_caught exn gu_exn_type_GuErrno if is_errno then do perrno <- (#peek GuExn, data.data) exn errno <- peek perrno gu_pool_free pool ioError (errnoToIOError "readPGF" (Errno errno) Nothing (Just fpath)) else do gu_pool_free pool throwIO (PGFError "The grammar cannot be loaded") else return pgf pgfFPtr <- newForeignPtr gu_pool_finalizer pool let touch = touchForeignPtr pgfFPtr ref <- newIORef Map.empty allocaBytes (#size GuMapItor) $ \itor -> do fptr <- wrapMapItorCallback (getLanguages ref touch) (#poke GuMapItor, fn) itor fptr pgf_iter_languages pgf itor nullPtr freeHaskellFunPtr fptr langs <- readIORef ref return (PGF pgf langs touch) where getLanguages :: IORef (Map.Map String Concr) -> Touch -> MapItorCallback getLanguages ref touch itor key value exn = do langs <- readIORef ref name <- peekUtf8CString (castPtr key) concr <- fmap (\ptr -> Concr ptr touch) $ peek (castPtr value) writeIORef ref $! Map.insert name concr langs showPGF :: PGF -> String showPGF p = unsafePerformIO $ withGuPool $ \tmpPl -> do (sb,out) <- newOut tmpPl exn <- gu_new_exn tmpPl withArrayLen ((map concr . Map.elems . languages) p) $ \n_concrs concrs -> pgf_print (pgf p) (fromIntegral n_concrs) concrs out exn touchPGF p s <- gu_string_buf_freeze sb tmpPl peekUtf8CString s -- | List of all languages available in the grammar. languages :: PGF -> Map.Map ConcName Concr languages p = langs p -- | The abstract language name is the name of the top-level -- abstract module concreteName :: Concr -> ConcName concreteName c = unsafePerformIO (peekUtf8CString =<< pgf_concrete_name (concr c)) languageCode :: Concr -> String languageCode c = unsafePerformIO (peekUtf8CString =<< pgf_language_code (concr c)) -- | Generates an exhaustive possibly infinite list of -- all abstract syntax expressions of the given type. -- The expressions are ordered by their probability. generateAll :: PGF -> Type -> [(Expr,Float)] generateAll p (Type ctype _) = unsafePerformIO $ do genPl <- gu_new_pool exprPl <- gu_new_pool exn <- gu_new_exn genPl enum <- pgf_generate_all (pgf p) ctype exn genPl exprPl genFPl <- newForeignPtr gu_pool_finalizer genPl exprFPl <- newForeignPtr gu_pool_finalizer exprPl fromPgfExprEnum enum genFPl (touchPGF p >> touchForeignPtr exprFPl) -- | The abstract language name is the name of the top-level -- abstract module abstractName :: PGF -> AbsName abstractName p = unsafePerformIO (peekUtf8CString =<< pgf_abstract_name (pgf p)) -- | The start category is defined in the grammar with -- the \'startcat\' flag. This is usually the sentence category -- but it is not necessary. Despite that there is a start category -- defined you can parse with any category. The start category -- definition is just for convenience. startCat :: PGF -> Type startCat p = unsafePerformIO $ do typPl <- gu_new_pool c_type <- pgf_start_cat (pgf p) typPl typeFPl <- newForeignPtr gu_pool_finalizer typPl return (Type c_type (touchForeignPtr typeFPl)) loadConcr :: Concr -> FilePath -> IO () loadConcr c fpath = withCString fpath $ \c_fpath -> withCString "rb" $ \c_mode -> withGuPool $ \tmpPl -> do file <- fopen c_fpath c_mode inp <- gu_file_in file tmpPl exn <- gu_new_exn tmpPl pgf_concrete_load (concr c) inp exn failed <- gu_exn_is_raised exn if failed then do is_errno <- gu_exn_caught exn gu_exn_type_GuErrno if is_errno then do perrno <- (#peek GuExn, data.data) exn errno <- peek perrno ioError (errnoToIOError "loadConcr" (Errno errno) Nothing (Just fpath)) else do throwIO (PGFError "The language cannot be loaded") else return () unloadConcr :: Concr -> IO () unloadConcr c = pgf_concrete_unload (concr c) -- | The type of a function functionType :: PGF -> Fun -> Maybe Type functionType p fn = unsafePerformIO $ withGuPool $ \tmpPl -> do c_fn <- newUtf8CString fn tmpPl c_type <- pgf_function_type (pgf p) c_fn return (if c_type == nullPtr then Nothing else Just (Type c_type (touchPGF p))) -- | The type of a function functionIsDataCon :: PGF -> Fun -> Bool functionIsDataCon p fn = unsafePerformIO $ withGuPool $ \tmpPl -> do c_fn <- newUtf8CString fn tmpPl res <- pgf_function_is_constructor (pgf p) c_fn touchPGF p return (res /= 0) -- | Checks an expression against a specified type. checkExpr :: PGF -> Expr -> Type -> Either String Expr checkExpr p (Expr c_expr touch1) (Type c_ty touch2) = unsafePerformIO $ alloca $ \pexpr -> withGuPool $ \tmpPl -> do exn <- gu_new_exn tmpPl exprPl <- gu_new_pool poke pexpr c_expr pgf_check_expr (pgf p) pexpr c_ty exn exprPl touchPGF p >> touch1 >> touch2 status <- gu_exn_is_raised exn if not status then do exprFPl <- newForeignPtr gu_pool_finalizer exprPl c_expr <- peek pexpr return (Right (Expr c_expr (touchForeignPtr exprFPl))) else do is_tyerr <- gu_exn_caught exn gu_exn_type_PgfTypeError c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg gu_pool_free exprPl if is_tyerr then return (Left msg) else throwIO (PGFError msg) -- | Tries to infer the type of an expression. Note that -- even if the expression is type correct it is not always -- possible to infer its type in the GF type system. -- In this case the function returns an error. inferExpr :: PGF -> Expr -> Either String (Expr, Type) inferExpr p (Expr c_expr touch1) = unsafePerformIO $ alloca $ \pexpr -> withGuPool $ \tmpPl -> do exn <- gu_new_exn tmpPl exprPl <- gu_new_pool poke pexpr c_expr c_ty <- pgf_infer_expr (pgf p) pexpr exn exprPl touchPGF p >> touch1 status <- gu_exn_is_raised exn if not status then do exprFPl <- newForeignPtr gu_pool_finalizer exprPl let touch = touchForeignPtr exprFPl c_expr <- peek pexpr return (Right (Expr c_expr touch, Type c_ty touch)) else do is_tyerr <- gu_exn_caught exn gu_exn_type_PgfTypeError c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg gu_pool_free exprPl if is_tyerr then return (Left msg) else throwIO (PGFError msg) -- | Check whether a type is consistent with the abstract -- syntax of the grammar. checkType :: PGF -> Type -> Either String Type checkType p (Type c_ty touch1) = unsafePerformIO $ alloca $ \pty -> withGuPool $ \tmpPl -> do exn <- gu_new_exn tmpPl typePl <- gu_new_pool poke pty c_ty pgf_check_type (pgf p) pty exn typePl touchPGF p >> touch1 status <- gu_exn_is_raised exn if not status then do typeFPl <- newForeignPtr gu_pool_finalizer typePl c_ty <- peek pty return (Right (Type c_ty (touchForeignPtr typeFPl))) else do is_tyerr <- gu_exn_caught exn gu_exn_type_PgfTypeError c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg gu_pool_free typePl if is_tyerr then return (Left msg) else throwIO (PGFError msg) compute :: PGF -> Expr -> Expr compute p (Expr c_expr touch1) = unsafePerformIO $ withGuPool $ \tmpPl -> do exn <- gu_new_exn tmpPl exprPl <- gu_new_pool c_expr <- pgf_compute (pgf p) c_expr exn tmpPl exprPl touchPGF p >> touch1 status <- gu_exn_is_raised exn if not status then do exprFPl <- newForeignPtr gu_pool_finalizer exprPl return (Expr c_expr (touchForeignPtr exprFPl)) else do c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg gu_pool_free exprPl throwIO (PGFError msg) treeProbability :: PGF -> Expr -> Float treeProbability p (Expr c_expr touch1) = unsafePerformIO $ do res <- pgf_compute_tree_probability (pgf p) c_expr touchPGF p >> touch1 return (realToFrac res) exprHash :: Int32 -> Expr -> Int32 exprHash h (Expr c_expr touch1) = unsafePerformIO $ do h <- pgf_expr_hash (fromIntegral h) c_expr touch1 return (fromIntegral h) exprSize :: Expr -> Int exprSize (Expr c_expr touch1) = unsafePerformIO $ do size <- pgf_expr_size c_expr touch1 return (fromIntegral size) exprFunctions :: Expr -> [Fun] exprFunctions (Expr c_expr touch) = unsafePerformIO $ withGuPool $ \tmpPl -> do seq <- pgf_expr_functions c_expr tmpPl len <- (#peek GuSeq, len) seq arr <- peekArray (fromIntegral (len :: CInt)) (seq `plusPtr` (#offset GuSeq, data)) funs <- mapM peekUtf8CString arr touch return funs exprSubstitute :: Expr -> [Expr] -> Expr exprSubstitute (Expr c_expr touch) meta_values = unsafePerformIO $ withGuPool $ \tmpPl -> do c_meta_values <- newSequence (#size PgfExpr) pokeExpr meta_values tmpPl exprPl <- gu_new_pool c_expr <- pgf_expr_substitute c_expr c_meta_values exprPl touch exprFPl <- newForeignPtr gu_pool_finalizer exprPl let touch' = sequence_ (touchForeignPtr exprFPl : map touchExpr meta_values) return (Expr c_expr touch') where pokeExpr ptr (Expr c_expr _) = poke ptr c_expr ----------------------------------------------------------------------------- -- Graphviz data GraphvizOptions = GraphvizOptions {noLeaves :: Bool, noFun :: Bool, noCat :: Bool, noDep :: Bool, nodeFont :: String, leafFont :: String, nodeColor :: String, leafColor :: String, nodeEdgeStyle :: String, leafEdgeStyle :: String } graphvizDefaults = GraphvizOptions False False False True "" "" "" "" "" "" -- | Renders an abstract syntax tree in a Graphviz format. graphvizAbstractTree :: PGF -> GraphvizOptions -> Expr -> String graphvizAbstractTree p opts e = unsafePerformIO $ withGuPool $ \tmpPl -> do (sb,out) <- newOut tmpPl exn <- gu_new_exn tmpPl c_opts <- newGraphvizOptions tmpPl opts pgf_graphviz_abstract_tree (pgf p) (expr e) c_opts out exn touchExpr e s <- gu_string_buf_freeze sb tmpPl peekUtf8CString s graphvizParseTree :: Concr -> GraphvizOptions -> Expr -> String graphvizParseTree c opts e = unsafePerformIO $ withGuPool $ \tmpPl -> do (sb,out) <- newOut tmpPl exn <- gu_new_exn tmpPl c_opts <- newGraphvizOptions tmpPl opts pgf_graphviz_parse_tree (concr c) (expr e) c_opts out exn touchExpr e s <- gu_string_buf_freeze sb tmpPl peekUtf8CString s graphvizWordAlignment :: [Concr] -> GraphvizOptions -> Expr -> String graphvizWordAlignment cs opts e = unsafePerformIO $ withGuPool $ \tmpPl -> withArrayLen (map concr cs) $ \n_concrs ptr -> do (sb,out) <- newOut tmpPl exn <- gu_new_exn tmpPl c_opts <- newGraphvizOptions tmpPl opts pgf_graphviz_word_alignment ptr (fromIntegral n_concrs) (expr e) c_opts out exn touchExpr e s <- gu_string_buf_freeze sb tmpPl peekUtf8CString s type Labels = Map.Map Fun [String] -- | Visualize word dependency tree. graphvizDependencyTree :: String -- ^ Output format: @"latex"@, @"conll"@, @"malt_tab"@, @"malt_input"@ or @"dot"@ -> Bool -- ^ Include extra information (debug) -> Maybe Labels -- ^ abstract label information obtained with 'getDepLabels' -> Maybe CncLabels -- ^ concrete label information obtained with ' ' (was: unused (was: @Maybe String@)) -> Concr -> Expr -> String -- ^ Rendered output in the specified format graphvizDependencyTree format debug mlab mclab concr t = case format of "latex" -> render . ppLaTeX $ conll2latex' conll "svg" -> render . ppSVG . toSVG $ conll2latex' conll "conll" -> printCoNLL conll "malt_tab" -> render $ vcat (map (hcat . intersperse (char '\t') . (\ws -> [ws !! 0,ws !! 1,ws !! 3,ws !! 6,ws !! 7])) wnodes) "malt_input" -> render $ vcat (map (hcat . intersperse (char '\t') . take 6) wnodes) _ -> render $ text "digraph {" $$ space $$ nest 2 (text "rankdir=LR ;" $$ text "node [shape = plaintext] ;" $$ vcat nodes $$ vcat links) $$ text "}" where conll = maybe conll0 (\ls -> fixCoNLL ls conll0) mclab conll0 = (map.map) render wnodes nodes = map mkNode leaves links = map mkLink [(fid, fromMaybe (dep_lbl,nil) (lookup fid deps)) | ((cat,fid,fun),_,w) <- tail leaves] -- CoNLL format: ID FORM LEMMA PLEMMA POS PPOS FEAT PFEAT HEAD PHEAD DEPREL PDEPREL -- P variants are automatically predicted rather than gold standard wnodes = [[int i, maltws ws, text fun, text (posCat cat), text cat, unspec, int parent, text lab, unspec, unspec] | ((cat,fid,fun),i,ws) <- tail leaves, let (lab,parent) = fromMaybe (dep_lbl,0) (do (lbl,fid) <- lookup fid deps (_,i,_) <- find (\((_,fid1,_),i,_) -> fid == fid1) leaves return (lbl,i)) ] maltws = text . concat . intersperse "+" . words -- no spaces in column 2 nil = -1 bss = bracketedLinearize concr t root = ("_",nil,"_") leaves = (root,0,root_lbl) : (groupAndIndexIt 1 . concatMap (getLeaves root)) bss deps = let (_,(h,deps)) = getDeps 0 [] t in (h,(dep_lbl,nil)):deps groupAndIndexIt id [] = [] groupAndIndexIt id ((p,w):pws) = (p,id,w) : groupAndIndexIt (id+1) pws --- groupAndIndexIt id ((p,w):pws) = let (ws,pws1) = collect pws --- in (p,id,unwords (w:ws)) : groupAndIndexIt (id+1) pws1 where collect pws@((p1,w):pws1) | p == p1 = let (ws,pws2) = collect pws1 in (w:ws,pws2) collect pws = ([],pws) getLeaves parent bs = case bs of Leaf w -> [(parent,w)] Bracket cat fid _ fun bss -> concatMap (getLeaves (cat,fid,fun)) bss mkNode ((_,p,_),i,w) = tag p <+> brackets (text "label = " <> doubleQuotes (int i <> char '.' <+> text w)) <+> semi mkLink (x,(lbl,y)) = tag y <+> text "->" <+> tag x <+> text "[label = " <> doubleQuotes (text lbl) <> text "] ;" labels = maybe Map.empty id mlab clabels = maybe [] id mclab posCat cat = case Map.lookup cat labels of Just [p] -> p _ -> cat getDeps n_fid xs e = case unAbs e of Just (_, x, e) -> getDeps n_fid (x:xs) e Nothing -> case unApp e of Just (f,es) -> let (n_fid_1,ds) = descend n_fid xs es (mb_h, deps) = selectHead f ds in case mb_h of Just (fid,deps0) -> (n_fid_1+1,(fid,deps0++ [(n_fid_1,(dep_lbl,fid))]++ concat [(m,(lbl,fid)):ds | (lbl,(m,ds)) <- deps])) Nothing -> (n_fid_1+1,(n_fid_1,concat [(m,(lbl,n_fid_1)):ds | (lbl,(m,ds)) <- deps])) Nothing -> (n_fid+1,(n_fid,[])) descend n_fid xs es = mapAccumL (\n_fid e -> getDeps n_fid xs e) n_fid es selectHead f ds = case Map.lookup f labels of Just lbls -> extractHead (zip lbls ds) Nothing -> extractLast ds where extractHead [] = (Nothing, []) extractHead (ld@(l,d):lds) | l == head_lbl = (Just d,lds) | otherwise = let (mb_h,deps) = extractHead lds in (mb_h,ld:deps) extractLast [] = (Nothing, []) extractLast (d:ds) | null ds = (Just d,[]) | otherwise = let (mb_h,deps) = extractLast ds in (mb_h,(dep_lbl,d):deps) dep_lbl = "dep" head_lbl = "head" root_lbl = "ROOT" unspec = text "_" ---------------------- should be a separate module? -- visualization with latex output. AR Nov 2015 conlls2latexDoc :: [String] -> String conlls2latexDoc = render . latexDoc . vcat . intersperse (text "" $+$ app "vspace" (text "4mm")) . map conll2latex . filter (not . null) conll2latex :: String -> Doc conll2latex = ppLaTeX . conll2latex' . parseCoNLL conll2latex' :: CoNLL -> [LaTeX] conll2latex' = dep2latex . conll2dep' data Dep = Dep { wordLength :: Int -> Double -- length of word at position int -- was: fixed width, millimetres (>= 20.0) , tokens :: [(String,String)] -- word, pos (0..) , deps :: [((Int,Int),String)] -- from, to, label , root :: Int -- root word position } -- some general measures defaultWordLength = 20.0 -- the default fixed width word length, making word 100 units defaultUnit = 0.2 -- unit in latex pictures, 0.2 millimetres spaceLength = 10.0 charWidth = 1.8 wsize rwld w = 100 * rwld w + spaceLength -- word length, units wpos rwld i = sum [wsize rwld j | j <- [0..i-1]] -- start position of the i'th word wdist rwld x y = sum [wsize rwld i | i <- [min x y .. max x y - 1]] -- distance between words x and y labelheight h = h + arcbase + 3 -- label just above arc; 25 would put it just below labelstart c = c - 15.0 -- label starts 15u left of arc centre arcbase = 30.0 -- arcs start and end 40u above the bottom arcfactor r = r * 600 -- reduction of arc size from word distance xyratio = 3 -- width/height ratio of arcs putArc :: (Int -> Double) -> Int -> Int -> Int -> String -> [DrawingCommand] putArc frwld height x y label = [oval,arrowhead,labelling] where oval = Put (ctr,arcbase) (OvalTop (wdth,hght)) arrowhead = Put (endp,arcbase + 5) (ArrowDown 5) -- downgoing arrow 5u above the arc base labelling = Put (labelstart ctr,labelheight (hght/2)) (TinyText label) dxy = wdist frwld x y -- distance between words, >>= 20.0 ndxy = 100 * rwld * fromIntegral height -- distance that is indep of word length hdxy = dxy / 2 -- half the distance wdth = dxy - (arcfactor rwld)/dxy -- longer arcs are wider in proportion hght = ndxy / (xyratio * rwld) -- arc height is independent of word length begp = min x y -- begin position of oval ctr = wpos frwld begp + hdxy + (if x < y then 20 else 10) -- LR arcs are farther right from center of oval endp = (if x < y then (+) else (-)) ctr (wdth/2) -- the point of the arrow rwld = 0.5 ---- dep2latex :: Dep -> [LaTeX] dep2latex d = [Comment (unwords (map fst (tokens d))), Picture defaultUnit (width,height) ( [Put (wpos rwld i,0) (Text w) | (i,w) <- zip [0..] (map fst (tokens d))] -- words ++ [Put (wpos rwld i,15) (TinyText w) | (i,w) <- zip [0..] (map snd (tokens d))] -- pos tags 15u above bottom ++ concat [putArc rwld (aheight x y) x y label | ((x,y),label) <- deps d] -- arcs and labels ++ [Put (wpos rwld (root d) + 15,height) (ArrowDown (height-arcbase))] ++ [Put (wpos rwld (root d) + 20,height - 10) (TinyText "ROOT")] )] where wld i = wordLength d i -- >= 20.0 rwld i = (wld i) / defaultWordLength -- >= 1.0 aheight x y = depth (min x y) (max x y) + 1 ---- abs (x-y) arcs = [(min u v, max u v) | ((u,v),_) <- deps d] depth x y = case [(u,v) | (u,v) <- arcs, (x < u && v <= y) || (x == u && v < y)] of ---- only projective arcs counted [] -> 0 uvs -> 1 + maximum (0:[depth u v | (u,v) <- uvs]) width = {-round-} (sum [wsize rwld w | (w,_) <- zip [0..] (tokens d)]) + {-round-} spaceLength * fromIntegral ((length (tokens d)) - 1) height = 50 + 20 * {-round-} (maximum (0:[aheight x y | ((x,y),_) <- deps d])) type CoNLL = [[String]] parseCoNLL :: String -> CoNLL parseCoNLL = map words . lines --conll2dep :: String -> Dep --conll2dep = conll2dep' . parseCoNLL conll2dep' :: CoNLL -> Dep conll2dep' ls = Dep { wordLength = wld , tokens = toks , deps = dps , root = head $ [read x-1 | x:_:_:_:_:_:"0":_ <- ls] ++ [1] } where wld i = maximum (0:[charWidth * fromIntegral (length w) | w <- let (tok,pos) = toks !! i in [tok,pos]]) toks = [(w,c) | _:w:_:c:_ <- ls] dps = [((read y-1, read x-1),lab) | x:_:_:_:_:_:y:lab:_ <- ls, y /="0"] --maxdist = maximum [abs (x-y) | ((x,y),_) <- dps] -- * LaTeX Pictures (see https://en.wikibooks.org/wiki/LaTeX/Picture) -- We render both LaTeX and SVG from this intermediate representation of -- LaTeX pictures. data LaTeX = Comment String | Picture UnitLengthMM Size [DrawingCommand] data DrawingCommand = Put Position Object data Object = Text String | TinyText String | OvalTop Size | ArrowDown Length type UnitLengthMM = Double type Size = (Double,Double) type Position = (Double,Double) type Length = Double -- * latex formatting ppLaTeX = vcat . map ppLaTeX1 where ppLaTeX1 el = case el of Comment s -> comment s Picture unit size cmds -> app "setlength{\\unitlength}" (text (show unit ++ "mm")) $$ hang (app "begin" (text "picture")<>text (show size)) 2 (vcat (map ppDrawingCommand cmds)) $$ app "end" (text "picture") $$ text "" ppDrawingCommand (Put pos obj) = put pos (ppObject obj) ppObject obj = case obj of Text s -> text s TinyText s -> small (text s) OvalTop size -> text "\\oval" <> text (show size) <> text "[t]" ArrowDown len -> app "vector(0,-1)" (text (show len)) put p@(_,_) = app ("put" ++ show p) small w = text "{\\tiny" <+> w <> text "}" comment s = text "%%" <+> text s -- line break show follow app macro arg = text "\\" <> text macro <> text "{" <> arg <> text "}" latexDoc :: Doc -> Doc latexDoc body = vcat [text "\\documentclass{article}", text "\\usepackage[utf8]{inputenc}", text "\\begin{document}", body, text "\\end{document}"] -- * SVG (see https://www.w3.org/Graphics/SVG/IG/resources/svgprimer.html) -- | Render LaTeX pictures as SVG toSVG = concatMap toSVG1 where toSVG1 el = case el of Comment s -> [] Picture unit size@(w,h) cmds -> [Elem "svg" ["width".=x1,"height".=y0+5, ("viewBox",unwords (map show [0,0,x1,y0+5])), ("version","1.1"), ("xmlns","http://www.w3.org/2000/svg")] (white_bg:concatMap draw cmds)] where white_bg = Elem "rect" ["x".=0,"y".=0,"width".=x1,"height".=y0+5, ("fill","white")] [] draw (Put pos obj) = objectSVG pos obj objectSVG pos obj = case obj of Text s -> [text 16 pos s] TinyText s -> [text 10 pos s] OvalTop size -> [ovalTop pos size] ArrowDown len -> arrowDown pos len text h (x,y) s = Elem "text" ["x".=xc x,"y".=yc y-2,"font-size".=h] [CharData s] ovalTop (x,y) (w,h) = Elem "path" [("d",path),("stroke","black"),("fill","none")] [] where x1 = x-w/2 x2 = min x (x1+r) x3 = max x (x4-r) x4 = x+w/2 y1 = y y2 = y+r r = h/2 sx = show . xc sy = show . yc path = unwords (["M",sx x1,sy y1,"Q",sx x1,sy y2,sx x2,sy y2, "L",sx x3,sy y2,"Q",sx x4,sy y2,sx x4,sy y1]) arrowDown (x,y) len = [Elem "line" ["x1".=xc x,"y1".=yc y,"x2".=xc x,"y2".=y2, ("stroke","black")] [], Elem "path" [("d",unwords arrowhead)] []] where x2 = xc x y2 = yc (y-len) arrowhead = "M":map show [x2,y2,x2-3,y2-6,x2+3,y2-6] xc x = num x+5 yc y = y0-num y x1 = num w+10 y0 = num h+20 num x = round (scale*x) scale = unit*5 infix 0 .= n.=v = (n,show v) -- * SVG is XML data SVG = CharData String | Elem TagName Attrs [SVG] type TagName = String type Attrs = [(String,String)] ppSVG svg = vcat [text "", text "", text "", vcat (map ppSVG1 svg)] -- It should be a single element... where ppSVG1 svg1 = case svg1 of CharData s -> text (encode s) Elem tag attrs [] -> text "<"<>text tag<>cat (map attr attrs) <> text "/>" Elem tag attrs svg -> cat [text "<"<>text tag<>cat (map attr attrs) <> text ">", nest 2 (cat (map ppSVG1 svg)), text "text tag<>text ">"] attr (n,v) = text " "<>text n<>text "=\""<>text (encode v)<>text "\"" encode s = foldr encodeEntity "" s encodeEntity = encodeEntity' (const False) encodeEntity' esc c r = case c of '&' -> "&"++r '<' -> "<"++r '>' -> ">"++r _ -> c:r ---------------------------------- -- concrete syntax annotations (local) on top of conll -- examples of annotations: -- UseComp {"not"} PART neg head -- UseComp {*} AUX cop head type CncLabels = [(String, String -> Maybe (String -> String,String,String))] -- (fun, word -> (pos,label,target)) -- the pos can remain unchanged, as in the current notation in the article fixCoNLL :: CncLabels -> CoNLL -> CoNLL fixCoNLL labels conll = map fixc conll where fixc row = case row of (i:word:fun:pos:cat:x_:"0":"dep":xs) -> (i:word:fun:pos:cat:x_:"0":"root":xs) --- change the root label from dep to root (i:word:fun:pos:cat:x_:j:label:xs) -> case look (fun,word) of Just (pos',label',"head") -> (i:word:fun:pos' pos:cat:x_:j :label':xs) Just (pos',label',target) -> (i:word:fun:pos' pos:cat:x_: getDep j target:label':xs) _ -> row _ -> row look (fun,word) = case lookup fun labels of Just relabel -> case relabel word of Just row -> Just row _ -> case lookup "*" labels of Just starlabel -> starlabel word _ -> Nothing _ -> case lookup "*" labels of Just starlabel -> starlabel word _ -> Nothing getDep j label = maybe j id $ lookup (label,j) [((label,j),i) | i:word:fun:pos:cat:x_:j:label:xs <- conll] getCncDepLabels :: String -> CncLabels getCncDepLabels = map merge . groupBy (\ (x,_) (a,_) -> x == a) . concatMap analyse . filter choose . lines where --- choose is for compatibility with the general notation choose line = notElem '(' line && elem '{' line --- ignoring non-local (with "(") and abstract (without "{") rules analyse line = case break (=='{') line of (beg,_:ws) -> case break (=='}') ws of (toks,_:target) -> case (words beg, words target) of (fun:_,[ label,j]) -> [(fun, (tok, (id, label,j))) | tok <- getToks toks] (fun:_,[pos,label,j]) -> [(fun, (tok, (const pos,label,j))) | tok <- getToks toks] _ -> [] _ -> [] _ -> [] merge rules@((fun,_):_) = (fun, \tok -> case lookup tok (map snd rules) of Just new -> return new _ -> lookup "*" (map snd rules) ) getToks = words . map (\c -> if elem c "\"," then ' ' else c) printCoNLL :: CoNLL -> String printCoNLL = unlines . map (concat . intersperse "\t") newGraphvizOptions :: Ptr GuPool -> GraphvizOptions -> IO (Ptr PgfGraphvizOptions) newGraphvizOptions pool opts = do c_opts <- gu_malloc pool (#size PgfGraphvizOptions) (#poke PgfGraphvizOptions, noLeaves) c_opts (if noLeaves opts then 1 else 0 :: CInt) (#poke PgfGraphvizOptions, noFun) c_opts (if noFun opts then 1 else 0 :: CInt) (#poke PgfGraphvizOptions, noCat) c_opts (if noCat opts then 1 else 0 :: CInt) (#poke PgfGraphvizOptions, noDep) c_opts (if noDep opts then 1 else 0 :: CInt) newUtf8CString (nodeFont opts) pool >>= (#poke PgfGraphvizOptions, nodeFont) c_opts newUtf8CString (leafFont opts) pool >>= (#poke PgfGraphvizOptions, leafFont) c_opts newUtf8CString (nodeColor opts) pool >>= (#poke PgfGraphvizOptions, nodeColor) c_opts newUtf8CString (leafColor opts) pool >>= (#poke PgfGraphvizOptions, leafColor) c_opts newUtf8CString (nodeEdgeStyle opts) pool >>= (#poke PgfGraphvizOptions, nodeEdgeStyle) c_opts newUtf8CString (leafEdgeStyle opts) pool >>= (#poke PgfGraphvizOptions, leafEdgeStyle) c_opts return c_opts ----------------------------------------------------------------------------- -- Functions using Concr -- Morpho analyses, parsing & linearization type MorphoAnalysis = (Fun,Cat,Float) lookupMorpho :: Concr -> String -> [MorphoAnalysis] lookupMorpho (Concr concr master) sent = unsafePerformIO $ withGuPool $ \tmpPl -> do ref <- newIORef [] cback <- gu_malloc tmpPl (#size PgfMorphoCallback) fptr <- wrapLookupMorphoCallback (getAnalysis ref) (#poke PgfMorphoCallback, callback) cback fptr c_sent <- newUtf8CString sent tmpPl pgf_lookup_morpho concr c_sent cback nullPtr freeHaskellFunPtr fptr readIORef ref fullFormLexicon :: Concr -> [(String, [MorphoAnalysis])] fullFormLexicon lang = unsafePerformIO $ do pl <- gu_new_pool enum <- pgf_fullform_lexicon (concr lang) pl fpl <- newForeignPtr gu_pool_finalizer pl fromFullFormEntry enum fpl where fromFullFormEntry :: Ptr GuEnum -> ForeignPtr GuPool -> IO [(String, [MorphoAnalysis])] fromFullFormEntry enum fpl = do ffEntry <- alloca $ \ptr -> withForeignPtr fpl $ \pl -> do gu_enum_next enum ptr pl peek ptr if ffEntry == nullPtr then do finalizeForeignPtr fpl touchConcr lang return [] else do tok <- peekUtf8CString =<< pgf_fullform_get_string ffEntry ref <- newIORef [] allocaBytes (#size PgfMorphoCallback) $ \cback -> do fptr <- wrapLookupMorphoCallback (getAnalysis ref) (#poke PgfMorphoCallback, callback) cback fptr pgf_fullform_get_analyses ffEntry cback nullPtr ans <- readIORef ref toks <- unsafeInterleaveIO (fromFullFormEntry enum fpl) return ((tok,ans) : toks) getAnalysis :: IORef [MorphoAnalysis] -> LookupMorphoCallback getAnalysis ref self c_lemma c_anal prob exn = do ans <- readIORef ref lemma <- peekUtf8CString c_lemma anal <- peekUtf8CString c_anal writeIORef ref ((lemma, anal, prob):ans) -- | This data type encodes the different outcomes which you could get from the parser. data ParseOutput = ParseFailed Int String -- ^ The integer is the position in number of unicode characters where the parser failed. -- The string is the token where the parser have failed. | ParseOk [(Expr,Float)] -- ^ If the parsing and the type checking are successful we get a list of abstract syntax trees. -- The list should be non-empty. | ParseIncomplete -- ^ The sentence is not complete. parse :: Concr -> Type -> String -> ParseOutput parse lang ty sent = parseWithHeuristics lang ty sent (-1.0) [] parseWithHeuristics :: Concr -- ^ the language with which we parse -> Type -- ^ the start category -> String -- ^ the input sentence -> Double -- ^ the heuristic factor. -- A negative value tells the parser -- to lookup up the default from -- the grammar flags -> [(Cat, Int -> Int -> Maybe (Expr,Float,Int))] -- ^ a list of callbacks for literal categories. -- The arguments of the callback are: -- the index of the constituent for the literal category; -- the input sentence; the current offset in the sentence. -- If a literal has been recognized then the output should -- be Just (expr,probability,end_offset) -> ParseOutput parseWithHeuristics lang (Type ctype _) sent heuristic callbacks = unsafePerformIO $ do exprPl <- gu_new_pool parsePl <- gu_new_pool exn <- gu_new_exn parsePl sent <- newUtf8CString sent parsePl callbacks_map <- mkCallbacksMap (concr lang) callbacks parsePl enum <- pgf_parse_with_heuristics (concr lang) ctype sent heuristic callbacks_map exn parsePl exprPl failed <- gu_exn_is_raised exn if failed then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError if is_parse_error then do c_err <- (#peek GuExn, data.data) exn c_incomplete <- (#peek PgfParseError, incomplete) c_err if (c_incomplete :: CInt) == 0 then do c_offset <- (#peek PgfParseError, offset) c_err token_ptr <- (#peek PgfParseError, token_ptr) c_err token_len <- (#peek PgfParseError, token_len) c_err tok <- peekUtf8CStringLen token_ptr token_len gu_pool_free parsePl gu_pool_free exprPl return (ParseFailed (fromIntegral (c_offset :: CInt)) tok) else do gu_pool_free parsePl gu_pool_free exprPl return ParseIncomplete else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg gu_pool_free parsePl gu_pool_free exprPl throwIO (PGFError msg) else do gu_pool_free parsePl gu_pool_free exprPl throwIO (PGFError "Parsing failed") else do parseFPl <- newForeignPtr gu_pool_finalizer parsePl exprFPl <- newForeignPtr gu_pool_finalizer exprPl exprs <- fromPgfExprEnum enum parseFPl (touchConcr lang >> touchForeignPtr exprFPl) return (ParseOk exprs) mkCallbacksMap :: Ptr PgfConcr -> [(String, Int -> Int -> Maybe (Expr,Float,Int))] -> Ptr GuPool -> IO (Ptr PgfCallbacksMap) mkCallbacksMap concr callbacks pool = do callbacks_map <- pgf_new_callbacks_map concr pool forM_ callbacks $ \(cat,match) -> do ccat <- newUtf8CString cat pool match <- wrapLiteralMatchCallback (match_callback match) predict <- wrapLiteralPredictCallback predict_callback hspgf_callbacks_map_add_literal concr callbacks_map ccat match predict pool return callbacks_map where match_callback match clin_idx poffset out_pool = do coffset <- peek poffset case match (fromIntegral clin_idx) (fromIntegral coffset) of Nothing -> return nullPtr Just (e,prob,offset') -> do poke poffset (fromIntegral offset') -- here we copy the expression to out_pool c_e <- withGuPool $ \tmpPl -> do exn <- gu_new_exn tmpPl (sb,out) <- newOut tmpPl let printCtxt = nullPtr pgf_print_expr (expr e) printCtxt 1 out exn c_str <- gu_string_buf_freeze sb tmpPl guin <- gu_string_in c_str tmpPl pgf_read_expr guin out_pool tmpPl exn ep <- gu_malloc out_pool (#size PgfExprProb) (#poke PgfExprProb, expr) ep c_e (#poke PgfExprProb, prob) ep prob return ep predict_callback _ _ _ = return nullPtr complete :: Concr -- ^ the language with which we do word completion -> Type -- ^ the start category -> String -- ^ the input sentence -> String -- ^ prefix for the word to be completed -> [(String, Cat, Fun, Float)] complete lang (Type ctype _) sent prefix = unsafePerformIO $ do pl <- gu_new_pool exn <- gu_new_exn pl sent <- newUtf8CString sent pl prefix <- newUtf8CString prefix pl enum <- pgf_complete (concr lang) ctype sent prefix exn pl failed <- gu_exn_is_raised exn if failed then do gu_pool_free pl return [] else do fpl <- newForeignPtr gu_pool_finalizer pl tokens <- fromPgfTokenEnum enum fpl return tokens lookupSentence :: Concr -- ^ the language with which we parse -> Type -- ^ the start category -> String -- ^ the input sentence -> [(Expr,Float)] lookupSentence lang (Type ctype _) sent = unsafePerformIO $ do exprPl <- gu_new_pool parsePl <- gu_new_pool sent <- newUtf8CString sent parsePl enum <- pgf_lookup_sentence (concr lang) ctype sent parsePl exprPl parseFPl <- newForeignPtr gu_pool_finalizer parsePl exprFPl <- newForeignPtr gu_pool_finalizer exprPl exprs <- fromPgfExprEnum enum parseFPl (touchConcr lang >> touchForeignPtr exprFPl) return exprs -- | The oracle is a triple of functions. -- The first two take a category name and a linearization field name -- and they should return True/False when the corresponding -- prediction or completion is appropriate. The third function -- is the oracle for literals. type Oracle = (Maybe (Cat -> String -> Int -> Bool) ,Maybe (Cat -> String -> Int -> Bool) ,Maybe (Cat -> String -> Int -> Maybe (Expr,Float,Int)) ) parseWithOracle :: Concr -- ^ the language with which we parse -> Cat -- ^ the start category -> String -- ^ the input sentence -> Oracle -> ParseOutput parseWithOracle lang cat sent (predict,complete,literal) = unsafePerformIO $ do parsePl <- gu_new_pool exprPl <- gu_new_pool exn <- gu_new_exn parsePl cat <- newUtf8CString cat parsePl sent <- newUtf8CString sent parsePl predictPtr <- maybe (return nullFunPtr) (wrapOracleCallback . oracleWrapper) predict completePtr <- maybe (return nullFunPtr) (wrapOracleCallback . oracleWrapper) complete literalPtr <- maybe (return nullFunPtr) (wrapOracleLiteralCallback . oracleLiteralWrapper) literal cback <- hspgf_new_oracle_callback sent predictPtr completePtr literalPtr parsePl enum <- pgf_parse_with_oracle (concr lang) cat sent cback exn parsePl exprPl failed <- gu_exn_is_raised exn if failed then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError if is_parse_error then do c_err <- (#peek GuExn, data.data) exn c_incomplete <- (#peek PgfParseError, incomplete) c_err if (c_incomplete :: CInt) == 0 then do c_offset <- (#peek PgfParseError, offset) c_err token_ptr <- (#peek PgfParseError, token_ptr) c_err token_len <- (#peek PgfParseError, token_len) c_err tok <- peekUtf8CStringLen token_ptr token_len gu_pool_free parsePl gu_pool_free exprPl return (ParseFailed (fromIntegral (c_offset :: CInt)) tok) else do gu_pool_free parsePl gu_pool_free exprPl return ParseIncomplete else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg gu_pool_free parsePl gu_pool_free exprPl throwIO (PGFError msg) else do gu_pool_free parsePl gu_pool_free exprPl throwIO (PGFError "Parsing failed") else do parseFPl <- newForeignPtr gu_pool_finalizer parsePl exprFPl <- newForeignPtr gu_pool_finalizer exprPl exprs <- fromPgfExprEnum enum parseFPl (touchConcr lang >> touchForeignPtr exprFPl) return (ParseOk exprs) where oracleWrapper oracle catPtr lblPtr offset = do cat <- peekUtf8CString catPtr lbl <- peekUtf8CString lblPtr return (oracle cat lbl (fromIntegral offset)) oracleLiteralWrapper oracle catPtr lblPtr poffset out_pool = do cat <- peekUtf8CString catPtr lbl <- peekUtf8CString lblPtr offset <- peek poffset case oracle cat lbl (fromIntegral offset) of Just (e,prob,offset) -> do poke poffset (fromIntegral offset) -- here we copy the expression to out_pool c_e <- withGuPool $ \tmpPl -> do exn <- gu_new_exn tmpPl (sb,out) <- newOut tmpPl let printCtxt = nullPtr pgf_print_expr (expr e) printCtxt 1 out exn c_str <- gu_string_buf_freeze sb tmpPl guin <- gu_string_in c_str tmpPl pgf_read_expr guin out_pool tmpPl exn ep <- gu_malloc out_pool (#size PgfExprProb) (#poke PgfExprProb, expr) ep c_e (#poke PgfExprProb, prob) ep prob return ep Nothing -> do return nullPtr -- | Returns True if there is a linearization defined for that function in that language hasLinearization :: Concr -> Fun -> Bool hasLinearization lang id = unsafePerformIO $ withGuPool $ \pl -> do res <- newUtf8CString id pl >>= pgf_has_linearization (concr lang) return (res /= 0) -- | Linearizes an expression as a string in the language linearize :: Concr -> Expr -> String linearize lang e = unsafePerformIO $ withGuPool $ \pl -> do (sb,out) <- newOut pl exn <- gu_new_exn pl pgf_linearize (concr lang) (expr e) out exn touchExpr e failed <- gu_exn_is_raised exn if failed then do is_nonexist <- gu_exn_caught exn gu_exn_type_PgfLinNonExist if is_nonexist then return "" else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg throwIO (PGFError msg) else throwIO (PGFError "The abstract tree cannot be linearized") else do lin <- gu_string_buf_freeze sb pl peekUtf8CString lin -- | Generates all possible linearizations of an expression linearizeAll :: Concr -> Expr -> [String] linearizeAll lang e = unsafePerformIO $ do pl <- gu_new_pool exn <- gu_new_exn pl cts <- pgf_lzr_concretize (concr lang) (expr e) exn pl failed <- gu_exn_is_raised exn if failed then throwExn exn pl else collect cts exn pl where collect cts exn pl = withGuPool $ \tmpPl -> do ctree <- alloca $ \ptr -> do gu_enum_next cts ptr tmpPl peek ptr if ctree == nullPtr then do gu_pool_free pl touchExpr e return [] else do (sb,out) <- newOut tmpPl ctree <- pgf_lzr_wrap_linref ctree tmpPl pgf_lzr_linearize_simple (concr lang) ctree 0 out exn tmpPl failed <- gu_exn_is_raised exn if failed then do is_nonexist <- gu_exn_caught exn gu_exn_type_PgfLinNonExist if is_nonexist then collect cts exn pl else throwExn exn pl else do lin <- gu_string_buf_freeze sb tmpPl s <- peekUtf8CString lin ss <- collect cts exn pl return (s:ss) throwExn exn pl = do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg gu_pool_free pl throwIO (PGFError msg) else do gu_pool_free pl throwIO (PGFError "The abstract tree cannot be linearized") -- | Generates a table of linearizations for an expression tabularLinearize :: Concr -> Expr -> [(String, String)] tabularLinearize lang e = case tabularLinearizeAll lang e of (lins:_) -> lins _ -> [] -- | Generates a table of linearizations for an expression tabularLinearizeAll :: Concr -> Expr -> [[(String, String)]] tabularLinearizeAll lang e = unsafePerformIO $ withGuPool $ \tmpPl -> do exn <- gu_new_exn tmpPl cts <- pgf_lzr_concretize (concr lang) (expr e) exn tmpPl failed <- gu_exn_is_raised exn if failed then throwExn exn else collect cts exn tmpPl where collect cts exn tmpPl = do ctree <- alloca $ \ptr -> do gu_enum_next cts ptr tmpPl peek ptr if ctree == nullPtr then do touchExpr e return [] else do labels <- alloca $ \p_n_lins -> alloca $ \p_labels -> do pgf_lzr_get_table (concr lang) ctree p_n_lins p_labels n_lins <- peek p_n_lins labels <- peek p_labels labels <- peekArray (fromIntegral n_lins) labels labels <- mapM peekCString labels return labels lins <- collectTable lang ctree 0 labels exn tmpPl linss <- collect cts exn tmpPl return (lins : linss) collectTable lang ctree lin_idx [] exn tmpPl = return [] collectTable lang ctree lin_idx (label:labels) exn tmpPl = do (sb,out) <- newOut tmpPl pgf_lzr_linearize_simple (concr lang) ctree lin_idx out exn tmpPl failed <- gu_exn_is_raised exn if failed then do is_nonexist <- gu_exn_caught exn gu_exn_type_PgfLinNonExist if is_nonexist then collectTable lang ctree (lin_idx+1) labels exn tmpPl else throwExn exn else do lin <- gu_string_buf_freeze sb tmpPl s <- peekUtf8CString lin ss <- collectTable lang ctree (lin_idx+1) labels exn tmpPl return ((label,s):ss) throwExn exn = do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg throwIO (PGFError msg) else do throwIO (PGFError "The abstract tree cannot be linearized") type FId = Int type LIndex = Int -- | BracketedString represents a sentence that is linearized -- as usual but we also want to retain the ''brackets'' that -- mark the beginning and the end of each constituent. data BracketedString = Leaf String -- ^ this is the leaf i.e. a single token | Bracket Cat {-# UNPACK #-} !FId {-# UNPACK #-} !LIndex Fun [BracketedString] -- ^ this is a bracket. The 'Cat' is the category of -- the phrase. The 'FId' is an unique identifier for -- every phrase in the sentence. For context-free grammars -- i.e. without discontinuous constituents this identifier -- is also unique for every bracket. When there are discontinuous -- phrases then the identifiers are unique for every phrase but -- not for every bracket since the bracket represents a constituent. -- The different constituents could still be distinguished by using -- the constituent index i.e. 'LIndex'. If the grammar is reduplicating -- then the constituent indices will be the same for all brackets -- that represents the same constituent. -- The 'Fun' is the name of the abstract function that generated -- this phrase. -- | Renders the bracketed string as a string where -- the brackets are shown as @(S ...)@ where -- @S@ is the category. showBracketedString :: BracketedString -> String showBracketedString = render . ppBracketedString ppBracketedString (Leaf t) = text t ppBracketedString (Bracket cat fid index _ bss) = parens (text cat <> colon <> int fid <+> hsep (map ppBracketedString bss)) -- | Extracts the sequence of tokens from the bracketed string flattenBracketedString :: BracketedString -> [String] flattenBracketedString (Leaf w) = [w] flattenBracketedString (Bracket _ _ _ _ bss) = concatMap flattenBracketedString bss bracketedLinearize :: Concr -> Expr -> [BracketedString] bracketedLinearize lang e = unsafePerformIO $ withGuPool $ \pl -> do exn <- gu_new_exn pl cts <- pgf_lzr_concretize (concr lang) (expr e) exn pl failed <- gu_exn_is_raised exn if failed then throwExn exn else do ctree <- alloca $ \ptr -> do gu_enum_next cts ptr pl peek ptr if ctree == nullPtr then do touchExpr e return [] else do ctree <- pgf_lzr_wrap_linref ctree pl ref <- newIORef ([],[]) allocaBytes (#size PgfLinFuncs) $ \pLinFuncs -> alloca $ \ppLinFuncs -> do fptr_symbol_token <- wrapSymbolTokenCallback (symbol_token ref) fptr_begin_phrase <- wrapPhraseCallback (begin_phrase ref) fptr_end_phrase <- wrapPhraseCallback (end_phrase ref) fptr_symbol_ne <- wrapSymbolNonExistCallback (symbol_ne exn) fptr_symbol_meta <- wrapSymbolMetaCallback (symbol_meta ref) (#poke PgfLinFuncs, symbol_token) pLinFuncs fptr_symbol_token (#poke PgfLinFuncs, begin_phrase) pLinFuncs fptr_begin_phrase (#poke PgfLinFuncs, end_phrase) pLinFuncs fptr_end_phrase (#poke PgfLinFuncs, symbol_ne) pLinFuncs fptr_symbol_ne (#poke PgfLinFuncs, symbol_bind) pLinFuncs nullPtr (#poke PgfLinFuncs, symbol_capit) pLinFuncs nullPtr (#poke PgfLinFuncs, symbol_meta) pLinFuncs fptr_symbol_meta poke ppLinFuncs pLinFuncs pgf_lzr_linearize (concr lang) ctree 0 ppLinFuncs pl freeHaskellFunPtr fptr_symbol_token freeHaskellFunPtr fptr_begin_phrase freeHaskellFunPtr fptr_end_phrase freeHaskellFunPtr fptr_symbol_ne freeHaskellFunPtr fptr_symbol_meta failed <- gu_exn_is_raised exn if failed then do is_nonexist <- gu_exn_caught exn gu_exn_type_PgfLinNonExist if is_nonexist then return [] else throwExn exn else do (_,bs) <- readIORef ref return (reverse bs) where symbol_token ref _ c_token = do (stack,bs) <- readIORef ref token <- peekUtf8CString c_token writeIORef ref (stack,Leaf token : bs) begin_phrase ref _ c_cat c_fid c_lindex c_fun = do (stack,bs) <- readIORef ref writeIORef ref (bs:stack,[]) end_phrase ref _ c_cat c_fid c_lindex c_fun = do (bs':stack,bs) <- readIORef ref cat <- peekUtf8CString c_cat let fid = fromIntegral c_fid let lindex = fromIntegral c_lindex fun <- peekUtf8CString c_fun writeIORef ref (stack, Bracket cat fid lindex fun (reverse bs) : bs') symbol_ne exn _ = do gu_exn_raise exn gu_exn_type_PgfLinNonExist return () symbol_meta ref _ meta_id = do (stack,bs) <- readIORef ref writeIORef ref (stack,Leaf "?" : bs) throwExn exn = do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg throwIO (PGFError msg) else do throwIO (PGFError "The abstract tree cannot be linearized") alignWords :: Concr -> Expr -> [(String, [Int])] alignWords lang e = unsafePerformIO $ withGuPool $ \pl -> do exn <- gu_new_exn pl seq <- pgf_align_words (concr lang) (expr e) exn pl touchConcr lang touchExpr e failed <- gu_exn_is_raised exn if failed then do is_nonexist <- gu_exn_caught exn gu_exn_type_PgfLinNonExist if is_nonexist then return [] else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn msg <- peekUtf8CString c_msg throwIO (PGFError msg) else throwIO (PGFError "The abstract tree cannot be linearized") else do len <- (#peek GuSeq, len) seq arr <- peekArray (fromIntegral (len :: CInt)) (seq `plusPtr` (#offset GuSeq, data)) mapM peekAlignmentPhrase arr where peekAlignmentPhrase :: Ptr () -> IO (String, [Int]) peekAlignmentPhrase ptr = do c_phrase <- (#peek PgfAlignmentPhrase, phrase) ptr phrase <- peekUtf8CString c_phrase n_fids <- (#peek PgfAlignmentPhrase, n_fids) ptr (fids :: [CInt]) <- peekArray (fromIntegral (n_fids :: CInt)) (ptr `plusPtr` (#offset PgfAlignmentPhrase, fids)) return (phrase, map fromIntegral fids) printName :: Concr -> Fun -> Maybe String printName lang fun = unsafePerformIO $ withGuPool $ \tmpPl -> do c_fun <- newUtf8CString fun tmpPl c_name <- pgf_print_name (concr lang) c_fun name <- if c_name == nullPtr then return Nothing else fmap Just (peekUtf8CString c_name) touchConcr lang return name -- | List of all functions defined in the abstract syntax functions :: PGF -> [Fun] functions p = unsafePerformIO $ withGuPool $ \tmpPl -> allocaBytes (#size GuMapItor) $ \itor -> do exn <- gu_new_exn tmpPl ref <- newIORef [] fptr <- wrapMapItorCallback (getFunctions ref) (#poke GuMapItor, fn) itor fptr pgf_iter_functions (pgf p) itor exn touchPGF p freeHaskellFunPtr fptr fs <- readIORef ref return (reverse fs) where getFunctions :: IORef [String] -> MapItorCallback getFunctions ref itor key value exn = do names <- readIORef ref name <- peekUtf8CString (castPtr key) writeIORef ref $! (name : names) -- | List of all functions defined for a category functionsByCat :: PGF -> Cat -> [Fun] functionsByCat p cat = unsafePerformIO $ withGuPool $ \tmpPl -> allocaBytes (#size GuMapItor) $ \itor -> do exn <- gu_new_exn tmpPl ref <- newIORef [] fptr <- wrapMapItorCallback (getFunctions ref) (#poke GuMapItor, fn) itor fptr ccat <- newUtf8CString cat tmpPl pgf_iter_functions_by_cat (pgf p) ccat itor exn touchPGF p freeHaskellFunPtr fptr fs <- readIORef ref return (reverse fs) where getFunctions :: IORef [String] -> MapItorCallback getFunctions ref itor key value exn = do names <- readIORef ref name <- peekUtf8CString (castPtr key) writeIORef ref $! (name : names) -- | List of all categories defined in the grammar. -- The categories are defined in the abstract syntax -- with the \'cat\' keyword. categories :: PGF -> [Cat] categories p = unsafePerformIO $ withGuPool $ \tmpPl -> allocaBytes (#size GuMapItor) $ \itor -> do exn <- gu_new_exn tmpPl ref <- newIORef [] fptr <- wrapMapItorCallback (getCategories ref) (#poke GuMapItor, fn) itor fptr pgf_iter_categories (pgf p) itor exn touchPGF p freeHaskellFunPtr fptr cs <- readIORef ref return (reverse cs) where getCategories :: IORef [String] -> MapItorCallback getCategories ref itor key value exn = do names <- readIORef ref name <- peekUtf8CString (castPtr key) writeIORef ref $! (name : names) categoryContext :: PGF -> Cat -> Maybe [Hypo] categoryContext p cat = unsafePerformIO $ withGuPool $ \tmpPl -> do c_cat <- newUtf8CString cat tmpPl c_hypos <- pgf_category_context (pgf p) c_cat if c_hypos == nullPtr then return Nothing else do n_hypos <- (#peek GuSeq, len) c_hypos hypos <- peekHypos (c_hypos `plusPtr` (#offset GuSeq, data)) 0 n_hypos return (Just hypos) where peekHypos :: Ptr a -> Int -> Int -> IO [Hypo] peekHypos c_hypo i n | i < n = do cid <- (#peek PgfHypo, cid) c_hypo >>= peekUtf8CString c_ty <- (#peek PgfHypo, type) c_hypo bt <- fmap toBindType ((#peek PgfHypo, bind_type) c_hypo) hs <- peekHypos (plusPtr c_hypo (#size PgfHypo)) (i+1) n return ((bt,cid,Type c_ty (touchPGF p)) : hs) | otherwise = return [] toBindType :: CInt -> BindType toBindType (#const PGF_BIND_TYPE_EXPLICIT) = Explicit toBindType (#const PGF_BIND_TYPE_IMPLICIT) = Implicit categoryProbability :: PGF -> Cat -> Float categoryProbability p cat = unsafePerformIO $ withGuPool $ \tmpPl -> do c_cat <- newUtf8CString cat tmpPl c_prob <- pgf_category_prob (pgf p) c_cat touchPGF p return (realToFrac c_prob) ----------------------------------------------------------------------------- -- Helper functions fromPgfExprEnum :: Ptr GuEnum -> ForeignPtr GuPool -> Touch -> IO [(Expr, Float)] fromPgfExprEnum enum fpl touch = do pgfExprProb <- alloca $ \ptr -> withForeignPtr fpl $ \pl -> do gu_enum_next enum ptr pl peek ptr if pgfExprProb == nullPtr then do finalizeForeignPtr fpl return [] else do expr <- (#peek PgfExprProb, expr) pgfExprProb ts <- unsafeInterleaveIO (fromPgfExprEnum enum fpl touch) prob <- (#peek PgfExprProb, prob) pgfExprProb return ((Expr expr touch,prob) : ts) fromPgfTokenEnum :: Ptr GuEnum -> ForeignPtr GuPool -> IO [(String, Cat, Fun, Float)] fromPgfTokenEnum enum fpl = do pgfTokenProb <- alloca $ \ptr -> withForeignPtr fpl $ \pl -> do gu_enum_next enum ptr pl peek ptr if pgfTokenProb == nullPtr then do finalizeForeignPtr fpl return [] else do tok <- (#peek PgfTokenProb, tok) pgfTokenProb >>= peekUtf8CString cat <- (#peek PgfTokenProb, cat) pgfTokenProb >>= peekUtf8CString fun <- (#peek PgfTokenProb, fun) pgfTokenProb >>= peekUtf8CString prob <- (#peek PgfTokenProb, prob) pgfTokenProb ts <- unsafeInterleaveIO (fromPgfTokenEnum enum fpl) return ((tok,cat,fun,prob) : ts) ----------------------------------------------------------------------- -- Exceptions newtype PGFError = PGFError String deriving (Show, Typeable) instance Exception PGFError ----------------------------------------------------------------------- type LiteralCallback = PGF -> (ConcName,Concr) -> String -> Int -> Int -> Maybe (Expr,Float,Int) -- | Callbacks for the App grammar literalCallbacks :: [(AbsName,[(Cat,LiteralCallback)])] literalCallbacks = [("App",[("PN",nerc),("Symb",chunk)])] -- | Named entity recognition for the App grammar -- (based on ../java/org/grammaticalframework/pgf/NercLiteralCallback.java) nerc :: LiteralCallback nerc pgf (lang,concr) sentence lin_idx offset = case consume capitalized (drop offset sentence) of (capwords@(_:_),rest) | not ("Eng" `isSuffixOf` lang && name `elem` ["I","I'm"]) -> if null ls then pn else case cat of "PN" -> retLit (mkApp lemma []) "WeekDay" -> retLit (mkApp "weekdayPN" [mkApp lemma []]) "Month" -> retLit (mkApp "monthPN" [mkApp lemma []]) _ -> Nothing where retLit e = Just (e,0,end_offset) where end_offset = offset+length name pn = retLit (mkApp "SymbPN" [mkApp "MkSymb" [mkStr name]]) ((lemma,cat),_) = maximumBy (compare `on` snd) (reverse ls) ls = [((fun,cat),p) |(fun,_,p)<-lookupMorpho concr name, Just cat <- [functionCat fun], cat/="Nationality"] name = trimRight (concat capwords) _ -> Nothing where -- | Variant of unfoldr consume munch xs = case munch xs of Nothing -> ([],xs) Just (y,xs') -> (y:ys,xs'') where (ys,xs'') = consume munch xs' functionCat f = fmap ((\(_,c,_) -> c) . unType) (functionType pgf f) -- | Callback to parse arbitrary words as chunks (from -- ../java/org/grammaticalframework/pgf/UnknownLiteralCallback.java) chunk :: LiteralCallback chunk _ (_,concr) sentence lin_idx offset = case uncapitalized (drop offset sentence) of Just (word0@(_:_),rest) | null (lookupMorpho concr word) -> Just (expr,0,offset+length word) where word = trimRight word0 expr = mkApp "MkSymb" [mkStr word] _ -> Nothing -- More helper functions trimRight = reverse . dropWhile isSpace . reverse capitalized = capitalized' isUpper uncapitalized = capitalized' (not.isUpper) capitalized' test s@(c:_) | test c = case span (not.isSpace) s of (name,rest1) -> case span isSpace rest1 of (space,rest2) -> Just (name++space,rest2) capitalized' not s = Nothing tag i | i < 0 = char 'r' <> int (negate i) | otherwise = char 'n' <> int i