mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-05-04 08:42:50 -06:00
the compiler now compiles with the new runtime
This commit is contained in:
@@ -14,17 +14,20 @@
|
||||
-------------------------------------------------
|
||||
|
||||
module PGF2 (-- * PGF
|
||||
PGF,readPGF,bootNGF,readNGF,
|
||||
PGF,readPGF,bootNGF,readNGF,writePGF,showPGF,
|
||||
|
||||
-- * Abstract syntax
|
||||
AbsName,abstractName,globalFlag,abstractFlag,
|
||||
|
||||
-- ** Categories
|
||||
Cat,categories,categoryContext,categoryProb,
|
||||
Cat,categories,categoryContext,categoryProbability,
|
||||
|
||||
-- ** Functions
|
||||
Fun, functions, functionsByCat,
|
||||
functionType, functionIsConstructor, functionProb,
|
||||
functionType, functionIsConstructor, functionProbability,
|
||||
|
||||
-- ** Expressions
|
||||
Expr(..), Literal(..), showExpr, readExpr,
|
||||
Expr(..), Literal(..), showExpr, readExpr, pExpr, pIdent,
|
||||
mkAbs, unAbs,
|
||||
mkApp, unApp, unapply,
|
||||
mkStr, unStr,
|
||||
@@ -33,20 +36,58 @@ module PGF2 (-- * PGF
|
||||
mkFloat, unFloat,
|
||||
mkMeta, unMeta,
|
||||
-- extra
|
||||
exprSize, exprFunctions,
|
||||
exprSize, exprFunctions, exprSubstitute, exprProbability,
|
||||
|
||||
-- ** Types
|
||||
Type(..), Hypo, BindType(..), startCat,
|
||||
readType, showType,
|
||||
readType, showType, showContext,
|
||||
mkType, unType,
|
||||
mkHypo, mkDepHypo, mkImplHypo,
|
||||
|
||||
-- ** Type checking
|
||||
-- | Dynamically-built expressions should always be type-checked before using in other functions,
|
||||
-- as the exceptions thrown by using invalid expressions may not catchable.
|
||||
checkExpr, inferExpr, checkType,
|
||||
|
||||
-- ** Computing
|
||||
compute,
|
||||
|
||||
-- ** Generation
|
||||
generateAll, generateAllFrom, generateRandom, generateRandomFrom,
|
||||
|
||||
-- ** Morphological Analysis
|
||||
MorphoAnalysis, lookupMorpho, lookupCohorts, fullFormLexicon,
|
||||
filterBest, filterLongest,
|
||||
|
||||
-- ** Visualizations
|
||||
GraphvizOptions(..), graphvizDefaults,
|
||||
graphvizAbstractTree, graphvizParseTree,
|
||||
Labels, getDepLabels,
|
||||
graphvizDependencyTree, conlls2latexDoc, getCncDepLabels,
|
||||
graphvizWordAlignment,
|
||||
|
||||
-- * Concrete syntax
|
||||
ConcName,
|
||||
ConcName,Concr,languages,concreteName,languageCode,
|
||||
|
||||
-- ** Linearization
|
||||
linearize, linearizeAll, tabularLinearize, tabularLinearizeAll,
|
||||
FId, BracketedString(..), showBracketedString, flattenBracketedString,
|
||||
bracketedLinearize, bracketedLinearizeAll,
|
||||
hasLinearization,
|
||||
printName, alignWords, gizaAlignment,
|
||||
|
||||
-- ** Parsing
|
||||
ParseOutput(..), parse, parseWithHeuristics, complete,
|
||||
|
||||
-- * Exceptions
|
||||
PGFError(..)
|
||||
PGFError(..),
|
||||
|
||||
-- * Auxiliaries
|
||||
readProbabilitiesFromFile
|
||||
) where
|
||||
|
||||
import Prelude hiding ((<>))
|
||||
|
||||
import PGF2.Expr
|
||||
import PGF2.FFI
|
||||
|
||||
@@ -54,15 +95,16 @@ import Foreign
|
||||
import Foreign.C
|
||||
import Control.Exception(mask_,bracket)
|
||||
import System.IO.Unsafe(unsafePerformIO)
|
||||
import System.Random
|
||||
import qualified Foreign.Concurrent as C
|
||||
import qualified Data.Map as Map
|
||||
import Data.IORef
|
||||
import Data.List(intersperse,groupBy)
|
||||
import Data.Char(isUpper,isSpace,isPunctuation)
|
||||
import Text.PrettyPrint
|
||||
|
||||
#include <pgf/pgf.h>
|
||||
|
||||
type AbsName = String -- ^ Name of abstract syntax
|
||||
type ConcName = String -- ^ Name of concrete syntax
|
||||
|
||||
-- | Reads a PGF file and keeps it in memory.
|
||||
readPGF :: FilePath -> IO PGF
|
||||
readPGF fpath =
|
||||
@@ -106,6 +148,12 @@ readNGF fpath =
|
||||
fptr2 <- C.newForeignPtr c_revision (withForeignPtr fptr1 (\c_db -> pgf_free_revision c_db c_revision))
|
||||
return (PGF fptr1 fptr2 Map.empty)
|
||||
|
||||
writePGF :: FilePath -> PGF -> IO ()
|
||||
writePGF = error "TODO: writePGF"
|
||||
|
||||
showPGF :: PGF -> String
|
||||
showPGF = error "TODO: showPGF"
|
||||
|
||||
-- | The abstract language name is the name of the top-level
|
||||
-- abstract module
|
||||
abstractName :: PGF -> AbsName
|
||||
@@ -156,14 +204,273 @@ functionIsConstructor p fun =
|
||||
do res <- withPgfExn (pgf_function_is_constructor c_db c_revision c_fun)
|
||||
return (res /= 0)
|
||||
|
||||
functionProb :: PGF -> Fun -> Float
|
||||
functionProb p fun =
|
||||
functionProbability :: PGF -> Fun -> Float
|
||||
functionProbability p fun =
|
||||
unsafePerformIO $
|
||||
withText fun $ \c_fun ->
|
||||
withForeignPtr (a_db p) $ \c_db ->
|
||||
withForeignPtr (revision p) $ \c_revision ->
|
||||
withPgfExn (pgf_function_prob c_db c_revision c_fun)
|
||||
|
||||
exprProbability :: PGF -> Expr -> Float
|
||||
exprProbability = error "TODO: exprProbability"
|
||||
|
||||
checkExpr :: PGF -> Expr -> Type -> Either String Expr
|
||||
checkExpr = error "TODO: checkExpr"
|
||||
|
||||
-- | Tries to infer the type of an expression. Note that
|
||||
-- even if the expression is type correct it is not always
|
||||
-- possible to infer its type in the GF type system.
|
||||
-- In this case the function returns an error.
|
||||
inferExpr :: PGF -> Expr -> Either String (Expr, Type)
|
||||
inferExpr = error "TODO: inferExpr"
|
||||
|
||||
-- | Check whether a type is consistent with the abstract
|
||||
-- syntax of the grammar.
|
||||
checkType :: PGF -> Type -> Either String Type
|
||||
checkType = error "TODO: checkType"
|
||||
|
||||
compute :: PGF -> Expr -> Expr
|
||||
compute = error "TODO: compute"
|
||||
|
||||
concreteName :: Concr -> ConcName
|
||||
concreteName c = error "TODO: concreteName"
|
||||
|
||||
languageCode :: Concr -> Maybe String
|
||||
languageCode c = error "TODO: languageCode"
|
||||
|
||||
printName :: Concr -> Fun -> Maybe String
|
||||
printName lang fun = error "TODO: printName"
|
||||
|
||||
alignWords :: Concr -> Expr -> [(String, [Int])]
|
||||
alignWords = error "TODO: alignWords"
|
||||
|
||||
gizaAlignment = error "TODO: gizaAlignment"
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
-- Functions using Concr
|
||||
-- Morpho analyses, parsing & linearization
|
||||
|
||||
-- | This triple is returned by all functions that deal with
|
||||
-- the grammar's lexicon. Its first element is the name of an abstract
|
||||
-- lexical function which can produce a given word or
|
||||
-- a multiword expression (i.e. this is the lemma).
|
||||
-- After that follows a string which describes
|
||||
-- the particular inflection form.
|
||||
--
|
||||
-- The last element is a logarithm from the
|
||||
-- the probability of the function. The probability is not
|
||||
-- conditionalized on the category of the function. This makes it
|
||||
-- possible to compare the likelihood of two functions even if they
|
||||
-- have different types.
|
||||
type MorphoAnalysis = (Fun,String,Float)
|
||||
|
||||
-- | 'lookupMorpho' takes a string which must be a single word or
|
||||
-- a multiword expression. It then computes the list of all possible
|
||||
-- morphological analyses.
|
||||
lookupMorpho :: Concr -> String -> [MorphoAnalysis]
|
||||
lookupMorpho = error "TODO: lookupMorpho"
|
||||
|
||||
-- | 'lookupCohorts' takes an arbitrary string an produces
|
||||
-- a list of all places where lexical items from the grammar have been
|
||||
-- identified (i.e. cohorts). The list consists of triples of the format @(start,ans,end)@,
|
||||
-- where @start-end@ identifies the span in the text and @ans@ is
|
||||
-- the list of possible morphological analyses similar to 'lookupMorpho'.
|
||||
--
|
||||
-- The list is sorted first by the @start@ position and after than
|
||||
-- by the @end@ position. This can be used for instance if you want to
|
||||
-- filter only the longest matches.
|
||||
lookupCohorts :: Concr -> String -> [(Int,String,[MorphoAnalysis],Int)]
|
||||
lookupCohorts = error "TODO: lookupCohorts"
|
||||
|
||||
filterBest :: [(Int,String,[MorphoAnalysis],Int)] -> [(Int,String,[MorphoAnalysis],Int)]
|
||||
filterBest ans =
|
||||
reverse (iterate (maxBound :: Int) [(0,0,[],ans)] [] [])
|
||||
where
|
||||
iterate v0 [] [] res = res
|
||||
iterate v0 [] new res = iterate v0 new [] res
|
||||
iterate v0 ((_,v,conf, []):old) new res =
|
||||
case compare v0 v of
|
||||
LT -> res
|
||||
EQ -> iterate v0 old new (merge conf res)
|
||||
GT -> iterate v old new conf
|
||||
iterate v0 ((_,v,conf,an:ans):old) new res = iterate v0 old (insert (v+valueOf an) conf an ans [] new) res
|
||||
|
||||
valueOf (_,_,[],_) = 2
|
||||
valueOf _ = 1
|
||||
|
||||
insert v conf an@(start,_,_,end) ans l_new [] =
|
||||
match start v conf ans ((end,v,comb conf an,filter end ans):l_new) []
|
||||
insert v conf an@(start,_,_,end) ans l_new (new@(end0,v0,conf0,ans0):r_new) =
|
||||
case compare end0 end of
|
||||
LT -> insert v conf an ans (new:l_new) r_new
|
||||
EQ -> case compare v0 v of
|
||||
LT -> match start v conf ans ((end,v, conf0,ans0): l_new) r_new
|
||||
EQ -> match start v conf ans ((end,v,merge (comb conf an) conf0,ans0): l_new) r_new
|
||||
GT -> match start v conf ans ((end,v,comb conf an, ans0): l_new) r_new
|
||||
GT -> match start v conf ans ((end,v,comb conf an, filter end ans):new:l_new) r_new
|
||||
|
||||
match start0 v conf (an@(start,_,_,end):ans) l_new r_new
|
||||
| start0 == start = insert v conf an ans l_new r_new
|
||||
match start0 v conf ans l_new r_new = revOn l_new r_new
|
||||
|
||||
comb ((start0,w0,an0,end0):conf) (start,w,an,end)
|
||||
| end0 == start && (unk w0 an0 || unk w an) = (start0,w0++w,[],end):conf
|
||||
comb conf an = an:conf
|
||||
|
||||
filter end [] = []
|
||||
filter end (next@(start,_,_,_):ans)
|
||||
| end <= start = next:ans
|
||||
| otherwise = filter end ans
|
||||
|
||||
revOn [] ys = ys
|
||||
revOn (x:xs) ys = revOn xs (x:ys)
|
||||
|
||||
merge [] ans = ans
|
||||
merge ans [] = ans
|
||||
merge (an1@(start1,_,_,end1):ans1) (an2@(start2,_,_,end2):ans2) =
|
||||
case compare (start1,end1) (start2,end2) of
|
||||
GT -> an1 : merge ans1 (an2:ans2)
|
||||
EQ -> an1 : merge ans1 ans2
|
||||
LT -> an2 : merge (an1:ans1) ans2
|
||||
|
||||
filterLongest :: [(Int,String,[MorphoAnalysis],Int)] -> [(Int,String,[MorphoAnalysis],Int)]
|
||||
filterLongest [] = []
|
||||
filterLongest (an:ans) = longest an ans
|
||||
where
|
||||
longest prev [] = [prev]
|
||||
longest prev@(start0,_,_,end0) (next@(start,_,_,end):ans)
|
||||
| start0 == start = longest next ans
|
||||
| otherwise = filter prev (next:ans)
|
||||
|
||||
filter prev [] = [prev]
|
||||
filter prev@(start0,w0,an0,end0) (next@(start,w,an,end):ans)
|
||||
| end0 == start && (unk w0 an0 || unk w an)
|
||||
= filter (start0,w0++w,[],end) ans
|
||||
| end0 <= start = prev : longest next ans
|
||||
| otherwise = filter prev ans
|
||||
|
||||
unk w [] | any (not . isPunctuation) w = True
|
||||
unk _ _ = False
|
||||
|
||||
fullFormLexicon :: Concr -> [(String, [MorphoAnalysis])]
|
||||
fullFormLexicon lang = error "TODO: fullFormLexicon"
|
||||
|
||||
-- | This data type encodes the different outcomes which you could get from the parser.
|
||||
data ParseOutput a
|
||||
= ParseFailed Int String -- ^ The integer is the position in number of unicode characters where the parser failed.
|
||||
-- The string is the token where the parser have failed.
|
||||
| ParseOk a -- ^ If the parsing and the type checking are successful
|
||||
-- we get the abstract syntax trees as either a list or a chart.
|
||||
| ParseIncomplete -- ^ The sentence is not complete.
|
||||
|
||||
parse :: Concr -> Type -> String -> ParseOutput [(Expr,Float)]
|
||||
parse lang ty sent = parseWithHeuristics lang ty sent (-1.0) []
|
||||
|
||||
parseWithHeuristics :: Concr -- ^ the language with which we parse
|
||||
-> Type -- ^ the start category
|
||||
-> String -- ^ the input sentence
|
||||
-> Double -- ^ the heuristic factor.
|
||||
-- A negative value tells the parser
|
||||
-- to lookup up the default from
|
||||
-- the grammar flags
|
||||
-> [(Cat, String -> Int -> Maybe (Expr,Float,Int))]
|
||||
-- ^ a list of callbacks for literal categories.
|
||||
-- The arguments of the callback are:
|
||||
-- the index of the constituent for the literal category;
|
||||
-- the input sentence; the current offset in the sentence.
|
||||
-- If a literal has been recognized then the output should
|
||||
-- be Just (expr,probability,end_offset)
|
||||
-> ParseOutput [(Expr,Float)]
|
||||
parseWithHeuristics = error "TODO: parseWithHeuristics"
|
||||
|
||||
-- | Returns possible completions of the current partial input.
|
||||
complete :: Concr -- ^ the language with which we parse
|
||||
-> Type -- ^ the start category
|
||||
-> String -- ^ the input sentence (excluding token being completed)
|
||||
-> String -- ^ prefix (partial token being completed)
|
||||
-> ParseOutput [(String, Fun, Cat, Float)] -- ^ (token, category, function, probability)
|
||||
complete = error "TODO: complete"
|
||||
|
||||
-- | Returns True if there is a linearization defined for that function in that language
|
||||
hasLinearization :: Concr -> Fun -> Bool
|
||||
hasLinearization = error "TODO: linearize"
|
||||
|
||||
-- | Linearizes an expression as a string in the language
|
||||
linearize :: Concr -> Expr -> String
|
||||
linearize lang e = error "TODO: linearize"
|
||||
|
||||
-- | Generates all possible linearizations of an expression
|
||||
linearizeAll :: Concr -> Expr -> [String]
|
||||
linearizeAll lang e = error "TODO: linearizeAll"
|
||||
|
||||
-- | Generates a table of linearizations for an expression
|
||||
tabularLinearize :: Concr -> Expr -> [(String, String)]
|
||||
tabularLinearize lang e =
|
||||
case tabularLinearizeAll lang e of
|
||||
(lins:_) -> lins
|
||||
_ -> []
|
||||
|
||||
-- | Generates a table of linearizations for an expression
|
||||
tabularLinearizeAll :: Concr -> Expr -> [[(String, String)]]
|
||||
tabularLinearizeAll lang e = error "TODO: tabularLinearizeAll"
|
||||
|
||||
type FId = Int
|
||||
|
||||
-- | BracketedString represents a sentence that is linearized
|
||||
-- as usual but we also want to retain the ''brackets'' that
|
||||
-- mark the beginning and the end of each constituent.
|
||||
data BracketedString
|
||||
= Leaf String -- ^ this is the leaf i.e. a single token
|
||||
| BIND -- ^ the surrounding tokens must be bound together
|
||||
| Bracket Cat {-# UNPACK #-} !FId String Fun [BracketedString]
|
||||
-- ^ this is a bracket. The 'Cat' is the category of
|
||||
-- the phrase. The 'FId' is an unique identifier for
|
||||
-- every phrase in the sentence. For context-free grammars
|
||||
-- i.e. without discontinuous constituents this identifier
|
||||
-- is also unique for every bracket. When there are discontinuous
|
||||
-- phrases then the identifiers are unique for every phrase but
|
||||
-- not for every bracket since the bracket represents a constituent.
|
||||
-- The different constituents could still be distinguished by using
|
||||
-- the analysis string. If the grammar is reduplicating
|
||||
-- then the constituent indices will be the same for all brackets
|
||||
-- that represents the same constituent.
|
||||
-- The 'Fun' is the name of the abstract function that generated
|
||||
-- this phrase.
|
||||
|
||||
-- | Renders the bracketed string as a string where
|
||||
-- the brackets are shown as @(S ...)@ where
|
||||
-- @S@ is the category.
|
||||
showBracketedString :: BracketedString -> String
|
||||
showBracketedString = render . ppBracketedString
|
||||
|
||||
ppBracketedString (Leaf t) = text t
|
||||
ppBracketedString BIND = text "&+"
|
||||
ppBracketedString (Bracket cat fid _ _ bss) = parens (text cat <> colon <> int fid <+> hsep (map ppBracketedString bss))
|
||||
|
||||
-- | Extracts the sequence of tokens from the bracketed string
|
||||
flattenBracketedString :: BracketedString -> [String]
|
||||
flattenBracketedString (Leaf w) = [w]
|
||||
flattenBracketedString (Bracket _ _ _ _ bss) = concatMap flattenBracketedString bss
|
||||
|
||||
bracketedLinearize :: Concr -> Expr -> [BracketedString]
|
||||
bracketedLinearize = error "TODO: bracketedLinearize"
|
||||
|
||||
bracketedLinearizeAll :: Concr -> Expr -> [[BracketedString]]
|
||||
bracketedLinearizeAll = error "TODO: bracketedLinearizeAll"
|
||||
|
||||
generateAll :: PGF -> Type -> [(Expr,Float)]
|
||||
generateAll p ty = error "TODO: generateAll"
|
||||
|
||||
generateAllFrom :: PGF -> Expr -> [(Expr,Float)]
|
||||
generateAllFrom p ty = error "TODO: generateAllFrom"
|
||||
|
||||
generateRandom :: StdGen -> PGF -> Type -> [a]
|
||||
generateRandom = error "TODO: generateRandom"
|
||||
|
||||
generateRandomFrom :: StdGen -> PGF -> Expr -> [a]
|
||||
generateRandomFrom = error "TODO: generateRandomFrom"
|
||||
|
||||
-- | List of all functions defined in the abstract syntax
|
||||
categories :: PGF -> [Cat]
|
||||
categories p =
|
||||
@@ -184,7 +491,7 @@ categories p =
|
||||
name <- peekText key
|
||||
writeIORef ref $ (name : names)
|
||||
|
||||
categoryContext :: PGF -> Cat -> [Hypo]
|
||||
categoryContext :: PGF -> Cat -> Maybe [Hypo]
|
||||
categoryContext p cat =
|
||||
unsafePerformIO $
|
||||
withText cat $ \c_cat ->
|
||||
@@ -195,11 +502,11 @@ categoryContext p cat =
|
||||
mask_ $ do
|
||||
c_hypos <- withPgfExn (pgf_category_context c_db c_revision c_cat p_n_hypos u)
|
||||
if c_hypos == nullPtr
|
||||
then return []
|
||||
then return Nothing
|
||||
else do n_hypos <- peek p_n_hypos
|
||||
hypos <- peekHypos c_hypos 0 n_hypos
|
||||
free c_hypos
|
||||
return hypos
|
||||
return (Just hypos)
|
||||
where
|
||||
peekHypos :: Ptr PgfTypeHypo -> CSize -> CSize -> IO [Hypo]
|
||||
peekHypos c_hypo i n
|
||||
@@ -214,8 +521,8 @@ categoryContext p cat =
|
||||
return ((bt,cat,ty) : hs)
|
||||
| otherwise = return []
|
||||
|
||||
categoryProb :: PGF -> Cat -> Float
|
||||
categoryProb p cat =
|
||||
categoryProbability :: PGF -> Cat -> Float
|
||||
categoryProbability p cat =
|
||||
unsafePerformIO $
|
||||
withText cat $ \c_cat ->
|
||||
withForeignPtr (a_db p) $ \c_db ->
|
||||
@@ -291,6 +598,256 @@ abstractFlag p name =
|
||||
freeStablePtr c_lit
|
||||
return (Just lit)
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
-- Graphviz
|
||||
|
||||
data GraphvizOptions = GraphvizOptions {noLeaves :: Bool,
|
||||
noFun :: Bool,
|
||||
noCat :: Bool,
|
||||
noDep :: Bool,
|
||||
nodeFont :: String,
|
||||
leafFont :: String,
|
||||
nodeColor :: String,
|
||||
leafColor :: String,
|
||||
nodeEdgeStyle :: String,
|
||||
leafEdgeStyle :: String
|
||||
}
|
||||
|
||||
graphvizDefaults = GraphvizOptions False False False True "" "" "" "" "" ""
|
||||
|
||||
-- | Renders an abstract syntax tree in a Graphviz format.
|
||||
graphvizAbstractTree :: PGF -> GraphvizOptions -> Expr -> String
|
||||
graphvizAbstractTree p opts e = error "TODO: graphvizAbstractTree"
|
||||
|
||||
graphvizParseTree :: Concr -> GraphvizOptions -> Expr -> String
|
||||
graphvizParseTree c opts e = error "TODO: graphvizParseTree"
|
||||
|
||||
graphvizWordAlignment :: [Concr] -> GraphvizOptions -> Expr -> String
|
||||
graphvizWordAlignment cs opts e = error "TODO: graphvizWordAlignment"
|
||||
|
||||
|
||||
type Labels = Map.Map Fun [String]
|
||||
|
||||
getDepLabels :: String -> Labels
|
||||
getDepLabels s = Map.fromList [(f,ls) | f:ls <- map words (lines s)]
|
||||
|
||||
-- | Visualize word dependency tree.
|
||||
graphvizDependencyTree
|
||||
:: String -- ^ Output format: @"latex"@, @"conll"@, @"malt_tab"@, @"malt_input"@ or @"dot"@
|
||||
-> Bool -- ^ Include extra information (debug)
|
||||
-> Maybe Labels -- ^ abstract label information obtained with 'getDepLabels'
|
||||
-> Maybe CncLabels -- ^ concrete label information obtained with ' ' (was: unused (was: @Maybe String@))
|
||||
-> Concr
|
||||
-> Expr
|
||||
-> String -- ^ Rendered output in the specified format
|
||||
graphvizDependencyTree format debug mlab mclab concr t = error "TODO: graphvizDependencyTree"
|
||||
|
||||
---------------------- should be a separate module?
|
||||
|
||||
-- visualization with latex output. AR Nov 2015
|
||||
|
||||
conlls2latexDoc :: [String] -> String
|
||||
conlls2latexDoc =
|
||||
render .
|
||||
latexDoc .
|
||||
vcat .
|
||||
intersperse (text "" $+$ app "vspace" (text "4mm")) .
|
||||
map conll2latex .
|
||||
filter (not . null)
|
||||
|
||||
conll2latex :: String -> Doc
|
||||
conll2latex = ppLaTeX . conll2latex' . parseCoNLL
|
||||
|
||||
conll2latex' :: CoNLL -> [LaTeX]
|
||||
conll2latex' = dep2latex . conll2dep'
|
||||
|
||||
data Dep = Dep {
|
||||
wordLength :: Int -> Double -- length of word at position int -- was: fixed width, millimetres (>= 20.0)
|
||||
, tokens :: [(String,String)] -- word, pos (0..)
|
||||
, deps :: [((Int,Int),String)] -- from, to, label
|
||||
, root :: Int -- root word position
|
||||
}
|
||||
|
||||
-- some general measures
|
||||
defaultWordLength = 20.0 -- the default fixed width word length, making word 100 units
|
||||
defaultUnit = 0.2 -- unit in latex pictures, 0.2 millimetres
|
||||
spaceLength = 10.0
|
||||
charWidth = 1.8
|
||||
|
||||
wsize rwld w = 100 * rwld w + spaceLength -- word length, units
|
||||
wpos rwld i = sum [wsize rwld j | j <- [0..i-1]] -- start position of the i'th word
|
||||
wdist rwld x y = sum [wsize rwld i | i <- [min x y .. max x y - 1]] -- distance between words x and y
|
||||
labelheight h = h + arcbase + 3 -- label just above arc; 25 would put it just below
|
||||
labelstart c = c - 15.0 -- label starts 15u left of arc centre
|
||||
arcbase = 30.0 -- arcs start and end 40u above the bottom
|
||||
arcfactor r = r * 600 -- reduction of arc size from word distance
|
||||
xyratio = 3 -- width/height ratio of arcs
|
||||
|
||||
putArc :: (Int -> Double) -> Int -> Int -> Int -> String -> [DrawingCommand]
|
||||
putArc frwld height x y label = [oval,arrowhead,labelling] where
|
||||
oval = Put (ctr,arcbase) (OvalTop (wdth,hght))
|
||||
arrowhead = Put (endp,arcbase + 5) (ArrowDown 5) -- downgoing arrow 5u above the arc base
|
||||
labelling = Put (labelstart ctr,labelheight (hght/2)) (TinyText label)
|
||||
dxy = wdist frwld x y -- distance between words, >>= 20.0
|
||||
ndxy = 100 * rwld * fromIntegral height -- distance that is indep of word length
|
||||
hdxy = dxy / 2 -- half the distance
|
||||
wdth = dxy - (arcfactor rwld)/dxy -- longer arcs are wider in proportion
|
||||
hght = ndxy / (xyratio * rwld) -- arc height is independent of word length
|
||||
begp = min x y -- begin position of oval
|
||||
ctr = wpos frwld begp + hdxy + (if x < y then 20 else 10) -- LR arcs are farther right from center of oval
|
||||
endp = (if x < y then (+) else (-)) ctr (wdth/2) -- the point of the arrow
|
||||
rwld = 0.5 ----
|
||||
|
||||
dep2latex :: Dep -> [LaTeX]
|
||||
dep2latex d =
|
||||
[Comment (unwords (map fst (tokens d))),
|
||||
Picture defaultUnit (width,height) (
|
||||
[Put (wpos rwld i,0) (Text w) | (i,w) <- zip [0..] (map fst (tokens d))] -- words
|
||||
++ [Put (wpos rwld i,15) (TinyText w) | (i,w) <- zip [0..] (map snd (tokens d))] -- pos tags 15u above bottom
|
||||
++ concat [putArc rwld (aheight x y) x y label | ((x,y),label) <- deps d] -- arcs and labels
|
||||
++ [Put (wpos rwld (root d) + 15,height) (ArrowDown (height-arcbase))]
|
||||
++ [Put (wpos rwld (root d) + 20,height - 10) (TinyText "ROOT")]
|
||||
)]
|
||||
where
|
||||
wld i = wordLength d i -- >= 20.0
|
||||
rwld i = (wld i) / defaultWordLength -- >= 1.0
|
||||
aheight x y = depth (min x y) (max x y) + 1 ---- abs (x-y)
|
||||
arcs = [(min u v, max u v) | ((u,v),_) <- deps d]
|
||||
depth x y = case [(u,v) | (u,v) <- arcs, (x < u && v <= y) || (x == u && v < y)] of ---- only projective arcs counted
|
||||
[] -> 0
|
||||
uvs -> 1 + maximum (0:[depth u v | (u,v) <- uvs])
|
||||
width = {-round-} (sum [wsize rwld w | (w,_) <- zip [0..] (tokens d)]) + {-round-} spaceLength * fromIntegral ((length (tokens d)) - 1)
|
||||
height = 50 + 20 * {-round-} (maximum (0:[aheight x y | ((x,y),_) <- deps d]))
|
||||
|
||||
type CoNLL = [[String]]
|
||||
parseCoNLL :: String -> CoNLL
|
||||
parseCoNLL = map words . lines
|
||||
|
||||
--conll2dep :: String -> Dep
|
||||
--conll2dep = conll2dep' . parseCoNLL
|
||||
|
||||
conll2dep' :: CoNLL -> Dep
|
||||
conll2dep' ls = Dep {
|
||||
wordLength = wld
|
||||
, tokens = toks
|
||||
, deps = dps
|
||||
, root = head $ [read x-1 | x:_:_:_:_:_:"0":_ <- ls] ++ [1]
|
||||
}
|
||||
where
|
||||
wld i = maximum (0:[charWidth * fromIntegral (length w) | w <- let (tok,pos) = toks !! i in [tok,pos]])
|
||||
toks = [(w,c) | _:w:_:c:_ <- ls]
|
||||
dps = [((read y-1, read x-1),lab) | x:_:_:_:_:_:y:lab:_ <- ls, y /="0"]
|
||||
--maxdist = maximum [abs (x-y) | ((x,y),_) <- dps]
|
||||
|
||||
|
||||
-- * LaTeX Pictures (see https://en.wikibooks.org/wiki/LaTeX/Picture)
|
||||
|
||||
-- We render both LaTeX and SVG from this intermediate representation of
|
||||
-- LaTeX pictures.
|
||||
|
||||
data LaTeX = Comment String | Picture UnitLengthMM Size [DrawingCommand]
|
||||
data DrawingCommand = Put Position Object
|
||||
data Object = Text String | TinyText String | OvalTop Size | ArrowDown Length
|
||||
|
||||
type UnitLengthMM = Double
|
||||
type Size = (Double,Double)
|
||||
type Position = (Double,Double)
|
||||
type Length = Double
|
||||
|
||||
|
||||
-- * latex formatting
|
||||
ppLaTeX = vcat . map ppLaTeX1
|
||||
where
|
||||
ppLaTeX1 el =
|
||||
case el of
|
||||
Comment s -> comment s
|
||||
Picture unit size cmds ->
|
||||
app "setlength{\\unitlength}" (text (show unit ++ "mm"))
|
||||
$$ hang (app "begin" (text "picture")<>text (show size)) 2
|
||||
(vcat (map ppDrawingCommand cmds))
|
||||
$$ app "end" (text "picture")
|
||||
$$ text ""
|
||||
|
||||
ppDrawingCommand (Put pos obj) = put pos (ppObject obj)
|
||||
|
||||
ppObject obj =
|
||||
case obj of
|
||||
Text s -> text s
|
||||
TinyText s -> small (text s)
|
||||
OvalTop size -> text "\\oval" <> text (show size) <> text "[t]"
|
||||
ArrowDown len -> app "vector(0,-1)" (text (show len))
|
||||
|
||||
put p@(_,_) = app ("put" ++ show p)
|
||||
small w = text "{\\tiny" <+> w <> text "}"
|
||||
comment s = text "%%" <+> text s -- line break show follow
|
||||
|
||||
app macro arg = text "\\" <> text macro <> text "{" <> arg <> text "}"
|
||||
|
||||
|
||||
latexDoc :: Doc -> Doc
|
||||
latexDoc body =
|
||||
vcat [text "\\documentclass{article}",
|
||||
text "\\usepackage[utf8]{inputenc}",
|
||||
text "\\begin{document}",
|
||||
body,
|
||||
text "\\end{document}"]
|
||||
|
||||
|
||||
----------------------------------
|
||||
-- concrete syntax annotations (local) on top of conll
|
||||
-- examples of annotations:
|
||||
-- UseComp {"not"} PART neg head
|
||||
-- UseComp {*} AUX cop head
|
||||
|
||||
type CncLabels = [(String, String -> Maybe (String -> String,String,String))]
|
||||
-- (fun, word -> (pos,label,target))
|
||||
-- the pos can remain unchanged, as in the current notation in the article
|
||||
|
||||
fixCoNLL :: CncLabels -> CoNLL -> CoNLL
|
||||
fixCoNLL labels conll = map fixc conll where
|
||||
fixc row = case row of
|
||||
(i:word:fun:pos:cat:x_:"0":"dep":xs) -> (i:word:fun:pos:cat:x_:"0":"root":xs) --- change the root label from dep to root
|
||||
(i:word:fun:pos:cat:x_:j:label:xs) -> case look (fun,word) of
|
||||
Just (pos',label',"head") -> (i:word:fun:pos' pos:cat:x_:j :label':xs)
|
||||
Just (pos',label',target) -> (i:word:fun:pos' pos:cat:x_: getDep j target:label':xs)
|
||||
_ -> row
|
||||
_ -> row
|
||||
|
||||
look (fun,word) = case lookup fun labels of
|
||||
Just relabel -> case relabel word of
|
||||
Just row -> Just row
|
||||
_ -> case lookup "*" labels of
|
||||
Just starlabel -> starlabel word
|
||||
_ -> Nothing
|
||||
_ -> case lookup "*" labels of
|
||||
Just starlabel -> starlabel word
|
||||
_ -> Nothing
|
||||
|
||||
getDep j label = maybe j id $ lookup (label,j) [((label,j),i) | i:word:fun:pos:cat:x_:j:label:xs <- conll]
|
||||
|
||||
getCncDepLabels :: String -> CncLabels
|
||||
getCncDepLabels = map merge . groupBy (\ (x,_) (a,_) -> x == a) . concatMap analyse . filter choose . lines where
|
||||
--- choose is for compatibility with the general notation
|
||||
choose line = notElem '(' line && elem '{' line --- ignoring non-local (with "(") and abstract (without "{") rules
|
||||
|
||||
analyse line = case break (=='{') line of
|
||||
(beg,_:ws) -> case break (=='}') ws of
|
||||
(toks,_:target) -> case (words beg, words target) of
|
||||
(fun:_,[ label,j]) -> [(fun, (tok, (id, label,j))) | tok <- getToks toks]
|
||||
(fun:_,[pos,label,j]) -> [(fun, (tok, (const pos,label,j))) | tok <- getToks toks]
|
||||
_ -> []
|
||||
_ -> []
|
||||
_ -> []
|
||||
merge rules@((fun,_):_) = (fun, \tok ->
|
||||
case lookup tok (map snd rules) of
|
||||
Just new -> return new
|
||||
_ -> lookup "*" (map snd rules)
|
||||
)
|
||||
getToks = words . map (\c -> if elem c "\"," then ' ' else c)
|
||||
|
||||
printCoNLL :: CoNLL -> String
|
||||
printCoNLL = unlines . map (concat . intersperse "\t")
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
-- Expressions & types
|
||||
|
||||
@@ -335,6 +892,12 @@ readExpr str =
|
||||
freeStablePtr c_expr
|
||||
return (Just expr)
|
||||
|
||||
pIdent :: ReadS String
|
||||
pIdent = error "TODO: pIdent"
|
||||
|
||||
pExpr :: ReadS Expr
|
||||
pExpr = error "TODO: pExpr"
|
||||
|
||||
-- | renders a type as a 'String'. The list
|
||||
-- of identifiers is the list of all free variables
|
||||
-- in the type in order reverse to the order
|
||||
@@ -348,6 +911,9 @@ showType scope ty =
|
||||
bracket (pgf_print_type c_ty pctxt 0 m) free $ \c_text ->
|
||||
peekText c_text
|
||||
|
||||
showContext :: [Var] -> [(BindType,Var,Type)] -> String
|
||||
showContext = error "TODO: showContext"
|
||||
|
||||
-- | parses a 'String' as a type
|
||||
readType :: String -> Maybe Type
|
||||
readType str =
|
||||
@@ -360,3 +926,8 @@ readType str =
|
||||
else do ty <- deRefStablePtr c_ty
|
||||
freeStablePtr c_ty
|
||||
return (Just ty)
|
||||
|
||||
readProbabilitiesFromFile :: FilePath -> IO (Map.Map String Double)
|
||||
readProbabilitiesFromFile fpath = do
|
||||
s <- readFile fpath
|
||||
return $ Map.fromList [(f,read p) | f:p:_ <- map words (lines s)]
|
||||
|
||||
Reference in New Issue
Block a user