mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-23 11:42:49 -06:00
API for word alignment in the C runtime and in the Haskell binding
This commit is contained in:
@@ -77,6 +77,7 @@ libpgf_la_SOURCES = \
|
|||||||
pgf/hopu.c \
|
pgf/hopu.c \
|
||||||
pgf/printer.c \
|
pgf/printer.c \
|
||||||
pgf/graphviz.c \
|
pgf/graphviz.c \
|
||||||
|
pgf/aligner.c \
|
||||||
pgf/pgf.c \
|
pgf/pgf.c \
|
||||||
pgf/pgf.h \
|
pgf/pgf.h \
|
||||||
libpgf_la_LDFLAGS = "-no-undefined"
|
libpgf_la_LDFLAGS = "-no-undefined"
|
||||||
|
|||||||
@@ -82,6 +82,12 @@ gu_string_buf_freeze(GuStringBuf* sb, GuPool* pool)
|
|||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
gu_string_buf_flush(GuStringBuf* sb)
|
||||||
|
{
|
||||||
|
gu_buf_flush(sb->buf);
|
||||||
|
}
|
||||||
|
|
||||||
GuIn*
|
GuIn*
|
||||||
gu_string_in(GuString s, GuPool* pool)
|
gu_string_in(GuString s, GuPool* pool)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -33,6 +33,9 @@ gu_string_buf_out(GuStringBuf* sb);
|
|||||||
GuString
|
GuString
|
||||||
gu_string_buf_freeze(GuStringBuf* sb, GuPool* pool);
|
gu_string_buf_freeze(GuStringBuf* sb, GuPool* pool);
|
||||||
|
|
||||||
|
void
|
||||||
|
gu_string_buf_flush(GuStringBuf* sb);
|
||||||
|
|
||||||
GuString
|
GuString
|
||||||
gu_format_string_v(const char* fmt, va_list args, GuPool* pool);
|
gu_format_string_v(const char* fmt, va_list args, GuPool* pool);
|
||||||
|
|
||||||
|
|||||||
214
src/runtime/c/pgf/aligner.c
Normal file
214
src/runtime/c/pgf/aligner.c
Normal file
@@ -0,0 +1,214 @@
|
|||||||
|
#include "data.h"
|
||||||
|
#include "linearizer.h"
|
||||||
|
#include "pgf.h"
|
||||||
|
#include <gu/utf8.h>
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
PgfLinFuncs* funcs;
|
||||||
|
GuBuf* parent_stack;
|
||||||
|
GuBuf* parent_current;
|
||||||
|
GuBuf* phrases;
|
||||||
|
PgfAlignmentPhrase* last_phrase;
|
||||||
|
GuStringBuf* sbuf;
|
||||||
|
size_t n_matches;
|
||||||
|
GuExn* err;
|
||||||
|
bool bind;
|
||||||
|
bool capit;
|
||||||
|
GuPool* out_pool;
|
||||||
|
GuPool* tmp_pool;
|
||||||
|
} PgfAlignerLin;
|
||||||
|
|
||||||
|
static void
|
||||||
|
pgf_aligner_flush_phrase(PgfAlignerLin* alin)
|
||||||
|
{
|
||||||
|
size_t n_fids = gu_buf_length(alin->parent_current);
|
||||||
|
|
||||||
|
if (alin->n_matches == n_fids &&
|
||||||
|
alin->n_matches == alin->last_phrase->n_fids) {
|
||||||
|
// if the current compound word has the same parents
|
||||||
|
// as the last one then we just combine them with a space
|
||||||
|
|
||||||
|
alin->last_phrase->phrase =
|
||||||
|
gu_format_string(alin->out_pool, "%s %s",
|
||||||
|
alin->last_phrase->phrase,
|
||||||
|
gu_string_buf_freeze(alin->sbuf, alin->tmp_pool));
|
||||||
|
} else {
|
||||||
|
// push the current word to the buffer of words
|
||||||
|
|
||||||
|
PgfAlignmentPhrase* phrase =
|
||||||
|
gu_new_flex(alin->out_pool, PgfAlignmentPhrase, fids, n_fids);
|
||||||
|
phrase->phrase = gu_string_buf_freeze(alin->sbuf, alin->out_pool);
|
||||||
|
phrase->n_fids = n_fids;
|
||||||
|
for (size_t i = 0; i < n_fids; i++) {
|
||||||
|
phrase->fids[i] = gu_buf_get(alin->parent_current, int, i);
|
||||||
|
}
|
||||||
|
gu_buf_push(alin->phrases, PgfAlignmentPhrase*, phrase);
|
||||||
|
|
||||||
|
alin->last_phrase = phrase;
|
||||||
|
}
|
||||||
|
|
||||||
|
alin->n_matches = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
pgf_aligner_push_parent(PgfAlignerLin* alin, int fid)
|
||||||
|
{
|
||||||
|
gu_buf_push(alin->parent_current, int, fid);
|
||||||
|
|
||||||
|
if (alin->last_phrase != NULL) {
|
||||||
|
for (size_t i = 0; i < alin->last_phrase->n_fids; i++) {
|
||||||
|
if (fid == alin->last_phrase->fids[i]) {
|
||||||
|
alin->n_matches++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
pgf_aligner_lzn_symbol_token(PgfLinFuncs** funcs, PgfToken tok)
|
||||||
|
{
|
||||||
|
PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
|
||||||
|
if (!gu_ok(alin->err)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// get the tree node id that generates this token
|
||||||
|
size_t n_parents = gu_buf_length(alin->parent_stack);
|
||||||
|
int fid = gu_buf_get(alin->parent_stack, int, n_parents-1);
|
||||||
|
|
||||||
|
// how many nodes so far are involved in the current compound word
|
||||||
|
size_t n_fids = gu_buf_length(alin->parent_current);
|
||||||
|
|
||||||
|
if (alin->bind) {
|
||||||
|
// here we glue tokens
|
||||||
|
|
||||||
|
alin->bind = false;
|
||||||
|
|
||||||
|
bool found = false;
|
||||||
|
for (size_t i = 0; i < n_fids; i++) {
|
||||||
|
int current_fid = gu_buf_get(alin->parent_current, int, i);
|
||||||
|
if (fid == current_fid) {
|
||||||
|
found = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// add the tree node id to the list of parents if it has not
|
||||||
|
// been added already.
|
||||||
|
if (!found) {
|
||||||
|
pgf_aligner_push_parent(alin, fid);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// here we start a new (compound) word
|
||||||
|
|
||||||
|
pgf_aligner_flush_phrase(alin);
|
||||||
|
gu_string_buf_flush(alin->sbuf);
|
||||||
|
gu_buf_flush(alin->parent_current);
|
||||||
|
|
||||||
|
pgf_aligner_push_parent(alin, fid);
|
||||||
|
}
|
||||||
|
|
||||||
|
GuOut* out = gu_string_buf_out(alin->sbuf);
|
||||||
|
|
||||||
|
if (alin->capit) {
|
||||||
|
GuUCS c = gu_utf8_decode((const uint8_t**) &tok);
|
||||||
|
c = gu_ucs_to_upper(c);
|
||||||
|
gu_out_utf8(c, out, alin->err);
|
||||||
|
alin->capit = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
gu_string_write(tok, out, alin->err);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
pgf_aligner_lzn_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, int lindex, PgfCId fun)
|
||||||
|
{
|
||||||
|
PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
|
||||||
|
gu_buf_push(alin->parent_stack, int, fid);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
pgf_aligner_lzn_end_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, int lindex, PgfCId fun)
|
||||||
|
{
|
||||||
|
PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
|
||||||
|
gu_buf_pop(alin->parent_stack, int);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
pgf_aligner_lzn_symbol_ne(PgfLinFuncs** funcs)
|
||||||
|
{
|
||||||
|
PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
|
||||||
|
gu_raise(alin->err, PgalinNonExist);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
pgf_aligner_lzn_symbol_bind(PgfLinFuncs** funcs)
|
||||||
|
{
|
||||||
|
PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
|
||||||
|
alin->bind = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
pgf_aligner_lzn_symbol_capit(PgfLinFuncs** funcs)
|
||||||
|
{
|
||||||
|
PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
|
||||||
|
alin->capit = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static PgfLinFuncs pgf_file_lin_funcs = {
|
||||||
|
.symbol_token = pgf_aligner_lzn_symbol_token,
|
||||||
|
.begin_phrase = pgf_aligner_lzn_begin_phrase,
|
||||||
|
.end_phrase = pgf_aligner_lzn_end_phrase,
|
||||||
|
.symbol_ne = pgf_aligner_lzn_symbol_ne,
|
||||||
|
.symbol_bind = pgf_aligner_lzn_symbol_bind,
|
||||||
|
.symbol_capit = pgf_aligner_lzn_symbol_capit
|
||||||
|
};
|
||||||
|
|
||||||
|
GuSeq*
|
||||||
|
pgf_align_words(PgfConcr* concr, PgfExpr expr,
|
||||||
|
GuExn* err, GuPool* pool)
|
||||||
|
{
|
||||||
|
GuPool* tmp_pool = gu_local_pool();
|
||||||
|
|
||||||
|
GuEnum* cts =
|
||||||
|
pgf_lzr_concretize(concr, expr, err, tmp_pool);
|
||||||
|
if (!gu_ok(err)) {
|
||||||
|
gu_pool_free(tmp_pool);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
GuBuf* phrases = gu_new_buf(PgfAlignmentPhrase*, pool);
|
||||||
|
|
||||||
|
PgfCncTree ctree = gu_next(cts, PgfCncTree, tmp_pool);
|
||||||
|
if (!gu_variant_is_null(ctree)) {
|
||||||
|
ctree = pgf_lzr_wrap_linref(ctree, tmp_pool);
|
||||||
|
|
||||||
|
PgfAlignerLin alin = {
|
||||||
|
.funcs = &pgf_file_lin_funcs,
|
||||||
|
.parent_stack = gu_new_buf(int, tmp_pool),
|
||||||
|
.parent_current = gu_new_buf(int, tmp_pool),
|
||||||
|
.phrases = phrases,
|
||||||
|
.last_phrase = NULL,
|
||||||
|
.sbuf = gu_string_buf(tmp_pool),
|
||||||
|
.n_matches = 0,
|
||||||
|
.err = err,
|
||||||
|
.bind = true,
|
||||||
|
.capit = false,
|
||||||
|
.out_pool = pool,
|
||||||
|
.tmp_pool = tmp_pool
|
||||||
|
};
|
||||||
|
gu_buf_push(alin.parent_stack, int, -1);
|
||||||
|
|
||||||
|
pgf_lzr_linearize(concr, ctree, 0, &alin.funcs, tmp_pool);
|
||||||
|
if (!gu_ok(err)) {
|
||||||
|
gu_pool_free(tmp_pool);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
pgf_aligner_flush_phrase(&alin);
|
||||||
|
}
|
||||||
|
|
||||||
|
gu_pool_free(tmp_pool);
|
||||||
|
return gu_buf_data_seq(phrases);
|
||||||
|
}
|
||||||
@@ -1160,8 +1160,10 @@ pgf_linearize(PgfConcr* concr, PgfExpr expr, GuOut* out, GuExn* err)
|
|||||||
|
|
||||||
GuEnum* cts =
|
GuEnum* cts =
|
||||||
pgf_lzr_concretize(concr, expr, err, tmp_pool);
|
pgf_lzr_concretize(concr, expr, err, tmp_pool);
|
||||||
if (!gu_ok(err))
|
if (!gu_ok(err)) {
|
||||||
|
gu_pool_free(tmp_pool);
|
||||||
return;
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
PgfCncTree ctree = gu_next(cts, PgfCncTree, tmp_pool);
|
PgfCncTree ctree = gu_next(cts, PgfCncTree, tmp_pool);
|
||||||
if (!gu_variant_is_null(ctree)) {
|
if (!gu_variant_is_null(ctree)) {
|
||||||
|
|||||||
@@ -75,6 +75,16 @@ pgf_has_linearization(PgfConcr* concr, PgfCId id);
|
|||||||
void
|
void
|
||||||
pgf_linearize(PgfConcr* concr, PgfExpr expr, GuOut* out, GuExn* err);
|
pgf_linearize(PgfConcr* concr, PgfExpr expr, GuOut* out, GuExn* err);
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
GuString phrase;
|
||||||
|
size_t n_fids;
|
||||||
|
int fids[];
|
||||||
|
} PgfAlignmentPhrase;
|
||||||
|
|
||||||
|
GuSeq*
|
||||||
|
pgf_align_words(PgfConcr* concr, PgfExpr expr,
|
||||||
|
GuExn* err, GuPool* pool);
|
||||||
|
|
||||||
bool
|
bool
|
||||||
pgf_parseval(PgfConcr* concr, PgfExpr expr, PgfCId cat,
|
pgf_parseval(PgfConcr* concr, PgfExpr expr, PgfCId cat,
|
||||||
double *precision, double *recall, double *exact);
|
double *precision, double *recall, double *exact);
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
module PGF2 (-- * PGF
|
module PGF2 (-- * PGF
|
||||||
PGF,readPGF,abstractName,startCat,
|
PGF,readPGF,abstractName,startCat,
|
||||||
-- * Concrete syntax
|
-- * Concrete syntax
|
||||||
Concr,languages,parse,parseWithHeuristics,linearize,
|
Concr,languages,parse,parseWithHeuristics,linearize,alignWords,
|
||||||
-- * Trees
|
-- * Trees
|
||||||
Expr,readExpr,showExpr,mkApp,unApp,mkStr,
|
Expr,readExpr,showExpr,mkApp,unApp,mkStr,
|
||||||
-- * Morphology
|
-- * Morphology
|
||||||
@@ -362,6 +362,33 @@ linearize lang e = unsafePerformIO $
|
|||||||
else do lin <- gu_string_buf_freeze sb pl
|
else do lin <- gu_string_buf_freeze sb pl
|
||||||
peekCString lin
|
peekCString lin
|
||||||
|
|
||||||
|
-- | Linearizes the expression in the given concrete syntax and returns
-- the phrases of the linearization, each paired with the ids of the
-- tree nodes that produced it. Returns the empty list when the tree
-- has no linearization (the runtime raised @PgfLinNonExist@) and
-- throws 'PGFError' for any other failure.
alignWords :: Concr -> Expr -> [(String, [Int])]
alignWords lang e = unsafePerformIO $
  withGuPool $ \pl ->
    do exn <- gu_new_exn pl
       c_seq <- pgf_align_words (concr lang) (expr e) exn pl
       failed <- gu_exn_is_raised exn
       if failed
         then do is_nonexist <- gu_exn_caught exn gu_exn_type_PgfLinNonExist
                 if is_nonexist
                   then return []
                   else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn
                           if is_exn
                             then do c_msg <- (#peek GuExn, data.data) exn
                                     msg <- peekCString c_msg
                                     throwIO (PGFError msg)
                             else throwIO (PGFError "The abstract tree cannot be linearized")
         else do -- NOTE(review): the length is read as a CInt as before; if
                 -- GuSeq.len is declared as size_t this should be CSize too - confirm
                 len <- (#peek GuSeq, len) c_seq
                 arr <- peekArray (fromIntegral (len :: CInt)) (c_seq `plusPtr` (#offset GuSeq, data))
                 mapM peekAlignmentPhrase arr
  where
    -- Everything is copied out of the C structures here, while the
    -- pool that owns them is still alive.
    peekAlignmentPhrase :: Ptr () -> IO (String, [Int])
    peekAlignmentPhrase ptr = do
      c_phrase <- (#peek PgfAlignmentPhrase, phrase) ptr
      phrase <- peekCString c_phrase
      -- n_fids is declared as size_t in PgfAlignmentPhrase
      n_fids <- (#peek PgfAlignmentPhrase, n_fids) ptr
      -- fids is an array of C ints: it must be read as CInt and then
      -- converted, reading it directly as Haskell Int would use the
      -- wrong element size on 64-bit platforms
      c_fids <- peekArray (fromIntegral (n_fids :: CSize)) (ptr `plusPtr` (#offset PgfAlignmentPhrase, fids)) :: IO [CInt]
      return (phrase, map fromIntegral c_fids)
|
||||||
|
|
||||||
-----------------------------------------------------------------------------
|
-----------------------------------------------------------------------------
|
||||||
-- Helper functions
|
-- Helper functions
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ data GuString
|
|||||||
data GuStringBuf
|
data GuStringBuf
|
||||||
data GuMapItor
|
data GuMapItor
|
||||||
data GuOut
|
data GuOut
|
||||||
|
data GuSeq
|
||||||
data GuPool
|
data GuPool
|
||||||
|
|
||||||
foreign import ccall fopen :: CString -> CString -> IO (Ptr ())
|
foreign import ccall fopen :: CString -> CString -> IO (Ptr ())
|
||||||
@@ -135,6 +136,9 @@ foreign import ccall "pgf/pgf.h pgf_print_name"
|
|||||||
foreign import ccall "pgf/pgf.h pgf_linearize"
|
foreign import ccall "pgf/pgf.h pgf_linearize"
|
||||||
pgf_linearize :: Ptr PgfConcr -> PgfExpr -> Ptr GuOut -> Ptr GuExn -> IO ()
|
pgf_linearize :: Ptr PgfConcr -> PgfExpr -> Ptr GuOut -> Ptr GuExn -> IO ()
|
||||||
|
|
||||||
|
foreign import ccall "pgf/pgf.h pgf_align_words"
|
||||||
|
pgf_align_words :: Ptr PgfConcr -> PgfExpr -> Ptr GuExn -> Ptr GuPool -> IO (Ptr GuSeq)
|
||||||
|
|
||||||
foreign import ccall "pgf/pgf.h pgf_parse_with_heuristics"
|
foreign import ccall "pgf/pgf.h pgf_parse_with_heuristics"
|
||||||
pgf_parse_with_heuristics :: Ptr PgfConcr -> CString -> CString -> Double -> Ptr PgfCallbacksMap -> Ptr GuExn -> Ptr GuPool -> Ptr GuPool -> IO (Ptr GuEnum)
|
pgf_parse_with_heuristics :: Ptr PgfConcr -> CString -> CString -> Double -> Ptr PgfCallbacksMap -> Ptr GuExn -> Ptr GuPool -> Ptr GuPool -> IO (Ptr GuEnum)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user