From 3bd40dbab68c8354d8cfceb6dad32d24b13bc723 Mon Sep 17 00:00:00 2001
From: "kr.angelov"
Date: Mon, 29 Dec 2014 10:59:20 +0000
Subject: [PATCH] API for word alignment in the C runtime and in the Haskell binding

---
 src/runtime/c/Makefile.am            |   1 +
 src/runtime/c/gu/string.c            |   6 +
 src/runtime/c/gu/string.h            |   3 +
 src/runtime/c/pgf/aligner.c          | 230 +++++++++++++++++++++++++++
 src/runtime/c/pgf/linearizer.c       |   4 +-
 src/runtime/c/pgf/pgf.h              |  14 ++
 src/runtime/haskell-bind/PGF2.hsc    |  31 +++-
 src/runtime/haskell-bind/PGF2/FFI.hs |   4 +
 8 files changed, 291 insertions(+), 2 deletions(-)
 create mode 100644 src/runtime/c/pgf/aligner.c

diff --git a/src/runtime/c/Makefile.am b/src/runtime/c/Makefile.am
index 4129c6157..726f00080 100644
--- a/src/runtime/c/Makefile.am
+++ b/src/runtime/c/Makefile.am
@@ -77,6 +77,7 @@ libpgf_la_SOURCES = \
 	pgf/hopu.c \
 	pgf/printer.c \
 	pgf/graphviz.c \
+	pgf/aligner.c \
 	pgf/pgf.c \
 	pgf/pgf.h \
 libpgf_la_LDFLAGS = "-no-undefined"
diff --git a/src/runtime/c/gu/string.c b/src/runtime/c/gu/string.c
index d380fca49..0947cf9e0 100644
--- a/src/runtime/c/gu/string.c
+++ b/src/runtime/c/gu/string.c
@@ -82,6 +82,12 @@ gu_string_buf_freeze(GuStringBuf* sb, GuPool* pool)
 	return p;
 }
 
+void
+gu_string_buf_flush(GuStringBuf* sb)
+{
+	gu_buf_flush(sb->buf);
+}
+
 GuIn*
 gu_string_in(GuString s, GuPool* pool)
 {
diff --git a/src/runtime/c/gu/string.h b/src/runtime/c/gu/string.h
index 111050606..e4729239c 100644
--- a/src/runtime/c/gu/string.h
+++ b/src/runtime/c/gu/string.h
@@ -33,6 +33,9 @@ gu_string_buf_out(GuStringBuf* sb);
 GuString
 gu_string_buf_freeze(GuStringBuf* sb, GuPool* pool);
 
+void
+gu_string_buf_flush(GuStringBuf* sb);
+
 GuString
 gu_format_string_v(const char* fmt, va_list args, GuPool* pool);
 
diff --git a/src/runtime/c/pgf/aligner.c b/src/runtime/c/pgf/aligner.c
new file mode 100644
index 000000000..a3eb4e2c0
--- /dev/null
+++ b/src/runtime/c/pgf/aligner.c
@@ -0,0 +1,230 @@
+#include "data.h"
+#include "linearizer.h"
+#include "pgf.h"
+#include <gu/utf8.h>
+
+// State shared by the linearizer callbacks while the alignment
+// phrases are being collected.
+typedef struct {
+	PgfLinFuncs* funcs;
+	GuBuf* parent_stack;	// fids of the phrases that are currently open
+	GuBuf* parent_current;	// fids that contributed to the word in sbuf
+	GuBuf* phrases;	// the PgfAlignmentPhrase* collected so far
+	PgfAlignmentPhrase* last_phrase;
+	GuStringBuf* sbuf;	// text of the (compound) word being built
+	size_t n_matches;	// fids in parent_current also in last_phrase
+	GuExn* err;
+	bool bind;
+	bool capit;
+	GuPool* out_pool;
+	GuPool* tmp_pool;
+} PgfAlignerLin;
+
+// Stores the (compound) word accumulated in sbuf, either by appending
+// it to the last phrase or by pushing a new phrase.
+static void
+pgf_aligner_flush_phrase(PgfAlignerLin* alin)
+{
+	size_t n_fids = gu_buf_length(alin->parent_current);
+
+	if (n_fids == 0) {
+		// no token was emitted (e.g. the linearization is empty),
+		// so there is no word to store and last_phrase may be NULL
+		return;
+	}
+
+	if (alin->last_phrase != NULL &&
+	    alin->n_matches == n_fids &&
+	    alin->n_matches == alin->last_phrase->n_fids) {
+		// if the current compound word has the same parents
+		// as the last one then we just combine them with a space
+
+		alin->last_phrase->phrase =
+			gu_format_string(alin->out_pool, "%s %s",
+			                 alin->last_phrase->phrase,
+			                 gu_string_buf_freeze(alin->sbuf, alin->tmp_pool));
+	} else {
+		// push the current word to the buffer of words
+
+		PgfAlignmentPhrase* phrase =
+			gu_new_flex(alin->out_pool, PgfAlignmentPhrase, fids, n_fids);
+		phrase->phrase = gu_string_buf_freeze(alin->sbuf, alin->out_pool);
+		phrase->n_fids = n_fids;
+		for (size_t i = 0; i < n_fids; i++) {
+			phrase->fids[i] = gu_buf_get(alin->parent_current, int, i);
+		}
+		gu_buf_push(alin->phrases, PgfAlignmentPhrase*, phrase);
+
+		alin->last_phrase = phrase;
+	}
+
+	alin->n_matches = 0;
+}
+
+// Adds fid to the parents of the current word and counts it as a
+// match if it is also a parent of the last phrase.
+static void
+pgf_aligner_push_parent(PgfAlignerLin* alin, int fid)
+{
+	gu_buf_push(alin->parent_current, int, fid);
+
+	if (alin->last_phrase != NULL) {
+		for (size_t i = 0; i < alin->last_phrase->n_fids; i++) {
+			if (fid == alin->last_phrase->fids[i]) {
+				alin->n_matches++;
+				break;
+			}
+		}
+	}
+}
+
+// Token callback: glues the token to the current word after a bind,
+// otherwise flushes the current word and starts a new one.
+static void
+pgf_aligner_lzn_symbol_token(PgfLinFuncs** funcs, PgfToken tok)
+{
+	PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
+	if (!gu_ok(alin->err)) {
+		return;
+	}
+
+	// get the tree node id that generates this token
+	size_t n_parents = gu_buf_length(alin->parent_stack);
+	int fid = gu_buf_get(alin->parent_stack, int, n_parents-1);
+
+	// how many nodes so far are involved in the current compound word
+	size_t n_fids = gu_buf_length(alin->parent_current);
+
+	if (alin->bind) {
+		// here we glue tokens
+
+		alin->bind = false;
+
+		bool found = false;
+		for (size_t i = 0; i < n_fids; i++) {
+			int current_fid = gu_buf_get(alin->parent_current, int, i);
+			if (fid == current_fid) {
+				found = true;
+				break;
+			}
+		}
+
+		// add the tree node id to the list of parents if it has not
+		// been added already.
+		if (!found) {
+			pgf_aligner_push_parent(alin, fid);
+		}
+	} else {
+		// here we start a new (compound) word
+
+		pgf_aligner_flush_phrase(alin);
+		gu_string_buf_flush(alin->sbuf);
+		gu_buf_flush(alin->parent_current);
+
+		pgf_aligner_push_parent(alin, fid);
+	}
+
+	GuOut* out = gu_string_buf_out(alin->sbuf);
+
+	if (alin->capit) {
+		GuUCS c = gu_utf8_decode((const uint8_t**) &tok);
+		c = gu_ucs_to_upper(c);
+		gu_out_utf8(c, out, alin->err);
+		alin->capit = false;
+	}
+
+	gu_string_write(tok, out, alin->err);
+}
+
+// Phrase callbacks: keep parent_stack in sync with the open phrases.
+static void
+pgf_aligner_lzn_begin_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, int lindex, PgfCId fun)
+{
+	PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
+	gu_buf_push(alin->parent_stack, int, fid);
+}
+
+static void
+pgf_aligner_lzn_end_phrase(PgfLinFuncs** funcs, PgfCId cat, int fid, int lindex, PgfCId fun)
+{
+	PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
+	gu_buf_pop(alin->parent_stack, int);
+}
+
+static void
+pgf_aligner_lzn_symbol_ne(PgfLinFuncs** funcs)
+{
+	PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
+	gu_raise(alin->err, PgfLinNonExist);
+}
+
+static void
+pgf_aligner_lzn_symbol_bind(PgfLinFuncs** funcs)
+{
+	PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
+	alin->bind = true;
+}
+
+static void
+pgf_aligner_lzn_symbol_capit(PgfLinFuncs** funcs)
+{
+	PgfAlignerLin* alin = gu_container(funcs, PgfAlignerLin, funcs);
+	alin->capit = true;
+}
+
+static PgfLinFuncs pgf_aligner_lin_funcs = {
+	.symbol_token = pgf_aligner_lzn_symbol_token,
+	.begin_phrase = pgf_aligner_lzn_begin_phrase,
+	.end_phrase   = pgf_aligner_lzn_end_phrase,
+	.symbol_ne    = pgf_aligner_lzn_symbol_ne,
+	.symbol_bind  = pgf_aligner_lzn_symbol_bind,
+	.symbol_capit = pgf_aligner_lzn_symbol_capit
+};
+
+GuSeq*
+pgf_align_words(PgfConcr* concr, PgfExpr expr,
+                GuExn* err, GuPool* pool)
+{
+	GuPool* tmp_pool = gu_local_pool();
+
+	GuEnum* cts =
+		pgf_lzr_concretize(concr, expr, err, tmp_pool);
+	if (!gu_ok(err)) {
+		gu_pool_free(tmp_pool);
+		return NULL;
+	}
+
+	GuBuf* phrases = gu_new_buf(PgfAlignmentPhrase*, pool);
+
+	PgfCncTree ctree = gu_next(cts, PgfCncTree, tmp_pool);
+	if (!gu_variant_is_null(ctree)) {
+		ctree = pgf_lzr_wrap_linref(ctree, tmp_pool);
+
+		PgfAlignerLin alin = {
+			.funcs = &pgf_aligner_lin_funcs,
+			.parent_stack = gu_new_buf(int, tmp_pool),
+			.parent_current = gu_new_buf(int, tmp_pool),
+			.phrases = phrases,
+			.last_phrase = NULL,
+			.sbuf = gu_string_buf(tmp_pool),
+			.n_matches = 0,
+			.err = err,
+			.bind = true,
+			.capit = false,
+			.out_pool = pool,
+			.tmp_pool = tmp_pool
+		};
+		gu_buf_push(alin.parent_stack, int, -1);
+
+		pgf_lzr_linearize(concr, ctree, 0, &alin.funcs, tmp_pool);
+		if (!gu_ok(err)) {
+			gu_pool_free(tmp_pool);
+			return NULL;
+		}
+
+		pgf_aligner_flush_phrase(&alin);
+	}
+
+	gu_pool_free(tmp_pool);
+	return gu_buf_data_seq(phrases);
+}
diff --git a/src/runtime/c/pgf/linearizer.c b/src/runtime/c/pgf/linearizer.c
index bbec2f3c2..409d60a2c 100644
--- a/src/runtime/c/pgf/linearizer.c
+++ b/src/runtime/c/pgf/linearizer.c
@@ -1160,8 +1160,10 @@ pgf_linearize(PgfConcr* concr, PgfExpr expr, GuOut* out, GuExn* err)
 
 	GuEnum* cts =
 		pgf_lzr_concretize(concr, expr, err, tmp_pool);
-	if (!gu_ok(err))
+	if (!gu_ok(err)) {
+		gu_pool_free(tmp_pool);
 		return;
+	}
 
 	PgfCncTree ctree = gu_next(cts, PgfCncTree, tmp_pool);
 	if (!gu_variant_is_null(ctree)) {
diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h
index e542e4213..e2fc6f74d 100644
--- a/src/runtime/c/pgf/pgf.h
+++ b/src/runtime/c/pgf/pgf.h
@@ -75,6 +75,20 @@ pgf_has_linearization(PgfConcr* concr, PgfCId id);
 void
 pgf_linearize(PgfConcr* concr, PgfExpr expr, GuOut* out, GuExn* err);
 
+// A phrase of the linearization together with the ids (fids) of the
+// tree nodes whose tokens make up the phrase.
+typedef struct {
+	GuString phrase;
+	size_t n_fids;
+	int fids[];
+} PgfAlignmentPhrase;
+
+// Linearizes expr in concr and returns a sequence of
+// PgfAlignmentPhrase* allocated from pool, or NULL if err is raised.
+GuSeq*
+pgf_align_words(PgfConcr* concr, PgfExpr expr,
+                GuExn* err, GuPool* pool);
+
 bool
 pgf_parseval(PgfConcr* concr, PgfExpr expr, PgfCId cat,
              double *precision, double *recall, double *exact);
diff --git a/src/runtime/haskell-bind/PGF2.hsc b/src/runtime/haskell-bind/PGF2.hsc
index 44f9d2b1c..02f74dd7a 100644
--- a/src/runtime/haskell-bind/PGF2.hsc
+++ b/src/runtime/haskell-bind/PGF2.hsc
@@ -15,7 +15,7 @@
 module PGF2 (-- * PGF
              PGF,readPGF,abstractName,startCat,
              -- * Concrete syntax
-             Concr,languages,parse,parseWithHeuristics,linearize,
+             Concr,languages,parse,parseWithHeuristics,linearize,alignWords,
              -- * Trees
              Expr,readExpr,showExpr,mkApp,unApp,mkStr,
              -- * Morphology
@@ -362,6 +362,35 @@ linearize lang e = unsafePerformIO $
          else do lin <- gu_string_buf_freeze sb pl
                  peekCString lin
 
+-- | Linearizes the expression and pairs every phrase of the result
+-- with the ids of the tree nodes that produced it.
+alignWords :: Concr -> Expr -> [(String, [Int])]
+alignWords lang e = unsafePerformIO $
+  withGuPool $ \pl ->
+    do exn <- gu_new_exn pl
+       seq <- pgf_align_words (concr lang) (expr e) exn pl
+       failed <- gu_exn_is_raised exn
+       if failed
+         then do is_nonexist <- gu_exn_caught exn gu_exn_type_PgfLinNonExist
+                 if is_nonexist
+                   then return []
+                   else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn
+                           if is_exn
+                             then do c_msg <- (#peek GuExn, data.data) exn
+                                     msg <- peekCString c_msg
+                                     throwIO (PGFError msg)
+                             else throwIO (PGFError "The abstract tree cannot be linearized")
+         else do len <- (#peek GuSeq, len) seq
+                 arr <- peekArray (fromIntegral (len :: CInt)) (seq `plusPtr` (#offset GuSeq, data))
+                 mapM peekAlignmentPhrase arr
+  where
+    peekAlignmentPhrase :: Ptr () -> IO (String, [Int])
+    peekAlignmentPhrase ptr = do
+      c_phrase <- (#peek PgfAlignmentPhrase, phrase) ptr
+      phrase <- peekCString c_phrase
+      n_fids <- (#peek PgfAlignmentPhrase, n_fids) ptr
+      fids <- peekArray (fromIntegral (n_fids :: CSize)) (ptr `plusPtr` (#offset PgfAlignmentPhrase, fids))
+      return (phrase, map fromIntegral (fids :: [CInt]))
 -----------------------------------------------------------------------------
 -- Helper functions
 
diff --git a/src/runtime/haskell-bind/PGF2/FFI.hs b/src/runtime/haskell-bind/PGF2/FFI.hs
index b96c93e17..f36fa1368 100644
--- a/src/runtime/haskell-bind/PGF2/FFI.hs
+++ b/src/runtime/haskell-bind/PGF2/FFI.hs
@@ -21,6 +21,7 @@ data GuString
 data GuStringBuf
 data GuMapItor
 data GuOut
+data GuSeq
 data GuPool
 
 foreign import ccall fopen :: CString -> CString -> IO (Ptr ())
@@ -135,6 +136,9 @@ foreign import ccall "pgf/pgf.h pgf_print_name"
 foreign import ccall "pgf/pgf.h pgf_linearize"
   pgf_linearize :: Ptr PgfConcr -> PgfExpr -> Ptr GuOut -> Ptr GuExn -> IO ()
 
+foreign import ccall "pgf/pgf.h pgf_align_words"
+  pgf_align_words :: Ptr PgfConcr -> PgfExpr -> Ptr GuExn -> Ptr GuPool -> IO (Ptr GuSeq)
+
 foreign import ccall "pgf/pgf.h pgf_parse_with_heuristics"
   pgf_parse_with_heuristics :: Ptr PgfConcr -> CString -> CString -> Double -> Ptr PgfCallbacksMap -> Ptr GuExn -> Ptr GuPool -> Ptr GuPool -> IO (Ptr GuEnum)
 