restored the word alignment API

This commit is contained in:
Krasimir Angelov
2023-02-23 20:17:23 +01:00
parent 57126f6d28
commit 1b2c8ce961
7 changed files with 382 additions and 3 deletions

View File

@@ -26,6 +26,8 @@ libpgf_la_SOURCES = \
pgf/typechecker.h \
pgf/linearizer.cxx \
pgf/linearizer.h \
pgf/aligner.cxx \
pgf/aligner.h \
pgf/parser.cxx \
pgf/parser.h \
pgf/graphviz.cxx \

View File

@@ -0,0 +1,153 @@
#include "data.h"
#include "printer.h"
#include "aligner.h"
// Creates an aligner output that has collected no phrases yet.
// `bind` starts out true so that the very first token is attached to
// the (still empty) current compound word instead of triggering a
// flush in symbol_token().
PgfAlignerOutput::PgfAlignerOutput()
    : bind(true),
      nonexist(false),
      n_phrases(0),
      last_phrase(NULL),
      phrases(NULL),
      n_matches(0),
      printer(NULL, 0, NULL)
{
}
// Releases whatever phrases are still owned by this object, i.e. those
// that were not handed over to a caller through get_phrases().
PgfAlignerOutput::~PgfAlignerOutput()
{
    PgfAlignerOutput::free_phrases(this->phrases, this->n_phrases);
}
// Frees an array of `n_phrases` alignment phrases: the text of every
// phrase, every phrase record and finally the array itself.
// A NULL array is accepted and ignored.
void PgfAlignerOutput::free_phrases(PgfAlignmentPhrase **phrases, size_t n_phrases)
{
    if (phrases == NULL)
        return;

    PgfAlignmentPhrase **end = phrases + n_phrases;
    for (PgfAlignmentPhrase **it = phrases; it != end; ++it) {
        PgfAlignmentPhrase *entry = *it;
        free(entry->phrase);
        free(entry);
    }

    free(phrases);
}
// Hands the collected phrases over to the caller.
//
// On return *n_phrases holds the number of entries in the returned
// array. The array is detached from this object, so the destructor
// will not free it any more; free_phrases() can be used to release it.
// If a non-existent symbol was seen (see symbol_ne()) the result is
// NULL with *n_phrases set to 0, and the collected phrases stay owned
// by this object.
PgfAlignmentPhrase **PgfAlignerOutput::get_phrases(size_t *n_phrases)
{
    if (!nonexist) {
        PgfAlignmentPhrase **collected = this->phrases;
        *n_phrases = this->n_phrases;

        // detach the buffer: ownership moves to the caller
        this->phrases = NULL;
        this->last_phrase = NULL;
        this->n_phrases = 0;

        return collected;
    }

    *n_phrases = 0;
    return NULL;
}
// Records `fid` as one of the tree nodes contributing to the current
// compound word. If the same node also contributed to the previously
// stored phrase, the match counter is increased; flush() uses that
// counter to decide whether the two can be merged.
void PgfAlignerOutput::push_parent(int fid)
{
    parent_current.push_back(fid);

    if (last_phrase == NULL)
        return;

    size_t idx = 0;
    while (idx < last_phrase->n_fids) {
        if (last_phrase->fids[idx] == fid) {
            // count each node at most once
            n_matches++;
            return;
        }
        idx++;
    }
}
// Receives the next token of the linearization.
//
// The token is appended to the text of the current compound word and
// the innermost open phrase (top of parent_stack) is recorded as one
// of the word's parents. If the previous symbol was a bind (or this is
// the first token) the token is glued to the current word; otherwise
// the current word is flushed and a new one is started.
// Tokens are ignored once a non-existent symbol has been seen.
void PgfAlignerOutput::symbol_token(PgfText *tok)
{
    if (nonexist)
        return;

    // NOTE(review): this assumes that at least one begin_phrase() is
    // still open; calling back() on an empty parent_stack is undefined
    // behavior. Confirm that the linearizer never emits a token outside
    // of a phrase.
    int fid = parent_stack.back();

    if (bind) {
        // here we glue tokens
        bind = false;

        bool found = false;
        for (int current_fid : parent_current) {
            if (fid == current_fid) {
                found = true;
                break;
            }
        }

        // add the tree node id to the list of parents if it has not
        // been added already.
        if (!found) {
            push_parent(fid);
        }
    } else {
        // here we start a new (compound) word
        flush();
        parent_current.clear();
        push_parent(fid);
    }

    printer.puts(tok);
}
// A phrase for tree node `fid` is opened: it becomes the innermost
// parent for the tokens that follow. The category, annotation and
// function name are not needed for alignment and are ignored.
void PgfAlignerOutput::begin_phrase(PgfText *cat, int fid, PgfText *ann, PgfText *fun)
{
    this->parent_stack.emplace_back(fid);
}
// The innermost open phrase is closed: drop it from the stack of
// parents. None of the arguments are inspected.
void PgfAlignerOutput::end_phrase(PgfText *cat, int fid, PgfText *ann, PgfText *fun)
{
    this->parent_stack.pop_back();
}
void PgfAlignerOutput::symbol_ne()
{
nonexist = true;
}
void PgfAlignerOutput::symbol_bind()
{
bind = true;
}
// Finishes the current compound word.
//
// If the word has exactly the same set of parent nodes as the last
// stored phrase, its text is appended to that phrase, separated by a
// space. Otherwise a new phrase is allocated, filled with the word's
// text and parent node ids, and appended to the phrase array.
//
// Calling flush() before any token has been emitted is a no-op.
void PgfAlignerOutput::flush()
{
    size_t n_fids = parent_current.size();

    // No token has been emitted so far, hence there is no pending word.
    // Without this check n_matches == n_fids == 0 would hold and
    // last_phrase, which is still NULL, would be dereferenced below.
    if (n_fids == 0)
        return;

    if (last_phrase != NULL &&
        n_matches == n_fids &&
        n_matches == last_phrase->n_fids) {
        // if the current compound word has the same parents
        // as the last one then we just combine them with a space
        PgfText *phrase = printer.get_text();
        printer.puts(last_phrase->phrase); free(last_phrase->phrase);
        printer.puts(" ");
        printer.puts(phrase); free(phrase);
        last_phrase->phrase = printer.get_text();
    } else {
        // push the current word to the buffer of words
        // NOTE(review): the results of malloc/realloc are not checked
        // here; confirm how out-of-memory should be reported.
        PgfAlignmentPhrase* phrase = (PgfAlignmentPhrase*)
            malloc(sizeof(PgfAlignmentPhrase)+n_fids*sizeof(int));
        phrase->phrase = printer.get_text();
        phrase->n_fids = n_fids;
        for (size_t i = 0; i < n_fids; i++) {
            phrase->fids[i] = parent_current[i];
        }

        phrases = (PgfAlignmentPhrase**)
            realloc(phrases, (n_phrases+1)*sizeof(PgfAlignmentPhrase*));
        phrases[n_phrases++] = phrase;
        last_phrase = phrase;
    }

    // matches are counted per compound word
    n_matches = 0;
}

View File

@@ -0,0 +1,36 @@
#ifndef ALIGNER_H
#define ALIGNER_H
#include <vector>
// Linearization output that, instead of producing plain text, groups
// the produced tokens into phrases and remembers for every phrase the
// ids of the tree nodes (the `fid` passed to begin_phrase) that
// contributed to it.
class PGF_INTERNAL_DECL PgfAlignerOutput : public PgfLinearizationOutputIface {
public:
    PgfAlignerOutput();
    ~PgfAlignerOutput();

    // The object owns malloc'ed phrase buffers which the destructor
    // frees. A member-wise copy would free them twice, so copying is
    // disabled.
    PgfAlignerOutput(const PgfAlignerOutput&) = delete;
    PgfAlignerOutput& operator=(const PgfAlignerOutput&) = delete;

    // Callbacks invoked while linearizing.
    virtual void symbol_token(PgfText *tok);
    virtual void begin_phrase(PgfText *cat, int fid, PgfText *ann, PgfText *fun);
    virtual void end_phrase(PgfText *cat, int fid, PgfText *ann, PgfText *fun);
    virtual void symbol_ne();
    virtual void symbol_bind();
    virtual void flush();

    // Transfers the collected phrases to the caller; *n_phrases
    // receives their number. Returns NULL (and 0) if a non-existent
    // symbol was seen.
    PgfAlignmentPhrase **get_phrases(size_t *n_phrases);

    // Frees an array previously obtained from get_phrases().
    static void free_phrases(PgfAlignmentPhrase **phrases, size_t n_phrases);

private:
    bool bind;          // the next token is glued to the current word
    bool nonexist;      // a non-existent symbol has been seen

    std::vector<int> parent_current;  // node ids of the current compound word
    std::vector<int> parent_stack;    // node ids of the currently open phrases

    size_t n_phrases;                 // number of entries in `phrases`
    PgfAlignmentPhrase *last_phrase;  // most recently stored phrase, or NULL
    PgfAlignmentPhrase **phrases;     // malloc'ed array of collected phrases
    size_t n_matches;   // parents of the current word also present in last_phrase

    PgfPrinter printer; // accumulates the text of the current word

    void push_parent(int fid);
};
#endif

View File

@@ -13,6 +13,7 @@
#include "linearizer.h"
#include "parser.h"
#include "graphviz.h"
#include "aligner.h"
static void
pgf_exn_clear(PgfExn* err)
@@ -2641,3 +2642,121 @@ pgf_graphviz_parse_tree(PgfDB *db, PgfConcrRevision revision,
return NULL;
}
// Renders the word alignment of `expr` across several concrete syntaxes
// as a Graphviz digraph.
//
// For every revision in `revisions` for which the linearizer resolves,
// a record-shaped node "struct<i>" is emitted with one port "n<j>" per
// aligned phrase. Phrases of consecutive emitted nodes are connected
// with an edge whenever they share a tree node id.
//
// Returns the text of the graph, or NULL if the API block reported an
// error through `err`.
PGF_API PgfText *
pgf_graphviz_word_alignment(PgfDB *db, PgfConcrRevision* revisions, size_t n_revisions,
                            PgfExpr expr, PgfPrintContext *ctxt,
                            PgfMarshaller *m,
                            PgfGraphvizOptions* opts,
                            PgfExn* err)
{
    PGF_API_BEGIN {
        DB_scope scope(db, READER_SCOPE);

        PgfPrinter printer(NULL, 0, NULL);

        printer.puts("digraph {\n");
        printer.puts("rankdir=LR ;\n");
        printer.puts("node [shape = record");
        if (opts->leafFont != NULL && *opts->leafFont)
            printer.nprintf(40, ", fontname = \"%s\"", opts->leafFont);
        if (opts->leafColor != NULL && *opts->leafColor)
            printer.nprintf(40, ", fontcolor = \"%s\"", opts->leafColor);
        printer.puts("] ;\n\n");
        if (opts->leafEdgeStyle != NULL && *opts->leafEdgeStyle)
            printer.nprintf(40, "edge [style = %s];\n", opts->leafEdgeStyle);
        printer.puts("\n");

        // Phrases of the most recently emitted node together with that
        // node's index. The index is tracked explicitly because a
        // revision that fails to resolve emits no node, so the previous
        // node is not necessarily struct<i-1>.
        size_t last_index = 0;
        size_t last_n_phrases = 0;
        PgfAlignmentPhrase **last_phrases = NULL;

        for (size_t i = 0; i < n_revisions; i++) {
            ref<PgfConcr> concr = db->revision2concr(revisions[i]);

            PgfAlignerOutput out;
            PgfLinearizer linearizer(ctxt, concr, m);
            m->match_expr(&linearizer, expr);
            linearizer.reverse_and_label(true);
            if (linearizer.resolve()) {
                linearizer.linearize(&out, 0);
                out.flush();

                printer.nprintf(40, "  struct%zu[label=\"", i);

                size_t n_phrases;
                PgfAlignmentPhrase **phrases =
                    out.get_phrases(&n_phrases);

                // one record field (port) per phrase
                for (size_t j = 0; j < n_phrases; j++) {
                    PgfAlignmentPhrase* phrase = phrases[j];
                    if (j > 0)
                        printer.puts(" | ");
                    printer.nprintf(16, "<n%zu> ", j);
                    printer.puts(phrase->phrase);
                }
                printer.puts("\"] ;\n");

                if (last_phrases != NULL) {
                    // connect phrases that share a tree node id with a
                    // phrase of the previously emitted node
                    for (size_t j = 0; j < n_phrases; j++) {
                        PgfAlignmentPhrase* phrase = phrases[j];
                        for (size_t k = 0; k < phrase->n_fids; k++) {
                            int fid = phrase->fids[k];
                            for (size_t l = 0; l < last_n_phrases; l++) {
                                PgfAlignmentPhrase* last_phrase = last_phrases[l];
                                for (size_t r = 0; r < last_phrase->n_fids; r++) {
                                    int last_fid = last_phrase->fids[r];
                                    if (fid == last_fid) {
                                        printer.nprintf(50, "struct%zu:n%zu:e -> struct%zu:n%zu:w ;\n",last_index,l,i,j);
                                    }
                                }
                            }
                        }
                    }
                }

                // the current phrases become the reference for the
                // next emitted node
                PgfAlignerOutput::free_phrases(last_phrases, last_n_phrases);
                last_index = i;
                last_n_phrases = n_phrases;
                last_phrases = phrases;
            }
        }
        PgfAlignerOutput::free_phrases(last_phrases, last_n_phrases);

        printer.puts("}");
        return printer.get_text();
    } PGF_API_END

    return NULL;
}
// Linearizes `expr` in the given concrete syntax revision and returns
// the resulting phrases together with the ids of the tree nodes that
// produced each of them.
//
// On success *n_phrases receives the number of entries in the returned
// array, whose ownership passes to the caller. In every other case
// (the linearizer does not resolve, a non-existent symbol is produced,
// or an error is reported through `err`) NULL is returned and
// *n_phrases is 0.
PGF_API
PgfAlignmentPhrase **
pgf_align_words(PgfDB *db, PgfConcrRevision revision,
                PgfExpr expr, PgfPrintContext *ctxt,
                PgfMarshaller *m,
                size_t *n_phrases /* out */,
                PgfExn* err)
{
    // Make sure the out-parameter is defined on every path that
    // returns NULL; get_phrases() overwrites it on success.
    *n_phrases = 0;

    PGF_API_BEGIN {
        DB_scope scope(db, READER_SCOPE);

        ref<PgfConcr> concr = db->revision2concr(revision);

        PgfAlignerOutput out;
        PgfLinearizer linearizer(ctxt, concr, m);
        m->match_expr(&linearizer, expr);
        linearizer.reverse_and_label(true);
        if (linearizer.resolve()) {
            linearizer.linearize(&out, 0);
            out.flush();
            return out.get_phrases(n_phrases);
        }
    } PGF_API_END

    return NULL;
}

View File

@@ -825,4 +825,25 @@ pgf_graphviz_parse_tree(PgfDB *db, PgfConcrRevision revision,
PgfGraphvizOptions* opts,
PgfExn *err);
/* Produces a Graphviz digraph that shows how the phrases of the
 * linearizations of `expr` in the `n_revisions` concrete syntaxes in
 * `revisions` align with each other. `opts` supplies the optional
 * leaf font, leaf color and leaf edge style used in the graph.
 *
 * Returns the text of the graph, or NULL when an error is reported
 * through `err`.
 * NOTE(review): the result is presumably released with free() -
 * confirm against the other PgfText-returning functions. */
PGF_API_DECL PgfText *
pgf_graphviz_word_alignment(PgfDB *db, PgfConcrRevision* revisions, size_t n_revisions,
PgfExpr expr, PgfPrintContext *ctxt,
PgfMarshaller *m,
PgfGraphvizOptions* opts,
PgfExn* err);
/* One phrase of a linearization together with the tree nodes that
 * produced it. The record is allocated as a single block with `n_fids`
 * trailing entries in `fids`. */
typedef struct {
PgfText *phrase; /* the text of the phrase */
int n_fids;      /* number of entries in fids */
int fids[];      /* ids of the tree nodes which contributed tokens */
} PgfAlignmentPhrase;
/* Linearizes `expr` in the concrete syntax `revision` and returns an
 * array of phrases, each annotated with the ids of the tree nodes
 * that produced it. On success *n_phrases receives the length of the
 * array.
 *
 * Returns NULL if no alignment could be produced or an error is
 * reported through `err`.
 * NOTE(review): the array, every phrase in it and every phrase text
 * appear to be allocated with malloc, so the caller is presumably
 * expected to release each of them with free() - confirm. */
PGF_API_DECL
PgfAlignmentPhrase **
pgf_align_words(PgfDB *db, PgfConcrRevision revision,
PgfExpr expr, PgfPrintContext *ctxt,
PgfMarshaller *m,
size_t *n_phrases /* out */,
PgfExn* err);
#endif // PGF_H_