now word completion from Java works. It could be made better

2026-05-24 10:22:50 -06:00 · 2014-04-17 11:00:27 +00:00
parent d2f48f8156
commit d8e5206e7f
1 changed files with 128 additions and 219 deletions
--- a/src/runtime/c/pgf/parser.c
+++ b/src/runtime/c/pgf/parser.c
@@ -54,7 +54,9 @@ typedef struct {
    int max_fid;
    PgfParseState *before;
    PgfParseState *after;
-    PgfExprEnum en;    // enumeration for the generated trees
+    PgfToken prefix;
+    PgfTokenProb* tp;
+    PgfExprEnum en;    // enumeration for the generated trees/tokens
 #ifdef PGF_COUNTS_DEBUG
    int item_full_count;
    int item_real_count;
@@ -1278,7 +1280,7 @@ pgf_new_parse_state(PgfParsing* ps, size_t start_offset, BIND_TYPE bind_type)
 	}

 	size_t end_offset = start_offset;
-	GuString current = ps->sentence + start_offset;
+	GuString current = ps->sentence + end_offset;
 	size_t len = strlen(current);
 	while (skip_space(&current, &len)) {
 		end_offset++;
@@ -1317,17 +1319,25 @@ pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
 	GuString current = ps->sentence + ps->before->end_offset;
 	size_t len = strlen(current);

-    if (!ps->before->needs_bind && cmp_string(&current, &len, tok) == 0) {
-		PgfParseState* state =
-			pgf_new_parse_state(ps, (current - ps->sentence), BIND_NONE);
-		if (state->next == NULL) {
-			state->viterbi_prob = 
-				item->inside_prob+item->conts->outside_prob;
+	if (ps->prefix != NULL && ps->sentence[ps->before->end_offset] == 0) {
+		if (gu_string_is_prefix(ps->prefix, tok)) {
+			ps->tp = gu_new(PgfTokenProb, ps->out_pool);
+			ps->tp->tok  = tok;
+			ps->tp->prob = item->inside_prob + item->conts->outside_prob;
 		}
+	} else {
+		if (!ps->before->needs_bind && cmp_string(&current, &len, tok) == 0) {
+			PgfParseState* state =
+				pgf_new_parse_state(ps, (current - ps->sentence), BIND_NONE);
+			if (state->next == NULL) {
+				state->viterbi_prob = 
+					item->inside_prob+item->conts->outside_prob;
+			}

-		gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
-    } else {
-		pgf_item_free(ps, item);
+			gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
+		} else {
+			pgf_item_free(ps, item);
+		}
 	}
 }

@@ -1361,53 +1371,68 @@ pgf_parsing_td_predict(PgfParsing* ps,
 			item->inside_prob-conts->ccat->viterbi_prob+
 			item->conts->outside_prob;

-		// Top-down prediction for syntactic rules
-		for (size_t i = 0; i < ccat->n_synprods; i++) {
-			PgfProduction prod =
-				gu_seq_get(ccat->prods, PgfProduction, i);
-			pgf_parsing_production(ps, ps->before, conts, prod);
-		}
+		if (ps->prefix != NULL) {
+			// We do completion:
+			//   - top-down prediction for both syntactic and lexical rules
+			size_t n_prods;
+			if (ccat->fid < ps->concr->total_cats) // in grammar
+				n_prods = gu_seq_length(ccat->prods);
+			else
+				n_prods = ccat->n_synprods;
+			for (size_t i = 0; i < n_prods; i++) {
+				PgfProduction prod =
+					gu_seq_get(ccat->prods, PgfProduction, i);
+				pgf_parsing_production(ps, ps->before, conts, prod);
+			}
+		} else {
+			// Top-down prediction for syntactic rules
+			for (size_t i = 0; i < ccat->n_synprods; i++) {
+				PgfProduction prod =
+					gu_seq_get(ccat->prods, PgfProduction, i);
+				pgf_parsing_production(ps, ps->before, conts, prod);
+			}

-		// Bottom-up prediction for lexical and epsilon rules
-		size_t n_idcs = gu_buf_length(ps->before->lexicon_idx);
-		for (size_t i = 0; i < n_idcs; i++) {
-			PgfLexiconIdxEntry* lentry =
-				gu_buf_index(ps->before->lexicon_idx, PgfLexiconIdxEntry, i);
-			PgfParseState* state =
-				pgf_new_parse_state(ps, lentry->offset, lentry->bind_type);
+			// Bottom-up prediction for lexical and epsilon rules
+			size_t n_idcs = gu_buf_length(ps->before->lexicon_idx);
+			for (size_t i = 0; i < n_idcs; i++) {
+				PgfLexiconIdxEntry* lentry =
+					gu_buf_index(ps->before->lexicon_idx, PgfLexiconIdxEntry, i);
+				PgfParseState* state =
+					pgf_new_parse_state(ps, lentry->offset, lentry->bind_type);

-			if (state != NULL) {
-				PgfProductionIdxEntry key;
-				key.ccat    = ccat;
-				key.lin_idx = lin_idx;
-				key.papp    = NULL;
-				PgfProductionIdxEntry* value =
-					gu_seq_binsearch(gu_buf_data_seq(lentry->idx),
-									 pgf_production_idx_entry_order,
-									 PgfProductionIdxEntry, &key);
+				if (state != NULL) {
+					PgfProductionIdxEntry key;
+					key.ccat    = ccat;
+					key.lin_idx = lin_idx;
+					key.papp    = NULL;
+					PgfProductionIdxEntry* value =
+						gu_seq_binsearch(gu_buf_data_seq(lentry->idx),
+										 pgf_production_idx_entry_order,
+										 PgfProductionIdxEntry, &key);

-				if (value != NULL) {
-                    pgf_parsing_predict_lexeme(ps, state, conts, value);
+					if (value != NULL) {
+						pgf_parsing_predict_lexeme(ps, state, conts, value);

-                    PgfProductionIdxEntry* start =
-						gu_buf_data(lentry->idx);
-					PgfProductionIdxEntry* end =
-						start + gu_buf_length(lentry->idx)-1;
+						PgfProductionIdxEntry* start =
+							gu_buf_data(lentry->idx);
+						PgfProductionIdxEntry* end =
+							start + gu_buf_length(lentry->idx)-1;

-                    PgfProductionIdxEntry* left = value-1;
-                    while (left >= start &&
-                           value->ccat->fid == left->ccat->fid &&
-                           value->lin_idx   == left->lin_idx) {
-						pgf_parsing_predict_lexeme(ps, state, conts, left);
-						left--;
-					}
+						PgfProductionIdxEntry* left = value-1;
+						while (left >= start &&
+							   value->ccat->fid == left->ccat->fid &&
+							   value->lin_idx   == left->lin_idx) {
+							pgf_parsing_predict_lexeme(ps, state, conts, left);
+							left--;
+						}

-					PgfProductionIdxEntry* right = value+1;
-                    while (right <= end &&
-                           value->ccat->fid == right->ccat->fid &&
-                           value->lin_idx   == right->lin_idx) {
-						pgf_parsing_predict_lexeme(ps, state, conts, right);
-						right++;
+						PgfProductionIdxEntry* right = value+1;
+						while (right <= end &&
+							   value->ccat->fid == right->ccat->fid &&
+							   value->lin_idx   == right->lin_idx) {
+							pgf_parsing_predict_lexeme(ps, state, conts, right);
+							right++;
+						}
 					}
 				}
 			}
@@ -1871,6 +1896,8 @@ pgf_new_parsing(PgfConcr* concr, GuString sentence,
 	ps->ccat_full_count = 0;
 	ps->prod_full_count = 0;
 #endif
+	ps->prefix = NULL;
+	ps->tp = NULL;
 	ps->free_item = NULL;
 	ps->heuristic_factor = 0;
 	ps->meta_prob = INFINITY;
@@ -1906,157 +1933,6 @@ void pgf_parsing_print_counts(PgfParsing* ps)
 }
 #endif

-/*static bool
-*************
-typedef struct {
-	PgfTokenState ts;
-	PgfToken tok;
-	PgfProductionIdx *lexicon_idx;
-} PgfRealTokenState;
-
-static bool
-pgf_real_match_token(PgfTokenState* ts, PgfToken tok, PgfItem* item)
-{
-	return strcmp(gu_container(ts, PgfRealTokenState, ts)->tok, tok) == 0;
-}
-
-static PgfToken
-pgf_real_get_token(PgfTokenState* ts) {
-	return gu_container(ts, PgfRealTokenState, ts)->tok;
-}
-
-static PgfProductionIdx*
-pgf_real_get_lexicon_idx(PgfTokenState* ts) {
-	return gu_container(ts, PgfRealTokenState, ts)->lexicon_idx;
-}
-
-static PgfTokenFn pgf_tsfn_PgfRealTokenState = {
-	pgf_real_match_token,
-	pgf_real_get_token,
-	pgf_real_get_lexicon_idx
-};
-
-PgfParseState*
-pgf_parser_next_state(PgfParseState* prev, PgfToken tok)
-{
-#ifdef PGF_COUNTS_DEBUG
-	pgf_parsing_print_counts(prev->ps);
-#endif
-
-	PgfRealTokenState* ts =
-		pgf_new_token_state(PgfRealTokenState, prev->ps->pool);
-	ts->tok = tok;
-    ts->lexicon_idx = gu_map_get(prev->ps->concr->leftcorner_tok_idx,
-                                 tok, PgfProductionIdx*);	
-	if (ts->lexicon_idx != NULL) {
-		PgfLexiconFn clo = { { pgf_parser_compute_lexicon_prob }, &ts->ts };
-		gu_map_iter(ts->lexicon_idx, &clo.fn, NULL);
-	}
-	if (ts->ts.lexical_prob == INFINITY)
-		ts->ts.lexical_prob = 0;
-
-	PgfParseState* state =
-	    pgf_new_parse_state(prev->ps, prev, &ts->ts, prev->ps->pool);
-
-	while (gu_buf_length(state->agenda) == 0) {
-		if (!pgf_parsing_proceed(state))
-			return NULL;
-	}
-
-	return state;
-}
-
-typedef struct {
-	PgfTokenState ts;
-	GuEnum en;
-	GuString prefix;
-	PgfTokenProb* tp;
-	GuPool* pool;
-	PgfParseState* state;
-} PgfPrefixTokenState;
-
-static bool
-^ ^ ^ ^ ^ ^ ^
-pgf_prefix_match_token(PgfTokenState* ts0, PgfToken tok, PgfItem* item)
-{
-	PgfPrefixTokenState* ts = 
-		gu_container(ts0, PgfPrefixTokenState, ts);
-
-	if (gu_string_is_prefix(ts->prefix, tok)) {
-		size_t lin_idx;
-		PgfSequence* seq;
-		pgf_item_sequence(item, &lin_idx, &seq, ts->pool);
-
-		uint16_t seq_idx = item->seq_idx;
-		uint8_t  tok_idx = item->tok_idx;
-
-		// go one token back
-		if (tok_idx > 0)
-			tok_idx--;
-		else
-			seq_idx--;
-
-		ts->tp = gu_new(PgfTokenProb, ts->pool);
-		ts->tp->tok  =
-			pgf_get_tokens(seq, seq_idx, tok_idx, ts->pool);
-		ts->tp->cat  = item->conts->ccat->cnccat->abscat->name;
-		ts->tp->prob = item->inside_prob+item->conts->outside_prob;
-	}
-
-	return false;
-}
-
-static PgfToken
-pgf_prefix_get_token(PgfTokenState* ts) {
-	return "";
-}
-
-static PgfProductionIdx*
-pgf_prefix_get_lexicon_idx(PgfTokenState* ts) {
-	return NULL;
-}
-
-static PgfTokenFn pgf_tsfn_PgfPrefixTokenState = {
-	pgf_prefix_match_token,
-	pgf_prefix_get_token,
-	pgf_prefix_get_lexicon_idx
-};
-
-static void
-pgf_parser_completions_next(GuEnum* self, void* to, GuPool* pool)
-{
-	PgfPrefixTokenState* ts =
-		gu_container(self, PgfPrefixTokenState, en);
-
-	ts->tp   = NULL;
-	ts->pool = pool;
-	while (ts->tp == NULL) {
-		if (!pgf_parsing_proceed(ts->state))
-			break;
-	}
-
-	*((PgfTokenProb**)to) = ts->tp;
-}*/
-
-GuEnum*
-pgf_parsing_completions(PgfParsing* ps, GuString prefix)
-{
-#ifdef PGF_COUNTS_DEBUG
-	pgf_parsing_print_counts(ps);
-#endif
-
-/*	PgfPrefixTokenState* ts =
-		pgf_new_token_state(PgfPrefixTokenState, prev->ps->pool);
-	ts->en.next = pgf_parser_completions_next;
-	ts->prefix  = prefix;
-	ts->tp      = NULL;
-	ts->state   =
-	    pgf_new_parse_state(prev->ps, prev, &ts->ts);
-
-	return &ts->en;*/
-	return NULL;
-}
-
 static int
 cmp_expr_state(GuOrder* self, const void* a, const void* b)
 {
@@ -2501,35 +2377,68 @@ pgf_parse_with_heuristics(PgfConcr* concr, PgfCId cat, GuString sentence,
 	return &ps->en;
 }

+static void
+pgf_parser_completions_next(GuEnum* self, void* to, GuPool* pool)
+{
+	PgfParsing* ps =
+		gu_container(self, PgfParsing, en);
+
+	ps->tp = NULL;
+	while (ps->tp == NULL) {
+		if (!pgf_parsing_proceed(ps))
+			break;
+			
+#ifdef PGF_COUNTS_DEBUG
+		pgf_parsing_print_counts(ps);
+#endif
+	}
+
+	*((PgfTokenProb**)to) = ps->tp;
+}
+
 GuEnum*
 pgf_complete(PgfConcr* concr, PgfCId cat, GuString sentence, 
             GuString prefix, GuExn *err, GuPool* pool)
 {
-	// Begin parsing a sentence of the specified category
+	if (concr->sequences == NULL ||
+	    concr->pre_sequences == NULL ||
+	    concr->cnccats == NULL) {
+		GuExnData* err_data = gu_raise(err, PgfExn);
+		if (err_data) {
+			err_data->data = "The concrete syntax is not loaded";
+			return NULL;
+		}
+	}
+
+	// Begin parsing a sentence with the specified category
 	PgfParsing* ps =
-		pgf_parsing_init(concr, cat, 0, sentence, -1, err, pool, pool);
+		pgf_parsing_init(concr, cat, 0, sentence, -1.0, err, pool, pool);
 	if (ps == NULL) {
 		return NULL;
 	}

-	// Tokenization
-	GuExn* lex_err = gu_new_exn(NULL, gu_kind(type), pool);
-/*	PgfToken tok = pgf_lexer_read_token(lexer, lex_err);
-	while (!gu_exn_is_raised(lex_err)) {
-		// feed the token to get a new parse state
-		state = pgf_parser_next_state(state, tok);
-		if (state == NULL) {
+#ifdef PGF_COUNTS_DEBUG
+	pgf_parsing_print_counts(ps);
+#endif
+
+	size_t len = strlen(ps->sentence);
+	while (ps->before->end_offset < len) {
+		if (!pgf_parsing_proceed(ps)) {
+			GuExnData* exn = gu_raise(err, PgfParseError);
+			exn->data = (void*) pgf_parsing_last_token(ps, exn->pool);
 			return NULL;
 		}

-		tok = pgf_lexer_read_token(lexer, lex_err);
-	}*/
+#ifdef PGF_COUNTS_DEBUG
+		pgf_parsing_print_counts(ps);
+#endif
+	}

-	if (gu_exn_caught(lex_err) != gu_type(GuEOF))
-		return NULL;
-
-	// Now begin enumerating the resulting syntax trees
-	return pgf_parsing_completions(ps, prefix);
+	// Now begin enumerating the completions
+	ps->en.next = pgf_parser_completions_next;
+	ps->prefix  = prefix;
+	ps->tp      = NULL;
+	return &ps->en;
 }

 static void