mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-30 14:52:51 -06:00
"flags case_sensitive=off" makes the parser case insensitive
This commit is contained in:
@@ -173,6 +173,7 @@ data Flags = Flags {
|
|||||||
optDump :: [Dump],
|
optDump :: [Dump],
|
||||||
optTagsOnly :: Bool,
|
optTagsOnly :: Bool,
|
||||||
optHeuristicFactor :: Maybe Double,
|
optHeuristicFactor :: Maybe Double,
|
||||||
|
optCaseSensitive :: Bool,
|
||||||
optPlusAsBind :: Bool,
|
optPlusAsBind :: Bool,
|
||||||
optJobs :: Maybe (Maybe Int)
|
optJobs :: Maybe (Maybe Int)
|
||||||
}
|
}
|
||||||
@@ -221,6 +222,7 @@ optionsPGF opts =
|
|||||||
maybe [] (\x -> [("language",LStr x)]) (flag optSpeechLanguage opts)
|
maybe [] (\x -> [("language",LStr x)]) (flag optSpeechLanguage opts)
|
||||||
++ maybe [] (\x -> [("startcat",LStr x)]) (flag optStartCat opts)
|
++ maybe [] (\x -> [("startcat",LStr x)]) (flag optStartCat opts)
|
||||||
++ maybe [] (\x -> [("heuristic_search_factor",LFlt x)]) (flag optHeuristicFactor opts)
|
++ maybe [] (\x -> [("heuristic_search_factor",LFlt x)]) (flag optHeuristicFactor opts)
|
||||||
|
++ (if flag optCaseSensitive opts then [] else [("case_sensitive",LStr "off")])
|
||||||
|
|
||||||
-- Option manipulation
|
-- Option manipulation
|
||||||
|
|
||||||
@@ -282,6 +284,7 @@ defaultFlags = Flags {
|
|||||||
optDump = [],
|
optDump = [],
|
||||||
optTagsOnly = False,
|
optTagsOnly = False,
|
||||||
optHeuristicFactor = Nothing,
|
optHeuristicFactor = Nothing,
|
||||||
|
optCaseSensitive = True,
|
||||||
optPlusAsBind = False,
|
optPlusAsBind = False,
|
||||||
optJobs = Nothing
|
optJobs = Nothing
|
||||||
}
|
}
|
||||||
@@ -365,6 +368,7 @@ optDescr =
|
|||||||
Option [] ["cse"] (onOff (toggleOptimize OptCSE) True) "Perform common sub-expression elimination (default on).",
|
Option [] ["cse"] (onOff (toggleOptimize OptCSE) True) "Perform common sub-expression elimination (default on).",
|
||||||
Option [] ["cfg"] (ReqArg cfgTransform "TRANS") "Enable or disable specific CFG transformations. TRANS = merge, no-merge, bottomup, no-bottomup, ...",
|
Option [] ["cfg"] (ReqArg cfgTransform "TRANS") "Enable or disable specific CFG transformations. TRANS = merge, no-merge, bottomup, no-bottomup, ...",
|
||||||
Option [] ["heuristic_search_factor"] (ReqArg (readDouble (\d o -> o { optHeuristicFactor = Just d })) "FACTOR") "Set the heuristic search factor for statistical parsing",
|
Option [] ["heuristic_search_factor"] (ReqArg (readDouble (\d o -> o { optHeuristicFactor = Just d })) "FACTOR") "Set the heuristic search factor for statistical parsing",
|
||||||
|
Option [] ["case_sensitive"] (onOff (\v -> set $ \o -> o{optCaseSensitive=v}) True) "Set the parser in case-sensitive/insensitive mode [sensitive by default]",
|
||||||
Option [] ["plus-as-bind"] (NoArg (set $ \o -> o{optPlusAsBind=True})) "Uses of (+) with runtime variables automatically generate BIND (experimental feature).",
|
Option [] ["plus-as-bind"] (NoArg (set $ \o -> o{optPlusAsBind=True})) "Uses of (+) with runtime variables automatically generate BIND (experimental feature).",
|
||||||
dumpOption "source" Source,
|
dumpOption "source" Source,
|
||||||
dumpOption "rebuild" Rebuild,
|
dumpOption "rebuild" Rebuild,
|
||||||
|
|||||||
@@ -38,6 +38,7 @@ typedef struct {
|
|||||||
GuPool* pool; // this pool is used for structures internal to the parser
|
GuPool* pool; // this pool is used for structures internal to the parser
|
||||||
GuPool* out_pool; // this pool is used for allocating the final abstract trees
|
GuPool* out_pool; // this pool is used for allocating the final abstract trees
|
||||||
GuString sentence; // the sentence to be parsed
|
GuString sentence; // the sentence to be parsed
|
||||||
|
bool case_sensitive;
|
||||||
GuBuf* expr_queue; // during the extraction of abstract trees we push them in this queue
|
GuBuf* expr_queue; // during the extraction of abstract trees we push them in this queue
|
||||||
int max_fid;
|
int max_fid;
|
||||||
PgfParseState *before;
|
PgfParseState *before;
|
||||||
@@ -474,22 +475,25 @@ pgf_print_expr_state0(PgfExprState* st,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
static int
|
static int
|
||||||
cmp_string(GuString* psent, GuString tok)
|
cmp_string(GuString* psent, GuString tok, bool case_sensitive)
|
||||||
{
|
{
|
||||||
for (;;) {
|
for (;;) {
|
||||||
uint8_t c2 = *tok;
|
GuUCS c2 = gu_utf8_decode((const uint8_t**) &tok);
|
||||||
if (c2 == 0)
|
if (c2 == 0)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
uint8_t c1 = **psent;
|
const uint8_t* p = (uint8_t*) *psent;
|
||||||
|
GuUCS c1 = gu_utf8_decode(&p);
|
||||||
if (c1 == 0)
|
if (c1 == 0)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
|
if (!case_sensitive)
|
||||||
|
c1 = gu_ucs_to_lower(c1);
|
||||||
|
|
||||||
if (c1 != c2)
|
if (c1 != c2)
|
||||||
return (c1-c2);
|
return (c1-c2);
|
||||||
|
|
||||||
tok++;
|
*psent = (GuString) p;
|
||||||
(*psent)++;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1016,7 +1020,8 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep)
|
|||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
pgf_symbols_cmp(GuString* psent, BIND_TYPE* pbind, PgfSymbols* syms)
|
pgf_symbols_cmp(GuString* psent, BIND_TYPE* pbind, PgfSymbols* syms,
|
||||||
|
bool case_sensitive)
|
||||||
{
|
{
|
||||||
size_t n_syms = gu_seq_length(syms);
|
size_t n_syms = gu_seq_length(syms);
|
||||||
for (size_t i = 0; i < n_syms; i++) {
|
for (size_t i = 0; i < n_syms; i++) {
|
||||||
@@ -1048,7 +1053,7 @@ pgf_symbols_cmp(GuString* psent, BIND_TYPE* pbind, PgfSymbols* syms)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int cmp = cmp_string(psent, pks->token);
|
int cmp = cmp_string(psent, pks->token, case_sensitive);
|
||||||
if (cmp != 0)
|
if (cmp != 0)
|
||||||
return cmp;
|
return cmp;
|
||||||
break;
|
break;
|
||||||
@@ -1098,7 +1103,7 @@ pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state,
|
|||||||
GuString start = ps->sentence + state->end_offset;
|
GuString start = ps->sentence + state->end_offset;
|
||||||
GuString current = start;
|
GuString current = start;
|
||||||
BIND_TYPE bind_type = state->needs_bind ? BIND_NONE : BIND_HARD;
|
BIND_TYPE bind_type = state->needs_bind ? BIND_NONE : BIND_HARD;
|
||||||
int cmp = pgf_symbols_cmp(&current, &bind_type, seq->syms);
|
int cmp = pgf_symbols_cmp(&current, &bind_type, seq->syms, ps->case_sensitive);
|
||||||
if (cmp < 0) {
|
if (cmp < 0) {
|
||||||
j = k-1;
|
j = k-1;
|
||||||
} else if (cmp > 0) {
|
} else if (cmp > 0) {
|
||||||
@@ -1141,7 +1146,7 @@ pgf_parsing_lookahead_pre(PgfParsing *ps, PgfParseState* state)
|
|||||||
|
|
||||||
GuString current = ps->sentence + state->end_offset;
|
GuString current = ps->sentence + state->end_offset;
|
||||||
BIND_TYPE bind_type = state->needs_bind ? BIND_NONE : BIND_HARD;
|
BIND_TYPE bind_type = state->needs_bind ? BIND_NONE : BIND_HARD;
|
||||||
if (pgf_symbols_cmp(&current, &bind_type, seq->syms) == 0) {
|
if (pgf_symbols_cmp(&current, &bind_type, seq->syms, ps->case_sensitive) == 0) {
|
||||||
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
|
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
|
||||||
entry->idx = seq->idx;
|
entry->idx = seq->idx;
|
||||||
entry->bind_type = bind_type;
|
entry->bind_type = bind_type;
|
||||||
@@ -1233,7 +1238,7 @@ pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
|
|||||||
ps->tp->prob = item->inside_prob + item->conts->outside_prob;
|
ps->tp->prob = item->inside_prob + item->conts->outside_prob;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (!ps->before->needs_bind && cmp_string(&current, tok) == 0) {
|
if (!ps->before->needs_bind && cmp_string(&current, tok, ps->case_sensitive) == 0) {
|
||||||
PgfParseState* state =
|
PgfParseState* state =
|
||||||
pgf_new_parse_state(ps, (current - ps->sentence),
|
pgf_new_parse_state(ps, (current - ps->sentence),
|
||||||
BIND_NONE,
|
BIND_NONE,
|
||||||
@@ -1675,6 +1680,8 @@ pgf_new_parsing(PgfConcr* concr, GuString sentence, PgfCallbacksMap* callbacks,
|
|||||||
ps->pool = pool;
|
ps->pool = pool;
|
||||||
ps->out_pool = out_pool;
|
ps->out_pool = out_pool;
|
||||||
ps->sentence = sentence;
|
ps->sentence = sentence;
|
||||||
|
ps->case_sensitive =
|
||||||
|
(gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL);
|
||||||
ps->expr_queue = gu_new_buf(PgfExprState*, pool);
|
ps->expr_queue = gu_new_buf(PgfExprState*, pool);
|
||||||
ps->max_fid = concr->total_cats;
|
ps->max_fid = concr->total_cats;
|
||||||
ps->before = NULL;
|
ps->before = NULL;
|
||||||
@@ -2217,7 +2224,7 @@ pgf_sequence_cmp_fn(GuOrder* self, const void* p1, const void* p2)
|
|||||||
const PgfSequence* sp2 = p2;
|
const PgfSequence* sp2 = p2;
|
||||||
|
|
||||||
BIND_TYPE bind = BIND_HARD;
|
BIND_TYPE bind = BIND_HARD;
|
||||||
int res = pgf_symbols_cmp(&sent, &bind, sp2->syms);
|
int res = pgf_symbols_cmp(&sent, &bind, sp2->syms, true);
|
||||||
if (res == 0 && *sent != 0) {
|
if (res == 0 && *sent != 0) {
|
||||||
res = 1;
|
res = 1;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user