forked from GitHub/gf-core
fully supported case-insensitive parsing/lookup
This commit is contained in:
3
gf.cabal
3
gf.cabal
@@ -81,7 +81,8 @@ Library
|
||||
random,
|
||||
pretty,
|
||||
mtl,
|
||||
exceptions
|
||||
exceptions,
|
||||
ghc-prim
|
||||
hs-source-dirs: src/runtime/haskell
|
||||
|
||||
other-modules:
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
{-# LANGUAGE BangPatterns, FlexibleContexts #-}
|
||||
{-# LANGUAGE BangPatterns, FlexibleContexts, MagicHash #-}
|
||||
module GF.Compile.GrammarToPGF (mkCanon2pgf) where
|
||||
|
||||
--import GF.Compile.Export
|
||||
@@ -30,6 +30,10 @@ import qualified Data.Map as Map
|
||||
import qualified Data.IntMap as IntMap
|
||||
import Data.Array.IArray
|
||||
|
||||
import Data.Char
|
||||
import GHC.Prim
|
||||
import GHC.Base(getTag)
|
||||
|
||||
mkCanon2pgf :: Options -> SourceGrammar -> ModuleName -> IOE D.PGF
|
||||
mkCanon2pgf opts gr am = do
|
||||
(an,abs) <- mkAbstr am
|
||||
@@ -59,7 +63,9 @@ mkCanon2pgf opts gr am = do
|
||||
[(0,i2i f) | ((m,f),AbsFun (Just (L _ ty)) _ _ (Just True)) <- adefs, snd (GM.valCat ty) == cat]
|
||||
|
||||
mkConcr cm = do
|
||||
let cflags = err (const noOptions) mflags (lookupModule gr cm)
|
||||
let cflags = err (const noOptions) mflags (lookupModule gr cm)
|
||||
ciCmp | flag optCaseSensitive cflags = compare
|
||||
| otherwise = compareCaseInsensitve
|
||||
|
||||
(ex_seqs,cdefs) <- addMissingPMCFGs
|
||||
Map.empty
|
||||
@@ -68,15 +74,15 @@ mkCanon2pgf opts gr am = do
|
||||
|
||||
let flags = Map.fromList [(mkCId f,x) | (f,x) <- optionsPGF cflags]
|
||||
|
||||
seqs = (mkSetArray . Set.fromList . concat) $
|
||||
seqs = (mkArray . sortNubBy ciCmp . concat) $
|
||||
(Map.keys ex_seqs : [maybe [] elems (mseqs mi) | (m,mi) <- allExtends gr cm])
|
||||
|
||||
ex_seqs_arr = mkMapArray ex_seqs :: Array SeqId Sequence
|
||||
|
||||
!(!fid_cnt1,!cnccats) = genCncCats gr am cm cdefs
|
||||
!(!fid_cnt2,!productions,!lindefs,!linrefs,!cncfuns)
|
||||
= genCncFuns gr am cm ex_seqs_arr seqs cdefs fid_cnt1 cnccats
|
||||
|
||||
= genCncFuns gr am cm ex_seqs_arr ciCmp seqs cdefs fid_cnt1 cnccats
|
||||
|
||||
printnames = genPrintNames cdefs
|
||||
return (mi2i cm, D.Concr flags
|
||||
printnames
|
||||
@@ -186,6 +192,7 @@ genCncFuns :: Grammar
|
||||
-> ModuleName
|
||||
-> ModuleName
|
||||
-> Array SeqId Sequence
|
||||
-> (Sequence -> Sequence -> Ordering)
|
||||
-> Array SeqId Sequence
|
||||
-> [(QIdent, Info)]
|
||||
-> FId
|
||||
@@ -195,7 +202,7 @@ genCncFuns :: Grammar
|
||||
IntMap.IntMap [FunId],
|
||||
IntMap.IntMap [FunId],
|
||||
Array FunId D.CncFun)
|
||||
genCncFuns gr am cm ex_seqs seqs cdefs fid_cnt cnccats =
|
||||
genCncFuns gr am cm ex_seqs ciCmp seqs cdefs fid_cnt cnccats =
|
||||
let (fid_cnt1,funs_cnt1,funs1,lindefs,linrefs) = mkCncCats cdefs fid_cnt 0 [] IntMap.empty IntMap.empty
|
||||
(fid_cnt2,funs_cnt2,funs2,prods) = mkCncFuns cdefs fid_cnt1 funs_cnt1 funs1 lindefs Map.empty IntMap.empty
|
||||
in (fid_cnt2,prods,lindefs,linrefs,array (0,funs_cnt2-1) funs2)
|
||||
@@ -282,9 +289,9 @@ genCncFuns gr am cm ex_seqs seqs cdefs fid_cnt cnccats =
|
||||
in (offs+funid0,C.CncFun (i2i id) (amap (newIndex mseqs) lins0)):funs
|
||||
where
|
||||
newIndex mseqs i = binSearch (mseqs ! i) seqs (bounds seqs)
|
||||
|
||||
|
||||
binSearch v arr (i,j)
|
||||
| i <= j = case compare v (arr ! k) of
|
||||
| i <= j = case ciCmp v (arr ! k) of
|
||||
LT -> binSearch v arr (i,k-1)
|
||||
EQ -> k
|
||||
GT -> binSearch v arr (k+1,j)
|
||||
@@ -303,6 +310,121 @@ genPrintNames cdefs =
|
||||
flatten (Alts x _) = flatten x
|
||||
flatten (C x y) = flatten x +++ flatten y
|
||||
|
||||
--mkArray lst = listArray (0,length lst-1) lst
|
||||
mkArray lst = listArray (0,length lst-1) lst
|
||||
mkMapArray map = array (0,Map.size map-1) [(v,k) | (k,v) <- Map.toList map]
|
||||
mkSetArray set = listArray (0,Set.size set-1) [v | v <- Set.toList set]
|
||||
|
||||
-- The following is a version of Data.List.sortBy which together
|
||||
-- with the sorting also eliminates duplicate values
|
||||
sortNubBy cmp = mergeAll . sequences
|
||||
where
|
||||
sequences (a:b:xs) =
|
||||
case cmp a b of
|
||||
GT -> descending b [a] xs
|
||||
EQ -> sequences (b:xs)
|
||||
LT -> ascending b (a:) xs
|
||||
sequences xs = [xs]
|
||||
|
||||
descending a as [] = [a:as]
|
||||
descending a as (b:bs) =
|
||||
case cmp a b of
|
||||
GT -> descending b (a:as) bs
|
||||
EQ -> descending a as bs
|
||||
LT -> (a:as) : sequences (b:bs)
|
||||
|
||||
ascending a as [] = let !x = as [a]
|
||||
in [x]
|
||||
ascending a as (b:bs) =
|
||||
case cmp a b of
|
||||
GT -> let !x = as [a]
|
||||
in x : sequences (b:bs)
|
||||
EQ -> ascending a as bs
|
||||
LT -> ascending b (\ys -> as (a:ys)) bs
|
||||
|
||||
mergeAll [x] = x
|
||||
mergeAll xs = mergeAll (mergePairs xs)
|
||||
|
||||
mergePairs (a:b:xs) = let !x = merge a b
|
||||
in x : mergePairs xs
|
||||
mergePairs xs = xs
|
||||
|
||||
merge as@(a:as') bs@(b:bs') =
|
||||
case cmp a b of
|
||||
GT -> b:merge as bs'
|
||||
EQ -> a:merge as' bs'
|
||||
LT -> a:merge as' bs
|
||||
merge [] bs = bs
|
||||
merge as [] = as
|
||||
|
||||
-- The following function does case-insensitive comparison of sequences.
|
||||
-- This is used to allow case-insensitive parsing, while
|
||||
-- the linearizer still has access to the original cases.
|
||||
compareCaseInsensitve s1 s2 =
|
||||
compareSeq (elems s1) (elems s2)
|
||||
where
|
||||
compareSeq [] [] = EQ
|
||||
compareSeq [] _ = LT
|
||||
compareSeq _ [] = GT
|
||||
compareSeq (x:xs) (y:ys) =
|
||||
case compareSym x y of
|
||||
EQ -> compareSeq xs ys
|
||||
x -> x
|
||||
|
||||
compareSym s1 s2 =
|
||||
case s1 of
|
||||
D.SymCat d1 r1
|
||||
-> case s2 of
|
||||
D.SymCat d2 r2
|
||||
-> case compare d1 d2 of
|
||||
EQ -> r1 `compare` r2
|
||||
x -> x
|
||||
_ -> LT
|
||||
D.SymLit d1 r1
|
||||
-> case s2 of
|
||||
D.SymCat {} -> GT
|
||||
D.SymLit d2 r2
|
||||
-> case compare d1 d2 of
|
||||
EQ -> r1 `compare` r2
|
||||
x -> x
|
||||
_ -> LT
|
||||
D.SymVar d1 r1
|
||||
-> if tagToEnum# (getTag s2 ># 2#)
|
||||
then LT
|
||||
else case s2 of
|
||||
D.SymVar d2 r2
|
||||
-> case compare d1 d2 of
|
||||
EQ -> r1 `compare` r2
|
||||
x -> x
|
||||
_ -> GT
|
||||
D.SymKS t1
|
||||
-> if tagToEnum# (getTag s2 ># 3#)
|
||||
then LT
|
||||
else case s2 of
|
||||
D.SymKS t2 -> t1 `compareToken` t2
|
||||
_ -> GT
|
||||
D.SymKP a1 b1
|
||||
-> if tagToEnum# (getTag s2 ># 4#)
|
||||
then LT
|
||||
else case s2 of
|
||||
D.SymKP a2 b2
|
||||
-> case compare a1 a2 of
|
||||
EQ -> b1 `compare` b2
|
||||
x -> x
|
||||
_ -> GT
|
||||
_ -> let t1 = getTag s1
|
||||
t2 = getTag s2
|
||||
in if tagToEnum# (t1 <# t2)
|
||||
then LT
|
||||
else if tagToEnum# (t1 ==# t2)
|
||||
then EQ
|
||||
else GT
|
||||
|
||||
compareToken [] [] = EQ
|
||||
compareToken [] _ = LT
|
||||
compareToken _ [] = GT
|
||||
compareToken (x:xs) (y:ys)
|
||||
| x == y = compareToken xs ys
|
||||
| otherwise = case compare (toLower x) (toLower y) of
|
||||
EQ -> case compareToken xs ys of
|
||||
EQ -> compare x y
|
||||
x -> x
|
||||
x -> x
|
||||
|
||||
@@ -1078,8 +1078,8 @@ pgf_parsing_scan_helper(PgfParsing *ps, PgfParseState* state,
|
||||
ptrdiff_t len = current.ptr - start.ptr;
|
||||
found = true;
|
||||
|
||||
if (min <= len-1)
|
||||
pgf_parsing_scan_helper(ps, state, i, k-1, min, len-1);
|
||||
if (min <= len)
|
||||
pgf_parsing_scan_helper(ps, state, i, k-1, min, len);
|
||||
|
||||
// Here we do bottom-up prediction for all lexical categories.
|
||||
// The epsilon productions will be predicted in top-down
|
||||
@@ -1141,8 +1141,8 @@ pgf_parsing_scan_helper(PgfParsing *ps, PgfParseState* state,
|
||||
}
|
||||
}
|
||||
|
||||
if (len+1 <= max)
|
||||
pgf_parsing_scan_helper(ps, state, k+1, j, len+1, max);
|
||||
if (len <= max)
|
||||
pgf_parsing_scan_helper(ps, state, k+1, j, len, max);
|
||||
|
||||
break;
|
||||
}
|
||||
@@ -1633,6 +1633,9 @@ pgf_parsing_set_default_factors(PgfParsing* ps, PgfAbstr* abstr)
|
||||
}
|
||||
}
|
||||
|
||||
PGF_INTERNAL_DECL bool
|
||||
pgf_is_case_sensitive(PgfConcr* concr);
|
||||
|
||||
static PgfParsing*
|
||||
pgf_new_parsing(PgfConcr* concr, GuString sentence,
|
||||
PgfCallbacksMap* callbacks, PgfOracleCallback* oracle,
|
||||
@@ -1643,8 +1646,7 @@ pgf_new_parsing(PgfConcr* concr, GuString sentence,
|
||||
ps->pool = pool;
|
||||
ps->out_pool = out_pool;
|
||||
ps->sentence = sentence;
|
||||
ps->case_sensitive =
|
||||
(gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL);
|
||||
ps->case_sensitive = pgf_is_case_sensitive(concr);
|
||||
ps->expr_queue = gu_new_buf(PgfExprState*, pool);
|
||||
ps->max_fid = concr->total_cats;
|
||||
ps->before = NULL;
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#include <pgf/data.h>
|
||||
#include <pgf/expr.h>
|
||||
#include <pgf/linearizer.h>
|
||||
#include <gu/utf8.h>
|
||||
|
||||
@@ -16,8 +17,10 @@ cmp_string(PgfCohortSpot* spot, GuString tok,
|
||||
if (c1 == 0)
|
||||
return -1;
|
||||
|
||||
if (!case_sensitive)
|
||||
if (!case_sensitive) {
|
||||
c1 = gu_ucs_to_lower(c1);
|
||||
c2 = gu_ucs_to_lower(c2);
|
||||
}
|
||||
|
||||
if (c1 != c2)
|
||||
return (c1-c2);
|
||||
@@ -126,6 +129,22 @@ typedef struct {
|
||||
bool case_sensitive;
|
||||
} PgfSequenceOrder;
|
||||
|
||||
PGF_INTERNAL bool
|
||||
pgf_is_case_sensitive(PgfConcr* concr)
|
||||
{
|
||||
PgfFlag* flag =
|
||||
gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive");
|
||||
if (flag != NULL) {
|
||||
GuVariantInfo inf = gu_variant_open(flag->value);
|
||||
if (inf.tag == PGF_LITERAL_STR) {
|
||||
PgfLiteralStr* lstr = inf.data;
|
||||
if (strcmp(lstr->val, "off") == 0)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static int
|
||||
pgf_sequence_cmp_fn(GuOrder* order, const void* p1, const void* p2)
|
||||
{
|
||||
@@ -156,16 +175,59 @@ pgf_lookup_morpho(PgfConcr *concr, GuString sentence,
|
||||
}
|
||||
}
|
||||
|
||||
bool case_sensitive =
|
||||
(gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL);
|
||||
size_t index = 0;
|
||||
PgfSequenceOrder order = { { pgf_sequence_cmp_fn },
|
||||
pgf_is_case_sensitive(concr) };
|
||||
if (gu_seq_binsearch_index(concr->sequences, &order.order,
|
||||
PgfSequence, (void*) sentence,
|
||||
&index)) {
|
||||
PgfSequence* seq = NULL;
|
||||
|
||||
PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, case_sensitive };
|
||||
PgfSequence* seq = (PgfSequence*)
|
||||
gu_seq_binsearch(concr->sequences, &order.order,
|
||||
PgfSequence, (void*) sentence);
|
||||
/* If the match is case-insensitive then there might be more
|
||||
* matches around the current index. We must check the neighbour
|
||||
* sequences for matching as well.
|
||||
*/
|
||||
|
||||
if (seq != NULL && seq->idx != NULL)
|
||||
pgf_morpho_iter(seq->idx, callback, err);
|
||||
if (!order.case_sensitive) {
|
||||
size_t i = index;
|
||||
while (i > 0) {
|
||||
seq = gu_seq_index(concr->sequences, PgfSequence, i-1);
|
||||
|
||||
size_t sym_idx = 0;
|
||||
PgfCohortSpot spot = {0, sentence};
|
||||
if (pgf_symbols_cmp(&spot, seq->syms, &sym_idx, order.case_sensitive) != 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (seq->idx != NULL)
|
||||
pgf_morpho_iter(seq->idx, callback, err);
|
||||
|
||||
i--;
|
||||
}
|
||||
}
|
||||
|
||||
seq = gu_seq_index(concr->sequences, PgfSequence, index);
|
||||
if (seq->idx != NULL)
|
||||
pgf_morpho_iter(seq->idx, callback, err);
|
||||
|
||||
if (!order.case_sensitive) {
|
||||
size_t i = index+1;
|
||||
while (i < gu_seq_length(concr->sequences)) {
|
||||
seq = gu_seq_index(concr->sequences, PgfSequence, i);
|
||||
|
||||
size_t sym_idx = 0;
|
||||
PgfCohortSpot spot = {0, sentence};
|
||||
if (pgf_symbols_cmp(&spot, seq->syms, &sym_idx, order.case_sensitive) != 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
if (seq->idx != NULL)
|
||||
pgf_morpho_iter(seq->idx, callback, err);
|
||||
|
||||
i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
typedef struct {
|
||||
@@ -225,8 +287,8 @@ pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot,
|
||||
} else {
|
||||
ptrdiff_t len = current.ptr - spot->ptr;
|
||||
|
||||
if (min <= len-1)
|
||||
pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len-1);
|
||||
if (min <= len)
|
||||
pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len);
|
||||
|
||||
if (seq->idx != NULL && gu_buf_length(seq->idx) > 0) {
|
||||
PgfCohortRange* range = gu_buf_insert(state->found, 0);
|
||||
@@ -242,8 +304,8 @@ pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot,
|
||||
|
||||
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, ¤t);
|
||||
|
||||
if (len+1 <= max)
|
||||
pgf_lookup_cohorts_helper(state, spot, k+1, j, len+1, max);
|
||||
if (len <= max)
|
||||
pgf_lookup_cohorts_helper(state, spot, k+1, j, len, max);
|
||||
|
||||
break;
|
||||
}
|
||||
@@ -289,11 +351,13 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool)
|
||||
pRes->buf = NULL;
|
||||
state->current = NULL;
|
||||
return;
|
||||
} else {
|
||||
} else do {
|
||||
*pRes = gu_buf_pop(state->found, PgfCohortRange);
|
||||
state->current = pRes->start.ptr;
|
||||
pgf_morpho_iter(pRes->buf, state->callback, state->err);
|
||||
}
|
||||
} while (gu_buf_length(state->found) > 0 &&
|
||||
gu_buf_index_last(state->found, PgfCohortRange)->end.ptr == pRes->end.ptr);
|
||||
|
||||
}
|
||||
|
||||
PGF_API GuEnum*
|
||||
@@ -309,9 +373,6 @@ pgf_lookup_cohorts(PgfConcr *concr, GuString sentence,
|
||||
}
|
||||
}
|
||||
|
||||
bool case_sensitive =
|
||||
(gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL);
|
||||
|
||||
PgfCohortsState* state = gu_new(PgfCohortsState, pool);
|
||||
state->en.next = pgf_lookup_cohorts_enum_next;
|
||||
state->concr = concr;
|
||||
@@ -319,7 +380,7 @@ pgf_lookup_cohorts(PgfConcr *concr, GuString sentence,
|
||||
state->len = strlen(sentence);
|
||||
state->callback= callback;
|
||||
state->err = err;
|
||||
state->case_sensitive = case_sensitive;
|
||||
state->case_sensitive = pgf_is_case_sensitive(concr);
|
||||
state->spots = gu_new_buf(PgfCohortSpot, pool);
|
||||
state->found = gu_new_buf(PgfCohortRange, pool);
|
||||
|
||||
@@ -339,6 +400,7 @@ typedef struct {
|
||||
PgfSequences* sequences;
|
||||
GuString prefix;
|
||||
size_t seq_idx;
|
||||
bool case_sensitive;
|
||||
} PgfFullFormState;
|
||||
|
||||
struct PgfFullFormEntry {
|
||||
@@ -358,7 +420,8 @@ gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool)
|
||||
PgfSequence* seq = gu_seq_index(st->sequences, PgfSequence, st->seq_idx);
|
||||
GuString tokens = pgf_get_tokens(seq->syms, 0, pool);
|
||||
|
||||
if (!gu_string_is_prefix(st->prefix, tokens)) {
|
||||
PgfCohortSpot spot = {0, st->prefix};
|
||||
if (cmp_string(&spot, tokens, st->case_sensitive) > 0 || *spot.ptr != 0) {
|
||||
st->seq_idx = n_seqs;
|
||||
break;
|
||||
}
|
||||
@@ -387,6 +450,7 @@ pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool)
|
||||
st->sequences = concr->sequences;
|
||||
st->prefix = "";
|
||||
st->seq_idx = 0;
|
||||
st->case_sensitive = true;
|
||||
return &st->en;
|
||||
}
|
||||
|
||||
@@ -420,15 +484,32 @@ pgf_lookup_word_prefix(PgfConcr *concr, GuString prefix,
|
||||
state->sequences = concr->sequences;
|
||||
state->prefix = prefix;
|
||||
state->seq_idx = 0;
|
||||
state->case_sensitive = pgf_is_case_sensitive(concr);
|
||||
|
||||
bool case_sensitive =
|
||||
(gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL);
|
||||
|
||||
PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, case_sensitive };
|
||||
PgfSequenceOrder order = { { pgf_sequence_cmp_fn },
|
||||
state->case_sensitive };
|
||||
if (!gu_seq_binsearch_index(concr->sequences, &order.order,
|
||||
PgfSequence, (void*) prefix,
|
||||
&state->seq_idx)) {
|
||||
state->seq_idx++;
|
||||
} else if (!state->case_sensitive) {
|
||||
/* If the match is case-insensitive then there might be more
|
||||
* matches around the current index. Since we scroll down
|
||||
* anyway, it is enough to search upwards now.
|
||||
*/
|
||||
|
||||
while (state->seq_idx > 0) {
|
||||
PgfSequence* seq =
|
||||
gu_seq_index(concr->sequences, PgfSequence, state->seq_idx-1);
|
||||
|
||||
size_t sym_idx = 0;
|
||||
PgfCohortSpot spot = {0, state->prefix};
|
||||
if (pgf_symbols_cmp(&spot, seq->syms, &sym_idx, state->case_sensitive) > 0 || *spot.ptr != 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
state->seq_idx--;
|
||||
}
|
||||
}
|
||||
|
||||
return &state->en;
|
||||
|
||||
Reference in New Issue
Block a user