diff --git a/gf.cabal b/gf.cabal index ec889a335..e7a5e100d 100644 --- a/gf.cabal +++ b/gf.cabal @@ -81,7 +81,8 @@ Library random, pretty, mtl, - exceptions + exceptions, + ghc-prim hs-source-dirs: src/runtime/haskell other-modules: diff --git a/src/compiler/GF/Compile/GrammarToPGF.hs b/src/compiler/GF/Compile/GrammarToPGF.hs index cd2e6b8ce..94a874506 100644 --- a/src/compiler/GF/Compile/GrammarToPGF.hs +++ b/src/compiler/GF/Compile/GrammarToPGF.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE BangPatterns, FlexibleContexts #-} +{-# LANGUAGE BangPatterns, FlexibleContexts, MagicHash #-} module GF.Compile.GrammarToPGF (mkCanon2pgf) where --import GF.Compile.Export @@ -30,6 +30,10 @@ import qualified Data.Map as Map import qualified Data.IntMap as IntMap import Data.Array.IArray +import Data.Char +import GHC.Prim +import GHC.Base(getTag) + mkCanon2pgf :: Options -> SourceGrammar -> ModuleName -> IOE D.PGF mkCanon2pgf opts gr am = do (an,abs) <- mkAbstr am @@ -59,7 +63,9 @@ mkCanon2pgf opts gr am = do [(0,i2i f) | ((m,f),AbsFun (Just (L _ ty)) _ _ (Just True)) <- adefs, snd (GM.valCat ty) == cat] mkConcr cm = do - let cflags = err (const noOptions) mflags (lookupModule gr cm) + let cflags = err (const noOptions) mflags (lookupModule gr cm) + ciCmp | flag optCaseSensitive cflags = compare + | otherwise = compareCaseInsensitve (ex_seqs,cdefs) <- addMissingPMCFGs Map.empty @@ -68,15 +74,15 @@ mkCanon2pgf opts gr am = do let flags = Map.fromList [(mkCId f,x) | (f,x) <- optionsPGF cflags] - seqs = (mkSetArray . Set.fromList . concat) $ + seqs = (mkArray . sortNubBy ciCmp . concat) $ (Map.keys ex_seqs : [maybe [] elems (mseqs mi) | (m,mi) <- allExtends gr cm]) ex_seqs_arr = mkMapArray ex_seqs :: Array SeqId Sequence !(!fid_cnt1,!cnccats) = genCncCats gr am cm cdefs !(!fid_cnt2,!productions,!lindefs,!linrefs,!cncfuns) - = genCncFuns gr am cm ex_seqs_arr seqs cdefs fid_cnt1 cnccats - + = genCncFuns gr am cm ex_seqs_arr ciCmp seqs cdefs fid_cnt1 cnccats + printnames = genPrintNames cdefs return (mi2i cm, D.Concr flags printnames @@ -186,6 +192,7 @@ genCncFuns :: Grammar -> ModuleName -> ModuleName -> Array SeqId Sequence + -> (Sequence -> Sequence -> Ordering) -> Array SeqId Sequence -> [(QIdent, Info)] -> FId @@ -195,7 +202,7 @@ genCncFuns :: Grammar IntMap.IntMap [FunId], IntMap.IntMap [FunId], Array FunId D.CncFun) -genCncFuns gr am cm ex_seqs seqs cdefs fid_cnt cnccats = +genCncFuns gr am cm ex_seqs ciCmp seqs cdefs fid_cnt cnccats = let (fid_cnt1,funs_cnt1,funs1,lindefs,linrefs) = mkCncCats cdefs fid_cnt 0 [] IntMap.empty IntMap.empty (fid_cnt2,funs_cnt2,funs2,prods) = mkCncFuns cdefs fid_cnt1 funs_cnt1 funs1 lindefs Map.empty IntMap.empty in (fid_cnt2,prods,lindefs,linrefs,array (0,funs_cnt2-1) funs2) @@ -282,9 +289,9 @@ genCncFuns gr am cm ex_seqs seqs cdefs fid_cnt cnccats = in (offs+funid0,C.CncFun (i2i id) (amap (newIndex mseqs) lins0)):funs where newIndex mseqs i = binSearch (mseqs ! i) seqs (bounds seqs) - + binSearch v arr (i,j) - | i <= j = case compare v (arr ! k) of + | i <= j = case ciCmp v (arr ! k) of LT -> binSearch v arr (i,k-1) EQ -> k GT -> binSearch v arr (k+1,j) @@ -303,6 +310,121 @@ genPrintNames cdefs = flatten (Alts x _) = flatten x flatten (C x y) = flatten x +++ flatten y ---mkArray lst = listArray (0,length lst-1) lst +mkArray lst = listArray (0,length lst-1) lst mkMapArray map = array (0,Map.size map-1) [(v,k) | (k,v) <- Map.toList map] -mkSetArray set = listArray (0,Set.size set-1) [v | v <- Set.toList set] + +-- The following is a version of Data.List.sortBy which together +-- with the sorting also eliminates duplicate values +sortNubBy cmp = mergeAll . sequences + where + sequences (a:b:xs) = + case cmp a b of + GT -> descending b [a] xs + EQ -> sequences (b:xs) + LT -> ascending b (a:) xs + sequences xs = [xs] + + descending a as [] = [a:as] + descending a as (b:bs) = + case cmp a b of + GT -> descending b (a:as) bs + EQ -> descending a as bs + LT -> (a:as) : sequences (b:bs) + + ascending a as [] = let !x = as [a] + in [x] + ascending a as (b:bs) = + case cmp a b of + GT -> let !x = as [a] + in x : sequences (b:bs) + EQ -> ascending a as bs + LT -> ascending b (\ys -> as (a:ys)) bs + + mergeAll [x] = x + mergeAll xs = mergeAll (mergePairs xs) + + mergePairs (a:b:xs) = let !x = merge a b + in x : mergePairs xs + mergePairs xs = xs + + merge as@(a:as') bs@(b:bs') = + case cmp a b of + GT -> b:merge as bs' + EQ -> a:merge as' bs' + LT -> a:merge as' bs + merge [] bs = bs + merge as [] = as + +-- The following function does case-insensitive comparison of sequences. +-- This is used to allow case-insensitive parsing, while +-- the linearizer still has access to the original cases. +compareCaseInsensitve s1 s2 = + compareSeq (elems s1) (elems s2) + where + compareSeq [] [] = EQ + compareSeq [] _ = LT + compareSeq _ [] = GT + compareSeq (x:xs) (y:ys) = + case compareSym x y of + EQ -> compareSeq xs ys + x -> x + + compareSym s1 s2 = + case s1 of + D.SymCat d1 r1 + -> case s2 of + D.SymCat d2 r2 + -> case compare d1 d2 of + EQ -> r1 `compare` r2 + x -> x + _ -> LT + D.SymLit d1 r1 + -> case s2 of + D.SymCat {} -> GT + D.SymLit d2 r2 + -> case compare d1 d2 of + EQ -> r1 `compare` r2 + x -> x + _ -> LT + D.SymVar d1 r1 + -> if tagToEnum# (getTag s2 ># 2#) + then LT + else case s2 of + D.SymVar d2 r2 + -> case compare d1 d2 of + EQ -> r1 `compare` r2 + x -> x + _ -> GT + D.SymKS t1 + -> if tagToEnum# (getTag s2 ># 3#) + then LT + else case s2 of + D.SymKS t2 -> t1 `compareToken` t2 + _ -> GT + D.SymKP a1 b1 + -> if tagToEnum# (getTag s2 ># 4#) + then LT + else case s2 of + D.SymKP a2 b2 + -> case compare a1 a2 of + EQ -> b1 `compare` b2 + x -> x + _ -> GT + _ -> let t1 = getTag s1 + t2 = getTag s2 + in if tagToEnum# (t1 <# t2) + then LT + else if tagToEnum# (t1 ==# t2) + then EQ + else GT + + compareToken [] [] = EQ + compareToken [] _ = LT + compareToken _ [] = GT + compareToken (x:xs) (y:ys) + | x == y = compareToken xs ys + | otherwise = case compare (toLower x) (toLower y) of + EQ -> case compareToken xs ys of + EQ -> compare x y + x -> x + x -> x diff --git a/src/runtime/c/pgf/parser.c b/src/runtime/c/pgf/parser.c index baf1e3eb3..be672d571 100644 --- a/src/runtime/c/pgf/parser.c +++ b/src/runtime/c/pgf/parser.c @@ -1078,8 +1078,8 @@ pgf_parsing_scan_helper(PgfParsing *ps, PgfParseState* state, ptrdiff_t len = current.ptr - start.ptr; found = true; - if (min <= len-1) - pgf_parsing_scan_helper(ps, state, i, k-1, min, len-1); + if (min <= len) + pgf_parsing_scan_helper(ps, state, i, k-1, min, len); // Here we do bottom-up prediction for all lexical categories. // The epsilon productions will be predicted in top-down @@ -1141,8 +1141,8 @@ pgf_parsing_scan_helper(PgfParsing *ps, PgfParseState* state, } } - if (len+1 <= max) - pgf_parsing_scan_helper(ps, state, k+1, j, len+1, max); + if (len <= max) + pgf_parsing_scan_helper(ps, state, k+1, j, len, max); break; } @@ -1633,6 +1633,9 @@ pgf_parsing_set_default_factors(PgfParsing* ps, PgfAbstr* abstr) } } +PGF_INTERNAL_DECL bool +pgf_is_case_sensitive(PgfConcr* concr); + static PgfParsing* pgf_new_parsing(PgfConcr* concr, GuString sentence, PgfCallbacksMap* callbacks, PgfOracleCallback* oracle, @@ -1643,8 +1646,7 @@ pgf_new_parsing(PgfConcr* concr, GuString sentence, ps->pool = pool; ps->out_pool = out_pool; ps->sentence = sentence; - ps->case_sensitive = - (gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL); + ps->case_sensitive = pgf_is_case_sensitive(concr); ps->expr_queue = gu_new_buf(PgfExprState*, pool); ps->max_fid = concr->total_cats; ps->before = NULL; diff --git a/src/runtime/c/pgf/scanner.c b/src/runtime/c/pgf/scanner.c index dae857ff1..e8de23afb 100644 --- a/src/runtime/c/pgf/scanner.c +++ b/src/runtime/c/pgf/scanner.c @@ -1,4 +1,5 @@ #include +#include #include #include @@ -16,8 +17,10 @@ cmp_string(PgfCohortSpot* spot, GuString tok, if (c1 == 0) return -1; - if (!case_sensitive) + if (!case_sensitive) { c1 = gu_ucs_to_lower(c1); + c2 = gu_ucs_to_lower(c2); + } if (c1 != c2) return (c1-c2); @@ -126,6 +129,22 @@ typedef struct { bool case_sensitive; } PgfSequenceOrder; +PGF_INTERNAL bool +pgf_is_case_sensitive(PgfConcr* concr) +{ + PgfFlag* flag = + gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive"); + if (flag != NULL) { + GuVariantInfo inf = gu_variant_open(flag->value); + if (inf.tag == PGF_LITERAL_STR) { + PgfLiteralStr* lstr = inf.data; + if (strcmp(lstr->val, "off") == 0) + return false; + } + } + return true; +} + static int pgf_sequence_cmp_fn(GuOrder* order, const void* p1, const void* p2) { @@ -156,16 +175,59 @@ pgf_lookup_morpho(PgfConcr *concr, GuString sentence, } } - bool case_sensitive = - (gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL); + size_t index = 0; + PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, + pgf_is_case_sensitive(concr) }; + if (gu_seq_binsearch_index(concr->sequences, &order.order, + PgfSequence, (void*) sentence, + &index)) { + PgfSequence* seq = NULL; - PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, case_sensitive }; - PgfSequence* seq = (PgfSequence*) - gu_seq_binsearch(concr->sequences, &order.order, - PgfSequence, (void*) sentence); + /* If the match is case-insensitive then there might be more + * matches around the current index. We must check the neighbour + * sequences for matching as well. + */ - if (seq != NULL && seq->idx != NULL) - pgf_morpho_iter(seq->idx, callback, err); + if (!order.case_sensitive) { + size_t i = index; + while (i > 0) { + seq = gu_seq_index(concr->sequences, PgfSequence, i-1); + + size_t sym_idx = 0; + PgfCohortSpot spot = {0, sentence}; + if (pgf_symbols_cmp(&spot, seq->syms, &sym_idx, order.case_sensitive) != 0) { + break; + } + + if (seq->idx != NULL) + pgf_morpho_iter(seq->idx, callback, err); + + i--; + } + } + + seq = gu_seq_index(concr->sequences, PgfSequence, index); + if (seq->idx != NULL) + pgf_morpho_iter(seq->idx, callback, err); + + if (!order.case_sensitive) { + size_t i = index+1; + while (i < gu_seq_length(concr->sequences)) { + seq = gu_seq_index(concr->sequences, PgfSequence, i); + + size_t sym_idx = 0; + PgfCohortSpot spot = {0, sentence}; + if (pgf_symbols_cmp(&spot, seq->syms, &sym_idx, order.case_sensitive) != 0) { + break; + } + + if (seq->idx != NULL) + pgf_morpho_iter(seq->idx, callback, err); + + i++; + } + } + } } typedef struct { @@ -225,8 +287,8 @@ pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot, } else { ptrdiff_t len = current.ptr - spot->ptr; - if (min <= len-1) - pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len-1); + if (min <= len) + pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len); if (seq->idx != NULL && gu_buf_length(seq->idx) > 0) { PgfCohortRange* range = gu_buf_insert(state->found, 0); @@ -242,8 +304,8 @@ pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot, gu_buf_heap_push(state->spots, pgf_cohort_spot_order, ¤t); - if (len+1 <= max) - pgf_lookup_cohorts_helper(state, spot, k+1, j, len+1, max); + if (len <= max) + pgf_lookup_cohorts_helper(state, spot, k+1, j, len, max); break; } @@ -289,11 +351,13 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool) pRes->buf = NULL; state->current = NULL; return; - } else { + } else do { *pRes = gu_buf_pop(state->found, PgfCohortRange); state->current = pRes->start.ptr; pgf_morpho_iter(pRes->buf, state->callback, state->err); - } + } while (gu_buf_length(state->found) > 0 && + gu_buf_index_last(state->found, PgfCohortRange)->end.ptr == pRes->end.ptr); + } PGF_API GuEnum* @@ -309,9 +373,6 @@ pgf_lookup_cohorts(PgfConcr *concr, GuString sentence, } } - bool case_sensitive = - (gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL); - PgfCohortsState* state = gu_new(PgfCohortsState, pool); state->en.next = pgf_lookup_cohorts_enum_next; state->concr = concr; @@ -319,7 +380,7 @@ pgf_lookup_cohorts(PgfConcr *concr, GuString sentence, state->len = strlen(sentence); state->callback= callback; state->err = err; - state->case_sensitive = case_sensitive; + state->case_sensitive = pgf_is_case_sensitive(concr); state->spots = gu_new_buf(PgfCohortSpot, pool); state->found = gu_new_buf(PgfCohortRange, pool); @@ -339,6 +400,7 @@ typedef struct { PgfSequences* sequences; GuString prefix; size_t seq_idx; + bool case_sensitive; } PgfFullFormState; struct PgfFullFormEntry { @@ -358,7 +420,8 @@ gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool) PgfSequence* seq = gu_seq_index(st->sequences, PgfSequence, st->seq_idx); GuString tokens = pgf_get_tokens(seq->syms, 0, pool); - if (!gu_string_is_prefix(st->prefix, tokens)) { + PgfCohortSpot spot = {0, st->prefix}; + if (cmp_string(&spot, tokens, st->case_sensitive) > 0 || *spot.ptr != 0) { st->seq_idx = n_seqs; break; } @@ -387,6 +450,7 @@ pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool) st->sequences = concr->sequences; st->prefix = ""; st->seq_idx = 0; + st->case_sensitive = true; return &st->en; } @@ -420,15 +484,32 @@ pgf_lookup_word_prefix(PgfConcr *concr, GuString prefix, state->sequences = concr->sequences; state->prefix = prefix; state->seq_idx = 0; + state->case_sensitive = pgf_is_case_sensitive(concr); - bool case_sensitive = - (gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL); - - PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, case_sensitive }; + PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, + state->case_sensitive }; if (!gu_seq_binsearch_index(concr->sequences, &order.order, PgfSequence, (void*) prefix, &state->seq_idx)) { state->seq_idx++; + } else if (!state->case_sensitive) { + /* If the match is case-insensitive then there might be more + * matches around the current index. Since we scroll down + * anyway, it is enough to search upwards now. + */ + + while (state->seq_idx > 0) { + PgfSequence* seq = + gu_seq_index(concr->sequences, PgfSequence, state->seq_idx-1); + + size_t sym_idx = 0; + PgfCohortSpot spot = {0, state->prefix}; + if (pgf_symbols_cmp(&spot, seq->syms, &sym_idx, state->case_sensitive) > 0 || *spot.ptr != 0) { + break; + } + + state->seq_idx--; + } } return &state->en;