fully supported case-insensitive parsing/lookup

2019-06-30 08:48:23 +02:00
parent b56591c6b6
commit 32379a8d11
4 changed files with 247 additions and 41 deletions
--- a/gf.cabal
+++ b/gf.cabal
@@ -81,7 +81,8 @@ Library
                 random,
                 pretty,
                 mtl,
-                 exceptions
+                 exceptions,
+                 ghc-prim
  hs-source-dirs: src/runtime/haskell

  other-modules:
--- a/src/compiler/GF/Compile/GrammarToPGF.hs
+++ b/src/compiler/GF/Compile/GrammarToPGF.hs
@@ -1,4 +1,4 @@
-{-# LANGUAGE BangPatterns, FlexibleContexts #-}
+{-# LANGUAGE BangPatterns, FlexibleContexts, MagicHash #-}
 module GF.Compile.GrammarToPGF (mkCanon2pgf) where

 --import GF.Compile.Export
@@ -30,6 +30,10 @@ import qualified Data.Map as Map
 import qualified Data.IntMap as IntMap
 import Data.Array.IArray

+import Data.Char
+import GHC.Prim
+import GHC.Base(getTag)
+
 mkCanon2pgf :: Options -> SourceGrammar -> ModuleName -> IOE D.PGF
 mkCanon2pgf opts gr am = do
  (an,abs) <- mkAbstr am
@@ -59,7 +63,9 @@ mkCanon2pgf opts gr am = do
              [(0,i2i f) | ((m,f),AbsFun (Just (L _ ty)) _ _ (Just True)) <- adefs, snd (GM.valCat ty) == cat]

    mkConcr cm = do
-      let cflags  = err (const noOptions) mflags (lookupModule gr cm)
+      let cflags = err (const noOptions) mflags (lookupModule gr cm)
+          ciCmp | flag optCaseSensitive cflags = compare
+                | otherwise                    = compareCaseInsensitve

      (ex_seqs,cdefs) <- addMissingPMCFGs
                            Map.empty 
@@ -68,15 +74,15 @@ mkCanon2pgf opts gr am = do

      let flags = Map.fromList [(mkCId f,x) | (f,x) <- optionsPGF cflags]

-          seqs = (mkSetArray . Set.fromList . concat) $
+          seqs = (mkArray . sortNubBy ciCmp . concat) $
                     (Map.keys ex_seqs : [maybe [] elems (mseqs mi) | (m,mi) <- allExtends gr cm])

          ex_seqs_arr = mkMapArray ex_seqs :: Array SeqId Sequence

          !(!fid_cnt1,!cnccats) = genCncCats gr am cm cdefs
          !(!fid_cnt2,!productions,!lindefs,!linrefs,!cncfuns)
-                                = genCncFuns gr am cm ex_seqs_arr seqs cdefs fid_cnt1 cnccats
-        
+                                = genCncFuns gr am cm ex_seqs_arr ciCmp seqs cdefs fid_cnt1 cnccats
+
          printnames = genPrintNames cdefs
      return (mi2i cm, D.Concr flags
                              printnames
@@ -186,6 +192,7 @@ genCncFuns :: Grammar
           -> ModuleName
           -> ModuleName
           -> Array SeqId Sequence
+           -> (Sequence -> Sequence -> Ordering)
           -> Array SeqId Sequence
           -> [(QIdent, Info)]
           -> FId
@@ -195,7 +202,7 @@ genCncFuns :: Grammar
               IntMap.IntMap [FunId],
               IntMap.IntMap [FunId],
               Array FunId D.CncFun)
-genCncFuns gr am cm ex_seqs seqs cdefs fid_cnt cnccats =
+genCncFuns gr am cm ex_seqs ciCmp seqs cdefs fid_cnt cnccats =
  let (fid_cnt1,funs_cnt1,funs1,lindefs,linrefs) = mkCncCats cdefs fid_cnt  0 [] IntMap.empty IntMap.empty
      (fid_cnt2,funs_cnt2,funs2,prods)           = mkCncFuns cdefs fid_cnt1 funs_cnt1 funs1 lindefs Map.empty IntMap.empty
  in (fid_cnt2,prods,lindefs,linrefs,array (0,funs_cnt2-1) funs2)
@@ -282,9 +289,9 @@ genCncFuns gr am cm ex_seqs seqs cdefs fid_cnt cnccats =
      in (offs+funid0,C.CncFun (i2i id) (amap (newIndex mseqs) lins0)):funs                                          
      where
        newIndex mseqs i = binSearch (mseqs ! i) seqs (bounds seqs)
-           
+
        binSearch v arr (i,j)
-          | i <= j    = case compare v (arr ! k) of
+          | i <= j    = case ciCmp v (arr ! k) of
                          LT -> binSearch v arr (i,k-1)
                          EQ -> k
                          GT -> binSearch v arr (k+1,j)
@@ -303,6 +310,121 @@ genPrintNames cdefs =
    flatten (Alts x _) = flatten x
    flatten (C x y)    = flatten x +++ flatten y

--mkArray    lst = listArray (0,length lst-1) lst
+mkArray    lst = listArray (0,length lst-1) lst
 mkMapArray map = array (0,Map.size map-1) [(v,k) | (k,v) <- Map.toList map]
-mkSetArray set = listArray (0,Set.size set-1) [v | v <- Set.toList set]
+
+-- The following is a version of Data.List.sortBy which together
+-- with the sorting also eliminates duplicate values 
+sortNubBy cmp = mergeAll . sequences
+  where
+    sequences (a:b:xs) =
+      case cmp a b of
+        GT -> descending b [a]  xs
+        EQ -> sequences (b:xs)
+        LT -> ascending  b (a:) xs
+    sequences xs = [xs]
+
+    descending a as []     = [a:as]
+    descending a as (b:bs) =
+      case cmp a b of
+        GT -> descending b (a:as) bs
+        EQ -> descending a as bs
+        LT -> (a:as) : sequences (b:bs)
+
+    ascending a as []     = let !x = as [a]
+                            in [x]
+    ascending a as (b:bs) =
+      case cmp a b of
+        GT -> let !x = as [a]
+              in x : sequences (b:bs)
+        EQ -> ascending a as bs
+        LT -> ascending b (\ys -> as (a:ys)) bs
+
+    mergeAll [x] = x
+    mergeAll xs  = mergeAll (mergePairs xs)
+
+    mergePairs (a:b:xs) = let !x = merge a b
+                          in x : mergePairs xs
+    mergePairs xs       = xs
+
+    merge as@(a:as') bs@(b:bs') =
+      case cmp a b of
+        GT -> b:merge as  bs'
+        EQ -> a:merge as' bs'
+        LT -> a:merge as' bs
+    merge [] bs         = bs
+    merge as []         = as
+
+-- The following function does case-insensitive comparison of sequences.
+-- This is used to allow case-insensitive parsing, while
+-- the linearizer still has access to the original cases.
+compareCaseInsensitve s1 s2 =
+    compareSeq (elems s1) (elems s2)
+    where
+      compareSeq []     []     = EQ
+      compareSeq []     _      = LT
+      compareSeq _      []     = GT
+      compareSeq (x:xs) (y:ys) =
+        case compareSym x y of
+          EQ -> compareSeq xs ys
+          x  -> x
+
+      compareSym s1 s2 =
+        case s1 of
+          D.SymCat d1 r1
+            -> case s2 of
+                 D.SymCat d2 r2
+                   -> case compare d1 d2 of
+                        EQ -> r1 `compare` r2
+                        x  -> x
+                 _ -> LT
+          D.SymLit d1 r1
+            -> case s2 of
+                 D.SymCat {} -> GT
+                 D.SymLit d2 r2
+                   -> case compare d1 d2 of
+                        EQ -> r1 `compare` r2
+                        x  -> x
+                 _ -> LT
+          D.SymVar d1 r1
+            -> if tagToEnum# (getTag s2 ># 2#)
+                 then LT
+                 else case s2 of
+                        D.SymVar d2 r2
+                          -> case compare d1 d2 of
+                               EQ -> r1 `compare` r2
+                               x  -> x
+                        _ -> GT
+          D.SymKS t1
+            -> if tagToEnum# (getTag s2 ># 3#)
+                 then LT
+                 else case s2 of
+                        D.SymKS t2 -> t1 `compareToken` t2
+                        _          -> GT
+          D.SymKP a1 b1
+            -> if tagToEnum# (getTag s2 ># 4#)
+                 then LT
+                 else case s2 of
+                        D.SymKP a2 b2
+                          -> case compare a1 a2 of
+                               EQ -> b1 `compare` b2
+                               x  -> x
+                        _ -> GT
+          _ -> let t1 = getTag s1
+                   t2 = getTag s2
+               in if tagToEnum# (t1 <# t2) 
+                    then LT
+                    else if tagToEnum# (t1 ==# t2)
+                           then EQ
+                           else GT
+  
+      compareToken []     []     = EQ
+      compareToken []     _      = LT
+      compareToken _      []     = GT
+      compareToken (x:xs) (y:ys)
+        | x == y    = compareToken xs ys
+        | otherwise = case compare (toLower x) (toLower y) of
+                        EQ -> case compareToken xs ys of
+                                EQ -> compare x y
+                                x  -> x
+                        x  -> x
--- a/src/runtime/c/pgf/parser.c
+++ b/src/runtime/c/pgf/parser.c
@@ -1078,8 +1078,8 @@ pgf_parsing_scan_helper(PgfParsing *ps, PgfParseState* state,
 			ptrdiff_t len = current.ptr - start.ptr;
 			found = true;

-			if (min <= len-1)
-				pgf_parsing_scan_helper(ps, state, i, k-1, min, len-1);
+			if (min <= len)
+				pgf_parsing_scan_helper(ps, state, i, k-1, min, len);

 			// Here we do bottom-up prediction for all lexical categories. 
 			// The epsilon productions will be predicted in top-down
@@ -1141,8 +1141,8 @@ pgf_parsing_scan_helper(PgfParsing *ps, PgfParseState* state,
 				}
 			}

-			if (len+1 <= max)
-				pgf_parsing_scan_helper(ps, state, k+1, j, len+1, max);
+			if (len <= max)
+				pgf_parsing_scan_helper(ps, state, k+1, j, len, max);

 			break;
 		}
@@ -1633,6 +1633,9 @@ pgf_parsing_set_default_factors(PgfParsing* ps, PgfAbstr* abstr)
 	}
 }

+PGF_INTERNAL_DECL bool
+pgf_is_case_sensitive(PgfConcr* concr);
+
 static PgfParsing*
 pgf_new_parsing(PgfConcr* concr, GuString sentence,
                PgfCallbacksMap* callbacks, PgfOracleCallback* oracle,
@@ -1643,8 +1646,7 @@ pgf_new_parsing(PgfConcr* concr, GuString sentence,
 	ps->pool = pool;
 	ps->out_pool = out_pool;
 	ps->sentence = sentence;
-	ps->case_sensitive =
-		(gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL);
+	ps->case_sensitive = pgf_is_case_sensitive(concr);
 	ps->expr_queue = gu_new_buf(PgfExprState*, pool);
 	ps->max_fid = concr->total_cats;
 	ps->before = NULL;
--- a/src/runtime/c/pgf/scanner.c
+++ b/src/runtime/c/pgf/scanner.c
@@ -1,4 +1,5 @@
 #include <pgf/data.h>
+#include <pgf/expr.h>
 #include <pgf/linearizer.h>
 #include <gu/utf8.h>

@@ -16,8 +17,10 @@ cmp_string(PgfCohortSpot* spot, GuString tok,
 		if (c1 == 0)
 			return -1;

-		if (!case_sensitive)
+		if (!case_sensitive) {
 			c1 = gu_ucs_to_lower(c1);
+			c2 = gu_ucs_to_lower(c2);
+		}

 		if (c1 != c2)
 			return (c1-c2);
@@ -126,6 +129,22 @@ typedef struct {
 	bool case_sensitive;
 } PgfSequenceOrder;

+PGF_INTERNAL bool
+pgf_is_case_sensitive(PgfConcr* concr)
+{
+	PgfFlag* flag =
+		gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive");
+	if (flag != NULL) {
+		GuVariantInfo inf = gu_variant_open(flag->value);
+		if (inf.tag == PGF_LITERAL_STR) {
+			PgfLiteralStr* lstr = inf.data;
+			if (strcmp(lstr->val, "off") == 0)
+				return false;
+		}
+	}
+	return true;
+}
+
 static int
 pgf_sequence_cmp_fn(GuOrder* order, const void* p1, const void* p2)
 {
@@ -156,16 +175,59 @@ pgf_lookup_morpho(PgfConcr *concr, GuString sentence,
 		}
 	}

-	bool case_sensitive =
-		(gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL);
+	size_t index = 0;
+	PgfSequenceOrder order = { { pgf_sequence_cmp_fn },
+		                       pgf_is_case_sensitive(concr) };
+	if (gu_seq_binsearch_index(concr->sequences, &order.order,
+	                           PgfSequence, (void*) sentence, 
+	                           &index)) {
+		PgfSequence* seq = NULL;

-	PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, case_sensitive };
-	PgfSequence* seq = (PgfSequence*)
-		gu_seq_binsearch(concr->sequences, &order.order,
-		                 PgfSequence, (void*) sentence);
+		/* If the match is case-insensitive then there might be more
+		 * matches around the current index. We must check the neighbour
+		 * sequences for matching as well.
+		 */

-	if (seq != NULL && seq->idx != NULL)
-		pgf_morpho_iter(seq->idx, callback, err);
+		if (!order.case_sensitive) {
+			size_t i = index;
+			while (i > 0) {
+				seq = gu_seq_index(concr->sequences, PgfSequence, i-1);
+
+				size_t sym_idx = 0;
+				PgfCohortSpot spot = {0, sentence};
+				if (pgf_symbols_cmp(&spot, seq->syms, &sym_idx, order.case_sensitive) != 0) {
+					break;
+				}
+
+				if (seq->idx != NULL)
+					pgf_morpho_iter(seq->idx, callback, err);
+
+				i--;
+			}
+		}
+
+		seq = gu_seq_index(concr->sequences, PgfSequence, index);
+		if (seq->idx != NULL)
+			pgf_morpho_iter(seq->idx, callback, err);
+
+		if (!order.case_sensitive) {
+			size_t i = index+1;
+			while (i < gu_seq_length(concr->sequences)) {
+				seq = gu_seq_index(concr->sequences, PgfSequence, i);
+
+				size_t sym_idx = 0;
+				PgfCohortSpot spot = {0, sentence};
+				if (pgf_symbols_cmp(&spot, seq->syms, &sym_idx, order.case_sensitive) != 0) {
+					break;
+				}
+
+				if (seq->idx != NULL)
+					pgf_morpho_iter(seq->idx, callback, err);
+
+				i++;
+			}
+		}
+	}
 }

 typedef struct {
@@ -225,8 +287,8 @@ pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot,
 		} else {
 			ptrdiff_t len = current.ptr - spot->ptr;

-			if (min <= len-1)
-				pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len-1);
+			if (min <= len)
+				pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len);

 			if (seq->idx != NULL && gu_buf_length(seq->idx) > 0) {
 				PgfCohortRange* range = gu_buf_insert(state->found, 0);
@@ -242,8 +304,8 @@ pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot,

 			gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &current);

-			if (len+1 <= max)
-				pgf_lookup_cohorts_helper(state, spot, k+1, j, len+1, max);
+			if (len <= max)
+				pgf_lookup_cohorts_helper(state, spot, k+1, j, len, max);

 			break;
 		}
@@ -289,11 +351,13 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool)
 		pRes->buf       = NULL;
 		state->current  = NULL;
 		return;
-	} else {
+	} else do {
 		*pRes = gu_buf_pop(state->found, PgfCohortRange);
 		state->current = pRes->start.ptr;
 		pgf_morpho_iter(pRes->buf, state->callback, state->err);
-	}
+	} while (gu_buf_length(state->found) > 0 &&
+	         gu_buf_index_last(state->found, PgfCohortRange)->end.ptr == pRes->end.ptr);
+	         
 }

 PGF_API GuEnum*
@@ -309,9 +373,6 @@ pgf_lookup_cohorts(PgfConcr *concr, GuString sentence,
 		}
 	}

-	bool case_sensitive =
-		(gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL);
-
 	PgfCohortsState* state = gu_new(PgfCohortsState, pool);
 	state->en.next = pgf_lookup_cohorts_enum_next;
 	state->concr   = concr;
@@ -319,7 +380,7 @@ pgf_lookup_cohorts(PgfConcr *concr, GuString sentence,
 	state->len     = strlen(sentence);
 	state->callback= callback;
 	state->err     = err;
-	state->case_sensitive = case_sensitive;
+	state->case_sensitive = pgf_is_case_sensitive(concr);
 	state->spots   = gu_new_buf(PgfCohortSpot,  pool);
 	state->found   = gu_new_buf(PgfCohortRange, pool);

@@ -339,6 +400,7 @@ typedef struct {
 	PgfSequences* sequences;
 	GuString prefix;
 	size_t seq_idx;
+	bool case_sensitive;
 } PgfFullFormState;

 struct PgfFullFormEntry {
@@ -358,7 +420,8 @@ gu_fullform_enum_next(GuEnum* self, void* to, GuPool* pool)
 			PgfSequence* seq = gu_seq_index(st->sequences, PgfSequence, st->seq_idx);
 			GuString tokens = pgf_get_tokens(seq->syms, 0, pool);

-			if (!gu_string_is_prefix(st->prefix, tokens)) {
+			PgfCohortSpot spot = {0, st->prefix};
+			if (cmp_string(&spot, tokens, st->case_sensitive) > 0 || *spot.ptr != 0) {
 				st->seq_idx = n_seqs;
 				break;
 			}
@@ -387,6 +450,7 @@ pgf_fullform_lexicon(PgfConcr *concr, GuPool* pool)
 	st->sequences = concr->sequences;
 	st->prefix    = "";
 	st->seq_idx   = 0;
+	st->case_sensitive = true;
 	return &st->en;
 }

@@ -420,15 +484,32 @@ pgf_lookup_word_prefix(PgfConcr *concr, GuString prefix,
 	state->sequences = concr->sequences;
 	state->prefix    = prefix;
 	state->seq_idx   = 0;
+	state->case_sensitive = pgf_is_case_sensitive(concr);

-	bool case_sensitive =
-		(gu_seq_binsearch(concr->cflags, pgf_flag_order, PgfFlag, "case_sensitive") == NULL);
-
-	PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, case_sensitive };
+	PgfSequenceOrder order = { { pgf_sequence_cmp_fn }, 
+		                       state->case_sensitive };
 	if (!gu_seq_binsearch_index(concr->sequences, &order.order,
 	                            PgfSequence, (void*) prefix, 
 	                            &state->seq_idx)) {
 		state->seq_idx++;
+	} else if (!state->case_sensitive) {
+		/* If the match is case-insensitive then there might be more
+		 * matches around the current index. Since we scroll down 
+		 * anyway, it is enough to search upwards now.
+		 */
+
+		while (state->seq_idx > 0) {
+			PgfSequence* seq =
+				gu_seq_index(concr->sequences, PgfSequence, state->seq_idx-1);
+
+			size_t sym_idx = 0;
+			PgfCohortSpot spot = {0, state->prefix};
+			if (pgf_symbols_cmp(&spot, seq->syms, &sym_idx, state->case_sensitive) > 0 || *spot.ptr != 0) {
+				break;
+			}
+
+			state->seq_idx--;
+		}
 	}

 	return &state->en;