1
0
forked from GitHub/gf-core

lookupCohorts now detects and reports unknown words. Also:

- added two filtering functions: filterLongest and filterBest
- updated the PGF service to work with the new API
This commit is contained in:
krangelov
2020-05-14 15:03:30 +02:00
parent 57a1ea5b56
commit 62bc78380e
4 changed files with 165 additions and 58 deletions

View File

@@ -171,8 +171,8 @@ pgf_lookup_morpho(PgfConcr *concr, GuString sentence,
PgfMorphoCallback* callback, GuExn* err);
typedef struct {
size_t pos;
GuString ptr;
size_t pos; // position in Unicode characters
GuString ptr; // pointer into the string
} PgfCohortSpot;
typedef struct {

View File

@@ -233,12 +233,13 @@ typedef struct {
GuEnum en;
PgfConcr* concr;
GuString sentence;
GuString current;
size_t len;
PgfMorphoCallback* callback;
GuExn* err;
bool case_sensitive;
GuBuf* spots;
GuBuf* skip_spots;
GuBuf* empty_buf;
GuBuf* found;
} PgfCohortsState;
@@ -254,6 +255,29 @@ cmp_cohort_spot(GuOrder* self, const void* a, const void* b)
static GuOrder
pgf_cohort_spot_order[1] = {{ cmp_cohort_spot }};
static void
pgf_lookup_cohorts_report_skip(PgfCohortsState *state,
PgfCohortSpot* spot, GuString msg)
{
PgfCohortSpot end_spot = *spot;
while (gu_ucs_is_space(*(end_spot.ptr-1))) {
end_spot.pos--;
end_spot.ptr--;
}
size_t n_spots = gu_buf_length(state->skip_spots);
for (size_t i = 0; i < n_spots; i++) {
PgfCohortSpot* skip_spot =
gu_buf_index(state->skip_spots, PgfCohortSpot, i);
PgfCohortRange* range = gu_buf_insert(state->found, 0);
range->start = *skip_spot;
range->end = end_spot;
range->buf = state->empty_buf;
}
gu_buf_flush(state->skip_spots);
}
static void
pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot,
int i, int j, ptrdiff_t min, ptrdiff_t max)
@@ -290,18 +314,23 @@ pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot,
pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len);
if (seq->idx != NULL && gu_buf_length(seq->idx) > 0) {
// Report unknown words
pgf_lookup_cohorts_report_skip(state, spot, "a");
// Report the actual hit
PgfCohortRange* range = gu_buf_insert(state->found, 0);
range->start = *spot;
range->end = current;
range->buf = seq->idx;
}
while (*current.ptr != 0) {
if (!skip_space(&current.ptr, &current.pos))
break;
}
// Schedule the next search spot
while (*current.ptr != 0) {
if (!skip_space(&current.ptr, &current.pos))
break;
}
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &current);
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &current);
}
if (len <= max)
pgf_lookup_cohorts_helper(state, spot, k+1, j, len, max);
@@ -317,29 +346,45 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool)
PgfCohortsState* state = gu_container(self, PgfCohortsState, en);
while (gu_buf_length(state->found) == 0 &&
gu_buf_length(state->spots) > 0) {
gu_buf_length(state->spots) > 0) {
PgfCohortSpot spot;
gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot);
if (spot.ptr == state->current)
continue;
GuString next_ptr = state->sentence+state->len;
while (gu_buf_length(state->spots) > 0) {
GuString ptr =
gu_buf_index(state->spots, PgfCohortSpot, 0)->ptr;
if (ptr > spot.ptr) {
next_ptr = ptr;
break;
}
gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot);
}
if (*spot.ptr == 0)
break;
bool needs_report = true;
while (next_ptr > spot.ptr) {
pgf_lookup_cohorts_helper
(state, &spot,
0, gu_seq_length(state->concr->sequences)-1,
1, (state->sentence+state->len)-spot.ptr);
if (gu_buf_length(state->found) > 0)
break;
if (needs_report) {
gu_buf_push(state->skip_spots, PgfCohortSpot, spot);
needs_report = false;
}
pgf_lookup_cohorts_helper
(state, &spot,
0, gu_seq_length(state->concr->sequences)-1,
1, (state->sentence+state->len)-spot.ptr);
if (gu_buf_length(state->found) == 0) {
// skip one character and try again
gu_utf8_decode((const uint8_t**) &spot.ptr);
spot.pos++;
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot);
}
}
PgfCohortSpot end_spot = {state->len, state->sentence+state->len};
pgf_lookup_cohorts_report_skip(state, &end_spot, "b");
PgfCohortRange* pRes = (PgfCohortRange*)to;
if (gu_buf_length(state->found) == 0) {
@@ -348,15 +393,11 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool)
pRes->end.pos = 0;
pRes->end.ptr = NULL;
pRes->buf = NULL;
state->current = NULL;
return;
} else do {
*pRes = gu_buf_pop(state->found, PgfCohortRange);
state->current = pRes->start.ptr;
pgf_morpho_iter(pRes->buf, state->callback, state->err);
} while (gu_buf_length(state->found) > 0 &&
gu_buf_index_last(state->found, PgfCohortRange)->end.ptr == pRes->end.ptr);
}
PGF_API GuEnum*
@@ -373,15 +414,17 @@ pgf_lookup_cohorts(PgfConcr *concr, GuString sentence,
}
PgfCohortsState* state = gu_new(PgfCohortsState, pool);
state->en.next = pgf_lookup_cohorts_enum_next;
state->concr = concr;
state->sentence= sentence;
state->len = strlen(sentence);
state->callback= callback;
state->err = err;
state->case_sensitive = pgf_is_case_sensitive(concr);
state->spots = gu_new_buf(PgfCohortSpot, pool);
state->found = gu_new_buf(PgfCohortRange, pool);
state->en.next = pgf_lookup_cohorts_enum_next;
state->concr = concr;
state->sentence = sentence;
state->len = strlen(sentence);
state->callback = callback;
state->err = err;
state->case_sensitive= pgf_is_case_sensitive(concr);
state->spots = gu_new_buf(PgfCohortSpot, pool);
state->skip_spots = gu_new_buf(PgfCohortSpot, pool);
state->empty_buf = gu_new_buf(PgfProductionIdxEntry, pool);
state->found = gu_new_buf(PgfCohortRange, pool);
PgfCohortSpot spot = {0,sentence};
while (*spot.ptr != 0) {

View File

@@ -73,6 +73,7 @@ module PGF2 (-- * PGF
generateAll,
-- ** Morphological Analysis
MorphoAnalysis, lookupMorpho, lookupCohorts, fullFormLexicon,
filterBest, filterLongest,
-- ** Visualizations
GraphvizOptions(..), graphvizDefaults,
graphvizAbstractTree, graphvizParseTree, graphvizWordAlignment,
@@ -99,11 +100,11 @@ import Foreign.C
import Data.Typeable
import qualified Data.Map as Map
import Data.IORef
import Data.Char(isUpper,isSpace)
import Data.Char(isUpper,isSpace,isPunctuation)
import Data.List(isSuffixOf,maximumBy,nub)
import Data.Function(on)
import Data.Maybe(maybe)
-----------------------------------------------------------------------
-- Functions that take a PGF.
-- PGF has many Concrs.
@@ -506,7 +507,7 @@ lookupMorpho (Concr concr master) sent =
-- The list is sorted first by the @start@ position and after than
-- by the @end@ position. This can be used for instance if you want to
-- filter only the longest matches.
lookupCohorts :: Concr -> String -> [(Int,[MorphoAnalysis],Int)]
lookupCohorts :: Concr -> String -> [(Int,String,[MorphoAnalysis],Int)]
lookupCohorts lang@(Concr concr master) sent =
unsafePerformIO $
do pl <- gu_new_pool
@@ -517,9 +518,9 @@ lookupCohorts lang@(Concr concr master) sent =
c_sent <- newUtf8CString sent pl
enum <- pgf_lookup_cohorts concr c_sent cback pl nullPtr
fpl <- newForeignPtr gu_pool_finalizer pl
fromCohortRange enum fpl fptr ref
fromCohortRange enum fpl fptr 0 sent ref
where
fromCohortRange enum fpl fptr ref =
fromCohortRange enum fpl fptr i sent ref =
allocaBytes (#size PgfCohortRange) $ \ptr ->
withForeignPtr fpl $ \pl ->
do gu_enum_next enum ptr pl
@@ -533,8 +534,80 @@ lookupCohorts lang@(Concr concr master) sent =
end <- (#peek PgfCohortRange, end.pos) ptr
ans <- readIORef ref
writeIORef ref []
cohs <- unsafeInterleaveIO (fromCohortRange enum fpl fptr ref)
return ((start,ans,end):cohs)
let sent' = drop (start-i) sent
tok = take (end-start) sent'
cohs <- unsafeInterleaveIO (fromCohortRange enum fpl fptr start sent' ref)
return ((start,tok,ans,end):cohs)
filterBest :: [(Int,String,[MorphoAnalysis],Int)] -> [(Int,String,[MorphoAnalysis],Int)]
filterBest ans =
reverse (iterate (maxBound :: Int) [(0,0,[],ans)] [] [])
where
iterate v0 [] [] res = res
iterate v0 [] new res = iterate v0 new [] res
iterate v0 ((_,v,conf, []):old) new res =
case compare v0 v of
LT -> res
EQ -> iterate v0 old new (merge conf res)
GT -> iterate v old new conf
iterate v0 ((_,v,conf,an:ans):old) new res = iterate v0 old (insert (v+valueOf an) conf an ans [] new) res
valueOf (_,_,[],_) = 2
valueOf _ = 1
insert v conf an@(start,_,_,end) ans l_new [] =
match start v conf ans ((end,v,comb conf an,filter end ans):l_new) []
insert v conf an@(start,_,_,end) ans l_new (new@(end0,v0,conf0,ans0):r_new) =
case compare end0 end of
LT -> insert v conf an ans (new:l_new) r_new
EQ -> case compare v0 v of
LT -> match start v conf ans ((end,v, conf0,ans0): l_new) r_new
EQ -> match start v conf ans ((end,v,merge (comb conf an) conf0,ans0): l_new) r_new
GT -> match start v conf ans ((end,v,comb conf an, ans0): l_new) r_new
GT -> match start v conf ans ((end,v,comb conf an, filter end ans):new:l_new) r_new
match start0 v conf (an@(start,_,_,end):ans) l_new r_new
| start0 == start = insert v conf an ans l_new r_new
match start0 v conf ans l_new r_new = revOn l_new r_new
comb ((start0,w0,an0,end0):conf) (start,w,an,end)
| end0 == start && (unk w0 an0 || unk w an) = (start0,w0++w,[],end):conf
comb conf an = an:conf
filter end [] = []
filter end (next@(start,_,_,_):ans)
| end <= start = next:ans
| otherwise = filter end ans
revOn [] ys = ys
revOn (x:xs) ys = revOn xs (x:ys)
merge [] ans = ans
merge ans [] = ans
merge (an1@(start1,_,_,end1):ans1) (an2@(start2,_,_,end2):ans2) =
case compare (start1,end1) (start2,end2) of
GT -> an1 : merge ans1 (an2:ans2)
EQ -> an1 : merge ans1 ans2
LT -> an2 : merge (an1:ans1) ans2
filterLongest :: [(Int,String,[MorphoAnalysis],Int)] -> [(Int,String,[MorphoAnalysis],Int)]
filterLongest [] = []
filterLongest (an:ans) = longest an ans
where
longest prev [] = [prev]
longest prev@(start0,_,_,end0) (next@(start,_,_,end):ans)
| start0 == start = longest next ans
| otherwise = filter prev (next:ans)
filter prev [] = [prev]
filter prev@(start0,w0,an0,end0) (next@(start,w,an,end):ans)
| end0 == start && (unk w0 an0 || unk w an)
= filter (start0,w0++w,[],end) ans
| end0 <= start = prev : longest next ans
| otherwise = filter prev ans
unk w [] | any (not . isPunctuation) w = True
unk _ _ = False
fullFormLexicon :: Concr -> [(String, [MorphoAnalysis])]
fullFormLexicon lang =

View File

@@ -159,7 +159,7 @@ cpgfMain qsem command (t,(pgf,pc)) =
"c-translate" -> withQSem qsem $
out t=<<join(trans # input % to % start % limit%treeopts)
"c-lookupmorpho"-> out t=<< morpho # from1 % textInput
"c-lookupcohorts"->out t=<< cohorts # from1 % getInput "longest" % textInput
"c-lookupcohorts"->out t=<< cohorts # from1 % getInput "filter" % textInput
"c-flush" -> out t=<< flush
"c-grammar" -> out t grammar
"c-abstrtree" -> outputGraphviz=<< C.graphvizAbstractTree pgf C.graphvizDefaults # tree
@@ -251,29 +251,20 @@ cpgfMain qsem command (t,(pgf,pc)) =
,"prob".=p]
| (l,a,p)<-C.lookupMorpho concr input]
cohorts (from,concr) longest input =
cohorts (from,concr) filter input =
showJSON [makeObj ["start" .=showJSON s
,"word" .=showJSON w
,"morpho".=showJSON [makeObj ["lemma".=l
,"analysis".=a
,"prob".=p]
| (l,a,p)<-ms]
,"end" .=showJSON e
]
| (s,ms,e) <- (if longest==Just "true" then filterLongest else id)
(C.lookupCohorts concr input)]
where
filterLongest [] = []
filterLongest (an:ans) = longest an ans
where
longest prev [] = [prev]
longest prev@(start0,_,end0) (next@(start,an,end):ans)
| start0 == start = longest next ans
| otherwise = prev : filter end0 (next:ans)
filter end [] = []
filter end (next@(start,_,_):ans)
| end <= start = filterLongest (next:ans)
| otherwise = filter end ans
| (s,w,ms,e) <- (case filter of
Just "longest" -> C.filterLongest
Just "best" -> C.filterBest
_ -> id)
(C.lookupCohorts concr input)]
wordforword input@((from,_),_) = jsonWFW from . wordforword' input