From 62bc78380e69af2de3253130204fc45bac00f3f0 Mon Sep 17 00:00:00 2001 From: krangelov Date: Thu, 14 May 2020 15:03:30 +0200 Subject: [PATCH] lookupCohorts now detects and reports unknown words. Also: - added two filtering functions: filterLongest and filterBest - updated the PGF service to work with the new API --- src/runtime/c/pgf/pgf.h | 4 +- src/runtime/c/pgf/scanner.c | 107 +++++++++++++++++++++--------- src/runtime/haskell-bind/PGF2.hsc | 87 ++++++++++++++++++++++-- src/server/PGFService.hs | 25 +++---- 4 files changed, 165 insertions(+), 58 deletions(-) diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h index 5dbe2e2e1..6ff269e00 100644 --- a/src/runtime/c/pgf/pgf.h +++ b/src/runtime/c/pgf/pgf.h @@ -171,8 +171,8 @@ pgf_lookup_morpho(PgfConcr *concr, GuString sentence, PgfMorphoCallback* callback, GuExn* err); typedef struct { - size_t pos; - GuString ptr; + size_t pos; // position in Unicode characters + GuString ptr; // pointer into the string } PgfCohortSpot; typedef struct { diff --git a/src/runtime/c/pgf/scanner.c b/src/runtime/c/pgf/scanner.c index ad3605edc..0b2f9680f 100644 --- a/src/runtime/c/pgf/scanner.c +++ b/src/runtime/c/pgf/scanner.c @@ -233,12 +233,13 @@ typedef struct { GuEnum en; PgfConcr* concr; GuString sentence; - GuString current; size_t len; PgfMorphoCallback* callback; GuExn* err; bool case_sensitive; GuBuf* spots; + GuBuf* skip_spots; + GuBuf* empty_buf; GuBuf* found; } PgfCohortsState; @@ -254,6 +255,29 @@ cmp_cohort_spot(GuOrder* self, const void* a, const void* b) static GuOrder pgf_cohort_spot_order[1] = {{ cmp_cohort_spot }}; +static void +pgf_lookup_cohorts_report_skip(PgfCohortsState *state, + PgfCohortSpot* spot, GuString msg) +{ + PgfCohortSpot end_spot = *spot; + while (gu_ucs_is_space(*(end_spot.ptr-1))) { + end_spot.pos--; + end_spot.ptr--; + } + + size_t n_spots = gu_buf_length(state->skip_spots); + for (size_t i = 0; i < n_spots; i++) { + PgfCohortSpot* skip_spot = + gu_buf_index(state->skip_spots, 
PgfCohortSpot, i); + + PgfCohortRange* range = gu_buf_insert(state->found, 0); + range->start = *skip_spot; + range->end = end_spot; + range->buf = state->empty_buf; + } + gu_buf_flush(state->skip_spots); +} + static void pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot, int i, int j, ptrdiff_t min, ptrdiff_t max) @@ -290,18 +314,23 @@ pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot, pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len); if (seq->idx != NULL && gu_buf_length(seq->idx) > 0) { + // Report unknown words + pgf_lookup_cohorts_report_skip(state, spot, "a"); + + // Report the actual hit PgfCohortRange* range = gu_buf_insert(state->found, 0); range->start = *spot; range->end = current; range->buf = seq->idx; - } - while (*current.ptr != 0) { - if (!skip_space(¤t.ptr, ¤t.pos)) - break; - } + // Schedule the next search spot + while (*current.ptr != 0) { + if (!skip_space(¤t.ptr, ¤t.pos)) + break; + } - gu_buf_heap_push(state->spots, pgf_cohort_spot_order, ¤t); + gu_buf_heap_push(state->spots, pgf_cohort_spot_order, ¤t); + } if (len <= max) pgf_lookup_cohorts_helper(state, spot, k+1, j, len, max); @@ -317,29 +346,45 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool) PgfCohortsState* state = gu_container(self, PgfCohortsState, en); while (gu_buf_length(state->found) == 0 && - gu_buf_length(state->spots) > 0) { + gu_buf_length(state->spots) > 0) { PgfCohortSpot spot; gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot); - if (spot.ptr == state->current) - continue; + GuString next_ptr = state->sentence+state->len; + while (gu_buf_length(state->spots) > 0) { + GuString ptr = + gu_buf_index(state->spots, PgfCohortSpot, 0)->ptr; + if (ptr > spot.ptr) { + next_ptr = ptr; + break; + } + gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot); + } - if (*spot.ptr == 0) - break; + bool needs_report = true; + while (next_ptr > spot.ptr) { + pgf_lookup_cohorts_helper + (state, &spot, + 0, 
gu_seq_length(state->concr->sequences)-1, + 1, (state->sentence+state->len)-spot.ptr); + + if (gu_buf_length(state->found) > 0) + break; + + if (needs_report) { + gu_buf_push(state->skip_spots, PgfCohortSpot, spot); + needs_report = false; + } - pgf_lookup_cohorts_helper - (state, &spot, - 0, gu_seq_length(state->concr->sequences)-1, - 1, (state->sentence+state->len)-spot.ptr); - - if (gu_buf_length(state->found) == 0) { // skip one character and try again gu_utf8_decode((const uint8_t**) &spot.ptr); spot.pos++; - gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot); } } + PgfCohortSpot end_spot = {state->len, state->sentence+state->len}; + pgf_lookup_cohorts_report_skip(state, &end_spot, "b"); + PgfCohortRange* pRes = (PgfCohortRange*)to; if (gu_buf_length(state->found) == 0) { @@ -348,15 +393,11 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool) pRes->end.pos = 0; pRes->end.ptr = NULL; pRes->buf = NULL; - state->current = NULL; - return; } else do { *pRes = gu_buf_pop(state->found, PgfCohortRange); - state->current = pRes->start.ptr; pgf_morpho_iter(pRes->buf, state->callback, state->err); } while (gu_buf_length(state->found) > 0 && gu_buf_index_last(state->found, PgfCohortRange)->end.ptr == pRes->end.ptr); - } PGF_API GuEnum* @@ -373,15 +414,17 @@ pgf_lookup_cohorts(PgfConcr *concr, GuString sentence, } PgfCohortsState* state = gu_new(PgfCohortsState, pool); - state->en.next = pgf_lookup_cohorts_enum_next; - state->concr = concr; - state->sentence= sentence; - state->len = strlen(sentence); - state->callback= callback; - state->err = err; - state->case_sensitive = pgf_is_case_sensitive(concr); - state->spots = gu_new_buf(PgfCohortSpot, pool); - state->found = gu_new_buf(PgfCohortRange, pool); + state->en.next = pgf_lookup_cohorts_enum_next; + state->concr = concr; + state->sentence = sentence; + state->len = strlen(sentence); + state->callback = callback; + state->err = err; + state->case_sensitive= pgf_is_case_sensitive(concr); + 
state->spots = gu_new_buf(PgfCohortSpot, pool); + state->skip_spots = gu_new_buf(PgfCohortSpot, pool); + state->empty_buf = gu_new_buf(PgfProductionIdxEntry, pool); + state->found = gu_new_buf(PgfCohortRange, pool); PgfCohortSpot spot = {0,sentence}; while (*spot.ptr != 0) { diff --git a/src/runtime/haskell-bind/PGF2.hsc b/src/runtime/haskell-bind/PGF2.hsc index a84f7511c..4b41a7471 100644 --- a/src/runtime/haskell-bind/PGF2.hsc +++ b/src/runtime/haskell-bind/PGF2.hsc @@ -73,6 +73,7 @@ module PGF2 (-- * PGF generateAll, -- ** Morphological Analysis MorphoAnalysis, lookupMorpho, lookupCohorts, fullFormLexicon, + filterBest, filterLongest, -- ** Visualizations GraphvizOptions(..), graphvizDefaults, graphvizAbstractTree, graphvizParseTree, graphvizWordAlignment, @@ -99,11 +100,11 @@ import Foreign.C import Data.Typeable import qualified Data.Map as Map import Data.IORef -import Data.Char(isUpper,isSpace) +import Data.Char(isUpper,isSpace,isPunctuation) import Data.List(isSuffixOf,maximumBy,nub) import Data.Function(on) import Data.Maybe(maybe) - + ----------------------------------------------------------------------- -- Functions that take a PGF. -- PGF has many Concrs. @@ -506,7 +507,7 @@ lookupMorpho (Concr concr master) sent = -- The list is sorted first by the @start@ position and after than -- by the @end@ position. This can be used for instance if you want to -- filter only the longest matches. 
-lookupCohorts :: Concr -> String -> [(Int,[MorphoAnalysis],Int)] +lookupCohorts :: Concr -> String -> [(Int,String,[MorphoAnalysis],Int)] lookupCohorts lang@(Concr concr master) sent = unsafePerformIO $ do pl <- gu_new_pool @@ -517,9 +518,9 @@ lookupCohorts lang@(Concr concr master) sent = c_sent <- newUtf8CString sent pl enum <- pgf_lookup_cohorts concr c_sent cback pl nullPtr fpl <- newForeignPtr gu_pool_finalizer pl - fromCohortRange enum fpl fptr ref + fromCohortRange enum fpl fptr 0 sent ref where - fromCohortRange enum fpl fptr ref = + fromCohortRange enum fpl fptr i sent ref = allocaBytes (#size PgfCohortRange) $ \ptr -> withForeignPtr fpl $ \pl -> do gu_enum_next enum ptr pl @@ -533,8 +534,80 @@ lookupCohorts lang@(Concr concr master) sent = end <- (#peek PgfCohortRange, end.pos) ptr ans <- readIORef ref writeIORef ref [] - cohs <- unsafeInterleaveIO (fromCohortRange enum fpl fptr ref) - return ((start,ans,end):cohs) + let sent' = drop (start-i) sent + tok = take (end-start) sent' + cohs <- unsafeInterleaveIO (fromCohortRange enum fpl fptr start sent' ref) + return ((start,tok,ans,end):cohs) + +filterBest :: [(Int,String,[MorphoAnalysis],Int)] -> [(Int,String,[MorphoAnalysis],Int)] +filterBest ans = + reverse (iterate (maxBound :: Int) [(0,0,[],ans)] [] []) + where + iterate v0 [] [] res = res + iterate v0 [] new res = iterate v0 new [] res + iterate v0 ((_,v,conf, []):old) new res = + case compare v0 v of + LT -> res + EQ -> iterate v0 old new (merge conf res) + GT -> iterate v old new conf + iterate v0 ((_,v,conf,an:ans):old) new res = iterate v0 old (insert (v+valueOf an) conf an ans [] new) res + + valueOf (_,_,[],_) = 2 + valueOf _ = 1 + + insert v conf an@(start,_,_,end) ans l_new [] = + match start v conf ans ((end,v,comb conf an,filter end ans):l_new) [] + insert v conf an@(start,_,_,end) ans l_new (new@(end0,v0,conf0,ans0):r_new) = + case compare end0 end of + LT -> insert v conf an ans (new:l_new) r_new + EQ -> case compare v0 v of + LT -> match 
start v conf ans ((end,v, conf0,ans0): l_new) r_new + EQ -> match start v conf ans ((end,v,merge (comb conf an) conf0,ans0): l_new) r_new + GT -> match start v conf ans ((end,v,comb conf an, ans0): l_new) r_new + GT -> match start v conf ans ((end,v,comb conf an, filter end ans):new:l_new) r_new + + match start0 v conf (an@(start,_,_,end):ans) l_new r_new + | start0 == start = insert v conf an ans l_new r_new + match start0 v conf ans l_new r_new = revOn l_new r_new + + comb ((start0,w0,an0,end0):conf) (start,w,an,end) + | end0 == start && (unk w0 an0 || unk w an) = (start0,w0++w,[],end):conf + comb conf an = an:conf + + filter end [] = [] + filter end (next@(start,_,_,_):ans) + | end <= start = next:ans + | otherwise = filter end ans + + revOn [] ys = ys + revOn (x:xs) ys = revOn xs (x:ys) + + merge [] ans = ans + merge ans [] = ans + merge (an1@(start1,_,_,end1):ans1) (an2@(start2,_,_,end2):ans2) = + case compare (start1,end1) (start2,end2) of + GT -> an1 : merge ans1 (an2:ans2) + EQ -> an1 : merge ans1 ans2 + LT -> an2 : merge (an1:ans1) ans2 + +filterLongest :: [(Int,String,[MorphoAnalysis],Int)] -> [(Int,String,[MorphoAnalysis],Int)] +filterLongest [] = [] +filterLongest (an:ans) = longest an ans + where + longest prev [] = [prev] + longest prev@(start0,_,_,end0) (next@(start,_,_,end):ans) + | start0 == start = longest next ans + | otherwise = filter prev (next:ans) + + filter prev [] = [prev] + filter prev@(start0,w0,an0,end0) (next@(start,w,an,end):ans) + | end0 == start && (unk w0 an0 || unk w an) + = filter (start0,w0++w,[],end) ans + | end0 <= start = prev : longest next ans + | otherwise = filter prev ans + +unk w [] | any (not . 
isPunctuation) w = True +unk _ _ = False fullFormLexicon :: Concr -> [(String, [MorphoAnalysis])] fullFormLexicon lang = diff --git a/src/server/PGFService.hs b/src/server/PGFService.hs index fa515e018..5817be7f0 100644 --- a/src/server/PGFService.hs +++ b/src/server/PGFService.hs @@ -159,7 +159,7 @@ cpgfMain qsem command (t,(pgf,pc)) = "c-translate" -> withQSem qsem $ out t=< out t=<< morpho # from1 % textInput - "c-lookupcohorts"->out t=<< cohorts # from1 % getInput "longest" % textInput + "c-lookupcohorts"->out t=<< cohorts # from1 % getInput "filter" % textInput "c-flush" -> out t=<< flush "c-grammar" -> out t grammar "c-abstrtree" -> outputGraphviz=<< C.graphvizAbstractTree pgf C.graphvizDefaults # tree @@ -251,29 +251,20 @@ cpgfMain qsem command (t,(pgf,pc)) = ,"prob".=p] | (l,a,p)<-C.lookupMorpho concr input] - cohorts (from,concr) longest input = + cohorts (from,concr) filter input = showJSON [makeObj ["start" .=showJSON s + ,"word" .=showJSON w ,"morpho".=showJSON [makeObj ["lemma".=l ,"analysis".=a ,"prob".=p] | (l,a,p)<-ms] ,"end" .=showJSON e ] - | (s,ms,e) <- (if longest==Just "true" then filterLongest else id) - (C.lookupCohorts concr input)] - where - filterLongest [] = [] - filterLongest (an:ans) = longest an ans - where - longest prev [] = [prev] - longest prev@(start0,_,end0) (next@(start,an,end):ans) - | start0 == start = longest next ans - | otherwise = prev : filter end0 (next:ans) - - filter end [] = [] - filter end (next@(start,_,_):ans) - | end <= start = filterLongest (next:ans) - | otherwise = filter end ans + | (s,w,ms,e) <- (case filter of + Just "longest" -> C.filterLongest + Just "best" -> C.filterBest + _ -> id) + (C.lookupCohorts concr input)] wordforword input@((from,_),_) = jsonWFW from . wordforword' input