1
0
forked from GitHub/gf-core

lookupCohorts now detects and reports unknown words. Also:

- added two filtering functions: filterLongest and filterBest
- updated the PGF service to work with the new API
This commit is contained in:
krangelov
2020-05-14 15:03:30 +02:00
parent 57a1ea5b56
commit 62bc78380e
4 changed files with 165 additions and 58 deletions

View File

@@ -171,8 +171,8 @@ pgf_lookup_morpho(PgfConcr *concr, GuString sentence,
PgfMorphoCallback* callback, GuExn* err); PgfMorphoCallback* callback, GuExn* err);
typedef struct { typedef struct {
size_t pos; size_t pos; // position in Unicode characters
GuString ptr; GuString ptr; // pointer into the string
} PgfCohortSpot; } PgfCohortSpot;
typedef struct { typedef struct {

View File

@@ -233,12 +233,13 @@ typedef struct {
GuEnum en; GuEnum en;
PgfConcr* concr; PgfConcr* concr;
GuString sentence; GuString sentence;
GuString current;
size_t len; size_t len;
PgfMorphoCallback* callback; PgfMorphoCallback* callback;
GuExn* err; GuExn* err;
bool case_sensitive; bool case_sensitive;
GuBuf* spots; GuBuf* spots;
GuBuf* skip_spots;
GuBuf* empty_buf;
GuBuf* found; GuBuf* found;
} PgfCohortsState; } PgfCohortsState;
@@ -254,6 +255,29 @@ cmp_cohort_spot(GuOrder* self, const void* a, const void* b)
static GuOrder static GuOrder
pgf_cohort_spot_order[1] = {{ cmp_cohort_spot }}; pgf_cohort_spot_order[1] = {{ cmp_cohort_spot }};
/* Report the words skipped so far as "unknown word" cohort ranges.
 * For every spot queued in state->skip_spots, a PgfCohortRange with an
 * empty analysis buffer (state->empty_buf) is prepended to state->found,
 * ending at *spot with trailing whitespace trimmed off; the queue of
 * skipped spots is then flushed.
 * NOTE(review): the msg parameter (callers pass "a"/"b") is unused here.
 * NOTE(review): the backwards whitespace scan reads *(ptr-1) without a
 * lower bound — presumably a spot is never preceded only by spaces back
 * to the start of the sentence; confirm against the callers. */
static void
pgf_lookup_cohorts_report_skip(PgfCohortsState *state,
PgfCohortSpot* spot, GuString msg)
{
// Trim trailing whitespace: step end_spot back over space characters.
PgfCohortSpot end_spot = *spot;
while (gu_ucs_is_space(*(end_spot.ptr-1))) {
end_spot.pos--;
end_spot.ptr--;
}
// Emit one range with an empty analysis buffer per recorded skip spot.
size_t n_spots = gu_buf_length(state->skip_spots);
for (size_t i = 0; i < n_spots; i++) {
PgfCohortSpot* skip_spot =
gu_buf_index(state->skip_spots, PgfCohortSpot, i);
PgfCohortRange* range = gu_buf_insert(state->found, 0);
range->start = *skip_spot;
range->end = end_spot;
range->buf = state->empty_buf;
}
// All pending skips are reported; clear the queue.
gu_buf_flush(state->skip_spots);
}
static void static void
pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot, pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot,
int i, int j, ptrdiff_t min, ptrdiff_t max) int i, int j, ptrdiff_t min, ptrdiff_t max)
@@ -290,18 +314,23 @@ pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot,
pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len); pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len);
if (seq->idx != NULL && gu_buf_length(seq->idx) > 0) { if (seq->idx != NULL && gu_buf_length(seq->idx) > 0) {
// Report unknown words
pgf_lookup_cohorts_report_skip(state, spot, "a");
// Report the actual hit
PgfCohortRange* range = gu_buf_insert(state->found, 0); PgfCohortRange* range = gu_buf_insert(state->found, 0);
range->start = *spot; range->start = *spot;
range->end = current; range->end = current;
range->buf = seq->idx; range->buf = seq->idx;
}
while (*current.ptr != 0) { // Schedule the next search spot
if (!skip_space(&current.ptr, &current.pos)) while (*current.ptr != 0) {
break; if (!skip_space(&current.ptr, &current.pos))
} break;
}
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &current); gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &current);
}
if (len <= max) if (len <= max)
pgf_lookup_cohorts_helper(state, spot, k+1, j, len, max); pgf_lookup_cohorts_helper(state, spot, k+1, j, len, max);
@@ -317,29 +346,45 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool)
PgfCohortsState* state = gu_container(self, PgfCohortsState, en); PgfCohortsState* state = gu_container(self, PgfCohortsState, en);
while (gu_buf_length(state->found) == 0 && while (gu_buf_length(state->found) == 0 &&
gu_buf_length(state->spots) > 0) { gu_buf_length(state->spots) > 0) {
PgfCohortSpot spot; PgfCohortSpot spot;
gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot); gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot);
if (spot.ptr == state->current) GuString next_ptr = state->sentence+state->len;
continue; while (gu_buf_length(state->spots) > 0) {
GuString ptr =
gu_buf_index(state->spots, PgfCohortSpot, 0)->ptr;
if (ptr > spot.ptr) {
next_ptr = ptr;
break;
}
gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot);
}
if (*spot.ptr == 0) bool needs_report = true;
break; while (next_ptr > spot.ptr) {
pgf_lookup_cohorts_helper
(state, &spot,
0, gu_seq_length(state->concr->sequences)-1,
1, (state->sentence+state->len)-spot.ptr);
if (gu_buf_length(state->found) > 0)
break;
if (needs_report) {
gu_buf_push(state->skip_spots, PgfCohortSpot, spot);
needs_report = false;
}
pgf_lookup_cohorts_helper
(state, &spot,
0, gu_seq_length(state->concr->sequences)-1,
1, (state->sentence+state->len)-spot.ptr);
if (gu_buf_length(state->found) == 0) {
// skip one character and try again // skip one character and try again
gu_utf8_decode((const uint8_t**) &spot.ptr); gu_utf8_decode((const uint8_t**) &spot.ptr);
spot.pos++; spot.pos++;
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot);
} }
} }
PgfCohortSpot end_spot = {state->len, state->sentence+state->len};
pgf_lookup_cohorts_report_skip(state, &end_spot, "b");
PgfCohortRange* pRes = (PgfCohortRange*)to; PgfCohortRange* pRes = (PgfCohortRange*)to;
if (gu_buf_length(state->found) == 0) { if (gu_buf_length(state->found) == 0) {
@@ -348,15 +393,11 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool)
pRes->end.pos = 0; pRes->end.pos = 0;
pRes->end.ptr = NULL; pRes->end.ptr = NULL;
pRes->buf = NULL; pRes->buf = NULL;
state->current = NULL;
return;
} else do { } else do {
*pRes = gu_buf_pop(state->found, PgfCohortRange); *pRes = gu_buf_pop(state->found, PgfCohortRange);
state->current = pRes->start.ptr;
pgf_morpho_iter(pRes->buf, state->callback, state->err); pgf_morpho_iter(pRes->buf, state->callback, state->err);
} while (gu_buf_length(state->found) > 0 && } while (gu_buf_length(state->found) > 0 &&
gu_buf_index_last(state->found, PgfCohortRange)->end.ptr == pRes->end.ptr); gu_buf_index_last(state->found, PgfCohortRange)->end.ptr == pRes->end.ptr);
} }
PGF_API GuEnum* PGF_API GuEnum*
@@ -373,15 +414,17 @@ pgf_lookup_cohorts(PgfConcr *concr, GuString sentence,
} }
PgfCohortsState* state = gu_new(PgfCohortsState, pool); PgfCohortsState* state = gu_new(PgfCohortsState, pool);
state->en.next = pgf_lookup_cohorts_enum_next; state->en.next = pgf_lookup_cohorts_enum_next;
state->concr = concr; state->concr = concr;
state->sentence= sentence; state->sentence = sentence;
state->len = strlen(sentence); state->len = strlen(sentence);
state->callback= callback; state->callback = callback;
state->err = err; state->err = err;
state->case_sensitive = pgf_is_case_sensitive(concr); state->case_sensitive= pgf_is_case_sensitive(concr);
state->spots = gu_new_buf(PgfCohortSpot, pool); state->spots = gu_new_buf(PgfCohortSpot, pool);
state->found = gu_new_buf(PgfCohortRange, pool); state->skip_spots = gu_new_buf(PgfCohortSpot, pool);
state->empty_buf = gu_new_buf(PgfProductionIdxEntry, pool);
state->found = gu_new_buf(PgfCohortRange, pool);
PgfCohortSpot spot = {0,sentence}; PgfCohortSpot spot = {0,sentence};
while (*spot.ptr != 0) { while (*spot.ptr != 0) {

View File

@@ -73,6 +73,7 @@ module PGF2 (-- * PGF
generateAll, generateAll,
-- ** Morphological Analysis -- ** Morphological Analysis
MorphoAnalysis, lookupMorpho, lookupCohorts, fullFormLexicon, MorphoAnalysis, lookupMorpho, lookupCohorts, fullFormLexicon,
filterBest, filterLongest,
-- ** Visualizations -- ** Visualizations
GraphvizOptions(..), graphvizDefaults, GraphvizOptions(..), graphvizDefaults,
graphvizAbstractTree, graphvizParseTree, graphvizWordAlignment, graphvizAbstractTree, graphvizParseTree, graphvizWordAlignment,
@@ -99,11 +100,11 @@ import Foreign.C
import Data.Typeable import Data.Typeable
import qualified Data.Map as Map import qualified Data.Map as Map
import Data.IORef import Data.IORef
import Data.Char(isUpper,isSpace) import Data.Char(isUpper,isSpace,isPunctuation)
import Data.List(isSuffixOf,maximumBy,nub) import Data.List(isSuffixOf,maximumBy,nub)
import Data.Function(on) import Data.Function(on)
import Data.Maybe(maybe) import Data.Maybe(maybe)
----------------------------------------------------------------------- -----------------------------------------------------------------------
-- Functions that take a PGF. -- Functions that take a PGF.
-- PGF has many Concrs. -- PGF has many Concrs.
@@ -506,7 +507,7 @@ lookupMorpho (Concr concr master) sent =
-- The list is sorted first by the @start@ position and after than -- The list is sorted first by the @start@ position and after than
-- by the @end@ position. This can be used for instance if you want to -- by the @end@ position. This can be used for instance if you want to
-- filter only the longest matches. -- filter only the longest matches.
lookupCohorts :: Concr -> String -> [(Int,[MorphoAnalysis],Int)] lookupCohorts :: Concr -> String -> [(Int,String,[MorphoAnalysis],Int)]
lookupCohorts lang@(Concr concr master) sent = lookupCohorts lang@(Concr concr master) sent =
unsafePerformIO $ unsafePerformIO $
do pl <- gu_new_pool do pl <- gu_new_pool
@@ -517,9 +518,9 @@ lookupCohorts lang@(Concr concr master) sent =
c_sent <- newUtf8CString sent pl c_sent <- newUtf8CString sent pl
enum <- pgf_lookup_cohorts concr c_sent cback pl nullPtr enum <- pgf_lookup_cohorts concr c_sent cback pl nullPtr
fpl <- newForeignPtr gu_pool_finalizer pl fpl <- newForeignPtr gu_pool_finalizer pl
fromCohortRange enum fpl fptr ref fromCohortRange enum fpl fptr 0 sent ref
where where
fromCohortRange enum fpl fptr ref = fromCohortRange enum fpl fptr i sent ref =
allocaBytes (#size PgfCohortRange) $ \ptr -> allocaBytes (#size PgfCohortRange) $ \ptr ->
withForeignPtr fpl $ \pl -> withForeignPtr fpl $ \pl ->
do gu_enum_next enum ptr pl do gu_enum_next enum ptr pl
@@ -533,8 +534,80 @@ lookupCohorts lang@(Concr concr master) sent =
end <- (#peek PgfCohortRange, end.pos) ptr end <- (#peek PgfCohortRange, end.pos) ptr
ans <- readIORef ref ans <- readIORef ref
writeIORef ref [] writeIORef ref []
cohs <- unsafeInterleaveIO (fromCohortRange enum fpl fptr ref) let sent' = drop (start-i) sent
return ((start,ans,end):cohs) tok = take (end-start) sent'
cohs <- unsafeInterleaveIO (fromCohortRange enum fpl fptr start sent' ref)
return ((start,tok,ans,end):cohs)
-- | Keep only the cohorts that belong to a least-cost segmentation of the
-- sentence.  A cohort with no analyses (an unknown word) costs 2, any other
-- cohort costs 1 (see valueOf); partial segmentations are explored in waves
-- and only the minimal-cost configuration(s) contribute to the result.
-- NOTE(review): where-clause layout was flattened in this diff dump — the
-- indentation must be restored against the real source file.
filterBest :: [(Int,String,[MorphoAnalysis],Int)] -> [(Int,String,[MorphoAnalysis],Int)]
filterBest ans =
reverse (iterate (maxBound :: Int) [(0,0,[],ans)] [] [])
where
-- iterate bestCostSoFar currentConfigs nextConfigs accumulatedResult
iterate v0 [] [] res = res
iterate v0 [] new res = iterate v0 new [] res
-- a configuration that has consumed all input competes on its cost v
iterate v0 ((_,v,conf, []):old) new res =
case compare v0 v of
LT -> res
EQ -> iterate v0 old new (merge conf res)
GT -> iterate v old new conf
iterate v0 ((_,v,conf,an:ans):old) new res = iterate v0 old (insert (v+valueOf an) conf an ans [] new) res
-- unknown words (empty analysis list) are twice as expensive
valueOf (_,_,[],_) = 2
valueOf _ = 1
-- insert a new partial configuration, keeping the list ordered by end
-- position and keeping only the cheaper of two configs with equal end
insert v conf an@(start,_,_,end) ans l_new [] =
match start v conf ans ((end,v,comb conf an,filter end ans):l_new) []
insert v conf an@(start,_,_,end) ans l_new (new@(end0,v0,conf0,ans0):r_new) =
case compare end0 end of
LT -> insert v conf an ans (new:l_new) r_new
EQ -> case compare v0 v of
LT -> match start v conf ans ((end,v, conf0,ans0): l_new) r_new
EQ -> match start v conf ans ((end,v,merge (comb conf an) conf0,ans0): l_new) r_new
GT -> match start v conf ans ((end,v,comb conf an, ans0): l_new) r_new
GT -> match start v conf ans ((end,v,comb conf an, filter end ans):new:l_new) r_new
-- also branch on the alternative cohorts starting at the same position
match start0 v conf (an@(start,_,_,end):ans) l_new r_new
| start0 == start = insert v conf an ans l_new r_new
match start0 v conf ans l_new r_new = revOn l_new r_new
-- glue a cohort onto its adjacent predecessor when either is unknown,
-- producing one combined unknown token (empty analysis list)
comb ((start0,w0,an0,end0):conf) (start,w,an,end)
| end0 == start && (unk w0 an0 || unk w an) = (start0,w0++w,[],end):conf
comb conf an = an:conf
-- drop cohorts that start before the already-consumed position end
filter end [] = []
filter end (next@(start,_,_,_):ans)
| end <= start = next:ans
| otherwise = filter end ans
-- reverse-prepend (accumulator-style list reversal onto ys)
revOn [] ys = ys
revOn (x:xs) ys = revOn xs (x:ys)
-- merge two cohort lists kept in descending (start,end) order,
-- collapsing cohorts with identical spans
merge [] ans = ans
merge ans [] = ans
merge (an1@(start1,_,_,end1):ans1) (an2@(start2,_,_,end2):ans2) =
case compare (start1,end1) (start2,end2) of
GT -> an1 : merge ans1 (an2:ans2)
EQ -> an1 : merge ans1 ans2
LT -> an2 : merge (an1:ans1) ans2
-- | Keep, for every starting position, only one (the last listed, i.e.
-- presumably the longest) cohort, and glue adjacent unknown words together
-- into a single unknown cohort.
-- NOTE(review): where-clause layout was flattened in this diff dump — the
-- indentation must be restored against the real source file.
filterLongest :: [(Int,String,[MorphoAnalysis],Int)] -> [(Int,String,[MorphoAnalysis],Int)]
filterLongest [] = []
filterLongest (an:ans) = longest an ans
where
-- cohorts sharing a start position arrive consecutively; keep the last
longest prev [] = [prev]
longest prev@(start0,_,_,end0) (next@(start,_,_,end):ans)
| start0 == start = longest next ans
| otherwise = filter prev (next:ans)
-- emit prev (after absorbing an adjacent unknown neighbour, if any)
-- and skip every cohort overlapping the region it covers
filter prev [] = [prev]
filter prev@(start0,w0,an0,end0) (next@(start,w,an,end):ans)
| end0 == start && (unk w0 an0 || unk w an)
= filter (start0,w0++w,[],end) ans
| end0 <= start = prev : longest next ans
| otherwise = filter prev ans
-- a token is "unknown" when it has no analyses and contains at least
-- one non-punctuation character
unk w [] | any (not . isPunctuation) w = True
unk _ _ = False
fullFormLexicon :: Concr -> [(String, [MorphoAnalysis])] fullFormLexicon :: Concr -> [(String, [MorphoAnalysis])]
fullFormLexicon lang = fullFormLexicon lang =

View File

@@ -159,7 +159,7 @@ cpgfMain qsem command (t,(pgf,pc)) =
"c-translate" -> withQSem qsem $ "c-translate" -> withQSem qsem $
out t=<<join(trans # input % to % start % limit%treeopts) out t=<<join(trans # input % to % start % limit%treeopts)
"c-lookupmorpho"-> out t=<< morpho # from1 % textInput "c-lookupmorpho"-> out t=<< morpho # from1 % textInput
"c-lookupcohorts"->out t=<< cohorts # from1 % getInput "longest" % textInput "c-lookupcohorts"->out t=<< cohorts # from1 % getInput "filter" % textInput
"c-flush" -> out t=<< flush "c-flush" -> out t=<< flush
"c-grammar" -> out t grammar "c-grammar" -> out t grammar
"c-abstrtree" -> outputGraphviz=<< C.graphvizAbstractTree pgf C.graphvizDefaults # tree "c-abstrtree" -> outputGraphviz=<< C.graphvizAbstractTree pgf C.graphvizDefaults # tree
@@ -251,29 +251,20 @@ cpgfMain qsem command (t,(pgf,pc)) =
,"prob".=p] ,"prob".=p]
| (l,a,p)<-C.lookupMorpho concr input] | (l,a,p)<-C.lookupMorpho concr input]
cohorts (from,concr) longest input = cohorts (from,concr) filter input =
showJSON [makeObj ["start" .=showJSON s showJSON [makeObj ["start" .=showJSON s
,"word" .=showJSON w
,"morpho".=showJSON [makeObj ["lemma".=l ,"morpho".=showJSON [makeObj ["lemma".=l
,"analysis".=a ,"analysis".=a
,"prob".=p] ,"prob".=p]
| (l,a,p)<-ms] | (l,a,p)<-ms]
,"end" .=showJSON e ,"end" .=showJSON e
] ]
| (s,ms,e) <- (if longest==Just "true" then filterLongest else id) | (s,w,ms,e) <- (case filter of
(C.lookupCohorts concr input)] Just "longest" -> C.filterLongest
where Just "best" -> C.filterBest
filterLongest [] = [] _ -> id)
filterLongest (an:ans) = longest an ans (C.lookupCohorts concr input)]
where
longest prev [] = [prev]
longest prev@(start0,_,end0) (next@(start,an,end):ans)
| start0 == start = longest next ans
| otherwise = prev : filter end0 (next:ans)
filter end [] = []
filter end (next@(start,_,_):ans)
| end <= start = filterLongest (next:ans)
| otherwise = filter end ans
wordforword input@((from,_),_) = jsonWFW from . wordforword' input wordforword input@((from,_),_) = jsonWFW from . wordforword' input