forked from GitHub/gf-core
lookupCohorts now detects and reports unknown words. Also:
- added two filtering functions: filterLongest and filterBest
- updated the PGF service to work with the new API
This commit is contained in:
@@ -171,8 +171,8 @@ pgf_lookup_morpho(PgfConcr *concr, GuString sentence,
|
||||
PgfMorphoCallback* callback, GuExn* err);
|
||||
|
||||
typedef struct {
|
||||
size_t pos;
|
||||
GuString ptr;
|
||||
size_t pos; // position in Unicode characters
|
||||
GuString ptr; // pointer into the string
|
||||
} PgfCohortSpot;
|
||||
|
||||
typedef struct {
|
||||
|
||||
@@ -233,12 +233,13 @@ typedef struct {
|
||||
GuEnum en;
|
||||
PgfConcr* concr;
|
||||
GuString sentence;
|
||||
GuString current;
|
||||
size_t len;
|
||||
PgfMorphoCallback* callback;
|
||||
GuExn* err;
|
||||
bool case_sensitive;
|
||||
GuBuf* spots;
|
||||
GuBuf* skip_spots;
|
||||
GuBuf* empty_buf;
|
||||
GuBuf* found;
|
||||
} PgfCohortsState;
|
||||
|
||||
@@ -254,6 +255,29 @@ cmp_cohort_spot(GuOrder* self, const void* a, const void* b)
|
||||
static GuOrder
|
||||
pgf_cohort_spot_order[1] = {{ cmp_cohort_spot }};
|
||||
|
||||
static void
|
||||
pgf_lookup_cohorts_report_skip(PgfCohortsState *state,
|
||||
PgfCohortSpot* spot, GuString msg)
|
||||
{
|
||||
PgfCohortSpot end_spot = *spot;
|
||||
while (gu_ucs_is_space(*(end_spot.ptr-1))) {
|
||||
end_spot.pos--;
|
||||
end_spot.ptr--;
|
||||
}
|
||||
|
||||
size_t n_spots = gu_buf_length(state->skip_spots);
|
||||
for (size_t i = 0; i < n_spots; i++) {
|
||||
PgfCohortSpot* skip_spot =
|
||||
gu_buf_index(state->skip_spots, PgfCohortSpot, i);
|
||||
|
||||
PgfCohortRange* range = gu_buf_insert(state->found, 0);
|
||||
range->start = *skip_spot;
|
||||
range->end = end_spot;
|
||||
range->buf = state->empty_buf;
|
||||
}
|
||||
gu_buf_flush(state->skip_spots);
|
||||
}
|
||||
|
||||
static void
|
||||
pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot,
|
||||
int i, int j, ptrdiff_t min, ptrdiff_t max)
|
||||
@@ -290,18 +314,23 @@ pgf_lookup_cohorts_helper(PgfCohortsState *state, PgfCohortSpot* spot,
|
||||
pgf_lookup_cohorts_helper(state, spot, i, k-1, min, len);
|
||||
|
||||
if (seq->idx != NULL && gu_buf_length(seq->idx) > 0) {
|
||||
// Report unknown words
|
||||
pgf_lookup_cohorts_report_skip(state, spot, "a");
|
||||
|
||||
// Report the actual hit
|
||||
PgfCohortRange* range = gu_buf_insert(state->found, 0);
|
||||
range->start = *spot;
|
||||
range->end = current;
|
||||
range->buf = seq->idx;
|
||||
}
|
||||
|
||||
while (*current.ptr != 0) {
|
||||
if (!skip_space(¤t.ptr, ¤t.pos))
|
||||
break;
|
||||
}
|
||||
// Schedule the next search spot
|
||||
while (*current.ptr != 0) {
|
||||
if (!skip_space(¤t.ptr, ¤t.pos))
|
||||
break;
|
||||
}
|
||||
|
||||
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, ¤t);
|
||||
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, ¤t);
|
||||
}
|
||||
|
||||
if (len <= max)
|
||||
pgf_lookup_cohorts_helper(state, spot, k+1, j, len, max);
|
||||
@@ -317,29 +346,45 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool)
|
||||
PgfCohortsState* state = gu_container(self, PgfCohortsState, en);
|
||||
|
||||
while (gu_buf_length(state->found) == 0 &&
|
||||
gu_buf_length(state->spots) > 0) {
|
||||
gu_buf_length(state->spots) > 0) {
|
||||
PgfCohortSpot spot;
|
||||
gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot);
|
||||
|
||||
if (spot.ptr == state->current)
|
||||
continue;
|
||||
GuString next_ptr = state->sentence+state->len;
|
||||
while (gu_buf_length(state->spots) > 0) {
|
||||
GuString ptr =
|
||||
gu_buf_index(state->spots, PgfCohortSpot, 0)->ptr;
|
||||
if (ptr > spot.ptr) {
|
||||
next_ptr = ptr;
|
||||
break;
|
||||
}
|
||||
gu_buf_heap_pop(state->spots, pgf_cohort_spot_order, &spot);
|
||||
}
|
||||
|
||||
if (*spot.ptr == 0)
|
||||
break;
|
||||
bool needs_report = true;
|
||||
while (next_ptr > spot.ptr) {
|
||||
pgf_lookup_cohorts_helper
|
||||
(state, &spot,
|
||||
0, gu_seq_length(state->concr->sequences)-1,
|
||||
1, (state->sentence+state->len)-spot.ptr);
|
||||
|
||||
if (gu_buf_length(state->found) > 0)
|
||||
break;
|
||||
|
||||
if (needs_report) {
|
||||
gu_buf_push(state->skip_spots, PgfCohortSpot, spot);
|
||||
needs_report = false;
|
||||
}
|
||||
|
||||
pgf_lookup_cohorts_helper
|
||||
(state, &spot,
|
||||
0, gu_seq_length(state->concr->sequences)-1,
|
||||
1, (state->sentence+state->len)-spot.ptr);
|
||||
|
||||
if (gu_buf_length(state->found) == 0) {
|
||||
// skip one character and try again
|
||||
gu_utf8_decode((const uint8_t**) &spot.ptr);
|
||||
spot.pos++;
|
||||
gu_buf_heap_push(state->spots, pgf_cohort_spot_order, &spot);
|
||||
}
|
||||
}
|
||||
|
||||
PgfCohortSpot end_spot = {state->len, state->sentence+state->len};
|
||||
pgf_lookup_cohorts_report_skip(state, &end_spot, "b");
|
||||
|
||||
PgfCohortRange* pRes = (PgfCohortRange*)to;
|
||||
|
||||
if (gu_buf_length(state->found) == 0) {
|
||||
@@ -348,15 +393,11 @@ pgf_lookup_cohorts_enum_next(GuEnum* self, void* to, GuPool* pool)
|
||||
pRes->end.pos = 0;
|
||||
pRes->end.ptr = NULL;
|
||||
pRes->buf = NULL;
|
||||
state->current = NULL;
|
||||
return;
|
||||
} else do {
|
||||
*pRes = gu_buf_pop(state->found, PgfCohortRange);
|
||||
state->current = pRes->start.ptr;
|
||||
pgf_morpho_iter(pRes->buf, state->callback, state->err);
|
||||
} while (gu_buf_length(state->found) > 0 &&
|
||||
gu_buf_index_last(state->found, PgfCohortRange)->end.ptr == pRes->end.ptr);
|
||||
|
||||
}
|
||||
|
||||
PGF_API GuEnum*
|
||||
@@ -373,15 +414,17 @@ pgf_lookup_cohorts(PgfConcr *concr, GuString sentence,
|
||||
}
|
||||
|
||||
PgfCohortsState* state = gu_new(PgfCohortsState, pool);
|
||||
state->en.next = pgf_lookup_cohorts_enum_next;
|
||||
state->concr = concr;
|
||||
state->sentence= sentence;
|
||||
state->len = strlen(sentence);
|
||||
state->callback= callback;
|
||||
state->err = err;
|
||||
state->case_sensitive = pgf_is_case_sensitive(concr);
|
||||
state->spots = gu_new_buf(PgfCohortSpot, pool);
|
||||
state->found = gu_new_buf(PgfCohortRange, pool);
|
||||
state->en.next = pgf_lookup_cohorts_enum_next;
|
||||
state->concr = concr;
|
||||
state->sentence = sentence;
|
||||
state->len = strlen(sentence);
|
||||
state->callback = callback;
|
||||
state->err = err;
|
||||
state->case_sensitive= pgf_is_case_sensitive(concr);
|
||||
state->spots = gu_new_buf(PgfCohortSpot, pool);
|
||||
state->skip_spots = gu_new_buf(PgfCohortSpot, pool);
|
||||
state->empty_buf = gu_new_buf(PgfProductionIdxEntry, pool);
|
||||
state->found = gu_new_buf(PgfCohortRange, pool);
|
||||
|
||||
PgfCohortSpot spot = {0,sentence};
|
||||
while (*spot.ptr != 0) {
|
||||
|
||||
@@ -73,6 +73,7 @@ module PGF2 (-- * PGF
|
||||
generateAll,
|
||||
-- ** Morphological Analysis
|
||||
MorphoAnalysis, lookupMorpho, lookupCohorts, fullFormLexicon,
|
||||
filterBest, filterLongest,
|
||||
-- ** Visualizations
|
||||
GraphvizOptions(..), graphvizDefaults,
|
||||
graphvizAbstractTree, graphvizParseTree, graphvizWordAlignment,
|
||||
@@ -99,11 +100,11 @@ import Foreign.C
|
||||
import Data.Typeable
|
||||
import qualified Data.Map as Map
|
||||
import Data.IORef
|
||||
import Data.Char(isUpper,isSpace)
|
||||
import Data.Char(isUpper,isSpace,isPunctuation)
|
||||
import Data.List(isSuffixOf,maximumBy,nub)
|
||||
import Data.Function(on)
|
||||
import Data.Maybe(maybe)
|
||||
|
||||
|
||||
-----------------------------------------------------------------------
|
||||
-- Functions that take a PGF.
|
||||
-- PGF has many Concrs.
|
||||
@@ -506,7 +507,7 @@ lookupMorpho (Concr concr master) sent =
|
||||
-- The list is sorted first by the @start@ position and after than
|
||||
-- by the @end@ position. This can be used for instance if you want to
|
||||
-- filter only the longest matches.
|
||||
lookupCohorts :: Concr -> String -> [(Int,[MorphoAnalysis],Int)]
|
||||
lookupCohorts :: Concr -> String -> [(Int,String,[MorphoAnalysis],Int)]
|
||||
lookupCohorts lang@(Concr concr master) sent =
|
||||
unsafePerformIO $
|
||||
do pl <- gu_new_pool
|
||||
@@ -517,9 +518,9 @@ lookupCohorts lang@(Concr concr master) sent =
|
||||
c_sent <- newUtf8CString sent pl
|
||||
enum <- pgf_lookup_cohorts concr c_sent cback pl nullPtr
|
||||
fpl <- newForeignPtr gu_pool_finalizer pl
|
||||
fromCohortRange enum fpl fptr ref
|
||||
fromCohortRange enum fpl fptr 0 sent ref
|
||||
where
|
||||
fromCohortRange enum fpl fptr ref =
|
||||
fromCohortRange enum fpl fptr i sent ref =
|
||||
allocaBytes (#size PgfCohortRange) $ \ptr ->
|
||||
withForeignPtr fpl $ \pl ->
|
||||
do gu_enum_next enum ptr pl
|
||||
@@ -533,8 +534,80 @@ lookupCohorts lang@(Concr concr master) sent =
|
||||
end <- (#peek PgfCohortRange, end.pos) ptr
|
||||
ans <- readIORef ref
|
||||
writeIORef ref []
|
||||
cohs <- unsafeInterleaveIO (fromCohortRange enum fpl fptr ref)
|
||||
return ((start,ans,end):cohs)
|
||||
let sent' = drop (start-i) sent
|
||||
tok = take (end-start) sent'
|
||||
cohs <- unsafeInterleaveIO (fromCohortRange enum fpl fptr start sent' ref)
|
||||
return ((start,tok,ans,end):cohs)
|
||||
|
||||
filterBest :: [(Int,String,[MorphoAnalysis],Int)] -> [(Int,String,[MorphoAnalysis],Int)]
|
||||
filterBest ans =
|
||||
reverse (iterate (maxBound :: Int) [(0,0,[],ans)] [] [])
|
||||
where
|
||||
iterate v0 [] [] res = res
|
||||
iterate v0 [] new res = iterate v0 new [] res
|
||||
iterate v0 ((_,v,conf, []):old) new res =
|
||||
case compare v0 v of
|
||||
LT -> res
|
||||
EQ -> iterate v0 old new (merge conf res)
|
||||
GT -> iterate v old new conf
|
||||
iterate v0 ((_,v,conf,an:ans):old) new res = iterate v0 old (insert (v+valueOf an) conf an ans [] new) res
|
||||
|
||||
valueOf (_,_,[],_) = 2
|
||||
valueOf _ = 1
|
||||
|
||||
insert v conf an@(start,_,_,end) ans l_new [] =
|
||||
match start v conf ans ((end,v,comb conf an,filter end ans):l_new) []
|
||||
insert v conf an@(start,_,_,end) ans l_new (new@(end0,v0,conf0,ans0):r_new) =
|
||||
case compare end0 end of
|
||||
LT -> insert v conf an ans (new:l_new) r_new
|
||||
EQ -> case compare v0 v of
|
||||
LT -> match start v conf ans ((end,v, conf0,ans0): l_new) r_new
|
||||
EQ -> match start v conf ans ((end,v,merge (comb conf an) conf0,ans0): l_new) r_new
|
||||
GT -> match start v conf ans ((end,v,comb conf an, ans0): l_new) r_new
|
||||
GT -> match start v conf ans ((end,v,comb conf an, filter end ans):new:l_new) r_new
|
||||
|
||||
match start0 v conf (an@(start,_,_,end):ans) l_new r_new
|
||||
| start0 == start = insert v conf an ans l_new r_new
|
||||
match start0 v conf ans l_new r_new = revOn l_new r_new
|
||||
|
||||
comb ((start0,w0,an0,end0):conf) (start,w,an,end)
|
||||
| end0 == start && (unk w0 an0 || unk w an) = (start0,w0++w,[],end):conf
|
||||
comb conf an = an:conf
|
||||
|
||||
filter end [] = []
|
||||
filter end (next@(start,_,_,_):ans)
|
||||
| end <= start = next:ans
|
||||
| otherwise = filter end ans
|
||||
|
||||
revOn [] ys = ys
|
||||
revOn (x:xs) ys = revOn xs (x:ys)
|
||||
|
||||
merge [] ans = ans
|
||||
merge ans [] = ans
|
||||
merge (an1@(start1,_,_,end1):ans1) (an2@(start2,_,_,end2):ans2) =
|
||||
case compare (start1,end1) (start2,end2) of
|
||||
GT -> an1 : merge ans1 (an2:ans2)
|
||||
EQ -> an1 : merge ans1 ans2
|
||||
LT -> an2 : merge (an1:ans1) ans2
|
||||
|
||||
filterLongest :: [(Int,String,[MorphoAnalysis],Int)] -> [(Int,String,[MorphoAnalysis],Int)]
|
||||
filterLongest [] = []
|
||||
filterLongest (an:ans) = longest an ans
|
||||
where
|
||||
longest prev [] = [prev]
|
||||
longest prev@(start0,_,_,end0) (next@(start,_,_,end):ans)
|
||||
| start0 == start = longest next ans
|
||||
| otherwise = filter prev (next:ans)
|
||||
|
||||
filter prev [] = [prev]
|
||||
filter prev@(start0,w0,an0,end0) (next@(start,w,an,end):ans)
|
||||
| end0 == start && (unk w0 an0 || unk w an)
|
||||
= filter (start0,w0++w,[],end) ans
|
||||
| end0 <= start = prev : longest next ans
|
||||
| otherwise = filter prev ans
|
||||
|
||||
unk w [] | any (not . isPunctuation) w = True
|
||||
unk _ _ = False
|
||||
|
||||
fullFormLexicon :: Concr -> [(String, [MorphoAnalysis])]
|
||||
fullFormLexicon lang =
|
||||
|
||||
@@ -159,7 +159,7 @@ cpgfMain qsem command (t,(pgf,pc)) =
|
||||
"c-translate" -> withQSem qsem $
|
||||
out t=<<join(trans # input % to % start % limit%treeopts)
|
||||
"c-lookupmorpho"-> out t=<< morpho # from1 % textInput
|
||||
"c-lookupcohorts"->out t=<< cohorts # from1 % getInput "longest" % textInput
|
||||
"c-lookupcohorts"->out t=<< cohorts # from1 % getInput "filter" % textInput
|
||||
"c-flush" -> out t=<< flush
|
||||
"c-grammar" -> out t grammar
|
||||
"c-abstrtree" -> outputGraphviz=<< C.graphvizAbstractTree pgf C.graphvizDefaults # tree
|
||||
@@ -251,29 +251,20 @@ cpgfMain qsem command (t,(pgf,pc)) =
|
||||
,"prob".=p]
|
||||
| (l,a,p)<-C.lookupMorpho concr input]
|
||||
|
||||
cohorts (from,concr) longest input =
|
||||
cohorts (from,concr) filter input =
|
||||
showJSON [makeObj ["start" .=showJSON s
|
||||
,"word" .=showJSON w
|
||||
,"morpho".=showJSON [makeObj ["lemma".=l
|
||||
,"analysis".=a
|
||||
,"prob".=p]
|
||||
| (l,a,p)<-ms]
|
||||
,"end" .=showJSON e
|
||||
]
|
||||
| (s,ms,e) <- (if longest==Just "true" then filterLongest else id)
|
||||
(C.lookupCohorts concr input)]
|
||||
where
|
||||
filterLongest [] = []
|
||||
filterLongest (an:ans) = longest an ans
|
||||
where
|
||||
longest prev [] = [prev]
|
||||
longest prev@(start0,_,end0) (next@(start,an,end):ans)
|
||||
| start0 == start = longest next ans
|
||||
| otherwise = prev : filter end0 (next:ans)
|
||||
|
||||
filter end [] = []
|
||||
filter end (next@(start,_,_):ans)
|
||||
| end <= start = filterLongest (next:ans)
|
||||
| otherwise = filter end ans
|
||||
| (s,w,ms,e) <- (case filter of
|
||||
Just "longest" -> C.filterLongest
|
||||
Just "best" -> C.filterBest
|
||||
_ -> id)
|
||||
(C.lookupCohorts concr input)]
|
||||
|
||||
wordforword input@((from,_),_) = jsonWFW from . wordforword' input
|
||||
|
||||
|
||||
Reference in New Issue
Block a user