started with lookupCohorts

This commit is contained in:
Krasimir Angelov
2022-07-07 14:03:07 +02:00
parent c783da51a4
commit a66693770c
7 changed files with 189 additions and 37 deletions

View File

@@ -283,14 +283,18 @@ pgfCommands = Map.fromList [
exec = needPGF $ \opts ts pgf -> do
concr <- optLang pgf opts
case opts of
_ | isOpt "missing" opts ->
return . fromString . unwords .
morphoMissing concr .
concatMap words $ toStrings ts
_ | isOpt "all" opts ->
return . fromString . unlines .
map prMorphoAnalysis . concatMap (morphoAll concr) $
toStrings ts
_ | isOpt "known" opts ->
return . fromString . unwords .
morphoKnown concr .
concatMap words $ toStrings ts
concatMap (morphoKnown concr) $
toStrings ts
_ | isOpt "missing" opts ->
return . fromString . unwords .
concatMap (morphoMissing concr) $
toStrings ts
_ -> return . fromString . unlines .
map prMorphoAnalysis . concatMap (morphos pgf opts) $
toStrings ts,
@@ -298,8 +302,9 @@ pgfCommands = Map.fromList [
("lang","the languages of analysis (comma-separated, no spaces)")
],
options = [
("known", "return only the known words, in order of appearance"),
("missing","show the list of unknown words, in order of appearance")
("all", "scan the text for all words, not just a single one"),
("known", "scan the text only for known words, in order of appearance"),
("missing","scan the text for all unknown words, in order of appearance")
]
}),
@@ -839,6 +844,18 @@ pgfCommands = Map.fromList [
morphos pgf opts s =
[(s,lookupMorpho concr s) | concr <- optLangs pgf opts]
morphoAll concr s =
[(w,ans) | (_,w,ans,_) <- lookupCohorts concr s]
morphoKnown = morphoClassify True
morphoMissing = morphoClassify False
morphoClassify k concr s =
[w | (_,w,ans,_) <- lookupCohorts concr s, k /= null ans, notLiteral w]
where
notLiteral w = not (all isDigit w)
optClitics opts = case valStrOpts "clitics" "" opts of
"" -> []
cs -> map reverse $ chunks ',' cs
@@ -853,16 +870,6 @@ pgfCommands = Map.fromList [
app (OFlag op (LStr x)) | Just (Right f) <- treeOp pgf op = f x
app _ = id
morphoMissing :: Concr -> [String] -> [String]
morphoMissing = morphoClassify False
morphoKnown :: Concr -> [String] -> [String]
morphoKnown = morphoClassify True
morphoClassify :: Bool -> Concr -> [String] -> [String]
morphoClassify k concr ws = [w | w <- ws, k /= null (lookupMorpho concr w), notLiteral w] where
notLiteral w = not (all isDigit w)
treeOpOptions pgf = [(op,expl) | (op,(expl,Left _)) <- allTreeOps pgf]
treeOpFlags pgf = [(op,expl) | (op,(expl,Right _)) <- allTreeOps pgf]

View File

@@ -825,7 +825,37 @@ void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision,
bool case_sensitive = pgf_is_case_sensitive(concr);
phrasetable_lookup(concr->phrasetable, sentence, case_sensitive, concr->lincats, callback, err);
PgfTextRange range;
range.pos = 0;
range.begin = (uint8_t *) &sentence->text[0];
range.end = (uint8_t *) &sentence->text[sentence->size];
phrasetable_lookup(concr->phrasetable,
&range, case_sensitive,
concr->lincats,
callback, err);
} PGF_API_END
}
PGF_API
void pgf_lookup_cohorts(PgfDB *db, PgfConcrRevision cnc_revision,
PgfText *sentence,
PgfCohortsCallback* callback, PgfExn* err)
{
PGF_API_BEGIN {
DB_scope scope(db, READER_SCOPE);
ref<PgfConcr> concr = db->revision2concr(cnc_revision);
bool case_sensitive = pgf_is_case_sensitive(concr);
PgfTextRange range;
range.pos = 0;
range.begin = (uint8_t *) &sentence->text[0];
range.end = (uint8_t *) &sentence->text[sentence->size];
phrasetable_lookup_prefixes(concr->phrasetable,
&range, case_sensitive,
concr->lincats,
1, sentence->size,
callback, err);
} PGF_API_END
}

View File

@@ -418,6 +418,18 @@ void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision,
PgfText *sentence,
PgfMorphoCallback* callback, PgfExn* err);
typedef struct PgfCohortsCallback PgfCohortsCallback;
struct PgfCohortsCallback {
PgfMorphoCallback morpho;
void (*fn)(PgfCohortsCallback* self, size_t start, size_t end,
PgfExn* err);
};
PGF_API_DECL
void pgf_lookup_cohorts(PgfDB *db, PgfConcrRevision cnc_revision,
PgfText *sentence,
PgfCohortsCallback* callback, PgfExn* err);
PGF_API_DECL
PgfPhrasetableIds *pgf_iter_sequences(PgfDB *db, PgfConcrRevision cnc_revision,
PgfSequenceItor *itor,

View File

@@ -228,14 +228,11 @@ int sequence_cmp(ref<PgfSequence> seq1, ref<PgfSequence> seq2)
}
static
int text_cmp(PgfText *sentence, ref<PgfSequence> seq,
bool case_sensitive)
int text_range_cmp(PgfTextRange *range, ref<PgfSequence> seq,
bool case_sensitive)
{
int res1 = 0;
const uint8_t *s1 = (uint8_t *) &sentence->text;
const uint8_t *e1 = s1+sentence->size;
size_t i = 0;
const uint8_t *s2 = NULL;
const uint8_t *e2 = NULL;
@@ -243,13 +240,13 @@ int text_cmp(PgfText *sentence, ref<PgfSequence> seq,
size_t count = 0;
for (;;) {
if (s1 >= e1) {
if (range->begin >= range->end) {
if (s2 < e2 || i < seq->syms.len)
return -1;
return case_sensitive ? res1 : 0;
}
uint32_t ucs1 = pgf_utf8_decode(&s1);
uint32_t ucs1 = pgf_utf8_decode(&range->begin); range->pos++;
uint32_t ucs1i = pgf_utf8_to_upper(ucs1);
if (s2 >= e2) {
@@ -469,7 +466,7 @@ size_t phrasetable_size(PgfPhrasetable table)
PGF_INTERNAL
void phrasetable_lookup(PgfPhrasetable table,
PgfText *sentence,
PgfTextRange *sentence,
bool case_sensitive,
Namespace<PgfConcrLincat> lincats,
PgfMorphoCallback* callback, PgfExn* err)
@@ -477,7 +474,8 @@ void phrasetable_lookup(PgfPhrasetable table,
if (table == 0)
return;
int cmp = text_cmp(sentence,table->value.seq,case_sensitive);
PgfTextRange current = *sentence;
int cmp = text_range_cmp(&current,table->value.seq,case_sensitive);
if (cmp < 0) {
phrasetable_lookup(table->left,sentence,case_sensitive,lincats,callback,err);
} else if (cmp > 0) {
@@ -521,6 +519,71 @@ void phrasetable_lookup(PgfPhrasetable table,
}
}
PGF_INTERNAL
void phrasetable_lookup_prefixes(PgfPhrasetable table,
PgfTextRange *sentence,
bool case_sensitive,
Namespace<PgfConcrLincat> lincats,
ptrdiff_t min, ptrdiff_t max,
PgfCohortsCallback* callback, PgfExn* err)
{
if (table == 0)
return;
PgfTextRange current = *sentence;
int cmp = text_range_cmp(&current,table->value.seq,case_sensitive);
if (cmp < 0) {
phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,max,callback,err);
} else if (cmp > 0) {
ptrdiff_t len = current.begin - sentence->begin;
if (min <= len)
phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,len,callback,err);
if (len <= max)
phrasetable_lookup_prefixes(table->right,sentence,case_sensitive,lincats,len,max,callback,err);
} else {
ptrdiff_t len = current.begin - sentence->begin;
if (min <= len)
phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,len,callback,err);
auto backrefs = table->value.backrefs;
if (backrefs != 0) {
for (size_t i = 0; i < backrefs->len; i++) {
PgfSequenceBackref backref = *vector_elem<PgfSequenceBackref>(backrefs,i);
switch (ref<PgfConcrLin>::get_tag(backref.container)) {
case PgfConcrLin::tag: {
ref<PgfConcrLin> lin = ref<PgfConcrLin>::untagged(backref.container);
ref<PgfConcrLincat> lincat =
namespace_lookup(lincats, &lin->absfun->type->name);
if (lincat != 0) {
ref<PgfText> field =
*vector_elem(lincat->fields, backref.seq_index % lincat->fields->len);
callback->morpho.fn(&callback->morpho, &lin->absfun->name, &(*field), lincat->abscat->prob+lin->absfun->prob, err);
if (err->type != PGF_EXN_NONE)
return;
}
break;
}
case PgfConcrLincat::tag: {
//ignore
break;
}
}
}
callback->fn(callback, sentence->pos, current.pos, err);
if (err->type != PGF_EXN_NONE)
return;
}
if (len <= max)
phrasetable_lookup_prefixes(table->right,sentence,case_sensitive,lincats,len,max,callback,err);
}
}
PGF_INTERNAL
void phrasetable_iter(PgfConcr *concr,
PgfPhrasetable table,

View File

@@ -68,13 +68,27 @@ PgfPhrasetable phrasetable_delete(PgfPhrasetable table,
PGF_INTERNAL_DECL
size_t phrasetable_size(PgfPhrasetable table);
typedef struct {
size_t pos; // position in Unicode characters
const uint8_t *begin; // pointer into the beginning of the range
const uint8_t *end; // pointer into the end of the range
} PgfTextRange;
PGF_INTERNAL_DECL
void phrasetable_lookup(PgfPhrasetable table,
PgfText *sentence,
PgfTextRange *sentence,
bool case_sensitive,
Namespace<struct PgfConcrLincat> lincats,
PgfMorphoCallback* callback, PgfExn* err);
PGF_INTERNAL_DECL
void phrasetable_lookup_prefixes(PgfPhrasetable table,
PgfTextRange *sentence,
bool case_sensitive,
Namespace<PgfConcrLincat> lincats,
ptrdiff_t min, ptrdiff_t max,
PgfCohortsCallback* callback, PgfExn* err);
PGF_INTERNAL_DECL
void phrasetable_iter(PgfConcr *concr,
PgfPhrasetable table,

View File

@@ -493,13 +493,6 @@ lookupMorpho c sent = unsafePerformIO $ do
(#poke PgfMorphoCallback, fn) itor fptr
withPgfExn "lookupMorpho" (pgf_lookup_morpho (c_db c) c_revision c_sent itor))
fmap reverse (readIORef ref)
where
getMorphology ref _ c_name c_field c_prob exn = do
name <- peekText c_name
field <- peekText c_field
let prob = realToFrac c_prob
ann = (name,field,prob)
modifyIORef ref ((:) ann)
-- | 'lookupCohorts' takes an arbitrary string an produces
-- a list of all places where lexical items from the grammar have been
@@ -511,7 +504,33 @@ lookupMorpho c sent = unsafePerformIO $ do
-- by the @end@ position. This can be used for instance if you want to
-- filter only the longest matches.
lookupCohorts :: Concr -> String -> [(Int,String,[MorphoAnalysis],Int)]
lookupCohorts = error "TODO: lookupCohorts"
lookupCohorts c sent = unsafePerformIO $ do
morpho_ref <- newIORef []
cohorts_ref <- newIORef []
(withText sent $ \c_sent ->
allocaBytes (#size PgfCohortsCallback) $ \itor ->
bracket (wrapMorphoCallback (getMorphology morpho_ref)) freeHaskellFunPtr $ \morpho_fptr ->
bracket (wrapCohortsCallback (getCohorts morpho_ref cohorts_ref)) freeHaskellFunPtr $ \cohorts_fptr ->
withForeignPtr (c_revision c) $ \c_revision -> do
(#poke PgfCohortsCallback, morpho.fn) itor morpho_fptr
(#poke PgfCohortsCallback, fn) itor cohorts_fptr
withPgfExn "lookupCohorts" (pgf_lookup_cohorts (c_db c) c_revision c_sent itor))
fmap reverse (readIORef cohorts_ref)
where
getCohorts morpho_ref cohorts_ref _ start' end' exn = do
ans <- readIORef morpho_ref
let start = fromIntegral start'
end = fromIntegral end'
word = take (end-start) (drop start sent)
modifyIORef cohorts_ref ((:) (start, word, reverse ans, end))
writeIORef morpho_ref []
getMorphology ref _ c_name c_field c_prob exn = do
name <- peekText c_name
field <- peekText c_field
let prob = realToFrac c_prob
ann = (name,field,prob)
modifyIORef ref ((:) ann)
filterBest :: [(Int,String,[MorphoAnalysis],Int)] -> [(Int,String,[MorphoAnalysis],Int)]
filterBest ans =

View File

@@ -47,6 +47,7 @@ data PgfLinearizationOutputIface
data PgfGraphvizOptions
data PgfSequenceItor
data PgfMorphoCallback
data PgfCohortsCallback
data PgfPhrasetableIds
type Wrapper a = a -> IO (FunPtr a)
@@ -121,6 +122,12 @@ foreign import ccall "wrapper" wrapMorphoCallback :: Wrapper MorphoCallback
foreign import ccall pgf_lookup_morpho :: Ptr PgfDB -> Ptr Concr -> Ptr PgfText -> Ptr PgfMorphoCallback -> Ptr PgfExn -> IO ()
type CohortsCallback = Ptr PgfCohortsCallback -> CSize -> CSize -> Ptr PgfExn -> IO ()
foreign import ccall "wrapper" wrapCohortsCallback :: Wrapper CohortsCallback
foreign import ccall pgf_lookup_cohorts :: Ptr PgfDB -> Ptr Concr -> Ptr PgfText -> Ptr PgfCohortsCallback -> Ptr PgfExn -> IO ()
foreign import ccall pgf_iter_sequences :: Ptr PgfDB -> Ptr Concr -> Ptr PgfSequenceItor -> Ptr PgfMorphoCallback -> Ptr PgfExn -> IO (Ptr PgfPhrasetableIds)
foreign import ccall pgf_get_lincat_counts_internal :: Ptr () -> Ptr CSize -> IO ()