started with lookupCohorts

This commit is contained in:
Krasimir Angelov
2022-07-07 14:03:07 +02:00
parent c783da51a4
commit a66693770c
7 changed files with 189 additions and 37 deletions

View File

@@ -283,14 +283,18 @@ pgfCommands = Map.fromList [
exec = needPGF $ \opts ts pgf -> do exec = needPGF $ \opts ts pgf -> do
concr <- optLang pgf opts concr <- optLang pgf opts
case opts of case opts of
_ | isOpt "missing" opts -> _ | isOpt "all" opts ->
return . fromString . unwords . return . fromString . unlines .
morphoMissing concr . map prMorphoAnalysis . concatMap (morphoAll concr) $
concatMap words $ toStrings ts toStrings ts
_ | isOpt "known" opts -> _ | isOpt "known" opts ->
return . fromString . unwords . return . fromString . unwords .
morphoKnown concr . concatMap (morphoKnown concr) $
concatMap words $ toStrings ts toStrings ts
_ | isOpt "missing" opts ->
return . fromString . unwords .
concatMap (morphoMissing concr) $
toStrings ts
_ -> return . fromString . unlines . _ -> return . fromString . unlines .
map prMorphoAnalysis . concatMap (morphos pgf opts) $ map prMorphoAnalysis . concatMap (morphos pgf opts) $
toStrings ts, toStrings ts,
@@ -298,8 +302,9 @@ pgfCommands = Map.fromList [
("lang","the languages of analysis (comma-separated, no spaces)") ("lang","the languages of analysis (comma-separated, no spaces)")
], ],
options = [ options = [
("known", "return only the known words, in order of appearance"), ("all", "scan the text for all words, not just a single one"),
("missing","show the list of unknown words, in order of appearance") ("known", "scan the text only for known words, in order of appearance"),
("missing","scan the text for all unknown words, in order of appearance")
] ]
}), }),
@@ -839,6 +844,18 @@ pgfCommands = Map.fromList [
morphos pgf opts s = morphos pgf opts s =
[(s,lookupMorpho concr s) | concr <- optLangs pgf opts] [(s,lookupMorpho concr s) | concr <- optLangs pgf opts]
-- Pair every word reported by lookupCohorts with its list of analyses,
-- dropping the start and end positions of the cohort.
morphoAll concr s =
[(w,ans) | (_,w,ans,_) <- lookupCohorts concr s]
-- Words reported by lookupCohorts that carry at least one analysis.
morphoKnown = morphoClassify True
-- Words reported by lookupCohorts that carry no analysis at all.
morphoMissing = morphoClassify False
-- Shared filter behind morphoKnown and morphoMissing. The test
-- `k /= null ans` keeps a word when k is True and it has analyses,
-- or when k is False and it has none.
-- NOTE(review): only words that lookupCohorts actually reports are seen
-- here; confirm that words absent from the lexicon are surfaced by it,
-- otherwise the "missing" option cannot list them.
morphoClassify k concr s =
[w | (_,w,ans,_) <- lookupCohorts concr s, k /= null ans, notLiteral w]
where
-- Words consisting solely of digits are skipped (this also rejects the
-- empty word, since `all isDigit ""` is True).
notLiteral w = not (all isDigit w)
optClitics opts = case valStrOpts "clitics" "" opts of optClitics opts = case valStrOpts "clitics" "" opts of
"" -> [] "" -> []
cs -> map reverse $ chunks ',' cs cs -> map reverse $ chunks ',' cs
@@ -853,16 +870,6 @@ pgfCommands = Map.fromList [
app (OFlag op (LStr x)) | Just (Right f) <- treeOp pgf op = f x app (OFlag op (LStr x)) | Just (Right f) <- treeOp pgf op = f x
app _ = id app _ = id
morphoMissing :: Concr -> [String] -> [String]
morphoMissing = morphoClassify False
morphoKnown :: Concr -> [String] -> [String]
morphoKnown = morphoClassify True
morphoClassify :: Bool -> Concr -> [String] -> [String]
morphoClassify k concr ws = [w | w <- ws, k /= null (lookupMorpho concr w), notLiteral w] where
notLiteral w = not (all isDigit w)
treeOpOptions pgf = [(op,expl) | (op,(expl,Left _)) <- allTreeOps pgf] treeOpOptions pgf = [(op,expl) | (op,(expl,Left _)) <- allTreeOps pgf]
treeOpFlags pgf = [(op,expl) | (op,(expl,Right _)) <- allTreeOps pgf] treeOpFlags pgf = [(op,expl) | (op,(expl,Right _)) <- allTreeOps pgf]

View File

@@ -825,7 +825,37 @@ void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision,
bool case_sensitive = pgf_is_case_sensitive(concr); bool case_sensitive = pgf_is_case_sensitive(concr);
phrasetable_lookup(concr->phrasetable, sentence, case_sensitive, concr->lincats, callback, err); PgfTextRange range;
range.pos = 0;
range.begin = (uint8_t *) &sentence->text[0];
range.end = (uint8_t *) &sentence->text[sentence->size];
phrasetable_lookup(concr->phrasetable,
&range, case_sensitive,
concr->lincats,
callback, err);
} PGF_API_END
}
// Scan `sentence` for lexical items of the concrete syntax identified by
// `cnc_revision` and report them through `callback`.
//
// The lookup runs inside a reader scope on `db`. The whole sentence is
// wrapped in a PgfTextRange starting at character position 0 and handed to
// phrasetable_lookup_prefixes with length bounds 1 .. sentence->size.
// Errors are reported through `err`.
PGF_API
void pgf_lookup_cohorts(PgfDB *db, PgfConcrRevision cnc_revision,
PgfText *sentence,
PgfCohortsCallback* callback, PgfExn* err)
{
PGF_API_BEGIN {
DB_scope scope(db, READER_SCOPE);
ref<PgfConcr> concr = db->revision2concr(cnc_revision);
bool case_sensitive = pgf_is_case_sensitive(concr);
// The range covers all bytes of the sentence; `pos` counts characters
// and starts at zero.
PgfTextRange range;
range.pos = 0;
range.begin = (uint8_t *) &sentence->text[0];
range.end = (uint8_t *) &sentence->text[sentence->size];
// NOTE(review): the range always starts at the beginning of the
// sentence, so only matches anchored at position 0 appear to be
// searched here — confirm whether later start positions are intended.
phrasetable_lookup_prefixes(concr->phrasetable,
&range, case_sensitive,
concr->lincats,
1, sentence->size,
callback, err);
} PGF_API_END } PGF_API_END
} }

View File

@@ -418,6 +418,18 @@ void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision,
PgfText *sentence, PgfText *sentence,
PgfMorphoCallback* callback, PgfExn* err); PgfMorphoCallback* callback, PgfExn* err);
// Callback interface consumed by pgf_lookup_cohorts.
typedef struct PgfCohortsCallback PgfCohortsCallback;
struct PgfCohortsCallback {
// Embedded morphology callback; its `fn` receives each individual
// analysis found for a matched span.
PgfMorphoCallback morpho;
// Called with the start and end positions of a matched span, after the
// analyses for that span have been delivered through `morpho`.
void (*fn)(PgfCohortsCallback* self, size_t start, size_t end,
PgfExn* err);
};
// Looks up lexical items of the concrete syntax `cnc_revision` inside
// `sentence`. Analyses are delivered through `callback->morpho` and matched
// spans through `callback->fn`; failures are reported via `err`.
PGF_API_DECL
void pgf_lookup_cohorts(PgfDB *db, PgfConcrRevision cnc_revision,
PgfText *sentence,
PgfCohortsCallback* callback, PgfExn* err);
PGF_API_DECL PGF_API_DECL
PgfPhrasetableIds *pgf_iter_sequences(PgfDB *db, PgfConcrRevision cnc_revision, PgfPhrasetableIds *pgf_iter_sequences(PgfDB *db, PgfConcrRevision cnc_revision,
PgfSequenceItor *itor, PgfSequenceItor *itor,

View File

@@ -228,14 +228,11 @@ int sequence_cmp(ref<PgfSequence> seq1, ref<PgfSequence> seq2)
} }
static static
int text_cmp(PgfText *sentence, ref<PgfSequence> seq, int text_range_cmp(PgfTextRange *range, ref<PgfSequence> seq,
bool case_sensitive) bool case_sensitive)
{ {
int res1 = 0; int res1 = 0;
const uint8_t *s1 = (uint8_t *) &sentence->text;
const uint8_t *e1 = s1+sentence->size;
size_t i = 0; size_t i = 0;
const uint8_t *s2 = NULL; const uint8_t *s2 = NULL;
const uint8_t *e2 = NULL; const uint8_t *e2 = NULL;
@@ -243,13 +240,13 @@ int text_cmp(PgfText *sentence, ref<PgfSequence> seq,
size_t count = 0; size_t count = 0;
for (;;) { for (;;) {
if (s1 >= e1) { if (range->begin >= range->end) {
if (s2 < e2 || i < seq->syms.len) if (s2 < e2 || i < seq->syms.len)
return -1; return -1;
return case_sensitive ? res1 : 0; return case_sensitive ? res1 : 0;
} }
uint32_t ucs1 = pgf_utf8_decode(&s1); uint32_t ucs1 = pgf_utf8_decode(&range->begin); range->pos++;
uint32_t ucs1i = pgf_utf8_to_upper(ucs1); uint32_t ucs1i = pgf_utf8_to_upper(ucs1);
if (s2 >= e2) { if (s2 >= e2) {
@@ -469,7 +466,7 @@ size_t phrasetable_size(PgfPhrasetable table)
PGF_INTERNAL PGF_INTERNAL
void phrasetable_lookup(PgfPhrasetable table, void phrasetable_lookup(PgfPhrasetable table,
PgfText *sentence, PgfTextRange *sentence,
bool case_sensitive, bool case_sensitive,
Namespace<PgfConcrLincat> lincats, Namespace<PgfConcrLincat> lincats,
PgfMorphoCallback* callback, PgfExn* err) PgfMorphoCallback* callback, PgfExn* err)
@@ -477,7 +474,8 @@ void phrasetable_lookup(PgfPhrasetable table,
if (table == 0) if (table == 0)
return; return;
int cmp = text_cmp(sentence,table->value.seq,case_sensitive); PgfTextRange current = *sentence;
int cmp = text_range_cmp(&current,table->value.seq,case_sensitive);
if (cmp < 0) { if (cmp < 0) {
phrasetable_lookup(table->left,sentence,case_sensitive,lincats,callback,err); phrasetable_lookup(table->left,sentence,case_sensitive,lincats,callback,err);
} else if (cmp > 0) { } else if (cmp > 0) {
@@ -521,6 +519,71 @@ void phrasetable_lookup(PgfPhrasetable table,
} }
} }
// Recursively walk the phrase table (a binary tree reached through
// `left`/`right`) looking for sequences that match the text in `sentence`.
//
// For every node whose sequence compares equal to the range, each backref
// that points to a PgfConcrLin with a known lincat is reported through
// callback->morpho.fn, and then callback->fn is called once with the start
// and end character positions of the match.
//
// `min` and `max` are byte lengths used to decide which subtrees are still
// worth visiting.
// NOTE(review): the exact invariant behind min/max pruning is not evident
// from this function alone — confirm against text_range_cmp.
//
// The walk stops early as soon as `err` carries an exception.
PGF_INTERNAL
void phrasetable_lookup_prefixes(PgfPhrasetable table,
PgfTextRange *sentence,
bool case_sensitive,
Namespace<PgfConcrLincat> lincats,
ptrdiff_t min, ptrdiff_t max,
PgfCohortsCallback* callback, PgfExn* err)
{
if (table == 0)
return;
// Compare on a copy so that `sentence` keeps pointing at the start of the
// text; text_range_cmp advances the range it is given.
PgfTextRange current = *sentence;
int cmp = text_range_cmp(&current,table->value.seq,case_sensitive);
if (cmp < 0) {
phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,max,callback,err);
} else if (cmp > 0) {
// Number of bytes the comparison consumed before it stopped.
ptrdiff_t len = current.begin - sentence->begin;
if (min <= len)
phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,len,callback,err);
if (len <= max)
phrasetable_lookup_prefixes(table->right,sentence,case_sensitive,lincats,len,max,callback,err);
} else {
// Number of bytes the comparison consumed for this match.
ptrdiff_t len = current.begin - sentence->begin;
if (min <= len)
phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,len,callback,err);
auto backrefs = table->value.backrefs;
if (backrefs != 0) {
for (size_t i = 0; i < backrefs->len; i++) {
PgfSequenceBackref backref = *vector_elem<PgfSequenceBackref>(backrefs,i);
switch (ref<PgfConcrLin>::get_tag(backref.container)) {
case PgfConcrLin::tag: {
ref<PgfConcrLin> lin = ref<PgfConcrLin>::untagged(backref.container);
ref<PgfConcrLincat> lincat =
namespace_lookup(lincats, &lin->absfun->type->name);
if (lincat != 0) {
// The field name is selected by the sequence index modulo the
// number of fields in the lincat.
ref<PgfText> field =
*vector_elem(lincat->fields, backref.seq_index % lincat->fields->len);
callback->morpho.fn(&callback->morpho, &lin->absfun->name, &(*field), lincat->abscat->prob+lin->absfun->prob, err);
if (err->type != PGF_EXN_NONE)
return;
}
break;
}
case PgfConcrLincat::tag: {
//ignore
break;
}
}
}
// Close the cohort: report the matched span in character positions.
// This fires even when none of the backrefs produced an analysis.
callback->fn(callback, sentence->pos, current.pos, err);
if (err->type != PGF_EXN_NONE)
return;
}
if (len <= max)
phrasetable_lookup_prefixes(table->right,sentence,case_sensitive,lincats,len,max,callback,err);
}
}
PGF_INTERNAL PGF_INTERNAL
void phrasetable_iter(PgfConcr *concr, void phrasetable_iter(PgfConcr *concr,
PgfPhrasetable table, PgfPhrasetable table,

View File

@@ -68,13 +68,27 @@ PgfPhrasetable phrasetable_delete(PgfPhrasetable table,
PGF_INTERNAL_DECL PGF_INTERNAL_DECL
size_t phrasetable_size(PgfPhrasetable table); size_t phrasetable_size(PgfPhrasetable table);
// A cursor over a span of UTF-8 encoded text. Comparison routines advance
// `begin` as they decode characters and bump `pos` once per decoded
// character, so a copy should be passed when the original start is needed.
typedef struct {
size_t pos; // position in Unicode characters
const uint8_t *begin; // pointer into the beginning of the range
const uint8_t *end; // pointer into the end of the range
} PgfTextRange;
PGF_INTERNAL_DECL PGF_INTERNAL_DECL
void phrasetable_lookup(PgfPhrasetable table, void phrasetable_lookup(PgfPhrasetable table,
PgfText *sentence, PgfTextRange *sentence,
bool case_sensitive, bool case_sensitive,
Namespace<struct PgfConcrLincat> lincats, Namespace<struct PgfConcrLincat> lincats,
PgfMorphoCallback* callback, PgfExn* err); PgfMorphoCallback* callback, PgfExn* err);
// Searches `table` for sequences matching the text in `sentence` and
// reports each match through `callback` (analyses via callback->morpho,
// the matched span via callback->fn). `min` and `max` are byte-length
// bounds used to prune the search; failures are reported via `err`.
PGF_INTERNAL_DECL
void phrasetable_lookup_prefixes(PgfPhrasetable table,
PgfTextRange *sentence,
bool case_sensitive,
Namespace<PgfConcrLincat> lincats,
ptrdiff_t min, ptrdiff_t max,
PgfCohortsCallback* callback, PgfExn* err);
PGF_INTERNAL_DECL PGF_INTERNAL_DECL
void phrasetable_iter(PgfConcr *concr, void phrasetable_iter(PgfConcr *concr,
PgfPhrasetable table, PgfPhrasetable table,

View File

@@ -493,13 +493,6 @@ lookupMorpho c sent = unsafePerformIO $ do
(#poke PgfMorphoCallback, fn) itor fptr (#poke PgfMorphoCallback, fn) itor fptr
withPgfExn "lookupMorpho" (pgf_lookup_morpho (c_db c) c_revision c_sent itor)) withPgfExn "lookupMorpho" (pgf_lookup_morpho (c_db c) c_revision c_sent itor))
fmap reverse (readIORef ref) fmap reverse (readIORef ref)
where
getMorphology ref _ c_name c_field c_prob exn = do
name <- peekText c_name
field <- peekText c_field
let prob = realToFrac c_prob
ann = (name,field,prob)
modifyIORef ref ((:) ann)
-- | 'lookupCohorts' takes an arbitrary string and produces -- | 'lookupCohorts' takes an arbitrary string and produces
-- a list of all places where lexical items from the grammar have been -- a list of all places where lexical items from the grammar have been
@@ -511,7 +504,33 @@ lookupMorpho c sent = unsafePerformIO $ do
-- by the @end@ position. This can be used for instance if you want to -- by the @end@ position. This can be used for instance if you want to
-- filter only the longest matches. -- filter only the longest matches.
lookupCohorts :: Concr -> String -> [(Int,String,[MorphoAnalysis],Int)] lookupCohorts :: Concr -> String -> [(Int,String,[MorphoAnalysis],Int)]
lookupCohorts = error "TODO: lookupCohorts" lookupCohorts c sent = unsafePerformIO $ do
-- morpho_ref collects the analyses of the cohort currently being built;
-- cohorts_ref collects the finished cohorts. Both are built by consing,
-- hence the reversals when they are read back.
morpho_ref <- newIORef []
cohorts_ref <- newIORef []
-- Allocate the C callback structure, wrap the two Haskell callbacks as
-- function pointers (released again by bracket), store them in the
-- structure and run the C lookup.
(withText sent $ \c_sent ->
allocaBytes (#size PgfCohortsCallback) $ \itor ->
bracket (wrapMorphoCallback (getMorphology morpho_ref)) freeHaskellFunPtr $ \morpho_fptr ->
bracket (wrapCohortsCallback (getCohorts morpho_ref cohorts_ref)) freeHaskellFunPtr $ \cohorts_fptr ->
withForeignPtr (c_revision c) $ \c_revision -> do
(#poke PgfCohortsCallback, morpho.fn) itor morpho_fptr
(#poke PgfCohortsCallback, fn) itor cohorts_fptr
withPgfExn "lookupCohorts" (pgf_lookup_cohorts (c_db c) c_revision c_sent itor))
fmap reverse (readIORef cohorts_ref)
where
-- Called once per matched span: take the analyses gathered so far, cut the
-- matched word out of the input by character positions, record the cohort
-- and reset the analysis accumulator for the next span.
getCohorts morpho_ref cohorts_ref _ start' end' exn = do
ans <- readIORef morpho_ref
let start = fromIntegral start'
end = fromIntegral end'
word = take (end-start) (drop start sent)
modifyIORef cohorts_ref ((:) (start, word, reverse ans, end))
writeIORef morpho_ref []
-- Called once per analysis: read the function name, field name and
-- probability from C and push them onto the accumulator.
getMorphology ref _ c_name c_field c_prob exn = do
name <- peekText c_name
field <- peekText c_field
let prob = realToFrac c_prob
ann = (name,field,prob)
modifyIORef ref ((:) ann)
filterBest :: [(Int,String,[MorphoAnalysis],Int)] -> [(Int,String,[MorphoAnalysis],Int)] filterBest :: [(Int,String,[MorphoAnalysis],Int)] -> [(Int,String,[MorphoAnalysis],Int)]
filterBest ans = filterBest ans =

View File

@@ -47,6 +47,7 @@ data PgfLinearizationOutputIface
data PgfGraphvizOptions data PgfGraphvizOptions
data PgfSequenceItor data PgfSequenceItor
data PgfMorphoCallback data PgfMorphoCallback
data PgfCohortsCallback
data PgfPhrasetableIds data PgfPhrasetableIds
type Wrapper a = a -> IO (FunPtr a) type Wrapper a = a -> IO (FunPtr a)
@@ -121,6 +122,12 @@ foreign import ccall "wrapper" wrapMorphoCallback :: Wrapper MorphoCallback
foreign import ccall pgf_lookup_morpho :: Ptr PgfDB -> Ptr Concr -> Ptr PgfText -> Ptr PgfMorphoCallback -> Ptr PgfExn -> IO () foreign import ccall pgf_lookup_morpho :: Ptr PgfDB -> Ptr Concr -> Ptr PgfText -> Ptr PgfMorphoCallback -> Ptr PgfExn -> IO ()
-- Haskell-side shape of the cohort callback: it receives the callback
-- structure itself, the start and end positions of a match, and the
-- exception slot.
type CohortsCallback = Ptr PgfCohortsCallback -> CSize -> CSize -> Ptr PgfExn -> IO ()
-- Turns a Haskell CohortsCallback into a C function pointer; the caller is
-- responsible for releasing it.
foreign import ccall "wrapper" wrapCohortsCallback :: Wrapper CohortsCallback
-- Binding to the C entry point that scans a text for lexical items.
foreign import ccall pgf_lookup_cohorts :: Ptr PgfDB -> Ptr Concr -> Ptr PgfText -> Ptr PgfCohortsCallback -> Ptr PgfExn -> IO ()
foreign import ccall pgf_iter_sequences :: Ptr PgfDB -> Ptr Concr -> Ptr PgfSequenceItor -> Ptr PgfMorphoCallback -> Ptr PgfExn -> IO (Ptr PgfPhrasetableIds) foreign import ccall pgf_iter_sequences :: Ptr PgfDB -> Ptr Concr -> Ptr PgfSequenceItor -> Ptr PgfMorphoCallback -> Ptr PgfExn -> IO (Ptr PgfPhrasetableIds)
foreign import ccall pgf_get_lincat_counts_internal :: Ptr () -> Ptr CSize -> IO () foreign import ccall pgf_get_lincat_counts_internal :: Ptr () -> Ptr CSize -> IO ()