mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-16 00:09:31 -06:00
started with lookupCohorts
This commit is contained in:
@@ -283,14 +283,18 @@ pgfCommands = Map.fromList [
|
||||
exec = needPGF $ \opts ts pgf -> do
|
||||
concr <- optLang pgf opts
|
||||
case opts of
|
||||
_ | isOpt "missing" opts ->
|
||||
return . fromString . unwords .
|
||||
morphoMissing concr .
|
||||
concatMap words $ toStrings ts
|
||||
_ | isOpt "all" opts ->
|
||||
return . fromString . unlines .
|
||||
map prMorphoAnalysis . concatMap (morphoAll concr) $
|
||||
toStrings ts
|
||||
_ | isOpt "known" opts ->
|
||||
return . fromString . unwords .
|
||||
morphoKnown concr .
|
||||
concatMap words $ toStrings ts
|
||||
concatMap (morphoKnown concr) $
|
||||
toStrings ts
|
||||
_ | isOpt "missing" opts ->
|
||||
return . fromString . unwords .
|
||||
concatMap (morphoMissing concr) $
|
||||
toStrings ts
|
||||
_ -> return . fromString . unlines .
|
||||
map prMorphoAnalysis . concatMap (morphos pgf opts) $
|
||||
toStrings ts,
|
||||
@@ -298,8 +302,9 @@ pgfCommands = Map.fromList [
|
||||
("lang","the languages of analysis (comma-separated, no spaces)")
|
||||
],
|
||||
options = [
|
||||
("known", "return only the known words, in order of appearance"),
|
||||
("missing","show the list of unknown words, in order of appearance")
|
||||
("all", "scan the text for all words, not just a single one"),
|
||||
("known", "scan the text only for known words, in order of appearance"),
|
||||
("missing","scan the text for all unknown words, in order of appearance")
|
||||
]
|
||||
}),
|
||||
|
||||
@@ -839,6 +844,18 @@ pgfCommands = Map.fromList [
|
||||
morphos pgf opts s =
|
||||
[(s,lookupMorpho concr s) | concr <- optLangs pgf opts]
|
||||
|
||||
morphoAll concr s =
|
||||
[(w,ans) | (_,w,ans,_) <- lookupCohorts concr s]
|
||||
|
||||
morphoKnown = morphoClassify True
|
||||
|
||||
morphoMissing = morphoClassify False
|
||||
|
||||
morphoClassify k concr s =
|
||||
[w | (_,w,ans,_) <- lookupCohorts concr s, k /= null ans, notLiteral w]
|
||||
where
|
||||
notLiteral w = not (all isDigit w)
|
||||
|
||||
optClitics opts = case valStrOpts "clitics" "" opts of
|
||||
"" -> []
|
||||
cs -> map reverse $ chunks ',' cs
|
||||
@@ -853,16 +870,6 @@ pgfCommands = Map.fromList [
|
||||
app (OFlag op (LStr x)) | Just (Right f) <- treeOp pgf op = f x
|
||||
app _ = id
|
||||
|
||||
morphoMissing :: Concr -> [String] -> [String]
|
||||
morphoMissing = morphoClassify False
|
||||
|
||||
morphoKnown :: Concr -> [String] -> [String]
|
||||
morphoKnown = morphoClassify True
|
||||
|
||||
morphoClassify :: Bool -> Concr -> [String] -> [String]
|
||||
morphoClassify k concr ws = [w | w <- ws, k /= null (lookupMorpho concr w), notLiteral w] where
|
||||
notLiteral w = not (all isDigit w)
|
||||
|
||||
treeOpOptions pgf = [(op,expl) | (op,(expl,Left _)) <- allTreeOps pgf]
|
||||
treeOpFlags pgf = [(op,expl) | (op,(expl,Right _)) <- allTreeOps pgf]
|
||||
|
||||
|
||||
@@ -825,7 +825,37 @@ void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision,
|
||||
|
||||
bool case_sensitive = pgf_is_case_sensitive(concr);
|
||||
|
||||
phrasetable_lookup(concr->phrasetable, sentence, case_sensitive, concr->lincats, callback, err);
|
||||
PgfTextRange range;
|
||||
range.pos = 0;
|
||||
range.begin = (uint8_t *) &sentence->text[0];
|
||||
range.end = (uint8_t *) &sentence->text[sentence->size];
|
||||
phrasetable_lookup(concr->phrasetable,
|
||||
&range, case_sensitive,
|
||||
concr->lincats,
|
||||
callback, err);
|
||||
} PGF_API_END
|
||||
}
|
||||
|
||||
PGF_API
|
||||
void pgf_lookup_cohorts(PgfDB *db, PgfConcrRevision cnc_revision,
|
||||
PgfText *sentence,
|
||||
PgfCohortsCallback* callback, PgfExn* err)
|
||||
{
|
||||
PGF_API_BEGIN {
|
||||
DB_scope scope(db, READER_SCOPE);
|
||||
ref<PgfConcr> concr = db->revision2concr(cnc_revision);
|
||||
|
||||
bool case_sensitive = pgf_is_case_sensitive(concr);
|
||||
|
||||
PgfTextRange range;
|
||||
range.pos = 0;
|
||||
range.begin = (uint8_t *) &sentence->text[0];
|
||||
range.end = (uint8_t *) &sentence->text[sentence->size];
|
||||
phrasetable_lookup_prefixes(concr->phrasetable,
|
||||
&range, case_sensitive,
|
||||
concr->lincats,
|
||||
1, sentence->size,
|
||||
callback, err);
|
||||
} PGF_API_END
|
||||
}
|
||||
|
||||
|
||||
@@ -418,6 +418,18 @@ void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision,
|
||||
PgfText *sentence,
|
||||
PgfMorphoCallback* callback, PgfExn* err);
|
||||
|
||||
typedef struct PgfCohortsCallback PgfCohortsCallback;
|
||||
struct PgfCohortsCallback {
|
||||
PgfMorphoCallback morpho;
|
||||
void (*fn)(PgfCohortsCallback* self, size_t start, size_t end,
|
||||
PgfExn* err);
|
||||
};
|
||||
|
||||
PGF_API_DECL
|
||||
void pgf_lookup_cohorts(PgfDB *db, PgfConcrRevision cnc_revision,
|
||||
PgfText *sentence,
|
||||
PgfCohortsCallback* callback, PgfExn* err);
|
||||
|
||||
PGF_API_DECL
|
||||
PgfPhrasetableIds *pgf_iter_sequences(PgfDB *db, PgfConcrRevision cnc_revision,
|
||||
PgfSequenceItor *itor,
|
||||
|
||||
@@ -228,14 +228,11 @@ int sequence_cmp(ref<PgfSequence> seq1, ref<PgfSequence> seq2)
|
||||
}
|
||||
|
||||
static
|
||||
int text_cmp(PgfText *sentence, ref<PgfSequence> seq,
|
||||
bool case_sensitive)
|
||||
int text_range_cmp(PgfTextRange *range, ref<PgfSequence> seq,
|
||||
bool case_sensitive)
|
||||
{
|
||||
int res1 = 0;
|
||||
|
||||
const uint8_t *s1 = (uint8_t *) &sentence->text;
|
||||
const uint8_t *e1 = s1+sentence->size;
|
||||
|
||||
size_t i = 0;
|
||||
const uint8_t *s2 = NULL;
|
||||
const uint8_t *e2 = NULL;
|
||||
@@ -243,13 +240,13 @@ int text_cmp(PgfText *sentence, ref<PgfSequence> seq,
|
||||
size_t count = 0;
|
||||
|
||||
for (;;) {
|
||||
if (s1 >= e1) {
|
||||
if (range->begin >= range->end) {
|
||||
if (s2 < e2 || i < seq->syms.len)
|
||||
return -1;
|
||||
return case_sensitive ? res1 : 0;
|
||||
}
|
||||
|
||||
uint32_t ucs1 = pgf_utf8_decode(&s1);
|
||||
uint32_t ucs1 = pgf_utf8_decode(&range->begin); range->pos++;
|
||||
uint32_t ucs1i = pgf_utf8_to_upper(ucs1);
|
||||
|
||||
if (s2 >= e2) {
|
||||
@@ -469,7 +466,7 @@ size_t phrasetable_size(PgfPhrasetable table)
|
||||
|
||||
PGF_INTERNAL
|
||||
void phrasetable_lookup(PgfPhrasetable table,
|
||||
PgfText *sentence,
|
||||
PgfTextRange *sentence,
|
||||
bool case_sensitive,
|
||||
Namespace<PgfConcrLincat> lincats,
|
||||
PgfMorphoCallback* callback, PgfExn* err)
|
||||
@@ -477,7 +474,8 @@ void phrasetable_lookup(PgfPhrasetable table,
|
||||
if (table == 0)
|
||||
return;
|
||||
|
||||
int cmp = text_cmp(sentence,table->value.seq,case_sensitive);
|
||||
PgfTextRange current = *sentence;
|
||||
int cmp = text_range_cmp(¤t,table->value.seq,case_sensitive);
|
||||
if (cmp < 0) {
|
||||
phrasetable_lookup(table->left,sentence,case_sensitive,lincats,callback,err);
|
||||
} else if (cmp > 0) {
|
||||
@@ -521,6 +519,71 @@ void phrasetable_lookup(PgfPhrasetable table,
|
||||
}
|
||||
}
|
||||
|
||||
PGF_INTERNAL
|
||||
void phrasetable_lookup_prefixes(PgfPhrasetable table,
|
||||
PgfTextRange *sentence,
|
||||
bool case_sensitive,
|
||||
Namespace<PgfConcrLincat> lincats,
|
||||
ptrdiff_t min, ptrdiff_t max,
|
||||
PgfCohortsCallback* callback, PgfExn* err)
|
||||
{
|
||||
if (table == 0)
|
||||
return;
|
||||
|
||||
PgfTextRange current = *sentence;
|
||||
int cmp = text_range_cmp(¤t,table->value.seq,case_sensitive);
|
||||
if (cmp < 0) {
|
||||
phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,max,callback,err);
|
||||
} else if (cmp > 0) {
|
||||
ptrdiff_t len = current.begin - sentence->begin;
|
||||
|
||||
if (min <= len)
|
||||
phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,len,callback,err);
|
||||
|
||||
if (len <= max)
|
||||
phrasetable_lookup_prefixes(table->right,sentence,case_sensitive,lincats,len,max,callback,err);
|
||||
} else {
|
||||
ptrdiff_t len = current.begin - sentence->begin;
|
||||
|
||||
if (min <= len)
|
||||
phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,len,callback,err);
|
||||
|
||||
auto backrefs = table->value.backrefs;
|
||||
if (backrefs != 0) {
|
||||
for (size_t i = 0; i < backrefs->len; i++) {
|
||||
PgfSequenceBackref backref = *vector_elem<PgfSequenceBackref>(backrefs,i);
|
||||
switch (ref<PgfConcrLin>::get_tag(backref.container)) {
|
||||
case PgfConcrLin::tag: {
|
||||
ref<PgfConcrLin> lin = ref<PgfConcrLin>::untagged(backref.container);
|
||||
ref<PgfConcrLincat> lincat =
|
||||
namespace_lookup(lincats, &lin->absfun->type->name);
|
||||
if (lincat != 0) {
|
||||
ref<PgfText> field =
|
||||
*vector_elem(lincat->fields, backref.seq_index % lincat->fields->len);
|
||||
|
||||
callback->morpho.fn(&callback->morpho, &lin->absfun->name, &(*field), lincat->abscat->prob+lin->absfun->prob, err);
|
||||
if (err->type != PGF_EXN_NONE)
|
||||
return;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case PgfConcrLincat::tag: {
|
||||
//ignore
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
callback->fn(callback, sentence->pos, current.pos, err);
|
||||
if (err->type != PGF_EXN_NONE)
|
||||
return;
|
||||
}
|
||||
|
||||
if (len <= max)
|
||||
phrasetable_lookup_prefixes(table->right,sentence,case_sensitive,lincats,len,max,callback,err);
|
||||
}
|
||||
}
|
||||
|
||||
PGF_INTERNAL
|
||||
void phrasetable_iter(PgfConcr *concr,
|
||||
PgfPhrasetable table,
|
||||
|
||||
@@ -68,13 +68,27 @@ PgfPhrasetable phrasetable_delete(PgfPhrasetable table,
|
||||
PGF_INTERNAL_DECL
|
||||
size_t phrasetable_size(PgfPhrasetable table);
|
||||
|
||||
typedef struct {
|
||||
size_t pos; // position in Unicode characters
|
||||
const uint8_t *begin; // pointer into the beginning of the range
|
||||
const uint8_t *end; // pointer into the end of the range
|
||||
} PgfTextRange;
|
||||
|
||||
PGF_INTERNAL_DECL
|
||||
void phrasetable_lookup(PgfPhrasetable table,
|
||||
PgfText *sentence,
|
||||
PgfTextRange *sentence,
|
||||
bool case_sensitive,
|
||||
Namespace<struct PgfConcrLincat> lincats,
|
||||
PgfMorphoCallback* callback, PgfExn* err);
|
||||
|
||||
PGF_INTERNAL_DECL
|
||||
void phrasetable_lookup_prefixes(PgfPhrasetable table,
|
||||
PgfTextRange *sentence,
|
||||
bool case_sensitive,
|
||||
Namespace<PgfConcrLincat> lincats,
|
||||
ptrdiff_t min, ptrdiff_t max,
|
||||
PgfCohortsCallback* callback, PgfExn* err);
|
||||
|
||||
PGF_INTERNAL_DECL
|
||||
void phrasetable_iter(PgfConcr *concr,
|
||||
PgfPhrasetable table,
|
||||
|
||||
@@ -493,13 +493,6 @@ lookupMorpho c sent = unsafePerformIO $ do
|
||||
(#poke PgfMorphoCallback, fn) itor fptr
|
||||
withPgfExn "lookupMorpho" (pgf_lookup_morpho (c_db c) c_revision c_sent itor))
|
||||
fmap reverse (readIORef ref)
|
||||
where
|
||||
getMorphology ref _ c_name c_field c_prob exn = do
|
||||
name <- peekText c_name
|
||||
field <- peekText c_field
|
||||
let prob = realToFrac c_prob
|
||||
ann = (name,field,prob)
|
||||
modifyIORef ref ((:) ann)
|
||||
|
||||
-- | 'lookupCohorts' takes an arbitrary string an produces
|
||||
-- a list of all places where lexical items from the grammar have been
|
||||
@@ -511,7 +504,33 @@ lookupMorpho c sent = unsafePerformIO $ do
|
||||
-- by the @end@ position. This can be used for instance if you want to
|
||||
-- filter only the longest matches.
|
||||
lookupCohorts :: Concr -> String -> [(Int,String,[MorphoAnalysis],Int)]
|
||||
lookupCohorts = error "TODO: lookupCohorts"
|
||||
lookupCohorts c sent = unsafePerformIO $ do
|
||||
morpho_ref <- newIORef []
|
||||
cohorts_ref <- newIORef []
|
||||
(withText sent $ \c_sent ->
|
||||
allocaBytes (#size PgfCohortsCallback) $ \itor ->
|
||||
bracket (wrapMorphoCallback (getMorphology morpho_ref)) freeHaskellFunPtr $ \morpho_fptr ->
|
||||
bracket (wrapCohortsCallback (getCohorts morpho_ref cohorts_ref)) freeHaskellFunPtr $ \cohorts_fptr ->
|
||||
withForeignPtr (c_revision c) $ \c_revision -> do
|
||||
(#poke PgfCohortsCallback, morpho.fn) itor morpho_fptr
|
||||
(#poke PgfCohortsCallback, fn) itor cohorts_fptr
|
||||
withPgfExn "lookupCohorts" (pgf_lookup_cohorts (c_db c) c_revision c_sent itor))
|
||||
fmap reverse (readIORef cohorts_ref)
|
||||
where
|
||||
getCohorts morpho_ref cohorts_ref _ start' end' exn = do
|
||||
ans <- readIORef morpho_ref
|
||||
let start = fromIntegral start'
|
||||
end = fromIntegral end'
|
||||
word = take (end-start) (drop start sent)
|
||||
modifyIORef cohorts_ref ((:) (start, word, reverse ans, end))
|
||||
writeIORef morpho_ref []
|
||||
|
||||
getMorphology ref _ c_name c_field c_prob exn = do
|
||||
name <- peekText c_name
|
||||
field <- peekText c_field
|
||||
let prob = realToFrac c_prob
|
||||
ann = (name,field,prob)
|
||||
modifyIORef ref ((:) ann)
|
||||
|
||||
filterBest :: [(Int,String,[MorphoAnalysis],Int)] -> [(Int,String,[MorphoAnalysis],Int)]
|
||||
filterBest ans =
|
||||
|
||||
@@ -47,6 +47,7 @@ data PgfLinearizationOutputIface
|
||||
data PgfGraphvizOptions
|
||||
data PgfSequenceItor
|
||||
data PgfMorphoCallback
|
||||
data PgfCohortsCallback
|
||||
data PgfPhrasetableIds
|
||||
|
||||
type Wrapper a = a -> IO (FunPtr a)
|
||||
@@ -121,6 +122,12 @@ foreign import ccall "wrapper" wrapMorphoCallback :: Wrapper MorphoCallback
|
||||
|
||||
foreign import ccall pgf_lookup_morpho :: Ptr PgfDB -> Ptr Concr -> Ptr PgfText -> Ptr PgfMorphoCallback -> Ptr PgfExn -> IO ()
|
||||
|
||||
type CohortsCallback = Ptr PgfCohortsCallback -> CSize -> CSize -> Ptr PgfExn -> IO ()
|
||||
|
||||
foreign import ccall "wrapper" wrapCohortsCallback :: Wrapper CohortsCallback
|
||||
|
||||
foreign import ccall pgf_lookup_cohorts :: Ptr PgfDB -> Ptr Concr -> Ptr PgfText -> Ptr PgfCohortsCallback -> Ptr PgfExn -> IO ()
|
||||
|
||||
foreign import ccall pgf_iter_sequences :: Ptr PgfDB -> Ptr Concr -> Ptr PgfSequenceItor -> Ptr PgfMorphoCallback -> Ptr PgfExn -> IO (Ptr PgfPhrasetableIds)
|
||||
|
||||
foreign import ccall pgf_get_lincat_counts_internal :: Ptr () -> Ptr CSize -> IO ()
|
||||
|
||||
Reference in New Issue
Block a user