first draft for lookupMorpho

This commit is contained in:
Krasimir Angelov
2022-05-30 21:16:34 +02:00
parent 92fbe08f51
commit 18f70b786f
9 changed files with 225 additions and 5 deletions

View File

@@ -32,7 +32,6 @@ import Data.List (sort)
import Control.Monad(mplus)
import qualified Control.Monad.Fail as Fail
class (Functor m,Monad m,MonadSIO m) => HasPGF m where getPGF :: m (Maybe PGF)
instance (Monad m,HasPGF m,Fail.MonadFail m) => TypeCheckArg m where
@@ -292,8 +291,8 @@ pgfCommands = Map.fromList [
morphoKnown concr .
concatMap words $ toStrings ts
_ -> return . fromString . unlines .
map prMorphoAnalysis . concatMap (morphos pgf opts) .
concatMap words $ toStrings ts,
map prMorphoAnalysis . concatMap (morphos pgf opts) $
toStrings ts,
flags = [
("lang","the languages of analysis (comma-separated, no spaces)")
],

View File

@@ -792,6 +792,43 @@ void pgf_iter_lins(PgfDB *db, PgfConcrRevision cnc_revision,
} PGF_API_END
}
/* Reports whether lookups in the given concrete syntax should
 * distinguish letter case.
 *
 * The answer is taken from the concrete-syntax flag "case_sensitive":
 * the result is false only when that flag exists, holds a string
 * literal, and the string is exactly "off". In every other situation
 * (flag absent, non-string value, any other string) the result is true.
 */
static bool
pgf_is_case_sensitive(ref<PgfConcr> concr)
{
    // Build the lookup key as a stack-allocated PgfText: 14 characters
    // for "case_sensitive" plus the terminating NUL written by strcpy.
    PgfText *flag_name = (PgfText *)
        alloca(sizeof(PgfText)+15);
    flag_name->size = 14;
    strcpy(flag_name->text, "case_sensitive");

    ref<PgfFlag> found =
        namespace_lookup(concr->cflags, flag_name);
    if (found == 0)
        return true;    // flag not set

    if (ref<PgfLiteral>::get_tag(found->value) != PgfLiteralStr::tag)
        return true;    // only string values are inspected

    auto str_lit = ref<PgfLiteralStr>::untagged(found->value);
    const bool is_off =
        str_lit->val.size == 3 && strcmp(str_lit->val.text, "off") == 0;
    return !is_off;
}
/* Looks up `sentence` in the phrase table of the concrete syntax
 * identified by `cnc_revision` and reports every match through
 * `callback`.
 *
 * The database is opened with READER_SCOPE for the duration of the
 * call. Whether matching distinguishes letter case is decided by
 * pgf_is_case_sensitive() for the resolved concrete syntax.
 * `err` is forwarded to the phrase-table lookup and to the callback.
 */
PGF_API
void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision,
                       PgfText *sentence,
                       PgfMorphoCallback* callback, PgfExn* err)
{
    PGF_API_BEGIN {
        DB_scope scope(db, READER_SCOPE);

        ref<PgfConcr> concr = db->revision2concr(cnc_revision);

        // Resolve the matching mode once, before walking the table.
        const bool match_case = pgf_is_case_sensitive(concr);

        phrasetable_lookup(concr->phrasetable,
                           sentence,
                           match_case,
                           concr->lincats,
                           callback, err);
    } PGF_API_END
}
PGF_API
PgfPhrasetableIds *pgf_iter_sequences(PgfDB *db, PgfConcrRevision cnc_revision,
PgfSequenceItor *itor,

View File

@@ -410,6 +410,11 @@ struct PgfMorphoCallback {
PgfExn* err);
};
/* Looks up `sentence` in the phrase table of the concrete syntax
 * revision `cnc_revision` and calls `callback->fn` once for every
 * linearization whose sequence matches it. Each call receives the
 * name of the abstract function, the name of the matching lincat
 * field and the sum of the category and function probabilities.
 *
 * Matching ignores letter case only when the concrete syntax has the
 * flag "case_sensitive" set to the string "off".
 * `err` is passed on to the callback; the lookup stops as soon as it
 * no longer holds PGF_EXN_NONE.
 */
PGF_API_DECL
void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision,
PgfText *sentence,
PgfMorphoCallback* callback, PgfExn* err);
PGF_API_DECL
PgfPhrasetableIds *pgf_iter_sequences(PgfDB *db, PgfConcrRevision cnc_revision,
PgfSequenceItor *itor,

View File

@@ -227,6 +227,77 @@ int sequence_cmp(ref<PgfSequence> seq1, ref<PgfSequence> seq2)
return 0;
}
// Compares the UTF-8 text `sentence` with the sequence `seq`, reading
// the sequence as its PgfSymbolKS tokens separated by whitespace.
//
// Returns a negative value when the sentence orders before the
// sequence, a positive value when it orders after it, and 0 when they
// are considered equal. Code points are compared after mapping them
// with pgf_utf8_to_upper; a difference that remains after that mapping
// decides the result immediately. Differences that exist only in
// letter case are remembered in `res1` and are returned at the end
// only when `case_sensitive` is true.
//
// NOTE(review): `res1` is overwritten on every case-only difference,
// so the last such difference wins, not the first. Confirm that this
// agrees with the ordering used when the phrase table is built.
static
int text_cmp(PgfText *sentence, ref<PgfSequence> seq,
bool case_sensitive)
{
// Outcome of the case-only comparison (0 while no such difference).
int res1 = 0;
// [s1,e1) is the unread part of the sentence.
const uint8_t *s1 = (uint8_t *) &sentence->text;
const uint8_t *e1 = s1+sentence->size;
// Index of the next symbol of `seq` that has not been opened yet.
size_t i = 0;
// [s2,e2) is the unread part of the current token; both pointers are
// NULL until the first token has been opened.
const uint8_t *s2 = NULL;
const uint8_t *e2 = NULL;
// Number of whitespace code points skipped since the last token ended.
size_t count = 0;
for (;;) {
if (s1 >= e1) {
// Sentence exhausted: if the sequence still has text or symbols
// left, the sentence is a proper prefix and orders first.
if (s2 < e2 || i < seq->syms.len)
return -1;
return case_sensitive ? res1 : 0;
}
uint32_t ucs1 = pgf_utf8_decode(&s1);
uint32_t ucs1i = pgf_utf8_to_upper(ucs1);
if (s2 >= e2) {
// The current token is used up (or none was opened yet).
// With no symbols left, the sentence is the longer one.
if (i >= seq->syms.len)
return 1;
if (s2 != NULL) {
// Between two tokens: consume whitespace from the sentence
// without touching the sequence.
if (pgf_utf8_is_space(ucs1)) {
count++;
continue;
}
// A non-space code point directly after a token, with no
// whitespace in between, is ordered against ' ', the implied
// separator.
if (count == 0) {
return (((int) ucs1) - ' ');
} else {
count = 0;
}
}
// Only PgfSymbolKS symbols carry text; any other symbol kind
// is ordered by the difference of the tags.
uint8_t t = ref<PgfSymbol>::get_tag(seq->syms.data[i]);
if (t != PgfSymbolKS::tag) {
return ((int) PgfSymbolKS::tag) - ((int) t);
}
// Open the next token.
auto sym_ks = ref<PgfSymbolKS>::untagged(seq->syms.data[i]);
s2 = (uint8_t *) &sym_ks->token.text;
e2 = s2+sym_ks->token.size;
i++;
}
uint32_t ucs2 = pgf_utf8_decode(&s2);
uint32_t ucs2i = pgf_utf8_to_upper(ucs2);
// Primary order: the upper-cased code points.
if (ucs1i > ucs2i) {
return 1;
}
else if (ucs1i < ucs2i) {
return -1;
}
// Equal after upper-casing: remember the raw difference, if any.
else if (ucs1 > ucs2) {
res1 = 1;
} else if (ucs1 < ucs2) {
res1 = -1;
}
}
}
PGF_INTERNAL
PgfPhrasetable phrasetable_internalize(PgfPhrasetable table,
ref<PgfSequence> seq,
@@ -394,6 +465,60 @@ size_t phrasetable_size(PgfPhrasetable table)
return Node<PgfPhrasetableEntry>::size(table);
}
/* Searches the phrase table (a binary tree ordered consistently with
 * text_cmp) for sequences equal to `sentence` and reports the
 * linearizations that use them.
 *
 * For every back-reference of a matching sequence that points to a
 * PgfConcrLin whose result category has an entry in `lincats`,
 * `callback->fn` is called with the function name, the lincat field
 * selected by `seq_index % fields->len`, and the sum of the category
 * and function probabilities. Back-references to other containers
 * are ignored.
 *
 * With `case_sensitive` false, text_cmp reports equality for all case
 * variants, so after a match both subtrees are searched as well.
 * The search stops as soon as `err->type` is not PGF_EXN_NONE.
 */
PGF_INTERNAL
void phrasetable_lookup(PgfPhrasetable table,
                        PgfText *sentence,
                        bool case_sensitive,
                        Namespace<PgfConcrLincat> lincats,
                        PgfMorphoCallback* callback, PgfExn* err)
{
    // Walk down the tree until a node compares equal to the sentence
    // or the search falls off a leaf.
    for (;;) {
        if (table == 0)
            return;

        int order = text_cmp(sentence,table->value.seq,case_sensitive);
        if (order == 0)
            break;

        if (order < 0)
            table = table->left;
        else
            table = table->right;
    }

    auto refs = table->value.backrefs;
    if (refs != 0) {
        for (size_t k = 0; k < refs->len; k++) {
            PgfSequenceBackref br = *vector_elem<PgfSequenceBackref>(refs,k);

            // Only linearizations are reported; lincat back-references
            // (and anything else) are skipped.
            if (ref<PgfConcrLin>::get_tag(br.container) != PgfConcrLin::tag)
                continue;

            ref<PgfConcrLin> lin = ref<PgfConcrLin>::untagged(br.container);
            ref<PgfConcrLincat> lincat =
                namespace_lookup(lincats, &lin->absfun->type->name);
            if (lincat == 0)
                continue;

            ref<PgfText> field =
                *vector_elem(lincat->fields, br.seq_index % lincat->fields->len);

            callback->fn(callback, &lin->absfun->name, &(*field), lincat->abscat->prob+lin->absfun->prob, err);
            if (err->type != PGF_EXN_NONE)
                return;
        }
    }

    if (case_sensitive)
        return;

    // Case-insensitive search: further matches may sit on either side.
    phrasetable_lookup(table->left,sentence,false,lincats,callback,err);
    if (err->type != PGF_EXN_NONE)
        return;

    phrasetable_lookup(table->right,sentence,false,lincats,callback,err);
}
PGF_INTERNAL
void phrasetable_iter(PgfConcr *concr,
PgfPhrasetable table,

View File

@@ -68,6 +68,13 @@ PgfPhrasetable phrasetable_delete(PgfPhrasetable table,
PGF_INTERNAL_DECL
size_t phrasetable_size(PgfPhrasetable table);
/* Searches `table` for sequences equal to `sentence` and calls
 * `callback->fn` for every linearization that refers to a matching
 * sequence and whose category is found in `lincats`.
 * When `case_sensitive` is false, sequences that differ from the
 * sentence only in letter case match as well.
 * The search ends early once `err->type` is not PGF_EXN_NONE.
 */
PGF_INTERNAL_DECL
void phrasetable_lookup(PgfPhrasetable table,
PgfText *sentence,
bool case_sensitive,
Namespace<struct PgfConcrLincat> lincats,
PgfMorphoCallback* callback, PgfExn* err);
PGF_INTERNAL_DECL
void phrasetable_iter(PgfConcr *concr,
PgfPhrasetable table,

View File

@@ -163,6 +163,7 @@ pgf_utf8_encode(uint32_t ucs, uint8_t **buf)
}
}
PGF_INTERNAL
uint32_t pgf_utf8_to_upper(uint32_t c)
{
if (c >= 97 && c <= 122) return (c-32);
@@ -300,3 +301,26 @@ uint32_t pgf_utf8_to_upper(uint32_t c)
if (c >= 71872 && c <= 71903) return (c-32);
return c;
}
PGF_INTERNAL
bool pgf_utf8_is_space(uint32_t c)
{
    // Tells whether the code point `c` counts as whitespace.
    // Accepted are the control characters TAB..CR (U+0009..U+000D)
    // and the space characters listed below; everything else,
    // including U+0085, U+2028 and U+2029, is rejected.

    // U+0009 TAB, U+000A LF, U+000B VT, U+000C FF, U+000D CR
    if (c >= 0x0009 && c <= 0x000D)
        return true;

    // U+2000..U+200A: EN QUAD .. HAIR SPACE
    if (c >= 0x2000 && c <= 0x200A)
        return true;

    switch (c) {
    case 0x0020:    // SPACE
    case 0x00A0:    // NO-BREAK SPACE
    case 0x1680:    // OGHAM SPACE MARK
    case 0x202F:    // NARROW NO-BREAK SPACE
    case 0x205F:    // MEDIUM MATHEMATICAL SPACE
    case 0x3000:    // IDEOGRAPHIC SPACE
        return true;
    default:
        return false;
    }
}

View File

@@ -35,4 +35,7 @@ pgf_utf8_encode(uint32_t ucs, uint8_t** buf);
PGF_INTERNAL_DECL
uint32_t pgf_utf8_to_upper(uint32_t c);
/* Returns true when the code point `c` is treated as whitespace:
 * U+0009..U+000D, U+0020, U+00A0, U+1680, U+2000..U+200A, U+202F,
 * U+205F or U+3000.
 */
PGF_INTERNAL_DECL
bool pgf_utf8_is_space(uint32_t c);
#endif

View File

@@ -484,7 +484,22 @@ type MorphoAnalysis = (Fun,String,Float)
-- a multiword expression. It then computes the list of all possible
-- morphological analyses.
lookupMorpho :: Concr -> String -> [MorphoAnalysis]
lookupMorpho = error "TODO: lookupMorpho"
-- Runs the C function pgf_lookup_morpho on the given string and
-- collects every analysis that it reports through the callback.
lookupMorpho c sent = unsafePerformIO $ do
-- Accumulator for the reported analyses (most recent first).
ref <- newIORef []
(withText sent $ \c_sent ->
-- Allocate the callback structure for the duration of the call.
allocaBytes (#size PgfMorphoCallback) $ \itor ->
-- Wrap the Haskell callback as a C function pointer; bracket makes
-- sure the wrapper is released with freeHaskellFunPtr afterwards.
bracket (wrapMorphoCallback (getMorphology ref)) freeHaskellFunPtr $ \fptr ->
withForeignPtr (c_revision c) $ \c_revision -> do
-- Store the function pointer in the fn field of the structure.
(#poke PgfMorphoCallback, fn) itor fptr
withPgfExn "lookupMorpho" (pgf_lookup_morpho (c_db c) c_revision c_sent itor))
-- Analyses were prepended one by one, so reverse the list to return
-- them in the order in which they were reported.
fmap reverse (readIORef ref)
where
-- Callback invoked for each analysis: reads the function name and
-- the field name, converts the probability and prepends the triple
-- to the accumulator. The first and the last argument are not used.
getMorphology ref _ c_name c_field c_prob exn = do
name <- peekText c_name
field <- peekText c_field
let prob = realToFrac c_prob
ann = (name,field,prob)
modifyIORef ref ((:) ann)
-- | 'lookupCohorts' takes an arbitrary string and produces
-- a list of all places where lexical items from the grammar have been
@@ -580,7 +595,7 @@ fullFormLexicon c = unsafePerformIO $ do
(#poke PgfMorphoCallback, fn) itor2 fptr2
seq_ids <- withPgfExn "fullFormLexicon" (pgf_iter_sequences (c_db c) c_revision itor1 itor2)
pgf_release_phrasetable_ids seq_ids)
fmap reverse (readIORef ref)
fmap (reverse2 []) (readIORef ref)
where
getSequences ref _ seq_id val exn = do
bracket (pgf_sequence_get_text_internal val) free $ \c_text ->
@@ -599,6 +614,9 @@ fullFormLexicon c = unsafePerformIO $ do
ann = (name,field,prob)
modifyIORef ref (\((form,anns) : lexicon) -> (form,ann:anns) : lexicon)
reverse2 ys [] = ys
reverse2 ys ((x1,x2):xs) = reverse2 ((x1,reverse x2):ys) xs
-- | This data type encodes the different outcomes which you could get from the parser.
data ParseOutput a

View File

@@ -119,6 +119,8 @@ type MorphoCallback = Ptr PgfMorphoCallback -> Ptr PgfText -> Ptr PgfText -> (#t
foreign import ccall "wrapper" wrapMorphoCallback :: Wrapper MorphoCallback
foreign import ccall pgf_lookup_morpho :: Ptr PgfDB -> Ptr Concr -> Ptr PgfText -> Ptr PgfMorphoCallback -> Ptr PgfExn -> IO ()
foreign import ccall pgf_iter_sequences :: Ptr PgfDB -> Ptr Concr -> Ptr PgfSequenceItor -> Ptr PgfMorphoCallback -> Ptr PgfExn -> IO (Ptr PgfPhrasetableIds)
foreign import ccall pgf_get_lincat_counts_internal :: Ptr () -> Ptr CSize -> IO ()