mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-09 04:59:31 -06:00
first draft for lookupMorpho
This commit is contained in:
@@ -32,7 +32,6 @@ import Data.List (sort)
|
||||
import Control.Monad(mplus)
|
||||
import qualified Control.Monad.Fail as Fail
|
||||
|
||||
|
||||
class (Functor m,Monad m,MonadSIO m) => HasPGF m where getPGF :: m (Maybe PGF)
|
||||
|
||||
instance (Monad m,HasPGF m,Fail.MonadFail m) => TypeCheckArg m where
|
||||
@@ -292,8 +291,8 @@ pgfCommands = Map.fromList [
|
||||
morphoKnown concr .
|
||||
concatMap words $ toStrings ts
|
||||
_ -> return . fromString . unlines .
|
||||
map prMorphoAnalysis . concatMap (morphos pgf opts) .
|
||||
concatMap words $ toStrings ts,
|
||||
map prMorphoAnalysis . concatMap (morphos pgf opts) $
|
||||
toStrings ts,
|
||||
flags = [
|
||||
("lang","the languages of analysis (comma-separated, no spaces)")
|
||||
],
|
||||
|
||||
@@ -792,6 +792,43 @@ void pgf_iter_lins(PgfDB *db, PgfConcrRevision cnc_revision,
|
||||
} PGF_API_END
|
||||
}
|
||||
|
||||
static bool
|
||||
pgf_is_case_sensitive(ref<PgfConcr> concr)
|
||||
{
|
||||
PgfText *case_sensitive = (PgfText *)
|
||||
alloca(sizeof(PgfText)+15);
|
||||
case_sensitive->size = 14;
|
||||
strcpy(case_sensitive->text, "case_sensitive");
|
||||
|
||||
ref<PgfFlag> flag =
|
||||
namespace_lookup(concr->cflags, case_sensitive);
|
||||
if (flag != 0) {
|
||||
switch (ref<PgfLiteral>::get_tag(flag->value)) {
|
||||
case PgfLiteralStr::tag: {
|
||||
auto lstr = ref<PgfLiteralStr>::untagged(flag->value);
|
||||
if (lstr->val.size == 3 && strcmp(lstr->val.text, "off") == 0)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Public entry point for morphological lookup.
 *
 * Resolves `cnc_revision` to its concrete syntax, determines whether that
 * syntax is case sensitive, and searches the concrete syntax's phrase
 * table for `sentence`.  Results are delivered through `callback`;
 * `err` is handed on to the phrase-table search and to the callback.
 *
 * The body runs between PGF_API_BEGIN / PGF_API_END and under a
 * DB_scope created with READER_SCOPE (both defined elsewhere;
 * presumably error translation and read access to the database).
 */
PGF_API
void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision,
                       PgfText *sentence,
                       PgfMorphoCallback* callback, PgfExn* err)
{
    PGF_API_BEGIN {
        DB_scope scope(db, READER_SCOPE);

        ref<PgfConcr> concr = db->revision2concr(cnc_revision);

        // The case-sensitivity flag is read once, before the search starts.
        const bool respect_case = pgf_is_case_sensitive(concr);

        phrasetable_lookup(concr->phrasetable,
                           sentence,
                           respect_case,
                           concr->lincats,
                           callback, err);
    } PGF_API_END
}
|
||||
|
||||
PGF_API
|
||||
PgfPhrasetableIds *pgf_iter_sequences(PgfDB *db, PgfConcrRevision cnc_revision,
|
||||
PgfSequenceItor *itor,
|
||||
|
||||
@@ -410,6 +410,11 @@ struct PgfMorphoCallback {
|
||||
PgfExn* err);
|
||||
};
|
||||
|
||||
PGF_API_DECL
|
||||
void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision,
|
||||
PgfText *sentence,
|
||||
PgfMorphoCallback* callback, PgfExn* err);
|
||||
|
||||
PGF_API_DECL
|
||||
PgfPhrasetableIds *pgf_iter_sequences(PgfDB *db, PgfConcrRevision cnc_revision,
|
||||
PgfSequenceItor *itor,
|
||||
|
||||
@@ -227,6 +227,77 @@ int sequence_cmp(ref<PgfSequence> seq1, ref<PgfSequence> seq2)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static
|
||||
int text_cmp(PgfText *sentence, ref<PgfSequence> seq,
|
||||
bool case_sensitive)
|
||||
{
|
||||
int res1 = 0;
|
||||
|
||||
const uint8_t *s1 = (uint8_t *) &sentence->text;
|
||||
const uint8_t *e1 = s1+sentence->size;
|
||||
|
||||
size_t i = 0;
|
||||
const uint8_t *s2 = NULL;
|
||||
const uint8_t *e2 = NULL;
|
||||
|
||||
size_t count = 0;
|
||||
|
||||
for (;;) {
|
||||
if (s1 >= e1) {
|
||||
if (s2 < e2 || i < seq->syms.len)
|
||||
return -1;
|
||||
return case_sensitive ? res1 : 0;
|
||||
}
|
||||
|
||||
uint32_t ucs1 = pgf_utf8_decode(&s1);
|
||||
uint32_t ucs1i = pgf_utf8_to_upper(ucs1);
|
||||
|
||||
if (s2 >= e2) {
|
||||
if (i >= seq->syms.len)
|
||||
return 1;
|
||||
|
||||
if (s2 != NULL) {
|
||||
if (pgf_utf8_is_space(ucs1)) {
|
||||
count++;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (count == 0) {
|
||||
return (((int) ucs1) - ' ');
|
||||
} else {
|
||||
count = 0;
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t t = ref<PgfSymbol>::get_tag(seq->syms.data[i]);
|
||||
if (t != PgfSymbolKS::tag) {
|
||||
return ((int) PgfSymbolKS::tag) - ((int) t);
|
||||
}
|
||||
|
||||
auto sym_ks = ref<PgfSymbolKS>::untagged(seq->syms.data[i]);
|
||||
s2 = (uint8_t *) &sym_ks->token.text;
|
||||
e2 = s2+sym_ks->token.size;
|
||||
|
||||
i++;
|
||||
}
|
||||
|
||||
uint32_t ucs2 = pgf_utf8_decode(&s2);
|
||||
uint32_t ucs2i = pgf_utf8_to_upper(ucs2);
|
||||
|
||||
if (ucs1i > ucs2i) {
|
||||
return 1;
|
||||
}
|
||||
else if (ucs1i < ucs2i) {
|
||||
return -1;
|
||||
}
|
||||
else if (ucs1 > ucs2) {
|
||||
res1 = 1;
|
||||
} else if (ucs1 < ucs2) {
|
||||
res1 = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
PGF_INTERNAL
|
||||
PgfPhrasetable phrasetable_internalize(PgfPhrasetable table,
|
||||
ref<PgfSequence> seq,
|
||||
@@ -394,6 +465,60 @@ size_t phrasetable_size(PgfPhrasetable table)
|
||||
return Node<PgfPhrasetableEntry>::size(table);
|
||||
}
|
||||
|
||||
/**
 * Searches the phrase table for entries whose sequence matches `sentence`
 * and reports the linearizations that refer to them.
 *
 * The table is descended like a search tree, using text_cmp to pick the
 * left or right child.  On a match, every back-reference of the entry
 * that points to a PgfConcrLin is resolved: the lincat of the function's
 * result category is looked up in `lincats`, and if it exists the
 * callback receives the function name, the name of the lincat field
 * selected by `seq_index % fields->len`, and the sum of the category and
 * function probabilities.  Back-references to lincats are ignored.
 *
 * When matching is case-insensitive, entries that differ from the
 * sentence only in case also compare as equal, so after a match both
 * subtrees are searched as well.
 *
 * The search stops as soon as `err->type` is no longer PGF_EXN_NONE
 * after a callback or a recursive search.
 */
PGF_INTERNAL
void phrasetable_lookup(PgfPhrasetable table,
                        PgfText *sentence,
                        bool case_sensitive,
                        Namespace<PgfConcrLincat> lincats,
                        PgfMorphoCallback* callback, PgfExn* err)
{
    if (table == 0)
        return;

    const int order = text_cmp(sentence,table->value.seq,case_sensitive);

    if (order < 0) {
        phrasetable_lookup(table->left,sentence,case_sensitive,lincats,callback,err);
        return;
    }

    if (order > 0) {
        phrasetable_lookup(table->right,sentence,case_sensitive,lincats,callback,err);
        return;
    }

    // This entry matches: report every linearization that uses it.
    auto backrefs = table->value.backrefs;
    if (backrefs != 0) {
        for (size_t k = 0; k < backrefs->len; k++) {
            PgfSequenceBackref backref =
                *vector_elem<PgfSequenceBackref>(backrefs,k);

            // Only back-references to linearizations are of interest;
            // those pointing to a lincat are skipped.
            if (ref<PgfConcrLin>::get_tag(backref.container) != PgfConcrLin::tag)
                continue;

            ref<PgfConcrLin> lin =
                ref<PgfConcrLin>::untagged(backref.container);
            ref<PgfConcrLincat> lincat =
                namespace_lookup(lincats, &lin->absfun->type->name);
            if (lincat != 0) {
                ref<PgfText> field =
                    *vector_elem(lincat->fields,
                                 backref.seq_index % lincat->fields->len);

                callback->fn(callback,
                             &lin->absfun->name,
                             &(*field),
                             lincat->abscat->prob+lin->absfun->prob,
                             err);
                if (err->type != PGF_EXN_NONE)
                    return;
            }
        }
    }

    if (case_sensitive)
        return;

    // Case-insensitive search: further matches may sit on either side.
    phrasetable_lookup(table->left,sentence,false,lincats,callback,err);
    if (err->type != PGF_EXN_NONE)
        return;

    phrasetable_lookup(table->right,sentence,false,lincats,callback,err);
}
|
||||
|
||||
PGF_INTERNAL
|
||||
void phrasetable_iter(PgfConcr *concr,
|
||||
PgfPhrasetable table,
|
||||
|
||||
@@ -68,6 +68,13 @@ PgfPhrasetable phrasetable_delete(PgfPhrasetable table,
|
||||
PGF_INTERNAL_DECL
|
||||
size_t phrasetable_size(PgfPhrasetable table);
|
||||
|
||||
PGF_INTERNAL_DECL
|
||||
void phrasetable_lookup(PgfPhrasetable table,
|
||||
PgfText *sentence,
|
||||
bool case_sensitive,
|
||||
Namespace<struct PgfConcrLincat> lincats,
|
||||
PgfMorphoCallback* callback, PgfExn* err);
|
||||
|
||||
PGF_INTERNAL_DECL
|
||||
void phrasetable_iter(PgfConcr *concr,
|
||||
PgfPhrasetable table,
|
||||
|
||||
@@ -163,6 +163,7 @@ pgf_utf8_encode(uint32_t ucs, uint8_t **buf)
|
||||
}
|
||||
}
|
||||
|
||||
PGF_INTERNAL
|
||||
uint32_t pgf_utf8_to_upper(uint32_t c)
|
||||
{
|
||||
if (c >= 97 && c <= 122) return (c-32);
|
||||
@@ -300,3 +301,26 @@ uint32_t pgf_utf8_to_upper(uint32_t c)
|
||||
if (c >= 71872 && c <= 71903) return (c-32);
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
 * Reports whether a decoded Unicode code point counts as whitespace.
 *
 * Accepted code points:
 *   - U+0009..U+000D  (TAB, LF, VT, FF, CR)
 *   - U+0020          SPACE
 *   - U+00A0          NO-BREAK SPACE
 *   - U+1680          OGHAM SPACE MARK
 *   - U+2000..U+200A  (EN QUAD .. HAIR SPACE)
 *   - U+202F          NARROW NO-BREAK SPACE
 *   - U+205F          MEDIUM MATHEMATICAL SPACE
 *   - U+3000          IDEOGRAPHIC SPACE
 *
 * @param c  a code point (not a UTF-8 byte)
 * @return true for the code points listed above, false for anything else
 */
PGF_INTERNAL
bool pgf_utf8_is_space(uint32_t c)
{
    // Isolated code points.
    switch (c) {
    case 32:
    case 160:
    case 5760:
    case 8239:
    case 8287:
    case 12288:
        return true;
    default:
        break;
    }

    // Contiguous ranges.
    const bool ascii_control = (c >= 9 && c <= 13);
    const bool quad_to_hair  = (c >= 8192 && c <= 8202);
    return ascii_control || quad_to_hair;
}
|
||||
|
||||
@@ -35,4 +35,7 @@ pgf_utf8_encode(uint32_t ucs, uint8_t** buf);
|
||||
PGF_INTERNAL_DECL
|
||||
uint32_t pgf_utf8_to_upper(uint32_t c);
|
||||
|
||||
PGF_INTERNAL_DECL
|
||||
bool pgf_utf8_is_space(uint32_t c);
|
||||
|
||||
#endif
|
||||
|
||||
@@ -484,7 +484,22 @@ type MorphoAnalysis = (Fun,String,Float)
|
||||
-- a multiword expression. It then computes the list of all possible
|
||||
-- morphological analyses.
|
||||
lookupMorpho :: Concr -> String -> [MorphoAnalysis]
lookupMorpho c sent = unsafePerformIO $ do
  -- Analyses are accumulated by the callback, newest first.
  ref <- newIORef []
  (withText sent $ \c_sent ->
   allocaBytes (#size PgfMorphoCallback) $ \itor ->
   -- The Haskell closure is wrapped into a C function pointer which is
   -- released again by 'bracket', also when the call fails.
   bracket (wrapMorphoCallback (getMorphology ref)) freeHaskellFunPtr $ \fptr ->
   withForeignPtr (c_revision c) $ \c_revision -> do
     (#poke PgfMorphoCallback, fn) itor fptr
     withPgfExn "lookupMorpho" (pgf_lookup_morpho (c_db c) c_revision c_sent itor))
  -- Undo the prepending so that analyses come out in callback order.
  fmap reverse (readIORef ref)
  where
    -- Invoked once per analysis: copies the function name, the field
    -- name and the probability out of the C values and records them.
    -- The callback pointer and the exception pointer are not needed here.
    getMorphology ref _ c_name c_field c_prob _ = do
      name <- peekText c_name
      field <- peekText c_field
      let prob = realToFrac c_prob
          ann = (name,field,prob)
      modifyIORef ref ((:) ann)
|
||||
|
||||
-- | 'lookupCohorts' takes an arbitrary string and produces
|
||||
-- a list of all places where lexical items from the grammar have been
|
||||
@@ -580,7 +595,7 @@ fullFormLexicon c = unsafePerformIO $ do
|
||||
(#poke PgfMorphoCallback, fn) itor2 fptr2
|
||||
seq_ids <- withPgfExn "fullFormLexicon" (pgf_iter_sequences (c_db c) c_revision itor1 itor2)
|
||||
pgf_release_phrasetable_ids seq_ids)
|
||||
fmap reverse (readIORef ref)
|
||||
fmap (reverse2 []) (readIORef ref)
|
||||
where
|
||||
getSequences ref _ seq_id val exn = do
|
||||
bracket (pgf_sequence_get_text_internal val) free $ \c_text ->
|
||||
@@ -599,6 +614,9 @@ fullFormLexicon c = unsafePerformIO $ do
|
||||
ann = (name,field,prob)
|
||||
modifyIORef ref (\((form,anns) : lexicon) -> (form,ann:anns) : lexicon)
|
||||
|
||||
reverse2 ys [] = ys
|
||||
reverse2 ys ((x1,x2):xs) = reverse2 ((x1,reverse x2):ys) xs
|
||||
|
||||
|
||||
-- | This data type encodes the different outcomes which you could get from the parser.
|
||||
data ParseOutput a
|
||||
|
||||
@@ -119,6 +119,8 @@ type MorphoCallback = Ptr PgfMorphoCallback -> Ptr PgfText -> Ptr PgfText -> (#t
|
||||
|
||||
foreign import ccall "wrapper" wrapMorphoCallback :: Wrapper MorphoCallback
|
||||
|
||||
foreign import ccall pgf_lookup_morpho :: Ptr PgfDB -> Ptr Concr -> Ptr PgfText -> Ptr PgfMorphoCallback -> Ptr PgfExn -> IO ()
|
||||
|
||||
foreign import ccall pgf_iter_sequences :: Ptr PgfDB -> Ptr Concr -> Ptr PgfSequenceItor -> Ptr PgfMorphoCallback -> Ptr PgfExn -> IO (Ptr PgfPhrasetableIds)
|
||||
|
||||
foreign import ccall pgf_get_lincat_counts_internal :: Ptr () -> Ptr CSize -> IO ()
|
||||
|
||||
Reference in New Issue
Block a user