From 18f70b786ff436bed2e554d6076775b56235c96f Mon Sep 17 00:00:00 2001 From: Krasimir Angelov Date: Mon, 30 May 2022 21:16:34 +0200 Subject: [PATCH] first draft for lookupMorpho --- src/compiler/GF/Command/Commands.hs | 5 +- src/runtime/c/pgf/pgf.cxx | 37 ++++++++ src/runtime/c/pgf/pgf.h | 5 ++ src/runtime/c/pgf/phrasetable.cxx | 125 ++++++++++++++++++++++++++++ src/runtime/c/pgf/phrasetable.h | 7 ++ src/runtime/c/pgf/text.cxx | 24 ++++++ src/runtime/c/pgf/text.h | 3 + src/runtime/haskell/PGF2.hsc | 22 ++++- src/runtime/haskell/PGF2/FFI.hsc | 2 + 9 files changed, 225 insertions(+), 5 deletions(-) diff --git a/src/compiler/GF/Command/Commands.hs b/src/compiler/GF/Command/Commands.hs index 586147e53..a35790201 100644 --- a/src/compiler/GF/Command/Commands.hs +++ b/src/compiler/GF/Command/Commands.hs @@ -32,7 +32,6 @@ import Data.List (sort) import Control.Monad(mplus) import qualified Control.Monad.Fail as Fail - class (Functor m,Monad m,MonadSIO m) => HasPGF m where getPGF :: m (Maybe PGF) instance (Monad m,HasPGF m,Fail.MonadFail m) => TypeCheckArg m where @@ -292,8 +291,8 @@ pgfCommands = Map.fromList [ morphoKnown concr . concatMap words $ toStrings ts _ -> return . fromString . unlines . - map prMorphoAnalysis . concatMap (morphos pgf opts) . - concatMap words $ toStrings ts, + map prMorphoAnalysis . concatMap (morphos pgf opts) $ + toStrings ts, flags = [ ("lang","the languages of analysis (comma-separated, no spaces)") ], diff --git a/src/runtime/c/pgf/pgf.cxx b/src/runtime/c/pgf/pgf.cxx index 340f7d9ba..39e82fdef 100644 --- a/src/runtime/c/pgf/pgf.cxx +++ b/src/runtime/c/pgf/pgf.cxx @@ -792,6 +792,43 @@ void pgf_iter_lins(PgfDB *db, PgfConcrRevision cnc_revision, } PGF_API_END } +static bool +pgf_is_case_sensitive(ref concr) +{ + PgfText *case_sensitive = (PgfText *) + alloca(sizeof(PgfText)+15); + case_sensitive->size = 14; + strcpy(case_sensitive->text, "case_sensitive"); + + ref flag = + namespace_lookup(concr->cflags, case_sensitive); + if (flag != 0) { + switch (ref::get_tag(flag->value)) { + case PgfLiteralStr::tag: { + auto lstr = ref::untagged(flag->value); + if (lstr->val.size == 3 && strcmp(lstr->val.text, "off") == 0) + return false; + } + } + } + return true; +} + +PGF_API +void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision, + PgfText *sentence, + PgfMorphoCallback* callback, PgfExn* err) +{ + PGF_API_BEGIN { + DB_scope scope(db, READER_SCOPE); + ref concr = db->revision2concr(cnc_revision); + + bool case_sensitive = pgf_is_case_sensitive(concr); + + phrasetable_lookup(concr->phrasetable, sentence, case_sensitive, concr->lincats, callback, err); + } PGF_API_END +} + PGF_API PgfPhrasetableIds *pgf_iter_sequences(PgfDB *db, PgfConcrRevision cnc_revision, PgfSequenceItor *itor, diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h index aae3d0e8b..9dac796bb 100644 --- a/src/runtime/c/pgf/pgf.h +++ b/src/runtime/c/pgf/pgf.h @@ -410,6 +410,11 @@ struct PgfMorphoCallback { PgfExn* err); }; +PGF_API_DECL +void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision, + PgfText *sentence, + PgfMorphoCallback* callback, PgfExn* err); + PGF_API_DECL PgfPhrasetableIds *pgf_iter_sequences(PgfDB *db, PgfConcrRevision cnc_revision, PgfSequenceItor *itor, diff --git a/src/runtime/c/pgf/phrasetable.cxx b/src/runtime/c/pgf/phrasetable.cxx index ba9e88b8c..378bd205c 100644 --- a/src/runtime/c/pgf/phrasetable.cxx +++ b/src/runtime/c/pgf/phrasetable.cxx @@ -227,6 +227,77 @@ int sequence_cmp(ref seq1, ref seq2) return 0; } +static +int text_cmp(PgfText *sentence, ref seq, + bool case_sensitive) +{ + int res1 = 0; + + const uint8_t *s1 = (uint8_t *) &sentence->text; + const uint8_t *e1 = s1+sentence->size; + + size_t i = 0; + const uint8_t *s2 = NULL; + const uint8_t *e2 = NULL; + + size_t count = 0; + + for (;;) { + if (s1 >= e1) { + if (s2 < e2 || i < seq->syms.len) + return -1; + return case_sensitive ? res1 : 0; + } + + uint32_t ucs1 = pgf_utf8_decode(&s1); + uint32_t ucs1i = pgf_utf8_to_upper(ucs1); + + if (s2 >= e2) { + if (i >= seq->syms.len) + return 1; + + if (s2 != NULL) { + if (pgf_utf8_is_space(ucs1)) { + count++; + continue; + } + + if (count == 0) { + return (((int) ucs1) - ' '); + } else { + count = 0; + } + } + + uint8_t t = ref::get_tag(seq->syms.data[i]); + if (t != PgfSymbolKS::tag) { + return ((int) PgfSymbolKS::tag) - ((int) t); + } + + auto sym_ks = ref::untagged(seq->syms.data[i]); + s2 = (uint8_t *) &sym_ks->token.text; + e2 = s2+sym_ks->token.size; + + i++; + } + + uint32_t ucs2 = pgf_utf8_decode(&s2); + uint32_t ucs2i = pgf_utf8_to_upper(ucs2); + + if (ucs1i > ucs2i) { + return 1; + } + else if (ucs1i < ucs2i) { + return -1; + } + else if (ucs1 > ucs2) { + res1 = 1; + } else if (ucs1 < ucs2) { + res1 = -1; + } + } +} + PGF_INTERNAL PgfPhrasetable phrasetable_internalize(PgfPhrasetable table, ref seq, @@ -394,6 +465,60 @@ size_t phrasetable_size(PgfPhrasetable table) return Node::size(table); } +PGF_INTERNAL +void phrasetable_lookup(PgfPhrasetable table, + PgfText *sentence, + bool case_sensitive, + Namespace lincats, + PgfMorphoCallback* callback, PgfExn* err) +{ + if (table == 0) + return; + + int cmp = text_cmp(sentence,table->value.seq,case_sensitive); + if (cmp < 0) { + phrasetable_lookup(table->left,sentence,case_sensitive,lincats,callback,err); + } else if (cmp > 0) { + phrasetable_lookup(table->right,sentence,case_sensitive,lincats,callback,err); + } else { + auto backrefs = table->value.backrefs; + if (backrefs != 0) { + for (size_t i = 0; i < backrefs->len; i++) { + PgfSequenceBackref backref = *vector_elem(backrefs,i); + switch (ref::get_tag(backref.container)) { + case PgfConcrLin::tag: { + ref lin = ref::untagged(backref.container); + ref lincat = + namespace_lookup(lincats, &lin->absfun->type->name); + if (lincat != 0) { + ref field = + *vector_elem(lincat->fields, backref.seq_index % lincat->fields->len); + + callback->fn(callback, &lin->absfun->name, &(*field), lincat->abscat->prob+lin->absfun->prob, err); + if (err->type != PGF_EXN_NONE) + return; + } + break; + } + case PgfConcrLincat::tag: { + //ignore + break; + } + } + } + } + + if (!case_sensitive) { + phrasetable_lookup(table->left,sentence,false,lincats,callback,err); + if (err->type != PGF_EXN_NONE) + return; + phrasetable_lookup(table->right,sentence,false,lincats,callback,err); + if (err->type != PGF_EXN_NONE) + return; + } + } +} + PGF_INTERNAL void phrasetable_iter(PgfConcr *concr, PgfPhrasetable table, diff --git a/src/runtime/c/pgf/phrasetable.h b/src/runtime/c/pgf/phrasetable.h index a60f4e02b..3db53ca01 100644 --- a/src/runtime/c/pgf/phrasetable.h +++ b/src/runtime/c/pgf/phrasetable.h @@ -68,6 +68,13 @@ PgfPhrasetable phrasetable_delete(PgfPhrasetable table, PGF_INTERNAL_DECL size_t phrasetable_size(PgfPhrasetable table); +PGF_INTERNAL_DECL +void phrasetable_lookup(PgfPhrasetable table, + PgfText *sentence, + bool case_sensitive, + Namespace lincats, + PgfMorphoCallback* callback, PgfExn* err); + PGF_INTERNAL_DECL void phrasetable_iter(PgfConcr *concr, PgfPhrasetable table, diff --git a/src/runtime/c/pgf/text.cxx b/src/runtime/c/pgf/text.cxx index 76b219e2f..c7b1770b3 100644 --- a/src/runtime/c/pgf/text.cxx +++ b/src/runtime/c/pgf/text.cxx @@ -163,6 +163,7 @@ pgf_utf8_encode(uint32_t ucs, uint8_t **buf) } } +PGF_INTERNAL uint32_t pgf_utf8_to_upper(uint32_t c) { if (c >= 97 && c <= 122) return (c-32); @@ -300,3 +301,26 @@ uint32_t pgf_utf8_to_upper(uint32_t c) if (c >= 71872 && c <= 71903) return (c-32); return c; } + +PGF_INTERNAL +bool pgf_utf8_is_space(uint32_t c) +{ + if (c >= 9 && c <= 13) + return true; + if (c == 32) + return true; + if (c == 160) + return true; + if (c == 5760) + return true; + if (c >= 8192 && c <= 8202) + return true; + if (c == 8239) + return true; + if (c == 8287) + return true; + if (c == 12288) + return true; + + return false; +} diff --git a/src/runtime/c/pgf/text.h b/src/runtime/c/pgf/text.h index 8028432b9..aa6dd0edc 100644 --- a/src/runtime/c/pgf/text.h +++ b/src/runtime/c/pgf/text.h @@ -35,4 +35,7 @@ pgf_utf8_encode(uint32_t ucs, uint8_t** buf); PGF_INTERNAL_DECL uint32_t pgf_utf8_to_upper(uint32_t c); +PGF_INTERNAL_DECL +bool pgf_utf8_is_space(uint32_t c); + #endif diff --git a/src/runtime/haskell/PGF2.hsc b/src/runtime/haskell/PGF2.hsc index ad6300fc4..f21e33576 100644 --- a/src/runtime/haskell/PGF2.hsc +++ b/src/runtime/haskell/PGF2.hsc @@ -484,7 +484,22 @@ type MorphoAnalysis = (Fun,String,Float) -- a multiword expression. It then computes the list of all possible -- morphological analyses. lookupMorpho :: Concr -> String -> [MorphoAnalysis] -lookupMorpho = error "TODO: lookupMorpho" +lookupMorpho c sent = unsafePerformIO $ do + ref <- newIORef [] + (withText sent $ \c_sent -> + allocaBytes (#size PgfMorphoCallback) $ \itor -> + bracket (wrapMorphoCallback (getMorphology ref)) freeHaskellFunPtr $ \fptr -> + withForeignPtr (c_revision c) $ \c_revision -> do + (#poke PgfMorphoCallback, fn) itor fptr + withPgfExn "lookupMorpho" (pgf_lookup_morpho (c_db c) c_revision c_sent itor)) + fmap reverse (readIORef ref) + where + getMorphology ref _ c_name c_field c_prob exn = do + name <- peekText c_name + field <- peekText c_field + let prob = realToFrac c_prob + ann = (name,field,prob) + modifyIORef ref ((:) ann) -- | 'lookupCohorts' takes an arbitrary string an produces -- a list of all places where lexical items from the grammar have been @@ -580,7 +595,7 @@ fullFormLexicon c = unsafePerformIO $ do (#poke PgfMorphoCallback, fn) itor2 fptr2 seq_ids <- withPgfExn "fullFormLexicon" (pgf_iter_sequences (c_db c) c_revision itor1 itor2) pgf_release_phrasetable_ids seq_ids) - fmap reverse (readIORef ref) + fmap (reverse2 []) (readIORef ref) where getSequences ref _ seq_id val exn = do bracket (pgf_sequence_get_text_internal val) free $ \c_text -> @@ -599,6 +614,9 @@ fullFormLexicon c = unsafePerformIO $ do ann = (name,field,prob) modifyIORef ref (\((form,anns) : lexicon) -> (form,ann:anns) : lexicon) + reverse2 ys [] = ys + reverse2 ys ((x1,x2):xs) = reverse2 ((x1,reverse x2):ys) xs + -- | This data type encodes the different outcomes which you could get from the parser. data ParseOutput a diff --git a/src/runtime/haskell/PGF2/FFI.hsc b/src/runtime/haskell/PGF2/FFI.hsc index e0e6db673..86cbd9e1a 100644 --- a/src/runtime/haskell/PGF2/FFI.hsc +++ b/src/runtime/haskell/PGF2/FFI.hsc @@ -119,6 +119,8 @@ type MorphoCallback = Ptr PgfMorphoCallback -> Ptr PgfText -> Ptr PgfText -> (#t foreign import ccall "wrapper" wrapMorphoCallback :: Wrapper MorphoCallback +foreign import ccall pgf_lookup_morpho :: Ptr PgfDB -> Ptr Concr -> Ptr PgfText -> Ptr PgfMorphoCallback -> Ptr PgfExn -> IO () + foreign import ccall pgf_iter_sequences :: Ptr PgfDB -> Ptr Concr -> Ptr PgfSequenceItor -> Ptr PgfMorphoCallback -> Ptr PgfExn -> IO (Ptr PgfPhrasetableIds) foreign import ccall pgf_get_lincat_counts_internal :: Ptr () -> Ptr CSize -> IO ()