diff --git a/src/compiler/GF/Command/Commands.hs b/src/compiler/GF/Command/Commands.hs index 433eeddd7..87422f20c 100644 --- a/src/compiler/GF/Command/Commands.hs +++ b/src/compiler/GF/Command/Commands.hs @@ -283,14 +283,18 @@ pgfCommands = Map.fromList [ exec = needPGF $ \opts ts pgf -> do concr <- optLang pgf opts case opts of - _ | isOpt "missing" opts -> - return . fromString . unwords . - morphoMissing concr . - concatMap words $ toStrings ts + _ | isOpt "all" opts -> + return . fromString . unlines . + map prMorphoAnalysis . concatMap (morphoAll concr) $ + toStrings ts _ | isOpt "known" opts -> return . fromString . unwords . - morphoKnown concr . - concatMap words $ toStrings ts + concatMap (morphoKnown concr) $ + toStrings ts + _ | isOpt "missing" opts -> + return . fromString . unwords . + concatMap (morphoMissing concr) $ + toStrings ts _ -> return . fromString . unlines . map prMorphoAnalysis . concatMap (morphos pgf opts) $ toStrings ts, @@ -298,8 +302,9 @@ pgfCommands = Map.fromList [ ("lang","the languages of analysis (comma-separated, no spaces)") ], options = [ - ("known", "return only the known words, in order of appearance"), - ("missing","show the list of unknown words, in order of appearance") + ("all", "scan the text for all words, not just a single one"), + ("known", "scan the text only for known words, in order of appearance"), + ("missing","scan the text for all unknown words, in order of appearance") ] }), @@ -839,6 +844,18 @@ pgfCommands = Map.fromList [ morphos pgf opts s = [(s,lookupMorpho concr s) | concr <- optLangs pgf opts] + morphoAll concr s = + [(w,ans) | (_,w,ans,_) <- lookupCohorts concr s] + + morphoKnown = morphoClassify True + + morphoMissing = morphoClassify False + + morphoClassify k concr s = + [w | (_,w,ans,_) <- lookupCohorts concr s, k /= null ans, notLiteral w] + where + notLiteral w = not (all isDigit w) + optClitics opts = case valStrOpts "clitics" "" opts of "" -> [] cs -> map reverse $ chunks ',' cs @@ -853,16 +870,6 @@ pgfCommands = Map.fromList [ app (OFlag op (LStr x)) | Just (Right f) <- treeOp pgf op = f x app _ = id -morphoMissing :: Concr -> [String] -> [String] -morphoMissing = morphoClassify False - -morphoKnown :: Concr -> [String] -> [String] -morphoKnown = morphoClassify True - -morphoClassify :: Bool -> Concr -> [String] -> [String] -morphoClassify k concr ws = [w | w <- ws, k /= null (lookupMorpho concr w), notLiteral w] where - notLiteral w = not (all isDigit w) - treeOpOptions pgf = [(op,expl) | (op,(expl,Left _)) <- allTreeOps pgf] treeOpFlags pgf = [(op,expl) | (op,(expl,Right _)) <- allTreeOps pgf] diff --git a/src/runtime/c/pgf/pgf.cxx b/src/runtime/c/pgf/pgf.cxx index 39e82fdef..525b045e3 100644 --- a/src/runtime/c/pgf/pgf.cxx +++ b/src/runtime/c/pgf/pgf.cxx @@ -825,7 +825,37 @@ void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision, bool case_sensitive = pgf_is_case_sensitive(concr); - phrasetable_lookup(concr->phrasetable, sentence, case_sensitive, concr->lincats, callback, err); + PgfTextRange range; + range.pos = 0; + range.begin = (uint8_t *) &sentence->text[0]; + range.end = (uint8_t *) &sentence->text[sentence->size]; + phrasetable_lookup(concr->phrasetable, + &range, case_sensitive, + concr->lincats, + callback, err); + } PGF_API_END +} + +PGF_API +void pgf_lookup_cohorts(PgfDB *db, PgfConcrRevision cnc_revision, + PgfText *sentence, + PgfCohortsCallback* callback, PgfExn* err) +{ + PGF_API_BEGIN { + DB_scope scope(db, READER_SCOPE); + ref concr = db->revision2concr(cnc_revision); + + bool case_sensitive = pgf_is_case_sensitive(concr); + + PgfTextRange range; + range.pos = 0; + range.begin = (uint8_t *) &sentence->text[0]; + range.end = (uint8_t *) &sentence->text[sentence->size]; + phrasetable_lookup_prefixes(concr->phrasetable, + &range, case_sensitive, + concr->lincats, + 1, sentence->size, + callback, err); } PGF_API_END } diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h index a60e22f27..81f8f9188 100644 --- a/src/runtime/c/pgf/pgf.h +++ b/src/runtime/c/pgf/pgf.h @@ -418,6 +418,18 @@ void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision, PgfText *sentence, PgfMorphoCallback* callback, PgfExn* err); +typedef struct PgfCohortsCallback PgfCohortsCallback; +struct PgfCohortsCallback { + PgfMorphoCallback morpho; + void (*fn)(PgfCohortsCallback* self, size_t start, size_t end, + PgfExn* err); +}; + +PGF_API_DECL +void pgf_lookup_cohorts(PgfDB *db, PgfConcrRevision cnc_revision, + PgfText *sentence, + PgfCohortsCallback* callback, PgfExn* err); + PGF_API_DECL PgfPhrasetableIds *pgf_iter_sequences(PgfDB *db, PgfConcrRevision cnc_revision, PgfSequenceItor *itor, diff --git a/src/runtime/c/pgf/phrasetable.cxx b/src/runtime/c/pgf/phrasetable.cxx index 63b502d86..369894549 100644 --- a/src/runtime/c/pgf/phrasetable.cxx +++ b/src/runtime/c/pgf/phrasetable.cxx @@ -228,14 +228,11 @@ int sequence_cmp(ref seq1, ref seq2) } static -int text_cmp(PgfText *sentence, ref seq, - bool case_sensitive) +int text_range_cmp(PgfTextRange *range, ref seq, + bool case_sensitive) { int res1 = 0; - const uint8_t *s1 = (uint8_t *) &sentence->text; - const uint8_t *e1 = s1+sentence->size; - size_t i = 0; const uint8_t *s2 = NULL; const uint8_t *e2 = NULL; @@ -243,13 +240,13 @@ int text_cmp(PgfText *sentence, ref seq, size_t count = 0; for (;;) { - if (s1 >= e1) { + if (range->begin >= range->end) { if (s2 < e2 || i < seq->syms.len) return -1; return case_sensitive ? res1 : 0; } - uint32_t ucs1 = pgf_utf8_decode(&s1); + uint32_t ucs1 = pgf_utf8_decode(&range->begin); range->pos++; uint32_t ucs1i = pgf_utf8_to_upper(ucs1); if (s2 >= e2) { @@ -469,7 +466,7 @@ size_t phrasetable_size(PgfPhrasetable table) PGF_INTERNAL void phrasetable_lookup(PgfPhrasetable table, - PgfText *sentence, + PgfTextRange *sentence, bool case_sensitive, Namespace lincats, PgfMorphoCallback* callback, PgfExn* err) @@ -477,7 +474,8 @@ void phrasetable_lookup(PgfPhrasetable table, if (table == 0) return; - int cmp = text_cmp(sentence,table->value.seq,case_sensitive); + PgfTextRange current = *sentence; + int cmp = text_range_cmp(¤t,table->value.seq,case_sensitive); if (cmp < 0) { phrasetable_lookup(table->left,sentence,case_sensitive,lincats,callback,err); } else if (cmp > 0) { @@ -521,6 +519,71 @@ void phrasetable_lookup(PgfPhrasetable table, } } +PGF_INTERNAL +void phrasetable_lookup_prefixes(PgfPhrasetable table, + PgfTextRange *sentence, + bool case_sensitive, + Namespace lincats, + ptrdiff_t min, ptrdiff_t max, + PgfCohortsCallback* callback, PgfExn* err) +{ + if (table == 0) + return; + + PgfTextRange current = *sentence; + int cmp = text_range_cmp(¤t,table->value.seq,case_sensitive); + if (cmp < 0) { + phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,max,callback,err); + } else if (cmp > 0) { + ptrdiff_t len = current.begin - sentence->begin; + + if (min <= len) + phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,len,callback,err); + + if (len <= max) + phrasetable_lookup_prefixes(table->right,sentence,case_sensitive,lincats,len,max,callback,err); + } else { + ptrdiff_t len = current.begin - sentence->begin; + + if (min <= len) + phrasetable_lookup_prefixes(table->left,sentence,case_sensitive,lincats,min,len,callback,err); + + auto backrefs = table->value.backrefs; + if (backrefs != 0) { + for (size_t i = 0; i < backrefs->len; i++) { + PgfSequenceBackref backref = *vector_elem(backrefs,i); + switch (ref::get_tag(backref.container)) { + case PgfConcrLin::tag: { + ref lin = ref::untagged(backref.container); + ref lincat = + namespace_lookup(lincats, &lin->absfun->type->name); + if (lincat != 0) { + ref field = + *vector_elem(lincat->fields, backref.seq_index % lincat->fields->len); + + callback->morpho.fn(&callback->morpho, &lin->absfun->name, &(*field), lincat->abscat->prob+lin->absfun->prob, err); + if (err->type != PGF_EXN_NONE) + return; + } + break; + } + case PgfConcrLincat::tag: { + //ignore + break; + } + } + } + + callback->fn(callback, sentence->pos, current.pos, err); + if (err->type != PGF_EXN_NONE) + return; + } + + if (len <= max) + phrasetable_lookup_prefixes(table->right,sentence,case_sensitive,lincats,len,max,callback,err); + } +} + PGF_INTERNAL void phrasetable_iter(PgfConcr *concr, PgfPhrasetable table, diff --git a/src/runtime/c/pgf/phrasetable.h b/src/runtime/c/pgf/phrasetable.h index 3db53ca01..1ee9b8f07 100644 --- a/src/runtime/c/pgf/phrasetable.h +++ b/src/runtime/c/pgf/phrasetable.h @@ -68,13 +68,27 @@ PgfPhrasetable phrasetable_delete(PgfPhrasetable table, PGF_INTERNAL_DECL size_t phrasetable_size(PgfPhrasetable table); +typedef struct { + size_t pos; // position in Unicode characters + const uint8_t *begin; // pointer into the beginning of the range + const uint8_t *end; // pointer into the end of the range +} PgfTextRange; + PGF_INTERNAL_DECL void phrasetable_lookup(PgfPhrasetable table, - PgfText *sentence, + PgfTextRange *sentence, bool case_sensitive, Namespace lincats, PgfMorphoCallback* callback, PgfExn* err); +PGF_INTERNAL_DECL +void phrasetable_lookup_prefixes(PgfPhrasetable table, + PgfTextRange *sentence, + bool case_sensitive, + Namespace lincats, + ptrdiff_t min, ptrdiff_t max, + PgfCohortsCallback* callback, PgfExn* err); + PGF_INTERNAL_DECL void phrasetable_iter(PgfConcr *concr, PgfPhrasetable table, diff --git a/src/runtime/haskell/PGF2.hsc b/src/runtime/haskell/PGF2.hsc index f21e33576..6af03cefc 100644 --- a/src/runtime/haskell/PGF2.hsc +++ b/src/runtime/haskell/PGF2.hsc @@ -493,13 +493,6 @@ lookupMorpho c sent = unsafePerformIO $ do (#poke PgfMorphoCallback, fn) itor fptr withPgfExn "lookupMorpho" (pgf_lookup_morpho (c_db c) c_revision c_sent itor)) fmap reverse (readIORef ref) - where - getMorphology ref _ c_name c_field c_prob exn = do - name <- peekText c_name - field <- peekText c_field - let prob = realToFrac c_prob - ann = (name,field,prob) - modifyIORef ref ((:) ann) -- | 'lookupCohorts' takes an arbitrary string an produces -- a list of all places where lexical items from the grammar have been @@ -511,7 +504,33 @@ lookupMorpho c sent = unsafePerformIO $ do -- by the @end@ position. This can be used for instance if you want to -- filter only the longest matches. lookupCohorts :: Concr -> String -> [(Int,String,[MorphoAnalysis],Int)] -lookupCohorts = error "TODO: lookupCohorts" +lookupCohorts c sent = unsafePerformIO $ do + morpho_ref <- newIORef [] + cohorts_ref <- newIORef [] + (withText sent $ \c_sent -> + allocaBytes (#size PgfCohortsCallback) $ \itor -> + bracket (wrapMorphoCallback (getMorphology morpho_ref)) freeHaskellFunPtr $ \morpho_fptr -> + bracket (wrapCohortsCallback (getCohorts morpho_ref cohorts_ref)) freeHaskellFunPtr $ \cohorts_fptr -> + withForeignPtr (c_revision c) $ \c_revision -> do + (#poke PgfCohortsCallback, morpho.fn) itor morpho_fptr + (#poke PgfCohortsCallback, fn) itor cohorts_fptr + withPgfExn "lookupCohorts" (pgf_lookup_cohorts (c_db c) c_revision c_sent itor)) + fmap reverse (readIORef cohorts_ref) + where + getCohorts morpho_ref cohorts_ref _ start' end' exn = do + ans <- readIORef morpho_ref + let start = fromIntegral start' + end = fromIntegral end' + word = take (end-start) (drop start sent) + modifyIORef cohorts_ref ((:) (start, word, reverse ans, end)) + writeIORef morpho_ref [] + +getMorphology ref _ c_name c_field c_prob exn = do + name <- peekText c_name + field <- peekText c_field + let prob = realToFrac c_prob + ann = (name,field,prob) + modifyIORef ref ((:) ann) filterBest :: [(Int,String,[MorphoAnalysis],Int)] -> [(Int,String,[MorphoAnalysis],Int)] filterBest ans = diff --git a/src/runtime/haskell/PGF2/FFI.hsc b/src/runtime/haskell/PGF2/FFI.hsc index 86cbd9e1a..2d588b786 100644 --- a/src/runtime/haskell/PGF2/FFI.hsc +++ b/src/runtime/haskell/PGF2/FFI.hsc @@ -47,6 +47,7 @@ data PgfLinearizationOutputIface data PgfGraphvizOptions data PgfSequenceItor data PgfMorphoCallback +data PgfCohortsCallback data PgfPhrasetableIds type Wrapper a = a -> IO (FunPtr a) @@ -121,6 +122,12 @@ foreign import ccall "wrapper" wrapMorphoCallback :: Wrapper MorphoCallback foreign import ccall pgf_lookup_morpho :: Ptr PgfDB -> Ptr Concr -> Ptr PgfText -> Ptr PgfMorphoCallback -> Ptr PgfExn -> IO () +type CohortsCallback = Ptr PgfCohortsCallback -> CSize -> CSize -> Ptr PgfExn -> IO () + +foreign import ccall "wrapper" wrapCohortsCallback :: Wrapper CohortsCallback + +foreign import ccall pgf_lookup_cohorts :: Ptr PgfDB -> Ptr Concr -> Ptr PgfText -> Ptr PgfCohortsCallback -> Ptr PgfExn -> IO () + foreign import ccall pgf_iter_sequences :: Ptr PgfDB -> Ptr Concr -> Ptr PgfSequenceItor -> Ptr PgfMorphoCallback -> Ptr PgfExn -> IO (Ptr PgfPhrasetableIds) foreign import ccall pgf_get_lincat_counts_internal :: Ptr () -> Ptr CSize -> IO ()