From 3e0cc91a02e73deaf5d6773ba6ecb67d01506e21 Mon Sep 17 00:00:00 2001 From: Krasimir Angelov Date: Fri, 16 Sep 2022 12:34:46 +0200 Subject: [PATCH] first rudimentary version of a parser --- cabal.project | 3 - src/compiler/GF/Grammar/Printer.hs | 8 +- src/compiler/Setup.hs | 211 +----------- src/compiler/gf.cabal | 12 +- src/runtime/c/Makefile.am | 2 + src/runtime/c/pgf/data.cxx | 11 +- src/runtime/c/pgf/data.h | 19 +- src/runtime/c/pgf/heap.h | 79 ----- src/runtime/c/pgf/linearizer.cxx | 10 +- src/runtime/c/pgf/linearizer.h | 8 + src/runtime/c/pgf/parser.cxx | 455 ++++++++++++++++++++++++++ src/runtime/c/pgf/parser.h | 51 +++ src/runtime/c/pgf/pgf.cxx | 202 +++++++++--- src/runtime/c/pgf/pgf.h | 25 ++ src/runtime/c/pgf/phrasetable.cxx | 203 +++++++----- src/runtime/c/pgf/phrasetable.h | 16 +- src/runtime/c/pgf/printer.h | 8 +- src/runtime/c/pgf/reader.cxx | 78 ++++- src/runtime/c/pgf/reader.h | 1 + src/runtime/c/pgf/writer.cxx | 7 +- src/runtime/c/pgf/writer.h | 1 + src/runtime/haskell/PGF2.hsc | 27 +- src/runtime/haskell/PGF2/FFI.hsc | 7 + src/runtime/haskell/tests/basic.pmcfg | 42 +-- 24 files changed, 1009 insertions(+), 477 deletions(-) delete mode 100644 cabal.project delete mode 100644 src/runtime/c/pgf/heap.h create mode 100644 src/runtime/c/pgf/parser.cxx create mode 100644 src/runtime/c/pgf/parser.h diff --git a/cabal.project b/cabal.project deleted file mode 100644 index d74f0b25a..000000000 --- a/cabal.project +++ /dev/null @@ -1,3 +0,0 @@ -packages: src/runtime/haskell - src/server - src/compiler diff --git a/src/compiler/GF/Grammar/Printer.hs b/src/compiler/GF/Grammar/Printer.hs index b63cdb029..8abffbf78 100644 --- a/src/compiler/GF/Grammar/Printer.hs +++ b/src/compiler/GF/Grammar/Printer.hs @@ -168,11 +168,9 @@ ppPmcfgRule id arg_cats res_cat (Production vars args res seqids) = (if null vars then empty else "∀{" <> hsep (punctuate ',' [ppLVar v <> '<' <> m | (v,m) <- vars]) <> '}' <+> '.') <+> - (if null args - then empty - else hsep (intersperse (pp '*') (zipWith ppPArg arg_cats args)) <+> "->") <+> - ppPmcfgCat res_cat res $$ - '=' <+> brackets (hcat (intersperse (pp ',') (map ppSeqId seqids)))) + ppPmcfgCat res_cat res <+> "->" <+> + brackets (hcat (intersperse (pp ',') (zipWith ppPArg arg_cats args))) <+> '=' <+> + brackets (hcat (intersperse (pp ',') (map ppSeqId seqids)))) ppPArg cat (PArg _ p) = ppPmcfgCat cat p diff --git a/src/compiler/Setup.hs b/src/compiler/Setup.hs index 938563802..7b7b31375 100644 --- a/src/compiler/Setup.hs +++ b/src/compiler/Setup.hs @@ -1,211 +1,4 @@ -import Distribution.System(Platform(..),OS(..)) -import Distribution.Simple(defaultMainWithHooks,UserHooks(..),simpleUserHooks) -import Distribution.Simple.LocalBuildInfo(LocalBuildInfo(..),absoluteInstallDirs,datadir,buildDir) -import Distribution.Simple.Setup(BuildFlags(..),Flag(..),InstallFlags(..),CopyDest(..),CopyFlags(..),SDistFlags(..),copyDest) -import Distribution.PackageDescription(PackageDescription(..),emptyHookedBuildInfo) -import Distribution.Simple.BuildPaths(exeExtension) -import System.FilePath((),(<.>),dropExtension) -import System.Directory(createDirectoryIfMissing,copyFile,doesDirectoryExist,doesFileExist) -import System.Process(rawSystem) -import System.Exit(ExitCode(..)) - --- | Notice about RGL not built anymore -noRGLmsg :: IO () -noRGLmsg = putStrLn "Notice: the RGL is not built as part of GF anymore. See https://github.com/GrammaticalFramework/gf-rgl" +import Distribution.Simple(defaultMain) main :: IO () -main = defaultMainWithHooks simpleUserHooks - { preBuild = gfPreBuild - , postBuild = gfPostBuild - , preInst = gfPreInst - , postInst = gfPostInst - , postCopy = gfPostCopy - } - where - gfPreBuild args = gfPre args . buildDistPref - gfPreInst args = gfPre args . installDistPref - - gfPre args distFlag = do - return emptyHookedBuildInfo - - gfPostBuild args flags pkg lbi = do - -- noRGLmsg - let gf = default_gf lbi - buildWeb gf flags (pkg,lbi) - - gfPostInst args flags pkg lbi = do - -- noRGLmsg - saveInstallPath args flags (pkg,lbi) - installWeb (pkg,lbi) - - gfPostCopy args flags pkg lbi = do - -- noRGLmsg - saveCopyPath args flags (pkg,lbi) - copyWeb flags (pkg,lbi) - - -- `cabal sdist` will not make a proper dist archive, for that see `make sdist` - -- However this function should exit quietly to allow building gf in sandbox - gfSDist pkg lbi hooks flags = do - return () - -saveInstallPath :: [String] -> InstallFlags -> (PackageDescription, LocalBuildInfo) -> IO () -saveInstallPath args flags bi = do - let - dest = NoCopyDest - dir = datadir (uncurry absoluteInstallDirs bi dest) - writeFile dataDirFile dir - -saveCopyPath :: [String] -> CopyFlags -> (PackageDescription, LocalBuildInfo) -> IO () -saveCopyPath args flags bi = do - let - dest = case copyDest flags of - NoFlag -> NoCopyDest - Flag d -> d - dir = datadir (uncurry absoluteInstallDirs bi dest) - writeFile dataDirFile dir - --- | Name of file where installation's data directory is recording --- This is a last-resort way in which the seprate RGL build script --- can determine where to put the compiled RGL files -dataDirFile :: String -dataDirFile = "DATA_DIR" - --- | Get path to locally-built gf -default_gf :: LocalBuildInfo -> FilePath -default_gf lbi = buildDir lbi exeName' exeNameReal - where - -- shadows Distribution.Simple.BuildPaths.exeExtension, which changed type signature in Cabal 2.4 - exeExtension = case hostPlatform lbi of - Platform arch Windows -> "exe" - _ -> "" - exeName' = "gf" - exeNameReal = exeName' <.> exeExtension - -{- - To test the GF web services, the minibar and the grammar editor, use - "cabal install" (or "runhaskell Setup.hs install") to install gf as usual. - Then start the server with the command "gf -server" and open - http://localhost:41296/ in your web browser (Firefox, Safari, Opera or - Chrome). The example grammars listed below will be available in the minibar. --} - -{- - Update 2018-07-04 - - The example grammars have now been removed from the GF repository. - This script will look for them in ../gf-contrib and build them from there if possible. - If not, the user will be given a message and nothing is build or copied. - (Unfortunately cabal install seems to hide all messages from stdout, - so users won't see this message unless they check the log.) --} - --- | Notice about contrib grammars -noContribMsg :: IO () -noContribMsg = putStr $ unlines - [ "Example grammars are no longer included in the main GF repository, but have moved to gf-contrib." - , "If you want them to be built, clone the following repository in the same directory as gf-core:" - , "https://github.com/GrammaticalFramework/gf-contrib.git" - ] - -example_grammars :: [(String, String, [String])] -- [(pgf, subdir, source modules)] -example_grammars = - [("Letter.pgf","letter",letterSrc) - ,("Foods.pgf","foods",foodsSrc) - ,("Phrasebook.pgf","phrasebook",phrasebookSrc) - ] - where - foodsSrc = ["Foods"++lang++".gf"|lang<-foodsLangs] - foodsLangs = words "Afr Amh Bul Cat Cze Dut Eng Epo Fin Fre Ger Gle Heb Hin Ice Ita Jpn Lav Mlt Mon Nep Pes Por Ron Spa Swe Tha Tsn Tur Urd" - - phrasebookSrc = ["Phrasebook"++lang++".gf"|lang<-phrasebookLangs] - phrasebookLangs = words "Bul Cat Chi Dan Dut Eng Lav Hin Nor Spa Swe Tha" -- only fastish languages - - letterSrc = ["Letter"++lang++".gf"|lang<-letterLangs] - letterLangs = words "Eng Fin Fre Heb Rus Swe" - -contrib_dir :: FilePath -contrib_dir = "..""gf-contrib" - -buildWeb :: String -> BuildFlags -> (PackageDescription, LocalBuildInfo) -> IO () -buildWeb gf flags (pkg,lbi) = do - contrib_exists <- doesDirectoryExist contrib_dir - if contrib_exists - then mapM_ build_pgf example_grammars - -- else noContribMsg - else return () - where - gfo_dir = buildDir lbi "examples" - - build_pgf :: (String, String, [String]) -> IO Bool - build_pgf (pgf,subdir,src) = - do createDirectoryIfMissing True tmp_dir - putStrLn $ "Building "++pgf - execute gf args - where - tmp_dir = gfo_dirsubdir - dir = contrib_dirsubdir - dest = NoCopyDest - gf_lib_path = datadir (absoluteInstallDirs pkg lbi dest) "lib" - args = numJobs flags++["-make","-s"] -- ,"-optimize-pgf" - ++["--gfo-dir="++tmp_dir, - --"--gf-lib-path="++gf_lib_path, - "--name="++dropExtension pgf, - "--output-dir="++gfo_dir] - ++[dirfile|file<-src] - -installWeb :: (PackageDescription, LocalBuildInfo) -> IO () -installWeb = setupWeb NoCopyDest - -copyWeb :: CopyFlags -> (PackageDescription, LocalBuildInfo) -> IO () -copyWeb flags = setupWeb dest - where - dest = case copyDest flags of - NoFlag -> NoCopyDest - Flag d -> d - -setupWeb :: CopyDest -> (PackageDescription, LocalBuildInfo) -> IO () -setupWeb dest (pkg,lbi) = do - mapM_ (createDirectoryIfMissing True) [grammars_dir,cloud_dir] - contrib_exists <- doesDirectoryExist contrib_dir - if contrib_exists - then mapM_ copy_pgf example_grammars - else return () -- message already displayed from buildWeb - where - grammars_dir = www_dir "grammars" - cloud_dir = www_dir "tmp" -- hmm - www_dir = datadir (absoluteInstallDirs pkg lbi dest) "www" - gfo_dir = buildDir lbi "examples" - - copy_pgf :: (String, String, [String]) -> IO () - copy_pgf (pgf,subdir,_) = - do let src = gfo_dir pgf - let dst = grammars_dir pgf - ex <- doesFileExist src - if ex then do putStrLn $ "Installing "++dst - copyFile src dst - else putStrLn $ "Not installing "++dst - --- | Run an arbitrary system command, returning False on failure -execute :: String -> [String] -> IO Bool -execute command args = - do let cmdline = command ++ " " ++ unwords (map showArg args) - e <- rawSystem command args - case e of - ExitSuccess -> return True - ExitFailure i -> do putStrLn $ "Ran: " ++ cmdline - putStrLn $ command++" exited with exit code: " ++ show i - return False - where - showArg arg = if ' ' `elem` arg then "'" ++ arg ++ "'" else arg - --- | This function is used to enable parallel compilation of the RGL and example grammars -numJobs :: BuildFlags -> [String] -numJobs flags = - if null n - then ["-j","+RTS","-A20M","-N","-RTS"] - else ["-j="++n,"+RTS","-A20M","-N"++n,"-RTS"] - where - -- buildNumJobs is only available in Cabal>=1.20 - n = case buildNumJobs flags of - Flag mn | mn/=Just 1-> maybe "" show mn - _ -> "" +main = defaultMain diff --git a/src/compiler/gf.cabal b/src/compiler/gf.cabal index 9f36f4b6e..84e3ad832 100644 --- a/src/compiler/gf.cabal +++ b/src/compiler/gf.cabal @@ -2,7 +2,7 @@ name: gf version: 3.11.0-git cabal-version: 1.22 -build-type: Custom +build-type: Simple license: OtherLicense license-file: LICENSE category: Natural Language Processing, Compiler @@ -39,14 +39,6 @@ data-files: www/translator/*.css www/translator/*.js -custom-setup - setup-depends: - base >= 4.9.1, - Cabal >= 1.22.0.0, - directory >= 1.3.0 && < 1.4, - filepath >= 1.4.1 && < 1.5, - process >= 1.0.1.1 && < 1.7 - source-repository head type: git location: https://github.com/GrammaticalFramework/gf-core.git @@ -192,7 +184,7 @@ executable gf GF.Text.Lexing GF.Text.Transliterations Paths_gf - + -- not really part of GF but I have changed the original binary library -- and we have to keep the copy for now. Data.Binary diff --git a/src/runtime/c/Makefile.am b/src/runtime/c/Makefile.am index f5feae7a3..7765ab300 100644 --- a/src/runtime/c/Makefile.am +++ b/src/runtime/c/Makefile.am @@ -26,6 +26,8 @@ libpgf_la_SOURCES = \ pgf/typechecker.h \ pgf/linearizer.cxx \ pgf/linearizer.h \ + pgf/parser.cxx \ + pgf/parser.h \ pgf/graphviz.cxx \ pgf/graphviz.h \ pgf/data.cxx \ diff --git a/src/runtime/c/pgf/data.cxx b/src/runtime/c/pgf/data.cxx index 42e2f504d..f7ba0b3fb 100644 --- a/src/runtime/c/pgf/data.cxx +++ b/src/runtime/c/pgf/data.cxx @@ -47,9 +47,9 @@ void PgfConcr::release(ref concr) void PgfConcrLincat::release(ref lincat) { for (size_t i = 0; i < lincat->fields->len; i++) { - text_db_release(*vector_elem(lincat->fields, i)); + PgfLincatField::release(vector_elem(lincat->fields, i)); } - Vector>::release(lincat->fields); + Vector::release(lincat->fields); for (size_t i = 0; i < lincat->args->len; i++) { PgfLParam::release(vector_elem(lincat->args, i)->param); @@ -66,6 +66,13 @@ void PgfConcrLincat::release(ref lincat) PgfDB::free(lincat, lincat->name.size+1); } +void PgfLincatField::release(ref field) +{ + text_db_release(field->name); + if (field->backrefs != 0) + Vector::release(field->backrefs); +} + void PgfLParam::release(ref param) { PgfDB::free(param, param->n_terms*sizeof(param->terms[0])); diff --git a/src/runtime/c/pgf/data.h b/src/runtime/c/pgf/data.h index b3ad9846f..185e9af05 100644 --- a/src/runtime/c/pgf/data.h +++ b/src/runtime/c/pgf/data.h @@ -209,17 +209,25 @@ struct PGF_INTERNAL_DECL PgfSymbolALLCAPIT { static const uint8_t tag = 10; }; +struct PGF_INTERNAL_DECL PgfLincatBackref; + +struct PGF_INTERNAL_DECL PgfLincatField { + ref name; + ref> backrefs; + + static void release(ref field); +}; + struct PGF_INTERNAL_DECL PgfConcrLincat { static const uint8_t tag = 0; ref abscat; - ref>> fields; - size_t n_lindefs; ref> args; ref>> res; ref>> seqs; + ref> fields; PgfText name; @@ -230,6 +238,7 @@ struct PGF_INTERNAL_DECL PgfConcrLin { static const uint8_t tag = 1; ref absfun; + ref lincat; ref> args; ref>> res; @@ -240,6 +249,12 @@ struct PGF_INTERNAL_DECL PgfConcrLin { static void release(ref lin); }; +struct PGF_INTERNAL_DECL PgfLincatBackref { + ref lin; + size_t seq_index; + size_t dot; +}; + struct PGF_INTERNAL_DECL PgfConcrPrintname { ref printname; PgfText name; diff --git a/src/runtime/c/pgf/heap.h b/src/runtime/c/pgf/heap.h deleted file mode 100644 index 61f9d7ce8..000000000 --- a/src/runtime/c/pgf/heap.h +++ /dev/null @@ -1,79 +0,0 @@ -#ifndef HEAP_H -#define HEAP_H - -template -class PGF_INTERNAL_DECL Heap { -public: - Heap() { - len = 0; - avail = 0; - values = NULL; - } - - ~Heap() { free(values); } - - void push(A value) { - if (len >= avail) { - avail = get_next_padovan(len+1); - A *new_values = (A *) realloc(values, sizeof(A)*avail); - if (new_values == NULL) - throw pgf_systemerror(errno); - values = new_values; - } - siftdown(value, 0, len); - len++; - } - - bool is_empty() { return (len == 0); } - - A top() { return values[0]; } - - A pop() { - A top = values[0]; - siftup(&values[len-1],0); - len--; - return top; - } - -private: - size_t len; - size_t avail; - A *values; - - void siftdown(A value, size_t startpos, size_t pos) { - while (pos > startpos) { - size_t parentpos = (pos - 1) >> 1; - A parent = values[parentpos]; - - if (value >= parent) - break; - - values[pos] = parent; - pos = parentpos; - } - - values[pos] = value; - } - - void siftup(A *pvalue, size_t pos) { - size_t startpos = pos; - size_t endpos = len; - - size_t childpos = 2*pos + 1; - while (childpos < endpos) { - size_t rightpos = childpos + 1; - if (rightpos < endpos && - values[childpos] >= values[rightpos]) { - childpos = rightpos; - } - - values[pos] = values[childpos]; - pos = childpos; - childpos = 2*pos + 1; - } - - siftdown(*pvalue, startpos, pos); - } -}; - -#endif diff --git a/src/runtime/c/pgf/linearizer.cxx b/src/runtime/c/pgf/linearizer.cxx index 43df0ca34..0156761f0 100644 --- a/src/runtime/c/pgf/linearizer.cxx +++ b/src/runtime/c/pgf/linearizer.cxx @@ -287,11 +287,7 @@ void PgfLinearizer::TreeLinNode::check_category(PgfLinearizer *linearizer, PgfTe void PgfLinearizer::TreeLinNode::linearize(PgfLinearizationOutputIface *out, PgfLinearizer *linearizer, size_t lindex) { PgfText *cat = &lin->absfun->type->name; - PgfText *field = NULL; - ref lincat = namespace_lookup(linearizer->concr->lincats, cat); - if (lincat != 0) { - field = &(**vector_elem(lincat->fields, lindex)); - } + PgfText *field = &*(vector_elem(lin->lincat->fields, lindex)->name); if (linearizer->pre_stack == NULL) out->begin_phrase(cat, fid, field, &lin->name); @@ -393,7 +389,7 @@ void PgfLinearizer::TreeLindefNode::linearize_arg(PgfLinearizationOutputIface *o void PgfLinearizer::TreeLindefNode::linearize(PgfLinearizationOutputIface *out, PgfLinearizer *linearizer, size_t lindex) { if (lincat != 0) { - PgfText *field = &(**vector_elem(lincat->fields, lindex)); + PgfText *field = &*(vector_elem(lincat->fields, lindex)->name); if (linearizer->pre_stack == NULL) out->begin_phrase(&lincat->name, fid, field, linearizer->wild); else { @@ -546,7 +542,7 @@ void PgfLinearizer::TreeLitNode::linearize(PgfLinearizationOutputIface *out, Pgf { PgfText *field = NULL; if (lincat != 0) { - field = &(**vector_elem(lincat->fields, lindex)); + field = &*(vector_elem(lincat->fields, lindex)->name); } linearizer->flush_pre_stack(out, literal); diff --git a/src/runtime/c/pgf/linearizer.h b/src/runtime/c/pgf/linearizer.h index 9e61b7576..3bd015cb7 100644 --- a/src/runtime/c/pgf/linearizer.h +++ b/src/runtime/c/pgf/linearizer.h @@ -98,6 +98,14 @@ class PGF_INTERNAL_DECL PgfLinearizer : public PgfUnmarshaller { ~TreeLitNode() { free(literal); }; }; + struct TreeChunksNode : public TreeNode { + TreeChunksNode(PgfLinearizer *linearizer); + virtual bool resolve(PgfLinearizer *linearizer); + virtual void check_category(PgfLinearizer *linearizer, PgfText *cat); + virtual void linearize(PgfLinearizationOutputIface *out, PgfLinearizer *linearizer, size_t lindex); + virtual ref get_lincat(PgfLinearizer *linearizer); + }; + TreeNode *prev; TreeNode *next; TreeNode *args; diff --git a/src/runtime/c/pgf/parser.cxx b/src/runtime/c/pgf/parser.cxx new file mode 100644 index 000000000..71f47d4f2 --- /dev/null +++ b/src/runtime/c/pgf/parser.cxx @@ -0,0 +1,455 @@ +#include "data.h" +#include "printer.h" +#include "parser.h" +#include +#include +#include +#include + +// #define PARSER_DEBUG + +class PGF_INTERNAL_DECL PgfParser::CFGCat { +public: + ref field; + size_t value; + + // copy assignment + bool operator<(const CFGCat& other) const + { + if (field < other.field) + return true; + else if (field == other.field) + return (value < other.value); + else + return false; + } +}; + +struct PGF_INTERNAL_DECL PgfParser::Choice { + size_t id; + std::vector prods; + + Choice(size_t id) { + this->id = id; + } +}; + + +class PGF_INTERNAL_DECL PgfParser::Production +{ +public: + static + void predict(Choice *choice, ref lin, size_t seq_index) + { + size_t n_args = lin->absfun->type->hypos->len; + + Production *prod = (Production*) + malloc(sizeof(Production)+sizeof(Choice*)*n_args); + prod->lin = lin; + prod->seq_index = seq_index; + memset(prod->args, 0, sizeof(Choice*)*n_args); + + prod->log(choice); + choice->prods.push_back(prod); + } + + void log(Choice *res) { +#ifdef PARSER_DEBUG + PgfPrinter printer(NULL,0,NULL); + printer.nprintf(10, "?%ld = ", res->id); + printer.puts(&lin->name); + + auto hypos = lin->absfun->type->hypos; + for (size_t i = 0; i < hypos->len; i++) { + if (args[i] == NULL) + printer.efun(&hypos->data[i].type->name); + else + printer.nprintf(10, " ?%ld", args[i]->id); + } + printer.puts("\n"); + printer.dump(); +#endif + } + + ref lin; + size_t seq_index; + Choice *args[]; +}; + +struct PGF_INTERNAL_DECL PgfParser::ItemConts { + State *state; + std::vector items; +}; + +class PGF_INTERNAL_DECL PgfParser::Item +{ +public: + static + void combine(State *state, PgfLincatBackref *backref, Choice *choice) + { + ref seq = + *vector_elem(backref->lin->seqs, backref->seq_index); + + size_t index = backref->seq_index % backref->lin->lincat->fields->len; + ref field = vector_elem(backref->lin->lincat->fields, index); + +// state->get_conts(field, 0); + if (backref->dot+1 < seq->syms.len) { + size_t n_args = backref->lin->absfun->type->hypos->len; + + Item *item = (Item*) + malloc(sizeof(Item)+sizeof(Choice*)*n_args); + item->lin = backref->lin; + item->seq_index = backref->seq_index; + item->dot = backref->dot+1; + + memset(item->args, 0, sizeof(Choice*)*n_args); + ref seq = + *vector_elem(item->lin->seqs, backref->seq_index); + PgfSymbol sym = seq->syms.data[backref->dot]; + ref symcat = ref::untagged(sym); + item->args[symcat->d] = choice; + + item->log(); + } else { + Production::predict(choice, backref->lin, backref->seq_index); + } + } + + Production *complete() + { + size_t n_args = lin->absfun->type->hypos->len; + + Production *prod = (Production*) + malloc(sizeof(Production)+sizeof(Choice*)*n_args); + prod->lin = lin; + prod->seq_index = seq_index; + memcpy(prod->args, args, sizeof(Choice*)*n_args); + + return prod; + } + + void log() { +#ifdef PARSER_DEBUG + PgfPrinter printer(NULL,0,NULL); + + size_t index = seq_index / lin->lincat->fields->len; + ref res = *vector_elem(lin->res, index); + ref ty = lin->absfun->type; + + if (res->vars != 0) { + printer.lvar_ranges(res->vars); + printer.puts(" . "); + } + + printer.efun(&ty->name); + printer.puts("("); + printer.lparam(ref::from_ptr(&res->param)); + printer.puts(") -> "); + + printer.efun(&lin->name); + printer.puts("["); + size_t n_args = lin->args->len / lin->res->len; + for (size_t i = 0; i < n_args; i++) { + if (i > 0) + printer.puts(","); + + if (args[i] == NULL) + printer.parg(vector_elem(ty->hypos, i)->type, + vector_elem(lin->args, index*n_args + i)); + else + printer.nprintf(10, "?%ld", args[i]->id); + } + + printer.nprintf(10, "]; %ld : ", seq_index % lin->lincat->fields->len); + ref seq = *vector_elem(lin->seqs, seq_index); + for (size_t i = 0; i < seq->syms.len; i++) { + if (i > 0) + printer.puts(" "); + if (i == dot) + printer.puts(". "); + printer.symbol(*vector_elem(&seq->syms, i)); + } + printer.puts("\n"); + + printer.dump(); +#endif + } + + +private: + ItemConts *conts; + ref lin; + size_t seq_index; + size_t dot; + Choice *args[]; +}; + +class PGF_INTERNAL_DECL PgfParser::State +{ +public: + ItemConts *get_conts(ref field, size_t value) + { + ItemConts *conts; + CFGCat cfg_cat = {field, value}; + auto itr1 = contss.find(cfg_cat); + if (itr1 == contss.end()) { + conts = new ItemConts(); + conts->state = this; + contss.insert(std::pair(cfg_cat, conts)); + } else { + conts = itr1->second; + } + return conts; + } + +public: + size_t start, end; + State *prev, *next; + + std::map contss; + std::map choices; + std::priority_queue,PgfParser::ResultComparator> queue; +}; + + +class PgfParser::ResultExpr : public Result +{ +public: + ResultExpr(Production *prod) + { + this->inside_prob = prod->lin->absfun->prob; + this->outside_prob = prod->lin->lincat->abscat->prob; + this->prod = prod; + this->arg_index = 0; + } + + virtual prob_t prob() + { + return inside_prob+outside_prob; + } + + virtual PgfExpr expr(PgfUnmarshaller *u) + { + return u->efun(&prod->lin->name); + } + + virtual void proceed(PgfParser *parser, PgfUnmarshaller *u) + { + } + +private: + prob_t inside_prob; + prob_t outside_prob; + + Production *prod; + size_t arg_index; +}; + +class PgfParser::ResultMeta : public Result +{ +public: + ResultMeta(State *state, + PgfExpr arg, prob_t prob, + ResultMeta *next) + { + this->inside_prob = prob + (next ? next->inside_prob : 0); + this->state = state; + this->arg = arg; + this->next = next; + } + + virtual prob_t prob() + { + return inside_prob; + } + + virtual PgfExpr expr(PgfUnmarshaller *u) + { + ResultMeta *res = this; + PgfExpr expr = u->emeta(0); + while (res->arg != 0) { + PgfExpr expr1 = u->eapp(expr, res->arg); + u->free_ref(expr); + expr = expr1; + res = res->next; + } + return expr; + } + + virtual void proceed(PgfParser *parser, PgfUnmarshaller *u) + { + if (state->choices.size() == 0) { + State *prev = state; + while (prev->prev != NULL && prev->choices.size() == 0) { + prev = prev->prev; + } + + size_t size = state->start-prev->end; + PgfText *token = (PgfText *) alloca(sizeof(PgfText)+size+1); + token->size = size; + memcpy(token->text,parser->sentence->text+prev->end,size); + token->text[size] = 0; + + PgfExpr expr = u->elit(u->lstr(token)); + prev->queue.push(new ResultMeta(prev, + expr, 0, + this)); + } else { + for (auto it : state->choices) { + ItemConts *conts = it.first; + Choice *choice = it.second; + + for (Production *prod : choice->prods) { + PgfExpr expr = u->efun(&prod->lin->name); + prob_t prob = prod->lin->absfun->prob + + prod->lin->lincat->abscat->prob; + conts->state->queue.push(new ResultMeta(conts->state, + expr, prob, + this)); + } + } + } + } + +private: + prob_t inside_prob; + State *state; + PgfExpr arg; + ResultMeta *next; +}; + +PgfParser::PgfParser(ref start, PgfText *sentence) +{ + this->start = start; + this->sentence = textdup(sentence); + this->last_choice_id = 0; + this->before = NULL; + this->after = NULL; + this->fetch_state = NULL; +} + +void PgfParser::space(size_t start, size_t end, PgfExn* err) +{ + State *prev = NULL; + State *next = before; + while (next != NULL && next->start < start) { + prev = next; + next = next->next; + } + + if (next == NULL || next->start != start) { + before = new State(); + before->start = start; + before->end = end; + before->prev = prev; + before->next = next; + + if (prev != NULL) prev->next = before; + if (next != NULL) next->prev = before; + } else { + before = next; + before->end = end; + } + + if (end == sentence->size) { + fetch_state = after; + fetch_state->queue.push(new ResultMeta(after,0,0,NULL)); + } +} + +void PgfParser::start_matches(size_t end, PgfExn* err) +{ + State *prev = NULL; + State *next = before; + while (next != NULL && next->start < end) { + prev = next; + next = next->next; + } + + if (next == NULL || next->start != end) { + after = new State(); + after->start = end; + after->end = end; + after->prev = prev; + after->next = next; + + if (prev != NULL) prev->next = after; + if (next != NULL) next->prev = after; + } else { + after = next; + } +} + +void PgfParser::match(ref lin, size_t seq_index, PgfExn* err) +{ + size_t index = seq_index % lin->lincat->fields->len; + ref field = vector_elem(lin->lincat->fields, index); + + ItemConts *conts = before->get_conts(field, 0); + + Choice *choice; + auto itr2 = after->choices.find(conts); + if (itr2 == after->choices.end()) { + choice = new Choice(++last_choice_id); + after->choices.insert(std::pair(conts, choice)); + } else { + choice = itr2->second; + } + + Production::predict(choice,lin,seq_index); +/* + if (itr2 == after->choices.end()) { + for (size_t i = 0; i < field->backrefs->len; i++) { + PgfLincatBackref *backref = vector_elem(field->backrefs, i); + Item::combine(before, backref, choice); + } + }*/ +} + +void PgfParser::end_matches(size_t end, PgfExn* err) +{ + if (end == sentence->size) { + fetch_state = after; + fetch_state->queue.push(new ResultMeta(after,0,0,NULL)); + } +} + +PgfExpr PgfParser::fetch(PgfDB *db, PgfUnmarshaller *u, prob_t *prob) +{ + DB_scope scope(db, READER_SCOPE); + + while (fetch_state != NULL && fetch_state->queue.empty()) { + fetch_state = fetch_state->next; + } + + if (fetch_state == NULL) { + return 0; + } + + while (fetch_state->prev != NULL) { + if (!fetch_state->queue.empty()) { + Result *res = fetch_state->queue.top(); + fetch_state->queue.pop(); + res->proceed(this,u); + } + + fetch_state = fetch_state->prev; + } + + if (fetch_state->queue.empty()) { + return 0; + } + + Result *res = fetch_state->queue.top(); + fetch_state->queue.pop(); + *prob = res->prob(); + + return res->expr(u); +} + +PgfParser::~PgfParser() +{ + free(sentence); + printf("~PgfParser()\n"); +} diff --git a/src/runtime/c/pgf/parser.h b/src/runtime/c/pgf/parser.h new file mode 100644 index 000000000..b0410fe60 --- /dev/null +++ b/src/runtime/c/pgf/parser.h @@ -0,0 +1,51 @@ +#ifndef PARSER_H +#define PARSER_H + +class PGF_INTERNAL_DECL PgfParser : public PgfPhraseScanner, public PgfExprEnum { +public: + PgfParser(ref start, PgfText *sentence); + + void space(size_t start, size_t end, PgfExn* err); + void start_matches(size_t end, PgfExn* err); + void match(ref lin, size_t seq_index, PgfExn* err); + void end_matches(size_t end, PgfExn* err); + + PgfExpr fetch(PgfDB *db, PgfUnmarshaller *u, prob_t *prob); + + virtual ~PgfParser(); + +private: + class CFGCat; + class State; + class Item; + class ItemConts; + class Choice; + class Production; + + class Result { + public: + virtual prob_t prob() = 0; + virtual PgfExpr expr(PgfUnmarshaller *u) = 0; + virtual void proceed(PgfParser *parser, PgfUnmarshaller *u) = 0; + }; + + class ResultExpr; + class ResultMeta; + + class ResultComparator : std::less { + public: + bool operator()(Result* &lhs, Result* &rhs) const + { + return lhs->prob() > rhs->prob(); + } + }; + + ref start; + PgfText *sentence; + + size_t last_choice_id; + + State *before, *after, *fetch_state; +}; + +#endif diff --git a/src/runtime/c/pgf/pgf.cxx b/src/runtime/c/pgf/pgf.cxx index 1ab3a1256..028edee46 100644 --- a/src/runtime/c/pgf/pgf.cxx +++ b/src/runtime/c/pgf/pgf.cxx @@ -11,6 +11,7 @@ #include "printer.h" #include "typechecker.h" #include "linearizer.h" +#include "parser.h" #include "graphviz.h" static void @@ -815,6 +816,35 @@ pgf_is_case_sensitive(ref concr) return true; } +class PGF_INTERNAL_DECL PgfMorphoScanner : public PgfPhraseScanner { +public: + PgfMorphoScanner(PgfMorphoCallback* callback) { + this->callback = callback; + } + + virtual void space(size_t start, size_t end, PgfExn* err) + { + } + + virtual void start_matches(size_t end, PgfExn* err) + { + } + + virtual void match(ref lin, size_t seq_index, PgfExn* err) + { + ref field = + vector_elem(lin->lincat->fields, seq_index % lin->lincat->fields->len); + callback->fn(callback, &lin->absfun->name, &(*field->name), lin->lincat->abscat->prob+lin->absfun->prob, err); + } + + virtual void end_matches(size_t end, PgfExn* err) + { + } + +private: + PgfMorphoCallback* callback; +}; + PGF_API void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision, PgfText *sentence, @@ -826,13 +856,45 @@ void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision, bool case_sensitive = pgf_is_case_sensitive(concr); + PgfMorphoScanner scanner(callback); phrasetable_lookup(concr->phrasetable, sentence, case_sensitive, - concr->lincats, - callback, err); + &scanner, err); } PGF_API_END } +class PGF_INTERNAL_DECL PgfCohortsScanner : public PgfPhraseScanner { +public: + PgfCohortsScanner(PgfCohortsCallback* callback) { + this->callback = callback; + } + + virtual void space(size_t start, size_t end, PgfExn* err) + { + match_start = end; + } + + virtual void start_matches(size_t match_end, PgfExn* err) + { + } + + virtual void match(ref lin, size_t seq_index, PgfExn* err) + { + ref field = + vector_elem(lin->lincat->fields, seq_index % lin->lincat->fields->len); + callback->morpho.fn(&callback->morpho, &lin->absfun->name, &(*field->name), lin->lincat->abscat->prob+lin->absfun->prob, err); + } + + virtual void end_matches(size_t match_end, PgfExn* err) + { + callback->fn(callback, match_start, match_end, err); + } + +private: + size_t match_start; + PgfCohortsCallback* callback; +}; + PGF_API void pgf_lookup_cohorts(PgfDB *db, PgfConcrRevision cnc_revision, PgfText *sentence, @@ -844,10 +906,10 @@ void pgf_lookup_cohorts(PgfDB *db, PgfConcrRevision cnc_revision, bool case_sensitive = pgf_is_case_sensitive(concr); + PgfCohortsScanner scanner(callback); phrasetable_lookup_cohorts(concr->phrasetable, sentence, case_sensitive, - concr->lincats, - callback, err); + &scanner, err); } PGF_API_END } @@ -885,7 +947,7 @@ PGF_API PgfText *pgf_get_lincat_field_internal(object o, size_t i) { ref lincat = o; - return &(**vector_elem(lincat->fields, i)); + return &*(vector_elem(lincat->fields, i)->name); } PGF_API @@ -903,22 +965,18 @@ PgfText *pgf_print_lindef_internal(PgfPhrasetableIds *seq_ids, object o, size_t PgfInternalMarshaller m; PgfPrinter printer(NULL,0,&m); - printer.efun(&lincat->name); - printer.puts(" : "); - ref res = *vector_elem(lincat->res, i); - if (res->vars != 0) { printer.lvar_ranges(res->vars); printer.puts(" . "); } - printer.puts(" String(0) -> "); - printer.efun(&lincat->name); printer.puts("("); printer.lparam(ref::from_ptr(&res->param)); - printer.puts(") = ["); + printer.puts(") -> "); + printer.efun(&lincat->name); + printer.puts("[String(0)] = ["); size_t n_seqs = lincat->fields->len; for (size_t j = 0; j < n_seqs; j++) { @@ -942,20 +1000,19 @@ PgfText *pgf_print_linref_internal(PgfPhrasetableIds *seq_ids, object o, size_t PgfInternalMarshaller m; PgfPrinter printer(NULL,0,&m); - printer.efun(&lincat->name); - printer.puts(" : "); - ref res = *vector_elem(lincat->res, lincat->n_lindefs+i); - if (res->vars != 0) { printer.lvar_ranges(res->vars); printer.puts(" . "); } + printer.puts("String(0) -> "); + printer.efun(&lincat->name); + printer.puts("["); printer.efun(&lincat->name); printer.puts("("); printer.lparam(vector_elem(lincat->args, lincat->n_lindefs+i)->param); - printer.puts(") -> String(0) = ["); + printer.puts(")] = ["); size_t n_seqs = lincat->fields->len; ref seq = *vector_elem(lincat->seqs, lincat->n_lindefs*n_seqs+i); @@ -970,37 +1027,33 @@ PGF_API PgfText *pgf_print_lin_internal(PgfPhrasetableIds *seq_ids, object o, size_t i) { ref lin = o; - ref ty = lin->absfun->type; PgfInternalMarshaller m; PgfPrinter printer(NULL,0,&m); - printer.efun(&lin->name); - printer.puts(" : "); - ref res = *vector_elem(lin->res, i); + ref ty = lin->absfun->type; if (res->vars != 0) { printer.lvar_ranges(res->vars); printer.puts(" . "); } - size_t n_args = lin->args->len / lin->res->len; - for (size_t j = 0; j < n_args; j++) { - if (j > 0) - printer.puts(" * "); - - printer.parg(vector_elem(ty->hypos, j)->type, - vector_elem(lin->args, i*n_args + j)); - } - - if (n_args > 0) - printer.puts(" -> "); - printer.efun(&ty->name); printer.puts("("); printer.lparam(ref::from_ptr(&res->param)); - printer.puts(") = ["); + printer.puts(") -> "); + + printer.efun(&lin->name); + printer.puts("["); + size_t n_args = lin->args->len / lin->res->len; + for (size_t j = 0; j < n_args; j++) { + if (j > 0) + printer.puts(","); + printer.parg(vector_elem(ty->hypos, j)->type, + vector_elem(lin->args, i*n_args + j)); + } + printer.puts("] = ["); size_t n_seqs = lin->seqs->len / lin->res->len; for (size_t j = 0; j < n_seqs; j++) { @@ -1439,10 +1492,11 @@ public: this->n_lindefs = n_lindefs; this->n_linrefs = n_linrefs; - ref>> db_fields = vector_new>(n_fields); + ref> db_fields = vector_new(n_fields); for (size_t i = 0; i < n_fields; i++) { - ref field = textdup_db(fields[i]); - *vector_elem(db_fields, i) = field; + ref name = textdup_db(fields[i]); + vector_elem(db_fields, i)->name = name; + vector_elem(db_fields, i)->backrefs = 0; } ref lincat = PgfDB::malloc(abscat->name.size+1); @@ -2098,7 +2152,7 @@ PgfText **pgf_category_fields(PgfDB *db, PgfConcrRevision revision, if (fields == 0) throw pgf_systemerror(ENOMEM); for (size_t i = 0; i < n_fields; i++) { - fields[i] = textdup(lincat->fields->data[i]); + fields[i] = textdup(vector_elem(lincat->fields, i)->name); } *p_n_fields = n_fields; return fields; @@ -2188,7 +2242,7 @@ PgfText **pgf_tabular_linearize(PgfDB *db, PgfConcrRevision revision, PgfText *text = out.get_text(); if (text != NULL) { - res[pos++] = textdup(&(*lincat->fields->data[i])); + res[pos++] = textdup(&*(vector_elem(lincat->fields,i)->name)); res[pos++] = text; } } @@ -2227,7 +2281,7 @@ PgfText **pgf_tabular_linearize_all(PgfDB *db, PgfConcrRevision revision, PgfText *text = out.get_text(); if (text != NULL) { - res[pos++] = textdup(&(*lincat->fields->data[i])); + res[pos++] = textdup(&*(vector_elem(lincat->fields, i)->name)); res[pos++] = text; } } @@ -2240,7 +2294,7 @@ PgfText **pgf_tabular_linearize_all(PgfDB *db, PgfConcrRevision revision, return NULL; } -PGF_API_DECL +PGF_API void pgf_bracketed_linearize(PgfDB *db, PgfConcrRevision revision, PgfExpr expr, PgfPrintContext *ctxt, PgfMarshaller *m, @@ -2260,7 +2314,7 @@ void pgf_bracketed_linearize(PgfDB *db, PgfConcrRevision revision, } PGF_API_END } -PGF_API_DECL +PGF_API void pgf_bracketed_linearize_all(PgfDB *db, PgfConcrRevision revision, PgfExpr expr, PgfPrintContext *ctxt, PgfMarshaller *m, @@ -2281,6 +2335,70 @@ void pgf_bracketed_linearize_all(PgfDB *db, PgfConcrRevision revision, } PGF_API_END } +struct PGF_INTERNAL_DECL PgfLincatUnmarshaller : PgfUnmarshaller { + PgfLincatUnmarshaller(ref concr) { + this->concr = concr; + this->lincat = 0; + } + + virtual PgfExpr eabs(PgfBindType btype, PgfText *name, PgfExpr body) { return 0; } + virtual PgfExpr eapp(PgfExpr fun, PgfExpr arg) { return 0; } + virtual PgfExpr elit(PgfLiteral lit) { return 0; } + virtual PgfExpr emeta(PgfMetaId meta) { return 0; } + virtual PgfExpr efun(PgfText *name) { return 0; } + virtual PgfExpr evar(int index) { return 0; } + virtual PgfExpr etyped(PgfExpr expr, PgfType typ) { return 0; } + virtual PgfExpr eimplarg(PgfExpr expr) { return 0; } + virtual PgfLiteral lint(size_t size, uintmax_t *v) { return 0; } + virtual PgfLiteral lflt(double v) { return 0; } + virtual PgfLiteral lstr(PgfText *v) { return 0; } + virtual PgfType dtyp(size_t n_hypos, PgfTypeHypo *hypos, + PgfText *cat, + size_t n_exprs, PgfExpr *exprs) { + lincat = + namespace_lookup(concr->lincats, cat); + return 0; + } + virtual void free_ref(object x) {}; + + ref concr; + ref lincat; +}; + +PGF_API +PgfExprEnum *pgf_parse(PgfDB *db, PgfConcrRevision revision, + PgfType ty, PgfMarshaller *m, + PgfText *sentence, + PgfExn * err) +{ + PGF_API_BEGIN { + DB_scope scope(db, READER_SCOPE); + + ref concr = db->revision2concr(revision); + + bool case_sensitive = pgf_is_case_sensitive(concr); + + PgfLincatUnmarshaller u(concr); + m->match_type(&u, ty); + if (u.lincat == 0) + return 0; + + PgfParser *parser = new PgfParser(u.lincat, sentence); + phrasetable_lookup_cohorts(concr->phrasetable, + sentence, case_sensitive, + parser, err); + return parser; + } PGF_API_END + + return NULL; +} + +PGF_API +void pgf_free_expr_enum(PgfExprEnum *en) +{ + delete en; +} + PGF_API PgfText *pgf_get_printname(PgfDB *db, PgfConcrRevision revision, PgfText *fun, PgfExn* err) diff --git a/src/runtime/c/pgf/pgf.h b/src/runtime/c/pgf/pgf.h index 93f5c30ed..043961737 100644 --- a/src/runtime/c/pgf/pgf.h +++ b/src/runtime/c/pgf/pgf.h @@ -724,6 +724,31 @@ void pgf_bracketed_linearize_all(PgfDB *db, PgfConcrRevision revision, PgfLinearizationOutputIface *out, PgfExn* err); +#ifdef __cplusplus +struct PgfExprEnum { + virtual PgfExpr fetch(PgfDB *db, PgfUnmarshaller *u, prob_t *prob)=0; + virtual ~PgfExprEnum() {}; +}; +#else +typedef struct PgfExprEnum PgfExprEnum; +typedef struct PgfExprEnumVtbl PgfExprEnumVtbl; +struct PgfExprEnumVtbl { + PgfExpr (*fetch)(PgfExprEnum *this, PgfDB *db, PgfUnmarshaller *u, prob_t *prob); +}; +struct PgfExprEnum { + PgfExprEnumVtbl *vtbl; +}; +#endif + +PGF_API_DECL +PgfExprEnum *pgf_parse(PgfDB *db, PgfConcrRevision revision, + PgfType ty, PgfMarshaller *m, + PgfText *sentence, + PgfExn * err); + +PGF_API_DECL +void pgf_free_expr_enum(PgfExprEnum *en); + PGF_API_DECL PgfText *pgf_get_printname(PgfDB *db, PgfConcrRevision revision, PgfText *fun, PgfExn* err); diff --git a/src/runtime/c/pgf/phrasetable.cxx b/src/runtime/c/pgf/phrasetable.cxx index b032337ba..791acc324 100644 --- a/src/runtime/c/pgf/phrasetable.cxx +++ b/src/runtime/c/pgf/phrasetable.cxx @@ -1,5 +1,5 @@ #include "data.h" -#include "heap.h" +#include PgfPhrasetableIds::PgfPhrasetableIds() { @@ -231,10 +231,6 @@ int sequence_cmp(ref seq1, ref seq2) struct PGF_INTERNAL_DECL PgfTextSpot { size_t pos; // position in Unicode characters const uint8_t *ptr; // pointer into the spot location - - bool operator >= (PgfTextSpot const &obj) { - return pos >= obj.pos; - } }; static @@ -479,8 +475,7 @@ PGF_INTERNAL void phrasetable_lookup(PgfPhrasetable table, PgfText *sentence, bool case_sensitive, - Namespace lincats, - PgfMorphoCallback* callback, PgfExn* err) + PgfPhraseScanner *scanner, PgfExn* err) { if (table == 0) return; @@ -491,9 +486,9 @@ void phrasetable_lookup(PgfPhrasetable table, const uint8_t *end = current.ptr+sentence->size; int cmp = text_sequence_cmp(¤t,end,table->value.seq,case_sensitive,true); if (cmp < 0) { - phrasetable_lookup(table->left,sentence,case_sensitive,lincats,callback,err); + phrasetable_lookup(table->left,sentence,case_sensitive,scanner,err); } else if (cmp > 0) { - phrasetable_lookup(table->right,sentence,case_sensitive,lincats,callback,err); + phrasetable_lookup(table->right,sentence,case_sensitive,scanner,err); } else { auto backrefs = table->value.backrefs; if (backrefs != 0) { @@ -502,13 +497,8 @@ void phrasetable_lookup(PgfPhrasetable table, switch (ref::get_tag(backref.container)) { case PgfConcrLin::tag: { ref lin = ref::untagged(backref.container); - ref lincat = - namespace_lookup(lincats, &lin->absfun->type->name); - if (lin->absfun->type->hypos->len == 0 && lincat != 0) { - ref field = - *vector_elem(lincat->fields, backref.seq_index % lincat->fields->len); - - callback->fn(callback, &lin->absfun->name, &(*field), lincat->abscat->prob+lin->absfun->prob, err); + if (lin->absfun->type->hypos->len == 0) { + scanner->match(lin, backref.seq_index, err); if (err->type != PGF_EXN_NONE) return; } @@ -523,10 +513,10 @@ void phrasetable_lookup(PgfPhrasetable table, } if (!case_sensitive) { - phrasetable_lookup(table->left,sentence,false,lincats,callback,err); + phrasetable_lookup(table->left,sentence,false,scanner,err); if (err->type != PGF_EXN_NONE) return; - phrasetable_lookup(table->right,sentence,false,lincats,callback,err); + phrasetable_lookup(table->right,sentence,false,scanner,err); if (err->type != PGF_EXN_NONE) return; } @@ -534,18 +524,66 @@ void phrasetable_lookup(PgfPhrasetable table, } struct PGF_INTERNAL_DECL PgfCohortsState { + class PgfTextSpotComparator : std::less { + public: + bool operator()(PgfTextSpot &lhs, PgfTextSpot &rhs) const + { + return lhs.pos > rhs.pos; + } + }; + PgfTextSpot spot; - Heap queue; + std::priority_queue, PgfTextSpotComparator> queue; + size_t last_pos; - size_t skip_pos; + bool skipping; const uint8_t *end; // pointer into the end of the sentence bool case_sensitive; - Namespace lincats; - PgfCohortsCallback* callback; + PgfPhraseScanner *scanner; PgfExn* err; }; +static +void finish_skipping(PgfCohortsState *state) { + if (state->skipping) { + while (!state->queue.empty()) { + PgfTextSpot spot = state->queue.top(); + if (spot.pos >= state->spot.pos) + break; + + if (spot.pos != state->last_pos) { + if (state->last_pos > 0) { + state->scanner->space(spot.pos, spot.pos, + state->err); + if (state->err->type != PGF_EXN_NONE) + return; + } + + state->scanner->start_matches(state->spot.pos, + state->err); + if (state->err->type != PGF_EXN_NONE) + return; + + state->scanner->end_matches(state->spot.pos, + state->err); + if (state->err->type != PGF_EXN_NONE) + return; + + state->last_pos = spot.pos; + } + + state->queue.pop(); + } + + state->scanner->space(state->spot.pos, state->spot.pos, + state->err); + + state->last_pos = 0; + state->skipping = false; + } +} + static void phrasetable_lookup_prefixes(PgfCohortsState *state, PgfPhrasetable table, @@ -561,38 +599,38 @@ void phrasetable_lookup_prefixes(PgfCohortsState *state, } else if (cmp > 0) { ptrdiff_t len = current.ptr - state->spot.ptr; - if (min <= len) - phrasetable_lookup_prefixes(state,table->left,min,len); + if (min <= len-1) + phrasetable_lookup_prefixes(state,table->left,min,len-1); - if (len+1 <= max) - phrasetable_lookup_prefixes(state,table->right,len+1,max); + if (len <= max) + phrasetable_lookup_prefixes(state,table->right,len,max); } else { ptrdiff_t len = current.ptr - state->spot.ptr; + finish_skipping(state); + if (state->err->type != PGF_EXN_NONE) + return; + if (min <= len) phrasetable_lookup_prefixes(state,table->left,min,len); auto backrefs = table->value.backrefs; if (len > 0 && backrefs != 0) { - if (state->skip_pos != (size_t) -1) { - state->callback->fn(state->callback, - state->skip_pos, - state->spot.pos, - state->err); - if (state->err->type != PGF_EXN_NONE) - return; - state->skip_pos = (size_t) -1; - } + if (state->last_pos != current.pos) { + if (state->last_pos > 0) { + state->scanner->end_matches(state->last_pos, + state->err); + if (state->err->type != PGF_EXN_NONE) + return; + } - if (state->last_pos > 0 && state->last_pos != current.pos) { - state->callback->fn(state->callback, - state->spot.pos, - state->last_pos, - state->err); + state->scanner->start_matches(current.pos, + state->err); if (state->err->type != PGF_EXN_NONE) return; + + state->last_pos = current.pos; } - state->last_pos = current.pos; state->queue.push(current); for (size_t i = 0; i < backrefs->len; i++) { @@ -600,17 +638,10 @@ void phrasetable_lookup_prefixes(PgfCohortsState *state, switch (ref::get_tag(backref.container)) { case PgfConcrLin::tag: { ref lin = ref::untagged(backref.container); - ref lincat = - namespace_lookup(state->lincats, &lin->absfun->type->name); - if (lin->absfun->type->hypos->len == 0 && lincat != 0) { - ref field = - *vector_elem(lincat->fields, backref.seq_index % lincat->fields->len); - - state->callback->morpho.fn(&state->callback->morpho, - &lin->absfun->name, - &(*field), - lincat->abscat->prob+lin->absfun->prob, - state->err); + if (lin->absfun->type->hypos->len == 0) { + state->scanner->match(lin, + backref.seq_index, + state->err); if (state->err->type != PGF_EXN_NONE) return; } @@ -633,8 +664,7 @@ PGF_INTERNAL void phrasetable_lookup_cohorts(PgfPhrasetable table, PgfText *sentence, bool case_sensitive, - Namespace lincats, - PgfCohortsCallback* callback, PgfExn* err) + PgfPhraseScanner *scanner, PgfExn* err) { PgfTextSpot spot; spot.pos = 0; @@ -645,15 +675,16 @@ void phrasetable_lookup_cohorts(PgfPhrasetable table, state.spot.ptr = NULL; state.queue.push(spot); state.last_pos = 0; - state.skip_pos = (size_t) -1; + state.skipping = false; state.end = (uint8_t *) &sentence->text[sentence->size]; state.case_sensitive = case_sensitive; - state.lincats = lincats; - state.callback = callback; + state.scanner = scanner; state.err = err; - while (!state.queue.is_empty()) { - PgfTextSpot spot = state.queue.pop(); + while (!state.queue.empty()) { + PgfTextSpot spot = state.queue.top(); + state.queue.pop(); + if (spot.pos != state.spot.pos) { state.spot = spot; @@ -667,36 +698,38 @@ void phrasetable_lookup_cohorts(PgfPhrasetable table, state.spot.ptr = ptr; } - state.skip_pos = (size_t) -1; + state.scanner->space(spot.pos,state.spot.pos,state.err); + if (state.err->type != PGF_EXN_NONE) + return; + while (state.spot.ptr < state.end) { phrasetable_lookup_prefixes(&state, table, 1, sentence->size); + if (state.err->type != PGF_EXN_NONE) + return; if (state.last_pos > 0) { // We found at least one match. // The last range is yet to be reported. - state.callback->fn(state.callback, - state.spot.pos, - state.last_pos, - state.err); + state.scanner->end_matches(state.last_pos, + state.err); if (state.err->type != PGF_EXN_NONE) return; state.last_pos = 0; break; } else { - // We didn't find any matches at this position, - // therefore we must skip one character and try again. - if (state.skip_pos == (size_t) -1) - state.skip_pos = state.spot.pos; + // No matches were found, try the next position + if (!state.skipping) { + while (!state.queue.empty() && + state.queue.top().pos < state.spot.pos) { + state.queue.pop(); + } + state.queue.push(state.spot); + state.skipping = true; + } + const uint8_t *ptr = state.spot.ptr; uint32_t ucs = pgf_utf8_decode(&ptr); if (pgf_utf8_is_space(ucs)) { - state.callback->fn(state.callback, - state.skip_pos, - state.spot.pos, - state.err); - if (state.err->type != PGF_EXN_NONE) - return; - state.skip_pos = -1; state.queue.push(state.spot); break; } @@ -704,16 +737,10 @@ void phrasetable_lookup_cohorts(PgfPhrasetable table, state.spot.ptr = ptr; } } - - if (state.skip_pos != (size_t) -1) { - state.callback->fn(state.callback, - state.skip_pos, - state.spot.pos, - state.err); - if (state.err->type != PGF_EXN_NONE) - return; - state.skip_pos = (size_t) -1; - } + + finish_skipping(&state); + if (state.err->type != PGF_EXN_NONE) + return; state.spot = spot; } @@ -748,10 +775,10 @@ void phrasetable_iter(PgfConcr *concr, ref lincat = namespace_lookup(concr->lincats, &lin->absfun->type->name); if (lincat != 0) { - ref field = - *vector_elem(lincat->fields, backref.seq_index % lincat->fields->len); + ref field = + vector_elem(lincat->fields, backref.seq_index % lincat->fields->len); - callback->fn(callback, &lin->absfun->name, &(*field), lincat->abscat->prob+lin->absfun->prob, err); + callback->fn(callback, &lin->absfun->name, &(*field->name), lincat->abscat->prob+lin->absfun->prob, err); if (err->type != PGF_EXN_NONE) return; } diff --git a/src/runtime/c/pgf/phrasetable.h b/src/runtime/c/pgf/phrasetable.h index 45c23ead9..39c6d5963 100644 --- a/src/runtime/c/pgf/phrasetable.h +++ b/src/runtime/c/pgf/phrasetable.h @@ -68,19 +68,27 @@ PgfPhrasetable phrasetable_delete(PgfPhrasetable table, PGF_INTERNAL_DECL size_t phrasetable_size(PgfPhrasetable table); +class PgfConcrLin; + +class PGF_INTERNAL_DECL PgfPhraseScanner { +public: + virtual void space(size_t start, size_t end, PgfExn* err)=0; + virtual void start_matches(size_t pos, PgfExn* err)=0; + virtual void match(ref lin, size_t seq_index, PgfExn* err)=0; + virtual void end_matches(size_t pos, PgfExn* err)=0; +}; + PGF_INTERNAL_DECL void phrasetable_lookup(PgfPhrasetable table, PgfText *sentence, bool case_sensitive, - Namespace lincats, - PgfMorphoCallback* callback, PgfExn* err); + PgfPhraseScanner *scanner, PgfExn* err); PGF_INTERNAL_DECL void phrasetable_lookup_cohorts(PgfPhrasetable table, PgfText *sentence, bool case_sensitive, - Namespace lincats, - PgfCohortsCallback* callback, PgfExn* err); + PgfPhraseScanner *scanner, PgfExn* err); PGF_INTERNAL_DECL void phrasetable_iter(PgfConcr *concr, diff --git a/src/runtime/c/pgf/printer.h b/src/runtime/c/pgf/printer.h index e41df556a..678720dde 100644 --- a/src/runtime/c/pgf/printer.h +++ b/src/runtime/c/pgf/printer.h @@ -33,7 +33,7 @@ public: PgfPrinter(PgfPrintContext *context, int priority, PgfMarshaller *marshaller); - PgfPrinter() { free(res); } + ~PgfPrinter() { free(res); } // Push a new variable in the printing context. If the name // collides with an existing variable, the variable is renamed @@ -52,6 +52,12 @@ public: PgfText *get_text(); + void dump() { + PgfText *text = get_text(); + fprintf(stderr, "%.*s", (int) text->size, text->text); + free(text); + }; + void hypo(PgfTypeHypo *hypo, int prio); void parg(ref ty, ref parg); diff --git a/src/runtime/c/pgf/reader.cxx b/src/runtime/c/pgf/reader.cxx index f875ba980..be71d1dec 100644 --- a/src/runtime/c/pgf/reader.cxx +++ b/src/runtime/c/pgf/reader.cxx @@ -667,7 +667,7 @@ ref PgfReader::read_lincat() { ref lincat = read_name(&PgfConcrLincat::name); lincat->abscat = namespace_lookup(abstract->cats, &lincat->name); - lincat->fields = read_vector(&PgfReader::read_text2); + lincat->fields = read_vector(&PgfReader::read_lincat_field); lincat->n_lindefs = read_len(); lincat->args = read_vector(&PgfReader::read_parg); lincat->res = read_vector(&PgfReader::read_presult2); @@ -675,6 +675,12 @@ ref PgfReader::read_lincat() return lincat; } +void PgfReader::read_lincat_field(ref field) +{ + field->name = read_text(); + field->backrefs = 0; +} + ref PgfReader::read_lin() { ref lin = read_name(&PgfConcrLin::name); @@ -682,6 +688,76 @@ ref PgfReader::read_lin() lin->args = read_vector(&PgfReader::read_parg); lin->res = read_vector(&PgfReader::read_presult2); lin->seqs = read_seq_ids(lin.tagged()); + + lin->lincat = + namespace_lookup(concrete->lincats, &lin->absfun->type->name); + if (lin->lincat == 0) + throw pgf_error("Found a lin which uses a category without a lincat"); + + ref> hypos = lin->absfun->type->hypos; + ref lincats[hypos->len]; + for (size_t d = 0; d < hypos->len; d++) { + lincats[d] = + namespace_lookup(concrete->lincats, + &vector_elem(hypos,d)->type->name); + if (lincats[d] == 0) + throw pgf_error("Found a lin which uses a category without a lincat"); + } + + size_t n_fields = lin->lincat->fields->len; + for (size_t seq_index = 0; seq_index < lin->seqs->len; seq_index++) { + ref seq = *vector_elem(lin->seqs,seq_index); + ref result = *vector_elem(lin->res, seq_index / n_fields); + + size_t dot = 0; + if (dot < seq->syms.len) { + PgfSymbol sym = *vector_elem(&seq->syms,dot); + switch (ref::get_tag(sym)) { + case PgfSymbolCat::tag: { + auto sym_cat = ref::untagged(sym); + ref lincat = lincats[sym_cat->d]; + + size_t max_values = 1; + size_t ranges[sym_cat->r.n_terms]; + for (size_t i = 0; i < sym_cat->r.n_terms; i++) { + size_t range = 1; + for (size_t j = 0; j < result->vars->len; j++) { + auto var_range = vector_elem(result->vars, j); + if (var_range->var == sym_cat->r.terms[i].var) { + range = var_range->range; + break; + } + } + + ranges[i] = range; + max_values *= range; + } + + for (size_t values = 0; values < max_values; values++) { + size_t v = values; + size_t index = sym_cat->r.i0; + for (size_t i = 0; i < sym_cat->r.n_terms; i++) { + index += sym_cat->r.terms[i].factor * (v % ranges[i]); + v = v / ranges[i]; + } + + ref> backrefs = + vector_elem(lincat->fields,index)->backrefs; + backrefs = + vector_resize(backrefs, backrefs->len+1, + PgfDB::get_txn_id()); + vector_elem(lincat->fields,index)->backrefs = backrefs; + ref backref = + vector_elem(backrefs,backrefs->len-1); + backref->lin = lin; + backref->seq_index = seq_index; + backref->dot = dot; + } + break; + } + } + } + } return lin; } diff --git a/src/runtime/c/pgf/reader.h b/src/runtime/c/pgf/reader.h index 609b9b8bc..2e853dd1d 100644 --- a/src/runtime/c/pgf/reader.h +++ b/src/runtime/c/pgf/reader.h @@ -69,6 +69,7 @@ public: void merge_abstract(ref abstract); ref read_lincat(); + void read_lincat_field(ref field); ref read_lparam(); void read_variable_range(ref var_info); void read_parg(ref parg); diff --git a/src/runtime/c/pgf/writer.cxx b/src/runtime/c/pgf/writer.cxx index 2666fae68..8b20495e0 100644 --- a/src/runtime/c/pgf/writer.cxx +++ b/src/runtime/c/pgf/writer.cxx @@ -383,13 +383,18 @@ void PgfWriter::write_phrasetable_helper(PgfPhrasetable table) void PgfWriter::write_lincat(ref lincat) { write_name(&lincat->name); - write_vector(lincat->fields, &PgfWriter::write_text); + write_vector(lincat->fields, &PgfWriter::write_lincat_field); write_len(lincat->n_lindefs); write_vector(lincat->args, &PgfWriter::write_parg); write_vector(lincat->res, &PgfWriter::write_presult); write_vector(lincat->seqs, &PgfWriter::write_seq_id); } +void PgfWriter::write_lincat_field(ref field) +{ + write_text(field->name); +} + void PgfWriter::write_lin(ref lin) { write_name(&lin->name); diff --git a/src/runtime/c/pgf/writer.h b/src/runtime/c/pgf/writer.h index 6e9b1a83a..ab7400ec8 100644 --- a/src/runtime/c/pgf/writer.h +++ b/src/runtime/c/pgf/writer.h @@ -39,6 +39,7 @@ public: void write_abstract(ref abstract); void write_lincat(ref lincat); + void write_lincat_field(ref field); void write_variable_range(ref var); void write_lparam(ref lparam); void write_parg(ref linarg); diff --git a/src/runtime/haskell/PGF2.hsc b/src/runtime/haskell/PGF2.hsc index b8098637d..34d75990e 100644 --- a/src/runtime/haskell/PGF2.hsc +++ b/src/runtime/haskell/PGF2.hsc @@ -97,7 +97,7 @@ import Foreign import Foreign.C import Control.Monad(forM,forM_) import Control.Exception(bracket,mask_,throwIO) -import System.IO.Unsafe(unsafePerformIO) +import System.IO.Unsafe(unsafePerformIO, unsafeInterleaveIO) import System.Random import qualified Data.Map as Map import Data.IORef @@ -673,7 +673,30 @@ data ParseOutput a | ParseIncomplete -- ^ The sentence is not complete. parse :: Concr -> Type -> String -> ParseOutput [(Expr,Float)] -parse lang ty sent = parseWithHeuristics lang ty sent (-1.0) [] +parse c ty sent = + unsafePerformIO $ + withForeignPtr (c_revision c) $ \c_revision -> + withForeignPtr marshaller $ \m -> + bracket (newStablePtr ty) freeStablePtr $ \c_ty -> + withText sent $ \c_sent -> do + c_enum <- withPgfExn "parse" (pgf_parse (c_db c) c_revision c_ty m c_sent) + c_fetch <- (#peek PgfExprEnumVtbl, fetch) =<< (#peek PgfExprEnum, vtbl) c_enum + exprs <- unsafeInterleaveIO (fetchLazy c_fetch c_enum) + return (ParseOk exprs) + where + fetchLazy c_fetch c_enum = + withForeignPtr (c_revision c) $ \c_revision -> + withForeignPtr unmarshaller $ \u -> + alloca $ \p_prob -> do + c_expr <- callFetch c_fetch c_enum (c_db c) u p_prob + if c_expr == castPtrToStablePtr nullPtr + then do pgf_free_expr_enum c_enum + return [] + else do expr <- deRefStablePtr c_expr + freeStablePtr c_expr + prob <- peek p_prob + rest <- unsafeInterleaveIO (fetchLazy c_fetch c_enum) + return ((expr,prob) : rest) parseWithHeuristics :: Concr -- ^ the language with which we parse -> Type -- ^ the start category diff --git a/src/runtime/haskell/PGF2/FFI.hsc b/src/runtime/haskell/PGF2/FFI.hsc index bfefaabd4..32235550e 100644 --- a/src/runtime/haskell/PGF2/FFI.hsc +++ b/src/runtime/haskell/PGF2/FFI.hsc @@ -50,6 +50,7 @@ data PgfProbsCallback data PgfMorphoCallback data PgfCohortsCallback data PgfPhrasetableIds +data PgfExprEnum type Wrapper a = a -> IO (FunPtr a) type Dynamic a = FunPtr a -> a @@ -253,6 +254,12 @@ foreign import ccall pgf_bracketed_linearize :: Ptr PgfDB -> Ptr Concr -> Stable foreign import ccall pgf_bracketed_linearize_all :: Ptr PgfDB -> Ptr Concr -> StablePtr Expr -> Ptr PgfPrintContext -> Ptr PgfMarshaller -> Ptr PgfLinearizationOutputIface -> Ptr PgfExn -> IO () +foreign import ccall pgf_parse :: Ptr PgfDB -> Ptr Concr -> StablePtr Type -> Ptr PgfMarshaller -> Ptr PgfText -> Ptr PgfExn -> IO (Ptr PgfExprEnum) + +foreign import ccall "dynamic" callFetch :: Dynamic (Ptr PgfExprEnum -> Ptr PgfDB -> Ptr PgfUnmarshaller -> Ptr (#type prob_t) -> IO (StablePtr Expr)) + +foreign import ccall pgf_free_expr_enum :: Ptr PgfExprEnum -> IO () + foreign import ccall "wrapper" wrapSymbol0 :: Wrapper (Ptr PgfLinearizationOutputIface -> IO ()) foreign import ccall "wrapper" wrapSymbol1 :: Wrapper (Ptr PgfLinearizationOutputIface -> Ptr PgfText -> IO ()) diff --git a/src/runtime/haskell/tests/basic.pmcfg b/src/runtime/haskell/tests/basic.pmcfg index 1f5c15eb6..8948602f0 100644 --- a/src/runtime/haskell/tests/basic.pmcfg +++ b/src/runtime/haskell/tests/basic.pmcfg @@ -18,42 +18,42 @@ concrete basic_cnc { lincat Float = [ "s" ] - lindef Float : String(0) -> Float(0) = [S0] - linref Float : Float(0) -> String(0) = [S0] + lindef Float(0) -> Float[String(0)] = [S0] + linref String(0) -> Float[Float(0)] = [S0] lincat Int = [ "s" ] - lindef Int : String(0) -> Int(0) = [S0] - linref Int : Int(0) -> String(0) = [S0] + lindef Int(0) -> Int[String(0)] = [S0] + linref String(0) -> Int[Int(0)] = [S0] lincat N = [ "s" ] - lindef N : String(0) -> N(0) = [S0] - linref N : ∀{i<2} . N(i) -> String(0) = [S0] + lindef N(0) -> N[String(0)] = [S0] + linref ∀{i<2} . String(0) -> N[N(i)] = [S0] lincat P = [ "s" ] - lindef P : String(0) -> P(0) = [S0] - linref P : P(0) -> String(0) = [S0] + lindef P(0) -> P[String(0)] = [S0] + linref String(0) -> P[P(0)] = [S0] lincat S = [ "" ] - lindef S : String(0) -> S(0) = [S0] - linref S : S(0) -> String(0) = [S0] + lindef S(0) -> S[String(0)] = [S0] + linref String(0) -> S[S(0)] = [S0] lincat String = [ "s" ] - lindef String : String(0) -> String(0) = [S0] - linref String : String(0) -> String(0) = [S0] - lin c : ∀{i<2} . N(i) -> S(0) = [S0] - lin floatLit : Float(0) -> S(0) = [S0] - lin ind : ∀{i<2} . P(0) * P(0) * N(i) -> P(0) = [S1] - lin intLit : Int(0) -> S(0) = [S0] - lin nat : ∀{i<2} . N(i) -> P(0) = [S5] - lin s : N(0) -> N(0) = [S2] - lin s : N(1) -> N(0) = [S4] - lin stringLit : String(0) -> S(0) = [S0] - lin z : N(1) = [S3] + lindef String(0) -> String[String(0)] = [S0] + linref String(0) -> String[String(0)] = [S0] + lin ∀{i<2} . S(0) -> c[N(i)] = [S0] + lin S(0) -> floatLit[Float(0)] = [S0] + lin ∀{i<2} . P(0) -> ind[P(0),P(0),N(i)] = [S1] + lin S(0) -> intLit[Int(0)] = [S0] + lin ∀{i<2} . P(0) -> nat[N(i)] = [S5] + lin N(0) -> s[N(0)] = [S2] + lin N(0) -> s[N(1)] = [S4] + lin S(0) -> stringLit[String(0)] = [S0] + lin N(1) -> z[] = [S3] sequences { S0 = <0,0> S1 = <0,0> "&" "λ" SOFT_BIND <1,$0> SOFT_BIND "," SOFT_BIND <1,$1> "." <1,0>