first rudimentary version of a parser

This commit is contained in:
Krasimir Angelov
2022-09-16 12:34:46 +02:00
parent bcb1076dda
commit 3e0cc91a02
24 changed files with 1009 additions and 477 deletions

View File

@@ -1,3 +0,0 @@
packages: src/runtime/haskell
src/server
src/compiler

View File

@@ -168,11 +168,9 @@ ppPmcfgRule id arg_cats res_cat (Production vars args res seqids) =
(if null vars
then empty
else "∀{" <> hsep (punctuate ',' [ppLVar v <> '<' <> m | (v,m) <- vars]) <> '}' <+> '.') <+>
(if null args
then empty
else hsep (intersperse (pp '*') (zipWith ppPArg arg_cats args)) <+> "->") <+>
ppPmcfgCat res_cat res $$
'=' <+> brackets (hcat (intersperse (pp ',') (map ppSeqId seqids))))
ppPmcfgCat res_cat res <+> "->" <+>
brackets (hcat (intersperse (pp ',') (zipWith ppPArg arg_cats args))) <+> '=' <+>
brackets (hcat (intersperse (pp ',') (map ppSeqId seqids))))
ppPArg cat (PArg _ p) = ppPmcfgCat cat p

View File

@@ -1,211 +1,4 @@
import Distribution.System(Platform(..),OS(..))
import Distribution.Simple(defaultMainWithHooks,UserHooks(..),simpleUserHooks)
import Distribution.Simple.LocalBuildInfo(LocalBuildInfo(..),absoluteInstallDirs,datadir,buildDir)
import Distribution.Simple.Setup(BuildFlags(..),Flag(..),InstallFlags(..),CopyDest(..),CopyFlags(..),SDistFlags(..),copyDest)
import Distribution.PackageDescription(PackageDescription(..),emptyHookedBuildInfo)
import Distribution.Simple.BuildPaths(exeExtension)
import System.FilePath((</>),(<.>),dropExtension)
import System.Directory(createDirectoryIfMissing,copyFile,doesDirectoryExist,doesFileExist)
import System.Process(rawSystem)
import System.Exit(ExitCode(..))
-- | Print a one-line notice that the RGL is no longer built as part of GF,
-- pointing the user at the separate gf-rgl repository.
noRGLmsg :: IO ()
noRGLmsg = putStrLn "Notice: the RGL is not built as part of GF anymore. See https://github.com/GrammaticalFramework/gf-rgl"
import Distribution.Simple(defaultMain)
-- | Entry point of the custom Setup script.
--
-- Runs Cabal's default driver with the simple user hooks, overriding the
-- pre/post build, install and copy hooks so that the example grammars are
-- built and installed and the data directory is recorded in 'dataDirFile'.
main :: IO ()
main = defaultMainWithHooks hooks
  where
    hooks = simpleUserHooks
      { preBuild  = \args -> gfPre args . buildDistPref
      , postBuild = gfPostBuild
      , preInst   = \args -> gfPre args . installDistPref
      , postInst  = gfPostInst
      , postCopy  = gfPostCopy
      }

    -- Nothing to prepare: hand back an empty hooked build info.
    gfPre _args _distFlag =
      return emptyHookedBuildInfo

    gfPostBuild _args flags pkg lbi =
      -- noRGLmsg
      buildWeb (default_gf lbi) flags (pkg,lbi)

    gfPostInst args flags pkg lbi = do
      -- noRGLmsg
      saveInstallPath args flags (pkg,lbi)
      installWeb (pkg,lbi)

    gfPostCopy args flags pkg lbi = do
      -- noRGLmsg
      saveCopyPath args flags (pkg,lbi)
      copyWeb flags (pkg,lbi)

    -- `cabal sdist` will not make a proper dist archive, for that see `make sdist`
    -- However this function should exit quietly to allow building gf in sandbox
    gfSDist _pkg _lbi _hooks _flags =
      return ()
-- | Write the absolute data directory of a plain install (no copy
-- destination) into 'dataDirFile'. The argument list and the install flags
-- are not consulted.
saveInstallPath :: [String] -> InstallFlags -> (PackageDescription, LocalBuildInfo) -> IO ()
saveInstallPath _args _flags bi =
  writeFile dataDirFile installDataDir
  where
    installDirs    = uncurry absoluteInstallDirs bi NoCopyDest
    installDataDir = datadir installDirs
-- | Write the absolute data directory used by a @copy@ step into
-- 'dataDirFile'. The copy destination comes from the copy flags and falls
-- back to 'NoCopyDest' when the flag is unset.
saveCopyPath :: [String] -> CopyFlags -> (PackageDescription, LocalBuildInfo) -> IO ()
saveCopyPath _args flags bi =
  writeFile dataDirFile (datadir (uncurry absoluteInstallDirs bi target))
  where
    target =
      case copyDest flags of
        Flag d -> d
        NoFlag -> NoCopyDest
-- | Name of the file in which the installation's data directory is recorded.
-- This is a last-resort way in which the separate RGL build script
-- can determine where to put the compiled RGL files.
dataDirFile :: String
dataDirFile = "DATA_DIR"
-- | Path of the locally built @gf@ executable inside the build directory:
-- @\<buildDir\>/gf/gf@, with an @exe@ extension when the host OS is Windows.
default_gf :: LocalBuildInfo -> FilePath
default_gf lbi = buildDir lbi </> progName </> (progName <.> suffix)
  where
    progName = "gf"
    -- Computed locally rather than via Distribution.Simple.BuildPaths.exeExtension,
    -- whose type signature changed in Cabal 2.4.
    suffix =
      case hostPlatform lbi of
        Platform _ Windows -> "exe"
        _                  -> ""
{-
To test the GF web services, the minibar and the grammar editor, use
"cabal install" (or "runhaskell Setup.hs install") to install gf as usual.
Then start the server with the command "gf -server" and open
http://localhost:41296/ in your web browser (Firefox, Safari, Opera or
Chrome). The example grammars listed below will be available in the minibar.
-}
{-
Update 2018-07-04
The example grammars have now been removed from the GF repository.
This script will look for them in ../gf-contrib and build them from there if possible.
If not, the user will be given a message and nothing is built or copied.
(Unfortunately cabal install seems to hide all messages from stdout,
so users won't see this message unless they check the log.)
-}
-- | Print a three-line notice explaining that the example grammars now live
-- in the gf-contrib repository and where to clone it so they get built.
noContribMsg :: IO ()
noContribMsg = putStr $ unlines
[ "Example grammars are no longer included in the main GF repository, but have moved to gf-contrib."
, "If you want them to be built, clone the following repository in the same directory as gf-core:"
, "https://github.com/GrammaticalFramework/gf-contrib.git"
]
-- | The example grammars to build, as
-- @(output PGF file, sub-directory, source modules)@ triples.
-- Source module names are @\<prefix\>\<language code\>.gf@.
example_grammars :: [(String, String, [String])] -- [(pgf, subdir, source modules)]
example_grammars =
  [ ("Letter.pgf", "letter", sources "Letter" "Eng Fin Fre Heb Rus Swe")
  , ("Foods.pgf", "foods", sources "Foods" "Afr Amh Bul Cat Cze Dut Eng Epo Fin Fre Ger Gle Heb Hin Ice Ita Jpn Lav Mlt Mon Nep Pes Por Ron Spa Swe Tha Tsn Tur Urd")
  , ("Phrasebook.pgf", "phrasebook", sources "Phrasebook" "Bul Cat Chi Dan Dut Eng Lav Hin Nor Spa Swe Tha") -- only fastish languages
  ]
  where
    -- One source file name per whitespace-separated language code.
    sources :: String -> String -> [String]
    sources prefix = map (\lang -> prefix ++ lang ++ ".gf") . words
-- | Relative path where the gf-contrib checkout is looked for: a sibling
-- directory named @gf-contrib@ next to the current directory.
contrib_dir :: FilePath
contrib_dir = ".."</>"gf-contrib"
-- | Compile every entry of 'example_grammars' with the given @gf@ executable,
-- provided the 'contrib_dir' directory exists; otherwise do nothing.
-- The 'Bool' results of the individual builds are discarded.
buildWeb :: String -> BuildFlags -> (PackageDescription, LocalBuildInfo) -> IO ()
buildWeb gf flags (pkg,lbi) = do
    haveContrib <- doesDirectoryExist contrib_dir
    if haveContrib
      then mapM_ build_pgf example_grammars
      -- else noContribMsg
      else return ()
  where
    gfo_dir = buildDir lbi </> "examples"

    -- Build one PGF from its sources, keeping object files in a per-grammar directory.
    build_pgf :: (String, String, [String]) -> IO Bool
    build_pgf (pgf,subdir,src) = do
        createDirectoryIfMissing True tmp_dir
        putStrLn $ "Building "++pgf
        execute gf gfArgs
      where
        tmp_dir     = gfo_dir </> subdir
        srcDir      = contrib_dir </> subdir
        dest        = NoCopyDest
        -- Only referenced by the disabled --gf-lib-path option below.
        gf_lib_path = datadir (absoluteInstallDirs pkg lbi dest) </> "lib"
        gfArgs      = concat
          [ numJobs flags
          , ["-make","-s"] -- ,"-optimize-pgf"
          , [ "--gfo-dir="++tmp_dir
            --, "--gf-lib-path="++gf_lib_path
            , "--name="++dropExtension pgf
            , "--output-dir="++gfo_dir
            ]
          , map (srcDir </>) src
          ]
-- | Install the web data for a plain install: 'setupWeb' with 'NoCopyDest'.
installWeb :: (PackageDescription, LocalBuildInfo) -> IO ()
installWeb = setupWeb NoCopyDest
-- | Install the web data for a @copy@ step: 'setupWeb' with the destination
-- taken from the copy flags, or 'NoCopyDest' when the flag is unset.
copyWeb :: CopyFlags -> (PackageDescription, LocalBuildInfo) -> IO ()
copyWeb flags =
  setupWeb $ case copyDest flags of
    Flag d -> d
    NoFlag -> NoCopyDest
-- | Create the @www/grammars@ and @www/tmp@ directories under the data
-- directory and, if 'contrib_dir' exists, copy each built example PGF into
-- @www/grammars@. Grammars whose PGF file is missing are reported and skipped.
setupWeb :: CopyDest -> (PackageDescription, LocalBuildInfo) -> IO ()
setupWeb dest (pkg,lbi) = do
    mapM_ (createDirectoryIfMissing True) [grammars_dir,cloud_dir]
    haveContrib <- doesDirectoryExist contrib_dir
    if haveContrib
      then mapM_ copy_pgf example_grammars
      else return () -- message already displayed from buildWeb
  where
    www_dir      = datadir (absoluteInstallDirs pkg lbi dest) </> "www"
    grammars_dir = www_dir </> "grammars"
    cloud_dir    = www_dir </> "tmp" -- hmm
    gfo_dir      = buildDir lbi </> "examples"

    copy_pgf :: (String, String, [String]) -> IO ()
    copy_pgf (pgf,_,_) = do
        present <- doesFileExist from
        if present
          then do putStrLn $ "Installing "++to
                  copyFile from to
          else putStrLn $ "Not installing "++to
      where
        from = gfo_dir </> pgf
        to   = grammars_dir </> pgf
-- | Run a system command with the given arguments.
--
-- Returns 'True' when the command exits successfully. On failure the command
-- line (arguments containing a space are shown in single quotes) and the exit
-- code are printed, and 'False' is returned.
execute :: String -> [String] -> IO Bool
execute command args = do
    status <- rawSystem command args
    case status of
      ExitSuccess      -> return True
      ExitFailure code -> do
        putStrLn $ "Ran: " ++ shownCommand
        putStrLn $ command++" exited with exit code: " ++ show code
        return False
  where
    shownCommand = command ++ " " ++ unwords (map quoted args)
    quoted arg
      | ' ' `elem` arg = "'" ++ arg ++ "'"
      | otherwise      = arg
-- | Command-line options that enable parallel compilation of the RGL and
-- example grammars.
--
-- When the build flags carry an explicit job count other than 1, that count
-- is passed to both @-j@ and the RTS @-N@ option; in every other case the
-- options are passed without a count.
numJobs :: BuildFlags -> [String]
numJobs flags =
  case jobCount of
    "" -> ["-j","+RTS","-A20M","-N","-RTS"]
    n  -> ["-j="++n,"+RTS","-A20M","-N"++n,"-RTS"]
  where
    -- buildNumJobs is only available in Cabal>=1.20
    jobCount =
      case buildNumJobs flags of
        Flag (Just k) | k /= 1 -> show k
        _                      -> ""
main = defaultMain

View File

@@ -2,7 +2,7 @@ name: gf
version: 3.11.0-git
cabal-version: 1.22
build-type: Custom
build-type: Simple
license: OtherLicense
license-file: LICENSE
category: Natural Language Processing, Compiler
@@ -39,14 +39,6 @@ data-files:
www/translator/*.css
www/translator/*.js
custom-setup
setup-depends:
base >= 4.9.1,
Cabal >= 1.22.0.0,
directory >= 1.3.0 && < 1.4,
filepath >= 1.4.1 && < 1.5,
process >= 1.0.1.1 && < 1.7
source-repository head
type: git
location: https://github.com/GrammaticalFramework/gf-core.git
@@ -192,7 +184,7 @@ executable gf
GF.Text.Lexing
GF.Text.Transliterations
Paths_gf
-- not really part of GF but I have changed the original binary library
-- and we have to keep the copy for now.
Data.Binary

View File

@@ -26,6 +26,8 @@ libpgf_la_SOURCES = \
pgf/typechecker.h \
pgf/linearizer.cxx \
pgf/linearizer.h \
pgf/parser.cxx \
pgf/parser.h \
pgf/graphviz.cxx \
pgf/graphviz.h \
pgf/data.cxx \

View File

@@ -47,9 +47,9 @@ void PgfConcr::release(ref<PgfConcr> concr)
void PgfConcrLincat::release(ref<PgfConcrLincat> lincat)
{
for (size_t i = 0; i < lincat->fields->len; i++) {
text_db_release(*vector_elem(lincat->fields, i));
PgfLincatField::release(vector_elem(lincat->fields, i));
}
Vector<ref<PgfText>>::release(lincat->fields);
Vector<PgfLincatField>::release(lincat->fields);
for (size_t i = 0; i < lincat->args->len; i++) {
PgfLParam::release(vector_elem(lincat->args, i)->param);
@@ -66,6 +66,13 @@ void PgfConcrLincat::release(ref<PgfConcrLincat> lincat)
PgfDB::free(lincat, lincat->name.size+1);
}
// Releases what a lincat field owns: its name text and, when one has been
// attached (non-zero ref), its vector of back references.
void PgfLincatField::release(ref<PgfLincatField> field)
{
    text_db_release(field->name);

    if (field->backrefs == 0)
        return;

    Vector<PgfLincatBackref>::release(field->backrefs);
}
void PgfLParam::release(ref<PgfLParam> param)
{
PgfDB::free(param, param->n_terms*sizeof(param->terms[0]));

View File

@@ -209,17 +209,25 @@ struct PGF_INTERNAL_DECL PgfSymbolALLCAPIT {
static const uint8_t tag = 10;
};
struct PGF_INTERNAL_DECL PgfLincatBackref;
// One field of a linearization category.
struct PGF_INTERNAL_DECL PgfLincatField {
// Name of the field.
ref<PgfText> name;
// Back references attached to this field; 0 when there are none
// (release() skips the vector in that case).
ref<Vector<PgfLincatBackref>> backrefs;
// Releases the name and, if present, the backrefs vector.
static void release(ref<PgfLincatField> field);
};
struct PGF_INTERNAL_DECL PgfConcrLincat {
static const uint8_t tag = 0;
ref<PgfAbsCat> abscat;
ref<Vector<ref<PgfText>>> fields;
size_t n_lindefs;
ref<Vector<PgfPArg>> args;
ref<Vector<ref<PgfPResult>>> res;
ref<Vector<ref<PgfSequence>>> seqs;
ref<Vector<PgfLincatField>> fields;
PgfText name;
@@ -230,6 +238,7 @@ struct PGF_INTERNAL_DECL PgfConcrLin {
static const uint8_t tag = 1;
ref<PgfAbsFun> absfun;
ref<PgfConcrLincat> lincat;
ref<Vector<PgfPArg>> args;
ref<Vector<ref<PgfPResult>>> res;
@@ -240,6 +249,12 @@ struct PGF_INTERNAL_DECL PgfConcrLin {
static void release(ref<PgfConcrLin> lin);
};
// A position inside one sequence of a concrete linearization.
struct PGF_INTERNAL_DECL PgfLincatBackref {
// The linearization that owns the sequence.
ref<PgfConcrLin> lin;
// Index into lin->seqs.
size_t seq_index;
// Index of a symbol within that sequence.
size_t dot;
};
struct PGF_INTERNAL_DECL PgfConcrPrintname {
ref<PgfText> printname;
PgfText name;

View File

@@ -1,79 +0,0 @@
#ifndef HEAP_H
#define HEAP_H
// Binary min-heap stored in a malloc-managed array. Elements are compared
// with operator>=, so the smallest element sits at index 0.
// Elements are moved with plain assignment and the storage is grown with
// realloc/free.
// NOTE(review): top() and pop() do not check for an empty heap; callers
// must test is_empty() first.
template <class A>
class PGF_INTERNAL_DECL Heap {
public:
// Creates an empty heap with no storage allocated.
Heap() {
len = 0;
avail = 0;
values = NULL;
}
~Heap() { free(values); }
// Inserts a value, growing the array first when it is full.
// Throws pgf_systemerror(errno) if the reallocation fails; the heap is
// left unchanged in that case.
void push(A value) {
if (len >= avail) {
avail = get_next_padovan(len+1);
A *new_values = (A *) realloc(values, sizeof(A)*avail);
if (new_values == NULL)
throw pgf_systemerror(errno);
values = new_values;
}
// Place the new value at the end and bubble it up towards the root.
siftdown(value, 0, len);
len++;
}
bool is_empty() { return (len == 0); }
// Returns a copy of the smallest element without removing it.
A top() { return values[0]; }
// Removes and returns the smallest element.
A pop() {
A top = values[0];
// Re-insert the last element starting from the root; len is only
// decremented afterwards, so the last slot is still part of the range.
siftup(&values[len-1],0);
len--;
return top;
}
private:
size_t len;    // number of elements stored
size_t avail;  // number of elements the array can hold
A *values;
// Moves `value` from position `pos` towards `startpos`, shifting down every
// parent that is strictly greater than it, then stores it.
void siftdown(A value, size_t startpos, size_t pos) {
while (pos > startpos) {
size_t parentpos = (pos - 1) >> 1;
A parent = values[parentpos];
if (value >= parent)
break;
values[pos] = parent;
pos = parentpos;
}
values[pos] = value;
}
// Starting at `pos`, repeatedly pulls the smaller child up until a leaf is
// reached, then inserts *pvalue at that leaf via siftdown.
void siftup(A *pvalue, size_t pos) {
size_t startpos = pos;
size_t endpos = len;
size_t childpos = 2*pos + 1;
while (childpos < endpos) {
size_t rightpos = childpos + 1;
// Prefer the right child when it is not greater than the left one.
if (rightpos < endpos &&
values[childpos] >= values[rightpos]) {
childpos = rightpos;
}
values[pos] = values[childpos];
pos = childpos;
childpos = 2*pos + 1;
}
siftdown(*pvalue, startpos, pos);
}
};
#endif

View File

@@ -287,11 +287,7 @@ void PgfLinearizer::TreeLinNode::check_category(PgfLinearizer *linearizer, PgfTe
void PgfLinearizer::TreeLinNode::linearize(PgfLinearizationOutputIface *out, PgfLinearizer *linearizer, size_t lindex)
{
PgfText *cat = &lin->absfun->type->name;
PgfText *field = NULL;
ref<PgfConcrLincat> lincat = namespace_lookup(linearizer->concr->lincats, cat);
if (lincat != 0) {
field = &(**vector_elem(lincat->fields, lindex));
}
PgfText *field = &*(vector_elem(lin->lincat->fields, lindex)->name);
if (linearizer->pre_stack == NULL)
out->begin_phrase(cat, fid, field, &lin->name);
@@ -393,7 +389,7 @@ void PgfLinearizer::TreeLindefNode::linearize_arg(PgfLinearizationOutputIface *o
void PgfLinearizer::TreeLindefNode::linearize(PgfLinearizationOutputIface *out, PgfLinearizer *linearizer, size_t lindex)
{
if (lincat != 0) {
PgfText *field = &(**vector_elem(lincat->fields, lindex));
PgfText *field = &*(vector_elem(lincat->fields, lindex)->name);
if (linearizer->pre_stack == NULL)
out->begin_phrase(&lincat->name, fid, field, linearizer->wild);
else {
@@ -546,7 +542,7 @@ void PgfLinearizer::TreeLitNode::linearize(PgfLinearizationOutputIface *out, Pgf
{
PgfText *field = NULL;
if (lincat != 0) {
field = &(**vector_elem(lincat->fields, lindex));
field = &*(vector_elem(lincat->fields, lindex)->name);
}
linearizer->flush_pre_stack(out, literal);

View File

@@ -98,6 +98,14 @@ class PGF_INTERNAL_DECL PgfLinearizer : public PgfUnmarshaller {
~TreeLitNode() { free(literal); };
};
// Tree node kind declared next to the other TreeNode subclasses. It overrides
// resolve, check_category, linearize and get_lincat; the definitions are not
// part of this declaration.
struct TreeChunksNode : public TreeNode {
TreeChunksNode(PgfLinearizer *linearizer);
virtual bool resolve(PgfLinearizer *linearizer);
virtual void check_category(PgfLinearizer *linearizer, PgfText *cat);
virtual void linearize(PgfLinearizationOutputIface *out, PgfLinearizer *linearizer, size_t lindex);
virtual ref<PgfConcrLincat> get_lincat(PgfLinearizer *linearizer);
};
TreeNode *prev;
TreeNode *next;
TreeNode *args;

View File

@@ -0,0 +1,455 @@
#include "data.h"
#include "printer.h"
#include "parser.h"
#include <errno.h>
#include <type_traits>
#include <map>
#include <vector>
#include <queue>
// #define PARSER_DEBUG
// Key identifying a context-free category: a lincat field paired with a
// value. Used as the key type of State::contss.
class PGF_INTERNAL_DECL PgfParser::CFGCat {
public:
    ref<PgfLincatField> field;
    size_t value;

    // Ordering for use as a std::map key: lexicographic on (field, value).
    bool operator<(const CFGCat& other) const
    {
        if (field == other.field)
            return value < other.value;

        return field < other.field;
    }
};
struct PGF_INTERNAL_DECL PgfParser::Choice {
size_t id;
std::vector<Production*> prods;
Choice(size_t id) {
this->id = id;
}
};
// A production for one sequence of a concrete linearization. The object is
// allocated with malloc and carries one Choice* slot per abstract argument
// in the trailing `args` array.
class PGF_INTERNAL_DECL PgfParser::Production
{
public:
    // Creates a production for (lin, seq_index) with all argument slots
    // empty and appends it to choice->prods.
    // Throws pgf_systemerror(ENOMEM) when the allocation fails.
    static
    void predict(Choice *choice, ref<PgfConcrLin> lin, size_t seq_index)
    {
        size_t n_args = lin->absfun->type->hypos->len;

        Production *prod = (Production*)
            malloc(sizeof(Production)+sizeof(Choice*)*n_args);
        // The fields are written immediately below, so a failed allocation
        // must not be allowed through.
        if (prod == NULL)
            throw pgf_systemerror(ENOMEM);

        prod->lin = lin;
        prod->seq_index = seq_index;
        memset(prod->args, 0, sizeof(Choice*)*n_args);

        prod->log(choice);
        choice->prods.push_back(prod);
    }

    // Debug trace of the production as "?<id> = <fun> <args...>".
    // Compiled to nothing unless PARSER_DEBUG is defined.
    void log(Choice *res) {
#ifdef PARSER_DEBUG
        PgfPrinter printer(NULL,0,NULL);
        // ids are size_t, hence %zu
        printer.nprintf(10, "?%zu = ", res->id);
        printer.puts(&lin->name);

        auto hypos = lin->absfun->type->hypos;
        for (size_t i = 0; i < hypos->len; i++) {
            if (args[i] == NULL)
                printer.efun(&hypos->data[i].type->name);
            else
                printer.nprintf(10, " ?%zu", args[i]->id);
        }
        printer.puts("\n");
        printer.dump();
#endif
    }

    ref<PgfConcrLin> lin;
    size_t seq_index;
    Choice *args[];   // one slot per hypothesis of lin's abstract type
};
// Continuation record stored per CFGCat in State::contss.
struct PGF_INTERNAL_DECL PgfParser::ItemConts {
// The state whose get_conts() created this record.
State *state;
// NOTE(review): nothing in this file adds to `items` yet; presumably the
// items waiting on this category — confirm once Item::combine is wired in.
std::vector<Item> items;
};
// A partially recognised sequence of a concrete linearization: the sequence
// (lin, seq_index), the position `dot` within it, and one Choice* slot per
// abstract argument in the trailing `args` array.
class PGF_INTERNAL_DECL PgfParser::Item
{
public:
    // Advances over the symbol at backref->dot using `choice` as its argument.
    // If symbols remain after the dot, a new item with the dot moved one
    // step is built; otherwise the sequence is finished and a production is
    // added to `choice`.
    // Throws pgf_systemerror(ENOMEM) when an allocation fails.
    // NOTE(review): the only call site (in PgfParser::match) is currently
    // commented out, and `state` is not used yet.
    static
    void combine(State *state, PgfLincatBackref *backref, Choice *choice)
    {
        ref<PgfSequence> seq =
            *vector_elem(backref->lin->seqs, backref->seq_index);

        size_t index = backref->seq_index % backref->lin->lincat->fields->len;
        ref<PgfLincatField> field = vector_elem(backref->lin->lincat->fields, index);
        // state->get_conts(field, 0);

        if (backref->dot+1 < seq->syms.len) {
            size_t n_args = backref->lin->absfun->type->hypos->len;

            Item *item = (Item*)
                malloc(sizeof(Item)+sizeof(Choice*)*n_args);
            if (item == NULL)
                throw pgf_systemerror(ENOMEM);

            item->conts = NULL;   // not linked to a continuation yet
            item->lin = backref->lin;
            item->seq_index = backref->seq_index;
            item->dot = backref->dot+1;
            memset(item->args, 0, sizeof(Choice*)*n_args);

            // `seq` above is the same sequence the item refers to.
            PgfSymbol sym = seq->syms.data[backref->dot];
            ref<PgfSymbolCat> symcat = ref<PgfSymbolCat>::untagged(sym);
            item->args[symcat->d] = choice;

            item->log();

            // Nothing stores the item yet, so it would be unreachable after
            // this point; release it until an agenda takes ownership.
            free(item);
        } else {
            Production::predict(choice, backref->lin, backref->seq_index);
        }
    }

    // Builds a Production with the same lin, seq_index and argument slots
    // as this item. The caller owns the malloc'ed result.
    // Throws pgf_systemerror(ENOMEM) when the allocation fails.
    Production *complete()
    {
        size_t n_args = lin->absfun->type->hypos->len;

        Production *prod = (Production*)
            malloc(sizeof(Production)+sizeof(Choice*)*n_args);
        if (prod == NULL)
            throw pgf_systemerror(ENOMEM);

        prod->lin = lin;
        prod->seq_index = seq_index;
        memcpy(prod->args, args, sizeof(Choice*)*n_args);

        return prod;
    }

    // Debug trace of the item: result category, function, arguments, field
    // index and the sequence with the dot position marked.
    // Compiled to nothing unless PARSER_DEBUG is defined.
    void log() {
#ifdef PARSER_DEBUG
        PgfPrinter printer(NULL,0,NULL);

        size_t index = seq_index / lin->lincat->fields->len;
        ref<PgfPResult> res = *vector_elem(lin->res, index);
        ref<PgfDTyp> ty = lin->absfun->type;

        if (res->vars != 0) {
            printer.lvar_ranges(res->vars);
            printer.puts(" . ");
        }

        printer.efun(&ty->name);
        printer.puts("(");
        printer.lparam(ref<PgfLParam>::from_ptr(&res->param));
        printer.puts(") -> ");

        printer.efun(&lin->name);
        printer.puts("[");
        size_t n_args = lin->args->len / lin->res->len;
        for (size_t i = 0; i < n_args; i++) {
            if (i > 0)
                printer.puts(",");

            if (args[i] == NULL)
                printer.parg(vector_elem(ty->hypos, i)->type,
                             vector_elem(lin->args, index*n_args + i));
            else
                printer.nprintf(10, "?%zu", args[i]->id);
        }

        // indices are size_t, hence %zu
        printer.nprintf(10, "]; %zu : ", seq_index % lin->lincat->fields->len);
        ref<PgfSequence> seq = *vector_elem(lin->seqs, seq_index);
        for (size_t i = 0; i < seq->syms.len; i++) {
            if (i > 0)
                printer.puts(" ");
            if (i == dot)
                printer.puts(". ");
            printer.symbol(*vector_elem(&seq->syms, i));
        }
        printer.puts("\n");

        printer.dump();
#endif
    }

private:
    ItemConts *conts;
    ref<PgfConcrLin> lin;
    size_t seq_index;
    size_t dot;
    Choice *args[];   // one slot per hypothesis of lin's abstract type
};
// One node of the doubly linked list of parse states. Each state records a
// [start, end] pair of sentence offsets plus the continuations, choices and
// pending results attached to it.
class PGF_INTERNAL_DECL PgfParser::State
{
public:
    // Returns the continuation record for (field, value), creating and
    // registering a new one owned by this state on first use.
    ItemConts *get_conts(ref<PgfLincatField> field, size_t value)
    {
        CFGCat key = {field, value};

        auto found = contss.find(key);
        if (found != contss.end())
            return found->second;

        ItemConts *fresh = new ItemConts();
        fresh->state = this;
        contss.insert(std::pair<CFGCat,ItemConts*>(key, fresh));
        return fresh;
    }

public:
    size_t start, end;
    State *prev, *next;

    std::map<CFGCat,ItemConts*> contss;
    std::map<ItemConts*,Choice*> choices;
    std::priority_queue<PgfParser::Result*,std::vector<PgfParser::Result*>,PgfParser::ResultComparator> queue;
};
// Result built from a single production. Its probability is the sum of the
// function's probability and the probability of the function's category.
class PgfParser::ResultExpr : public Result
{
public:
    ResultExpr(Production *prod)
      : inside_prob(prod->lin->absfun->prob),
        outside_prob(prod->lin->lincat->abscat->prob),
        prod(prod),
        arg_index(0)
    {
    }

    virtual prob_t prob()
    {
        return inside_prob+outside_prob;
    }

    // Builds only the function constant named after the production's lin.
    virtual PgfExpr expr(PgfUnmarshaller *u)
    {
        return u->efun(&prod->lin->name);
    }

    // Intentionally does nothing for now.
    virtual void proceed(PgfParser *parser, PgfUnmarshaller *u)
    {
    }

private:
    prob_t inside_prob;
    prob_t outside_prob;

    Production *prod;
    size_t arg_index;
};
// Result that represents the sentence as a meta variable applied to a chain
// of arguments. Each ResultMeta holds one argument and points through `next`
// to the result it extends; the chain ends at a node whose `arg` is 0.
class PgfParser::ResultMeta : public Result
{
public:
    // `prob` is the cost of `arg`; it is added to the accumulated cost of
    // `next` (0 when there is no next).
    ResultMeta(State *state,
               PgfExpr arg, prob_t prob,
               ResultMeta *next)
    {
        this->inside_prob = prob + (next ? next->inside_prob : 0);
        this->state = state;
        this->arg = arg;
        this->next = next;
    }

    virtual prob_t prob()
    {
        return inside_prob;
    }

    // Builds `?0 arg1 arg2 ...` by walking the chain from this node until a
    // node without an argument (or the end of the chain) is reached.
    virtual PgfExpr expr(PgfUnmarshaller *u)
    {
        PgfExpr expr = u->emeta(0);
        for (ResultMeta *res = this;
             res != NULL && res->arg != 0;
             res = res->next) {
            PgfExpr expr1 = u->eapp(expr, res->arg);
            u->free_ref(expr);   // the application holds what it needs
            expr = expr1;
        }
        return expr;
    }

    // Extends this result one step towards the start of the sentence and
    // pushes the extended results on the queue of the state they reach.
    virtual void proceed(PgfParser *parser, PgfUnmarshaller *u)
    {
        if (state->choices.size() == 0) {
            // Nothing was recognised ending in this state: go back to the
            // nearest earlier state that has choices (or to the first
            // state) and wrap the skipped text in a string literal.
            State *prev = state;
            while (prev->prev != NULL && prev->choices.size() == 0) {
                prev = prev->prev;
            }

            size_t size = state->start-prev->end;
            PgfText *token = (PgfText *) alloca(sizeof(PgfText)+size+1);
            token->size = size;
            memcpy(token->text,parser->sentence->text+prev->end,size);
            token->text[size] = 0;

            // Release the literal once it has been wrapped, mirroring how
            // expr() releases the pieces it has combined.
            PgfLiteral lit = u->lstr(token);
            PgfExpr expr = u->elit(lit);
            u->free_ref(lit);

            prev->queue.push(new ResultMeta(prev,
                                            expr, 0,
                                            this));
        } else {
            // One extended result per production of every choice that ends
            // in this state, queued on the state where that choice started.
            for (auto &it : state->choices) {
                ItemConts *conts = it.first;
                Choice *choice = it.second;
                for (Production *prod : choice->prods) {
                    PgfExpr expr = u->efun(&prod->lin->name);
                    prob_t prob = prod->lin->absfun->prob +
                                  prod->lin->lincat->abscat->prob;
                    conts->state->queue.push(new ResultMeta(conts->state,
                                                            expr, prob,
                                                            this));
                }
            }
        }
    }

private:
    prob_t inside_prob;
    State *state;
    PgfExpr arg;
    ResultMeta *next;
};
// Creates a parser for `sentence` with the given start category. The
// sentence is duplicated with textdup and released again in the destructor;
// no states exist yet.
PgfParser::PgfParser(ref<PgfConcrLincat> start, PgfText *sentence)
  : start(start),
    sentence(textdup(sentence)),
    last_choice_id(0),
    before(NULL),
    after(NULL),
    fetch_state(NULL)
{
}
// Scanner callback for a stretch of white space [start, end).
// Makes `before` the state that begins at `start` (creating and linking a
// new one when none exists) and records `end` as the state's end offset.
// When the white space reaches the end of the sentence, that state becomes
// the one results are fetched from and gets an initial empty meta result.
void PgfParser::space(size_t start, size_t end, PgfExn* err)
{
    // Find the insertion point in the list, which is ordered by start offset.
    State *prev = NULL;
    State *next = before;
    while (next != NULL && next->start < start) {
        prev = next;
        next = next->next;
    }

    if (next == NULL || next->start != start) {
        before = new State();
        before->start = start;
        before->end = end;
        before->prev = prev;
        before->next = next;

        if (prev != NULL) prev->next = before;
        if (next != NULL) next->prev = before;
    } else {
        before = next;
        before->end = end;
    }

    if (end == sentence->size) {
        // Use the state positioned above rather than `after`: `after` is
        // NULL when no match has been reported yet, and otherwise only
        // coincides with this state when a match ended exactly at `start`.
        fetch_state = before;
        fetch_state->queue.push(new ResultMeta(before,0,0,NULL));
    }
}
void PgfParser::start_matches(size_t end, PgfExn* err)
{
State *prev = NULL;
State *next = before;
while (next != NULL && next->start < end) {
prev = next;
next = next->next;
}
if (next == NULL || next->start != end) {
after = new State();
after->start = end;
after->end = end;
after->prev = prev;
after->next = next;
if (prev != NULL) prev->next = after;
if (next != NULL) next->prev = after;
} else {
after = next;
}
}
// Scanner callback for one matched sequence of `lin`.
// Looks up the continuation for the sequence's lincat field in `before`,
// finds or creates the Choice registered for it in `after`, and adds a
// production for (lin, seq_index) to that choice.
void PgfParser::match(ref<PgfConcrLin> lin, size_t seq_index, PgfExn* err)
{
    size_t field_no = seq_index % lin->lincat->fields->len;
    ref<PgfLincatField> field = vector_elem(lin->lincat->fields, field_no);

    ItemConts *conts = before->get_conts(field, 0);

    Choice *choice;
    auto found = after->choices.find(conts);
    if (found != after->choices.end()) {
        choice = found->second;
    } else {
        choice = new Choice(++last_choice_id);
        after->choices.insert(std::pair<ItemConts*,Choice*>(conts, choice));
    }

    Production::predict(choice,lin,seq_index);

    /* Disabled for now: for a newly created choice, combine it with every
       back reference of the field.

    if (found == after->choices.end()) {
        for (size_t i = 0; i < field->backrefs->len; i++) {
            PgfLincatBackref *backref = vector_elem(field->backrefs, i);
            Item::combine(before, backref, choice);
        }
    }*/
}
// Scanner callback closing the matches that end at offset `end`.
// Only acts when `end` is the end of the sentence: `after` then becomes the
// state results are fetched from and gets an initial empty meta result.
void PgfParser::end_matches(size_t end, PgfExn* err)
{
    if (end != sentence->size)
        return;

    fetch_state = after;
    fetch_state->queue.push(new ResultMeta(after,0,0,NULL));
}
// Produces the next result expression, or 0 when there is none.
// On success *prob receives the result's probability value; *prob is left
// untouched when 0 is returned.
PgfExpr PgfParser::fetch(PgfDB *db, PgfUnmarshaller *u, prob_t *prob)
{
DB_scope scope(db, READER_SCOPE);
// Skip forward over states that have no pending results.
while (fetch_state != NULL && fetch_state->queue.empty()) {
fetch_state = fetch_state->next;
}
if (fetch_state == NULL) {
return 0;
}
// Walk back to the first state. In every state passed on the way, the
// best pending result (if any) is popped and extended one step via
// proceed(), which queues its successors on earlier states.
while (fetch_state->prev != NULL) {
if (!fetch_state->queue.empty()) {
Result *res = fetch_state->queue.top();
fetch_state->queue.pop();
res->proceed(this,u);
}
fetch_state = fetch_state->prev;
}
// A result that reached the first state is complete.
if (fetch_state->queue.empty()) {
return 0;
}
Result *res = fetch_state->queue.top();
fetch_state->queue.pop();
*prob = res->prob();
// NOTE(review): the popped Result objects are not deleted here.
return res->expr(u);
}
// Releases the private copy of the sentence made by the constructor.
// NOTE(review): the State list and the objects hanging off it (continuations,
// choices, productions, queued results) are not released here.
PgfParser::~PgfParser()
{
    free(sentence);
#ifdef PARSER_DEBUG
    // Trace only in debug builds; library code must not write to stdout
    // during normal operation.
    printf("~PgfParser()\n");
#endif
}

View File

@@ -0,0 +1,51 @@
#ifndef PARSER_H
#define PARSER_H
// Parser driven by the phrase scanner callbacks (space / start_matches /
// match / end_matches) that afterwards hands out results one at a time
// through fetch().
class PGF_INTERNAL_DECL PgfParser : public PgfPhraseScanner, public PgfExprEnum {
public:
    // `sentence` is copied; the copy is released by the destructor.
    PgfParser(ref<PgfConcrLincat> start, PgfText *sentence);

    // PgfPhraseScanner callbacks.
    void space(size_t start, size_t end, PgfExn* err);
    void start_matches(size_t end, PgfExn* err);
    void match(ref<PgfConcrLin> lin, size_t seq_index, PgfExn* err);
    void end_matches(size_t end, PgfExn* err);

    // Returns the next result expression (0 when exhausted) and stores its
    // probability value in *prob.
    PgfExpr fetch(PgfDB *db, PgfUnmarshaller *u, prob_t *prob);

    virtual ~PgfParser();

private:
    class CFGCat;
    class State;
    class Item;
    struct ItemConts;   // defined with `struct` in parser.cxx
    struct Choice;      // defined with `struct` in parser.cxx
    class Production;

    // A candidate result waiting in a state's priority queue.
    class Result {
    public:
        // Results are handled through Result* pointers, so deletion
        // through the base must be well defined.
        virtual ~Result() {}

        virtual prob_t prob() = 0;
        virtual PgfExpr expr(PgfUnmarshaller *u) = 0;
        virtual void proceed(PgfParser *parser, PgfUnmarshaller *u) = 0;
    };

    class ResultExpr;
    class ResultMeta;

    // Orders the priority queue so that the result with the smallest prob()
    // is on top. Arguments are taken by const reference so the comparator
    // can be invoked on const elements, as the container is allowed to do.
    class ResultComparator {
    public:
        bool operator()(Result* const &lhs, Result* const &rhs) const
        {
            return lhs->prob() > rhs->prob();
        }
    };

    ref<PgfConcrLincat> start;
    PgfText *sentence;

    size_t last_choice_id;

    State *before, *after, *fetch_state;
};
#endif

View File

@@ -11,6 +11,7 @@
#include "printer.h"
#include "typechecker.h"
#include "linearizer.h"
#include "parser.h"
#include "graphviz.h"
static void
@@ -815,6 +816,35 @@ pgf_is_case_sensitive(ref<PgfConcr> concr)
return true;
}
// Phrase scanner that forwards every match to a morphology callback and
// ignores all other scanner events.
class PGF_INTERNAL_DECL PgfMorphoScanner : public PgfPhraseScanner {
public:
    PgfMorphoScanner(PgfMorphoCallback* callback)
      : callback(callback)
    {
    }

    virtual void space(size_t start, size_t end, PgfExn* err) {}

    virtual void start_matches(size_t end, PgfExn* err) {}

    // Reports the abstract function name, the name of the lincat field the
    // sequence belongs to, and the sum of the category and function
    // probability values.
    virtual void match(ref<PgfConcrLin> lin, size_t seq_index, PgfExn* err)
    {
        ref<PgfConcrLincat> lincat = lin->lincat;
        size_t field_no = seq_index % lincat->fields->len;
        ref<PgfLincatField> field = vector_elem(lincat->fields, field_no);

        callback->fn(callback,
                     &lin->absfun->name,
                     &(*field->name),
                     lincat->abscat->prob+lin->absfun->prob,
                     err);
    }

    virtual void end_matches(size_t end, PgfExn* err) {}

private:
    PgfMorphoCallback* callback;
};
PGF_API
void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision,
PgfText *sentence,
@@ -826,13 +856,45 @@ void pgf_lookup_morpho(PgfDB *db, PgfConcrRevision cnc_revision,
bool case_sensitive = pgf_is_case_sensitive(concr);
PgfMorphoScanner scanner(callback);
phrasetable_lookup(concr->phrasetable,
sentence, case_sensitive,
concr->lincats,
callback, err);
&scanner, err);
} PGF_API_END
}
// Phrase scanner that reports cohorts: every match is forwarded to the
// embedded morphology callback, and each group of matches is closed by a
// call to the cohort callback with the group's start and end offsets.
class PGF_INTERNAL_DECL PgfCohortsScanner : public PgfPhraseScanner {
public:
    PgfCohortsScanner(PgfCohortsCallback* callback) {
        this->callback = callback;
        // Defined value in case matches are reported before any space
        // event; offset 0 is the beginning of the sentence.
        this->match_start = 0;
    }

    // The text following a space starts where the space ends.
    virtual void space(size_t start, size_t end, PgfExn* err)
    {
        match_start = end;
    }

    virtual void start_matches(size_t match_end, PgfExn* err)
    {
    }

    // Reports the abstract function name, the name of the lincat field the
    // sequence belongs to, and the sum of the category and function
    // probability values.
    virtual void match(ref<PgfConcrLin> lin, size_t seq_index, PgfExn* err)
    {
        ref<PgfLincatField> field =
            vector_elem(lin->lincat->fields, seq_index % lin->lincat->fields->len);
        callback->morpho.fn(&callback->morpho, &lin->absfun->name, &(*field->name), lin->lincat->abscat->prob+lin->absfun->prob, err);
    }

    virtual void end_matches(size_t match_end, PgfExn* err)
    {
        callback->fn(callback, match_start, match_end, err);
    }

private:
    size_t match_start;
    PgfCohortsCallback* callback;
};
PGF_API
void pgf_lookup_cohorts(PgfDB *db, PgfConcrRevision cnc_revision,
PgfText *sentence,
@@ -844,10 +906,10 @@ void pgf_lookup_cohorts(PgfDB *db, PgfConcrRevision cnc_revision,
bool case_sensitive = pgf_is_case_sensitive(concr);
PgfCohortsScanner scanner(callback);
phrasetable_lookup_cohorts(concr->phrasetable,
sentence, case_sensitive,
concr->lincats,
callback, err);
&scanner, err);
} PGF_API_END
}
@@ -885,7 +947,7 @@ PGF_API
PgfText *pgf_get_lincat_field_internal(object o, size_t i)
{
ref<PgfConcrLincat> lincat = o;
return &(**vector_elem(lincat->fields, i));
return &*(vector_elem(lincat->fields, i)->name);
}
PGF_API
@@ -903,22 +965,18 @@ PgfText *pgf_print_lindef_internal(PgfPhrasetableIds *seq_ids, object o, size_t
PgfInternalMarshaller m;
PgfPrinter printer(NULL,0,&m);
printer.efun(&lincat->name);
printer.puts(" : ");
ref<PgfPResult> res = *vector_elem(lincat->res, i);
if (res->vars != 0) {
printer.lvar_ranges(res->vars);
printer.puts(" . ");
}
printer.puts(" String(0) -> ");
printer.efun(&lincat->name);
printer.puts("(");
printer.lparam(ref<PgfLParam>::from_ptr(&res->param));
printer.puts(") = [");
printer.puts(") -> ");
printer.efun(&lincat->name);
printer.puts("[String(0)] = [");
size_t n_seqs = lincat->fields->len;
for (size_t j = 0; j < n_seqs; j++) {
@@ -942,20 +1000,19 @@ PgfText *pgf_print_linref_internal(PgfPhrasetableIds *seq_ids, object o, size_t
PgfInternalMarshaller m;
PgfPrinter printer(NULL,0,&m);
printer.efun(&lincat->name);
printer.puts(" : ");
ref<PgfPResult> res = *vector_elem(lincat->res, lincat->n_lindefs+i);
if (res->vars != 0) {
printer.lvar_ranges(res->vars);
printer.puts(" . ");
}
printer.puts("String(0) -> ");
printer.efun(&lincat->name);
printer.puts("[");
printer.efun(&lincat->name);
printer.puts("(");
printer.lparam(vector_elem(lincat->args, lincat->n_lindefs+i)->param);
printer.puts(") -> String(0) = [");
printer.puts(")] = [");
size_t n_seqs = lincat->fields->len;
ref<PgfSequence> seq = *vector_elem(lincat->seqs, lincat->n_lindefs*n_seqs+i);
@@ -970,37 +1027,33 @@ PGF_API
PgfText *pgf_print_lin_internal(PgfPhrasetableIds *seq_ids, object o, size_t i)
{
ref<PgfConcrLin> lin = o;
ref<PgfDTyp> ty = lin->absfun->type;
PgfInternalMarshaller m;
PgfPrinter printer(NULL,0,&m);
printer.efun(&lin->name);
printer.puts(" : ");
ref<PgfPResult> res = *vector_elem(lin->res, i);
ref<PgfDTyp> ty = lin->absfun->type;
if (res->vars != 0) {
printer.lvar_ranges(res->vars);
printer.puts(" . ");
}
size_t n_args = lin->args->len / lin->res->len;
for (size_t j = 0; j < n_args; j++) {
if (j > 0)
printer.puts(" * ");
printer.parg(vector_elem(ty->hypos, j)->type,
vector_elem(lin->args, i*n_args + j));
}
if (n_args > 0)
printer.puts(" -> ");
printer.efun(&ty->name);
printer.puts("(");
printer.lparam(ref<PgfLParam>::from_ptr(&res->param));
printer.puts(") = [");
printer.puts(") -> ");
printer.efun(&lin->name);
printer.puts("[");
size_t n_args = lin->args->len / lin->res->len;
for (size_t j = 0; j < n_args; j++) {
if (j > 0)
printer.puts(",");
printer.parg(vector_elem(ty->hypos, j)->type,
vector_elem(lin->args, i*n_args + j));
}
printer.puts("] = [");
size_t n_seqs = lin->seqs->len / lin->res->len;
for (size_t j = 0; j < n_seqs; j++) {
@@ -1439,10 +1492,11 @@ public:
this->n_lindefs = n_lindefs;
this->n_linrefs = n_linrefs;
ref<Vector<ref<PgfText>>> db_fields = vector_new<ref<PgfText>>(n_fields);
ref<Vector<PgfLincatField>> db_fields = vector_new<PgfLincatField>(n_fields);
for (size_t i = 0; i < n_fields; i++) {
ref<PgfText> field = textdup_db(fields[i]);
*vector_elem(db_fields, i) = field;
ref<PgfText> name = textdup_db(fields[i]);
vector_elem(db_fields, i)->name = name;
vector_elem(db_fields, i)->backrefs = 0;
}
ref<PgfConcrLincat> lincat = PgfDB::malloc<PgfConcrLincat>(abscat->name.size+1);
@@ -2098,7 +2152,7 @@ PgfText **pgf_category_fields(PgfDB *db, PgfConcrRevision revision,
if (fields == 0)
throw pgf_systemerror(ENOMEM);
for (size_t i = 0; i < n_fields; i++) {
fields[i] = textdup(lincat->fields->data[i]);
fields[i] = textdup(vector_elem(lincat->fields, i)->name);
}
*p_n_fields = n_fields;
return fields;
@@ -2188,7 +2242,7 @@ PgfText **pgf_tabular_linearize(PgfDB *db, PgfConcrRevision revision,
PgfText *text = out.get_text();
if (text != NULL) {
res[pos++] = textdup(&(*lincat->fields->data[i]));
res[pos++] = textdup(&*(vector_elem(lincat->fields,i)->name));
res[pos++] = text;
}
}
@@ -2227,7 +2281,7 @@ PgfText **pgf_tabular_linearize_all(PgfDB *db, PgfConcrRevision revision,
PgfText *text = out.get_text();
if (text != NULL) {
res[pos++] = textdup(&(*lincat->fields->data[i]));
res[pos++] = textdup(&*(vector_elem(lincat->fields, i)->name));
res[pos++] = text;
}
}
@@ -2240,7 +2294,7 @@ PgfText **pgf_tabular_linearize_all(PgfDB *db, PgfConcrRevision revision,
return NULL;
}
PGF_API_DECL
PGF_API
void pgf_bracketed_linearize(PgfDB *db, PgfConcrRevision revision,
PgfExpr expr, PgfPrintContext *ctxt,
PgfMarshaller *m,
@@ -2260,7 +2314,7 @@ void pgf_bracketed_linearize(PgfDB *db, PgfConcrRevision revision,
} PGF_API_END
}
PGF_API_DECL
PGF_API
void pgf_bracketed_linearize_all(PgfDB *db, PgfConcrRevision revision,
PgfExpr expr, PgfPrintContext *ctxt,
PgfMarshaller *m,
@@ -2281,6 +2335,70 @@ void pgf_bracketed_linearize_all(PgfDB *db, PgfConcrRevision revision,
} PGF_API_END
}
struct PGF_INTERNAL_DECL PgfLincatUnmarshaller : PgfUnmarshaller {
PgfLincatUnmarshaller(ref<PgfConcr> concr) {
this->concr = concr;
this->lincat = 0;
}
virtual PgfExpr eabs(PgfBindType btype, PgfText *name, PgfExpr body) { return 0; }
virtual PgfExpr eapp(PgfExpr fun, PgfExpr arg) { return 0; }
virtual PgfExpr elit(PgfLiteral lit) { return 0; }
virtual PgfExpr emeta(PgfMetaId meta) { return 0; }
virtual PgfExpr efun(PgfText *name) { return 0; }
virtual PgfExpr evar(int index) { return 0; }
virtual PgfExpr etyped(PgfExpr expr, PgfType typ) { return 0; }
virtual PgfExpr eimplarg(PgfExpr expr) { return 0; }
virtual PgfLiteral lint(size_t size, uintmax_t *v) { return 0; }
virtual PgfLiteral lflt(double v) { return 0; }
virtual PgfLiteral lstr(PgfText *v) { return 0; }
virtual PgfType dtyp(size_t n_hypos, PgfTypeHypo *hypos,
PgfText *cat,
size_t n_exprs, PgfExpr *exprs) {
lincat =
namespace_lookup(concr->lincats, cat);
return 0;
}
virtual void free_ref(object x) {};
ref<PgfConcr> concr;
ref<PgfConcrLincat> lincat;
};
// Parses `sentence` with the concrete syntax identified by `revision`.
//
// The category of `ty` is extracted through the marshaller `m` and resolved
// to a lincat. A PgfParser is then created for that lincat and fed by a scan
// of the phrase table over the sentence.
//
// Returns:
//   - the parser, as a PgfExprEnum*, on success; the caller releases it
//     with pgf_free_expr_enum;
//   - 0 when the category of `ty` has no lincat in this concrete syntax
//     (no error is reported in that case);
//   - NULL when an error was reported through `err`.
//
// The parser is owned by this function until it is returned, so it is
// destroyed on every failure path instead of being leaked.
PGF_API
PgfExprEnum *pgf_parse(PgfDB *db, PgfConcrRevision revision,
                       PgfType ty, PgfMarshaller *m,
                       PgfText *sentence,
                       PgfExn * err)
{
    PGF_API_BEGIN {
        DB_scope scope(db, READER_SCOPE);

        ref<PgfConcr> concr = db->revision2concr(revision);

        bool case_sensitive = pgf_is_case_sensitive(concr);

        // Resolve the start category to its lincat.
        PgfLincatUnmarshaller u(concr);
        m->match_type(&u, ty);
        if (u.lincat == 0)
            return 0;

        PgfParser *parser = new PgfParser(u.lincat, sentence);
        try {
            phrasetable_lookup_cohorts(concr->phrasetable,
                                       sentence, case_sensitive,
                                       parser, err);
        } catch (...) {
            // Let the enclosing API handler translate the exception,
            // but do not leak the parser on the way out.
            delete parser;
            throw;
        }

        // An error reported through `err` means the caller will discard the
        // result, so the parser must not escape.
        if (err->type != PGF_EXN_NONE) {
            delete parser;
            return NULL;
        }

        return parser;
    } PGF_API_END

    return NULL;
}
// Destroys an expression enumeration (for example the one returned by
// pgf_parse) with `delete`. Passing a null pointer is harmless, since
// deleting a null pointer does nothing.
PGF_API
void pgf_free_expr_enum(PgfExprEnum *en)
{
delete en;
}
PGF_API
PgfText *pgf_get_printname(PgfDB *db, PgfConcrRevision revision,
PgfText *fun, PgfExn* err)

View File

@@ -724,6 +724,31 @@ void pgf_bracketed_linearize_all(PgfDB *db, PgfConcrRevision revision,
PgfLinearizationOutputIface *out,
PgfExn* err);
/* PgfExprEnum is the type of the enumeration returned by pgf_parse.
 * It is declared once for C++ clients and once for C clients. */
#ifdef __cplusplus
/* C++ view: an abstract interface with a single operation. */
struct PgfExprEnum {
/* Fetches an expression from the enumeration, using the unmarshaller `u`.
 * `prob` is an out-parameter (callers read it after the call). */
virtual PgfExpr fetch(PgfDB *db, PgfUnmarshaller *u, prob_t *prob)=0;
/* Virtual, so that `delete` through a PgfExprEnum* runs the destructor
 * of the concrete enumeration. */
virtual ~PgfExprEnum() {};
};
#else
/* C view: an object that starts with a pointer to a table of function
 * pointers; `fetch` takes the object itself as its first argument.
 * NOTE(review): this presumably relies on the C++ object storing its
 * virtual table pointer first with `fetch` in the first slot; confirm
 * for every supported compiler/ABI. */
typedef struct PgfExprEnum PgfExprEnum;
typedef struct PgfExprEnumVtbl PgfExprEnumVtbl;
struct PgfExprEnumVtbl {
PgfExpr (*fetch)(PgfExprEnum *this, PgfDB *db, PgfUnmarshaller *u, prob_t *prob);
};
struct PgfExprEnum {
PgfExprEnumVtbl *vtbl;
};
#endif
/* Parses `sentence` with the concrete syntax `revision`, using the
 * category of the type `ty` (read through the marshaller `m`) as the
 * start category. Returns an enumeration of expressions, or a null
 * pointer when no enumeration could be produced; errors are reported
 * through `err`. */
PGF_API_DECL
PgfExprEnum *pgf_parse(PgfDB *db, PgfConcrRevision revision,
PgfType ty, PgfMarshaller *m,
PgfText *sentence,
PgfExn * err);
/* Releases an enumeration obtained from the library, e.g. from pgf_parse. */
PGF_API_DECL
void pgf_free_expr_enum(PgfExprEnum *en);
PGF_API_DECL
PgfText *pgf_get_printname(PgfDB *db, PgfConcrRevision revision,
PgfText *fun, PgfExn* err);

View File

@@ -1,5 +1,5 @@
#include "data.h"
#include "heap.h"
#include <queue>
PgfPhrasetableIds::PgfPhrasetableIds()
{
@@ -231,10 +231,6 @@ int sequence_cmp(ref<PgfSequence> seq1, ref<PgfSequence> seq2)
struct PGF_INTERNAL_DECL PgfTextSpot {
size_t pos; // position in Unicode characters
const uint8_t *ptr; // pointer into the spot location
bool operator >= (PgfTextSpot const &obj) {
return pos >= obj.pos;
}
};
static
@@ -479,8 +475,7 @@ PGF_INTERNAL
void phrasetable_lookup(PgfPhrasetable table,
PgfText *sentence,
bool case_sensitive,
Namespace<PgfConcrLincat> lincats,
PgfMorphoCallback* callback, PgfExn* err)
PgfPhraseScanner *scanner, PgfExn* err)
{
if (table == 0)
return;
@@ -491,9 +486,9 @@ void phrasetable_lookup(PgfPhrasetable table,
const uint8_t *end = current.ptr+sentence->size;
int cmp = text_sequence_cmp(&current,end,table->value.seq,case_sensitive,true);
if (cmp < 0) {
phrasetable_lookup(table->left,sentence,case_sensitive,lincats,callback,err);
phrasetable_lookup(table->left,sentence,case_sensitive,scanner,err);
} else if (cmp > 0) {
phrasetable_lookup(table->right,sentence,case_sensitive,lincats,callback,err);
phrasetable_lookup(table->right,sentence,case_sensitive,scanner,err);
} else {
auto backrefs = table->value.backrefs;
if (backrefs != 0) {
@@ -502,13 +497,8 @@ void phrasetable_lookup(PgfPhrasetable table,
switch (ref<PgfConcrLin>::get_tag(backref.container)) {
case PgfConcrLin::tag: {
ref<PgfConcrLin> lin = ref<PgfConcrLin>::untagged(backref.container);
ref<PgfConcrLincat> lincat =
namespace_lookup(lincats, &lin->absfun->type->name);
if (lin->absfun->type->hypos->len == 0 && lincat != 0) {
ref<PgfText> field =
*vector_elem(lincat->fields, backref.seq_index % lincat->fields->len);
callback->fn(callback, &lin->absfun->name, &(*field), lincat->abscat->prob+lin->absfun->prob, err);
if (lin->absfun->type->hypos->len == 0) {
scanner->match(lin, backref.seq_index, err);
if (err->type != PGF_EXN_NONE)
return;
}
@@ -523,10 +513,10 @@ void phrasetable_lookup(PgfPhrasetable table,
}
if (!case_sensitive) {
phrasetable_lookup(table->left,sentence,false,lincats,callback,err);
phrasetable_lookup(table->left,sentence,false,scanner,err);
if (err->type != PGF_EXN_NONE)
return;
phrasetable_lookup(table->right,sentence,false,lincats,callback,err);
phrasetable_lookup(table->right,sentence,false,scanner,err);
if (err->type != PGF_EXN_NONE)
return;
}
@@ -534,18 +524,66 @@ void phrasetable_lookup(PgfPhrasetable table,
}
struct PGF_INTERNAL_DECL PgfCohortsState {
class PgfTextSpotComparator : std::less<PgfTextSpot> {
public:
bool operator()(PgfTextSpot &lhs, PgfTextSpot &rhs) const
{
return lhs.pos > rhs.pos;
}
};
PgfTextSpot spot;
Heap<PgfTextSpot> queue;
std::priority_queue<PgfTextSpot, std::vector<PgfTextSpot>, PgfTextSpotComparator> queue;
size_t last_pos;
size_t skip_pos;
bool skipping;
const uint8_t *end; // pointer into the end of the sentence
bool case_sensitive;
Namespace<PgfConcrLincat> lincats;
PgfCohortsCallback* callback;
PgfPhraseScanner *scanner;
PgfExn* err;
};
// Ends a "skipping" phase, i.e. a stretch where state->skipping was set.
// Does nothing when state->skipping is false.
//
// Queued spots located before the current spot (state->spot) are removed
// from the queue; for each distinct position among them the scanner
// receives a start_matches/end_matches pair with no match() in between.
// Finally a space() notification is sent for the current position and the
// skipping state is reset.
//
// Errors are reported through state->err. The function returns early when
// a callback inside the loop fails; the outcome of the last space() call
// is not inspected here, so callers must check state->err afterwards.
static
void finish_skipping(PgfCohortsState *state) {
if (state->skipping) {
while (!state->queue.empty()) {
PgfTextSpot spot = state->queue.top();
// Spots at or after the current position are still pending: stop.
if (spot.pos >= state->spot.pos)
break;
// Several queue entries may share a position; report it only once.
if (spot.pos != state->last_pos) {
if (state->last_pos > 0) {
// NOTE(review): start and end are both spot.pos, i.e. an empty
// range is reported -- confirm that this is intended.
state->scanner->space(spot.pos, spot.pos,
state->err);
if (state->err->type != PGF_EXN_NONE)
return;
}
// NOTE(review): the pair below is reported at state->spot.pos (the
// current spot), not at the queued spot.pos -- confirm.
state->scanner->start_matches(state->spot.pos,
state->err);
if (state->err->type != PGF_EXN_NONE)
return;
state->scanner->end_matches(state->spot.pos,
state->err);
if (state->err->type != PGF_EXN_NONE)
return;
state->last_pos = spot.pos;
}
state->queue.pop();
}
// Error status of this call is left for the caller to check.
state->scanner->space(state->spot.pos, state->spot.pos,
state->err);
state->last_pos = 0;
state->skipping = false;
}
}
static
void phrasetable_lookup_prefixes(PgfCohortsState *state,
PgfPhrasetable table,
@@ -561,38 +599,38 @@ void phrasetable_lookup_prefixes(PgfCohortsState *state,
} else if (cmp > 0) {
ptrdiff_t len = current.ptr - state->spot.ptr;
if (min <= len)
phrasetable_lookup_prefixes(state,table->left,min,len);
if (min <= len-1)
phrasetable_lookup_prefixes(state,table->left,min,len-1);
if (len+1 <= max)
phrasetable_lookup_prefixes(state,table->right,len+1,max);
if (len <= max)
phrasetable_lookup_prefixes(state,table->right,len,max);
} else {
ptrdiff_t len = current.ptr - state->spot.ptr;
finish_skipping(state);
if (state->err->type != PGF_EXN_NONE)
return;
if (min <= len)
phrasetable_lookup_prefixes(state,table->left,min,len);
auto backrefs = table->value.backrefs;
if (len > 0 && backrefs != 0) {
if (state->skip_pos != (size_t) -1) {
state->callback->fn(state->callback,
state->skip_pos,
state->spot.pos,
state->err);
if (state->err->type != PGF_EXN_NONE)
return;
state->skip_pos = (size_t) -1;
}
if (state->last_pos != current.pos) {
if (state->last_pos > 0) {
state->scanner->end_matches(state->last_pos,
state->err);
if (state->err->type != PGF_EXN_NONE)
return;
}
if (state->last_pos > 0 && state->last_pos != current.pos) {
state->callback->fn(state->callback,
state->spot.pos,
state->last_pos,
state->err);
state->scanner->start_matches(current.pos,
state->err);
if (state->err->type != PGF_EXN_NONE)
return;
state->last_pos = current.pos;
}
state->last_pos = current.pos;
state->queue.push(current);
for (size_t i = 0; i < backrefs->len; i++) {
@@ -600,17 +638,10 @@ void phrasetable_lookup_prefixes(PgfCohortsState *state,
switch (ref<PgfConcrLin>::get_tag(backref.container)) {
case PgfConcrLin::tag: {
ref<PgfConcrLin> lin = ref<PgfConcrLin>::untagged(backref.container);
ref<PgfConcrLincat> lincat =
namespace_lookup(state->lincats, &lin->absfun->type->name);
if (lin->absfun->type->hypos->len == 0 && lincat != 0) {
ref<PgfText> field =
*vector_elem(lincat->fields, backref.seq_index % lincat->fields->len);
state->callback->morpho.fn(&state->callback->morpho,
&lin->absfun->name,
&(*field),
lincat->abscat->prob+lin->absfun->prob,
state->err);
if (lin->absfun->type->hypos->len == 0) {
state->scanner->match(lin,
backref.seq_index,
state->err);
if (state->err->type != PGF_EXN_NONE)
return;
}
@@ -633,8 +664,7 @@ PGF_INTERNAL
void phrasetable_lookup_cohorts(PgfPhrasetable table,
PgfText *sentence,
bool case_sensitive,
Namespace<PgfConcrLincat> lincats,
PgfCohortsCallback* callback, PgfExn* err)
PgfPhraseScanner *scanner, PgfExn* err)
{
PgfTextSpot spot;
spot.pos = 0;
@@ -645,15 +675,16 @@ void phrasetable_lookup_cohorts(PgfPhrasetable table,
state.spot.ptr = NULL;
state.queue.push(spot);
state.last_pos = 0;
state.skip_pos = (size_t) -1;
state.skipping = false;
state.end = (uint8_t *) &sentence->text[sentence->size];
state.case_sensitive = case_sensitive;
state.lincats = lincats;
state.callback = callback;
state.scanner = scanner;
state.err = err;
while (!state.queue.is_empty()) {
PgfTextSpot spot = state.queue.pop();
while (!state.queue.empty()) {
PgfTextSpot spot = state.queue.top();
state.queue.pop();
if (spot.pos != state.spot.pos) {
state.spot = spot;
@@ -667,36 +698,38 @@ void phrasetable_lookup_cohorts(PgfPhrasetable table,
state.spot.ptr = ptr;
}
state.skip_pos = (size_t) -1;
state.scanner->space(spot.pos,state.spot.pos,state.err);
if (state.err->type != PGF_EXN_NONE)
return;
while (state.spot.ptr < state.end) {
phrasetable_lookup_prefixes(&state, table, 1, sentence->size);
if (state.err->type != PGF_EXN_NONE)
return;
if (state.last_pos > 0) {
// We found at least one match.
// The last range is yet to be reported.
state.callback->fn(state.callback,
state.spot.pos,
state.last_pos,
state.err);
state.scanner->end_matches(state.last_pos,
state.err);
if (state.err->type != PGF_EXN_NONE)
return;
state.last_pos = 0;
break;
} else {
// We didn't find any matches at this position,
// therefore we must skip one character and try again.
if (state.skip_pos == (size_t) -1)
state.skip_pos = state.spot.pos;
// No matches were found, try the next position
if (!state.skipping) {
while (!state.queue.empty() &&
state.queue.top().pos < state.spot.pos) {
state.queue.pop();
}
state.queue.push(state.spot);
state.skipping = true;
}
const uint8_t *ptr = state.spot.ptr;
uint32_t ucs = pgf_utf8_decode(&ptr);
if (pgf_utf8_is_space(ucs)) {
state.callback->fn(state.callback,
state.skip_pos,
state.spot.pos,
state.err);
if (state.err->type != PGF_EXN_NONE)
return;
state.skip_pos = -1;
state.queue.push(state.spot);
break;
}
@@ -704,16 +737,10 @@ void phrasetable_lookup_cohorts(PgfPhrasetable table,
state.spot.ptr = ptr;
}
}
if (state.skip_pos != (size_t) -1) {
state.callback->fn(state.callback,
state.skip_pos,
state.spot.pos,
state.err);
if (state.err->type != PGF_EXN_NONE)
return;
state.skip_pos = (size_t) -1;
}
finish_skipping(&state);
if (state.err->type != PGF_EXN_NONE)
return;
state.spot = spot;
}
@@ -748,10 +775,10 @@ void phrasetable_iter(PgfConcr *concr,
ref<PgfConcrLincat> lincat =
namespace_lookup(concr->lincats, &lin->absfun->type->name);
if (lincat != 0) {
ref<PgfText> field =
*vector_elem(lincat->fields, backref.seq_index % lincat->fields->len);
ref<PgfLincatField> field =
vector_elem(lincat->fields, backref.seq_index % lincat->fields->len);
callback->fn(callback, &lin->absfun->name, &(*field), lincat->abscat->prob+lin->absfun->prob, err);
callback->fn(callback, &lin->absfun->name, &(*field->name), lincat->abscat->prob+lin->absfun->prob, err);
if (err->type != PGF_EXN_NONE)
return;
}

View File

@@ -68,19 +68,27 @@ PgfPhrasetable phrasetable_delete(PgfPhrasetable table,
PGF_INTERNAL_DECL
size_t phrasetable_size(PgfPhrasetable table);
class PgfConcrLin;
class PGF_INTERNAL_DECL PgfPhraseScanner {
public:
virtual void space(size_t start, size_t end, PgfExn* err)=0;
virtual void start_matches(size_t pos, PgfExn* err)=0;
virtual void match(ref<PgfConcrLin> lin, size_t seq_index, PgfExn* err)=0;
virtual void end_matches(size_t pos, PgfExn* err)=0;
};
PGF_INTERNAL_DECL
void phrasetable_lookup(PgfPhrasetable table,
PgfText *sentence,
bool case_sensitive,
Namespace<struct PgfConcrLincat> lincats,
PgfMorphoCallback* callback, PgfExn* err);
PgfPhraseScanner *scanner, PgfExn* err);
PGF_INTERNAL_DECL
void phrasetable_lookup_cohorts(PgfPhrasetable table,
PgfText *sentence,
bool case_sensitive,
Namespace<PgfConcrLincat> lincats,
PgfCohortsCallback* callback, PgfExn* err);
PgfPhraseScanner *scanner, PgfExn* err);
PGF_INTERNAL_DECL
void phrasetable_iter(PgfConcr *concr,

View File

@@ -33,7 +33,7 @@ public:
PgfPrinter(PgfPrintContext *context, int priority,
PgfMarshaller *marshaller);
PgfPrinter() { free(res); }
~PgfPrinter() { free(res); }
// Push a new variable in the printing context. If the name
// collides with an existing variable, the variable is renamed
@@ -52,6 +52,12 @@ public:
PgfText *get_text();
void dump() {
PgfText *text = get_text();
fprintf(stderr, "%.*s", (int) text->size, text->text);
free(text);
};
void hypo(PgfTypeHypo *hypo, int prio);
void parg(ref<PgfDTyp> ty, ref<PgfPArg> parg);

View File

@@ -667,7 +667,7 @@ ref<PgfConcrLincat> PgfReader::read_lincat()
{
ref<PgfConcrLincat> lincat = read_name(&PgfConcrLincat::name);
lincat->abscat = namespace_lookup(abstract->cats, &lincat->name);
lincat->fields = read_vector(&PgfReader::read_text2);
lincat->fields = read_vector(&PgfReader::read_lincat_field);
lincat->n_lindefs = read_len();
lincat->args = read_vector(&PgfReader::read_parg);
lincat->res = read_vector(&PgfReader::read_presult2);
@@ -675,6 +675,12 @@ ref<PgfConcrLincat> PgfReader::read_lincat()
return lincat;
}
// Fills in one field of a lincat: the name is whatever read_text()
// returns, and the list of back-references starts out as 0 (none).
void PgfReader::read_lincat_field(ref<PgfLincatField> field)
{
    // Obtain the name first, then store it.
    auto name = read_text();
    field->name = name;

    field->backrefs = 0;
}
ref<PgfConcrLin> PgfReader::read_lin()
{
ref<PgfConcrLin> lin = read_name(&PgfConcrLin::name);
@@ -682,6 +688,76 @@ ref<PgfConcrLin> PgfReader::read_lin()
lin->args = read_vector(&PgfReader::read_parg);
lin->res = read_vector(&PgfReader::read_presult2);
lin->seqs = read_seq_ids(lin.tagged());
lin->lincat =
namespace_lookup(concrete->lincats, &lin->absfun->type->name);
if (lin->lincat == 0)
throw pgf_error("Found a lin which uses a category without a lincat");
ref<Vector<PgfHypo>> hypos = lin->absfun->type->hypos;
ref<PgfConcrLincat> lincats[hypos->len];
for (size_t d = 0; d < hypos->len; d++) {
lincats[d] =
namespace_lookup(concrete->lincats,
&vector_elem(hypos,d)->type->name);
if (lincats[d] == 0)
throw pgf_error("Found a lin which uses a category without a lincat");
}
size_t n_fields = lin->lincat->fields->len;
for (size_t seq_index = 0; seq_index < lin->seqs->len; seq_index++) {
ref<PgfSequence> seq = *vector_elem(lin->seqs,seq_index);
ref<PgfPResult> result = *vector_elem(lin->res, seq_index / n_fields);
size_t dot = 0;
if (dot < seq->syms.len) {
PgfSymbol sym = *vector_elem(&seq->syms,dot);
switch (ref<PgfSymbol>::get_tag(sym)) {
case PgfSymbolCat::tag: {
auto sym_cat = ref<PgfSymbolCat>::untagged(sym);
ref<PgfConcrLincat> lincat = lincats[sym_cat->d];
size_t max_values = 1;
size_t ranges[sym_cat->r.n_terms];
for (size_t i = 0; i < sym_cat->r.n_terms; i++) {
size_t range = 1;
for (size_t j = 0; j < result->vars->len; j++) {
auto var_range = vector_elem(result->vars, j);
if (var_range->var == sym_cat->r.terms[i].var) {
range = var_range->range;
break;
}
}
ranges[i] = range;
max_values *= range;
}
for (size_t values = 0; values < max_values; values++) {
size_t v = values;
size_t index = sym_cat->r.i0;
for (size_t i = 0; i < sym_cat->r.n_terms; i++) {
index += sym_cat->r.terms[i].factor * (v % ranges[i]);
v = v / ranges[i];
}
ref<Vector<PgfLincatBackref>> backrefs =
vector_elem(lincat->fields,index)->backrefs;
backrefs =
vector_resize(backrefs, backrefs->len+1,
PgfDB::get_txn_id());
vector_elem(lincat->fields,index)->backrefs = backrefs;
ref<PgfLincatBackref> backref =
vector_elem(backrefs,backrefs->len-1);
backref->lin = lin;
backref->seq_index = seq_index;
backref->dot = dot;
}
break;
}
}
}
}
return lin;
}

View File

@@ -69,6 +69,7 @@ public:
void merge_abstract(ref<PgfAbstr> abstract);
ref<PgfConcrLincat> read_lincat();
void read_lincat_field(ref<PgfLincatField> field);
ref<PgfLParam> read_lparam();
void read_variable_range(ref<PgfVariableRange> var_info);
void read_parg(ref<PgfPArg> parg);

View File

@@ -383,13 +383,18 @@ void PgfWriter::write_phrasetable_helper(PgfPhrasetable table)
void PgfWriter::write_lincat(ref<PgfConcrLincat> lincat)
{
write_name(&lincat->name);
write_vector(lincat->fields, &PgfWriter::write_text);
write_vector(lincat->fields, &PgfWriter::write_lincat_field);
write_len(lincat->n_lindefs);
write_vector(lincat->args, &PgfWriter::write_parg);
write_vector(lincat->res, &PgfWriter::write_presult);
write_vector(lincat->seqs, &PgfWriter::write_seq_id);
}
// Writes one field of a lincat. Only the field name is emitted; the
// `backrefs` member of the field is not written.
void PgfWriter::write_lincat_field(ref<PgfLincatField> field)
{
write_text(field->name);
}
void PgfWriter::write_lin(ref<PgfConcrLin> lin)
{
write_name(&lin->name);

View File

@@ -39,6 +39,7 @@ public:
void write_abstract(ref<PgfAbstr> abstract);
void write_lincat(ref<PgfConcrLincat> lincat);
void write_lincat_field(ref<PgfLincatField> field);
void write_variable_range(ref<PgfVariableRange> var);
void write_lparam(ref<PgfLParam> lparam);
void write_parg(ref<PgfPArg> linarg);

View File

@@ -97,7 +97,7 @@ import Foreign
import Foreign.C
import Control.Monad(forM,forM_)
import Control.Exception(bracket,mask_,throwIO)
import System.IO.Unsafe(unsafePerformIO)
import System.IO.Unsafe(unsafePerformIO, unsafeInterleaveIO)
import System.Random
import qualified Data.Map as Map
import Data.IORef
@@ -673,7 +673,30 @@ data ParseOutput a
| ParseIncomplete -- ^ The sentence is not complete.
parse :: Concr -> Type -> String -> ParseOutput [(Expr,Float)]
parse lang ty sent = parseWithHeuristics lang ty sent (-1.0) []
-- Parse the sentence with the native parser (pgf_parse) and expose the
-- results as a lazy list of (expression, probability) pairs.
--
-- pgf_parse returns a null pointer without reporting an error when the
-- category of the start type has no lincat in this concrete syntax. The
-- enumeration must not be dereferenced in that case, so an empty result
-- is returned instead.
--
-- The results are produced on demand: each step of 'fetchLazy' asks the
-- enumeration for one more expression. The enumeration is freed when it
-- reports that it is exhausted.
-- NOTE(review): if the list is never consumed to its end the enumeration
-- is not freed; consider attaching a finalizer.
parse c ty sent =
  unsafePerformIO $
  withForeignPtr (c_revision c) $ \c_revision ->
  withForeignPtr marshaller $ \m ->
  bracket (newStablePtr ty) freeStablePtr $ \c_ty ->
  withText sent $ \c_sent -> do
    c_enum <- withPgfExn "parse" (pgf_parse (c_db c) c_revision c_ty m c_sent)
    if c_enum == nullPtr
      then return (ParseOk [])
      else do
        -- the object starts with a pointer to its table of functions
        c_fetch <- (#peek PgfExprEnumVtbl, fetch) =<< (#peek PgfExprEnum, vtbl) c_enum
        exprs <- unsafeInterleaveIO (fetchLazy c_fetch c_enum)
        return (ParseOk exprs)
  where
    -- Fetch one result; a null expression marks the end of the enumeration.
    fetchLazy c_fetch c_enum =
      withForeignPtr (c_revision c) $ \c_revision ->
      withForeignPtr unmarshaller $ \u ->
      alloca $ \p_prob -> do
        c_expr <- callFetch c_fetch c_enum (c_db c) u p_prob
        if c_expr == castPtrToStablePtr nullPtr
          then do pgf_free_expr_enum c_enum
                  return []
          else do expr <- deRefStablePtr c_expr
                  freeStablePtr c_expr
                  prob <- peek p_prob
                  rest <- unsafeInterleaveIO (fetchLazy c_fetch c_enum)
                  return ((expr,prob) : rest)
parseWithHeuristics :: Concr -- ^ the language with which we parse
-> Type -- ^ the start category

View File

@@ -50,6 +50,7 @@ data PgfProbsCallback
data PgfMorphoCallback
data PgfCohortsCallback
data PgfPhrasetableIds
data PgfExprEnum
type Wrapper a = a -> IO (FunPtr a)
type Dynamic a = FunPtr a -> a
@@ -253,6 +254,12 @@ foreign import ccall pgf_bracketed_linearize :: Ptr PgfDB -> Ptr Concr -> Stable
foreign import ccall pgf_bracketed_linearize_all :: Ptr PgfDB -> Ptr Concr -> StablePtr Expr -> Ptr PgfPrintContext -> Ptr PgfMarshaller -> Ptr PgfLinearizationOutputIface -> Ptr PgfExn -> IO ()
-- Native parser entry point; the result is a pointer to an expression enumeration.
foreign import ccall pgf_parse :: Ptr PgfDB -> Ptr Concr -> StablePtr Type -> Ptr PgfMarshaller -> Ptr PgfText -> Ptr PgfExn -> IO (Ptr PgfExprEnum)
-- Calls a @fetch@ function pointer taken from an enumeration; the last argument receives a probability.
foreign import ccall "dynamic" callFetch :: Dynamic (Ptr PgfExprEnum -> Ptr PgfDB -> Ptr PgfUnmarshaller -> Ptr (#type prob_t) -> IO (StablePtr Expr))
-- Releases an expression enumeration.
foreign import ccall pgf_free_expr_enum :: Ptr PgfExprEnum -> IO ()
foreign import ccall "wrapper" wrapSymbol0 :: Wrapper (Ptr PgfLinearizationOutputIface -> IO ())
foreign import ccall "wrapper" wrapSymbol1 :: Wrapper (Ptr PgfLinearizationOutputIface -> Ptr PgfText -> IO ())

View File

@@ -18,42 +18,42 @@ concrete basic_cnc {
lincat Float = [
"s"
]
lindef Float : String(0) -> Float(0) = [S0]
linref Float : Float(0) -> String(0) = [S0]
lindef Float(0) -> Float[String(0)] = [S0]
linref String(0) -> Float[Float(0)] = [S0]
lincat Int = [
"s"
]
lindef Int : String(0) -> Int(0) = [S0]
linref Int : Int(0) -> String(0) = [S0]
lindef Int(0) -> Int[String(0)] = [S0]
linref String(0) -> Int[Int(0)] = [S0]
lincat N = [
"s"
]
lindef N : String(0) -> N(0) = [S0]
linref N : ∀{i<2} . N(i) -> String(0) = [S0]
lindef N(0) -> N[String(0)] = [S0]
linref ∀{i<2} . String(0) -> N[N(i)] = [S0]
lincat P = [
"s"
]
lindef P : String(0) -> P(0) = [S0]
linref P : P(0) -> String(0) = [S0]
lindef P(0) -> P[String(0)] = [S0]
linref String(0) -> P[P(0)] = [S0]
lincat S = [
""
]
lindef S : String(0) -> S(0) = [S0]
linref S : S(0) -> String(0) = [S0]
lindef S(0) -> S[String(0)] = [S0]
linref String(0) -> S[S(0)] = [S0]
lincat String = [
"s"
]
lindef String : String(0) -> String(0) = [S0]
linref String : String(0) -> String(0) = [S0]
lin c : ∀{i<2} . N(i) -> S(0) = [S0]
lin floatLit : Float(0) -> S(0) = [S0]
lin ind : ∀{i<2} . P(0) * P(0) * N(i) -> P(0) = [S1]
lin intLit : Int(0) -> S(0) = [S0]
lin nat : ∀{i<2} . N(i) -> P(0) = [S5]
lin s : N(0) -> N(0) = [S2]
lin s : N(1) -> N(0) = [S4]
lin stringLit : String(0) -> S(0) = [S0]
lin z : N(1) = [S3]
lindef String(0) -> String[String(0)] = [S0]
linref String(0) -> String[String(0)] = [S0]
lin ∀{i<2} . S(0) -> c[N(i)] = [S0]
lin S(0) -> floatLit[Float(0)] = [S0]
lin ∀{i<2} . P(0) -> ind[P(0),P(0),N(i)] = [S1]
lin S(0) -> intLit[Int(0)] = [S0]
lin ∀{i<2} . P(0) -> nat[N(i)] = [S5]
lin N(0) -> s[N(0)] = [S2]
lin N(0) -> s[N(1)] = [S4]
lin S(0) -> stringLit[String(0)] = [S0]
lin N(1) -> z[] = [S3]
sequences {
S0 = <0,0>
S1 = <0,0> "&" "λ" SOFT_BIND <1,$0> SOFT_BIND "," SOFT_BIND <1,$1> "." <1,0>