forked from GitHub/gf-core
Merge remote-tracking branch 'origin/master' into build-binary-packages
This commit is contained in:
47
RELEASE.md
Normal file
47
RELEASE.md
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
# GF Core releases
|
||||||
|
|
||||||
|
🚨 WARNING! The information here is preliminary!
|
||||||
|
|
||||||
|
## Creating a new release
|
||||||
|
|
||||||
|
### 1. Prepare the repository
|
||||||
|
|
||||||
|
**Web pages**
|
||||||
|
|
||||||
|
1. Create `download/index-X.Y.md` with installation instructions.
|
||||||
|
1. Create `download/release-X.Y.md` with changelog information.
|
||||||
|
1. Update `download/index.html` to redirect to the new version.
|
||||||
|
1. Add announcement in news section in `index.html`
|
||||||
|
|
||||||
|
**Version numbers**
|
||||||
|
|
||||||
|
1. Update version number in `gf.cabal` (omitting `-git` suffix)
|
||||||
|
1. Add a new line in `debian/changelog`
|
||||||
|
|
||||||
|
### 2. Create GitHub release
|
||||||
|
|
||||||
|
1. When the above changes are committed to the `master` branch in the repository,
|
||||||
|
check that all builds are successful:
|
||||||
|
- https://github.com/GrammaticalFramework/gf-core/actions
|
||||||
|
- https://travis-ci.org/github/GrammaticalFramework/gf-core
|
||||||
|
1. Create a GitHub release here: https://github.com/GrammaticalFramework/gf-core/releases/new
|
||||||
|
with a tag format `RELEASE-X.Y`
|
||||||
|
|
||||||
|
### 3. Binary packages
|
||||||
|
|
||||||
|
Build and attach binaries to the release by running the relevant GitHub Actions workflows (TODO):
|
||||||
|
|
||||||
|
1. Go to https://github.com/GrammaticalFramework/gf-rgl/actions
|
||||||
|
1. Click "Build [platform] package" under _Workflows_
|
||||||
|
1. Click "Run workflow" and specify the tag `RELEASE-X.Y`
|
||||||
|
|
||||||
|
### 4. Upload to Hackage
|
||||||
|
|
||||||
|
1. Run `make sdist`
|
||||||
|
1. Visit `https://hackage.haskell.org/upload` and upload the file `dist/gf-X.Y.tar.gz`,
|
||||||
|
OR upload directly with Cabal (≥2.4): `cabal upload dist/gf-X.Y.tar.gz`
|
||||||
|
1. If the documentation-building fails on the Hackage server, do:
|
||||||
|
```
|
||||||
|
cabal v2-haddock --builddir=dist/docs --haddock-for-hackage --enable-doc
|
||||||
|
cabal upload --documentation dist/docs/*-docs.tar.gz
|
||||||
|
```
|
||||||
@@ -147,7 +147,7 @@ else
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
find . -name '*.md' | while read file ; do
|
find . -name '*.md' | while read file ; do
|
||||||
if [[ "$file" == *"README.md" ]] ; then continue ; fi
|
if [[ "$file" == *"README.md" ]] || [[ "$file" == *"RELEASE.md" ]] ; then continue ; fi
|
||||||
html="${file%.md}.html"
|
html="${file%.md}.html"
|
||||||
if [ "$file" -nt "$html" ] || [ "$template" -nt "$html" ] ; then
|
if [ "$file" -nt "$html" ] || [ "$template" -nt "$html" ] ; then
|
||||||
render_md_html "$file" "$html"
|
render_md_html "$file" "$html"
|
||||||
|
|||||||
25
download/gfc
25
download/gfc
@@ -1,25 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
|
|
||||||
prefix="/usr/local"
|
|
||||||
|
|
||||||
case "i386-apple-darwin9.3.0" in
|
|
||||||
*-cygwin)
|
|
||||||
prefix=`cygpath -w "$prefix"`;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
exec_prefix="${prefix}"
|
|
||||||
GF_BIN_DIR="${exec_prefix}/bin"
|
|
||||||
GF_DATA_DIR="${prefix}/share/GF-3.0-beta"
|
|
||||||
|
|
||||||
GFBIN="$GF_BIN_DIR/gf"
|
|
||||||
|
|
||||||
if [ ! -x "${GFBIN}" ]; then
|
|
||||||
GFBIN=`which gf`
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ ! -x "${GFBIN}" ]; then
|
|
||||||
echo "gf not found."
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
exec $GFBIN --batch "$@"
|
|
||||||
182
download/index-3.11.md
Normal file
182
download/index-3.11.md
Normal file
@@ -0,0 +1,182 @@
|
|||||||
|
---
|
||||||
|
title: Grammatical Framework Download and Installation
|
||||||
|
...
|
||||||
|
|
||||||
|
**GF 3.11** was released on ? December 2020.
|
||||||
|
|
||||||
|
What's new? See the [release notes](release-3.11.html).
|
||||||
|
|
||||||
|
## Binary packages
|
||||||
|
|
||||||
|
Unlike previous versions, these binary packages include only the GF core (compiler and runtime).
|
||||||
|
|
||||||
|
| Platform | Download | Features | How to install |
|
||||||
|
|:----------------|:---------------------------------------------------|:---------------|:---------------------------------|
|
||||||
|
| macOS | [gf-3.11.pkg](gf-3.11.pkg) | GF, S, C, J, P | Double-click on the package icon |
|
||||||
|
| Ubuntu (32-bit) | [gf\_3.11\_i386.deb](gf_3.11_i386.deb) | GF, S, C, J, P | `sudo dpkg -i gf_3.11_i386.deb` |
|
||||||
|
| Ubuntu (64-bit) | [gf\_3.11\_amd64.deb](gf_3.11_amd64.deb) | GF, S, C, J, P | `sudo dpkg -i gf_3.11_amd64.deb` |
|
||||||
|
| Windows | [gf-3.11-bin-windows.zip](gf-3.11-bin-windows.zip) | GF, S | `unzip gf-3.11-bin-windows.zip` |
|
||||||
|
|
||||||
|
**Features**
|
||||||
|
|
||||||
|
- GF = GF shell and grammar compiler
|
||||||
|
- S = `gf -server` mode
|
||||||
|
- C = C run-time system
|
||||||
|
- J/P = Java/Python binding to the C run-time system
|
||||||
|
|
||||||
|
### Notes
|
||||||
|
|
||||||
|
The Windows package is installed by just unpacking it anywhere. You will
|
||||||
|
probably need to set the `PATH` and `GF_LIB_PATH` environment variables,
|
||||||
|
see Inari's notes on [Installing GF on Windows](http://www.grammaticalframework.org/~inari/gf-windows.html#toc3).
|
||||||
|
|
||||||
|
The Ubuntu `.deb` packages should work on Ubuntu 16.04 and 18.04 and
|
||||||
|
similar Linux distributions. The `.deb` packages were updated
|
||||||
|
to version 3.10-2 after the release of GF 3.10.
|
||||||
|
(Because of a packaging bug the Resource Grammar Library was missing
|
||||||
|
in the 3.10-1 packages.)
|
||||||
|
|
||||||
|
The packages for macOS (Mac OS X) should work on at least 10.13 and
|
||||||
|
10.14 (High Sierra and Mojave)
|
||||||
|
|
||||||
|
## Installing the latest release from source
|
||||||
|
|
||||||
|
[GF is on Hackage](http://hackage.haskell.org/package/gf), so under
|
||||||
|
normal circumstances the procedure is fairly simple:
|
||||||
|
|
||||||
|
1. Install a recent version of the [Haskell
|
||||||
|
Platform](http://hackage.haskell.org/platform) (see note below)
|
||||||
|
2. `cabal update`
|
||||||
|
3. On Linux: install some C libraries from your Linux distribution (see note below)
|
||||||
|
4. `cabal install gf`
|
||||||
|
|
||||||
|
This installs the GF executable and Haskell libraries, but **does not include the RGL**.
|
||||||
|
|
||||||
|
You can also download the source code release from [GitHub](https://github.com/GrammaticalFramework/gf-core/releases),
|
||||||
|
and follow the instructions below under **Installing from the latest developer source code**.
|
||||||
|
|
||||||
|
### Notes
|
||||||
|
|
||||||
|
**Installation location**
|
||||||
|
|
||||||
|
The above steps install GF for a single user. The executables are put
|
||||||
|
in `$HOME/.cabal/bin` (or, with recent versions of the Haskell platform
|
||||||
|
on Mac OS X, in `$HOME/Library/Haskell/bin`), so it is a good idea to
|
||||||
|
put a line in your `.bash_profile` or `.profile` to add that directory
|
||||||
|
to your path:
|
||||||
|
|
||||||
|
```
|
||||||
|
PATH=$HOME/.cabal/bin:$PATH
|
||||||
|
```
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
```
|
||||||
|
PATH=$HOME/Library/Haskell/bin:$PATH
|
||||||
|
```
|
||||||
|
|
||||||
|
**Build tools**
|
||||||
|
|
||||||
|
In order to compile GF you need the build tools **Alex** and **Happy**.
|
||||||
|
These can be installed via Cabal, e.g.:
|
||||||
|
|
||||||
|
```
|
||||||
|
cabal install alex happy
|
||||||
|
```
|
||||||
|
|
||||||
|
or obtained by other means, depending on your OS.
|
||||||
|
|
||||||
|
**Haskeline**
|
||||||
|
|
||||||
|
GF uses [`haskeline`](http://hackage.haskell.org/package/haskeline), which
|
||||||
|
on Linux depends on some non-Haskell libraries that won't be installed
|
||||||
|
automatically by cabal, and therefore need to be installed manually.
|
||||||
|
Here is one way to do this:
|
||||||
|
|
||||||
|
- On Ubuntu: `sudo apt-get install libghc-haskeline-dev`
|
||||||
|
- On Fedora: `sudo dnf install ghc-haskeline-devel`
|
||||||
|
|
||||||
|
**GHC version**
|
||||||
|
|
||||||
|
The GF source code has been updated to compile with GHC 8.4.
|
||||||
|
Using older versions of GHC (e.g. 8.2, 8.0 and 7.10) should still work too.
|
||||||
|
|
||||||
|
## Installing from the latest developer source code
|
||||||
|
|
||||||
|
If you haven't already, clone the repository with:
|
||||||
|
|
||||||
|
```
|
||||||
|
git clone https://github.com/GrammaticalFramework/gf-core.git
|
||||||
|
```
|
||||||
|
|
||||||
|
If you've already cloned the repository previously, update with:
|
||||||
|
|
||||||
|
```
|
||||||
|
git pull
|
||||||
|
```
|
||||||
|
|
||||||
|
Then install with:
|
||||||
|
|
||||||
|
```
|
||||||
|
cabal install
|
||||||
|
```
|
||||||
|
|
||||||
|
or, if you're a Stack user:
|
||||||
|
|
||||||
|
```
|
||||||
|
stack install
|
||||||
|
```
|
||||||
|
|
||||||
|
The above notes for installing from source apply also in these cases.
|
||||||
|
For more info on working with the GF source code, see the
|
||||||
|
[GF Developers Guide](../doc/gf-developers.html).
|
||||||
|
|
||||||
|
## Installing the RGL from source
|
||||||
|
|
||||||
|
To install the RGL from source,
|
||||||
|
you can download a release from [GitHub](https://github.com/GrammaticalFramework/gf-rgl/releases)
|
||||||
|
or get the latest version by cloning the repository:
|
||||||
|
|
||||||
|
```
|
||||||
|
git clone https://github.com/GrammaticalFramework/gf-rgl.git
|
||||||
|
```
|
||||||
|
|
||||||
|
In both cases, once you have the RGL sources you can install them by running:
|
||||||
|
|
||||||
|
```
|
||||||
|
make
|
||||||
|
```
|
||||||
|
|
||||||
|
in the RGL folder.
|
||||||
|
This assumes that you already have GF installed.
|
||||||
|
For more details about building the RGL, see the [RGL README](https://github.com/GrammaticalFramework/gf-rgl/blob/master/README.md).
|
||||||
|
|
||||||
|
## Installing the Python bindings from PyPI
|
||||||
|
|
||||||
|
The Python library is available on PyPI as `pgf`, so it can be installed using:
|
||||||
|
|
||||||
|
```
|
||||||
|
pip install pgf
|
||||||
|
```
|
||||||
|
|
||||||
|
We provide binary wheels for Linux and OSX (with Windows missing so far), which
|
||||||
|
include the C runtime and are ready to go. If there is no binary distribution for
|
||||||
|
your platform, this will install the source tarball, which will attempt to build
|
||||||
|
the binding during installation, and requires the GF C runtime to be installed on
|
||||||
|
your system.
|
||||||
|
|
||||||
|
## Older releases
|
||||||
|
|
||||||
|
- [GF 3.10](index-3.10.html) (December 2018)
|
||||||
|
- [GF 3.9](index-3.9.html) (August 2017)
|
||||||
|
- [GF 3.8](index-3.8.html) (June 2016)
|
||||||
|
- [GF 3.7.1](index-3.7.1.html) (October 2015)
|
||||||
|
- [GF 3.7](index-3.7.html) (June 2015)
|
||||||
|
- [GF 3.6](index-3.6.html) (June 2014)
|
||||||
|
- [GF 3.5](index-3.5.html) (August 2013)
|
||||||
|
- [GF 3.4](index-3.4.html) (January 2013)
|
||||||
|
- [GF 3.3.3](index-3.3.3.html) (March 2012)
|
||||||
|
- [GF 3.3](index-3.3.html) (October 2011)
|
||||||
|
- [GF 3.2.9](index-3.2.9.html) source-only snapshot (September 2011)
|
||||||
|
- [GF 3.2](index-3.2.html) (December 2010)
|
||||||
|
- [GF 3.1.6](index-3.1.6.html) (April 2010)
|
||||||
8
download/index.html
Normal file
8
download/index.html
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<meta http-equiv="refresh" content="0; URL=/download/index-3.10.html" />
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
You are being redirected to <a href="index-3.10.html">the current version</a> of this page.
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
25
download/release-3.11.md
Normal file
25
download/release-3.11.md
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
---
|
||||||
|
title: GF 3.11 Release Notes
|
||||||
|
date: ? December 2020
|
||||||
|
...
|
||||||
|
|
||||||
|
## Installation
|
||||||
|
|
||||||
|
See the [download page](index.html).
|
||||||
|
|
||||||
|
## What's new
|
||||||
|
|
||||||
|
From this release, the binary GF core packages do not contain the RGL.
|
||||||
|
The RGL's release cycle is now completely separate from GF's. See [RGL releases](https://github.com/GrammaticalFramework/gf-rgl/releases).
|
||||||
|
|
||||||
|
Over ... changes have been pushed to GF core
|
||||||
|
since the release of GF 3.10 in December 2018.
|
||||||
|
|
||||||
|
## General
|
||||||
|
|
||||||
|
- Testsuite.
|
||||||
|
- Compatibility with new versions of GHC.
|
||||||
|
|
||||||
|
## GF compiler and run-time library
|
||||||
|
|
||||||
|
- More improvements to error messages.
|
||||||
@@ -61,6 +61,14 @@ typedef struct {
|
|||||||
|
|
||||||
typedef enum { BIND_NONE, BIND_HARD, BIND_SOFT } BIND_TYPE;
|
typedef enum { BIND_NONE, BIND_HARD, BIND_SOFT } BIND_TYPE;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
PgfProductionIdx* idx;
|
||||||
|
size_t offset;
|
||||||
|
size_t sym_idx;
|
||||||
|
} PgfLexiconIdxEntry;
|
||||||
|
|
||||||
|
typedef GuBuf PgfLexiconIdx;
|
||||||
|
|
||||||
struct PgfParseState {
|
struct PgfParseState {
|
||||||
PgfParseState* next;
|
PgfParseState* next;
|
||||||
|
|
||||||
@@ -74,6 +82,8 @@ struct PgfParseState {
|
|||||||
size_t end_offset;
|
size_t end_offset;
|
||||||
|
|
||||||
prob_t viterbi_prob;
|
prob_t viterbi_prob;
|
||||||
|
|
||||||
|
PgfLexiconIdx* lexicon_idx;
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef struct PgfAnswers {
|
typedef struct PgfAnswers {
|
||||||
@@ -686,16 +696,6 @@ pgf_result_production(PgfParsing* ps,
|
|||||||
static void
|
static void
|
||||||
pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep);
|
pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep);
|
||||||
|
|
||||||
static void
|
|
||||||
pgf_parsing_push_item(PgfParseState* state, PgfItem* item)
|
|
||||||
{
|
|
||||||
if (gu_buf_length(state->agenda) == 0) {
|
|
||||||
state->viterbi_prob =
|
|
||||||
item->inside_prob+item->conts->outside_prob;
|
|
||||||
}
|
|
||||||
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
pgf_parsing_push_production(PgfParsing* ps, PgfParseState* state,
|
pgf_parsing_push_production(PgfParsing* ps, PgfParseState* state,
|
||||||
PgfItemConts* conts, PgfProduction prod)
|
PgfItemConts* conts, PgfProduction prod)
|
||||||
@@ -727,7 +727,7 @@ pgf_parsing_combine(PgfParsing* ps,
|
|||||||
}
|
}
|
||||||
|
|
||||||
pgf_item_advance(item, ps->pool);
|
pgf_item_advance(item, ps->pool);
|
||||||
pgf_parsing_push_item(before, item);
|
gu_buf_heap_push(before->agenda, pgf_item_prob_order, &item);
|
||||||
}
|
}
|
||||||
|
|
||||||
static PgfProduction
|
static PgfProduction
|
||||||
@@ -898,9 +898,65 @@ pgf_parsing_complete(PgfParsing* ps, PgfItem* item, PgfExprProb *ep)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
PGF_INTERNAL_DECL int
|
||||||
|
pgf_symbols_cmp(PgfCohortSpot* spot,
|
||||||
|
PgfSymbols* syms, size_t* sym_idx,
|
||||||
|
bool case_sensitive);
|
||||||
|
|
||||||
|
static void
|
||||||
|
pgf_parsing_lookahead(PgfParsing *ps, PgfParseState* state,
|
||||||
|
int i, int j, ptrdiff_t min, ptrdiff_t max)
|
||||||
|
{
|
||||||
|
// This is a variation of a binary search algorithm which
|
||||||
|
// can retrieve all prefixes of a string with minimal
|
||||||
|
// comparisons, i.e. there is no need to lookup every
|
||||||
|
// prefix separately.
|
||||||
|
|
||||||
|
while (i <= j) {
|
||||||
|
int k = (i+j) / 2;
|
||||||
|
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, k);
|
||||||
|
|
||||||
|
PgfCohortSpot start = {0, ps->sentence + state->end_offset};
|
||||||
|
PgfCohortSpot current = start;
|
||||||
|
size_t sym_idx = 0;
|
||||||
|
int cmp = pgf_symbols_cmp(¤t, seq->syms, &sym_idx, ps->case_sensitive);
|
||||||
|
if (cmp < 0) {
|
||||||
|
j = k-1;
|
||||||
|
} else if (cmp > 0) {
|
||||||
|
ptrdiff_t len = current.ptr - start.ptr;
|
||||||
|
|
||||||
|
if (min <= len)
|
||||||
|
pgf_parsing_lookahead(ps, state, i, k-1, min, len);
|
||||||
|
|
||||||
|
if (len+1 <= max)
|
||||||
|
pgf_parsing_lookahead(ps, state, k+1, j, len+1, max);
|
||||||
|
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
ptrdiff_t len = current.ptr - start.ptr;
|
||||||
|
|
||||||
|
if (min <= len-1)
|
||||||
|
pgf_parsing_lookahead(ps, state, i, k-1, min, len-1);
|
||||||
|
|
||||||
|
if (seq->idx != NULL) {
|
||||||
|
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
|
||||||
|
entry->idx = seq->idx;
|
||||||
|
entry->offset = (size_t) (current.ptr - ps->sentence);
|
||||||
|
entry->sym_idx = sym_idx;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len+1 <= max)
|
||||||
|
pgf_parsing_lookahead(ps, state, k+1, j, len+1, max);
|
||||||
|
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static PgfParseState*
|
static PgfParseState*
|
||||||
pgf_new_parse_state(PgfParsing* ps, size_t start_offset,
|
pgf_new_parse_state(PgfParsing* ps, size_t start_offset,
|
||||||
BIND_TYPE bind_type)
|
BIND_TYPE bind_type,
|
||||||
|
prob_t viterbi_prob)
|
||||||
{
|
{
|
||||||
PgfParseState** pstate;
|
PgfParseState** pstate;
|
||||||
if (ps->before == NULL && start_offset == 0)
|
if (ps->before == NULL && start_offset == 0)
|
||||||
@@ -953,172 +1009,36 @@ pgf_new_parse_state(PgfParsing* ps, size_t start_offset,
|
|||||||
(start_offset == end_offset);
|
(start_offset == end_offset);
|
||||||
state->start_offset = start_offset;
|
state->start_offset = start_offset;
|
||||||
state->end_offset = end_offset;
|
state->end_offset = end_offset;
|
||||||
state->viterbi_prob = 0;
|
state->viterbi_prob = viterbi_prob;
|
||||||
|
state->lexicon_idx =
|
||||||
|
gu_new_buf(PgfLexiconIdxEntry, ps->pool);
|
||||||
|
|
||||||
if (ps->before == NULL && start_offset == 0)
|
if (ps->before == NULL && start_offset == 0)
|
||||||
state->needs_bind = false;
|
state->needs_bind = false;
|
||||||
|
|
||||||
|
if (gu_seq_length(ps->concr->sequences) > 0) {
|
||||||
|
// Add epsilon lexical rules to the bottom up index
|
||||||
|
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, 0);
|
||||||
|
if (gu_seq_length(seq->syms) == 0 && seq->idx != NULL) {
|
||||||
|
PgfLexiconIdxEntry* entry = gu_buf_extend(state->lexicon_idx);
|
||||||
|
entry->idx = seq->idx;
|
||||||
|
entry->offset = state->start_offset;
|
||||||
|
entry->sym_idx= 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add non-epsilon lexical rules to the bottom up index
|
||||||
|
if (!state->needs_bind) {
|
||||||
|
pgf_parsing_lookahead(ps, state,
|
||||||
|
0, gu_seq_length(ps->concr->sequences)-1,
|
||||||
|
1, strlen(ps->sentence)-state->end_offset);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
*pstate = state;
|
*pstate = state;
|
||||||
|
|
||||||
return state;
|
return state;
|
||||||
}
|
}
|
||||||
|
|
||||||
PGF_INTERNAL_DECL int
|
|
||||||
pgf_symbols_cmp(PgfCohortSpot* spot,
|
|
||||||
PgfSymbols* syms, size_t* sym_idx,
|
|
||||||
bool case_sensitive);
|
|
||||||
|
|
||||||
static bool
|
|
||||||
pgf_parsing_scan_helper(PgfParsing *ps, PgfParseState* state,
|
|
||||||
int i, int j, ptrdiff_t min, ptrdiff_t max)
|
|
||||||
{
|
|
||||||
// This is a variation of a binary search algorithm which
|
|
||||||
// can retrieve all prefixes of a string with minimal
|
|
||||||
// comparisons, i.e. there is no need to lookup every
|
|
||||||
// prefix separately.
|
|
||||||
|
|
||||||
bool found = false;
|
|
||||||
while (i <= j) {
|
|
||||||
int k = (i+j) / 2;
|
|
||||||
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, k);
|
|
||||||
|
|
||||||
PgfCohortSpot start = {0, ps->sentence+state->end_offset};
|
|
||||||
PgfCohortSpot current = start;
|
|
||||||
|
|
||||||
size_t sym_idx = 0;
|
|
||||||
int cmp = pgf_symbols_cmp(¤t, seq->syms, &sym_idx, ps->case_sensitive);
|
|
||||||
if (cmp < 0) {
|
|
||||||
j = k-1;
|
|
||||||
} else if (cmp > 0) {
|
|
||||||
ptrdiff_t len = current.ptr - start.ptr;
|
|
||||||
|
|
||||||
if (min <= len)
|
|
||||||
if (pgf_parsing_scan_helper(ps, state, i, k-1, min, len))
|
|
||||||
found = true;
|
|
||||||
|
|
||||||
if (len+1 <= max)
|
|
||||||
if (pgf_parsing_scan_helper(ps, state, k+1, j, len+1, max))
|
|
||||||
found = true;
|
|
||||||
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
ptrdiff_t len = current.ptr - start.ptr;
|
|
||||||
|
|
||||||
if (min <= len)
|
|
||||||
if (pgf_parsing_scan_helper(ps, state, i, k-1, min, len))
|
|
||||||
found = true;
|
|
||||||
|
|
||||||
// Here we do bottom-up prediction for all lexical categories.
|
|
||||||
// The epsilon productions will be predicted in top-down
|
|
||||||
// fashion while parsing.
|
|
||||||
if (seq->idx != NULL && len > 0) {
|
|
||||||
found = true;
|
|
||||||
|
|
||||||
// A new state will mark the end of the current match
|
|
||||||
PgfParseState* new_state =
|
|
||||||
pgf_new_parse_state(ps, (size_t) (current.ptr - ps->sentence), BIND_NONE);
|
|
||||||
|
|
||||||
// Bottom-up prediction for lexical rules
|
|
||||||
size_t n_entries = gu_buf_length(seq->idx);
|
|
||||||
for (size_t i = 0; i < n_entries; i++) {
|
|
||||||
PgfProductionIdxEntry* entry =
|
|
||||||
gu_buf_index(seq->idx, PgfProductionIdxEntry, i);
|
|
||||||
|
|
||||||
PgfItemConts* conts =
|
|
||||||
pgf_parsing_get_conts(state,
|
|
||||||
entry->ccat, entry->lin_idx,
|
|
||||||
ps->pool);
|
|
||||||
|
|
||||||
// Create the new category if it doesn't exist yet
|
|
||||||
PgfCCat* tmp_ccat = pgf_parsing_get_completed(new_state, conts);
|
|
||||||
PgfCCat* ccat = tmp_ccat;
|
|
||||||
if (ccat == NULL) {
|
|
||||||
ccat = pgf_parsing_create_completed(ps, new_state, conts, INFINITY);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add the production
|
|
||||||
if (ccat->prods == NULL || ccat->n_synprods >= gu_seq_length(ccat->prods)) {
|
|
||||||
ccat->prods = gu_realloc_seq(ccat->prods, PgfProduction, ccat->n_synprods+1);
|
|
||||||
}
|
|
||||||
GuVariantInfo i;
|
|
||||||
i.tag = PGF_PRODUCTION_APPLY;
|
|
||||||
i.data = entry->papp;
|
|
||||||
PgfProduction prod = gu_variant_close(i);
|
|
||||||
gu_seq_set(ccat->prods, PgfProduction, ccat->n_synprods++, prod);
|
|
||||||
|
|
||||||
// Update the category's probability to be minimum
|
|
||||||
if (ccat->viterbi_prob > entry->papp->fun->ep->prob)
|
|
||||||
ccat->viterbi_prob = entry->papp->fun->ep->prob;
|
|
||||||
|
|
||||||
#ifdef PGF_PARSER_DEBUG
|
|
||||||
GuPool* tmp_pool = gu_new_pool();
|
|
||||||
GuOut* out = gu_file_out(stderr, tmp_pool);
|
|
||||||
GuExn* err = gu_exn(tmp_pool);
|
|
||||||
if (tmp_ccat == NULL) {
|
|
||||||
gu_printf(out, err, "[");
|
|
||||||
pgf_print_range(state, new_state, out, err);
|
|
||||||
gu_puts("; ", out, err);
|
|
||||||
pgf_print_fid(conts->ccat->fid, out, err);
|
|
||||||
gu_printf(out, err, "; %d; ",
|
|
||||||
conts->lin_idx);
|
|
||||||
pgf_print_fid(ccat->fid, out, err);
|
|
||||||
gu_puts("] ", out, err);
|
|
||||||
pgf_print_fid(ccat->fid, out, err);
|
|
||||||
gu_printf(out, err, ".chunk_count=%d\n", ccat->chunk_count);
|
|
||||||
}
|
|
||||||
pgf_print_production(ccat->fid, prod, out, err);
|
|
||||||
gu_pool_free(tmp_pool);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (len <= max)
|
|
||||||
if (pgf_parsing_scan_helper(ps, state, k+1, j, len, max))
|
|
||||||
found = true;
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return found;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
|
||||||
pgf_parsing_scan(PgfParsing *ps)
|
|
||||||
{
|
|
||||||
size_t len = strlen(ps->sentence);
|
|
||||||
|
|
||||||
PgfParseState* state =
|
|
||||||
pgf_new_parse_state(ps, 0, BIND_SOFT);
|
|
||||||
|
|
||||||
while (state != NULL && state->end_offset < len) {
|
|
||||||
if (state->needs_bind) {
|
|
||||||
// We have encountered two tokens without space in between.
|
|
||||||
// Those can be accepted only if there is a BIND token
|
|
||||||
// in between. We encode this by having one more state
|
|
||||||
// at the same offset. A transition between these two
|
|
||||||
// states is possible only with the BIND token.
|
|
||||||
state =
|
|
||||||
pgf_new_parse_state(ps, state->end_offset, BIND_HARD);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!pgf_parsing_scan_helper
|
|
||||||
(ps, state,
|
|
||||||
0, gu_seq_length(ps->concr->sequences)-1,
|
|
||||||
1, len-state->end_offset)) {
|
|
||||||
// skip one character and try again
|
|
||||||
GuString s = ps->sentence+state->end_offset;
|
|
||||||
gu_utf8_decode((const uint8_t**) &s);
|
|
||||||
pgf_new_parse_state(ps, s-ps->sentence, BIND_NONE);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (state == ps->before)
|
|
||||||
state = ps->after;
|
|
||||||
else
|
|
||||||
state = state->next;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static void
|
static void
|
||||||
pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
|
pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
|
||||||
{
|
{
|
||||||
@@ -1138,14 +1058,36 @@ pgf_parsing_add_transition(PgfParsing* ps, PgfToken tok, PgfItem* item)
|
|||||||
if (!ps->before->needs_bind && cmp_string(¤t, tok, ps->case_sensitive) == 0) {
|
if (!ps->before->needs_bind && cmp_string(¤t, tok, ps->case_sensitive) == 0) {
|
||||||
PgfParseState* state =
|
PgfParseState* state =
|
||||||
pgf_new_parse_state(ps, (current.ptr - ps->sentence),
|
pgf_new_parse_state(ps, (current.ptr - ps->sentence),
|
||||||
BIND_NONE);
|
BIND_NONE,
|
||||||
pgf_parsing_push_item(state, item);
|
item->inside_prob+item->conts->outside_prob);
|
||||||
|
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||||
} else {
|
} else {
|
||||||
pgf_item_free(ps, item);
|
pgf_item_free(ps, item);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
pgf_parsing_predict_lexeme(PgfParsing* ps, PgfItemConts* conts,
|
||||||
|
PgfProductionIdxEntry* entry,
|
||||||
|
size_t offset, size_t sym_idx)
|
||||||
|
{
|
||||||
|
GuVariantInfo i = { PGF_PRODUCTION_APPLY, entry->papp };
|
||||||
|
PgfProduction prod = gu_variant_close(i);
|
||||||
|
PgfItem* item =
|
||||||
|
pgf_new_item(ps, conts, prod);
|
||||||
|
PgfSymbols* syms = entry->papp->fun->lins[conts->lin_idx]->syms;
|
||||||
|
item->sym_idx = sym_idx;
|
||||||
|
pgf_item_set_curr_symbol(item, ps->pool);
|
||||||
|
prob_t prob = item->inside_prob+item->conts->outside_prob;
|
||||||
|
PgfParseState* state =
|
||||||
|
pgf_new_parse_state(ps, offset, BIND_NONE, prob);
|
||||||
|
if (state->viterbi_prob > prob) {
|
||||||
|
state->viterbi_prob = prob;
|
||||||
|
}
|
||||||
|
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
pgf_parsing_td_predict(PgfParsing* ps,
|
pgf_parsing_td_predict(PgfParsing* ps,
|
||||||
PgfItem* item, PgfCCat* ccat, size_t lin_idx)
|
PgfItem* item, PgfCCat* ccat, size_t lin_idx)
|
||||||
@@ -1193,36 +1135,34 @@ pgf_parsing_td_predict(PgfParsing* ps,
|
|||||||
pgf_parsing_push_production(ps, ps->before, conts, prod);
|
pgf_parsing_push_production(ps, ps->before, conts, prod);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Top-down prediction for epsilon lexical rules if any
|
// Bottom-up prediction for lexical and epsilon rules
|
||||||
PgfSequence* seq = gu_seq_index(ps->concr->sequences, PgfSequence, 0);
|
size_t n_idcs = gu_buf_length(ps->before->lexicon_idx);
|
||||||
if (gu_seq_length(seq->syms) == 0 && seq->idx != NULL) {
|
for (size_t i = 0; i < n_idcs; i++) {
|
||||||
|
PgfLexiconIdxEntry* lentry =
|
||||||
|
gu_buf_index(ps->before->lexicon_idx, PgfLexiconIdxEntry, i);
|
||||||
|
|
||||||
PgfProductionIdxEntry key;
|
PgfProductionIdxEntry key;
|
||||||
key.ccat = ccat;
|
key.ccat = ccat;
|
||||||
key.lin_idx = lin_idx;
|
key.lin_idx = lin_idx;
|
||||||
key.papp = NULL;
|
key.papp = NULL;
|
||||||
PgfProductionIdxEntry* value =
|
PgfProductionIdxEntry* value =
|
||||||
gu_seq_binsearch(gu_buf_data_seq(seq->idx),
|
gu_seq_binsearch(gu_buf_data_seq(lentry->idx),
|
||||||
pgf_production_idx_entry_order,
|
pgf_production_idx_entry_order,
|
||||||
PgfProductionIdxEntry, &key);
|
PgfProductionIdxEntry, &key);
|
||||||
|
|
||||||
if (value != NULL) {
|
if (value != NULL) {
|
||||||
GuVariantInfo i = { PGF_PRODUCTION_APPLY, value->papp };
|
pgf_parsing_predict_lexeme(ps, conts, value, lentry->offset, lentry->sym_idx);
|
||||||
PgfProduction prod = gu_variant_close(i);
|
|
||||||
pgf_parsing_push_production(ps, ps->before, conts, prod);
|
|
||||||
|
|
||||||
PgfProductionIdxEntry* start =
|
PgfProductionIdxEntry* start =
|
||||||
gu_buf_data(seq->idx);
|
gu_buf_data(lentry->idx);
|
||||||
PgfProductionIdxEntry* end =
|
PgfProductionIdxEntry* end =
|
||||||
start + gu_buf_length(seq->idx)-1;
|
start + gu_buf_length(lentry->idx)-1;
|
||||||
|
|
||||||
PgfProductionIdxEntry* left = value-1;
|
PgfProductionIdxEntry* left = value-1;
|
||||||
while (left >= start &&
|
while (left >= start &&
|
||||||
value->ccat->fid == left->ccat->fid &&
|
value->ccat->fid == left->ccat->fid &&
|
||||||
value->lin_idx == left->lin_idx) {
|
value->lin_idx == left->lin_idx) {
|
||||||
GuVariantInfo i = { PGF_PRODUCTION_APPLY, left->papp };
|
pgf_parsing_predict_lexeme(ps, conts, left, lentry->offset, lentry->sym_idx);
|
||||||
PgfProduction prod = gu_variant_close(i);
|
|
||||||
pgf_parsing_push_production(ps, ps->before, conts, prod);
|
|
||||||
left--;
|
left--;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1230,9 +1170,7 @@ pgf_parsing_td_predict(PgfParsing* ps,
|
|||||||
while (right <= end &&
|
while (right <= end &&
|
||||||
value->ccat->fid == right->ccat->fid &&
|
value->ccat->fid == right->ccat->fid &&
|
||||||
value->lin_idx == right->lin_idx) {
|
value->lin_idx == right->lin_idx) {
|
||||||
GuVariantInfo i = { PGF_PRODUCTION_APPLY, right->papp };
|
pgf_parsing_predict_lexeme(ps, conts, right, lentry->offset, lentry->sym_idx);
|
||||||
PgfProduction prod = gu_variant_close(i);
|
|
||||||
pgf_parsing_push_production(ps, ps->before, conts, prod);
|
|
||||||
right++;
|
right++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1271,7 +1209,7 @@ pgf_parsing_pre(PgfParsing* ps, PgfItem* item, PgfSymbols* syms)
|
|||||||
} else {
|
} else {
|
||||||
item->alt = 0;
|
item->alt = 0;
|
||||||
pgf_item_advance(item, ps->pool);
|
pgf_item_advance(item, ps->pool);
|
||||||
pgf_parsing_push_item(ps->before, item);
|
gu_buf_heap_push(ps->before->agenda, pgf_item_prob_order, &item);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1401,8 +1339,9 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
|
|||||||
item->curr_sym = gu_null_variant;
|
item->curr_sym = gu_null_variant;
|
||||||
item->sym_idx = gu_seq_length(syms);
|
item->sym_idx = gu_seq_length(syms);
|
||||||
PgfParseState* state =
|
PgfParseState* state =
|
||||||
pgf_new_parse_state(ps, offset, BIND_NONE);
|
pgf_new_parse_state(ps, offset, BIND_NONE,
|
||||||
pgf_parsing_push_item(state, item);
|
item->inside_prob+item->conts->outside_prob);
|
||||||
|
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||||
match = true;
|
match = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1445,10 +1384,11 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
|
|||||||
if (ps->before->start_offset == ps->before->end_offset &&
|
if (ps->before->start_offset == ps->before->end_offset &&
|
||||||
ps->before->needs_bind) {
|
ps->before->needs_bind) {
|
||||||
PgfParseState* state =
|
PgfParseState* state =
|
||||||
pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD);
|
pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD,
|
||||||
|
item->inside_prob+item->conts->outside_prob);
|
||||||
if (state != NULL) {
|
if (state != NULL) {
|
||||||
pgf_item_advance(item, ps->pool);
|
pgf_item_advance(item, ps->pool);
|
||||||
pgf_parsing_push_item(state, item);
|
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||||
} else {
|
} else {
|
||||||
pgf_item_free(ps, item);
|
pgf_item_free(ps, item);
|
||||||
}
|
}
|
||||||
@@ -1462,10 +1402,11 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
|
|||||||
if (ps->before->start_offset == ps->before->end_offset) {
|
if (ps->before->start_offset == ps->before->end_offset) {
|
||||||
if (ps->before->needs_bind) {
|
if (ps->before->needs_bind) {
|
||||||
PgfParseState* state =
|
PgfParseState* state =
|
||||||
pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD);
|
pgf_new_parse_state(ps, ps->before->end_offset, BIND_HARD,
|
||||||
|
item->inside_prob+item->conts->outside_prob);
|
||||||
if (state != NULL) {
|
if (state != NULL) {
|
||||||
pgf_item_advance(item, ps->pool);
|
pgf_item_advance(item, ps->pool);
|
||||||
pgf_parsing_push_item(state, item);
|
gu_buf_heap_push(state->agenda, pgf_item_prob_order, &item);
|
||||||
} else {
|
} else {
|
||||||
pgf_item_free(ps, item);
|
pgf_item_free(ps, item);
|
||||||
}
|
}
|
||||||
@@ -1474,7 +1415,7 @@ pgf_parsing_symbol(PgfParsing* ps, PgfItem* item, PgfSymbol sym)
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
pgf_item_advance(item, ps->pool);
|
pgf_item_advance(item, ps->pool);
|
||||||
pgf_parsing_push_item(ps->before, item);
|
gu_buf_heap_push(ps->before->agenda, pgf_item_prob_order, &item);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -1725,7 +1666,8 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat,
|
|||||||
ps->heuristic_factor = heuristic_factor;
|
ps->heuristic_factor = heuristic_factor;
|
||||||
}
|
}
|
||||||
|
|
||||||
pgf_parsing_scan(ps);
|
PgfParseState* state =
|
||||||
|
pgf_new_parse_state(ps, 0, BIND_SOFT, 0);
|
||||||
|
|
||||||
int fidString = -1;
|
int fidString = -1;
|
||||||
PgfCCat* start_ccat = gu_new(PgfCCat, ps->pool);
|
PgfCCat* start_ccat = gu_new(PgfCCat, ps->pool);
|
||||||
@@ -1745,7 +1687,7 @@ pgf_parsing_init(PgfConcr* concr, PgfCId cat,
|
|||||||
#endif
|
#endif
|
||||||
|
|
||||||
PgfItemConts* conts =
|
PgfItemConts* conts =
|
||||||
pgf_parsing_get_conts(ps->before, start_ccat, 0, ps->pool);
|
pgf_parsing_get_conts(state, start_ccat, 0, ps->pool);
|
||||||
gu_buf_push(conts->items, PgfItem*, NULL);
|
gu_buf_push(conts->items, PgfItem*, NULL);
|
||||||
|
|
||||||
size_t n_ccats = gu_seq_length(cnccat->cats);
|
size_t n_ccats = gu_seq_length(cnccat->cats);
|
||||||
|
|||||||
Reference in New Issue
Block a user