diff --git a/.github/workflows/build-majestic.yml b/.github/workflows/build-majestic.yml index 395053bcb..7b8c8a322 100644 --- a/.github/workflows/build-majestic.yml +++ b/.github/workflows/build-majestic.yml @@ -22,7 +22,7 @@ jobs: # make \ # g++ - - name: Build runtime + - name: Build C runtime working-directory: ./src/runtime/c run: | autoreconf -i @@ -36,9 +36,22 @@ jobs: # ghc-version: '8.6' # cabal-version: '2.4.1.0' - - name: Run testsuite + - name: Run Haskell testsuite working-directory: ./src/runtime/haskell env: LD_LIBRARY_PATH: /usr/local/lib run: | cabal test --extra-lib-dirs=/usr/local/lib + + - name: Install Python bindings + working-directory: ./src/runtime/python + run: | + python setup.py build + sudo python setup.py install + + - name: Run Python testsuite + working-directory: ./src/runtime/python + env: + LD_LIBRARY_PATH: /usr/local/lib + run: | + python test.py diff --git a/src/runtime/c/doc/DESIDERATA.md b/src/runtime/c/doc/DESIDERATA.md index 4716fddf7..d2f7cda20 100644 --- a/src/runtime/c/doc/DESIDERATA.md +++ b/src/runtime/c/doc/DESIDERATA.md @@ -1,18 +1,18 @@ This is an experiment to develop **a majestic new GF runtime**. -The reason is that there several features that we want to have and they all require a majour rewrite of the existing C runtime. +The reason is that there are several features that we want to have and they all require a major rewrite of the existing C runtime. Instead of beating the old code until it starts doing what we want, it is time to start from scratch. # New Features The features that we want are: -- We want to support **even bigger grammars that don't fit in the main memory** anymore. Instead, they should reside on the disc and parts will be loaded on demand. -The current design is that all memory allocated for the grammars should be from memory-mapped files. In this way the only limit for the grammar size will -be the size of the virtual memory, i.e. 2^64 bytes. 
The swap file is completely circumvented, while all of the available RAM can be used as a cache for loading parts -of the grammar. +- We want to support **even bigger grammars that don't fit in the main memory** anymore. Instead, they should reside on the disc and parts will be loaded on demand. +The current design is that all memory allocated for the grammars should be from memory-mapped files. In this way the only limit for the grammar size will +be the size of the virtual memory, i.e. 2^64 bytes. The swap file is completely circumvented, while all of the available RAM can be used as a cache for loading parts +of the grammar. -- We want to be able to **update grammars dynamically**. This is a highly desired feature since recompiling large grammars takes hours. +- We want to be able to **update grammars dynamically**. This is a highly desired feature since recompiling large grammars takes hours. Instead, dynamic updates should happen instantly. - We want to be able to **store additional information in the PGF**. For example that could be application specific semantic data. @@ -21,32 +21,31 @@ Another example is to store the source code of the different grammar rules, to a - We want to **allow a single file to contain slightly different versions of the grammar**. This will be a kind of a version control system, which will allow different users to store their own grammar extensions while still using the same core content. -- We want to **avoid the exponential explossion in the size of PMCFG** for some grammars. This happens because PMCFG as a formalism is too low-level. +- We want to **avoid the exponential explosion in the size of PMCFG** for some grammars. This happens because PMCFG as a formalism is too low-level. By enriching it with light-weight variables, we can make it more powerful and hopefully avoid the exponential explosion. -- We want to finally **ditch out the old Haskell runtime** which has long outlived its time. 
+- We want to finally **ditch the old Haskell runtime** which has long outlived its time. There are also two bugs in the old C runtime whose fixes will require a lot of changes, so instead of fixing the old runtime we do it here: -- **Integer literals in the C runtime** are implemented as 32-bit integers, while the Haskell runtime used unlimited integer. Python supports unlimited integer too, -so it would be nice to support unlimited integer in the new runtime as well. +- **Integer literals in the C runtime** are implemented as 32-bit integers, while the Haskell runtime used unlimited integers. +Python supports unlimited integers too, so it would be nice to support them in the new runtime as well. -- The old C runtime assumed that **String literals are terminated with the NULL character**. None of the modern languages (Haskell, Python, Java, etc) make +- The old C runtime assumed that **String literals are terminated with the NULL character**. None of the modern languages (Haskell, Python, Java, etc) make that assumption, so we should drop it too. # Consequences The desired features will have the following implementation cosequences. -- The switch from memory-based to disc-based runtime requires one big change. Before it was easy to just keep a pointer from one object to another. -Unfortunately this doesn't work with memory-mapped files, since every time when you map a file into memory it may end up at a different virtual address. +- The switch from memory-based to disc-based runtime requires one big change. Before it was easy to just keep a pointer from one object to another. +Unfortunately this doesn't work with memory-mapped files, since every time when you map a file into memory it may end up at a different virtual address. Instead we must use file offsets. In order to make programming simpler, the new runtime will be **implemented in C++ instead of C**. 
This allows us to overload -the arrow operator (->) which will dynamically convert file offsets to in-memory pointers. +the arrow operator (`->`) which will dynamically convert file offsets to in-memory pointers. -- The choice of C++ also allows us to ditch the old libgu library and **use STL** instead. +- The choice of C++ also allows us to ditch the old `libgu` library and **use STL** instead. -- The content of the memory mapped files is platform specific. For that reason there will be two grammar representations: - - **Native Grammar Format** (.ngf) - which will be instantly loadable by just mapping it to memory, but will be platform dependent. - - **Portable Grammar Format** (.pgf) - which will take longer to load but will be more compact and platform independent. - The runtime will be able to load .pgf files and convert them to .ngf. Conversely .pgf can be exported from the current .ngf. - +- The content of the memory mapped files is platform-specific. For that reason there will be two grammar representations: + - **Native Grammar Format** (`.ngf`) - which will be instantly loadable by just mapping it to memory, but will be platform-dependent. + - **Portable Grammar Format** (`.pgf`) - which will take longer to load but will be more compact and platform independent. + The runtime will be able to load `.pgf` files and convert them to `.ngf`. Conversely `.pgf` can be exported from the current `.ngf`. diff --git a/src/runtime/c/doc/README.md b/src/runtime/c/doc/README.md index 298647f5a..3839f1e04 100644 --- a/src/runtime/c/doc/README.md +++ b/src/runtime/c/doc/README.md @@ -1,14 +1,13 @@ # The Hacker's Guide to GF This is the hacker's guide to GF, for the guide to the galaxy, see the full edition [here](https://en.wikipedia.org/wiki/The_Hitchhiker%27s_Guide_to_the_Galaxy). -Here we will limit outselves to the vastly narrower domain of the [GF](https://www.grammaticalframework.org) runtime. 
This means that we will not meet -any [Vogons](https://en.wikipedia.org/wiki/Vogon), but we will touch upon topics like memory management, databases, transactions, compilers, +Here we will limit ourselves to the vastly narrower domain of the [GF](https://www.grammaticalframework.org) runtime. This means that we will not meet +any [Vogons](https://en.wikipedia.org/wiki/Vogon), but we will touch upon topics like memory management, databases, transactions, compilers, functional programming, theorem proving and sometimes even languages. Subjects that no doubt would interest any curious hacker. -So, **Don't Panick!** and keep reading. This is a live document and will develop together with the runtime itself. +So, **Don't Panic!** and keep reading. This is a live document and will develop together with the runtime itself. **TABLE OF CONTENTS** 1. [Desiderata](DESIDERATA.md) 2. [Memory Model](memory_model.md) - diff --git a/src/runtime/c/doc/memory_model.md b/src/runtime/c/doc/memory_model.md index ef8aa51a9..9c126deeb 100644 --- a/src/runtime/c/doc/memory_model.md +++ b/src/runtime/c/doc/memory_model.md @@ -1,46 +1,46 @@ # The different storage files -The purpose of the .ngf files is to be used as on disk databases that store grammars. Their format is platform dependent and they should not be copied from -one platform to another. In contrast the .pgf files are platform independent and can be moved around. The runtime can import a .pgf file and create an .ngf file. -Conversely a .pgf file can be exported from an already existing .ngf file. +The purpose of the `.ngf` files is to be used as on-disk databases that store grammars. Their format is platform-dependent and they should not be copied from +one platform to another. In contrast the `.pgf` files are platform-independent and can be moved around. The runtime can import a `.pgf` file and create an `.ngf` file. +Conversely a `.pgf` file can be exported from an already existing `.ngf` file. 
-The internal relation between the two file is more interesting. The runtime uses its own memory allocator which always allocates memory from a memory mapped file. -The file may be explicit or an annonymous one. The .ngf is simply a memory image saved in a file. This means that loading the file is always immediate. +The internal relation between the two files is more interesting. The runtime uses its own memory allocator which always allocates memory from a memory mapped file. +The file may be explicit or an anonymous one. The `.ngf` is simply a memory image saved in a file. This means that loading the file is always immediate. You just create a new mapping and the kernel will load memory pages on demand. -On the other hand a .pgf file is a version of the grammar serialized in a platform independent format. This means that loading this type of file is always slower. -Fortunately, you can always create an .ngf file from it to speedup later reloads. +On the other hand a `.pgf` file is a version of the grammar serialized in a platform-independent format. This means that loading this type of file is always slower. +Fortunately, you can always create an `.ngf` file from it to speed up later reloads. The runtime has three ways to load a grammar: -* loading a .pgf: +#### 1. Loading a `.pgf` ```Haskell readPGF :: FilePath -> IO PGF ``` -This loads the .pgf into an annonymous memory mapped file. In practice, this means that instead of allocating memory from an explicit file, the runtime will still +This loads the `.pgf` into an anonymous memory-mapped file. In practice, this means that instead of allocating memory from an explicit file, the runtime will still use the normal swap file. -* loading a .pgf and booting a new .ngf: +#### 2. Loading a `.pgf` and booting a new `.ngf` ```Haskell bootPGF :: FilePath -> FilePath -> IO PGF ``` -The grammar is loaded from a .pgf (the first argument) and the memory is mapped to an explicit .ngf (second argument). 
The .ngf file is created by the function +The grammar is loaded from a `.pgf` (the first argument) and the memory is mapped to an explicit `.ngf` (second argument). The `.ngf` file is created by the function and a file with the same name should not exist before the call. -* loading an existing memory image: +#### 3. Loading an existing memory image ```Haskell readNGF :: FilePath -> IO PGF ``` -Once an .ngf file exists, it can be mapped back to memory by using this function. This call is always guaranteed to be fast. The same function can also -create new empty .ngf files. If the file does not exist, then a new one will be created which contains an empty grammar. The grammar could then be extended +Once an `.ngf` file exists, it can be mapped back to memory by using this function. This call is always guaranteed to be fast. The same function can also +create new empty `.ngf` files. If the file does not exist, then a new one will be created which contains an empty grammar. The grammar could then be extended by dynamically adding functions and categories. -# The content of an .ngf file +# The content of an `.ngf` file -The .ngf file is a memory image but this is not the end of the story. The problem is that there is no way to control at which address the memory image would be -mapped. On Posix systems, `mmap` takes as hint the mapping address but the kernel may choose to ignore it. There is also the flag MAP_FIXED, which makes the hint +The `.ngf` file is a memory image but this is not the end of the story. The problem is that there is no way to control at which address the memory image would be +mapped. On Posix systems, `mmap` takes as hint the mapping address but the kernel may choose to ignore it. There is also the flag `MAP_FIXED`, which makes the hint into a constraint, but then the kernel may fail to satisfy the constraint. For example that address may already be used for something else. 
Furthermore, if the -same file is mapped from several processes (if they all load the same grammar), it would be difficult to find an address which is free in all of them. +same file is mapped from several processes (if they all load the same grammar), it would be difficult to find an address which is free in all of them. Last but not least using `MAP_FIXED` is considered a security risk. Since the start address of the mapping can change, using traditional memory pointers withing the mapped area is not possible. The only option is to use offsets @@ -50,15 +50,15 @@ relative to the beginning of the area. In other words, if normally we would have ``` Writing the explicit pointer arithmetics and typecasts, each time when we dereference a pointer, is not better than Vogon poetry and it -becomes worse when using a chain of arrow operators. The solution is to use the operator overloading in C++. +becomes worse when using a chain of arrow operators. The solution is to use the operator overloading in C++. There is the type `ref` which wraps around a file offset to a data item of type `A`. The operators `->` and `*` are overloaded for the type and they do the necessary pointer arithmetics and type casts. This solves the problem with code readability but creates another problem. How do `->` and `*` know the address of the memory mapped area? Obviously, -`current_base` must be a global variable and there must be a way to initialize it. More specifically it must be thread local to allow different threads to +`current_base` must be a global variable and there must be a way to initialize it. More specifically it must be thread-local to allow different threads to work without collisions. -A database (a memory mapped file) in the runtime is represented by the type `DB`. Before any of the data in the database is accessed, the database must +A database (a memory-mapped file) in the runtime is represented by the type `DB`. 
Before any of the data in the database is accessed, the database must be brought into scope. Bringing into scope means that `current_base` is initialized to point to the mapping area for that database. After that any dereferencing of a reference will be done relative to the corresponding database. This is how scopes are defined: ```C++ @@ -68,7 +68,7 @@ of a reference will be done relative to the corresponding database. This is how } ``` Here `DB_scope` is a helper type and `db` is a pointer to the database that you want to bring into scope. The constructor for `DB_scope` saves the old value -for `current_base` and then sets it to point to the area of the given database. Conversely the destructor, restores the previous value. +for `current_base` and then sets it to point to the area of the given database. Conversely, the destructor restores the previous value. The use of `DB_scope` is reentrant, i.e. you can do this: ```C++ @@ -85,12 +85,12 @@ The use of `DB_scope` is reentrant, i.e. you can do this: What you can't do is to have more than one database in scope simultaneously. Fortunately, that is not needed. All API functions start a scope and the internals of the runtime always work with the current database in scope. -Note the flag `READER_SCOPE`. You can use either `READER_SCOPE` or `WRITER_SCOPE`. In addition to selecting the database, the DB_scope also enforces, -the single writer, multiple readers policy. The main problem is that a writer may have to enlarge the current file, which consequently may mean -that the kernel should relocate the mapping area to a new address. If there are readers at the same time, they way break since they expect that the mapped +Note the flag `READER_SCOPE`. You can use either `READER_SCOPE` or `WRITER_SCOPE`. In addition to selecting the database, the `DB_scope` also enforces +the single writer/multiple readers policy. 
The main problem is that a writer may have to enlarge the current file, which consequently may mean +that the kernel should relocate the mapping area to a new address. If there are readers at the same time, they may break since they expect that the mapped area is at a particular location. -# Developing Writers +# Developing writers There is one important complication when developing procedures modifying the database. Every call to `DB::malloc` may potentially have to enlarge the mapped area which sometimes leads to changing `current_base`. That would not have been a problem if GCC was not sometimes caching variables in registers. Look at the following code: @@ -98,7 +98,7 @@ which sometimes leads to changing `current_base`. That would not have been a pro p->r = foo(); ``` Here `p` is a reference which is used to access another reference `r`. On the other hand, `foo()` is a procedure which directly or indirectly calls `DB::malloc`. -GCC compiles assignments by first computing the address to modify, and then it evaluates the right hand side. This means that while `foo()` is beeing evaluated the address computed on the left-hand side is saved in a register or somewhere in the stack. But now, if it happens that the allocation in `foo()` has changed +GCC compiles assignments by first computing the address to modify, and then it evaluates the right hand side. This means that while `foo()` is being evaluated the address computed on the left-hand side is saved in a register or somewhere in the stack. But now, if it happens that the allocation in `foo()` has changed `current_base`, then the saved address is no longer valid. That first problem is solved by overloading the assignment operator for `ref`: @@ -108,29 +108,29 @@ ref& operator= (const ref& r) { return *this; } ``` -On a first sight, nothing special happens here and it looks like the overloading is redundant. However, now the assignments are compiled in a very different way. 
+At first sight, nothing special happens here and it looks like the overloading is redundant. However, now the assignments are compiled in a very different way. The overloaded operator is inlined, so there is no real method call and we don't get any overhead. The real difference is that now, whatever is on the left-hand side of the assignment becomes the value of the `this` pointer, and `this` is always the last thing to be evaluated in a method call. This solves the problem. `foo()` is evaluated first and if it changes `current_base`, the change will be taken into account when computing the left-hand side of the assignment. - + Unfortunately, this is not the only problem. A similar thing happens when the arguments of a function are calls to other functions. See this: ```C++ foo(p->r,bar(),q->r) ``` -Where now `bar()` is the function that do allocation. The compiler is free to keep in a register the value of `current_base` that it needs for the evaluation of +Where now `bar()` is the function that performs allocation. The compiler is free to keep in a register the value of `current_base` that it needs for the evaluation of `p->r`, while it evaluates `bar()`. But if `current_base` has changed, then the saved value would be invalid while computing `q->r`. There doesn't seem to be a work around for this. The only solution is to: - + **Never call a function that allocates as an argument to another function** - + Instead we call allocating functions on a separate line and we save the result in a temporary variable. - -# Thread Local Variables -A final remark is the compilation of thread local variables. When a thread local variable is compiled in a position dependent code, i.e. in executables, it is -compiled efficiently by using the fs register which points to the thread local segment. Unfortunately, that is not the case by default for shared -libraries like our runtime. 
In that case, GCC applies the global-dynamic model which means that access to a thread local variable is internally implemented -with a call to the function ´__tls_get_addr´. Since `current_base` is used all the time, this adds overhead. +# Thread-local variables + +A final remark is the compilation of thread-local variables. When a thread-local variable is compiled in position-dependent code, i.e. in executables, it is +compiled efficiently by using the `fs` register which points to the thread-local segment. Unfortunately, that is not the case by default for shared +libraries like our runtime. In that case, GCC applies the global-dynamic model which means that access to a thread-local variable is internally implemented +with a call to the function `__tls_get_addr`. Since `current_base` is used all the time, this adds overhead. The solution is to define the variable with the attribute `__attribute__((tls_model("initial-exec")))` which says that it should be treated as if it is defined in an executable. This removes the overhead, but adds the limitation that the runtime should not be loaded with `dlopen`. diff --git a/src/runtime/python/INSTALL b/src/runtime/python/INSTALL deleted file mode 100644 index 41b609638..000000000 --- a/src/runtime/python/INSTALL +++ /dev/null @@ -1,5 +0,0 @@ -You will need the python-devel package or similar. -You must have installed the PGF C runtime (see ../c/INSTALL) - -$ python setup.py build -$ sudo python setup.py install diff --git a/src/runtime/python/README.md b/src/runtime/python/README.md new file mode 100644 index 000000000..8878198d4 --- /dev/null +++ b/src/runtime/python/README.md @@ -0,0 +1,25 @@ +# Python bindings to C runtime + +## Prerequisites + +1. You must have installed the PGF C runtime (see `../c/README.md`) +2. 
You will need the system Python development package, e.g.: + - RedHat: `yum install python-devel` + - Debian: `apt install python-dev` + +## Installation + +```sh +python setup.py build +sudo python setup.py install +``` + +## Usage + +See: https://www.grammaticalframework.org/doc/runtime-api.html#python + +## Running tests + +```sh +python test.py +``` diff --git a/src/runtime/python/setup.py b/src/runtime/python/setup.py index b7ccb61e6..5a73e6a08 100644 --- a/src/runtime/python/setup.py +++ b/src/runtime/python/setup.py @@ -16,7 +16,7 @@ pgf_module = Extension('pgf', libraries = ['pgf']) setup (name = 'pgf', - version = '1.0', + version = '2.0', description = 'Python bindings to the Grammatical Framework\'s PGF runtime', long_description="""\ Grammatical Framework (GF) is a programming language for multilingual grammar applications. diff --git a/src/runtime/python/test.py b/src/runtime/python/test.py index 4c3374650..a814ae16e 100644 --- a/src/runtime/python/test.py +++ b/src/runtime/python/test.py @@ -1,65 +1,66 @@ import pgf import sys -import sets -import readline -import locale +# import sets +# import readline +# import locale sys.stdout.write("loading...") sys.stdout.flush(); -gr = pgf.readPGF("../../../treebanks/PennTreebank/ParseEngAbs.pgf") +# gr = pgf.readPGF("../../../treebanks/PennTreebank/ParseEngAbs.pgf") +gr = pgf.readPGF("../haskell/tests/basic.pgf") sys.stdout.write("\n") -source_lang = gr.languages["ParseEng"] -target_lang = gr.languages["ParseBul"] - -we = pgf.readExpr("UttImpSg PPos (ImpVP (UseV try_V))") -print source_lang.linearize(we) - -sys.stdout.write("start cat: "+gr.startCat+"\n\n") - -class Completer(): - def __init__(self, lang): - self.gr = lang - - def complete(self, prefix, state): - if state == 0: - line = readline.get_line_buffer() - line = line[0:readline.get_begidx()] - self.i = source_lang.complete(line, prefix=prefix) - self.tokens = sets.Set() - - if len(self.tokens) > 50: - return None - - while True: - try: - (p,t,c) = 
self.i.next() - if t not in self.tokens: - self.tokens.add(t) - return t - except StopIteration: - return None - -completer = Completer(source_lang) -readline.set_completer(completer.complete) -readline.parse_and_bind("tab: complete") -locale.setlocale(locale.LC_CTYPE, "") - -while True: - try: - line = raw_input("> "); - except EOFError: - sys.stdout.write("\n") - readline.set_completer(None) - break - except KeyboardInterrupt: - sys.stdout.write("\n") - readline.set_completer(None) - break - - try: - for (p,e) in source_lang.parse(line, n=1): - sys.stdout.write("["+str(p)+"] "+str(e)+"\n") - print target_lang.linearize(e) - except pgf.ParseError as e: - print e.message +# source_lang = gr.languages["ParseEng"] +# target_lang = gr.languages["ParseBul"] +# +# we = pgf.readExpr("UttImpSg PPos (ImpVP (UseV try_V))") +# print source_lang.linearize(we) +# +# sys.stdout.write("start cat: "+gr.startCat+"\n\n") +# +# class Completer(): +# def __init__(self, lang): +# self.gr = lang +# +# def complete(self, prefix, state): +# if state == 0: +# line = readline.get_line_buffer() +# line = line[0:readline.get_begidx()] +# self.i = source_lang.complete(line, prefix=prefix) +# self.tokens = sets.Set() +# +# if len(self.tokens) > 50: +# return None +# +# while True: +# try: +# (p,t,c) = self.i.next() +# if t not in self.tokens: +# self.tokens.add(t) +# return t +# except StopIteration: +# return None +# +# completer = Completer(source_lang) +# readline.set_completer(completer.complete) +# readline.parse_and_bind("tab: complete") +# locale.setlocale(locale.LC_CTYPE, "") +# +# while True: +# try: +# line = raw_input("> "); +# except EOFError: +# sys.stdout.write("\n") +# readline.set_completer(None) +# break +# except KeyboardInterrupt: +# sys.stdout.write("\n") +# readline.set_completer(None) +# break +# +# try: +# for (p,e) in source_lang.parse(line, n=1): +# sys.stdout.write("["+str(p)+"] "+str(e)+"\n") +# print target_lang.linearize(e) +# except pgf.ParseError as e: +# print 
e.message