1
0
forked from GitHub/gf-rgl
This commit is contained in:
Inari Listenmaa
2020-03-06 17:14:54 +01:00
16 changed files with 113593 additions and 2 deletions

View File

@@ -12,7 +12,7 @@ addons:
- ghc - ghc
before_install: before_install:
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update && brew install ghc@8.2 && export PATH="/usr/local/opt/ghc@8.2/bin:$PATH" ; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update && brew install ghc@8.6 && export PATH="/usr/local/opt/ghc@8.6/bin:$PATH" ; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then curl http://www.grammaticalframework.org/download/gf-3.9-bin-intel-mac.tar.gz > gf.tar.gz && sudo tar --no-same-owner --no-same-permissions -C /usr/local -zxf gf.tar.gz && rm gf.tar.gz; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then curl http://www.grammaticalframework.org/download/gf-3.9-bin-intel-mac.tar.gz > gf.tar.gz && sudo tar --no-same-owner --no-same-permissions -C /usr/local -zxf gf.tar.gz && rm gf.tar.gz; fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then curl http://www.grammaticalframework.org/download/gf_3.9.1-1_amd64-trusty.deb > gf.deb && sudo dpkg -i gf.deb && rm gf.deb ; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then curl http://www.grammaticalframework.org/download/gf_3.9.1-1_amd64-trusty.deb > gf.deb && sudo dpkg -i gf.deb && rm gf.deb ; fi
- if [[ "$TRAVIS_OS_NAME" == "windows" ]]; then choco install ghc --version=8.4.4 && export PATH="/c/ProgramData/chocolatey/lib/ghc/tools/ghc-8.4.4/bin:$PATH"; fi - if [[ "$TRAVIS_OS_NAME" == "windows" ]]; then choco install ghc --version=8.4.4 && export PATH="/c/ProgramData/chocolatey/lib/ghc/tools/ghc-8.4.4/bin:$PATH"; fi

View File

@@ -6,7 +6,7 @@
The GF Resource Grammar Library is the standard library for Grammatical Framework. It covers the morphology and basic syntax of over 30 languages. The GF Resource Grammar Library is the standard library for Grammatical Framework. It covers the morphology and basic syntax of over 30 languages.
For more about the RGL, see the [synopsis page](http://www.grammaticalframework.org/lib/doc/synopsis.html). For more about the RGL, see the [synopsis page](http://www.grammaticalframework.org/lib/doc/synopsis/).
## Choose your build method ## Choose your build method

View File

@@ -0,0 +1,142 @@
module Main where
import PGF
import qualified Data.Map as M
import Data.Char
import Data.List
import System.Environment (getArgs)
-- AR 2020-02-28
-- making a word list purely morphological, i.e.
-- - functions are 1-to-1 with lemgrams, i.e.
-- - no sense distinctions
-- - no subcategorizations
-- - no variants
-- - functionname = baseform_category, with exceptions
-- - variant inflection tables: lie_1_V, lie_2_V
-- - words that have non-ident characters: 'bird\'s-eye_A'
-- - words that start with non-letters: W_'tween_Adv
-- example:
-- gf -make ../english/DictEng.gf
-- runghc MkMorphodict.hs DictEngAbs.pgf MorphoDictEng
-- 64923 -> 56599 functions
-- | Command-line synopsis, reported when too few arguments are given.
usage :: String
usage = "MkMorphodict <pgf> <outfile>"

-- | Entry point: @MkMorphodict <pgf> <outfile>@.
--
-- Reads the PGF file, @<outfile>.config@, @<outfile>Abs.header@ and
-- @<outfile>.header@, and writes @<outfile>Abs.gf@ and @<outfile>.gf@.
-- Each output is the header, followed by the generated rules, followed by
-- the closing brace of the GF module.
--
-- Fails with an 'IOError' carrying the usage line when fewer than two
-- arguments are given, and with an explanatory 'IOError' when the PGF
-- contains no concrete syntax.
main :: IO ()
main = do
  args <- getArgs
  case args of
    pgfile : outfile : _ -> run pgfile outfile
    _ -> ioError (userError ("usage: " ++ usage))
  where
    run pgfile outfile = do
      gr <- readPGF pgfile
      cfg <- mkConfig <$> readFile (outfile ++ ".config")
      -- The first concrete syntax of the PGF is the one that is linearized.
      cnc <- case languages gr of
        l : _ -> return l
        [] -> ioError (userError (pgfile ++ ": no concrete syntax found"))
      let (absrules, cncrules) = mkMorphoDict (MDEnv gr cfg cnc)
      absheader <- readFile (outfile ++ "Abs.header")
      cncheader <- readFile (outfile ++ ".header")
      writeFile (outfile ++ "Abs.gf") (absheader ++ unlines absrules ++ "}")
      writeFile (outfile ++ ".gf") (cncheader ++ unlines cncrules ++ "}")
-- | A GF category identifier.
type Cat = CId

-- | Name of the paradigm function that heads every generated @lin@ rule.
type Oper = String

-- | For each source category: the category of the generated function, the
-- paradigm to apply, and the indices (into the tabular linearization) of the
-- forms passed to that paradigm. The first index is used as the base form.
type Config = M.Map Cat (Cat, Oper, [Int])

-- | Everything 'mkMorphoDict' needs to read from.
data MDEnv = MDEnv
  { pgf :: PGF -- ^ the compiled source dictionary
  , config :: Config -- ^ per-category extraction settings
  , lang :: Language -- ^ the concrete syntax whose forms are extracted
  }
-- | Parse the contents of a @.config@ file.
--
-- A configuration line has the shape
--
-- > <cat> : <target cat> <oper> <index> <index> ...
--
-- Lines whose first word is @--@, and lines of any other shape, are ignored.
--
-- A line without indices, or with an index that is not an integer, yields an
-- entry whose index list raises an error naming the offending line as soon as
-- it is used (instead of an anonymous @read@ or @head@ failure).
mkConfig :: String -> Config
mkConfig ls = M.fromList [entry | Just entry <- map parseLine (lines ls)]
  where
    parseLine line = case words line of
      "--" : _ -> Nothing
      cat : ":" : tcat : oper : ints ->
        Just (mkCId cat, (mkCId tcat, oper, parseIndices line ints))
      _ -> Nothing

    parseIndices line [] =
      error ("mkConfig: no form indices in config line: " ++ line)
    parseIndices line ws = map (parseIndex line) ws

    -- Accept a word only if it is an integer with nothing left over.
    parseIndex line w = case reads w of
      [(i, "")] -> i
      _ ->
        error
          ("mkConfig: bad form index " ++ show w ++ " in config line: " ++ line)
-- | Build the rule lines of the morphological dictionary.
--
-- The result is a pair of parallel lists: the @fun@ judgements for the
-- abstract syntax and the @lin@ rules for the concrete syntax, both as
-- ready-to-print lines of the form @fun name : Cat ;@ and
-- @lin name = oper "form" ... ;@.
--
-- Pipeline: collect one candidate per tabular linearization of every
-- configured category, drop candidates with identical category and forms,
-- give the survivors function names, and finally annotate entries whose
-- forms end in the forms of a preceding entry with a trailing
-- @; -- compound f@ or @; -- notcompound f@ remark.
mkMorphoDict :: MDEnv -> ([String], [String])
mkMorphoDict env =
  unzip
    . map splitRule
    . findCompounds
    . nameFunctions
    . mergeRules
    $ concatMap findRules cats
  where
    -- Render one named entry as its abstract and concrete rule line.
    splitRule (fun, (cat, lin)) =
      ( unwords ["fun", fun, ":", showCId cat, ";"]
      , unwords ["lin", fun, "=", unwords lin, ";"]
      )

    -- The source categories listed in the configuration.
    cats = M.keys (config env)

    -- One candidate per tabular linearization of each function of the
    -- category, applied to no arguments. The provisional name is
    -- [base form, target category]; the rule is the target category with the
    -- paradigm followed by the selected forms.
    findRules cat =
      [ ( [snd (table !! head ixs), showCId target] --- head ixs is the base form in smart paradigms
        , (target, oper : appSig ixs (map snd table))
        )
      | fun <- functionsByCat (pgf env) cat
      , table <- tabularLinearizes (pgf env) (lang env) (mkApp fun []) -- [[(String, String)]]
      , Just (target, oper, ixs) <- [M.lookup cat (config env)]
      ]

    -- Select the forms at the configured positions.
    appSig ixs forms = map (forms !!) ixs

    -- Keep the first of each run of entries with equal category and forms.
    mergeRules rules = map head (groupBy sameRule (sortOn snd rules))
      where
        sameRule x y = snd x == snd y

    nameFunctions rules = expandNames (sortOn fst rules)

    -- A provisional name that occurs once is used as it is; entries sharing
    -- a name are numbered.
    expandNames [] = []
    expandNames ((name, rule) : rest) =
      case span ((== name) . fst) rest of
        ([], _) -> (mkFun name, rule) : expandNames rest
        (same, others) -> renames ((name, rule) : same) ++ expandNames others

    -- Insert a running number before the last name component.
    renames clashing =
      [ (mkFun (init name ++ [show i, last name]), rule)
      | (i, (name, rule)) <- zip [1 ..] clashing
      ]

    findCompounds rules = getCompounds (sortOn sortKey rules)

    -- Sort key: the category, then the last characters of all forms, then
    -- the second-to-last characters, and so on, up to the shortest form.
    sortKey (_, (cat, _ : forms)) =
      (cat, [map (!! i) reversed | i <- [0 .. minimum (map length reversed) - 1]])
      where
        reversed = map reverse forms

    -- The first form, reversed.
    revstem (_, (_, _ : forms)) = head (map reverse forms)

    -- The forms of an entry, without the paradigm name.
    wforms (_, (_, _ : forms)) = forms

    -- Walk the sorted entries. An entry whose first form has fewer than two
    -- characters never serves as a compound head. Otherwise the directly
    -- following entries whose forms all end in the corresponding forms of
    -- this entry are annotated relative to it.
    getCompounds [] = []
    getCompounds (entry : rest)
      | length (revstem entry) < 2 = markWith entry [] : getCompounds rest
      | otherwise =
          markWith entry [] : map (markCompound entry) longer ++ getCompounds others
      where
        (longer, others) = span (endsInFormsOf entry) rest

    endsInFormsOf base other =
      and [w `isSuffixOf` w1 | (w, w1) <- zip (wforms base) (wforms other)]

    -- Annotate an entry as a compound or non-compound of the base entry.
    markCompound base entry = markWith entry [";", "--", verdict, fst base]
      where
        verdict
          | and
              [ isPrefixWord (reverse w) (reverse w1)
              | (w, w1) <- zip (wforms base) (wforms entry)
              ] =
              "compound"
          | otherwise = "notcompound"

    -- Put the forms in double quotes and append the extra tokens.
    markWith (fun, (cat, oper : forms)) extra =
      (fun, (cat, oper : map quote forms ++ extra))
-- | @isPrefixWord x xy@ holds when @x@ is a prefix of @xy@ and the remainder
-- of @xy@ is longer than one character and contains a hyphen, a digit or one
-- of the listed vowels.
--
-- 'mkMorphoDict' calls it on reversed words, so that @x@ is the reversed
-- candidate head and the remainder is the reversed rest of the word.
isPrefixWord :: String -> String -> Bool
isPrefixWord x xy = case stripPrefix x xy of
  Just suff -> length suff > 1 && any (`elem` "-0123456789aeiouyåäö") suff
  Nothing -> False
-- | Join the name components with underscores and make the result usable
-- as a function name (see 'quoteIf').
mkFun :: [String] -> String
mkFun parts = quoteIf (intercalate "_" parts)
-- | Turn a candidate name into a function name.
--
-- * If it contains a character that is neither alphanumeric nor one of
--   @_@ and @'@, it is wrapped in single quotes, with every single quote
--   inside it preceded by a backslash.
-- * Otherwise, if it starts with a non-letter, it is prefixed with @W_@.
-- * Otherwise it is returned unchanged.
quoteIf :: String -> String
quoteIf s
  | any isOddChar s = "'" ++ concatMap escapeQuote s ++ "'"
  | beginsWithNonLetter s = "W_" ++ s
  | otherwise = s
  where
    isOddChar c = not (isAlphaNum c || elem c "_'")

    beginsWithNonLetter (c : _) = not (isAlpha c)
    beginsWithNonLetter [] = False

    escapeQuote '\'' = "\\\'"
    escapeQuote c = [c]
-- | Wrap a string in double quotes. The string itself is not escaped.
quote :: String -> String
quote s = concat ["\"", s, "\""]

View File

@@ -0,0 +1,8 @@
N : N mkN 0 2
A : A mkA 0 2 4 6
V : V mkV 0 4 2
V2 : V mkV 0 4 2
Adv : Adv mkAdv 0
Prep : Prep mkPrep 0

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,7 @@
concrete MorphoDictEng of MorphoDictEngAbs =
CatEng [N,A,V,Adv,Prep] **
open
ParadigmsEng
in
{

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,4 @@
abstract MorphoDictEngAbs =
Cat [N,A,V,Adv,Prep] **
{

View File

@@ -0,0 +1,8 @@
N : N mkN 0 2 4 6
A : A mkA 0 2 4 10 12
V : V mkV 6 0 4 2 8 10
V2 : V mkV 6 0 4 2 8 10
Adv : Adv mkAdv 0
Prep : Prep mkPrep 0
PN : PN mkPN 0

View File

@@ -0,0 +1,8 @@
concrete MorphoDictSwe of MorphoDictSweAbs =
CatSwe [N,A,V,Adv,Prep,PN] **
open
ParadigmsSwe
in
{

View File

@@ -0,0 +1,4 @@
abstract MorphoDictSweAbs =
Cat [N,A,V,Adv,Prep,PN] **
{

38
src/morphodict/README Normal file
View File

@@ -0,0 +1,38 @@
MkMorphoDict: Extracting a minimal morphological dictionary from an existing GF dictionary.
Aarne Ranta 2020-03-02
principles:
There should be a single source for each lemgram (i.e. inflection table of a word)
Function names should be easy to guess: baseform_Category (but avoiding accidental errors if this is not a unique key)
Hence,
Functions are 1-to-1 with lemgrams, i.e. inflection tables, thus
- no sense distinctions
- no subcategorizations
- no variants
Functionname = baseform_category, with exceptions
- same baseform_Category, different inflection tables: lie_1_V, lie_2_V
- words that have non-ident characters: 'bird\'s-eye_A'
- words that start with non-letters: W_'tween_Adv
Example run, English:
gf -make ../english/DictEng.gf
runghc MkMorphodict.hs DictEngAbs.pgf MorphoDictEng
Result: 64923 -> 56599 functions, of which 21679 could be compounds
Swedish, using a dump of SALDO (not available in these sources)
cd saldo/
runghc SaldoGF.hs
# combine abs.tmp with Saldo.header to obtain Saldo.gf
# combine cnc.tmp with SaldoSwe.header to obtain SaldoSwe.gf
gf -make SaldoSwe.gf
cd ..
runghc MkMorphodict.hs saldo/Saldo.pgf MorphoDictSwe

View File

@@ -0,0 +1 @@
abstract Saldo = Cat [N,A,V,PN,Adv,Prep] ** {

View File

@@ -0,0 +1,97 @@
import Data.List
import qualified Data.Map as M
-- AR 2020-03-03
-- generating GF from preprocessed SALDO (of type Lex by John Camilleri)
-- | Read the preprocessed SALDO dump @saldom.hsdump@ and write the generated
-- @fun@ rules to @abs.tmp@ and the @lin@ rules to @cnc.tmp@.
--
-- The generated files still need headers: according to the README, abs.tmp is
-- combined with Saldo.header and cnc.tmp with SaldoSwe.header.
main :: IO ()
main = do
  dump <- readFile "saldom.hsdump" -- this is the preprocessed file
  let (funs, lins) = unzip (map (mkRules . treatNone) (mkFuns (readLex dump)))
  writeFile "abs.tmp" (unlines funs)
  writeFile "cnc.tmp" (unlines lins)
-- JC's datatypes, using String for simplicity
-- | The lexicon type of the preprocessing step; the key is the lemgram ID.
type Lex = M.Map String Entry

-- | Pairs of morphological tag and surface form.
type Table = [(String, String)]

-- | One lexicon entry. The derived 'Read' instance is what parses the dump,
-- so constructor and field names must stay as they are.
data Entry = E
  { ePOS :: String -- ^ part-of-speech tag, as matched by 'formSpec'
  , eTable :: Table -- ^ morphological tags to surface form: ("sg def gen" ,"killens")
  }
  deriving (Show, Read)
-- | Parse the dump into a list of (key, entry) pairs.
--
-- The first 8 characters of the input are discarded before parsing —
-- presumably the @fromList@ prefix of a shown 'M.Map'; TODO confirm against
-- the dump file. Input that does not parse makes 'read' fail.
readLex :: String -> [(String, Entry)]
readLex dump = read (drop 8 dump)
-- new code by AR
-- | Render a (function, category, linearization) triple as its @fun@ and
-- @lin@ rule lines.
--
-- When the linearization still contains the word @"NONE"@ (in double
-- quotes), both lines are prefixed with @--n@, which comments them out.
mkRules :: (String, String, String) -> (String, String)
mkRules (fun, cat, lin) =
  ( render ["fun", fun, ":", cat, ";"]
  , render ["lin", fun, "=", lin, ";"]
  )
  where
    hasNone = "\"NONE\"" `elem` words lin

    render ws
      | hasNone = unwords ("--n" : ws)
      | otherwise = unwords ws
-- converting incomplete paradigms to special mkC constructors, defined in SaldoSwe.header
-- | Replace the linearization of an incomplete paradigm by a call to one of
-- the special constructors, depending on the category and on which argument
-- positions hold @"NONE"@. Entries matching none of the patterns are
-- returned unchanged. Two of the noun patterns change the category to @PN@.
--
-- NOTE: arguments are found by splitting the linearization with 'words', so
-- a form that contains a space occupies more than one position.
treatNone :: (a, String, String) -> (a, String, String)
treatNone entry@(f, cat, lin) =
  case special cat (drop 1 (words lin)) of
    Just (newCat, oper, args) -> (f, newCat, unwords (oper : args))
    Nothing -> entry
  where
    -- Clauses are tried from top to bottom; the first match wins.
    special "V" ("\"NONE\"" : "\"NONE\"" : v : _) = Just ("V", "mkVDep", [v])
    special "V" (i : d : p : a : b : "\"NONE\"" : _) = Just ("V", "mkVIntr", [i, d, p, a, b])
    special "A" (i : "\"NONE\"" : p : c : s : _) = Just ("A", "mkAUtr", [i, p, c, s])
    special "A" (i : d : p : "\"NONE\"" : "\"NONE\"" : _) = Just ("A", "mkAComp", [i, d, p])
    special "N" ("\"NONE\"" : d : "\"NONE\"" : _) = Just ("PN", "mkPNDef", [d]) ---
    special "N" (i : "\"NONE\"" : "\"NONE\"" : _) = Just ("PN", "mkPNIndef", [i])
    special "N" (i : d : "\"NONE\"" : "\"NONE\"" : _) = Just ("N", "mkNSg", [i, d])
    special "N" ("\"NONE\"" : "\"NONE\"" : i : d : _) = Just ("N", "mkNPl", [i, d])
    special _ _ = Nothing
--- function names are simply numbered (w1000000, w1000001, ...): the result is fed to ../MkMorphodict.hs anyway, which assigns the final names
-- | Turn every entry of the lexicon into its linearizations and give each a
-- function name @w<number>@, numbering from 1000000 upwards. The keys of the
-- lexicon are not used.
mkFuns :: [(a, Entry)] -> [(String, String, String)]
mkFuns lx = zipWith name [1000000 :: Integer ..] (concatMap (entry2lin . snd) lx)
  where
    name i (cat, lin) = ("w" ++ show i, cat, lin)
-- | The linearizations of one entry, each paired with its category.
--
-- For every characteristic form of the entry's part of speech, the distinct
-- surface forms carrying that tag are collected; 'manyTables' turns these
-- sets of variants into tables, and each table becomes @mk<Cat>@ applied to
-- its double-quoted forms.
entry2lin :: Entry -> [(String, String)]
entry2lin e = map (\ws -> (cat, mkLin cat ws)) (manyTables valuess)
  where
    (cat, forms) = formSpec (ePOS e)

    valuess = map variantsOf forms

    variantsOf tag = nub [v | (t, v) <- eTable e, t == tag]

    mkLin c ws = unwords (("mk" ++ c) : map (\w -> "\"" ++ w ++ "\"") ws)
-- looking for the characteristic forms for each POS
-- | For a part-of-speech tag, the GF category and the morphological tags of
-- the forms passed to its paradigm, in argument order.
--
-- Any other tag yields a placeholder category whose single placeholder tag
-- is the literal string @NONE++pos@.
formSpec :: String -> (String, [String])
formSpec "nn" =
  ( "N"
  , [ "sg indef nom"
    , "sg def nom"
    , "pl indef nom"
    , "pl def nom"
    ]
  )
formSpec "av" =
  ( "A"
  , [ "pos indef sg u nom"
    , "pos indef sg n nom"
    , "pos indef pl nom"
    , "komp nom"
    , "super indef nom"
    ]
  )
formSpec "vb" =
  ( "V"
  , [ "inf aktiv"
    , "pres ind aktiv"
    , "imper"
    , "pret ind aktiv"
    , "sup aktiv"
    , "pret_part indef sg u nom"
    ]
  )
formSpec "ab" =
  ( "Adv"
  , [ "invar"
    ---- "pos"
    ]
  )
formSpec "pp" = ("Prep", ["invar"])
formSpec _ = ("NONE++pos", ["NONE++pos"]) -- ignoring other POS tags, which are rare anyway
-- trying to generate a small number of tables from sets of variant forms; seems to work well enough
-- | Generate a small number of tables from sets of variant forms.
--
-- The argument holds, per position, the variants found for that position.
-- As many tables are produced as the largest set has variants. Table @i@
-- takes the @i@-th variant at every position; a position with fewer variants
-- repeats its first one, and a position without any variant yields @NONE@.
--
-- No table is produced when no position has a variant, or when there are no
-- positions at all (the latter used to crash in 'maximum').
manyTables :: [[String]] -> [[String]]
manyTables formss = [map ((!! i) . pad) formss | i <- [0 .. width - 1]]
  where
    -- The leading 0 keeps 'maximum' total on an empty argument.
    width = maximum (0 : map length formss)

    pad [] = repeat "NONE"
    pad forms@(first : _) = forms ++ repeat first

View File

@@ -0,0 +1,50 @@
concrete SaldoSwe of Saldo =
CatSwe [N,A,V,Adv,Prep,PN] **
open
ParadigmsSwe, Prelude
in
{
-- to deal with incomplete paradigms
--- the values could be in special categories to avoid overgeneration
oper
-- Verb built from a single form ending in "s": the ending is rewritten
-- ("as" -> "a", "es" -> "er", any other "s" -> "er") and the result is given
-- to mkV, then to depV. A form not ending in "s" raises Predef.error.
-- NOTE(review): depV presumably comes from ParadigmsSwe and marks the verb
-- as deponent - confirm.
mkVDep : Str -> V
= \v -> case v of {
x + "as" => depV (mkV (x + "a")) ;
x + "es" => depV (mkV (x + "er")) ;
x + "s" => depV (mkV (x + "er")) ;
_ => Predef.error (v ++ "not for mkVDep")
} ;
-- Verb from five forms, of which only the first, fourth and fifth are
-- passed on to mkV; the second and third arguments are ignored.
mkVIntr : (_,_,_,_,_ : Str) -> V
= \i,_,_,p,pt -> mkV i p pt ; ---
-- Adjective from four forms: the first form is passed twice to the
-- five-argument mkA, standing in for the missing second form.
mkAUtr : (_,_,_,_ : Str) -> A
= \u,p,c,s -> mkA u u p c s ; ---
-- Adjective from three forms: only the first two are used (via mkA and
-- compoundA); the third argument is not used.
mkAComp : (_,_,_ : Str) -> A
= \u,n,p -> compoundA (mkA u n) ; ---
-- Proper name from a single form; the gender is guessed from the ending:
-- utrum if the form ends in "n", neutrum otherwise.
mkPNDef : Str -> PN
= \s -> case s of {
_ + "n" => mkPN s utrum ;
_ => mkPN s neutrum
} ;
-- Proper name from a single form, always with gender neutrum.
mkPNIndef : Str -> PN
= \s -> mkPN s neutrum ; ---
-- Noun from two forms: the first is passed to mkN, the second only decides
-- the gender (utrum if it ends in "n", neutrum otherwise).
mkNSg : (_,_ : Str) -> N
= \i,d -> case d of {
_ + "n" => mkN i utrum ; ---
_ => mkN i neutrum ---
} ;
-- Noun from two forms, of which only the first is inspected; the second
-- argument is not used. The first argument of mkN is derived from the ending:
-- "or" -> stem + "a" (one-argument mkN), "ar"/"er" -> the bare stem,
-- "en" -> stem + "e", anything else -> the form itself.
mkNPl : (_,_ : Str) -> N
= \i,d -> case i of {
s + "or" => mkN (s + "a") ; ---
s + ("ar"|"er") => mkN s i ; ---
s + "en" => mkN (s + "e") i ; ---
_ => mkN i i ---
} ;

13
src/swedish/README.md Normal file
View File

@@ -0,0 +1,13 @@
# Swedish
## Language info
- English name: Swedish
- Autonym: Svenska
- ISO code: Swe
## Dictionaries
- `OldDictSwe`: Converted from SALDO using [this code](https://github.com/MalinAhlberg/SwedishProject/tree/master/saldo) in 2011.
- `NewDictSwe`: Re-import from SALDO using [this code](https://github.com/DigitalGrammarsAB/SALDOtoGF/tree/a45e503a824ded39844df2aeeb7a6ee891e3bee1) in 2018, with more words and different identifier structure.
- `DictSwe`: The union of `OldDictSwe` and `NewDictSwe`.