forked from GitHub/gf-rgl
Merge branch 'master' of https://github.com/GrammaticalFramework/gf-rgl into polish
This commit is contained in:
@@ -12,7 +12,7 @@ addons:
|
||||
- ghc
|
||||
|
||||
before_install:
|
||||
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update && brew install ghc@8.2 && export PATH="/usr/local/opt/ghc@8.2/bin:$PATH" ; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update && brew install ghc@8.6 && export PATH="/usr/local/opt/ghc@8.6/bin:$PATH" ; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then curl http://www.grammaticalframework.org/download/gf-3.9-bin-intel-mac.tar.gz > gf.tar.gz && sudo tar --no-same-owner --no-same-permissions -C /usr/local -zxf gf.tar.gz && rm gf.tar.gz; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then curl http://www.grammaticalframework.org/download/gf_3.9.1-1_amd64-trusty.deb > gf.deb && sudo dpkg -i gf.deb && rm gf.deb ; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" == "windows" ]]; then choco install ghc --version=8.4.4 && export PATH="/c/ProgramData/chocolatey/lib/ghc/tools/ghc-8.4.4/bin:$PATH"; fi
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
|
||||
The GF Resource Grammar Library is the standard library for Grammatical Framework. It covers the morphology and basic syntax of over 30 languages.
|
||||
|
||||
For more about the RGL, see the [synopsis page](http://www.grammaticalframework.org/lib/doc/synopsis.html).
|
||||
For more about the RGL, see the [synopsis page](http://www.grammaticalframework.org/lib/doc/synopsis/).
|
||||
|
||||
## Choose your build method
|
||||
|
||||
|
||||
142
src/morphodict/MkMorphodict.hs
Normal file
142
src/morphodict/MkMorphodict.hs
Normal file
@@ -0,0 +1,142 @@
|
||||
module Main where
|
||||
|
||||
import PGF
|
||||
|
||||
import qualified Data.Map as M
|
||||
import Data.Char
|
||||
import Data.List
|
||||
import System.Environment (getArgs)
|
||||
|
||||
-- AR 2020-02-28
|
||||
|
||||
-- making a word list purely morphological, i.e.
|
||||
-- - functions are 1-to-1 with lemgrams, i.e.
|
||||
-- - no sense distinctions
|
||||
-- - no subcategorizations
|
||||
-- - no variants
|
||||
-- - functionname = baseform_category, with exceptions
|
||||
-- - variant inflection tables: lie_1_V, lie_2_V
|
||||
-- - words that have non-ident characters: 'bird\'s-eye_A'
|
||||
-- - words that start with non-letters: W_'tween_Adv
|
||||
|
||||
-- example:
|
||||
-- gf -make ../english/DictEng.gf
|
||||
-- runghc MkMorphodict.hs DictEngAbs.pgf MorphoDictEng
|
||||
-- 64923 -> 56599 functions
|
||||
|
||||
-- | Command-line synopsis, shown when too few arguments are given.
usage :: String
usage = "MkMorphodict <pgf> <outfile>"

-- | Read a compiled dictionary (@\<pgf\>@) and the files @\<outfile\>.config@,
-- @\<outfile\>Abs.header@ and @\<outfile\>.header@, then write
-- @\<outfile\>Abs.gf@ and @\<outfile\>.gf@: each header followed by the
-- generated rules and a closing brace.
--
-- Fails with a usage message when fewer than two arguments are given, and
-- with an explicit error when the PGF contains no concrete syntax.
main :: IO ()
main = do
  args <- getArgs
  case args of
    pgfile : outfile : _ -> generate pgfile outfile
    _ -> ioError (userError ("usage: " ++ usage))
  where
    generate pgfile outfile = do
      gr <- readPGF pgfile
      conf <- mkConfig <$> readFile (outfile ++ ".config")
      case languages gr of
        [] -> ioError (userError (pgfile ++ ": no concrete syntax in PGF"))
        -- only the first concrete syntax of the PGF is used
        cnc : _ -> do
          let (absrules, cncrules) = mkMorphoDict (MDEnv gr conf cnc)
          absheader <- readFile (outfile ++ "Abs.header")
          cncheader <- readFile (outfile ++ ".header")
          -- the headers leave the module body open, hence the final "}"
          writeFile (outfile ++ "Abs.gf") (absheader ++ unlines absrules ++ "}")
          writeFile (outfile ++ ".gf") (cncheader ++ unlines cncrules ++ "}")
|
||||
|
||||
|
||||
-- | A GF category name.
type Cat = CId

-- | Name of the operation placed at the head of each generated @lin@ rule.
type Oper = String

-- | Settings read from the @.config@ file, keyed by source category:
-- (target category, operation, positions of the forms to pick from the
-- tabular linearization).
type Config = M.Map Cat (Cat, Oper, [Int])

-- | Everything 'mkMorphoDict' needs to know about its input.
data MDEnv = MDEnv
  { pgf    :: PGF       -- ^ the compiled source dictionary
  , config :: Config    -- ^ per-category extraction settings
  , lang   :: Language  -- ^ the concrete syntax to linearize with
  }
|
||||
|
||||
-- | Parse the contents of a @.config@ file.
--
-- Each significant line has the shape
--
-- > <source cat> : <target cat> <oper> <index> <index> ...
--
-- Lines whose first non-blank characters are @--@ are comments; lines of any
-- other shape are ignored. A form index that is not a non-negative integer
-- raises an error naming the offending line (when that entry is first used).
mkConfig :: String -> Config
mkConfig = M.fromList . concatMap parseLine . lines
  where
    parseLine s
      | "--" `isPrefixOf` dropWhile isSpace s = []
      | otherwise = case words s of
          cat : ":" : tcat : oper : ints ->
            [(mkCId cat, (mkCId tcat, oper, map (parseIndex s) ints))]
          _ -> []

    parseIndex s w = case reads w of
      [(i, "")] | i >= 0 -> i
      _ -> error ("MkMorphodict: bad form index " ++ show w ++ " in config line: " ++ s)
|
||||
|
||||
-- | Build the abstract (@fun@) and concrete (@lin@) rule lines of the
-- morphological dictionary, in matching order.
--
-- Pipeline: collect candidate rules for every configured category, drop
-- rules with duplicate right-hand sides, assign function names, mark
-- possible compounds, and finally render each rule as a pair of lines.
mkMorphoDict :: MDEnv -> ([String],[String])
mkMorphoDict env =
    unzip
  . map splitRule
  . findCompounds
  . nameFunctions
  . mergeRules
  $ concatMap findRules cats
  where
    -- render one rule as its "fun" line and its "lin" line
    splitRule (fun, (cat, lin)) =
      ( unwords ["fun", fun, ":", showCId cat, ";"]
      , unwords ["lin", fun, "=", unwords lin, ";"]
      )

    -- the source categories listed in the config (map keys are already unique)
    cats = M.keys (config env)

    -- one candidate per linearization table of every function in the category;
    -- the provisional name is [form at the first configured index, target category]
    findRules cat = case M.lookup cat (config env) of
      Nothing -> []
      Just (c, op, ints) ->
        [ ([snd (lin !! head ints), showCId c], (c, op : appSig ints (map snd lin))) --- head ints is the base form in smart paradigms
        | f <- functionsByCat (pgf env) cat
        , lin <- tabularLinearizes (pgf env) (lang env) (mkApp f []) -- [[(String, String)]]
        ]
|
||||
|
||||
-- | Pick the forms at the given 0-based positions, in the order the
-- positions are listed.
appSig :: [Int] -> [a] -> [a]
appSig ints forms = map (forms !!) ints
|
||||
|
||||
-- | Sort the rules by right-hand side and keep only the first rule of every
-- run that shares the same right-hand side.
mergeRules :: Ord b => [(a, b)] -> [(a, b)]
mergeRules rules = [firstOfRun | firstOfRun : _ <- groupBy sameRhs (sortOn snd rules)]
  where
    sameRhs x y = snd x == snd y
|
||||
|
||||
-- | Turn provisional names (lists of name parts) into final function names.
-- Rules are sorted by provisional name so that clashing names are adjacent.
nameFunctions :: [([String], b)] -> [(String, b)]
nameFunctions = expandNames . sortOn fst

-- | Name every run of adjacent rules that share a provisional name: a
-- singleton gets the plain name, a longer run gets numbered names.
expandNames :: [([String], b)] -> [(String, b)]
expandNames = concatMap nameRun . groupBy (\x y -> fst x == fst y)
  where
    nameRun [(f, l)] = [(mkFun f, l)]
    nameRun run = renames run

-- | Number clashing rules by inserting 1, 2, ... before the last name part,
-- e.g. lie_1_V, lie_2_V.
renames :: [([String], b)] -> [(String, b)]
renames = zipWith numbered [1 :: Integer ..]
  where
    numbered i (f, l) = (mkFun (init f ++ [show i, last f]), l)
|
||||
|
||||
-- | Sort the rules so that words sharing word endings become neighbours,
-- then let 'getCompounds' annotate them.
findCompounds :: Ord c => [(String, (c, [String]))] -> [(String, (c, [String]))]
findCompounds rules = getCompounds (sortOn cat_orthrevforms rules)

-- | Sort key: the category, then the reversed forms read column by column
-- (all last characters, all second-to-last characters, ...), cut off at the
-- length of the shortest form.
cat_orthrevforms :: (a, (c, [[b]])) -> (c, [[b]])
cat_orthrevforms rule = (cat, [[w !! i | w <- revs] | i <- [0 .. shortest - 1]])
  where
    (cat, revs) = cat_revforms rule
    shortest = minimum (map length revs)

-- | The category together with every form reversed; the leading operation
-- name of the rule body is dropped.
cat_revforms :: (a, (c, [[b]])) -> (c, [[b]])
cat_revforms (_, (cat, _ : forms)) = (cat, map reverse forms)

-- | The first form of a rule, reversed.
revstem :: (a, (c, [[b]])) -> [b]
revstem rule = head (snd (cat_revforms rule))

-- | The forms of a rule, without the leading operation name.
wforms :: (a, (c, [f])) -> [f]
wforms (_, (_, _ : forms)) = forms
|
||||
|
||||
-- | Walk the sorted rules. Every rule is emitted with its forms quoted.
-- When the first form of a rule has at least two characters, the rules
-- directly following it whose forms all end in the corresponding forms of
-- that rule are passed to 'markCompound' with that rule as the head.
getCompounds :: [(String, (c, [String]))] -> [(String, (c, [String]))]
getCompounds [] = []
getCompounds (fl : rest)
  | length (revstem fl) < 2 = markWith fl [] : getCompounds rest
  | otherwise = markWith fl [] : map (markCompound fl) candidates ++ getCompounds others
  where
    (candidates, others) = span endsInHead rest
    -- every form of x ends in the corresponding form of fl
    endsInHead x = and (zipWith isSuffixOf (wforms fl) (wforms x))
|
||||
|
||||
-- | Append a trailing GF comment to rule @fl1@ saying whether it looks like
-- a compound of the head rule @fl@ (judged form by form with 'isPrefixWord'
-- on the reversed words), followed by the name of the head rule.
markCompound :: (String, (c, [String])) -> (String, (c, [String])) -> (String, (c, [String]))
markCompound fl fl1 = markWith fl1 [";", "--", verdict, fst fl]
  where
    verdict
      | and (zipWith compoundOf (wforms fl) (wforms fl1)) = "compound"
      | otherwise = "notcompound"
    compoundOf w w1 = isPrefixWord (reverse w) (reverse w1)

-- | Put every form of the rule in double quotes (the leading operation name
-- stays bare) and append the extra tokens @xs@.
markWith :: (a, (c, [String])) -> [String] -> (a, (c, [String]))
markWith (f, (c, op : ws)) xs = (f, (c, op : (map quote ws ++ xs)))
|
||||
|
||||
-- | @isPrefixWord x xy@ holds when @xy@ starts with @x@ and what follows
-- @x@ is more than one character long and contains a hyphen, a digit or one
-- of the listed vowels.
isPrefixWord :: String -> String -> Bool
isPrefixWord x xy = and [longEnough, hasVowelLike, x `isPrefixOf` xy]
  where
    rest = drop (length x) xy
    longEnough = not (null (drop 1 rest))
    hasVowelLike = any (`elem` "-0123456789aeiouyåäö") rest
|
||||
|
||||
-- | Join the name parts with underscores and make the result usable as a
-- GF identifier.
mkFun :: [String] -> String
mkFun = quoteIf . intercalate "_"

-- | Names containing a character other than letters, digits, @_@ and @'@
-- are wrapped in single quotes, with every inner single quote
-- backslash-escaped. Other names that do not start with a letter get the
-- prefix @W_@. Everything else is returned unchanged.
quoteIf :: String -> String
quoteIf s
  | not (all identChar s) = "'" ++ concatMap escape s ++ "'"
  | startsWithNonLetter = "W_" ++ s
  | otherwise = s
  where
    identChar c = isAlphaNum c || c `elem` "_'"
    startsWithNonLetter = not (all isAlpha (take 1 s))
    escape '\'' = "\\\'"
    escape c = [c]
|
||||
|
||||
|
||||
-- | Wrap a string in double quotes (no escaping is done).
quote :: String -> String
quote s = concat ["\"", s, "\""]
|
||||
|
||||
8
src/morphodict/MorphoDictEng.config
Normal file
8
src/morphodict/MorphoDictEng.config
Normal file
@@ -0,0 +1,8 @@
|
||||
N : N mkN 0 2
|
||||
A : A mkA 0 2 4 6
|
||||
V : V mkV 0 4 2
|
||||
V2 : V mkV 0 4 2
|
||||
Adv : Adv mkAdv 0
|
||||
Prep : Prep mkPrep 0
|
||||
|
||||
|
||||
56607
src/morphodict/MorphoDictEng.gf
Normal file
56607
src/morphodict/MorphoDictEng.gf
Normal file
File diff suppressed because it is too large
Load Diff
7
src/morphodict/MorphoDictEng.header
Normal file
7
src/morphodict/MorphoDictEng.header
Normal file
@@ -0,0 +1,7 @@
|
||||
concrete MorphoDictEng of MorphoDictEngAbs =
|
||||
CatEng [N,A,V,Adv,Prep] **
|
||||
open
|
||||
ParadigmsEng
|
||||
in
|
||||
{
|
||||
|
||||
56604
src/morphodict/MorphoDictEngAbs.gf
Normal file
56604
src/morphodict/MorphoDictEngAbs.gf
Normal file
File diff suppressed because it is too large
Load Diff
4
src/morphodict/MorphoDictEngAbs.header
Normal file
4
src/morphodict/MorphoDictEngAbs.header
Normal file
@@ -0,0 +1,4 @@
|
||||
abstract MorphoDictEngAbs =
|
||||
Cat [N,A,V,Adv,Prep] **
|
||||
{
|
||||
|
||||
8
src/morphodict/MorphoDictSwe.config
Normal file
8
src/morphodict/MorphoDictSwe.config
Normal file
@@ -0,0 +1,8 @@
|
||||
N : N mkN 0 2 4 6
|
||||
A : A mkA 0 2 4 10 12
|
||||
V : V mkV 6 0 4 2 8 10
|
||||
V2 : V mkV 6 0 4 2 8 10
|
||||
Adv : Adv mkAdv 0
|
||||
Prep : Prep mkPrep 0
|
||||
PN : PN mkPN 0
|
||||
|
||||
8
src/morphodict/MorphoDictSwe.header
Normal file
8
src/morphodict/MorphoDictSwe.header
Normal file
@@ -0,0 +1,8 @@
|
||||
concrete MorphoDictSwe of MorphoDictSweAbs =
|
||||
CatSwe [N,A,V,Adv,Prep,PN] **
|
||||
open
|
||||
ParadigmsSwe
|
||||
in
|
||||
{
|
||||
|
||||
|
||||
4
src/morphodict/MorphoDictSweAbs.header
Normal file
4
src/morphodict/MorphoDictSweAbs.header
Normal file
@@ -0,0 +1,4 @@
|
||||
abstract MorphoDictSweAbs =
|
||||
Cat [N,A,V,Adv,Prep,PN] **
|
||||
{
|
||||
|
||||
38
src/morphodict/README
Normal file
38
src/morphodict/README
Normal file
@@ -0,0 +1,38 @@
|
||||
MkMorphodict: Extracting a minimal morphological dictionary from an existing GF dictionary.
|
||||
|
||||
Aarne Ranta 2020-03-02
|
||||
|
||||
principles:
|
||||
|
||||
There should be a single source for each lemgram (i.e. inflection table of a word)
|
||||
Function names should be easy to guess: baseform_Category (but avoiding accidental errors if this is not a unique key)
|
||||
|
||||
Hence,
|
||||
|
||||
Functions are 1-to-1 with lemgrams, i.e. inflection tables, thus
|
||||
- no sense distinctions
|
||||
- no subcategorizations
|
||||
- no variants
|
||||
|
||||
Functionname = baseform_category, with exceptions
|
||||
- same baseform_Category, different inflection tables: lie_1_V, lie_2_V
|
||||
- words that have non-ident characters: 'bird\'s-eye_A'
|
||||
- words that start with non-letters: W_'tween_Adv
|
||||
|
||||
Example run, English:
|
||||
|
||||
gf -make ../english/DictEng.gf
|
||||
runghc MkMorphodict.hs DictEngAbs.pgf MorphoDictEng
|
||||
|
||||
Result: 64923 -> 56599 functions, of which 21679 could be compounds
|
||||
|
||||
Swedish, using a dump of SALDO (not available in these sources)
|
||||
|
||||
cd saldo/
|
||||
runghc SaldoGF.hs
|
||||
# combine abs.tmp with Saldo.header to obtain Saldo.gf
|
||||
# combine cnc.tmp with SaldoSwe.header to obtain SaldoSwe.gf
|
||||
gf -make SaldoSwe.gf
|
||||
cd ..
|
||||
runghc MkMorphodict.hs saldo/Saldo.pgf MorphoDictSwe
|
||||
|
||||
1
src/morphodict/saldo/Saldo.header
Normal file
1
src/morphodict/saldo/Saldo.header
Normal file
@@ -0,0 +1 @@
|
||||
abstract Saldo = Cat [N,A,V,PN,Adv,Prep] ** {
|
||||
97
src/morphodict/saldo/SaldoGF.hs
Normal file
97
src/morphodict/saldo/SaldoGF.hs
Normal file
@@ -0,0 +1,97 @@
|
||||
import Data.List
|
||||
import qualified Data.Map as M
|
||||
|
||||
-- AR 2020-03-03
|
||||
-- generating GF from preprocessed SALDO (of type Lex by John Camilleri)
|
||||
|
||||
-- | Read the preprocessed SALDO dump @saldom.hsdump@ and write the generated
-- abstract rules to @abs.tmp@ and the concrete rules to @cnc.tmp@.
-- Both outputs are bare rule lists; module headers have to be added to them.
main :: IO ()
main = do
  dump <- readFile "saldom.hsdump" -- this is the preprocessed file
  let (absLines, cncLines) =
        unzip [mkRules (treatNone fun) | fun <- mkFuns (readLex dump)]
  writeFile "abs.tmp" (unlines absLines)
  writeFile "cnc.tmp" (unlines cncLines)
|
||||
|
||||
-- JC's datatypes, using String for simplicity
|
||||
|
||||
-- | A lexicon keyed by lemgram ID.
type Lex = M.Map String Entry

-- | An inflection table: pairs of morphological tags and surface form,
-- e.g. @("sg def gen", "killens")@.
type Table = [(String, String)]

-- | One lexicon entry. The constructor and field names are part of the
-- derived 'Read' instance and so must match the dump being read.
data Entry = E
  { ePOS   :: String  -- ^ part-of-speech tag, interpreted by 'formSpec'
  , eTable :: Table   -- ^ the inflection table of the entry
  }
  deriving (Show, Read)
|
||||
|
||||
-- | Parse the dump into a list of (lemgram ID, entry) pairs.
-- The first 8 characters are skipped before reading
-- (presumably the @fromList@ prefix of a shown Map - TODO confirm).
readLex :: String -> [(String,Entry)]
readLex dump = read (drop 8 dump)
|
||||
|
||||
-- new code by AR
|
||||
|
||||
-- | Render a (function, category, linearization) triple as its @fun@ line
-- and its @lin@ line. When the linearization still contains a @"NONE"@
-- form, both lines are prefixed with @--n@, which comments them out in GF.
mkRules :: (String, String, String) -> (String, String)
mkRules (fun, cat, lin) =
  ( render ["fun", fun, ":", cat, ";"]
  , render ["lin", fun, "=", lin, ";"]
  )
  where
    render ws
      | "\"NONE\"" `elem` words lin = unwords ("--n" : ws)
      | otherwise = unwords ws
|
||||
|
||||
-- converting incomplete paradigms to special mkC constructors, defined in SaldoSwe.header
|
||||
-- | Rewrite entries with an incomplete paradigm (forms given as @"NONE"@)
-- to one of the special constructors (mkVDep, mkVIntr, mkAUtr, mkAComp,
-- mkPNDef, mkPNIndef, mkNSg, mkNPl), passing on only the forms that
-- constructor takes. Entries matching none of the patterns are returned
-- unchanged. The patterns are tried top to bottom, so their order matters.
treatNone :: (String, String, String) -> (String, String, String)
treatNone entry@(f, cat, lin) =
  maybe entry rebuild (special cat (drop 1 (words lin)))
  where
    rebuild (newCat, oper, args) = (f, newCat, unwords (oper : args))

    -- argument: the forms of the entry, i.e. the linearization without its head
    special "V" ("\"NONE\"" : "\"NONE\"" : v : _)          = Just ("V", "mkVDep", [v])
    special "V" (i : d : p : a : b : "\"NONE\"" : _)       = Just ("V", "mkVIntr", [i, d, p, a, b])
    special "A" (i : "\"NONE\"" : p : c : s : _)           = Just ("A", "mkAUtr", [i, p, c, s])
    special "A" (i : d : p : "\"NONE\"" : "\"NONE\"" : _)  = Just ("A", "mkAComp", [i, d, p])
    special "N" ("\"NONE\"" : d : "\"NONE\"" : _)          = Just ("PN", "mkPNDef", [d]) ---
    special "N" (i : "\"NONE\"" : "\"NONE\"" : _)          = Just ("PN", "mkPNIndef", [i])
    special "N" (i : d : "\"NONE\"" : "\"NONE\"" : _)      = Just ("N", "mkNSg", [i, d])
    special "N" ("\"NONE\"" : "\"NONE\"" : i : d : _)      = Just ("N", "mkNPl", [i, d])
    special _ _                                            = Nothing
|
||||
|
||||
--- function names are simply generated: the result is fed to ../MkMorphodict.hs anyway

-- | Give every linearization of every lexicon entry a generated name
-- @w1000000@, @w1000001@, ... and return (name, category, linearization).
mkFuns :: [(a, Entry)] -> [(String, String, String)]
mkFuns lx = zipWith named [1000000 :: Integer ..] (concatMap (entry2lin . snd) lx)
  where
    named i (cat, lin) = ("w" ++ show i, cat, lin)
|
||||
|
||||
-- | All (category, linearization) pairs for one entry: one per table that
-- 'manyTables' builds from the variants found for each characteristic form.
entry2lin :: Entry -> [(String, String)]
entry2lin e = map (\ws -> (cat, mkLin ws)) (manyTables valuess)
  where
    (cat, forms) = formSpec (ePOS e)
    -- for each characteristic tag, the distinct surface forms carrying it
    valuess = map valuesFor forms
    valuesFor tag = nub (map snd (filter ((== tag) . fst) (eTable e)))
    -- "mk" ++ category, followed by the double-quoted forms
    mkLin ws = unwords (("mk" ++ cat) : map (\w -> "\"" ++ w ++ "\"") ws)
|
||||
|
||||
-- looking for the characteristic forms for each POS
|
||||
|
||||
-- | For a part-of-speech tag, the GF category and the tags of the
-- characteristic forms to extract, in paradigm-argument order.
-- Unknown tags get a placeholder pair.
formSpec :: String -> (String, [String])
formSpec pos = case lookup pos specs of
  Just spec -> spec
  Nothing -> ("NONE++pos", ["NONE++pos"]) -- ignoring other POS tags, which are rare anyway
  where
    specs =
      [ ("nn", ("N",
          [ "sg indef nom"
          , "sg def nom"
          , "pl indef nom"
          , "pl def nom"
          ]))
      , ("av", ("A",
          [ "pos indef sg u nom"
          , "pos indef sg n nom"
          , "pos indef pl nom"
          , "komp nom"
          , "super indef nom"
          ]))
      , ("vb", ("V",
          [ "inf aktiv"
          , "pres ind aktiv"
          , "imper"
          , "pret ind aktiv"
          , "sup aktiv"
          , "pret_part indef sg u nom"
          ]))
      , ("ab", ("Adv",
          [ "invar"
          ---- "pos"
          ]))
      , ("pp", ("Prep",
          [ "invar"
          ]))
      ]
|
||||
|
||||
-- trying to generate a small number of tables from sets of variant forms; seems to work well enough

-- | Build as many tables as the longest variant list has members. Table
-- number i takes the i-th variant of every form; a form with fewer variants
-- falls back to its first variant, and a form with none yields "NONE".
manyTables :: [[String]] -> [[String]]
manyTables formss = [map (pick i) formss | i <- [0 .. widest - 1]]
  where
    widest = maximum (map length formss)
    pick _ [] = "NONE"
    pick i forms@(first : _) = (forms ++ repeat first) !! i
|
||||
|
||||
|
||||
50
src/morphodict/saldo/SaldoSwe.header
Normal file
50
src/morphodict/saldo/SaldoSwe.header
Normal file
@@ -0,0 +1,50 @@
|
||||
concrete SaldoSwe of Saldo =
|
||||
CatSwe [N,A,V,Adv,Prep,PN] **
|
||||
open
|
||||
ParadigmsSwe, Prelude
|
||||
in
|
||||
{
|
||||
|
||||
-- to deal with incomplete paradigms
|
||||
--- the values could be in special categories to avoid overgeneration
|
||||
|
||||
oper
  -- Deponent verb from a single form in -as, -es or -s: the ending is
  -- replaced (-a, -er, -er), mkV is applied and the result wrapped in depV.
  -- Any other form is reported with Predef.error.
  mkVDep : Str -> V
    = \v -> case v of {
      x + "as" => depV (mkV (x + "a")) ;
      x + "es" => depV (mkV (x + "er")) ;
      x + "s"  => depV (mkV (x + "er")) ;
      _ => Predef.error (v ++ "not for mkVDep")
      } ;

  -- Takes five forms but passes only the first, fourth and fifth to mkV.
  mkVIntr : (_,_,_,_,_ : Str) -> V
    = \i,_,_,p,pt -> mkV i p pt ; ---

  -- The first form is passed to mkA twice, as its first and second argument.
  mkAUtr : (_,_,_,_ : Str) -> A
    = \u,p,c,s -> mkA u u p c s ; ---

  -- The third form (p) is not used.
  mkAComp : (_,_,_ : Str) -> A
    = \u,n,p -> compoundA (mkA u n) ; ---

  -- Gender from the ending: utrum for forms in -n, otherwise neutrum.
  mkPNDef : Str -> PN
    = \s -> case s of {
      _ + "n" => mkPN s utrum ;
      _       => mkPN s neutrum
      } ;

  -- Always neutrum.
  mkPNIndef : Str -> PN
    = \s -> mkPN s neutrum ; ---

  -- Gender from the ending of the second form (d): utrum for -n, otherwise
  -- neutrum; d is not used for anything else.
  mkNSg : (_,_ : Str) -> N
    = \i,d -> case d of {
      _ + "n" => mkN i utrum ; ---
      _       => mkN i neutrum ---
      } ;

  -- The arguments given to mkN are chosen from the ending of the first
  -- form (i); the second form (d) is not used.
  mkNPl : (_,_ : Str) -> N
    = \i,d -> case i of {
      s + "or"        => mkN (s + "a") ; ---
      s + ("ar"|"er") => mkN s i ; ---
      s + "en"        => mkN (s + "e") i ; ---
      _               => mkN i i ---
      } ;
|
||||
13
src/swedish/README.md
Normal file
13
src/swedish/README.md
Normal file
@@ -0,0 +1,13 @@
|
||||
# Swedish
|
||||
|
||||
## Language info
|
||||
|
||||
- English name: Swedish
|
||||
- Autonym: Svenska
|
||||
- ISO code: Swe
|
||||
|
||||
## Dictionaries
|
||||
|
||||
- `OldDictSwe`: Converted from SALDO using [this code](https://github.com/MalinAhlberg/SwedishProject/tree/master/saldo) in 2011.
|
||||
- `NewDictSwe`: Re-import from SALDO using [this code](https://github.com/DigitalGrammarsAB/SALDOtoGF/tree/a45e503a824ded39844df2aeeb7a6ee891e3bee1) in 2018, with more words and different identifier structure.
|
||||
- `DictSwe` is a union of `OldDictSwe` and `NewDictSwe`
|
||||
Reference in New Issue
Block a user