Merge branch 'master' of https://github.com/GrammaticalFramework/gf-rgl into polish

2020-03-06 17:14:54 +01:00
parent 962a94cb24 2911abd137
commit 334038fee2
16 changed files with 113593 additions and 2 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -12,7 +12,7 @@ addons:
      - ghc

 before_install:
-  - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update && brew install ghc@8.2 && export PATH="/usr/local/opt/ghc@8.2/bin:$PATH" ; fi
+  - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then brew update && brew install ghc@8.6 && export PATH="/usr/local/opt/ghc@8.6/bin:$PATH" ; fi
  - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then curl http://www.grammaticalframework.org/download/gf-3.9-bin-intel-mac.tar.gz > gf.tar.gz && sudo tar --no-same-owner --no-same-permissions -C /usr/local -zxf gf.tar.gz && rm gf.tar.gz; fi
  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then curl http://www.grammaticalframework.org/download/gf_3.9.1-1_amd64-trusty.deb > gf.deb && sudo dpkg -i gf.deb && rm gf.deb ; fi
  - if [[ "$TRAVIS_OS_NAME" == "windows" ]]; then choco install ghc --version=8.4.4 && export PATH="/c/ProgramData/chocolatey/lib/ghc/tools/ghc-8.4.4/bin:$PATH"; fi
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@

 The GF Resource Grammar Library is the standard library for Grammatical Framework. It covers the morphology and basic syntax of over 30 languages.

-For more about the RGL, see the [synopsis page](http://www.grammaticalframework.org/lib/doc/synopsis.html).
+For more about the RGL, see the [synopsis page](http://www.grammaticalframework.org/lib/doc/synopsis/).

 ## Choose your build method

--- a/src/morphodict/MkMorphodict.hs
+++ b/src/morphodict/MkMorphodict.hs
@@ -0,0 +1,142 @@
+module Main where
+
+import PGF
+
+import qualified Data.Map as M
+import Data.Char
+import Data.List
+import System.Environment (getArgs)
+
+-- AR 2020-02-28
+
+-- making a word list purely morphological, i.e.
+--   - functions are 1-to-1 with lemgrams, i.e.
+--     - no sense distinctions
+--     - no subcategorizations
+--     - no variants
+--  - functionname = baseform_category, with exceptions
+--     - variant inflection tables: lie_1_V, lie_2_V
+--     - words that have non-ident characters: 'bird\'s-eye_A'
+--     - words that start with non-letters: W_'tween_Adv
+
+-- example:
+--   gf -make ../english/DictEng.gf
+--   runghc MkMorphodict.hs DictEngAbs.pgf MorphoDictEng
+-- 64923 ->  56599 functions
+
+-- command-line synopsis, reported when the arguments are missing
+usage = "MkMorphodict <pgf> <outfile>"
+
+-- reads <pgf>, <outfile>.config, <outfile>Abs.header and <outfile>.header;
+-- writes <outfile>Abs.gf and <outfile>.gf, each as header ++ rules ++ "}"
+main = do
+  args <- getArgs
+  case args of
+    pgfile:outfile:_ -> do
+      pgf <- readPGF pgfile
+      config <- fmap mkConfig (readFile (outfile ++ ".config"))
+      --- only the first language returned by languages is linearized
+      let (absrules,cncrules) = mkMorphoDict (MDEnv pgf config (head (languages pgf)))
+      absheader <- readFile (outfile ++ "Abs.header")
+      cncheader <- readFile (outfile ++ ".header")
+      writeFile (outfile ++ "Abs.gf") (absheader ++ unlines absrules ++ "}")
+      writeFile (outfile ++ ".gf") (cncheader ++ unlines cncrules ++ "}")
+    -- fewer than two arguments: fail with the synopsis, not a pattern-match error
+    _ -> fail ("usage: " ++ usage)
+
+
+type Cat  = CId  -- a GF category identifier
+type Oper = String  -- name of the oper that starts each generated lin rule, e.g. mkN
+type Config = M.Map Cat (Cat,Oper,[Int])  -- source category -> (target category, oper, indices of the forms handed to the oper)
+
+data MDEnv = MDEnv {
+  pgf    :: PGF,  -- grammar whose functions are linearized
+  config :: Config,  -- per-category instructions, see mkConfig
+  lang   :: Language  -- concrete syntax passed to tabularLinearizes
+  }
+
+mkConfig :: String -> Config  -- parses lines of the form "cat : tcat oper i1 i2 ..."
+mkConfig ls = M.fromList [(c,i) | Left (c,i) <- map mkOne (lines ls)]  -- only the Left results become entries
+ where
+  mkOne s = case words s of
+    "--":_                 -> Right s   -- comment line (first word is "--"): skipped
+    cat:":":tcat:oper:ints -> Left (mkCId cat,(mkCId tcat,oper,map read ints))  -- NOTE: read is partial, a non-numeric index raises an error when forced
+    _ -> Right s  -- blank or otherwise unmatched line: skipped without a message
+
+mkMorphoDict :: MDEnv -> ([String],[String])
+mkMorphoDict env =  -- returns (abstract "fun" lines, concrete "lin" lines), pairwise aligned
+  unzip $
+  map splitRule $  -- render each entry as one fun line and one lin line
+  findCompounds $  -- annotate entries whose forms all end in the forms of another entry
+  nameFunctions $  -- turn the [baseform,category] name parts into unique identifiers
+  mergeRules $  -- keep one entry per distinct (category, oper + forms)
+  concatMap findRules cats  -- one candidate entry per linearization table
+ where
+  splitRule (fun,(cat,lin)) = (unwords ["fun",fun,":",showCId cat,";"], unwords ["lin",fun,"=", unwords lin,";"])
+
+  cats = nub [c | (c,(_,_,_)) <- M.assocs (config env)]  -- the source categories listed in the config
+
+  findRules cat = [
+    ([snd (lin !! head ints), showCId c], (c, op : appSig ints (map snd lin))) |  --- head ints is the base form in smart paradigms
+      f    <- functionsByCat (pgf env) cat,
+      lin  <- tabularLinearizes (pgf env) (lang env) (mkApp f []), -- [[(String, String)]]
+      Just (c,op,ints) <- [M.lookup cat (config env)]
+   ] 
+
+  appSig ints forms = [forms !! i | i <- ints]  -- the forms at the configured indices, in config order
+
+  mergeRules = map head . groupBy (\x y -> snd x == snd y) . sortOn snd  -- of entries with equal (category, oper + forms), the first after sorting is kept
+
+  nameFunctions = expandNames . sortOn fst  -- sorting makes entries with equal name parts adjacent
+
+  expandNames fls = case fls of
+    (f,l):fls2 -> case span ((==f) . fst) fls2 of
+      ([],_) -> (mkFun f,l) : expandNames fls2  -- name parts are unique: used as they are
+      (fls1,fls3) -> renames ((f,l):fls1) ++ expandNames fls3  -- several entries share the name parts: number them
+    _ -> []
+
+  renames fls = [(mkFun (init f ++ [show i,last f]),l) | (i,(f,l)) <- zip [1..] fls]  -- inserts 1, 2, ... before the last name part, e.g. lie_1_V
+
+  findCompounds = getCompounds . sortOn cat_orthrevforms  -- getCompounds only inspects entries that follow each other in this order
+
+  cat_orthrevforms (_,(cat,_:forms)) = (cat,[map (!!i) fss | let fss = map reverse forms, i <- [0..minimum (map length fss) - 1]])  -- sort key: category, then the reversed forms read position by position, cut at the shortest form
+
+  cat_revforms (_,(cat,_:forms)) = (cat,map reverse forms)
+  revstem = head . snd . cat_revforms  -- the first form, reversed
+  wforms (_,(_,_:forms)) = forms  -- the forms without the leading oper
+
+  getCompounds fls = case fls of
+    fl : fls1 | length (revstem fl) < 2 -> markWith fl [] : getCompounds fls1   -- first form shorter than 2 characters: not tried as a compound base
+    fl : fls2 -> case span (\x -> and [isPrefixOf (reverse w) (reverse w1) | (w,w1) <- zip (wforms fl) (wforms x)]) fls2 of  -- following entries whose every form ends in the corresponding form of fl
+      ([],_:_) -> markWith fl [] : getCompounds fls2
+      (fls1,fls3) -> markWith fl [] : map (markCompound fl) fls1 ++ getCompounds fls3
+    _ -> []
+
+  markCompound fl fl1 =
+    case and [isPrefixWord (reverse w) (reverse w1) | (w,w1) <- zip (wforms fl) (wforms fl1)] of  -- on reversed strings, so the remainder tested is the part of w1 in front of w
+      True  -> markWith fl1 [";","--","compound",(fst fl)]  -- rendered after the forms as: ; -- compound <name of fl>
+      False -> markWith fl1 [";","--","notcompound",(fst fl)]
+
+  markWith (f,(c,op:ws)) xs = (f,(c,op : map quote ws ++ xs))  -- quotes the forms (not the oper) and appends the annotation xs
+
+  isPrefixWord x xy =  -- x is a prefix of xy and the remainder is acceptable
+    length suff > 1 &&  -- remainder has at least two characters
+    any (\c -> elem c "-0123456789aeiouyåäö") suff &&  -- and contains one of these characters
+    isPrefixOf x xy
+   where
+     suff = drop (length x) xy
+
+mkFun = quoteIf . concat . intersperse "_"  -- joins the name parts with "_" and makes the result usable as an identifier
+quoteIf s = case s of
+  _ | any (\c -> not (isAlphaNum c || elem c "_'")) s -> "'" ++ unSgQuote s ++ "'"  -- other characters present: wrap in single quotes
+  c:_ | not (isAlpha c) -> "W_" ++ s  -- does not start with a letter: prefix W_
+  _ -> s  -- already fine (also the empty string)
+ where
+  unSgQuote s = case s of  -- puts a backslash in front of every single quote
+    '\'':cs -> "\\\'" ++ unSgQuote cs
+    c:cs -> c : unSgQuote cs
+    _ -> s
+
+
+quote s = "\"" ++ s ++ "\""  -- wraps s in double quotes; nothing inside s is escaped
+
--- a/src/morphodict/MorphoDictEng.config
+++ b/src/morphodict/MorphoDictEng.config
@@ -0,0 +1,8 @@
+N : N mkN 0 2
+A : A mkA 0 2 4 6
+V : V mkV 0 4 2
+V2 : V mkV 0 4 2
+Adv : Adv mkAdv 0
+Prep : Prep mkPrep 0
+
+
--- a/src/morphodict/MorphoDictEng.gf
+++ b/src/morphodict/MorphoDictEng.gf
--- a/src/morphodict/MorphoDictEng.header
+++ b/src/morphodict/MorphoDictEng.header
@@ -0,0 +1,7 @@
+concrete MorphoDictEng of MorphoDictEngAbs =
+  CatEng [N,A,V,Adv,Prep] **
+  open
+    ParadigmsEng
+  in
+ {
+
--- a/src/morphodict/MorphoDictEngAbs.gf
+++ b/src/morphodict/MorphoDictEngAbs.gf
--- a/src/morphodict/MorphoDictEngAbs.header
+++ b/src/morphodict/MorphoDictEngAbs.header
@@ -0,0 +1,4 @@
+abstract MorphoDictEngAbs =
+  Cat [N,A,V,Adv,Prep] **
+{
+
--- a/src/morphodict/MorphoDictSwe.config
+++ b/src/morphodict/MorphoDictSwe.config
@@ -0,0 +1,8 @@
+N : N mkN 0 2 4 6
+A : A mkA 0 2 4 10 12
+V : V mkV 6 0 4 2 8 10
+V2 : V mkV 6 0 4 2 8 10
+Adv : Adv mkAdv 0
+Prep : Prep mkPrep 0
+PN : PN mkPN 0
+
--- a/src/morphodict/MorphoDictSwe.header
+++ b/src/morphodict/MorphoDictSwe.header
@@ -0,0 +1,8 @@
+concrete MorphoDictSwe of MorphoDictSweAbs =
+  CatSwe [N,A,V,Adv,Prep,PN] **
+  open
+    ParadigmsSwe
+  in
+ {
+
+
--- a/src/morphodict/MorphoDictSweAbs.header
+++ b/src/morphodict/MorphoDictSweAbs.header
@@ -0,0 +1,4 @@
+abstract MorphoDictSweAbs =
+  Cat [N,A,V,Adv,Prep,PN] **
+{
+
--- a/src/morphodict/README
+++ b/src/morphodict/README
@@ -0,0 +1,38 @@
+MkMorphoDict: Extracting a minimal morphological dictionary from an existing GF dictionary.
+
+Aarne Ranta 2020-03-02
+
+principles:
+
+There should be a single source for each lemgram (i.e. inflection table of a word).
+Function names should be easy to guess: baseform_Category (but avoiding accidental errors if this is not a unique key).
+
+Hence,
+
+Functions are 1-to-1 with lemgrams, i.e. inflection tables, thus
+     - no sense distinctions
+     - no subcategorizations
+     - no variants
+
+Functionname = baseform_category, with exceptions
+     - same baseform_Category, different inflection tables: lie_1_V, lie_2_V
+     - words that have non-ident characters: 'bird\'s-eye_A'
+     - words that start with non-letters: W_'tween_Adv
+
+Example run, English:
+
+   gf -make ../english/DictEng.gf
+   runghc MkMorphodict.hs DictEngAbs.pgf MorphoDictEng
+
+Result: 64923 ->  56599 functions, of which 21679 could be compounds
+
+Swedish, using a dump of SALDO (not available in these sources)
+
+  cd saldo/
+  runghc SaldoGF.hs
+  # combine abs.tmp with Saldo.header to obtain Saldo.gf
+  # combine cnc.tmp with SaldoSwe.header to obtain SaldoSwe.gf
+  gf -make SaldoSwe.gf
+  cd ..
+  runghc MkMorphodict.hs saldo/Saldo.pgf MorphoDictSwe
+
--- a/src/morphodict/saldo/Saldo.header
+++ b/src/morphodict/saldo/Saldo.header
@@ -0,0 +1 @@
+abstract Saldo = Cat [N,A,V,PN,Adv,Prep] ** {
--- a/src/morphodict/saldo/SaldoGF.hs
+++ b/src/morphodict/saldo/SaldoGF.hs
@@ -0,0 +1,97 @@
+import Data.List
+import qualified Data.Map as M
+
+-- AR 2020-03-03
+-- generating GF from preprocessed SALDO (of type Lex by John Camilleri)
+
+main = do
+  lexicon <- readFile "saldom.hsdump" >>= return . readLex  -- this is the preprocessed file
+  let gf = map (mkRules . treatNone) $ mkFuns lexicon  -- one (fun rule, lin rule) pair per generated function
+  writeFile "abs.tmp" $ unlines $ map fst gf  -- fun rules only, a header must be added (the README combines it with Saldo.header)
+  writeFile "cnc.tmp" $ unlines $ map snd gf  -- lin rules only, a header must be added (the README combines it with SaldoSwe.header)
+
+-- JC's datatypes, using String for simplicity
+
+type Lex = M.Map String Entry -- key is lemgram ID
+
+type Table = [(String,String)]  -- (morphological tag, surface form) pairs
+
+data Entry = E
+  { ePOS :: String -- part-of-speech tag; formSpec handles "nn", "av", "vb", "ab" and "pp"
+  , eTable :: Table -- morphological tags to surface form: ("sg def gen" ,"killens")
+  } deriving (Show, Read)
+
+readLex :: String -> [(String,Entry)]
+readLex = read . drop 8  -- presumably skips the 8 characters of a leading "fromList" from a shown Map; TODO confirm against the dump
+
+-- new code by AR
+
+mkRules (fun,cat,lin) = (nunwords ["fun",fun,":",cat,";"],nunwords ["lin",fun,"=",lin,";"])  -- (abstract fun rule, concrete lin rule)
+ where
+  -- commenting out functions that still have NONE forms: both rules then start with "--n"
+  nunwords ws = unwords ((if elem "\"NONE\"" (words lin) then ["--n"] else []) ++ ws)
+
+-- converting incomplete paradigms to special mkC constructors, defined in SaldoSwe.header
+treatNone (f,cat,lin) = case (cat,drop 1 (words lin)) of  -- drop 1 removes the mkC word; forms follow the order in formSpec; NOTE: words also splits a form that contains a space
+  ("V", "\"NONE\"":"\"NONE\"":v:_) -> (f, "V", unwords ("mkVDep":[v]))  -- first two forms missing: only the third form is kept
+  ("V", i:d:p:a:b:"\"NONE\"":_) -> (f, "V", unwords ("mkVIntr":[i,d,p,a,b]))  -- sixth form missing: the first five are kept
+  ("A", i:"\"NONE\"":p:c:s:_) -> (f, "A", unwords ("mkAUtr":[i,p,c,s]))  -- second form missing
+  ("A", i:d:p:"\"NONE\"":"\"NONE\"":_) -> (f, "A", unwords ("mkAComp":[i,d,p]))  -- fourth and fifth forms missing
+  ("N", "\"NONE\"":d:"\"NONE\"":_) -> (f, "PN", unwords ("mkPNDef":[d])) --- first and third forms missing: category becomes PN
+  ("N", i:"\"NONE\"":"\"NONE\"":_) -> (f, "PN", unwords ("mkPNIndef":[i]))  -- second and third forms missing: category becomes PN
+  ("N", i:d:"\"NONE\"":"\"NONE\"":_) -> (f, "N", unwords ("mkNSg":[i,d]))  -- third and fourth forms missing
+  ("N", "\"NONE\"":"\"NONE\"":i:d:_) -> (f, "N", unwords ("mkNPl":[i,d]))  -- first and second forms missing
+  _ -> (f,cat,lin)  -- everything else is returned unchanged, including entries that still contain "NONE"
+
+--- generating function names for simplicity: the result is fed to ../MkMorphoDict anyway
+mkFuns lx = [("w"++show i, cat, lin) | (i,(cat,lin)) <- zip [1000000..] (concatMap (entry2lin . snd) lx)]  -- names are w1000000, w1000001, ... in the order the entries are listed
+
+entry2lin e =
+   [(cat, mkLin cat ws) | ws <- manyTables valuess]  -- one (category, lin) pair per table built from the variants
+  where
+    (cat,forms) = formSpec (ePOS e)
+    valuess = [nub [v | (t,v) <- eTable e, t == f] | f <- forms]  -- the distinct surface forms found for each required tag
+    mkLin c ws = unwords $ ["mk"++c] ++ ["\"" ++ w ++ "\"" | w <- ws]  -- "mk" ++ category followed by the double-quoted forms
+
+-- looking for the characteristic forms for each POS: returns (GF category, tags looked up in the entry's table, in argument order)
+
+formSpec pos = case pos of
+  "nn" -> ("N",[
+    "sg indef nom",
+    "sg def nom",
+    "pl indef nom",
+    "pl def nom"
+    ])
+  "av" -> ("A",[
+    "pos indef sg u nom",
+    "pos indef sg n nom",
+    "pos indef pl nom",
+    "komp nom",
+    "super indef nom"
+    ])
+  "vb" -> ("V",[
+    "inf aktiv",
+    "pres ind aktiv",
+    "imper",
+    "pret ind aktiv",
+    "sup aktiv",
+    "pret_part indef sg u nom"
+    ])
+  "ab" -> ("Adv",[
+    "invar"
+----    "pos"
+    ])
+  "pp" -> ("Prep",[
+    "invar"
+    ])
+  _ -> ("NONE++pos",["NONE++pos"]) -- ignoring other POS tags, which are rare anyway; NOTE: "NONE++pos" is one literal string, pos is not appended to it
+
+-- trying to generate a small number of tables from sets of variant forms; seems to work well enough
+
+manyTables formss = [
+  map ((!!i) . pad) formss |  -- table i takes the i-th variant of every form
+    i <- [0..maximum (map length formss)-1],  -- as many tables as the longest variant list; no table when every list is empty
+    let pad forms = if null forms then repeat "NONE" else forms ++ repeat (head forms)  -- missing form becomes "NONE"; a shorter list falls back to its first variant
+  ]
+
+
--- a/src/morphodict/saldo/SaldoSwe.header
+++ b/src/morphodict/saldo/SaldoSwe.header
@@ -0,0 +1,50 @@
+concrete SaldoSwe of Saldo =
+  CatSwe [N,A,V,Adv,Prep,PN] **
+  open
+    ParadigmsSwe, Prelude
+  in
+ {
+
+-- to deal with incomplete paradigms
+--- the values could be in special categories to avoid overgeneration
+
+oper
+  mkVDep : Str -> V  -- verb built from a single form ending in "s"
+    = \v -> case v of {
+        x + "as" => depV (mkV (x + "a")) ;
+        x + "es" => depV (mkV (x + "er")) ;
+	x + "s"  => depV (mkV (x + "er")) ;
+	_ => Predef.error (v ++ "not for mkVDep")
+      } ;
+      
+  mkVIntr : (_,_,_,_,_ : Str) -> V
+    = \i,_,_,p,pt -> mkV i p pt ; --- the second and third arguments are ignored
+    
+  mkAUtr : (_,_,_,_ : Str) -> A
+    = \u,p,c,s -> mkA u u p c s ; --- the first argument is passed twice to mkA
+
+  mkAComp : (_,_,_ : Str) -> A
+    = \u,n,p -> compoundA (mkA u n) ; --- the third argument is ignored
+    
+  mkPNDef : Str -> PN
+    = \s -> case s of {
+      _ + "n" => mkPN s utrum ;
+      _  => mkPN s neutrum
+      } ;
+      
+  mkPNIndef : Str -> PN
+   = \s -> mkPN s neutrum ; --- always neutrum
+   
+  mkNSg : (_,_ : Str) -> N
+   = \i,d -> case d of {
+       _ + "n" => mkN i utrum ; --- the second argument only selects the gender
+       _  => mkN i neutrum ---
+       } ;
+       
+  mkNPl : (_,_ : Str) -> N
+   = \i,d -> case i of {
+       s + "or" => mkN (s + "a") ; --- the second argument is ignored in every branch
+       s + ("ar"|"er") => mkN s i ; ---
+       s + "en" => mkN (s + "e") i ; ---
+       _  => mkN i i ---
+       } ;
--- a/src/swedish/README.md
+++ b/src/swedish/README.md
@@ -0,0 +1,13 @@
+# Swedish
+
+## Language info
+
+- English name: Swedish
+- Autonym: Svenska
+- ISO code: Swe
+
+## Dictionaries
+
+- `OldDictSwe`: Converted from SALDO using [this code](https://github.com/MalinAhlberg/SwedishProject/tree/master/saldo) in 2011.
+- `NewDictSwe`: Re-import from SALDO using [this code](https://github.com/DigitalGrammarsAB/SALDOtoGF/tree/a45e503a824ded39844df2aeeb7a6ee891e3bee1) in 2018, with more words and different identifier structure.
+- `DictSwe` is a union of `OldDictSwe` and `NewDictSwe`
				`@@ -0,0 +1 @@`
				`abstract Saldo = Cat [N,A,V,PN,Adv,Prep] ** {`