Merge pull request #370 from inariksit/morphodict

Minor improvements in MkMorphodict.hs + Finnish lexicon in new format
2021-06-07 17:42:06 +02:00
parent ace36f7129 e3c6694019
commit 0ee2377c99
11 changed files with 61145 additions and 62141 deletions
--- a/src/finnish/NewDictFin.gf
+++ b/src/finnish/NewDictFin.gf
@@ -19885,7 +19885,7 @@ lin moniaalle_Adv = mkAdv {s = c99 "moniaalle"} ;
 lin moniaalta_Adv = mkAdv {s = c99 "moniaalta"} ;
 lin monialaistua_V = mkV {s = c52 "monialaistua"} ;
 lin monias_N = mkN {s = d41 "monias"} ;
-lin moni_ilmeinen_N = mkN {s = d18 "moni-ilmeinen"} ;
+lin moni_ilmeinen_N = mkN {s = d38 "moni-ilmeinen"} ;
 lin monijumalaisuus_N = mkN {s = d40 "monijumalaisuus"} ;
 lin monikansainen_N = mkN {s = d38 "monikansainen"} ;
 lin monikko_N = mkN {s = d04A "monikko"} ;
@@ -41387,7 +41387,7 @@ lin ykkönen_N = mkN {s = d38 "ykkönen"} ;
 lin yks_Adv = mkAdv {s = c99 "yks"} ;
 lin ykseys_N = mkN {s = d40 "ykseys"} ;
 lin yksi_N = mkN {s = d31 "yksi"} ;
-lin yksi_ilmeinen_N = mkN {s = d18 "yksi-ilmeinen"} ;
+lin yksi_ilmeinen_N = mkN {s = d38 "yksi-ilmeinen"} ;
 lin yksiin_Adv = mkAdv {s = c99 "yksiin"} ;
 lin yksijumalaisuus_N = mkN {s = d40 "yksijumalaisuus"} ;
 lin yksikkö_N = mkN {s = d04A "yksikkö"} ;
--- a/src/morphodict/MkMorphodict.hs
+++ b/src/morphodict/MkMorphodict.hs
@@ -5,7 +5,9 @@ import PGF
 import qualified Data.Map as M
 import Data.Char
 import Data.List
+import Safe
 import System.Environment (getArgs)
+import Debug.Trace

 -- AR 2020-02-28

@@ -28,29 +30,33 @@ usage = "runghc MkMorphodict (raw|pgf) <configfile> <datafile> <outfile>"
 main = do
  xx <- getArgs
  if length xx /= 4
-    then putStrLn usage
+    then do
+      putStrLn "Usage:"
+      putStrLn usage
+      putStrLn $ "Got instead: " ++ show xx
    else do
-      let mode:configfile:datafile:outfile:_ = xx 
+      let mode:configfile:datafile:outfile:_ = xx
      config <- readFile configfile >>= return . mkConfig
-  
+
      rawdata <- case mode of
        "pgf" -> pgfFile2rawData config datafile
-        "raw" -> readFile datafile >>= return . map getRawData . filter (not . null) . lines 
+        "raw" -> readFile datafile >>= return . map getRawData . filter (not . null) . lines
+        _ -> error $ "Expected mode (pgf|raw), got " ++ mode
      rawdata2gf config rawdata outfile


 rawdata2gf config rawdata outfile = do
-  
+
  let env = MDEnv rawdata config
  let (absrules,cncrules) = mkMorphoDict env
-  
+
  absheader <- readFile (outfile ++ "Abs.header")
  cncheader <- readFile (outfile ++ ".header")
-  
+
  writeFile (outfile ++ "Abs.gf") absheader
  appendFile (outfile ++ "Abs.gf") $ unlines $ sort absrules
  appendFile (outfile ++ "Abs.gf") "}"
-  
+
  writeFile (outfile ++ ".gf") cncheader
  appendFile (outfile ++ ".gf") $ unlines $ sort cncrules
  appendFile (outfile ++ ".gf") "}"
@@ -68,7 +74,7 @@ pgfFile2rawData config pgffile = do
        cat  <- cats,
        f    <- functionsByCat pgf (mkCId cat),
        lin  <- tabularLinearizes pgf lang (mkApp f [])
-    ] 
+    ]

 type Cat  = String
 type Fun  = String
@@ -84,12 +90,13 @@ mkConfig :: String -> Config  -- N : N mkN 0 2 4 6 # 9
 mkConfig ls = M.fromList [(c,i) | Left (c,i) <- map mkOne (lines ls)]
 where
  mkOne s = case words s of
-    "--":_                 -> Right s 
+    "--":_                 -> Right s
    cat:":":tcat:oper:ints -> Left (cat,(tcat,oper,mkArgs ints))
    _ -> Right s
  mkArgs ints = case break (=="#") ints of
-    (ss,[])   -> (map read ss, [])
-    (ss,_:fs) -> (map read ss, map read fs)
+    (ss,[])   -> (map read'  ss, [])
+    (ss,_:fs) -> (map read' ss, map read' fs)
+  read' a = readNote [] a -- Safe.readNote provides better error message

 getRawData s = case words s of
  c:cs -> (c,cs)
@@ -118,10 +125,13 @@ mkMorphoDict env =
    (([lemma],newcat),(oper, appSig sig args)) |
        (oldcat,args) <- raws,
        Just (newcat, oper, sig) <- [M.lookup oldcat (config env)],
-        let lemma = args !! head (fst sig)
-   ] 
+        let lemma = args `at` head (fst sig)
+   ]

-  appSig (ints,feats) args = ([args !! i | i <- ints], [args !! i | i <- feats])
+  appSig (ints,feats) args =
+    -- If there's wrong number in config file, uncomment the line below to see which number it should be
+    -- trace (intercalate "\n" $ map show (zip [0..] args)) $
+    ([args `at` i | i <- ints], [args `at` i | i <- feats])

  mergeRules :: [RawRule] -> [RawRule]
  mergeRules = map head . groupBy (\x y -> snd x == snd y) . sortOn snd
@@ -153,16 +163,20 @@ mkMorphoDict env =
    _ | length (nub (map tail fls)) == length fls -> shrinkMore (map tail fls)
    _ -> fls

-      
-mkFun = showCId . mkCId . concat . intersperse "_"
+-- >>> mkFun ["hello", "world", "hello friends", "hello-all"]
+-- "hello_world_hello_friends_hello_all"
+mkFun :: [String] -> String                          -- if word contains space or hyphen, replace with underscore
+mkFun = showCId . mkCId . concat . intersperse "_" . concatMap (words . removeHyphen)
+  where
+    removeHyphen [] = []
+    removeHyphen ['-'] = ['-'] -- If hyphen is the last character, it's usually meaningful, leave it
+    removeHyphen ('-':cs) = ' ' : removeHyphen cs
+    removeHyphen (c:cs) = c : removeHyphen cs

 quote s = "\"" ++ s ++ "\""

-
-
-
 {- ---- let us ignore this
-  findCompounds :: [RuleData] -> [RuleData] 
+  findCompounds :: [RuleData] -> [RuleData]
  findCompounds = getCompounds . sortOn cat_orthrevforms

  cat_orthrevforms (_,(cat,_:forms)) = (cat,[map (!!i) fss | let fss = map reverse forms, i <- [0..minimum (map length fss) - 1]])
@@ -171,9 +185,9 @@ quote s = "\"" ++ s ++ "\""
  revstem = head . snd . cat_revforms
  wforms (_,(_,_:forms)) = forms

-  getCompounds :: [RuleData] -> [RuleData] 
+  getCompounds :: [RuleData] -> [RuleData]
  getCompounds fls = case fls of
-    fl : fls1 | length (revstem fl) < 2 -> markWith fl [] : getCompounds fls1 
+    fl : fls1 | length (revstem fl) < 2 -> markWith fl [] : getCompounds fls1
    fl : fls2 -> case span (\x -> and [isPrefixOf (reverse w) (reverse w1) | (w,w1) <- zip (wforms fl) (wforms x)]) fls2 of
      ([],_:_) -> markWith fl [] : getCompounds fls2
      (fls1,fls3) -> markWith fl [] : map (markCompound fl) fls1 ++ getCompounds fls3
@@ -188,7 +202,7 @@ quote s = "\"" ++ s ++ "\""

  isPrefixWord x xy =
    length suff > 1 &&                                ---- compound first part must be at least two letters long
-    any (\c -> elem c "-0123456789aeiouyåäö") suff && ---- must contain a vowel or a digit 
+    any (\c -> elem c "-0123456789aeiouyåäö") suff && ---- must contain a vowel or a digit
    isPrefixOf x xy                                   ---- and of course be a prefix
   where
     suff = drop (length x) xy
--- a/src/morphodict/MorphoDictFin.config
+++ b/src/morphodict/MorphoDictFin.config
@@ -0,0 +1,6 @@
+N : N mkN 0 1 2 4 7 13 14 16 17 19
+A : A mkA' 0 1 2 4 7 13 14 16 17 19
+V : V mkV  0 17 19 22 43 49 23 25 31 58 94 37
+V2 : V mkV 0 17 19 22 43 49 23 25 31 58 94 37
+Adv : Adv mkAdv 0
+Prep : Prep mkPrep 0
--- a/src/morphodict/MorphoDictFin.gf
+++ b/src/morphodict/MorphoDictFin.gf
--- a/src/morphodict/MorphoDictFin.header
+++ b/src/morphodict/MorphoDictFin.header
@@ -0,0 +1,13 @@
+concrete MorphoDictFin of MorphoDictFinAbs = CatFin ** open
+  ParadigmsFin,
+--  MorphoFin,
+  Kotus
+--  Prelude
+ in {
+
+-- extracted from http://kaino.kotus.fi/sanat/nykysuomi/, licensed under LGPL
+
+flags coding = utf8 ;
+
+oper mkA' : (x1,_,_,_,_,_,_,_,_,x10 : Str) -> A = \a,b,c,d,e,f,g,h,i,j -> mkA (mkN a b c d e f g h i j) ; -- Need a single worst-case paradigm for how config is implemented
+
--- a/src/morphodict/MorphoDictFinAbs.gf
+++ b/src/morphodict/MorphoDictFinAbs.gf
--- a/src/morphodict/MorphoDictFinAbs.header
+++ b/src/morphodict/MorphoDictFinAbs.header
@@ -0,0 +1,4 @@
+abstract MorphoDictFinAbs =
+  Cat [N,A,V,Adv,Prep] **
+{
+
--- a/src/morphodict/morphodict.cabal
+++ b/src/morphodict/morphodict.cabal
@@ -0,0 +1,20 @@
+name: morphodict
+version: 0.1
+homepage: https://github.com/GrammaticalFramework/gf-rgl/tree/master/src/morphodict
+author: Aarne Ranta
+category: Natural Language Processing
+build-type: Simple
+extra-source-files: README.md
+cabal-version: >=1.10
+
+executable MkMorphoDict
+  hs-source-dirs:
+    .
+  main-is: MkMorphoDict.hs
+  other-modules:
+  build-depends:
+    base,
+    containers,
+    safe,
+    gf
+  default-language: Haskell2010
--- a/src/morphodict/stack.yaml
+++ b/src/morphodict/stack.yaml
@@ -0,0 +1,16 @@
+resolver: lts-12.26
+
+packages:
+- .
+
+# so that `stack build --copy-bins` puts bin here
+local-bin-path: .
+
+extra-deps:
+- gf-3.10
+- cgi-3001.3.0.3 # dependency of gf
+
+flags:
+     # this excludes PGF2 module in gf package
+  gf:
+    c-runtime: false
--- a/src/morphodict/stack.yaml.lock
+++ b/src/morphodict/stack.yaml.lock
@@ -0,0 +1,26 @@
+# This file was autogenerated by Stack.
+# You should not edit this file by hand.
+# For more information, please see the documentation at:
+#   https://docs.haskellstack.org/en/stable/lock_files
+
+packages:
+- completed:
+    hackage: gf-3.10@sha256:6f851dfaab5e1f9d4f3796515b86f78806a2bb305136a902713dfc2b92d9cfb0,8477
+    pantry-tree:
+      size: 64924
+      sha256: 66332577ff42a42eed767f451f53266e1020b72749cdcdf7387933615d5de091
+  original:
+    hackage: gf-3.10
+- completed:
+    hackage: cgi-3001.3.0.3@sha256:4f3768d09e4a6620642588cab2e99d83c1b6b542dad6147d0af9532170036115,2076
+    pantry-tree:
+      size: 667
+      sha256: 65f6fd4574cffd1e5e2490c133b7ba58fd2fea0a65d81f1fa6fe14f08025629b
+  original:
+    hackage: cgi-3001.3.0.3
+snapshots:
+- completed:
+    size: 509471
+    url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/12/26.yaml
+    sha256: 95f014df58d0679b1c4a2b7bf2b652b61da8d30de5f571abb0d59015ef678646
+  original: lts-12.26
--- a/src/morphodict/utils/only_homonyms.sh
+++ b/src/morphodict/utils/only_homonyms.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+USAGE="usage: ./remove_sense_distinctions.sh <concrete syntax file>"
+NOTE="This is not extremely useful, it will just create a file with only those entries that are homonymous in dictionary form, but differ in other forms. The purpose of the file is for you to look at/do small experiments with. The real job is done in MkMorphoDict.hs."
+
+# String manipulation
+CONC=$1                      # e.g. MorphoDictFin.gf
+BAK="$CONC.bak"              # e.g. MorphoDictFin.gf.bak
+
+NAME=`echo $CONC | cut -f 1 -d '.'` # e.g. MorphoDictFin
+ABS="${NAME}Abs.gf"          # e.g. MorphoDictFinAbs.gf
+CONC_HEADER="$NAME.header"   # e.g. MorphoDictFin.header
+ABS_HEADER="${NAME}Abs.header"  # e.g. MorphoDictFinAbs.header
+
+find_duplicates() {
+    echo "Putting (temporarily) only homonyms in $CONC"
+    echo "cat $CONC_HEADER > $CONC"
+    cat $CONC_HEADER > $CONC
+    DUPLS=`cut -f 2 -d ' ' /tmp/$CONC  \
+         | sort | uniq -c | sort -nr \
+         | egrep "^ +1?[2-9][0-9]? [a-zåäö]+_" \
+         | tr -d '[0-9][A-ZÅÄÖ]'`
+    for d in $DUPLS
+    do
+        grep "lin $d" $BAK >> $CONC
+    done
+    echo "}" >> $CONC
+}
+
+remove_numbers() {
+    echo "cp $CONC{,.bak}"
+    cp $CONC{,.bak}
+    echo "cat $CONC | sed -E 's/_[0-9]_/_/g' | uniq > /tmp/$CONC"
+    cat $CONC | sed -E 's/_[0-9]_/_/g' | uniq > /tmp/$CONC
+    echo "Done removing numbers."
+}
+
+#### Action starts here
+
+echo $NOTE
+
+if [[ $CONC == *"Abs.gf" ]]
+  then
+    echo $USAGE
+  else
+    remove_numbers
+    find_duplicates
+    # echo "gf -v=0 -make $CONC"
+    # gf -v=0 -make $CONC
+    echo "$CONC contains now only homonyms. Original file is found in $BAK."
+fi