Merge pull request #370 from inariksit/morphodict

Minor improvements in MkMorphodict.hs + Finnish lexicon in new format
2026-05-28 01:18:57 -06:00 · 2021-06-07 17:42:06 +02:00
parent ace36f7129 e3c6694019
commit 0ee2377c99
11 changed files with 61145 additions and 62141 deletions
--- a/src/finnish/NewDictFin.gf
+++ b/src/finnish/NewDictFin.gf
@@ -19885,7 +19885,7 @@ lin moniaalle_Adv = mkAdv {s = c99 "moniaalle"} ;
 lin moniaalta_Adv = mkAdv {s = c99 "moniaalta"} ;
 lin monialaistua_V = mkV {s = c52 "monialaistua"} ;
 lin monias_N = mkN {s = d41 "monias"} ;
-lin moni_ilmeinen_N = mkN {s = d18 "moni-ilmeinen"} ;
+lin moni_ilmeinen_N = mkN {s = d38 "moni-ilmeinen"} ;
 lin monijumalaisuus_N = mkN {s = d40 "monijumalaisuus"} ;
 lin monikansainen_N = mkN {s = d38 "monikansainen"} ;
 lin monikko_N = mkN {s = d04A "monikko"} ;
@@ -41387,7 +41387,7 @@ lin ykkönen_N = mkN {s = d38 "ykkönen"} ;
 lin yks_Adv = mkAdv {s = c99 "yks"} ;
 lin ykseys_N = mkN {s = d40 "ykseys"} ;
 lin yksi_N = mkN {s = d31 "yksi"} ;
-lin yksi_ilmeinen_N = mkN {s = d18 "yksi-ilmeinen"} ;
+lin yksi_ilmeinen_N = mkN {s = d38 "yksi-ilmeinen"} ;
 lin yksiin_Adv = mkAdv {s = c99 "yksiin"} ;
 lin yksijumalaisuus_N = mkN {s = d40 "yksijumalaisuus"} ;
 lin yksikkö_N = mkN {s = d04A "yksikkö"} ;
--- a/src/morphodict/MkMorphodict.hs
+++ b/src/morphodict/MkMorphodict.hs
@@ -5,7 +5,9 @@ import PGF
 import qualified Data.Map as M
 import Data.Char
 import Data.List
 import Safe
 import System.Environment (getArgs)
 import Debug.Trace
 -- AR 2020-02-28
@@ -28,29 +30,33 @@ usage = "runghc MkMorphodict (raw|pgf) <configfile> <datafile> <outfile>"
 main = do
  xx <- getArgs
  if length xx /= 4
-    then putStrLn usage
+    then do
      putStrLn "Usage:"
      putStrLn usage
      putStrLn $ "Got instead: " ++ show xx
    else do
-      let mode:configfile:datafile:outfile:_ = xx 
+      let mode:configfile:datafile:outfile:_ = xx
      config <- readFile configfile >>= return . mkConfig
-  
+
      rawdata <- case mode of
        "pgf" -> pgfFile2rawData config datafile
-        "raw" -> readFile datafile >>= return . map getRawData . filter (not . null) . lines 
+        "raw" -> readFile datafile >>= return . map getRawData . filter (not . null) . lines
        _ -> error $ "Expected mode (pgf|raw), got " ++ mode
      rawdata2gf config rawdata outfile
 rawdata2gf config rawdata outfile = do
-  
+
  let env = MDEnv rawdata config
  let (absrules,cncrules) = mkMorphoDict env
-  
+
  absheader <- readFile (outfile ++ "Abs.header")
  cncheader <- readFile (outfile ++ ".header")
-  
+
  writeFile (outfile ++ "Abs.gf") absheader
  appendFile (outfile ++ "Abs.gf") $ unlines $ sort absrules
  appendFile (outfile ++ "Abs.gf") "}"
-  
+
  writeFile (outfile ++ ".gf") cncheader
  appendFile (outfile ++ ".gf") $ unlines $ sort cncrules
  appendFile (outfile ++ ".gf") "}"
@@ -68,7 +74,7 @@ pgfFile2rawData config pgffile = do
        cat  <- cats,
        f    <- functionsByCat pgf (mkCId cat),
        lin  <- tabularLinearizes pgf lang (mkApp f [])
-    ] 
+    ]
 type Cat  = String
 type Fun  = String
@@ -84,12 +90,13 @@ mkConfig :: String -> Config  -- N : N mkN 0 2 4 6 # 9
 mkConfig ls = M.fromList [(c,i) | Left (c,i) <- map mkOne (lines ls)]
 where
  mkOne s = case words s of
-    "--":_                 -> Right s 
+    "--":_                 -> Right s
    cat:":":tcat:oper:ints -> Left (cat,(tcat,oper,mkArgs ints))
    _ -> Right s
  mkArgs ints = case break (=="#") ints of
-    (ss,[])   -> (map read ss, [])
+    (ss,[])   -> (map read'  ss, [])
-    (ss,_:fs) -> (map read ss, map read fs)
+    (ss,_:fs) -> (map read' ss, map read' fs)
  -- Parse one argument index taken from a config line. Safe.readNote appends the
  -- note to its own "no parse" message, so a bad entry points back at the config file.
  read' a = readNote "expected an integer argument index in the MkMorphodict config file" a
 getRawData s = case words s of
  c:cs -> (c,cs)
@@ -118,10 +125,13 @@ mkMorphoDict env =
    (([lemma],newcat),(oper, appSig sig args)) |
        (oldcat,args) <- raws,
        Just (newcat, oper, sig) <- [M.lookup oldcat (config env)],
-        let lemma = args !! head (fst sig)
+        let lemma = args `at` head (fst sig)
-   ] 
+   ]
-  appSig (ints,feats) args = ([args !! i | i <- ints], [args !! i | i <- feats])
+  appSig (ints,feats) args =
    -- If there's a wrong number in the config file, uncomment the line below to see which number it should be
    -- trace (intercalate "\n" $ map show (zip [0..] args)) $
    ([args `at` i | i <- ints], [args `at` i | i <- feats])
  mergeRules :: [RawRule] -> [RawRule]
  mergeRules = map head . groupBy (\x y -> snd x == snd y) . sortOn snd
@@ -153,16 +163,20 @@ mkMorphoDict env =
    _ | length (nub (map tail fls)) == length fls -> shrinkMore (map tail fls)
    _ -> fls
-      
+-- >>> mkFun ["hello", "world", "hello friends", "hello-all"]
-mkFun = showCId . mkCId . concat . intersperse "_"
+-- "hello_world_hello_friends_hello_all"
 mkFun :: [String] -> String  -- words are joined with "_"; inner spaces and hyphens also become "_"
 mkFun parts = showCId (mkCId (intercalate "_" pieces))
   where
     -- Every input word is cut at whitespace and at hyphens (see below).
     pieces = concatMap (words . hyphensToSpaces) parts
     -- Turn each hyphen into a space, except a hyphen that is the very last
     -- character of the string, which is kept as it is.
     hyphensToSpaces str = case str of
       ""       -> ""
       "-"      -> "-"
       '-':rest -> ' ' : hyphensToSpaces rest
       ch:rest  -> ch : hyphensToSpaces rest
 quote s = "\"" ++ s ++ "\""
 {- ---- let us ignore this
-  findCompounds :: [RuleData] -> [RuleData] 
+  findCompounds :: [RuleData] -> [RuleData]
  findCompounds = getCompounds . sortOn cat_orthrevforms
  cat_orthrevforms (_,(cat,_:forms)) = (cat,[map (!!i) fss | let fss = map reverse forms, i <- [0..minimum (map length fss) - 1]])
@@ -171,9 +185,9 @@ quote s = "\"" ++ s ++ "\""
  revstem = head . snd . cat_revforms
  wforms (_,(_,_:forms)) = forms
-  getCompounds :: [RuleData] -> [RuleData] 
+  getCompounds :: [RuleData] -> [RuleData]
  getCompounds fls = case fls of
-    fl : fls1 | length (revstem fl) < 2 -> markWith fl [] : getCompounds fls1 
+    fl : fls1 | length (revstem fl) < 2 -> markWith fl [] : getCompounds fls1
    fl : fls2 -> case span (\x -> and [isPrefixOf (reverse w) (reverse w1) | (w,w1) <- zip (wforms fl) (wforms x)]) fls2 of
      ([],_:_) -> markWith fl [] : getCompounds fls2
      (fls1,fls3) -> markWith fl [] : map (markCompound fl) fls1 ++ getCompounds fls3
@@ -188,7 +202,7 @@ quote s = "\"" ++ s ++ "\""
  isPrefixWord x xy =
    length suff > 1 &&                                ---- compound first part must be at least two letters long
-    any (\c -> elem c "-0123456789aeiouyåäö") suff && ---- must contain a vowel or a digit 
+    any (\c -> elem c "-0123456789aeiouyåäö") suff && ---- must contain a vowel or a digit
    isPrefixOf x xy                                   ---- and of course be a prefix
   where
     suff = drop (length x) xy
--- a/src/morphodict/MorphoDictFin.config
+++ b/src/morphodict/MorphoDictFin.config
@@ -0,0 +1,6 @@
 N : N mkN 0 1 2 4 7 13 14 16 17 19
 A : A mkA' 0 1 2 4 7 13 14 16 17 19
 V : V mkV  0 17 19 22 43 49 23 25 31 58 94 37
 V2 : V mkV 0 17 19 22 43 49 23 25 31 58 94 37
 Adv : Adv mkAdv 0
 Prep : Prep mkPrep 0
--- a/src/morphodict/MorphoDictFin.gf
+++ b/src/morphodict/MorphoDictFin.gf
--- a/src/morphodict/MorphoDictFin.header
+++ b/src/morphodict/MorphoDictFin.header
@@ -0,0 +1,13 @@
 concrete MorphoDictFin of MorphoDictFinAbs = CatFin ** open
  ParadigmsFin,
 --  MorphoFin,
  Kotus
 --  Prelude
 in {
 -- extracted from http://kaino.kotus.fi/sanat/nykysuomi/, licensed under LGPL
 flags coding = utf8 ;
 oper mkA' : (x1,_,_,_,_,_,_,_,_,x10 : Str) -> A = \a,b,c,d,e,f,g,h,i,j -> mkA (mkN a b c d e f g h i j) ; -- Need a single worst-case paradigm for how config is implemented
--- a/src/morphodict/MorphoDictFinAbs.gf
+++ b/src/morphodict/MorphoDictFinAbs.gf
--- a/src/morphodict/MorphoDictFinAbs.header
+++ b/src/morphodict/MorphoDictFinAbs.header
@@ -0,0 +1,4 @@
 abstract MorphoDictFinAbs =
  Cat [N,A,V,Adv,Prep] **
 {
--- a/src/morphodict/morphodict.cabal
+++ b/src/morphodict/morphodict.cabal
@@ -0,0 +1,20 @@
 name: morphodict
 version: 0.1
 homepage: https://github.com/GrammaticalFramework/gf-rgl/tree/master/src/morphodict
 author: Aarne Ranta
 category: Natural Language Processing
 build-type: Simple
 extra-source-files: README.md
 cabal-version: >=1.10
 executable MkMorphoDict
  hs-source-dirs:
    .
  -- The source file is MkMorphodict.hs (lower-case "d"); main-is must match
  -- it exactly, or the build fails on case-sensitive file systems.
  main-is: MkMorphodict.hs
  other-modules:
  build-depends:
    base,
    containers,
    safe,
    gf
  default-language: Haskell2010
--- a/src/morphodict/stack.yaml
+++ b/src/morphodict/stack.yaml
@@ -0,0 +1,16 @@
 resolver: lts-12.26
 packages:
 - .
 # so that `stack build --copy-bins` puts bin here
 local-bin-path: .
 extra-deps:
 - gf-3.10
 - cgi-3001.3.0.3 # dependency of gf
 flags:
     # this excludes PGF2 module in gf package
  gf:
    c-runtime: false
--- a/src/morphodict/stack.yaml.lock
+++ b/src/morphodict/stack.yaml.lock
@@ -0,0 +1,26 @@
 # This file was autogenerated by Stack.
 # You should not edit this file by hand.
 # For more information, please see the documentation at:
 #   https://docs.haskellstack.org/en/stable/lock_files
 packages:
 - completed:
    hackage: gf-3.10@sha256:6f851dfaab5e1f9d4f3796515b86f78806a2bb305136a902713dfc2b92d9cfb0,8477
    pantry-tree:
      size: 64924
      sha256: 66332577ff42a42eed767f451f53266e1020b72749cdcdf7387933615d5de091
  original:
    hackage: gf-3.10
 - completed:
    hackage: cgi-3001.3.0.3@sha256:4f3768d09e4a6620642588cab2e99d83c1b6b542dad6147d0af9532170036115,2076
    pantry-tree:
      size: 667
      sha256: 65f6fd4574cffd1e5e2490c133b7ba58fd2fea0a65d81f1fa6fe14f08025629b
  original:
    hackage: cgi-3001.3.0.3
 snapshots:
 - completed:
    size: 509471
    url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/12/26.yaml
    sha256: 95f014df58d0679b1c4a2b7bf2b652b61da8d30de5f571abb0d59015ef678646
  original: lts-12.26
--- a/src/morphodict/utils/only_homonyms.sh
+++ b/src/morphodict/utils/only_homonyms.sh
@@ -0,0 +1,51 @@
 #!/bin/bash
 # Usage line printed when the script is called on the wrong file.
 USAGE="usage: ./only_homonyms.sh <concrete syntax file>"
 NOTE="This is not extremely useful, it will just create a file with only those entries that are homonymous in dictionary form, but differ in other forms. The purpose of the file is for you to look at/do small experiments with. The real job is done in MkMorphoDict.hs."
 # String manipulation
 CONC=$1                      # e.g. MorphoDictFin.gf
 BAK="$CONC.bak"              # e.g. MorphoDictFin.gf.bak
 # Strip only the last extension, so that ./MorphoDictFin.gf also gives a usable name
 # (cutting at the first dot would leave an empty string for such a path).
 NAME="${CONC%.*}"            # e.g. MorphoDictFin
 ABS="${NAME}Abs.gf"          # e.g. MorphoDictFinAbs.gf
 CONC_HEADER="$NAME.header"   # e.g. MorphoDictFin.header
 ABS_HEADER="${NAME}Abs.header"  # e.g. MorphoDictFinAbs.header
 # Overwrite $CONC with its header followed by the lin lines (taken from $BAK)
 # of every function name that occurs more than once in /tmp/$CONC, then
 # close the module with "}".
 find_duplicates() {
    echo "Putting (temporarily) only homonyms in $CONC"
    echo "cat $CONC_HEADER > $CONC"
    cat "$CONC_HEADER" > "$CONC"
    # Take the second space-separated field of every line, count how often each
    # value occurs, and keep those with a count of 2 or more (any number of
    # digits) that start with lower-case letters followed by "_". tr then
    # deletes digits, upper-case letters and square brackets from what is left.
    DUPLS=$(cut -f 2 -d ' ' "/tmp/$CONC"  \
         | sort | uniq -c | sort -nr \
         | grep -E "^ *([2-9]|[1-9][0-9]+) [a-zåäö]+_" \
         | tr -d '[0-9][A-ZÅÄÖ]')
    # $DUPLS is left unquoted on purpose: word splitting yields one name per iteration.
    for d in $DUPLS
    do
        grep "lin $d" "$BAK" >> "$CONC"
    done
    echo "}" >> "$CONC"
 }
 # Copy $CONC to $BAK, then write to /tmp/$CONC a version of $CONC in which
 # every "_<digit>_" is replaced by "_" and adjacent identical lines are merged.
 remove_numbers() {
    echo "cp $CONC{,.bak}"
    cp "$CONC" "$BAK"
    echo "cat $CONC | sed -E 's/_[0-9]_/_/g' | uniq > /tmp/$CONC"
    sed -E 's/_[0-9]_/_/g' "$CONC" | uniq > "/tmp/$CONC"
    echo "Done removing numbers."
 }
 #### Action starts here
 echo "$NOTE"
 # Only proceed for an existing file that is not the abstract syntax;
 # otherwise nothing is copied or overwritten and the usage line is shown.
 if [[ -z $CONC || ! -f $CONC || $CONC == *"Abs.gf" ]]
  then
    echo "$USAGE"
  else
    remove_numbers
    find_duplicates
    # echo "gf -v=0 -make $CONC"
    # gf -v=0 -make $CONC
    echo "$CONC contains now only homonyms. Original file is found in $BAK."
 fi