Merge pull request #370 from inariksit/morphodict

Minor improvements in MkMorphodict.hs + Finnish lexicon in new format
This commit is contained in:
Aarne Ranta
2021-06-07 17:42:06 +02:00
committed by GitHub
11 changed files with 61145 additions and 62141 deletions

View File

@@ -19885,7 +19885,7 @@ lin moniaalle_Adv = mkAdv {s = c99 "moniaalle"} ;
lin moniaalta_Adv = mkAdv {s = c99 "moniaalta"} ;
lin monialaistua_V = mkV {s = c52 "monialaistua"} ;
lin monias_N = mkN {s = d41 "monias"} ;
lin moni_ilmeinen_N = mkN {s = d18 "moni-ilmeinen"} ;
lin moni_ilmeinen_N = mkN {s = d38 "moni-ilmeinen"} ;
lin monijumalaisuus_N = mkN {s = d40 "monijumalaisuus"} ;
lin monikansainen_N = mkN {s = d38 "monikansainen"} ;
lin monikko_N = mkN {s = d04A "monikko"} ;
@@ -41387,7 +41387,7 @@ lin ykkönen_N = mkN {s = d38 "ykkönen"} ;
lin yks_Adv = mkAdv {s = c99 "yks"} ;
lin ykseys_N = mkN {s = d40 "ykseys"} ;
lin yksi_N = mkN {s = d31 "yksi"} ;
lin yksi_ilmeinen_N = mkN {s = d18 "yksi-ilmeinen"} ;
lin yksi_ilmeinen_N = mkN {s = d38 "yksi-ilmeinen"} ;
lin yksiin_Adv = mkAdv {s = c99 "yksiin"} ;
lin yksijumalaisuus_N = mkN {s = d40 "yksijumalaisuus"} ;
lin yksikkö_N = mkN {s = d04A "yksikkö"} ;

View File

@@ -5,7 +5,9 @@ import PGF
import qualified Data.Map as M
import Data.Char
import Data.List
import Safe
import System.Environment (getArgs)
import Debug.Trace
-- AR 2020-02-28
@@ -28,7 +30,10 @@ usage = "runghc MkMorphodict (raw|pgf) <configfile> <datafile> <outfile>"
main = do
xx <- getArgs
if length xx /= 4
then putStrLn usage
then do
putStrLn "Usage:"
putStrLn usage
putStrLn $ "Got instead: " ++ show xx
else do
let mode:configfile:datafile:outfile:_ = xx
config <- readFile configfile >>= return . mkConfig
@@ -36,6 +41,7 @@ main = do
rawdata <- case mode of
"pgf" -> pgfFile2rawData config datafile
"raw" -> readFile datafile >>= return . map getRawData . filter (not . null) . lines
_ -> error $ "Expected mode (pgf|raw), got " ++ mode
rawdata2gf config rawdata outfile
@@ -88,8 +94,9 @@ mkConfig ls = M.fromList [(c,i) | Left (c,i) <- map mkOne (lines ls)]
cat:":":tcat:oper:ints -> Left (cat,(tcat,oper,mkArgs ints))
_ -> Right s
mkArgs ints = case break (=="#") ints of
(ss,[]) -> (map read ss, [])
(ss,_:fs) -> (map read ss, map read fs)
(ss,[]) -> (map read' ss, [])
(ss,_:fs) -> (map read' ss, map read' fs)
read' a = readNote [] a -- Safe.readNote provides better error message
getRawData s = case words s of
c:cs -> (c,cs)
@@ -118,10 +125,13 @@ mkMorphoDict env =
(([lemma],newcat),(oper, appSig sig args)) |
(oldcat,args) <- raws,
Just (newcat, oper, sig) <- [M.lookup oldcat (config env)],
let lemma = args !! head (fst sig)
let lemma = args `at` head (fst sig)
]
appSig (ints,feats) args = ([args !! i | i <- ints], [args !! i | i <- feats])
appSig (ints,feats) args =
-- If there's wrong number in config file, uncomment the line below to see which number it should be
-- trace (intercalate "\n" $ map show (zip [0..] args)) $
([args `at` i | i <- ints], [args `at` i | i <- feats])
mergeRules :: [RawRule] -> [RawRule]
mergeRules = map head . groupBy (\x y -> snd x == snd y) . sortOn snd
@@ -153,14 +163,18 @@ mkMorphoDict env =
_ | length (nub (map tail fls)) == length fls -> shrinkMore (map tail fls)
_ -> fls
mkFun = showCId . mkCId . concat . intersperse "_"
-- >>> mkFun ["hello", "world", "hello friends", "hello-all"]
-- "hello_world_hello_friends_hello_all"
mkFun :: [String] -> String -- if word contains space or hyphen, replace with underscore
mkFun = showCId . mkCId . concat . intersperse "_" . concatMap (words . removeHyphen)
where
removeHyphen [] = []
removeHyphen ['-'] = ['-'] -- If hyphen is the last character, it's usually meaningful, leave it
removeHyphen ('-':cs) = ' ' : removeHyphen cs
removeHyphen (c:cs) = c : removeHyphen cs
quote s = "\"" ++ s ++ "\""
{- ---- let us ignore this
findCompounds :: [RuleData] -> [RuleData]
findCompounds = getCompounds . sortOn cat_orthrevforms

View File

@@ -0,0 +1,6 @@
N : N mkN 0 1 2 4 7 13 14 16 17 19
A : A mkA' 0 1 2 4 7 13 14 16 17 19
V : V mkV 0 17 19 22 43 49 23 25 31 58 94 37
V2 : V mkV 0 17 19 22 43 49 23 25 31 58 94 37
Adv : Adv mkAdv 0
Prep : Prep mkPrep 0

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,13 @@
concrete MorphoDictFin of MorphoDictFinAbs = CatFin ** open
ParadigmsFin,
-- MorphoFin,
Kotus
-- Prelude
in {
-- extracted from http://kaino.kotus.fi/sanat/nykysuomi/, licensed under LGPL
flags coding = utf8 ;
oper mkA' : (x1,_,_,_,_,_,_,_,_,x10 : Str) -> A = \a,b,c,d,e,f,g,h,i,j -> mkA (mkN a b c d e f g h i j) ; -- Need a single worst-case paradigm for how config is implemented

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,4 @@
abstract MorphoDictFinAbs =
Cat [N,A,V,Adv,Prep] **
{

View File

@@ -0,0 +1,20 @@
name: morphodict
version: 0.1
homepage: https://github.com/GrammaticalFramework/gf-rgl/tree/master/src/morphodict
author: Aarne Ranta
category: Natural Language Processing
build-type: Simple
extra-source-files: README.md
cabal-version: >=1.10
executable MkMorphoDict
hs-source-dirs:
.
main-is: MkMorphoDict.hs
other-modules:
build-depends:
base,
containers,
safe,
gf
default-language: Haskell2010

16
src/morphodict/stack.yaml Normal file
View File

@@ -0,0 +1,16 @@
resolver: lts-12.26
packages:
- .
# so that `stack build --copy-bins` puts bin here
local-bin-path: .
extra-deps:
- gf-3.10
- cgi-3001.3.0.3 # dependency of gf
flags:
# this excludes PGF2 module in gf package
gf:
c-runtime: false

View File

@@ -0,0 +1,26 @@
# This file was autogenerated by Stack.
# You should not edit this file by hand.
# For more information, please see the documentation at:
# https://docs.haskellstack.org/en/stable/lock_files
packages:
- completed:
hackage: gf-3.10@sha256:6f851dfaab5e1f9d4f3796515b86f78806a2bb305136a902713dfc2b92d9cfb0,8477
pantry-tree:
size: 64924
sha256: 66332577ff42a42eed767f451f53266e1020b72749cdcdf7387933615d5de091
original:
hackage: gf-3.10
- completed:
hackage: cgi-3001.3.0.3@sha256:4f3768d09e4a6620642588cab2e99d83c1b6b542dad6147d0af9532170036115,2076
pantry-tree:
size: 667
sha256: 65f6fd4574cffd1e5e2490c133b7ba58fd2fea0a65d81f1fa6fe14f08025629b
original:
hackage: cgi-3001.3.0.3
snapshots:
- completed:
size: 509471
url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/12/26.yaml
sha256: 95f014df58d0679b1c4a2b7bf2b652b61da8d30de5f571abb0d59015ef678646
original: lts-12.26

View File

@@ -0,0 +1,51 @@
#!/bin/bash
USAGE="usage: ./remove_sense_distinctions.sh <concrete syntax file>"
NOTE="This is not extremely useful, it will just create a file with only those entries that are homonymous in dictionary form, but differ in other forms. The purpose of the file is for you to look at/do small experiments with. The real job is done in MkMorphoDict.hs."
# String manipulation
CONC=$1 # e.g. MorphoDictFin.gf
BAK="$CONC.bak" # e.g. MorphoDictFin.gf.bak
NAME=`echo $CONC | cut -f 1 -d '.'` # e.g. MorphoDictFin
ABS="${NAME}Abs.gf" # e.g. MorphoDictFinAbs.gf
CONC_HEADER="$NAME.header" # e.g. MorphoDictFin.header
ABS_HEADER="${NAME}Abs.header" # e.g. MorphoDictFinAbs.header
find_duplicates() {
echo "Putting (temporarily) only homonyms in $CONC"
echo "cat $CONC_HEADER > $CONC"
cat $CONC_HEADER > $CONC
DUPLS=`cut -f 2 -d ' ' /tmp/$CONC \
| sort | uniq -c | sort -nr \
| egrep "^ +1?[2-9][0-9]? [a-zåäö]+_" \
| tr -d '[0-9][A-ZÅÄÖ]'`
for d in $DUPLS
do
grep "lin $d" $BAK >> $CONC
done
echo "}" >> $CONC
}
remove_numbers() {
echo "cp $CONC{,.bak}"
cp $CONC{,.bak}
echo "cat $CONC | sed -E 's/_[0-9]_/_/g' | uniq > /tmp/$CONC"
cat $CONC | sed -E 's/_[0-9]_/_/g' | uniq > /tmp/$CONC
echo "Done removing numbers."
}
#### Action starts here
echo $NOTE
if [[ $CONC == *"Abs.gf" ]]
then
echo $USAGE
else
remove_numbers
find_duplicates
# echo "gf -v=0 -make $CONC"
# gf -v=0 -make $CONC
echo "$CONC contains now only homonyms. Original file is found in $BAK."
fi