mirror of
https://github.com/GrammaticalFramework/gf-rgl.git
synced 2026-05-28 01:18:57 -06:00
Merge pull request #370 from inariksit/morphodict
Minor improvements in MkMorphodict.hs + Finnish lexicon in new format
This commit is contained in:
@@ -19885,7 +19885,7 @@ lin moniaalle_Adv = mkAdv {s = c99 "moniaalle"} ;
|
|||||||
lin moniaalta_Adv = mkAdv {s = c99 "moniaalta"} ;
|
lin moniaalta_Adv = mkAdv {s = c99 "moniaalta"} ;
|
||||||
lin monialaistua_V = mkV {s = c52 "monialaistua"} ;
|
lin monialaistua_V = mkV {s = c52 "monialaistua"} ;
|
||||||
lin monias_N = mkN {s = d41 "monias"} ;
|
lin monias_N = mkN {s = d41 "monias"} ;
|
||||||
lin moni_ilmeinen_N = mkN {s = d18 "moni-ilmeinen"} ;
|
lin moni_ilmeinen_N = mkN {s = d38 "moni-ilmeinen"} ;
|
||||||
lin monijumalaisuus_N = mkN {s = d40 "monijumalaisuus"} ;
|
lin monijumalaisuus_N = mkN {s = d40 "monijumalaisuus"} ;
|
||||||
lin monikansainen_N = mkN {s = d38 "monikansainen"} ;
|
lin monikansainen_N = mkN {s = d38 "monikansainen"} ;
|
||||||
lin monikko_N = mkN {s = d04A "monikko"} ;
|
lin monikko_N = mkN {s = d04A "monikko"} ;
|
||||||
@@ -41387,7 +41387,7 @@ lin ykkönen_N = mkN {s = d38 "ykkönen"} ;
|
|||||||
lin yks_Adv = mkAdv {s = c99 "yks"} ;
|
lin yks_Adv = mkAdv {s = c99 "yks"} ;
|
||||||
lin ykseys_N = mkN {s = d40 "ykseys"} ;
|
lin ykseys_N = mkN {s = d40 "ykseys"} ;
|
||||||
lin yksi_N = mkN {s = d31 "yksi"} ;
|
lin yksi_N = mkN {s = d31 "yksi"} ;
|
||||||
lin yksi_ilmeinen_N = mkN {s = d18 "yksi-ilmeinen"} ;
|
lin yksi_ilmeinen_N = mkN {s = d38 "yksi-ilmeinen"} ;
|
||||||
lin yksiin_Adv = mkAdv {s = c99 "yksiin"} ;
|
lin yksiin_Adv = mkAdv {s = c99 "yksiin"} ;
|
||||||
lin yksijumalaisuus_N = mkN {s = d40 "yksijumalaisuus"} ;
|
lin yksijumalaisuus_N = mkN {s = d40 "yksijumalaisuus"} ;
|
||||||
lin yksikkö_N = mkN {s = d04A "yksikkö"} ;
|
lin yksikkö_N = mkN {s = d04A "yksikkö"} ;
|
||||||
|
|||||||
@@ -5,7 +5,9 @@ import PGF
|
|||||||
import qualified Data.Map as M
|
import qualified Data.Map as M
|
||||||
import Data.Char
|
import Data.Char
|
||||||
import Data.List
|
import Data.List
|
||||||
|
import Safe
|
||||||
import System.Environment (getArgs)
|
import System.Environment (getArgs)
|
||||||
|
import Debug.Trace
|
||||||
|
|
||||||
-- AR 2020-02-28
|
-- AR 2020-02-28
|
||||||
|
|
||||||
@@ -28,29 +30,33 @@ usage = "runghc MkMorphodict (raw|pgf) <configfile> <datafile> <outfile>"
|
|||||||
main = do
|
main = do
|
||||||
xx <- getArgs
|
xx <- getArgs
|
||||||
if length xx /= 4
|
if length xx /= 4
|
||||||
then putStrLn usage
|
then do
|
||||||
|
putStrLn "Usage:"
|
||||||
|
putStrLn usage
|
||||||
|
putStrLn $ "Got instead: " ++ show xx
|
||||||
else do
|
else do
|
||||||
let mode:configfile:datafile:outfile:_ = xx
|
let mode:configfile:datafile:outfile:_ = xx
|
||||||
config <- readFile configfile >>= return . mkConfig
|
config <- readFile configfile >>= return . mkConfig
|
||||||
|
|
||||||
rawdata <- case mode of
|
rawdata <- case mode of
|
||||||
"pgf" -> pgfFile2rawData config datafile
|
"pgf" -> pgfFile2rawData config datafile
|
||||||
"raw" -> readFile datafile >>= return . map getRawData . filter (not . null) . lines
|
"raw" -> readFile datafile >>= return . map getRawData . filter (not . null) . lines
|
||||||
|
_ -> error $ "Expected mode (pgf|raw), got " ++ mode
|
||||||
rawdata2gf config rawdata outfile
|
rawdata2gf config rawdata outfile
|
||||||
|
|
||||||
|
|
||||||
rawdata2gf config rawdata outfile = do
|
rawdata2gf config rawdata outfile = do
|
||||||
|
|
||||||
let env = MDEnv rawdata config
|
let env = MDEnv rawdata config
|
||||||
let (absrules,cncrules) = mkMorphoDict env
|
let (absrules,cncrules) = mkMorphoDict env
|
||||||
|
|
||||||
absheader <- readFile (outfile ++ "Abs.header")
|
absheader <- readFile (outfile ++ "Abs.header")
|
||||||
cncheader <- readFile (outfile ++ ".header")
|
cncheader <- readFile (outfile ++ ".header")
|
||||||
|
|
||||||
writeFile (outfile ++ "Abs.gf") absheader
|
writeFile (outfile ++ "Abs.gf") absheader
|
||||||
appendFile (outfile ++ "Abs.gf") $ unlines $ sort absrules
|
appendFile (outfile ++ "Abs.gf") $ unlines $ sort absrules
|
||||||
appendFile (outfile ++ "Abs.gf") "}"
|
appendFile (outfile ++ "Abs.gf") "}"
|
||||||
|
|
||||||
writeFile (outfile ++ ".gf") cncheader
|
writeFile (outfile ++ ".gf") cncheader
|
||||||
appendFile (outfile ++ ".gf") $ unlines $ sort cncrules
|
appendFile (outfile ++ ".gf") $ unlines $ sort cncrules
|
||||||
appendFile (outfile ++ ".gf") "}"
|
appendFile (outfile ++ ".gf") "}"
|
||||||
@@ -68,7 +74,7 @@ pgfFile2rawData config pgffile = do
|
|||||||
cat <- cats,
|
cat <- cats,
|
||||||
f <- functionsByCat pgf (mkCId cat),
|
f <- functionsByCat pgf (mkCId cat),
|
||||||
lin <- tabularLinearizes pgf lang (mkApp f [])
|
lin <- tabularLinearizes pgf lang (mkApp f [])
|
||||||
]
|
]
|
||||||
|
|
||||||
type Cat = String
|
type Cat = String
|
||||||
type Fun = String
|
type Fun = String
|
||||||
@@ -84,12 +90,13 @@ mkConfig :: String -> Config -- N : N mkN 0 2 4 6 # 9
|
|||||||
mkConfig ls = M.fromList [(c,i) | Left (c,i) <- map mkOne (lines ls)]
|
mkConfig ls = M.fromList [(c,i) | Left (c,i) <- map mkOne (lines ls)]
|
||||||
where
|
where
|
||||||
mkOne s = case words s of
|
mkOne s = case words s of
|
||||||
"--":_ -> Right s
|
"--":_ -> Right s
|
||||||
cat:":":tcat:oper:ints -> Left (cat,(tcat,oper,mkArgs ints))
|
cat:":":tcat:oper:ints -> Left (cat,(tcat,oper,mkArgs ints))
|
||||||
_ -> Right s
|
_ -> Right s
|
||||||
mkArgs ints = case break (=="#") ints of
|
mkArgs ints = case break (=="#") ints of
|
||||||
(ss,[]) -> (map read ss, [])
|
(ss,[]) -> (map read' ss, [])
|
||||||
(ss,_:fs) -> (map read ss, map read fs)
|
(ss,_:fs) -> (map read' ss, map read' fs)
|
||||||
|
read' a = readNote [] a -- Safe.readNote provides better error message
|
||||||
|
|
||||||
getRawData s = case words s of
|
getRawData s = case words s of
|
||||||
c:cs -> (c,cs)
|
c:cs -> (c,cs)
|
||||||
@@ -118,10 +125,13 @@ mkMorphoDict env =
|
|||||||
(([lemma],newcat),(oper, appSig sig args)) |
|
(([lemma],newcat),(oper, appSig sig args)) |
|
||||||
(oldcat,args) <- raws,
|
(oldcat,args) <- raws,
|
||||||
Just (newcat, oper, sig) <- [M.lookup oldcat (config env)],
|
Just (newcat, oper, sig) <- [M.lookup oldcat (config env)],
|
||||||
let lemma = args !! head (fst sig)
|
let lemma = args `at` head (fst sig)
|
||||||
]
|
]
|
||||||
|
|
||||||
appSig (ints,feats) args = ([args !! i | i <- ints], [args !! i | i <- feats])
|
appSig (ints,feats) args =
|
||||||
|
-- If there's wrong number in config file, uncomment the line below to see which number it should be
|
||||||
|
-- trace (intercalate "\n" $ map show (zip [0..] args)) $
|
||||||
|
([args `at` i | i <- ints], [args `at` i | i <- feats])
|
||||||
|
|
||||||
mergeRules :: [RawRule] -> [RawRule]
|
mergeRules :: [RawRule] -> [RawRule]
|
||||||
mergeRules = map head . groupBy (\x y -> snd x == snd y) . sortOn snd
|
mergeRules = map head . groupBy (\x y -> snd x == snd y) . sortOn snd
|
||||||
@@ -153,16 +163,20 @@ mkMorphoDict env =
|
|||||||
_ | length (nub (map tail fls)) == length fls -> shrinkMore (map tail fls)
|
_ | length (nub (map tail fls)) == length fls -> shrinkMore (map tail fls)
|
||||||
_ -> fls
|
_ -> fls
|
||||||
|
|
||||||
|
-- >>> mkFun ["hello", "world", "hello friends", "hello-all"]
|
||||||
mkFun = showCId . mkCId . concat . intersperse "_"
|
-- "hello_world_hello_friends_hello_all"
|
||||||
|
mkFun :: [String] -> String -- if word contains space or hyphen, replace with underscore
|
||||||
|
mkFun = showCId . mkCId . concat . intersperse "_" . concatMap (words . removeHyphen)
|
||||||
|
where
|
||||||
|
removeHyphen [] = []
|
||||||
|
removeHyphen ['-'] = ['-'] -- If hyphen is the last character, it's usually meaningful, leave it
|
||||||
|
removeHyphen ('-':cs) = ' ' : removeHyphen cs
|
||||||
|
removeHyphen (c:cs) = c : removeHyphen cs
|
||||||
|
|
||||||
quote s = "\"" ++ s ++ "\""
|
quote s = "\"" ++ s ++ "\""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
{- ---- let us ignore this
|
{- ---- let us ignore this
|
||||||
findCompounds :: [RuleData] -> [RuleData]
|
findCompounds :: [RuleData] -> [RuleData]
|
||||||
findCompounds = getCompounds . sortOn cat_orthrevforms
|
findCompounds = getCompounds . sortOn cat_orthrevforms
|
||||||
|
|
||||||
cat_orthrevforms (_,(cat,_:forms)) = (cat,[map (!!i) fss | let fss = map reverse forms, i <- [0..minimum (map length fss) - 1]])
|
cat_orthrevforms (_,(cat,_:forms)) = (cat,[map (!!i) fss | let fss = map reverse forms, i <- [0..minimum (map length fss) - 1]])
|
||||||
@@ -171,9 +185,9 @@ quote s = "\"" ++ s ++ "\""
|
|||||||
revstem = head . snd . cat_revforms
|
revstem = head . snd . cat_revforms
|
||||||
wforms (_,(_,_:forms)) = forms
|
wforms (_,(_,_:forms)) = forms
|
||||||
|
|
||||||
getCompounds :: [RuleData] -> [RuleData]
|
getCompounds :: [RuleData] -> [RuleData]
|
||||||
getCompounds fls = case fls of
|
getCompounds fls = case fls of
|
||||||
fl : fls1 | length (revstem fl) < 2 -> markWith fl [] : getCompounds fls1
|
fl : fls1 | length (revstem fl) < 2 -> markWith fl [] : getCompounds fls1
|
||||||
fl : fls2 -> case span (\x -> and [isPrefixOf (reverse w) (reverse w1) | (w,w1) <- zip (wforms fl) (wforms x)]) fls2 of
|
fl : fls2 -> case span (\x -> and [isPrefixOf (reverse w) (reverse w1) | (w,w1) <- zip (wforms fl) (wforms x)]) fls2 of
|
||||||
([],_:_) -> markWith fl [] : getCompounds fls2
|
([],_:_) -> markWith fl [] : getCompounds fls2
|
||||||
(fls1,fls3) -> markWith fl [] : map (markCompound fl) fls1 ++ getCompounds fls3
|
(fls1,fls3) -> markWith fl [] : map (markCompound fl) fls1 ++ getCompounds fls3
|
||||||
@@ -188,7 +202,7 @@ quote s = "\"" ++ s ++ "\""
|
|||||||
|
|
||||||
isPrefixWord x xy =
|
isPrefixWord x xy =
|
||||||
length suff > 1 && ---- compound first part must be at least two letters long
|
length suff > 1 && ---- compound first part must be at least two letters long
|
||||||
any (\c -> elem c "-0123456789aeiouyåäö") suff && ---- must contain a vowel or a digit
|
any (\c -> elem c "-0123456789aeiouyåäö") suff && ---- must contain a vowel or a digit
|
||||||
isPrefixOf x xy ---- and of course be a prefix
|
isPrefixOf x xy ---- and of course be a prefix
|
||||||
where
|
where
|
||||||
suff = drop (length x) xy
|
suff = drop (length x) xy
|
||||||
|
|||||||
6
src/morphodict/MorphoDictFin.config
Normal file
6
src/morphodict/MorphoDictFin.config
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
N : N mkN 0 1 2 4 7 13 14 16 17 19
|
||||||
|
A : A mkA' 0 1 2 4 7 13 14 16 17 19
|
||||||
|
V : V mkV 0 17 19 22 43 49 23 25 31 58 94 37
|
||||||
|
V2 : V mkV 0 17 19 22 43 49 23 25 31 58 94 37
|
||||||
|
Adv : Adv mkAdv 0
|
||||||
|
Prep : Prep mkPrep 0
|
||||||
File diff suppressed because it is too large
Load Diff
13
src/morphodict/MorphoDictFin.header
Normal file
13
src/morphodict/MorphoDictFin.header
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
concrete MorphoDictFin of MorphoDictFinAbs = CatFin ** open
|
||||||
|
ParadigmsFin,
|
||||||
|
-- MorphoFin,
|
||||||
|
Kotus
|
||||||
|
-- Prelude
|
||||||
|
in {
|
||||||
|
|
||||||
|
-- extracted from http://kaino.kotus.fi/sanat/nykysuomi/, licensed under LGPL
|
||||||
|
|
||||||
|
flags coding = utf8 ;
|
||||||
|
|
||||||
|
oper mkA' : (x1,_,_,_,_,_,_,_,_,x10 : Str) -> A = \a,b,c,d,e,f,g,h,i,j -> mkA (mkN a b c d e f g h i j) ; -- Need a single worst-case paradigm for how config is implemented
|
||||||
|
|
||||||
File diff suppressed because it is too large
Load Diff
4
src/morphodict/MorphoDictFinAbs.header
Normal file
4
src/morphodict/MorphoDictFinAbs.header
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
abstract MorphoDictFinAbs =
|
||||||
|
Cat [N,A,V,Adv,Prep] **
|
||||||
|
{
|
||||||
|
|
||||||
20
src/morphodict/morphodict.cabal
Normal file
20
src/morphodict/morphodict.cabal
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
name: morphodict
|
||||||
|
version: 0.1
|
||||||
|
homepage: https://github.com/GrammaticalFramework/gf-rgl/tree/master/src/morphodict
|
||||||
|
author: Aarne Ranta
|
||||||
|
category: Natural Language Processing
|
||||||
|
build-type: Simple
|
||||||
|
extra-source-files: README.md
|
||||||
|
cabal-version: >=1.10
|
||||||
|
|
||||||
|
executable MkMorphoDict
|
||||||
|
hs-source-dirs:
|
||||||
|
.
|
||||||
|
main-is: MkMorphoDict.hs
|
||||||
|
other-modules:
|
||||||
|
build-depends:
|
||||||
|
base,
|
||||||
|
containers,
|
||||||
|
safe,
|
||||||
|
gf
|
||||||
|
default-language: Haskell2010
|
||||||
16
src/morphodict/stack.yaml
Normal file
16
src/morphodict/stack.yaml
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
resolver: lts-12.26
|
||||||
|
|
||||||
|
packages:
|
||||||
|
- .
|
||||||
|
|
||||||
|
# so that `stack build --copy-bins` puts bin here
|
||||||
|
local-bin-path: .
|
||||||
|
|
||||||
|
extra-deps:
|
||||||
|
- gf-3.10
|
||||||
|
- cgi-3001.3.0.3 # dependency of gf
|
||||||
|
|
||||||
|
flags:
|
||||||
|
# this excludes PGF2 module in gf package
|
||||||
|
gf:
|
||||||
|
c-runtime: false
|
||||||
26
src/morphodict/stack.yaml.lock
Normal file
26
src/morphodict/stack.yaml.lock
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
# This file was autogenerated by Stack.
|
||||||
|
# You should not edit this file by hand.
|
||||||
|
# For more information, please see the documentation at:
|
||||||
|
# https://docs.haskellstack.org/en/stable/lock_files
|
||||||
|
|
||||||
|
packages:
|
||||||
|
- completed:
|
||||||
|
hackage: gf-3.10@sha256:6f851dfaab5e1f9d4f3796515b86f78806a2bb305136a902713dfc2b92d9cfb0,8477
|
||||||
|
pantry-tree:
|
||||||
|
size: 64924
|
||||||
|
sha256: 66332577ff42a42eed767f451f53266e1020b72749cdcdf7387933615d5de091
|
||||||
|
original:
|
||||||
|
hackage: gf-3.10
|
||||||
|
- completed:
|
||||||
|
hackage: cgi-3001.3.0.3@sha256:4f3768d09e4a6620642588cab2e99d83c1b6b542dad6147d0af9532170036115,2076
|
||||||
|
pantry-tree:
|
||||||
|
size: 667
|
||||||
|
sha256: 65f6fd4574cffd1e5e2490c133b7ba58fd2fea0a65d81f1fa6fe14f08025629b
|
||||||
|
original:
|
||||||
|
hackage: cgi-3001.3.0.3
|
||||||
|
snapshots:
|
||||||
|
- completed:
|
||||||
|
size: 509471
|
||||||
|
url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/12/26.yaml
|
||||||
|
sha256: 95f014df58d0679b1c4a2b7bf2b652b61da8d30de5f571abb0d59015ef678646
|
||||||
|
original: lts-12.26
|
||||||
51
src/morphodict/utils/only_homonyms.sh
Executable file
51
src/morphodict/utils/only_homonyms.sh
Executable file
@@ -0,0 +1,51 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
USAGE="usage: ./remove_sense_distinctions.sh <concrete syntax file>"
|
||||||
|
NOTE="This is not extremely useful, it will just create a file with only those entries that are homonymous in dictionary form, but differ in other forms. The purpose of the file is for you to look at/do small experiments with. The real job is done in MkMorphoDict.hs."
|
||||||
|
|
||||||
|
# String manipulation
|
||||||
|
CONC=$1 # e.g. MorphoDictFin.gf
|
||||||
|
BAK="$CONC.bak" # e.g. MorphoDictFin.gf.bak
|
||||||
|
|
||||||
|
NAME=`echo $CONC | cut -f 1 -d '.'` # e.g. MorphoDictFin
|
||||||
|
ABS="${NAME}Abs.gf" # e.g. MorphoDictFinAbs.gf
|
||||||
|
CONC_HEADER="$NAME.header" # e.g. MorphoDictFin.header
|
||||||
|
ABS_HEADER="${NAME}Abs.header" # e.g. MorphoDictFinAbs.header
|
||||||
|
|
||||||
|
find_duplicates() {
|
||||||
|
echo "Putting (temporarily) only homonyms in $CONC"
|
||||||
|
echo "cat $CONC_HEADER > $CONC"
|
||||||
|
cat $CONC_HEADER > $CONC
|
||||||
|
DUPLS=`cut -f 2 -d ' ' /tmp/$CONC \
|
||||||
|
| sort | uniq -c | sort -nr \
|
||||||
|
| egrep "^ +1?[2-9][0-9]? [a-zåäö]+_" \
|
||||||
|
| tr -d '[0-9][A-ZÅÄÖ]'`
|
||||||
|
for d in $DUPLS
|
||||||
|
do
|
||||||
|
grep "lin $d" $BAK >> $CONC
|
||||||
|
done
|
||||||
|
echo "}" >> $CONC
|
||||||
|
}
|
||||||
|
|
||||||
|
remove_numbers() {
|
||||||
|
echo "cp $CONC{,.bak}"
|
||||||
|
cp $CONC{,.bak}
|
||||||
|
echo "cat $CONC | sed -E 's/_[0-9]_/_/g' | uniq > /tmp/$CONC"
|
||||||
|
cat $CONC | sed -E 's/_[0-9]_/_/g' | uniq > /tmp/$CONC
|
||||||
|
echo "Done removing numbers."
|
||||||
|
}
|
||||||
|
|
||||||
|
#### Action starts here
|
||||||
|
|
||||||
|
echo $NOTE
|
||||||
|
|
||||||
|
if [[ $CONC == *"Abs.gf" ]]
|
||||||
|
then
|
||||||
|
echo $USAGE
|
||||||
|
else
|
||||||
|
remove_numbers
|
||||||
|
find_duplicates
|
||||||
|
# echo "gf -v=0 -make $CONC"
|
||||||
|
# gf -v=0 -make $CONC
|
||||||
|
echo "$CONC contains now only homonyms. Original file is found in $BAK."
|
||||||
|
fi
|
||||||
Reference in New Issue
Block a user