1
0
forked from GitHub/gf-rgl
This commit is contained in:
krangelov
2021-07-02 21:29:12 +02:00
24 changed files with 61482 additions and 62253 deletions

View File

@@ -47,6 +47,7 @@ abstract Extend = Cat ** {
PredVPS : NP -> VPS -> S ; -- she [has walked and won't sleep]
SQuestVPS : NP -> VPS -> QS ; -- has she walked
QuestVPS : IP -> VPS -> QS ; -- who has walked
RelVPS : RP -> VPS -> RS ; -- which won't sleep
-- existentials that work in the absence of Cl
ExistS : Temp -> Pol -> NP -> S ; -- there was a party
@@ -286,4 +287,6 @@ fun UseDAP : DAP -> NP ;
UseDAPMasc : DAP -> NP ;
UseDAPFem : DAP -> NP ;
cat X ; -- for words that are difficult to classify, mainly for MorphoDict
}

View File

@@ -4,6 +4,8 @@ lincat
RNP = Grammar.NP ;
RNPList = Grammar.ListNP ;
X = {s : Str} ; -- for words that are difficult to classify, mainly for MorphoDict
lin
BaseVPS = variants {} ;
ConsVPS = variants {} ;
@@ -26,6 +28,7 @@ lin
MkVPS vp = variants {} ; -- Temp -> Pol -> VP -> VPS ; -- hasn't slept
ConjVPS = variants {} ; -- Conj -> [VPS] -> VPS ; -- has walked and won't sleep
PredVPS = variants {} ; -- NP -> VPS -> S ; -- has walked and won't sleep
RelVPS = variants {} ; -- RP -> VPS -> RS ; -- which won't sleep
MkVPI vp = variants {} ; -- Temp -> Pol -> VP -> VPI ; -- to sleep / hasn't slept
ConjVPI = variants {} ; -- Conj -> [VPI] -> VPI ; -- has walked and won't sleep
ComplVPIVV = variants {} ; -- VV -> VPI -> VP ; -- want to sleep and to walk

View File

@@ -5,7 +5,7 @@ concrete ExtendEng of Extend =
[
VPS, ListVPS, VPI, ListVPI, VPS2, ListVPS2, VPI2, ListVPI2, RNP, RNPList,
AdAdV, AdjAsCN, AdjAsNP, ApposNP, AdvIsNP,
MkVPS, BaseVPS, ConsVPS, ConjVPS, PredVPS, QuestVPS, SQuestVPS,
MkVPS, BaseVPS, ConsVPS, ConjVPS, PredVPS, QuestVPS, SQuestVPS, RelVPS,
MkVPI, BaseVPI, ConsVPI, ConjVPI, ComplVPIVV,
MkVPS2, BaseVPS2, ConsVPS2, ConjVPS2, ComplVPS2, ReflVPS2,
MkVPI2, BaseVPI2, ConsVPI2, ConjVPI2, ComplVPI2,
@@ -127,6 +127,11 @@ concrete ExtendEng of Extend =
QuestVPS ip vps = let vp = vps.s ! oDir ! toAgr ip.n P3 Neutr in {
s = \\q => ip.s ! npNom ++ vp.fin ++ vp.inf
} ;
RelVPS rp vps = {
s = \\agr => let vp = vps.s ! oDir ! agr in
rp.s ! RC (fromAgr agr).g npNom ++ vp.fin ++ vp.inf ;
c = npNom ;
} ;
MkVPI vp = mkVPI (lin VP vp) ;
@@ -177,7 +182,7 @@ concrete ExtendEng of Extend =
let
verb = vp.s ! t.t ! t.a ! p.p ! o ! a ; -- choice of Order determines aux or not
compl = vp.s2 ! a ++ vp.ext
in {fin = verb.aux ;
in {fin = verb.aux ++ t.s ++ p.s ;
inf = verb.adv ++ vp.ad ! a ++ verb.fin ++ verb.inf ++ vp.p ++ compl} ;
} ;

View File

@@ -13,7 +13,8 @@ flags optimize=values ;
bend_V = irregV "bend" "bent" "bent" ;
beset_V = irregV "beset" "beset" "beset" ;
bet_V = irregDuplV "bet" "bet" "bet" ;
bid_V = irregDuplV "bid" (variants {"bid" ; "bade"}) (variants {"bid" ; "bidden"}) ;
bid_V = irregDuplV "bid" "bid" "bid" ;
bid_bade_V = irregDuplV "bid" "bade" "bidden" ;
bind_V = irregV "bind" "bound" "bound" ;
bite_V = irregV "bite" "bit" "bitten" ;
bleed_V = irregV "bleed" "bled" "bled" ;
@@ -23,7 +24,7 @@ flags optimize=values ;
bring_V = irregV "bring" "brought" "brought" ;
broadcast_V = irregV "broadcast" "broadcast" "broadcast" ;
build_V = irregV "build" "built" "built" ;
burn_V = irregV "burn" (variants {"burned" ; "burnt"}) (variants {"burned" ; "burnt"}) ;
burn_V = irregV "burn" "burnt" "burnt" ;
burst_V = irregV "burst" "burst" "burst" ;
buy_V = irregV "buy" "bought" "bought" ;
cast_V = irregV "cast" "cast" "cast" ;
@@ -36,10 +37,10 @@ flags optimize=values ;
cut_V = irregDuplV "cut" "cut" "cut" ;
deal_V = irregV "deal" "dealt" "dealt" ;
dig_V = irregDuplV "dig" "dug" "dug" ;
dive_V = irregV "dive" (variants {"dived" ; "dove"}) "dived" ;
dive_V = irregV "dive" "dove" "dived" ;
do_V = mk5V "do" "does" "did" "done" "doing" ;
draw_V = irregV "draw" "drew" "drawn" ;
dream_V = irregV "dream" (variants {"dreamed" ; "dreamt"}) (variants {"dreamed" ; "dreamt"}) ;
dream_V = irregV "dream" "dreamt" "dreamt" ;
drive_V = irregV "drive" "drove" "driven" ;
drink_V = irregV "drink" "drank" "drunk" ;
eat_V = irregV "eat" "ate" "eaten" ;
@@ -75,27 +76,27 @@ flags optimize=values ;
know_V = irregV "know" "knew" "known" ;
lay_V = irregV "lay" "laid" "laid" ;
lead_V = irregV "lead" "led" "led" ;
leap_V = irregV "leap" (variants {"leaped" ; "lept"}) (variants {"leaped" ; "lept"}) ;
learn_V = irregV "learn" (variants {"learned" ; "learnt"}) (variants {"learned" ; "learnt"}) ;
leap_V = irregV "leap" "lept" "lept" ;
learn_V = irregV "learn" "learnt" "learnt" ;
leave_V = irregV "leave" "left" "left" ;
lend_V = irregV "lend" "lent" "lent" ;
let_V = irregDuplV "let" "let" "let" ;
lie_V = irregV "lie" "lay" "lain" ;
light_V = irregV "light" (variants {"lighted" ; "lit"}) "lighted" ;
light_V = irregV "light" "lit" "lit" ;
lose_V = irregV "lose" "lost" "lost" ;
make_V = irregV "make" "made" "made" ;
mean_V = irregV "mean" "meant" "meant" ;
meet_V = irregV "meet" "met" "met" ;
misspell_V = irregV "misspell" (variants {"misspelled" ; "misspelt"}) (variants {"misspelled" ; "misspelt"}) ;
misspell_V = irregV "misspell" "misspelt" "misspelt" ;
mistake_V = irregV "mistake" "mistook" "mistaken" ;
mow_V = irregV "mow" "mowed" (variants {"mowed" ; "mown"}) ;
mow_V = irregV "mow" "mowed" "mown" ;
overcome_V = irregV "overcome" "overcame" "overcome" ;
overdo_V = mk5V "overdo" "overdoes" "overdid" "overdone" "overdoing" ;
overtake_V = irregV "overtake" "overtook" "overtaken" ;
overthrow_V = irregV "overthrow" "overthrew" "overthrown" ;
pay_V = irregV "pay" "paid" "paid" ;
plead_V = irregV "plead" "pled" "pled" ;
prove_V = irregV "prove" "proved" (variants {"proved" ; "proven"}) ;
prove_V = irregV "prove" "proved" "proven" ;
put_V = irregDuplV "put" "put" "put" ;
quit_V = irregDuplV "quit" "quit" "quit" ;
read_V = irregV "read" "read" "read" ;
@@ -104,22 +105,22 @@ flags optimize=values ;
ring_V = irregV "ring" "rang" "rung" ;
rise_V = irregV "rise" "rose" "risen" ;
run_V = irregDuplV "run" "ran" "run" ;
saw_V = irregV "saw" "sawed" (variants {"sawed" ; "sawn"}) ;
saw_V = irregV "saw" "sawed" "sawn" ;
say_V = irregV "say" "said" "said" ;
see_V = irregV "see" "saw" "seen" ;
seek_V = irregV "seek" "sought" "sought" ;
sell_V = irregV "sell" "sold" "sold" ;
send_V = irregV "send" "sent" "sent" ;
set_V = irregDuplV "set" "set" "set" ;
sew_V = irregV "sew" "sewed" (variants {"sewed" ; "sewn"}) ;
sew_V = irregV "sew" "sewed" "sewn" ;
shake_V = irregV "shake" "shook" "shaken" ;
shave_V = irregV "shave" "shaved" (variants {"shaved" ; "shaven"}) ;
shave_V = irregV "shave" "shaved" "shaven" ;
shear_V = irregV "shear" "shore" "shorn" ;
shed_V = irregDuplV "shed" "shed" "shed" ;
shine_V = irregV "shine" "shone" "shone" ;
shoe_V = irregV "shoe" "shoed" (variants {"shoed" ; "shod"}) ;
shoe_V = irregV "shoe" "shoed" "shod" ;
shoot_V = irregV "shoot" "shot" "shot" ;
show_V = irregV "show" "showed" (variants {"showed" ; "shown"}) ;
show_V = irregV "show" "showed" "shown" ;
shrink_V = irregV "shrink" "shrank" "shrunk" ;
shut_V = irregDuplV "shut" "shut" "shut" ;
sing_V = irregV "sing" "sang" "sung" ;
@@ -131,16 +132,18 @@ flags optimize=values ;
sling_V = irregV "sling" "slung" "slung" ;
slit_V = irregDuplV "slit" "slit" "slit" ;
smite_V = irregV "smite" "smote" "smitten" ;
sow_V = irregV "sow" "sowed" (variants {"sowed" ; "sown"}) ;
sow_V = irregV "sow" "sowed" "sown" ;
speak_V = irregV "speak" "spoke" "spoken" ;
speed_V = irregV "speed" "sped" "sped" ;
spend_V = irregV "spend" "spent" "spent" ;
spill_V = irregV "spill" (variants {"spilled" ; "spilt"}) (variants {"spilled" ; "spilt"}) ;
spill_V = irregV "spill" "spilt" "spilt" ;
spin_V = irregDuplV "spin" "spun" "spun" ;
spit_V = irregDuplV "spit" (variants {"spit" ; "spat"}) "spit" ;
spit_V = irregDuplV "spit" "spit" "spit" ;
spit_spat_V = irregDuplV "spit" "spat" "spit" ;
split_V = irregDuplV "split" "split" "split" ;
spread_V = irregV "spread" "spread" "spread" ;
spring_V = irregV "spring" (variants {"sprang" ; "sprung"}) "sprung" ;
spring_V = irregV "spring" "sprang" "sprung" ;
spring_sprung_V = irregV "spring" "sprung" "sprung" ;
stand_V = irregV "stand" "stood" "stood" ;
steal_V = irregV "steal" "stole" "stolen" ;
stick_V = irregV "stick" "stuck" "stuck" ;
@@ -152,7 +155,7 @@ flags optimize=values ;
strive_V = irregV "strive" "strove" "striven" ;
swear_V = irregV "swear" "swore" "sworn" ;
sweep_V = irregV "sweep" "swept" "swept" ;
swell_V = irregV "swell" "swelled" (variants {"swelled" ; "swollen"}) ;
swell_V = irregV "swell" "swelled" "swollen" ;
swim_V = irregDuplV "swim" "swam" "swum" ;
swing_V = irregV "swing" "swung" "swung" ;
take_V = irregV "take" "took" "taken" ;
@@ -160,7 +163,7 @@ flags optimize=values ;
tear_V = irregV "tear" "tore" "torn" ;
tell_V = irregV "tell" "told" "told" ;
think_V = irregV "think" "thought" "thought" ;
thrive_V = irregV "thrive" (variants {"thrived" ; "throve"}) "thrived" ;
thrive_V = irregV "thrive" "throve" "thrived" ;
throw_V = irregV "throw" "threw" "thrown" ;
thrust_V = irregV "thrust" "thrust" "thrust" ;
tread_V = irregV "tread" "trod" "trodden" ;
@@ -169,7 +172,7 @@ flags optimize=values ;
upset_V = irregDuplV "upset" "upset" "upset" ;
wake_V = irregV "wake" "woke" "woken" ;
wear_V = irregV "wear" "wore" "worn" ;
weave_V = irregV "weave" (variants {"weaved" ; "wove"}) (variants {"weaved" ; "woven"}) ;
weave_V = irregV "weave" "wove" "woven" ;
wed_V = irregDuplV "wed" "wed" "wed" ;
weep_V = irregV "weep" "wept" "wept" ;
wind_V = irregV "wind" "wound" "wound" ;

View File

@@ -9,6 +9,7 @@ fun
beset_V : V ;
bet_V : V ;
bid_V : V ;
bid_bade_V : V ;
bind_V : V ;
bite_V : V ;
bleed_V : V ;
@@ -133,9 +134,11 @@ fun
spill_V : V ;
spin_V : V ;
spit_V : V ;
spit_spat_V : V ;
split_V : V ;
spread_V : V ;
spring_V : V ;
spring_sprung_V : V ;
stand_V : V ;
steal_V : V ;
stick_V : V ;

View File

@@ -19885,7 +19885,7 @@ lin moniaalle_Adv = mkAdv {s = c99 "moniaalle"} ;
lin moniaalta_Adv = mkAdv {s = c99 "moniaalta"} ;
lin monialaistua_V = mkV {s = c52 "monialaistua"} ;
lin monias_N = mkN {s = d41 "monias"} ;
lin moni_ilmeinen_N = mkN {s = d18 "moni-ilmeinen"} ;
lin moni_ilmeinen_N = mkN {s = d38 "moni-ilmeinen"} ;
lin monijumalaisuus_N = mkN {s = d40 "monijumalaisuus"} ;
lin monikansainen_N = mkN {s = d38 "monikansainen"} ;
lin monikko_N = mkN {s = d04A "monikko"} ;
@@ -41387,7 +41387,7 @@ lin ykkönen_N = mkN {s = d38 "ykkönen"} ;
lin yks_Adv = mkAdv {s = c99 "yks"} ;
lin ykseys_N = mkN {s = d40 "ykseys"} ;
lin yksi_N = mkN {s = d31 "yksi"} ;
lin yksi_ilmeinen_N = mkN {s = d18 "yksi-ilmeinen"} ;
lin yksi_ilmeinen_N = mkN {s = d38 "yksi-ilmeinen"} ;
lin yksiin_Adv = mkAdv {s = c99 "yksiin"} ;
lin yksijumalaisuus_N = mkN {s = d40 "yksijumalaisuus"} ;
lin yksikkö_N = mkN {s = d04A "yksikkö"} ;

View File

@@ -159,6 +159,8 @@ oper
s = \\c => (StemFin.snoun2nounBind veri).s ! NCase n Gen + paine.s ! c
} ;
foreignN : Str -> N ; -- foreign word without Finnish alternations, e.g. sake/saken/sakeja
-- Nouns used as functions need a case, of which the default is
-- the genitive.
@@ -667,7 +669,8 @@ mkVS = overload {
mkPN_1 : Str -> PN = \s -> lin PN (snoun2spn (mk1N s)) ;
foreignPN : Str -> PN = \s -> (lin PN (snoun2spn (nforms2snoun (noun s)))) where {
foreignPN : Str -> PN = \s -> mkPN (foreignN s) ;
foreignN : Str -> N = \s -> (lin N (nforms2snoun (noun s))) where {
noun : Str -> NForms = \s -> case s of {
_ + "i" => dPaatti s (s + "n") ;
_ + "e" => dNukke s (s + "n") ;

View File

@@ -11,6 +11,10 @@ concrete IdiomFre of Idiom = CatFre **
ExistNP np =
mkClause "il" True False (agrP3 Masc Sg)
(insertClit3 "y" (insertComplement (\\_ => (np.s ! Acc).ton) (predV avoir_V))) ;
ExistNPAdv np adv =
mkClause "il" True False (agrP3 Masc Sg)
(insertAdv adv.s (insertClit3 "y" (insertComplement (\\_ => (np.s ! Acc).ton) (predV avoir_V)))) ;
ExistIP ip = {
s = \\t,a,p,_ =>

View File

@@ -144,10 +144,10 @@ concrete VerbGer of Verb = CatGer ** open Prelude, ResGer, Coordination in {
CompAdv a = {s = \\_ => a.s ; ext = []} ;
CompCN cn = {s = \\a => case numberAgr a of {
Sg => "ein" + pronEnding ! GSg cn.g ! Nom ++ cn.s ! Strong ! Sg ! Nom ;
Pl => cn.s ! Strong ! Pl ! Nom
Sg => "ein" + pronEnding ! GSg cn.g ! Nom ++ cn.s ! Strong ! Sg ! Nom ++ cn.rc ! Sg ; ---
Pl => cn.s ! Strong ! Pl ! Nom ++ cn.rc ! Pl ---
} ;
ext = []
ext = cn.adv ++ cn.ext
} ;
AdvVP vp adv = insertAdv adv.s vp ;

View File

@@ -23,6 +23,14 @@ concrete IdiomIta of Idiom = CatIta **
(insertClit3 (elision "ci" "c'" "ci")
(insertComplement (\\_ => (np.s ! Nom).ton)
(predV copula))) ;
ExistNPAdv np adv =
let npa = complAgr np.a in
mkClause [] True False (agrP3 npa.g npa.n)
(insertAdv adv.s
(insertClit3 (elision "ci" "c'" "ci")
(insertComplement (\\_ => (np.s ! Nom).ton)
(predV copula)))) ;
ExistIP ip = {
s = \\t,a,p,_ =>

View File

@@ -5,7 +5,9 @@ import PGF
import qualified Data.Map as M
import Data.Char
import Data.List
import Safe
import System.Environment (getArgs)
import Debug.Trace
-- AR 2020-02-28
@@ -17,86 +19,164 @@ import System.Environment (getArgs)
-- - functionname = baseform_category, with exceptions
-- - variant inflection tables: lie_1_V, lie_2_V
-- - words that have non-ident characters: 'bird\'s-eye_A'
-- - words that start with non-letters: W_'tween_Adv
-- example:
-- gf -make ../english/DictEng.gf
-- runghc MkMorphodict.hs DictEngAbs.pgf MorphoDictEng
-- runghc MkMorphodict.hs pgf MorphoDictEng.config DictEngAbs.pgf MorphoDictEng
-- 64923 -> 56599 functions
usage = "MkMorphodict <pgf> <outfile>"
usage = "runghc MkMorphodict (raw|pgf) <configfile> <datafile> <outfile>"
main = do
pgfile:outfile:_ <- getArgs
pgf <- readPGF pgfile
config <- readFile (outfile ++ ".config") >>= return . mkConfig
let (absrules,cncrules) = mkMorphoDict (MDEnv pgf config (head (languages pgf)))
xx <- getArgs
if length xx /= 4
then do
putStrLn "Usage:"
putStrLn usage
putStrLn $ "Got instead: " ++ show xx
else do
let mode:configfile:datafile:outfile:_ = xx
config <- readFile configfile >>= return . mkConfig
rawdata <- case mode of
"pgf" -> pgfFile2rawData config datafile
"raw" -> readFile datafile >>= return . map getRawData . filter (not . null) . lines
_ -> error $ "Expected mode (pgf|raw), got " ++ mode
rawdata2gf config rawdata outfile
rawdata2gf config rawdata outfile = do
let env = MDEnv rawdata config
let (absrules,cncrules) = mkMorphoDict env
absheader <- readFile (outfile ++ "Abs.header")
cncheader <- readFile (outfile ++ ".header")
writeFile (outfile ++ "Abs.gf") absheader
appendFile (outfile ++ "Abs.gf") $ unlines absrules
appendFile (outfile ++ "Abs.gf") $ unlines $ sort absrules
appendFile (outfile ++ "Abs.gf") "}"
writeFile (outfile ++ ".gf") cncheader
appendFile (outfile ++ ".gf") $ unlines cncrules
appendFile (outfile ++ ".gf") $ unlines $ sort cncrules
appendFile (outfile ++ ".gf") "}"
-- one way to get raw data from a dictionary pgf
-- another way, more controllod, is to write a wrapper grammar with a function, for each category, to generate a RawData entry
pgfFile2rawData config pgffile = do
pgf <- readPGF pgffile
type Cat = CId
let cats = nub [c | (c,(_,_,_)) <- M.assocs config]
let lang:_ = languages pgf
return [
(cat, map snd lin) |
cat <- cats,
f <- functionsByCat pgf (mkCId cat),
lin <- tabularLinearizes pgf lang (mkApp f [])
]
type Cat = String
type Fun = String
type Oper = String
type Config = M.Map Cat (Cat,Oper,[Int])
type Config = M.Map Cat (Cat,Oper,([Int],[Int])) -- lin word_Cat = Oper str_i1 str_i2 ... str_in featj1 ... featjn ;
data MDEnv = MDEnv {
pgf :: PGF,
config :: Config,
lang :: Language
rawdata :: [RawData],
config :: Config
}
mkConfig :: String -> Config
mkConfig :: String -> Config -- N : N mkN 0 2 4 6 # 9
mkConfig ls = M.fromList [(c,i) | Left (c,i) <- map mkOne (lines ls)]
where
mkOne s = case words s of
"--":_ -> Right s
cat:":":tcat:oper:ints -> Left (mkCId cat,(mkCId tcat,oper,map read ints))
"--":_ -> Right s
cat:":":tcat:oper:ints -> Left (cat,(tcat,oper,mkArgs ints))
_ -> Right s
mkArgs ints = case break (=="#") ints of
(ss,[]) -> (map read' ss, [])
(ss,_:fs) -> (map read' ss, map read' fs)
read' a = readNote [] a -- Safe.readNote provides better error message
mkMorphoDict :: MDEnv -> ([String],[String])
getRawData s = case words s of
c:cs -> (c,cs)
type RawData = (String,[String]) -- old cat name, forms and features
type RawRule = (([String],Cat), (Oper, ([String],[String]))) -- parts of fun name, new category, oper, arguments
type RuleData = ((Fun, Cat), (Oper, ([String],[String]))) -- final fun name, cat, oper, args
mkMorphoDict :: MDEnv -> ([String],[String]) -- fun rules, lin rules
mkMorphoDict env =
unzip $
map splitRule $
findCompounds $
---- findCompounds $ -- let us not care about compounds for the time being, but include them if they are given
nameFunctions $
mergeRules $
concatMap findRules cats
findRules $
rawdata env
where
splitRule (fun,(cat,lin)) = (unwords ["fun",fun,":",showCId cat,";"], unwords ["lin",fun,"=", unwords lin,";"])
splitRule ((fun,cat),(oper,(forms,feats))) =
(unwords ["fun",fun,":",cat,";"], unwords ["lin",fun,"=", unwords (oper : map quote forms ++ feats),";"])
cats = nub [c | (c,(_,_,_)) <- M.assocs (config env)]
findRules cat = [
([snd (lin !! head ints), showCId c], (c, op : appSig ints (map snd lin))) | --- head ints is the base form in smart paradigms
f <- functionsByCat (pgf env) cat,
lin <- tabularLinearizes (pgf env) (lang env) (mkApp f []), -- [[(String, String)]]
Just (c,op,ints) <- [M.lookup cat (config env)]
]
findRules :: [RawData] -> [RawRule]
findRules raws = [
(([lemma],newcat),(oper, appSig sig args)) |
(oldcat,args) <- raws,
Just (newcat, oper, sig) <- [M.lookup oldcat (config env)],
let lemma = args `at` head (fst sig)
]
appSig ints forms = [forms !! i | i <- ints]
appSig (ints,feats) args =
-- If there's wrong number in config file, uncomment the line below to see which number it should be
-- trace (intercalate "\n" $ map show (zip [0..] args)) $
([args `at` i | i <- ints], [args `at` i | i <- feats])
mergeRules :: [RawRule] -> [RawRule]
mergeRules = map head . groupBy (\x y -> snd x == snd y) . sortOn snd
nameFunctions :: [RawRule] -> [RuleData]
nameFunctions = expandNames . sortOn fst
expandNames :: [RawRule] -> [RuleData]
expandNames fls = case fls of
(f,l):fls2 -> case span ((==f) . fst) fls2 of
([],_) -> (mkFun f,l) : expandNames fls2
(f@(w,c),l) : fls2 -> case span ((==f) . fst) fls2 of
([],_) -> ((mkFun (w ++ [c]),c),l) : expandNames fls2
(fls1,fls3) -> renames ((f,l):fls1) ++ expandNames fls3
_ -> []
renames fls = [(mkFun (init f ++ [show i,last f]),l) | (i,(f,l)) <- zip [1..] fls]
renames :: [RawRule] -> [RuleData]
--- renames fls = [((mkFun (f ++ [show i,c]),c),l) | (i,((f,c),l)) <- zip [1..] fls] -- disambiguate with int
renames fls = [((mkFun (f ++ fs ++ [c]),c),l) | (i,(((f,c),l),fs)) <- zip [1..] (zip fls (minimize fls))] -- disambiguate with different forms
minimize :: [RawRule] -> [[String]]
minimize fls = shrink [ws ++ fs | (_,(_,(_:ws,fs))) <- fls]
shrink fls = case fls of
fl@(_:_):_ | all ((==head fl) . head) fls -> shrink (map tail fls)
fl@(_:_):_ | all ((==last fl) . last) fls -> shrink (map init fls)
_ -> shrinkMore fls
shrinkMore fls = case fls of
_ | length (nub (map init fls)) == length fls -> shrinkMore (map init fls)
_ | length (nub (map tail fls)) == length fls -> shrinkMore (map tail fls)
_ -> fls
-- >>> mkFun ["hello", "world", "hello friends", "hello-all"]
-- "hello_world_hello_friends_hello_all"
mkFun :: [String] -> String -- if word contains space or hyphen, replace with underscore
mkFun = showCId . mkCId . concat . intersperse "_" . concatMap (words . removeHyphen)
where
removeHyphen [] = []
removeHyphen ['-'] = ['-'] -- If hyphen is the last character, it's usually meaningful, leave it
removeHyphen ('-':cs) = ' ' : removeHyphen cs
removeHyphen (c:cs) = c : removeHyphen cs
quote s = "\"" ++ s ++ "\""
{- ---- let us ignore this
findCompounds :: [RuleData] -> [RuleData]
findCompounds = getCompounds . sortOn cat_orthrevforms
cat_orthrevforms (_,(cat,_:forms)) = (cat,[map (!!i) fss | let fss = map reverse forms, i <- [0..minimum (map length fss) - 1]])
@@ -105,8 +185,9 @@ mkMorphoDict env =
revstem = head . snd . cat_revforms
wforms (_,(_,_:forms)) = forms
getCompounds :: [RuleData] -> [RuleData]
getCompounds fls = case fls of
fl : fls1 | length (revstem fl) < 2 -> markWith fl [] : getCompounds fls1
fl : fls1 | length (revstem fl) < 2 -> markWith fl [] : getCompounds fls1
fl : fls2 -> case span (\x -> and [isPrefixOf (reverse w) (reverse w1) | (w,w1) <- zip (wforms fl) (wforms x)]) fls2 of
([],_:_) -> markWith fl [] : getCompounds fls2
(fls1,fls3) -> markWith fl [] : map (markCompound fl) fls1 ++ getCompounds fls3
@@ -117,26 +198,14 @@ mkMorphoDict env =
True -> markWith fl1 [";","--","compound",(fst fl)]
False -> markWith fl1 [";","--","notcompound",(fst fl)]
markWith (f,(c,op:ws)) xs = (f,(c,op : map quote ws ++ xs))
markWith (f,(c,op:ws)) xs = (f,(c,op : map quote ws ++ xs)) ---- TODO only quote string args, not features
isPrefixWord x xy =
length suff > 1 &&
any (\c -> elem c "-0123456789aeiouyåäö") suff &&
isPrefixOf x xy
length suff > 1 && ---- compound first part must be at least two letters long
any (\c -> elem c "-0123456789aeiouyåäö") suff && ---- must contain a vowel or a digit
isPrefixOf x xy ---- and of course be a prefix
where
suff = drop (length x) xy
mkFun = quoteIf . concat . intersperse "_"
quoteIf s = case s of
_ | any (\c -> not (isAlphaNum c || elem c "_'")) s -> "'" ++ unSgQuote s ++ "'"
c:_ | not (isAlpha c) -> "W_" ++ s
_ -> s
where
unSgQuote s = case s of
'\'':cs -> "\\\'" ++ unSgQuote cs
c:cs -> c : unSgQuote cs
_ -> s
-}
quote s = "\"" ++ s ++ "\""

View File

@@ -0,0 +1,6 @@
N : N mkN 0 1 2 4 7 13 14 16 17 19
A : A mkA' 0 1 2 4 7 13 14 16 17 19
V : V mkV 0 17 19 22 43 49 23 25 31 58 94 37
V2 : V mkV 0 17 19 22 43 49 23 25 31 58 94 37
Adv : Adv mkAdv 0
Prep : Prep mkPrep 0

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,13 @@
concrete MorphoDictFin of MorphoDictFinAbs = CatFin ** open
ParadigmsFin,
-- MorphoFin,
Kotus
-- Prelude
in {
-- extracted from http://kaino.kotus.fi/sanat/nykysuomi/, licensed under LGPL
flags coding = utf8 ;
oper mkA' : (x1,_,_,_,_,_,_,_,_,x10 : Str) -> A = \a,b,c,d,e,f,g,h,i,j -> mkA (mkN a b c d e f g h i j) ; -- Need a single worst-case paradigm for how config is implemented

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,4 @@
abstract MorphoDictFinAbs =
Cat [N,A,V,Adv,Prep] **
{

View File

@@ -1,38 +0,0 @@
MkMorphoDict: Extracting a minimal morphological dictionary from an existing GF dictionary.
Aarne Ranta 2020-03-02
principles:
There should be a single source for each lemgram (i.e. inflection table of a word)
Functions names should be easy to guess: baseform_Category (but avoiding accidental errors if this is not a unique key)
Hence,
Functions are 1-to-1 with lemgrams, i.e. inflection tables, thus
- no sense distinctions
- no subcategorizations
- no variants
Functionname = baseform_category, with exceptions
- same baseform_Category, different inflection tables: lie_1_V, lie_2_V
- words that have non-ident characters: 'bird\'s-eye_A'
- words that start with non-letters: W_'tween_Adv
Example run, English:
gf -make ../english/DictEng.gf
runghc MkMorphodict.hs DictEngAbs.pgf MorphoDictEng
Result: 64923 -> 56599 functions, of which 21679 could be compounds
Swedish, using a dump of SALDO (not available in these sources)
cd saldo/
runghc SaldoGF.hs
# combine abs.tmp with Saldo.header to obtain Saldo.gf
# combine cnc.tmp with SaldoSwe.header to obtain SaldoSwe.gf
gf -make SaldoSwe.gf
cd ..
runghc MkMorphodict.hs saldo/Saldo.pgf MorphoDictSwe

171
src/morphodict/README.md Normal file
View File

@@ -0,0 +1,171 @@
# morphodict: purely morphological unilingual dictionaries
Aarne Ranta 2020-03-02 -- 2021-05-27
UNDER CONSTRUCTION, INCOMPLETE AND BUGGY
## The vision
Vision 1: if you need the noun "stjärna" in Swedish, you will find it
as `MorphoDictSwe.stjärna_N`.
Vision 2: if you analyse a Swedish text that contains the word "stjärnornas", it will be returned as `MorphoDictSwe.stjärna_N`.
Vision 3: this will work for all words of Swedish and all other RGL languages. Only seldom will you need `ParadigmsSwe`.
## What is contained
The guiding principle is to provide a single source for each *lemgram* (i.e. linearization records, i.e. inflection table plus inherent features).
Functions names should be easy to guess:
- `baseform_Category`
Baseforms that have many different lemgrams are an exception.
They should be disambiguated by adding the differing forms, as in
- `lie_lay_V` ("lie, lay, lain")
- `lie_lied_V` ("lie, lied lied")
Such distinctions are made in all cases where there are alternative inflections, even if there is no sense distinction:
- `learn_learned_V` ("learn, learned, learned")
- `learn_learnt_V` ("learn, learnt, learnt")
Hence,
- no `variants` should appear in the MorphoDict
- no entries should be duplicated if their lemgrams are the same
- hence, in particular, sense distinctions do not result in different entries
The dictionary will also exclude *multiwords* consisting of several tokens.
Most of the time, even *compounds* written as single tokens should be excluded.
However, as the status of a compound is not always clear, and since they do not create spurious morphological analyses, they can be tolerated, in particular if extracted from legacy sources.
Since multiwords and compounds are excluded, `Paradigms` and `MakeStructural` should for each language provide API functions for easy definitions of them, preferably of the form
```
mkC : Str -> C -> C
```
The situation when this is not enough is when separate functions are needed for gluing and concatenation compounds.
*Open question*: what to do with compound prepositions that are common in e.g. English?
The above principles imply
```
according_to_Prep = mkPrep "according" to_Prep
```
defined *outside* `MorphoDictEng`, so that `mkPrep` comes from `ParadigmsEng` and `to_Prep` from `MorphoDictEng`.
This may sound like against tradition, but follows the general guidelines of morphological dictionaries.
## Relevant categories
In addition to sense distinctions, MorphoDict ignores subcategorizations.
One reason is that, just like senses (although in a lesser degree), they are open-ended and sometimes vague.
Another reason is that different subcategory variants overload morphological analysis.
The most numerous categories to be addressed are content words:
- `A`
- `Adv`
- `Interj`
- `N`
- `PN`
- `Symb`
- `V`
In addition, structural words should appear here with their native lemma names:
- `Conj`
- `Det`
- `IAdv`
- `IDet`
- `IP`
- `NP` (special NP-like "pronouns", such as "somebody")
- `Prep`
- `Pron` (in the RGL only covering personal pronouns)
- `Punct`
- `Subj`
Additional language-specific categories can be included if the reasons are clear.
They must then be importable from the `Paradigms` module for that language, together with `mk` functions.
The `Extend` module may also put them in use in syntax.
Following the model of Universal Tagset, we add a category `X` for unspecified words in `Extend`, with the linearization type `{s : Str}`.
Hence it can only be used for uninflected strings with unclear status.
## Naming
As stated before,
- `functionname` = `baseform_category` if there is a unique lemgram
- = `baseform_number_category` if there is a need to disambiguate
The disambiguation numbering should reflect the frequency or probability of the lemgram, but this is just a recommendation, since the frequency is not always known.
The baseform should be the native alphabet baseform in Unicode letters, which is as such a valid GF identifier.
However, if the word contains characters that are not legal in identifiers, the function name should be simply included in single quotes, rather than inventing transliterations.
If function names are formed by the API function `PGF.mkCId`, these conventions are automatically followed.
## Coding conventions
To enable easy ocular and automatic inspection,
- write one entry per line, each prefixed by `fun` or `lin` keyword
- sort the entries alphabetically
- use paradigms with enough many arguments to make the characteristic forms explicit
To guarantee compatibility with the rest of the RGL and application grammars,
- paradigms used should be imported from `Paradigms` and `MakeStructural` rather than defined in `MorphoDict` itself
- import of *low-level modules* such as `Res` should be avoided
- `MorphoDict` should be self-contained, i.e. not inherit from other modules such as `Structural` or `Irreg`. But it is OK to `open` them in a qualified mode to use when defining linearizations.
## Bootstrapping with `MkMorphoDict`
Example run, English:
```
gf -make ../english/DictEng.gf
runghc MkMorphodict.hs pgf MorphoDictEng.config DictEngAbs.pgf MorphoDictEng
```
Or, if you have raw data from another source, of the format "N woman women", you can do
```
runghc MkMorphodict.hs raw MorphoDictEng.config raw_words_eng.txt MorphoDictEng
```
The script needs a *configuration file* mapping legacy categories and forms lists to parts of GF code:
```
N : N mkN 0 2
A : A mkA 0 2 4 6
V : V mkV 0 4 2
V2 : V mkV 0 4 2
Adv : Adv mkAdv 0
Prep : Prep mkPrep 0
```
In addition, it needs *header files* containing lines to be prefixed to the generated files:
```
concrete MorphoDictEng of MorphoDictEngAbs =
CatEng [N,A,V,Adv,Prep] **
open
ParadigmsEng
in
{
```
```
abstract MorphoDictEngAbs =
Cat [N,A,V,Adv,Prep] **
{
```
For more details, we refer to `MkMorphodict.hs` for the time being.
If the config and header files are sound, the script produces compilable GF files.
They also mostly comply to the guidelines given in this document.
Some things TODO:
- deal with multiwords such as "more regular" generated by Paradigms
- use references to native Irreg files instead of very long smart paradigms
- support increments in addition to overwrites
## Things to do
To support the construction of a `MorphoDict`, the following should be provided in `Paradigms`:
- explicit smart paradigms with characteristic forms and inherent features for each category
- API constants for all inherent features that are needed
- compound-constructing functions for all categories that need them
- the extra categories that one wants to include in that language

View File

@@ -0,0 +1,20 @@
name: morphodict
version: 0.1
homepage: https://github.com/GrammaticalFramework/gf-rgl/tree/master/src/morphodict
author: Aarne Ranta
category: Natural Language Processing
build-type: Simple
extra-source-files: README.md
cabal-version: >=1.10
executable MkMorphoDict
hs-source-dirs:
.
main-is: MkMorphoDict.hs
other-modules:
build-depends:
base,
containers,
safe,
gf
default-language: Haskell2010

16
src/morphodict/stack.yaml Normal file
View File

@@ -0,0 +1,16 @@
resolver: lts-12.26
packages:
- .
# so that `stack build --copy-bins` puts bin here
local-bin-path: .
extra-deps:
- gf-3.10
- cgi-3001.3.0.3 # dependency of gf
flags:
# this excludes PGF2 module in gf package
gf:
c-runtime: false

View File

@@ -0,0 +1,26 @@
# This file was autogenerated by Stack.
# You should not edit this file by hand.
# For more information, please see the documentation at:
# https://docs.haskellstack.org/en/stable/lock_files
packages:
- completed:
hackage: gf-3.10@sha256:6f851dfaab5e1f9d4f3796515b86f78806a2bb305136a902713dfc2b92d9cfb0,8477
pantry-tree:
size: 64924
sha256: 66332577ff42a42eed767f451f53266e1020b72749cdcdf7387933615d5de091
original:
hackage: gf-3.10
- completed:
hackage: cgi-3001.3.0.3@sha256:4f3768d09e4a6620642588cab2e99d83c1b6b542dad6147d0af9532170036115,2076
pantry-tree:
size: 667
sha256: 65f6fd4574cffd1e5e2490c133b7ba58fd2fea0a65d81f1fa6fe14f08025629b
original:
hackage: cgi-3001.3.0.3
snapshots:
- completed:
size: 509471
url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/12/26.yaml
sha256: 95f014df58d0679b1c4a2b7bf2b652b61da8d30de5f571abb0d59015ef678646
original: lts-12.26

View File

@@ -0,0 +1,51 @@
#!/bin/bash
USAGE="usage: ./remove_sense_distinctions.sh <concrete syntax file>"
NOTE="This is not extremely useful, it will just create a file with only those entries that are homonymous in dictionary form, but differ in other forms. The purpose of the file is for you to look at/do small experiments with. The real job is done in MkMorphoDict.hs."
# String manipulation
CONC=$1 # e.g. MorphoDictFin.gf
BAK="$CONC.bak" # e.g. MorphoDictFin.gf.bak
NAME=`echo $CONC | cut -f 1 -d '.'` # e.g. MorphoDictFin
ABS="${NAME}Abs.gf" # e.g. MorphoDictFinAbs.gf
CONC_HEADER="$NAME.header" # e.g. MorphoDictFin.header
ABS_HEADER="${NAME}Abs.header" # e.g. MorphoDictFinAbs.header
find_duplicates() {
echo "Putting (temporarily) only homonyms in $CONC"
echo "cat $CONC_HEADER > $CONC"
cat $CONC_HEADER > $CONC
DUPLS=`cut -f 2 -d ' ' /tmp/$CONC \
| sort | uniq -c | sort -nr \
| egrep "^ +1?[2-9][0-9]? [a-zåäö]+_" \
| tr -d '[0-9][A-ZÅÄÖ]'`
for d in $DUPLS
do
grep "lin $d" $BAK >> $CONC
done
echo "}" >> $CONC
}
remove_numbers() {
echo "cp $CONC{,.bak}"
cp $CONC{,.bak}
echo "cat $CONC | sed -E 's/_[0-9]_/_/g' | uniq > /tmp/$CONC"
cat $CONC | sed -E 's/_[0-9]_/_/g' | uniq > /tmp/$CONC
echo "Done removing numbers."
}
#### Action starts here
echo $NOTE
if [[ $CONC == *"Abs.gf" ]]
then
echo $USAGE
else
remove_numbers
find_duplicates
# echo "gf -v=0 -make $CONC"
# gf -v=0 -make $CONC
echo "$CONC contains now only homonyms. Original file is found in $BAK."
fi

View File

@@ -7,7 +7,7 @@ concrete ExtendSwe of Extend = CatSwe **
StrandRelSlash, EmptyRelSlash, StrandQuestSlash,
PassVPSlash, PassAgentVPSlash, UttVPShort, ByVP, InOrderToVP,
MkVPI, BaseVPI, ConsVPI, ConjVPI, ComplVPIVV,
MkVPS, BaseVPS, ConsVPS, ConjVPS, PredVPS,
MkVPS, BaseVPS, ConsVPS, ConjVPS, PredVPS, RelVPS,
MkVPS2, ConjVPS2, ComplVPS2, ReflVPS2, MkVPI2, ConjVPI2, ComplVPI2,
ICompAP,ProDrop,EmbedSSlash,
AdAdV, PositAdVAdj, GerundCN, GerundNP, GerundAdv, PresPartAP, PastPartAP, PastPartAgentAP,
@@ -94,22 +94,6 @@ in {
{c2 = a2.c2} ; -- has the right c2
N2VPSlash n2 =
let vp : CatSwe.VP = UseComp (CompCN (UseN2 n2)) ;
dummyVPS : VPSlash = SlashV2a (P.mkV2 "dummy") ;
in dummyVPS ** -- has necessary fields for VPSlash
vp ** -- has all the right fields except for c2
{c2 = n2.c2} ; -- has the right c2
A2VPSlash a2 =
let vp : CatSwe.VP = UseComp (CompAP (UseA2 a2)) ;
dummyVPS : VPSlash = SlashV2a (P.mkV2 "dummy") ;
in dummyVPS ** -- has necessary fields for VPSlash
vp ** -- has all the right fields except for c2
{c2 = a2.c2} ; -- has the right c2
lin UttVPShort vp = {s = infVP vp (agrP3 Utr Sg)} ;
lincat
@@ -148,6 +132,16 @@ in {
}
} ;
RelVPS rp vps = {
s = \\ag,rcase =>
let agr = case rp.a of { -- RP's agr may override in the regular RelativeScand, is this true with VPS too?
RNoAg => ag ;
RAg g n p => {g = g ; n = n ; p = p}
} ;
in rp.s ! ag.g ! ag.n ! rcase ++ vps.s ! Sub ! agr ;
c = NPNom
} ;
MkVPS t p vp = {
s = \\o,a =>
let

View File

@@ -44,6 +44,7 @@ oper
utrum : Gender ; -- the "en" gender
neutrum : Gender ; -- the "ett" gender
neuter : Gender ; -- synonym of neutrum
-- To abstract over number names, we define the following.
@@ -151,6 +152,7 @@ oper
mkPN : (jesus,jesu : Str) -> Gender -> PN -- irregular genitive
} ;
geoPN : Str -> PN ; -- neuter, with identical genitive if ends in a vowel
--2 Adjectives
@@ -348,6 +350,7 @@ oper
Case = CommonScand.Case ;
utrum = Utr ;
neutrum = Neutr ;
neuter = Neutr ;
singular = Sg ;
plural = Pl ;
nominative = Nom ;
@@ -525,6 +528,14 @@ oper
{s = table {Nom => jesus ; Gen => jesu} ; g = g ; lock_PN = <>} ;
} ;
geoPN name =
let names : Str = case name of {
_ + ("a"|"e"|"i"|"o"|"u"|"y"|"å"|"ä"|"ö") => name ;
_ + "s" => name ;
_ => name + "s"
} in
mkPN name names neutrum ;
regPN n = regGenPN n utrum ;
regGenPN n g = {s = \\c => mkCase c n ; g = g} ** {lock_PN = <>} ;
nounPN n = {s = n.s ! singular ! Indef ; g = n.g ; lock_PN = <>} ;