mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-04-24 03:52:50 -06:00
a simple clitic analysis command 'ca'
This commit is contained in:
@@ -33,6 +33,7 @@ import GF.Data.ErrM ----
|
|||||||
import GF.Command.Abstract
|
import GF.Command.Abstract
|
||||||
import GF.Command.Messages
|
import GF.Command.Messages
|
||||||
import GF.Text.Lexing
|
import GF.Text.Lexing
|
||||||
|
import GF.Text.Clitics
|
||||||
import GF.Text.Transliterations
|
import GF.Text.Transliterations
|
||||||
import GF.Quiz
|
import GF.Quiz
|
||||||
|
|
||||||
@@ -200,6 +201,38 @@ allCommands env@(pgf, mos) = Map.fromList [
|
|||||||
]
|
]
|
||||||
}),
|
}),
|
||||||
|
|
||||||
|
("ca", emptyCommandInfo {
|
||||||
|
longname = "clitic_analyse",
|
||||||
|
synopsis = "print the analyses of all words into stems and clitics",
|
||||||
|
explanation = unlines [
|
||||||
|
"Analyses all words into all possible combinations of stem + clitics.",
|
||||||
|
"The analysis is returned in the format stem &+ clitic1 &+ clitic2 ...",
|
||||||
|
"which is hence the inverse of 'pt -bind'. The list of clitics is give",
|
||||||
|
"by the flag '-clitics'. The list of stems is given as the list of words",
|
||||||
|
"of the language given by the '-lang' flag."
|
||||||
|
],
|
||||||
|
exec = \opts -> case opts of
|
||||||
|
_ | isOpt "raw" opts ->
|
||||||
|
return . fromString .
|
||||||
|
unlines . map (unwords . map (concat . intersperse "+")) .
|
||||||
|
map (getClitics (isInMorpho (optMorpho opts)) (optClitics opts)) .
|
||||||
|
concatMap words . toStrings
|
||||||
|
_ ->
|
||||||
|
return . fromStrings .
|
||||||
|
getCliticsText (isInMorpho (optMorpho opts)) (optClitics opts) .
|
||||||
|
concatMap words . toStrings,
|
||||||
|
flags = [
|
||||||
|
("clitics","the list of possible clitics (comma-separated, no spaces)"),
|
||||||
|
("lang", "the language of analysis")
|
||||||
|
],
|
||||||
|
options = [
|
||||||
|
("raw", "analyse each word separately (not suitable input for parser)")
|
||||||
|
],
|
||||||
|
examples = [
|
||||||
|
"ca -lang=Fin -clitics=ko,ni \"nukkuuko minun vaimoni\" | p -- to parse Finnish"
|
||||||
|
]
|
||||||
|
}),
|
||||||
|
|
||||||
("cc", emptyCommandInfo {
|
("cc", emptyCommandInfo {
|
||||||
longname = "compute_concrete",
|
longname = "compute_concrete",
|
||||||
syntax = "cc (-all | -table | -unqual)? TERM",
|
syntax = "cc (-all | -table | -unqual)? TERM",
|
||||||
@@ -1113,6 +1146,10 @@ allCommands env@(pgf, mos) = Map.fromList [
|
|||||||
|
|
||||||
optMorpho opts = morpho (error "no morpho") id (head (optLangs opts))
|
optMorpho opts = morpho (error "no morpho") id (head (optLangs opts))
|
||||||
|
|
||||||
|
optClitics opts = case valStrOpts "clitics" "" opts of
|
||||||
|
"" -> []
|
||||||
|
cs -> map reverse $ chunks ',' cs
|
||||||
|
|
||||||
mexp xs = case xs of
|
mexp xs = case xs of
|
||||||
t:_ -> Just t
|
t:_ -> Just t
|
||||||
_ -> Nothing
|
_ -> Nothing
|
||||||
|
|||||||
40
src/compiler/GF/Text/Clitics.hs
Normal file
40
src/compiler/GF/Text/Clitics.hs
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
module GF.Text.Clitics (getClitics,getCliticsText) where
|
||||||
|
|
||||||
|
import Data.List
|
||||||
|
|
||||||
|
-- AR 6/2/2011
|
||||||
|
-- Analyse word as stem+clitic whenever
|
||||||
|
-- (1) clitic is in clitic list
|
||||||
|
-- (2) either
|
||||||
|
-- (a) stem is in Lexicon
|
||||||
|
-- (b) stem can be analysed as stem0+clitic0
|
||||||
|
--
|
||||||
|
-- Examples:
|
||||||
|
-- Italian amarmi = amar+mi
|
||||||
|
-- Finnish autossanikohan = autossa+ni+kohan
|
||||||
|
--
|
||||||
|
-- The analysis gives all results, including the case where the whole word is in Lexicon.
|
||||||
|
--
|
||||||
|
-- The clitics in the list are expected to be reversed.
|
||||||
|
|
||||||
|
getClitics :: (String -> Bool) -> [String] -> String -> [[String]]
|
||||||
|
getClitics isLex rclitics = map (reverse . map reverse) . clits . reverse where
|
||||||
|
clits rword = ifLex rword [rclit:more |
|
||||||
|
rclit <- rclitics, stem <- splits rclit rword, more <- clits stem]
|
||||||
|
splits c = maybe [] return . stripPrefix c
|
||||||
|
|
||||||
|
ifLex w ws = if isLex (reverse w) then [w] : ws else ws
|
||||||
|
|
||||||
|
|
||||||
|
getCliticsText :: (String -> Bool) -> [String] -> [String] -> [String]
|
||||||
|
getCliticsText isLex rclitics =
|
||||||
|
map unwords . sequence . map (map render . getClitics isLex rclitics)
|
||||||
|
where
|
||||||
|
render = unwords . intersperse "&+"
|
||||||
|
|
||||||
|
|
||||||
|
-- example
|
||||||
|
|
||||||
|
getClitics1 = getClitics exlex1 exclits1
|
||||||
|
exlex1 = flip elem ["auto", "naise", "rahan","maa","maahan","maahankaan"]
|
||||||
|
exclits1 = map reverse ["ni","ko","han","pas","nsa","kin","kaan"]
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
module PGF.Morphology(Lemma,Analysis,Morpho,
|
module PGF.Morphology(Lemma,Analysis,Morpho,
|
||||||
buildMorpho,
|
buildMorpho,isInMorpho,
|
||||||
lookupMorpho,fullFormLexicon,
|
lookupMorpho,fullFormLexicon,
|
||||||
morphoMissing,missingWordMsg) where
|
morphoMissing,missingWordMsg) where
|
||||||
|
|
||||||
@@ -42,6 +42,9 @@ collectWords pinfo = Map.fromListWith (++)
|
|||||||
lookupMorpho :: Morpho -> String -> [(Lemma,Analysis)]
|
lookupMorpho :: Morpho -> String -> [(Lemma,Analysis)]
|
||||||
lookupMorpho (Morpho mo) s = maybe [] id $ Map.lookup s mo
|
lookupMorpho (Morpho mo) s = maybe [] id $ Map.lookup s mo
|
||||||
|
|
||||||
|
isInMorpho :: Morpho -> String -> Bool
|
||||||
|
isInMorpho (Morpho mo) s = maybe False (const True) $ Map.lookup s mo
|
||||||
|
|
||||||
fullFormLexicon :: Morpho -> [(String,[(Lemma,Analysis)])]
|
fullFormLexicon :: Morpho -> [(String,[(Lemma,Analysis)])]
|
||||||
fullFormLexicon (Morpho mo) = Map.toList mo
|
fullFormLexicon (Morpho mo) = Map.toList mo
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user