steps towards an NLG language

This commit is contained in:
Krasimir Angelov
2024-04-07 10:39:43 +02:00
parent 81717e7822
commit f637abe92e
6 changed files with 162 additions and 141 deletions

View File

@@ -395,12 +395,12 @@ data Term =
| FV [Term] -- ^ alternatives in free variation: @variants { s ; ... }@
| Markup Ident [(Ident,Term)] [Term]
| Alts Term [(Term, Term)] -- ^ alternatives by prefix: @pre {t ; s\/c ; ...}@
| Strs [Term] -- ^ conditioning prefix strings: @strs {s ; ...}@
| TSymCat Int LIndex [(LIndex,(Ident,Type))]
| TSymVar Int Int
| OpenTag Ident [Assign] -- used internally in the parser
| CloseTag Ident -- used internally in the parser
deriving (Show, Eq, Ord)
-- | Patterns

View File

@@ -2,8 +2,8 @@
{
{-# LANGUAGE CPP #-}
module GF.Grammar.Lexer
( Token(..), Posn(..)
, P, runP, runPartial, token, lexer, getPosn, failLoc
( Lang(..), Token(..), Posn(..)
, P, runP, runLangP, runPartial, token, lexer, getPosn, failLoc
, isReservedWord
) where
@@ -17,7 +17,7 @@ import qualified Data.ByteString.Internal as BS(w2c)
import qualified Data.ByteString.UTF8 as UTF8
import qualified Data.Map as Map
import Data.Word(Word8)
import Data.Char(readLitChar,isSpace)
import Data.Char(readLitChar,isSpace,isAlphaNum)
import Data.Maybe(isJust)
import qualified Control.Monad.Fail as Fail
}
@@ -31,7 +31,10 @@ $i = [$l $d _ '] -- identifier character
$u = [.\n] -- universal: any character
@rsyms = -- symbols and non-identifier-like reserved words
\; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/ | \: \= | \: \: \=
\; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/ | \: \= | \: \: \=
@ident =
(\_ | $l)($l | $d | \_ | \')*
:-
"--" [.]* ; -- Toss single line comments
@@ -39,11 +42,39 @@ $u = [.\n] -- universal: any character
$white+ ;
@rsyms { tok ident }
\< { \_ _ s -> if start_of_tag (BS.tail s)
then T_less_tag
else T_less }
\< $white* @ident $white* (\/ | \>)
{ \lang inp@(AI pos s) inp' _ ->
let inp0 = AI (alexMove pos '<') (BS.tail s)
in case lang of {
NLG -> case getTag inp0 of {
Just (tag,inp') -> POk (inp,inp') (T_open_tag tag) ;
Nothing -> PFailed pos "matching the html tag failed"
} ;
_ -> POk (inp,inp0) T_less
} }
\< $white* @ident $white+ @ident $white* \=
{ \lang inp@(AI pos s) inp' _ ->
let inp0 = AI (alexMove pos '<') (BS.tail s)
in case lang of {
NLG -> case getTag inp0 of {
Just (tag,inp') -> if tag == identS "let"
then POk (inp,inp0) T_less
else POk (inp,inp') (T_open_tag tag) ;
Nothing -> PFailed pos "matching the html tag failed"
} ;
_ -> POk (inp,inp0) T_less
} }
\< \/ $white* @ident { \lang inp@(AI pos s) inp' _ ->
case lang of {
NLG -> case getTag (AI (alexMove (alexMove pos '<') '/') (BS.drop 2 s)) of {
Just (tag,inp') -> POk (inp,inp') (T_close_tag tag) ;
Nothing -> PFailed pos "matching the html tag failed"
} ;
_ -> let inp0 = AI (alexMove pos '<') (BS.tail s)
in POk (inp,inp0) T_less
} }
\' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (T_Ident . identS . unescapeInitTail . unpack) }
(\_ | $l)($l | $d | \_ | \')* { tok ident }
@ident { tok ident }
\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t | $d+)))* \" { tok (T_String . unescapeInitTail . unpack) }
@@ -53,11 +84,10 @@ $white+ ;
{
unpack = UTF8.toString
--unpack = id
ident = res T_Ident . identC . rawIdentC
tok f p len s = f (UTF8.take len s)
tok f _ inp@(AI _ s) inp' len = POk (inp,inp') (f (UTF8.take len s))
data Token
= T_exclmark
@@ -78,7 +108,6 @@ data Token
| T_colon
| T_semicolon
| T_less
| T_less_tag
| T_equal
| T_big_rarrow
| T_great
@@ -138,6 +167,8 @@ data Token
| T_Integer Integer -- integer literals
| T_Double Double -- double precision float literals
| T_Ident Ident
| T_open_tag Ident
| T_close_tag Ident
| T_EOF
deriving Show -- debug
@@ -171,6 +202,7 @@ resWords = Map.fromList
, b ";" T_semicolon
, b "=" T_equal
, b "=>" T_big_rarrow
, b "<" T_less
, b ">" T_great
, b "?" T_questmark
, b "[" T_obrack
@@ -251,20 +283,14 @@ alexMove (Pn l c) '\n' = Pn (l+1) 1
alexMove (Pn l c) _ = Pn l (c+1)
alexGetByte :: AlexInput -> Maybe (Word8,AlexInput)
alexGetByte (AI p _ s) =
alexGetByte (AI p s) =
case WBS.uncons s of
Nothing -> Nothing
Just (w,s) ->
let p' = alexMove p c
c = BS.w2c w
in p' `seq` Just (w, (AI p' c s))
{-
-- Not used by this lexer:
alexInputPrevChar :: AlexInput -> Char
alexInputPrevChar (AI p c s) = c
-}
let p' = alexMove p (BS.w2c w)
in p' `seq` Just (w, (AI p' s))
data AlexInput = AI {-# UNPACK #-} !Posn -- current position,
{-# UNPACK #-} !Char -- previous char
{-# UNPACK #-} !BS.ByteString -- current input string
type AlexInput2 = (AlexInput,AlexInput)
@@ -274,7 +300,9 @@ data ParseResult a
| PFailed Posn -- The position of the error
String -- The error message
newtype P a = P { unP :: AlexInput2 -> ParseResult a }
data Lang = GF | BNFC | NLG
newtype P a = P { unP :: Lang -> AlexInput2 -> ParseResult a }
instance Functor P where
fmap = liftA
@@ -284,10 +312,10 @@ instance Applicative P where
(<*>) = ap
instance Monad P where
return a = a `seq` (P $ \s -> POk s a)
(P m) >>= k = P $ \ s -> case m s of
POk s a -> unP (k a) s
PFailed posn err -> PFailed posn err
return a = a `seq` (P $ \_ s -> POk s a)
(P m) >>= k = P $ \l s -> case m l s of
POk s a -> unP (k a) l s
PFailed posn err -> PFailed posn err
#if !(MIN_VERSION_base(4,13,0))
-- Monad(fail) will be removed in GHC 8.8+
@@ -295,24 +323,27 @@ instance Monad P where
#endif
instance Fail.MonadFail P where
fail msg = P $ \(_,AI posn _ _) -> PFailed posn msg
fail msg = P $ \_ (_,AI posn _) -> PFailed posn msg
runP :: P a -> BS.ByteString -> Either (Posn,String) a
runP p bs = snd <$> runP' p (Pn 1 0,bs)
runP p bs = snd <$> runP' GF p (Pn 1 0,bs)
runPartial p s = conv <$> runP' p (Pn 1 0,UTF8.fromString s)
runLangP :: Lang -> P a -> BS.ByteString -> Either (Posn,String) a
runLangP lang p bs = snd <$> runP' lang p (Pn 1 0,bs)
runPartial p s = conv <$> runP' GF p (Pn 1 0,UTF8.fromString s)
where conv ((pos,rest),x) = (UTF8.toString rest,x)
runP' (P f) (pos,txt) =
case f (dup (AI pos ' ' txt)) of
POk (AI pos _ rest,_) x -> Right ((pos,rest),x)
runP' lang (P f) (pos,txt) =
case f lang (dup (AI pos txt)) of
POk (AI pos rest,_) x -> Right ((pos,rest),x)
PFailed pos msg -> Left (pos,msg)
dup x = (x,x)
failLoc :: Posn -> String -> P a
failLoc pos msg = P $ \_ -> PFailed pos msg
failLoc pos msg = P $ \_ _ -> PFailed pos msg
lexer :: (Token -> P a) -> P a
lexer cont = cont=<<token
@@ -320,58 +351,29 @@ lexer cont = cont=<<token
token :: P Token
token = P go
where
go ai2@(_,inp@(AI pos _ str)) =
go lang (_,inp) =
case alexScan inp 0 of
AlexEOF -> POk (inp,inp) T_EOF
AlexError (AI pos _ _) -> PFailed pos "lexical error"
AlexSkip inp' len -> {-trace (show len) $-} go (inp,inp')
AlexToken inp' len act -> POk (inp,inp') (act pos len str)
AlexError (AI pos _) -> PFailed pos "lexical error"
AlexSkip inp' len -> go lang (inp,inp')
AlexToken inp' len act -> act lang inp inp' len
start_of_tag s = isJust (match s)
getTag inp = space inp
where
match s = do
s <- matchSpace s
(char s '/'
`mplus`
do s <- matchIdent s
s <- matchSpace s
(char s '/'
`mplus`
do s <- matchIdent s
s <- matchSpace s
char s '='))
space inp = do
(w,inp') <- alexGetByte inp
if isSpace (BS.w2c w)
then space inp'
else ident [] inp
matchSpace s
| BS.null s = Just s
| isSpace (BS.head s) = matchSpace (BS.tail s)
| otherwise = Just s
init =
BS.pack "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
rest =
BS.append init (BS.pack "0123456789'")
char s c
| BS.null s = Nothing
| BS.head s == c = Just (BS.tail s)
| otherwise = Nothing
matchIdent s
| BS.null s = Nothing
| BS.elem (BS.head s) init = matchRest s 1 (BS.tail s)
| otherwise = Nothing
matchRest s0 i s
| BS.null s = checkResWord (BS.take i s0) s
| BS.elem (BS.head s) rest = matchRest s0 (i+1) (BS.tail s)
| otherwise = checkResWord (BS.take i s0) s
checkResWord w s =
case Map.lookup (identC (rawIdentC w)) resWords of
Just t -> Nothing
Nothing -> Just s
ident cs inp = do
(w,inp') <- alexGetByte inp
let c = BS.w2c w
if isAlphaNum c || c == '_'
then ident (c:cs) inp'
else return (identS (reverse cs),inp)
getPosn :: P Posn
getPosn = P $ \ai2@(_,inp@(AI pos _ _)) -> POk ai2 pos
getPosn = P $ \_ ai2@(_,inp@(AI pos _)) -> POk ai2 pos
}

View File

@@ -2,13 +2,14 @@
{
{-# OPTIONS -fno-warn-overlapping-patterns #-}
module GF.Grammar.Parser
( P, runP, runPartial
( P, runP, Lang(..), runLangP, runPartial, Posn(..)
, pModDef
, pModHeader
, pTerm
, pTopDef
, pBNFCRules
, pEBNFRules
, pNLG
) where
import GF.Infra.Ident
@@ -33,6 +34,7 @@ import qualified Data.Map as Map
%partial pTerm Exp
%name pBNFCRules ListCFRule
%name pEBNFRules ListEBNFRule
%name pNLG NLG
-- no lexer declaration
%monad { P } { >>= } { return }
@@ -59,7 +61,6 @@ import qualified Data.Map as Map
':' { T_colon }
';' { T_semicolon }
'<' { T_less }
'<tag' { T_less_tag }
'=' { T_equal }
'=>' { T_big_rarrow}
'>' { T_great }
@@ -118,6 +119,8 @@ Integer { (T_Integer $$) }
Double { (T_Double $$) }
String { (T_String $$) }
Ident { (T_Ident $$) }
'<tag' { (T_open_tag $$) }
'</tag' { (T_close_tag $$) }
%%
@@ -271,10 +274,10 @@ ParamDef
OperDef :: { [(Ident,Info)] }
OperDef
: Posn LhsNames ':' Exp Posn { [(i, info) | i <- $2, info <- mkOverload (Just (mkL $1 $5 $4)) Nothing ] }
| Posn LhsNames '=' Exp Posn { [(i, info) | i <- $2, info <- mkOverload Nothing (Just (mkL $1 $5 $4))] }
| Posn LhsName ListArg '=' Exp Posn { [(i, info) | i <- [$2], info <- mkOverload Nothing (Just (mkL $1 $6 (mkAbs $3 $5)))] }
| Posn LhsNames ':' Exp '=' Exp Posn { [(i, info) | i <- $2, info <- mkOverload (Just (mkL $1 $7 $4)) (Just (mkL $1 $7 $6))] }
: Posn LhsNames ':' Exp ';' Posn { [(i, info) | i <- $2, info <- mkOverload (Just (mkL $1 $6 $4)) Nothing ] }
| Posn LhsNames '=' Markup Posn { [(i, info) | i <- $2, info <- mkOverload Nothing (Just (mkL $1 $5 $4))] }
| Posn LhsName ListArg '=' Markup Posn { [(i, info) | i <- [$2], info <- mkOverload Nothing (Just (mkL $1 $6 (mkAbs $3 $5)))] }
| Posn LhsNames ':' Exp '=' Markup Posn { [(i, info) | i <- $2, info <- mkOverload (Just (mkL $1 $7 $4)) (Just (mkL $1 $7 $6))] }
LinDef :: { [(Ident,Info)] }
LinDef
@@ -315,8 +318,8 @@ ListDefDef
ListOperDef :: { [(Ident,Info)] }
ListOperDef
: OperDef ';' { $1 }
| OperDef ';' ListOperDef { $1 ++ $3 }
: OperDef { $1 }
| OperDef ListOperDef { $1 ++ $2 }
ListCatDef :: { [(Ident,Info)] }
ListCatDef
@@ -383,11 +386,17 @@ LocDef
| ListIdent '=' Exp { [(lab,Nothing,Just $3) | lab <- $1] }
| ListIdent ':' Exp '=' Exp { [(lab,Just $3,Just $5) | lab <- $1] }
LocMarkupDef :: { [(Ident, Maybe Type, Maybe Term)] }
LocMarkupDef
: ListIdent '=' Tag { [(lab,Nothing,Just $3) | lab <- $1] }
| ListIdent ':' Exp '=' Tag { [(lab,Just $3,Just $5) | lab <- $1] }
ListLocDef :: { [(Ident, Maybe Type, Maybe Term)] }
ListLocDef
: {- empty -} { [] }
| LocDef { $1 }
| LocDef ';' ListLocDef { $1 ++ $3 }
: {- empty -} { [] }
| LocDef { $1 }
| LocMarkupDef ListLocDef { $1 ++ $2 }
| LocDef ';' ListLocDef { $1 ++ $3 }
Exp :: { Term }
Exp
@@ -402,6 +411,9 @@ Exp
| 'let' ListLocDef 'in' Exp {%
do defs <- mapM tryLoc $2
return $ mkLet defs $4 }
| 'let' ListLocDef 'in' Tag {%
do defs <- mapM tryLoc $2
return $ mkLet defs $4 }
| Exp3 'where' '{' ListLocDef '}' {%
do defs <- mapM tryLoc $4
return $ mkLet defs $1 }
@@ -432,9 +444,7 @@ Exp3
Exp4 :: { Term }
Exp4
: Exp4 Exp5 {% case $2 of
CloseTag id -> mkTag id $1 Empty
_ -> return (App $1 $2) }
: Exp4 Exp5 { App $1 $2 }
| Exp4 '{' Exp '}' { App $1 (ImplArg $3) }
| 'case' Exp 'of' '{' ListCase '}' { let annot = case $2 of
Typed _ t -> TTyped t
@@ -471,18 +481,6 @@ Exp6
| '<' ListTupleComp '>' { R (tuple2record $2) }
| '<' Exp ':' Exp '>' { Typed $2 $4 }
| '(' Exp ')' { $2 }
| '<tag' Ident Attributes '>' { OpenTag $2 $3 }
| '<tag' '/' Ident '>' { CloseTag $3 }
| '<tag' Ident Attributes '/' '>' { markup $2 $3 Empty }
Attributes :: { [Assign] }
Attributes
: { [] }
| Attribute Attributes { $1:$2 }
Attribute :: { Assign }
Attribute
: Ident '=' Exp6 { assign (ident2label $1) $3 }
ListExp :: { [Term] }
ListExp
@@ -703,6 +701,42 @@ ERHS3 :: { ERHS }
| Ident { ENonTerm (showIdent $1,[]) }
| '(' ERHS0 ')' { $2 }
NLG :: { Map.Map Ident Info }
: ListNLGDef { Map.fromList $1 }
| Posn Tag Posn { Map.singleton (identS "main") (ResOper Nothing (Just (mkL $1 $3 (Abs Explicit (identS "qid") $2)))) }
| Posn Exp Posn { Map.singleton (identS "main") (ResOper Nothing (Just (mkL $1 $3 (Abs Explicit (identS "qid") $2)))) }
ListNLGDef :: { [(Ident,Info)] }
ListNLGDef
: {- empty -} { [] }
| 'oper' OperDef ListNLGDef { $2 ++ $3 }
Markup :: { Term }
Markup
: Tag { $1 }
| Exp ';' { $1 }
Tag :: { Term }
Tag
: '<tag' Attributes '>' ListMarkup '</tag' '>' {% if $1 == $5
then return (Markup $1 $2 $4)
else fail ("Unmatched closing tag " ++ showIdent $1) }
| '<tag' Attributes '/' '>' { Markup $1 $2 [] }
ListMarkup :: { [Term] }
: { [] }
| Exp { [$1] }
| Markup ListMarkup { $1 : $2 }
Attributes :: { [(Ident,Term)] }
Attributes
: { [] }
| Attribute Attributes { $1:$2 }
Attribute :: { (Ident,Term) }
Attribute
: Ident '=' Exp6 { ($1,$3) }
ModuleName :: { ModuleName }
: Ident { MN $1 }
@@ -828,35 +862,4 @@ mkAlts cs = case cs of
mkL :: Posn -> Posn -> x -> L x
mkL (Pn l1 _) (Pn l2 _) x = L (Local l1 l2) x
mkTag ident (App t1 t2) conc =
case match ident t2 of
Just attrs -> fmap (App t1) (mkTag ident t2 conc)
Nothing -> let t = App (Q (cPredef, (identS "linearize"))) t2
in mkTag ident t1 $ case conc of
Empty -> t
_ -> C t conc
mkTag ident t conc =
case match ident t of
Just attrs -> return (markup ident attrs conc)
Nothing -> fail ("Unmatched closing tag " ++ showIdent ident)
match ident (OpenTag ident' attrs)
| ident == ident' = Just attrs
match ident (R [(lbl,(Nothing,Vr ident'))])
| lbl == ident2label (identS "p1") && ident == ident'
= Just []
match ident _ = Nothing
markup ident attrs content =
App
(App
(App
(Q (cPredef, (identS "markup")))
(R attrs)
)
(K (showIdent ident))
)
content
}

View File

@@ -33,6 +33,7 @@ cSOFT_BIND = identS "SOFT_BIND"
cSOFT_SPACE = identS "SOFT_SPACE"
cCAPIT = identS "CAPIT"
cALL_CAPIT = identS "ALL_CAPIT"
cHtml = identS "Html"
isPredefCat :: Ident -> Bool
isPredefCat c = elem c [cInt,cString,cFloat]

View File

@@ -24,7 +24,7 @@ module GF.Grammar.Printer
) where
import Prelude hiding ((<>)) -- GHC 8.4.1 clash with Text.PrettyPrint
import PGF2(Literal(..))
import PGF2(Literal(..),pgfFilePath)
import PGF2.Transactions(SeqId)
import GF.Infra.Ident
import GF.Infra.Option
@@ -84,6 +84,8 @@ ppModule q (mn, ModInfo mtype mstat opts exts with opens _ _ mseqs jments) =
ppExtends (id,MIExcept incs) = id <+> '-' <+> brackets (commaPunct pp incs)
ppWith (id,ext,opens) = ppExtends (id,ext) <+> "with" <+> commaPunct ppInstSpec opens
ppModule q (mn, ModPGF pgf) =
"pgf" <+> mn <+> '=' <+> show (pgfFilePath pgf)
ppOptions opts =
"flags" $$
@@ -249,6 +251,11 @@ ppTerm q d (Typed e t) = '<' <> ppTerm q 0 e <+> ':' <+> ppTerm q 0 t <> '>'
ppTerm q d (ImplArg e) = braces (ppTerm q 0 e)
ppTerm q d (ELincat cat t) = prec d 4 ("lincat" <+> cat <+> ppTerm q 5 t)
ppTerm q d (ELin cat t) = prec d 4 ("lin" <+> cat <+> ppTerm q 5 t)
ppTerm q d (Markup tag attrs children)
| null children = pp "<" <> pp tag <+> hsep (map (ppMarkupAttr q) attrs) <> pp "/>"
| otherwise = pp "<" <> pp tag <+> hsep (map (ppMarkupAttr q) attrs) <> pp ">" $$
nest 3 (ppMarkupChildren q children) $$
pp "</" <> pp tag <> pp ">"
ppTerm q d (TSymCat i r rs) = pp '<' <> pp i <> pp ',' <> ppLinFun (pp.fst) r rs <> pp '>'
ppTerm q d (TSymVar i r) = pp '<' <> pp i <> pp ',' <> pp '$' <> pp r <> pp '>'
@@ -341,6 +348,16 @@ ppAltern q (x,y) = ppTerm q 0 x <+> '/' <+> ppTerm q 0 y
ppParams q ps = fsep (intersperse (pp '|') (map (ppParam q) ps))
ppParam q (id,cxt) = id <+> hsep (map (ppDDecl q) cxt)
ppMarkupAttr q (id,e) =
id <> pp '=' <> ppTerm q 5 e
ppMarkupChildren q [t] = ppTerm q 0 t
ppMarkupChildren q (t:ts) =
(case t of
Markup {} -> ppTerm q 0 t
_ -> ppTerm q 0 t <> ';') $$
ppMarkupChildren q ts
ppSeqId :: SeqId -> Doc
ppSeqId seqid = 'S' <> pp seqid

View File

@@ -82,8 +82,6 @@ library
GF.Infra.Option
GF.Infra.UseIO
GF.Infra.BuildInfo
other-modules:
GF.Support
GF.Text.Pretty
GF.Text.Lexing