From 1e6e84c5a6a075f463be5a8e92d4ef6a6c512700 Mon Sep 17 00:00:00 2001 From: Matheus Bernardo Date: Fri, 1 Mar 2024 16:48:00 +0100 Subject: [PATCH 1/2] Adding html tags to the parser and lexer --- src/compiler/api/GF/Grammar/Lexer.x | 10 ++++++-- src/compiler/api/GF/Grammar/Macros.hs | 3 +++ src/compiler/api/GF/Grammar/Parser.y | 33 +++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 2 deletions(-) diff --git a/src/compiler/api/GF/Grammar/Lexer.x b/src/compiler/api/GF/Grammar/Lexer.x index b6b024d7b..5303932de 100644 --- a/src/compiler/api/GF/Grammar/Lexer.x +++ b/src/compiler/api/GF/Grammar/Lexer.x @@ -31,7 +31,7 @@ $i = [$l $d _ '] -- identifier character $u = [.\n] -- universal: any character @rsyms = -- symbols and non-identifier-like reserved words - \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/ | \: \= | \: \: \= + \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \<\/ | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/ | \: \= | \: \: \= :- "--" [.]* ; -- Toss single line comments @@ -47,11 +47,14 @@ $white+ ; (\-)? $d+ { tok (T_Integer . read . unpack) } (\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double . read . unpack) } +\< (\_ | $l)($l | $d | \_ | \')* { tok tag } + { unpack = UTF8.toString --unpack = id ident = res T_Ident . identC . rawIdentC +tag = res T_less_tag . identC . rawIdentC . BS.tail tok f p s = f s @@ -74,6 +77,7 @@ data Token | T_colon | T_semicolon | T_less + | T_less_close | T_equal | T_big_rarrow | T_great @@ -129,12 +133,13 @@ data Token | T_terminator | T_separator | T_nonempty + | T_less_tag Ident | T_String String -- string literals | T_Integer Integer -- integer literals | T_Double Double -- double precision float literals | T_Ident Ident | T_EOF --- deriving Show -- debug + deriving Show -- debug res = eitherResIdent eitherResIdent :: (Ident -> Token) -> Ident -> Token @@ -164,6 +169,7 @@ resWords = Map.fromList , b "/" T_alt , b ":" T_colon , b ";" T_semicolon + , b "" T_big_rarrow diff --git a/src/compiler/api/GF/Grammar/Macros.hs b/src/compiler/api/GF/Grammar/Macros.hs index 7d083fbe3..cb097395c 100644 --- a/src/compiler/api/GF/Grammar/Macros.hs +++ b/src/compiler/api/GF/Grammar/Macros.hs @@ -260,6 +260,9 @@ tuple2recordType ts = [(tupleLabel i, t) | (i,t) <- zip [1..] ts] tuple2recordPatt :: [Patt] -> [(Label,Patt)] tuple2recordPatt ts = [(tupleLabel i, t) | (i,t) <- zip [1..] ts] +attrs2record :: [(Ident, Term)] -> [Assign] +attrs2record ts = [assign (ident2label i) ((App (Q (cPredef, (identS "linearize"))) t)) | (i,t) <- ts] + mkCases :: Ident -> Term -> Term mkCases x t = T TRaw [(PV x, t)] diff --git a/src/compiler/api/GF/Grammar/Parser.y b/src/compiler/api/GF/Grammar/Parser.y index 76e3a4c16..cad954d55 100644 --- a/src/compiler/api/GF/Grammar/Parser.y +++ b/src/compiler/api/GF/Grammar/Parser.y @@ -59,6 +59,8 @@ import qualified Data.Map as Map ':' { T_colon } ';' { T_semicolon } '<' { T_less } + '' { T_big_rarrow} '>' { T_great } @@ -468,6 +470,17 @@ Exp6 | '<' ListTupleComp '>' { R (tuple2record $2) } | '<' Exp ':' Exp '>' { Typed $2 $4 } | '(' Exp ')' { $2 } + | '' Exps '' {% mkTag $1 $6 $2 $4 } + | '' {% mkTag $1 $1 $2 [Empty] } + +Attributes :: { [(Ident,Term)] } +Attributes + : { [] } + | Attribute Attributes { $1:$2 } + +Attribute :: { (Ident,Term) } +Attribute + : Ident '=' Exp6 { ($1,$3) } ListExp :: { [Term] } ListExp @@ -813,4 +826,24 @@ mkAlts cs = case cs of mkL :: Posn -> Posn -> x -> L x mkL (Pn l1 _) (Pn l2 _) x = L (Local l1 l2) x + +mkTag ident ident' attrs t + | showIdent ident == showIdent ident' = return (App + (App + (App + (Q (cPredef, (identS "markup"))) + (R (attrs2record attrs)) + ) + (K (showIdent ident)) + ) + (mkConcatenation t) + ) + + | otherwise = fail "Tags don't match" + +mkConcatenation [] = Empty +mkConcatenation [t] = (App (Q (cPredef, (identS "linearize"))) t) +mkConcatenation (t:ts) = C t (mkConcatenation ts) + + } From 52c56feaf2f0785f07b2f52f63ec24491a62b44e Mon Sep 17 00:00:00 2001 From: Krasimir Angelov Date: Wed, 6 Mar 2024 19:27:16 +0100 Subject: [PATCH 2/2] fix the disambiguation of html tags --- src/compiler/api/GF/Grammar/Grammar.hs | 2 + src/compiler/api/GF/Grammar/Lexer.x | 62 ++++++++++++++++++++------ src/compiler/api/GF/Grammar/Macros.hs | 3 -- src/compiler/api/GF/Grammar/Parser.y | 61 +++++++++++++++---------- 4 files changed, 88 insertions(+), 40 deletions(-) diff --git a/src/compiler/api/GF/Grammar/Grammar.hs b/src/compiler/api/GF/Grammar/Grammar.hs index cd992810d..1353462b8 100644 --- a/src/compiler/api/GF/Grammar/Grammar.hs +++ b/src/compiler/api/GF/Grammar/Grammar.hs @@ -399,6 +399,8 @@ data Term = | Strs [Term] -- ^ conditioning prefix strings: @strs {s ; ...}@ | TSymCat Int LIndex [(LIndex,(Ident,Type))] | TSymVar Int Int + | OpenTag Ident [Assign] -- used internally in the parser + | CloseTag Ident -- used internally in the parser deriving (Show, Eq, Ord) -- | Patterns diff --git a/src/compiler/api/GF/Grammar/Lexer.x b/src/compiler/api/GF/Grammar/Lexer.x index 5303932de..98e3be332 100644 --- a/src/compiler/api/GF/Grammar/Lexer.x +++ b/src/compiler/api/GF/Grammar/Lexer.x @@ -8,7 +8,7 @@ module GF.Grammar.Lexer ) where import Control.Applicative -import Control.Monad(ap) +import Control.Monad(ap,mplus) import GF.Infra.Ident --import GF.Data.Operations import qualified Data.ByteString.Char8 as BS @@ -17,8 +17,8 @@ import qualified Data.ByteString.Internal as BS(w2c) import qualified Data.ByteString.UTF8 as UTF8 import qualified Data.Map as Map import Data.Word(Word8) -import Data.Char(readLitChar) ---import Debug.Trace(trace) +import Data.Char(readLitChar,isSpace) +import Data.Maybe(isJust) import qualified Control.Monad.Fail as Fail } @@ -31,7 +31,7 @@ $i = [$l $d _ '] -- identifier character $u = [.\n] -- universal: any character @rsyms = -- symbols and non-identifier-like reserved words - \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \<\/ | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/ | \: \= | \: \: \= + \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/ | \: \= | \: \: \= :- "--" [.]* ; -- Toss single line comments @@ -39,6 +39,9 @@ $u = [.\n] -- universal: any character $white+ ; @rsyms { tok ident } +\< { \_ _ s -> if start_of_tag (BS.tail s) + then T_less_tag + else T_less } \' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (T_Ident . identS . unescapeInitTail . unpack) } (\_ | $l)($l | $d | \_ | \')* { tok ident } @@ -47,16 +50,14 @@ $white+ ; (\-)? $d+ { tok (T_Integer . read . unpack) } (\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double . read . unpack) } -\< (\_ | $l)($l | $d | \_ | \')* { tok tag } { unpack = UTF8.toString --unpack = id ident = res T_Ident . identC . rawIdentC -tag = res T_less_tag . identC . rawIdentC . BS.tail -tok f p s = f s +tok f p len s = f (UTF8.take len s) data Token = T_exclmark @@ -77,7 +78,7 @@ data Token | T_colon | T_semicolon | T_less - | T_less_close + | T_less_tag | T_equal | T_big_rarrow | T_great @@ -133,7 +134,6 @@ data Token | T_terminator | T_separator | T_nonempty - | T_less_tag Ident | T_String String -- string literals | T_Integer Integer -- integer literals | T_Double Double -- double precision float literals @@ -169,8 +169,6 @@ resWords = Map.fromList , b "/" T_alt , b ":" T_colon , b ";" T_semicolon - , b "" T_big_rarrow , b ">" T_great @@ -322,13 +320,51 @@ lexer cont = cont=< POk (inp,inp) T_EOF AlexError (AI pos _ _) -> PFailed pos "lexical error" AlexSkip inp' len -> {-trace (show len) $-} go (inp,inp') - AlexToken inp' len act -> POk (inp,inp') (act pos ({-UTF8.toString-} (UTF8.take len str))) + AlexToken inp' len act -> POk (inp,inp') (act pos len str) + +start_of_tag s = isJust (match s) + where + match s = do + s <- matchSpace s + (char s '/' + `mplus` + do s <- matchIdent s + s <- matchSpace s + (char s '/' + `mplus` + do s <- matchIdent s + s <- matchSpace s + char s '=')) + + matchSpace s + | BS.null s = Just s + | isSpace (BS.head s) = matchSpace (BS.tail s) + | otherwise = Just s + + init = + BS.pack "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ" + rest = + BS.append init (BS.pack "0123456789'") + + char s c + | BS.null s = Nothing + | BS.head s == c = Just (BS.tail s) + | otherwise = Nothing + + matchIdent s + | BS.null s = Nothing + | BS.elem (BS.head s) init = matchRest (BS.tail s) + | otherwise = Nothing + + matchRest s + | BS.null s = Just s + | BS.elem (BS.head s) rest = matchRest (BS.tail s) + | otherwise = Just s getPosn :: P Posn getPosn = P $ \ai2@(_,inp@(AI pos _ _)) -> POk ai2 pos diff --git a/src/compiler/api/GF/Grammar/Macros.hs b/src/compiler/api/GF/Grammar/Macros.hs index cb097395c..7d083fbe3 100644 --- a/src/compiler/api/GF/Grammar/Macros.hs +++ b/src/compiler/api/GF/Grammar/Macros.hs @@ -260,9 +260,6 @@ tuple2recordType ts = [(tupleLabel i, t) | (i,t) <- zip [1..] ts] tuple2recordPatt :: [Patt] -> [(Label,Patt)] tuple2recordPatt ts = [(tupleLabel i, t) | (i,t) <- zip [1..] ts] -attrs2record :: [(Ident, Term)] -> [Assign] -attrs2record ts = [assign (ident2label i) ((App (Q (cPredef, (identS "linearize"))) t)) | (i,t) <- ts] - mkCases :: Ident -> Term -> Term mkCases x t = T TRaw [(PV x, t)] diff --git a/src/compiler/api/GF/Grammar/Parser.y b/src/compiler/api/GF/Grammar/Parser.y index cad954d55..40ff012bd 100644 --- a/src/compiler/api/GF/Grammar/Parser.y +++ b/src/compiler/api/GF/Grammar/Parser.y @@ -59,8 +59,7 @@ import qualified Data.Map as Map ':' { T_colon } ';' { T_semicolon } '<' { T_less } - '' { T_big_rarrow} '>' { T_great } @@ -433,7 +432,9 @@ Exp3 Exp4 :: { Term } Exp4 - : Exp4 Exp5 { App $1 $2 } + : Exp4 Exp5 {% case $2 of + CloseTag id -> mkTag id $1 Empty + _ -> return (App $1 $2) } | Exp4 '{' Exp '}' { App $1 (ImplArg $3) } | 'case' Exp 'of' '{' ListCase '}' { let annot = case $2 of Typed _ t -> TTyped t @@ -470,17 +471,18 @@ Exp6 | '<' ListTupleComp '>' { R (tuple2record $2) } | '<' Exp ':' Exp '>' { Typed $2 $4 } | '(' Exp ')' { $2 } - | '' Exps '' {% mkTag $1 $6 $2 $4 } - | '' {% mkTag $1 $1 $2 [Empty] } + | '' { OpenTag $2 $3 } + | '' { CloseTag $3 } + | '' { markup $2 $3 Empty } -Attributes :: { [(Ident,Term)] } +Attributes :: { [Assign] } Attributes : { [] } | Attribute Attributes { $1:$2 } -Attribute :: { (Ident,Term) } +Attribute :: { Assign } Attribute - : Ident '=' Exp6 { ($1,$3) } + : Ident '=' Exp6 { assign (ident2label $1) $3 } ListExp :: { [Term] } ListExp @@ -827,23 +829,34 @@ mkL :: Posn -> Posn -> x -> L x mkL (Pn l1 _) (Pn l2 _) x = L (Local l1 l2) x -mkTag ident ident' attrs t - | showIdent ident == showIdent ident' = return (App - (App - (App - (Q (cPredef, (identS "markup"))) - (R (attrs2record attrs)) - ) - (K (showIdent ident)) - ) - (mkConcatenation t) - ) +mkTag ident (App t1 t2) conc = + case match ident t2 of + Just attrs -> fmap (App t1) (mkTag ident t2 conc) + Nothing -> let t = App (Q (cPredef, (identS "linearize"))) t2 + in mkTag ident t1 $ case conc of + Empty -> t + _ -> C t conc +mkTag ident t conc = + case match ident t of + Just attrs -> return (markup ident attrs conc) + Nothing -> fail ("Unmatched closing tag " ++ showIdent ident) - | otherwise = fail "Tags don't match" - -mkConcatenation [] = Empty -mkConcatenation [t] = (App (Q (cPredef, (identS "linearize"))) t) -mkConcatenation (t:ts) = C t (mkConcatenation ts) +match ident (OpenTag ident' attrs) + | ident == ident' = Just attrs +match ident (R [(lbl,(Nothing,Vr ident'))]) + | lbl == ident2label (identS "p1") && ident == ident' + = Just [] +match ident _ = Nothing +markup ident attrs content = + App + (App + (App + (Q (cPredef, (identS "markup"))) + (R attrs) + ) + (K (showIdent ident)) + ) + content }