From 1e6e84c5a6a075f463be5a8e92d4ef6a6c512700 Mon Sep 17 00:00:00 2001
From: Matheus Bernardo <matheus.bernardo@klarna.com>
Date: Fri, 1 Mar 2024 16:48:00 +0100
Subject: [PATCH 1/2] Adding html tags to the parser and lexer

---
 src/compiler/api/GF/Grammar/Lexer.x   | 10 ++++++--
 src/compiler/api/GF/Grammar/Macros.hs |  3 +++
 src/compiler/api/GF/Grammar/Parser.y  | 33 +++++++++++++++++++++++++++
 3 files changed, 44 insertions(+), 2 deletions(-)

diff --git a/src/compiler/api/GF/Grammar/Lexer.x b/src/compiler/api/GF/Grammar/Lexer.x
index b6b024d7b..5303932de 100644
--- a/src/compiler/api/GF/Grammar/Lexer.x
+++ b/src/compiler/api/GF/Grammar/Lexer.x
@@ -31,7 +31,7 @@ $i = [$l $d _ ']          -- identifier character
 $u = [.\n]                -- universal: any character
 
 @rsyms =    -- symbols and non-identifier-like reserved words
-   \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/ | \: \= | \: \: \=
+   \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \<\/ | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/ | \: \= | \: \: \=
 
 :-
 "--" [.]* ; -- Toss single line comments
@@ -47,11 +47,14 @@ $white+ ;
 (\-)? $d+                       { tok (T_Integer . read . unpack) }
 (\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double  . read . unpack) }
 
+\< (\_ | $l)($l | $d | \_ | \')*  { tok tag }
+
 {
 unpack = UTF8.toString
 --unpack = id
 
 ident = res T_Ident . identC . rawIdentC
+tag = res T_less_tag . identC . rawIdentC . BS.tail
 
 tok f p s = f s
 
@@ -74,6 +77,7 @@ data Token
  | T_colon
  | T_semicolon
  | T_less
+ | T_less_close
  | T_equal
  | T_big_rarrow
  | T_great
@@ -129,12 +133,13 @@ data Token
  | T_terminator
  | T_separator
  | T_nonempty
+ | T_less_tag Ident
  | T_String  String          -- string literals
  | T_Integer Integer         -- integer literals
  | T_Double  Double          -- double precision float literals
  | T_Ident   Ident
  | T_EOF
--- deriving Show -- debug
+ deriving Show -- debug
 
 res = eitherResIdent
 eitherResIdent :: (Ident -> Token) -> Ident -> Token
@@ -164,6 +169,7 @@ resWords = Map.fromList
  , b "/"  T_alt
  , b ":"  T_colon
  , b ";"  T_semicolon
+ , b "</" T_less_close
  , b "<"  T_less
  , b "="  T_equal
  , b "=>" T_big_rarrow
diff --git a/src/compiler/api/GF/Grammar/Macros.hs b/src/compiler/api/GF/Grammar/Macros.hs
index 7d083fbe3..cb097395c 100644
--- a/src/compiler/api/GF/Grammar/Macros.hs
+++ b/src/compiler/api/GF/Grammar/Macros.hs
@@ -260,6 +260,9 @@ tuple2recordType ts = [(tupleLabel i, t) | (i,t) <- zip [1..] ts]
 tuple2recordPatt :: [Patt] -> [(Label,Patt)]
 tuple2recordPatt ts = [(tupleLabel i, t) | (i,t) <- zip [1..] ts]
 
+attrs2record :: [(Ident, Term)] -> [Assign]
+attrs2record ts = [assign (ident2label i) ((App (Q (cPredef, (identS "linearize"))) t)) | (i,t) <-  ts]
+
 mkCases :: Ident -> Term -> Term
 mkCases x t = T TRaw [(PV x, t)]
 
diff --git a/src/compiler/api/GF/Grammar/Parser.y b/src/compiler/api/GF/Grammar/Parser.y
index 76e3a4c16..cad954d55 100644
--- a/src/compiler/api/GF/Grammar/Parser.y
+++ b/src/compiler/api/GF/Grammar/Parser.y
@@ -59,6 +59,8 @@ import qualified Data.Map as Map
  ':'          { T_colon     }
  ';'          { T_semicolon }
  '<'          { T_less      }
+ '</'         { T_less_close }
+ '<tag'       { (T_less_tag $$)  }
  '='          { T_equal     }
  '=>'         { T_big_rarrow}
  '>'          { T_great     }
@@ -468,6 +470,17 @@ Exp6
   | '<' ListTupleComp '>' { R (tuple2record $2) }
   | '<' Exp ':' Exp '>'   { Typed $2 $4      }
   | '(' Exp ')'           { $2 }
+  | '<tag' Attributes '>'  Exps '</' Ident '>'    {% mkTag $1 $6 $2 $4 }
+  | '<tag' Attributes '/' '>' {% mkTag $1 $1 $2 [Empty] }
+
+Attributes :: { [(Ident,Term)] }
+Attributes
+  :                       { []    }
+  | Attribute Attributes  { $1:$2 }
+
+Attribute :: { (Ident,Term) }
+Attribute
+  : Ident '=' Exp6   { ($1,$3) }
 
 ListExp :: { [Term] }
 ListExp
@@ -813,4 +826,24 @@ mkAlts cs = case cs of
 mkL :: Posn -> Posn -> x -> L x
 mkL (Pn l1 _) (Pn l2 _) x = L (Local l1 l2) x
 
+
+mkTag ident ident' attrs t
+  | showIdent ident ==  showIdent ident'   =  return  (App
+                                                        (App
+                                                          (App 
+                                                              (Q (cPredef, (identS "markup"))) 
+                                                              (R (attrs2record attrs))
+                                                          )
+                                                          (K (showIdent ident))
+                                                        )
+                                                        (mkConcatenation t)
+                                                      ) 
+
+  | otherwise                              =  fail "Tags don't match"
+
+mkConcatenation [] = Empty
+mkConcatenation [t] = (App (Q (cPredef, (identS "linearize"))) t)
+mkConcatenation (t:ts) = C t (mkConcatenation ts)
+
+
 }

From 52c56feaf2f0785f07b2f52f63ec24491a62b44e Mon Sep 17 00:00:00 2001
From: Krasimir Angelov <kr.angelov@gmail.com>
Date: Wed, 6 Mar 2024 19:27:16 +0100
Subject: [PATCH 2/2] fix the disambiguation of html tags

---
 src/compiler/api/GF/Grammar/Grammar.hs |  2 +
 src/compiler/api/GF/Grammar/Lexer.x    | 62 ++++++++++++++++++++------
 src/compiler/api/GF/Grammar/Macros.hs  |  3 --
 src/compiler/api/GF/Grammar/Parser.y   | 61 +++++++++++++++----------
 4 files changed, 88 insertions(+), 40 deletions(-)

diff --git a/src/compiler/api/GF/Grammar/Grammar.hs b/src/compiler/api/GF/Grammar/Grammar.hs
index cd992810d..1353462b8 100644
--- a/src/compiler/api/GF/Grammar/Grammar.hs
+++ b/src/compiler/api/GF/Grammar/Grammar.hs
@@ -399,6 +399,8 @@ data Term =
  | Strs [Term]                   -- ^ conditioning prefix strings: @strs {s ; ...}@
  | TSymCat Int LIndex [(LIndex,(Ident,Type))]
  | TSymVar Int Int
+ | OpenTag Ident [Assign]        -- used internally in the parser
+ | CloseTag Ident                -- used internally in the parser
   deriving (Show, Eq, Ord)
 
 -- | Patterns
diff --git a/src/compiler/api/GF/Grammar/Lexer.x b/src/compiler/api/GF/Grammar/Lexer.x
index 5303932de..98e3be332 100644
--- a/src/compiler/api/GF/Grammar/Lexer.x
+++ b/src/compiler/api/GF/Grammar/Lexer.x
@@ -8,7 +8,7 @@ module GF.Grammar.Lexer
          ) where
 
 import Control.Applicative
-import Control.Monad(ap)
+import Control.Monad(ap,mplus)
 import GF.Infra.Ident
 --import GF.Data.Operations
 import qualified Data.ByteString.Char8 as BS
@@ -17,8 +17,8 @@ import qualified Data.ByteString.Internal as BS(w2c)
 import qualified Data.ByteString.UTF8 as UTF8
 import qualified Data.Map as Map
 import Data.Word(Word8)
-import Data.Char(readLitChar)
---import Debug.Trace(trace)
+import Data.Char(readLitChar,isSpace)
+import Data.Maybe(isJust)
 import qualified Control.Monad.Fail as Fail
 }
 
@@ -31,7 +31,7 @@ $i = [$l $d _ ']          -- identifier character
 $u = [.\n]                -- universal: any character
 
 @rsyms =    -- symbols and non-identifier-like reserved words
-   \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \< | \<\/ | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/ | \: \= | \: \: \=
+   \; | \= | \{ | \} | \( | \) | \~ | \* \* | \: | \- \> | \, | \[ | \] | \- | \. | \| | \% | \? | \> | \@ | \# | \! | \* | \+ | \+ \+ | \\ | \\\\ | \= \> | \_ | \$ | \/ | \: \= | \: \: \=
 
 :-
 "--" [.]* ; -- Toss single line comments
@@ -39,6 +39,9 @@ $u = [.\n]                -- universal: any character
 
 $white+ ;
 @rsyms                          { tok ident }
+\<                              { \_ _ s -> if start_of_tag (BS.tail s)
+                                              then T_less_tag
+                                              else T_less     }
 \' ([. # [\' \\ \n]] | (\\ (\' | \\)))+ \' { tok (T_Ident . identS . unescapeInitTail . unpack) }
 (\_ | $l)($l | $d | \_ | \')*   { tok ident }
 
@@ -47,16 +50,14 @@ $white+ ;
 (\-)? $d+                       { tok (T_Integer . read . unpack) }
 (\-)? $d+ \. $d+ (e (\-)? $d+)? { tok (T_Double  . read . unpack) }
 
-\< (\_ | $l)($l | $d | \_ | \')*  { tok tag }
 
 {
 unpack = UTF8.toString
 --unpack = id
 
 ident = res T_Ident . identC . rawIdentC
-tag = res T_less_tag . identC . rawIdentC . BS.tail
 
-tok f p s = f s
+tok f p len s = f (UTF8.take len s)
 
 data Token
  = T_exclmark
@@ -77,7 +78,7 @@ data Token
  | T_colon
  | T_semicolon
  | T_less
- | T_less_close
+ | T_less_tag
  | T_equal
  | T_big_rarrow
  | T_great
@@ -133,7 +134,6 @@ data Token
  | T_terminator
  | T_separator
  | T_nonempty
- | T_less_tag Ident
  | T_String  String          -- string literals
  | T_Integer Integer         -- integer literals
  | T_Double  Double          -- double precision float literals
@@ -169,8 +169,6 @@ resWords = Map.fromList
  , b "/"  T_alt
  , b ":"  T_colon
  , b ";"  T_semicolon
- , b "</" T_less_close
- , b "<"  T_less
  , b "="  T_equal
  , b "=>" T_big_rarrow
  , b ">"  T_great
@@ -322,13 +320,51 @@ lexer cont = cont=<<token
 token :: P Token
 token = P go
   where
-  --cont' t = trace (show t) (cont t)
     go ai2@(_,inp@(AI pos _ str)) =
       case alexScan inp 0 of
         AlexEOF                -> POk (inp,inp) T_EOF
         AlexError (AI pos _ _) -> PFailed pos "lexical error"
         AlexSkip  inp' len     -> {-trace (show len) $-} go (inp,inp')
-        AlexToken inp' len act -> POk (inp,inp') (act pos ({-UTF8.toString-} (UTF8.take len str)))
+        AlexToken inp' len act -> POk (inp,inp') (act pos len str)
+
+start_of_tag s = isJust (match s)
+  where
+    match s = do
+      s <- matchSpace s
+      (char s '/'
+       `mplus`
+       do s <- matchIdent s
+          s <- matchSpace s
+          (char s '/'
+           `mplus`
+           do s <- matchIdent s
+              s <- matchSpace s
+              char s '='))
+
+    matchSpace s
+      | BS.null s           = Just s
+      | isSpace (BS.head s) = matchSpace (BS.tail s)
+      | otherwise           = Just s
+
+    init =
+      BS.pack "_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
+    rest =
+      BS.append init (BS.pack "0123456789'")
+
+    char s c
+      | BS.null s                = Nothing
+      | BS.head s == c           = Just (BS.tail s)
+      | otherwise                = Nothing
+
+    matchIdent s
+      | BS.null s                = Nothing
+      | BS.elem (BS.head s) init = matchRest (BS.tail s)
+      | otherwise                = Nothing
+
+    matchRest s
+      | BS.null s                = Just s
+      | BS.elem (BS.head s) rest = matchRest (BS.tail s)
+      | otherwise                = Just s
 
 getPosn :: P Posn
 getPosn = P $ \ai2@(_,inp@(AI pos _ _)) -> POk ai2 pos
diff --git a/src/compiler/api/GF/Grammar/Macros.hs b/src/compiler/api/GF/Grammar/Macros.hs
index cb097395c..7d083fbe3 100644
--- a/src/compiler/api/GF/Grammar/Macros.hs
+++ b/src/compiler/api/GF/Grammar/Macros.hs
@@ -260,9 +260,6 @@ tuple2recordType ts = [(tupleLabel i, t) | (i,t) <- zip [1..] ts]
 tuple2recordPatt :: [Patt] -> [(Label,Patt)]
 tuple2recordPatt ts = [(tupleLabel i, t) | (i,t) <- zip [1..] ts]
 
-attrs2record :: [(Ident, Term)] -> [Assign]
-attrs2record ts = [assign (ident2label i) ((App (Q (cPredef, (identS "linearize"))) t)) | (i,t) <-  ts]
-
 mkCases :: Ident -> Term -> Term
 mkCases x t = T TRaw [(PV x, t)]
 
diff --git a/src/compiler/api/GF/Grammar/Parser.y b/src/compiler/api/GF/Grammar/Parser.y
index cad954d55..40ff012bd 100644
--- a/src/compiler/api/GF/Grammar/Parser.y
+++ b/src/compiler/api/GF/Grammar/Parser.y
@@ -59,8 +59,7 @@ import qualified Data.Map as Map
  ':'          { T_colon     }
  ';'          { T_semicolon }
  '<'          { T_less      }
- '</'         { T_less_close }
- '<tag'       { (T_less_tag $$)  }
+ '<tag'       { T_less_tag  }
  '='          { T_equal     }
  '=>'         { T_big_rarrow}
  '>'          { T_great     }
@@ -433,7 +432,9 @@ Exp3
 
 Exp4 :: { Term }
 Exp4
-  : Exp4 Exp5                        { App $1 $2     }
+  : Exp4 Exp5                        {% case $2 of
+                                          CloseTag id -> mkTag id $1 Empty
+                                          _           -> return (App $1 $2) }
   | Exp4 '{' Exp '}'                 { App $1 (ImplArg $3) } 
   | 'case' Exp 'of' '{' ListCase '}' { let annot = case $2 of
                                              Typed _ t -> TTyped t
@@ -470,17 +471,18 @@ Exp6
   | '<' ListTupleComp '>' { R (tuple2record $2) }
   | '<' Exp ':' Exp '>'   { Typed $2 $4      }
   | '(' Exp ')'           { $2 }
-  | '<tag' Attributes '>'  Exps '</' Ident '>'    {% mkTag $1 $6 $2 $4 }
-  | '<tag' Attributes '/' '>' {% mkTag $1 $1 $2 [Empty] }
+  | '<tag' Ident Attributes '>'     { OpenTag $2 $3 }
+  | '<tag' '/' Ident '>'            { CloseTag $3   }
+  | '<tag' Ident Attributes '/' '>' { markup $2 $3 Empty }
 
-Attributes :: { [(Ident,Term)] }
+Attributes :: { [Assign] }
 Attributes
   :                       { []    }
   | Attribute Attributes  { $1:$2 }
 
-Attribute :: { (Ident,Term) }
+Attribute :: { Assign }
 Attribute
-  : Ident '=' Exp6   { ($1,$3) }
+  : Ident '=' Exp6   { assign (ident2label $1) $3 }
 
 ListExp :: { [Term] }
 ListExp
@@ -827,23 +829,34 @@ mkL :: Posn -> Posn -> x -> L x
 mkL (Pn l1 _) (Pn l2 _) x = L (Local l1 l2) x
 
 
-mkTag ident ident' attrs t
-  | showIdent ident ==  showIdent ident'   =  return  (App
-                                                        (App
-                                                          (App 
-                                                              (Q (cPredef, (identS "markup"))) 
-                                                              (R (attrs2record attrs))
-                                                          )
-                                                          (K (showIdent ident))
-                                                        )
-                                                        (mkConcatenation t)
-                                                      ) 
+mkTag ident (App t1 t2) conc =
+  case match ident t2 of
+    Just attrs -> fmap (App t1) (mkTag ident t2 conc)
+    Nothing    -> let t = App (Q (cPredef, (identS "linearize"))) t2
+                  in mkTag ident t1 $ case conc of
+                                        Empty -> t
+                                        _     -> C t conc
+mkTag ident t conc =
+  case match ident t of
+    Just attrs -> return (markup ident attrs conc)
+    Nothing    -> fail ("Unmatched closing tag " ++ showIdent ident)
 
-  | otherwise                              =  fail "Tags don't match"
-
-mkConcatenation [] = Empty
-mkConcatenation [t] = (App (Q (cPredef, (identS "linearize"))) t)
-mkConcatenation (t:ts) = C t (mkConcatenation ts)
+match ident (OpenTag ident' attrs)
+  | ident == ident'    = Just attrs
+match ident (R [(lbl,(Nothing,Vr ident'))])
+  | lbl == ident2label (identS "p1") && ident == ident'
+                       = Just []
+match ident _          = Nothing
 
+markup ident attrs content =
+  App
+    (App
+       (App 
+          (Q (cPredef, (identS "markup")))
+          (R attrs)
+       )
+       (K (showIdent ident))
+    )
+    content
 
 }