mirror of
https://github.com/GrammaticalFramework/gf-core.git
synced 2026-05-06 17:52:51 -06:00
Use HashTable to share strings in tokens when parsing GFC files.
This commit is contained in:
@@ -1,7 +1,7 @@
|
|||||||
{-# OPTIONS -fglasgow-exts -cpp #-}
|
{-# OPTIONS -fglasgow-exts -cpp #-}
|
||||||
{-# LINE 3 "LexGFC.x" #-}
|
{-# LINE 3 "LexGFC.x" #-}
|
||||||
module LexGFC where
|
module LexGFC where
|
||||||
|
import SharedString
|
||||||
import ErrM
|
import ErrM
|
||||||
|
|
||||||
#if __GLASGOW_HASKELL__ >= 503
|
#if __GLASGOW_HASKELL__ >= 503
|
||||||
@@ -35,10 +35,10 @@ alex_accept = listArray (0::Int,14) [[],[],[(AlexAccSkip)],[(AlexAcc (alex_actio
|
|||||||
tok f p s = f p s
|
tok f p s = f p s
|
||||||
|
|
||||||
data Tok =
|
data Tok =
|
||||||
TS String -- reserved words
|
TS !String -- reserved words
|
||||||
| TL String -- string literals
|
| TL !String -- string literals
|
||||||
| TI String -- integer literals
|
| TI String -- integer literals
|
||||||
| TV String -- identifiers
|
| TV !String -- identifiers -- H
|
||||||
| TD String -- double precision float literals
|
| TD String -- double precision float literals
|
||||||
| TC String -- character literals
|
| TC String -- character literals
|
||||||
|
|
||||||
@@ -130,9 +130,9 @@ alexGetChar (p, _, (c:s)) =
|
|||||||
alexInputPrevChar :: AlexInput -> Char
|
alexInputPrevChar :: AlexInput -> Char
|
||||||
alexInputPrevChar (p, c, s) = c
|
alexInputPrevChar (p, c, s) = c
|
||||||
|
|
||||||
alex_action_1 = tok (\p s -> PT p (TS s))
|
alex_action_1 = tok (\p s -> PT p (TS (shareString s)))
|
||||||
alex_action_2 = tok (\p s -> PT p (eitherResIdent TV s))
|
alex_action_2 = tok (\p s -> PT p (eitherResIdent (TV . shareString) s))
|
||||||
alex_action_3 = tok (\p s -> PT p (TL $ unescapeInitTail s))
|
alex_action_3 = tok (\p s -> PT p (TL $ shareString $ unescapeInitTail s))
|
||||||
alex_action_4 = tok (\p s -> PT p (TI s))
|
alex_action_4 = tok (\p s -> PT p (TI s))
|
||||||
{-# LINE 1 "GenericTemplate.hs" #-}
|
{-# LINE 1 "GenericTemplate.hs" #-}
|
||||||
{-# LINE 1 "<built-in>" #-}
|
{-# LINE 1 "<built-in>" #-}
|
||||||
|
|||||||
131
src/GF/Canon/LexGFC.x
Normal file
131
src/GF/Canon/LexGFC.x
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
-- -*- haskell -*-
|
||||||
|
-- This Alex file was machine-generated by the BNF converter
|
||||||
|
-- Lines with -- H have been hacked for greater performance
|
||||||
|
{
|
||||||
|
module LexGFC where
|
||||||
|
import SharedString -- H
|
||||||
|
import ErrM
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
$l = [a-zA-Z\192 - \255] # [\215 \247] -- isolatin1 letter FIXME
|
||||||
|
$c = [A-Z\192-\221] # [\215] -- capital isolatin1 letter FIXME
|
||||||
|
$s = [a-z\222-\255] # [\247] -- small isolatin1 letter FIXME
|
||||||
|
$d = [0-9] -- digit
|
||||||
|
$i = [$l $d _ '] -- identifier character
|
||||||
|
$u = [\0-\255] -- universal: any character
|
||||||
|
|
||||||
|
@rsyms = -- reserved words consisting of special symbols
|
||||||
|
\; | \= | \{ | \} | \: | \- \> | \* \* | \[ | \] | \\ | \. | \( | \) | \_ | \< | \> | \$ | \? | \= \> | \! | \+ \+ | \/ | \@ | \+ | \| | \,
|
||||||
|
|
||||||
|
:-
|
||||||
|
|
||||||
|
$white+ ;
|
||||||
|
@rsyms { tok (\p s -> PT p (TS (shareString s))) } -- H
|
||||||
|
|
||||||
|
$l $i* { tok (\p s -> PT p (eitherResIdent (TV . shareString) s)) } -- H
|
||||||
|
\" ([$u # [\" \\ \n]] | (\\ (\" | \\ | \' | n | t)))* \"{ tok (\p s -> PT p (TL $ shareString $ unescapeInitTail s)) } -- H
|
||||||
|
|
||||||
|
$d+ { tok (\p s -> PT p (TI s)) }
|
||||||
|
|
||||||
|
|
||||||
|
{
|
||||||
|
|
||||||
|
tok f p s = f p s
|
||||||
|
|
||||||
|
data Tok =
|
||||||
|
TS !String -- reserved words -- H
|
||||||
|
| TL !String -- string literals -- H
|
||||||
|
| TI String -- integer literals
|
||||||
|
| TV !String -- identifiers -- H
|
||||||
|
| TD String -- double precision float literals
|
||||||
|
| TC String -- character literals
|
||||||
|
|
||||||
|
deriving (Eq,Show,Ord)
|
||||||
|
|
||||||
|
data Token =
|
||||||
|
PT Posn Tok
|
||||||
|
| Err Posn
|
||||||
|
deriving (Eq,Show,Ord)
|
||||||
|
|
||||||
|
tokenPos (PT (Pn _ l _) _ :_) = "line " ++ show l
|
||||||
|
tokenPos (Err (Pn _ l _) :_) = "line " ++ show l
|
||||||
|
tokenPos _ = "end of file"
|
||||||
|
|
||||||
|
posLineCol (Pn _ l c) = (l,c)
|
||||||
|
mkPosToken t@(PT p _) = (posLineCol p, prToken t)
|
||||||
|
|
||||||
|
prToken t = case t of
|
||||||
|
PT _ (TS s) -> s
|
||||||
|
PT _ (TI s) -> s
|
||||||
|
PT _ (TV s) -> s
|
||||||
|
PT _ (TD s) -> s
|
||||||
|
PT _ (TC s) -> s
|
||||||
|
|
||||||
|
_ -> show t
|
||||||
|
|
||||||
|
eitherResIdent :: (String -> Tok) -> String -> Tok
|
||||||
|
eitherResIdent tv s = if isResWord s then (TS s) else (tv s) where
|
||||||
|
isResWord s = isInTree s $
|
||||||
|
B "lin" (B "concrete" (B "Type" (B "Str" (B "Ints" N N) N) (B "cat" (B "abstract" N N) N)) (B "fun" (B "flags" (B "data" N N) N) (B "in" (B "grammar" N N) N))) (B "pre" (B "open" (B "of" (B "lincat" N N) N) (B "param" (B "oper" N N) N)) (B "transfer" (B "table" (B "resource" N N) N) (B "variants" N N)))
|
||||||
|
|
||||||
|
data BTree = N | B String BTree BTree deriving (Show)
|
||||||
|
|
||||||
|
isInTree :: String -> BTree -> Bool
|
||||||
|
isInTree x tree = case tree of
|
||||||
|
N -> False
|
||||||
|
B a left right
|
||||||
|
| x < a -> isInTree x left
|
||||||
|
| x > a -> isInTree x right
|
||||||
|
| x == a -> True
|
||||||
|
|
||||||
|
unescapeInitTail :: String -> String
|
||||||
|
unescapeInitTail = unesc . tail where
|
||||||
|
unesc s = case s of
|
||||||
|
'\\':c:cs | elem c ['\"', '\\', '\''] -> c : unesc cs
|
||||||
|
'\\':'n':cs -> '\n' : unesc cs
|
||||||
|
'\\':'t':cs -> '\t' : unesc cs
|
||||||
|
'"':[] -> []
|
||||||
|
c:cs -> c : unesc cs
|
||||||
|
_ -> []
|
||||||
|
|
||||||
|
-------------------------------------------------------------------
|
||||||
|
-- Alex wrapper code.
|
||||||
|
-- A modified "posn" wrapper.
|
||||||
|
-------------------------------------------------------------------
|
||||||
|
|
||||||
|
data Posn = Pn !Int !Int !Int
|
||||||
|
deriving (Eq, Show,Ord)
|
||||||
|
|
||||||
|
alexStartPos :: Posn
|
||||||
|
alexStartPos = Pn 0 1 1
|
||||||
|
|
||||||
|
alexMove :: Posn -> Char -> Posn
|
||||||
|
alexMove (Pn a l c) '\t' = Pn (a+1) l (((c+7) `div` 8)*8+1)
|
||||||
|
alexMove (Pn a l c) '\n' = Pn (a+1) (l+1) 1
|
||||||
|
alexMove (Pn a l c) _ = Pn (a+1) l (c+1)
|
||||||
|
|
||||||
|
type AlexInput = (Posn, -- current position,
|
||||||
|
Char, -- previous char
|
||||||
|
String) -- current input string
|
||||||
|
|
||||||
|
tokens :: String -> [Token]
|
||||||
|
tokens str = go (alexStartPos, '\n', str)
|
||||||
|
where
|
||||||
|
go :: (Posn, Char, String) -> [Token]
|
||||||
|
go inp@(pos, _, str) =
|
||||||
|
case alexScan inp 0 of
|
||||||
|
AlexEOF -> []
|
||||||
|
AlexError (pos, _, _) -> fail $ show pos ++ ": lexical error"
|
||||||
|
AlexSkip inp' len -> go inp'
|
||||||
|
AlexToken inp' len act -> act pos (take len str) : (go inp')
|
||||||
|
|
||||||
|
alexGetChar :: AlexInput -> Maybe (Char,AlexInput)
|
||||||
|
alexGetChar (p, c, []) = Nothing
|
||||||
|
alexGetChar (p, _, (c:s)) =
|
||||||
|
let p' = alexMove p c
|
||||||
|
in p' `seq` Just (c, (p', c, s))
|
||||||
|
|
||||||
|
alexInputPrevChar :: AlexInput -> Char
|
||||||
|
alexInputPrevChar (p, c, s) = c
|
||||||
|
}
|
||||||
18
src/GF/Data/SharedString.hs
Normal file
18
src/GF/Data/SharedString.hs
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
module SharedString (shareString) where
|
||||||
|
|
||||||
|
import Data.HashTable as H
|
||||||
|
import System.IO.Unsafe (unsafePerformIO)
|
||||||
|
|
||||||
|
{-# NOINLINE stringPool #-}
|
||||||
|
stringPool :: HashTable String String
|
||||||
|
stringPool = unsafePerformIO $ new (==) hashString
|
||||||
|
|
||||||
|
{-# NOINLINE shareString #-}
|
||||||
|
shareString :: String -> String
|
||||||
|
shareString s = unsafePerformIO $ do
|
||||||
|
mv <- H.lookup stringPool s
|
||||||
|
case mv of
|
||||||
|
Just s' -> return s'
|
||||||
|
Nothing -> do
|
||||||
|
H.insert stringPool s s
|
||||||
|
return s
|
||||||
Reference in New Issue
Block a user