From ff11f2f9639055b62ef681a48272e546489046c6 Mon Sep 17 00:00:00 2001 From: krasimir Date: Wed, 11 May 2016 07:11:45 +0000 Subject: [PATCH] many of the uses of peekCString and withCString in the Haskell binding were incorrect since they encode the string in the system locale while the C runtime is always using UTF8 --- src/runtime/haskell-bind/PGF2.hsc | 120 ++++++++++++++------------- src/runtime/haskell-bind/PGF2/FFI.hs | 39 ++++++++- 2 files changed, 99 insertions(+), 60 deletions(-) diff --git a/src/runtime/haskell-bind/PGF2.hsc b/src/runtime/haskell-bind/PGF2.hsc index 96677d3bd..af310b17f 100644 --- a/src/runtime/haskell-bind/PGF2.hsc +++ b/src/runtime/haskell-bind/PGF2.hsc @@ -99,7 +99,7 @@ languages p = getLanguages :: IORef (Map.Map String Concr) -> MapItorCallback getLanguages ref itor key value exn = do langs <- readIORef ref - name <- peekCString (castPtr key) + name <- peekUtf8CString (castPtr key) concr <- fmap (\ptr -> Concr ptr p) $ peek (castPtr value) writeIORef ref $! Map.insert name concr langs @@ -108,18 +108,18 @@ generateAll p cat = unsafePerformIO $ do genPl <- gu_new_pool exprPl <- gu_new_pool - enum <- withCString cat $ \cat -> do - exn <- gu_new_exn genPl - pgf_generate_all (pgf p) cat exn genPl exprPl + cat <- newUtf8CString cat genPl + exn <- gu_new_exn genPl + enum <- pgf_generate_all (pgf p) cat exn genPl exprPl genFPl <- newForeignPtr gu_pool_finalizer genPl exprFPl <- newForeignPtr gu_pool_finalizer exprPl fromPgfExprEnum enum genFPl (p,exprFPl) abstractName :: PGF -> AbsName -abstractName p = unsafePerformIO (peekCString =<< pgf_abstract_name (pgf p)) +abstractName p = unsafePerformIO (peekUtf8CString =<< pgf_abstract_name (pgf p)) startCat :: PGF -> Cat -startCat p = unsafePerformIO (peekCString =<< pgf_start_cat (pgf p)) +startCat p = unsafePerformIO (peekUtf8CString =<< pgf_start_cat (pgf p)) loadConcr :: Concr -> FilePath -> IO () loadConcr c fpath = @@ -146,12 +146,13 @@ unloadConcr c = pgf_concrete_unload (concr c) functionType :: PGF -> CId -> Type functionType p fn = unsafePerformIO $ - withCString fn $ \c_fn -> do + withGuPool $ \tmpPl -> do + c_fn <- newUtf8CString fn tmpPl c_type <- pgf_function_type (pgf p) c_fn peekType c_type where peekType c_type = do - cid <- (#peek PgfType, cid) c_type >>= peekCString + cid <- (#peek PgfType, cid) c_type >>= peekUtf8CString c_hypos <- (#peek PgfType, hypos) c_type n_hypos <- (#peek GuSeq, len) c_hypos hs <- peekHypos (c_hypos `plusPtr` (#offset GuSeq, data)) 0 n_hypos @@ -161,7 +162,7 @@ functionType p fn = peekHypos :: Ptr a -> Int -> Int -> IO [Hypo] peekHypos c_hypo i n - | i < n = do cid <- (#peek PgfHypo, cid) c_hypo >>= peekCString + | i < n = do cid <- (#peek PgfHypo, cid) c_hypo >>= peekUtf8CString ty <- (#peek PgfHypo, type) c_hypo >>= peekType bt <- fmap toBindType ((#peek PgfHypo, bind_type) c_hypo) hs <- peekHypos (plusPtr c_hypo (#size PgfHypo)) (i+1) n @@ -190,7 +191,7 @@ graphvizAbstractTree p e = exn <- gu_new_exn tmpPl pgf_graphviz_abstract_tree (pgf p) (expr e) out exn s <- gu_string_buf_freeze sb tmpPl - peekCString s + peekUtf8CString s graphvizParseTree :: Concr -> Expr -> String @@ -201,7 +202,7 @@ graphvizParseTree c e = exn <- gu_new_exn tmpPl pgf_graphviz_parse_tree (concr c) (expr e) out exn s <- gu_string_buf_freeze sb tmpPl - peekCString s + peekUtf8CString s ----------------------------------------------------------------------------- -- Functions using Concr @@ -210,15 +211,17 @@ graphvizParseTree c e = type MorphoAnalysis = (Fun,String,Float) lookupMorpho :: Concr -> String -> [MorphoAnalysis] -lookupMorpho (Concr concr master) sent = unsafePerformIO $ - do ref <- newIORef [] - allocaBytes (#size PgfMorphoCallback) $ \cback -> - do fptr <- wrapLookupMorphoCallback (getAnalysis ref) - (#poke PgfMorphoCallback, callback) cback fptr - withCString sent $ \c_sent -> - pgf_lookup_morpho concr c_sent cback nullPtr - freeHaskellFunPtr fptr - readIORef ref +lookupMorpho (Concr concr master) sent = + unsafePerformIO $ + withGuPool $ \tmpPl -> do + ref <- newIORef [] + cback <- gu_malloc tmpPl (#size PgfMorphoCallback) + fptr <- wrapLookupMorphoCallback (getAnalysis ref) + (#poke PgfMorphoCallback, callback) cback fptr + c_sent <- newUtf8CString sent tmpPl + pgf_lookup_morpho concr c_sent cback nullPtr + freeHaskellFunPtr fptr + readIORef ref fullFormLexicon :: Concr -> [(String, [MorphoAnalysis])] fullFormLexicon lang = @@ -237,7 +240,7 @@ fullFormLexicon lang = if ffEntry == nullPtr then do finalizeForeignPtr fpl return [] - else do tok <- peekCString =<< pgf_fullform_get_string ffEntry + else do tok <- peekUtf8CString =<< pgf_fullform_get_string ffEntry ref <- newIORef [] allocaBytes (#size PgfMorphoCallback) $ \cback -> do fptr <- wrapLookupMorphoCallback (getAnalysis ref) @@ -250,8 +253,8 @@ fullFormLexicon lang = getAnalysis :: IORef [MorphoAnalysis] -> LookupMorphoCallback getAnalysis ref self c_lemma c_anal prob exn = do ans <- readIORef ref - lemma <- peekCString c_lemma - anal <- peekCString c_anal + lemma <- peekUtf8CString c_lemma + anal <- peekUtf8CString c_anal writeIORef ref ((lemma, anal, prob):ans) parse :: Concr -> Cat -> String -> Either String [(Expr,Float)] @@ -274,26 +277,26 @@ parseWithHeuristics :: Concr -- ^ the language with which we parse -> Either String [(Expr,Float)] parseWithHeuristics lang cat sent heuristic callbacks = unsafePerformIO $ - do parsePl <- gu_new_pool - exprPl <- gu_new_pool + do exprPl <- gu_new_pool + parsePl <- gu_new_pool exn <- gu_new_exn parsePl - enum <- withCString cat $ \cat -> - withCString sent $ \sent -> do - callbacks_map <- mkCallbacksMap (concr lang) callbacks parsePl - pgf_parse_with_heuristics (concr lang) cat sent heuristic callbacks_map exn parsePl exprPl + cat <- newUtf8CString cat parsePl + sent <- newUtf8CString sent parsePl + callbacks_map <- mkCallbacksMap (concr lang) callbacks parsePl + enum <- pgf_parse_with_heuristics (concr lang) cat sent heuristic callbacks_map exn parsePl exprPl failed <- gu_exn_is_raised exn if failed then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError if is_parse_error then do c_tok <- (#peek GuExn, data.data) exn - tok <- peekCString c_tok + tok <- peekUtf8CString c_tok gu_pool_free parsePl gu_pool_free exprPl return (Left tok) else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn - msg <- peekCString c_msg + msg <- peekUtf8CString c_msg gu_pool_free parsePl gu_pool_free exprPl throwIO (PGFError msg) @@ -308,15 +311,15 @@ parseWithHeuristics lang cat sent heuristic callbacks = mkCallbacksMap :: Ptr PgfConcr -> [(String, Int -> String -> Int -> Maybe (Expr,Float,Int))] -> Ptr GuPool -> IO (Ptr PgfCallbacksMap) mkCallbacksMap concr callbacks pool = do callbacks_map <- pgf_new_callbacks_map concr pool - forM_ callbacks $ \(cat,match) -> - withCString cat $ \ccat -> do - match <- wrapLiteralMatchCallback (match_callback match) - predict <- wrapLiteralPredictCallback predict_callback - hspgf_callbacks_map_add_literal concr callbacks_map ccat match predict pool + forM_ callbacks $ \(cat,match) -> do + ccat <- newUtf8CString cat pool + match <- wrapLiteralMatchCallback (match_callback match) + predict <- wrapLiteralPredictCallback predict_callback + hspgf_callbacks_map_add_literal concr callbacks_map ccat match predict pool return callbacks_map where match_callback match _ clin_idx csentence poffset out_pool = do - sentence <- peekCString csentence + sentence <- peekUtf8CString csentence coffset <- peek poffset case match (fromIntegral clin_idx) sentence (fromIntegral coffset) of Nothing -> return nullPtr @@ -361,26 +364,26 @@ parseWithOracle lang cat sent (predict,complete,literal) = do parsePl <- gu_new_pool exprPl <- gu_new_pool exn <- gu_new_exn parsePl - enum <- withCString cat $ \cat -> - withCString sent $ \sent -> do - predictPtr <- maybe (return nullFunPtr) (wrapOracleCallback . oracleWrapper) predict - completePtr <- maybe (return nullFunPtr) (wrapOracleCallback . oracleWrapper) complete - literalPtr <- maybe (return nullFunPtr) (wrapOracleLiteralCallback . oracleLiteralWrapper) literal - cback <- hspgf_new_oracle_callback sent predictPtr completePtr literalPtr parsePl - pgf_parse_with_oracle (concr lang) cat sent cback exn parsePl exprPl + cat <- newUtf8CString cat parsePl + sent <- newUtf8CString sent parsePl + predictPtr <- maybe (return nullFunPtr) (wrapOracleCallback . oracleWrapper) predict + completePtr <- maybe (return nullFunPtr) (wrapOracleCallback . oracleWrapper) complete + literalPtr <- maybe (return nullFunPtr) (wrapOracleLiteralCallback . oracleLiteralWrapper) literal + cback <- hspgf_new_oracle_callback sent predictPtr completePtr literalPtr parsePl + enum <- pgf_parse_with_oracle (concr lang) cat sent cback exn parsePl exprPl failed <- gu_exn_is_raised exn if failed then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError if is_parse_error then do c_tok <- (#peek GuExn, data.data) exn - tok <- peekCString c_tok + tok <- peekUtf8CString c_tok gu_pool_free parsePl gu_pool_free exprPl return (Left tok) else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn - msg <- peekCString c_msg + msg <- peekUtf8CString c_msg gu_pool_free parsePl gu_pool_free exprPl throwIO (PGFError msg) @@ -393,13 +396,13 @@ parseWithOracle lang cat sent (predict,complete,literal) = return (Right exprs) where oracleWrapper oracle catPtr lblPtr offset = do - cat <- peekCString catPtr - lbl <- peekCString lblPtr + cat <- peekUtf8CString catPtr + lbl <- peekUtf8CString lblPtr return (oracle cat lbl (fromIntegral offset)) oracleLiteralWrapper oracle catPtr lblPtr poffset out_pool = do - cat <- peekCString catPtr - lbl <- peekCString lblPtr + cat <- peekUtf8CString catPtr + lbl <- peekUtf8CString lblPtr offset <- peek poffset case oracle cat lbl (fromIntegral offset) of Just (e,prob,offset) -> @@ -425,7 +428,8 @@ parseWithOracle lang cat sent (predict,complete,literal) = hasLinearization :: Concr -> Fun -> Bool hasLinearization lang id = unsafePerformIO $ - withCString id (pgf_has_linearization (concr lang)) + withGuPool $ \pl -> + newUtf8CString id pl >>= pgf_has_linearization (concr lang) linearize :: Concr -> Expr -> String linearize lang e = unsafePerformIO $ @@ -441,11 +445,11 @@ linearize lang e = unsafePerformIO $ else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn - msg <- peekCString c_msg + msg <- peekUtf8CString c_msg throwIO (PGFError msg) else throwIO (PGFError "The abstract tree cannot be linearized") else do lin <- gu_string_buf_freeze sb pl - peekCString lin + peekUtf8CString lin linearizeAll :: Concr -> Expr -> [String] linearizeAll lang e = unsafePerformIO $ @@ -473,7 +477,7 @@ linearizeAll lang e = unsafePerformIO $ then collect cts exn pl else throwExn exn pl else do lin <- gu_string_buf_freeze sb tmpPl - s <- peekCString lin + s <- peekUtf8CString lin ss <- unsafeInterleaveIO (collect cts exn pl) return (s:ss) @@ -481,7 +485,7 @@ linearizeAll lang e = unsafePerformIO $ is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn - msg <- peekCString c_msg + msg <- peekUtf8CString c_msg gu_pool_free pl throwIO (PGFError msg) else do gu_pool_free pl @@ -500,7 +504,7 @@ alignWords lang e = unsafePerformIO $ else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn if is_exn then do c_msg <- (#peek GuExn, data.data) exn - msg <- peekCString c_msg + msg <- peekUtf8CString c_msg throwIO (PGFError msg) else throwIO (PGFError "The abstract tree cannot be linearized") else do len <- (#peek GuSeq, len) seq @@ -510,7 +514,7 @@ alignWords lang e = unsafePerformIO $ peekAlignmentPhrase :: Ptr () -> IO (String, [Int]) peekAlignmentPhrase ptr = do c_phrase <- (#peek PgfAlignmentPhrase, phrase) ptr - phrase <- peekCString c_phrase + phrase <- peekUtf8CString c_phrase n_fids <- (#peek PgfAlignmentPhrase, n_fids) ptr (fids :: [CInt]) <- peekArray (fromIntegral (n_fids :: CInt)) (ptr `plusPtr` (#offset PgfAlignmentPhrase, fids)) return (phrase, map fromIntegral fids) @@ -532,7 +536,7 @@ functions p = getFunctions :: IORef [String] -> MapItorCallback getFunctions ref itor key value exn = do names <- readIORef ref - name <- peekCString (castPtr key) + name <- peekUtf8CString (castPtr key) writeIORef ref $! (name : names) categories :: PGF -> [Cat] diff --git a/src/runtime/haskell-bind/PGF2/FFI.hs b/src/runtime/haskell-bind/PGF2/FFI.hs index 3ba5858bc..1e3abec64 100644 --- a/src/runtime/haskell-bind/PGF2/FFI.hs +++ b/src/runtime/haskell-bind/PGF2/FFI.hs @@ -2,12 +2,13 @@ module PGF2.FFI where +import Foreign ( alloca, poke ) import Foreign.C ---import Foreign.C.String import Foreign.Ptr import Foreign.ForeignPtr import Control.Exception import GHC.Ptr +import Data.Int(Int32) data PGF = PGF {pgf :: Ptr PgfPGF, pgfMaster :: ForeignPtr GuPool} data Concr = Concr {concr :: Ptr PgfConcr, concrMaster :: PGF} @@ -72,10 +73,16 @@ foreign import ccall "gu/file.h gu_file_in" foreign import ccall "gu/enum.h gu_enum_next" gu_enum_next :: Ptr a -> Ptr (Ptr b) -> Ptr GuPool -> IO () - + foreign import ccall "gu/string.h gu_string_buf_freeze" gu_string_buf_freeze :: Ptr GuStringBuf -> Ptr GuPool -> IO CString +foreign import ccall unsafe "gu/utf8.h gu_utf8_decode" + gu_utf8_decode :: Ptr CString -> IO Int32 + +foreign import ccall unsafe "gu/utf8.h gu_utf8_encode" + gu_utf8_encode :: Int32 -> Ptr CString -> IO () + withGuPool :: (Ptr GuPool -> IO a) -> IO a withGuPool f = bracket gu_new_pool gu_pool_free f @@ -85,6 +92,34 @@ newOut pool = out <- gu_string_buf_out sb return (sb,out) +peekUtf8CString :: CString -> IO String +peekUtf8CString ptr = + alloca $ \pptr -> + poke pptr ptr >> decode pptr + where + decode pptr = do + x <- gu_utf8_decode pptr + if x == 0 + then return [] + else do cs <- decode pptr + return (((toEnum . fromEnum) x) : cs) + +newUtf8CString :: String -> Ptr GuPool -> IO CString +newUtf8CString s pool = do + -- An UTF8 character takes up to 6 bytes. We allocate enough + -- memory for the worst case. This is wasteful but those + -- strings are usually allocated only temporary. + ptr <- gu_malloc pool (fromIntegral (length s * 6+1)) + alloca $ \pptr -> + poke pptr ptr >> encode s pptr + return ptr + where + encode [] pptr = do + gu_utf8_encode 0 pptr + encode (c:cs) pptr = do + gu_utf8_encode ((toEnum . fromEnum) c) pptr + encode cs pptr + ------------------------------------------------------------------ -- libpgf API