many of the uses of peekCString and withCString in the Haskell binding were incorrect since they encode the string in the system locale while the C runtime is always using UTF8

This commit is contained in:
krasimir
2016-05-11 07:11:45 +00:00
parent 350316829e
commit ff11f2f963
2 changed files with 99 additions and 60 deletions

View File

@@ -99,7 +99,7 @@ languages p =
getLanguages :: IORef (Map.Map String Concr) -> MapItorCallback
getLanguages ref itor key value exn = do
langs <- readIORef ref
name <- peekCString (castPtr key)
name <- peekUtf8CString (castPtr key)
concr <- fmap (\ptr -> Concr ptr p) $ peek (castPtr value)
writeIORef ref $! Map.insert name concr langs
@@ -108,18 +108,18 @@ generateAll p cat =
unsafePerformIO $
do genPl <- gu_new_pool
exprPl <- gu_new_pool
enum <- withCString cat $ \cat -> do
exn <- gu_new_exn genPl
pgf_generate_all (pgf p) cat exn genPl exprPl
cat <- newUtf8CString cat genPl
exn <- gu_new_exn genPl
enum <- pgf_generate_all (pgf p) cat exn genPl exprPl
genFPl <- newForeignPtr gu_pool_finalizer genPl
exprFPl <- newForeignPtr gu_pool_finalizer exprPl
fromPgfExprEnum enum genFPl (p,exprFPl)
abstractName :: PGF -> AbsName
abstractName p = unsafePerformIO (peekCString =<< pgf_abstract_name (pgf p))
abstractName p = unsafePerformIO (peekUtf8CString =<< pgf_abstract_name (pgf p))
startCat :: PGF -> Cat
startCat p = unsafePerformIO (peekCString =<< pgf_start_cat (pgf p))
startCat p = unsafePerformIO (peekUtf8CString =<< pgf_start_cat (pgf p))
loadConcr :: Concr -> FilePath -> IO ()
loadConcr c fpath =
@@ -146,12 +146,13 @@ unloadConcr c = pgf_concrete_unload (concr c)
functionType :: PGF -> CId -> Type
functionType p fn =
unsafePerformIO $
withCString fn $ \c_fn -> do
withGuPool $ \tmpPl -> do
c_fn <- newUtf8CString fn tmpPl
c_type <- pgf_function_type (pgf p) c_fn
peekType c_type
where
peekType c_type = do
cid <- (#peek PgfType, cid) c_type >>= peekCString
cid <- (#peek PgfType, cid) c_type >>= peekUtf8CString
c_hypos <- (#peek PgfType, hypos) c_type
n_hypos <- (#peek GuSeq, len) c_hypos
hs <- peekHypos (c_hypos `plusPtr` (#offset GuSeq, data)) 0 n_hypos
@@ -161,7 +162,7 @@ functionType p fn =
peekHypos :: Ptr a -> Int -> Int -> IO [Hypo]
peekHypos c_hypo i n
| i < n = do cid <- (#peek PgfHypo, cid) c_hypo >>= peekCString
| i < n = do cid <- (#peek PgfHypo, cid) c_hypo >>= peekUtf8CString
ty <- (#peek PgfHypo, type) c_hypo >>= peekType
bt <- fmap toBindType ((#peek PgfHypo, bind_type) c_hypo)
hs <- peekHypos (plusPtr c_hypo (#size PgfHypo)) (i+1) n
@@ -190,7 +191,7 @@ graphvizAbstractTree p e =
exn <- gu_new_exn tmpPl
pgf_graphviz_abstract_tree (pgf p) (expr e) out exn
s <- gu_string_buf_freeze sb tmpPl
peekCString s
peekUtf8CString s
graphvizParseTree :: Concr -> Expr -> String
@@ -201,7 +202,7 @@ graphvizParseTree c e =
exn <- gu_new_exn tmpPl
pgf_graphviz_parse_tree (concr c) (expr e) out exn
s <- gu_string_buf_freeze sb tmpPl
peekCString s
peekUtf8CString s
-----------------------------------------------------------------------------
-- Functions using Concr
@@ -210,15 +211,17 @@ graphvizParseTree c e =
type MorphoAnalysis = (Fun,String,Float)
lookupMorpho :: Concr -> String -> [MorphoAnalysis]
lookupMorpho (Concr concr master) sent = unsafePerformIO $
do ref <- newIORef []
allocaBytes (#size PgfMorphoCallback) $ \cback ->
do fptr <- wrapLookupMorphoCallback (getAnalysis ref)
(#poke PgfMorphoCallback, callback) cback fptr
withCString sent $ \c_sent ->
pgf_lookup_morpho concr c_sent cback nullPtr
freeHaskellFunPtr fptr
readIORef ref
lookupMorpho (Concr concr master) sent =
unsafePerformIO $
withGuPool $ \tmpPl -> do
ref <- newIORef []
cback <- gu_malloc tmpPl (#size PgfMorphoCallback)
fptr <- wrapLookupMorphoCallback (getAnalysis ref)
(#poke PgfMorphoCallback, callback) cback fptr
c_sent <- newUtf8CString sent tmpPl
pgf_lookup_morpho concr c_sent cback nullPtr
freeHaskellFunPtr fptr
readIORef ref
fullFormLexicon :: Concr -> [(String, [MorphoAnalysis])]
fullFormLexicon lang =
@@ -237,7 +240,7 @@ fullFormLexicon lang =
if ffEntry == nullPtr
then do finalizeForeignPtr fpl
return []
else do tok <- peekCString =<< pgf_fullform_get_string ffEntry
else do tok <- peekUtf8CString =<< pgf_fullform_get_string ffEntry
ref <- newIORef []
allocaBytes (#size PgfMorphoCallback) $ \cback ->
do fptr <- wrapLookupMorphoCallback (getAnalysis ref)
@@ -250,8 +253,8 @@ fullFormLexicon lang =
getAnalysis :: IORef [MorphoAnalysis] -> LookupMorphoCallback
getAnalysis ref self c_lemma c_anal prob exn = do
ans <- readIORef ref
lemma <- peekCString c_lemma
anal <- peekCString c_anal
lemma <- peekUtf8CString c_lemma
anal <- peekUtf8CString c_anal
writeIORef ref ((lemma, anal, prob):ans)
parse :: Concr -> Cat -> String -> Either String [(Expr,Float)]
@@ -274,26 +277,26 @@ parseWithHeuristics :: Concr -- ^ the language with which we parse
-> Either String [(Expr,Float)]
parseWithHeuristics lang cat sent heuristic callbacks =
unsafePerformIO $
do parsePl <- gu_new_pool
exprPl <- gu_new_pool
do exprPl <- gu_new_pool
parsePl <- gu_new_pool
exn <- gu_new_exn parsePl
enum <- withCString cat $ \cat ->
withCString sent $ \sent -> do
callbacks_map <- mkCallbacksMap (concr lang) callbacks parsePl
pgf_parse_with_heuristics (concr lang) cat sent heuristic callbacks_map exn parsePl exprPl
cat <- newUtf8CString cat parsePl
sent <- newUtf8CString sent parsePl
callbacks_map <- mkCallbacksMap (concr lang) callbacks parsePl
enum <- pgf_parse_with_heuristics (concr lang) cat sent heuristic callbacks_map exn parsePl exprPl
failed <- gu_exn_is_raised exn
if failed
then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError
if is_parse_error
then do c_tok <- (#peek GuExn, data.data) exn
tok <- peekCString c_tok
tok <- peekUtf8CString c_tok
gu_pool_free parsePl
gu_pool_free exprPl
return (Left tok)
else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn
if is_exn
then do c_msg <- (#peek GuExn, data.data) exn
msg <- peekCString c_msg
msg <- peekUtf8CString c_msg
gu_pool_free parsePl
gu_pool_free exprPl
throwIO (PGFError msg)
@@ -308,15 +311,15 @@ parseWithHeuristics lang cat sent heuristic callbacks =
mkCallbacksMap :: Ptr PgfConcr -> [(String, Int -> String -> Int -> Maybe (Expr,Float,Int))] -> Ptr GuPool -> IO (Ptr PgfCallbacksMap)
mkCallbacksMap concr callbacks pool = do
callbacks_map <- pgf_new_callbacks_map concr pool
forM_ callbacks $ \(cat,match) ->
withCString cat $ \ccat -> do
match <- wrapLiteralMatchCallback (match_callback match)
predict <- wrapLiteralPredictCallback predict_callback
hspgf_callbacks_map_add_literal concr callbacks_map ccat match predict pool
forM_ callbacks $ \(cat,match) -> do
ccat <- newUtf8CString cat pool
match <- wrapLiteralMatchCallback (match_callback match)
predict <- wrapLiteralPredictCallback predict_callback
hspgf_callbacks_map_add_literal concr callbacks_map ccat match predict pool
return callbacks_map
where
match_callback match _ clin_idx csentence poffset out_pool = do
sentence <- peekCString csentence
sentence <- peekUtf8CString csentence
coffset <- peek poffset
case match (fromIntegral clin_idx) sentence (fromIntegral coffset) of
Nothing -> return nullPtr
@@ -361,26 +364,26 @@ parseWithOracle lang cat sent (predict,complete,literal) =
do parsePl <- gu_new_pool
exprPl <- gu_new_pool
exn <- gu_new_exn parsePl
enum <- withCString cat $ \cat ->
withCString sent $ \sent -> do
predictPtr <- maybe (return nullFunPtr) (wrapOracleCallback . oracleWrapper) predict
completePtr <- maybe (return nullFunPtr) (wrapOracleCallback . oracleWrapper) complete
literalPtr <- maybe (return nullFunPtr) (wrapOracleLiteralCallback . oracleLiteralWrapper) literal
cback <- hspgf_new_oracle_callback sent predictPtr completePtr literalPtr parsePl
pgf_parse_with_oracle (concr lang) cat sent cback exn parsePl exprPl
cat <- newUtf8CString cat parsePl
sent <- newUtf8CString sent parsePl
predictPtr <- maybe (return nullFunPtr) (wrapOracleCallback . oracleWrapper) predict
completePtr <- maybe (return nullFunPtr) (wrapOracleCallback . oracleWrapper) complete
literalPtr <- maybe (return nullFunPtr) (wrapOracleLiteralCallback . oracleLiteralWrapper) literal
cback <- hspgf_new_oracle_callback sent predictPtr completePtr literalPtr parsePl
enum <- pgf_parse_with_oracle (concr lang) cat sent cback exn parsePl exprPl
failed <- gu_exn_is_raised exn
if failed
then do is_parse_error <- gu_exn_caught exn gu_exn_type_PgfParseError
if is_parse_error
then do c_tok <- (#peek GuExn, data.data) exn
tok <- peekCString c_tok
tok <- peekUtf8CString c_tok
gu_pool_free parsePl
gu_pool_free exprPl
return (Left tok)
else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn
if is_exn
then do c_msg <- (#peek GuExn, data.data) exn
msg <- peekCString c_msg
msg <- peekUtf8CString c_msg
gu_pool_free parsePl
gu_pool_free exprPl
throwIO (PGFError msg)
@@ -393,13 +396,13 @@ parseWithOracle lang cat sent (predict,complete,literal) =
return (Right exprs)
where
oracleWrapper oracle catPtr lblPtr offset = do
cat <- peekCString catPtr
lbl <- peekCString lblPtr
cat <- peekUtf8CString catPtr
lbl <- peekUtf8CString lblPtr
return (oracle cat lbl (fromIntegral offset))
oracleLiteralWrapper oracle catPtr lblPtr poffset out_pool = do
cat <- peekCString catPtr
lbl <- peekCString lblPtr
cat <- peekUtf8CString catPtr
lbl <- peekUtf8CString lblPtr
offset <- peek poffset
case oracle cat lbl (fromIntegral offset) of
Just (e,prob,offset) ->
@@ -425,7 +428,8 @@ parseWithOracle lang cat sent (predict,complete,literal) =
hasLinearization :: Concr -> Fun -> Bool
hasLinearization lang id = unsafePerformIO $
withCString id (pgf_has_linearization (concr lang))
withGuPool $ \pl ->
newUtf8CString id pl >>= pgf_has_linearization (concr lang)
linearize :: Concr -> Expr -> String
linearize lang e = unsafePerformIO $
@@ -441,11 +445,11 @@ linearize lang e = unsafePerformIO $
else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn
if is_exn
then do c_msg <- (#peek GuExn, data.data) exn
msg <- peekCString c_msg
msg <- peekUtf8CString c_msg
throwIO (PGFError msg)
else throwIO (PGFError "The abstract tree cannot be linearized")
else do lin <- gu_string_buf_freeze sb pl
peekCString lin
peekUtf8CString lin
linearizeAll :: Concr -> Expr -> [String]
linearizeAll lang e = unsafePerformIO $
@@ -473,7 +477,7 @@ linearizeAll lang e = unsafePerformIO $
then collect cts exn pl
else throwExn exn pl
else do lin <- gu_string_buf_freeze sb tmpPl
s <- peekCString lin
s <- peekUtf8CString lin
ss <- unsafeInterleaveIO (collect cts exn pl)
return (s:ss)
@@ -481,7 +485,7 @@ linearizeAll lang e = unsafePerformIO $
is_exn <- gu_exn_caught exn gu_exn_type_PgfExn
if is_exn
then do c_msg <- (#peek GuExn, data.data) exn
msg <- peekCString c_msg
msg <- peekUtf8CString c_msg
gu_pool_free pl
throwIO (PGFError msg)
else do gu_pool_free pl
@@ -500,7 +504,7 @@ alignWords lang e = unsafePerformIO $
else do is_exn <- gu_exn_caught exn gu_exn_type_PgfExn
if is_exn
then do c_msg <- (#peek GuExn, data.data) exn
msg <- peekCString c_msg
msg <- peekUtf8CString c_msg
throwIO (PGFError msg)
else throwIO (PGFError "The abstract tree cannot be linearized")
else do len <- (#peek GuSeq, len) seq
@@ -510,7 +514,7 @@ alignWords lang e = unsafePerformIO $
peekAlignmentPhrase :: Ptr () -> IO (String, [Int])
peekAlignmentPhrase ptr = do
c_phrase <- (#peek PgfAlignmentPhrase, phrase) ptr
phrase <- peekCString c_phrase
phrase <- peekUtf8CString c_phrase
n_fids <- (#peek PgfAlignmentPhrase, n_fids) ptr
(fids :: [CInt]) <- peekArray (fromIntegral (n_fids :: CInt)) (ptr `plusPtr` (#offset PgfAlignmentPhrase, fids))
return (phrase, map fromIntegral fids)
@@ -532,7 +536,7 @@ functions p =
getFunctions :: IORef [String] -> MapItorCallback
getFunctions ref itor key value exn = do
names <- readIORef ref
name <- peekCString (castPtr key)
name <- peekUtf8CString (castPtr key)
writeIORef ref $! (name : names)
categories :: PGF -> [Cat]

View File

@@ -2,12 +2,13 @@
module PGF2.FFI where
import Foreign ( alloca, poke )
import Foreign.C
--import Foreign.C.String
import Foreign.Ptr
import Foreign.ForeignPtr
import Control.Exception
import GHC.Ptr
import Data.Int(Int32)
data PGF = PGF {pgf :: Ptr PgfPGF, pgfMaster :: ForeignPtr GuPool}
data Concr = Concr {concr :: Ptr PgfConcr, concrMaster :: PGF}
@@ -72,10 +73,16 @@ foreign import ccall "gu/file.h gu_file_in"
foreign import ccall "gu/enum.h gu_enum_next"
gu_enum_next :: Ptr a -> Ptr (Ptr b) -> Ptr GuPool -> IO ()
foreign import ccall "gu/string.h gu_string_buf_freeze"
gu_string_buf_freeze :: Ptr GuStringBuf -> Ptr GuPool -> IO CString
foreign import ccall unsafe "gu/utf8.h gu_utf8_decode"
gu_utf8_decode :: Ptr CString -> IO Int32
foreign import ccall unsafe "gu/utf8.h gu_utf8_encode"
gu_utf8_encode :: Int32 -> Ptr CString -> IO ()
withGuPool :: (Ptr GuPool -> IO a) -> IO a
withGuPool f = bracket gu_new_pool gu_pool_free f
@@ -85,6 +92,34 @@ newOut pool =
out <- gu_string_buf_out sb
return (sb,out)
peekUtf8CString :: CString -> IO String
peekUtf8CString ptr =
alloca $ \pptr ->
poke pptr ptr >> decode pptr
where
decode pptr = do
x <- gu_utf8_decode pptr
if x == 0
then return []
else do cs <- decode pptr
return (((toEnum . fromEnum) x) : cs)
newUtf8CString :: String -> Ptr GuPool -> IO CString
newUtf8CString s pool = do
-- An UTF8 character takes up to 6 bytes. We allocate enough
-- memory for the worst case. This is wasteful but those
-- strings are usually allocated only temporary.
ptr <- gu_malloc pool (fromIntegral (length s * 6+1))
alloca $ \pptr ->
poke pptr ptr >> encode s pptr
return ptr
where
encode [] pptr = do
gu_utf8_encode 0 pptr
encode (c:cs) pptr = do
gu_utf8_encode ((toEnum . fromEnum) c) pptr
encode cs pptr
------------------------------------------------------------------
-- libpgf API