Skip to content

Expose SIMD UTF-8 validation functions from internal module #483

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Apr 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions cbits/validate_utf8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,8 @@ extern "C"
int _hs_text_is_valid_utf8(const char* str, size_t len){
return simdutf::validate_utf8(str, len);
}

extern "C"
int _hs_text_is_valid_utf8_offset(const char* str, size_t off, size_t len){
return simdutf::validate_utf8(str + off, len);
}
8 changes: 3 additions & 5 deletions src/Data/Text/Internal/PrimCompat.hs
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,12 @@ module Data.Text.Internal.PrimCompat
) where

#if MIN_VERSION_base(4,16,0)

import GHC.Base

import GHC.Exts (wordToWord8#,word8ToWord#,wordToWord16#,word16ToWord#,wordToWord32#,word32ToWord#)
#else

import GHC.Prim (Word#)
#endif

#if !(MIN_VERSION_base(4,16,0))
wordToWord8#, word8ToWord# :: Word# -> Word#
wordToWord16#, word16ToWord# :: Word# -> Word#
wordToWord32#, word32ToWord# :: Word# -> Word#
Expand All @@ -33,5 +32,4 @@ wordToWord32# w = w
{-# INLINE word16ToWord# #-}
{-# INLINE wordToWord32# #-}
{-# INLINE word32ToWord# #-}

#endif
149 changes: 149 additions & 0 deletions src/Data/Text/Internal/Validate.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE UnliftedFFITypes #-}

-- | Test whether or not a sequence of bytes is a valid UTF-8 byte sequence.
-- In the GHC Haskell ecosystem, there are several representations of byte
-- sequences. The only one that the stable @text@ API concerns itself with is
-- 'ByteString'. Part of bytestring-to-text decoding is 'isValidUtf8ByteString',
-- a high-performance UTF-8 validation routine written in C++ with fallbacks
-- for various platforms. The C++ code backing this routine is nontrivial,
-- so in the interest of reuse, this module additionally exports functions
-- for working with the GC-managed @ByteArray@ type. These @ByteArray@
-- functions are not used anywhere else in @text@. They are for the benefit
-- of library and application authors who do not use 'ByteString' but still
-- need to interoperate with @text@.
module Data.Text.Internal.Validate
(
-- * ByteString
isValidUtf8ByteString
-- * ByteArray
--
-- | Is the slice of a byte array a valid UTF-8 byte sequence? These
-- functions all accept an offset and a length.
, isValidUtf8ByteArray
, isValidUtf8ByteArrayUnpinned
, isValidUtf8ByteArrayPinned
) where

import Data.Array.Byte (ByteArray(ByteArray))
import Data.ByteString (ByteString)
import GHC.Exts (isTrue#,isByteArrayPinned#)

#ifdef SIMDUTF
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Data.Text.Internal.ByteStringCompat (withBS)
import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
import Data.Text.Internal.Validate.Simd (c_is_valid_utf8_bytearray_safe,c_is_valid_utf8_bytearray_unsafe,c_is_valid_utf8_ptr_unsafe)
#else
import GHC.Exts (ByteArray#)
import Data.Text.Internal.Encoding.Utf8 (CodePoint(..),DecoderResult(..),utf8DecodeStart,utf8DecodeContinue)
import GHC.Exts (Int(I#),indexWord8Array#)
import GHC.Word (Word8(W8#))
import qualified Data.ByteString as B
#if !MIN_VERSION_bytestring(0,11,2)
import qualified Data.ByteString.Unsafe as B
#endif
#endif

-- | Is the ByteString a valid UTF-8 byte sequence?
isValidUtf8ByteString :: ByteString -> Bool
#ifdef SIMDUTF
isValidUtf8ByteString bs = withBS bs $ \fp len -> unsafeDupablePerformIO $
unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8_ptr_unsafe ptr (fromIntegral len)
#else
#if MIN_VERSION_bytestring(0,11,2)
isValidUtf8ByteString = B.isValidUtf8
#else
isValidUtf8ByteString bs = start 0
where
start ix
| ix >= B.length bs = True
| otherwise = case utf8DecodeStart (B.unsafeIndex bs ix) of
Accept{} -> start (ix + 1)
Reject{} -> False
Incomplete st _ -> step (ix + 1) st
step ix st
| ix >= B.length bs = False
-- We do not use decoded code point, so passing a dummy value to save an argument.
| otherwise = case utf8DecodeContinue (B.unsafeIndex bs ix) st (CodePoint 0) of
Accept{} -> start (ix + 1)
Reject{} -> False
Incomplete st' _ -> step (ix + 1) st'
#endif
#endif

-- | For pinned byte arrays larger than 128KiB, this switches to the safe FFI
-- so that it does not prevent GC. This threshold (128KiB) was chosen
-- somewhat arbitrarily and may change in the future.
isValidUtf8ByteArray ::
ByteArray -- ^ Bytes
-> Int -- ^ Offset
-> Int -- ^ Length
-> Bool
isValidUtf8ByteArray b@(ByteArray b#) !off !len
| len >= 131072 -- 128KiB
, isTrue# (isByteArrayPinned# b#)
= isValidUtf8ByteArrayPinned b off len
| otherwise = isValidUtf8ByteArrayUnpinned b off len

-- | This uses the @unsafe@ FFI. GC waits for all @unsafe@ FFI calls
-- to complete before starting. Consequently, an @unsafe@ FFI call does not
-- run concurrently with GC and is not interrupted by GC. Since relocation
-- cannot happen concurrently with an @unsafe@ FFI call, it is safe
-- to call this function with an unpinned byte array argument.
-- It is also safe to call this with a pinned @ByteArray@ argument.
isValidUtf8ByteArrayUnpinned ::
ByteArray -- ^ Bytes
-> Int -- ^ Offset
-> Int -- ^ Length
-> Bool
#ifdef SIMDUTF
isValidUtf8ByteArrayUnpinned (ByteArray bs) !off !len =
unsafeDupablePerformIO $ (/= 0) <$> c_is_valid_utf8_bytearray_unsafe bs (fromIntegral off) (fromIntegral len)
#else
isValidUtf8ByteArrayUnpinned (ByteArray bs) = isValidUtf8ByteArrayHaskell# bs
#endif

-- | This uses the @safe@ FFI. GC may run concurrently with @safe@
-- FFI calls. Consequently, unpinned objects may be relocated while a
-- @safe@ FFI call is executing. The byte array argument /must/ be pinned,
-- and the calling context is responsible for enforcing this. If the
-- byte array is not pinned, this function's behavior is undefined.
isValidUtf8ByteArrayPinned ::
ByteArray -- ^ Bytes
-> Int -- ^ Offset
-> Int -- ^ Length
-> Bool
#ifdef SIMDUTF
isValidUtf8ByteArrayPinned (ByteArray bs) !off !len =
unsafeDupablePerformIO $ (/= 0) <$> c_is_valid_utf8_bytearray_safe bs (fromIntegral off) (fromIntegral len)
#else
isValidUtf8ByteArrayPinned (ByteArray bs) = isValidUtf8ByteArrayHaskell# bs
#endif

#ifndef SIMDUTF
isValidUtf8ByteArrayHaskell# ::
ByteArray# -- ^ Bytes
-> Int -- ^ Offset
-> Int -- ^ Length
-> Bool
isValidUtf8ByteArrayHaskell# b !off !len = start off
where
indexWord8 :: ByteArray# -> Int -> Word8
indexWord8 !x (I# i) = W8# (indexWord8Array# x i)
start ix
| ix >= len = True
| otherwise = case utf8DecodeStart (indexWord8 b ix) of
Accept{} -> start (ix + 1)
Reject{} -> False
Incomplete st _ -> step (ix + 1) st
step ix st
| ix >= len = False
-- We do not use decoded code point, so passing a dummy value to save an argument.
| otherwise = case utf8DecodeContinue (indexWord8 b ix) st (CodePoint 0) of
Accept{} -> start (ix + 1)
Reject{} -> False
Incomplete st' _ -> step (ix + 1) st'
#endif
42 changes: 42 additions & 0 deletions src/Data/Text/Internal/Validate/Simd.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE UnliftedFFITypes #-}

-- | Validate that a byte sequence is UTF-8-encoded text. All of these
-- functions return zero when the byte sequence is not UTF-8-encoded text,
-- and they return an unspecified non-zero value when the byte sequence
-- is UTF-8-encoded text.
--
-- Variants are provided for both @ByteArray#@ and @Ptr@. Additionally,
-- variants are provided that use both the @safe@ and @unsafe@ FFI.
--
-- If compiling with SIMDUTF turned off, this module exports nothing.
module Data.Text.Internal.Validate.Simd
( c_is_valid_utf8_ptr_unsafe
, c_is_valid_utf8_ptr_safe
, c_is_valid_utf8_bytearray_unsafe
, c_is_valid_utf8_bytearray_safe
) where

import Data.Word (Word8)
import Foreign.C.Types (CSize(..),CInt(..))
import GHC.Exts (Ptr,ByteArray#)

foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8_ptr_unsafe
:: Ptr Word8 -- ^ Bytes
-> CSize -- ^ Length
-> IO CInt
foreign import ccall safe "_hs_text_is_valid_utf8" c_is_valid_utf8_ptr_safe
:: Ptr Word8 -- ^ Bytes
-> CSize -- ^ Length
-> IO CInt
foreign import ccall unsafe "_hs_text_is_valid_utf8_offset" c_is_valid_utf8_bytearray_unsafe
:: ByteArray# -- ^ Bytes
-> CSize -- ^ Offset into bytes
-> CSize -- ^ Length
-> IO CInt
foreign import ccall safe "_hs_text_is_valid_utf8_offset" c_is_valid_utf8_bytearray_safe
:: ByteArray# -- ^ Bytes
-> CSize -- ^ Offset into bytes
-> CSize -- ^ Length
-> IO CInt
4 changes: 3 additions & 1 deletion text.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ flag developer
manual: True

flag simdutf
description: use simdutf library
description: use simdutf library, causes Data.Text.Internal.Validate.Simd to be exposed
default: True
manual: True

Expand All @@ -85,6 +85,7 @@ library
hs-source-dirs: src

if flag(simdutf)
exposed-modules: Data.Text.Internal.Validate.Simd
include-dirs: simdutf
cxx-sources: simdutf/simdutf.cpp
cbits/validate_utf8.cpp
Expand Down Expand Up @@ -173,6 +174,7 @@ library
Data.Text.Internal.StrictBuilder
Data.Text.Internal.Unsafe
Data.Text.Internal.Unsafe.Char
Data.Text.Internal.Validate
Data.Text.Lazy
Data.Text.Lazy.Builder
Data.Text.Lazy.Builder.Int
Expand Down