Skip to content

add C-based isAscii :: Text -> Bool #497

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Feb 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions cbits/is_ascii.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,10 @@ const size_t _hs_text_is_ascii(const uint8_t *src0, const uint8_t *srcend){

return src - src0;
}

/*
_hs_text_is_ascii_offset is a helper for calling _hs_text_is_ascii on Texts.
*/
const size_t _hs_text_is_ascii_offset(const uint8_t *arr, size_t off, size_t len){
return _hs_text_is_ascii(arr + off, arr + off + len);
}
46 changes: 38 additions & 8 deletions src/Data/Text.hs
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ module Data.Text
, all
, maximum
, minimum
, isAscii

-- * Construction

Expand Down Expand Up @@ -218,7 +219,7 @@ import Control.DeepSeq (NFData(rnf))
import Control.Exception (assert)
#endif
import Data.Bits ((.&.), shiftR, shiftL)
import Data.Char (isSpace, isAscii, ord)
import qualified Data.Char as Char
import Data.Data (Data(gfoldl, toConstr, gunfold, dataTypeOf), constrIndex,
Constr, mkConstr, DataType, mkDataType, Fixity(Prefix))
import Control.Monad (foldM)
Expand Down Expand Up @@ -1176,6 +1177,35 @@ minimum :: HasCallStack => Text -> Char
minimum t = S.minimum (stream t)
{-# INLINE minimum #-}

-- | \O(n)\ Test whether 'Text' contains only ASCII code-points (i.e. only
-- U+0000 through U+007F).
--
-- This is a more efficient version of @'all' 'Data.Char.isAscii'@.
--
-- >>> isAscii ""
-- True
--
-- >>> isAscii "abc\NUL"
-- True
--
-- >>> isAscii "abcd€"
-- False
--
-- prop> isAscii t == all (< '\x80') t
--
-- @since 2.0.2
isAscii :: Text -> Bool
isAscii (Text (A.ByteArray arr) off len) =
cSizeToInt (c_is_ascii_offset arr (intToCSize off) (intToCSize len)) == len
{-# INLINE isAscii #-}

cSizeToInt :: CSize -> Int
cSizeToInt = P.fromIntegral
{-# INLINE cSizeToInt #-}

foreign import ccall unsafe "_hs_text_is_ascii_offset" c_is_ascii_offset
:: ByteArray# -> CSize -> CSize -> CSize

-- -----------------------------------------------------------------------------
-- * Building 'Text's
-- | /O(n)/ 'scanl' is similar to 'foldl', but returns a list of
Expand Down Expand Up @@ -1315,8 +1345,8 @@ replicate n t@(Text a o l)
replicateChar :: Int -> Char -> Text
replicateChar !len !c'
| len <= 0 = empty
| isAscii c = runST $ do
marr <- A.newFilled len (ord c)
| Char.isAscii c = runST $ do
marr <- A.newFilled len (Char.ord c)
arr <- A.unsafeFreeze marr
return $ Text arr 0 len
| otherwise = runST $ do
Expand Down Expand Up @@ -1499,22 +1529,22 @@ dropAround p = dropWhile p . dropWhileEnd p
--
-- > dropWhile isSpace
stripStart :: Text -> Text
stripStart = dropWhile isSpace
stripStart = dropWhile Char.isSpace
{-# INLINE stripStart #-}

-- | /O(n)/ Remove trailing white space from a string. Equivalent to:
--
-- > dropWhileEnd isSpace
stripEnd :: Text -> Text
stripEnd = dropWhileEnd isSpace
stripEnd = dropWhileEnd Char.isSpace
{-# INLINE [1] stripEnd #-}

-- | /O(n)/ Remove leading and trailing white space from a string.
-- Equivalent to:
--
-- > dropAround isSpace
strip :: Text -> Text
strip = dropAround isSpace
strip = dropAround Char.isSpace
{-# INLINE [1] strip #-}

-- | /O(n)/ 'splitAt' @n t@ returns a pair whose first element is a
Expand Down Expand Up @@ -1994,7 +2024,7 @@ words (Text arr off len) = loop 0 0
| w0 < 0xE0 = loop start (n + 2)
-- or 3 bytes for 0x1680 + 0x2000..0x200A + 0x2028..0x2029 + 0x202F + 0x205F + 0x3000
| w0 == 0xE1 && w1 == 0x9A && w2 == 0x80
|| w0 == 0xE2 && (w1 == 0x80 && isSpace (chr3 w0 w1 w2) || w1 == 0x81 && w2 == 0x9F)
|| w0 == 0xE2 && (w1 == 0x80 && Char.isSpace (chr3 w0 w1 w2) || w1 == 0x81 && w2 == 0x9F)
|| w0 == 0xE3 && w1 == 0x80 && w2 == 0x80 =
if start == n
then loop (n + 3) (n + 3)
Expand Down Expand Up @@ -2205,7 +2235,7 @@ copy (Text arr off len) = Text (A.run go) 0 len
return marr

ord8 :: Char -> Word8
ord8 = P.fromIntegral . ord
ord8 = P.fromIntegral . Char.ord

intToCSize :: Int -> CSize
intToCSize = P.fromIntegral
Expand Down
21 changes: 21 additions & 0 deletions src/Data/Text/Lazy.hs
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ module Data.Text.Lazy
, all
, maximum
, minimum
, isAscii

-- * Construction

Expand Down Expand Up @@ -870,6 +871,26 @@ minimum :: HasCallStack => Text -> Char
minimum t = S.minimum (stream t)
{-# INLINE minimum #-}

-- | \O(n)\ Test whether 'Text' contains only ASCII code-points (i.e. only
-- U+0000 through U+007F).
--
-- This is a more efficient version of @'all' 'Data.Char.isAscii'@.
--
-- >>> isAscii ""
-- True
--
-- >>> isAscii "abc\NUL"
-- True
--
-- >>> isAscii "abcd€"
-- False
--
-- prop> isAscii t == all (< '\x80') t
--
-- @since 2.0.2
isAscii :: Text -> Bool
isAscii = foldrChunks (\chnk acc -> T.isAscii chnk && acc) True

-- | /O(n)/ 'scanl' is similar to 'foldl', but returns a list of
-- successive reduced values from the left.
-- Performs replacement on invalid scalar values.
Expand Down
15 changes: 13 additions & 2 deletions tests/Tests/Properties/Folds.hs
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,15 @@ import Control.Arrow (second)
import Control.Exception (ErrorCall, evaluate, try)
import Data.Word (Word8, Word16)
import Test.Tasty (TestTree, testGroup)
import Test.Tasty.HUnit (testCase, assertFailure)
import Test.Tasty.HUnit (testCase, assertFailure, assertBool)
import Test.Tasty.QuickCheck (testProperty, Small(..), (===), applyFun, applyFun2)
import Tests.QuickCheckUtils
import qualified Data.List as L
import qualified Data.Text as T
import qualified Data.Text.Internal.Fusion as S
import qualified Data.Text.Internal.Fusion.Common as S
import qualified Data.Text.Lazy as TL
import qualified Data.Char as Char

-- Folds

Expand Down Expand Up @@ -109,6 +110,8 @@ sf_minimum (applyFun -> p)
= (L.minimum . L.filter p) `eqP` (S.minimum . S.filter p)
t_minimum = L.minimum `eqP` T.minimum
tl_minimum = L.minimum `eqP` TL.minimum
t_isAscii = L.all Char.isAscii `eqP` T.isAscii
tl_isAscii = L.all Char.isAscii `eqP` TL.isAscii

-- Scans

Expand Down Expand Up @@ -190,6 +193,11 @@ tl_unfoldrN n m = (L.take i . L.unfoldr (unf j)) `eq`
where i = fromIntegral (n :: Word16)
j = fromIntegral (m :: Word16)

isAscii_border :: IO ()
isAscii_border = do
let text = T.drop 2 $ T.pack "XX1234五"
assertBool "UTF-8 string with ASCII prefix ending at last position incorrectly detected as ASCII" $ not $ T.isAscii text

testFolds :: TestTree
testFolds =
testGroup "folds-unfolds" [
Expand Down Expand Up @@ -234,7 +242,10 @@ testFolds =
testProperty "tl_maximum" tl_maximum,
testProperty "sf_minimum" sf_minimum,
testProperty "t_minimum" t_minimum,
testProperty "tl_minimum" tl_minimum
testProperty "tl_minimum" tl_minimum,
testProperty "t_isAscii " t_isAscii,
testProperty "tl_isAscii " tl_isAscii,
testCase "isAscii_border" isAscii_border
]
],

Expand Down