{-# LANGUAGE BangPatterns             #-}
{-# LANGUAGE CPP                      #-}
{-# LANGUAGE ForeignFunctionInterface #-}
{-# LANGUAGE MagicHash                #-}

module Data.HashTable.Internal.CacheLine
  ( cacheLineSearch
  , cacheLineSearch2
  , cacheLineSearch3
  , forwardSearch2
  , forwardSearch3
  , isCacheLineAligned
  , advanceByCacheLineSize
  , prefetchRead
  , prefetchWrite
  , bl_abs#
  , sign#
  , mask#
  , mask
  , maskw#
  ) where

import           Control.Monad
import           Control.Monad.ST                 (ST)

import           Data.HashTable.Internal.IntArray (Elem, IntArray)
import qualified Data.HashTable.Internal.IntArray as M

#ifndef NO_C_SEARCH
import           Foreign.C.Types
#else
import           Data.Bits
import           Data.Int
import qualified Data.Vector.Unboxed              as U
import           GHC.Int
#endif

import           Data.HashTable.Internal.Utils
import           GHC.Exts

{-# INLINE prefetchRead  #-}
{-# INLINE prefetchWrite #-}
prefetchRead :: IntArray s -> Int -> ST s ()
prefetchWrite :: IntArray s -> Int -> ST s ()

#ifndef NO_C_SEARCH
foreign import ccall unsafe "line_search"
  c_lineSearch :: Ptr a -> CInt -> CUShort -> IO CInt

foreign import ccall unsafe "line_search_2"
  c_lineSearch_2 :: Ptr a -> CInt -> CUShort -> CUShort -> IO CInt

foreign import ccall unsafe "line_search_3"
  c_lineSearch_3 :: Ptr a -> CInt -> CUShort -> CUShort -> CUShort -> IO CInt

foreign import ccall unsafe "forward_search_2"
  c_forwardSearch_2 :: Ptr a -> CInt -> CInt -> CUShort -> CUShort -> IO CInt

foreign import ccall unsafe "forward_search_3"
  c_forwardSearch_3 :: Ptr a -> CInt -> CInt -> CUShort -> CUShort -> CUShort
                    -> IO CInt

foreign import ccall unsafe "prefetch_cacheline_read"
  prefetchCacheLine_read :: Ptr a -> CInt -> IO ()

foreign import ccall unsafe "prefetch_cacheline_write"
  prefetchCacheLine_write :: Ptr a -> CInt -> IO ()


fI :: (Num b, Integral a) => a -> b
fI :: forall b a. (Num b, Integral a) => a -> b
fI = a -> b
forall a b. (Integral a, Num b) => a -> b
fromIntegral

prefetchRead :: forall s. IntArray s -> Int -> ST s ()
prefetchRead IntArray s
a Int
i = IO () -> ST s ()
forall a s. IO a -> ST s a
unsafeIOToST (IO () -> ST s ()) -> IO () -> ST s ()
forall a b. (a -> b) -> a -> b
$ Ptr Any -> CInt -> IO ()
forall a. Ptr a -> CInt -> IO ()
prefetchCacheLine_read Ptr Any
forall {a}. Ptr a
v CInt
x
  where
    v :: Ptr a
v   = IntArray s -> Ptr a
forall s a. IntArray s -> Ptr a
M.toPtr IntArray s
a
    x :: CInt
x   = Int -> CInt
forall b a. (Num b, Integral a) => a -> b
fI Int
i

prefetchWrite :: forall s. IntArray s -> Int -> ST s ()
prefetchWrite IntArray s
a Int
i = IO () -> ST s ()
forall a s. IO a -> ST s a
unsafeIOToST (IO () -> ST s ()) -> IO () -> ST s ()
forall a b. (a -> b) -> a -> b
$ Ptr Any -> CInt -> IO ()
forall a. Ptr a -> CInt -> IO ()
prefetchCacheLine_write Ptr Any
forall {a}. Ptr a
v CInt
x
  where
    v :: Ptr a
v   = IntArray s -> Ptr a
forall s a. IntArray s -> Ptr a
M.toPtr IntArray s
a
    x :: CInt
x   = Int -> CInt
forall b a. (Num b, Integral a) => a -> b
fI Int
i


{-# INLINE forwardSearch2 #-}
forwardSearch2 :: IntArray s -> Int -> Int -> Elem -> Elem -> ST s Int
forwardSearch2 :: forall s. IntArray s -> Int -> Int -> Elem -> Elem -> ST s Int
forwardSearch2 !IntArray s
vec !Int
start !Int
end !Elem
x1 !Elem
x2 =
    (CInt -> Int) -> ST s CInt -> ST s Int
forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
liftM CInt -> Int
forall a. Enum a => a -> Int
fromEnum (ST s CInt -> ST s Int) -> ST s CInt -> ST s Int
forall a b. (a -> b) -> a -> b
$! IO CInt -> ST s CInt
forall a s. IO a -> ST s a
unsafeIOToST IO CInt
c
  where
    c :: IO CInt
c = Ptr Any -> CInt -> CInt -> CUShort -> CUShort -> IO CInt
forall a. Ptr a -> CInt -> CInt -> CUShort -> CUShort -> IO CInt
c_forwardSearch_2 (IntArray s -> Ptr Any
forall s a. IntArray s -> Ptr a
M.toPtr IntArray s
vec) (Int -> CInt
forall b a. (Num b, Integral a) => a -> b
fI Int
start) (Int -> CInt
forall b a. (Num b, Integral a) => a -> b
fI Int
end) (Elem -> CUShort
forall b a. (Num b, Integral a) => a -> b
fI Elem
x1) (Elem -> CUShort
forall b a. (Num b, Integral a) => a -> b
fI Elem
x2)


{-# INLINE forwardSearch3 #-}
forwardSearch3 :: IntArray s -> Int -> Int -> Elem -> Elem -> Elem -> ST s Int
forwardSearch3 :: forall s.
IntArray s -> Int -> Int -> Elem -> Elem -> Elem -> ST s Int
forwardSearch3 !IntArray s
vec !Int
start !Int
end !Elem
x1 !Elem
x2 !Elem
x3 =
    (CInt -> Int) -> ST s CInt -> ST s Int
forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
liftM CInt -> Int
forall a. Enum a => a -> Int
fromEnum (ST s CInt -> ST s Int) -> ST s CInt -> ST s Int
forall a b. (a -> b) -> a -> b
$! IO CInt -> ST s CInt
forall a s. IO a -> ST s a
unsafeIOToST IO CInt
c
  where
    c :: IO CInt
c = Ptr Any -> CInt -> CInt -> CUShort -> CUShort -> CUShort -> IO CInt
forall a.
Ptr a -> CInt -> CInt -> CUShort -> CUShort -> CUShort -> IO CInt
c_forwardSearch_3 (IntArray s -> Ptr Any
forall s a. IntArray s -> Ptr a
M.toPtr IntArray s
vec) (Int -> CInt
forall b a. (Num b, Integral a) => a -> b
fI Int
start) (Int -> CInt
forall b a. (Num b, Integral a) => a -> b
fI Int
end)
                          (Elem -> CUShort
forall b a. (Num b, Integral a) => a -> b
fI Elem
x1) (Elem -> CUShort
forall b a. (Num b, Integral a) => a -> b
fI Elem
x2) (Elem -> CUShort
forall b a. (Num b, Integral a) => a -> b
fI Elem
x3)


{-# INLINE lineSearch #-}
lineSearch :: IntArray s -> Int -> Elem -> ST s Int
lineSearch :: forall s. IntArray s -> Int -> Elem -> ST s Int
lineSearch !IntArray s
vec !Int
start !Elem
value =
    (CInt -> Int) -> ST s CInt -> ST s Int
forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
liftM CInt -> Int
forall a. Enum a => a -> Int
fromEnum (ST s CInt -> ST s Int) -> ST s CInt -> ST s Int
forall a b. (a -> b) -> a -> b
$! IO CInt -> ST s CInt
forall a s. IO a -> ST s a
unsafeIOToST IO CInt
c
  where
    c :: IO CInt
c = Ptr Any -> CInt -> CUShort -> IO CInt
forall a. Ptr a -> CInt -> CUShort -> IO CInt
c_lineSearch (IntArray s -> Ptr Any
forall s a. IntArray s -> Ptr a
M.toPtr IntArray s
vec) (Int -> CInt
forall b a. (Num b, Integral a) => a -> b
fI Int
start) (Elem -> CUShort
forall b a. (Num b, Integral a) => a -> b
fI Elem
value)


{-# INLINE lineSearch2 #-}
lineSearch2 :: IntArray s -> Int -> Elem -> Elem -> ST s Int
lineSearch2 :: forall s. IntArray s -> Int -> Elem -> Elem -> ST s Int
lineSearch2 !IntArray s
vec !Int
start !Elem
x1 !Elem
x2 =
    (CInt -> Int) -> ST s CInt -> ST s Int
forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
liftM CInt -> Int
forall a. Enum a => a -> Int
fromEnum (ST s CInt -> ST s Int) -> ST s CInt -> ST s Int
forall a b. (a -> b) -> a -> b
$! IO CInt -> ST s CInt
forall a s. IO a -> ST s a
unsafeIOToST IO CInt
c
  where
    c :: IO CInt
c = Ptr Any -> CInt -> CUShort -> CUShort -> IO CInt
forall a. Ptr a -> CInt -> CUShort -> CUShort -> IO CInt
c_lineSearch_2 (IntArray s -> Ptr Any
forall s a. IntArray s -> Ptr a
M.toPtr IntArray s
vec) (Int -> CInt
forall b a. (Num b, Integral a) => a -> b
fI Int
start) (Elem -> CUShort
forall b a. (Num b, Integral a) => a -> b
fI Elem
x1) (Elem -> CUShort
forall b a. (Num b, Integral a) => a -> b
fI Elem
x2)


{-# INLINE lineSearch3 #-}
lineSearch3 :: IntArray s -> Int -> Elem -> Elem -> Elem -> ST s Int
lineSearch3 :: forall s. IntArray s -> Int -> Elem -> Elem -> Elem -> ST s Int
lineSearch3 !IntArray s
vec !Int
start !Elem
x1 !Elem
x2 !Elem
x3 =
    (CInt -> Int) -> ST s CInt -> ST s Int
forall (m :: * -> *) a1 r. Monad m => (a1 -> r) -> m a1 -> m r
liftM CInt -> Int
forall a. Enum a => a -> Int
fromEnum (ST s CInt -> ST s Int) -> ST s CInt -> ST s Int
forall a b. (a -> b) -> a -> b
$! IO CInt -> ST s CInt
forall a s. IO a -> ST s a
unsafeIOToST IO CInt
c
  where
    c :: IO CInt
c = Ptr Any -> CInt -> CUShort -> CUShort -> CUShort -> IO CInt
forall a. Ptr a -> CInt -> CUShort -> CUShort -> CUShort -> IO CInt
c_lineSearch_3 (IntArray s -> Ptr Any
forall s a. IntArray s -> Ptr a
M.toPtr IntArray s
vec) (Int -> CInt
forall b a. (Num b, Integral a) => a -> b
fI Int
start) (Elem -> CUShort
forall b a. (Num b, Integral a) => a -> b
fI Elem
x1) (Elem -> CUShort
forall b a. (Num b, Integral a) => a -> b
fI Elem
x2) (Elem -> CUShort
forall b a. (Num b, Integral a) => a -> b
fI Elem
x3)
#endif

{-# INLINE advanceByCacheLineSize #-}
advanceByCacheLineSize :: Int -> Int -> Int
advanceByCacheLineSize :: Int -> Int -> Int
advanceByCacheLineSize !(I# Int#
start0#) !(I# Int#
vecSize#) = Int
out
  where
    !(I# Int#
clm#) = Int
cacheLineIntMask
    !clmm# :: Word#
clmm#     = Word# -> Word#
not# (Int# -> Word#
int2Word# Int#
clm#)
    !start# :: Int#
start#    = Word# -> Int#
word2Int# (Word#
clmm# Word# -> Word# -> Word#
`and#` Int# -> Word#
int2Word# Int#
start0#)
    !(I# Int#
nw#)  = Int
numElemsInCacheLine
    !start'# :: Int#
start'#   = Int#
start# Int# -> Int# -> Int#
+# Int#
nw#
    !s# :: Int#
s#        = Int# -> Int#
sign# (Int#
vecSize# Int# -> Int# -> Int#
-# Int#
start'# Int# -> Int# -> Int#
-# Int#
1#)
    !m# :: Word#
m#        = Word# -> Word#
not# (Int# -> Word#
int2Word# Int#
s#)
    !r# :: Word#
r#        = Int# -> Word#
int2Word# Int#
start'# Word# -> Word# -> Word#
`and#` Word#
m#
    !out :: Int
out       = Int# -> Int
I# (Word# -> Int#
word2Int# Word#
r#)


{-# INLINE isCacheLineAligned #-}
isCacheLineAligned :: Int -> Bool
isCacheLineAligned :: Int -> Bool
isCacheLineAligned (I# Int#
x#) = Int# -> Bool
isTrue# (Int#
r# Int# -> Int# -> Int#
==# Int#
0#)
  where
    !(I# Int#
m#) = Int
cacheLineIntMask
    !mw# :: Word#
mw#     = Int# -> Word#
int2Word# Int#
m#
    !w# :: Word#
w#      = Int# -> Word#
int2Word# Int#
x#
    !r# :: Int#
r#      = Word# -> Int#
word2Int# (Word#
mw# Word# -> Word# -> Word#
`and#` Word#
w#)


{-# INLINE sign# #-}
-- | Returns 0 if x is positive, -1 otherwise
sign# :: Int# -> Int#
sign# :: Int# -> Int#
sign# !Int#
x# = Int#
x# Int# -> Int# -> Int#
`uncheckedIShiftRA#` Int#
wordSizeMinus1#
  where
    !(I# Int#
wordSizeMinus1#) = Int
wordSizeInt -> Int -> Int
forall a. Num a => a -> a -> a
-Int
1


{-# INLINE bl_abs# #-}
-- | Abs of an integer, branchless
bl_abs# :: Int# -> Int#
bl_abs# :: Int# -> Int#
bl_abs# !Int#
x# = Word# -> Int#
word2Int# Word#
r#
  where
    !m# :: Int#
m# = Int# -> Int#
sign# Int#
x#
    !r# :: Word#
r# = (Int# -> Word#
int2Word# (Int#
m# Int# -> Int# -> Int#
+# Int#
x#)) Word# -> Word# -> Word#
`xor#` Int# -> Word#
int2Word# Int#
m#


{-# INLINE mask# #-}
-- | Returns 0xfff..fff (aka -1) if a# == b#, 0 otherwise.
mask# :: Int# -> Int# -> Int#
mask# :: Int# -> Int# -> Int#
mask# !Int#
a# !Int#
b# = Int#
dest#
  where
    !d# :: Int#
d#    = Int#
a# Int# -> Int# -> Int#
-# Int#
b#
    !r# :: Int#
r#    = Int# -> Int#
bl_abs# Int#
d# Int# -> Int# -> Int#
-# Int#
1#
    !dest# :: Int#
dest# = Int# -> Int#
sign# Int#
r#


{- note: this code should be:

mask# :: Int# -> Int# -> Int#
mask# !a# !b# = let !(I# z#) = fromEnum (a# ==# b#)
                    !q#      = negateInt# z#
                in q#

but GHC doesn't properly optimize this as straight-line code at the moment.

-}


{-# INLINE mask #-}
-- | Returns 0xfff..fff (aka -1) if a == b, 0 otherwise.
mask :: Int -> Int -> Int
mask :: Int -> Int -> Int
mask (I# Int#
a#) (I# Int#
b#) = Int# -> Int
I# (Int# -> Int# -> Int#
mask# Int#
a# Int#
b#)


{-# INLINE maskw# #-}
maskw# :: Int# -> Int# -> Word#
maskw# :: Int# -> Int# -> Word#
maskw# !Int#
a# !Int#
b# = Int# -> Word#
int2Word# (Int# -> Int# -> Int#
mask# Int#
a# Int#
b#)


#ifdef NO_C_SEARCH
prefetchRead _ _ = return ()
prefetchWrite _ _ = return ()

{-# INLINE forwardSearch2 #-}
forwardSearch2 :: IntArray s -> Int -> Int -> Elem -> Elem -> ST s Int
forwardSearch2 !vec !start !end !x1 !x2 = go start end False
  where
    go !i !e !b =
      if i >= e then
        if b then return (-1) else go 0 start True
      else do
        h <- M.readArray vec i
        if h == x1 || h == x2
          then return i
          else go (i+1) e b


{-# INLINE forwardSearch3 #-}
forwardSearch3 :: IntArray s -> Int -> Int -> Elem -> Elem -> Elem -> ST s Int
forwardSearch3 !vec !start !end !x1 !x2 !x3 = go start end False
  where
    go !i !e !b =
      if i >= e then
        if b then return (-1) else go 0 start True
      else do
        h <- M.readArray vec i
        if h == x1 || h == x2 || h == x3
          then return i
          else go (i+1) e b


deBruijnBitPositions :: U.Vector Int8
deBruijnBitPositions =
    U.fromList [
          0,   1, 28,  2, 29, 14, 24,  3, 30, 22, 20, 15, 25, 17,  4,  8,
          31, 27, 13, 23, 21, 19, 16,  7, 26, 12, 18,  6, 11,  5, 10,  9
         ]


{-# INLINE firstBitSet# #-}
-- only works with 32-bit values -- ok for us here
firstBitSet# :: Int# -> Int#
firstBitSet# i# = word2Int# ((or# zeroCase# posw#))
  where
    !zeroCase#   = int2Word# (mask# 0# i#)
    !w#          = int2Word# i#
    !iLowest#    = word2Int# (and# w# (int2Word# (negateInt# i#)))
    !idxW#       = uncheckedShiftRL#
                       (narrow32Word# (timesWord# (int2Word# iLowest#)
                                                  (int2Word# 0x077CB531#)))
                       27#
    !idx         = I# (word2Int# idxW#)
    !(I8# pos8#) = U.unsafeIndex deBruijnBitPositions idx
    !posw#       = int2Word# pos8#

#endif


#ifdef NO_C_SEARCH
lineResult# :: Word#    -- ^ mask
            -> Int      -- ^ start value
            -> Int
lineResult# bitmask# (I# start#) = I# (word2Int# rv#)
  where
    !p#   = firstBitSet# (word2Int# bitmask#)
    !mm#  = maskw# p# (-1#)
    !nmm# = not# mm#
    !rv#  = mm# `or#` (nmm# `and#` (int2Word# (start# +# p#)))
{-# INLINE lineResult# #-}


-- Required: unlike in C search, required that the start index is
-- cache-line-aligned and array has at least 32 elements after the start index
lineSearch :: IntArray s        -- ^ vector to search
           -> Int               -- ^ start index
           -> Elem              -- ^ value to search for
           -> ST s Int          -- ^ dest index where it can be found, or
                                -- \"-1\" if not found
lineSearch !vec !start !elem1 = do
    let !(I# v1#) = fromIntegral elem1

    !(I# x1#) <- liftM fromIntegral $ M.readArray vec start
    let !p1# = (and# (maskw# x1# v1#) (int2Word# 0x1#))

    !(I# x2#) <- liftM fromIntegral $ M.readArray vec $! start + 1
    let !p2# = or# p1# (and# (maskw# x2# v1#) (int2Word# 0x2#))

    !(I# x3#) <- liftM fromIntegral $ M.readArray vec $! start + 2
    let !p3# = or# p2# (and# (maskw# x3# v1#) (int2Word# 0x4#))

    !(I# x4#) <- liftM fromIntegral $ M.readArray vec $! start + 3
    let !p4# = or# p3# (and# (maskw# x4# v1#) (int2Word# 0x8#))

    !(I# x5#) <- liftM fromIntegral $ M.readArray vec $! start + 4
    let !p5# = or# p4# (and# (maskw# x5# v1#) (int2Word# 0x10#))

    !(I# x6#) <- liftM fromIntegral $ M.readArray vec $! start + 5
    let !p6# = or# p5# (and# (maskw# x6# v1#) (int2Word# 0x20#))

    !(I# x7#) <- liftM fromIntegral $ M.readArray vec $! start + 6
    let !p7# = or# p6# (and# (maskw# x7# v1#) (int2Word# 0x40#))

    !(I# x8#) <- liftM fromIntegral $ M.readArray vec $! start + 7
    let !p8# = or# p7# (and# (maskw# x8# v1#) (int2Word# 0x80#))

    !(I# x9#) <- liftM fromIntegral $ M.readArray vec $! start + 8
    let !p9# = or# p8# (and# (maskw# x9# v1#) (int2Word# 0x100#))

    !(I# x10#) <- liftM fromIntegral $ M.readArray vec $! start + 9
    let !p10# = or# p9# (and# (maskw# x10# v1#) (int2Word# 0x200#))

    !(I# x11#) <- liftM fromIntegral $ M.readArray vec $! start + 10
    let !p11# = or# p10# (and# (maskw# x11# v1#) (int2Word# 0x400#))

    !(I# x12#) <- liftM fromIntegral $ M.readArray vec $! start + 11
    let !p12# = or# p11# (and# (maskw# x12# v1#) (int2Word# 0x800#))

    !(I# x13#) <- liftM fromIntegral $ M.readArray vec $! start + 12
    let !p13# = or# p12# (and# (maskw# x13# v1#) (int2Word# 0x1000#))

    !(I# x14#) <- liftM fromIntegral $ M.readArray vec $! start + 13
    let !p14# = or# p13# (and# (maskw# x14# v1#) (int2Word# 0x2000#))

    !(I# x15#) <- liftM fromIntegral $ M.readArray vec $! start + 14
    let !p15# = or# p14# (and# (maskw# x15# v1#) (int2Word# 0x4000#))

    !(I# x16#) <- liftM fromIntegral $ M.readArray vec $! start + 15
    let !p16# = or# p15# (and# (maskw# x16# v1#) (int2Word# 0x8000#))

    !(I# x17#) <- liftM fromIntegral $ M.readArray vec $! start + 16
    let !p17# = or# p16# (and# (maskw# x17# v1#) (int2Word# 0x10000#))

    !(I# x18#) <- liftM fromIntegral $ M.readArray vec $! start + 17
    let !p18# = or# p17# (and# (maskw# x18# v1#) (int2Word# 0x20000#))

    !(I# x19#) <- liftM fromIntegral $ M.readArray vec $! start + 18
    let !p19# = or# p18# (and# (maskw# x19# v1#) (int2Word# 0x40000#))

    !(I# x20#) <- liftM fromIntegral $ M.readArray vec $! start + 19
    let !p20# = or# p19# (and# (maskw# x20# v1#) (int2Word# 0x80000#))

    !(I# x21#) <- liftM fromIntegral $ M.readArray vec $! start + 20
    let !p21# = or# p20# (and# (maskw# x21# v1#) (int2Word# 0x100000#))

    !(I# x22#) <- liftM fromIntegral $ M.readArray vec $! start + 21
    let !p22# = or# p21# (and# (maskw# x22# v1#) (int2Word# 0x200000#))

    !(I# x23#) <- liftM fromIntegral $ M.readArray vec $! start + 22
    let !p23# = or# p22# (and# (maskw# x23# v1#) (int2Word# 0x400000#))

    !(I# x24#) <- liftM fromIntegral $ M.readArray vec $! start + 23
    let !p24# = or# p23# (and# (maskw# x24# v1#) (int2Word# 0x800000#))

    !(I# x25#) <- liftM fromIntegral $ M.readArray vec $! start + 24
    let !p25# = or# p24# (and# (maskw# x25# v1#) (int2Word# 0x1000000#))

    !(I# x26#) <- liftM fromIntegral $ M.readArray vec $! start + 25
    let !p26# = or# p25# (and# (maskw# x26# v1#) (int2Word# 0x2000000#))

    !(I# x27#) <- liftM fromIntegral $ M.readArray vec $! start + 26
    let !p27# = or# p26# (and# (maskw# x27# v1#) (int2Word# 0x4000000#))

    !(I# x28#) <- liftM fromIntegral $ M.readArray vec $! start + 27
    let !p28# = or# p27# (and# (maskw# x28# v1#) (int2Word# 0x8000000#))

    !(I# x29#) <- liftM fromIntegral $ M.readArray vec $! start + 28
    let !p29# = or# p28# (and# (maskw# x29# v1#) (int2Word# 0x10000000#))

    !(I# x30#) <- liftM fromIntegral $ M.readArray vec $! start + 29
    let !p30# = or# p29# (and# (maskw# x30# v1#) (int2Word# 0x20000000#))

    !(I# x31#) <- liftM fromIntegral $ M.readArray vec $! start + 30
    let !p31# = or# p30# (and# (maskw# x31# v1#) (int2Word# 0x40000000#))

    !(I# x32#) <- liftM fromIntegral $ M.readArray vec $! start + 31
    let !p32# = or# p31# (and# (maskw# x32# v1#) (int2Word# 0x80000000#))

    return $! lineResult# p32# start


-- Required: unlike in C search, required that the start index is
-- cache-line-aligned and array has at least 32 elements after the start index
lineSearch2 :: IntArray s        -- ^ vector to search
            -> Int               -- ^ start index
            -> Elem              -- ^ value to search for
            -> Elem              -- ^ second value to search for
            -> ST s Int          -- ^ dest index where it can be found, or
                                -- \"-1\" if not found
lineSearch2 !vec !start !elem1 !elem2 = do
    let !(I# v1#) = fromIntegral elem1
    let !(I# v2#) = fromIntegral elem2

    !(I# x1#) <- liftM fromIntegral $ M.readArray vec start
    let !p1# = (and# (int2Word# 0x1#)
                     (or# (maskw# x1# v1#) (maskw# x1# v2#)))
    !(I# x2#) <- liftM fromIntegral $ M.readArray vec $! start + 1
    let !p2# = or# p1# (and# (int2Word# 0x2#)
                             (or# (maskw# x2# v1#) (maskw# x2# v2#)))
    !(I# x3#) <- liftM fromIntegral $ M.readArray vec $! start + 2
    let !p3# = or# p2# (and# (int2Word# 0x4#)
                             (or# (maskw# x3# v1#) (maskw# x3# v2#)))
    !(I# x4#) <- liftM fromIntegral $ M.readArray vec $! start + 3
    let !p4# = or# p3# (and# (int2Word# 0x8#)
                             (or# (maskw# x4# v1#) (maskw# x4# v2#)))
    !(I# x5#) <- liftM fromIntegral $ M.readArray vec $! start + 4
    let !p5# = or# p4# (and# (int2Word# 0x10#)
                             (or# (maskw# x5# v1#) (maskw# x5# v2#)))
    !(I# x6#) <- liftM fromIntegral $ M.readArray vec $! start + 5
    let !p6# = or# p5# (and# (int2Word# 0x20#)
                             (or# (maskw# x6# v1#) (maskw# x6# v2#)))
    !(I# x7#) <- liftM fromIntegral $ M.readArray vec $! start + 6
    let !p7# = or# p6# (and# (int2Word# 0x40#)
                             (or# (maskw# x7# v1#) (maskw# x7# v2#)))
    !(I# x8#) <- liftM fromIntegral $ M.readArray vec $! start + 7
    let !p8# = or# p7# (and# (int2Word# 0x80#)
                             (or# (maskw# x8# v1#) (maskw# x8# v2#)))
    !(I# x9#) <- liftM fromIntegral $ M.readArray vec $! start + 8
    let !p9# = or# p8# (and# (int2Word# 0x100#)
                             (or# (maskw# x9# v1#) (maskw# x9# v2#)))
    !(I# x10#) <- liftM fromIntegral $ M.readArray vec $! start + 9
    let !p10# = or# p9# (and# (int2Word# 0x200#)
                             (or# (maskw# x10# v1#) (maskw# x10# v2#)))
    !(I# x11#) <- liftM fromIntegral $ M.readArray vec $! start + 10
    let !p11# = or# p10# (and# (int2Word# 0x400#)
                             (or# (maskw# x11# v1#) (maskw# x11# v2#)))
    !(I# x12#) <- liftM fromIntegral $ M.readArray vec $! start + 11
    let !p12# = or# p11# (and# (int2Word# 0x800#)
                             (or# (maskw# x12# v1#) (maskw# x12# v2#)))
    !(I# x13#) <- liftM fromIntegral $ M.readArray vec $! start + 12
    let !p13# = or# p12# (and# (int2Word# 0x1000#)
                             (or# (maskw# x13# v1#) (maskw# x13# v2#)))
    !(I# x14#) <- liftM fromIntegral $ M.readArray vec $! start + 13
    let !p14# = or# p13# (and# (int2Word# 0x2000#)
                             (or# (maskw# x14# v1#) (maskw# x14# v2#)))
    !(I# x15#) <- liftM fromIntegral $ M.readArray vec $! start + 14
    let !p15# = or# p14# (and# (int2Word# 0x4000#)
                             (or# (maskw# x15# v1#) (maskw# x15# v2#)))
    !(I# x16#) <- liftM fromIntegral $ M.readArray vec $! start + 15
    let !p16# = or# p15# (and# (int2Word# 0x8000#)
                             (or# (maskw# x16# v1#) (maskw# x16# v2#)))
    !(I# x17#) <- liftM fromIntegral $ M.readArray vec $! start + 16
    let !p17# = or# p16# (and# (int2Word# 0x10000#)
                             (or# (maskw# x17# v1#) (maskw# x17# v2#)))
    !(I# x18#) <- liftM fromIntegral $ M.readArray vec $! start + 17
    let !p18# = or# p17# (and# (int2Word# 0x20000#)
                             (or# (maskw# x18# v1#) (maskw# x18# v2#)))
    !(I# x19#) <- liftM fromIntegral $ M.readArray vec $! start + 18
    let !p19# = or# p18# (and# (int2Word# 0x40000#)
                             (or# (maskw# x19# v1#) (maskw# x19# v2#)))
    !(I# x20#) <- liftM fromIntegral $ M.readArray vec $! start + 19
    let !p20# = or# p19# (and# (int2Word# 0x80000#)
                             (or# (maskw# x20# v1#) (maskw# x20# v2#)))
    !(I# x21#) <- liftM fromIntegral $ M.readArray vec $! start + 20
    let !p21# = or# p20# (and# (int2Word# 0x100000#)
                             (or# (maskw# x21# v1#) (maskw# x21# v2#)))
    !(I# x22#) <- liftM fromIntegral $ M.readArray vec $! start + 21
    let !p22# = or# p21# (and# (int2Word# 0x200000#)
                             (or# (maskw# x22# v1#) (maskw# x22# v2#)))
    !(I# x23#) <- liftM fromIntegral $ M.readArray vec $! start + 22
    let !p23# = or# p22# (and# (int2Word# 0x400000#)
                             (or# (maskw# x23# v1#) (maskw# x23# v2#)))
    !(I# x24#) <- liftM fromIntegral $ M.readArray vec $! start + 23
    let !p24# = or# p23# (and# (int2Word# 0x800000#)
                             (or# (maskw# x24# v1#) (maskw# x24# v2#)))
    !(I# x25#) <- liftM fromIntegral $ M.readArray vec $! start + 24
    let !p25# = or# p24# (and# (int2Word# 0x1000000#)
                             (or# (maskw# x25# v1#) (maskw# x25# v2#)))
    !(I# x26#) <- liftM fromIntegral $ M.readArray vec $! start + 25
    let !p26# = or# p25# (and# (int2Word# 0x2000000#)
                             (or# (maskw# x26# v1#) (maskw# x26# v2#)))
    !(I# x27#) <- liftM fromIntegral $ M.readArray vec $! start + 26
    let !p27# = or# p26# (and# (int2Word# 0x4000000#)
                             (or# (maskw# x27# v1#) (maskw# x27# v2#)))
    !(I# x28#) <- liftM fromIntegral $ M.readArray vec $! start + 27
    let !p28# = or# p27# (and# (int2Word# 0x8000000#)
                             (or# (maskw# x28# v1#) (maskw# x28# v2#)))
    !(I# x29#) <- liftM fromIntegral $ M.readArray vec $! start + 28
    let !p29# = or# p28# (and# (int2Word# 0x10000000#)
                             (or# (maskw# x29# v1#) (maskw# x29# v2#)))
    !(I# x30#) <- liftM fromIntegral $ M.readArray vec $! start + 29
    let !p30# = or# p29# (and# (int2Word# 0x20000000#)
                             (or# (maskw# x30# v1#) (maskw# x30# v2#)))
    !(I# x31#) <- liftM fromIntegral $ M.readArray vec $! start + 30
    let !p31# = or# p30# (and# (int2Word# 0x40000000#)
                             (or# (maskw# x31# v1#) (maskw# x31# v2#)))
    !(I# x32#) <- liftM fromIntegral $ M.readArray vec $! start + 31
    let !p32# = or# p31# (and# (int2Word# 0x80000000#)
                             (or# (maskw# x32# v1#) (maskw# x32# v2#)))

    return $! lineResult# p32# start


-- Required: unlike in C search, required that the start index is
-- cache-line-aligned and array has at least 32 elements after the start index
lineSearch3 :: IntArray s        -- ^ vector to search
            -> Int               -- ^ start index
            -> Elem              -- ^ value to search for
            -> Elem              -- ^ second value to search for
            -> Elem              -- ^ third value to search for
            -> ST s Int          -- ^ dest index where it can be found, or
                                -- \"-1\" if not found
lineSearch3 !vec !start !elem1 !elem2 !elem3 = do
    let !(I# v1#) = fromIntegral elem1
    let !(I# v2#) = fromIntegral elem2
    let !(I# v3#) = fromIntegral elem3

    !(I# x1#) <- liftM fromIntegral $ M.readArray vec start
    let !p1# = (and# (int2Word# 0x1#)
                     (or# (maskw# x1# v1#)
                          (or# (maskw# x1# v2#) (maskw# x1# v3#))))

    !(I# x2#) <- liftM fromIntegral $ M.readArray vec $! start + 1
    let !p2# = or# p1# (and# (int2Word# 0x2#)
                             (or# (maskw# x2# v1#)
                                  (or# (maskw# x2# v2#) (maskw# x2# v3#))))

    !(I# x3#) <- liftM fromIntegral $ M.readArray vec $! start + 2
    let !p3# = or# p2# (and# (int2Word# 0x4#)
                             (or# (maskw# x3# v1#)
                                  (or# (maskw# x3# v2#) (maskw# x3# v3#))))

    !(I# x4#) <- liftM fromIntegral $ M.readArray vec $! start + 3
    let !p4# = or# p3# (and# (int2Word# 0x8#)
                             (or# (maskw# x4# v1#)
                                  (or# (maskw# x4# v2#) (maskw# x4# v3#))))

    !(I# x5#) <- liftM fromIntegral $ M.readArray vec $! start + 4
    let !p5# = or# p4# (and# (int2Word# 0x10#)
                             (or# (maskw# x5# v1#)
                                  (or# (maskw# x5# v2#) (maskw# x5# v3#))))

    !(I# x6#) <- liftM fromIntegral $ M.readArray vec $! start + 5
    let !p6# = or# p5# (and# (int2Word# 0x20#)
                             (or# (maskw# x6# v1#)
                                  (or# (maskw# x6# v2#) (maskw# x6# v3#))))

    !(I# x7#) <- liftM fromIntegral $ M.readArray vec $! start + 6
    let !p7# = or# p6# (and# (int2Word# 0x40#)
                             (or# (maskw# x7# v1#)
                                  (or# (maskw# x7# v2#) (maskw# x7# v3#))))

    !(I# x8#) <- liftM fromIntegral $ M.readArray vec $! start + 7
    let !p8# = or# p7# (and# (int2Word# 0x80#)
                             (or# (maskw# x8# v1#)
                                  (or# (maskw# x8# v2#) (maskw# x8# v3#))))

    !(I# x9#) <- liftM fromIntegral $ M.readArray vec $! start + 8
    let !p9# = or# p8# (and# (int2Word# 0x100#)
                             (or# (maskw# x9# v1#)
                                  (or# (maskw# x9# v2#) (maskw# x9# v3#))))

    !(I# x10#) <- liftM fromIntegral $ M.readArray vec $! start + 9
    let !p10# = or# p9# (and# (int2Word# 0x200#)
                             (or# (maskw# x10# v1#)
                                  (or# (maskw# x10# v2#) (maskw# x10# v3#))))

    !(I# x11#) <- liftM fromIntegral $ M.readArray vec $! start + 10
    let !p11# = or# p10# (and# (int2Word# 0x400#)
                             (or# (maskw# x11# v1#)
                                  (or# (maskw# x11# v2#) (maskw# x11# v3#))))

    !(I# x12#) <- liftM fromIntegral $ M.readArray vec $! start + 11
    let !p12# = or# p11# (and# (int2Word# 0x800#)
                             (or# (maskw# x12# v1#)
                                  (or# (maskw# x12# v2#) (maskw# x12# v3#))))

    !(I# x13#) <- liftM fromIntegral $ M.readArray vec $! start + 12
    let !p13# = or# p12# (and# (int2Word# 0x1000#)
                             (or# (maskw# x13# v1#)
                                  (or# (maskw# x13# v2#) (maskw# x13# v3#))))

    !(I# x14#) <- liftM fromIntegral $ M.readArray vec $! start + 13
    let !p14# = or# p13# (and# (int2Word# 0x2000#)
                             (or# (maskw# x14# v1#)
                                  (or# (maskw# x14# v2#) (maskw# x14# v3#))))

    !(I# x15#) <- liftM fromIntegral $ M.readArray vec $! start + 14
    let !p15# = or# p14# (and# (int2Word# 0x4000#)
                             (or# (maskw# x15# v1#)
                                  (or# (maskw# x15# v2#) (maskw# x15# v3#))))

    !(I# x16#) <- liftM fromIntegral $ M.readArray vec $! start + 15
    let !p16# = or# p15# (and# (int2Word# 0x8000#)
                             (or# (maskw# x16# v1#)
                                  (or# (maskw# x16# v2#) (maskw# x16# v3#))))

    !(I# x17#) <- liftM fromIntegral $ M.readArray vec $! start + 16
    let !p17# = or# p16# (and# (int2Word# 0x10000#)
                             (or# (maskw# x17# v1#)
                                  (or# (maskw# x17# v2#) (maskw# x17# v3#))))

    !(I# x18#) <- liftM fromIntegral $ M.readArray vec $! start + 17
    let !p18# = or# p17# (and# (int2Word# 0x20000#)
                             (or# (maskw# x18# v1#)
                                  (or# (maskw# x18# v2#) (maskw# x18# v3#))))

    !(I# x19#) <- liftM fromIntegral $ M.readArray vec $! start + 18
    let !p19# = or# p18# (and# (int2Word# 0x40000#)
                             (or# (maskw# x19# v1#)
                                  (or# (maskw# x19# v2#) (maskw# x19# v3#))))

    !(I# x20#) <- liftM fromIntegral $ M.readArray vec $! start + 19
    let !p20# = or# p19# (and# (int2Word# 0x80000#)
                             (or# (maskw# x20# v1#)
                                  (or# (maskw# x20# v2#) (maskw# x20# v3#))))

    !(I# x21#) <- liftM fromIntegral $ M.readArray vec $! start + 20
    let !p21# = or# p20# (and# (int2Word# 0x100000#)
                             (or# (maskw# x21# v1#)
                                  (or# (maskw# x21# v2#) (maskw# x21# v3#))))

    !(I# x22#) <- liftM fromIntegral $ M.readArray vec $! start + 21
    let !p22# = or# p21# (and# (int2Word# 0x200000#)
                             (or# (maskw# x22# v1#)
                                  (or# (maskw# x22# v2#) (maskw# x22# v3#))))

    !(I# x23#) <- liftM fromIntegral $ M.readArray vec $! start + 22
    let !p23# = or# p22# (and# (int2Word# 0x400000#)
                             (or# (maskw# x23# v1#)
                                  (or# (maskw# x23# v2#) (maskw# x23# v3#))))

    !(I# x24#) <- liftM fromIntegral $ M.readArray vec $! start + 23
    let !p24# = or# p23# (and# (int2Word# 0x800000#)
                             (or# (maskw# x24# v1#)
                                  (or# (maskw# x24# v2#) (maskw# x24# v3#))))

    !(I# x25#) <- liftM fromIntegral $ M.readArray vec $! start + 24
    let !p25# = or# p24# (and# (int2Word# 0x1000000#)
                             (or# (maskw# x25# v1#)
                                  (or# (maskw# x25# v2#) (maskw# x25# v3#))))

    !(I# x26#) <- liftM fromIntegral $ M.readArray vec $! start + 25
    let !p26# = or# p25# (and# (int2Word# 0x2000000#)
                             (or# (maskw# x26# v1#)
                                  (or# (maskw# x26# v2#) (maskw# x26# v3#))))

    !(I# x27#) <- liftM fromIntegral $ M.readArray vec $! start + 26
    let !p27# = or# p26# (and# (int2Word# 0x4000000#)
                             (or# (maskw# x27# v1#)
                                  (or# (maskw# x27# v2#) (maskw# x27# v3#))))

    !(I# x28#) <- liftM fromIntegral $ M.readArray vec $! start + 27
    let !p28# = or# p27# (and# (int2Word# 0x8000000#)
                             (or# (maskw# x28# v1#)
                                  (or# (maskw# x28# v2#) (maskw# x28# v3#))))

    !(I# x29#) <- liftM fromIntegral $ M.readArray vec $! start + 28
    let !p29# = or# p28# (and# (int2Word# 0x10000000#)
                             (or# (maskw# x29# v1#)
                                  (or# (maskw# x29# v2#) (maskw# x29# v3#))))

    !(I# x30#) <- liftM fromIntegral $ M.readArray vec $! start + 29
    let !p30# = or# p29# (and# (int2Word# 0x20000000#)
                             (or# (maskw# x30# v1#)
                                  (or# (maskw# x30# v2#) (maskw# x30# v3#))))

    !(I# x31#) <- liftM fromIntegral $ M.readArray vec $! start + 30
    let !p31# = or# p30# (and# (int2Word# 0x40000000#)
                             (or# (maskw# x31# v1#)
                                  (or# (maskw# x31# v2#) (maskw# x31# v3#))))

    !(I# x32#) <- liftM fromIntegral $ M.readArray vec $! start + 31
    let !p32# = or# p31# (and# (int2Word# 0x80000000#)
                             (or# (maskw# x32# v1#)
                                  (or# (maskw# x32# v2#) (maskw# x32# v3#))))

    return $! lineResult# p32# start


------------------------------------------------------------------------------
-- | Search through a mutable vector for a given int value. The number of
-- things to search for must be at most the number of things remaining in the
-- vector.
naiveSearch :: IntArray s       -- ^ vector to search
            -> Int              -- ^ start index
            -> Int              -- ^ number of things to search
            -> Elem             -- ^ value to search for
            -> ST s Int         -- ^ dest index where it can be found, or
                                -- \"-1\" if not found
naiveSearch !vec !start !nThings !value = go start
  where
    !doneIdx = start + nThings

    go !i | i >= doneIdx = return (-1)
          | otherwise = do
        x <- M.readArray vec i
        if x == value then return i else go (i+1)
{-# INLINE naiveSearch #-}


------------------------------------------------------------------------------
naiveSearch2 :: IntArray s       -- ^ vector to search
             -> Int              -- ^ start index
             -> Int              -- ^ number of things to search
             -> Elem             -- ^ value to search for
             -> Elem             -- ^ value 2 to search for
             -> ST s Int         -- ^ dest index where it can be found, or
                                -- \"-1\" if not found
naiveSearch2 !vec !start !nThings !value1 !value2 = go start
  where
    !doneIdx = start + nThings

    go !i | i >= doneIdx = return (-1)
          | otherwise = do
        x <- M.readArray vec i
        if x == value1 || x == value2 then return i else go (i+1)
{-# INLINE naiveSearch2 #-}


------------------------------------------------------------------------------
naiveSearch3 :: IntArray s       -- ^ vector to search
             -> Int              -- ^ start index
             -> Int              -- ^ number of things to search
             -> Elem             -- ^ value to search for
             -> Elem             -- ^ value 2 to search for
             -> Elem             -- ^ value 3 to search for
             -> ST s Int         -- ^ dest index where it can be found, or
                                -- \"-1\" if not found
naiveSearch3 !vec !start !nThings !value1 !value2 !value3 = go start
  where
    !doneIdx = start + nThings

    go !i | i >= doneIdx = return (-1)
          | otherwise = do
        x <- M.readArray vec i
        if x == value1 || x == value2 || x == value3
          then return i
          else go (i+1)
{-# INLINE naiveSearch3 #-}

-- end #if NO_C_SEARCH
#endif

------------------------------------------------------------------------------
-- | Search through a mutable vector for a given int value, cache-line aligned.
-- If the start index is cache-line aligned, and there is more than a
-- cache-line's room between the start index and the end of the vector, we will
-- search the cache-line all at once using an efficient branchless
-- bit-twiddling technique. Otherwise, we will use a typical loop.
--
cacheLineSearch :: IntArray s        -- ^ vector to search
                -> Int               -- ^ start index
                -> Elem              -- ^ value to search for
                -> ST s Int          -- ^ dest index where it can be found, or
                                     -- \"-1\" if not found
cacheLineSearch :: forall s. IntArray s -> Int -> Elem -> ST s Int
cacheLineSearch !IntArray s
vec !Int
start !Elem
value = do
#ifdef NO_C_SEARCH
    let !vlen  = M.length vec
    let !st1   = vlen - start
    let !nvlen = numElemsInCacheLine - st1
    let adv    = (start + cacheLineIntMask) .&. complement cacheLineIntMask
    let st2    = adv - start


    if nvlen > 0 || not (isCacheLineAligned start)
      then naiveSearch vec start (min st1 st2) value
      else lineSearch vec start value
#else
    IntArray s -> Int -> Elem -> ST s Int
forall s. IntArray s -> Int -> Elem -> ST s Int
lineSearch IntArray s
vec Int
start Elem
value
#endif
{-# INLINE cacheLineSearch #-}


------------------------------------------------------------------------------
-- | Search through a mutable vector for one of two given int values,
-- cache-line aligned.  If the start index is cache-line aligned, and there is
-- more than a cache-line's room between the start index and the end of the
-- vector, we will search the cache-line all at once using an efficient
-- branchless bit-twiddling technique. Otherwise, we will use a typical loop.
--
cacheLineSearch2 :: IntArray s        -- ^ vector to search
                 -> Int               -- ^ start index
                 -> Elem              -- ^ value to search for
                 -> Elem              -- ^ value 2 to search for
                 -> ST s Int          -- ^ dest index where it can be found, or
                                     -- \"-1\" if not found
cacheLineSearch2 :: forall s. IntArray s -> Int -> Elem -> Elem -> ST s Int
cacheLineSearch2 !IntArray s
vec !Int
start !Elem
value !Elem
value2 = do
#ifdef NO_C_SEARCH
    let !vlen  = M.length vec
    let !st1   = vlen - start
    let !nvlen = numElemsInCacheLine - st1
    let adv    = (start + cacheLineIntMask) .&. complement cacheLineIntMask
    let st2    = adv - start

    if nvlen > 0 || not (isCacheLineAligned start)
      then naiveSearch2 vec start (min st1 st2) value value2
      else lineSearch2 vec start value value2
#else
    IntArray s -> Int -> Elem -> Elem -> ST s Int
forall s. IntArray s -> Int -> Elem -> Elem -> ST s Int
lineSearch2 IntArray s
vec Int
start Elem
value Elem
value2
#endif
{-# INLINE cacheLineSearch2 #-}


------------------------------------------------------------------------------
-- | Search through a mutable vector for one of three given int values,
-- cache-line aligned.  If the start index is cache-line aligned, and there is
-- more than a cache-line's room between the start index and the end of the
-- vector, we will search the cache-line all at once using an efficient
-- branchless bit-twiddling technique. Otherwise, we will use a typical loop.
--
cacheLineSearch3 :: IntArray s        -- ^ vector to search
                 -> Int               -- ^ start index
                 -> Elem              -- ^ value to search for
                 -> Elem              -- ^ value 2 to search for
                 -> Elem              -- ^ value 3 to search for
                 -> ST s Int          -- ^ dest index where it can be found, or
                                     -- \"-1\" if not found
cacheLineSearch3 :: forall s. IntArray s -> Int -> Elem -> Elem -> Elem -> ST s Int
cacheLineSearch3 !IntArray s
vec !Int
start !Elem
value !Elem
value2 !Elem
value3 = do
#ifdef NO_C_SEARCH
    let !vlen  = M.length vec
    let !st1   = vlen - start
    let !nvlen = numElemsInCacheLine - st1
    let adv    = (start + cacheLineIntMask) .&. complement cacheLineIntMask
    let st2    = adv - start

    if nvlen > 0 || not (isCacheLineAligned start)
      then naiveSearch3 vec start (min st1 st2) value value2 value3
      else lineSearch3 vec start value value2 value3
#else
    IntArray s -> Int -> Elem -> Elem -> Elem -> ST s Int
forall s. IntArray s -> Int -> Elem -> Elem -> Elem -> ST s Int
lineSearch3 IntArray s
vec Int
start Elem
value Elem
value2 Elem
value3
#endif
{-# INLINE cacheLineSearch3 #-}