{-# LANGUAGE BangPatterns, ScopedTypeVariables #-}

-- |
-- Module      : Data.Text.Lazy.Search
-- Copyright   : (c) 2009, 2010 Bryan O'Sullivan
--
-- License     : BSD-style
-- Maintainer  : bos@serpentine.com
-- Stability   : experimental
-- Portability : GHC
--
-- /Warning/: this is an internal module, and does not have a stable
-- API or name. Functions in this module may not check or enforce
-- preconditions expected by public modules. Use at your own risk!
--
-- Fast substring search for lazy 'Text', based on work by Boyer,
-- Moore, Horspool, Sunday, and Lundh.  Adapted from the strict
-- implementation.

module Data.Text.Internal.Lazy.Search
    (
      indices
    ) where

import qualified Data.Text.Array as A
import Data.Int (Int64)
import Data.Word (Word16, Word64)
import qualified Data.Text.Internal as T
import Data.Text.Internal.Fusion.Types (PairS(..))
import Data.Text.Internal.Lazy (Text(..), foldlChunks)
import Data.Bits ((.|.), (.&.))
import Data.Text.Internal.Unsafe.Shift (shiftL)

-- | /O(n+m)/ Find the offsets of all non-overlapping indices of
-- @needle@ within @haystack@.
--
-- This function is strict in @needle@, and lazy (as far as possible)
-- in the chunks of @haystack@.
--
-- In (unlikely) bad cases, this algorithm's complexity degrades
-- towards /O(n*m)/.
indices :: Text              -- ^ Substring to search for (@needle@)
        -> Text              -- ^ Text to search in (@haystack@)
        -> [Int64]
indices :: Text -> Text -> [Int64]
indices needle :: Text
needle@(Chunk n :: Text
n ns :: Text
ns) _haystack :: Text
_haystack@(Chunk k :: Text
k ks :: Text
ks)
    | Int64
nlen Int64 -> Int64 -> Bool
forall a. Ord a => a -> a -> Bool
<= 0  = []
    | Int64
nlen Int64 -> Int64 -> Bool
forall a. Eq a => a -> a -> Bool
== 1  = Word16 -> Int64 -> Text -> Text -> [Int64]
indicesOne (Int64 -> Word16
nindex 0) 0 Text
k Text
ks
    | Bool
otherwise  = Text -> Text -> Int64 -> Int64 -> [Int64]
advance Text
k Text
ks 0 0
  where
    advance :: Text -> Text -> Int64 -> Int64 -> [Int64]
advance x :: Text
x@(T.Text _ _ l :: Int
l) xs :: Text
xs = Int64 -> Int64 -> [Int64]
scan
     where
      scan :: Int64 -> Int64 -> [Int64]
scan !Int64
g !Int64
i
         | Int64
i Int64 -> Int64 -> Bool
forall a. Ord a => a -> a -> Bool
>= Int64
m = case Text
xs of
                      Empty           -> []
                      Chunk y ys      -> Text -> Text -> Int64 -> Int64 -> [Int64]
advance Text
y Text
ys Int64
g (Int64
iInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
-Int64
m)
         | Int64 -> Text -> Text -> Bool
forall t. (Ord t, Num t) => t -> Text -> Text -> Bool
lackingHay (Int64
i Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+ Int64
nlen) Text
x Text
xs  = []
         | Word16
c Word16 -> Word16 -> Bool
forall a. Eq a => a -> a -> Bool
== Word16
z Bool -> Bool -> Bool
&& Int64 -> Bool
candidateMatch 0  = Int64
g Int64 -> [Int64] -> [Int64]
forall a. a -> [a] -> [a]
: Int64 -> Int64 -> [Int64]
scan (Int64
gInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+Int64
nlen) (Int64
iInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+Int64
nlen)
         | Bool
otherwise                   = Int64 -> Int64 -> [Int64]
scan (Int64
gInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+Int64
delta) (Int64
iInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+Int64
delta)
       where
         m :: Int64
m = Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
l
         c :: Word16
c = Int64 -> Word16
hindex (Int64
i Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+ Int64
nlast)
         delta :: Int64
delta | Bool
nextInPattern = Int64
nlen Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+ 1
               | Word16
c Word16 -> Word16 -> Bool
forall a. Eq a => a -> a -> Bool
== Word16
z        = Int64
skip Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+ 1
               | Bool
otherwise     = 1
         nextInPattern :: Bool
nextInPattern         = Word64
mask Word64 -> Word64 -> Word64
forall a. Bits a => a -> a -> a
.&. Word16 -> Word64
forall a a. (UnsafeShift a, Integral a, Num a) => a -> a
swizzle (Int64 -> Word16
hindex (Int64
iInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+Int64
nlen)) Word64 -> Word64 -> Bool
forall a. Eq a => a -> a -> Bool
== 0
         candidateMatch :: Int64 -> Bool
candidateMatch !Int64
j
             | Int64
j Int64 -> Int64 -> Bool
forall a. Ord a => a -> a -> Bool
>= Int64
nlast               = Bool
True
             | Int64 -> Word16
hindex (Int64
iInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+Int64
j) Word16 -> Word16 -> Bool
forall a. Eq a => a -> a -> Bool
/= Int64 -> Word16
nindex Int64
j = Bool
False
             | Bool
otherwise                = Int64 -> Bool
candidateMatch (Int64
jInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+1)
         hindex :: Int64 -> Word16
hindex                         = Text -> Text -> Int64 -> Word16
index Text
x Text
xs
    nlen :: Int64
nlen      = Text -> Int64
wordLength Text
needle
    nlast :: Int64
nlast     = Int64
nlen Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
- 1
    nindex :: Int64 -> Word16
nindex    = Text -> Text -> Int64 -> Word16
index Text
n Text
ns
    z :: Word16
z         = (Word16 -> Text -> Word16) -> Word16 -> Text -> Word16
forall a. (a -> Text -> a) -> a -> Text -> a
foldlChunks Word16 -> Text -> Word16
forall p. p -> Text -> Word16
fin 0 Text
needle
        where fin :: p -> Text -> Word16
fin _ (T.Text farr :: Array
farr foff :: Int
foff flen :: Int
flen) = Array -> Int -> Word16
A.unsafeIndex Array
farr (Int
foffInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
flenInt -> Int -> Int
forall a. Num a => a -> a -> a
-1)
    (Word64
mask :: Word64) :*: skip :: Int64
skip = Text
-> Text -> Int64 -> Int -> Word64 -> Int64 -> PairS Word64 Int64
forall t.
(Bits t, UnsafeShift t, Num t) =>
Text -> Text -> Int64 -> Int -> t -> Int64 -> PairS t Int64
buildTable Text
n Text
ns 0 0 0 (Int64
nlenInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
-2)
    swizzle :: a -> a
swizzle w :: a
w = 1 a -> Int -> a
forall a. UnsafeShift a => a -> Int -> a
`shiftL` (a -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral a
w Int -> Int -> Int
forall a. Bits a => a -> a -> a
.&. 0x3f)
    buildTable :: Text -> Text -> Int64 -> Int -> t -> Int64 -> PairS t Int64
buildTable (T.Text xarr :: Array
xarr xoff :: Int
xoff xlen :: Int
xlen) xs :: Text
xs = Int64 -> Int -> t -> Int64 -> PairS t Int64
go
      where
        go :: Int64 -> Int -> t -> Int64 -> PairS t Int64
go !(Int64
g::Int64) !Int
i !t
msk !Int64
skp
            | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
xlast = case Text
xs of
                             Empty      -> (t
msk t -> t -> t
forall a. Bits a => a -> a -> a
.|. Word16 -> t
forall a a. (UnsafeShift a, Integral a, Num a) => a -> a
swizzle Word16
z) t -> Int64 -> PairS t Int64
forall a b. a -> b -> PairS a b
:*: Int64
skp
                             Chunk y ys -> Text -> Text -> Int64 -> Int -> t -> Int64 -> PairS t Int64
buildTable Text
y Text
ys Int64
g 0 t
msk' Int64
skp'
            | Bool
otherwise = Int64 -> Int -> t -> Int64 -> PairS t Int64
go (Int64
gInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
+1) (Int
iInt -> Int -> Int
forall a. Num a => a -> a -> a
+1) t
msk' Int64
skp'
            where c :: Word16
c                = Array -> Int -> Word16
A.unsafeIndex Array
xarr (Int
xoffInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
i)
                  msk' :: t
msk'             = t
msk t -> t -> t
forall a. Bits a => a -> a -> a
.|. Word16 -> t
forall a a. (UnsafeShift a, Integral a, Num a) => a -> a
swizzle Word16
c
                  skp' :: Int64
skp' | Word16
c Word16 -> Word16 -> Bool
forall a. Eq a => a -> a -> Bool
== Word16
z    = Int64
nlen Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
- Int64
g Int64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
- 2
                       | Bool
otherwise = Int64
skp
                  xlast :: Int
xlast = Int
xlen Int -> Int -> Int
forall a. Num a => a -> a -> a
- 1
    -- | Check whether an attempt to index into the haystack at the
    -- given offset would fail.
    lackingHay :: t -> Text -> Text -> Bool
lackingHay q :: t
q = t -> Text -> Text -> Bool
go 0
      where
        go :: t -> Text -> Text -> Bool
go p :: t
p (T.Text _ _ l :: Int
l) ps :: Text
ps = t
p' t -> t -> Bool
forall a. Ord a => a -> a -> Bool
< t
q Bool -> Bool -> Bool
&& case Text
ps of
                                             Empty      -> Bool
True
                                             Chunk r :: Text
r rs :: Text
rs -> t -> Text -> Text -> Bool
go t
p' Text
r Text
rs
            where p' :: t
p' = t
p t -> t -> t
forall a. Num a => a -> a -> a
+ Int -> t
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
l
indices _ _ = []

-- | Fast index into a partly unpacked 'Text'.  We take into account
-- the possibility that the caller might try to access one element
-- past the end.
index :: T.Text -> Text -> Int64 -> Word16
index :: Text -> Text -> Int64 -> Word16
index (T.Text arr :: Array
arr off :: Int
off len :: Int
len) xs :: Text
xs !Int64
i
    | Int
j Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
< Int
len   = Array -> Int -> Word16
A.unsafeIndex Array
arr (Int
offInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
j)
    | Bool
otherwise = case Text
xs of
                    Empty
                        -- out of bounds, but legal
                        | Int
j Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
len  -> 0
                        -- should never happen, due to lackingHay above
                        | Bool
otherwise -> String -> Word16
forall a. String -> a
emptyError "index"
                    Chunk c :: Text
c cs :: Text
cs -> Text -> Text -> Int64 -> Word16
index Text
c Text
cs (Int64
iInt64 -> Int64 -> Int64
forall a. Num a => a -> a -> a
-Int -> Int64
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
len)
    where j :: Int
j = Int64 -> Int
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int64
i

-- | A variant of 'indices' that scans linearly for a single 'Word16'.
indicesOne :: Word16 -> Int64 -> T.Text -> Text -> [Int64]
indicesOne :: Word16 -> Int64 -> Text -> Text -> [Int64]
indicesOne c :: Word16
c = Int64 -> Text -> Text -> [Int64]
forall a. Num a => a -> Text -> Text -> [a]
chunk
  where
    chunk :: a -> Text -> Text -> [a]
chunk !a
i (T.Text oarr :: Array
oarr ooff :: Int
ooff olen :: Int
olen) os :: Text
os = Int -> [a]
go 0
      where
        go :: Int -> [a]
go h :: Int
h | Int
h Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
olen = case Text
os of
                             Empty      -> []
                             Chunk y ys -> a -> Text -> Text -> [a]
chunk (a
ia -> a -> a
forall a. Num a => a -> a -> a
+Int -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
olen) Text
y Text
ys
             | Word16
on Word16 -> Word16 -> Bool
forall a. Eq a => a -> a -> Bool
== Word16
c = a
i a -> a -> a
forall a. Num a => a -> a -> a
+ Int -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
h a -> [a] -> [a]
forall a. a -> [a] -> [a]
: Int -> [a]
go (Int
hInt -> Int -> Int
forall a. Num a => a -> a -> a
+1)
             | Bool
otherwise = Int -> [a]
go (Int
hInt -> Int -> Int
forall a. Num a => a -> a -> a
+1)
             where on :: Word16
on = Array -> Int -> Word16
A.unsafeIndex Array
oarr (Int
ooffInt -> Int -> Int
forall a. Num a => a -> a -> a
+Int
h)

-- | The number of 'Word16' values in a 'Text'.
wordLength :: Text -> Int64
wordLength :: Text -> Int64
wordLength = (Int64 -> Text -> Int64) -> Int64 -> Text -> Int64
forall a. (a -> Text -> a) -> a -> Text -> a
foldlChunks Int64 -> Text -> Int64
forall a. Num a => a -> Text -> a
sumLength 0
    where sumLength :: a -> Text -> a
sumLength i :: a
i (T.Text _ _ l :: Int
l) = a
i a -> a -> a
forall a. Num a => a -> a -> a
+ Int -> a
forall a b. (Integral a, Num b) => a -> b
fromIntegral Int
l

emptyError :: String -> a
emptyError :: String -> a
emptyError fun :: String
fun = String -> a
forall a. HasCallStack => String -> a
error ("Data.Text.Lazy.Search." String -> String -> String
forall a. [a] -> [a] -> [a]
++ String
fun String -> String -> String
forall a. [a] -> [a] -> [a]
++ ": empty input")