SDSL 3.0.2
Succinct Data Structure Library
Loading...
Searching...
No Matches
suffix_array_algorithm.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
8#ifndef INCLUDED_SDSL_SUFFIX_ARRAY_ALGORITHM
9#define INCLUDED_SDSL_SUFFIX_ARRAY_ALGORITHM
10
11#include <array>
12#include <assert.h>
13#include <iterator>
14#include <stdint.h>
15#include <type_traits>
16
17#include <sdsl/config.hpp>
18#include <sdsl/csa_wt.hpp>
19#include <sdsl/int_vector.hpp>
22#include <sdsl/wt_pc.hpp>
23
24namespace sdsl
25{
26
28
43template <class t_csa, class t_pat_iter>
44typename t_csa::size_type forward_search(
45 t_csa const & csa,
46 typename t_csa::size_type l,
47 typename t_csa::size_type r,
48 t_pat_iter begin,
49 t_pat_iter end,
50 typename t_csa::size_type & l_res,
51 typename t_csa::size_type & r_res,
53 typename std::enable_if<std::is_same<csa_tag, typename t_csa::index_category>::value, csa_tag>::type x = csa_tag())
54{
55 assert(l <= r);
56 assert(r < csa.size());
57
58 auto size = csa.size();
59
60 l_res = l;
61 r_res = l - 1;
62 auto l_res_upper = r + 1;
63 auto r_res_upper = r + 1;
64
65 // shortcut for too long patterns
66 if ((typename t_csa::size_type)(end - begin) >= size)
67 return 0;
68
69 // compares the pattern with CSA-prefix i (truncated to length $|pattern|$).
70 auto compare = [&](typename t_csa::size_type i) -> int
71 {
72 for (auto current = begin; current != end; current++)
73 {
74 auto index = csa.char2comp[*current];
75 if (index == 0)
76 return -1;
77 if (csa.C[index + 1] - 1 < i)
78 return -1;
79 if (csa.C[index] > i)
80 return 1;
81 i = csa.psi[i];
82 }
83 return 0;
84 };
85
86 // binary search (on min)
87 while (l_res < l_res_upper)
88 {
89 typename t_csa::size_type sample = l_res + (l_res_upper - l_res) / 2;
90 int result = compare(sample);
91 if (result == 1)
92 l_res = sample + 1;
93 else if (result == -1)
94 l_res_upper = sample;
95 else
96 l_res_upper = sample;
97 }
98
99 // binary search (on max)
100 while (r_res + 1 < r_res_upper)
101 {
102 typename t_csa::size_type sample = r_res + (r_res_upper - r_res) / 2;
103 int result = compare(sample);
104 if (result == 1)
105 r_res = sample;
106 else if (result == -1)
107 r_res_upper = sample;
108 else
109 r_res = sample;
110 }
111
112 return r_res - l_res + 1;
113}
114
116
129template <class t_csa>
130typename t_csa::size_type forward_search(
131 t_csa const & csa,
132 typename t_csa::size_type l,
133 typename t_csa::size_type r,
134 typename t_csa::char_type c,
135 typename t_csa::size_type & l_res,
136 typename t_csa::size_type & r_res,
138 typename std::enable_if<std::is_same<csa_tag, typename t_csa::index_category>::value, csa_tag>::type x = csa_tag())
139{
140 auto c_ptr = &c;
141 return forward_search(csa, l, r, c_ptr, c_ptr + 1, l_res, r_res);
142}
143
145
166template <class t_csa>
167typename t_csa::size_type backward_search(
168 t_csa const & csa,
169 typename t_csa::size_type l,
170 typename t_csa::size_type r,
171 typename t_csa::char_type c,
172 typename t_csa::size_type & l_res,
173 typename t_csa::size_type & r_res,
175 typename std::enable_if<std::is_same<csa_tag, typename t_csa::index_category>::value, csa_tag>::type x = csa_tag())
176{
177 assert(l <= r);
178 assert(r < csa.size());
179 typename t_csa::size_type cc = csa.char2comp[c];
180 if (cc == 0 and c > 0)
181 {
182 l_res = 1;
183 r_res = 0;
184 }
185 else
186 {
187 typename t_csa::size_type c_begin = csa.C[cc];
188 if (l == 0 and r + 1 == csa.size())
189 {
190 l_res = c_begin;
191 r_res = csa.C[cc + 1] - 1;
192 }
193 else
194 {
195 l_res = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1]
196 r_res = c_begin + csa.bwt.rank(r + 1, c) - 1; // count c in bwt[0..r]
197 }
198 }
199 assert(r_res + 1 - l_res >= 0);
200 return r_res + 1 - l_res;
201}
202
204
227template <class t_csa, class t_pat_iter>
228typename t_csa::size_type backward_search(
229 t_csa const & csa,
230 typename t_csa::size_type l,
231 typename t_csa::size_type r,
232 t_pat_iter begin,
233 t_pat_iter end,
234 typename t_csa::size_type & l_res,
235 typename t_csa::size_type & r_res,
237 typename std::enable_if<std::is_same<csa_tag, typename t_csa::index_category>::value, csa_tag>::type x = csa_tag())
238{
239 t_pat_iter it = end;
240 while (begin < it and r + 1 - l > 0)
241 {
242 --it;
243 backward_search(csa, l, r, (typename t_csa::char_type) * it, l, r);
244 }
245 l_res = l;
246 r_res = r;
247 return r + 1 - l;
248}
249
251
266template <class t_wt,
267 uint32_t t_dens,
268 uint32_t t_inv_dens,
269 class t_sa_sample_strat,
270 class t_isa,
271 class t_alphabet_strat>
274 typename csa_wt<>::size_type l_fwd,
275 typename csa_wt<>::size_type r_fwd,
276 typename csa_wt<>::size_type l_bwd,
277 typename csa_wt<>::size_type r_bwd,
278 typename csa_wt<>::char_type c,
279 typename csa_wt<>::size_type & l_fwd_res,
280 typename csa_wt<>::size_type & r_fwd_res,
281 typename csa_wt<>::size_type & l_bwd_res,
282 typename csa_wt<>::size_type & r_bwd_res,
283 SDSL_UNUSED typename std::enable_if<t_wt::lex_ordered, csa_tag>::type x = csa_tag())
284{
285 assert(l_fwd <= r_fwd);
286 assert(r_fwd < csa_fwd.size());
288 size_type c_begin = csa_fwd.C[csa_fwd.char2comp[c]];
289 auto r_s_b = csa_fwd.wavelet_tree.lex_count(l_fwd, r_fwd + 1, c);
290 size_type rank_l = std::get<0>(r_s_b);
291 size_type s = std::get<1>(r_s_b), b = std::get<2>(r_s_b);
292 size_type rank_r = r_fwd - l_fwd - s - b + rank_l;
293 l_fwd_res = c_begin + rank_l;
294 r_fwd_res = c_begin + rank_r;
295 assert(r_fwd_res + 1 >= l_fwd_res);
296 l_bwd_res = l_bwd + s;
297 r_bwd_res = r_bwd - b;
298 assert(r_bwd_res - l_bwd_res == r_fwd_res - l_fwd_res);
299 return r_fwd_res + 1 - l_fwd_res;
300}
301
303
332template <class t_pat_iter,
333 class t_wt,
334 uint32_t t_dens,
335 uint32_t t_inv_dens,
336 class t_sa_sample_strat,
337 class t_isa,
338 class t_alphabet_strat>
342 typename csa_wt<>::size_type l_fwd,
343 typename csa_wt<>::size_type r_fwd,
344 typename csa_wt<>::size_type l_bwd,
345 typename csa_wt<>::size_type r_bwd,
346 t_pat_iter begin,
347 t_pat_iter end,
348 typename csa_wt<>::size_type & l_fwd_res,
349 typename csa_wt<>::size_type & r_fwd_res,
350 typename csa_wt<>::size_type & l_bwd_res,
351 typename csa_wt<>::size_type & r_bwd_res,
352 SDSL_UNUSED typename std::enable_if<t_wt::lex_ordered, csa_tag>::type x = csa_tag())
353{
354 t_pat_iter it = end;
355 while (begin < it and r_fwd + 1 - l_fwd > 0)
356 {
357 --it;
358 bidirectional_search(csa_fwd,
359 l_fwd,
360 r_fwd,
361 l_bwd,
362 r_bwd,
363 (typename csa_wt<>::char_type) * it,
364 l_fwd,
365 r_fwd,
366 l_bwd,
367 r_bwd);
368 }
369 l_fwd_res = l_fwd;
370 r_fwd_res = r_fwd;
371 l_bwd_res = l_bwd;
372 r_bwd_res = r_bwd;
373 return r_fwd + 1 - l_fwd;
374}
375
377
406template <class t_pat_iter,
407 class t_wt,
408 uint32_t t_dens,
409 uint32_t t_inv_dens,
410 class t_sa_sample_strat,
411 class t_isa,
412 class t_alphabet_strat>
416 typename csa_wt<>::size_type l_fwd,
417 typename csa_wt<>::size_type r_fwd,
418 typename csa_wt<>::size_type l_bwd,
419 typename csa_wt<>::size_type r_bwd,
420 t_pat_iter begin,
421 t_pat_iter end,
422 typename csa_wt<>::size_type & l_fwd_res,
423 typename csa_wt<>::size_type & r_fwd_res,
424 typename csa_wt<>::size_type & l_bwd_res,
425 typename csa_wt<>::size_type & r_bwd_res,
426 SDSL_UNUSED typename std::enable_if<t_wt::lex_ordered, csa_tag>::type x = csa_tag())
427{
428 t_pat_iter it = begin;
429 while (it < end and r_fwd + 1 - l_fwd > 0)
430 {
431 bidirectional_search(csa_bwd,
432 l_bwd,
433 r_bwd,
434 l_fwd,
435 r_fwd,
436 (typename csa_wt<>::char_type) * it,
437 l_bwd,
438 r_bwd,
439 l_fwd,
440 r_fwd);
441 ++it;
442 }
443 l_fwd_res = l_fwd;
444 r_fwd_res = r_fwd;
445 l_bwd_res = l_bwd;
446 r_bwd_res = r_bwd;
447 return r_fwd + 1 - l_fwd;
448}
449
451
463template <class t_csa, class t_pat_iter>
464typename t_csa::size_type count(t_csa const & csa, t_pat_iter begin, t_pat_iter end, csa_tag)
465{
466 if (end - begin > (typename std::iterator_traits<t_pat_iter>::difference_type)csa.size())
467 return 0;
468 typename t_csa::size_type t = 0; // dummy variable for the backward_search call
469 typename t_csa::size_type result = backward_search(csa, 0, csa.size() - 1, begin, end, t, t);
470 return result;
471}
472
473template <class t_csx, class t_pat_iter>
474typename t_csx::size_type count(t_csx const & csx, t_pat_iter begin, t_pat_iter end)
475{
476 typename t_csx::index_category tag;
477 return count(csx, begin, end, tag);
478}
479
481
492template <class t_csx>
493typename t_csx::size_type count(t_csx const & csx, const typename t_csx::string_type & pat)
494{
495 typename t_csx::index_category tag;
496 return count(csx, pat.begin(), pat.end(), tag);
497}
498
500
511template <typename t_csx, typename t_pat_iter>
512auto lex_interval(t_csx const & csx, t_pat_iter begin, t_pat_iter end) -> std::array<typename t_csx::size_type, 2>
513{
514 std::array<typename t_csx::size_type, 2> res;
515 backward_search(csx, 0, csx.size() - 1, begin, end, res[0], res[1]);
516 return res;
517}
518
520
534template <class t_csa, class t_pat_iter, class t_rac = int_vector<64>>
535t_rac locate(t_csa const & csa,
536 t_pat_iter begin,
537 t_pat_iter end,
539 typename std::enable_if<std::is_same<csa_tag, typename t_csa::index_category>::value, csa_tag>::type x =
540 csa_tag())
541{
542 typename t_csa::size_type occ_begin, occ_end, occs;
543 occs = backward_search(csa, 0, csa.size() - 1, begin, end, occ_begin, occ_end);
544 t_rac occ(occs);
545 for (typename t_csa::size_type i = 0; i < occs; ++i)
546 {
547 occ[i] = csa[occ_begin + i];
548 }
549 return occ;
550}
551
553
565template <class t_csx, class t_rac = int_vector<64>>
566t_rac locate(t_csx const & csx, const typename t_csx::string_type & pat)
567{
568 typename t_csx::index_category tag;
569 return locate<t_csx, decltype(pat.begin()), t_rac>(csx, pat.begin(), pat.end(), tag);
570}
571
573
584template <class t_csa, class t_text_iter>
585typename t_csa::size_type extract(
586 t_csa const & csa,
587 typename t_csa::size_type begin,
588 typename t_csa::size_type end,
589 t_text_iter text,
591 typename std::enable_if<std::is_same<csa_tag, typename t_csa::index_category>::value, csa_tag>::type x = csa_tag())
592{
593 typename t_csa::extract_category extract_tag;
594 return extract(csa, begin, end, text, extract_tag);
595}
596
598template <class t_csa, class t_text_iter>
599typename t_csa::size_type
600extract(t_csa const & csa, typename t_csa::size_type begin, typename t_csa::size_type end, t_text_iter text, lf_tag)
601{
602 assert(end < csa.size());
603 assert(begin <= end);
604 auto steps = end - begin + 1;
605 if (steps > 0)
606 {
607 auto order = csa.isa[end];
608 text[--steps] = first_row_symbol(order, csa);
609 while (steps != 0)
610 {
611 auto rc = csa.wavelet_tree.inverse_select(order);
612 auto j = rc.first;
613 auto c = rc.second;
614 order = csa.C[csa.char2comp[c]] + j;
615 text[--steps] = c;
616 }
617 }
618 return end - begin + 1;
619}
620
622template <class t_csa, class t_text_iter>
623typename t_csa::size_type
624extract(t_csa const & csa, typename t_csa::size_type begin, typename t_csa::size_type end, t_text_iter text, psi_tag)
625{
626 assert(end < csa.size());
627 assert(begin <= end);
628 typename t_csa::size_type steps = end - begin + 1;
629 for (typename t_csa::size_type i = 0, order = csa.isa[begin]; steps != 0; --steps, ++i)
630 {
631 text[i] = first_row_symbol(order, csa);
632 if (steps != 0)
633 order = csa.psi[order];
634 }
635 return end - begin + 1;
636}
637
639
651template <class t_csa>
652typename t_csa::string_type extract(
653 t_csa const & csa,
654 typename t_csa::size_type begin,
655 typename t_csa::size_type end,
657 typename std::enable_if<std::is_same<csa_tag, typename t_csa::index_category>::value, csa_tag>::type x = csa_tag())
658{
659 assert(end <= csa.size());
660 assert(begin <= end);
661 typedef typename t_csa::string_type string_type;
662 string_type result(end - begin + 1, (typename string_type::value_type)0);
663 extract(csa, begin, end, result.begin());
664 return result;
665}
666
667} // namespace sdsl
668#endif
A class for the Compressed Suffix Array (CSA) based on a Wavelet Tree (WT) of the Burrow Wheeler Tran...
Definition csa_wt.hpp:57
wavelet_tree_type const & wavelet_tree
Definition csa_wt.hpp:130
const alphabet_type::char2comp_type & char2comp
Definition csa_wt.hpp:117
const alphabet_type::C_type & C
Definition csa_wt.hpp:119
alphabet_type::char_type char_type
Definition csa_wt.hpp:97
size_type size() const
Number of elements in the .
Definition csa_wt.hpp:164
int_vector ::size_type size_type
Definition csa_wt.hpp:84
size_type size() const noexcept
The number of elements in the int_vector.
#define SDSL_UNUSED
Definition config.hpp:12
csa_wt.hpp contains an implementation of the compressed suffix array based on a wavelet tree.
int_vector.hpp contains the sdsl::int_vector class.
Namespace for the succinct data structure library.
auto lex_interval(t_csx const &csx, t_pat_iter begin, t_pat_iter end) -> std::array< typename t_csx::size_type, 2 >
Returns the lexicographic interval of a pattern in the SA.
t_csa::size_type forward_search(t_csa const &csa, typename t_csa::size_type l, typename t_csa::size_type r, t_pat_iter begin, t_pat_iter end, typename t_csa::size_type &l_res, typename t_csa::size_type &r_res, SDSL_UNUSED typename std::enable_if< std::is_same< csa_tag, typename t_csa::index_category >::value, csa_tag >::type x=csa_tag())
Forward search for a pattern in an -interval in the CSA.
t_csa::size_type extract(t_csa const &csa, typename t_csa::size_type begin, typename t_csa::size_type end, t_text_iter text, SDSL_UNUSED typename std::enable_if< std::is_same< csa_tag, typename t_csa::index_category >::value, csa_tag >::type x=csa_tag())
Writes the substring T[begin..end] of the original text T to text[0..end-begin+1].
uint64_t count(t_k2_treap const &treap, k2_treap_ns::point_type p1, k2_treap_ns::point_type p2)
Count how many points are in the rectangle (p1,p2)
csa_wt ::size_type bidirectional_search_backward(csa_wt< t_wt, t_dens, t_inv_dens, t_sa_sample_strat, t_isa, t_alphabet_strat > const &csa_fwd, SDSL_UNUSED csa_wt< t_wt, t_dens, t_inv_dens, t_sa_sample_strat, t_isa, t_alphabet_strat > const &csa_bwd, typename csa_wt<>::size_type l_fwd, typename csa_wt<>::size_type r_fwd, typename csa_wt<>::size_type l_bwd, typename csa_wt<>::size_type r_bwd, t_pat_iter begin, t_pat_iter end, typename csa_wt<>::size_type &l_fwd_res, typename csa_wt<>::size_type &r_fwd_res, typename csa_wt<>::size_type &l_bwd_res, typename csa_wt<>::size_type &r_bwd_res, SDSL_UNUSED typename std::enable_if< t_wt::lex_ordered, csa_tag >::type x=csa_tag())
Bidirectional search in backward direction.
t_csa::char_type first_row_symbol(const typename t_csa::size_type i, t_csa const &csa)
Get the symbol at position i in the first row of the sorted suffixes of CSA.
csa_wt< t_wt >::size_type bidirectional_search_forward(SDSL_UNUSED csa_wt< t_wt, t_dens, t_inv_dens, t_sa_sample_strat, t_isa, t_alphabet_strat > const &csa_fwd, csa_wt< t_wt, t_dens, t_inv_dens, t_sa_sample_strat, t_isa, t_alphabet_strat > const &csa_bwd, typename csa_wt<>::size_type l_fwd, typename csa_wt<>::size_type r_fwd, typename csa_wt<>::size_type l_bwd, typename csa_wt<>::size_type r_bwd, t_pat_iter begin, t_pat_iter end, typename csa_wt<>::size_type &l_fwd_res, typename csa_wt<>::size_type &r_fwd_res, typename csa_wt<>::size_type &l_bwd_res, typename csa_wt<>::size_type &r_bwd_res, SDSL_UNUSED typename std::enable_if< t_wt::lex_ordered, csa_tag >::type x=csa_tag())
Bidirectional search in forward direction.
t_csa::size_type backward_search(t_csa const &csa, typename t_csa::size_type l, typename t_csa::size_type r, typename t_csa::char_type c, typename t_csa::size_type &l_res, typename t_csa::size_type &r_res, SDSL_UNUSED typename std::enable_if< std::is_same< csa_tag, typename t_csa::index_category >::value, csa_tag >::type x=csa_tag())
Backward search for a character c in an -interval in the CSA.
int_vector ::size_type size(range_type const &r)
Size of a range.
csa_wt< t_wt >::size_type bidirectional_search(csa_wt< t_wt, t_dens, t_inv_dens, t_sa_sample_strat, t_isa, t_alphabet_strat > const &csa_fwd, typename csa_wt<>::size_type l_fwd, typename csa_wt<>::size_type r_fwd, typename csa_wt<>::size_type l_bwd, typename csa_wt<>::size_type r_bwd, typename csa_wt<>::char_type c, typename csa_wt<>::size_type &l_fwd_res, typename csa_wt<>::size_type &r_fwd_res, typename csa_wt<>::size_type &l_bwd_res, typename csa_wt<>::size_type &r_bwd_res, SDSL_UNUSED typename std::enable_if< t_wt::lex_ordered, csa_tag >::type x=csa_tag())
Bidirectional search for a character c on an interval of the suffix array.
t_rac locate(t_csa const &csa, t_pat_iter begin, t_pat_iter end, SDSL_UNUSED typename std::enable_if< std::is_same< csa_tag, typename t_csa::index_category >::value, csa_tag >::type x=csa_tag())
Calculates all occurrences of a pattern pat in a CSA.
Contains declarations and definitions of data structure concepts.
suffix_array_helper.hpp contains some helper classes for CSTs
wt_pc.hpp contains a class for the wavelet tree of byte sequences.