SDSL 3.0.1
Succinct Data Structure Library
Loading...
Searching...
No Matches
suffix_array_algorithm.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
8#ifndef INCLUDED_SDSL_SUFFIX_ARRAY_ALGORITHM
9#define INCLUDED_SDSL_SUFFIX_ARRAY_ALGORITHM
10
11#include <iterator>
12
13#include <sdsl/csa_wt.hpp>
15
16namespace sdsl
17{
18
20
35template <class t_csa, class t_pat_iter>
36typename t_csa::size_type
37forward_search(const t_csa & csa,
38 typename t_csa::size_type l,
39 typename t_csa::size_type r,
40 t_pat_iter begin,
41 t_pat_iter end,
42 typename t_csa::size_type & l_res,
43 typename t_csa::size_type & r_res,
45 typename std::enable_if<std::is_same<csa_tag, typename t_csa::index_category>::value, csa_tag>::type x =
46 csa_tag())
47{
48 assert(l <= r);
49 assert(r < csa.size());
50
51 auto size = csa.size();
52
53 l_res = l;
54 r_res = l - 1;
55 auto l_res_upper = r + 1;
56 auto r_res_upper = r + 1;
57
58 // shortcut for too long patterns
59 if ((typename t_csa::size_type)(end - begin) >= size) return 0;
60
61 // compares the pattern with CSA-prefix i (truncated to length $|pattern|$).
62 auto compare = [&](typename t_csa::size_type i) -> int {
63 for (auto current = begin; current != end; current++)
64 {
65 auto index = csa.char2comp[*current];
66 if (index == 0) return -1;
67 if (csa.C[index + 1] - 1 < i) return -1;
68 if (csa.C[index] > i) return 1;
69 i = csa.psi[i];
70 }
71 return 0;
72 };
73
74 // binary search (on min)
75 while (l_res < l_res_upper)
76 {
77 typename t_csa::size_type sample = l_res + (l_res_upper - l_res) / 2;
78 int result = compare(sample);
79 if (result == 1)
80 l_res = sample + 1;
81 else if (result == -1)
82 l_res_upper = sample;
83 else
84 l_res_upper = sample;
85 }
86
87 // binary search (on max)
88 while (r_res + 1 < r_res_upper)
89 {
90 typename t_csa::size_type sample = r_res + (r_res_upper - r_res) / 2;
91 int result = compare(sample);
92 if (result == 1)
93 r_res = sample;
94 else if (result == -1)
95 r_res_upper = sample;
96 else
97 r_res = sample;
98 }
99
100 return r_res - l_res + 1;
101}
102
104
117template <class t_csa>
118typename t_csa::size_type
119forward_search(const t_csa & csa,
120 typename t_csa::size_type l,
121 typename t_csa::size_type r,
122 typename t_csa::char_type c,
123 typename t_csa::size_type & l_res,
124 typename t_csa::size_type & r_res,
126 typename std::enable_if<std::is_same<csa_tag, typename t_csa::index_category>::value, csa_tag>::type x =
127 csa_tag())
128{
129 auto c_ptr = &c;
130 return forward_search(csa, l, r, c_ptr, c_ptr + 1, l_res, r_res);
131}
132
134
155template <class t_csa>
156typename t_csa::size_type
157backward_search(const t_csa & csa,
158 typename t_csa::size_type l,
159 typename t_csa::size_type r,
160 typename t_csa::char_type c,
161 typename t_csa::size_type & l_res,
162 typename t_csa::size_type & r_res,
164 typename std::enable_if<std::is_same<csa_tag, typename t_csa::index_category>::value, csa_tag>::type x =
165 csa_tag())
166{
167 assert(l <= r);
168 assert(r < csa.size());
169 typename t_csa::size_type cc = csa.char2comp[c];
170 if (cc == 0 and c > 0)
171 {
172 l_res = 1;
173 r_res = 0;
174 }
175 else
176 {
177 typename t_csa::size_type c_begin = csa.C[cc];
178 if (l == 0 and r + 1 == csa.size())
179 {
180 l_res = c_begin;
181 r_res = csa.C[cc + 1] - 1;
182 }
183 else
184 {
185 l_res = c_begin + csa.bwt.rank(l, c); // count c in bwt[0..l-1]
186 r_res = c_begin + csa.bwt.rank(r + 1, c) - 1; // count c in bwt[0..r]
187 }
188 }
189 assert(r_res + 1 - l_res >= 0);
190 return r_res + 1 - l_res;
191}
192
194
217template <class t_csa, class t_pat_iter>
218typename t_csa::size_type
219backward_search(const t_csa & csa,
220 typename t_csa::size_type l,
221 typename t_csa::size_type r,
222 t_pat_iter begin,
223 t_pat_iter end,
224 typename t_csa::size_type & l_res,
225 typename t_csa::size_type & r_res,
227 typename std::enable_if<std::is_same<csa_tag, typename t_csa::index_category>::value, csa_tag>::type x =
228 csa_tag())
229{
230 t_pat_iter it = end;
231 while (begin < it and r + 1 - l > 0)
232 {
233 --it;
234 backward_search(csa, l, r, (typename t_csa::char_type) * it, l, r);
235 }
236 l_res = l;
237 r_res = r;
238 return r + 1 - l;
239}
240
242
257template <class t_wt,
258 uint32_t t_dens,
259 uint32_t t_inv_dens,
260 class t_sa_sample_strat,
261 class t_isa,
262 class t_alphabet_strat>
265 typename csa_wt<>::size_type l_fwd,
266 typename csa_wt<>::size_type r_fwd,
267 typename csa_wt<>::size_type l_bwd,
268 typename csa_wt<>::size_type r_bwd,
269 typename csa_wt<>::char_type c,
270 typename csa_wt<>::size_type & l_fwd_res,
271 typename csa_wt<>::size_type & r_fwd_res,
272 typename csa_wt<>::size_type & l_bwd_res,
273 typename csa_wt<>::size_type & r_bwd_res,
274 SDSL_UNUSED typename std::enable_if<t_wt::lex_ordered, csa_tag>::type x = csa_tag())
275{
276 assert(l_fwd <= r_fwd);
277 assert(r_fwd < csa_fwd.size());
279 size_type c_begin = csa_fwd.C[csa_fwd.char2comp[c]];
280 auto r_s_b = csa_fwd.wavelet_tree.lex_count(l_fwd, r_fwd + 1, c);
281 size_type rank_l = std::get<0>(r_s_b);
282 size_type s = std::get<1>(r_s_b), b = std::get<2>(r_s_b);
283 size_type rank_r = r_fwd - l_fwd - s - b + rank_l;
284 l_fwd_res = c_begin + rank_l;
285 r_fwd_res = c_begin + rank_r;
286 assert(r_fwd_res + 1 >= l_fwd_res);
287 l_bwd_res = l_bwd + s;
288 r_bwd_res = r_bwd - b;
289 assert(r_bwd_res - l_bwd_res == r_fwd_res - l_fwd_res);
290 return r_fwd_res + 1 - l_fwd_res;
291}
292
294
323template <class t_pat_iter,
324 class t_wt,
325 uint32_t t_dens,
326 uint32_t t_inv_dens,
327 class t_sa_sample_strat,
328 class t_isa,
329 class t_alphabet_strat>
330typename csa_wt<>::size_type
332 csa_fwd,
333 SDSL_UNUSED const csa_wt<t_wt,
334 t_dens,
335 t_inv_dens,
336 t_sa_sample_strat,
337 t_isa,
338 t_alphabet_strat> & csa_bwd,
339 typename csa_wt<>::size_type l_fwd,
340 typename csa_wt<>::size_type r_fwd,
341 typename csa_wt<>::size_type l_bwd,
342 typename csa_wt<>::size_type r_bwd,
343 t_pat_iter begin,
344 t_pat_iter end,
345 typename csa_wt<>::size_type & l_fwd_res,
346 typename csa_wt<>::size_type & r_fwd_res,
347 typename csa_wt<>::size_type & l_bwd_res,
348 typename csa_wt<>::size_type & r_bwd_res,
349 SDSL_UNUSED typename std::enable_if<t_wt::lex_ordered, csa_tag>::type x = csa_tag())
350{
351 t_pat_iter it = end;
352 while (begin < it and r_fwd + 1 - l_fwd > 0)
353 {
354 --it;
355 bidirectional_search(csa_fwd,
356 l_fwd,
357 r_fwd,
358 l_bwd,
359 r_bwd,
360 (typename csa_wt<>::char_type) * it,
361 l_fwd,
362 r_fwd,
363 l_bwd,
364 r_bwd);
365 }
366 l_fwd_res = l_fwd;
367 r_fwd_res = r_fwd;
368 l_bwd_res = l_bwd;
369 r_bwd_res = r_bwd;
370 return r_fwd + 1 - l_fwd;
371}
372
374
403template <class t_pat_iter,
404 class t_wt,
405 uint32_t t_dens,
406 uint32_t t_inv_dens,
407 class t_sa_sample_strat,
408 class t_isa,
409 class t_alphabet_strat>
412 t_dens,
413 t_inv_dens,
414 t_sa_sample_strat,
415 t_isa,
416 t_alphabet_strat> & csa_fwd,
418 csa_bwd,
419 typename csa_wt<>::size_type l_fwd,
420 typename csa_wt<>::size_type r_fwd,
421 typename csa_wt<>::size_type l_bwd,
422 typename csa_wt<>::size_type r_bwd,
423 t_pat_iter begin,
424 t_pat_iter end,
425 typename csa_wt<>::size_type & l_fwd_res,
426 typename csa_wt<>::size_type & r_fwd_res,
427 typename csa_wt<>::size_type & l_bwd_res,
428 typename csa_wt<>::size_type & r_bwd_res,
429 SDSL_UNUSED typename std::enable_if<t_wt::lex_ordered, csa_tag>::type x = csa_tag())
430{
431 t_pat_iter it = begin;
432 while (it < end and r_fwd + 1 - l_fwd > 0)
433 {
434 bidirectional_search(csa_bwd,
435 l_bwd,
436 r_bwd,
437 l_fwd,
438 r_fwd,
439 (typename csa_wt<>::char_type) * it,
440 l_bwd,
441 r_bwd,
442 l_fwd,
443 r_fwd);
444 ++it;
445 }
446 l_fwd_res = l_fwd;
447 r_fwd_res = r_fwd;
448 l_bwd_res = l_bwd;
449 r_bwd_res = r_bwd;
450 return r_fwd + 1 - l_fwd;
451}
452
454
466template <class t_csa, class t_pat_iter>
467typename t_csa::size_type count(const t_csa & csa, t_pat_iter begin, t_pat_iter end, csa_tag)
468{
469 if (end - begin > (typename std::iterator_traits<t_pat_iter>::difference_type)csa.size()) return 0;
470 typename t_csa::size_type t = 0; // dummy variable for the backward_search call
471 typename t_csa::size_type result = backward_search(csa, 0, csa.size() - 1, begin, end, t, t);
472 return result;
473}
474
475template <class t_csx, class t_pat_iter>
476typename t_csx::size_type count(const t_csx & csx, t_pat_iter begin, t_pat_iter end)
477{
478 typename t_csx::index_category tag;
479 return count(csx, begin, end, tag);
480}
481
483
494template <class t_csx>
495typename t_csx::size_type count(const t_csx & csx, const typename t_csx::string_type & pat)
496{
497 typename t_csx::index_category tag;
498 return count(csx, pat.begin(), pat.end(), tag);
499}
500
502
513template <typename t_csx, typename t_pat_iter>
514auto lex_interval(const t_csx & csx, t_pat_iter begin, t_pat_iter end) -> std::array<typename t_csx::size_type, 2>
515{
516 std::array<typename t_csx::size_type, 2> res;
517 backward_search(csx, 0, csx.size() - 1, begin, end, res[0], res[1]);
518 return res;
519}
520
522
536template <class t_csa, class t_pat_iter, class t_rac = int_vector<64>>
537t_rac locate(const t_csa & csa,
538 t_pat_iter begin,
539 t_pat_iter end,
541 typename std::enable_if<std::is_same<csa_tag, typename t_csa::index_category>::value, csa_tag>::type x =
542 csa_tag())
543{
544 typename t_csa::size_type occ_begin, occ_end, occs;
545 occs = backward_search(csa, 0, csa.size() - 1, begin, end, occ_begin, occ_end);
546 t_rac occ(occs);
547 for (typename t_csa::size_type i = 0; i < occs; ++i) { occ[i] = csa[occ_begin + i]; }
548 return occ;
549}
550
552
564template <class t_csx, class t_rac = int_vector<64>>
565t_rac locate(const t_csx & csx, const typename t_csx::string_type & pat)
566{
567 typename t_csx::index_category tag;
568 return locate<t_csx, decltype(pat.begin()), t_rac>(csx, pat.begin(), pat.end(), tag);
569}
570
572
583template <class t_csa, class t_text_iter>
584typename t_csa::size_type extract(const t_csa & csa,
585 typename t_csa::size_type begin,
586 typename t_csa::size_type end,
587 t_text_iter text,
589 typename std::enable_if<std::is_same<csa_tag, typename t_csa::index_category>::value,
590 csa_tag>::type x = csa_tag())
591{
592 typename t_csa::extract_category extract_tag;
593 return extract(csa, begin, end, text, extract_tag);
594}
595
597template <class t_csa, class t_text_iter>
598typename t_csa::size_type extract(const t_csa & csa,
599 typename t_csa::size_type begin,
600 typename t_csa::size_type end,
601 t_text_iter text,
602 lf_tag)
603{
604 assert(end < csa.size());
605 assert(begin <= end);
606 auto steps = end - begin + 1;
607 if (steps > 0)
608 {
609 auto order = csa.isa[end];
610 text[--steps] = first_row_symbol(order, csa);
611 while (steps != 0)
612 {
613 auto rc = csa.wavelet_tree.inverse_select(order);
614 auto j = rc.first;
615 auto c = rc.second;
616 order = csa.C[csa.char2comp[c]] + j;
617 text[--steps] = c;
618 }
619 }
620 return end - begin + 1;
621}
622
624template <class t_csa, class t_text_iter>
625typename t_csa::size_type extract(const t_csa & csa,
626 typename t_csa::size_type begin,
627 typename t_csa::size_type end,
628 t_text_iter text,
629 psi_tag)
630{
631 assert(end < csa.size());
632 assert(begin <= end);
633 typename t_csa::size_type steps = end - begin + 1;
634 for (typename t_csa::size_type i = 0, order = csa.isa[begin]; steps != 0; --steps, ++i)
635 {
636 text[i] = first_row_symbol(order, csa);
637 if (steps != 0) order = csa.psi[order];
638 }
639 return end - begin + 1;
640}
641
643
655template <class t_csa>
656typename t_csa::string_type
657extract(const t_csa & csa,
658 typename t_csa::size_type begin,
659 typename t_csa::size_type end,
661 typename std::enable_if<std::is_same<csa_tag, typename t_csa::index_category>::value, csa_tag>::type x =
662 csa_tag())
663{
664 assert(end <= csa.size());
665 assert(begin <= end);
666 typedef typename t_csa::string_type string_type;
667 string_type result(end - begin + 1, (typename string_type::value_type)0);
668 extract(csa, begin, end, result.begin());
669 return result;
670}
671
672} // namespace sdsl
673#endif
A class for the Compressed Suffix Array (CSA) based on a Wavelet Tree (WT) of the Burrow Wheeler Tran...
Definition: csa_wt.hpp:56
const alphabet_type::char2comp_type & char2comp
Definition: csa_wt.hpp:116
const alphabet_type::C_type & C
Definition: csa_wt.hpp:118
alphabet_type::char_type char_type
Definition: csa_wt.hpp:96
size_type size() const
Number of elements in the .
Definition: csa_wt.hpp:163
const wavelet_tree_type & wavelet_tree
Definition: csa_wt.hpp:129
int_vector ::size_type size_type
Definition: csa_wt.hpp:83
size_type size() const noexcept
The number of elements in the int_vector.
#define SDSL_UNUSED
Definition: config.hpp:13
csa_wt.hpp contains an implementation of the compressed suffix array based on a wavelet tree.
Namespace for the succinct data structure library.
t_csa::char_type first_row_symbol(const typename t_csa::size_type i, const t_csa &csa)
Get the symbol at position i in the first row of the sorted suffixes of CSA.
auto lex_interval(const t_csx &csx, t_pat_iter begin, t_pat_iter end) -> std::array< typename t_csx::size_type, 2 >
Returns the lexicographic interval of a pattern in the SA.
csa_wt ::size_type bidirectional_search_backward(const csa_wt< t_wt, t_dens, t_inv_dens, t_sa_sample_strat, t_isa, t_alphabet_strat > &csa_fwd, SDSL_UNUSED const csa_wt< t_wt, t_dens, t_inv_dens, t_sa_sample_strat, t_isa, t_alphabet_strat > &csa_bwd, typename csa_wt<>::size_type l_fwd, typename csa_wt<>::size_type r_fwd, typename csa_wt<>::size_type l_bwd, typename csa_wt<>::size_type r_bwd, t_pat_iter begin, t_pat_iter end, typename csa_wt<>::size_type &l_fwd_res, typename csa_wt<>::size_type &r_fwd_res, typename csa_wt<>::size_type &l_bwd_res, typename csa_wt<>::size_type &r_bwd_res, SDSL_UNUSED typename std::enable_if< t_wt::lex_ordered, csa_tag >::type x=csa_tag())
Bidirectional search in backward direction.
uint64_t count(const t_k2_treap &treap, k2_treap_ns::point_type p1, k2_treap_ns::point_type p2)
Count how many points are in the rectangle (p1,p2)
csa_wt< t_wt >::size_type bidirectional_search_forward(SDSL_UNUSED const csa_wt< t_wt, t_dens, t_inv_dens, t_sa_sample_strat, t_isa, t_alphabet_strat > &csa_fwd, const csa_wt< t_wt, t_dens, t_inv_dens, t_sa_sample_strat, t_isa, t_alphabet_strat > &csa_bwd, typename csa_wt<>::size_type l_fwd, typename csa_wt<>::size_type r_fwd, typename csa_wt<>::size_type l_bwd, typename csa_wt<>::size_type r_bwd, t_pat_iter begin, t_pat_iter end, typename csa_wt<>::size_type &l_fwd_res, typename csa_wt<>::size_type &r_fwd_res, typename csa_wt<>::size_type &l_bwd_res, typename csa_wt<>::size_type &r_bwd_res, SDSL_UNUSED typename std::enable_if< t_wt::lex_ordered, csa_tag >::type x=csa_tag())
Bidirectional search in forward direction.
t_csa::size_type forward_search(const t_csa &csa, typename t_csa::size_type l, typename t_csa::size_type r, t_pat_iter begin, t_pat_iter end, typename t_csa::size_type &l_res, typename t_csa::size_type &r_res, SDSL_UNUSED typename std::enable_if< std::is_same< csa_tag, typename t_csa::index_category >::value, csa_tag >::type x=csa_tag())
Forward search for a pattern in an -interval in the CSA.
csa_wt< t_wt >::size_type bidirectional_search(const csa_wt< t_wt, t_dens, t_inv_dens, t_sa_sample_strat, t_isa, t_alphabet_strat > &csa_fwd, typename csa_wt<>::size_type l_fwd, typename csa_wt<>::size_type r_fwd, typename csa_wt<>::size_type l_bwd, typename csa_wt<>::size_type r_bwd, typename csa_wt<>::char_type c, typename csa_wt<>::size_type &l_fwd_res, typename csa_wt<>::size_type &r_fwd_res, typename csa_wt<>::size_type &l_bwd_res, typename csa_wt<>::size_type &r_bwd_res, SDSL_UNUSED typename std::enable_if< t_wt::lex_ordered, csa_tag >::type x=csa_tag())
Bidirectional search for a character c on an interval of the suffix array.
t_rac locate(const t_csa &csa, t_pat_iter begin, t_pat_iter end, SDSL_UNUSED typename std::enable_if< std::is_same< csa_tag, typename t_csa::index_category >::value, csa_tag >::type x=csa_tag())
Calculates all occurrences of a pattern pat in a CSA.
t_csa::size_type extract(const t_csa &csa, typename t_csa::size_type begin, typename t_csa::size_type end, t_text_iter text, SDSL_UNUSED typename std::enable_if< std::is_same< csa_tag, typename t_csa::index_category >::value, csa_tag >::type x=csa_tag())
Writes the substring T[begin..end] of the original text T to text[0..end-begin+1].
t_csa::size_type backward_search(const t_csa &csa, typename t_csa::size_type l, typename t_csa::size_type r, typename t_csa::char_type c, typename t_csa::size_type &l_res, typename t_csa::size_type &r_res, SDSL_UNUSED typename std::enable_if< std::is_same< csa_tag, typename t_csa::index_category >::value, csa_tag >::type x=csa_tag())
Backward search for a character c in an -interval in the CSA.
int_vector ::size_type size(const range_type &r)
Size of a range.
Definition: wt_helper.hpp:787
suffix_array_helper.hpp contains some helper classes for CSTs