SDSL 3.0.2
Succinct Data Structure Library
Loading...
Searching...
No Matches
construct_lcp.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
8#ifndef INCLUDED_SDSL_CONSTRUCT_LCP
9#define INCLUDED_SDSL_CONSTRUCT_LCP
10
11#include <algorithm>
12#include <assert.h>
13#include <iostream>
14#include <memory>
15#include <queue>
16#include <stdint.h>
17#include <string>
18#include <utility>
19#include <vector>
20
21#include <sdsl/bits.hpp>
22#include <sdsl/config.hpp>
25#include <sdsl/int_vector.hpp>
27#include <sdsl/io.hpp>
31#include <sdsl/util.hpp>
32#include <sdsl/wt_algorithm.hpp>
33#include <sdsl/wt_huff.hpp>
34
35//#define STUDY_INFORMATIONS
36
37namespace sdsl
38{
39
40// Forward declaration of construction method
41// template <class t_index>
42// void construct(t_index& idx, const std::string& file);
43
45
62template <uint8_t t_width>
64{
65 static_assert(t_width == 0 or t_width == 8,
66 "construct_lcp_kasai: width must be `0` for integer alphabet and `8` for byte alphabet");
67 int_vector<> lcp;
68 typedef int_vector<>::size_type size_type;
69 construct_isa(config);
70 {
73 {
74 return;
75 }
77 std::ios::in,
78 1000000); // init isa file_buffer
79 int_vector<> sa;
80 if (!load_from_cache(sa, conf::KEY_SA, config))
81 {
82 return;
83 }
84 // use Kasai algorithm to compute the lcp values
85 for (size_type i = 0, j = 0, sa_1 = 0, l = 0, n = isa_buf.size(); i < n; ++i)
86 {
87 sa_1 = isa_buf[i]; // = isa[i]
88 if (sa_1)
89 {
90 j = sa[sa_1 - 1];
91 if (l)
92 --l;
93 assert(i != j);
94 while (text[i + l] == text[j + l])
95 { // i+l < n and j+l < n are not necessary, since text[n]=0 and text[i]!=0 (i<n) and i!=j
96 ++l;
97 }
98 sa[sa_1 - 1] = l; // overwrite sa array with lcp values
99 }
100 else
101 {
102 l = 0;
103 sa[n - 1] = 0;
104 }
105 }
106
107 for (size_type i = sa.size(); i > 1; --i)
108 {
109 sa[i - 1] = sa[i - 2];
110 }
111 sa[0] = 0;
112 lcp = std::move(sa);
113 }
114 store_to_cache(lcp, conf::KEY_LCP, config);
115}
116
118
133template <uint8_t t_width>
135{
136 static_assert(t_width == 0 or t_width == 8,
137 "construct_lcp_PHI: width must be `0` for integer alphabet and `8` for byte alphabet");
138 typedef int_vector<>::size_type size_type;
139 typedef int_vector<t_width> text_type;
140 char const * KEY_TEXT = key_text_trait<t_width>::KEY_TEXT;
142 size_type n = sa_buf.size();
143
144 assert(n > 0);
145 if (1 == n)
146 { // Handle special case: Input only the sentinel character.
147 int_vector<> lcp(1, 0);
148 store_to_cache(lcp, conf::KEY_LCP, config);
149 return;
150 }
151
152 // (1) Calculate PHI (stored in array plcp)
153 int_vector<> plcp(n, 0, sa_buf.width());
154 for (size_type i = 0, sai_1 = 0; i < n; ++i)
155 {
156 size_type sai = sa_buf[i];
157 plcp[sai] = sai_1;
158 sai_1 = sai;
159 }
160
161 // (2) Load text from disk
162 text_type text;
163 load_from_cache(text, KEY_TEXT, config);
164
165 // (3) Calculate permuted LCP array (text order), called PLCP
166 size_type max_l = 0;
167 for (size_type i = 0, l = 0; i < n - 1; ++i)
168 {
169 size_type phii = plcp[i];
170 while (text[i + l] == text[phii + l])
171 {
172 ++l;
173 }
174 plcp[i] = l;
175 if (l)
176 {
177 max_l = std::max(max_l, l);
178 --l;
179 }
180 }
181 util::clear(text);
182 uint8_t lcp_width = bits::hi(max_l) + 1;
183
184 // (4) Transform PLCP into LCP
185 std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
186 size_type buffer_size = 1000000; // buffer_size is a multiple of 8!
187 int_vector_buffer<> lcp_buf(lcp_file, std::ios::out, buffer_size, lcp_width); // open buffer for lcp
188 lcp_buf[0] = 0;
189 sa_buf.buffersize(buffer_size);
190 for (size_type i = 1; i < n; ++i)
191 {
192 size_type sai = sa_buf[i];
193 lcp_buf[i] = plcp[sai];
194 }
195 lcp_buf.close();
197}
198
200
217{
218 typedef int_vector<>::size_type size_type;
220 size_type n = sa_buf.size();
221 if (1 == n)
222 {
223 int_vector<> lcp(1, 0);
224 store_to_cache(lcp, conf::KEY_LCP, config);
225 return;
226 }
227 const uint8_t log_q = 6; // => q=64
228 const uint32_t q = 1 << log_q;
229 const uint64_t modq = bits::lo_set[log_q];
230
231 // n-1 is the maximum entry in SA
232 int_vector<64> plcp((n - 1 + q) >> log_q);
233
234 for (size_type i = 0, sai_1 = 0; i < n; ++i)
235 { // we can start at i=0. if SA[i]%q==0
236 // we set PHI[(SA[i]=n-1)%q]=0, since T[0]!=T[n-1]
237 size_type sai = sa_buf[i];
238 if ((sai & modq) == 0)
239 {
240 if ((sai >> log_q) >= plcp.size())
241 {
242 // std::cerr<<"sai="<<sai<<" log_q="<<log_q<<" sai>>log_q="<<(sai>>log_q)<<"
243 // "<<sai_1<<std::endl; std::cerr<<"n="<<n<<" "<<" plcp.size()="<<plcp.size();
244 }
245 plcp[sai >> log_q] = sai_1;
246 }
247 sai_1 = sai;
248 }
249
250 int_vector<8> text;
251 load_from_cache(text, conf::KEY_TEXT, config);
252
253 for (size_type i = 0, j, k, l = 0; i < plcp.size(); ++i)
254 {
255 j = i << log_q; // j=i*q
256 k = plcp[i];
257 while (text[j + l] == text[k + l])
258 ++l;
259 plcp[i] = l;
260 if (l >= q)
261 {
262 l -= q;
263 }
264 else
265 {
266 l = 0;
267 }
268 }
269
270 size_type buffer_size = 4000000; // buffer_size is a multiple of 8!
271 sa_buf.buffersize(buffer_size);
273 std::ios::out,
274 buffer_size,
275 sa_buf.width()); // open buffer for plcp
276
277 for (size_type i = 0, sai_1 = 0, l = 0, sai = 0, iq = 0; i < n; ++i)
278 {
279 /*size_type*/ sai = sa_buf[i];
280 // std::cerr<<"i="<<i<<" sai="<<sai<<std::endl;
281 if ((sai & modq) == 0)
282 { // we have already worked the value out ;)
283 lcp_out_buf[i] = l = plcp[sai >> log_q];
284 }
285 else
286 {
287 /*size_type*/ iq = sai & bits::lo_unset[log_q];
288 l = plcp[sai >> log_q];
289 if (l > (sai - iq))
290 l -= (sai - iq);
291 else
292 l = 0;
293 while (text[sai + l] == text[sai_1 + l])
294 ++l;
295 lcp_out_buf[i] = l;
296 }
297#ifdef CHECK_LCP
298 size_type j = 0;
299 for (j = 0; j < l; ++j)
300 {
301 if (text[sai + j] != text[sai_1 + j])
302 {
303 std::cout << "lcp[" << i << "]=" << l << " is two big! " << j << " is right!"
304 << " sai=" << sai << std::endl;
305 if ((sai & modq) != 0)
306 std::cout << " plcp[sai>>log_q]=" << plcp[sai >> log_q] << " sai-iq=" << sai - iq << " sai=" << sai
307 << " sai-iq=" << sai - iq << std::endl;
308 break;
309 }
310 }
311#endif
312 sai_1 = sai;
313 }
314 lcp_out_buf.close();
316 return;
317}
318
320
338inline void construct_lcp_go(cache_config & config)
339{
340 typedef int_vector<>::size_type size_type;
341#ifdef STUDY_INFORMATIONS
342 size_type racs = 0; // random accesses to the text
343 size_type matches = 0;
344 size_type comps2 = 0; // comparisons the second phase
345#endif
346 int_vector<8> text;
347 load_from_cache(text, conf::KEY_TEXT, config);
348 int_vector_buffer<> sa_buf(cache_file_name(conf::KEY_SA, config)); // initialize buffer for suffix array
349 const size_type n = sa_buf.size();
350 const size_type m = 254; // LCP[i] == m+1 corresp. to LCP[i]>= m+1; LCP[i] <= m corresp. to LCP[i] was calculated
351
352 if (1 == n)
353 {
354 int_vector<> lcp(1, 0);
355 store_to_cache(lcp, conf::KEY_LCP, config);
356 return;
357 }
358
359 size_type cnt_c[257] = {0}; // counter for each character in the text
360 size_type cnt_cc[257] = {0}; // prefix sum of the counter cnt_c
361 size_type cnt_cc2[257] = {0}; //
362 size_type omitted_c[257] = {0}; // counts the omitted occurrences for the second phase
363 size_type prev_occ_in_bwt[256] = {0}; // position of the previous occurrence of each character c in the bwt
364 for (size_type i = 0; i < 256; ++i)
365 prev_occ_in_bwt[i] = (size_type)-1; // initialize the array with -1
366 unsigned char alphabet[257] = {0};
367 uint8_t sigma = 0;
368
369 tLI m_list[2][256];
370 size_type m_char_count[2] = {0};
371 uint8_t m_chars[2][256] = {{0}, {0}};
372
373 size_type nn = 0; // n' for phase 2
374 // phase 1: calculate lcp_sml; memory consumption: 2n bytes (lcp_sml=n bytes, text=n bytes)
375 {
376
377 int_vector<8> lcp_sml(n,
378 0); // initialize array for small values of first phase; note lcp[0]=0
379 size_type done_cnt = 0;
380
381 for (size_type i = 0; i < n; ++i)
382 { // initialize cnt_c
383 ++cnt_c[text[i] + 1];
384 }
385 for (int i = 1; i < 257; ++i)
386 { // calculate sigma and initailize cnt_cc
387 if (cnt_c[i] > 0)
388 {
389 alphabet[sigma++] = (unsigned char)(i - 1);
390 }
391 cnt_cc[i] = cnt_c[i] + cnt_cc[i - 1];
392 }
393 alphabet[sigma] = '\0';
394 {
395 int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // initialize buffer of bwt
396 size_type sai_1 = sa_buf[0]; // store value of sa[i-1]
397 uint8_t bwti_1 = bwt_buf[0]; // store value of BWT[i-1]
398 lcp_sml[cnt_cc[bwti_1]++] = 0; // lcp_sml[ LF[0] ] = 0
399 prev_occ_in_bwt[bwti_1] = 0; // init previous occurence of character BWT[0]
400 ++omitted_c[alphabet[0]]; //
401
402 int_vector<64> rmq_stack(2 * (m + 10)); // initialize stack for m+10 elements representing (position, value)
403 rmq_stack[0] = 0;
404 rmq_stack[1] = 0; // first element (-1, -1)
405 rmq_stack[2] = 1;
406 rmq_stack[3] = 0; // second element (0, -1)
407 size_type rmq_end = 3; // index of the value of the topmost element
408
409 const size_type m_mod2 = m % 2;
410 uint8_t cur_c = alphabet[1];
411 size_type big_val = 0;
412 for (size_type i = 1, sai, cur_c_idx = 1, cur_c_cnt = cnt_c[alphabet[1] + 1]; i < n; ++i, --cur_c_cnt)
413 {
414 uint8_t bwti = bwt_buf[i];
415 sai = sa_buf[i];
416 size_type lf = cnt_cc[bwti];
417 if (!cur_c_cnt)
418 { // cur_c_cnt==0, if there is no more occurence of the current character
419 if (cur_c_cnt < sigma)
420 {
421 cur_c_cnt = cnt_c[(cur_c = alphabet[++cur_c_idx]) + 1];
422 }
423 }
424 size_type l = 0;
425 if (i >= cnt_cc[cur_c])
426 { // if the current lcp entry is not already done TODO: schleife von i bis cnt_cc[cur_c]
427 if (bwti == bwti_1 and lf < i)
428 { // BWT[i]==BWT[i-1]
429 l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0; // l = LCP[LF[i]]-1; l < m+1
430 if (l == m)
431 { // if LCP[LF[i]] == m+1; otherwise LCP[LF[i]] < m+1 the result is correct
432 l += (text[sai_1 + m] == text[sai + m]);
433#ifdef STUDY_INFORMATIONS
434 if ((sai_1 ^ sai) >> 6) // if i and phii are in the same cache line
435 ++racs;
436#endif
437 }
438 lcp_sml[i] = l;
439 ++done_cnt;
440 }
441 else
442 { // BWT[i] != BWT[i-1] or LF[i] > i
443 if (lf < i)
444 l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0;
445#ifdef STUDY_INFORMATIONS
446 if ((sai_1 ^ sai) >> 6) // if i and phii are in the same cache line
447 ++racs;
448#endif
449 while (text[sai_1 + l] == text[sai + l] and l < m + 1)
450 {
451 ++l;
452#ifdef STUDY_INFORMATIONS
453 ++matches;
454#endif
455 }
456 lcp_sml[i] = l;
457 }
458 }
459 else
460 { // if already done
461 l = lcp_sml[i]; // load LCP value
462 }
463 if (l > m)
464 {
465 ++big_val;
466 if (i > 10000 and i < 10500 and big_val > 3000)
467 { // if most of the values are big: switch to PHI algorithm
468 util::clear(text);
469 util::clear(lcp_sml);
470 construct_lcp_PHI<8>(config);
471 return;
472 }
473 }
474 // invariant: l <= m+1
475 // begin update rmq_stack
476 size_type x = l + 1;
477 size_type j = rmq_end;
478 while (x <= rmq_stack[j])
479 j -= 2; // pop all elements with value >= l
480 rmq_stack[++j] = i + 1; // push position i
481 rmq_stack[++j] = x; // push value l
482 rmq_end = j; // update index of the value of the topmost element
483 if (lf > i)
484 { // if LF[i] > i, we can calculate LCP[LF[i]] in constant time with rmq
485 ++done_cnt;
486 // rmq query for lcp-values in the interval I=[prev_occ_in_bwt[BWT[i]]+1..i]
487 // rmq is linear in the stack size; can also be implemented with binary search on the stack
488 size_type x_pos = prev_occ_in_bwt[bwti] + 2;
489 j = rmq_end - 3;
490 while (x_pos <= rmq_stack[j])
491 j -= 2; // search smallest value in the interval I
492 lcp_sml[lf] =
493 rmq_stack[j + 3] - (rmq_stack[j + 3] == m + 2); // if lcp-value equals m+1, we subtract 1
494 }
495 if (l >= m)
496 {
497 if (l == m)
498 push_front_m_index(nn, cur_c, m_list[m_mod2], m_chars[m_mod2], m_char_count[m_mod2]);
499 ++nn;
500 }
501 else
502 ++omitted_c[cur_c];
503
504 prev_occ_in_bwt[bwti] = i; // update previous position information for character BWT[i]
505 ++cnt_cc[bwti]; // update counter and therefore the LF information
506 sai_1 = sai; // update SA[i-1]
507 bwti_1 = bwti; // update BWT[i-1]
508 }
509 }
510 util::clear(text);
511
512 if (n > 1000 and nn > 5 * (n / 6))
513 { // if we would occupy more space than the PHI algorithm => switch to PHI algorithm
514 util::clear(lcp_sml);
515 construct_lcp_PHI<8>(config);
516 return;
517 }
518 store_to_cache(lcp_sml, "lcp_sml", config);
519 }
520#ifdef STUDY_INFORMATIONS
521 std::cout << "# n=" << n << " nn=" << nn << " nn/n=" << ((double)nn) / n << std::endl;
522#endif
523
524 // phase 2: calculate lcp_big
525 {
526 // std::cout<<"# begin calculating LF' values"<<std::endl;
527 int_vector<> lcp_big(nn,
528 0,
529 bits::hi(n - 1) + 1); // lcp_big first contains adapted LF values and finally the big LCP
530 // values
531 {
532 // initialize lcp_big with adapted LF values
533 bit_vector todo(n, 0); // bit_vector todo indicates which values are >= m in lcp_sml
534 {
535 // initialize bit_vector todo
536 int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config)); // load lcp_sml
537 for (size_type i = 0; i < n; ++i)
538 {
539 if (lcp_sml_buf[i] >= m)
540 {
541 todo[i] = 1;
542 }
543 }
544 }
545
546 cnt_cc2[0] = cnt_cc[0] = 0;
547 for (size_type i = 1, omitted_sum = 0; i < 257; ++i)
548 { // initialize cnt_cc and cnt_cc2
549 cnt_cc[i] = cnt_c[i] + cnt_cc[i - 1];
550 omitted_sum += omitted_c[i - 1];
551 cnt_cc2[i] = cnt_cc[i] - omitted_sum;
552 }
553
554 int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // load BWT
555 for (size_type i = 0, i2 = 0; i < n; ++i)
556 {
557 uint8_t b = bwt_buf[i]; // store BWT[i]
558 size_type lf_i = cnt_cc[b]; // LF[i]
559 if (todo[i])
560 { // LCP[i] is a big value
561 if (todo[lf_i])
562 { // LCP[LF[i]] is a big entry
563 lcp_big[i2] = cnt_cc2[b]; // LF'[i]
564 } /*else{
565 * lcp_big[i2] = 0;
566 }*/
567 ++i2;
568 }
569 if (todo[lf_i])
570 { // LCP[LF[i]] is a big entry
571 ++cnt_cc2[b]; // increment counter for adapted LF
572 }
573 ++cnt_cc[b]; // increment counter for LF
574 }
575 }
576
577 // std::cout<<"# begin initializing bwt2, shift_bwt2, run2"<<std::endl;
578 int_vector<8> bwt2(nn),
579 shift_bwt2(nn); // BWT of big LCP values, and shifted BWT of
580 // big LCP values
581 bit_vector run2(nn + 1); // indicates for each entry i, if i and i-1 are both big LCP values
582 run2[nn] = 0; // index nn is not a big LCP value
583 {
584 // initialize bwt2, shift_bwt2, adj2
585 int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config)); // load lcp_sml
586 int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // load BWT
587 uint8_t b_1 = '\0'; // BWT[i-1]
588 bool is_run = false;
589 for (size_type i = 0, i2 = 0; i < n; ++i)
590 {
591 uint8_t b = bwt_buf[i];
592 if (lcp_sml_buf[i] >= m)
593 {
594 bwt2[i2] = b;
595 shift_bwt2[i2] = b_1;
596 run2[i2] = is_run;
597 is_run = true;
598 ++i2;
599 }
600 else
601 {
602 is_run = false;
603 }
604 b_1 = b;
605 }
606 }
607
608 bit_vector todo2(nn + 1, 1); // init all values with 1, except
609 todo2[nn] = 0; // the last one! (handels case "i < nn")
610
611 // std::cout<<"# begin calculating m-indices"<<std::endl;
612 {
613 // calculate m-indices, (m+1)-indices,... until we are done
614 size_type m2 = m;
615 size_type char_ex[256];
616 for (size_type i = 0; i < 256; ++i)
617 char_ex[i] = nn;
618 size_type char_occ = 0;
619 size_type m_mod2 = m2 % 2, mm1_mod2 = (m2 + 1) % 2;
620 while (m_char_count[m_mod2] > 0)
621 { // while there are m-indices, calculate (m+1)-indices and write m-indices
622 // For all values LCP[i] >= m2 it follows that todo2[i] == 1
623 // list m_list[mm1_mod2][b] is sorted in decreasing order
624 ++m2;
625 mm1_mod2 = (m2 + 1) % 2, m_mod2 = m2 % 2;
626 m_char_count[m_mod2] = 0;
627
628 std::sort(m_chars[mm1_mod2],
629 m_chars[mm1_mod2] + m_char_count[mm1_mod2]); // TODO: ersetzen?
630
631 for (size_type mc = 0; mc < m_char_count[mm1_mod2]; ++mc)
632 { // for every character
633 tLI & mm1_mc_list = m_list[mm1_mod2][m_chars[mm1_mod2][m_char_count[mm1_mod2] - 1 - mc]];
634 // size_type old_i = nn;
635 while (!mm1_mc_list.empty())
636 {
637 size_type i = mm1_mc_list.front(); // i in [0..n-1]
638 mm1_mc_list.pop_front();
639 // For all values LCP[i] >= m-1 it follows that todo2[i] == 1
640 for (size_type k = i; todo2[k]; --k)
641 {
642#ifdef STUDY_INFORMATIONS
643 ++comps2;
644#endif
645 uint8_t b = shift_bwt2[k];
646 if (char_ex[b] != i)
647 {
648 char_ex[b] = i;
649 ++char_occ;
650 }
651 if (!run2[k])
652 break;
653 }
654 for (size_type k = i; todo2[k] and char_occ; ++k)
655 {
656#ifdef STUDY_INFORMATIONS
657 ++comps2;
658#endif
659 uint8_t b = bwt2[k];
660 if (char_ex[b] == i)
661 {
662 size_type p = lcp_big[k];
663 push_back_m_index(p, b, m_list[m_mod2], m_chars[m_mod2], m_char_count[m_mod2]);
664 char_ex[b] = nn;
665 --char_occ;
666 }
667 if (!run2[k + 1])
668 break;
669 }
670 lcp_big[i] = m2 - 1;
671 todo2[i] = 0;
672 // old_i = i;
673 }
674 }
675 }
676 }
677
678 store_to_cache(lcp_big, "lcp_big", config);
679 } // end phase 2
680
681 // std::cout<<"# merge lcp_sml and lcp_big"<<std::endl;
682 // phase 3: merge lcp_sml and lcp_big and save to disk
683 {
684 const size_type buffer_size = 1000000; // buffer_size has to be a multiple of 8!
685 int_vector_buffer<> lcp_big_buf(cache_file_name("lcp_big",
686 config)); // file buffer containing the big LCP values
687 int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config),
688 std::ios::in,
689 buffer_size); // file buffer containing the small LCP values
691 std::ios::out,
692 buffer_size,
693 lcp_big_buf.width()); // buffer for the resulting LCP array
694 for (size_type i = 0, i2 = 0; i < n; ++i)
695 {
696 size_type l = lcp_sml_buf[i];
697 if (l >= m)
698 { // if l >= m it is stored in lcp_big
699 l = lcp_big_buf[i2];
700 ++i2;
701 }
702 lcp_buf[i] = l;
703 }
704 lcp_buf.close();
705 }
707
708#ifdef STUDY_INFORMATIONS
709 std::cout << "# racs: " << racs << std::endl;
710 std::cout << "# matches: " << matches << std::endl;
711 std::cout << "# comps2: " << comps2 << std::endl;
712#endif
713 return;
714}
715
717
736{
737 typedef int_vector<>::size_type size_type;
738 int_vector<8> text;
739 load_from_cache(text, conf::KEY_TEXT, config); // load text from file system
740 int_vector_buffer<> sa_buf(cache_file_name(conf::KEY_SA, config)); // initialize buffer for suffix array
741 const size_type n = sa_buf.size();
742 const size_type m = 254; // LCP[i] == m+1 corresp. to LCP[i]>= m+1; LCP[i] <= m corresp. to LCP[i] was calculated
743
744 if (1 == n)
745 {
746 int_vector<> lcp(1, 0);
747 store_to_cache(lcp, conf::KEY_LCP, config);
748 return;
749 }
750
751 size_type cnt_c[257] = {0}; // counter for each character in the text
752 size_type cnt_cc[257] = {0}; // prefix sum of the counter cnt_c
753 size_type omitted_c[257] = {0}; // counts the omitted occurrences for the second phase
754 size_type prev_occ_in_bwt[256] = {0}; // position of the previous occurrence of each character c in the bwt
755 for (size_type i = 0; i < 256; ++i)
756 prev_occ_in_bwt[i] = (size_type)-1; // initialize the array with -1
757 unsigned char alphabet[257] = {0};
758 uint8_t sigma = 0;
759
760 size_type nn = 0; // n' for phase 2
761 // phase 1: calculate lcp_sml; memory consumption: 2n bytes (lcp_sml=n bytes, text=n bytes)
762 {
763
764 int_vector<8> lcp_sml(n,
765 0); // initialize array for small values of first phase; note lcp[0]=0
766 size_type done_cnt = 0;
767
768 for (size_type i = 0; i < n; ++i)
769 { // initialize cnt_c
770 ++cnt_c[text[i] + 1];
771 }
772 for (int i = 1; i < 257; ++i)
773 { // calculate sigma and initailize cnt_cc
774 if (cnt_c[i] > 0)
775 {
776 alphabet[sigma++] = (unsigned char)(i - 1);
777 }
778 cnt_cc[i] = cnt_c[i] + cnt_cc[i - 1];
779 }
780 alphabet[sigma] = '\0';
781 {
782 int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // initialize buffer of bwt
783 size_type sai_1 = sa_buf[0]; // store value of sa[i-1]
784 uint8_t bwti_1 = bwt_buf[0]; // store value of BWT[i-1]
785 lcp_sml[cnt_cc[bwti_1]++] = 0; // lcp_sml[ LF[0] ] = 0
786 prev_occ_in_bwt[bwti_1] = 0; // init previous occurence of character BWT[0]
787 ++omitted_c[alphabet[0]]; //
788
789 int_vector<64> rmq_stack(2 * (m + 10)); // initialize stack for m+10 elements representing (position, value)
790 rmq_stack[0] = 0;
791 rmq_stack[1] = 0; // first element (-1, -1)
792 rmq_stack[2] = 1;
793 rmq_stack[3] = 0; // second element (0, -1)
794 size_type rmq_end = 3; // index of the value of the topmost element
795
796 uint8_t cur_c = alphabet[1];
797 for (size_type i = 1, sai, cur_c_idx = 1, cur_c_cnt = cnt_c[alphabet[1] + 1]; i < n; ++i, --cur_c_cnt)
798 {
799 uint8_t bwti = bwt_buf[i];
800 sai = sa_buf[i];
801 size_type lf = cnt_cc[bwti];
802 if (!cur_c_cnt)
803 { // cur_c_cnt==0, if there is no more occurence of the current character
804 if (cur_c_cnt < sigma)
805 {
806 cur_c_cnt = cnt_c[(cur_c = alphabet[++cur_c_idx]) + 1];
807 }
808 }
809 size_type l = 0;
810 if (i >= cnt_cc[cur_c])
811 { // if the current lcp entry is not already done TODO: schleife von i bis cnt_cc[cur_c]
812 if (bwti == bwti_1 and lf < i)
813 { // BWT[i]==BWT[i-1]
814 l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0; // l = LCP[LF[i]]-1; l < m+1
815 if (l == m)
816 { // if LCP[LF[i]] == m+1; otherwise LCP[LF[i]] < m+1 the result is correct
817 l += (text[sai_1 + m] == text[sai + m]);
818 }
819 lcp_sml[i] = l;
820 ++done_cnt;
821 }
822 else
823 { // BWT[i] != BWT[i-1] or LF[i] > i
824 if (lf < i)
825 l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0;
826 while (text[sai_1 + l] == text[sai + l] and l < m + 1)
827 {
828 ++l;
829 }
830 lcp_sml[i] = l;
831 }
832 }
833 else
834 { // if already done
835 l = lcp_sml[i]; // load LCP value
836 }
837 // invariant: l <= m+1
838 // begin update rmq_stack
839 size_type x = l + 1;
840 size_type j = rmq_end;
841 while (x <= rmq_stack[j])
842 j -= 2; // pop all elements with value >= l
843 rmq_stack[++j] = i + 1; // push position i
844 rmq_stack[++j] = x; // push value l
845 rmq_end = j; // update index of the value of the topmost element
846 if (lf > i)
847 { // if LF[i] > i, we can calculate LCP[LF[i]] in constant time with rmq
848 ++done_cnt;
849 // rmq query for lcp-values in the interval I=[prev_occ_in_bwt[BWT[i]]+1..i]
850 // rmq is linear in the stack size; can also be implemented with binary search on the stack
851 size_type x_pos = prev_occ_in_bwt[bwti] + 2;
852 j = rmq_end - 3;
853 while (x_pos <= rmq_stack[j])
854 j -= 2; // search smallest value in the interval I
855 lcp_sml[lf] =
856 rmq_stack[j + 3] - (rmq_stack[j + 3] == m + 2); // if lcp-value equals m+1, we subtract 1
857 }
858 if (l > m)
859 {
860 ++nn;
861 }
862 else
863 ++omitted_c[cur_c];
864
865 prev_occ_in_bwt[bwti] = i; // update previous position information for character BWT[i]
866 ++cnt_cc[bwti]; // update counter and therefore the LF information
867 sai_1 = sai; // update SA[i-1]
868 bwti_1 = bwti; // update BWT[i-1]
869 }
870 }
871 store_to_cache(lcp_sml, "lcp_sml", config);
872 }
873
874 // phase 2: calculate lcp_big with PHI algorithm on remaining entries of LCP
875 {
876 int_vector<> lcp_big(0, 0, bits::hi(n - 1) + 1); // nn, 0, bits::hi(n-1)+1);
877 {
878
879 memory_monitor::event("lcp-init-phi-begin");
880 size_type sa_n_1 = 0; // value for SA[n-1]
881 bit_vector todo(n, 0); // bit_vector todo indicates which values are > m in lcp_sml
882 {
883 // initialize bit_vector todo
884 int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config)); // load lcp_sml
885 for (size_type i = 0; i < n; ++i)
886 {
887 if (lcp_sml_buf[i] > m)
888 {
889 todo[sa_buf[i]] = 1;
890 }
891 }
892 sa_n_1 = sa_buf[n - 1];
893 }
894 rank_support_v<> todo_rank(&todo); // initialize rank for todo
895
896 const size_type bot = sa_n_1;
897 int_vector<> phi(nn, bot, bits::hi(n - 1) + 1); // phi
898
899 int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // load BWT
900 int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config)); // load lcp_sml
901 uint8_t b_1 = 0;
902 for (size_type i = 0, sai_1 = 0; i < n; ++i)
903 { // initialize phi
904 uint8_t b = bwt_buf[i]; // store BWT[i]
905 size_type sai = sa_buf[i];
906 if (lcp_sml_buf[i] > m and b != b_1)
907 { // if i is a big irreducable value
908 phi[todo_rank(sai)] = sai_1;
909 } // otherwise phi is equal to bot
910 b_1 = b;
911 sai_1 = sai;
912 }
913 memory_monitor::event("lcp-init-phi-end");
914
915 memory_monitor::event("lcp-calc-plcp-begin");
916 for (size_type i = 0, ii = 0, l = m + 1, p = 0; i < n and ii < nn; ++i)
917 { // execute compact Phi algorithm
918 if (todo[i])
919 {
920 if (i > 0 and todo[i - 1])
921 l = l - 1;
922 else
923 l = m + 1;
924 if ((p = phi[ii]) != bot)
925 {
926 while (text[i + l] == text[p + l])
927 ++l;
928 }
929 phi[ii++] = l;
930 }
931 }
932 memory_monitor::event("lcp-calc-plcp-end");
933 util::clear(text);
934
935 memory_monitor::event("lcp-calc-lcp-begin");
936 lcp_big.resize(nn);
937 for (size_type i = 0, ii = 0; i < n and ii < nn; ++i)
938 {
939 if (lcp_sml_buf[i] > m)
940 {
941 lcp_big[ii++] = phi[todo_rank(sa_buf[i])];
942 }
943 }
944 memory_monitor::event("lcp-calc-lcp-end");
945 }
946 store_to_cache(lcp_big, "lcp_big", config);
947 } // end phase 2
948
949 // std::cout<<"# merge lcp_sml and lcp_big"<<std::endl;
950 // phase 3: merge lcp_sml and lcp_big and save to disk
951 {
952 const size_type buffer_size = 1000000; // buffer_size has to be a multiple of 8!
953 int_vector_buffer<> lcp_big_buf(cache_file_name("lcp_big",
954 config)); // file buffer containing the big LCP values
955 int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config),
956 std::ios::in,
957 buffer_size); // file buffer containing the small LCP values
959 std::ios::out,
960 buffer_size,
961 lcp_big_buf.width()); // file buffer for the resulting LCP array
962
963 for (size_type i = 0, i2 = 0; i < n; ++i)
964 {
965 size_type l = lcp_sml_buf[i];
966 if (l > m)
967 { // if l > m it is stored in lcp_big
968 l = lcp_big_buf[i2];
969 ++i2;
970 }
971 lcp_buf[i] = l;
972 }
973 lcp_big_buf.close(true); // close buffer and remove file
974 lcp_sml_buf.close(true); // close buffer and remove file
975 }
977 return;
978}
979
981
996template <typename t_wt = wt_huff<bit_vector, rank_support_v<>, select_support_scan<1>, select_support_scan<0>>>
998{
999 typedef int_vector<>::size_type size_type;
1000 std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
1001
1002 // create WT
1003 memory_monitor::event("lcp-bwt-create-wt-huff-begin");
1004 t_wt wt_bwt;
1005 construct(wt_bwt, cache_file_name(conf::KEY_BWT, config));
1006 uint64_t n = wt_bwt.size();
1007 memory_monitor::event("lcp-bwt-create-wt-huff-end");
1008
1009 // init
1010 memory_monitor::event("lcp-bwt-init-begin");
1011 size_type lcp_value = 0; // current LCP value
1012 size_type lcp_value_offset = 0; // Largest LCP value in LCP array, that was written on disk
1013 size_type phase = 0; // Count how often the LCP array was written on disk
1014
1015 size_type intervals = 0; // number of intervals which are currently stored
1016 size_type intervals_new = 0; // number of new intervals
1017
1018 std::queue<size_type> q; // Queue for storing the intervals
1019 std::vector<bit_vector> dict(2); // bit_vector for storing the intervals
1020 size_type source = 0, target = 1; // Defines which bit_vector is source and which is target
1021 bool queue_used = true;
1022 size_type use_queue_and_wt = n / 2048; // if intervals < use_queue_and_wt, then use queue and wavelet tree
1023 // else use dictionary and wavelet tree
1024
1025 size_type quantity; // quantity of characters in interval
1026 std::vector<unsigned char> cs(wt_bwt.sigma); // list of characters in the interval
1027 std::vector<size_type> rank_c_i(wt_bwt.sigma); // number of occurrence of character in [0 .. i-1]
1028 std::vector<size_type> rank_c_j(wt_bwt.sigma); // number of occurrence of character in [0 .. j-1]
1029
1030 // Calculate how many bit are for each lcp value available, to limit the memory usage to 20n bit = 2,5n byte, use at
1031 // moste 8 bit
1032 size_type bb = (n * 20 - size_in_bytes(wt_bwt) * 8 * 1.25 - 5 * n)
1033 / n; // 20n - size of wavelet tree * 1.25 for rank support - 5n for bit arrays - n for index_done array
1034 if (n * 20 < size_in_bytes(wt_bwt) * 8 * 1.25 + 5 * n)
1035 {
1036 bb = 6;
1037 }
1038 bb = std::min(bb, (size_type)8);
1039
1040 size_type lcp_value_max = (1ULL << bb) - 1; // Largest LCP value that can be stored into the LCP array
1041 size_type space_in_bit_for_lcp = n * bb; // Space for the LCP array in main memory
1042
1043#ifdef STUDY_INFORMATIONS
1044 std::cout << "# l=" << n << " b=" << (int)bb << " lcp_value_max=" << lcp_value_max
1045 << " size_in_bytes(wt_bwt<v,bs,bs>)=" << size_in_bytes(wt_bwt) << std::endl;
1046#endif
1047
1048 // init partial_lcp
1049 int_vector<> partial_lcp(n, 0, bb); // LCP array
1050
1051 // init index_done
1052 bit_vector index_done(n + 1, false); // bit_vector indicates if entry LCP[i] is amend
1053 rank_support_v<> ds_rank_support; // Rank support for bit_vector index_done
1054
1055 // create C-array
1056 std::vector<size_type> C; // C-Array: C[i] = number of occurrences of characters < i in the input
1057 create_C_array(C, wt_bwt);
1058 memory_monitor::event("lcp-bwt-init-begin-end");
1059 // calculate lcp
1060 memory_monitor::event("lcp-bwt-calc-values-begin");
1061
1062 // calculate first intervals
1063 partial_lcp[0] = 0;
1064 index_done[0] = true;
1065 interval_symbols(wt_bwt, 0, n, quantity, cs, rank_c_i, rank_c_j);
1066 for (size_type i = 0; i < quantity; ++i)
1067 {
1068 unsigned char c = cs[i];
1069 size_type a_new = C[c] + rank_c_i[i];
1070 size_type b_new = C[c] + rank_c_j[i];
1071
1072 // Save LCP value if not seen before
1073 if (!index_done[b_new])
1074 {
1075 if (b_new < n)
1076 partial_lcp[b_new] = lcp_value;
1077 index_done[b_new] = true;
1078 // Save interval
1079 q.push(a_new);
1080 q.push(b_new);
1081 ++intervals;
1082 }
1083 }
1084 ++lcp_value;
1085
1086 // calculate LCP values phase by phase
1087 while (intervals)
1088 {
1089 if (intervals < use_queue_and_wt && !queue_used)
1090 {
1091 memory_monitor::event("lcp-bwt-bitvec2queue-begin");
1092 util::clear(dict[target]);
1093
1094 // copy from bitvector to queue
1095 size_type a2 = util::next_bit(dict[source], 0);
1096 size_type b2 = util::next_bit(dict[source], a2 + 1);
1097 while (b2 < dict[source].size())
1098 {
1099 q.push((a2 - 1) >> 1);
1100 q.push(b2 >> 1);
1101 // get next interval
1102 a2 = util::next_bit(dict[source], b2 + 1);
1103 b2 = util::next_bit(dict[source], a2 + 1);
1104 }
1105 util::clear(dict[source]);
1106 memory_monitor::event("lcp-bwt-bitvec2queue-end");
1107 }
1108 if (intervals >= use_queue_and_wt && queue_used)
1109 {
1110 memory_monitor::event("lcp-bwt-queue2bitvec-begin");
1111 dict[source].resize(2 * (n + 1));
1112
1113 util::set_to_value(dict[source], 0);
1114 // copy from queue to bitvector
1115 while (!q.empty())
1116 {
1117 dict[source][(q.front() << 1) + 1] = 1;
1118 q.pop();
1119 dict[source][(q.front() << 1)] = 1;
1120 q.pop();
1121 }
1122 dict[target].resize(2 * (n + 1));
1123
1124 util::set_to_value(dict[target], 0);
1125 memory_monitor::event("lcp-bwt-queue2bitvec-end");
1126 }
1127
1128 if (intervals < use_queue_and_wt)
1129 {
1130 queue_used = true;
1131 intervals_new = 0;
1132 while (intervals)
1133 {
1134 // get next interval
1135 size_type a = q.front();
1136 q.pop();
1137 size_type b = q.front();
1138 q.pop();
1139 --intervals;
1140
1141 interval_symbols(wt_bwt, a, b, quantity, cs, rank_c_i, rank_c_j);
1142 for (size_type i = 0; i < quantity; ++i)
1143 {
1144 unsigned char c = cs[i];
1145 size_type a_new = C[c] + rank_c_i[i];
1146 size_type b_new = C[c] + rank_c_j[i];
1147
1148 // Save LCP value if not seen before
1149 if (!index_done[b_new] and phase == 0)
1150 {
1151 partial_lcp[b_new] = lcp_value;
1152 index_done[b_new] = true;
1153 // Save interval
1154 q.push(a_new);
1155 q.push(b_new);
1156 ++intervals_new;
1157 }
1158 else if (!index_done[b_new])
1159 {
1160 size_type insert_pos = b_new - ds_rank_support.rank(b_new);
1161 if (!partial_lcp[insert_pos])
1162 {
1163 partial_lcp[insert_pos] = lcp_value - lcp_value_offset;
1164 // Save interval
1165 q.push(a_new);
1166 q.push(b_new);
1167 ++intervals_new;
1168 }
1169 }
1170 }
1171 }
1172 intervals = intervals_new;
1173 }
1174 else
1175 {
1176 queue_used = false;
1177 intervals = 0;
1178
1179 // get next interval
1180 size_type a2 = util::next_bit(dict[source], 0);
1181 size_type b2 = util::next_bit(dict[source], a2 + 1);
1182
1183 while (b2 < dict[source].size())
1184 {
1185 interval_symbols(wt_bwt, ((a2 - 1) >> 1), (b2 >> 1), quantity, cs, rank_c_i, rank_c_j);
1186 for (size_type i = 0; i < quantity; ++i)
1187 {
1188 unsigned char c = cs[i];
1189 size_type a_new = C[c] + rank_c_i[i];
1190 size_type b_new = C[c] + rank_c_j[i];
1191 // Save LCP value if not seen before
1192 if (!index_done[b_new] and phase == 0)
1193 {
1194 partial_lcp[b_new] = lcp_value;
1195 index_done[b_new] = true;
1196 // Save interval
1197 dict[target][(a_new << 1) + 1] = 1;
1198 dict[target][(b_new << 1)] = 1;
1199 ++intervals;
1200 }
1201 else if (!index_done[b_new])
1202 {
1203 size_type insert_pos = b_new - ds_rank_support.rank(b_new);
1204 if (!partial_lcp[insert_pos])
1205 {
1206 partial_lcp[insert_pos] = lcp_value - lcp_value_offset;
1207 // Save interval
1208 dict[target][(a_new << 1) + 1] = 1;
1209 dict[target][(b_new << 1)] = 1;
1210 ++intervals;
1211 }
1212 }
1213 }
1214 // get next interval
1215 a2 = util::next_bit(dict[source], b2 + 1);
1216 b2 = util::next_bit(dict[source], a2 + 1);
1217 }
1218 std::swap(source, target);
1219 util::set_to_value(dict[target], 0);
1220 }
1221 ++lcp_value;
1222 if (lcp_value >= lcp_value_max)
1223 {
1224 memory_monitor::event("lcp-bwt-write-to-file-begin");
1225 if (phase)
1226 {
1227 insert_lcp_values(partial_lcp, index_done, lcp_file, lcp_value, lcp_value_offset);
1228 }
1229 else
1230 {
1231 store_to_file(partial_lcp, lcp_file);
1232 }
1233 memory_monitor::event("lcp-bwt-write-to-file-end");
1234 memory_monitor::event("lcp-bwt-resize-variables-begin");
1235 util::init_support(ds_rank_support, &index_done); // Create rank support
1236
1237 // Recalculate lcp_value_max and resize partial_lcp
1238 lcp_value_offset = lcp_value_max - 1;
1239 size_type remaining_lcp_values = index_done.size() - ds_rank_support.rank(index_done.size());
1240
1241 uint8_t int_width_new =
1242 std::min(space_in_bit_for_lcp / remaining_lcp_values, (size_type)bits::hi(n - 1) + 1);
1243 lcp_value_max = lcp_value_offset + (1ULL << int_width_new);
1244#ifdef STUDY_INFORMATIONS
1245 std::cout << "# l=" << remaining_lcp_values << " b=" << (int)int_width_new
1246 << " lcp_value_max=" << lcp_value_max << std::endl;
1247#endif
1248 // partial_lcp = int_vector<>(remaining_lcp_values, 0, int_width_new);
1249 partial_lcp.width(int_width_new);
1250 partial_lcp.resize(remaining_lcp_values);
1251 partial_lcp.shrink_to_fit();
1252 util::set_to_value(partial_lcp, 0);
1253 ++phase;
1254 memory_monitor::event("lcp-bwt-resize-variables-end");
1255 }
1256 }
1257 memory_monitor::event("lcp-bwt-calc-values-end");
1258
1259 // merge to file
1260 memory_monitor::event("lcp-bwt-merge-to-file-begin");
1261 if (phase)
1262 {
1263 insert_lcp_values(partial_lcp, index_done, lcp_file, lcp_value, lcp_value_offset);
1264 }
1265 else
1266 {
1267 store_to_file(partial_lcp, lcp_file);
1268 }
1270
1271 memory_monitor::event("lcp-bwt-merge-to-file-end");
1272}
1273
1275
1290template <typename t_wt = wt_huff<bit_vector, rank_support_v<>, select_support_scan<1>, select_support_scan<0>>>
1292{
1293 typedef int_vector<>::size_type size_type;
1294
1295 uint64_t n; // Input length
1296 size_type buffer_size = 1000000; // Size of the buffer
1297 size_type lcp_value = 0; // current LCP value
1298 std::string tmp_lcp_file = cache_file_name(conf::KEY_LCP, config) + "_tmp";
1299 // (1) Calculate LCP-Positions-Array: For each lcp_value (in ascending order) all its occurrences (in any order) in
1300 // the lcp array
1301 {
1302 memory_monitor::event("lcp-bwt2-create-wt-huff-begin");
1303 t_wt wt_bwt;
1304 construct(wt_bwt, cache_file_name(conf::KEY_BWT, config));
1305 n = wt_bwt.size();
1306 memory_monitor::event("lcp-bwt2-create-wt-huff-begin");
1307
1308 // Declare needed variables
1309 memory_monitor::event("lcp-bwt2-init-begin");
1310 size_type intervals = 0; // Number of intervals which are currently stored
1311 size_type intervals_new = 0; // Number of new intervals
1312
1313 std::queue<size_type> q; // Queue for storing the intervals
1314 std::vector<bit_vector> dict(2); // bit_vector for storing the intervals
1315 size_type source = 0, target = 1; // Defines which bit_vector is source and which is target
1316 bool queue_used = true; // Defines whether a queue (true) or the bit_vectors (false) was used to store intervals
1317 size_type use_queue_and_wt = n / 2048; // if intervals < use_queue_and_wt, then use queue and wavelet tree
1318 // else use dictionary and wavelet tree
1319
1320 size_type quantity; // quantity of characters in interval
1321 std::vector<unsigned char> cs(wt_bwt.sigma); // list of characters in the interval
1322 std::vector<size_type> rank_c_i(wt_bwt.sigma); // number of occurrence of character in [0 .. i-1]
1323 std::vector<size_type> rank_c_j(wt_bwt.sigma); // number of occurrence of character in [0 .. j-1]
1324
1325 // External storage of LCP-Positions-Array
1326 bool new_lcp_value = false;
1327 uint8_t int_width = bits::hi(n) + 2;
1328 int_vector_buffer<> lcp_positions_buf(tmp_lcp_file,
1329 std::ios::out,
1330 buffer_size,
1331 int_width); // Create buffer for positions of LCP entries
1332 size_type idx_out_buf = 0;
1333 bit_vector index_done(n + 1, 0); // Bitvector which is true, if corresponding LCP value was already calculated
1334
1335 // Create C-array
1336 std::vector<size_type> C; // C-Array: C[i] = number of occurrences of characters < i in the input
1337 create_C_array(C, wt_bwt);
1338 memory_monitor::event("lcp-bwt2-init-end");
1339 // Calculate LCP-Positions-Array
1340 memory_monitor::event("lcp-bwt2-calc-values-begin");
1341
1342 // Save position of first LCP-value
1343 lcp_positions_buf[idx_out_buf++] = 0;
1344 if (new_lcp_value)
1345 {
1346 lcp_positions_buf[idx_out_buf - 1] = lcp_positions_buf[idx_out_buf - 1] + n;
1347 new_lcp_value = false;
1348 }
1349 index_done[0] = true;
1350
1351 // calculate first intervals
1352 interval_symbols(wt_bwt, 0, n, quantity, cs, rank_c_i, rank_c_j);
1353 for (size_type i = 0; i < quantity; ++i)
1354 {
1355 unsigned char c = cs[i];
1356 size_type a_new = C[c] + rank_c_i[i];
1357 size_type b_new = C[c] + rank_c_j[i];
1358
1359 // Save LCP value and corresponding interval if not seen before
1360 if (!index_done[b_new])
1361 {
1362 if (b_new < n)
1363 {
1364 // Save position of LCP-value
1365 lcp_positions_buf[idx_out_buf++] = b_new;
1366 }
1367 index_done[b_new] = true;
1368
1369 // Save interval
1370 q.push(a_new);
1371 q.push(b_new);
1372 ++intervals;
1373 }
1374 }
1375 ++lcp_value;
1376 new_lcp_value = true;
1377
1378 // Calculate LCP positions
1379 while (intervals)
1380 {
1381 if (intervals < use_queue_and_wt && !queue_used)
1382 {
1383 memory_monitor::event("lcp-bwt2-bitvec2queue-begin");
1384 util::clear(dict[target]);
1385
1386 // Copy from bitvector to queue
1387 size_type a2 = util::next_bit(dict[source], 0);
1388 size_type b2 = util::next_bit(dict[source], a2 + 1);
1389 while (b2 < dict[source].size())
1390 {
1391 q.push((a2 - 1) >> 1);
1392 q.push(b2 >> 1);
1393 // Get next interval
1394 a2 = util::next_bit(dict[source], b2 + 1);
1395 b2 = util::next_bit(dict[source], a2 + 1);
1396 }
1397 util::clear(dict[source]);
1398 memory_monitor::event("lcp-bwt2-bitvec2queue-end");
1399 }
1400 if (intervals >= use_queue_and_wt && queue_used)
1401 {
1402 memory_monitor::event("lcp-bwt2-queue2bitvec-begin");
1403 dict[source].resize(2 * (n + 1));
1404 util::set_to_value(dict[source], 0);
1405 // Copy from queue to bitvector
1406 while (!q.empty())
1407 {
1408 dict[source][(q.front() << 1) + 1] = 1;
1409 q.pop();
1410 dict[source][(q.front() << 1)] = 1;
1411 q.pop();
1412 }
1413 dict[target].resize(2 * (n + 1));
1414 util::set_to_value(dict[target], 0);
1415 memory_monitor::event("lcp-bwt2-queue2bitvec-end");
1416 }
1417
1418 if (intervals < use_queue_and_wt)
1419 {
1420 queue_used = true;
1421 intervals_new = 0;
1422 while (intervals)
1423 {
1424 // Get next interval
1425 size_type a = q.front();
1426 q.pop();
1427 size_type b = q.front();
1428 q.pop();
1429 --intervals;
1430
1431 interval_symbols(wt_bwt, a, b, quantity, cs, rank_c_i, rank_c_j);
1432 for (size_type i = 0; i < quantity; ++i)
1433 {
1434 unsigned char c = cs[i];
1435 size_type a_new = C[c] + rank_c_i[i];
1436 size_type b_new = C[c] + rank_c_j[i];
1437
1438 // Save LCP value and corresponding interval if not seen before
1439 if (!index_done[b_new])
1440 {
1441 // Save position of LCP-value
1442 lcp_positions_buf[idx_out_buf++] = b_new;
1443 if (new_lcp_value)
1444 {
1445 // Mark new LCP-value
1446 lcp_positions_buf[idx_out_buf - 1] = lcp_positions_buf[idx_out_buf - 1] + n;
1447 new_lcp_value = false;
1448 }
1449 index_done[b_new] = true;
1450
1451 // Save interval
1452 q.push(a_new);
1453 q.push(b_new);
1454 ++intervals_new;
1455 }
1456 }
1457 }
1458 intervals = intervals_new;
1459 }
1460 else
1461 {
1462 queue_used = false;
1463 intervals = 0;
1464 // get next interval
1465 size_type a2 = util::next_bit(dict[source], 0);
1466 size_type b2 = util::next_bit(dict[source], a2 + 1);
1467
1468 while (b2 < dict[source].size())
1469 {
1470 interval_symbols(wt_bwt, ((a2 - 1) >> 1), (b2 >> 1), quantity, cs, rank_c_i, rank_c_j);
1471 for (size_type i = 0; i < quantity; ++i)
1472 {
1473 unsigned char c = cs[i];
1474 size_type a_new = C[c] + rank_c_i[i];
1475 size_type b_new = C[c] + rank_c_j[i];
1476 // Save LCP value if not seen before
1477 if (!index_done[b_new])
1478 {
1479 // Save position of LCP-value
1480 lcp_positions_buf[idx_out_buf++] = b_new;
1481 if (new_lcp_value)
1482 {
1483 // Mark new LCP-value
1484 lcp_positions_buf[idx_out_buf - 1] = lcp_positions_buf[idx_out_buf - 1] + n;
1485 new_lcp_value = false;
1486 }
1487 index_done[b_new] = true;
1488 // Save interval
1489 dict[target][(a_new << 1) + 1] = 1;
1490 dict[target][(b_new << 1)] = 1;
1491 ++intervals;
1492 }
1493 }
1494 // get next interval
1495 a2 = util::next_bit(dict[source], b2 + 1);
1496 b2 = util::next_bit(dict[source], a2 + 1);
1497 }
1498 std::swap(source, target);
1499 util::set_to_value(dict[target], 0);
1500 }
1501 ++lcp_value;
1502 new_lcp_value = true;
1503 }
1504 memory_monitor::event("lcp-bwt2-calc-values-end");
1505 lcp_positions_buf.close();
1506 }
1507 // (2) Insert LCP entires into LCP array
1508 {
1509 memory_monitor::event("lcp-bwt2-reordering-begin");
1510
1511 int_vector_buffer<> lcp_positions(tmp_lcp_file, std::ios::in, buffer_size);
1512
1513 uint8_t int_width = bits::hi(lcp_value + 1) + 1; // How many bits are needed for one lcp_value?
1514
1515 // Algorithm does r=ceil(int_width/8) runs over LCP-Positions-Array.
1516 // So in each run k>=(n/r) values of the lcp array must be calculated.
1517 // Because k has to be a multiple of 8, we choose number_of_values = (k+16) - ((k+16)%8)
1518 size_type number_of_values = ((n / ((int_width - 1ULL) / 8 + 1) + 16) & (~(0x7ULL)));
1519 std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
1520 int_vector_buffer<> lcp_array(lcp_file,
1521 std::ios::out,
1522 number_of_values * int_width / 8,
1523 int_width); // Create Output Buffer
1524 number_of_values = lcp_array.buffersize() * 8 / int_width;
1525
1526 for (size_type position_begin = 0, position_end = number_of_values; position_begin < n and number_of_values > 0;
1527 position_begin = position_end, position_end += number_of_values)
1528 {
1529#ifdef STUDY_INFORMATIONS
1530 std::cout << "# number_of_values=" << number_of_values << " fill lcp_values with " << position_begin
1531 << " <= position <" << position_end << ", each lcp-value has " << (int)int_width
1532 << " bit, lcp_value_max=" << lcp_value << " n=" << n << std::endl;
1533#endif
1534 lcp_value = 0;
1535 for (size_type i = 0; i < n; ++i)
1536 {
1537 size_type position = lcp_positions[i];
1538 if (position > n)
1539 {
1540 position -= n;
1541 ++lcp_value;
1542 }
1543 if (position_begin <= position and position < position_end)
1544 {
1545 lcp_array[position] = lcp_value;
1546 }
1547 }
1548 }
1549 // Close file
1550 lcp_array.close();
1552 lcp_positions.close(true); // close buffer and remove file
1553 memory_monitor::event("lcp-bwt2-reordering-end");
1554 } // End of phase 2
1555}
1556
1557} // namespace sdsl
1558
1559#endif
bits.hpp contains the sdsl::bits class.
uint64_t size() const
Returns the number of elements currently stored.
void close(bool remove_file=false)
Close the int_vector_buffer.
uint64_t buffersize() const
Returns the buffersize in bytes.
uint8_t width() const
Returns the width of the integers which are accessed via the [] operator.
void shrink_to_fit()
Free unused allocated memory.
uint8_t width() const noexcept
Returns the width of the integers which are accessed via the [] operator.
size_type size() const noexcept
The number of elements in the int_vector.
reference front() noexcept
Returns first element.
void resize(const size_type size)
Resize the int_vector in terms of elements.
static mm_event_proxy event(std::string const &name)
A rank structure proposed by Sebastiano Vigna.
size_type rank(size_type idx) const
Answers rank queries for the supported bit_vector.
construct_isa.hpp contains a space and time efficient construction method for the inverse suffix arra...
int_vector.hpp contains the sdsl::int_vector class.
int_vector_buffer.hpp contains the sdsl::int_vector_buffer class.
io.hpp contains some methods for reading/writing sdsl structures.
memory_tracking.hpp contains two function for allocating and deallocating memory
constexpr char KEY_SA[]
Definition config.hpp:36
constexpr char KEY_TEXT[]
Definition config.hpp:40
constexpr char KEY_LCP[]
Definition config.hpp:43
constexpr char KEY_ISA[]
Definition config.hpp:39
constexpr char KEY_BWT[]
Definition config.hpp:34
Get the smallest position f$i geq idx f$ where a bit is set t_int_vec::size_type next_bit(t_int_vec const &v, uint64_t idx)
void set_to_value(t_int_vec &v, uint64_t k)
Set all entries of int_vector to value k.
Definition util.hpp:597
Namespace for the succinct data structure library.
bool store_to_cache(T const &v, std::string const &key, cache_config &config, bool add_type_hash=false)
Stores the object v as a resource in the cache.
Definition io.hpp:804
void push_back_m_index(size_type_class i, uint8_t c, tLI(&m_list)[256], uint8_t(&m_chars)[256], size_type_class &m_char_count)
void construct_lcp_kasai(cache_config &config)
Construct the LCP array for text over byte- or integer-alphabet.
void construct_lcp_bwt_based(cache_config &config)
Construct the LCP array out of the BWT (only for byte strings)
std::string cache_file_name(std::string const &key, cache_config const &config)
Returns the file name of the resource.
Definition io.hpp:688
void insert_lcp_values(int_vector<> &partial_lcp, bit_vector &index_done, std::string lcp_file, uint64_t max_lcp_value, uint64_t lcp_value_offset)
Merges a partial LCP array into the LCP array on disk.
void construct_lcp_PHI(cache_config &config)
Construct the LCP array for text over byte- or integer-alphabet.
void register_cache_file(std::string const &key, cache_config &config)
Register the existing resource specified by the key to the cache.
Definition io.hpp:717
T::size_type size_in_bytes(T const &t)
Get the size of a data structure in bytes.
Definition io.hpp:854
void create_C_array(std::vector< uint64_t > &C, tWT const &wt)
bool load_from_cache(T &v, std::string const &key, cache_config const &config, bool add_type_hash=false)
Definition io.hpp:772
void construct(t_index &idx, std::string file, uint8_t num_bytes=0, bool move_input=false)
Definition construct.hpp:56
void construct_lcp_semi_extern_PHI(cache_config &config)
Construct the LCP array (only for byte strings)
void construct_lcp_goPHI(cache_config &config)
Construct the LCP array (only for byte strings)
void construct_isa(cache_config &config)
void construct_lcp_bwt_based2(cache_config &config)
Construct the LCP array out of the BWT (only for byte strings)
void interval_symbols(t_wt const &wt, typename t_wt::size_type i, typename t_wt::size_type j, typename t_wt::size_type &k, std::vector< typename t_wt::value_type > &cs, std::vector< typename t_wt::size_type > &rank_c_i, std::vector< typename t_wt::size_type > &rank_c_j)
For each symbol c in wt[i..j-1] get rank(i,c) and rank(j,c).
bool store_to_file(T const &v, std::string const &file)
Store a data structure to a file.
Definition io.hpp:874
int_vector ::size_type size(range_type const &r)
Size of a range.
void construct_lcp_go(cache_config &config)
Construct the LCP array (only for byte strings)
std::list< int_vector<>::size_type > tLI
void push_front_m_index(size_type_class i, uint8_t c, tLI(&m_list)[256], uint8_t(&m_chars)[256], size_type_class &m_char_count)
rank_support_v.hpp contains rank_support_v.
select_support_scan.hpp contains classes that support a sdsl::bit_vector with linear time select.
static constexpr uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition bits.hpp:652
static constexpr uint64_t lo_unset[65]
lo_unset[i] is a 64-bit word with the i least significant bits not set and the high bits set.
Definition bits.hpp:215
static constexpr uint64_t lo_set[65]
lo_set[i] is a 64-bit word with the i least significant bits set and the high bits not set.
Definition bits.hpp:193
Helper class for construction process.
Definition config.hpp:66
Helper classes to transform width=0 and width=8 to corresponding text key.
Definition config.hpp:96
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.
wt_huff.hpp contains a class for a Huffman shaped wavelet tree over byte sequences.