SDSL 3.0.1
Succinct Data Structure Library
Loading...
Searching...
No Matches
construct_lcp.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
8#ifndef INCLUDED_SDSL_CONSTRUCT_LCP
9#define INCLUDED_SDSL_CONSTRUCT_LCP
10
11#include <algorithm>
12#include <iostream>
13#include <stdexcept>
14
15#include <sdsl/config.hpp>
19#include <sdsl/int_vector.hpp>
20#include <sdsl/rank_support.hpp>
22#include <sdsl/sfstream.hpp>
23#include <sdsl/util.hpp>
24#include <sdsl/wt_algorithm.hpp>
25#include <sdsl/wt_huff.hpp>
26
27//#define STUDY_INFORMATIONS
28
29namespace sdsl
30{
31
32// Forward declaration of construction method
33// template <class t_index>
34// void construct(t_index& idx, const std::string& file);
35
37
54template <uint8_t t_width>
56{
57 static_assert(t_width == 0 or t_width == 8,
58 "construct_lcp_kasai: width must be `0` for integer alphabet and `8` for byte alphabet");
59 int_vector<> lcp;
60 typedef int_vector<>::size_type size_type;
61 construct_isa(config);
62 {
64 if (!load_from_cache(text, key_text_trait<t_width>::KEY_TEXT, config)) { return; }
66 std::ios::in,
67 1000000); // init isa file_buffer
68 int_vector<> sa;
69 if (!load_from_cache(sa, conf::KEY_SA, config)) { return; }
70 // use Kasai algorithm to compute the lcp values
71 for (size_type i = 0, j = 0, sa_1 = 0, l = 0, n = isa_buf.size(); i < n; ++i)
72 {
73 sa_1 = isa_buf[i]; // = isa[i]
74 if (sa_1)
75 {
76 j = sa[sa_1 - 1];
77 if (l) --l;
78 assert(i != j);
79 while (text[i + l] == text[j + l])
80 { // i+l < n and j+l < n are not necessary, since text[n]=0 and text[i]!=0 (i<n) and i!=j
81 ++l;
82 }
83 sa[sa_1 - 1] = l; // overwrite sa array with lcp values
84 }
85 else
86 {
87 l = 0;
88 sa[n - 1] = 0;
89 }
90 }
91
92 for (size_type i = sa.size(); i > 1; --i) { sa[i - 1] = sa[i - 2]; }
93 sa[0] = 0;
94 lcp = std::move(sa);
95 }
96 store_to_cache(lcp, conf::KEY_LCP, config);
97}
98
100
115template <uint8_t t_width>
117{
118 static_assert(t_width == 0 or t_width == 8,
119 "construct_lcp_PHI: width must be `0` for integer alphabet and `8` for byte alphabet");
120 typedef int_vector<>::size_type size_type;
121 typedef int_vector<t_width> text_type;
122 const char * KEY_TEXT = key_text_trait<t_width>::KEY_TEXT;
124 size_type n = sa_buf.size();
125
126 assert(n > 0);
127 if (1 == n)
128 { // Handle special case: Input only the sentinel character.
129 int_vector<> lcp(1, 0);
130 store_to_cache(lcp, conf::KEY_LCP, config);
131 return;
132 }
133
134 // (1) Calculate PHI (stored in array plcp)
135 int_vector<> plcp(n, 0, sa_buf.width());
136 for (size_type i = 0, sai_1 = 0; i < n; ++i)
137 {
138 size_type sai = sa_buf[i];
139 plcp[sai] = sai_1;
140 sai_1 = sai;
141 }
142
143 // (2) Load text from disk
144 text_type text;
145 load_from_cache(text, KEY_TEXT, config);
146
147 // (3) Calculate permuted LCP array (text order), called PLCP
148 size_type max_l = 0;
149 for (size_type i = 0, l = 0; i < n - 1; ++i)
150 {
151 size_type phii = plcp[i];
152 while (text[i + l] == text[phii + l]) { ++l; }
153 plcp[i] = l;
154 if (l)
155 {
156 max_l = std::max(max_l, l);
157 --l;
158 }
159 }
160 util::clear(text);
161 uint8_t lcp_width = bits::hi(max_l) + 1;
162
163 // (4) Transform PLCP into LCP
164 std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
165 size_type buffer_size = 1000000; // buffer_size is a multiple of 8!
166 int_vector_buffer<> lcp_buf(lcp_file, std::ios::out, buffer_size, lcp_width); // open buffer for lcp
167 lcp_buf[0] = 0;
168 sa_buf.buffersize(buffer_size);
169 for (size_type i = 1; i < n; ++i)
170 {
171 size_type sai = sa_buf[i];
172 lcp_buf[i] = plcp[sai];
173 }
174 lcp_buf.close();
176}
177
179
196{
197 typedef int_vector<>::size_type size_type;
199 size_type n = sa_buf.size();
200 if (1 == n)
201 {
202 int_vector<> lcp(1, 0);
203 store_to_cache(lcp, conf::KEY_LCP, config);
204 return;
205 }
206 const uint8_t log_q = 6; // => q=64
207 const uint32_t q = 1 << log_q;
208 const uint64_t modq = bits::lo_set[log_q];
209
210 // n-1 is the maximum entry in SA
211 int_vector<64> plcp((n - 1 + q) >> log_q);
212
213 for (size_type i = 0, sai_1 = 0; i < n; ++i)
214 { // we can start at i=0. if SA[i]%q==0
215 // we set PHI[(SA[i]=n-1)%q]=0, since T[0]!=T[n-1]
216 size_type sai = sa_buf[i];
217 if ((sai & modq) == 0)
218 {
219 if ((sai >> log_q) >= plcp.size())
220 {
221 // std::cerr<<"sai="<<sai<<" log_q="<<log_q<<" sai>>log_q="<<(sai>>log_q)<<"
222 // "<<sai_1<<std::endl; std::cerr<<"n="<<n<<" "<<" plcp.size()="<<plcp.size();
223 }
224 plcp[sai >> log_q] = sai_1;
225 }
226 sai_1 = sai;
227 }
228
229 int_vector<8> text;
230 load_from_cache(text, conf::KEY_TEXT, config);
231
232 for (size_type i = 0, j, k, l = 0; i < plcp.size(); ++i)
233 {
234 j = i << log_q; // j=i*q
235 k = plcp[i];
236 while (text[j + l] == text[k + l]) ++l;
237 plcp[i] = l;
238 if (l >= q) { l -= q; }
239 else
240 {
241 l = 0;
242 }
243 }
244
245 size_type buffer_size = 4000000; // buffer_size is a multiple of 8!
246 sa_buf.buffersize(buffer_size);
248 std::ios::out,
249 buffer_size,
250 sa_buf.width()); // open buffer for plcp
251
252 for (size_type i = 0, sai_1 = 0, l = 0, sai = 0, iq = 0; i < n; ++i)
253 {
254 /*size_type*/ sai = sa_buf[i];
255 // std::cerr<<"i="<<i<<" sai="<<sai<<std::endl;
256 if ((sai & modq) == 0)
257 { // we have already worked the value out ;)
258 lcp_out_buf[i] = l = plcp[sai >> log_q];
259 }
260 else
261 {
262 /*size_type*/ iq = sai & bits::lo_unset[log_q];
263 l = plcp[sai >> log_q];
264 if (l > (sai - iq))
265 l -= (sai - iq);
266 else
267 l = 0;
268 while (text[sai + l] == text[sai_1 + l]) ++l;
269 lcp_out_buf[i] = l;
270 }
271#ifdef CHECK_LCP
272 size_type j = 0;
273 for (j = 0; j < l; ++j)
274 {
275 if (text[sai + j] != text[sai_1 + j])
276 {
277 std::cout << "lcp[" << i << "]=" << l << " is two big! " << j << " is right!"
278 << " sai=" << sai << std::endl;
279 if ((sai & modq) != 0)
280 std::cout << " plcp[sai>>log_q]=" << plcp[sai >> log_q] << " sai-iq=" << sai - iq << " sai=" << sai
281 << " sai-iq=" << sai - iq << std::endl;
282 break;
283 }
284 }
285#endif
286 sai_1 = sai;
287 }
288 lcp_out_buf.close();
290 return;
291}
292
294
312inline void construct_lcp_go(cache_config & config)
313{
314 typedef int_vector<>::size_type size_type;
315#ifdef STUDY_INFORMATIONS
316 size_type racs = 0; // random accesses to the text
317 size_type matches = 0;
318 size_type comps2 = 0; // comparisons the second phase
319#endif
320 int_vector<8> text;
321 load_from_cache(text, conf::KEY_TEXT, config);
322 int_vector_buffer<> sa_buf(cache_file_name(conf::KEY_SA, config)); // initialize buffer for suffix array
323 const size_type n = sa_buf.size();
324 const size_type m = 254; // LCP[i] == m+1 corresp. to LCP[i]>= m+1; LCP[i] <= m corresp. to LCP[i] was calculated
325
326 if (1 == n)
327 {
328 int_vector<> lcp(1, 0);
329 store_to_cache(lcp, conf::KEY_LCP, config);
330 return;
331 }
332
333 size_type cnt_c[257] = { 0 }; // counter for each character in the text
334 size_type cnt_cc[257] = { 0 }; // prefix sum of the counter cnt_c
335 size_type cnt_cc2[257] = { 0 }; //
336 size_type omitted_c[257] = { 0 }; // counts the omitted occurrences for the second phase
337 size_type prev_occ_in_bwt[256] = { 0 }; // position of the previous occurrence of each character c in the bwt
338 for (size_type i = 0; i < 256; ++i) prev_occ_in_bwt[i] = (size_type)-1; // initialize the array with -1
339 unsigned char alphabet[257] = { 0 };
340 uint8_t sigma = 0;
341
342 tLI m_list[2][256];
343 size_type m_char_count[2] = { 0 };
344 uint8_t m_chars[2][256] = { { 0 }, { 0 } };
345
346 size_type nn = 0; // n' for phase 2
347 // phase 1: calculate lcp_sml; memory consumption: 2n bytes (lcp_sml=n bytes, text=n bytes)
348 {
349
350 int_vector<8> lcp_sml(n,
351 0); // initialize array for small values of first phase; note lcp[0]=0
352 size_type done_cnt = 0;
353
354 for (size_type i = 0; i < n; ++i)
355 { // initialize cnt_c
356 ++cnt_c[text[i] + 1];
357 }
358 for (int i = 1; i < 257; ++i)
359 { // calculate sigma and initailize cnt_cc
360 if (cnt_c[i] > 0) { alphabet[sigma++] = (unsigned char)(i - 1); }
361 cnt_cc[i] = cnt_c[i] + cnt_cc[i - 1];
362 }
363 alphabet[sigma] = '\0';
364 {
365 int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // initialize buffer of bwt
366 size_type sai_1 = sa_buf[0]; // store value of sa[i-1]
367 uint8_t bwti_1 = bwt_buf[0]; // store value of BWT[i-1]
368 lcp_sml[cnt_cc[bwti_1]++] = 0; // lcp_sml[ LF[0] ] = 0
369 prev_occ_in_bwt[bwti_1] = 0; // init previous occurence of character BWT[0]
370 ++omitted_c[alphabet[0]]; //
371
372 int_vector<64> rmq_stack(2 * (m + 10)); // initialize stack for m+10 elements representing (position, value)
373 rmq_stack[0] = 0;
374 rmq_stack[1] = 0; // first element (-1, -1)
375 rmq_stack[2] = 1;
376 rmq_stack[3] = 0; // second element (0, -1)
377 size_type rmq_end = 3; // index of the value of the topmost element
378
379 const size_type m_mod2 = m % 2;
380 uint8_t cur_c = alphabet[1];
381 size_type big_val = 0;
382 for (size_type i = 1, sai, cur_c_idx = 1, cur_c_cnt = cnt_c[alphabet[1] + 1]; i < n; ++i, --cur_c_cnt)
383 {
384 uint8_t bwti = bwt_buf[i];
385 sai = sa_buf[i];
386 size_type lf = cnt_cc[bwti];
387 if (!cur_c_cnt)
388 { // cur_c_cnt==0, if there is no more occurence of the current character
389 if (cur_c_cnt < sigma) { cur_c_cnt = cnt_c[(cur_c = alphabet[++cur_c_idx]) + 1]; }
390 }
391 size_type l = 0;
392 if (i >= cnt_cc[cur_c])
393 { // if the current lcp entry is not already done TODO: schleife von i bis cnt_cc[cur_c]
394 if (bwti == bwti_1 and lf < i)
395 { // BWT[i]==BWT[i-1]
396 l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0; // l = LCP[LF[i]]-1; l < m+1
397 if (l == m)
398 { // if LCP[LF[i]] == m+1; otherwise LCP[LF[i]] < m+1 the result is correct
399 l += (text[sai_1 + m] == text[sai + m]);
400#ifdef STUDY_INFORMATIONS
401 if ((sai_1 ^ sai) >> 6) // if i and phii are in the same cache line
402 ++racs;
403#endif
404 }
405 lcp_sml[i] = l;
406 ++done_cnt;
407 }
408 else
409 { // BWT[i] != BWT[i-1] or LF[i] > i
410 if (lf < i) l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0;
411#ifdef STUDY_INFORMATIONS
412 if ((sai_1 ^ sai) >> 6) // if i and phii are in the same cache line
413 ++racs;
414#endif
415 while (text[sai_1 + l] == text[sai + l] and l < m + 1)
416 {
417 ++l;
418#ifdef STUDY_INFORMATIONS
419 ++matches;
420#endif
421 }
422 lcp_sml[i] = l;
423 }
424 }
425 else
426 { // if already done
427 l = lcp_sml[i]; // load LCP value
428 }
429 if (l > m)
430 {
431 ++big_val;
432 if (i > 10000 and i < 10500 and big_val > 3000)
433 { // if most of the values are big: switch to PHI algorithm
434 util::clear(text);
435 util::clear(lcp_sml);
436 construct_lcp_PHI<8>(config);
437 return;
438 }
439 }
440 // invariant: l <= m+1
441 // begin update rmq_stack
442 size_type x = l + 1;
443 size_type j = rmq_end;
444 while (x <= rmq_stack[j]) j -= 2; // pop all elements with value >= l
445 rmq_stack[++j] = i + 1; // push position i
446 rmq_stack[++j] = x; // push value l
447 rmq_end = j; // update index of the value of the topmost element
448 if (lf > i)
449 { // if LF[i] > i, we can calculate LCP[LF[i]] in constant time with rmq
450 ++done_cnt;
451 // rmq query for lcp-values in the interval I=[prev_occ_in_bwt[BWT[i]]+1..i]
452 // rmq is linear in the stack size; can also be implemented with binary search on the stack
453 size_type x_pos = prev_occ_in_bwt[bwti] + 2;
454 j = rmq_end - 3;
455 while (x_pos <= rmq_stack[j]) j -= 2; // search smallest value in the interval I
456 lcp_sml[lf] = rmq_stack[j + 3] -
457 (rmq_stack[j + 3] == m + 2); // if lcp-value equals m+1, we subtract 1
458 }
459 if (l >= m)
460 {
461 if (l == m) push_front_m_index(nn, cur_c, m_list[m_mod2], m_chars[m_mod2], m_char_count[m_mod2]);
462 ++nn;
463 }
464 else
465 ++omitted_c[cur_c];
466
467 prev_occ_in_bwt[bwti] = i; // update previous position information for character BWT[i]
468 ++cnt_cc[bwti]; // update counter and therefore the LF information
469 sai_1 = sai; // update SA[i-1]
470 bwti_1 = bwti; // update BWT[i-1]
471 }
472 }
473 util::clear(text);
474
475 if (n > 1000 and nn > 5 * (n / 6))
476 { // if we would occupy more space than the PHI algorithm => switch to PHI algorithm
477 util::clear(lcp_sml);
478 construct_lcp_PHI<8>(config);
479 return;
480 }
481 store_to_cache(lcp_sml, "lcp_sml", config);
482 }
483#ifdef STUDY_INFORMATIONS
484 std::cout << "# n=" << n << " nn=" << nn << " nn/n=" << ((double)nn) / n << std::endl;
485#endif
486
487 // phase 2: calculate lcp_big
488 {
489 // std::cout<<"# begin calculating LF' values"<<std::endl;
490 int_vector<> lcp_big(nn,
491 0,
492 bits::hi(n - 1) + 1); // lcp_big first contains adapted LF values and finally the big LCP
493 // values
494 {
495 // initialize lcp_big with adapted LF values
496 bit_vector todo(n, 0); // bit_vector todo indicates which values are >= m in lcp_sml
497 {
498 // initialize bit_vector todo
499 int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config)); // load lcp_sml
500 for (size_type i = 0; i < n; ++i)
501 {
502 if (lcp_sml_buf[i] >= m) { todo[i] = 1; }
503 }
504 }
505
506 cnt_cc2[0] = cnt_cc[0] = 0;
507 for (size_type i = 1, omitted_sum = 0; i < 257; ++i)
508 { // initialize cnt_cc and cnt_cc2
509 cnt_cc[i] = cnt_c[i] + cnt_cc[i - 1];
510 omitted_sum += omitted_c[i - 1];
511 cnt_cc2[i] = cnt_cc[i] - omitted_sum;
512 }
513
514 int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // load BWT
515 for (size_type i = 0, i2 = 0; i < n; ++i)
516 {
517 uint8_t b = bwt_buf[i]; // store BWT[i]
518 size_type lf_i = cnt_cc[b]; // LF[i]
519 if (todo[i])
520 { // LCP[i] is a big value
521 if (todo[lf_i])
522 { // LCP[LF[i]] is a big entry
523 lcp_big[i2] = cnt_cc2[b]; // LF'[i]
524 } /*else{
525 * lcp_big[i2] = 0;
526 }*/
527 ++i2;
528 }
529 if (todo[lf_i])
530 { // LCP[LF[i]] is a big entry
531 ++cnt_cc2[b]; // increment counter for adapted LF
532 }
533 ++cnt_cc[b]; // increment counter for LF
534 }
535 }
536
537 // std::cout<<"# begin initializing bwt2, shift_bwt2, run2"<<std::endl;
538 int_vector<8> bwt2(nn),
539 shift_bwt2(nn); // BWT of big LCP values, and shifted BWT of
540 // big LCP values
541 bit_vector run2(nn + 1); // indicates for each entry i, if i and i-1 are both big LCP values
542 run2[nn] = 0; // index nn is not a big LCP value
543 {
544 // initialize bwt2, shift_bwt2, adj2
545 int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config)); // load lcp_sml
546 int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // load BWT
547 uint8_t b_1 = '\0'; // BWT[i-1]
548 bool is_run = false;
549 for (size_type i = 0, i2 = 0; i < n; ++i)
550 {
551 uint8_t b = bwt_buf[i];
552 if (lcp_sml_buf[i] >= m)
553 {
554 bwt2[i2] = b;
555 shift_bwt2[i2] = b_1;
556 run2[i2] = is_run;
557 is_run = true;
558 ++i2;
559 }
560 else
561 {
562 is_run = false;
563 }
564 b_1 = b;
565 }
566 }
567
568 bit_vector todo2(nn + 1, 1); // init all values with 1, except
569 todo2[nn] = 0; // the last one! (handels case "i < nn")
570
571 // std::cout<<"# begin calculating m-indices"<<std::endl;
572 {
573 // calculate m-indices, (m+1)-indices,... until we are done
574 size_type m2 = m;
575 size_type char_ex[256];
576 for (size_type i = 0; i < 256; ++i) char_ex[i] = nn;
577 size_type char_occ = 0;
578 size_type m_mod2 = m2 % 2, mm1_mod2 = (m2 + 1) % 2;
579 while (m_char_count[m_mod2] > 0)
580 { // while there are m-indices, calculate (m+1)-indices and write m-indices
581 // For all values LCP[i] >= m2 it follows that todo2[i] == 1
582 // list m_list[mm1_mod2][b] is sorted in decreasing order
583 ++m2;
584 mm1_mod2 = (m2 + 1) % 2, m_mod2 = m2 % 2;
585 m_char_count[m_mod2] = 0;
586
587 std::sort(m_chars[mm1_mod2],
588 m_chars[mm1_mod2] + m_char_count[mm1_mod2]); // TODO: ersetzen?
589
590 for (size_type mc = 0; mc < m_char_count[mm1_mod2]; ++mc)
591 { // for every character
592 tLI & mm1_mc_list = m_list[mm1_mod2][m_chars[mm1_mod2][m_char_count[mm1_mod2] - 1 - mc]];
593 // size_type old_i = nn;
594 while (!mm1_mc_list.empty())
595 {
596 size_type i = mm1_mc_list.front(); // i in [0..n-1]
597 mm1_mc_list.pop_front();
598 // For all values LCP[i] >= m-1 it follows that todo2[i] == 1
599 for (size_type k = i; todo2[k]; --k)
600 {
601#ifdef STUDY_INFORMATIONS
602 ++comps2;
603#endif
604 uint8_t b = shift_bwt2[k];
605 if (char_ex[b] != i)
606 {
607 char_ex[b] = i;
608 ++char_occ;
609 }
610 if (!run2[k]) break;
611 }
612 for (size_type k = i; todo2[k] and char_occ; ++k)
613 {
614#ifdef STUDY_INFORMATIONS
615 ++comps2;
616#endif
617 uint8_t b = bwt2[k];
618 if (char_ex[b] == i)
619 {
620 size_type p = lcp_big[k];
621 push_back_m_index(p, b, m_list[m_mod2], m_chars[m_mod2], m_char_count[m_mod2]);
622 char_ex[b] = nn;
623 --char_occ;
624 }
625 if (!run2[k + 1]) break;
626 }
627 lcp_big[i] = m2 - 1;
628 todo2[i] = 0;
629 // old_i = i;
630 }
631 }
632 }
633 }
634
635 store_to_cache(lcp_big, "lcp_big", config);
636 } // end phase 2
637
638 // std::cout<<"# merge lcp_sml and lcp_big"<<std::endl;
639 // phase 3: merge lcp_sml and lcp_big and save to disk
640 {
641 const size_type buffer_size = 1000000; // buffer_size has to be a multiple of 8!
642 int_vector_buffer<> lcp_big_buf(cache_file_name("lcp_big",
643 config)); // file buffer containing the big LCP values
644 int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config),
645 std::ios::in,
646 buffer_size); // file buffer containing the small LCP values
648 std::ios::out,
649 buffer_size,
650 lcp_big_buf.width()); // buffer for the resulting LCP array
651 for (size_type i = 0, i2 = 0; i < n; ++i)
652 {
653 size_type l = lcp_sml_buf[i];
654 if (l >= m)
655 { // if l >= m it is stored in lcp_big
656 l = lcp_big_buf[i2];
657 ++i2;
658 }
659 lcp_buf[i] = l;
660 }
661 lcp_buf.close();
662 }
664
665#ifdef STUDY_INFORMATIONS
666 std::cout << "# racs: " << racs << std::endl;
667 std::cout << "# matches: " << matches << std::endl;
668 std::cout << "# comps2: " << comps2 << std::endl;
669#endif
670 return;
671}
672
674
693{
694 typedef int_vector<>::size_type size_type;
695 int_vector<8> text;
696 load_from_cache(text, conf::KEY_TEXT, config); // load text from file system
697 int_vector_buffer<> sa_buf(cache_file_name(conf::KEY_SA, config)); // initialize buffer for suffix array
698 const size_type n = sa_buf.size();
699 const size_type m = 254; // LCP[i] == m+1 corresp. to LCP[i]>= m+1; LCP[i] <= m corresp. to LCP[i] was calculated
700
701 if (1 == n)
702 {
703 int_vector<> lcp(1, 0);
704 store_to_cache(lcp, conf::KEY_LCP, config);
705 return;
706 }
707
708 size_type cnt_c[257] = { 0 }; // counter for each character in the text
709 size_type cnt_cc[257] = { 0 }; // prefix sum of the counter cnt_c
710 size_type omitted_c[257] = { 0 }; // counts the omitted occurrences for the second phase
711 size_type prev_occ_in_bwt[256] = { 0 }; // position of the previous occurrence of each character c in the bwt
712 for (size_type i = 0; i < 256; ++i) prev_occ_in_bwt[i] = (size_type)-1; // initialize the array with -1
713 unsigned char alphabet[257] = { 0 };
714 uint8_t sigma = 0;
715
716 size_type nn = 0; // n' for phase 2
717 // phase 1: calculate lcp_sml; memory consumption: 2n bytes (lcp_sml=n bytes, text=n bytes)
718 {
719
720 int_vector<8> lcp_sml(n,
721 0); // initialize array for small values of first phase; note lcp[0]=0
722 size_type done_cnt = 0;
723
724 for (size_type i = 0; i < n; ++i)
725 { // initialize cnt_c
726 ++cnt_c[text[i] + 1];
727 }
728 for (int i = 1; i < 257; ++i)
729 { // calculate sigma and initailize cnt_cc
730 if (cnt_c[i] > 0) { alphabet[sigma++] = (unsigned char)(i - 1); }
731 cnt_cc[i] = cnt_c[i] + cnt_cc[i - 1];
732 }
733 alphabet[sigma] = '\0';
734 {
735 int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // initialize buffer of bwt
736 size_type sai_1 = sa_buf[0]; // store value of sa[i-1]
737 uint8_t bwti_1 = bwt_buf[0]; // store value of BWT[i-1]
738 lcp_sml[cnt_cc[bwti_1]++] = 0; // lcp_sml[ LF[0] ] = 0
739 prev_occ_in_bwt[bwti_1] = 0; // init previous occurence of character BWT[0]
740 ++omitted_c[alphabet[0]]; //
741
742 int_vector<64> rmq_stack(2 * (m + 10)); // initialize stack for m+10 elements representing (position, value)
743 rmq_stack[0] = 0;
744 rmq_stack[1] = 0; // first element (-1, -1)
745 rmq_stack[2] = 1;
746 rmq_stack[3] = 0; // second element (0, -1)
747 size_type rmq_end = 3; // index of the value of the topmost element
748
749 uint8_t cur_c = alphabet[1];
750 for (size_type i = 1, sai, cur_c_idx = 1, cur_c_cnt = cnt_c[alphabet[1] + 1]; i < n; ++i, --cur_c_cnt)
751 {
752 uint8_t bwti = bwt_buf[i];
753 sai = sa_buf[i];
754 size_type lf = cnt_cc[bwti];
755 if (!cur_c_cnt)
756 { // cur_c_cnt==0, if there is no more occurence of the current character
757 if (cur_c_cnt < sigma) { cur_c_cnt = cnt_c[(cur_c = alphabet[++cur_c_idx]) + 1]; }
758 }
759 size_type l = 0;
760 if (i >= cnt_cc[cur_c])
761 { // if the current lcp entry is not already done TODO: schleife von i bis cnt_cc[cur_c]
762 if (bwti == bwti_1 and lf < i)
763 { // BWT[i]==BWT[i-1]
764 l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0; // l = LCP[LF[i]]-1; l < m+1
765 if (l == m)
766 { // if LCP[LF[i]] == m+1; otherwise LCP[LF[i]] < m+1 the result is correct
767 l += (text[sai_1 + m] == text[sai + m]);
768 }
769 lcp_sml[i] = l;
770 ++done_cnt;
771 }
772 else
773 { // BWT[i] != BWT[i-1] or LF[i] > i
774 if (lf < i) l = lcp_sml[lf] ? lcp_sml[lf] - 1 : 0;
775 while (text[sai_1 + l] == text[sai + l] and l < m + 1) { ++l; }
776 lcp_sml[i] = l;
777 }
778 }
779 else
780 { // if already done
781 l = lcp_sml[i]; // load LCP value
782 }
783 // invariant: l <= m+1
784 // begin update rmq_stack
785 size_type x = l + 1;
786 size_type j = rmq_end;
787 while (x <= rmq_stack[j]) j -= 2; // pop all elements with value >= l
788 rmq_stack[++j] = i + 1; // push position i
789 rmq_stack[++j] = x; // push value l
790 rmq_end = j; // update index of the value of the topmost element
791 if (lf > i)
792 { // if LF[i] > i, we can calculate LCP[LF[i]] in constant time with rmq
793 ++done_cnt;
794 // rmq query for lcp-values in the interval I=[prev_occ_in_bwt[BWT[i]]+1..i]
795 // rmq is linear in the stack size; can also be implemented with binary search on the stack
796 size_type x_pos = prev_occ_in_bwt[bwti] + 2;
797 j = rmq_end - 3;
798 while (x_pos <= rmq_stack[j]) j -= 2; // search smallest value in the interval I
799 lcp_sml[lf] = rmq_stack[j + 3] -
800 (rmq_stack[j + 3] == m + 2); // if lcp-value equals m+1, we subtract 1
801 }
802 if (l > m) { ++nn; }
803 else
804 ++omitted_c[cur_c];
805
806 prev_occ_in_bwt[bwti] = i; // update previous position information for character BWT[i]
807 ++cnt_cc[bwti]; // update counter and therefore the LF information
808 sai_1 = sai; // update SA[i-1]
809 bwti_1 = bwti; // update BWT[i-1]
810 }
811 }
812 store_to_cache(lcp_sml, "lcp_sml", config);
813 }
814
815 // phase 2: calculate lcp_big with PHI algorithm on remaining entries of LCP
816 {
817 int_vector<> lcp_big(0, 0, bits::hi(n - 1) + 1); // nn, 0, bits::hi(n-1)+1);
818 {
819
820 memory_monitor::event("lcp-init-phi-begin");
821 size_type sa_n_1 = 0; // value for SA[n-1]
822 bit_vector todo(n, 0); // bit_vector todo indicates which values are > m in lcp_sml
823 {
824 // initialize bit_vector todo
825 int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config)); // load lcp_sml
826 for (size_type i = 0; i < n; ++i)
827 {
828 if (lcp_sml_buf[i] > m) { todo[sa_buf[i]] = 1; }
829 }
830 sa_n_1 = sa_buf[n - 1];
831 }
832 rank_support_v<> todo_rank(&todo); // initialize rank for todo
833
834 const size_type bot = sa_n_1;
835 int_vector<> phi(nn, bot, bits::hi(n - 1) + 1); // phi
836
837 int_vector_buffer<8> bwt_buf(cache_file_name(conf::KEY_BWT, config)); // load BWT
838 int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config)); // load lcp_sml
839 uint8_t b_1 = 0;
840 for (size_type i = 0, sai_1 = 0; i < n; ++i)
841 { // initialize phi
842 uint8_t b = bwt_buf[i]; // store BWT[i]
843 size_type sai = sa_buf[i];
844 if (lcp_sml_buf[i] > m and b != b_1)
845 { // if i is a big irreducable value
846 phi[todo_rank(sai)] = sai_1;
847 } // otherwise phi is equal to bot
848 b_1 = b;
849 sai_1 = sai;
850 }
851 memory_monitor::event("lcp-init-phi-end");
852
853 memory_monitor::event("lcp-calc-plcp-begin");
854 for (size_type i = 0, ii = 0, l = m + 1, p = 0; i < n and ii < nn; ++i)
855 { // execute compact Phi algorithm
856 if (todo[i])
857 {
858 if (i > 0 and todo[i - 1])
859 l = l - 1;
860 else
861 l = m + 1;
862 if ((p = phi[ii]) != bot)
863 {
864 while (text[i + l] == text[p + l]) ++l;
865 }
866 phi[ii++] = l;
867 }
868 }
869 memory_monitor::event("lcp-calc-plcp-end");
870 util::clear(text);
871
872 memory_monitor::event("lcp-calc-lcp-begin");
873 lcp_big.resize(nn);
874 for (size_type i = 0, ii = 0; i < n and ii < nn; ++i)
875 {
876 if (lcp_sml_buf[i] > m) { lcp_big[ii++] = phi[todo_rank(sa_buf[i])]; }
877 }
878 memory_monitor::event("lcp-calc-lcp-end");
879 }
880 store_to_cache(lcp_big, "lcp_big", config);
881 } // end phase 2
882
883 // std::cout<<"# merge lcp_sml and lcp_big"<<std::endl;
884 // phase 3: merge lcp_sml and lcp_big and save to disk
885 {
886 const size_type buffer_size = 1000000; // buffer_size has to be a multiple of 8!
887 int_vector_buffer<> lcp_big_buf(cache_file_name("lcp_big",
888 config)); // file buffer containing the big LCP values
889 int_vector_buffer<8> lcp_sml_buf(cache_file_name("lcp_sml", config),
890 std::ios::in,
891 buffer_size); // file buffer containing the small LCP values
893 std::ios::out,
894 buffer_size,
895 lcp_big_buf.width()); // file buffer for the resulting LCP array
896
897 for (size_type i = 0, i2 = 0; i < n; ++i)
898 {
899 size_type l = lcp_sml_buf[i];
900 if (l > m)
901 { // if l > m it is stored in lcp_big
902 l = lcp_big_buf[i2];
903 ++i2;
904 }
905 lcp_buf[i] = l;
906 }
907 lcp_big_buf.close(true); // close buffer and remove file
908 lcp_sml_buf.close(true); // close buffer and remove file
909 }
911 return;
912}
913
915
930template <typename t_wt = wt_huff<bit_vector, rank_support_v<>, select_support_scan<1>, select_support_scan<0>>>
932{
933 typedef int_vector<>::size_type size_type;
934 std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
935
936 // create WT
937 memory_monitor::event("lcp-bwt-create-wt-huff-begin");
938 t_wt wt_bwt;
939 construct(wt_bwt, cache_file_name(conf::KEY_BWT, config));
940 uint64_t n = wt_bwt.size();
941 memory_monitor::event("lcp-bwt-create-wt-huff-end");
942
943 // init
944 memory_monitor::event("lcp-bwt-init-begin");
945 size_type lcp_value = 0; // current LCP value
946 size_type lcp_value_offset = 0; // Largest LCP value in LCP array, that was written on disk
947 size_type phase = 0; // Count how often the LCP array was written on disk
948
949 size_type intervals = 0; // number of intervals which are currently stored
950 size_type intervals_new = 0; // number of new intervals
951
952 std::queue<size_type> q; // Queue for storing the intervals
953 std::vector<bit_vector> dict(2); // bit_vector for storing the intervals
954 size_type source = 0, target = 1; // Defines which bit_vector is source and which is target
955 bool queue_used = true;
956 size_type use_queue_and_wt = n / 2048; // if intervals < use_queue_and_wt, then use queue and wavelet tree
957 // else use dictionary and wavelet tree
958
959 size_type quantity; // quantity of characters in interval
960 std::vector<unsigned char> cs(wt_bwt.sigma); // list of characters in the interval
961 std::vector<size_type> rank_c_i(wt_bwt.sigma); // number of occurrence of character in [0 .. i-1]
962 std::vector<size_type> rank_c_j(wt_bwt.sigma); // number of occurrence of character in [0 .. j-1]
963
964 // Calculate how many bit are for each lcp value available, to limit the memory usage to 20n bit = 2,5n byte, use at
965 // moste 8 bit
966 size_type bb = (n * 20 - size_in_bytes(wt_bwt) * 8 * 1.25 - 5 * n) /
967 n; // 20n - size of wavelet tree * 1.25 for rank support - 5n for bit arrays - n for index_done array
968 if (n * 20 < size_in_bytes(wt_bwt) * 8 * 1.25 + 5 * n) { bb = 6; }
969 bb = std::min(bb, (size_type)8);
970
971 size_type lcp_value_max = (1ULL << bb) - 1; // Largest LCP value that can be stored into the LCP array
972 size_type space_in_bit_for_lcp = n * bb; // Space for the LCP array in main memory
973
974#ifdef STUDY_INFORMATIONS
975 std::cout << "# l=" << n << " b=" << (int)bb << " lcp_value_max=" << lcp_value_max
976 << " size_in_bytes(wt_bwt<v,bs,bs>)=" << size_in_bytes(wt_bwt) << std::endl;
977#endif
978
979 // init partial_lcp
980 int_vector<> partial_lcp(n, 0, bb); // LCP array
981
982 // init index_done
983 bit_vector index_done(n + 1, false); // bit_vector indicates if entry LCP[i] is amend
984 rank_support_v<> ds_rank_support; // Rank support for bit_vector index_done
985
986 // create C-array
987 std::vector<size_type> C; // C-Array: C[i] = number of occurrences of characters < i in the input
988 create_C_array(C, wt_bwt);
989 memory_monitor::event("lcp-bwt-init-begin-end");
990 // calculate lcp
991 memory_monitor::event("lcp-bwt-calc-values-begin");
992
993 // calculate first intervals
994 partial_lcp[0] = 0;
995 index_done[0] = true;
996 interval_symbols(wt_bwt, 0, n, quantity, cs, rank_c_i, rank_c_j);
997 for (size_type i = 0; i < quantity; ++i)
998 {
999 unsigned char c = cs[i];
1000 size_type a_new = C[c] + rank_c_i[i];
1001 size_type b_new = C[c] + rank_c_j[i];
1002
1003 // Save LCP value if not seen before
1004 if (!index_done[b_new])
1005 {
1006 if (b_new < n) partial_lcp[b_new] = lcp_value;
1007 index_done[b_new] = true;
1008 // Save interval
1009 q.push(a_new);
1010 q.push(b_new);
1011 ++intervals;
1012 }
1013 }
1014 ++lcp_value;
1015
1016 // calculate LCP values phase by phase
1017 while (intervals)
1018 {
1019 if (intervals < use_queue_and_wt && !queue_used)
1020 {
1021 memory_monitor::event("lcp-bwt-bitvec2queue-begin");
1022 util::clear(dict[target]);
1023
1024 // copy from bitvector to queue
1025 size_type a2 = util::next_bit(dict[source], 0);
1026 size_type b2 = util::next_bit(dict[source], a2 + 1);
1027 while (b2 < dict[source].size())
1028 {
1029 q.push((a2 - 1) >> 1);
1030 q.push(b2 >> 1);
1031 // get next interval
1032 a2 = util::next_bit(dict[source], b2 + 1);
1033 b2 = util::next_bit(dict[source], a2 + 1);
1034 }
1035 util::clear(dict[source]);
1036 memory_monitor::event("lcp-bwt-bitvec2queue-end");
1037 }
1038 if (intervals >= use_queue_and_wt && queue_used)
1039 {
1040 memory_monitor::event("lcp-bwt-queue2bitvec-begin");
1041 dict[source].resize(2 * (n + 1));
1042
1043 util::set_to_value(dict[source], 0);
1044 // copy from queue to bitvector
1045 while (!q.empty())
1046 {
1047 dict[source][(q.front() << 1) + 1] = 1;
1048 q.pop();
1049 dict[source][(q.front() << 1)] = 1;
1050 q.pop();
1051 }
1052 dict[target].resize(2 * (n + 1));
1053
1054 util::set_to_value(dict[target], 0);
1055 memory_monitor::event("lcp-bwt-queue2bitvec-end");
1056 }
1057
1058 if (intervals < use_queue_and_wt)
1059 {
1060 queue_used = true;
1061 intervals_new = 0;
1062 while (intervals)
1063 {
1064 // get next interval
1065 size_type a = q.front();
1066 q.pop();
1067 size_type b = q.front();
1068 q.pop();
1069 --intervals;
1070
1071 interval_symbols(wt_bwt, a, b, quantity, cs, rank_c_i, rank_c_j);
1072 for (size_type i = 0; i < quantity; ++i)
1073 {
1074 unsigned char c = cs[i];
1075 size_type a_new = C[c] + rank_c_i[i];
1076 size_type b_new = C[c] + rank_c_j[i];
1077
1078 // Save LCP value if not seen before
1079 if (!index_done[b_new] and phase == 0)
1080 {
1081 partial_lcp[b_new] = lcp_value;
1082 index_done[b_new] = true;
1083 // Save interval
1084 q.push(a_new);
1085 q.push(b_new);
1086 ++intervals_new;
1087 }
1088 else if (!index_done[b_new])
1089 {
1090 size_type insert_pos = b_new - ds_rank_support.rank(b_new);
1091 if (!partial_lcp[insert_pos])
1092 {
1093 partial_lcp[insert_pos] = lcp_value - lcp_value_offset;
1094 // Save interval
1095 q.push(a_new);
1096 q.push(b_new);
1097 ++intervals_new;
1098 }
1099 }
1100 }
1101 }
1102 intervals = intervals_new;
1103 }
1104 else
1105 {
1106 queue_used = false;
1107 intervals = 0;
1108
1109 // get next interval
1110 size_type a2 = util::next_bit(dict[source], 0);
1111 size_type b2 = util::next_bit(dict[source], a2 + 1);
1112
1113 while (b2 < dict[source].size())
1114 {
1115 interval_symbols(wt_bwt, ((a2 - 1) >> 1), (b2 >> 1), quantity, cs, rank_c_i, rank_c_j);
1116 for (size_type i = 0; i < quantity; ++i)
1117 {
1118 unsigned char c = cs[i];
1119 size_type a_new = C[c] + rank_c_i[i];
1120 size_type b_new = C[c] + rank_c_j[i];
1121 // Save LCP value if not seen before
1122 if (!index_done[b_new] and phase == 0)
1123 {
1124 partial_lcp[b_new] = lcp_value;
1125 index_done[b_new] = true;
1126 // Save interval
1127 dict[target][(a_new << 1) + 1] = 1;
1128 dict[target][(b_new << 1)] = 1;
1129 ++intervals;
1130 }
1131 else if (!index_done[b_new])
1132 {
1133 size_type insert_pos = b_new - ds_rank_support.rank(b_new);
1134 if (!partial_lcp[insert_pos])
1135 {
1136 partial_lcp[insert_pos] = lcp_value - lcp_value_offset;
1137 // Save interval
1138 dict[target][(a_new << 1) + 1] = 1;
1139 dict[target][(b_new << 1)] = 1;
1140 ++intervals;
1141 }
1142 }
1143 }
1144 // get next interval
1145 a2 = util::next_bit(dict[source], b2 + 1);
1146 b2 = util::next_bit(dict[source], a2 + 1);
1147 }
1148 std::swap(source, target);
1149 util::set_to_value(dict[target], 0);
1150 }
1151 ++lcp_value;
1152 if (lcp_value >= lcp_value_max)
1153 {
1154 memory_monitor::event("lcp-bwt-write-to-file-begin");
1155 if (phase) { insert_lcp_values(partial_lcp, index_done, lcp_file, lcp_value, lcp_value_offset); }
1156 else
1157 {
1158 store_to_file(partial_lcp, lcp_file);
1159 }
1160 memory_monitor::event("lcp-bwt-write-to-file-end");
1161 memory_monitor::event("lcp-bwt-resize-variables-begin");
1162 util::init_support(ds_rank_support, &index_done); // Create rank support
1163
1164 // Recalculate lcp_value_max and resize partial_lcp
1165 lcp_value_offset = lcp_value_max - 1;
1166 size_type remaining_lcp_values = index_done.size() - ds_rank_support.rank(index_done.size());
1167
1168 uint8_t int_width_new = std::min(space_in_bit_for_lcp / remaining_lcp_values,
1169 (size_type)bits::hi(n - 1) + 1);
1170 lcp_value_max = lcp_value_offset + (1ULL << int_width_new);
1171#ifdef STUDY_INFORMATIONS
1172 std::cout << "# l=" << remaining_lcp_values << " b=" << (int)int_width_new
1173 << " lcp_value_max=" << lcp_value_max << std::endl;
1174#endif
1175 // partial_lcp = int_vector<>(remaining_lcp_values, 0, int_width_new);
1176 partial_lcp.width(int_width_new);
1177 partial_lcp.resize(remaining_lcp_values);
1178 partial_lcp.shrink_to_fit();
1179 util::set_to_value(partial_lcp, 0);
1180 ++phase;
1181 memory_monitor::event("lcp-bwt-resize-variables-end");
1182 }
1183 }
1184 memory_monitor::event("lcp-bwt-calc-values-end");
1185
1186 // merge to file
1187 memory_monitor::event("lcp-bwt-merge-to-file-begin");
1188 if (phase) { insert_lcp_values(partial_lcp, index_done, lcp_file, lcp_value, lcp_value_offset); }
1189 else
1190 {
1191 store_to_file(partial_lcp, lcp_file);
1192 }
1194
1195 memory_monitor::event("lcp-bwt-merge-to-file-end");
1196}
1197
1199
1214template <typename t_wt = wt_huff<bit_vector, rank_support_v<>, select_support_scan<1>, select_support_scan<0>>>
1216{
1217 typedef int_vector<>::size_type size_type;
1218
1219 uint64_t n; // Input length
1220 size_type buffer_size = 1000000; // Size of the buffer
1221 size_type lcp_value = 0; // current LCP value
1222 std::string tmp_lcp_file = cache_file_name(conf::KEY_LCP, config) + "_tmp";
1223 // (1) Calculate LCP-Positions-Array: For each lcp_value (in ascending order) all its occurrences (in any order) in
1224 // the lcp array
1225 {
1226 memory_monitor::event("lcp-bwt2-create-wt-huff-begin");
1227 t_wt wt_bwt;
1228 construct(wt_bwt, cache_file_name(conf::KEY_BWT, config));
1229 n = wt_bwt.size();
1230 memory_monitor::event("lcp-bwt2-create-wt-huff-begin");
1231
1232 // Declare needed variables
1233 memory_monitor::event("lcp-bwt2-init-begin");
1234 size_type intervals = 0; // Number of intervals which are currently stored
1235 size_type intervals_new = 0; // Number of new intervals
1236
1237 std::queue<size_type> q; // Queue for storing the intervals
1238 std::vector<bit_vector> dict(2); // bit_vector for storing the intervals
1239 size_type source = 0, target = 1; // Defines which bit_vector is source and which is target
1240 bool queue_used = true; // Defines whether a queue (true) or the bit_vectors (false) was used to store intervals
1241 size_type use_queue_and_wt = n / 2048; // if intervals < use_queue_and_wt, then use queue and wavelet tree
1242 // else use dictionary and wavelet tree
1243
1244 size_type quantity; // quantity of characters in interval
1245 std::vector<unsigned char> cs(wt_bwt.sigma); // list of characters in the interval
1246 std::vector<size_type> rank_c_i(wt_bwt.sigma); // number of occurrence of character in [0 .. i-1]
1247 std::vector<size_type> rank_c_j(wt_bwt.sigma); // number of occurrence of character in [0 .. j-1]
1248
1249 // External storage of LCP-Positions-Array
1250 bool new_lcp_value = false;
1251 uint8_t int_width = bits::hi(n) + 2;
1252 int_vector_buffer<> lcp_positions_buf(tmp_lcp_file,
1253 std::ios::out,
1254 buffer_size,
1255 int_width); // Create buffer for positions of LCP entries
1256 size_type idx_out_buf = 0;
1257 bit_vector index_done(n + 1, 0); // Bitvector which is true, if corresponding LCP value was already calculated
1258
1259 // Create C-array
1260 std::vector<size_type> C; // C-Array: C[i] = number of occurrences of characters < i in the input
1261 create_C_array(C, wt_bwt);
1262 memory_monitor::event("lcp-bwt2-init-end");
1263 // Calculate LCP-Positions-Array
1264 memory_monitor::event("lcp-bwt2-calc-values-begin");
1265
1266 // Save position of first LCP-value
1267 lcp_positions_buf[idx_out_buf++] = 0;
1268 if (new_lcp_value)
1269 {
1270 lcp_positions_buf[idx_out_buf - 1] = lcp_positions_buf[idx_out_buf - 1] + n;
1271 new_lcp_value = false;
1272 }
1273 index_done[0] = true;
1274
1275 // calculate first intervals
1276 interval_symbols(wt_bwt, 0, n, quantity, cs, rank_c_i, rank_c_j);
1277 for (size_type i = 0; i < quantity; ++i)
1278 {
1279 unsigned char c = cs[i];
1280 size_type a_new = C[c] + rank_c_i[i];
1281 size_type b_new = C[c] + rank_c_j[i];
1282
1283 // Save LCP value and corresponding interval if not seen before
1284 if (!index_done[b_new])
1285 {
1286 if (b_new < n)
1287 {
1288 // Save position of LCP-value
1289 lcp_positions_buf[idx_out_buf++] = b_new;
1290 }
1291 index_done[b_new] = true;
1292
1293 // Save interval
1294 q.push(a_new);
1295 q.push(b_new);
1296 ++intervals;
1297 }
1298 }
1299 ++lcp_value;
1300 new_lcp_value = true;
1301
1302 // Calculate LCP positions
1303 while (intervals)
1304 {
1305 if (intervals < use_queue_and_wt && !queue_used)
1306 {
1307 memory_monitor::event("lcp-bwt2-bitvec2queue-begin");
1308 util::clear(dict[target]);
1309
1310 // Copy from bitvector to queue
1311 size_type a2 = util::next_bit(dict[source], 0);
1312 size_type b2 = util::next_bit(dict[source], a2 + 1);
1313 while (b2 < dict[source].size())
1314 {
1315 q.push((a2 - 1) >> 1);
1316 q.push(b2 >> 1);
1317 // Get next interval
1318 a2 = util::next_bit(dict[source], b2 + 1);
1319 b2 = util::next_bit(dict[source], a2 + 1);
1320 }
1321 util::clear(dict[source]);
1322 memory_monitor::event("lcp-bwt2-bitvec2queue-end");
1323 }
1324 if (intervals >= use_queue_and_wt && queue_used)
1325 {
1326 memory_monitor::event("lcp-bwt2-queue2bitvec-begin");
1327 dict[source].resize(2 * (n + 1));
1328 util::set_to_value(dict[source], 0);
1329 // Copy from queue to bitvector
1330 while (!q.empty())
1331 {
1332 dict[source][(q.front() << 1) + 1] = 1;
1333 q.pop();
1334 dict[source][(q.front() << 1)] = 1;
1335 q.pop();
1336 }
1337 dict[target].resize(2 * (n + 1));
1338 util::set_to_value(dict[target], 0);
1339 memory_monitor::event("lcp-bwt2-queue2bitvec-end");
1340 }
1341
1342 if (intervals < use_queue_and_wt)
1343 {
1344 queue_used = true;
1345 intervals_new = 0;
1346 while (intervals)
1347 {
1348 // Get next interval
1349 size_type a = q.front();
1350 q.pop();
1351 size_type b = q.front();
1352 q.pop();
1353 --intervals;
1354
1355 interval_symbols(wt_bwt, a, b, quantity, cs, rank_c_i, rank_c_j);
1356 for (size_type i = 0; i < quantity; ++i)
1357 {
1358 unsigned char c = cs[i];
1359 size_type a_new = C[c] + rank_c_i[i];
1360 size_type b_new = C[c] + rank_c_j[i];
1361
1362 // Save LCP value and corresponding interval if not seen before
1363 if (!index_done[b_new])
1364 {
1365 // Save position of LCP-value
1366 lcp_positions_buf[idx_out_buf++] = b_new;
1367 if (new_lcp_value)
1368 {
1369 // Mark new LCP-value
1370 lcp_positions_buf[idx_out_buf - 1] = lcp_positions_buf[idx_out_buf - 1] + n;
1371 new_lcp_value = false;
1372 }
1373 index_done[b_new] = true;
1374
1375 // Save interval
1376 q.push(a_new);
1377 q.push(b_new);
1378 ++intervals_new;
1379 }
1380 }
1381 }
1382 intervals = intervals_new;
1383 }
1384 else
1385 {
1386 queue_used = false;
1387 intervals = 0;
1388 // get next interval
1389 size_type a2 = util::next_bit(dict[source], 0);
1390 size_type b2 = util::next_bit(dict[source], a2 + 1);
1391
1392 while (b2 < dict[source].size())
1393 {
1394 interval_symbols(wt_bwt, ((a2 - 1) >> 1), (b2 >> 1), quantity, cs, rank_c_i, rank_c_j);
1395 for (size_type i = 0; i < quantity; ++i)
1396 {
1397 unsigned char c = cs[i];
1398 size_type a_new = C[c] + rank_c_i[i];
1399 size_type b_new = C[c] + rank_c_j[i];
1400 // Save LCP value if not seen before
1401 if (!index_done[b_new])
1402 {
1403 // Save position of LCP-value
1404 lcp_positions_buf[idx_out_buf++] = b_new;
1405 if (new_lcp_value)
1406 {
1407 // Mark new LCP-value
1408 lcp_positions_buf[idx_out_buf - 1] = lcp_positions_buf[idx_out_buf - 1] + n;
1409 new_lcp_value = false;
1410 }
1411 index_done[b_new] = true;
1412 // Save interval
1413 dict[target][(a_new << 1) + 1] = 1;
1414 dict[target][(b_new << 1)] = 1;
1415 ++intervals;
1416 }
1417 }
1418 // get next interval
1419 a2 = util::next_bit(dict[source], b2 + 1);
1420 b2 = util::next_bit(dict[source], a2 + 1);
1421 }
1422 std::swap(source, target);
1423 util::set_to_value(dict[target], 0);
1424 }
1425 ++lcp_value;
1426 new_lcp_value = true;
1427 }
1428 memory_monitor::event("lcp-bwt2-calc-values-end");
1429 lcp_positions_buf.close();
1430 }
1431 // (2) Insert LCP entires into LCP array
1432 {
1433 memory_monitor::event("lcp-bwt2-reordering-begin");
1434
1435 int_vector_buffer<> lcp_positions(tmp_lcp_file, std::ios::in, buffer_size);
1436
1437 uint8_t int_width = bits::hi(lcp_value + 1) + 1; // How many bits are needed for one lcp_value?
1438
1439 // Algorithm does r=ceil(int_width/8) runs over LCP-Positions-Array.
1440 // So in each run k>=(n/r) values of the lcp array must be calculated.
1441 // Because k has to be a multiple of 8, we choose number_of_values = (k+16) - ((k+16)%8)
1442 size_type number_of_values = ((n / ((int_width - 1ULL) / 8 + 1) + 16) & (~(0x7ULL)));
1443 std::string lcp_file = cache_file_name(conf::KEY_LCP, config);
1444 int_vector_buffer<> lcp_array(lcp_file,
1445 std::ios::out,
1446 number_of_values * int_width / 8,
1447 int_width); // Create Output Buffer
1448 number_of_values = lcp_array.buffersize() * 8 / int_width;
1449
1450 for (size_type position_begin = 0, position_end = number_of_values; position_begin < n and number_of_values > 0;
1451 position_begin = position_end, position_end += number_of_values)
1452 {
1453#ifdef STUDY_INFORMATIONS
1454 std::cout << "# number_of_values=" << number_of_values << " fill lcp_values with " << position_begin
1455 << " <= position <" << position_end << ", each lcp-value has " << (int)int_width
1456 << " bit, lcp_value_max=" << lcp_value << " n=" << n << std::endl;
1457#endif
1458 lcp_value = 0;
1459 for (size_type i = 0; i < n; ++i)
1460 {
1461 size_type position = lcp_positions[i];
1462 if (position > n)
1463 {
1464 position -= n;
1465 ++lcp_value;
1466 }
1467 if (position_begin <= position and position < position_end) { lcp_array[position] = lcp_value; }
1468 }
1469 }
1470 // Close file
1471 lcp_array.close();
1473 lcp_positions.close(true); // close buffer and remove file
1474 memory_monitor::event("lcp-bwt2-reordering-end");
1475 } // End of phase 2
1476}
1477
1478} // namespace sdsl
1479
1480#endif
uint64_t size() const
Returns the number of elements currently stored.
void close(bool remove_file=false)
Close the int_vector_buffer.
uint64_t buffersize() const
Returns the buffersize in bytes.
uint8_t width() const
Returns the width of the integers which are accessed via the [] operator.
void shrink_to_fit()
Free unused allocated memory.
Definition: int_vector.hpp:506
uint8_t width() const noexcept
Returns the width of the integers which are accessed via the [] operator.
Definition: int_vector.hpp:595
size_type size() const noexcept
The number of elements in the int_vector.
reference front() noexcept
Returns first element.
Definition: int_vector.hpp:425
void resize(const size_type size)
Resize the int_vector in terms of elements.
Definition: int_vector.hpp:521
static mm_event_proxy event(const std::string &name)
A rank structure proposed by Sebastiano Vigna.
size_type rank(size_type idx) const
Answers rank queries for the supported bit_vector.
construct_bwt.hpp contains a space and time efficient construction method for the Burrows and Wheeler...
construct_isa.hpp contains a space and time efficient construction method for the inverse suffix arra...
int_vector.hpp contains the sdsl::int_vector class.
constexpr char KEY_SA[]
Definition: config.hpp:37
constexpr char KEY_TEXT[]
Definition: config.hpp:41
constexpr char KEY_LCP[]
Definition: config.hpp:44
constexpr char KEY_ISA[]
Definition: config.hpp:40
constexpr char KEY_BWT[]
Definition: config.hpp:35
void set_to_value(t_int_vec &v, uint64_t k)
Set all entries of int_vector to value k.
Definition: util.hpp:563
Get the smallest position f$i geq idx f$ where a bit is set t_int_vec::size_type next_bit(const t_int_vec &v, uint64_t idx)
Namespace for the succinct data structure library.
std::string cache_file_name(const std::string &key, const cache_config &config)
Returns the file name of the resource.
Definition: io.hpp:630
bool load_from_cache(T &v, const std::string &key, const cache_config &config, bool add_type_hash=false)
Definition: io.hpp:711
void push_back_m_index(size_type_class i, uint8_t c, tLI(&m_list)[256], uint8_t(&m_chars)[256], size_type_class &m_char_count)
void interval_symbols(const t_wt &wt, typename t_wt::size_type i, typename t_wt::size_type j, typename t_wt::size_type &k, std::vector< typename t_wt::value_type > &cs, std::vector< typename t_wt::size_type > &rank_c_i, std::vector< typename t_wt::size_type > &rank_c_j)
For each symbol c in wt[i..j-1] get rank(i,c) and rank(j,c).
void construct_lcp_kasai(cache_config &config)
Construct the LCP array for text over byte- or integer-alphabet.
void construct_lcp_bwt_based(cache_config &config)
Construct the LCP array out of the BWT (only for byte strings)
void create_C_array(std::vector< uint64_t > &C, const tWT &wt)
void insert_lcp_values(int_vector<> &partial_lcp, bit_vector &index_done, std::string lcp_file, uint64_t max_lcp_value, uint64_t lcp_value_offset)
Merges a partial LCP array into the LCP array on disk.
void construct_lcp_PHI(cache_config &config)
Construct the LCP array for text over byte- or integer-alphabet.
void construct(t_index &idx, std::string file, uint8_t num_bytes=0, bool move_input=false)
Definition: construct.hpp:45
void construct_lcp_semi_extern_PHI(cache_config &config)
Construct the LCP array (only for byte strings)
void construct_lcp_goPHI(cache_config &config)
Construct the LCP array (only for byte strings)
void construct_isa(cache_config &config)
void construct_lcp_bwt_based2(cache_config &config)
Construct the LCP array out of the BWT (only for byte strings)
void register_cache_file(const std::string &key, cache_config &config)
Register the existing resource specified by the key to the cache.
Definition: io.hpp:656
bool store_to_file(const T &v, const std::string &file)
Store a data structure to a file.
Definition: io.hpp:798
bool store_to_cache(const T &v, const std::string &key, cache_config &config, bool add_type_hash=false)
Stores the object v as a resource in the cache.
Definition: io.hpp:737
void construct_lcp_go(cache_config &config)
Construct the LCP array (only for byte strings)
int_vector ::size_type size(const range_type &r)
Size of a range.
Definition: wt_helper.hpp:787
std::list< int_vector<>::size_type > tLI
void push_front_m_index(size_type_class i, uint8_t c, tLI(&m_list)[256], uint8_t(&m_chars)[256], size_type_class &m_char_count)
T::size_type size_in_bytes(const T &t)
Get the size of a data structure in bytes.
Definition: io.hpp:778
rank_support.hpp contains classes that support a sdsl::bit_vector with constant time rank information...
select_support.hpp contains classes that support a sdsl::bit_vector with constant time select informa...
sfstream.hpp contains a two stream class which can be used to read/write from/to files or strings.
static constexpr uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition: bits.hpp:651
static constexpr uint64_t lo_unset[65]
lo_unset[i] is a 64-bit word with the i least significant bits not set and the high bits set.
Definition: bits.hpp:210
static constexpr uint64_t lo_set[65]
lo_set[i] is a 64-bit word with the i least significant bits set and the high bits not set.
Definition: bits.hpp:187
Helper class for construction process.
Definition: config.hpp:67
Helper classes to transform width=0 and width=8 to corresponding text key.
Definition: config.hpp:91
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.
wt_huff.hpp contains a class for a Huffman shaped wavelet tree over byte sequences.