SDSL 3.0.2
Succinct Data Structure Library
Loading...
Searching...
No Matches
wt_rlmn.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
9#ifndef INCLUDED_SDSL_WT_RLMN
10#define INCLUDED_SDSL_WT_RLMN
11
12#include <assert.h>
13#include <iostream>
14#include <iterator>
15#include <map>
16#include <stdint.h>
17#include <string>
18#include <utility> // for pair
19
20#include <sdsl/bits.hpp>
21#include <sdsl/cereal.hpp>
22#include <sdsl/int_vector.hpp>
24#include <sdsl/io.hpp>
25#include <sdsl/iterators.hpp>
26#include <sdsl/ram_fs.hpp>
27#include <sdsl/sd_vector.hpp> // for standard initialisation of template parameters
30#include <sdsl/util.hpp>
31#include <sdsl/wt_huff.hpp>
32
34namespace sdsl
35{
36
37template <class t_alphabet_cat>
39{
40 enum
41 {
42 width = 0
43 };
46
47 static std::map<uint64_t, uint64_t> temp_C()
48 {
49 return std::map<uint64_t, uint64_t>();
50 }
51
52 static C_type init_C(std::map<uint64_t, uint64_t> & C, uint64_t size)
53 {
54 uint64_t max_symbol = (--C.end())->first;
55 return C_type(max_symbol + 1, 0, bits::hi(size) + 1);
56 }
57
58 static C_bf_rank_type init_C_bf_rank(C_type const & C, uint64_t size)
59 {
60 return C_bf_rank_type(C.size(), 0, bits::hi(size) + 1);
61 }
62};
63
64template <>
66{
67 enum
68 {
69 width = 8
70 };
73
75 {
76 return int_vector<64>(256, 0);
77 }
78
79 static C_type init_C(C_type & C, uint64_t)
80 {
81 return C;
82 }
83
84 static C_bf_rank_type init_C_bf_rank(C_type const &, uint64_t)
85 {
86 return int_vector<64>(256, 0);
87 }
88};
89
91
110template <class t_bitvector = sd_vector<>,
111 class t_rank = typename t_bitvector::rank_1_type,
112 class t_select = typename t_bitvector::select_1_type,
113 class t_wt = wt_huff<>>
115{
116public:
117 typedef t_wt wt_type;
119 typedef typename t_wt::value_type value_type;
120 typedef typename t_bitvector::difference_type difference_type;
123 typedef t_bitvector bit_vector_type;
124 typedef t_rank rank_support_type;
125 typedef t_select select_support_type;
127 typedef typename t_wt::alphabet_category alphabet_category;
128 enum
129 {
130 lex_ordered = false
131 }; // TODO: is should be possible
132 enum
133 {
135 };
138 // to support all lex_ordered
139 // operations if t_wt::lex_ordered is
140 // true
141
142private:
143 size_type m_size = 0; // size of the original input sequence
144 bit_vector_type m_bl; // bit vector for starts of runs in
145 // the BWT (or last column), i.e. _b_ _l_ast
146 bit_vector_type m_bf; // bit vector for starts of runs in
147 // the first column of the sorted suffixes, i.e _b_ _f_irst
148 wt_type m_wt; // wavelet tree for all levels
149 // two equal chars
150 rank_support_type m_bl_rank; // rank support for bit vector bl
151 rank_support_type m_bf_rank; // rank support for bit vector bf
152 select_support_type m_bl_select; // select support for bit vector bl
153 select_support_type m_bf_select; // select support for bit vector bf
154 C_type m_C; //
155 C_bf_rank_type m_C_bf_rank; // stores the number of 1s in m_bf for
156 // the prefixes m_bf[0..m_C[0]],m_bf[0..m_C[1]],....,m_bf[0..m_C[255]];
157 // named C_s in the original paper
158
159public:
160 size_type const & sigma = m_wt.sigma;
161
162 // Default constructor
163 wt_rlmn() = default;
164
166
171 template <typename t_it>
172 wt_rlmn(t_it begin, t_it end, std::string tmp_dir = ram_file_name("")) : m_size(std::distance(begin, end))
173 {
174 std::string temp_file =
175 tmp_dir + +"_wt_rlmn_" + util::to_string(util::pid()) + "_" + util::to_string(util::id());
176 {
177 if (0 == m_size)
178 return;
179 int_vector_buffer<width> condensed_wt(temp_file, std::ios::out);
180 // scope for bl and bf
181 bit_vector bl = bit_vector(m_size, 0);
182
184 value_type last_c = (value_type)0;
185 size_type j = 0;
186 for (auto it = begin; it != end; ++it, ++j)
187 {
188 value_type c = *it;
189 if (last_c != c or it == begin)
190 {
191 bl[j] = 1;
192 condensed_wt.push_back(c);
193 }
194 ++C[c];
195 last_c = c;
196 }
197 condensed_wt.close();
199
200 for (size_type i = 0, prefix_sum = 0; i < m_C.size(); ++i)
201 {
202 m_C[i] = prefix_sum;
203 prefix_sum += C[i];
204 }
205
206 C_type lf_map = m_C;
207 bit_vector bf = bit_vector(m_size + 1, 0);
208 bf[m_size] = 1; // initialize last element
209 j = 0;
210 for (auto it = begin; it != end; ++it, ++j)
211 {
212 value_type c = *it;
213 if (bl[j])
214 {
215 bf[lf_map[c]] = 1;
216 }
217 ++lf_map[c];
218 }
219 {
220 int_vector_buffer<width> temp_bwt_buf(temp_file);
221 m_wt = wt_type(temp_bwt_buf.begin(), temp_bwt_buf.end(), tmp_dir);
222 }
223 sdsl::remove(temp_file);
224 m_bl = bit_vector_type(std::move(bl));
225 m_bf = bit_vector_type(std::move(bf));
226 }
227
228 util::init_support(m_bl_rank, &m_bl);
229 util::init_support(m_bf_rank, &m_bf);
230 util::init_support(m_bf_select, &m_bf);
231 util::init_support(m_bl_select, &m_bl);
232
233 m_C_bf_rank = wt_rlmn_trait<alphabet_category>::init_C_bf_rank(m_C, m_size);
234 for (size_type i = 0; i < m_C.size(); ++i)
235 {
236 m_C_bf_rank[i] = m_bf_rank(m_C[i]);
237 }
238 }
239
241 wt_rlmn(wt_rlmn const & wt) :
242 m_size(wt.m_size),
243 m_bl(wt.m_bl),
244 m_bf(wt.m_bf),
245 m_wt(wt.m_wt),
246 m_bl_rank(wt.m_bl_rank),
247 m_bf_rank(wt.m_bf_rank),
248 m_bl_select(wt.m_bl_select),
249 m_bf_select(wt.m_bf_select),
250 m_C(wt.m_C),
251 m_C_bf_rank(wt.m_C_bf_rank)
252 {
253 m_bl_rank.set_vector(&m_bl);
254 m_bf_rank.set_vector(&m_bf);
255 m_bl_select.set_vector(&m_bl);
256 m_bf_select.set_vector(&m_bf);
257 }
258
261 m_size(wt.m_size),
262 m_bl(std::move(wt.m_bl)),
263 m_bf(std::move(wt.m_bf)),
264 m_wt(std::move(wt.m_wt)),
265 m_bl_rank(std::move(wt.m_bl_rank)),
266 m_bf_rank(std::move(wt.m_bf_rank)),
267 m_bl_select(std::move(wt.m_bl_select)),
268 m_bf_select(std::move(wt.m_bf_select)),
269 m_C(std::move(wt.m_C)),
270 m_C_bf_rank(std::move(wt.m_C_bf_rank))
271 {
272 m_bl_rank.set_vector(&m_bl);
273 m_bf_rank.set_vector(&m_bf);
274 m_bl_select.set_vector(&m_bl);
275 m_bf_select.set_vector(&m_bf);
276 }
277
280 {
281 if (this != &wt)
282 {
283 wt_rlmn tmp(wt);
284 *this = std::move(tmp);
285 }
286 return *this;
287 }
288
291 {
292 if (this != &wt)
293 {
294 m_size = std::move(wt.m_size);
295 m_bl = std::move(wt.m_bl);
296 m_bf = std::move(wt.m_bf);
297 m_wt = std::move(wt.m_wt);
298 m_bl_rank = std::move(wt.m_bl_rank);
299 m_bl_rank.set_vector(&m_bl);
300 m_bf_rank = std::move(wt.m_bf_rank);
301 m_bf_rank.set_vector(&m_bf);
302 m_bl_select = std::move(wt.m_bl_select);
303 m_bl_select.set_vector(&m_bl);
304 m_bf_select = std::move(wt.m_bf_select);
305 m_bf_select.set_vector(&m_bf);
306 m_C = std::move(wt.m_C);
307 m_C_bf_rank = std::move(wt.m_C_bf_rank);
308 }
309 return *this;
310 }
311
314 {
315 return m_size;
316 }
317
319 bool empty() const
320 {
321 return 0 == m_size;
322 }
323
325
332 {
333 assert(i < size());
334 return m_wt[m_bl_rank(i + 1) - 1];
335 };
336
338
347 {
348 assert(i <= size());
349 if (i == 0)
350 return 0;
351 size_type wt_ex_pos = m_bl_rank(i);
352 size_type c_runs = m_wt.rank(wt_ex_pos, c);
353 if (c_runs == 0)
354 return 0;
355 if (m_wt[wt_ex_pos - 1] == c)
356 {
357 size_type c_run_begin = m_bl_select(wt_ex_pos);
358 return m_bf_select(m_C_bf_rank[c] + c_runs) - m_C[c] + i - c_run_begin;
359 }
360 else
361 {
362 return m_bf_select(m_C_bf_rank[c] + c_runs + 1) - m_C[c];
363 }
364 };
365
367
373 std::pair<size_type, value_type> inverse_select(size_type i) const
374 {
375 assert(i < size());
376 if (i == 0)
377 {
378 return std::make_pair(0, m_wt[0]);
379 }
380 size_type wt_ex_pos = m_bl_rank(i + 1);
381 auto rc = m_wt.inverse_select(wt_ex_pos - 1);
382 size_type c_runs = rc.first + 1;
383 value_type c = rc.second;
384 if (c_runs == 0)
385 return std::make_pair(0, c);
386 if (m_wt[wt_ex_pos - 1] == c)
387 {
388 size_type c_run_begin = m_bl_select(wt_ex_pos);
389 return std::make_pair(m_bf_select(m_C_bf_rank[c] + c_runs) - m_C[c] + i - c_run_begin, c);
390 }
391 else
392 {
393 return std::make_pair(m_bf_select(m_C_bf_rank[c] + c_runs + 1) - m_C[c], c);
394 }
395 }
396
398
406 {
407 assert(i > 0);
408 assert(i <= rank(size(), c));
409 size_type c_runs = m_bf_rank(m_C[c] + i) - m_C_bf_rank[c];
410 size_type offset = m_C[c] + i - 1 - m_bf_select(c_runs + m_C_bf_rank[c]);
411 return m_bl_select(m_wt.select(c_runs, c) + 1) + offset;
412 };
413
416 {
417 return const_iterator(this, 0);
418 }
419
422 {
423 return const_iterator(this, size());
424 }
425
427 size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const
428 {
429 structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
430 size_type written_bytes = 0;
431 written_bytes += write_member(m_size, out, child, "size");
432 written_bytes += m_bl.serialize(out, child, "bl");
433 written_bytes += m_bf.serialize(out, child, "bf");
434 written_bytes += m_wt.serialize(out, child, "wt");
435 written_bytes += m_bl_rank.serialize(out, child, "bl_rank");
436 written_bytes += m_bf_rank.serialize(out, child, "bf_rank");
437 written_bytes += m_bl_select.serialize(out, child, "bl_select");
438 written_bytes += m_bf_select.serialize(out, child, "bf_select");
439 written_bytes += m_C.serialize(out, child, "C");
440 written_bytes += m_C_bf_rank.serialize(out, child, "C_bf_rank");
441 structure_tree::add_size(child, written_bytes);
442 return written_bytes;
443 }
444
446 void load(std::istream & in)
447 {
448 read_member(m_size, in);
449 m_bl.load(in);
450 m_bf.load(in);
451 m_wt.load(in);
452 m_bl_rank.load(in, &m_bl);
453 m_bf_rank.load(in, &m_bf);
454 m_bl_select.load(in, &m_bl);
455 m_bf_select.load(in, &m_bf);
456 m_C.load(in);
457 m_C_bf_rank.load(in);
458 }
459
461 template <typename archive_t>
462 void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const
463 {
464 ar(CEREAL_NVP(m_size));
465 ar(CEREAL_NVP(m_bl));
466 ar(CEREAL_NVP(m_bf));
467 ar(CEREAL_NVP(m_wt));
468 ar(CEREAL_NVP(m_bl_rank));
469 ar(CEREAL_NVP(m_bf_rank));
470 ar(CEREAL_NVP(m_bl_select));
471 ar(CEREAL_NVP(m_bf_select));
472 ar(CEREAL_NVP(m_C));
473 ar(CEREAL_NVP(m_C_bf_rank));
474 }
475
477 template <typename archive_t>
478 void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar)
479 {
480 ar(CEREAL_NVP(m_size));
481 ar(CEREAL_NVP(m_bl));
482 ar(CEREAL_NVP(m_bf));
483 ar(CEREAL_NVP(m_wt));
484 ar(CEREAL_NVP(m_bl_rank));
485 m_bl_rank.set_vector(&m_bl);
486 ar(CEREAL_NVP(m_bf_rank));
487 m_bf_rank.set_vector(&m_bf);
488 ar(CEREAL_NVP(m_bl_select));
489 m_bl_select.set_vector(&m_bl);
490 ar(CEREAL_NVP(m_bf_select));
491 m_bf_select.set_vector(&m_bf);
492 ar(CEREAL_NVP(m_C));
493 ar(CEREAL_NVP(m_C_bf_rank));
494 }
495
497 bool operator==(wt_rlmn const & other) const noexcept
498 {
499 return (m_size == other.m_size) && (m_bl == other.m_bl) && (m_bf == other.m_bf) && (m_wt == other.m_wt)
500 && (m_bl_rank == other.m_bl_rank) && (m_bf_rank == other.m_bf_rank) && (m_bl_select == other.m_bl_select)
501 && (m_bf_select == other.m_bf_select) && (m_C == other.m_C) && (m_C_bf_rank == other.m_C_bf_rank);
502 }
503
505 bool operator!=(wt_rlmn const & other) const noexcept
506 {
507 return !(*this == other);
508 }
509};
510
511} // end namespace sdsl
512#endif
bits.hpp contains the sdsl::bits class.
cereal.hpp offers cereal support
#define CEREAL_NVP(X)
Definition cereal.hpp:31
void close(bool remove_file=false)
Close the int_vector_buffer.
void push_back(const uint64_t value)
Appends the given element value to the end of the int_vector_buffer.
size_type size() const noexcept
The number of elements in the int_vector.
Generic iterator for a random access container.
Definition iterators.hpp:24
static structure_tree_node * add_child(structure_tree_node *v, std::string const &name, std::string const &type)
static void add_size(structure_tree_node *v, uint64_t value)
A Wavelet Tree class for byte sequences.
Definition wt_rlmn.hpp:115
t_bitvector bit_vector_type
Definition wt_rlmn.hpp:123
wt_rlmn_trait< alphabet_category >::C_bf_rank_type C_bf_rank_type
Definition wt_rlmn.hpp:137
size_type const & sigma
Definition wt_rlmn.hpp:160
t_wt::alphabet_category alphabet_category
Definition wt_rlmn.hpp:127
wt_rlmn()=default
random_access_const_iterator< wt_rlmn > const_iterator
Definition wt_rlmn.hpp:121
wt_rlmn_trait< alphabet_category >::C_type C_type
Definition wt_rlmn.hpp:136
bool operator==(wt_rlmn const &other) const noexcept
Equality operator.
Definition wt_rlmn.hpp:497
size_type rank(size_type i, value_type c) const
Calculates how many symbols c are in the prefix [0..i-1].
Definition wt_rlmn.hpp:346
t_select select_support_type
Definition wt_rlmn.hpp:125
size_type size() const
Returns the size of the original vector.
Definition wt_rlmn.hpp:313
wt_rlmn & operator=(wt_rlmn const &wt)
Assignment operator.
Definition wt_rlmn.hpp:279
t_wt::value_type value_type
Definition wt_rlmn.hpp:119
void load(std::istream &in)
Loads the data structure from the given istream.
Definition wt_rlmn.hpp:446
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
Load via cereal.
Definition wt_rlmn.hpp:478
bool empty() const
Returns whether the wavelet tree contains no data.
Definition wt_rlmn.hpp:319
wt_rlmn(wt_rlmn &&wt)
Move constructor.
Definition wt_rlmn.hpp:260
wt_rlmn(wt_rlmn const &wt)
Copy constructor.
Definition wt_rlmn.hpp:241
value_type operator[](size_type i) const
Recovers the i-th symbol of the original vector.
Definition wt_rlmn.hpp:331
bool operator!=(wt_rlmn const &other) const noexcept
Inequality operator.
Definition wt_rlmn.hpp:505
wt_tag index_category
Definition wt_rlmn.hpp:126
wt_rlmn(t_it begin, t_it end, std::string tmp_dir=ram_file_name(""))
Construct the wavelet tree from a sequence defined by two interators.
Definition wt_rlmn.hpp:172
int_vector ::size_type size_type
Definition wt_rlmn.hpp:118
t_rank rank_support_type
Definition wt_rlmn.hpp:124
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serializes the data structure into the given ostream.
Definition wt_rlmn.hpp:427
const_iterator begin() const
Returns a const_iterator to the first element.
Definition wt_rlmn.hpp:415
std::pair< size_type, value_type > inverse_select(size_type i) const
Calculates how many times symbol wt[i] occurs in the prefix [0..i-1].
Definition wt_rlmn.hpp:373
t_bitvector::difference_type difference_type
Definition wt_rlmn.hpp:120
wt_rlmn & operator=(wt_rlmn &&wt)
Assignment move operator.
Definition wt_rlmn.hpp:290
const_iterator iterator
Definition wt_rlmn.hpp:122
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
Serialise (save) via cereal.
Definition wt_rlmn.hpp:462
const_iterator end() const
Returns a const_iterator to the element after the last element.
Definition wt_rlmn.hpp:421
size_type select(size_type i, value_type c) const
Calculates the ith occurrence of the symbol c in the supported vector.
Definition wt_rlmn.hpp:405
int_vector.hpp contains the sdsl::int_vector class.
int_vector_buffer.hpp contains the sdsl::int_vector_buffer class.
io.hpp contains some methods for reading/writing sdsl structures.
iterators.hpp contains an generic iterator for random access containers.
uint64_t id()
uint64_t pid()
std::string to_string(T const &t, int w=1)
Namespace for the succinct data structure library.
int remove(std::string const &)
Remove a file.
Definition ram_fs.hpp:221
std::string ram_file_name(std::string const &file)
Returns the corresponding RAM-file name for file.
Definition ram_fs.hpp:195
size_t write_member(T const &t, std::ostream &out, sdsl::structure_tree_node *v=nullptr, std::string name="")
Definition io.hpp:94
void read_member(T &t, std::istream &in)
Definition io.hpp:119
int_vector< 1 > bit_vector
bit_vector is a specialization of the int_vector.
int_vector ::size_type size(range_type const &r)
Size of a range.
ram_fs.hpp
sd_vector.hpp contains the sdsl::sd_vector class, and classes which support rank and select for sd_ve...
Contains declarations and definitions of data structure concepts.
static constexpr uint32_t hi(uint64_t x)
Position of the most significant set bit the 64-bit word x.
Definition bits.hpp:652
static C_bf_rank_type init_C_bf_rank(C_type const &, uint64_t)
Definition wt_rlmn.hpp:84
static C_type init_C(C_type &C, uint64_t)
Definition wt_rlmn.hpp:79
static int_vector< 64 > temp_C()
Definition wt_rlmn.hpp:74
static C_type init_C(std::map< uint64_t, uint64_t > &C, uint64_t size)
Definition wt_rlmn.hpp:52
static C_bf_rank_type init_C_bf_rank(C_type const &C, uint64_t size)
Definition wt_rlmn.hpp:58
int_vector C_bf_rank_type
Definition wt_rlmn.hpp:45
static std::map< uint64_t, uint64_t > temp_C()
Definition wt_rlmn.hpp:47
int_vector C_type
Definition wt_rlmn.hpp:44
structure_tree.hpp contains a helper class which can represent the memory structure of a class.
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.
wt_huff.hpp contains a class for a Huffman shaped wavelet tree over byte sequences.