SDSL 3.0.3
Succinct Data Structure Library
Loading...
Searching...
No Matches
wt_epr.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
9#ifndef INCLUDED_SDSL_WT_EPR
10#define INCLUDED_SDSL_WT_EPR
11
12#include <assert.h>
13#include <cmath>
14#include <iosfwd>
15#include <iterator>
16#include <stdexcept>
17#include <stdint.h>
18#include <string>
19#include <tuple>
20#include <type_traits>
21#include <utility>
22#include <vector>
23
24#include <sdsl/cereal.hpp>
25#include <sdsl/int_vector.hpp>
26#include <sdsl/io.hpp>
27#include <sdsl/iterators.hpp>
31#include <sdsl/util.hpp>
32#include <sdsl/wt_helper.hpp>
33
35namespace sdsl
36{
37
44template <uint8_t alphabet_size, class rank_type = rank_support_int_v<alphabet_size>, class t_tree_strat = byte_tree<>>
45class wt_epr
46{
47public:
48 typedef typename t_tree_strat::template type<wt_epr> tree_strat_type;
55 typedef byte_alphabet_tag /*typename tree_strat_type::alphabet_category*/ alphabet_category;
56 enum
57 {
58 lex_ordered = true
59 };
60
61private:
63 static constexpr bool has_inblock_text = std::is_same<rank_type, rank_support_int_v<alphabet_size>>::value;
64
65 size_type m_size = 0;
66 size_type m_sigma = 0;
67 int_vector<> m_bv;
68 rank_type m_bv_rank;
69
71 template <bool has_inblock_text_>
72 auto construct_init_rank_select(int_vector<> intermediate_bitvector) -> std::enable_if_t<has_inblock_text_, void>
73 {
74 // The text is stored inside of the rank structure so we do not store it here.
75 m_bv_rank = rank_type{&intermediate_bitvector}; // Create the rank support structure.
76 }
77
79 template <bool has_inblock_text_>
80 auto construct_init_rank_select(int_vector<> intermediate_bitvector) -> std::enable_if_t<!has_inblock_text_, void>
81 {
82 m_bv = std::move(intermediate_bitvector);
83 m_bv_rank = rank_type{&m_bv}; // Create the rank support structure.
84 }
85
87 template <bool has_inblock_text_>
88 auto value_at(size_type const position) const -> std::enable_if_t<has_inblock_text_, value_type>
89 { // In the special epr rank implementation, the text is stored in the superblocks.
90 assert(position < size());
91 return m_bv_rank.value_at(position); // Extract the value from the rank support structure.
92 }
93
95 template <bool has_inblock_text_>
96 auto value_at(size_type const position) const -> std::enable_if_t<!has_inblock_text_, value_type>
97 {
98 assert(position < size());
99 return m_bv[position];
100 }
101
102public:
103 size_type const & sigma = m_sigma;
104 int_vector<> const & bv = m_bv;
105
107 wt_epr() = default;
108
115 template <typename t_it>
116 wt_epr(t_it begin, t_it end) : m_size(std::distance(begin, end))
117 {
118 if (0 == m_size)
119 return;
120 // O(n + |\Sigma|\log|\Sigma|) algorithm for calculating node sizes
121 // TODO: C should also depend on the tree_strategy. C is just a mapping
122 // from a symbol to its frequency. So a map<uint64_t,uint64_t> could be
123 // used for integer alphabets...
124 std::vector<size_type> C;
125 // 1. Count occurrences of characters
127 // 2. Calculate effective alphabet size
129
130 // The text cannot have an alphabet larger than the required alphabet_size.
131 if (m_sigma > alphabet_size)
132 throw std::domain_error{"The given text uses an alphabet that is larger than the explicitly given "
133 "alphabet size."};
134
135 // 4. Generate wavelet tree bit sequence m_bv
136 int_vector<> intermediate_bitvector{};
137 intermediate_bitvector.width(std::ceil(std::log2(m_sigma)));
138 intermediate_bitvector.resize(m_size);
139
140 std::copy(begin, end, intermediate_bitvector.begin());
141
142 // 5. Initialize rank and select data structures for m_bv
143 construct_init_rank_select<has_inblock_text>(std::move(intermediate_bitvector));
144 }
145
146 template <typename t_it>
147 wt_epr(t_it begin, t_it end, std::string) : wt_epr(begin, end)
148 {}
149
151 wt_epr(wt_epr const & wt) : m_size(wt.m_size), m_sigma(wt.m_sigma), m_bv(wt.m_bv), m_bv_rank(wt.m_bv_rank)
152 {
153 m_bv_rank.set_vector(&m_bv);
154 }
155
156 wt_epr(wt_epr && wt) :
157 m_size(wt.m_size),
158 m_sigma(wt.m_sigma),
159 m_bv(std::move(wt.m_bv)),
160 m_bv_rank(std::move(wt.m_bv_rank))
161 {
162 m_bv_rank.set_vector(&m_bv);
163 }
164
166 wt_epr & operator=(wt_epr const & wt)
167 {
168 if (this != &wt)
169 {
170 wt_epr tmp(wt); // re-use copy-constructor
171 *this = std::move(tmp); // re-use move-assignment
172 }
173 return *this;
174 }
175
178 {
179 if (this != &wt)
180 {
181 m_size = wt.m_size;
182 m_sigma = wt.m_sigma;
183 m_bv = std::move(wt.m_bv);
184 m_bv_rank = std::move(wt.m_bv_rank);
185 m_bv_rank.set_vector(&m_bv);
186 }
187 return *this;
188 }
189
192 {
193 return m_size;
194 }
195
197 bool empty() const
198 {
199 return m_size == 0;
200 }
201
211 auto operator[](size_type const i) const
212 {
213 assert(i < size());
214 return value_at<has_inblock_text>(i);
215 };
216
228 {
229 assert(i <= size());
230 return m_bv_rank.rank(i, c);
231 };
232
242 std::pair<size_type, value_type> inverse_select(size_type i) const
243 {
244 assert(i < size());
245 value_type value = (*this)[i];
246 return std::make_pair(m_bv_rank.rank(i, value), value);
247 }
248
249 // TODO: implement (if necessary?)
269 // void interval_symbols(size_type i,
270 // size_type j,
271 // size_type& k,
272 // std::vector<value_type>& cs,
273 // std::vector<size_type>& rank_c_i,
274 // std::vector<size_type>& rank_c_j) const
275 // { }
276
289 template <class t_ret_type = std::tuple<size_type, size_type, size_type>>
290 t_ret_type lex_count(size_type i, size_type j, value_type c) const
291 {
292 assert(i <= j and j <= size());
293 // size_type smaller = 0;
294 // size_type greater = (j - i) - (m_bv_rank.prefix_rank(j, c) - m_bv_rank.prefix_rank(i, c));
295 // if (c > 0)
296 // smaller = m_bv_rank.prefix_rank(j, c-1) - m_bv_rank.prefix_rank(i, c-1);
297 // size_type rank = m_bv_rank.rank(i, c);
298
299 // TODO: write a function returning a pair for (i, c) and (i, c-1) and benchmark!
300 size_type smaller = 0;
301 size_type prefix_i_c = m_bv_rank.prefix_rank(i, c);
302 size_type prefix_i_c_1 = 0;
303 size_type greater = j - i - m_bv_rank.prefix_rank(j, c) + prefix_i_c;
304 if (c > 0)
305 {
306 prefix_i_c_1 = m_bv_rank.prefix_rank(i, c - 1);
307 smaller = m_bv_rank.prefix_rank(j, c - 1) - prefix_i_c_1;
308 }
309 size_type rank = prefix_i_c - prefix_i_c_1;
310
311 return t_ret_type{rank, smaller, greater};
312 }
313
314 /*\brief How many symbols are lexicographic smaller than c in [0..i-1].
315 * \param i Exclusive right bound of the range.
316 * \param c Symbol c.
317 * \return A tuple containing:
318 * * rank(i,c)
319 * * #symbols smaller than c in [0..i-1]
320 * \par Precondition
321 * \f$ i \leq size() \f$
322 * \note
323 * This method is only available if lex_ordered = true
324 */
325 template <class t_ret_type = std::tuple<size_type, size_type>>
327 {
328 assert(i <= size());
329 // TODO: write a function returning a pair for (i, c) and (i, c-1) and benchmark!
330 size_type prefix_count_smaller = 0;
331 if (c > 0)
332 prefix_count_smaller = m_bv_rank.prefix_rank(i, c - 1);
333 return t_ret_type{m_bv_rank.prefix_rank(i, c) - prefix_count_smaller, prefix_count_smaller};
334 }
335
338 {
339 return const_iterator(this, 0);
340 }
341
344 {
345 return const_iterator(this, size());
346 }
347
349 size_type serialize(std::ostream & out, structure_tree_node * v = nullptr, std::string name = "") const
350 {
351 structure_tree_node * child = structure_tree::add_child(v, name, util::class_name(*this));
352 size_type written_bytes = 0;
353 written_bytes += write_member(m_size, out, child, "size");
354 written_bytes += write_member(m_sigma, out, child, "sigma");
355 written_bytes += m_bv.serialize(out, child, "bv");
356 written_bytes += m_bv_rank.serialize(out, child, "bv_rank");
357 structure_tree::add_size(child, written_bytes);
358 return written_bytes;
359 }
360
362 void load(std::istream & in)
363 {
364 read_member(m_size, in);
365 read_member(m_sigma, in);
366 m_bv.load(in);
367 m_bv_rank.load(in, &m_bv);
368 }
369
371 friend bool operator==(wt_epr const & lhs, wt_epr const & rhs) noexcept
372 {
373 return (lhs.m_size == rhs.m_size) && (lhs.m_sigma == rhs.m_sigma) && (lhs.m_bv == rhs.m_bv)
374 && (lhs.m_bv_rank == rhs.m_bv_rank);
375 }
376
378 friend bool operator!=(wt_epr const & lhs, wt_epr const & rhs) noexcept
379 {
380 return !(lhs == rhs);
381 }
382
383 template <typename archive_t>
384 void CEREAL_SAVE_FUNCTION_NAME(archive_t & ar) const
385 {
386 ar(CEREAL_NVP(m_size));
387 ar(CEREAL_NVP(m_sigma));
388 ar(CEREAL_NVP(m_bv));
389 ar(CEREAL_NVP(m_bv_rank));
390 }
391
392 template <typename archive_t>
393 void CEREAL_LOAD_FUNCTION_NAME(archive_t & ar)
394 {
395 ar(CEREAL_NVP(m_size));
396 ar(CEREAL_NVP(m_sigma));
397 ar(CEREAL_NVP(m_bv));
398 ar(CEREAL_NVP(m_bv_rank));
399 m_bv_rank.set_vector(&m_bv);
400 }
401};
402} // namespace sdsl
403
404#endif
cereal.hpp offers cereal support
#define CEREAL_NVP(X)
Definition cereal.hpp:31
A generic vector class for integers of width .
Definition io.hpp:36
int_vector_size_type size_type
ptrdiff_t difference_type
void load(std::istream &in)
Load the int_vector for a stream.
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serializes the int_vector to a stream.
int_vector_trait< t_width >::value_type value_type
uint8_t width() const noexcept
Returns the width of the integers which are accessed via the [] operator.
Generic iterator for a random access container.
Definition iterators.hpp:24
static structure_tree_node * add_child(structure_tree_node *v, std::string const &name, std::string const &type)
static void add_size(structure_tree_node *v, uint64_t value)
An EPR-dictionary based wavelet.
Definition wt_epr.hpp:46
random_access_const_iterator< wt_epr > const_iterator
Definition wt_epr.hpp:51
const_iterator begin() const
Returns a const_iterator to the first element.
Definition wt_epr.hpp:337
bool empty() const
Returns whether the wavelet tree contains no data.
Definition wt_epr.hpp:197
auto operator[](size_type const i) const
Recovers the i-th symbol of the original vector.
Definition wt_epr.hpp:211
wt_epr()=default
Default constructor.
size_type serialize(std::ostream &out, structure_tree_node *v=nullptr, std::string name="") const
Serializes the data structure into the given ostream.
Definition wt_epr.hpp:349
int_vector ::difference_type difference_type
Definition wt_epr.hpp:53
t_ret_type lex_smaller_count(size_type i, value_type c) const
Definition wt_epr.hpp:326
int_vector ::size_type size_type
Definition wt_epr.hpp:49
size_type rank(size_type i, value_type c) const
Calculates how many symbols c are in the prefix [0..i-1].
Definition wt_epr.hpp:227
friend bool operator!=(wt_epr const &lhs, wt_epr const &rhs) noexcept
Inequality operator.
Definition wt_epr.hpp:378
const_iterator end() const
Returns a const_iterator to the element after the last element.
Definition wt_epr.hpp:343
byte_alphabet_tag alphabet_category
Definition wt_epr.hpp:55
size_type size() const
Returns the size of the original vector.
Definition wt_epr.hpp:191
std::pair< size_type, value_type > inverse_select(size_type i) const
Calculates how many times symbol wt[i] occurs in the prefix [0..i-1].
Definition wt_epr.hpp:242
t_ret_type lex_count(size_type i, size_type j, value_type c) const
For each symbol c in wt[i..j-1] get rank(i,c) and rank(j,c).
Definition wt_epr.hpp:290
wt_tag index_category
Definition wt_epr.hpp:54
wt_epr(t_it begin, t_it end)
Construct the EPR-dictionary from a sequence defined by two interators.
Definition wt_epr.hpp:116
t_tree_strat::template type< wt_epr > tree_strat_type
Definition wt_epr.hpp:48
wt_epr(wt_epr &&wt)
Definition wt_epr.hpp:156
size_type const & sigma
Definition wt_epr.hpp:103
const_iterator iterator
Definition wt_epr.hpp:52
int_vector const & bv
Definition wt_epr.hpp:104
wt_epr(t_it begin, t_it end, std::string)
Definition wt_epr.hpp:147
void load(std::istream &in)
Loads the data structure from the given istream.
Definition wt_epr.hpp:362
wt_epr & operator=(wt_epr &&wt)
Move assignment operator.
Definition wt_epr.hpp:177
void CEREAL_LOAD_FUNCTION_NAME(archive_t &ar)
Definition wt_epr.hpp:393
wt_epr(wt_epr const &wt)
Copy constructor.
Definition wt_epr.hpp:151
void CEREAL_SAVE_FUNCTION_NAME(archive_t &ar) const
Definition wt_epr.hpp:384
wt_epr & operator=(wt_epr const &wt)
Assignment operator.
Definition wt_epr.hpp:166
int_vector ::value_type value_type
Definition wt_epr.hpp:50
friend bool operator==(wt_epr const &lhs, wt_epr const &rhs) noexcept
Equality operator.
Definition wt_epr.hpp:371
int_vector.hpp contains the sdsl::int_vector class.
io.hpp contains some methods for reading/writing sdsl structures.
iterators.hpp contains an generic iterator for random access containers.
Namespace for the succinct data structure library.
void calculate_effective_alphabet_size(t_rac const &C, sigma_type &sigma)
Definition wt_helper.hpp:62
void calculate_character_occurences(t_it begin, t_it end, t_rac &C)
Count for each character the number of occurrences in rac[0..size-1].
Definition wt_helper.hpp:47
size_t write_member(T const &t, std::ostream &out, sdsl::structure_tree_node *v=nullptr, std::string name="")
Definition io.hpp:94
void read_member(T &t, std::istream &in)
Definition io.hpp:119
rank_support_int_v.hpp contains rank_support_int_v.
Contains declarations and definitions of data structure concepts.
structure_tree.hpp contains a helper class which can represent the memory structure of a class.
util.hpp contains some helper methods for int_vector and other stuff like demangle class names.