SDSL 3.0.1
Succinct Data Structure Library
Loading...
Searching...
No Matches
coder_comma.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
8#ifndef SDSL_CODER_COMMA_INCLUDED
9#define SDSL_CODER_COMMA_INCLUDED
10
11#include <algorithm>
12#include <array>
13#include <math.h>
14
15#include <sdsl/bits.hpp>
16#include <sdsl/config.hpp>
17
18namespace sdsl
19{
20
21namespace coder
22{
23
25
46template <uint8_t t_width = 2>
47class comma
48{
49 private:
50 static_assert(t_width > 1 && t_width <= 32, "comma coder: Width must be in interval [2,32]");
51
52 static constexpr size_t base_fits_in_64(uint32_t base, uint64_t product, size_t res)
53 {
54 return product == 0 ? res : base_fits_in_64(base, product / base, res + 1);
55 }
56
57 // base in which numbers are coded
58 static const uint32_t base = (1 << t_width) - 1;
59
60 // table needed for computation of encoding lengths.
61 // table contains entries of the kind (index, base^index)
62 // to know how much digits a number needs to be encoded.
63 static constexpr size_t codelentbllen = base_fits_in_64(base, 0xFFFFFFFFFFFFFFFFULL, 0);
64
65 static struct impl
66 {
67 std::array<uint64_t, codelentbllen> codelentbl;
68 impl()
69 {
70 // intialize codelentbl
71 uint64_t n = 1;
72 for (size_t i = 0; i < codelentbllen; i++)
73 {
74 codelentbl[i] = n;
75 n = (n << t_width) - n; // n = n * base
76 }
77 }
78 } data;
79
80 // helper function to encode a single number without
81 // termination digit
82 static void encode_in_base(uint64_t x, uint64_t *& z, uint8_t & offset);
83
84 public:
85 typedef uint64_t size_type;
86 static const uint8_t min_codeword_length = t_width; // 0 needs t_width bits as termination
87
89
91 // the value w in comma code.
94 static uint8_t encoding_length(uint64_t w);
95
97 // at bit position start_idx.
98 /* \param x Positive integer to encode.
99 * \param z Raw data of vector to write the encoded form of x.
100 * \param start_idx Beginning bit index to write the encoded form ox x in z.
101 */
102 static void encode(uint64_t x, uint64_t *& z, uint8_t & offset);
103
105 /* \param v vector containing positive integer values
106 * \param z vector to put the encoded values
107 */
108 template <class int_vector>
109 static bool encode(const int_vector & v, int_vector & z);
110
112
114 // in the bitstring "data"
115 /* \param data Bitstring
116 * \param start_idx Starting index of the decoding.
117 * \param n Number of values to decode from the bitstring.
118 * \param it Iterator to store the values.
119 */
120 template <bool t_sumup, bool t_inc, class t_iter>
121 static uint64_t decode(const uint64_t * data, const size_type start_idx, size_type n, t_iter it = (t_iter) nullptr);
122
124 // beginning at start_idx in the bitstring "data"
125 // and return the sum of these values.
134 static uint64_t decode_prefix_sum(const uint64_t * data, const size_type start_idx, size_type n);
135
137 // beginning at start_idx ending at end_idx (exclusive)
138 // in the bitstring "data"
139 // and return the sum of these values.
150 static uint64_t decode_prefix_sum(const uint64_t * data,
151 const size_type start_idx,
152 const size_type end_idx,
153 size_type n);
154
156 // and store them in vector v.
160 template <class int_vector>
161 static bool decode(const int_vector & z, int_vector & v);
162
163 // interface needs this function for whatever :>
164 template <class int_vector>
165 static uint64_t * raw_data(int_vector & v)
166 {
167 return v.m_data;
168 }
169};
170
172
174
176
177template <uint8_t t_width>
178typename comma<t_width>::impl comma<t_width>::data;
179
180template <uint8_t t_width>
181inline uint8_t comma<t_width>::encoding_length(uint64_t w)
182{
183 // use function table and binary search to determine the number of digits
184 // needed to encode w in given base.
185 uint8_t numdigits = std::upper_bound(data.codelentbl.begin(), data.codelentbl.end(), w) - data.codelentbl.begin();
186 // finally calculate length.
187 // Don't forget termination character on calculations ;)
188 return (numdigits + 1) * t_width;
189}
190
191template <uint8_t t_width>
192void comma<t_width>::encode_in_base(uint64_t x, uint64_t *& z, uint8_t & offset)
193{
194 if (x)
195 {
196 uint32_t digit = x % base; // get next digit
197 // encode digits with higher order
198 encode_in_base(x / base, z, offset);
199 // and write own digit
200 bits::write_int_and_move(z, digit, offset, t_width);
201 }
202}
203
204template <uint8_t t_width>
205inline void comma<t_width>::encode(uint64_t x, uint64_t *& z, uint8_t & offset)
206{
207 // encode x itself
208 encode_in_base(x, z, offset);
209 // and append the termination digit
210 bits::write_int_and_move(z, base, offset, t_width);
211}
212
213template <uint8_t t_width>
214template <class int_vector>
216{
217 // first, find out how much bits vector z needs to save values
218 typedef typename int_vector::size_type size_type;
219 size_type z_bit_size = 0;
220 for (typename int_vector::const_iterator it = v.begin(), end = v.end(); it != end; ++it)
221 {
222 z_bit_size += encoding_length(*it);
223 }
224
225 // trim vector z to correct size
226 z.width(v.width());
227 z.bit_resize(z_bit_size); // for future may check if resizing works
228 z.shrink_to_fit();
229
230 // iterate again and save values in z
231 uint64_t * z_data = z.m_data;
232 uint8_t offset = 0;
233 for (typename int_vector::const_iterator it = v.begin(), end = v.end(); it != end; ++it)
234 {
235 encode(*it, z_data, offset);
236 }
237 return true;
238}
239
241
242template <uint8_t t_width>
243template <bool t_sumup, bool t_inc, class t_iter>
244inline uint64_t comma<t_width>::decode(const uint64_t * data, const size_type start_idx, size_type n, t_iter it)
245{
246 data += (start_idx >> 6); // jump to byte offset
247 uint8_t offset = start_idx & 0x3F; // and calculate bit offset
248 uint64_t value = 0;
249 for (size_type i = 0; i < n; i++)
250 {
251 // read next value
252 uint64_t v = 0;
253 for (uint32_t digit = (uint32_t)bits::read_int_and_move(data, offset, t_width); // read first digit
254 digit != base; // while digit is not the terminating digit
255 v = (v << t_width) - v + digit, // v = v * base + digit
256 digit = (uint32_t)bits::read_int_and_move(data, offset, t_width))
257 ; // and read next digit
258 // now decide how to handle value
259 value = (t_sumup) ? value + v : v;
260 if (t_inc) *(it++) = value;
261 }
262 return value;
263}
264
265template <uint8_t t_width>
266uint64_t comma<t_width>::decode_prefix_sum(const uint64_t * data, const size_type start_idx, size_type n)
267{
268 // easiest seems to be to use already build function decode...
269 return decode<true, false, int *>(data, start_idx, n);
270 // Note for above: 3rd template parameter ca be any pntr except void *
271}
272
273template <uint8_t t_width>
274uint64_t comma<t_width>::decode_prefix_sum(const uint64_t * data,
275 const size_type start_idx,
276 SDSL_UNUSED const size_type end_idx,
277 size_type n)
278{
279 // end index does not change anything here...
280 return decode_prefix_sum(data, start_idx, n);
281}
282
283template <uint8_t t_width>
284template <class int_vector>
286{
287 // check if bit size is dividable through t_width.
288 if (z.bit_size() % t_width != 0) return false;
289
290 // calculate num of overall digits in z (including terminating digit)
291 uint64_t numOfDigits = z.bit_size() / t_width;
292 // iteration vars for z vector
293 const uint64_t * z_data = z.data();
294 uint8_t z_offset = 0;
295 // utility to count number of entries in z, and last read digit
296 uint32_t digit = base;
297 typename int_vector::size_type n = 0;
298
299 // iterate over all digits. each time a termination digit is
300 // detected, a encoded number in vector ends.
301 while (numOfDigits--)
302 {
303 digit = (uint32_t)bits::read_int_and_move(z_data, z_offset, t_width);
304 if (digit == base) n++; // termination digit detected
305 }
306
307 // also, ensure last read digit was a termination digit
308 if (digit != base) return false;
309
310 // resize vector v
311 v.width(z.width());
312 v.resize(n);
313 v.shrink_to_fit();
314
315 // and finally decode and save result in v
316 decode<false, true>(z.data(), 0, n, v.begin());
317 return true;
318}
319
320} // end of namespace coder
321} // end of namespace sdsl
322
323#endif
bits.hpp contains the sdsl::bits class.
A class to encode and decode between comma code and binary code.
Definition: coder_comma.hpp:48
static uint64_t decode_prefix_sum(const uint64_t *data, const size_type start_idx, const size_type end_idx, size_type n)
Decode n comma gamma encoded integers.
static uint8_t encoding_length(uint64_t w)
Get the number of bits that are necessary to encode.
static uint64_t decode_prefix_sum(const uint64_t *data, const size_type start_idx, size_type n)
Decode n comma gamma encoded integers.
static void encode(uint64_t x, uint64_t *&z, uint8_t &offset)
Encode one positive integer x to an int_vector.
static const uint8_t min_codeword_length
Definition: coder_comma.hpp:86
static uint64_t decode(const uint64_t *data, const size_type start_idx, size_type n, t_iter it=(t_iter) nullptr)
Decode n comma encoded values beginning at start_idx.
static uint64_t * raw_data(int_vector &v)
A generic vector class for integers of width .
Definition: int_vector.hpp:216
iterator end() noexcept
Iterator that points to the element after the last element of int_vector.
Definition: int_vector.hpp:764
int_vector_size_type size_type
Definition: int_vector.hpp:229
size_type bit_size() const noexcept
The number of bits in the int_vector.
Definition: int_vector.hpp:547
void shrink_to_fit()
Free unused allocated memory.
Definition: int_vector.hpp:506
uint8_t width() const noexcept
Returns the width of the integers which are accessed via the [] operator.
Definition: int_vector.hpp:595
const uint64_t * data() const noexcept
Pointer to the raw data of the int_vector.
Definition: int_vector.hpp:566
void resize(const size_type size)
Resize the int_vector in terms of elements.
Definition: int_vector.hpp:521
iterator begin() noexcept
Iterator that points to the first element of the int_vector.
Definition: int_vector.hpp:759
#define SDSL_UNUSED
Definition: config.hpp:13
Namespace for the succinct data structure library.
static constexpr void write_int_and_move(uint64_t *&word, uint64_t x, uint8_t &offset, const uint8_t len)
Writes value x to an bit position in an array and moves the bit-pointer.
Definition: bits.hpp:742
static constexpr uint64_t read_int_and_move(const uint64_t *&word, uint8_t &offset, const uint8_t len=64)
Reads a value from a bit position in an array and moved the bit-pointer.
Definition: bits.hpp:792