SDSL 3.0.2
Succinct Data Structure Library
Loading...
Searching...
No Matches
suffix_tree_algorithm.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
8#ifndef INCLUDED_SDSL_SUFFIX_TREE_ALGORITHM
9#define INCLUDED_SDSL_SUFFIX_TREE_ALGORITHM
10
11#include <math.h>
12#include <set>
13#include <stddef.h>
14#include <type_traits>
15#include <utility>
16
17#include <sdsl/config.hpp>
18#include <sdsl/int_vector.hpp>
20
21// clang-format off
22// Cyclic includes start
24// Cyclic includes end
25// clang-format on
26
27namespace sdsl
28{
29
31
43template <class t_cst>
44typename t_cst::size_type forward_search(
45 t_cst const & cst,
46 typename t_cst::node_type & v,
47 const typename t_cst::size_type d,
48 const typename t_cst::char_type c,
49 typename t_cst::size_type & char_pos,
51 typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value, cst_tag>::type x = cst_tag())
52{
53 auto cc = cst.csa.char2comp[c]; // check if c occurs in the text of the csa
54 if (cc == 0 and cc != c) // " " " " " " " " " "
55 return 0;
56 typename t_cst::size_type depth_node = cst.depth(v);
57 if (d < depth_node)
58 { // in an edge, no branching
59 char_pos = cst.csa.psi[char_pos];
60 if (char_pos < cst.csa.C[cc] or char_pos >= cst.csa.C[cc + 1])
61 return 0;
62 return cst.size(v);
63 }
64 else if (d == depth_node)
65 { // at a node, branching
66 v = cst.child(v, c, char_pos);
67 if (v == cst.root())
68 return 0;
69 else
70 return cst.size(v);
71 }
72 else
73 {
74 return 0;
75 }
76}
77
79
90template <class t_cst, class t_pat_iter>
91typename t_cst::size_type forward_search(
92 t_cst const & cst,
93 typename t_cst::node_type & v,
94 typename t_cst::size_type d,
95 t_pat_iter begin,
96 t_pat_iter end,
97 typename t_cst::size_type & char_pos,
99 typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value, cst_tag>::type x = cst_tag())
100{
101 if (begin == end)
102 return cst.size(v);
103 typename t_cst::size_type size = 0;
104 t_pat_iter it = begin;
105 while (it != end and (size = forward_search(cst, v, d, *it, char_pos)))
106 {
107 ++d;
108 ++it;
109 }
110 return size;
111}
112
114
126template <class t_cst, class t_pat_iter>
127typename t_cst::size_type count(t_cst const & cst, t_pat_iter begin, t_pat_iter end, cst_tag)
128{
129 return count(cst.csa, begin, end);
130}
131
133
147template <class t_cst, class t_pat_iter, class t_rac = int_vector<64>>
148t_rac locate(t_cst const & cst,
149 t_pat_iter begin,
150 t_pat_iter end,
152 typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value, cst_tag>::type x =
153 cst_tag())
154{
155 return locate(cst.csa, begin, end);
156}
157
159
169template <class t_cst, class t_text_iter>
170typename t_cst::size_type extract(
171 t_cst const & cst,
172 const typename t_cst::node_type & v,
173 t_text_iter text,
175 typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value, cst_tag>::type x = cst_tag())
176{
177 if (v == cst.root())
178 {
179 text[0] = 0;
180 return 0;
181 }
182 // first get the suffix array entry of the leftmost leaf in the subtree rooted at v
183 typename t_cst::size_type begin = cst.csa[cst.lb(v)];
184 // then call the extract method on the compressed suffix array
185 extract(cst.csa, begin, begin + cst.depth(v) - 1, text);
186}
187
189
197template <class t_cst>
198typename t_cst::csa_type::string_type extract(
199 t_cst const & cst,
200 const typename t_cst::node_type & v,
202 typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value, cst_tag>::type x = cst_tag())
203{
204 typedef typename t_cst::csa_type::string_type t_rac;
205 if (v == cst.root())
206 {
207 return t_rac{};
208 }
209 // first get the suffix array entry of the leftmost leaf in the subtree rooted at v
210 typename t_cst::size_type begin = cst.csa[cst.lb(v)];
211 // then call the extract method on the compressed suffix array
212 return extract(cst.csa, begin, begin + cst.depth(v) - 1);
213}
214
216
222template <class t_cst>
223double H0(const typename t_cst::node_type & v, t_cst const & cst)
224{
225 if (cst.is_leaf(v))
226 {
227 return 0;
228 }
229 else
230 {
231 double h0 = 0;
232 auto n = cst.size(v);
233 for (auto const & child : cst.children(v))
234 {
235 double p = ((double)cst.size(child)) / n;
236 h0 -= p * log2(p);
237 }
238 return h0;
239 }
240}
241
243
248template <class t_cst>
249std::pair<double, size_t> Hk(t_cst const & cst, typename t_cst::size_type k)
250{
251 double hk = 0;
252 size_t context = 0;
253 std::set<typename t_cst::size_type> leafs_with_d_smaller_k;
254 for (typename t_cst::size_type d = 1; d < k; ++d)
255 {
256 leafs_with_d_smaller_k.insert(cst.csa.isa[cst.csa.size() - d]);
257 }
258 for (typename t_cst::const_iterator it = cst.begin(), end = cst.end(); it != end; ++it)
259 {
260 if (it.visit() == 1)
261 {
262 if (!cst.is_leaf(*it))
263 {
264 typename t_cst::size_type d = cst.depth(*it);
265 if (d >= k)
266 {
267 if (d == k)
268 {
269 hk += cst.size(*it) * H0(*it, cst);
270 }
271 ++context;
272 it.skip_subtree();
273 }
274 }
275 else
276 {
277 // if d of leaf is >= k, add context
278 if (leafs_with_d_smaller_k.find(cst.lb(*it)) == leafs_with_d_smaller_k.end())
279 {
280 ++context;
281 }
282 }
283 }
284 }
285 hk /= cst.size();
286 return {hk, context};
287}
288
289} // namespace sdsl
290#endif
#define SDSL_UNUSED
Definition config.hpp:12
int_vector.hpp contains the sdsl::int_vector class.
Namespace for the succinct data structure library.
t_csa::size_type forward_search(t_csa const &csa, typename t_csa::size_type l, typename t_csa::size_type r, t_pat_iter begin, t_pat_iter end, typename t_csa::size_type &l_res, typename t_csa::size_type &r_res, SDSL_UNUSED typename std::enable_if< std::is_same< csa_tag, typename t_csa::index_category >::value, csa_tag >::type x=csa_tag())
Forward search for a pattern in an -interval in the CSA.
t_csa::size_type extract(t_csa const &csa, typename t_csa::size_type begin, typename t_csa::size_type end, t_text_iter text, SDSL_UNUSED typename std::enable_if< std::is_same< csa_tag, typename t_csa::index_category >::value, csa_tag >::type x=csa_tag())
Writes the substring T[begin..end] of the original text T to text[0..end-begin+1].
std::pair< double, size_t > Hk(t_cst const &cst, typename t_cst::size_type k)
Calculate the k-th order entropy of a text.
uint64_t count(t_k2_treap const &treap, k2_treap_ns::point_type p1, k2_treap_ns::point_type p2)
Count how many points are in the rectangle (p1,p2)
double H0(const typename t_cst::node_type &v, t_cst const &cst)
Calculate the zeroth order entropy of the text that follows a certain substring s.
int_vector ::size_type size(range_type const &r)
Size of a range.
t_rac locate(t_csa const &csa, t_pat_iter begin, t_pat_iter end, SDSL_UNUSED typename std::enable_if< std::is_same< csa_tag, typename t_csa::index_category >::value, csa_tag >::type x=csa_tag())
Calculates all occurrences of a pattern pat in a CSA.
Contains declarations and definitions of data structure concepts.
suffix_array_algorithm.hpp contains algorithms on CSAs