SDSL 3.0.1
Succinct Data Structure Library
Loading...
Searching...
No Matches
suffix_tree_algorithm.hpp
Go to the documentation of this file.
1// Copyright (c) 2016, the SDSL Project Authors. All rights reserved.
2// Please see the AUTHORS file for details. Use of this source code is governed
3// by a BSD license that can be found in the LICENSE file.
8#ifndef INCLUDED_SDSL_SUFFIX_TREE_ALGORITHM
9#define INCLUDED_SDSL_SUFFIX_TREE_ALGORITHM
10
11#include <iterator>
12
14
15namespace sdsl
16{
17
19
31template <class t_cst>
32typename t_cst::size_type
33forward_search(const t_cst & cst,
34 typename t_cst::node_type & v,
35 const typename t_cst::size_type d,
36 const typename t_cst::char_type c,
37 typename t_cst::size_type & char_pos,
39 typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value, cst_tag>::type x =
40 cst_tag())
41{
42 auto cc = cst.csa.char2comp[c]; // check if c occurs in the text of the csa
43 if (cc == 0 and cc != c) // " " " " " " " " " "
44 return 0;
45 typename t_cst::size_type depth_node = cst.depth(v);
46 if (d < depth_node)
47 { // in an edge, no branching
48 char_pos = cst.csa.psi[char_pos];
49 if (char_pos < cst.csa.C[cc] or char_pos >= cst.csa.C[cc + 1]) return 0;
50 return cst.size(v);
51 }
52 else if (d == depth_node)
53 { // at a node, branching
54 v = cst.child(v, c, char_pos);
55 if (v == cst.root())
56 return 0;
57 else
58 return cst.size(v);
59 }
60 else
61 {
62 return 0;
63 }
64}
65
67
78template <class t_cst, class t_pat_iter>
79typename t_cst::size_type
80forward_search(const t_cst & cst,
81 typename t_cst::node_type & v,
82 typename t_cst::size_type d,
83 t_pat_iter begin,
84 t_pat_iter end,
85 typename t_cst::size_type & char_pos,
87 typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value, cst_tag>::type x =
88 cst_tag())
89{
90 if (begin == end) return cst.size(v);
91 typename t_cst::size_type size = 0;
92 t_pat_iter it = begin;
93 while (it != end and (size = forward_search(cst, v, d, *it, char_pos)))
94 {
95 ++d;
96 ++it;
97 }
98 return size;
99}
100
102
114template <class t_cst, class t_pat_iter>
115typename t_cst::size_type count(const t_cst & cst, t_pat_iter begin, t_pat_iter end, cst_tag)
116{
117 return count(cst.csa, begin, end);
118}
119
121
135template <class t_cst, class t_pat_iter, class t_rac = int_vector<64>>
136t_rac locate(const t_cst & cst,
137 t_pat_iter begin,
138 t_pat_iter end,
140 typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value, cst_tag>::type x =
141 cst_tag())
142{
143 return locate(cst.csa, begin, end);
144}
145
147
157template <class t_cst, class t_text_iter>
158typename t_cst::size_type extract(const t_cst & cst,
159 const typename t_cst::node_type & v,
160 t_text_iter text,
162 typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value,
163 cst_tag>::type x = cst_tag())
164{
165 if (v == cst.root())
166 {
167 text[0] = 0;
168 return 0;
169 }
170 // first get the suffix array entry of the leftmost leaf in the subtree rooted at v
171 typename t_cst::size_type begin = cst.csa[cst.lb(v)];
172 // then call the extract method on the compressed suffix array
173 extract(cst.csa, begin, begin + cst.depth(v) - 1, text);
174}
175
177
185template <class t_cst>
186typename t_cst::csa_type::string_type
187extract(const t_cst & cst,
188 const typename t_cst::node_type & v,
190 typename std::enable_if<std::is_same<cst_tag, typename t_cst::index_category>::value, cst_tag>::type x =
191 cst_tag())
192{
193 typedef typename t_cst::csa_type::string_type t_rac;
194 if (v == cst.root()) { return t_rac{}; }
195 // first get the suffix array entry of the leftmost leaf in the subtree rooted at v
196 typename t_cst::size_type begin = cst.csa[cst.lb(v)];
197 // then call the extract method on the compressed suffix array
198 return extract(cst.csa, begin, begin + cst.depth(v) - 1);
199}
200
202
208template <class t_cst>
209double H0(const typename t_cst::node_type & v, const t_cst & cst)
210{
211 if (cst.is_leaf(v)) { return 0; }
212 else
213 {
214 double h0 = 0;
215 auto n = cst.size(v);
216 for (const auto & child : cst.children(v))
217 {
218 double p = ((double)cst.size(child)) / n;
219 h0 -= p * log2(p);
220 }
221 return h0;
222 }
223}
224
226
231template <class t_cst>
232std::pair<double, size_t> Hk(const t_cst & cst, typename t_cst::size_type k)
233{
234 double hk = 0;
235 size_t context = 0;
236 std::set<typename t_cst::size_type> leafs_with_d_smaller_k;
237 for (typename t_cst::size_type d = 1; d < k; ++d)
238 {
239 leafs_with_d_smaller_k.insert(cst.csa.isa[cst.csa.size() - d]);
240 }
241 for (typename t_cst::const_iterator it = cst.begin(), end = cst.end(); it != end; ++it)
242 {
243 if (it.visit() == 1)
244 {
245 if (!cst.is_leaf(*it))
246 {
247 typename t_cst::size_type d = cst.depth(*it);
248 if (d >= k)
249 {
250 if (d == k) { hk += cst.size(*it) * H0(*it, cst); }
251 ++context;
252 it.skip_subtree();
253 }
254 }
255 else
256 {
257 // if d of leaf is >= k, add context
258 if (leafs_with_d_smaller_k.find(cst.lb(*it)) == leafs_with_d_smaller_k.end()) { ++context; }
259 }
260 }
261 }
262 hk /= cst.size();
263 return { hk, context };
264}
265
266} // namespace sdsl
267#endif
#define SDSL_UNUSED
Definition: config.hpp:13
Namespace for the succinct data structure library.
std::pair< double, size_t > Hk(const t_cst &cst, typename t_cst::size_type k)
Calculate the k-th order entropy of a text.
uint64_t count(const t_k2_treap &treap, k2_treap_ns::point_type p1, k2_treap_ns::point_type p2)
Count how many points are in the rectangle (p1,p2)
t_csa::size_type forward_search(const t_csa &csa, typename t_csa::size_type l, typename t_csa::size_type r, t_pat_iter begin, t_pat_iter end, typename t_csa::size_type &l_res, typename t_csa::size_type &r_res, SDSL_UNUSED typename std::enable_if< std::is_same< csa_tag, typename t_csa::index_category >::value, csa_tag >::type x=csa_tag())
Forward search for a pattern in an -interval in the CSA.
t_rac locate(const t_csa &csa, t_pat_iter begin, t_pat_iter end, SDSL_UNUSED typename std::enable_if< std::is_same< csa_tag, typename t_csa::index_category >::value, csa_tag >::type x=csa_tag())
Calculates all occurrences of a pattern pat in a CSA.
t_csa::size_type extract(const t_csa &csa, typename t_csa::size_type begin, typename t_csa::size_type end, t_text_iter text, SDSL_UNUSED typename std::enable_if< std::is_same< csa_tag, typename t_csa::index_category >::value, csa_tag >::type x=csa_tag())
Writes the substring T[begin..end] of the original text T to text[0..end-begin+1].
double H0(const typename t_cst::node_type &v, const t_cst &cst)
Calculate the zeroth order entropy of the text that follows a certain substring s.
int_vector ::size_type size(const range_type &r)
Size of a range.
Definition: wt_helper.hpp:787
suffix_array_algorithm.hpp contains algorithms on CSAs