88 typedef int_vector_type tIV;
89 typedef typename tIV::iterator int_iter;
90 typedef typename tIV::size_type size_type;
100 inline int64_t to_sign(uint64_t x)
const
102 return x & m_msb_mask ? -((int64_t)(x & ~m_msb_mask)) : x;
105 inline int64_t mark_pos(uint64_t x)
const
107 return (x & ~m_msb_mask);
110 inline int64_t mark_neg(uint64_t x)
const
112 return x | m_msb_mask;
115 inline bool not_neg(uint64_t x)
const
117 return !(x >> m_msb);
120 inline bool is_neg(uint64_t x)
const
122 return x & m_msb_mask;
125 inline uint64_t key(int_iter
const & p)
const
127 return m_VV[*p + m_hh];
130 inline void swap(int_iter & p, int_iter & q)
const
137 inline int_iter
const & med3(int_iter
const & a, int_iter
const & b, int_iter
const & c)
const
139 return key(a) < key(b) ? (key(b) < key(c) ? b : (key(a) < key(c) ? c : a))
140 : (key(b) > key(c) ? b : (key(a) > key(c) ? c : a));
145 void update_group(int_iter pl, int_iter pm)
147 int64_t g = pm - m_SA;
159 void select_sort_split(int_iter
const & p, int64_t n)
161 int_iter pa, pb, pi, pn;
168 for (pi = pb = (pa + 1), f = key(pa); pi <= pn; ++pi)
169 if ((v = key(pi)) < f)
180 update_group(pa, pb - 1);
185 m_VV[*pa] = pa - m_SA;
191 uint64_t choose_pivot(int_iter
const & p, int64_t n)
204 pl = med3(pl, pl + s, pl + s + s);
205 pm = med3(pm - s, pm, pm + s);
206 pn = med3(pn - s - s, pn - s, pn);
208 pm = med3(pl, pm, pn);
218 void sort_split(int_iter
const & p, int64_t n)
220 int_iter pa, pb, pc, pd, pl, pm, pn;
226 select_sort_split(p, n);
230 v = choose_pivot(p, n);
235 while (pb <= pc && (f = key(pb)) <= v)
244 while (pc >= pb && (f = key(pc)) >= v)
261 if ((s = pa - p) > (t = pb - pa))
263 for (pl = p, pm = pb - s; s; --s, ++pl, ++pm)
265 if ((s = pd - pc) > (t = pn - pd - 1))
267 for (pl = pb, pm = pn - s; s; --s, ++pl, ++pm)
275 std::cout <<
"s=" << s <<
">0 but should be <0; n=" << n << std::endl;
282 std::cout <<
"t=" << t <<
">0 but should be <0; n=" << n << std::endl;
287 update_group(p + s, p + n - t - 1);
289 sort_split(p + n - t, t);
300 void bucketsort(int_iter
const & x, int_iter
const & p, int64_t n, int64_t k)
306 for (pi = p; pi < p + k; ++pi)
308 for (i = 0; i <= n; ++i)
313 for (pi = p + k - 1, i = n; pi >= p; --pi)
329 p[i--] = mark_neg(1);
350 int64_t
transform(int_iter
const & x, int_iter
const & p, int64_t n, int64_t k, int64_t l, int64_t q)
354 std::cout <<
"q=" << q <<
" k-l=" << k - l << std::endl;
357 DBG_OUT <<
"transform(n=" << n <<
", k=" << k <<
", l=" << l <<
", q=" << q <<
")" << std::endl;
364 for (bb = dd = 0; (int)m_rr < n && (int)len < m_msb + 1 - s && (int64_t)(cc = dd << s | (k - l)) <= q;
367 bb = bb << s | (x[m_rr] - l + 1);
370 DBG_OUT <<
"m_rr=" << m_rr << std::endl;
371 uint64_t mm = (1ULL << (m_rr - 1) * s) - 1;
373 if ((int64_t)dd <= n)
375 for (pi = p; pi <= p + dd; ++pi)
377 for (pi = x + m_rr, cc = bb; pi <= x + n; ++pi)
380 cc = (cc & mm) << s | (*pi - l + 1);
382 for (uint64_t i = 1; i < m_rr; ++i)
387 for (pi = p, jj = 1; pi <= p + dd; ++pi)
390 for (pi = x, pj = x + m_rr, cc = bb; pj <= x + n; ++pi, ++pj)
393 cc = (cc & mm) << s | (*pj - l + 1);
403 for (pi = x, pj = x + m_rr, cc = bb; pj <= x + n; ++pi, ++pj)
406 cc = (cc & mm) << s | (*pj - l + 1);
416 DBG_OUT <<
"end transformation jj=" << jj << std::endl;
424 void sort(int_iter
const & x, int_iter
const & p, int64_t n, int64_t k, int64_t l)
431 int64_t j =
transform(m_VV, m_SA, n, k, l, n);
432 DBG_OUT <<
"begin bucketsort j=" << j << std::endl;
433 bucketsort(m_VV, m_SA, n, j);
434 DBG_OUT <<
"end bucketsort" << std::endl;
438 transform(m_VV, m_SA, n, k, l, m_msb_mask - 1);
439 DBG_OUT <<
"initialize SA begin" << std::endl;
440 for (int64_t i = 0; i <= n; ++i)
442 DBG_OUT <<
"initialize SA end" << std::endl;
444 sort_split(m_SA, n + 1);
448 while (to_sign(*m_SA) >= -n)
463 DBG_OUT <<
"*m_SA=" << to_sign(*m_SA) << std::endl;
467 DBG_OUT <<
"m_hh=" << m_hh << std::endl;
471 if (to_sign(s) < (int64_t)0)
480 *(pi - sl) = mark_neg(sl);
483 pk = m_SA + m_VV[s] + 1;
484 sort_split(pi, pk - pi);
488 while ((pi - m_SA) <= n);
490 *(pi - sl) = mark_neg(sl);
492 DBG_OUT <<
"m_hh=" << m_hh << std::endl;
494 for (int64_t i = 0; i <= n; ++i)
502 assert(x.size() > 0);
503 DBG_OUT <<
"x.width()=" << (int)x.width() << std::endl;
504 DBG_OUT <<
"x.size()=" << x.size() << std::endl;
505 DBG_OUT <<
"sa.width()=" << (int)sa.width() << std::endl;
506 DBG_OUT <<
"sa.size()=" << sa.size() << std::endl;
513 int64_t max_symbol = 0, min_symbol = x.width() < 64 ?
bits::lo_set[x.width()] : 0x7FFFFFFFFFFFFFFFLL;
515 for (size_type i = 0; i < x.size() - 1; ++i)
517 max_symbol = std::max(max_symbol, (int64_t)x[i]);
518 min_symbol = std::min(min_symbol, (int64_t)x[i]);
523 throw std::logic_error(
"Text contains 0-symbol. Suffix array can not be constructed.");
525 if (x[x.size() - 1] > 0)
527 throw std::logic_error(
"Last symbol is not 0-symbol. Suffix array can not be constructed.");
529 DBG_OUT <<
"sorter: min_symbol=" << min_symbol << std::endl;
530 DBG_OUT <<
"sorter: max_symbol=" << max_symbol << std::endl;
532 int64_t n = x.size() - 1;
533 DBG_OUT <<
"x.size()-1=" << x.size() - 1 <<
" n=" << n << std::endl;
535 DBG_OUT <<
"sorter: width=" << (int)width <<
" max_symbol_width=" <<
bits::hi(max_symbol) + 1
536 <<
" n_width=" <<
bits::hi(n) << std::endl;
539 if (sa.width() < x.width())
541 throw std::logic_error(
"Fixed size suffix array is to small for the specified text.");
545 m_msb = sa.width() - 1;
546 m_msb_mask = 1ULL << m_msb;
547 DBG_OUT <<
"sorter: m_msb=" << (int)m_msb <<
" m_msb_mask=" << m_msb_mask << std::endl;
548 sort(x.begin(), sa.begin(), x.size() - 1, max_symbol + 1, min_symbol);
551 void sort(tIV & sa,
char const * file_name, uint8_t num_bytes)
553 DBG_OUT <<
"sorter: sort(" << file_name <<
")" << std::endl;
557 if (num_bytes == 0 and
typeid(
typename tIV::reference) ==
typeid(uint64_t))
559 DBG_OUT <<
"sorter: use int_vector<64>" << std::endl;
562 x.resize(temp.
size());
563 for (size_type i = 0; i < temp.
size(); ++i)
574 template <
class t_vec>
575 void sort(tIV & sa, t_vec & text)
578 x.resize(text.size());
579 for (size_type i = 0; i < text.size(); ++i)