Point Cloud Library (PCL) 1.12.0
Loading...
Searching...
No Matches
utils.hpp
1/*
2 * Software License Agreement (BSD License)
3 *
4 * Point Cloud Library (PCL) - www.pointclouds.org
5 * Copyright (c) 2011, Willow Garage, Inc.
6 *
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * * Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * * Redistributions in binary form must reproduce the above
16 * copyright notice, this list of conditions and the following
17 * disclaimer in the documentation and/or other materials provided
18 * with the distribution.
19 * * Neither the name of Willow Garage, Inc. nor the names of its
20 * contributors may be used to endorse or promote products derived
21 * from this software without specific prior written permission.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 * POSSIBILITY OF SUCH DAMAGE.
35 *
36 */
37
38
39#ifndef PCL_GPU_KINFU_CUDA_UTILS_HPP_
40#define PCL_GPU_KINFU_CUDA_UTILS_HPP_
41
42#include <pcl/common/utils.h> // pcl::utils::ignore
43
44#include <limits>
45
46#include <cuda.h>
47
48namespace pcl
49{
50 namespace device
51 {
52 namespace kinfuLS
53 {
54 template <class T>
55 __device__ __host__ __forceinline__ void swap ( T& a, T& b )
56 {
57 T c(a); a=b; b=c;
58 }
59
60 __device__ __forceinline__ float
61 dot(const float3& v1, const float3& v2)
62 {
63 return v1.x * v2.x + v1.y*v2.y + v1.z*v2.z;
64 }
65
66 __device__ __forceinline__ float3&
67 operator+=(float3& vec, const float& v)
68 {
69 vec.x += v; vec.y += v; vec.z += v; return vec;
70 }
71
72 __device__ __forceinline__ float3
73 operator+(const float3& v1, const float3& v2)
74 {
75 return make_float3(v1.x + v2.x, v1.y + v2.y, v1.z + v2.z);
76 }
77
78 __device__ __forceinline__ float3&
79 operator*=(float3& vec, const float& v)
80 {
81 vec.x *= v; vec.y *= v; vec.z *= v; return vec;
82 }
83
84 __device__ __forceinline__ float3
85 operator-(const float3& v1, const float3& v2)
86 {
87 return make_float3(v1.x - v2.x, v1.y - v2.y, v1.z - v2.z);
88 }
89
90 __device__ __forceinline__ float3
91 operator*(const float3& v1, const float& v)
92 {
93 return make_float3(v1.x * v, v1.y * v, v1.z * v);
94 }
95
96 __device__ __forceinline__ float
97 norm(const float3& v)
98 {
99 return sqrt(dot(v, v));
100 }
101
102 __device__ __forceinline__ float3
103 normalized(const float3& v)
104 {
105 return v * rsqrt(dot(v, v));
106 }
107
108 __device__ __host__ __forceinline__ float3
109 cross(const float3& v1, const float3& v2)
110 {
111 return make_float3(v1.y * v2.z - v1.z * v2.y, v1.z * v2.x - v1.x * v2.z, v1.x * v2.y - v1.y * v2.x);
112 }
113
114 __device__ __forceinline__ void computeRoots2(const float& b, const float& c, float3& roots)
115 {
116 roots.x = 0.f;
117 float d = b * b - 4.f * c;
118 if (d < 0.f) // no real roots!!!! THIS SHOULD NOT HAPPEN!
119 d = 0.f;
120
121 float sd = sqrtf(d);
122
123 roots.z = 0.5f * (b + sd);
124 roots.y = 0.5f * (b - sd);
125 }
126
127 __device__ __forceinline__ void
128 computeRoots3(float c0, float c1, float c2, float3& roots)
129 {
130 if ( std::abs(c0) < std::numeric_limits<float>::epsilon())// one root is 0 -> quadratic equation
131 {
132 computeRoots2 (c2, c1, roots);
133 }
134 else
135 {
136 const float s_inv3 = 1.f/3.f;
137 const float s_sqrt3 = sqrtf(3.f);
138 // Construct the parameters used in classifying the roots of the equation
139 // and in solving the equation for the roots in closed form.
140 float c2_over_3 = c2 * s_inv3;
141 float a_over_3 = (c1 - c2*c2_over_3)*s_inv3;
142 if (a_over_3 > 0.f)
143 a_over_3 = 0.f;
144
145 float half_b = 0.5f * (c0 + c2_over_3 * (2.f * c2_over_3 * c2_over_3 - c1));
146
147 float q = half_b * half_b + a_over_3 * a_over_3 * a_over_3;
148 if (q > 0.f)
149 q = 0.f;
150
151 // Compute the eigenvalues by solving for the roots of the polynomial.
152 float rho = sqrtf(-a_over_3);
153 float theta = std::atan2 (sqrtf (-q), half_b)*s_inv3;
154 float cos_theta = __cosf (theta);
155 float sin_theta = __sinf (theta);
156 roots.x = c2_over_3 + 2.f * rho * cos_theta;
157 roots.y = c2_over_3 - rho * (cos_theta + s_sqrt3 * sin_theta);
158 roots.z = c2_over_3 - rho * (cos_theta - s_sqrt3 * sin_theta);
159
160 // Sort in increasing order.
161 if (roots.x >= roots.y)
162 swap(roots.x, roots.y);
163
164 if (roots.y >= roots.z)
165 {
166 swap(roots.y, roots.z);
167
168 if (roots.x >= roots.y)
169 swap (roots.x, roots.y);
170 }
171 if (roots.x <= 0) // eigenval for symmetric positive semi-definite matrix can not be negative! Set it to 0
172 computeRoots2 (c2, c1, roots);
173 }
174 }
175
176 struct Eigen33
177 {
178 public:
179 template<int Rows>
180 struct MiniMat
181 {
182 float3 data[Rows];
183 __device__ __host__ __forceinline__ float3& operator[](int i) { return data[i]; }
184 __device__ __host__ __forceinline__ const float3& operator[](int i) const { return data[i]; }
185 };
188
189
190 static __forceinline__ __device__ float3
191 unitOrthogonal (const float3& src)
192 {
193 float3 perp;
194 /* Let us compute the crossed product of *this with a vector
195 * that is not too close to being colinear to *this.
196 */
197
198 /* unless the x and y coords are both close to zero, we can
199 * simply take ( -y, x, 0 ) and normalize it.
200 */
201 if(!isMuchSmallerThan(src.x, src.z) || !isMuchSmallerThan(src.y, src.z))
202 {
203 float invnm = rsqrtf(src.x*src.x + src.y*src.y);
204 perp.x = -src.y * invnm;
205 perp.y = src.x * invnm;
206 perp.z = 0.0f;
207 }
208 /* if both x and y are close to zero, then the vector is close
209 * to the z-axis, so it's far from colinear to the x-axis for instance.
210 * So we take the crossed product with (1,0,0) and normalize it.
211 */
212 else
213 {
214 float invnm = rsqrtf(src.z * src.z + src.y * src.y);
215 perp.x = 0.0f;
216 perp.y = -src.z * invnm;
217 perp.z = src.y * invnm;
218 }
219
220 return perp;
221 }
222
223 __device__ __forceinline__
224 Eigen33(volatile float* mat_pkg_arg) : mat_pkg(mat_pkg_arg) {}
225 __device__ __forceinline__ void
226 compute(Mat33& tmp, Mat33& vec_tmp, Mat33& evecs, float3& evals)
227 {
228 // Scale the matrix so its entries are in [-1,1]. The scaling is applied
229 // only when at least one matrix entry has magnitude larger than 1.
230
231 float max01 = fmaxf( std::abs(mat_pkg[0]), std::abs(mat_pkg[1]) );
232 float max23 = fmaxf( std::abs(mat_pkg[2]), std::abs(mat_pkg[3]) );
233 float max45 = fmaxf( std::abs(mat_pkg[4]), std::abs(mat_pkg[5]) );
234 float m0123 = fmaxf( max01, max23);
235 float scale = fmaxf( max45, m0123);
236
237 if (scale <= std::numeric_limits<float>::min())
238 scale = 1.f;
239
240 mat_pkg[0] /= scale;
241 mat_pkg[1] /= scale;
242 mat_pkg[2] /= scale;
243 mat_pkg[3] /= scale;
244 mat_pkg[4] /= scale;
245 mat_pkg[5] /= scale;
246
247 // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0. The
248 // eigenvalues are the roots to this equation, all guaranteed to be
249 // real-valued, because the matrix is symmetric.
250 float c0 = m00() * m11() * m22()
251 + 2.f * m01() * m02() * m12()
252 - m00() * m12() * m12()
253 - m11() * m02() * m02()
254 - m22() * m01() * m01();
255 float c1 = m00() * m11() -
256 m01() * m01() +
257 m00() * m22() -
258 m02() * m02() +
259 m11() * m22() -
260 m12() * m12();
261 float c2 = m00() + m11() + m22();
262
263 computeRoots3(c0, c1, c2, evals);
264
265 if(evals.z - evals.x <= std::numeric_limits<float>::epsilon())
266 {
267 evecs[0] = make_float3(1.f, 0.f, 0.f);
268 evecs[1] = make_float3(0.f, 1.f, 0.f);
269 evecs[2] = make_float3(0.f, 0.f, 1.f);
270 }
271 else if (evals.y - evals.x <= std::numeric_limits<float>::epsilon() )
272 {
273 // first and second equal
274 tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
275 tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z;
276
277 vec_tmp[0] = cross(tmp[0], tmp[1]);
278 vec_tmp[1] = cross(tmp[0], tmp[2]);
279 vec_tmp[2] = cross(tmp[1], tmp[2]);
280
281 float len1 = dot (vec_tmp[0], vec_tmp[0]);
282 float len2 = dot (vec_tmp[1], vec_tmp[1]);
283 float len3 = dot (vec_tmp[2], vec_tmp[2]);
284
285 if (len1 >= len2 && len1 >= len3)
286 {
287 evecs[2] = vec_tmp[0] * rsqrtf (len1);
288 }
289 else if (len2 >= len1 && len2 >= len3)
290 {
291 evecs[2] = vec_tmp[1] * rsqrtf (len2);
292 }
293 else
294 {
295 evecs[2] = vec_tmp[2] * rsqrtf (len3);
296 }
297
298 evecs[1] = unitOrthogonal(evecs[2]);
299 evecs[0] = cross(evecs[1], evecs[2]);
300 }
301 else if (evals.z - evals.y <= std::numeric_limits<float>::epsilon() )
302 {
303 // second and third equal
304 tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
305 tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x;
306
307 vec_tmp[0] = cross(tmp[0], tmp[1]);
308 vec_tmp[1] = cross(tmp[0], tmp[2]);
309 vec_tmp[2] = cross(tmp[1], tmp[2]);
310
311 float len1 = dot(vec_tmp[0], vec_tmp[0]);
312 float len2 = dot(vec_tmp[1], vec_tmp[1]);
313 float len3 = dot(vec_tmp[2], vec_tmp[2]);
314
315 if (len1 >= len2 && len1 >= len3)
316 {
317 evecs[0] = vec_tmp[0] * rsqrtf(len1);
318 }
319 else if (len2 >= len1 && len2 >= len3)
320 {
321 evecs[0] = vec_tmp[1] * rsqrtf(len2);
322 }
323 else
324 {
325 evecs[0] = vec_tmp[2] * rsqrtf(len3);
326 }
327
328 evecs[1] = unitOrthogonal( evecs[0] );
329 evecs[2] = cross(evecs[0], evecs[1]);
330 }
331 else
332 {
333
334 tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
335 tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z;
336
337 vec_tmp[0] = cross(tmp[0], tmp[1]);
338 vec_tmp[1] = cross(tmp[0], tmp[2]);
339 vec_tmp[2] = cross(tmp[1], tmp[2]);
340
341 float len1 = dot(vec_tmp[0], vec_tmp[0]);
342 float len2 = dot(vec_tmp[1], vec_tmp[1]);
343 float len3 = dot(vec_tmp[2], vec_tmp[2]);
344
345 float mmax[3];
346
347 unsigned int min_el = 2;
348 unsigned int max_el = 2;
349 if (len1 >= len2 && len1 >= len3)
350 {
351 mmax[2] = len1;
352 evecs[2] = vec_tmp[0] * rsqrtf (len1);
353 }
354 else if (len2 >= len1 && len2 >= len3)
355 {
356 mmax[2] = len2;
357 evecs[2] = vec_tmp[1] * rsqrtf (len2);
358 }
359 else
360 {
361 mmax[2] = len3;
362 evecs[2] = vec_tmp[2] * rsqrtf (len3);
363 }
364
365 tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
366 tmp[0].x -= evals.y; tmp[1].y -= evals.y; tmp[2].z -= evals.y;
367
368 vec_tmp[0] = cross(tmp[0], tmp[1]);
369 vec_tmp[1] = cross(tmp[0], tmp[2]);
370 vec_tmp[2] = cross(tmp[1], tmp[2]);
371
372 len1 = dot(vec_tmp[0], vec_tmp[0]);
373 len2 = dot(vec_tmp[1], vec_tmp[1]);
374 len3 = dot(vec_tmp[2], vec_tmp[2]);
375
376 if (len1 >= len2 && len1 >= len3)
377 {
378 mmax[1] = len1;
379 evecs[1] = vec_tmp[0] * rsqrtf (len1);
380 min_el = len1 <= mmax[min_el] ? 1 : min_el;
381 max_el = len1 > mmax[max_el] ? 1 : max_el;
382 }
383 else if (len2 >= len1 && len2 >= len3)
384 {
385 mmax[1] = len2;
386 evecs[1] = vec_tmp[1] * rsqrtf (len2);
387 min_el = len2 <= mmax[min_el] ? 1 : min_el;
388 max_el = len2 > mmax[max_el] ? 1 : max_el;
389 }
390 else
391 {
392 mmax[1] = len3;
393 evecs[1] = vec_tmp[2] * rsqrtf (len3);
394 min_el = len3 <= mmax[min_el] ? 1 : min_el;
395 max_el = len3 > mmax[max_el] ? 1 : max_el;
396 }
397
398 tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2();
399 tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x;
400
401 vec_tmp[0] = cross(tmp[0], tmp[1]);
402 vec_tmp[1] = cross(tmp[0], tmp[2]);
403 vec_tmp[2] = cross(tmp[1], tmp[2]);
404
405 len1 = dot (vec_tmp[0], vec_tmp[0]);
406 len2 = dot (vec_tmp[1], vec_tmp[1]);
407 len3 = dot (vec_tmp[2], vec_tmp[2]);
408
409
410 if (len1 >= len2 && len1 >= len3)
411 {
412 mmax[0] = len1;
413 evecs[0] = vec_tmp[0] * rsqrtf (len1);
414 min_el = len3 <= mmax[min_el] ? 0 : min_el;
415 max_el = len3 > mmax[max_el] ? 0 : max_el;
416 }
417 else if (len2 >= len1 && len2 >= len3)
418 {
419 mmax[0] = len2;
420 evecs[0] = vec_tmp[1] * rsqrtf (len2);
421 min_el = len3 <= mmax[min_el] ? 0 : min_el;
422 max_el = len3 > mmax[max_el] ? 0 : max_el;
423 }
424 else
425 {
426 mmax[0] = len3;
427 evecs[0] = vec_tmp[2] * rsqrtf (len3);
428 min_el = len3 <= mmax[min_el] ? 0 : min_el;
429 max_el = len3 > mmax[max_el] ? 0 : max_el;
430 }
431
432 unsigned mid_el = 3 - min_el - max_el;
433 evecs[min_el] = normalized( cross( evecs[(min_el+1) % 3], evecs[(min_el+2) % 3] ) );
434 evecs[mid_el] = normalized( cross( evecs[(mid_el+1) % 3], evecs[(mid_el+2) % 3] ) );
435 }
436 // Rescale back to the original size.
437 evals *= scale;
438 }
439 private:
440 volatile float* mat_pkg;
441
442 __device__ __forceinline__ float m00() const { return mat_pkg[0]; }
443 __device__ __forceinline__ float m01() const { return mat_pkg[1]; }
444 __device__ __forceinline__ float m02() const { return mat_pkg[2]; }
445 __device__ __forceinline__ float m10() const { return mat_pkg[1]; }
446 __device__ __forceinline__ float m11() const { return mat_pkg[3]; }
447 __device__ __forceinline__ float m12() const { return mat_pkg[4]; }
448 __device__ __forceinline__ float m20() const { return mat_pkg[2]; }
449 __device__ __forceinline__ float m21() const { return mat_pkg[4]; }
450 __device__ __forceinline__ float m22() const { return mat_pkg[5]; }
451
452 __device__ __forceinline__ float3 row0() const { return make_float3( m00(), m01(), m02() ); }
453 __device__ __forceinline__ float3 row1() const { return make_float3( m10(), m11(), m12() ); }
454 __device__ __forceinline__ float3 row2() const { return make_float3( m20(), m21(), m22() ); }
455
456 __device__ __forceinline__ static bool isMuchSmallerThan (float x, float y)
457 {
458 // copied from <eigen>/include/Eigen/src/Core/NumTraits.h
459 constexpr float prec_sqr = std::numeric_limits<float>::epsilon() * std::numeric_limits<float>::epsilon();
460 return x * x <= prec_sqr * y * y;
461 }
462 };
463
464 struct Block
465 {
466 static __device__ __forceinline__ unsigned int stride()
467 {
468 return blockDim.x * blockDim.y * blockDim.z;
469 }
470
471 static __device__ __forceinline__ int
473 {
474 return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
475 }
476
477 template<int CTA_SIZE, typename T, class BinOp>
478 static __device__ __forceinline__ void reduce(volatile T* buffer, BinOp op)
479 {
480 int tid = flattenedThreadId();
481 T val = buffer[tid];
482
483 if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
484 if (CTA_SIZE >= 512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
485 if (CTA_SIZE >= 256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
486 if (CTA_SIZE >= 128) { if (tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); }
487
488 if (tid < 32)
489 {
490 if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
491 if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
492 if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); }
493 if (CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); }
494 if (CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); }
495 if (CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); }
496 }
497 }
498
499 template<int CTA_SIZE, typename T, class BinOp>
500 static __device__ __forceinline__ T reduce(volatile T* buffer, T init, BinOp op)
501 {
502 int tid = flattenedThreadId();
503 T val = buffer[tid] = init;
504 __syncthreads();
505
506 if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); }
507 if (CTA_SIZE >= 512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); }
508 if (CTA_SIZE >= 256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); }
509 if (CTA_SIZE >= 128) { if (tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); }
510
511 if (tid < 32)
512 {
513 if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); }
514 if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); }
515 if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); }
516 if (CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); }
517 if (CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); }
518 if (CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); }
519 }
520 __syncthreads();
521 return buffer[0];
522 }
523 };
524
525 struct Warp
526 {
527 enum
528 {
532 };
533
534 /** \brief Returns the warp lane ID of the calling thread. */
535 static __device__ __forceinline__ unsigned int
537 {
538 unsigned int ret;
539 asm("mov.u32 %0, %laneid;" : "=r"(ret) );
540 return ret;
541 }
542
543 static __device__ __forceinline__ unsigned int id()
544 {
545 int tid = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
546 return tid >> LOG_WARP_SIZE;
547 }
548
549 static __device__ __forceinline__
551 {
552 unsigned int ret;
553 asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret) );
554 return ret;
555 }
556
557 static __device__ __forceinline__ int binaryExclScan(int ballot_mask)
558 {
559 return __popc(Warp::laneMaskLt() & ballot_mask);
560 }
561 };
562
563
565 {
566 static __device__ __forceinline__ int
567 warp_reduce ( volatile int *ptr , const unsigned int tid)
568 {
569 const unsigned int lane = tid & 31; // index of thread in warp (0..31)
570
571 if (lane < 16)
572 {
573 int partial = ptr[tid];
574
575 ptr[tid] = partial = partial + ptr[tid + 16];
576 ptr[tid] = partial = partial + ptr[tid + 8];
577 ptr[tid] = partial = partial + ptr[tid + 4];
578 ptr[tid] = partial = partial + ptr[tid + 2];
579 ptr[tid] = partial = partial + ptr[tid + 1];
580 }
581 return ptr[tid - lane];
582 }
583
584 static __forceinline__ __device__ int
585 Ballot(int predicate, volatile int* cta_buffer)
586 {
587 pcl::utils::ignore(cta_buffer);
588 #if CUDA_VERSION >= 9000
589 return __ballot_sync (__activemask (), predicate);
590 #else
591 return __ballot (predicate);
592 #endif
593 }
594
595 static __forceinline__ __device__ bool
596 All(int predicate, volatile int* cta_buffer)
597 {
598 pcl::utils::ignore(cta_buffer);
599 #if CUDA_VERSION >= 9000
600 return __all_sync (__activemask (), predicate);
601 #else
602 return __all (predicate);
603 #endif
604 }
605 };
606 }
607 }
608}
609
610#endif /* PCL_GPU_KINFU_CUDA_UTILS_HPP_ */
__device__ __forceinline__ float norm(const float3 &v)
Definition utils.hpp:97
__device__ __forceinline__ float3 & operator*=(float3 &vec, const float &v)
Definition utils.hpp:79
__device__ __forceinline__ float3 operator*(const Mat33 &m, const float3 &vec)
Definition device.hpp:78
__device__ __forceinline__ float3 operator+(const float3 &v1, const float3 &v2)
Definition utils.hpp:73
__device__ __forceinline__ float dot(const float3 &v1, const float3 &v2)
Definition utils.hpp:61
__device__ __forceinline__ void computeRoots2(const float &b, const float &c, float3 &roots)
Definition utils.hpp:114
__device__ __forceinline__ float3 operator-(const float3 &v1, const float3 &v2)
Definition utils.hpp:85
__device__ __host__ __forceinline__ void swap(T &a, T &b)
Definition utils.hpp:55
__device__ __forceinline__ float3 & operator+=(float3 &vec, const float &v)
Definition utils.hpp:67
__device__ __forceinline__ float3 normalized(const float3 &v)
Definition utils.hpp:103
__device__ __host__ __forceinline__ float3 cross(const float3 &v1, const float3 &v2)
Definition utils.hpp:109
__device__ __forceinline__ void computeRoots3(float c0, float c1, float c2, float3 &roots)
Definition utils.hpp:128
void ignore(const T &...)
Utility function to eliminate unused variable warnings.
Definition utils.h:62
static __device__ __forceinline__ int flattenedThreadId()
Definition utils.hpp:472
static __device__ __forceinline__ void reduce(volatile T *buffer, BinOp op)
Definition utils.hpp:478
static __device__ __forceinline__ T reduce(volatile T *buffer, T init, BinOp op)
Definition utils.hpp:500
static __device__ __forceinline__ unsigned int stride()
Definition utils.hpp:466
__device__ __host__ __forceinline__ float3 & operator[](int i)
Definition utils.hpp:183
__device__ __host__ __forceinline__ const float3 & operator[](int i) const
Definition utils.hpp:184
static __forceinline__ __device__ float3 unitOrthogonal(const float3 &src)
Definition utils.hpp:191
__device__ __forceinline__ void compute(Mat33 &tmp, Mat33 &vec_tmp, Mat33 &evecs, float3 &evals)
Definition utils.hpp:226
__device__ __forceinline__ Eigen33(volatile float *mat_pkg_arg)
Definition utils.hpp:224
static __forceinline__ __device__ bool All(int predicate, volatile int *cta_buffer)
Definition utils.hpp:596
static __forceinline__ __device__ int Ballot(int predicate, volatile int *cta_buffer)
Definition utils.hpp:585
static __device__ __forceinline__ int warp_reduce(volatile int *ptr, const unsigned int tid)
Definition utils.hpp:567
static __device__ __forceinline__ unsigned int laneId()
Returns the warp lane ID of the calling thread.
Definition utils.hpp:536
static __device__ __forceinline__ int laneMaskLt()
Definition utils.hpp:550
static __device__ __forceinline__ unsigned int id()
Definition utils.hpp:543
static __device__ __forceinline__ int binaryExclScan(int ballot_mask)
Definition utils.hpp:557