HIP: Heterogeneous-computing Interface for Portability
amd_device_functions.h
1/*
2Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
3
4Permission is hereby granted, free of charge, to any person obtaining a copy
5of this software and associated documentation files (the "Software"), to deal
6in the Software without restriction, including without limitation the rights
7to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8copies of the Software, and to permit persons to whom the Software is
9furnished to do so, subject to the following conditions:
10
11The above copyright notice and this permission notice shall be included in
12all copies or substantial portions of the Software.
13
14THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20THE SOFTWARE.
21*/
22
23#ifndef HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
24#define HIP_INCLUDE_HIP_AMD_DETAIL_DEVICE_FUNCTIONS_H
25
26#if !defined(__HIPCC_RTC__)
27#include <hip/amd_detail/amd_hip_common.h>
29#include <hip/amd_detail/hip_assert.h>
30#include "host_defines.h"
31#include "math_fwd.h"
32#include <hip/hip_runtime_api.h>
33#include <stddef.h>
34#include <hip/hip_vector_types.h>
35#endif // !defined(__HIPCC_RTC__)
36
37#if defined(__clang__) && defined(__HIP__)
38extern "C" __device__ int printf(const char *fmt, ...);
39#else
40template <typename... All>
41static inline __device__ void printf(const char* format, All... all) {}
42#endif
43
44extern "C" __device__ unsigned long long __ockl_steadyctr_u64();
45
46/*
47Integer Intrinsics
48*/
49
// Integer intrinsic functions: __popc, __clz, __ffs, __brev families.

// Number of set bits in a 32-bit value.
__device__ static inline unsigned int __popc(unsigned int input) {
    return __builtin_popcount(input);
}
// Number of set bits in a 64-bit value.
__device__ static inline unsigned int __popcll(unsigned long long int input) {
    return __builtin_popcountll(input);
}

// Count of leading zero bits in the 32-bit pattern (signed parameter matches
// the CUDA-style interface; the bit pattern is what is counted).
__device__ static inline int __clz(int input) {
    return __ockl_clz_u32((uint)input);
}

// Count of leading zero bits in the 64-bit pattern.
__device__ static inline int __clzll(long long int input) {
    return __ockl_clz_u64((uint64_t)input);
}

// 1-based position of the least significant set bit; 0 when input == 0
// (the -1 from the ternary cancels the trailing +1).
__device__ static inline unsigned int __ffs(unsigned int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}

// 64-bit find-first-set.
__device__ static inline unsigned int __ffsll(unsigned long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

// Overload for unsigned long (same width as unsigned long long on LP64).
__device__ static inline unsigned int __ffsll(unsigned long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

// Signed overloads: the zero test and the bit scan are pattern-based, so the
// behavior matches the unsigned versions.
__device__ static inline unsigned int __ffs(int input) {
    return ( input == 0 ? -1 : __builtin_ctz(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(long long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}

__device__ static inline unsigned int __ffsll(long int input) {
    return ( input == 0 ? -1 : __builtin_ctzll(input) ) + 1;
}
89
// Given a 64-bit exec mask and an integer base (between 0 and WAVEFRONT_SIZE),
// find the n-th (given by offset) set bit in the exec mask starting from the
// base bit, and return the bit position.  offset == 0 tests the base bit
// itself; a negative offset searches downward from the base.
// Returns -1 if no such bit exists.
__device__ static int32_t __fns64(uint64_t mask, uint32_t base, int32_t offset) {
    uint64_t temp_mask = mask;
    int32_t temp_offset = offset;

    if (offset == 0) {
        // Test only the base bit.  The shift must be 64-bit: `1 << base` is a
        // 32-bit shift and is undefined for base >= 32.
        temp_mask &= (1ULL << base);
        temp_offset = 1;
    }
    else if (offset < 0) {
        // Downward search: reverse the mask and mirror the base position.
        temp_mask = __builtin_bitreverse64(mask);
        base = 63 - base;
        temp_offset = -offset;
    }

    // Discard all bits below the base position.
    temp_mask = temp_mask & ((~0ULL) << base);
    if (__builtin_popcountll(temp_mask) < temp_offset)
        return -1;
    // Binary search for the temp_offset-th set bit: at each step keep the low
    // half if it holds enough set bits, otherwise move to the high half and
    // account for the set bits skipped.
    int32_t total = 0;
    for (int i = 0x20; i > 0; i >>= 1) {
        uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
        int32_t pcnt = __builtin_popcountll(temp_mask_lo);
        if (pcnt < temp_offset) {
            temp_mask = temp_mask >> i;
            temp_offset -= pcnt;
            total += i;
        }
        else {
            temp_mask = temp_mask_lo;
        }
    }
    // A downward search computed the position in the reversed mask.
    if (offset < 0)
        return 63 - total;
    else
        return total;
}
128
// 32-bit variant of __fns64: find the n-th (offset) set bit from the base
// bit in the exec mask; returns -1 if not found.  The mask parameter and the
// internal arithmetic remain 64-bit, matching __fns64.
__device__ static int32_t __fns32(uint64_t mask, uint32_t base, int32_t offset) {
    uint64_t temp_mask = mask;
    int32_t temp_offset = offset;
    if (offset == 0) {
        // 64-bit shift literal: `1 << base` would overflow / be undefined for
        // base >= 31 on a 32-bit int.
        temp_mask &= (1ULL << base);
        temp_offset = 1;
    }
    else if (offset < 0) {
        // Downward search via the reversed mask.
        temp_mask = __builtin_bitreverse64(mask);
        base = 63 - base;
        temp_offset = -offset;
    }
    // Drop bits below the base, then binary-search for the temp_offset-th
    // set bit, tracking skipped set bits when moving to the high half.
    temp_mask = temp_mask & ((~0ULL) << base);
    if (__builtin_popcountll(temp_mask) < temp_offset)
        return -1;
    int32_t total = 0;
    for (int i = 0x20; i > 0; i >>= 1) {
        uint64_t temp_mask_lo = temp_mask & ((1ULL << i) - 1);
        int32_t pcnt = __builtin_popcountll(temp_mask_lo);
        if (pcnt < temp_offset) {
            temp_mask = temp_mask >> i;
            temp_offset -= pcnt;
            total += i;
        }
        else {
            temp_mask = temp_mask_lo;
        }
    }
    if (offset < 0)
        return 63 - total;
    else
        return total;
}
// Reverse the bit order of a 32-bit value.
__device__ static inline unsigned int __brev(unsigned int input) {
    return __builtin_bitreverse32(input);
}

// Reverse the bit order of a 64-bit value.
__device__ static inline unsigned long long int __brevll(unsigned long long int input) {
    return __builtin_bitreverse64(input);
}

// 0-based position of the least significant set bit, or -1 (as unsigned)
// when input == 0.
__device__ static inline unsigned int __lastbit_u32_u64(uint64_t input) {
    return input == 0 ? -1 : __builtin_ctzl(input);
}
173
// Extract `width` bits of src0 starting at bit `offset` (both taken modulo
// 32); the result is zero-extended.
// NOTE(review): when offset + width > 32 the right operand of << goes out of
// range (undefined in C) -- callers appear expected to keep
// offset + width <= 32; confirm against the callers.
__device__ static inline unsigned int __bitextract_u32(unsigned int src0, unsigned int src1, unsigned int src2) {
    uint32_t offset = src1 & 31;
    uint32_t width = src2 & 31;
    return width == 0 ? 0 : (src0 << (32 - offset - width)) >> (32 - width);
}

// 64-bit variant: offset and width are taken modulo 64.
__device__ static inline uint64_t __bitextract_u64(uint64_t src0, unsigned int src1, unsigned int src2) {
    uint64_t offset = src1 & 63;
    uint64_t width = src2 & 63;
    return width == 0 ? 0 : (src0 << (64 - offset - width)) >> (64 - width);
}
185
// Insert the low `width` bits of src1 into src0 at bit `offset`; offset and
// width are taken modulo 32.
__device__ static inline unsigned int __bitinsert_u32(unsigned int src0, unsigned int src1, unsigned int src2, unsigned int src3) {
    uint32_t offset = src2 & 31;
    uint32_t width = src3 & 31;
    // Unsigned shift literal: `1 << 31` overflows a signed int (undefined
    // behavior) when width == 31.
    uint32_t mask = (1u << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}
192
// Insert the low `width` bits of src1 into src0 at bit `offset`; offset and
// width are taken modulo 64.
__device__ static inline uint64_t __bitinsert_u64(uint64_t src0, uint64_t src1, unsigned int src2, unsigned int src3) {
    uint64_t offset = src2 & 63;
    uint64_t width = src3 & 63;
    uint64_t mask = (1ULL << width) - 1;
    return ((src0 & ~(mask << offset)) | ((src1 & mask) << offset));
}
199
// Funnel shifts over the 64-bit concatenation hi:lo.
// __builtin_amdgcn_alignbit(hi, lo, s) yields the low 32 bits of
// ((hi:lo) >> s).  NOTE(review): the wrapped variants rely on the
// instruction treating the shift amount modulo 32 -- confirm against the ISA.

// Left funnel shift: high 32 bits of (hi:lo) << (shift & 31).  The shift==0
// case is special-cased because 32 - 0 cannot be expressed via alignbit.
__device__ inline unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
{
    uint32_t mask_shift = shift & 31;
    return mask_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - mask_shift);
}

// Left funnel shift with the shift clamped to [0, 32] instead of wrapped.
__device__ inline unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
{
    uint32_t min_shift = shift >= 32 ? 32 : shift;
    return min_shift == 0 ? hi : __builtin_amdgcn_alignbit(hi, lo, 32 - min_shift);
}

// Right funnel shift: low 32 bits of (hi:lo) >> shift (shift wrapped by hw).
__device__ inline unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
{
    return __builtin_amdgcn_alignbit(hi, lo, shift);
}

// Right funnel shift with the shift clamped: any shift >= 32 yields hi.
__device__ inline unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
{
    return shift >= 32 ? hi : __builtin_amdgcn_alignbit(hi, lo, shift);
}
221
// Forward declarations of the integer arithmetic helpers defined below.
__device__ static unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s);
__device__ static unsigned int __hadd(int x, int y);
__device__ static int __mul24(int x, int y);
__device__ static long long int __mul64hi(long long int x, long long int y);
__device__ static int __mulhi(int x, int y);
__device__ static int __rhadd(int x, int y);
__device__ static unsigned int __sad(int x, int y,unsigned int z);
__device__ static unsigned int __uhadd(unsigned int x, unsigned int y);
__device__ static int __umul24(unsigned int x, unsigned int y);
__device__ static unsigned long long int __umul64hi(unsigned long long int x, unsigned long long int y);
__device__ static unsigned int __umulhi(unsigned int x, unsigned int y);
__device__ static unsigned int __urhadd(unsigned int x, unsigned int y);
__device__ static unsigned int __usad(unsigned int x, unsigned int y, unsigned int z);
235
// Helper unions for __byte_perm: view a 32-bit value (ucharHolder) or a pair
// of 32-bit values (uchar2Holder) as an array of bytes.
// The struct header lines were lost in extraction; reconstructed from the
// anonymous-union bodies and their uses in __byte_perm below.
struct ucharHolder {
    union {
        unsigned char c[4];
        unsigned int ui;
    };
} __attribute__((aligned(4)));

struct uchar2Holder {
    union {
        unsigned int ui[2];
        unsigned char c[8];
    };
} __attribute__((aligned(8)));
249
// Select four bytes from the 8-byte concatenation {x, y} according to the
// selector s.  Result byte k (k = 0..3) is chosen by bits (4k+2 .. 4k) of s:
// selector values 0..3 pick a byte of x, 4..7 pick a byte of y.
// (Byte numbering matches the little-endian layout the GPU targets use.)
__device__
static inline unsigned int __byte_perm(unsigned int x, unsigned int y, unsigned int s) {
    const uint64_t bytes = ((uint64_t)y << 32) | x;
    unsigned int result = 0;
    for (unsigned int k = 0; k < 4; ++k) {
        const unsigned int sel = (s >> (4 * k)) & 0x7u;
        const unsigned int byte = (unsigned int)((bytes >> (8 * sel)) & 0xFFu);
        result |= byte << (8 * k);
    }
    return result;
}
264
// Signed halving add: (x + y) >> 1 rounded toward negative infinity, computed
// without overflowing the 32-bit intermediate sum.
// Fixes the previous implementation, which used logical-or (`||`) instead of
// bitwise-or and a sign mask of 0x8000000 (one zero short of bit 31), so it
// could only ever return 0 or 1.
__device__ static inline unsigned int __hadd(int x, int y) {
    // (x & y) carries the common bits, (x ^ y) >> 1 contributes half of the
    // differing bits: together floor((x + y) / 2) with no overflow.
    return (x & y) + ((x ^ y) >> 1);
}
271
// 24-bit signed multiply (low 32 bits of the product of the low 24 bits).
__device__ static inline int __mul24(int x, int y) {
    return __ockl_mul24_i32(x, y);
}
275
// High 64 bits of the signed 128-bit product x * y, assembled from four
// 32x32 -> 64-bit partial products.
__device__ static inline long long __mul64hi(long long int x, long long int y) {
    const unsigned long long xlo = (unsigned long long)x & 0xffffffffUL;
    const long long xhi = x >> 32;
    const unsigned long long ylo = (unsigned long long)y & 0xffffffffUL;
    const long long yhi = y >> 32;

    // The low product contributes only its carry into bit 32.
    const unsigned long long lo = xlo * ylo;
    const long long cross1 = xhi * ylo + (lo >> 32);
    const long long cross2 = xlo * yhi + (cross1 & 0xffffffffL);
    // Sum every piece that lands in the high 64 bits.
    return xhi * yhi + (cross1 >> 32) + (cross2 >> 32);
}
288
// High 32 bits of the signed 64-bit product x * y.
__device__ static inline int __mulhi(int x, int y) {
    return __ockl_mul_hi_i32(x, y);
}
292
// Signed rounded halving add: (x + y + 1) >> 1 computed without overflowing
// the 32-bit intermediate sum.
// Fixes the previous implementation, which used logical-or (`||`) instead of
// bitwise-or and a sign mask of 0x8000000 (one zero short of bit 31), so it
// could only ever return 0 or 1.
__device__ static inline int __rhadd(int x, int y) {
    // (x | y) - ((x ^ y) >> 1) == ceil((x + y) / 2), overflow-free.
    return (x | y) - ((x ^ y) >> 1);
}
// Sum of absolute difference: |x - y| + z (the subtraction promotes to
// unsigned because z is unsigned; the selected branch keeps it non-negative).
__device__ static inline unsigned int __sad(int x, int y, unsigned int z) {
    return x > y ? x - y + z : y - x + z;
}
// Unsigned halving add, rounding down.
// NOTE(review): x + y may wrap for large operands before the shift -- the
// overflow-free form used by __hadd is not applied here; confirm intent.
__device__ static inline unsigned int __uhadd(unsigned int x, unsigned int y) {
    return (x + y) >> 1;
}
// 24-bit unsigned multiply.
__device__ static inline int __umul24(unsigned int x, unsigned int y) {
    return __ockl_mul24_u32(x, y);
}
308
// High 64 bits of the unsigned 128-bit product x * y, assembled from four
// 32x32 -> 64-bit partial products.
__device__
static inline unsigned long long __umul64hi(unsigned long long int x, unsigned long long int y) {
    const unsigned long long xlo = x & 0xffffffffUL;
    const unsigned long long xhi = x >> 32;
    const unsigned long long ylo = y & 0xffffffffUL;
    const unsigned long long yhi = y >> 32;

    // The low product contributes only its carry into bit 32.
    const unsigned long long lo = xlo * ylo;
    const unsigned long long cross1 = xhi * ylo + (lo >> 32);
    const unsigned long long cross2 = xlo * yhi + (cross1 & 0xffffffffUL);
    // Sum every piece that lands in the high 64 bits.
    return xhi * yhi + (cross1 >> 32) + (cross2 >> 32);
}
322
// High 32 bits of the unsigned 64-bit product x * y.
__device__ static inline unsigned int __umulhi(unsigned int x, unsigned int y) {
    return __ockl_mul_hi_u32(x, y);
}
// Unsigned rounded halving add.
// NOTE(review): x + y + 1 may wrap for large operands -- confirm intent.
__device__ static inline unsigned int __urhadd(unsigned int x, unsigned int y) {
    return (x + y + 1) >> 1;
}
// Unsigned sum of absolute difference: |x - y| + z.
__device__ static inline unsigned int __usad(unsigned int x, unsigned int y, unsigned int z) {
    return __ockl_sadd_u32(x, y, z);
}

// Masked bit counts over the lower/upper half of a 64-lane mask (thin
// wrappers over the amdgcn mbcnt builtins).
__device__
static inline unsigned int __mbcnt_lo(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_lo(x,y);};

__device__
static inline unsigned int __mbcnt_hi(unsigned int x, unsigned int y) {return __builtin_amdgcn_mbcnt_hi(x,y);};
338
339/*
340HIP specific device functions
341*/
342
343#if !defined(__HIPCC_RTC__)
344#include "amd_warp_functions.h"
345#include "amd_warp_sync_functions.h"
346#endif
347
#define MASK1 0x00ff00ff
#define MASK2 0xff00ff00

// Packed 4x8-bit arithmetic helpers.  Even bytes (MASK1) and odd bytes
// (MASK2) are processed in separate passes so a per-byte carry/borrow lands
// in the masked-off neighbouring byte instead of corrupting it.
// NOTE(review): these read and write only the .w member of char4; that looks
// like a holdover from an older char4 representation -- verify against
// hip_vector_types before relying on these.
__device__ static inline char4 __hip_hc_add8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 + one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 + one2) & MASK2);
    return out;
}

// Packed byte-wise subtract; same two-pass masking scheme as add8pk.
__device__ static inline char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 - one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 - one2) & MASK2);
    return out;
}

// Packed byte-wise multiply; same two-pass masking scheme (see note above).
__device__ static inline char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
    char4 out;
    unsigned one1 = in1.w & MASK1;
    unsigned one2 = in2.w & MASK1;
    out.w = (one1 * one2) & MASK1;
    one1 = in1.w & MASK2;
    one2 = in2.w & MASK2;
    out.w = out.w | ((one1 * one2) & MASK2);
    return out;
}
383
// double -> float conversions with explicit rounding modes
// (rd = toward -inf, rn = nearest-even, ru = toward +inf, rz = toward zero).
__device__ static inline float __double2float_rd(double x) {
    return __ocml_cvtrtn_f32_f64(x);
}
// The default conversion rounds to nearest-even, so a plain cast suffices.
__device__ static inline float __double2float_rn(double x) { return x; }
__device__ static inline float __double2float_ru(double x) {
    return __ocml_cvtrtp_f32_f64(x);
}
__device__ static inline float __double2float_rz(double x) {
    return __ocml_cvtrtz_f32_f64(x);
}
394
// High 32 bits of a double's bit pattern (element [1] in little-endian word
// order, which is what the GPU targets use).
__device__ static inline int __double2hiint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[1];
}
// Low 32 bits of a double's bit pattern.
__device__ static inline int __double2loint(double x) {
    static_assert(sizeof(double) == 2 * sizeof(int), "");

    int tmp[2];
    __builtin_memcpy(tmp, &x, sizeof(tmp));

    return tmp[0];
}
411
// double -> signed/unsigned integer conversions.  The suffix selects the
// rounding mode: rd = floor, rn = round-to-nearest-even (rint), ru = ceil,
// rz = truncate toward zero (plain C cast).
__device__ static inline int __double2int_rd(double x) { return (int)__ocml_floor_f64(x); }
__device__ static inline int __double2int_rn(double x) { return (int)__ocml_rint_f64(x); }
__device__ static inline int __double2int_ru(double x) { return (int)__ocml_ceil_f64(x); }
__device__ static inline int __double2int_rz(double x) { return (int)x; }

__device__ static inline long long int __double2ll_rd(double x) {
    return (long long)__ocml_floor_f64(x);
}
__device__ static inline long long int __double2ll_rn(double x) {
    return (long long)__ocml_rint_f64(x);
}
__device__ static inline long long int __double2ll_ru(double x) {
    return (long long)__ocml_ceil_f64(x);
}
__device__ static inline long long int __double2ll_rz(double x) { return (long long)x; }

__device__ static inline unsigned int __double2uint_rd(double x) {
    return (unsigned int)__ocml_floor_f64(x);
}
__device__ static inline unsigned int __double2uint_rn(double x) {
    return (unsigned int)__ocml_rint_f64(x);
}
__device__ static inline unsigned int __double2uint_ru(double x) {
    return (unsigned int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned int __double2uint_rz(double x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __double2ull_rd(double x) {
    return (unsigned long long int)__ocml_floor_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rn(double x) {
    return (unsigned long long int)__ocml_rint_f64(x);
}
__device__ static inline unsigned long long int __double2ull_ru(double x) {
    return (unsigned long long int)__ocml_ceil_f64(x);
}
__device__ static inline unsigned long long int __double2ull_rz(double x) {
    return (unsigned long long int)x;
}
// Bit-pattern reinterpretation of a double as a 64-bit signed integer.
__device__ static inline long long int __double_as_longlong(double x) {
    static_assert(sizeof(long long) == sizeof(double), "");

    long long tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
459
/*
__device__ unsigned short __float2half_rn(float x);
__device__ float __half2float(unsigned short);

The above device function declarations are not valid.
Use
__device__ __half __float2half_rn(float x);
__device__ float __half2float(__half);
from hip_fp16.h instead.

CUDA implements half as unsigned short, whereas HIP does not.
*/
473
// float -> signed/unsigned integer conversions.  The suffix selects the
// rounding mode: rd = floor, rn = round-to-nearest-even (rint), ru = ceil,
// rz = truncate toward zero.  Note the int variant truncates via
// __ocml_trunc_f32 while the wider variants use a plain cast.
__device__ static inline int __float2int_rd(float x) { return (int)__ocml_floor_f32(x); }
__device__ static inline int __float2int_rn(float x) { return (int)__ocml_rint_f32(x); }
__device__ static inline int __float2int_ru(float x) { return (int)__ocml_ceil_f32(x); }
__device__ static inline int __float2int_rz(float x) { return (int)__ocml_trunc_f32(x); }

__device__ static inline long long int __float2ll_rd(float x) {
    return (long long int)__ocml_floor_f32(x);
}
__device__ static inline long long int __float2ll_rn(float x) {
    return (long long int)__ocml_rint_f32(x);
}
__device__ static inline long long int __float2ll_ru(float x) {
    return (long long int)__ocml_ceil_f32(x);
}
__device__ static inline long long int __float2ll_rz(float x) { return (long long int)x; }

__device__ static inline unsigned int __float2uint_rd(float x) {
    return (unsigned int)__ocml_floor_f32(x);
}
__device__ static inline unsigned int __float2uint_rn(float x) {
    return (unsigned int)__ocml_rint_f32(x);
}
__device__ static inline unsigned int __float2uint_ru(float x) {
    return (unsigned int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned int __float2uint_rz(float x) { return (unsigned int)x; }

__device__ static inline unsigned long long int __float2ull_rd(float x) {
    return (unsigned long long int)__ocml_floor_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rn(float x) {
    return (unsigned long long int)__ocml_rint_f32(x);
}
__device__ static inline unsigned long long int __float2ull_ru(float x) {
    return (unsigned long long int)__ocml_ceil_f32(x);
}
__device__ static inline unsigned long long int __float2ull_rz(float x) {
    return (unsigned long long int)x;
}
513
// Bit-pattern reinterpretation of a float as a signed 32-bit integer.
__device__ static inline int __float_as_int(float x) {
    static_assert(sizeof(int) == sizeof(float), "");

    int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}

// Bit-pattern reinterpretation of a float as an unsigned 32-bit integer.
__device__ static inline unsigned int __float_as_uint(float x) {
    static_assert(sizeof(unsigned int) == sizeof(float), "");

    unsigned int tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
531
// Assemble a double from the high and low 32-bit halves of its bit pattern.
__device__ static inline double __hiloint2double(int hi, int lo) {
    static_assert(sizeof(double) == sizeof(uint64_t), "");

    // lo goes through uint32_t first so it is not sign-extended into the
    // high half.
    uint64_t tmp0 = (static_cast<uint64_t>(hi) << 32ull) | static_cast<uint32_t>(lo);
    double tmp1;
    __builtin_memcpy(&tmp1, &tmp0, sizeof(tmp0));

    return tmp1;
}
541
// int -> double is always exact, so no rounding-mode variants are needed.
__device__ static inline double __int2double_rn(int x) { return (double)x; }

// int -> float conversions with explicit rounding modes.
__device__ static inline float __int2float_rd(int x) {
    return __ocml_cvtrtn_f32_s32(x);
}
// Default conversion rounds to nearest-even.
__device__ static inline float __int2float_rn(int x) { return (float)x; }
__device__ static inline float __int2float_ru(int x) {
    return __ocml_cvtrtp_f32_s32(x);
}
__device__ static inline float __int2float_rz(int x) {
    return __ocml_cvtrtz_f32_s32(x);
}
554
// Bit-pattern reinterpretation of a signed 32-bit integer as a float.
__device__ static inline float __int_as_float(int x) {
    static_assert(sizeof(float) == sizeof(int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
563
// long long -> double/float conversions with explicit rounding modes
// (rd = toward -inf, rn = nearest-even, ru = toward +inf, rz = toward zero).
__device__ static inline double __ll2double_rd(long long int x) {
    return __ocml_cvtrtn_f64_s64(x);
}
__device__ static inline double __ll2double_rn(long long int x) { return (double)x; }
__device__ static inline double __ll2double_ru(long long int x) {
    return __ocml_cvtrtp_f64_s64(x);
}
__device__ static inline double __ll2double_rz(long long int x) {
    return __ocml_cvtrtz_f64_s64(x);
}

__device__ static inline float __ll2float_rd(long long int x) {
    return __ocml_cvtrtn_f32_s64(x);
}
__device__ static inline float __ll2float_rn(long long int x) { return (float)x; }
__device__ static inline float __ll2float_ru(long long int x) {
    return __ocml_cvtrtp_f32_s64(x);
}
__device__ static inline float __ll2float_rz(long long int x) {
    return __ocml_cvtrtz_f32_s64(x);
}
585
// Bit-pattern reinterpretation of a 64-bit signed integer as a double.
__device__ static inline double __longlong_as_double(long long int x) {
    static_assert(sizeof(double) == sizeof(long long), "");

    double tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
594
// unsigned int -> double is always exact.
__device__ static inline double __uint2double_rn(unsigned int x) { return (double)x; }

// unsigned int -> float conversions with explicit rounding modes.
__device__ static inline float __uint2float_rd(unsigned int x) {
    return __ocml_cvtrtn_f32_u32(x);
}
__device__ static inline float __uint2float_rn(unsigned int x) { return (float)x; }
__device__ static inline float __uint2float_ru(unsigned int x) {
    return __ocml_cvtrtp_f32_u32(x);
}
__device__ static inline float __uint2float_rz(unsigned int x) {
    return __ocml_cvtrtz_f32_u32(x);
}
607
// Bit-pattern reinterpretation of an unsigned 32-bit integer as a float.
__device__ static inline float __uint_as_float(unsigned int x) {
    static_assert(sizeof(float) == sizeof(unsigned int), "");

    float tmp;
    __builtin_memcpy(&tmp, &x, sizeof(tmp));

    return tmp;
}
616
// unsigned long long -> double/float conversions with explicit rounding
// modes (rd = toward -inf, rn = nearest-even, ru = toward +inf, rz = toward zero).
__device__ static inline double __ull2double_rd(unsigned long long int x) {
    return __ocml_cvtrtn_f64_u64(x);
}
__device__ static inline double __ull2double_rn(unsigned long long int x) { return (double)x; }
__device__ static inline double __ull2double_ru(unsigned long long int x) {
    return __ocml_cvtrtp_f64_u64(x);
}
__device__ static inline double __ull2double_rz(unsigned long long int x) {
    return __ocml_cvtrtz_f64_u64(x);
}

__device__ static inline float __ull2float_rd(unsigned long long int x) {
    return __ocml_cvtrtn_f32_u64(x);
}
__device__ static inline float __ull2float_rn(unsigned long long int x) { return (float)x; }
__device__ static inline float __ull2float_ru(unsigned long long int x) {
    return __ocml_cvtrtp_f32_u64(x);
}
__device__ static inline float __ull2float_rz(unsigned long long int x) {
    return __ocml_cvtrtz_f32_u64(x);
}
638
639#if defined(__clang__) && defined(__HIP__)
640
// Clock functions (device definitions below, guarded by
// __HIP_DEVICE_COMPILE__).
__device__ long long int __clock64();
__device__ long long int __clock();
__device__ long long int clock64();
__device__ long long int clock();
__device__ long long int wall_clock64();
// hip.amdgcn.bc - named sync
__device__ void __named_sync();
649
650#ifdef __HIP_DEVICE_COMPILE__
651
// Clock function to return the GPU core cycle count.  The GPU can change its
// core clock frequency at runtime; the maximum frequency can be queried
// through the hipDeviceAttributeClockRate attribute.
__device__
inline __attribute((always_inline))
long long int __clock64() {
#if __has_builtin(__builtin_amdgcn_s_memtime)
    // s_memtime exists on gfx8, gfx9, gfx10.1, gfx10.2, gfx10.3.
    return (long long int) __builtin_amdgcn_s_memtime();
#else
    // Fallback; subject to change when a better solution is available.
    return (long long int) __builtin_readcyclecounter();
#endif
}
666
// __clock is an alias for __clock64 (both count core cycles).
__device__
inline __attribute((always_inline))
long long int __clock() { return __clock64(); }

// Wall clock count at a constant frequency that can be queried through the
// hipDeviceAttributeWallClockRate attribute.
__device__
inline __attribute__((always_inline))
long long int wall_clock64() {
    return (long long int) __ockl_steadyctr_u64();
}

// CUDA-style public names, forwarding to the implementations above.
__device__
inline __attribute__((always_inline))
long long int clock64() { return __clock64(); }

__device__
inline __attribute__((always_inline))
long long int clock() { return __clock(); }
686
// hip.amdgcn.bc - named sync: implemented as a plain workgroup barrier.
__device__
inline
void __named_sync() { __builtin_amdgcn_s_barrier(); }
691
692#endif // __HIP_DEVICE_COMPILE__
693
// hip.amdgcn.bc - lanemask
// Mask of the active lanes whose lane id is strictly greater than this
// lane's.  Assumes a 64-wide wavefront (lane == 63 early-out, 64-bit ballot).
__device__
inline
uint64_t __lanemask_gt()
{
    uint32_t lane = __ockl_lane_u32();
    // The highest lane has no lanes above it.
    if (lane == 63)
        return 0;
    uint64_t ballot = __ballot64(1);
    uint64_t mask = (~((uint64_t)0)) << (lane + 1);
    return mask & ballot;
}
706
// Mask of the active lanes whose lane id is strictly less than this lane's.
// Assumes a 64-wide wavefront (64-bit ballot).
__device__
inline
uint64_t __lanemask_lt()
{
    uint32_t lane = __ockl_lane_u32();
    // uint64_t for consistency with __lanemask_gt; the previous int64_t held
    // the same bit pattern, so behavior is unchanged.
    uint64_t ballot = __ballot64(1);
    uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
    return mask & ballot;
}
716
// Single-bit mask identifying this lane within a 64-wide wavefront.
__device__
inline
uint64_t __lanemask_eq()
{
    uint32_t lane = __ockl_lane_u32();
    // uint64_t avoids the implementation-defined conversion the previous
    // int64_t incurred for lane 63 ((uint64_t)1 << 63 does not fit in
    // int64_t); the returned bits are unchanged.
    uint64_t mask = ((uint64_t)1 << lane);
    return mask;
}
725
726
// Identity conversion: the local (LDS) pointer is returned unchanged.
__device__ inline void* __local_to_generic(void* p) { return p; }
728
#ifdef __HIP_DEVICE_COMPILE__
// Base pointer of the dynamic (runtime-sized) shared-memory region: the
// static group segment size is used as the offset, so the dynamic region
// starts right after the statically allocated LDS.
__device__
inline
void* __get_dynamicgroupbaseptr()
{
    // Get group segment base pointer.
    return (char*)__local_to_generic((void*)__to_local(__builtin_amdgcn_groupstaticsize()));
}
#else
// Host-pass declaration only; the definition is device-side.
__device__
void* __get_dynamicgroupbaseptr();
#endif // __HIP_DEVICE_COMPILE__
741
// Legacy alias for __get_dynamicgroupbaseptr.
__device__
inline
void *__amdgcn_get_dynamicgroupbaseptr() {
    return __get_dynamicgroupbaseptr();
}
747
// Memory Fence Functions
// Sequentially consistent fence at device ("agent") scope.
__device__
inline
static void __threadfence()
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
}

// Sequentially consistent fence at workgroup scope.
__device__
inline
static void __threadfence_block()
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
}

// Sequentially consistent fence with no scope qualifier (widest scope).
__device__
inline
static void __threadfence_system()
{
    __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
}
// Workgroup barrier.  When `flags` requests a memory fence, a release fence
// is issued before the barrier and an acquire fence after it so that
// pre-barrier writes are visible to all threads after the barrier.
__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
    if (flags) {
        __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
        __builtin_amdgcn_s_barrier();
        __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
    } else {
        __builtin_amdgcn_s_barrier();
    }
}

// OpenCL-style barrier entry point; `n` carries the fence flags.
__device__
inline
static void __barrier(int n)
{
    __work_group_barrier((__cl_mem_fence_flags)n);
}
785
// Block-wide barrier with a local-memory fence (CUDA __syncthreads()).
__device__
inline
__attribute__((convergent))
void __syncthreads()
{
    __barrier(__CLK_LOCAL_MEM_FENCE);
}

// Count of threads in the block with a non-zero predicate, via the ockl
// workgroup add-reduction (the !! collapses the predicate to 0/1).
__device__
inline
__attribute__((convergent))
int __syncthreads_count(int predicate)
{
    return __ockl_wgred_add_i32(!!predicate);
}

// Non-zero iff the predicate is non-zero on every thread in the block.
__device__
inline
__attribute__((convergent))
int __syncthreads_and(int predicate)
{
    return __ockl_wgred_and_i32(!!predicate);
}

// Non-zero iff the predicate is non-zero on any thread in the block.
__device__
inline
__attribute__((convergent))
int __syncthreads_or(int predicate)
{
    return __ockl_wgred_or_i32(!!predicate);
}
817
818// hip.amdgcn.bc - device routine
819/*
820 HW_ID Register bit structure for RDNA2 & RDNA3
821 WAVE_ID 4:0 Wave id within the SIMD.
822 SIMD_ID 9:8 SIMD_ID within the WGP: [0] = row, [1] = column.
823 WGP_ID 13:10 Physical WGP ID.
824 SA_ID 16 Shader Array ID
825 SE_ID 20:18 Shader Engine the wave is assigned to for gfx11
826 SE_ID 19:18 Shader Engine the wave is assigned to for gfx10
827 DP_RATE 31:29 Number of double-precision float units per SIMD
828
829 HW_ID Register bit structure for GCN and CDNA
830 WAVE_ID 3:0 Wave buffer slot number. 0-9.
831 SIMD_ID 5:4 SIMD which the wave is assigned to within the CU.
832 PIPE_ID 7:6 Pipeline from which the wave was dispatched.
833 CU_ID 11:8 Compute Unit the wave is assigned to.
834 SH_ID 12 Shader Array (within an SE) the wave is assigned to.
835 SE_ID 15:13 Shader Engine the wave is assigned to for gfx908, gfx90a, gfx940-942
836 14:13 Shader Engine the wave is assigned to for Vega.
837 TG_ID 19:16 Thread-group ID
838 VM_ID 23:20 Virtual Memory ID
839 QUEUE_ID 26:24 Queue from which this wave was dispatched.
840 STATE_ID 29:27 State ID (graphics only, not compute).
841 ME_ID 31:30 Micro-engine ID.
842
843 XCC_ID Register bit structure for gfx940
844 XCC_ID 3:0 XCC the wave is assigned to.
845 */
846
847#if (defined (__GFX10__) || defined (__GFX11__))
848 #define HW_ID 23
849#else
850 #define HW_ID 4
851#endif
852
853#if (defined(__GFX10__) || defined(__GFX11__))
854 #define HW_ID_WGP_ID_SIZE 4
855 #define HW_ID_WGP_ID_OFFSET 10
856 #if (defined(__AMDGCN_CUMODE__))
857 #define HW_ID_CU_ID_SIZE 1
858 #define HW_ID_CU_ID_OFFSET 8
859 #endif
860#else
861 #define HW_ID_CU_ID_SIZE 4
862 #define HW_ID_CU_ID_OFFSET 8
863#endif
864
865#if (defined(__gfx908__) || defined(__gfx90a__) || \
866 defined(__GFX11__))
867 #define HW_ID_SE_ID_SIZE 3
868#else //4 SEs/XCC for gfx940-942
869 #define HW_ID_SE_ID_SIZE 2
870#endif
871#if (defined(__GFX10__) || defined(__GFX11__))
872 #define HW_ID_SE_ID_OFFSET 18
873 #define HW_ID_SA_ID_OFFSET 16
874 #define HW_ID_SA_ID_SIZE 1
875#else
876 #define HW_ID_SE_ID_OFFSET 13
877#endif
878
879#if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
880 #define XCC_ID 20
881 #define XCC_ID_XCC_ID_SIZE 4
882 #define XCC_ID_XCC_ID_OFFSET 0
883#endif
884
885#if (!defined(__HIP_NO_IMAGE_SUPPORT) && \
886 (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)))
887 #define __HIP_NO_IMAGE_SUPPORT 1
888#endif
889
890/*
891 Encoding of parameter bitmask
892 HW_ID 5:0 HW_ID
893 OFFSET 10:6 Range: 0..31
894 SIZE 15:11 Range: 1..32
895 */
896
897#define GETREG_IMMED(SZ,OFF,REG) (((SZ) << 11) | ((OFF) << 6) | (REG))
898
899/*
900 __smid returns the wave's assigned Compute Unit and Shader Engine.
901 The Compute Unit, CU_ID returned in bits 3:0, and Shader Engine, SE_ID in bits 5:4.
902 Note: the results vary over time.
903 SZ minus 1 since SIZE is 1-based.
904*/
// Packs the wave's hardware location (XCC/SE/SA/WGP/CU ids, depending on the
// target architecture) into a single integer.  Field offsets and sizes come
// from the HW_ID/XCC_ID macros above; the result can vary over the lifetime
// of the wave.
__device__
inline
unsigned __smid(void)
{
    // Shader Engine id is read the same way on every layout.
    unsigned se_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SE_ID_SIZE-1, HW_ID_SE_ID_OFFSET, HW_ID));
    #if (defined(__GFX10__) || defined(__GFX11__))
    // RDNA layout: WGP + SA (+ CU in CU mode).
    unsigned wgp_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_WGP_ID_SIZE - 1, HW_ID_WGP_ID_OFFSET, HW_ID));
    unsigned sa_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_SA_ID_SIZE - 1, HW_ID_SA_ID_OFFSET, HW_ID));
    #if (defined(__AMDGCN_CUMODE__))
    unsigned cu_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
    #endif
    #else
    #if (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    // gfx940-942 additionally expose the XCC id.
    unsigned xcc_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(XCC_ID_XCC_ID_SIZE - 1, XCC_ID_XCC_ID_OFFSET, XCC_ID));
    #endif
    unsigned cu_id = __builtin_amdgcn_s_getreg(
        GETREG_IMMED(HW_ID_CU_ID_SIZE - 1, HW_ID_CU_ID_OFFSET, HW_ID));
    #endif
    #if (defined(__GFX10__) || defined(__GFX11__))
    // Concatenate se | sa | wgp (| cu) into one value.
    unsigned temp = se_id;
    temp = (temp << HW_ID_SA_ID_SIZE) | sa_id;
    temp = (temp << HW_ID_WGP_ID_SIZE) | wgp_id;
    #if (defined(__AMDGCN_CUMODE__))
    temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
    #endif
    return temp;
    //TODO : CU Mode impl
    #elif (defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    // Concatenate xcc | se | cu.
    unsigned temp = xcc_id;
    temp = (temp << HW_ID_SE_ID_SIZE) | se_id;
    temp = (temp << HW_ID_CU_ID_SIZE) | cu_id;
    return temp;
    #else
    // GCN/CDNA default: se | cu.
    return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
    #endif
}
946
951#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
952#define HIP_DYNAMIC_SHARED_ATTRIBUTE
953
954#endif //defined(__clang__) && defined(__HIP__)
955
956
// Byte-wise device memcpy: copies `size` bytes from src to dst, four bytes
// per iteration in the main loop, then the 0-3 byte tail.  Returns dst.
static inline __device__ void* __hip_hc_memcpy(void* dst, const void* src, size_t size) {
    auto out = static_cast<unsigned char*>(dst);
    auto in = static_cast<const unsigned char*>(src);

    // Unrolled main loop: four bytes per step.
    for (; size >= 4u; size -= 4u, in += 4u, out += 4u) {
        out[0] = in[0];
        out[1] = in[1];
        out[2] = in[2];
        out[3] = in[3];
    }
    // Remaining 0-3 tail bytes.
    for (size_t i = 0; i < size; ++i) {
        out[i] = in[i];
    }

    return dst;
}
983
// Byte-wise device memset: writes `val` into `size` bytes at dst, four bytes
// per iteration in the main loop, then the 0-3 byte tail.  Returns dst.
static inline __device__ void* __hip_hc_memset(void* dst, unsigned char val, size_t size) {
    auto out = static_cast<unsigned char*>(dst);

    // Unrolled main loop: four bytes per step.
    for (; size >= 4u; size -= 4u, out += 4u) {
        out[0] = val;
        out[1] = val;
        out[2] = val;
        out[3] = val;
    }
    // Remaining 0-3 tail bytes.
    for (size_t i = 0; i < size; ++i) {
        out[i] = val;
    }

    return dst;
}
#ifndef __OPENMP_AMDGCN__
// Device-side memcpy/memset entry points backed by the helpers above.
static inline __device__ void* memcpy(void* dst, const void* src, size_t size) {
    return __hip_hc_memcpy(dst, src, size);
}

// Only the low byte of `val` is used, matching the standard memset contract.
static inline __device__ void* memset(void* ptr, int val, size_t size) {
    unsigned char val8 = static_cast<unsigned char>(val);
    return __hip_hc_memset(ptr, val8, size);
}
#endif // !__OPENMP_AMDGCN__
1017
1018#endif
Contains declarations for types and functions in device library. Uses int64_t and uint64_t instead of...
_Float16 __2f16 __attribute__((ext_vector_type(2)))
Definition hip_fp16_math_fwd.h:57
Definition amd_device_functions.h:236
Definition amd_device_functions.h:243
Definition amd_hip_vector_types.h:1672