Halide  17.0.2
Halide compiler and libraries
GPUMemInfo.h
Go to the documentation of this file.
1 #ifndef GPU_MEM_INFO_H
2 #define GPU_MEM_INFO_H
3 
#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "ASLog.h"
#include "Errors.h"
10 
11 /** \file
12  *
13  * Data structures that help track memory access information. Useful when
14  * computing GPU features
15  */
16 
17 namespace Halide {
18 namespace Internal {
19 namespace Autoscheduler {
20 
// Tag types for the three GPU memory spaces, and the accumulators that
// collect per-warp access statistics for each of them.
struct GlobalMem;
struct GlobalAccessAccumulator;
struct SharedMem;
struct SharedAccessAccumulator;
struct LocalMem;
struct LocalAccessAccumulator;

/** Traits describing how accesses to a given memory space are grouped into
 * hardware transactions: the transaction width in bytes, the tag type whose
 * MemInfo instantiation stores the totals, and the accumulator functor used
 * to simulate a warp's accesses. Specialized for each memory space below. */
template<typename T>
struct MemTraits;

template<>
struct MemTraits<GlobalMem> {
    // Global memory is serviced in 32-byte sectors.
    static constexpr double bytes_per_transaction = 32;
    using MemInfoType = GlobalMem;
    using Accumulator = GlobalAccessAccumulator;
};

template<>
struct MemTraits<SharedMem> {
    // Shared memory: 32 banks x 4-byte words = 128 bytes per request.
    static constexpr double bytes_per_transaction = 128;
    using MemInfoType = SharedMem;
    using Accumulator = SharedAccessAccumulator;
};

template<>
struct MemTraits<LocalMem> {
    static constexpr double bytes_per_transaction = 32;
    // Local mem behaves similarly to global mem, so it shares global
    // memory's info and accumulator types.
    using MemInfoType = GlobalMem;
    using Accumulator = GlobalAccessAccumulator;
};
51 
52 template<typename T>
54 
55 template<typename T>
56 struct MemInfo {
58 
59  double num_transactions() const {
60  return total_num_transactions;
61  }
62 
63  void add_access_info(double num_requests, double num_transactions_per_request, double num_bytes_used_per_request) {
64  internal_assert(num_bytes_used_per_request > 0);
65 
66  double total_transactions = num_requests * num_transactions_per_request;
67  double total_bytes = total_transactions * bytes_per_transaction;
68  double total_bytes_used = num_requests * num_bytes_used_per_request;
69 
70  internal_assert(total_bytes_used <= total_bytes)
71  << "\ntotal_bytes_used = " << total_bytes_used
72  << "\ntotal_bytes = " << total_bytes
73  << "\ntotal_transactions = " << total_transactions
74  << "\nnum_transactions_per_request = " << num_transactions_per_request
75  << "\nnum_requests = " << num_requests;
76 
77  update_totals(total_transactions, total_bytes_used, total_bytes);
78  }
79 
80  void add(const MemInfo<T> &other) {
81  total_num_transactions += other.total_num_transactions;
82  total_num_bytes_used += other.total_num_bytes_used;
83  total_num_bytes += other.total_num_bytes;
84  }
85 
86  double efficiency() const {
87  if (total_num_bytes == 0) {
88  return 1;
89  }
90 
91  double result = total_num_bytes_used / total_num_bytes;
92  internal_assert(result <= 1);
93  return result;
94  }
95 
96 private:
97  void update_totals(double num_transactions, double num_bytes_used, double num_bytes) {
98  total_num_transactions += num_transactions;
99  total_num_bytes_used += num_bytes_used;
100  total_num_bytes += num_bytes;
101  }
102 
103  double total_num_transactions = 0;
104  double total_num_bytes_used = 0;
105  double total_num_bytes = 0;
106 };
107 
108 template<typename T>
110 
114 
115 struct Strides {
116 public:
117  explicit Strides(const std::vector<int64_t> &storage_strides)
118  : storage_strides{storage_strides} {
119  }
120 
121  void add_valid(const std::vector<double> &strides) {
122  add(strides, true);
123  }
124 
125  void add_invalid() {
126  add({}, false);
127  }
128 
129  bool valid(size_t loop_index) const {
130  return is_valid[loop_index];
131  }
132 
133  int64_t offset(size_t loop_index, int64_t point) const {
134  internal_assert(loop_index < is_valid.size() && valid(loop_index));
135  internal_assert(index_strides[loop_index].size() == storage_strides.size());
136 
137  int64_t result = 0;
138  for (size_t i = 0; i < storage_strides.size(); ++i) {
139  result += (int64_t)(point * index_strides[loop_index][i]) * storage_strides[i];
140  }
141  return std::abs(result);
142  }
143 
144  void dump(bool verbose = false) {
145  if (!verbose) {
146  return;
147  }
148 
149  for (size_t i = 0; i < storage_strides.size(); ++i) {
150  if (!valid(i)) {
151  aslog(2) << "stride " << i << ": invalid\n";
152  continue;
153  }
154  aslog(2) << "storage_stride " << i << ": " << storage_strides[i] << "\n";
155  }
156 
157  for (size_t i = 0; i < index_strides.size(); ++i) {
158  for (size_t j = 0; j < index_strides[i].size(); ++j) {
159  aslog(2) << "index_stride " << i << ", storage_stride " << j << ": " << index_strides[i][j] << " ";
160  }
161  aslog(2) << "\n";
162  }
163  }
164 
165 private:
166  void add(const std::vector<double> &strides, bool e) {
167  index_strides.push_back(strides);
168  is_valid.push_back(e);
169  }
170 
171  std::vector<int64_t> storage_strides;
172  std::vector<std::vector<double>> index_strides;
173  std::vector<bool> is_valid;
174 };
175 
177  GlobalAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
178  : bytes_per_access{bytes_per_access},
179  dimensions{dimensions},
180  strides{strides},
181  verbose{verbose} {
182  }
183 
184  void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) {
185  if (!active) {
186  return;
187  }
188 
189  if (verbose) {
190  aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n";
191  }
192 
193  int thread_ids[3] = {x, y, z};
194  int64_t byte = 0;
195  for (size_t i = 0; i < dimensions; ++i) {
196  if (!strides.valid(i)) {
197  ++unknown_sectors;
198  return;
199  }
200  byte += bytes_per_access * strides.offset(i, thread_ids[i]);
201  }
202 
203  if (verbose) {
204  aslog(2) << "byte accessed: " << byte << "\n";
205  }
206 
207  int64_t sector = byte / 32;
208  if (verbose) {
209  aslog(2) << "sectors accessed: ";
210  }
211  for (int i = 0; i < bytes_per_access; ++i) {
212  if (verbose) {
213  aslog(2) << sector << " ";
214  }
215  sectors_accessed[sector].insert(byte + i);
216  }
217  if (verbose) {
218  aslog(2) << "\n\n";
219  }
220  }
221 
222  void add_access_info(int num_requests, GlobalMemInfo &global_mem_info, bool is_tail_warp) const {
223  int num_transactions_per_request = sectors_accessed.size() + unknown_sectors;
224 
225  if (verbose) {
226  if (is_tail_warp) {
227  aslog(2) << "tail_";
228  }
229  aslog(2) << "num_transactions_per_request = " << num_transactions_per_request << "\n";
230  }
231 
232  int num_bytes_used_per_request = 0;
233  for (const auto &sector : sectors_accessed) {
234  num_bytes_used_per_request += sector.second.size();
235  }
236 
237  num_bytes_used_per_request += unknown_sectors * bytes_per_access;
238 
239  if (verbose) {
240  if (is_tail_warp) {
241  aslog(2) << "tail_";
242  }
243  aslog(2) << "num_requests_per_block = " << num_requests << "\n";
244  }
245 
246  global_mem_info.add_access_info(
247  num_requests,
248  num_transactions_per_request,
249  num_bytes_used_per_request);
250  }
251 
252 private:
253  int bytes_per_access;
254  size_t dimensions;
255  Strides strides;
256  bool verbose;
257  int unknown_sectors = 0;
258  std::unordered_map<int64_t, std::unordered_set<int64_t>> sectors_accessed;
259 };
260 
262  SharedAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
263  : bytes_per_access{bytes_per_access},
264  dimensions{dimensions},
265  strides{strides},
266  verbose{verbose} {
267  }
268 
269  void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) {
270  if (!active) {
271  return;
272  }
273 
274  if (verbose) {
275  aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n";
276  }
277 
278  int thread_ids[3] = {x, y, z};
279  int64_t byte = 0;
280  for (size_t i = 0; i < dimensions; ++i) {
281  if (!strides.valid(i)) {
282  ++unknown_banks;
283  return;
284  }
285  byte += bytes_per_access * strides.offset(i, thread_ids[i]);
286  }
287 
288  if (verbose) {
289  aslog(2) << "bytes accessed: ";
290  for (int i = 0; i < bytes_per_access; ++i) {
291  aslog(2) << byte + i << " ";
292  }
293  aslog(2) << "\n";
294  }
295 
296  if (verbose) {
297  aslog(2) << "banks accessed: ";
298  }
299  for (int i = 0; i < bytes_per_access; ++i) {
300  int64_t word = (byte + i) / 4;
301  int64_t bank = word % 32;
302  if (verbose) {
303  aslog(2) << bank << " ";
304  }
305  bytes_accessed.insert(byte + i);
306  bank_to_words_accessed[bank].insert(word);
307  }
308  if (verbose) {
309  aslog(2) << "\n\n";
310  }
311  }
312 
313  void add_access_info(int num_requests, SharedMemInfo &shared_mem_info, bool is_tail_warp) const {
314  int num_transactions_per_request = 0;
315  for (const auto &bank : bank_to_words_accessed) {
316  num_transactions_per_request = std::max(num_transactions_per_request, (int)bank.size());
317  }
318 
319  num_transactions_per_request += unknown_banks;
320 
321  if (verbose) {
322  if (is_tail_warp) {
323  aslog(2) << "tail_";
324  }
325  aslog(2) << "num_transactions_per_request = " << num_transactions_per_request << "\n";
326  }
327 
328  int num_bytes_used_per_request = bytes_accessed.size();
329 
330  num_bytes_used_per_request += unknown_banks * bytes_per_access;
331 
332  if (verbose) {
333  if (is_tail_warp) {
334  aslog(2) << "tail_";
335  }
336  aslog(2) << "num_requests_per_block = " << num_requests << "\n";
337  }
338 
339  shared_mem_info.add_access_info(
340  num_requests,
341  num_transactions_per_request,
342  num_bytes_used_per_request);
343  }
344 
345 private:
346  int bytes_per_access;
347  size_t dimensions;
348  Strides strides;
349  bool verbose;
350  int unknown_banks = 0;
351  std::unordered_set<int64_t> bytes_accessed;
352  std::array<std::unordered_set<int64_t>, 32> bank_to_words_accessed;
353 };
354 
356  LocalAccessAccumulator(int bytes_per_access, bool verbose)
357  : bytes_per_access{bytes_per_access},
358  verbose{verbose} {
359  }
360 
361  void operator()(int thread_id, int x, int y, int z, int active, bool last_thread) {
362  if (!active) {
363  return;
364  }
365 
366  ++thread_count;
367 
368  if (verbose) {
369  aslog(2) << "thread_id: " << thread_id << " (" << x << ", " << y << ", " << z << ")\n";
370  }
371  }
372 
373  void add_access_info(int num_requests, LocalMemInfo &local_mem_info, bool is_tail_warp) const {
374  int num_bytes_used_per_request = thread_count * bytes_per_access;
375  int sectors_accessed = std::ceil((float)num_bytes_used_per_request / (float)LocalMemInfo::bytes_per_transaction);
376  int num_transactions_per_request = sectors_accessed;
377 
378  if (verbose) {
379  if (is_tail_warp) {
380  aslog(2) << "tail_";
381  }
382  aslog(2) << "num_transactions_per_request = " << num_transactions_per_request << "\n";
383  }
384 
385  if (verbose) {
386  if (is_tail_warp) {
387  aslog(2) << "tail_";
388  }
389  aslog(2) << "num_requests_per_block = " << num_requests << "\n";
390  }
391 
392  local_mem_info.add_access_info(
393  num_requests,
394  num_transactions_per_request,
395  num_bytes_used_per_request);
396  }
397 
398 private:
399  int bytes_per_access;
400  bool verbose;
401  int thread_count = 0;
402  std::unordered_map<int64_t, std::unordered_set<int64_t>> sectors_accessed;
403 };
404 
405 } // namespace Autoscheduler
406 } // namespace Internal
407 } // namespace Halide
408 
409 #endif // GPU_MEM_INFO_H
#define internal_assert(c)
Definition: Errors.h:19
typename MemTraits< T >::Accumulator Accumulator
Definition: GPUMemInfo.h:53
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
Expr ceil(Expr x)
Return the least whole number greater than or equal to a floating-point expression.
@ Internal
Not visible externally, similar to 'static' linkage in C.
Expr abs(Expr a)
Returns the absolute value of a signed integer or floating-point expression.
Expr max(const FuncRef &a, const FuncRef &b)
Definition: Func.h:606
signed __INT64_TYPE__ int64_t
GlobalAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
Definition: GPUMemInfo.h:177
void operator()(int thread_id, int x, int y, int z, int active, bool last_thread)
Definition: GPUMemInfo.h:184
void add_access_info(int num_requests, GlobalMemInfo &global_mem_info, bool is_tail_warp) const
Definition: GPUMemInfo.h:222
void add_access_info(int num_requests, LocalMemInfo &local_mem_info, bool is_tail_warp) const
Definition: GPUMemInfo.h:373
LocalAccessAccumulator(int bytes_per_access, bool verbose)
Definition: GPUMemInfo.h:356
void operator()(int thread_id, int x, int y, int z, int active, bool last_thread)
Definition: GPUMemInfo.h:361
void add_access_info(double num_requests, double num_transactions_per_request, double num_bytes_used_per_request)
Definition: GPUMemInfo.h:63
static constexpr double bytes_per_transaction
Definition: GPUMemInfo.h:57
void add(const MemInfo< T > &other)
Definition: GPUMemInfo.h:80
void operator()(int thread_id, int x, int y, int z, int active, bool last_thread)
Definition: GPUMemInfo.h:269
SharedAccessAccumulator(int bytes_per_access, size_t dimensions, const Strides &strides, bool verbose)
Definition: GPUMemInfo.h:262
void add_access_info(int num_requests, SharedMemInfo &shared_mem_info, bool is_tail_warp) const
Definition: GPUMemInfo.h:313
bool valid(size_t loop_index) const
Definition: GPUMemInfo.h:129
void add_valid(const std::vector< double > &strides)
Definition: GPUMemInfo.h:121
Strides(const std::vector< int64_t > &storage_strides)
Definition: GPUMemInfo.h:117
int64_t offset(size_t loop_index, int64_t point) const
Definition: GPUMemInfo.h:133