Func.h (Halide 17.0.2)
1 #ifndef HALIDE_FUNC_H
2 #define HALIDE_FUNC_H
3 
4 /** \file
5  *
6  * Defines Func - the front-end handle on a halide function, and related classes.
7  */
8 
9 #include "Argument.h"
10 #include "Expr.h"
11 #include "JITModule.h"
12 #include "Module.h"
13 #include "Param.h"
14 #include "Pipeline.h"
15 #include "RDom.h"
16 #include "Target.h"
17 #include "Tuple.h"
18 #include "Var.h"
19 
20 #include <map>
21 #include <utility>
22 
23 namespace Halide {
24 
25 class OutputImageParam;
26 
27 /** A class that can represent Vars or RVars. Used for reorder calls
28  * which can accept a mix of either. */
29 struct VarOrRVar {
30  VarOrRVar(const std::string &n, bool r)
31  : var(n), rvar(n), is_rvar(r) {
32  }
33  VarOrRVar(const Var &v)
34  : var(v), is_rvar(false) {
35  }
36  VarOrRVar(const RVar &r)
37  : rvar(r), is_rvar(true) {
38  }
39  VarOrRVar(const RDom &r)
40  : rvar(RVar(r)), is_rvar(true) {
41  }
42  template<int N>
 43  VarOrRVar(const ImplicitVar<N> &u)
 44  : var(u), is_rvar(false) {
45  }
46 
47  const std::string &name() const {
48  if (is_rvar) {
49  return rvar.name();
50  } else {
51  return var.name();
52  }
53  }
 54 
 55  Var var;
 56  RVar rvar;
 57  bool is_rvar;
58 };
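/* Example (a minimal sketch; assumes a Func f, an input Func g, Vars x and y,
 * and a 1-D RDom r): because reorder() and related calls take VarOrRVar, a
 * single call can mix pure Vars and reduction RVars:
 \code
 f(x, y) = 0;
 f(x, y) += g(x, y, r);
 f.update(0).reorder(r, x, y);  // RVar and Vars mixed in one reorder call
 \endcode
 */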
59 
60 class ImageParam;
61 
62 namespace Internal {
63 class Function;
64 struct Split;
65 struct StorageDim;
66 } // namespace Internal
67 
68 /** A single definition of a Func. May be a pure or update definition. */
69 class Stage {
70  /** Reference to the Function this stage (or definition) belongs to. */
71  Internal::Function function;
72  Internal::Definition definition;
73  /** Indicate which stage the definition belongs to (0 for initial
74  * definition, 1 for first update, etc.). */
75  size_t stage_index;
76  /** Pure Vars of the Function (from the init definition). */
77  std::vector<Var> dim_vars;
78 
79  void set_dim_type(const VarOrRVar &var, Internal::ForType t);
80  void set_dim_device_api(const VarOrRVar &var, DeviceAPI device_api);
81  void split(const std::string &old, const std::string &outer, const std::string &inner,
82  const Expr &factor, bool exact, TailStrategy tail);
83  void remove(const std::string &var);
84  Stage &purify(const VarOrRVar &old_name, const VarOrRVar &new_name);
85 
86  const std::vector<Internal::StorageDim> &storage_dims() const {
87  return function.schedule().storage_dims();
88  }
89 
90  Stage &compute_with(LoopLevel loop_level, const std::map<std::string, LoopAlignStrategy> &align);
91 
92 public:
93  Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
94  : function(std::move(f)), definition(std::move(d)), stage_index(stage_index) {
95  internal_assert(definition.defined());
96 
97  dim_vars.reserve(function.args().size());
98  for (const auto &arg : function.args()) {
99  dim_vars.emplace_back(arg);
100  }
101  internal_assert(definition.args().size() == dim_vars.size());
102  }
103 
104  /** Return the current StageSchedule associated with this Stage. For
105  * introspection only: to modify schedule, use the Func interface. */
 106  const Internal::StageSchedule &get_schedule() const {
 107  return definition.schedule();
108  }
109 
110  /** Return a string describing the current var list taking into
111  * account all the splits, reorders, and tiles. */
112  std::string dump_argument_list() const;
113 
114  /** Return the name of this stage, e.g. "f.update(2)" */
115  std::string name() const;
116 
 117  /** Calling rfactor() on an associative update definition of a Func will split
118  * the update into an intermediate which computes the partial results and
119  * replaces the current update definition with a new definition which merges
 120  * the partial results. If called on an init/pure definition, this will
121  * throw an error. rfactor() will automatically infer the associative reduction
122  * operator and identity of the operator. If it can't prove the operation
123  * is associative or if it cannot find an identity for that operator, this
124  * will throw an error. In addition, commutativity of the operator is required
125  * if rfactor() is called on the inner dimension but excluding the outer
126  * dimensions.
127  *
128  * rfactor() takes as input 'preserved', which is a list of <RVar, Var> pairs.
129  * The rvars not listed in 'preserved' are removed from the original Func and
130  * are lifted to the intermediate Func. The remaining rvars (the ones in
131  * 'preserved') are made pure in the intermediate Func. The intermediate Func's
 132  * update definition inherits all scheduling directives (e.g. split, fuse, etc.)
133  * applied to the original Func's update definition. The loop order of the
134  * intermediate Func's update definition is the same as the original, although
135  * the RVars in 'preserved' are replaced by the new pure Vars. The loop order of the
136  * intermediate Func's init definition from innermost to outermost is the args'
137  * order of the original Func's init definition followed by the new pure Vars.
138  *
139  * The intermediate Func also inherits storage order from the original Func
140  * with the new pure Vars added to the outermost.
141  *
142  * For example, f.update(0).rfactor({{r.y, u}}) would rewrite a pipeline like this:
143  \code
144  f(x, y) = 0;
145  f(x, y) += g(r.x, r.y);
146  \endcode
147  * into a pipeline like this:
148  \code
149  f_intm(x, y, u) = 0;
150  f_intm(x, y, u) += g(r.x, u);
151 
152  f(x, y) = 0;
153  f(x, y) += f_intm(x, y, r.y);
154  \endcode
155  *
156  * This has a variety of uses. You can use it to split computation of an associative reduction:
157  \code
158  f(x, y) = 10;
159  RDom r(0, 96);
160  f(x, y) = max(f(x, y), g(x, y, r.x));
161  f.update(0).split(r.x, rxo, rxi, 8).reorder(y, x).parallel(x);
162  f.update(0).rfactor({{rxo, u}}).compute_root().parallel(u).update(0).parallel(u);
163  \endcode
164  *
 165  * which is equivalent to:
166  \code
167  parallel for u = 0 to 11:
168  for y:
169  for x:
170  f_intm(x, y, u) = -inf
171  parallel for x:
172  for y:
173  parallel for u = 0 to 11:
174  for rxi = 0 to 7:
 175  f_intm(x, y, u) = max(f_intm(x, y, u), g(x, y, 8*u + rxi))
176  for y:
177  for x:
178  f(x, y) = 10
179  parallel for x:
180  for y:
181  for rxo = 0 to 11:
182  f(x, y) = max(f(x, y), f_intm(x, y, rxo))
183  \endcode
184  *
185  */
186  // @{
187  Func rfactor(std::vector<std::pair<RVar, Var>> preserved);
188  Func rfactor(const RVar &r, const Var &v);
189  // @}
190 
191  /** Schedule the iteration over this stage to be fused with another
192  * stage 's' from outermost loop to a given LoopLevel. 'this' stage will
193  * be computed AFTER 's' in the innermost fused dimension. There should not
194  * be any dependencies between those two fused stages. If either of the
195  * stages being fused is a stage of an extern Func, this will throw an error.
196  *
197  * Note that the two stages that are fused together should have the same
198  * exact schedule from the outermost to the innermost fused dimension, and
199  * the stage we are calling compute_with on should not have specializations,
200  * e.g. f2.compute_with(f1, x) is allowed only if f2 has no specializations.
201  *
202  * Also, if a producer is desired to be computed at the fused loop level,
203  * the function passed to the compute_at() needs to be the "parent". Consider
204  * the following code:
205  \code
206  input(x, y) = x + y;
207  f(x, y) = input(x, y);
208  f(x, y) += 5;
209  g(x, y) = x - y;
210  g(x, y) += 10;
211  f.compute_with(g, y);
212  f.update().compute_with(g.update(), y);
213  \endcode
214  *
215  * To compute 'input' at the fused loop level at dimension y, we specify
216  * input.compute_at(g, y) instead of input.compute_at(f, y) since 'g' is
217  * the "parent" for this fused loop (i.e. 'g' is computed first before 'f'
218  * is computed). On the other hand, to compute 'input' at the innermost
219  * dimension of 'f', we specify input.compute_at(f, x) instead of
220  * input.compute_at(g, x) since the x dimension of 'f' is not fused
221  * (only the y dimension is).
222  *
223  * Given the constraints, this has a variety of uses. Consider the
224  * following code:
225  \code
226  f(x, y) = x + y;
227  g(x, y) = x - y;
228  h(x, y) = f(x, y) + g(x, y);
229  f.compute_root();
230  g.compute_root();
231  f.split(x, xo, xi, 8);
232  g.split(x, xo, xi, 8);
233  g.compute_with(f, xo);
234  \endcode
235  *
236  * This is equivalent to:
237  \code
238  for y:
239  for xo:
240  for xi:
241  f(8*xo + xi) = (8*xo + xi) + y
242  for xi:
243  g(8*xo + xi) = (8*xo + xi) - y
244  for y:
245  for x:
246  h(x, y) = f(x, y) + g(x, y)
247  \endcode
248  *
 249  * The sizes of the dimensions of the stages computed with each other do not have
250  * to match. Consider the following code where 'g' is half the size of 'f':
251  \code
252  Image<int> f_im(size, size), g_im(size/2, size/2);
253  input(x, y) = x + y;
254  f(x, y) = input(x, y);
255  g(x, y) = input(2*x, 2*y);
256  g.compute_with(f, y);
257  input.compute_at(f, y);
258  Pipeline({f, g}).realize({f_im, g_im});
259  \endcode
260  *
261  * This is equivalent to:
262  \code
263  for y = 0 to size-1:
264  for x = 0 to size-1:
265  input(x, y) = x + y;
266  for x = 0 to size-1:
267  f(x, y) = input(x, y)
268  for x = 0 to size/2-1:
269  if (y < size/2-1):
270  g(x, y) = input(2*x, 2*y)
271  \endcode
272  *
273  * 'align' specifies how the loop iteration of each dimension of the
274  * two stages being fused should be aligned in the fused loop nests
275  * (see LoopAlignStrategy for options). Consider the following loop nests:
276  \code
277  for z = f_min_z to f_max_z:
278  for y = f_min_y to f_max_y:
279  for x = f_min_x to f_max_x:
280  f(x, y, z) = x + y + z
281  for z = g_min_z to g_max_z:
282  for y = g_min_y to g_max_y:
283  for x = g_min_x to g_max_x:
284  g(x, y, z) = x - y - z
285  \endcode
286  *
287  * If no alignment strategy is specified, the following loop nest will be
288  * generated:
289  \code
290  for z = min(f_min_z, g_min_z) to max(f_max_z, g_max_z):
291  for y = min(f_min_y, g_min_y) to max(f_max_y, g_max_y):
292  for x = f_min_x to f_max_x:
293  if (f_min_z <= z <= f_max_z):
294  if (f_min_y <= y <= f_max_y):
295  f(x, y, z) = x + y + z
296  for x = g_min_x to g_max_x:
297  if (g_min_z <= z <= g_max_z):
298  if (g_min_y <= y <= g_max_y):
299  g(x, y, z) = x - y - z
300  \endcode
301  *
302  * Instead, these alignment strategies:
303  \code
304  g.compute_with(f, y, {{z, LoopAlignStrategy::AlignStart}, {y, LoopAlignStrategy::AlignEnd}});
305  \endcode
306  * will produce the following loop nest:
307  \code
308  f_loop_min_z = f_min_z
309  f_loop_max_z = max(f_max_z, (f_min_z - g_min_z) + g_max_z)
310  for z = f_min_z to f_loop_max_z:
311  f_loop_min_y = min(f_min_y, (f_max_y - g_max_y) + g_min_y)
312  f_loop_max_y = f_max_y
313  for y = f_loop_min_y to f_loop_max_y:
314  for x = f_min_x to f_max_x:
315  if (f_loop_min_z <= z <= f_loop_max_z):
316  if (f_loop_min_y <= y <= f_loop_max_y):
317  f(x, y, z) = x + y + z
318  for x = g_min_x to g_max_x:
319  g_shift_z = g_min_z - f_loop_min_z
320  g_shift_y = g_max_y - f_loop_max_y
321  if (g_min_z <= (z + g_shift_z) <= g_max_z):
322  if (g_min_y <= (y + g_shift_y) <= g_max_y):
323  g(x, y + g_shift_y, z + g_shift_z) = x - (y + g_shift_y) - (z + g_shift_z)
324  \endcode
325  *
326  * LoopAlignStrategy::AlignStart on dimension z will shift the loop iteration
327  * of 'g' at dimension z so that its starting value matches that of 'f'.
328  * Likewise, LoopAlignStrategy::AlignEnd on dimension y will shift the loop
329  * iteration of 'g' at dimension y so that its end value matches that of 'f'.
330  */
331  // @{
332  Stage &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
333  Stage &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
334  Stage &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
335  Stage &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
336  // @}
337 
338  /** Scheduling calls that control how the domain of this stage is
339  * traversed. See the documentation for Func for the meanings. */
340  // @{
341 
342  Stage &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
343  Stage &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
344  Stage &serial(const VarOrRVar &var);
345  Stage &parallel(const VarOrRVar &var);
346  Stage &vectorize(const VarOrRVar &var);
347  Stage &unroll(const VarOrRVar &var);
348  Stage &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
349  Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
350  Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
351  Stage &partition(const VarOrRVar &var, Partition partition_policy);
 352  Stage &never_partition_all();
 353  Stage &never_partition(const std::vector<VarOrRVar> &vars);
 354  Stage &always_partition_all();
 355  Stage &always_partition(const std::vector<VarOrRVar> &vars);
356 
 357  Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
 358  const VarOrRVar &xo, const VarOrRVar &yo,
 359  const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor,
 360  TailStrategy tail = TailStrategy::Auto);
 361  Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
 362  const VarOrRVar &xi, const VarOrRVar &yi,
 363  const Expr &xfactor, const Expr &yfactor,
 364  TailStrategy tail = TailStrategy::Auto);
 365  Stage &tile(const std::vector<VarOrRVar> &previous,
 366  const std::vector<VarOrRVar> &outers,
 367  const std::vector<VarOrRVar> &inners,
 368  const std::vector<Expr> &factors,
 369  const std::vector<TailStrategy> &tails);
 370  Stage &tile(const std::vector<VarOrRVar> &previous,
 371  const std::vector<VarOrRVar> &outers,
 372  const std::vector<VarOrRVar> &inners,
 373  const std::vector<Expr> &factors,
 374  TailStrategy tail = TailStrategy::Auto);
 375  Stage &tile(const std::vector<VarOrRVar> &previous,
 376  const std::vector<VarOrRVar> &inners,
 377  const std::vector<Expr> &factors,
 378  TailStrategy tail = TailStrategy::Auto);
 379  Stage &reorder(const std::vector<VarOrRVar> &vars);
380 
381  template<typename... Args>
382  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
383  reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
384  std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
385  return reorder(collected_args);
386  }
387 
388  template<typename... Args>
389  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
390  never_partition(const VarOrRVar &x, Args &&...args) {
391  std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
392  return never_partition(collected_args);
393  }
394 
395  template<typename... Args>
396  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
397  always_partition(const VarOrRVar &x, Args &&...args) {
398  std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
399  return always_partition(collected_args);
400  }
401 
402  Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
403  Stage specialize(const Expr &condition);
404  void specialize_fail(const std::string &message);
405 
406  Stage &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
407  Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
408  Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
409 
410  Stage &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
411 
 412  Stage &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);
 413 
414  Stage &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
415  Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
416  Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
417 
418  Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
419  Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
420  const VarOrRVar &thread_x, const VarOrRVar &thread_y,
421  DeviceAPI device_api = DeviceAPI::Default_GPU);
422  Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
423  const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z,
424  DeviceAPI device_api = DeviceAPI::Default_GPU);
425 
 426  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
 427  TailStrategy tail = TailStrategy::Auto,
 428  DeviceAPI device_api = DeviceAPI::Default_GPU);
 429 
 430  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
 431  TailStrategy tail = TailStrategy::Auto,
 432  DeviceAPI device_api = DeviceAPI::Default_GPU);
 433  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
 434  const VarOrRVar &bx, const VarOrRVar &by,
 435  const VarOrRVar &tx, const VarOrRVar &ty,
 436  const Expr &x_size, const Expr &y_size,
 437  TailStrategy tail = TailStrategy::Auto,
 438  DeviceAPI device_api = DeviceAPI::Default_GPU);
 439 
 440  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
 441  const VarOrRVar &tx, const VarOrRVar &ty,
 442  const Expr &x_size, const Expr &y_size,
 443  TailStrategy tail = TailStrategy::Auto,
 444  DeviceAPI device_api = DeviceAPI::Default_GPU);
 445 
 446  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
 447  const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
 448  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
 449  const Expr &x_size, const Expr &y_size, const Expr &z_size,
 450  TailStrategy tail = TailStrategy::Auto,
 451  DeviceAPI device_api = DeviceAPI::Default_GPU);
 452  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
 453  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
 454  const Expr &x_size, const Expr &y_size, const Expr &z_size,
 455  TailStrategy tail = TailStrategy::Auto,
 456  DeviceAPI device_api = DeviceAPI::Default_GPU);
457 
 458  Stage &allow_race_conditions();
 459  Stage &atomic(bool override_associativity_test = false);
460 
461  Stage &hexagon(const VarOrRVar &x = Var::outermost());
462 
 463  Stage &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
 464  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
 465  Stage &prefetch(const Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
 466  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
 467  template<typename T>
 468  Stage &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
 469  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
 470  return prefetch(image.parameter(), at, from, std::move(offset), strategy);
 471  }
472  // @}
473 
474  /** Attempt to get the source file and line where this stage was
475  * defined by parsing the process's own debug symbols. Returns an
476  * empty string if no debug symbols were found or the debug
477  * symbols were not understood. Works on OS X and Linux only. */
478  std::string source_location() const;
479 
480  /** Assert that this stage has intentionally been given no schedule, and
481  * suppress the warning about unscheduled update definitions that would
482  * otherwise fire. This counts as a schedule, so calling this twice on the
483  * same Stage will fail the assertion. */
484  void unscheduled();
485 };
486 
487 // For backwards compatibility, keep the ScheduleHandle name.
 488 typedef Stage ScheduleHandle;
 489 
490 class FuncTupleElementRef;
491 
492 /** A fragment of front-end syntax of the form f(x, y, z), where x, y,
 493  * z are Vars or Exprs. It could be the left hand side of a definition or
494  * an update definition, or it could be a call to a function. We don't know
495  * until we see how this object gets used.
496  */
497 class FuncRef {
498  Internal::Function func;
499  int implicit_placeholder_pos;
500  int implicit_count;
501  std::vector<Expr> args;
502  std::vector<Expr> args_with_implicit_vars(const std::vector<Expr> &e) const;
503 
504  /** Helper for function update by Tuple. If the function does not
505  * already have a pure definition, init_val will be used as RHS of
506  * each tuple element in the initial function definition. */
507  template<typename BinaryOp>
508  Stage func_ref_update(const Tuple &e, int init_val);
509 
510  /** Helper for function update by Expr. If the function does not
511  * already have a pure definition, init_val will be used as RHS in
512  * the initial function definition. */
513  template<typename BinaryOp>
514  Stage func_ref_update(Expr e, int init_val);
515 
516 public:
517  FuncRef(const Internal::Function &, const std::vector<Expr> &,
518  int placeholder_pos = -1, int count = 0);
519  FuncRef(Internal::Function, const std::vector<Var> &,
520  int placeholder_pos = -1, int count = 0);
521 
522  /** Use this as the left-hand-side of a definition or an update definition
523  * (see \ref RDom).
524  */
525  Stage operator=(const Expr &);
526 
527  /** Use this as the left-hand-side of a definition or an update definition
528  * for a Func with multiple outputs. */
529  Stage operator=(const Tuple &);
530 
531  /** Define a stage that adds the given expression to this Func. If the
532  * expression refers to some RDom, this performs a sum reduction of the
533  * expression over the domain. If the function does not already have a
534  * pure definition, this sets it to zero.
535  */
536  // @{
 537  Stage operator+=(Expr);
 538  Stage operator+=(const Tuple &);
539  Stage operator+=(const FuncRef &);
540  // @}
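 /* Example (a minimal sketch; assumes a Var x, an input Func g, and an RDom r):
  * a sum reduction written with operator+=. The pure definition of f is
  * implicitly initialized to zero because none was given:
  \code
  Func f;
  RDom r(0, 100);
  f(x) += g(x, r);  // same as: f(x) = 0; f(x) = f(x) + g(x, r);
  \endcode
  */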
541 
542  /** Define a stage that adds the negative of the given expression to this
543  * Func. If the expression refers to some RDom, this performs a sum reduction
544  * of the negative of the expression over the domain. If the function does
545  * not already have a pure definition, this sets it to zero.
546  */
547  // @{
 548  Stage operator-=(Expr);
 549  Stage operator-=(const Tuple &);
550  Stage operator-=(const FuncRef &);
551  // @}
552 
553  /** Define a stage that multiplies this Func by the given expression. If the
554  * expression refers to some RDom, this performs a product reduction of the
555  * expression over the domain. If the function does not already have a pure
556  * definition, this sets it to 1.
557  */
558  // @{
 559  Stage operator*=(Expr);
 560  Stage operator*=(const Tuple &);
561  Stage operator*=(const FuncRef &);
562  // @}
563 
564  /** Define a stage that divides this Func by the given expression.
565  * If the expression refers to some RDom, this performs a product
566  * reduction of the inverse of the expression over the domain. If the
567  * function does not already have a pure definition, this sets it to 1.
568  */
569  // @{
 570  Stage operator/=(Expr);
 571  Stage operator/=(const Tuple &);
572  Stage operator/=(const FuncRef &);
573  // @}
574 
575  /* Override the usual assignment operator, so that
576  * f(x, y) = g(x, y) defines f.
577  */
578  Stage operator=(const FuncRef &);
579 
580  /** Use this as a call to the function, and not the left-hand-side
581  * of a definition. Only works for single-output Funcs. */
582  operator Expr() const;
583 
584  /** When a FuncRef refers to a function that provides multiple
585  * outputs, you can access each output as an Expr using
586  * operator[].
587  */
588  FuncTupleElementRef operator[](int) const;
589 
590  /** How many outputs does the function this refers to produce. */
591  size_t size() const;
592 
593  /** What function is this calling? */
594  Internal::Function function() const {
595  return func;
596  }
597 };
598 
599 /** Explicit overloads of min and max for FuncRef. These exist to
600  * disambiguate calls to min on FuncRefs when a user has pulled both
601  * Halide::min and std::min into their namespace. */
602 // @{
603 inline Expr min(const FuncRef &a, const FuncRef &b) {
604  return min(Expr(a), Expr(b));
605 }
606 inline Expr max(const FuncRef &a, const FuncRef &b) {
607  return max(Expr(a), Expr(b));
608 }
609 // @}
610 
611 /** A fragment of front-end syntax of the form f(x, y, z)[index], where x, y,
 612  * z are Vars or Exprs. It could be the left hand side of an update
613  * definition, or it could be a call to a function. We don't know
614  * until we see how this object gets used.
615  */
 616 class FuncTupleElementRef {
 617  FuncRef func_ref;
618  std::vector<Expr> args; // args to the function
619  int idx; // Index to function outputs
620 
621  /** Helper function that generates a Tuple where element at 'idx' is set
622  * to 'e' and the rests are undef. */
623  Tuple values_with_undefs(const Expr &e) const;
624 
625 public:
626  FuncTupleElementRef(const FuncRef &ref, const std::vector<Expr> &args, int idx);
627 
628  /** Use this as the left-hand-side of an update definition of Tuple
629  * component 'idx' of a Func (see \ref RDom). The function must
630  * already have an initial definition.
631  */
632  Stage operator=(const Expr &e);
633 
634  /** Define a stage that adds the given expression to Tuple component 'idx'
635  * of this Func. The other Tuple components are unchanged. If the expression
636  * refers to some RDom, this performs a sum reduction of the expression over
637  * the domain. The function must already have an initial definition.
638  */
639  Stage operator+=(const Expr &e);
640 
641  /** Define a stage that adds the negative of the given expression to Tuple
642  * component 'idx' of this Func. The other Tuple components are unchanged.
643  * If the expression refers to some RDom, this performs a sum reduction of
644  * the negative of the expression over the domain. The function must already
645  * have an initial definition.
646  */
647  Stage operator-=(const Expr &e);
648 
649  /** Define a stage that multiplies Tuple component 'idx' of this Func by
650  * the given expression. The other Tuple components are unchanged. If the
651  * expression refers to some RDom, this performs a product reduction of
652  * the expression over the domain. The function must already have an
653  * initial definition.
654  */
655  Stage operator*=(const Expr &e);
656 
657  /** Define a stage that divides Tuple component 'idx' of this Func by
658  * the given expression. The other Tuple components are unchanged.
659  * If the expression refers to some RDom, this performs a product
660  * reduction of the inverse of the expression over the domain. The function
661  * must already have an initial definition.
662  */
663  Stage operator/=(const Expr &e);
664 
665  /* Override the usual assignment operator, so that
666  * f(x, y)[index] = g(x, y) defines f.
667  */
668  Stage operator=(const FuncRef &e);
669 
670  /** Use this as a call to Tuple component 'idx' of a Func, and not the
671  * left-hand-side of a definition. */
672  operator Expr() const;
673 
674  /** What function is this calling? */
675  Internal::Function function() const {
676  return func_ref.function();
677  }
678 
679  /** Return index to the function outputs. */
680  int index() const {
681  return idx;
682  }
683 };
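/* Example (a minimal sketch; assumes a Var x, an input Func g returning
 * Int(32), and an RDom r): updating a single Tuple component through
 * operator[]:
 \code
 Func f;
 f(x) = Tuple(0, 0.0f);   // two outputs: an int and a float
 RDom r(0, 100);
 f(x)[0] += g(x, r);      // updates component 0 only; component 1 is unchanged
 \endcode
 */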
684 
685 namespace Internal {
686 class IRMutator;
687 } // namespace Internal
688 
689 /** Helper class for identifying purpose of an Expr passed to memoize.
690  */
691 class EvictionKey {
692 protected:
 693  Expr key;
 694  friend class Func;
695 
696 public:
697  explicit EvictionKey(const Expr &expr = Expr())
698  : key(expr) {
699  }
700 };
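/* Example (a minimal sketch, assuming the Func::memoize(EvictionKey) overload
 * declared elsewhere in this header and a scheduled Func f): tagging a
 * memoized Func with an eviction key so its cache entries can later be
 * evicted as a group via the memoization cache runtime:
 \code
 Param<int> generation;
 f.compute_root().memoize(EvictionKey(generation));
 \endcode
 */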
701 
702 /** A halide function. This class represents one stage in a Halide
703  * pipeline, and is the unit by which we schedule things. By default
704  * they are aggressively inlined, so you are encouraged to make lots
705  * of little functions, rather than storing things in Exprs. */
706 class Func {
707 
708  /** A handle on the internal halide function that this
709  * represents */
710  Internal::Function func;
711 
712  /** When you make a reference to this function with fewer
713  * arguments than it has dimensions, the argument list is bulked
714  * up with 'implicit' vars with canonical names. This lets you
715  * pass around partially applied Halide functions. */
716  // @{
717  std::pair<int, int> add_implicit_vars(std::vector<Var> &) const;
718  std::pair<int, int> add_implicit_vars(std::vector<Expr> &) const;
719  // @}
720 
721  /** The imaging pipeline that outputs this Func alone. */
722  Pipeline pipeline_;
723 
724  /** Get the imaging pipeline that outputs this Func alone,
725  * creating it (and freezing the Func) if necessary. */
726  Pipeline pipeline();
727 
728  // Helper function for recursive reordering support
729  Func &reorder_storage(const std::vector<Var> &dims, size_t start);
730 
731  void invalidate_cache();
732 
733 public:
734  /** Declare a new undefined function with the given name */
735  explicit Func(const std::string &name);
736 
737  /** Declare a new undefined function with the given name.
738  * The function will be constrained to represent Exprs of required_type.
739  * If required_dims is not AnyDims, the function will be constrained to exactly
740  * that many dimensions. */
741  explicit Func(const Type &required_type, int required_dims, const std::string &name);
742 
743  /** Declare a new undefined function with the given name.
744  * If required_types is not empty, the function will be constrained to represent
745  * Tuples of the same arity and types. (If required_types is empty, there is no constraint.)
746  * If required_dims is not AnyDims, the function will be constrained to exactly
747  * that many dimensions. */
748  explicit Func(const std::vector<Type> &required_types, int required_dims, const std::string &name);
749 
750  /** Declare a new undefined function with an
751  * automatically-generated unique name */
752  Func();
753 
754  /** Declare a new function with an automatically-generated unique
755  * name, and define it to return the given expression (which may
756  * not contain free variables). */
757  explicit Func(const Expr &e);
758 
 759  /** Construct a new Func to wrap an existing, already-defined
760  * Function object. */
761  explicit Func(Internal::Function f);
762 
763  /** Construct a new Func to wrap a Buffer. */
764  template<typename T, int Dims>
 765  HALIDE_NO_USER_CODE_INLINE explicit Func(Buffer<T, Dims> &im)
 766  : Func() {
767  (*this)(_) = im(_);
768  }
769 
770  /** Evaluate this function over some rectangular domain and return
771  * the resulting buffer or buffers. Performs compilation if the
772  * Func has not previously been realized and compile_jit has not
773  * been called. If the final stage of the pipeline is on the GPU,
774  * data is copied back to the host before being returned. The
775  * returned Realization should probably be instantly converted to
776  * a Buffer class of the appropriate type. That is, do this:
777  *
778  \code
779  f(x) = sin(x);
780  Buffer<float> im = f.realize(...);
781  \endcode
782  *
783  * If your Func has multiple values, because you defined it using
784  * a Tuple, then casting the result of a realize call to a buffer
785  * or image will produce a run-time error. Instead you should do the
786  * following:
787  *
788  \code
789  f(x) = Tuple(x, sin(x));
790  Realization r = f.realize(...);
791  Buffer<int> im0 = r[0];
792  Buffer<float> im1 = r[1];
793  \endcode
794  *
795  * In Halide formal arguments of a computation are specified using
796  * Param<T> and ImageParam objects in the expressions defining the
797  * computation. Note that this method is not thread-safe, in that
798  * Param<T> and ImageParam are globals shared by all threads; to call
799  * jitted code in a thread-safe manner, use compile_to_callable() instead.
800  *
801  \code
 802  Param<int32_t> p(42);
 803  ImageParam img(Int(32), 2);
 804  f(x, y) = img(x, y) + p;
 805 
 806  Buffer<int32_t> arg_img(10, 10);
 807  <fill in arg_img...>
 808 
 809  Target t = get_jit_target_from_environment();
 810  Buffer<int32_t> result = f.realize({10, 10}, t);
811  \endcode
812  *
813  * Alternatively, an initializer list can be used
814  * directly in the realize call to pass this information:
815  *
816  \code
 817  Param<int32_t> p(42);
 818  ImageParam img(Int(32), 2);
 819  f(x, y) = img(x, y) + p;
 820 
 821  Buffer<int32_t> arg_img(10, 10);
 822  <fill in arg_img...>
 823 
 824  Target t = get_jit_target_from_environment();
 825  Buffer<int32_t> result = f.realize({10, 10}, t, { { p, 17 }, { img, arg_img } });
826  \endcode
827  *
828  * If the Func cannot be realized into a buffer of the given size
829  * due to scheduling constraints on scattering update definitions,
830  * it will be realized into a larger buffer of the minimum size
831  * possible, and a cropped view at the requested size will be
832  * returned. It is thus not safe to assume the returned buffers
833  * are contiguous in memory. This behavior can be disabled with
834  * the NoBoundsQuery target flag, in which case an error about
835  * writing out of bounds on the output buffer will trigger
836  * instead.
837  *
838  */
839  Realization realize(std::vector<int32_t> sizes = {}, const Target &target = Target());
840 
841  /** Same as above, but takes a custom user-provided context to be
842  * passed to runtime functions. This can be used to pass state to
843  * runtime overrides in a thread-safe manner. A nullptr context is
844  * legal, and is equivalent to calling the variant of realize
845  * that does not take a context. */
846  Realization realize(JITUserContext *context,
847  std::vector<int32_t> sizes = {},
848  const Target &target = Target());
849 
850  /** Evaluate this function into an existing allocated buffer or
851  * buffers. If the buffer is also one of the arguments to the
852  * function, strange things may happen, as the pipeline isn't
853  * necessarily safe to run in-place. If you pass multiple buffers,
854  * they must have matching sizes. This form of realize does *not*
855  * automatically copy data back from the GPU. */
856  void realize(Pipeline::RealizationArg outputs, const Target &target = Target());
857 
858  /** Same as above, but takes a custom user-provided context to be
859  * passed to runtime functions. This can be used to pass state to
860  * runtime overrides in a thread-safe manner. A nullptr context is
861  * legal, and is equivalent to calling the variant of realize
862  * that does not take a context. */
863  void realize(JITUserContext *context,
864  Pipeline::RealizationArg outputs,
865  const Target &target = Target());
866 
867  /** For a given size of output, or a given output buffer,
868  * determine the bounds required of all unbound ImageParams
869  * referenced. Communicates the result by allocating new buffers
870  * of the appropriate size and binding them to the unbound
871  * ImageParams.
872  */
873  // @{
874  void infer_input_bounds(const std::vector<int32_t> &sizes,
875  const Target &target = get_jit_target_from_environment());
876  void infer_input_bounds(Pipeline::RealizationArg outputs,
877  const Target &target = get_jit_target_from_environment());
878  // @}
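 /* Example (a minimal sketch; assumes a Var x): letting bounds inference
  * allocate and bind an unbound ImageParam. The 1-D pipeline below reads img
  * at x and x + 1, so a 10-element output needs an 11-element input:
  \code
  ImageParam img(Int(32), 1);
  Func f;
  f(x) = img(x) + img(x + 1);
  f.infer_input_bounds({10});
  Buffer<> in = img.get();  // img is now bound to a freshly allocated buffer
  \endcode
  */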
879 
880  /** Versions of infer_input_bounds that take a custom user context
881  * to pass to runtime functions. */
882  // @{
883  void infer_input_bounds(JITUserContext *context,
884  const std::vector<int32_t> &sizes,
885  const Target &target = get_jit_target_from_environment());
886  void infer_input_bounds(JITUserContext *context,
887  Pipeline::RealizationArg outputs,
888  const Target &target = get_jit_target_from_environment());
889  // @}
890  /** Statically compile this function to llvm bitcode, with the
891  * given filename (which should probably end in .bc), type
892  * signature, and C function name (which defaults to the same name
 893  * as this halide function). */
894  //@{
895  void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
896  const Target &target = get_target_from_environment());
897  void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &,
898  const Target &target = get_target_from_environment());
899  // @}
900 
901  /** Statically compile this function to llvm assembly, with the
902  * given filename (which should probably end in .ll), type
903  * signature, and C function name (which defaults to the same name
 904  * as this halide function). */
905  //@{
906  void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
907  const Target &target = get_target_from_environment());
908  void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &,
909  const Target &target = get_target_from_environment());
910  // @}
911 
912  /** Statically compile this function to an object file, with the
913  * given filename (which should probably end in .o or .obj), type
914  * signature, and C function name (which defaults to the same name
 915  * as this halide function). You probably don't want to use this
916  * directly; call compile_to_static_library or compile_to_file instead. */
917  //@{
918  void compile_to_object(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
919  const Target &target = get_target_from_environment());
920  void compile_to_object(const std::string &filename, const std::vector<Argument> &,
921  const Target &target = get_target_from_environment());
922  // @}
923 
924  /** Emit a header file with the given filename for this
925  * function. The header will define a function with the type
926  * signature given by the second argument, and a name given by the
927  * third. The name defaults to the same name as this halide
928  * function. You don't actually have to have defined this function
929  * yet to call this. You probably don't want to use this directly;
930  * call compile_to_static_library or compile_to_file instead. */
931  void compile_to_header(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name = "",
932  const Target &target = get_target_from_environment());
933 
934  /** Statically compile this function to text assembly equivalent
935  * to the object file generated by compile_to_object. This is
936  * useful for checking what Halide is producing without having to
937  * disassemble anything, or if you need to feed the assembly into
938  * some custom toolchain to produce an object file (e.g. iOS) */
939  //@{
940  void compile_to_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
941  const Target &target = get_target_from_environment());
942  void compile_to_assembly(const std::string &filename, const std::vector<Argument> &,
943  const Target &target = get_target_from_environment());
944  // @}
945 
946  /** Statically compile this function to C source code. This is
947  * useful for providing fallback code paths that will compile on
948  * many platforms. Vectorization will fail, and parallelization
949  * will produce serial code. */
950  void compile_to_c(const std::string &filename,
951  const std::vector<Argument> &,
952  const std::string &fn_name = "",
953  const Target &target = get_target_from_environment());
954 
955  /** Write out an internal representation of lowered code. Useful
956  * for analyzing and debugging scheduling. Can emit html or plain
957  * text. */
958  void compile_to_lowered_stmt(const std::string &filename,
959  const std::vector<Argument> &args,
960  StmtOutputFormat fmt = Text,
961  const Target &target = get_target_from_environment());
962 
963  /** Write out the loop nests specified by the schedule for this
964  * Function. Helpful for understanding what a schedule is
965  * doing. */
966  void print_loop_nest();
967 
968  /** Compile to object file and header pair, with the given
969  * arguments. The name defaults to the same name as this halide
970  * function.
971  */
972  void compile_to_file(const std::string &filename_prefix, const std::vector<Argument> &args,
973  const std::string &fn_name = "",
974  const Target &target = get_target_from_environment());
975 
976  /** Compile to static-library file and header pair, with the given
977  * arguments. The name defaults to the same name as this halide
978  * function.
979  */
980  void compile_to_static_library(const std::string &filename_prefix, const std::vector<Argument> &args,
981  const std::string &fn_name = "",
982  const Target &target = get_target_from_environment());
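 /* Example (a minimal sketch; the names "brighten" and the argument list are
  * placeholders, and Vars x and y are assumed): ahead-of-time compiling a
  * pipeline to a static library plus header:
  \code
  ImageParam input(UInt(8), 2);
  Param<uint8_t> offset;
  Func brighten;
  brighten(x, y) = input(x, y) + offset;
  brighten.compile_to_static_library("brighten", {input, offset}, "brighten");
  \endcode
  */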
983 
984  /** Compile to static-library file and header pair once for each target;
985  * each resulting function will be considered (in order) via halide_can_use_target_features()
986  * at runtime, with the first appropriate match being selected for subsequent use.
987  * This is typically useful for specializations that may vary unpredictably by machine
988  * (e.g., SSE4.1/AVX/AVX2 on x86 desktop machines).
989  * All targets must have identical arch-os-bits.
990  */
991  void compile_to_multitarget_static_library(const std::string &filename_prefix,
992  const std::vector<Argument> &args,
993  const std::vector<Target> &targets);
994 
995  /** Like compile_to_multitarget_static_library(), except that the object files
996  * are all output as object files (rather than bundled into a static library).
997  *
998  * `suffixes` is an optional list of strings to use for as the suffix for each object
999  * file. If nonempty, it must be the same length as `targets`. (If empty, Target::to_string()
1000  * will be used for each suffix.)
1001  *
1002  * Note that if `targets.size()` > 1, the wrapper code (to select the subtarget)
1003  * will be generated with the filename `${filename_prefix}_wrapper.o`
1004  *
1005  * Note that if `targets.size()` > 1 and `no_runtime` is not specified, the runtime
1006  * will be generated with the filename `${filename_prefix}_runtime.o`
1007  */
1008  void compile_to_multitarget_object_files(const std::string &filename_prefix,
1009  const std::vector<Argument> &args,
1010  const std::vector<Target> &targets,
1011  const std::vector<std::string> &suffixes);
1012 
1013  /** Store an internal representation of lowered code as a self
1014  * contained Module suitable for further compilation. */
1015  Module compile_to_module(const std::vector<Argument> &args, const std::string &fn_name = "",
1016  const Target &target = get_target_from_environment());
1017 
1018  /** Compile and generate multiple target files with single call.
1019  * Deduces target files based on filenames specified in
1020  * output_files map.
1021  */
1022  void compile_to(const std::map<OutputFileType, std::string> &output_files,
1023  const std::vector<Argument> &args,
1024  const std::string &fn_name,
1025  const Target &target = get_target_from_environment());
1026 
1027  /** Eagerly jit compile the function to machine code. This
1028  * normally happens on the first call to realize. If you're
1029  * running your halide pipeline inside time-sensitive code and
1030  * wish to avoid including the time taken to compile a pipeline,
1031  * then you can call this ahead of time. Default is to use the Target
1032  * returned from Halide::get_jit_target_from_environment()
1033  */
1034  void compile_jit(const Target &target = get_jit_target_from_environment());
1035 
1036  /** Get a struct containing the currently set custom functions
1037  * used by JIT. This can be mutated. Changes will take effect the
1038  * next time this Func is realized. */
1039  JITHandlers &jit_handlers();
1040 
1041  /** Eagerly jit compile the function to machine code and return a callable
1042  * struct that behaves like a function pointer. The calling convention
1043  * will exactly match that of an AOT-compiled version of this Func
1044  * with the same Argument list.
1045  */
1046  Callable compile_to_callable(const std::vector<Argument> &args,
1047  const Target &target = get_jit_target_from_environment());
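 /* Example (a minimal sketch; assumes a Var x): jit-compiling to a Callable
  * and invoking it like a function, which avoids the global Param/ImageParam
  * state that realize() relies on:
  \code
  ImageParam in(Int(32), 1);
  Func f;
  f(x) = in(x) * 2;
  Callable c = f.compile_to_callable({in});
  Buffer<int> input(100), output(100);
  c(input, output);  // inputs first, then output buffers, as in AOT code
  \endcode
  */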
1048 
1049  /** Add a custom pass to be used during lowering. It is run after
1050  * all other lowering passes. Can be used to verify properties of
1051  * the lowered Stmt, instrument it with extra code, or otherwise
1052  * modify it. The Func takes ownership of the pass, and will call
1053  * delete on it when the Func goes out of scope. So don't pass a
1054  * stack object, or share pass instances between multiple
1055  * Funcs. */
1056  template<typename T>
 1057  void add_custom_lowering_pass(T *pass) {
 1058  // Template instantiate a custom deleter for this type, then
1059  // wrap in a lambda. The custom deleter lives in user code, so
1060  // that deletion is on the same heap as construction (I hate Windows).
1061  add_custom_lowering_pass(pass, [pass]() { delete_lowering_pass<T>(pass); });
1062  }
1063 
1064  /** Add a custom pass to be used during lowering, with the
1065  * function that will be called to delete it also passed in. Set
1066  * it to nullptr if you wish to retain ownership of the object. */
1067  void add_custom_lowering_pass(Internal::IRMutator *pass, std::function<void()> deleter);
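 /* Example (a minimal sketch; a real pass would override IRMutator visit
  * methods to inspect or rewrite the lowered Stmt): installing a do-nothing
  * custom lowering pass on a Func f:
  \code
  class MyPass : public Internal::IRMutator {
      // override visit() overloads here
  };
  f.add_custom_lowering_pass(new MyPass);  // the Func takes ownership
  \endcode
  */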
1068 
1069  /** Remove all previously-set custom lowering passes */
 1070  void clear_custom_lowering_passes();
 1071 
1072  /** Get the custom lowering passes. */
1073  const std::vector<CustomLoweringPass> &custom_lowering_passes();
1074 
1075  /** When this function is compiled, include code that dumps its
1076  * values to a file after it is realized, for the purpose of
1077  * debugging.
1078  *
1079  * If filename ends in ".tif" or ".tiff" (case insensitive) the file
 1080  * is in TIFF format and can be read by standard tools. Otherwise, the
1081  * file format is as follows:
1082  *
1083  * All data is in the byte-order of the target platform. First, a
1084  * 20 byte-header containing four 32-bit ints, giving the extents
1085  * of the first four dimensions. Dimensions beyond four are
1086  * folded into the fourth. Then, a fifth 32-bit int giving the
1087  * data type of the function. The typecodes are given by: float =
1088  * 0, double = 1, uint8_t = 2, int8_t = 3, uint16_t = 4, int16_t =
1089  * 5, uint32_t = 6, int32_t = 7, uint64_t = 8, int64_t = 9. The
1090  * data follows the header, as a densely packed array of the given
1091  * size and the given type. If given the extension .tmp, this file
1092  * format can be natively read by the program ImageStack. */
1093  void debug_to_file(const std::string &filename);
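 /* Example (a minimal sketch, assuming a Func f scheduled compute_root):
  * dumping the realized values to a .tmp file in the format described above:
  \code
  f.compute_root().debug_to_file("f.tmp");
  \endcode
  */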
1094 
1095  /** The name of this function, either given during construction,
1096  * or automatically generated. */
1097  const std::string &name() const;
1098 
1099  /** Get the pure arguments. */
1100  std::vector<Var> args() const;
1101 
1102  /** The right-hand-side value of the pure definition of this
1103  * function. Causes an error if there's no pure definition, or if
1104  * the function is defined to return multiple values. */
1105  Expr value() const;
1106 
1107  /** The values returned by this function. An error if the function
 1108  * has not been defined. Returns a Tuple with one element for
1109  * functions defined to return a single value. */
1110  Tuple values() const;
1111 
 1112  /** Does this function have at least a pure definition? */
1113  bool defined() const;
1114 
1115  /** Get the left-hand-side of the update definition. An empty
1116  * vector if there's no update definition. If there are
1117  * multiple update definitions for this function, use the
1118  * argument to select which one you want. */
1119  const std::vector<Expr> &update_args(int idx = 0) const;
1120 
1121  /** Get the right-hand-side of an update definition. An error if
1122  * there's no update definition. If there are multiple
1123  * update definitions for this function, use the argument to
1124  * select which one you want. */
1125  Expr update_value(int idx = 0) const;
1126 
1127  /** Get the right-hand-side of an update definition for
 1128  * functions that return multiple values. An error if there's no
1129  * update definition. Returns a Tuple with one element for
1130  * functions that return a single value. */
1131  Tuple update_values(int idx = 0) const;
1132 
1133  /** Get the RVars of the reduction domain for an update definition, if there is
1134  * one. */
1135  std::vector<RVar> rvars(int idx = 0) const;
1136 
1137  /** Does this function have at least one update definition? */
1138  bool has_update_definition() const;
1139 
1140  /** How many update definitions does this function have? */
1141  int num_update_definitions() const;
1142 
1143  /** Is this function an external stage? That is, was it defined
1144  * using define_extern? */
1145  bool is_extern() const;
1146 
1147  /** Add an extern definition for this Func. This lets you define a
1148  * Func that represents an external pipeline stage. You can, for
1149  * example, use it to wrap a call to an extern library such as
1150  * fftw. */
1151  // @{
1152  void define_extern(const std::string &function_name,
1153  const std::vector<ExternFuncArgument> &params, Type t,
1154  int dimensionality,
 1155  NameMangling mangling = NameMangling::Default,
 1156  DeviceAPI device_api = DeviceAPI::Host) {
1157  define_extern(function_name, params, t,
1158  Internal::make_argument_list(dimensionality), mangling,
1159  device_api);
1160  }
1161 
1162  void define_extern(const std::string &function_name,
1163  const std::vector<ExternFuncArgument> &params,
1164  const std::vector<Type> &types, int dimensionality,
1165  NameMangling mangling) {
1166  define_extern(function_name, params, types,
1167  Internal::make_argument_list(dimensionality), mangling);
1168  }
1169 
1170  void define_extern(const std::string &function_name,
1171  const std::vector<ExternFuncArgument> &params,
1172  const std::vector<Type> &types, int dimensionality,
 1173  NameMangling mangling = NameMangling::Default,
 1174  DeviceAPI device_api = DeviceAPI::Host) {
1175  define_extern(function_name, params, types,
1176  Internal::make_argument_list(dimensionality), mangling,
1177  device_api);
1178  }
1179 
1180  void define_extern(const std::string &function_name,
1181  const std::vector<ExternFuncArgument> &params, Type t,
1182  const std::vector<Var> &arguments,
 1183  NameMangling mangling = NameMangling::Default,
 1184  DeviceAPI device_api = DeviceAPI::Host) {
1185  define_extern(function_name, params, std::vector<Type>{t}, arguments,
1186  mangling, device_api);
1187  }
1188 
1189  void define_extern(const std::string &function_name,
1190  const std::vector<ExternFuncArgument> &params,
1191  const std::vector<Type> &types,
1192  const std::vector<Var> &arguments,
 1193  NameMangling mangling = NameMangling::Default,
 1194  DeviceAPI device_api = DeviceAPI::Host);
1195  // @}
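 /* Example (a minimal sketch; "my_sort_rows" is a placeholder for a
  * user-provided C function that follows the extern stage calling convention,
  * receiving halide_buffer_t pointers for both the bounds query and the actual
  * computation; Vars x and y are assumed): wrapping an external routine as a
  * pipeline stage:
  \code
  Func producer, sorted;
  producer(x, y) = x + y;
  sorted.define_extern("my_sort_rows", {producer}, Int(32), 2);
  \endcode
  */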
1196 
1197  /** Get the type(s) of the outputs of this Func.
1198  *
 1199  * It is not legal to call type() unless the Func has a single (non-Tuple) output.
1200  *
1201  * If the Func isn't yet defined, and was not specified with required types,
1202  * a runtime error will occur.
1203  *
1204  * If the Func isn't yet defined, but *was* specified with required types,
1205  * the requirements will be returned. */
1206  // @{
1207  const Type &type() const;
1208  const std::vector<Type> &types() const;
1209  // @}
1210 
1211  /** Get the number of outputs of this Func. Corresponds to the
1212  * size of the Tuple this Func was defined to return.
1213  * If the Func isn't yet defined, but was specified with required types,
1214  * the number of outputs specified in the requirements will be returned. */
1215  int outputs() const;
1216 
1217  /** Get the name of the extern function called for an extern
1218  * definition. */
1219  const std::string &extern_function_name() const;
1220 
1221  /** The dimensionality (number of arguments) of this function.
1222  * If the Func isn't yet defined, but was specified with required dimensionality,
1223  * the dimensionality specified in the requirements will be returned. */
1224  int dimensions() const;
1225 
1226  /** Construct either the left-hand-side of a definition, or a call
 1227  * to a function that happens to only contain vars as
1228  * arguments. If the function has already been defined, and fewer
1229  * arguments are given than the function has dimensions, then
1230  * enough implicit vars are added to the end of the argument list
1231  * to make up the difference (see \ref Var::implicit) */
1232  // @{
1233  FuncRef operator()(std::vector<Var>) const;
1234 
1235  template<typename... Args>
 1236  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, FuncRef>::type
 1237  operator()(Args &&...args) const {
1238  std::vector<Var> collected_args{std::forward<Args>(args)...};
1239  return this->operator()(collected_args);
1240  }
1241  // @}
1242 
1243  /** Either calls to the function, or the left-hand-side of
1244  * an update definition (see \ref RDom). If the function has
1245  * already been defined, and fewer arguments are given than the
1246  * function has dimensions, then enough implicit vars are added to
1247  * the end of the argument list to make up the difference. (see
1248  * \ref Var::implicit)*/
1249  // @{
1250  FuncRef operator()(std::vector<Expr>) const;
1251 
1252  template<typename... Args>
 1253  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Expr, Args...>::value, FuncRef>::type
 1254  operator()(const Expr &x, Args &&...args) const {
1255  std::vector<Expr> collected_args{x, std::forward<Args>(args)...};
1256  return (*this)(collected_args);
1257  }
1258  // @}
1259 
1260  /** Creates and returns a new identity Func that wraps this Func. During
1261  * compilation, Halide replaces all calls to this Func done by 'f'
1262  * with calls to the wrapper. If this Func is already wrapped for
1263  * use in 'f', will return the existing wrapper.
1264  *
1265  * For example, g.in(f) would rewrite a pipeline like this:
1266  \code
1267  g(x, y) = ...
1268  f(x, y) = ... g(x, y) ...
1269  \endcode
1270  * into a pipeline like this:
1271  \code
1272  g(x, y) = ...
1273  g_wrap(x, y) = g(x, y)
1274  f(x, y) = ... g_wrap(x, y)
1275  \endcode
1276  *
1277  * This has a variety of uses. You can use it to schedule this
1278  * Func differently in the different places it is used:
1279  \code
1280  g(x, y) = ...
1281  f1(x, y) = ... g(x, y) ...
1282  f2(x, y) = ... g(x, y) ...
1283  g.in(f1).compute_at(f1, y).vectorize(x, 8);
1284  g.in(f2).compute_at(f2, x).unroll(x);
1285  \endcode
1286  *
1287  * You can also use it to stage loads from this Func via some
1288  * intermediate buffer (perhaps on the stack as in
1289  * test/performance/block_transpose.cpp, or in shared GPU memory
1290  * as in test/performance/wrap.cpp). In this we compute the
1291  * wrapper at tiles of the consuming Funcs like so:
1292  \code
1293  g.compute_root()...
1294  g.in(f).compute_at(f, tiles)...
1295  \endcode
1296  *
1297  * Func::in() can also be used to compute pieces of a Func into a
1298  * smaller scratch buffer (perhaps on the GPU) and then copy them
1299  * into a larger output buffer one tile at a time. See
1300  * apps/interpolate/interpolate.cpp for an example of this. In
1301  * this case we compute the Func at tiles of its own wrapper:
1302  \code
1303  f.in(g).compute_root().gpu_tile(...)...
1304  f.compute_at(f.in(g), tiles)...
1305  \endcode
1306  *
 1307  * A similar use of Func::in() is wrapping Funcs with multiple update
 1308  * stages in a pure wrapper. The following code:
1309  \code
1310  f(x, y) = x + y;
1311  f(x, y) += 5;
1312  g(x, y) = f(x, y);
1313  f.compute_root();
1314  \endcode
1315  *
1316  * Is equivalent to:
1317  \code
1318  for y:
1319  for x:
1320  f(x, y) = x + y;
1321  for y:
1322  for x:
1323  f(x, y) += 5
1324  for y:
1325  for x:
1326  g(x, y) = f(x, y)
1327  \endcode
1328  * using Func::in(), we can write:
1329  \code
1330  f(x, y) = x + y;
1331  f(x, y) += 5;
1332  g(x, y) = f(x, y);
1333  f.in(g).compute_root();
1334  \endcode
1335  * which instead produces:
1336  \code
1337  for y:
1338  for x:
1339  f(x, y) = x + y;
1340  f(x, y) += 5
1341  f_wrap(x, y) = f(x, y)
1342  for y:
1343  for x:
1344  g(x, y) = f_wrap(x, y)
1345  \endcode
1346  */
1347  Func in(const Func &f);
1348 
1349  /** Create and return an identity wrapper shared by all the Funcs in
1350  * 'fs'. If any of the Funcs in 'fs' already have a custom wrapper,
1351  * this will throw an error. */
1352  Func in(const std::vector<Func> &fs);
1353 
1354  /** Create and return a global identity wrapper, which wraps all calls to
1355  * this Func by any other Func. If a global wrapper already exists,
1356  * returns it. The global identity wrapper is only used by callers for
1357  * which no custom wrapper has been specified.
1358  */
1359  Func in();
1360 
1361  /** Similar to \ref Func::in; however, instead of replacing the call to
1362  * this Func with an identity Func that refers to it, this replaces the
1363  * call with a clone of this Func.
1364  *
1365  * For example, f.clone_in(g) would rewrite a pipeline like this:
1366  \code
1367  f(x, y) = x + y;
1368  g(x, y) = f(x, y) + 2;
1369  h(x, y) = f(x, y) - 3;
1370  \endcode
1371  * into a pipeline like this:
1372  \code
1373  f(x, y) = x + y;
1374  f_clone(x, y) = x + y;
1375  g(x, y) = f_clone(x, y) + 2;
1376  h(x, y) = f(x, y) - 3;
1377  \endcode
1378  *
1379  */
1380  //@{
1381  Func clone_in(const Func &f);
1382  Func clone_in(const std::vector<Func> &fs);
1383  //@}
1384 
1385  /** Declare that this function should be implemented by a call to
1386  * halide_buffer_copy with the given target device API. Asserts
1387  * that the Func has a pure definition which is a simple call to a
1388  * single input, and no update definitions. The wrapper Funcs
1389  * returned by in() are suitable candidates. Consumes all pure
1390  * variables, and rewrites the Func to have an extern definition
1391  * that calls halide_buffer_copy. */
1392  Func copy_to_device(DeviceAPI d = DeviceAPI::Default_GPU);
1393 
1394  /** Declare that this function should be implemented by a call to
1395  * halide_buffer_copy with a NULL target device API. Equivalent to
1396  * copy_to_device(DeviceAPI::Host). Asserts that the Func has a
1397  * pure definition which is a simple call to a single input, and
1398  * no update definitions. The wrapper Funcs returned by in() are
1399  * suitable candidates. Consumes all pure variables, and rewrites
1400  * the Func to have an extern definition that calls
1401  * halide_buffer_copy.
1402  *
1403  * Note that if the source Func is already valid in host memory,
1404  * this compiles to code that does the minimum number of calls to
1405  * memcpy.
1406  */
1407  Func copy_to_host();
1408 
1409  /** Split a dimension into inner and outer subdimensions with the
1410  * given names, where the inner dimension iterates from 0 to
1411  * factor-1. The inner and outer subdimensions can then be dealt
1412  * with using the other scheduling calls. It's ok to reuse the old
1413  * variable name as either the inner or outer variable. The final
1414  * argument specifies how the tail should be handled if the split
1415  * factor does not provably divide the extent. */
1416  Func &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
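 /** For illustration only: a minimal, hypothetical use of split(). The Func,
  * Vars, and factor below are assumed for the example and are not defined
  * elsewhere in this header.
  \code
  Func f;
  Var x, xo, xi;
  f(x) = x * 2;
  // The loop over x becomes an outer loop and an inner loop of extent 8;
  // tail iterations are guarded with an if.
  f.split(x, xo, xi, 8, TailStrategy::GuardWithIf);
  \endcode
  */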
1417 
1418  /** Join two dimensions into a single fused dimension. The fused dimension
1419  * covers the product of the extents of the inner and outer dimensions
1420  * given. The loop type (e.g. parallel, vectorized) of the resulting fused
1421  * dimension is inherited from the first argument. */
1422  Func &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
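 /** For illustration only: a hypothetical use of fuse(), collapsing x and y
  * into a single loop and parallelizing the result. The Func and Vars are
  * assumed for the example.
  \code
  Func f;
  Var x, y, xy;
  f(x, y) = x + y;
  // One loop of width*height iterations, run in parallel.
  f.fuse(x, y, xy).parallel(xy);
  \endcode
  */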
1423 
1424  /** Mark a dimension to be traversed serially. This is the default. */
1425  Func &serial(const VarOrRVar &var);
1426 
1427  /** Mark a dimension to be traversed in parallel */
1428  Func &parallel(const VarOrRVar &var);
1429 
1430  /** Split a dimension by the given task_size, and then parallelize the
1431  * outer dimension. This creates parallel tasks that have size
1432  * task_size. After this call, var refers to the outer dimension of
1433  * the split. The inner dimension has a new anonymous name. If you
1434  * wish to mutate it, or schedule with respect to it, do the split
1435  * manually. */
1436  Func &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
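 /** For illustration only: a hypothetical use of the two-argument parallel(),
  * running rows in parallel tasks of 16 scanlines each (names assumed for the
  * example).
  \code
  Func f;
  Var x, y;
  f(x, y) = x + y;
  // After this, y refers to the outer loop over groups of 16 rows;
  // each group is one parallel task.
  f.parallel(y, 16);
  \endcode
  */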
1437 
1438  /** Mark a dimension to be computed all-at-once as a single
1439  * vector. The dimension should have constant extent -
1440  * e.g. because it is the inner dimension following a split by a
1441  * constant factor. For most uses of vectorize you want the two
1442  * argument form. The variable to be vectorized should be the
1443  * innermost one. */
1444  Func &vectorize(const VarOrRVar &var);
1445 
1446  /** Mark a dimension to be completely unrolled. The dimension
1447  * should have constant extent - e.g. because it is the inner
1448  * dimension following a split by a constant factor. For most uses
1449  * of unroll you want the two-argument form. */
1450  Func &unroll(const VarOrRVar &var);
1451 
1452  /** Split a dimension by the given factor, then vectorize the
1453  * inner dimension. This is how you vectorize a loop of unknown
1454  * size. The variable to be vectorized should be the innermost
1455  * one. After this call, var refers to the outer dimension of the
1456  * split. 'factor' must be an integer. */
1457  Func &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
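 /** For illustration only: a hypothetical use of the two-argument vectorize()
  * on a loop of unknown size (names assumed for the example).
  \code
  Func f;
  Var x, y;
  f(x, y) = x + y;
  // Vectorize the innermost loop in chunks of 8 lanes.
  f.vectorize(x, 8);
  \endcode
  */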
1458 
1459  /** Split a dimension by the given factor, then unroll the inner
1460  * dimension. This is how you unroll a loop of unknown size by
1461  * some constant factor. After this call, var refers to the outer
1462  * dimension of the split. 'factor' must be an integer. */
1463  Func &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1464 
1465  /** Set the loop partition policy. Loop partitioning can be useful to
1466  * optimize boundary conditions (such as clamp_edge). Loop partitioning
1467  * splits a for loop into three for loops: a prologue, a steady-state,
1468  * and an epilogue.
1469  * The default policy is Auto. */
1470  Func &partition(const VarOrRVar &var, Partition partition_policy);
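 /** For illustration only: a hypothetical use of partition(), disabling loop
  * partitioning for a dimension that has no boundary condition worth
  * optimizing (names assumed for the example).
  \code
  Func f;
  Var x;
  f(x) = x * 3;
  f.partition(x, Partition::Never);
  \endcode
  */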
1471 
1472  /** Set the loop partition policy to Never for a vector of Vars and
1473  * RVars. */
1474  Func &never_partition(const std::vector<VarOrRVar> &vars);
1475 
1476  /** Set the loop partition policy to Never for some number of Vars and RVars. */
1477  template<typename... Args>
1478  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
1479  never_partition(const VarOrRVar &x, Args &&...args) {
1480  std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
1481  return never_partition(collected_args);
1482  }
1483 
1484  /** Set the loop partition policy to Never for all Vars and RVars of the
1485  * initial definition of the Func. It must be called separately on any
1486  * update definitions. */
1487  Func &never_partition_all();
1488 
1489  /** Set the loop partition policy to Always for a vector of Vars and
1490  * RVars. */
1491  Func &always_partition(const std::vector<VarOrRVar> &vars);
1492 
1493  /** Set the loop partition policy to Always for some number of Vars and RVars. */
1494  template<typename... Args>
1495  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
1496  always_partition(const VarOrRVar &x, Args &&...args) {
1497  std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
1498  return always_partition(collected_args);
1499  }
1500 
1501  /** Set the loop partition policy to Always for all Vars and RVars of the
1502  * initial definition of the Func. It must be called separately on any
1503  * update definitions. */
1504  Func &always_partition_all();
1505 
1506  /** Statically declare that the range over which a function should
1507  * be evaluated is given by the second and third arguments. This
1508  * can let Halide perform some optimizations. E.g. if you know
1509  * there are going to be 4 color channels, you can completely
1510  * vectorize the color channel dimension without the overhead of
1511  * splitting it up. If bounds inference decides that it requires
1512  * more of this function than the bounds you have stated, a
1513  * runtime error will occur when you try to run your pipeline. */
1514  Func &bound(const Var &var, Expr min, Expr extent);
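 /** For illustration only: a hypothetical use of bound(), promising that the
  * channel dimension is exactly [0, 3) so it can be fully unrolled (names
  * assumed for the example).
  \code
  Func f;
  Var x, y, c;
  f(x, y, c) = x + y + c;
  f.bound(c, 0, 3).unroll(c);
  \endcode
  */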
1515 
1516  /** Statically declare the range over which the function will be
1517  * evaluated in the general case. This provides a basis for the auto
1518  * scheduler to make trade-offs and scheduling decisions. The auto
1519  * generated schedules might break when the sizes of the dimensions are
1520  * very different from the estimates specified. These estimates are used
1521  * only by the auto scheduler if the function is a pipeline output. */
1522  Func &set_estimate(const Var &var, const Expr &min, const Expr &extent);
1523 
1524  /** Set (min, extent) estimates for all dimensions in the Func
1525  * at once; this is equivalent to calling `set_estimate(args()[n], min, extent)`
1526  * repeatedly, but slightly terser. The size of the estimates vector
1527  * must match the dimensionality of the Func. */
1528  Func &set_estimates(const Region &estimates);
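 /** For illustration only: a hypothetical use of set_estimates(), giving the
  * auto scheduler typical output dimensions of 1920x1080 (names assumed for
  * the example).
  \code
  Func f;
  Var x, y;
  f(x, y) = x + y;
  f.set_estimates({{0, 1920}, {0, 1080}});
  \endcode
  */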
1529 
1530  /** Expand the region computed so that the min coordinate is
1531  * congruent to 'remainder' modulo 'modulus', and the extent is a
1532  * multiple of 'modulus'. For example, f.align_bounds(x, 2) forces
1533  * the min and extent realized to be even, and calling
1534  * f.align_bounds(x, 2, 1) forces the min to be odd and the extent
1535  * to be even. The region computed always contains the region that
1536  * would have been computed without this directive, so no
1537  * assertions are injected.
1538  */
1539  Func &align_bounds(const Var &var, Expr modulus, Expr remainder = 0);
1540 
1541  /** Expand the region computed so that the extent is a
1542  * multiple of 'modulus'. For example, f.align_extent(x, 2) forces
1543  * the extent realized to be even. The region computed always contains the
1544  * region that would have been computed without this directive, so no
1545  * assertions are injected. (This is essentially equivalent to align_bounds(),
1546  * but always leaving the min untouched.)
1547  */
1548  Func &align_extent(const Var &var, Expr modulus);
1549 
1550  /** Bound the extent of a Func's realization, but not its
1551  * min. This means the dimension can be unrolled or vectorized
1552  * even when its min is not fixed (for example because it is
1553  * compute_at tiles of another Func). This can also be useful for
1554  * forcing a function's allocation to be a fixed size, which often
1555  * means it can go on the stack. */
1556  Func &bound_extent(const Var &var, Expr extent);
1557 
1558  /** Split two dimensions at once by the given factors, and then
1559  * reorder the resulting dimensions to be xi, yi, xo, yo from
1560  * innermost outwards. This gives a tiled traversal. */
1561  Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1562  const VarOrRVar &xo, const VarOrRVar &yo,
1563  const VarOrRVar &xi, const VarOrRVar &yi,
1564  const Expr &xfactor, const Expr &yfactor,
1565  TailStrategy tail = TailStrategy::Auto);
1566 
1567  /** A shorter form of tile, which reuses the old variable names as
1568  * the new outer dimensions */
1569  Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1570  const VarOrRVar &xi, const VarOrRVar &yi,
1571  const Expr &xfactor, const Expr &yfactor,
1572  TailStrategy tail = TailStrategy::Auto);
1573 
1574  /** A more general form of tile, which defines tiles of any dimensionality. */
1575  Func &tile(const std::vector<VarOrRVar> &previous,
1576  const std::vector<VarOrRVar> &outers,
1577  const std::vector<VarOrRVar> &inners,
1578  const std::vector<Expr> &factors,
1579  const std::vector<TailStrategy> &tails);
1580 
1581  /** The generalized tile, with a single tail strategy to apply to all vars. */
1582  Func &tile(const std::vector<VarOrRVar> &previous,
1583  const std::vector<VarOrRVar> &outers,
1584  const std::vector<VarOrRVar> &inners,
1585  const std::vector<Expr> &factors,
1586  TailStrategy tail = TailStrategy::Auto);
1587 
1588  /** Generalized tiling, reusing the previous names as the outer names. */
1589  Func &tile(const std::vector<VarOrRVar> &previous,
1590  const std::vector<VarOrRVar> &inners,
1591  const std::vector<Expr> &factors,
1592  TailStrategy tail = TailStrategy::Auto);
1593 
1594  /** Reorder variables to have the given nesting order, from
1595  * innermost out */
1596  Func &reorder(const std::vector<VarOrRVar> &vars);
1597 
1598  template<typename... Args>
1599  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
1600  reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
1601  std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
1602  return reorder(collected_args);
1603  }
1604 
1605  /** Rename a dimension. Equivalent to split with an inner size of one. */
1606  Func &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
1607 
1608  /** Specify that race conditions are permitted for this Func,
1609  * which enables parallelizing over RVars even when Halide cannot
1610  * prove that it is safe to do so. Use this with great caution,
1611  * and only if you can prove to yourself that this is safe, as it
1612  * may result in a non-deterministic routine that returns
1613  * different values at different times or on different machines. */
1614  Func &allow_race_conditions();
1615 
1616  /** Issue atomic updates for this Func. This allows parallelization
1617  * on associative RVars. The function throws a compile error when
1618  * Halide fails to prove associativity. Use override_associativity_test
1619  * to disable the associativity test if you believe the function is
1620  * associative or the order of reduction variable execution does not
1621  * matter.
1622  * Halide compiles this into hardware atomic operations whenever possible,
1623  * and falls back to a mutex lock per storage element if it is impossible
1624  * to atomically update.
1625  * There are three possible outcomes of the compiled code:
1626  * atomic add, compare-and-swap loop, and mutex lock.
1627  * For example:
1628  *
1629  * hist(x) = 0;
1630  * hist(im(r)) += 1;
1631  * hist.compute_root();
1632  * hist.update().atomic().parallel();
1633  *
1634  * will be compiled to atomic add operations.
1635  *
1636  * hist(x) = 0;
1637  * hist(im(r)) = min(hist(im(r)) + 1, 100);
1638  * hist.compute_root();
1639  * hist.update().atomic().parallel();
1640  *
1641  * will be compiled to compare-and-swap loops.
1642  *
1643  * arg_max() = {0, im(0)};
1644  * Expr old_index = arg_max()[0];
1645  * Expr old_max = arg_max()[1];
1646  * Expr new_index = select(old_max < im(r), r, old_index);
1647  * Expr new_max = max(im(r), old_max);
1648  * arg_max() = {new_index, new_max};
1649  * arg_max.compute_root();
1650  * arg_max.update().atomic().parallel();
1651  *
1652  * will be compiled to updates guarded by a mutex lock,
1653  * since it is impossible to atomically update two different locations.
1654  *
1655  * Currently the atomic operation is supported by x86, CUDA, and OpenCL backends.
1656  * Compiling to other backends results in a compile error.
1657  * If an operation is compiled into a mutex lock, and is vectorized or is
1658  * compiled to CUDA or OpenCL, it also results in a compile error,
1659  * since a per-element mutex lock on a vectorized operation leads to
1660  * deadlock.
1661  * Vectorization of predicated RVars (through rdom.where()) on the CPU
1662  * is also not yet supported (see https://github.com/halide/Halide/issues/4298).
1663  * 8-bit and 16-bit atomics on GPU are also not supported. */
1664  Func &atomic(bool override_associativity_test = false);
1665 
1666  /** Specialize a Func. This creates a special-case version of the
1667  * Func where the given condition is true. The most effective
1668  * conditions are those of the form param == value, and boolean
1669  * Params. Consider a simple example:
1670  \code
1671  f(x) = x + select(cond, 0, 1);
1672  f.compute_root();
1673  \endcode
1674  * This is equivalent to:
1675  \code
1676  for (int x = 0; x < width; x++) {
1677  f[x] = x + (cond ? 0 : 1);
1678  }
1679  \endcode
1680  * Adding the scheduling directive:
1681  \code
1682  f.specialize(cond)
1683  \endcode
1684  * makes it equivalent to:
1685  \code
1686  if (cond) {
1687  for (int x = 0; x < width; x++) {
1688  f[x] = x;
1689  }
1690  } else {
1691  for (int x = 0; x < width; x++) {
1692  f[x] = x + 1;
1693  }
1694  }
1695  \endcode
1696  * Note that the inner loops have been simplified. In the first
1697  * path Halide knows that cond is true, and in the second path
1698  * Halide knows that it is false.
1699  *
1700  * The specialized version gets its own schedule, which inherits
1701  * every directive made about the parent Func's schedule so far
1702  * except for its specializations. This method returns a handle to
1703  * the new schedule. If you wish to retrieve the specialized
1704  * sub-schedule again later, you can call this method with the
1705  * same condition. Consider the following example of scheduling
1706  * the specialized version:
1707  *
1708  \code
1709  f(x) = x;
1710  f.compute_root();
1711  f.specialize(width > 1).unroll(x, 2);
1712  \endcode
1713  * Assuming for simplicity that width is even, this is equivalent to:
1714  \code
1715  if (width > 1) {
1716  for (int x = 0; x < width/2; x++) {
1717  f[2*x] = 2*x;
1718  f[2*x + 1] = 2*x + 1;
1719  }
1720  } else {
1721  for (int x = 0; x < width/2; x++) {
1722  f[x] = x;
1723  }
1724  }
1725  \endcode
1726  * For this case, it may be better to schedule the un-specialized
1727  * case instead:
1728  \code
1729  f(x) = x;
1730  f.compute_root();
1731  f.specialize(width == 1); // Creates a copy of the schedule so far.
1732  f.unroll(x, 2); // Only applies to the unspecialized case.
1733  \endcode
1734  * This is equivalent to:
1735  \code
1736  if (width == 1) {
1737  f[0] = 0;
1738  } else {
1739  for (int x = 0; x < width/2; x++) {
1740  f[2*x] = 2*x;
1741  f[2*x + 1] = 2*x + 1;
1742  }
1743  }
1744  \endcode
1745  * This can be a good way to write a pipeline that splits,
1746  * vectorizes, or tiles, but can still handle small inputs.
1747  *
1748  * If a Func has several specializations, the first matching one
1749  * will be used, so the order in which you define specializations
1750  * is significant. For example:
1751  *
1752  \code
1753  f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1754  f.specialize(cond1);
1755  f.specialize(cond2);
1756  \endcode
1757  * is equivalent to:
1758  \code
1759  if (cond1) {
1760  for (int x = 0; x < width; x++) {
1761  f[x] = x + a - (cond2 ? c : d);
1762  }
1763  } else if (cond2) {
1764  for (int x = 0; x < width; x++) {
1765  f[x] = x + b - c;
1766  }
1767  } else {
1768  for (int x = 0; x < width; x++) {
1769  f[x] = x + b - d;
1770  }
1771  }
1772  \endcode
1773  *
1774  * Specializations may in turn be specialized, which creates a
1775  * nested if statement in the generated code.
1776  *
1777  \code
1778  f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1779  f.specialize(cond1).specialize(cond2);
1780  \endcode
1781  * This is equivalent to:
1782  \code
1783  if (cond1) {
1784  if (cond2) {
1785  for (int x = 0; x < width; x++) {
1786  f[x] = x + a - c;
1787  }
1788  } else {
1789  for (int x = 0; x < width; x++) {
1790  f[x] = x + a - d;
1791  }
1792  }
1793  } else {
1794  for (int x = 0; x < width; x++) {
1795  f[x] = x + b - (cond2 ? c : d);
1796  }
1797  }
1798  \endcode
1799  * To create a 4-way if statement that simplifies away all of the
1800  * ternary operators above, you could say:
1801  \code
1802  f.specialize(cond1).specialize(cond2);
1803  f.specialize(cond2);
1804  \endcode
1805  * or
1806  \code
1807  f.specialize(cond1 && cond2);
1808  f.specialize(cond1);
1809  f.specialize(cond2);
1810  \endcode
1811  *
1812  * Any prior Func which is compute_at some variable of this Func
1813  * gets separately included in all paths of the generated if
1814  * statement. The Var in the compute_at call must exist in all
1815  * paths, but it may have been generated via a different path of
1816  * splits, fuses, and renames. This can be used somewhat
1817  * creatively. Consider the following code:
1818  \code
1819  g(x, y) = 8*x;
1820  f(x, y) = g(x, y) + 1;
1821  f.compute_root().specialize(cond);
1822  Var g_loop;
1823  f.specialize(cond).rename(y, g_loop);
1824  f.rename(x, g_loop);
1825  g.compute_at(f, g_loop);
1826  \endcode
1827  * When cond is true, this is equivalent to g.compute_at(f,y).
1828  * When it is false, this is equivalent to g.compute_at(f,x).
1829  */
1830  Stage specialize(const Expr &condition);
1831 
1832  /** Add a specialization to a Func that always terminates execution
1833  * with a call to halide_error(). By itself, this is of limited use,
1834  * but can be useful to terminate chains of specialize() calls where
1835  * no "default" case is expected (thus avoiding unnecessary code generation).
1836  *
1837  * For instance, say we want to optimize a pipeline to process images
1838  * in planar and interleaved format; we might typically do something like:
1839  \code
1840  ImageParam im(UInt(8), 3);
1841  Func f = do_something_with(im);
1842  f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1843  f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1844  \endcode
1845  * This code will vectorize along rows for the planar case, and across pixel
1846  * components for the interleaved case... but there is an implicit "else"
1847  * for the unhandled cases, which generates unoptimized code. If we never
1848  * anticipate passing any other sort of images to this, we can streamline
1849  * our code by adding specialize_fail():
1850  \code
1851  ImageParam im(UInt(8), 3);
1852  Func f = do_something(im);
1853  f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1854  f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1855  f.specialize_fail("Unhandled image format");
1856  \endcode
1857  * Conceptually, this produces code like:
1858  \code
1859  if (im.dim(0).stride() == 1) {
1860  do_something_planar();
1861  } else if (im.dim(2).stride() == 1) {
1862  do_something_interleaved();
1863  } else {
1864  halide_error("Unhandled image format");
1865  }
1866  \endcode
1867  *
1868  * Note that calling specialize_fail() terminates the specialization chain
1869  * for a given Func; you cannot create new specializations for the Func
1870  * afterwards (though you can retrieve handles to previous specializations).
1871  */
1872  void specialize_fail(const std::string &message);
1873 
1874  /** Tell Halide that the following dimensions correspond to GPU
1875  * thread indices. This is useful if you compute a producer
1876  * function within the block indices of a consumer function, and
1877  * want to control how that function's dimensions map to GPU
1878  * threads. If the selected target is not an appropriate GPU, this
1879  * just marks those dimensions as parallel. */
1880  // @{
1881  Func &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1882  Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1883  Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1884  // @}
1885 
1886  /** The given dimension corresponds to the lanes in a GPU
1887  * warp. GPU warp lanes are distinguished from GPU threads by the
1888  * fact that all warp lanes run together in lockstep, which
1889  * permits lightweight communication of data from one lane to
1890  * another. */
1891  Func &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1892 
1893  /** Tell Halide to run this stage using a single gpu thread and
1894  * block. This is not an efficient use of your GPU, but it can be
1895  * useful to avoid copy-back for intermediate update stages that
1896  * touch a very small part of your Func. */
1897  Func &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);
1898 
1899  /** Tell Halide that the following dimensions correspond to GPU
1900  * block indices. This is useful for scheduling stages that will
1901  * run serially within each GPU block. If the selected target is
1902  * not ptx, this just marks those dimensions as parallel. */
1903  // @{
1904  Func &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1905  Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1906  Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1907  // @}
1908 
1909  /** Tell Halide that the following dimensions correspond to GPU
1910  * block indices and thread indices. If the selected target is not
1911  * ptx, these just mark the given dimensions as parallel. The
1912  * dimensions are consumed by this call, so do all other
1913  * unrolling, reordering, etc first. */
1914  // @{
1915  Func &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1916  Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
1917  const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1918  Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
1919  const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1920  // @}
1921 
1922  /** Short-hand for tiling a domain and mapping the tile indices
1923  * to GPU block indices and the coordinates within each tile to
1924  * GPU thread indices. Consumes the variables given, so do all
1925  * other scheduling first. */
1926  // @{
1927  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
1928  TailStrategy tail = TailStrategy::Auto,
1929  DeviceAPI device_api = DeviceAPI::Default_GPU);
1930 
1931  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
1932  TailStrategy tail = TailStrategy::Auto,
1933  DeviceAPI device_api = DeviceAPI::Default_GPU);
1934  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1935  const VarOrRVar &bx, const VarOrRVar &by,
1936  const VarOrRVar &tx, const VarOrRVar &ty,
1937  const Expr &x_size, const Expr &y_size,
1938  TailStrategy tail = TailStrategy::Auto,
1939  DeviceAPI device_api = DeviceAPI::Default_GPU);
1940 
1941  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1942  const VarOrRVar &tx, const VarOrRVar &ty,
1943  const Expr &x_size, const Expr &y_size,
1944  TailStrategy tail = TailStrategy::Auto,
1945  DeviceAPI device_api = DeviceAPI::Default_GPU);
1946 
1947  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1948  const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
1949  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1950  const Expr &x_size, const Expr &y_size, const Expr &z_size,
1951  TailStrategy tail = TailStrategy::Auto,
1952  DeviceAPI device_api = DeviceAPI::Default_GPU);
1953  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1954  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1955  const Expr &x_size, const Expr &y_size, const Expr &z_size,
1956  TailStrategy tail = TailStrategy::Auto,
1957  DeviceAPI device_api = DeviceAPI::Default_GPU);
1958  // @}
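 /** For illustration only: a hypothetical use of gpu_tile(), mapping 16x16
  * tiles to GPU blocks and the coordinates within each tile to GPU threads
  * (names assumed for the example).
  \code
  Func f;
  Var x, y, bx, by, tx, ty;
  f(x, y) = x + y;
  f.gpu_tile(x, y, bx, by, tx, ty, 16, 16);
  \endcode
  */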
1959 
1960  /** Schedule for execution on Hexagon. When a loop is marked with
1961  * Hexagon, that loop is executed on a Hexagon DSP. */
1962  Func &hexagon(const VarOrRVar &x = Var::outermost());
1963 
1964  /** Prefetch data written to or read from a Func or an ImageParam by a
1965  * subsequent loop iteration, at an optionally specified iteration offset. You may
1966  * specify different vars for the location of the prefetch() instruction
1967  * vs. the location that is being prefetched:
1968  *
1969  * - the first var specified, 'at', indicates the loop in which the prefetch will be placed
1970  * - the second var specified, 'from', determines the var used to find the bounds to prefetch
1971  * (in conjunction with 'offset')
1972  *
1973  * If 'at' and 'from' are distinct vars, then 'from' must be at a nesting level outside 'at.'
1974  * Note that the value for 'offset' applies only to 'from', not 'at'.
1975  *
1976  * The final argument specifies how prefetch of region outside bounds
1977  * should be handled.
1978  *
1979  * For example, consider this pipeline:
1980  \code
1981  Func f, g, h;
1982  Var x, y, z;
1983  f(x, y) = x + y;
1984  g(x, y) = 2 * f(x, y);
1985  h(x, y) = 3 * f(x, y);
1986  \endcode
1987  *
1988  * The following schedule:
1989  \code
1990  f.compute_root();
1991  g.prefetch(f, x, x, 2, PrefetchBoundStrategy::NonFaulting);
1992  h.prefetch(f, x, y, 2, PrefetchBoundStrategy::NonFaulting);
1993  \endcode
1994  *
1995  * will inject prefetch call at the innermost loop of 'g' and 'h' and generate
1996  * the following loop nest:
1997  \code
1998  for y = ...
1999  for x = ...
2000  f(x, y) = x + y
2001  for y = ..
2002  for x = ...
2003  prefetch(&f[x + 2, y], 1, 16);
2004  g(x, y) = 2 * f(x, y)
2005  for y = ..
2006  for x = ...
2007  prefetch(&f[x, y + 2], 1, 16);
2008  h(x, y) = 3 * f(x, y)
2009  \endcode
2010  *
2011  * Note that the 'from' nesting level need not be adjacent to 'at':
2012  \code
2013  Func f, g;
2014  Var x, y, z, w;
2015  f(x, y, z, w) = x + y + z + w;
2016  g(x, y, z, w) = 2 * f(x, y, z, w);
2017  \endcode
2018  *
2019  * The following schedule:
2020  \code
2021  f.compute_root();
2022  g.prefetch(f, y, w, 2, PrefetchBoundStrategy::NonFaulting);
2023  \endcode
2024  *
2025  * will produce code that prefetches a tile of data:
2026  \code
2027  for w = ...
2028  for z = ...
2029  for y = ...
2030  for x = ...
2031  f(x, y, z, w) = x + y + z + w
2032  for w = ...
2033  for z = ...
2034  for y = ...
2035  for x0 = ...
2036  prefetch(&f[x0, y, z, w + 2], 1, 16);
2037  for x = ...
2038  g(x, y, z, w) = 2 * f(x, y, z, w)
2039  \endcode
2040  *
2041  * Note that calling prefetch() with the same var for both 'at' and 'from'
2042  * is equivalent to calling prefetch() with that var.
2043  */
2044  // @{
2045  Func &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2046  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
2047  Func &prefetch(const Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2048  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
2049  template<typename T>
2050  Func &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2051  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
2052  return prefetch(image.parameter(), at, from, std::move(offset), strategy);
2053  }
2054  // @}
2055 
2056  /** Specify how the storage for the function is laid out. These
2057  * calls let you specify the nesting order of the dimensions. For
2058  * example, foo.reorder_storage(y, x) tells Halide to use
2059  * column-major storage for any realizations of foo, without
2060  * changing how you refer to foo in the code. You may want to do
2061  * this if you intend to vectorize across y. When representing
2062  * color images, foo.reorder_storage(c, x, y) specifies packed
2063  * storage (red, green, and blue values adjacent in memory), and
2064  * foo.reorder_storage(x, y, c) specifies planar storage (entire
2065  * red, green, and blue images one after the other in memory).
2066  *
2067  * If you leave out some dimensions, those remain in the same
2068  * positions in the nesting order while the specified variables
2069  * are reordered around them. */
2070  // @{
2071  Func &reorder_storage(const std::vector<Var> &dims);
2072 
2073  Func &reorder_storage(const Var &x, const Var &y);
2074  template<typename... Args>
2075  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, Func &>::type
2076  reorder_storage(const Var &x, const Var &y, Args &&...args) {
2077  std::vector<Var> collected_args{x, y, std::forward<Args>(args)...};
2078  return reorder_storage(collected_args);
2079  }
2080  // @}
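 /** For illustration only: a hypothetical use of reorder_storage(), storing
  * color channels innermost (interleaved layout) without changing the loop
  * nest (names assumed for the example).
  \code
  Func f;
  Var x, y, c;
  f(x, y, c) = x + y + c;
  f.reorder_storage(c, x, y);
  \endcode
  */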
2081 
2082  /** Pad the storage extent of a particular dimension of
2083  * realizations of this function up to be a multiple of the
2084  * specified alignment. This guarantees that the strides for the
2085  * dimensions stored outside of dim will be multiples of the
2086  * specified alignment, where the strides and alignment are
2087  * measured in numbers of elements.
2088  *
2089  * For example, to guarantee that a function foo(x, y, c)
2090  * representing an image has scanlines starting on offsets
2091  * aligned to multiples of 16, use foo.align_storage(x, 16). */
2092  Func &align_storage(const Var &dim, const Expr &alignment);
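 /** For illustration only: a hypothetical use of align_storage(), rounding
  * each scanline of a root-computed Func up to a multiple of 16 elements
  * (names assumed for the example).
  \code
  Func f;
  Var x, y;
  f(x, y) = x + y;
  f.compute_root().align_storage(x, 16);
  \endcode
  */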
2093 
2094  /** Store realizations of this function in a circular buffer of a
2095  * given extent. This is more efficient when the extent of the
2096  * circular buffer is a power of 2. If the fold factor is too
2097  * small, or the dimension is not accessed monotonically, the
2098  * pipeline will generate an error at runtime.
2099  *
2100  * The fold_forward option indicates that the new values of the
2101  * producer are accessed by the consumer in a monotonically
2102  * increasing order. Folding storage of producers is also
2103  * supported if the new values are accessed in a monotonically
2104  * decreasing order by setting fold_forward to false.
2105  *
2106  * For example, consider the pipeline:
2107  \code
2108  Func f, g;
2109  Var x, y;
2110  g(x, y) = x*y;
2111  f(x, y) = g(x, y) + g(x, y+1);
2112  \endcode
2113  *
2114  * If we schedule f like so:
2115  *
2116  \code
2117  g.compute_at(f, y).store_root().fold_storage(y, 2);
2118  \endcode
2119  *
2120  * Then g will be computed at each row of f and stored in a buffer
2121  * with an extent in y of 2, alternately storing each computed row
2122  * of g in row y=0 or y=1.
2123  */
2124  Func &fold_storage(const Var &dim, const Expr &extent, bool fold_forward = true);
2125 
2126  /** Compute this function as needed for each unique value of the
2127  * given var for the given calling function f.
2128  *
2129  * For example, consider the simple pipeline:
2130  \code
2131  Func f, g;
2132  Var x, y;
2133  g(x, y) = x*y;
2134  f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2135  \endcode
2136  *
2137  * If we schedule f like so:
2138  *
2139  \code
2140  g.compute_at(f, x);
2141  \endcode
2142  *
2143  * Then the C code equivalent to this pipeline will look like this
2144  *
2145  \code
2146 
2147  int f[height][width];
2148  for (int y = 0; y < height; y++) {
2149  for (int x = 0; x < width; x++) {
2150  int g[2][2];
2151  g[0][0] = x*y;
2152  g[0][1] = (x+1)*y;
2153  g[1][0] = x*(y+1);
2154  g[1][1] = (x+1)*(y+1);
2155  f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
2156  }
2157  }
2158 
2159  \endcode
2160  *
2161  * The allocation and computation of g is within f's loop over x,
2162  * and enough of g is computed to satisfy all that f will need for
2163  * that iteration. This has excellent locality - values of g are
2164  * used as soon as they are computed, but it does redundant
2165  * work. Each value of g ends up getting computed four times. If
2166  * we instead schedule f like so:
2167  *
2168  \code
2169  g.compute_at(f, y);
2170  \endcode
2171  *
2172  * The equivalent C code is:
2173  *
2174  \code
2175  int f[height][width];
2176  for (int y = 0; y < height; y++) {
2177  int g[2][width+1];
2178  for (int x = 0; x < width; x++) {
2179  g[0][x] = x*y;
2180  g[1][x] = x*(y+1);
2181  }
2182  for (int x = 0; x < width; x++) {
2183  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2184  }
2185  }
2186  \endcode
2187  *
2188  * The allocation and computation of g is within f's loop over y,
2189  * and enough of g is computed to satisfy all that f will need for
2190  * that iteration. This does less redundant work (each point in g
2191  * ends up being evaluated twice), but the locality is not quite
2192  * as good, and we have to allocate more temporary memory to store
2193  * g.
2194  */
2195  Func &compute_at(const Func &f, const Var &var);
2196 
2197  /** Schedule a function to be computed within the iteration over
2198  * some dimension of an update domain. Produces equivalent code
2199  * to the version of compute_at that takes a Var. */
2200  Func &compute_at(const Func &f, const RVar &var);
2201 
2202  /** Schedule a function to be computed within the iteration over
2203  * a given LoopLevel. */
2204  Func &compute_at(LoopLevel loop_level);
2205 
2206  /** Schedule the iteration over the initial definition of this function
2207  * to be fused with another stage 's' from outermost loop to a
2208  * given LoopLevel. */
2209  // @{
2210  Func &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2211  Func &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
2212  Func &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2213  Func &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
2214  // @}
2215  /** Compute all of this function once ahead of time. Reusing
2216  * the example in \ref Func::compute_at :
2217  *
2218  \code
2219  Func f, g;
2220  Var x, y;
2221  g(x, y) = x*y;
2222  f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2223 
2224  g.compute_root();
2225  \endcode
2226  *
2227  * is equivalent to
2228  *
2229  \code
2230  int f[height][width];
2231  int g[height+1][width+1];
2232  for (int y = 0; y < height+1; y++) {
2233  for (int x = 0; x < width+1; x++) {
2234  g[y][x] = x*y;
2235  }
2236  }
2237  for (int y = 0; y < height; y++) {
2238  for (int x = 0; x < width; x++) {
2239  f[y][x] = g[y][x] + g[y+1][x] + g[y][x+1] + g[y+1][x+1];
2240  }
2241  }
2242  \endcode
2243  *
2244  * g is computed once ahead of time, and enough is computed to
2245  * satisfy all uses of it. This does no redundant work (each point
2246  * in g is evaluated once), but has poor locality (values of g are
2247  * probably not still in cache when they are used by f), and
2248  * allocates lots of temporary memory to store g.
2249  */
2250  Func &compute_root();
2251 
2252  /** Use the halide_memoization_cache_... interface to store a
2253  * computed version of this function across invocations of the
2254  * Func.
2255  *
2256  * If an eviction_key is provided, it must be constructed with
2257  * Expr of integer or handle type. The key Expr will be promoted
2258  * to a uint64_t and can be used with halide_memoization_cache_evict
2259  * to remove memoized entries using this eviction key from the
2260  * cache. Memoized computations that do not provide an eviction
2261  * key will never be evicted by this mechanism.
2262  */
2263  Func &memoize(const EvictionKey &eviction_key = EvictionKey());
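 /** For illustration only: a hypothetical use of memoize(), caching a
  * root-computed Func across invocations of the pipeline. The Func, Vars, and
  * Param are assumed for the example.
  \code
  Func f;
  Var x, y;
  Param<int> p;
  f(x, y) = x + y + p;
  // Realizations of f are stored in the memoization cache, keyed on the
  // parameters f depends on (here, p).
  f.compute_root().memoize();
  \endcode
  */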
2264 
2265  /** Produce this Func asynchronously in a separate
2266  * thread. Consumers will be run by the task system when the
2267  * production is complete. If this Func's store level is different
2268  * to its compute level, consumers will be run concurrently,
2269  * blocking as necessary to prevent reading ahead of what the
2270  * producer has computed. If storage is folded, then the producer
2271  * will additionally not be permitted to run too far ahead of the
2272  * consumer, to avoid clobbering data that has not yet been
2273  * used.
2274  *
2275  * Take special care when combining this with custom thread pool
2276  * implementations, as avoiding deadlock with producer-consumer
2277  * parallelism requires a much more sophisticated parallel runtime
2278  * than with data parallelism alone. It is strongly recommended
2279  * you just use Halide's default thread pool, which guarantees no
2280  * deadlock and a bound on the number of threads launched.
2281  */
2282  Func &async();
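 /** For illustration only: a hypothetical producer/consumer pipeline using
  * async(), with folded storage bounding how far ahead of the consumer the
  * producer may run (names assumed for the example).
  \code
  Func producer, consumer;
  Var x, y;
  producer(x, y) = x + y;
  consumer(x, y) = producer(x, y) * 2;
  // The producer runs in its own thread; folding its storage to two rows
  // limits how far it can get ahead of the consumer.
  producer.store_root().compute_at(consumer, y).fold_storage(y, 2).async();
  consumer.compute_root();
  \endcode
  */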
2283 
2284  /** Bound the extent of a Func's storage, but not extent of its
2285  * compute. This can be useful for forcing a function's allocation
2286  * to be a fixed size, which often means it can go on the stack.
2287  * If bounds inference decides that it requires more storage for
2288  * this function than the allocation size you have stated, a runtime
2289  * error will occur when you try to run the pipeline. */
2290  Func &bound_storage(const Var &dim, const Expr &bound);
2291 
2292  /** Allocate storage for this function within f's loop over
2293  * var. Scheduling storage is optional, and can be used to
2294  * separate the loop level at which storage occurs from the loop
2295  * level at which computation occurs to trade off between locality
2296  * and redundant work. This can open the door for two types of
2297  * optimization.
2298  *
2299  * Consider again the pipeline from \ref Func::compute_at :
2300  \code
2301  Func f, g;
2302  Var x, y;
2303  g(x, y) = x*y;
2304  f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2305  \endcode
2306  *
2307  * If we schedule it like so:
2308  *
2309  \code
2310  g.compute_at(f, x).store_at(f, y);
2311  \endcode
2312  *
2313  * Then the computation of g takes place within the loop over x,
2314  * but the storage takes place within the loop over y:
2315  *
2316  \code
2317  int f[height][width];
2318  for (int y = 0; y < height; y++) {
2319  int g[2][width+1];
2320  for (int x = 0; x < width; x++) {
2321  g[0][x] = x*y;
2322  g[0][x+1] = (x+1)*y;
2323  g[1][x] = x*(y+1);
2324  g[1][x+1] = (x+1)*(y+1);
2325  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2326  }
2327  }
2328  \endcode
2329  *
2330  * Provided the for loop over x is serial, Halide then
2331  * automatically performs the following sliding window
2332  * optimization:
2333  *
2334  \code
2335  int f[height][width];
2336  for (int y = 0; y < height; y++) {
2337  int g[2][width+1];
2338  for (int x = 0; x < width; x++) {
2339  if (x == 0) {
2340  g[0][x] = x*y;
2341  g[1][x] = x*(y+1);
2342  }
2343  g[0][x+1] = (x+1)*y;
2344  g[1][x+1] = (x+1)*(y+1);
2345  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2346  }
2347  }
2348  \endcode
2349  *
2350  * Two of the assignments to g only need to be done when x is
2351  * zero. The rest of the time, those sites have already been
2352  * filled in by a previous iteration. This version has the
2353  * locality of compute_at(f, x), but allocates more memory and
2354  * does much less redundant work.
2355  *
2356  * Halide then further optimizes this pipeline like so:
2357  *
2358  \code
2359  int f[height][width];
2360  for (int y = 0; y < height; y++) {
2361  int g[2][2];
2362  for (int x = 0; x < width; x++) {
2363  if (x == 0) {
2364  g[0][0] = x*y;
2365  g[1][0] = x*(y+1);
2366  }
2367  g[0][(x+1)%2] = (x+1)*y;
2368  g[1][(x+1)%2] = (x+1)*(y+1);
2369  f[y][x] = g[0][x%2] + g[1][x%2] + g[0][(x+1)%2] + g[1][(x+1)%2];
2370  }
2371  }
2372  \endcode
2373  *
2374  * Halide has detected that it's possible to use a circular buffer
2375  * to represent g, and has reduced all accesses to g modulo 2 in
2376  * the x dimension. This optimization only triggers if the for
2377  * loop over x is serial, and if Halide can statically determine
2378  * some power of two large enough to cover the range needed. For
2379  * powers of two, the modulo operator compiles to more efficient
2380  * bit-masking. This optimization reduces memory usage, and also
2381  * improves locality by reusing recently-accessed memory instead
2382  * of pulling new memory into cache.
2383  *
2384  */
2385  Func &store_at(const Func &f, const Var &var);
2386 
2387  /** Equivalent to the version of store_at that takes a Var, but
2388  * schedules storage within the loop over a dimension of a
2389  * reduction domain */
2390  Func &store_at(const Func &f, const RVar &var);
2391 
2392  /** Equivalent to the version of store_at that takes a Var, but
2393  * schedules storage at a given LoopLevel. */
2394  Func &store_at(LoopLevel loop_level);
2395 
2396  /** Equivalent to \ref Func::store_at, but schedules storage
2397  * outside the outermost loop. */
2398  Func &store_root();
2399 
2400  /** Hoist storage for this function within f's loop over
2401  * var. This is different from \ref Func::store_at, because hoist_storage
2402  * simply moves an actual allocation to a given loop level and
2403  * doesn't trigger any of the optimizations such as sliding window.
2404  * Hoisting storage is optional and can be used as an optimization
2405  * to avoid repeated allocations by moving the allocation out of an inner
2406  * loop.
2407  *
2408  * Consider again the pipeline from \ref Func::compute_at :
2409  \code
2410  Func f, g;
2411  Var x, y;
2412  g(x, y) = x*y;
2413  f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2414  \endcode
2415  *
2416  * If we schedule f like so:
2417  *
2418  \code
2419  g.compute_at(f, x);
2420  \endcode
2421  *
2422  * Then the C code equivalent to this pipeline will look like this
2423  *
2424  \code
2425 
2426  int f[height][width];
2427  for (int y = 0; y < height; y++) {
2428  for (int x = 0; x < width; x++) {
2429  int g[2][2];
2430  g[0][0] = x*y;
2431  g[0][1] = (x+1)*y;
2432  g[1][0] = x*(y+1);
2433  g[1][1] = (x+1)*(y+1);
2434  f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
2435  }
2436  }
2437 
2438  \endcode
2439  *
2440  * Note the allocation for g inside of the loop over variable x, which
2441  * happens on each iteration of the inner loop (height * width times in total).
2442  * In some cases allocation can be expensive, so it might be better to do it once
2443  * and reuse the allocated memory across all iterations of the loop.
2444  *
2445  * This can be done by scheduling g like so:
2446  *
2447  \code
2448  g.compute_at(f, x).hoist_storage(f, Var::outermost());
2449  \endcode
2450  *
2451  * Then the C code equivalent to this pipeline will look like this
2452  *
2453  \code
2454 
2455  int f[height][width];
2456  int g[2][2];
2457  for (int y = 0; y < height; y++) {
2458  for (int x = 0; x < width; x++) {
2459  g[0][0] = x*y;
2460  g[0][1] = (x+1)*y;
2461  g[1][0] = x*(y+1);
2462  g[1][1] = (x+1)*(y+1);
2463  f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
2464  }
2465  }
2466 
2467  \endcode
2468  *
2469  * hoist_storage can be used together with \ref Func::store_at and
2470  * \ref Func::fold_storage (for example, to hoist the storage allocated
2471  * after sliding window optimization).
2472  *
2473  */
2474  Func &hoist_storage(const Func &f, const Var &var);
2475 
2476  /** Equivalent to the version of hoist_storage that takes a Var, but
2477  * schedules storage within the loop over a dimension of a
2478  * reduction domain */
2479  Func &hoist_storage(const Func &f, const RVar &var);
2480 
2481  /** Equivalent to the version of hoist_storage that takes a Var, but
2482  * schedules storage at a given LoopLevel. */
2483  Func &hoist_storage(LoopLevel loop_level);
2484 
2485  /** Equivalent to \ref Func::hoist_storage, but schedules storage
2486  * outside the outermost loop. */
2487  Func &hoist_storage_root();
2488 
2489  /** Aggressively inline all uses of this function. This is the
2490  * default schedule, so you're unlikely to need to call this. For
2491  * a Func with an update definition, that means it gets computed
2492  * as close to the innermost loop as possible.
2493  *
2494  * Consider once more the pipeline from \ref Func::compute_at :
2495  *
2496  \code
2497  Func f, g;
2498  Var x, y;
2499  g(x, y) = x*y;
2500  f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2501  \endcode
2502  *
2503  * Leaving g as inline, this compiles to code equivalent to the following C:
2504  *
2505  \code
2506  int f[height][width];
2507  for (int y = 0; y < height; y++) {
2508  for (int x = 0; x < width; x++) {
2509  f[y][x] = x*y + x*(y+1) + (x+1)*y + (x+1)*(y+1);
2510  }
2511  }
2512  \endcode
2513  */
2514  Func &compute_inline();
2515 
2516  /** Get a handle on an update step for the purposes of scheduling
2517  * it. */
2518  Stage update(int idx = 0);
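 /** For illustration only: a hypothetical use of update(). The pure step and
  * each update step get independent schedules (names assumed for the example).
  \code
  Func f;
  Var x;
  RDom r(0, 16);
  f(x) = x;      // pure definition
  f(x) += r;     // first update definition
  f.vectorize(x, 8);            // schedules the pure definition
  f.update(0).vectorize(x, 8);  // schedules the first update definition
  \endcode
  */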
2519 
2520  /** Set the type of memory this Func should be stored in. Controls
2521  * whether allocations go on the stack or the heap on the CPU, and
2522  * in global vs shared vs local on the GPU. See the documentation
2523  * on MemoryType for more detail. */
2524  Func &store_in(MemoryType memory_type);
2525 
2526  /** Trace all loads from this Func by emitting calls to
2527  * halide_trace. If the Func is inlined, this has no
2528  * effect. */
2529  Func &trace_loads();
2530 
2531  /** Trace all stores to the buffer backing this Func by emitting
2532  * calls to halide_trace. If the Func is inlined, this call
2533  * has no effect. */
2534  Func &trace_stores();
2535 
2536  /** Trace all realizations of this Func by emitting calls to
2537  * halide_trace. */
2538  Func &trace_realizations();
2539 
2540  /** Add a string of arbitrary text that will be passed through to trace
2541  * inspection code if the Func is realized in trace mode. (Funcs that are
2542  * inlined won't have their tags emitted.) Ignored entirely if
2543  * tracing is not enabled for the Func (or globally).
2544  */
2545  Func &add_trace_tag(const std::string &trace_tag);
2546 
2547  /** Get a handle on the internal halide function that this Func
2548  * represents. Useful if you want to do introspection on Halide
2549  * functions */
2550  Internal::Function function() const {
2551  return func;
2552  }
2553 
2554  /** You can cast a Func to its pure stage for the purposes of
2555  * scheduling it. */
2556  operator Stage() const;
2557 
2558  /** Get a handle on the output buffer for this Func. Only relevant
2559  * if this is the output Func in a pipeline. Useful for making
2560  * static promises about strides, mins, and extents. */
2561  // @{
2562  OutputImageParam output_buffer() const;
2563  std::vector<OutputImageParam> output_buffers() const;
2564  // @}
2565 
2566  /** Use a Func as an argument to an external stage. */
2567  operator ExternFuncArgument() const;
2568 
2569  /** Infer the arguments to the Func, sorted into a canonical order:
2570  * all buffers (sorted alphabetically by name), followed by all non-buffers
2571  * (sorted alphabetically by name).
2572  This lets you write things like:
2573  \code
2574  func.compile_to_assembly("/dev/stdout", func.infer_arguments());
2575  \endcode
2576  */
2577  std::vector<Argument> infer_arguments() const;
2578 
2579  /** Get the source location of the pure definition of this
2580  * Func. See Stage::source_location() */
2581  std::string source_location() const;
2582 
2583  /** Return the current StageSchedule associated with this initial
2584  * Stage of this Func. For introspection only: to modify schedule,
2585  * use the Func interface. */
2586  const Internal::StageSchedule &get_schedule() const {
2587  return Stage(*this).get_schedule();
2588  }
2589 };
2590 
2591 namespace Internal {
2592 
2593 template<typename Last>
2594 inline void check_types(const Tuple &t, int idx) {
2595  using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2596  user_assert(t[idx].type() == type_of<T>())
2597  << "Can't evaluate expression "
2598  << t[idx] << " of type " << t[idx].type()
2599  << " as a scalar of type " << type_of<T>() << "\n";
2600 }
2601 
2602 template<typename First, typename Second, typename... Rest>
2603 inline void check_types(const Tuple &t, int idx) {
2604  check_types<First>(t, idx);
2605  check_types<Second, Rest...>(t, idx + 1);
2606 }
2607 
2608 template<typename Last>
2609 inline void assign_results(Realization &r, int idx, Last last) {
2610  using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2611  *last = Buffer<T>(r[idx])();
2612 }
2613 
2614 template<typename First, typename Second, typename... Rest>
2615 inline void assign_results(Realization &r, int idx, First first, Second second, Rest &&...rest) {
2616  assign_results<First>(r, idx, first);
2617  assign_results<Second, Rest...>(r, idx + 1, second, rest...);
2618 }
2619 
2620 } // namespace Internal
2621 
2622 /** JIT-Compile and run enough code to evaluate a Halide
2623  * expression. This can be thought of as a scalar version of
2624  * \ref Func::realize */
2625 template<typename T>
2626  HALIDE_NO_USER_CODE_INLINE T evaluate(JITUserContext *ctx, const Expr &e) {
2627  user_assert(e.type() == type_of<T>())
2628  << "Can't evaluate expression "
2629  << e << " of type " << e.type()
2630  << " as a scalar of type " << type_of<T>() << "\n";
2631  Func f;
2632  f() = e;
2633  Buffer<T, 0> im = f.realize(ctx);
2634  return im();
2635 }
2636 
2637 /** evaluate with a default user context */
2638 template<typename T>
2639  HALIDE_NO_USER_CODE_INLINE T evaluate(const Expr &e) {
2640  return evaluate<T>(nullptr, e);
2641 }
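/** For illustration only: a hypothetical use of evaluate() with the default
 * JIT user context.
 \code
 float root_two = evaluate<float>(sqrt(2.0f));
 \endcode
 */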
2642 
2643 /** JIT-compile and run enough code to evaluate a Halide Tuple. */
2644 template<typename First, typename... Rest>
2645 HALIDE_NO_USER_CODE_INLINE void evaluate(JITUserContext *ctx, Tuple t, First first, Rest &&...rest) {
2646  Internal::check_types<First, Rest...>(t, 0);
2647 
2648  Func f;
2649  f() = t;
2650  Realization r = f.realize(ctx);
2651  Internal::assign_results(r, 0, first, rest...);
2652 }
2653 
2654 /** JIT-compile and run enough code to evaluate a Halide Tuple. */
2655 template<typename First, typename... Rest>
2656 HALIDE_NO_USER_CODE_INLINE void evaluate(Tuple t, First first, Rest &&...rest) {
2657  evaluate<First, Rest...>(nullptr, std::move(t), std::forward<First>(first), std::forward<Rest...>(rest...));
2658 }
2659 
2660 namespace Internal {
2661 
2662 inline void schedule_scalar(Func f) {
2663  Target t = get_jit_target_from_environment();
2664  if (t.has_gpu_feature()) {
2665  f.gpu_single_thread();
2666  }
2667  if (t.has_feature(Target::HVX)) {
2668  f.hexagon();
2669  }
2670 }
2671 
2672 } // namespace Internal
2673 
2674 /** JIT-Compile and run enough code to evaluate a Halide
2675  * expression. This can be thought of as a scalar version of
2676  * \ref Func::realize. Can use GPU if jit target from environment
2677  * specifies one.
2678  */
2679 template<typename T>
2680  HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e) {
2681  user_assert(e.type() == type_of<T>())
2682  << "Can't evaluate expression "
2683  << e << " of type " << e.type()
2684  << " as a scalar of type " << type_of<T>() << "\n";
2685  Func f;
2686  f() = e;
2687  Internal::schedule_scalar(f);
2688  Buffer<T, 0> im = f.realize();
2689  return im();
2690 }
2691 
2692 /** JIT-compile and run enough code to evaluate a Halide Tuple. Can
2693  * use GPU if jit target from environment specifies one. */
2694 // @{
2695 template<typename First, typename... Rest>
2696 HALIDE_NO_USER_CODE_INLINE void evaluate_may_gpu(Tuple t, First first, Rest &&...rest) {
2697  Internal::check_types<First, Rest...>(t, 0);
2698 
2699  Func f;
2700  f() = t;
2701  Internal::schedule_scalar(f);
2702  Realization r = f.realize();
2703  Internal::assign_results(r, 0, first, rest...);
2704 }
2705 // @}
2706 
2707 } // namespace Halide
2708 
2709 #endif
std::vector< Range > Region
A multi-dimensional box.
Definition: Expr.h:345
JITHandlers & jit_handlers()
Get a struct containing the currently set custom functions used by JIT.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Var, Args... >::value, FuncRef >::type operator()(Args &&...args) const
Construct either the left-hand-side of a definition, or a call to a functions that happens to only co...
Definition: Func.h:1237
VarOrRVar(const ImplicitVar< N > &u)
Definition: Func.h:43
Create a small array of Exprs for defining and calling functions with multiple outputs.
Definition: Tuple.h:18
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Scheduling calls that control how the domain of this stage is traversed.
Expr max(const FuncRef &a, const FuncRef &b)
Explicit overloads of min and max for FuncRef.
Definition: Func.h:606
A reference to a site in a Halide statement at the top of the body of a particular for loop...
Definition: Schedule.h:203
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Func & >::type always_partition(const VarOrRVar &x, Args &&...args)
Set the loop partition policy to Always for some number of Vars and RVars.
Definition: Func.h:1496
Expr update_value(int idx=0) const
Get the right-hand-side of an update definition.
A schedule for a single stage of a Halide pipeline.
Definition: Schedule.h:695
static Var outermost()
A Var that represents the location outside the outermost loop.
Definition: Var.h:163
Func & memoize(const EvictionKey &eviction_key=EvictionKey())
Use the halide_memoization_cache_...
Func & store_in(MemoryType memory_type)
Set the type of memory this Func should be stored in.
A fragment of Halide syntax.
Definition: Expr.h:258
FuncTupleElementRef operator[](int) const
When a FuncRef refers to a function that provides multiple outputs, you can access each output as an ...
Func & store_root()
Equivalent to Func::store_at, but schedules storage outside the outermost loop.
A class representing a Halide pipeline.
Definition: Pipeline.h:107
Stage operator*=(Expr)
Define a stage that multiplies this Func by the given expression.
Func clone_in(const Func &f)
Similar to Func::in; however, instead of replacing the call to this Func with an identity Func that r...
Target get_target_from_environment()
Return the target that Halide will use.
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Split two dimensions at once by the given factors, and then reorder the resulting dimensions to be xi...
std::string name() const
Return the name of this stage, e.g.
void debug_to_file(const std::string &filename)
When this function is compiled, include code that dumps its values to a file after it is realized...
std::vector< OutputImageParam > output_buffers() const
Schedule the iteration over the initial definition of this function to be fused with another stage &#39;s...
Func & compute_inline()
Aggressively inline all uses of this function.
Expr min(const FuncRef &a, const FuncRef &b)
Explicit overloads of min and max for FuncRef.
Definition: Func.h:603
A Realization is a vector of references to existing Buffer objects.
Definition: Realization.h:19
Func & trace_stores()
Trace all stores to the buffer backing this Func by emitting calls to halide_trace.
A halide function.
Definition: Func.h:706
Defines the front-end syntax for reduction domains and reduction variables.
void compile_to_object(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to an object file, with the given filename (which should probably en...
const std::vector< Type > & types() const
Get the type(s) of the outputs of this Func.
int dimensions() const
The dimensionality (number of arguments) of this function.
Defines the front-end class representing an entire Halide imaging pipeline.
Partition
Different ways to handle loops with a potentially optimizable boundary conditions.
void compile_to_header(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Emit a header file with the given filename for this function.
Stage & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Scheduling calls that control how the domain of this stage is traversed.
bool has_feature(Feature f) const
Func & compute_root()
Compute all of this function once ahead of time.
Defines Module, an IR container that fully describes a Halide program.
const std::vector< CustomLoweringPass > & custom_lowering_passes()
Get the custom lowering passes.
TailStrategy
Different ways to handle a tail case in a split when the factor does not provably divide the extent...
Definition: Schedule.h:33
const std::vector< Expr > & args() const
Get the default (no-specialization) arguments (left-hand-side) of the definition. ...
Func & trace_realizations()
Trace all realizations of this Func by emitting calls to halide_trace.
Func & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Rename a dimension.
OutputImageParam output_buffer() const
Get a handle on the output buffer for this Func.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Stage & >::type reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args)
Scheduling calls that control how the domain of this stage is traversed.
Definition: Func.h:383
A struct representing a target machine and os to generate code for.
Definition: Target.h:19
Stage operator*=(const Expr &e)
Define a stage that multiplies Tuple component 'idx' of this Func by the given expression.
Func & unroll(const VarOrRVar &var)
Mark a dimension to be completely unrolled.
Func & bound(const Var &var, Expr min, Expr extent)
Statically declare that the range over which a function should be evaluated is given by the second an...
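A small sketch of bound(); the Func and sizes are illustrative. The static promise lets Halide simplify bounds inference for x, but realizing over a different range in x would trigger a runtime error.

#include "Halide.h"
using namespace Halide;

int main() {
    Func f("f");
    Var x("x"), y("y");
    f(x, y) = x * y;
    f.bound(x, 0, 64);     // promise: x is only ever evaluated over [0, 64)
    f.realize({64, 32});   // OK: the requested range matches the bound on x
    return 0;
}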
VarOrRVar(const Var &v)
Definition: Func.h:33
A reference-counted handle to a parameter to a halide pipeline.
Definition: Parameter.h:40
Stage & serial(const VarOrRVar &var)
Scheduling calls that control how the domain of this stage is traversed.
#define internal_assert(c)
Definition: Errors.h:19
Stage & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Scheduling calls that control how the domain of this stage is traversed.
Func & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide to run this stage using a single gpu thread and block.
A Halide variable, to be used when defining functions.
Definition: Var.h:19
const std::string & name() const
Definition: Func.h:47
void print_loop_nest()
Write out the loop nests specified by the schedule for this Function.
bool defined() const
Does this function have at least a pure definition?
STL namespace.
Func & atomic(bool override_associativity_test=false)
Issue atomic updates for this Func.
Func & align_storage(const Var &dim, const Expr &alignment)
Pad the storage extent of a particular dimension of realizations of this function up to be a multiple...
std::vector< Var > make_argument_list(int dimensionality)
Make a list of unique arguments for definitions with unnamed arguments.
Callable compile_to_callable(const std::vector< Argument > &args, const Target &target=get_jit_target_from_environment())
Eagerly jit compile the function to machine code and return a callable struct that behaves like a fun...
bool is_rvar
Definition: Func.h:57
A fragment of front-end syntax of the form f(x, y, z), where x, y, z are Vars or Exprs.
Definition: Func.h:497
A Halide::Buffer is a named shared reference to a Halide::Runtime::Buffer.
Definition: Argument.h:16
Func & align_extent(const Var &var, Expr modulus)
Expand the region computed so that the extent is a multiple of 'modulus'.
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this initial Stage of this Func. ...
Definition: Func.h:2586
Func & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU thread indices.
This file defines the class FunctionDAG, which is our representation of a Halide pipeline, and contains methods that use Halide's bounds tools to query properties of it.
Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
Definition: Func.h:93
int index() const
Return index to the function outputs.
Definition: Func.h:680
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Stage & >::type never_partition(const VarOrRVar &x, Args &&...args)
Scheduling calls that control how the domain of this stage is traversed.
Definition: Func.h:390
Stage operator+=(Expr)
Define a stage that adds the given expression to this Func.
Stage operator=(const Expr &)
Use this as the left-hand-side of a definition or an update definition (see RDom).
Stage & unroll(const VarOrRVar &var)
Scheduling calls that control how the domain of this stage is traversed.
bool has_update_definition() const
Does this function have at least one update definition?
A class that can represent Vars or RVars.
Definition: Func.h:29
size_t size() const
How many outputs does the function this refers to produce.
Defines the structure that describes a Halide target.
std::vector< Var > args() const
Get the pure arguments.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Var, Args... >::value, Func & >::type reorder_storage(const Var &x, const Var &y, Args &&...args)
Specify how the storage for the function is laid out.
Definition: Func.h:2076
Stage & prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Scheduling calls that control how the domain of this stage is traversed.
Definition: Func.h:468
Func & always_partition_all()
Set the loop partition policy to Always for all Vars and RVars of the initial definition of the Func...
Tuple update_values(int idx=0) const
Get the right-hand-side of an update definition for functions that return multiple values...
Func & allow_race_conditions()
Specify that race conditions are permitted for this Func, which enables parallelizing over RVars even...
void unscheduled()
Assert that this stage has intentionally been given no schedule, and suppress the warning about unsch...
Internal::Function function() const
What function is this calling?
Definition: Func.h:594
void specialize_fail(const std::string &message)
Add a specialization to a Func that always terminates execution with a call to halide_error().
A fragment of front-end syntax of the form f(x, y, z)[index], where x, y, z are Vars or Exprs...
Definition: Func.h:616
Func & trace_loads()
Trace all loads from this Func by emitting calls to halide_trace.
int num_update_definitions() const
How many update definitions does this function have?
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, const std::vector< Var > &arguments, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Add an extern definition for this Func.
Definition: Func.h:1180
std::string dump_argument_list() const
Return a string describing the current var list taking into account all the splits, reorders, and tiles.
Defines the struct representing lifetime and dependencies of a JIT compiled halide pipeline...
Stage & hexagon(const VarOrRVar &x=Var::outermost())
Scheduling calls that control how the domain of this stage is traversed.
Stage & vectorize(const VarOrRVar &var)
Scheduling calls that control how the domain of this stage is traversed.
Func rfactor(std::vector< std::pair< RVar, Var >> preserved)
Calling rfactor() on an associative update definition of a Func will split the update into an intermedia...
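A sketch of the rfactor() pattern described above, assuming a simple associative sum; the names 'in', 'total', 'partial', and 'u' are illustrative. The reduction over r.y is factored into per-slice partial sums that can be parallelized, and the original update is rewritten to merge them.

#include "Halide.h"
using namespace Halide;

int main() {
    Var x("x"), y("y"), u("u");
    Func in("in"), total("total");
    in(x, y) = x + y;
    in.compute_root();

    RDom r(0, 64, 0, 64);
    total() = 0;
    total() += in(r.x, r.y);  // associative update: a sum over the whole domain

    // Factor the reduction over r.y into per-slice partial sums indexed by the new pure var u.
    Func partial = total.update(0).rfactor(r.y, u);
    partial.compute_root().update(0).parallel(u);

    total.realize();  // the remaining update merges the partial sums
    return 0;
}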
const std::vector< Expr > & update_args(int idx=0) const
Get the left-hand-side of the update definition.
Used to denote for loops that run on the same device as the containing code.
LoopAlignStrategy
Different ways to handle the case when the start/end of the loops of stages computed with (fused) are...
Definition: Schedule.h:137
FuncRef(const Internal::Function &, const std::vector< Expr > &, int placeholder_pos=-1, int count=0)
Guard the prefetch with if-guards that ignore the prefetch if any of the prefetched region ever goes...
Func & never_partition(const std::vector< VarOrRVar > &vars)
Set the loop partition policy to Never for a vector of Vars and RVars.
Func & never_partition_all()
Set the loop partition policy to Never for all Vars and RVars of the initial definition of the Func...
Func & prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Prefetch data written to or read from a Func or an ImageParam by a subsequent loop iteration...
Definition: Func.h:2050
Func & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension into inner and outer subdimensions with the given names, where the inner dimension ...
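A minimal sketch of split(); the names are illustrative. The x dimension becomes an outer xo and an inner xi of extent 4, with the tail guarded because 10 is not a multiple of 4.

#include "Halide.h"
using namespace Halide;

int main() {
    Func f("f");
    Var x("x"), xo("xo"), xi("xi");
    f(x) = x * 2;
    // x becomes xo * 4 + xi; GuardWithIf guards the tail iterations past the extent.
    f.split(x, xo, xi, 4, TailStrategy::GuardWithIf);
    f.realize({10});
    return 0;
}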
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices and thread indices...
Func in()
Create and return a global identity wrapper, which wraps all calls to this Func by any other Func...
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Add an extern definition for this Func.
Definition: Func.h:1170
HALIDE_ALWAYS_INLINE Type type() const
Get the type of this expression node.
Definition: Expr.h:322
Base classes for Halide expressions (Halide::Expr) and statements (Halide::Internal::Stmt) ...
const std::string & name() const
Get the name of a Var.
void compile_to(const std::map< OutputFileType, std::string > &output_files, const std::vector< Argument > &args, const std::string &fn_name, const Target &target=get_target_from_environment())
Compile and generate multiple target files with single call.
Stage & always_partition(const std::vector< VarOrRVar > &vars)
Scheduling calls that control how the domain of this stage is traversed.
void compile_jit(const Target &target=get_jit_target_from_environment())
Eagerly jit compile the function to machine code.
void check_types(const Tuple &t, int idx)
Definition: Func.h:2594
Func & compute_with(const Stage &s, const VarOrRVar &var, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy >> &align)
Schedule the iteration over the initial definition of this function to be fused with another stage 's...
std::vector< RVar > rvars(int idx=0) const
Get the RVars of the reduction domain for an update definition, if there is one.
void compile_to_c(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Statically compile this function to C source code.
Match whatever is specified in the Target.
Func()
Declare a new undefined function with an automatically-generated unique name.
Stage specialize(const Expr &condition)
Specialize a Func.
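A minimal sketch of specialize(), assuming a hypothetical scalar parameter 'width'. The schedule gets one vectorized code path that is used when the condition holds at runtime and a default path otherwise.

#include "Halide.h"
using namespace Halide;

int main() {
    Param<int> width("width");
    Var x("x");
    Func f("f");
    f(x) = x + 1;
    // Two code paths: a vectorized one when the condition holds, the default otherwise.
    f.specialize(width >= 8).vectorize(x, 8);
    width.set(100);
    f.realize({100});
    return 0;
}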
A handle on the output buffer of a pipeline.
Classes for declaring scalar parameters to halide pipelines.
Stage & never_partition_all()
Scheduling calls that control how the domain of this stage is traversed.
Stage specialize(const Expr &condition)
Scheduling calls that control how the domain of this stage is traversed.
Let Halide select a storage type automatically.
Func copy_to_device(DeviceAPI d=DeviceAPI::Default_GPU)
Declare that this function should be implemented by a call to halide_buffer_copy with the given targe...
void schedule_scalar(Func f)
Definition: Func.h:2662
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Scheduling calls that control how the domain of this stage is traversed.
#define HALIDE_NO_USER_CODE_INLINE
Definition: Util.h:46
Not visible externally, similar to 'static' linkage in C.
void assign_results(Realization &r, int idx, Last last)
Definition: Func.h:2609
bool has_gpu_feature() const
Is a fully featured GPU compute runtime enabled? I.e.
Stage & partition(const VarOrRVar &var, Partition partition_policy)
Scheduling calls that control how the domain of this stage is traversed.
const std::string & name() const
The name of this function, either given during construction, or automatically generated.
Func & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices.
Func & reorder(const std::vector< VarOrRVar > &vars)
Reorder variables to have the given nesting order, from innermost out.
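A minimal sketch of reorder(); the Func is illustrative. Listing y before x makes y the innermost loop, i.e. a column-major traversal.

#include "Halide.h"
using namespace Halide;

int main() {
    Func f("f");
    Var x("x"), y("y");
    f(x, y) = x + y;
    // Traverse y innermost, x outermost.
    f.reorder(y, x);
    f.realize({32, 32});
    return 0;
}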
Stage operator-=(Expr)
Define a stage that adds the negative of the given expression to this Func.
void compile_to_lowered_stmt(const std::string &filename, const std::vector< Argument > &args, StmtOutputFormat fmt=Text, const Target &target=get_target_from_environment())
Write out an internal representation of lowered code.
Stage operator/=(const Expr &e)
Define a stage that divides Tuple component 'idx' of this Func by the given expression.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Short-hand for tiling a domain and mapping the tile indices to GPU block indices and the coordinates ...
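A sketch of gpu_tile(), guarded on whether the JIT target actually has a GPU feature enabled; all names are illustrative. 16x16 tiles map to GPU blocks (bx, by) and threads (tx, ty), and the pipeline falls back to the default CPU schedule otherwise.

#include "Halide.h"
using namespace Halide;

int main() {
    Func f("f");
    Var x("x"), y("y"), bx("bx"), by("by"), tx("tx"), ty("ty");
    f(x, y) = x + y;

    Target t = get_jit_target_from_environment();
    if (t.has_gpu_feature()) {
        // Map 16x16 tiles to GPU block and thread indices.
        f.gpu_tile(x, y, bx, by, tx, ty, 16, 16);
    }
    f.realize({256, 256}, t);
    return 0;
}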
Func & hexagon(const VarOrRVar &x=Var::outermost())
Schedule for execution on Hexagon.
FuncTupleElementRef(const FuncRef &ref, const std::vector< Expr > &args, int idx)
PrefetchBoundStrategy
Different ways to handle accesses outside the original extents in a prefetch.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Stage & >::type always_partition(const VarOrRVar &x, Args &&...args)
Scheduling calls that control how the domain of this stage is traversed.
Definition: Func.h:397
VarOrRVar(const RDom &r)
Definition: Func.h:39
bool defined() const
Definition objects are nullable.
HALIDE_NO_USER_CODE_INLINE Func(Buffer< T, Dims > &im)
Construct a new Func to wrap a Buffer.
Definition: Func.h:765
ForType
An enum describing a type of loop traversal.
Definition: Expr.h:401
Func & bound_storage(const Var &dim, const Expr &bound)
Bound the extent of a Func's storage, but not the extent of its compute.
const StageSchedule & schedule() const
Get the default (no-specialization) stage-specific schedule associated with this definition.
Stage & allow_race_conditions()
Scheduling calls that control how the domain of this stage is traversed.
Tuple values() const
The values returned by this function.
const Type & type() const
Get the type(s) of the outputs of this Func.
Stage & always_partition_all()
Scheduling calls that control how the domain of this stage is traversed.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Func & >::type never_partition(const VarOrRVar &x, Args &&...args)
Set the loop partition policy to Never for some number of Vars and RVars.
Definition: Func.h:1479
Func & fold_storage(const Var &dim, const Expr &extent, bool fold_forward=true)
Store realizations of this function in a circular buffer of a given extent.
Defines the Var - the front-end variable.
Func & compute_at(const Func &f, const Var &var)
Compute this function as needed for each unique value of the given var for the given calling function...
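A minimal sketch of compute_at(); 'producer' and 'consumer' are illustrative names. The region of producer needed by one iteration of consumer's y loop is computed just before that iteration, trading some recompute for locality.

#include "Halide.h"
using namespace Halide;

int main() {
    Func producer("producer"), consumer("consumer");
    Var x("x"), y("y");
    producer(x, y) = x + y;
    consumer(x, y) = producer(x, y) + producer(x + 1, y + 1);
    // Compute the needed slice of 'producer' inside each iteration of consumer's y loop.
    producer.compute_at(consumer, y);
    consumer.realize({16, 16});
    return 0;
}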
Func & align_bounds(const Var &var, Expr modulus, Expr remainder=0)
Expand the region computed so that the min coordinate is congruent to 'remainder' modulo 'modulus'...
Stage & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Scheduling calls that control how the domain of this stage is traversed.
Func & set_estimates(const Region &estimates)
Set (min, extent) estimates for all dimensions in the Func at once; this is equivalent to calling set...
A context to be passed to Pipeline::realize.
Definition: JITModule.h:136
Defines a type used for expressing the type signature of a generated halide pipeline.
Stage & never_partition(const std::vector< VarOrRVar > &vars)
Scheduling calls that control how the domain of this stage is traversed.
A reference-counted handle to Halide's internal representation of a function.
Definition: Function.h:38
void compile_to_llvm_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm assembly, with the given filename (which should probably end...
Func & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Join two dimensions into a single fused dimension.
An Image parameter to a halide pipeline.
Definition: ImageParam.h:23
FuncRef operator()(std::vector< Var >) const
Construct either the left-hand-side of a definition, or a call to a function that happens to only co...
An argument to an extern-defined Func.
void clear_custom_lowering_passes()
Remove all previously-set custom lowering passes.
Func & always_partition(const std::vector< VarOrRVar > &vars)
Set the loop partition policy to Always for a vector of Vars and RVars.
Types in the halide type system.
Definition: Type.h:276
void infer_input_bounds(const std::vector< int32_t > &sizes, const Target &target=get_jit_target_from_environment())
For a given size of output, or a given output buffer, determine the bounds required of all unbound Im...
Stage operator-=(const Expr &e)
Define a stage that adds the negative of the given expression to Tuple component 'idx' of this Func...
Func & hoist_storage(const Func &f, const Var &var)
Hoist storage for this function within f's loop over var.
Func & store_at(const Func &f, const Var &var)
Allocate storage for this function within f's loop over var.
EvictionKey(const Expr &expr=Expr())
Definition: Func.h:697
NameMangling
An enum to specify calling convention for extern stages.
Definition: Function.h:25
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Expr, Args... >::value, FuncRef >::type operator()(const Expr &x, Args &&...args) const
Either calls to the function, or the left-hand-side of an update definition (see RDom).
Definition: Func.h:1254
Func & vectorize(const VarOrRVar &var)
Mark a dimension to be computed all-at-once as a single vector.
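A minimal sketch combining split(), vectorize(), and parallel(); the names and factors are illustrative. The inner 8-wide xi loop becomes a single vector operation and the outer y loop is distributed across CPU threads.

#include "Halide.h"
using namespace Halide;

int main() {
    Func f("f");
    Var x("x"), y("y"), xi("xi");
    f(x, y) = x + y;
    // Split off an 8-wide inner dimension, compute it as one vector, and parallelize y.
    f.split(x, x, xi, 8).vectorize(xi).parallel(y);
    f.realize({96, 96});
    return 0;
}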
A Function definition which can represent either an init or an update definition.
Definition: Definition.h:38
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Func & >::type reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args)
Definition: Func.h:1600
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling)
Add an extern definition for this Func.
Definition: Func.h:1162
Stage & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Scheduling calls that control how the domain of this stage is traversed.
Helper class for identifying purpose of an Expr passed to memoize.
Definition: Func.h:691
Stage operator+=(const Expr &e)
Define a stage that adds the given expression to Tuple component 'idx' of this Func.
const std::string & name() const
The name of this reduction variable.
void add_custom_lowering_pass(T *pass)
Add a custom pass to be used during lowering.
Definition: Func.h:1057
void compile_to_bitcode(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm bitcode, with the given filename (which should probably end ...
Expr value() const
The right-hand-side value of the pure definition of this function.
Stage & atomic(bool override_associativity_test=false)
Scheduling calls that control how the domain of this stage is traversed.
Stage operator/=(Expr)
Define a stage that divides this Func by the given expression.
A single definition of a Func.
Definition: Func.h:69
Target get_jit_target_from_environment()
Return the target that Halide will use for jit-compilation.
Stage & parallel(const VarOrRVar &var)
Scheduling calls that control how the domain of this stage is traversed.
int outputs() const
Get the number of outputs of this Func.
Stage & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Scheduling calls that control how the domain of this stage is traversed.
A reduction variable represents a single dimension of a reduction domain (RDom).
Definition: RDom.h:29
A multi-dimensional domain over which to iterate.
Definition: RDom.h:193
std::string source_location() const
Get the source location of the pure definition of this Func.
Func & serial(const VarOrRVar &var)
Mark a dimension to be traversed serially.
VarOrRVar(const RVar &r)
Definition: Func.h:36
HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition: Func.h:2680
Func & add_trace_tag(const std::string &trace_tag)
Add a string of arbitrary text that will be passed through to trace inspection code if the Func is reali...
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Add an extern definition for this Func.
Definition: Func.h:1152
A base class for passes over the IR which modify it (e.g.
Definition: IRMutator.h:26
Func & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
The given dimension corresponds to the lanes in a GPU warp.
Stage ScheduleHandle
Definition: Func.h:488
HALIDE_NO_USER_CODE_INLINE T evaluate(JITUserContext *ctx, const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition: Func.h:2626
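A minimal sketch of evaluate<T>(), which JIT-compiles and runs a zero-argument pipeline wrapping a single Expr; the expression here is illustrative.

#include "Halide.h"
#include <cstdio>
using namespace Halide;

int main() {
    // JIT-compile and run just enough code to evaluate one scalar expression.
    Expr e = sqrt(cast<float>(2));
    float result = evaluate<float>(e);
    printf("sqrt(2) = %f\n", result);
    return 0;
}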
Module compile_to_module(const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Store an internal representation of lowered code as a self-contained Module suitable for further comp...
void compile_to_multitarget_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets)
Compile to static-library file and header pair once for each target; each resulting function will be ...
Func & partition(const VarOrRVar &var, Partition partition_policy)
Set the loop partition policy.
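A sketch of partition(), assuming a split whose GuardWithIf tail introduces a boundary condition in x; the names are illustrative. Partition::Always forces x to be split into a steady-state loop plus prologue/epilogue so the steady state runs without the guard, while Partition::Never would keep a single guarded loop.

#include "Halide.h"
using namespace Halide;

int main() {
    Func f("f");
    Var x("x"), xi("xi");
    f(x) = x * 3;
    f.split(x, x, xi, 8, TailStrategy::GuardWithIf);
    // Force loop partitioning of x so the steady-state iterations skip the tail guard.
    f.partition(x, Partition::Always);
    f.realize({100});
    return 0;
}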
void specialize_fail(const std::string &message)
Scheduling calls that control how the domain of this stage is traversed.
void compile_to_multitarget_object_files(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets, const std::vector< std::string > &suffixes)
Like compile_to_multitarget_static_library(), except that the object files are all output as object f...
DeviceAPI
An enum describing a type of device API.
Definition: DeviceAPI.h:15
Func copy_to_host()
Declare that this function should be implemented by a call to halide_buffer_copy with a NULL target d...
VarOrRVar(const std::string &n, bool r)
Definition: Func.h:30
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this Stage.
Definition: Func.h:106
Stage & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Scheduling calls that control how the domain of this stage is traversed.
Defines Tuple - the front-end handle on small arrays of expressions.
Stage & prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Scheduling calls that control how the domain of this stage is traversed.
std::vector< Argument > infer_arguments() const
Infer the arguments to the Func, sorted into a canonical order: all buffers (sorted alphabetically by...
Func & hoist_storage_root()
Equivalent to Func::hoist_storage, but schedules storage outside the outermost loop...
Stage & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Scheduling calls that control how the domain of this stage is traversed.
#define user_assert(c)
Definition: test.h:10
Func & async()
Produce this Func asynchronously in a separate thread.
Stage update(int idx=0)
Get a handle on an update step for the purposes of scheduling it.
void compile_to_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to text assembly equivalent to the object file generated by compile_...
Func & set_estimate(const Var &var, const Expr &min, const Expr &extent)
Statically declare the range over which the function will be evaluated in the general case...
Realization realize(std::vector< int32_t > sizes={}, const Target &target=Target())
Evaluate this function over some rectangular domain and return the resulting buffer or buffers...
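A minimal sketch of realize() under JIT compilation; the names are illustrative. The sizes give the extents of the output buffer, and the Realization converts to a typed Buffer.

#include "Halide.h"
#include <cstdio>
using namespace Halide;

int main() {
    Func gradient("gradient");
    Var x("x"), y("y");
    gradient(x, y) = x + y;
    // JIT-compile and evaluate over a 640x480 domain; the result is a 2-D int32 buffer.
    Buffer<int32_t> out = gradient.realize({640, 480});
    printf("out(10, 20) = %d\n", out(10, 20));
    return 0;
}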
void compile_to_file(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to object file and header pair, with the given arguments.
Func & bound_extent(const Var &var, Expr extent)
Bound the extent of a Func&#39;s realization, but not its min.
std::string source_location() const
Attempt to get the source file and line where this stage was defined by parsing the process's own deb...
Func & prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Prefetch data written to or read from a Func or an ImageParam by a subsequent loop iteration...
void compile_to_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to static-library file and header pair, with the given arguments.
Stage & reorder(const std::vector< VarOrRVar > &vars)
Scheduling calls that control how the domain of this stage is traversed.
Func & parallel(const VarOrRVar &var)
Mark a dimension to be traversed in parallel.
bool is_extern() const
Is this function an external stage? That is, was it defined using define_extern?
const std::string & extern_function_name() const
Get the name of the extern function called for an extern definition.
Stage operator=(const Expr &e)
Use this as the left-hand-side of an update definition of Tuple component 'idx' of a Func (see RDom)...
StmtOutputFormat
Used to determine if the output printed to file should be as a normal string or as an HTML file which...
Definition: Pipeline.h:72
MemoryType
An enum describing different address spaces to be used with Func::store_in.
Definition: Expr.h:348