Halide 17.0.2
Halide compiler and libraries
Func.h
1 #ifndef HALIDE_FUNC_H
2 #define HALIDE_FUNC_H
3 
4 /** \file
5  *
6  * Defines Func - the front-end handle on a halide function, and related classes.
7  */
8 
9 #include "Argument.h"
10 #include "Expr.h"
11 #include "JITModule.h"
12 #include "Module.h"
13 #include "Param.h"
14 #include "Pipeline.h"
15 #include "RDom.h"
16 #include "Target.h"
17 #include "Tuple.h"
18 #include "Var.h"
19 
20 #include <map>
21 #include <utility>
22 
23 namespace Halide {
24 
25 class OutputImageParam;
26 
27 /** A class that can represent Vars or RVars. Used for reorder calls
28  * which can accept a mix of either. */
29 struct VarOrRVar {
30  VarOrRVar(const std::string &n, bool r)
31  : var(n), rvar(n), is_rvar(r) {
32  }
33  VarOrRVar(const Var &v)
34  : var(v), is_rvar(false) {
35  }
36  VarOrRVar(const RVar &r)
37  : rvar(r), is_rvar(true) {
38  }
39  VarOrRVar(const RDom &r)
40  : rvar(RVar(r)), is_rvar(true) {
41  }
42  template<int N>
43  VarOrRVar(const ImplicitVar<N> &u)
44  : var(u), is_rvar(false) {
45  }
46 
47  const std::string &name() const {
48  if (is_rvar) {
49  return rvar.name();
50  } else {
51  return var.name();
52  }
53  }
54 
55  Var var;
56  RVar rvar;
57  bool is_rvar;
58 };
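/* Illustrative sketch (not part of Func.h): reorder() takes VarOrRVar, so Vars
 * and RVars can be mixed in one call; both convert implicitly. The names
 * below (f, x, y, r) are assumed for the example.
 \code
 Func f;
 Var x, y;
 RDom r(0, 100);
 f(x, y) = 0;
 f(x, y) += r;                    // update over a reduction domain
 f.update(0).reorder(r.x, x, y);  // an RVar and two Vars in the same reorder
 \endcode
 */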
59 
60 class ImageParam;
61 
62 namespace Internal {
63 class Function;
64 struct Split;
65 struct StorageDim;
66 } // namespace Internal
67 
68 /** A single definition of a Func. May be a pure or update definition. */
69 class Stage {
70  /** Reference to the Function this stage (or definition) belongs to. */
71  Internal::Function function;
72  Internal::Definition definition;
73  /** Indicate which stage the definition belongs to (0 for initial
74  * definition, 1 for first update, etc.). */
75  size_t stage_index;
76  /** Pure Vars of the Function (from the init definition). */
77  std::vector<Var> dim_vars;
78 
79  void set_dim_type(const VarOrRVar &var, Internal::ForType t);
80  void set_dim_device_api(const VarOrRVar &var, DeviceAPI device_api);
81  void split(const std::string &old, const std::string &outer, const std::string &inner,
82  const Expr &factor, bool exact, TailStrategy tail);
83  void remove(const std::string &var);
84  Stage &purify(const VarOrRVar &old_name, const VarOrRVar &new_name);
85 
86  const std::vector<Internal::StorageDim> &storage_dims() const {
87  return function.schedule().storage_dims();
88  }
89 
90  Stage &compute_with(LoopLevel loop_level, const std::map<std::string, LoopAlignStrategy> &align);
91 
92 public:
93  Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
94  : function(std::move(f)), definition(std::move(d)), stage_index(stage_index) {
95  internal_assert(definition.defined());
96 
97  dim_vars.reserve(function.args().size());
98  for (const auto &arg : function.args()) {
99  dim_vars.emplace_back(arg);
100  }
101  internal_assert(definition.args().size() == dim_vars.size());
102  }
103 
104  /** Return the current StageSchedule associated with this Stage. For
105  * introspection only: to modify schedule, use the Func interface. */
106  const Internal::StageSchedule &get_schedule() const {
107  return definition.schedule();
108  }
109 
110  /** Return a string describing the current var list taking into
111  * account all the splits, reorders, and tiles. */
112  std::string dump_argument_list() const;
113 
114  /** Return the name of this stage, e.g. "f.update(2)" */
115  std::string name() const;
116 
117  /** Calling rfactor() on an associative update definition of a Func will split
118  * the update into an intermediate which computes the partial results and
119  * replaces the current update definition with a new definition which merges
120  * the partial results. If called on an init/pure definition, this will
121  * throw an error. rfactor() will automatically infer the associative reduction
122  * operator and identity of the operator. If it can't prove the operation
123  * is associative or if it cannot find an identity for that operator, this
124  * will throw an error. In addition, commutativity of the operator is required
125  * if rfactor() is called on the inner dimension but excluding the outer
126  * dimensions.
127  *
128  * rfactor() takes as input 'preserved', which is a list of <RVar, Var> pairs.
129  * The rvars not listed in 'preserved' are removed from the original Func and
130  * are lifted to the intermediate Func. The remaining rvars (the ones in
131  * 'preserved') are made pure in the intermediate Func. The intermediate Func's
132  * update definition inherits all scheduling directives (e.g. split, fuse, etc.)
133  * applied to the original Func's update definition. The loop order of the
134  * intermediate Func's update definition is the same as the original, although
135  * the RVars in 'preserved' are replaced by the new pure Vars. The loop order of the
136  * intermediate Func's init definition from innermost to outermost is the args'
137  * order of the original Func's init definition followed by the new pure Vars.
138  *
139  * The intermediate Func also inherits storage order from the original Func
140  * with the new pure Vars added to the outermost.
141  *
142  * For example, f.update(0).rfactor({{r.y, u}}) would rewrite a pipeline like this:
143  \code
144  f(x, y) = 0;
145  f(x, y) += g(r.x, r.y);
146  \endcode
147  * into a pipeline like this:
148  \code
149  f_intm(x, y, u) = 0;
150  f_intm(x, y, u) += g(r.x, u);
151 
152  f(x, y) = 0;
153  f(x, y) += f_intm(x, y, r.y);
154  \endcode
155  *
156  * This has a variety of uses. You can use it to split computation of an associative reduction:
157  \code
158  f(x, y) = 10;
159  RDom r(0, 96);
160  f(x, y) = max(f(x, y), g(x, y, r.x));
161  f.update(0).split(r.x, rxo, rxi, 8).reorder(y, x).parallel(x);
162  f.update(0).rfactor({{rxo, u}}).compute_root().parallel(u).update(0).parallel(u);
163  \endcode
164  *
165  * which is equivalent to:
166  \code
167  parallel for u = 0 to 11:
168    for y:
169      for x:
170        f_intm(x, y, u) = -inf
171  parallel for x:
172    for y:
173      parallel for u = 0 to 11:
174        for rxi = 0 to 7:
175          f_intm(x, y, u) = max(f_intm(x, y, u), g(x, y, 8*u + rxi))
176  for y:
177    for x:
178      f(x, y) = 10
179  parallel for x:
180    for y:
181      for rxo = 0 to 11:
182        f(x, y) = max(f(x, y), f_intm(x, y, rxo))
183  \endcode
184  *
185  */
186  // @{
187  Func rfactor(std::vector<std::pair<RVar, Var>> preserved);
188  Func rfactor(const RVar &r, const Var &v);
189  // @}
190 
191  /** Schedule the iteration over this stage to be fused with another
192  * stage 's' from outermost loop to a given LoopLevel. 'this' stage will
193  * be computed AFTER 's' in the innermost fused dimension. There should not
194  * be any dependencies between those two fused stages. If either of the
195  * stages being fused is a stage of an extern Func, this will throw an error.
196  *
197  * Note that the two stages that are fused together should have the same
198  * exact schedule from the outermost to the innermost fused dimension, and
199  * the stage we are calling compute_with on should not have specializations,
200  * e.g. f2.compute_with(f1, x) is allowed only if f2 has no specializations.
201  *
202  * Also, if a producer is desired to be computed at the fused loop level,
203  * the function passed to the compute_at() needs to be the "parent". Consider
204  * the following code:
205  \code
206  input(x, y) = x + y;
207  f(x, y) = input(x, y);
208  f(x, y) += 5;
209  g(x, y) = x - y;
210  g(x, y) += 10;
211  f.compute_with(g, y);
212  f.update().compute_with(g.update(), y);
213  \endcode
214  *
215  * To compute 'input' at the fused loop level at dimension y, we specify
216  * input.compute_at(g, y) instead of input.compute_at(f, y) since 'g' is
217  * the "parent" for this fused loop (i.e. 'g' is computed first before 'f'
218  * is computed). On the other hand, to compute 'input' at the innermost
219  * dimension of 'f', we specify input.compute_at(f, x) instead of
220  * input.compute_at(g, x) since the x dimension of 'f' is not fused
221  * (only the y dimension is).
222  *
223  * Given the constraints, this has a variety of uses. Consider the
224  * following code:
225  \code
226  f(x, y) = x + y;
227  g(x, y) = x - y;
228  h(x, y) = f(x, y) + g(x, y);
229  f.compute_root();
230  g.compute_root();
231  f.split(x, xo, xi, 8);
232  g.split(x, xo, xi, 8);
233  g.compute_with(f, xo);
234  \endcode
235  *
236  * This is equivalent to:
237  \code
238  for y:
239    for xo:
240      for xi:
241        f(8*xo + xi) = (8*xo + xi) + y
242      for xi:
243        g(8*xo + xi) = (8*xo + xi) - y
244  for y:
245    for x:
246      h(x, y) = f(x, y) + g(x, y)
247  \endcode
248  *
249  * The sizes of the dimensions of the stages computed with each other do not
250  * have to match. Consider the following code where 'g' is half the size of 'f':
251  \code
252  Image<int> f_im(size, size), g_im(size/2, size/2);
253  input(x, y) = x + y;
254  f(x, y) = input(x, y);
255  g(x, y) = input(2*x, 2*y);
256  g.compute_with(f, y);
257  input.compute_at(f, y);
258  Pipeline({f, g}).realize({f_im, g_im});
259  \endcode
260  *
261  * This is equivalent to:
262  \code
263  for y = 0 to size-1:
264    for x = 0 to size-1:
265      input(x, y) = x + y;
266    for x = 0 to size-1:
267      f(x, y) = input(x, y)
268    for x = 0 to size/2-1:
269      if (y < size/2-1):
270        g(x, y) = input(2*x, 2*y)
271  \endcode
272  *
273  * 'align' specifies how the loop iteration of each dimension of the
274  * two stages being fused should be aligned in the fused loop nests
275  * (see LoopAlignStrategy for options). Consider the following loop nests:
276  \code
277  for z = f_min_z to f_max_z:
278    for y = f_min_y to f_max_y:
279      for x = f_min_x to f_max_x:
280        f(x, y, z) = x + y + z
281  for z = g_min_z to g_max_z:
282    for y = g_min_y to g_max_y:
283      for x = g_min_x to g_max_x:
284        g(x, y, z) = x - y - z
285  \endcode
286  *
287  * If no alignment strategy is specified, the following loop nest will be
288  * generated:
289  \code
290  for z = min(f_min_z, g_min_z) to max(f_max_z, g_max_z):
291    for y = min(f_min_y, g_min_y) to max(f_max_y, g_max_y):
292      for x = f_min_x to f_max_x:
293        if (f_min_z <= z <= f_max_z):
294          if (f_min_y <= y <= f_max_y):
295            f(x, y, z) = x + y + z
296      for x = g_min_x to g_max_x:
297        if (g_min_z <= z <= g_max_z):
298          if (g_min_y <= y <= g_max_y):
299            g(x, y, z) = x - y - z
300  \endcode
301  *
302  * Instead, these alignment strategies:
303  \code
304  g.compute_with(f, y, {{z, LoopAlignStrategy::AlignStart}, {y, LoopAlignStrategy::AlignEnd}});
305  \endcode
306  * will produce the following loop nest:
307  \code
308  f_loop_min_z = f_min_z
309  f_loop_max_z = max(f_max_z, (f_min_z - g_min_z) + g_max_z)
310  for z = f_min_z to f_loop_max_z:
311    f_loop_min_y = min(f_min_y, (f_max_y - g_max_y) + g_min_y)
312    f_loop_max_y = f_max_y
313    for y = f_loop_min_y to f_loop_max_y:
314      for x = f_min_x to f_max_x:
315        if (f_loop_min_z <= z <= f_loop_max_z):
316          if (f_loop_min_y <= y <= f_loop_max_y):
317            f(x, y, z) = x + y + z
318      for x = g_min_x to g_max_x:
319        g_shift_z = g_min_z - f_loop_min_z
320        g_shift_y = g_max_y - f_loop_max_y
321        if (g_min_z <= (z + g_shift_z) <= g_max_z):
322          if (g_min_y <= (y + g_shift_y) <= g_max_y):
323            g(x, y + g_shift_y, z + g_shift_z) = x - (y + g_shift_y) - (z + g_shift_z)
324  \endcode
325  *
326  * LoopAlignStrategy::AlignStart on dimension z will shift the loop iteration
327  * of 'g' at dimension z so that its starting value matches that of 'f'.
328  * Likewise, LoopAlignStrategy::AlignEnd on dimension y will shift the loop
329  * iteration of 'g' at dimension y so that its end value matches that of 'f'.
330  */
331  // @{
332  Stage &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
333  Stage &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
334  Stage &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
335  Stage &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
336  // @}
337 
338  /** Scheduling calls that control how the domain of this stage is
339  * traversed. See the documentation for Func for the meanings. */
340  // @{
341 
342  Stage &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
343  Stage &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
344  Stage &serial(const VarOrRVar &var);
345  Stage &parallel(const VarOrRVar &var);
346  Stage &vectorize(const VarOrRVar &var);
347  Stage &unroll(const VarOrRVar &var);
348  Stage &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
349  Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
350  Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
351  Stage &partition(const VarOrRVar &var, Partition partition_policy);
352  Stage &never_partition_all();
353  Stage &never_partition(const std::vector<VarOrRVar> &vars);
354  Stage &always_partition_all();
355  Stage &always_partition(const std::vector<VarOrRVar> &vars);
356 
357  Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
358  const VarOrRVar &xo, const VarOrRVar &yo,
359  const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor,
360  TailStrategy tail = TailStrategy::Auto);
361  Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
362  const VarOrRVar &xi, const VarOrRVar &yi,
363  const Expr &xfactor, const Expr &yfactor,
364  TailStrategy tail = TailStrategy::Auto);
365  Stage &tile(const std::vector<VarOrRVar> &previous,
366  const std::vector<VarOrRVar> &outers,
367  const std::vector<VarOrRVar> &inners,
368  const std::vector<Expr> &factors,
369  const std::vector<TailStrategy> &tails);
370  Stage &tile(const std::vector<VarOrRVar> &previous,
371  const std::vector<VarOrRVar> &outers,
372  const std::vector<VarOrRVar> &inners,
373  const std::vector<Expr> &factors,
374  TailStrategy tail = TailStrategy::Auto);
375  Stage &tile(const std::vector<VarOrRVar> &previous,
376  const std::vector<VarOrRVar> &inners,
377  const std::vector<Expr> &factors,
378  TailStrategy tail = TailStrategy::Auto);
379  Stage &reorder(const std::vector<VarOrRVar> &vars);
380 
381  template<typename... Args>
382  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
383  reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
384  std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
385  return reorder(collected_args);
386  }
387 
388  template<typename... Args>
389  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
390  never_partition(const VarOrRVar &x, Args &&...args) {
391  std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
392  return never_partition(collected_args);
393  }
394 
395  template<typename... Args>
396  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
397  always_partition(const VarOrRVar &x, Args &&...args) {
398  std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
399  return always_partition(collected_args);
400  }
401 
402  Stage &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
403  Stage specialize(const Expr &condition);
404  void specialize_fail(const std::string &message);
405 
406  Stage &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
407  Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
408  Stage &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
409 
410  Stage &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
411 
412  Stage &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);
413 
414  Stage &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
415  Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
416  Stage &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
417 
418  Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
419  Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
420  const VarOrRVar &thread_x, const VarOrRVar &thread_y,
421  DeviceAPI device_api = DeviceAPI::Default_GPU);
422  Stage &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
423  const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z,
424  DeviceAPI device_api = DeviceAPI::Default_GPU);
425 
426  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
427  TailStrategy tail = TailStrategy::Auto,
428  DeviceAPI device_api = DeviceAPI::Default_GPU);
429 
430  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
431  TailStrategy tail = TailStrategy::Auto,
432  DeviceAPI device_api = DeviceAPI::Default_GPU);
433  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
434  const VarOrRVar &bx, const VarOrRVar &by,
435  const VarOrRVar &tx, const VarOrRVar &ty,
436  const Expr &x_size, const Expr &y_size,
437  TailStrategy tail = TailStrategy::Auto,
438  DeviceAPI device_api = DeviceAPI::Default_GPU);
439 
440  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
441  const VarOrRVar &tx, const VarOrRVar &ty,
442  const Expr &x_size, const Expr &y_size,
443  TailStrategy tail = TailStrategy::Auto,
444  DeviceAPI device_api = DeviceAPI::Default_GPU);
445 
446  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
447  const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
448  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
449  const Expr &x_size, const Expr &y_size, const Expr &z_size,
450  TailStrategy tail = TailStrategy::Auto,
451  DeviceAPI device_api = DeviceAPI::Default_GPU);
452  Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
453  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
454  const Expr &x_size, const Expr &y_size, const Expr &z_size,
455  TailStrategy tail = TailStrategy::Auto,
456  DeviceAPI device_api = DeviceAPI::Default_GPU);
457 
458  Stage &allow_race_conditions();
459  Stage &atomic(bool override_associativity_test = false);
460 
461  Stage &hexagon(const VarOrRVar &x = Var::outermost());
462 
463  Stage &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
464  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
465  Stage &prefetch(const Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
466  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
467  template<typename T>
468  Stage &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
469  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
470  return prefetch(image.parameter(), at, from, std::move(offset), strategy);
471  }
472  // @}
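/* Illustrative sketch (not part of Func.h): the calls above let an update
 * definition be scheduled independently of the pure definition. Names are
 * assumed for the example.
 \code
 Func f;
 Var x, y, xo, xi;
 RDom r(0, 256);
 f(x, y) = x + y;
 f(x, y) += r;
 f.vectorize(x, 8);                                           // pure definition
 f.update(0).split(x, xo, xi, 8).vectorize(xi).parallel(y);   // update definition
 \endcode
 */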
473 
474  /** Attempt to get the source file and line where this stage was
475  * defined by parsing the process's own debug symbols. Returns an
476  * empty string if no debug symbols were found or the debug
477  * symbols were not understood. Works on OS X and Linux only. */
478  std::string source_location() const;
479 
480  /** Assert that this stage has intentionally been given no schedule, and
481  * suppress the warning about unscheduled update definitions that would
482  * otherwise fire. This counts as a schedule, so calling this twice on the
483  * same Stage will fail the assertion. */
484  void unscheduled();
485 };
486 
487 // For backwards compatibility, keep the ScheduleHandle name.
488 typedef Stage ScheduleHandle;
489 
490 class FuncTupleElementRef;
491 
492 /** A fragment of front-end syntax of the form f(x, y, z), where x, y,
493  * z are Vars or Exprs. It could be the left-hand side of a definition or
494  * an update definition, or it could be a call to a function. We don't know
495  * until we see how this object gets used.
496  */
497 class FuncRef {
498  Internal::Function func;
499  int implicit_placeholder_pos;
500  int implicit_count;
501  std::vector<Expr> args;
502  std::vector<Expr> args_with_implicit_vars(const std::vector<Expr> &e) const;
503 
504  /** Helper for function update by Tuple. If the function does not
505  * already have a pure definition, init_val will be used as RHS of
506  * each tuple element in the initial function definition. */
507  template<typename BinaryOp>
508  Stage func_ref_update(const Tuple &e, int init_val);
509 
510  /** Helper for function update by Expr. If the function does not
511  * already have a pure definition, init_val will be used as RHS in
512  * the initial function definition. */
513  template<typename BinaryOp>
514  Stage func_ref_update(Expr e, int init_val);
515 
516 public:
517  FuncRef(const Internal::Function &, const std::vector<Expr> &,
518  int placeholder_pos = -1, int count = 0);
519  FuncRef(Internal::Function, const std::vector<Var> &,
520  int placeholder_pos = -1, int count = 0);
521 
522  /** Use this as the left-hand-side of a definition or an update definition
523  * (see \ref RDom).
524  */
525  Stage operator=(const Expr &);
526 
527  /** Use this as the left-hand-side of a definition or an update definition
528  * for a Func with multiple outputs. */
529  Stage operator=(const Tuple &);
530 
531  /** Define a stage that adds the given expression to this Func. If the
532  * expression refers to some RDom, this performs a sum reduction of the
533  * expression over the domain. If the function does not already have a
534  * pure definition, this sets it to zero.
535  */
536  // @{
537  Stage operator+=(Expr);
538  Stage operator+=(const Tuple &);
539  Stage operator+=(const FuncRef &);
540  // @}
541 
542  /** Define a stage that adds the negative of the given expression to this
543  * Func. If the expression refers to some RDom, this performs a sum reduction
544  * of the negative of the expression over the domain. If the function does
545  * not already have a pure definition, this sets it to zero.
546  */
547  // @{
548  Stage operator-=(Expr);
549  Stage operator-=(const Tuple &);
550  Stage operator-=(const FuncRef &);
551  // @}
552 
553  /** Define a stage that multiplies this Func by the given expression. If the
554  * expression refers to some RDom, this performs a product reduction of the
555  * expression over the domain. If the function does not already have a pure
556  * definition, this sets it to 1.
557  */
558  // @{
559  Stage operator*=(Expr);
560  Stage operator*=(const Tuple &);
561  Stage operator*=(const FuncRef &);
562  // @}
563 
564  /** Define a stage that divides this Func by the given expression.
565  * If the expression refers to some RDom, this performs a product
566  * reduction of the inverse of the expression over the domain. If the
567  * function does not already have a pure definition, this sets it to 1.
568  */
569  // @{
570  Stage operator/=(Expr);
571  Stage operator/=(const Tuple &);
572  Stage operator/=(const FuncRef &);
573  // @}
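/* Illustrative sketch (not part of Func.h): the compound-assignment operators
 * above expressed as a histogram. 'in' is an assumed Buffer<uint8_t>.
 \code
 Func hist;
 Var i;
 RDom r(0, in.width(), 0, in.height());
 hist(i) = 0;                          // pure definition
 hist(cast<int>(in(r.x, r.y))) += 1;   // sum reduction over the input domain
 \endcode
 */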
574 
575  /* Override the usual assignment operator, so that
576  * f(x, y) = g(x, y) defines f.
577  */
578  Stage operator=(const FuncRef &);
579 
580  /** Use this as a call to the function, and not the left-hand-side
581  * of a definition. Only works for single-output Funcs. */
582  operator Expr() const;
583 
584  /** When a FuncRef refers to a function that provides multiple
585  * outputs, you can access each output as an Expr using
586  * operator[].
587  */
588  FuncTupleElementRef operator[](int) const;
589 
590  /** How many outputs does the function this refers to produce. */
591  size_t size() const;
592 
593  /** What function is this calling? */
594  Internal::Function function() const {
595  return func;
596  }
597 };
598 
599 /** Explicit overloads of min and max for FuncRef. These exist to
600  * disambiguate calls to min on FuncRefs when a user has pulled both
601  * Halide::min and std::min into their namespace. */
602 // @{
603 inline Expr min(const FuncRef &a, const FuncRef &b) {
604  return min(Expr(a), Expr(b));
605 }
606 inline Expr max(const FuncRef &a, const FuncRef &b) {
607  return max(Expr(a), Expr(b));
608 }
609 // @}
610 
611 /** A fragment of front-end syntax of the form f(x, y, z)[index], where x, y,
612  * z are Vars or Exprs. It could be the left-hand side of an update
613  * definition, or it could be a call to a function. We don't know
614  * until we see how this object gets used.
615  */
616 class FuncTupleElementRef {
617  FuncRef func_ref;
618  std::vector<Expr> args; // args to the function
619  int idx; // Index to function outputs
620 
621  /** Helper function that generates a Tuple where element at 'idx' is set
622  * to 'e' and the rests are undef. */
623  Tuple values_with_undefs(const Expr &e) const;
624 
625 public:
626  FuncTupleElementRef(const FuncRef &ref, const std::vector<Expr> &args, int idx);
627 
628  /** Use this as the left-hand-side of an update definition of Tuple
629  * component 'idx' of a Func (see \ref RDom). The function must
630  * already have an initial definition.
631  */
632  Stage operator=(const Expr &e);
633 
634  /** Define a stage that adds the given expression to Tuple component 'idx'
635  * of this Func. The other Tuple components are unchanged. If the expression
636  * refers to some RDom, this performs a sum reduction of the expression over
637  * the domain. The function must already have an initial definition.
638  */
639  Stage operator+=(const Expr &e);
640 
641  /** Define a stage that adds the negative of the given expression to Tuple
642  * component 'idx' of this Func. The other Tuple components are unchanged.
643  * If the expression refers to some RDom, this performs a sum reduction of
644  * the negative of the expression over the domain. The function must already
645  * have an initial definition.
646  */
647  Stage operator-=(const Expr &e);
648 
649  /** Define a stage that multiplies Tuple component 'idx' of this Func by
650  * the given expression. The other Tuple components are unchanged. If the
651  * expression refers to some RDom, this performs a product reduction of
652  * the expression over the domain. The function must already have an
653  * initial definition.
654  */
655  Stage operator*=(const Expr &e);
656 
657  /** Define a stage that divides Tuple component 'idx' of this Func by
658  * the given expression. The other Tuple components are unchanged.
659  * If the expression refers to some RDom, this performs a product
660  * reduction of the inverse of the expression over the domain. The function
661  * must already have an initial definition.
662  */
663  Stage operator/=(const Expr &e);
664 
665  /* Override the usual assignment operator, so that
666  * f(x, y)[index] = g(x, y) defines f.
667  */
668  Stage operator=(const FuncTupleElementRef &e);
669 
670  /** Use this as a call to Tuple component 'idx' of a Func, and not the
671  * left-hand-side of a definition. */
672  operator Expr() const;
673 
674  /** What function is this calling? */
675  Internal::Function function() const {
676  return func_ref.function();
677  }
678 
679  /** Return index to the function outputs. */
680  int index() const {
681  return idx;
682  }
683 };
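/* Illustrative sketch (not part of Func.h): updating individual Tuple
 * components via operator[]. 'in' is an assumed float-valued Func.
 \code
 Func sp("sum_and_product");
 Var x;
 RDom r(0, 100);
 sp(x) = Tuple(0.0f, 1.0f);
 sp(x)[0] += in(x, r);   // sum reduction touches only component 0
 sp(x)[1] *= in(x, r);   // product reduction touches only component 1
 \endcode
 */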
684 
685 namespace Internal {
686 class IRMutator;
687 } // namespace Internal
688 
689 /** Helper class for identifying purpose of an Expr passed to memoize.
690  */
691 class EvictionKey {
692 protected:
693  Expr key;
694  friend class Func;
695 
696 public:
697  explicit EvictionKey(const Expr &expr = Expr())
698  : key(expr) {
699  }
700 };
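/* Illustrative sketch (not part of Func.h), assuming Func::memoize() (declared
 * further down in this header) accepts an EvictionKey: tagging a memoized Func
 * so its cache entries can later be evicted by key. Names are assumed.
 \code
 Param<uint64_t> tag;
 Func g;
 Var x;
 g(x) = expensive(x);                        // 'expensive' is an assumed Func
 g.compute_root().memoize(EvictionKey(tag));
 \endcode
 */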
701 
702 /** A halide function. This class represents one stage in a Halide
703  * pipeline, and is the unit by which we schedule things. By default
704  * they are aggressively inlined, so you are encouraged to make lots
705  * of little functions, rather than storing things in Exprs. */
706 class Func {
707 
708  /** A handle on the internal halide function that this
709  * represents */
710  Internal::Function func;
711 
712  /** When you make a reference to this function with fewer
713  * arguments than it has dimensions, the argument list is bulked
714  * up with 'implicit' vars with canonical names. This lets you
715  * pass around partially applied Halide functions. */
716  // @{
717  std::pair<int, int> add_implicit_vars(std::vector<Var> &) const;
718  std::pair<int, int> add_implicit_vars(std::vector<Expr> &) const;
719  // @}
720 
721  /** The imaging pipeline that outputs this Func alone. */
722  Pipeline pipeline_;
723 
724  /** Get the imaging pipeline that outputs this Func alone,
725  * creating it (and freezing the Func) if necessary. */
726  Pipeline pipeline();
727 
728  // Helper function for recursive reordering support
729  Func &reorder_storage(const std::vector<Var> &dims, size_t start);
730 
731  void invalidate_cache();
732 
733 public:
734  /** Declare a new undefined function with the given name */
735  explicit Func(const std::string &name);
736 
737  /** Declare a new undefined function with the given name.
738  * The function will be constrained to represent Exprs of required_type.
739  * If required_dims is not AnyDims, the function will be constrained to exactly
740  * that many dimensions. */
741  explicit Func(const Type &required_type, int required_dims, const std::string &name);
742 
743  /** Declare a new undefined function with the given name.
744  * If required_types is not empty, the function will be constrained to represent
745  * Tuples of the same arity and types. (If required_types is empty, there is no constraint.)
746  * If required_dims is not AnyDims, the function will be constrained to exactly
747  * that many dimensions. */
748  explicit Func(const std::vector<Type> &required_types, int required_dims, const std::string &name);
749 
750  /** Declare a new undefined function with an
751  * automatically-generated unique name */
752  Func();
753 
754  /** Declare a new function with an automatically-generated unique
755  * name, and define it to return the given expression (which may
756  * not contain free variables). */
757  explicit Func(const Expr &e);
758 
759  /** Construct a new Func to wrap an existing, already-defined
760  * Function object. */
761  explicit Func(Internal::Function f);
762 
763  /** Construct a new Func to wrap a Buffer. */
764  template<typename T, int Dims>
765  HALIDE_NO_USER_CODE_INLINE explicit Func(Buffer<T, Dims> &im)
766  : Func() {
767  (*this)(_) = im(_);
768  }
769 
770  /** Evaluate this function over some rectangular domain and return
771  * the resulting buffer or buffers. Performs compilation if the
772  * Func has not previously been realized and compile_jit has not
773  * been called. If the final stage of the pipeline is on the GPU,
774  * data is copied back to the host before being returned. The
775  * returned Realization should probably be instantly converted to
776  * a Buffer class of the appropriate type. That is, do this:
777  *
778  \code
779  f(x) = sin(x);
780  Buffer<float> im = f.realize(...);
781  \endcode
782  *
783  * If your Func has multiple values, because you defined it using
784  * a Tuple, then casting the result of a realize call to a buffer
785  * or image will produce a run-time error. Instead you should do the
786  * following:
787  *
788  \code
789  f(x) = Tuple(x, sin(x));
790  Realization r = f.realize(...);
791  Buffer<int> im0 = r[0];
792  Buffer<float> im1 = r[1];
793  \endcode
794  *
795  * In Halide formal arguments of a computation are specified using
796  * Param<T> and ImageParam objects in the expressions defining the
797  * computation. Note that this method is not thread-safe, in that
798  * Param<T> and ImageParam are globals shared by all threads; to call
799  * jitted code in a thread-safe manner, use compile_to_callable() instead.
800  *
801  \code
802  Param<int32_t> p(42);
803  ImageParam img(Int(32), 1);
804  f(x) = img(x) + p;
805 
806  Buffer<int32_t> arg_img(10, 10);
807  <fill in arg_img...>
808 
809  Target t = get_jit_target_from_environment();
810  Buffer<int32_t> result = f.realize({10, 10}, t);
811  \endcode
812  *
813  * Alternatively, an initializer list can be used
814  * directly in the realize call to pass this information:
815  *
816  \code
817  Param<int32_t> p(42);
818  ImageParam img(Int(32), 1);
819  f(x) = img(x) + p;
820 
821  Buffer<int32_t> arg_img(10, 10);
822  <fill in arg_img...>
823 
824  Target t = get_jit_target_from_environment();
825  Buffer<int32_t> result = f.realize({10, 10}, t, { { p, 17 }, { img, arg_img } });
826  \endcode
827  *
828  * If the Func cannot be realized into a buffer of the given size
829  * due to scheduling constraints on scattering update definitions,
830  * it will be realized into a larger buffer of the minimum size
831  * possible, and a cropped view at the requested size will be
832  * returned. It is thus not safe to assume the returned buffers
833  * are contiguous in memory. This behavior can be disabled with
834  * the NoBoundsQuery target flag, in which case an error about
835  * writing out of bounds on the output buffer will trigger
836  * instead.
837  *
838  */
839  Realization realize(std::vector<int32_t> sizes = {}, const Target &target = Target());
840 
841  /** Same as above, but takes a custom user-provided context to be
842  * passed to runtime functions. This can be used to pass state to
843  * runtime overrides in a thread-safe manner. A nullptr context is
844  * legal, and is equivalent to calling the variant of realize
845  * that does not take a context. */
846  Realization realize(JITUserContext *context,
847  std::vector<int32_t> sizes = {},
848  const Target &target = Target());
849 
850  /** Evaluate this function into an existing allocated buffer or
851  * buffers. If the buffer is also one of the arguments to the
852  * function, strange things may happen, as the pipeline isn't
853  * necessarily safe to run in-place. If you pass multiple buffers,
854  * they must have matching sizes. This form of realize does *not*
855  * automatically copy data back from the GPU. */
856  void realize(Pipeline::RealizationArg outputs, const Target &target = Target());
857 
858  /** Same as above, but takes a custom user-provided context to be
859  * passed to runtime functions. This can be used to pass state to
860  * runtime overrides in a thread-safe manner. A nullptr context is
861  * legal, and is equivalent to calling the variant of realize
862  * that does not take a context. */
863  void realize(JITUserContext *context,
864  Pipeline::RealizationArg outputs,
865  const Target &target = Target());
866 
867  /** For a given size of output, or a given output buffer,
868  * determine the bounds required of all unbound ImageParams
869  * referenced. Communicates the result by allocating new buffers
870  * of the appropriate size and binding them to the unbound
871  * ImageParams.
872  */
873  // @{
874  void infer_input_bounds(const std::vector<int32_t> &sizes,
875  const Target &target = get_jit_target_from_environment());
876  void infer_input_bounds(Pipeline::RealizationArg outputs,
877  const Target &target = get_jit_target_from_environment());
878  // @}
879 
880  /** Versions of infer_input_bounds that take a custom user context
881  * to pass to runtime functions. */
882  // @{
883  void infer_input_bounds(JITUserContext *context,
884  const std::vector<int32_t> &sizes,
885  const Target &target = get_jit_target_from_environment());
886  void infer_input_bounds(JITUserContext *context,
887  Pipeline::RealizationArg outputs,
888  const Target &target = get_jit_target_from_environment());
889  // @}
890  /** Statically compile this function to llvm bitcode, with the
891  * given filename (which should probably end in .bc), type
892  * signature, and C function name (which defaults to the same name
893  * as this halide function). */
894  //@{
895  void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
896  const Target &target = get_target_from_environment());
897  void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &,
898  const Target &target = get_target_from_environment());
899  // @}
900 
901  /** Statically compile this function to llvm assembly, with the
902  * given filename (which should probably end in .ll), type
903  * signature, and C function name (which defaults to the same name
904  * as this halide function). */
905  //@{
906  void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
907  const Target &target = get_target_from_environment());
908  void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &,
909  const Target &target = get_target_from_environment());
910  // @}
911 
912  /** Statically compile this function to an object file, with the
913  * given filename (which should probably end in .o or .obj), type
914  * signature, and C function name (which defaults to the same name
915  * as this halide function). You probably don't want to use this
916  * directly; call compile_to_static_library or compile_to_file instead. */
917  //@{
918  void compile_to_object(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
919  const Target &target = get_target_from_environment());
920  void compile_to_object(const std::string &filename, const std::vector<Argument> &,
921  const Target &target = get_target_from_environment());
922  // @}
923 
924  /** Emit a header file with the given filename for this
925  * function. The header will define a function with the type
926  * signature given by the second argument, and a name given by the
927  * third. The name defaults to the same name as this halide
928  * function. You don't actually have to have defined this function
929  * yet to call this. You probably don't want to use this directly;
930  * call compile_to_static_library or compile_to_file instead. */
931  void compile_to_header(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name = "",
932  const Target &target = get_target_from_environment());
933 
934  /** Statically compile this function to text assembly equivalent
935  * to the object file generated by compile_to_object. This is
936  * useful for checking what Halide is producing without having to
937  * disassemble anything, or if you need to feed the assembly into
938  * some custom toolchain to produce an object file (e.g. iOS) */
939  //@{
940  void compile_to_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
941  const Target &target = get_target_from_environment());
942  void compile_to_assembly(const std::string &filename, const std::vector<Argument> &,
943  const Target &target = get_target_from_environment());
944  // @}
945 
946  /** Statically compile this function to C source code. This is
947  * useful for providing fallback code paths that will compile on
948  * many platforms. Vectorization will fail, and parallelization
949  * will produce serial code. */
950  void compile_to_c(const std::string &filename,
951  const std::vector<Argument> &,
952  const std::string &fn_name = "",
953  const Target &target = get_target_from_environment());
954 
955  /** Write out an internal representation of lowered code. Useful
956  * for analyzing and debugging scheduling. Can emit html or plain
957  * text. */
958  void compile_to_lowered_stmt(const std::string &filename,
959  const std::vector<Argument> &args,
960  StmtOutputFormat fmt = Text,
961  const Target &target = get_target_from_environment());
962 
963  /** Write out the loop nests specified by the schedule for this
964  * Function. Helpful for understanding what a schedule is
965  * doing. */
966  void print_loop_nest();
967 
968  /** Compile to object file and header pair, with the given
969  * arguments. The name defaults to the same name as this halide
970  * function.
971  */
972  void compile_to_file(const std::string &filename_prefix, const std::vector<Argument> &args,
973  const std::string &fn_name = "",
974  const Target &target = get_target_from_environment());
975 
976  /** Compile to static-library file and header pair, with the given
977  * arguments. The name defaults to the same name as this halide
978  * function.
979  */
980  void compile_to_static_library(const std::string &filename_prefix, const std::vector<Argument> &args,
981  const std::string &fn_name = "",
982  const Target &target = get_target_from_environment());
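/* Illustrative sketch (not part of Func.h) of a typical ahead-of-time build
 * using the calls above; the names 'brighten', 'input' and 'gain' are assumed.
 \code
 ImageParam input(UInt(8), 2);
 Param<float> gain;
 Var x, y;
 Func brighten;
 brighten(x, y) = cast<uint8_t>(min(input(x, y) * gain, 255.0f));
 brighten.compile_to_static_library("brighten", {input, gain}, "brighten",
                                    get_target_from_environment());
 \endcode
 */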
983 
984  /** Compile to static-library file and header pair once for each target;
985  * each resulting function will be considered (in order) via halide_can_use_target_features()
986  * at runtime, with the first appropriate match being selected for subsequent use.
987  * This is typically useful for specializations that may vary unpredictably by machine
988  * (e.g., SSE4.1/AVX/AVX2 on x86 desktop machines).
989  * All targets must have identical arch-os-bits.
990  */
991  void compile_to_multitarget_static_library(const std::string &filename_prefix,
992  const std::vector<Argument> &args,
993  const std::vector<Target> &targets);
994 
995  /** Like compile_to_multitarget_static_library(), except that the object files
996  * are all output as object files (rather than bundled into a static library).
997  *
998  * `suffixes` is an optional list of strings to use for as the suffix for each object
999  * file. If nonempty, it must be the same length as `targets`. (If empty, Target::to_string()
1000  * will be used for each suffix.)
1001  *
1002  * Note that if `targets.size()` > 1, the wrapper code (to select the subtarget)
1003  * will be generated with the filename `${filename_prefix}_wrapper.o`
1004  *
1005  * Note that if `targets.size()` > 1 and `no_runtime` is not specified, the runtime
1006  * will be generated with the filename `${filename_prefix}_runtime.o`
1007  */
1008  void compile_to_multitarget_object_files(const std::string &filename_prefix,
1009  const std::vector<Argument> &args,
1010  const std::vector<Target> &targets,
1011  const std::vector<std::string> &suffixes);
1012 
1013  /** Store an internal representation of lowered code as a self
1014  * contained Module suitable for further compilation. */
1015  Module compile_to_module(const std::vector<Argument> &args, const std::string &fn_name = "",
1016  const Target &target = get_target_from_environment());
1017 
1018  /** Compile and generate multiple target files with single call.
1019  * Deduces target files based on filenames specified in
1020  * output_files map.
1021  */
1022  void compile_to(const std::map<OutputFileType, std::string> &output_files,
1023  const std::vector<Argument> &args,
1024  const std::string &fn_name,
1025  const Target &target = get_target_from_environment());
1026 
1027  /** Eagerly jit compile the function to machine code. This
1028  * normally happens on the first call to realize. If you're
1029  * running your halide pipeline inside time-sensitive code and
1030  * wish to avoid including the time taken to compile a pipeline,
1031  * then you can call this ahead of time. Default is to use the Target
1032  * returned from Halide::get_jit_target_from_environment()
1033  */
1034  void compile_jit(const Target &target = get_jit_target_from_environment());
1035 
1036  /** Get a struct containing the currently set custom functions
1037  * used by JIT. This can be mutated. Changes will take effect the
1038  * next time this Func is realized. */
1039  JITHandlers &jit_handlers();
1040 
1041  /** Eagerly jit compile the function to machine code and return a callable
1042  * struct that behaves like a function pointer. The calling convention
1043  * will exactly match that of an AOT-compiled version of this Func
1044  * with the same Argument list.
1045  */
1046  Callable compile_to_callable(const std::vector<Argument> &args,
1047  const Target &target = get_jit_target_from_environment());
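/* Illustrative sketch (not part of Func.h): a Callable is invoked with the
 * arguments in the same order as the Argument list, followed by the output
 * buffer(s). Names and sizes are assumed.
 \code
 ImageParam img(Int(32), 1);
 Param<int32_t> p;
 Var x;
 Func f;
 f(x) = img(x) + p;
 Callable c = f.compile_to_callable({img, p});
 Buffer<int32_t> in(10), out(10);
 c(in, 42, out);   // thread-safe alternative to realize()
 \endcode
 */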
1048 
1049  /** Add a custom pass to be used during lowering. It is run after
1050  * all other lowering passes. Can be used to verify properties of
1051  * the lowered Stmt, instrument it with extra code, or otherwise
1052  * modify it. The Func takes ownership of the pass, and will call
1053  * delete on it when the Func goes out of scope. So don't pass a
1054  * stack object, or share pass instances between multiple
1055  * Funcs. */
1056  template<typename T>
1057  void add_custom_lowering_pass(T *pass) {
1058  // Template instantiate a custom deleter for this type, then
1059  // wrap in a lambda. The custom deleter lives in user code, so
1060  // that deletion is on the same heap as construction (I hate Windows).
1061  add_custom_lowering_pass(pass, [pass]() { delete_lowering_pass<T>(pass); });
1062  }
1063 
1064  /** Add a custom pass to be used during lowering, with the
1065  * function that will be called to delete it also passed in. Set
1066  * it to nullptr if you wish to retain ownership of the object. */
1067  void add_custom_lowering_pass(Internal::IRMutator *pass, std::function<void()> deleter);
1068 
1069  /** Remove all previously-set custom lowering passes */
1070  void clear_custom_lowering_passes();
1071 
1072  /** Get the custom lowering passes. */
1073  const std::vector<CustomLoweringPass> &custom_lowering_passes();
1074 
1075  /** When this function is compiled, include code that dumps its
1076  * values to a file after it is realized, for the purpose of
1077  * debugging.
1078  *
1079  * If filename ends in ".tif" or ".tiff" (case insensitive) the file
1080  * is in TIFF format and can be read by standard tools. Otherwise, the
1081  * file format is as follows:
1082  *
1083  * All data is in the byte-order of the target platform. First, a
1084  * 20-byte header containing four 32-bit ints, giving the extents
1085  * of the first four dimensions. Dimensions beyond four are
1086  * folded into the fourth. Then, a fifth 32-bit int giving the
1087  * data type of the function. The typecodes are given by: float =
1088  * 0, double = 1, uint8_t = 2, int8_t = 3, uint16_t = 4, int16_t =
1089  * 5, uint32_t = 6, int32_t = 7, uint64_t = 8, int64_t = 9. The
1090  * data follows the header, as a densely packed array of the given
1091  * size and the given type. If given the extension .tmp, this file
1092  * format can be natively read by the program ImageStack. */
1093  void debug_to_file(const std::string &filename);
1094 
1095  /** The name of this function, either given during construction,
1096  * or automatically generated. */
1097  const std::string &name() const;
1098 
1099  /** Get the pure arguments. */
1100  std::vector<Var> args() const;
1101 
1102  /** The right-hand-side value of the pure definition of this
1103  * function. Causes an error if there's no pure definition, or if
1104  * the function is defined to return multiple values. */
1105  Expr value() const;
1106 
1107  /** The values returned by this function. An error if the function
1108  * has not been defined. Returns a Tuple with one element for
1109  * functions defined to return a single value. */
1110  Tuple values() const;
1111 
1112  /** Does this function have at least a pure definition. */
1113  bool defined() const;
1114 
1115  /** Get the left-hand-side of the update definition. An empty
1116  * vector if there's no update definition. If there are
1117  * multiple update definitions for this function, use the
1118  * argument to select which one you want. */
1119  const std::vector<Expr> &update_args(int idx = 0) const;
1120 
1121  /** Get the right-hand-side of an update definition. An error if
1122  * there's no update definition. If there are multiple
1123  * update definitions for this function, use the argument to
1124  * select which one you want. */
1125  Expr update_value(int idx = 0) const;
1126 
1127  /** Get the right-hand-side of an update definition for
1128  * functions that return multiple values. An error if there's no
1129  * update definition. Returns a Tuple with one element for
1130  * functions that return a single value. */
1131  Tuple update_values(int idx = 0) const;
1132 
1133  /** Get the RVars of the reduction domain for an update definition, if there is
1134  * one. */
1135  std::vector<RVar> rvars(int idx = 0) const;
1136 
1137  /** Does this function have at least one update definition? */
1138  bool has_update_definition() const;
1139 
1140  /** How many update definitions does this function have? */
1141  int num_update_definitions() const;
1142 
1143  /** Is this function an external stage? That is, was it defined
1144  * using define_extern? */
1145  bool is_extern() const;
1146 
1147  /** Add an extern definition for this Func. This lets you define a
1148  * Func that represents an external pipeline stage. You can, for
1149  * example, use it to wrap a call to an extern library such as
1150  * fftw. */
1151  // @{
1152  void define_extern(const std::string &function_name,
1153  const std::vector<ExternFuncArgument> &params, Type t,
1154  int dimensionality,
1155  NameMangling mangling = NameMangling::Default,
1156  DeviceAPI device_api = DeviceAPI::Host) {
1157  define_extern(function_name, params, t,
1158  Internal::make_argument_list(dimensionality), mangling,
1159  device_api);
1160  }
1161 
1162  void define_extern(const std::string &function_name,
1163  const std::vector<ExternFuncArgument> &params,
1164  const std::vector<Type> &types, int dimensionality,
1165  NameMangling mangling) {
1166  define_extern(function_name, params, types,
1167  Internal::make_argument_list(dimensionality), mangling);
1168  }
1169 
1170  void define_extern(const std::string &function_name,
1171  const std::vector<ExternFuncArgument> &params,
1172  const std::vector<Type> &types, int dimensionality,
1173  NameMangling mangling = NameMangling::Default,
1174  DeviceAPI device_api = DeviceAPI::Host) {
1175  define_extern(function_name, params, types,
1176  Internal::make_argument_list(dimensionality), mangling,
1177  device_api);
1178  }
1179 
1180  void define_extern(const std::string &function_name,
1181  const std::vector<ExternFuncArgument> &params, Type t,
1182  const std::vector<Var> &arguments,
1183  NameMangling mangling = NameMangling::Default,
1184  DeviceAPI device_api = DeviceAPI::Host) {
1185  define_extern(function_name, params, std::vector<Type>{t}, arguments,
1186  mangling, device_api);
1187  }
1188 
1189  void define_extern(const std::string &function_name,
1190  const std::vector<ExternFuncArgument> &params,
1191  const std::vector<Type> &types,
1192  const std::vector<Var> &arguments,
1193  NameMangling mangling = NameMangling::Default,
1194  DeviceAPI device_api = DeviceAPI::Host);
1195  // @}
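/* Illustrative sketch (not part of Func.h): declaring an extern stage with no
 * inputs that fills a 2-D float buffer. "generate_data" is an assumed
 * extern "C" function following the extern-stage calling convention.
 \code
 Func source;
 source.define_extern("generate_data", {}, Float(32), 2);
 source.compute_root();   // extern stages cannot be inlined
 \endcode
 */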
1196 
1197  /** Get the type(s) of the outputs of this Func.
1198  *
1199  * It is not legal to call type() unless the Func has non-Tuple elements.
1200  *
1201  * If the Func isn't yet defined, and was not specified with required types,
1202  * a runtime error will occur.
1203  *
1204  * If the Func isn't yet defined, but *was* specified with required types,
1205  * the requirements will be returned. */
1206  // @{
1207  const Type &type() const;
1208  const std::vector<Type> &types() const;
1209  // @}
1210 
1211  /** Get the number of outputs of this Func. Corresponds to the
1212  * size of the Tuple this Func was defined to return.
1213  * If the Func isn't yet defined, but was specified with required types,
1214  * the number of outputs specified in the requirements will be returned. */
1215  int outputs() const;
1216 
1217  /** Get the name of the extern function called for an extern
1218  * definition. */
1219  const std::string &extern_function_name() const;
1220 
1221  /** The dimensionality (number of arguments) of this function.
1222  * If the Func isn't yet defined, but was specified with required dimensionality,
1223  * the dimensionality specified in the requirements will be returned. */
1224  int dimensions() const;
1225 
1226  /** Construct either the left-hand-side of a definition, or a call
1227  * to a function that happens to only contain vars as
1228  * arguments. If the function has already been defined, and fewer
1229  * arguments are given than the function has dimensions, then
1230  * enough implicit vars are added to the end of the argument list
1231  * to make up the difference (see \ref Var::implicit) */
1232  // @{
1233  FuncRef operator()(std::vector<Var>) const;
1234 
1235  template<typename... Args>
1236  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, FuncRef>::type
1237  operator()(Args &&...args) const {
1238  std::vector<Var> collected_args{std::forward<Args>(args)...};
1239  return this->operator()(collected_args);
1240  }
1241  // @}
1242 
1243  /** Either calls to the function, or the left-hand-side of
1244  * an update definition (see \ref RDom). If the function has
1245  * already been defined, and fewer arguments are given than the
1246  * function has dimensions, then enough implicit vars are added to
1247  * the end of the argument list to make up the difference. (see
1248  * \ref Var::implicit)*/
1249  // @{
1250  FuncRef operator()(std::vector<Expr>) const;
1251 
1252  template<typename... Args>
1253  HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Expr, Args...>::value, FuncRef>::type
1254  operator()(const Expr &x, Args &&...args) const {
1255  std::vector<Expr> collected_args{x, std::forward<Args>(args)...};
1256  return (*this)(collected_args);
1257  }
1258  // @}
1259 
1260  /** Creates and returns a new identity Func that wraps this Func. During
1261  * compilation, Halide replaces all calls to this Func done by 'f'
1262  * with calls to the wrapper. If this Func is already wrapped for
1263  * use in 'f', will return the existing wrapper.
1264  *
1265  * For example, g.in(f) would rewrite a pipeline like this:
1266  \code
1267  g(x, y) = ...
1268  f(x, y) = ... g(x, y) ...
1269  \endcode
1270  * into a pipeline like this:
1271  \code
1272  g(x, y) = ...
1273  g_wrap(x, y) = g(x, y)
1274  f(x, y) = ... g_wrap(x, y)
1275  \endcode
1276  *
1277  * This has a variety of uses. You can use it to schedule this
1278  * Func differently in the different places it is used:
1279  \code
1280  g(x, y) = ...
1281  f1(x, y) = ... g(x, y) ...
1282  f2(x, y) = ... g(x, y) ...
1283  g.in(f1).compute_at(f1, y).vectorize(x, 8);
1284  g.in(f2).compute_at(f2, x).unroll(x);
1285  \endcode
1286  *
1287  * You can also use it to stage loads from this Func via some
1288  * intermediate buffer (perhaps on the stack as in
1289  * test/performance/block_transpose.cpp, or in shared GPU memory
1290  * as in test/performance/wrap.cpp). In this case we compute the
1291  * wrapper at tiles of the consuming Funcs like so:
1292  \code
1293  g.compute_root()...
1294  g.in(f).compute_at(f, tiles)...
1295  \endcode
1296  *
1297  * Func::in() can also be used to compute pieces of a Func into a
1298  * smaller scratch buffer (perhaps on the GPU) and then copy them
1299  * into a larger output buffer one tile at a time. See
1300  * apps/interpolate/interpolate.cpp for an example of this. In
1301  * this case we compute the Func at tiles of its own wrapper:
1302  \code
1303  f.in(g).compute_root().gpu_tile(...)...
1304  f.compute_at(f.in(g), tiles)...
1305  \endcode
1306  *
1307  * A similar use of Func::in() is wrapping Funcs with multiple update
1308  * stages in a pure wrapper. The following code:
1309  \code
1310  f(x, y) = x + y;
1311  f(x, y) += 5;
1312  g(x, y) = f(x, y);
1313  f.compute_root();
1314  \endcode
1315  *
1316  * Is equivalent to:
1317  \code
1318  for y:
1319    for x:
1320      f(x, y) = x + y;
1321  for y:
1322    for x:
1323      f(x, y) += 5
1324  for y:
1325    for x:
1326      g(x, y) = f(x, y)
1327  \endcode
1328  * Using Func::in(), we can write:
1329  \code
1330  f(x, y) = x + y;
1331  f(x, y) += 5;
1332  g(x, y) = f(x, y);
1333  f.in(g).compute_root();
1334  \endcode
1335  * which instead produces:
1336  \code
1337  for y:
1338    for x:
1339      f(x, y) = x + y;
1340      f(x, y) += 5
1341      f_wrap(x, y) = f(x, y)
1342  for y:
1343    for x:
1344      g(x, y) = f_wrap(x, y)
1345  \endcode
1346  */
1347  Func in(const Func &f);
1348 
1349  /** Create and return an identity wrapper shared by all the Funcs in
1350  * 'fs'. If any of the Funcs in 'fs' already have a custom wrapper,
1351  * this will throw an error. */
1352  Func in(const std::vector<Func> &fs);
1353 
1354  /** Create and return a global identity wrapper, which wraps all calls to
1355  * this Func by any other Func. If a global wrapper already exists,
1356  * returns it. The global identity wrapper is only used by callers for
1357  * which no custom wrapper has been specified.
1358  */
1359  Func in();
1360 
1361  /** Similar to \ref Func::in; however, instead of replacing the call to
1362  * this Func with an identity Func that refers to it, this replaces the
1363  * call with a clone of this Func.
1364  *
1365  * For example, f.clone_in(g) would rewrite a pipeline like this:
1366  \code
1367  f(x, y) = x + y;
1368  g(x, y) = f(x, y) + 2;
1369  h(x, y) = f(x, y) - 3;
1370  \endcode
1371  * into a pipeline like this:
1372  \code
1373  f(x, y) = x + y;
1374  f_clone(x, y) = x + y;
1375  g(x, y) = f_clone(x, y) + 2;
1376  h(x, y) = f(x, y) - 3;
1377  \endcode
1378  *
1379  */
1380  //@{
1381  Func clone_in(const Func &f);
1382  Func clone_in(const std::vector<Func> &fs);
1383  //@}
1384 
1385  /** Declare that this function should be implemented by a call to
1386  * halide_buffer_copy with the given target device API. Asserts
1387  * that the Func has a pure definition which is a simple call to a
1388  * single input, and no update definitions. The wrapper Funcs
1389  * returned by in() are suitable candidates. Consumes all pure
1390  * variables, and rewrites the Func to have an extern definition
1391  * that calls halide_buffer_copy. */
1392  Func copy_to_device(DeviceAPI d = DeviceAPI::Default_GPU);
1393 
1394  /** Declare that this function should be implemented by a call to
1395  * halide_buffer_copy with a NULL target device API. Equivalent to
1396  * copy_to_device(DeviceAPI::Host). Asserts that the Func has a
1397  * pure definition which is a simple call to a single input, and
1398  * no update definitions. The wrapper Funcs returned by in() are
1399  * suitable candidates. Consumes all pure variables, and rewrites
1400  * the Func to have an extern definition that calls
1401  * halide_buffer_copy.
1402  *
1403  * Note that if the source Func is already valid in host memory,
1404  * this compiles to code that does the minimum number of calls to
1405  * memcpy.
1406  */
1407  Func copy_to_host();
1408 
1409  /** Split a dimension into inner and outer subdimensions with the
1410  * given names, where the inner dimension iterates from 0 to
1411  * factor-1. The inner and outer subdimensions can then be dealt
1412  * with using the other scheduling calls. It's ok to reuse the old
1413  * variable name as either the inner or outer variable. The final
1414  * argument specifies how the tail should be handled if the split
1415  * factor does not provably divide the extent. */
1416  Func &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1417 
1418  /** Join two dimensions into a single fused dimension. The fused dimension
1419  * covers the product of the extents of the inner and outer dimensions
1420  * given. The loop type (e.g. parallel, vectorized) of the resulting fused
1421  * dimension is inherited from the first argument. */
1422  Func &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
1423 
1424  /** Mark a dimension to be traversed serially. This is the default. */
1425  Func &serial(const VarOrRVar &var);
1426 
1427  /** Mark a dimension to be traversed in parallel */
1428  Func &parallel(const VarOrRVar &var);
1429 
1430  /** Split a dimension by the given task_size, and then parallelize the
1431  * outer dimension. This creates parallel tasks that have size
1432  * task_size. After this call, var refers to the outer dimension of
1433  * the split. The inner dimension has a new anonymous name. If you
1434  * wish to mutate it, or schedule with respect to it, do the split
1435  * manually. */
1436  Func &parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail = TailStrategy::Auto);
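 /* A minimal usage sketch (f, x and y are illustrative names): parallelize
  * the rows of f in tasks of 16 rows each; afterwards y refers to the
  * outer (task) loop.
 \code
 Func f;
 Var x, y;
 f(x, y) = x + y;
 f.parallel(y, 16);
 \endcode
 */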
1437 
1438  /** Mark a dimension to be computed all-at-once as a single
1439  * vector. The dimension should have constant extent -
1440  * e.g. because it is the inner dimension following a split by a
1441  * constant factor. For most uses of vectorize you want the two
1442  * argument form. The variable to be vectorized should be the
1443  * innermost one. */
1444  Func &vectorize(const VarOrRVar &var);
1445 
1446  /** Mark a dimension to be completely unrolled. The dimension
1447  * should have constant extent - e.g. because it is the inner
1448  * dimension following a split by a constant factor. For most uses
1449  * of unroll you want the two-argument form. */
1450  Func &unroll(const VarOrRVar &var);
1451 
1452  /** Split a dimension by the given factor, then vectorize the
1453  * inner dimension. This is how you vectorize a loop of unknown
1454  * size. The variable to be vectorized should be the innermost
1455  * one. After this call, var refers to the outer dimension of the
1456  * split. 'factor' must be an integer. */
1457  Func &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1458 
1459  /** Split a dimension by the given factor, then unroll the inner
1460  * dimension. This is how you unroll a loop of unknown size by
1461  * some constant factor. After this call, var refers to the outer
1462  * dimension of the split. 'factor' must be an integer. */
1463  Func &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1464 
1465  /** Set the loop partition policy. Loop partitioning can be useful to
1466  * optimize boundary conditions (such as clamp_edge). Loop partitioning
1467  * splits a for loop into three for loops: a prologue, a steady-state,
1468  * and an epilogue.
1469  * The default policy is Auto. */
1470  Func &partition(const VarOrRVar &var, Partition partition_policy);
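 /* A minimal usage sketch (f and x are illustrative names): disable loop
  * partitioning for x, trading away the boundary-condition optimization
  * for smaller code.
 \code
 Func f;
 Var x;
 f(x) = x;
 f.partition(x, Partition::Never);
 \endcode
 */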
1471 
1472  /** Set the loop partition policy to Never for a vector of Vars and
1473  * RVars. */
1474  Func &never_partition(const std::vector<VarOrRVar> &vars);
1475 
1476  /** Set the loop partition policy to Never for some number of Vars and RVars. */
1477  template<typename... Args>
1478  HALIDE_NO_USER_CODE_INLINE std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
1479  never_partition(const VarOrRVar &x, Args &&...args) {
1480  std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
1481  return never_partition(collected_args);
1482  }
1483 
1484  /** Set the loop partition policy to Never for all Vars and RVars of the
1485  * initial definition of the Func. It must be called separately on any
1486  * update definitions. */
1487  Func &never_partition_all();
1488 
1489  /** Set the loop partition policy to Always for a vector of Vars and
1490  * RVars. */
1491  Func &always_partition(const std::vector<VarOrRVar> &vars);
1492 
1493  /** Set the loop partition policy to Always for some number of Vars and RVars. */
1494  template<typename... Args>
1495  HALIDE_NO_USER_CODE_INLINE std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
1496  always_partition(const VarOrRVar &x, Args &&...args) {
1497  std::vector<VarOrRVar> collected_args{x, std::forward<Args>(args)...};
1498  return always_partition(collected_args);
1499  }
1500 
1501  /** Set the loop partition policy to Always for all Vars and RVars of the
1502  * initial definition of the Func. It must be called separately on any
1503  * update definitions. */
1504  Func &always_partition_all();
1505 
1506  /** Statically declare that the range over which a function should
1507  * be evaluated is given by the second and third arguments. This
1508  * can let Halide perform some optimizations. E.g. if you know
1509  * there are going to be 4 color channels, you can completely
1510  * vectorize the color channel dimension without the overhead of
1511  * splitting it up. If bounds inference decides that it requires
1512  * more of this function than the bounds you have stated, a
1513  * runtime error will occur when you try to run your pipeline. */
1514  Func &bound(const Var &var, Expr min, Expr extent);
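 /* A minimal usage sketch (f, x, y and c are illustrative names): promise
  * that the channel dimension has exactly 4 values so it can be vectorized
  * without splitting.
 \code
 Func f;
 Var x, y, c;
 f(x, y, c) = x + y + c;
 f.bound(c, 0, 4).vectorize(c);
 \endcode
 */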
1515 
1516  /** Statically declare the range over which the function will be
1517  * evaluated in the general case. This provides a basis for the auto
1518  * scheduler to make trade-offs and scheduling decisions. The auto
1519  * generated schedules might break when the sizes of the dimensions are
1520  * very different from the estimates specified. These estimates are used
1521  * only by the auto scheduler if the function is a pipeline output. */
1522  Func &set_estimate(const Var &var, const Expr &min, const Expr &extent);
1523 
1524  /** Set (min, extent) estimates for all dimensions in the Func
1525  * at once; this is equivalent to calling `set_estimate(args()[n], min, extent)`
1526  * repeatedly, but slightly terser. The size of the estimates vector
1527  * must match the dimensionality of the Func. */
1528  Func &set_estimates(const Region &estimates);
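 /* A minimal usage sketch (f, x and y are illustrative names): tell the
  * auto-scheduler that the output is typically about 1920x1080.
 \code
 Func f;
 Var x, y;
 f(x, y) = x + y;
 f.set_estimates({{0, 1920}, {0, 1080}});
 \endcode
 */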
1529 
1530  /** Expand the region computed so that the min coordinate is
1531  * congruent to 'remainder' modulo 'modulus', and the extent is a
1532  * multiple of 'modulus'. For example, f.align_bounds(x, 2) forces
1533  * the min and extent realized to be even, and calling
1534  * f.align_bounds(x, 2, 1) forces the min to be odd and the extent
1535  * to be even. The region computed always contains the region that
1536  * would have been computed without this directive, so no
1537  * assertions are injected.
1538  */
1539  Func &align_bounds(const Var &var, Expr modulus, Expr remainder = 0);
1540 
1541  /** Expand the region computed so that the extent is a
1542  * multiple of 'modulus'. For example, f.align_extent(x, 2) forces
1543  * the extent realized to be even. The region computed always contains the
1544  * region that would have been computed without this directive, so no
1545  * assertions are injected. (This is essentially equivalent to align_bounds(),
1546  * but always leaving the min untouched.)
1547  */
1548  Func &align_extent(const Var &var, Expr modulus);
1549 
1550  /** Bound the extent of a Func's realization, but not its
1551  * min. This means the dimension can be unrolled or vectorized
1552  * even when its min is not fixed (for example because it is
1553  * compute_at tiles of another Func). This can also be useful for
1554  * forcing a function's allocation to be a fixed size, which often
1555  * means it can go on the stack. */
1556  Func &bound_extent(const Var &var, Expr extent);
1557 
1558  /** Split two dimensions at once by the given factors, and then
1559  * reorder the resulting dimensions to be xi, yi, xo, yo from
1560  * innermost outwards. This gives a tiled traversal. */
1561  Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1562  const VarOrRVar &xo, const VarOrRVar &yo,
1563  const VarOrRVar &xi, const VarOrRVar &yi,
1564  const Expr &xfactor, const Expr &yfactor,
1565  TailStrategy tail = TailStrategy::Auto);
1566 
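 /* A minimal usage sketch (f and the Vars are illustrative names): traverse
  * f in 64x64 tiles, run tile rows in parallel, and vectorize within each
  * tile row.
 \code
 Func f;
 Var x, y, xo, yo, xi, yi;
 f(x, y) = x + y;
 f.tile(x, y, xo, yo, xi, yi, 64, 64)
  .parallel(yo)
  .vectorize(xi, 8);
 \endcode
 */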
1567  /** A shorter form of tile, which reuses the old variable names as
1568  * the new outer dimensions */
1569  Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1570  const VarOrRVar &xi, const VarOrRVar &yi,
1571  const Expr &xfactor, const Expr &yfactor,
1572  TailStrategy tail = TailStrategy::Auto);
1573 
1574  /** A more general form of tile, which defines tiles of any dimensionality. */
1575  Func &tile(const std::vector<VarOrRVar> &previous,
1576  const std::vector<VarOrRVar> &outers,
1577  const std::vector<VarOrRVar> &inners,
1578  const std::vector<Expr> &factors,
1579  const std::vector<TailStrategy> &tails);
1580 
1581  /** The generalized tile, with a single tail strategy to apply to all vars. */
1582  Func &tile(const std::vector<VarOrRVar> &previous,
1583  const std::vector<VarOrRVar> &outers,
1584  const std::vector<VarOrRVar> &inners,
1585  const std::vector<Expr> &factors,
1586  TailStrategy tail = TailStrategy::Auto);
1587 
1588  /** Generalized tiling, reusing the previous names as the outer names. */
1589  Func &tile(const std::vector<VarOrRVar> &previous,
1590  const std::vector<VarOrRVar> &inners,
1591  const std::vector<Expr> &factors,
1592  TailStrategy tail = TailStrategy::Auto);
1593 
1594  /** Reorder variables to have the given nesting order, from
1595  * innermost out */
1596  Func &reorder(const std::vector<VarOrRVar> &vars);
1597 
1598  template<typename... Args>
1599  HALIDE_NO_USER_CODE_INLINE std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
1600  reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
1601  std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
1602  return reorder(collected_args);
1603  }
1604 
1605  /** Rename a dimension. Equivalent to split with an inner size of one. */
1606  Func &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
1607 
1608  /** Specify that race conditions are permitted for this Func,
1609  * which enables parallelizing over RVars even when Halide cannot
1610  * prove that it is safe to do so. Use this with great caution,
1611  * and only if you can prove to yourself that this is safe, as it
1612  * may result in a non-deterministic routine that returns
1613  * different values at different times or on different machines. */
1614  Func &allow_race_conditions();
1615 
1616  /** Issue atomic updates for this Func. This allows parallelization
1617  * on associative RVars. The function throws a compile error when
1618  * Halide fails to prove associativity. Use override_associativity_test
1619  * to disable the associativity test if you believe the function is
1620  * associative or the order of reduction variable execution does not
1621  * matter.
1622  * Halide compiles this into hardware atomic operations whenever possible,
1623  * and falls back to a mutex lock per storage element if it is impossible
1624  * to atomically update.
1625  * There are three possible outcomes of the compiled code:
1626  * atomic add, compare-and-swap loop, and mutex lock.
1627  * For example:
1628  *
1629  * hist(x) = 0;
1630  * hist(im(r)) += 1;
1631  * hist.compute_root();
1632  * hist.update().atomic().parallel(r);
1633  *
1634  * will be compiled to atomic add operations.
1635  *
1636  * hist(x) = 0;
1637  * hist(im(r)) = min(hist(im(r)) + 1, 100);
1638  * hist.compute_root();
1639  * hist.update().atomic().parallel(r);
1640  *
1641  * will be compiled to compare-and-swap loops.
1642  *
1643  * arg_max() = {0, im(0)};
1644  * Expr old_index = arg_max()[0];
1645  * Expr old_max = arg_max()[1];
1646  * Expr new_index = select(old_max < im(r), r, old_index);
1647  * Expr new_max = max(im(r), old_max);
1648  * arg_max() = {new_index, new_max};
1649  * arg_max.compute_root();
1650  * arg_max.update().atomic().parallel(r);
1651  *
1652  * will be compiled to updates guarded by a mutex lock,
1653  * since it is impossible to atomically update two different locations.
1654  *
1655  * Currently the atomic operation is supported by x86, CUDA, and OpenCL backends.
1656  * Compiling to other backends results in a compile error.
1657  * If an operation is compiled into a mutex lock, and is vectorized or is
1658  * compiled to CUDA or OpenCL, it also results in a compile error,
1659  * since per-element mutex lock on vectorized operation leads to a
1660  * deadlock.
1661  * Vectorization of predicated RVars (through rdom.where()) on CPU
1662  * is also not yet supported (see https://github.com/halide/Halide/issues/4298).
1663  * 8-bit and 16-bit atomics on GPU are also not supported. */
1664  Func &atomic(bool override_associativity_test = false);
1665 
1666  /** Specialize a Func. This creates a special-case version of the
1667  * Func where the given condition is true. The most effective
1668  * conditions are those of the form param == value, and boolean
1669  * Params. Consider a simple example:
1670  \code
1671  f(x) = x + select(cond, 0, 1);
1672  f.compute_root();
1673  \endcode
1674  * This is equivalent to:
1675  \code
1676  for (int x = 0; x < width; x++) {
1677  f[x] = x + (cond ? 0 : 1);
1678  }
1679  \endcode
1680  * Adding the scheduling directive:
1681  \code
1682  f.specialize(cond)
1683  \endcode
1684  * makes it equivalent to:
1685  \code
1686  if (cond) {
1687  for (int x = 0; x < width; x++) {
1688  f[x] = x;
1689  }
1690  } else {
1691  for (int x = 0; x < width; x++) {
1692  f[x] = x + 1;
1693  }
1694  }
1695  \endcode
1696  * Note that the inner loops have been simplified. In the first
1697  * path Halide knows that cond is true, and in the second path
1698  * Halide knows that it is false.
1699  *
1700  * The specialized version gets its own schedule, which inherits
1701  * every directive made about the parent Func's schedule so far
1702  * except for its specializations. This method returns a handle to
1703  * the new schedule. If you wish to retrieve the specialized
1704  * sub-schedule again later, you can call this method with the
1705  * same condition. Consider the following example of scheduling
1706  * the specialized version:
1707  *
1708  \code
1709  f(x) = x;
1710  f.compute_root();
1711  f.specialize(width > 1).unroll(x, 2);
1712  \endcode
1713  * Assuming for simplicity that width is even, this is equivalent to:
1714  \code
1715  if (width > 1) {
1716  for (int x = 0; x < width/2; x++) {
1717  f[2*x] = 2*x;
1718  f[2*x + 1] = 2*x + 1;
1719  }
1720  } else {
1721  for (int x = 0; x < width; x++) {
1722  f[x] = x;
1723  }
1724  }
1725  \endcode
1726  * For this case, it may be better to schedule the un-specialized
1727  * case instead:
1728  \code
1729  f(x) = x;
1730  f.compute_root();
1731  f.specialize(width == 1); // Creates a copy of the schedule so far.
1732  f.unroll(x, 2); // Only applies to the unspecialized case.
1733  \endcode
1734  * This is equivalent to:
1735  \code
1736  if (width == 1) {
1737  f[0] = 0;
1738  } else {
1739  for (int x = 0; x < width/2; x++) {
1740  f[2*x] = 2*x;
1741  f[2*x + 1] = 2*x + 1;
1742  }
1743  }
1744  \endcode
1745  * This can be a good way to write a pipeline that splits,
1746  * vectorizes, or tiles, but can still handle small inputs.
1747  *
1748  * If a Func has several specializations, the first matching one
1749  * will be used, so the order in which you define specializations
1750  * is significant. For example:
1751  *
1752  \code
1753  f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1754  f.specialize(cond1);
1755  f.specialize(cond2);
1756  \endcode
1757  * is equivalent to:
1758  \code
1759  if (cond1) {
1760  for (int x = 0; x < width; x++) {
1761  f[x] = x + a - (cond2 ? c : d);
1762  }
1763  } else if (cond2) {
1764  for (int x = 0; x < width; x++) {
1765  f[x] = x + b - c;
1766  }
1767  } else {
1768  for (int x = 0; x < width; x++) {
1769  f[x] = x + b - d;
1770  }
1771  }
1772  \endcode
1773  *
1774  * Specializations may in turn be specialized, which creates a
1775  * nested if statement in the generated code.
1776  *
1777  \code
1778  f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1779  f.specialize(cond1).specialize(cond2);
1780  \endcode
1781  * This is equivalent to:
1782  \code
1783  if (cond1) {
1784  if (cond2) {
1785  for (int x = 0; x < width; x++) {
1786  f[x] = x + a - c;
1787  }
1788  } else {
1789  for (int x = 0; x < width; x++) {
1790  f[x] = x + a - d;
1791  }
1792  }
1793  } else {
1794  for (int x = 0; x < width; x++) {
1795  f[x] = x + b - (cond2 ? c : d);
1796  }
1797  }
1798  \endcode
1799  * To create a 4-way if statement that simplifies away all of the
1800  * ternary operators above, you could say:
1801  \code
1802  f.specialize(cond1).specialize(cond2);
1803  f.specialize(cond2);
1804  \endcode
1805  * or
1806  \code
1807  f.specialize(cond1 && cond2);
1808  f.specialize(cond1);
1809  f.specialize(cond2);
1810  \endcode
1811  *
1812  * Any prior Func which is compute_at some variable of this Func
1813  * gets separately included in all paths of the generated if
1814  * statement. The Var in the compute_at call must exist in all
1815  * paths, but it may have been generated via a different path of
1816  * splits, fuses, and renames. This can be used somewhat
1817  * creatively. Consider the following code:
1818  \code
1819  g(x, y) = 8*x;
1820  f(x, y) = g(x, y) + 1;
1821  f.compute_root().specialize(cond);
1822  Var g_loop;
1823  f.specialize(cond).rename(y, g_loop);
1824  f.rename(x, g_loop);
1825  g.compute_at(f, g_loop);
1826  \endcode
1827  * When cond is true, this is equivalent to g.compute_at(f,y).
1828  * When it is false, this is equivalent to g.compute_at(f,x).
1829  */
1830  Stage specialize(const Expr &condition);
1831 
1832  /** Add a specialization to a Func that always terminates execution
1833  * with a call to halide_error(). By itself, this is of limited use,
1834  * but can be useful to terminate chains of specialize() calls where
1835  * no "default" case is expected (thus avoiding unnecessary code generation).
1836  *
1837  * For instance, say we want to optimize a pipeline to process images
1838  * in planar and interleaved format; we might typically do something like:
1839  \code
1840  ImageParam im(UInt(8), 3);
1841  Func f = do_something_with(im);
1842  f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1843  f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1844  \endcode
1845  * This code will vectorize along rows for the planar case, and across pixel
1846  * components for the interleaved case... but there is an implicit "else"
1847  * for the unhandled cases, which generates unoptimized code. If we never
1848  * anticipate passing any other sort of images to this, we can streamline
1849  * our code by adding specialize_fail():
1850  \code
1851  ImageParam im(UInt(8), 3);
1852  Func f = do_something_with(im);
1853  f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1854  f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1855  f.specialize_fail("Unhandled image format");
1856  \endcode
1857  * Conceptually, this produces code like:
1858  \code
1859  if (im.dim(0).stride() == 1) {
1860  do_something_planar();
1861  } else if (im.dim(2).stride() == 1) {
1862  do_something_interleaved();
1863  } else {
1864  halide_error("Unhandled image format");
1865  }
1866  \endcode
1867  *
1868  * Note that calling specialize_fail() terminates the specialization chain
1869  * for a given Func; you cannot create new specializations for the Func
1870  * afterwards (though you can retrieve handles to previous specializations).
1871  */
1872  void specialize_fail(const std::string &message);
1873 
1874  /** Tell Halide that the following dimensions correspond to GPU
1875  * thread indices. This is useful if you compute a producer
1876  * function within the block indices of a consumer function, and
1877  * want to control how that function's dimensions map to GPU
1878  * threads. If the selected target is not an appropriate GPU, this
1879  * just marks those dimensions as parallel. */
1880  // @{
1881  Func &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1882  Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1883  Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1884  // @}
1885 
1886  /** The given dimension corresponds to the lanes in a GPU
1887  * warp. GPU warp lanes are distinguished from GPU threads by the
1888  * fact that all warp lanes run together in lockstep, which
1889  * permits lightweight communication of data from one lane to
1890  * another. */
1891  Func &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1892 
1893  /** Tell Halide to run this stage using a single gpu thread and
1894  * block. This is not an efficient use of your GPU, but it can be
1895  * useful to avoid copy-back for intermediate update stages that
1896  * touch a very small part of your Func. */
1897  Func &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);
1898 
1899  /** Tell Halide that the following dimensions correspond to GPU
1900  * block indices. This is useful for scheduling stages that will
1901  * run serially within each GPU block. If the selected target is
1902  * not ptx, this just marks those dimensions as parallel. */
1903  // @{
1904  Func &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1905  Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1906  Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1907  // @}
1908 
1909  /** Tell Halide that the following dimensions correspond to GPU
1910  * block indices and thread indices. If the selected target is not
1911  * ptx, these just mark the given dimensions as parallel. The
1912  * dimensions are consumed by this call, so do all other
1913  * unrolling, reordering, etc first. */
1914  // @{
1915  Func &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1916  Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
1917  const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1918  Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
1919  const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1920  // @}
1921 
1922  /** Short-hand for tiling a domain and mapping the tile indices
1923  * to GPU block indices and the coordinates within each tile to
1924  * GPU thread indices. Consumes the variables given, so do all
1925  * other scheduling first. */
1926  // @{
1927  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
1928  TailStrategy tail = TailStrategy::Auto,
1929  DeviceAPI device_api = DeviceAPI::Default_GPU);
1930 
1931  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
1932  TailStrategy tail = TailStrategy::Auto,
1933  DeviceAPI device_api = DeviceAPI::Default_GPU);
1934  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1935  const VarOrRVar &bx, const VarOrRVar &by,
1936  const VarOrRVar &tx, const VarOrRVar &ty,
1937  const Expr &x_size, const Expr &y_size,
1938  TailStrategy tail = TailStrategy::Auto,
1939  DeviceAPI device_api = DeviceAPI::Default_GPU);
1940 
1941  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1942  const VarOrRVar &tx, const VarOrRVar &ty,
1943  const Expr &x_size, const Expr &y_size,
1944  TailStrategy tail = TailStrategy::Auto,
1945  DeviceAPI device_api = DeviceAPI::Default_GPU);
1946 
1947  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1948  const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
1949  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1950  const Expr &x_size, const Expr &y_size, const Expr &z_size,
1951  TailStrategy tail = TailStrategy::Auto,
1952  DeviceAPI device_api = DeviceAPI::Default_GPU);
1953  Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1954  const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1955  const Expr &x_size, const Expr &y_size, const Expr &z_size,
1956  TailStrategy tail = TailStrategy::Auto,
1957  DeviceAPI device_api = DeviceAPI::Default_GPU);
1958  // @}
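 /* A minimal usage sketch (f and the Vars are illustrative names): map
  * 16x16 tiles of f to GPU blocks and the coordinates within each tile to
  * GPU threads; on a non-GPU target the loops are simply marked parallel.
 \code
 Func f;
 Var x, y, xo, yo, xi, yi;
 f(x, y) = x + y;
 f.gpu_tile(x, y, xo, yo, xi, yi, 16, 16);
 \endcode
 */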
1959 
1960  /** Schedule for execution on Hexagon. When a loop is marked with
1961  * Hexagon, that loop is executed on a Hexagon DSP. */
1962  Func &hexagon(const VarOrRVar &x = Var::outermost());
1963 
1964  /** Prefetch data written to or read from a Func or an ImageParam by a
1965  * subsequent loop iteration, at an optionally specified iteration offset. You may
1966  * specify different vars for the location of the prefetch() instruction
1967  * vs. the location that is being prefetched:
1968  *
1969  * - the first var specified, 'at', indicates the loop in which the prefetch will be placed
1970  * - the second var specified, 'from', determines the var used to find the bounds to prefetch
1971  * (in conjunction with 'offset')
1972  *
1973  * If 'at' and 'from' are distinct vars, then 'from' must be at a nesting level outside 'at.'
1974  * Note that the value for 'offset' applies only to 'from', not 'at'.
1975  *
1976  * The final argument specifies how prefetch of region outside bounds
1977  * should be handled.
1978  *
1979  * For example, consider this pipeline:
1980  \code
1981  Func f, g, h;
1982  Var x, y, z;
1983  f(x, y) = x + y;
1984  g(x, y) = 2 * f(x, y);
1985  h(x, y) = 3 * f(x, y);
1986  \endcode
1987  *
1988  * The following schedule:
1989  \code
1990  f.compute_root();
1991  g.prefetch(f, x, x, 2, PrefetchBoundStrategy::NonFaulting);
1992  h.prefetch(f, x, y, 2, PrefetchBoundStrategy::NonFaulting);
1993  \endcode
1994  *
1995  * will inject a prefetch call at the innermost loop of 'g' and 'h' and generate
1996  * the following loop nest:
1997  \code
1998  for y = ...
1999  for x = ...
2000  f(x, y) = x + y
2001  for y = ..
2002  for x = ...
2003  prefetch(&f[x + 2, y], 1, 16);
2004  g(x, y) = 2 * f(x, y)
2005  for y = ..
2006  for x = ...
2007  prefetch(&f[x, y + 2], 1, 16);
2008  h(x, y) = 3 * f(x, y)
2009  \endcode
2010  *
2011  * Note that the 'from' nesting level need not be adjacent to 'at':
2012  \code
2013  Func f, g;
2014  Var x, y, z, w;
2015  f(x, y, z, w) = x + y + z + w;
2016  g(x, y, z, w) = 2 * f(x, y, z, w);
2017  \endcode
2018  *
2019  * The following schedule:
2020  \code
2021  f.compute_root();
2022  g.prefetch(f, y, w, 2, PrefetchBoundStrategy::NonFaulting);
2023  \endcode
2024  *
2025  * will produce code that prefetches a tile of data:
2026  \code
2027  for w = ...
2028  for z = ...
2029  for y = ...
2030  for x = ...
2031  f(x, y, z, w) = x + y + z + w
2032  for w = ...
2033  for z = ...
2034  for y = ...
2035  for x0 = ...
2036  prefetch(&f[x0, y, z, w + 2], 1, 16);
2037  for x = ...
2038  g(x, y, z, w) = 2 * f(x, y, z, w)
2039  \endcode
2040  *
2041  * Note that calling prefetch() with the same var for both 'at' and 'from'
2042  * is equivalent to calling prefetch() with that var.
2043  */
2044  // @{
2045  Func &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2046  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
2047  Func &prefetch(const Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2048  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
2049  template<typename T>
2050  Func &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2051  PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
2052  return prefetch(image.parameter(), at, from, std::move(offset), strategy);
2053  }
2054  // @}
2055 
2056  /** Specify how the storage for the function is laid out. These
2057  * calls let you specify the nesting order of the dimensions. For
2058  * example, foo.reorder_storage(y, x) tells Halide to use
2059  * column-major storage for any realizations of foo, without
2060  * changing how you refer to foo in the code. You may want to do
2061  * this if you intend to vectorize across y. When representing
2062  * color images, foo.reorder_storage(c, x, y) specifies packed
2063  * storage (red, green, and blue values adjacent in memory), and
2064  * foo.reorder_storage(x, y, c) specifies planar storage (entire
2065  * red, green, and blue images one after the other in memory).
2066  *
2067  * If you leave out some dimensions, those remain in the same
2068  * positions in the nesting order while the specified variables
2069  * are reordered around them. */
2070  // @{
2071  Func &reorder_storage(const std::vector<Var> &dims);
2072 
2073  Func &reorder_storage(const Var &x, const Var &y);
2074  template<typename... Args>
2075  HALIDE_NO_USER_CODE_INLINE std::enable_if<Internal::all_are_convertible<Var, Args...>::value, Func &>::type
2076  reorder_storage(const Var &x, const Var &y, Args &&...args) {
2077  std::vector<Var> collected_args{x, y, std::forward<Args>(args)...};
2078  return reorder_storage(collected_args);
2079  }
2080  // @}
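 /* A minimal usage sketch (f, x, y and c are illustrative names): store f
  * interleaved (c innermost in memory) without changing how f is indexed
  * in the algorithm.
 \code
 Func f;
 Var x, y, c;
 f(x, y, c) = x + y + c;
 f.reorder_storage(c, x, y);
 \endcode
 */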
2081 
2082  /** Pad the storage extent of a particular dimension of
2083  * realizations of this function up to be a multiple of the
2084  * specified alignment. This guarantees that the strides for the
2085  * dimensions stored outside of dim will be multiples of the
2086  * specified alignment, where the strides and alignment are
2087  * measured in numbers of elements.
2088  *
2089  * For example, to guarantee that a function foo(x, y, c)
2090  * representing an image has scanlines starting on offsets
2091  * aligned to multiples of 16, use foo.align_storage(x, 16). */
2092  Func &align_storage(const Var &dim, const Expr &alignment);
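 /* A minimal usage sketch (f, x and y are illustrative names): round the
  * storage extent in x up to a multiple of 16 elements so that each
  * scanline of f starts at a 16-element-aligned offset.
 \code
 Func f;
 Var x, y;
 f(x, y) = x + y;
 f.compute_root().align_storage(x, 16);
 \endcode
 */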
2093 
2094  /** Store realizations of this function in a circular buffer of a
2095  * given extent. This is more efficient when the extent of the
2096  * circular buffer is a power of 2. If the fold factor is too
2097  * small, or the dimension is not accessed monotonically, the
2098  * pipeline will generate an error at runtime.
2099  *
2100  * The fold_forward option indicates that the new values of the
2101  * producer are accessed by the consumer in a monotonically
2102  * increasing order. Folding storage of producers is also
2103  * supported if the new values are accessed in a monotonically
2104  * decreasing order by setting fold_forward to false.
2105  *
2106  * For example, consider the pipeline:
2107  \code
2108  Func f, g;
2109  Var x, y;
2110  g(x, y) = x*y;
2111  f(x, y) = g(x, y) + g(x, y+1);
2112  \endcode
2113  *
2114  * If we schedule f like so:
2115  *
2116  \code
2117  g.compute_at(f, y).store_root().fold_storage(y, 2);
2118  \endcode
2119  *
2120  * Then g will be computed at each row of f and stored in a buffer
2121  * with an extent in y of 2, alternately storing each computed row
2122  * of g in row y=0 or y=1.
2123  */
2124  Func &fold_storage(const Var &dim, const Expr &extent, bool fold_forward = true);
2125 
2126  /** Compute this function as needed for each unique value of the
2127  * given var for the given calling function f.
2128  *
2129  * For example, consider the simple pipeline:
2130  \code
2131  Func f, g;
2132  Var x, y;
2133  g(x, y) = x*y;
2134  f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2135  \endcode
2136  *
2137  * If we schedule f like so:
2138  *
2139  \code
2140  g.compute_at(f, x);
2141  \endcode
2142  *
2143  * Then the C code equivalent to this pipeline will look like this
2144  *
2145  \code
2146 
2147  int f[height][width];
2148  for (int y = 0; y < height; y++) {
2149  for (int x = 0; x < width; x++) {
2150  int g[2][2];
2151  g[0][0] = x*y;
2152  g[0][1] = (x+1)*y;
2153  g[1][0] = x*(y+1);
2154  g[1][1] = (x+1)*(y+1);
2155  f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
2156  }
2157  }
2158 
2159  \endcode
2160  *
2161  * The allocation and computation of g is within f's loop over x,
2162  * and enough of g is computed to satisfy all that f will need for
2163  * that iteration. This has excellent locality - values of g are
2164  * used as soon as they are computed, but it does redundant
2165  * work. Each value of g ends up getting computed four times. If
2166  * we instead schedule f like so:
2167  *
2168  \code
2169  g.compute_at(f, y);
2170  \endcode
2171  *
2172  * The equivalent C code is:
2173  *
2174  \code
2175  int f[height][width];
2176  for (int y = 0; y < height; y++) {
2177  int g[2][width+1];
2178  for (int x = 0; x < width+1; x++) {
2179  g[0][x] = x*y;
2180  g[1][x] = x*(y+1);
2181  }
2182  for (int x = 0; x < width; x++) {
2183  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2184  }
2185  }
2186  \endcode
2187  *
2188  * The allocation and computation of g is within f's loop over y,
2189  * and enough of g is computed to satisfy all that f will need for
2190  * that iteration. This does less redundant work (each point in g
2191  * ends up being evaluated twice), but the locality is not quite
2192  * as good, and we have to allocate more temporary memory to store
2193  * g.
2194  */
2195  Func &compute_at(const Func &f, const Var &var);
2196 
2197  /** Schedule a function to be computed within the iteration over
2198  * some dimension of an update domain. Produces equivalent code
2199  * to the version of compute_at that takes a Var. */
2200  Func &compute_at(const Func &f, const RVar &var);
2201 
2202  /** Schedule a function to be computed within the iteration over
2203  * a given LoopLevel. */
2204  Func &compute_at(LoopLevel loop_level);
2205 
2206  /** Schedule the iteration over the initial definition of this function
2207  * to be fused with another stage 's' from outermost loop to a
2208  * given LoopLevel. */
2209  // @{
2210  Func &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2211  Func &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
2212  Func &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2213  Func &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
2214  // @}
2215  /** Compute all of this function once ahead of time. Reusing
2216  * the example in \ref Func::compute_at :
2217  *
2218  \code
2219  Func f, g;
2220  Var x, y;
2221  g(x, y) = x*y;
2222  f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2223 
2224  g.compute_root();
2225  \endcode
2226  *
2227  * is equivalent to
2228  *
2229  \code
2230  int f[height][width];
2231  int g[height+1][width+1];
2232  for (int y = 0; y < height+1; y++) {
2233  for (int x = 0; x < width+1; x++) {
2234  g[y][x] = x*y;
2235  }
2236  }
2237  for (int y = 0; y < height; y++) {
2238  for (int x = 0; x < width; x++) {
2239  f[y][x] = g[y][x] + g[y+1][x] + g[y][x+1] + g[y+1][x+1];
2240  }
2241  }
2242  \endcode
2243  *
2244  * g is computed once ahead of time, and enough is computed to
2245  * satisfy all uses of it. This does no redundant work (each point
2246  * in g is evaluated once), but has poor locality (values of g are
2247  * probably not still in cache when they are used by f), and
2248  * allocates lots of temporary memory to store g.
2249  */
2250  Func &compute_root();
2251 
2252  /** Use the halide_memoization_cache_... interface to store a
2253  * computed version of this function across invocations of the
2254  * Func.
2255  *
2256  * If an eviction_key is provided, it must be constructed with
2257  * Expr of integer or handle type. The key Expr will be promoted
2258  * to a uint64_t and can be used with halide_memoization_cache_evict
2259  * to remove memoized entries using this eviction key from the
2260  * cache. Memoized computations that do not provide an eviction
2261  * key will never be evicted by this mechanism.
2262  */
2263  Func &memoize(const EvictionKey &eviction_key = EvictionKey());
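 /* A minimal usage sketch (f, x, y and the Param 'version' are illustrative
  * names): cache realizations of f across pipeline invocations, keyed for
  * eviction on an integer version parameter.
 \code
 Param<int> version;
 Func f;
 Var x, y;
 f(x, y) = x + y + version;
 f.compute_root().memoize(EvictionKey(version));
 \endcode
 */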
2264 
2265  /** Produce this Func asynchronously in a separate
2266  * thread. Consumers will be run by the task system when the
2267  * production is complete. If this Func's store level is different
2268  * to its compute level, consumers will be run concurrently,
2269  * blocking as necessary to prevent reading ahead of what the
2270  * producer has computed. If storage is folded, then the producer
2271  * will additionally not be permitted to run too far ahead of the
2272  * consumer, to avoid clobbering data that has not yet been
2273  * used.
2274  *
2275  * Take special care when combining this with custom thread pool
2276  * implementations, as avoiding deadlock with producer-consumer
2277  * parallelism requires a much more sophisticated parallel runtime
2278  * than with data parallelism alone. It is strongly recommended
2279  * you just use Halide's default thread pool, which guarantees no
2280  * deadlock and a bound on the number of threads launched.
2281  */
2282  Func &async();
2283 
2284  /** Bound the extent of a Func's storage, but not extent of its
2285  * compute. This can be useful for forcing a function's allocation
2286  * to be a fixed size, which often means it can go on the stack.
2287  * If bounds inference decides that it requires more storage for
2288  * this function than the allocation size you have stated, a runtime
2289  * error will occur when you try to run the pipeline. */
2290  Func &bound_storage(const Var &dim, const Expr &bound);
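 /* A minimal usage sketch (f, g and x are illustrative names): give g a
  * fixed-size allocation of 8 elements in x so the allocation can go on
  * the stack; it is a runtime error if more storage turns out to be needed.
 \code
 Func f, g;
 Var x;
 g(x) = x * 2;
 f(x) = g(x) + g(x + 1);
 g.compute_at(f, x).bound_storage(x, 8);
 \endcode
 */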
2291 
2292  /** Allocate storage for this function within f's loop over
2293  * var. Scheduling storage is optional, and can be used to
2294  * separate the loop level at which storage occurs from the loop
2295  * level at which computation occurs to trade off between locality
2296  * and redundant work. This can open the door for two types of
2297  * optimization.
2298  *
2299  * Consider again the pipeline from \ref Func::compute_at :
2300  \code
2301  Func f, g;
2302  Var x, y;
2303  g(x, y) = x*y;
2304  f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2305  \endcode
2306  *
2307  * If we schedule it like so:
2308  *
2309  \code
2310  g.compute_at(f, x).store_at(f, y);
2311  \endcode
2312  *
2313  * Then the computation of g takes place within the loop over x,
2314  * but the storage takes place within the loop over y:
2315  *
2316  \code
2317  int f[height][width];
2318  for (int y = 0; y < height; y++) {
2319  int g[2][width+1];
2320  for (int x = 0; x < width; x++) {
2321  g[0][x] = x*y;
2322  g[0][x+1] = (x+1)*y;
2323  g[1][x] = x*(y+1);
2324  g[1][x+1] = (x+1)*(y+1);
2325  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2326  }
2327  }
2328  \endcode
2329  *
2330  * Provided the for loop over x is serial, Halide then
2331  * automatically performs the following sliding window
2332  * optimization:
2333  *
2334  \code
2335  int f[height][width];
2336  for (int y = 0; y < height; y++) {
2337  int g[2][width+1];
2338  for (int x = 0; x < width; x++) {
2339  if (x == 0) {
2340  g[0][x] = x*y;
2341  g[1][x] = x*(y+1);
2342  }
2343  g[0][x+1] = (x+1)*y;
2344  g[1][x+1] = (x+1)*(y+1);
2345  f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2346  }
2347  }
2348  \endcode
2349  *
2350  * Two of the assignments to g only need to be done when x is
2351  * zero. The rest of the time, those sites have already been
2352  * filled in by a previous iteration. This version has the
2353  * locality of compute_at(f, x), but allocates more memory and
2354  * does much less redundant work.
2355  *
2356  * Halide then further optimizes this pipeline like so:
2357  *
2358  \code
2359  int f[height][width];
2360  for (int y = 0; y < height; y++) {
2361  int g[2][2];
2362  for (int x = 0; x < width; x++) {
2363  if (x == 0) {
2364  g[0][0] = x*y;
2365  g[1][0] = x*(y+1);
2366  }
2367  g[0][(x+1)%2] = (x+1)*y;
2368  g[1][(x+1)%2] = (x+1)*(y+1);
2369  f[y][x] = g[0][x%2] + g[1][x%2] + g[0][(x+1)%2] + g[1][(x+1)%2];
2370  }
2371  }
2372  \endcode
2373  *
2374  * Halide has detected that it's possible to use a circular buffer
2375  * to represent g, and has reduced all accesses to g modulo 2 in
2376  * the x dimension. This optimization only triggers if the for
2377  * loop over x is serial, and if Halide can statically determine
2378  * some power of two large enough to cover the range needed. For
2379  * powers of two, the modulo operator compiles to more efficient
2380  * bit-masking. This optimization reduces memory usage, and also
2381  * improves locality by reusing recently-accessed memory instead
2382  * of pulling new memory into cache.
2383  *
2384  */
2385  Func &store_at(const Func &f, const Var &var);
2386 
2387  /** Equivalent to the version of store_at that takes a Var, but
2388  * schedules storage within the loop over a dimension of a
2389  * reduction domain */
2390  Func &store_at(const Func &f, const RVar &var);
2391 
2392  /** Equivalent to the version of store_at that takes a Var, but
2393  * schedules storage at a given LoopLevel. */
2394  Func &store_at(LoopLevel loop_level);
2395 
2396  /** Equivalent to \ref Func::store_at, but schedules storage
2397  * outside the outermost loop. */
2398  Func &store_root();
2399 
2400  /** Hoist storage for this function within f's loop over
2401  * var. This is different from \ref Func::store_at, because hoist_storage
2402  * simply moves an actual allocation to a given loop level and
2403  * doesn't trigger any of the optimizations such as sliding window.
2404  * Hoisting storage is optional and can be used as an optimization
2405  * to avoid unnecessary allocations by moving it out from an inner
2406  * loop.
2407  *
2408  * Consider again the pipeline from \ref Func::compute_at :
2409  \code
2410  Func f, g;
2411  Var x, y;
2412  g(x, y) = x*y;
2413  f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2414  \endcode
2415  *
2416  * If we schedule f like so:
2417  *
2418  \code
2419  g.compute_at(f, x);
2420  \endcode
2421  *
2422  * Then the C code equivalent to this pipeline will look like this
2423  *
2424  \code
2425 
2426  int f[height][width];
2427  for (int y = 0; y < height; y++) {
2428  for (int x = 0; x < width; x++) {
2429  int g[2][2];
2430  g[0][0] = x*y;
2431  g[0][1] = (x+1)*y;
2432  g[1][0] = x*(y+1);
2433  g[1][1] = (x+1)*(y+1);
2434  f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
2435  }
2436  }
2437 
2438  \endcode
2439  *
2440  * Note the allocation for g inside the loop over variable x, which
2441  * happens on each iteration of the inner loop (height * width times in total).
2442  * In some cases allocation can be expensive, so it might be better to do it once
2443  * and reuse allocated memory across all iterations of the loop.
2444  *
2445  * This can be done by scheduling g like so:
2446  *
2447  \code
2448  g.compute_at(f, x).hoist_storage(f, Var::outermost());
2449  \endcode
2450  *
2451  * Then the C code equivalent to this pipeline will look like this
2452  *
2453  \code
2454 
2455  int f[height][width];
2456  int g[2][2];
2457  for (int y = 0; y < height; y++) {
2458  for (int x = 0; x < width; x++) {
2459  g[0][0] = x*y;
2460  g[0][1] = (x+1)*y;
2461  g[1][0] = x*(y+1);
2462  g[1][1] = (x+1)*(y+1);
2463  f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
2464  }
2465  }
2466 
2467  \endcode
2468  *
2469  * hoist_storage can be used together with \ref Func::store_at and
2470  * \ref Func::fold_storage (for example, to hoist the storage allocated
2471  * after sliding window optimization).
2472  *
2473  */
2474  Func &hoist_storage(const Func &f, const Var &var);
2475 
2476  /** Equivalent to the version of hoist_storage that takes a Var, but
2477  * schedules storage within the loop over a dimension of a
2478  * reduction domain */
2479  Func &hoist_storage(const Func &f, const RVar &var);
2480 
2481  /** Equivalent to the version of hoist_storage that takes a Var, but
2482  * schedules storage at a given LoopLevel. */
2483  Func &hoist_storage(LoopLevel loop_level);
2484 
2485  /** Equivalent to \ref Func::hoist_storage, but schedules storage
2486  * outside the outermost loop. */
2487  Func &hoist_storage_root();
2488 
2489  /** Aggressively inline all uses of this function. This is the
2490  * default schedule, so you're unlikely to need to call this. For
2491  * a Func with an update definition, that means it gets computed
2492  * as close to the innermost loop as possible.
2493  *
2494  * Consider once more the pipeline from \ref Func::compute_at :
2495  *
2496  \code
2497  Func f, g;
2498  Var x, y;
2499  g(x, y) = x*y;
2500  f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2501  \endcode
2502  *
2503  * Leaving g as inline, this compiles to code equivalent to the following C:
2504  *
2505  \code
2506  int f[height][width];
2507  for (int y = 0; y < height; y++) {
2508  for (int x = 0; x < width; x++) {
2509  f[y][x] = x*y + x*(y+1) + (x+1)*y + (x+1)*(y+1);
2510  }
2511  }
2512  \endcode
2513  */
2514  Func &compute_inline();
2515 
2516  /** Get a handle on an update step for the purposes of scheduling
2517  * it. */
2518  Stage update(int idx = 0);
2519 
2520  /** Set the type of memory this Func should be stored in. Controls
2521  * whether allocations go on the stack or the heap on the CPU, and
2522  * in global vs shared vs local on the GPU. See the documentation
2523  * on MemoryType for more detail. */
2524  Func &store_in(MemoryType memory_type);
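 /* A minimal usage sketch (f, g and x are illustrative names): keep the
  * small per-iteration buffer for g on the stack instead of the heap.
 \code
 Func f, g;
 Var x;
 g(x) = x * x;
 f(x) = g(x) + g(x + 1);
 g.compute_at(f, x).store_in(MemoryType::Stack);
 \endcode
 */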
2525 
2526  /** Trace all loads from this Func by emitting calls to
2527  * halide_trace. If the Func is inlined, this has no
2528  * effect. */
2529  Func &trace_loads();
2530 
2531  /** Trace all stores to the buffer backing this Func by emitting
2532  * calls to halide_trace. If the Func is inlined, this call
2533  * has no effect. */
2534  Func &trace_stores();
2535 
2536  /** Trace all realizations of this Func by emitting calls to
2537  * halide_trace. */
2538  Func &trace_realizations();
2539 
2540  /** Add a string of arbitrary text that will be passed through to trace
2541  * inspection code if the Func is realized in trace mode. (Funcs that are
2542  * inlined won't have their tags emitted.) Ignored entirely if
2543  * tracing is not enabled for the Func (or globally).
2544  */
2545  Func &add_trace_tag(const std::string &trace_tag);
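 /* A minimal usage sketch (f and x are illustrative names; the tag string
  * is arbitrary): emit halide_trace packets for stores to f, plus a custom
  * tag for trace-inspection tools, when tracing is enabled.
 \code
 Func f;
 Var x;
 f(x) = x;
 f.trace_stores().add_trace_tag("layout=1d");
 \endcode
 */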
2546 
2547  /** Get a handle on the internal halide function that this Func
2548  * represents. Useful if you want to do introspection on Halide
2549  * functions */
2550  Internal::Function function() const {
2551  return func;
2552  }
2553 
2554  /** You can cast a Func to its pure stage for the purposes of
2555  * scheduling it. */
2556  operator Stage() const;
2557 
2558  /** Get a handle on the output buffer for this Func. Only relevant
2559  * if this is the output Func in a pipeline. Useful for making
2560  * static promises about strides, mins, and extents. */
2561  // @{
2562  OutputImageParam output_buffer() const;
2563  std::vector<OutputImageParam> output_buffers() const;
2564  // @}
2565 
2566  /** Use a Func as an argument to an external stage. */
2567  operator ExternFuncArgument() const;
2568 
2569  /** Infer the arguments to the Func, sorted into a canonical order:
2570  * all buffers (sorted alphabetically by name), followed by all non-buffers
2571  * (sorted alphabetically by name).
2572  This lets you write things like:
2573  \code
2574  func.compile_to_assembly("/dev/stdout", func.infer_arguments());
2575  \endcode
2576  */
2577  std::vector<Argument> infer_arguments() const;
2578 
2579  /** Get the source location of the pure definition of this
2580  * Func. See Stage::source_location() */
2581  std::string source_location() const;
2582 
2583  /** Return the current StageSchedule associated with this initial
2584  * Stage of this Func. For introspection only: to modify schedule,
2585  * use the Func interface. */
2586  const Internal::StageSchedule &get_schedule() const {
2587  return Stage(*this).get_schedule();
2588  }
2589 };
2590 
2591 namespace Internal {
2592 
2593 template<typename Last>
2594 inline void check_types(const Tuple &t, int idx) {
2595  using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2596  user_assert(t[idx].type() == type_of<T>())
2597  << "Can't evaluate expression "
2598  << t[idx] << " of type " << t[idx].type()
2599  << " as a scalar of type " << type_of<T>() << "\n";
2600 }
2601 
2602 template<typename First, typename Second, typename... Rest>
2603 inline void check_types(const Tuple &t, int idx) {
2604  check_types<First>(t, idx);
2605  check_types<Second, Rest...>(t, idx + 1);
2606 }
2607 
2608 template<typename Last>
2609 inline void assign_results(Realization &r, int idx, Last last) {
2610  using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2611  *last = Buffer<T>(r[idx])();
2612 }
2613 
2614 template<typename First, typename Second, typename... Rest>
2615 inline void assign_results(Realization &r, int idx, First first, Second second, Rest &&...rest) {
2616  assign_results<First>(r, idx, first);
2617  assign_results<Second, Rest...>(r, idx + 1, second, rest...);
2618 }
2619 
2620 } // namespace Internal
2621 
2622 /** JIT-Compile and run enough code to evaluate a Halide
2623  * expression. This can be thought of as a scalar version of
2624  * \ref Func::realize */
2625 template<typename T>
2626 HALIDE_NO_USER_CODE_INLINE T evaluate(JITUserContext *ctx, const Expr &e) {
2627  user_assert(e.type() == type_of<T>())
2628  << "Can't evaluate expression "
2629  << e << " of type " << e.type()
2630  << " as a scalar of type " << type_of<T>() << "\n";
2631  Func f;
2632  f() = e;
2633  Buffer<T, 0> im = f.realize(ctx);
2634  return im();
2635 }
2636 
2637 /** evaluate with a default user context */
2638 template<typename T>
2639 HALIDE_NO_USER_CODE_INLINE T evaluate(const Expr &e) {
2640  return evaluate<T>(nullptr, e);
2641 }
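/* A minimal usage sketch: evaluate a scalar expression at JIT time.
\code
int result = evaluate<int>(3 * 5 + 2);  // result == 17
\endcode
*/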
2642 
2643 /** JIT-compile and run enough code to evaluate a Halide Tuple. */
2644 template<typename First, typename... Rest>
2645 HALIDE_NO_USER_CODE_INLINE void evaluate(JITUserContext *ctx, Tuple t, First first, Rest &&...rest) {
2646  Internal::check_types<First, Rest...>(t, 0);
2647 
2648  Func f;
2649  f() = t;
2650  Realization r = f.realize(ctx);
2651  Internal::assign_results(r, 0, first, rest...);
2652 }
2653 
2654 /** JIT-compile and run enough code to evaluate a Halide Tuple. */
2655 template<typename First, typename... Rest>
2656 HALIDE_NO_USER_CODE_INLINE void evaluate(Tuple t, First first, Rest &&...rest) {
2657  evaluate<First, Rest...>(nullptr, std::move(t), std::forward<First>(first), std::forward<Rest>(rest)...);
2658 }
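/* A minimal usage sketch (a and b are illustrative locals): evaluate a
 * two-element Tuple into C++ scalars.
\code
int a;
float b;
evaluate(Tuple(3 + 4, sqrt(2.0f)), &a, &b);
\endcode
*/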
2659 
2660 namespace Internal {
2661 
2662 inline void schedule_scalar(Func f) {
2663  Target t = get_jit_target_from_environment();
2664  if (t.has_gpu_feature()) {
2665  f.gpu_single_thread();
2666  }
2667  if (t.has_feature(Target::HVX)) {
2668  f.hexagon();
2669  }
2670 }
2671 
2672 } // namespace Internal
2673 
2674 /** JIT-Compile and run enough code to evaluate a Halide
2675  * expression. This can be thought of as a scalar version of
2676  * \ref Func::realize. Can use GPU if jit target from environment
2677  * specifies one.
2678  */
2679 template<typename T>
2680 HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e) {
2681  user_assert(e.type() == type_of<T>())
2682  << "Can't evaluate expression "
2683  << e << " of type " << e.type()
2684  << " as a scalar of type " << type_of<T>() << "\n";
2685  Func f;
2686  f() = e;
2687  Internal::schedule_scalar(f);
2688  Buffer<T, 0> im = f.realize();
2689  return im();
2690 }
2691 
2692 /** JIT-compile and run enough code to evaluate a Halide Tuple. Can
2693  * use GPU if jit target from environment specifies one. */
2694 // @{
2695 template<typename First, typename... Rest>
2696 HALIDE_NO_USER_CODE_INLINE void evaluate_may_gpu(Tuple t, First first, Rest &&...rest) {
2697  Internal::check_types<First, Rest...>(t, 0);
2698 
2699  Func f;
2700  f() = t;
2701  Internal::schedule_scalar(f);
2702  Realization r = f.realize();
2703  Internal::assign_results(r, 0, first, rest...);
2704 }
2705 // @}
2706 
2707 } // namespace Halide
2708 
2709 #endif
Defines a type used for expressing the type signature of a generated halide pipeline.
#define internal_assert(c)
Definition: Errors.h:19
Base classes for Halide expressions (Halide::Expr) and statements (Halide::Internal::Stmt)
Defines the struct representing lifetime and dependencies of a JIT compiled halide pipeline.
Defines Module, an IR container that fully describes a Halide program.
Classes for declaring scalar parameters to halide pipelines.
Defines the front-end class representing an entire Halide imaging pipeline.
Defines the front-end syntax for reduction domains and reduction variables.
Defines the structure that describes a Halide target.
Defines Tuple - the front-end handle on small arrays of expressions.
#define HALIDE_NO_USER_CODE_INLINE
Definition: Util.h:46
Defines the Var - the front-end variable.
A Halide::Buffer is a named shared reference to a Halide::Runtime::Buffer.
Definition: Buffer.h:122
Helper class for identifying purpose of an Expr passed to memoize.
Definition: Func.h:691
EvictionKey(const Expr &expr=Expr())
Definition: Func.h:697
A halide function.
Definition: Func.h:706
void print_loop_nest()
Write out the loop nests specified by the schedule for this Function.
Func & unroll(const VarOrRVar &var)
Mark a dimension to be completely unrolled.
bool is_extern() const
Is this function an external stage? That is, was it defined using define_extern?
FuncRef operator()(std::vector< Expr >) const
Either calls to the function, or the left-hand-side of an update definition (see RDom).
Func & hexagon(const VarOrRVar &x=Var::outermost())
Schedule for execution on Hexagon.
Func(const std::string &name)
Declare a new undefined function with the given name.
void compile_to_multitarget_object_files(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets, const std::vector< std::string > &suffixes)
Like compile_to_multitarget_static_library(), except that the object files are all output as object f...
Func & align_extent(const Var &var, Expr modulus)
Expand the region computed so that the extent is a multiple of 'modulus'.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Var, Args... >::value, FuncRef >::type operator()(Args &&...args) const
Definition: Func.h:1237
Func & always_partition_all()
Set the loop partition policy to Always for all Vars and RVar of the initial definition of the Func.
Func & hoist_storage_root()
Equivalent to Func::hoist_storage_root, but schedules storage outside the outermost loop.
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Split two dimensions at once by the given factors, and then reorder the resulting dimensions to be xi...
void specialize_fail(const std::string &message)
Add a specialization to a Func that always terminates execution with a call to halide_error().
Func & memoize(const EvictionKey &eviction_key=EvictionKey())
Use the halide_memoization_cache_...
void compile_to_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to text assembly equivalent to the object file generated by compile_...
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & allow_race_conditions()
Specify that race conditions are permitted for this Func, which enables parallelizing over RVars even...
bool has_update_definition() const
Does this function have at least one update definition?
void compile_jit(const Target &target=get_jit_target_from_environment())
Eagerly jit compile the function to machine code.
Func & bound_storage(const Var &dim, const Expr &bound)
Bound the extent of a Func's storage, but not extent of its compute.
Func()
Declare a new undefined function with an automatically-generated unique name.
Func & async()
Produce this Func asynchronously in a separate thread.
void compile_to_bitcode(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
Func & hoist_storage(const Func &f, const Var &var)
Hoist storage for this function within f's loop over var.
void infer_input_bounds(Pipeline::RealizationArg outputs, const Target &target=get_jit_target_from_environment())
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Func & >::type always_partition(const VarOrRVar &x, Args &&...args)
Set the loop partition policy to Always for some number of Vars and RVars.
Definition: Func.h:1496
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & compute_root()
Compute all of this function once ahead of time.
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
Generalized tiling, reusing the previous names as the outer names.
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices and thread indices.
Func & compute_with(const Stage &s, const VarOrRVar &var, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy >> &align)
Schedule the iteration over the initial definition of this function to be fused with another stage 's...
void compile_to_lowered_stmt(const std::string &filename, const std::vector< Argument > &args, StmtOutputFormat fmt=Text, const Target &target=get_target_from_environment())
Write out an internal representation of lowered code.
void compile_to_c(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Statically compile this function to C source code.
Func & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Join two dimensions into a single fused dimension.
Func & fold_storage(const Var &dim, const Expr &extent, bool fold_forward=true)
Store realizations of this function in a circular buffer of a given extent.
Func & store_at(LoopLevel loop_level)
Equivalent to the version of store_at that takes a Var, but schedules storage at a given LoopLevel.
Stage update(int idx=0)
Get a handle on an update step for the purposes of scheduling it.
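A minimal sketch of scheduling an update stage (hypothetical names; assumes the usual Halide includes):
    Func f("f"); Var x("x"); RDom r(0, 10, "r");
    f(x) = x;                      // pure definition (stage 0)
    f(x) += r;                     // first update definition
    f.update(0).vectorize(x, 8);   // schedule the update over its pure var x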
Func & reorder_storage(const Var &x, const Var &y)
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Expr, Args... >::value, FuncRef >::type operator()(const Expr &x, Args &&...args) const
Definition: Func.h:1254
Func(const Type &required_type, int required_dims, const std::string &name)
Declare a new undefined function with the given name.
bool defined() const
Does this function have at least a pure definition?
Func(const std::vector< Type > &required_types, int required_dims, const std::string &name)
Declare a new undefined function with the given name.
Func & compute_at(LoopLevel loop_level)
Schedule a function to be computed within the iteration over a given LoopLevel.
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this initial Stage of this Func.
Definition: Func.h:2586
Func & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices.
Func & store_at(const Func &f, const Var &var)
Allocate storage for this function within f's loop over var.
Func copy_to_host()
Declare that this function should be implemented by a call to halide_buffer_copy with a NULL target d...
Func & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension into inner and outer subdimensions with the given names, where the inner dimension ...
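A minimal sketch of split (hypothetical names xo, xi):
    Func f("f"); Var x("x"), xo("xo"), xi("xi");
    f(x) = x * 2;
    f.split(x, xo, xi, 8, TailStrategy::GuardWithIf);  // x becomes xo (outer) and xi (inner, extent 8)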
void infer_input_bounds(JITUserContext *context, Pipeline::RealizationArg outputs, const Target &target=get_jit_target_from_environment())
Func & compute_with(LoopLevel loop_level, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy >> &align)
std::vector< Argument > infer_arguments() const
Infer the arguments to the Func, sorted into a canonical order: all buffers (sorted alphabetically by...
void compile_to_header(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Emit a header file with the given filename for this function.
const Type & type() const
Get the type(s) of the outputs of this Func.
Func & hoist_storage(LoopLevel loop_level)
Equivalent to the version of hoist_storage that takes a Var, but schedules storage at a given LoopLev...
Func & prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:2050
std::vector< Var > args() const
Get the pure arguments.
Func(const Expr &e)
Declare a new function with an automatically-generated unique name, and define it to return the given...
Func & add_trace_tag(const std::string &trace_tag)
Add a string of arbitrary text that will be passed through to trace inspection code if the Func is reali...
int dimensions() const
The dimensionality (number of arguments) of this function.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Var, Args... >::value, Func & >::type reorder_storage(const Var &x, const Var &y, Args &&...args)
Definition: Func.h:2076
Func & align_bounds(const Var &var, Expr modulus, Expr remainder=0)
Expand the region computed so that the min coordinate is congruent to 'remainder' modulo 'modulus',...
std::string source_location() const
Get the source location of the pure definition of this Func.
Func & compute_with(LoopLevel loop_level, LoopAlignStrategy align=LoopAlignStrategy::Auto)
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Func & >::type reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args)
Definition: Func.h:1600
Func & store_root()
Equivalent to Func::store_at, but schedules storage outside the outermost loop.
Realization realize(JITUserContext *context, std::vector< int32_t > sizes={}, const Target &target=Target())
Same as above, but takes a custom user-provided context to be passed to runtime functions.
int outputs() const
Get the number of outputs of this Func.
Func & never_partition_all()
Set the loop partition policy to Never for all Vars and RVars of the initial definition of the Func.
Tuple update_values(int idx=0) const
Get the right-hand-side of an update definition for functions that return multiple values.
void compile_to_bitcode(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm bitcode, with the given filename (which should probably end ...
int num_update_definitions() const
How many update definitions does this function have?
Func & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Rename a dimension.
Func & vectorize(const VarOrRVar &var)
Mark a dimension to be computed all-at-once as a single vector.
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, const std::vector< TailStrategy > &tails)
A more general form of tile, which defines tiles of any dimensionality.
Func & bound_extent(const Var &var, Expr extent)
Bound the extent of a Func's realization, but not its min.
Func & trace_stores()
Trace all stores to the buffer backing this Func by emitting calls to halide_trace.
Func & set_estimates(const Region &estimates)
Set (min, extent) estimates for all dimensions in the Func at once; this is equivalent to calling set...
Stage specialize(const Expr &condition)
Specialize a Func.
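A minimal sketch of specialize, assuming a hypothetical runtime parameter 'width':
    Func f("f"); Var x("x"); Param<int> width("width");
    f(x) = x * 2;
    f.specialize(width >= 8).vectorize(x, 8);  // vectorized code path taken when the condition holds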
Callable compile_to_callable(const std::vector< Argument > &args, const Target &target=get_jit_target_from_environment())
Eagerly jit compile the function to machine code and return a callable struct that behaves like a fun...
Func & compute_at(const Func &f, const Var &var)
Compute this function as needed for each unique value of the given var for the given calling function...
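A minimal producer/consumer sketch of compute_at (hypothetical names):
    Func producer("producer"), consumer("consumer"); Var x("x"), y("y");
    producer(x, y) = x + y;
    consumer(x, y) = producer(x, y) + producer(x, y + 1);
    producer.compute_at(consumer, y);  // recompute just the rows needed per iteration of consumer's y loop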
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
The generalized tile, with a single tail strategy to apply to all vars.
Func & reorder_storage(const std::vector< Var > &dims)
Specify how the storage for the function is laid out.
Func & compute_at(const Func &f, const RVar &var)
Schedule a function to be computed within the iteration over some dimension of an update domain.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Short-hand for tiling a domain and mapping the tile indices to GPU block indices and the coordinates ...
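A minimal sketch of gpu_tile, guarded on the target having a GPU feature (hypothetical names bx, tx):
    Func f("f"); Var x("x"), bx("bx"), tx("tx");
    f(x) = x * 2;
    if (get_jit_target_from_environment().has_gpu_feature()) {
        f.gpu_tile(x, bx, tx, 64);  // 64-wide tiles: bx maps to GPU blocks, tx to GPU threads
    }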
const std::vector< Expr > & update_args(int idx=0) const
Get the left-hand-side of the update definition.
Func & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & store_at(const Func &f, const RVar &var)
Equivalent to the version of store_at that takes a Var, but schedules storage within the loop over a ...
HALIDE_NO_USER_CODE_INLINE Func(Buffer< T, Dims > &im)
Construct a new Func to wrap a Buffer.
Definition: Func.h:765
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, const std::vector< Var > &arguments, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Func & parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given task_size, and then parallelize the outer dimension.
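A minimal sketch of the task-sized parallel overload (hypothetical names):
    Func f("f"); Var y("y");
    f(y) = y;
    f.parallel(y, 16);  // split y by 16 and parallelize the resulting outer loop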
JITHandlers & jit_handlers()
Get a struct containing the currently set custom functions used by JIT.
Expr value() const
The right-hand-side value of the pure definition of this function.
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
A shorter form of tile, which reuses the old variable names as the new outer dimensions.
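A minimal sketch of the short tile form (hypothetical names xi, yi):
    Func f("f"); Var x("x"), y("y"), xi("xi"), yi("yi");
    f(x, y) = x + y;
    f.tile(x, y, xi, yi, 64, 16);  // x, y become tile indices; xi, yi iterate within each 64x16 tile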
Func & hoist_storage(const Func &f, const RVar &var)
Equivalent to the version of hoist_storage that takes a Var, but schedules storage within the loop ov...
void infer_input_bounds(const std::vector< int32_t > &sizes, const Target &target=get_jit_target_from_environment())
For a given size of output, or a given output buffer, determine the bounds required of all unbound Im...
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func clone_in(const std::vector< Func > &fs)
Module compile_to_module(const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Store an internal representation of lowered code as a self contained Module suitable for further comp...
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Definition: Func.h:1170
void realize(Pipeline::RealizationArg outputs, const Target &target=Target())
Evaluate this function into an existing allocated buffer or buffers.
Func in()
Create and return a global identity wrapper, which wraps all calls to this Func by any other Func.
Func & vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given factor, then vectorize the inner dimension.
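A minimal sketch of vectorizing by a factor (hypothetical names):
    Func f("f"); Var x("x");
    f(x) = x * 2;
    f.vectorize(x, 8);  // equivalent to splitting x by 8 and vectorizing the inner lanes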
OutputImageParam output_buffer() const
Get a handle on the output buffer for this Func.
Expr update_value(int idx=0) const
Get the right-hand-side of an update definition.
Func & bound(const Var &var, Expr min, Expr extent)
Statically declare that the range over which a function should be evaluated is given by the second an...
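A minimal sketch of bound (hypothetical names):
    Func f("f"); Var x("x");
    f(x) = x * 2;
    f.bound(x, 0, 1024);  // assert that f is only ever realized over x in [0, 1024)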
void compile_to(const std::map< OutputFileType, std::string > &output_files, const std::vector< Argument > &args, const std::string &fn_name, const Target &target=get_target_from_environment())
Compile and generate multiple target files with single call.
Func & partition(const VarOrRVar &var, Partition partition_policy)
Set the loop partition policy.
void compile_to_llvm_assembly(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
Func & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
void add_custom_lowering_pass(T *pass)
Add a custom pass to be used during lowering.
Definition: Func.h:1057
Func in(const std::vector< Func > &fs)
Create and return an identity wrapper shared by all the Funcs in 'fs'.
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Realization realize(std::vector< int32_t > sizes={}, const Target &target=Target())
Evaluate this function over some rectangular domain and return the resulting buffer or buffers.
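A minimal sketch of realize (hypothetical names; the sizes are illustrative):
    Func f("f"); Var x("x"), y("y");
    f(x, y) = x + y;
    Buffer<int> out = f.realize({800, 600});  // JIT-compile and evaluate over an 800x600 domain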
void realize(JITUserContext *context, Pipeline::RealizationArg outputs, const Target &target=Target())
Same as above, but takes a custom user-provided context to be passed to runtime functions.
Func & parallel(const VarOrRVar &var)
Mark a dimension to be traversed in parallel.
Func & serial(const VarOrRVar &var)
Mark a dimension to be traversed serially.
Func & never_partition(const std::vector< VarOrRVar > &vars)
Set the loop partition policy to Never for a vector of Vars and RVars.
Func & prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Prefetch data written to or read from a Func or an ImageParam by a subsequent loop iteration,...
const std::string & name() const
The name of this function, either given during construction, or automatically generated.
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Add an extern definition for this Func.
Definition: Func.h:1152
Func & align_storage(const Var &dim, const Expr &alignment)
Pad the storage extent of a particular dimension of realizations of this function up to be a multiple...
void compile_to_file(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to object file and header pair, with the given arguments.
Func & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU thread indices.
void add_custom_lowering_pass(Internal::IRMutator *pass, std::function< void()> deleter)
Add a custom pass to be used during lowering, with the function that will be called to delete it also...
void clear_custom_lowering_passes()
Remove all previously-set custom lowering passes.
Func & prefetch(const Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
void compile_to_llvm_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm assembly, with the given filename (which should probably end...
void compile_to_multitarget_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets)
Compile to static-library file and header pair once for each target; each resulting function will be ...
Func & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
The given dimension corresponds to the lanes in a GPU warp.
std::vector< OutputImageParam > output_buffers() const
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Func & >::type never_partition(const VarOrRVar &x, Args &&...args)
Set the loop partition policy to Never for some number of Vars and RVars.
Definition: Func.h:1479
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & store_in(MemoryType memory_type)
Set the type of memory this Func should be stored in.
void compile_to_assembly(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
Func clone_in(const Func &f)
Similar to Func::in; however, instead of replacing the call to this Func with an identity Func that r...
std::vector< RVar > rvars(int idx=0) const
Get the RVars of the reduction domain for an update definition, if there is one.
Func & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide to run this stage using a single gpu thread and block.
Func(Internal::Function f)
Construct a new Func to wrap an existing, already-defined Function object.
const std::vector< Type > & types() const
void compile_to_object(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to an object file, with the given filename (which should probably en...
const std::string & extern_function_name() const
Get the name of the extern function called for an extern definition.
Func & compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Func & trace_realizations()
Trace all realizations of this Func by emitting calls to halide_trace.
Tuple values() const
The values returned by this function.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & compute_inline()
Aggressively inline all uses of this function.
Func copy_to_device(DeviceAPI d=DeviceAPI::Default_GPU)
Declare that this function should be implemented by a call to halide_buffer_copy with the given targe...
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
void compile_to_object(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, const std::vector< Var > &arguments, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Definition: Func.h:1180
Func & reorder(const std::vector< VarOrRVar > &vars)
Reorder variables to have the given nesting order, from innermost out.
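A minimal sketch of reorder (hypothetical names):
    Func f("f"); Var x("x"), y("y");
    f(x, y) = x + y;
    f.reorder(y, x);  // y becomes the innermost loop, x the outermost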
Func & atomic(bool override_associativity_test=false)
Issue atomic updates for this Func.
const std::vector< CustomLoweringPass > & custom_lowering_passes()
Get the custom lowering passes.
void infer_input_bounds(JITUserContext *context, const std::vector< int32_t > &sizes, const Target &target=get_jit_target_from_environment())
Versions of infer_input_bounds that take a custom user context to pass to runtime functions.
void debug_to_file(const std::string &filename)
When this function is compiled, include code that dumps its values to a file after it is realized,...
Func & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func in(const Func &f)
Creates and returns a new identity Func that wraps this Func.
void compile_to_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to static-library file and header pair, with the given arguments.
Func & set_estimate(const Var &var, const Expr &min, const Expr &extent)
Statically declare the range over which the function will be evaluated in the general case.
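A minimal sketch of set_estimate, as used to give the autoscheduler bounds hints (hypothetical sizes):
    Func f("f"); Var x("x"), y("y");
    f(x, y) = x + y;
    f.set_estimate(x, 0, 1920).set_estimate(y, 0, 1080);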
Func & unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given factor, then unroll the inner dimension.
Func & trace_loads()
Trace all loads from this Func by emitting calls to halide_trace.
FuncRef operator()(std::vector< Var >) const
Construct either the left-hand-side of a definition, or a call to a function that happens to only co...
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling)
Definition: Func.h:1162
Func & always_partition(const std::vector< VarOrRVar > &vars)
Set the loop partition policy to Always for a vector of Vars and RVars.
A fragment of front-end syntax of the form f(x, y, z), where x, y, z are Vars or Exprs.
Definition: Func.h:497
Stage operator*=(const FuncRef &)
FuncTupleElementRef operator[](int) const
When a FuncRef refers to a function that provides multiple outputs, you can access each output as an ...
Stage operator-=(const FuncRef &)
size_t size() const
How many outputs does the function this refers to produce?
Internal::Function function() const
What function is this calling?
Definition: Func.h:594
Stage operator+=(Expr)
Define a stage that adds the given expression to this Func.
Stage operator-=(Expr)
Define a stage that adds the negative of the given expression to this Func.
Stage operator*=(Expr)
Define a stage that multiplies this Func by the given expression.
Stage operator-=(const Tuple &)
Stage operator/=(Expr)
Define a stage that divides this Func by the given expression.
Stage operator+=(const FuncRef &)
Stage operator=(const Expr &)
Use this as the left-hand-side of a definition or an update definition (see RDom).
Stage operator=(const FuncRef &)
FuncRef(Internal::Function, const std::vector< Var > &, int placeholder_pos=-1, int count=0)
Stage operator+=(const Tuple &)
FuncRef(const Internal::Function &, const std::vector< Expr > &, int placeholder_pos=-1, int count=0)
Stage operator/=(const FuncRef &)
Stage operator*=(const Tuple &)
Stage operator/=(const Tuple &)
Stage operator=(const Tuple &)
Use this as the left-hand-side of a definition or an update definition for a Func with multiple outpu...
A fragment of front-end syntax of the form f(x, y, z)[index], where x, y, z are Vars or Exprs.
Definition: Func.h:616
int index() const
Return index to the function outputs.
Definition: Func.h:680
Stage operator+=(const Expr &e)
Define a stage that adds the given expression to Tuple component 'idx' of this Func.
Stage operator*=(const Expr &e)
Define a stage that multiplies Tuple component 'idx' of this Func by the given expression.
Stage operator/=(const Expr &e)
Define a stage that divides Tuple component 'idx' of this Func by the given expression.
Stage operator=(const Expr &e)
Use this as the left-hand-side of an update definition of Tuple component 'idx' of a Func (see RDom).
Stage operator=(const FuncRef &e)
Stage operator-=(const Expr &e)
Define a stage that adds the negative of the given expression to Tuple component 'idx' of this Func.
FuncTupleElementRef(const FuncRef &ref, const std::vector< Expr > &args, int idx)
An Image parameter to a halide pipeline.
Definition: ImageParam.h:23
A Function definition which can represent either an init or an update definition.
Definition: Definition.h:38
const StageSchedule & schedule() const
Get the default (no-specialization) stage-specific schedule associated with this definition.
const std::vector< Expr > & args() const
Get the default (no-specialization) arguments (left-hand-side) of the definition.
bool defined() const
Definition objects are nullable.
A reference-counted handle to Halide's internal representation of a function.
Definition: Function.h:38
A base class for passes over the IR which modify it (e.g.
Definition: IRMutator.h:26
A schedule for a single stage of a Halide pipeline.
Definition: Schedule.h:695
A reference to a site in a Halide statement at the top of the body of a particular for loop.
Definition: Schedule.h:203
A halide module.
Definition: Module.h:142
A handle on the output buffer of a pipeline.
A reference-counted handle to a parameter to a halide pipeline.
Definition: Parameter.h:40
A class representing a Halide pipeline.
Definition: Pipeline.h:107
A multi-dimensional domain over which to iterate.
Definition: RDom.h:193
A reduction variable represents a single dimension of a reduction domain (RDom).
Definition: RDom.h:29
const std::string & name() const
The name of this reduction variable.
A Realization is a vector of references to existing Buffer objects.
Definition: Realization.h:19
A single definition of a Func.
Definition: Func.h:69
Stage & prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition: Func.h:468
Stage & always_partition_all()
std::string name() const
Return the name of this stage, e.g.
Stage & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Stage & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Stage & >::type reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args)
Definition: Func.h:383
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & hexagon(const VarOrRVar &x=Var::outermost())
Func rfactor(const RVar &r, const Var &v)
Stage & always_partition(const std::vector< VarOrRVar > &vars)
Stage & compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Stage & vectorize(const VarOrRVar &var)
Stage & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & compute_with(LoopLevel loop_level, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Stage & unroll(const VarOrRVar &var)
Stage & parallel(const VarOrRVar &var)
Stage & allow_race_conditions()
Stage & serial(const VarOrRVar &var)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, const std::vector< TailStrategy > &tails)
Stage & prefetch(const Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Stage & >::type never_partition(const VarOrRVar &x, Args &&...args)
Definition: Func.h:390
Stage specialize(const Expr &condition)
Stage & compute_with(LoopLevel loop_level, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy >> &align)
Schedule the iteration over this stage to be fused with another stage 's' from outermost loop to a gi...
Stage & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Stage & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Scheduling calls that control how the domain of this stage is traversed.
Stage & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
Definition: Func.h:93
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Stage & >::type always_partition(const VarOrRVar &x, Args &&...args)
Definition: Func.h:397
Stage & vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Func rfactor(std::vector< std::pair< RVar, Var >> preserved)
Calling rfactor() on an associative update definition of a Func will split the update into an intermedia...
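A minimal parallel-sum sketch of rfactor (hypothetical names; 'in' is a stand-in 1-D Func):
    Func in("in"), total("total"); Var x("x"), u("u");
    RDom r(0, 1024, "r");
    in(x) = x;
    total() = 0;
    total() += in(r);
    RVar rxo("rxo"), rxi("rxi");
    total.update().split(r.x, rxo, rxi, 128);
    Func partial = total.update().rfactor(rxo, u);  // partial(u) holds the sum of each 128-element slice
    partial.compute_root().parallel(u);             // partial sums computed in parallel, then merged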
Stage & parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail=TailStrategy::Auto)
Stage & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this Stage.
Definition: Func.h:106
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & reorder(const std::vector< VarOrRVar > &vars)
Stage & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
void specialize_fail(const std::string &message)
Stage & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Stage & compute_with(const Stage &s, const VarOrRVar &var, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy >> &align)
Stage & unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Stage & never_partition_all()
Stage & atomic(bool override_associativity_test=false)
std::string source_location() const
Attempt to get the source file and line where this stage was defined by parsing the process's own deb...
Stage & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
std::string dump_argument_list() const
Return a string describing the current var list taking into account all the splits,...
Stage & never_partition(const std::vector< VarOrRVar > &vars)
void unscheduled()
Assert that this stage has intentionally been given no schedule, and suppress the warning about unsch...
Stage & partition(const VarOrRVar &var, Partition partition_policy)
Create a small array of Exprs for defining and calling functions with multiple outputs.
Definition: Tuple.h:18
A Halide variable, to be used when defining functions.
Definition: Var.h:19
const std::string & name() const
Get the name of a Var.
static Var outermost()
A Var that represents the location outside the outermost loop.
Definition: Var.h:163
void schedule_scalar(Func f)
Definition: Func.h:2662
void assign_results(Realization &r, int idx, Last last)
Definition: Func.h:2609
void check_types(const Tuple &t, int idx)
Definition: Func.h:2594
ForType
An enum describing a type of loop traversal.
Definition: Expr.h:401
std::vector< Var > make_argument_list(int dimensionality)
Make a list of unique arguments for definitions with unnamed arguments.
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
@ Internal
Not visible externally, similar to 'static' linkage in C.
PrefetchBoundStrategy
Different ways to handle accesses outside the original extents in a prefetch.
@ GuardWithIf
Guard the prefetch with if-guards that ignores the prefetch if any of the prefetched region ever goes...
HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition: Func.h:2680
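A minimal sketch of evaluate_may_gpu:
    float root2 = evaluate_may_gpu<float>(sqrt(2.0f));  // JIT-compiles a one-element pipeline and returns the value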
TailStrategy
Different ways to handle a tail case in a split when the factor does not provably divide the extent.
Definition: Schedule.h:33
@ Auto
For pure definitions use ShiftInwards.
LoopAlignStrategy
Different ways to handle the case when the start/end of the loops of stages computed with (fused) are...
Definition: Schedule.h:137
@ Auto
By default, LoopAlignStrategy is set to NoAlign.
Expr min(const FuncRef &a, const FuncRef &b)
Explicit overloads of min and max for FuncRef.
Definition: Func.h:603
NameMangling
An enum to specify calling convention for extern stages.
Definition: Function.h:25
@ Default
Match whatever is specified in the Target.
Target get_jit_target_from_environment()
Return the target that Halide will use for jit-compilation.
DeviceAPI
An enum describing a type of device API.
Definition: DeviceAPI.h:15
@ Host
Used to denote for loops that run on the same device as the containing code.
Target get_target_from_environment()
Return the target that Halide will use.
StmtOutputFormat
Used to determine if the output printed to file should be as a normal string or as an HTML file which...
Definition: Pipeline.h:72
@ Text
Definition: Pipeline.h:73
Stage ScheduleHandle
Definition: Func.h:488
std::vector< Range > Region
A multi-dimensional box.
Definition: Expr.h:345
Expr max(const FuncRef &a, const FuncRef &b)
Definition: Func.h:606
MemoryType
An enum describing different address spaces to be used with Func::store_in.
Definition: Expr.h:348
Partition
Different ways to handle loops with potentially optimizable boundary conditions.
HALIDE_NO_USER_CODE_INLINE T evaluate(JITUserContext *ctx, const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition: Func.h:2626
A fragment of Halide syntax.
Definition: Expr.h:258
HALIDE_ALWAYS_INLINE Type type() const
Get the type of this expression node.
Definition: Expr.h:322
An argument to an extern-defined Func.
A set of custom overrides of runtime functions.
Definition: JITModule.h:35
A context to be passed to Pipeline::realize.
Definition: JITModule.h:136
A struct representing a target machine and os to generate code for.
Definition: Target.h:19
bool has_gpu_feature() const
Is a fully featured GPU compute runtime enabled? I.e.
bool has_feature(Feature f) const
Types in the halide type system.
Definition: Type.h:276
A class that can represent Vars or RVars.
Definition: Func.h:29
bool is_rvar
Definition: Func.h:57
VarOrRVar(const Var &v)
Definition: Func.h:33
VarOrRVar(const RVar &r)
Definition: Func.h:36
const std::string & name() const
Definition: Func.h:47
VarOrRVar(const std::string &n, bool r)
Definition: Func.h:30
VarOrRVar(const ImplicitVar< N > &u)
Definition: Func.h:43
VarOrRVar(const RDom &r)
Definition: Func.h:39
#define user_assert(c)
Definition: test.h:10