% c_dtype = dtype_to_c_type(dtype)

// element-wise in-place scaled subtraction: C[i] -= A[i] * B[0] (only the first element of B is used)
__kernel void apply_gradient_<%= dtype %>(__global const <%= c_dtype %> *A, __global const <%= c_dtype %> *B, __global <%= c_dtype %> *C) {
   // Each work-item updates the one element at its global index along dimension 0.
   // NOTE(review): there is no bounds check here; presumably the host enqueues
   // exactly one work-item per element of A and C -- confirm against the caller.
   const int idx = get_global_id(0);

   // Only B[0] is ever read: a single scalar factor used by every work-item.
   const <%= c_dtype %> scale = B[0];

   // In-place update of C: subtract the scaled A element.
   C[idx] = C[idx] - A[idx] * scale;
}