% c_dtype = dtype_to_c_type(dtype)

__kernel void bias_add_grad_<%= dtype %>(__global const <%= c_dtype %> *received_grad, __global <%= c_dtype %> *output) {

const int id = get_global_id(0);
<%= c_dtype %> sum = 0;
for(int i = 0; i < <%= rows %>; i++) {
  sum += received_grad[<%= n %> * i + id];
}
output[id] = sum;

}