What does the following return for you?
(* CUDA C source of the addTwo kernel, held as a string for CUDAFunctionLoad.
   Each thread derives one flat index from its thread and block coordinates;
   when that index is below length it writes in[index] + 2 to out[index]. *)
code = "
__global__ void addTwo(mint * in, mint * out, mint length) {
int index = threadIdx.x + blockIdx.x*blockDim.x;
if (index < length)
out[index] = in[index] + 2;
}";

(* Load the kernel and keep the resulting object in cudaFun. The assignment is
   deliberately left without a trailing semicolon so the result is displayed.
   With[] injects the constants below into the call without creating globals. *)
cudaFun = With[
  {
    (* name of the __global__ function inside the source string *)
    kernelName = "addTwo",
    (* one entry per kernel parameter: integer input array, integer output
       array, then a scalar integer (the length) *)
    argSpec = {{_Integer, _, "Input"}, {_Integer, _, "Output"}, _Integer},
    (* block dimension handed to CUDAFunctionLoad *)
    threadsPerBlock = 256
  },
  CUDAFunctionLoad[
    code, kernelName, argSpec, threadsPerBlock,
    (* both hooks are set to Print so the compile commands and the
       compiler's output are echoed for diagnosis *)
    "ShellCommandFunction" -> Print,
    "ShellOutputFunction" -> Print
  ]
]