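I have CUDA 6.0 and Mathematica 9. For large matrix multiplication, CUDADot runs slower than the built-in Dot: for a 3000×3000 matrix of reals, the timings below are about 3.0 s for CUDADot versus about 1.8 s for Dot.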
In[1]:= ClearAll[Evaluate[Context[] <> "*"]]  (* clear every symbol in the current context so earlier definitions cannot interfere *)
(* Matrix dimension: 3*^3 is 3000, so the test multiplies two 3000 x 3000 matrices. *)
n = 3*^3;
(* n x n matrix of uniformly distributed reals in [0, 1]. *)
M = RandomReal[1, {n, n}];
(* Register M with the CUDALink memory manager; MG is a CUDAMemory handle, not a list. *)
(* NOTE(review): the host-to-device copy may be deferred until first use, in which case *)
(* the timing below would include the transfer -- confirm against the CUDALink docs. *)
MG = CUDAMemoryLoad[M];
(* Wall-clock time, in seconds, of the GPU product MG.MG; A is again a CUDAMemory handle. *)
AbsoluteTiming[A = CUDADot[MG, MG]]
(* CUDAMemoryUnload only releases the handle and does not return the matrix, so copy the *)
(* product back to the kernel with CUDAMemoryGet first, then free the GPU-side buffer. *)
B = CUDAMemoryGet[A];
CUDAMemoryUnload[A];
Out[3]= True
Out[4]= {1 -> {"Name" -> "GeForce GT 650M", "Clock Rate" -> 900000,
"Compute Capabilities" -> 3., "GPU Overlap" -> 1,
"Maximum Block Dimensions" -> {1024, 1024, 64},
"Maximum Grid Dimensions" -> {2147483647, 65535, 65535},
"Maximum Threads Per Block" -> 1024,
"Maximum Shared Memory Per Block" -> 49152,
"Total Constant Memory" -> 65536, "Warp Size" -> 32,
"Maximum Pitch" -> 2147483647,
"Maximum Registers Per Block" -> 65536, "Texture Alignment" -> 512,
"Multiprocessor Count" -> 2, "Core Count" -> 64,
"Execution Timeout" -> 1, "Integrated" -> False,
"Can Map Host Memory" -> True, "Compute Mode" -> "Default",
"Texture1D Width" -> 65536, "Texture2D Width" -> 65536,
"Texture2D Height" -> 65536, "Texture3D Width" -> 4096,
"Texture3D Height" -> 4096, "Texture3D Depth" -> 4096,
"Texture2D Array Width" -> 16384,
"Texture2D Array Height" -> 16384,
"Texture2D Array Slices" -> 2048, "Surface Alignment" -> 512,
"Concurrent Kernels" -> True, "ECC Enabled" -> False,
"TCC Enabled" -> False, "Total Memory" -> 1073414144}}
Out[5]= {{"Name" -> "CUDAResources", "Version" -> "",
"BuildNumber" -> "", "Qualifier" -> "OSX",
"MathematicaVersion" -> "9.0.0+", "SystemID" -> {"MacOSX-x86-64"},
"Description" -> "{ToolkitVersion -> 5.0, MinimumDriver -> 270.0}",
"Category" -> "", "Creator" -> "", "Publisher" -> "",
"Support" -> "", "Internal" -> False,
"Location" ->
OSX-", "Context" -> {}, "Enabled" -> True, "Loading" -> Manual,
"Hash" -> "fa491b5d7dd0144b2608a1daf4530222"}}
Out[8]= {1.819966, Null}
Out[10]= {3.032584, CUDAMemory["<477942836>", "Double"]}