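Cool. One thing I noticed, though, is that because you write
images = CurrentImage[2]
every update captures two frames, so the frame rate is halved. A better way to do it might be to capture a single new frame per update and keep the previous one around: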
(* Seed both frame buffers so each holds an image before the first update. *)
image = CurrentImage[];
oldimage = CurrentImage[];
(* On each refresh: the current frame becomes the previous one, then a single
   new frame is captured. *)
Dynamic[
 oldimage = image;
 image = CurrentImage[]
]
Here is a rewritten, faster version:
 With[{keySize = 40, threshold = 0.05},
  With[{overlay = 
     Image[Table[
       If[i === keySize || i < keySize && Mod[j, keySize] === 0, 1, 
        0], {i, 1, 240}, {j, 1, 320}]]},
   DynamicModule[{image = ImageReflect[CurrentImage[], Left -> Right], 
     oldImage = ImageReflect[CurrentImage[], Left -> Right], imageKeys,
      diff, imageKeysDiff, keysPressed},
    Dynamic[
    oldImage = image;
    image = ImageReflect[CurrentImage[], Left -> Right];
    imageKeys = 
     Flatten[ImagePartition[ImageTake[image, keySize], keySize]];
    diff = ImageDifference[oldImage, image];
    imageKeysDiff = 
     Flatten[ImagePartition[ImageTake[diff, keySize], keySize]];
    keysPressed = 
     Flatten[Position[
       Mean[Flatten[ImageData[#]]] > threshold & /@ imageKeysDiff, 
       True]];
    EmitSound[Sound[SoundNote[#, .2, "Xylophone"]]] & /@ keysPressed;
    Show[ImageAdd[image, overlay], ImageSize -> 1000]
    ]]]]