In my current project I need a high-performance xorcpy implemented in C. Here is my implementation to share; tested on my Xeon E5-2680 PC, it reaches almost 1.7 GB/s.
#include <emmintrin.h>
...
/* Portable "always inline" spelling: MSVC keeps its native __forceinline,
 * GCC/Clang use the always_inline attribute, anything else falls back to
 * plain static inline. */
#ifndef XORCPY_INLINE
#if defined(_MSC_VER)
#define XORCPY_INLINE __forceinline
#elif defined(__GNUC__) || defined(__clang__)
#define XORCPY_INLINE static inline __attribute__((always_inline))
#else
#define XORCPY_INLINE static inline
#endif
#endif

/**
 * XOR a block of bytes from src into dst, in place: dst[i] ^= src[i].
 *
 * The bulk of the block is processed 16 bytes at a time with SSE2
 * unaligned loads/stores, so neither pointer has to be 16-byte aligned.
 * Any remaining bytes (block_size % 16) are handled one at a time.
 *
 * @param dst        Buffer that is read, XOR-ed with src and written back.
 * @param src        Buffer that is only read.
 * @param block_size Number of bytes to process; 0 is a no-op.
 * @return dst, unchanged.
 */
XORCPY_INLINE unsigned char* xorcpy(unsigned char* dst, const unsigned char* src, unsigned block_size)
{
    const unsigned vec_bytes = (unsigned)sizeof(__m128i);
    unsigned char *out = dst;
    const unsigned char *in = src;
    unsigned remaining = block_size;

    /* Bulk: one __m128i (16 bytes) per iteration. Counting the remaining
     * bytes down avoids the "count - 1" wraparound that a signed,
     * decrementing index would need when the count is zero. */
    while (remaining >= vec_bytes) {
        __m128i acc = _mm_loadu_si128((const __m128i *)out);
        const __m128i rhs = _mm_loadu_si128((const __m128i *)in);
        acc = _mm_xor_si128(acc, rhs); /* XOR 16 bytes */
        _mm_storeu_si128((__m128i *)out, acc);
        out += vec_bytes;
        in += vec_bytes;
        remaining -= vec_bytes;
    }

    /* Tail: fewer than 16 bytes left, XOR them one byte at a time. */
    while (remaining > 0) {
        *out++ ^= *in++;
        --remaining;
    }

    return dst;
}