SIMD high performance xorcpy in C

In my current project I needed a high-performance xorcpy implemented in C. Here is my implementation to share; tested on my Xeon E5-2680 PC, it reaches almost 1.7GB/s.

#include <emmintrin.h>
#if !defined(_MSC_VER) && !defined(__forceinline)
/* __forceinline is an MSVC keyword; fall back to plain inline elsewhere. */
#define __forceinline inline
#endif

/*
 * XORs block_size bytes of src into dst in place (dst[i] ^= src[i]).
 *
 * dst        - buffer that is both an input and the output; must hold at
 *              least block_size bytes.
 * src        - bytes to XOR into dst; must hold at least block_size bytes.
 * block_size - number of bytes to process; 0 is allowed and does nothing.
 *
 * Returns dst.
 *
 * Neither pointer needs to be 16-byte aligned: the unaligned SSE2 load/store
 * intrinsics are used throughout.
 */
__forceinline unsigned char* xorcpy(unsigned char* dst, const unsigned char* src, unsigned block_size)
{
    const unsigned vec_bytes = (unsigned)sizeof(__m128i);
    unsigned vec_count = block_size / vec_bytes;   /* whole 16-byte chunks */
    unsigned tail_count = block_size % vec_bytes;  /* leftover bytes (0..15) */

    /* Do the bulk of the work a __m128i at a time, for faster speed */
    __m128i* mto = (__m128i*)dst;
    const __m128i* mfrom = (const __m128i*)src;
    for (; vec_count != 0; --vec_count)
    {
        const __m128i lhs = _mm_loadu_si128(mto);
        const __m128i rhs = _mm_loadu_si128(mfrom);

        _mm_storeu_si128(mto, _mm_xor_si128(lhs, rhs));   /* XOR 16 bytes */

        /* Step to the next chunk; without this every iteration would
           re-XOR the first 16 bytes. */
        ++mto;
        ++mfrom;
    }

    /* The remaining bytes have to be done one at a time */
    unsigned char* cto = (unsigned char*)mto;
    const unsigned char* cfrom = (const unsigned char*)mfrom;
    for (; tail_count != 0; --tail_count)
    {
        *cto++ ^= *cfrom++;
    }

    return dst;
}

Convert DIB or BMP into JPEG in memory (diskless) using Windows GDIPlus


    using namespace Gdiplus;

    GdiplusStartupInput gdiplusStartupInput;
    ULONG_PTR gdiplusToken;
    GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, NULL);
    IStream* pJpegStream = NULL;  //declare a memory stream
    CreateStreamOnHGlobal(NULL, TRUE, (LPSTREAM*)&pJpegStream);

    GpBitmap* pBitmap = NULL;
    unsigned char* dib = GetDib(); //get DIB or BMP data buffer
    DllExports::GdipCreateBitmapFromGdiDib((LPBITMAPINFO)dib, dib + sizeof(BITMAPINFOHEADER), &pBitmap);

    CLSID imageCLSID;
    GetEncoderClsid(L"image/jpeg", &imageCLSID);

    int jpegQuality = 100;

    EncoderParameters encoderParams; //setup jpeg encoder parameters
    encoderParams.Count = 1;
    encoderParams.Parameter[0].NumberOfValues = 1;
    encoderParams.Parameter[0].Guid = EncoderQuality;
    encoderParams.Parameter[0].Type = EncoderParameterValueTypeLong;
    encoderParams.Parameter[0].Value = &jpegQuality;

    //save jpeg into memory stream
    DllExports::GdipSaveImageToStream(pBitmap, pJpegStream, &imageCLSID, &encoderParams);

    LARGE_INTEGER lnOffset;
    lnOffset.QuadPart = 0;
    //determine memory stream length
    pJpegStream->Seek(lnOffset, STREAM_SEEK_END, &ulnSize);
    pJpegStream->Seek(lnOffset, STREAM_SEEK_SET, NULL);
    int encodedBytes = ulnSize.QuadPart;
    //store jpeg memory stream into jpeg data buffer
    pJpegStream->Read(racBuf_, ulnSize.QuadPart, NULL);

Programmatically disable Windows Firewall in C/C++

// firewall.cpp : Defines the entry point for the console application.

#include "stdafx.h"
/*
Copyright (C) Microsoft. All Rights Reserved.

    This C++ file includes sample code for disabling Windows Firewall 
    per profile using the Microsoft Windows Firewall APIs.
*/


#pragma comment( lib, "ole32.lib" )

// Forward declarations
HRESULT     WFCOMInitialize(INetFwPolicy2** ppNetFwPolicy2);

// Disables Windows Firewall for the Domain, Private and Public profiles.
// Initializes COM, obtains INetFwPolicy2 through WFCOMInitialize, turns the
// firewall off one profile at a time and stops at the first failure.
// Always returns 0; failures are reported on stdout.
int __cdecl main()
{
    HRESULT hrComInit = S_OK;
    HRESULT hr = S_OK;

    INetFwPolicy2 *pNetFwPolicy2 = NULL;

    // Initialize COM.
    hrComInit = CoInitializeEx(
                    0,
                    COINIT_APARTMENTTHREADED
                    );

    // Ignore RPC_E_CHANGED_MODE; this just means that COM has already been
    // initialized with a different mode. Since we don't care what the mode is,
    // we'll just use the existing mode.
    if (hrComInit != RPC_E_CHANGED_MODE)
    {
        if (FAILED(hrComInit))
        {
            printf("CoInitializeEx failed: 0x%08lx\n", hrComInit);
            goto Cleanup;
        }
    }

    // Retrieve INetFwPolicy2
    hr = WFCOMInitialize(&pNetFwPolicy2);
    if (FAILED(hr))
    {
        goto Cleanup;
    }

    // Disable Windows Firewall for the Domain profile
    hr = pNetFwPolicy2->put_FirewallEnabled(NET_FW_PROFILE2_DOMAIN, FALSE);
    if (FAILED(hr))
    {
        printf("put_FirewallEnabled failed for Domain: 0x%08lx\n", hr);
        goto Cleanup;
    }

    // Disable Windows Firewall for the Private profile
    hr = pNetFwPolicy2->put_FirewallEnabled(NET_FW_PROFILE2_PRIVATE, FALSE);
    if (FAILED(hr))
    {
        printf("put_FirewallEnabled failed for Private: 0x%08lx\n", hr);
        goto Cleanup;
    }

    // Disable Windows Firewall for the Public profile
    hr = pNetFwPolicy2->put_FirewallEnabled(NET_FW_PROFILE2_PUBLIC, FALSE);
    if (FAILED(hr))
    {
        printf("put_FirewallEnabled failed for Public: 0x%08lx\n", hr);
        goto Cleanup;
    }

Cleanup:

    // Release INetFwPolicy2
    if (pNetFwPolicy2 != NULL)
    {
        pNetFwPolicy2->Release();
    }

    // Uninitialize COM. Only balance a CoInitializeEx call that succeeded;
    // RPC_E_CHANGED_MODE is a failure code, so it is skipped here.
    if (SUCCEEDED(hrComInit))
    {
        CoUninitialize();
    }

    return 0;
}

// Instantiate INetFwPolicy2
// Creates the NetFwPolicy2 COM object and stores its INetFwPolicy2 interface
// in *ppNetFwPolicy2. COM must already be initialized on the calling thread.
// Returns the HRESULT from CoCreateInstance; on failure the error is also
// printed to stdout. The caller releases the returned interface.
HRESULT WFCOMInitialize(INetFwPolicy2** ppNetFwPolicy2)
{
    HRESULT hr = S_OK;

    hr = CoCreateInstance(
        __uuidof(NetFwPolicy2),
        NULL,
        CLSCTX_INPROC_SERVER,
        __uuidof(INetFwPolicy2),
        (void**)ppNetFwPolicy2);

    if (FAILED(hr))
    {
        printf("CoCreateInstance for INetFwPolicy2 failed: 0x%08lx\n", hr);
        goto Cleanup;
    }

Cleanup:
    return hr;
}

Maximize the output of your CPU (Keep your CPU in full power mode)

Recently I am working on UI decoding optimization. I found this program, Full Throttle Override, is very useful, and it can fully release the power of your CPU.

To balance power consumption and performance, almost all x86 CPUs support Cool’n’Quiet, SpeedStep, or PowerNow! technology, which dynamically adjusts the CPU frequency based on the load.

I found it’s pretty easy to implement Full Throttle Override and here is the core C++ code

void FullThrottle()
    memset(&osvi, 0, sizeof(OSVERSIONINFO));
    osvi.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
    // For Vista and above
    if (osvi.dwMajorVersion >= 6)
	GUID *scheme;
	PowerGetActiveScheme(NULL, &scheme);
            , scheme
            , 100);
	PowerSetActiveScheme(NULL, scheme);
	MessageBox(NULL, L"Not supported by your OS!",L"",0);

Feel the power of parallel computing (OpenMP)

For the past two weeks I have been working on the UI side of our product to improve the performance of animation rendering. Previously, a single thread decoded the animation line by line, and it took around 50ms for the whole frame.


Now I have changed the way of rendering so that all lines are decoded in parallel, taking full advantage of modern multi-core CPUs.



Visual Studio natively supports OpenMP, which gives me an easy way to access this powerful tool.

To set this compiler option in the Visual Studio development environment

  1. Open the project’s Property Pages dialog box. For details, see How to: Open Project Property Pages.
  2. Expand the Configuration Properties node.
  3. Expand the C/C++ node.
  4. Select the Language property page.
  5. Modify the OpenMP Support property.

After some simple code updates, I was surprised to find that my frame decoding performance rose to about 950% of the original (almost 10 times faster), from 8 FPS to 76 FPS!


Let’s do a simple test with the following code:

#define TEST_LENGTH 0x3fffffff

double mptest()
    LARGE_INTEGER  large_interger;
    double dff;
    __int64  c1, c2;
    dff = large_interger.QuadPart;
    unsigned char *test = new unsigned char[TEST_LENGTH];
    c1 = large_interger.QuadPart;
    #pragma omp parallel for
    for (int i = 0; i<TEST_LENGTH; i++)
        test[i] = rand();
    c2 = large_interger.QuadPart;
    delete test;
    return (c2 - c1) * 1000.0f / dff;

double test()
    LARGE_INTEGER  large_interger;
    double dff;
    __int64  c1, c2;
    dff = large_interger.QuadPart;
    unsigned char *test = new unsigned char[TEST_LENGTH];
    c1 = large_interger.QuadPart;
    for (int i = 0; i<TEST_LENGTH; i++)
        test[i] = rand();
    c2 = large_interger.QuadPart;
    delete test;
    return (c2 - c1) * 1000.0f / dff;

int _tmain(int argc, _TCHAR* argv[])
    printf("Random generation cost with MP %lfmsn", mptest());
    printf("Random generation cost without MP %lfmsn", test());
    return 0;

Look at the huge difference!


A funny joke application makes your window dance

These days I am so busy with work, and haven’t got time to write anything.

This is a funny joke application I wrote a long time ago. It will make your foreground window dance. If you press any key, the application will exit.

You can download here
This is the code, in case you want to make your own.

#include "stdafx.h"
#include "math.h"
#include "windows.h"
#include "time.h"
#include "conio.h"

int _tmain(int argc, _TCHAR* argv[])
	RECT rect;
	int offset;
		HWND hWnd = GetForegroundWindow();
		offset = sin((float)rand())*10;
		rect.left += offset;
		rect.bottom += offset;
		rect.right += offset; += offset;
	return 0;

Web (HTTP) based remote PC monitor

VNC and Microsoft Remote Desktop both provide remote control software that lets you see and interact with desktop applications across any network; however, either one requires installing a client application on the controlling side.

Here I have developed a small tool, WebPCMonitor. It allows you to see and interact with your own PC desktop over the HTTP protocol. This means that on the client side you don’t need to install any 3rd party tool, only an internet browser. Simply type your remote PC’s name or IP address, and you will see your PC’s desktop. You can even remote control your PC from a PDA or smartphone (e.g. iPhone, iPad, Android, etc.) as long as you have internet access. This tool works with Win98, Win2000, WinXP, and Win7.

You can download here.

Run the application, and open your browser and type “http://localhost:8000“. If you know your PC’s IP address, you can remote access through PDA or SmartPhone by typing “http://address:8000