This post records the steps I took to run my first “Hello World” OpenCL C++ program.
- To make things easier, I created OpenCL.zip, which contains the OpenCL library and the C/C++ header files.
- Create a Visual Studio C++ project
- The following code adds two arrays of 2^25 elements each, element by element, using the GPU:
#include <cstring>
#include <iostream>
#include <string>
#include <vector>
using namespace std;
#define __CL_ENABLE_EXCEPTIONS
#include <CL\cl.hpp>
// OpenCL C source for the `add` kernel, which computes c = a + b element-wise.
// The text is handed to the OpenCL compiler at run time, so every character
// (including the embedded newlines) is part of the program.
static const char source[] =
    // Signature: element count, the two read-only inputs, then the output.
    "kernel void add(\n ulong n,\n global const float *a,\n global const float *b,\n global float *c\n )\n"
    // Body: each work-item handles the element at its global id and skips
    // any id that is not below n.
    "{\n size_t i = get_global_id(0);\n if (i < n) {\n c[i] = a[i] + b[i];\n }\n}\n";
int main() {
const size_t N = 1 << 25;
try {
// Get list of OpenCL platforms.
std::vector platform;
cl::Platform::get(&platform);
if (platform.empty()) {
std::cerr << "OpenCL platforms not found." << std::endl;
return 1;
}
// Get first available GPU device.
cl::Context context;
std::vector device;
for (auto p = platform.begin(); device.empty() && p != platform.end(); p++) {
std::vector pldev;
try {
p->getDevices(CL_DEVICE_TYPE_DEFAULT, &pldev);
for (auto d = pldev.begin(); device.empty() && d != pldev.end(); d++) {
if (!d->getInfo()) continue;
std::string ext = d->getInfo();
device.push_back(*d);
context = cl::Context(device);
}
}
catch (...) {
device.clear();
}
}
if (device.empty()) {
std::cerr << "GPUs device not found." << std::endl;
return 1;
}
std::cout << device[0].getInfo() << std::endl;
// Create command queue.
cl::CommandQueue queue(context, device[0]);
// Compile OpenCL program for found device.
cl::Program program(context, cl::Program::Sources(
1, std::make_pair(source, strlen(source))
));
try {
program.build(device);
}
catch (const cl::Error&) {
std::cerr
<< "OpenCL compilation error" << std::endl
<< program.getBuildInfo(device[0])
<< std::endl;
return 1;
}
cl::Kernel add(program, "add");
// Prepare input data.
std::vector a(N, 1);
std::vector b(N, 2);
std::vector c(N);
// Allocate device buffers and transfer input data to device.
cl::Buffer A(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
a.size() * sizeof(float), a.data());
cl::Buffer B(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
b.size() * sizeof(float), b.data());
cl::Buffer C(context, CL_MEM_READ_WRITE,
c.size() * sizeof(float));
// Set kernel parameters.
add.setArg(0, static_cast(N));
add.setArg(1, A);
add.setArg(2, B);
add.setArg(3, C);
// Launch kernel on the compute device.
queue.enqueueNDRangeKernel(add, cl::NullRange, N, cl::NullRange);
// Get result back to host.
queue.enqueueReadBuffer(C, CL_TRUE, 0, c.size() * sizeof(float), c.data());
// Should get '3' here.
std::cout << c[42] << std::endl;
}
catch (const cl::Error &err) {
std::cerr
<< "OpenCL error: "
<< err.what() << "(" << err.err() << ")"
<< std::endl;
return 1;
}
}