我对clEnqueueWriteBuffer有一个很好奇的问题。在我当前的项目中,我想将约500张图像(1GB)复制到图形卡上,并平均一些像素。图像存储在一个大的double *数组中(大小:width * height * nImages)。如果我将300张图像复制到VRAM中并使用clEnqueueReadBuffer读出,我将得到存储在RAM中的确切信息:
内存:14450,5006076793 14450,5006076793 14456,8079379383 14455,2294939826 14444,7361060619
VRAM:14450,5006076793 14450,5006076793 14456,8079379383 14455,2294939826 14444,7361060619
但是,如果加载的图像超过350张,则cl_mem对象的内容已损坏:
RAM:14450,5006076793 14450,5006076793 14456,8079379383 14455,2294939826 14444,7361060619
VRAM:-6,27743856220419E + 66 -6,27743856220419E + 66 -6,27743856220419E + 66 -6,27743856220419E + 66 -6,27743856220419E + 66
如果您能帮助我,我将非常高兴!
这是我的代码:
private: System::Void button7_Click(System::Object^ sender, System::EventArgs^ e) {
std::string text;
text = StringConvA(maskedTextBox1->Text);
textBox1->Text += "You want a bin size of " + atoi(text.c_str()) + ". You have "+ nforegroundImages+" images.\r\n";
binWidth = atoi(text.c_str());
nbins = (int)ceil((double)nforegroundImages / (double)binWidth);
textBox1->Text += "That is going to give you "+nbins+" bins\r\n";
//create context and cmd_queue
context = clCreateContext(NULL, nDevices, &deviceID[0], NULL, NULL, &err);
cmd_queue = clCreateCommandQueue(context, deviceID[0], NULL, &err);
//allocate result memory
//each result image will have width*height double entries. res_im is an array of pointer to double.
res_im = (double*)malloc(width*height*sizeof(double)*nbins);
cl_mem imageData_mem, result_mem, nWavenumber_mem, binSize_mem, imageSizeInPixels_mem, nbins_mem;
imageData_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, width * height * sizeof(double)*nforegroundImages, NULL, NULL);
result_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, width * height * sizeof(double)*nbins, NULL, NULL);
nWavenumber_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, NULL);
binSize_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, NULL);
imageSizeInPixels_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, NULL);
nbins_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int), NULL, NULL);
clFinish(cmd_queue);
int imageSizeInPixels = width*height;
err = clEnqueueWriteBuffer(cmd_queue, imageData_mem, CL_TRUE, 0, width*height*sizeof(double)*nforegroundImages, (void*)images, 0, NULL, NULL); //this is where the images are copied into VRAM. If nforegroundImages>300, the data in VRAM is wrong, otherwise it is the same as in the images array
err = clEnqueueWriteBuffer(cmd_queue, nWavenumber_mem, CL_TRUE, 0, sizeof(int), (void*)&nforegroundImages, 0, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, binSize_mem, CL_TRUE, 0, sizeof(int), (void*)&binWidth, 0, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, imageSizeInPixels_mem, CL_TRUE, 0, sizeof(int), (void*)&imageSizeInPixels, 0, NULL, NULL);
err = clEnqueueWriteBuffer(cmd_queue, nbins_mem, CL_TRUE, 0, sizeof(int), (void*)&nbins, 0, NULL, NULL);
clFinish(cmd_queue);
//read the content of imageData_mem and store it in test array
double * test = (double*)malloc(width*height*sizeof(double)*nforegroundImages);
err = clEnqueueReadBuffer(cmd_queue, imageData_mem, CL_TRUE, 0, width*height*sizeof(double)*nforegroundImages,
test, 0, NULL, NULL);
clFinish(cmd_queue);
//compare original value from the images array to the value retrieved from the VRAM
textBox1->Text += images[1] + "\t" + images[1] + "\t" + images[10] + "\t" + images[100] + "\t" + images[1000] + "\t\r\n"; //original data
textBox1->Text += test[1] + "\t" + test[1] + "\t" + test[10] + "\t" + test[100] + "\t" + test[1000] + "\t\r\n"; //retrieved from imageData_mem
free(test);
//build the program from the source file and print the program build log
cl_program program[2];
cl_kernel kernel[2];
const char * filename = "addKernel.c";
char *program_source = load_program_source(filename);
program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source,
NULL, &err);
if (err == CL_OUT_OF_HOST_MEMORY){
textBox1->Text += "Error: out of Host Memory!\r\n";
}
else if (err == CL_INVALID_CONTEXT){
textBox1->Text += "Error: invalid Context!\r\n";
}
else if (err == CL_INVALID_VALUE){
textBox1->Text += "Error: invalid Value!\r\n";
}
err = clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL);
textBox1->Text += "Program build error: " + err + "\r\n";
cl_build_status status;
size_t logSize;
clGetProgramBuildInfo(program[0], deviceID[0], CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL);
clGetProgramBuildInfo(program[0], deviceID[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
char* programLog;
programLog = (char*)calloc(logSize + 1, sizeof(char));
clGetProgramBuildInfo(program[0], deviceID[0], CL_PROGRAM_BUILD_LOG, logSize + 1, programLog, NULL);
this->textBox1->Text += "Program build info: error=" + err + ", status=" + status + ", programLog:\r\n" + *programLog + "\r\n" + "In case of an error please make sure that openCL has been initialized\r\n";
kernel[0] = clCreateKernel(program[0], "filterSpectrum", &err);
//(__global double *imageData, __global double *result, __constant int *nWavenumbers, __constant int *binSize, __constant int *imageSizeInPixels,__constant int * nbins)
// Now setup the arguments to our kernel
err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &imageData_mem);
err |= clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &result_mem);
err |= clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &nWavenumber_mem);
err |= clSetKernelArg(kernel[0], 3, sizeof(cl_mem), &binSize_mem);
err |= clSetKernelArg(kernel[0], 4, sizeof(cl_mem), &imageSizeInPixels_mem);
err |= clSetKernelArg(kernel[0], 5, sizeof(cl_mem), &nbins_mem);
size_t local_work_size = 32;
// Run the calculation by enqueuing it and forcing the
// command queue to complete the task
size_t global_work_size = width*height;
err = clEnqueueNDRangeKernel(cmd_queue, kernel[0], 1, NULL,&global_work_size, &local_work_size, 0, NULL, NULL);
clFinish(cmd_queue);
// Once finished read back the results from the answer
// array into the results array
err = clEnqueueReadBuffer(cmd_queue, result_mem, CL_TRUE, 0, width*height*sizeof(double)*nbins,
res_im, 0, NULL, NULL);
clFinish(cmd_queue);
textBox1->Text += "result values " + res_im[1] + "\t" + res_im[100] + "\t" + res_im[1000] + "\t" + res_im[10000] + "\t" + res_im[100000] + "\t" + res_im[1000000] + "\r\n";
hScrollBar2->Maximum = nbins+3;
clReleaseMemObject(imageSizeInPixels_mem);
clReleaseMemObject(imageData_mem);
clReleaseMemObject(result_mem);
clReleaseMemObject(nWavenumber_mem);
clReleaseMemObject(binSize_mem);
clReleaseMemObject(nbins_mem);
clReleaseCommandQueue(cmd_queue);
clReleaseContext(context);
}
最佳答案
您最有可能要求的内存大于驱动程序在单个分配中所允许的内存。看来您没有检查OpenCL运行时函数返回的大多数错误代码;这样做使诊断OpenCL程序的问题变得更加容易。您确实应该为每个API调用执行此操作。
您可以使用以下代码片段找出设备支持的最大单个内存分配:
cl_ulong maxMemAlloc;
clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL);
textBox1->Text += "Maximum memory allocation size is " + maxMemAlloc + " bytes\r\n";
通常情况下,最大的内存分配远远小于GPU内存的总大小。 OpenCL规范仅要求它至少为最大大小的1/4,或至少为128 MB。