本文介绍了 CUDA 映射内存(mapped memory)中“设备 -> 主机”方向的写入在主机端不可见的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

我想要做的是修改驻留在映射内存中的变量,这将导致主程序退出。



但是,主程序并没有退出,而是一直在 while (var == 0); 这一行上自旋。我不知道如何把新的值刷新出去,使它在主机端也可见。



顺便说一下,该变量在所有地方都声明为 volatile,我也尝试过使用 __threadfence_system() 函数,但没有成功。

主机 -> 设备方向工作正常。



系统:Windows 7 x64,驱动程序358.50,GTX 560



下面是我无法工作的代码:

  // Terminate the program with a diagnostic when a CUDA runtime call failed.
  //   err  - status returned by the CUDA runtime call
  //   file - source file of the call site (supplied by CUDA_ERROR_CHECK)
  //   line - source line of the call site (supplied by CUDA_ERROR_CHECK)
  // Returns normally only when err == cudaSuccess.
  static void handleCUDAError(cudaError_t err, const char *file, int line)
  {
      if (err == cudaSuccess) {
          return;
      }
      // Diagnostics belong on stderr so they are not mixed with program output.
      fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
      exit(EXIT_FAILURE);
  }

  // Check the status of a CUDA runtime call, reporting the call site on failure.
  #define CUDA_ERROR_CHECK(err) (handleCUDAError(err, __FILE__, __LINE__))

// Stores 1 through `semaphore` and then calls __threadfence_system().
//   semaphore - pointer to the flag to set; main() passes the device pointer
//               obtained from cudaHostGetDevicePointer for its mapped variable.
__global__ void echoKernel(volatile int *semaphore)
{
    semaphore[0] = 1;
    __threadfence_system();
}

// Reproducer from the question: registers a host variable as mapped memory,
// launches echoKernel to set it, and spins on the host until the new value
// becomes visible. No runtime API call is issued between the launch and the
// spin loop; this is the behaviour the question is asking about, so it is
// intentionally left as-is.
int main()
{
    CUDA_ERROR_CHECK(cudaSetDevice(0));
    CUDA_ERROR_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));

    volatile int var = 0;   // flag the kernel is expected to set to 1
    volatile int *devptr;   // device-side pointer to `var`

    // Register the host variable as mapped memory and fetch its device pointer.
    CUDA_ERROR_CHECK(cudaHostRegister((int *)&var, sizeof(int), cudaHostRegisterMapped));
    CUDA_ERROR_CHECK(cudaHostGetDevicePointer(&devptr, (int *)&var, 0));

    echoKernel<<<1, 1>>>(devptr);

    // Busy-wait until the device write is observed on the host.
    while (var == 0)
        ;

    CUDA_ERROR_CHECK(cudaDeviceSynchronize());

    CUDA_ERROR_CHECK(cudaHostUnregister((int *)&var));
    CUDA_ERROR_CHECK(cudaDeviceReset());

    return 0;
}


解决方案

当我在 Linux 上运行您的代码时,它无需任何修改即可正常运行。



然而在 Windows 上,WDDM 命令批处理会带来一个问题。实际上,您的内核并没有被启动,而且在您进入那个会挂起的 while 循环之前一直没有被启动。



WDDM命令队列是一个命令队列,最终去GPU设备。各种事件将导致该队列被刷新并且内容作为批处理命令被传递到GPU。



各种 CUDA 运行时 API 调用可以有效地强制刷新命令队列,例如 cudaDeviceSynchronize() 或 cudaMemcpy()。但是在内核启动之后、进入 while 循环之前,您没有发出任何运行时 API 调用。因此,在这种情况下,内核调用似乎被“卡”在队列中,始终没有被刷新。



您可以通过多种方式解决这个问题,例如在启动内核之后记录一个事件,然后查询该事件的状态。这样做会起到刷新队列的作用,从而使内核得以启动。



下面是一个适用于我的代码修改示例:

  #include <stdio.h>
  #include <stdlib.h>

  // Terminate the program with a diagnostic when a CUDA runtime call failed.
  //   err  - status returned by the CUDA runtime call
  //   file - source file of the call site (supplied by CUDA_ERROR_CHECK)
  //   line - source line of the call site (supplied by CUDA_ERROR_CHECK)
  // Returns normally only when err == cudaSuccess.
  static void handleCUDAError(cudaError_t err, const char *file, int line)
  {
      if (err == cudaSuccess) {
          return;
      }
      // Diagnostics belong on stderr so they are not mixed with program output.
      fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
      exit(EXIT_FAILURE);
  }

  // Check the status of a CUDA runtime call, reporting the call site on failure.
  #define CUDA_ERROR_CHECK(err) (handleCUDAError(err, __FILE__, __LINE__))

// Stores 1 through `semaphore` and then calls __threadfence_system().
//   semaphore - pointer to the flag to set; main() passes the device pointer
//               obtained from cudaHostGetDevicePointer for its mapped variable.
__global__ void echoKernel(volatile int *semaphore)
{
    *semaphore = 1;

    // The intrinsic is spelled in lower case: __threadfence_system.
    __threadfence_system();
}

// Registers a host variable as mapped memory, launches echoKernel to set it,
// and spins on the host until the device write becomes visible. An event is
// recorded and queried right after the launch so that the queued launch is
// actually submitted to the GPU before the host starts spinning.
int main()
{
    CUDA_ERROR_CHECK(cudaSetDevice(0));
    CUDA_ERROR_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));

    volatile int var = 0;   // flag the kernel sets to 1
    volatile int *devptr;   // device-side pointer to `var`

    // Register the host variable as mapped memory and fetch its device pointer.
    CUDA_ERROR_CHECK(cudaHostRegister((int *)&var, sizeof(int), cudaHostRegisterMapped));
    CUDA_ERROR_CHECK(cudaHostGetDevicePointer(&devptr, (int *)&var, 0));

    cudaEvent_t my_event;
    CUDA_ERROR_CHECK(cudaEventCreate(&my_event));

    echoKernel<<<1, 1>>>(devptr);
    // A kernel launch returns no status; without this check a rejected launch
    // would leave the spin loop below waiting forever.
    CUDA_ERROR_CHECK(cudaGetLastError());
    CUDA_ERROR_CHECK(cudaEventRecord(my_event));

    // Querying the event has the effect of flushing the command queue.
    // cudaErrorNotReady is the expected result while the event is still
    // pending, so only other statuses are treated as failures.
    cudaError_t queryStatus = cudaEventQuery(my_event);
    if (queryStatus != cudaErrorNotReady) {
        CUDA_ERROR_CHECK(queryStatus);
    }

    // Busy-wait until the device write is observed on the host.
    while (var == 0)
        ;

    CUDA_ERROR_CHECK(cudaDeviceSynchronize());

    CUDA_ERROR_CHECK(cudaEventDestroy(my_event));
    CUDA_ERROR_CHECK(cudaHostUnregister((int *)&var));
    CUDA_ERROR_CHECK(cudaDeviceReset());

    return 0;
}

已在 CUDA 7.5、驱动程序 358.50、Win7 x64 release 项目、GTX460M 上测试。

请注意,我们不会在标准错误检查器中包装 cudaEventQuery 调用,因为它的预期行为是在事件尚未完成时返回非零状态。


What I was trying to do is modifying a variable which resides in mapped memory that would cause the main program to exit.

But instead of this the main program keeps spinning on while (var == 0) ; line. I don't know how the new value could be flushed out so it would be visible on the host side too.

Btw. the variable is declared as volatile everywhere and I tried using the __threadfence_system() function with no success.

The host -> device direction works well.

System: Windows 7 x64, driver 358.50, GTX 560

Here is the piece of code that I can't get working:

// Terminate the program with a diagnostic when a CUDA runtime call failed.
//   err  - status returned by the CUDA runtime call
//   file - source file of the call site (supplied by CUDA_ERROR_CHECK)
//   line - source line of the call site (supplied by CUDA_ERROR_CHECK)
// Returns normally only when err == cudaSuccess.
static void handleCUDAError(cudaError_t err, const char *file, int line)
{
    if (err == cudaSuccess) {
        return;
    }
    // Diagnostics belong on stderr so they are not mixed with program output.
    fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
    exit(EXIT_FAILURE);
}

// Check the status of a CUDA runtime call, reporting the call site on failure.
#define CUDA_ERROR_CHECK(err) (handleCUDAError(err, __FILE__, __LINE__ ))

// Stores 1 through `semaphore` and then calls __threadfence_system().
//   semaphore - pointer to the flag to set; main() passes the device pointer
//               obtained from cudaHostGetDevicePointer for its mapped variable.
__global__ void echoKernel(volatile int* semaphore)
{
    semaphore[0] = 1;
    __threadfence_system();
}

// Reproducer from the question: registers a host variable as mapped memory,
// launches echoKernel to set it, and spins on the host until the new value
// becomes visible.
int main()
{
    CUDA_ERROR_CHECK(cudaSetDevice(0));
    CUDA_ERROR_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));

    // Flag the kernel is expected to set to 1, and the device-side pointer to it.
    volatile int var = 0;
    volatile int *devptr;

    // Register the host variable as mapped memory and fetch its device pointer.
    CUDA_ERROR_CHECK(cudaHostRegister((int*)&var, sizeof (int), cudaHostRegisterMapped));
    CUDA_ERROR_CHECK(cudaHostGetDevicePointer(&devptr, (int*)&var, 0));

    // NOTE: no CUDA runtime call follows the launch before the spin loop. Per the
    // accompanying answer, under Windows WDDM the launch can stay in the command
    // queue without being flushed, in which case the loop below never exits.
    echoKernel <<< 1, 1 >>> (devptr);

    // Busy-wait until the device write is observed on the host.
    while (var == 0) ;

    CUDA_ERROR_CHECK(cudaDeviceSynchronize());

    CUDA_ERROR_CHECK(cudaHostUnregister((int*)&var));
    CUDA_ERROR_CHECK(cudaDeviceReset());

    return 0;
}
解决方案

When I run your code on linux, it runs as-is without issue.

However on windows, there is a problem around WDDM command batching. In effect, your kernel does not launch and is not getting launched before you enter the while-loop that hangs.

The WDDM command queue is a queue of commands that will eventually go to the GPU device. Various events will cause this queue to be "flushed" and the contents to be delivered as a "batch" of commands to the GPU.

Various cuda runtime API calls may effectively force the "flushing" of the command queue, such as cudaDeviceSynchronize() or cudaMemcpy(). However after the kernel launch, you are not issuing any runtime API calls before entering your while-loop. As a result, in this scenario it seems that the kernel call is getting "stuck" in the queue and never "flushed".

You can work around this in a variety of ways, for example by recording an event after the launch of the kernel and then querying the status of that event. This will have the effect of flushing the queue, which will launch the kernel.

Here's an example modification of your code that works for me:

#include <stdio.h>
// Terminate the program with a diagnostic when a CUDA runtime call failed.
//   err  - status returned by the CUDA runtime call
//   file - source file of the call site (supplied by CUDA_ERROR_CHECK)
//   line - source line of the call site (supplied by CUDA_ERROR_CHECK)
// Returns normally only when err == cudaSuccess.
static void handleCUDAError(cudaError_t err, const char *file, int line)
{
    if (err == cudaSuccess) {
        return;
    }
    // Diagnostics belong on stderr so they are not mixed with program output.
    fprintf(stderr, "%s in %s at line %d\n", cudaGetErrorString(err), file, line);
    exit(EXIT_FAILURE);
}

// Check the status of a CUDA runtime call, reporting the call site on failure.
#define CUDA_ERROR_CHECK(err) (handleCUDAError(err, __FILE__, __LINE__ ))

// Stores 1 through `semaphore` and then calls __threadfence_system().
//   semaphore - pointer to the flag to set; main() passes the device pointer
//               obtained from cudaHostGetDevicePointer for its mapped variable.
__global__ void echoKernel(volatile int* semaphore)
{
    semaphore[0] = 1;
    __threadfence_system();
}

// Registers a host variable as mapped memory, launches echoKernel to set it,
// and spins on the host until the device write becomes visible. An event is
// recorded and queried right after the launch so that the queued launch is
// actually submitted to the GPU before the host starts spinning.
int main()
{
    CUDA_ERROR_CHECK(cudaSetDevice(0));
    CUDA_ERROR_CHECK(cudaSetDeviceFlags(cudaDeviceMapHost));

    volatile int var = 0;   // flag the kernel sets to 1
    volatile int *devptr;   // device-side pointer to `var`

    // Register the host variable as mapped memory and fetch its device pointer.
    CUDA_ERROR_CHECK(cudaHostRegister((int*)&var, sizeof(int), cudaHostRegisterMapped));
    CUDA_ERROR_CHECK(cudaHostGetDevicePointer(&devptr, (int*)&var, 0));

    cudaEvent_t my_event;
    CUDA_ERROR_CHECK(cudaEventCreate(&my_event));

    echoKernel<<<1, 1>>>(devptr);
    // A kernel launch returns no status; without this check a rejected launch
    // would leave the spin loop below waiting forever.
    CUDA_ERROR_CHECK(cudaGetLastError());
    CUDA_ERROR_CHECK(cudaEventRecord(my_event));

    // Querying the event has the effect of flushing the command queue.
    // cudaErrorNotReady is the expected result while the event is still
    // pending, so only other statuses are treated as failures.
    cudaError_t queryStatus = cudaEventQuery(my_event);
    if (queryStatus != cudaErrorNotReady) {
        CUDA_ERROR_CHECK(queryStatus);
    }

    // Busy-wait until the device write is observed on the host.
    while (var == 0)
        ;

    CUDA_ERROR_CHECK(cudaDeviceSynchronize());

    CUDA_ERROR_CHECK(cudaEventDestroy(my_event));
    CUDA_ERROR_CHECK(cudaHostUnregister((int*)&var));
    CUDA_ERROR_CHECK(cudaDeviceReset());

    return 0;
}

Tested on CUDA 7.5, Driver 358.50, Win7 x64 release project, GTX460M.

Note that we don't wrap the cudaEventQuery call in a standard error checker, because the expected behavior for it is to return a non-zero status when the event has not been completed yet.

这篇关于 CUDA 映射内存(mapped memory)中“设备 -> 主机”方向的写入在主机端不可见的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!

11-01 03:23