[Rtk-users] GPU kernel do not change output variables

Kabelitz, Gordian Gordian.Kabelitz at medma.uni-heidelberg.de
Tue Mar 5 02:43:51 EST 2019


Hello,

I removed the cudaMalloc and it works now.
Thank you very much for pointing this out.

Best regards,
Gordian

Von: Simon Rit [mailto:simon.rit at creatis.insa-lyon.fr]
Gesendet: Montag, 4. März 2019 13:16
An: Kabelitz, Gordian
Cc: rtk-users at public.kitware.com
Betreff: Re: [Rtk-users] GPU kernel do not change output variables

Hi,
I think the problem is that you're doing the memory allocation yourself. The cudaMalloc is automatically done by the data manager when you require the buffer pointer (GetCudaDataManager()->GetGPUBufferPointer()), you don't have to do it. Try removing the cudaMalloc calls.
Cheers,
Simon

On Sun, Mar 3, 2019 at 5:38 PM Kabelitz, Gordian <Gordian.Kabelitz at medma.uni-heidelberg.de<mailto:Gordian.Kabelitz at medma.uni-heidelberg.de>> wrote:
Hi Simon,

I made a workaround by including a explicit copying from device to host after the kernel is called. The function looks now like this:

void
CUDA_gradient(
       int size[3],
       float spacing[3],
       float *dev_in,
       float *dev_out_x,
       float *dev_out_y,
       float *dev_out_z)
{

       int3 dev_Size = make_int3(size[0], size[1], size[2]);
       float3 dev_Spacing = make_float3(spacing[0], spacing[1], spacing[2]);

       // Output volume
       long int outputMemorySize = size[0] * size[1] * size[2] * sizeof(float);
       float * dev_grad_x;
       float * dev_grad_y;
       float * dev_grad_z;
       cudaMalloc((void**)&dev_grad_x, outputMemorySize);
       cudaMalloc((void**)&dev_grad_y, outputMemorySize);
       cudaMalloc((void**)&dev_grad_z, outputMemorySize);
       cudaMemset(dev_grad_x, 2.f, outputMemorySize);
       cudaMemset(dev_grad_y, 3.f, outputMemorySize);
       cudaMemset(dev_grad_z, 4.f, outputMemorySize);

       // Thread Block Dimensions
       dim3 dimBlock = dim3(10, 10, 10);

       int blocksInX = iDivUp(size[0], dimBlock.x);
       int blocksInY = iDivUp(size[1], dimBlock.y);
       int blocksInZ = iDivUp(size[2], dimBlock.z);

       dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);

       gradient_kernel <<< dimGrid, dimBlock >>> (dev_in, dev_grad_x, dev_grad_y, dev_grad_z, dev_Size, dev_Spacing);
       CUDA_CHECK_ERROR;

       cudaMemcpy(dev_out_x, dev_grad_x, outputMemorySize, cudaMemcpyDeviceToHost);
       CUDA_CHECK_ERROR;
       cudaMemcpy(dev_out_y, dev_grad_y, outputMemorySize, cudaMemcpyDeviceToHost);
       CUDA_CHECK_ERROR;
       cudaMemcpy(dev_out_z, dev_grad_z, outputMemorySize, cudaMemcpyDeviceToHost);
       CUDA_CHECK_ERROR;
}

I am not sure if this violates the intended behavior of the ITKCudaCommon by explicitly copying the memory (that needed to be allocated before).

Still I cannot solve why the implicit memory copying mechanism do not work.
Have you look into this problem or do you miss any information?
With best regards,
Gordian

Von: Simon Rit [mailto:simon.rit at creatis.insa-lyon.fr<mailto:simon.rit at creatis.insa-lyon.fr>]
Gesendet: Mittwoch, 27. Februar 2019 21:58
An: Kabelitz, Gordian
Cc: rtk-users at public.kitware.com<mailto:rtk-users at public.kitware.com>
Betreff: Re: [Rtk-users] GPU kernel do not change output variables

Hi,
Sounds like a challenge. When you say you set fixed numbers, did you check that you reach the point where you set this number? You can use cuprintf to check what's going on in the kernel.
One thing wrong I noticed: you use size.Fill in a loop, which is a bit odd because it will Fill the size with the last value of the loop.
I hope this helps,
Simon

On Wed, Feb 27, 2019 at 9:39 PM Kabelitz, Gordian <Gordian.Kabelitz at medma.uni-heidelberg.de<mailto:Gordian.Kabelitz at medma.uni-heidelberg.de>> wrote:
Hi rtk-users,
I am facing an oddity which I cannot explain.
I want to implement a new gradient  filter. The input is an CudaImage<float,3>  and the output should be an CudaImage<CovariantVector<float,3>,3>. The filter runs without any cuda errors but the output (pout_(xyz)) is has not changed at all. The kernel function is accessed and the print out from there seems to be okay. I tried to explicitly copy the content of the GPUBuffer into the CPUBuffer. Still no success.  Even if I set fixed numbers in the kernel to the output image nothing changed. I use CUDA 9.0, Visual Studio 2015, ITK 5.0, RTK 2.0 as remote module, CMake 3.13., Windows 7 64bit. The relevant code snippets are below.
Do I miss something obvious? Any recommendation are welcome.
With kind regards,
Gordian

The GPUGenerateData function:
GPUGenerateData()
       {
              int inputSize[3];
              int outputSize[3];
              float inputSpacing[3];
              float outputSpacing[3];

              for (int i = 0; i<3; i++)
              {
                     inputSize[i] = this->GetInput()->GetBufferedRegion().GetSize()[i];
                     outputSize[i] = this->GetOutput()->GetBufferedRegion().GetSize()[i];
                     inputSpacing[i] = this->GetInput()->GetSpacing()[i];
                     outputSpacing[i] = this->GetOutput()->GetSpacing()[i];

                     if ((inputSize[i] != outputSize[i]) || (inputSpacing[i] != outputSpacing[i]))
                     {
                           std::cerr << "The CUDA laplacian filter can only handle input and output regions of equal size and spacing" << std::endl;
                           exit(1);
                     }
              }

              float *pin = *(float**)(this->GetInput()->GetCudaDataManager()->GetGPUBufferPointer());

              // This is a test area
typename InputImageType::IndexType index;
              index.Fill(0);
              typename InputImageType::SizeType size;
              for (auto i = 0; i < 3; ++i)
                     size.Fill(this->GetInput()->GetLargestPossibleRegion().GetSize()[i]);
              typename InputImageType::RegionType region(index, size);
              // images for gradients
              auto grad_x = CudaImage<float, 3>::New();
              grad_x->SetRegions(region);
              grad_x->Allocate();
              grad_x->FillBuffer(1);
              auto grad_y = CudaImage<float, 3>::New();
              grad_y->SetRegions(region);
              grad_y->Allocate();
              auto grad_z = CudaImage<float, 3>::New();
              grad_z->SetRegions(region);
              grad_z->Allocate();

              float *pout_x = *(float**)(grad_x->GetCudaDataManager()->GetGPUBufferPointer());
              float *pout_y = *(float**)(grad_y->GetCudaDataManager()->GetGPUBufferPointer());
              float *pout_z = *(float**)(grad_z->GetCudaDataManager()->GetGPUBufferPointer());

              CUDA_gradient(inputSize, inputSpacing, pin, pout_x, pout_y, pout_z); // after this line neither of the pout_(xyz) images have changed.

              // put the gradient images in a single covariant vector image
              auto CompositeImageFilter = itk::ComposeImageFilter<CudaImage<float, 3>, CudaImage<CovariantVector<float, 3>,3>>::New();
              CompositeImageFilter->SetInput1(grad_x);
              CompositeImageFilter->SetInput2(grad_y);
              CompositeImageFilter->SetInput3(grad_z);
              CompositeImageFilter->Update();

              this->GetOutput()->Graft(CompositeImageFilter->GetOutput());
       }

The cuda/kernel function

__global__ void gradient_kernel(float * in, float * grad_x, float * grad_y, float * grad_z, int3 c_Size, float3 c_Spacing);

void
CUDA_gradient(
       int size[3],
       float spacing[3],
       float *dev_in,
       float *dev_out_x,
       float *dev_out_y,
       float *dev_out_z)
{
       int3 dev_Size = make_int3(size[0], size[1], size[2]);
       float3 dev_Spacing = make_float3(spacing[0], spacing[1], spacing[2]);

       // Output volume
       long int outputMemorySize = size[0] * size[1] * size[2] * sizeof(float);
       cudaMalloc((void**)&dev_out_x, outputMemorySize);
       cudaMalloc((void**)&dev_out_y, outputMemorySize);
       cudaMalloc((void**)&dev_out_z, outputMemorySize);
       cudaMemset(dev_out_x, 0, outputMemorySize);
       cudaMemset(dev_out_y, 0, outputMemorySize);
       cudaMemset(dev_out_z, 0, outputMemorySize);
       printf("Device Variable Copying:\t%s\n", cudaGetErrorString(cudaGetLastError()));

       // Thread Block Dimensions
       dim3 dimBlock = dim3(16, 4, 4);

       int blocksInX = iDivUp(size[0], dimBlock.x);
       int blocksInY = iDivUp(size[1], dimBlock.y);
       int blocksInZ = iDivUp(size[2], dimBlock.z);

       dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);

       gradient_kernel <<< dimGrid, dimBlock >>> (dev_in, dev_out_x, dev_out_y, dev_out_z, dev_Size, dev_Spacing);
       cudaDeviceSynchronize();
       printf("Device Variable Copying:\t%s\n", cudaGetErrorString(cudaGetLastError()));
       CUDA_CHECK_ERROR;
}

__global__
void
gradient_kernel(float * in, float * grad_x, float * grad_y, float * grad_z, int3 c_Size, float3 c_Spacing)
{

       unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
       unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
       unsigned int k = blockIdx.z * blockDim.z + threadIdx.z;

       if (i >= c_Size.x || j >= c_Size.y || k >= c_Size.z)
              return;

       long int id = (k     * c_Size.y + j)    * c_Size.x + i;
       long int id_x = (k     * c_Size.y + j)    * c_Size.x + i + 1;
       long int id_y = (k     * c_Size.y + j + 1)* c_Size.x + i;
       long int id_z = ((k + 1) * c_Size.y + j)    * c_Size.x + i;

       if (i == (c_Size.x - 1)) grad_x[id] = 0;
       else grad_x[id] = (in[id_x] - in[id]) / c_Spacing.x;

       if (j == (c_Size.y - 1)) grad_y[id] = 0;
       else grad_y[id] = (in[id_y] - in[id]) / c_Spacing.y;

       if (k == (c_Size.z - 1)) grad_z[id] = 0;
       else grad_z[id] = (in[id_z] - in[id]) / c_Spacing.z;
}



_______________________________________________
Rtk-users mailing list
Rtk-users at public.kitware.com<mailto:Rtk-users at public.kitware.com>
https://public.kitware.com/mailman/listinfo/rtk-users
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://public.kitware.com/pipermail/rtk-users/attachments/20190305/0d1f3b36/attachment-0001.html>


More information about the Rtk-users mailing list