[Rtk-users] GPU kernel do not change output variables

Wed Feb 27 15:32:09 EST 2019

Hi rtk-users,
I am facing an oddity which I cannot explain.
I want to implement a new gradient  filter. The input is an CudaImage<float,3>  and the output should be an CudaImage<CovariantVector<float,3>,3>. The filter runs without any cuda errors but the output (pout_(xyz)) is has not changed at all. The kernel function is accessed and the print out from there seems to be okay. I tried to explicitly copy the content of the GPUBuffer into the CPUBuffer. Still no success.  Even if I set fixed numbers in the kernel to the output image nothing changed. I use CUDA 9.0, Visual Studio 2015, ITK 5.0, RTK 2.0 as remote module, CMake 3.13., Windows 7 64bit. The relevant code snippets are below.
Do I miss something obvious? Any recommendation are welcome.
With kind regards,
Gordian

The GPUGenerateData function:
GPUGenerateData()
       {
              int inputSize[3];
              int outputSize[3];
              float inputSpacing[3];
              float outputSpacing[3];

              for (int i = 0; i<3; i++)
              {
                     inputSize[i] = this->GetInput()->GetBufferedRegion().GetSize()[i];
                     outputSize[i] = this->GetOutput()->GetBufferedRegion().GetSize()[i];
                     inputSpacing[i] = this->GetInput()->GetSpacing()[i];
                     outputSpacing[i] = this->GetOutput()->GetSpacing()[i];

                     if ((inputSize[i] != outputSize[i]) || (inputSpacing[i] != outputSpacing[i]))
                     {
                           std::cerr << "The CUDA laplacian filter can only handle input and output regions of equal size and spacing" << std::endl;
                           exit(1);
                     }
              }

              float *pin = *(float**)(this->GetInput()->GetCudaDataManager()->GetGPUBufferPointer());

              // This is a test area
typename InputImageType::IndexType index;
              index.Fill(0);
              typename InputImageType::SizeType size;
              for (auto i = 0; i < 3; ++i)
                     size.Fill(this->GetInput()->GetLargestPossibleRegion().GetSize()[i]);
              typename InputImageType::RegionType region(index, size);
              // images for gradients
              auto grad_x = CudaImage<float, 3>::New();
              grad_x->SetRegions(region);
              grad_x->Allocate();
              grad_x->FillBuffer(1);
              auto grad_y = CudaImage<float, 3>::New();
              grad_y->SetRegions(region);
              grad_y->Allocate();
              auto grad_z = CudaImage<float, 3>::New();
              grad_z->SetRegions(region);
              grad_z->Allocate();

              float *pout_x = *(float**)(grad_x->GetCudaDataManager()->GetGPUBufferPointer());
              float *pout_y = *(float**)(grad_y->GetCudaDataManager()->GetGPUBufferPointer());
              float *pout_z = *(float**)(grad_z->GetCudaDataManager()->GetGPUBufferPointer());

              CUDA_gradient(inputSize, inputSpacing, pin, pout_x, pout_y, pout_z); // after this line neither of the pout_(xyz) images have changed.

              // put the gradient images in a single covariant vector image
              auto CompositeImageFilter = itk::ComposeImageFilter<CudaImage<float, 3>, CudaImage<CovariantVector<float, 3>,3>>::New();
              CompositeImageFilter->SetInput1(grad_x);
              CompositeImageFilter->SetInput2(grad_y);
              CompositeImageFilter->SetInput3(grad_z);
              CompositeImageFilter->Update();

              this->GetOutput()->Graft(CompositeImageFilter->GetOutput());
       }

The cuda/kernel function

__global__ void gradient_kernel(float * in, float * grad_x, float * grad_y, float * grad_z, int3 c_Size, float3 c_Spacing);

void
CUDA_gradient(
       int size[3],
       float spacing[3],
       float *dev_in,
       float *dev_out_x,
       float *dev_out_y,
       float *dev_out_z)
{
       int3 dev_Size = make_int3(size[0], size[1], size[2]);
       float3 dev_Spacing = make_float3(spacing[0], spacing[1], spacing[2]);

       // Output volume
       long int outputMemorySize = size[0] * size[1] * size[2] * sizeof(float);
       cudaMalloc((void**)&dev_out_x, outputMemorySize);
       cudaMalloc((void**)&dev_out_y, outputMemorySize);
       cudaMalloc((void**)&dev_out_z, outputMemorySize);
       cudaMemset(dev_out_x, 0, outputMemorySize);
       cudaMemset(dev_out_y, 0, outputMemorySize);
       cudaMemset(dev_out_z, 0, outputMemorySize);
       printf("Device Variable Copying:\t%s\n", cudaGetErrorString(cudaGetLastError()));

       // Thread Block Dimensions
       dim3 dimBlock = dim3(16, 4, 4);

       int blocksInX = iDivUp(size[0], dimBlock.x);
       int blocksInY = iDivUp(size[1], dimBlock.y);
       int blocksInZ = iDivUp(size[2], dimBlock.z);

       dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);

       gradient_kernel <<< dimGrid, dimBlock >>> (dev_in, dev_out_x, dev_out_y, dev_out_z, dev_Size, dev_Spacing);
       cudaDeviceSynchronize();
       printf("Device Variable Copying:\t%s\n", cudaGetErrorString(cudaGetLastError()));
       CUDA_CHECK_ERROR;
}

__global__
void
gradient_kernel(float * in, float * grad_x, float * grad_y, float * grad_z, int3 c_Size, float3 c_Spacing)
{

       unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
       unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
       unsigned int k = blockIdx.z * blockDim.z + threadIdx.z;

       if (i >= c_Size.x || j >= c_Size.y || k >= c_Size.z)
              return;

       long int id = (k     * c_Size.y + j)    * c_Size.x + i;
       long int id_x = (k     * c_Size.y + j)    * c_Size.x + i + 1;
       long int id_y = (k     * c_Size.y + j + 1)* c_Size.x + i;
       long int id_z = ((k + 1) * c_Size.y + j)    * c_Size.x + i;

       if (i == (c_Size.x - 1)) grad_x[id] = 0;
       else grad_x[id] = (in[id_x] - in[id]) / c_Spacing.x;

       if (j == (c_Size.y - 1)) grad_y[id] = 0;
       else grad_y[id] = (in[id_y] - in[id]) / c_Spacing.y;

       if (k == (c_Size.z - 1)) grad_z[id] = 0;
       else grad_z[id] = (in[id_z] - in[id]) / c_Spacing.z;
}

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://public.kitware.com/pipermail/rtk-users/attachments/20190227/00256c8b/attachment-0001.html>