[Rtk-users] GPU kernel do not change output variables

Wed Feb 27 15:58:14 EST 2019

Hi,
Sounds like a challenge. When you say you set fixed numbers, did you check
that you reach the point where you set this number? You can use cuprintf to
check what's going on in the kernel.
One thing wrong I noticed: you use size.Fill in a loop, which is a bit odd
because it will Fill the size with the last value of the loop.
I hope this helps,
Simon

On Wed, Feb 27, 2019 at 9:39 PM Kabelitz, Gordian <
Gordian.Kabelitz at medma.uni-heidelberg.de> wrote:

> Hi rtk-users,
>
> I am facing an oddity which I cannot explain.
>
> I want to implement a new gradient  filter. The input is an
> CudaImage<float,3>  and the output should be an
> CudaImage<CovariantVector<float,3>,3>. The filter runs without any cuda
> errors but the output (pout_(xyz)) is has not changed at all. The kernel
> function is accessed and the print out from there seems to be okay. I tried
> to explicitly copy the content of the GPUBuffer into the CPUBuffer. Still
> no success.  Even if I set fixed numbers in the kernel to the output image
> nothing changed. I use CUDA 9.0, Visual Studio 2015, ITK 5.0, RTK 2.0 as
> remote module, CMake 3.13., Windows 7 64bit. The relevant code snippets are
> below.
>
> Do I miss something obvious? Any recommendation are welcome.
>
> With kind regards,
>
> Gordian
>
>
>
> The GPUGenerateData function:
>
> GPUGenerateData()
>
>        {
>
>               int inputSize[3];
>
>               int outputSize[3];
>
>               float inputSpacing[3];
>
>               float outputSpacing[3];
>
>
>
>               for (int i = 0; i<3; i++)
>
>               {
>
>                      inputSize[i] = this
> ->GetInput()->GetBufferedRegion().GetSize()[i];
>
>                      outputSize[i] = this
> ->GetOutput()->GetBufferedRegion().GetSize()[i];
>
>                      inputSpacing[i] = this->GetInput()->GetSpacing()[i];
>
>                      outputSpacing[i] = this
> ->GetOutput()->GetSpacing()[i];
>
>
>
>                      if ((inputSize[i] != outputSize[i]) ||
> (inputSpacing[i] != outputSpacing[i]))
>
>                      {
>
>                            std::cerr << "The CUDA laplacian filter can
> only handle input and output regions of equal size and spacing" << std::
> endl;
>
>                            exit(1);
>
>                      }
>
>               }
>
>
>
>               float *pin = *(float**)(this
> ->GetInput()->GetCudaDataManager()->GetGPUBufferPointer());
>
>
>
>               // This is a test area
>
> typename InputImageType::IndexType index;
>
>               index.Fill(0);
>
>               typename InputImageType::SizeType size;
>
>               for (auto i = 0; i < 3; ++i)
>
>                      size.Fill(this
> ->GetInput()->GetLargestPossibleRegion().GetSize()[i]);
>
>               typename InputImageType::RegionType region(index, size);
>
>               // images for gradients
>
>               auto grad_x = CudaImage<float, 3>::New();
>
>               grad_x->SetRegions(region);
>
>               grad_x->Allocate();
>
>               grad_x->FillBuffer(1);
>
>               auto grad_y = CudaImage<float, 3>::New();
>
>               grad_y->SetRegions(region);
>
>               grad_y->Allocate();
>
>               auto grad_z = CudaImage<float, 3>::New();
>
>               grad_z->SetRegions(region);
>
>               grad_z->Allocate();
>
>
>
>               float *pout_x = *(float**)(grad_x->GetCudaDataManager()->
> GetGPUBufferPointer());
>
>               float *pout_y = *(float**)(grad_y->GetCudaDataManager()->
> GetGPUBufferPointer());
>
>               float *pout_z = *(float**)(grad_z->GetCudaDataManager()->
> GetGPUBufferPointer());
>
>
>
>               CUDA_gradient(inputSize, inputSpacing, pin, pout_x, pout_y,
> pout_z); // after this line neither of the pout_(xyz) images have changed.
>
>
>
>               // put the gradient images in a single covariant vector image
>
>               auto CompositeImageFilter = itk::ComposeImageFilter<
> CudaImage<float, 3>, CudaImage<CovariantVector<float, 3>,3>>::New();
>
>               CompositeImageFilter->SetInput1(grad_x);
>
>               CompositeImageFilter->SetInput2(grad_y);
>
>               CompositeImageFilter->SetInput3(grad_z);
>
>               CompositeImageFilter->Update();
>
>
>
>               this->GetOutput()->Graft(CompositeImageFilter->GetOutput());
>
>        }
>
>
>
> The cuda/kernel function
>
>
>
> __global__ void gradient_kernel(float * in, float * grad_x, float * grad_y,
> float * grad_z, int3 c_Size, float3 c_Spacing);
>
>
>
> void
>
> CUDA_gradient(
>
>        int size[3],
>
>        float spacing[3],
>
>        float *dev_in,
>
>        float *dev_out_x,
>
>        float *dev_out_y,
>
>        float *dev_out_z)
>
> {
>
>        int3 dev_Size = make_int3(size[0], size[1], size[2]);
>
>        float3 dev_Spacing = make_float3(spacing[0], spacing[1], spacing
> [2]);
>
>
>
>        // Output volume
>
>        long int outputMemorySize = size[0] * size[1] * size[2] * sizeof(
> float);
>
>        cudaMalloc((void**)&dev_out_x, outputMemorySize);
>
>        cudaMalloc((void**)&dev_out_y, outputMemorySize);
>
>        cudaMalloc((void**)&dev_out_z, outputMemorySize);
>
>        cudaMemset(dev_out_x, 0, outputMemorySize);
>
>        cudaMemset(dev_out_y, 0, outputMemorySize);
>
>        cudaMemset(dev_out_z, 0, outputMemorySize);
>
>        printf("Device Variable Copying:\t%s\n", cudaGetErrorString(
> cudaGetLastError()));
>
>
>
>        // Thread Block Dimensions
>
>        dim3 dimBlock = dim3(16, 4, 4);
>
>
>
>        int blocksInX = iDivUp(size[0], dimBlock.x);
>
>        int blocksInY = iDivUp(size[1], dimBlock.y);
>
>        int blocksInZ = iDivUp(size[2], dimBlock.z);
>
>
>
>        dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
>
>
>
>        gradient_kernel <<< dimGrid, dimBlock >>> (dev_in, dev_out_x,
> dev_out_y, dev_out_z, dev_Size, dev_Spacing);
>
>        cudaDeviceSynchronize();
>
>        printf("Device Variable Copying:\t%s\n", cudaGetErrorString(
> cudaGetLastError()));
>
>        CUDA_CHECK_ERROR;
>
> }
>
>
>
> __global__
>
> void
>
> gradient_kernel(float * in, float * grad_x, float * grad_y, float * grad_z,
> int3 c_Size, float3 c_Spacing)
>
> {
>
>
>
>        unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
>
>        unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
>
>        unsigned int k = blockIdx.z * blockDim.z + threadIdx.z;
>
>
>
>        if (i >= c_Size.x || j >= c_Size.y || k >= c_Size.z)
>
>               return;
>
>
>
>        long int id = (k     * c_Size.y + j)    * c_Size.x + i;
>
>        long int id_x = (k     * c_Size.y + j)    * c_Size.x + i + 1;
>
>        long int id_y = (k     * c_Size.y + j + 1)* c_Size.x + i;
>
>        long int id_z = ((k + 1) * c_Size.y + j)    * c_Size.x + i;
>
>
>
>        if (i == (c_Size.x - 1)) grad_x[id] = 0;
>
>        else grad_x[id] = (in[id_x] - in[id]) / c_Spacing.x;
>
>
>
>        if (j == (c_Size.y - 1)) grad_y[id] = 0;
>
>        else grad_y[id] = (in[id_y] - in[id]) / c_Spacing.y;
>
>
>
>        if (k == (c_Size.z - 1)) grad_z[id] = 0;
>
>        else grad_z[id] = (in[id_z] - in[id]) / c_Spacing.z;
>
> }
>
>
>
>
>
>
> _______________________________________________
> Rtk-users mailing list
> Rtk-users at public.kitware.com
> https://public.kitware.com/mailman/listinfo/rtk-users
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://public.kitware.com/pipermail/rtk-users/attachments/20190227/68910c0c/attachment-0001.html>