[Rtk-users] GPU kernel do not change output variables
Simon Rit
simon.rit at creatis.insa-lyon.fr
Mon Mar 4 07:15:52 EST 2019
Hi,
I think the problem is that you're doing the memory allocation yourself.
The cudaMalloc is automatically done by the data manager when you require
the buffer pointer (GetCudaDataManager()->GetGPUBufferPointer()), you don't
have to do it. Try removing the cudaMalloc calls.
Cheers,
Simon
On Sun, Mar 3, 2019 at 5:38 PM Kabelitz, Gordian <
Gordian.Kabelitz at medma.uni-heidelberg.de> wrote:
> Hi Simon,
>
>
>
> I made a workaround by including an explicit copy from device to host
> after the kernel is called. The function now looks like this:
>
>
>
> void
>
> CUDA_gradient(
>
> int size[3],
>
> float spacing[3],
>
> float *dev_in,
>
> float *dev_out_x,
>
> float *dev_out_y,
>
> float *dev_out_z)
>
> {
>
>
>
> int3 dev_Size = make_int3(size[0], size[1], size[2]);
>
> float3 dev_Spacing = make_float3(spacing[0], spacing[1], spacing
> [2]);
>
>
>
> // Output volume
>
> long int outputMemorySize = size[0] * size[1] * size[2] * sizeof(
> float);
>
> float * dev_grad_x;
>
> float * dev_grad_y;
>
> float * dev_grad_z;
>
> cudaMalloc((void**)&dev_grad_x, outputMemorySize);
>
> cudaMalloc((void**)&dev_grad_y, outputMemorySize);
>
> cudaMalloc((void**)&dev_grad_z, outputMemorySize);
>
> cudaMemset(dev_grad_x, 2.f, outputMemorySize);
>
> cudaMemset(dev_grad_y, 3.f, outputMemorySize);
>
> cudaMemset(dev_grad_z, 4.f, outputMemorySize);
>
>
>
> // Thread Block Dimensions
>
> dim3 dimBlock = dim3(10, 10, 10);
>
>
>
> int blocksInX = iDivUp(size[0], dimBlock.x);
>
> int blocksInY = iDivUp(size[1], dimBlock.y);
>
> int blocksInZ = iDivUp(size[2], dimBlock.z);
>
>
>
> dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
>
>
>
> gradient_kernel <<< dimGrid, dimBlock >>> (dev_in, dev_grad_x,
> dev_grad_y, dev_grad_z, dev_Size, dev_Spacing);
>
> CUDA_CHECK_ERROR;
>
>
>
> cudaMemcpy(dev_out_x, dev_grad_x, outputMemorySize,
> cudaMemcpyDeviceToHost);
>
> CUDA_CHECK_ERROR;
>
> cudaMemcpy(dev_out_y, dev_grad_y, outputMemorySize,
> cudaMemcpyDeviceToHost);
>
> CUDA_CHECK_ERROR;
>
> cudaMemcpy(dev_out_z, dev_grad_z, outputMemorySize,
> cudaMemcpyDeviceToHost);
>
> CUDA_CHECK_ERROR;
>
> }
>
>
>
> I am not sure if this violates the intended behavior of the ITKCudaCommon
> by explicitly copying the memory (that needed to be allocated before).
>
>
>
> Still, I cannot work out why the implicit memory copying mechanism does not work.
>
> Have you looked into this problem, or do you need any more information from me?
>
> With best regards,
>
> Gordian
>
>
>
> *Von:* Simon Rit [mailto:simon.rit at creatis.insa-lyon.fr]
> *Gesendet:* Mittwoch, 27. Februar 2019 21:58
> *An:* Kabelitz, Gordian
> *Cc:* rtk-users at public.kitware.com
> *Betreff:* Re: [Rtk-users] GPU kernel do not change output variables
>
>
>
> Hi,
>
> Sounds like a challenge. When you say you set fixed numbers, did you check
> that you reach the point where you set this number? You can use cuprintf to
> check what's going on in the kernel.
>
> One thing wrong I noticed: you use size.Fill in a loop, which is a bit odd
> because it will Fill the size with the last value of the loop.
>
> I hope this helps,
>
> Simon
>
>
>
> On Wed, Feb 27, 2019 at 9:39 PM Kabelitz, Gordian <
> Gordian.Kabelitz at medma.uni-heidelberg.de> wrote:
>
> Hi rtk-users,
>
> I am facing an oddity which I cannot explain.
>
> I want to implement a new gradient filter. The input is an
> CudaImage<float,3> and the output should be an
> CudaImage<CovariantVector<float,3>,3>. The filter runs without any cuda
> errors but the output (pout_(xyz)) has not changed at all. The kernel
> function is accessed and the print out from there seems to be okay. I tried
> to explicitly copy the content of the GPUBuffer into the CPUBuffer. Still
> no success. Even if I set fixed numbers in the kernel to the output image
> nothing changed. I use CUDA 9.0, Visual Studio 2015, ITK 5.0, RTK 2.0 as
> remote module, CMake 3.13, Windows 7 64bit. The relevant code snippets are
> below.
>
> Am I missing something obvious? Any recommendations are welcome.
>
> With kind regards,
>
> Gordian
>
>
>
> The GPUGenerateData function:
>
> GPUGenerateData()
>
> {
>
> int inputSize[3];
>
> int outputSize[3];
>
> float inputSpacing[3];
>
> float outputSpacing[3];
>
>
>
> for (int i = 0; i<3; i++)
>
> {
>
> inputSize[i] = this
> ->GetInput()->GetBufferedRegion().GetSize()[i];
>
> outputSize[i] = this
> ->GetOutput()->GetBufferedRegion().GetSize()[i];
>
> inputSpacing[i] = this->GetInput()->GetSpacing()[i];
>
> outputSpacing[i] = this
> ->GetOutput()->GetSpacing()[i];
>
>
>
> if ((inputSize[i] != outputSize[i]) ||
> (inputSpacing[i] != outputSpacing[i]))
>
> {
>
> std::cerr << "The CUDA laplacian filter can
> only handle input and output regions of equal size and spacing" << std::
> endl;
>
> exit(1);
>
> }
>
> }
>
>
>
> float *pin = *(float**)(this
> ->GetInput()->GetCudaDataManager()->GetGPUBufferPointer());
>
>
>
> // This is a test area
>
> typename InputImageType::IndexType index;
>
> index.Fill(0);
>
> typename InputImageType::SizeType size;
>
> for (auto i = 0; i < 3; ++i)
>
> size.Fill(this
> ->GetInput()->GetLargestPossibleRegion().GetSize()[i]);
>
> typename InputImageType::RegionType region(index, size);
>
> // images for gradients
>
> auto grad_x = CudaImage<float, 3>::New();
>
> grad_x->SetRegions(region);
>
> grad_x->Allocate();
>
> grad_x->FillBuffer(1);
>
> auto grad_y = CudaImage<float, 3>::New();
>
> grad_y->SetRegions(region);
>
> grad_y->Allocate();
>
> auto grad_z = CudaImage<float, 3>::New();
>
> grad_z->SetRegions(region);
>
> grad_z->Allocate();
>
>
>
> float *pout_x = *(float**)(grad_x->GetCudaDataManager()->
> GetGPUBufferPointer());
>
> float *pout_y = *(float**)(grad_y->GetCudaDataManager()->
> GetGPUBufferPointer());
>
> float *pout_z = *(float**)(grad_z->GetCudaDataManager()->
> GetGPUBufferPointer());
>
>
>
> CUDA_gradient(inputSize, inputSpacing, pin, pout_x, pout_y,
> pout_z); // after this line neither of the pout_(xyz) images have changed.
>
>
>
> // put the gradient images in a single covariant vector image
>
> auto CompositeImageFilter = itk::ComposeImageFilter<
> CudaImage<float, 3>, CudaImage<CovariantVector<float, 3>,3>>::New();
>
> CompositeImageFilter->SetInput1(grad_x);
>
> CompositeImageFilter->SetInput2(grad_y);
>
> CompositeImageFilter->SetInput3(grad_z);
>
> CompositeImageFilter->Update();
>
>
>
> this->GetOutput()->Graft(CompositeImageFilter->GetOutput());
>
> }
>
>
>
> The cuda/kernel function
>
>
>
> __global__ void gradient_kernel(float * in, float * grad_x, float * grad_y,
> float * grad_z, int3 c_Size, float3 c_Spacing);
>
>
>
> void
>
> CUDA_gradient(
>
> int size[3],
>
> float spacing[3],
>
> float *dev_in,
>
> float *dev_out_x,
>
> float *dev_out_y,
>
> float *dev_out_z)
>
> {
>
> int3 dev_Size = make_int3(size[0], size[1], size[2]);
>
> float3 dev_Spacing = make_float3(spacing[0], spacing[1], spacing
> [2]);
>
>
>
> // Output volume
>
> long int outputMemorySize = size[0] * size[1] * size[2] * sizeof(
> float);
>
> cudaMalloc((void**)&dev_out_x, outputMemorySize);
>
> cudaMalloc((void**)&dev_out_y, outputMemorySize);
>
> cudaMalloc((void**)&dev_out_z, outputMemorySize);
>
> cudaMemset(dev_out_x, 0, outputMemorySize);
>
> cudaMemset(dev_out_y, 0, outputMemorySize);
>
> cudaMemset(dev_out_z, 0, outputMemorySize);
>
> printf("Device Variable Copying:\t%s\n", cudaGetErrorString(
> cudaGetLastError()));
>
>
>
> // Thread Block Dimensions
>
> dim3 dimBlock = dim3(16, 4, 4);
>
>
>
> int blocksInX = iDivUp(size[0], dimBlock.x);
>
> int blocksInY = iDivUp(size[1], dimBlock.y);
>
> int blocksInZ = iDivUp(size[2], dimBlock.z);
>
>
>
> dim3 dimGrid = dim3(blocksInX, blocksInY, blocksInZ);
>
>
>
> gradient_kernel <<< dimGrid, dimBlock >>> (dev_in, dev_out_x,
> dev_out_y, dev_out_z, dev_Size, dev_Spacing);
>
> cudaDeviceSynchronize();
>
> printf("Device Variable Copying:\t%s\n", cudaGetErrorString(
> cudaGetLastError()));
>
> CUDA_CHECK_ERROR;
>
> }
>
>
>
> __global__
>
> void
>
> gradient_kernel(float * in, float * grad_x, float * grad_y, float * grad_z,
> int3 c_Size, float3 c_Spacing)
>
> {
>
>
>
> unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
>
> unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
>
> unsigned int k = blockIdx.z * blockDim.z + threadIdx.z;
>
>
>
> if (i >= c_Size.x || j >= c_Size.y || k >= c_Size.z)
>
> return;
>
>
>
> long int id = (k * c_Size.y + j) * c_Size.x + i;
>
> long int id_x = (k * c_Size.y + j) * c_Size.x + i + 1;
>
> long int id_y = (k * c_Size.y + j + 1)* c_Size.x + i;
>
> long int id_z = ((k + 1) * c_Size.y + j) * c_Size.x + i;
>
>
>
> if (i == (c_Size.x - 1)) grad_x[id] = 0;
>
> else grad_x[id] = (in[id_x] - in[id]) / c_Spacing.x;
>
>
>
> if (j == (c_Size.y - 1)) grad_y[id] = 0;
>
> else grad_y[id] = (in[id_y] - in[id]) / c_Spacing.y;
>
>
>
> if (k == (c_Size.z - 1)) grad_z[id] = 0;
>
> else grad_z[id] = (in[id_z] - in[id]) / c_Spacing.z;
>
> }
>
>
>
>
>
>
>
> _______________________________________________
> Rtk-users mailing list
> Rtk-users at public.kitware.com
> https://public.kitware.com/mailman/listinfo/rtk-users
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://public.kitware.com/pipermail/rtk-users/attachments/20190304/5192184d/attachment-0001.html>
More information about the Rtk-users
mailing list