代码之家  ›  专栏  ›  技术社区  ›  Iter Ator

如何在设备上运行推力::count\u?(Cuda)

  •  0
  • Iter Ator  · 技术社区  · 7 年前

    我想实施RANSAC。我生成60k个点和500个平面,我想计算每个平面,它们附近有多少个点。然后选择具有最大值的一个。

    在我生成向量之后( d_vec )还有飞机( d_pl )并将其传输到GPU,我使用 thrust::transform 在那里面 thrust:count_if 计算闭合点的数量。

    很遗憾,我遇到了以下错误:

    1>D:\Projects\cuda\CudaTest\CudaTest>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\bin\nvcc.exe" -gencode=arch=compute_30,code=\"sm_30,compute_30\" --use-local-env --cl-version 2015 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\bin\x86_amd64" -x cu  -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\include" -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\include"     --keep-dir x64\Release -maxrregcount=0  --machine 64 --compile -cudart static     -DWIN32 -DWIN64 -DNDEBUG -D_CONSOLE -D_MBCS -Xcompiler "/EHsc /W3 /nologo /O2 /FS /Zi  /MD " -o x64\Release\kernel.cu.obj "D:\Projects\cuda\CudaTest\CudaTest\kernel.cu"
    1>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\include\thrust/detail/type_traits/pointer_traits.h(201): error : calling a __host__ function("thrust::detail::vector_base< ::Vec3,  ::thrust::device_malloc_allocator< ::Vec3> > ::begin") from a __device__ function("thrust::cuda_cub::__transform::unary_transform_f< ::thrust::detail::normal_iterator< ::thrust::device_ptr< ::Plane> > ,  ::thrust::detail::normal_iterator< ::thrust::device_ptr<int> > ,  ::thrust::cuda_cub::__transform::no_stencil_tag,  ::plane_functor,  ::thrust::cuda_cub::__transform::always_true_predicate> ::operator ()<long long> ") is not allowed
    1>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0\include\thrust/detail/type_traits/pointer_traits.h(201): error : identifier "thrust::detail::vector_base< ::Vec3,  ::thrust::device_malloc_allocator< ::Vec3> > ::begin" is undefined in device code
    1>D:/Projects/cuda/CudaTest/CudaTest/kernel.cu(84): error : calling a __host__ function("thrust::detail::vector_base< ::Vec3,  ::thrust::device_malloc_allocator< ::Vec3> > ::end") from a __device__ function("thrust::cuda_cub::__transform::unary_transform_f< ::thrust::detail::normal_iterator< ::thrust::device_ptr< ::Plane> > ,  ::thrust::detail::normal_iterator< ::thrust::device_ptr<int> > ,  ::thrust::cuda_cub::__transform::no_stencil_tag,  ::plane_functor,  ::thrust::cuda_cub::__transform::always_true_predicate> ::operator ()<long long> ") is not allowed
    1>D:/Projects/cuda/CudaTest/CudaTest/kernel.cu(84): error : identifier "thrust::detail::vector_base< ::Vec3,  ::thrust::device_malloc_allocator< ::Vec3> > ::end" is undefined in device code
    

    如何从设备代码调用推力::count\u?我做错了什么? 以下是完整代码:

    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <thrust/generate.h>
    #include <thrust/sort.h>
    #include <thrust/copy.h>
    #include <thrust/execution_policy.h>
    #include <algorithm>
    #include <iostream>
    #include <cstdlib>
    #include <time.h>
    #include <thrust/count.h>
    #include <thrust/extrema.h>
    
    struct Vec3 {
        float x;
        float y;
        float z;
    
    
        friend std::ostream& operator<<(std::ostream& os, const Vec3& dt);
    };
    
    std::ostream& operator<<(std::ostream& os, const Vec3& dt)
    {
        os << dt.x << ", " << dt.y << ", " << dt.z;
        return os;
    }
    
    struct Plane {
    
        float a;
        float b;
        float c;
        float d;
    
        // https://keisan.casio.com/exec/system/1223596129
        static Plane FromPoints(Vec3 A, Vec3 B, Vec3 C) {
            Plane ret;
    
            ret.a = (B.y - A.y)*(C.z - A.z) - (C.y - A.y)*(B.z - A.z);
            ret.b = (B.z - A.z)*(C.x - A.x) - (C.z - A.z)*(B.x - A.x);
            ret.c = (B.x - A.x)*(C.y - A.y) - (C.x - A.x)*(B.y - A.y);
    
            ret.d = -(ret.a*A.x + ret.b*A.y + ret.c*A.z);
    
            return ret;
    
        }
    
    };
    
    Vec3 generator() {
        return {
            float(rand()) / float(RAND_MAX) * 1000.f,
            float(rand()) / float(RAND_MAX) * 1000.f,
            float(rand()) / float(RAND_MAX) * 1000.f
        };
    }
    
    int index_generator() {
        return rand() % 69632;
    }
    
    struct plane_distance {
    
        const Plane pl;
    
        __device__ plane_distance(const Plane pl) : pl(pl) {}
    
        __device__ bool operator()(const Vec3& vv) const {
            return fabsf(pl.a*vv.x + pl.b*vv.y + pl.c*vv.z + pl.d) / sqrtf(pl.a*pl.a + pl.b*pl.b + pl.c*pl.c) > 0.128f;
        }
    
    };
    
    struct plane_functor
    {
        thrust::device_vector<Vec3>& d_vec;
    
        plane_functor(thrust::device_vector<Vec3>& d_vec) : d_vec(d_vec) {}
    
        __device__ int operator()(const Plane& pl) const {
    
            return thrust::count_if(thrust::device, d_vec.begin(), d_vec.end(), plane_distance(pl));
    
        }
    };
    
    int main(void)
    {
    
    
        // Generate random points for testing
    
        std::cout << "Generating..." << std::endl;
    
        // generate random vectors serially
        thrust::host_vector<Vec3> h_vec(65536);
        std::generate(h_vec.begin(), h_vec.end(), generator);
    
        // Generate random planes
        thrust::host_vector<Plane> h_pl(512);
        std::generate(h_pl.begin(), h_pl.end(), [&h_vec]() {
    
            return Plane::FromPoints(
                h_vec[index_generator()],
                h_vec[index_generator()],
                h_vec[index_generator()]
            );
    
        });
    
        std::cout << "Transfer" << std::endl;
    
        // transfer data to the device
        thrust::device_vector<Vec3> d_vec = h_vec;
        thrust::device_vector<Plane> d_pl = h_pl;
        thrust::device_vector<int> counts(512);
    
        std::cout << "Searching" << std::endl;
    
        thrust::transform(thrust::device, d_pl.begin(), d_pl.end(), counts.begin(), plane_functor(d_vec));
    
        auto result = thrust::max_element(thrust::device, counts.begin(), counts.end());
    
        std::cout << "Press any key to exit" << std::endl;
        std::cin.get();
    
        return 0;
    }
    
    1 回复  |  直到 7 年前
        1
  •  2
  •   talonmies    7 年前

    正如评论中所建议的,访问是非法的 device_vector 在设备代码中。在撰写本文时可用的所有推力版本中,它们(尽管名称不同)都是主机端抽象。之所以会出现错误,是因为functor正在设备代码中调用device\u向量的复制构造,这需要构造新的容器,这将调用内存分配,并且无法编译。

    应该 可以使用原始设备指针来实现这一点,例如:

    struct plane_functor
    {
        Vec3* d_vec0;
        Vec3* d_vec1;
    
        __host__ __device__ plane_functor(Vec3* d_vec0, Vec3* d_vec1) : d_vec0(d_vec0), d_vec1(d_vec1) {}
    
        __device__ int operator()(const Plane& pl) const {
    
            return thrust::count_if(thrust::device, d_vec0, d_vec1, plane_distance(pl));
    
        }
    };
    
    // ....
    
    Vec3* d_vec0 = thrust::raw_pointer_cast(d_vec.data());
    Vec3* d_vec1 = d_vec0 + (d_vec.end() - d_vec.begin());
    thrust::transform(d_pl.begin(), d_pl.end(), counts.begin(), plane_functor( d_vec0, d_vec1 ) );
    

    请注意,虽然这可以为我编译,但我无法运行您的代码,因为当我尝试运行时,主机端初始化lambda会崩溃。还要密切注意标记和基于策略的执行的混合。这个 thrust::transform 正如所写的,调用即使有有效的functor也会失败,因为 device\u向量 迭代器和 thrust::device