
CuDNN Reduce Format Bug

  • McAngus  ·  asked 6 years ago

    I really don't like dumping this much code here, but I wanted it to be compilable. The program below demonstrates what looks like a bug in cuDNN (more likely a misunderstanding on my part).

    #include <vector>
    #include <cudnn.h>
    #include <cuda.h>
    #include <iostream>
    #include <sstream>
    
    #define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
    
    inline void gpuAssert(cudnnStatus_t code, const char *file, int line, bool abort=true)
    {
        if (code != CUDNN_STATUS_SUCCESS) 
        {
            std::stringstream ss;
            ss << "CuDNNassert: (" << code << ") " << cudnnGetErrorString(code) << " " << file << " " << line;
            std::cerr << ss.str() << std::endl;
            if (abort)
            {
                throw std::runtime_error(ss.str());
            }
        }
    }
    
    inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
    {
        if (code != cudaSuccess) 
        {
            std::stringstream ss;
            ss << "CUDAassert: (" << code << ") " << cudaGetErrorString(code) << " " << file << " " << line;
            std::cerr << ss.str() << std::endl;
            if (abort)
            {
                throw std::runtime_error(ss.str());
            }
        }
    }
    
    template<typename T>
    cudnnDataType_t getCudnnType()
    {
        if(std::is_same<T, float>::value)
            return CUDNN_DATA_FLOAT;
        else if(std::is_same<T, double>::value)
            return CUDNN_DATA_DOUBLE;
        else if(std::is_same<T, int>::value)
            return CUDNN_DATA_INT32;
        else if(std::is_same<T, char>::value)
            return CUDNN_DATA_INT8;
        else
            throw std::runtime_error("Cannot use any other type of");
    }
    
    template<typename T>
    void _reduce(cudnnHandle_t& cudnn, T* gpuA, T** gpuB,
        int n,    int h,    int w,    int c,
        int outN, int outH, int outW, int outC,
        cudnnReduceTensorOp_t reduceType, cudnnTensorFormat_t format)
    {
        gpuErrchk( cudaMalloc(gpuB, outN*outH*outW*outC*sizeof(T)) );
        gpuErrchk( cudaMemset(*gpuB, 0, outN*outH*outW*outC*sizeof(T)) );
    
        cudnnDataType_t dType = getCudnnType<T>();
    
        cudnnTensorDescriptor_t inputDescriptor;
        gpuErrchk( cudnnCreateTensorDescriptor(&inputDescriptor) );
        gpuErrchk( cudnnSetTensor4dDescriptor(inputDescriptor,
                                                format,
                                                dType,
                                                n, c, h, w) );
    
        cudnnTensorDescriptor_t outputDescriptor;
        gpuErrchk( cudnnCreateTensorDescriptor(&outputDescriptor) );
        gpuErrchk( cudnnSetTensor4dDescriptor(outputDescriptor,
                                                format,
                                                dType,
                                                outN, outC, outH, outW) );
    
        cudnnReduceTensorDescriptor_t reduceTensorDesc;
        gpuErrchk( cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) );
        gpuErrchk( cudnnSetReduceTensorDescriptor(reduceTensorDesc,
                                                    reduceType,
                                                    dType,
                                                    CUDNN_NOT_PROPAGATE_NAN,
                                                    CUDNN_REDUCE_TENSOR_NO_INDICES,
                                                    CUDNN_8BIT_INDICES) );
    
        size_t workspaceSize;
        gpuErrchk( cudnnGetReductionWorkspaceSize(cudnn,
                                                    reduceTensorDesc,
                                                    inputDescriptor,
                                                    outputDescriptor,
                                                    &workspaceSize) );
    
        size_t indicesSize;
        gpuErrchk( cudnnGetReductionIndicesSize(cudnn,
                                                    reduceTensorDesc,
                                                    inputDescriptor,
                                                    outputDescriptor,
                                                    &indicesSize) );
    
        float alpha = 1;
        float beta = 0;
    
        void* gpuWorkspace;
        gpuErrchk( cudaMalloc(&gpuWorkspace, workspaceSize) );
    
        void* gpuIndices;
        gpuErrchk( cudaMalloc(&gpuIndices, indicesSize) );
    
        gpuErrchk( cudnnReduceTensor(cudnn,
                                        reduceTensorDesc,
                                        gpuIndices, indicesSize,
                                        gpuWorkspace, workspaceSize,
                                        &alpha,
                                        inputDescriptor, gpuA,
                                        &beta,
                                        outputDescriptor, *gpuB) );
    
        gpuErrchk( cudaDeviceSynchronize() );
    
        gpuErrchk( cudnnDestroyReduceTensorDescriptor(reduceTensorDesc) );
        gpuErrchk( cudnnDestroyTensorDescriptor(inputDescriptor) );
        gpuErrchk( cudnnDestroyTensorDescriptor(outputDescriptor) );
    
        gpuErrchk( cudaFree(gpuIndices) );
        gpuErrchk( cudaFree(gpuWorkspace) );
    
    }
    
    int main(int argc, char **argv) {
        std::cout << "cudnn ver: " << CUDNN_MAJOR << "." << CUDNN_MINOR << "." << CUDNN_PATCHLEVEL << std::endl;
    
        cudnnHandle_t cudnn;
        gpuErrchk( cudnnCreate(&cudnn) );
    
        std::vector<float> in = {3,5,7,11,13,17,19,23,29,31};
        //NHWC: 3, 7,  13, 19, 29
        //      5, 11, 17, 23, 31
    
    //NCHW: 3,  5,  7,  11, 13
        //      17, 19, 23, 29, 31
    
        float* data_d;
        int n = 1, h = 1, w = 5, c = 2;
        size_t numElem = n*h*w*c;
        size_t arrSize = numElem*sizeof(float);
    
        //buffer to print results
        std::vector<float> cpuRes(5);
    
        gpuErrchk( cudaMalloc((void**) &data_d, arrSize) );
    
        gpuErrchk( cudaMemcpy(data_d, &in[0], arrSize, cudaMemcpyHostToDevice) );
    
        float* res_d;
    
        _reduce(cudnn, data_d, &res_d,
            n, h, w, c,
            1, 1, 5, 1, //reduce along channels
            CUDNN_REDUCE_TENSOR_ADD, CUDNN_TENSOR_NHWC); //use intended format
    
        gpuErrchk( cudaMemcpy(&cpuRes[0], res_d, 5*sizeof(float), cudaMemcpyDeviceToHost) );
    
        std::cout << "[";
        for(auto& v : cpuRes)
            std::cout << v << ",";
        std::cout << "]" << std::endl;
        //expected: [8,18,30,42,60,]
        //result: [20,24,30,40,44,]
    
        gpuErrchk( cudaFree(res_d) ); //next call will alloc again
    
        _reduce(cudnn, data_d, &res_d,
                n, h, w, c,
                1, 1, 5, 1, //reduce along channels
                CUDNN_REDUCE_TENSOR_ADD, CUDNN_TENSOR_NCHW); //use other format
    
    
        gpuErrchk( cudaMemcpy(&cpuRes[0], res_d, 5*sizeof(float), cudaMemcpyDeviceToHost) );
    
        std::cout << "[";
        for(auto& v : cpuRes)
            std::cout << v << ",";
        std::cout << "]" << std::endl;
        //expected: [20,24,30,40,44,]
        //result: [20,24,30,40,44,]
    
        gpuErrchk( cudaFree(res_d) );
        gpuErrchk( cudaFree(data_d) );
        gpuErrchk( cudnnDestroy(cudnn) );
    
        return 0;
    }
    

    In case you want to test this yourself, here is the cmake file I use to compile:

    cmake_minimum_required(VERSION 3.0)
    
    project(Main)
    
    find_package(OpenCV REQUIRED)
    find_package(CUDA REQUIRED)
    #find_package(CUDNN REQUIRED)
    
    set(CMAKE_CXX_FLAGS "--std=c++11 -Wall -fPIC -D_GLIBCXX_USE_CXX11_ABI=0 -D GOOGLE_CUDA=1")
    set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --default-stream per-thread" )
    set(CMAKE_BUILD_TYPE Debug)
    
    #pass flags to c++ compiler
    set(CUDA_PROPAGATE_HOST_FLAGS ON)
    
    set(MAIN_SRC
        "main.cu"
    )
    include_directories(${OpenCV_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS})
    
    cuda_add_executable(Main ${MAIN_SRC})
    target_link_libraries(Main ${OpenCV_LIBS} ${CUDA_LIBRARIES} cudnn stdc++fs)
    

    The output from the console is:

    cudnn ver: 7.3.1
    [20,24,30,40,44,]
    [20,24,30,40,44,]
    

    This is clearly the wrong output. When reducing along the same logical dimension, changing the dimension ordering should give different values, i.e. [8,18,30,42,60,] for the NHWC layout.
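
    As a quick cross-check (this snippet is an addition for illustration, not part of the program above), summing over the channel dimension of the NHWC-laid-out host vector reproduces exactly those expected values:

    // CPU reference for the NHWC reduction over channels (c = 2, w = 5):
    // element (w, c) lives at index w*2 + c, so each output is in[w*2] + in[w*2+1].
    for (int wi = 0; wi < 5; ++wi)
        std::cout << in[wi*2 + 0] + in[wi*2 + 1] << ",";  // prints 8,18,30,42,60,
    std::cout << std::endl;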

    Even using cudnnSetTensor4dDescriptorEx and setting the stride of each dimension explicitly, as below, the result does not change:

    int ns = c*w*h;
    int cs = 1;
    int hs = c*w;
    int ws = c;
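
    For reference, this is roughly how those strides plug into the descriptor call (a sketch using the same n, c, h, w variables as in _reduce; the posted code passes a cudnnTensorFormat_t instead of explicit strides):

    cudnnTensorDescriptor_t inputDescriptor;
    gpuErrchk( cudnnCreateTensorDescriptor(&inputDescriptor) );
    gpuErrchk( cudnnSetTensor4dDescriptorEx(inputDescriptor,
                                            CUDNN_DATA_FLOAT,
                                            n, c, h, w,
                                            ns, cs, hs, ws) );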
    

    Looking at the samples that ship with the cuDNN library download, they use cudnnSetTensorNdDescriptor rather than cudnnSetTensor4dDescriptor. However, the documentation for cudnnSetTensorNdDescriptor states:

    it is recommended that the user create a 4D tensor, and set the size along unused dimensions to 1.

    Given that you have to compute the strides yourself when using cudnnSetTensorNdDescriptor, cudnnSetTensor4dDescriptor seems preferable.
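
    Just for illustration (a sketch, not taken from the cuDNN samples), the Nd route for the same 1x2x1x5 tensor would look roughly like this, with the dims given in N, C, H, W order and the NHWC layout carried entirely by the hand-computed strides:

    cudnnTensorDescriptor_t ndDescriptor;
    gpuErrchk( cudnnCreateTensorDescriptor(&ndDescriptor) );

    int dimA[4]    = {1, 2, 1, 5};      // n, c, h, w
    int strideA[4] = {10, 1, 10, 2};    // same ns, cs, hs, ws as above

    gpuErrchk( cudnnSetTensorNdDescriptor(ndDescriptor,
                                          CUDNN_DATA_FLOAT,
                                          4, dimA, strideA) );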

    Is this a bug in cuDNN, or is there something wrong in my code that I'm just not seeing?

    1 Answer  |  6 years ago

  •   McAngus  ·  6 years ago

    From the documentation:

    C = alpha * reduce op ( A ) + beta * C

    and the requirement that the types of alpha and beta follow the data type of the tensors being reduced.

    The error is in these two lines of code:

    float alpha = 1;
    float beta = 0;
    

    It should be:

    T alpha = 1;
    T beta = 0;
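
    In context (a sketch of the relevant part of _reduce after the fix; everything else stays exactly as posted above):

    template<typename T>
    void _reduce(cudnnHandle_t& cudnn, T* gpuA, T** gpuB,
        int n,    int h,    int w,    int c,
        int outN, int outH, int outW, int outC,
        cudnnReduceTensorOp_t reduceType, cudnnTensorFormat_t format)
    {
        // ... descriptor and workspace setup unchanged ...
        T alpha = 1;  // scaling factors follow the tensor data type T
        T beta  = 0;
        // ... cudnnReduceTensor(..., &alpha, inputDescriptor, gpuA,
        //                            &beta,  outputDescriptor, *gpuB) as before ...
    }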