diff --git a/src/detect.c b/src/detect.c index 22f0953040..c232c0792e 100644 --- a/src/detect.c +++ b/src/detect.c @@ -4432,8 +4432,8 @@ int SigGroupBuild(DetectEngineCtx *de_ctx) } #ifdef __SC_CUDA_SUPPORT__ - unsigned int cuda_total = 0; - unsigned int cuda_free_before_alloc = 0; + size_t cuda_total = 0; + size_t cuda_free_before_alloc = 0; /* we register a module that would require cuda handler service. This * module would hold the context for all the patterns in the rules */ de_ctx->cuda_rc_mod_handle = SCCudaHlRegisterModule("SC_RULES_CONTENT_B2G_CUDA"); @@ -4466,7 +4466,7 @@ int SigGroupBuild(DetectEngineCtx *de_ctx) } #ifdef __SC_CUDA_SUPPORT__ - unsigned int cuda_free_after_alloc = 0; + size_t cuda_free_after_alloc = 0; /* if a user has selected some other mpm algo other than b2g_cuda, inspite of * enabling cuda support, then no cuda contexts or cuda vars would be created. * Pop the cuda context, only on confirming that the MPM algo selected is the diff --git a/src/util-cuda.c b/src/util-cuda.c index d224f1316f..80eb4788d4 100644 --- a/src/util-cuda.c +++ b/src/util-cuda.c @@ -20,7 +20,7 @@ * * \author Anoop Saldanha * - * NVIDIA CUDA utility functions + * NVIDIA CUDA utility functions - last referenced Cuda Toolkit 4.2 */ /* compile in, only if we have a CUDA enabled device on the machine, with the @@ -40,84 +40,87 @@ typedef enum SCCudaAPIS_ { /* init api */ SC_CUDA_CU_INIT, + + /* version management api */ + SC_CUDA_CU_DRIVER_GET_VERSION, + /* device management api */ - SC_CUDA_CU_DEVICE_GET_COUNT, + SC_CUDA_CU_DEVICE_COMPUTE_CAPABILITY, SC_CUDA_CU_DEVICE_GET, + SC_CUDA_CU_DEVICE_GET_ATTRIBUTE, + SC_CUDA_CU_DEVICE_GET_COUNT, SC_CUDA_CU_DEVICE_GET_NAME, - SC_CUDA_CU_DEVICE_TOTAL_MEM, - SC_CUDA_CU_DEVICE_COMPUTE_CAPABILITY, SC_CUDA_CU_DEVICE_GET_PROPERTIES, - SC_CUDA_CU_DEVICE_GET_ATTRIBUTE, - /* version management api */ - SC_CUDA_CU_DRIVER_GET_VERSION, + SC_CUDA_CU_DEVICE_TOTAL_MEM, + /* context management api */ - SC_CUDA_CU_CTX_ATTACH, SC_CUDA_CU_CTX_CREATE, SC_CUDA_CU_CTX_DESTROY, - SC_CUDA_CU_CTX_DETACH, + SC_CUDA_CU_CTX_GET_API_VERSION, + SC_CUDA_CU_CTX_GET_CACHE_CONFIG, + SC_CUDA_CU_CTX_GET_CURRENT, SC_CUDA_CU_CTX_GET_DEVICE, + SC_CUDA_CU_CTX_GET_LIMIT, SC_CUDA_CU_CTX_POP_CURRENT, SC_CUDA_CU_CTX_PUSH_CURRENT, + SC_CUDA_CU_CTX_SET_CACHE_CONFIG, + SC_CUDA_CU_CTX_SET_CURRENT, + SC_CUDA_CU_CTX_SET_LIMIT, SC_CUDA_CU_CTX_SYNCHRONIZE, + SC_CUDA_CU_CTX_ATTACH, + SC_CUDA_CU_CTX_DETACH, + /* module management api */ SC_CUDA_CU_MODULE_GET_FUNCTION, SC_CUDA_CU_MODULE_GET_GLOBAL, + SC_CUDA_CU_MODULE_GET_SURF_REF, SC_CUDA_CU_MODULE_GET_TEX_REF, SC_CUDA_CU_MODULE_LOAD, SC_CUDA_CU_MODULE_LOAD_DATA, SC_CUDA_CU_MODULE_LOAD_DATA_EX, SC_CUDA_CU_MODULE_LOAD_FAT_BINARY, SC_CUDA_CU_MODULE_UNLOAD, - /* stream management api */ - SC_CUDA_CU_STREAM_CREATE, - SC_CUDA_CU_STREAM_DESTROY, - SC_CUDA_CU_STREAM_QUERY, - SC_CUDA_CU_STREAM_SYNCHRONIZE, - /* event management api */ - SC_CUDA_CU_EVENT_CREATE, - SC_CUDA_CU_EVENT_DESTROY, - SC_CUDA_CU_EVENT_ELAPSED_TIME, - SC_CUDA_CU_EVENT_QUERY, - SC_CUDA_CU_EVENT_RECORD, - SC_CUDA_CU_EVENT_SYNCHRONIZE, - /* execution control api */ - SC_CUDA_CU_FUNC_GET_ATTRIBUTE, - SC_CUDA_CU_FUNC_SET_BLOCK_SHAPE, - SC_CUDA_CU_FUNC_SET_SHARED_SIZE, - SC_CUDA_CU_LAUNCH, - SC_CUDA_CU_LAUNCH_GRID, - SC_CUDA_CU_LAUNCH_GRID_ASYNC, - SC_CUDA_CU_PARAM_SETF, - SC_CUDA_CU_PARAM_SETI, - SC_CUDA_CU_PARAM_SET_SIZE, - SC_CUDA_CU_PARAM_SET_TEX_REF, - SC_CUDA_CU_PARAM_SETV, + /* memory management api */ SC_CUDA_CU_ARRAY_3D_CREATE, SC_CUDA_CU_ARRAY_3D_GET_DESCRIPTOR, SC_CUDA_CU_ARRAY_CREATE, SC_CUDA_CU_ARRAY_DESTROY, SC_CUDA_CU_ARRAY_GET_DESCRIPTOR, + SC_CUDA_CU_DEVICE_GET_BY_PCI_BUS_ID, + SC_CUDA_CU_DEVICE_GET_PCI_BUS_ID, + SC_CUDA_CU_IPC_CLOSE_MEM_HANDLE, + SC_CUDA_CU_IPC_GET_EVENT_HANDLE, + SC_CUDA_CU_IPC_GET_MEM_HANDLE, + SC_CUDA_CU_IPC_OPEN_EVENT_HANDLE, + SC_CUDA_CU_IPC_OPEN_MEM_HANDLE, SC_CUDA_CU_MEM_ALLOC, SC_CUDA_CU_MEM_ALLOC_HOST, SC_CUDA_CU_MEM_ALLOC_PITCH, + SC_CUDA_CU_MEMCPY, SC_CUDA_CU_MEMCPY_2D, SC_CUDA_CU_MEMCPY_2D_ASYNC, SC_CUDA_CU_MEMCPY_2D_UNALIGNED, SC_CUDA_CU_MEMCPY_3D, SC_CUDA_CU_MEMCPY_3D_ASYNC, + SC_CUDA_CU_MEMCPY_3D_PEER, + SC_CUDA_CU_MEMCPY_3D_PEER_ASYNC, + SC_CUDA_CU_MEMCPY_ASYNC, SC_CUDA_CU_MEMCPY_A_TO_A, SC_CUDA_CU_MEMCPY_A_TO_D, SC_CUDA_CU_MEMCPY_A_TO_H, SC_CUDA_CU_MEMCPY_A_TO_H_ASYNC, SC_CUDA_CU_MEMCPY_D_TO_A, SC_CUDA_CU_MEMCPY_D_TO_D, + SC_CUDA_CU_MEMCPY_D_TO_D_ASYNC, SC_CUDA_CU_MEMCPY_D_TO_H, SC_CUDA_CU_MEMCPY_D_TO_H_ASYNC, SC_CUDA_CU_MEMCPY_H_TO_A, SC_CUDA_CU_MEMCPY_H_TO_A_ASYNC, SC_CUDA_CU_MEMCPY_H_TO_D, SC_CUDA_CU_MEMCPY_H_TO_D_ASYNC, + SC_CUDA_CU_MEMCPY_PEER, + SC_CUDA_CU_MEMCPY_PEER_ASYNC, SC_CUDA_CU_MEM_FREE, SC_CUDA_CU_MEM_FREE_HOST, SC_CUDA_CU_MEM_GET_ADDRESS_RANGE, @@ -125,12 +128,54 @@ typedef enum SCCudaAPIS_ { SC_CUDA_CU_MEM_HOST_ALLOC, SC_CUDA_CU_MEM_HOST_GET_DEVICE_POINTER, SC_CUDA_CU_MEM_HOST_GET_FLAGS, + SC_CUDA_CU_MEM_HOST_REGISTER, + SC_CUDA_CU_MEM_HOST_UNREGISTER, SC_CUDA_CU_MEMSET_D16, + SC_CUDA_CU_MEMSET_D16_ASYNC, SC_CUDA_CU_MEMSET_D2_D16, + SC_CUDA_CU_MEMSET_D2_D16_ASYNC, SC_CUDA_CU_MEMSET_D2_D32, + SC_CUDA_CU_MEMSET_D2_D32_ASYNC, SC_CUDA_CU_MEMSET_D2_D8, + SC_CUDA_CU_MEMSET_D2_D8_ASYNC, SC_CUDA_CU_MEMSET_D32, + SC_CUDA_CU_MEMSET_D32_ASYNC, SC_CUDA_CU_MEMSET_D8, + SC_CUDA_CU_MEMSET_D8_ASYNC, + + /* unified addresssing */ + SC_CUDA_CU_POINTER_GET_ATTRIBUTE, + + /* stream management api */ + SC_CUDA_CU_STREAM_CREATE, + SC_CUDA_CU_STREAM_DESTROY, + SC_CUDA_CU_STREAM_QUERY, + SC_CUDA_CU_STREAM_SYNCHRONIZE, + SC_CUDA_CU_STREAM_WAIT_EVENT, + + /* event management api */ + SC_CUDA_CU_EVENT_CREATE, + SC_CUDA_CU_EVENT_DESTROY, + SC_CUDA_CU_EVENT_ELAPSED_TIME, + SC_CUDA_CU_EVENT_QUERY, + SC_CUDA_CU_EVENT_RECORD, + SC_CUDA_CU_EVENT_SYNCHRONIZE, + + /* execution control api */ + SC_CUDA_CU_FUNC_GET_ATTRIBUTE, + SC_CUDA_CU_FUNC_SET_CACHE_CONFIG, + SC_CUDA_CU_LAUNCH_KERNEL, + SC_CUDA_CU_FUNC_SET_BLOCK_SHAPE, + SC_CUDA_CU_FUNC_SET_SHARED_SIZE, + SC_CUDA_CU_LAUNCH, + SC_CUDA_CU_LAUNCH_GRID, + SC_CUDA_CU_LAUNCH_GRID_ASYNC, + SC_CUDA_CU_PARAM_SETF, + SC_CUDA_CU_PARAM_SETI, + SC_CUDA_CU_PARAM_SET_SIZE, + SC_CUDA_CU_PARAM_SET_TEX_REF, + SC_CUDA_CU_PARAM_SETV, + /* texture reference api */ SC_CUDA_CU_TEX_REF_CREATE, SC_CUDA_CU_TEX_REF_DESTROY, @@ -152,83 +197,87 @@ typedef enum SCCudaAPIS_ { SCEnumCharMap sc_cuda_api_names_string_map[] = { /* init api */ { "cuInit", SC_CUDA_CU_INIT }, + + /* version management api */ + { "cuDriverGetVersion", SC_CUDA_CU_DRIVER_GET_VERSION }, + /* device management api */ - { "cuDeviceGetCount", SC_CUDA_CU_DEVICE_GET_COUNT }, + { "cuDeviceComputeCapability", SC_CUDA_CU_DEVICE_COMPUTE_CAPABILITY }, { "cuDeviceGet", SC_CUDA_CU_DEVICE_GET }, + { "cuDeviceGetAttribute", SC_CUDA_CU_DEVICE_GET_ATTRIBUTE }, + { "cuDeviceGetCount", SC_CUDA_CU_DEVICE_GET_COUNT }, { "cuDeviceGetName", SC_CUDA_CU_DEVICE_GET_NAME }, - { "cuDeviceTotalMem", SC_CUDA_CU_DEVICE_TOTAL_MEM }, { "cuDeviceGetProperties", SC_CUDA_CU_DEVICE_GET_PROPERTIES }, - { "cuDeviceGetAttributes", SC_CUDA_CU_DEVICE_GET_ATTRIBUTE }, - /* version management api */ - { "cuDriverGetVersion", SC_CUDA_CU_DRIVER_GET_VERSION, }, + { "cuDeviceTotalMem", SC_CUDA_CU_DEVICE_TOTAL_MEM }, + /* context management api */ - { "cuCtxAttach", SC_CUDA_CU_CTX_ATTACH }, { "cuCtxCreate", SC_CUDA_CU_CTX_CREATE }, { "cuCtxDestroy", SC_CUDA_CU_CTX_DESTROY }, - { "cuCtxDetach", SC_CUDA_CU_CTX_DETACH }, + { "cuCtxGetApiVersion", SC_CUDA_CU_CTX_GET_API_VERSION }, + { "cuCtxGetCacheConfig", SC_CUDA_CU_CTX_GET_CACHE_CONFIG }, + { "cuCtxGetCurrent", SC_CUDA_CU_CTX_GET_CURRENT }, { "cuCtxGetDevice", SC_CUDA_CU_CTX_GET_DEVICE }, + { "cuCtxGetLimit", SC_CUDA_CU_CTX_GET_LIMIT }, { "cuCtxPopCurrent", SC_CUDA_CU_CTX_POP_CURRENT }, { "cuCtxPushCurrent", SC_CUDA_CU_CTX_PUSH_CURRENT }, + { "cuCtxSetCacheConfig", SC_CUDA_CU_CTX_SET_CACHE_CONFIG }, + { "cuCtxSetCurrent", SC_CUDA_CU_CTX_SET_CURRENT }, + { "cuCtxSetLimit", SC_CUDA_CU_CTX_SET_LIMIT }, { "cuCtxSynchronize", SC_CUDA_CU_CTX_SYNCHRONIZE }, + { "cuCtxAttach", SC_CUDA_CU_CTX_ATTACH }, + { "cuCtxDetach", SC_CUDA_CU_CTX_DETACH }, + /* module management api */ { "cuModuleGetFunction", SC_CUDA_CU_MODULE_GET_FUNCTION }, { "cuModuleGetGlobal", SC_CUDA_CU_MODULE_GET_GLOBAL }, + { "cuModuleGetSurfRef", SC_CUDA_CU_MODULE_GET_SURF_REF }, { "cuModuleGetTexRef", SC_CUDA_CU_MODULE_GET_TEX_REF }, { "cuModuleLoad", SC_CUDA_CU_MODULE_LOAD }, { "cuModuleLoadData", SC_CUDA_CU_MODULE_LOAD_DATA }, { "cuModuleLoadDataEx", SC_CUDA_CU_MODULE_LOAD_DATA_EX }, { "cuModuleLoadFatBinary", SC_CUDA_CU_MODULE_LOAD_FAT_BINARY }, { "cuModuleUnload", SC_CUDA_CU_MODULE_UNLOAD }, - /* stream management api */ - { "cuStreamCreate", SC_CUDA_CU_STREAM_CREATE }, - { "cuStreamDestroy", SC_CUDA_CU_STREAM_DESTROY }, - { "cuStreamQuery", SC_CUDA_CU_STREAM_QUERY }, - { "cuStreamSynchronize", SC_CUDA_CU_STREAM_SYNCHRONIZE }, - /* event management api */ - { "cuEventCreate", SC_CUDA_CU_EVENT_CREATE }, - { "cuEventDestroy", SC_CUDA_CU_EVENT_DESTROY }, - { "cuEventElapseTime", SC_CUDA_CU_EVENT_ELAPSED_TIME }, - { "cuEventQuery", SC_CUDA_CU_EVENT_QUERY }, - { "cuEventRecord", SC_CUDA_CU_EVENT_RECORD }, - { "cuEventSynchronize", SC_CUDA_CU_EVENT_SYNCHRONIZE }, - /* execution control api */ - { "cuFuncGetAttribute", SC_CUDA_CU_FUNC_GET_ATTRIBUTE }, - { "cuFuncSetShape", SC_CUDA_CU_FUNC_SET_BLOCK_SHAPE }, - { "cuFuncSetSharedSize", SC_CUDA_CU_FUNC_SET_SHARED_SIZE }, - { "cuLaunch", SC_CUDA_CU_LAUNCH }, - { "cuLaunchGrid", SC_CUDA_CU_LAUNCH_GRID }, - { "cuLaunchGridAsync", SC_CUDA_CU_LAUNCH_GRID_ASYNC }, - { "cuParamSetf", SC_CUDA_CU_PARAM_SETF }, - { "cuParamSeti", SC_CUDA_CU_PARAM_SETI }, - { "cuParamSetSize", SC_CUDA_CU_PARAM_SET_SIZE }, - { "cuSetTexRef", SC_CUDA_CU_PARAM_SET_TEX_REF }, - { "cuSetv", SC_CUDA_CU_PARAM_SETV }, + /* memory management api */ { "cuArray3DCreate", SC_CUDA_CU_ARRAY_3D_CREATE }, { "cuArray3DGetDescriptor", SC_CUDA_CU_ARRAY_3D_GET_DESCRIPTOR }, { "cuArrayCreate", SC_CUDA_CU_ARRAY_CREATE }, { "cuArrayDestroy", SC_CUDA_CU_ARRAY_DESTROY }, { "cuArrayGetDescriptor", SC_CUDA_CU_ARRAY_GET_DESCRIPTOR }, + { "cuDeviceGetByPCIBusId", SC_CUDA_CU_DEVICE_GET_BY_PCI_BUS_ID }, + { "cuDeviceGetPCIBusId", SC_CUDA_CU_DEVICE_GET_PCI_BUS_ID }, + { "cuIpcCloseMemHandle", SC_CUDA_CU_IPC_CLOSE_MEM_HANDLE }, + { "cuIpcGetEventHandle", SC_CUDA_CU_IPC_GET_MEM_HANDLE }, + { "cuIpcGetMemHandle", SC_CUDA_CU_IPC_GET_MEM_HANDLE }, + { "cuIpcOpenEventHandle", SC_CUDA_CU_IPC_OPEN_EVENT_HANDLE }, + { "cuIpcOpenMemHandle", SC_CUDA_CU_IPC_OPEN_MEM_HANDLE }, { "cuMemAlloc", SC_CUDA_CU_MEM_ALLOC }, { "cuMemAllocHost", SC_CUDA_CU_MEM_ALLOC_HOST }, { "cuMemAllocPitch", SC_CUDA_CU_MEM_ALLOC_PITCH }, + { "cuMemcpy", SC_CUDA_CU_MEMCPY }, { "cuMemcpy2D", SC_CUDA_CU_MEMCPY_2D }, { "cuMemcpy2DAsync", SC_CUDA_CU_MEMCPY_2D_ASYNC }, { "cuMemcpy2DUnaligned", SC_CUDA_CU_MEMCPY_2D_UNALIGNED }, { "cuMemcpy3D", SC_CUDA_CU_MEMCPY_3D }, { "cuMemcpy3DAsync", SC_CUDA_CU_MEMCPY_3D_ASYNC }, + { "cuMemcpy3DPeer", SC_CUDA_CU_MEMCPY_3D_PEER }, + { "cuMemcpy3DPeerAsync", SC_CUDA_CU_MEMCPY_3D_PEER_ASYNC }, + { "cuMemcpyAsync", SC_CUDA_CU_MEMCPY_ASYNC }, { "cuMemcpyAtoA", SC_CUDA_CU_MEMCPY_A_TO_A }, { "cuMemcpyAtoD", SC_CUDA_CU_MEMCPY_A_TO_D }, { "cuMemcpyAtoH", SC_CUDA_CU_MEMCPY_A_TO_H }, - { "cuMemcpyAtoHAsyn", SC_CUDA_CU_MEMCPY_A_TO_H_ASYNC }, + { "cuMemcpyAtoHAsync", SC_CUDA_CU_MEMCPY_A_TO_H_ASYNC }, { "cuMemcpyDtoA", SC_CUDA_CU_MEMCPY_D_TO_A }, { "cuMemcpyDtoD", SC_CUDA_CU_MEMCPY_D_TO_D }, + { "cuMemcpyDtoDAsync", SC_CUDA_CU_MEMCPY_D_TO_D_ASYNC }, { "cuMemcpyDtoH", SC_CUDA_CU_MEMCPY_D_TO_H }, - { "cuMemcpyDtoHAsyn", SC_CUDA_CU_MEMCPY_D_TO_H_ASYNC }, + { "cuMemcpyDtoHAsync", SC_CUDA_CU_MEMCPY_D_TO_H_ASYNC }, { "cuMemcpyHtoA", SC_CUDA_CU_MEMCPY_H_TO_A }, { "cuMemcpyHtoAAsync", SC_CUDA_CU_MEMCPY_H_TO_A_ASYNC }, { "cuMemcpyHtoD", SC_CUDA_CU_MEMCPY_H_TO_D }, { "cuMemcpyHtoDAsync", SC_CUDA_CU_MEMCPY_H_TO_D_ASYNC }, + { "cuMemcpyPeer", SC_CUDA_CU_MEMCPY_PEER }, + { "cuMemcpyPeerAsync", SC_CUDA_CU_MEMCPY_PEER_ASYNC }, { "cuMemFree", SC_CUDA_CU_MEM_FREE }, { "cuMemFreeHost", SC_CUDA_CU_MEM_FREE_HOST }, { "cuMemGetAddressRange", SC_CUDA_CU_MEM_GET_ADDRESS_RANGE }, @@ -236,12 +285,54 @@ SCEnumCharMap sc_cuda_api_names_string_map[] = { { "cuMemHostAlloc", SC_CUDA_CU_MEM_HOST_ALLOC }, { "cuMemHostGetDevicePointer", SC_CUDA_CU_MEM_HOST_GET_DEVICE_POINTER }, { "cuMemHostGetFlags", SC_CUDA_CU_MEM_HOST_GET_FLAGS }, + { "cuMemHostRegister", SC_CUDA_CU_MEM_HOST_REGISTER }, + { "cuMemHostUnregister", SC_CUDA_CU_MEM_HOST_UNREGISTER }, { "cuMemsetD16", SC_CUDA_CU_MEMSET_D16 }, + { "cuMemsetD16Async", SC_CUDA_CU_MEMSET_D16_ASYNC }, { "cuMemsetD2D16", SC_CUDA_CU_MEMSET_D2_D16 }, + { "cuMemsetD2D16Async", SC_CUDA_CU_MEMSET_D2_D16_ASYNC }, { "cuMemsetD2D32", SC_CUDA_CU_MEMSET_D2_D32 }, + { "cuMemsetD2D32Async", SC_CUDA_CU_MEMSET_D2_D32_ASYNC }, { "cuMemsetD2D8", SC_CUDA_CU_MEMSET_D2_D8 }, + { "cuMemsetD2D8Async", SC_CUDA_CU_MEMSET_D2_D8_ASYNC }, { "cuMemsetD32", SC_CUDA_CU_MEMSET_D32 }, + { "cuMemsetD32Async", SC_CUDA_CU_MEMSET_D32_ASYNC }, { "cuMemsetD8", SC_CUDA_CU_MEMSET_D8 }, + { "cuMemsetD8Async", SC_CUDA_CU_MEMSET_D8_ASYNC }, + + /* unified addressing */ + { "cuPointerGetAttribute", SC_CUDA_CU_POINTER_GET_ATTRIBUTE }, + + /* stream management api */ + { "cuStreamCreate", SC_CUDA_CU_STREAM_CREATE }, + { "cuStreamDestroy", SC_CUDA_CU_STREAM_DESTROY }, + { "cuStreamQuery", SC_CUDA_CU_STREAM_QUERY }, + { "cuStreamSynchronize", SC_CUDA_CU_STREAM_SYNCHRONIZE }, + { "cuStreamWaitEvent", SC_CUDA_CU_STREAM_WAIT_EVENT }, + + /* event management api */ + { "cuEventCreate", SC_CUDA_CU_EVENT_CREATE }, + { "cuEventDestroy", SC_CUDA_CU_EVENT_DESTROY }, + { "cuEventElapseTime", SC_CUDA_CU_EVENT_ELAPSED_TIME }, + { "cuEventQuery", SC_CUDA_CU_EVENT_QUERY }, + { "cuEventRecord", SC_CUDA_CU_EVENT_RECORD }, + { "cuEventSynchronize", SC_CUDA_CU_EVENT_SYNCHRONIZE }, + + /* execution control api */ + { "cuFuncGetAttribute", SC_CUDA_CU_FUNC_GET_ATTRIBUTE }, + { "cuFuncSetCacheConfig", SC_CUDA_CU_FUNC_SET_CACHE_CONFIG }, + { "cuLaunchKernel", SC_CUDA_CU_LAUNCH_KERNEL }, + { "cuFuncSetBlockShape", SC_CUDA_CU_FUNC_SET_BLOCK_SHAPE }, + { "cuFuncSetSharedSize", SC_CUDA_CU_FUNC_SET_SHARED_SIZE }, + { "cuLaunch", SC_CUDA_CU_LAUNCH }, + { "cuLaunchGrid", SC_CUDA_CU_LAUNCH_GRID }, + { "cuLaunchGridAsync", SC_CUDA_CU_LAUNCH_GRID_ASYNC }, + { "cuParamSetf", SC_CUDA_CU_PARAM_SETF }, + { "cuParamSeti", SC_CUDA_CU_PARAM_SETI }, + { "cuParamSetSize", SC_CUDA_CU_PARAM_SET_SIZE }, + { "cuSetTexRef", SC_CUDA_CU_PARAM_SET_TEX_REF }, + { "cuSetv", SC_CUDA_CU_PARAM_SETV }, + /* texture reference api */ { "cuTexRefCreate", SC_CUDA_CU_TEX_REF_CREATE}, { "cuTexRefDestroy", SC_CUDA_CU_TEX_REF_DESTROY}, @@ -258,6 +349,8 @@ SCEnumCharMap sc_cuda_api_names_string_map[] = { { "cuTexRefSetFilterMode", SC_CUDA_CU_TEX_REF_SET_FILTER_MODE}, { "cuTexRefSetFlags", SC_CUDA_CU_TEX_REF_SET_FLAGS}, { "cuTexRefSetFormat", SC_CUDA_CU_TEX_REF_SET_FORMAT}, + + { NULL, -1 }, }; static SCCudaDevices *devices = NULL; @@ -282,10 +375,15 @@ static const char *SCCudaGetErrorCodeInString(int err) CASE_CODE(CUDA_ERROR_OUT_OF_MEMORY); CASE_CODE(CUDA_ERROR_NOT_INITIALIZED); CASE_CODE(CUDA_ERROR_DEINITIALIZED); + CASE_CODE(CUDA_ERROR_PROFILER_DISABLED); + CASE_CODE(CUDA_ERROR_PROFILER_NOT_INITIALIZED); + CASE_CODE(CUDA_ERROR_PROFILER_ALREADY_STARTED); + CASE_CODE(CUDA_ERROR_PROFILER_ALREADY_STOPPED); CASE_CODE(CUDA_ERROR_NO_DEVICE); CASE_CODE(CUDA_ERROR_INVALID_DEVICE); CASE_CODE(CUDA_ERROR_INVALID_IMAGE); CASE_CODE(CUDA_ERROR_INVALID_CONTEXT); + /* deprecated error code as of 3.2 */ CASE_CODE(CUDA_ERROR_CONTEXT_ALREADY_CURRENT); CASE_CODE(CUDA_ERROR_MAP_FAILED); CASE_CODE(CUDA_ERROR_UNMAP_FAILED); @@ -294,8 +392,16 @@ static const char *SCCudaGetErrorCodeInString(int err) CASE_CODE(CUDA_ERROR_NO_BINARY_FOR_GPU); CASE_CODE(CUDA_ERROR_ALREADY_ACQUIRED); CASE_CODE(CUDA_ERROR_NOT_MAPPED); + CASE_CODE(CUDA_ERROR_NOT_MAPPED_AS_ARRAY); + CASE_CODE(CUDA_ERROR_NOT_MAPPED_AS_POINTER); + CASE_CODE(CUDA_ERROR_ECC_UNCORRECTABLE); + CASE_CODE(CUDA_ERROR_UNSUPPORTED_LIMIT); + CASE_CODE(CUDA_ERROR_CONTEXT_ALREADY_IN_USE); CASE_CODE(CUDA_ERROR_INVALID_SOURCE); CASE_CODE(CUDA_ERROR_FILE_NOT_FOUND); + CASE_CODE(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND); + CASE_CODE(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED); + CASE_CODE(CUDA_ERROR_OPERATING_SYSTEM); CASE_CODE(CUDA_ERROR_INVALID_HANDLE); CASE_CODE(CUDA_ERROR_NOT_FOUND); CASE_CODE(CUDA_ERROR_NOT_READY); @@ -303,6 +409,14 @@ static const char *SCCudaGetErrorCodeInString(int err) CASE_CODE(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES); CASE_CODE(CUDA_ERROR_LAUNCH_TIMEOUT); CASE_CODE(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING); + CASE_CODE(CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED); + CASE_CODE(CUDA_ERROR_PEER_ACCESS_NOT_ENABLED); + CASE_CODE(CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE); + CASE_CODE(CUDA_ERROR_CONTEXT_IS_DESTROYED); + CASE_CODE(CUDA_ERROR_ASSERT); + CASE_CODE(CUDA_ERROR_TOO_MANY_PEERS); + CASE_CODE(CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED); + CASE_CODE(CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED); CASE_CODE(CUDA_ERROR_UNKNOWN); default: return "CUDA_UNKNOWN_ERROR_CODE"; @@ -336,95 +450,21 @@ static int SCCudaHandleRetValue(CUresult result, SCCudaAPIS api_type) } } -/****************************Memory_Management_API*****************************/ +/*****************************Cuda_Initialization_API**************************/ /** - * \brief Creates a CUDA array according to the CUDA_ARRAY3D_DESCRIPTOR - * structure pAllocateArray and returns a handle to the new CUDA - * array in *p_handle. The CUDA_ARRAY3D_DESCRIPTOR is defined as: - * - * typedef struct { - * unsigned int Width; - * unsigned int Height; - * unsigned int Depth; - * CUarray_format Format; - * unsigned int NumChannels; - * unsigned int Flags; - * } CUDA_ARRAY3D_DESCRIPTOR; - * - * where: - * - * - Width, Height, and Depth are the width, height, and depth of the - * CUDA array (in elements); the CUDA array is one-dimensional if - * height and depth are 0, two-dimensional if depth is 0, and - * three-dimensional otherwise; - * - Format speci?es the format of the elements; CUarray_format is - * defined as: - * - * typedef enum CUarray_format_enum { - * CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, - * CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, - * CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, - * CU_AD_FORMAT_SIGNED_INT8 = 0x08, - * CU_AD_FORMAT_SIGNED_INT16 = 0x09, - * CU_AD_FORMAT_SIGNED_INT32 = 0x0a, - * CU_AD_FORMAT_HALF = 0x10, - * CU_AD_FORMAT_FLOAT = 0x20 - * } CUarray_format; - * - * - NumChannels speci?es the number of packed components per CUDA array - * element; it may be 1, 2, or 4; - * - Flags provides for future features. For now, it must be set to 0. - * - * Here are examples of CUDA array descriptions: - * - * Description for a CUDA array of 2048 floats: - * - * CUDA_ARRAY3D_DESCRIPTOR desc; - * desc.Format = CU_AD_FORMAT_FLOAT; - * desc.NumChannels = 1; - * desc.Width = 2048; - * desc.Height = 0; - * desc.Depth = 0; - * - * Description for a 64 x 64 CUDA array of floats: - * - * CUDA_ARRAY3D_DESCRIPTOR desc; - * desc.Format = CU_AD_FORMAT_FLOAT; - * desc.NumChannels = 1; - * desc.Width = 64; - * desc.Height = 64; - * desc.Depth = 0; - * - * Description for a width x height x depth CUDA array of 64-bit, - * 4x16-bit float16's: - * - * CUDA_ARRAY3D_DESCRIPTOR desc; - * desc.FormatFlags = CU_AD_FORMAT_HALF; - * desc.NumChannels = 4; - * desc.Width = width; - * desc.Height = height; - * desc.Depth = depth; + * \internal + * \brief Inits the cuda driver API. * - * \param p_handle Returned Handle. - * \param p_allocate_array 3D array descriptor. + * \param flags Currently should be 0. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaArray3DCreate(CUarray *p_handle, - const CUDA_ARRAY3D_DESCRIPTOR *p_allocate_array) +int SCCudaInit(unsigned int flags) { - CUresult result = 0; - - if (p_handle == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_handle is NULL"); - goto error; - } - - result = cuArray3DCreate(p_handle, p_allocate_array); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_3D_CREATE) == -1) + CUresult result = cuInit(flags); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_INIT) == -1) goto error; return 0; @@ -433,34 +473,30 @@ int SCCudaArray3DCreate(CUarray *p_handle, return -1; } +/*****************************Version_Management_API***************************/ + /** - * \brief Returns in *p_rray_descriptor a descriptor containing information on - * the format and dimensions of the CUDA array h_array. It is useful for - * subroutines that have been passed a CUDA array, but need to know the - * CUDA array parameters for validation or other purposes. - * - * This function may be called on 1D and 2D arrays, in which case the - * Height and/or Depth members of the descriptor struct will be set to 0. + * \brief Returns in *driver_version the version number of the installed CUDA + * driver. This function automatically returns CUDA_ERROR_INVALID_VALUE + * if the driver_version argument is NULL. * - * \param p_array_descriptor Returned 3D array descriptor. - * \param h_array 3D array to get descriptor of. + * \param driver_version Returns the CUDA driver version. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *p_array_descriptor, - CUarray h_array) +int SCCudaDriverGetVersion(int *driver_version) { CUresult result = 0; - if (p_array_descriptor == NULL) { + if (driver_version == NULL) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_array_descriptor is NULL"); + "driver_version NULL"); goto error; } - result = cuArray3DGetDescriptor(p_array_descriptor, h_array); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_3D_GET_DESCRIPTOR) == -1) + result = cuDriverGetVersion(driver_version); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_DRIVER_GET_VERSION) == -1) goto error; return 0; @@ -469,95 +505,32 @@ int SCCudaArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *p_array_descriptor, return -1; } +/*****************************Device_Management_API****************************/ + /** - * \brief Creates a CUDA array according to the CUDA_ARRAY_DESCRIPTOR structure - * p_allocate_array and returns a handle to the new CUDA array in - * p_handle. The CUDA_ARRAY_DESCRIPTOR is defined as: - * - * typedef struct { - * unsigned int Width; - * unsigned int Height; - * CUarray_format Format; - * unsigned int NumChannels; - * } CUDA_ARRAY_DESCRIPTOR; - * - * where: - * - * - Width, and Height are the width, and height of the CUDA array - * (in elements); the CUDA array is one-dimensional if height is 0, - * two-dimensional otherwise; - * - Format speci?es the format of the elements; CUarray_format is - * defined as: - * - * typedef enum CUarray_format_enum { - * CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, - * CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, - * CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, - * CU_AD_FORMAT_SIGNED_INT8 = 0x08, - * CU_AD_FORMAT_SIGNED_INT16 = 0x09, - * CU_AD_FORMAT_SIGNED_INT32 = 0x0a, - * CU_AD_FORMAT_HALF = 0x10, - * CU_AD_FORMAT_FLOAT = 0x20 - * } CUarray_format; - * - * - NumChannels specifies the number of packed components per CUDA - * array element; it may be 1, 2, or 4; - * - * Here are examples of CUDA array descriptions: - * - * Description for a CUDA array of 2048 floats: - * - * CUDA_ARRAY_DESCRIPTOR desc; - * desc.Format = CU_AD_FORMAT_FLOAT; - * desc.NumChannels = 1; - * desc.Width = 2048; - * desc.Height = 1; - * - * Description for a 64 x 64 CUDA array of floats: - * - * CUDA_ARRAY_DESCRIPTOR desc; - * desc.Format = CU_AD_FORMAT_FLOAT; - * desc.NumChannels = 1; - * desc.Width = 64; - * desc.Height = 64; - * - * Description for a width x height CUDA array of 64-bit, 4x16-bit - * float16's: - * - * CUDA_ARRAY_DESCRIPTOR desc; - * desc.FormatFlags = CU_AD_FORMAT_HALF; - * desc.NumChannels = 4; - * desc.Width = width; - * desc.Height = height; - * - * Description for a width x height CUDA array of 16-bit elements, each - * of which is two 8-bit unsigned chars: - * - * CUDA_ARRAY_DESCRIPTOR arrayDesc; - * desc.FormatFlags = CU_AD_FORMAT_UNSIGNED_INT8; - * desc.NumChannels = 2; - * desc.Width = width; - * desc.Height = height; + * \internal + * \brief Returns the major and the minor revision numbers that define the + * compute capability for the device that is sent as the argument. * - * \param p_handle Returned array. - * \param p_allocate_array Array descriptor. + * \param major Pointer to an integer, that will be updated with the major revision. + * \param minor Pointer to an integer, that will be updated with the minor revision. + * \param dev The device handle. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaArrayCreate(CUarray *p_handle, - const CUDA_ARRAY_DESCRIPTOR *p_allocate_array) +int SCCudaDeviceComputeCapability(int *major, int *minor, CUdevice dev) { CUresult result = 0; - if (p_handle == NULL) { + if (major == NULL || minor == NULL) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_handle is NULL"); + "major is NULL or minor is NULL"); goto error; } - result = cuArrayCreate(p_handle, p_allocate_array); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_CREATE) == -1) + result = cuDeviceComputeCapability(major, minor, dev); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_COMPUTE_CAPABILITY) == -1) goto error; return 0; @@ -566,19 +539,30 @@ int SCCudaArrayCreate(CUarray *p_handle, return -1; } - /** - * \brief Destroys the CUDA array h_array. + * \internal + * \brief Returns a device handle given an ordinal in the range + * [0, cuDeviceGetCount() - 1]. * - * \param h_array Array to destroy. + * \param device Pointer to a CUDevice instance that will be updated with the + * device handle. + * \param ordinal An index in the range [0, cuDeviceGetCount() - 1]. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaArrayDestroy(CUarray h_array) +int SCCudaDeviceGet(CUdevice *device, int ordinal) { - int result = cuArrayDestroy(h_array); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_DESTROY) == -1) + CUresult result = 0; + + if (device == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "device NULL"); + goto error; + } + + result = cuDeviceGet(device, ordinal); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET) == -1) goto error; return 0; @@ -588,30 +572,76 @@ int SCCudaArrayDestroy(CUarray h_array) } /** - * \brief Returns in *p_array_descriptor a descriptor containing information on - * the format and dimensions of the CUDA array h_array. It is useful for - * subroutines that have been passed a CUDA array, but need to know the - * CUDA array parameters for validation or other purposes. + * \internal + * \brief Returns the various attributes for the device that is sent as the arg. * - * \param p_array_descriptor Returned array descriptor. - * \param h_array Array to get descriptor of. + * The supported attributes are: + * + * CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads + * per block; + * CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block; + * CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block; + * CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block; + * CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid; + * CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid; + * CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid; + * CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of + * shared mem-ory available to a thread block in bytes; this amount + * is shared by all thread blocks simultaneously resident on a + * multiprocessor; + * CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device + * for __constant_-_ variables in a CUDA C kernel in bytes; + * CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads; + * CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the + * memory copy functions that involve memory regions allocated + * through cuMemAllocPitch(); + * CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit + * registers avail-able to a thread block; this number is shared by + * all thread blocks simultaneously resident on a multiprocessor; + * CU_DEVICE_ATTRIBUTE_CLOCK_RATE: Peak clock frequency in kilohertz; + * CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture + * base addresses aligned to textureAlign bytes do not need an offset + * applied to texture fetches; + * CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy + * memory between host and device while executing a kernel, or 0 if not; + * CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on + * the device; + * CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit + * for kernels executed on the device, or 0 if not; + * CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the + * memory subsystem, or 0 if not; + * CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host + * memory into the CUDA address space, or 0 if not; + * CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently + * in. Available modes are as follows: + * - CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted + * and can have multiple CUDA contexts present at a single time. + * - CU_COMPUTEMODE_EXCLUSIVE: Compute-exclusive mode - Device can have + * only one CUDA con-text present on it at a time. + * - CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is + * prohibited from creating new CUDA contexts. + * + * \param pi Pointer to an interger instance that will be updated with the + * attribute value. + * \param attrib Device attribute to query. + * \param dev The device handle. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *p_array_descriptor, - CUarray h_array) +int SCCudaDeviceGetAttribute(int *pi, CUdevice_attribute attrib, + CUdevice dev) { CUresult result = 0; - if (p_array_descriptor == NULL) { + if (pi == NULL) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_array_descriptor is NULL"); + "prop is NULL"); goto error; } - result = cuArrayGetDescriptor(p_array_descriptor, h_array); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_GET_DESCRIPTOR) == -1) + result = cuDeviceGetAttribute(pi, attrib, dev); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_ATTRIBUTE) == -1) goto error; return 0; @@ -621,29 +651,27 @@ int SCCudaArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *p_array_descriptor, } /** - * \brief Returns in *p_array_descriptor a descriptor containing information on - * the format and dimensions of the CUDA array h_array. It is useful for - * subroutines that have been passed a CUDA array, but need to know the - * CUDA array parameters for validation or other purposes. + * \internal + * \brief Gets the total no of devices with compute capability greater than or + * equal to 1.0 that are available for execution. * - * \param p_array_descriptor Returned array descriptor. - * \param h_array Array to get descriptor of. + * \param count Pointer to an integer that will be updated with the device count. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemAlloc(CUdeviceptr *dptr, unsigned int byte_size) +int SCCudaDeviceGetCount(int *count) { CUresult result = 0; - if (dptr == NULL) { + if (count == NULL) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "dptr is NULL"); + "count NULL"); goto error; } - result = cuMemAlloc(dptr, byte_size); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC) == -1) + result = cuDeviceGetCount(count); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_COUNT) == -1) goto error; return 0; @@ -653,36 +681,28 @@ int SCCudaMemAlloc(CUdeviceptr *dptr, unsigned int byte_size) } /** - * \brief Allocates bytesize bytes of host memory that is page-locked and - * accessible to the device. The driver tracks the vir-tual memory - * ranges allocated with this function and automatically accelerates - * calls to functions such as cuMemcpy(). Since the memory can be - * accessed directly by the device, it can be read or written with - * much higher bandwidth than pageable memory obtained with functions - * such as SCMalloc(). Allocating excessive amounts of memory with - * cuMemAllocHost() may degrade system performance, since it reduces - * the amount of memory available to the system for paging. As a result, - * this function is best used sparingly to allocate staging areas for - * data exchange between host and device. + * \internal + * \brief Returns the device name, given the device handle. * - * \param pp Returned host pointer to page-locked memory. - * \param byte_size Requested allocation size in bytes. + * \param name Pointer to a char buffer which will be updated with the device name. + * \param len Length of the above buffer. + * \param dev The device handle. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemAllocHost(void **pp, unsigned int byte_size) +int SCCudaDeviceGetName(char *name, int len, CUdevice dev) { CUresult result = 0; - if (pp == NULL) { + if (name == NULL || len == 0) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pp is NULL"); + "name is NULL or len is 0"); goto error; } - result = cuMemAllocHost(pp, byte_size); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC) == -1) + result = cuDeviceGetName(name, len, dev); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_NAME) == -1) goto error; return 0; @@ -692,58 +712,41 @@ int SCCudaMemAllocHost(void **pp, unsigned int byte_size) } /** - * \brief Allocates at least width_in_bytes * height bytes of linear memory on the - * device and returns in *dptr a pointer to the allocated memory. The - * function may pad the allocation to ensure that corresponding pointers in - * any given row will continue to meet the alignment requirements for - * coalescing as the address is updated from row to row. ElementSizeBytes - * specifies the size of the largest reads and writes that will be - * performed on the memory range. + * \internal + * \brief Returns the properties of the device. The CUdevprop structure is + * defined as * - * element_size_bytes may be 4, 8 or 16 (since coalesced memory - * transactions are not possible on other data sizes). If element_size_bytes - * is smaller than the actual read/write size of a kernel, the kernel will - * run correctly, but possibly at reduced speed. The pitch returned in - * *p_itch by cuMemAllocPitch() is the width in bytes of the allocation. - * The intended usage of pitch is as a separate parameter of the allocation, - * used to compute addresses within the 2D array. Given the row and column - * of an array element of type T, the address is computed as: + * typedef struct CUdevprop_st { + * int maxThreadsPerBlock; + * int maxThreadsDim[3]; + * int maxGridSize[3]; + * int sharedMemPerBlock; + * int totalConstantMemory; + * int SIMDWidth; + * int memPitch; + * int regsPerBlock; + * int clockRate; + * int textureAlign + * } CUdevprop; * - * T * p_element = (T*)((char*)base_address + row * pitch) + column; - * - * The pitch returned by cuMemAllocPitch() is guaranteed to work with - * cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it - * is recommended that programmers consider performing pitch allocations - * using cuMemAllocPitch(). Due to alignment restrictions in the hardware, - * this is especially true if the application will be performing 2D memory - * copies between different regions of device memory (whether linear memory - * or CUDA arrays). - * - * \param dptr Returned device pointer. - * \param p_pitch Returned pitch of allocation in bytes. - * \param width_in_bytes Requested allocation width in bytes. - * \param height Requested allocation width in rows. - * \param element_size_bytes Size of largest reads/writes for range. + * \param prop Pointer to a CUdevprop instance that holds the device properties. + * \param dev The device handle. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemAllocPitch(CUdeviceptr *dptr, unsigned int *p_pitch, - unsigned int width_in_bytes, - unsigned int height, - unsigned int element_size_bytes) +int SCCudaDeviceGetProperties(CUdevprop *prop, CUdevice dev) { CUresult result = 0; - if (dptr == NULL || p_pitch == NULL) { + if (prop == NULL) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "dptr is NULL or p_pitch is NULL"); + "prop is NULL"); goto error; } - result = cuMemAllocPitch(dptr, p_pitch, width_in_bytes, height, - element_size_bytes); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC_PITCH) == -1) + result = cuDeviceGetProperties(prop, dev); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_PROPERTIES) == -1) goto error; return 0; @@ -753,119 +756,29 @@ int SCCudaMemAllocPitch(CUdeviceptr *dptr, unsigned int *p_pitch, } /** - * \brief Perform a 2D memory copy according to the parameters specified in - * p_copy. The CUDA_MEMCPY2D structure is defined as: - * - * typedef struct CUDA_MEMCPY2D_st { - * unsigned int srcXInBytes, srcY; - * CUmemorytype srcMemoryType; - * const void *srcHost; - * CUdeviceptr srcDevice; - * CUarray srcArray; - * unsigned int srcPitch; - * unsigned int dstXInBytes, dstY; - * CUmemorytype dstMemoryType; - * void *dstHost; - * CUdeviceptr dstDevice; - * CUarray dstArray; - * unsigned int dstPitch; - * unsigned int WidthInBytes; - * unsigned int Height; - * } CUDA_MEMCPY2D; - * - * where: - * - * - srcMemoryType and dstMemoryType specify the type of memory of the - * source and destination, respectively; - * - * CUmemorytype_enum is de?ned as: - * - * typedef enum CUmemorytype_enum { - * CU_MEMORYTYPE_HOST = 0x01, - * CU_MEMORYTYPE_DEVICE = 0x02, - * CU_MEMORYTYPE_ARRAY = 0x03 - * } CUmemorytype; - * - * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost and srcPitch specify - * the (host) base address of the source data and the bytes per row to - * apply. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice and srcPitch - * specify the (device) base address of the source data and the bytes per - * row to apply. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray speci?es the handle - * of the source data. srcHost, srcDevice and srcPitch are ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch specify - * the (host) base address of the destination data and the bytes per row - * to apply. dstArray is ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch - * specify the (device) base address of the destination data and the - * bytes per row to apply. dstArray is ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the handle - * of the destination data dstHost, dstDevice and dstPitch are ignored. - * - * - srcXInBytes and srcY specify the base address of the source data for - * the copy. - * - * For host pointers, the starting address is - * - * void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; - * - * For CUDA arrays, srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes and dstY specify the base address of the destination data - * for the copy. - * - * For host pointers, the base address is - * - * void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; - * - * For CUDA arrays, dstXInBytes must be evenly divisible by the array - * element size. - * - * - WidthInBytes and Height specify the width (in bytes) and height of - * the 2D copy being performed. Any pitches must be greater than or - * equal to WidthInBytes. - * - * cuMemcpy2D() returns an error if any pitch is greater than the - * maximum allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). cuMemAllocPitch() - * passes back pitches that always work with cuMemcpy2D(). On intra-device - * memory copies (device ? device, CUDA array ? device, CUDA array ? - * CUDA array), cuMemcpy2D() may fail for pitches not computed by - * cuMemAllocPitch(). cuMemcpy2DUnaligned() does not have this restriction, - * but may run signi?cantly slower in the cases where cuMemcpy2D() would - * have returned an error code. + * \internal + * \brief Returns the total amount of memory availabe on the device which + * is sent as the argument. * - * \param p_copy Parameters for the memory copy. + * \param bytes Pointer to an unsigned int instance, that will be updated with + * total memory for the device. + * \param dev The device handle. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemcpy2D(const CUDA_MEMCPY2D *p_copy) +int SCCudaDeviceTotalMem(size_t *bytes, CUdevice dev) { CUresult result = 0; - if (p_copy == NULL) { + if (bytes == NULL) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_copy is NULL"); + "bytes is NULL"); goto error; } - result = cuMemcpy2D(p_copy); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_2D) == -1) + result = cuDeviceTotalMem(bytes, dev); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_TOTAL_MEM) == -1) goto error; return 0; @@ -875,253 +788,1096 @@ int SCCudaMemcpy2D(const CUDA_MEMCPY2D *p_copy) } /** - * \brief Perform a 2D memory copy according to the parameters specified in - * p_copy. The CUDA_MEMCPY2D structure is defined as: - * - * typedef struct CUDA_MEMCPY2D_st { - * unsigned int srcXInBytes, srcY; - * CUmemorytype srcMemoryType; - * const void *srcHost; - * CUdeviceptr srcDevice; - * CUarray srcArray; - * unsigned int srcPitch; - * unsigned int dstXInBytes, dstY; - * CUmemorytype dstMemoryType; - * void *dstHost; - * CUdeviceptr dstDevice; - * CUarray dstArray; - * unsigned int dstPitch; - * unsigned int WidthInBytes; - * unsigned int Height; - * } CUDA_MEMCPY2D; - * - * where: - * - * - srcMemoryType and dstMemoryType specify the type of memory of the - * source and destination, respectively; - * - * CUmemorytype_enum is de?ned as: - * - * typedef enum CUmemorytype_enum { - * CU_MEMORYTYPE_HOST = 0x01, - * CU_MEMORYTYPE_DEVICE = 0x02, - * CU_MEMORYTYPE_ARRAY = 0x03 - * } CUmemorytype; - * - * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost and srcPitch specify - * the (host) base address of the source data and the bytes per row to - * apply. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice and srcPitch - * specify the (device) base address of the source data and the bytes per - * row to apply. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray speci?es the handle - * of the source data. srcHost, srcDevice and srcPitch are ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch specify - * the (host) base address of the destination data and the bytes per row - * to apply. dstArray is ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch - * specify the (device) base address of the destination data and the - * bytes per row to apply. dstArray is ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the handle - * of the destination data dstHost, dstDevice and dstPitch are ignored. - * - * - srcXInBytes and srcY specify the base address of the source data for - * the copy. - * - * For host pointers, the starting address is - * - * void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; - * - * For CUDA arrays, srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes and dstY specify the base address of the destination data - * for the copy. - * - * For host pointers, the base address is - * - * void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; - * - * For CUDA arrays, dstXInBytes must be evenly divisible by the array - * element size. - * - * - WidthInBytes and Height specify the width (in bytes) and height of - * the 2D copy being performed. Any pitches must be greater than or - * equal to WidthInBytes. - * - * cuMemcpy2D() returns an error if any pitch is greater than the - * maximum allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). cuMemAllocPitch() - * passes back pitches that always work with cuMemcpy2D(). On intra-device - * memory copies (device ? device, CUDA array ? device, CUDA array ? - * CUDA array), cuMemcpy2D() may fail for pitches not computed by - * cuMemAllocPitch(). cuMemcpy2DUnaligned() does not have this restriction, - * but may run signi?cantly slower in the cases where cuMemcpy2D() would - * have returned an error code. - * - * cuMemcpy2DAsync() is asynchronous and can optionally be associated to a - * stream by passing a non-zero hStream argument. It only works on - * page-locked host memory and returns an error if a pointer to pageable - * memory is passed as input. - * - * \param p_copy Parameters for the memory copy. - * \param h_stream Stream identifier. + * \internal + * \brief Creates and returns a new instance of SCCudaDevice. * - * \retval 0 On success. - * \retval -1 On failure. + * \retval device Pointer to the new instance of SCCudaDevice. */ -int SCCudaMemcpy2DAsync(const CUDA_MEMCPY2D *p_copy, CUstream h_stream) +static SCCudaDevice *SCCudaAllocSCCudaDevice(void) { - CUresult result = 0; - - if (p_copy == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_copy is NULL"); - goto error; - } + SCCudaDevice *device = SCMalloc(sizeof(SCCudaDevice)); + if (device == NULL) + return NULL; + memset(device, 0 , sizeof(SCCudaDevice)); - result = cuMemcpy2DAsync(p_copy, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_2D_ASYNC) == -1) - goto error; + return device; +} - return 0; +/** + * \internal + * \brief Frees an instance of SCCudaDevice. + * + * \param device Pointer to the an instance of SCCudaDevice to be freed. + */ +static void SCCudaDeAllocSCCudaDevice(SCCudaDevice *device) +{ + SCFree(device); + + return; +} + +/** + * \internal + * \brief Creates and returns a new instance of SCCudaDevices. + * + * \retval devices Pointer to the new instance of SCCudaDevices. + */ +static SCCudaDevices *SCCudaAllocSCCudaDevices(void) +{ + SCCudaDevices *devices = SCMalloc(sizeof(SCCudaDevices)); + if (devices == NULL) + return NULL; + memset(devices, 0 , sizeof(SCCudaDevices)); + + return devices; +} + +/** + * \internal + * \brief Frees an instance of SCCudaDevices. + * + * \param device Pointer to the an instance of SCCudaDevices to be freed. + */ +static void SCCudaDeAllocSCCudaDevices(SCCudaDevices *devices) +{ + int i = 0; + + if (devices == NULL) + return; + + if (devices->devices != NULL) { + for (i = 0; i < devices->count; i++) + SCCudaDeAllocSCCudaDevice(devices->devices[i]); + + SCFree(devices->devices); + } + + SCFree(devices); + + return; +} + +/** + * \brief Retrieves all the devices and all the information corresponding to + * the devices on the CUDA device available on this system and returns + * a SCCudaDevices instances which holds all this information. + * + * \retval devices Pointer to a SCCudaDevices instance that holds information + * for all the CUDA devices on the system. + */ +static SCCudaDevices *SCCudaGetDevices(void) +{ + SCCudaDevices *devices = SCCudaAllocSCCudaDevices(); + int i = 0; + + if (SCCudaDeviceGetCount(&devices->count) == -1) + goto error; + + devices->devices = SCMalloc(devices->count * sizeof(SCCudaDevice *)); + if (devices->devices == NULL) + goto error; + + /* update the device properties */ + for (i = 0; i < devices->count; i++) { + devices->devices[i] = SCCudaAllocSCCudaDevice(); + + if (SCCudaDeviceGet(&devices->devices[i]->device, i) == -1) + goto error; + + if (SCCudaDeviceComputeCapability(&devices->devices[i]->major_rev, + &devices->devices[i]->minor_rev, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetName(devices->devices[i]->name, + SC_CUDA_DEVICE_NAME_MAX_LEN, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceTotalMem(&devices->devices[i]->bytes, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetProperties(&devices->devices[i]->prop, + devices->devices[i]->device) == -1) { + goto error; + } + + /* retrieve the attributes */ + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_threads_per_block, + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_block_dim_x, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_block_dim_y, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_block_dim_z, + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_grid_dim_x, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_grid_dim_y, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_grid_dim_z, + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_shared_memory_per_block, + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_total_constant_memory, + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_warp_size, + CU_DEVICE_ATTRIBUTE_WARP_SIZE, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_pitch, + CU_DEVICE_ATTRIBUTE_MAX_PITCH, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_registers_per_block, + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_clock_rate, + CU_DEVICE_ATTRIBUTE_CLOCK_RATE, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_texture_alignment, + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_gpu_overlap, + CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_multiprocessor_count, + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_kernel_exec_timeout, + CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_integrated, + CU_DEVICE_ATTRIBUTE_INTEGRATED, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_can_map_host_memory, + CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, + devices->devices[i]->device) == -1) { + goto error; + } + + if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_compute_mode, + CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, + devices->devices[i]->device) == -1) { + goto error; + } + } + +#ifdef DEBUG + SCCudaPrintDeviceList(devices); +#endif + + return devices; + + error: + SCCudaDeAllocSCCudaDevices(devices); + return NULL; +} + +/** + * \brief Prints the information for all the devices for this CUDA platform, + * supplied inside the argument. + * + * \param devices Pointer to a SCCudaDevices instance that holds information on + * the devices. + */ +void SCCudaPrintDeviceList(SCCudaDevices *devices) +{ + int i = 0; + + if (devices == NULL) { + SCLogError(SC_ERR_CUDA_ERROR, "CUDA environment not initialized. " + "Please initialized the CUDA environment by calling " + "SCCudaInitCudaEnvironment() before making any calls " + "to the CUDA API."); + return; + } + + SCLogDebug("Printing device info for this CUDA context"); + SCLogDebug("No of devices: %d", devices->count); + + for (i = 0; i < devices->count; i++) { + SCLogDebug("Device ID: %d", devices->devices[i]->device); + SCLogDebug("Device Name: %s", devices->devices[i]->name); + SCLogDebug("Device Major Revision: %d", devices->devices[i]->major_rev); + SCLogDebug("Device Minor Revision: %d", devices->devices[i]->minor_rev); + + /* Cudevprop */ + SCLogDebug("Device Max Threads Per Block: %d", + devices->devices[i]->prop.maxThreadsPerBlock); + SCLogDebug("Device Max Threads Dim: [%d, %d, %d]", + devices->devices[i]->prop.maxThreadsDim[0], + devices->devices[i]->prop.maxThreadsDim[1], + devices->devices[i]->prop.maxThreadsDim[2]); + SCLogDebug("Device Max Grid Size: [%d, %d, %d]", + devices->devices[i]->prop.maxGridSize[0], + devices->devices[i]->prop.maxGridSize[1], + devices->devices[i]->prop.maxGridSize[2]); + SCLogDebug("Device Shared Memory Per Block: %d", + devices->devices[i]->prop.sharedMemPerBlock); + SCLogDebug("Device Total Constant Memory: %d", + devices->devices[i]->prop.totalConstantMemory); + SCLogDebug("Device SIMD Width(Warp Size): %d", + devices->devices[i]->prop.SIMDWidth); + SCLogDebug("Device Maximum Mem Pitch: %d", devices->devices[i]->prop.memPitch); + SCLogDebug("Device Total Registers Available Per Block: %d", + devices->devices[i]->prop.regsPerBlock); + SCLogDebug("Device Clock Frequency: %d", devices->devices[i]->prop.clockRate); + SCLogDebug("Device Texture Alignment Requirement: %d", + devices->devices[i]->prop.textureAlign); + + + /* device attributes */ + SCLogDebug("Device Max Threads Per Block: %d", + devices->devices[i]->attr_max_threads_per_block); + SCLogDebug("Device Max Block Dim X: %d", + devices->devices[i]->attr_max_block_dim_x); + SCLogDebug("Device Max Block Dim Y: %d", + devices->devices[i]->attr_max_block_dim_y); + SCLogDebug("Device Max Block Dim Z: %d", + devices->devices[i]->attr_max_block_dim_z); + SCLogDebug("Device Max Grid Dim X: %d", + devices->devices[i]->attr_max_grid_dim_x); + SCLogDebug("Device Max Grid Dim Y: %d", + devices->devices[i]->attr_max_grid_dim_y); + SCLogDebug("Device Max Grid Dim Z: %d", + devices->devices[i]->attr_max_grid_dim_z); + SCLogDebug("Device Max Shared Memory Per Block: %d", + devices->devices[i]->attr_max_shared_memory_per_block); + SCLogDebug("Device Total Constant Memory: %d", + devices->devices[i]->attr_total_constant_memory); + SCLogDebug("Device Warp Size: %d", devices->devices[i]->attr_warp_size); + SCLogDebug("Device Max Pitch: %d", devices->devices[i]->attr_max_pitch); + SCLogDebug("Device Max Registers Per Block: %d", + devices->devices[i]->attr_max_registers_per_block); + SCLogDebug("Device Clock Rate: %d", devices->devices[i]->attr_clock_rate); + SCLogDebug("Device Texture Alignement: %d", + devices->devices[i]->attr_texture_alignment); + SCLogDebug("Device GPU Overlap: %s", + (devices->devices[i]->attr_gpu_overlap == 1) ? "Yes": "No"); + SCLogDebug("Device Multiprocessor Count: %d", + devices->devices[i]->attr_multiprocessor_count); + SCLogDebug("Device Kernel Exec Timeout: %s", + (devices->devices[i]->attr_kernel_exec_timeout) ? "Yes": "No"); + SCLogDebug("Device Integrated With Memory Subsystem: %s", + (devices->devices[i]->attr_integrated) ? "Yes": "No"); + SCLogDebug("Device Can Map Host Memory: %s", + (devices->devices[i]->attr_can_map_host_memory) ? "Yes": "No"); + if (devices->devices[i]->attr_compute_mode == CU_COMPUTEMODE_DEFAULT) + SCLogDebug("Device Compute Mode: CU_COMPUTEMODE_DEFAULT"); + else if (devices->devices[i]->attr_compute_mode == CU_COMPUTEMODE_EXCLUSIVE) + SCLogDebug("Device Compute Mode: CU_COMPUTEMODE_EXCLUSIVE"); + else if (devices->devices[i]->attr_compute_mode == CU_COMPUTEMODE_PROHIBITED) + SCLogDebug("Device Compute Mode: CU_COMPUTEMODE_PROHIBITED"); + } + + return; +} + +/** + * \brief Prints some basic information for the default device(the first devie) + * we will be using on this cuda platform for use by our engine. This + * function is basically to be used to print some minimal information to + * the user at engine startup. + * + * \param devices Pointer to a SCCudaDevices instance that holds information on + * the devices. + */ +void SCCudaPrintBasicDeviceInfo(SCCudaDevices *devices) +{ + int i = 0; + + if (devices == NULL) { + SCLogError(SC_ERR_CUDA_ERROR, "CUDA environment not initialized. " + "Please initialized the CUDA environment by calling " + "SCCudaInitCudaEnvironment() before making any calls " + "to the CUDA API."); + return; + } + + for (i = 0; i < devices->count; i++) { + SCLogInfo("GPU Device %d: %s, %d Multiprocessors, %dMHz, CUDA Compute " + "Capability %d.%d", i + 1, + devices->devices[i]->name, + devices->devices[i]->attr_multiprocessor_count, + devices->devices[i]->attr_clock_rate/1000, + devices->devices[i]->major_rev, + devices->devices[i]->minor_rev); + } + + return; +} + +/** + * \brief Gets the device list, for the CUDA platform environment initialized by + * the engine. + * + * \retval devices Pointer to the CUDA device list on success; NULL on failure. + */ +SCCudaDevices *SCCudaGetDeviceList(void) +{ + if (devices == NULL) { + SCLogError(SC_ERR_CUDA_ERROR, "CUDA environment not initialized. " + "Please initialized the CUDA environment by calling " + "SCCudaInitCudaEnvironment() before making any calls " + "to the CUDA API."); + return NULL; + } + + return devices; +} + +/*****************************Context_Management_API***************************/ + +/** + * \brief Creates a new CUDA context and associates it with the calling thread. + * The flags parameter is described below. The context is created with + * a usage count of 1 and the caller of cuCtxCreate() must call + * cuCtxDestroy() or cuCtxDetach() when done using the context. If a + * context is already current to the thread, it is supplanted by the + * newly created context and may be restored by a subsequent call to + * cuCtxPopCurrent(). The two LSBs of the flags parameter can be used + * to control how the OS thread, which owns the CUDA context at the + * time of an API call, interacts with the OS scheduler when waiting for + * results from the GPU. + * + * - CU_CTX_SCHED_AUTO: The default value if the flags parameter is zero, + * uses a heuristic based on the number of active CUDA contexts in + * the process C and the number of logical processors in the system + * P. If C > P, then CUDA will yield to other OS threads when + * waiting for the GPU, otherwise CUDA will not yield while waiting + * for results and actively spin on the processor. + * - CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for + * results from the GPU. This can de-crease latency when waiting for + * the GPU, but may lower the performance of CPU threads if they are + * performing work in parallel with the CUDA thread. + * - CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting + * for results from the GPU. This can increase latency when waiting + * for the GPU, but can increase the performance of CPU threads + * performing work in parallel with the GPU. + * - CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work. + * - CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. + * This flag must be set in order to allocate pinned host memory + * that is accessible to the GPU. + * + * Note to Linux users: + * Context creation will fail with CUDA_ERROR_UNKNOWN if the compute mode + * of the device is CU_COMPUTEMODE_PROHIBITED. Similarly, context creation + * will also fail with CUDA_ERROR_UNKNOWN if the compute mode for the + * device is set to CU_COMPUTEMODE_EXCLUSIVE and there is already an + * active context on the device. The function cuDeviceGetAttribute() can + * be used with CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute + * mode of the device. The nvidia-smi tool can be used to set the compute + * mode for devices. Documentation for nvidia-smi can be obtained by + * passing a -h option to it. + * + * \param pctx Returned context handle of the current context. + * \param flags Context creation flags. + * \param dev Device to create context on. + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev) +{ + CUresult result = 0; + + if (pctx == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "pctx NULL"); + goto error; + } + + result = cuCtxCreate(pctx, flags, dev); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_CREATE) == -1) + goto error; + + return 0; + + error: + return -1; +} + +/** + * \brief Destroys the CUDA context specified by ctx. If the context usage count + * is not equal to 1, or the context is current to any CPU thread other + * than the current one, this function fails. Floating contexts (detached + * from a CPU thread via cuCtxPopCurrent()) may be destroyed by this + * function. + * + * \param ctx Context to destroy. + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaCtxDestroy(CUcontext ctx) +{ + CUresult result = 0; + + result = cuCtxDestroy(ctx); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_DESTROY) == -1) + goto error; + + return 0; + + error: + return -1; +} + +int SCCudaCtxGetApiVersion(CUcontext ctx, unsigned int *version) +{ + CUresult result = 0; + + if (version == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "version NULL"); + goto error; + } + + result = cuCtxGetApiVersion(ctx, version); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_GET_API_VERSION) == -1) + goto error; + + return 0; + + error: + return -1; +} + +int SCCudaCtxGetCacheConfig(CUfunc_cache *pconfig) +{ + CUresult result = 0; + + if (pconfig == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "pconfig NULL"); + goto error; + } + + result = cuCtxGetCacheConfig(pconfig); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_GET_CACHE_CONFIG) == -1) + goto error; + + return 0; + + error: + return -1; +} + +int SCCudaCtxGetCurrent(CUcontext *pctx) +{ + CUresult result = 0; + + if (pctx == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "pctx NULL"); + goto error; + } + + result = cuCtxGetCurrent(pctx); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_GET_CURRENT) == -1) + goto error; + + return 0; + + error: + return -1; +} + +/** + * \brief Returns in *device the ordinal of the current context's device. + * + * \param device Returned device id for the current context. + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaCtxGetDevice(CUdevice *device) +{ + CUresult result = 0; + + if (device == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "device NULL"); + goto error; + } + + result = cuCtxGetDevice(device); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_GET_DEVICE) == -1) + goto error; + + return 0; + + error: + return -1; +} + +int SCCudaCtxGetLimit(size_t *pvalue, CUlimit limit) +{ + CUresult result = 0; + + result = cuCtxGetLimit(pvalue, limit); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_GET_LIMIT) == -1) + goto error; + + return 0; + + error: + return -1; +} + +/** + * \brief Pops the current CUDA context from the CPU thread. The CUDA context + * must have a usage count of 1. CUDA contexts have a usage count of 1 + * upon creation; the usage count may be incremented with cuCtxAttach() + * and decremented with cuCtxDetach(). + * + * If successful, cuCtxPopCurrent() passes back the new context handle + * in *pctx. The old context may then be made current to a different CPU + * thread by calling cuCtxPushCurrent(). + * + * Floating contexts may be destroyed by calling cuCtxDestroy(). + * + * If a context was current to the CPU thread before cuCtxCreate() or + * cuCtxPushCurrent() was called, this function makes that context + * current to the CPU thread again. + * + * \param pctx Returned new context handle. + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaCtxPopCurrent(CUcontext *pctx) +{ + CUresult result = 0; + + result = cuCtxPopCurrent(pctx); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_POP_CURRENT) == -1) + goto error; + + return 0; + + error: + return -1; +} + +/** + * \brief Pushes the given context ctx onto the CPU thread's stack of current + * contexts. The speci?ed context becomes the CPU thread's current + * context, so all CUDA functions that operate on the current context + * are affected. + * + * The previous current context may be made current again by calling + * cuCtxDestroy() or cuCtxPopCurrent(). + * + * The context must be "floating," i.e. not attached to any thread. + * Contexts are made to float by calling cuCtxPopCurrent(). + * + * \param ctx Floating context to attach. + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaCtxPushCurrent(CUcontext ctx) +{ + CUresult result = 0; + + result = cuCtxPushCurrent(ctx); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_PUSH_CURRENT) == -1) + goto error; + + return 0; + + error: + return -1; +} + +int SCCudaCtxSetCacheConfig(CUfunc_cache config) +{ + CUresult result = 0; + + result = cuCtxSetCacheConfig(config); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_SET_CACHE_CONFIG) == -1) + goto error; + + return 0; + + error: + return -1; +} + +int SCCudaCtxSetCurrent(CUcontext ctx) +{ + CUresult result = 0; + + result = cuCtxSetCurrent(ctx); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_SET_CURRENT) == -1) + goto error; + + return 0; + + error: + return -1; +} + +int SCCudaCtxSetLimit(CUlimit limit, size_t value) +{ + CUresult result = 0; + + result = cuCtxSetLimit(value, limit); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_SET_LIMIT) == -1) + goto error; + + return 0; + + error: + return -1; +} + +/** + * \brief Blocks until the device has completed all preceding requested tasks. + * cuCtxSynchronize() returns an error if one of the preceding tasks failed. + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaCtxSynchronize(void) +{ + CUresult result = 0; + + result = cuCtxSynchronize(); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_SYNCHRONIZE) == -1) + goto error; + + return 0; + + error: + return -1; +} + +/** + * \brief Increments the usage count of the context and passes back a context + * handle in *pctx that must be passed to cuCtxDetach() when the + * application is done with the context. cuCtxAttach() fails if there is + * no context current to the thread. Currently, the flags parameter must + * be 0. + * + * \param pctx Returned context handle of the current context. + * \param flags Context attach flags (must be 0). + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaCtxAttach(CUcontext *pctx, unsigned int flags) +{ + CUresult result = 0; + + SCLogInfo("Cuda API - %s deprecated", + SCMapEnumValueToName(SC_CUDA_CU_CTX_ATTACH, + sc_cuda_api_names_string_map)); + + if (pctx == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "pctx NULL"); + goto error; + } + + result = cuCtxAttach(pctx, flags); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_ATTACH) == -1) + goto error; + + return 0; + + error: + return -1; +} + +/** + * \brief Decrements the usage count of the context ctx, and destroys the + * context if the usage count goes to 0. The context must be a handle + * that was passed back by cuCtxCreate() or cuCtxAttach(), and must be + * current to the calling thread. + * + * \param ctx Context to destroy. + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaCtxDetach(CUcontext ctx) +{ + CUresult result = 0; + + SCLogInfo("Cuda API - %s deprecated", + SCMapEnumValueToName(SC_CUDA_CU_CTX_DETACH, + sc_cuda_api_names_string_map)); + + result = cuCtxDetach(ctx); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_DETACH) == -1) + goto error; + + return 0; + + error: + return -1; +} + +/*****************************Module_Management_API****************************/ + +/** + * \brief Returns in *hfunc the handle of the function of name \"name\" located + * in module hmod. If no function of that name exists, + * cuModuleGetFunction() returns CUDA_ERROR_NOT_FOUND. + * + * \param hfunc Returned function handle. + * \param hmod Module to return function from. + * \param name Name of function to retrieve. + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name) +{ + CUresult result = 0; + + if (hfunc == NULL || name == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "hfunc is NULL or name is NULL"); + goto error; + } + + result = cuModuleGetFunction(hfunc, hmod, name); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_GET_FUNCTION) == -1) + goto error; + + return 0; + + error: + return -1; +} + +/** + * \brief Returns in *dptr and *bytes the base pointer and size of the global + * name \"name\" located in module hmod. If no variable of that name + * exists, cuModuleGetGlobal() returns CUDA_ERROR_NOT_FOUND. Both + * parameters dptr and bytes are optional. If one of them is NULL, + * it is ignored. + * + * \param dptr Returned global device pointer. + * \param bytes Returned global size in bytes. + * \param hmod Module to return function from. + * \param name Name of global to retrieve. + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, + const char *name) +{ + CUresult result = 0; + + if (name == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "name is NULL"); + goto error; + } + + result = cuModuleGetGlobal(dptr, bytes, hmod, name); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_GET_GLOBAL) == -1) + goto error; + + return 0; + + error: + return -1; +} + +int SCCudaModuleGetSurfRef(CUsurfref *p_surf_ref, CUmodule hmod, const char *name) +{ + CUresult result = 0; + + if (p_surf_ref == NULL || name == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "p_surf_ref is NULL or name is NULL"); + goto error; + } + + result = cuModuleGetSurfRef(p_surf_ref, hmod, name); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_GET_SURF_REF) == -1) + goto error; + + return 0; + + error: + return -1; +} + +/** + * \brief Returns in *p_tex_ref the handle of the texture reference of name + * \"name\" in the module hmod. If no texture reference of that name + * exists, cuModuleGetTexRef() returns CUDA_ERROR_NOT_FOUND. This texture + * reference handle should not be destroyed, since it will be destroyed + * when the module is unloaded. + * + * \param p_tex_ref Returned global device pointer. + * \param hmod Module to retrieve texture reference from. + * \param name Name of the texture reference to retrieve. + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaModuleGetTexRef(CUtexref *p_tex_ref, CUmodule hmod, const char *name) +{ + CUresult result = 0; + + if (p_tex_ref == NULL || name == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "p_tex_ref is NULL or name is NULL"); + goto error; + } + + result = cuModuleGetTexRef(p_tex_ref, hmod, name); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_GET_TEX_REF) == -1) + goto error; + + return 0; error: return -1; } /** - * \brief Perform a 2D memory copy according to the parameters specified in - * p_copy. The CUDA_MEMCPY2D structure is defined as: - * - * typedef struct CUDA_MEMCPY2D_st { - * unsigned int srcXInBytes, srcY; - * CUmemorytype srcMemoryType; - * const void *srcHost; - * CUdeviceptr srcDevice; - * CUarray srcArray; - * unsigned int srcPitch; - * unsigned int dstXInBytes, dstY; - * CUmemorytype dstMemoryType; - * void *dstHost; - * CUdeviceptr dstDevice; - * CUarray dstArray; - * unsigned int dstPitch; - * unsigned int WidthInBytes; - * unsigned int Height; - * } CUDA_MEMCPY2D; - * - * where: - * - * - srcMemoryType and dstMemoryType specify the type of memory of the - * source and destination, respectively; - * - * CUmemorytype_enum is de?ned as: - * - * typedef enum CUmemorytype_enum { - * CU_MEMORYTYPE_HOST = 0x01, - * CU_MEMORYTYPE_DEVICE = 0x02, - * CU_MEMORYTYPE_ARRAY = 0x03 - * } CUmemorytype; - * - * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost and srcPitch specify - * the (host) base address of the source data and the bytes per row to - * apply. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice and srcPitch - * specify the (device) base address of the source data and the bytes per - * row to apply. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray speci?es the handle - * of the source data. srcHost, srcDevice and srcPitch are ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch specify - * the (host) base address of the destination data and the bytes per row - * to apply. dstArray is ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch - * specify the (device) base address of the destination data and the - * bytes per row to apply. dstArray is ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the handle - * of the destination data dstHost, dstDevice and dstPitch are ignored. - * - * - srcXInBytes and srcY specify the base address of the source data for - * the copy. - * - * For host pointers, the starting address is - * - * void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; - * - * For CUDA arrays, srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes and dstY specify the base address of the destination data - * for the copy. - * - * For host pointers, the base address is + * \brief Takes a filename fname and loads the corresponding module \"module\" + * into the current context. The CUDA driver API does not attempt to + * lazily allocate the resources needed by a module; if the memory for + * functions and data (constant and global) needed by the module cannot + * be allocated, cuModuleLoad() fails. The file should be a cubin file + * as output by nvcc or a PTX file, either as output by nvcc or handwrtten. * - * void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); + * \param module Returned module. + * \param fname Filename of module to load. * - * For device pointers, the starting address is + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaModuleLoad(CUmodule *module, const char *fname) +{ + CUresult result = 0; + + if (module == NULL || fname == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "module is NULL or fname is NULL"); + goto error; + } + + result = cuModuleLoad(module, fname); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_LOAD) == -1) + goto error; + + return 0; + + error: + return -1; +} + +/** + * \brief Takes a pointer image and loads the corresponding module \"module\" + * into the current context. The pointer may be obtained by mapping a + * cubin or PTX file, passing a cubin or PTX ?le as a NULL-terminated + * text string, or incorporating a cubin object into the executable + * resources and using operating system calls such as Windows + * FindResource() to obtain the pointer. * - * CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; + * \param module Returned module. + * \param image Module data to load * - * For CUDA arrays, dstXInBytes must be evenly divisible by the array - * element size. + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaModuleLoadData(CUmodule *module, const void *image) +{ + CUresult result = 0; + + if (module == NULL || image == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "module is NULL or image is NULL"); + goto error; + } + + result = cuModuleLoadData(module, image); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_LOAD_DATA) == -1) + goto error; + + return 0; + + error: + return -1; +} + +/** + * \brief Takes a pointer image and loads the corresponding module module into + * the current context. The pointer may be obtained by mapping a cubin or + * PTX file, passing a cubin or PTX file as a NULL-terminated text + * string, or incorporating a cubin object into the executable resources + * and using operating system calls such as Windows FindResource() to + * obtain the pointer. Options are passed as an array via options and any + * corresponding parameters are passed in optionValues. The number of + * total options is supplied via numOptions. Any outputs will be returned + * via optionValues. Supported options are: * - * - WidthInBytes and Height specify the width (in bytes) and height of - * the 2D copy being performed. Any pitches must be greater than or - * equal to WidthInBytes. + * - CU_JIT_MAX_REGISTERS: input specifies the maximum number of registers + * per thread; + * - CU_JIT_THREADS_PER_BLOCK: input specifies number of threads per block + * to target compilation for; output returns the number of threads + * the compiler actually targeted; + * - CU_JIT_WALL_TIME: output returns the float value of wall clock time, + * in milliseconds, spent compiling the PTX code; + * - CU_JIT_INFO_LOG_BUFFER: input is a pointer to a buffer in which to + * print any informational log messages from PTX assembly; + * - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: input is the size in bytes of the + * buffer; output is the number of bytes filled with messages; + * - CU_JIT_ERROR_LOG_BUFFER: input is a pointer to a buffer in which to + * print any error log messages from PTX assembly; + * - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: input is the size in bytes of the + * buffer; output is the number of bytes filled with messages; + * - CU_JIT_OPTIMIZATION_LEVEL: input is the level of optimization to apply + * to generated code (0 - 4), with 4 being the default and highest + * level; + * - CU_JIT_TARGET_FROM_CUCONTEXT: causes compilation target to be + * determined based on current attached context (default); + * - CU_JIT_TARGET: input is the compilation target based on supplied + * CUjit_target_enum; possible values are: + * -- CU_TARGET_COMPUTE_10 + * -- CU_TARGET_COMPUTE_11 + * -- CU_TARGET_COMPUTE_12 + * -- CU_TARGET_COMPUTE_13 * - * cuMemcpy2D() returns an error if any pitch is greater than the - * maximum allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). cuMemAllocPitch() - * passes back pitches that always work with cuMemcpy2D(). On intra-device - * memory copies (device ? device, CUDA array ? device, CUDA array ? - * CUDA array), cuMemcpy2D() may fail for pitches not computed by - * cuMemAllocPitch(). cuMemcpy2DUnaligned() does not have this restriction, - * but may run signi?cantly slower in the cases where cuMemcpy2D() would - * have returned an error code. + * \param module Returned module. + * \param image Module data to load. + * \param numOptions Number of options. + * \param options Options for JIT. + * \param optionValues Option values for JIT. * - * cuMemcpy2DAsync() is asynchronous and can optionally be associated to a - * stream by passing a non-zero hStream argument. It only works on - * page-locked host memory and returns an error if a pointer to pageable - * memory is passed as input. + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaModuleLoadDataEx(CUmodule *module, const void *image, + unsigned int num_options, CUjit_option *options, + void **option_values) +{ + CUresult result = 0; + + if (module == NULL || image == NULL || options == NULL || + option_values == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "module is NULL or image is NULL or options is NULL or " + "option_values is NULL"); + goto error; + } + + result = cuModuleLoadDataEx(module, image, num_options, options, option_values); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_LOAD_DATA_EX) == -1) + goto error; + + return 0; + + error: + return -1; +} + +/** + * \brief Takes a pointer fat_cubin and loads the corresponding module \"module\" + * into the current context. The pointer represents a fat binary object, + * which is a collection of different cubin files, all representing the + * same device code, but compiled and optimized for different + * architectures. There is currently no documented API for constructing + * and using fat binary objects by programmers, and therefore this + * function is an internal function in this version of CUDA. More + * information can be found in the nvcc document. * - * \param p_copy Parameters for the memory copy. - * \param h_stream Stream identifier. + * \param module Returned module. + * \param fatCubin Fat binary to load. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemcpy2DUnaligned(const CUDA_MEMCPY2D *p_copy) +int SCCudaModuleLoadFatBinary(CUmodule *module, const void *fat_cubin) { CUresult result = 0; - if (p_copy == NULL) { + if (module == NULL || fat_cubin == NULL) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_copy is NULL"); + "module is NULL or fatCubin is NULL"); goto error; } - result = cuMemcpy2DUnaligned(p_copy); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_2D_UNALIGNED) == -1) + result = cuModuleLoadFatBinary(module, fat_cubin); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_LOAD_FAT_BINARY) == -1) goto error; return 0; @@ -1131,123 +1887,19 @@ int SCCudaMemcpy2DUnaligned(const CUDA_MEMCPY2D *p_copy) } /** - * \brief Perform a 3D memory copy according to the parameters specified in - * p_copy. The CUDA_MEMCPY3D structure is defined as: - * - * typedef struct CUDA_MEMCPY3D_st { - * unsigned int srcXInBytes, srcY, srcZ; - * unsigned int srcLOD; - * CUmemorytype srcMemoryType; - * const void *srcHost; - * CUdeviceptr srcDevice; - * CUarray srcArray; - * unsigned int srcPitch; // ignored when src is array - * unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 - * unsigned int dstXInBytes, dstY, dstZ; - * unsigned int dstLOD; - * CUmemorytype dstMemoryType; - * void *dstHost; - * CUdeviceptr dstDevice; - * CUarray dstArray; - * unsigned int dstPitch; // ignored when dst is array - * unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 - * unsigned int WidthInBytes; - * unsigned int Height; - * unsigned int Depth; - * } CUDA_MEMCPY3D; - * - * where: - * - * - srcMemoryType and dstMemoryType specify the type of memory of the - * source and destination, respectively; - * CUmemorytype_enum is defined as: - * - * typedef enum CUmemorytype_enum { - * CU_MEMORYTYPE_HOST = 0x01, - * CU_MEMORYTYPE_DEVICE = 0x02, - * CU_MEMORYTYPE_ARRAY = 0x03 - * } CUmemorytype; - * - * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost, srcPitch and srcHeight - * specify the (host) base address of the source data, the bytes per row, - * and the height of each 2D slice of the 3D array. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice, srcPitch and - * srcHeight specify the (device) base address of the source data, the - * bytes per row, and the height of each 2D slice of the 3D array. - * srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray specifies the handle - * of the source data. srcHost, srcDevice, srcPitch and srcHeight are - * ignored. If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch - * specify the (host) base address of the destination data, the bytes per - * row, and the height of each 2D slice of the 3D array. dstArray is - * ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch - * specify the (device) base address of the destination data, the bytes - * per row, and the height of each 2D slice of the 3D array. dstArray is - * ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the - * handle of the destination data. dstHost, dstDevice, dstPitch and - * dstHeight are ignored. - * - * - srcXInBytes, srcY and srcZ specify the base address of the source - * data for the copy. - * - * For host pointers, the starting address is - * - * void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; - * - * For CUDA arrays, srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes, dstY and dstZ specify the base address of the destination - * data for the copy. - * - * For host pointers, the base address is - * - * void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; - * - * For CUDA arrays, dstXInBytes must be evenly divisible by the array - * element size. - * - * - WidthInBytes, Height and Depth specify the width (in bytes), height - * and depth of the 3D copy being performed. Any pitches must be greater - * than or equal to WidthInBytes. - * - * cuMemcpy3D() returns an error if any pitch is greater than the maximum - * allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). - * - * The srcLOD and dstLOD members of the CUDA_MEMCPY3D structure must be - * set to 0. + * \brief Unloads a module hmod from the current context. * - * \param p_copy Parameters for the memory copy. + * \param module Module to unload * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemcpy3D(const CUDA_MEMCPY3D *p_copy) +int SCCudaModuleUnload(CUmodule hmod) { CUresult result = 0; - if (p_copy == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_copy is NULL"); - goto error; - } - - result = cuMemcpy3D(p_copy); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_3D) == -1) + result = cuModuleUnload(hmod); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_UNLOAD) == -1) goto error; return 0; @@ -1256,129 +1908,95 @@ int SCCudaMemcpy3D(const CUDA_MEMCPY3D *p_copy) return -1; } +/****************************Memory_Management_API*****************************/ + /** - * \brief Perform a 3D memory copy according to the parameters specified in - * p_copy. The CUDA_MEMCPY3D structure is defined as: + * \brief Creates a CUDA array according to the CUDA_ARRAY3D_DESCRIPTOR + * structure pAllocateArray and returns a handle to the new CUDA + * array in *p_handle. The CUDA_ARRAY3D_DESCRIPTOR is defined as: * - * typedef struct CUDA_MEMCPY3D_st { - * unsigned int srcXInBytes, srcY, srcZ; - * unsigned int srcLOD; - * CUmemorytype srcMemoryType; - * const void *srcHost; - * CUdeviceptr srcDevice; - * CUarray srcArray; - * unsigned int srcPitch; // ignored when src is array - * unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 - * unsigned int dstXInBytes, dstY, dstZ; - * unsigned int dstLOD; - * CUmemorytype dstMemoryType; - * void *dstHost; - * CUdeviceptr dstDevice; - * CUarray dstArray; - * unsigned int dstPitch; // ignored when dst is array - * unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 - * unsigned int WidthInBytes; + * typedef struct { + * unsigned int Width; * unsigned int Height; * unsigned int Depth; - * } CUDA_MEMCPY3D; + * CUarray_format Format; + * unsigned int NumChannels; + * unsigned int Flags; + * } CUDA_ARRAY3D_DESCRIPTOR; * * where: * - * - srcMemoryType and dstMemoryType specify the type of memory of the - * source and destination, respectively; - * CUmemorytype_enum is defined as: - * - * typedef enum CUmemorytype_enum { - * CU_MEMORYTYPE_HOST = 0x01, - * CU_MEMORYTYPE_DEVICE = 0x02, - * CU_MEMORYTYPE_ARRAY = 0x03 - * } CUmemorytype; - * - * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost, srcPitch and srcHeight - * specify the (host) base address of the source data, the bytes per row, - * and the height of each 2D slice of the 3D array. srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice, srcPitch and - * srcHeight specify the (device) base address of the source data, the - * bytes per row, and the height of each 2D slice of the 3D array. - * srcArray is ignored. - * - * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray specifies the handle - * of the source data. srcHost, srcDevice, srcPitch and srcHeight are - * ignored. If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch - * specify the (host) base address of the destination data, the bytes per - * row, and the height of each 2D slice of the 3D array. dstArray is - * ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch - * specify the (device) base address of the destination data, the bytes - * per row, and the height of each 2D slice of the 3D array. dstArray is - * ignored. - * - * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the - * handle of the destination data. dstHost, dstDevice, dstPitch and - * dstHeight are ignored. - * - * - srcXInBytes, srcY and srcZ specify the base address of the source - * data for the copy. - * - * For host pointers, the starting address is - * - * void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); - * - * For device pointers, the starting address is - * - * CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; - * - * For CUDA arrays, srcXInBytes must be evenly divisible by the array - * element size. - * - * - dstXInBytes, dstY and dstZ specify the base address of the destination - * data for the copy. + * - Width, Height, and Depth are the width, height, and depth of the + * CUDA array (in elements); the CUDA array is one-dimensional if +v * height and depth are 0, two-dimensional if depth is 0, and + * three-dimensional otherwise; + * - Format speci?es the format of the elements; CUarray_format is + * defined as: * - * For host pointers, the base address is + * typedef enum CUarray_format_enum { + * CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, + * CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, + * CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, + * CU_AD_FORMAT_SIGNED_INT8 = 0x08, + * CU_AD_FORMAT_SIGNED_INT16 = 0x09, + * CU_AD_FORMAT_SIGNED_INT32 = 0x0a, + * CU_AD_FORMAT_HALF = 0x10, + * CU_AD_FORMAT_FLOAT = 0x20 + * } CUarray_format; * - * void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); + * - NumChannels speci?es the number of packed components per CUDA array + * element; it may be 1, 2, or 4; + * - Flags provides for future features. For now, it must be set to 0. * - * For device pointers, the starting address is + * Here are examples of CUDA array descriptions: * - * CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; + * Description for a CUDA array of 2048 floats: * - * For CUDA arrays, dstXInBytes must be evenly divisible by the array - * element size. + * CUDA_ARRAY3D_DESCRIPTOR desc; + * desc.Format = CU_AD_FORMAT_FLOAT; + * desc.NumChannels = 1; + * desc.Width = 2048; + * desc.Height = 0; + * desc.Depth = 0; * - * - WidthInBytes, Height and Depth specify the width (in bytes), height - * and depth of the 3D copy being performed. Any pitches must be greater - * than or equal to WidthInBytes. + * Description for a 64 x 64 CUDA array of floats: * - * cuMemcpy3D() returns an error if any pitch is greater than the maximum - * allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). + * CUDA_ARRAY3D_DESCRIPTOR desc; + * desc.Format = CU_AD_FORMAT_FLOAT; + * desc.NumChannels = 1; + * desc.Width = 64; + * desc.Height = 64; + * desc.Depth = 0; * - * cuMemcpy3DAsync() is asynchronous and can optionally be associated - * to a stream by passing a non-zero hStream argument. It only works on - * page-locked host memory and returns an error if a pointer to pageable - * memory is passed as input. + * Description for a width x height x depth CUDA array of 64-bit, + * 4x16-bit float16's: * - * The srcLOD and dstLOD members of the CUDA_MEMCPY3D structure must be - * set to 0. + * CUDA_ARRAY3D_DESCRIPTOR desc; + * desc.FormatFlags = CU_AD_FORMAT_HALF; + * desc.NumChannels = 4; + * desc.Width = width; + * desc.Height = height; + * desc.Depth = depth; * - * \param p_copy Parameters for the memory copy. + * \param p_handle Returned Handle. + * \param p_allocate_array 3D array descriptor. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemcpy3DAsync(const CUDA_MEMCPY3D *p_copy, CUstream h_stream) +int SCCudaArray3DCreate(CUarray *p_handle, + const CUDA_ARRAY3D_DESCRIPTOR *p_allocate_array) { CUresult result = 0; - if (p_copy == NULL) { + if (p_handle == NULL) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_copy is NULL"); + "p_handle is NULL"); goto error; } - result = cuMemcpy3DAsync(p_copy, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_3D_ASYNC) == -1) + result = cuArray3DCreate(p_handle, p_allocate_array); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_3D_CREATE) == -1) goto error; return 0; @@ -1388,33 +2006,33 @@ int SCCudaMemcpy3DAsync(const CUDA_MEMCPY3D *p_copy, CUstream h_stream) } /** - * \brief Copies from one 1D CUDA array to another. dstArray and srcArray - * specify the handles of the destination and source CUDA arrays for the - * copy, respectively. dstIndex and srcIndex specify the destination and - * source indices into the CUDA array. These values are in the range - * [0, Width-1] for the CUDA array; they are not byte offsets. ByteCount - * is the number of bytes to be copied. The size of the elements in the - * CUDA arrays need not be the same format, but the elements must be the - * same size; and count must be evenly divisible by that size. + * \brief Returns in *p_rray_descriptor a descriptor containing information on + * the format and dimensions of the CUDA array h_array. It is useful for + * subroutines that have been passed a CUDA array, but need to know the + * CUDA array parameters for validation or other purposes. * - * \param dst_array Destination array. - * \param dst_index Offset of destination array. - * \param src_array Source array. - * \param src_index Offset of source array. - * \param byte_count Size of memory copy in bytes. + * This function may be called on 1D and 2D arrays, in which case the + * Height and/or Depth members of the descriptor struct will be set to 0. + * + * \param p_array_descriptor Returned 3D array descriptor. + * \param h_array 3D array to get descriptor of. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemcpyAtoA(CUarray dst_array, unsigned int dst_index, - CUarray src_array, unsigned int src_index, - unsigned int byte_count) +int SCCudaArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *p_array_descriptor, + CUarray h_array) { CUresult result = 0; - result = cuMemcpyAtoA(dst_array, dst_index, src_array, src_index, - byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_A_TO_A) == -1) + if (p_array_descriptor == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "p_array_descriptor is NULL"); + goto error; + } + + result = cuArray3DGetDescriptor(p_array_descriptor, h_array); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_3D_GET_DESCRIPTOR) == -1) goto error; return 0; @@ -1424,28 +2042,94 @@ int SCCudaMemcpyAtoA(CUarray dst_array, unsigned int dst_index, } /** - * \param Copies from one 1D CUDA array to device memory. dstDevice specifies the - * base pointer of the destination and must be naturally aligned with the - * CUDA array elements. hSrc and SrcIndex specify the CUDA array handle and - * the index (in array elements) of the array element where the copy is - * to begin. ByteCount speci?es the number of bytes to copy and must be - * evenly divisible by the array element size. + * \brief Creates a CUDA array according to the CUDA_ARRAY_DESCRIPTOR structure + * p_allocate_array and returns a handle to the new CUDA array in + * p_handle. The CUDA_ARRAY_DESCRIPTOR is defined as: * - * \param dst_device Destination device pointer. - * \param h_src Source array. - * \param src_index Offset of source array. - * \param byte_count Size of memory copy in bytes. + * typedef struct { + * unsigned int Width; + * unsigned int Height; + * CUarray_format Format; + * unsigned int NumChannels; + * } CUDA_ARRAY_DESCRIPTOR; + * + * where: + * + * - Width, and Height are the width, and height of the CUDA array + * (in elements); the CUDA array is one-dimensional if height is 0, + * two-dimensional otherwise; + * - Format speci?es the format of the elements; CUarray_format is + * defined as: + * + * typedef enum CUarray_format_enum { + * CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, + * CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, + * CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, + * CU_AD_FORMAT_SIGNED_INT8 = 0x08, + * CU_AD_FORMAT_SIGNED_INT16 = 0x09, + * CU_AD_FORMAT_SIGNED_INT32 = 0x0a, + * CU_AD_FORMAT_HALF = 0x10, + * CU_AD_FORMAT_FLOAT = 0x20 + * } CUarray_format; + * + * - NumChannels specifies the number of packed components per CUDA + * array element; it may be 1, 2, or 4; + * + * Here are examples of CUDA array descriptions: + * + * Description for a CUDA array of 2048 floats: + * + * CUDA_ARRAY_DESCRIPTOR desc; + * desc.Format = CU_AD_FORMAT_FLOAT; + * desc.NumChannels = 1; + * desc.Width = 2048; + * desc.Height = 1; + * + * Description for a 64 x 64 CUDA array of floats: + * + * CUDA_ARRAY_DESCRIPTOR desc; + * desc.Format = CU_AD_FORMAT_FLOAT; + * desc.NumChannels = 1; + * desc.Width = 64; + * desc.Height = 64; + * + * Description for a width x height CUDA array of 64-bit, 4x16-bit + * float16's: + * + * CUDA_ARRAY_DESCRIPTOR desc; + * desc.FormatFlags = CU_AD_FORMAT_HALF; + * desc.NumChannels = 4; + * desc.Width = width; + * desc.Height = height; + * + * Description for a width x height CUDA array of 16-bit elements, each + * of which is two 8-bit unsigned chars: + * + * CUDA_ARRAY_DESCRIPTOR arrayDesc; + * desc.FormatFlags = CU_AD_FORMAT_UNSIGNED_INT8; + * desc.NumChannels = 2; + * desc.Width = width; + * desc.Height = height; + * + * \param p_handle Returned array. + * \param p_allocate_array Array descriptor. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemcpyAtoD(CUdeviceptr dst_device, CUarray h_src, - unsigned int src_index, unsigned int byte_count) +int SCCudaArrayCreate(CUarray *p_handle, + const CUDA_ARRAY_DESCRIPTOR *p_allocate_array) { CUresult result = 0; - result = cuMemcpyAtoD(dst_device, h_src, src_index, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_A_TO_D) == -1) + if (p_handle == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "p_handle is NULL"); + goto error; + } + + result = cuArrayCreate(p_handle, p_allocate_array); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_CREATE) == -1) goto error; return 0; @@ -1454,27 +2138,19 @@ int SCCudaMemcpyAtoD(CUdeviceptr dst_device, CUarray h_src, return -1; } + /** - * \param Copies from one 1D CUDA array to host memory. dstHost specifies the - * base pointer of the destination. srcArray and srcIndex specify the - * CUDA array handle and starting index of the source data. ByteCount - * specifies the number of bytes to copy. + * \brief Destroys the CUDA array h_array. * - * \param dst_device Destination device pointer. - * \param h_src Source array. - * \param src_index Offset of source array. - * \param byte_count Size of memory copy in bytes. + * \param h_array Array to destroy. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemcpyAtoH(void *dst_host, CUarray src_array,unsigned int src_index, - unsigned int byte_count) +int SCCudaArrayDestroy(CUarray h_array) { - CUresult result = 0; - - result = cuMemcpyAtoH(dst_host, src_array, src_index, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_A_TO_H) == -1) + int result = cuArrayDestroy(h_array); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_DESTROY) == -1) goto error; return 0; @@ -1484,34 +2160,30 @@ int SCCudaMemcpyAtoH(void *dst_host, CUarray src_array,unsigned int src_index, } /** - * \param Copies from one 1D CUDA array to host memory. dstHost specifies the - * base pointer of the destination. srcArray and srcIndex specify the - * CUDA array handle and starting index of the source data. ByteCount - * specifies the number of bytes to copy. - * - * cuMemcpyAtoHAsync() is asynchronous and can optionally be associated - * to a stream by passing a non-zero stream argument. It only works on - * page-locked host memory and returns an error if a pointer to pageable - * memory is passed as input. + * \brief Returns in *p_array_descriptor a descriptor containing information on + * the format and dimensions of the CUDA array h_array. It is useful for + * subroutines that have been passed a CUDA array, but need to know the + * CUDA array parameters for validation or other purposes. * - * \param dst_device Destination device pointer. - * \param src_array Source array. - * \param src_index Offset of source array. - * \param byte_count Size of memory copy in bytes. - * \param h_stream Stream identifier. + * \param p_array_descriptor Returned array descriptor. + * \param h_array Array to get descriptor of. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemcpyAtoHAsync(void *dst_host, CUarray src_array, - unsigned int src_index, unsigned int byte_count, - CUstream h_stream) +int SCCudaArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *p_array_descriptor, + CUarray h_array) { CUresult result = 0; - result = cuMemcpyAtoHAsync(dst_host, src_array, src_index, byte_count, - h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_A_TO_H_ASYNC) == -1) + if (p_array_descriptor == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "p_array_descriptor is NULL"); + goto error; + } + + result = cuArrayGetDescriptor(p_array_descriptor, h_array); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_ARRAY_GET_DESCRIPTOR) == -1) goto error; return 0; @@ -1520,180 +2192,122 @@ int SCCudaMemcpyAtoHAsync(void *dst_host, CUarray src_array, return -1; } -/** - * \brief Copies from device memory to a 1D CUDA array. dstArray and dstIndex - * specify the CUDA array handle and starting index of the destination - * data. srcDevice speci?es the base pointer of the source. ByteCount - * specifies the number of bytes to copy. - * - * \param dst_array Destination array. - * \param dst_index Offset of destination array. - * \param src_device Source device pointer. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyDtoA(CUarray dst_array, unsigned int dst_index, - CUdeviceptr src_device, unsigned int byte_count) +int SCCudaDeviceGetByPCIBusId(CUdevice *dev, char *pci_bus_id) { CUresult result = 0; - result = cuMemcpyDtoA(dst_array, dst_index, src_device, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_A) == -1) + result = cuDeviceGetByPCIBusId(dev, pci_bus_id); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_BY_PCI_BUS_ID) == -1) goto error; return 0; - error: return -1; } -/** - * \brief Copies from device memory to device memory. dstDevice and srcDevice are - * the base pointers of the destination and source, respectively. - * byte_count specifies the number of bytes to copy. Note that this - * function is asynchronous. - * - * \param dst_device Destination device pointer. - * \param src_device Source device pointer. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyDtoD(CUdeviceptr dst_device, CUdeviceptr src_device, - unsigned int byte_count) +int SCCudaDeviceGetPCIBusId(char *pci_bus_id, int len, CUdevice dev) { CUresult result = 0; - result = cuMemcpyDtoD(dst_device, src_device, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_D) == -1) + result = cuDeviceGetPCIBusId(pci_bus_id, len, dev); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_PCI_BUS_ID) == -1) goto error; return 0; + error: + return -1; +} + +int SCCudaIpcCloseMemHandle(CUdeviceptr dptr) +{ + CUresult result = 0; + + result = cuIpcCloseMemHandle(dptr); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_IPC_CLOSE_MEM_HANDLE) == -1) + goto error; + return 0; error: return -1; } -/** - * \brief Copies from device to host memory. dst_host and src_device specify - * the base pointers of the destination and source, respectively. - * byte_count specifies the number of bytes to copy. Note that this - * function is synchronous. - * - * \param dst_host Destination device pointer. - * \param src_device Source device pointer. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyDtoH(void *dst_host, CUdeviceptr src_device, - unsigned int byte_count) +int SCCudaIpcGetEventHandle(CUipcEventHandle *p_handle, CUevent event) +{ + CUresult result = 0; + + result = cuIpcGetEventHandle(p_handle, event); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_IPC_GET_MEM_HANDLE) == -1) + goto error; + + return 0; + error: + return -1; +} + +int SCCudaIpcGetMemHandle(CUipcMemHandle *p_handle, CUdeviceptr dptr) { CUresult result = 0; - result = cuMemcpyDtoH(dst_host, src_device, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_H) == -1) + result = cuIpcGetMemHandle(p_handle, dptr); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_IPC_GET_MEM_HANDLE) == -1) goto error; return 0; - error: return -1; } -/** - * \brief Copies from device to host memory. dst_host and src_device specify - * the base pointers of the destination and source, respectively. - * byte_count specifies the number of bytes to copy. - * - * cuMemcpyDtoHAsync() is asynchronous and can optionally be associated - * to a stream by passing a non-zero h_stream argument. It only works - * on page-locked memory and returns an error if a pointer to pageable - * memory is passed as input. - * - * \param dst_host Destination device pointer. - * \param src_device Source device pointer. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyDtoHAsync(void *dst_host, CUdeviceptr src_device, - unsigned int byte_count, CUstream h_stream) +int SCCudaIpcOpenEventHandle(CUevent *ph_event, CUipcEventHandle handle) { CUresult result = 0; - result = cuMemcpyDtoHAsync(dst_host, src_device, byte_count, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_H_ASYNC) == -1) + result = cuIpcOpenEventHandle(ph_event, handle); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_IPC_GET_MEM_HANDLE) == -1) goto error; return 0; - error: return -1; } -/** - * \brief Copies from host memory to a 1D CUDA array. dst_array and dst_index - * specify the CUDA array handle and starting index of the destination - * data. p_src specifies the base address of the source. byte_count - * specifies the number of bytes to copy. - * - * \param dst_array Destination array. - * \param dst_index Offset of destination array. - * \param p_src Source host pointer. - * \param byte_count Size of memory copy in bytes. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemcpyHtoA(CUarray dst_array, unsigned int dst_index, - const void *p_src, unsigned int byte_count) +int SCCudaIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, + unsigned int flags) { CUresult result = 0; - result = cuMemcpyHtoA(dst_array, dst_index, p_src, byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_H_TO_A) == -1) + result = cuIpcOpenMemHandle(pdptr, handle, flags); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_IPC_OPEN_EVENT_HANDLE) == -1) goto error; return 0; - error: return -1; } /** - * \brief Copies from host memory to a 1D CUDA array. dst_array and dst_index - * specify the CUDA array handle and starting index of the destination - * data. p_src specifies the base address of the source. byte_count - * specfies the number of bytes to copy. - * - * cuMemcpyHtoAAsync() is asynchronous and can optionally be associated - * to a stream by passing a non-zero h_stream argument. It only works on - * page-locked memory and returns an error if a pointer to pageable - * memory is passed as input. + * \brief Returns in *p_array_descriptor a descriptor containing information on + * the format and dimensions of the CUDA array h_array. It is useful for + * subroutines that have been passed a CUDA array, but need to know the + * CUDA array parameters for validation or other purposes. * - * \param dst_array Destination array. - * \param dst_index Offset of destination array. - * \param p_src Source host pointer. - * \param byte_count Size of memory copy in bytes. - * \param h_stream Stream identifier. + * \param p_array_descriptor Returned array descriptor. + * \param h_array Array to get descriptor of. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemcpyHtoAAsync(CUarray dst_array, unsigned int dst_index, - const void *p_src, unsigned int byte_count, - CUstream h_stream) +int SCCudaMemAlloc(CUdeviceptr *dptr, size_t byte_size) { CUresult result = 0; - result = cuMemcpyHtoAAsync(dst_array, dst_index, p_src, byte_count, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_H_TO_A_ASYNC) == -1) + if (dptr == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "dptr is NULL"); + goto error; + } + + result = cuMemAlloc(dptr, byte_size); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC) == -1) goto error; return 0; @@ -1703,25 +2317,36 @@ int SCCudaMemcpyHtoAAsync(CUarray dst_array, unsigned int dst_index, } /** - * \brief Copies from host memory to device memory. dst_device and src_host - * are the base addresses of the destination and source, respectively. - * byte_count specifies the number of bytes to copy. Note that this - * function is synchronous. + * \brief Allocates bytesize bytes of host memory that is page-locked and + * accessible to the device. The driver tracks the vir-tual memory + * ranges allocated with this function and automatically accelerates + * calls to functions such as cuMemcpy(). Since the memory can be + * accessed directly by the device, it can be read or written with + * much higher bandwidth than pageable memory obtained with functions + * such as SCMalloc(). Allocating excessive amounts of memory with + * cuMemAllocHost() may degrade system performance, since it reduces + * the amount of memory available to the system for paging. As a result, + * this function is best used sparingly to allocate staging areas for + * data exchange between host and device. * - * \param dst_device Destination device pointer. - * \param src_host Source host pointer. - * \param byte_count Size of memory copy in bytes. + * \param pp Returned host pointer to page-locked memory. + * \param byte_size Requested allocation size in bytes. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemcpyHtoD(CUdeviceptr dst_device, const void *src_host, - unsigned int byte_count) +int SCCudaMemAllocHost(void **pp, size_t byte_size) { CUresult result = 0; - result = cuMemcpyHtoD(dst_device, src_host,byte_count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_H_TO_D) == -1) + if (pp == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "pp is NULL"); + goto error; + } + + result = cuMemAllocHost(pp, byte_size); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC) == -1) goto error; return 0; @@ -1731,31 +2356,58 @@ int SCCudaMemcpyHtoD(CUdeviceptr dst_device, const void *src_host, } /** - * \brief Copies from host memory to device memory. dst_device and src_host are - * the base addresses of the destination and source, respectively. - * byte_count specifies the number of bytes to copy. + * \brief Allocates at least width_in_bytes * height bytes of linear memory on the + * device and returns in *dptr a pointer to the allocated memory. The + * function may pad the allocation to ensure that corresponding pointers in + * any given row will continue to meet the alignment requirements for + * coalescing as the address is updated from row to row. ElementSizeBytes + * specifies the size of the largest reads and writes that will be + * performed on the memory range. * - * cuMemcpyHtoDAsync() is asynchronous and can optionally be associated - * to a stream by passing a non-zero h_stream argument. It only works on - * page-locked memory and returns an error if a pointer to pageable - * memory is passed as input. + * element_size_bytes may be 4, 8 or 16 (since coalesced memory + * transactions are not possible on other data sizes). If element_size_bytes + * is smaller than the actual read/write size of a kernel, the kernel will + * run correctly, but possibly at reduced speed. The pitch returned in + * *p_itch by cuMemAllocPitch() is the width in bytes of the allocation. + * The intended usage of pitch is as a separate parameter of the allocation, + * used to compute addresses within the 2D array. Given the row and column + * of an array element of type T, the address is computed as: * + * T * p_element = (T*)((char*)base_address + row * pitch) + column; * - * \param dst_device Destination device pointer. - * \param src_host Source host pointer. - * \param byte_count Size of memory copy in bytes. - * \param h_stream Stream identifier. + * The pitch returned by cuMemAllocPitch() is guaranteed to work with + * cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it + * is recommended that programmers consider performing pitch allocations + * using cuMemAllocPitch(). Due to alignment restrictions in the hardware, + * this is especially true if the application will be performing 2D memory + * copies between different regions of device memory (whether linear memory + * or CUDA arrays). + * + * \param dptr Returned device pointer. + * \param p_pitch Returned pitch of allocation in bytes. + * \param width_in_bytes Requested allocation width in bytes. + * \param height Requested allocation width in rows. + * \param element_size_bytes Size of largest reads/writes for range. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemcpyHtoDAsync(CUdeviceptr dst_device, const void *src_host, - unsigned int byte_count, CUstream h_stream) +int SCCudaMemAllocPitch(CUdeviceptr *dptr, size_t *p_pitch, + size_t width_in_bytes, + size_t height, + unsigned int element_size_bytes) { CUresult result = 0; - result = cuMemcpyHtoDAsync(dst_device, src_host, byte_count, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_H_TO_D_ASYNC) == -1) + if (dptr == NULL || p_pitch == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "dptr is NULL or p_pitch is NULL"); + goto error; + } + + result = cuMemAllocPitch(dptr, p_pitch, width_in_bytes, height, + element_size_bytes); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_ALLOC_PITCH) == -1) goto error; return 0; @@ -1764,21 +2416,134 @@ int SCCudaMemcpyHtoDAsync(CUdeviceptr dst_device, const void *src_host, return -1; } +int SCCudaMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t byte_count) +{ + CUresult result = 0; + + result = cuMemcpy(dst, src, byte_count); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY) == -1) + goto error; + + return 0; + error: + return -1; +} + + /** - * \brief Frees the memory space pointed to by dptr, which must have been - * returned by a previous call to cuMemAlloc() or cuMemAllocPitch(). + * \brief Perform a 2D memory copy according to the parameters specified in + * p_copy. The CUDA_MEMCPY2D structure is defined as: + * + * typedef struct CUDA_MEMCPY2D_st { + * unsigned int srcXInBytes, srcY; + * CUmemorytype srcMemoryType; + * const void *srcHost; + * CUdeviceptr srcDevice; + * CUarray srcArray; + * unsigned int srcPitch; + * unsigned int dstXInBytes, dstY; + * CUmemorytype dstMemoryType; + * void *dstHost; + * CUdeviceptr dstDevice; + * CUarray dstArray; + * unsigned int dstPitch; + * unsigned int WidthInBytes; + * unsigned int Height; + * } CUDA_MEMCPY2D; + * + * where: + * + * - srcMemoryType and dstMemoryType specify the type of memory of the + * source and destination, respectively; + * + * CUmemorytype_enum is de?ned as: + * + * typedef enum CUmemorytype_enum { + * CU_MEMORYTYPE_HOST = 0x01, + * CU_MEMORYTYPE_DEVICE = 0x02, + * CU_MEMORYTYPE_ARRAY = 0x03 + * } CUmemorytype; + * + * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost and srcPitch specify + * the (host) base address of the source data and the bytes per row to + * apply. srcArray is ignored. + * + * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice and srcPitch + * specify the (device) base address of the source data and the bytes per + * row to apply. srcArray is ignored. + * + * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray speci?es the handle + * of the source data. srcHost, srcDevice and srcPitch are ignored. + * + * If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch specify + * the (host) base address of the destination data and the bytes per row + * to apply. dstArray is ignored. + * + * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch + * specify the (device) base address of the destination data and the + * bytes per row to apply. dstArray is ignored. + * + * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the handle + * of the destination data dstHost, dstDevice and dstPitch are ignored. + * + * - srcXInBytes and srcY specify the base address of the source data for + * the copy. + * + * For host pointers, the starting address is + * + * void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); + * + * For device pointers, the starting address is + * + * CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; + * + * For CUDA arrays, srcXInBytes must be evenly divisible by the array + * element size. + * + * - dstXInBytes and dstY specify the base address of the destination data + * for the copy. + * + * For host pointers, the base address is + * + * void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); + * + * For device pointers, the starting address is + * + * CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; + * + * For CUDA arrays, dstXInBytes must be evenly divisible by the array + * element size. + * + * - WidthInBytes and Height specify the width (in bytes) and height of + * the 2D copy being performed. Any pitches must be greater than or + * equal to WidthInBytes. * - * \param dptr Pointer to the memory to free. + * cuMemcpy2D() returns an error if any pitch is greater than the + * maximum allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). cuMemAllocPitch() + * passes back pitches that always work with cuMemcpy2D(). On intra-device + * memory copies (device ? device, CUDA array ? device, CUDA array ? + * CUDA array), cuMemcpy2D() may fail for pitches not computed by + * cuMemAllocPitch(). cuMemcpy2DUnaligned() does not have this restriction, + * but may run signi?cantly slower in the cases where cuMemcpy2D() would + * have returned an error code. + * + * \param p_copy Parameters for the memory copy. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemFree(CUdeviceptr dptr) +int SCCudaMemcpy2D(const CUDA_MEMCPY2D *p_copy) { CUresult result = 0; - result = cuMemFree(dptr); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_FREE) == -1) + if (p_copy == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "p_copy is NULL"); + goto error; + } + + result = cuMemcpy2D(p_copy); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_2D) == -1) goto error; return 0; @@ -1788,26 +2553,125 @@ int SCCudaMemFree(CUdeviceptr dptr) } /** - * \brief Frees the memory space pointed to by p, which must have been returned - * by a previous call to cuMemAllocHost(). + * \brief Perform a 2D memory copy according to the parameters specified in + * p_copy. The CUDA_MEMCPY2D structure is defined as: * - * \param p Pointer to the memory to free. + * typedef struct CUDA_MEMCPY2D_st { + * unsigned int srcXInBytes, srcY; + * CUmemorytype srcMemoryType; + * const void *srcHost; + * CUdeviceptr srcDevice; + * CUarray srcArray; + * unsigned int srcPitch; + * unsigned int dstXInBytes, dstY; + * CUmemorytype dstMemoryType; + * void *dstHost; + * CUdeviceptr dstDevice; + * CUarray dstArray; + * unsigned int dstPitch; + * unsigned int WidthInBytes; + * unsigned int Height; + * } CUDA_MEMCPY2D; + * + * where: + * + * - srcMemoryType and dstMemoryType specify the type of memory of the + * source and destination, respectively; + * + * CUmemorytype_enum is de?ned as: + * + * typedef enum CUmemorytype_enum { + * CU_MEMORYTYPE_HOST = 0x01, + * CU_MEMORYTYPE_DEVICE = 0x02, + * CU_MEMORYTYPE_ARRAY = 0x03 + * } CUmemorytype; + * + * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost and srcPitch specify + * the (host) base address of the source data and the bytes per row to + * apply. srcArray is ignored. + * + * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice and srcPitch + * specify the (device) base address of the source data and the bytes per + * row to apply. srcArray is ignored. + * + * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray speci?es the handle + * of the source data. srcHost, srcDevice and srcPitch are ignored. + * + * If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch specify + * the (host) base address of the destination data and the bytes per row + * to apply. dstArray is ignored. + * + * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch + * specify the (device) base address of the destination data and the + * bytes per row to apply. dstArray is ignored. + * + * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the handle + * of the destination data dstHost, dstDevice and dstPitch are ignored. + * + * - srcXInBytes and srcY specify the base address of the source data for + * the copy. + * + * For host pointers, the starting address is + * + * void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); + * + * For device pointers, the starting address is + * + * CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; + * + * For CUDA arrays, srcXInBytes must be evenly divisible by the array + * element size. + * + * - dstXInBytes and dstY specify the base address of the destination data + * for the copy. + * + * For host pointers, the base address is + * + * void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); + * + * For device pointers, the starting address is + * + * CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; + * + * For CUDA arrays, dstXInBytes must be evenly divisible by the array + * element size. + * + * - WidthInBytes and Height specify the width (in bytes) and height of + * the 2D copy being performed. Any pitches must be greater than or + * equal to WidthInBytes. + * + * cuMemcpy2D() returns an error if any pitch is greater than the + * maximum allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). cuMemAllocPitch() + * passes back pitches that always work with cuMemcpy2D(). On intra-device + * memory copies (device ? device, CUDA array ? device, CUDA array ? + * CUDA array), cuMemcpy2D() may fail for pitches not computed by + * cuMemAllocPitch(). cuMemcpy2DUnaligned() does not have this restriction, + * but may run signi?cantly slower in the cases where cuMemcpy2D() would + * have returned an error code. + * + * cuMemcpy2DAsync() is asynchronous and can optionally be associated to a + * stream by passing a non-zero hStream argument. It only works on + * page-locked host memory and returns an error if a pointer to pageable + * memory is passed as input. + * + * \param p_copy Parameters for the memory copy. + * \param h_stream Stream identifier. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemFreeHost(void *p) +int SCCudaMemcpy2DAsync(const CUDA_MEMCPY2D *p_copy, CUstream h_stream) { CUresult result = 0; - if (p == NULL) { + if (p_copy == NULL) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p is NULL"); + "p_copy is NULL"); goto error; } - result = cuMemFreeHost(p); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_FREE_HOST) == -1) + result = cuMemcpy2DAsync(p_copy, h_stream); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_2D_ASYNC) == -1) goto error; return 0; @@ -1817,55 +2681,125 @@ int SCCudaMemFreeHost(void *p) } /** - * \brief Returns the base address in *pbase and size in *psize of the allocation - * by cuMemAlloc() or cuMemAllocPitch() that contains the input pointer - * dptr. Both parameters pbase and psize are optional. If one of them is - * NULL, it is ignored. + * \brief Perform a 2D memory copy according to the parameters specified in + * p_copy. The CUDA_MEMCPY2D structure is defined as: * - * \param pbase Returned base address. - * \param psize Returned size of device memory allocation. - * \param dptr Device pointer to query + * typedef struct CUDA_MEMCPY2D_st { + * unsigned int srcXInBytes, srcY; + * CUmemorytype srcMemoryType; + * const void *srcHost; + * CUdeviceptr srcDevice; + * CUarray srcArray; + * unsigned int srcPitch; + * unsigned int dstXInBytes, dstY; + * CUmemorytype dstMemoryType; + * void *dstHost; + * CUdeviceptr dstDevice; + * CUarray dstArray; + * unsigned int dstPitch; + * unsigned int WidthInBytes; + * unsigned int Height; + * } CUDA_MEMCPY2D; * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemGetAddressRange(CUdeviceptr *pbase, unsigned int *psize, - CUdeviceptr dptr) -{ - CUresult result = 0; - - result = cuMemGetAddressRange(pbase, psize, dptr); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_GET_ADDRESS_RANGE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *free and *total respectively, the free and total amount - * of memory available for allocation by the CUDA context, in bytes. + * where: + * + * - srcMemoryType and dstMemoryType specify the type of memory of the + * source and destination, respectively; + * + * CUmemorytype_enum is de?ned as: + * + * typedef enum CUmemorytype_enum { + * CU_MEMORYTYPE_HOST = 0x01, + * CU_MEMORYTYPE_DEVICE = 0x02, + * CU_MEMORYTYPE_ARRAY = 0x03 + * } CUmemorytype; + * + * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost and srcPitch specify + * the (host) base address of the source data and the bytes per row to + * apply. srcArray is ignored. + * + * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice and srcPitch + * specify the (device) base address of the source data and the bytes per + * row to apply. srcArray is ignored. + * + * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray speci?es the handle + * of the source data. srcHost, srcDevice and srcPitch are ignored. + * + * If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch specify + * the (host) base address of the destination data and the bytes per row + * to apply. dstArray is ignored. + * + * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch + * specify the (device) base address of the destination data and the + * bytes per row to apply. dstArray is ignored. + * + * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the handle + * of the destination data dstHost, dstDevice and dstPitch are ignored. + * + * - srcXInBytes and srcY specify the base address of the source data for + * the copy. + * + * For host pointers, the starting address is + * + * void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); + * + * For device pointers, the starting address is + * + * CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; + * + * For CUDA arrays, srcXInBytes must be evenly divisible by the array + * element size. + * + * - dstXInBytes and dstY specify the base address of the destination data + * for the copy. + * + * For host pointers, the base address is + * + * void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); + * + * For device pointers, the starting address is + * + * CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; + * + * For CUDA arrays, dstXInBytes must be evenly divisible by the array + * element size. + * + * - WidthInBytes and Height specify the width (in bytes) and height of + * the 2D copy being performed. Any pitches must be greater than or + * equal to WidthInBytes. + * + * cuMemcpy2D() returns an error if any pitch is greater than the + * maximum allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). cuMemAllocPitch() + * passes back pitches that always work with cuMemcpy2D(). On intra-device + * memory copies (device ? device, CUDA array ? device, CUDA array ? + * CUDA array), cuMemcpy2D() may fail for pitches not computed by + * cuMemAllocPitch(). cuMemcpy2DUnaligned() does not have this restriction, + * but may run signi?cantly slower in the cases where cuMemcpy2D() would + * have returned an error code. * - * \param free Returned free memory in bytes. - * \param total Returned total memory in bytes. + * cuMemcpy2DAsync() is asynchronous and can optionally be associated to a + * stream by passing a non-zero hStream argument. It only works on + * page-locked host memory and returns an error if a pointer to pageable + * memory is passed as input. + * + * \param p_copy Parameters for the memory copy. + * \param h_stream Stream identifier. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemGetInfo(unsigned int *free, unsigned int *total) +int SCCudaMemcpy2DUnaligned(const CUDA_MEMCPY2D *p_copy) { CUresult result = 0; - if (free == NULL || total == NULL) { + if (p_copy == NULL) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "free is NULL || total is NULL"); + "p_copy is NULL"); goto error; } - result = cuMemGetInfo(free, total); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_GET_INFO) == -1) + result = cuMemcpy2DUnaligned(p_copy); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_2D_UNALIGNED) == -1) goto error; return 0; @@ -1875,60 +2809,123 @@ int SCCudaMemGetInfo(unsigned int *free, unsigned int *total) } /** - * \brief Allocates bytesize bytes of host memory that is page-locked and - * accessible to the device. The driver tracks the virtual memory ranges - * allocated with this function and automatically accelerates calls to - * functions such as cuMemcpyHtoD(). Since the memory can be accessed - * directly by the device, it can be read or written with much higher - * bandwidth than pageable memory obtained with functions such as - * SCMalloc(). Allocating excessive amounts of pinned memory may degrade - * system performance, since it reduces the amount of memory available - * to the system for paging. As a result, this function is best used - * sparingly to allocate staging areas for data exchange between host - * and device. + * \brief Perform a 3D memory copy according to the parameters specified in + * p_copy. The CUDA_MEMCPY3D structure is defined as: * - * The Flags parameter enables different options to be specified that - * affect the allocation, as follows. + * typedef struct CUDA_MEMCPY3D_st { + * unsigned int srcXInBytes, srcY, srcZ; + * unsigned int srcLOD; + * CUmemorytype srcMemoryType; + * const void *srcHost; + * CUdeviceptr srcDevice; + * CUarray srcArray; + * unsigned int srcPitch; // ignored when src is array + * unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 + * unsigned int dstXInBytes, dstY, dstZ; + * unsigned int dstLOD; + * CUmemorytype dstMemoryType; + * void *dstHost; + * CUdeviceptr dstDevice; + * CUarray dstArray; + * unsigned int dstPitch; // ignored when dst is array + * unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 + * unsigned int WidthInBytes; + * unsigned int Height; + * unsigned int Depth; + * } CUDA_MEMCPY3D; * - * - CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be - * considered as pinned memory by all CUDA contexts, not just the one - * that performed the allocation. - * - CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA - * address space. The device pointer to the memory may be obtained by - * calling cuMemHostGetDevicePointer(). This feature is available only - * on GPUs with compute capability greater than or equal to 1.1. - * - CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined - * (WC). WC memory can be transferred across the PCI Express bus more - * quickly on some system con?gurations, but cannot be read efficiently - * by most CPUs. WC memory is a good option for buffers that will be - * written by the CPU and read by the GPU via mapped pinned memory or - * host->device transfers. All of these fags are orthogonal to one - * another: a developer may allocate memory that is portable, mapped - * and/or write-combined with no restrictions. + * where: * - * The CUDA context must have been created with the CU_CTX_MAP_HOST flag - * in order for the CU_MEMHOSTALLOC_MAPPED flag to have any effect. + * - srcMemoryType and dstMemoryType specify the type of memory of the + * source and destination, respectively; + * CUmemorytype_enum is defined as: * - * The CU_MEMHOSTALLOC_MAPPED flag may be specified on CUDA contexts for - * devices that do not support mapped pinned memory. The failure is - * deferred to cuMemHostGetDevicePointer() because the memory may be - * mapped into other CUDA contexts via the CU_MEMHOSTALLOC_PORTABLE flag. + * typedef enum CUmemorytype_enum { + * CU_MEMORYTYPE_HOST = 0x01, + * CU_MEMORYTYPE_DEVICE = 0x02, + * CU_MEMORYTYPE_ARRAY = 0x03 + * } CUmemorytype; * - * The memory allocated by this function must be freed with cuMemFreeHost(). + * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost, srcPitch and srcHeight + * specify the (host) base address of the source data, the bytes per row, + * and the height of each 2D slice of the 3D array. srcArray is ignored. * - * \param pp Returned host pointer to page-locked memory. - * \param byte_size Requested allocation size in bytes. - * \param flags Flags for allocation request. + * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice, srcPitch and + * srcHeight specify the (device) base address of the source data, the + * bytes per row, and the height of each 2D slice of the 3D array. + * srcArray is ignored. + * + * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray specifies the handle + * of the source data. srcHost, srcDevice, srcPitch and srcHeight are + * ignored. If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch + * specify the (host) base address of the destination data, the bytes per + * row, and the height of each 2D slice of the 3D array. dstArray is + * ignored. + * + * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch + * specify the (device) base address of the destination data, the bytes + * per row, and the height of each 2D slice of the 3D array. dstArray is + * ignored. + * + * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the + * handle of the destination data. dstHost, dstDevice, dstPitch and + * dstHeight are ignored. + * + * - srcXInBytes, srcY and srcZ specify the base address of the source + * data for the copy. + * + * For host pointers, the starting address is + * + * void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); + * + * For device pointers, the starting address is + * + * CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; + * + * For CUDA arrays, srcXInBytes must be evenly divisible by the array + * element size. + * + * - dstXInBytes, dstY and dstZ specify the base address of the destination + * data for the copy. + * + * For host pointers, the base address is + * + * void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); + * + * For device pointers, the starting address is + * + * CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; + * + * For CUDA arrays, dstXInBytes must be evenly divisible by the array + * element size. + * + * - WidthInBytes, Height and Depth specify the width (in bytes), height + * and depth of the 3D copy being performed. Any pitches must be greater + * than or equal to WidthInBytes. + * + * cuMemcpy3D() returns an error if any pitch is greater than the maximum + * allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). + * + * The srcLOD and dstLOD members of the CUDA_MEMCPY3D structure must be + * set to 0. + * + * \param p_copy Parameters for the memory copy. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemHostAlloc(void **pp, size_t byte_size, unsigned int flags) +int SCCudaMemcpy3D(const CUDA_MEMCPY3D *p_copy) { CUresult result = 0; - result = cuMemHostAlloc(pp, byte_size, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_HOST_ALLOC) == -1) + if (p_copy == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "p_copy is NULL"); + goto error; + } + + result = cuMemcpy3D(p_copy); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_3D) == -1) goto error; return 0; @@ -1938,55 +2935,128 @@ int SCCudaMemHostAlloc(void **pp, size_t byte_size, unsigned int flags) } /** - * \brief Passes back the device pointer pdptr corresponding to the mapped, - * pinned host buffer p allocated by cuMemHostAlloc. + * \brief Perform a 3D memory copy according to the parameters specified in + * p_copy. The CUDA_MEMCPY3D structure is defined as: * - * cuMemHostGetDevicePointer() will fail if the CU_MEMALLOCHOST_DEVICEMAP - * flag was not speci?ed at the time the memory was allocated, or if the - * function is called on a GPU that does not support mapped pinned memory. + * typedef struct CUDA_MEMCPY3D_st { + * unsigned int srcXInBytes, srcY, srcZ; + * unsigned int srcLOD; + * CUmemorytype srcMemoryType; + * const void *srcHost; + * CUdeviceptr srcDevice; + * CUarray srcArray; + * unsigned int srcPitch; // ignored when src is array + * unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 + * unsigned int dstXInBytes, dstY, dstZ; + * unsigned int dstLOD; + * CUmemorytype dstMemoryType; + * void *dstHost; + * CUdeviceptr dstDevice; + * CUarray dstArray; + * unsigned int dstPitch; // ignored when dst is array + * unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 + * unsigned int WidthInBytes; + * unsigned int Height; + * unsigned int Depth; + * } CUDA_MEMCPY3D; + * + * where: + * + * - srcMemoryType and dstMemoryType specify the type of memory of the + * source and destination, respectively; + * CUmemorytype_enum is defined as: + * + * typedef enum CUmemorytype_enum { + * CU_MEMORYTYPE_HOST = 0x01, + * CU_MEMORYTYPE_DEVICE = 0x02, + * CU_MEMORYTYPE_ARRAY = 0x03 + * } CUmemorytype; + * + * If srcMemoryType is CU_MEMORYTYPE_HOST, srcHost, srcPitch and srcHeight + * specify the (host) base address of the source data, the bytes per row, + * and the height of each 2D slice of the 3D array. srcArray is ignored. + * + * If srcMemoryType is CU_MEMORYTYPE_DEVICE, srcDevice, srcPitch and + * srcHeight specify the (device) base address of the source data, the + * bytes per row, and the height of each 2D slice of the 3D array. + * srcArray is ignored. + * + * If srcMemoryType is CU_MEMORYTYPE_ARRAY, srcArray specifies the handle + * of the source data. srcHost, srcDevice, srcPitch and srcHeight are + * ignored. If dstMemoryType is CU_MEMORYTYPE_HOST, dstHost and dstPitch + * specify the (host) base address of the destination data, the bytes per + * row, and the height of each 2D slice of the 3D array. dstArray is + * ignored. + * + * If dstMemoryType is CU_MEMORYTYPE_DEVICE, dstDevice and dstPitch + * specify the (device) base address of the destination data, the bytes + * per row, and the height of each 2D slice of the 3D array. dstArray is + * ignored. + * + * If dstMemoryType is CU_MEMORYTYPE_ARRAY, dstArray specifies the + * handle of the destination data. dstHost, dstDevice, dstPitch and + * dstHeight are ignored. + * + * - srcXInBytes, srcY and srcZ specify the base address of the source + * data for the copy. + * + * For host pointers, the starting address is + * + * void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); + * + * For device pointers, the starting address is + * + * CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; + * + * For CUDA arrays, srcXInBytes must be evenly divisible by the array + * element size. + * + * - dstXInBytes, dstY and dstZ specify the base address of the destination + * data for the copy. + * + * For host pointers, the base address is + * + * void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); + * + * For device pointers, the starting address is + * + * CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; * - * Flags provides for future releases. For now, it must be set to 0. + * For CUDA arrays, dstXInBytes must be evenly divisible by the array + * element size. * - * \param pdptr Returned device pointer. - * \param p Host pointer. - * \param flags Options(must be 0). + * - WidthInBytes, Height and Depth specify the width (in bytes), height + * and depth of the 3D copy being performed. Any pitches must be greater + * than or equal to WidthInBytes. * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int flags) -{ - CUresult result = 0; - - result = cuMemHostGetDevicePointer(pdptr, p, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_HOST_GET_DEVICE_POINTER) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Passes back the flags p_flags that were specified when allocating the - * pinned host buffer p allocated by cuMemHostAlloc. + * cuMemcpy3D() returns an error if any pitch is greater than the maximum + * allowed (CU_DEVICE_ATTRIBUTE_MAX_PITCH). * - * cuMemHostGetFlags() will fail if the pointer does not reside in an - * allocation performed by cuMemAllocHost() or cuMemHostAlloc(). + * cuMemcpy3DAsync() is asynchronous and can optionally be associated + * to a stream by passing a non-zero hStream argument. It only works on + * page-locked host memory and returns an error if a pointer to pageable + * memory is passed as input. * - * \param p_flags Returned flags word. - * \param p Host pointer. + * The srcLOD and dstLOD members of the CUDA_MEMCPY3D structure must be + * set to 0. + * + * \param p_copy Parameters for the memory copy. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemHostGetFlags(unsigned int *p_flags, void *p) +int SCCudaMemcpy3DAsync(const CUDA_MEMCPY3D *p_copy, CUstream h_stream) { CUresult result = 0; - result = cuMemHostGetFlags(p_flags, p); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_HOST_GET_FLAGS) == -1) + if (p_copy == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "p_copy is NULL"); + goto error; + } + + result = cuMemcpy3DAsync(p_copy, h_stream); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_3D_ASYNC) == -1) goto error; return 0; @@ -1995,116 +3065,75 @@ int SCCudaMemHostGetFlags(unsigned int *p_flags, void *p) return -1; } -/** - * \brief Sets the memory range of N 16-bit values to the speci?ed value us. - * - * \param dst_device Destination device pointer. - * \param us Value to set. - * \param n Number of elements. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemsetD16(CUdeviceptr dst_device, unsigned short us, unsigned int n) +int SCCudaMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *p_copy) { CUresult result = 0; - result = cuMemsetD16(dst_device, us, n); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D16) == -1) + result = cuMemcpy3DPeer(p_copy); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_3D_PEER) == -1) goto error; return 0; - error: return -1; } -/** - * \brief Sets the 2D memory range of Width 16-bit values to the specified - * value us. Height specifies the number of rows to set, and dst_pitch - * specifies the number of bytes between each row. This function - * performs fastest when the pitch is one that has been passed back - * by cuMemAllocPitch(). - * - * \param dst_device Destination device pointer. - * \param dst_pitch Pitch of destination device pointer. - * \param us Value to set - * \param width Width of row. - * \param height Number of rows - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemsetD2D16(CUdeviceptr dst_device, unsigned int dst_pitch, - unsigned short us, unsigned int width, - unsigned int height) +int SCCudaMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *p_copy, + CUstream h_stream) { CUresult result = 0; - result = cuMemsetD2D16(dst_device, dst_pitch, us, width, height); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D16) == -1) + result = cuMemcpy3DPeerAsync(p_copy, h_stream); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_3D_PEER_ASYNC) == -1) goto error; return 0; - error: return -1; } -/** - * \brief Sets the 2D memory range of Width 32-bit values to the specified value - * ui. Height speci?es the number of rows to set, and dstPitch specifies - * the number of bytes between each row. This function performs fastest - * when the pitch is one that has been passed back by cuMemAllocPitch(). - * - * \param dst_device Destination device pointer. - * \param dst_pitch Pitch of destination device pointer. - * \param ui Value to set - * \param width Width of row. - * \param height Number of rows - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaMemsetD2D32(CUdeviceptr dst_device, unsigned int dst_pitch, - unsigned int ui, unsigned int width, - unsigned int height) +int SCCudaMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t byte_count, + CUstream h_stream) { CUresult result = 0; - result = cuMemsetD2D32(dst_device, dst_pitch, ui, width, height); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D32) == -1) + result = cuMemcpyAsync(dst, src, byte_count, h_stream); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_ASYNC) == -1) goto error; return 0; - error: return -1; } /** - * \brief Sets the 2D memory range of Width 8-bit values to the specified value - * uc. Height speci?es the number of rows to set, and dstPitch specifies - * the number of bytes between each row. This function performs fastest - * when the pitch is one that has been passed back by cuMemAllocPitch(). + * \brief Copies from one 1D CUDA array to another. dstArray and srcArray + * specify the handles of the destination and source CUDA arrays for the + * copy, respectively. dstIndex and srcIndex specify the destination and + * source indices into the CUDA array. These values are in the range + * [0, Width-1] for the CUDA array; they are not byte offsets. ByteCount + * is the number of bytes to be copied. The size of the elements in the + * CUDA arrays need not be the same format, but the elements must be the + * same size; and count must be evenly divisible by that size. * - * \param dst_device Destination device pointer. - * \param dst_pitch Pitch of destination device pointer. - * \param uc Value to set - * \param width Width of row. - * \param height Number of rows + * \param dst_array Destination array. + * \param dst_index Offset of destination array. + * \param src_array Source array. + * \param src_index Offset of source array. + * \param byte_count Size of memory copy in bytes. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemsetD2D8(CUdeviceptr dst_device, unsigned int dst_pitch, - unsigned char uc, unsigned int width, - unsigned int height) +int SCCudaMemcpyAtoA(CUarray dst_array, size_t dst_offset, + CUarray src_array, size_t src_offset, + size_t byte_count) { CUresult result = 0; - result = cuMemsetD2D8(dst_device, dst_pitch, uc, width, height); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D8) == -1) + result = cuMemcpyAtoA(dst_array, dst_offset, src_array, src_offset, + byte_count); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_A_TO_A) == -1) goto error; return 0; @@ -2114,21 +3143,28 @@ int SCCudaMemsetD2D8(CUdeviceptr dst_device, unsigned int dst_pitch, } /** - * \brief Sets the memory range of N 32-bit values to the specified value ui. + * \param Copies from one 1D CUDA array to device memory. dstDevice specifies the + * base pointer of the destination and must be naturally aligned with the + * CUDA array elements. hSrc and SrcIndex specify the CUDA array handle and + * the index (in array elements) of the array element where the copy is + * to begin. ByteCount speci?es the number of bytes to copy and must be + * evenly divisible by the array element size. * * \param dst_device Destination device pointer. - * \param ui Value to set. - * \param n Number of elements. + * \param h_src Source array. + * \param src_index Offset of source array. + * \param byte_count Size of memory copy in bytes. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemsetD32(CUdeviceptr dst_device, unsigned int ui, unsigned int n) +int SCCudaMemcpyAtoD(CUdeviceptr dst_device, CUarray src_array, + size_t src_offset, size_t byte_count) { CUresult result = 0; - result = cuMemsetD32(dst_device, ui, n); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D32) == -1) + result = cuMemcpyAtoD(dst_device, src_array, src_offset, byte_count); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_A_TO_D) == -1) goto error; return 0; @@ -2138,21 +3174,26 @@ int SCCudaMemsetD32(CUdeviceptr dst_device, unsigned int ui, unsigned int n) } /** - * \brief Sets the memory range of N 8-bit values to the specified value ui. + * \param Copies from one 1D CUDA array to host memory. dstHost specifies the + * base pointer of the destination. srcArray and srcIndex specify the + * CUDA array handle and starting index of the source data. ByteCount + * specifies the number of bytes to copy. * * \param dst_device Destination device pointer. - * \param uc Value to set. - * \param n Number of elements. + * \param h_src Source array. + * \param src_index Offset of source array. + * \param byte_count Size of memory copy in bytes. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaMemsetD8(CUdeviceptr dst_device, unsigned char uc, unsigned int n) +int SCCudaMemcpyAtoH(void *dst_host, CUarray src_array, size_t src_offset, + size_t byte_count) { CUresult result = 0; - result = cuMemsetD8(dst_device, uc, n); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D8) == -1) + result = cuMemcpyAtoH(dst_host, src_array, src_offset, byte_count); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_A_TO_H) == -1) goto error; return 0; @@ -2161,34 +3202,35 @@ int SCCudaMemsetD8(CUdeviceptr dst_device, unsigned char uc, unsigned int n) return -1; } -/***********************Texture_Reference_Management_API***********************/ - /** - * \brief Creates a texture reference and returns its handle in *pTexRef. Once - * created, the application must call cuTexRefSetArray() or cuTexRefSetAddress() - * to associate the reference with allocated memory. Other texture reference - * functions are used to specify the format and interpretation (addressing, - * filtering, etc.) to be used when the memory is read through this texture - * reference. To associate the texture reference with a texture ordinal for - * a given function, the application should call cuParamSetTexRef(). + * \param Copies from one 1D CUDA array to host memory. dstHost specifies the + * base pointer of the destination. srcArray and srcIndex specify the + * CUDA array handle and starting index of the source data. ByteCount + * specifies the number of bytes to copy. * - * \param p_tex_ref Returned texture reference + * cuMemcpyAtoHAsync() is asynchronous and can optionally be associated + * to a stream by passing a non-zero stream argument. It only works on + * page-locked host memory and returns an error if a pointer to pageable + * memory is passed as input. + * + * \param dst_device Destination device pointer. + * \param src_array Source array. + * \param src_index Offset of source array. + * \param byte_count Size of memory copy in bytes. + * \param h_stream Stream identifier. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaTexRefCreate(CUtexref *p_tex_ref) +int SCCudaMemcpyAtoHAsync(void *dst_host, CUarray src_array, + size_t src_offset, size_t byte_count, + CUstream h_stream) { CUresult result = 0; - if (p_tex_ref == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_tex_ref is NULL"); - goto error; - } - - result = cuTexRefCreate(p_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_CREATE) == -1) + result = cuMemcpyAtoHAsync(dst_host, src_array, src_offset, byte_count, + h_stream); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_A_TO_H_ASYNC) == -1) goto error; return 0; @@ -2198,19 +3240,26 @@ int SCCudaTexRefCreate(CUtexref *p_tex_ref) } /** - * \brief Destroys the texture reference specified by hTexRef. + * \brief Copies from device memory to a 1D CUDA array. dstArray and dstIndex + * specify the CUDA array handle and starting index of the destination + * data. srcDevice speci?es the base pointer of the source. ByteCount + * specifies the number of bytes to copy. * - * \param h_tex_ref Texture reference to destroy + * \param dst_array Destination array. + * \param dst_index Offset of destination array. + * \param src_device Source device pointer. + * \param byte_count Size of memory copy in bytes. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaTexRefDestroy(CUtexref h_tex_ref) +int SCCudaMemcpyDtoA(CUarray dst_array, size_t dst_offset, + CUdeviceptr src_device, size_t byte_count) { CUresult result = 0; - result = cuTexRefDestroy(h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_DESTROY) == -1) + result = cuMemcpyDtoA(dst_array, dst_offset, src_device, byte_count); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_A) == -1) goto error; return 0; @@ -2220,28 +3269,25 @@ int SCCudaTexRefDestroy(CUtexref h_tex_ref) } /** - * \brief Returns in *pdptr the base address bound to the texture reference - * hTexRef, or returns CUDA_ERROR_INVALID_VALUE if the texture reference - * is not bound to any device memory range. + * \brief Copies from device memory to device memory. dstDevice and srcDevice are + * the base pointers of the destination and source, respectively. + * byte_count specifies the number of bytes to copy. Note that this + * function is asynchronous. * - * \param pdptr Returned device address - * \param h_tex_ref Texture reference + * \param dst_device Destination device pointer. + * \param src_device Source device pointer. + * \param byte_count Size of memory copy in bytes. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaTexRefGetAddress(CUdeviceptr *pdptr, CUtexref h_tex_ref) +int SCCudaMemcpyDtoD(CUdeviceptr dst_device, CUdeviceptr src_device, + size_t byte_count) { CUresult result = 0; - if (pdptr == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pdptr is NULL"); - goto error; - } - - result = cuTexRefGetAddress(pdptr, h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_ADDRESS) == -1) + result = cuMemcpyDtoD(dst_device, src_device, byte_count); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_D) == -1) goto error; return 0; @@ -2250,61 +3296,41 @@ int SCCudaTexRefGetAddress(CUdeviceptr *pdptr, CUtexref h_tex_ref) return -1; } -/** - * \brief Returns in *pam the addressing mode corresponding to the dimension - * dim of the texture reference hTexRef. Currently, the only valid value - * for dim are 0 and 1. - * - * \param pam Returned addressing mode - * \param h_tex_ref Texture reference - * \param dim Dimension - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefGetAddressMode(CUaddress_mode *pam, CUtexref h_tex_ref, int dim) +int SCCudaMemcpyDtoDAsync(CUdeviceptr dst_device, CUdeviceptr src_device, + size_t byte_count, CUstream h_stream) { CUresult result = 0; - if (pam == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pam is NULL"); - goto error; - } - - result = cuTexRefGetAddressMode(pam, h_tex_ref, dim); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_ADDRESS_MODE) == -1) + result = cuMemcpyDtoDAsync(dst_device, src_device, byte_count, h_stream); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_D_ASYNC) == -1) goto error; return 0; - error: return -1; } + /** - * \brief Returns in *phArray the CUDA array bound to the texture reference - * hTexRef, or returns CUDA_ERROR_INVALID_VALUE if the texture reference - * is not bound to any CUDA array. + * \brief Copies from device to host memory. dst_host and src_device specify + * the base pointers of the destination and source, respectively. + * byte_count specifies the number of bytes to copy. Note that this + * function is synchronous. * - * \param ph_array Returned array - * \param h_tex_ref Texture reference + * \param dst_host Destination device pointer. + * \param src_device Source device pointer. + * \param byte_count Size of memory copy in bytes. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaTexRefGetArray(CUarray *ph_array, CUtexref h_tex_ref) +int SCCudaMemcpyDtoH(void *dst_host, CUdeviceptr src_device, + size_t byte_count) { CUresult result = 0; - if (ph_array == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "ph_array is NULL"); - goto error; - } - - result = cuTexRefGetArray(ph_array, h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_ARRAY) == -1) + result = cuMemcpyDtoH(dst_host, src_device, byte_count); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_H) == -1) goto error; return 0; @@ -2314,26 +3340,29 @@ int SCCudaTexRefGetArray(CUarray *ph_array, CUtexref h_tex_ref) } /** - * \brief Returns in *pfm the filtering mode of the texture reference hTexRef. + * \brief Copies from device to host memory. dst_host and src_device specify + * the base pointers of the destination and source, respectively. + * byte_count specifies the number of bytes to copy. * - * \param pfm Returned filtering mode - * \param h_tex_ref Texture reference + * cuMemcpyDtoHAsync() is asynchronous and can optionally be associated + * to a stream by passing a non-zero h_stream argument. It only works + * on page-locked memory and returns an error if a pointer to pageable + * memory is passed as input. + * + * \param dst_host Destination device pointer. + * \param src_device Source device pointer. + * \param byte_count Size of memory copy in bytes. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref h_tex_ref) +int SCCudaMemcpyDtoHAsync(void *dst_host, CUdeviceptr src_device, + size_t byte_count, CUstream h_stream) { CUresult result = 0; - if (pfm == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pfm is NULL"); - goto error; - } - - result = cuTexRefGetFilterMode(pfm, h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_FILTER_MODE) == -1) + result = cuMemcpyDtoHAsync(dst_host, src_device, byte_count, h_stream); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_D_TO_H_ASYNC) == -1) goto error; return 0; @@ -2343,26 +3372,26 @@ int SCCudaTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref h_tex_ref) } /** - * \brief Returns in *pFlags the flags of the texture reference hTexRef. + * \brief Copies from host memory to a 1D CUDA array. dst_array and dst_index + * specify the CUDA array handle and starting index of the destination + * data. p_src specifies the base address of the source. byte_count + * specifies the number of bytes to copy. * - * \param p_flags Returned flags - * \param h_tex_ref Texture reference + * \param dst_array Destination array. + * \param dst_index Offset of destination array. + * \param p_src Source host pointer. + * \param byte_count Size of memory copy in bytes. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaTexRefGetFlags(unsigned int *p_flags, CUtexref h_tex_ref) +int SCCudaMemcpyHtoA(CUarray dst_array, size_t dst_offset, + const void *src_host, size_t byte_count) { CUresult result = 0; - if (p_flags == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_flags is NULL"); - goto error; - } - - result = cuTexRefGetFlags(p_flags, h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_FLAGS) == -1) + result = cuMemcpyHtoA(dst_array, dst_offset, src_host, byte_count); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_H_TO_A) == -1) goto error; return 0; @@ -2372,30 +3401,33 @@ int SCCudaTexRefGetFlags(unsigned int *p_flags, CUtexref h_tex_ref) } /** - * \brief Returns in *pFormat and *pNumChannels the format and number of - * components of the CUDA array bound to the texture reference hTexRef. - * If pFormat or pNumChannels is NULL, it will be ignored. + * \brief Copies from host memory to a 1D CUDA array. dst_array and dst_index + * specify the CUDA array handle and starting index of the destination + * data. p_src specifies the base address of the source. byte_count + * specfies the number of bytes to copy. * - * \param p_format Returned format - * \param p_num_channels Returned number of components - * \param h_tex_ref Texture reference + * cuMemcpyHtoAAsync() is asynchronous and can optionally be associated + * to a stream by passing a non-zero h_stream argument. It only works on + * page-locked memory and returns an error if a pointer to pageable + * memory is passed as input. + * + * \param dst_array Destination array. + * \param dst_index Offset of destination array. + * \param p_src Source host pointer. + * \param byte_count Size of memory copy in bytes. + * \param h_stream Stream identifier. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaTexRefGetFormat(CUarray_format *p_format, int *p_num_channels, - CUtexref h_tex_ref) +int SCCudaMemcpyHtoAAsync(CUarray dst_array, size_t dst_offset, + const void *src_host, size_t byte_count, + CUstream h_stream) { CUresult result = 0; - if (p_format == NULL || p_num_channels == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_format == NULL || p_num_channels == NULL"); - goto error; - } - - result = cuTexRefGetFormat(p_format, p_num_channels, h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_FORMAT) == -1) + result = cuMemcpyHtoAAsync(dst_array, dst_offset, src_host, byte_count, h_stream); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_H_TO_A_ASYNC) == -1) goto error; return 0; @@ -2405,43 +3437,25 @@ int SCCudaTexRefGetFormat(CUarray_format *p_format, int *p_num_channels, } /** - * \brief Binds a linear address range to the texture reference hTexRef. Any - * previous address or CUDA array state associated with the texture - * reference is superseded by this function. Any memory previously - * bound to hTexRef is unbound. - * - * Since the hardware enforces an alignment requirement on texture - * base addresses, cuTexRefSetAddress() passes back a byte offset in - * *ByteOffset that must be applied to texture fetches in order to read - * from the desired memory. This offset must be divided by the texel - * size and passed to kernels that read from the texture so they can be - * applied to the tex1Dfetch() function. - * - * If the device memory pointer was returned from cuMemAlloc(), the - * offset is guaranteed to be 0 and NULL may be passed as the - * ByteOffset parameter. + * \brief Copies from host memory to device memory. dst_device and src_host + * are the base addresses of the destination and source, respectively. + * byte_count specifies the number of bytes to copy. Note that this + * function is synchronous. * - * \param byte_offset Returned byte offset - * \param h_tex_ref Texture reference to bind - * \param dptr Device pointer to bind - * \param bytes Size of memory to bind in bytes + * \param dst_device Destination device pointer. + * \param src_host Source host pointer. + * \param byte_count Size of memory copy in bytes. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaTexRefSetAddress(unsigned int *byte_offset, CUtexref h_tex_ref, - CUdeviceptr dptr, unsigned int bytes) +int SCCudaMemcpyHtoD(CUdeviceptr dst_device, const void *src_host, + size_t byte_count) { CUresult result = 0; - if (byte_offset == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "byte_offset is NULL"); - goto error; - } - - result = cuTexRefSetAddress(byte_offset, h_tex_ref, dptr, bytes); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_ADDRESS) == -1) + result = cuMemcpyHtoD(dst_device, src_host,byte_count); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_H_TO_D) == -1) goto error; return 0; @@ -2451,39 +3465,31 @@ int SCCudaTexRefSetAddress(unsigned int *byte_offset, CUtexref h_tex_ref, } /** - * \brief Binds a linear address range to the texture reference hTexRef. Any - * previous address or CUDA array state associated with the texture - * reference is superseded by this function. Any memory previously bound - * to hTexRef is unbound. - * - * Using a tex2D() function inside a kernel requires a call to either - * cuTexRefSetArray() to bind the corresponding texture reference to an - * array, or cuTexRefSetAddress2D() to bind the texture reference to - * linear memory. + * \brief Copies from host memory to device memory. dst_device and src_host are + * the base addresses of the destination and source, respectively. + * byte_count specifies the number of bytes to copy. * - * Function calls to cuTexRefSetFormat() cannot follow calls to - * cuTexRefSetAddress2D() for the same texture reference. + * cuMemcpyHtoDAsync() is asynchronous and can optionally be associated + * to a stream by passing a non-zero h_stream argument. It only works on + * page-locked memory and returns an error if a pointer to pageable + * memory is passed as input. * - * It is required that dptr be aligned to the appropriate hardware- - * specific texture alignment. You can query this value using the device - * attribute CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned dptr - * is supplied, CUDA_ERROR_INVALID_VALUE is returned. * - * \param h_tex_ref Texture reference to bind - * \param desc Descriptor of CUDA array - * \param dptr Device pointer to bind - * \param pitch Line pitch in bytes + * \param dst_device Destination device pointer. + * \param src_host Source host pointer. + * \param byte_count Size of memory copy in bytes. + * \param h_stream Stream identifier. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaTexRefSetAddress2D(CUtexref h_tex_ref, const CUDA_ARRAY_DESCRIPTOR *desc, - CUdeviceptr dptr, unsigned int pitch) +int SCCudaMemcpyHtoDAsync(CUdeviceptr dst_device, const void *src_host, + size_t byte_count, CUstream h_stream) { CUresult result = 0; - result = cuTexRefSetAddress2D(h_tex_ref, desc, dptr, pitch); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_ADDRESS_2D) == -1) + result = cuMemcpyHtoDAsync(dst_device, src_host, byte_count, h_stream); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_H_TO_D_ASYNC) == -1) goto error; return 0; @@ -2492,122 +3498,82 @@ int SCCudaTexRefSetAddress2D(CUtexref h_tex_ref, const CUDA_ARRAY_DESCRIPTOR *de return -1; } -/** - * \brief Specifies the addressing mode am for the given dimension dim of the - * texture reference hTexRef. If dim is zero, the addressing mode is - * applied to the first parameter of the functions used to fetch from - * the texture; if dim is 1, the second, and so on. CUaddress_mode is - * defined as: - * - * typedef enum CUaddress_mode_enum { - * CU_TR_ADDRESS_MODE_WRAP = 0, - * CU_TR_ADDRESS_MODE_CLAMP = 1, - * CU_TR_ADDRESS_MODE_MIRROR = 2, - * } CUaddress_mode; - * - * \param h_tex_ref Texture reference - * \param dim Dimension - * \param am Addressing mode to set - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefSetAddressMode(CUtexref h_tex_ref, int dim, CUaddress_mode am) +int SCCudaMemcpyPeer(CUdeviceptr dst_device, CUcontext dst_context, + CUdeviceptr src_device, CUcontext src_context, + size_t byte_count) { CUresult result = 0; - result = cuTexRefSetAddressMode(h_tex_ref, dim, am); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_ADDRESS_MODE) == -1) + result = cuMemcpyPeer(dst_device, dst_context, src_device, src_context, + byte_count); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_PEER) == -1) goto error; return 0; - error: return -1; } -/** - * \brief Binds the CUDA array hArray to the texture reference hTexRef. Any - * previous address or CUDA array state associated with the texture - * reference is superseded by this function. Flags must be set to - * CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to hTexRef - * is unbound. - * - * \param h_tex_ref Texture reference to bind - * \param h_array Array to bind - * \param flags Options (must be CU_TRSA_OVERRIDE_FORMAT) - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaTexRefSetArray(CUtexref h_tex_ref, CUarray h_array, unsigned int flags) +int SCCudaMemcpyPeerAsync(CUdeviceptr dst_device, CUcontext dst_context, + CUdeviceptr src_device, CUcontext src_context, + size_t byte_count, CUstream h_stream) { CUresult result = 0; - result = cuTexRefSetArray(h_tex_ref, h_array, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_ARRAY) == -1) + result = cuMemcpyPeerAsync(dst_device, dst_context, src_device, src_context, + byte_count, h_stream); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMCPY_PEER_ASYNC) == -1) goto error; return 0; - error: return -1; } /** - * \brief Specifies the filtering mode fm to be used when reading memory through - * the texture reference hTexRef. CUfilter_mode_enum is defined as: - * - * typedef enum CUfilter_mode_enum { - * CU_TR_FILTER_MODE_POINT = 0, - * CU_TR_FILTER_MODE_LINEAR = 1 - * } CUfilter_mode; + * \brief Frees the memory space pointed to by dptr, which must have been + * returned by a previous call to cuMemAlloc() or cuMemAllocPitch(). * - * \param h_tex_ref Texture reference - * \param fm Filtering mode to set + * \param dptr Pointer to the memory to free. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaTexRefSetFilterMode(CUtexref h_tex_ref, CUfilter_mode fm) +int SCCudaMemFree(CUdeviceptr dptr) { CUresult result = 0; - result = cuTexRefSetFilterMode(h_tex_ref, fm); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_FILTER_MODE) == -1) + result = cuMemFree(dptr); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_FREE) == -1) goto error; return 0; error: - return -1; -} - -/** - * \brief Specifies optional flags via Flags to specify the behavior of data - * returned through the texture reference hTexRef. The valid flags are: - * - * * CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of - * having the texture promote integer data to floating point data in - * the range [0, 1]; - * * CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default - * behavior of having the texture coordinates range from [0, Dim) where - * Dim is the width or height of the CUDA array. Instead, the texture - * coordinates [0, 1.0) reference the entire breadth of the array - * dimension; + return -1; +} + +/** + * \brief Frees the memory space pointed to by p, which must have been returned + * by a previous call to cuMemAllocHost(). * - * \param h_tex_ref Texture reference - * \param flags Optional flags to set + * \param p Pointer to the memory to free. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaTexRefSetFlags(CUtexref h_tex_ref, unsigned int flags) +int SCCudaMemFreeHost(void *p) { CUresult result = 0; - result = cuTexRefSetFlags(h_tex_ref, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_FLAGS) == -1) + if (p == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "p is NULL"); + goto error; + } + + result = cuMemFreeHost(p); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_FREE_HOST) == -1) goto error; return 0; @@ -2617,26 +3583,25 @@ int SCCudaTexRefSetFlags(CUtexref h_tex_ref, unsigned int flags) } /** - * \brief Specifies the format of the data to be read by the texture reference - * hTexRef. fmt and NumPackedComponents are exactly analogous to the - * Format and NumChannels members of the CUDA_ARRAY_DESCRIPTOR structure: - * They specify the format of each component and the number of components - * per array element. + * \brief Returns the base address in *pbase and size in *psize of the allocation + * by cuMemAlloc() or cuMemAllocPitch() that contains the input pointer + * dptr. Both parameters pbase and psize are optional. If one of them is + * NULL, it is ignored. * - * \param h_tex_ref Texture reference - * \param fmt Format to set - * \param num_packed_components Number of components per array element + * \param pbase Returned base address. + * \param psize Returned size of device memory allocation. + * \param dptr Device pointer to query * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaTexRefSetFormat(CUtexref h_tex_ref, CUarray_format fmt, - int num_packed_components) +int SCCudaMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, + CUdeviceptr dptr) { CUresult result = 0; - result = cuTexRefSetFormat(h_tex_ref, fmt, num_packed_components); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_FORMAT) == -1) + result = cuMemGetAddressRange(pbase, psize, dptr); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_GET_ADDRESS_RANGE) == -1) goto error; return 0; @@ -2645,47 +3610,28 @@ int SCCudaTexRefSetFormat(CUtexref h_tex_ref, CUarray_format fmt, return -1; } -/***********************Execution_Control_Management_API***********************/ - /** - * \brief Returns in *pi the integer value of the attribute attrib on the - * kernel given by hfunc. The supported attributes are: - * - * - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The number of threads - * beyond which a launch of the function would fail. This number - * depends on both the function and the device on which the - * function is currently loaded. - * - CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of - * statically-allocated shared memory required by this function. - * This does not include dynamically-allocated shared memory - * requested by the user at runtime. - * - CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of - * user-allocated constant memory required by this function. - * - CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of thread - * local memory used by this function. - * - CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each - * thread of this function. + * \brief Returns in *free and *total respectively, the free and total amount + * of memory available for allocation by the CUDA context, in bytes. * - * \param pi Pointer to an integer which would be updated with the returned - * attribute value. - * \param attrib Attribute requested. - * \param hfunc Function to query attribute of. + * \param free Returned free memory in bytes. + * \param total Returned total memory in bytes. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc) +int SCCudaMemGetInfo(size_t *free, size_t *total) { CUresult result = 0; - if (pi == NULL) { + if (free == NULL || total == NULL) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pi is NULL"); + "free is NULL || total is NULL"); goto error; } - result = cuFuncGetAttribute(pi, attrib, hfunc); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_FUNC_GET_ATTRIBUTE) == -1) + result = cuMemGetInfo(free, total); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_GET_INFO) == -1) goto error; return 0; @@ -2695,23 +3641,60 @@ int SCCudaFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfun } /** - * \brief Specifies the x, y, and z dimensions of the thread blocks that are - * created when the kernel given by hfunc is launched. + * \brief Allocates bytesize bytes of host memory that is page-locked and + * accessible to the device. The driver tracks the virtual memory ranges + * allocated with this function and automatically accelerates calls to + * functions such as cuMemcpyHtoD(). Since the memory can be accessed + * directly by the device, it can be read or written with much higher + * bandwidth than pageable memory obtained with functions such as + * SCMalloc(). Allocating excessive amounts of pinned memory may degrade + * system performance, since it reduces the amount of memory available + * to the system for paging. As a result, this function is best used + * sparingly to allocate staging areas for data exchange between host + * and device. * - * \param hfunc Kernel to specify dimensions of. - * \param x X dimension. - * \param y Y dimension. - * \param z Z dimension. + * The Flags parameter enables different options to be specified that + * affect the allocation, as follows. + * + * - CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one + * that performed the allocation. + * - CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA + * address space. The device pointer to the memory may be obtained by + * calling cuMemHostGetDevicePointer(). This feature is available only + * on GPUs with compute capability greater than or equal to 1.1. + * - CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined + * (WC). WC memory can be transferred across the PCI Express bus more + * quickly on some system con?gurations, but cannot be read efficiently + * by most CPUs. WC memory is a good option for buffers that will be + * written by the CPU and read by the GPU via mapped pinned memory or + * host->device transfers. All of these fags are orthogonal to one + * another: a developer may allocate memory that is portable, mapped + * and/or write-combined with no restrictions. + * + * The CUDA context must have been created with the CU_CTX_MAP_HOST flag + * in order for the CU_MEMHOSTALLOC_MAPPED flag to have any effect. + * + * The CU_MEMHOSTALLOC_MAPPED flag may be specified on CUDA contexts for + * devices that do not support mapped pinned memory. The failure is + * deferred to cuMemHostGetDevicePointer() because the memory may be + * mapped into other CUDA contexts via the CU_MEMHOSTALLOC_PORTABLE flag. + * + * The memory allocated by this function must be freed with cuMemFreeHost(). + * + * \param pp Returned host pointer to page-locked memory. + * \param byte_size Requested allocation size in bytes. + * \param flags Flags for allocation request. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaFuncSetBlockShape(CUfunction hfunc, int x, int y, int z) +int SCCudaMemHostAlloc(void **pp, size_t byte_size, unsigned int flags) { CUresult result = 0; - result = cuFuncSetBlockShape(hfunc, x, y, z); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_FUNC_SET_BLOCK_SHAPE) == -1) + result = cuMemHostAlloc(pp, byte_size, flags); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_HOST_ALLOC) == -1) goto error; return 0; @@ -2721,22 +3704,28 @@ int SCCudaFuncSetBlockShape(CUfunction hfunc, int x, int y, int z) } /** - * \brief Sets through bytes the amount of dynamic shared memory that will be - * available to each thread block when the kernel given by hfunc is - * launched. + * \brief Passes back the device pointer pdptr corresponding to the mapped, + * pinned host buffer p allocated by cuMemHostAlloc. * - * \param hfunc Kernel to specify dynamic shared memory for. - * \param bytes Dynamic shared memory size per thread in bytes. + * cuMemHostGetDevicePointer() will fail if the CU_MEMALLOCHOST_DEVICEMAP + * flag was not speci?ed at the time the memory was allocated, or if the + * function is called on a GPU that does not support mapped pinned memory. + * + * Flags provides for future releases. For now, it must be set to 0. + * + * \param pdptr Returned device pointer. + * \param p Host pointer. + * \param flags Options(must be 0). * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaFuncSetSharedSize(CUfunction hfunc, unsigned int bytes) +int SCCudaMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int flags) { CUresult result = 0; - result = cuFuncSetSharedSize(hfunc, bytes); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_FUNC_SET_SHARED_SIZE) == -1) + result = cuMemHostGetDevicePointer(pdptr, p, flags); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_HOST_GET_DEVICE_POINTER) == -1) goto error; return 0; @@ -2746,21 +3735,24 @@ int SCCudaFuncSetSharedSize(CUfunction hfunc, unsigned int bytes) } /** - * \brief Invokes the kernel f on a 1 x 1 x 1 grid of blocks. The block contains - * the number of threads specified by a previous call to - * cuFuncSetBlockShape(). + * \brief Passes back the flags p_flags that were specified when allocating the + * pinned host buffer p allocated by cuMemHostAlloc. * - * \param f Kernel to launch. + * cuMemHostGetFlags() will fail if the pointer does not reside in an + * allocation performed by cuMemAllocHost() or cuMemHostAlloc(). + * + * \param p_flags Returned flags word. + * \param p Host pointer. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaLaunch(CUfunction f) +int SCCudaMemHostGetFlags(unsigned int *p_flags, void *p) { CUresult result = 0; - result = cuLaunch(f); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_LAUNCH) == -1) + result = cuMemHostGetFlags(p_flags, p); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_HOST_GET_FLAGS) == -1) goto error; return 0; @@ -2769,53 +3761,48 @@ int SCCudaLaunch(CUfunction f) return -1; } -/** - * \brief Invokes the kernel f on a grid_width x grid_height grid of blocks. - * Each block contains the number of threads specified by a previous call - * to cuFuncSetBlockShape(). - * - * \param f Kernel to launch. - * \param grid_width Width of grid in blocks. - * \param grib_height Height of grid in blocks. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaLaunchGrid(CUfunction f, int grid_width, int grid_height) +int SCCudaMemHostRegister(void *p, size_t byte_size, unsigned int flags) { CUresult result = 0; - result = cuLaunchGrid(f, grid_width, grid_height); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_LAUNCH_GRID) == -1) + result = cuMemHostRegister(p, byte_size, flags); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_HOST_REGISTER) == -1) goto error; return 0; + error: + return -1; +} + +int SCCudaMemHostUnregister(void *p) +{ + CUresult result = 0; + + result = cuMemHostUnregister(p); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEM_HOST_UNREGISTER) == -1) + goto error; + return 0; error: return -1; } /** - * \brief Invokes the kernel f on a grid_width x grid_height grid of blocks. - * Each block contains the number of threads specified by a previous call - * to cuFuncSetBlockShape(). cuLaunchGridAsync() can optionally be - * associated to a stream by passing a non-zero hStream argument. + * \brief Sets the memory range of N 16-bit values to the speci?ed value us. * - * \param f Kernel to launch. - * \param grid_width Width of grid in blocks. - * \param grib_height Height of grid in blocks. - * \param h_stream Stream identifier. + * \param dst_device Destination device pointer. + * \param us Value to set. + * \param n Number of elements. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaLaunchGridAsync(CUfunction f, int grid_width, int grid_height, - CUstream h_stream) +int SCCudaMemsetD16(CUdeviceptr dst_device, unsigned short us, size_t n) { CUresult result = 0; - result = cuLaunchGridAsync(f, grid_width, grid_height, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_LAUNCH_GRID_ASYNC) == -1) + result = cuMemsetD16(dst_device, us, n); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D16) == -1) goto error; return 0; @@ -2824,74 +3811,90 @@ int SCCudaLaunchGridAsync(CUfunction f, int grid_width, int grid_height, return -1; } -/** - * \brief Sets a foating-point parameter that will be specified the next time - * the kernel corresponding to hfunc will be invoked. offset is a byte - * offset. - * - * \param h_func Kernel to add parameter to. - * \param offset Offset to add parameter to argument list. - * \param value Value of parameter. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaParamSetf(CUfunction h_func, int offset, float value) +int SCCudaMemsetD16Async(CUdeviceptr dst_device, unsigned short us, + size_t n, CUstream h_stream) { CUresult result = 0; - result = cuParamSetf(h_func, offset, value); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SETF) == -1) + result = cuMemsetD16Async(dst_device, us, n, h_stream); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D16_ASYNC) == -1) goto error; return 0; - error: return -1; } /** - * \brief Sets an integer parameter that will be specified the next time - * the kernel corresponding to hfunc will be invoked. offset is a byte - * offset. + * \brief Sets the 2D memory range of Width 16-bit values to the specified + * value us. Height specifies the number of rows to set, and dst_pitch + * specifies the number of bytes between each row. This function + * performs fastest when the pitch is one that has been passed back + * by cuMemAllocPitch(). * - * \param h_func Kernel to add parameter to. - * \param offset Offset to add parameter to argument list. - * \param value Value of parameter. + * \param dst_device Destination device pointer. + * \param dst_pitch Pitch of destination device pointer. + * \param us Value to set + * \param width Width of row. + * \param height Number of rows * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaParamSeti(CUfunction h_func, int offset, unsigned int value) +int SCCudaMemsetD2D16(CUdeviceptr dst_device, size_t dst_pitch, + unsigned short us, size_t width, + size_t height) +{ + CUresult result = 0; + + result = cuMemsetD2D16(dst_device, dst_pitch, us, width, height); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D16) == -1) + goto error; + + return 0; + + error: + return -1; +} + +int SCCudaMemsetD2D16Async(CUdeviceptr dst_device, size_t dst_pitch, + unsigned short us, size_t width, + size_t height, CUstream h_stream) { CUresult result = 0; - result = cuParamSeti(h_func, offset, value); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SETI) == -1) + result = cuMemsetD2D16Async(dst_device, dst_pitch, us, width, height, + h_stream); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D16_ASYNC) == -1) goto error; return 0; - error: return -1; } /** - * \brief Sets through numbytes the total size in bytes needed by the function - * parameters of the kernel corresponding to hfunc. + * \brief Sets the 2D memory range of Width 32-bit values to the specified value + * ui. Height speci?es the number of rows to set, and dstPitch specifies + * the number of bytes between each row. This function performs fastest + * when the pitch is one that has been passed back by cuMemAllocPitch(). * - * \param h_func Kernel to set parameter size for. - * \param num_bytes Size of paramter list in bytes. + * \param dst_device Destination device pointer. + * \param dst_pitch Pitch of destination device pointer. + * \param ui Value to set + * \param width Width of row. + * \param height Number of rows * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaParamSetSize(CUfunction h_func, unsigned int num_bytes) +int SCCudaMemsetD2D32(CUdeviceptr dst_device, size_t dst_pitch, + unsigned int ui, size_t width, size_t height) { CUresult result = 0; - result = cuParamSetSize(h_func, num_bytes); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SET_SIZE) == -1) + result = cuMemsetD2D32(dst_device, dst_pitch, ui, width, height); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D32) == -1) goto error; return 0; @@ -2900,59 +3903,44 @@ int SCCudaParamSetSize(CUfunction h_func, unsigned int num_bytes) return -1; } -/** - * \brief Makes the CUDA array or linear memory bound to the texture reference - * h_tex_ref available to a device program as a texture. In this version - * of CUDA, the texture-reference must be obtained via cuModuleGetTexRef() - * and the tex_unit parameter must be set to CU_PARAM_TR_DEFAULT. - * - * \param h_func Kernel to add texture-reference to. - * \param tex_unit Texture unit (must be CU_PARAM_TR_DEFAULT). - * \param h_tex_ref Texture-reference to add to argument list. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaParamSetTexRef(CUfunction h_func, int tex_unit, CUtexref h_tex_ref) +int SCCudaMemsetD2D32Async(CUdeviceptr dst_device, size_t dst_pitch, + unsigned int ui, size_t width, size_t height, + CUstream h_stream) { CUresult result = 0; - result = cuParamSetTexRef(h_func, tex_unit, h_tex_ref); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SET_TEX_REF) == -1) + result = cuMemsetD2D32Async(dst_device, dst_pitch, ui, width, height, + h_stream); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D32_ASYNC) == -1) goto error; return 0; - error: return -1; } /** - * \brief Copies an arbitrary amount of data (specified in numbytes) from ptr - * into the parameter space of the kernel corresponding to hfunc. - * offset is a byte offset. + * \brief Sets the 2D memory range of Width 8-bit values to the specified value + * uc. Height speci?es the number of rows to set, and dstPitch specifies + * the number of bytes between each row. This function performs fastest + * when the pitch is one that has been passed back by cuMemAllocPitch(). * - * \param h_func Kernel to add data to. - * \param offset Offset to add data to argument list. - * \param ptr Pointer to arbitrary data. - * \param num_bytes Size of data to copy in bytes. + * \param dst_device Destination device pointer. + * \param dst_pitch Pitch of destination device pointer. + * \param uc Value to set + * \param width Width of row. + * \param height Number of rows * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaParamSetv(CUfunction h_func, int offset, void *ptr, - unsigned int num_bytes) +int SCCudaMemsetD2D8(CUdeviceptr dst_device, size_t dst_pitch, + unsigned char uc, size_t width, size_t height) { CUresult result = 0; - if (ptr == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "ptr is NULL"); - goto error; - } - - result = cuParamSetv(h_func, offset, ptr, num_bytes); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SETV) == -1) + result = cuMemsetD2D8(dst_device, dst_pitch, uc, width, height); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D8) == -1) goto error; return 0; @@ -2961,56 +3949,38 @@ int SCCudaParamSetv(CUfunction h_func, int offset, void *ptr, return -1; } -/*****************************Event_Management_API*****************************/ - -/** - * \brief Creates an event *ph_event with the flags specified via flags. Valid - * flags include: - * - * CU_EVENT_DEFAULT: Default event creation flag. - * CU_EVENT_BLOCKING_SYNC: Specifies that event should use blocking - * synchronization. - * - * \param ph_event Returns newly created event. - * \param flags Event creation flags. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaEventCreate(CUevent *ph_event, unsigned int flags) +int SCCudaMemsetD2D8Async(CUdeviceptr dst_device, size_t dst_pitch, + unsigned char uc, size_t width, size_t height, + CUstream h_stream) { CUresult result = 0; - if (ph_event == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "ph_event is NULL"); - goto error; - } - - result = cuEventCreate(ph_event, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_CREATE) == -1) + result = cuMemsetD2D8Async(dst_device, dst_pitch, uc, width, height, + h_stream); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D2_D8_ASYNC) == -1) goto error; return 0; - error: return -1; } /** - * \brief Destroys the event specified by h_event. + * \brief Sets the memory range of N 32-bit values to the specified value ui. * - * \param h_event Event to destroy. + * \param dst_device Destination device pointer. + * \param ui Value to set. + * \param n Number of elements. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaEventDestroy(CUevent h_event) +int SCCudaMemsetD32(CUdeviceptr dst_device, unsigned int ui, size_t n) { CUresult result = 0; - result = cuEventDestroy(h_event); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_DESTROY) == -1) + result = cuMemsetD32(dst_device, ui, n); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D32) == -1) goto error; return 0; @@ -3019,56 +3989,36 @@ int SCCudaEventDestroy(CUevent h_event) return -1; } -/** - * \brief Computes the elapsed time between two events (in milliseconds with - * a resolution of around 0.5 microseconds). If either event has not - * been recorded yet, this function returns CUDA_ERROR_NOT_READY. If - * either event has been recorded with a non-zero stream, the result - * is undefined. - * - * \param p_milli_seconds Returned elapsed time in milliseconds. - * \param h_start Starting event. - * \param h_end Ending event. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaEventElapsedTime(float *p_milli_seconds, CUevent h_start, CUevent h_end) +int SCCudaMemsetD32Async(CUdeviceptr dst_device, unsigned int ui, + size_t n, CUstream h_stream) { CUresult result = 0; - if (p_milli_seconds == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_milli_seconds is NULL"); - goto error; - } - - result = cuEventElapsedTime(p_milli_seconds, h_start, h_end); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_ELAPSED_TIME) == -1) + result = cuMemsetD32Async(dst_device, ui, n, h_stream); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D32_ASYNC) == -1) goto error; return 0; - error: return -1; } /** - * \brief Returns CUDA_SUCCESS if the event has actually been recorded, or - * CUDA_ERROR_NOT_READY if not. If cuEventRecord() has not been called - * on this event, the function returns CUDA_ERROR_INVALID_VALUE. + * \brief Sets the memory range of N 8-bit values to the specified value ui. * - * \param h_event Event to query. + * \param dst_device Destination device pointer. + * \param uc Value to set. + * \param n Number of elements. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaEventQuery(CUevent h_event) +int SCCudaMemsetD8(CUdeviceptr dst_device, unsigned char uc, size_t n) { CUresult result = 0; - result = cuEventQuery(h_event); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_QUERY) == -1) + result = cuMemsetD8(dst_device, uc, n); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D8) == -1) goto error; return 0; @@ -3077,60 +4027,32 @@ int SCCudaEventQuery(CUevent h_event) return -1; } -/** - * \brief Records an event. If stream is non-zero, the event is recorded after - * all preceding operations in the stream have been completed; otherwise, - * it is recorded after all preceding operations in the CUDA context have - * been completed. Since operation is asynchronous, cuEventQuery() and/or - * cuEventSynchronize() must be used to determine when the event has - * actually been recorded. - * - * If cuEventRecord() has previously been called and the event has not - * been recorded yet, this function returns CUDA_ERROR_INVALID_VALUE. - * - * \param h_event Event to record. - * \param h_stream Stream to record event for. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaEventRecord(CUevent h_event, CUstream h_stream) +int SCCudaMemsetD8Async(CUdeviceptr dst_device, unsigned char uc, + size_t n, CUstream h_stream) { CUresult result = 0; - result = cuEventRecord(h_event, h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_RECORD) == -1) + result = cuMemsetD8Async(dst_device, uc, n, h_stream); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_MEMSET_D8_ASYNC) == -1) goto error; return 0; - error: return -1; } -/** - * \brief Waits until the event has actually been recorded. If cuEventRecord() - * has been called on this event, the function returns - * CUDA_ERROR_INVALID_VALUE. - * - * If cuEventRecord() has previously been called and the event has not - * been recorded yet, this function returns CUDA_ERROR_INVALID_VALUE. - * - * \param h_event Event to wait for. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaEventSynchronize(CUevent h_event) +/*****************************Unified_Addressing_API****************************/ + +int SCCudaPointerGetAttribute(void *data, CUpointer_attribute attribute, + CUdeviceptr ptr) { CUresult result = 0; - result = cuEventSynchronize(h_event); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_SYNCHRONIZE) == -1) + result = cuPointerGetAttribute(data, attribute, ptr); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_POINTER_GET_ATTRIBUTE) == -1) goto error; return 0; - error: return -1; } @@ -3226,77 +4148,7 @@ int SCCudaStreamSynchronize(CUstream h_stream) CUresult result = 0; result = cuStreamSynchronize(h_stream); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_STREAM_SYNCHRONIZE) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/*****************************Module_Management_API****************************/ - -/** - * \brief Returns in *hfunc the handle of the function of name \"name\" located - * in module hmod. If no function of that name exists, - * cuModuleGetFunction() returns CUDA_ERROR_NOT_FOUND. - * - * \param hfunc Returned function handle. - * \param hmod Module to return function from. - * \param name Name of function to retrieve. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name) -{ - CUresult result = 0; - - if (hfunc == NULL || name == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "hfunc is NULL or name is NULL"); - goto error; - } - - result = cuModuleGetFunction(hfunc, hmod, name); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_GET_FUNCTION) == -1) - goto error; - - return 0; - - error: - return -1; -} - -/** - * \brief Returns in *dptr and *bytes the base pointer and size of the global - * name \"name\" located in module hmod. If no variable of that name - * exists, cuModuleGetGlobal() returns CUDA_ERROR_NOT_FOUND. Both - * parameters dptr and bytes are optional. If one of them is NULL, - * it is ignored. - * - * \param dptr Returned global device pointer. - * \param bytes Returned global size in bytes. - * \param hmod Module to return function from. - * \param name Name of global to retrieve. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, - const char *name) -{ - CUresult result = 0; - - if (name == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "name is NULL"); - goto error; - } - - result = cuModuleGetGlobal(dptr, bytes, hmod, name); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_GET_GLOBAL) == -1) + if (SCCudaHandleRetValue(result, SC_CUDA_CU_STREAM_SYNCHRONIZE) == -1) goto error; return 0; @@ -3305,66 +4157,48 @@ int SCCudaModuleGetGlobal(CUdeviceptr *dptr, unsigned int *bytes, CUmodule hmod, return -1; } -/** - * \brief Returns in *p_tex_ref the handle of the texture reference of name - * \"name\" in the module hmod. If no texture reference of that name - * exists, cuModuleGetTexRef() returns CUDA_ERROR_NOT_FOUND. This texture - * reference handle should not be destroyed, since it will be destroyed - * when the module is unloaded. - * - * \param p_tex_ref Returned global device pointer. - * \param hmod Module to retrieve texture reference from. - * \param name Name of the texture reference to retrieve. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaModuleGetTexRef(CUtexref *p_tex_ref, CUmodule hmod, const char *name) +int SCCudaStreamWaitEvent(CUstream h_stream, CUevent h_event, + unsigned int flags) { CUresult result = 0; - if (p_tex_ref == NULL || name == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "p_tex_ref is NULL or name is NULL"); - goto error; - } - - result = cuModuleGetTexRef(p_tex_ref, hmod, name); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_GET_TEX_REF) == -1) + result = cuStreamWaitEvent(h_stream, h_event, flags); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_STREAM_WAIT_EVENT) == -1) goto error; return 0; - error: return -1; } +/*****************************Event_Management_API*****************************/ + /** - * \brief Takes a filename fname and loads the corresponding module \"module\" - * into the current context. The CUDA driver API does not attempt to - * lazily allocate the resources needed by a module; if the memory for - * functions and data (constant and global) needed by the module cannot - * be allocated, cuModuleLoad() fails. The file should be a cubin file - * as output by nvcc or a PTX file, either as output by nvcc or handwrtten. + * \brief Creates an event *ph_event with the flags specified via flags. Valid + * flags include: * - * \param module Returned module. - * \param fname Filename of module to load. + * CU_EVENT_DEFAULT: Default event creation flag. + * CU_EVENT_BLOCKING_SYNC: Specifies that event should use blocking + * synchronization. + * + * \param ph_event Returns newly created event. + * \param flags Event creation flags. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaModuleLoad(CUmodule *module, const char *fname) +int SCCudaEventCreate(CUevent *ph_event, unsigned int flags) { CUresult result = 0; - if (module == NULL || fname == NULL) { + if (ph_event == NULL) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "module is NULL or fname is NULL"); + "ph_event is NULL"); goto error; } - result = cuModuleLoad(module, fname); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_LOAD) == -1) + result = cuEventCreate(ph_event, flags); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_CREATE) == -1) goto error; return 0; @@ -3374,31 +4208,19 @@ int SCCudaModuleLoad(CUmodule *module, const char *fname) } /** - * \brief Takes a pointer image and loads the corresponding module \"module\" - * into the current context. The pointer may be obtained by mapping a - * cubin or PTX file, passing a cubin or PTX ?le as a NULL-terminated - * text string, or incorporating a cubin object into the executable - * resources and using operating system calls such as Windows - * FindResource() to obtain the pointer. + * \brief Destroys the event specified by h_event. * - * \param module Returned module. - * \param image Module data to load + * \param h_event Event to destroy. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaModuleLoadData(CUmodule *module, const char *image) +int SCCudaEventDestroy(CUevent h_event) { CUresult result = 0; - if (module == NULL || image == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "module is NULL or image is NULL"); - goto error; - } - - result = cuModuleLoadData(module, image); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_LOAD_DATA) == -1) + result = cuEventDestroy(h_event); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_DESTROY) == -1) goto error; return 0; @@ -3408,68 +4230,31 @@ int SCCudaModuleLoadData(CUmodule *module, const char *image) } /** - * \brief Takes a pointer image and loads the corresponding module module into - * the current context. The pointer may be obtained by mapping a cubin or - * PTX file, passing a cubin or PTX file as a NULL-terminated text - * string, or incorporating a cubin object into the executable resources - * and using operating system calls such as Windows FindResource() to - * obtain the pointer. Options are passed as an array via options and any - * corresponding parameters are passed in optionValues. The number of - * total options is supplied via numOptions. Any outputs will be returned - * via optionValues. Supported options are: - * - * - CU_JIT_MAX_REGISTERS: input specifies the maximum number of registers - * per thread; - * - CU_JIT_THREADS_PER_BLOCK: input specifies number of threads per block - * to target compilation for; output returns the number of threads - * the compiler actually targeted; - * - CU_JIT_WALL_TIME: output returns the float value of wall clock time, - * in milliseconds, spent compiling the PTX code; - * - CU_JIT_INFO_LOG_BUFFER: input is a pointer to a buffer in which to - * print any informational log messages from PTX assembly; - * - CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES: input is the size in bytes of the - * buffer; output is the number of bytes filled with messages; - * - CU_JIT_ERROR_LOG_BUFFER: input is a pointer to a buffer in which to - * print any error log messages from PTX assembly; - * - CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES: input is the size in bytes of the - * buffer; output is the number of bytes filled with messages; - * - CU_JIT_OPTIMIZATION_LEVEL: input is the level of optimization to apply - * to generated code (0 - 4), with 4 being the default and highest - * level; - * - CU_JIT_TARGET_FROM_CUCONTEXT: causes compilation target to be - * determined based on current attached context (default); - * - CU_JIT_TARGET: input is the compilation target based on supplied - * CUjit_target_enum; possible values are: - * -- CU_TARGET_COMPUTE_10 - * -- CU_TARGET_COMPUTE_11 - * -- CU_TARGET_COMPUTE_12 - * -- CU_TARGET_COMPUTE_13 + * \brief Computes the elapsed time between two events (in milliseconds with + * a resolution of around 0.5 microseconds). If either event has not + * been recorded yet, this function returns CUDA_ERROR_NOT_READY. If + * either event has been recorded with a non-zero stream, the result + * is undefined. * - * \param module Returned module. - * \param image Module data to load. - * \param numOptions Number of options. - * \param options Options for JIT. - * \param optionValues Option values for JIT. + * \param p_milli_seconds Returned elapsed time in milliseconds. + * \param h_start Starting event. + * \param h_end Ending event. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaModuleLoadDataEx(CUmodule *module, const char *image, - unsigned int num_options, CUjit_option *options, - void **option_values) +int SCCudaEventElapsedTime(float *p_milli_seconds, CUevent h_start, CUevent h_end) { CUresult result = 0; - if (module == NULL || image == NULL || options == NULL || - option_values == NULL) { + if (p_milli_seconds == NULL) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "module is NULL or image is NULL or options is NULL or " - "option_values is NULL"); + "p_milli_seconds is NULL"); goto error; } - result = cuModuleLoadDataEx(module, image, num_options, options, option_values); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_LOAD_DATA_EX) == -1) + result = cuEventElapsedTime(p_milli_seconds, h_start, h_end); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_ELAPSED_TIME) == -1) goto error; return 0; @@ -3479,33 +4264,21 @@ int SCCudaModuleLoadDataEx(CUmodule *module, const char *image, } /** - * \brief Takes a pointer fat_cubin and loads the corresponding module \"module\" - * into the current context. The pointer represents a fat binary object, - * which is a collection of different cubin files, all representing the - * same device code, but compiled and optimized for different - * architectures. There is currently no documented API for constructing - * and using fat binary objects by programmers, and therefore this - * function is an internal function in this version of CUDA. More - * information can be found in the nvcc document. + * \brief Returns CUDA_SUCCESS if the event has actually been recorded, or + * CUDA_ERROR_NOT_READY if not. If cuEventRecord() has not been called + * on this event, the function returns CUDA_ERROR_INVALID_VALUE. * - * \param module Returned module. - * \param fatCubin Fat binary to load. + * \param h_event Event to query. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaModuleLoadFatBinary(CUmodule *module, const void *fat_cubin) +int SCCudaEventQuery(CUevent h_event) { CUresult result = 0; - if (module == NULL || fat_cubin == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "module is NULL or fatCubin is NULL"); - goto error; - } - - result = cuModuleLoadFatBinary(module, fat_cubin); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_LOAD_FAT_BINARY) == -1) + result = cuEventQuery(h_event); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_QUERY) == -1) goto error; return 0; @@ -3515,19 +4288,28 @@ int SCCudaModuleLoadFatBinary(CUmodule *module, const void *fat_cubin) } /** - * \brief Unloads a module hmod from the current context. + * \brief Records an event. If stream is non-zero, the event is recorded after + * all preceding operations in the stream have been completed; otherwise, + * it is recorded after all preceding operations in the CUDA context have + * been completed. Since operation is asynchronous, cuEventQuery() and/or + * cuEventSynchronize() must be used to determine when the event has + * actually been recorded. * - * \param module Module to unload + * If cuEventRecord() has previously been called and the event has not + * been recorded yet, this function returns CUDA_ERROR_INVALID_VALUE. + * + * \param h_event Event to record. + * \param h_stream Stream to record event for. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaModuleUnload(CUmodule hmod) +int SCCudaEventRecord(CUevent h_event, CUstream h_stream) { CUresult result = 0; - result = cuModuleUnload(hmod); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_MODULE_UNLOAD) == -1) + result = cuEventRecord(h_event, h_stream); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_RECORD) == -1) goto error; return 0; @@ -3536,33 +4318,25 @@ int SCCudaModuleUnload(CUmodule hmod) return -1; } -/*****************************Context_Management_API***************************/ - /** - * \brief Increments the usage count of the context and passes back a context - * handle in *pctx that must be passed to cuCtxDetach() when the - * application is done with the context. cuCtxAttach() fails if there is - * no context current to the thread. Currently, the flags parameter must - * be 0. + * \brief Waits until the event has actually been recorded. If cuEventRecord() + * has been called on this event, the function returns + * CUDA_ERROR_INVALID_VALUE. * - * \param pctx Returned context handle of the current context. - * \param flags Context attach flags (must be 0). + * If cuEventRecord() has previously been called and the event has not + * been recorded yet, this function returns CUDA_ERROR_INVALID_VALUE. + * + * \param h_event Event to wait for. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaCtxAttach(CUcontext *pctx, unsigned int flags) +int SCCudaEventSynchronize(CUevent h_event) { CUresult result = 0; - if (pctx == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pctx NULL"); - goto error; - } - - result = cuCtxAttach(pctx, flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_ATTACH) == -1) + result = cuEventSynchronize(h_event); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_EVENT_SYNCHRONIZE) == -1) goto error; return 0; @@ -3571,68 +4345,47 @@ int SCCudaCtxAttach(CUcontext *pctx, unsigned int flags) return -1; } +/***********************Execution_Control_Management_API***********************/ + /** - * \brief Creates a new CUDA context and associates it with the calling thread. - * The flags parameter is described below. The context is created with - * a usage count of 1 and the caller of cuCtxCreate() must call - * cuCtxDestroy() or cuCtxDetach() when done using the context. If a - * context is already current to the thread, it is supplanted by the - * newly created context and may be restored by a subsequent call to - * cuCtxPopCurrent(). The two LSBs of the flags parameter can be used - * to control how the OS thread, which owns the CUDA context at the - * time of an API call, interacts with the OS scheduler when waiting for - * results from the GPU. - * - * - CU_CTX_SCHED_AUTO: The default value if the flags parameter is zero, - * uses a heuristic based on the number of active CUDA contexts in - * the process C and the number of logical processors in the system - * P. If C > P, then CUDA will yield to other OS threads when - * waiting for the GPU, otherwise CUDA will not yield while waiting - * for results and actively spin on the processor. - * - CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for - * results from the GPU. This can de-crease latency when waiting for - * the GPU, but may lower the performance of CPU threads if they are - * performing work in parallel with the CUDA thread. - * - CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting - * for results from the GPU. This can increase latency when waiting - * for the GPU, but can increase the performance of CPU threads - * performing work in parallel with the GPU. - * - CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a - * synchronization primitive when waiting for the GPU to finish work. - * - CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations. - * This flag must be set in order to allocate pinned host memory - * that is accessible to the GPU. + * \brief Returns in *pi the integer value of the attribute attrib on the + * kernel given by hfunc. The supported attributes are: * - * Note to Linux users: - * Context creation will fail with CUDA_ERROR_UNKNOWN if the compute mode - * of the device is CU_COMPUTEMODE_PROHIBITED. Similarly, context creation - * will also fail with CUDA_ERROR_UNKNOWN if the compute mode for the - * device is set to CU_COMPUTEMODE_EXCLUSIVE and there is already an - * active context on the device. The function cuDeviceGetAttribute() can - * be used with CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the compute - * mode of the device. The nvidia-smi tool can be used to set the compute - * mode for devices. Documentation for nvidia-smi can be obtained by - * passing a -h option to it. + * - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The number of threads + * beyond which a launch of the function would fail. This number + * depends on both the function and the device on which the + * function is currently loaded. + * - CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of + * statically-allocated shared memory required by this function. + * This does not include dynamically-allocated shared memory + * requested by the user at runtime. + * - CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of + * user-allocated constant memory required by this function. + * - CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of thread + * local memory used by this function. + * - CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each + * thread of this function. * - * \param pctx Returned context handle of the current context. - * \param flags Context creation flags. - * \param dev Device to create context on. + * \param pi Pointer to an integer which would be updated with the returned + * attribute value. + * \param attrib Attribute requested. + * \param hfunc Function to query attribute of. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev) +int SCCudaFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc) { CUresult result = 0; - if (pctx == NULL) { + if (pi == NULL) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "pctx NULL"); + "pi is NULL"); goto error; } - result = cuCtxCreate(pctx, flags, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_CREATE) == -1) + result = cuFuncGetAttribute(pi, attrib, hfunc); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_FUNC_GET_ATTRIBUTE) == -1) goto error; return 0; @@ -3641,49 +4394,56 @@ int SCCudaCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev) return -1; } -/** - * \brief Destroys the CUDA context specified by ctx. If the context usage count - * is not equal to 1, or the context is current to any CPU thread other - * than the current one, this function fails. Floating contexts (detached - * from a CPU thread via cuCtxPopCurrent()) may be destroyed by this - * function. - * - * \param ctx Context to destroy. - * - * \retval 0 On success. - * \retval -1 On failure. - */ -int SCCudaCtxDestroy(CUcontext ctx) +int SCCudaFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config) { CUresult result = 0; - result = cuCtxDestroy(ctx); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_DESTROY) == -1) + result = cuFuncSetCacheConfig(hfunc, config); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_FUNC_SET_CACHE_CONFIG) == -1) goto error; return 0; + error: + return -1; +} + +int SCCudaLaunchKernel(CUfunction f, unsigned int grid_dim_x, + unsigned int grid_dim_y, unsigned int grid_dim_z, + unsigned int block_dim_x, unsigned int block_dim_y, + unsigned int block_dim_z, unsigned int shared_mem_bytes, + CUstream h_stream, void **kernel_params, void **extra) +{ + CUresult result = 0; + + result = cuLaunchKernel(f, grid_dim_x, grid_dim_y, grid_dim_z, + block_dim_x, block_dim_y, block_dim_z, + shared_mem_bytes, h_stream, kernel_params, extra); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_LAUNCH_KERNEL) == -1) + goto error; + return 0; error: return -1; } /** - * \brief Decrements the usage count of the context ctx, and destroys the - * context if the usage count goes to 0. The context must be a handle - * that was passed back by cuCtxCreate() or cuCtxAttach(), and must be - * current to the calling thread. + * \brief Specifies the x, y, and z dimensions of the thread blocks that are + * created when the kernel given by hfunc is launched. * - * \param ctx Context to destroy. + * \param hfunc Kernel to specify dimensions of. + * \param x X dimension. + * \param y Y dimension. + * \param z Z dimension. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaCtxDetach(CUcontext ctx) +int SCCudaFuncSetBlockShape(CUfunction hfunc, int x, int y, int z) { CUresult result = 0; - result = cuCtxDetach(ctx); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_DETACH) == -1) + result = cuFuncSetBlockShape(hfunc, x, y, z); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_FUNC_SET_BLOCK_SHAPE) == -1) goto error; return 0; @@ -3693,25 +4453,22 @@ int SCCudaCtxDetach(CUcontext ctx) } /** - * \brief Returns in *device the ordinal of the current context's device. + * \brief Sets through bytes the amount of dynamic shared memory that will be + * available to each thread block when the kernel given by hfunc is + * launched. * - * \param device Returned device id for the current context. + * \param hfunc Kernel to specify dynamic shared memory for. + * \param bytes Dynamic shared memory size per thread in bytes. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaCtxGetDevice(CUdevice *device) +int SCCudaFuncSetSharedSize(CUfunction hfunc, unsigned int bytes) { CUresult result = 0; - if (device == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "device NULL"); - goto error; - } - - result = cuCtxGetDevice(device); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_GET_DEVICE) == -1) + result = cuFuncSetSharedSize(hfunc, bytes); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_FUNC_SET_SHARED_SIZE) == -1) goto error; return 0; @@ -3721,32 +4478,21 @@ int SCCudaCtxGetDevice(CUdevice *device) } /** - * \brief Pops the current CUDA context from the CPU thread. The CUDA context - * must have a usage count of 1. CUDA contexts have a usage count of 1 - * upon creation; the usage count may be incremented with cuCtxAttach() - * and decremented with cuCtxDetach(). - * - * If successful, cuCtxPopCurrent() passes back the new context handle - * in *pctx. The old context may then be made current to a different CPU - * thread by calling cuCtxPushCurrent(). - * - * Floating contexts may be destroyed by calling cuCtxDestroy(). - * - * If a context was current to the CPU thread before cuCtxCreate() or - * cuCtxPushCurrent() was called, this function makes that context - * current to the CPU thread again. + * \brief Invokes the kernel f on a 1 x 1 x 1 grid of blocks. The block contains + * the number of threads specified by a previous call to + * cuFuncSetBlockShape(). * - * \param pctx Returned new context handle. + * \param f Kernel to launch. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaCtxPopCurrent(CUcontext *pctx) +int SCCudaLaunch(CUfunction f) { CUresult result = 0; - result = cuCtxPopCurrent(pctx); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_POP_CURRENT) == -1) + result = cuLaunch(f); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_LAUNCH) == -1) goto error; return 0; @@ -3756,28 +4502,23 @@ int SCCudaCtxPopCurrent(CUcontext *pctx) } /** - * \brief Pushes the given context ctx onto the CPU thread's stack of current - * contexts. The speci?ed context becomes the CPU thread's current - * context, so all CUDA functions that operate on the current context - * are affected. - * - * The previous current context may be made current again by calling - * cuCtxDestroy() or cuCtxPopCurrent(). - * - * The context must be "floating," i.e. not attached to any thread. - * Contexts are made to float by calling cuCtxPopCurrent(). + * \brief Invokes the kernel f on a grid_width x grid_height grid of blocks. + * Each block contains the number of threads specified by a previous call + * to cuFuncSetBlockShape(). * - * \param ctx Floating context to attach. + * \param f Kernel to launch. + * \param grid_width Width of grid in blocks. + * \param grib_height Height of grid in blocks. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaCtxPushCurrent(CUcontext ctx) +int SCCudaLaunchGrid(CUfunction f, int grid_width, int grid_height) { CUresult result = 0; - result = cuCtxPushCurrent(ctx); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_PUSH_CURRENT) == -1) + result = cuLaunchGrid(f, grid_width, grid_height); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_LAUNCH_GRID) == -1) goto error; return 0; @@ -3787,18 +4528,26 @@ int SCCudaCtxPushCurrent(CUcontext ctx) } /** - * \brief Blocks until the device has completed all preceding requested tasks. - * cuCtxSynchronize() returns an error if one of the preceding tasks failed. + * \brief Invokes the kernel f on a grid_width x grid_height grid of blocks. + * Each block contains the number of threads specified by a previous call + * to cuFuncSetBlockShape(). cuLaunchGridAsync() can optionally be + * associated to a stream by passing a non-zero hStream argument. + * + * \param f Kernel to launch. + * \param grid_width Width of grid in blocks. + * \param grib_height Height of grid in blocks. + * \param h_stream Stream identifier. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaCtxSynchronize(void) +int SCCudaLaunchGridAsync(CUfunction f, int grid_width, int grid_height, + CUstream h_stream) { CUresult result = 0; - result = cuCtxSynchronize(); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_CTX_SYNCHRONIZE) == -1) + result = cuLaunchGridAsync(f, grid_width, grid_height, h_stream); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_LAUNCH_GRID_ASYNC) == -1) goto error; return 0; @@ -3807,30 +4556,24 @@ int SCCudaCtxSynchronize(void) return -1; } -/*****************************Version_Management_API***************************/ - /** - * \brief Returns in *driver_version the version number of the installed CUDA - * driver. This function automatically returns CUDA_ERROR_INVALID_VALUE - * if the driverVersion argument is NULL. + * \brief Sets a foating-point parameter that will be specified the next time + * the kernel corresponding to hfunc will be invoked. offset is a byte + * offset. * - * \param driver_version Returns the CUDA driver version. + * \param h_func Kernel to add parameter to. + * \param offset Offset to add parameter to argument list. + * \param value Value of parameter. * * \retval 0 On success. * \retval -1 On failure. */ -int SCCudaDriverGetVersion(int *driver_version) +int SCCudaParamSetf(CUfunction h_func, int offset, float value) { CUresult result = 0; - if (driver_version == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "driver_version NULL"); - goto error; - } - - result = cuDriverGetVersion(driver_version); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DRIVER_GET_VERSION) == -1) + result = cuParamSetf(h_func, offset, value); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SETF) == -1) goto error; return 0; @@ -3839,30 +4582,24 @@ int SCCudaDriverGetVersion(int *driver_version) return -1; } -/*****************************Device_Management_API****************************/ - /** - * \internal - * \brief Gets the total no of devices with compute capability greater than or - * equal to 1.0 that are available for execution. + * \brief Sets an integer parameter that will be specified the next time + * the kernel corresponding to hfunc will be invoked. offset is a byte + * offset. * - * \param count Pointer to an integer that will be updated with the device count. + * \param h_func Kernel to add parameter to. + * \param offset Offset to add parameter to argument list. + * \param value Value of parameter. * * \retval 0 On success. * \retval -1 On failure. */ -static int SCCudaDeviceGetCount(int *count) +int SCCudaParamSeti(CUfunction h_func, int offset, unsigned int value) { CUresult result = 0; - if (count == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "count NULL"); - goto error; - } - - result = cuDeviceGetCount(count); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_COUNT) == -1) + result = cuParamSeti(h_func, offset, value); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SETI) == -1) goto error; return 0; @@ -3872,29 +4609,21 @@ static int SCCudaDeviceGetCount(int *count) } /** - * \internal - * \brief Returns a device handle given an ordinal in the range - * [0, cuDeviceGetCount() - 1]. + * \brief Sets through numbytes the total size in bytes needed by the function + * parameters of the kernel corresponding to hfunc. * - * \param device Pointer to a CUDevice instance that will be updated with the - * device handle. - * \param ordinal An index in the range [0, cuDeviceGetCount() - 1]. + * \param h_func Kernel to set parameter size for. + * \param num_bytes Size of paramter list in bytes. * * \retval 0 On success. * \retval -1 On failure. */ -static int SCCudaDeviceGet(CUdevice *device, int ordinal) +int SCCudaParamSetSize(CUfunction h_func, unsigned int num_bytes) { CUresult result = 0; - if (device == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "device NULL"); - goto error; - } - - result = cuDeviceGet(device, ordinal); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET) == -1) + result = cuParamSetSize(h_func, num_bytes); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SET_SIZE) == -1) goto error; return 0; @@ -3904,28 +4633,24 @@ static int SCCudaDeviceGet(CUdevice *device, int ordinal) } /** - * \internal - * \brief Returns the device name, given the device handle. + * \brief Makes the CUDA array or linear memory bound to the texture reference + * h_tex_ref available to a device program as a texture. In this version + * of CUDA, the texture-reference must be obtained via cuModuleGetTexRef() + * and the tex_unit parameter must be set to CU_PARAM_TR_DEFAULT. * - * \param name Pointer to a char buffer which will be updated with the device name. - * \param len Length of the above buffer. - * \param dev The device handle. + * \param h_func Kernel to add texture-reference to. + * \param tex_unit Texture unit (must be CU_PARAM_TR_DEFAULT). + * \param h_tex_ref Texture-reference to add to argument list. * * \retval 0 On success. * \retval -1 On failure. */ -static int SCCudaDeviceGetName(char *name, int len, CUdevice dev) +int SCCudaParamSetTexRef(CUfunction h_func, int tex_unit, CUtexref h_tex_ref) { CUresult result = 0; - if (name == NULL || len == 0) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "name is NULL or len is 0"); - goto error; - } - - result = cuDeviceGetName(name, len, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_NAME) == -1) + result = cuParamSetTexRef(h_func, tex_unit, h_tex_ref); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SET_TEX_REF) == -1) goto error; return 0; @@ -3935,29 +4660,31 @@ static int SCCudaDeviceGetName(char *name, int len, CUdevice dev) } /** - * \internal - * \brief Returns the total amount of memory availabe on the device which - * is sent as the argument. + * \brief Copies an arbitrary amount of data (specified in numbytes) from ptr + * into the parameter space of the kernel corresponding to hfunc. + * offset is a byte offset. * - * \param bytes Pointer to an unsigned int instance, that will be updated with - * total memory for the device. - * \param dev The device handle. + * \param h_func Kernel to add data to. + * \param offset Offset to add data to argument list. + * \param ptr Pointer to arbitrary data. + * \param num_bytes Size of data to copy in bytes. * * \retval 0 On success. * \retval -1 On failure. */ -static int SCCudaDeviceTotalMem(unsigned int *bytes, CUdevice dev) +int SCCudaParamSetv(CUfunction h_func, int offset, void *ptr, + unsigned int num_bytes) { CUresult result = 0; - if (bytes == NULL) { + if (ptr == NULL) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "bytes is NULL"); + "ptr is NULL"); goto error; } - result = cuDeviceTotalMem(bytes, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_TOTAL_MEM) == -1) + result = cuParamSetv(h_func, offset, ptr, num_bytes); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_PARAM_SETV) == -1) goto error; return 0; @@ -3966,30 +4693,34 @@ static int SCCudaDeviceTotalMem(unsigned int *bytes, CUdevice dev) return -1; } +/***********************Texture_Reference_Management_API***********************/ + /** - * \internal - * \brief Returns the major and the minor revision numbers that define the - * compute capability for the device that is sent as the argument. + * \brief Creates a texture reference and returns its handle in *pTexRef. Once + * created, the application must call cuTexRefSetArray() or cuTexRefSetAddress() + * to associate the reference with allocated memory. Other texture reference + * functions are used to specify the format and interpretation (addressing, + * filtering, etc.) to be used when the memory is read through this texture + * reference. To associate the texture reference with a texture ordinal for + * a given function, the application should call cuParamSetTexRef(). * - * \param major Pointer to an integer, that will be updated with the major revision. - * \param minor Pointer to an integer, that will be updated with the minor revision. - * \param dev The device handle. + * \param p_tex_ref Returned texture reference * * \retval 0 On success. * \retval -1 On failure. */ -static int SCCudaDeviceComputeCapability(int *major, int *minor, CUdevice dev) +int SCCudaTexRefCreate(CUtexref *p_tex_ref) { CUresult result = 0; - if (major == NULL || minor == NULL) { + if (p_tex_ref == NULL) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "major is NULL or minor is NULL"); + "p_tex_ref is NULL"); goto error; } - result = cuDeviceComputeCapability(major, minor, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_COMPUTE_CAPABILITY) == -1) + result = cuTexRefCreate(p_tex_ref); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_CREATE) == -1) goto error; return 0; @@ -3999,41 +4730,19 @@ static int SCCudaDeviceComputeCapability(int *major, int *minor, CUdevice dev) } /** - * \internal - * \brief Returns the properties of the device. The CUdevprop structure is - * defined as - * - * typedef struct CUdevprop_st { - * int maxThreadsPerBlock; - * int maxThreadsDim[3]; - * int maxGridSize[3]; - * int sharedMemPerBlock; - * int totalConstantMemory; - * int SIMDWidth; - * int memPitch; - * int regsPerBlock; - * int clockRate; - * int textureAlign - * } CUdevprop; + * \brief Destroys the texture reference specified by hTexRef. * - * \param prop Pointer to a CUdevprop instance that holds the device properties. - * \param dev The device handle. + * \param h_tex_ref Texture reference to destroy * * \retval 0 On success. * \retval -1 On failure. */ -static int SCCudaDeviceGetProperties(CUdevprop *prop, CUdevice dev) +int SCCudaTexRefDestroy(CUtexref h_tex_ref) { CUresult result = 0; - if (prop == NULL) { - SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "prop is NULL"); - goto error; - } - - result = cuDeviceGetProperties(prop, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_PROPERTIES) == -1) + result = cuTexRefDestroy(h_tex_ref); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_DESTROY) == -1) goto error; return 0; @@ -4043,76 +4752,28 @@ static int SCCudaDeviceGetProperties(CUdevprop *prop, CUdevice dev) } /** - * \internal - * \brief Returns the various attributes for the device that is sent as the arg. - * - * The supported attributes are: - * - * CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads - * per block; - * CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block; - * CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block; - * CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block; - * CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid; - * CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid; - * CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid; - * CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of - * shared mem-ory available to a thread block in bytes; this amount - * is shared by all thread blocks simultaneously resident on a - * multiprocessor; - * CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device - * for __constant_-_ variables in a CUDA C kernel in bytes; - * CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads; - * CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the - * memory copy functions that involve memory regions allocated - * through cuMemAllocPitch(); - * CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit - * registers avail-able to a thread block; this number is shared by - * all thread blocks simultaneously resident on a multiprocessor; - * CU_DEVICE_ATTRIBUTE_CLOCK_RATE: Peak clock frequency in kilohertz; - * CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture - * base addresses aligned to textureAlign bytes do not need an offset - * applied to texture fetches; - * CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy - * memory between host and device while executing a kernel, or 0 if not; - * CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on - * the device; - * CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit - * for kernels executed on the device, or 0 if not; - * CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the - * memory subsystem, or 0 if not; - * CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host - * memory into the CUDA address space, or 0 if not; - * CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently - * in. Available modes are as follows: - * - CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted - * and can have multiple CUDA contexts present at a single time. - * - CU_COMPUTEMODE_EXCLUSIVE: Compute-exclusive mode - Device can have - * only one CUDA con-text present on it at a time. - * - CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is - * prohibited from creating new CUDA contexts. + * \brief Returns in *pdptr the base address bound to the texture reference + * hTexRef, or returns CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any device memory range. * - * \param pi Pointer to an interger instance that will be updated with the - * attribute value. - * \param attrib Device attribute to query. - * \param dev The device handle. + * \param pdptr Returned device address + * \param h_tex_ref Texture reference * * \retval 0 On success. * \retval -1 On failure. */ -static int SCCudaDeviceGetAttribute(int *pi, CUdevice_attribute attrib, - CUdevice dev) +int SCCudaTexRefGetAddress(CUdeviceptr *pdptr, CUtexref h_tex_ref) { CUresult result = 0; - if (pi == NULL) { + if (pdptr == NULL) { SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " - "prop is NULL"); + "pdptr is NULL"); goto error; } - result = cuDeviceGetAttribute(pi, attrib, dev); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_DEVICE_GET_ATTRIBUTE) == -1) + result = cuTexRefGetAddress(pdptr, h_tex_ref); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_ADDRESS) == -1) goto error; return 0; @@ -4122,425 +4783,392 @@ static int SCCudaDeviceGetAttribute(int *pi, CUdevice_attribute attrib, } /** - * \internal - * \brief Creates and returns a new instance of SCCudaDevice. - * - * \retval device Pointer to the new instance of SCCudaDevice. - */ -static SCCudaDevice *SCCudaAllocSCCudaDevice(void) -{ - SCCudaDevice *device = SCMalloc(sizeof(SCCudaDevice)); - if (unlikely(device == NULL)) - return NULL; - memset(device, 0 , sizeof(SCCudaDevice)); - - return device; -} - -/** - * \internal - * \brief Frees an instance of SCCudaDevice. - * - * \param device Pointer to the an instance of SCCudaDevice to be freed. - */ -static void SCCudaDeAllocSCCudaDevice(SCCudaDevice *device) -{ - SCFree(device); - - return; -} - -/** - * \internal - * \brief Creates and returns a new instance of SCCudaDevices. + * \brief Returns in *pam the addressing mode corresponding to the dimension + * dim of the texture reference hTexRef. Currently, the only valid value + * for dim are 0 and 1. * - * \retval devices Pointer to the new instance of SCCudaDevices. - */ -static SCCudaDevices *SCCudaAllocSCCudaDevices(void) -{ - SCCudaDevices *devices = SCMalloc(sizeof(SCCudaDevices)); - if (unlikely(devices == NULL)) - return NULL; - memset(devices, 0 , sizeof(SCCudaDevices)); - - return devices; -} - -/** - * \internal - * \brief Frees an instance of SCCudaDevices. + * \param pam Returned addressing mode + * \param h_tex_ref Texture reference + * \param dim Dimension * - * \param device Pointer to the an instance of SCCudaDevices to be freed. + * \retval 0 On success. + * \retval -1 On failure. */ -static void SCCudaDeAllocSCCudaDevices(SCCudaDevices *devices) +int SCCudaTexRefGetAddressMode(CUaddress_mode *pam, CUtexref h_tex_ref, int dim) { - int i = 0; - - if (devices == NULL) - return; - - if (devices->devices != NULL) { - for (i = 0; i < devices->count; i++) - SCCudaDeAllocSCCudaDevice(devices->devices[i]); + CUresult result = 0; - SCFree(devices->devices); + if (pam == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "pam is NULL"); + goto error; } - SCFree(devices); + result = cuTexRefGetAddressMode(pam, h_tex_ref, dim); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_ADDRESS_MODE) == -1) + goto error; - return; + return 0; + + error: + return -1; } /** - * \brief Retrieves all the devices and all the information corresponding to - * the devices on the CUDA device available on this system and returns - * a SCCudaDevices instances which holds all this information. + * \brief Returns in *phArray the CUDA array bound to the texture reference + * hTexRef, or returns CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any CUDA array. * - * \retval devices Pointer to a SCCudaDevices instance that holds information - * for all the CUDA devices on the system. + * \param ph_array Returned array + * \param h_tex_ref Texture reference + * + * \retval 0 On success. + * \retval -1 On failure. */ -static SCCudaDevices *SCCudaGetDevices(void) +int SCCudaTexRefGetArray(CUarray *ph_array, CUtexref h_tex_ref) { - SCCudaDevices *devices = SCCudaAllocSCCudaDevices(); - int i = 0; + CUresult result = 0; - if (SCCudaDeviceGetCount(&devices->count) == -1) + if (ph_array == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "ph_array is NULL"); goto error; + } - devices->devices = SCMalloc(devices->count * sizeof(SCCudaDevice *)); - if (devices->devices == NULL) + result = cuTexRefGetArray(ph_array, h_tex_ref); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_ARRAY) == -1) goto error; - /* update the device properties */ - for (i = 0; i < devices->count; i++) { - devices->devices[i] = SCCudaAllocSCCudaDevice(); - - if (SCCudaDeviceGet(&devices->devices[i]->device, i) == -1) - goto error; - - if (SCCudaDeviceComputeCapability(&devices->devices[i]->major_rev, - &devices->devices[i]->minor_rev, - devices->devices[i]->device) == -1) { - goto error; - } - - if (SCCudaDeviceGetName(devices->devices[i]->name, - SC_CUDA_DEVICE_NAME_MAX_LEN, - devices->devices[i]->device) == -1) { - goto error; - } + return 0; - if (SCCudaDeviceTotalMem(&devices->devices[i]->bytes, - devices->devices[i]->device) == -1) { - goto error; - } + error: + return -1; +} - if (SCCudaDeviceGetProperties(&devices->devices[i]->prop, - devices->devices[i]->device) == -1) { - goto error; - } +/** + * \brief Returns in *pfm the filtering mode of the texture reference hTexRef. + * + * \param pfm Returned filtering mode + * \param h_tex_ref Texture reference + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref h_tex_ref) +{ + CUresult result = 0; - /* retrieve the attributes */ - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_threads_per_block, - CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - devices->devices[i]->device) == -1) { - goto error; - } + if (pfm == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "pfm is NULL"); + goto error; + } - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_block_dim_x, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, - devices->devices[i]->device) == -1) { - goto error; - } + result = cuTexRefGetFilterMode(pfm, h_tex_ref); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_FILTER_MODE) == -1) + goto error; - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_block_dim_y, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, - devices->devices[i]->device) == -1) { - goto error; - } + return 0; - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_block_dim_z, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, - devices->devices[i]->device) == -1) { - goto error; - } + error: + return -1; +} - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_grid_dim_x, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - devices->devices[i]->device) == -1) { - goto error; - } +/** + * \brief Returns in *pFlags the flags of the texture reference hTexRef. + * + * \param p_flags Returned flags + * \param h_tex_ref Texture reference + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaTexRefGetFlags(unsigned int *p_flags, CUtexref h_tex_ref) +{ + CUresult result = 0; - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_grid_dim_y, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, - devices->devices[i]->device) == -1) { - goto error; - } + if (p_flags == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "p_flags is NULL"); + goto error; + } - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_grid_dim_z, - CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, - devices->devices[i]->device) == -1) { - goto error; - } + result = cuTexRefGetFlags(p_flags, h_tex_ref); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_FLAGS) == -1) + goto error; - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_shared_memory_per_block, - CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, - devices->devices[i]->device) == -1) { - goto error; - } + return 0; - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_total_constant_memory, - CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, - devices->devices[i]->device) == -1) { - goto error; - } + error: + return -1; +} - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_warp_size, - CU_DEVICE_ATTRIBUTE_WARP_SIZE, - devices->devices[i]->device) == -1) { - goto error; - } +/** + * \brief Returns in *pFormat and *pNumChannels the format and number of + * components of the CUDA array bound to the texture reference hTexRef. + * If pFormat or pNumChannels is NULL, it will be ignored. + * + * \param p_format Returned format + * \param p_num_channels Returned number of components + * \param h_tex_ref Texture reference + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaTexRefGetFormat(CUarray_format *p_format, int *p_num_channels, + CUtexref h_tex_ref) +{ + CUresult result = 0; - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_pitch, - CU_DEVICE_ATTRIBUTE_MAX_PITCH, - devices->devices[i]->device) == -1) { - goto error; - } + if (p_format == NULL || p_num_channels == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "p_format == NULL || p_num_channels == NULL"); + goto error; + } - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_max_registers_per_block, - CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, - devices->devices[i]->device) == -1) { - goto error; - } + result = cuTexRefGetFormat(p_format, p_num_channels, h_tex_ref); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_GET_FORMAT) == -1) + goto error; - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_clock_rate, - CU_DEVICE_ATTRIBUTE_CLOCK_RATE, - devices->devices[i]->device) == -1) { - goto error; - } + return 0; - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_texture_alignment, - CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, - devices->devices[i]->device) == -1) { - goto error; - } + error: + return -1; +} - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_gpu_overlap, - CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, - devices->devices[i]->device) == -1) { - goto error; - } +/** + * \brief Binds a linear address range to the texture reference hTexRef. Any + * previous address or CUDA array state associated with the texture + * reference is superseded by this function. Any memory previously + * bound to hTexRef is unbound. + * + * Since the hardware enforces an alignment requirement on texture + * base addresses, cuTexRefSetAddress() passes back a byte offset in + * *ByteOffset that must be applied to texture fetches in order to read + * from the desired memory. This offset must be divided by the texel + * size and passed to kernels that read from the texture so they can be + * applied to the tex1Dfetch() function. + * + * If the device memory pointer was returned from cuMemAlloc(), the + * offset is guaranteed to be 0 and NULL may be passed as the + * ByteOffset parameter. + * + * \param byte_offset Returned byte offset + * \param h_tex_ref Texture reference to bind + * \param dptr Device pointer to bind + * \param bytes Size of memory to bind in bytes + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaTexRefSetAddress(size_t *byte_offset, CUtexref h_tex_ref, + CUdeviceptr dptr, unsigned int bytes) +{ + CUresult result = 0; - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_multiprocessor_count, - CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, - devices->devices[i]->device) == -1) { - goto error; - } + if (byte_offset == NULL) { + SCLogError(SC_ERR_INVALID_ARGUMENTS, "Invalid argument supplied. " + "byte_offset is NULL"); + goto error; + } - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_kernel_exec_timeout, - CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, - devices->devices[i]->device) == -1) { - goto error; - } + result = cuTexRefSetAddress(byte_offset, h_tex_ref, dptr, bytes); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_ADDRESS) == -1) + goto error; - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_integrated, - CU_DEVICE_ATTRIBUTE_INTEGRATED, - devices->devices[i]->device) == -1) { - goto error; - } + return 0; - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_can_map_host_memory, - CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, - devices->devices[i]->device) == -1) { - goto error; - } + error: + return -1; +} - if (SCCudaDeviceGetAttribute(&devices->devices[i]->attr_compute_mode, - CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, - devices->devices[i]->device) == -1) { - goto error; - } - } +/** + * \brief Binds a linear address range to the texture reference hTexRef. Any + * previous address or CUDA array state associated with the texture + * reference is superseded by this function. Any memory previously bound + * to hTexRef is unbound. + * + * Using a tex2D() function inside a kernel requires a call to either + * cuTexRefSetArray() to bind the corresponding texture reference to an + * array, or cuTexRefSetAddress2D() to bind the texture reference to + * linear memory. + * + * Function calls to cuTexRefSetFormat() cannot follow calls to + * cuTexRefSetAddress2D() for the same texture reference. + * + * It is required that dptr be aligned to the appropriate hardware- + * specific texture alignment. You can query this value using the device + * attribute CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned dptr + * is supplied, CUDA_ERROR_INVALID_VALUE is returned. + * + * \param h_tex_ref Texture reference to bind + * \param desc Descriptor of CUDA array + * \param dptr Device pointer to bind + * \param pitch Line pitch in bytes + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaTexRefSetAddress2D(CUtexref h_tex_ref, const CUDA_ARRAY_DESCRIPTOR *desc, + CUdeviceptr dptr, unsigned int pitch) +{ + CUresult result = 0; -#ifdef DEBUG - SCCudaPrintDeviceList(devices); -#endif + result = cuTexRefSetAddress2D(h_tex_ref, desc, dptr, pitch); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_ADDRESS_2D) == -1) + goto error; - return devices; + return 0; error: - SCCudaDeAllocSCCudaDevices(devices); - return NULL; + return -1; } /** - * \brief Prints the information for all the devices for this CUDA platform, - * supplied inside the argument. + * \brief Specifies the addressing mode am for the given dimension dim of the + * texture reference hTexRef. If dim is zero, the addressing mode is + * applied to the first parameter of the functions used to fetch from + * the texture; if dim is 1, the second, and so on. CUaddress_mode is + * defined as: * - * \param devices Pointer to a SCCudaDevices instance that holds information on - * the devices. + * typedef enum CUaddress_mode_enum { + * CU_TR_ADDRESS_MODE_WRAP = 0, + * CU_TR_ADDRESS_MODE_CLAMP = 1, + * CU_TR_ADDRESS_MODE_MIRROR = 2, + * } CUaddress_mode; + * + * \param h_tex_ref Texture reference + * \param dim Dimension + * \param am Addressing mode to set + * + * \retval 0 On success. + * \retval -1 On failure. */ -void SCCudaPrintDeviceList(SCCudaDevices *devices) +int SCCudaTexRefSetAddressMode(CUtexref h_tex_ref, int dim, CUaddress_mode am) { - int i = 0; + CUresult result = 0; - if (devices == NULL) { - SCLogError(SC_ERR_CUDA_ERROR, "CUDA environment not initialized. " - "Please initialized the CUDA environment by calling " - "SCCudaInitCudaEnvironment() before making any calls " - "to the CUDA API."); - return; - } + result = cuTexRefSetAddressMode(h_tex_ref, dim, am); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_ADDRESS_MODE) == -1) + goto error; - SCLogDebug("Printing device info for this CUDA context"); - SCLogDebug("No of devices: %d", devices->count); + return 0; - for (i = 0; i < devices->count; i++) { - SCLogDebug("Device ID: %d", devices->devices[i]->device); - SCLogDebug("Device Name: %s", devices->devices[i]->name); - SCLogDebug("Device Major Revision: %d", devices->devices[i]->major_rev); - SCLogDebug("Device Minor Revision: %d", devices->devices[i]->minor_rev); + error: + return -1; +} - /* Cudevprop */ - SCLogDebug("Device Max Threads Per Block: %d", - devices->devices[i]->prop.maxThreadsPerBlock); - SCLogDebug("Device Max Threads Dim: [%d, %d, %d]", - devices->devices[i]->prop.maxThreadsDim[0], - devices->devices[i]->prop.maxThreadsDim[1], - devices->devices[i]->prop.maxThreadsDim[2]); - SCLogDebug("Device Max Grid Size: [%d, %d, %d]", - devices->devices[i]->prop.maxGridSize[0], - devices->devices[i]->prop.maxGridSize[1], - devices->devices[i]->prop.maxGridSize[2]); - SCLogDebug("Device Shared Memory Per Block: %d", - devices->devices[i]->prop.sharedMemPerBlock); - SCLogDebug("Device Total Constant Memory: %d", - devices->devices[i]->prop.totalConstantMemory); - SCLogDebug("Device SIMD Width(Warp Size): %d", - devices->devices[i]->prop.SIMDWidth); - SCLogDebug("Device Maximum Mem Pitch: %d", devices->devices[i]->prop.memPitch); - SCLogDebug("Device Total Registers Available Per Block: %d", - devices->devices[i]->prop.regsPerBlock); - SCLogDebug("Device Clock Frequency: %d", devices->devices[i]->prop.clockRate); - SCLogDebug("Device Texture Alignment Requirement: %d", - devices->devices[i]->prop.textureAlign); +/** + * \brief Binds the CUDA array hArray to the texture reference hTexRef. Any + * previous address or CUDA array state associated with the texture + * reference is superseded by this function. Flags must be set to + * CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to hTexRef + * is unbound. + * + * \param h_tex_ref Texture reference to bind + * \param h_array Array to bind + * \param flags Options (must be CU_TRSA_OVERRIDE_FORMAT) + * + * \retval 0 On success. + * \retval -1 On failure. + */ +int SCCudaTexRefSetArray(CUtexref h_tex_ref, CUarray h_array, unsigned int flags) +{ + CUresult result = 0; + result = cuTexRefSetArray(h_tex_ref, h_array, flags); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_ARRAY) == -1) + goto error; - /* device attributes */ - SCLogDebug("Device Max Threads Per Block: %d", - devices->devices[i]->attr_max_threads_per_block); - SCLogDebug("Device Max Block Dim X: %d", - devices->devices[i]->attr_max_block_dim_x); - SCLogDebug("Device Max Block Dim Y: %d", - devices->devices[i]->attr_max_block_dim_y); - SCLogDebug("Device Max Block Dim Z: %d", - devices->devices[i]->attr_max_block_dim_z); - SCLogDebug("Device Max Grid Dim X: %d", - devices->devices[i]->attr_max_grid_dim_x); - SCLogDebug("Device Max Grid Dim Y: %d", - devices->devices[i]->attr_max_grid_dim_y); - SCLogDebug("Device Max Grid Dim Z: %d", - devices->devices[i]->attr_max_grid_dim_z); - SCLogDebug("Device Max Shared Memory Per Block: %d", - devices->devices[i]->attr_max_shared_memory_per_block); - SCLogDebug("Device Total Constant Memory: %d", - devices->devices[i]->attr_total_constant_memory); - SCLogDebug("Device Warp Size: %d", devices->devices[i]->attr_warp_size); - SCLogDebug("Device Max Pitch: %d", devices->devices[i]->attr_max_pitch); - SCLogDebug("Device Max Registers Per Block: %d", - devices->devices[i]->attr_max_registers_per_block); - SCLogDebug("Device Clock Rate: %d", devices->devices[i]->attr_clock_rate); - SCLogDebug("Device Texture Alignement: %d", - devices->devices[i]->attr_texture_alignment); - SCLogDebug("Device GPU Overlap: %s", - (devices->devices[i]->attr_gpu_overlap == 1) ? "Yes": "No"); - SCLogDebug("Device Multiprocessor Count: %d", - devices->devices[i]->attr_multiprocessor_count); - SCLogDebug("Device Kernel Exec Timeout: %s", - (devices->devices[i]->attr_kernel_exec_timeout) ? "Yes": "No"); - SCLogDebug("Device Integrated With Memory Subsystem: %s", - (devices->devices[i]->attr_integrated) ? "Yes": "No"); - SCLogDebug("Device Can Map Host Memory: %s", - (devices->devices[i]->attr_can_map_host_memory) ? "Yes": "No"); - if (devices->devices[i]->attr_compute_mode == CU_COMPUTEMODE_DEFAULT) - SCLogDebug("Device Compute Mode: CU_COMPUTEMODE_DEFAULT"); - else if (devices->devices[i]->attr_compute_mode == CU_COMPUTEMODE_EXCLUSIVE) - SCLogDebug("Device Compute Mode: CU_COMPUTEMODE_EXCLUSIVE"); - else if (devices->devices[i]->attr_compute_mode == CU_COMPUTEMODE_PROHIBITED) - SCLogDebug("Device Compute Mode: CU_COMPUTEMODE_PROHIBITED"); - } + return 0; - return; + error: + return -1; } /** - * \brief Prints some basic information for the default device(the first devie) - * we will be using on this cuda platform for use by our engine. This - * function is basically to be used to print some minimal information to - * the user at engine startup. + * \brief Specifies the filtering mode fm to be used when reading memory through + * the texture reference hTexRef. CUfilter_mode_enum is defined as: * - * \param devices Pointer to a SCCudaDevices instance that holds information on - * the devices. + * typedef enum CUfilter_mode_enum { + * CU_TR_FILTER_MODE_POINT = 0, + * CU_TR_FILTER_MODE_LINEAR = 1 + * } CUfilter_mode; + * + * \param h_tex_ref Texture reference + * \param fm Filtering mode to set + * + * \retval 0 On success. + * \retval -1 On failure. */ -void SCCudaPrintBasicDeviceInfo(SCCudaDevices *devices) +int SCCudaTexRefSetFilterMode(CUtexref h_tex_ref, CUfilter_mode fm) { - int i = 0; + CUresult result = 0; - if (devices == NULL) { - SCLogError(SC_ERR_CUDA_ERROR, "CUDA environment not initialized. " - "Please initialized the CUDA environment by calling " - "SCCudaInitCudaEnvironment() before making any calls " - "to the CUDA API."); - return; - } + result = cuTexRefSetFilterMode(h_tex_ref, fm); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_FILTER_MODE) == -1) + goto error; - for (i = 0; i < devices->count; i++) { - SCLogInfo("GPU Device %d: %s, %d Multiprocessors, %dMHz, CUDA Compute " - "Capability %d.%d", i + 1, - devices->devices[i]->name, - devices->devices[i]->attr_multiprocessor_count, - devices->devices[i]->attr_clock_rate/1000, - devices->devices[i]->major_rev, - devices->devices[i]->minor_rev); - } + return 0; - return; + error: + return -1; } /** - * \brief Gets the device list, for the CUDA platform environment initialized by - * the engine. + * \brief Specifies optional flags via Flags to specify the behavior of data + * returned through the texture reference hTexRef. The valid flags are: * - * \retval devices Pointer to the CUDA device list on success; NULL on failure. + * * CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of + * having the texture promote integer data to floating point data in + * the range [0, 1]; + * * CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default + * behavior of having the texture coordinates range from [0, Dim) where + * Dim is the width or height of the CUDA array. Instead, the texture + * coordinates [0, 1.0) reference the entire breadth of the array + * dimension; + * + * \param h_tex_ref Texture reference + * \param flags Optional flags to set + * + * \retval 0 On success. + * \retval -1 On failure. */ -SCCudaDevices *SCCudaGetDeviceList(void) +int SCCudaTexRefSetFlags(CUtexref h_tex_ref, unsigned int flags) { - if (devices == NULL) { - SCLogError(SC_ERR_CUDA_ERROR, "CUDA environment not initialized. " - "Please initialized the CUDA environment by calling " - "SCCudaInitCudaEnvironment() before making any calls " - "to the CUDA API."); - return NULL; - } + CUresult result = 0; - return devices; -} + result = cuTexRefSetFlags(h_tex_ref, flags); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_FLAGS) == -1) + goto error; -/*****************************Cuda_Initialization_API**************************/ + return 0; + + error: + return -1; +} /** - * \internal - * \brief Inits the cuda driver API. + * \brief Specifies the format of the data to be read by the texture reference + * hTexRef. fmt and NumPackedComponents are exactly analogous to the + * Format and NumChannels members of the CUDA_ARRAY_DESCRIPTOR structure: + * They specify the format of each component and the number of components + * per array element. * - * \param flags Currently should be 0. + * \param h_tex_ref Texture reference + * \param fmt Format to set + * \param num_packed_components Number of components per array element * * \retval 0 On success. * \retval -1 On failure. */ -static int SCCudaInit(unsigned int flags) +int SCCudaTexRefSetFormat(CUtexref h_tex_ref, CUarray_format fmt, + int num_packed_components) { - CUresult result = cuInit(flags); - if (SCCudaHandleRetValue(result, SC_CUDA_CU_INIT) == -1) + CUresult result = 0; + + result = cuTexRefSetFormat(h_tex_ref, fmt, num_packed_components); + if (SCCudaHandleRetValue(result, SC_CUDA_CU_TEX_REF_SET_FORMAT) == -1) goto error; return 0; diff --git a/src/util-cuda.h b/src/util-cuda.h index 5967a9896f..8e544fd04a 100644 --- a/src/util-cuda.h +++ b/src/util-cuda.h @@ -46,7 +46,7 @@ typedef struct SCCudaDevice_ { CUdevprop prop; /* device total memory */ - unsigned int bytes; + size_t bytes; /* device attributes. We could have used a fixed int array table to hold * the attributes, but it is better we specify it exclusively this way, @@ -79,125 +79,244 @@ typedef struct SCCudaDevices_ { SCCudaDevice **devices; } SCCudaDevices; -int SCCudaArray3DCreate(CUarray *, const CUDA_ARRAY3D_DESCRIPTOR *); -int SCCudaArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *, CUarray); -int SCCudaArrayCreate(CUarray *, const CUDA_ARRAY_DESCRIPTOR *); -int SCCudaArrayDestroy(CUarray); -int SCCudaArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *, CUarray); -int SCCudaMemAlloc(CUdeviceptr *dptr, unsigned int); -int SCCudaMemAllocHost(void **, unsigned int); -int SCCudaMemAllocPitch(CUdeviceptr *, unsigned int *, unsigned int, - unsigned int, unsigned int); -int SCCudaMemcpy2D(const CUDA_MEMCPY2D *); -int SCCudaMemcpy2DAsync(const CUDA_MEMCPY2D *, CUstream); -int SCCudaMemcpy2DUnaligned(const CUDA_MEMCPY2D *); -int SCCudaMemcpy3D(const CUDA_MEMCPY3D *); -int SCCudaMemcpy3DAsync(const CUDA_MEMCPY3D *, CUstream); -int SCCudaMemcpyAtoA(CUarray, unsigned int, CUarray, unsigned int, unsigned int); -int SCCudaMemcpyAtoD(CUdeviceptr, CUarray, unsigned int, unsigned int); -int SCCudaMemcpyAtoH(void *, CUarray, unsigned int, unsigned int); -int SCCudaMemcpyAtoHAsync(void *, CUarray, unsigned int, unsigned int, - CUstream); -int SCCudaMemcpyDtoA(CUarray, unsigned int, CUdeviceptr, unsigned int); -int SCCudaMemcpyDtoD(CUdeviceptr, CUdeviceptr, unsigned int byte_count); -int SCCudaMemcpyDtoH(void *, CUdeviceptr, unsigned int); -int SCCudaMemcpyDtoHAsync(void *, CUdeviceptr, unsigned int, CUstream); -int SCCudaMemcpyHtoA(CUarray, unsigned int, const void *, unsigned int); -int SCCudaMemcpyHtoAAsync(CUarray, unsigned int, const void *, - unsigned int, CUstream); -int SCCudaMemcpyHtoD(CUdeviceptr, const void *, unsigned int); -int SCCudaMemcpyHtoDAsync(CUdeviceptr, const void *, unsigned int, - CUstream); -int SCCudaMemFree(CUdeviceptr); -int SCCudaMemFreeHost(void *); -int SCCudaMemGetAddressRange(CUdeviceptr *, unsigned int *, CUdeviceptr); -int SCCudaMemGetInfo(unsigned int *, unsigned int *); -int SCCudaMemHostAlloc(void **, size_t, unsigned int); -int SCCudaMemHostGetDevicePointer(CUdeviceptr *, void *, unsigned int); -int SCCudaMemHostGetFlags(unsigned int *, void *); -int SCCudaMemsetD16(CUdeviceptr, unsigned short, unsigned int); -int SCCudaMemsetD2D16(CUdeviceptr, unsigned int, unsigned short, - unsigned int, unsigned int); -int SCCudaMemsetD2D32(CUdeviceptr, unsigned int, unsigned int, unsigned int, - unsigned int); -int SCCudaMemsetD2D8(CUdeviceptr, unsigned int, unsigned char, unsigned int, - unsigned int); -int SCCudaMemsetD32(CUdeviceptr, unsigned int, unsigned int); -int SCCudaMemsetD8(CUdeviceptr, unsigned char, unsigned int); - -int SCCudaTexRefCreate(CUtexref *); -int SCCudaTexRefDestroy(CUtexref); -int SCCudaTexRefGetAddress(CUdeviceptr *, CUtexref); -int SCCudaTexRefGetAddressMode(CUaddress_mode *, CUtexref, int); -int SCCudaTexRefGetArray(CUarray *, CUtexref); -int SCCudaTexRefGetFilterMode(CUfilter_mode *, CUtexref); -int SCCudaTexRefGetFlags(unsigned int *, CUtexref); -int SCCudaTexRefGetFormat(CUarray_format *, int *, CUtexref); -int SCCudaTexRefSetAddress(unsigned int *, CUtexref, CUdeviceptr, - unsigned int); -int SCCudaTexRefSetAddress2D(CUtexref, const CUDA_ARRAY_DESCRIPTOR *, - CUdeviceptr, unsigned int); -int SCCudaTexRefSetAddressMode(CUtexref, int, CUaddress_mode); -int SCCudaTexRefSetArray(CUtexref, CUarray, unsigned int); -int SCCudaTexRefSetFilterMode(CUtexref, CUfilter_mode); -int SCCudaTexRefSetFlags(CUtexref, unsigned int); -int SCCudaTexRefSetFormat(CUtexref, CUarray_format, int); - -int SCCudaFuncGetAttribute(int *, CUfunction_attribute, CUfunction); -int SCCudaFuncSetBlockShape(CUfunction, int, int, int); -int SCCudaFuncSetSharedSize(CUfunction, unsigned int); -int SCCudaLaunch(CUfunction); -int SCCudaLaunchGrid(CUfunction, int, int); -int SCCudaLaunchGridAsync(CUfunction, int, int, CUstream); -int SCCudaParamSetf(CUfunction, int, float); -int SCCudaParamSeti(CUfunction, int, unsigned int); -int SCCudaParamSetSize(CUfunction, unsigned int); -int SCCudaParamSetTexRef(CUfunction, int, CUtexref); -int SCCudaParamSetv(CUfunction, int, void *, unsigned int); - -int SCCudaEventCreate(CUevent *, unsigned int); -int SCCudaEventDestroy(CUevent); -int SCCudaEventElapsedTime(float *, CUevent, CUevent); -int SCCudaEventQuery(CUevent); -int SCCudaEventRecord(CUevent, CUstream); -int SCCudaEventSynchronize(CUevent); - -int SCCudaStreamCreate(CUstream *, unsigned int); -int SCCudaStreamDestroy(CUstream); -int SCCudaStreamQuery(CUstream); -int SCCudaStreamSynchronize(CUstream); - -int SCCudaModuleGetFunction(CUfunction *, CUmodule, const char *); -int SCCudaModuleGetGlobal(CUdeviceptr *, unsigned int *, CUmodule, const char *); -int SCCudaModuleGetTexRef(CUtexref *, CUmodule, const char *); -int SCCudaModuleLoad(CUmodule *, const char *); -int SCCudaModuleLoadData(CUmodule *, const char *); -int SCCudaModuleLoadDataEx(CUmodule *, const char *, unsigned int, - CUjit_option *, void **); -int SCCudaModuleLoadFatBinary(CUmodule *, const void *); -int SCCudaModuleUnload(CUmodule); - - -int SCCudaCtxAttach(CUcontext *, unsigned int); -int SCCudaCtxCreate(CUcontext *, unsigned int, CUdevice); -int SCCudaCtxDestroy(CUcontext); -int SCCudaCtxDetach(CUcontext); -int SCCudaCtxGetDevice(CUdevice *); -int SCCudaCtxPopCurrent(CUcontext *); -int SCCudaCtxPushCurrent(CUcontext); -int SCCudaCtxSynchronize(void); -int SCCudaDriverGetVersion(int *); +/**************************Cuda_Initialization_API**************************/ +int SCCudaInit(unsigned int flags); + +/***************************Version_Management_API***************************/ +int SCCudaDriverGetVersion(int *driver_version); + +/***************************Device_Management_API****************************/ +int SCCudaDeviceComputeCapability(int *major, int *minor, CUdevice dev); +int SCCudaDeviceGet(CUdevice *device, int ordinal); +int SCCudaDeviceGetAttribute(int *pi, CUdevice_attribute attrib, + CUdevice dev); +int SCCudaDeviceGetCount(int *count); +int SCCudaDeviceGetName(char *name, int len, CUdevice dev); +int SCCudaDeviceGetProperties(CUdevprop *prop, CUdevice dev); +int SCCudaDeviceTotalMem(size_t *bytes, CUdevice dev); void SCCudaPrintDeviceList(SCCudaDevices *); void SCCudaPrintBasicDeviceInfo(SCCudaDevices *); SCCudaDevices *SCCudaGetDeviceList(void); +/***************************Context_Management_API***************************/ +int SCCudaCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); +int SCCudaCtxDestroy(CUcontext ctx); +int SCCudaCtxGetApiVersion(CUcontext ctx, unsigned int *version); +int SCCudaCtxGetCacheConfig(CUfunc_cache *pconfig); +int SCCudaCtxGetCurrent(CUcontext *pctx); +int SCCudaCtxGetDevice(CUdevice *device); +int SCCudaCtxGetLimit(size_t *pvalue, CUlimit limit); +int SCCudaCtxPopCurrent(CUcontext *pctx); +int SCCudaCtxPushCurrent(CUcontext ctx); +int SCCudaCtxSetCacheConfig(CUfunc_cache config); +int SCCudaCtxSetCurrent(CUcontext ctx); +int SCCudaCtxSetLimit(CUlimit limit, size_t value); +int SCCudaCtxSynchronize(void); +int SCCudaCtxAttach(CUcontext *pctx, unsigned int flags); +int SCCudaCtxDetach(CUcontext ctx); + +/***************************Module_Management_API****************************/ +int SCCudaModuleGetFunction(CUfunction *hfunc, CUmodule hmod, + const char *name); +int SCCudaModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, + const char *name); +int SCCudaModuleGetSurfRef(CUsurfref *p_surf_ref, CUmodule hmod, + const char *name); +int SCCudaModuleGetTexRef(CUtexref *p_tex_ref, CUmodule hmod, + const char *name); +int SCCudaModuleLoad(CUmodule *module, const char *fname); +int SCCudaModuleLoadData(CUmodule *module, const void *image); +int SCCudaModuleLoadDataEx(CUmodule *module, const void *image, + unsigned int num_options, CUjit_option *options, + void **option_values); +int SCCudaModuleLoadFatBinary(CUmodule *module, const void *fat_cubin); +int SCCudaModuleUnload(CUmodule hmod); + +/**************************Memory_Management_API*****************************/ +int SCCudaArray3DCreate(CUarray *p_handle, + const CUDA_ARRAY3D_DESCRIPTOR *p_allocate_array); +int SCCudaArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *p_array_descriptor, + CUarray h_array); +int SCCudaArrayCreate(CUarray *p_handle, + const CUDA_ARRAY_DESCRIPTOR *p_allocate_array); +int SCCudaArrayDestroy(CUarray h_array); +int SCCudaArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *p_array_descriptor, + CUarray h_array); +int SCCudaDeviceGetByPCIBusId(CUdevice *dev, char *pci_bus_id); +int SCCudaDeviceGetPCIBusId(char *pci_bus_id, int len, CUdevice dev); +int SCCudaIpcCloseMemHandle(CUdeviceptr dptr); +int SCCudaIpcGetEventHandle(CUipcEventHandle *p_handle, CUevent event); +int SCCudaIpcGetMemHandle(CUipcMemHandle *p_handle, CUdeviceptr dptr); +int SCCudaIpcOpenEventHandle(CUevent *ph_event, CUipcEventHandle handle); +int SCCudaIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, + unsigned int flags); +int SCCudaMemAlloc(CUdeviceptr *dptr, size_t byte_size); +int SCCudaMemAllocHost(void **pp, size_t byte_size); +int SCCudaMemAllocPitch(CUdeviceptr *dptr, size_t *p_pitch, + size_t width_in_bytes, + size_t height, + unsigned int element_size_bytes); +int SCCudaMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t byte_count); +int SCCudaMemcpy2D(const CUDA_MEMCPY2D *p_copy); +int SCCudaMemcpy2DAsync(const CUDA_MEMCPY2D *p_copy, CUstream h_stream); +int SCCudaMemcpy2DUnaligned(const CUDA_MEMCPY2D *p_copy); +int SCCudaMemcpy3D(const CUDA_MEMCPY3D *p_copy); +int SCCudaMemcpy3DAsync(const CUDA_MEMCPY3D *p_copy, CUstream h_stream); +int SCCudaMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *p_copy); +int SCCudaMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *p_copy, + CUstream h_stream); +int SCCudaMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t byte_count, + CUstream h_stream); +int SCCudaMemcpyAtoA(CUarray dst_array, size_t dst_offset, + CUarray src_array, size_t src_offset, + size_t byte_count); +int SCCudaMemcpyAtoD(CUdeviceptr dst_device, CUarray src_array, + size_t src_offset, size_t byte_count); +int SCCudaMemcpyAtoH(void *dst_host, CUarray src_array, size_t src_offset, + size_t byte_count); +int SCCudaMemcpyAtoHAsync(void *dst_host, CUarray src_array, + size_t src_offset, size_t byte_count, + CUstream h_stream); +int SCCudaMemcpyDtoA(CUarray dst_array, size_t dst_offset, + CUdeviceptr src_device, size_t byte_count); +int SCCudaMemcpyDtoD(CUdeviceptr dst_device, CUdeviceptr src_device, + size_t byte_count); +int SCCudaMemcpyDtoDAsync(CUdeviceptr dst_device, CUdeviceptr src_device, + size_t byte_count, CUstream h_stream); +int SCCudaMemcpyDtoH(void *dst_host, CUdeviceptr src_device, + size_t byte_count); +int SCCudaMemcpyDtoHAsync(void *dst_host, CUdeviceptr src_device, + size_t byte_count, CUstream h_stream); +int SCCudaMemcpyHtoA(CUarray dst_array, size_t dst_offset, + const void *src_host, size_t byte_count); +int SCCudaMemcpyHtoAAsync(CUarray dst_array, size_t dst_offset, + const void *src_host, size_t byte_count, + CUstream h_stream); +int SCCudaMemcpyHtoD(CUdeviceptr dst_device, const void *src_host, + size_t byte_count); +int SCCudaMemcpyHtoDAsync(CUdeviceptr dst_device, const void *src_host, + size_t byte_count, CUstream h_stream); +int SCCudaMemcpyPeer(CUdeviceptr dst_device, CUcontext dst_context, + CUdeviceptr src_device, CUcontext src_context, + size_t byte_count); +int SCCudaMemcpyPeerAsync(CUdeviceptr dst_device, CUcontext dst_context, + CUdeviceptr src_device, CUcontext src_context, + size_t byte_count, CUstream h_stream); +int SCCudaMemFree(CUdeviceptr dptr); +int SCCudaMemFreeHost(void *p); +int SCCudaMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, + CUdeviceptr dptr); +int SCCudaMemGetInfo(size_t *free, size_t *total); +int SCCudaMemHostAlloc(void **pp, size_t byte_size, unsigned int flags); +int SCCudaMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, + unsigned int flags); +int SCCudaMemHostGetFlags(unsigned int *p_flags, void *p); +int SCCudaMemHostRegister(void *p, size_t byte_size, unsigned int flags); +int SCCudaMemHostUnregister(void *p); +int SCCudaMemsetD16(CUdeviceptr dst_device, unsigned short us, size_t n); +int SCCudaMemsetD16Async(CUdeviceptr dst_device, unsigned short us, + size_t n, CUstream h_stream); +int SCCudaMemsetD2D16(CUdeviceptr dst_device, size_t dst_pitch, + unsigned short us, size_t width, + size_t height); +int SCCudaMemsetD2D16Async(CUdeviceptr dst_device, size_t dst_pitch, + unsigned short us, size_t width, + size_t height, CUstream h_stream); +int SCCudaMemsetD2D32(CUdeviceptr dst_device, size_t dst_pitch, + unsigned int ui, size_t width, size_t height); +int SCCudaMemsetD2D32Async(CUdeviceptr dst_device, size_t dst_pitch, + unsigned int ui, size_t width, size_t height, + CUstream h_stream); +int SCCudaMemsetD2D8(CUdeviceptr dst_device, size_t dst_pitch, + unsigned char uc, size_t width, size_t height); +int SCCudaMemsetD2D8Async(CUdeviceptr dst_device, size_t dst_pitch, + unsigned char uc, size_t width, size_t height, + CUstream h_stream); +int SCCudaMemsetD32(CUdeviceptr dst_device, unsigned int ui, size_t n); +int SCCudaMemsetD32Async(CUdeviceptr dst_device, unsigned int ui, + size_t n, CUstream h_stream); +int SCCudaMemsetD8(CUdeviceptr dst_device, unsigned char uc, size_t n); +int SCCudaMemsetD8Async(CUdeviceptr dst_device, unsigned char uc, + size_t n, CUstream h_stream); + +/***************************Unified_Addressing_API****************************/ + +int SCCudaPointerGetAttribute(void *data, CUpointer_attribute attribute, + CUdeviceptr ptr); + +/***************************Stream_Management_API****************************/ +int SCCudaStreamCreate(CUstream *ph_stream, unsigned int flags); +int SCCudaStreamDestroy(CUstream h_stream); +int SCCudaStreamQuery(CUstream h_stream); +int SCCudaStreamSynchronize(CUstream h_stream); +int SCCudaStreamWaitEvent(CUstream h_stream, CUevent h_event, + unsigned int flags); + +/***************************Event_Management_API*****************************/ +int SCCudaEventCreate(CUevent *ph_event, unsigned int flags); +int SCCudaEventDestroy(CUevent h_event); +int SCCudaEventElapsedTime(float *p_milli_seconds, CUevent h_start, + CUevent h_end); +int SCCudaEventQuery(CUevent h_event); +int SCCudaEventRecord(CUevent h_event, CUstream h_stream); +int SCCudaEventSynchronize(CUevent h_event); + +/***********************Execution_Control_Management_API***********************/ +int SCCudaFuncGetAttribute(int *pi, CUfunction_attribute attrib, + CUfunction hfunc); +int SCCudaFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); +int SCCudaLaunchKernel(CUfunction f, unsigned int grid_dim_x, + unsigned int grid_dim_y, unsigned int grid_dim_z, + unsigned int block_dim_x, unsigned int block_dim_y, + unsigned int block_dim_z, unsigned int shared_mem_bytes, + CUstream h_stream, void **kernel_params, void **extra); +int SCCudaFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); +int SCCudaFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); +int SCCudaLaunch(CUfunction f); +int SCCudaLaunchGrid(CUfunction f, int grid_width, int grid_height); +int SCCudaLaunchGridAsync(CUfunction f, int grid_width, int grid_height, + CUstream h_stream); +int SCCudaParamSetf(CUfunction h_func, int offset, float value); +int SCCudaParamSeti(CUfunction h_func, int offset, unsigned int value); +int SCCudaParamSetSize(CUfunction h_func, unsigned int num_bytes); +int SCCudaParamSetTexRef(CUfunction h_func, int tex_unit, CUtexref h_tex_ref); +int SCCudaParamSetv(CUfunction h_func, int offset, void *ptr, + unsigned int num_bytes); + +/*********************Texture_Reference_Management_API***********************/ +int SCCudaTexRefCreate(CUtexref *p_tex_ref); +int SCCudaTexRefDestroy(CUtexref h_tex_ref); +int SCCudaTexRefGetAddress(CUdeviceptr *pdptr, CUtexref h_tex_ref); +int SCCudaTexRefGetAddressMode(CUaddress_mode *pam, CUtexref h_tex_ref, + int dim); +int SCCudaTexRefGetArray(CUarray *ph_array, CUtexref h_tex_ref); +int SCCudaTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref h_tex_ref); +int SCCudaTexRefGetFlags(unsigned int *p_flags, CUtexref h_tex_ref); +int SCCudaTexRefGetFormat(CUarray_format *p_format, int *p_num_channels, + CUtexref h_tex_ref); +int SCCudaTexRefSetAddress(size_t *byte_offset, CUtexref h_tex_ref, + CUdeviceptr dptr, unsigned int bytes); +int SCCudaTexRefSetAddress2D(CUtexref h_tex_ref, + const CUDA_ARRAY_DESCRIPTOR *desc, + CUdeviceptr dptr, unsigned int pitch); +int SCCudaTexRefSetAddressMode(CUtexref h_tex_ref, int dim, CUaddress_mode am); +int SCCudaTexRefSetArray(CUtexref h_tex_ref, CUarray h_array, + unsigned int flags); +int SCCudaTexRefSetFilterMode(CUtexref h_tex_ref, CUfilter_mode fm); +int SCCudaTexRefSetFlags(CUtexref h_tex_ref, unsigned int flags); +int SCCudaTexRefSetFormat(CUtexref h_tex_ref, CUarray_format fmt, + int num_packed_components); + +/************************Cuda_Env_Initialization_API*************************/ int SCCudaInitCudaEnvironment(void); +/********************************Cuda_Utility********************************/ void SCCudaListCards(void); -int SCCudaIsCudaDeviceIdValid(int); +int SCCudaIsCudaDeviceIdValid(int cuda_device_id); +/********************************Unittests***********************************/ void SCCudaRegisterTests(void); #endif /* __SC_CUDA_SUPPORT__ */