/* Copyright (c) 2019-2022, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Intel Corporation nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #if defined(_WIN32) || defined(_WIN64) #define ISPC_IS_WINDOWS #elif defined(__linux__) #define ISPC_IS_LINUX #elif defined(__APPLE__) #error "L0 is not supported on macOS" #elif defined(__FreeBSD__) #error "L0 is not supported on FreeBSD" #else #error "Host OS was not detected" #endif #ifdef ISPC_IS_WINDOWS #define _CRT_SECURE_NO_WARNINGS #define NOMINMAX #pragma warning(disable : 4244) #pragma warning(disable : 4305) #include #endif // ISPC_IS_WINDOWS #include #include #include #include #include #ifdef ISPC_IS_LINUX #include #endif /******************************/ #include #include #include #include #include #include #include #include #include #define L0_SAFE_CALL(call) \ { \ auto status = (call); \ if (status != 0) { \ fprintf(stderr, "%s:%d: L0 error %d\n", __FILE__, __LINE__, (int)status); \ exit(1); \ } \ } #define N 64 int width() { #if defined(TEST_WIDTH) return TEST_WIDTH; #else #error "Unknown or unset TEST_WIDTH value" #endif } #if defined(_WIN32) || defined(_WIN64) #define ALIGN #else #define ALIGN __attribute__((aligned(64))) #endif static void L0InitContext(ze_device_handle_t &hDevice, ze_module_handle_t &hModule, ze_context_handle_t &hContext, ze_command_queue_handle_t &hCommandQueue) { L0_SAFE_CALL(zeInit(ZE_INIT_FLAG_GPU_ONLY)); // Retrieve drivers uint32_t driverCount = 0; L0_SAFE_CALL(zeDriverGet(&driverCount, nullptr)); std::vector allDrivers(driverCount); L0_SAFE_CALL(zeDriverGet(&driverCount, allDrivers.data())); // Find an instance of Intel GPU device // User can select particular device using env variable // By default first available device is selected auto gpuDeviceToGrab = 0; const char *gpuDeviceEnv = getenv("ISPC_GPU_DEVICE"); if (gpuDeviceEnv) { std::istringstream(gpuDeviceEnv) >> gpuDeviceToGrab; } else { // Allow using ISPCRT env to make things easier const char *gpuDeviceEnv = getenv("ISPCRT_GPU_DEVICE"); if (gpuDeviceEnv) { std::istringstream(gpuDeviceEnv) >> gpuDeviceToGrab; } } auto gpuDevice = 0; ze_driver_handle_t hDriver = 0; for (auto &driver : allDrivers) { uint32_t deviceCount = 0; L0_SAFE_CALL(zeDeviceGet(driver, &deviceCount, nullptr)); std::vector allDevices(deviceCount); L0_SAFE_CALL(zeDeviceGet(driver, &deviceCount, allDevices.data())); for (auto &device : allDevices) { ze_device_properties_t device_properties = {ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES}; L0_SAFE_CALL(zeDeviceGetProperties(device, &device_properties)); if (device_properties.type == ZE_DEVICE_TYPE_GPU && device_properties.vendorId == 0x8086) { gpuDevice++; if (gpuDevice == gpuDeviceToGrab + 1) { hDevice = device; hDriver = driver; break; } } } if (hDevice) break; } assert(hDriver); assert(hDevice); // Create default command context ze_context_desc_t contextDesc = {}; // use default values L0_SAFE_CALL(zeContextCreate(hDriver, &contextDesc, &hContext)); // Create a command queue ze_command_queue_desc_t commandQueueDesc = {}; commandQueueDesc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS; commandQueueDesc.priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL; L0_SAFE_CALL(zeCommandQueueCreate(hContext, hDevice, &commandQueueDesc, &hCommandQueue)); std::ifstream is; #ifdef TEST_ZEBIN std::string fn = "test_xe.bin"; #else std::string fn = "test_xe.spv"; #endif is.open(fn, std::ios::binary); if (!is.good()) { fprintf(stderr, "Open %s failed\n", fn.c_str()); return; } is.seekg(0, std::ios::end); size_t codeSize = is.tellg(); is.seekg(0, std::ios::beg); if (codeSize == 0) { return; } unsigned char *codeBin = new unsigned char[codeSize]; if (!codeBin) { return; } is.read((char *)codeBin, codeSize); is.close(); std::string igcOptions = "-vc-codegen -no-optimize -Xfinalizer '-presched'"; #ifdef ISPC_IS_LINUX // `newspillcost` is not yet supported on Windows in open source // TODO: use `newspillcost` for all platforms as soon as it available igcOptions += " -Xfinalizer '-newspillcost'"; #endif const char *userIgcOptionsEnv = getenv("ISPCRT_IGC_OPTIONS"); if (userIgcOptionsEnv) { std::string userIgcOptions(userIgcOptionsEnv); if (userIgcOptions.length() >= 3) { auto prefix = userIgcOptions.substr(0, 2); if (prefix == "+ ") { igcOptions += ' ' + userIgcOptions.substr(2); } else if (prefix == "= ") { igcOptions = userIgcOptions.substr(2); } else { throw std::runtime_error("Invalid ISPCRT_IGC_OPTIONS string" + userIgcOptions); } } else { throw std::runtime_error("Invalid ISPCRT_IGC_OPTIONS string" + userIgcOptions); } } // Create module ze_module_desc_t moduleDesc = {}; #ifdef TEST_ZEBIN moduleDesc.format = ZE_MODULE_FORMAT_NATIVE; #else moduleDesc.format = ZE_MODULE_FORMAT_IL_SPIRV; #endif moduleDesc.pInputModule = codeBin; moduleDesc.inputSize = codeSize; moduleDesc.pBuildFlags = igcOptions.c_str(); // Add build log output for easier debugginer the tests ze_module_build_log_handle_t buildlog; if (zeModuleCreate(hContext, hDevice, &moduleDesc, &hModule, &buildlog) != ZE_RESULT_SUCCESS) { size_t szLog = 0; zeModuleBuildLogGetString(buildlog, &szLog, nullptr); char *strLog = (char *)malloc(szLog); zeModuleBuildLogGetString(buildlog, &szLog, strLog); std::cout << "Build log:" << strLog << std::endl; free(strLog); } L0_SAFE_CALL(zeModuleBuildLogDestroy(buildlog)); } static void L0Create_Kernel(ze_device_handle_t &hDevice, ze_module_handle_t &hModule, ze_context_handle_t &hContext, ze_command_list_handle_t &hCommandList, ze_kernel_handle_t &hKernel, const char *name) { // Create command list ze_command_list_desc_t commandListDesc = {}; L0_SAFE_CALL(zeCommandListCreate(hContext, hDevice, &commandListDesc, &hCommandList)); ze_kernel_desc_t kernelDesc = {}; kernelDesc.pKernelName = name; L0_SAFE_CALL(zeKernelCreate(hModule, &kernelDesc, &hKernel)); // Set device/shared indirect flags ze_kernel_indirect_access_flags_t kernel_flags = ZE_KERNEL_INDIRECT_ACCESS_FLAG_DEVICE | ZE_KERNEL_INDIRECT_ACCESS_FLAG_SHARED; } static void L0Launch_Kernel(ze_command_queue_handle_t &hCommandQueue, ze_command_list_handle_t &hCommandList, ze_kernel_handle_t &hKernel, int bufsize = 0, void *return_data = nullptr, void *OUTBuff = nullptr, int groupSpaceWidth = 1, int groupSpaceHeight = 1) { // set group size uint32_t group_size = groupSpaceWidth * groupSpaceHeight; L0_SAFE_CALL(zeKernelSetGroupSize(hKernel, /*x*/ groupSpaceWidth, /*y*/ groupSpaceHeight, /*z*/ 1)); // set grid size ze_group_count_t dispatchTraits = {1, 1, 1}; // launch L0_SAFE_CALL(zeCommandListAppendBarrier(hCommandList, nullptr, 0, nullptr)); L0_SAFE_CALL(zeCommandListAppendLaunchKernel(hCommandList, hKernel, &dispatchTraits, nullptr, 0, nullptr)); L0_SAFE_CALL(zeCommandListAppendBarrier(hCommandList, nullptr, 0, nullptr)); // copy result to host if (return_data && OUTBuff) L0_SAFE_CALL(zeCommandListAppendMemoryCopy(hCommandList, return_data, OUTBuff, bufsize, nullptr, 0, nullptr)); // dispatch & wait L0_SAFE_CALL(zeCommandListClose(hCommandList)); L0_SAFE_CALL(zeCommandQueueExecuteCommandLists(hCommandQueue, 1, &hCommandList, nullptr)); L0_SAFE_CALL(zeCommandQueueSynchronize(hCommandQueue, (std::numeric_limits::max)())); } static void L0Launch_F_V(ze_device_handle_t &hDevice, ze_module_handle_t &hModule, ze_context_handle_t &hContext, ze_command_queue_handle_t &hCommandQueue, void *return_data) { ze_command_list_handle_t hCommandList; ze_kernel_handle_t hKernel; L0Create_Kernel(hDevice, hModule, hContext, hCommandList, hKernel, "f_v"); // allocate buffers ze_device_mem_alloc_desc_t allocDesc = {}; void *OUTBuff = nullptr; L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(float), N * sizeof(float), hDevice, &OUTBuff)); // copy buffers to device L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, OUTBuff, return_data, N * sizeof(float), nullptr, 0, nullptr)); // set kernel arguments L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 0, sizeof(OUTBuff), &OUTBuff)); L0Launch_Kernel(hCommandQueue, hCommandList, hKernel, N * sizeof(float), return_data, OUTBuff); L0_SAFE_CALL(zeMemFree(hContext, OUTBuff)); L0_SAFE_CALL(zeKernelDestroy(hKernel)); L0_SAFE_CALL(zeCommandListDestroy(hCommandList)); } static void L0Launch_F_Threads(ze_device_handle_t &hDevice, ze_module_handle_t &hModule, ze_context_handle_t &hContext, ze_command_queue_handle_t &hCommandQueue, void *return_data, int groupSpaceWidth, int groupSpaceHeight) { ze_command_list_handle_t hCommandList; ze_kernel_handle_t hKernel; L0Create_Kernel(hDevice, hModule, hContext, hCommandList, hKernel, "f_t"); // allocate buffers ze_device_mem_alloc_desc_t allocDesc = {}; void *OUTBuff = nullptr; L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(float), N * sizeof(float), hDevice, &OUTBuff)); // copy buffers to device L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, OUTBuff, return_data, N * sizeof(float), nullptr, 0, nullptr)); // set kernel arguments L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 0, sizeof(OUTBuff), &OUTBuff)); L0Launch_Kernel(hCommandQueue, hCommandList, hKernel, N * sizeof(float), return_data, OUTBuff, groupSpaceWidth, groupSpaceHeight); L0_SAFE_CALL(zeMemFree(hContext, OUTBuff)); L0_SAFE_CALL(zeKernelDestroy(hKernel)); L0_SAFE_CALL(zeCommandListDestroy(hCommandList)); } static void L0Launch_F_F(ze_device_handle_t &hDevice, ze_module_handle_t &hModule, ze_context_handle_t &hContext, ze_command_queue_handle_t &hCommandQueue, void *return_data, void *vfloat_data) { ze_command_list_handle_t hCommandList; ze_kernel_handle_t hKernel; L0Create_Kernel(hDevice, hModule, hContext, hCommandList, hKernel, "f_f"); // allocate buffers ze_device_mem_alloc_desc_t allocDesc = {}; void *OUTBuff = nullptr, *INBuff = nullptr; L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(float), N * sizeof(float), hDevice, &OUTBuff)); L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(float), N * sizeof(float), hDevice, &INBuff)); // copy buffers to device L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, OUTBuff, return_data, N * sizeof(float), nullptr, 0, nullptr)); L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, INBuff, vfloat_data, N * sizeof(float), nullptr, 0, nullptr)); // set kernel arguments L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 0, sizeof(OUTBuff), &OUTBuff)); L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 1, sizeof(INBuff), &INBuff)); L0Launch_Kernel(hCommandQueue, hCommandList, hKernel, N * sizeof(float), return_data, OUTBuff); L0_SAFE_CALL(zeMemFree(hContext, OUTBuff)); L0_SAFE_CALL(zeMemFree(hContext, INBuff)); L0_SAFE_CALL(zeKernelDestroy(hKernel)); L0_SAFE_CALL(zeCommandListDestroy(hCommandList)); } static void L0Launch_F_FI(ze_device_handle_t &hDevice, ze_module_handle_t &hModule, ze_context_handle_t &hContext, ze_command_queue_handle_t &hCommandQueue, void *return_data, void *vfloat_data, void *vint_data) { ze_command_list_handle_t hCommandList; ze_kernel_handle_t hKernel; L0Create_Kernel(hDevice, hModule, hContext, hCommandList, hKernel, "f_fi"); // allocate buffers ze_device_mem_alloc_desc_t allocDesc = {}; void *OUTBuff = nullptr, *INBuff = nullptr, *IN1Buff = nullptr; L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(float), N * sizeof(float), hDevice, &OUTBuff)); L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(float), N * sizeof(float), hDevice, &INBuff)); L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(int), N * sizeof(int), hDevice, &IN1Buff)); // copy buffers to device L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, OUTBuff, return_data, N * sizeof(float), nullptr, 0, nullptr)); L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, INBuff, vfloat_data, N * sizeof(float), nullptr, 0, nullptr)); L0_SAFE_CALL(zeCommandListAppendMemoryCopy(hCommandList, IN1Buff, vint_data, N * sizeof(int), nullptr, 0, nullptr)); // set kernel arguments L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 0, sizeof(OUTBuff), &OUTBuff)); L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 1, sizeof(INBuff), &INBuff)); L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 2, sizeof(IN1Buff), &IN1Buff)); L0Launch_Kernel(hCommandQueue, hCommandList, hKernel, N * sizeof(float), return_data, OUTBuff); L0_SAFE_CALL(zeMemFree(hContext, OUTBuff)); L0_SAFE_CALL(zeMemFree(hContext, INBuff)); L0_SAFE_CALL(zeMemFree(hContext, IN1Buff)); L0_SAFE_CALL(zeKernelDestroy(hKernel)); L0_SAFE_CALL(zeCommandListDestroy(hCommandList)); } static void L0Launch_F_FU(ze_device_handle_t &hDevice, ze_module_handle_t &hModule, ze_context_handle_t &hContext, ze_command_queue_handle_t &hCommandQueue, void *return_data, void *vfloat_data, float b) { ze_command_list_handle_t hCommandList; ze_kernel_handle_t hKernel; L0Create_Kernel(hDevice, hModule, hContext, hCommandList, hKernel, "f_fu"); // allocate buffers ze_device_mem_alloc_desc_t allocDesc = {}; void *OUTBuff = nullptr, *INBuff = nullptr; L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(float), N * sizeof(float), hDevice, &OUTBuff)); L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(float), N * sizeof(float), hDevice, &INBuff)); // copy buffers to device L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, OUTBuff, return_data, N * sizeof(float), nullptr, 0, nullptr)); L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, INBuff, vfloat_data, N * sizeof(float), nullptr, 0, nullptr)); // set kernel arguments L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 0, sizeof(OUTBuff), &OUTBuff)); L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 1, sizeof(INBuff), &INBuff)); L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 2, sizeof(float), &b)); L0Launch_Kernel(hCommandQueue, hCommandList, hKernel, N * sizeof(float), return_data, OUTBuff); L0_SAFE_CALL(zeMemFree(hContext, OUTBuff)); L0_SAFE_CALL(zeMemFree(hContext, INBuff)); L0_SAFE_CALL(zeKernelDestroy(hKernel)); L0_SAFE_CALL(zeCommandListDestroy(hCommandList)); } static void L0Launch_F_DU(ze_device_handle_t &hDevice, ze_module_handle_t &hModule, ze_context_handle_t &hContext, ze_command_queue_handle_t &hCommandQueue, void *return_data, void *vdouble_data, double b) { ze_command_list_handle_t hCommandList; ze_kernel_handle_t hKernel; L0Create_Kernel(hDevice, hModule, hContext, hCommandList, hKernel, "f_du"); // allocate buffers ze_device_mem_alloc_desc_t allocDesc = {}; void *OUTBuff = nullptr, *INBuff = nullptr; L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(float), N * sizeof(float), hDevice, &OUTBuff)); L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(double), N * sizeof(double), hDevice, &INBuff)); // copy buffers to device L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, OUTBuff, return_data, N * sizeof(float), nullptr, 0, nullptr)); L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, INBuff, vdouble_data, N * sizeof(double), nullptr, 0, nullptr)); // set kernel arguments L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 0, sizeof(OUTBuff), &OUTBuff)); L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 1, sizeof(INBuff), &INBuff)); L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 2, sizeof(double), &b)); L0Launch_Kernel(hCommandQueue, hCommandList, hKernel, N * sizeof(float), return_data, OUTBuff); L0_SAFE_CALL(zeMemFree(hContext, OUTBuff)); L0_SAFE_CALL(zeMemFree(hContext, INBuff)); L0_SAFE_CALL(zeKernelDestroy(hKernel)); L0_SAFE_CALL(zeCommandListDestroy(hCommandList)); } static void L0Launch_F_DUF(ze_device_handle_t &hDevice, ze_module_handle_t &hModule, ze_context_handle_t &hContext, ze_command_queue_handle_t &hCommandQueue, void *return_data, void *vdouble_data, float b) { ze_command_list_handle_t hCommandList; ze_kernel_handle_t hKernel; L0Create_Kernel(hDevice, hModule, hContext, hCommandList, hKernel, "f_duf"); // allocate buffers ze_device_mem_alloc_desc_t allocDesc = {}; void *OUTBuff = nullptr, *INBuff = nullptr; L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(float), N * sizeof(float), hDevice, &OUTBuff)); L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(double), N * sizeof(double), hDevice, &INBuff)); // copy buffers to device L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, OUTBuff, return_data, N * sizeof(float), nullptr, 0, nullptr)); L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, INBuff, vdouble_data, N * sizeof(double), nullptr, 0, nullptr)); // set kernel arguments L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 0, sizeof(OUTBuff), &OUTBuff)); L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 1, sizeof(INBuff), &INBuff)); L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 2, sizeof(float), &b)); L0Launch_Kernel(hCommandQueue, hCommandList, hKernel, N * sizeof(float), return_data, OUTBuff); L0_SAFE_CALL(zeMemFree(hContext, OUTBuff)); L0_SAFE_CALL(zeMemFree(hContext, INBuff)); L0_SAFE_CALL(zeKernelDestroy(hKernel)); L0_SAFE_CALL(zeCommandListDestroy(hCommandList)); } static void L0Launch_F_DI(ze_device_handle_t &hDevice, ze_module_handle_t &hModule, ze_context_handle_t &hContext, ze_command_queue_handle_t &hCommandQueue, void *return_data, void *vdouble_data, void *vint2_data) { ze_command_list_handle_t hCommandList; ze_kernel_handle_t hKernel; L0Create_Kernel(hDevice, hModule, hContext, hCommandList, hKernel, "f_di"); // allocate buffers ze_device_mem_alloc_desc_t allocDesc = {}; void *OUTBuff = nullptr, *INBuff = nullptr, *IN1Buff = nullptr; L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(float), N * sizeof(float), hDevice, &OUTBuff)); L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(double), N * sizeof(double), hDevice, &INBuff)); L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(int), N * sizeof(int), hDevice, &IN1Buff)); // copy buffers to device L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, OUTBuff, return_data, N * sizeof(float), nullptr, 0, nullptr)); L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, INBuff, vdouble_data, N * sizeof(double), nullptr, 0, nullptr)); L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, IN1Buff, vint2_data, N * sizeof(int), nullptr, 0, nullptr)); // set kernel arguments L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 0, sizeof(OUTBuff), &OUTBuff)); L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 1, sizeof(INBuff), &INBuff)); L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 2, sizeof(IN1Buff), &IN1Buff)); L0Launch_Kernel(hCommandQueue, hCommandList, hKernel, N * sizeof(float), return_data, OUTBuff); L0_SAFE_CALL(zeMemFree(hContext, OUTBuff)); L0_SAFE_CALL(zeMemFree(hContext, INBuff)); L0_SAFE_CALL(zeMemFree(hContext, IN1Buff)); L0_SAFE_CALL(zeKernelDestroy(hKernel)); L0_SAFE_CALL(zeCommandListDestroy(hCommandList)); } static void L0Launch_Print_UF(ze_device_handle_t &hDevice, ze_module_handle_t &hModule, ze_context_handle_t &hContext, ze_command_queue_handle_t &hCommandQueue, float b) { ze_command_list_handle_t hCommandList; ze_kernel_handle_t hKernel; L0Create_Kernel(hDevice, hModule, hContext, hCommandList, hKernel, "print_uf"); // set kernel arguments L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 0, sizeof(float), &b)); L0Launch_Kernel(hCommandQueue, hCommandList, hKernel); L0_SAFE_CALL(zeKernelDestroy(hKernel)); L0_SAFE_CALL(zeCommandListDestroy(hCommandList)); } static void L0Launch_Print_F(ze_device_handle_t &hDevice, ze_module_handle_t &hModule, ze_context_handle_t &hContext, ze_command_queue_handle_t &hCommandQueue, void *vfloat_data) { ze_command_list_handle_t hCommandList; ze_kernel_handle_t hKernel; L0Create_Kernel(hDevice, hModule, hContext, hCommandList, hKernel, "print_f"); // allocate buffers ze_device_mem_alloc_desc_t allocDesc = {}; void *INBuff = nullptr; L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(float), N * sizeof(float), hDevice, &INBuff)); // copy buffers to device L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, INBuff, vfloat_data, N * sizeof(float), nullptr, 0, nullptr)); // set kernel arguments L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 0, sizeof(INBuff), &INBuff)); L0Launch_Kernel(hCommandQueue, hCommandList, hKernel); L0_SAFE_CALL(zeMemFree(hContext, INBuff)); L0_SAFE_CALL(zeKernelDestroy(hKernel)); L0_SAFE_CALL(zeCommandListDestroy(hCommandList)); } static void L0Launch_Print_FUF(ze_device_handle_t &hDevice, ze_module_handle_t &hModule, ze_context_handle_t &hContext, ze_command_queue_handle_t &hCommandQueue, void *vfloat_data, float b) { ze_command_list_handle_t hCommandList; ze_kernel_handle_t hKernel; L0Create_Kernel(hDevice, hModule, hContext, hCommandList, hKernel, "print_fuf"); // allocate buffers ze_device_mem_alloc_desc_t allocDesc = {}; void *INBuff = nullptr; L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(float), N * sizeof(float), hDevice, &INBuff)); // copy buffers to device L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, INBuff, vfloat_data, N * sizeof(float), nullptr, 0, nullptr)); // set kernel arguments L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 0, sizeof(INBuff), &INBuff)); L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 1, sizeof(float), &b)); L0Launch_Kernel(hCommandQueue, hCommandList, hKernel); L0_SAFE_CALL(zeMemFree(hContext, INBuff)); L0_SAFE_CALL(zeKernelDestroy(hKernel)); L0_SAFE_CALL(zeCommandListDestroy(hCommandList)); } static void L0Launch_Print_NO(ze_device_handle_t &hDevice, ze_module_handle_t &hModule, ze_context_handle_t &hContext, ze_command_queue_handle_t &hCommandQueue) { ze_command_list_handle_t hCommandList; ze_kernel_handle_t hKernel; L0Create_Kernel(hDevice, hModule, hContext, hCommandList, hKernel, "print_no"); L0Launch_Kernel(hCommandQueue, hCommandList, hKernel); L0_SAFE_CALL(zeKernelDestroy(hKernel)); L0_SAFE_CALL(zeCommandListDestroy(hCommandList)); } static void L0Launch_Result(ze_device_handle_t &hDevice, ze_module_handle_t &hModule, ze_context_handle_t &hContext, ze_command_queue_handle_t &hCommandQueue, void *return_data) { ze_command_list_handle_t hCommandList; ze_kernel_handle_t hKernel; L0Create_Kernel(hDevice, hModule, hContext, hCommandList, hKernel, "result"); // allocate buffers ze_device_mem_alloc_desc_t allocDesc = {}; void *OUTBuff = nullptr; L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(float), N * sizeof(float), hDevice, &OUTBuff)); // copy buffers to device L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, OUTBuff, return_data, N * sizeof(float), nullptr, 0, nullptr)); // set kernel arguments L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 0, sizeof(OUTBuff), &OUTBuff)); L0Launch_Kernel(hCommandQueue, hCommandList, hKernel, N * sizeof(float), return_data, OUTBuff); L0_SAFE_CALL(zeMemFree(hContext, OUTBuff)); L0_SAFE_CALL(zeKernelDestroy(hKernel)); L0_SAFE_CALL(zeCommandListDestroy(hCommandList)); } static void L0Launch_Print_Result(ze_device_handle_t &hDevice, ze_module_handle_t &hModule, ze_context_handle_t &hContext, ze_command_queue_handle_t &hCommandQueue) { ze_command_list_handle_t hCommandList; ze_kernel_handle_t hKernel; L0Create_Kernel(hDevice, hModule, hContext, hCommandList, hKernel, "print_result"); L0Launch_Kernel(hCommandQueue, hCommandList, hKernel); L0_SAFE_CALL(zeKernelDestroy(hKernel)); L0_SAFE_CALL(zeCommandListDestroy(hCommandList)); } static void L0Launch_Result_Threads(ze_device_handle_t &hDevice, ze_module_handle_t &hModule, ze_context_handle_t &hContext, ze_command_queue_handle_t &hCommandQueue, void *return_data, int groupSpaceWidth, int groupSpaceHeight) { ze_command_list_handle_t hCommandList; ze_kernel_handle_t hKernel; L0Create_Kernel(hDevice, hModule, hContext, hCommandList, hKernel, "result_t"); // allocate buffers ze_device_mem_alloc_desc_t allocDesc = {}; void *OUTBuff = nullptr; L0_SAFE_CALL(zeMemAllocDevice(hContext, &allocDesc, N * sizeof(float), N * sizeof(float), hDevice, &OUTBuff)); // copy buffers to device L0_SAFE_CALL( zeCommandListAppendMemoryCopy(hCommandList, OUTBuff, return_data, N * sizeof(float), nullptr, 0, nullptr)); // set kernel arguments L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 0, sizeof(OUTBuff), &OUTBuff)); L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 1, sizeof(int), &groupSpaceWidth)); L0_SAFE_CALL(zeKernelSetArgumentValue(hKernel, 2, sizeof(int), &groupSpaceHeight)); L0Launch_Kernel(hCommandQueue, hCommandList, hKernel, N * sizeof(float), return_data, OUTBuff, groupSpaceWidth, groupSpaceHeight); L0_SAFE_CALL(zeMemFree(hContext, OUTBuff)); L0_SAFE_CALL(zeKernelDestroy(hKernel)); L0_SAFE_CALL(zeCommandListDestroy(hCommandList)); } int main(int argc, char *argv[]) { // init data struct alignas(4096) AlignedArray { float data[N]; } returned_result, expected_result, vfloat; struct alignas(4096) AlignedArray1 { int data[N]; } vint, vint2; struct alignas(4096) AlignedArray2 { double data[N]; } vdouble; for (int i = 0; i < N; ++i) { returned_result.data[i] = float(-1e20); vfloat.data[i] = float(i + 1); vdouble.data[i] = double(i + 1); vint.data[i] = 2 * (i + 1); vint2.data[i] = i + 5; } void *return_data = returned_result.data; void *expect_data = expected_result.data; void *vfloat_data = vfloat.data; void *vint_data = vint.data; void *vint2_data = vint2.data; void *vdouble_data = vdouble.data; ze_device_handle_t hDevice = nullptr; ze_module_handle_t hModule = nullptr; ze_driver_handle_t hDriver = nullptr; ze_context_handle_t hContext = nullptr; ze_command_queue_handle_t hCommandQueue = nullptr; L0InitContext(hDevice, hModule, hContext, hCommandQueue); #if (TEST_SIG == 0) L0Launch_F_V(hDevice, hModule, hContext, hCommandQueue, return_data); #elif (TEST_SIG == 1) L0Launch_F_F(hDevice, hModule, hContext, hCommandQueue, return_data, vfloat_data); #elif (TEST_SIG == 2) float num = 5.0f; L0Launch_F_FU(hDevice, hModule, hContext, hCommandQueue, return_data, vfloat_data, num); #elif (TEST_SIG == 3) L0Launch_F_FI(hDevice, hModule, hContext, hCommandQueue, return_data, vfloat_data, vint_data); #elif (TEST_SIG == 4) double num = 5.0; L0Launch_F_DU(hDevice, hModule, hContext, hCommandQueue, return_data, vdouble_data, num); #elif (TEST_SIG == 5) float num = 5.0f; L0Launch_F_DUF(hDevice, hModule, hContext, hCommandQueue, return_data, vdouble_data, num); #elif (TEST_SIG == 6) L0Launch_F_DI(hDevice, hModule, hContext, hCommandQueue, return_data, vdouble_data, vint2_data); #elif (TEST_SIG == 7) // L0Launch_F_SZ(return_data); #error "Currently unsupported for Xe" #elif (TEST_SIG == 8) int groupSpaceWidth = 2; int groupSpaceHeight = 16; assert(N >= groupSpaceWidth * groupSpaceHeight); L0Launch_F_Threads(hDevice, hModule, hContext, hCommandQueue, return_data, groupSpaceWidth, groupSpaceHeight); L0Launch_Result_Threads(hDevice, hModule, hContext, hCommandQueue, expect_data, groupSpaceWidth, groupSpaceHeight); #elif (TEST_SIG == 32) L0Launch_Print_UF(hDevice, hModule, hContext, hCommandQueue, 5.0f); #elif (TEST_SIG == 33) L0Launch_Print_F(hDevice, hModule, hContext, hCommandQueue, vfloat_data); #elif (TEST_SIG == 34) L0Launch_Print_FUF(hDevice, hModule, hContext, hCommandQueue, vfloat_data, 5.0f); #elif (TEST_SIG == 35) L0Launch_Print_NO(hDevice, hModule, hContext, hCommandQueue); #else #error "Unknown or unset TEST_SIG value" #endif #if 0 const bool verbose = true; #else const bool verbose = false; #endif #if (TEST_SIG < 8) L0Launch_Result(hDevice, hModule, hContext, hCommandQueue, expect_data); #elif (TEST_SIG >= 32) L0Launch_Print_Result(hDevice, hModule, hContext, hCommandQueue); return 0; #endif L0_SAFE_CALL(zeCommandQueueDestroy(hCommandQueue)); L0_SAFE_CALL(zeModuleDestroy(hModule)); L0_SAFE_CALL(zeContextDestroy(hContext)); // check results. int errors = 0; for (int i = 0; i < width(); ++i) { if (fabs(returned_result.data[i] - expected_result.data[i]) > 16 * FLT_EPSILON) { #ifdef EXPECT_FAILURE // bingo, failed return 1; #else printf("%s: value %d disagrees: returned %f [%a], expected %f [%a]\n", argv[0], i, returned_result.data[i], returned_result.data[i], expected_result.data[i], expected_result.data[i]); ++errors; #endif // EXPECT_FAILURE } } #ifdef EXPECT_FAILURE // Don't expect to get here return 0; #else return errors > 0; #endif }