/* * Copyright (c) 2021-2023 ARM Limited. * * SPDX-License-Identifier: MIT * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or * sell copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in all * copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include "libgpuinfo.hpp" namespace libgpuinfo { struct product_entry { uint32_t id; uint32_t mask; uint32_t min_cores; const char* name; const char* architecture; uint32_t fp32_fmas_per_engine; std::function get_num_texels; std::function get_num_pixels; std::function get_num_exec_engines; }; static const uint32_t MASK_OLD { 0xFFFF }; static const uint32_t MASK_NEW { 0xF00F }; static uint32_t get_num_1( int core_count, uint32_t core_features, uint32_t thread_features ) { return 1; } static uint32_t get_num_2( int core_count, uint32_t core_features, uint32_t thread_features ) { return 2; } static uint32_t get_num_3( int core_count, uint32_t core_features, uint32_t thread_features ) { return 3; } static uint32_t get_num_4( int core_count, uint32_t core_features, uint32_t thread_features ) { return 4; } static uint32_t get_num_8( int core_count, uint32_t core_features, uint32_t thread_features ) { return 8; } static uint32_t get_num_ee_g31( int core_count, uint32_t core_features, uint32_t thread_features ) { if ((core_count == 1) && ((thread_features & 0xFFFF) == 0x2000)) { return 1; } return 2; } static uint32_t get_num_ee_g51( int core_count, uint32_t core_features, uint32_t thread_features ) { if ((core_count == 1) && ((thread_features & 0xFFFF) == 0x2000)) { return 1; } return 3; } static uint32_t get_num_ee_g52( int core_count, uint32_t core_features, uint32_t thread_features ) { return core_features & 0xF; } static uint32_t get_num_ee_g310_g510( int core_count, uint32_t core_features, uint32_t thread_features ) { if ((core_features & 0xF) <= 1) { return 1; } return 2; } const std::array PRODUCT_VERSIONS {{ // ID, ID Mask, Min cores, Name, Args, FMA, Texels, Pixels, Engines product_entry { 0x6956, MASK_OLD, 1, "Mali-T600", "Midgard", 4, get_num_1, get_num_1, get_num_2 }, product_entry { 0x0620, MASK_OLD, 1, "Mali-T620", "Midgard", 4, get_num_1, get_num_1, get_num_2 }, product_entry { 0x0720, MASK_OLD, 1, "Mali-T720", "Midgard", 4, get_num_1, get_num_1, get_num_1 }, product_entry { 0x0750, MASK_OLD, 1, "Mali-T760", "Midgard", 4, get_num_1, get_num_1, get_num_2 }, product_entry { 0x0820, MASK_OLD, 1, "Mali-T820", "Midgard", 4, get_num_1, get_num_1, get_num_1 }, product_entry { 0x0830, MASK_OLD, 1, "Mali-T830", "Midgard", 4, get_num_1, get_num_1, get_num_2 }, product_entry { 0x0860, MASK_OLD, 1, "Mali-T860", "Midgard", 4, get_num_1, get_num_1, get_num_2 }, product_entry { 0x0880, MASK_OLD, 1, "Mali-T880", "Midgard", 4, get_num_1, get_num_1, get_num_3 }, product_entry { 0x6000, MASK_NEW, 1, "Mali-G71", "Bifrost", 4, get_num_1, get_num_1, get_num_3 }, product_entry { 0x6001, MASK_NEW, 1, "Mali-G72", "Bifrost", 4, get_num_1, get_num_1, get_num_3 }, product_entry { 0x7000, MASK_NEW, 1, "Mali-G51", "Bifrost", 4, get_num_2, get_num_2, get_num_ee_g51 }, product_entry { 0x7001, MASK_NEW, 1, "Mali-G76", "Bifrost", 8, get_num_2, get_num_2, get_num_3 }, product_entry { 0x7002, MASK_NEW, 1, "Mali-G52", "Bifrost", 8, get_num_2, get_num_2, get_num_ee_g52 }, product_entry { 0x7003, MASK_NEW, 1, "Mali-G31", "Bifrost", 4, get_num_2, get_num_2, get_num_ee_g31 }, product_entry { 0x9000, MASK_NEW, 1, "Mali-G77", "Valhall", 16, get_num_4, get_num_2, get_num_2 }, product_entry { 0x9001, MASK_NEW, 1, "Mali-G57", "Valhall", 16, get_num_4, get_num_2, get_num_2 }, product_entry { 0x9003, MASK_NEW, 1, "Mali-G57", "Valhall", 16, get_num_4, get_num_2, get_num_2 }, product_entry { 0x9004, MASK_NEW, 1, "Mali-G68", "Valhall", 16, get_num_4, get_num_2, get_num_2 }, product_entry { 0x9002, MASK_NEW, 1, "Mali-G78", "Valhall", 16, get_num_4, get_num_2, get_num_2 }, product_entry { 0x9005, MASK_NEW, 1, "Mali-G78AE", "Valhall", 16, get_num_4, get_num_2, get_num_2 }, product_entry { 0xa002, MASK_NEW, 1, "Mali-G710", "Valhall", 32, get_num_8, get_num_4, get_num_2 }, product_entry { 0xa007, MASK_NEW, 1, "Mali-G610", "Valhall", 32, get_num_8, get_num_4, get_num_2 }, // TODO: Extract FMA, pixel, and texel settings product_entry { 0xa003, MASK_NEW, 1, "Mali-G510", "Valhall", 32, get_num_8, get_num_4, get_num_ee_g310_g510 }, // TODO: Extract FMA, pixel, and texel settings product_entry { 0xa004, MASK_NEW, 1, "Mali-G310", "Valhall", 32, get_num_8, get_num_4, get_num_ee_g310_g510 }, product_entry { 0xb002, MASK_NEW, 10, "Immortalis-G715", "Valhall", 64, get_num_8, get_num_4, get_num_2 }, product_entry { 0xb003, MASK_NEW, 10, "Immortalis-G715", "Valhall", 64, get_num_8, get_num_4, get_num_2 }, product_entry { 0xb002, MASK_NEW, 7, "Mali-G715", "Valhall", 64, get_num_8, get_num_4, get_num_2 }, product_entry { 0xb003, MASK_NEW, 7, "Mali-G715", "Valhall", 64, get_num_8, get_num_4, get_num_2 }, product_entry { 0xb002, MASK_NEW, 1, "Mali-G615", "Valhall", 64, get_num_8, get_num_4, get_num_2 }, product_entry { 0xb003, MASK_NEW, 1, "Mali-G615", "Valhall", 64, get_num_8, get_num_4, get_num_2 }, }}; uint32_t get_gpu_id( uint32_t gpu_id ) { for (const auto& entry : PRODUCT_VERSIONS) { if (((gpu_id & entry.mask) == entry.id)) { return gpu_id & entry.mask; } } return gpu_id; } const char* get_gpu_name( uint32_t gpu_id, int core_count ) { for (const auto& entry : PRODUCT_VERSIONS) { if(((gpu_id & entry.mask) == entry.id) && (core_count >= entry.min_cores)) { return entry.name; } } return "Unknown gpu_id"; } const char* get_architecture_name( uint32_t gpu_id ) { for (const auto& entry : PRODUCT_VERSIONS) { if((gpu_id & entry.mask) == entry.id) { return entry.architecture; } } return "Unknown gpu_id"; } int get_num_exec_engines( uint32_t gpu_id, int core_count, uint32_t core_features, uint32_t thread_features ) { for (const auto& entry : PRODUCT_VERSIONS) { if(((gpu_id & entry.mask) == entry.id) && (core_count >= entry.min_cores)) { return entry.get_num_exec_engines(core_count, core_features, thread_features); } } return 0; } const uint32_t get_num_fp32_fmas( uint32_t gpu_id, int core_count, uint32_t core_features, uint32_t thread_features ) { for (const auto& entry : PRODUCT_VERSIONS) { if(((gpu_id & entry.mask) == entry.id) && (core_count >= entry.min_cores)) { return entry.fp32_fmas_per_engine * entry.get_num_exec_engines(core_count, core_features, thread_features); } } return 0; } const uint32_t get_num_texels( uint32_t gpu_id, int core_count, uint32_t core_features, uint32_t thread_features ) { for (const auto& entry : PRODUCT_VERSIONS) { if(((gpu_id & entry.mask) == entry.id) && (core_count >= entry.min_cores)) { return entry.get_num_texels(core_count, core_features, thread_features); } } return 0; } const uint32_t get_num_pixels( uint32_t gpu_id, int core_count, uint32_t core_features, uint32_t thread_features ) { for (const auto& entry : PRODUCT_VERSIONS) { if(((gpu_id & entry.mask) == entry.id) && (core_count >= entry.min_cores)) { return entry.get_num_pixels(core_count, core_features, thread_features); } } return 0; } /** Kbase Pre R21 ioctl interface. */ namespace kbase_pre_r21 { /** Related to mali0 ioctl interface */ enum class header_id : uint32_t { /** Version check. */ version_check = 0, /** Base Context Create Kernel Flags. */ create_kernel_flags = 2, /** Kbase Func Get Props. */ get_props = 526, /** Kbase Func Set Flags. */ set_flags = 530, }; /** Message header. */ union uk_header { /** Number identifying the called UK function. */ header_id id; /** The return code of the called UK function. */ uint32_t ret; /** Dummy to ensure type has 64-bit alignment */ uint64_t sizer; }; /** Check version compatibility between kernel and userspace. */ struct version_check_t { /** UK header */ uk_header header; /** Major version number */ uint16_t major; /** Minor version number */ uint16_t minor; bool is_set() const { return major || minor; } }; /** IOCTL parameters to set flags */ struct set_flags_t { /** UK header */ uk_header header; /** Create flags */ uint32_t create_flags; /** Padding */ uint32_t padding; }; /** Base GPU Num Texture Features Registers. */ static constexpr const uint32_t base_gpu_num_texture_features_registers = 3; /** Base Max Coherent Groups. */ static constexpr const uint32_t base_max_coherent_groups = 16; /** GPU Max Job Slots. */ static constexpr const uint32_t gpu_max_job_slots = 16; /** Kbase UK GPU props. */ struct uk_gpuprops_t { /** * IOCTL parameters to probe GPU properties * * NOTE: the raw_props member in this data structure contains the register * values from which the value of the other members are derived. The derived * members exist to allow for efficient access and/or shielding the details * of the layout of the registers. * */ struct gpu_props { /** Core. */ struct core { /** Product specific value. */ uint32_t product_id; /** * Status of the GPU release. * No defined values, but starts at 0 and increases by one for each * release status (alpha, beta, EAC, etc.). * 4 bit values (0-15). */ uint16_t version_status; /** * Minor release number of the GPU. "P" part of an "RnPn" release number. * 8 bit values (0-255). */ uint16_t minor_revision; /** * Major release number of the GPU. "R" part of an "RnPn" release number. * 4 bit values (0-15). */ uint16_t major_revision; /** Padding. */ uint16_t padding; /** * This property is deprecated since it has not contained the real current * value of GPU clock speed. It is kept here only for backwards compatibility. * For the new ioctl interface, it is ignored and is treated as a padding * to keep the structure of the same size and retain the placement of its * members. */ uint32_t gpu_speed_mhz; /** * @usecase GPU clock max speed is required for computing best case * in tasks as job scheduling ant irq_throttling. (It is not specified in the * Midgard Architecture). * Also, GPU clock max speed is used for OpenCL's clGetDeviceInfo() function. */ uint32_t gpu_freq_khz_max; /** * @usecase GPU clock min speed is required for computing worst case * in tasks as job scheduling ant irq_throttling. (It is not specified in the * Midgard Architecture). */ uint32_t gpu_freq_khz_min; /** Size of the shader program counter, in bits. */ uint32_t log2_program_counter_size; /** * TEXTURE_FEATURES_x registers, as exposed by the GPU. This is a * bitpattern where a set bit indicates that the format is supported. * * Before using a texture format, it is recommended that the corresponding * bit be checked. */ uint32_t texture_features[base_gpu_num_texture_features_registers]; /** * Theoretical maximum memory available to the GPU. It is unlikely that a * client will be able to allocate all of this memory for their own * purposes, but this at least provides an upper bound on the memory * available to the GPU. * * This is required for OpenCL's clGetDeviceInfo() call when * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The * client will not be expecting to allocate anywhere near this value. */ uint64_t gpu_available_memory_size; }; /** * More information is possible - but associativity and bus width are not * required by upper-level apis. */ struct l2_cache { /** Log2 Line Size. */ uint8_t log2_line_size; /** Log2 Cache Size. */ uint8_t log2_cache_size; /** Num L2 Slices. */ uint8_t num_l2_slices; /** Padding bytes. */ uint8_t padding[5]; }; /** Tiler. */ struct tiler { /** Max is 4*2^15 */ uint32_t bin_size_bytes; /** Max is 2^15 */ uint32_t max_active_levels; }; /** GPU threading system details. */ struct thread { /** Max. number of threads per core */ uint32_t max_threads; /** Max. number of threads per workgroup */ uint32_t max_workgroup_size; /** Max. number of threads that can synchronize on a simple barrier */ uint32_t max_barrier_size; /** Total size [1..65535] of the register file available per core. */ uint16_t max_registers; /** Max. tasks [1..255] which may be sent to a core before it becomes blocked. */ uint8_t max_task_queue; /** Max. allowed value [1..15] of the Thread Group Split field. */ uint8_t max_thread_group_split; /** 0 = Not specified, 1 = Silicon, 2 = FPGA, 3 = SW Model/Emulation */ uint8_t impl_tech; /** Padding bytes. */ uint8_t padding[7]; }; /** * A complete description of the GPU's Hardware Configuration Discovery * registers. * * The information is presented inefficiently for access. For frequent access, * the values should be better expressed in an unpacked form in the * base_gpu_props structure. * * @usecase The raw properties in @ref gpu_raw_gpu_props are necessary to * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device * behaving differently?". In this case, all information about the * by the driver. Instead, the raw registers can be processed by the Mali * Tools software on the host PC. */ struct raw { /** Shader Present. */ uint64_t shader_present; /** Tiler Present. */ uint64_t tiler_present; /** L2 Present. */ uint64_t l2_present; /** Unused 1. */ uint64_t unused_1; /** L2 Features. */ uint32_t l2_features; /** Suspend Size. */ uint32_t suspend_size; /** Mem Features. */ uint32_t mem_features; /** Mmu Features. */ uint32_t mmu_features; /** As Present. */ uint32_t as_present; /** Js Present. */ uint32_t js_present; /** Js Features. */ uint32_t js_features[gpu_max_job_slots]; /** Tiler Features. */ uint32_t tiler_features; /** Texture Features. */ uint32_t texture_features[3]; /** GPU ID. */ uint32_t gpu_id; /** Thread Max Threads. */ uint32_t thread_max_threads; /** Thread Max Workgroup Size. */ uint32_t thread_max_workgroup_size; /** Thread Max Barrier Size. */ uint32_t thread_max_barrier_size; /** Thread Features. */ uint32_t thread_features; /** * Coherency Mode. * Note: This is the _selected_ coherency mode rather than the * available modes as exposed in the coherency_features register. */ uint32_t coherency_mode; }; /** * Coherency group information * * Note that the sizes of the members could be reduced. However, the \c group * member might be 8-byte aligned to ensure the u64 core_mask is 8-byte * aligned, thus leading to wastage if the other members sizes were reduced. * * The groups are sorted by core mask. The core masks are non-repeating and do * not intersect. */ struct coherent_group_info { /** * descriptor for a coherent group * * \c core_mask exposes all cores in that coherent group, and \c num_cores * provides a cached population-count for that mask. * * @note Whilst all cores are exposed in the mask, not all may be available to * the application, depending on the Kernel Power policy. * * @note if u64s must be 8-byte aligned, then this structure has 32-bits of * wastage. */ struct coherent_group { /** Core restriction mask required for the group */ uint64_t core_mask; /** Number of cores in the group */ uint16_t num_cores; /** Padding bytes. */ uint16_t padding[3]; }; /** Num Groups. */ uint32_t num_groups; /** * Number of core groups (coherent or not) in the GPU. Equivalent to the number of * L2 Caches. * The GPU Counter dumping writes 2048 bytes per core group, regardless of whether * the core groups are coherent or not. Hence this member is needed to calculate * how much memory is required for dumping. * @note Do not use it to work out how many valid elements are in the group[] * member. Use num_groups instead. */ uint32_t num_core_groups; /** Coherency features of the memory, accessed by @ref gpu_mem_features methods. */ uint32_t coherency; /** Padding. */ uint32_t padding; /** Descriptors of coherent groups */ coherent_group group[base_max_coherent_groups]; }; /** Core Props. */ core core_props; /** L2 Props. */ l2_cache l2_props; /** Unused to keep for backwards compatibility. */ uint64_t unused; /** Tiler Props. */ tiler tiler_props; /** Thread Props. */ thread thread_props; /** This member is large, likely to be 128 bytes. */ raw raw_props; /** This must be last member of the structure. */ coherent_group_info coherency_info; }; /** Header. */ uk_header header; /** Props. */ gpu_props props; }; constexpr auto iface_number = 0x80; /** Commands describing kbase_pre_r21 ioctl interface. */ enum command_type { /** Check version compatibility between JM kernel and userspace. */ version_check = _IOWR(iface_number, 0x0, version_check_t), /** Set kernel context creation flags. */ set_flags = _IOWR(iface_number, 0x212, set_flags_t), /** Get GPU properties. */ get_gpuprops = _IOWR(iface_number, 0x20e, uk_gpuprops_t), }; } /** Kbase Post R21 ioctl interface. */ namespace kbase_post_r21 { template class pointer64 { public: /** @return Pointer to the object. */ value_t* get() const { return reinterpret_cast(static_cast(value)); } /** * Set pointer value. * * @param ptr The new pointer value. */ void reset(value_t* ptr) { value = static_cast(reinterpret_cast(ptr)); } private: /** Pointer value as uint64_t. */ uint64_t value { 0 }; }; /** Check version compatibility between kernel and userspace. */ struct version_check_t { /** Major version number. */ uint16_t major; /** Minor version number */ uint16_t minor; bool is_set() const { return major || minor; } }; /** Set kernel context creation flags. */ struct set_flags_t { /** kernel context creation flags. */ uint32_t create_flags; }; /** * The ioctl will return the number of bytes stored into buffer or an error * on failure (e.g. size is too small). If size is specified as 0 then no * data will be written but the return value will be the number of bytes needed * for all the properties. * * flags may be used in the future to request a different format for the * buffer. With flags == 0 the following format is used. * * The buffer will be filled with pairs of values, a __u32 key identifying the * property followed by the value. The size of the value is identified using * the bottom bits of the key. The value then immediately followed the key and * is tightly packed (there is no padding). All keys and values are * little-endian. * * 00 = __u8 * 01 = __u16 * 10 = __u32 * 11 = __u64 */ struct get_gpuprops_t { /** GPU property size. */ enum class gpuprop_size : uint8_t { /** Property type is uint8_t. */ uint8 = 0x0, /** Property type is uint16_t. */ uint16 = 0x1, /** Property type is uint32_t. */ uint32 = 0x2, /** Property type is uint64_t. */ uint64 = 0x3 }; /** GPU properties codes. */ enum class gpuprop_code : uint8_t { /** Product id. */ product_id = 1, /** L2 log2 line size. */ l2_log2_line_size = 13, /** L2 log2 cache size. */ l2_log2_cache_size = 14, /** L2 num l2 slices. */ l2_num_l2_slices = 15, /** Max threads. */ max_threads = 18, /** Max registers. */ max_registers = 21, /** Raw l2 features. */ raw_l2_features = 29, /** Raw core features. */ raw_core_features = 30, /** Raw thread max threads. */ raw_thread_max_threads = 56, /** Raw thread max workgroup size. */ raw_thread_max_workgroup_size = 57, /** Raw thread max barrier size. */ raw_thread_max_barrier_size = 58, /** Raw thread features. */ raw_thread_features = 59, /** Raw coherency mode. */ raw_coherency_mode = 60, /** Coherency num groups. */ coherency_num_groups = 61, /** Coherency num core groups. */ coherency_num_core_groups = 62, /** Coherency coherency. */ coherency_coherency = 63, /** Coherency group 0. */ coherency_group_0 = 64, /** Coherency group 1. */ coherency_group_1 = 65, /** Coherency group 2. */ coherency_group_2 = 66, /** Coherency group 3. */ coherency_group_3 = 67, /** Num exec engines. */ num_exec_engines = 82 }; /** Pointer to the buffer to store properties into. */ pointer64 buffer; /** Size of the buffer. */ uint32_t size; /** Flags - must be zero for now. */ uint32_t flags; }; constexpr auto iface_number = 0x80; /** Commands describing kbase ioctl interface. */ enum command_type { /** Check version compatibility between JM kernel and userspace. */ version_check_jm = _IOWR(iface_number, 0x0, version_check_t), /** Check version compatibility between CSF kernel and userspace. */ version_check_csf = _IOWR(iface_number, 0x34, version_check_t), /** Set kernel context creation flags. */ set_flags = _IOW(iface_number, 0x1, set_flags_t), /** Get GPU properties. */ get_gpuprops = _IOW(iface_number, 0x3, get_gpuprops_t), }; } class prop_decoder { public: prop_decoder(std::vector buffer) : buffer_{ std::move(buffer) } , data_{ buffer_.data() } , size_{ buffer_.size() } {} bool decode(gpuinfo& info) { bool success = true; uint64_t raw_core_features {}; uint64_t raw_thread_features {}; while (size_ > 0) { auto p = next(success); if (!success) { return false; } prop_id_t id = p.first; uint64_t value = p.second; switch (id) { case prop_id_t::product_id: info.gpu_id = value; break; case prop_id_t::l2_log2_cache_size: info.num_l2_bytes = 1UL << value; break; case prop_id_t::l2_num_l2_slices: info.num_l2_slices = value; break; case prop_id_t::raw_l2_features: /* log2(bus width) stored in top 8 bits of register. */ info.num_bus_bits = 1UL << ((value >> 24) & 0xFF); break; case prop_id_t::raw_core_features: raw_core_features = value; break; case prop_id_t::raw_thread_features: raw_thread_features = value; break; case prop_id_t::coherency_group_0: case prop_id_t::coherency_group_1: case prop_id_t::coherency_group_2: case prop_id_t::coherency_group_3: info.num_shader_cores += __builtin_popcount(value); break; default: break; } } info.num_exec_engines = get_num_exec_engines( info.gpu_id, info.num_shader_cores, raw_core_features, raw_thread_features); if (!info.num_exec_engines) { return false; } info.num_fp32_fmas_per_cy = get_num_fp32_fmas( info.gpu_id, info.num_shader_cores, raw_core_features, raw_thread_features); info.num_fp16_fmas_per_cy = info.num_fp32_fmas_per_cy * 2; info.num_texels_per_cy = get_num_texels( info.gpu_id, info.num_shader_cores, raw_core_features, raw_thread_features); info.num_pixels_per_cy = get_num_pixels( info.gpu_id, info.num_shader_cores, raw_core_features, raw_thread_features); return true; } private: /** Property id type. */ using prop_id_t = kbase_post_r21::get_gpuprops_t::gpuprop_code; /** Property size type. */ using prop_size_t = kbase_post_r21::get_gpuprops_t::gpuprop_size; static std::pair to_prop_metadata(uint32_t v) { /* Property id/size encoding is: * +--------+----------+ * | 31 2 | 1 0 | * +--------+----------+ * | PropId | PropSize | * +--------+----------+ */ static unsigned int id_shift { 2 }; static unsigned int size_mask { 0b11 }; return { static_cast(v >> id_shift), static_cast(v & size_mask) }; } std::pair next(bool& success) { success = true; auto p = to_prop_metadata(read_bytes(success)); if (success) { prop_id_t id = p.first; prop_size_t size = p.second; switch (size) { case prop_size_t::uint8: return { id, read_bytes(success) }; case prop_size_t::uint16: return { id, read_bytes(success) }; case prop_size_t::uint32: return { id, read_bytes(success) }; case prop_size_t::uint64: return { id, read_bytes(success) }; } } return {}; } template T read_bytes(bool& success) { // Check we have enough bytes in the buffer if (size_ < sizeof(T)) { success = false; return 0; } T ret {}; for (size_t b = 0; b < sizeof(T); b++) { ret |= static_cast(static_cast(data_[b]) << (8 * b)); } data_ += sizeof(T); size_ -= sizeof(T); return ret; } std::vector const buffer_; unsigned char const *data_; std::size_t size_; }; /* See header for documentation */ std::unique_ptr instance::create( const uint32_t id ) { std::string device_path("/dev/mali" + std::to_string(id)); // Open the kernel driver device node const int fd = ::open(device_path.c_str(), O_RDONLY); if (fd < 0) { return nullptr; } // Check that it is a character device struct stat s {}; const int fs_result = fstat(fd, &s); if ((fs_result < 0) || (S_ISCHR(s.st_mode) == 0)) { ::close(fd); return nullptr; } // Create the instance auto result = std::unique_ptr(new instance(fd)); if (!result || !result->valid_) { return nullptr; } return result; } /* See header for documentation */ const gpuinfo& instance::get_info() const { return info_; }; /* See header for documentation */ instance::~instance() { ::close(fd_); } /* See header for documentation */ instance::instance(int fd): fd_(fd) { if (!check_version()) { valid_ = false; return; } if (!set_flags()) { valid_ = false; return; } if (!init_props()) { valid_ = false; return; } } static bool is_supported(unsigned int major, unsigned int minor) { return (major > 10) || ((major == 10) && (minor >= 2)); } /* See header for documentation */ bool instance::check_version() { // Probe pre-r21 JM kernel // Must be first in the list because CSF reuses an old IOCTL ID iface_ = iface_type::pre_r21; kbase_pre_r21::version_check_t pre_r21 {}; pre_r21.header.id = kbase_pre_r21::header_id::version_check; ::ioctl(fd_, kbase_pre_r21::version_check, &pre_r21); // If this is non-zero this must be pre-r21 driver, so check version if (pre_r21.is_set()) { return is_supported(pre_r21.major, pre_r21.minor); } // Probe r21+ JM kernel iface_ = iface_type::post_r21; kbase_post_r21::version_check_t post_r21 {}; ::ioctl(fd_, kbase_post_r21::version_check_jm, &post_r21); // If this is non-zero this must be post-r21 JM driver, so check version if (post_r21.is_set()) { return is_supported(post_r21.major, post_r21.minor); } // Probe r21+ CSF kernel ::ioctl(fd_, kbase_post_r21::version_check_csf, &post_r21); // If this is any non-zero value this is a valid CSF GPU return post_r21.is_set(); } /** Call set flags ioctl. */ bool instance::set_flags() { static constexpr auto system_monitor_flag_submit_disabled_bit = 1; static constexpr auto system_monitor_flag = 1U << system_monitor_flag_submit_disabled_bit; // Clear errno errno = 0; if (iface_ == iface_type::pre_r21) { kbase_pre_r21::set_flags_t flags {}; flags.header.id = kbase_pre_r21::header_id::set_flags; flags.create_flags = system_monitor_flag; ::ioctl(fd_, kbase_pre_r21::set_flags, &flags); } else { kbase_post_r21::set_flags_t flags { system_monitor_flag }; ::ioctl(fd_, kbase_post_r21::set_flags, &flags); } // Mali driver will fail if reinitialized, but it's benign // TODO: Does this ever happen with this usage pattern return errno == 0 || errno == EINVAL || errno == EPERM; } /* See header for documentation */ bool instance::init_props() { bool success; if (iface_ == iface_type::pre_r21) { success = init_props_pre_r21(); } else { success = init_props_post_r21(); } // Perform some common cleanup on the data if (!success) { return false; } info_.num_l2_bytes *= info_.num_l2_slices; info_.gpu_name = get_gpu_name(info_.gpu_id, info_.num_shader_cores); info_.architecture_name = get_architecture_name(info_.gpu_id); info_.gpu_id = get_gpu_id(info_.gpu_id); return true; } /* See header for documentation */ bool instance::init_props_pre_r21() { int error = 0; kbase_pre_r21::uk_gpuprops_t props {}; props.header.id = kbase_pre_r21::header_id::get_props; errno = 0; ::ioctl(fd_, kbase_pre_r21::get_gpuprops, &props); if (errno) { return false; } info_.gpu_id = props.props.core_props.product_id; info_.num_l2_bytes = 1UL << props.props.l2_props.log2_cache_size; info_.num_l2_slices = props.props.l2_props.num_l2_slices; info_.num_bus_bits = 1UL << (props.props.raw_props.l2_features >> 24); info_.num_shader_cores = 0; for (uint32_t i = 0; i < props.props.coherency_info.num_core_groups; i++) { info_.num_shader_cores += __builtin_popcount(props.props.coherency_info.group[i].core_mask); } info_.num_exec_engines = get_num_exec_engines( info_.gpu_id, info_.num_shader_cores, 0, 0); info_.num_fp32_fmas_per_cy = get_num_fp32_fmas( info_.gpu_id, info_.num_shader_cores, 0, 0); info_.num_fp16_fmas_per_cy = info_.num_fp32_fmas_per_cy * 2; info_.num_texels_per_cy = get_num_texels( info_.gpu_id, info_.num_shader_cores, 0, 0); info_.num_pixels_per_cy = get_num_pixels( info_.gpu_id, info_.num_shader_cores, 0, 0); return true; } /* See header for documentation */ bool instance::init_props_post_r21() { errno = 0; kbase_post_r21::get_gpuprops_t get_props = {}; int size = ::ioctl(fd_, kbase_post_r21::get_gpuprops, &get_props); if (errno) { return false; } std::vector buffer(static_cast(size)); get_props.size = static_cast(size); get_props.buffer.reset(buffer.data()); ::ioctl(fd_, kbase_post_r21::get_gpuprops, &get_props); if (errno) { return false; } prop_decoder decoder { buffer }; return decoder.decode(info_); } }