Files
UnrealEngine/Engine/Source/ThirdParty/ARM/ArmlibGPUInfo/Private/libgpuinfo.cpp
2025-05-18 13:04:45 +08:00

1134 lines
36 KiB
C++

/*
* Copyright (c) 2021-2023 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include <array>
#include <cerrno>
#include <cstdint>
#include <functional>
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include "libgpuinfo.hpp"
namespace libgpuinfo {
/**
 * One row of the PRODUCT_VERSIONS lookup table.
 *
 * A row matches a GPU when (gpu_id & mask) == id. Lookups that also take a
 * core count additionally require core_count >= min_cores.
 */
struct product_entry {
/** Expected value of the GPU ID after masking. */
uint32_t id;
/** Mask applied to the raw GPU ID before it is compared with id. */
uint32_t mask;
/** Smallest shader core count for which this row applies. */
uint32_t min_cores;
/** Product name handed back by get_gpu_name(). */
const char* name;
/** Architecture name handed back by get_architecture_name(). */
const char* architecture;
/** FP32 FMA count per execution engine; get_num_fp32_fmas() multiplies it by the engine count. */
uint32_t fp32_fmas_per_engine;
/** Callback (core_count, core_features, thread_features) giving the texel count used by get_num_texels(). */
std::function<uint32_t(int, uint32_t, uint32_t)> get_num_texels;
/** Callback (core_count, core_features, thread_features) giving the pixel count used by get_num_pixels(). */
std::function<uint32_t(int, uint32_t, uint32_t)> get_num_pixels;
/** Callback (core_count, core_features, thread_features) giving the execution engine count. */
std::function<uint32_t(int, uint32_t, uint32_t)> get_num_exec_engines;
};
/** ID mask used by the Midgard (Mali-T) rows of PRODUCT_VERSIONS: keeps the low 16 bits. */
static const uint32_t MASK_OLD { 0xFFFF };
/** ID mask used by the Bifrost and Valhall rows of PRODUCT_VERSIONS: keeps bits 15:12 and 3:0. */
static const uint32_t MASK_NEW { 0xF00F };
/** Table callback that always reports 1, independent of the GPU configuration. */
static uint32_t get_num_1(int core_count, uint32_t core_features, uint32_t thread_features)
{
    // The signature is fixed by product_entry; none of the inputs are needed.
    static_cast<void>(core_count);
    static_cast<void>(core_features);
    static_cast<void>(thread_features);
    return 1U;
}
/** Table callback that always reports 2, independent of the GPU configuration. */
static uint32_t get_num_2(int core_count, uint32_t core_features, uint32_t thread_features)
{
    // The signature is fixed by product_entry; none of the inputs are needed.
    static_cast<void>(core_count);
    static_cast<void>(core_features);
    static_cast<void>(thread_features);
    return 2U;
}
/** Table callback that always reports 3, independent of the GPU configuration. */
static uint32_t get_num_3(int core_count, uint32_t core_features, uint32_t thread_features)
{
    // The signature is fixed by product_entry; none of the inputs are needed.
    static_cast<void>(core_count);
    static_cast<void>(core_features);
    static_cast<void>(thread_features);
    return 3U;
}
/** Table callback that always reports 4, independent of the GPU configuration. */
static uint32_t get_num_4(int core_count, uint32_t core_features, uint32_t thread_features)
{
    // The signature is fixed by product_entry; none of the inputs are needed.
    static_cast<void>(core_count);
    static_cast<void>(core_features);
    static_cast<void>(thread_features);
    return 4U;
}
/** Table callback that always reports 8, independent of the GPU configuration. */
static uint32_t get_num_8(int core_count, uint32_t core_features, uint32_t thread_features)
{
    // The signature is fixed by product_entry; none of the inputs are needed.
    static_cast<void>(core_count);
    static_cast<void>(core_features);
    static_cast<void>(thread_features);
    return 8U;
}
/**
 * Execution engine count callback used by the Mali-G31 table row.
 *
 * Reports 1 for a single-core part whose low 16 bits of thread_features equal
 * 0x2000, and 2 for every other configuration. core_features is not consulted.
 */
static uint32_t get_num_ee_g31(int core_count, uint32_t core_features, uint32_t thread_features)
{
    static_cast<void>(core_features);

    const bool single_core = (core_count == 1);
    const bool low_half_is_0x2000 = ((thread_features & 0xFFFF) == 0x2000);
    return (single_core && low_half_is_0x2000) ? 1U : 2U;
}
/**
 * Execution engine count callback used by the Mali-G51 table row.
 *
 * Reports 1 for a single-core part whose low 16 bits of thread_features equal
 * 0x2000, and 3 for every other configuration. core_features is not consulted.
 */
static uint32_t get_num_ee_g51(int core_count, uint32_t core_features, uint32_t thread_features)
{
    static_cast<void>(core_features);

    const bool single_core = (core_count == 1);
    const bool low_half_is_0x2000 = ((thread_features & 0xFFFF) == 0x2000);
    return (single_core && low_half_is_0x2000) ? 1U : 3U;
}
/**
 * Execution engine count callback used by the Mali-G52 table row.
 *
 * The count is read directly from the low four bits of core_features; the
 * other two inputs are not consulted.
 */
static uint32_t get_num_ee_g52(int core_count, uint32_t core_features, uint32_t thread_features)
{
    static_cast<void>(core_count);
    static_cast<void>(thread_features);

    constexpr uint32_t low_nibble = 0xF;
    const uint32_t engine_count = core_features & low_nibble;
    return engine_count;
}
/**
 * Execution engine count callback shared by the Mali-G310 and Mali-G510 rows.
 *
 * Reports 1 when the low four bits of core_features are 0 or 1, otherwise 2.
 * core_count and thread_features are not consulted.
 */
static uint32_t get_num_ee_g310_g510(int core_count, uint32_t core_features, uint32_t thread_features)
{
    static_cast<void>(core_count);
    static_cast<void>(thread_features);

    const uint32_t variant = core_features & 0xF;
    return (variant > 1) ? 2U : 1U;
}
/**
 * Table of known products, searched front to back by the lookup functions
 * in this file; the first matching row wins.
 *
 * Row order is significant: several rows share an ID (0xb002 / 0xb003) and are
 * told apart only by the minimum core count, so the rows with the larger
 * minimum (10, then 7, then 1) must come first.
 */
const std::array<product_entry, 30> PRODUCT_VERSIONS {{
// ID, ID Mask, Min cores, Name, Architecture, FMAs per engine, Texels, Pixels, Engines
product_entry { 0x6956, MASK_OLD, 1, "Mali-T600", "Midgard", 4, get_num_1, get_num_1, get_num_2 },
product_entry { 0x0620, MASK_OLD, 1, "Mali-T620", "Midgard", 4, get_num_1, get_num_1, get_num_2 },
product_entry { 0x0720, MASK_OLD, 1, "Mali-T720", "Midgard", 4, get_num_1, get_num_1, get_num_1 },
product_entry { 0x0750, MASK_OLD, 1, "Mali-T760", "Midgard", 4, get_num_1, get_num_1, get_num_2 },
product_entry { 0x0820, MASK_OLD, 1, "Mali-T820", "Midgard", 4, get_num_1, get_num_1, get_num_1 },
product_entry { 0x0830, MASK_OLD, 1, "Mali-T830", "Midgard", 4, get_num_1, get_num_1, get_num_2 },
product_entry { 0x0860, MASK_OLD, 1, "Mali-T860", "Midgard", 4, get_num_1, get_num_1, get_num_2 },
product_entry { 0x0880, MASK_OLD, 1, "Mali-T880", "Midgard", 4, get_num_1, get_num_1, get_num_3 },
product_entry { 0x6000, MASK_NEW, 1, "Mali-G71", "Bifrost", 4, get_num_1, get_num_1, get_num_3 },
product_entry { 0x6001, MASK_NEW, 1, "Mali-G72", "Bifrost", 4, get_num_1, get_num_1, get_num_3 },
product_entry { 0x7000, MASK_NEW, 1, "Mali-G51", "Bifrost", 4, get_num_2, get_num_2, get_num_ee_g51 },
product_entry { 0x7001, MASK_NEW, 1, "Mali-G76", "Bifrost", 8, get_num_2, get_num_2, get_num_3 },
product_entry { 0x7002, MASK_NEW, 1, "Mali-G52", "Bifrost", 8, get_num_2, get_num_2, get_num_ee_g52 },
product_entry { 0x7003, MASK_NEW, 1, "Mali-G31", "Bifrost", 4, get_num_2, get_num_2, get_num_ee_g31 },
product_entry { 0x9000, MASK_NEW, 1, "Mali-G77", "Valhall", 16, get_num_4, get_num_2, get_num_2 },
product_entry { 0x9001, MASK_NEW, 1, "Mali-G57", "Valhall", 16, get_num_4, get_num_2, get_num_2 },
product_entry { 0x9003, MASK_NEW, 1, "Mali-G57", "Valhall", 16, get_num_4, get_num_2, get_num_2 },
product_entry { 0x9004, MASK_NEW, 1, "Mali-G68", "Valhall", 16, get_num_4, get_num_2, get_num_2 },
product_entry { 0x9002, MASK_NEW, 1, "Mali-G78", "Valhall", 16, get_num_4, get_num_2, get_num_2 },
product_entry { 0x9005, MASK_NEW, 1, "Mali-G78AE", "Valhall", 16, get_num_4, get_num_2, get_num_2 },
product_entry { 0xa002, MASK_NEW, 1, "Mali-G710", "Valhall", 32, get_num_8, get_num_4, get_num_2 },
product_entry { 0xa007, MASK_NEW, 1, "Mali-G610", "Valhall", 32, get_num_8, get_num_4, get_num_2 },
// TODO: Extract FMA, pixel, and texel settings
product_entry { 0xa003, MASK_NEW, 1, "Mali-G510", "Valhall", 32, get_num_8, get_num_4, get_num_ee_g310_g510 },
// TODO: Extract FMA, pixel, and texel settings
product_entry { 0xa004, MASK_NEW, 1, "Mali-G310", "Valhall", 32, get_num_8, get_num_4, get_num_ee_g310_g510 },
// The rows below share IDs; the minimum core count selects the product name.
product_entry { 0xb002, MASK_NEW, 10, "Immortalis-G715", "Valhall", 64, get_num_8, get_num_4, get_num_2 },
product_entry { 0xb003, MASK_NEW, 10, "Immortalis-G715", "Valhall", 64, get_num_8, get_num_4, get_num_2 },
product_entry { 0xb002, MASK_NEW, 7, "Mali-G715", "Valhall", 64, get_num_8, get_num_4, get_num_2 },
product_entry { 0xb003, MASK_NEW, 7, "Mali-G715", "Valhall", 64, get_num_8, get_num_4, get_num_2 },
product_entry { 0xb002, MASK_NEW, 1, "Mali-G615", "Valhall", 64, get_num_8, get_num_4, get_num_2 },
product_entry { 0xb003, MASK_NEW, 1, "Mali-G615", "Valhall", 64, get_num_8, get_num_4, get_num_2 },
}};
/**
 * Reduce a raw GPU ID to the masked product ID used in PRODUCT_VERSIONS.
 *
 * @param gpu_id Raw GPU ID value.
 * @return The ID masked with the first matching row's mask, or gpu_id
 *         unchanged when no row matches.
 */
uint32_t get_gpu_id(uint32_t gpu_id)
{
    for (std::size_t row = 0; row < PRODUCT_VERSIONS.size(); ++row)
    {
        const uint32_t masked_id = gpu_id & PRODUCT_VERSIONS[row].mask;
        if (masked_id == PRODUCT_VERSIONS[row].id)
        {
            return masked_id;
        }
    }

    // Unknown product: hand the raw value back untouched.
    return gpu_id;
}
/**
 * Look up the product name for a GPU.
 *
 * @param gpu_id     Raw GPU ID value.
 * @param core_count Number of shader cores; selects between rows sharing an ID.
 * @return Name of the first row whose ID matches and whose minimum core count
 *         is satisfied, or "Unknown gpu_id" when there is none.
 */
const char* get_gpu_name(uint32_t gpu_id, int core_count)
{
    for (auto it = PRODUCT_VERSIONS.begin(); it != PRODUCT_VERSIONS.end(); ++it)
    {
        if ((gpu_id & it->mask) != it->id)
        {
            continue;
        }
        if (core_count >= it->min_cores)
        {
            return it->name;
        }
    }
    return "Unknown gpu_id";
}
const char* get_architecture_name(
uint32_t gpu_id
) {
for (const auto& entry : PRODUCT_VERSIONS)
{
if((gpu_id & entry.mask) == entry.id)
{
return entry.architecture;
}
}
return "Unknown gpu_id";
}
/**
 * Compute the number of execution engines for a GPU.
 *
 * @param gpu_id          Raw GPU ID value.
 * @param core_count      Number of shader cores.
 * @param core_features   Raw core features value, forwarded to the row callback.
 * @param thread_features Raw thread features value, forwarded to the row callback.
 * @return Result of the matching row's engine callback, or 0 for an unknown GPU.
 */
int get_num_exec_engines(
    uint32_t gpu_id,
    int core_count,
    uint32_t core_features,
    uint32_t thread_features)
{
    for (auto it = PRODUCT_VERSIONS.begin(); it != PRODUCT_VERSIONS.end(); ++it)
    {
        const bool id_matches = ((gpu_id & it->mask) == it->id);
        if (id_matches && (core_count >= it->min_cores))
        {
            return it->get_num_exec_engines(core_count, core_features, thread_features);
        }
    }
    return 0;
}
const uint32_t get_num_fp32_fmas(
uint32_t gpu_id,
int core_count,
uint32_t core_features,
uint32_t thread_features
) {
for (const auto& entry : PRODUCT_VERSIONS)
{
if(((gpu_id & entry.mask) == entry.id) &&
(core_count >= entry.min_cores))
{
return entry.fp32_fmas_per_engine * entry.get_num_exec_engines(core_count, core_features, thread_features);
}
}
return 0;
}
/**
 * Compute the texel count for a GPU via the matching row's texel callback.
 *
 * @param gpu_id          Raw GPU ID value.
 * @param core_count      Number of shader cores.
 * @param core_features   Raw core features value, forwarded to the row callback.
 * @param thread_features Raw thread features value, forwarded to the row callback.
 * @return Callback result for the first matching row, or 0 for an unknown GPU.
 */
const uint32_t get_num_texels(
    uint32_t gpu_id,
    int core_count,
    uint32_t core_features,
    uint32_t thread_features)
{
    for (auto it = PRODUCT_VERSIONS.begin(); it != PRODUCT_VERSIONS.end(); ++it)
    {
        const bool id_matches = ((gpu_id & it->mask) == it->id);
        const bool enough_cores = (core_count >= it->min_cores);
        if (id_matches && enough_cores)
        {
            return it->get_num_texels(core_count, core_features, thread_features);
        }
    }
    return 0;
}
const uint32_t get_num_pixels(
uint32_t gpu_id,
int core_count,
uint32_t core_features,
uint32_t thread_features
) {
for (const auto& entry : PRODUCT_VERSIONS)
{
if(((gpu_id & entry.mask) == entry.id) &&
(core_count >= entry.min_cores))
{
return entry.get_num_pixels(core_count, core_features, thread_features);
}
}
return 0;
}
/** Kbase Pre R21 ioctl interface. */
namespace kbase_pre_r21 {
/**
 * Function identifiers written into uk_header::id before issuing a pre-r21
 * ioctl on the mali device node.
 *
 * get_props (526 == 0x20e) and set_flags (530 == 0x212) carry the same numbers
 * that command_type encodes into the matching ioctl request codes.
 */
enum class header_id : uint32_t {
/** Version check. */
version_check = 0,
/** Base Context Create Kernel Flags. */
create_kernel_flags = 2,
/** Kbase Func Get Props. */
get_props = 526,
/** Kbase Func Set Flags. */
set_flags = 530,
};
/**
 * Message header placed at the start of every pre-r21 ioctl argument struct.
 *
 * This is a union: the caller stores the function identifier in id, and the
 * same storage is described as holding the return code afterwards.
 */
union uk_header {
/** Number identifying the called UK function. */
header_id id;
/** The return code of the called UK function. */
uint32_t ret;
/** Dummy to ensure type has 64-bit alignment */
uint64_t sizer;
};
/**
 * Argument block for the pre-r21 version check ioctl.
 *
 * The caller zero-initialises it, fills in the header, and inspects
 * major/minor after the call.
 */
struct version_check_t {
    /** UK header */
    uk_header header;
    /** Major version number */
    uint16_t major;
    /** Minor version number */
    uint16_t minor;

    /** @return true when either version field is non-zero. */
    bool is_set() const
    {
        return (major != 0) || (minor != 0);
    }
};
/**
 * Argument block for the pre-r21 set flags ioctl.
 * The explicit padding member keeps the struct a multiple of 8 bytes.
 */
struct set_flags_t {
/** UK header */
uk_header header;
/** Create flags */
uint32_t create_flags;
/** Padding */
uint32_t padding;
};
/** Base GPU Num Texture Features Registers: length of core::texture_features. */
static constexpr const uint32_t base_gpu_num_texture_features_registers = 3;
/** Base Max Coherent Groups: length of coherent_group_info::group. */
static constexpr const uint32_t base_max_coherent_groups = 16;
/** GPU Max Job Slots: length of raw::js_features. */
static constexpr const uint32_t gpu_max_job_slots = 16;
/**
 * Kbase UK GPU props.
 *
 * Argument block for the pre-r21 get_gpuprops ioctl: a uk_header followed by
 * the property structure. The member order and explicit padding below define
 * the binary layout passed to the ioctl and must not be changed.
 */
struct uk_gpuprops_t {
/**
 * IOCTL parameters to probe GPU properties
 *
 * NOTE: the raw_props member in this data structure contains the register
 * values from which the value of the other members are derived. The derived
 * members exist to allow for efficient access and/or shielding the details
 * of the layout of the registers.
 *
 */
struct gpu_props {
/** Core. */
struct core {
/** Product specific value. */
uint32_t product_id;
/**
 * Status of the GPU release.
 * No defined values, but starts at 0 and increases by one for each
 * release status (alpha, beta, EAC, etc.).
 * 4 bit values (0-15).
 */
uint16_t version_status;
/**
 * Minor release number of the GPU. "P" part of an "RnPn" release number.
 * 8 bit values (0-255).
 */
uint16_t minor_revision;
/**
 * Major release number of the GPU. "R" part of an "RnPn" release number.
 * 4 bit values (0-15).
 */
uint16_t major_revision;
/** Padding. */
uint16_t padding;
/**
 * This property is deprecated since it has not contained the real current
 * value of GPU clock speed. It is kept here only for backwards compatibility.
 * For the new ioctl interface, it is ignored and is treated as a padding
 * to keep the structure of the same size and retain the placement of its
 * members.
 */
uint32_t gpu_speed_mhz;
/**
 * @usecase GPU clock max speed is required for computing best case
 * in tasks such as job scheduling and irq_throttling. (It is not specified
 * in the Midgard Architecture).
 * Also, GPU clock max speed is used for OpenCL's clGetDeviceInfo() function.
 */
uint32_t gpu_freq_khz_max;
/**
 * @usecase GPU clock min speed is required for computing worst case
 * in tasks such as job scheduling and irq_throttling. (It is not specified
 * in the Midgard Architecture).
 */
uint32_t gpu_freq_khz_min;
/** Size of the shader program counter, in bits. */
uint32_t log2_program_counter_size;
/**
 * TEXTURE_FEATURES_x registers, as exposed by the GPU. This is a
 * bitpattern where a set bit indicates that the format is supported.
 *
 * Before using a texture format, it is recommended that the corresponding
 * bit be checked.
 */
uint32_t texture_features[base_gpu_num_texture_features_registers];
/**
 * Theoretical maximum memory available to the GPU. It is unlikely that a
 * client will be able to allocate all of this memory for their own
 * purposes, but this at least provides an upper bound on the memory
 * available to the GPU.
 *
 * This is required for OpenCL's clGetDeviceInfo() call when
 * CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The
 * client will not be expecting to allocate anywhere near this value.
 */
uint64_t gpu_available_memory_size;
};
/**
 * More information is possible - but associativity and bus width are not
 * required by upper-level apis.
 */
struct l2_cache {
/** Log2 Line Size. */
uint8_t log2_line_size;
/** Log2 Cache Size. */
uint8_t log2_cache_size;
/** Num L2 Slices. */
uint8_t num_l2_slices;
/** Padding bytes. */
uint8_t padding[5];
};
/** Tiler. */
struct tiler {
/** Max is 4*2^15 */
uint32_t bin_size_bytes;
/** Max is 2^15 */
uint32_t max_active_levels;
};
/** GPU threading system details. */
struct thread {
/** Max. number of threads per core */
uint32_t max_threads;
/** Max. number of threads per workgroup */
uint32_t max_workgroup_size;
/** Max. number of threads that can synchronize on a simple barrier */
uint32_t max_barrier_size;
/** Total size [1..65535] of the register file available per core. */
uint16_t max_registers;
/** Max. tasks [1..255] which may be sent to a core before it becomes blocked. */
uint8_t max_task_queue;
/** Max. allowed value [1..15] of the Thread Group Split field. */
uint8_t max_thread_group_split;
/** 0 = Not specified, 1 = Silicon, 2 = FPGA, 3 = SW Model/Emulation */
uint8_t impl_tech;
/** Padding bytes. */
uint8_t padding[7];
};
/**
 * A complete description of the GPU's Hardware Configuration Discovery
 * registers.
 *
 * The information is presented inefficiently for access. For frequent access,
 * the values should be better expressed in an unpacked form in the
 * base_gpu_props structure.
 *
 * @usecase The raw properties in @ref gpu_raw_gpu_props are necessary to
 * allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device
 * behaving differently?". In this case, all information about the
 * configuration is potentially useful, but it does not need to be processed
 * by the driver. Instead, the raw registers can be processed by the Mali
 * Tools software on the host PC.
 */
struct raw {
/** Shader Present. */
uint64_t shader_present;
/** Tiler Present. */
uint64_t tiler_present;
/** L2 Present. */
uint64_t l2_present;
/** Unused 1. */
uint64_t unused_1;
/** L2 Features. */
uint32_t l2_features;
/** Suspend Size. */
uint32_t suspend_size;
/** Mem Features. */
uint32_t mem_features;
/** Mmu Features. */
uint32_t mmu_features;
/** As Present. */
uint32_t as_present;
/** Js Present. */
uint32_t js_present;
/** Js Features. */
uint32_t js_features[gpu_max_job_slots];
/** Tiler Features. */
uint32_t tiler_features;
/** Texture Features. */
uint32_t texture_features[3];
/** GPU ID. */
uint32_t gpu_id;
/** Thread Max Threads. */
uint32_t thread_max_threads;
/** Thread Max Workgroup Size. */
uint32_t thread_max_workgroup_size;
/** Thread Max Barrier Size. */
uint32_t thread_max_barrier_size;
/** Thread Features. */
uint32_t thread_features;
/**
 * Coherency Mode.
 * Note: This is the _selected_ coherency mode rather than the
 * available modes as exposed in the coherency_features register.
 */
uint32_t coherency_mode;
};
/**
 * Coherency group information
 *
 * Note that the sizes of the members could be reduced. However, the \c group
 * member might be 8-byte aligned to ensure the u64 core_mask is 8-byte
 * aligned, thus leading to wastage if the other members sizes were reduced.
 *
 * The groups are sorted by core mask. The core masks are non-repeating and do
 * not intersect.
 */
struct coherent_group_info {
/**
 * Descriptor for a coherent group
 *
 * \c core_mask exposes all cores in that coherent group, and \c num_cores
 * provides a cached population-count for that mask.
 *
 * @note Whilst all cores are exposed in the mask, not all may be available to
 * the application, depending on the Kernel Power policy.
 *
 * @note if u64s must be 8-byte aligned, then this structure has 32-bits of
 * wastage.
 */
struct coherent_group {
/** Core restriction mask required for the group */
uint64_t core_mask;
/** Number of cores in the group */
uint16_t num_cores;
/** Padding bytes. */
uint16_t padding[3];
};
/** Num Groups: the number of valid elements in group[]. */
uint32_t num_groups;
/**
 * Number of core groups (coherent or not) in the GPU. Equivalent to the number of
 * L2 Caches.
 * The GPU Counter dumping writes 2048 bytes per core group, regardless of whether
 * the core groups are coherent or not. Hence this member is needed to calculate
 * how much memory is required for dumping.
 * @note Do not use it to work out how many valid elements are in the group[]
 * member. Use num_groups instead.
 */
uint32_t num_core_groups;
/** Coherency features of the memory, accessed by @ref gpu_mem_features methods. */
uint32_t coherency;
/** Padding. */
uint32_t padding;
/** Descriptors of coherent groups */
coherent_group group[base_max_coherent_groups];
};
/** Core Props. */
core core_props;
/** L2 Props. */
l2_cache l2_props;
/** Unused to keep for backwards compatibility. */
uint64_t unused;
/** Tiler Props. */
tiler tiler_props;
/** Thread Props. */
thread thread_props;
/** This member is large, likely to be 128 bytes. */
raw raw_props;
/** This must be last member of the structure. */
coherent_group_info coherency_info;
};
/** Header. */
uk_header header;
/** Props. */
gpu_props props;
};
/** ioctl type ("magic") number passed as the first argument of the _IOWR request encodings below. */
constexpr auto iface_number = 0x80;
/**
 * Commands describing kbase_pre_r21 ioctl interface.
 *
 * Each value is a complete ioctl request code built with _IOWR from the
 * interface number, the command number and the argument struct type.
 */
enum command_type {
/** Check version compatibility between JM kernel and userspace. */
version_check = _IOWR(iface_number, 0x0, version_check_t),
/** Set kernel context creation flags (0x212 == header_id::set_flags). */
set_flags = _IOWR(iface_number, 0x212, set_flags_t),
/** Get GPU properties (0x20e == header_id::get_props). */
get_gpuprops = _IOWR(iface_number, 0x20e, uk_gpuprops_t),
};
}
/** Kbase Post R21 ioctl interface. */
namespace kbase_post_r21 {
/**
 * Pointer stored as a 64-bit integer, so the containing struct has the same
 * layout regardless of the native pointer width.
 *
 * @tparam value_t Type of the pointed-to object.
 */
template <typename value_t>
class pointer64 {
public:
    /** @return Pointer to the object. */
    value_t* get() const {
        const auto address = static_cast<uintptr_t>(value);
        return reinterpret_cast<value_t*>(address);
    }

    /**
     * Set pointer value.
     *
     * @param ptr The new pointer value.
     */
    void reset(value_t* ptr) {
        const auto address = reinterpret_cast<uintptr_t>(ptr);
        value = static_cast<uint64_t>(address);
    }

private:
    /** Pointer value as uint64_t; zero represents a null pointer. */
    uint64_t value { 0 };
};
/**
 * Argument block for the r21+ version check ioctls (JM and CSF).
 *
 * The caller zero-initialises it and inspects major/minor after the call.
 */
struct version_check_t {
    /** Major version number. */
    uint16_t major;
    /** Minor version number */
    uint16_t minor;

    /** @return true when either version field is non-zero. */
    bool is_set() const
    {
        return (major != 0) || (minor != 0);
    }
};
/**
 * Set kernel context creation flags.
 * Argument block for the r21+ set_flags ioctl; it holds a single 32-bit field.
 */
struct set_flags_t {
/** kernel context creation flags. */
uint32_t create_flags;
};
/**
 * Argument block for the r21+ get_gpuprops ioctl.
 *
 * The ioctl will return the number of bytes stored into buffer or an error
 * on failure (e.g. size is too small). If size is specified as 0 then no
 * data will be written but the return value will be the number of bytes needed
 * for all the properties.
 *
 * flags may be used in the future to request a different format for the
 * buffer. With flags == 0 the following format is used.
 *
 * The buffer will be filled with pairs of values, a __u32 key identifying the
 * property followed by the value. The size of the value is identified using
 * the bottom bits of the key. The value then immediately follows the key and
 * is tightly packed (there is no padding). All keys and values are
 * little-endian.
 *
 * 00 = __u8
 * 01 = __u16
 * 10 = __u32
 * 11 = __u64
 */
struct get_gpuprops_t {
/** GPU property size, as encoded in the two bottom bits of a key. */
enum class gpuprop_size : uint8_t {
/** Property type is uint8_t. */
uint8 = 0x0,
/** Property type is uint16_t. */
uint16 = 0x1,
/** Property type is uint32_t. */
uint32 = 0x2,
/** Property type is uint64_t. */
uint64 = 0x3
};
/**
 * GPU properties codes.
 *
 * NOTE: the underlying type is 8 bits wide, and prop_decoder casts the key
 * shifted right by two into this type, so only the low 8 bits of a
 * property id survive that conversion.
 */
enum class gpuprop_code : uint8_t {
/** Product id. */
product_id = 1,
/** L2 log2 line size. */
l2_log2_line_size = 13,
/** L2 log2 cache size. */
l2_log2_cache_size = 14,
/** L2 num l2 slices. */
l2_num_l2_slices = 15,
/** Max threads. */
max_threads = 18,
/** Max registers. */
max_registers = 21,
/** Raw l2 features. */
raw_l2_features = 29,
/** Raw core features. */
raw_core_features = 30,
/** Raw thread max threads. */
raw_thread_max_threads = 56,
/** Raw thread max workgroup size. */
raw_thread_max_workgroup_size = 57,
/** Raw thread max barrier size. */
raw_thread_max_barrier_size = 58,
/** Raw thread features. */
raw_thread_features = 59,
/** Raw coherency mode. */
raw_coherency_mode = 60,
/** Coherency num groups. */
coherency_num_groups = 61,
/** Coherency num core groups. */
coherency_num_core_groups = 62,
/** Coherency coherency. */
coherency_coherency = 63,
/** Coherency group 0. */
coherency_group_0 = 64,
/** Coherency group 1. */
coherency_group_1 = 65,
/** Coherency group 2. */
coherency_group_2 = 66,
/** Coherency group 3. */
coherency_group_3 = 67,
/** Num exec engines. */
num_exec_engines = 82
};
/** Pointer to the buffer to store properties into. */
pointer64<uint8_t> buffer;
/** Size of the buffer. */
uint32_t size;
/** Flags - must be zero for now. */
uint32_t flags;
};
/** ioctl type ("magic") number passed as the first argument of the request encodings below. */
constexpr auto iface_number = 0x80;
/**
 * Commands describing kbase ioctl interface.
 *
 * Each value is a complete ioctl request code. The version checks are encoded
 * with _IOWR; set_flags and get_gpuprops are encoded with _IOW.
 */
enum command_type {
/** Check version compatibility between JM kernel and userspace. */
version_check_jm = _IOWR(iface_number, 0x0, version_check_t),
/** Check version compatibility between CSF kernel and userspace. */
version_check_csf = _IOWR(iface_number, 0x34, version_check_t),
/** Set kernel context creation flags. */
set_flags = _IOW(iface_number, 0x1, set_flags_t),
/** Get GPU properties. */
get_gpuprops = _IOW(iface_number, 0x3, get_gpuprops_t),
};
}
class prop_decoder {
public:
prop_decoder(std::vector<unsigned char> buffer)
: buffer_{ std::move(buffer) }
, data_{ buffer_.data() }
, size_{ buffer_.size() } {}
bool decode(gpuinfo& info) {
bool success = true;
uint64_t raw_core_features {};
uint64_t raw_thread_features {};
while (size_ > 0) {
auto p = next(success);
if (!success) {
return false;
}
prop_id_t id = p.first;
uint64_t value = p.second;
switch (id) {
case prop_id_t::product_id:
info.gpu_id = value;
break;
case prop_id_t::l2_log2_cache_size:
info.num_l2_bytes = 1UL << value;
break;
case prop_id_t::l2_num_l2_slices:
info.num_l2_slices = value;
break;
case prop_id_t::raw_l2_features:
/* log2(bus width) stored in top 8 bits of register. */
info.num_bus_bits = 1UL << ((value >> 24) & 0xFF);
break;
case prop_id_t::raw_core_features:
raw_core_features = value;
break;
case prop_id_t::raw_thread_features:
raw_thread_features = value;
break;
case prop_id_t::coherency_group_0:
case prop_id_t::coherency_group_1:
case prop_id_t::coherency_group_2:
case prop_id_t::coherency_group_3:
info.num_shader_cores += __builtin_popcount(value);
break;
default:
break;
}
}
info.num_exec_engines = get_num_exec_engines(
info.gpu_id,
info.num_shader_cores,
raw_core_features,
raw_thread_features);
if (!info.num_exec_engines) {
return false;
}
info.num_fp32_fmas_per_cy = get_num_fp32_fmas(
info.gpu_id,
info.num_shader_cores,
raw_core_features,
raw_thread_features);
info.num_fp16_fmas_per_cy = info.num_fp32_fmas_per_cy * 2;
info.num_texels_per_cy = get_num_texels(
info.gpu_id,
info.num_shader_cores,
raw_core_features,
raw_thread_features);
info.num_pixels_per_cy = get_num_pixels(
info.gpu_id,
info.num_shader_cores,
raw_core_features,
raw_thread_features);
return true;
}
private:
/** Property id type. */
using prop_id_t = kbase_post_r21::get_gpuprops_t::gpuprop_code;
/** Property size type. */
using prop_size_t = kbase_post_r21::get_gpuprops_t::gpuprop_size;
static std::pair<prop_id_t, prop_size_t> to_prop_metadata(uint32_t v) {
/* Property id/size encoding is:
* +--------+----------+
* | 31 2 | 1 0 |
* +--------+----------+
* | PropId | PropSize |
* +--------+----------+
*/
static unsigned int id_shift { 2 };
static unsigned int size_mask { 0b11 };
return { static_cast<prop_id_t>(v >> id_shift), static_cast<prop_size_t>(v & size_mask) };
}
std::pair<prop_id_t, uint64_t> next(bool& success) {
success = true;
auto p = to_prop_metadata(read_bytes<uint32_t>(success));
if (success)
{
prop_id_t id = p.first;
prop_size_t size = p.second;
switch (size) {
case prop_size_t::uint8:
return { id, read_bytes<uint8_t>(success) };
case prop_size_t::uint16:
return { id, read_bytes<uint16_t>(success) };
case prop_size_t::uint32:
return { id, read_bytes<uint32_t>(success) };
case prop_size_t::uint64:
return { id, read_bytes<uint64_t>(success) };
}
}
return {};
}
template <typename T>
T read_bytes(bool& success) {
// Check we have enough bytes in the buffer
if (size_ < sizeof(T)) {
success = false;
return 0;
}
T ret {};
for (size_t b = 0; b < sizeof(T); b++)
{
ret |= static_cast<T>(static_cast<uint64_t>(data_[b]) << (8 * b));
}
data_ += sizeof(T);
size_ -= sizeof(T);
return ret;
}
std::vector<unsigned char> const buffer_;
unsigned char const *data_;
std::size_t size_;
};
/* See header for documentation */
std::unique_ptr<instance> instance::create(
const uint32_t id
) {
std::string device_path("/dev/mali" + std::to_string(id));
// Open the kernel driver device node
const int fd = ::open(device_path.c_str(), O_RDONLY);
if (fd < 0) {
return nullptr;
}
// Check that it is a character device
struct stat s {};
const int fs_result = fstat(fd, &s);
if ((fs_result < 0) || (S_ISCHR(s.st_mode) == 0)) {
::close(fd);
return nullptr;
}
// Create the instance
auto result = std::unique_ptr<instance>(new instance(fd));
if (!result || !result->valid_) {
return nullptr;
}
return result;
}
/* See header for documentation: read-only access to the stored GPU description. */
const gpuinfo& instance::get_info() const
{
    return this->info_;
}
/* See header for documentation: releases the device file descriptor. */
instance::~instance()
{
    // Nothing useful can be done if close fails during destruction.
    static_cast<void>(::close(fd_));
}
/*
 * See header for documentation.
 *
 * Runs the three probing steps in order and stops at the first failure;
 * valid_ is cleared if any of them fails and is left untouched otherwise.
 */
instance::instance(int fd):
    fd_(fd)
{
    const bool initialized = check_version() && set_flags() && init_props();
    if (!initialized)
    {
        valid_ = false;
    }
}
/**
 * Check whether a driver interface version is new enough.
 *
 * @param major Major version number.
 * @param minor Minor version number.
 * @return true for version 10.2 and anything later.
 */
static bool is_supported(unsigned int major, unsigned int minor)
{
    if (major != 10)
    {
        return major > 10;
    }
    return minor >= 2;
}
/* See header for documentation */
bool instance::check_version() {
// Probe pre-r21 JM kernel
// Must be first in the list because CSF reuses an old IOCTL ID
iface_ = iface_type::pre_r21;
kbase_pre_r21::version_check_t pre_r21 {};
pre_r21.header.id = kbase_pre_r21::header_id::version_check;
::ioctl(fd_, kbase_pre_r21::version_check, &pre_r21);
// If this is non-zero this must be pre-r21 driver, so check version
if (pre_r21.is_set()) {
return is_supported(pre_r21.major, pre_r21.minor);
}
// Probe r21+ JM kernel
iface_ = iface_type::post_r21;
kbase_post_r21::version_check_t post_r21 {};
::ioctl(fd_, kbase_post_r21::version_check_jm, &post_r21);
// If this is non-zero this must be post-r21 JM driver, so check version
if (post_r21.is_set()) {
return is_supported(post_r21.major, post_r21.minor);
}
// Probe r21+ CSF kernel
::ioctl(fd_, kbase_post_r21::version_check_csf, &post_r21);
// If this is any non-zero value this is a valid CSF GPU
return post_r21.is_set();
}
/**
 * Call set flags ioctl.
 *
 * Sends the context creation flags through whichever interface
 * check_version() selected.
 *
 * @return true when errno is 0, EINVAL or EPERM after the call.
 */
bool instance::set_flags()
{
    static constexpr auto system_monitor_flag_submit_disabled_bit = 1;
    static constexpr auto system_monitor_flag = 1U << system_monitor_flag_submit_disabled_bit;

    // Clear errno so that a stale value is not mistaken for a failure
    errno = 0;

    if (iface_ != iface_type::pre_r21)
    {
        kbase_post_r21::set_flags_t flags { system_monitor_flag };
        ::ioctl(fd_, kbase_post_r21::set_flags, &flags);
    }
    else
    {
        kbase_pre_r21::set_flags_t flags {};
        flags.header.id = kbase_pre_r21::header_id::set_flags;
        flags.create_flags = system_monitor_flag;
        ::ioctl(fd_, kbase_pre_r21::set_flags, &flags);
    }

    // Mali driver will fail if reinitialized, but it's benign
    // TODO: Does this ever happen with this usage pattern
    switch (errno)
    {
    case 0:
    case EINVAL:
    case EPERM:
        return true;
    default:
        return false;
    }
}
/*
 * See header for documentation.
 *
 * Queries the properties through the interface selected by check_version()
 * and then derives the fields common to both interfaces.
 */
bool instance::init_props()
{
    const bool success = (iface_ == iface_type::pre_r21)
        ? init_props_pre_r21()
        : init_props_post_r21();
    if (!success)
    {
        return false;
    }

    // Perform some common cleanup on the data
    info_.num_l2_bytes *= info_.num_l2_slices;

    // Both name lookups run before gpu_id is overwritten with its masked form
    info_.gpu_name = get_gpu_name(info_.gpu_id, info_.num_shader_cores);
    info_.architecture_name = get_architecture_name(info_.gpu_id);
    info_.gpu_id = get_gpu_id(info_.gpu_id);
    return true;
}
/*
 * See header for documentation.
 *
 * Reads the GPU properties through the pre-r21 interface, which returns one
 * fixed-layout structure, and fills info_ from it. This interface supplies no
 * core/thread feature values to the product table callbacks, so zero is
 * passed for both.
 *
 * Returns false when the ioctl reports an error through errno.
 */
bool instance::init_props_pre_r21()
{
    kbase_pre_r21::uk_gpuprops_t props {};
    props.header.id = kbase_pre_r21::header_id::get_props;

    errno = 0;
    ::ioctl(fd_, kbase_pre_r21::get_gpuprops, &props);
    if (errno) {
        return false;
    }

    info_.gpu_id = props.props.core_props.product_id;
    info_.num_l2_bytes = 1UL << props.props.l2_props.log2_cache_size;
    info_.num_l2_slices = props.props.l2_props.num_l2_slices;
    /* log2(bus width) stored in top 8 bits of register. */
    info_.num_bus_bits = 1UL << (props.props.raw_props.l2_features >> 24);

    /*
     * num_groups (not num_core_groups) is the number of valid entries in
     * group[]. Clamp it to the array length so that an unexpected value from
     * the driver can never cause a read past the end of the array.
     */
    const auto& coherency = props.props.coherency_info;
    uint32_t valid_groups = coherency.num_groups;
    if (valid_groups > kbase_pre_r21::base_max_coherent_groups) {
        valid_groups = kbase_pre_r21::base_max_coherent_groups;
    }

    info_.num_shader_cores = 0;
    for (uint32_t i = 0; i < valid_groups; i++)
    {
        /* core_mask is 64 bits wide, so the 64-bit popcount is required. */
        info_.num_shader_cores += __builtin_popcountll(coherency.group[i].core_mask);
    }

    info_.num_exec_engines = get_num_exec_engines(
        info_.gpu_id,
        info_.num_shader_cores,
        0, 0);

    info_.num_fp32_fmas_per_cy = get_num_fp32_fmas(
        info_.gpu_id,
        info_.num_shader_cores,
        0, 0);

    info_.num_fp16_fmas_per_cy = info_.num_fp32_fmas_per_cy * 2;

    info_.num_texels_per_cy = get_num_texels(
        info_.gpu_id,
        info_.num_shader_cores,
        0, 0);

    info_.num_pixels_per_cy = get_num_pixels(
        info_.gpu_id,
        info_.num_shader_cores,
        0, 0);

    return true;
}
/*
 * See header for documentation.
 *
 * Reads the GPU properties through the r21+ interface. The ioctl is issued
 * twice: first with a zero-sized buffer, in which case it returns the number
 * of bytes required, then with a buffer of that size to fetch the data.
 *
 * Returns false when either ioctl fails or the property buffer cannot be
 * decoded.
 */
bool instance::init_props_post_r21()
{
    kbase_post_r21::get_gpuprops_t get_props = {};

    // First call: size == 0 asks only for the required buffer size
    errno = 0;
    const int size = ::ioctl(fd_, kbase_post_r21::get_gpuprops, &get_props);
    if ((size < 0) || (errno != 0)) {
        // A negative size must never reach the size_t conversion below
        return false;
    }

    std::vector<unsigned char> buffer(static_cast<std::size_t>(size));
    get_props.size = static_cast<uint32_t>(size);
    get_props.buffer.reset(buffer.data());

    // Second call: fetch the properties into the buffer
    const int stored = ::ioctl(fd_, kbase_post_r21::get_gpuprops, &get_props);
    if ((stored < 0) || (errno != 0)) {
        return false;
    }

    // The buffer is not needed here any more, so hand it over without copying
    prop_decoder decoder { std::move(buffer) };
    return decoder.decode(info_);
}
}