1134 lines
36 KiB
C++
1134 lines
36 KiB
C++
/*
|
|
* Copyright (c) 2021-2023 ARM Limited.
|
|
*
|
|
* SPDX-License-Identifier: MIT
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
* of this software and associated documentation files (the "Software"), to
|
|
* deal in the Software without restriction, including without limitation the
|
|
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
|
* sell copies of the Software, and to permit persons to whom the Software is
|
|
* furnished to do so, subject to the following conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be included in all
|
|
* copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
* SOFTWARE.
|
|
*/
|
|
|
|
#include <array>
|
|
#include <cerrno>
|
|
#include <cstdint>
|
|
#include <functional>
|
|
#include <iostream>
|
|
#include <memory>
|
|
#include <string>
|
|
#include <vector>
|
|
|
|
#include <sys/ioctl.h>
|
|
#include <sys/stat.h>
|
|
#include <fcntl.h>
|
|
#include <unistd.h>
|
|
|
|
#include "libgpuinfo.hpp"
|
|
|
|
namespace libgpuinfo {
|
|
|
|
struct product_entry {
|
|
uint32_t id;
|
|
uint32_t mask;
|
|
uint32_t min_cores;
|
|
const char* name;
|
|
const char* architecture;
|
|
uint32_t fp32_fmas_per_engine;
|
|
std::function<uint32_t(int, uint32_t, uint32_t)> get_num_texels;
|
|
std::function<uint32_t(int, uint32_t, uint32_t)> get_num_pixels;
|
|
std::function<uint32_t(int, uint32_t, uint32_t)> get_num_exec_engines;
|
|
};
|
|
|
|
static const uint32_t MASK_OLD { 0xFFFF };
|
|
static const uint32_t MASK_NEW { 0xF00F };
|
|
|
|
static uint32_t get_num_1(
|
|
int core_count,
|
|
uint32_t core_features,
|
|
uint32_t thread_features
|
|
) {
|
|
return 1;
|
|
}
|
|
|
|
static uint32_t get_num_2(
|
|
int core_count,
|
|
uint32_t core_features,
|
|
uint32_t thread_features
|
|
) {
|
|
return 2;
|
|
}
|
|
|
|
static uint32_t get_num_3(
|
|
int core_count,
|
|
uint32_t core_features,
|
|
uint32_t thread_features
|
|
) {
|
|
return 3;
|
|
}
|
|
|
|
static uint32_t get_num_4(
|
|
int core_count,
|
|
uint32_t core_features,
|
|
uint32_t thread_features
|
|
) {
|
|
return 4;
|
|
}
|
|
|
|
static uint32_t get_num_8(
|
|
int core_count,
|
|
uint32_t core_features,
|
|
uint32_t thread_features
|
|
) {
|
|
return 8;
|
|
}
|
|
|
|
static uint32_t get_num_ee_g31(
|
|
int core_count,
|
|
uint32_t core_features,
|
|
uint32_t thread_features
|
|
) {
|
|
if ((core_count == 1) && ((thread_features & 0xFFFF) == 0x2000))
|
|
{
|
|
return 1;
|
|
}
|
|
return 2;
|
|
}
|
|
|
|
static uint32_t get_num_ee_g51(
|
|
int core_count,
|
|
uint32_t core_features,
|
|
uint32_t thread_features
|
|
) {
|
|
if ((core_count == 1) && ((thread_features & 0xFFFF) == 0x2000))
|
|
{
|
|
return 1;
|
|
}
|
|
return 3;
|
|
}
|
|
|
|
static uint32_t get_num_ee_g52(
|
|
int core_count,
|
|
uint32_t core_features,
|
|
uint32_t thread_features
|
|
) {
|
|
return core_features & 0xF;
|
|
}
|
|
|
|
static uint32_t get_num_ee_g310_g510(
|
|
int core_count,
|
|
uint32_t core_features,
|
|
uint32_t thread_features
|
|
) {
|
|
if ((core_features & 0xF) <= 1)
|
|
{
|
|
return 1;
|
|
}
|
|
|
|
return 2;
|
|
}
|
|
|
|
const std::array<product_entry, 30> PRODUCT_VERSIONS {{
|
|
// ID, ID Mask, Min cores, Name, Args, FMA, Texels, Pixels, Engines
|
|
product_entry { 0x6956, MASK_OLD, 1, "Mali-T600", "Midgard", 4, get_num_1, get_num_1, get_num_2 },
|
|
product_entry { 0x0620, MASK_OLD, 1, "Mali-T620", "Midgard", 4, get_num_1, get_num_1, get_num_2 },
|
|
product_entry { 0x0720, MASK_OLD, 1, "Mali-T720", "Midgard", 4, get_num_1, get_num_1, get_num_1 },
|
|
product_entry { 0x0750, MASK_OLD, 1, "Mali-T760", "Midgard", 4, get_num_1, get_num_1, get_num_2 },
|
|
product_entry { 0x0820, MASK_OLD, 1, "Mali-T820", "Midgard", 4, get_num_1, get_num_1, get_num_1 },
|
|
product_entry { 0x0830, MASK_OLD, 1, "Mali-T830", "Midgard", 4, get_num_1, get_num_1, get_num_2 },
|
|
product_entry { 0x0860, MASK_OLD, 1, "Mali-T860", "Midgard", 4, get_num_1, get_num_1, get_num_2 },
|
|
product_entry { 0x0880, MASK_OLD, 1, "Mali-T880", "Midgard", 4, get_num_1, get_num_1, get_num_3 },
|
|
product_entry { 0x6000, MASK_NEW, 1, "Mali-G71", "Bifrost", 4, get_num_1, get_num_1, get_num_3 },
|
|
product_entry { 0x6001, MASK_NEW, 1, "Mali-G72", "Bifrost", 4, get_num_1, get_num_1, get_num_3 },
|
|
product_entry { 0x7000, MASK_NEW, 1, "Mali-G51", "Bifrost", 4, get_num_2, get_num_2, get_num_ee_g51 },
|
|
product_entry { 0x7001, MASK_NEW, 1, "Mali-G76", "Bifrost", 8, get_num_2, get_num_2, get_num_3 },
|
|
product_entry { 0x7002, MASK_NEW, 1, "Mali-G52", "Bifrost", 8, get_num_2, get_num_2, get_num_ee_g52 },
|
|
product_entry { 0x7003, MASK_NEW, 1, "Mali-G31", "Bifrost", 4, get_num_2, get_num_2, get_num_ee_g31 },
|
|
product_entry { 0x9000, MASK_NEW, 1, "Mali-G77", "Valhall", 16, get_num_4, get_num_2, get_num_2 },
|
|
product_entry { 0x9001, MASK_NEW, 1, "Mali-G57", "Valhall", 16, get_num_4, get_num_2, get_num_2 },
|
|
product_entry { 0x9003, MASK_NEW, 1, "Mali-G57", "Valhall", 16, get_num_4, get_num_2, get_num_2 },
|
|
product_entry { 0x9004, MASK_NEW, 1, "Mali-G68", "Valhall", 16, get_num_4, get_num_2, get_num_2 },
|
|
product_entry { 0x9002, MASK_NEW, 1, "Mali-G78", "Valhall", 16, get_num_4, get_num_2, get_num_2 },
|
|
product_entry { 0x9005, MASK_NEW, 1, "Mali-G78AE", "Valhall", 16, get_num_4, get_num_2, get_num_2 },
|
|
product_entry { 0xa002, MASK_NEW, 1, "Mali-G710", "Valhall", 32, get_num_8, get_num_4, get_num_2 },
|
|
product_entry { 0xa007, MASK_NEW, 1, "Mali-G610", "Valhall", 32, get_num_8, get_num_4, get_num_2 },
|
|
// TODO: Extract FMA, pixel, and texel settings
|
|
product_entry { 0xa003, MASK_NEW, 1, "Mali-G510", "Valhall", 32, get_num_8, get_num_4, get_num_ee_g310_g510 },
|
|
// TODO: Extract FMA, pixel, and texel settings
|
|
product_entry { 0xa004, MASK_NEW, 1, "Mali-G310", "Valhall", 32, get_num_8, get_num_4, get_num_ee_g310_g510 },
|
|
product_entry { 0xb002, MASK_NEW, 10, "Immortalis-G715", "Valhall", 64, get_num_8, get_num_4, get_num_2 },
|
|
product_entry { 0xb003, MASK_NEW, 10, "Immortalis-G715", "Valhall", 64, get_num_8, get_num_4, get_num_2 },
|
|
product_entry { 0xb002, MASK_NEW, 7, "Mali-G715", "Valhall", 64, get_num_8, get_num_4, get_num_2 },
|
|
product_entry { 0xb003, MASK_NEW, 7, "Mali-G715", "Valhall", 64, get_num_8, get_num_4, get_num_2 },
|
|
product_entry { 0xb002, MASK_NEW, 1, "Mali-G615", "Valhall", 64, get_num_8, get_num_4, get_num_2 },
|
|
product_entry { 0xb003, MASK_NEW, 1, "Mali-G615", "Valhall", 64, get_num_8, get_num_4, get_num_2 },
|
|
}};
|
|
|
|
uint32_t get_gpu_id(
|
|
uint32_t gpu_id
|
|
) {
|
|
for (const auto& entry : PRODUCT_VERSIONS)
|
|
{
|
|
if (((gpu_id & entry.mask) == entry.id))
|
|
{
|
|
return gpu_id & entry.mask;
|
|
}
|
|
}
|
|
|
|
return gpu_id;
|
|
}
|
|
|
|
const char* get_gpu_name(
|
|
uint32_t gpu_id,
|
|
int core_count
|
|
) {
|
|
for (const auto& entry : PRODUCT_VERSIONS)
|
|
{
|
|
if(((gpu_id & entry.mask) == entry.id) &&
|
|
(core_count >= entry.min_cores))
|
|
{
|
|
return entry.name;
|
|
}
|
|
}
|
|
|
|
return "Unknown gpu_id";
|
|
}
|
|
|
|
const char* get_architecture_name(
|
|
uint32_t gpu_id
|
|
) {
|
|
for (const auto& entry : PRODUCT_VERSIONS)
|
|
{
|
|
if((gpu_id & entry.mask) == entry.id)
|
|
{
|
|
return entry.architecture;
|
|
}
|
|
}
|
|
|
|
return "Unknown gpu_id";
|
|
}
|
|
|
|
int get_num_exec_engines(
|
|
uint32_t gpu_id,
|
|
int core_count,
|
|
uint32_t core_features,
|
|
uint32_t thread_features
|
|
) {
|
|
for (const auto& entry : PRODUCT_VERSIONS)
|
|
{
|
|
if(((gpu_id & entry.mask) == entry.id) &&
|
|
(core_count >= entry.min_cores))
|
|
{
|
|
return entry.get_num_exec_engines(core_count, core_features, thread_features);
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
const uint32_t get_num_fp32_fmas(
|
|
uint32_t gpu_id,
|
|
int core_count,
|
|
uint32_t core_features,
|
|
uint32_t thread_features
|
|
) {
|
|
for (const auto& entry : PRODUCT_VERSIONS)
|
|
{
|
|
if(((gpu_id & entry.mask) == entry.id) &&
|
|
(core_count >= entry.min_cores))
|
|
{
|
|
return entry.fp32_fmas_per_engine * entry.get_num_exec_engines(core_count, core_features, thread_features);
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
const uint32_t get_num_texels(
|
|
uint32_t gpu_id,
|
|
int core_count,
|
|
uint32_t core_features,
|
|
uint32_t thread_features
|
|
) {
|
|
for (const auto& entry : PRODUCT_VERSIONS)
|
|
{
|
|
if(((gpu_id & entry.mask) == entry.id) &&
|
|
(core_count >= entry.min_cores))
|
|
{
|
|
return entry.get_num_texels(core_count, core_features, thread_features);
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
const uint32_t get_num_pixels(
|
|
uint32_t gpu_id,
|
|
int core_count,
|
|
uint32_t core_features,
|
|
uint32_t thread_features
|
|
) {
|
|
for (const auto& entry : PRODUCT_VERSIONS)
|
|
{
|
|
if(((gpu_id & entry.mask) == entry.id) &&
|
|
(core_count >= entry.min_cores))
|
|
{
|
|
return entry.get_num_pixels(core_count, core_features, thread_features);
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
/** Kbase Pre R21 ioctl interface. */
|
|
namespace kbase_pre_r21 {
|
|
|
|
/** Related to mali0 ioctl interface */
|
|
enum class header_id : uint32_t {
|
|
/** Version check. */
|
|
version_check = 0,
|
|
/** Base Context Create Kernel Flags. */
|
|
create_kernel_flags = 2,
|
|
/** Kbase Func Get Props. */
|
|
get_props = 526,
|
|
/** Kbase Func Set Flags. */
|
|
set_flags = 530,
|
|
};
|
|
|
|
/** Message header. */
|
|
union uk_header {
|
|
/** Number identifying the called UK function. */
|
|
header_id id;
|
|
/** The return code of the called UK function. */
|
|
uint32_t ret;
|
|
/** Dummy to ensure type has 64-bit alignment */
|
|
uint64_t sizer;
|
|
};
|
|
|
|
/** Check version compatibility between kernel and userspace. */
|
|
struct version_check_t {
|
|
/** UK header */
|
|
uk_header header;
|
|
/** Major version number */
|
|
uint16_t major;
|
|
/** Minor version number */
|
|
uint16_t minor;
|
|
|
|
bool is_set() const
|
|
{
|
|
return major || minor;
|
|
}
|
|
};
|
|
|
|
/** IOCTL parameters to set flags */
|
|
struct set_flags_t {
|
|
/** UK header */
|
|
uk_header header;
|
|
/** Create flags */
|
|
uint32_t create_flags;
|
|
/** Padding */
|
|
uint32_t padding;
|
|
};
|
|
|
|
/** Base GPU Num Texture Features Registers. */
|
|
static constexpr const uint32_t base_gpu_num_texture_features_registers = 3;
|
|
|
|
/** Base Max Coherent Groups. */
|
|
static constexpr const uint32_t base_max_coherent_groups = 16;
|
|
|
|
/** GPU Max Job Slots. */
|
|
static constexpr const uint32_t gpu_max_job_slots = 16;
|
|
|
|
/** Kbase UK GPU props. */
|
|
struct uk_gpuprops_t {
|
|
/**
|
|
* IOCTL parameters to probe GPU properties
|
|
*
|
|
* NOTE: the raw_props member in this data structure contains the register
|
|
* values from which the value of the other members are derived. The derived
|
|
* members exist to allow for efficient access and/or shielding the details
|
|
* of the layout of the registers.
|
|
*
|
|
*/
|
|
struct gpu_props {
|
|
/** Core. */
|
|
struct core {
|
|
/** Product specific value. */
|
|
uint32_t product_id;
|
|
/**
|
|
* Status of the GPU release.
|
|
* No defined values, but starts at 0 and increases by one for each
|
|
* release status (alpha, beta, EAC, etc.).
|
|
* 4 bit values (0-15).
|
|
*/
|
|
uint16_t version_status;
|
|
/**
|
|
* Minor release number of the GPU. "P" part of an "RnPn" release number.
|
|
* 8 bit values (0-255).
|
|
*/
|
|
uint16_t minor_revision;
|
|
/**
|
|
* Major release number of the GPU. "R" part of an "RnPn" release number.
|
|
* 4 bit values (0-15).
|
|
*/
|
|
uint16_t major_revision;
|
|
/** Padding. */
|
|
uint16_t padding;
|
|
/**
|
|
* This property is deprecated since it has not contained the real current
|
|
* value of GPU clock speed. It is kept here only for backwards compatibility.
|
|
* For the new ioctl interface, it is ignored and is treated as a padding
|
|
* to keep the structure of the same size and retain the placement of its
|
|
* members.
|
|
*/
|
|
uint32_t gpu_speed_mhz;
|
|
/**
|
|
* @usecase GPU clock max speed is required for computing best case
|
|
* in tasks as job scheduling ant irq_throttling. (It is not specified in the
|
|
* Midgard Architecture).
|
|
* Also, GPU clock max speed is used for OpenCL's clGetDeviceInfo() function.
|
|
*/
|
|
uint32_t gpu_freq_khz_max;
|
|
/**
|
|
* @usecase GPU clock min speed is required for computing worst case
|
|
* in tasks as job scheduling ant irq_throttling. (It is not specified in the
|
|
* Midgard Architecture).
|
|
*/
|
|
uint32_t gpu_freq_khz_min;
|
|
/** Size of the shader program counter, in bits. */
|
|
uint32_t log2_program_counter_size;
|
|
/**
|
|
* TEXTURE_FEATURES_x registers, as exposed by the GPU. This is a
|
|
* bitpattern where a set bit indicates that the format is supported.
|
|
*
|
|
* Before using a texture format, it is recommended that the corresponding
|
|
* bit be checked.
|
|
*/
|
|
uint32_t texture_features[base_gpu_num_texture_features_registers];
|
|
/**
|
|
* Theoretical maximum memory available to the GPU. It is unlikely that a
|
|
* client will be able to allocate all of this memory for their own
|
|
* purposes, but this at least provides an upper bound on the memory
|
|
* available to the GPU.
|
|
*
|
|
* This is required for OpenCL's clGetDeviceInfo() call when
|
|
* CL_DEVICE_GLOBAL_MEM_SIZE is requested, for OpenCL GPU devices. The
|
|
* client will not be expecting to allocate anywhere near this value.
|
|
*/
|
|
uint64_t gpu_available_memory_size;
|
|
};
|
|
|
|
/**
|
|
* More information is possible - but associativity and bus width are not
|
|
* required by upper-level apis.
|
|
*/
|
|
struct l2_cache {
|
|
/** Log2 Line Size. */
|
|
uint8_t log2_line_size;
|
|
/** Log2 Cache Size. */
|
|
uint8_t log2_cache_size;
|
|
/** Num L2 Slices. */
|
|
uint8_t num_l2_slices;
|
|
/** Padding bytes. */
|
|
uint8_t padding[5];
|
|
};
|
|
|
|
/** Tiler. */
|
|
struct tiler {
|
|
/** Max is 4*2^15 */
|
|
uint32_t bin_size_bytes;
|
|
/** Max is 2^15 */
|
|
uint32_t max_active_levels;
|
|
};
|
|
|
|
/** GPU threading system details. */
|
|
struct thread {
|
|
/** Max. number of threads per core */
|
|
uint32_t max_threads;
|
|
/** Max. number of threads per workgroup */
|
|
uint32_t max_workgroup_size;
|
|
/** Max. number of threads that can synchronize on a simple barrier */
|
|
uint32_t max_barrier_size;
|
|
/** Total size [1..65535] of the register file available per core. */
|
|
uint16_t max_registers;
|
|
/** Max. tasks [1..255] which may be sent to a core before it becomes blocked. */
|
|
uint8_t max_task_queue;
|
|
/** Max. allowed value [1..15] of the Thread Group Split field. */
|
|
uint8_t max_thread_group_split;
|
|
/** 0 = Not specified, 1 = Silicon, 2 = FPGA, 3 = SW Model/Emulation */
|
|
uint8_t impl_tech;
|
|
/** Padding bytes. */
|
|
uint8_t padding[7];
|
|
};
|
|
|
|
/**
|
|
* A complete description of the GPU's Hardware Configuration Discovery
|
|
* registers.
|
|
*
|
|
* The information is presented inefficiently for access. For frequent access,
|
|
* the values should be better expressed in an unpacked form in the
|
|
* base_gpu_props structure.
|
|
*
|
|
* @usecase The raw properties in @ref gpu_raw_gpu_props are necessary to
|
|
* allow a user of the Mali Tools (e.g. PAT) to determine "Why is this device
|
|
* behaving differently?". In this case, all information about the
|
|
* by the driver</b>. Instead, the raw registers can be processed by the Mali
|
|
* Tools software on the host PC.
|
|
*/
|
|
struct raw {
|
|
/** Shader Present. */
|
|
uint64_t shader_present;
|
|
/** Tiler Present. */
|
|
uint64_t tiler_present;
|
|
/** L2 Present. */
|
|
uint64_t l2_present;
|
|
/** Unused 1. */
|
|
uint64_t unused_1;
|
|
/** L2 Features. */
|
|
uint32_t l2_features;
|
|
/** Suspend Size. */
|
|
uint32_t suspend_size;
|
|
/** Mem Features. */
|
|
uint32_t mem_features;
|
|
/** Mmu Features. */
|
|
uint32_t mmu_features;
|
|
/** As Present. */
|
|
uint32_t as_present;
|
|
/** Js Present. */
|
|
uint32_t js_present;
|
|
/** Js Features. */
|
|
uint32_t js_features[gpu_max_job_slots];
|
|
/** Tiler Features. */
|
|
uint32_t tiler_features;
|
|
/** Texture Features. */
|
|
uint32_t texture_features[3];
|
|
/** GPU ID. */
|
|
uint32_t gpu_id;
|
|
/** Thread Max Threads. */
|
|
uint32_t thread_max_threads;
|
|
/** Thread Max Workgroup Size. */
|
|
uint32_t thread_max_workgroup_size;
|
|
/** Thread Max Barrier Size. */
|
|
uint32_t thread_max_barrier_size;
|
|
/** Thread Features. */
|
|
uint32_t thread_features;
|
|
/**
|
|
* Coherency Mode.
|
|
* Note: This is the _selected_ coherency mode rather than the
|
|
* available modes as exposed in the coherency_features register.
|
|
*/
|
|
uint32_t coherency_mode;
|
|
};
|
|
|
|
/**
|
|
* Coherency group information
|
|
*
|
|
* Note that the sizes of the members could be reduced. However, the \c group
|
|
* member might be 8-byte aligned to ensure the u64 core_mask is 8-byte
|
|
* aligned, thus leading to wastage if the other members sizes were reduced.
|
|
*
|
|
* The groups are sorted by core mask. The core masks are non-repeating and do
|
|
* not intersect.
|
|
*/
|
|
struct coherent_group_info {
|
|
/**
|
|
* descriptor for a coherent group
|
|
*
|
|
* \c core_mask exposes all cores in that coherent group, and \c num_cores
|
|
* provides a cached population-count for that mask.
|
|
*
|
|
* @note Whilst all cores are exposed in the mask, not all may be available to
|
|
* the application, depending on the Kernel Power policy.
|
|
*
|
|
* @note if u64s must be 8-byte aligned, then this structure has 32-bits of
|
|
* wastage.
|
|
*/
|
|
struct coherent_group {
|
|
/** Core restriction mask required for the group */
|
|
uint64_t core_mask;
|
|
/** Number of cores in the group */
|
|
uint16_t num_cores;
|
|
/** Padding bytes. */
|
|
uint16_t padding[3];
|
|
};
|
|
|
|
/** Num Groups. */
|
|
uint32_t num_groups;
|
|
/**
|
|
* Number of core groups (coherent or not) in the GPU. Equivalent to the number of
|
|
* L2 Caches.
|
|
* The GPU Counter dumping writes 2048 bytes per core group, regardless of whether
|
|
* the core groups are coherent or not. Hence this member is needed to calculate
|
|
* how much memory is required for dumping.
|
|
* @note Do not use it to work out how many valid elements are in the group[]
|
|
* member. Use num_groups instead.
|
|
*/
|
|
uint32_t num_core_groups;
|
|
/** Coherency features of the memory, accessed by @ref gpu_mem_features methods. */
|
|
uint32_t coherency;
|
|
/** Padding. */
|
|
uint32_t padding;
|
|
/** Descriptors of coherent groups */
|
|
coherent_group group[base_max_coherent_groups];
|
|
};
|
|
|
|
/** Core Props. */
|
|
core core_props;
|
|
/** L2 Props. */
|
|
l2_cache l2_props;
|
|
/** Unused to keep for backwards compatibility. */
|
|
uint64_t unused;
|
|
/** Tiler Props. */
|
|
tiler tiler_props;
|
|
/** Thread Props. */
|
|
thread thread_props;
|
|
/** This member is large, likely to be 128 bytes. */
|
|
raw raw_props;
|
|
/** This must be last member of the structure. */
|
|
coherent_group_info coherency_info;
|
|
};
|
|
|
|
/** Header. */
|
|
uk_header header;
|
|
/** Props. */
|
|
gpu_props props;
|
|
};
|
|
|
|
constexpr auto iface_number = 0x80;
|
|
|
|
/** Commands describing kbase_pre_r21 ioctl interface. */
|
|
enum command_type {
|
|
/** Check version compatibility between JM kernel and userspace. */
|
|
version_check = _IOWR(iface_number, 0x0, version_check_t),
|
|
/** Set kernel context creation flags. */
|
|
set_flags = _IOWR(iface_number, 0x212, set_flags_t),
|
|
/** Get GPU properties. */
|
|
get_gpuprops = _IOWR(iface_number, 0x20e, uk_gpuprops_t),
|
|
};
|
|
|
|
}
|
|
|
|
/** Kbase Post R21 ioctl interface. */
|
|
namespace kbase_post_r21 {
|
|
|
|
template <typename value_t>
|
|
class pointer64 {
|
|
public:
|
|
/** @return Pointer to the object. */
|
|
value_t* get() const {
|
|
return reinterpret_cast<value_t*>(static_cast<uintptr_t>(value));
|
|
}
|
|
|
|
/**
|
|
* Set pointer value.
|
|
*
|
|
* @param ptr The new pointer value.
|
|
*/
|
|
void reset(value_t* ptr) {
|
|
value = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(ptr));
|
|
}
|
|
|
|
private:
|
|
/** Pointer value as uint64_t. */
|
|
uint64_t value { 0 };
|
|
};
|
|
|
|
/** Check version compatibility between kernel and userspace. */
|
|
struct version_check_t {
|
|
/** Major version number. */
|
|
uint16_t major;
|
|
/** Minor version number */
|
|
uint16_t minor;
|
|
|
|
bool is_set() const
|
|
{
|
|
return major || minor;
|
|
}
|
|
};
|
|
|
|
/** Set kernel context creation flags. */
|
|
struct set_flags_t {
|
|
/** kernel context creation flags. */
|
|
uint32_t create_flags;
|
|
};
|
|
|
|
/**
|
|
* The ioctl will return the number of bytes stored into buffer or an error
|
|
* on failure (e.g. size is too small). If size is specified as 0 then no
|
|
* data will be written but the return value will be the number of bytes needed
|
|
* for all the properties.
|
|
*
|
|
* flags may be used in the future to request a different format for the
|
|
* buffer. With flags == 0 the following format is used.
|
|
*
|
|
* The buffer will be filled with pairs of values, a __u32 key identifying the
|
|
* property followed by the value. The size of the value is identified using
|
|
* the bottom bits of the key. The value then immediately followed the key and
|
|
* is tightly packed (there is no padding). All keys and values are
|
|
* little-endian.
|
|
*
|
|
* 00 = __u8
|
|
* 01 = __u16
|
|
* 10 = __u32
|
|
* 11 = __u64
|
|
*/
|
|
struct get_gpuprops_t {
|
|
/** GPU property size. */
|
|
enum class gpuprop_size : uint8_t {
|
|
/** Property type is uint8_t. */
|
|
uint8 = 0x0,
|
|
/** Property type is uint16_t. */
|
|
uint16 = 0x1,
|
|
/** Property type is uint32_t. */
|
|
uint32 = 0x2,
|
|
/** Property type is uint64_t. */
|
|
uint64 = 0x3
|
|
};
|
|
|
|
/** GPU properties codes. */
|
|
enum class gpuprop_code : uint8_t {
|
|
/** Product id. */
|
|
product_id = 1,
|
|
/** L2 log2 line size. */
|
|
l2_log2_line_size = 13,
|
|
/** L2 log2 cache size. */
|
|
l2_log2_cache_size = 14,
|
|
/** L2 num l2 slices. */
|
|
l2_num_l2_slices = 15,
|
|
/** Max threads. */
|
|
max_threads = 18,
|
|
/** Max registers. */
|
|
max_registers = 21,
|
|
/** Raw l2 features. */
|
|
raw_l2_features = 29,
|
|
/** Raw core features. */
|
|
raw_core_features = 30,
|
|
/** Raw thread max threads. */
|
|
raw_thread_max_threads = 56,
|
|
/** Raw thread max workgroup size. */
|
|
raw_thread_max_workgroup_size = 57,
|
|
/** Raw thread max barrier size. */
|
|
raw_thread_max_barrier_size = 58,
|
|
/** Raw thread features. */
|
|
raw_thread_features = 59,
|
|
/** Raw coherency mode. */
|
|
raw_coherency_mode = 60,
|
|
/** Coherency num groups. */
|
|
coherency_num_groups = 61,
|
|
/** Coherency num core groups. */
|
|
coherency_num_core_groups = 62,
|
|
/** Coherency coherency. */
|
|
coherency_coherency = 63,
|
|
/** Coherency group 0. */
|
|
coherency_group_0 = 64,
|
|
/** Coherency group 1. */
|
|
coherency_group_1 = 65,
|
|
/** Coherency group 2. */
|
|
coherency_group_2 = 66,
|
|
/** Coherency group 3. */
|
|
coherency_group_3 = 67,
|
|
/** Num exec engines. */
|
|
num_exec_engines = 82
|
|
};
|
|
|
|
/** Pointer to the buffer to store properties into. */
|
|
pointer64<uint8_t> buffer;
|
|
|
|
/** Size of the buffer. */
|
|
uint32_t size;
|
|
|
|
/** Flags - must be zero for now. */
|
|
uint32_t flags;
|
|
};
|
|
|
|
constexpr auto iface_number = 0x80;
|
|
|
|
/** Commands describing kbase ioctl interface. */
|
|
enum command_type {
|
|
/** Check version compatibility between JM kernel and userspace. */
|
|
version_check_jm = _IOWR(iface_number, 0x0, version_check_t),
|
|
/** Check version compatibility between CSF kernel and userspace. */
|
|
version_check_csf = _IOWR(iface_number, 0x34, version_check_t),
|
|
/** Set kernel context creation flags. */
|
|
set_flags = _IOW(iface_number, 0x1, set_flags_t),
|
|
/** Get GPU properties. */
|
|
get_gpuprops = _IOW(iface_number, 0x3, get_gpuprops_t),
|
|
};
|
|
|
|
}
|
|
|
|
class prop_decoder {
|
|
public:
|
|
prop_decoder(std::vector<unsigned char> buffer)
|
|
: buffer_{ std::move(buffer) }
|
|
, data_{ buffer_.data() }
|
|
, size_{ buffer_.size() } {}
|
|
|
|
bool decode(gpuinfo& info) {
|
|
bool success = true;
|
|
|
|
uint64_t raw_core_features {};
|
|
uint64_t raw_thread_features {};
|
|
|
|
while (size_ > 0) {
|
|
auto p = next(success);
|
|
if (!success) {
|
|
return false;
|
|
}
|
|
|
|
prop_id_t id = p.first;
|
|
uint64_t value = p.second;
|
|
|
|
switch (id) {
|
|
case prop_id_t::product_id:
|
|
info.gpu_id = value;
|
|
break;
|
|
case prop_id_t::l2_log2_cache_size:
|
|
info.num_l2_bytes = 1UL << value;
|
|
break;
|
|
case prop_id_t::l2_num_l2_slices:
|
|
info.num_l2_slices = value;
|
|
break;
|
|
case prop_id_t::raw_l2_features:
|
|
/* log2(bus width) stored in top 8 bits of register. */
|
|
info.num_bus_bits = 1UL << ((value >> 24) & 0xFF);
|
|
break;
|
|
case prop_id_t::raw_core_features:
|
|
raw_core_features = value;
|
|
break;
|
|
case prop_id_t::raw_thread_features:
|
|
raw_thread_features = value;
|
|
break;
|
|
case prop_id_t::coherency_group_0:
|
|
case prop_id_t::coherency_group_1:
|
|
case prop_id_t::coherency_group_2:
|
|
case prop_id_t::coherency_group_3:
|
|
info.num_shader_cores += __builtin_popcount(value);
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
}
|
|
|
|
info.num_exec_engines = get_num_exec_engines(
|
|
info.gpu_id,
|
|
info.num_shader_cores,
|
|
raw_core_features,
|
|
raw_thread_features);
|
|
|
|
if (!info.num_exec_engines) {
|
|
return false;
|
|
}
|
|
|
|
info.num_fp32_fmas_per_cy = get_num_fp32_fmas(
|
|
info.gpu_id,
|
|
info.num_shader_cores,
|
|
raw_core_features,
|
|
raw_thread_features);
|
|
|
|
info.num_fp16_fmas_per_cy = info.num_fp32_fmas_per_cy * 2;
|
|
|
|
info.num_texels_per_cy = get_num_texels(
|
|
info.gpu_id,
|
|
info.num_shader_cores,
|
|
raw_core_features,
|
|
raw_thread_features);
|
|
|
|
info.num_pixels_per_cy = get_num_pixels(
|
|
info.gpu_id,
|
|
info.num_shader_cores,
|
|
raw_core_features,
|
|
raw_thread_features);
|
|
|
|
return true;
|
|
}
|
|
|
|
private:
|
|
/** Property id type. */
|
|
using prop_id_t = kbase_post_r21::get_gpuprops_t::gpuprop_code;
|
|
/** Property size type. */
|
|
using prop_size_t = kbase_post_r21::get_gpuprops_t::gpuprop_size;
|
|
|
|
static std::pair<prop_id_t, prop_size_t> to_prop_metadata(uint32_t v) {
|
|
/* Property id/size encoding is:
|
|
* +--------+----------+
|
|
* | 31 2 | 1 0 |
|
|
* +--------+----------+
|
|
* | PropId | PropSize |
|
|
* +--------+----------+
|
|
*/
|
|
static unsigned int id_shift { 2 };
|
|
static unsigned int size_mask { 0b11 };
|
|
|
|
return { static_cast<prop_id_t>(v >> id_shift), static_cast<prop_size_t>(v & size_mask) };
|
|
}
|
|
|
|
std::pair<prop_id_t, uint64_t> next(bool& success) {
|
|
success = true;
|
|
auto p = to_prop_metadata(read_bytes<uint32_t>(success));
|
|
if (success)
|
|
{
|
|
prop_id_t id = p.first;
|
|
prop_size_t size = p.second;
|
|
|
|
switch (size) {
|
|
case prop_size_t::uint8:
|
|
return { id, read_bytes<uint8_t>(success) };
|
|
case prop_size_t::uint16:
|
|
return { id, read_bytes<uint16_t>(success) };
|
|
case prop_size_t::uint32:
|
|
return { id, read_bytes<uint32_t>(success) };
|
|
case prop_size_t::uint64:
|
|
return { id, read_bytes<uint64_t>(success) };
|
|
}
|
|
}
|
|
|
|
return {};
|
|
}
|
|
|
|
template <typename T>
|
|
T read_bytes(bool& success) {
|
|
// Check we have enough bytes in the buffer
|
|
if (size_ < sizeof(T)) {
|
|
success = false;
|
|
return 0;
|
|
}
|
|
|
|
T ret {};
|
|
for (size_t b = 0; b < sizeof(T); b++)
|
|
{
|
|
ret |= static_cast<T>(static_cast<uint64_t>(data_[b]) << (8 * b));
|
|
}
|
|
data_ += sizeof(T);
|
|
size_ -= sizeof(T);
|
|
return ret;
|
|
}
|
|
|
|
std::vector<unsigned char> const buffer_;
|
|
unsigned char const *data_;
|
|
std::size_t size_;
|
|
};
|
|
|
|
/* See header for documentation */
|
|
std::unique_ptr<instance> instance::create(
|
|
const uint32_t id
|
|
) {
|
|
std::string device_path("/dev/mali" + std::to_string(id));
|
|
|
|
// Open the kernel driver device node
|
|
const int fd = ::open(device_path.c_str(), O_RDONLY);
|
|
if (fd < 0) {
|
|
return nullptr;
|
|
}
|
|
|
|
// Check that it is a character device
|
|
struct stat s {};
|
|
const int fs_result = fstat(fd, &s);
|
|
if ((fs_result < 0) || (S_ISCHR(s.st_mode) == 0)) {
|
|
::close(fd);
|
|
return nullptr;
|
|
}
|
|
|
|
// Create the instance
|
|
auto result = std::unique_ptr<instance>(new instance(fd));
|
|
if (!result || !result->valid_) {
|
|
return nullptr;
|
|
}
|
|
|
|
return result;
|
|
}
|
|
|
|
/* See header for documentation */
|
|
const gpuinfo& instance::get_info() const
|
|
{
|
|
return info_;
|
|
};
|
|
|
|
/* See header for documentation */
|
|
instance::~instance()
|
|
{
|
|
::close(fd_);
|
|
}
|
|
|
|
/* See header for documentation */
|
|
instance::instance(int fd):
|
|
fd_(fd)
|
|
{
|
|
if (!check_version()) {
|
|
valid_ = false;
|
|
return;
|
|
}
|
|
|
|
if (!set_flags()) {
|
|
valid_ = false;
|
|
return;
|
|
}
|
|
|
|
if (!init_props()) {
|
|
valid_ = false;
|
|
return;
|
|
}
|
|
}
|
|
|
|
static bool is_supported(unsigned int major, unsigned int minor)
|
|
{
|
|
return (major > 10) || ((major == 10) && (minor >= 2));
|
|
}
|
|
|
|
/* See header for documentation */
|
|
bool instance::check_version() {
|
|
// Probe pre-r21 JM kernel
|
|
// Must be first in the list because CSF reuses an old IOCTL ID
|
|
iface_ = iface_type::pre_r21;
|
|
kbase_pre_r21::version_check_t pre_r21 {};
|
|
pre_r21.header.id = kbase_pre_r21::header_id::version_check;
|
|
::ioctl(fd_, kbase_pre_r21::version_check, &pre_r21);
|
|
// If this is non-zero this must be pre-r21 driver, so check version
|
|
if (pre_r21.is_set()) {
|
|
return is_supported(pre_r21.major, pre_r21.minor);
|
|
}
|
|
|
|
// Probe r21+ JM kernel
|
|
iface_ = iface_type::post_r21;
|
|
kbase_post_r21::version_check_t post_r21 {};
|
|
::ioctl(fd_, kbase_post_r21::version_check_jm, &post_r21);
|
|
// If this is non-zero this must be post-r21 JM driver, so check version
|
|
if (post_r21.is_set()) {
|
|
return is_supported(post_r21.major, post_r21.minor);
|
|
}
|
|
|
|
// Probe r21+ CSF kernel
|
|
::ioctl(fd_, kbase_post_r21::version_check_csf, &post_r21);
|
|
// If this is any non-zero value this is a valid CSF GPU
|
|
return post_r21.is_set();
|
|
}
|
|
|
|
/** Call set flags ioctl. */
|
|
bool instance::set_flags() {
|
|
static constexpr auto system_monitor_flag_submit_disabled_bit = 1;
|
|
static constexpr auto system_monitor_flag = 1U << system_monitor_flag_submit_disabled_bit;
|
|
|
|
// Clear errno
|
|
errno = 0;
|
|
|
|
if (iface_ == iface_type::pre_r21) {
|
|
kbase_pre_r21::set_flags_t flags {};
|
|
flags.header.id = kbase_pre_r21::header_id::set_flags;
|
|
flags.create_flags = system_monitor_flag;
|
|
::ioctl(fd_, kbase_pre_r21::set_flags, &flags);
|
|
} else {
|
|
kbase_post_r21::set_flags_t flags { system_monitor_flag };
|
|
::ioctl(fd_, kbase_post_r21::set_flags, &flags);
|
|
}
|
|
|
|
// Mali driver will fail if reinitialized, but it's benign
|
|
// TODO: Does this ever happen with this usage pattern
|
|
return errno == 0 || errno == EINVAL || errno == EPERM;
|
|
}
|
|
|
|
/* See header for documentation */
|
|
bool instance::init_props() {
|
|
bool success;
|
|
if (iface_ == iface_type::pre_r21) {
|
|
success = init_props_pre_r21();
|
|
} else {
|
|
success = init_props_post_r21();
|
|
}
|
|
|
|
// Perform some common cleanup on the data
|
|
if (!success)
|
|
{
|
|
return false;
|
|
}
|
|
|
|
info_.num_l2_bytes *= info_.num_l2_slices;
|
|
info_.gpu_name = get_gpu_name(info_.gpu_id, info_.num_shader_cores);
|
|
info_.architecture_name = get_architecture_name(info_.gpu_id);
|
|
info_.gpu_id = get_gpu_id(info_.gpu_id);
|
|
return true;
|
|
}
|
|
|
|
/* See header for documentation */
|
|
bool instance::init_props_pre_r21() {
|
|
int error = 0;
|
|
|
|
kbase_pre_r21::uk_gpuprops_t props {};
|
|
props.header.id = kbase_pre_r21::header_id::get_props;
|
|
errno = 0;
|
|
::ioctl(fd_, kbase_pre_r21::get_gpuprops, &props);
|
|
if (errno) {
|
|
return false;
|
|
}
|
|
|
|
info_.gpu_id = props.props.core_props.product_id;
|
|
info_.num_l2_bytes = 1UL << props.props.l2_props.log2_cache_size;
|
|
info_.num_l2_slices = props.props.l2_props.num_l2_slices;
|
|
info_.num_bus_bits = 1UL << (props.props.raw_props.l2_features >> 24);
|
|
|
|
info_.num_shader_cores = 0;
|
|
for (uint32_t i = 0; i < props.props.coherency_info.num_core_groups; i++)
|
|
{
|
|
info_.num_shader_cores += __builtin_popcount(props.props.coherency_info.group[i].core_mask);
|
|
}
|
|
|
|
info_.num_exec_engines = get_num_exec_engines(
|
|
info_.gpu_id,
|
|
info_.num_shader_cores,
|
|
0, 0);
|
|
|
|
info_.num_fp32_fmas_per_cy = get_num_fp32_fmas(
|
|
info_.gpu_id,
|
|
info_.num_shader_cores,
|
|
0, 0);
|
|
|
|
info_.num_fp16_fmas_per_cy = info_.num_fp32_fmas_per_cy * 2;
|
|
|
|
info_.num_texels_per_cy = get_num_texels(
|
|
info_.gpu_id,
|
|
info_.num_shader_cores,
|
|
0, 0);
|
|
|
|
info_.num_pixels_per_cy = get_num_pixels(
|
|
info_.gpu_id,
|
|
info_.num_shader_cores,
|
|
0, 0);
|
|
|
|
return true;
|
|
}
|
|
|
|
/* See header for documentation */
|
|
bool instance::init_props_post_r21() {
|
|
errno = 0;
|
|
|
|
kbase_post_r21::get_gpuprops_t get_props = {};
|
|
int size = ::ioctl(fd_, kbase_post_r21::get_gpuprops, &get_props);
|
|
if (errno) {
|
|
return false;
|
|
}
|
|
|
|
std::vector<unsigned char> buffer(static_cast<std::size_t>(size));
|
|
get_props.size = static_cast<uint32_t>(size);
|
|
get_props.buffer.reset(buffer.data());
|
|
::ioctl(fd_, kbase_post_r21::get_gpuprops, &get_props);
|
|
if (errno) {
|
|
return false;
|
|
}
|
|
|
|
prop_decoder decoder { buffer };
|
|
return decoder.decode(info_);
|
|
}
|
|
|
|
}
|