Files
UnrealEngine/Engine/Source/Programs/BreakpadSymbolEncoder/Private/BreakpadSymbolEncoder.cpp
2025-05-18 13:04:45 +08:00

562 lines
16 KiB
C++

// Copyright Epic Games, Inc. All Rights Reserved.
#include "BreakpadSymbolEncoder.h"
#include <algorithm>
#include <iostream>
#include <fstream>
#include <limits>
#include <string>
#include <string.h>
#include <unordered_map>
#include <vector>
#define VERBOSE_DEBUG 0
namespace
{
/* Reads the whole file at Path into memory as raw bytes (binary mode, so no newline translation).
 * Returns an empty string when the file cannot be opened, when its size cannot be determined,
 * or when it is empty. Callers cannot tell those cases apart.
 */
std::string ReadInFile(const std::string& Path)
{
	std::ifstream InFile(Path.c_str(), std::ios::in | std::ios::binary);
	if (!InFile)
	{
		return {};
	}

	InFile.seekg(0, std::ios::end);
	const std::streamoff FileSize = InFile.tellg();
	// tellg() reports -1 on failure; feeding that to resize() would ask for a gigantic allocation and throw
	if (FileSize <= 0)
	{
		return {};
	}
	InFile.seekg(0, std::ios::beg);

	std::string Out(static_cast<size_t>(FileSize), '\0');
	InFile.read(&Out[0], static_cast<std::streamsize>(Out.size()));
	// Keep only the bytes that were really read in case the read came up short
	Out.resize(static_cast<size_t>(InFile.gcount()));
	return Out;
}
/* Splits RawBytes into lines, accepting both LF and CRLF line endings. The terminators are
 * stripped from the returned lines. Empty lines are kept as empty entries, but a terminator at the
 * very end of the input does not produce an extra trailing empty entry.
 */
std::vector<std::string> SplitFilePerLine(std::string&& RawBytes)
{
	std::vector<std::string> Out;
	size_t Current = 0;
	while (Current < RawBytes.size())
	{
		const size_t NewLinePos = RawBytes.find('\n', Current);
		if (NewLinePos == std::string::npos)
		{
			// The last line has no terminator, take whatever is left
			Out.push_back(RawBytes.substr(Current));
			break;
		}

		size_t LineEnd = NewLinePos;
		// Check if we are CRLF \r\n, only looking inside the current line so a newline at index 0 is handled too
		if (LineEnd > Current && RawBytes[LineEnd - 1] == '\r')
		{
			LineEnd--;
		}

		Out.push_back(RawBytes.substr(Current, LineEnd - Current));
		Current = NewLinePos + 1;
	}
	return Out;
}
/* Splits Line on its first n spaces, producing exactly n + 1 entries. The last entry is the
 * untouched remainder of the line, so it may itself contain spaces (e.g. a symbol name).
 * Returns an empty vector, after logging to stderr, when Line holds fewer than n spaces.
 */
std::vector<std::string> SplitLineIntoNEntries(const std::string& Line, size_t n)
{
	std::vector<std::string> Out;
	size_t Current = 0;
	for (size_t i = 0; i < n; i++)
	{
		const size_t End = Line.find(' ', Current);
		if (End == std::string::npos)
		{
			// Without this check npos + 1 wraps Current back to 0 and the split silently restarts from the front of the line
			std::cerr << "Failed to split the string by an expected amount\n";
			return {};
		}
		Out.push_back(Line.substr(Current, End - Current));
		Current = End + 1;
	}
	Out.push_back(Line.substr(Current));
	return Out;
}
// Returns true when the leading characters of String are exactly With. An empty With always matches.
bool BeginsWith(const std::string& String, const std::string& With)
{
	if (With.size() > String.size())
	{
		// String is too short to hold the prefix
		return false;
	}
	return std::equal(With.begin(), With.end(), String.begin());
}
// For the symbol file only expect lower case letters for hex, i.e. the characters [0-9a-f]
constexpr bool IsHex(char c)
{
	return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f');
}
/* Returns true when String starts with a non-empty run of hex characters (as accepted by IsHex)
 * that either fills the whole string or is ended by a space. Anything after that first space is
 * not inspected.
 */
bool BeginsWithHex(const std::string& String)
{
	// An empty string has no leading token at all, so it cannot begin with hex
	if (String.empty())
	{
		return false;
	}

	for (size_t i = 0; i < String.size(); i++)
	{
		// The first space ends the leading token. A space at index 0 falls through and is rejected below
		if (i > 0 && String[i] == ' ')
		{
			break;
		}
		if (!IsHex(String[i]))
		{
			return false;
		}
	}
	return true;
}
/* Describes the address range a symbol covers. FUNC entries carry their real Size, which lets us
 * generate filler Records for any gap between one FUNC and the next symbol. PUBLIC symbols are
 * assumed to extend up to the next symbol, so they never need a filler Record.
 */
struct SymbolChunk
{
	uint64_t Address;
	uint32_t Size;
	bool bPublic;

	// Orders chunks by start address only, Size and bPublic take no part in the comparison
	friend bool operator<(const SymbolChunk& Lhs, const SymbolChunk& Rhs)
	{
		return Rhs.Address > Lhs.Address;
	}
};
/* Keeps track of a file name together with its relative offset, in bytes, from the start of the
 * contiguous chunk of file name strings to where this name begins.
 */
struct FileWithOffset
{
// File name as stored by ParseSymbolFile: '\\' replaced with '/' and a trailing "\n" appended
std::string Name;
// Sum of the sizes of all file names recorded before this one
uint32_t RelativeOffset;
};
/* Keeps track of a symbol name together with its relative offset, in bytes, from the start of the
 * contiguous chunk of symbol name strings to where this name begins.
 */
struct SymbolWithOffset
{
// Symbol name as stored by ParseSymbolFile, with a trailing "\n" appended
std::string Name;
// Sum of the sizes of all symbol names recorded before this one
uint32_t RelativeOffset;
};
// Holds the fields of one parsed line of the form: PUBLIC address parameter_size name
struct PublicRecord
{
// Address field, parsed as hexadecimal
uint64_t Address;
// parameter_size field, kept as the raw text from the line
std::string ParameterSize;
// name field, i.e. the remainder of the line
std::string Name;
};
// https://github.com/google/breakpad/blob/master/docs/symbol_files.md
//
// Prefix : Info : Number of spaces
// ------------------------------------------------------------------
// MODULE : operatingsystem architecture id name : 4
// FILE : number name : 2
// FUNC m : address size parameter_size name : 5
// FUNC : address size parameter_size name : 4
// address : size line filenum : 3
// PUBLIC m : address parameter_size name : 4
// PUBLIC : address parameter_size name : 3
// STACK : : 0 // Ignore
// INFO : : 0 // Ignore
/* Number of separating spaces to split on for each kind of line, matching the table above. Knowing
 * the count up front lets us cut a line into its fields without a lexer/parser: everything after
 * the last expected space is taken as the final field, even if it contains more spaces.
 */
constexpr size_t ExpectedFileSpaces = 2;
constexpr size_t ExpectedFuncSpaces = 4;
constexpr size_t ExpectedLineSpaces = 3;
constexpr size_t ExpectedPublicSpaces = 3;
bool ParseSymbolFile(const std::string& SymbolFile, std::vector<Record>& out_Records, std::vector<FileWithOffset>& out_FileRecords, std::vector<SymbolWithOffset>& out_SymbolNames)
{
std::unordered_map<uint64_t, uint64_t> FuncRecords;
std::vector<PublicRecord> PublicRecords;
std::vector<SymbolChunk> SymbolChunks;
uint32_t RelativeFileOffset = 0;
uint32_t RelativeSymbolNameOffset = 0;
int LineCount = 0;
int IgnoreCount = 0;
int ActualLineCount = 0;
bool bFirstLineRecordFromFunc = false;
std::vector<std::string> SplitLines = SplitFilePerLine(ReadInFile(SymbolFile));
if (SplitLines.empty())
{
std::cerr << "Failed to read file: '" << SymbolFile << "'" << '\n';
return false;
}
for (auto const& Line : SplitLines)
{
// address size line filenum
if (BeginsWithHex(Line))
{
Record Out;
char const* RawLine = Line.c_str();
char* End;
Out.Address = std::strtoull(RawLine, &End, 16);
// Need to skip the size entry
End = strchr(End + 1, ' ');
Out.LineNumber = std::strtoul(End, &End, 10);
// Store the actual index into the contiguous memory, which we will convert into a RelativeOffset at the end when we have the totals
Out.FileRelativeOffset = std::strtoul(End, nullptr, 10);
/* An example of what a FUNC + LINE records would look like:
* FUNC
* LINE RECORD LineNumber FileNumber
* LINE RECORD LineNumber FileNumber
* ....
* LINE RECORD LineNumber FileNumber
*
* Compress simply ignores a LINE RECORD *if* the previous LINE RECORD has the same LineNumber and FileNumber
* This will give us a larger Chunk size for this entry but for our use case its not required.
*/
if (bFirstLineRecordFromFunc)
{
// Same as the FileAbsoluteOffset but for the Symbol name. Need to covert at the end when all the memory we are going to write is in the structure
Out.SymbolRelativeOffset = static_cast<uint32_t>(out_SymbolNames.size() - 1);
out_Records.emplace_back(Out);
bFirstLineRecordFromFunc = false;
ActualLineCount++; // TODO Remove just for numbers
}
else
{
Record LastRecord = out_Records.back();
// We at times have a line number of zero from dump_syms, this can mess up compressing chunks of the same PC range
// just to save on size lets just assume if its a zero, use the previous line number. While not correct, not any less
// incorrect then 0, and helps reduce the sym file size by a good amount
if (Out.LineNumber == 0)
{
Out.LineNumber = LastRecord.LineNumber;
}
if (LastRecord.LineNumber != Out.LineNumber || LastRecord.FileRelativeOffset != Out.FileRelativeOffset)
{
// Same as the FileAbsoluteOffset but for the Symbol name. Need to covert at the end when all the memory we are going to write is in the structure
Out.SymbolRelativeOffset = static_cast<uint32_t>(out_SymbolNames.size() - 1);
out_Records.emplace_back(Out);
ActualLineCount++; // TODO Remove just for numbers
}
}
LineCount++;
}
// FUNC address size parameter_size name
else if (BeginsWith(Line, "FUNC"))
{
size_t FuncSpaces = ExpectedFuncSpaces;
size_t FirstValue = 1;
if (BeginsWith(Line, "FUNC m"))
{
FuncSpaces++;
FirstValue++;
}
std::vector<std::string> FuncSplit = SplitLineIntoNEntries(Line, FuncSpaces);
if (!FuncSplit.empty())
{
uint64_t Address = std::strtoull(FuncSplit[FirstValue].c_str(), 0, 16);
uint32_t Size = std::strtoul(FuncSplit[FirstValue + 1].c_str(), 0, 16);
FuncRecords[Address] = Address;
out_SymbolNames.push_back({FuncSplit[FirstValue + 3] + "\n", RelativeSymbolNameOffset});
RelativeSymbolNameOffset += static_cast<uint32_t>(out_SymbolNames.back().Name.size()) * sizeof(char);
SymbolChunks.push_back({Address, Size, false});
bFirstLineRecordFromFunc = true;
}
else
{
std::cerr << "ERROR: Failed to split a FUNC line:\n " << Line << '\n';
}
}
// PUBLIC address parameter_size name
else if (BeginsWith(Line, "PUBLIC"))
{
size_t PublicSpaces = ExpectedPublicSpaces;
size_t FirstValue = 1;
if (BeginsWith(Line, "PUBLIC m"))
{
PublicSpaces++;
FirstValue++;
}
std::vector<std::string> PublicSplit = SplitLineIntoNEntries(Line, PublicSpaces);
if (!PublicSplit.empty())
{
PublicRecords.push_back({
std::strtoull(PublicSplit[FirstValue].c_str(), 0, 16),
PublicSplit[FirstValue + 1],
PublicSplit[FirstValue + 2]
});
}
else
{
std::cerr << "ERROR: Failed to split a PUBLIC line:\n " << Line << '\n';
}
}
// FILE number name
else if (BeginsWith(Line, "FILE"))
{
std::vector<std::string> FileSplit = SplitLineIntoNEntries(Line, ExpectedFileSpaces);
if (!FileSplit.empty())
{
// Add a newline as we'll need to use that when reading later
std::string Filename = FileSplit[2] + "\n";
// Maintain one style of pathing
std::replace(std::begin(Filename), std::end(Filename), '\\', '/');
out_FileRecords.push_back({Filename, RelativeFileOffset});
RelativeFileOffset += static_cast<uint32_t>(out_FileRecords.back().Name.size()) * sizeof(char);
}
else
{
std::cerr << "ERROR: Failed to split a FILE line:\n " << Line << '\n';
}
}
else if (BeginsWith(Line, "STACK") ||
BeginsWith(Line, "INFO") ||
BeginsWith(Line, "MODULE") ||
BeginsWith(Line, "INLINE"))
{
// Ignore
IgnoreCount++;
}
else
{
std::cerr << "ERROR: Unepxected line: " << Line << '\n';
return false;
}
}
// Only add Records for PUBLIC symbols that are not already captured by a FUNC entry
int TotalPublicKept = 0;
for (auto const& PRecord : PublicRecords)
{
if (FuncRecords.find(PRecord.Address) == FuncRecords.end())
{
Record Out;
Out.Address = PRecord.Address;
Out.FileRelativeOffset = static_cast<uint32_t>(-1);
Out.LineNumber = static_cast<uint32_t>(-1);
out_SymbolNames.push_back({PRecord.Name + "\n", RelativeSymbolNameOffset});
RelativeSymbolNameOffset += static_cast<uint32_t>(out_SymbolNames.back().Name.size()) * sizeof(char);
// We just pushed a new symbol on the list, use that as the index when we look up offsets later
Out.SymbolRelativeOffset = static_cast<uint32_t>(out_SymbolNames.size() - 1);
out_Records.emplace_back(Out);
// Add all the PUBLIC symbols we need to account for so we dont add dummy Records in their locations
SymbolChunks.push_back({Out.Address, 0, true});
TotalPublicKept++;
}
}
// We have put all the FUNC and PUBLIC (non duplicates), need to sort them before generating the dummy entries
std::sort(SymbolChunks.begin(), SymbolChunks.end());
// Dummy symbol name
out_SymbolNames.push_back({"?????????????\n", RelativeSymbolNameOffset});
RelativeSymbolNameOffset += static_cast<uint32_t>(out_SymbolNames.back().Name.size()) * sizeof(char);
int ChunksAdded = 0;
for (size_t i = 0; i < SymbolChunks.size() - 1; i++)
{
// We assume all public symbols extend to the next symbol
if (!SymbolChunks[i].bPublic)
{
uint64_t Address = SymbolChunks[i].Address;
uint64_t NextAddress = SymbolChunks[i + 1].Address;
uint32_t Size = SymbolChunks[i].Size;
if (Address + Size != NextAddress)
{
// Add a dummy symbol that fills in the Hole between symbols so we can assume NextAddress - Address == Size
out_Records.push_back({
Address + Size,
static_cast<uint32_t>(-1),
static_cast<uint32_t>(-1),
static_cast<uint32_t>(out_SymbolNames.size() - 1)
});
ChunksAdded++;
}
}
}
// Add a final dummy record for the last entry. This way you can get the size of the last entry (which is just assumed to be 4 bytes as it'll
// be a public function with no defined size).
out_Records.push_back({
out_Records.back().Address + 0x4,
static_cast<uint32_t>(-1),
static_cast<uint32_t>(-1),
static_cast<uint32_t>(out_SymbolNames.size() - 1)
});
#if VERBOSE_DEBUG
std::cout << "TotalLines: " << LineCount << " Actual Lines Added: " << ActualLineCount << " Percent compressed: " << 100 - (ActualLineCount / (float)LineCount * 100) << "%" << '\n';
std::cout << "TotalPublic: " << PublicRecords.size() << " Actual Public Added: " << TotalPublicKept << " Percent removed: " << 100 - (TotalPublicKept / (float)PublicRecords.size() * 100) << "%" << '\n';
std::cout << std::dec << "File: " << out_FileRecords.size() << "\t" << (out_FileRecords.size() / (float)SplitLines.size()) * 100 << '\n'
<< "Func: " << FuncRecords.size() << "\t" << (FuncRecords.size() / (float)SplitLines.size()) * 100 << '\n'
<< "Public: " << PublicRecords.size() << "\t" << (PublicRecords.size() / (float)SplitLines.size()) * 100 << '\n'
<< "Ignore: " << IgnoreCount << "\t" << (IgnoreCount / (float)SplitLines.size()) * 100 << '\n'
<< "Line: " << LineCount << "\t" << (LineCount / (float)SplitLines.size()) * 100 << '\n'
<< "Total: " << SplitLines.size() << '\n';
std::cout << "Total Record: " << out_Records.size() << "\n"
<< " TotalLines: " << ActualLineCount << "\n"
<< " TotalPublic: " << TotalPublicKept << "\n"
<< " EmptyChunks: " << ChunksAdded << "\n";
#endif
return true;
}
void EncodeSymbolFile(const std::string& SymbolFile, const std::string& OutputFile)
{
std::vector<Record> Records;
std::vector<FileWithOffset> FileRecords;
std::vector<SymbolWithOffset> SymbolNames;
std::string Filename;
if (!ParseSymbolFile(SymbolFile, Records, FileRecords, SymbolNames))
{
std::cerr << "Failed to parse '" << SymbolFile << "'" << '\n';
return;
}
if (SymbolFile.empty() || OutputFile.empty())
{
std::cerr << "ERROR: Symbol file or Output file is empty '" << SymbolFile << "' '" << OutputFile << "'" << '\n';
return;
}
std::sort(Records.begin(), Records.end());
uint64_t RecordsSize = Records.size() * sizeof(Record);
uint64_t FilesBytesSize = 0;
for (size_t i = 0; i < FileRecords.size(); i++)
{
FilesBytesSize += FileRecords[i].Name.size() * sizeof(char);
}
uint64_t SymbolBytesSize = 0;
for (size_t i = 0; i < SymbolNames.size(); i++)
{
SymbolBytesSize += SymbolNames[i].Name.size() * sizeof(char);
}
if (FilesBytesSize + SymbolBytesSize > std::numeric_limits<uint32_t>::max())
{
std::cerr << "ERROR: String section larger then the uint32_t::max() cannot encode the offsets" << '\n';
return;
}
if (Records.size() > std::numeric_limits<uint32_t>::max())
{
std::cerr << "ERROR: Record count greater then uint32_t::max() cannot encode the record count" << '\n';
return;
}
// Replace all the stored index with relative offsets from the start of the strings section in the output file
for (auto& R : Records)
{
if (R.FileRelativeOffset < FileRecords.size())
{
R.FileRelativeOffset = FileRecords[R.FileRelativeOffset].RelativeOffset;
}
else if (R.FileRelativeOffset != (uint32_t)-1)
{
std::cerr << "Error FileRelativeOffset larger then expected range, got: " << R.FileRelativeOffset << " Expect less then: " << FileRecords.size() << '\n';
}
if (R.SymbolRelativeOffset != static_cast<uint32_t>(-1))
{
R.SymbolRelativeOffset = SymbolNames[R.SymbolRelativeOffset].RelativeOffset + static_cast<uint32_t>(FilesBytesSize);
}
}
// If we require larger then 4GB files... we'll need to reconsider this
RecordsHeader Header{static_cast<uint32_t>(Records.size())};
std::ofstream os(OutputFile, std::ios::binary);
if (os.is_open())
{
os.write((char*)&Header, sizeof(RecordsHeader));
os.write((char*)Records.data(), RecordsSize);
for (size_t i = 0; i < FileRecords.size(); i++)
{
os.write((char*)&FileRecords[i].Name[0], FileRecords[i].Name.size() * sizeof(char));
}
for (size_t i = 0; i < SymbolNames.size(); i++)
{
os.write((char*)&SymbolNames[i].Name[0], SymbolNames[i].Name.size() * sizeof(char));
}
os.close();
#if VERBOSE_DEBUG
std::cout << " OutputFile: " << OutputFile << '\n';
std::cout << " RecordsSize : 0x" << std::hex << RecordsSize << '\n';
std::cout << " RecordOut Offset: 0x" << std::hex << RecordsSize + sizeof(RecordsHeader) << '\n';
std::cout << " Record + Files Offset: 0x" << RecordsSize + FilesBytesSize << std::dec << '\n';
#endif
}
else
{
std::cerr << "ERROR: Failed to open file for writing: " << OutputFile << '\n';
}
}
}
/* Entry point. Expects the path of the symbol file to read as the first argument and the path of
 * the file to write as the second; any further arguments are ignored.
 * Returns 1 after printing the usage text when an argument is missing, 0 otherwise.
 * EncodeSymbolFile reports its own failures on stderr and they do not affect the exit code.
 */
int main(int argc, char* argv[])
{
	if (argc <= 2)
	{
		// argv[0] may be missing when argc is 0, streaming a null pointer would be undefined behaviour
		const char* ProgramName = (argc > 0 && argv[0] != nullptr) ? argv[0] : "BreakpadSymbolEncoder";
		std::cerr << "Usage: " << ProgramName << " <path/to/symbol/file> <path/to/output/file>" << '\n';
		return 1;
	}

	std::string SymbolFile(argv[1]);
	std::string OutputFile(argv[2]);

#if VERBOSE_DEBUG
	std::cout << "Attempting to read Symbol file: '" << SymbolFile << "'" << '\n';
#endif

	EncodeSymbolFile(SymbolFile, OutputFile);
	return 0;
}