// Copyright Epic Games, Inc. All Rights Reserved. #include "BreakpadSymbolEncoder.h" #include #include #include #include #include #include #include #include #define VERBOSE_DEBUG 0 namespace { std::string ReadInFile(const std::string& Path) { std::ifstream InFile(Path.c_str(), std::ios::in | std::ios::binary); if (InFile) { std::string Out; InFile.seekg(0, std::ios::end); Out.resize(InFile.tellg()); InFile.seekg(0, std::ios::beg); InFile.read(&Out[0], Out.size()); InFile.close(); return Out; } return {}; } std::vector SplitFilePerLine(std::string&& RawBytes) { std::vector Out; size_t Current = 0; while (Current != std::string::npos && Current < RawBytes.size()) { size_t NewLineEnd = RawBytes.find("\n", Current); size_t NewLineSize = 1; // Check if we are CRLF \r\n if (NewLineEnd > 0 && NewLineEnd != std::string::npos) { if (RawBytes[NewLineEnd - 1] == '\r') { NewLineEnd--; NewLineSize = 2; } Out.push_back(RawBytes.substr(Current, NewLineEnd - Current)); Current = NewLineEnd + NewLineSize; } else { Out.push_back(RawBytes.substr(Current)); break; } } return Out; } std::vector SplitLineIntoNEntries(const std::string& Line, size_t n) { std::vector Out; size_t Current = 0; for (size_t i = 0; i < n; i++) { size_t End = Line.find(" ", Current); Out.push_back(Line.substr(Current, End - Current)); Current = End + 1; } Out.push_back(Line.substr(Current)); if (Out.size() != n + 1) { std::cerr << "Failed to split the string by an expected amount\n"; return {}; } return Out; } bool BeginsWith(const std::string& String, const std::string& With) { return String.compare(0, With.size(), With) == 0; } // For the symbol file only expect lower case letters for hex constexpr bool IsHex(char c) { return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z'); } bool BeginsWithHex(const std::string& String) { for (size_t i = 0; i < String.size(); i++) { if (i > 0 && String[i] == ' ') { break; } else if (!IsHex(String[i])) { return false; } } return true; } /* Used to keep track of the actual sizes of FUNCs so we can generate * Records to fill in the gap between FUNCs as this would be a major issue. * PUBLIC symbols are assumed to have a size extending to the next symbol so * these dont require any dummy Records to fill in the gaps */ struct SymbolChunk { uint64_t Address; uint32_t Size; bool bPublic; friend bool operator<(const SymbolChunk& A, const SymbolChunk& B) { return A.Address < B.Address; } }; /* Keeps track of the Filename as well as a relative offset in bytes * from the start of a contigous chunk of memory to its current location */ struct FileWithOffset { std::string Name; uint32_t RelativeOffset; }; /* Keeps track of the Symbol name as well as a relative offset in bytes * from the start of a contigous chunk of memory to its current location */ struct SymbolWithOffset { std::string Name; uint32_t RelativeOffset; }; // PUBLIC address parameter_size name struct PublicRecord { uint64_t Address; std::string ParameterSize; std::string Name; }; // https://github.com/google/breakpad/blob/master/docs/symbol_files.md // // Prefix : Info : Number of spaces // ------------------------------------------------------------------ // MODULE : operatingsystem architecture id name : 4 // FILE : number name : 2 // FUNC m : address size parameter_size name : 5 // FUNC : address size parameter_size name : 4 // address : size line filenum : 3 // PUBLIC m : address parameter_size name : 4 // PUBLIC : address parameter_size name : 3 // STACK : : 0 // Ignore // INFO : : 0 // Ignore /* We use these to split up an expected LINE into its assumed line. Gets around needing a * lexer/parser where we can just assume the number of spaces and from there what is what */ const size_t ExpectedFileSpaces = 2; const size_t ExpectedFuncSpaces = 4; const size_t ExpectedLineSpaces = 3; const size_t ExpectedPublicSpaces = 3; bool ParseSymbolFile(const std::string& SymbolFile, std::vector& out_Records, std::vector& out_FileRecords, std::vector& out_SymbolNames) { std::unordered_map FuncRecords; std::vector PublicRecords; std::vector SymbolChunks; uint32_t RelativeFileOffset = 0; uint32_t RelativeSymbolNameOffset = 0; int LineCount = 0; int IgnoreCount = 0; int ActualLineCount = 0; bool bFirstLineRecordFromFunc = false; std::vector SplitLines = SplitFilePerLine(ReadInFile(SymbolFile)); if (SplitLines.empty()) { std::cerr << "Failed to read file: '" << SymbolFile << "'" << '\n'; return false; } for (auto const& Line : SplitLines) { // address size line filenum if (BeginsWithHex(Line)) { Record Out; char const* RawLine = Line.c_str(); char* End; Out.Address = std::strtoull(RawLine, &End, 16); // Need to skip the size entry End = strchr(End + 1, ' '); Out.LineNumber = std::strtoul(End, &End, 10); // Store the actual index into the contiguous memory, which we will convert into a RelativeOffset at the end when we have the totals Out.FileRelativeOffset = std::strtoul(End, nullptr, 10); /* An example of what a FUNC + LINE records would look like: * FUNC * LINE RECORD LineNumber FileNumber * LINE RECORD LineNumber FileNumber * .... * LINE RECORD LineNumber FileNumber * * Compress simply ignores a LINE RECORD *if* the previous LINE RECORD has the same LineNumber and FileNumber * This will give us a larger Chunk size for this entry but for our use case its not required. */ if (bFirstLineRecordFromFunc) { // Same as the FileAbsoluteOffset but for the Symbol name. Need to covert at the end when all the memory we are going to write is in the structure Out.SymbolRelativeOffset = static_cast(out_SymbolNames.size() - 1); out_Records.emplace_back(Out); bFirstLineRecordFromFunc = false; ActualLineCount++; // TODO Remove just for numbers } else { Record LastRecord = out_Records.back(); // We at times have a line number of zero from dump_syms, this can mess up compressing chunks of the same PC range // just to save on size lets just assume if its a zero, use the previous line number. While not correct, not any less // incorrect then 0, and helps reduce the sym file size by a good amount if (Out.LineNumber == 0) { Out.LineNumber = LastRecord.LineNumber; } if (LastRecord.LineNumber != Out.LineNumber || LastRecord.FileRelativeOffset != Out.FileRelativeOffset) { // Same as the FileAbsoluteOffset but for the Symbol name. Need to covert at the end when all the memory we are going to write is in the structure Out.SymbolRelativeOffset = static_cast(out_SymbolNames.size() - 1); out_Records.emplace_back(Out); ActualLineCount++; // TODO Remove just for numbers } } LineCount++; } // FUNC address size parameter_size name else if (BeginsWith(Line, "FUNC")) { size_t FuncSpaces = ExpectedFuncSpaces; size_t FirstValue = 1; if (BeginsWith(Line, "FUNC m")) { FuncSpaces++; FirstValue++; } std::vector FuncSplit = SplitLineIntoNEntries(Line, FuncSpaces); if (!FuncSplit.empty()) { uint64_t Address = std::strtoull(FuncSplit[FirstValue].c_str(), 0, 16); uint32_t Size = std::strtoul(FuncSplit[FirstValue + 1].c_str(), 0, 16); FuncRecords[Address] = Address; out_SymbolNames.push_back({FuncSplit[FirstValue + 3] + "\n", RelativeSymbolNameOffset}); RelativeSymbolNameOffset += static_cast(out_SymbolNames.back().Name.size()) * sizeof(char); SymbolChunks.push_back({Address, Size, false}); bFirstLineRecordFromFunc = true; } else { std::cerr << "ERROR: Failed to split a FUNC line:\n " << Line << '\n'; } } // PUBLIC address parameter_size name else if (BeginsWith(Line, "PUBLIC")) { size_t PublicSpaces = ExpectedPublicSpaces; size_t FirstValue = 1; if (BeginsWith(Line, "PUBLIC m")) { PublicSpaces++; FirstValue++; } std::vector PublicSplit = SplitLineIntoNEntries(Line, PublicSpaces); if (!PublicSplit.empty()) { PublicRecords.push_back({ std::strtoull(PublicSplit[FirstValue].c_str(), 0, 16), PublicSplit[FirstValue + 1], PublicSplit[FirstValue + 2] }); } else { std::cerr << "ERROR: Failed to split a PUBLIC line:\n " << Line << '\n'; } } // FILE number name else if (BeginsWith(Line, "FILE")) { std::vector FileSplit = SplitLineIntoNEntries(Line, ExpectedFileSpaces); if (!FileSplit.empty()) { // Add a newline as we'll need to use that when reading later std::string Filename = FileSplit[2] + "\n"; // Maintain one style of pathing std::replace(std::begin(Filename), std::end(Filename), '\\', '/'); out_FileRecords.push_back({Filename, RelativeFileOffset}); RelativeFileOffset += static_cast(out_FileRecords.back().Name.size()) * sizeof(char); } else { std::cerr << "ERROR: Failed to split a FILE line:\n " << Line << '\n'; } } else if (BeginsWith(Line, "STACK") || BeginsWith(Line, "INFO") || BeginsWith(Line, "MODULE") || BeginsWith(Line, "INLINE")) { // Ignore IgnoreCount++; } else { std::cerr << "ERROR: Unepxected line: " << Line << '\n'; return false; } } // Only add Records for PUBLIC symbols that are not already captured by a FUNC entry int TotalPublicKept = 0; for (auto const& PRecord : PublicRecords) { if (FuncRecords.find(PRecord.Address) == FuncRecords.end()) { Record Out; Out.Address = PRecord.Address; Out.FileRelativeOffset = static_cast(-1); Out.LineNumber = static_cast(-1); out_SymbolNames.push_back({PRecord.Name + "\n", RelativeSymbolNameOffset}); RelativeSymbolNameOffset += static_cast(out_SymbolNames.back().Name.size()) * sizeof(char); // We just pushed a new symbol on the list, use that as the index when we look up offsets later Out.SymbolRelativeOffset = static_cast(out_SymbolNames.size() - 1); out_Records.emplace_back(Out); // Add all the PUBLIC symbols we need to account for so we dont add dummy Records in their locations SymbolChunks.push_back({Out.Address, 0, true}); TotalPublicKept++; } } // We have put all the FUNC and PUBLIC (non duplicates), need to sort them before generating the dummy entries std::sort(SymbolChunks.begin(), SymbolChunks.end()); // Dummy symbol name out_SymbolNames.push_back({"?????????????\n", RelativeSymbolNameOffset}); RelativeSymbolNameOffset += static_cast(out_SymbolNames.back().Name.size()) * sizeof(char); int ChunksAdded = 0; for (size_t i = 0; i < SymbolChunks.size() - 1; i++) { // We assume all public symbols extend to the next symbol if (!SymbolChunks[i].bPublic) { uint64_t Address = SymbolChunks[i].Address; uint64_t NextAddress = SymbolChunks[i + 1].Address; uint32_t Size = SymbolChunks[i].Size; if (Address + Size != NextAddress) { // Add a dummy symbol that fills in the Hole between symbols so we can assume NextAddress - Address == Size out_Records.push_back({ Address + Size, static_cast(-1), static_cast(-1), static_cast(out_SymbolNames.size() - 1) }); ChunksAdded++; } } } // Add a final dummy record for the last entry. This way you can get the size of the last entry (which is just assumed to be 4 bytes as it'll // be a public function with no defined size). out_Records.push_back({ out_Records.back().Address + 0x4, static_cast(-1), static_cast(-1), static_cast(out_SymbolNames.size() - 1) }); #if VERBOSE_DEBUG std::cout << "TotalLines: " << LineCount << " Actual Lines Added: " << ActualLineCount << " Percent compressed: " << 100 - (ActualLineCount / (float)LineCount * 100) << "%" << '\n'; std::cout << "TotalPublic: " << PublicRecords.size() << " Actual Public Added: " << TotalPublicKept << " Percent removed: " << 100 - (TotalPublicKept / (float)PublicRecords.size() * 100) << "%" << '\n'; std::cout << std::dec << "File: " << out_FileRecords.size() << "\t" << (out_FileRecords.size() / (float)SplitLines.size()) * 100 << '\n' << "Func: " << FuncRecords.size() << "\t" << (FuncRecords.size() / (float)SplitLines.size()) * 100 << '\n' << "Public: " << PublicRecords.size() << "\t" << (PublicRecords.size() / (float)SplitLines.size()) * 100 << '\n' << "Ignore: " << IgnoreCount << "\t" << (IgnoreCount / (float)SplitLines.size()) * 100 << '\n' << "Line: " << LineCount << "\t" << (LineCount / (float)SplitLines.size()) * 100 << '\n' << "Total: " << SplitLines.size() << '\n'; std::cout << "Total Record: " << out_Records.size() << "\n" << " TotalLines: " << ActualLineCount << "\n" << " TotalPublic: " << TotalPublicKept << "\n" << " EmptyChunks: " << ChunksAdded << "\n"; #endif return true; } void EncodeSymbolFile(const std::string& SymbolFile, const std::string& OutputFile) { std::vector Records; std::vector FileRecords; std::vector SymbolNames; std::string Filename; if (!ParseSymbolFile(SymbolFile, Records, FileRecords, SymbolNames)) { std::cerr << "Failed to parse '" << SymbolFile << "'" << '\n'; return; } if (SymbolFile.empty() || OutputFile.empty()) { std::cerr << "ERROR: Symbol file or Output file is empty '" << SymbolFile << "' '" << OutputFile << "'" << '\n'; return; } std::sort(Records.begin(), Records.end()); uint64_t RecordsSize = Records.size() * sizeof(Record); uint64_t FilesBytesSize = 0; for (size_t i = 0; i < FileRecords.size(); i++) { FilesBytesSize += FileRecords[i].Name.size() * sizeof(char); } uint64_t SymbolBytesSize = 0; for (size_t i = 0; i < SymbolNames.size(); i++) { SymbolBytesSize += SymbolNames[i].Name.size() * sizeof(char); } if (FilesBytesSize + SymbolBytesSize > std::numeric_limits::max()) { std::cerr << "ERROR: String section larger then the uint32_t::max() cannot encode the offsets" << '\n'; return; } if (Records.size() > std::numeric_limits::max()) { std::cerr << "ERROR: Record count greater then uint32_t::max() cannot encode the record count" << '\n'; return; } // Replace all the stored index with relative offsets from the start of the strings section in the output file for (auto& R : Records) { if (R.FileRelativeOffset < FileRecords.size()) { R.FileRelativeOffset = FileRecords[R.FileRelativeOffset].RelativeOffset; } else if (R.FileRelativeOffset != (uint32_t)-1) { std::cerr << "Error FileRelativeOffset larger then expected range, got: " << R.FileRelativeOffset << " Expect less then: " << FileRecords.size() << '\n'; } if (R.SymbolRelativeOffset != static_cast(-1)) { R.SymbolRelativeOffset = SymbolNames[R.SymbolRelativeOffset].RelativeOffset + static_cast(FilesBytesSize); } } // If we require larger then 4GB files... we'll need to reconsider this RecordsHeader Header{static_cast(Records.size())}; std::ofstream os(OutputFile, std::ios::binary); if (os.is_open()) { os.write((char*)&Header, sizeof(RecordsHeader)); os.write((char*)Records.data(), RecordsSize); for (size_t i = 0; i < FileRecords.size(); i++) { os.write((char*)&FileRecords[i].Name[0], FileRecords[i].Name.size() * sizeof(char)); } for (size_t i = 0; i < SymbolNames.size(); i++) { os.write((char*)&SymbolNames[i].Name[0], SymbolNames[i].Name.size() * sizeof(char)); } os.close(); #if VERBOSE_DEBUG std::cout << " OutputFile: " << OutputFile << '\n'; std::cout << " RecordsSize : 0x" << std::hex << RecordsSize << '\n'; std::cout << " RecordOut Offset: 0x" << std::hex << RecordsSize + sizeof(RecordsHeader) << '\n'; std::cout << " Record + Files Offset: 0x" << RecordsSize + FilesBytesSize << std::dec << '\n'; #endif } else { std::cerr << "ERROR: Failed to open file for writing: " << OutputFile << '\n'; } } } int main(int argc, char* argv[]) { if (argc > 2) { std::string SymbolFile(argv[1]); std::string OutputFile(argv[2]); #if VERBOSE_DEBUG std::cout << "Attempting to read Symbol file: '" << SymbolFile << "'" << '\n'; #endif EncodeSymbolFile(SymbolFile, OutputFile); } else { std::cerr << "Usage: " << argv[0] << " " << '\n'; } }