diff --git a/CMakeLists.txt b/CMakeLists.txt index 05b5289..f8637e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -393,6 +393,7 @@ file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/src/bloaty_package.bloaty DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) add_library(libbloaty STATIC + src/arfile.cc src/bloaty.cc src/bloaty.h src/disassemble.cc diff --git a/src/arfile.cc b/src/arfile.cc new file mode 100644 index 0000000..20533d4 --- /dev/null +++ b/src/arfile.cc @@ -0,0 +1,136 @@ +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + + +#include +#include +#include +#include "absl/numeric/int128.h" +#include "absl/strings/escaping.h" +#include "absl/strings/string_view.h" +#include "absl/strings/substitute.h" +#include "arfile.h" +#include "bloaty.h" +#include "util.h" + +#include +#include +#include + +using absl::string_view; + +namespace bloaty { + +size_t StringViewToSize(string_view str) { + // Trim trailing whitespace (AR format allows space-padding in numeric fields) + while (!str.empty() && (str.back() == ' ' || str.back() == '\t')) { + str.remove_suffix(1); + } + size_t ret; + if (!absl::SimpleAtoi(str, &ret)) { + THROWF("couldn't convert string '$0' to integer.", str); + } + return ret; +} + +bool ArFile::MemberReader::ReadMember(MemberFile* file) { + struct Header { + char file_id[16]; + char modified_timestamp[12]; + char owner_id[6]; + char group_id[6]; + char mode[8]; + char size[10]; + char end[2]; + }; + + if (remaining_.size() == 0) { + return false; + } else if (remaining_.size() < sizeof(Header)) { + THROW("Premature EOF in AR data"); + } + + const Header* header = reinterpret_cast(remaining_.data()); + file->header = Consume(sizeof(Header)); + + string_view file_id(&header->file_id[0], sizeof(header->file_id)); + string_view size_str(&header->size[0], sizeof(header->size)); + file->size = StringViewToSize(size_str); + file->contents = Consume(file->size); + file->file_type = MemberFile::kNormal; + file->format = MemberFile::GNU; + + if (file_id[0] == '/') { + // Special filename, internal to the format. + if (file_id[1] == ' ') { + file->file_type = MemberFile::kSymbolTable; + } else if (file_id[1] == '/') { + file->file_type = MemberFile::kLongFilenameTable; + long_filenames_ = file->contents; + } else if (isdigit(file_id[1])) { + size_t offset = StringViewToSize(file_id.substr(1)); + size_t end = long_filenames_.find('/', offset); + + if (end == std::string::npos) { + THROW("Unterminated long filename"); + } + + file->filename = long_filenames_.substr(offset, end - offset); + } else { + THROW("Unexpected special filename in AR archive"); + } + } else if (file_id[0] == '#' && file_id[1] == '1' && + file_id[2] == '/') { + // Darwin-style long filename: #1/N where N is the embedded filename length + file->format = MemberFile::Darwin; + size_t offset = StringViewToSize(file_id.substr(3)); + + // Validate that the filename length doesn't exceed member content size + if (offset > file->contents.size()) { + THROWF("Darwin long filename offset ($0) exceeds member size ($1)", + offset, file->contents.size()); + } + + string_view filename_data = file->contents.substr(0, offset); + size_t null_pos = filename_data.find('\0'); + if (null_pos != string_view::npos) { + file->filename = filename_data.substr(0, null_pos); + } else { + file->filename = filename_data; + } + + // Darwin archives use "__.SYMDEF" or "__.SYMDEF SORTED" for symbol tables + // (GNU uses "/" for the same purpose) + if (file->filename == "__.SYMDEF" || file->filename == "__.SYMDEF SORTED") { + file->file_type = MemberFile::kSymbolTable; + } else { + file->contents = file->contents.substr(offset); + } + } else { + // Normal filename, slash-terminated. + size_t slash = file_id.find('/'); + + if (slash == std::string::npos) { + file->format = MemberFile::BSD; + THROW("BSD-style AR not yet implemented"); + } + + file->filename = file_id.substr(0, slash); + } + + return true; +} + +} // bloaty namespace + diff --git a/src/arfile.h b/src/arfile.h new file mode 100644 index 0000000..149d123 --- /dev/null +++ b/src/arfile.h @@ -0,0 +1,135 @@ +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef BLOATY_ARFILE_H_ +#define BLOATY_ARFILE_H_ + +#include +#include +#include +#include "absl/numeric/int128.h" +#include "absl/strings/escaping.h" +#include "absl/strings/string_view.h" +#include "absl/strings/substitute.h" +#include "bloaty.h" +#include "util.h" + +#include +#include +#include + +using absl::string_view; + +namespace bloaty { + +// ArFile ////////////////////////////////////////////////////////////////////// + +// For parsing .a files (static libraries). +// +// AR archives are used for static libraries and can contain multiple object +// files. The format is ancient but still widely used. There are three main +// variants: +// +// 1. GNU format: +// - Symbol table: member named "/" +// - Long filename table: member named "//" +// - Long filename references: members named "/N" where N is offset into table +// - Short filenames: slash-terminated in header (e.g., "foo.o/") +// +// 2. Darwin format: +// - Symbol table: member named "__.SYMDEF" or "__.SYMDEF SORTED" +// - Long filenames: embedded in member data, indicated by "#1/N" where N is length +// - Short filenames: same as GNU (slash-terminated) +// +// 3. BSD format: +// - Uses space-padded filenames instead of slash-terminated +// - Currently not implemented (throws error if detected) +// +// Archive structure: +// Magic: "!\n" (8 bytes) +// For each member: +// Header: 60 bytes (file_id, timestamp, owner, group, mode, size, end marker) +// Data: size bytes (padded to even boundary for alignment) +// +// The best documentation for this file format is Wikipedia: +// https://en.wikipedia.org/wiki/Ar_(Unix) + +class ArFile { + public: + ArFile(string_view data) + : magic_(StrictSubstr(data, 0, kMagicSize)), + contents_(data.substr(std::min(data.size(), kMagicSize))) {} + + bool IsOpen() const { return magic() == string_view(kMagic); } + + string_view magic() const { return magic_; } + string_view contents() const { return contents_; } + + struct MemberFile { + enum { + kSymbolTable, // Stores a symbol table. + kLongFilenameTable, // Stores long filenames, users should ignore. + kNormal, // Regular data file. + } file_type; + + enum { + GNU, + Darwin, + BSD + } format; + + string_view filename; // Only when file_type == kNormal + size_t size; + string_view header; + string_view contents; + }; + + class MemberReader { + public: + MemberReader(const ArFile& ar) : remaining_(ar.contents()) {} + bool ReadMember(MemberFile* file); + bool IsEof() const { return remaining_.size() == 0; } + + private: + string_view Consume(size_t n) { + n = (n % 2 == 0 ? n : n + 1); + if (remaining_.size() < n) { + THROW("premature end of file"); + } + string_view ret = remaining_.substr(0, n); + remaining_.remove_prefix(n); + return ret; + } + + string_view long_filenames_; + string_view remaining_; + }; + + private: + const string_view magic_; + const string_view contents_; + + static constexpr const char* kMagic = "!\n"; + static constexpr int kMagicSize = 8; +}; + +inline bool IsArchiveFile(string_view data) { + ArFile ar(data); + return ar.IsOpen(); +} + +} // namespace bloaty + +#endif // BLOATY_ARFILE_H_ + diff --git a/src/bloaty.cc b/src/bloaty.cc index f489c60..631b96b 100644 --- a/src/bloaty.cc +++ b/src/bloaty.cc @@ -85,6 +85,7 @@ struct DataSourceDefinition { constexpr DataSourceDefinition data_sources[] = { {DataSource::kArchiveMembers, "armembers", "the .o files in a .a file"}, + {DataSource::kArchs, "archs", "architecture slices in universal binaries"}, {DataSource::kCompileUnits, "compileunits", "source file for the .o file (translation unit). requires debug info."}, {DataSource::kInputFiles, "inputfiles", diff --git a/src/bloaty.h b/src/bloaty.h index 85515fc..c9e2d1f 100644 --- a/src/bloaty.h +++ b/src/bloaty.h @@ -58,6 +58,7 @@ enum class DataSource { kRawRanges, kSections, kSegments, + kArchs, // We always set this to one of the concrete symbol types below before // setting it on a sink. diff --git a/src/elf.cc b/src/elf.cc index ac0f610..9fc02ff 100644 --- a/src/elf.cc +++ b/src/elf.cc @@ -24,6 +24,8 @@ #include "bloaty.h" #include "util.h" +#include "arfile.h" + #include #include #include @@ -50,14 +52,6 @@ struct NullFunc { T operator()(T val) { return val; } }; -size_t StringViewToSize(string_view str) { - size_t ret; - if (!absl::SimpleAtoi(str, &ret)) { - THROWF("couldn't convert string '$0' to integer.", str); - } - return ret; -} - template void AdvancePastStruct(string_view* data) { *data = data->substr(sizeof(T)); @@ -577,127 +571,6 @@ bool ElfFile::FindSectionByName(std::string_view name, Section* section) const { return false; } - -// ArFile ////////////////////////////////////////////////////////////////////// - -// For parsing .a files (static libraries). -// -// The best documentation I've been able to find for this file format is -// Wikipedia: https://en.wikipedia.org/wiki/Ar_(Unix) -// -// So far we only parse the System V / GNU variant. - -class ArFile { - public: - ArFile(string_view data) - : magic_(StrictSubstr(data, 0, kMagicSize)), - contents_(data.substr(std::min(data.size(), kMagicSize))) {} - - bool IsOpen() const { return magic() == string_view(kMagic); } - - string_view magic() const { return magic_; } - string_view contents() const { return contents_; } - - struct MemberFile { - enum { - kSymbolTable, // Stores a symbol table. - kLongFilenameTable, // Stores long filenames, users should ignore. - kNormal, // Regular data file. - } file_type; - string_view filename; // Only when file_type == kNormal - size_t size; - string_view header; - string_view contents; - }; - - class MemberReader { - public: - MemberReader(const ArFile& ar) : remaining_(ar.contents()) {} - bool ReadMember(MemberFile* file); - bool IsEof() const { return remaining_.size() == 0; } - - private: - string_view Consume(size_t n) { - n = (n % 2 == 0 ? n : n + 1); - if (remaining_.size() < n) { - THROW("premature end of file"); - } - string_view ret = remaining_.substr(0, n); - remaining_.remove_prefix(n); - return ret; - } - - string_view long_filenames_; - string_view remaining_; - }; - - private: - const string_view magic_; - const string_view contents_; - - static constexpr const char* kMagic = "!\n"; - static constexpr int kMagicSize = 8; -}; - -bool ArFile::MemberReader::ReadMember(MemberFile* file) { - struct Header { - char file_id[16]; - char modified_timestamp[12]; - char owner_id[6]; - char group_id[6]; - char mode[8]; - char size[10]; - char end[2]; - }; - - if (remaining_.size() == 0) { - return false; - } else if (remaining_.size() < sizeof(Header)) { - THROW("Premature EOF in AR data"); - } - - const Header* header = reinterpret_cast(remaining_.data()); - file->header = Consume(sizeof(Header)); - - string_view file_id(&header->file_id[0], sizeof(header->file_id)); - string_view size_str(&header->size[0], sizeof(header->size)); - file->size = StringViewToSize(size_str); - file->contents = Consume(file->size); - file->file_type = MemberFile::kNormal; - - if (file_id[0] == '/') { - // Special filename, internal to the format. - if (file_id[1] == ' ') { - file->file_type = MemberFile::kSymbolTable; - } else if (file_id[1] == '/') { - file->file_type = MemberFile::kLongFilenameTable; - long_filenames_ = file->contents; - } else if (isdigit(file_id[1])) { - size_t offset = StringViewToSize(file_id.substr(1)); - size_t end = long_filenames_.find('/', offset); - - if (end == std::string::npos) { - THROW("Unterminated long filename"); - } - - file->filename = long_filenames_.substr(offset, end - offset); - } else { - THROW("Unexpected special filename in AR archive"); - } - } else { - // Normal filename, slash-terminated. - size_t slash = file_id.find('/'); - - if (slash == std::string::npos) { - THROW("BSD-style AR not yet implemented"); - } - - file->filename = file_id.substr(0, slash); - } - - return true; -} - void MaybeAddFileRange(const char* analyzer, RangeSink* sink, string_view label, string_view range) { if (sink) { @@ -771,11 +644,6 @@ static uint64_t ToVMAddr(uint64_t addr, uint64_t ndx, bool is_object) { } } -static bool IsArchiveFile(string_view data) { - ArFile ar(data); - return ar.IsOpen(); -} - static bool IsObjectFile(string_view data) { ElfFile elf(data); return IsArchiveFile(data) || (elf.IsOpen() && elf.header().e_type == ET_REL); @@ -1401,6 +1269,8 @@ class ElfObjectFile : public ObjectFile { DoReadELFSections(sink, kReportByEscapedSectionName); break; } + case DataSource::kArchs: + THROW("ELF files do not support 'archs' data source"); default: THROW("unknown data source"); } @@ -1479,8 +1349,19 @@ class ElfObjectFile : public ObjectFile { std::unique_ptr TryOpenELFFile(std::unique_ptr& file) { ElfFile elf(file->data()); ArFile ar(file->data()); - if (elf.IsOpen() || ar.IsOpen()) { + + if (elf.IsOpen()) { return std::unique_ptr(new ElfObjectFile(std::move(file))); + } else if (ar.IsOpen()) { + ArFile::MemberFile member; + ArFile::MemberReader reader(ar); + + /* If the first archive member is GNU handle it as ELF */ + if (reader.ReadMember(&member) && member.format == ArFile::MemberFile::GNU) { + return std::unique_ptr(new ElfObjectFile(std::move(file))); + } else { + return nullptr; + } } else { return nullptr; } diff --git a/src/macho.cc b/src/macho.cc index cd826fe..6f5a3b9 100644 --- a/src/macho.cc +++ b/src/macho.cc @@ -21,9 +21,12 @@ #include #include "absl/strings/str_join.h" +#include "absl/strings/str_format.h" #include "absl/strings/substitute.h" -#include "third_party/darwin_xnu_macho/mach-o/loader.h" +#include "arfile.h" +#include "third_party/darwin_xnu_macho/mach/machine.h" #include "third_party/darwin_xnu_macho/mach-o/fat.h" +#include "third_party/darwin_xnu_macho/mach-o/loader.h" #include "third_party/darwin_xnu_macho/mach-o/nlist.h" #include "third_party/darwin_xnu_macho/mach-o/reloc.h" @@ -69,11 +72,103 @@ void MaybeAddOverhead(RangeSink* sink, const char* label, string_view data) { } } +// ARM64E capability field constants +static constexpr uint32_t ARM64E_SUBTYPE_MASK = 0x00FFFFFF; // Low 24 bits: subtype proper + +static bool IsArm64eSubtype(uint32_t cpusubtype) { + uint32_t subtype_proper = cpusubtype & ARM64E_SUBTYPE_MASK; + return subtype_proper == CPU_SUBTYPE_ARM64E; +} + +std::string CpuTypeToString(uint32_t cputype, uint32_t cpusubtype) { + switch (cputype) { + case CPU_TYPE_X86_64: + switch (cpusubtype) { + case CPU_SUBTYPE_X86_64_H: + return "x86_64h"; + default: + return "x86_64"; + } + case CPU_TYPE_ARM64: + if (IsArm64eSubtype(cpusubtype)) { + return "arm64e"; + } + switch (cpusubtype) { + case CPU_SUBTYPE_ARM64_V8: + return "arm64v8"; + default: + return "arm64"; + } + case CPU_TYPE_X86: + return "i386"; + case CPU_TYPE_ARM: + switch (cpusubtype) { + case CPU_SUBTYPE_ARM_V6: + return "armv6"; + case CPU_SUBTYPE_ARM_V7: + return "armv7"; + case CPU_SUBTYPE_ARM_V7F: + return "armv7f"; + case CPU_SUBTYPE_ARM_V7S: + return "armv7s"; + case CPU_SUBTYPE_ARM_V7K: + return "armv7k"; + case CPU_SUBTYPE_ARM_V8: + return "armv8"; + default: + return "arm"; + } + default: + return absl::StrFormat("cpu_%d", cputype); + } +} + +void MaybeAddFileRange(const char* analyzer, RangeSink* sink, string_view label, + string_view range) { + if (sink) { + sink->AddFileRange(analyzer, label, range); + } +} + +static bool IsMachOContent(string_view data, std::string* error_msg = nullptr) { + if (data.size() < sizeof(uint32_t)) { + if (error_msg) *error_msg = "File too small for Mach-O header"; + return false; + } + + if (data.size() == 0 || std::all_of(data.begin(), data.begin() + std::min(data.size(), size_t(64)), + [](char c) { return c == 0; })) { + if (error_msg) *error_msg = "File appears to be empty or all zeros"; + return false; + } + + try { + uint32_t magic = macho::ReadMagic(data); + switch (magic) { + case MH_MAGIC: + case MH_MAGIC_64: + case MH_CIGAM: + case MH_CIGAM_64: + return true; + case FAT_MAGIC: + case FAT_CIGAM: + return true; + default: + if (error_msg) *error_msg = absl::StrFormat("Unknown magic: 0x%08x", magic); + return false; + } + } catch (const std::exception& e) { + if (error_msg) *error_msg = std::string("Parse error: ") + e.what(); + return false; + } +} + struct LoadCommand { bool is64bit; uint32_t cmd; string_view command_data; string_view file_data; + string_view filename; }; template @@ -84,7 +179,7 @@ bool Is64Bit() { return true; } template void ParseMachOHeaderImpl(string_view macho_data, RangeSink* overhead_sink, - Func&& loadcmd_func) { + string_view filename, Func&& loadcmd_func) { string_view header_data = macho_data; auto header = GetStructPointerAndAdvance(&header_data); MaybeAddOverhead(overhead_sink, @@ -107,6 +202,7 @@ void ParseMachOHeaderImpl(string_view macho_data, RangeSink* overhead_sink, data.cmd = command->cmd; data.command_data = StrictSubstr(header_data, 0, command->cmdsize); data.file_data = macho_data; + data.filename = filename; std::forward(loadcmd_func)(data); MaybeAddOverhead(overhead_sink, "[Mach-O Headers]", data.command_data); @@ -116,7 +212,7 @@ void ParseMachOHeaderImpl(string_view macho_data, RangeSink* overhead_sink, template void ParseMachOHeader(string_view macho_file, RangeSink* overhead_sink, - Func&& loadcmd_func) { + string_view filename, Func&& loadcmd_func) { uint32_t magic = ReadMagic(macho_file); switch (magic) { case MH_MAGIC: @@ -128,11 +224,11 @@ void ParseMachOHeader(string_view macho_file, RangeSink* overhead_sink, // there are existing 32-bit binaries floating around, so we might // as well support them. ParseMachOHeaderImpl(macho_file, overhead_sink, - std::forward(loadcmd_func)); + filename, std::forward(loadcmd_func)); break; case MH_MAGIC_64: ParseMachOHeaderImpl( - macho_file, overhead_sink, std::forward(loadcmd_func)); + macho_file, overhead_sink, filename, std::forward(loadcmd_func)); break; case MH_CIGAM: case MH_CIGAM_64: @@ -154,24 +250,113 @@ void ParseMachOHeader(string_view macho_file, RangeSink* overhead_sink, } } +template +void ParseArchiveMembers(string_view archive_data, RangeSink* overhead_sink, + string_view arch_suffix, Func&& loadcmd_func) { + ArFile ar_file(archive_data); + if (!ar_file.IsOpen()) { + return; + } + + ArFile::MemberFile member; + ArFile::MemberReader reader(ar_file); + MaybeAddFileRange("ar_archive", overhead_sink, "[AR Headers]", ar_file.magic()); + + while (reader.ReadMember(&member)) { + MaybeAddFileRange("ar_archive", overhead_sink, "[AR Headers]", member.header); + + switch (member.file_type) { + case ArFile::MemberFile::kNormal: { + std::string error_msg; + if (IsMachOContent(member.contents, &error_msg)) { + try { + std::string member_name = arch_suffix.empty() + ? std::string(member.filename) + : absl::StrFormat("%s [%s]", member.filename, arch_suffix); + + uint32_t magic = macho::ReadMagic(member.contents); + if (magic == FAT_MAGIC || magic == FAT_CIGAM) { + ParseFatHeader(member.contents, overhead_sink, member_name, + std::forward(loadcmd_func)); + } else { + ParseMachOHeader(member.contents, overhead_sink, member_name, + std::forward(loadcmd_func)); + } + } catch (const std::exception& e) { + WARN("Failed to parse archive member '$0': $1", member.filename, e.what()); + } + } else { + std::string label = arch_suffix.empty() + ? absl::StrFormat("[AR Non-Mach-O: %s]", member.filename) + : absl::StrFormat("[AR Non-Mach-O: %s [%s]]", member.filename, arch_suffix); + MaybeAddFileRange("ar_archive", overhead_sink, label, member.contents); + } + break; + } + case ArFile::MemberFile::kSymbolTable: + MaybeAddFileRange("ar_archive", overhead_sink, "[AR Symbol Table]", member.contents); + break; + case ArFile::MemberFile::kLongFilenameTable: + MaybeAddFileRange("ar_archive", overhead_sink, "[AR Headers]", member.contents); + break; + } + } +} + template void ParseFatHeader(string_view fat_file, RangeSink* overhead_sink, - Func&& loadcmd_func) { + string_view filename, Func&& loadcmd_func) { string_view header_data = fat_file; auto header = GetStructPointerAndAdvance(&header_data); MaybeAddOverhead(overhead_sink, "[Mach-O Headers]", fat_file.substr(0, sizeof(fat_header))); - assert(ByteSwap(header->magic) == FAT_MAGIC); - uint32_t nfat_arch = ByteSwap(header->nfat_arch); + + bool need_swap = (header->magic == FAT_CIGAM); + if (header->magic != FAT_MAGIC && header->magic != FAT_CIGAM) { + THROW("Invalid FAT magic"); + } + + uint32_t nfat_arch = need_swap ? ByteSwap(header->nfat_arch) : header->nfat_arch; + + if (nfat_arch > header_data.size() / sizeof(fat_arch)) { + THROW("invalid nfat_arch count in universal binary header"); + } + + // Process all architectures in universal binaries. + // Use --source-filter to filter to a specific architecture. + for (uint32_t i = 0; i < nfat_arch; i++) { auto arch = GetStructPointerAndAdvance(&header_data); - string_view macho_data = StrictSubstr( - fat_file, ByteSwap(arch->offset), ByteSwap(arch->size)); - ParseMachOHeader(macho_data, overhead_sink, - std::forward(loadcmd_func)); + + uint32_t offset = need_swap ? ByteSwap(arch->offset) : arch->offset; + uint32_t size = need_swap ? ByteSwap(arch->size) : arch->size; + uint32_t cputype = need_swap ? ByteSwap(arch->cputype) : arch->cputype; + uint32_t cpusubtype = need_swap ? ByteSwap(arch->cpusubtype) : arch->cpusubtype; + + string_view arch_data = StrictSubstr(fat_file, offset, size); + std::string arch_name = CpuTypeToString(cputype, cpusubtype); + + ArFile ar_file(arch_data); + if (ar_file.IsOpen()) { + ParseArchiveMembers(arch_data, overhead_sink, arch_name, + std::forward(loadcmd_func)); + } else { + // If this is an archive member, append architecture name + std::string arch_filename; + if (!filename.empty()) { + arch_filename = absl::StrFormat("%s [%s]", filename, arch_name); + ParseMachOHeader(arch_data, overhead_sink, arch_filename, + std::forward(loadcmd_func)); + } else { + ParseMachOHeader(arch_data, overhead_sink, "", + std::forward(loadcmd_func)); + } + } } } +static bool g_warned_about_universal_in_archive = false; + template void ForEachLoadCommand(string_view maybe_fat_file, RangeSink* overhead_sink, Func&& loadcmd_func) { @@ -181,14 +366,100 @@ void ForEachLoadCommand(string_view maybe_fat_file, RangeSink* overhead_sink, case MH_MAGIC_64: case MH_CIGAM: case MH_CIGAM_64: - ParseMachOHeader(maybe_fat_file, overhead_sink, + ParseMachOHeader(maybe_fat_file, overhead_sink, "", std::forward(loadcmd_func)); break; case FAT_CIGAM: - ParseFatHeader(maybe_fat_file, overhead_sink, + case FAT_MAGIC: + ParseFatHeader(maybe_fat_file, overhead_sink, "", std::forward(loadcmd_func)); break; } + + ArFile ar_file(maybe_fat_file); + + if (ar_file.IsOpen()) { + ArFile::MemberFile member; + ArFile::MemberReader reader(ar_file); + MaybeAddFileRange("ar_archive", overhead_sink, "[AR Headers]", ar_file.magic()); + + int processed_count = 0; + int skipped_count = 0; + bool has_universal_binaries = false; + + while (reader.ReadMember(&member)) { + MaybeAddFileRange("ar_archive", overhead_sink, "[AR Headers]", member.header); + + switch (member.file_type) { + case ArFile::MemberFile::kNormal: { + std::string error_msg; + if (IsMachOContent(member.contents, &error_msg)) { + try { + uint32_t magic = macho::ReadMagic(member.contents); + switch (magic) { + case MH_MAGIC: + case MH_MAGIC_64: + case MH_CIGAM: + case MH_CIGAM_64: + ParseMachOHeader(member.contents, overhead_sink, member.filename, + std::forward(loadcmd_func)); + processed_count++; + break; + case FAT_MAGIC: + case FAT_CIGAM: + has_universal_binaries = true; + ParseFatHeader(member.contents, overhead_sink, member.filename, + std::forward(loadcmd_func)); + processed_count++; + break; + default: + // This shouldn't happen with IsMachOContent check but be safe + MaybeAddFileRange("ar_archive", overhead_sink, + absl::StrFormat("[AR Unknown Mach-O: %s]", member.filename), + member.contents); + skipped_count++; + } + } catch (const std::exception& e) { + WARN("Failed to parse Mach-O member '$0': $1", member.filename, e.what()); + MaybeAddFileRange("ar_archive", overhead_sink, + absl::StrFormat("[AR Corrupt Mach-O: %s]", member.filename), + member.contents); + skipped_count++; + } + } else { + MaybeAddFileRange("ar_archive", overhead_sink, + absl::StrFormat("[AR Non-Mach-O: %s]", member.filename), + member.contents); + skipped_count++; + } + break; + } + case ArFile::MemberFile::kSymbolTable: + MaybeAddFileRange("ar_archive", overhead_sink, "[AR Symbol Table]", + member.contents); + break; + case ArFile::MemberFile::kLongFilenameTable: + MaybeAddFileRange("ar_archive", overhead_sink, "[AR Headers]", + member.contents); + break; + } + } + + if (verbose_level > 1 && (processed_count > 0 || skipped_count > 0)) { + printf("Archive processing complete: %d Mach-O members processed, %d skipped\n", + processed_count, skipped_count); + } + + // Warn when processing universal binaries without --source-filter. + // Each architecture in a universal binary has its own independent VM address + // space, so summing VM sizes across different architectures is meaningless + // Use --domain=file for meaningful size comparisons, or --source-filter + // to filter to a single architecture. + if (has_universal_binaries && overhead_sink && !g_warned_about_universal_in_archive) { + fprintf(stderr, "Warning: Archive contains universal binaries. VM size totals across different architectures are not meaningful. Consider using --domain=file or --source-filter= to filter to a specific architecture.\n"); + g_warned_about_universal_in_archive = true; + } + } } template @@ -283,6 +554,17 @@ void ParseSegment(LoadCommand cmd, RangeSink* sink) { StrictSubstr(cmd.file_data, section->offset, filesize)); } } + } else if (sink->data_source() == DataSource::kArchiveMembers) { + if (!cmd.filename.empty()) { + if (unmapped) { + sink->AddFileRange( + "macho_armember", cmd.filename, + StrictSubstr(cmd.file_data, segment->fileoff, segment->filesize)); + } else { + sink->AddRange("macho_armember", cmd.filename, segment->vmaddr, segment->vmsize, + StrictSubstr(cmd.file_data, segment->fileoff, segment->filesize)); + } + } } else { BLOATY_UNREACHABLE(); } @@ -496,6 +778,18 @@ static void AddMachOFallback(RangeSink* sink) { sink->AddFileRange("macho_fallback", "[Unmapped]", sink->input_file().data()); } +static void AddMachOArchiveMemberFallback(RangeSink* sink) { + if (sink->data_source() == DataSource::kArchiveMembers) { + ForEachLoadCommand( + sink->input_file().data(), sink, + [sink](const LoadCommand& cmd) { + if (!cmd.filename.empty()) { + sink->AddFileRange("unmapped_armember", cmd.filename, cmd.file_data); + } + }); + } +} + template void ReadDebugSectionsFromSegment(LoadCommand cmd, dwarf::File *dwarf, RangeSink *sink) { @@ -619,7 +913,14 @@ class MachOObjectFile : public ObjectFile { ReadDWARFInlines(dwarf, sink, true); break; } + case DataSource::kArchs: { + ProcessArchitectures(sink); + break; + } case DataSource::kArchiveMembers: + ParseLoadCommands(sink); + AddMachOArchiveMemberFallback(sink); + break; default: THROW("Mach-O doesn't support this data source"); } @@ -627,6 +928,34 @@ class MachOObjectFile : public ObjectFile { } } + void ProcessArchitectures(RangeSink* sink) const { + uint32_t magic = ReadMagic(file_data().data()); + + if (magic == FAT_CIGAM) { + string_view header_data = file_data().data(); + auto header = GetStructPointerAndAdvance(&header_data); + uint32_t nfat_arch = ByteSwap(header->nfat_arch); + + for (uint32_t i = 0; i < nfat_arch; i++) { + auto arch = GetStructPointerAndAdvance(&header_data); + uint32_t cputype = ByteSwap(arch->cputype); + uint32_t cpusubtype = ByteSwap(arch->cpusubtype); + uint32_t offset = ByteSwap(arch->offset); + uint32_t size = ByteSwap(arch->size); + + std::string arch_name = CpuTypeToString(cputype, cpusubtype); + string_view slice_data = StrictSubstr(file_data().data(), offset, size); + + sink->AddFileRange("archs", arch_name, slice_data); + } + } else { + auto header = GetStructPointer(file_data().data()); + std::string arch_name = CpuTypeToString(header->cputype, header->cpusubtype); + + sink->AddFileRange("archs", arch_name, file_data().data()); + } + } + bool GetDisassemblyInfo(std::string_view /*symbol*/, DataSource /*symbol_source*/, DisassemblyInfo* /*info*/) const override { @@ -640,12 +969,22 @@ class MachOObjectFile : public ObjectFile { std::unique_ptr TryOpenMachOFile(std::unique_ptr &file) { uint32_t magic = macho::ReadMagic(file->data()); + ArFile ar(file->data()); // We only support little-endian host and little endian binaries (see // ParseMachOHeader() for more rationale). Fat headers are always on disk as // big-endian. - if (magic == MH_MAGIC || magic == MH_MAGIC_64 || magic == FAT_CIGAM) { + if (magic == MH_MAGIC || magic == MH_MAGIC_64 || magic == FAT_MAGIC || magic == FAT_CIGAM) { return std::unique_ptr( new macho::MachOObjectFile(std::move(file))); + } else if (ar.IsOpen()) { + ArFile::MemberFile member; + ArFile::MemberReader reader(ar); + /* if the first archive member is Darwin handle it as macho */ + if (reader.ReadMember(&member) && member.format == ArFile::MemberFile::Darwin) { + return std::unique_ptr(new macho::MachOObjectFile(std::move(file))); + } else { + return nullptr; + } } return nullptr; diff --git a/tests/macho/archive-basic.test b/tests/macho/archive-basic.test new file mode 100644 index 0000000..128df28 --- /dev/null +++ b/tests/macho/archive-basic.test @@ -0,0 +1,7 @@ +# Test that bloaty can parse Mach-O archive files + +# RUN: %bloaty %p/../testdata/macho/simple.a -d sections --domain=vm | %FileCheck %s + +# CHECK: VM SIZE +# CHECK: ,__text +# CHECK: TOTAL diff --git a/tests/macho/archive-long-filename.test b/tests/macho/archive-long-filename.test new file mode 100644 index 0000000..86c7ff7 --- /dev/null +++ b/tests/macho/archive-long-filename.test @@ -0,0 +1,10 @@ +# Test that bloaty can parse Mach-O archives with long filenames +# +# This test verifies that bloaty correctly handles Darwin-style long filenames +# in archive members using the #1/N format, where N is the embedded filename length. + +# RUN: %bloaty %p/../testdata/macho/long-filename.a -d armembers --domain=vm | %FileCheck %s + +# CHECK: VM SIZE +# CHECK: a_filename_that_is_longer_than_sixteen_chars.o +# CHECK: TOTAL diff --git a/tests/macho/archive-source-filter.test b/tests/macho/archive-source-filter.test new file mode 100644 index 0000000..f762c7e --- /dev/null +++ b/tests/macho/archive-source-filter.test @@ -0,0 +1,22 @@ +# Test that --source-filter can filter architectures in archives containing universal binaries +# +# This test verifies that bloaty can filter to a specific architecture in archives +# containing universal binaries using the --source-filter flag. +# +# Note: Without segment-aware VM addressing, use file domain for universal archives. + +# Test filtering to arm64 architecture +# RUN: %bloaty %p/../testdata/macho/universal.a -d armembers --domain=file --source-filter=arm64 2>&1 | %FileCheck %s --check-prefix=ARM64 + +# ARM64: FILE SIZE +# ARM64: func_universal.o [arm64] +# ARM64-NOT: func_universal.o [x86_64] +# ARM64: TOTAL + +# Test filtering to x86_64 architecture +# RUN: %bloaty %p/../testdata/macho/universal.a -d armembers --domain=file --source-filter=x86_64 2>&1 | %FileCheck %s --check-prefix=X86_64 + +# X86_64: FILE SIZE +# X86_64: func_universal.o [x86_64] +# X86_64-NOT: func_universal.o [arm64] +# X86_64: TOTAL diff --git a/tests/macho/archive-universal.test b/tests/macho/archive-universal.test new file mode 100644 index 0000000..f24eace --- /dev/null +++ b/tests/macho/archive-universal.test @@ -0,0 +1,14 @@ +# Test that bloaty can parse Mach-O archives containing universal binaries +# +# This test verifies that bloaty can open and analyze archives containing +# universal binaries (multiple architectures in one file). +# +# Note: Without segment-aware VM addressing, VM domain doesn't work correctly +# for universal archives (architectures' address spaces collide). Use file domain. + +# RUN: %bloaty %p/../testdata/macho/universal.a -d armembers --domain=file | %FileCheck %s + +# CHECK: FILE SIZE +# CHECK: func_universal.o [x86_64] +# CHECK: func_universal.o [arm64] +# CHECK: TOTAL diff --git a/tests/macho/archs.test b/tests/macho/archs.test new file mode 100644 index 0000000..a8374b4 --- /dev/null +++ b/tests/macho/archs.test @@ -0,0 +1,263 @@ +# Test -d archs data source for mach-o universal binaries +# +# Tests that the 'archs' data source correctly reports architecture slices +# in universal binaries and single-architecture binaries. + +## Test 1: Universal binary with two architectures (x86_64 and arm64) +# RUN: %yaml2obj --docnum=1 %s -o %t.universal +# RUN: %bloaty %t.universal -d archs --domain=file | %FileCheck --check-prefix=UNIVERSAL %s + +# UNIVERSAL: FILE SIZE +# UNIVERSAL-DAG: x86_64 +# UNIVERSAL-DAG: arm64 +# UNIVERSAL-DAG: [Unmapped] + +## Test 2: Filter to x86_64 architecture only +# RUN: %bloaty %t.universal -d archs,segments --source-filter=x86_64 --domain=file | %FileCheck --check-prefix=FILTER-X86 %s + +# FILTER-X86: FILE SIZE +# FILTER-X86: x86_64 +# FILTER-X86: __TEXT +# FILTER-X86: __LINKEDIT +# FILTER-X86-NOT: arm64 + +## Test 3: Filter to arm64 architecture only +# RUN: %bloaty %t.universal -d archs,segments --source-filter=arm64 --domain=file | %FileCheck --check-prefix=FILTER-ARM %s + +# FILTER-ARM: FILE SIZE +# FILTER-ARM: arm64 +# FILTER-ARM: __TEXT +# FILTER-ARM: __LINKEDIT +# FILTER-ARM-NOT: x86_64 + +## Test 4: Single architecture binary +# RUN: %yaml2obj --docnum=2 %s -o %t.single +# RUN: %bloaty %t.single -d archs --domain=file | %FileCheck --check-prefix=SINGLE %s + +# SINGLE: FILE SIZE +# SINGLE: x86_64 +# SINGLE-NOT: arm64 + +## Universal binary with x86_64 and arm64 slices +--- !fat-mach-o +FatHeader: + magic: 0xCAFEBABE + nfat_arch: 2 +FatArchs: + - cputype: 0x1000007 + cpusubtype: 0x3 + offset: 0x1000 + size: 4176 + align: 12 + - cputype: 0x100000C + cpusubtype: 0x0 + offset: 0x2050 + size: 8280 + align: 12 +Slices: + - !mach-o + FileHeader: + magic: 0xFEEDFACF + cputype: 0x1000007 + cpusubtype: 0x3 + filetype: 0x2 + ncmds: 3 + sizeofcmds: 328 + flags: 0x200085 + reserved: 0x0 + LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __PAGEZERO + vmaddr: 0 + vmsize: 4294967296 + fileoff: 0 + filesize: 0 + maxprot: 0 + initprot: 0 + nsects: 0 + flags: 0 + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __TEXT + vmaddr: 4294967296 + vmsize: 4096 + fileoff: 0 + filesize: 4096 + maxprot: 5 + initprot: 5 + nsects: 1 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x100000F80 + size: 8 + offset: 0xF80 + align: 4 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 554889E531C05DC3 + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 4294971392 + vmsize: 4096 + fileoff: 4096 + filesize: 80 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + LinkEditData: + NameList: + - n_strx: 1 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 4294971264 + StringTable: + - ' ' + - _main + - !mach-o + FileHeader: + magic: 0xFEEDFACF + cputype: 0x100000C + cpusubtype: 0x0 + filetype: 0x2 + ncmds: 3 + sizeofcmds: 328 + flags: 0x200085 + reserved: 0x0 + LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __PAGEZERO + vmaddr: 0 + vmsize: 4294967296 + fileoff: 0 + filesize: 0 + maxprot: 0 + initprot: 0 + nsects: 0 + flags: 0 + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __TEXT + vmaddr: 4294967296 + vmsize: 8192 + fileoff: 0 + filesize: 8192 + maxprot: 5 + initprot: 5 + nsects: 1 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x100001F80 + size: 8 + offset: 0x1F80 + align: 2 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 00008052C0035FD6 + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 4294975488 + vmsize: 4096 + fileoff: 8192 + filesize: 88 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 + LinkEditData: + NameList: + - n_strx: 1 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 4294975360 + StringTable: + - ' ' + - _main + +## Single x86_64 Mach-O executable +--- !mach-o +FileHeader: + magic: 0xFEEDFACF + cputype: 0x1000007 + cpusubtype: 0x3 + filetype: 0x2 + ncmds: 3 + sizeofcmds: 328 + flags: 0x200085 + reserved: 0x0 +LoadCommands: + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __PAGEZERO + vmaddr: 0 + vmsize: 4294967296 + fileoff: 0 + filesize: 0 + maxprot: 0 + initprot: 0 + nsects: 0 + flags: 0 + - cmd: LC_SEGMENT_64 + cmdsize: 152 + segname: __TEXT + vmaddr: 4294967296 + vmsize: 4096 + fileoff: 0 + filesize: 4096 + maxprot: 5 + initprot: 5 + nsects: 1 + flags: 0 + Sections: + - sectname: __text + segname: __TEXT + addr: 0x100000F80 + size: 8 + offset: 0xF80 + align: 4 + reloff: 0x0 + nreloc: 0 + flags: 0x80000400 + reserved1: 0x0 + reserved2: 0x0 + reserved3: 0x0 + content: 554889E531C05DC3 + - cmd: LC_SEGMENT_64 + cmdsize: 72 + segname: __LINKEDIT + vmaddr: 4294971392 + vmsize: 4096 + fileoff: 4096 + filesize: 80 + maxprot: 1 + initprot: 1 + nsects: 0 + flags: 0 +LinkEditData: + NameList: + - n_strx: 1 + n_type: 0xF + n_sect: 1 + n_desc: 0 + n_value: 4294971264 + StringTable: + - ' ' + - _main diff --git a/tests/macho/lipo-universal-archive.test b/tests/macho/lipo-universal-archive.test new file mode 100644 index 0000000..4cd49cb --- /dev/null +++ b/tests/macho/lipo-universal-archive.test @@ -0,0 +1,14 @@ +# Test that bloaty can parse universal archives created with lipo -create +# +# This test verifies that bloaty can handle the case where lipo -create is used +# to combine two archives (one per architecture), creating a FAT binary where +# each slice is an entire AR archive rather than a Mach-O file. +# +# This previously failed with "Corrupt Mach-O file" error before the fix. + +# RUN: %bloaty %p/../testdata/macho/lipo-universal.a -d armembers --domain=file | %FileCheck %s + +# CHECK: FILE SIZE +# CHECK: test_x86_64.o [x86_64] +# CHECK: test_arm64.o [arm64] +# CHECK: TOTAL diff --git a/tests/testdata/macho/lipo-universal.a b/tests/testdata/macho/lipo-universal.a new file mode 100644 index 0000000..6f09ed4 Binary files /dev/null and b/tests/testdata/macho/lipo-universal.a differ diff --git a/tests/testdata/macho/long-filename.a b/tests/testdata/macho/long-filename.a new file mode 100644 index 0000000..6bfd191 Binary files /dev/null and b/tests/testdata/macho/long-filename.a differ diff --git a/tests/testdata/macho/simple.a b/tests/testdata/macho/simple.a new file mode 100644 index 0000000..77110b3 Binary files /dev/null and b/tests/testdata/macho/simple.a differ diff --git a/tests/testdata/macho/universal.a b/tests/testdata/macho/universal.a new file mode 100644 index 0000000..1016640 Binary files /dev/null and b/tests/testdata/macho/universal.a differ