3
\$\begingroup\$

The structure of a class file consists of a single structure (presented here using pseudostructures written in a C-like structure notation):

ClassFile { u4 magic; u2 minor_version; u2 major_version; u2 constant_pool_count; cp_info constant_pool[constant_pool_count-1]; u2 access_flags; u2 this_class; u2 super_class; u2 interfaces_count; u2 interfaces[interfaces_count]; u2 fields_count; field_info fields[fields_count]; u2 methods_count; method_info methods[methods_count]; u2 attributes_count; attribute_info attributes[attributes_count]; } 

The script parses the class file into a dictionary, except for the attributes, but does not verify that the file is valid/correct. (I did not deem that a good use of my time)

For a class file generated for this simple program:

public class Main { public static void main(String[] args) { System.out.println("Hello, World!"); } } 

the script produces:

{'access_flags': ['ACC_FINAL', 'ACC_INTERFACE', 'ACC_ABSTRACT', 'ACC_SYNTHETIC', 'ACC_ANNOTATION', 'ACC_ENUM'], 'attributes': [{'attribute_length': 2, 'attribute_name_index': 13, 'info': b'\x00\x0e'}], 'attributes_count': 1, 'constant_pool': [{'class_index': 6, 'name_and_type_index': 15, 'tag': 10}, {'class_index': 16, 'name_and_type_index': 17, 'tag': 9}, {'name_index': 18, 'tag': 8}, {'class_index': 19, 'name_and_type_index': 20, 'tag': 10}, {'name_index': 21, 'tag': 7}, {'name_index': 22, 'tag': 7}, {'bytes': b'<init>', 'length': 6, 'tag': 1}, {'bytes': b'()V', 'length': 3, 'tag': 1}, {'bytes': b'Code', 'length': 4, 'tag': 1}, {'bytes': b'LineNumberTable', 'length': 15, 'tag': 1}, {'bytes': b'main', 'length': 4, 'tag': 1}, {'bytes': b'([Ljava/lang/String;)V', 'length': 22, 'tag': 1}, {'bytes': b'SourceFile', 'length': 10, 'tag': 1}, {'bytes': b'Main.java', 'length': 9, 'tag': 1}, {'descriptor_index': 8, 'name_index': 7, 'tag': 12}, {'name_index': 23, 'tag': 7}, {'descriptor_index': 25, 'name_index': 24, 'tag': 12}, {'bytes': b'Hello, World!', 'length': 13, 'tag': 1}, {'name_index': 26, 'tag': 7}, {'descriptor_index': 28, 'name_index': 27, 'tag': 12}, {'bytes': b'Main', 'length': 4, 'tag': 1}, {'bytes': b'java/lang/Object', 'length': 16, 'tag': 1}, {'bytes': b'java/lang/System', 'length': 16, 'tag': 1}, {'bytes': b'out', 'length': 3, 'tag': 1}, {'bytes': b'Ljava/io/PrintStream;', 'length': 21, 'tag': 1}, {'bytes': b'java/io/PrintStream', 'length': 19, 'tag': 1}, {'bytes': b'println', 'length': 7, 'tag': 1}, {'bytes': b'(Ljava/lang/String;)V', 'length': 21, 'tag': 1}], 'constant_pool_count': 29, 'fields': [], 'fields_count': 0, 'interfaces': [], 'interfaces_count': 0, 'magic': '0XCAFEBABE', 'major': 55, 'methods': [{'access_flags': ['ACC_PRIVATE', 'ACC_PROTECTED', 'ACC_STATIC', 'ACC_FINAL', 'ACC_SYNCHRONIZED', 'ACC_BRIDGE', 'ACC_VARARGS', 'ACC_NATIVE', 'ACC_ABSTRACT', 'ACC_STRICT', 'ACC_SYNTHETIC'], 'attributes': [{'attribute_length': 29, 'attribute_name_index': 9, 'info': b'\x00\x01\x00\x01\x00\x00\x00\x05' b'*\xb7\x00\x01\xb1\x00\x00\x00' b'\x01\x00\n\x00\x00\x00\x06\x00' b'\x01\x00\x00\x00\x01'}], 'attributes_count': 1, 'descriptor_index': 8, 'name_index': 7}, {'access_flags': ['ACC_PRIVATE', 'ACC_PROTECTED', 'ACC_FINAL', 'ACC_SYNCHRONIZED', 'ACC_BRIDGE', 'ACC_VARARGS', 'ACC_NATIVE', 'ACC_ABSTRACT', 'ACC_STRICT', 'ACC_SYNTHETIC'], 'attributes': [{'attribute_length': 37, 'attribute_name_index': 9, 'info': b'\x00\x02\x00\x01\x00\x00\x00\t' b'\xb2\x00\x02\x12\x03\xb6\x00\x04' b'\xb1\x00\x00\x00\x01\x00\n\x00' b'\x00\x00\n\x00\x02\x00\x00\x00' b'\x03\x00\x08\x00\x04'}], 'attributes_count': 1, 'descriptor_index': 12, 'name_index': 11}], 'methods_count': 2, 'minor': 0, 'super_class': 6, 'this_class': 5} 

Code:

#!/usr/bin/env python3 from enum import Enum from io import BytesIO from pathlib import Path from pprint import pprint import typer # fmt: off # This got rather duplicative. class Constants(Enum): CONSTANT_Class = 7 CONSTANT_Fieldref = 9 CONSTANT_Methodref = 10 CONSTANT_InterfaceMethodref = 11 CONSTANT_String = 8 CONSTANT_Integer = 3 CONSTANT_Float = 4 CONSTANT_Long = 5 CONSTANT_Double = 6 CONSTANT_NameAndType = 12 CONSTANT_Utf8 = 1 CONSTANT_MethodHandle = 15 CONSTANT_MethodType = 16 CONSTANT_InvokeDynamic = 18 ACCESS_FLAGS = { "class": [ ("ACC_PUBLIC" ,0x0001), ("ACC_FINAL" ,0x0010), ("ACC_SUPER" ,0x0020), ("ACC_INTERFACE" ,0x0200), ("ACC_ABSTRACT" ,0x0400), ("ACC_SYNTHETIC" ,0x1000), ("ACC_ANNOTATION" ,0x2000), ("ACC_ENUM" ,0x4000), ], "field": [ ("ACC_PUBLIC" ,0x0001), ("ACC_PRIVATE" ,0x0002), ("ACC_PROTECTED" ,0x0004), ("ACC_STATIC" ,0x0008), ("ACC_FINAL" ,0x0010), ("ACC_VOLATILE" ,0x0040), ("ACC_TRANSIENT" ,0x0080), ("ACC_SYNTHETIC" ,0x1000), ("ACC_ENUM" ,0x4000), ], "method": [ ("ACC_PUBLIC" ,0x0001), ("ACC_PRIVATE" ,0x0002), ("ACC_PROTECTED" ,0x0004), ("ACC_STATIC" ,0x0008), ("ACC_FINAL" ,0x0010), ("ACC_SYNCHRONIZED" ,0x0020), ("ACC_BRIDGE" ,0x0040), ("ACC_VARARGS" ,0x0080), ("ACC_NATIVE" ,0x0100), ("ACC_ABSTRACT" ,0x0400), ("ACC_STRICT" ,0x0800), ("ACC_SYNTHETIC" ,0x1000), ], } # fmt: on def parse_ux(file: BytesIO, length: int) -> int: return int.from_bytes(file.read(length), "big") def parse_u1(file: BytesIO) -> int: return parse_ux(file, 1) def parse_u2(file: BytesIO) -> int: return parse_ux(file, 2) def parse_u4(file: BytesIO) -> int: return parse_ux(file, 4) def parse_constant_pool(f: BytesIO, pool_size: int) -> int: constant_pool = [] # We could map each constant tag to its corresponding processing logic. # Would that be better? This looks horrendous. for _ in range(pool_size): cp_info = {} tag = parse_u1(f) constant = Constants(tag) if constant in ( Constants.CONSTANT_Methodref, Constants.CONSTANT_InterfaceMethodref, Constants.CONSTANT_Fieldref, ): cp_info["tag"] = constant.value cp_info["class_index"] = parse_u2(f) cp_info["name_and_type_index"] = parse_u2(f) elif constant in (Constants.CONSTANT_Class, Constants.CONSTANT_String): cp_info["tag"] = constant.value cp_info["name_index"] = parse_u2(f) elif constant == Constants.CONSTANT_Utf8: cp_info["tag"] = constant.value cp_info["length"] = parse_u2(f) cp_info["bytes"] = f.read(cp_info["length"]) elif constant == Constants.CONSTANT_NameAndType: cp_info["tag"] = constant.value cp_info["name_index"] = parse_u2(f) cp_info["descriptor_index"] = parse_u2(f) elif constant in (Constants.CONSTANT_Integer, Constants.CONSTANT_Float): cp_info["tag"] = constant.value cp_info["bytes"] = f.read(4) elif constant in (Constants.CONSTANT_Long, Constants.CONSTANT_Double): cp_info["tag"] = constant.value cp_info["high_bytes"] = f.read(4) cp_info["low_bytes"] = f.read(4) elif constant == Constants.CONSTANT_MethodHandle: cp_info["tag"] = constant.value cp_info["reference_kind"] = parse_u1(f) cp_info["reference_index"] = parse_u2(f) elif constant == Constants.CONSTANT_MethodType: cp_info["tag"] = constant.value cp_info["descriptor_index"] = parse_u2(f) elif constant == Constants.CONSTANT_InvokeDynamic: cp_info["tag"] = constant.value cp_info["bootstrap_method_attr_index"] = parse_u2(f) cp_info["name_and_type_index"] = parse_u2(f) else: assert False, f"Unexpected tag encountered {tag = }" constant_pool.append(cp_info) return constant_pool def parse_access_flags(val: int, flags: [(str, int)]) -> list[str]: return [name for (name, mask) in flags if not (val & mask)] def parse_attributes(f: BytesIO, attributes_count: int) -> list: attributes = [] for _ in range(attributes_count): attribute_info = {} attribute_info["attribute_name_index"] = parse_u2(f) attribute_info["attribute_length"] = parse_u4(f) attribute_info["info"] = f.read(attribute_info["attribute_length"]) attributes.append(attribute_info) return attributes def parse_methods(f: BytesIO, methods_count: int) -> list: methods = [] for _ in range(methods_count): method_info = {} method_info["access_flags"] = parse_access_flags( parse_u2(f), ACCESS_FLAGS["method"] ) method_info["name_index"] = parse_u2(f) method_info["descriptor_index"] = parse_u2(f) method_info["attributes_count"] = parse_u2(f) method_info["attributes"] = parse_attributes(f, method_info["attributes_count"]) methods.append(method_info) return methods def parse_fields(f: BytesIO, fields_count: int) -> dict: fields = [] for _ in range(fields_count): field_info = {} field_info["access_flags"] = parse_access_flags( parse_u2(f), ACCESS_FLAGS["field"] ) field_info["name_index"] = parse_u2(f) field_info["descriptor_index"] = parse_u2(f) field_info["attributes_count"] = parse_u2(f) field_info["attributes"] = parse_attributes(f, field_info["attributes_count"]) fields.append(field_info) return fields def parse_interfaces(f: BytesIO, interfaces_count: int) -> dict: interfaces = [] for _ in range(interfaces_count): parse_u1(f) # Discard tag class_info = {"tag": "CONSTANT_Class", "name_index": parse_u2()} interfaces.append(class_info) return interfaces def parse_class_file(f: BytesIO) -> dict: class_file = {} class_file["magic"] = str(hex(parse_u4(f))).upper() class_file["minor"] = parse_u2(f) class_file["major"] = parse_u2(f) class_file["constant_pool_count"] = parse_u2(f) class_file["constant_pool"] = parse_constant_pool( f, class_file["constant_pool_count"] - 1 ) class_file["access_flags"] = parse_access_flags(parse_u2(f), ACCESS_FLAGS["class"]) class_file["this_class"] = parse_u2(f) class_file["super_class"] = parse_u2(f) class_file["interfaces_count"] = parse_u2(f) class_file["interfaces"] = parse_interfaces(f, class_file["interfaces_count"]) class_file["fields_count"] = parse_u2(f) class_file["fields"] = parse_fields(f, class_file["fields_count"]) class_file["methods_count"] = parse_u2(f) class_file["methods"] = parse_methods(f, class_file["methods_count"]) class_file["attributes_count"] = parse_u2(f) class_file["attributes"] = parse_attributes(f, class_file["attributes_count"]) return class_file def main(file_path: Path) -> None: with open(file_path, mode="rb") as f: class_file = parse_class_file(BytesIO(f.read())) pprint(class_file) if __name__ == "__main__": typer.run(main) 

Review Request:

Bugs, general coding comments, style, idiomatic code, et cetera.

PS: This was done as a recreational activity.

\$\endgroup\$
3
  • \$\begingroup\$It's fine that it was a recreational activity; but is this the end purpose? If not, how is the parsed struct then used?\$\endgroup\$CommentedFeb 23, 2024 at 12:54
  • \$\begingroup\$Well, I might someday parse the bytecode too to run a small hello world program, but for now, this is the finished program, yes.\$\endgroup\$CommentedFeb 23, 2024 at 12:56
  • \$\begingroup\$"In real life" you definitely don't want to parse byte code to run it; you need to call into an FFI. There are many options.\$\endgroup\$CommentedFeb 23, 2024 at 13:00

1 Answer 1

3
\$\begingroup\$

Your reference is extremely out-of-date; refer to version 21. Luckily the JVM hasn't changed much.

Typer seems like overkill for a program that unconditionally accepts one command-line argument. I scarcely consider that justification for bringing in a third-party library.

Your Constants shouldn't be an Enum; it should be an IntEnum. Your ACCESS_FLAGS should not be a dict of lists; it should be split out into separate IntFlags.

When you print the constant tag, don't print the number; print the symbol. repr (!r) will do this.

I consider int.from_bytes and the variable-length method used in parse_ux to be less explicit than the other two options I'll be demonstrating, which are struct unpacking and ctypes unpacking. Your parse_fields and similar methods should be entirely replaced with big-endian structure definitions.

Don't use dictionaries for internal program data; they aren't well-typed.

Your script will not be very useful until you resolve the constant indices to their respective structures. For instance, your output 'attribute_name_index': 9 would be replaced with a reference to the corresponding constant string.

Replace open(file_path, mode="rb") with file_path.open().

It's actually a pretty reasonable idea to in-memory buffer the file content before deserialising it, and may have performance advantages; but for simplicity I do not include this in my demonstration.

Suggested

The following is a little long-winded, but demonstrates some of the concepts I've talked about above. It has nearly mypy-compliant types, save for the functional enums that mypy does not support.

#!/usr/bin/env python3 import ctypes import struct import sys from dataclasses import dataclass from enum import IntEnum, IntFlag from functools import partial from io import BufferedIOBase from itertools import chain from pathlib import Path from typing import Callable, ClassVar, Iterator, NamedTuple, Type, TypeVar # Spec from # https://docs.oracle.com/javase/specs/jvms/se21/html/jvms-4.html # Since we don't require strict validation, this captures all flags that don't # have multiple definitions. ACCESS_SHARED = { 'PUBLIC' : 0x0001, 'PRIVATE' : 0x0002, 'PROTECTED' : 0x0004, 'STATIC' : 0x0008, 'FINAL' : 0x0010, 'NATIVE' : 0x0100, 'INTERFACE' : 0x0200, 'ABSTRACT' : 0x0400, 'STRICT' : 0x0800, 'SYNTHETIC' : 0x1000, 'ANNOTATION': 0x2000, 'ENUM' : 0x4000, } # This functional enum form is not mypy-compatible. CommonAccess = IntFlag('CommonAccess', ACCESS_SHARED) ClassAccess = IntFlag('ClassAccess', { **ACCESS_SHARED, 'SUPER': 0x0020, 'MODULE': 0x8000, }) MethodAccess = IntFlag('MethodAccess', { **ACCESS_SHARED, 'SYNCHRONIZED' : 0x0020, 'BRIDGE' : 0x0040, 'VARARGS' : 0x0080, }) ParameterAccess = IntFlag('ParameterAccess', { **ACCESS_SHARED, 'MANDATED' : 0x8000, }) ModuleAccess = IntFlag('ModuleAccess', { **ACCESS_SHARED, 'OPEN' : 0x0020, 'MANDATED' : 0x8000, }) ModuleRequiresAccess = IntFlag('ModuleRequiresAccess', { **ACCESS_SHARED, 'TRANSITIVE' : 0x0020, 'STATIC_PHASE' : 0x0040, 'MANDATED' : 0x8000, }) FieldAccess = IntFlag('FieldAccess', { **ACCESS_SHARED, 'VOLATILE' : 0x0040, 'TRANSIENT' : 0x0080, }) class ConstantTag(IntEnum): UTF8 = 1 INTEGER = 3 FLOAT = 4 LONG = 5 DOUBLE = 6 CLASS = 7 STRING = 8 FIELD_REF = 9 METHOD_REF = 10 INTERFACE_METHOD_REF = 11 NAME_AND_TYPE = 12 METHOD_HANDLE = 15 METHOD_TYPE = 16 DYNAMIC = 17 INVOKE_DYNAMIC = 18 MODULE = 19 PACKAGE = 20 class ReferenceKind(IntEnum): getField = 1 getStatic = 2 putField = 3 putStatic = 4 invokeVirtual = 5 invokeStatic = 6 invokeSpecial = 7 newInvokeSpecial = 8 invokeInterface = 9 class Version(ctypes.BigEndianStructure): _pack_ = 1 _fields_ = ( ('magic', ctypes.c_uint32), ('minor_version', ctypes.c_uint16), ('major_version', ctypes.c_uint16), ) __slots__ = [k for k, t in _fields_] class Constant: CHILDREN: ClassVar[tuple[str, ...]] class ClassConstant(Constant, ctypes.BigEndianStructure): _pack_ = 1 _fields_ = ( ('name_index', ctypes.c_uint16), ) CHILDREN = 'name_index', __slots__ = ('name_index', 'name_constant') def __str__(self) -> str: return str(self.name_constant) ModuleConstant = ClassConstant PackageConstant = ClassConstant class DoubleConstant(Constant, ctypes.BigEndianStructure): _pack_ = 1 _fields_ = ( # Don't represent as high bytes and low bytes in the spec; # directly unpack to value ('value', ctypes.c_double), ) CHILDREN = () __slots__ = 'value', def __str__(self) -> str: return str(self.value) class DynamicConstant(Constant, ctypes.BigEndianStructure): _pack_ = 1 _fields_ = ( ('bootstrap_method_attr_index', ctypes.c_uint16), ('name_and_type_index', ctypes.c_uint16), ) CHILDREN = ('bootstrap_method_attr_index', 'name_and_type_index') __slots__ = ( 'bootstrap_method_attr_index', 'bootstrap_method_attr_constant', 'name_and_type_index', 'name_and_type_constant', ) def __str__(self) -> str: return f'{self.name_and_type_constant} -> {self.bootstrap_method_attr_constant}' class FloatConstant(Constant, ctypes.BigEndianStructure): _pack_ = 1 _fields_ = ( ('value', ctypes.c_float), ) CHILDREN = () __slots__ = 'value', def __str__(self) -> str: return str(self.value) class IntegerConstant(Constant, ctypes.BigEndianStructure): _pack_ = 1 _fields_ = ( ('value', ctypes.c_int32), ) CHILDREN = () __slots__ = 'value', def __str__(self) -> str: return str(self.value) class LongConstant(Constant, ctypes.BigEndianStructure): _pack_ = 1 _fields_ = ( ('value', ctypes.c_int64), ) CHILDREN = () __slots__ = 'value', def __str__(self) -> str: return str(self.value) InvokeDynamicConstant = DynamicConstant class MethodHandleConstant(Constant, ctypes.BigEndianStructure): _pack_ = 1 _fields_ = ( ('reference_kind', ctypes.c_uint8), ('reference_index', ctypes.c_uint16), ) CHILDREN = 'reference_index', __slots__ = ( 'reference_kind', 'reference_index', 'reference_constant', ) @property def kind(self) -> ReferenceKind: return ReferenceKind(self.reference_kind) def __str__(self) -> str: return f'{self.kind.name} {self.reference_constant}' class MethodRefConstant(Constant, ctypes.BigEndianStructure): _pack_ = 1 _fields_ = ( ('class_index', ctypes.c_uint16), ('name_and_type_index', ctypes.c_uint16), ) CHILDREN = ('class_index', 'name_and_type_index') __slots__ = ( 'class_index', 'class_constant', 'name_and_type_index', 'name_and_type_constant', ) def __str__(self) -> str: return str(self.name_and_type_constant) class MethodTypeConstant(Constant, ctypes.BigEndianStructure): _pack_ = 1 _fields_ = ( ('descriptor_index', ctypes.c_uint16), ) CHILDREN = 'descriptor_index', __slots__ = ('descriptor_index', 'descriptor_constant') def __str__(self) -> str: return str(self.descriptor_constant) FieldRefConstant = MethodRefConstant InterfaceMethodConstant = MethodRefConstant class NameAndTypeConstant(Constant, ctypes.BigEndianStructure): _pack_ = 1 _fields_ = ( ('name_index', ctypes.c_uint16), ('descriptor_index', ctypes.c_uint16), ) CHILDREN = ('name_index', 'descriptor_index') __slots__ = ( 'name_index', 'name_constant', 'descriptor_index', 'descriptor_constant', ) def __str__(self) -> str: return f'{self.name_constant} "{self.descriptor_constant}"' class StringConstant(Constant, ctypes.BigEndianStructure): _pack_ = 1 _fields_ = ( ('string_index', ctypes.c_uint16), ) CHILDREN = 'string_index', __slots__ = ('string_index', 'string_constant') def __str__(self) -> str: return str(self.string_constant) class AttributeInfo(Constant, ctypes.BigEndianStructure): _pack_ = 1 _fields_ = ( ('attribute_name_index', ctypes.c_uint16), ('attribute_length', ctypes.c_uint32), ) data: bytes CHILDREN = 'attribute_name_index', __slots__ = ( 'attribute_name_index', 'attribute_name_constant', 'attribute_length', 'data', ) def __str__(self) -> str: return f'{self.attribute_name_constant}' class FieldInfo(Constant, ctypes.BigEndianStructure): _pack_ = 1 _fields_ = ( ('access_flags', ctypes.c_uint16), ('name_index', ctypes.c_uint16), ('descriptor_index', ctypes.c_uint16), ('attributes_count', ctypes.c_uint16), ) CHILDREN = ('name_index', 'descriptor_index') attributes: tuple[AttributeInfo, ...] __slots__ = ( 'name_index', 'name_constant', 'descriptor_index', 'descriptor_constant', 'access_flags', 'attributes_count', 'attributes', ) @property def access(self) -> FieldAccess: return FieldAccess(self.access_flags) def __str__(self) -> str: s = f'{self.access!r} {self.name_constant} "{self.descriptor_constant}"' attrs = ', '.join(str(a) for a in self.attributes) if attrs: s += ' @ ' + attrs return s class MethodInfo(Constant, ctypes.BigEndianStructure): _pack_ = 1 _fields_ = ( ('access_flags', ctypes.c_uint16), ('name_index', ctypes.c_uint16), ('descriptor_index', ctypes.c_uint16), ('attributes_count', ctypes.c_uint16), ) CHILDREN = ('name_index', 'descriptor_index') attributes: tuple[AttributeInfo, ...] __slots__ = ( 'name_index', 'name_constant', 'descriptor_index', 'descriptor_constant', 'access_flags', 'attributes_count', 'attributes', ) @property def access(self) -> MethodAccess: return MethodAccess(self.access_flags) def __str__(self) -> str: s = f'{self.access!r} {self.name_constant} "{self.descriptor_constant}"' attrs = ', '.join(str(a) for a in self.attributes) if attrs: s += ' @ ' + attrs return s @dataclass(frozen=True, slots=True) class UTF8Constant(Constant): length: int bytes_: bytes CHILDREN = () @classmethod def read(cls, f: BufferedIOBase) -> 'UTF8Constant': length = read_short(f) bytes_ = f.read(length) return cls(length=length, bytes_=bytes_) def __str__(self) -> str: return self.bytes_.decode(encoding='utf8') StructT = TypeVar('StructT', bound=ctypes.BigEndianStructure) def read_struct(f: BufferedIOBase, type_: Type[StructT]) -> StructT: value = type_() f.readinto(value) return value def read_short(f: BufferedIOBase) -> int: fmt = '>H' buffer = f.read(struct.calcsize(fmt)) value, = struct.unpack(fmt, buffer) return value def read_indices(f: BufferedIOBase, n: int) -> tuple[int, ...]: fmt = f'>{n}H' buffer = f.read(struct.calcsize(fmt)) return struct.unpack(fmt, buffer) def bind_read(type_: Type[StructT]) -> Callable[[BufferedIOBase], StructT]: return partial(read_struct, type_=type_) CONSTANT_READERS = { ConstantTag.CLASS: bind_read(ClassConstant), ConstantTag.DOUBLE: bind_read(DoubleConstant), ConstantTag.DYNAMIC: bind_read(DynamicConstant), ConstantTag.FIELD_REF: bind_read(FieldRefConstant), ConstantTag.FLOAT: bind_read(FloatConstant), ConstantTag.INTEGER: bind_read(IntegerConstant), ConstantTag.INTERFACE_METHOD_REF: bind_read(InterfaceMethodConstant), ConstantTag.INVOKE_DYNAMIC: bind_read(InvokeDynamicConstant), ConstantTag.LONG: bind_read(LongConstant), ConstantTag.METHOD_REF: bind_read(MethodRefConstant), ConstantTag.METHOD_HANDLE: bind_read(MethodHandleConstant), ConstantTag.METHOD_TYPE: bind_read(MethodTypeConstant), ConstantTag.MODULE: bind_read(ModuleConstant), ConstantTag.NAME_AND_TYPE: bind_read(NameAndTypeConstant), ConstantTag.PACKAGE: bind_read(PackageConstant), ConstantTag.STRING: bind_read(StringConstant), ConstantTag.UTF8: UTF8Constant.read, } def generate_constants(f: BufferedIOBase, n: int) -> Iterator[Constant]: for _ in range(n): tag_value, = f.read(1) tag = ConstantTag(tag_value) yield CONSTANT_READERS[tag](f) def generate_attrs(f: BufferedIOBase, n: int) -> Iterator[AttributeInfo]: for _ in range(n): attr = read_struct(f, AttributeInfo) attr.data = f.read(attr.attribute_length) yield attr class Class(NamedTuple): major_version: int minor_version: int access_flags: ClassAccess constants: tuple[Constant, ...] this_class: Constant super_class: Constant interfaces: tuple[Constant, ...] fields: tuple[FieldInfo, ...] methods: tuple[MethodInfo, ...] attributes: tuple[AttributeInfo, ...] @classmethod def deserialise(cls, f: BufferedIOBase) -> 'Class': version = read_struct(f=f, type_=Version) constant_pool_count = read_short(f) constant_pool = tuple(generate_constants(f, n=constant_pool_count - 1)) access_flags = ClassAccess(read_short(f)) this_class = read_short(f) super_class = read_short(f) interfaces_count = read_short(f) interfaces = read_indices(f=f, n=interfaces_count) fields_count = read_short(f) fields = [ ( field := read_struct(f, FieldInfo), tuple(generate_attrs(f, field.attributes_count)), ) for _ in range(fields_count) ] methods_count = read_short(f) methods = [ ( method := read_struct(f, MethodInfo), tuple(generate_attrs(f, method.attributes_count)), ) for _ in range(methods_count) ] attributes_count = read_short(f) attributes = tuple(generate_attrs(f, attributes_count)) trailing = len(f.read()) if trailing != 0: raise ValueError(f'{trailing} trailing bytes after deserialise') return cls._traverse( version=version, constants=constant_pool, access_flags=access_flags, this_idx=this_class, interfaces=interfaces, super_idx=super_class, attributes=attributes, fields=fields, methods=methods, ) @classmethod def _traverse( cls, version: Version, constants: tuple[Constant, ...], access_flags: ClassAccess, this_idx: int, super_idx: int, interfaces: tuple[int, ...], fields: list[ tuple[ FieldInfo, tuple[AttributeInfo, ...], ] ], methods: list[ tuple[ MethodInfo, tuple[AttributeInfo, ...], ] ], attributes: tuple[AttributeInfo, ...], ) -> 'Class': field_constants = [field[0] for field in fields] method_constants = [method[0] for method in methods] all_nodes: tuple[Constant, ...] = ( *constants, *field_constants, *method_constants, *chain.from_iterable(field[1] for field in fields), *chain.from_iterable(method[1] for method in methods), *attributes, ) for constant in all_nodes: for child_name in constant.CHILDREN: varname = child_name.removesuffix('_index') + '_constant' child = constants[getattr(constant, child_name)] setattr(constant, varname, child) for field, attrs in fields: field.attributes = attrs for method, attrs in methods: method.attributes = attrs return cls( major_version=version.major_version, minor_version=version.minor_version, access_flags=access_flags, constants=constants, this_class=constants[this_idx], super_class=constants[super_idx], interfaces=tuple( constants[idx] for idx in interfaces ), fields=tuple(field_constants), methods=tuple(method_constants), attributes=attributes, ) def dump(self, verbose: bool = False) -> Iterator[str]: yield f'Version {self.major_version}.{self.minor_version}' yield f'Class: {self.this_class}' yield f'Super: {self.super_class}' yield f'Access flags: {self.access_flags!r}' yield 'Attributes:' for attr in self.attributes: yield f' {attr}' yield 'Fields:' for field in self.fields: yield f' {field}' yield 'Methods:' for method in self.methods: yield f' {method}' yield 'Interfaces:' for iface in self.interfaces: yield f' {iface}' if verbose: yield 'Constant pool:' for constant in self.constants: yield f' {constant}' def main() -> None: _, file_path = sys.argv with Path(file_path).open(mode='rb') as f: class_ = Class.deserialise(f) print('\n'.join(class_.dump(verbose=True))) if __name__ == '__main__': main() 

Output (simple example)

Version 65.0 Class: I "comparator" Super: ()V "TopByOrder" Access flags: <ClassAccess.SUPER|PUBLIC: 33> Attributes: Ljava/util/Comparator<TE;>; TopByOrder.java InnerClasses CountingComparator Fields: <FieldAccess.FINAL|PUBLIC: 17> I "comparator" <FieldAccess.FINAL|PUBLIC: 17> Ljava/util/Comparator; "(ILjava/util/Comparator;)V"Ljava/util/Comparator<TE;>; Methods: <MethodAccess.PUBLIC: 1> ()V "java/util/Collection" @ LineNumberTable, Ljava/util/Comparator<TE;>; <MethodAccess.PUBLIC: 1> (Ljava/util/Collection;)Ljava/util/PriorityQueue; "java/util/Collection" @ LineNumberTable, Ljava/util/Comparator<TE;>; <MethodAccess.STATIC|PUBLIC: 9> ([Ljava/lang/String;)V "Ljava/lang/String;" @ LineNumberTable 

Output (more complex example)

This one has Dynamics.

Version 65.0 Class: (Ljava/util/List;)Ljava/util/List; "(Ljava/util/List;)V" Super: ()V "java/lang/Object" Access flags: <ClassAccess.SUPER|FINAL|PUBLIC: 49> Attributes: MultipleGroupPermuterDemo.java <ReferenceKind.invokeVirtual: 5> (Ljava/lang/Integer;)I "<ReferenceKind.invokeStatic: 6> (Ljava/lang/invoke/MethodHandles$Lookup;Ljava/lang/String;Ljava/lang/invoke/MethodType;Ljava/lang/String;[Ljava/lang/Object;)Ljava/lang/invoke/CallSite;" Lookup Fields: Methods: <MethodAccess.PUBLIC: 1> ()V "java/lang/Object" @ LineNumberTable <MethodAccess.STATIC|PUBLIC: 9> ([Ljava/lang/String;)V "groupPermutation" @ LineNumberTable <MethodAccess.STATIC|PRIVATE: 10> (Ljava/util/Map;)I "makeConcatWithConstants -> <init>" @ LineNumberTable, <T:Ljava/lang/Object;>(Ljava/util/Map<Ljava/util/List<Ljava/util/List<TT;>;>;Ljava/lang/Integer;>;)I <MethodAccess.STATIC|PRIVATE: 10> (Ljava/util/List;)Ljava/lang/String; "java/io/PrintStream" @ LineNumberTable, <T:Ljava/lang/Object;>(Ljava/util/Map<Ljava/util/List<Ljava/util/List<TT;>;>;Ljava/lang/Integer;>;)I <MethodAccess.STATIC|PRIVATE: 10> format "java/io/PrintStream" @ LineNumberTable, <T:Ljava/lang/Object;>(Ljava/util/Map<Ljava/util/List<Ljava/util/List<TT;>;>;Ljava/lang/Integer;>;)I <MethodAccess.STATIC|PRIVATE: 10> (Ljava/util/List;)Ljava/util/List; "(Ljava/util/List;)V" @ LineNumberTable, <T:Ljava/lang/Object;>(Ljava/util/Map<Ljava/util/List<Ljava/util/List<TT;>;>;Ljava/lang/Integer;>;)I <MethodAccess.STATIC|PRIVATE: 10> java/lang/System "computeGroupPermutations" @ LineNumberTable, <T:Ljava/lang/Object;>(Ljava/util/Map<Ljava/util/List<Ljava/util/List<TT;>;>;Ljava/lang/Integer;>;)I 
\$\endgroup\$

    Start asking to get answers

    Find the answer to your question by asking.

    Ask question

    Explore related questions

    See similar questions with these tags.