diff --git a/Doc/c-api/perfmaps.rst b/Doc/c-api/perfmaps.rst index 76a1e9f528dc70..bd05e628faaaa1 100644 --- a/Doc/c-api/perfmaps.rst +++ b/Doc/c-api/perfmaps.rst @@ -31,7 +31,7 @@ Note that holding an :term:`attached thread state` is not required for these API or ``-2`` on failure to create a lock. Check ``errno`` for more information about the cause of a failure. -.. c:function:: int PyUnstable_WritePerfMapEntry(const void *code_addr, unsigned int code_size, const char *entry_name) +.. c:function:: int PyUnstable_WritePerfMapEntry(const void *code_addr, size_t code_size, const char *entry_name) Write one single entry to the ``/tmp/perf-$pid.map`` file. This function is thread safe. Here is what an example entry looks like:: diff --git a/Doc/deprecations/pending-removal-in-3.20.rst b/Doc/deprecations/pending-removal-in-3.20.rst index 176e8f3f9f601c..12d7acf5ce05b4 100644 --- a/Doc/deprecations/pending-removal-in-3.20.rst +++ b/Doc/deprecations/pending-removal-in-3.20.rst @@ -38,3 +38,8 @@ Pending removal in Python 3.20 - :mod:`zlib` (Contributed by Hugo van Kemenade and Stan Ulbrych in :gh:`76007`.) + +* :mod:`ast`: + + * Creating instances of abstract AST nodes (such as :class:`ast.AST` + or :class:`!ast.expr`) is deprecated and will raise an error in Python 3.20. diff --git a/Doc/library/ast.rst b/Doc/library/ast.rst index 3c6e8745474316..e10b8e22df0fef 100644 --- a/Doc/library/ast.rst +++ b/Doc/library/ast.rst @@ -42,7 +42,7 @@ Node classes .. class:: AST - This is the base of all AST node classes. The actual node classes are + This is the abstract base of all AST node classes. The actual node classes are derived from the :file:`Parser/Python.asdl` file, which is reproduced :ref:`above `. They are defined in the :mod:`!_ast` C module and re-exported in :mod:`!ast`. @@ -168,6 +168,15 @@ Node classes arguments that were set as attributes of the AST node, even if they did not match any of the fields of the AST node. These cases now raise a :exc:`TypeError`. +.. 
deprecated-removed:: next 3.20 + + In the :ref:`grammar above `, the AST node classes that + correspond to production rules with variants (aka "sums") are abstract + classes. Previous versions of Python allowed for the creation of direct + instances of these abstract node classes. This behavior is deprecated and + will be removed in Python 3.20. + + .. note:: The descriptions of the specific node classes displayed here were initially adapted from the fantastic `Green Tree @@ -271,18 +280,25 @@ Root nodes Literals ^^^^^^^^ -.. class:: Constant(value) +.. class:: Constant(value, kind) A constant value. The ``value`` attribute of the ``Constant`` literal contains the Python object it represents. The values represented can be instances of :class:`str`, :class:`bytes`, :class:`int`, :class:`float`, :class:`complex`, and :class:`bool`, and the constants :data:`None` and :data:`Ellipsis`. + The ``kind`` attribute is an optional string. For string literals with a + ``u`` prefix, ``kind`` is set to ``'u'``. For all other + constants, ``kind`` is ``None``. + .. doctest:: >>> print(ast.dump(ast.parse('123', mode='eval'), indent=4)) Expression( body=Constant(value=123)) + >>> print(ast.dump(ast.parse("u'hello'", mode='eval'), indent=4)) + Expression( + body=Constant(value='hello', kind='u')) .. class:: FormattedValue(value, conversion, format_spec) @@ -2536,6 +2552,20 @@ and classes for traversing abstract syntax trees: Added the *color* parameter. +.. function:: compare(a, b, /, *, compare_attributes=False) + + Recursively compares two ASTs. + + *compare_attributes* affects whether AST attributes are considered + in the comparison. If *compare_attributes* is ``False`` (default), then + attributes are ignored. Otherwise they must all be equal. This + option is useful to check whether the ASTs are structurally equal but + differ in whitespace or similar details. Attributes include line numbers + and column offsets. + + .. versionadded:: 3.14 + + .. 
_ast-compiler-flags: Compiler flags @@ -2571,20 +2601,6 @@ effects on the compilation of a program: .. versionadded:: 3.8 -.. function:: compare(a, b, /, *, compare_attributes=False) - - Recursively compares two ASTs. - - *compare_attributes* affects whether AST attributes are considered - in the comparison. If *compare_attributes* is ``False`` (default), then - attributes are ignored. Otherwise they must all be equal. This - option is useful to check whether the ASTs are structurally equal but - differ in whitespace or similar details. Attributes include line numbers - and column offsets. - - .. versionadded:: 3.14 - - .. _ast-cli: Command-line usage diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index b63e7a4790e9af..78e464f2a5a6d8 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -1846,6 +1846,13 @@ Deprecated New deprecations ---------------- +* :mod:`ast` + + * Creating instances of abstract AST nodes (such as :class:`ast.AST` + or :class:`!ast.expr`) is deprecated and will raise an error in Python 3.20. + + (Contributed by Brian Schubert in :gh:`116021`.) 
+ * :mod:`base64`: * Accepting the ``+`` and ``/`` characters with an alternative alphabet in diff --git a/Include/cpython/ceval.h b/Include/cpython/ceval.h index bbab8d35b75cb2..5b66fa1040d738 100644 --- a/Include/cpython/ceval.h +++ b/Include/cpython/ceval.h @@ -38,7 +38,7 @@ typedef struct { PyAPI_FUNC(int) PyUnstable_PerfMapState_Init(void); PyAPI_FUNC(int) PyUnstable_WritePerfMapEntry( const void *code_addr, - unsigned int code_size, + size_t code_size, const char *entry_name); PyAPI_FUNC(void) PyUnstable_PerfMapState_Fini(void); PyAPI_FUNC(int) PyUnstable_CopyPerfMapFile(const char* parent_filename); diff --git a/Include/internal/pycore_ast_state.h b/Include/internal/pycore_ast_state.h index 1caf200ee34b2a..32c12fb5875e8e 100644 --- a/Include/internal/pycore_ast_state.h +++ b/Include/internal/pycore_ast_state.h @@ -161,6 +161,7 @@ struct ast_state { PyObject *__module__; PyObject *_attributes; PyObject *_fields; + PyObject *abstract_types; PyObject *alias_type; PyObject *annotation; PyObject *arg; diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index f9507fda1606db..fd4221f0816d24 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -94,7 +94,7 @@ typedef struct { void* (*init_state)(void); // Callback to register every trampoline being created void (*write_state)(void* state, const void *code_addr, - unsigned int code_size, PyCodeObject* code); + size_t code_size, PyCodeObject* code); // Callback to free the trampoline state int (*free_state)(void* state); } _PyPerf_Callbacks; @@ -108,6 +108,10 @@ extern PyStatus _PyPerfTrampoline_AfterFork_Child(void); #ifdef PY_HAVE_PERF_TRAMPOLINE extern _PyPerf_Callbacks _Py_perfmap_callbacks; extern _PyPerf_Callbacks _Py_perfmap_jit_callbacks; +extern void _PyPerfJit_WriteNamedCode(const void *code_addr, + size_t code_size, + const char *entry, + const char *filename); #endif static inline PyObject* diff --git a/Include/internal/pycore_debug_offsets.h 
b/Include/internal/pycore_debug_offsets.h index c166f963da4f66..1dd10f8d94cfd8 100644 --- a/Include/internal/pycore_debug_offsets.h +++ b/Include/internal/pycore_debug_offsets.h @@ -215,6 +215,7 @@ typedef struct _Py_DebugOffsets { uint64_t state; uint64_t length; uint64_t asciiobject_size; + uint64_t compactunicodeobject_size; } unicode_object; // GC runtime state offset; @@ -370,6 +371,7 @@ typedef struct _Py_DebugOffsets { .state = offsetof(PyUnicodeObject, _base._base.state), \ .length = offsetof(PyUnicodeObject, _base._base.length), \ .asciiobject_size = sizeof(PyASCIIObject), \ + .compactunicodeobject_size = sizeof(PyCompactUnicodeObject), \ }, \ .gc = { \ .size = sizeof(struct _gc_runtime_state), \ diff --git a/Include/internal/pycore_global_objects_fini_generated.h b/Include/internal/pycore_global_objects_fini_generated.h index 4d6d5ce9c5ea26..f7d3dcd440aaf1 100644 --- a/Include/internal/pycore_global_objects_fini_generated.h +++ b/Include/internal/pycore_global_objects_fini_generated.h @@ -1582,6 +1582,7 @@ _PyStaticObjects_CheckRefcnt(PyInterpreterState *interp) { _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(alias)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(align)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(all)); + _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(all_interpreters)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(all_threads)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(allow_code)); _PyStaticObject_CheckRefcnt((PyObject *)&_Py_ID(alphabet)); diff --git a/Include/internal/pycore_global_strings.h b/Include/internal/pycore_global_strings.h index 20dcf81ccf15fa..22494b1798cc53 100644 --- a/Include/internal/pycore_global_strings.h +++ b/Include/internal/pycore_global_strings.h @@ -305,6 +305,7 @@ struct _Py_global_strings { STRUCT_FOR_ID(alias) STRUCT_FOR_ID(align) STRUCT_FOR_ID(all) + STRUCT_FOR_ID(all_interpreters) STRUCT_FOR_ID(all_threads) STRUCT_FOR_ID(allow_code) STRUCT_FOR_ID(alphabet) diff --git 
a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index cccfe3565db6e0..86f018e328656e 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -69,7 +69,7 @@ struct code_arena_st; struct trampoline_api_st { void* (*init_state)(void); void (*write_state)(void* state, const void *code_addr, - unsigned int code_size, PyCodeObject* code); + size_t code_size, PyCodeObject* code); int (*free_state)(void* state); void *state; Py_ssize_t code_padding; diff --git a/Include/internal/pycore_jit.h b/Include/internal/pycore_jit.h index b3cadcce8247d0..2f97cc26eaf115 100644 --- a/Include/internal/pycore_jit.h +++ b/Include/internal/pycore_jit.h @@ -23,7 +23,7 @@ typedef _Py_CODEUNIT *(*jit_func)( _PyStackRef _tos_cache0, _PyStackRef _tos_cache1, _PyStackRef _tos_cache2 ); -_Py_CODEUNIT *_PyJIT( +_Py_CODEUNIT *_PyJIT_Entry( _PyExecutorObject *executor, _PyInterpreterFrame *frame, _PyStackRef *stack_pointer, PyThreadState *tstate ); diff --git a/Include/internal/pycore_jit_unwind.h b/Include/internal/pycore_jit_unwind.h new file mode 100644 index 00000000000000..2d325ad9562286 --- /dev/null +++ b/Include/internal/pycore_jit_unwind.h @@ -0,0 +1,68 @@ +#ifndef Py_INTERNAL_JIT_UNWIND_H +#define Py_INTERNAL_JIT_UNWIND_H + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#include +#include + +#if defined(_Py_JIT) && defined(__linux__) && defined(__ELF__) +# define PY_HAVE_JIT_GDB_UNWIND +#endif + +#if defined(PY_HAVE_PERF_TRAMPOLINE) || defined(PY_HAVE_JIT_GDB_UNWIND) + +#if defined(PY_HAVE_JIT_GDB_UNWIND) +extern PyMutex _Py_jit_debug_mutex; +#endif + +/* DWARF exception-handling pointer encodings shared by JIT unwind users. 
*/ +enum { + DWRF_EH_PE_absptr = 0x00, + DWRF_EH_PE_omit = 0xff, + + /* Data type encodings */ + DWRF_EH_PE_uleb128 = 0x01, + DWRF_EH_PE_udata2 = 0x02, + DWRF_EH_PE_udata4 = 0x03, + DWRF_EH_PE_udata8 = 0x04, + DWRF_EH_PE_sleb128 = 0x09, + DWRF_EH_PE_sdata2 = 0x0a, + DWRF_EH_PE_sdata4 = 0x0b, + DWRF_EH_PE_sdata8 = 0x0c, + DWRF_EH_PE_signed = 0x08, + + /* Reference type encodings */ + DWRF_EH_PE_pcrel = 0x10, + DWRF_EH_PE_textrel = 0x20, + DWRF_EH_PE_datarel = 0x30, + DWRF_EH_PE_funcrel = 0x40, + DWRF_EH_PE_aligned = 0x50, + DWRF_EH_PE_indirect = 0x80 +}; + +/* Return the size of the generated .eh_frame data for the given encoding. */ +size_t _PyJitUnwind_EhFrameSize(int absolute_addr); + +/* + * Build DWARF .eh_frame data for JIT code; returns size written or 0 on error. + * absolute_addr selects the FDE address encoding: + * - 0: PC-relative offsets (perf jitdump synthesized DSO). + * - nonzero: absolute addresses (GDB JIT in-memory ELF). + */ +size_t _PyJitUnwind_BuildEhFrame(uint8_t *buffer, size_t buffer_size, + const void *code_addr, size_t code_size, + int absolute_addr); + +void *_PyJitUnwind_GdbRegisterCode(const void *code_addr, + size_t code_size, + const char *entry, + const char *filename); + +void _PyJitUnwind_GdbUnregisterCode(void *handle); + +#endif // defined(PY_HAVE_PERF_TRAMPOLINE) || defined(PY_HAVE_JIT_GDB_UNWIND) + +#endif // Py_INTERNAL_JIT_UNWIND_H diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index 230335ead3a3b6..4f8b48db23d1ef 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -1236,7 +1236,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[267] = { [LIST_APPEND] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG }, [LIST_EXTEND] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, [LOAD_ATTR] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_ERROR_FLAG | 
HAS_ESCAPES_FLAG }, - [LOAD_ATTR_CLASS] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_EXIT_FLAG | HAS_ESCAPES_FLAG }, + [LOAD_ATTR_CLASS] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_EXIT_FLAG | HAS_ESCAPES_FLAG | HAS_RECORDS_VALUE_FLAG }, [LOAD_ATTR_CLASS_WITH_METACLASS_CHECK] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_EXIT_FLAG | HAS_ESCAPES_FLAG | HAS_RECORDS_VALUE_FLAG }, [LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_NAME_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_SYNC_SP_FLAG | HAS_NEEDS_GUARD_IP_FLAG | HAS_RECORDS_VALUE_FLAG }, [LOAD_ATTR_INSTANCE_VALUE] = { true, INSTR_FMT_IBC00000000, HAS_ARG_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ESCAPES_FLAG | HAS_RECORDS_VALUE_FLAG }, @@ -1460,8 +1460,8 @@ _PyOpcode_macro_expansion[256] = { [LIST_APPEND] = { .nuops = 1, .uops = { { _LIST_APPEND, OPARG_SIMPLE, 0 } } }, [LIST_EXTEND] = { .nuops = 2, .uops = { { _LIST_EXTEND, OPARG_SIMPLE, 0 }, { _POP_TOP, OPARG_SIMPLE, 0 } } }, [LOAD_ATTR] = { .nuops = 1, .uops = { { _LOAD_ATTR, OPARG_SIMPLE, 8 } } }, - [LOAD_ATTR_CLASS] = { .nuops = 3, .uops = { { _CHECK_ATTR_CLASS, 2, 1 }, { _LOAD_ATTR_CLASS, 4, 5 }, { _PUSH_NULL_CONDITIONAL, OPARG_SIMPLE, 9 } } }, - [LOAD_ATTR_CLASS_WITH_METACLASS_CHECK] = { .nuops = 5, .uops = { { _RECORD_TOS_TYPE, OPARG_SIMPLE, 1 }, { _GUARD_TYPE_VERSION, 2, 1 }, { _CHECK_ATTR_CLASS, 2, 3 }, { _LOAD_ATTR_CLASS, 4, 5 }, { _PUSH_NULL_CONDITIONAL, OPARG_SIMPLE, 9 } } }, + [LOAD_ATTR_CLASS] = { .nuops = 4, .uops = { { _RECORD_TOS, OPARG_SIMPLE, 1 }, { _CHECK_ATTR_CLASS, 2, 1 }, { _LOAD_ATTR_CLASS, 4, 5 }, { _PUSH_NULL_CONDITIONAL, OPARG_SIMPLE, 9 } } }, + [LOAD_ATTR_CLASS_WITH_METACLASS_CHECK] = { .nuops = 5, .uops = { { _RECORD_TOS, OPARG_SIMPLE, 1 }, { _GUARD_TYPE_VERSION, 2, 1 }, { _CHECK_ATTR_CLASS, 2, 3 }, { _LOAD_ATTR_CLASS, 4, 5 }, { _PUSH_NULL_CONDITIONAL, OPARG_SIMPLE, 9 } } }, [LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN] = { .nuops = 7, .uops = { { _RECORD_TOS_TYPE, OPARG_SIMPLE, 1 }, 
{ _GUARD_TYPE_VERSION, 2, 1 }, { _CHECK_PEP_523, OPARG_SIMPLE, 3 }, { _LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN_FRAME, 2, 3 }, { _LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN_FRAME, OPERAND1_4, 5 }, { _SAVE_RETURN_OFFSET, OPARG_SAVE_RETURN_OFFSET, 9 }, { _PUSH_FRAME, OPARG_SIMPLE, 9 } } }, [LOAD_ATTR_INSTANCE_VALUE] = { .nuops = 6, .uops = { { _RECORD_TOS_TYPE, OPARG_SIMPLE, 1 }, { _GUARD_TYPE_VERSION, 2, 1 }, { _CHECK_MANAGED_OBJECT_HAS_VALUES, OPARG_SIMPLE, 3 }, { _LOAD_ATTR_INSTANCE_VALUE, 1, 3 }, { _POP_TOP, OPARG_SIMPLE, 4 }, { _PUSH_NULL_CONDITIONAL, OPARG_SIMPLE, 9 } } }, [LOAD_ATTR_METHOD_LAZY_DICT] = { .nuops = 4, .uops = { { _RECORD_TOS_TYPE, OPARG_SIMPLE, 1 }, { _GUARD_TYPE_VERSION, 2, 1 }, { _CHECK_ATTR_METHOD_LAZY_DICT, 1, 3 }, { _LOAD_ATTR_METHOD_LAZY_DICT, 4, 5 } } }, diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index f356d60ae5c7a7..a0727c045e51ec 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -198,6 +198,7 @@ typedef struct _PyExecutorObject { uint32_t code_size; size_t jit_size; void *jit_code; + void *jit_gdb_handle; _PyExitData exits[1]; } _PyExecutorObject; diff --git a/Include/internal/pycore_runtime_init_generated.h b/Include/internal/pycore_runtime_init_generated.h index 1ce91dc51ea0b7..892c3cdd9623a2 100644 --- a/Include/internal/pycore_runtime_init_generated.h +++ b/Include/internal/pycore_runtime_init_generated.h @@ -1580,6 +1580,7 @@ extern "C" { INIT_ID(alias), \ INIT_ID(align), \ INIT_ID(all), \ + INIT_ID(all_interpreters), \ INIT_ID(all_threads), \ INIT_ID(allow_code), \ INIT_ID(alphabet), \ diff --git a/Include/internal/pycore_unicodeobject_generated.h b/Include/internal/pycore_unicodeobject_generated.h index c7c23494845e01..f0fc3c4f5b0900 100644 --- a/Include/internal/pycore_unicodeobject_generated.h +++ b/Include/internal/pycore_unicodeobject_generated.h @@ -1000,6 +1000,10 @@ _PyUnicode_InitStaticStrings(PyInterpreterState *interp) { 
_PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); assert(PyUnicode_GET_LENGTH(string) != 1); + string = &_Py_ID(all_interpreters); + _PyUnicode_InternStatic(interp, &string); + assert(_PyUnicode_CheckConsistency(string, 1)); + assert(PyUnicode_GET_LENGTH(string) != 1); string = &_Py_ID(all_threads); _PyUnicode_InternStatic(interp, &string); assert(_PyUnicode_CheckConsistency(string, 1)); diff --git a/Lib/random.py b/Lib/random.py index c89cbb755abac8..726a71e782893c 100644 --- a/Lib/random.py +++ b/Lib/random.py @@ -836,7 +836,11 @@ def binomialvariate(self, n=1, p=0.5): if not c: return x while True: - y += _floor(_log2(random()) / c) + 1 + try: + y += _floor(_log2(random()) / c) + 1 + except ValueError: + # Reject case where random() returned 0.0 + continue if y > n: return x x += 1 diff --git a/Lib/test/test_ast/test_ast.py b/Lib/test/test_ast/test_ast.py index a7d5a51a2aa4d5..fcc6c93eb86981 100644 --- a/Lib/test/test_ast/test_ast.py +++ b/Lib/test/test_ast/test_ast.py @@ -1,4 +1,5 @@ import _ast_unparse +import _ast import ast import builtins import contextlib @@ -85,7 +86,9 @@ def _assertTrueorder(self, ast_node, parent_pos): self.assertEqual(ast_node._fields, ast_node.__match_args__) def test_AST_objects(self): - x = ast.AST() + # Directly instantiating abstract node class AST is allowed (but deprecated) + with self.assertWarns(DeprecationWarning): + x = ast.AST() self.assertEqual(x._fields, ()) x.foobar = 42 self.assertEqual(x.foobar, 42) @@ -94,7 +97,7 @@ def test_AST_objects(self): with self.assertRaises(AttributeError): x.vararg - with self.assertRaises(TypeError): + with self.assertRaises(TypeError), self.assertWarns(DeprecationWarning): # "ast.AST constructor takes 0 positional arguments" ast.AST(2) @@ -110,15 +113,21 @@ def cleanup(): msg = "type object 'ast.AST' has no attribute '_fields'" # Both examples used to crash: - with self.assertRaisesRegex(AttributeError, msg): + with ( + 
self.assertRaisesRegex(AttributeError, msg), + self.assertWarns(DeprecationWarning), + ): ast.AST(arg1=123) - with self.assertRaisesRegex(AttributeError, msg): + with ( + self.assertRaisesRegex(AttributeError, msg), + self.assertWarns(DeprecationWarning), + ): ast.AST() - def test_AST_garbage_collection(self): + def test_node_garbage_collection(self): class X: pass - a = ast.AST() + a = ast.Module() a.x = X() a.x.a = a ref = weakref.ref(a.x) @@ -439,7 +448,15 @@ def _construct_ast_class(self, cls): elif typ is object: kwargs[name] = b'capybara' elif isinstance(typ, type) and issubclass(typ, ast.AST): - kwargs[name] = self._construct_ast_class(typ) + if _ast._is_abstract(typ): + # Use an arbitrary concrete subclass + concrete = next((sub for sub in typ.__subclasses__() + if not _ast._is_abstract(sub)), None) + msg = f"abstract node class {typ} has no concrete subclasses" + self.assertIsNotNone(concrete, msg) + else: + concrete = typ + kwargs[name] = self._construct_ast_class(concrete) return cls(**kwargs) def test_arguments(self): @@ -578,6 +595,10 @@ def test_nodeclasses(self): with self.assertRaisesRegex(TypeError, re.escape(msg)): ast.BinOp(1, 2, 3, foobarbaz=42) + # Directly instantiating abstract node types is allowed (but deprecated) + self.assertWarns(DeprecationWarning, ast.stmt) + self.assertWarns(DeprecationWarning, ast.expr_context) + def test_no_fields(self): # this used to fail because Sub._fields was None x = ast.Sub() @@ -585,7 +606,10 @@ def test_no_fields(self): def test_invalid_sum(self): pos = dict(lineno=2, col_offset=3) - m = ast.Module([ast.Expr(ast.expr(**pos), **pos)], []) + with self.assertWarns(DeprecationWarning): + # Creating instances of ast.expr is deprecated + e = ast.expr(**pos) + m = ast.Module([ast.Expr(e, **pos)], []) with self.assertRaises(TypeError) as cm: compile(m, "", "exec") self.assertIn("but got expr()", str(cm.exception)) @@ -1107,14 +1131,19 @@ class CopyTests(unittest.TestCase): def iter_ast_classes(): """Iterate over 
the (native) subclasses of ast.AST recursively. - This excludes the special class ast.Index since its constructor - returns an integer. + This excludes: + * abstract AST nodes + * the special class ast.Index, since its constructor returns + an integer. """ def do(cls): if cls.__module__ != 'ast': return if cls is ast.Index: return + # Don't attempt to create instances of abstract AST nodes + if _ast._is_abstract(cls): + return yield cls for sub in cls.__subclasses__(): diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py index ec7192b1b89184..401136de8de666 100644 --- a/Lib/test/test_external_inspection.py +++ b/Lib/test/test_external_inspection.py @@ -559,6 +559,75 @@ def test_self_trace_after_ctypes_import(self): f"stdout: {result.stdout}\nstderr: {result.stderr}" ) + @skip_if_not_supported + @unittest.skipIf( + sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, + "Test only runs on Linux with process_vm_readv support", + ) + def test_remote_stack_trace_non_ascii_names(self): + # Exercise each PyUnicode kind (1-byte non-ASCII, 2-byte BMP, + # 4-byte non-BMP) for both the filename and the function name + # reported in the stack trace. 
+ latin1 = "zażółć" # 1-byte non-ASCII (forces non-ASCII path) + bmp = "λάμβδα" # 2-byte BMP + astral = "𐌀𐌁𐌂𐌃" # 4-byte non-BMP (Old Italic; XID, no NFKC fold) + func_name = f"{latin1}_{bmp}_{astral}" + script_basename = f"mod_{latin1}_{bmp}_{astral}" + + port = find_unused_port() + script = textwrap.dedent( + f"""\ + import socket + import time + + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.connect(('localhost', {port})) + + def {func_name}(): + sock.sendall(b"ready") + time.sleep(10_000) + + {func_name}() + """ + ) + with os_helper.temp_dir() as work_dir: + script_dir = os.path.join(work_dir, "script_pkg") + os.mkdir(script_dir) + script_name = _make_test_script(script_dir, script_basename, script) + + server_socket = _create_server_socket(port) + client_socket = None + try: + p = subprocess.Popen([sys.executable, script_name]) + client_socket, _ = server_socket.accept() + server_socket.close() + _wait_for_signal(client_socket, b"ready") + + stack_trace = get_stack_trace(p.pid) + except PermissionError: + self.skipTest("Insufficient permissions to read the stack trace") + finally: + if client_socket is not None: + client_socket.close() + p.kill() + p.wait(timeout=SHORT_TIMEOUT) + + frames = [ + frame + for interp in stack_trace + for thread in interp.threads + for frame in thread.frame_info + ] + target = next( + (f for f in frames if f.funcname == func_name), None + ) + self.assertIsNotNone( + target, + f"Frame for {func_name!r} missing; got " + f"{[(f.filename, f.funcname) for f in frames]}", + ) + self.assertEqual(target.filename, script_name) + @skip_if_not_supported @unittest.skipIf( sys.platform == "linux" and not PROCESS_VM_READV_SUPPORTED, diff --git a/Lib/test/test_gc_stats.py b/Lib/test/test_gc_stats.py new file mode 100644 index 00000000000000..59365ad45b32c9 --- /dev/null +++ b/Lib/test/test_gc_stats.py @@ -0,0 +1,350 @@ +import gc +import os +import textwrap +import time +import unittest + +from test.support import ( + 
Py_GIL_DISABLED, + import_helper, + requires_gil_enabled, + requires_remote_subprocess_debugging, +) +from test.test_profiling.test_sampling_profiler.helpers import test_subprocess + +try: + import _remote_debugging # noqa: F401 +except ImportError: + raise unittest.SkipTest( + "Test only runs when _remote_debugging is available" + ) + + +GC_STATS_FIELDS = ( + "gen", "iid", "ts_start", "ts_stop", "collections", "collected", + "uncollectable", "candidates", "duration") + + +def get_interpreter_identifiers(gc_stats) -> tuple[int,...]: + return tuple(sorted({s.iid for s in gc_stats})) + + +def get_generations(gc_stats) -> tuple[int,int,int]: + generations = set() + for s in gc_stats: + generations.add(s.gen) + + return tuple(sorted(generations)) + + +def get_last_item(gc_stats, generation: int, iid: int): + item = None + for s in gc_stats: + if s.gen == generation and s.iid == iid: + if item is None or item.ts_start < s.ts_start: + item = s + + return item + + +def has_local_process_debugging(): + try: + return _remote_debugging.is_python_process(os.getpid()) + except Exception: + return False + + +def check_gc_stats_fields(testcase, stats): + testcase.assertIsInstance(stats, list) + testcase.assertGreater(len(stats), 0) + for item in stats: + testcase.assertIsInstance(item, _remote_debugging.GCStatsInfo) + testcase.assertEqual(type(item).__match_args__, GC_STATS_FIELDS) + testcase.assertEqual(len(item), len(GC_STATS_FIELDS)) + + +def gc_stats_counters_advanced(before_stats, after_stats, generations, iid): + for generation in generations: + before = get_last_item(before_stats, generation, iid) + after = get_last_item(after_stats, generation, iid) + if after is None or before is None: + return False + if after.duration <= before.duration: + return False + if after.candidates <= before.candidates: + return False + return True + + +@unittest.skipUnless( + has_local_process_debugging(), "requires local process debugging") +class TestLocalGCStats(unittest.TestCase): + + 
_main_iid = 0 # main interpreter ID + + def test_gc_stats_fields(self): + monitor = _remote_debugging.GCMonitor(os.getpid(), debug=True) + stats = monitor.get_gc_stats(all_interpreters=False) + check_gc_stats_fields(self, stats) + + def test_module_get_gc_stats_fields(self): + stats = _remote_debugging.get_gc_stats( + os.getpid(), all_interpreters=False) + check_gc_stats_fields(self, stats) + + def test_all_interpreters_filter_for_local_process(self): + interpreters = import_helper.import_module("concurrent.interpreters") + source = """ + import gc + objects = [] + obj = [] + obj.append(obj) + objects.append(obj) + gc.collect(0) + gc.collect(1) + gc.collect(2) + """ + interp = interpreters.create() + try: + interp.exec(textwrap.dedent(source)) + for generation in range(3): + gc.collect(generation) + + main_stats = _remote_debugging.get_gc_stats( + os.getpid(), all_interpreters=False) + all_stats = _remote_debugging.get_gc_stats( + os.getpid(), all_interpreters=True) + finally: + interp.close() + + self.assertEqual(get_interpreter_identifiers(main_stats), (0,)) + self.assertIn(0, get_interpreter_identifiers(all_stats)) + self.assertGreater(len(get_interpreter_identifiers(all_stats)), 1) + self.assertEqual(get_generations(main_stats), (0, 1, 2)) + self.assertEqual(get_generations(all_stats), (0, 1, 2)) + for iid in get_interpreter_identifiers(all_stats): + for generation in range(3): + self.assertIsNotNone(get_last_item(all_stats, generation, iid)) + + @unittest.skipUnless(Py_GIL_DISABLED, "requires free-threaded GC") + def test_gc_stats_counters_for_main_interpreter_free_threaded(self): + generations = (0, 1, 2) + before_stats = _remote_debugging.get_gc_stats( + os.getpid(), all_interpreters=False) + for generation in generations: + self.assertIsNotNone( + get_last_item(before_stats, generation, self._main_iid)) + + objects = [] + for _ in range(1000): + obj = [] + obj.append(obj) + objects.append(obj) + for generation in generations: + gc.collect(generation) + + 
after_stats = _remote_debugging.get_gc_stats( + os.getpid(), all_interpreters=False) + self.assertTrue( + gc_stats_counters_advanced( + before_stats, after_stats, generations, self._main_iid), + (before_stats, after_stats) + ) + + +@requires_remote_subprocess_debugging() +class TestGCStats(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls._main_iid = 0 # main interpreter ID + cls._main_interpreter_script = ''' + import gc + import time + + gc.collect(0) + gc.collect(1) + gc.collect(2) + + _test_sock.sendall(b"working") + objects = [] + while True: + if len(objects) > 100: + objects = [] + + obj = [] + obj.append(obj) + objects.append(obj) + + time.sleep(0.1) + gc.collect(0) + gc.collect(1) + gc.collect(2) + ''' + cls._script = ''' + import concurrent.interpreters as interpreters + import gc + import time + + source = """if True: + import gc + + if "objects" not in globals(): + objects = [] + if len(objects) > 100: + objects = [] + + obj = [] + obj.append(obj) + objects.append(obj) + + gc.collect(0) + gc.collect(1) + gc.collect(2) + """ + + if {0}: + interp = interpreters.create() + interp.exec(source) + + gc.collect(0) + gc.collect(1) + gc.collect(2) + + _test_sock.sendall(b"working") + objects = [] + while True: + if len(objects) > 100: + objects = [] + + obj = [] + obj.append(obj) + objects.append(obj) + + time.sleep(0.1) + if {0}: + interp.exec(source) + gc.collect(0) + gc.collect(1) + gc.collect(2) + ''' + + def _gc_stats_advanced(self, before_stats, after_stats, generations): + for generation in generations: + before = get_last_item(before_stats, generation, self._main_iid) + after = get_last_item(after_stats, generation, self._main_iid) + if after is None or before is None: + return False + if after.ts_stop <= before.ts_stop: + return False + return True + + def _collect_gc_stats(self, script: str, all_interpreters: bool, + generations=(2,)): + with (test_subprocess(script, wait_for_working=True) as subproc): + monitor = 
_remote_debugging.GCMonitor(subproc.process.pid, debug=True) + before_stats = monitor.get_gc_stats(all_interpreters=all_interpreters) + for generation in generations: + before = get_last_item(before_stats, generation, self._main_iid) + self.assertIsNotNone(before) + + after_stats = before_stats + for _ in range(10): + time.sleep(0.5) + after_stats = monitor.get_gc_stats(all_interpreters=all_interpreters) + if self._gc_stats_advanced(before_stats, after_stats, generations): + break + else: + self.fail( + f"GC stats for generations {generations!r} did not " + f"advance: {before_stats!r} -> {after_stats!r}" + ) + + return before_stats, after_stats + + def _check_gc_stats(self, before, after): + self.assertIsNotNone(before) + self.assertIsNotNone(after) + + self.assertGreater(after.collections, before.collections, (before, after)) + self.assertGreater(after.ts_start, before.ts_start, (before, after)) + self.assertGreater(after.ts_stop, before.ts_stop, (before, after)) + self.assertGreater(after.duration, before.duration, (before, after)) + + self.assertGreater(after.candidates, before.candidates, (before, after)) + + # may not grow + self.assertGreaterEqual(after.collected, before.collected, (before, after)) + self.assertGreaterEqual(after.uncollectable, before.uncollectable, (before, after)) + + def _check_interpreter_gc_stats(self, before_stats, after_stats): + before_iids = get_interpreter_identifiers(before_stats) + after_iids = get_interpreter_identifiers(after_stats) + + self.assertEqual(before_iids, after_iids) + + self.assertEqual(get_generations(before_stats), (0, 1, 2)) + self.assertEqual(get_generations(after_stats), (0, 1, 2)) + + for iid in after_iids: + with self.subTest(f"interpreter id={iid}"): + before_last_items = (get_last_item(before_stats, 0, iid), + get_last_item(before_stats, 1, iid), + get_last_item(before_stats, 2, iid)) + + after_last_items = (get_last_item(after_stats, 0, iid), + get_last_item(after_stats, 1, iid), + 
get_last_item(after_stats, 2, iid)) + + for before, after in zip(before_last_items, after_last_items): + self._check_gc_stats(before, after) + + def test_gc_stats_timestamps_for_main_interpreter(self): + script = textwrap.dedent(self._main_interpreter_script) + before_stats, after_stats = self._collect_gc_stats( + script, False, generations=(0, 1, 2)) + + for generation in range(3): + with self.subTest(generation=generation): + before = get_last_item(before_stats, generation, self._main_iid) + after = get_last_item(after_stats, generation, self._main_iid) + + self.assertIsNotNone(before) + self.assertIsNotNone(after) + self.assertGreater( + after.collections, before.collections, + (before, after)) + self.assertGreater( + after.ts_start, before.ts_start, + (before, after)) + self.assertGreater( + after.ts_stop, before.ts_stop, + (before, after)) + + @requires_gil_enabled() + def test_gc_stats_for_main_interpreter(self): + script = textwrap.dedent(self._script.format(False)) + before_stats, after_stats = self._collect_gc_stats(script, False) + + self._check_interpreter_gc_stats(before_stats, after_stats) + + @requires_gil_enabled() + def test_gc_stats_for_main_interpreter_if_subinterpreter_exists(self): + script = textwrap.dedent(self._script.format(True)) + before_stats, after_stats = self._collect_gc_stats(script, False) + + self._check_interpreter_gc_stats(before_stats, after_stats) + + @requires_gil_enabled() + def test_gc_stats_for_all_interpreters(self): + script = textwrap.dedent(self._script.format(True)) + before_stats, after_stats = self._collect_gc_stats(script, True) + + before_iids = get_interpreter_identifiers(before_stats) + after_iids = get_interpreter_identifiers(after_stats) + + self.assertGreater(len(before_iids), 1) + self.assertGreater(len(after_iids), 1) + self.assertEqual(before_iids, after_iids) + + self._check_interpreter_gc_stats(before_stats, after_stats) diff --git a/Lib/test/test_gdb/gdb_jit_sample.py b/Lib/test/test_gdb/gdb_jit_sample.py 
new file mode 100644 index 00000000000000..b439e82e8b312f --- /dev/null +++ b/Lib/test/test_gdb/gdb_jit_sample.py @@ -0,0 +1,27 @@ +# Sample script for use by test_gdb.test_jit + +import _testinternalcapi +import operator + + +WARMUP_ITERATIONS = _testinternalcapi.TIER2_THRESHOLD + 10 + + +def jit_bt_hot(depth, warming_up_caller=False): + if depth == 0: + if not warming_up_caller: + id(42) + return + + for iteration in range(WARMUP_ITERATIONS): + operator.call( + jit_bt_hot, + depth - 1, + warming_up_caller or iteration + 1 != WARMUP_ITERATIONS, + ) + + +# Warm the shared shim once without hitting builtin_id so the real run uses +# the steady-state shim path when GDB breaks inside id(42). +jit_bt_hot(1, warming_up_caller=True) +jit_bt_hot(1) diff --git a/Lib/test/test_gdb/test_jit.py b/Lib/test/test_gdb/test_jit.py new file mode 100644 index 00000000000000..ea88d7b0a1fe75 --- /dev/null +++ b/Lib/test/test_gdb/test_jit.py @@ -0,0 +1,201 @@ +import os +import platform +import re +import sys +import unittest + +from .util import setup_module, DebuggerTests + + +JIT_SAMPLE_SCRIPT = os.path.join(os.path.dirname(__file__), "gdb_jit_sample.py") +# In batch GDB, break in builtin_id() while it is running under JIT, +# then repeatedly "finish" until the selected frame is the JIT executor. +# That gives a deterministic backtrace starting with py::jit:executor. +# +# builtin_id() sits only a few helper frames above the JIT entry on this path. +# This bound is just a generous upper limit so the test fails clearly if the +# expected stack shape changes. +MAX_FINISH_STEPS = 20 +# After landing on the JIT entry frame, single-step a bounded number of +# instructions further into the blob so the backtrace is taken from JIT code +# itself rather than the immediate helper-return site. 
The exact number of +# steps is not significant: each step is cross-checked against the selected +# frame's symbol so the test fails loudly if stepping escapes the registered +# JIT region, instead of asserting against a misleading backtrace. +MAX_JIT_ENTRY_STEPS = 4 +EVAL_FRAME_RE = r"(_PyEval_EvalFrameDefault|_PyEval_Vector)" +JIT_EXECUTOR_FRAME = "py::jit:executor" +JIT_ENTRY_SYMBOL = "_PyJIT_Entry" +BACKTRACE_FRAME_RE = re.compile(r"^#\d+\s+.*$", re.MULTILINE) + +FINISH_TO_JIT_EXECUTOR = ( + "python exec(\"import gdb\\n" + f"target = {JIT_EXECUTOR_FRAME!r}\\n" + f"for _ in range({MAX_FINISH_STEPS}):\\n" + " frame = gdb.selected_frame()\\n" + " if frame is not None and frame.name() == target:\\n" + " break\\n" + " gdb.execute('finish')\\n" + "else:\\n" + " raise RuntimeError('did not reach %s' % target)\\n\")" +) +STEP_INSIDE_JIT_EXECUTOR = ( + "python exec(\"import gdb\\n" + f"target = {JIT_EXECUTOR_FRAME!r}\\n" + f"for _ in range({MAX_JIT_ENTRY_STEPS}):\\n" + " frame = gdb.selected_frame()\\n" + " if frame is None or frame.name() != target:\\n" + " raise RuntimeError('left JIT region during stepping: '\\n" + " + repr(frame and frame.name()))\\n" + " gdb.execute('si')\\n" + "frame = gdb.selected_frame()\\n" + "if frame is None or frame.name() != target:\\n" + " raise RuntimeError('stepped out of JIT region after si')\\n\")" +) + + +def setUpModule(): + setup_module() + + +# The GDB JIT interface registration is gated on __linux__ && __ELF__ in +# Python/jit_unwind.c, and the synthetic EH-frame is only implemented for +# x86_64 and AArch64 (a #error fires otherwise). Skip cleanly on other +# platforms or architectures instead of producing timeouts / empty backtraces. +# is_enabled() implies is_available() and also implies that the runtime has +# JIT execution active; interpreter-only tier 2 builds don't hit this path. 
+@unittest.skipUnless(sys.platform == "linux", + "GDB JIT interface is only implemented for Linux + ELF") +@unittest.skipUnless(platform.machine() in ("x86_64", "aarch64"), + "GDB JIT CFI emitter only supports x86_64 and AArch64") +@unittest.skipUnless(hasattr(sys, "_jit") and sys._jit.is_enabled(), + "requires a JIT-enabled build with JIT execution active") +class JitBacktraceTests(DebuggerTests): + def get_stack_trace(self, **kwargs): + # These tests validate the JIT-relevant part of the backtrace via + # _assert_jit_backtrace_shape, so an unrelated "?? ()" frame below + # the JIT/eval segment (e.g. libc without debug info) is tolerable. + kwargs.setdefault("skip_on_truncation", False) + return super().get_stack_trace(**kwargs) + + def _extract_backtrace_frames(self, gdb_output): + frames = BACKTRACE_FRAME_RE.findall(gdb_output) + self.assertGreater( + len(frames), 0, + f"expected at least one GDB backtrace frame in output:\n{gdb_output}", + ) + return frames + + def _assert_jit_backtrace_shape(self, gdb_output, *, anchor_at_top): + # Shape assertions applied to every JIT backtrace we produce: + # 1. The synthetic JIT symbol appears exactly once. A second + # py::jit:executor frame would mean the unwinder is + # materializing two native frames for a single logical JIT + # region, or failing to unwind out of the region entirely. + # 2. The unwinder must climb directly back out of the JIT region + # into the eval loop. _PyJIT_Entry only exists to establish the + # physical frame; the synthetic executor FDE collapses it away. + # 3. For tests that assert a specific entry PC, the JIT frame + # is also at #0. 
+ frames = self._extract_backtrace_frames(gdb_output) + backtrace = "\n".join(frames) + + jit_frames = [frame for frame in frames if JIT_EXECUTOR_FRAME in frame] + jit_count = len(jit_frames) + self.assertEqual( + jit_count, 1, + f"expected exactly 1 {JIT_EXECUTOR_FRAME} frame, got {jit_count}\n" + f"backtrace:\n{backtrace}", + ) + eval_frames = [frame for frame in frames if re.search(EVAL_FRAME_RE, frame)] + eval_count = len(eval_frames) + self.assertGreaterEqual( + eval_count, 1, + f"expected at least one _PyEval_* frame, got {eval_count}\n" + f"backtrace:\n{backtrace}", + ) + jit_frame_index = next( + i for i, frame in enumerate(frames) if JIT_EXECUTOR_FRAME in frame + ) + frames_after_jit = frames[jit_frame_index + 1:] + first_eval_offset = next( + ( + i for i, frame in enumerate(frames_after_jit) + if re.search(EVAL_FRAME_RE, frame) + ), + None, + ) + self.assertIsNotNone( + first_eval_offset, + f"expected an eval frame after the JIT frame\n" + f"backtrace:\n{backtrace}", + ) + unexpected_between = frames_after_jit[:first_eval_offset] + self.assertFalse( + unexpected_between, + "expected the executor frame to unwind directly into eval\n" + f"backtrace:\n{backtrace}", + ) + relevant_end = max( + i + for i, frame in enumerate(frames) + if ( + JIT_EXECUTOR_FRAME in frame + or re.search(EVAL_FRAME_RE, frame) + ) + ) + truncated_frames = [ + frame for frame in frames[: relevant_end + 1] + if " ?? ()" in frame + ] + self.assertFalse( + truncated_frames, + "unexpected truncated frame before the validated JIT/eval segment\n" + f"backtrace:\n{backtrace}", + ) + if anchor_at_top: + self.assertRegex( + frames[0], + re.compile(rf"^#0\s+{re.escape(JIT_EXECUTOR_FRAME)}"), + ) + + def test_bt_unwinds_through_jit_frames(self): + gdb_output = self.get_stack_trace( + script=JIT_SAMPLE_SCRIPT, + cmds_after_breakpoint=["bt"], + PYTHON_JIT="1", + ) + # The executor should appear as a named JIT frame and unwind back into + # the eval loop. 
+ self._assert_jit_backtrace_shape(gdb_output, anchor_at_top=False) + + def test_bt_handoff_from_jit_entry_to_executor(self): + gdb_output = self.get_stack_trace( + script=JIT_SAMPLE_SCRIPT, + breakpoint=JIT_ENTRY_SYMBOL, + cmds_after_breakpoint=[ + "delete 1", + "tbreak builtin_id", + "continue", + "bt", + ], + PYTHON_JIT="1", + ) + # If we stop first in the shim and then continue into the real JIT + # workload, the final backtrace should match the architecture's + # executor unwind contract. + self._assert_jit_backtrace_shape(gdb_output, anchor_at_top=False) + + def test_bt_unwinds_from_inside_jit_executor(self): + gdb_output = self.get_stack_trace( + script=JIT_SAMPLE_SCRIPT, + cmds_after_breakpoint=[ + FINISH_TO_JIT_EXECUTOR, + STEP_INSIDE_JIT_EXECUTOR, + "bt", + ], + PYTHON_JIT="1", + ) + # Once the selected PC is inside the JIT executor, we require that GDB + # identifies the JIT frame at #0 and keeps unwinding into _PyEval_*. + self._assert_jit_backtrace_shape(gdb_output, anchor_at_top=True) diff --git a/Lib/test/test_gdb/util.py b/Lib/test/test_gdb/util.py index 8097fd52ababe6..d903adcf2903f3 100644 --- a/Lib/test/test_gdb/util.py +++ b/Lib/test/test_gdb/util.py @@ -20,6 +20,27 @@ PYTHONHASHSEED = '123' +# gh-91960, bpo-40019: gdb reports these when the optimizer has dropped +# python-frame debug info; the test can't read what's not there. +_OPTIMIZED_OUT_PATTERNS = ( + '(frame information optimized out)', + 'Unable to read information on python frame', + '(unable to read python frame information)', +) +# gdb prints this when the unwinder genuinely failed to walk a frame — +# i.e. the CFI (ours or a library's) is wrong. Treat as a hard failure, +# not a skip, so regressions in our own unwind info don't hide. +_UNWIND_FAILURE_PATTERNS = ( + 'Backtrace stopped: frame did not save the PC', +) +# gh-104736: " ?? ()" in the bt usually means the unwinder bailed early, +# but can also be unrelated frames without debug info (e.g. libc). 
Tests +# that validate the JIT-relevant part of the backtrace themselves can +# opt out via skip_on_truncation=False. +_TRUNCATED_BACKTRACE_PATTERNS = ( + ' ?? ()', +) + def clean_environment(): # Remove PYTHON* environment variables such as PYTHONHOME @@ -160,7 +181,9 @@ def get_stack_trace(self, source=None, script=None, breakpoint=BREAKPOINT_FN, cmds_after_breakpoint=None, import_site=False, - ignore_stderr=False): + ignore_stderr=False, + skip_on_truncation=True, + **env_vars): ''' Run 'python -c SOURCE' under gdb with a breakpoint. @@ -239,7 +262,7 @@ def get_stack_trace(self, source=None, script=None, args += [script] # Use "args" to invoke gdb, capturing stdout, stderr: - out, err = run_gdb(*args, PYTHONHASHSEED=PYTHONHASHSEED) + out, err = run_gdb(*args, PYTHONHASHSEED=PYTHONHASHSEED, **env_vars) if not ignore_stderr: for line in err.splitlines(): @@ -255,26 +278,20 @@ def get_stack_trace(self, source=None, script=None, " because the Program Counter is" " not present") + for pattern in _UNWIND_FAILURE_PATTERNS: + if pattern in out: + raise AssertionError( + f"gdb unwinder failed ({pattern!r}) — CFI bug in our " + f"generated code or in a linked library.\n" + f"Full gdb output:\n{out}" + ) + # bpo-40019: Skip the test if gdb failed to read debug information # because the Python binary is optimized. - for pattern in ( - '(frame information optimized out)', - 'Unable to read information on python frame', - - # gh-91960: On Python built with "clang -Og", gdb gets - # "frame=" for _PyEval_EvalFrameDefault() parameter - '(unable to read python frame information)', - - # gh-104736: On Python built with "clang -Og" on ppc64le, - # "py-bt" displays a truncated or not traceback, but "where" - # logs this error message: - 'Backtrace stopped: frame did not save the PC', - - # gh-104736: When "bt" command displays something like: - # "#1 0x0000000000000000 in ?? ()", the traceback is likely - # truncated or wrong. - ' ?? 
 ()', - ): + patterns = _OPTIMIZED_OUT_PATTERNS + if skip_on_truncation: + patterns = patterns + _TRUNCATED_BACKTRACE_PATTERNS + for pattern in patterns: if pattern in out: raise unittest.SkipTest(f"{pattern!r} found in gdb output") diff --git a/Lib/test/test_perfmaps.py b/Lib/test/test_perfmaps.py index 647c32656abd6d..ee4eb50033c470 100644 --- a/Lib/test/test_perfmaps.py +++ b/Lib/test/test_perfmaps.py @@ -1,4 +1,5 @@ import os +import sys import sysconfig import unittest @@ -17,6 +18,9 @@ def supports_trampoline_profiling(): raise unittest.SkipTest("perf trampoline profiling not supported") class TestPerfMapWriting(unittest.TestCase): + def tearDown(self): + perf_map_state_teardown() + def test_write_perf_map_entry(self): self.assertEqual(write_perf_map_entry(0x1234, 5678, "entry1"), 0) self.assertEqual(write_perf_map_entry(0x2345, 6789, "entry2"), 0) @@ -24,4 +28,15 @@ def test_write_perf_map_entry(self): perf_file_contents = f.read() self.assertIn("1234 162e entry1", perf_file_contents) self.assertIn("2345 1a85 entry2", perf_file_contents) - perf_map_state_teardown() + + @unittest.skipIf(sys.maxsize <= 2**32, "requires size_t wider than unsigned int") + def test_write_perf_map_entry_large_size(self): + code_addr = 0x3456 + code_size = 1 << 33 + entry_name = "entry_big" + + self.assertEqual(write_perf_map_entry(code_addr, code_size, entry_name), 0) + with open(f"/tmp/perf-{os.getpid()}.map") as f: + perf_file_contents = f.read() + self.assertIn(f"{code_addr:x} {code_size:x} {entry_name}", + perf_file_contents) diff --git a/Lib/test/test_random.py b/Lib/test/test_random.py index 1e57b9244b4fd5..dbd3b855f536a0 100644 --- a/Lib/test/test_random.py +++ b/Lib/test/test_random.py @@ -1075,6 +1075,12 @@ def test_avg_std(self): msg='%s%r' % (variate.__name__, args)) self.assertAlmostEqual(s2/(N-1), sigmasqrd, places=2, msg='%s%r' % (variate.__name__, args)) + def test_binomialvariate_log_zero(self): + # gh-149222: random() returning 0.0 must not raise a math domain error + with 
unittest.mock.patch.object(random.Random, 'random', side_effect= [0.0] + [0.5] * 20): + result = random.binomialvariate(10, 0.5) + self.assertIsInstance(result, int) + self.assertIn(result, range(11)) def test_constant(self): g = random.Random() diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py index 34a773ee9376cd..3002fa528eab17 100644 --- a/Lib/test/test_sys.py +++ b/Lib/test/test_sys.py @@ -1908,7 +1908,7 @@ def test_pythontypes(self): check = self.check_sizeof # _ast.AST import _ast - check(_ast.AST(), size('P')) + check(_ast.Module(), size('3P')) try: raise TypeError except TypeError as e: diff --git a/Lib/test/test_xml_etree.py b/Lib/test/test_xml_etree.py index 43864820688bd4..8f3efe9fc90794 100644 --- a/Lib/test/test_xml_etree.py +++ b/Lib/test/test_xml_etree.py @@ -3197,7 +3197,7 @@ def test_deeply_nested_deepcopy(self): # This should raise a RecursionError and not crash. # See https://github.com/python/cpython/issues/148801. root = cur = ET.Element('s') - for _ in range(150_000): + for _ in range(500_000): cur = ET.SubElement(cur, 'u') with support.infinite_recursion(): with self.assertRaises(RecursionError): diff --git a/Makefile.pre.in b/Makefile.pre.in index 0edf55d991a05e..3c166470b6c143 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -514,6 +514,7 @@ PYTHON_OBJS= \ Python/suggestions.o \ Python/perf_trampoline.o \ Python/perf_jit_trampoline.o \ + Python/jit_unwind.o \ Python/remote_debugging.o \ Python/$(DYNLOADFILE) \ $(LIBOBJS) \ @@ -3209,7 +3210,8 @@ Python/emscripten_trampoline_wasm.c: Python/emscripten_trampoline_inner.wasm $(PYTHON_FOR_REGEN) $(srcdir)/Platforms/emscripten/prepare_external_wasm.py $< $@ getWasmTrampolineModule JIT_SHIM_BUILD_OBJS= @JIT_SHIM_BUILD_O@ -JIT_BUILD_TARGETS= jit_stencils.h @JIT_STENCILS_H@ $(JIT_SHIM_BUILD_OBJS) +JIT_UNWIND_INFO_H= $(if $(JIT_OBJS),jit_unwind_info.h $(patsubst jit_stencils-%.h,jit_unwind_info-%.h,@JIT_STENCILS_H@)) +JIT_BUILD_TARGETS= jit_stencils.h @JIT_STENCILS_H@ $(JIT_UNWIND_INFO_H) 
$(JIT_SHIM_BUILD_OBJS) JIT_TARGETS= $(JIT_BUILD_TARGETS) $(filter-out $(JIT_SHIM_BUILD_OBJS),$(JIT_OBJS)) JIT_GENERATED_STAMP= .jit-stamp @@ -3237,6 +3239,9 @@ jit_shim-universal2-apple-darwin.o: jit_shim-aarch64-apple-darwin.o jit_shim-x86 Python/jit.o: $(srcdir)/Python/jit.c @JIT_STENCILS_H@ $(CC) -c $(PY_CORE_CFLAGS) -o $@ $< +Python/jit_unwind.o: $(srcdir)/Python/jit_unwind.c $(JIT_UNWIND_INFO_H) + $(CC) -c $(PY_CORE_CFLAGS) -o $@ $< + .PHONY: regen-jit regen-jit: $(JIT_TARGETS) @@ -3362,7 +3367,7 @@ clean-profile: clean-retain-profile clean-bolt # gh-141808: The JIT stencils are deliberately kept in clean-profile .PHONY: clean-jit-stencils clean-jit-stencils: - -rm -f $(JIT_TARGETS) $(JIT_GENERATED_STAMP) jit_stencils*.h jit_shim*.o + -rm -f $(JIT_TARGETS) $(JIT_GENERATED_STAMP) jit_stencils*.h jit_unwind_info*.h jit_shim*.o .PHONY: clean clean: clean-profile clean-jit-stencils @@ -3491,7 +3496,7 @@ MODULE__DECIMAL_DEPS=@LIBMPDEC_INTERNAL@ MODULE__ELEMENTTREE_DEPS=$(srcdir)/Modules/pyexpat.c @LIBEXPAT_INTERNAL@ MODULE__HASHLIB_DEPS=$(srcdir)/Modules/hashlib.h MODULE__IO_DEPS=$(srcdir)/Modules/_io/_iomodule.h -MODULE__REMOTE_DEBUGGING_DEPS=$(srcdir)/Modules/_remote_debugging/_remote_debugging.h +MODULE__REMOTE_DEBUGGING_DEPS=$(srcdir)/Modules/_remote_debugging/_remote_debugging.h $(srcdir)/Modules/_remote_debugging/gc_stats.h # HACL*-based cryptographic primitives MODULE__MD5_DEPS=$(srcdir)/Modules/hashlib.h $(LIBHACL_MD5_HEADERS) $(LIBHACL_MD5_LIB_@LIBHACL_LDEPS_LIBTYPE@) diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-08-16-12-56-08.gh-issue-116021.hMN9yw.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-08-16-12-56-08.gh-issue-116021.hMN9yw.rst new file mode 100644 index 00000000000000..967d8faaef3422 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-08-16-12-56-08.gh-issue-116021.hMN9yw.rst @@ -0,0 +1,2 @@ +Support for creating instances of abstract AST nodes from the :mod:`ast` module +is deprecated and scheduled for removal in Python 3.20. 
Patch by Brian Schubert. diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-03-17-20-30-17.gh-issue-126910.NaUwmD.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-17-20-30-17.gh-issue-126910.NaUwmD.rst new file mode 100644 index 00000000000000..4d2634d0dd1e81 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-17-20-30-17.gh-issue-126910.NaUwmD.rst @@ -0,0 +1 @@ +Add support for unwinding JIT frames using GDB. Patch by Diego Russo and Pablo Galindo. diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-04-10-23-13-19.gh-issue-146527.P3Xv4Q.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-10-23-13-19.gh-issue-146527.P3Xv4Q.rst new file mode 100644 index 00000000000000..e83b66f71da0ca --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-04-10-23-13-19.gh-issue-146527.P3Xv4Q.rst @@ -0,0 +1,5 @@ +Add a ``GCMonitor`` class with a ``get_gc_stats`` method to the +:mod:`!_remote_debugging` module to allow reading GC statistics from an +external Python process without requiring the full ``RemoteUnwinder`` +functionality. +Patch by Sergey Miryanov and Pablo Galindo. diff --git a/Misc/NEWS.d/next/Library/2026-05-01-10-20-27.gh-issue-149214.btP546.rst b/Misc/NEWS.d/next/Library/2026-05-01-10-20-27.gh-issue-149214.btP546.rst new file mode 100644 index 00000000000000..cbb05620626d0f --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-05-01-10-20-27.gh-issue-149214.btP546.rst @@ -0,0 +1,4 @@ +Fix :mod:`!_remote_debugging` misreading non-ASCII Unicode strings (Latin-1, +BMP and non-BMP) from a remote process. Filenames and function names that +contain non-ASCII characters are now reported correctly in stack traces, the +sampling profiler, and :mod:`asyncio` task introspection. 
diff --git a/Misc/NEWS.d/next/Library/2026-05-02-01-09-29.gh-issue-149221.__KOks.rst b/Misc/NEWS.d/next/Library/2026-05-02-01-09-29.gh-issue-149221.__KOks.rst new file mode 100644 index 00000000000000..fab2b0f6a23489 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2026-05-02-01-09-29.gh-issue-149221.__KOks.rst @@ -0,0 +1 @@ +Catch rare math domain error for :func:`random.binomialvariate`. diff --git a/Modules/Setup b/Modules/Setup index 7d816ead8432ef..33737c21cb4066 100644 --- a/Modules/Setup +++ b/Modules/Setup @@ -285,7 +285,7 @@ PYTHONPATH=$(COREPYTHONPATH) #*shared* #_ctypes_test _ctypes/_ctypes_test.c -#_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/threads.c _remote_debugging/asyncio.c +#_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/threads.c _remote_debugging/asyncio.c _remote_debugging/interpreters.c #_testcapi _testcapimodule.c #_testimportmultiple _testimportmultiple.c #_testmultiphase _testmultiphase.c diff --git a/Modules/Setup.stdlib.in b/Modules/Setup.stdlib.in index 0d520684c795d6..a274c312d99313 100644 --- a/Modules/Setup.stdlib.in +++ b/Modules/Setup.stdlib.in @@ -41,7 +41,7 @@ @MODULE__PICKLE_TRUE@_pickle _pickle.c @MODULE__QUEUE_TRUE@_queue _queuemodule.c @MODULE__RANDOM_TRUE@_random _randommodule.c -@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c _remote_debugging/frames.c _remote_debugging/frame_cache.c _remote_debugging/threads.c _remote_debugging/asyncio.c _remote_debugging/binary_io_writer.c _remote_debugging/binary_io_reader.c _remote_debugging/subprocess.c +@MODULE__REMOTE_DEBUGGING_TRUE@_remote_debugging _remote_debugging/module.c _remote_debugging/gc_stats.c _remote_debugging/object_reading.c _remote_debugging/code_objects.c 
_remote_debugging/frames.c _remote_debugging/frame_cache.c _remote_debugging/threads.c _remote_debugging/asyncio.c _remote_debugging/binary_io_writer.c _remote_debugging/binary_io_reader.c _remote_debugging/subprocess.c _remote_debugging/interpreters.c @MODULE__STRUCT_TRUE@_struct _struct.c # build supports subinterpreters diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h index 07738d45e42d24..7369cd1514c296 100644 --- a/Modules/_remote_debugging/_remote_debugging.h +++ b/Modules/_remote_debugging/_remote_debugging.h @@ -260,8 +260,10 @@ typedef struct { PyTypeObject *ThreadInfo_Type; PyTypeObject *InterpreterInfo_Type; PyTypeObject *AwaitedInfo_Type; + PyTypeObject *GCStatsInfo_Type; PyTypeObject *BinaryWriter_Type; PyTypeObject *BinaryReader_Type; + PyTypeObject *GCMonitor_Type; } RemoteDebuggingState; enum _ThreadState { @@ -346,6 +348,13 @@ typedef struct { size_t count; } StackChunkList; +typedef struct { + proc_handle_t handle; + uintptr_t runtime_start_address; + struct _Py_DebugOffsets debug_offsets; + int debug; +} RuntimeOffsets; + /* * Context for frame chain traversal operations. 
*/ @@ -376,6 +385,13 @@ typedef struct { int32_t tlbc_index; // Thread-local bytecode index (free-threading) } CodeObjectContext; +typedef struct { + PyObject_HEAD + RuntimeOffsets offsets; +} GCMonitorObject; + +#define GCMonitor_CAST(op) ((GCMonitorObject *)(op)) + /* Function pointer types for iteration callbacks */ typedef int (*thread_processor_func)( RemoteUnwinderObject *unwinder, @@ -390,6 +406,14 @@ typedef int (*set_entry_processor_func)( void *context ); +typedef int (*interpreter_processor_func)( + RuntimeOffsets *offsets, + uintptr_t interpreter_state_addr, + int64_t iid, + void *context +); + + /* ============================================================================ * STRUCTSEQ DESCRIPTORS (extern declarations) * ============================================================================ */ @@ -401,6 +425,7 @@ extern PyStructSequence_Desc CoroInfo_desc; extern PyStructSequence_Desc ThreadInfo_desc; extern PyStructSequence_Desc InterpreterInfo_desc; extern PyStructSequence_Desc AwaitedInfo_desc; +extern PyStructSequence_Desc GCStatsInfo_desc; /* ============================================================================ * UTILITY FUNCTION DECLARATIONS @@ -588,6 +613,17 @@ extern void _Py_RemoteDebug_InitThreadsState(RemoteUnwinderObject *unwinder, _Py extern int _Py_RemoteDebug_StopAllThreads(RemoteUnwinderObject *unwinder, _Py_RemoteDebug_ThreadsState *st); extern void _Py_RemoteDebug_ResumeAllThreads(RemoteUnwinderObject *unwinder, _Py_RemoteDebug_ThreadsState *st); +/* ============================================================================ + * INTERPRETER FUNCTION DECLARATIONS + * ============================================================================ */ + +extern int +iterate_interpreters( + RuntimeOffsets *offsets, + interpreter_processor_func processor, + void *context +); + /* ============================================================================ * ASYNCIO FUNCTION DECLARATIONS * 
============================================================================ */ diff --git a/Modules/_remote_debugging/clinic/module.c.h b/Modules/_remote_debugging/clinic/module.c.h index 15df48fabb56b2..179a7b97dd4e2f 100644 --- a/Modules/_remote_debugging/clinic/module.c.h +++ b/Modules/_remote_debugging/clinic/module.c.h @@ -495,6 +495,181 @@ _remote_debugging_RemoteUnwinder_resume_threads(PyObject *self, PyObject *Py_UNU return return_value; } +PyDoc_STRVAR(_remote_debugging_GCMonitor___init____doc__, +"GCMonitor(pid, *, debug=False)\n" +"--\n" +"\n" +"Initialize a new GCMonitor object for monitoring GC events from remote process.\n" +"\n" +"Args:\n" +" pid: Process ID of the target Python process to monitor\n" +" debug: If True, chain exceptions to explain the sequence of events that\n" +" lead to the exception.\n" +"\n" +"The GCMonitor provides functionality to read GC statistics from a running\n" +"Python process.\n" +"\n" +"Raises:\n" +" PermissionError: If access to the target process is denied\n" +" OSError: If unable to attach to the target process or access its memory\n" +" RuntimeError: If unable to read debug information from the target process"); + +static int +_remote_debugging_GCMonitor___init___impl(GCMonitorObject *self, int pid, + int debug); + +static int +_remote_debugging_GCMonitor___init__(PyObject *self, PyObject *args, PyObject *kwargs) +{ + int return_value = -1; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 2 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(pid), &_Py_ID(debug), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"pid", "debug", NULL}; + 
static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "GCMonitor", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + PyObject * const *fastargs; + Py_ssize_t nargs = PyTuple_GET_SIZE(args); + Py_ssize_t noptargs = nargs + (kwargs ? PyDict_GET_SIZE(kwargs) : 0) - 1; + int pid; + int debug = 0; + + fastargs = _PyArg_UnpackKeywords(_PyTuple_CAST(args)->ob_item, nargs, kwargs, NULL, &_parser, + /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!fastargs) { + goto exit; + } + pid = PyLong_AsInt(fastargs[0]); + if (pid == -1 && PyErr_Occurred()) { + goto exit; + } + if (!noptargs) { + goto skip_optional_kwonly; + } + debug = PyObject_IsTrue(fastargs[1]); + if (debug < 0) { + goto exit; + } +skip_optional_kwonly: + return_value = _remote_debugging_GCMonitor___init___impl((GCMonitorObject *)self, pid, debug); + +exit: + return return_value; +} + +PyDoc_STRVAR(_remote_debugging_GCMonitor_get_gc_stats__doc__, +"get_gc_stats($self, /, all_interpreters=False)\n" +"--\n" +"\n" +"Get garbage collector statistics from external Python process.\n" +"\n" +" all_interpreters\n" +" If True, return GC statistics from all interpreters.\n" +" If False, return only from main interpreter.\n" +"\n" +"Returns a list of GCStatsInfo objects with GC statistics data.\n" +"\n" +"Returns:\n" +" list of GCStatsInfo: A list of stats samples containing:\n" +" - gen: GC generation number.\n" +" - iid: Interpreter ID.\n" +" - ts_start: Raw timestamp at collection start.\n" +" - ts_stop: Raw timestamp at collection stop.\n" +" - collections: Total number of collections.\n" +" - collected: Total number of collected objects.\n" +" - uncollectable: Total number of uncollectable objects.\n" +" - candidates: Total objects considered and traversed.\n" +" - duration: Total collection time, in seconds.\n" +"\n" +"Raises:\n" +" RuntimeError: If the target process cannot be inspected or if its\n" +" debug offsets or GC stats layout are incompatible."); + 
+#define _REMOTE_DEBUGGING_GCMONITOR_GET_GC_STATS_METHODDEF \ + {"get_gc_stats", _PyCFunction_CAST(_remote_debugging_GCMonitor_get_gc_stats), METH_FASTCALL|METH_KEYWORDS, _remote_debugging_GCMonitor_get_gc_stats__doc__}, + +static PyObject * +_remote_debugging_GCMonitor_get_gc_stats_impl(GCMonitorObject *self, + int all_interpreters); + +static PyObject * +_remote_debugging_GCMonitor_get_gc_stats(PyObject *self, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 1 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(all_interpreters), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"all_interpreters", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "get_gc_stats", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[1]; + Py_ssize_t noptargs = nargs + (kwnames ? 
PyTuple_GET_SIZE(kwnames) : 0) - 0; + int all_interpreters = 0; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 0, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + if (!noptargs) { + goto skip_optional_pos; + } + all_interpreters = PyObject_IsTrue(args[0]); + if (all_interpreters < 0) { + goto exit; + } +skip_optional_pos: + Py_BEGIN_CRITICAL_SECTION(self); + return_value = _remote_debugging_GCMonitor_get_gc_stats_impl((GCMonitorObject *)self, all_interpreters); + Py_END_CRITICAL_SECTION(); + +exit: + return return_value; +} + PyDoc_STRVAR(_remote_debugging_BinaryWriter___init____doc__, "BinaryWriter(filename, sample_interval_us, start_time_us, *,\n" " compression=0)\n" @@ -1296,4 +1471,96 @@ _remote_debugging_is_python_process(PyObject *module, PyObject *const *args, Py_ exit: return return_value; } -/*[clinic end generated code: output=34f50b18f317b9b6 input=a9049054013a1b77]*/ + +PyDoc_STRVAR(_remote_debugging_get_gc_stats__doc__, +"get_gc_stats($module, /, pid, *, all_interpreters=False)\n" +"--\n" +"\n" +"Get garbage collector statistics from external Python process.\n" +"\n" +" all_interpreters\n" +" If True, return GC statistics from all interpreters.\n" +" If False, return only from main interpreter.\n" +"\n" +"Returns:\n" +" list of GCStatsInfo: A list of stats samples containing:\n" +" - gen: GC generation number.\n" +" - iid: Interpreter ID.\n" +" - ts_start: Raw timestamp at collection start.\n" +" - ts_stop: Raw timestamp at collection stop.\n" +" - collections: Total number of collections.\n" +" - collected: Total number of collected objects.\n" +" - uncollectable: Total number of uncollectable objects.\n" +" - candidates: Total objects considered and traversed.\n" +" - duration: Total collection time, in seconds.\n" +"\n" +"Raises:\n" +" RuntimeError: If the target process cannot be inspected or if its\n" +" debug offsets or GC stats layout are incompatible."); + +#define 
_REMOTE_DEBUGGING_GET_GC_STATS_METHODDEF \ + {"get_gc_stats", _PyCFunction_CAST(_remote_debugging_get_gc_stats), METH_FASTCALL|METH_KEYWORDS, _remote_debugging_get_gc_stats__doc__}, + +static PyObject * +_remote_debugging_get_gc_stats_impl(PyObject *module, int pid, + int all_interpreters); + +static PyObject * +_remote_debugging_get_gc_stats(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) +{ + PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 2 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(pid), &_Py_ID(all_interpreters), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"pid", "all_interpreters", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "get_gc_stats", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + Py_ssize_t noptargs = nargs + (kwnames ? 
PyTuple_GET_SIZE(kwnames) : 0) - 1; + int pid; + int all_interpreters = 0; + + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + pid = PyLong_AsInt(args[0]); + if (pid == -1 && PyErr_Occurred()) { + goto exit; + } + if (!noptargs) { + goto skip_optional_kwonly; + } + all_interpreters = PyObject_IsTrue(args[1]); + if (all_interpreters < 0) { + goto exit; + } +skip_optional_kwonly: + return_value = _remote_debugging_get_gc_stats_impl(module, pid, all_interpreters); + +exit: + return return_value; +} +/*[clinic end generated code: output=1151e58683dab9f4 input=a9049054013a1b77]*/ diff --git a/Modules/_remote_debugging/debug_offsets_validation.h b/Modules/_remote_debugging/debug_offsets_validation.h index 32800e767b3169..1507026306192e 100644 --- a/Modules/_remote_debugging/debug_offsets_validation.h +++ b/Modules/_remote_debugging/debug_offsets_validation.h @@ -31,7 +31,7 @@ #define FIELD_SIZE(type, member) sizeof(((type *)0)->member) enum { - PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 840, + PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 848, PY_REMOTE_ASYNC_DEBUG_OFFSETS_TOTAL_SIZE = 104, }; @@ -304,7 +304,9 @@ validate_fixed_field( #define PY_REMOTE_DEBUG_UNICODE_OBJECT_FIELDS(APPLY, buffer_size) \ APPLY(unicode_object, length, sizeof(Py_ssize_t), _Alignof(Py_ssize_t), buffer_size); \ - APPLY(unicode_object, asciiobject_size, sizeof(char), _Alignof(char), buffer_size) + APPLY(unicode_object, state, sizeof(struct _PyUnicodeObject_state), _Alignof(struct _PyUnicodeObject_state), buffer_size); \ + APPLY(unicode_object, asciiobject_size, sizeof(char), _Alignof(char), buffer_size); \ + APPLY(unicode_object, compactunicodeobject_size, sizeof(char), _Alignof(char), buffer_size) #define PY_REMOTE_DEBUG_GEN_OBJECT_FIELDS(APPLY, buffer_size) \ APPLY(gen_object, gi_frame_state, sizeof(int8_t), _Alignof(int8_t), buffer_size); \ @@ -370,6 +372,12 @@ 
_PyRemoteDebug_ValidateDebugOffsetsLayout(struct _Py_DebugOffsets *debug_offsets sizeof(uintptr_t), _Alignof(uintptr_t), SIZEOF_GC_RUNTIME_STATE); + PY_REMOTE_DEBUG_VALIDATE_FIELD( + gc, + generation_stats, + sizeof(uintptr_t), + _Alignof(uintptr_t), + SIZEOF_GC_RUNTIME_STATE); PY_REMOTE_DEBUG_VALIDATE_NESTED_FIELD( interpreter_state, gc, @@ -378,6 +386,14 @@ _PyRemoteDebug_ValidateDebugOffsetsLayout(struct _Py_DebugOffsets *debug_offsets sizeof(uintptr_t), _Alignof(uintptr_t), INTERP_STATE_BUFFER_SIZE); + PY_REMOTE_DEBUG_VALIDATE_NESTED_FIELD( + interpreter_state, + gc, + gc, + generation_stats, + sizeof(uintptr_t), + _Alignof(uintptr_t), + INTERP_STATE_BUFFER_SIZE); PY_REMOTE_DEBUG_VALIDATE_SECTION(interpreter_frame); PY_REMOTE_DEBUG_INTERPRETER_FRAME_FIELDS( diff --git a/Modules/_remote_debugging/gc_stats.c b/Modules/_remote_debugging/gc_stats.c new file mode 100644 index 00000000000000..852dc866153192 --- /dev/null +++ b/Modules/_remote_debugging/gc_stats.c @@ -0,0 +1,153 @@ +/****************************************************************************** + * Remote Debugging Module - GC Stats Functions + * + * This file contains functions for reading GC stats from interpreter state. 
+ ******************************************************************************/ + +#include "gc_stats.h" + +typedef struct { + PyObject *result; + PyTypeObject *gc_stats_info_type; + bool all_interpreters; +} GetGCStatsContext; + +static int +read_gc_stats(struct gc_stats *stats, int64_t iid, PyObject *result, + PyTypeObject *gc_stats_info_type) +{ +#define SET_FIELD(converter, expr) do { \ + PyObject *value = converter(expr); \ + if (value == NULL) { \ + goto error; \ + } \ + PyStructSequence_SetItem(item, field++, value); \ +} while (0) + + PyObject *item = NULL; + + for (unsigned long gen = 0; gen < NUM_GENERATIONS; gen++) { + struct gc_generation_stats *items; + int size; + if (gen == 0) { + items = (struct gc_generation_stats *)stats->young.items; + size = GC_YOUNG_STATS_SIZE; + } + else { + items = (struct gc_generation_stats *)stats->old[gen-1].items; + size = GC_OLD_STATS_SIZE; + } + for (int i = 0; i < size; i++, items++) { + item = PyStructSequence_New(gc_stats_info_type); + if (item == NULL) { + goto error; + } + Py_ssize_t field = 0; + + SET_FIELD(PyLong_FromUnsignedLong, gen); + SET_FIELD(PyLong_FromInt64, iid); + + SET_FIELD(PyLong_FromInt64, items->ts_start); + SET_FIELD(PyLong_FromInt64, items->ts_stop); + SET_FIELD(PyLong_FromSsize_t, items->collections); + SET_FIELD(PyLong_FromSsize_t, items->collected); + SET_FIELD(PyLong_FromSsize_t, items->uncollectable); + SET_FIELD(PyLong_FromSsize_t, items->candidates); + + SET_FIELD(PyFloat_FromDouble, items->duration); + + int rc = PyList_Append(result, item); + Py_CLEAR(item); + if (rc < 0) { + goto error; + } + } + } + +#undef SET_FIELD + + return 0; + +error: + Py_XDECREF(item); + + return -1; +} + +static int +get_gc_stats_from_interpreter_state(RuntimeOffsets *offsets, + uintptr_t interpreter_state_addr, + int64_t iid, + void *context) +{ + GetGCStatsContext *ctx = (GetGCStatsContext *)context; + if (!ctx->all_interpreters && iid > 0) { + return 0; + } + + uintptr_t gc_stats_addr = 0; + uintptr_t 
gc_stats_pointer_address = interpreter_state_addr + + offsets->debug_offsets.interpreter_state.gc + + offsets->debug_offsets.gc.generation_stats; + if (_Py_RemoteDebug_ReadRemoteMemory(&offsets->handle, + gc_stats_pointer_address, + sizeof(gc_stats_addr), + &gc_stats_addr) < 0) { + set_exception_cause(offsets, PyExc_RuntimeError, "Failed to read GC state address"); + return -1; + } + if (gc_stats_addr == 0) { + PyErr_SetString(PyExc_RuntimeError, "GC state address is NULL"); + return -1; + } + + struct gc_stats stats; + if (_Py_RemoteDebug_ReadRemoteMemory(&offsets->handle, + gc_stats_addr, + sizeof(stats), + &stats) < 0) { + set_exception_cause(offsets, PyExc_RuntimeError, "Failed to read GC state"); + return -1; + } + + if (read_gc_stats(&stats, iid, ctx->result, + ctx->gc_stats_info_type) < 0) { + set_exception_cause(offsets, PyExc_RuntimeError, "Failed to populate GC stats result"); + return -1; + } + + return 0; +} + +PyObject * +get_gc_stats(RuntimeOffsets *offsets, bool all_interpreters, + PyTypeObject *gc_stats_info_type) +{ + uint64_t gc_stats_size = offsets->debug_offsets.gc.generation_stats_size; + if (gc_stats_size != sizeof(struct gc_stats)) { + PyErr_Format(PyExc_RuntimeError, + "Remote gc_stats size (%llu) does not match " + "local size (%zu)", + (unsigned long long)gc_stats_size, + sizeof(struct gc_stats)); + set_exception_cause(offsets, PyExc_RuntimeError, "Remote gc_stats size mismatch"); + return NULL; + } + + PyObject *result = PyList_New(0); + if (result == NULL) { + return NULL; + } + GetGCStatsContext ctx = { + .result = result, + .gc_stats_info_type = gc_stats_info_type, + .all_interpreters = all_interpreters, + }; + if (iterate_interpreters(offsets, get_gc_stats_from_interpreter_state, + &ctx) < 0) { + Py_CLEAR(result); + return NULL; + } + + return result; +} diff --git a/Modules/_remote_debugging/gc_stats.h b/Modules/_remote_debugging/gc_stats.h new file mode 100644 index 00000000000000..959a49c59dcee1 --- /dev/null +++ 
b/Modules/_remote_debugging/gc_stats.h @@ -0,0 +1,24 @@ +/****************************************************************************** + * Remote Debugging Module - GC Stats Functions + * + * This file contains declarations for reading GC stats from interpreter state. + ******************************************************************************/ + +#ifndef Py_REMOTE_DEBUGGING_GC_STATS_H +#define Py_REMOTE_DEBUGGING_GC_STATS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "_remote_debugging.h" + +PyObject * +get_gc_stats(RuntimeOffsets *offsets, bool all_interpreters, + PyTypeObject *gc_stats_info_type); + +#ifdef __cplusplus +} +#endif + +#endif /* Py_REMOTE_DEBUGGING_GC_STATS_H */ diff --git a/Modules/_remote_debugging/interpreters.c b/Modules/_remote_debugging/interpreters.c new file mode 100644 index 00000000000000..55ebcb1d7816a6 --- /dev/null +++ b/Modules/_remote_debugging/interpreters.c @@ -0,0 +1,66 @@ +/****************************************************************************** + * Remote Debugging Module - Interpreters Functions + * + * This file contains a function for iterating over interpreters. 
+ ******************************************************************************/ + +#include "_remote_debugging.h" + +int +iterate_interpreters( + RuntimeOffsets *offsets, + interpreter_processor_func processor, + void *context +) { + uintptr_t interpreters_head_addr = + offsets->runtime_start_address + + (uintptr_t)offsets->debug_offsets.runtime_state.interpreters_head; + uintptr_t interpreter_id_offset = + (uintptr_t)offsets->debug_offsets.interpreter_state.id; + uintptr_t interpreter_next_offset = + (uintptr_t)offsets->debug_offsets.interpreter_state.next; + + uintptr_t interpreter_state_addr; + if (_Py_RemoteDebug_ReadRemoteMemory(&offsets->handle, + interpreters_head_addr, + sizeof(void*), + &interpreter_state_addr) < 0) { + set_exception_cause(offsets, PyExc_RuntimeError, "Failed to read interpreter state address"); + return -1; + } + + if (interpreter_state_addr == 0) { + PyErr_SetString(PyExc_RuntimeError, "No interpreter state found"); + return -1; + } + + int64_t iid = 0; + static_assert( + sizeof((((PyInterpreterState*)NULL)->id)) == sizeof(iid), + "Sizeof of PyInterpreterState.id mismatch with local iid value"); + while (interpreter_state_addr != 0) { + if (_Py_RemoteDebug_ReadRemoteMemory( + &offsets->handle, + interpreter_state_addr + interpreter_id_offset, + sizeof(iid), + &iid) < 0) { + set_exception_cause(offsets, PyExc_RuntimeError, "Failed to read interpreter id"); + return -1; + } + + if (processor(offsets, interpreter_state_addr, iid, context) < 0) { + return -1; + } + + if (_Py_RemoteDebug_ReadRemoteMemory( + &offsets->handle, + interpreter_state_addr + interpreter_next_offset, + sizeof(void*), + &interpreter_state_addr) < 0) { + set_exception_cause(offsets, PyExc_RuntimeError, "Failed to read next interpreter state"); + return -1; + } + } + + return 0; +} diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c index 32f2cbacf2143b..c840c59971c478 100644 --- a/Modules/_remote_debugging/module.c +++ 
b/Modules/_remote_debugging/module.c @@ -8,6 +8,7 @@ #include "_remote_debugging.h" #include "binary_io.h" #include "debug_offsets_validation.h" +#include "gc_stats.h" /* Forward declarations for clinic-generated code */ typedef struct { @@ -132,6 +133,27 @@ PyStructSequence_Desc AwaitedInfo_desc = { 2 }; +// GCStatsInfo structseq type +static PyStructSequence_Field GCStatsInfo_fields[] = { + {"gen", "GC generation number"}, + {"iid", "Interpreter ID"}, + {"ts_start", "Raw timestamp at collection start"}, + {"ts_stop", "Raw timestamp at collection stop"}, + {"collections", "Total number of collections"}, + {"collected", "Total number of collected objects"}, + {"uncollectable", "Total number of uncollectable objects"}, + {"candidates", "Total objects considered and traversed"}, + {"duration", "Total collection time, in seconds"}, + {NULL} +}; + +PyStructSequence_Desc GCStatsInfo_desc = { + "_remote_debugging.GCStatsInfo", + "Information about a garbage collector stats sample", + GCStatsInfo_fields, + 9 +}; + /* ============================================================================ * UTILITY FUNCTIONS * ============================================================================ */ @@ -1100,6 +1122,159 @@ static PyType_Spec RemoteUnwinder_spec = { .slots = RemoteUnwinder_slots, }; +/* ============================================================================ + * GCMONITOR CLASS IMPLEMENTATION + * ============================================================================ */ + +static void +cleanup_runtime_offsets(RuntimeOffsets *offsets) +{ + if (offsets->handle.pid != 0) { + _Py_RemoteDebug_ClearCache(&offsets->handle); + _Py_RemoteDebug_CleanupProcHandle(&offsets->handle); + } +} + +static int +init_runtime_offsets(RuntimeOffsets *offsets, int pid, int debug) +{ + offsets->debug = debug; + if (_Py_RemoteDebug_InitProcHandle(&offsets->handle, pid) < 0) { + set_exception_cause(offsets, PyExc_RuntimeError, "Failed to initialize process handle"); + return -1; 
+ } + offsets->runtime_start_address = _Py_RemoteDebug_GetPyRuntimeAddress(&offsets->handle); + if (offsets->runtime_start_address == 0) { + set_exception_cause(offsets, PyExc_RuntimeError, "Failed to get Python runtime address"); + goto error; + } + if (_Py_RemoteDebug_ReadDebugOffsets(&offsets->handle, + &offsets->runtime_start_address, + &offsets->debug_offsets) < 0) + { + set_exception_cause(offsets, PyExc_RuntimeError, "Failed to read debug offsets"); + goto error; + } + if (validate_debug_offsets(&offsets->debug_offsets) == -1) { + set_exception_cause(offsets, PyExc_RuntimeError, "Invalid debug offsets found"); + goto error; + } + return 0; + +error: + cleanup_runtime_offsets(offsets); + return -1; +} + +/*[clinic input] +class _remote_debugging.GCMonitor "GCMonitorObject *" "&GCMonitor_Type" +[clinic start generated code]*/ +/*[clinic end generated code: output=da39a3ee5e6b4b0d input=ebc229325a5e5154]*/ + +/*[clinic input] +@permit_long_summary +@permit_long_docstring_body +_remote_debugging.GCMonitor.__init__ + pid: int + * + debug: bool = False + +Initialize a new GCMonitor object for monitoring GC events from remote process. + +Args: + pid: Process ID of the target Python process to monitor + debug: If True, chain exceptions to explain the sequence of events that + lead to the exception. + +The GCMonitor provides functionality to read GC statistics from a running +Python process. 
+ +Raises: + PermissionError: If access to the target process is denied + OSError: If unable to attach to the target process or access its memory + RuntimeError: If unable to read debug information from the target process +[clinic start generated code]*/ + +static int +_remote_debugging_GCMonitor___init___impl(GCMonitorObject *self, int pid, + int debug) +/*[clinic end generated code: output=2cdf351c2f6335db input=1185a48535b808be]*/ +{ + return init_runtime_offsets(&self->offsets, pid, debug); +} + +/*[clinic input] +@critical_section +_remote_debugging.GCMonitor.get_gc_stats + + all_interpreters: bool = False + If True, return GC statistics from all interpreters. + If False, return only from main interpreter. + +Get garbage collector statistics from external Python process. + +Returns a list of GCStatsInfo objects with GC statistics data. + +Returns: + list of GCStatsInfo: A list of stats samples containing: + - gen: GC generation number. + - iid: Interpreter ID. + - ts_start: Raw timestamp at collection start. + - ts_stop: Raw timestamp at collection stop. + - collections: Total number of collections. + - collected: Total number of collected objects. + - uncollectable: Total number of uncollectable objects. + - candidates: Total objects considered and traversed. + - duration: Total collection time, in seconds. + +Raises: + RuntimeError: If the target process cannot be inspected or if its + debug offsets or GC stats layout are incompatible. 
+[clinic start generated code]*/ + +static PyObject * +_remote_debugging_GCMonitor_get_gc_stats_impl(GCMonitorObject *self, + int all_interpreters) +/*[clinic end generated code: output=f73f365725224f7a input=09e647719c65f9e4]*/ +{ + RemoteDebuggingState *st = RemoteDebugging_GetStateFromType(Py_TYPE(self)); + return get_gc_stats(&self->offsets, all_interpreters, st->GCStatsInfo_Type); +} + +static PyMethodDef GCMonitor_methods[] = { + _REMOTE_DEBUGGING_GCMONITOR_GET_GC_STATS_METHODDEF + {NULL, NULL} +}; + +static void +GCMonitor_dealloc(PyObject *op) +{ + GCMonitorObject *self = GCMonitor_CAST(op); + PyTypeObject *tp = Py_TYPE(self); + + cleanup_runtime_offsets(&self->offsets); + PyObject_Del(self); + Py_DECREF(tp); +} + +static PyType_Slot GCMonitor_slots[] = { + {Py_tp_doc, (void *)"GCMonitor(pid): Monitor GC events of a remote Python process."}, + {Py_tp_methods, GCMonitor_methods}, + {Py_tp_init, _remote_debugging_GCMonitor___init__}, + {Py_tp_dealloc, GCMonitor_dealloc}, + {0, NULL} +}; + +static PyType_Spec GCMonitor_spec = { + .name = "_remote_debugging.GCMonitor", + .basicsize = sizeof(GCMonitorObject), + .flags = ( + Py_TPFLAGS_DEFAULT + | Py_TPFLAGS_IMMUTABLETYPE + ), + .slots = GCMonitor_slots, +}; + /* Forward declarations for type specs defined later */ static PyType_Spec BinaryWriter_spec; static PyType_Spec BinaryReader_spec; @@ -1126,6 +1301,11 @@ _remote_debugging_exec(PyObject *m) return -1; } + CREATE_TYPE(m, st->GCMonitor_Type, &GCMonitor_spec); + if (PyModule_AddType(m, st->GCMonitor_Type) < 0) { + return -1; + } + // Initialize structseq types st->TaskInfo_Type = PyStructSequence_NewType(&TaskInfo_desc); if (st->TaskInfo_Type == NULL) { @@ -1183,6 +1363,14 @@ _remote_debugging_exec(PyObject *m) return -1; } + st->GCStatsInfo_Type = PyStructSequence_NewType(&GCStatsInfo_desc); + if (st->GCStatsInfo_Type == NULL) { + return -1; + } + if (PyModule_AddType(m, st->GCStatsInfo_Type) < 0) { + return -1; + } + // Create BinaryWriter and BinaryReader 
types CREATE_TYPE(m, st->BinaryWriter_Type, &BinaryWriter_spec); if (PyModule_AddType(m, st->BinaryWriter_Type) < 0) { @@ -1240,8 +1428,10 @@ remote_debugging_traverse(PyObject *mod, visitproc visit, void *arg) Py_VISIT(state->ThreadInfo_Type); Py_VISIT(state->InterpreterInfo_Type); Py_VISIT(state->AwaitedInfo_Type); + Py_VISIT(state->GCStatsInfo_Type); Py_VISIT(state->BinaryWriter_Type); Py_VISIT(state->BinaryReader_Type); + Py_VISIT(state->GCMonitor_Type); return 0; } @@ -1257,8 +1447,10 @@ remote_debugging_clear(PyObject *mod) Py_CLEAR(state->ThreadInfo_Type); Py_CLEAR(state->InterpreterInfo_Type); Py_CLEAR(state->AwaitedInfo_Type); + Py_CLEAR(state->GCStatsInfo_Type); Py_CLEAR(state->BinaryWriter_Type); Py_CLEAR(state->BinaryReader_Type); + Py_CLEAR(state->GCMonitor_Type); return 0; } @@ -1837,10 +2029,57 @@ _remote_debugging_is_python_process_impl(PyObject *module, int pid) Py_RETURN_TRUE; } +/*[clinic input] +_remote_debugging.get_gc_stats + + pid: int + * + all_interpreters: bool = False + If True, return GC statistics from all interpreters. + If False, return only from main interpreter. + +Get garbage collector statistics from external Python process. + +Returns: + list of GCStatsInfo: A list of stats samples containing: + - gen: GC generation number. + - iid: Interpreter ID. + - ts_start: Raw timestamp at collection start. + - ts_stop: Raw timestamp at collection stop. + - collections: Total number of collections. + - collected: Total number of collected objects. + - uncollectable: Total number of uncollectable objects. + - candidates: Total objects considered and traversed. + - duration: Total collection time, in seconds. + +Raises: + RuntimeError: If the target process cannot be inspected or if its + debug offsets or GC stats layout are incompatible. 
+[clinic start generated code]*/ + +static PyObject * +_remote_debugging_get_gc_stats_impl(PyObject *module, int pid, + int all_interpreters) +/*[clinic end generated code: output=d9dce5f7add149bb input=a2a08a45a8f0b119]*/ +{ + RuntimeOffsets offsets; + if (init_runtime_offsets(&offsets, pid, /*debug=*/1) < 0) { + return NULL; + } + + RemoteDebuggingState *st = RemoteDebugging_GetState(module); + PyObject *result = get_gc_stats(&offsets, all_interpreters, + st->GCStatsInfo_Type); + + cleanup_runtime_offsets(&offsets); + return result; +} + static PyMethodDef remote_debugging_methods[] = { _REMOTE_DEBUGGING_ZSTD_AVAILABLE_METHODDEF _REMOTE_DEBUGGING_GET_CHILD_PIDS_METHODDEF _REMOTE_DEBUGGING_IS_PYTHON_PROCESS_METHODDEF + _REMOTE_DEBUGGING_GET_GC_STATS_METHODDEF {NULL, NULL, 0, NULL}, }; diff --git a/Modules/_remote_debugging/object_reading.c b/Modules/_remote_debugging/object_reading.c index 59c28e223c545f..b63b103a2617ac 100644 --- a/Modules/_remote_debugging/object_reading.c +++ b/Modules/_remote_debugging/object_reading.c @@ -48,10 +48,8 @@ read_py_str( uintptr_t address, Py_ssize_t max_len ) { - PyObject *result = NULL; - char *buf = NULL; - - // Read the entire PyUnicodeObject at once + // Read the entire PyUnicodeObject at once; for short strings the data + // is inline right after the header and we'll already have (some of) it. 
char unicode_obj[SIZEOF_UNICODE_OBJ]; int res = _Py_RemoteDebug_PagedReadRemoteMemory( &unwinder->handle, @@ -61,7 +59,7 @@ read_py_str( ); if (res < 0) { set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read PyUnicodeObject"); - goto err; + return NULL; } Py_ssize_t len = GET_MEMBER(Py_ssize_t, unicode_obj, unwinder->debug_offsets.unicode_object.length); @@ -72,36 +70,94 @@ read_py_str( return NULL; } - buf = (char *)PyMem_RawMalloc(len+1); - if (buf == NULL) { - PyErr_NoMemory(); - set_exception_cause(unwinder, PyExc_MemoryError, "Failed to allocate buffer for string reading"); + // Inspect state to pick the right data offset and character width. + // We rely on the remote process sharing this Python version's + // PyASCIIObject layout, the same assumption already used for `length`. + struct _PyUnicodeObject_state state = GET_MEMBER( + struct _PyUnicodeObject_state, + unicode_obj, + unwinder->debug_offsets.unicode_object.state); + + if (!state.compact) { + PyErr_Format(PyExc_RuntimeError, + "Cannot read non-compact Unicode object at 0x%lx", address); + set_exception_cause(unwinder, PyExc_RuntimeError, + "Legacy (non-compact) Unicode objects are not supported"); return NULL; } - size_t offset = (size_t)unwinder->debug_offsets.unicode_object.asciiobject_size; - res = _Py_RemoteDebug_PagedReadRemoteMemory(&unwinder->handle, address + offset, len, buf); - if (res < 0) { - set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read string data from remote memory"); - goto err; + int kind = (int)state.kind; + Py_UCS4 max_char; + switch (kind) { + case PyUnicode_1BYTE_KIND: + max_char = state.ascii ? 
0x7F : 0xFF; + break; + case PyUnicode_2BYTE_KIND: + max_char = 0xFFFF; + break; + case PyUnicode_4BYTE_KIND: + max_char = 0x10FFFF; + break; + default: + PyErr_Format(PyExc_RuntimeError, + "Invalid Unicode kind %d at 0x%lx", kind, address); + set_exception_cause(unwinder, PyExc_RuntimeError, + "Invalid kind in remote Unicode object"); + return NULL; } - buf[len] = '\0'; - result = PyUnicode_FromStringAndSize(buf, len); + size_t header_size = state.ascii + ? (size_t)unwinder->debug_offsets.unicode_object.asciiobject_size + : (size_t)unwinder->debug_offsets.unicode_object.compactunicodeobject_size; + + // len * kind is bounded by max_len * 4 (kind <= 4, len <= max_len), so + // the multiplication can't overflow for any caller-sane max_len, but the + // explicit cap here keeps a corrupted remote `length` from later turning + // into a giant allocation. + size_t nbytes = (size_t)len * (size_t)kind; + if ((size_t)len > (SIZE_MAX / 4) || nbytes > (size_t)max_len * 4) { + PyErr_Format(PyExc_RuntimeError, + "Implausible Unicode byte size %zu at 0x%lx", nbytes, address); + set_exception_cause(unwinder, PyExc_RuntimeError, + "Garbage byte size in remote Unicode object"); + return NULL; + } + + PyObject *result = PyUnicode_New(len, max_char); if (result == NULL) { - set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create PyUnicode from remote string data"); - goto err; + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to allocate PyUnicode for remote string"); + return NULL; + } + if (nbytes == 0) { + return result; } - PyMem_RawFree(buf); - assert(result != NULL); - return result; + void *data = PyUnicode_DATA(result); -err: - if (buf != NULL) { - PyMem_RawFree(buf); + // Reuse data already present in the header read; only round-trip for + // whatever spills past it. + size_t inline_avail = (header_size < SIZEOF_UNICODE_OBJ) + ? SIZEOF_UNICODE_OBJ - header_size + : 0; + size_t inline_bytes = nbytes < inline_avail ? 
nbytes : inline_avail; + if (inline_bytes > 0) { + memcpy(data, unicode_obj + header_size, inline_bytes); } - return NULL; + + if (nbytes > inline_bytes) { + res = _Py_RemoteDebug_PagedReadRemoteMemory( + &unwinder->handle, + address + header_size + inline_bytes, + nbytes - inline_bytes, + (char *)data + inline_bytes); + if (res < 0) { + Py_DECREF(result); + set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read string data from remote memory"); + return NULL; + } + } + + return result; } PyObject * diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 5319d9c7a4819b..a07675bb66d8cc 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -1256,16 +1256,22 @@ write_perf_map_entry(PyObject *self, PyObject *args) { PyObject *code_addr_v; const void *code_addr; - unsigned int code_size; + PyObject *code_size_s; + size_t code_size; const char *entry_name; - if (!PyArg_ParseTuple(args, "OIs", &code_addr_v, &code_size, &entry_name)) + if (!PyArg_ParseTuple(args, "OOs", &code_addr_v, &code_size_s, &entry_name)) return NULL; code_addr = PyLong_AsVoidPtr(code_addr_v); if (code_addr == NULL) { return NULL; } + code_size = PyLong_AsSize_t(code_size_s); + if (code_size == (size_t)-1 && PyErr_Occurred()) { + return NULL; + } + int ret = PyUnstable_WritePerfMapEntry(code_addr, code_size, entry_name); if (ret < 0) { PyErr_SetFromErrno(PyExc_OSError); diff --git a/Modules/posixmodule.c b/Modules/posixmodule.c index 3bdbf2ef81679e..5bd53c2146a822 100644 --- a/Modules/posixmodule.c +++ b/Modules/posixmodule.c @@ -20,6 +20,7 @@ #include "pycore_fileutils.h" // _Py_closerange() #include "pycore_import.h" // _PyImport_AcquireLock() #include "pycore_initconfig.h" // _PyStatus_EXCEPTION() +#include "pycore_jit_unwind.h" // _Py_jit_debug_mutex #include "pycore_long.h" // _PyLong_IsNegative() #include "pycore_moduleobject.h" // _PyModule_GetState() #include "pycore_object.h" // _PyObject_LookupSpecial() @@ -758,6 +759,13 @@ 
PyOS_AfterFork_Child(void) goto fatal_error; } +#if defined(PY_HAVE_JIT_GDB_UNWIND) + // The child can inherit this mutex locked if another thread held it at + // fork(), but the child itself cannot be inside gdb_jit_register_code(). + // Reinitialize it before any executor cleanup can unregister JIT code. + _Py_jit_debug_mutex = (PyMutex){0}; +#endif + reset_remotedebug_data(tstate); reset_asyncio_state((_PyThreadStateImpl *)tstate); diff --git a/PCbuild/_remote_debugging.vcxproj b/PCbuild/_remote_debugging.vcxproj index 0e86ce9f4c918c..3a9b4033a697ad 100644 --- a/PCbuild/_remote_debugging.vcxproj +++ b/PCbuild/_remote_debugging.vcxproj @@ -99,6 +99,7 @@ + @@ -108,10 +109,12 @@ + + diff --git a/PCbuild/_remote_debugging.vcxproj.filters b/PCbuild/_remote_debugging.vcxproj.filters index 59d4d5c5c335fb..001f214805f93f 100644 --- a/PCbuild/_remote_debugging.vcxproj.filters +++ b/PCbuild/_remote_debugging.vcxproj.filters @@ -15,6 +15,9 @@ Source Files + + Source Files + Source Files @@ -42,6 +45,9 @@ Source Files + + Source Files + @@ -50,6 +56,9 @@ Header Files + + Header Files + diff --git a/Parser/asdl_c.py b/Parser/asdl_c.py index df4454cf948693..ed1d2e08df8f40 100755 --- a/Parser/asdl_c.py +++ b/Parser/asdl_c.py @@ -945,6 +945,19 @@ def visitModule(self, mod): return -1; } + int contains = PySet_Contains(state->abstract_types, (PyObject *)Py_TYPE(self)); + if (contains == -1) { + return -1; + } + else if (contains == 1) { + if (PyErr_WarnFormat( + PyExc_DeprecationWarning, 1, + "Instantiating abstract AST node class %T is deprecated. 
" + "This will become an error in Python 3.20", self) < 0) { + return -1; + } + } + Py_ssize_t i, numfields = 0; int res = -1; PyObject *key, *value, *fields, *attributes = NULL, *remaining_fields = NULL; @@ -1777,6 +1790,13 @@ def visitModule(self, mod): if (!state->AST_type) { return -1; } + state->abstract_types = PySet_New(NULL); + if (!state->abstract_types) { + return -1; + } + if (PySet_Add(state->abstract_types, state->AST_type) < 0) { + return -1; + } if (add_ast_fields(state) < 0) { return -1; } @@ -1818,6 +1838,7 @@ def visitSum(self, sum, name): (name, name, len(sum.attributes)), 1) else: self.emit("if (add_attributes(state, state->%s_type, NULL, 0) < 0) return -1;" % name, 1) + self.emit("if (PySet_Add(state->abstract_types, state->%s_type) < 0) return -1;" % name, 1) self.emit_defaults(name, sum.attributes, 1) simple = is_simple(sum) for t in sum.types: @@ -1850,6 +1871,30 @@ def emit_defaults(self, name, fields, depth): class ASTModuleVisitor(PickleVisitor): def visitModule(self, mod): + self.emit(""" +/* Helper for checking if a node class is abstract in the tests. 
*/ +static PyObject * +ast_is_abstract(PyObject *Py_UNUSED(module), PyObject *cls) { + struct ast_state *state = get_ast_state(); + if (state == NULL) { + return NULL; + } + int contains = PySet_Contains(state->abstract_types, cls); + if (contains == -1) { + return NULL; + } + else if (contains == 1) { + Py_RETURN_TRUE; + } + Py_RETURN_FALSE; +} + +static struct PyMethodDef astmodule_methods[] = { + {"_is_abstract", ast_is_abstract, METH_O, NULL}, + {NULL} /* Sentinel */ +}; +""".strip(), 0, reflow=False) + self.emit("", 0) self.emit("static int", 0) self.emit("astmodule_exec(PyObject *m)", 0) self.emit("{", 0) @@ -1891,7 +1936,8 @@ def visitModule(self, mod): .m_name = "_ast", // The _ast module uses a per-interpreter state (PyInterpreterState.ast) .m_size = 0, - .m_slots = astmodule_slots, + .m_methods = astmodule_methods, + .m_slots = astmodule_slots }; PyMODINIT_FUNC @@ -2180,6 +2226,7 @@ def generate_module_def(mod, metadata, f, internal_h): "%s_type" % type for type in metadata.types ) + module_state.add("abstract_types") state_strings = sorted(state_strings) module_state = sorted(module_state) diff --git a/Python/Python-ast.c b/Python/Python-ast.c index 6bcf57bdd6b4f4..5f3d9c4b17410f 100644 --- a/Python/Python-ast.c +++ b/Python/Python-ast.c @@ -178,6 +178,7 @@ void _PyAST_Fini(PyInterpreterState *interp) Py_CLEAR(state->__module__); Py_CLEAR(state->_attributes); Py_CLEAR(state->_fields); + Py_CLEAR(state->abstract_types); Py_CLEAR(state->alias_type); Py_CLEAR(state->annotation); Py_CLEAR(state->arg); @@ -5269,6 +5270,19 @@ ast_type_init(PyObject *self, PyObject *args, PyObject *kw) return -1; } + int contains = PySet_Contains(state->abstract_types, (PyObject *)Py_TYPE(self)); + if (contains == -1) { + return -1; + } + else if (contains == 1) { + if (PyErr_WarnFormat( + PyExc_DeprecationWarning, 1, + "Instantiating abstract AST node class %T is deprecated. 
" + "This will become an error in Python 3.20", self) < 0) { + return -1; + } + } + Py_ssize_t i, numfields = 0; int res = -1; PyObject *key, *value, *fields, *attributes = NULL, *remaining_fields = NULL; @@ -6100,6 +6114,13 @@ init_types(void *arg) if (!state->AST_type) { return -1; } + state->abstract_types = PySet_New(NULL); + if (!state->abstract_types) { + return -1; + } + if (PySet_Add(state->abstract_types, state->AST_type) < 0) { + return -1; + } if (add_ast_fields(state) < 0) { return -1; } @@ -6110,6 +6131,7 @@ init_types(void *arg) " | FunctionType(expr* argtypes, expr returns)"); if (!state->mod_type) return -1; if (add_attributes(state, state->mod_type, NULL, 0) < 0) return -1; + if (PySet_Add(state->abstract_types, state->mod_type) < 0) return -1; state->Module_type = make_type(state, "Module", state->mod_type, Module_fields, 2, "Module(stmt* body, type_ignore* type_ignores)"); @@ -6159,6 +6181,7 @@ init_types(void *arg) if (!state->stmt_type) return -1; if (add_attributes(state, state->stmt_type, stmt_attributes, 4) < 0) return -1; + if (PySet_Add(state->abstract_types, state->stmt_type) < 0) return -1; if (PyObject_SetAttr(state->stmt_type, state->end_lineno, Py_None) == -1) return -1; if (PyObject_SetAttr(state->stmt_type, state->end_col_offset, Py_None) == @@ -6348,6 +6371,7 @@ init_types(void *arg) if (!state->expr_type) return -1; if (add_attributes(state, state->expr_type, expr_attributes, 4) < 0) return -1; + if (PySet_Add(state->abstract_types, state->expr_type) < 0) return -1; if (PyObject_SetAttr(state->expr_type, state->end_lineno, Py_None) == -1) return -1; if (PyObject_SetAttr(state->expr_type, state->end_col_offset, Py_None) == @@ -6494,6 +6518,8 @@ init_types(void *arg) "expr_context = Load | Store | Del"); if (!state->expr_context_type) return -1; if (add_attributes(state, state->expr_context_type, NULL, 0) < 0) return -1; + if (PySet_Add(state->abstract_types, state->expr_context_type) < 0) return + -1; state->Load_type = 
make_type(state, "Load", state->expr_context_type, NULL, 0, "Load"); @@ -6518,6 +6544,7 @@ init_types(void *arg) "boolop = And | Or"); if (!state->boolop_type) return -1; if (add_attributes(state, state->boolop_type, NULL, 0) < 0) return -1; + if (PySet_Add(state->abstract_types, state->boolop_type) < 0) return -1; state->And_type = make_type(state, "And", state->boolop_type, NULL, 0, "And"); if (!state->And_type) return -1; @@ -6535,6 +6562,7 @@ init_types(void *arg) "operator = Add | Sub | Mult | MatMult | Div | Mod | Pow | LShift | RShift | BitOr | BitXor | BitAnd | FloorDiv"); if (!state->operator_type) return -1; if (add_attributes(state, state->operator_type, NULL, 0) < 0) return -1; + if (PySet_Add(state->abstract_types, state->operator_type) < 0) return -1; state->Add_type = make_type(state, "Add", state->operator_type, NULL, 0, "Add"); if (!state->Add_type) return -1; @@ -6629,6 +6657,7 @@ init_types(void *arg) "unaryop = Invert | Not | UAdd | USub"); if (!state->unaryop_type) return -1; if (add_attributes(state, state->unaryop_type, NULL, 0) < 0) return -1; + if (PySet_Add(state->abstract_types, state->unaryop_type) < 0) return -1; state->Invert_type = make_type(state, "Invert", state->unaryop_type, NULL, 0, "Invert"); @@ -6659,6 +6688,7 @@ init_types(void *arg) "cmpop = Eq | NotEq | Lt | LtE | Gt | GtE | Is | IsNot | In | NotIn"); if (!state->cmpop_type) return -1; if (add_attributes(state, state->cmpop_type, NULL, 0) < 0) return -1; + if (PySet_Add(state->abstract_types, state->cmpop_type) < 0) return -1; state->Eq_type = make_type(state, "Eq", state->cmpop_type, NULL, 0, "Eq"); if (!state->Eq_type) return -1; @@ -6732,6 +6762,8 @@ init_types(void *arg) if (!state->excepthandler_type) return -1; if (add_attributes(state, state->excepthandler_type, excepthandler_attributes, 4) < 0) return -1; + if (PySet_Add(state->abstract_types, state->excepthandler_type) < 0) return + -1; if (PyObject_SetAttr(state->excepthandler_type, state->end_lineno, Py_None) == 
-1) return -1; @@ -6822,6 +6854,7 @@ init_types(void *arg) if (!state->pattern_type) return -1; if (add_attributes(state, state->pattern_type, pattern_attributes, 4) < 0) return -1; + if (PySet_Add(state->abstract_types, state->pattern_type) < 0) return -1; state->MatchValue_type = make_type(state, "MatchValue", state->pattern_type, MatchValue_fields, 1, @@ -6872,6 +6905,8 @@ init_types(void *arg) "type_ignore = TypeIgnore(int lineno, string tag)"); if (!state->type_ignore_type) return -1; if (add_attributes(state, state->type_ignore_type, NULL, 0) < 0) return -1; + if (PySet_Add(state->abstract_types, state->type_ignore_type) < 0) return + -1; state->TypeIgnore_type = make_type(state, "TypeIgnore", state->type_ignore_type, TypeIgnore_fields, 2, @@ -6885,6 +6920,7 @@ init_types(void *arg) if (!state->type_param_type) return -1; if (add_attributes(state, state->type_param_type, type_param_attributes, 4) < 0) return -1; + if (PySet_Add(state->abstract_types, state->type_param_type) < 0) return -1; state->TypeVar_type = make_type(state, "TypeVar", state->type_param_type, TypeVar_fields, 3, "TypeVar(identifier name, expr? bound, expr? default_value)"); @@ -17956,6 +17992,28 @@ obj2ast_type_param(struct ast_state *state, PyObject* obj, type_param_ty* out, } +/* Helper for checking if a node class is abstract in the tests. 
*/ +static PyObject * +ast_is_abstract(PyObject *Py_UNUSED(module), PyObject *cls) { + struct ast_state *state = get_ast_state(); + if (state == NULL) { + return NULL; + } + int contains = PySet_Contains(state->abstract_types, cls); + if (contains == -1) { + return NULL; + } + else if (contains == 1) { + Py_RETURN_TRUE; + } + Py_RETURN_FALSE; +} + +static struct PyMethodDef astmodule_methods[] = { + {"_is_abstract", ast_is_abstract, METH_O, NULL}, + {NULL} /* Sentinel */ +}; + static int astmodule_exec(PyObject *m) { @@ -18382,7 +18440,8 @@ static struct PyModuleDef _astmodule = { .m_name = "_ast", // The _ast module uses a per-interpreter state (PyInterpreterState.ast) .m_size = 0, - .m_slots = astmodule_slots, + .m_methods = astmodule_methods, + .m_slots = astmodule_slots }; PyMODINIT_FUNC diff --git a/Python/bytecodes.c b/Python/bytecodes.c index d485172c82fa0d..4239ba58bc390b 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2984,6 +2984,7 @@ dummy_func( macro(LOAD_ATTR_CLASS) = unused/1 + + _RECORD_TOS + _CHECK_ATTR_CLASS + unused/2 + _LOAD_ATTR_CLASS + @@ -2991,7 +2992,7 @@ dummy_func( macro(LOAD_ATTR_CLASS_WITH_METACLASS_CHECK) = unused/1 + - _RECORD_TOS_TYPE + + _RECORD_TOS + _GUARD_TYPE_VERSION + _CHECK_ATTR_CLASS + _LOAD_ATTR_CLASS + diff --git a/Python/ceval.c b/Python/ceval.c index 0d7f8a62e246ae..28087ba58d4855 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1305,7 +1305,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int } #ifdef _Py_TIER2 #ifdef _Py_JIT -_PyJitEntryFuncPtr _Py_jit_entry = _PyJIT; +_PyJitEntryFuncPtr _Py_jit_entry = _PyJIT_Entry; #else _PyJitEntryFuncPtr _Py_jit_entry = _PyTier2Interpreter; #endif diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index a7d63fd3b82211..a4e9980589e4a3 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -168,7 +168,6 @@ #define STOP_TRACING() ((void)(0)); #endif - /* PRE_DISPATCH_GOTO() does lltrace if enabled. 
Normally a no-op */ #ifdef Py_DEBUG #define PRE_DISPATCH_GOTO() if (frame->lltrace >= 5) { \ diff --git a/Python/gc.c b/Python/gc.c index 59bed10c1fb230..134da107e1b61d 100644 --- a/Python/gc.c +++ b/Python/gc.c @@ -1405,7 +1405,6 @@ add_stats(GCState *gcstate, int gen, struct gc_generation_stats *stats) memcpy(cur_stats, prev_stats, sizeof(struct gc_generation_stats)); cur_stats->ts_start = stats->ts_start; - cur_stats->ts_stop = stats->ts_stop; cur_stats->collections += 1; cur_stats->collected += stats->collected; @@ -1413,6 +1412,9 @@ add_stats(GCState *gcstate, int gen, struct gc_generation_stats *stats) cur_stats->candidates += stats->candidates; cur_stats->duration += stats->duration; + /* Publish ts_stop last so remote readers do not select a partially + updated stats record as the latest collection. */ + cur_stats->ts_stop = stats->ts_stop; } /* This is the main function. Read this to understand how the @@ -1454,10 +1456,14 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) assert(generation >= 0 && generation < NUM_GENERATIONS); #ifdef Py_STATS - if (_Py_stats) { - _Py_stats->object_stats.object_visits = 0; + { + PyStats *s = _PyStats_GET(); + if (s) { + s->object_stats.object_visits = 0; + } } #endif + GC_STAT_ADD(generation, collections, 1); struct gc_generation_stats stats = { 0 }; @@ -1615,12 +1621,16 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) /* Update stats */ add_stats(gcstate, generation, &stats); - GC_STAT_ADD(generation, objects_collected, m); + GC_STAT_ADD(generation, objects_collected, stats.collected); + #ifdef Py_STATS - if (_Py_stats) { - GC_STAT_ADD(generation, object_visits, - _Py_stats->object_stats.object_visits); - _Py_stats->object_stats.object_visits = 0; + { + PyStats *s = _PyStats_GET(); + if (s) { + GC_STAT_ADD(generation, object_visits, + s->object_stats.object_visits); + s->object_stats.object_visits = 0; + } } #endif diff --git a/Python/gc_free_threading.c 
b/Python/gc_free_threading.c index 4b46ca04f56b20..b4fcd365592aa5 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -2492,6 +2492,8 @@ gc_collect_main(PyThreadState *tstate, int generation, _PyGC_Reason reason) /* Update stats */ struct gc_generation_stats *stats = get_stats(gcstate, generation); + stats->ts_start = start; + stats->ts_stop = stop; stats->collections++; stats->collected += m; stats->uncollectable += n; diff --git a/Python/jit.c b/Python/jit.c index 26e01b25d48c04..8b555105129a9f 100644 --- a/Python/jit.c +++ b/Python/jit.c @@ -15,6 +15,7 @@ #include "pycore_interpframe.h" #include "pycore_interpolation.h" #include "pycore_intrinsics.h" +#include "pycore_jit_unwind.h" #include "pycore_lazyimportobject.h" #include "pycore_list.h" #include "pycore_long.h" @@ -60,6 +61,40 @@ jit_error(const char *message) PyErr_Format(PyExc_RuntimeWarning, "JIT %s (%d)", message, hint); } +/* + * Publish JIT code to optional tooling backends. + * + * The return value is a backend-specific deregistration handle, not a + * success/failure indicator. NULL means there is nothing to unregister later: + * perf does not need a handle, and GDB registration failures are intentionally + * non-fatal because tooling support must not make JIT compilation fail. 
+ */ +static void * +jit_record_code(const void *code_addr, size_t code_size, + const char *entry, const char *filename) +{ +#ifdef PY_HAVE_PERF_TRAMPOLINE + _PyPerf_Callbacks callbacks; + _PyPerfTrampoline_GetCallbacks(&callbacks); + if (callbacks.write_state == _Py_perfmap_jit_callbacks.write_state) { + _PyPerfJit_WriteNamedCode( + code_addr, code_size, entry, filename); + return NULL; + } +#endif + +#if defined(PY_HAVE_JIT_GDB_UNWIND) + return _PyJitUnwind_GdbRegisterCode( + code_addr, code_size, entry, filename); +#else + (void)code_addr; + (void)code_size; + (void)entry; + (void)filename; + return NULL; +#endif +} + static int address_in_executor_array(_PyExecutorObject **ptrs, size_t count, uintptr_t addr) { @@ -715,6 +750,10 @@ _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], siz } executor->jit_code = memory; executor->jit_size = total_size; + executor->jit_gdb_handle = jit_record_code(memory, + code_size + state.trampolines.size, + "jit", + "executor"); return 0; } @@ -727,6 +766,12 @@ _PyJIT_Free(_PyExecutorObject *executor) if (memory) { executor->jit_code = NULL; executor->jit_size = 0; +#if defined(PY_HAVE_JIT_GDB_UNWIND) + if (executor->jit_gdb_handle != NULL) { + _PyJitUnwind_GdbUnregisterCode(executor->jit_gdb_handle); + executor->jit_gdb_handle = NULL; + } +#endif if (jit_free(memory, size)) { PyErr_FormatUnraisable("Exception ignored while " "freeing JIT memory"); diff --git a/Python/jit_unwind.c b/Python/jit_unwind.c new file mode 100644 index 00000000000000..09002feafec17c --- /dev/null +++ b/Python/jit_unwind.c @@ -0,0 +1,986 @@ +/* + * Python JIT - DWARF .eh_frame builder + * + * This file contains the DWARF CFI generator used to build .eh_frame + * data for JIT code (perf jitdump and other unwinders). 
+ */ + +#include "Python.h" +#include "pycore_jit_unwind.h" +#include "pycore_lock.h" + +#if defined(PY_HAVE_JIT_GDB_UNWIND) +# include "jit_unwind_info.h" +# if !JIT_UNWIND_INFO_SUPPORTED +# error "JIT unwind info was not generated for this target" +# endif +#endif + +#if defined(PY_HAVE_PERF_TRAMPOLINE) || defined(PY_HAVE_JIT_GDB_UNWIND) + +#if defined(PY_HAVE_JIT_GDB_UNWIND) +# include +#endif +#include +#include + +// ============================================================================= +// DWARF CONSTANTS +// ============================================================================= + +/* + * DWARF (Debug With Arbitrary Record Formats) constants + * + * DWARF is a debugging data format used to provide stack unwinding information. + * These constants define the various encoding types and opcodes used in + * DWARF Call Frame Information (CFI) records. + */ + +/* DWARF Call Frame Information version */ +#define DWRF_CIE_VERSION 1 + +/* DWARF CFA (Call Frame Address) opcodes */ +enum { + DWRF_CFA_nop = 0x0, // No operation + DWRF_CFA_offset_extended = 0x5, // Extended offset instruction + DWRF_CFA_def_cfa = 0xc, // Define CFA rule + DWRF_CFA_def_cfa_register = 0xd, // Define CFA register + DWRF_CFA_def_cfa_offset = 0xe, // Define CFA offset + DWRF_CFA_offset_extended_sf = 0x11, // Extended signed offset + DWRF_CFA_advance_loc = 0x40, // Advance location counter + DWRF_CFA_offset = 0x80, // Simple offset instruction + DWRF_CFA_restore = 0xc0 // Restore register +}; + +/* + * Architecture-specific DWARF register numbers + * + * These constants define the register numbering scheme used by DWARF + * for each supported architecture. The numbers must match the ABI + * specification for proper stack unwinding. 
+ */ +enum { +#ifdef __x86_64__ + /* x86_64 register numbering (note: order is defined by x86_64 ABI) */ + DWRF_REG_AX, // RAX + DWRF_REG_DX, // RDX + DWRF_REG_CX, // RCX + DWRF_REG_BX, // RBX + DWRF_REG_SI, // RSI + DWRF_REG_DI, // RDI + DWRF_REG_BP, // RBP + DWRF_REG_SP, // RSP + DWRF_REG_8, // R8 + DWRF_REG_9, // R9 + DWRF_REG_10, // R10 + DWRF_REG_11, // R11 + DWRF_REG_12, // R12 + DWRF_REG_13, // R13 + DWRF_REG_14, // R14 + DWRF_REG_15, // R15 + DWRF_REG_RA, // Return address (RIP) +#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + /* AArch64 register numbering */ + DWRF_REG_FP = 29, // Frame Pointer + DWRF_REG_RA = 30, // Link register (return address) + DWRF_REG_SP = 31, // Stack pointer +#else +# error "Unsupported target architecture" +#endif +}; + +// ============================================================================= +// ELF OBJECT CONTEXT +// ============================================================================= + +/* + * Context for building ELF/DWARF structures + * + * This structure maintains state while constructing DWARF unwind information. + * It acts as a simple buffer manager with pointers to track current position + * and important landmarks within the buffer. + */ +typedef struct ELFObjectContext { + uint8_t* p; // Current write position in buffer + uint8_t* startp; // Start of buffer (for offset calculations) + uint8_t* fde_p; // Start of FDE data (for PC-relative calculations) + uintptr_t code_addr; // Address of the code section + size_t code_size; // Size of the code section +} ELFObjectContext; + +// ============================================================================= +// DWARF GENERATION UTILITIES +// ============================================================================= + +/* + * Append a null-terminated string to the ELF context buffer. 
+ * + * Args: + * ctx: ELF object context + * str: String to append (must be null-terminated) + * + * Returns: Offset from start of buffer where string was written + */ +static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) { + uint8_t* p = ctx->p; + uint32_t ofs = (uint32_t)(p - ctx->startp); + + /* Copy string including null terminator */ + do { + *p++ = (uint8_t)*str; + } while (*str++); + + ctx->p = p; + return ofs; +} + +/* + * Append a SLEB128 (Signed Little Endian Base 128) value + * + * SLEB128 is a variable-length encoding used extensively in DWARF. + * It efficiently encodes small numbers in fewer bytes. + * + * Args: + * ctx: ELF object context + * v: Signed value to encode + */ +static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) { + uint8_t* p = ctx->p; + + /* Encode 7 bits at a time, with continuation bit in MSB */ + for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) { + *p++ = (uint8_t)((v & 0x7f) | 0x80); // Set continuation bit + } + *p++ = (uint8_t)(v & 0x7f); // Final byte without continuation bit + + ctx->p = p; +} + +/* + * Append a ULEB128 (Unsigned Little Endian Base 128) value + * + * Similar to SLEB128 but for unsigned values. + * + * Args: + * ctx: ELF object context + * v: Unsigned value to encode + */ +static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) { + uint8_t* p = ctx->p; + + /* Encode 7 bits at a time, with continuation bit in MSB */ + for (; v >= 0x80; v >>= 7) { + *p++ = (char)((v & 0x7f) | 0x80); // Set continuation bit + } + *p++ = (char)v; // Final byte without continuation bit + + ctx->p = p; +} + +/* + * Macros for generating DWARF structures + * + * These macros provide a convenient way to write various data types + * to the DWARF buffer while automatically advancing the pointer. 
+ */ +#define DWRF_U8(x) (*p++ = (x)) // Write unsigned 8-bit +#define DWRF_I8(x) (*(int8_t*)p = (x), p++) // Write signed 8-bit +#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2) // Write unsigned 16-bit +#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4) // Write unsigned 32-bit +#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write address +#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128 +#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128 +#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string + +/* Align to specified boundary with NOP instructions */ +#define DWRF_ALIGNNOP(s) \ + while ((uintptr_t)p & ((s)-1)) { \ + *p++ = DWRF_CFA_nop; \ + } + +/* Write a DWARF section with automatic size calculation */ +#define DWRF_SECTION(name, stmt) \ + { \ + uint32_t* szp_##name = (uint32_t*)p; \ + p += 4; \ + stmt; \ + *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \ + } + +// ============================================================================= +// DWARF EH FRAME GENERATION +// ============================================================================= + +static void elf_init_ehframe_perf(ELFObjectContext* ctx); +#if defined(PY_HAVE_JIT_GDB_UNWIND) +static void elf_init_ehframe_gdb(ELFObjectContext* ctx); +#endif + +static inline void elf_init_ehframe(ELFObjectContext* ctx, int absolute_addr) { + if (absolute_addr) { +#if defined(PY_HAVE_JIT_GDB_UNWIND) + elf_init_ehframe_gdb(ctx); +#else + Py_UNREACHABLE(); +#endif + } + else { + elf_init_ehframe_perf(ctx); + } +} + +size_t +_PyJitUnwind_EhFrameSize(int absolute_addr) +{ + /* The .eh_frame we emit is small and bounded; keep a generous buffer. 
*/ + uint8_t scratch[512]; + _Static_assert(sizeof(scratch) >= 256, + "scratch buffer may be too small for elf_init_ehframe"); + ELFObjectContext ctx; + ctx.code_size = 1; + ctx.code_addr = 0; + ctx.startp = ctx.p = scratch; + ctx.fde_p = NULL; + /* Generate once into scratch to learn the required size. */ + elf_init_ehframe(&ctx, absolute_addr); + ptrdiff_t size = ctx.p - ctx.startp; + assert(size <= (ptrdiff_t)sizeof(scratch)); + return (size_t)size; +} + +size_t +_PyJitUnwind_BuildEhFrame(uint8_t *buffer, size_t buffer_size, + const void *code_addr, size_t code_size, + int absolute_addr) +{ + if (buffer == NULL || code_addr == NULL || code_size == 0) { + return 0; + } + /* Generate the frame twice: once to size-check, once to write. */ + size_t required = _PyJitUnwind_EhFrameSize(absolute_addr); + if (required == 0 || required > buffer_size) { + return 0; + } + ELFObjectContext ctx; + ctx.code_size = code_size; + ctx.code_addr = (uintptr_t)code_addr; + ctx.startp = ctx.p = buffer; + ctx.fde_p = NULL; + elf_init_ehframe(&ctx, absolute_addr); + size_t written = (size_t)(ctx.p - ctx.startp); + /* The frame size is independent of code_addr/code_size (fixed-width fields). */ + assert(written == required); + return written; +} + +/* + * Generate a minimal .eh_frame for a single JIT code region. + * + * The .eh_frame section contains Call Frame Information (CFI) that describes + * how to unwind the stack at any point in the code. This is essential for + * unwinding through JIT-generated code. + * + * The generated data contains: + * 1. A CIE (Common Information Entry) describing the calling convention. + * 2. An FDE (Frame Description Entry) describing how to unwind the JIT frame. + * + * Two flavors are emitted, dispatched on the absolute_addr flag: + * + * - absolute_addr == 0 (elf_init_ehframe_perf): PC-relative FDE address + * encoding for perf's synthesized DSO layout. 
The CIE describes the + * trampoline's entry state and the FDE walks through the prologue and + * epilogue with advance_loc instructions. This matches the pre-existing + * perf_jit_trampoline behavior byte-for-byte. + * + * - absolute_addr == 1 (elf_init_ehframe_gdb): absolute FDE address + * encoding for the GDB JIT in-memory ELF. The CIE describes the + * steady-state frame layout (CFA = %rbp+16 / x29+16, with saved fp and + * return-address column at fixed offsets) and the FDE emits no further + * CFI. The same rule applies at every PC in the registered region, + * which is correct for executor stencils (they pin the frame pointer + * across the region). This is the GDB-side fix; see elf_init_ehframe_gdb + * for details. + */ +static void elf_init_ehframe_perf(ELFObjectContext* ctx) { + int fde_ptr_enc = DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4; + uint8_t* p = ctx->p; + uint8_t* framep = p; // Remember start of frame data + + /* + * DWARF Unwind Table for Trampoline Function + * + * This section defines DWARF Call Frame Information (CFI) using encoded macros + * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function + * preserves and restores registers. This is used by profiling tools (e.g., `perf`) + * and debuggers for stack unwinding in JIT-compiled code. + * + * ------------------------------------------------- + * TO REGENERATE THIS TABLE FROM GCC OBJECTS: + * ------------------------------------------------- + * + * 1. Create a trampoline source file (e.g., `trampoline.c`): + * + * #include + * typedef PyObject* (*py_evaluator)(void*, void*, int); + * PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) { + * return evaluator(ts, f, throwflag); + * } + * + * 2. Compile to an object file with frame pointer preservation: + * + * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c + * + * 3. 
Extract DWARF unwind info from the object file: + * + * readelf -w trampoline.o + * + * Example output from `.eh_frame`: + * + * 00000000 CIE + * Version: 1 + * Augmentation: "zR" + * Code alignment factor: 4 + * Data alignment factor: -8 + * Return address column: 30 + * DW_CFA_def_cfa: r31 (sp) ofs 0 + * + * 00000014 FDE cie=00000000 pc=0..14 + * DW_CFA_advance_loc: 4 + * DW_CFA_def_cfa_offset: 16 + * DW_CFA_offset: r29 at cfa-16 + * DW_CFA_offset: r30 at cfa-8 + * DW_CFA_advance_loc: 12 + * DW_CFA_restore: r30 + * DW_CFA_restore: r29 + * DW_CFA_def_cfa_offset: 0 + * + * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`. + * + * ---------------------------------- + * HOW TO TRANSLATE TO DWRF_* MACROS: + * ---------------------------------- + * + * After compiling your trampoline with: + * + * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c + * + * run: + * + * readelf -w trampoline.o + * + * to inspect the generated `.eh_frame` data. You will see two main components: + * + * 1. A CIE (Common Information Entry): shared configuration used by all FDEs. + * 2. An FDE (Frame Description Entry): function-specific unwind instructions. 
+ * + * --------------------- + * Translating the CIE: + * --------------------- + * From `readelf -w`, you might see: + * + * 00000000 0000000000000010 00000000 CIE + * Version: 1 + * Augmentation: "zR" + * Code alignment factor: 4 + * Data alignment factor: -8 + * Return address column: 30 + * Augmentation data: 1b + * DW_CFA_def_cfa: r31 (sp) ofs 0 + * + * Map this to: + * + * DWRF_SECTION(CIE, + * DWRF_U32(0); // CIE ID (always 0 for CIEs) + * DWRF_U8(DWRF_CIE_VERSION); // Version: 1 + * DWRF_STR("zR"); // Augmentation string "zR" + * DWRF_UV(4); // Code alignment factor = 4 + * DWRF_SV(-8); // Data alignment factor = -8 + * DWRF_U8(DWRF_REG_RA); // Return address register (e.g., x30 = 30) + * DWRF_UV(1); // Augmentation data length = 1 + * DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers + * + * DWRF_U8(DWRF_CFA_def_cfa); // DW_CFA_def_cfa + * DWRF_UV(DWRF_REG_SP); // Register: SP (r31) + * DWRF_UV(0); // Offset = 0 + * + * DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer size boundary + * ) + * + * Notes: + * - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128. + * - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants. 
+ * + * --------------------- + * Translating the FDE: + * --------------------- + * From `readelf -w`: + * + * 00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014 + * DW_CFA_advance_loc: 4 + * DW_CFA_def_cfa_offset: 16 + * DW_CFA_offset: r29 at cfa-16 + * DW_CFA_offset: r30 at cfa-8 + * DW_CFA_advance_loc: 12 + * DW_CFA_restore: r30 + * DW_CFA_restore: r29 + * DW_CFA_def_cfa_offset: 0 + * + * Map the FDE header and instructions to: + * + * DWRF_SECTION(FDE, + * DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (relative from here) + * DWRF_U32(pc_relative_offset); // PC-relative location of the code (calculated dynamically) + * DWRF_U32(ctx->code_size); // Code range covered by this FDE + * DWRF_U8(0); // Augmentation data length (none) + * + * DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 unit (1 * 4 = 4 bytes) + * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 + * DWRF_UV(16); + * + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer) + * DWRF_UV(2); // At offset 2 * 8 = 16 bytes + * + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address) + * DWRF_UV(1); // At offset 1 * 8 = 8 bytes + * + * DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance location by 3 units (3 * 4 = 12 bytes) + * + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore x30 + * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore x29 + * + * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + * DWRF_UV(0); + * ) + * + * To regenerate: + * 1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE. + * 2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as + * the code is in a different address space every time. + * 3. 
For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro: + * - `DW_CFA_def_cfa_offset` → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value) + * - `DW_CFA_offset: rX` → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset) + * - `DW_CFA_restore: rX` → DWRF_U8(DWRF_CFA_offset | reg) // restore is same as reusing offset + * - `DW_CFA_advance_loc: N` → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor)) + * 4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers. + * 5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment. + */ + + /* + * Emit DWARF EH CIE (Common Information Entry) + * + * The CIE describes the calling conventions and basic unwinding rules + * that apply to all functions in this compilation unit. + */ + DWRF_SECTION(CIE, + DWRF_U32(0); // CIE ID (0 indicates this is a CIE) + DWRF_U8(DWRF_CIE_VERSION); // CIE version (1) + DWRF_STR("zR"); // Augmentation string ("zR" = has LSDA) +#ifdef __x86_64__ + DWRF_UV(1); // Code alignment factor (x86_64: 1 byte) +#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + DWRF_UV(4); // Code alignment factor (AArch64: 4 bytes per instruction) +#endif + DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative) + DWRF_U8(DWRF_REG_RA); // Return address register number + DWRF_UV(1); // Augmentation data length + DWRF_U8(fde_ptr_enc); // FDE pointer encoding + + /* Initial CFI instructions - describe default calling convention */ +#ifdef __x86_64__ + /* x86_64 initial CFI state */ + DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) + DWRF_UV(DWRF_REG_SP); // CFA = SP register + DWRF_UV(sizeof(uintptr_t)); // CFA = SP + pointer_size + DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved + DWRF_UV(1); // At offset 1 from CFA +#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + /* AArch64 initial CFI state */ + DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) + 
DWRF_UV(DWRF_REG_SP); // CFA = SP register + DWRF_UV(0); // CFA = SP + 0 (AArch64 starts with offset 0) + // No initial register saves in AArch64 CIE +#endif + DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary + ) + + /* + * Emit DWARF EH FDE (Frame Description Entry) + * + * The FDE describes unwinding information specific to this function. + * It references the CIE and provides function-specific CFI instructions. + * + * The PC-relative offset is calculated after the entire EH frame is built + * to ensure accurate positioning relative to the synthesized DSO layout. + */ + DWRF_SECTION(FDE, + DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (backwards reference) + /* + * In perf jitdump mode the FDE PC field is encoded PC-relative and + * points back to code_start. Record where that field lives so we can + * patch in the final offset after the rest of the synthetic DSO + * layout is known. + */ + ctx->fde_p = p; // Remember where PC offset field is located for later calculation + DWRF_U32(0); // Placeholder for PC-relative offset (calculated below) + DWRF_U32(ctx->code_size); // Address range covered by this FDE (code length) + DWRF_U8(0); // Augmentation data length (none) + + /* + * Architecture-specific CFI instructions + * + * These instructions describe how registers are saved and restored + * during function calls. Each architecture has different calling + * conventions and register usage patterns. 
+ */ +#ifdef __x86_64__ + /* x86_64 calling convention unwinding rules */ +# if defined(__CET__) && (__CET__ & 1) + DWRF_U8(DWRF_CFA_advance_loc | 4); // Advance past endbr64 (4 bytes) +# endif + DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance past push %rbp (1 byte) + DWRF_U8(DWRF_CFA_def_cfa_offset); // def_cfa_offset 16 + DWRF_UV(16); // New offset: SP + 16 + DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP); // offset r6 at cfa-16 + DWRF_UV(2); // Offset factor: 2 * 8 = 16 bytes + DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past mov %rsp,%rbp (3 bytes) + DWRF_U8(DWRF_CFA_def_cfa_register); // def_cfa_register r6 + DWRF_UV(DWRF_REG_BP); // Use base pointer register + DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past call *%rcx (2 bytes) + pop %rbp (1 byte) = 3 + DWRF_U8(DWRF_CFA_def_cfa); // def_cfa r7 ofs 8 + DWRF_UV(DWRF_REG_SP); // Use stack pointer register + DWRF_UV(8); // New offset: SP + 8 +#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + /* AArch64 calling convention unwinding rules */ + DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance by 1 instruction (4 bytes) + DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 + DWRF_UV(16); // Stack pointer moved by 16 bytes + DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // x29 (frame pointer) saved + DWRF_UV(2); // At CFA-16 (2 * 8 = 16 bytes from CFA) + DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // x30 (link register) saved + DWRF_UV(1); // At CFA-8 (1 * 8 = 8 bytes from CFA) + DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance by 3 instructions (12 bytes) + DWRF_U8(DWRF_CFA_def_cfa_register); // CFA = FP (x29) + 16 + DWRF_UV(DWRF_REG_FP); + DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA); // Restore x30 - NO DWRF_UV() after this! + DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP); // Restore x29 - NO DWRF_UV() after this! 
+ DWRF_U8(DWRF_CFA_def_cfa); // CFA = SP + 0 (stack restored) + DWRF_UV(DWRF_REG_SP); + DWRF_UV(0); + +#else +# error "Unsupported target architecture" +#endif + + DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary + ) + + ctx->p = p; // Update context pointer to end of generated data + + /* Calculate and update the PC-relative offset in the FDE + * + * When perf processes the jitdump, it creates a synthesized DSO with this layout: + * + * Synthesized DSO Memory Layout: + * ┌─────────────────────────────────────────────────────────────┐ < code_start + * │ Code Section │ + * │ (round_up(code_size, 8) bytes) │ + * ├─────────────────────────────────────────────────────────────┤ < start of EH frame data + * │ EH Frame Data │ + * │ ┌─────────────────────────────────────────────────────┐ │ + * │ │ CIE data │ │ + * │ └─────────────────────────────────────────────────────┘ │ + * │ ┌─────────────────────────────────────────────────────┐ │ + * │ │ FDE Header: │ │ + * │ │ - CIE offset (4 bytes) │ │ + * │ │ - PC offset (4 bytes) <─ fde_offset_in_frame ─────┼────┼─> points to code_start + * │ │ - address range (4 bytes) │ │ (this specific field) + * │ │ CFI Instructions... 
│ │ + * │ └─────────────────────────────────────────────────────┘ │ + * ├─────────────────────────────────────────────────────────────┤ < reference_point + * │ EhFrameHeader │ + * │ (navigation metadata) │ + * └─────────────────────────────────────────────────────────────┘ + * + * The PC offset field in the FDE must contain the distance from itself to code_start: + * + * distance = code_start - fde_pc_field + * + * Where: + * fde_pc_field_location = reference_point - eh_frame_size + fde_offset_in_frame + * code_start_location = reference_point - eh_frame_size - round_up(code_size, 8) + * + * Therefore: + * distance = code_start_location - fde_pc_field_location + * = (ref - eh_frame_size - rounded_code_size) - (ref - eh_frame_size + fde_offset_in_frame) + * = -rounded_code_size - fde_offset_in_frame + * = -(round_up(code_size, 8) + fde_offset_in_frame) + * + * Note: fde_offset_in_frame is the offset from EH frame start to the PC offset field. + * + */ + int32_t rounded_code_size = + (int32_t)_Py_SIZE_ROUND_UP(ctx->code_size, 8); + int32_t fde_offset_in_frame = (int32_t)(ctx->fde_p - framep); + *(int32_t *)ctx->fde_p = -(rounded_code_size + fde_offset_in_frame); +} + +/* + * Build .eh_frame data for the GDB JIT interface. + * + * The executor runs inside the frame established by _PyJIT_Entry, but the + * synthetic executor FDE collapses that state into a single logical JIT frame + * that unwinds directly into _PyEval_*. Executor stencils never touch the + * frame pointer - enforced by Tools/jit/_optimizers.py _validate() and + * -mframe-pointer=reserved - so the steady-state rule is valid at every PC + * and the FDE body is empty. Tools/jit/_targets.py derives the initial CFI + * rules from the row active at the executor call in the compiled shim object. 
+ */ +#if defined(PY_HAVE_JIT_GDB_UNWIND) +static void elf_init_ehframe_gdb(ELFObjectContext* ctx) { + int fde_ptr_enc = DWRF_EH_PE_absptr; + uint8_t* p = ctx->p; + uint8_t* framep = p; + + DWRF_SECTION(CIE, + DWRF_U32(0); // CIE ID + DWRF_U8(DWRF_CIE_VERSION); + DWRF_STR("zR"); // aug data length + FDE ptr encoding follow + DWRF_UV(JIT_UNWIND_CODE_ALIGNMENT_FACTOR); + DWRF_SV(JIT_UNWIND_DATA_ALIGNMENT_FACTOR); + DWRF_U8(JIT_UNWIND_RA_REG); + DWRF_UV(1); // Augmentation data length + DWRF_U8(fde_ptr_enc); // FDE pointer encoding + + /* Executor steady-state rule (our invariant, not the compiler's). */ + DWRF_U8(DWRF_CFA_def_cfa); + DWRF_UV(JIT_UNWIND_CFA_REG); + DWRF_UV(JIT_UNWIND_CFA_OFFSET); + DWRF_U8(DWRF_CFA_offset | JIT_UNWIND_FP_REG); + DWRF_UV(JIT_UNWIND_FP_OFFSET); + DWRF_U8(DWRF_CFA_offset | JIT_UNWIND_RA_REG); + DWRF_UV(JIT_UNWIND_RA_OFFSET); + DWRF_ALIGNNOP(sizeof(uintptr_t)); + ) + + DWRF_SECTION(FDE, + DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (backwards reference) + DWRF_ADDR(ctx->code_addr); // Absolute code start + DWRF_ADDR((uintptr_t)ctx->code_size); // Code range covered + DWRF_U8(0); // Augmentation data length (none) + DWRF_ALIGNNOP(sizeof(uintptr_t)); + ) + + ctx->p = p; +} +#endif + +#if defined(PY_HAVE_JIT_GDB_UNWIND) +enum { + JIT_NOACTION = 0, + JIT_REGISTER_FN = 1, + JIT_UNREGISTER_FN = 2, +}; + +struct jit_code_entry { + struct jit_code_entry *next; + struct jit_code_entry *prev; + const char *symfile_addr; + uint64_t symfile_size; + const void *code_addr; +}; + +struct jit_descriptor { + uint32_t version; + uint32_t action_flag; + struct jit_code_entry *relevant_entry; + struct jit_code_entry *first_entry; +}; + +PyMutex _Py_jit_debug_mutex = {0}; + +Py_EXPORTED_SYMBOL volatile struct jit_descriptor __jit_debug_descriptor = { + 1, JIT_NOACTION, NULL, NULL +}; + +Py_EXPORTED_SYMBOL void __attribute__((noinline)) +__jit_debug_register_code(void) +{ + /* Keep this call visible to debuggers and not optimized away. 
*/ + (void)__jit_debug_descriptor.action_flag; +#if defined(__GNUC__) || defined(__clang__) + __asm__ __volatile__("" ::: "memory"); +#endif +} + +static uint16_t +gdb_jit_machine_id(void) +{ + /* Map the current target to ELF e_machine; return 0 to skip registration. */ +#if defined(__x86_64__) || defined(_M_X64) + return EM_X86_64; +#elif defined(__aarch64__) && !defined(__ILP32__) + return EM_AARCH64; +#else + return 0; +#endif +} + +static struct jit_code_entry * +gdb_jit_register_code( + const void *code_addr, + size_t code_size, + const char *symname, + const uint8_t *eh_frame, + size_t eh_frame_size +) +{ + /* + * Build a minimal in-memory ELF for GDB's JIT interface and link it into + * __jit_debug_descriptor so debuggers can resolve JIT code. + */ + if (code_addr == NULL || code_size == 0 || symname == NULL) { + return NULL; + } + + const uint16_t machine = gdb_jit_machine_id(); + if (machine == 0) { + return NULL; + } + + enum { + SH_NULL = 0, + SH_TEXT, + SH_EH_FRAME, + SH_SHSTRTAB, + SH_STRTAB, + SH_SYMTAB, + SH_NUM, + }; + static const char shstrtab[] = + "\0.text\0.eh_frame\0.shstrtab\0.strtab\0.symtab"; + _Static_assert(sizeof(shstrtab) == + 1 + sizeof(".text") + sizeof(".eh_frame") + + sizeof(".shstrtab") + sizeof(".strtab") + sizeof(".symtab"), + "shstrtab size mismatch"); + const size_t shstrtab_size = sizeof(shstrtab); + const size_t sh_text = 1; + const size_t sh_eh_frame = sh_text + sizeof(".text"); + const size_t sh_shstrtab = sh_eh_frame + sizeof(".eh_frame"); + const size_t sh_strtab = sh_shstrtab + sizeof(".shstrtab"); + const size_t sh_symtab = sh_strtab + sizeof(".strtab"); + const size_t text_size = code_size; + const size_t text_padded = _Py_SIZE_ROUND_UP(text_size, 8); + const size_t strtab_size = 1 + strlen(symname) + 1; + const size_t symtab_size = 3 * sizeof(Elf64_Sym); + + size_t offset = sizeof(Elf64_Ehdr); + offset = _Py_SIZE_ROUND_UP(offset, 16); + const size_t text_off = offset; + const size_t eh_off = text_off + text_padded; + 
offset = eh_off + eh_frame_size; + const size_t shstr_off = offset; + offset += shstrtab_size; + const size_t str_off = offset; + offset += strtab_size; + /* Elf64_Sym requires 8-byte alignment for st_value/st_size. */ + offset = _Py_SIZE_ROUND_UP(offset, 8); + const size_t sym_off = offset; + offset += symtab_size; + offset = _Py_SIZE_ROUND_UP(offset, sizeof(Elf64_Shdr)); + const size_t sh_off = offset; + + const size_t shnum = SH_NUM; + const size_t total_size = sh_off + shnum * sizeof(Elf64_Shdr); + uint8_t *buf = (uint8_t *)PyMem_RawMalloc(total_size); + if (buf == NULL) { + return NULL; + } + memset(buf, 0, total_size); + + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)buf; + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2LSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_OSABI] = ELFOSABI_NONE; + ehdr->e_type = ET_DYN; + ehdr->e_machine = machine; + ehdr->e_version = EV_CURRENT; + ehdr->e_entry = 0; + ehdr->e_phoff = 0; + ehdr->e_shoff = sh_off; + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_shentsize = sizeof(Elf64_Shdr); + ehdr->e_shnum = shnum; + ehdr->e_shstrndx = SH_SHSTRTAB; + + memcpy(buf + text_off, code_addr, text_size); + memcpy(buf + eh_off, eh_frame, eh_frame_size); + + char *shstr = (char *)(buf + shstr_off); + memcpy(shstr, shstrtab, shstrtab_size); + + char *strtab = (char *)(buf + str_off); + strtab[0] = '\0'; + memcpy(strtab + 1, symname, strlen(symname)); + strtab[strtab_size - 1] = '\0'; + + Elf64_Sym *syms = (Elf64_Sym *)(buf + sym_off); + memset(syms, 0, symtab_size); + /* Section symbol for .text (local) */ + syms[1].st_info = ELF64_ST_INFO(STB_LOCAL, STT_SECTION); + syms[1].st_shndx = 1; + /* Function symbol */ + syms[2].st_name = 1; + syms[2].st_info = ELF64_ST_INFO(STB_GLOBAL, STT_FUNC); + syms[2].st_other = STV_DEFAULT; + syms[2].st_shndx = 1; + /* For ET_DYN/ET_EXEC, st_value is the absolute virtual address. 
*/ + syms[2].st_value = (Elf64_Addr)(uintptr_t)code_addr; + syms[2].st_size = code_size; + + Elf64_Shdr *shdrs = (Elf64_Shdr *)(buf + sh_off); + memset(shdrs, 0, shnum * sizeof(Elf64_Shdr)); + + shdrs[SH_TEXT].sh_name = sh_text; + shdrs[SH_TEXT].sh_type = SHT_PROGBITS; + shdrs[SH_TEXT].sh_flags = SHF_ALLOC | SHF_EXECINSTR; + shdrs[SH_TEXT].sh_addr = (Elf64_Addr)(uintptr_t)code_addr; + shdrs[SH_TEXT].sh_offset = text_off; + shdrs[SH_TEXT].sh_size = text_size; + shdrs[SH_TEXT].sh_addralign = 16; + + shdrs[SH_EH_FRAME].sh_name = sh_eh_frame; + shdrs[SH_EH_FRAME].sh_type = SHT_PROGBITS; + shdrs[SH_EH_FRAME].sh_flags = SHF_ALLOC; + shdrs[SH_EH_FRAME].sh_addr = + (Elf64_Addr)((uintptr_t)code_addr + text_padded); + shdrs[SH_EH_FRAME].sh_offset = eh_off; + shdrs[SH_EH_FRAME].sh_size = eh_frame_size; + shdrs[SH_EH_FRAME].sh_addralign = 8; + + shdrs[SH_SHSTRTAB].sh_name = sh_shstrtab; + shdrs[SH_SHSTRTAB].sh_type = SHT_STRTAB; + shdrs[SH_SHSTRTAB].sh_offset = shstr_off; + shdrs[SH_SHSTRTAB].sh_size = shstrtab_size; + shdrs[SH_SHSTRTAB].sh_addralign = 1; + + shdrs[SH_STRTAB].sh_name = sh_strtab; + shdrs[SH_STRTAB].sh_type = SHT_STRTAB; + shdrs[SH_STRTAB].sh_offset = str_off; + shdrs[SH_STRTAB].sh_size = strtab_size; + shdrs[SH_STRTAB].sh_addralign = 1; + + shdrs[SH_SYMTAB].sh_name = sh_symtab; + shdrs[SH_SYMTAB].sh_type = SHT_SYMTAB; + shdrs[SH_SYMTAB].sh_offset = sym_off; + shdrs[SH_SYMTAB].sh_size = symtab_size; + shdrs[SH_SYMTAB].sh_link = SH_STRTAB; + shdrs[SH_SYMTAB].sh_info = 2; + shdrs[SH_SYMTAB].sh_addralign = 8; + shdrs[SH_SYMTAB].sh_entsize = sizeof(Elf64_Sym); + + struct jit_code_entry *entry = PyMem_RawMalloc(sizeof(*entry)); + if (entry == NULL) { + PyMem_RawFree(buf); + return NULL; + } + entry->symfile_addr = (const char *)buf; + entry->symfile_size = total_size; + entry->code_addr = code_addr; + + PyMutex_Lock(&_Py_jit_debug_mutex); + entry->prev = NULL; + entry->next = __jit_debug_descriptor.first_entry; + if (entry->next != NULL) { + entry->next->prev = 
entry; + } + __jit_debug_descriptor.first_entry = entry; + __jit_debug_descriptor.relevant_entry = entry; + __jit_debug_descriptor.action_flag = JIT_REGISTER_FN; + __jit_debug_register_code(); + __jit_debug_descriptor.action_flag = JIT_NOACTION; + __jit_debug_descriptor.relevant_entry = NULL; + PyMutex_Unlock(&_Py_jit_debug_mutex); + return entry; +} +#endif // defined(PY_HAVE_JIT_GDB_UNWIND) + +void * +_PyJitUnwind_GdbRegisterCode(const void *code_addr, + size_t code_size, + const char *entry, + const char *filename) +{ +#if defined(PY_HAVE_JIT_GDB_UNWIND) + /* GDB expects a stable symbol name and absolute addresses in .eh_frame. */ + if (entry == NULL) { + entry = ""; + } + if (filename == NULL) { + filename = ""; + } + size_t name_size = snprintf(NULL, 0, "py::%s:%s", entry, filename) + 1; + char *name = (char *)PyMem_RawMalloc(name_size); + if (name == NULL) { + return NULL; + } + snprintf(name, name_size, "py::%s:%s", entry, filename); + + uint8_t buffer[1024]; + size_t eh_frame_size = _PyJitUnwind_BuildEhFrame( + buffer, sizeof(buffer), code_addr, code_size, 1); + if (eh_frame_size == 0) { + PyMem_RawFree(name); + return NULL; + } + + void *handle = gdb_jit_register_code(code_addr, code_size, name, + buffer, eh_frame_size); + PyMem_RawFree(name); + return handle; +#else + (void)code_addr; + (void)code_size; + (void)entry; + (void)filename; + return NULL; +#endif +} + +void +_PyJitUnwind_GdbUnregisterCode(void *handle) +{ +#if defined(PY_HAVE_JIT_GDB_UNWIND) + struct jit_code_entry *entry = (struct jit_code_entry *)handle; + if (entry == NULL) { + return; + } + + PyMutex_Lock(&_Py_jit_debug_mutex); + if (entry->prev != NULL) { + entry->prev->next = entry->next; + } + else { + __jit_debug_descriptor.first_entry = entry->next; + } + if (entry->next != NULL) { + entry->next->prev = entry->prev; + } + + __jit_debug_descriptor.relevant_entry = entry; + __jit_debug_descriptor.action_flag = JIT_UNREGISTER_FN; + __jit_debug_register_code(); + 
__jit_debug_descriptor.action_flag = JIT_NOACTION; + __jit_debug_descriptor.relevant_entry = NULL; + + PyMutex_Unlock(&_Py_jit_debug_mutex); + + PyMem_RawFree((void *)entry->symfile_addr); + PyMem_RawFree(entry); +#else + (void)handle; +#endif +} + +#endif // defined(PY_HAVE_PERF_TRAMPOLINE) || defined(PY_HAVE_JIT_GDB_UNWIND) diff --git a/Python/optimizer.c b/Python/optimizer.c index 820a0771a2cb0f..11658fca0da595 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -1448,6 +1448,7 @@ allocate_executor(int exit_count, int length) res->trace = (_PyUOpInstruction *)(res->exits + exit_count); res->code_size = length; res->exit_count = exit_count; + res->jit_gdb_handle = NULL; return res; } diff --git a/Python/optimizer_bytecodes.c b/Python/optimizer_bytecodes.c index 33b5257fd58281..83ecbf0ec12907 100644 --- a/Python/optimizer_bytecodes.c +++ b/Python/optimizer_bytecodes.c @@ -229,8 +229,8 @@ dummy_func(void) { } op(_CHECK_ATTR_CLASS, (type_version/2, owner -- owner)) { - PyObject *type = (PyObject *)_PyType_LookupByVersion(type_version); - if (type) { + PyObject *type = sym_get_probable_value(owner); + if (type != NULL && ((PyTypeObject *)type)->tp_version_tag == type_version) { if (type == sym_get_const(ctx, owner)) { ADD_OP(_NOP, 0, 0); } @@ -246,7 +246,7 @@ dummy_func(void) { op(_GUARD_TYPE_VERSION, (type_version/2, owner -- owner)) { assert(type_version); - assert(this_instr[-1].opcode == _RECORD_TOS_TYPE); + assert(this_instr[-1].opcode == _RECORD_TOS_TYPE || this_instr[-1].opcode == _RECORD_TOS); if (sym_matches_type_version(owner, type_version)) { ADD_OP(_NOP, 0, 0); } diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h index 8f208beb86476b..39e98634a6e9c2 100644 --- a/Python/optimizer_cases.c.h +++ b/Python/optimizer_cases.c.h @@ -2518,7 +2518,7 @@ owner = stack_pointer[-1]; uint32_t type_version = (uint32_t)this_instr->operand0; assert(type_version); - assert(this_instr[-1].opcode == _RECORD_TOS_TYPE); + assert(this_instr[-1].opcode 
== _RECORD_TOS_TYPE || this_instr[-1].opcode == _RECORD_TOS); if (sym_matches_type_version(owner, type_version)) { ADD_OP(_NOP, 0, 0); } @@ -2676,8 +2676,8 @@ JitOptRef owner; owner = stack_pointer[-1]; uint32_t type_version = (uint32_t)this_instr->operand0; - PyObject *type = (PyObject *)_PyType_LookupByVersion(type_version); - if (type) { + PyObject *type = sym_get_probable_value(owner); + if (type != NULL && ((PyTypeObject *)type)->tp_version_tag == type_version) { if (type == sym_get_const(ctx, owner)) { ADD_OP(_NOP, 0, 0); } diff --git a/Python/perf_jit_trampoline.c b/Python/perf_jit_trampoline.c index 0ba856ea610e59..0c460282feceef 100644 --- a/Python/perf_jit_trampoline.c +++ b/Python/perf_jit_trampoline.c @@ -62,6 +62,7 @@ #include "pycore_frame.h" #include "pycore_interp.h" #include "pycore_mmap.h" // _PyAnnotateMemoryMap() +#include "pycore_jit_unwind.h" #include "pycore_runtime.h" // _PyRuntime #ifdef PY_HAVE_PERF_TRAMPOLINE @@ -73,6 +74,7 @@ #include // File control operations #include // Standard I/O operations #include // Standard library functions +#include // memcpy, strlen #include // Memory mapping functions (mmap) #include // System data types #include // System calls (sysconf, getpid) @@ -246,6 +248,25 @@ typedef struct { */ } CodeUnwindingInfoEvent; +/* + * EH Frame Header structure for DWARF unwinding + * + * This header provides metadata about the .eh_frame data that follows. + * It uses PC-relative and data-relative encodings to keep the synthesized + * DSO self-contained when perf injects it. 
+ */ +typedef struct __attribute__((packed)) { + uint8_t version; + uint8_t eh_frame_ptr_enc; + uint8_t fde_count_enc; + uint8_t table_enc; + int32_t eh_frame_ptr; + uint32_t eh_fde_count; + int32_t from; + int32_t to; +} EhFrameHeader; +_Static_assert(sizeof(EhFrameHeader) == 20, "EhFrameHeader layout mismatch"); + // ============================================================================= // GLOBAL STATE MANAGEMENT // ============================================================================= @@ -259,10 +280,11 @@ typedef struct { */ typedef struct { FILE* perf_map; // File handle for the jitdump file - PyThread_type_lock map_lock; // Thread synchronization lock + PyMutex map_lock; // Thread synchronization lock void* mapped_buffer; // Memory-mapped region (signals perf we're active) size_t mapped_size; // Size of the mapped region - int code_id; // Counter for unique code region identifiers + uint32_t code_id; // Counter for unique code region identifiers + uint64_t build_id_salt; // Per-process salt for unique synthetic DSOs } PerfMapJitState; /* Global singleton instance */ @@ -316,40 +338,6 @@ static int64_t get_current_time_microseconds(void) { return ((int64_t)(tv.tv_sec) * 1000000) + tv.tv_usec; } -// ============================================================================= -// UTILITY FUNCTIONS -// ============================================================================= - -/* - * Round up a value to the next multiple of a given number - * - * This is essential for maintaining proper alignment requirements in the - * jitdump format. Many structures need to be aligned to specific boundaries - * (typically 8 or 16 bytes) for efficient processing by perf. 
- * - * Args: - * value: The value to round up - * multiple: The multiple to round up to - * - * Returns: The smallest value >= input that is a multiple of 'multiple' - */ -static size_t round_up(int64_t value, int64_t multiple) { - if (multiple == 0) { - return value; // Avoid division by zero - } - - int64_t remainder = value % multiple; - if (remainder == 0) { - return value; // Already aligned - } - - /* Calculate how much to add to reach the next multiple */ - int64_t difference = multiple - remainder; - int64_t rounded_up_value = value + difference; - - return rounded_up_value; -} - // ============================================================================= // FILE I/O UTILITIES // ============================================================================= @@ -406,623 +394,6 @@ static void perf_map_jit_write_header(int pid, FILE* out_file) { perf_map_jit_write_fully(&header, sizeof(header)); } -// ============================================================================= -// DWARF CONSTANTS AND UTILITIES -// ============================================================================= - -/* - * DWARF (Debug With Arbitrary Record Formats) constants - * - * DWARF is a debugging data format used to provide stack unwinding information. - * These constants define the various encoding types and opcodes used in - * DWARF Call Frame Information (CFI) records. 
- */ - -/* DWARF Call Frame Information version */ -#define DWRF_CIE_VERSION 1 - -/* DWARF CFA (Call Frame Address) opcodes */ -enum { - DWRF_CFA_nop = 0x0, // No operation - DWRF_CFA_offset_extended = 0x5, // Extended offset instruction - DWRF_CFA_def_cfa = 0xc, // Define CFA rule - DWRF_CFA_def_cfa_register = 0xd, // Define CFA register - DWRF_CFA_def_cfa_offset = 0xe, // Define CFA offset - DWRF_CFA_offset_extended_sf = 0x11, // Extended signed offset - DWRF_CFA_advance_loc = 0x40, // Advance location counter - DWRF_CFA_offset = 0x80, // Simple offset instruction - DWRF_CFA_restore = 0xc0 // Restore register -}; - -/* DWARF Exception Handling pointer encodings */ -enum { - DWRF_EH_PE_absptr = 0x00, // Absolute pointer - DWRF_EH_PE_omit = 0xff, // Omitted value - - /* Data type encodings */ - DWRF_EH_PE_uleb128 = 0x01, // Unsigned LEB128 - DWRF_EH_PE_udata2 = 0x02, // Unsigned 2-byte - DWRF_EH_PE_udata4 = 0x03, // Unsigned 4-byte - DWRF_EH_PE_udata8 = 0x04, // Unsigned 8-byte - DWRF_EH_PE_sleb128 = 0x09, // Signed LEB128 - DWRF_EH_PE_sdata2 = 0x0a, // Signed 2-byte - DWRF_EH_PE_sdata4 = 0x0b, // Signed 4-byte - DWRF_EH_PE_sdata8 = 0x0c, // Signed 8-byte - DWRF_EH_PE_signed = 0x08, // Signed flag - - /* Reference type encodings */ - DWRF_EH_PE_pcrel = 0x10, // PC-relative - DWRF_EH_PE_textrel = 0x20, // Text-relative - DWRF_EH_PE_datarel = 0x30, // Data-relative - DWRF_EH_PE_funcrel = 0x40, // Function-relative - DWRF_EH_PE_aligned = 0x50, // Aligned - DWRF_EH_PE_indirect = 0x80 // Indirect -}; - -/* Additional DWARF constants for debug information */ -enum { DWRF_TAG_compile_unit = 0x11 }; -enum { DWRF_children_no = 0, DWRF_children_yes = 1 }; -enum { - DWRF_AT_name = 0x03, // Name attribute - DWRF_AT_stmt_list = 0x10, // Statement list - DWRF_AT_low_pc = 0x11, // Low PC address - DWRF_AT_high_pc = 0x12 // High PC address -}; -enum { - DWRF_FORM_addr = 0x01, // Address form - DWRF_FORM_data4 = 0x06, // 4-byte data - DWRF_FORM_string = 0x08 // String form -}; - 
-/* Line number program opcodes */ -enum { - DWRF_LNS_extended_op = 0, // Extended opcode - DWRF_LNS_copy = 1, // Copy operation - DWRF_LNS_advance_pc = 2, // Advance program counter - DWRF_LNS_advance_line = 3 // Advance line number -}; - -/* Line number extended opcodes */ -enum { - DWRF_LNE_end_sequence = 1, // End of sequence - DWRF_LNE_set_address = 2 // Set address -}; - -/* - * Architecture-specific DWARF register numbers - * - * These constants define the register numbering scheme used by DWARF - * for each supported architecture. The numbers must match the ABI - * specification for proper stack unwinding. - */ -enum { -#ifdef __x86_64__ - /* x86_64 register numbering (note: order is defined by x86_64 ABI) */ - DWRF_REG_AX, // RAX - DWRF_REG_DX, // RDX - DWRF_REG_CX, // RCX - DWRF_REG_BX, // RBX - DWRF_REG_SI, // RSI - DWRF_REG_DI, // RDI - DWRF_REG_BP, // RBP - DWRF_REG_SP, // RSP - DWRF_REG_8, // R8 - DWRF_REG_9, // R9 - DWRF_REG_10, // R10 - DWRF_REG_11, // R11 - DWRF_REG_12, // R12 - DWRF_REG_13, // R13 - DWRF_REG_14, // R14 - DWRF_REG_15, // R15 - DWRF_REG_RA, // Return address (RIP) -#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - /* AArch64 register numbering */ - DWRF_REG_FP = 29, // Frame Pointer - DWRF_REG_RA = 30, // Link register (return address) - DWRF_REG_SP = 31, // Stack pointer -#else -# error "Unsupported target architecture" -#endif -}; - -/* DWARF encoding constants used in EH frame headers */ -static const uint8_t DwarfUData4 = 0x03; // Unsigned 4-byte data -static const uint8_t DwarfSData4 = 0x0b; // Signed 4-byte data -static const uint8_t DwarfPcRel = 0x10; // PC-relative encoding -static const uint8_t DwarfDataRel = 0x30; // Data-relative encoding - -// ============================================================================= -// ELF OBJECT CONTEXT -// ============================================================================= - -/* - * Context for building ELF/DWARF structures - * - * This 
structure maintains state while constructing DWARF unwind information. - * It acts as a simple buffer manager with pointers to track current position - * and important landmarks within the buffer. - */ -typedef struct ELFObjectContext { - uint8_t* p; // Current write position in buffer - uint8_t* startp; // Start of buffer (for offset calculations) - uint8_t* eh_frame_p; // Start of EH frame data (for relative offsets) - uint8_t* fde_p; // Start of FDE data (for PC-relative calculations) - uint32_t code_size; // Size of the code being described -} ELFObjectContext; - -/* - * EH Frame Header structure for DWARF unwinding - * - * This structure provides metadata about the DWARF unwinding information - * that follows. It's required by the perf jitdump format to enable proper - * stack unwinding during profiling. - */ -typedef struct { - unsigned char version; // EH frame version (always 1) - unsigned char eh_frame_ptr_enc; // Encoding of EH frame pointer - unsigned char fde_count_enc; // Encoding of FDE count - unsigned char table_enc; // Encoding of table entries - int32_t eh_frame_ptr; // Pointer to EH frame data - int32_t eh_fde_count; // Number of FDEs (Frame Description Entries) - int32_t from; // Start address of code range - int32_t to; // End address of code range -} EhFrameHeader; - -// ============================================================================= -// DWARF GENERATION UTILITIES -// ============================================================================= - -/* - * Append a null-terminated string to the ELF context buffer - * - * Args: - * ctx: ELF object context - * str: String to append (must be null-terminated) - * - * Returns: Offset from start of buffer where string was written - */ -static uint32_t elfctx_append_string(ELFObjectContext* ctx, const char* str) { - uint8_t* p = ctx->p; - uint32_t ofs = (uint32_t)(p - ctx->startp); - - /* Copy string including null terminator */ - do { - *p++ = (uint8_t)*str; - } while (*str++); - - 
ctx->p = p; - return ofs; -} - -/* - * Append a SLEB128 (Signed Little Endian Base 128) value - * - * SLEB128 is a variable-length encoding used extensively in DWARF. - * It efficiently encodes small numbers in fewer bytes. - * - * Args: - * ctx: ELF object context - * v: Signed value to encode - */ -static void elfctx_append_sleb128(ELFObjectContext* ctx, int32_t v) { - uint8_t* p = ctx->p; - - /* Encode 7 bits at a time, with continuation bit in MSB */ - for (; (uint32_t)(v + 0x40) >= 0x80; v >>= 7) { - *p++ = (uint8_t)((v & 0x7f) | 0x80); // Set continuation bit - } - *p++ = (uint8_t)(v & 0x7f); // Final byte without continuation bit - - ctx->p = p; -} - -/* - * Append a ULEB128 (Unsigned Little Endian Base 128) value - * - * Similar to SLEB128 but for unsigned values. - * - * Args: - * ctx: ELF object context - * v: Unsigned value to encode - */ -static void elfctx_append_uleb128(ELFObjectContext* ctx, uint32_t v) { - uint8_t* p = ctx->p; - - /* Encode 7 bits at a time, with continuation bit in MSB */ - for (; v >= 0x80; v >>= 7) { - *p++ = (char)((v & 0x7f) | 0x80); // Set continuation bit - } - *p++ = (char)v; // Final byte without continuation bit - - ctx->p = p; -} - -/* - * Macros for generating DWARF structures - * - * These macros provide a convenient way to write various data types - * to the DWARF buffer while automatically advancing the pointer. 
- */ -#define DWRF_U8(x) (*p++ = (x)) // Write unsigned 8-bit -#define DWRF_I8(x) (*(int8_t*)p = (x), p++) // Write signed 8-bit -#define DWRF_U16(x) (*(uint16_t*)p = (x), p += 2) // Write unsigned 16-bit -#define DWRF_U32(x) (*(uint32_t*)p = (x), p += 4) // Write unsigned 32-bit -#define DWRF_ADDR(x) (*(uintptr_t*)p = (x), p += sizeof(uintptr_t)) // Write address -#define DWRF_UV(x) (ctx->p = p, elfctx_append_uleb128(ctx, (x)), p = ctx->p) // Write ULEB128 -#define DWRF_SV(x) (ctx->p = p, elfctx_append_sleb128(ctx, (x)), p = ctx->p) // Write SLEB128 -#define DWRF_STR(str) (ctx->p = p, elfctx_append_string(ctx, (str)), p = ctx->p) // Write string - -/* Align to specified boundary with NOP instructions */ -#define DWRF_ALIGNNOP(s) \ - while ((uintptr_t)p & ((s)-1)) { \ - *p++ = DWRF_CFA_nop; \ - } - -/* Write a DWARF section with automatic size calculation */ -#define DWRF_SECTION(name, stmt) \ - { \ - uint32_t* szp_##name = (uint32_t*)p; \ - p += 4; \ - stmt; \ - *szp_##name = (uint32_t)((p - (uint8_t*)szp_##name) - 4); \ - } - -// ============================================================================= -// DWARF EH FRAME GENERATION -// ============================================================================= - -static void elf_init_ehframe(ELFObjectContext* ctx); - -/* - * Initialize DWARF .eh_frame section for a code region - * - * The .eh_frame section contains Call Frame Information (CFI) that describes - * how to unwind the stack at any point in the code. This is essential for - * proper profiling as it allows perf to generate accurate call graphs. - * - * The function generates two main components: - * 1. CIE (Common Information Entry) - describes calling conventions - * 2. 
FDE (Frame Description Entry) - describes specific function unwinding - * - * Args: - * ctx: ELF object context containing code size and buffer pointers - */ -static size_t calculate_eh_frame_size(void) { - /* Calculate the EH frame size for the trampoline function */ - extern void *_Py_trampoline_func_start; - extern void *_Py_trampoline_func_end; - - size_t code_size = (char*)&_Py_trampoline_func_end - (char*)&_Py_trampoline_func_start; - - ELFObjectContext ctx; - char buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) - ctx.code_size = code_size; - ctx.startp = ctx.p = (uint8_t*)buffer; - ctx.fde_p = NULL; - - elf_init_ehframe(&ctx); - return ctx.p - ctx.startp; -} - -static void elf_init_ehframe(ELFObjectContext* ctx) { - uint8_t* p = ctx->p; - uint8_t* framep = p; // Remember start of frame data - - /* - * DWARF Unwind Table for Trampoline Function - * - * This section defines DWARF Call Frame Information (CFI) using encoded macros - * like `DWRF_U8`, `DWRF_UV`, and `DWRF_SECTION` to describe how the trampoline function - * preserves and restores registers. This is used by profiling tools (e.g., `perf`) - * and debuggers for stack unwinding in JIT-compiled code. - * - * ------------------------------------------------- - * TO REGENERATE THIS TABLE FROM GCC OBJECTS: - * ------------------------------------------------- - * - * 1. Create a trampoline source file (e.g., `trampoline.c`): - * - * #include - * typedef PyObject* (*py_evaluator)(void*, void*, int); - * PyObject* trampoline(void *ts, void *f, int throwflag, py_evaluator evaluator) { - * return evaluator(ts, f, throwflag); - * } - * - * 2. Compile to an object file with frame pointer preservation: - * - * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c - * - * 3. 
Extract DWARF unwind info from the object file: - * - * readelf -w trampoline.o - * - * Example output from `.eh_frame`: - * - * 00000000 CIE - * Version: 1 - * Augmentation: "zR" - * Code alignment factor: 4 - * Data alignment factor: -8 - * Return address column: 30 - * DW_CFA_def_cfa: r31 (sp) ofs 0 - * - * 00000014 FDE cie=00000000 pc=0..14 - * DW_CFA_advance_loc: 4 - * DW_CFA_def_cfa_offset: 16 - * DW_CFA_offset: r29 at cfa-16 - * DW_CFA_offset: r30 at cfa-8 - * DW_CFA_advance_loc: 12 - * DW_CFA_restore: r30 - * DW_CFA_restore: r29 - * DW_CFA_def_cfa_offset: 0 - * - * -- These values can be verified by comparing with `readelf -w` or `llvm-dwarfdump --eh-frame`. - * - * ---------------------------------- - * HOW TO TRANSLATE TO DWRF_* MACROS: - * ---------------------------------- - * - * After compiling your trampoline with: - * - * gcc trampoline.c -I. -I./Include -O2 -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -c - * - * run: - * - * readelf -w trampoline.o - * - * to inspect the generated `.eh_frame` data. You will see two main components: - * - * 1. A CIE (Common Information Entry): shared configuration used by all FDEs. - * 2. An FDE (Frame Description Entry): function-specific unwind instructions. 
- * - * --------------------- - * Translating the CIE: - * --------------------- - * From `readelf -w`, you might see: - * - * 00000000 0000000000000010 00000000 CIE - * Version: 1 - * Augmentation: "zR" - * Code alignment factor: 4 - * Data alignment factor: -8 - * Return address column: 30 - * Augmentation data: 1b - * DW_CFA_def_cfa: r31 (sp) ofs 0 - * - * Map this to: - * - * DWRF_SECTION(CIE, - * DWRF_U32(0); // CIE ID (always 0 for CIEs) - * DWRF_U8(DWRF_CIE_VERSION); // Version: 1 - * DWRF_STR("zR"); // Augmentation string "zR" - * DWRF_UV(4); // Code alignment factor = 4 - * DWRF_SV(-8); // Data alignment factor = -8 - * DWRF_U8(DWRF_REG_RA); // Return address register (e.g., x30 = 30) - * DWRF_UV(1); // Augmentation data length = 1 - * DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // Encoding for FDE pointers - * - * DWRF_U8(DWRF_CFA_def_cfa); // DW_CFA_def_cfa - * DWRF_UV(DWRF_REG_SP); // Register: SP (r31) - * DWRF_UV(0); // Offset = 0 - * - * DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer size boundary - * ) - * - * Notes: - * - Use `DWRF_UV` for unsigned LEB128, `DWRF_SV` for signed LEB128. - * - `DWRF_REG_RA` and `DWRF_REG_SP` are architecture-defined constants. 
- * - * --------------------- - * Translating the FDE: - * --------------------- - * From `readelf -w`: - * - * 00000014 0000000000000020 00000018 FDE cie=00000000 pc=0000000000000000..0000000000000014 - * DW_CFA_advance_loc: 4 - * DW_CFA_def_cfa_offset: 16 - * DW_CFA_offset: r29 at cfa-16 - * DW_CFA_offset: r30 at cfa-8 - * DW_CFA_advance_loc: 12 - * DW_CFA_restore: r30 - * DW_CFA_restore: r29 - * DW_CFA_def_cfa_offset: 0 - * - * Map the FDE header and instructions to: - * - * DWRF_SECTION(FDE, - * DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (relative from here) - * DWRF_U32(pc_relative_offset); // PC-relative location of the code (calculated dynamically) - * DWRF_U32(ctx->code_size); // Code range covered by this FDE - * DWRF_U8(0); // Augmentation data length (none) - * - * DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance location by 1 unit (1 * 4 = 4 bytes) - * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 - * DWRF_UV(16); - * - * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Save x29 (frame pointer) - * DWRF_UV(2); // At offset 2 * 8 = 16 bytes - * - * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Save x30 (return address) - * DWRF_UV(1); // At offset 1 * 8 = 8 bytes - * - * DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance location by 3 units (3 * 4 = 12 bytes) - * - * DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // Restore x30 - * DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // Restore x29 - * - * DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP - * DWRF_UV(0); - * ) - * - * To regenerate: - * 1. Get the `code alignment factor`, `data alignment factor`, and `RA column` from the CIE. - * 2. Note the range of the function from the FDE's `pc=...` line and map it to the JIT code as - * the code is in a different address space every time. - * 3. 
For each `DW_CFA_*` entry, use the corresponding `DWRF_*` macro: - * - `DW_CFA_def_cfa_offset` → DWRF_U8(DWRF_CFA_def_cfa_offset), DWRF_UV(value) - * - `DW_CFA_offset: rX` → DWRF_U8(DWRF_CFA_offset | reg), DWRF_UV(offset) - * - `DW_CFA_restore: rX` → DWRF_U8(DWRF_CFA_offset | reg) // restore is same as reusing offset - * - `DW_CFA_advance_loc: N` → DWRF_U8(DWRF_CFA_advance_loc | (N / code_alignment_factor)) - * 4. Use `DWRF_REG_FP`, `DWRF_REG_RA`, etc., for register numbers. - * 5. Use `sizeof(uintptr_t)` (typically 8) for pointer size calculations and alignment. - */ - - /* - * Emit DWARF EH CIE (Common Information Entry) - * - * The CIE describes the calling conventions and basic unwinding rules - * that apply to all functions in this compilation unit. - */ - DWRF_SECTION(CIE, - DWRF_U32(0); // CIE ID (0 indicates this is a CIE) - DWRF_U8(DWRF_CIE_VERSION); // CIE version (1) - DWRF_STR("zR"); // Augmentation string ("zR" = has LSDA) -#ifdef __x86_64__ - DWRF_UV(1); // Code alignment factor (x86_64: 1 byte) -#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - DWRF_UV(4); // Code alignment factor (AArch64: 4 bytes per instruction) -#endif - DWRF_SV(-(int64_t)sizeof(uintptr_t)); // Data alignment factor (negative) - DWRF_U8(DWRF_REG_RA); // Return address register number - DWRF_UV(1); // Augmentation data length - DWRF_U8(DWRF_EH_PE_pcrel | DWRF_EH_PE_sdata4); // FDE pointer encoding - - /* Initial CFI instructions - describe default calling convention */ -#ifdef __x86_64__ - /* x86_64 initial CFI state */ - DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame Address) - DWRF_UV(DWRF_REG_SP); // CFA = SP register - DWRF_UV(sizeof(uintptr_t)); // CFA = SP + pointer_size - DWRF_U8(DWRF_CFA_offset|DWRF_REG_RA); // Return address is saved - DWRF_UV(1); // At offset 1 from CFA -#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - /* AArch64 initial CFI state */ - DWRF_U8(DWRF_CFA_def_cfa); // Define CFA (Call Frame 
Address) - DWRF_UV(DWRF_REG_SP); // CFA = SP register - DWRF_UV(0); // CFA = SP + 0 (AArch64 starts with offset 0) - // No initial register saves in AArch64 CIE -#endif - DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary - ) - - ctx->eh_frame_p = p; // Remember start of FDE data - - /* - * Emit DWARF EH FDE (Frame Description Entry) - * - * The FDE describes unwinding information specific to this function. - * It references the CIE and provides function-specific CFI instructions. - * - * The PC-relative offset is calculated after the entire EH frame is built - * to ensure accurate positioning relative to the synthesized DSO layout. - */ - DWRF_SECTION(FDE, - DWRF_U32((uint32_t)(p - framep)); // Offset to CIE (backwards reference) - ctx->fde_p = p; // Remember where PC offset field is located for later calculation - DWRF_U32(0); // Placeholder for PC-relative offset (calculated at end of elf_init_ehframe) - DWRF_U32(ctx->code_size); // Address range covered by this FDE (code length) - DWRF_U8(0); // Augmentation data length (none) - - /* - * Architecture-specific CFI instructions - * - * These instructions describe how registers are saved and restored - * during function calls. Each architecture has different calling - * conventions and register usage patterns. 
- */ -#ifdef __x86_64__ - /* x86_64 calling convention unwinding rules with frame pointer */ -# if defined(__CET__) && (__CET__ & 1) - DWRF_U8(DWRF_CFA_advance_loc | 4); // Advance past endbr64 (4 bytes) -# endif - DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance past push %rbp (1 byte) - DWRF_U8(DWRF_CFA_def_cfa_offset); // def_cfa_offset 16 - DWRF_UV(16); // New offset: SP + 16 - DWRF_U8(DWRF_CFA_offset | DWRF_REG_BP); // offset r6 at cfa-16 - DWRF_UV(2); // Offset factor: 2 * 8 = 16 bytes - DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past mov %rsp,%rbp (3 bytes) - DWRF_U8(DWRF_CFA_def_cfa_register); // def_cfa_register r6 - DWRF_UV(DWRF_REG_BP); // Use base pointer register - DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance past call *%rcx (2 bytes) + pop %rbp (1 byte) = 3 - DWRF_U8(DWRF_CFA_def_cfa); // def_cfa r7 ofs 8 - DWRF_UV(DWRF_REG_SP); // Use stack pointer register - DWRF_UV(8); // New offset: SP + 8 -#elif defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) - /* AArch64 calling convention unwinding rules */ - DWRF_U8(DWRF_CFA_advance_loc | 1); // Advance by 1 instruction (4 bytes) - DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 16 - DWRF_UV(16); // Stack pointer moved by 16 bytes - DWRF_U8(DWRF_CFA_offset | DWRF_REG_FP); // x29 (frame pointer) saved - DWRF_UV(2); // At CFA-16 (2 * 8 = 16 bytes from CFA) - DWRF_U8(DWRF_CFA_offset | DWRF_REG_RA); // x30 (link register) saved - DWRF_UV(1); // At CFA-8 (1 * 8 = 8 bytes from CFA) - DWRF_U8(DWRF_CFA_advance_loc | 3); // Advance by 3 instructions (12 bytes) - DWRF_U8(DWRF_CFA_restore | DWRF_REG_RA); // Restore x30 - NO DWRF_UV() after this! - DWRF_U8(DWRF_CFA_restore | DWRF_REG_FP); // Restore x29 - NO DWRF_UV() after this! 
- DWRF_U8(DWRF_CFA_def_cfa_offset); // CFA = SP + 0 (stack restored) - DWRF_UV(0); // Back to original stack position -#else -# error "Unsupported target architecture" -#endif - - DWRF_ALIGNNOP(sizeof(uintptr_t)); // Align to pointer boundary - ) - - ctx->p = p; // Update context pointer to end of generated data - - /* Calculate and update the PC-relative offset in the FDE - * - * When perf processes the jitdump, it creates a synthesized DSO with this layout: - * - * Synthesized DSO Memory Layout: - * ┌─────────────────────────────────────────────────────────────┐ < code_start - * │ Code Section │ - * │ (round_up(code_size, 8) bytes) │ - * ├─────────────────────────────────────────────────────────────┤ < start of EH frame data - * │ EH Frame Data │ - * │ ┌─────────────────────────────────────────────────────┐ │ - * │ │ CIE data │ │ - * │ └─────────────────────────────────────────────────────┘ │ - * │ ┌─────────────────────────────────────────────────────┐ │ - * │ │ FDE Header: │ │ - * │ │ - CIE offset (4 bytes) │ │ - * │ │ - PC offset (4 bytes) <─ fde_offset_in_frame ─────┼────┼─> points to code_start - * │ │ - address range (4 bytes) │ │ (this specific field) - * │ │ CFI Instructions... 
│ │ - * │ └─────────────────────────────────────────────────────┘ │ - * ├─────────────────────────────────────────────────────────────┤ < reference_point - * │ EhFrameHeader │ - * │ (navigation metadata) │ - * └─────────────────────────────────────────────────────────────┘ - * - * The PC offset field in the FDE must contain the distance from itself to code_start: - * - * distance = code_start - fde_pc_field - * - * Where: - * fde_pc_field_location = reference_point - eh_frame_size + fde_offset_in_frame - * code_start_location = reference_point - eh_frame_size - round_up(code_size, 8) - * - * Therefore: - * distance = code_start_location - fde_pc_field_location - * = (ref - eh_frame_size - rounded_code_size) - (ref - eh_frame_size + fde_offset_in_frame) - * = -rounded_code_size - fde_offset_in_frame - * = -(round_up(code_size, 8) + fde_offset_in_frame) - * - * Note: fde_offset_in_frame is the offset from EH frame start to the PC offset field, - * - */ - if (ctx->fde_p != NULL) { - int32_t fde_offset_in_frame = (ctx->fde_p - ctx->startp); - int32_t rounded_code_size = round_up(ctx->code_size, 8); - int32_t pc_relative_offset = -(rounded_code_size + fde_offset_in_frame); - - - // Update the PC-relative offset in the FDE - *(int32_t*)ctx->fde_p = pc_relative_offset; - } -} - // ============================================================================= // JITDUMP INITIALIZATION // ============================================================================= @@ -1042,6 +413,12 @@ static void elf_init_ehframe(ELFObjectContext* ctx) { * Returns: Pointer to initialized state, or NULL on failure */ static void* perf_map_jit_init(void) { + PyMutex_Lock(&perf_jit_map_state.map_lock); + if (perf_jit_map_state.perf_map != NULL) { + PyMutex_Unlock(&perf_jit_map_state.map_lock); + return &perf_jit_map_state; + } + char filename[100]; int pid = getpid(); @@ -1051,6 +428,7 @@ static void* perf_map_jit_init(void) { /* Create/open the jitdump file with appropriate permissions */ 
const int fd = open(filename, O_CREAT | O_TRUNC | O_RDWR, 0666); if (fd == -1) { + PyMutex_Unlock(&perf_jit_map_state.map_lock); return NULL; // Failed to create file } @@ -1058,6 +436,7 @@ static void* perf_map_jit_init(void) { const long page_size = sysconf(_SC_PAGESIZE); if (page_size == -1) { close(fd); + PyMutex_Unlock(&perf_jit_map_state.map_lock); return NULL; // Failed to get page size } @@ -1086,6 +465,7 @@ static void* perf_map_jit_init(void) { if (perf_jit_map_state.mapped_buffer == MAP_FAILED) { perf_jit_map_state.mapped_buffer = NULL; close(fd); + PyMutex_Unlock(&perf_jit_map_state.map_lock); return NULL; // Memory mapping failed } (void)_PyAnnotateMemoryMap(perf_jit_map_state.mapped_buffer, page_size, @@ -1098,6 +478,7 @@ static void* perf_map_jit_init(void) { perf_jit_map_state.perf_map = fdopen(fd, "w+"); if (perf_jit_map_state.perf_map == NULL) { close(fd); + PyMutex_Unlock(&perf_jit_map_state.map_lock); return NULL; // Failed to create FILE* } @@ -1113,28 +494,18 @@ static void* perf_map_jit_init(void) { /* Write the jitdump file header */ perf_map_jit_write_header(pid, perf_jit_map_state.perf_map); - /* - * Initialize thread synchronization lock - * - * Multiple threads may attempt to write to the jitdump file - * simultaneously. This lock ensures thread-safe access to the - * global jitdump state. 
- */ - perf_jit_map_state.map_lock = PyThread_allocate_lock(); - if (perf_jit_map_state.map_lock == NULL) { - fclose(perf_jit_map_state.perf_map); - return NULL; // Failed to create lock - } - /* Initialize code ID counter */ perf_jit_map_state.code_id = 0; + perf_jit_map_state.build_id_salt = + ((uint64_t)pid << 32) ^ (uint64_t)get_current_monotonic_ticks(); /* Calculate padding size based on actual unwind info requirements */ - size_t eh_frame_size = calculate_eh_frame_size(); + size_t eh_frame_size = _PyJitUnwind_EhFrameSize(0); size_t unwind_data_size = sizeof(EhFrameHeader) + eh_frame_size; - trampoline_api.code_padding = round_up(unwind_data_size, 16); + trampoline_api.code_padding = _Py_SIZE_ROUND_UP(unwind_data_size, 16); trampoline_api.code_alignment = 32; + PyMutex_Unlock(&perf_jit_map_state.map_lock); return &perf_jit_map_state; } @@ -1143,54 +514,31 @@ static void* perf_map_jit_init(void) { // ============================================================================= /* - * Write a complete jitdump entry for a Python function - * - * This is the main function called by Python's trampoline system whenever - * a new piece of JIT-compiled code needs to be recorded. It writes both - * the unwinding information and the code load event to the jitdump file. - * - * The function performs these steps: - * 1. Initialize jitdump system if not already done - * 2. Extract function name and filename from Python code object - * 3. Generate DWARF unwinding information - * 4. Write unwinding info event to jitdump file - * 5. Write code load event to jitdump file - * - * Args: - * state: Jitdump state (currently unused, uses global state) - * code_addr: Address where the compiled code resides - * code_size: Size of the compiled code in bytes - * co: Python code object containing metadata + * Write a complete jitdump entry for a code region with a provided name. 
* - * IMPORTANT: This function signature is part of Python's internal API - * and must not be changed without coordinating with core Python development. + * This shares the same implementation as the trampoline callback, but + * allows callers that don't have a PyCodeObject to reuse the jitdump + * infrastructure. */ -static void perf_map_jit_write_entry(void *state, const void *code_addr, - unsigned int code_size, PyCodeObject *co) +static void perf_map_jit_write_entry_with_name( + void *state, + const void *code_addr, + size_t code_size, + const char *entry, + const char *filename +) { /* Initialize jitdump system on first use */ - if (perf_jit_map_state.perf_map == NULL) { - void* ret = perf_map_jit_init(); - if(ret == NULL){ - return; // Initialization failed, silently abort - } + void* ret = perf_map_jit_init(); + if (ret == NULL) { + return; // Initialization failed, silently abort } - /* - * Extract function information from Python code object - * - * We create a human-readable function name by combining the qualified - * name (includes class/module context) with the filename. This helps - * developers identify functions in perf reports. - */ - const char *entry = ""; - if (co->co_qualname != NULL) { - entry = PyUnicode_AsUTF8(co->co_qualname); + if (entry == NULL) { + entry = ""; } - - const char *filename = ""; - if (co->co_filename != NULL) { - filename = PyUnicode_AsUTF8(co->co_filename); + if (filename == NULL) { + filename = ""; } /* @@ -1218,15 +566,20 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, * Without it, perf cannot generate accurate call graphs, especially * in optimized code where frame pointers may be omitted. 
*/ - ELFObjectContext ctx; - char buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) - ctx.code_size = code_size; - ctx.startp = ctx.p = (uint8_t*)buffer; - ctx.fde_p = NULL; // Initialize to NULL, will be set when FDE is written + uint8_t buffer[1024]; // Buffer for DWARF data (1KB should be sufficient) + size_t eh_frame_size = _PyJitUnwind_BuildEhFrame( + buffer, sizeof(buffer), code_addr, code_size, 0); + if (eh_frame_size == 0) { + PyMem_RawFree(perf_map_entry); + return; + } - /* Generate EH frame (Exception Handling frame) data */ - elf_init_ehframe(&ctx); - int eh_frame_size = ctx.p - ctx.startp; + /* + * A logical jitdump entry is written as multiple records and also consumes + * a process-global code_id. Serialize the whole sequence so concurrent JIT + * compilation cannot interleave records or reuse an ID. + */ + PyMutex_Lock(&perf_jit_map_state.map_lock); /* * Write Code Unwinding Information Event @@ -1244,12 +597,12 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, assert(ev2.unwind_data_size <= (uint64_t)trampoline_api.code_padding); ev2.eh_frame_hdr_size = sizeof(EhFrameHeader); - ev2.mapped_size = round_up(ev2.unwind_data_size, 16); // 16-byte alignment + ev2.mapped_size = _Py_SIZE_ROUND_UP(ev2.unwind_data_size, 16); // 16-byte alignment /* Calculate total event size with padding */ - int content_size = sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size; - int padding_size = round_up(content_size, 8) - content_size; // 8-byte align - ev2.base.size = content_size + padding_size; + int content_size = (int)(sizeof(ev2) + sizeof(EhFrameHeader) + eh_frame_size); + int padding_size = (int)_Py_SIZE_ROUND_UP((size_t)content_size, 8) - content_size; // 8-byte align + ev2.base.size = (uint32_t)(content_size + padding_size); /* Write the unwinding info event header */ perf_map_jit_write_fully(&ev2, sizeof(ev2)); @@ -1263,20 +616,21 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, */ 
EhFrameHeader f; f.version = 1; - f.eh_frame_ptr_enc = DwarfSData4 | DwarfPcRel; // PC-relative signed 4-byte - f.fde_count_enc = DwarfUData4; // Unsigned 4-byte count - f.table_enc = DwarfSData4 | DwarfDataRel; // Data-relative signed 4-byte + f.eh_frame_ptr_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_pcrel; + f.fde_count_enc = DWRF_EH_PE_udata4; + f.table_enc = DWRF_EH_PE_sdata4 | DWRF_EH_PE_datarel; /* Calculate relative offsets for EH frame navigation */ - f.eh_frame_ptr = -(eh_frame_size + 4 * sizeof(unsigned char)); + f.eh_frame_ptr = -(int32_t)(eh_frame_size + 4 * sizeof(unsigned char)); f.eh_fde_count = 1; // We generate exactly one FDE per function - f.from = -(round_up(code_size, 8) + eh_frame_size); - - int cie_size = ctx.eh_frame_p - ctx.startp; - f.to = -(eh_frame_size - cie_size); + f.from = -(int32_t)(_Py_SIZE_ROUND_UP(code_size, 8) + eh_frame_size); + uint32_t cie_payload_size; + memcpy(&cie_payload_size, buffer, sizeof(cie_payload_size)); + int cie_size = (int)(sizeof(cie_payload_size) + cie_payload_size); + f.to = -(int32_t)(eh_frame_size - cie_size); /* Write EH frame data and header */ - perf_map_jit_write_fully(ctx.startp, eh_frame_size); + perf_map_jit_write_fully(buffer, eh_frame_size); perf_map_jit_write_fully(&f, sizeof(f)); /* Write padding to maintain alignment */ @@ -1313,12 +667,86 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr, /* Write code load event and associated data */ perf_map_jit_write_fully(&ev, sizeof(ev)); perf_map_jit_write_fully(perf_map_entry, name_length+1); // Include null terminator - perf_map_jit_write_fully((void*)(base), size); // Copy actual machine code + /* + * Ensure each synthetic DSO has unique .text bytes. + * + * perf merges DSOs that share a build-id. Since trampolines can share + * identical code and unwind bytes, perf may resolve all JIT frames to + * the first symbol it saw (including entries from previous runs when + * build-id caching is enabled). 
Patch a small marker in the emitted + * bytes to make the build-id depend on a per-process salt and code id + * without modifying the live code. + */ + uint64_t marker = perf_jit_map_state.build_id_salt ^ + ((uint64_t)perf_jit_map_state.code_id << 32) ^ + (uint64_t)code_size; + if (size >= sizeof(marker)) { + size_t prefix = size - sizeof(marker); + perf_map_jit_write_fully((void *)(base), prefix); + perf_map_jit_write_fully(&marker, sizeof(marker)); + } + else if (size > 0) { + uint8_t tmp[sizeof(marker)]; + memcpy(tmp, (void *)(base), size); + for (size_t i = 0; i < size; i++) { + tmp[i] ^= (uint8_t)(marker >> (i * 8)); + } + perf_map_jit_write_fully(tmp, size); + } /* Clean up allocated memory */ + PyMutex_Unlock(&perf_jit_map_state.map_lock); PyMem_RawFree(perf_map_entry); } +/* + * Write a complete jitdump entry for a Python function + * + * This is the main function called by Python's trampoline system whenever + * a new piece of JIT-compiled code needs to be recorded. It writes both + * the unwinding information and the code load event to the jitdump file. + * + * The function performs these steps: + * 1. Initialize jitdump system if not already done + * 2. Extract function name and filename from Python code object + * 3. Generate DWARF unwinding information + * 4. Write unwinding info event to jitdump file + * 5. Write code load event to jitdump file + * + * Args: + * state: Jitdump state (currently unused, uses global state) + * code_addr: Address where the compiled code resides + * code_size: Size of the compiled code in bytes + * co: Python code object containing metadata + * + * IMPORTANT: This function signature is part of Python's internal API + * and must not be changed without coordinating with core Python development. 
+ */ +static void perf_map_jit_write_entry(void *state, const void *code_addr, + size_t code_size, PyCodeObject *co) +{ + const char *entry = ""; + const char *filename = ""; + if (co != NULL) { + if (co->co_qualname != NULL) { + entry = PyUnicode_AsUTF8(co->co_qualname); + } + if (co->co_filename != NULL) { + filename = PyUnicode_AsUTF8(co->co_filename); + } + } + perf_map_jit_write_entry_with_name(state, code_addr, code_size, + entry, filename); +} + +void +_PyPerfJit_WriteNamedCode(const void *code_addr, size_t code_size, + const char *entry, const char *filename) +{ + perf_map_jit_write_entry_with_name( + NULL, code_addr, code_size, entry, filename); +} + // ============================================================================= // CLEANUP AND FINALIZATION // ============================================================================= @@ -1346,15 +774,12 @@ static int perf_map_jit_fini(void* state) { * writing to the file when we close it. This prevents corruption * and ensures all data is properly flushed. 
*/ + PyMutex_Lock(&perf_jit_map_state.map_lock); if (perf_jit_map_state.perf_map != NULL) { - PyThread_acquire_lock(perf_jit_map_state.map_lock, 1); fclose(perf_jit_map_state.perf_map); // This also flushes buffers - PyThread_release_lock(perf_jit_map_state.map_lock); - - /* Clean up synchronization primitive */ - PyThread_free_lock(perf_jit_map_state.map_lock); perf_jit_map_state.perf_map = NULL; } + PyMutex_Unlock(&perf_jit_map_state.map_lock); /* * Unmap the memory region diff --git a/Python/perf_trampoline.c b/Python/perf_trampoline.c index 0d835f3b7f56a9..58c61e64bfc4e9 100644 --- a/Python/perf_trampoline.c +++ b/Python/perf_trampoline.c @@ -243,7 +243,7 @@ perf_trampoline_code_watcher(PyCodeEvent event, PyCodeObject *co) static void perf_map_write_entry(void *state, const void *code_addr, - unsigned int code_size, PyCodeObject *co) + size_t code_size, PyCodeObject *co) { const char *entry = ""; if (co->co_qualname != NULL) { diff --git a/Python/record_functions.c.h b/Python/record_functions.c.h index 504f6e1d9901c3..3163f45f4bb751 100644 --- a/Python/record_functions.c.h +++ b/Python/record_functions.c.h @@ -101,10 +101,11 @@ void _PyOpcode_RecordFunction_CODE(_PyInterpreterFrame *frame, _PyStackRef *stac #define _RECORD_TOS_TYPE_INDEX 1 #define _RECORD_NOS_INDEX 2 #define _RECORD_3OS_GEN_FUNC_INDEX 3 -#define _RECORD_NOS_GEN_FUNC_INDEX 4 -#define _RECORD_CALLABLE_INDEX 5 -#define _RECORD_CALLABLE_KW_INDEX 6 -#define _RECORD_4OS_INDEX 7 +#define _RECORD_TOS_INDEX 4 +#define _RECORD_NOS_GEN_FUNC_INDEX 5 +#define _RECORD_CALLABLE_INDEX 6 +#define _RECORD_CALLABLE_KW_INDEX 7 +#define _RECORD_4OS_INDEX 8 const _PyOpcodeRecordEntry _PyOpcode_RecordEntries[256] = { [TO_BOOL_BOOL] = {1, {_RECORD_TOS_TYPE_INDEX}}, @@ -136,15 +137,15 @@ const _PyOpcodeRecordEntry _PyOpcode_RecordEntries[256] = { [STORE_ATTR] = {1, {_RECORD_TOS_TYPE_INDEX}}, [LOAD_SUPER_ATTR] = {1, {_RECORD_NOS_INDEX}}, [LOAD_SUPER_ATTR_METHOD] = {1, {_RECORD_NOS_INDEX}}, - [LOAD_ATTR] = {1, 
{_RECORD_TOS_TYPE_INDEX}}, - [LOAD_ATTR_INSTANCE_VALUE] = {1, {_RECORD_TOS_TYPE_INDEX}}, - [LOAD_ATTR_MODULE] = {1, {_RECORD_TOS_TYPE_INDEX}}, - [LOAD_ATTR_WITH_HINT] = {1, {_RECORD_TOS_TYPE_INDEX}}, - [LOAD_ATTR_SLOT] = {1, {_RECORD_TOS_TYPE_INDEX}}, - [LOAD_ATTR_CLASS] = {1, {_RECORD_TOS_TYPE_INDEX}}, - [LOAD_ATTR_CLASS_WITH_METACLASS_CHECK] = {1, {_RECORD_TOS_TYPE_INDEX}}, - [LOAD_ATTR_PROPERTY] = {1, {_RECORD_TOS_TYPE_INDEX}}, - [LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN] = {1, {_RECORD_TOS_TYPE_INDEX}}, + [LOAD_ATTR] = {1, {_RECORD_TOS_INDEX}}, + [LOAD_ATTR_INSTANCE_VALUE] = {1, {_RECORD_TOS_INDEX}}, + [LOAD_ATTR_MODULE] = {1, {_RECORD_TOS_INDEX}}, + [LOAD_ATTR_WITH_HINT] = {1, {_RECORD_TOS_INDEX}}, + [LOAD_ATTR_SLOT] = {1, {_RECORD_TOS_INDEX}}, + [LOAD_ATTR_CLASS] = {1, {_RECORD_TOS_INDEX}}, + [LOAD_ATTR_CLASS_WITH_METACLASS_CHECK] = {1, {_RECORD_TOS_INDEX}}, + [LOAD_ATTR_PROPERTY] = {1, {_RECORD_TOS_INDEX}}, + [LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN] = {1, {_RECORD_TOS_INDEX}}, [STORE_ATTR_INSTANCE_VALUE] = {1, {_RECORD_TOS_TYPE_INDEX}}, [STORE_ATTR_WITH_HINT] = {1, {_RECORD_TOS_TYPE_INDEX}}, [STORE_ATTR_SLOT] = {1, {_RECORD_TOS_TYPE_INDEX}}, @@ -158,11 +159,11 @@ const _PyOpcodeRecordEntry _PyOpcode_RecordEntries[256] = { [FOR_ITER_RANGE] = {1, {_RECORD_NOS_GEN_FUNC_INDEX}}, [FOR_ITER_GEN] = {1, {_RECORD_NOS_GEN_FUNC_INDEX}}, [LOAD_SPECIAL] = {1, {_RECORD_TOS_TYPE_INDEX}}, - [LOAD_ATTR_METHOD_WITH_VALUES] = {1, {_RECORD_TOS_TYPE_INDEX}}, - [LOAD_ATTR_METHOD_NO_DICT] = {1, {_RECORD_TOS_TYPE_INDEX}}, - [LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES] = {1, {_RECORD_TOS_TYPE_INDEX}}, - [LOAD_ATTR_NONDESCRIPTOR_NO_DICT] = {1, {_RECORD_TOS_TYPE_INDEX}}, - [LOAD_ATTR_METHOD_LAZY_DICT] = {1, {_RECORD_TOS_TYPE_INDEX}}, + [LOAD_ATTR_METHOD_WITH_VALUES] = {1, {_RECORD_TOS_INDEX}}, + [LOAD_ATTR_METHOD_NO_DICT] = {1, {_RECORD_TOS_INDEX}}, + [LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES] = {1, {_RECORD_TOS_INDEX}}, + [LOAD_ATTR_NONDESCRIPTOR_NO_DICT] = {1, {_RECORD_TOS_INDEX}}, + 
[LOAD_ATTR_METHOD_LAZY_DICT] = {1, {_RECORD_TOS_INDEX}}, [CALL] = {1, {_RECORD_CALLABLE_INDEX}}, [CALL_PY_GENERAL] = {1, {_RECORD_CALLABLE_INDEX}}, [CALL_BOUND_METHOD_GENERAL] = {1, {_RECORD_CALLABLE_INDEX}}, @@ -199,12 +200,13 @@ const _PyOpcodeRecordSlotMap _PyOpcode_RecordSlotMaps[256] = { [BINARY_OP_SUBSCR_GETITEM] = {1, 0, {0}}, [SEND_GEN] = {1, 0, {0}}, [LOAD_SUPER_ATTR_METHOD] = {1, 0, {0}}, - [LOAD_ATTR_INSTANCE_VALUE] = {1, 0, {0}}, - [LOAD_ATTR_WITH_HINT] = {1, 0, {0}}, - [LOAD_ATTR_SLOT] = {1, 0, {0}}, + [LOAD_ATTR_INSTANCE_VALUE] = {1, 1, {0}}, + [LOAD_ATTR_WITH_HINT] = {1, 1, {0}}, + [LOAD_ATTR_SLOT] = {1, 1, {0}}, + [LOAD_ATTR_CLASS] = {1, 0, {0}}, [LOAD_ATTR_CLASS_WITH_METACLASS_CHECK] = {1, 0, {0}}, - [LOAD_ATTR_PROPERTY] = {1, 0, {0}}, - [LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN] = {1, 0, {0}}, + [LOAD_ATTR_PROPERTY] = {1, 1, {0}}, + [LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN] = {1, 1, {0}}, [STORE_ATTR_INSTANCE_VALUE] = {1, 0, {0}}, [STORE_ATTR_WITH_HINT] = {1, 0, {0}}, [STORE_ATTR_SLOT] = {1, 0, {0}}, @@ -213,11 +215,11 @@ const _PyOpcodeRecordSlotMap _PyOpcode_RecordSlotMaps[256] = { [GET_ITER_VIRTUAL] = {1, 0, {0}}, [FOR_ITER_GEN] = {1, 0, {0}}, [LOAD_SPECIAL] = {1, 0, {0}}, - [LOAD_ATTR_METHOD_WITH_VALUES] = {1, 0, {0}}, - [LOAD_ATTR_METHOD_NO_DICT] = {1, 0, {0}}, - [LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES] = {1, 0, {0}}, - [LOAD_ATTR_NONDESCRIPTOR_NO_DICT] = {1, 0, {0}}, - [LOAD_ATTR_METHOD_LAZY_DICT] = {1, 0, {0}}, + [LOAD_ATTR_METHOD_WITH_VALUES] = {1, 1, {0}}, + [LOAD_ATTR_METHOD_NO_DICT] = {1, 1, {0}}, + [LOAD_ATTR_NONDESCRIPTOR_WITH_VALUES] = {1, 1, {0}}, + [LOAD_ATTR_NONDESCRIPTOR_NO_DICT] = {1, 1, {0}}, + [LOAD_ATTR_METHOD_LAZY_DICT] = {1, 1, {0}}, [CALL_PY_GENERAL] = {1, 0, {0}}, [CALL_BOUND_METHOD_GENERAL] = {1, 1, {0}}, [CALL_NON_PY_GENERAL] = {1, 0, {0}}, @@ -237,11 +239,12 @@ const _PyOpcodeRecordSlotMap _PyOpcode_RecordSlotMaps[256] = { [BINARY_OP] = {2, 2, {1, 0}}, }; -const _Py_RecordFuncPtr _PyOpcode_RecordFunctions[8] = { +const 
_Py_RecordFuncPtr _PyOpcode_RecordFunctions[9] = { [0] = NULL, [_RECORD_TOS_TYPE_INDEX] = _PyOpcode_RecordFunction_TOS_TYPE, [_RECORD_NOS_INDEX] = _PyOpcode_RecordFunction_NOS, [_RECORD_3OS_GEN_FUNC_INDEX] = _PyOpcode_RecordFunction_3OS_GEN_FUNC, + [_RECORD_TOS_INDEX] = _PyOpcode_RecordFunction_TOS, [_RECORD_NOS_GEN_FUNC_INDEX] = _PyOpcode_RecordFunction_NOS_GEN_FUNC, [_RECORD_CALLABLE_INDEX] = _PyOpcode_RecordFunction_CALLABLE, [_RECORD_CALLABLE_KW_INDEX] = _PyOpcode_RecordFunction_CALLABLE_KW, diff --git a/Python/sysmodule.c b/Python/sysmodule.c index 1ee0b3bec684f9..c6447d03369a94 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2707,7 +2707,7 @@ PyAPI_FUNC(int) PyUnstable_PerfMapState_Init(void) { PyAPI_FUNC(int) PyUnstable_WritePerfMapEntry( const void *code_addr, - unsigned int code_size, + size_t code_size, const char *entry_name ) { #ifndef MS_WINDOWS @@ -2718,7 +2718,7 @@ PyAPI_FUNC(int) PyUnstable_WritePerfMapEntry( } } PyThread_acquire_lock(perf_map_state.map_lock, 1); - fprintf(perf_map_state.perf_map, "%" PRIxPTR " %x %s\n", (uintptr_t) code_addr, code_size, entry_name); + fprintf(perf_map_state.perf_map, "%" PRIxPTR " %zx %s\n", (uintptr_t) code_addr, code_size, entry_name); fflush(perf_map_state.perf_map); PyThread_release_lock(perf_map_state.map_lock); #endif diff --git a/Tools/build/smelly.py b/Tools/build/smelly.py index 7197d70bc8bd0c..17547d4d916e9f 100755 --- a/Tools/build/smelly.py +++ b/Tools/build/smelly.py @@ -25,6 +25,8 @@ # "Legacy": some old symbols are prefixed by "PY_". 
EXCEPTIONS = frozenset({ 'PY_TIMEOUT_MAX', + '__jit_debug_descriptor', + '__jit_debug_register_code', }) IGNORED_EXTENSION = "_ctypes_test" diff --git a/Tools/c-analyzer/cpython/_parser.py b/Tools/c-analyzer/cpython/_parser.py index a251a045b91144..fc3cbf3779db26 100644 --- a/Tools/c-analyzer/cpython/_parser.py +++ b/Tools/c-analyzer/cpython/_parser.py @@ -318,14 +318,17 @@ def format_tsv_lines(lines): _abs('Modules/_hacl/*.c'): (200_000, 500), _abs('Modules/posixmodule.c'): (20_000, 500), _abs('Modules/termios.c'): (10_000, 800), + _abs('Modules/_remote_debugging/debug_offsets_validation.h'): (25_000, 1000), _abs('Modules/_remote_debugging/*.h'): (20_000, 1000), _abs('Modules/_testcapimodule.c'): (20_000, 400), _abs('Modules/expat/expat.h'): (10_000, 400), _abs('Objects/stringlib/unicode_format.h'): (10_000, 400), _abs('Objects/typeobject.c'): (380_000, 13_000), _abs('Python/compile.c'): (20_000, 500), + _abs('Python/jit_unwind.c'): (20_000, 300), _abs('Python/optimizer.c'): (100_000, 5_000), _abs('Python/parking_lot.c'): (40_000, 1000), + _abs('Python/perf_jit_trampoline.c'): (40_000, 1000), _abs('Python/pylifecycle.c'): (750_000, 5000), _abs('Python/pystate.c'): (750_000, 5000), _abs('Python/initconfig.c'): (50_000, 500), @@ -344,7 +347,7 @@ def format_tsv_lines(lines): _abs('Modules/_ssl_data_300.h'): (80_000, 10_000), _abs('Modules/_ssl_data_111.h'): (80_000, 10_000), _abs('Modules/cjkcodecs/mappings_*.h'): (160_000, 2_000), - _abs('Modules/clinic/_testclinic.c.h'): (120_000, 5_000), + _abs('Modules/clinic/_testclinic.c.h'): (125_000, 5_000), _abs('Modules/unicodedata_db.h'): (180_000, 3_000), _abs('Modules/unicodename_db.h'): (1_200_000, 15_000), _abs('Objects/unicodetype_db.h'): (240_000, 3_000), diff --git a/Tools/c-analyzer/cpython/ignored.tsv b/Tools/c-analyzer/cpython/ignored.tsv index d2489387f46caa..aa89e312b62482 100644 --- a/Tools/c-analyzer/cpython/ignored.tsv +++ b/Tools/c-analyzer/cpython/ignored.tsv @@ -386,6 +386,8 @@ Python/intrinsics.c - 
_PyIntrinsics_UnaryFunctions - Python/intrinsics.c - _PyIntrinsics_BinaryFunctions - Python/lock.c - TIME_TO_BE_FAIR_NS - Python/opcode_targets.h - opcode_targets - +Python/jit_unwind.c - __jit_debug_descriptor - +Python/jit_unwind.c - _Py_jit_debug_mutex - Python/perf_trampoline.c - _Py_perfmap_callbacks - Python/perf_jit_trampoline.c - _Py_perfmap_jit_callbacks - Python/perf_jit_trampoline.c - perf_jit_map_state - diff --git a/Tools/jit/README.md b/Tools/jit/README.md index 9361f39dcc64f2..2a687bb9e899e7 100644 --- a/Tools/jit/README.md +++ b/Tools/jit/README.md @@ -9,7 +9,7 @@ Python 3.11 or newer is required to build the JIT. The JIT compiler does not require end users to install any third-party dependencies, but part of it must be *built* using LLVM[^why-llvm]. You are *not* required to build the rest of CPython using LLVM, or even the same version of LLVM (in fact, this is uncommon). -LLVM version 21 is the officially supported version. Both `clang` and `llvm-readobj` need to be installed and discoverable (version suffixes, like `clang-21`, are okay). It's highly recommended that you also have `llvm-objdump` available, since this allows the build script to dump human-readable assembly for the generated code. +LLVM version 21 is the officially supported version. The tools `clang`, `llvm-readobj`, `llvm-objdump`, and `llvm-dwarfdump` need to be installed and discoverable (version suffixes, like `clang-21`, are okay). You can customize the LLVM configuration using environment variables before running configure: @@ -85,7 +85,7 @@ If you're looking for information on how to update the JIT build dependencies, s [^pep-744]: [PEP 744](https://peps.python.org/pep-0744/) -[^why-llvm]: Clang is specifically needed because it's the only C compiler with support for guaranteed tail calls (`musttail`), which are required by CPython's continuation-passing-style approach to JIT compilation. 
Since LLVM also includes other functionalities we need (namely, object file parsing and disassembly), it's convenient to only support one toolchain at this time. +[^why-llvm]: Clang is specifically needed because it's the only C compiler with support for guaranteed tail calls (`musttail`), which are required by CPython's continuation-passing-style approach to JIT compilation. Since LLVM also includes other functionalities we need (namely, object file parsing, disassembly, and DWARF inspection), it's convenient to only support one toolchain at this time. ### Understanding JIT behavior diff --git a/Tools/jit/_dwarf.py b/Tools/jit/_dwarf.py new file mode 100644 index 00000000000000..5b6b148562e109 --- /dev/null +++ b/Tools/jit/_dwarf.py @@ -0,0 +1,236 @@ +"""Utilities for deriving JIT unwind information from DWARF CFI.""" + +import dataclasses +import pathlib +import re +import typing + +_LLVMRun = typing.Callable[..., typing.Awaitable[str]] + + +@dataclasses.dataclass(frozen=True) +class UnwindInfo: + code_alignment_factor: int + data_alignment_factor: int + return_address_register: int + cfa_register: int + cfa_offset: int + frame_pointer_register: int + frame_pointer_offset: int + return_address_offset: int + + +@dataclasses.dataclass(frozen=True) +class ELFUnwindConfig: + frame_pointer: str + return_address: str + register_numbers: typing.Mapping[str, int] + call_instruction_prefixes: tuple[str, ...] 
+
+    def is_call_instruction(self, instruction: str) -> bool:
+        return instruction.startswith(self.call_instruction_prefixes)
+
+
+@dataclasses.dataclass(frozen=True)
+class _UnwindRow:
+    pc: int
+    cfa_register: str
+    cfa_offset: int
+    saved_registers: dict[str, int]
+
+
+class ELFUnwindInfo:
+    def __init__(
+        self,
+        target_name: str,
+        *,
+        config: ELFUnwindConfig,
+        verbose: bool = False,
+        llvm_version: str,
+        llvm_tools_install_dir: str | None = None,
+        llvm_run: _LLVMRun,
+    ) -> None:
+        self.target_name = target_name
+        self.config = config
+        self.verbose = verbose
+        self.llvm_version = llvm_version
+        self.llvm_tools_install_dir = llvm_tools_install_dir
+        self.llvm_run = llvm_run
+
+    @staticmethod
+    def _parse_dwarfdump_int(
+        dump: str, field: str, *, required: bool = True
+    ) -> int | None:
+        match = re.search(rf"^\s*{field}:\s+(-?\d+)$", dump, re.MULTILINE)
+        if match is None:
+            if required:
+                raise ValueError(f"missing {field} in llvm-dwarfdump output")
+            return None
+        return int(match.group(1))
+
+    @staticmethod
+    def _parse_dwarfdump_rows(dump: str) -> list[_UnwindRow]:
+        row_pattern = re.compile(
+            r"^\s*0x(?P<pc>[0-9a-f]+):\s+"
+            r"CFA=(?P<cfa_register>[A-Z][A-Z0-9]*)"
+            r"(?P<cfa_offset>[+-]\d+)?"
+ r"(?::\s*(?P.*))?$" + ) + saved_pattern = re.compile( + r"(?P[A-Z][A-Z0-9]*)=\[CFA(?P[+-]\d+)?\]" + ) + rows = [] + for line in dump.splitlines(): + row_match = row_pattern.match(line) + if row_match is None: + continue + saved_registers = {} + saved = row_match["saved"] + if saved: + for saved_match in saved_pattern.finditer(saved): + offset = saved_match["offset"] + saved_registers[saved_match["register"]] = ( + int(offset) if offset is not None else 0 + ) + cfa_offset = row_match["cfa_offset"] + rows.append( + _UnwindRow( + pc=int(row_match["pc"], 16), + cfa_register=row_match["cfa_register"], + cfa_offset=int(cfa_offset) if cfa_offset is not None else 0, + saved_registers=saved_registers, + ) + ) + if not rows: + raise ValueError("missing interpreted CFI rows in llvm-dwarfdump output") + return rows + + @staticmethod + def _parse_objdump_instructions(dump: str) -> list[tuple[int, str]]: + instructions = [] + for line in dump.splitlines(): + match = re.match( + r"^\s*(?P[0-9a-f]+):\s+" + r"(?:(?:[0-9a-f]{2}|[0-9a-f]{8})\s+)+" + r"(?P.+)$", + line, + ) + if match: + instructions.append( + ( + int(match["pc"], 16), + re.sub(r"\s+", " ", match["instruction"].strip()), + ) + ) + if not instructions: + raise ValueError("missing instructions in llvm-objdump output") + return instructions + + def _reg_number(self, register: str) -> int: + try: + return self.config.register_numbers[register] + except KeyError as exc: + raise ValueError( + f"unsupported register {register!r} in llvm-dwarfdump output" + ) from exc + + @staticmethod + def _encoded_cfa_offset(byte_offset: int, data_alignment_factor: int) -> int: + if data_alignment_factor == 0: + raise ValueError("DWARF data alignment factor must not be zero") + if byte_offset % data_alignment_factor: + raise ValueError( + f"offset {byte_offset} is not a multiple of " + f"data alignment factor {data_alignment_factor}" + ) + return byte_offset // data_alignment_factor + + async def _read_objdump(self, output: pathlib.Path) 
-> str: + return await self.llvm_run( + "llvm-objdump", + ["-d", f"{output}"], + echo=self.verbose, + llvm_version=self.llvm_version, + llvm_tools_install_dir=self.llvm_tools_install_dir, + ) + + async def _read_eh_frame(self, output: pathlib.Path) -> str: + return await self.llvm_run( + "llvm-dwarfdump", + ["--eh-frame", f"{output}"], + echo=self.verbose, + llvm_version=self.llvm_version, + llvm_tools_install_dir=self.llvm_tools_install_dir, + ) + + def _executor_call_pc(self, disassembly: str) -> int: + calls = [ + pc + for pc, instruction in self._parse_objdump_instructions(disassembly) + if self.config.is_call_instruction(instruction) + ] + if len(calls) != 1: + raise ValueError( + f"{self.target_name} JIT shim should contain exactly one executor call" + ) + call_pc = calls[0] + return call_pc + + def _active_row(self, eh_frame: str, call_pc: int) -> _UnwindRow: + rows = self._parse_dwarfdump_rows(eh_frame) + active_rows = [row for row in rows if row.pc <= call_pc] + if not active_rows: + raise ValueError( + f"{self.target_name} JIT shim has no CFI row for executor call " + f"at 0x{call_pc:x}" + ) + return max(active_rows, key=lambda row: row.pc) + + def _check_saved_registers(self, row: _UnwindRow) -> None: + if ( + self.config.frame_pointer not in row.saved_registers + or self.config.return_address not in row.saved_registers + ): + raise ValueError( + f"{self.target_name} JIT shim CFI row at 0x{row.pc:x} " + f"does not save {self.config.frame_pointer} and " + f"{self.config.return_address}" + ) + + def _build_unwind_info(self, eh_frame: str, active_row: _UnwindRow) -> UnwindInfo: + code_alignment_factor = self._parse_dwarfdump_int( + eh_frame, "Code alignment factor" + ) + data_alignment_factor = self._parse_dwarfdump_int( + eh_frame, "Data alignment factor" + ) + return_address_register = self._parse_dwarfdump_int( + eh_frame, "Return address column" + ) + assert code_alignment_factor is not None + assert data_alignment_factor is not None + assert 
return_address_register is not None + return UnwindInfo( + code_alignment_factor=code_alignment_factor, + data_alignment_factor=data_alignment_factor, + return_address_register=return_address_register, + cfa_register=self._reg_number(active_row.cfa_register), + cfa_offset=active_row.cfa_offset, + frame_pointer_register=self._reg_number(self.config.frame_pointer), + frame_pointer_offset=self._encoded_cfa_offset( + active_row.saved_registers[self.config.frame_pointer], + data_alignment_factor, + ), + return_address_offset=self._encoded_cfa_offset( + active_row.saved_registers[self.config.return_address], + data_alignment_factor, + ), + ) + + async def extract(self, output: pathlib.Path) -> UnwindInfo: + disassembly = await self._read_objdump(output) + call_pc = self._executor_call_pc(disassembly) + eh_frame = await self._read_eh_frame(output) + active_row = self._active_row(eh_frame, call_pc) + self._check_saved_registers(active_row) + return self._build_unwind_info(eh_frame, active_row) diff --git a/Tools/jit/_targets.py b/Tools/jit/_targets.py index 15cac3de3fe11f..ceee383ea680d7 100644 --- a/Tools/jit/_targets.py +++ b/Tools/jit/_targets.py @@ -12,6 +12,7 @@ import typing import shlex +import _dwarf import _llvm import _optimizers import _schema @@ -37,6 +38,27 @@ ) +_ELF_UNWIND_AARCH64 = _dwarf.ELFUnwindConfig( + frame_pointer="W29", + return_address="W30", + register_numbers={ + "W29": 29, + "W30": 30, + }, + call_instruction_prefixes=("blr ",), +) + +_ELF_UNWIND_X86_64 = _dwarf.ELFUnwindConfig( + frame_pointer="RBP", + return_address="RIP", + register_numbers={ + "RBP": 6, + "RIP": 16, + }, + call_instruction_prefixes=("callq ", "call "), +) + + @dataclasses.dataclass class _Target(typing.Generic[_S, _R]): triple: str @@ -52,6 +74,7 @@ class _Target(typing.Generic[_S, _R]): verbose: bool = False cflags: str = "" frame_pointers: bool = False + unwind: _dwarf.ELFUnwindConfig | None = None llvm_version: str = _llvm._LLVM_VERSION llvm_tools_install_dir: str | None 
= None known_symbols: dict[str, int] = dataclasses.field(default_factory=dict) @@ -88,6 +111,32 @@ def _compute_digest(self) -> str: hasher.update(pathlib.Path(dirpath, filename).read_bytes()) return hasher.hexdigest() + def _write_generated_header( + self, + output: pathlib.Path, + *, + digest: str, + comment: str, + lines: typing.Iterable[str], + ) -> None: + output_new = output.with_name(f"{output.name}.new") + try: + with output_new.open("w") as file: + file.write(digest) + if comment: + file.write(f"// {comment}\n") + file.write("\n") + for line in lines: + file.write(f"{line}\n") + try: + output_new.replace(output) + except FileNotFoundError: + # another process probably already moved the file + if not output.is_file(): + raise + finally: + output_new.unlink(missing_ok=True) + async def _parse(self, path: pathlib.Path) -> _stencils.StencilGroup: group = _stencils.StencilGroup() args = ["--disassemble", "--reloc", f"{path}"] @@ -234,7 +283,7 @@ async def _build_shim_object(self, output: pathlib.Path) -> None: args_o += self._shim_compile_args() args_o += [ "-c", - # The linked shim is a real function in the final binary, so + # The shim is a real function in the final binary, so # keep unwind info for debuggers and stack walkers. 
"-fasynchronous-unwind-tables", ] @@ -251,6 +300,11 @@ async def _build_shim_object(self, output: pathlib.Path) -> None: llvm_tools_install_dir=self.llvm_tools_install_dir, ) + async def _get_shim_unwind_info( + self, output: pathlib.Path + ) -> _dwarf.UnwindInfo | None: + return None + async def _build_stencils(self) -> dict[str, _stencils.StencilGroup]: generated_cases = PYTHON_EXECUTOR_CASES_C_H.read_text() cases_and_opnames = sorted( @@ -289,6 +343,7 @@ def build( force: bool = False, jit_stencils: pathlib.Path, jit_shim_object: pathlib.Path, + jit_unwind_info: pathlib.Path, ) -> None: """Build jit_stencils.h and the shim object in the given directory.""" jit_stencils.parent.mkdir(parents=True, exist_ok=True) @@ -300,39 +355,42 @@ def build( outline = "=" * len(warning) print("\n".join(["", outline, warning, request, outline, ""])) digest = f"// {self._compute_digest()}\n" + # The generated headers include the input digest as their first line. + # If every generated artifact is current, skip the expensive rebuild. if ( not force and jit_stencils.exists() and jit_stencils.read_text().startswith(digest) and jit_shim_object.exists() + and jit_unwind_info.exists() + and jit_unwind_info.read_text().startswith(digest) ): return + # Build the shim first so its compiled DWARF CFI can be used to derive + # the unwind rules emitted into jit_unwind_info-.h. ASYNCIO_RUNNER.run(self._build_shim_object(jit_shim_object)) + unwind_info = ASYNCIO_RUNNER.run(self._get_shim_unwind_info(jit_shim_object)) + self._write_generated_header( + jit_unwind_info, + digest=digest, + comment=comment, + lines=_writer.dump_unwind_info(unwind_info), + ) + # Build the uop stencils after the shim metadata has been emitted. 
stencil_groups = ASYNCIO_RUNNER.run(self._build_stencils()) - jit_stencils_new = jit_stencils.parent / "jit_stencils.h.new" - try: - with jit_stencils_new.open("w") as file: - file.write(digest) - if comment: - file.write(f"// {comment}\n") - file.write("\n") - for line in _writer.dump(stencil_groups, self.known_symbols): - file.write(f"{line}\n") - try: - jit_stencils_new.replace(jit_stencils) - except FileNotFoundError: - # another process probably already moved the file - if not jit_stencils.is_file(): - raise - finally: - jit_stencils_new.unlink(missing_ok=True) + self._write_generated_header( + jit_stencils, + digest=digest, + comment=comment, + lines=_writer.dump(stencil_groups, self.known_symbols), + ) class _COFF( _Target[_schema.COFFSection, _schema.COFFRelocation] ): # pylint: disable = too-few-public-methods def _shim_compile_args(self) -> list[str]: - # The linked shim is part of pythoncore, not a shared extension. + # The shim is part of pythoncore, not a shared extension. # On Windows, Py_BUILD_CORE_MODULE makes public APIs import from # pythonXY.lib, which creates a self-dependency when linking # pythoncore.dll. Build the shim with builtin/core semantics. @@ -450,6 +508,19 @@ class _ELF( symbol_prefix = "" re_global = re.compile(r'\s*\.globl\s+(?P