From 7ba5f5527dc431e04f24cf80ddbf6979fac8f16b Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 26 Sep 2023 14:21:25 -0400 Subject: [PATCH 01/16] gh-109329: Support for basic pystats for Tier 2 --- Include/cpython/pystats.h | 18 ++++- Include/internal/pycore_code.h | 6 ++ Python/bytecodes.c | 2 +- Python/executor.c | 6 +- Python/generated_cases.c.h | 2 +- Python/optimizer.c | 2 +- Python/specialize.c | 26 +++++- Tools/scripts/summarize_stats.py | 135 ++++++++++++++++++++++++------- 8 files changed, 153 insertions(+), 44 deletions(-) diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index 150e16faa96ca1..7014c968af8229 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -86,10 +86,6 @@ typedef struct _object_stats { uint64_t type_cache_dunder_hits; uint64_t type_cache_dunder_misses; uint64_t type_cache_collisions; - uint64_t optimization_attempts; - uint64_t optimization_traces_created; - uint64_t optimization_traces_executed; - uint64_t optimization_uops_executed; /* Temporary value used during GC */ uint64_t object_visits; } ObjectStats; @@ -100,10 +96,24 @@ typedef struct _gc_stats { uint64_t objects_collected; } GCStats; +typedef struct _uop_stats { + uint64_t execution_count; + uint64_t miss; +} UOpStats; + +typedef struct _optimization_stats { + uint64_t attempts; + uint64_t traces_created; + uint64_t traces_executed; + uint64_t uops_executed; + UOpStats opcode[512]; +} OptimizationStats; + typedef struct _stats { OpcodeStats opcode_stats[256]; CallStats call_stats; ObjectStats object_stats; + OptimizationStats optimization_stats; GCStats *gc_stats; } PyStats; diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index a77fa11baf8413..e89cab16c6d9eb 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -282,6 +282,9 @@ extern int _PyStaticCode_Init(PyCodeObject *co); #define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) \ do { if (_Py_stats && PyFunction_Check(callable)) _Py_stats->call_stats.eval_calls[name]++; } while (0) #define GC_STAT_ADD(gen, name, n) do { if (_Py_stats) _Py_stats->gc_stats[(gen)].name += (n); } while (0) +#define OPTIMIZATION_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0) +#define UOP_EXE_INC(opname) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].execution_count++; } while (0) +#define UOP_STAT_INC(opname, name) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].name++; } while (0) // Export for '_opcode' shared extension PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); @@ -296,6 +299,9 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); #define EVAL_CALL_STAT_INC(name) ((void)0) #define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) ((void)0) #define GC_STAT_ADD(gen, name, n) ((void)0) +#define OPTIMIZATION_STAT_INC(name) ((void)0) +#define UOP_EXE_INC(opname) ((void)0) +#define UOP_STAT_INC(opname, name) ((void)0) #endif // !Py_STATS // Utility functions for reading/writing 32/64-bit values in the inline caches. 
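For orientation before the remaining hunks of this patch: the UOpStats opcode[512] table added above gives every instruction its own execution counter, with slots 0-255 used for Tier 1 opcodes and slots 256 and up used for uop opcodes, which is why print_optimization_stats() switches name tables at index 256. Below is a minimal Python sketch of that counting scheme, not the C implementation; the two name tables are made-up placeholders standing in for _PyOpcode_OpName and _PyOpcode_uop_name.

    import sys

    TIER1_NAMES = {0: "CACHE", 97: "JUMP_BACKWARD"}    # placeholder for _PyOpcode_OpName
    UOP_NAMES = {300: "_SET_IP", 301: "_EXIT_TRACE"}   # placeholder for _PyOpcode_uop_name

    execution_count = [0] * 512                        # one slot per opcode, like UOpStats.execution_count

    def uop_exe_inc(opcode):
        # Rough equivalent of the UOP_EXE_INC(opname) macro when _Py_stats is set.
        execution_count[opcode] += 1

    def dump(out):
        # Rough equivalent of the per-opcode loop in print_optimization_stats():
        # pick the Tier 1 name table below index 256, the uop name table above it.
        for i, count in enumerate(execution_count):
            if not count:
                continue
            names = TIER1_NAMES if i < 256 else UOP_NAMES
            print(f"uops[{names.get(i, i)}].execution_count : {count}", file=out)

    for op in (300, 300, 301):
        uop_exe_inc(op)
    dump(sys.stdout)   # uops[_SET_IP].execution_count : 2, uops[_EXIT_TRACE].execution_count : 1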
diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 402b27101dbdb6..e813b65f5c8e0e 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2244,7 +2244,7 @@ dummy_func( // Double-check that the opcode isn't instrumented or something: here->op.code == JUMP_BACKWARD) { - OBJECT_STAT_INC(optimization_attempts); + OPTIMIZATION_STAT_INC(attempts); int optimized = _PyOptimizer_BackEdge(frame, here, next_instr, stack_pointer); ERROR_IF(optimized < 0, error); if (optimized) { diff --git a/Python/executor.c b/Python/executor.c index ac9104223da8ff..cb9f00aae72975 100644 --- a/Python/executor.c +++ b/Python/executor.c @@ -62,7 +62,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject CHECK_EVAL_BREAKER(); - OBJECT_STAT_INC(optimization_traces_executed); + OPTIMIZATION_STAT_INC(traces_executed); _Py_CODEUNIT *ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; int pc = 0; int opcode; @@ -81,7 +81,9 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject operand, (int)(stack_pointer - _PyFrame_Stackbase(frame))); pc++; - OBJECT_STAT_INC(optimization_uops_executed); + OPTIMIZATION_STAT_INC(uops_executed); + assert(opcode < 512); + UOP_EXE_INC(opcode); switch (opcode) { #include "executor_cases.c.h" diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index ebb87a86de432e..216640c23891fd 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -2947,7 +2947,7 @@ // Double-check that the opcode isn't instrumented or something: here->op.code == JUMP_BACKWARD) { - OBJECT_STAT_INC(optimization_attempts); + OPTIMIZATION_STAT_INC(attempts); int optimized = _PyOptimizer_BackEdge(frame, here, next_instr, stack_pointer); if (optimized < 0) goto error; if (optimized) { diff --git a/Python/optimizer.c b/Python/optimizer.c index fbdbf7291784c4..fc1e7a8cad98fd 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -891,7 +891,7 @@ uop_optimize( // Error or nothing translated return trace_length; } - OBJECT_STAT_INC(optimization_traces_created); + OPTIMIZATION_STAT_INC(traces_created); char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE"); if (uop_optimize != NULL && *uop_optimize > '0') { trace_length = _Py_uop_analyze_and_optimize(code, trace, trace_length, curr_stackentries); diff --git a/Python/specialize.c b/Python/specialize.c index d9b748cad78f4f..71b10b7c3d2eeb 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -199,10 +199,6 @@ print_object_stats(FILE *out, ObjectStats *stats) fprintf(out, "Object method cache collisions: %" PRIu64 "\n", stats->type_cache_collisions); fprintf(out, "Object method cache dunder hits: %" PRIu64 "\n", stats->type_cache_dunder_hits); fprintf(out, "Object method cache dunder misses: %" PRIu64 "\n", stats->type_cache_dunder_misses); - fprintf(out, "Optimization attempts: %" PRIu64 "\n", stats->optimization_attempts); - fprintf(out, "Optimization traces created: %" PRIu64 "\n", stats->optimization_traces_created); - fprintf(out, "Optimization traces executed: %" PRIu64 "\n", stats->optimization_traces_executed); - fprintf(out, "Optimization uops executed: %" PRIu64 "\n", stats->optimization_uops_executed); } static void @@ -215,6 +211,27 @@ print_gc_stats(FILE *out, GCStats *stats) } } +static void +print_optimization_stats(FILE *out, OptimizationStats *stats) +{ + fprintf(out, "Optimization attempts: %" PRIu64 "\n", stats->attempts); + fprintf(out, "Optimization traces created: %" PRIu64 "\n", stats->traces_created); + fprintf(out, "Optimization 
traces executed: %" PRIu64 "\n", stats->traces_executed); + fprintf(out, "Optimization uops executed: %" PRIu64 "\n", stats->uops_executed); + + char** names; + for (int i = 0; i < 512; i++) { + if (i < 256) { + names = _PyOpcode_OpName; + } else { + names = _PyOpcode_uop_name; + } + if (stats->opcode[i].execution_count) { + fprintf(out, "uops[%s].execution_count : %" PRIu64 "\n", names[i], stats->opcode[i].execution_count); + } + } +} + static void print_stats(FILE *out, PyStats *stats) { @@ -222,6 +239,7 @@ print_stats(FILE *out, PyStats *stats) print_call_stats(out, &stats->call_stats); print_object_stats(out, &stats->object_stats); print_gc_stats(out, stats->gc_stats); + print_optimization_stats(out, &stats->optimization_stats); } void diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index 3b2bdd8015be4a..8b8dce950ed852 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -211,12 +211,12 @@ def gather_stats(input): else: raise ValueError(f"{input:r} is not a file or directory path") -def extract_opcode_stats(stats): +def extract_opcode_stats(stats, prefix): opcode_stats = collections.defaultdict(dict) for key, value in stats.items(): - if not key.startswith("opcode"): + if not key.startswith(prefix): continue - name, _, rest = key[7:].partition("]") + name, _, rest = key[len(prefix) + 2:].partition("]") opcode_stats[name][rest.strip(".")] = value return opcode_stats @@ -350,35 +350,38 @@ def emit_execution_counts(opcode_stats, total): rows ) +def _emit_comparative_execution_counts(base_rows, head_rows): + base_data = dict((x[0], x[1:]) for x in base_rows) + head_data = dict((x[0], x[1:]) for x in head_rows) + opcodes = set(base_data.keys()) | set(head_data.keys()) + + rows = [] + default = [0, "0.0%", "0.0%", 0] + for opcode in opcodes: + base_entry = base_data.get(opcode, default) + head_entry = head_data.get(opcode, default) + if base_entry[0] == 0: + change = 1 + else: + change = (head_entry[0] - base_entry[0]) / base_entry[0] + rows.append( + (opcode, base_entry[0], head_entry[0], + f"{100*change:0.1f}%")) + + rows.sort(key=lambda x: -abs(percentage_to_float(x[-1]))) + + emit_table( + ("Name", "Base Count:", "Head Count:", "Change:"), + rows + ) + def emit_comparative_execution_counts( - base_opcode_stats, base_total, head_opcode_stats, head_total + base_opcode_stats, base_total, head_opcode_stats, head_total, level=2 ): - with Section("Execution counts", summary="execution counts for all instructions"): + with Section("Execution counts", summary="execution counts for all instructions", level=level): base_rows = calculate_execution_counts(base_opcode_stats, base_total) head_rows = calculate_execution_counts(head_opcode_stats, head_total) - base_data = dict((x[0], x[1:]) for x in base_rows) - head_data = dict((x[0], x[1:]) for x in head_rows) - opcodes = set(base_data.keys()) | set(head_data.keys()) - - rows = [] - default = [0, "0.0%", "0.0%", 0] - for opcode in opcodes: - base_entry = base_data.get(opcode, default) - head_entry = head_data.get(opcode, default) - if base_entry[0] == 0: - change = 1 - else: - change = (head_entry[0] - base_entry[0]) / base_entry[0] - rows.append( - (opcode, base_entry[0], head_entry[0], - f"{100*change:0.1f}%")) - - rows.sort(key=lambda x: -abs(percentage_to_float(x[-1]))) - - emit_table( - ("Name", "Base Count:", "Head Count:", "Change:"), - rows - ) + _emit_comparative_execution_counts(base_rows, head_rows) def get_defines(): spec_path = os.path.join(os.path.dirname(__file__), 
"../../Python/specialize.c") @@ -611,8 +614,76 @@ def emit_pair_counts(opcode_stats, total): succ_rows ) + +def calculate_optimization_stats(stats): + attempts = stats["Optimization attempts"] + created = stats["Optimization traces created"] + executed = stats["Optimization traces executed"] + uops = stats["Optimization uops executed"] + + return [ + ("Optimization attempts", attempts, ""), + ( + "Traces created", created, + format_ratio(created, attempts) + ), + ("Traces executed", executed, ""), + ("Uops executed", uops, format_ratio(uops, executed)) + ] + + +def calculate_uop_execution_counts(opcode_stats): + total = 0 + counts = [] + for name, opcode_stat in opcode_stats.items(): + if "execution_count" in opcode_stat: + count = opcode_stat['execution_count'] + counts.append((count, name)) + total += count + counts.sort(reverse=True) + cumulative = 0 + rows = [] + for (count, name) in counts: + cumulative += count + rows.append((name, count, format_ratio(count, total), + format_ratio(cumulative, total))) + return rows + + +def emit_optimization_stats(stats): + uop_stats = extract_opcode_stats(stats, "uop") + + with Section("Optimization (Tier 2) stats", summary="statistics about the Tier 2 optimizer"): + with Section("Overall stats", level=3): + rows = calculate_optimization_stats(stats) + emit_table(("", "Count:", "Ratio:"), rows) + + with Section("Uop stats", level=3): + rows = calculate_uop_execution_counts(uop_stats) + emit_table( + ("Uop", "Count:", "Self:", "Cumulative:"), + rows + ) + + +def emit_comparative_optimization_stats(base_stats, head_stats): + base_uop_stats = extract_opcode_stats(base_stats, "uop") + head_uop_stats = extract_opcode_stats(head_stats, "uop") + + with Section("Optimization (Tier 2) stats", summary="statistics about the Tier 2 optimizer"): + with Section("Overall stats", level=3): + base_rows = calculate_optimization_stats(base_stats) + head_rows = calculate_optimization_stats(head_stats) + emit_table(("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), join_rows(base_rows, head_rows)) + + with Section("Uop stats", level=3): + base_rows = calculate_uop_execution_counts(base_uop_stats) + head_rows = calculate_uop_execution_counts(head_uop_stats) + _emit_comparative_execution_counts(base_rows, head_rows) + + def output_single_stats(stats): - opcode_stats = extract_opcode_stats(stats) + opcode_stats = extract_opcode_stats(stats, "opcode") total = get_total(opcode_stats) emit_execution_counts(opcode_stats, total) emit_pair_counts(opcode_stats, total) @@ -621,15 +692,16 @@ def output_single_stats(stats): emit_call_stats(stats, stats["_stats_defines"]) emit_object_stats(stats) emit_gc_stats(stats) + emit_optimization_stats(stats) with Section("Meta stats", summary="Meta statistics"): emit_table(("", "Count:"), [('Number of data files', stats['__nfiles__'])]) def output_comparative_stats(base_stats, head_stats): - base_opcode_stats = extract_opcode_stats(base_stats) + base_opcode_stats = extract_opcode_stats(base_stats, "opcode") base_total = get_total(base_opcode_stats) - head_opcode_stats = extract_opcode_stats(head_stats) + head_opcode_stats = extract_opcode_stats(head_stats, "opcode") head_total = get_total(head_opcode_stats) emit_comparative_execution_counts( @@ -645,6 +717,7 @@ def output_comparative_stats(base_stats, head_stats): emit_comparative_call_stats(base_stats, head_stats, head_stats["_stats_defines"]) emit_comparative_object_stats(base_stats, head_stats) emit_comparative_gc_stats(base_stats, head_stats) + 
emit_comparative_optimization_stats(base_stats, head_stats) def output_stats(inputs, json_output=None): if len(inputs) == 1: From b84f496e3b2b8616eb696f51cfc78e54412ea5d5 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 26 Sep 2023 14:37:19 -0400 Subject: [PATCH 02/16] Remove unnecessary stuff --- Include/cpython/pystats.h | 1 - Include/internal/pycore_code.h | 2 -- 2 files changed, 3 deletions(-) diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index 7014c968af8229..78004e64bb1f05 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -98,7 +98,6 @@ typedef struct _gc_stats { typedef struct _uop_stats { uint64_t execution_count; - uint64_t miss; } UOpStats; typedef struct _optimization_stats { diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index e89cab16c6d9eb..3c49eb4ae35a77 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -284,7 +284,6 @@ extern int _PyStaticCode_Init(PyCodeObject *co); #define GC_STAT_ADD(gen, name, n) do { if (_Py_stats) _Py_stats->gc_stats[(gen)].name += (n); } while (0) #define OPTIMIZATION_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0) #define UOP_EXE_INC(opname) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].execution_count++; } while (0) -#define UOP_STAT_INC(opname, name) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].name++; } while (0) // Export for '_opcode' shared extension PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); @@ -301,7 +300,6 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); #define GC_STAT_ADD(gen, name, n) ((void)0) #define OPTIMIZATION_STAT_INC(name) ((void)0) #define UOP_EXE_INC(opname) ((void)0) -#define UOP_STAT_INC(opname, name) ((void)0) #endif // !Py_STATS // Utility functions for reading/writing 32/64-bit values in the inline caches. 
From 5549410ca6d233e2d88c6fcaa6e66e30c5a94bf3 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 26 Sep 2023 14:39:27 -0400 Subject: [PATCH 03/16] Delete trailing whitespace --- Python/specialize.c | 2 +- Tools/scripts/summarize_stats.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Python/specialize.c b/Python/specialize.c index 71b10b7c3d2eeb..106737bda1c43e 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -218,7 +218,7 @@ print_optimization_stats(FILE *out, OptimizationStats *stats) fprintf(out, "Optimization traces created: %" PRIu64 "\n", stats->traces_created); fprintf(out, "Optimization traces executed: %" PRIu64 "\n", stats->traces_executed); fprintf(out, "Optimization uops executed: %" PRIu64 "\n", stats->uops_executed); - + char** names; for (int i = 0; i < 512; i++) { if (i < 256) { diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index 8b8dce950ed852..64a161008b4d31 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -624,7 +624,7 @@ def calculate_optimization_stats(stats): return [ ("Optimization attempts", attempts, ""), ( - "Traces created", created, + "Traces created", created, format_ratio(created, attempts) ), ("Traces executed", executed, ""), @@ -661,7 +661,7 @@ def emit_optimization_stats(stats): with Section("Uop stats", level=3): rows = calculate_uop_execution_counts(uop_stats) emit_table( - ("Uop", "Count:", "Self:", "Cumulative:"), + ("Uop", "Count:", "Self:", "Cumulative:"), rows ) From ed639fc4534ce8b592809043ecaa6b43f1a44bd0 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 26 Sep 2023 15:11:01 -0400 Subject: [PATCH 04/16] Don't print optimization stats if not in dataset --- Tools/scripts/summarize_stats.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index 64a161008b4d31..b2d7940476b597 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -651,6 +651,9 @@ def calculate_uop_execution_counts(opcode_stats): def emit_optimization_stats(stats): + if "Optimization attempts" not in stats: + return + uop_stats = extract_opcode_stats(stats, "uop") with Section("Optimization (Tier 2) stats", summary="statistics about the Tier 2 optimizer"): @@ -667,6 +670,9 @@ def emit_optimization_stats(stats): def emit_comparative_optimization_stats(base_stats, head_stats): + if not any("Optimization attempts" in stats for stats in (base_stats, head_stats)): + return + base_uop_stats = extract_opcode_stats(base_stats, "uop") head_uop_stats = extract_opcode_stats(head_stats, "uop") From 195664e5a76ce820b7402437c5829191bfbd5252 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 26 Sep 2023 16:59:04 -0400 Subject: [PATCH 05/16] Fix logic --- Tools/scripts/summarize_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index b2d7940476b597..18adaa1a05c6eb 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -670,7 +670,7 @@ def emit_optimization_stats(stats): def emit_comparative_optimization_stats(base_stats, head_stats): - if not any("Optimization attempts" in stats for stats in (base_stats, head_stats)): + if not all("Optimization attempts" in stats for stats in (base_stats, head_stats)): return base_uop_stats = extract_opcode_stats(base_stats, "uop") From 8d76c0ac7d7c8a2e0c8dd6f130c60a15691c2a55 Mon Sep 17 00:00:00 
2001 From: Michael Droettboom Date: Wed, 27 Sep 2023 10:12:53 -0400 Subject: [PATCH 06/16] More simple Tier 2 counters --- Include/cpython/pystats.h | 6 ++++++ Include/internal/pycore_code.h | 4 ++++ Python/optimizer.c | 9 ++++++++- Python/specialize.c | 11 +++++++++++ Tools/scripts/summarize_stats.py | 21 ++++++++++++++++++++- 5 files changed, 49 insertions(+), 2 deletions(-) diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index 78004e64bb1f05..08cf488d7b2d01 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -105,7 +105,13 @@ typedef struct _optimization_stats { uint64_t traces_created; uint64_t traces_executed; uint64_t uops_executed; + uint64_t trace_stack_overflow; + uint64_t trace_stack_underflow; + uint64_t trace_too_long; + uint64_t inner_loop; + uint64_t recursive_call; UOpStats opcode[512]; + uint64_t unsupported_opcode[256]; } OptimizationStats; typedef struct _stats { diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 3c49eb4ae35a77..8b11de0f2dce02 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -284,6 +284,8 @@ extern int _PyStaticCode_Init(PyCodeObject *co); #define GC_STAT_ADD(gen, name, n) do { if (_Py_stats) _Py_stats->gc_stats[(gen)].name += (n); } while (0) #define OPTIMIZATION_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0) #define UOP_EXE_INC(opname) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].execution_count++; } while (0) +#define OPTIMIZE_UNSUPPORTED_OPCODE(opname) do { if (_Py_stats) _Py_stats->optimization_stats.unsupported_opcode[opname]++; } while (0) +#define OPTIMIZE_TRACE_STACK_OVERFLOW() do { if (_Py_stats) _Py_stats->optimization_stats.trace_stack_overflow++; } while (0) // Export for '_opcode' shared extension PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); @@ -300,6 +302,8 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); #define GC_STAT_ADD(gen, name, n) ((void)0) #define OPTIMIZATION_STAT_INC(name) ((void)0) #define UOP_EXE_INC(opname) ((void)0) +#define OPTIMIZE_UNSUPPOTED_OPCODE(opname) ((void)0) +#define OPTIMIZE_TRACE_STACK_OVERFLOW() ((void)0) #endif // !Py_STATS // Utility functions for reading/writing 32/64-bit values in the inline caches. diff --git a/Python/optimizer.c b/Python/optimizer.c index fc1e7a8cad98fd..7fc20af7221c2b 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -461,6 +461,7 @@ translate_bytecode_to_trace( if (trace_length + (n) > max_length) { \ DPRINTF(2, "No room for %s (need %d, got %d)\n", \ (opname), (n), max_length - trace_length); \ + OPTIMIZATION_STAT_INC(trace_too_long); \ goto done; \ } \ reserved = (n); // Keep ADD_TO_TRACE / ADD_TO_STUB honest @@ -472,6 +473,7 @@ translate_bytecode_to_trace( #define TRACE_STACK_PUSH() \ if (trace_stack_depth >= TRACE_STACK_SIZE) { \ DPRINTF(2, "Trace stack overflow\n"); \ + OPTIMIZATION_STAT_INC(trace_stack_overflow); \ ADD_TO_TRACE(_SET_IP, 0, 0); \ goto done; \ } \ @@ -572,6 +574,7 @@ translate_bytecode_to_trace( ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0); } else { + OPTIMIZATION_STAT_INC(inner_loop); DPRINTF(2, "JUMP_BACKWARD not to top ends trace\n"); } goto done; @@ -638,7 +641,9 @@ translate_bytecode_to_trace( // LOAD_CONST + _POP_FRAME. 
if (trace_stack_depth == 0) { DPRINTF(2, "Trace stack underflow\n"); - goto done;} + OPTIMIZATION_STAT_INC(trace_stack_underflow); + goto done; + } } uint32_t orig_oparg = oparg; // For OPARG_TOP/BOTTOM for (int i = 0; i < nuops; i++) { @@ -713,6 +718,7 @@ translate_bytecode_to_trace( PyUnicode_AsUTF8(new_code->co_qualname), PyUnicode_AsUTF8(new_code->co_filename), new_code->co_firstlineno); + OPTIMIZATION_STAT_INC(recursive_call); ADD_TO_TRACE(_SET_IP, 0, 0); goto done; } @@ -744,6 +750,7 @@ translate_bytecode_to_trace( break; } DPRINTF(2, "Unsupported opcode %s\n", uop_name(opcode)); + OPTIMIZE_UNSUPPORTED_OPCODE(opcode); goto done; // Break out of loop } // End default diff --git a/Python/specialize.c b/Python/specialize.c index 106737bda1c43e..a2b4b50ce8e797 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -218,6 +218,11 @@ print_optimization_stats(FILE *out, OptimizationStats *stats) fprintf(out, "Optimization traces created: %" PRIu64 "\n", stats->traces_created); fprintf(out, "Optimization traces executed: %" PRIu64 "\n", stats->traces_executed); fprintf(out, "Optimization uops executed: %" PRIu64 "\n", stats->uops_executed); + fprintf(out, "Optimization trace stack overflow: %" PRIu64 "\n", stats->trace_stack_overflow); + fprintf(out, "Optimization trace stack underflow: %" PRIu64 "\n", stats->trace_stack_underflow); + fprintf(out, "Optimization trace too long: %" PRIu64 "\n", stats->trace_too_long); + fprintf(out, "Optimization inner loop: %" PRIu64 "\n", stats->inner_loop); + fprintf(out, "Optimization recursive call: %" PRIu64 "\n", stats->recursive_call); char** names; for (int i = 0; i < 512; i++) { @@ -230,6 +235,12 @@ print_optimization_stats(FILE *out, OptimizationStats *stats) fprintf(out, "uops[%s].execution_count : %" PRIu64 "\n", names[i], stats->opcode[i].execution_count); } } + + for (int i = 0; i < 256; i++) { + if (stats->unsupported_opcode[i]) { + fprintf(out, "unsupported_opcode[%s].count : %" PRIu64 "\n", _PyOpcode_OpName[i]); + } + } } static void diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index 18adaa1a05c6eb..c41d3bd085bacf 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -620,6 +620,11 @@ def calculate_optimization_stats(stats): created = stats["Optimization traces created"] executed = stats["Optimization traces executed"] uops = stats["Optimization uops executed"] + trace_stack_overflow = stats["Optimization trace stack overflow"] + trace_stack_underflow = stats["Optimization trace stack underflow"] + trace_too_long = stats["Optimization trace too long"] + inner_loop = stats["Optimization inner loop"] + recursive_call = stats["Optimization recursive call"] return [ ("Optimization attempts", attempts, ""), @@ -628,7 +633,12 @@ def calculate_optimization_stats(stats): format_ratio(created, attempts) ), ("Traces executed", executed, ""), - ("Uops executed", uops, format_ratio(uops, executed)) + ("Uops executed", uops, int(uops / executed)), + ("Trace stack overflow", trace_stack_overflow, ""), + ("Trace stack underflow", trace_stack_underflow, ""), + ("Trace too long", trace_too_long, ""), + ("Inner loop found", inner_loop, ""), + ("Recursive call", recursive_call, "") ] @@ -668,6 +678,15 @@ def emit_optimization_stats(stats): rows ) + with Section("Unsupported opcodes", level=3): + unsupported_opcodes = extract_opcode_stats(stats, "unsupported_opcode") + data = [] + for opcode, entry in unsupported_opcodes.items(): + data.append((entry["count"], opcode)) + 
data.sort(reverse=True) + rows = [(x[1], x[0]) for x in data] + emit_table(("Opcode", "Count"), rows) + def emit_comparative_optimization_stats(base_stats, head_stats): if not all("Optimization attempts" in stats for stats in (base_stats, head_stats)): From c54707a611401c83839d490b73a6a304d19f31c8 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 27 Sep 2023 10:24:45 -0400 Subject: [PATCH 07/16] Renames and fixups --- Include/internal/pycore_code.h | 10 ++++------ Python/executor.c | 4 ++-- Python/generated_cases.c.h | 2 +- Python/optimizer.c | 12 ++++++------ Python/specialize.c | 2 +- 5 files changed, 14 insertions(+), 16 deletions(-) diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 8b11de0f2dce02..5a193251da762d 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -282,10 +282,9 @@ extern int _PyStaticCode_Init(PyCodeObject *co); #define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) \ do { if (_Py_stats && PyFunction_Check(callable)) _Py_stats->call_stats.eval_calls[name]++; } while (0) #define GC_STAT_ADD(gen, name, n) do { if (_Py_stats) _Py_stats->gc_stats[(gen)].name += (n); } while (0) -#define OPTIMIZATION_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0) +#define OPT_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0) #define UOP_EXE_INC(opname) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].execution_count++; } while (0) -#define OPTIMIZE_UNSUPPORTED_OPCODE(opname) do { if (_Py_stats) _Py_stats->optimization_stats.unsupported_opcode[opname]++; } while (0) -#define OPTIMIZE_TRACE_STACK_OVERFLOW() do { if (_Py_stats) _Py_stats->optimization_stats.trace_stack_overflow++; } while (0) +#define OPT_UNSUPPORTED_OPCODE(opname) do { if (_Py_stats) _Py_stats->optimization_stats.unsupported_opcode[opname]++; } while (0) // Export for '_opcode' shared extension PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); @@ -300,10 +299,9 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); #define EVAL_CALL_STAT_INC(name) ((void)0) #define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) ((void)0) #define GC_STAT_ADD(gen, name, n) ((void)0) -#define OPTIMIZATION_STAT_INC(name) ((void)0) +#define OPT_STAT_INC(name) ((void)0) #define UOP_EXE_INC(opname) ((void)0) -#define OPTIMIZE_UNSUPPOTED_OPCODE(opname) ((void)0) -#define OPTIMIZE_TRACE_STACK_OVERFLOW() ((void)0) +#define OPT_UNSUPPORTED_OPCODE(opname) ((void)0) #endif // !Py_STATS // Utility functions for reading/writing 32/64-bit values in the inline caches. 
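In the "Overall stats" table produced by calculate_optimization_stats() above, the third column mixes two kinds of values: "Traces created" is shown as a percentage of optimization attempts, while "Uops executed" is shown as the average number of uops per executed trace. A small Python sketch with made-up numbers, showing how those rows are derived:

    stats = {                                   # made-up counter values
        "Optimization attempts": 1000,
        "Optimization traces created": 640,
        "Optimization traces executed": 5000,
        "Optimization uops executed": 90000,
        "Optimization trace stack overflow": 3,
    }

    def format_ratio(num, den):
        # Same convention as format_ratio() in summarize_stats.py: empty string when den == 0.
        return f"{num / den:.01%}" if den else ""

    rows = [
        ("Optimization attempts", stats["Optimization attempts"], ""),
        ("Traces created", stats["Optimization traces created"],
         format_ratio(stats["Optimization traces created"], stats["Optimization attempts"])),
        ("Traces executed", stats["Optimization traces executed"], ""),
        ("Uops executed", stats["Optimization uops executed"],
         stats["Optimization uops executed"] // stats["Optimization traces executed"]),
        ("Trace stack overflow", stats["Optimization trace stack overflow"], ""),
    ]
    for row in rows:
        print(row)    # e.g. ('Traces created', 640, '64.0%') and ('Uops executed', 90000, 18)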
diff --git a/Python/executor.c b/Python/executor.c index cb9f00aae72975..572faa7f41b40d 100644 --- a/Python/executor.c +++ b/Python/executor.c @@ -62,7 +62,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject CHECK_EVAL_BREAKER(); - OPTIMIZATION_STAT_INC(traces_executed); + OPT_STAT_INC(traces_executed); _Py_CODEUNIT *ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; int pc = 0; int opcode; @@ -81,7 +81,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject operand, (int)(stack_pointer - _PyFrame_Stackbase(frame))); pc++; - OPTIMIZATION_STAT_INC(uops_executed); + OPT_STAT_INC(uops_executed); assert(opcode < 512); UOP_EXE_INC(opcode); switch (opcode) { diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 216640c23891fd..8e4feebc9290da 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -2947,7 +2947,7 @@ // Double-check that the opcode isn't instrumented or something: here->op.code == JUMP_BACKWARD) { - OPTIMIZATION_STAT_INC(attempts); + OPT_STAT_INC(attempts); int optimized = _PyOptimizer_BackEdge(frame, here, next_instr, stack_pointer); if (optimized < 0) goto error; if (optimized) { diff --git a/Python/optimizer.c b/Python/optimizer.c index 7fc20af7221c2b..d270286f2cac82 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -461,7 +461,7 @@ translate_bytecode_to_trace( if (trace_length + (n) > max_length) { \ DPRINTF(2, "No room for %s (need %d, got %d)\n", \ (opname), (n), max_length - trace_length); \ - OPTIMIZATION_STAT_INC(trace_too_long); \ + OPT_STAT_INC(trace_too_long); \ goto done; \ } \ reserved = (n); // Keep ADD_TO_TRACE / ADD_TO_STUB honest @@ -473,7 +473,7 @@ translate_bytecode_to_trace( #define TRACE_STACK_PUSH() \ if (trace_stack_depth >= TRACE_STACK_SIZE) { \ DPRINTF(2, "Trace stack overflow\n"); \ - OPTIMIZATION_STAT_INC(trace_stack_overflow); \ + OPT_STAT_INC(trace_stack_overflow); \ ADD_TO_TRACE(_SET_IP, 0, 0); \ goto done; \ } \ @@ -574,7 +574,7 @@ translate_bytecode_to_trace( ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0); } else { - OPTIMIZATION_STAT_INC(inner_loop); + OPT_STAT_INC(inner_loop); DPRINTF(2, "JUMP_BACKWARD not to top ends trace\n"); } goto done; @@ -641,7 +641,7 @@ translate_bytecode_to_trace( // LOAD_CONST + _POP_FRAME. 
if (trace_stack_depth == 0) { DPRINTF(2, "Trace stack underflow\n"); - OPTIMIZATION_STAT_INC(trace_stack_underflow); + OPT_STAT_INC(trace_stack_underflow); goto done; } } @@ -718,7 +718,7 @@ translate_bytecode_to_trace( PyUnicode_AsUTF8(new_code->co_qualname), PyUnicode_AsUTF8(new_code->co_filename), new_code->co_firstlineno); - OPTIMIZATION_STAT_INC(recursive_call); + OPT_STAT_INC(recursive_call); ADD_TO_TRACE(_SET_IP, 0, 0); goto done; } @@ -898,7 +898,7 @@ uop_optimize( // Error or nothing translated return trace_length; } - OPTIMIZATION_STAT_INC(traces_created); + OPT_STAT_INC(traces_created); char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE"); if (uop_optimize != NULL && *uop_optimize > '0') { trace_length = _Py_uop_analyze_and_optimize(code, trace, trace_length, curr_stackentries); diff --git a/Python/specialize.c b/Python/specialize.c index a2b4b50ce8e797..e326af2389cd1f 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -224,7 +224,7 @@ print_optimization_stats(FILE *out, OptimizationStats *stats) fprintf(out, "Optimization inner loop: %" PRIu64 "\n", stats->inner_loop); fprintf(out, "Optimization recursive call: %" PRIu64 "\n", stats->recursive_call); - char** names; + char* const* names; for (int i = 0; i < 512; i++) { if (i < 256) { names = _PyOpcode_OpName; From ba58546a9430898aba1045228deef1bc9973d186 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 27 Sep 2023 16:45:59 -0400 Subject: [PATCH 08/16] Lots more stats --- Include/cpython/pystats.h | 5 ++++ Include/internal/pycore_code.h | 2 ++ Python/bytecodes.c | 3 ++- Python/executor.c | 4 ++- Python/executor_cases.c.h | 1 + Python/optimizer.c | 4 ++- Python/specialize.c | 21 +++++++++++++-- Tools/scripts/summarize_stats.py | 46 +++++++++++++++++++++----------- 8 files changed, 65 insertions(+), 21 deletions(-) diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index 08cf488d7b2d01..f1fe6e5f4715f8 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -100,6 +100,8 @@ typedef struct _uop_stats { uint64_t execution_count; } UOpStats; +#define _Py_UOP_HIST_SIZE 5 + typedef struct _optimization_stats { uint64_t attempts; uint64_t traces_created; @@ -112,6 +114,9 @@ typedef struct _optimization_stats { uint64_t recursive_call; UOpStats opcode[512]; uint64_t unsupported_opcode[256]; + uint64_t trace_length_hist[_Py_UOP_HIST_SIZE]; + uint64_t trace_run_length_hist[_Py_UOP_HIST_SIZE]; + uint64_t optimized_trace_length_hist[_Py_UOP_HIST_SIZE]; } OptimizationStats; typedef struct _stats { diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 5a193251da762d..433ca49e424c1c 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -285,6 +285,7 @@ extern int _PyStaticCode_Init(PyCodeObject *co); #define OPT_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0) #define UOP_EXE_INC(opname) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].execution_count++; } while (0) #define OPT_UNSUPPORTED_OPCODE(opname) do { if (_Py_stats) _Py_stats->optimization_stats.unsupported_opcode[opname]++; } while (0) +#define OPT_HIST(length, name) do { if (_Py_stats) _Py_stats->optimization_stats.name[_Py_bit_length((length - 1) >> 2)]++; } while (0) // Export for '_opcode' shared extension PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); @@ -302,6 +303,7 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); #define OPT_STAT_INC(name) ((void)0) #define UOP_EXE_INC(opname) ((void)0) #define 
OPT_UNSUPPORTED_OPCODE(opname) ((void)0) +#define OPT_HIST(length, name) ((void)0) #endif // !Py_STATS // Utility functions for reading/writing 32/64-bit values in the inline caches. diff --git a/Python/bytecodes.c b/Python/bytecodes.c index e813b65f5c8e0e..059264163d0a15 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2244,7 +2244,7 @@ dummy_func( // Double-check that the opcode isn't instrumented or something: here->op.code == JUMP_BACKWARD) { - OPTIMIZATION_STAT_INC(attempts); + OPT_STAT_INC(attempts); int optimized = _PyOptimizer_BackEdge(frame, here, next_instr, stack_pointer); ERROR_IF(optimized < 0, error); if (optimized) { @@ -3913,6 +3913,7 @@ dummy_func( frame->prev_instr--; // Back up to just before destination _PyFrame_SetStackPointer(frame, stack_pointer); Py_DECREF(self); + OPT_HIST(pc, trace_run_length_hist); return frame; } diff --git a/Python/executor.c b/Python/executor.c index 572faa7f41b40d..48e9f2b42df8ff 100644 --- a/Python/executor.c +++ b/Python/executor.c @@ -2,6 +2,7 @@ #include "opcode.h" +#include "pycore_bitutils.h" #include "pycore_call.h" #include "pycore_ceval.h" #include "pycore_dict.h" @@ -82,7 +83,6 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject (int)(stack_pointer - _PyFrame_Stackbase(frame))); pc++; OPT_STAT_INC(uops_executed); - assert(opcode < 512); UOP_EXE_INC(opcode); switch (opcode) { @@ -116,6 +116,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject // On ERROR_IF we return NULL as the frame. // The caller recovers the frame from tstate->current_frame. DPRINTF(2, "Error: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand); + OPT_HIST(pc, trace_run_length_hist); _PyFrame_SetStackPointer(frame, stack_pointer); Py_DECREF(self); return NULL; @@ -124,6 +125,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject // On DEOPT_IF we just repeat the last instruction. // This presumes nothing was popped from the stack (nor pushed). 
DPRINTF(2, "DEOPT: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand); + OPT_HIST(pc, trace_run_length_hist); frame->prev_instr--; // Back up to just before destination _PyFrame_SetStackPointer(frame, stack_pointer); Py_DECREF(self); diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index befb972f1e90f5..cee0a723fe8855 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -3008,6 +3008,7 @@ frame->prev_instr--; // Back up to just before destination _PyFrame_SetStackPointer(frame, stack_pointer); Py_DECREF(self); + OPT_HIST(pc, trace_run_length_hist); return frame; break; } diff --git a/Python/optimizer.c b/Python/optimizer.c index d270286f2cac82..f8796eb24073d2 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -750,7 +750,7 @@ translate_bytecode_to_trace( break; } DPRINTF(2, "Unsupported opcode %s\n", uop_name(opcode)); - OPTIMIZE_UNSUPPORTED_OPCODE(opcode); + OPT_UNSUPPORTED_OPCODE(opcode); goto done; // Break out of loop } // End default @@ -898,6 +898,7 @@ uop_optimize( // Error or nothing translated return trace_length; } + OPT_HIST(trace_length, trace_length_hist); OPT_STAT_INC(traces_created); char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE"); if (uop_optimize != NULL && *uop_optimize > '0') { @@ -908,6 +909,7 @@ uop_optimize( if (executor == NULL) { return -1; } + OPT_HIST(trace_length, optimized_trace_length_hist); executor->base.execute = _PyUopExecute; memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction)); *exec_ptr = (_PyExecutorObject *)executor; diff --git a/Python/specialize.c b/Python/specialize.c index e326af2389cd1f..3a697f420d0f37 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -211,6 +211,14 @@ print_gc_stats(FILE *out, GCStats *stats) } } +static void +print_histogram(FILE *out, const char *name, uint64_t hist[_Py_UOP_HIST_SIZE]) +{ + for (int i = 0; i < _Py_UOP_HIST_SIZE; i++) { + fprintf(out, "%s[%d]: %" PRIu64 "\n", name, (1 << (i + 2)), hist[i]); + } +} + static void print_optimization_stats(FILE *out, OptimizationStats *stats) { @@ -224,7 +232,11 @@ print_optimization_stats(FILE *out, OptimizationStats *stats) fprintf(out, "Optimization inner loop: %" PRIu64 "\n", stats->inner_loop); fprintf(out, "Optimization recursive call: %" PRIu64 "\n", stats->recursive_call); - char* const* names; + print_histogram(out, "Trace length", stats->trace_length_hist); + print_histogram(out, "Trace run length", stats->trace_run_length_hist); + print_histogram(out, "Optimized trace length", stats->optimized_trace_length_hist); + + const char* const* names; for (int i = 0; i < 512; i++) { if (i < 256) { names = _PyOpcode_OpName; @@ -238,7 +250,12 @@ print_optimization_stats(FILE *out, OptimizationStats *stats) for (int i = 0; i < 256; i++) { if (stats->unsupported_opcode[i]) { - fprintf(out, "unsupported_opcode[%s].count : %" PRIu64 "\n", _PyOpcode_OpName[i]); + fprintf( + out, + "unsupported_opcode[%s].count : %" PRIu64 "\n", + _PyOpcode_OpName[i], + stats->unsupported_opcode[i] + ); } } } diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index c41d3bd085bacf..763d2fe2dd0537 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -13,6 +13,7 @@ from datetime import date import itertools import sys +import re if os.name == "nt": DEFAULT_DIR = "c:\\temp\\py_stats\\" @@ -320,6 +321,15 @@ def emit_table(header, rows): print("|", " | ".join(to_str(i) for i in row), "|") print() +def emit_histogram(title, stats, key, total): + rows = [] + for 
k, v in stats.items(): + if k.startswith(key): + entry = int(re.match(r".+\[([0-9]+)\]", k).groups()[0]) + rows.append((f"<= {entry}", int(v), format_ratio(int(v), total))) + print(f"**{title}**\n") + emit_table(("Range", "Count:", "Ratio:"), rows) + def calculate_execution_counts(opcode_stats, total): counts = [] for name, opcode_stat in opcode_stats.items(): @@ -671,6 +681,25 @@ def emit_optimization_stats(stats): rows = calculate_optimization_stats(stats) emit_table(("", "Count:", "Ratio:"), rows) + emit_histogram( + "Trace length histogram", + stats, + "Trace length", + stats["Optimization traces created"] + ) + emit_histogram( + "Optimized trace length histogram", + stats, + "Optimized trace length", + stats["Optimization traces created"] + ) + emit_histogram( + "Trace run length histogram", + stats, + "Trace run length", + stats["Optimization traces executed"] + ) + with Section("Uop stats", level=3): rows = calculate_uop_execution_counts(uop_stats) emit_table( @@ -689,22 +718,7 @@ def emit_optimization_stats(stats): def emit_comparative_optimization_stats(base_stats, head_stats): - if not all("Optimization attempts" in stats for stats in (base_stats, head_stats)): - return - - base_uop_stats = extract_opcode_stats(base_stats, "uop") - head_uop_stats = extract_opcode_stats(head_stats, "uop") - - with Section("Optimization (Tier 2) stats", summary="statistics about the Tier 2 optimizer"): - with Section("Overall stats", level=3): - base_rows = calculate_optimization_stats(base_stats) - head_rows = calculate_optimization_stats(head_stats) - emit_table(("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), join_rows(base_rows, head_rows)) - - with Section("Uop stats", level=3): - base_rows = calculate_uop_execution_counts(base_uop_stats) - head_rows = calculate_uop_execution_counts(head_uop_stats) - _emit_comparative_execution_counts(base_rows, head_rows) + print("## Comparative optimization stats not implemented\n\n") def output_single_stats(stats): From 08cd8c04ab4a83db88fced754a39f8ab269bf7d6 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 27 Sep 2023 10:19:52 -0400 Subject: [PATCH 09/16] Disable Tier 1 stats in Tier 2 --- Python/executor.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Python/executor.c b/Python/executor.c index 48e9f2b42df8ff..e509cfb1144a1d 100644 --- a/Python/executor.c +++ b/Python/executor.c @@ -31,6 +31,16 @@ goto deoptimize; \ } +#ifdef Py_STATS +// Disable these macros that apply to Tier 1 stats when we are in Tier 2 +#undef STAT_INC +#define STAT_INC(opname, name) ((void)0) +#undef STAT_DEC +#define STAT_DEC(opname, name) ((void)0) +#undef CALL_STAT_INC +#define CALL_STAT_INC(name) ((void)0) +#endif + #undef ENABLE_SPECIALIZATION #define ENABLE_SPECIALIZATION 0 From 8d2699d26c7d2f750ed4a15161fea2531a10a081 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 27 Sep 2023 18:55:51 -0400 Subject: [PATCH 10/16] Trailing whitespace --- Python/specialize.c | 6 +++--- Tools/scripts/summarize_stats.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Python/specialize.c b/Python/specialize.c index 3a697f420d0f37..8ed2ad6276167d 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -251,9 +251,9 @@ print_optimization_stats(FILE *out, OptimizationStats *stats) for (int i = 0; i < 256; i++) { if (stats->unsupported_opcode[i]) { fprintf( - out, - "unsupported_opcode[%s].count : %" PRIu64 "\n", - _PyOpcode_OpName[i], + out, + "unsupported_opcode[%s].count : %" PRIu64 "\n", + 
_PyOpcode_OpName[i], stats->unsupported_opcode[i] ); } diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index 763d2fe2dd0537..f3bcd01b636c9a 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -682,8 +682,8 @@ def emit_optimization_stats(stats): emit_table(("", "Count:", "Ratio:"), rows) emit_histogram( - "Trace length histogram", - stats, + "Trace length histogram", + stats, "Trace length", stats["Optimization traces created"] ) @@ -694,8 +694,8 @@ def emit_optimization_stats(stats): stats["Optimization traces created"] ) emit_histogram( - "Trace run length histogram", - stats, + "Trace run length histogram", + stats, "Trace run length", stats["Optimization traces executed"] ) From ad0ca42c3b063066ea8ac3049f5672de59b78b59 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 28 Sep 2023 08:41:26 -0400 Subject: [PATCH 11/16] Make histogram bias more explicit --- Include/cpython/pystats.h | 3 +++ Include/internal/pycore_code.h | 2 +- Python/specialize.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index f1fe6e5f4715f8..99916ed74ac12a 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -100,7 +100,10 @@ typedef struct _uop_stats { uint64_t execution_count; } UOpStats; +// Note that these should be set to support the largest possible trace length. +// i.e. log2(MAX_LENGTH) + 1 === (_Py_UOP_HIST_SIZE + _Py_UOP_HIST_BIAS) #define _Py_UOP_HIST_SIZE 5 +#define _Py_UOP_HIST_BIAS 2 typedef struct _optimization_stats { uint64_t attempts; diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 433ca49e424c1c..9ac358862a3927 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -285,7 +285,7 @@ extern int _PyStaticCode_Init(PyCodeObject *co); #define OPT_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0) #define UOP_EXE_INC(opname) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].execution_count++; } while (0) #define OPT_UNSUPPORTED_OPCODE(opname) do { if (_Py_stats) _Py_stats->optimization_stats.unsupported_opcode[opname]++; } while (0) -#define OPT_HIST(length, name) do { if (_Py_stats) _Py_stats->optimization_stats.name[_Py_bit_length((length - 1) >> 2)]++; } while (0) +#define OPT_HIST(length, name) do { if (_Py_stats) _Py_stats->optimization_stats.name[_Py_bit_length((length - 1) >> _Py_UOP_HIST_BIAS)]++; } while (0) // Export for '_opcode' shared extension PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); diff --git a/Python/specialize.c b/Python/specialize.c index 8ed2ad6276167d..65863249b66d17 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -215,7 +215,7 @@ static void print_histogram(FILE *out, const char *name, uint64_t hist[_Py_UOP_HIST_SIZE]) { for (int i = 0; i < _Py_UOP_HIST_SIZE; i++) { - fprintf(out, "%s[%d]: %" PRIu64 "\n", name, (1 << (i + 2)), hist[i]); + fprintf(out, "%s[%d]: %" PRIu64 "\n", name, (1 << (i + _Py_UOP_HIST_BIAS)), hist[i]); } } From e5ee4e5c2a6667e498ca9890b9cbab00c968f139 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 28 Sep 2023 08:41:46 -0400 Subject: [PATCH 12/16] Various simplifications from code review --- Tools/scripts/summarize_stats.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index f3bcd01b636c9a..7587069f4e0cf6 100644 --- 
a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -217,7 +217,7 @@ def extract_opcode_stats(stats, prefix): for key, value in stats.items(): if not key.startswith(prefix): continue - name, _, rest = key[len(prefix) + 2:].partition("]") + name, _, rest = key[len(prefix) + 1:].partition("]") opcode_stats[name][rest.strip(".")] = value return opcode_stats @@ -361,9 +361,9 @@ def emit_execution_counts(opcode_stats, total): ) def _emit_comparative_execution_counts(base_rows, head_rows): - base_data = dict((x[0], x[1:]) for x in base_rows) - head_data = dict((x[0], x[1:]) for x in head_rows) - opcodes = set(base_data.keys()) | set(head_data.keys()) + base_data = {x[0]: x[1:] for x in base_rows} + head_data = {x[0]: x[1:] for x in head_rows} + opcodes = base_data.keys() | head_data.keys() rows = [] default = [0, "0.0%", "0.0%", 0] @@ -376,9 +376,9 @@ def _emit_comparative_execution_counts(base_rows, head_rows): change = (head_entry[0] - base_entry[0]) / base_entry[0] rows.append( (opcode, base_entry[0], head_entry[0], - f"{100*change:0.1f}%")) + f"{change:0.1%}")) - rows.sort(key=lambda x: -abs(percentage_to_float(x[-1]))) + rows.sort(key=lambda x: abs(percentage_to_float(x[-1])), reverse=True) emit_table( ("Name", "Base Count:", "Head Count:", "Change:"), From 0dac9e56e0c3f11c54d0573c5c294684154f0495 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 2 Oct 2023 15:16:40 -0400 Subject: [PATCH 13/16] Bugfix getting opcode stats --- Tools/scripts/summarize_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index 7587069f4e0cf6..6bf9223035092b 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -674,7 +674,7 @@ def emit_optimization_stats(stats): if "Optimization attempts" not in stats: return - uop_stats = extract_opcode_stats(stats, "uop") + uop_stats = extract_opcode_stats(stats, "uops") with Section("Optimization (Tier 2) stats", summary="statistics about the Tier 2 optimizer"): with Section("Overall stats", level=3): From 69a6e7bdd5d7b00074a3726aa963de517fb72545 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 2 Oct 2023 17:08:16 -0400 Subject: [PATCH 14/16] Change bucketing. Change how execution length is calculated --- Include/cpython/pystats.h | 5 +- Include/internal/pycore_code.h | 9 +- Python/bytecodes.c | 2 +- Python/executor.c | 11 +- Python/executor_cases.c.h | 2 +- Python/specialize.c | 2 +- Tools/scripts/summarize_stats.py | 350 +++++++++++++++++++++---------- 7 files changed, 261 insertions(+), 120 deletions(-) diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index 99916ed74ac12a..056406e6b5d992 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -100,10 +100,7 @@ typedef struct _uop_stats { uint64_t execution_count; } UOpStats; -// Note that these should be set to support the largest possible trace length. -// i.e. 
log2(MAX_LENGTH) + 1 === (_Py_UOP_HIST_SIZE + _Py_UOP_HIST_BIAS) -#define _Py_UOP_HIST_SIZE 5 -#define _Py_UOP_HIST_BIAS 2 +#define _Py_UOP_HIST_SIZE 32 typedef struct _optimization_stats { uint64_t attempts; diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 9ac358862a3927..1699f88579cd29 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -285,7 +285,14 @@ extern int _PyStaticCode_Init(PyCodeObject *co); #define OPT_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0) #define UOP_EXE_INC(opname) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].execution_count++; } while (0) #define OPT_UNSUPPORTED_OPCODE(opname) do { if (_Py_stats) _Py_stats->optimization_stats.unsupported_opcode[opname]++; } while (0) -#define OPT_HIST(length, name) do { if (_Py_stats) _Py_stats->optimization_stats.name[_Py_bit_length((length - 1) >> _Py_UOP_HIST_BIAS)]++; } while (0) +#define OPT_HIST(length, name) \ + do { \ + if (_Py_stats) { \ + int bucket = _Py_bit_length(length - 1); \ + bucket = (bucket >= _Py_UOP_HIST_SIZE) ? _Py_UOP_HIST_SIZE - 1 : bucket; \ + _Py_stats->optimization_stats.name[bucket]++; \ + } \ + } while (0) // Export for '_opcode' shared extension PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 059264163d0a15..7a01464213312f 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -3913,7 +3913,7 @@ dummy_func( frame->prev_instr--; // Back up to just before destination _PyFrame_SetStackPointer(frame, stack_pointer); Py_DECREF(self); - OPT_HIST(pc, trace_run_length_hist); + OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); return frame; } diff --git a/Python/executor.c b/Python/executor.c index e509cfb1144a1d..6da3af08822fd0 100644 --- a/Python/executor.c +++ b/Python/executor.c @@ -79,6 +79,9 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject int opcode; int oparg; uint64_t operand; +#ifdef Py_STATS + uint64_t trace_uop_execution_counter = 0; +#endif for (;;) { opcode = self->trace[pc].opcode; @@ -94,6 +97,10 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject pc++; OPT_STAT_INC(uops_executed); UOP_EXE_INC(opcode); +#ifdef Py_STATS + trace_uop_execution_counter++; +#endif + switch (opcode) { #include "executor_cases.c.h" @@ -126,7 +133,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject // On ERROR_IF we return NULL as the frame. // The caller recovers the frame from tstate->current_frame. DPRINTF(2, "Error: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand); - OPT_HIST(pc, trace_run_length_hist); + OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); _PyFrame_SetStackPointer(frame, stack_pointer); Py_DECREF(self); return NULL; @@ -135,7 +142,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject // On DEOPT_IF we just repeat the last instruction. // This presumes nothing was popped from the stack (nor pushed). 
DPRINTF(2, "DEOPT: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand); - OPT_HIST(pc, trace_run_length_hist); + OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); frame->prev_instr--; // Back up to just before destination _PyFrame_SetStackPointer(frame, stack_pointer); Py_DECREF(self); diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index cee0a723fe8855..86a01bf553b27a 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -3008,7 +3008,7 @@ frame->prev_instr--; // Back up to just before destination _PyFrame_SetStackPointer(frame, stack_pointer); Py_DECREF(self); - OPT_HIST(pc, trace_run_length_hist); + OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); return frame; break; } diff --git a/Python/specialize.c b/Python/specialize.c index 65863249b66d17..ff732eb30ca1e0 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -215,7 +215,7 @@ static void print_histogram(FILE *out, const char *name, uint64_t hist[_Py_UOP_HIST_SIZE]) { for (int i = 0; i < _Py_UOP_HIST_SIZE; i++) { - fprintf(out, "%s[%d]: %" PRIu64 "\n", name, (1 << (i + _Py_UOP_HIST_BIAS)), hist[i]); + fprintf(out, "%s[%" PRIu64"]: %" PRIu64 "\n", name, (uint64_t)1 << i, hist[i]); } } diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index 6bf9223035092b..840de1fa345c5e 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -22,6 +22,7 @@ TOTAL = "specialization.hit", "specialization.miss", "execution_count" + def format_ratio(num, den): """ Format a ratio as a percentage. When the denominator is 0, returns the empty @@ -32,6 +33,7 @@ def format_ratio(num, den): else: return f"{num/den:.01%}" + def percentage_to_float(s): """ Converts a percentage string to a float. 
The empty string is returned as 0.0 @@ -42,6 +44,7 @@ def percentage_to_float(s): assert s[-1] == "%" return float(s[:-1]) + def join_rows(a_rows, b_rows): """ Joins two tables together, side-by-side, where the first column in each is a @@ -80,40 +83,53 @@ def join_rows(a_rows, b_rows): keys = list(a_data.keys()) + [k for k in b_data.keys() if k not in a_data] return [(k, *a_data.get(k, default), *b_data.get(k, default)) for k in keys] + def calculate_specialization_stats(family_stats, total): rows = [] for key in sorted(family_stats): if key.startswith("specialization.failure_kinds"): continue if key in ("specialization.hit", "specialization.miss"): - label = key[len("specialization."):] + label = key[len("specialization.") :] elif key == "execution_count": continue - elif key in ("specialization.success", "specialization.failure", "specializable"): + elif key in ( + "specialization.success", + "specialization.failure", + "specializable", + ): continue elif key.startswith("pair"): continue else: label = key - rows.append((f"{label:>12}", f"{family_stats[key]:>12}", format_ratio(family_stats[key], total))) + rows.append( + ( + f"{label:>12}", + f"{family_stats[key]:>12}", + format_ratio(family_stats[key], total), + ) + ) return rows + def calculate_specialization_success_failure(family_stats): total_attempts = 0 - for key in ("specialization.success", "specialization.failure"): + for key in ("specialization.success", "specialization.failure"): total_attempts += family_stats.get(key, 0) rows = [] if total_attempts: - for key in ("specialization.success", "specialization.failure"): - label = key[len("specialization."):] + for key in ("specialization.success", "specialization.failure"): + label = key[len("specialization.") :] label = label[0].upper() + label[1:] val = family_stats.get(key, 0) rows.append((label, val, format_ratio(val, total_attempts))) return rows + def calculate_specialization_failure_kinds(name, family_stats, defines): total_failures = family_stats.get("specialization.failure", 0) - failure_kinds = [ 0 ] * 40 + failure_kinds = [0] * 40 for key in family_stats: if not key.startswith("specialization.failure_kind"): continue @@ -126,9 +142,16 @@ def calculate_specialization_failure_kinds(name, family_stats, defines): for value, index in failures: if not value: continue - rows.append((kind_to_text(index, defines, name), value, format_ratio(value, total_failures))) + rows.append( + ( + kind_to_text(index, defines, name), + value, + format_ratio(value, total_failures), + ) + ) return rows + def print_specialization_stats(name, family_stats, defines): if "specializable" not in family_stats: return @@ -145,7 +168,10 @@ def print_specialization_stats(name, family_stats, defines): rows = calculate_specialization_failure_kinds(name, family_stats, defines) emit_table(("Failure kind", "Count:", "Ratio:"), rows) -def print_comparative_specialization_stats(name, base_family_stats, head_family_stats, defines): + +def print_comparative_specialization_stats( + name, base_family_stats, head_family_stats, defines +): if "specializable" not in base_family_stats: return @@ -158,20 +184,33 @@ def print_comparative_specialization_stats(name, base_family_stats, head_family_ head_rows = calculate_specialization_stats(head_family_stats, head_total) emit_table( ("Kind", "Base Count", "Base Ratio", "Head Count", "Head Ratio"), - join_rows(base_rows, head_rows) + join_rows(base_rows, head_rows), ) base_rows = calculate_specialization_success_failure(base_family_stats) head_rows = 
calculate_specialization_success_failure(head_family_stats) rows = join_rows(base_rows, head_rows) if rows: print_title("Specialization attempts", 4) - emit_table(("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), rows) - base_rows = calculate_specialization_failure_kinds(name, base_family_stats, defines) - head_rows = calculate_specialization_failure_kinds(name, head_family_stats, defines) emit_table( - ("Failure kind", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), - join_rows(base_rows, head_rows) + ("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), rows ) + base_rows = calculate_specialization_failure_kinds( + name, base_family_stats, defines + ) + head_rows = calculate_specialization_failure_kinds( + name, head_family_stats, defines + ) + emit_table( + ( + "Failure kind", + "Base Count:", + "Base Ratio:", + "Head Count:", + "Head Ratio:", + ), + join_rows(base_rows, head_rows), + ) + def gather_stats(input): # Note the output of this function must be JSON-serializable @@ -180,7 +219,9 @@ def gather_stats(input): with open(input, "r") as fd: stats = json.load(fd) - stats["_stats_defines"] = {int(k): v for k, v in stats["_stats_defines"].items()} + stats["_stats_defines"] = { + int(k): v for k, v in stats["_stats_defines"].items() + } stats["_defines"] = {int(k): v for k, v in stats["_defines"].items()} return stats @@ -192,18 +233,20 @@ def gather_stats(input): try: key, value = line.split(":") except ValueError: - print(f"Unparsable line: '{line.strip()}' in {filename}", file=sys.stderr) + print( + f"Unparsable line: '{line.strip()}' in {filename}", + file=sys.stderr, + ) continue key = key.strip() value = int(value) stats[key] += value - stats['__nfiles__'] += 1 + stats["__nfiles__"] += 1 import opcode stats["_specialized_instructions"] = [ - op for op in opcode._specialized_opmap.keys() - if "__" not in op + op for op in opcode._specialized_opmap.keys() if "__" not in op ] stats["_stats_defines"] = get_stats_defines() stats["_defines"] = get_defines() @@ -212,15 +255,17 @@ def gather_stats(input): else: raise ValueError(f"{input:r} is not a file or directory path") + def extract_opcode_stats(stats, prefix): opcode_stats = collections.defaultdict(dict) for key, value in stats.items(): if not key.startswith(prefix): continue - name, _, rest = key[len(prefix) + 1:].partition("]") + name, _, rest = key[len(prefix) + 1 :].partition("]") opcode_stats[name][rest.strip(".")] = value return opcode_stats + def parse_kinds(spec_src, prefix="SPEC_FAIL"): defines = collections.defaultdict(list) start = "#define " + prefix + "_" @@ -228,14 +273,16 @@ def parse_kinds(spec_src, prefix="SPEC_FAIL"): line = line.strip() if not line.startswith(start): continue - line = line[len(start):] + line = line[len(start) :] name, val = line.split() defines[int(val.strip())].append(name.strip()) return defines + def pretty(defname): return defname.replace("_", " ").lower() + def kind_to_text(kind, defines, opname): if kind <= 8: return pretty(defines[kind][0]) @@ -249,9 +296,10 @@ def kind_to_text(kind, defines, opname): opname = "SUBSCR" for name in defines[kind]: if name.startswith(opname): - return pretty(name[len(opname)+1:]) + return pretty(name[len(opname) + 1 :]) return "kind " + str(kind) + def categorized_counts(opcode_stats, specialized_instructions): basic = 0 specialized = 0 @@ -259,7 +307,7 @@ def categorized_counts(opcode_stats, specialized_instructions): for name, opcode_stat in opcode_stats.items(): if "execution_count" not in opcode_stat: continue - 
count = opcode_stat['execution_count'] + count = opcode_stat["execution_count"] if "specializable" in opcode_stat: not_specialized += count elif name in specialized_instructions: @@ -270,12 +318,13 @@ def categorized_counts(opcode_stats, specialized_instructions): basic += count return basic, not_specialized, specialized + def print_title(name, level=2): - print("#"*level, name) + print("#" * level, name) print() -class Section: +class Section: def __init__(self, title, level=2, summary=None): self.title = title self.level = level @@ -296,12 +345,14 @@ def __exit__(*args): print("") print() + def to_str(x): if isinstance(x, int): return format(x, ",d") else: return str(x) + def emit_table(header, rows): width = len(header) header_line = "|" @@ -321,20 +372,28 @@ def emit_table(header, rows): print("|", " | ".join(to_str(i) for i in row), "|") print() + def emit_histogram(title, stats, key, total): rows = [] for k, v in stats.items(): if k.startswith(key): entry = int(re.match(r".+\[([0-9]+)\]", k).groups()[0]) rows.append((f"<= {entry}", int(v), format_ratio(int(v), total))) + # Don't include larger buckets with 0 entries + for j in range(len(rows) - 1, -1, -1): + if rows[j][1] != 0: + break + rows = rows[: j + 1] + print(f"**{title}**\n") emit_table(("Range", "Count:", "Ratio:"), rows) + def calculate_execution_counts(opcode_stats, total): counts = [] for name, opcode_stat in opcode_stats.items(): if "execution_count" in opcode_stat: - count = opcode_stat['execution_count'] + count = opcode_stat["execution_count"] miss = 0 if "specializable" not in opcode_stat: miss = opcode_stat.get("specialization.miss") @@ -342,23 +401,29 @@ def calculate_execution_counts(opcode_stats, total): counts.sort(reverse=True) cumulative = 0 rows = [] - for (count, name, miss) in counts: + for count, name, miss in counts: cumulative += count if miss: miss = format_ratio(miss, count) else: miss = "" - rows.append((name, count, format_ratio(count, total), - format_ratio(cumulative, total), miss)) + rows.append( + ( + name, + count, + format_ratio(count, total), + format_ratio(cumulative, total), + miss, + ) + ) return rows + def emit_execution_counts(opcode_stats, total): with Section("Execution counts", summary="execution counts for all instructions"): rows = calculate_execution_counts(opcode_stats, total) - emit_table( - ("Name", "Count:", "Self:", "Cumulative:", "Miss ratio:"), - rows - ) + emit_table(("Name", "Count:", "Self:", "Cumulative:", "Miss ratio:"), rows) + def _emit_comparative_execution_counts(base_rows, head_rows): base_data = {x[0]: x[1:] for x in base_rows} @@ -374,37 +439,40 @@ def _emit_comparative_execution_counts(base_rows, head_rows): change = 1 else: change = (head_entry[0] - base_entry[0]) / base_entry[0] - rows.append( - (opcode, base_entry[0], head_entry[0], - f"{change:0.1%}")) + rows.append((opcode, base_entry[0], head_entry[0], f"{change:0.1%}")) rows.sort(key=lambda x: abs(percentage_to_float(x[-1])), reverse=True) - emit_table( - ("Name", "Base Count:", "Head Count:", "Change:"), - rows - ) + emit_table(("Name", "Base Count:", "Head Count:", "Change:"), rows) + def emit_comparative_execution_counts( base_opcode_stats, base_total, head_opcode_stats, head_total, level=2 ): - with Section("Execution counts", summary="execution counts for all instructions", level=level): + with Section( + "Execution counts", summary="execution counts for all instructions", level=level + ): base_rows = calculate_execution_counts(base_opcode_stats, base_total) head_rows = 
calculate_execution_counts(head_opcode_stats, head_total) _emit_comparative_execution_counts(base_rows, head_rows) + def get_defines(): spec_path = os.path.join(os.path.dirname(__file__), "../../Python/specialize.c") with open(spec_path) as spec_src: defines = parse_kinds(spec_src) return defines + def emit_specialization_stats(opcode_stats, defines): with Section("Specialization stats", summary="specialization stats by family"): for name, opcode_stat in opcode_stats.items(): print_specialization_stats(name, opcode_stat, defines) -def emit_comparative_specialization_stats(base_opcode_stats, head_opcode_stats, defines): + +def emit_comparative_specialization_stats( + base_opcode_stats, head_opcode_stats, defines +): with Section("Specialization stats", summary="specialization stats by family"): opcodes = set(base_opcode_stats.keys()) & set(head_opcode_stats.keys()) for opcode in opcodes: @@ -412,6 +480,7 @@ def emit_comparative_specialization_stats(base_opcode_stats, head_opcode_stats, opcode, base_opcode_stats[opcode], head_opcode_stats[opcode], defines ) + def calculate_specialization_effectiveness( opcode_stats, total, specialized_instructions ): @@ -424,11 +493,17 @@ def calculate_specialization_effectiveness( ("Specialized", specialized, format_ratio(specialized, total)), ] + def emit_specialization_overview(opcode_stats, total, specialized_instructions): with Section("Specialization effectiveness"): - rows = calculate_specialization_effectiveness(opcode_stats, total, specialized_instructions) + rows = calculate_specialization_effectiveness( + opcode_stats, total, specialized_instructions + ) emit_table(("Instructions", "Count:", "Ratio:"), rows) - for title, field in (("Deferred", "specialization.deferred"), ("Misses", "specialization.miss")): + for title, field in ( + ("Deferred", "specialization.deferred"), + ("Misses", "specialization.miss"), + ): total = 0 counts = [] for name, opcode_stat in opcode_stats.items(): @@ -441,11 +516,19 @@ def emit_specialization_overview(opcode_stats, total, specialized_instructions): counts.sort(reverse=True) if total: with Section(f"{title} by instruction", 3): - rows = [ (name, count, format_ratio(count, total)) for (count, name) in counts[:10] ] + rows = [ + (name, count, format_ratio(count, total)) + for (count, name) in counts[:10] + ] emit_table(("Name", "Count:", "Ratio:"), rows) + def emit_comparative_specialization_overview( - base_opcode_stats, base_total, head_opcode_stats, head_total, specialized_instructions + base_opcode_stats, + base_total, + head_opcode_stats, + head_total, + specialized_instructions, ): with Section("Specialization effectiveness"): base_rows = calculate_specialization_effectiveness( @@ -455,16 +538,26 @@ def emit_comparative_specialization_overview( head_opcode_stats, head_total, specialized_instructions ) emit_table( - ("Instructions", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), - join_rows(base_rows, head_rows) + ( + "Instructions", + "Base Count:", + "Base Ratio:", + "Head Count:", + "Head Ratio:", + ), + join_rows(base_rows, head_rows), ) + def get_stats_defines(): - stats_path = os.path.join(os.path.dirname(__file__), "../../Include/cpython/pystats.h") + stats_path = os.path.join( + os.path.dirname(__file__), "../../Include/cpython/pystats.h" + ) with open(stats_path) as stats_src: defines = parse_kinds(stats_src, prefix="EVAL_CALL") return defines + def calculate_call_stats(stats, defines): total = 0 for key, value in stats.items(): @@ -476,7 +569,7 @@ def calculate_call_stats(stats, defines): 
rows.append((key, value, format_ratio(value, total))) elif key.startswith("Calls "): name, index = key[:-1].split("[") - index = int(index) + index = int(index) label = name + " (" + pretty(defines[index][0]) + ")" rows.append((label, value, format_ratio(value, total))) for key, value in stats.items(): @@ -484,11 +577,13 @@ def calculate_call_stats(stats, defines): rows.append((key, value, format_ratio(value, total))) return rows + def emit_call_stats(stats, defines): with Section("Call stats", summary="Inlined calls and frame stats"): rows = calculate_call_stats(stats, defines) emit_table(("", "Count:", "Ratio:"), rows) + def emit_comparative_call_stats(base_stats, head_stats, defines): with Section("Call stats", summary="Inlined calls and frame stats"): base_rows = calculate_call_stats(base_stats, defines) @@ -496,15 +591,21 @@ def emit_comparative_call_stats(base_stats, head_stats, defines): rows = join_rows(base_rows, head_rows) rows.sort(key=lambda x: -percentage_to_float(x[-1])) emit_table( - ("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), - rows + ("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), rows ) + def calculate_object_stats(stats): total_materializations = stats.get("Object new values") - total_allocations = stats.get("Object allocations") + stats.get("Object allocations from freelist") - total_increfs = stats.get("Object interpreter increfs") + stats.get("Object increfs") - total_decrefs = stats.get("Object interpreter decrefs") + stats.get("Object decrefs") + total_allocations = stats.get("Object allocations") + stats.get( + "Object allocations from freelist" + ) + total_increfs = stats.get("Object interpreter increfs") + stats.get( + "Object increfs" + ) + total_decrefs = stats.get("Object interpreter decrefs") + stats.get( + "Object decrefs" + ) rows = [] for key, value in stats.items(): if key.startswith("Object"): @@ -512,9 +613,9 @@ def calculate_object_stats(stats): ratio = format_ratio(value, total_materializations) elif "allocations" in key: ratio = format_ratio(value, total_allocations) - elif "increfs" in key: + elif "increfs" in key: ratio = format_ratio(value, total_increfs) - elif "decrefs" in key: + elif "decrefs" in key: ratio = format_ratio(value, total_decrefs) else: ratio = "" @@ -523,6 +624,7 @@ def calculate_object_stats(stats): rows.append((label, value, ratio)) return rows + def calculate_gc_stats(stats): gc_stats = [] for key, value in stats.items(): @@ -539,40 +641,58 @@ def calculate_gc_stats(stats): for (i, gen) in enumerate(gc_stats) ] + def emit_object_stats(stats): with Section("Object stats", summary="allocations, frees and dict materializatons"): rows = calculate_object_stats(stats) - emit_table(("", "Count:", "Ratio:"), rows) + emit_table(("", "Count:", "Ratio:"), rows) + def emit_comparative_object_stats(base_stats, head_stats): with Section("Object stats", summary="allocations, frees and dict materializatons"): base_rows = calculate_object_stats(base_stats) head_rows = calculate_object_stats(head_stats) - emit_table(("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), join_rows(base_rows, head_rows)) + emit_table( + ("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), + join_rows(base_rows, head_rows), + ) + def emit_gc_stats(stats): with Section("GC stats", summary="GC collections and effectiveness"): rows = calculate_gc_stats(stats) - emit_table(("Generation:", "Collections:", "Objects collected:", "Object visits:"), rows) + emit_table( + ("Generation:", "Collections:", "Objects 
collected:", "Object visits:"), + rows, + ) + def emit_comparative_gc_stats(base_stats, head_stats): with Section("GC stats", summary="GC collections and effectiveness"): base_rows = calculate_gc_stats(base_stats) head_rows = calculate_gc_stats(head_stats) emit_table( - ("Generation:", - "Base collections:", "Head collections:", - "Base objects collected:", "Head objects collected:", - "Base object visits:", "Head object visits:"), - join_rows(base_rows, head_rows)) + ( + "Generation:", + "Base collections:", + "Head collections:", + "Base objects collected:", + "Head objects collected:", + "Base object visits:", + "Head object visits:", + ), + join_rows(base_rows, head_rows), + ) + def get_total(opcode_stats): total = 0 for opcode_stat in opcode_stats.values(): if "execution_count" in opcode_stat: - total += opcode_stat['execution_count'] + total += opcode_stat["execution_count"] return total + def emit_pair_counts(opcode_stats, total): pair_counts = [] for name_i, opcode_stat in opcode_stats.items(): @@ -585,15 +705,22 @@ def emit_pair_counts(opcode_stats, total): pair_counts.sort(reverse=True) cumulative = 0 rows = [] - for (count, pair) in itertools.islice(pair_counts, 100): + for count, pair in itertools.islice(pair_counts, 100): name_i, name_j = pair cumulative += count - rows.append((f"{name_i} {name_j}", count, format_ratio(count, total), - format_ratio(cumulative, total))) - emit_table(("Pair", "Count:", "Self:", "Cumulative:"), - rows - ) - with Section("Predecessor/Successor Pairs", summary="Top 5 predecessors and successors of each opcode"): + rows.append( + ( + f"{name_i} {name_j}", + count, + format_ratio(count, total), + format_ratio(cumulative, total), + ) + ) + emit_table(("Pair", "Count:", "Self:", "Cumulative:"), rows) + with Section( + "Predecessor/Successor Pairs", + summary="Top 5 predecessors and successors of each opcode", + ): predecessors = collections.defaultdict(collections.Counter) successors = collections.defaultdict(collections.Counter) total_predecessors = collections.Counter() @@ -611,18 +738,18 @@ def emit_pair_counts(opcode_stats, total): continue pred_rows = succ_rows = () if total1: - pred_rows = [(pred, count, f"{count/total1:.1%}") - for (pred, count) in predecessors[name].most_common(5)] + pred_rows = [ + (pred, count, f"{count/total1:.1%}") + for (pred, count) in predecessors[name].most_common(5) + ] if total2: - succ_rows = [(succ, count, f"{count/total2:.1%}") - for (succ, count) in successors[name].most_common(5)] + succ_rows = [ + (succ, count, f"{count/total2:.1%}") + for (succ, count) in successors[name].most_common(5) + ] with Section(name, 3, f"Successors and predecessors for {name}"): - emit_table(("Predecessors", "Count:", "Percentage:"), - pred_rows - ) - emit_table(("Successors", "Count:", "Percentage:"), - succ_rows - ) + emit_table(("Predecessors", "Count:", "Percentage:"), pred_rows) + emit_table(("Successors", "Count:", "Percentage:"), succ_rows) def calculate_optimization_stats(stats): @@ -638,17 +765,14 @@ def calculate_optimization_stats(stats): return [ ("Optimization attempts", attempts, ""), - ( - "Traces created", created, - format_ratio(created, attempts) - ), + ("Traces created", created, format_ratio(created, attempts)), ("Traces executed", executed, ""), ("Uops executed", uops, int(uops / executed)), ("Trace stack overflow", trace_stack_overflow, ""), ("Trace stack underflow", trace_stack_underflow, ""), ("Trace too long", trace_too_long, ""), ("Inner loop found", inner_loop, ""), - ("Recursive call", recursive_call, "") + 
("Recursive call", recursive_call, ""), ] @@ -657,16 +781,17 @@ def calculate_uop_execution_counts(opcode_stats): counts = [] for name, opcode_stat in opcode_stats.items(): if "execution_count" in opcode_stat: - count = opcode_stat['execution_count'] + count = opcode_stat["execution_count"] counts.append((count, name)) total += count counts.sort(reverse=True) cumulative = 0 rows = [] - for (count, name) in counts: + for count, name in counts: cumulative += count - rows.append((name, count, format_ratio(count, total), - format_ratio(cumulative, total))) + rows.append( + (name, count, format_ratio(count, total), format_ratio(cumulative, total)) + ) return rows @@ -676,7 +801,9 @@ def emit_optimization_stats(stats): uop_stats = extract_opcode_stats(stats, "uops") - with Section("Optimization (Tier 2) stats", summary="statistics about the Tier 2 optimizer"): + with Section( + "Optimization (Tier 2) stats", summary="statistics about the Tier 2 optimizer" + ): with Section("Overall stats", level=3): rows = calculate_optimization_stats(stats) emit_table(("", "Count:", "Ratio:"), rows) @@ -685,27 +812,24 @@ def emit_optimization_stats(stats): "Trace length histogram", stats, "Trace length", - stats["Optimization traces created"] + stats["Optimization traces created"], ) emit_histogram( "Optimized trace length histogram", stats, "Optimized trace length", - stats["Optimization traces created"] + stats["Optimization traces created"], ) emit_histogram( "Trace run length histogram", stats, "Trace run length", - stats["Optimization traces executed"] + stats["Optimization traces executed"], ) with Section("Uop stats", level=3): rows = calculate_uop_execution_counts(uop_stats) - emit_table( - ("Uop", "Count:", "Self:", "Cumulative:"), - rows - ) + emit_table(("Uop", "Count:", "Self:", "Cumulative:"), rows) with Section("Unsupported opcodes", level=3): unsupported_opcodes = extract_opcode_stats(stats, "unsupported_opcode") @@ -727,13 +851,15 @@ def output_single_stats(stats): emit_execution_counts(opcode_stats, total) emit_pair_counts(opcode_stats, total) emit_specialization_stats(opcode_stats, stats["_defines"]) - emit_specialization_overview(opcode_stats, total, stats["_specialized_instructions"]) + emit_specialization_overview( + opcode_stats, total, stats["_specialized_instructions"] + ) emit_call_stats(stats, stats["_stats_defines"]) emit_object_stats(stats) emit_gc_stats(stats) emit_optimization_stats(stats) with Section("Meta stats", summary="Meta statistics"): - emit_table(("", "Count:"), [('Number of data files', stats['__nfiles__'])]) + emit_table(("", "Count:"), [("Number of data files", stats["__nfiles__"])]) def output_comparative_stats(base_stats, head_stats): @@ -750,14 +876,18 @@ def output_comparative_stats(base_stats, head_stats): base_opcode_stats, head_opcode_stats, head_stats["_defines"] ) emit_comparative_specialization_overview( - base_opcode_stats, base_total, head_opcode_stats, head_total, - head_stats["_specialized_instructions"] + base_opcode_stats, + base_total, + head_opcode_stats, + head_total, + head_stats["_specialized_instructions"], ) emit_comparative_call_stats(base_stats, head_stats, head_stats["_stats_defines"]) emit_comparative_object_stats(base_stats, head_stats) emit_comparative_gc_stats(base_stats, head_stats) emit_comparative_optimization_stats(base_stats, head_stats) + def output_stats(inputs, json_output=None): if len(inputs) == 1: stats = gather_stats(inputs[0]) @@ -766,9 +896,7 @@ def output_stats(inputs, json_output=None): output_single_stats(stats) elif 
len(inputs) == 2:
         if json_output is not None:
-            raise ValueError(
-                "Can not output to JSON when there are multiple inputs"
-            )
+            raise ValueError("Can not output to JSON when there are multiple inputs")
 
         base_stats = gather_stats(inputs[0])
         head_stats = gather_stats(inputs[1])
@@ -777,6 +905,7 @@ def output_stats(inputs, json_output=None):
     print("---")
     print("Stats gathered on:", date.today())
 
+
 def main():
     parser = argparse.ArgumentParser(description="Summarize pystats results")
 
@@ -792,14 +921,14 @@ def main():
         If one source is provided, its stats are printed.
         If two sources are provided, comparative stats are printed.
         Default is {DEFAULT_DIR}.
-        """
+        """,
     )
 
     parser.add_argument(
         "--json-output",
         nargs="?",
         type=argparse.FileType("w"),
-        help="Output complete raw results to the given JSON file."
+        help="Output complete raw results to the given JSON file.",
     )
 
     args = parser.parse_args()
@@ -809,5 +938,6 @@ def main():
 
     output_stats(args.inputs, json_output=args.json_output)
 
+
 if __name__ == "__main__":
     main()

From d7c1403e860792bd2940285fb5eb2079255c5461 Mon Sep 17 00:00:00 2001
From: Michael Droettboom
Date: Tue, 3 Oct 2023 11:27:19 -0400
Subject: [PATCH 15/16] Clamp lower range of histogram as well

---
 Include/internal/pycore_code.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h
index 1699f88579cd29..d31d8363d771ca 100644
--- a/Include/internal/pycore_code.h
+++ b/Include/internal/pycore_code.h
@@ -288,7 +288,7 @@ extern int _PyStaticCode_Init(PyCodeObject *co);
 #define OPT_HIST(length, name) \
     do { \
         if (_Py_stats) { \
-            int bucket = _Py_bit_length(length - 1); \
+            int bucket = _Py_bit_length(length >= 1 ? length - 1 : 0); \
             bucket = (bucket >= _Py_UOP_HIST_SIZE) ? _Py_UOP_HIST_SIZE - 1 : bucket; \
             _Py_stats->optimization_stats.name[bucket]++; \
         } \

From aa69c0f8017e2fe6c942bd764ff7bcaaa2b97c06 Mon Sep 17 00:00:00 2001
From: Michael Droettboom
Date: Tue, 3 Oct 2023 11:46:36 -0400
Subject: [PATCH 16/16] Fix divide-by-zero

---
 Tools/scripts/summarize_stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py
index 840de1fa345c5e..b9cc2f94080b88 100644
--- a/Tools/scripts/summarize_stats.py
+++ b/Tools/scripts/summarize_stats.py
@@ -767,7 +767,7 @@ def calculate_optimization_stats(stats):
         ("Optimization attempts", attempts, ""),
         ("Traces created", created, format_ratio(created, attempts)),
         ("Traces executed", executed, ""),
-        ("Uops executed", uops, int(uops / executed)),
+        ("Uops executed", uops, int(uops / (executed or 1))),
         ("Trace stack overflow", trace_stack_overflow, ""),
         ("Trace stack underflow", trace_stack_underflow, ""),
         ("Trace too long", trace_too_long, ""),
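
Illustration only, not part of the patch series: the minimal Python sketch below
mirrors the power-of-two bucketing that the OPT_HIST macro in pycore_code.h
performs, and shows why PATCH 15/16 clamps the lower range as well: lengths of
0 and 1 both land in bucket 0, and the revised print_histogram() labels each
bucket with the largest length it covers. The histogram size of 32 buckets and
the helper name opt_hist_bucket are assumptions made for this example.

    # Sketch of OPT_HIST bucketing (assumes _Py_UOP_HIST_SIZE == 32).
    _PY_UOP_HIST_SIZE = 32

    def opt_hist_bucket(length: int) -> int:
        """Map a trace/run length to a histogram bucket, clamping both ends."""
        # (length - 1).bit_length() equals ceil(log2(length)) for length >= 1;
        # lengths below 1 are clamped to bucket 0, as in the C macro's
        # `length >= 1 ? length - 1 : 0`.
        bucket = (length - 1).bit_length() if length >= 1 else 0
        # Overly long traces are clamped into the last bucket.
        return min(bucket, _PY_UOP_HIST_SIZE - 1)

    # Bucket i covers lengths up to 2**i, matching the "name[1 << i]" labels
    # printed by print_histogram() and the "<= N" ranges in summarize_stats.py.
    assert [opt_hist_bucket(n) for n in (0, 1, 2, 3, 4, 9)] == [0, 0, 1, 2, 2, 4]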