From 7ba5f5527dc431e04f24cf80ddbf6979fac8f16b Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 26 Sep 2023 14:21:25 -0400 Subject: [PATCH 01/16] gh-109329: Support for basic pystats for Tier 2 --- Include/cpython/pystats.h | 18 ++++- Include/internal/pycore_code.h | 6 ++ Python/bytecodes.c | 2 +- Python/executor.c | 6 +- Python/generated_cases.c.h | 2 +- Python/optimizer.c | 2 +- Python/specialize.c | 26 +++++- Tools/scripts/summarize_stats.py | 135 ++++++++++++++++++++++++------- 8 files changed, 153 insertions(+), 44 deletions(-) diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index 150e16faa96ca1..7014c968af8229 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -86,10 +86,6 @@ typedef struct _object_stats { uint64_t type_cache_dunder_hits; uint64_t type_cache_dunder_misses; uint64_t type_cache_collisions; - uint64_t optimization_attempts; - uint64_t optimization_traces_created; - uint64_t optimization_traces_executed; - uint64_t optimization_uops_executed; /* Temporary value used during GC */ uint64_t object_visits; } ObjectStats; @@ -100,10 +96,24 @@ typedef struct _gc_stats { uint64_t objects_collected; } GCStats; +typedef struct _uop_stats { + uint64_t execution_count; + uint64_t miss; +} UOpStats; + +typedef struct _optimization_stats { + uint64_t attempts; + uint64_t traces_created; + uint64_t traces_executed; + uint64_t uops_executed; + UOpStats opcode[512]; +} OptimizationStats; + typedef struct _stats { OpcodeStats opcode_stats[256]; CallStats call_stats; ObjectStats object_stats; + OptimizationStats optimization_stats; GCStats *gc_stats; } PyStats; diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index a77fa11baf8413..e89cab16c6d9eb 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -282,6 +282,9 @@ extern int _PyStaticCode_Init(PyCodeObject *co); #define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) \ do { if (_Py_stats && PyFunction_Check(callable)) _Py_stats->call_stats.eval_calls[name]++; } while (0) #define GC_STAT_ADD(gen, name, n) do { if (_Py_stats) _Py_stats->gc_stats[(gen)].name += (n); } while (0) +#define OPTIMIZATION_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0) +#define UOP_EXE_INC(opname) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].execution_count++; } while (0) +#define UOP_STAT_INC(opname, name) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].name++; } while (0) // Export for '_opcode' shared extension PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); @@ -296,6 +299,9 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); #define EVAL_CALL_STAT_INC(name) ((void)0) #define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) ((void)0) #define GC_STAT_ADD(gen, name, n) ((void)0) +#define OPTIMIZATION_STAT_INC(name) ((void)0) +#define UOP_EXE_INC(opname) ((void)0) +#define UOP_STAT_INC(opname, name) ((void)0) #endif // !Py_STATS // Utility functions for reading/writing 32/64-bit values in the inline caches. 
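For orientation before the remaining hunks of this patch: the UOpStats opcode[512] table added above gives every instruction its own execution counter, with slots 0-255 used for Tier 1 opcodes and slots 256 and up used for uop opcodes, which is why print_optimization_stats() switches name tables at index 256. Below is a minimal Python sketch of that counting scheme, not the C implementation; the two name tables are made-up placeholders standing in for _PyOpcode_OpName and _PyOpcode_uop_name.

    import sys

    TIER1_NAMES = {0: "CACHE", 97: "JUMP_BACKWARD"}    # placeholder for _PyOpcode_OpName
    UOP_NAMES = {300: "_SET_IP", 301: "_EXIT_TRACE"}   # placeholder for _PyOpcode_uop_name

    execution_count = [0] * 512                        # one slot per opcode, like UOpStats.execution_count

    def uop_exe_inc(opcode):
        # Rough equivalent of the UOP_EXE_INC(opname) macro when _Py_stats is set.
        execution_count[opcode] += 1

    def dump(out):
        # Rough equivalent of the per-opcode loop in print_optimization_stats():
        # pick the Tier 1 name table below index 256, the uop name table above it.
        for i, count in enumerate(execution_count):
            if not count:
                continue
            names = TIER1_NAMES if i < 256 else UOP_NAMES
            print(f"uops[{names.get(i, i)}].execution_count : {count}", file=out)

    for op in (300, 300, 301):
        uop_exe_inc(op)
    dump(sys.stdout)   # uops[_SET_IP].execution_count : 2, uops[_EXIT_TRACE].execution_count : 1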
diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 402b27101dbdb6..e813b65f5c8e0e 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2244,7 +2244,7 @@ dummy_func( // Double-check that the opcode isn't instrumented or something: here->op.code == JUMP_BACKWARD) { - OBJECT_STAT_INC(optimization_attempts); + OPTIMIZATION_STAT_INC(attempts); int optimized = _PyOptimizer_BackEdge(frame, here, next_instr, stack_pointer); ERROR_IF(optimized < 0, error); if (optimized) { diff --git a/Python/executor.c b/Python/executor.c index ac9104223da8ff..cb9f00aae72975 100644 --- a/Python/executor.c +++ b/Python/executor.c @@ -62,7 +62,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject CHECK_EVAL_BREAKER(); - OBJECT_STAT_INC(optimization_traces_executed); + OPTIMIZATION_STAT_INC(traces_executed); _Py_CODEUNIT *ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; int pc = 0; int opcode; @@ -81,7 +81,9 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject operand, (int)(stack_pointer - _PyFrame_Stackbase(frame))); pc++; - OBJECT_STAT_INC(optimization_uops_executed); + OPTIMIZATION_STAT_INC(uops_executed); + assert(opcode < 512); + UOP_EXE_INC(opcode); switch (opcode) { #include "executor_cases.c.h" diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index ebb87a86de432e..216640c23891fd 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -2947,7 +2947,7 @@ // Double-check that the opcode isn't instrumented or something: here->op.code == JUMP_BACKWARD) { - OBJECT_STAT_INC(optimization_attempts); + OPTIMIZATION_STAT_INC(attempts); int optimized = _PyOptimizer_BackEdge(frame, here, next_instr, stack_pointer); if (optimized < 0) goto error; if (optimized) { diff --git a/Python/optimizer.c b/Python/optimizer.c index fbdbf7291784c4..fc1e7a8cad98fd 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -891,7 +891,7 @@ uop_optimize( // Error or nothing translated return trace_length; } - OBJECT_STAT_INC(optimization_traces_created); + OPTIMIZATION_STAT_INC(traces_created); char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE"); if (uop_optimize != NULL && *uop_optimize > '0') { trace_length = _Py_uop_analyze_and_optimize(code, trace, trace_length, curr_stackentries); diff --git a/Python/specialize.c b/Python/specialize.c index d9b748cad78f4f..71b10b7c3d2eeb 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -199,10 +199,6 @@ print_object_stats(FILE *out, ObjectStats *stats) fprintf(out, "Object method cache collisions: %" PRIu64 "\n", stats->type_cache_collisions); fprintf(out, "Object method cache dunder hits: %" PRIu64 "\n", stats->type_cache_dunder_hits); fprintf(out, "Object method cache dunder misses: %" PRIu64 "\n", stats->type_cache_dunder_misses); - fprintf(out, "Optimization attempts: %" PRIu64 "\n", stats->optimization_attempts); - fprintf(out, "Optimization traces created: %" PRIu64 "\n", stats->optimization_traces_created); - fprintf(out, "Optimization traces executed: %" PRIu64 "\n", stats->optimization_traces_executed); - fprintf(out, "Optimization uops executed: %" PRIu64 "\n", stats->optimization_uops_executed); } static void @@ -215,6 +211,27 @@ print_gc_stats(FILE *out, GCStats *stats) } } +static void +print_optimization_stats(FILE *out, OptimizationStats *stats) +{ + fprintf(out, "Optimization attempts: %" PRIu64 "\n", stats->attempts); + fprintf(out, "Optimization traces created: %" PRIu64 "\n", stats->traces_created); + fprintf(out, "Optimization 
traces executed: %" PRIu64 "\n", stats->traces_executed); + fprintf(out, "Optimization uops executed: %" PRIu64 "\n", stats->uops_executed); + + char** names; + for (int i = 0; i < 512; i++) { + if (i < 256) { + names = _PyOpcode_OpName; + } else { + names = _PyOpcode_uop_name; + } + if (stats->opcode[i].execution_count) { + fprintf(out, "uops[%s].execution_count : %" PRIu64 "\n", names[i], stats->opcode[i].execution_count); + } + } +} + static void print_stats(FILE *out, PyStats *stats) { @@ -222,6 +239,7 @@ print_stats(FILE *out, PyStats *stats) print_call_stats(out, &stats->call_stats); print_object_stats(out, &stats->object_stats); print_gc_stats(out, stats->gc_stats); + print_optimization_stats(out, &stats->optimization_stats); } void diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index 3b2bdd8015be4a..8b8dce950ed852 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -211,12 +211,12 @@ def gather_stats(input): else: raise ValueError(f"{input:r} is not a file or directory path") -def extract_opcode_stats(stats): +def extract_opcode_stats(stats, prefix): opcode_stats = collections.defaultdict(dict) for key, value in stats.items(): - if not key.startswith("opcode"): + if not key.startswith(prefix): continue - name, _, rest = key[7:].partition("]") + name, _, rest = key[len(prefix) + 2:].partition("]") opcode_stats[name][rest.strip(".")] = value return opcode_stats @@ -350,35 +350,38 @@ def emit_execution_counts(opcode_stats, total): rows ) +def _emit_comparative_execution_counts(base_rows, head_rows): + base_data = dict((x[0], x[1:]) for x in base_rows) + head_data = dict((x[0], x[1:]) for x in head_rows) + opcodes = set(base_data.keys()) | set(head_data.keys()) + + rows = [] + default = [0, "0.0%", "0.0%", 0] + for opcode in opcodes: + base_entry = base_data.get(opcode, default) + head_entry = head_data.get(opcode, default) + if base_entry[0] == 0: + change = 1 + else: + change = (head_entry[0] - base_entry[0]) / base_entry[0] + rows.append( + (opcode, base_entry[0], head_entry[0], + f"{100*change:0.1f}%")) + + rows.sort(key=lambda x: -abs(percentage_to_float(x[-1]))) + + emit_table( + ("Name", "Base Count:", "Head Count:", "Change:"), + rows + ) + def emit_comparative_execution_counts( - base_opcode_stats, base_total, head_opcode_stats, head_total + base_opcode_stats, base_total, head_opcode_stats, head_total, level=2 ): - with Section("Execution counts", summary="execution counts for all instructions"): + with Section("Execution counts", summary="execution counts for all instructions", level=level): base_rows = calculate_execution_counts(base_opcode_stats, base_total) head_rows = calculate_execution_counts(head_opcode_stats, head_total) - base_data = dict((x[0], x[1:]) for x in base_rows) - head_data = dict((x[0], x[1:]) for x in head_rows) - opcodes = set(base_data.keys()) | set(head_data.keys()) - - rows = [] - default = [0, "0.0%", "0.0%", 0] - for opcode in opcodes: - base_entry = base_data.get(opcode, default) - head_entry = head_data.get(opcode, default) - if base_entry[0] == 0: - change = 1 - else: - change = (head_entry[0] - base_entry[0]) / base_entry[0] - rows.append( - (opcode, base_entry[0], head_entry[0], - f"{100*change:0.1f}%")) - - rows.sort(key=lambda x: -abs(percentage_to_float(x[-1]))) - - emit_table( - ("Name", "Base Count:", "Head Count:", "Change:"), - rows - ) + _emit_comparative_execution_counts(base_rows, head_rows) def get_defines(): spec_path = os.path.join(os.path.dirname(__file__), 
"../../Python/specialize.c") @@ -611,8 +614,76 @@ def emit_pair_counts(opcode_stats, total): succ_rows ) + +def calculate_optimization_stats(stats): + attempts = stats["Optimization attempts"] + created = stats["Optimization traces created"] + executed = stats["Optimization traces executed"] + uops = stats["Optimization uops executed"] + + return [ + ("Optimization attempts", attempts, ""), + ( + "Traces created", created, + format_ratio(created, attempts) + ), + ("Traces executed", executed, ""), + ("Uops executed", uops, format_ratio(uops, executed)) + ] + + +def calculate_uop_execution_counts(opcode_stats): + total = 0 + counts = [] + for name, opcode_stat in opcode_stats.items(): + if "execution_count" in opcode_stat: + count = opcode_stat['execution_count'] + counts.append((count, name)) + total += count + counts.sort(reverse=True) + cumulative = 0 + rows = [] + for (count, name) in counts: + cumulative += count + rows.append((name, count, format_ratio(count, total), + format_ratio(cumulative, total))) + return rows + + +def emit_optimization_stats(stats): + uop_stats = extract_opcode_stats(stats, "uop") + + with Section("Optimization (Tier 2) stats", summary="statistics about the Tier 2 optimizer"): + with Section("Overall stats", level=3): + rows = calculate_optimization_stats(stats) + emit_table(("", "Count:", "Ratio:"), rows) + + with Section("Uop stats", level=3): + rows = calculate_uop_execution_counts(uop_stats) + emit_table( + ("Uop", "Count:", "Self:", "Cumulative:"), + rows + ) + + +def emit_comparative_optimization_stats(base_stats, head_stats): + base_uop_stats = extract_opcode_stats(base_stats, "uop") + head_uop_stats = extract_opcode_stats(head_stats, "uop") + + with Section("Optimization (Tier 2) stats", summary="statistics about the Tier 2 optimizer"): + with Section("Overall stats", level=3): + base_rows = calculate_optimization_stats(base_stats) + head_rows = calculate_optimization_stats(head_stats) + emit_table(("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), join_rows(base_rows, head_rows)) + + with Section("Uop stats", level=3): + base_rows = calculate_uop_execution_counts(base_uop_stats) + head_rows = calculate_uop_execution_counts(head_uop_stats) + _emit_comparative_execution_counts(base_rows, head_rows) + + def output_single_stats(stats): - opcode_stats = extract_opcode_stats(stats) + opcode_stats = extract_opcode_stats(stats, "opcode") total = get_total(opcode_stats) emit_execution_counts(opcode_stats, total) emit_pair_counts(opcode_stats, total) @@ -621,15 +692,16 @@ def output_single_stats(stats): emit_call_stats(stats, stats["_stats_defines"]) emit_object_stats(stats) emit_gc_stats(stats) + emit_optimization_stats(stats) with Section("Meta stats", summary="Meta statistics"): emit_table(("", "Count:"), [('Number of data files', stats['__nfiles__'])]) def output_comparative_stats(base_stats, head_stats): - base_opcode_stats = extract_opcode_stats(base_stats) + base_opcode_stats = extract_opcode_stats(base_stats, "opcode") base_total = get_total(base_opcode_stats) - head_opcode_stats = extract_opcode_stats(head_stats) + head_opcode_stats = extract_opcode_stats(head_stats, "opcode") head_total = get_total(head_opcode_stats) emit_comparative_execution_counts( @@ -645,6 +717,7 @@ def output_comparative_stats(base_stats, head_stats): emit_comparative_call_stats(base_stats, head_stats, head_stats["_stats_defines"]) emit_comparative_object_stats(base_stats, head_stats) emit_comparative_gc_stats(base_stats, head_stats) + 
emit_comparative_optimization_stats(base_stats, head_stats) def output_stats(inputs, json_output=None): if len(inputs) == 1: From b84f496e3b2b8616eb696f51cfc78e54412ea5d5 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 26 Sep 2023 14:37:19 -0400 Subject: [PATCH 02/16] Remove unnecessary stuff --- Include/cpython/pystats.h | 1 - Include/internal/pycore_code.h | 2 -- 2 files changed, 3 deletions(-) diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index 7014c968af8229..78004e64bb1f05 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -98,7 +98,6 @@ typedef struct _gc_stats { typedef struct _uop_stats { uint64_t execution_count; - uint64_t miss; } UOpStats; typedef struct _optimization_stats { diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index e89cab16c6d9eb..3c49eb4ae35a77 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -284,7 +284,6 @@ extern int _PyStaticCode_Init(PyCodeObject *co); #define GC_STAT_ADD(gen, name, n) do { if (_Py_stats) _Py_stats->gc_stats[(gen)].name += (n); } while (0) #define OPTIMIZATION_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0) #define UOP_EXE_INC(opname) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].execution_count++; } while (0) -#define UOP_STAT_INC(opname, name) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].name++; } while (0) // Export for '_opcode' shared extension PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); @@ -301,7 +300,6 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); #define GC_STAT_ADD(gen, name, n) ((void)0) #define OPTIMIZATION_STAT_INC(name) ((void)0) #define UOP_EXE_INC(opname) ((void)0) -#define UOP_STAT_INC(opname, name) ((void)0) #endif // !Py_STATS // Utility functions for reading/writing 32/64-bit values in the inline caches. 
From 5549410ca6d233e2d88c6fcaa6e66e30c5a94bf3 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 26 Sep 2023 14:39:27 -0400 Subject: [PATCH 03/16] Delete trailing whitespace --- Python/specialize.c | 2 +- Tools/scripts/summarize_stats.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Python/specialize.c b/Python/specialize.c index 71b10b7c3d2eeb..106737bda1c43e 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -218,7 +218,7 @@ print_optimization_stats(FILE *out, OptimizationStats *stats) fprintf(out, "Optimization traces created: %" PRIu64 "\n", stats->traces_created); fprintf(out, "Optimization traces executed: %" PRIu64 "\n", stats->traces_executed); fprintf(out, "Optimization uops executed: %" PRIu64 "\n", stats->uops_executed); - + char** names; for (int i = 0; i < 512; i++) { if (i < 256) { diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index 8b8dce950ed852..64a161008b4d31 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -624,7 +624,7 @@ def calculate_optimization_stats(stats): return [ ("Optimization attempts", attempts, ""), ( - "Traces created", created, + "Traces created", created, format_ratio(created, attempts) ), ("Traces executed", executed, ""), @@ -661,7 +661,7 @@ def emit_optimization_stats(stats): with Section("Uop stats", level=3): rows = calculate_uop_execution_counts(uop_stats) emit_table( - ("Uop", "Count:", "Self:", "Cumulative:"), + ("Uop", "Count:", "Self:", "Cumulative:"), rows ) From ed639fc4534ce8b592809043ecaa6b43f1a44bd0 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 26 Sep 2023 15:11:01 -0400 Subject: [PATCH 04/16] Don't print optimization stats if not in dataset --- Tools/scripts/summarize_stats.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index 64a161008b4d31..b2d7940476b597 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -651,6 +651,9 @@ def calculate_uop_execution_counts(opcode_stats): def emit_optimization_stats(stats): + if "Optimization attempts" not in stats: + return + uop_stats = extract_opcode_stats(stats, "uop") with Section("Optimization (Tier 2) stats", summary="statistics about the Tier 2 optimizer"): @@ -667,6 +670,9 @@ def emit_optimization_stats(stats): def emit_comparative_optimization_stats(base_stats, head_stats): + if not any("Optimization attempts" in stats for stats in (base_stats, head_stats)): + return + base_uop_stats = extract_opcode_stats(base_stats, "uop") head_uop_stats = extract_opcode_stats(head_stats, "uop") From 195664e5a76ce820b7402437c5829191bfbd5252 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 26 Sep 2023 16:59:04 -0400 Subject: [PATCH 05/16] Fix logic --- Tools/scripts/summarize_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index b2d7940476b597..18adaa1a05c6eb 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -670,7 +670,7 @@ def emit_optimization_stats(stats): def emit_comparative_optimization_stats(base_stats, head_stats): - if not any("Optimization attempts" in stats for stats in (base_stats, head_stats)): + if not all("Optimization attempts" in stats for stats in (base_stats, head_stats)): return base_uop_stats = extract_opcode_stats(base_stats, "uop") From 8d76c0ac7d7c8a2e0c8dd6f130c60a15691c2a55 Mon Sep 17 00:00:00 
2001 From: Michael Droettboom Date: Wed, 27 Sep 2023 10:12:53 -0400 Subject: [PATCH 06/16] More simple Tier 2 counters --- Include/cpython/pystats.h | 6 ++++++ Include/internal/pycore_code.h | 4 ++++ Python/optimizer.c | 9 ++++++++- Python/specialize.c | 11 +++++++++++ Tools/scripts/summarize_stats.py | 21 ++++++++++++++++++++- 5 files changed, 49 insertions(+), 2 deletions(-) diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index 78004e64bb1f05..08cf488d7b2d01 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -105,7 +105,13 @@ typedef struct _optimization_stats { uint64_t traces_created; uint64_t traces_executed; uint64_t uops_executed; + uint64_t trace_stack_overflow; + uint64_t trace_stack_underflow; + uint64_t trace_too_long; + uint64_t inner_loop; + uint64_t recursive_call; UOpStats opcode[512]; + uint64_t unsupported_opcode[256]; } OptimizationStats; typedef struct _stats { diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 3c49eb4ae35a77..8b11de0f2dce02 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -284,6 +284,8 @@ extern int _PyStaticCode_Init(PyCodeObject *co); #define GC_STAT_ADD(gen, name, n) do { if (_Py_stats) _Py_stats->gc_stats[(gen)].name += (n); } while (0) #define OPTIMIZATION_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0) #define UOP_EXE_INC(opname) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].execution_count++; } while (0) +#define OPTIMIZE_UNSUPPORTED_OPCODE(opname) do { if (_Py_stats) _Py_stats->optimization_stats.unsupported_opcode[opname]++; } while (0) +#define OPTIMIZE_TRACE_STACK_OVERFLOW() do { if (_Py_stats) _Py_stats->optimization_stats.trace_stack_overflow++; } while (0) // Export for '_opcode' shared extension PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); @@ -300,6 +302,8 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); #define GC_STAT_ADD(gen, name, n) ((void)0) #define OPTIMIZATION_STAT_INC(name) ((void)0) #define UOP_EXE_INC(opname) ((void)0) +#define OPTIMIZE_UNSUPPOTED_OPCODE(opname) ((void)0) +#define OPTIMIZE_TRACE_STACK_OVERFLOW() ((void)0) #endif // !Py_STATS // Utility functions for reading/writing 32/64-bit values in the inline caches. diff --git a/Python/optimizer.c b/Python/optimizer.c index fc1e7a8cad98fd..7fc20af7221c2b 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -461,6 +461,7 @@ translate_bytecode_to_trace( if (trace_length + (n) > max_length) { \ DPRINTF(2, "No room for %s (need %d, got %d)\n", \ (opname), (n), max_length - trace_length); \ + OPTIMIZATION_STAT_INC(trace_too_long); \ goto done; \ } \ reserved = (n); // Keep ADD_TO_TRACE / ADD_TO_STUB honest @@ -472,6 +473,7 @@ translate_bytecode_to_trace( #define TRACE_STACK_PUSH() \ if (trace_stack_depth >= TRACE_STACK_SIZE) { \ DPRINTF(2, "Trace stack overflow\n"); \ + OPTIMIZATION_STAT_INC(trace_stack_overflow); \ ADD_TO_TRACE(_SET_IP, 0, 0); \ goto done; \ } \ @@ -572,6 +574,7 @@ translate_bytecode_to_trace( ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0); } else { + OPTIMIZATION_STAT_INC(inner_loop); DPRINTF(2, "JUMP_BACKWARD not to top ends trace\n"); } goto done; @@ -638,7 +641,9 @@ translate_bytecode_to_trace( // LOAD_CONST + _POP_FRAME. 
if (trace_stack_depth == 0) { DPRINTF(2, "Trace stack underflow\n"); - goto done;} + OPTIMIZATION_STAT_INC(trace_stack_underflow); + goto done; + } } uint32_t orig_oparg = oparg; // For OPARG_TOP/BOTTOM for (int i = 0; i < nuops; i++) { @@ -713,6 +718,7 @@ translate_bytecode_to_trace( PyUnicode_AsUTF8(new_code->co_qualname), PyUnicode_AsUTF8(new_code->co_filename), new_code->co_firstlineno); + OPTIMIZATION_STAT_INC(recursive_call); ADD_TO_TRACE(_SET_IP, 0, 0); goto done; } @@ -744,6 +750,7 @@ translate_bytecode_to_trace( break; } DPRINTF(2, "Unsupported opcode %s\n", uop_name(opcode)); + OPTIMIZE_UNSUPPORTED_OPCODE(opcode); goto done; // Break out of loop } // End default diff --git a/Python/specialize.c b/Python/specialize.c index 106737bda1c43e..a2b4b50ce8e797 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -218,6 +218,11 @@ print_optimization_stats(FILE *out, OptimizationStats *stats) fprintf(out, "Optimization traces created: %" PRIu64 "\n", stats->traces_created); fprintf(out, "Optimization traces executed: %" PRIu64 "\n", stats->traces_executed); fprintf(out, "Optimization uops executed: %" PRIu64 "\n", stats->uops_executed); + fprintf(out, "Optimization trace stack overflow: %" PRIu64 "\n", stats->trace_stack_overflow); + fprintf(out, "Optimization trace stack underflow: %" PRIu64 "\n", stats->trace_stack_underflow); + fprintf(out, "Optimization trace too long: %" PRIu64 "\n", stats->trace_too_long); + fprintf(out, "Optimization inner loop: %" PRIu64 "\n", stats->inner_loop); + fprintf(out, "Optimization recursive call: %" PRIu64 "\n", stats->recursive_call); char** names; for (int i = 0; i < 512; i++) { @@ -230,6 +235,12 @@ print_optimization_stats(FILE *out, OptimizationStats *stats) fprintf(out, "uops[%s].execution_count : %" PRIu64 "\n", names[i], stats->opcode[i].execution_count); } } + + for (int i = 0; i < 256; i++) { + if (stats->unsupported_opcode[i]) { + fprintf(out, "unsupported_opcode[%s].count : %" PRIu64 "\n", _PyOpcode_OpName[i]); + } + } } static void diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index 18adaa1a05c6eb..c41d3bd085bacf 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -620,6 +620,11 @@ def calculate_optimization_stats(stats): created = stats["Optimization traces created"] executed = stats["Optimization traces executed"] uops = stats["Optimization uops executed"] + trace_stack_overflow = stats["Optimization trace stack overflow"] + trace_stack_underflow = stats["Optimization trace stack underflow"] + trace_too_long = stats["Optimization trace too long"] + inner_loop = stats["Optimization inner loop"] + recursive_call = stats["Optimization recursive call"] return [ ("Optimization attempts", attempts, ""), @@ -628,7 +633,12 @@ def calculate_optimization_stats(stats): format_ratio(created, attempts) ), ("Traces executed", executed, ""), - ("Uops executed", uops, format_ratio(uops, executed)) + ("Uops executed", uops, int(uops / executed)), + ("Trace stack overflow", trace_stack_overflow, ""), + ("Trace stack underflow", trace_stack_underflow, ""), + ("Trace too long", trace_too_long, ""), + ("Inner loop found", inner_loop, ""), + ("Recursive call", recursive_call, "") ] @@ -668,6 +678,15 @@ def emit_optimization_stats(stats): rows ) + with Section("Unsupported opcodes", level=3): + unsupported_opcodes = extract_opcode_stats(stats, "unsupported_opcode") + data = [] + for opcode, entry in unsupported_opcodes.items(): + data.append((entry["count"], opcode)) + 
data.sort(reverse=True) + rows = [(x[1], x[0]) for x in data] + emit_table(("Opcode", "Count"), rows) + def emit_comparative_optimization_stats(base_stats, head_stats): if not all("Optimization attempts" in stats for stats in (base_stats, head_stats)): From c54707a611401c83839d490b73a6a304d19f31c8 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 27 Sep 2023 10:24:45 -0400 Subject: [PATCH 07/16] Renames and fixups --- Include/internal/pycore_code.h | 10 ++++------ Python/executor.c | 4 ++-- Python/generated_cases.c.h | 2 +- Python/optimizer.c | 12 ++++++------ Python/specialize.c | 2 +- 5 files changed, 14 insertions(+), 16 deletions(-) diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 8b11de0f2dce02..5a193251da762d 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -282,10 +282,9 @@ extern int _PyStaticCode_Init(PyCodeObject *co); #define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) \ do { if (_Py_stats && PyFunction_Check(callable)) _Py_stats->call_stats.eval_calls[name]++; } while (0) #define GC_STAT_ADD(gen, name, n) do { if (_Py_stats) _Py_stats->gc_stats[(gen)].name += (n); } while (0) -#define OPTIMIZATION_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0) +#define OPT_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0) #define UOP_EXE_INC(opname) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].execution_count++; } while (0) -#define OPTIMIZE_UNSUPPORTED_OPCODE(opname) do { if (_Py_stats) _Py_stats->optimization_stats.unsupported_opcode[opname]++; } while (0) -#define OPTIMIZE_TRACE_STACK_OVERFLOW() do { if (_Py_stats) _Py_stats->optimization_stats.trace_stack_overflow++; } while (0) +#define OPT_UNSUPPORTED_OPCODE(opname) do { if (_Py_stats) _Py_stats->optimization_stats.unsupported_opcode[opname]++; } while (0) // Export for '_opcode' shared extension PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); @@ -300,10 +299,9 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); #define EVAL_CALL_STAT_INC(name) ((void)0) #define EVAL_CALL_STAT_INC_IF_FUNCTION(name, callable) ((void)0) #define GC_STAT_ADD(gen, name, n) ((void)0) -#define OPTIMIZATION_STAT_INC(name) ((void)0) +#define OPT_STAT_INC(name) ((void)0) #define UOP_EXE_INC(opname) ((void)0) -#define OPTIMIZE_UNSUPPOTED_OPCODE(opname) ((void)0) -#define OPTIMIZE_TRACE_STACK_OVERFLOW() ((void)0) +#define OPT_UNSUPPORTED_OPCODE(opname) ((void)0) #endif // !Py_STATS // Utility functions for reading/writing 32/64-bit values in the inline caches. 
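In the "Overall stats" table produced by calculate_optimization_stats() above, the third column mixes two kinds of values: "Traces created" is shown as a percentage of optimization attempts, while "Uops executed" is shown as the average number of uops per executed trace. A small Python sketch with made-up numbers, showing how those rows are derived:

    stats = {                                   # made-up counter values
        "Optimization attempts": 1000,
        "Optimization traces created": 640,
        "Optimization traces executed": 5000,
        "Optimization uops executed": 90000,
        "Optimization trace stack overflow": 3,
    }

    def format_ratio(num, den):
        # Same convention as format_ratio() in summarize_stats.py: empty string when den == 0.
        return f"{num / den:.01%}" if den else ""

    rows = [
        ("Optimization attempts", stats["Optimization attempts"], ""),
        ("Traces created", stats["Optimization traces created"],
         format_ratio(stats["Optimization traces created"], stats["Optimization attempts"])),
        ("Traces executed", stats["Optimization traces executed"], ""),
        ("Uops executed", stats["Optimization uops executed"],
         stats["Optimization uops executed"] // stats["Optimization traces executed"]),
        ("Trace stack overflow", stats["Optimization trace stack overflow"], ""),
    ]
    for row in rows:
        print(row)    # e.g. ('Traces created', 640, '64.0%') and ('Uops executed', 90000, 18)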
diff --git a/Python/executor.c b/Python/executor.c index cb9f00aae72975..572faa7f41b40d 100644 --- a/Python/executor.c +++ b/Python/executor.c @@ -62,7 +62,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject CHECK_EVAL_BREAKER(); - OPTIMIZATION_STAT_INC(traces_executed); + OPT_STAT_INC(traces_executed); _Py_CODEUNIT *ip_offset = (_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive; int pc = 0; int opcode; @@ -81,7 +81,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject operand, (int)(stack_pointer - _PyFrame_Stackbase(frame))); pc++; - OPTIMIZATION_STAT_INC(uops_executed); + OPT_STAT_INC(uops_executed); assert(opcode < 512); UOP_EXE_INC(opcode); switch (opcode) { diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 216640c23891fd..8e4feebc9290da 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -2947,7 +2947,7 @@ // Double-check that the opcode isn't instrumented or something: here->op.code == JUMP_BACKWARD) { - OPTIMIZATION_STAT_INC(attempts); + OPT_STAT_INC(attempts); int optimized = _PyOptimizer_BackEdge(frame, here, next_instr, stack_pointer); if (optimized < 0) goto error; if (optimized) { diff --git a/Python/optimizer.c b/Python/optimizer.c index 7fc20af7221c2b..d270286f2cac82 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -461,7 +461,7 @@ translate_bytecode_to_trace( if (trace_length + (n) > max_length) { \ DPRINTF(2, "No room for %s (need %d, got %d)\n", \ (opname), (n), max_length - trace_length); \ - OPTIMIZATION_STAT_INC(trace_too_long); \ + OPT_STAT_INC(trace_too_long); \ goto done; \ } \ reserved = (n); // Keep ADD_TO_TRACE / ADD_TO_STUB honest @@ -473,7 +473,7 @@ translate_bytecode_to_trace( #define TRACE_STACK_PUSH() \ if (trace_stack_depth >= TRACE_STACK_SIZE) { \ DPRINTF(2, "Trace stack overflow\n"); \ - OPTIMIZATION_STAT_INC(trace_stack_overflow); \ + OPT_STAT_INC(trace_stack_overflow); \ ADD_TO_TRACE(_SET_IP, 0, 0); \ goto done; \ } \ @@ -574,7 +574,7 @@ translate_bytecode_to_trace( ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0); } else { - OPTIMIZATION_STAT_INC(inner_loop); + OPT_STAT_INC(inner_loop); DPRINTF(2, "JUMP_BACKWARD not to top ends trace\n"); } goto done; @@ -641,7 +641,7 @@ translate_bytecode_to_trace( // LOAD_CONST + _POP_FRAME. 
if (trace_stack_depth == 0) { DPRINTF(2, "Trace stack underflow\n"); - OPTIMIZATION_STAT_INC(trace_stack_underflow); + OPT_STAT_INC(trace_stack_underflow); goto done; } } @@ -718,7 +718,7 @@ translate_bytecode_to_trace( PyUnicode_AsUTF8(new_code->co_qualname), PyUnicode_AsUTF8(new_code->co_filename), new_code->co_firstlineno); - OPTIMIZATION_STAT_INC(recursive_call); + OPT_STAT_INC(recursive_call); ADD_TO_TRACE(_SET_IP, 0, 0); goto done; } @@ -898,7 +898,7 @@ uop_optimize( // Error or nothing translated return trace_length; } - OPTIMIZATION_STAT_INC(traces_created); + OPT_STAT_INC(traces_created); char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE"); if (uop_optimize != NULL && *uop_optimize > '0') { trace_length = _Py_uop_analyze_and_optimize(code, trace, trace_length, curr_stackentries); diff --git a/Python/specialize.c b/Python/specialize.c index a2b4b50ce8e797..e326af2389cd1f 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -224,7 +224,7 @@ print_optimization_stats(FILE *out, OptimizationStats *stats) fprintf(out, "Optimization inner loop: %" PRIu64 "\n", stats->inner_loop); fprintf(out, "Optimization recursive call: %" PRIu64 "\n", stats->recursive_call); - char** names; + char* const* names; for (int i = 0; i < 512; i++) { if (i < 256) { names = _PyOpcode_OpName; From ba58546a9430898aba1045228deef1bc9973d186 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 27 Sep 2023 16:45:59 -0400 Subject: [PATCH 08/16] Lots more stats --- Include/cpython/pystats.h | 5 ++++ Include/internal/pycore_code.h | 2 ++ Python/bytecodes.c | 3 ++- Python/executor.c | 4 ++- Python/executor_cases.c.h | 1 + Python/optimizer.c | 4 ++- Python/specialize.c | 21 +++++++++++++-- Tools/scripts/summarize_stats.py | 46 +++++++++++++++++++++----------- 8 files changed, 65 insertions(+), 21 deletions(-) diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index 08cf488d7b2d01..f1fe6e5f4715f8 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -100,6 +100,8 @@ typedef struct _uop_stats { uint64_t execution_count; } UOpStats; +#define _Py_UOP_HIST_SIZE 5 + typedef struct _optimization_stats { uint64_t attempts; uint64_t traces_created; @@ -112,6 +114,9 @@ typedef struct _optimization_stats { uint64_t recursive_call; UOpStats opcode[512]; uint64_t unsupported_opcode[256]; + uint64_t trace_length_hist[_Py_UOP_HIST_SIZE]; + uint64_t trace_run_length_hist[_Py_UOP_HIST_SIZE]; + uint64_t optimized_trace_length_hist[_Py_UOP_HIST_SIZE]; } OptimizationStats; typedef struct _stats { diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 5a193251da762d..433ca49e424c1c 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -285,6 +285,7 @@ extern int _PyStaticCode_Init(PyCodeObject *co); #define OPT_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0) #define UOP_EXE_INC(opname) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].execution_count++; } while (0) #define OPT_UNSUPPORTED_OPCODE(opname) do { if (_Py_stats) _Py_stats->optimization_stats.unsupported_opcode[opname]++; } while (0) +#define OPT_HIST(length, name) do { if (_Py_stats) _Py_stats->optimization_stats.name[_Py_bit_length((length - 1) >> 2)]++; } while (0) // Export for '_opcode' shared extension PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); @@ -302,6 +303,7 @@ PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); #define OPT_STAT_INC(name) ((void)0) #define UOP_EXE_INC(opname) ((void)0) #define 
OPT_UNSUPPORTED_OPCODE(opname) ((void)0) +#define OPT_HIST(length, name) ((void)0) #endif // !Py_STATS // Utility functions for reading/writing 32/64-bit values in the inline caches. diff --git a/Python/bytecodes.c b/Python/bytecodes.c index e813b65f5c8e0e..059264163d0a15 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2244,7 +2244,7 @@ dummy_func( // Double-check that the opcode isn't instrumented or something: here->op.code == JUMP_BACKWARD) { - OPTIMIZATION_STAT_INC(attempts); + OPT_STAT_INC(attempts); int optimized = _PyOptimizer_BackEdge(frame, here, next_instr, stack_pointer); ERROR_IF(optimized < 0, error); if (optimized) { @@ -3913,6 +3913,7 @@ dummy_func( frame->prev_instr--; // Back up to just before destination _PyFrame_SetStackPointer(frame, stack_pointer); Py_DECREF(self); + OPT_HIST(pc, trace_run_length_hist); return frame; } diff --git a/Python/executor.c b/Python/executor.c index 572faa7f41b40d..48e9f2b42df8ff 100644 --- a/Python/executor.c +++ b/Python/executor.c @@ -2,6 +2,7 @@ #include "opcode.h" +#include "pycore_bitutils.h" #include "pycore_call.h" #include "pycore_ceval.h" #include "pycore_dict.h" @@ -82,7 +83,6 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject (int)(stack_pointer - _PyFrame_Stackbase(frame))); pc++; OPT_STAT_INC(uops_executed); - assert(opcode < 512); UOP_EXE_INC(opcode); switch (opcode) { @@ -116,6 +116,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject // On ERROR_IF we return NULL as the frame. // The caller recovers the frame from tstate->current_frame. DPRINTF(2, "Error: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand); + OPT_HIST(pc, trace_run_length_hist); _PyFrame_SetStackPointer(frame, stack_pointer); Py_DECREF(self); return NULL; @@ -124,6 +125,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject // On DEOPT_IF we just repeat the last instruction. // This presumes nothing was popped from the stack (nor pushed). 
DPRINTF(2, "DEOPT: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand); + OPT_HIST(pc, trace_run_length_hist); frame->prev_instr--; // Back up to just before destination _PyFrame_SetStackPointer(frame, stack_pointer); Py_DECREF(self); diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index befb972f1e90f5..cee0a723fe8855 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -3008,6 +3008,7 @@ frame->prev_instr--; // Back up to just before destination _PyFrame_SetStackPointer(frame, stack_pointer); Py_DECREF(self); + OPT_HIST(pc, trace_run_length_hist); return frame; break; } diff --git a/Python/optimizer.c b/Python/optimizer.c index d270286f2cac82..f8796eb24073d2 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -750,7 +750,7 @@ translate_bytecode_to_trace( break; } DPRINTF(2, "Unsupported opcode %s\n", uop_name(opcode)); - OPTIMIZE_UNSUPPORTED_OPCODE(opcode); + OPT_UNSUPPORTED_OPCODE(opcode); goto done; // Break out of loop } // End default @@ -898,6 +898,7 @@ uop_optimize( // Error or nothing translated return trace_length; } + OPT_HIST(trace_length, trace_length_hist); OPT_STAT_INC(traces_created); char *uop_optimize = Py_GETENV("PYTHONUOPSOPTIMIZE"); if (uop_optimize != NULL && *uop_optimize > '0') { @@ -908,6 +909,7 @@ uop_optimize( if (executor == NULL) { return -1; } + OPT_HIST(trace_length, optimized_trace_length_hist); executor->base.execute = _PyUopExecute; memcpy(executor->trace, trace, trace_length * sizeof(_PyUOpInstruction)); *exec_ptr = (_PyExecutorObject *)executor; diff --git a/Python/specialize.c b/Python/specialize.c index e326af2389cd1f..3a697f420d0f37 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -211,6 +211,14 @@ print_gc_stats(FILE *out, GCStats *stats) } } +static void +print_histogram(FILE *out, const char *name, uint64_t hist[_Py_UOP_HIST_SIZE]) +{ + for (int i = 0; i < _Py_UOP_HIST_SIZE; i++) { + fprintf(out, "%s[%d]: %" PRIu64 "\n", name, (1 << (i + 2)), hist[i]); + } +} + static void print_optimization_stats(FILE *out, OptimizationStats *stats) { @@ -224,7 +232,11 @@ print_optimization_stats(FILE *out, OptimizationStats *stats) fprintf(out, "Optimization inner loop: %" PRIu64 "\n", stats->inner_loop); fprintf(out, "Optimization recursive call: %" PRIu64 "\n", stats->recursive_call); - char* const* names; + print_histogram(out, "Trace length", stats->trace_length_hist); + print_histogram(out, "Trace run length", stats->trace_run_length_hist); + print_histogram(out, "Optimized trace length", stats->optimized_trace_length_hist); + + const char* const* names; for (int i = 0; i < 512; i++) { if (i < 256) { names = _PyOpcode_OpName; @@ -238,7 +250,12 @@ print_optimization_stats(FILE *out, OptimizationStats *stats) for (int i = 0; i < 256; i++) { if (stats->unsupported_opcode[i]) { - fprintf(out, "unsupported_opcode[%s].count : %" PRIu64 "\n", _PyOpcode_OpName[i]); + fprintf( + out, + "unsupported_opcode[%s].count : %" PRIu64 "\n", + _PyOpcode_OpName[i], + stats->unsupported_opcode[i] + ); } } } diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index c41d3bd085bacf..763d2fe2dd0537 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -13,6 +13,7 @@ from datetime import date import itertools import sys +import re if os.name == "nt": DEFAULT_DIR = "c:\\temp\\py_stats\\" @@ -320,6 +321,15 @@ def emit_table(header, rows): print("|", " | ".join(to_str(i) for i in row), "|") print() +def emit_histogram(title, stats, key, total): + rows = [] + for 
k, v in stats.items(): + if k.startswith(key): + entry = int(re.match(r".+\[([0-9]+)\]", k).groups()[0]) + rows.append((f"<= {entry}", int(v), format_ratio(int(v), total))) + print(f"**{title}**\n") + emit_table(("Range", "Count:", "Ratio:"), rows) + def calculate_execution_counts(opcode_stats, total): counts = [] for name, opcode_stat in opcode_stats.items(): @@ -671,6 +681,25 @@ def emit_optimization_stats(stats): rows = calculate_optimization_stats(stats) emit_table(("", "Count:", "Ratio:"), rows) + emit_histogram( + "Trace length histogram", + stats, + "Trace length", + stats["Optimization traces created"] + ) + emit_histogram( + "Optimized trace length histogram", + stats, + "Optimized trace length", + stats["Optimization traces created"] + ) + emit_histogram( + "Trace run length histogram", + stats, + "Trace run length", + stats["Optimization traces executed"] + ) + with Section("Uop stats", level=3): rows = calculate_uop_execution_counts(uop_stats) emit_table( @@ -689,22 +718,7 @@ def emit_optimization_stats(stats): def emit_comparative_optimization_stats(base_stats, head_stats): - if not all("Optimization attempts" in stats for stats in (base_stats, head_stats)): - return - - base_uop_stats = extract_opcode_stats(base_stats, "uop") - head_uop_stats = extract_opcode_stats(head_stats, "uop") - - with Section("Optimization (Tier 2) stats", summary="statistics about the Tier 2 optimizer"): - with Section("Overall stats", level=3): - base_rows = calculate_optimization_stats(base_stats) - head_rows = calculate_optimization_stats(head_stats) - emit_table(("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), join_rows(base_rows, head_rows)) - - with Section("Uop stats", level=3): - base_rows = calculate_uop_execution_counts(base_uop_stats) - head_rows = calculate_uop_execution_counts(head_uop_stats) - _emit_comparative_execution_counts(base_rows, head_rows) + print("## Comparative optimization stats not implemented\n\n") def output_single_stats(stats): From 08cd8c04ab4a83db88fced754a39f8ab269bf7d6 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 27 Sep 2023 10:19:52 -0400 Subject: [PATCH 09/16] Disable Tier 1 stats in Tier 2 --- Python/executor.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Python/executor.c b/Python/executor.c index 48e9f2b42df8ff..e509cfb1144a1d 100644 --- a/Python/executor.c +++ b/Python/executor.c @@ -31,6 +31,16 @@ goto deoptimize; \ } +#ifdef Py_STATS +// Disable these macros that apply to Tier 1 stats when we are in Tier 2 +#undef STAT_INC +#define STAT_INC(opname, name) ((void)0) +#undef STAT_DEC +#define STAT_DEC(opname, name) ((void)0) +#undef CALL_STAT_INC +#define CALL_STAT_INC(name) ((void)0) +#endif + #undef ENABLE_SPECIALIZATION #define ENABLE_SPECIALIZATION 0 From 8d2699d26c7d2f750ed4a15161fea2531a10a081 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Wed, 27 Sep 2023 18:55:51 -0400 Subject: [PATCH 10/16] Trailing whitespace --- Python/specialize.c | 6 +++--- Tools/scripts/summarize_stats.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Python/specialize.c b/Python/specialize.c index 3a697f420d0f37..8ed2ad6276167d 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -251,9 +251,9 @@ print_optimization_stats(FILE *out, OptimizationStats *stats) for (int i = 0; i < 256; i++) { if (stats->unsupported_opcode[i]) { fprintf( - out, - "unsupported_opcode[%s].count : %" PRIu64 "\n", - _PyOpcode_OpName[i], + out, + "unsupported_opcode[%s].count : %" PRIu64 "\n", + 
_PyOpcode_OpName[i], stats->unsupported_opcode[i] ); } diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index 763d2fe2dd0537..f3bcd01b636c9a 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -682,8 +682,8 @@ def emit_optimization_stats(stats): emit_table(("", "Count:", "Ratio:"), rows) emit_histogram( - "Trace length histogram", - stats, + "Trace length histogram", + stats, "Trace length", stats["Optimization traces created"] ) @@ -694,8 +694,8 @@ def emit_optimization_stats(stats): stats["Optimization traces created"] ) emit_histogram( - "Trace run length histogram", - stats, + "Trace run length histogram", + stats, "Trace run length", stats["Optimization traces executed"] ) From ad0ca42c3b063066ea8ac3049f5672de59b78b59 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 28 Sep 2023 08:41:26 -0400 Subject: [PATCH 11/16] Make histogram bias more explicit --- Include/cpython/pystats.h | 3 +++ Include/internal/pycore_code.h | 2 +- Python/specialize.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index f1fe6e5f4715f8..99916ed74ac12a 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -100,7 +100,10 @@ typedef struct _uop_stats { uint64_t execution_count; } UOpStats; +// Note that these should be set to support the largest possible trace length. +// i.e. log2(MAX_LENGTH) + 1 === (_Py_UOP_HIST_SIZE + _Py_UOP_HIST_BIAS) #define _Py_UOP_HIST_SIZE 5 +#define _Py_UOP_HIST_BIAS 2 typedef struct _optimization_stats { uint64_t attempts; diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 433ca49e424c1c..9ac358862a3927 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -285,7 +285,7 @@ extern int _PyStaticCode_Init(PyCodeObject *co); #define OPT_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0) #define UOP_EXE_INC(opname) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].execution_count++; } while (0) #define OPT_UNSUPPORTED_OPCODE(opname) do { if (_Py_stats) _Py_stats->optimization_stats.unsupported_opcode[opname]++; } while (0) -#define OPT_HIST(length, name) do { if (_Py_stats) _Py_stats->optimization_stats.name[_Py_bit_length((length - 1) >> 2)]++; } while (0) +#define OPT_HIST(length, name) do { if (_Py_stats) _Py_stats->optimization_stats.name[_Py_bit_length((length - 1) >> _Py_UOP_HIST_BIAS)]++; } while (0) // Export for '_opcode' shared extension PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); diff --git a/Python/specialize.c b/Python/specialize.c index 8ed2ad6276167d..65863249b66d17 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -215,7 +215,7 @@ static void print_histogram(FILE *out, const char *name, uint64_t hist[_Py_UOP_HIST_SIZE]) { for (int i = 0; i < _Py_UOP_HIST_SIZE; i++) { - fprintf(out, "%s[%d]: %" PRIu64 "\n", name, (1 << (i + 2)), hist[i]); + fprintf(out, "%s[%d]: %" PRIu64 "\n", name, (1 << (i + _Py_UOP_HIST_BIAS)), hist[i]); } } From e5ee4e5c2a6667e498ca9890b9cbab00c968f139 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Thu, 28 Sep 2023 08:41:46 -0400 Subject: [PATCH 12/16] Various simplifications from code review --- Tools/scripts/summarize_stats.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index f3bcd01b636c9a..7587069f4e0cf6 100644 --- 
a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -217,7 +217,7 @@ def extract_opcode_stats(stats, prefix): for key, value in stats.items(): if not key.startswith(prefix): continue - name, _, rest = key[len(prefix) + 2:].partition("]") + name, _, rest = key[len(prefix) + 1:].partition("]") opcode_stats[name][rest.strip(".")] = value return opcode_stats @@ -361,9 +361,9 @@ def emit_execution_counts(opcode_stats, total): ) def _emit_comparative_execution_counts(base_rows, head_rows): - base_data = dict((x[0], x[1:]) for x in base_rows) - head_data = dict((x[0], x[1:]) for x in head_rows) - opcodes = set(base_data.keys()) | set(head_data.keys()) + base_data = {x[0]: x[1:] for x in base_rows} + head_data = {x[0]: x[1:] for x in head_rows} + opcodes = base_data.keys() | head_data.keys() rows = [] default = [0, "0.0%", "0.0%", 0] @@ -376,9 +376,9 @@ def _emit_comparative_execution_counts(base_rows, head_rows): change = (head_entry[0] - base_entry[0]) / base_entry[0] rows.append( (opcode, base_entry[0], head_entry[0], - f"{100*change:0.1f}%")) + f"{change:0.1%}")) - rows.sort(key=lambda x: -abs(percentage_to_float(x[-1]))) + rows.sort(key=lambda x: abs(percentage_to_float(x[-1])), reverse=True) emit_table( ("Name", "Base Count:", "Head Count:", "Change:"), From 0dac9e56e0c3f11c54d0573c5c294684154f0495 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 2 Oct 2023 15:16:40 -0400 Subject: [PATCH 13/16] Bugfix getting opcode stats --- Tools/scripts/summarize_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index 7587069f4e0cf6..6bf9223035092b 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -674,7 +674,7 @@ def emit_optimization_stats(stats): if "Optimization attempts" not in stats: return - uop_stats = extract_opcode_stats(stats, "uop") + uop_stats = extract_opcode_stats(stats, "uops") with Section("Optimization (Tier 2) stats", summary="statistics about the Tier 2 optimizer"): with Section("Overall stats", level=3): From 69a6e7bdd5d7b00074a3726aa963de517fb72545 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 2 Oct 2023 17:08:16 -0400 Subject: [PATCH 14/16] Change bucketing. Change how execution length is calculated --- Include/cpython/pystats.h | 5 +- Include/internal/pycore_code.h | 9 +- Python/bytecodes.c | 2 +- Python/executor.c | 11 +- Python/executor_cases.c.h | 2 +- Python/specialize.c | 2 +- Tools/scripts/summarize_stats.py | 350 +++++++++++++++++++++---------- 7 files changed, 261 insertions(+), 120 deletions(-) diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index 99916ed74ac12a..056406e6b5d992 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -100,10 +100,7 @@ typedef struct _uop_stats { uint64_t execution_count; } UOpStats; -// Note that these should be set to support the largest possible trace length. -// i.e. 
log2(MAX_LENGTH) + 1 === (_Py_UOP_HIST_SIZE + _Py_UOP_HIST_BIAS) -#define _Py_UOP_HIST_SIZE 5 -#define _Py_UOP_HIST_BIAS 2 +#define _Py_UOP_HIST_SIZE 32 typedef struct _optimization_stats { uint64_t attempts; diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 9ac358862a3927..1699f88579cd29 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -285,7 +285,14 @@ extern int _PyStaticCode_Init(PyCodeObject *co); #define OPT_STAT_INC(name) do { if (_Py_stats) _Py_stats->optimization_stats.name++; } while (0) #define UOP_EXE_INC(opname) do { if (_Py_stats) _Py_stats->optimization_stats.opcode[opname].execution_count++; } while (0) #define OPT_UNSUPPORTED_OPCODE(opname) do { if (_Py_stats) _Py_stats->optimization_stats.unsupported_opcode[opname]++; } while (0) -#define OPT_HIST(length, name) do { if (_Py_stats) _Py_stats->optimization_stats.name[_Py_bit_length((length - 1) >> _Py_UOP_HIST_BIAS)]++; } while (0) +#define OPT_HIST(length, name) \ + do { \ + if (_Py_stats) { \ + int bucket = _Py_bit_length(length - 1); \ + bucket = (bucket >= _Py_UOP_HIST_SIZE) ? _Py_UOP_HIST_SIZE - 1 : bucket; \ + _Py_stats->optimization_stats.name[bucket]++; \ + } \ + } while (0) // Export for '_opcode' shared extension PyAPI_FUNC(PyObject*) _Py_GetSpecializationStats(void); diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 059264163d0a15..7a01464213312f 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -3913,7 +3913,7 @@ dummy_func( frame->prev_instr--; // Back up to just before destination _PyFrame_SetStackPointer(frame, stack_pointer); Py_DECREF(self); - OPT_HIST(pc, trace_run_length_hist); + OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); return frame; } diff --git a/Python/executor.c b/Python/executor.c index e509cfb1144a1d..6da3af08822fd0 100644 --- a/Python/executor.c +++ b/Python/executor.c @@ -79,6 +79,9 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject int opcode; int oparg; uint64_t operand; +#ifdef Py_STATS + uint64_t trace_uop_execution_counter = 0; +#endif for (;;) { opcode = self->trace[pc].opcode; @@ -94,6 +97,10 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject pc++; OPT_STAT_INC(uops_executed); UOP_EXE_INC(opcode); +#ifdef Py_STATS + trace_uop_execution_counter++; +#endif + switch (opcode) { #include "executor_cases.c.h" @@ -126,7 +133,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject // On ERROR_IF we return NULL as the frame. // The caller recovers the frame from tstate->current_frame. DPRINTF(2, "Error: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand); - OPT_HIST(pc, trace_run_length_hist); + OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); _PyFrame_SetStackPointer(frame, stack_pointer); Py_DECREF(self); return NULL; @@ -135,7 +142,7 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject // On DEOPT_IF we just repeat the last instruction. // This presumes nothing was popped from the stack (nor pushed). 
DPRINTF(2, "DEOPT: [Opcode %d, operand %" PRIu64 "]\n", opcode, operand); - OPT_HIST(pc, trace_run_length_hist); + OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); frame->prev_instr--; // Back up to just before destination _PyFrame_SetStackPointer(frame, stack_pointer); Py_DECREF(self); diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index cee0a723fe8855..86a01bf553b27a 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -3008,7 +3008,7 @@ frame->prev_instr--; // Back up to just before destination _PyFrame_SetStackPointer(frame, stack_pointer); Py_DECREF(self); - OPT_HIST(pc, trace_run_length_hist); + OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); return frame; break; } diff --git a/Python/specialize.c b/Python/specialize.c index 65863249b66d17..ff732eb30ca1e0 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -215,7 +215,7 @@ static void print_histogram(FILE *out, const char *name, uint64_t hist[_Py_UOP_HIST_SIZE]) { for (int i = 0; i < _Py_UOP_HIST_SIZE; i++) { - fprintf(out, "%s[%d]: %" PRIu64 "\n", name, (1 << (i + _Py_UOP_HIST_BIAS)), hist[i]); + fprintf(out, "%s[%" PRIu64"]: %" PRIu64 "\n", name, (uint64_t)1 << i, hist[i]); } } diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py index 6bf9223035092b..840de1fa345c5e 100644 --- a/Tools/scripts/summarize_stats.py +++ b/Tools/scripts/summarize_stats.py @@ -22,6 +22,7 @@ TOTAL = "specialization.hit", "specialization.miss", "execution_count" + def format_ratio(num, den): """ Format a ratio as a percentage. When the denominator is 0, returns the empty @@ -32,6 +33,7 @@ def format_ratio(num, den): else: return f"{num/den:.01%}" + def percentage_to_float(s): """ Converts a percentage string to a float. 
The empty string is returned as 0.0 @@ -42,6 +44,7 @@ def percentage_to_float(s): assert s[-1] == "%" return float(s[:-1]) + def join_rows(a_rows, b_rows): """ Joins two tables together, side-by-side, where the first column in each is a @@ -80,40 +83,53 @@ def join_rows(a_rows, b_rows): keys = list(a_data.keys()) + [k for k in b_data.keys() if k not in a_data] return [(k, *a_data.get(k, default), *b_data.get(k, default)) for k in keys] + def calculate_specialization_stats(family_stats, total): rows = [] for key in sorted(family_stats): if key.startswith("specialization.failure_kinds"): continue if key in ("specialization.hit", "specialization.miss"): - label = key[len("specialization."):] + label = key[len("specialization.") :] elif key == "execution_count": continue - elif key in ("specialization.success", "specialization.failure", "specializable"): + elif key in ( + "specialization.success", + "specialization.failure", + "specializable", + ): continue elif key.startswith("pair"): continue else: label = key - rows.append((f"{label:>12}", f"{family_stats[key]:>12}", format_ratio(family_stats[key], total))) + rows.append( + ( + f"{label:>12}", + f"{family_stats[key]:>12}", + format_ratio(family_stats[key], total), + ) + ) return rows + def calculate_specialization_success_failure(family_stats): total_attempts = 0 - for key in ("specialization.success", "specialization.failure"): + for key in ("specialization.success", "specialization.failure"): total_attempts += family_stats.get(key, 0) rows = [] if total_attempts: - for key in ("specialization.success", "specialization.failure"): - label = key[len("specialization."):] + for key in ("specialization.success", "specialization.failure"): + label = key[len("specialization.") :] label = label[0].upper() + label[1:] val = family_stats.get(key, 0) rows.append((label, val, format_ratio(val, total_attempts))) return rows + def calculate_specialization_failure_kinds(name, family_stats, defines): total_failures = family_stats.get("specialization.failure", 0) - failure_kinds = [ 0 ] * 40 + failure_kinds = [0] * 40 for key in family_stats: if not key.startswith("specialization.failure_kind"): continue @@ -126,9 +142,16 @@ def calculate_specialization_failure_kinds(name, family_stats, defines): for value, index in failures: if not value: continue - rows.append((kind_to_text(index, defines, name), value, format_ratio(value, total_failures))) + rows.append( + ( + kind_to_text(index, defines, name), + value, + format_ratio(value, total_failures), + ) + ) return rows + def print_specialization_stats(name, family_stats, defines): if "specializable" not in family_stats: return @@ -145,7 +168,10 @@ def print_specialization_stats(name, family_stats, defines): rows = calculate_specialization_failure_kinds(name, family_stats, defines) emit_table(("Failure kind", "Count:", "Ratio:"), rows) -def print_comparative_specialization_stats(name, base_family_stats, head_family_stats, defines): + +def print_comparative_specialization_stats( + name, base_family_stats, head_family_stats, defines +): if "specializable" not in base_family_stats: return @@ -158,20 +184,33 @@ def print_comparative_specialization_stats(name, base_family_stats, head_family_ head_rows = calculate_specialization_stats(head_family_stats, head_total) emit_table( ("Kind", "Base Count", "Base Ratio", "Head Count", "Head Ratio"), - join_rows(base_rows, head_rows) + join_rows(base_rows, head_rows), ) base_rows = calculate_specialization_success_failure(base_family_stats) head_rows = 
calculate_specialization_success_failure(head_family_stats) rows = join_rows(base_rows, head_rows) if rows: print_title("Specialization attempts", 4) - emit_table(("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), rows) - base_rows = calculate_specialization_failure_kinds(name, base_family_stats, defines) - head_rows = calculate_specialization_failure_kinds(name, head_family_stats, defines) emit_table( - ("Failure kind", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), - join_rows(base_rows, head_rows) + ("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), rows ) + base_rows = calculate_specialization_failure_kinds( + name, base_family_stats, defines + ) + head_rows = calculate_specialization_failure_kinds( + name, head_family_stats, defines + ) + emit_table( + ( + "Failure kind", + "Base Count:", + "Base Ratio:", + "Head Count:", + "Head Ratio:", + ), + join_rows(base_rows, head_rows), + ) + def gather_stats(input): # Note the output of this function must be JSON-serializable @@ -180,7 +219,9 @@ def gather_stats(input): with open(input, "r") as fd: stats = json.load(fd) - stats["_stats_defines"] = {int(k): v for k, v in stats["_stats_defines"].items()} + stats["_stats_defines"] = { + int(k): v for k, v in stats["_stats_defines"].items() + } stats["_defines"] = {int(k): v for k, v in stats["_defines"].items()} return stats @@ -192,18 +233,20 @@ def gather_stats(input): try: key, value = line.split(":") except ValueError: - print(f"Unparsable line: '{line.strip()}' in {filename}", file=sys.stderr) + print( + f"Unparsable line: '{line.strip()}' in {filename}", + file=sys.stderr, + ) continue key = key.strip() value = int(value) stats[key] += value - stats['__nfiles__'] += 1 + stats["__nfiles__"] += 1 import opcode stats["_specialized_instructions"] = [ - op for op in opcode._specialized_opmap.keys() - if "__" not in op + op for op in opcode._specialized_opmap.keys() if "__" not in op ] stats["_stats_defines"] = get_stats_defines() stats["_defines"] = get_defines() @@ -212,15 +255,17 @@ def gather_stats(input): else: raise ValueError(f"{input:r} is not a file or directory path") + def extract_opcode_stats(stats, prefix): opcode_stats = collections.defaultdict(dict) for key, value in stats.items(): if not key.startswith(prefix): continue - name, _, rest = key[len(prefix) + 1:].partition("]") + name, _, rest = key[len(prefix) + 1 :].partition("]") opcode_stats[name][rest.strip(".")] = value return opcode_stats + def parse_kinds(spec_src, prefix="SPEC_FAIL"): defines = collections.defaultdict(list) start = "#define " + prefix + "_" @@ -228,14 +273,16 @@ def parse_kinds(spec_src, prefix="SPEC_FAIL"): line = line.strip() if not line.startswith(start): continue - line = line[len(start):] + line = line[len(start) :] name, val = line.split() defines[int(val.strip())].append(name.strip()) return defines + def pretty(defname): return defname.replace("_", " ").lower() + def kind_to_text(kind, defines, opname): if kind <= 8: return pretty(defines[kind][0]) @@ -249,9 +296,10 @@ def kind_to_text(kind, defines, opname): opname = "SUBSCR" for name in defines[kind]: if name.startswith(opname): - return pretty(name[len(opname)+1:]) + return pretty(name[len(opname) + 1 :]) return "kind " + str(kind) + def categorized_counts(opcode_stats, specialized_instructions): basic = 0 specialized = 0 @@ -259,7 +307,7 @@ def categorized_counts(opcode_stats, specialized_instructions): for name, opcode_stat in opcode_stats.items(): if "execution_count" not in opcode_stat: continue - 
count = opcode_stat['execution_count'] + count = opcode_stat["execution_count"] if "specializable" in opcode_stat: not_specialized += count elif name in specialized_instructions: @@ -270,12 +318,13 @@ def categorized_counts(opcode_stats, specialized_instructions): basic += count return basic, not_specialized, specialized + def print_title(name, level=2): - print("#"*level, name) + print("#" * level, name) print() -class Section: +class Section: def __init__(self, title, level=2, summary=None): self.title = title self.level = level @@ -296,12 +345,14 @@ def __exit__(*args): print("") print() + def to_str(x): if isinstance(x, int): return format(x, ",d") else: return str(x) + def emit_table(header, rows): width = len(header) header_line = "|" @@ -321,20 +372,28 @@ def emit_table(header, rows): print("|", " | ".join(to_str(i) for i in row), "|") print() + def emit_histogram(title, stats, key, total): rows = [] for k, v in stats.items(): if k.startswith(key): entry = int(re.match(r".+\[([0-9]+)\]", k).groups()[0]) rows.append((f"<= {entry}", int(v), format_ratio(int(v), total))) + # Don't include larger buckets with 0 entries + for j in range(len(rows) - 1, -1, -1): + if rows[j][1] != 0: + break + rows = rows[: j + 1] + print(f"**{title}**\n") emit_table(("Range", "Count:", "Ratio:"), rows) + def calculate_execution_counts(opcode_stats, total): counts = [] for name, opcode_stat in opcode_stats.items(): if "execution_count" in opcode_stat: - count = opcode_stat['execution_count'] + count = opcode_stat["execution_count"] miss = 0 if "specializable" not in opcode_stat: miss = opcode_stat.get("specialization.miss") @@ -342,23 +401,29 @@ def calculate_execution_counts(opcode_stats, total): counts.sort(reverse=True) cumulative = 0 rows = [] - for (count, name, miss) in counts: + for count, name, miss in counts: cumulative += count if miss: miss = format_ratio(miss, count) else: miss = "" - rows.append((name, count, format_ratio(count, total), - format_ratio(cumulative, total), miss)) + rows.append( + ( + name, + count, + format_ratio(count, total), + format_ratio(cumulative, total), + miss, + ) + ) return rows + def emit_execution_counts(opcode_stats, total): with Section("Execution counts", summary="execution counts for all instructions"): rows = calculate_execution_counts(opcode_stats, total) - emit_table( - ("Name", "Count:", "Self:", "Cumulative:", "Miss ratio:"), - rows - ) + emit_table(("Name", "Count:", "Self:", "Cumulative:", "Miss ratio:"), rows) + def _emit_comparative_execution_counts(base_rows, head_rows): base_data = {x[0]: x[1:] for x in base_rows} @@ -374,37 +439,40 @@ def _emit_comparative_execution_counts(base_rows, head_rows): change = 1 else: change = (head_entry[0] - base_entry[0]) / base_entry[0] - rows.append( - (opcode, base_entry[0], head_entry[0], - f"{change:0.1%}")) + rows.append((opcode, base_entry[0], head_entry[0], f"{change:0.1%}")) rows.sort(key=lambda x: abs(percentage_to_float(x[-1])), reverse=True) - emit_table( - ("Name", "Base Count:", "Head Count:", "Change:"), - rows - ) + emit_table(("Name", "Base Count:", "Head Count:", "Change:"), rows) + def emit_comparative_execution_counts( base_opcode_stats, base_total, head_opcode_stats, head_total, level=2 ): - with Section("Execution counts", summary="execution counts for all instructions", level=level): + with Section( + "Execution counts", summary="execution counts for all instructions", level=level + ): base_rows = calculate_execution_counts(base_opcode_stats, base_total) head_rows = 
calculate_execution_counts(head_opcode_stats, head_total) _emit_comparative_execution_counts(base_rows, head_rows) + def get_defines(): spec_path = os.path.join(os.path.dirname(__file__), "../../Python/specialize.c") with open(spec_path) as spec_src: defines = parse_kinds(spec_src) return defines + def emit_specialization_stats(opcode_stats, defines): with Section("Specialization stats", summary="specialization stats by family"): for name, opcode_stat in opcode_stats.items(): print_specialization_stats(name, opcode_stat, defines) -def emit_comparative_specialization_stats(base_opcode_stats, head_opcode_stats, defines): + +def emit_comparative_specialization_stats( + base_opcode_stats, head_opcode_stats, defines +): with Section("Specialization stats", summary="specialization stats by family"): opcodes = set(base_opcode_stats.keys()) & set(head_opcode_stats.keys()) for opcode in opcodes: @@ -412,6 +480,7 @@ def emit_comparative_specialization_stats(base_opcode_stats, head_opcode_stats, opcode, base_opcode_stats[opcode], head_opcode_stats[opcode], defines ) + def calculate_specialization_effectiveness( opcode_stats, total, specialized_instructions ): @@ -424,11 +493,17 @@ def calculate_specialization_effectiveness( ("Specialized", specialized, format_ratio(specialized, total)), ] + def emit_specialization_overview(opcode_stats, total, specialized_instructions): with Section("Specialization effectiveness"): - rows = calculate_specialization_effectiveness(opcode_stats, total, specialized_instructions) + rows = calculate_specialization_effectiveness( + opcode_stats, total, specialized_instructions + ) emit_table(("Instructions", "Count:", "Ratio:"), rows) - for title, field in (("Deferred", "specialization.deferred"), ("Misses", "specialization.miss")): + for title, field in ( + ("Deferred", "specialization.deferred"), + ("Misses", "specialization.miss"), + ): total = 0 counts = [] for name, opcode_stat in opcode_stats.items(): @@ -441,11 +516,19 @@ def emit_specialization_overview(opcode_stats, total, specialized_instructions): counts.sort(reverse=True) if total: with Section(f"{title} by instruction", 3): - rows = [ (name, count, format_ratio(count, total)) for (count, name) in counts[:10] ] + rows = [ + (name, count, format_ratio(count, total)) + for (count, name) in counts[:10] + ] emit_table(("Name", "Count:", "Ratio:"), rows) + def emit_comparative_specialization_overview( - base_opcode_stats, base_total, head_opcode_stats, head_total, specialized_instructions + base_opcode_stats, + base_total, + head_opcode_stats, + head_total, + specialized_instructions, ): with Section("Specialization effectiveness"): base_rows = calculate_specialization_effectiveness( @@ -455,16 +538,26 @@ def emit_comparative_specialization_overview( head_opcode_stats, head_total, specialized_instructions ) emit_table( - ("Instructions", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), - join_rows(base_rows, head_rows) + ( + "Instructions", + "Base Count:", + "Base Ratio:", + "Head Count:", + "Head Ratio:", + ), + join_rows(base_rows, head_rows), ) + def get_stats_defines(): - stats_path = os.path.join(os.path.dirname(__file__), "../../Include/cpython/pystats.h") + stats_path = os.path.join( + os.path.dirname(__file__), "../../Include/cpython/pystats.h" + ) with open(stats_path) as stats_src: defines = parse_kinds(stats_src, prefix="EVAL_CALL") return defines + def calculate_call_stats(stats, defines): total = 0 for key, value in stats.items(): @@ -476,7 +569,7 @@ def calculate_call_stats(stats, defines): 
rows.append((key, value, format_ratio(value, total))) elif key.startswith("Calls "): name, index = key[:-1].split("[") - index = int(index) + index = int(index) label = name + " (" + pretty(defines[index][0]) + ")" rows.append((label, value, format_ratio(value, total))) for key, value in stats.items(): @@ -484,11 +577,13 @@ def calculate_call_stats(stats, defines): rows.append((key, value, format_ratio(value, total))) return rows + def emit_call_stats(stats, defines): with Section("Call stats", summary="Inlined calls and frame stats"): rows = calculate_call_stats(stats, defines) emit_table(("", "Count:", "Ratio:"), rows) + def emit_comparative_call_stats(base_stats, head_stats, defines): with Section("Call stats", summary="Inlined calls and frame stats"): base_rows = calculate_call_stats(base_stats, defines) @@ -496,15 +591,21 @@ def emit_comparative_call_stats(base_stats, head_stats, defines): rows = join_rows(base_rows, head_rows) rows.sort(key=lambda x: -percentage_to_float(x[-1])) emit_table( - ("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), - rows + ("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), rows ) + def calculate_object_stats(stats): total_materializations = stats.get("Object new values") - total_allocations = stats.get("Object allocations") + stats.get("Object allocations from freelist") - total_increfs = stats.get("Object interpreter increfs") + stats.get("Object increfs") - total_decrefs = stats.get("Object interpreter decrefs") + stats.get("Object decrefs") + total_allocations = stats.get("Object allocations") + stats.get( + "Object allocations from freelist" + ) + total_increfs = stats.get("Object interpreter increfs") + stats.get( + "Object increfs" + ) + total_decrefs = stats.get("Object interpreter decrefs") + stats.get( + "Object decrefs" + ) rows = [] for key, value in stats.items(): if key.startswith("Object"): @@ -512,9 +613,9 @@ def calculate_object_stats(stats): ratio = format_ratio(value, total_materializations) elif "allocations" in key: ratio = format_ratio(value, total_allocations) - elif "increfs" in key: + elif "increfs" in key: ratio = format_ratio(value, total_increfs) - elif "decrefs" in key: + elif "decrefs" in key: ratio = format_ratio(value, total_decrefs) else: ratio = "" @@ -523,6 +624,7 @@ def calculate_object_stats(stats): rows.append((label, value, ratio)) return rows + def calculate_gc_stats(stats): gc_stats = [] for key, value in stats.items(): @@ -539,40 +641,58 @@ def calculate_gc_stats(stats): for (i, gen) in enumerate(gc_stats) ] + def emit_object_stats(stats): with Section("Object stats", summary="allocations, frees and dict materializatons"): rows = calculate_object_stats(stats) - emit_table(("", "Count:", "Ratio:"), rows) + emit_table(("", "Count:", "Ratio:"), rows) + def emit_comparative_object_stats(base_stats, head_stats): with Section("Object stats", summary="allocations, frees and dict materializatons"): base_rows = calculate_object_stats(base_stats) head_rows = calculate_object_stats(head_stats) - emit_table(("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), join_rows(base_rows, head_rows)) + emit_table( + ("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), + join_rows(base_rows, head_rows), + ) + def emit_gc_stats(stats): with Section("GC stats", summary="GC collections and effectiveness"): rows = calculate_gc_stats(stats) - emit_table(("Generation:", "Collections:", "Objects collected:", "Object visits:"), rows) + emit_table( + ("Generation:", "Collections:", "Objects 
collected:", "Object visits:"), + rows, + ) + def emit_comparative_gc_stats(base_stats, head_stats): with Section("GC stats", summary="GC collections and effectiveness"): base_rows = calculate_gc_stats(base_stats) head_rows = calculate_gc_stats(head_stats) emit_table( - ("Generation:", - "Base collections:", "Head collections:", - "Base objects collected:", "Head objects collected:", - "Base object visits:", "Head object visits:"), - join_rows(base_rows, head_rows)) + ( + "Generation:", + "Base collections:", + "Head collections:", + "Base objects collected:", + "Head objects collected:", + "Base object visits:", + "Head object visits:", + ), + join_rows(base_rows, head_rows), + ) + def get_total(opcode_stats): total = 0 for opcode_stat in opcode_stats.values(): if "execution_count" in opcode_stat: - total += opcode_stat['execution_count'] + total += opcode_stat["execution_count"] return total + def emit_pair_counts(opcode_stats, total): pair_counts = [] for name_i, opcode_stat in opcode_stats.items(): @@ -585,15 +705,22 @@ def emit_pair_counts(opcode_stats, total): pair_counts.sort(reverse=True) cumulative = 0 rows = [] - for (count, pair) in itertools.islice(pair_counts, 100): + for count, pair in itertools.islice(pair_counts, 100): name_i, name_j = pair cumulative += count - rows.append((f"{name_i} {name_j}", count, format_ratio(count, total), - format_ratio(cumulative, total))) - emit_table(("Pair", "Count:", "Self:", "Cumulative:"), - rows - ) - with Section("Predecessor/Successor Pairs", summary="Top 5 predecessors and successors of each opcode"): + rows.append( + ( + f"{name_i} {name_j}", + count, + format_ratio(count, total), + format_ratio(cumulative, total), + ) + ) + emit_table(("Pair", "Count:", "Self:", "Cumulative:"), rows) + with Section( + "Predecessor/Successor Pairs", + summary="Top 5 predecessors and successors of each opcode", + ): predecessors = collections.defaultdict(collections.Counter) successors = collections.defaultdict(collections.Counter) total_predecessors = collections.Counter() @@ -611,18 +738,18 @@ def emit_pair_counts(opcode_stats, total): continue pred_rows = succ_rows = () if total1: - pred_rows = [(pred, count, f"{count/total1:.1%}") - for (pred, count) in predecessors[name].most_common(5)] + pred_rows = [ + (pred, count, f"{count/total1:.1%}") + for (pred, count) in predecessors[name].most_common(5) + ] if total2: - succ_rows = [(succ, count, f"{count/total2:.1%}") - for (succ, count) in successors[name].most_common(5)] + succ_rows = [ + (succ, count, f"{count/total2:.1%}") + for (succ, count) in successors[name].most_common(5) + ] with Section(name, 3, f"Successors and predecessors for {name}"): - emit_table(("Predecessors", "Count:", "Percentage:"), - pred_rows - ) - emit_table(("Successors", "Count:", "Percentage:"), - succ_rows - ) + emit_table(("Predecessors", "Count:", "Percentage:"), pred_rows) + emit_table(("Successors", "Count:", "Percentage:"), succ_rows) def calculate_optimization_stats(stats): @@ -638,17 +765,14 @@ def calculate_optimization_stats(stats): return [ ("Optimization attempts", attempts, ""), - ( - "Traces created", created, - format_ratio(created, attempts) - ), + ("Traces created", created, format_ratio(created, attempts)), ("Traces executed", executed, ""), ("Uops executed", uops, int(uops / executed)), ("Trace stack overflow", trace_stack_overflow, ""), ("Trace stack underflow", trace_stack_underflow, ""), ("Trace too long", trace_too_long, ""), ("Inner loop found", inner_loop, ""), - ("Recursive call", recursive_call, "") + 
("Recursive call", recursive_call, ""), ] @@ -657,16 +781,17 @@ def calculate_uop_execution_counts(opcode_stats): counts = [] for name, opcode_stat in opcode_stats.items(): if "execution_count" in opcode_stat: - count = opcode_stat['execution_count'] + count = opcode_stat["execution_count"] counts.append((count, name)) total += count counts.sort(reverse=True) cumulative = 0 rows = [] - for (count, name) in counts: + for count, name in counts: cumulative += count - rows.append((name, count, format_ratio(count, total), - format_ratio(cumulative, total))) + rows.append( + (name, count, format_ratio(count, total), format_ratio(cumulative, total)) + ) return rows @@ -676,7 +801,9 @@ def emit_optimization_stats(stats): uop_stats = extract_opcode_stats(stats, "uops") - with Section("Optimization (Tier 2) stats", summary="statistics about the Tier 2 optimizer"): + with Section( + "Optimization (Tier 2) stats", summary="statistics about the Tier 2 optimizer" + ): with Section("Overall stats", level=3): rows = calculate_optimization_stats(stats) emit_table(("", "Count:", "Ratio:"), rows) @@ -685,27 +812,24 @@ def emit_optimization_stats(stats): "Trace length histogram", stats, "Trace length", - stats["Optimization traces created"] + stats["Optimization traces created"], ) emit_histogram( "Optimized trace length histogram", stats, "Optimized trace length", - stats["Optimization traces created"] + stats["Optimization traces created"], ) emit_histogram( "Trace run length histogram", stats, "Trace run length", - stats["Optimization traces executed"] + stats["Optimization traces executed"], ) with Section("Uop stats", level=3): rows = calculate_uop_execution_counts(uop_stats) - emit_table( - ("Uop", "Count:", "Self:", "Cumulative:"), - rows - ) + emit_table(("Uop", "Count:", "Self:", "Cumulative:"), rows) with Section("Unsupported opcodes", level=3): unsupported_opcodes = extract_opcode_stats(stats, "unsupported_opcode") @@ -727,13 +851,15 @@ def output_single_stats(stats): emit_execution_counts(opcode_stats, total) emit_pair_counts(opcode_stats, total) emit_specialization_stats(opcode_stats, stats["_defines"]) - emit_specialization_overview(opcode_stats, total, stats["_specialized_instructions"]) + emit_specialization_overview( + opcode_stats, total, stats["_specialized_instructions"] + ) emit_call_stats(stats, stats["_stats_defines"]) emit_object_stats(stats) emit_gc_stats(stats) emit_optimization_stats(stats) with Section("Meta stats", summary="Meta statistics"): - emit_table(("", "Count:"), [('Number of data files', stats['__nfiles__'])]) + emit_table(("", "Count:"), [("Number of data files", stats["__nfiles__"])]) def output_comparative_stats(base_stats, head_stats): @@ -750,14 +876,18 @@ def output_comparative_stats(base_stats, head_stats): base_opcode_stats, head_opcode_stats, head_stats["_defines"] ) emit_comparative_specialization_overview( - base_opcode_stats, base_total, head_opcode_stats, head_total, - head_stats["_specialized_instructions"] + base_opcode_stats, + base_total, + head_opcode_stats, + head_total, + head_stats["_specialized_instructions"], ) emit_comparative_call_stats(base_stats, head_stats, head_stats["_stats_defines"]) emit_comparative_object_stats(base_stats, head_stats) emit_comparative_gc_stats(base_stats, head_stats) emit_comparative_optimization_stats(base_stats, head_stats) + def output_stats(inputs, json_output=None): if len(inputs) == 1: stats = gather_stats(inputs[0]) @@ -766,9 +896,7 @@ def output_stats(inputs, json_output=None): output_single_stats(stats) elif 
len(inputs) == 2:
         if json_output is not None:
-            raise ValueError(
-                "Can not output to JSON when there are multiple inputs"
-            )
+            raise ValueError("Can not output to JSON when there are multiple inputs")
 
         base_stats = gather_stats(inputs[0])
         head_stats = gather_stats(inputs[1])
@@ -777,6 +905,7 @@ def output_stats(inputs, json_output=None):
     print("---")
     print("Stats gathered on:", date.today())
 
+
 def main():
     parser = argparse.ArgumentParser(description="Summarize pystats results")
 
@@ -792,14 +921,14 @@ def main():
         If one source is provided, its stats are printed.
         If two sources are provided, comparative stats are printed.
         Default is {DEFAULT_DIR}.
-        """
+        """,
     )
 
     parser.add_argument(
         "--json-output",
         nargs="?",
         type=argparse.FileType("w"),
-        help="Output complete raw results to the given JSON file."
+        help="Output complete raw results to the given JSON file.",
     )
 
     args = parser.parse_args()
@@ -809,5 +938,6 @@ def main():
 
     output_stats(args.inputs, json_output=args.json_output)
 
+
 if __name__ == "__main__":
     main()

From d7c1403e860792bd2940285fb5eb2079255c5461 Mon Sep 17 00:00:00 2001
From: Michael Droettboom
Date: Tue, 3 Oct 2023 11:27:19 -0400
Subject: [PATCH 15/16] Clamp lower range of histogram as well

---
 Include/internal/pycore_code.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h
index 1699f88579cd29..d31d8363d771ca 100644
--- a/Include/internal/pycore_code.h
+++ b/Include/internal/pycore_code.h
@@ -288,7 +288,7 @@ extern int _PyStaticCode_Init(PyCodeObject *co);
 #define OPT_HIST(length, name) \
     do { \
         if (_Py_stats) { \
-            int bucket = _Py_bit_length(length - 1); \
+            int bucket = _Py_bit_length(length >= 1 ? length - 1 : 0); \
             bucket = (bucket >= _Py_UOP_HIST_SIZE) ? _Py_UOP_HIST_SIZE - 1 : bucket; \
             _Py_stats->optimization_stats.name[bucket]++; \
         } \

From aa69c0f8017e2fe6c942bd764ff7bcaaa2b97c06 Mon Sep 17 00:00:00 2001
From: Michael Droettboom
Date: Tue, 3 Oct 2023 11:46:36 -0400
Subject: [PATCH 16/16] Fix divide-by-zero

---
 Tools/scripts/summarize_stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py
index 840de1fa345c5e..b9cc2f94080b88 100644
--- a/Tools/scripts/summarize_stats.py
+++ b/Tools/scripts/summarize_stats.py
@@ -767,7 +767,7 @@ def calculate_optimization_stats(stats):
         ("Optimization attempts", attempts, ""),
         ("Traces created", created, format_ratio(created, attempts)),
         ("Traces executed", executed, ""),
-        ("Uops executed", uops, int(uops / executed)),
+        ("Uops executed", uops, int(uops / (executed or 1))),
         ("Trace stack overflow", trace_stack_overflow, ""),
         ("Trace stack underflow", trace_stack_underflow, ""),
         ("Trace too long", trace_too_long, ""),
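
Illustration only, not part of the patch series: the minimal Python sketch below
mirrors the power-of-two bucketing that the OPT_HIST macro in pycore_code.h
performs, and shows why PATCH 15/16 clamps the lower range as well: lengths of
0 and 1 both land in bucket 0, and the revised print_histogram() labels each
bucket with the largest length it covers. The histogram size of 32 buckets and
the helper name opt_hist_bucket are assumptions made for this example.

    # Sketch of OPT_HIST bucketing (assumes _Py_UOP_HIST_SIZE == 32).
    _PY_UOP_HIST_SIZE = 32

    def opt_hist_bucket(length: int) -> int:
        """Map a trace/run length to a histogram bucket, clamping both ends."""
        # (length - 1).bit_length() equals ceil(log2(length)) for length >= 1;
        # lengths below 1 are clamped to bucket 0, as in the C macro's
        # `length >= 1 ? length - 1 : 0`.
        bucket = (length - 1).bit_length() if length >= 1 else 0
        # Overly long traces are clamped into the last bucket.
        return min(bucket, _PY_UOP_HIST_SIZE - 1)

    # Bucket i covers lengths up to 2**i, matching the "name[1 << i]" labels
    # printed by print_histogram() and the "<= N" ranges in summarize_stats.py.
    assert [opt_hist_bucket(n) for n in (0, 1, 2, 3, 4, 9)] == [0, 0, 1, 2, 2, 4]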