From 2749a44d99576c1dffe9eeb2467ce9a8cfaaf953 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Sat, 21 Mar 2026 16:57:18 +0100 Subject: [PATCH 1/3] first stab --- Lib/profiling/sampling/__init__.py | 11 +- Lib/profiling/sampling/binary_reader.py | 3 + Lib/profiling/sampling/cli.py | 25 ++- Lib/profiling/sampling/ndjson_collector.py | 216 +++++++++++++++++++++ 4 files changed, 250 insertions(+), 5 deletions(-) create mode 100644 Lib/profiling/sampling/ndjson_collector.py diff --git a/Lib/profiling/sampling/__init__.py b/Lib/profiling/sampling/__init__.py index 6a0bb5e5c2f387..21d3a773a2ba63 100644 --- a/Lib/profiling/sampling/__init__.py +++ b/Lib/profiling/sampling/__init__.py @@ -9,6 +9,15 @@ from .stack_collector import CollapsedStackCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector +from .ndjson_collector import NdjsonCollector from .string_table import StringTable -__all__ = ("Collector", "PstatsCollector", "CollapsedStackCollector", "HeatmapCollector", "GeckoCollector", "StringTable") +__all__ = ( + "Collector", + "PstatsCollector", + "CollapsedStackCollector", + "HeatmapCollector", + "GeckoCollector", + "NdjsonCollector", + "StringTable", +) diff --git a/Lib/profiling/sampling/binary_reader.py b/Lib/profiling/sampling/binary_reader.py index a11be3652597a6..d5bfc0d6130f1a 100644 --- a/Lib/profiling/sampling/binary_reader.py +++ b/Lib/profiling/sampling/binary_reader.py @@ -4,6 +4,7 @@ from .gecko_collector import GeckoCollector from .stack_collector import FlamegraphCollector, CollapsedStackCollector +from .ndjson_collector import NdjsonCollector from .pstats_collector import PstatsCollector @@ -117,6 +118,8 @@ def convert_binary_to_format(input_file, output_file, output_format, collector = PstatsCollector(interval) elif output_format == 'gecko': collector = GeckoCollector(interval) + elif output_format == 'ndjson': + collector = NdjsonCollector(interval) else: raise ValueError(f"Unknown output format: {output_format}") diff --git a/Lib/profiling/sampling/cli.py b/Lib/profiling/sampling/cli.py index f4b31aad45b922..4f9e784f80495d 100644 --- a/Lib/profiling/sampling/cli.py +++ b/Lib/profiling/sampling/cli.py @@ -19,6 +19,7 @@ from .stack_collector import CollapsedStackCollector, FlamegraphCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector +from .ndjson_collector import NdjsonCollector from .binary_collector import BinaryCollector from .binary_reader import BinaryReader from .constants import ( @@ -87,6 +88,7 @@ class CustomFormatter( "flamegraph": "html", "gecko": "json", "heatmap": "html", + "ndjson": "ndjson", "binary": "bin", } @@ -96,6 +98,7 @@ class CustomFormatter( "flamegraph": FlamegraphCollector, "gecko": GeckoCollector, "heatmap": HeatmapCollector, + "ndjson": NdjsonCollector, "binary": BinaryCollector, } @@ -467,6 +470,13 @@ def _add_format_options(parser, include_compression=True, include_binary=True): dest="format", help="Generate interactive HTML heatmap visualization with line-level sample counts", ) + format_group.add_argument( + "--ndjson", + action="store_const", + const="ndjson", + dest="format", + help="Generate NDJSON snapshot output for external consumers", + ) if include_binary: format_group.add_argument( "--binary", @@ -545,15 +555,17 @@ def _sort_to_mode(sort_choice): return sort_map.get(sort_choice, SORT_MODE_NSAMPLES) def _create_collector(format_type, sample_interval_usec, skip_idle, opcodes=False, - output_file=None, compression='auto'): + mode=None, output_file=None, compression='auto'): """Create the appropriate collector based on format type. Args: - format_type: The output format ('pstats', 'collapsed', 'flamegraph', 'gecko', 'heatmap', 'binary') + format_type: The output format ('pstats', 'collapsed', 'flamegraph', + 'gecko', 'heatmap', 'ndjson', 'binary') sample_interval_usec: Sampling interval in microseconds skip_idle: Whether to skip idle samples opcodes: Whether to collect opcode information (only used by gecko format for creating interval markers in Firefox Profiler) + mode: Profiling mode for collectors that expose it in metadata output_file: Output file path (required for binary format) compression: Compression type for binary format ('auto', 'zstd', 'none') @@ -577,6 +589,11 @@ def _create_collector(format_type, sample_interval_usec, skip_idle, opcodes=Fals skip_idle = False return collector_class(sample_interval_usec, skip_idle=skip_idle, opcodes=opcodes) + if format_type == "ndjson": + return collector_class( + sample_interval_usec, skip_idle=skip_idle, mode=mode + ) + return collector_class(sample_interval_usec, skip_idle=skip_idle) @@ -951,7 +968,7 @@ def _handle_attach(args): # Create the appropriate collector collector = _create_collector( - args.format, args.sample_interval_usec, skip_idle, args.opcodes, + args.format, args.sample_interval_usec, skip_idle, args.opcodes, mode, output_file=output_file, compression=getattr(args, 'compression', 'auto') ) @@ -1029,7 +1046,7 @@ def _handle_run(args): # Create the appropriate collector collector = _create_collector( - args.format, args.sample_interval_usec, skip_idle, args.opcodes, + args.format, args.sample_interval_usec, skip_idle, args.opcodes, mode, output_file=output_file, compression=getattr(args, 'compression', 'auto') ) diff --git a/Lib/profiling/sampling/ndjson_collector.py b/Lib/profiling/sampling/ndjson_collector.py new file mode 100644 index 00000000000000..123ec1c5ea9a1c --- /dev/null +++ b/Lib/profiling/sampling/ndjson_collector.py @@ -0,0 +1,216 @@ +"""NDJSON collector.""" + +import json +import uuid +from itertools import batched + +from .constants import ( + PROFILING_MODE_ALL, + PROFILING_MODE_CPU, + PROFILING_MODE_EXCEPTION, + PROFILING_MODE_GIL, + PROFILING_MODE_WALL, +) +from .stack_collector import StackTraceCollector + + +_CHUNK_SIZE = 1000 + +_MODE_NAMES = { + PROFILING_MODE_WALL: "wall", + PROFILING_MODE_CPU: "cpu", + PROFILING_MODE_GIL: "gil", + PROFILING_MODE_ALL: "all", + PROFILING_MODE_EXCEPTION: "exception", +} + + +class NdjsonCollector(StackTraceCollector): + """Collector that exports finalized profiling data as NDJSON.""" + + def __init__(self, sample_interval_usec, *, skip_idle=False, mode=None): + super().__init__(sample_interval_usec, skip_idle=skip_idle) + self.run_id = uuid.uuid4().hex + + self._string_to_id = {} + self._strings = [] + + self._frame_to_id = {} + self._frames = [] + + self._frame_self = {} + self._frame_cumulative = {} + self._samples_total = 0 + + self._mode = mode + + def process_frames(self, frames, _thread_id, weight=1): + if not frames: + return + + self._samples_total += weight + + frame_ids = [ + self._get_or_create_frame_id(filename, location, funcname) + for filename, location, funcname, _opcode in frames + ] + leaf_frame_id = frame_ids[0] + + self._frame_self[leaf_frame_id] = ( + self._frame_self.get(leaf_frame_id, 0) + weight + ) + + for frame_id in set(frame_ids): + self._frame_cumulative[frame_id] = ( + self._frame_cumulative.get(frame_id, 0) + weight + ) + + def export(self, filename): + with open(filename, "w", encoding="utf-8") as output: + self._write_message(output, self._build_meta_record()) + self._write_chunked_defs(output, "str_def", self._strings) + self._write_chunked_defs(output, "frame_def", self._frames) + self._write_chunked_agg(output, self._iter_agg_entries()) + self._write_message( + output, + { + "type": "end", + "v": 1, + "run_id": self.run_id, + "samples_total": self._samples_total, + }, + ) + + print(f"NDJSON profile written to {filename}") + + def _build_meta_record(self): + record = { + "type": "meta", + "v": 1, + "run_id": self.run_id, + "sample_interval_usec": self.sample_interval_usec, + } + + if self._mode is not None: + record["mode"] = _MODE_NAMES.get(self._mode, str(self._mode)) + + return record + + def _get_or_create_frame_id(self, filename, location, funcname): + synthetic = location is None + location_fields = self._normalize_export_location(location) + func_str_id = self._intern_string(funcname) + path_str_id = self._intern_string(filename) + + frame_key = ( + path_str_id, + func_str_id, + location_fields["line"], + location_fields.get("end_line"), + location_fields.get("col"), + location_fields.get("end_col"), + synthetic, + ) + + if (frame_id := self._frame_to_id.get(frame_key)) is not None: + return frame_id + + frame_id = len(self._frames) + 1 + frame_record = { + "frame_id": frame_id, + "path_str_id": path_str_id, + "func_str_id": func_str_id, + **location_fields, + } + if synthetic: + frame_record["synthetic"] = True + + self._frame_to_id[frame_key] = frame_id + self._frames.append(frame_record) + return frame_id + + def _intern_string(self, value): + value = str(value) + + if (string_id := self._string_to_id.get(value)) is not None: + return string_id + + string_id = len(self._strings) + 1 + self._string_to_id[value] = string_id + self._strings.append({"str_id": string_id, "value": value}) + return string_id + + @staticmethod + def _normalize_export_location(location): + if location is None: + return {"line": 0} + + if isinstance(location, int): + return {"line": max(location, 0)} + + if not isinstance(location, tuple): + lineno = getattr(location, "lineno", 0) + location = ( + lineno, + getattr(location, "end_lineno", lineno), + getattr(location, "col_offset", -1), + getattr(location, "end_col_offset", -1), + ) + + lineno, end_lineno, col_offset, end_col_offset = location + if not isinstance(lineno, int) or lineno <= 0: + return {"line": 0} + + normalized = {"line": lineno} + if isinstance(end_lineno, int) and end_lineno > 0: + normalized["end_line"] = end_lineno + if isinstance(col_offset, int) and col_offset >= 0: + normalized["col"] = col_offset + if isinstance(end_col_offset, int) and end_col_offset >= 0: + normalized["end_col"] = end_col_offset + return normalized + + def _iter_agg_entries(self): + entries = [] + for frame_record in self._frames: + frame_id = frame_record["frame_id"] + entries.append( + { + "frame_id": frame_id, + "self": self._frame_self.get(frame_id, 0), + "cumulative": self._frame_cumulative.get(frame_id, 0), + } + ) + return entries + + def _write_chunked_defs(self, output, record_type, entries): + for chunk in batched(entries, _CHUNK_SIZE): + self._write_message( + output, + { + "type": record_type, + "v": 1, + "run_id": self.run_id, + "defs": chunk, + }, + ) + + def _write_chunked_agg(self, output, entries): + for chunk in batched(entries, _CHUNK_SIZE): + self._write_message( + output, + { + "type": "agg", + "v": 1, + "run_id": self.run_id, + "kind": "frame", + "scope": "final", + "samples_total": self._samples_total, + "entries": chunk, + }, + ) + + @staticmethod + def _write_message(output, record): + output.write(json.dumps(record, separators=(",", ":"))) + output.write("\n") From f13d34c02b4b3a3e507e8863253c8f3c672484e9 Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Sat, 21 Mar 2026 21:08:18 +0100 Subject: [PATCH 2/3] s/ndjson/jsonl/ --- Lib/profiling/sampling/__init__.py | 4 ++-- Lib/profiling/sampling/binary_reader.py | 6 +++--- Lib/profiling/sampling/cli.py | 16 ++++++++-------- .../{ndjson_collector.py => jsonl_collector.py} | 8 ++++---- 4 files changed, 17 insertions(+), 17 deletions(-) rename Lib/profiling/sampling/{ndjson_collector.py => jsonl_collector.py} (97%) diff --git a/Lib/profiling/sampling/__init__.py b/Lib/profiling/sampling/__init__.py index 21d3a773a2ba63..71579a3903253e 100644 --- a/Lib/profiling/sampling/__init__.py +++ b/Lib/profiling/sampling/__init__.py @@ -9,7 +9,7 @@ from .stack_collector import CollapsedStackCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector -from .ndjson_collector import NdjsonCollector +from .jsonl_collector import JsonlCollector from .string_table import StringTable __all__ = ( @@ -18,6 +18,6 @@ "CollapsedStackCollector", "HeatmapCollector", "GeckoCollector", - "NdjsonCollector", + "JsonlCollector", "StringTable", ) diff --git a/Lib/profiling/sampling/binary_reader.py b/Lib/profiling/sampling/binary_reader.py index d5bfc0d6130f1a..8d1d8eef9155eb 100644 --- a/Lib/profiling/sampling/binary_reader.py +++ b/Lib/profiling/sampling/binary_reader.py @@ -4,7 +4,7 @@ from .gecko_collector import GeckoCollector from .stack_collector import FlamegraphCollector, CollapsedStackCollector -from .ndjson_collector import NdjsonCollector +from .jsonl_collector import JsonlCollector from .pstats_collector import PstatsCollector @@ -118,8 +118,8 @@ def convert_binary_to_format(input_file, output_file, output_format, collector = PstatsCollector(interval) elif output_format == 'gecko': collector = GeckoCollector(interval) - elif output_format == 'ndjson': - collector = NdjsonCollector(interval) + elif output_format == 'jsonl': + collector = JsonlCollector(interval) else: raise ValueError(f"Unknown output format: {output_format}") diff --git a/Lib/profiling/sampling/cli.py b/Lib/profiling/sampling/cli.py index 4f9e784f80495d..bb97c9729364cc 100644 --- a/Lib/profiling/sampling/cli.py +++ b/Lib/profiling/sampling/cli.py @@ -19,7 +19,7 @@ from .stack_collector import CollapsedStackCollector, FlamegraphCollector from .heatmap_collector import HeatmapCollector from .gecko_collector import GeckoCollector -from .ndjson_collector import NdjsonCollector +from .jsonl_collector import JsonlCollector from .binary_collector import BinaryCollector from .binary_reader import BinaryReader from .constants import ( @@ -88,7 +88,7 @@ class CustomFormatter( "flamegraph": "html", "gecko": "json", "heatmap": "html", - "ndjson": "ndjson", + "jsonl": "jsonl", "binary": "bin", } @@ -98,7 +98,7 @@ class CustomFormatter( "flamegraph": FlamegraphCollector, "gecko": GeckoCollector, "heatmap": HeatmapCollector, - "ndjson": NdjsonCollector, + "jsonl": JsonlCollector, "binary": BinaryCollector, } @@ -471,11 +471,11 @@ def _add_format_options(parser, include_compression=True, include_binary=True): help="Generate interactive HTML heatmap visualization with line-level sample counts", ) format_group.add_argument( - "--ndjson", + "--jsonl", action="store_const", - const="ndjson", + const="jsonl", dest="format", - help="Generate NDJSON snapshot output for external consumers", + help="Generate JSONL snapshot output for external consumers", ) if include_binary: format_group.add_argument( @@ -560,7 +560,7 @@ def _create_collector(format_type, sample_interval_usec, skip_idle, opcodes=Fals Args: format_type: The output format ('pstats', 'collapsed', 'flamegraph', - 'gecko', 'heatmap', 'ndjson', 'binary') + 'gecko', 'heatmap', 'jsonl', 'binary') sample_interval_usec: Sampling interval in microseconds skip_idle: Whether to skip idle samples opcodes: Whether to collect opcode information (only used by gecko format @@ -589,7 +589,7 @@ def _create_collector(format_type, sample_interval_usec, skip_idle, opcodes=Fals skip_idle = False return collector_class(sample_interval_usec, skip_idle=skip_idle, opcodes=opcodes) - if format_type == "ndjson": + if format_type == "jsonl": return collector_class( sample_interval_usec, skip_idle=skip_idle, mode=mode ) diff --git a/Lib/profiling/sampling/ndjson_collector.py b/Lib/profiling/sampling/jsonl_collector.py similarity index 97% rename from Lib/profiling/sampling/ndjson_collector.py rename to Lib/profiling/sampling/jsonl_collector.py index 123ec1c5ea9a1c..1d6575425c2616 100644 --- a/Lib/profiling/sampling/ndjson_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -1,4 +1,4 @@ -"""NDJSON collector.""" +"""JSONL collector.""" import json import uuid @@ -25,8 +25,8 @@ } -class NdjsonCollector(StackTraceCollector): - """Collector that exports finalized profiling data as NDJSON.""" +class JsonlCollector(StackTraceCollector): + """Collector that exports finalized profiling data as JSONL.""" def __init__(self, sample_interval_usec, *, skip_idle=False, mode=None): super().__init__(sample_interval_usec, skip_idle=skip_idle) @@ -81,7 +81,7 @@ def export(self, filename): }, ) - print(f"NDJSON profile written to {filename}") + print(f"JSONL profile written to {filename}") def _build_meta_record(self): record = { From c15d318022cf9c226cd36b36818a270f994fb99c Mon Sep 17 00:00:00 2001 From: maurycy <5383+maurycy@users.noreply.github.com> Date: Sun, 22 Mar 2026 02:51:12 +0100 Subject: [PATCH 3/3] printing to stdout isn't a great idea --- Lib/profiling/sampling/jsonl_collector.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/Lib/profiling/sampling/jsonl_collector.py b/Lib/profiling/sampling/jsonl_collector.py index 1d6575425c2616..3333b7352c9411 100644 --- a/Lib/profiling/sampling/jsonl_collector.py +++ b/Lib/profiling/sampling/jsonl_collector.py @@ -81,8 +81,6 @@ def export(self, filename): }, ) - print(f"JSONL profile written to {filename}") - def _build_meta_record(self): record = { "type": "meta",