From 7a6235393e9f7504dedeb7cd67a64e3a876a0cee Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 23 Jun 2026 10:26:38 -0500 Subject: [PATCH 1/2] added count/part --- .../plugins/inband/dmesg/analyzer_args.py | 7 + .../plugins/inband/dmesg/dmesg_analyzer.py | 26 +++ nodescraper/plugins/inband/dmesg/mce_utils.py | 160 ++++++++++++++++++ test/unit/plugin/test_dmesg_analyzer.py | 53 ++++++ 4 files changed, 246 insertions(+) create mode 100644 nodescraper/plugins/inband/dmesg/mce_utils.py diff --git a/nodescraper/plugins/inband/dmesg/analyzer_args.py b/nodescraper/plugins/inband/dmesg/analyzer_args.py index b68aec27..acc7a6e1 100644 --- a/nodescraper/plugins/inband/dmesg/analyzer_args.py +++ b/nodescraper/plugins/inband/dmesg/analyzer_args.py @@ -62,3 +62,10 @@ class DmesgAnalyzerArgs(TimeRangeAnalysisArgs): "or 'NO_CHANGE' to leave the priority unchanged." ), ) + mce_threshold: Optional[int] = Field( + default=None, + description=( + "When set, raise ERROR if correctable MCE/RAS error count for any component " + "(CPU, GPU BDF/block, etc.) reaches or exceeds this value." 
+ ), + ) diff --git a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py index 68e77702..bf4f6418 100644 --- a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +++ b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py @@ -34,6 +34,7 @@ from .analyzer_args import DmesgAnalyzerArgs from .dmesgdata import DmesgData +from .mce_utils import parse_correctable_mce_counts, parse_uncorrectable_mce_counts class DmesgAnalyzer(RegexAnalyzer[DmesgData, DmesgAnalyzerArgs]): @@ -640,6 +641,28 @@ def _resolve_priority( return current_priority # if no rules are matched, keep the current priority + def _check_mce_threshold(self, dmesg_content: str, threshold: int) -> None: + """Raise ERROR events when correctable MCE counts per component reach the threshold.""" + correctable_counts = parse_correctable_mce_counts(dmesg_content) + uncorrectable_counts = parse_uncorrectable_mce_counts(dmesg_content) + + for part, count in sorted(correctable_counts.items()): + if count >= threshold: + self._log_event( + category=EventCategory.RAS, + description=( + f"{part} has {count} correctable MCE(s), " f"mce_threshold={threshold}" + ), + priority=EventPriority.ERROR, + data={ + "part": part, + "correctable_mce_count": count, + "uncorrectable_mce_count": uncorrectable_counts.get(part, 0), + "mce_threshold": threshold, + }, + console_log=True, + ) + def analyze_data( self, data: DmesgData, @@ -722,4 +745,7 @@ def analyze_data( if not self._is_known_error(known_err_events, match_content, final_error_regex): self.result.events.append(err_event) + if args.mce_threshold is not None: + self._check_mce_threshold(dmesg_content, args.mce_threshold) + return self.result diff --git a/nodescraper/plugins/inband/dmesg/mce_utils.py b/nodescraper/plugins/inband/dmesg/mce_utils.py new file mode 100644 index 00000000..a5efe64c --- /dev/null +++ b/nodescraper/plugins/inband/dmesg/mce_utils.py @@ -0,0 +1,160 @@ 
+############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import re +from typing import Optional + +_CORRECTABLE_SUMMARY_RE = re.compile( + r"(?P\d+)\s+correctable hardware errors detected in total in (?P\w+) block" + r"(?:\s+on\s+(?PCPU\d+))?", + re.IGNORECASE, +) + +_UNCORRECTABLE_SUMMARY_RE = re.compile( + r"(?P\d+)\s+uncorrectable hardware errors detected in (?P\w+) block", + re.IGNORECASE, +) + +_GPU_CORRECTABLE_RE = re.compile( + r"amdgpu\s+(?P[\w:.]+):.*?(?P\d+)\s+correctable hardware errors detected in total in " + r"(?P\w+) block", + re.IGNORECASE, +) + +_GPU_UNCORRECTABLE_RE = re.compile( + r"amdgpu\s+(?P[\w:.]+):.*?(?P\d+)\s+uncorrectable hardware errors detected in " + r"(?P\w+) block", + re.IGNORECASE, +) + +_MCE_CE_STATUS_RE = re.compile( + r"\[Hardware Error\]:.*?(?PCPU\d+).*?MC\d+_STATUS\[[^\]]*\|CE\|[^\]]*\]", + re.IGNORECASE, +) + +_MCE_UC_STATUS_RE = re.compile( + r"\[Hardware Error\]:.*?(?PCPU\d+).*?MC\d+_STATUS\[[^\]]*\|UC\|[^\]]*\]", + re.IGNORECASE, +) + + +def _add_count(counts: dict[str, int], part: str, amount: int) -> None: + counts[part] = counts.get(part, 0) + amount + + +def _part_label( + *, + cpu: Optional[str] = None, + block: Optional[str] = None, + bdf: Optional[str] = None, + gpu_index: Optional[int] = None, +) -> str: + if bdf is not None: + block_suffix = f"/{block}" if block else "" + if gpu_index is not None: + return f"GPU{gpu_index}{block_suffix}" + return f"GPU {bdf}{block_suffix}" + if cpu and block: + return f"{cpu}/{block}" + if cpu: + return cpu + if block: + return block + return "unknown" + + +def _gpu_index_for_bdf(bdf: str, bdf_order: list[str]) -> int: + if bdf not in bdf_order: + bdf_order.append(bdf) + return bdf_order.index(bdf) + + +def parse_correctable_mce_counts(content: str) -> dict[str, int]: + """Count correctable MCE / RAS hardware errors per component from dmesg text. + + Handles summary lines (for example ``mce: 3 correctable ... 
on CPU1``), + amdgpu block summaries, and per-event ``MCn_STATUS[|CE|]`` hardware error lines. + """ + counts: dict[str, int] = {} + gpu_bdf_order: list[str] = [] + + for line in content.splitlines(): + gpu_match = _GPU_CORRECTABLE_RE.search(line) + if gpu_match: + bdf = gpu_match.group("bdf") + part = _part_label( + bdf=bdf, + block=gpu_match.group("block"), + gpu_index=_gpu_index_for_bdf(bdf, gpu_bdf_order), + ) + _add_count(counts, part, int(gpu_match.group("count"))) + continue + + summary_match = _CORRECTABLE_SUMMARY_RE.search(line) + if summary_match: + part = _part_label( + cpu=summary_match.group("cpu"), + block=summary_match.group("block"), + ) + _add_count(counts, part, int(summary_match.group("count"))) + continue + + status_match = _MCE_CE_STATUS_RE.search(line) + if status_match: + part = status_match.group("cpu") if status_match.group("cpu") else "unknown" + _add_count(counts, part, 1) + + return counts + + +def parse_uncorrectable_mce_counts(content: str) -> dict[str, int]: + """Count uncorrectable MCE / RAS hardware errors per component from dmesg text.""" + counts: dict[str, int] = {} + gpu_bdf_order: list[str] = [] + + for line in content.splitlines(): + gpu_match = _GPU_UNCORRECTABLE_RE.search(line) + if gpu_match: + bdf = gpu_match.group("bdf") + part = _part_label( + bdf=bdf, + block=gpu_match.group("block"), + gpu_index=_gpu_index_for_bdf(bdf, gpu_bdf_order), + ) + _add_count(counts, part, int(gpu_match.group("count"))) + continue + + summary_match = _UNCORRECTABLE_SUMMARY_RE.search(line) + if summary_match: + part = _part_label(block=summary_match.group("block")) + _add_count(counts, part, int(summary_match.group("count"))) + continue + + status_match = _MCE_UC_STATUS_RE.search(line) + if status_match: + part = status_match.group("cpu") if status_match.group("cpu") else "unknown" + _add_count(counts, part, 1) + + return counts diff --git a/test/unit/plugin/test_dmesg_analyzer.py b/test/unit/plugin/test_dmesg_analyzer.py index 
d24a311c..4226893b 100644 --- a/test/unit/plugin/test_dmesg_analyzer.py +++ b/test/unit/plugin/test_dmesg_analyzer.py @@ -1020,3 +1020,56 @@ def test_priority_override_updates_unkown_dmesg_error(system_info): assert len(res.events) == 1 assert res.events[0].priority == EventPriority.ERROR + + +def test_mce_threshold_raises_error_for_gpu(system_info): + dmesg_content = ( + "kern :err : 2024-10-07T10:17:15,145363-04:00 " + "amdgpu 0000:c1:00.0: amdgpu: socket: 4, die: 0 " + "3 correctable hardware errors detected in total in gfx block\n" + ) + + analyzer = DmesgAnalyzer(system_info=system_info) + res = analyzer.analyze_data( + DmesgData(dmesg_content=dmesg_content), + args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False, mce_threshold=3), + ) + + threshold_events = [e for e in res.events if e.data.get("mce_threshold") == 3] + assert len(threshold_events) == 1 + assert threshold_events[0].priority == EventPriority.ERROR + assert threshold_events[0].data["part"] == "GPU0/gfx" + assert threshold_events[0].data["correctable_mce_count"] == 3 + assert res.status == ExecutionStatus.ERROR + + +def test_mce_threshold_not_triggered_below_limit(system_info): + dmesg_content = ( + "kern :warn : 2024-06-11T14:30:00,123456+00:00 " + "mce: 2 correctable hardware errors detected in total in mc0 block on CPU1\n" + ) + + analyzer = DmesgAnalyzer(system_info=system_info) + res = analyzer.analyze_data( + DmesgData(dmesg_content=dmesg_content), + args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False, mce_threshold=3), + ) + + threshold_events = [e for e in res.events if "mce_threshold" in e.data] + assert threshold_events == [] + assert res.status == ExecutionStatus.WARNING + + +def test_mce_threshold_disabled_when_none(system_info): + dmesg_content = ( + "kern :warn : 2024-06-11T14:30:00,123456+00:00 " + "mce: 99 correctable hardware errors detected in total in mc0 block on CPU1\n" + ) + + analyzer = DmesgAnalyzer(system_info=system_info) + res = analyzer.analyze_data( + 
DmesgData(dmesg_content=dmesg_content), + args=DmesgAnalyzerArgs(check_unknown_dmesg_errors=False), + ) + + assert not any("mce_threshold" in e.data for e in res.events) From 9cbf5e70aa1a0b7bb4942ca9e7e80f5f7588f48a Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 23 Jun 2026 11:46:58 -0500 Subject: [PATCH 2/2] utest for new helper file --- test/unit/plugin/test_mce_utils.py | 66 ++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 test/unit/plugin/test_mce_utils.py diff --git a/test/unit/plugin/test_mce_utils.py b/test/unit/plugin/test_mce_utils.py new file mode 100644 index 00000000..7ad509a3 --- /dev/null +++ b/test/unit/plugin/test_mce_utils.py @@ -0,0 +1,66 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
#
###############################################################################
from nodescraper.plugins.inband.dmesg.mce_utils import (
    parse_correctable_mce_counts,
    parse_uncorrectable_mce_counts,
)


def test_parse_correctable_mce_counts_cpu_summary_and_status():
    """A CPU summary line and a per-event CE status line are tallied under separate labels."""
    summary_line = (
        "kern :warn : 2024-06-11T14:30:00,123456+00:00 "
        "mce: 3 correctable hardware errors detected in total in mc0 block on CPU1\n"
    )
    status_line = (
        "kern :warn : 2024-06-11T14:30:02,222222+00:00 "
        "[Hardware Error]: CPU0 MC2_STATUS[0x0|CE|]: 0xabc\n"
    )

    parsed = parse_correctable_mce_counts(summary_line + status_line)

    expected = {"CPU1/mc0": 3, "CPU0": 1}
    assert parsed == expected


def test_parse_correctable_mce_counts_gpu_summary():
    """An amdgpu correctable summary is keyed by GPU ordinal and block name."""
    gpu_line = (
        "kern :err : 2024-10-07T10:17:15,145363-04:00 "
        "amdgpu 0000:c1:00.0: amdgpu: socket: 4, die: 0 "
        "3 correctable hardware errors detected in total in gfx block\n"
    )

    parsed = parse_correctable_mce_counts(gpu_line)

    expected = {"GPU0/gfx": 3}
    assert parsed == expected


def test_parse_uncorrectable_mce_counts():
    """A UC status line and an amdgpu uncorrectable summary are both counted."""
    status_line = (
        "kern :err : 2038-01-19T00:00:01,000000+00:00 "
        "[Hardware Error]: Machine Check: CPU1 MC1_STATUS[0xfeed|UC|AddrV]: 0x0\n"
    )
    gpu_line = (
        "amdgpu 0000:de:ad.0: amdgpu: socket: 0 2 uncorrectable hardware errors detected in gfx block\n"
    )

    parsed = parse_uncorrectable_mce_counts(status_line + gpu_line)

    expected = {"CPU1": 1, "GPU0/gfx": 2}
    assert parsed == expected