From 1a30ba00624fcd83e70225cb8eb07a15e1f2a31a Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 14 May 2026 16:04:53 -0500 Subject: [PATCH 01/39] serviceability base plugin --- .../plugins/serviceability/__init__.py | 42 ++++ .../plugins/serviceability/collector_args.py | 134 ++++++++++++ .../serviceability_collector.py | 195 ++++++++++++++++++ .../serviceability/serviceability_data.py | 80 +++++++ .../serviceability_plugin_base.py | 45 ++++ 5 files changed, 496 insertions(+) create mode 100644 nodescraper/plugins/serviceability/__init__.py create mode 100644 nodescraper/plugins/serviceability/collector_args.py create mode 100644 nodescraper/plugins/serviceability/serviceability_collector.py create mode 100644 nodescraper/plugins/serviceability/serviceability_data.py create mode 100644 nodescraper/plugins/serviceability/serviceability_plugin_base.py diff --git a/nodescraper/plugins/serviceability/__init__.py b/nodescraper/plugins/serviceability/__init__.py new file mode 100644 index 00000000..af181362 --- /dev/null +++ b/nodescraper/plugins/serviceability/__init__.py @@ -0,0 +1,42 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from .collector_args import ServiceabilityCollectorArgs +from .serviceability_collector import ServiceabilityCollectorBase +from .serviceability_data import ( + DeviceInfo, + ServiceabilityDataModel, + ServiceabilityResult, +) +from .serviceability_plugin_base import ServiceabilityPluginBase + +__all__ = [ + "DeviceInfo", + "ServiceabilityCollectorArgs", + "ServiceabilityCollectorBase", + "ServiceabilityDataModel", + "ServiceabilityPluginBase", + "ServiceabilityResult", +] diff --git a/nodescraper/plugins/serviceability/collector_args.py b/nodescraper/plugins/serviceability/collector_args.py new file mode 100644 index 00000000..924c3cc9 --- /dev/null +++ b/nodescraper/plugins/serviceability/collector_args.py @@ -0,0 +1,134 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from __future__ import annotations + +from typing import List, Optional, Tuple + +from pydantic import Field, field_validator, model_validator + +from nodescraper.models import CollectorArgs + + +class ServiceabilityCollectorArgs(CollectorArgs): + """Redfish collection arguments for ``ServiceabilityCollectorBase``. + + All Redfish URIs must be supplied by the caller; the base collector does not + embed product paths. Optional sections (assembly inventory, firmware bundle) + are skipped when the corresponding URI or template is omitted. 
+ """ + + uri: Optional[str] = Field( + default=None, + description="Optional alias for ``rf_event_log_uri`` (non-empty string).", + ) + rf_event_log_uri: Optional[str] = Field( + default=None, + description="Redfish URI for the event log ``Entries`` collection.", + ) + rf_chassis_devices: Optional[List[str]] = Field( + default=None, + description="Chassis designations for Assembly GETs; required with ``rf_assembly_uri_template``.", + ) + rf_assembly_uri_template: Optional[str] = Field( + default=None, + description="Redfish URI template containing ``{device}`` for each chassis Assembly resource.", + ) + rf_firmware_bundle_uri: Optional[str] = Field( + default=None, + description="Redfish URI for firmware bundle inventory (e.g. ComponentDetails).", + ) + rf_assembly_fields: Optional[Tuple[str, ...]] = Field( + default=None, + description="Standard Assembly JSON field names mapped into ``DeviceInfo``.", + ) + rf_assembly_oem_fields: Optional[Tuple[str, ...]] = Field( + default=None, + description="OEM Assembly field names (under ``Oem``) mapped into ``DeviceInfo``.", + ) + follow_next_link: bool = Field( + default=True, + description=( + "When True, follow Members@odata.nextLink and merge pages (up to max_pages). " + "When False, only the first GET response is used." + ), + ) + max_pages: int = Field( + default=200, + ge=1, + le=10_000, + description="Safety cap on the number of pages when following event log pagination.", + ) + top: Optional[int] = Field( + default=None, + ge=1, + description=( + "Return only the most recent N entries using $skip when the collection " + "supports OData count; None collects per follow_next_link rules." + ), + ) + from_ac_cycle: int = Field( + default=-1, + description="Passed to ``filter_event_members`` implementations (e.g. A/C cycle window). -1 disables.", + ) + from_date: Optional[str] = Field( + default=None, + description="Passed to ``filter_event_members`` implementations (e.g. 
ISO date window).", + ) + + @field_validator("from_ac_cycle") + @classmethod + def validate_from_ac_cycle(cls, v: int) -> int: + if v != -1 and v < 0: + raise ValueError("from_ac_cycle must be -1 (no filter) or a non-negative integer") + return v + + @model_validator(mode="after") + def _require_event_log_uri(self) -> ServiceabilityCollectorArgs: + if not self.resolved_event_log_uri(): + raise ValueError( + "Provide a non-empty rf_event_log_uri or uri for the event log collection." + ) + return self + + @model_validator(mode="after") + def _assembly_consistency(self) -> ServiceabilityCollectorArgs: + has_tpl = bool( + self.rf_assembly_uri_template and "{device}" in self.rf_assembly_uri_template + ) + has_dev = bool(self.rf_chassis_devices) + if has_tpl != has_dev: + raise ValueError( + "Provide both rf_assembly_uri_template (with '{device}') and rf_chassis_devices, " + "or omit both to skip assembly collection." + ) + return self + + def resolved_event_log_uri(self) -> str: + """Effective event-log URI (``uri`` or ``rf_event_log_uri``).""" + for candidate in (self.uri, self.rf_event_log_uri): + if candidate and str(candidate).strip(): + return str(candidate).strip() + return "" diff --git a/nodescraper/plugins/serviceability/serviceability_collector.py b/nodescraper/plugins/serviceability/serviceability_collector.py new file mode 100644 index 00000000..19942f49 --- /dev/null +++ b/nodescraper/plugins/serviceability/serviceability_collector.py @@ -0,0 +1,195 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from __future__ import annotations + +import abc +from typing import Any, Optional +from urllib.parse import urlparse + +from nodescraper.base import RedfishDataCollector +from nodescraper.connection.redfish import RF_MEMBERS, RF_MEMBERS_COUNT +from nodescraper.enums import ExecutionStatus +from nodescraper.models import TaskResult + +from .collector_args import ServiceabilityCollectorArgs +from .serviceability_data import DeviceInfo, ServiceabilityDataModel + + +class ServiceabilityCollectorBase( + RedfishDataCollector[ServiceabilityDataModel, ServiceabilityCollectorArgs], +): + """Redfish serviceability collection flow with product-specific hooks. + + Subclasses implement event filtering, CPER detection, and CPER attachment handling. + Redfish URIs come only from :class:`ServiceabilityCollectorArgs`. 
+ """ + + DATA_MODEL = ServiceabilityDataModel + + def __init__(self, **kwargs: Any) -> None: + self._log_path: Optional[str] = kwargs.get("log_path") + super().__init__(**kwargs) + + @abc.abstractmethod + def filter_event_members( + self, + members: list[Any], + args: ServiceabilityCollectorArgs, + ) -> list[Any]: + """Return the event list to analyze (e.g. time / A/C window).""" + + @abc.abstractmethod + def is_cper_event(self, event: dict) -> bool: + """Return whether a Redfish event entry should be treated as diagnostic-backed.""" + + @abc.abstractmethod + def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: + """Fetch and decode diagnostic attachments for qualifying events (subclass-defined).""" + + def _fetch_event_log(self, args: ServiceabilityCollectorArgs, uri: str): + if args.follow_next_link: + return self._run_redfish_get_paged(uri, max_pages=args.max_pages) + return self._run_redfish_get(uri, log_artifact=True) + + def collect_data( + self, args: Optional[ServiceabilityCollectorArgs] = None + ) -> tuple[TaskResult, Optional[ServiceabilityDataModel]]: + if args is None: + self.result.status = ExecutionStatus.NOT_RAN + self.result.message = "ServiceabilityCollectorArgs are required" + return self.result, None + + event_uri = args.resolved_event_log_uri() + if args.top is not None: + res = self._fetch_top(args, args.top, args.max_pages) + else: + res = self._fetch_event_log(args, event_uri) + + if not res.success or res.data is None: + self.result.status = ExecutionStatus.ERROR + self.result.message = f"Redfish GET failed for {event_uri}: {res.error}" + return self.result, None + + members = res.data.get(RF_MEMBERS, []) + responses = {res.path: res.data} + raw_base_url = getattr(self.connection, "base_url", None) + bmc_host = urlparse(raw_base_url).hostname if raw_base_url else None + + try: + filtered_members = self.filter_event_members(members, args) + except ValueError as exc: + self.result.status = ExecutionStatus.ERROR + 
self.result.message = f"Event filter failed: {exc}" + return self.result, None + + assembly_info: dict[str, DeviceInfo] = {} + tpl = args.rf_assembly_uri_template + devices = args.rf_chassis_devices + if tpl and devices: + std_fields = tuple(args.rf_assembly_fields or ()) + oem_fields = tuple(args.rf_assembly_oem_fields or ()) + std_to_device = { + "Name": "name", + "PartNumber": "part_number", + "ProductionDate": "production_date", + "SerialNumber": "serial_number", + "Version": "version", + } + + for device in devices: + uri_asm = tpl.format(device=device) + assembly_res = self._run_redfish_get(uri_asm, log_artifact=True) + if not assembly_res.success or assembly_res.data is None: + continue + responses[assembly_res.path] = assembly_res.data + + assemblies = assembly_res.data.get("Assemblies", []) + if not assemblies: + continue + + entry = assemblies[0] + oem = entry.get("Oem", {}) + di_kwargs: dict[str, Any] = {} + for fname in std_fields: + key = std_to_device.get(fname) + if key: + di_kwargs[key] = entry.get(fname) + + for of in oem_fields: + if of == "AssemblyPartNumber": + di_kwargs["assembly_part_number"] = oem.get(of) + elif of == "AssemblySerialNumber": + di_kwargs["assembly_serial_number"] = oem.get(of) + + assembly_info[device] = DeviceInfo(**di_kwargs) + + cper_data = self.collect_cper_data(filtered_members or []) + + data = ServiceabilityDataModel( + responses=responses, + rf_events=filtered_members or [], + assembly_info=assembly_info, + cper_data=cper_data, + component_details=self._fetch_component_details(responses, args), + log_path=self._log_path, + bmc_host=bmc_host, + ) + self.result.status = ExecutionStatus.OK + self.result.message = f"Collected {len(members)} event log member(s)" + return self.result, data + + def _fetch_component_details( + self, responses: dict[str, Any], args: ServiceabilityCollectorArgs + ) -> Optional[str]: + fw_uri = args.rf_firmware_bundle_uri + if not fw_uri or not str(fw_uri).strip(): + return None + fw_uri = 
str(fw_uri).strip() + fw_res = self._run_redfish_get(fw_uri, log_artifact=True) + if not fw_res.success or fw_res.data is None: + return None + responses[fw_res.path] = fw_res.data + + oem = fw_res.data.get("Oem", {}) + version_id = oem.get("AMD", oem).get("VersionID", {}) + return version_id.get("ComponentDetails") + + def _fetch_top(self, args: ServiceabilityCollectorArgs, top: int, max_pages: int): + event_uri = args.resolved_event_log_uri() + probe = self._run_redfish_get(f"{event_uri}?$top=1", log_artifact=True) + if not probe.success or probe.data is None: + return probe + + count = probe.data.get(RF_MEMBERS_COUNT, 0) + + if count <= top: + return self._fetch_event_log(args, event_uri) + + skip = count - top + skip_uri = f"{event_uri}?$skip={skip}" + if args.follow_next_link: + return self._run_redfish_get_paged(skip_uri, max_pages=max_pages) + return self._run_redfish_get(skip_uri, log_artifact=True) diff --git a/nodescraper/plugins/serviceability/serviceability_data.py b/nodescraper/plugins/serviceability/serviceability_data.py new file mode 100644 index 00000000..4329feae --- /dev/null +++ b/nodescraper/plugins/serviceability/serviceability_data.py @@ -0,0 +1,80 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from __future__ import annotations + +import json +import os +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel + +from nodescraper.models import DataModel + + +class DeviceInfo(BaseModel): + """Information for a single chassis device collected via Redfish.""" + + name: Optional[str] = None + part_number: Optional[str] = None + production_date: Optional[str] = None + serial_number: Optional[str] = None + version: Optional[str] = None + assembly_part_number: Optional[str] = None + assembly_serial_number: Optional[str] = None + + +class ServiceabilityResult(BaseModel): + """Structured serviceability output (typically populated by a downstream analyzer).""" + + node: Optional[str] = None + service_recommendations: Dict[str, List[dict]] = {} + service_action_definitions: Dict[str, dict] = {} + afid_sag_metadata: Dict[str, Any] = {} + node_info: Dict[str, Any] = {} + + +class ServiceabilityDataModel(DataModel): + """Collected Redfish responses and intermediate serviceability fields.""" + + responses: dict[str, Any] = {} + rf_events: list[Any] = [] + assembly_info: Dict[str, DeviceInfo] = {} + cper_data: Dict[str, Any] = {} + component_details: Optional[str] = None + log_path: Optional[str] = None + bmc_host: Optional[str] = None + result: Optional[ServiceabilityResult] = None + + def log_model(self, log_path: str) -> None: + """Write raw Redfish responses and decoded CPER data to 
the log directory.""" + os.makedirs(log_path, exist_ok=True) + responses_path = os.path.join(log_path, "redfish_responses.json") + with open(responses_path, "w", encoding="utf-8") as f: + json.dump(self.responses, f, indent=2) + if self.cper_data: + cper_path = os.path.join(log_path, "cper_data.json") + with open(cper_path, "w", encoding="utf-8") as f: + json.dump(self.cper_data, f, indent=2) diff --git a/nodescraper/plugins/serviceability/serviceability_plugin_base.py b/nodescraper/plugins/serviceability/serviceability_plugin_base.py new file mode 100644 index 00000000..fbc8082f --- /dev/null +++ b/nodescraper/plugins/serviceability/serviceability_plugin_base.py @@ -0,0 +1,45 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from nodescraper.base import OOBandDataPlugin + +from .collector_args import ServiceabilityCollectorArgs +from .serviceability_collector import ServiceabilityCollectorBase +from .serviceability_data import ServiceabilityDataModel + + +class ServiceabilityPluginBase( + OOBandDataPlugin[ServiceabilityDataModel, ServiceabilityCollectorArgs, None], +): + """OOB Redfish plugin base: collection only (no analyzer). + + Set ``COLLECTOR`` on a **subclass** to a concrete collector derived from + :class:`ServiceabilityCollectorBase` (the base ``COLLECTOR`` here is abstract + and cannot be instantiated). Add an ``ANALYZER`` on the subclass when needed. + """ + + DATA_MODEL = ServiceabilityDataModel + COLLECTOR = ServiceabilityCollectorBase + COLLECTOR_ARGS = ServiceabilityCollectorArgs From 97aa436b841218fdfc3e2eca19124a7dbb31fe5f Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 15 May 2026 10:07:09 -0500 Subject: [PATCH 02/39] updates: oob, abstracted some dicts --- .../plugins/serviceability/collector_args.py | 48 +-- .../serviceability_collector.py | 55 ++- .../serviceability/serviceability_data.py | 12 +- .../serviceability_plugin_base.py | 7 +- .../plugin/test_serviceability_collector.py | 326 ++++++++++++++++++ 5 files changed, 362 insertions(+), 86 deletions(-) create mode 100644 test/unit/plugin/test_serviceability_collector.py diff --git a/nodescraper/plugins/serviceability/collector_args.py b/nodescraper/plugins/serviceability/collector_args.py index 924c3cc9..4b2511ca 100644 --- a/nodescraper/plugins/serviceability/collector_args.py +++ b/nodescraper/plugins/serviceability/collector_args.py @@ -25,20 +25,15 @@ ############################################################################### from __future__ import annotations -from typing import List, Optional, Tuple +from typing import List, Optional -from pydantic import Field, field_validator, model_validator +from pydantic 
import Field, model_validator from nodescraper.models import CollectorArgs class ServiceabilityCollectorArgs(CollectorArgs): - """Redfish collection arguments for ``ServiceabilityCollectorBase``. - - All Redfish URIs must be supplied by the caller; the base collector does not - embed product paths. Optional sections (assembly inventory, firmware bundle) - are skipped when the corresponding URI or template is omitted. - """ + """URIs and pagination only. Subclasses add filtering and OEM-specific options.""" uri: Optional[str] = Field( default=None, @@ -58,22 +53,11 @@ class ServiceabilityCollectorArgs(CollectorArgs): ) rf_firmware_bundle_uri: Optional[str] = Field( default=None, - description="Redfish URI for firmware bundle inventory (e.g. ComponentDetails).", - ) - rf_assembly_fields: Optional[Tuple[str, ...]] = Field( - default=None, - description="Standard Assembly JSON field names mapped into ``DeviceInfo``.", - ) - rf_assembly_oem_fields: Optional[Tuple[str, ...]] = Field( - default=None, - description="OEM Assembly field names (under ``Oem``) mapped into ``DeviceInfo``.", + description="Redfish URI for firmware bundle inventory when subclasses extract component details.", ) follow_next_link: bool = Field( default=True, - description=( - "When True, follow Members@odata.nextLink and merge pages (up to max_pages). " - "When False, only the first GET response is used." - ), + description="If True, follow Members@odata.nextLink up to max_pages; else single GET.", ) max_pages: int = Field( default=200, @@ -84,26 +68,8 @@ class ServiceabilityCollectorArgs(CollectorArgs): top: Optional[int] = Field( default=None, ge=1, - description=( - "Return only the most recent N entries using $skip when the collection " - "supports OData count; None collects per follow_next_link rules." 
- ), + description="Most recent N entries via $skip after count probe; None collects full window.", ) - from_ac_cycle: int = Field( - default=-1, - description="Passed to ``filter_event_members`` implementations (e.g. A/C cycle window). -1 disables.", - ) - from_date: Optional[str] = Field( - default=None, - description="Passed to ``filter_event_members`` implementations (e.g. ISO date window).", - ) - - @field_validator("from_ac_cycle") - @classmethod - def validate_from_ac_cycle(cls, v: int) -> int: - if v != -1 and v < 0: - raise ValueError("from_ac_cycle must be -1 (no filter) or a non-negative integer") - return v @model_validator(mode="after") def _require_event_log_uri(self) -> ServiceabilityCollectorArgs: @@ -127,7 +93,7 @@ def _assembly_consistency(self) -> ServiceabilityCollectorArgs: return self def resolved_event_log_uri(self) -> str: - """Effective event-log URI (``uri`` or ``rf_event_log_uri``).""" + """Return uri or rf_event_log_uri.""" for candidate in (self.uri, self.rf_event_log_uri): if candidate and str(candidate).strip(): return str(candidate).strip() diff --git a/nodescraper/plugins/serviceability/serviceability_collector.py b/nodescraper/plugins/serviceability/serviceability_collector.py index 19942f49..7e364afd 100644 --- a/nodescraper/plugins/serviceability/serviceability_collector.py +++ b/nodescraper/plugins/serviceability/serviceability_collector.py @@ -41,11 +41,7 @@ class ServiceabilityCollectorBase( RedfishDataCollector[ServiceabilityDataModel, ServiceabilityCollectorArgs], ): - """Redfish serviceability collection flow with product-specific hooks. - - Subclasses implement event filtering, CPER detection, and CPER attachment handling. - Redfish URIs come only from :class:`ServiceabilityCollectorArgs`. 
- """ + """OOB Redfish collection skeleton; subclasses implement filtering, CPER handling, and JSON parsing.""" DATA_MODEL = ServiceabilityDataModel @@ -59,7 +55,7 @@ def filter_event_members( members: list[Any], args: ServiceabilityCollectorArgs, ) -> list[Any]: - """Return the event list to analyze (e.g. time / A/C window).""" + """Return the event list to retain for downstream analysis.""" @abc.abstractmethod def is_cper_event(self, event: dict) -> bool: @@ -69,6 +65,23 @@ def is_cper_event(self, event: dict) -> bool: def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: """Fetch and decode diagnostic attachments for qualifying events (subclass-defined).""" + @abc.abstractmethod + def parse_assembly_entry( + self, + designation: str, + assembly_member_entry: dict[str, Any], + args: ServiceabilityCollectorArgs, + ) -> DeviceInfo: + """Map one Assemblies[] member dict into DeviceInfo.""" + + @abc.abstractmethod + def extract_component_details( + self, + firmware_inventory_payload: dict[str, Any], + args: ServiceabilityCollectorArgs, + ) -> Optional[str]: + """Derive component-details text from a firmware inventory GET payload, or None.""" + def _fetch_event_log(self, args: ServiceabilityCollectorArgs, uri: str): if args.follow_next_link: return self._run_redfish_get_paged(uri, max_pages=args.max_pages) @@ -109,16 +122,6 @@ def collect_data( tpl = args.rf_assembly_uri_template devices = args.rf_chassis_devices if tpl and devices: - std_fields = tuple(args.rf_assembly_fields or ()) - oem_fields = tuple(args.rf_assembly_oem_fields or ()) - std_to_device = { - "Name": "name", - "PartNumber": "part_number", - "ProductionDate": "production_date", - "SerialNumber": "serial_number", - "Version": "version", - } - for device in devices: uri_asm = tpl.format(device=device) assembly_res = self._run_redfish_get(uri_asm, log_artifact=True) @@ -131,20 +134,7 @@ def collect_data( continue entry = assemblies[0] - oem = entry.get("Oem", {}) - di_kwargs: dict[str, 
Any] = {} - for fname in std_fields: - key = std_to_device.get(fname) - if key: - di_kwargs[key] = entry.get(fname) - - for of in oem_fields: - if of == "AssemblyPartNumber": - di_kwargs["assembly_part_number"] = oem.get(of) - elif of == "AssemblySerialNumber": - di_kwargs["assembly_serial_number"] = oem.get(of) - - assembly_info[device] = DeviceInfo(**di_kwargs) + assembly_info[device] = self.parse_assembly_entry(device, entry, args) cper_data = self.collect_cper_data(filtered_members or []) @@ -172,10 +162,7 @@ def _fetch_component_details( if not fw_res.success or fw_res.data is None: return None responses[fw_res.path] = fw_res.data - - oem = fw_res.data.get("Oem", {}) - version_id = oem.get("AMD", oem).get("VersionID", {}) - return version_id.get("ComponentDetails") + return self.extract_component_details(fw_res.data, args) def _fetch_top(self, args: ServiceabilityCollectorArgs, top: int, max_pages: int): event_uri = args.resolved_event_log_uri() diff --git a/nodescraper/plugins/serviceability/serviceability_data.py b/nodescraper/plugins/serviceability/serviceability_data.py index 4329feae..93e57737 100644 --- a/nodescraper/plugins/serviceability/serviceability_data.py +++ b/nodescraper/plugins/serviceability/serviceability_data.py @@ -29,21 +29,23 @@ import os from typing import Any, Dict, List, Optional -from pydantic import BaseModel +from pydantic import BaseModel, Field from nodescraper.models import DataModel class DeviceInfo(BaseModel): - """Information for a single chassis device collected via Redfish.""" + """Chassis fields from Assembly parsing; extra vendor keys belong in oem_extensions.""" name: Optional[str] = None part_number: Optional[str] = None production_date: Optional[str] = None serial_number: Optional[str] = None version: Optional[str] = None - assembly_part_number: Optional[str] = None - assembly_serial_number: Optional[str] = None + oem_extensions: Dict[str, Any] = Field( + default_factory=dict, + description="Opaque vendor/product 
extensions parsed by the concrete collector.", + ) class ServiceabilityResult(BaseModel): @@ -69,7 +71,7 @@ class ServiceabilityDataModel(DataModel): result: Optional[ServiceabilityResult] = None def log_model(self, log_path: str) -> None: - """Write raw Redfish responses and decoded CPER data to the log directory.""" + """Write redfish_responses.json and optional cper_data.json under log_path.""" os.makedirs(log_path, exist_ok=True) responses_path = os.path.join(log_path, "redfish_responses.json") with open(responses_path, "w", encoding="utf-8") as f: diff --git a/nodescraper/plugins/serviceability/serviceability_plugin_base.py b/nodescraper/plugins/serviceability/serviceability_plugin_base.py index fbc8082f..b3ca322a 100644 --- a/nodescraper/plugins/serviceability/serviceability_plugin_base.py +++ b/nodescraper/plugins/serviceability/serviceability_plugin_base.py @@ -33,12 +33,7 @@ class ServiceabilityPluginBase( OOBandDataPlugin[ServiceabilityDataModel, ServiceabilityCollectorArgs, None], ): - """OOB Redfish plugin base: collection only (no analyzer). - - Set ``COLLECTOR`` on a **subclass** to a concrete collector derived from - :class:`ServiceabilityCollectorBase` (the base ``COLLECTOR`` here is abstract - and cannot be instantiated). Add an ``ANALYZER`` on the subclass when needed. - """ + """OOB Redfish collect-only plugin stub; subclass with a concrete COLLECTOR and optional ANALYZER.""" DATA_MODEL = ServiceabilityDataModel COLLECTOR = ServiceabilityCollectorBase diff --git a/test/unit/plugin/test_serviceability_collector.py b/test/unit/plugin/test_serviceability_collector.py new file mode 100644 index 00000000..e3a67d5d --- /dev/null +++ b/test/unit/plugin/test_serviceability_collector.py @@ -0,0 +1,326 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import json +from typing import Any, Optional + +import pytest +from pydantic import ValidationError + +from nodescraper.connection.redfish import ( + RF_MEMBERS, + RF_MEMBERS_COUNT, + RedfishGetResult, +) +from nodescraper.enums import ExecutionStatus +from nodescraper.plugins.serviceability import ( + DeviceInfo, + ServiceabilityCollectorArgs, + ServiceabilityDataModel, + ServiceabilityPluginBase, +) +from nodescraper.plugins.serviceability.serviceability_collector import ( + ServiceabilityCollectorBase, +) + +EVENT_URI = "/redfish/v1/Systems/1/LogServices/SEL/Entries" + + +class _StubServiceabilityCollector(ServiceabilityCollectorBase): + def filter_event_members( + self, + members: list[Any], + args: ServiceabilityCollectorArgs, + ) -> list[Any]: + return members + + def is_cper_event(self, event: dict) -> bool: + return False + + def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: + return {} + + def parse_assembly_entry( + self, + designation: str, + assembly_member_entry: dict[str, Any], + args: ServiceabilityCollectorArgs, + ) -> DeviceInfo: + return DeviceInfo(name=designation, serial_number=assembly_member_entry.get("SerialNumber")) + + def extract_component_details( + self, + firmware_inventory_payload: dict[str, Any], + args: ServiceabilityCollectorArgs, + ) -> Optional[str]: + return firmware_inventory_payload.get("Details") + + +@pytest.fixture +def stub_serviceability_collector(system_info, redfish_conn_mock): + redfish_conn_mock.base_url = "https://bmc.example/redfish/v1" + return _StubServiceabilityCollector( + system_info=system_info, + connection=redfish_conn_mock, + log_path="/tmp/serviceability.log", + ) + + +def test_serviceability_collector_args_requires_event_log_uri(): + with pytest.raises(ValidationError): + ServiceabilityCollectorArgs() + + +def test_serviceability_collector_args_uri_alias_prefers_uri_over_rf_event_log_uri(): + args = 
ServiceabilityCollectorArgs(uri=" /events ", rf_event_log_uri="/other") + assert args.resolved_event_log_uri() == "/events" + + +def test_serviceability_collector_args_assembly_requires_both_template_and_devices(): + with pytest.raises(ValidationError): + ServiceabilityCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_assembly_uri_template="/redfish/v1/Chassis/{device}/Assembly", + ) + with pytest.raises(ValidationError): + ServiceabilityCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_chassis_devices=["C1"], + ) + + +def test_serviceability_collector_args_assembly_template_must_include_device_placeholder(): + with pytest.raises(ValidationError): + ServiceabilityCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_assembly_uri_template="/redfish/v1/Chassis/C1/Assembly", + rf_chassis_devices=["C1"], + ) + + +def test_serviceability_collector_args_assembly_optional_when_omitted(): + args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI) + assert args.rf_assembly_uri_template is None + assert args.rf_chassis_devices is None + + +def test_serviceability_plugin_base_wiring(): + assert ServiceabilityPluginBase.DATA_MODEL is ServiceabilityDataModel + assert ServiceabilityPluginBase.COLLECTOR is ServiceabilityCollectorBase + assert ServiceabilityPluginBase.COLLECTOR_ARGS is ServiceabilityCollectorArgs + assert ServiceabilityPluginBase.ANALYZER is None + + +def test_stub_collector_no_args(stub_serviceability_collector): + result, data = stub_serviceability_collector.collect_data() + assert result.status == ExecutionStatus.NOT_RAN + assert "required" in result.message.lower() + assert data is None + + +def test_stub_collector_event_log_get_fails(stub_serviceability_collector, redfish_conn_mock): + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=False, + error="timeout", + status_code=None, + ) + args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = 
stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.ERROR + assert EVENT_URI in result.message + assert data is None + + +def test_stub_collector_success_minimal(stub_serviceability_collector, redfish_conn_mock): + members = [{"Id": "1"}] + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=True, + data={RF_MEMBERS: members}, + status_code=200, + ) + args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.rf_events == members + assert EVENT_URI in data.responses + assert data.bmc_host == "bmc.example" + assert data.log_path == "/tmp/serviceability.log" + redfish_conn_mock.run_get_paged.assert_called_once() + + +def test_stub_collector_filter_raises_maps_to_error( + stub_serviceability_collector, redfish_conn_mock +): + class _BadFilter(_StubServiceabilityCollector): + def filter_event_members(self, members, args): + raise ValueError("bad filter") + + collector = _BadFilter( + system_info=stub_serviceability_collector.system_info, + connection=redfish_conn_mock, + ) + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=True, + data={RF_MEMBERS: []}, + status_code=200, + ) + args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = collector.collect_data(args=args) + assert result.status == ExecutionStatus.ERROR + assert "Event filter failed" in result.message + assert data is None + + +def test_stub_collector_assembly_and_firmware_paths( + stub_serviceability_collector, redfish_conn_mock +): + tpl = "/redfish/v1/Chassis/{device}/Assembly" + asm_uri = tpl.format(device="C1") + fw_uri = "/redfish/v1/UpdateService/FirmwareInventory" + + def run_get_side_effect(path: str, *_args, **_kwargs): + if path == EVENT_URI: + return RedfishGetResult( + path=EVENT_URI, + 
success=True, + data={RF_MEMBERS: []}, + status_code=200, + ) + if path == asm_uri: + return RedfishGetResult( + path=asm_uri, + success=True, + data={"Assemblies": [{"SerialNumber": "SN-ASM"}]}, + status_code=200, + ) + if path == fw_uri: + return RedfishGetResult( + path=fw_uri, + success=True, + data={"Details": "fw-summary"}, + status_code=200, + ) + raise AssertionError(f"unexpected Redfish GET path: {path!r}") + + redfish_conn_mock.run_get.side_effect = run_get_side_effect + + def run_get_paged_forbidden(*_args, **_kwargs): + raise AssertionError("run_get_paged must not run when follow_next_link=False") + + redfish_conn_mock.run_get_paged.side_effect = run_get_paged_forbidden + + args = ServiceabilityCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_assembly_uri_template=tpl, + rf_chassis_devices=["C1"], + rf_firmware_bundle_uri=fw_uri, + follow_next_link=False, + ) + result, data = stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert "C1" in data.assembly_info + assert data.assembly_info["C1"].serial_number == "SN-ASM" + assert data.component_details == "fw-summary" + assert asm_uri in data.responses + + +def test_stub_collector_top_when_count_exceeds_top_uses_skip_and_paged( + stub_serviceability_collector, redfish_conn_mock +): + probe = RedfishGetResult( + path=f"{EVENT_URI}?$top=1", + success=True, + data={RF_MEMBERS_COUNT: 100}, + status_code=200, + ) + window = RedfishGetResult( + path=f"{EVENT_URI}?$skip=90", + success=True, + data={RF_MEMBERS: [{"Id": "last"}]}, + status_code=200, + ) + redfish_conn_mock.run_get.return_value = probe + redfish_conn_mock.run_get_paged.return_value = window + args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI, top=10) + result, data = stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.rf_events == [{"Id": "last"}] + 
redfish_conn_mock.run_get.assert_called_once() + assert "?$top=1" in redfish_conn_mock.run_get.call_args[0][0] + redfish_conn_mock.run_get_paged.assert_called_once_with( + f"{EVENT_URI}?$skip=90", max_pages=args.max_pages + ) + + +def test_stub_collector_top_when_count_within_top_fetches_full_log( + stub_serviceability_collector, redfish_conn_mock +): + probe = RedfishGetResult( + path=f"{EVENT_URI}?$top=1", + success=True, + data={RF_MEMBERS_COUNT: 3}, + status_code=200, + ) + full = RedfishGetResult( + path=EVENT_URI, + success=True, + data={RF_MEMBERS: [{"Id": "a"}, {"Id": "b"}]}, + status_code=200, + ) + redfish_conn_mock.run_get.return_value = probe + redfish_conn_mock.run_get_paged.return_value = full + args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI, top=50) + result, data = stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert len(data.rf_events) == 2 + redfish_conn_mock.run_get_paged.assert_called_once_with(EVENT_URI, max_pages=args.max_pages) + + +def test_serviceability_data_model_log_model_writes_json(tmp_path): + model = ServiceabilityDataModel( + responses={"/x": {"ok": True}}, + cper_data={"slot": {"raw": "data"}}, + ) + model.log_model(str(tmp_path)) + responses_file = tmp_path / "redfish_responses.json" + cper_file = tmp_path / "cper_data.json" + assert responses_file.is_file() + assert cper_file.is_file() + assert json.loads(responses_file.read_text(encoding="utf-8")) == {"/x": {"ok": True}} + assert json.loads(cper_file.read_text(encoding="utf-8")) == {"slot": {"raw": "data"}} + + +def test_serviceability_data_model_log_model_skips_cper_when_empty(tmp_path): + model = ServiceabilityDataModel(responses={}) + model.log_model(str(tmp_path)) + assert (tmp_path / "redfish_responses.json").is_file() + assert not (tmp_path / "cper_data.json").exists() From d121f99ff60da154f690865f03bafe22bc3b8d27 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 18 May 
2026 19:12:37 -0500 Subject: [PATCH 03/39] updates --- .../plugins/serviceability/__init__.py | 40 ++- .../serviceability/oob_redfish/__init__.py | 44 +++ .../oob_redfish/oob_redfish_collector.py | 76 ++++ .../oob_redfish_collector_args.py} | 51 ++- .../oob_redfish/oob_redfish_data.py | 186 ++++++++++ .../oob_redfish_plugin.py} | 18 +- .../serviceability_collector.py | 182 ---------- .../serviceability/serviceability_data.py | 82 ----- .../plugins/serviceability/time_utils.py | 116 +++++++ .../unit/plugin/test_oob_redfish_collector.py | 181 ++++++++++ .../plugin/test_serviceability_collector.py | 326 ------------------ 11 files changed, 684 insertions(+), 618 deletions(-) create mode 100644 nodescraper/plugins/serviceability/oob_redfish/__init__.py create mode 100644 nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector.py rename nodescraper/plugins/serviceability/{collector_args.py => oob_redfish/oob_redfish_collector_args.py} (67%) create mode 100644 nodescraper/plugins/serviceability/oob_redfish/oob_redfish_data.py rename nodescraper/plugins/serviceability/{serviceability_plugin_base.py => oob_redfish/oob_redfish_plugin.py} (71%) delete mode 100644 nodescraper/plugins/serviceability/serviceability_collector.py delete mode 100644 nodescraper/plugins/serviceability/serviceability_data.py create mode 100644 nodescraper/plugins/serviceability/time_utils.py create mode 100644 test/unit/plugin/test_oob_redfish_collector.py delete mode 100644 test/unit/plugin/test_serviceability_collector.py diff --git a/nodescraper/plugins/serviceability/__init__.py b/nodescraper/plugins/serviceability/__init__.py index af181362..16a87632 100644 --- a/nodescraper/plugins/serviceability/__init__.py +++ b/nodescraper/plugins/serviceability/__init__.py @@ -23,20 +23,34 @@ # SOFTWARE. 
# ############################################################################### -from .collector_args import ServiceabilityCollectorArgs -from .serviceability_collector import ServiceabilityCollectorBase -from .serviceability_data import ( - DeviceInfo, - ServiceabilityDataModel, - ServiceabilityResult, +from .oob_redfish import ( + OobRedfishCollector, + OobRedfishCollectorArgs, + OobRedfishDataModel, + OobRedfishDeviceInfo, + OobRedfishPlugin, + OobRedfishResult, + build_oob_redfish_reporting_version_fields, +) +from .time_utils import ( + TimeOperator, + compare_iso_datetime, + is_valid_iso_datetime, + parse_iso_datetime, + satisfies_time_check, ) -from .serviceability_plugin_base import ServiceabilityPluginBase __all__ = [ - "DeviceInfo", - "ServiceabilityCollectorArgs", - "ServiceabilityCollectorBase", - "ServiceabilityDataModel", - "ServiceabilityPluginBase", - "ServiceabilityResult", + "OobRedfishCollector", + "OobRedfishCollectorArgs", + "OobRedfishDataModel", + "OobRedfishDeviceInfo", + "OobRedfishPlugin", + "OobRedfishResult", + "TimeOperator", + "build_oob_redfish_reporting_version_fields", + "compare_iso_datetime", + "is_valid_iso_datetime", + "parse_iso_datetime", + "satisfies_time_check", ] diff --git a/nodescraper/plugins/serviceability/oob_redfish/__init__.py b/nodescraper/plugins/serviceability/oob_redfish/__init__.py new file mode 100644 index 00000000..e0dae020 --- /dev/null +++ b/nodescraper/plugins/serviceability/oob_redfish/__init__.py @@ -0,0 +1,44 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from .oob_redfish_collector import OobRedfishCollector +from .oob_redfish_collector_args import OobRedfishCollectorArgs +from .oob_redfish_data import ( + OobRedfishDataModel, + OobRedfishDeviceInfo, + OobRedfishResult, + build_oob_redfish_reporting_version_fields, +) +from .oob_redfish_plugin import OobRedfishPlugin + +__all__ = [ + "OobRedfishCollector", + "OobRedfishCollectorArgs", + "OobRedfishDataModel", + "OobRedfishDeviceInfo", + "OobRedfishPlugin", + "OobRedfishResult", + "build_oob_redfish_reporting_version_fields", +] diff --git a/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector.py b/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector.py new file mode 100644 index 00000000..503d7103 --- /dev/null +++ b/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector.py @@ -0,0 +1,76 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from __future__ import annotations + +from typing import Any, Optional + +from nodescraper.base import RedfishDataCollector +from nodescraper.enums import ExecutionStatus +from nodescraper.models import TaskResult +from nodescraper.plugins.serviceability.time_utils import satisfies_time_check + +from .oob_redfish_collector_args import OobRedfishCollectorArgs +from .oob_redfish_data import OobRedfishDataModel + + +class OobRedfishCollector( + RedfishDataCollector[OobRedfishDataModel, OobRedfishCollectorArgs], +): + """Collect OOB Redfish serviceability data.""" + + DATA_MODEL = OobRedfishDataModel + + def __init__(self, **kwargs: Any) -> None: + self._log_path: Optional[str] = kwargs.pop("log_path", None) + super().__init__(**kwargs) + + def satisfies_reference_time( + self, + candidate: str, + args: OobRedfishCollectorArgs, + ) -> bool: + """Test a timestamp against optional reference-time filter settings. + + Args: + candidate: Timestamp string to test. + args: Collector arguments that may define reference_time and time_operator. + + Returns: + True when no filter is configured or the comparison succeeds. + """ + if args.reference_time is None or args.time_operator is None: + return True + return satisfies_time_check(candidate, args.reference_time, args.time_operator) + + def _missing_args_result(self) -> tuple[TaskResult, None]: + """Build a not-ran result when collector arguments are missing. + + Returns: + Task result with NOT_RAN status and no data model. 
+ """ + self.result.status = ExecutionStatus.NOT_RAN + self.result.message = "OobRedfishCollectorArgs are required" + return self.result, None diff --git a/nodescraper/plugins/serviceability/collector_args.py b/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector_args.py similarity index 67% rename from nodescraper/plugins/serviceability/collector_args.py rename to nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector_args.py index 4b2511ca..5c1b0687 100644 --- a/nodescraper/plugins/serviceability/collector_args.py +++ b/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector_args.py @@ -27,13 +27,17 @@ from typing import List, Optional -from pydantic import Field, model_validator +from pydantic import Field, field_validator, model_validator from nodescraper.models import CollectorArgs +from nodescraper.plugins.serviceability.time_utils import ( + TimeOperator, + is_valid_iso_datetime, +) -class ServiceabilityCollectorArgs(CollectorArgs): - """URIs and pagination only. Subclasses add filtering and OEM-specific options.""" +class OobRedfishCollectorArgs(CollectorArgs): + """Arguments for OOB Redfish serviceability collection.""" uri: Optional[str] = Field( default=None, @@ -70,9 +74,32 @@ class ServiceabilityCollectorArgs(CollectorArgs): ge=1, description="Most recent N entries via $skip after count probe; None collects full window.", ) + reference_time: Optional[str] = Field( + default=None, + description=( + "Optional ISO-8601 date or date-time used with time_operator " + "(e.g. 2026-05-17 or 2026-05-17T13:01:00)." 
+ ), + ) + time_operator: Optional[TimeOperator] = Field( + default=None, + description="Comparison operator applied when reference_time is set.", + ) + + @field_validator("reference_time") + @classmethod + def _validate_reference_time_iso(cls, value: Optional[str]) -> Optional[str]: + if value is None: + return None + text = str(value).strip() + if not text: + raise ValueError("reference_time must be a non-empty ISO-8601 string") + if not is_valid_iso_datetime(text): + raise ValueError(f"reference_time is not ISO-8601 compliant: {value!r}") + return text @model_validator(mode="after") - def _require_event_log_uri(self) -> ServiceabilityCollectorArgs: + def _require_event_log_uri(self) -> OobRedfishCollectorArgs: if not self.resolved_event_log_uri(): raise ValueError( "Provide a non-empty rf_event_log_uri or uri for the event log collection." @@ -80,7 +107,7 @@ def _require_event_log_uri(self) -> ServiceabilityCollectorArgs: return self @model_validator(mode="after") - def _assembly_consistency(self) -> ServiceabilityCollectorArgs: + def _assembly_consistency(self) -> OobRedfishCollectorArgs: has_tpl = bool( self.rf_assembly_uri_template and "{device}" in self.rf_assembly_uri_template ) @@ -92,8 +119,20 @@ def _assembly_consistency(self) -> ServiceabilityCollectorArgs: ) return self + @model_validator(mode="after") + def _reference_time_requires_operator(self) -> OobRedfishCollectorArgs: + has_ref = self.reference_time is not None + has_op = self.time_operator is not None + if has_ref != has_op: + raise ValueError("Provide both reference_time and time_operator, or omit both.") + return self + def resolved_event_log_uri(self) -> str: - """Return uri or rf_event_log_uri.""" + """Resolve the configured event log URI. + + Returns: + Non-empty URI from uri or rf_event_log_uri, or an empty string. 
+ """ for candidate in (self.uri, self.rf_event_log_uri): if candidate and str(candidate).strip(): return str(candidate).strip() diff --git a/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_data.py b/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_data.py new file mode 100644 index 00000000..6ad69a7b --- /dev/null +++ b/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_data.py @@ -0,0 +1,186 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from __future__ import annotations + +import json +import os +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel, Field + +from nodescraper.models import DataModel + + +class OobRedfishDeviceInfo(BaseModel): + """Device identity with separate board and product fields.""" + + board_product_name: Optional[str] = Field( + default=None, + description="Board product name (IPMI board information area).", + ) + board_part_number: Optional[str] = Field( + default=None, + description="Board part number.", + ) + board_serial_number: Optional[str] = Field( + default=None, + description="Board serial number.", + ) + board_manufacturing_date: Optional[str] = Field( + default=None, + description=( + "Board manufacturing date as a rendered string " + "(not IPMI minutes-since-1996 encoding)." + ), + ) + product_name: Optional[str] = Field( + default=None, + description="Product name (IPMI product information area).", + ) + product_part_number: Optional[str] = Field( + default=None, + description="Product part or model number.", + ) + product_serial_number: Optional[str] = Field( + default=None, + description="Product serial number.", + ) + product_version: Optional[str] = Field( + default=None, + description="Product version (no board-area equivalent in IPMI FRU).", + ) + oem_extensions: Dict[str, Any] = Field( + default_factory=dict, + description=("Vendor-specific fields: extra board/product data, multirecord, etc."), + ) + + +class OobRedfishResult(BaseModel): + """Structured serviceability report output.""" + + node: Optional[str] = None + node_scraper_version: Optional[str] = Field( + default=None, + description="Version of amd-node-scraper that produced this report.", + ) + plugin_name: Optional[str] = Field( + default=None, + description="Name of the serviceability plugin that produced this report.", + ) + plugin_version: Optional[str] = Field( + default=None, + 
description="Version of the serviceability plugin that produced this report.", + ) + reporter_extensions: Dict[str, str] = Field( + default_factory=dict, + description="Additional tool versions keyed by name.", + ) + service_recommendations: Dict[str, List[dict]] = Field(default_factory=dict) + service_action_definitions: Dict[str, dict] = Field(default_factory=dict) + afid_sag_metadata: Dict[str, Any] = Field(default_factory=dict) + node_info: Dict[str, Any] = Field(default_factory=dict) + extensions: Dict[str, Any] = Field( + default_factory=dict, + description="Additional implementation-specific fields.", + ) + + +def build_oob_redfish_reporting_version_fields( + *, + plugin_name: Optional[str] = None, + plugin_version: Optional[str] = None, + node_scraper_version: Optional[str] = None, + **reporter_extensions: str, +) -> Dict[str, Any]: + """Build keyword arguments for result versioning fields. + + Args: + plugin_name: Name of the reporting plugin. + plugin_version: Version of the reporting plugin. + node_scraper_version: Node scraper version; defaults to the installed package version. + reporter_extensions: Additional tool versions as keyword arguments. + + Returns: + Dictionary of versioning fields for a result model. 
+ """ + import nodescraper + + return { + "node_scraper_version": node_scraper_version or nodescraper.__version__, + "plugin_name": plugin_name, + "plugin_version": plugin_version, + "reporter_extensions": dict(reporter_extensions), + } + + +class OobRedfishDataModel(DataModel): + """Collected OOB Redfish serviceability data model.""" + + collected_data: Dict[str, Any] = Field( + default_factory=dict, + description="Arbitrary keyed payloads from the collector implementation.", + ) + device_info: Dict[str, OobRedfishDeviceInfo] = Field( + default_factory=dict, + description="Optional device identity keyed by implementer-defined labels.", + ) + artifacts: Dict[str, Any] = Field( + default_factory=dict, + description="Filename to JSON-serializable payload for log_model output.", + ) + endpoint: Optional[str] = Field( + default=None, + description="Optional host or service endpoint label (not necessarily a BMC).", + ) + log_path: Optional[str] = None + result: Optional[OobRedfishResult] = None + + def log_model(self, log_path: str) -> None: + """Write artifact files and a JSON summary under the log directory. + + Args: + log_path: Directory path for output files. + + Returns: + None. 
+ """ + os.makedirs(log_path, exist_ok=True) + for filename, payload in self.artifacts.items(): + if not filename or not str(filename).strip(): + continue + artifact_path = os.path.join(log_path, str(filename).strip()) + with open(artifact_path, "w", encoding="utf-8") as handle: + json.dump(payload, handle, indent=2) + summary_path = os.path.join(log_path, "oob_redfish_data.json") + with open(summary_path, "w", encoding="utf-8") as handle: + json.dump( + self.model_dump( + exclude={"artifacts"}, + mode="json", + ), + handle, + indent=2, + ) diff --git a/nodescraper/plugins/serviceability/serviceability_plugin_base.py b/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_plugin.py similarity index 71% rename from nodescraper/plugins/serviceability/serviceability_plugin_base.py rename to nodescraper/plugins/serviceability/oob_redfish/oob_redfish_plugin.py index b3ca322a..b891c522 100644 --- a/nodescraper/plugins/serviceability/serviceability_plugin_base.py +++ b/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_plugin.py @@ -25,16 +25,16 @@ ############################################################################### from nodescraper.base import OOBandDataPlugin -from .collector_args import ServiceabilityCollectorArgs -from .serviceability_collector import ServiceabilityCollectorBase -from .serviceability_data import ServiceabilityDataModel +from .oob_redfish_collector import OobRedfishCollector +from .oob_redfish_collector_args import OobRedfishCollectorArgs +from .oob_redfish_data import OobRedfishDataModel -class ServiceabilityPluginBase( - OOBandDataPlugin[ServiceabilityDataModel, ServiceabilityCollectorArgs, None], +class OobRedfishPlugin( + OOBandDataPlugin[OobRedfishDataModel, OobRedfishCollectorArgs, None], ): - """OOB Redfish collect-only plugin stub; subclass with a concrete COLLECTOR and optional ANALYZER.""" + """OOB Redfish serviceability plugin base.""" - DATA_MODEL = ServiceabilityDataModel - COLLECTOR = ServiceabilityCollectorBase - 
COLLECTOR_ARGS = ServiceabilityCollectorArgs + DATA_MODEL = OobRedfishDataModel + COLLECTOR = OobRedfishCollector + COLLECTOR_ARGS = OobRedfishCollectorArgs diff --git a/nodescraper/plugins/serviceability/serviceability_collector.py b/nodescraper/plugins/serviceability/serviceability_collector.py deleted file mode 100644 index 7e364afd..00000000 --- a/nodescraper/plugins/serviceability/serviceability_collector.py +++ /dev/null @@ -1,182 +0,0 @@ -############################################################################### -# -# MIT License -# -# Copyright (c) 2026 Advanced Micro Devices, Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
-# -############################################################################### -from __future__ import annotations - -import abc -from typing import Any, Optional -from urllib.parse import urlparse - -from nodescraper.base import RedfishDataCollector -from nodescraper.connection.redfish import RF_MEMBERS, RF_MEMBERS_COUNT -from nodescraper.enums import ExecutionStatus -from nodescraper.models import TaskResult - -from .collector_args import ServiceabilityCollectorArgs -from .serviceability_data import DeviceInfo, ServiceabilityDataModel - - -class ServiceabilityCollectorBase( - RedfishDataCollector[ServiceabilityDataModel, ServiceabilityCollectorArgs], -): - """OOB Redfish collection skeleton; subclasses implement filtering, CPER handling, and JSON parsing.""" - - DATA_MODEL = ServiceabilityDataModel - - def __init__(self, **kwargs: Any) -> None: - self._log_path: Optional[str] = kwargs.get("log_path") - super().__init__(**kwargs) - - @abc.abstractmethod - def filter_event_members( - self, - members: list[Any], - args: ServiceabilityCollectorArgs, - ) -> list[Any]: - """Return the event list to retain for downstream analysis.""" - - @abc.abstractmethod - def is_cper_event(self, event: dict) -> bool: - """Return whether a Redfish event entry should be treated as diagnostic-backed.""" - - @abc.abstractmethod - def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: - """Fetch and decode diagnostic attachments for qualifying events (subclass-defined).""" - - @abc.abstractmethod - def parse_assembly_entry( - self, - designation: str, - assembly_member_entry: dict[str, Any], - args: ServiceabilityCollectorArgs, - ) -> DeviceInfo: - """Map one Assemblies[] member dict into DeviceInfo.""" - - @abc.abstractmethod - def extract_component_details( - self, - firmware_inventory_payload: dict[str, Any], - args: ServiceabilityCollectorArgs, - ) -> Optional[str]: - """Derive component-details text from a firmware inventory GET payload, or None.""" - - def 
_fetch_event_log(self, args: ServiceabilityCollectorArgs, uri: str): - if args.follow_next_link: - return self._run_redfish_get_paged(uri, max_pages=args.max_pages) - return self._run_redfish_get(uri, log_artifact=True) - - def collect_data( - self, args: Optional[ServiceabilityCollectorArgs] = None - ) -> tuple[TaskResult, Optional[ServiceabilityDataModel]]: - if args is None: - self.result.status = ExecutionStatus.NOT_RAN - self.result.message = "ServiceabilityCollectorArgs are required" - return self.result, None - - event_uri = args.resolved_event_log_uri() - if args.top is not None: - res = self._fetch_top(args, args.top, args.max_pages) - else: - res = self._fetch_event_log(args, event_uri) - - if not res.success or res.data is None: - self.result.status = ExecutionStatus.ERROR - self.result.message = f"Redfish GET failed for {event_uri}: {res.error}" - return self.result, None - - members = res.data.get(RF_MEMBERS, []) - responses = {res.path: res.data} - raw_base_url = getattr(self.connection, "base_url", None) - bmc_host = urlparse(raw_base_url).hostname if raw_base_url else None - - try: - filtered_members = self.filter_event_members(members, args) - except ValueError as exc: - self.result.status = ExecutionStatus.ERROR - self.result.message = f"Event filter failed: {exc}" - return self.result, None - - assembly_info: dict[str, DeviceInfo] = {} - tpl = args.rf_assembly_uri_template - devices = args.rf_chassis_devices - if tpl and devices: - for device in devices: - uri_asm = tpl.format(device=device) - assembly_res = self._run_redfish_get(uri_asm, log_artifact=True) - if not assembly_res.success or assembly_res.data is None: - continue - responses[assembly_res.path] = assembly_res.data - - assemblies = assembly_res.data.get("Assemblies", []) - if not assemblies: - continue - - entry = assemblies[0] - assembly_info[device] = self.parse_assembly_entry(device, entry, args) - - cper_data = self.collect_cper_data(filtered_members or []) - - data = 
ServiceabilityDataModel( - responses=responses, - rf_events=filtered_members or [], - assembly_info=assembly_info, - cper_data=cper_data, - component_details=self._fetch_component_details(responses, args), - log_path=self._log_path, - bmc_host=bmc_host, - ) - self.result.status = ExecutionStatus.OK - self.result.message = f"Collected {len(members)} event log member(s)" - return self.result, data - - def _fetch_component_details( - self, responses: dict[str, Any], args: ServiceabilityCollectorArgs - ) -> Optional[str]: - fw_uri = args.rf_firmware_bundle_uri - if not fw_uri or not str(fw_uri).strip(): - return None - fw_uri = str(fw_uri).strip() - fw_res = self._run_redfish_get(fw_uri, log_artifact=True) - if not fw_res.success or fw_res.data is None: - return None - responses[fw_res.path] = fw_res.data - return self.extract_component_details(fw_res.data, args) - - def _fetch_top(self, args: ServiceabilityCollectorArgs, top: int, max_pages: int): - event_uri = args.resolved_event_log_uri() - probe = self._run_redfish_get(f"{event_uri}?$top=1", log_artifact=True) - if not probe.success or probe.data is None: - return probe - - count = probe.data.get(RF_MEMBERS_COUNT, 0) - - if count <= top: - return self._fetch_event_log(args, event_uri) - - skip = count - top - skip_uri = f"{event_uri}?$skip={skip}" - if args.follow_next_link: - return self._run_redfish_get_paged(skip_uri, max_pages=max_pages) - return self._run_redfish_get(skip_uri, log_artifact=True) diff --git a/nodescraper/plugins/serviceability/serviceability_data.py b/nodescraper/plugins/serviceability/serviceability_data.py deleted file mode 100644 index 93e57737..00000000 --- a/nodescraper/plugins/serviceability/serviceability_data.py +++ /dev/null @@ -1,82 +0,0 @@ -############################################################################### -# -# MIT License -# -# Copyright (c) 2026 Advanced Micro Devices, Inc. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
-# -############################################################################### -from __future__ import annotations - -import json -import os -from typing import Any, Dict, List, Optional - -from pydantic import BaseModel, Field - -from nodescraper.models import DataModel - - -class DeviceInfo(BaseModel): - """Chassis fields from Assembly parsing; extra vendor keys belong in oem_extensions.""" - - name: Optional[str] = None - part_number: Optional[str] = None - production_date: Optional[str] = None - serial_number: Optional[str] = None - version: Optional[str] = None - oem_extensions: Dict[str, Any] = Field( - default_factory=dict, - description="Opaque vendor/product extensions parsed by the concrete collector.", - ) - - -class ServiceabilityResult(BaseModel): - """Structured serviceability output (typically populated by a downstream analyzer).""" - - node: Optional[str] = None - service_recommendations: Dict[str, List[dict]] = {} - service_action_definitions: Dict[str, dict] = {} - afid_sag_metadata: Dict[str, Any] = {} - node_info: Dict[str, Any] = {} - - -class ServiceabilityDataModel(DataModel): - """Collected Redfish responses and intermediate serviceability fields.""" - - responses: dict[str, Any] = {} - rf_events: list[Any] = [] - assembly_info: Dict[str, DeviceInfo] = {} - cper_data: Dict[str, Any] = {} - component_details: Optional[str] = None - log_path: Optional[str] = None - bmc_host: Optional[str] = None - result: Optional[ServiceabilityResult] = None - - def log_model(self, log_path: str) -> None: - """Write redfish_responses.json and optional cper_data.json under log_path.""" - os.makedirs(log_path, exist_ok=True) - responses_path = os.path.join(log_path, "redfish_responses.json") - with open(responses_path, "w", encoding="utf-8") as f: - json.dump(self.responses, f, indent=2) - if self.cper_data: - cper_path = os.path.join(log_path, "cper_data.json") - with open(cper_path, "w", encoding="utf-8") as f: - json.dump(self.cper_data, f, indent=2) 
diff --git a/nodescraper/plugins/serviceability/time_utils.py b/nodescraper/plugins/serviceability/time_utils.py new file mode 100644 index 00000000..8bbc8a83 --- /dev/null +++ b/nodescraper/plugins/serviceability/time_utils.py @@ -0,0 +1,116 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from __future__ import annotations + +from datetime import datetime +from typing import Literal + +TimeOperator = Literal[">", ">=", "<", "<=", "=="] + +_TIME_OPERATORS: set[str] = {">", ">=", "<", "<=", "=="} + + +def is_valid_iso_datetime(value: str) -> bool: + """Return whether a string is ISO-8601 compliant. + + Args: + value: Date or date-time string to validate. + + Returns: + True if the value parses as ISO-8601. 
+ """ + try: + parse_iso_datetime(value) + except ValueError: + return False + return True + + +def parse_iso_datetime(value: str) -> datetime: + """Parse an ISO-8601 date or date-time string. + + Args: + value: Date (e.g. 2026-05-17) or date-time (e.g. 2026-05-17T13:01:00). + + Returns: + Parsed datetime. + """ + text = str(value).strip() + if not text: + raise ValueError("Empty datetime string") + if text.endswith("Z"): + text = f"{text[:-1]}+00:00" + try: + parsed = datetime.fromisoformat(text) + except ValueError as exc: + raise ValueError(f"Not ISO-8601 compliant: {value!r}") from exc + if "T" not in text and "+" not in text and text.count("-") == 2: + return parsed.replace(hour=0, minute=0, second=0, microsecond=0) + return parsed + + +def compare_iso_datetime(left: str, right: str, operator: TimeOperator) -> bool: + """Compare two ISO-8601 values with a relational operator. + + Args: + left: Left-hand date or date-time string. + right: Right-hand date or date-time string. + operator: One of >, >=, <, <=, or ==. + + Returns: + Result of the comparison. + """ + if operator not in _TIME_OPERATORS: + raise ValueError(f"Unsupported time operator: {operator!r}") + left_dt = parse_iso_datetime(left) + right_dt = parse_iso_datetime(right) + if operator == ">": + return left_dt > right_dt + if operator == ">=": + return left_dt >= right_dt + if operator == "<": + return left_dt < right_dt + if operator == "<=": + return left_dt <= right_dt + return left_dt == right_dt + + +def satisfies_time_check( + candidate: str, + reference: str, + operator: TimeOperator, +) -> bool: + """Test whether candidate satisfies operator against reference. + + Args: + candidate: Date or date-time string to test. + reference: Reference date or date-time string. + operator: One of >, >=, <, <=, or ==. + + Returns: + True when the comparison holds. 
+ """ + return compare_iso_datetime(candidate, reference, operator) diff --git a/test/unit/plugin/test_oob_redfish_collector.py b/test/unit/plugin/test_oob_redfish_collector.py new file mode 100644 index 00000000..e729cedc --- /dev/null +++ b/test/unit/plugin/test_oob_redfish_collector.py @@ -0,0 +1,181 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from typing import Optional + +import pytest +from pydantic import ValidationError + +from nodescraper.base import OOBandDataPlugin +from nodescraper.connection.redfish import RedfishConnectionManager +from nodescraper.enums import ExecutionStatus +from nodescraper.plugins.serviceability import ( + OobRedfishCollector, + OobRedfishCollectorArgs, + OobRedfishDataModel, + OobRedfishDeviceInfo, + OobRedfishPlugin, + OobRedfishResult, + build_oob_redfish_reporting_version_fields, + compare_iso_datetime, + is_valid_iso_datetime, + satisfies_time_check, +) + +EVENT_URI = "/redfish/v1/Systems/1/LogServices/SEL/Entries" + + +class _StubOobRedfishCollector(OobRedfishCollector): + def collect_data(self, args: Optional[OobRedfishCollectorArgs] = None): + if args is None: + return self._missing_args_result() + data = OobRedfishDataModel( + collected_data={"events": []}, + log_path=self._log_path, + ) + self.result.status = ExecutionStatus.OK + self.result.message = "stub collection complete" + return self.result, data + + +@pytest.fixture +def stub_oob_redfish_collector(system_info, redfish_conn_mock): + return _StubOobRedfishCollector( + system_info=system_info, + connection=redfish_conn_mock, + log_path="/tmp/oob_redfish.log", + ) + + +def test_oob_redfish_collector_args_requires_event_log_uri(): + with pytest.raises(ValidationError): + OobRedfishCollectorArgs() + + +def test_oob_redfish_collector_args_uri_alias(): + args = OobRedfishCollectorArgs(uri=" /events ", rf_event_log_uri="/other") + assert args.resolved_event_log_uri() == "/events" + + +def test_oob_redfish_collector_args_assembly_requires_both_template_and_devices(): + with pytest.raises(ValidationError): + OobRedfishCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_assembly_uri_template="/redfish/v1/Chassis/{device}/Assembly", + ) + with pytest.raises(ValidationError): + OobRedfishCollectorArgs( + rf_event_log_uri=EVENT_URI, + 
rf_chassis_devices=["C1"], + ) + + +def test_oob_redfish_collector_args_reference_time_requires_operator(): + with pytest.raises(ValidationError): + OobRedfishCollectorArgs( + rf_event_log_uri=EVENT_URI, + reference_time="2026-05-17", + ) + + +def test_oob_redfish_collector_args_accepts_iso_date_and_datetime(): + date_args = OobRedfishCollectorArgs( + rf_event_log_uri=EVENT_URI, + reference_time="2026-05-17", + time_operator=">=", + ) + assert date_args.reference_time == "2026-05-17" + + +def test_time_utils_iso_validation_and_comparison(): + assert is_valid_iso_datetime("2026-05-17") + assert satisfies_time_check("2026-05-18", "2026-05-17", ">") + assert compare_iso_datetime("2026-05-17T13:01:00", "2026-05-17T13:01:00", "==") + + +def test_oob_redfish_plugin_wiring(): + assert issubclass(OobRedfishPlugin, OOBandDataPlugin) + assert OobRedfishPlugin.DATA_MODEL is OobRedfishDataModel + assert OobRedfishPlugin.COLLECTOR is OobRedfishCollector + assert OobRedfishPlugin.COLLECTOR_ARGS is OobRedfishCollectorArgs + assert OobRedfishPlugin.CONNECTION_TYPE is RedfishConnectionManager + assert OobRedfishPlugin.ANALYZER is None + + +def test_stub_collector_no_args(stub_oob_redfish_collector): + result, data = stub_oob_redfish_collector.collect_data() + assert result.status == ExecutionStatus.NOT_RAN + assert "required" in result.message.lower() + assert data is None + + +def test_stub_collector_success_minimal(stub_oob_redfish_collector): + args = OobRedfishCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = stub_oob_redfish_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.collected_data == {"events": []} + + +def test_collector_satisfies_reference_time_helper(stub_oob_redfish_collector): + args = OobRedfishCollectorArgs( + rf_event_log_uri=EVENT_URI, + reference_time="2026-05-17", + time_operator=">=", + ) + assert stub_oob_redfish_collector.satisfies_reference_time("2026-05-18", args) + assert not 
stub_oob_redfish_collector.satisfies_reference_time("2026-05-16", args) + + +def test_oob_redfish_device_info_fields(): + info = OobRedfishDeviceInfo( + board_product_name="Board-A", + board_serial_number="BSN-1", + product_version="1.0", + ) + assert info.board_product_name == "Board-A" + assert info.product_version == "1.0" + + +def test_oob_redfish_result_reporting_versions(): + version_fields = build_oob_redfish_reporting_version_fields( + plugin_name="example_oob_redfish", + plugin_version="0.1.0", + node_scraper_version="1.2.3", + isa_version="9.8.7", + ) + result = OobRedfishResult(node="node-1", **version_fields) + assert result.plugin_name == "example_oob_redfish" + assert result.reporter_extensions["isa_version"] == "9.8.7" + + +def test_oob_redfish_data_model_log_model(tmp_path): + model = OobRedfishDataModel( + collected_data={"events": [{"id": 1}]}, + artifacts={"events.json": [{"id": 1}]}, + ) + model.log_model(str(tmp_path)) + assert (tmp_path / "events.json").is_file() + assert (tmp_path / "oob_redfish_data.json").is_file() diff --git a/test/unit/plugin/test_serviceability_collector.py b/test/unit/plugin/test_serviceability_collector.py deleted file mode 100644 index e3a67d5d..00000000 --- a/test/unit/plugin/test_serviceability_collector.py +++ /dev/null @@ -1,326 +0,0 @@ -############################################################################### -# -# MIT License -# -# Copyright (c) 2026 Advanced Micro Devices, Inc. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
-# -############################################################################### -import json -from typing import Any, Optional - -import pytest -from pydantic import ValidationError - -from nodescraper.connection.redfish import ( - RF_MEMBERS, - RF_MEMBERS_COUNT, - RedfishGetResult, -) -from nodescraper.enums import ExecutionStatus -from nodescraper.plugins.serviceability import ( - DeviceInfo, - ServiceabilityCollectorArgs, - ServiceabilityDataModel, - ServiceabilityPluginBase, -) -from nodescraper.plugins.serviceability.serviceability_collector import ( - ServiceabilityCollectorBase, -) - -EVENT_URI = "/redfish/v1/Systems/1/LogServices/SEL/Entries" - - -class _StubServiceabilityCollector(ServiceabilityCollectorBase): - def filter_event_members( - self, - members: list[Any], - args: ServiceabilityCollectorArgs, - ) -> list[Any]: - return members - - def is_cper_event(self, event: dict) -> bool: - return False - - def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: - return {} - - def parse_assembly_entry( - self, - designation: str, - assembly_member_entry: dict[str, Any], - args: ServiceabilityCollectorArgs, - ) -> DeviceInfo: - return DeviceInfo(name=designation, serial_number=assembly_member_entry.get("SerialNumber")) - - def extract_component_details( - self, - firmware_inventory_payload: dict[str, Any], - args: ServiceabilityCollectorArgs, - ) -> Optional[str]: - return firmware_inventory_payload.get("Details") - - -@pytest.fixture -def stub_serviceability_collector(system_info, redfish_conn_mock): - redfish_conn_mock.base_url = "https://bmc.example/redfish/v1" - return _StubServiceabilityCollector( - system_info=system_info, - connection=redfish_conn_mock, - log_path="/tmp/serviceability.log", - ) - - -def test_serviceability_collector_args_requires_event_log_uri(): - with pytest.raises(ValidationError): - ServiceabilityCollectorArgs() - - -def test_serviceability_collector_args_uri_alias_prefers_uri_over_rf_event_log_uri(): - args = 
ServiceabilityCollectorArgs(uri=" /events ", rf_event_log_uri="/other") - assert args.resolved_event_log_uri() == "/events" - - -def test_serviceability_collector_args_assembly_requires_both_template_and_devices(): - with pytest.raises(ValidationError): - ServiceabilityCollectorArgs( - rf_event_log_uri=EVENT_URI, - rf_assembly_uri_template="/redfish/v1/Chassis/{device}/Assembly", - ) - with pytest.raises(ValidationError): - ServiceabilityCollectorArgs( - rf_event_log_uri=EVENT_URI, - rf_chassis_devices=["C1"], - ) - - -def test_serviceability_collector_args_assembly_template_must_include_device_placeholder(): - with pytest.raises(ValidationError): - ServiceabilityCollectorArgs( - rf_event_log_uri=EVENT_URI, - rf_assembly_uri_template="/redfish/v1/Chassis/C1/Assembly", - rf_chassis_devices=["C1"], - ) - - -def test_serviceability_collector_args_assembly_optional_when_omitted(): - args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI) - assert args.rf_assembly_uri_template is None - assert args.rf_chassis_devices is None - - -def test_serviceability_plugin_base_wiring(): - assert ServiceabilityPluginBase.DATA_MODEL is ServiceabilityDataModel - assert ServiceabilityPluginBase.COLLECTOR is ServiceabilityCollectorBase - assert ServiceabilityPluginBase.COLLECTOR_ARGS is ServiceabilityCollectorArgs - assert ServiceabilityPluginBase.ANALYZER is None - - -def test_stub_collector_no_args(stub_serviceability_collector): - result, data = stub_serviceability_collector.collect_data() - assert result.status == ExecutionStatus.NOT_RAN - assert "required" in result.message.lower() - assert data is None - - -def test_stub_collector_event_log_get_fails(stub_serviceability_collector, redfish_conn_mock): - redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( - path=EVENT_URI, - success=False, - error="timeout", - status_code=None, - ) - args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI) - result, data = 
stub_serviceability_collector.collect_data(args=args) - assert result.status == ExecutionStatus.ERROR - assert EVENT_URI in result.message - assert data is None - - -def test_stub_collector_success_minimal(stub_serviceability_collector, redfish_conn_mock): - members = [{"Id": "1"}] - redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( - path=EVENT_URI, - success=True, - data={RF_MEMBERS: members}, - status_code=200, - ) - args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI) - result, data = stub_serviceability_collector.collect_data(args=args) - assert result.status == ExecutionStatus.OK - assert data is not None - assert data.rf_events == members - assert EVENT_URI in data.responses - assert data.bmc_host == "bmc.example" - assert data.log_path == "/tmp/serviceability.log" - redfish_conn_mock.run_get_paged.assert_called_once() - - -def test_stub_collector_filter_raises_maps_to_error( - stub_serviceability_collector, redfish_conn_mock -): - class _BadFilter(_StubServiceabilityCollector): - def filter_event_members(self, members, args): - raise ValueError("bad filter") - - collector = _BadFilter( - system_info=stub_serviceability_collector.system_info, - connection=redfish_conn_mock, - ) - redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( - path=EVENT_URI, - success=True, - data={RF_MEMBERS: []}, - status_code=200, - ) - args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI) - result, data = collector.collect_data(args=args) - assert result.status == ExecutionStatus.ERROR - assert "Event filter failed" in result.message - assert data is None - - -def test_stub_collector_assembly_and_firmware_paths( - stub_serviceability_collector, redfish_conn_mock -): - tpl = "/redfish/v1/Chassis/{device}/Assembly" - asm_uri = tpl.format(device="C1") - fw_uri = "/redfish/v1/UpdateService/FirmwareInventory" - - def run_get_side_effect(path: str, *_args, **_kwargs): - if path == EVENT_URI: - return RedfishGetResult( - path=EVENT_URI, - 
success=True, - data={RF_MEMBERS: []}, - status_code=200, - ) - if path == asm_uri: - return RedfishGetResult( - path=asm_uri, - success=True, - data={"Assemblies": [{"SerialNumber": "SN-ASM"}]}, - status_code=200, - ) - if path == fw_uri: - return RedfishGetResult( - path=fw_uri, - success=True, - data={"Details": "fw-summary"}, - status_code=200, - ) - raise AssertionError(f"unexpected Redfish GET path: {path!r}") - - redfish_conn_mock.run_get.side_effect = run_get_side_effect - - def run_get_paged_forbidden(*_args, **_kwargs): - raise AssertionError("run_get_paged must not run when follow_next_link=False") - - redfish_conn_mock.run_get_paged.side_effect = run_get_paged_forbidden - - args = ServiceabilityCollectorArgs( - rf_event_log_uri=EVENT_URI, - rf_assembly_uri_template=tpl, - rf_chassis_devices=["C1"], - rf_firmware_bundle_uri=fw_uri, - follow_next_link=False, - ) - result, data = stub_serviceability_collector.collect_data(args=args) - assert result.status == ExecutionStatus.OK - assert data is not None - assert "C1" in data.assembly_info - assert data.assembly_info["C1"].serial_number == "SN-ASM" - assert data.component_details == "fw-summary" - assert asm_uri in data.responses - - -def test_stub_collector_top_when_count_exceeds_top_uses_skip_and_paged( - stub_serviceability_collector, redfish_conn_mock -): - probe = RedfishGetResult( - path=f"{EVENT_URI}?$top=1", - success=True, - data={RF_MEMBERS_COUNT: 100}, - status_code=200, - ) - window = RedfishGetResult( - path=f"{EVENT_URI}?$skip=90", - success=True, - data={RF_MEMBERS: [{"Id": "last"}]}, - status_code=200, - ) - redfish_conn_mock.run_get.return_value = probe - redfish_conn_mock.run_get_paged.return_value = window - args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI, top=10) - result, data = stub_serviceability_collector.collect_data(args=args) - assert result.status == ExecutionStatus.OK - assert data is not None - assert data.rf_events == [{"Id": "last"}] - 
redfish_conn_mock.run_get.assert_called_once() - assert "?$top=1" in redfish_conn_mock.run_get.call_args[0][0] - redfish_conn_mock.run_get_paged.assert_called_once_with( - f"{EVENT_URI}?$skip=90", max_pages=args.max_pages - ) - - -def test_stub_collector_top_when_count_within_top_fetches_full_log( - stub_serviceability_collector, redfish_conn_mock -): - probe = RedfishGetResult( - path=f"{EVENT_URI}?$top=1", - success=True, - data={RF_MEMBERS_COUNT: 3}, - status_code=200, - ) - full = RedfishGetResult( - path=EVENT_URI, - success=True, - data={RF_MEMBERS: [{"Id": "a"}, {"Id": "b"}]}, - status_code=200, - ) - redfish_conn_mock.run_get.return_value = probe - redfish_conn_mock.run_get_paged.return_value = full - args = ServiceabilityCollectorArgs(rf_event_log_uri=EVENT_URI, top=50) - result, data = stub_serviceability_collector.collect_data(args=args) - assert result.status == ExecutionStatus.OK - assert data is not None - assert len(data.rf_events) == 2 - redfish_conn_mock.run_get_paged.assert_called_once_with(EVENT_URI, max_pages=args.max_pages) - - -def test_serviceability_data_model_log_model_writes_json(tmp_path): - model = ServiceabilityDataModel( - responses={"/x": {"ok": True}}, - cper_data={"slot": {"raw": "data"}}, - ) - model.log_model(str(tmp_path)) - responses_file = tmp_path / "redfish_responses.json" - cper_file = tmp_path / "cper_data.json" - assert responses_file.is_file() - assert cper_file.is_file() - assert json.loads(responses_file.read_text(encoding="utf-8")) == {"/x": {"ok": True}} - assert json.loads(cper_file.read_text(encoding="utf-8")) == {"slot": {"raw": "data"}} - - -def test_serviceability_data_model_log_model_skips_cper_when_empty(tmp_path): - model = ServiceabilityDataModel(responses={}) - model.log_model(str(tmp_path)) - assert (tmp_path / "redfish_responses.json").is_file() - assert not (tmp_path / "cper_data.json").exists() From f42ff6eabba4c39bca29156b4bda54007fffe846 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 26 May 
2026 12:56:01 -0500 Subject: [PATCH 04/39] addressed reviews + added alllogs utests + missing functionality --- nodescraper/plugins/inband/rocm/rocmdata.py | 30 +- .../plugins/serviceability/__init__.py | 68 +++- .../plugins/serviceability/afid_events.py | 144 ++++++++ .../plugins/serviceability/analyzer_args.py | 109 ++++++ .../{oob_redfish => mi3xx}/__init__.py | 32 +- .../serviceability/mi3xx/mi3xx_analyzer.py | 88 +++++ .../serviceability/mi3xx/mi3xx_collector.py | 107 ++++++ .../mi3xx_collector_args.py} | 16 +- .../mi3xx_data.py} | 14 +- .../mi3xx/serviceability_plugin_mi3xx.py | 44 +++ .../oob_redfish/oob_redfish_collector.py | 76 ---- .../plugins/serviceability/se_adapter.py | 137 ++++++++ .../plugins/serviceability/se_models.py | 85 +++++ .../plugins/serviceability/se_runner.py | 269 ++++++++++++++ .../serviceability_collector.py | 197 +++++++++++ .../serviceability/serviceability_data.py | 100 ++++++ ...lugin.py => serviceability_plugin_base.py} | 23 +- .../plugins/serviceability/time_utils.py | 30 +- .../unit/plugin/test_oob_redfish_collector.py | 181 ---------- .../plugin/test_serviceability_collector.py | 329 ++++++++++++++++++ 20 files changed, 1759 insertions(+), 320 deletions(-) create mode 100644 nodescraper/plugins/serviceability/afid_events.py create mode 100644 nodescraper/plugins/serviceability/analyzer_args.py rename nodescraper/plugins/serviceability/{oob_redfish => mi3xx}/__init__.py (70%) create mode 100644 nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py create mode 100644 nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py rename nodescraper/plugins/serviceability/{oob_redfish/oob_redfish_collector_args.py => mi3xx/mi3xx_collector_args.py} (91%) rename nodescraper/plugins/serviceability/{oob_redfish/oob_redfish_data.py => mi3xx/mi3xx_data.py} (95%) create mode 100644 nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py delete mode 100644 
nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector.py create mode 100644 nodescraper/plugins/serviceability/se_adapter.py create mode 100644 nodescraper/plugins/serviceability/se_models.py create mode 100644 nodescraper/plugins/serviceability/se_runner.py create mode 100644 nodescraper/plugins/serviceability/serviceability_collector.py create mode 100644 nodescraper/plugins/serviceability/serviceability_data.py rename nodescraper/plugins/serviceability/{oob_redfish/oob_redfish_plugin.py => serviceability_plugin_base.py} (69%) delete mode 100644 test/unit/plugin/test_oob_redfish_collector.py create mode 100644 test/unit/plugin/test_serviceability_collector.py diff --git a/nodescraper/plugins/inband/rocm/rocmdata.py b/nodescraper/plugins/inband/rocm/rocmdata.py index c7e75608..eb1794c3 100644 --- a/nodescraper/plugins/inband/rocm/rocmdata.py +++ b/nodescraper/plugins/inband/rocm/rocmdata.py @@ -24,12 +24,22 @@ # ############################################################################### import re -from typing import List +from typing import List, Optional from pydantic import field_validator from nodescraper.models import DataModel +# e.g. 7.13.0, 7.13.0-123, 7.13.0-123-gfx942, 7.13.0-123-gfx942;gfx950 +_ROCM_VERSION_RE = re.compile(r"^\d+(?:\.\d+){0,3}(?:-\d+)?(?:-gfx\d+(?:;gfx\d+)*)?$") +_ROCM_BUILD_NUMBER_RE = re.compile(r"^\d+(?:\.\d+){0,3}-(\d+)") + + +def _validate_rocm_version_string(rocm_version: str) -> str: + if not _ROCM_VERSION_RE.match(rocm_version): + raise ValueError(f"ROCm version has invalid format: {rocm_version}") + return rocm_version + class RocmDataModel(DataModel): rocm_version: str @@ -58,6 +68,18 @@ def validate_rocm_version(cls, rocm_version: str) -> str: Returns: str: The validated ROCm version string. 
""" - if not re.match(r"^\d+(?:\.\d+){0,3}(-\d+)?$", rocm_version): - raise ValueError(f"ROCm version has invalid format: {rocm_version}") - return rocm_version + return _validate_rocm_version_string(rocm_version) + + @field_validator("rocm_sub_versions") + @classmethod + def validate_rocm_sub_versions(cls, rocm_sub_versions: dict[str, str]) -> dict[str, str]: + for value in rocm_sub_versions.values(): + _validate_rocm_version_string(value) + return rocm_sub_versions + + @property + def build_number(self) -> Optional[str]: + """ROCm package build number from version-rocm sub-version or rocm_version.""" + version_str = self.rocm_sub_versions.get("version-rocm") or self.rocm_version + match = _ROCM_BUILD_NUMBER_RE.match(version_str) + return match.group(1) if match else None diff --git a/nodescraper/plugins/serviceability/__init__.py b/nodescraper/plugins/serviceability/__init__.py index 16a87632..ae190bca 100644 --- a/nodescraper/plugins/serviceability/__init__.py +++ b/nodescraper/plugins/serviceability/__init__.py @@ -23,34 +23,72 @@ # SOFTWARE. 
# ############################################################################### -from .oob_redfish import ( - OobRedfishCollector, - OobRedfishCollectorArgs, - OobRedfishDataModel, - OobRedfishDeviceInfo, - OobRedfishPlugin, - OobRedfishResult, - build_oob_redfish_reporting_version_fields, +from .afid_events import build_afid_events_from_data +from .analyzer_args import ServiceabilityAnalyzerArgs +from .mi3xx import ( + Mi3xxAnalyzer, + Mi3xxCollector, + Mi3xxCollectorArgs, + Mi3xxDataModel, + Mi3xxDeviceInfo, + Mi3xxResult, + ServiceabilityPluginMI3XX, + build_mi3xx_reporting_version_fields, ) +from .se_adapter import afid_events_to_engine_input, serviceability_block_from_engine +from .se_models import ( + AfidEvent, + SeInputPayload, + ServiceabilityBlock, + ServiceabilitySolution, +) +from .se_runner import EngineBackend, SeRunError, resolve_engine_command, run_se +from .serviceability_collector import ServiceabilityCollectorBase +from .serviceability_data import ( + DeviceInfo, + ServiceabilityDataModel, + ServiceabilityResult, +) +from .serviceability_plugin_base import ServiceabilityPluginBase from .time_utils import ( TimeOperator, compare_iso_datetime, is_valid_iso_datetime, + normalize_se_timestamp, parse_iso_datetime, satisfies_time_check, ) __all__ = [ - "OobRedfishCollector", - "OobRedfishCollectorArgs", - "OobRedfishDataModel", - "OobRedfishDeviceInfo", - "OobRedfishPlugin", - "OobRedfishResult", + "AfidEvent", + "DeviceInfo", + "EngineBackend", + "Mi3xxAnalyzer", + "Mi3xxCollector", + "Mi3xxCollectorArgs", + "Mi3xxDataModel", + "Mi3xxDeviceInfo", + "Mi3xxResult", + "SeInputPayload", + "SeRunError", + "ServiceabilityAnalyzerArgs", + "ServiceabilityBlock", + "ServiceabilityCollectorBase", + "ServiceabilityDataModel", + "ServiceabilityPluginBase", + "ServiceabilityPluginMI3XX", + "ServiceabilityResult", + "ServiceabilitySolution", "TimeOperator", - "build_oob_redfish_reporting_version_fields", + "afid_events_to_engine_input", + 
"build_afid_events_from_data", + "serviceability_block_from_engine", + "build_mi3xx_reporting_version_fields", "compare_iso_datetime", "is_valid_iso_datetime", + "normalize_se_timestamp", "parse_iso_datetime", + "resolve_engine_command", + "run_se", "satisfies_time_check", ] diff --git a/nodescraper/plugins/serviceability/afid_events.py b/nodescraper/plugins/serviceability/afid_events.py new file mode 100644 index 00000000..2138c0cf --- /dev/null +++ b/nodescraper/plugins/serviceability/afid_events.py @@ -0,0 +1,144 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from __future__ import annotations + +from typing import Any, Optional + +from .se_models import AfidEvent +from .serviceability_data import ServiceabilityDataModel +from .time_utils import normalize_se_timestamp + +_EVENT_TIMESTAMP_KEYS = ("Created", "EventTimestamp", "Timestamp") +_AFID_KEYS = ("Afid", "AFID", "afid") + + +def build_afid_events_from_data(data: ServiceabilityDataModel) -> list[AfidEvent]: + """Build SE input events from collected Redfish and CPER fields.""" + events: list[AfidEvent] = [] + seen: set[tuple[int, str, str]] = set() + + for rf_event in data.rf_events: + parsed = _afid_event_from_rf_member(rf_event) + if parsed is None: + continue + key = (parsed.afid, parsed.serviceable_unit, parsed.time) + if key in seen: + continue + seen.add(key) + events.append(parsed) + + for unit, payload in data.cper_data.items(): + parsed = _afid_event_from_cper_slot(str(unit), payload) + if parsed is None: + continue + key = (parsed.afid, parsed.serviceable_unit, parsed.time) + if key in seen: + continue + seen.add(key) + events.append(parsed) + + return events + + +def _afid_event_from_rf_member(member: Any) -> Optional[AfidEvent]: + if not isinstance(member, dict): + return None + afid = _extract_afid(member) + unit = _extract_serviceable_unit(member) + timestamp = _extract_timestamp(member) + if afid is None or unit is None or timestamp is None: + return None + return AfidEvent( + afid=afid, + serviceable_unit=unit, + time=normalize_se_timestamp(timestamp), + ) + + +def _afid_event_from_cper_slot(unit: str, payload: Any) -> Optional[AfidEvent]: + if not isinstance(payload, dict): + return None + afid = _extract_afid(payload) + timestamp = _extract_timestamp(payload) + unit_name = str(payload.get("serviceable_unit") or unit).strip() + if afid is None or not unit_name or timestamp is None: + return None + return AfidEvent( + afid=afid, + serviceable_unit=unit_name, + 
time=normalize_se_timestamp(timestamp), + ) + + +def _extract_afid(payload: dict[str, Any]) -> Optional[int]: + for key in _AFID_KEYS: + if key in payload and payload[key] is not None: + return int(payload[key]) + oem = payload.get("Oem") + if isinstance(oem, dict): + for vendor_payload in oem.values(): + if isinstance(vendor_payload, dict): + for key in _AFID_KEYS: + if key in vendor_payload and vendor_payload[key] is not None: + return int(vendor_payload[key]) + return None + + +def _extract_serviceable_unit(payload: dict[str, Any]) -> Optional[str]: + for key in ("serviceable_unit", "ServiceableUnit", "OriginOfCondition", "Device"): + value = payload.get(key) + if value is None: + continue + if isinstance(value, dict): + odata_id = value.get("@odata.id") or value.get("odata.id") + if odata_id: + return _unit_from_odata_id(str(odata_id)) + text = str(value).strip() + if text: + return _unit_from_odata_id(text) if "/" in text else text + oem = payload.get("Oem") + if isinstance(oem, dict): + for vendor_payload in oem.values(): + if isinstance(vendor_payload, dict): + unit = vendor_payload.get("serviceable_unit") or vendor_payload.get( + "ServiceableUnit" + ) + if unit is not None and str(unit).strip(): + return str(unit).strip() + return None + + +def _extract_timestamp(payload: dict[str, Any]) -> Optional[str]: + for key in _EVENT_TIMESTAMP_KEYS: + value = payload.get(key) + if value is not None and str(value).strip(): + return str(value).strip() + return None + + +def _unit_from_odata_id(odata_id: str) -> str: + segment = odata_id.rstrip("/").split("/")[-1] + return segment or odata_id diff --git a/nodescraper/plugins/serviceability/analyzer_args.py b/nodescraper/plugins/serviceability/analyzer_args.py new file mode 100644 index 00000000..c20366db --- /dev/null +++ b/nodescraper/plugins/serviceability/analyzer_args.py @@ -0,0 +1,109 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 
Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from __future__ import annotations + +from typing import List, Literal, Optional + +from pydantic import Field, field_validator, model_validator + +from nodescraper.models import AnalyzerArgs + +EngineBackend = Literal["python", "cli", "subprocess"] + + +class ServiceabilityAnalyzerArgs(AnalyzerArgs): + """Analyzer args for serviceability plugins.""" + + engine_backend: EngineBackend = Field( + default="python", + description=( + "How to invoke the SE: 'python' (serviceability_engine bindings), " + "'cli' (external analyze subcommand), or 'subprocess' (--input/--output protocol)." 
+ ), + ) + engine_python_module: str = Field( + default="serviceability_engine", + description="Python package providing ServiceabilityEngine bindings (python backend).", + ) + engine_executable: Optional[str] = Field( + default=None, + description="Path to the SE binary (cli or subprocess backends).", + ) + engine_entry_point: Optional[str] = Field( + default=None, + description=( + "Command for cli/subprocess backends: executable path or argv prefix on PATH. " + "Required when engine_backend is 'cli' or 'subprocess'." + ), + ) + afid_sag_path: Optional[str] = Field( + default=None, + description="Path to AFID_SAG.json.", + ) + engine_extra_args: List[str] = Field( + default_factory=list, + description="Extra CLI arguments (cli/subprocess backends).", + ) + engine_timeout_seconds: int = Field( + default=600, + ge=1, + le=86_400, + description="Subprocess timeout (cli/subprocess backends).", + ) + skip_engine: bool = Field( + default=False, + description="If True, only build afid_events without running the SE.", + ) + + @field_validator("engine_executable", "engine_entry_point", "afid_sag_path") + @classmethod + def _strip_optional_paths(cls, value: Optional[str]) -> Optional[str]: + if value is None: + return None + text = str(value).strip() + return text or None + + @model_validator(mode="after") + def _require_engine_config_when_running(self) -> ServiceabilityAnalyzerArgs: + if self.skip_engine: + return self + if not self.afid_sag_path: + raise ValueError("afid_sag_path is required when running the serviceability engine.") + if self.engine_backend == "python": + return self + has_exe = self.engine_executable is not None + has_entry = self.engine_entry_point is not None + if has_exe and has_entry: + raise ValueError( + "Provide only one of engine_executable or engine_entry_point " + "for cli/subprocess backends." 
+ ) + if not has_exe and not has_entry: + raise ValueError( + "engine_executable or engine_entry_point is required when " + "engine_backend is 'cli' or 'subprocess'." + ) + return self diff --git a/nodescraper/plugins/serviceability/oob_redfish/__init__.py b/nodescraper/plugins/serviceability/mi3xx/__init__.py similarity index 70% rename from nodescraper/plugins/serviceability/oob_redfish/__init__.py rename to nodescraper/plugins/serviceability/mi3xx/__init__.py index e0dae020..25e83a07 100644 --- a/nodescraper/plugins/serviceability/oob_redfish/__init__.py +++ b/nodescraper/plugins/serviceability/mi3xx/__init__.py @@ -23,22 +23,24 @@ # SOFTWARE. # ############################################################################### -from .oob_redfish_collector import OobRedfishCollector -from .oob_redfish_collector_args import OobRedfishCollectorArgs -from .oob_redfish_data import ( - OobRedfishDataModel, - OobRedfishDeviceInfo, - OobRedfishResult, - build_oob_redfish_reporting_version_fields, +from .mi3xx_analyzer import Mi3xxAnalyzer +from .mi3xx_collector import Mi3xxCollector +from .mi3xx_collector_args import Mi3xxCollectorArgs +from .mi3xx_data import ( + Mi3xxDataModel, + Mi3xxDeviceInfo, + Mi3xxResult, + build_mi3xx_reporting_version_fields, ) -from .oob_redfish_plugin import OobRedfishPlugin +from .serviceability_plugin_mi3xx import ServiceabilityPluginMI3XX __all__ = [ - "OobRedfishCollector", - "OobRedfishCollectorArgs", - "OobRedfishDataModel", - "OobRedfishDeviceInfo", - "OobRedfishPlugin", - "OobRedfishResult", - "build_oob_redfish_reporting_version_fields", + "Mi3xxAnalyzer", + "Mi3xxCollector", + "Mi3xxCollectorArgs", + "Mi3xxDataModel", + "Mi3xxDeviceInfo", + "Mi3xxResult", + "ServiceabilityPluginMI3XX", + "build_mi3xx_reporting_version_fields", ] diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py new file mode 100644 index 00000000..cd67bb58 --- /dev/null +++ 
b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py @@ -0,0 +1,88 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from __future__ import annotations + +from typing import Optional + +from nodescraper.enums import ExecutionStatus +from nodescraper.interfaces import DataAnalyzer +from nodescraper.models import TaskResult +from nodescraper.plugins.serviceability.afid_events import build_afid_events_from_data +from nodescraper.plugins.serviceability.analyzer_args import ServiceabilityAnalyzerArgs +from nodescraper.plugins.serviceability.se_models import ServiceabilityBlock +from nodescraper.plugins.serviceability.se_runner import SeRunError, run_se +from nodescraper.plugins.serviceability.serviceability_data import ( + ServiceabilityDataModel, +) + + +class Mi3xxAnalyzer(DataAnalyzer[ServiceabilityDataModel, ServiceabilityAnalyzerArgs]): + """Build AFID events from collected data and run the serviceability engine.""" + + DATA_MODEL = ServiceabilityDataModel + + def analyze_data( + self, + data: ServiceabilityDataModel, + args: Optional[ServiceabilityAnalyzerArgs] = None, + ) -> TaskResult: + if args is None: + self.result.status = ExecutionStatus.NOT_RAN + self.result.message = "ServiceabilityAnalyzerArgs are required" + return self.result + + events = data.afid_events or build_afid_events_from_data(data) + data.afid_events = events + + if args.skip_engine: + data.serviceability = ServiceabilityBlock(afid_events=events) + self.result.status = ExecutionStatus.OK + self.result.message = f"Built {len(events)} AFID event(s); engine skipped" + return self.result + + try: + block = run_se( + engine_backend=args.engine_backend, + engine_python_module=args.engine_python_module, + engine_executable=args.engine_executable, + engine_entry_point=args.engine_entry_point, + afid_events=events, + afid_sag_path=args.afid_sag_path, # type: ignore[arg-type] + extra_args=args.engine_extra_args or None, + timeout_seconds=args.engine_timeout_seconds, + ) + except (SeRunError, ValueError) as exc: + self.result.status = 
ExecutionStatus.ERROR + self.result.message = str(exc) + return self.result + + data.serviceability = block + self.result.status = ExecutionStatus.OK + self.result.message = ( + f"Serviceability engine: {len(block.solution)} solution(s) " + f"from {len(events)} event(s)" + ) + return self.result diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py new file mode 100644 index 00000000..8f73941c --- /dev/null +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py @@ -0,0 +1,107 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from __future__ import annotations + +from typing import Any, Optional + +from nodescraper.plugins.serviceability.serviceability_collector import ( + ServiceabilityCollectorBase, +) +from nodescraper.plugins.serviceability.serviceability_data import DeviceInfo +from nodescraper.plugins.serviceability.time_utils import satisfies_time_check + +from .mi3xx_collector_args import Mi3xxCollectorArgs + +_EVENT_TIMESTAMP_KEYS = ("Created", "EventTimestamp", "Timestamp") + + +class Mi3xxCollector(ServiceabilityCollectorBase[Mi3xxCollectorArgs]): + """MI3xx OOB Redfish serviceability collector.""" + + def satisfies_reference_time( + self, + candidate: str, + args: Mi3xxCollectorArgs, + ) -> bool: + """Test a timestamp against optional reference-time filter settings.""" + if args.reference_time is None or args.time_operator is None: + return True + return satisfies_time_check(candidate, args.reference_time, args.time_operator) + + def filter_event_members( + self, + members: list[Any], + args: Mi3xxCollectorArgs, + ) -> list[Any]: + filtered: list[Any] = [] + for member in members: + if not isinstance(member, dict): + filtered.append(member) + continue + timestamp = self._event_timestamp(member) + if timestamp is None or self.satisfies_reference_time(timestamp, args): + filtered.append(member) + return filtered + + def is_cper_event(self, event: dict) -> bool: + message_id = str(event.get("MessageId", "")).lower() + message = str(event.get("Message", "")).lower() + return "cper" in message_id or "cper" in message or "diagnostic" in message_id + + def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: + return {} + + def parse_assembly_entry( + self, + designation: str, + assembly_member_entry: dict[str, Any], + args: Mi3xxCollectorArgs, + ) -> DeviceInfo: + return DeviceInfo( + name=assembly_member_entry.get("Name") or designation, + 
part_number=assembly_member_entry.get("PartNumber"), + production_date=assembly_member_entry.get("ProductionDate"), + serial_number=assembly_member_entry.get("SerialNumber"), + version=assembly_member_entry.get("Version"), + ) + + def extract_component_details( + self, + firmware_inventory_payload: dict[str, Any], + args: Mi3xxCollectorArgs, + ) -> Optional[str]: + details = firmware_inventory_payload.get("Details") + if details is not None: + return str(details) + return None + + @staticmethod + def _event_timestamp(event: dict[str, Any]) -> Optional[str]: + for key in _EVENT_TIMESTAMP_KEYS: + value = event.get(key) + if value is not None and str(value).strip(): + return str(value).strip() + return None diff --git a/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector_args.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector_args.py similarity index 91% rename from nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector_args.py rename to nodescraper/plugins/serviceability/mi3xx/mi3xx_collector_args.py index 5c1b0687..ae7555d7 100644 --- a/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector_args.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector_args.py @@ -36,8 +36,8 @@ ) -class OobRedfishCollectorArgs(CollectorArgs): - """Arguments for OOB Redfish serviceability collection.""" +class Mi3xxCollectorArgs(CollectorArgs): + """MI3xx OOB Redfish serviceability collector arguments.""" uri: Optional[str] = Field( default=None, @@ -99,7 +99,7 @@ def _validate_reference_time_iso(cls, value: Optional[str]) -> Optional[str]: return text @model_validator(mode="after") - def _require_event_log_uri(self) -> OobRedfishCollectorArgs: + def _require_event_log_uri(self) -> Mi3xxCollectorArgs: if not self.resolved_event_log_uri(): raise ValueError( "Provide a non-empty rf_event_log_uri or uri for the event log collection." 
@@ -107,7 +107,7 @@ def _require_event_log_uri(self) -> OobRedfishCollectorArgs: return self @model_validator(mode="after") - def _assembly_consistency(self) -> OobRedfishCollectorArgs: + def _assembly_consistency(self) -> Mi3xxCollectorArgs: has_tpl = bool( self.rf_assembly_uri_template and "{device}" in self.rf_assembly_uri_template ) @@ -120,7 +120,7 @@ def _assembly_consistency(self) -> OobRedfishCollectorArgs: return self @model_validator(mode="after") - def _reference_time_requires_operator(self) -> OobRedfishCollectorArgs: + def _reference_time_requires_operator(self) -> Mi3xxCollectorArgs: has_ref = self.reference_time is not None has_op = self.time_operator is not None if has_ref != has_op: @@ -128,11 +128,7 @@ def _reference_time_requires_operator(self) -> OobRedfishCollectorArgs: return self def resolved_event_log_uri(self) -> str: - """Resolve the configured event log URI. - - Returns: - Non-empty URI from uri or rf_event_log_uri, or an empty string. - """ + """Return uri or rf_event_log_uri.""" for candidate in (self.uri, self.rf_event_log_uri): if candidate and str(candidate).strip(): return str(candidate).strip() diff --git a/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_data.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_data.py similarity index 95% rename from nodescraper/plugins/serviceability/oob_redfish/oob_redfish_data.py rename to nodescraper/plugins/serviceability/mi3xx/mi3xx_data.py index 6ad69a7b..6c9c268f 100644 --- a/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_data.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_data.py @@ -34,7 +34,7 @@ from nodescraper.models import DataModel -class OobRedfishDeviceInfo(BaseModel): +class Mi3xxDeviceInfo(BaseModel): """Device identity with separate board and product fields.""" board_product_name: Optional[str] = Field( @@ -78,7 +78,7 @@ class OobRedfishDeviceInfo(BaseModel): ) -class OobRedfishResult(BaseModel): +class Mi3xxResult(BaseModel): """Structured 
serviceability report output.""" node: Optional[str] = None @@ -108,7 +108,7 @@ class OobRedfishResult(BaseModel): ) -def build_oob_redfish_reporting_version_fields( +def build_mi3xx_reporting_version_fields( *, plugin_name: Optional[str] = None, plugin_version: Optional[str] = None, @@ -136,14 +136,14 @@ def build_oob_redfish_reporting_version_fields( } -class OobRedfishDataModel(DataModel): +class Mi3xxDataModel(DataModel): """Collected OOB Redfish serviceability data model.""" collected_data: Dict[str, Any] = Field( default_factory=dict, description="Arbitrary keyed payloads from the collector implementation.", ) - device_info: Dict[str, OobRedfishDeviceInfo] = Field( + device_info: Dict[str, Mi3xxDeviceInfo] = Field( default_factory=dict, description="Optional device identity keyed by implementer-defined labels.", ) @@ -156,7 +156,7 @@ class OobRedfishDataModel(DataModel): description="Optional host or service endpoint label (not necessarily a BMC).", ) log_path: Optional[str] = None - result: Optional[OobRedfishResult] = None + result: Optional[Mi3xxResult] = None def log_model(self, log_path: str) -> None: """Write artifact files and a JSON summary under the log directory. 
@@ -174,7 +174,7 @@ def log_model(self, log_path: str) -> None: artifact_path = os.path.join(log_path, str(filename).strip()) with open(artifact_path, "w", encoding="utf-8") as handle: json.dump(payload, handle, indent=2) - summary_path = os.path.join(log_path, "oob_redfish_data.json") + summary_path = os.path.join(log_path, "mi3xx_data.json") with open(summary_path, "w", encoding="utf-8") as handle: json.dump( self.model_dump( diff --git a/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py b/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py new file mode 100644 index 00000000..ee0c510b --- /dev/null +++ b/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py @@ -0,0 +1,44 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from nodescraper.plugins.serviceability.serviceability_data import ( + ServiceabilityDataModel, +) +from nodescraper.plugins.serviceability.serviceability_plugin_base import ( + ServiceabilityPluginBase, +) + +from .mi3xx_analyzer import Mi3xxAnalyzer +from .mi3xx_collector import Mi3xxCollector +from .mi3xx_collector_args import Mi3xxCollectorArgs + + +class ServiceabilityPluginMI3XX(ServiceabilityPluginBase): + """MI3xx OOB Redfish serviceability plugin.""" + + DATA_MODEL = ServiceabilityDataModel + COLLECTOR = Mi3xxCollector + ANALYZER = Mi3xxAnalyzer + COLLECTOR_ARGS = Mi3xxCollectorArgs diff --git a/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector.py b/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector.py deleted file mode 100644 index 503d7103..00000000 --- a/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_collector.py +++ /dev/null @@ -1,76 +0,0 @@ -############################################################################### -# -# MIT License -# -# Copyright (c) 2026 Advanced Micro Devices, Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -############################################################################### -from __future__ import annotations - -from typing import Any, Optional - -from nodescraper.base import RedfishDataCollector -from nodescraper.enums import ExecutionStatus -from nodescraper.models import TaskResult -from nodescraper.plugins.serviceability.time_utils import satisfies_time_check - -from .oob_redfish_collector_args import OobRedfishCollectorArgs -from .oob_redfish_data import OobRedfishDataModel - - -class OobRedfishCollector( - RedfishDataCollector[OobRedfishDataModel, OobRedfishCollectorArgs], -): - """Collect OOB Redfish serviceability data.""" - - DATA_MODEL = OobRedfishDataModel - - def __init__(self, **kwargs: Any) -> None: - self._log_path: Optional[str] = kwargs.pop("log_path", None) - super().__init__(**kwargs) - - def satisfies_reference_time( - self, - candidate: str, - args: OobRedfishCollectorArgs, - ) -> bool: - """Test a timestamp against optional reference-time filter settings. - - Args: - candidate: Timestamp string to test. - args: Collector arguments that may define reference_time and time_operator. - - Returns: - True when no filter is configured or the comparison succeeds. - """ - if args.reference_time is None or args.time_operator is None: - return True - return satisfies_time_check(candidate, args.reference_time, args.time_operator) - - def _missing_args_result(self) -> tuple[TaskResult, None]: - """Build a not-ran result when collector arguments are missing. - - Returns: - Task result with NOT_RAN status and no data model. 
- """ - self.result.status = ExecutionStatus.NOT_RAN - self.result.message = "OobRedfishCollectorArgs are required" - return self.result, None diff --git a/nodescraper/plugins/serviceability/se_adapter.py b/nodescraper/plugins/serviceability/se_adapter.py new file mode 100644 index 00000000..37b5d74c --- /dev/null +++ b/nodescraper/plugins/serviceability/se_adapter.py @@ -0,0 +1,137 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +"""Map node-scraper serviceability models to/from the AMD serviceability-engine API.""" +from __future__ import annotations + +from collections import defaultdict +from typing import Any + +from .se_models import AfidEvent, ServiceabilityBlock, ServiceabilitySolution + +_DEFAULT_SOLUTION_TIERS = ( + "primary_fru_events", + "secondary_actions", +) + + +def afid_events_to_engine_input(afid_events: list[AfidEvent]) -> list[dict[str, Any]]: + """Convert plugin AFID events to serviceability-engine wire-format dicts. + + The engine triages on (afid, location, count). Duplicate (afid, unit) pairs + are merged by summing counts. Timestamp is preserved only on the plugin side. + """ + counts: dict[tuple[str, str], int] = defaultdict(int) + for event in afid_events: + key = (str(event.afid), event.serviceable_unit) + counts[key] += 1 + return [ + {"afid": afid, "location": location, "count": count} + for (afid, location), count in sorted(counts.items()) + ] + + +def recommendations_from_report_dict( + report: dict[str, Any], + *, + solution_tiers: tuple[str, ...] 
= _DEFAULT_SOLUTION_TIERS, +) -> list[dict[str, Any]]: + """Derive grouped recommendations from an :func:`serviceability_engine.api.analyze` report.""" + if "recommendations" in report: + return list(report["recommendations"]) + + grouped: dict[tuple[int, int], list[str]] = defaultdict(list) + for tier in solution_tiers: + for row in report.get(tier, []): + if not isinstance(row, dict): + continue + afid = int(row.get("afid", 0)) + location = str(row.get("location", "")).strip() + action_num = _action_num_from_row(row) + if not location or action_num is None: + continue + key = (afid, action_num) + if location not in grouped[key]: + grouped[key].append(location) + + return [ + { + "afid": afid, + "locations": locations, + "service_action_num": action_num, + } + for (afid, action_num), locations in sorted(grouped.items()) + ] + + +def serviceability_block_from_engine( + afid_events: list[AfidEvent], + report: dict[str, Any], + *, + recommendations: list[dict[str, Any]] | None = None, +) -> ServiceabilityBlock: + """Build the ANC ``serviceability`` block from an engine analysis report.""" + recs = ( + recommendations if recommendations is not None else recommendations_from_report_dict(report) + ) + solutions = [ + ServiceabilitySolution( + afid=int(item["afid"]), + serviceable_unit=list(item["locations"]), + service_action_num=int(item["service_action_num"]), + ) + for item in recs + ] + reasoning = _build_solution_reasoning(afid_events, solutions, report) + return ServiceabilityBlock( + afid_events=list(afid_events), + solution=solutions, + solution_reasoning=reasoning, + ) + + +def _action_num_from_row(row: dict[str, Any]) -> int | None: + if "service_action_num" in row: + return int(row["service_action_num"]) + service_action = row.get("service_action") + if isinstance(service_action, dict) and "id" in service_action: + return int(service_action["id"]) + afid_entry = row.get("afid_entry") + if isinstance(afid_entry, dict) and "service_action_num" in afid_entry: + 
return int(afid_entry["service_action_num"]) + return None + + +def _build_solution_reasoning( + afid_events: list[AfidEvent], + solutions: list[ServiceabilitySolution], + report: dict[str, Any], +) -> str: + sag_pid = report.get("sag_pid") or "unknown" + sag_revision = report.get("sag_revision") or "unknown" + return ( + f"Serviceability engine (SAG {sag_pid} rev {sag_revision}): " + f"{len(solutions)} recommendation(s) from {len(afid_events)} input event(s)." + ) diff --git a/nodescraper/plugins/serviceability/se_models.py b/nodescraper/plugins/serviceability/se_models.py new file mode 100644 index 00000000..75919fc3 --- /dev/null +++ b/nodescraper/plugins/serviceability/se_models.py @@ -0,0 +1,85 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from __future__ import annotations + +from typing import List, Optional + +from pydantic import BaseModel, Field, field_validator + + +class AfidEvent(BaseModel): + """Serviceability engine input: one AFID occurrence on a serviceable unit.""" + + afid: int = Field(description="AMD Fault ID.") + serviceable_unit: str = Field( + description="Unit label (e.g. gpu02); standardized per platform.", + ) + time: str = Field( + description="First-occurrence timestamp (SE format, e.g. 2026-05-07 12:50:42.096-07:00).", + ) + + @field_validator("serviceable_unit") + @classmethod + def _strip_serviceable_unit(cls, value: str) -> str: + text = str(value).strip() + if not text: + raise ValueError("serviceable_unit must be non-empty") + return text + + +class ServiceabilitySolution(BaseModel): + """Serviceability engine output: recommended action for an AFID.""" + + afid: int + serviceable_unit: List[str] = Field( + description="Affected serviceable units for this AFID and service action.", + ) + service_action_num: int = Field( + description="Service action number from AFID_SAG.json.", + ) + + +class ServiceabilityBlock(BaseModel): + """ANC-style serviceability section: SE input, output, and optional reasoning.""" + + afid_events: List[AfidEvent] = Field( + default_factory=list, + description="Input events passed to the serviceability engine.", + ) + solution: List[ServiceabilitySolution] = Field( + default_factory=list, + description="Engine output: recommended service actions.", + ) + solution_reasoning: Optional[str] = Field( + default=None, + description="Human-readable summary of how the engine reached its conclusions.", + ) + + +class SeInputPayload(BaseModel): + """JSON written to the SE ``--input`` file.""" + + afid_events: List[AfidEvent] = Field(default_factory=list) diff --git a/nodescraper/plugins/serviceability/se_runner.py b/nodescraper/plugins/serviceability/se_runner.py new file 
mode 100644 index 00000000..df28426f --- /dev/null +++ b/nodescraper/plugins/serviceability/se_runner.py @@ -0,0 +1,269 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +"""Run the AMD serviceability engine (Python API, CLI, or custom subprocess).""" +from __future__ import annotations + +import importlib +import json +import shlex +import subprocess +import tempfile +from pathlib import Path +from typing import Literal, Optional + +from .se_adapter import afid_events_to_engine_input, serviceability_block_from_engine +from .se_models import AfidEvent, SeInputPayload, ServiceabilityBlock + +EngineBackend = Literal["python", "cli", "subprocess"] + + +class SeRunError(RuntimeError): + """Raised when the serviceability engine fails or returns invalid output.""" + + +def resolve_engine_command( + *, + engine_executable: Optional[str] = None, + engine_entry_point: Optional[str] = None, +) -> list[str]: + """Build the argv prefix for a subprocess or CLI-backed SE invocation.""" + has_exe = bool(engine_executable and str(engine_executable).strip()) + has_entry = bool(engine_entry_point and str(engine_entry_point).strip()) + if has_exe and has_entry: + raise ValueError("Provide only one of engine_executable or engine_entry_point.") + if not has_exe and not has_entry: + raise ValueError("Provide engine_executable or engine_entry_point.") + if has_exe: + return [str(engine_executable).strip()] + return shlex.split(str(engine_entry_point).strip()) + + +def run_se( + *, + engine_backend: EngineBackend = "python", + engine_python_module: str = "serviceability_engine", + engine_executable: Optional[str] = None, + engine_entry_point: Optional[str] = None, + afid_events: list[AfidEvent], + afid_sag_path: str, + extra_args: Optional[list[str]] = None, + timeout_seconds: int = 600, + work_dir: Optional[str] = None, +) -> ServiceabilityBlock: + """Run the SE and return a :class:`ServiceabilityBlock`.""" + sag_path = Path(afid_sag_path) + if not sag_path.is_file(): + raise SeRunError(f"AFID_SAG file not found: {afid_sag_path}") + + if engine_backend == "python": + 
return _run_se_python( + engine_python_module=engine_python_module, + afid_events=afid_events, + afid_sag_path=str(sag_path), + ) + if engine_backend == "cli": + return _run_se_cli( + engine_executable=engine_executable, + engine_entry_point=engine_entry_point, + afid_events=afid_events, + afid_sag_path=str(sag_path), + extra_args=extra_args, + timeout_seconds=timeout_seconds, + work_dir=work_dir, + ) + return _run_se_subprocess( + engine_executable=engine_executable, + engine_entry_point=engine_entry_point, + afid_events=afid_events, + afid_sag_path=str(sag_path), + extra_args=extra_args, + timeout_seconds=timeout_seconds, + work_dir=work_dir, + ) + + +def _run_se_python( + *, + engine_python_module: str, + afid_events: list[AfidEvent], + afid_sag_path: str, +) -> ServiceabilityBlock: + try: + se = importlib.import_module(engine_python_module) + SagDocument = se.SagDocument + ServiceabilityEngine = se.ServiceabilityEngine + EventRecord = se.EventRecord + except (ImportError, AttributeError) as exc: + raise SeRunError( + f"Cannot import {engine_python_module} bindings — install serviceability-engine " + f"and build the Python extension (uv build)." 
+ ) from exc + + wire_events = afid_events_to_engine_input(afid_events) + try: + sag = SagDocument.from_file(afid_sag_path) + records = [ + EventRecord( + afid=str(item["afid"]), + location=str(item["location"]), + count=int(item["count"]), + ) + for item in wire_events + ] + analysis = ServiceabilityEngine(sag).analyze(records) + report = analysis.to_dict() + except Exception as exc: + raise SeRunError(f"Serviceability engine analyze() failed: {exc}") from exc + + return serviceability_block_from_engine(afid_events, report) + + +def _run_se_cli( + *, + engine_executable: Optional[str], + engine_entry_point: Optional[str], + afid_events: list[AfidEvent], + afid_sag_path: str, + extra_args: Optional[list[str]], + timeout_seconds: int, + work_dir: Optional[str], +) -> ServiceabilityBlock: + """Invoke an external engine CLI ``analyze --sag … --input …`` and map stdout JSON.""" + command = resolve_engine_command( + engine_executable=engine_executable, + engine_entry_point=engine_entry_point, + ) + wire_events = afid_events_to_engine_input(afid_events) + + with tempfile.TemporaryDirectory(prefix="nodescraper_se_cli_", dir=work_dir) as tmp: + input_path = Path(tmp) / "events.json" + input_path.write_text(json.dumps(wire_events, indent=2), encoding="utf-8") + argv = [ + *command, + "analyze", + "--sag", + afid_sag_path, + "--input", + str(input_path), + ] + if extra_args: + argv.extend(extra_args) + completed = _run_subprocess(argv, timeout_seconds=timeout_seconds) + + try: + report = json.loads(completed.stdout or "{}") + except json.JSONDecodeError as exc: + raise SeRunError(f"Invalid JSON from serviceability engine CLI: {exc}") from exc + + from .se_adapter import recommendations_from_report_dict + + return serviceability_block_from_engine( + afid_events, + report, + recommendations=recommendations_from_report_dict(report), + ) + + +def _run_se_subprocess( + *, + engine_executable: Optional[str], + engine_entry_point: Optional[str], + afid_events: list[AfidEvent], + 
afid_sag_path: str, + extra_args: Optional[list[str]], + timeout_seconds: int, + work_dir: Optional[str], +) -> ServiceabilityBlock: + """Custom subprocess protocol: ``--input`` / ``--output`` / ``--afid-sag``.""" + command = resolve_engine_command( + engine_executable=engine_executable, + engine_entry_point=engine_entry_point, + ) + payload = SeInputPayload(afid_events=afid_events) + + with tempfile.TemporaryDirectory(prefix="nodescraper_se_", dir=work_dir) as tmp: + tmp_path = Path(tmp) + input_path = tmp_path / "se_input.json" + output_path = tmp_path / "se_output.json" + input_path.write_text( + json.dumps(payload.model_dump(mode="json"), indent=2), + encoding="utf-8", + ) + argv = [ + *command, + "--input", + str(input_path), + "--output", + str(output_path), + "--afid-sag", + str(Path(afid_sag_path).resolve()), + ] + if extra_args: + argv.extend(extra_args) + _run_subprocess(argv, timeout_seconds=timeout_seconds) + + if not output_path.is_file(): + raise SeRunError(f"Serviceability engine did not write output file: {output_path}") + try: + raw = json.loads(output_path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + raise SeRunError(f"Invalid JSON from serviceability engine: {exc}") from exc + + block = ServiceabilityBlock.model_validate(raw) + if not block.afid_events: + block.afid_events = list(afid_events) + return block + + +def _run_subprocess(argv: list[str], *, timeout_seconds: int) -> subprocess.CompletedProcess: + exe = Path(argv[0]) + if not exe.is_file() and not _command_on_path(argv[0]): + raise SeRunError(f"Serviceability engine not found or not executable: {argv[0]!r}") + try: + completed = subprocess.run( + argv, + capture_output=True, + text=True, + timeout=timeout_seconds, + check=False, + ) + except subprocess.TimeoutExpired as exc: + raise SeRunError(f"Serviceability engine timed out after {timeout_seconds}s") from exc + except OSError as exc: + raise SeRunError(f"Failed to start serviceability engine: {exc}") from exc + 
+ if completed.returncode != 0: + stderr = (completed.stderr or "").strip() + stdout = (completed.stdout or "").strip() + detail = stderr or stdout or f"exit code {completed.returncode}" + raise SeRunError(f"Serviceability engine failed: {detail}") + return completed + + +def _command_on_path(name: str) -> bool: + from shutil import which + + return which(name) is not None diff --git a/nodescraper/plugins/serviceability/serviceability_collector.py b/nodescraper/plugins/serviceability/serviceability_collector.py new file mode 100644 index 00000000..961afdf9 --- /dev/null +++ b/nodescraper/plugins/serviceability/serviceability_collector.py @@ -0,0 +1,197 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from __future__ import annotations + +import abc +from typing import Any, Generic, Optional, Protocol, TypeVar, cast +from urllib.parse import urlparse + +from nodescraper.base import RedfishDataCollector +from nodescraper.connection.redfish import RF_MEMBERS, RF_MEMBERS_COUNT +from nodescraper.enums import ExecutionStatus +from nodescraper.models import CollectorArgs, TaskResult + +from .serviceability_data import DeviceInfo, ServiceabilityDataModel + + +class _ServiceabilityCollectArg(Protocol): + follow_next_link: bool + max_pages: int + top: Optional[int] + rf_assembly_uri_template: Optional[str] + rf_chassis_devices: Optional[list[str]] + rf_firmware_bundle_uri: Optional[str] + + def resolved_event_log_uri(self) -> str: ... + + +TServiceabilityCollectArg = TypeVar("TServiceabilityCollectArg", bound=_ServiceabilityCollectArg) + + +class ServiceabilityCollectorBase( + RedfishDataCollector[ServiceabilityDataModel, CollectorArgs], + Generic[TServiceabilityCollectArg], +): + """OOB Redfish collection skeleton; subclasses implement filtering, CPER handling, and JSON parsing.""" + + DATA_MODEL = ServiceabilityDataModel + + def __init__(self, **kwargs: Any) -> None: + self._log_path: Optional[str] = kwargs.get("log_path") + super().__init__(**kwargs) + + @abc.abstractmethod + def filter_event_members( + self, + members: list[Any], + args: TServiceabilityCollectArg, + ) -> list[Any]: + """Return the event list to retain for downstream analysis.""" + + @abc.abstractmethod + def is_cper_event(self, event: dict) -> bool: + """Return whether a Redfish event entry should be treated as diagnostic-backed.""" + + @abc.abstractmethod + def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: + """Fetch and decode diagnostic attachments for qualifying events (subclass-defined).""" + + @abc.abstractmethod + def parse_assembly_entry( + self, + designation: str, + assembly_member_entry: 
dict[str, Any], + args: TServiceabilityCollectArg, + ) -> DeviceInfo: + """Map one Assemblies[] member dict into DeviceInfo.""" + + @abc.abstractmethod + def extract_component_details( + self, + firmware_inventory_payload: dict[str, Any], + args: TServiceabilityCollectArg, + ) -> Optional[str]: + """Derive component-details text from a firmware inventory GET payload, or None.""" + + def _fetch_event_log(self, args: TServiceabilityCollectArg, uri: str): + if args.follow_next_link: + return self._run_redfish_get_paged(uri, max_pages=args.max_pages) + return self._run_redfish_get(uri, log_artifact=True) + + def collect_data( + self, args: Optional[CollectorArgs] = None + ) -> tuple[TaskResult, Optional[ServiceabilityDataModel]]: + if args is None: + self.result.status = ExecutionStatus.NOT_RAN + self.result.message = "Collector args are required" + return self.result, None + + svc_args = cast(TServiceabilityCollectArg, args) + event_uri = svc_args.resolved_event_log_uri() + if svc_args.top is not None: + res = self._fetch_top(svc_args, svc_args.top, svc_args.max_pages) + else: + res = self._fetch_event_log(svc_args, event_uri) + + if not res.success or res.data is None: + self.result.status = ExecutionStatus.ERROR + self.result.message = f"Redfish GET failed for {event_uri}: {res.error}" + return self.result, None + + members = res.data.get(RF_MEMBERS, []) + responses = {res.path: res.data} + raw_base_url = getattr(self.connection, "base_url", None) + bmc_host = urlparse(raw_base_url).hostname if raw_base_url else None + + try: + filtered_members = self.filter_event_members(members, svc_args) + except ValueError as exc: + self.result.status = ExecutionStatus.ERROR + self.result.message = f"Event filter failed: {exc}" + return self.result, None + + assembly_info: dict[str, DeviceInfo] = {} + tpl = svc_args.rf_assembly_uri_template + devices = svc_args.rf_chassis_devices + if tpl and devices: + for device in devices: + uri_asm = tpl.format(device=device) + assembly_res 
= self._run_redfish_get(uri_asm, log_artifact=True) + if not assembly_res.success or assembly_res.data is None: + continue + responses[assembly_res.path] = assembly_res.data + + assemblies = assembly_res.data.get("Assemblies", []) + if not assemblies: + continue + + entry = assemblies[0] + assembly_info[device] = self.parse_assembly_entry(device, entry, svc_args) + + cper_data = self.collect_cper_data(filtered_members or []) + + data = ServiceabilityDataModel( + responses=responses, + rf_events=filtered_members or [], + assembly_info=assembly_info, + cper_data=cper_data, + component_details=self._fetch_component_details(responses, svc_args), + log_path=self._log_path, + bmc_host=bmc_host, + ) + self.result.status = ExecutionStatus.OK + self.result.message = f"Collected {len(members)} event log member(s)" + return self.result, data + + def _fetch_component_details( + self, responses: dict[str, Any], args: TServiceabilityCollectArg + ) -> Optional[str]: + fw_uri = args.rf_firmware_bundle_uri + if not fw_uri or not str(fw_uri).strip(): + return None + fw_uri = str(fw_uri).strip() + fw_res = self._run_redfish_get(fw_uri, log_artifact=True) + if not fw_res.success or fw_res.data is None: + return None + responses[fw_res.path] = fw_res.data + return self.extract_component_details(fw_res.data, args) + + def _fetch_top(self, args: TServiceabilityCollectArg, top: int, max_pages: int): + event_uri = args.resolved_event_log_uri() + probe = self._run_redfish_get(f"{event_uri}?$top=1", log_artifact=True) + if not probe.success or probe.data is None: + return probe + + count = probe.data.get(RF_MEMBERS_COUNT, 0) + + if count <= top: + return self._fetch_event_log(args, event_uri) + + skip = count - top + skip_uri = f"{event_uri}?$skip={skip}" + if args.follow_next_link: + return self._run_redfish_get_paged(skip_uri, max_pages=max_pages) + return self._run_redfish_get(skip_uri, log_artifact=True) diff --git a/nodescraper/plugins/serviceability/serviceability_data.py 
b/nodescraper/plugins/serviceability/serviceability_data.py new file mode 100644 index 00000000..68a7daea --- /dev/null +++ b/nodescraper/plugins/serviceability/serviceability_data.py @@ -0,0 +1,100 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from __future__ import annotations + +import json +import os +from typing import Any, Dict, List, Optional + +from pydantic import BaseModel, Field + +from nodescraper.models import DataModel + +from .se_models import AfidEvent, ServiceabilityBlock + + +class DeviceInfo(BaseModel): + """Chassis fields from Assembly parsing; extra vendor keys belong in oem_extensions.""" + + name: Optional[str] = None + part_number: Optional[str] = None + production_date: Optional[str] = None + serial_number: Optional[str] = None + version: Optional[str] = None + oem_extensions: Dict[str, Any] = Field( + default_factory=dict, + description="Opaque vendor/product extensions parsed by the concrete collector.", + ) + + +class ServiceabilityResult(BaseModel): + """Structured serviceability output (typically populated by a downstream analyzer).""" + + node: Optional[str] = None + service_recommendations: Dict[str, List[dict]] = {} + service_action_definitions: Dict[str, dict] = {} + afid_sag_metadata: Dict[str, Any] = {} + node_info: Dict[str, Any] = {} + + +class ServiceabilityDataModel(DataModel): + """Collected Redfish responses and intermediate serviceability fields.""" + + responses: dict[str, Any] = {} + rf_events: list[Any] = [] + assembly_info: Dict[str, DeviceInfo] = {} + cper_data: Dict[str, Any] = {} + component_details: Optional[str] = None + log_path: Optional[str] = None + bmc_host: Optional[str] = None + afid_events: List[AfidEvent] = Field( + default_factory=list, + description="Serviceability engine input; built during analysis when not pre-filled.", + ) + serviceability: Optional[ServiceabilityBlock] = Field( + default=None, + description="ANC-style serviceability block (SE input + output).", + ) + result: Optional[ServiceabilityResult] = None + + def log_model(self, log_path: str) -> None: + """Write collector artifacts and optional serviceability.json under log_path.""" + 
os.makedirs(log_path, exist_ok=True) + responses_path = os.path.join(log_path, "redfish_responses.json") + with open(responses_path, "w", encoding="utf-8") as f: + json.dump(self.responses, f, indent=2) + if self.cper_data: + cper_path = os.path.join(log_path, "cper_data.json") + with open(cper_path, "w", encoding="utf-8") as f: + json.dump(self.cper_data, f, indent=2) + if self.serviceability is not None: + serviceability_path = os.path.join(log_path, "serviceability.json") + with open(serviceability_path, "w", encoding="utf-8") as f: + json.dump( + self.serviceability.model_dump(mode="json"), + f, + indent=2, + ) diff --git a/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_plugin.py b/nodescraper/plugins/serviceability/serviceability_plugin_base.py similarity index 69% rename from nodescraper/plugins/serviceability/oob_redfish/oob_redfish_plugin.py rename to nodescraper/plugins/serviceability/serviceability_plugin_base.py index b891c522..991a2f99 100644 --- a/nodescraper/plugins/serviceability/oob_redfish/oob_redfish_plugin.py +++ b/nodescraper/plugins/serviceability/serviceability_plugin_base.py @@ -24,17 +24,22 @@ # ############################################################################### from nodescraper.base import OOBandDataPlugin +from nodescraper.models import CollectorArgs -from .oob_redfish_collector import OobRedfishCollector -from .oob_redfish_collector_args import OobRedfishCollectorArgs -from .oob_redfish_data import OobRedfishDataModel +from .analyzer_args import ServiceabilityAnalyzerArgs +from .serviceability_collector import ServiceabilityCollectorBase +from .serviceability_data import ServiceabilityDataModel -class OobRedfishPlugin( - OOBandDataPlugin[OobRedfishDataModel, OobRedfishCollectorArgs, None], +class ServiceabilityPluginBase( + OOBandDataPlugin[ + ServiceabilityDataModel, + CollectorArgs, + ServiceabilityAnalyzerArgs, + ], ): - """OOB Redfish serviceability plugin base.""" + """OOB Redfish plugin stub; subclass with a 
concrete COLLECTOR and COLLECTOR_ARGS.""" - DATA_MODEL = OobRedfishDataModel - COLLECTOR = OobRedfishCollector - COLLECTOR_ARGS = OobRedfishCollectorArgs + DATA_MODEL = ServiceabilityDataModel + COLLECTOR = ServiceabilityCollectorBase + ANALYZER_ARGS = ServiceabilityAnalyzerArgs diff --git a/nodescraper/plugins/serviceability/time_utils.py b/nodescraper/plugins/serviceability/time_utils.py index 8bbc8a83..5653f4a9 100644 --- a/nodescraper/plugins/serviceability/time_utils.py +++ b/nodescraper/plugins/serviceability/time_utils.py @@ -49,11 +49,33 @@ def is_valid_iso_datetime(value: str) -> bool: return True +def normalize_se_timestamp(value: str) -> str: + """Normalize a timestamp to the serviceability engine wire format. + + Accepts ISO-8601 (``2026-05-07T12:50:42``) and SE-style strings with a space + separator (``2026-05-07 12:50:42.096-07:00``). + """ + text = str(value).strip() + if not text: + raise ValueError("Empty datetime string") + if " " in text and "T" not in text: + return text + parsed = parse_iso_datetime(text) + micro = parsed.microsecond + base = parsed.strftime("%Y-%m-%d %H:%M:%S") + if micro: + base = f"{base}.{micro:06d}".rstrip("0").rstrip(".") + offset = parsed.strftime("%z") + if offset: + return f"{base}{offset[:3]}:{offset[3:]}" + return base + + def parse_iso_datetime(value: str) -> datetime: - """Parse an ISO-8601 date or date-time string. + """Parse an ISO-8601 or SE-style date-time string. Args: - value: Date (e.g. 2026-05-17) or date-time (e.g. 2026-05-17T13:01:00). + value: Date (e.g. 2026-05-17), ISO date-time, or SE format with a space separator. Returns: Parsed datetime. 
@@ -63,11 +85,13 @@ def parse_iso_datetime(value: str) -> datetime: raise ValueError("Empty datetime string") if text.endswith("Z"): text = f"{text[:-1]}+00:00" + if " " in text and "T" not in text: + text = text.replace(" ", "T", 1) try: parsed = datetime.fromisoformat(text) except ValueError as exc: raise ValueError(f"Not ISO-8601 compliant: {value!r}") from exc - if "T" not in text and "+" not in text and text.count("-") == 2: + if "T" not in value and "+" not in value and value.count("-") == 2: return parsed.replace(hour=0, minute=0, second=0, microsecond=0) return parsed diff --git a/test/unit/plugin/test_oob_redfish_collector.py b/test/unit/plugin/test_oob_redfish_collector.py deleted file mode 100644 index e729cedc..00000000 --- a/test/unit/plugin/test_oob_redfish_collector.py +++ /dev/null @@ -1,181 +0,0 @@ -############################################################################### -# -# MIT License -# -# Copyright (c) 2026 Advanced Micro Devices, Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -############################################################################### -from typing import Optional - -import pytest -from pydantic import ValidationError - -from nodescraper.base import OOBandDataPlugin -from nodescraper.connection.redfish import RedfishConnectionManager -from nodescraper.enums import ExecutionStatus -from nodescraper.plugins.serviceability import ( - OobRedfishCollector, - OobRedfishCollectorArgs, - OobRedfishDataModel, - OobRedfishDeviceInfo, - OobRedfishPlugin, - OobRedfishResult, - build_oob_redfish_reporting_version_fields, - compare_iso_datetime, - is_valid_iso_datetime, - satisfies_time_check, -) - -EVENT_URI = "/redfish/v1/Systems/1/LogServices/SEL/Entries" - - -class _StubOobRedfishCollector(OobRedfishCollector): - def collect_data(self, args: Optional[OobRedfishCollectorArgs] = None): - if args is None: - return self._missing_args_result() - data = OobRedfishDataModel( - collected_data={"events": []}, - log_path=self._log_path, - ) - self.result.status = ExecutionStatus.OK - self.result.message = "stub collection complete" - return self.result, data - - -@pytest.fixture -def stub_oob_redfish_collector(system_info, redfish_conn_mock): - return _StubOobRedfishCollector( - system_info=system_info, - connection=redfish_conn_mock, - log_path="/tmp/oob_redfish.log", - ) - - -def test_oob_redfish_collector_args_requires_event_log_uri(): - with pytest.raises(ValidationError): - OobRedfishCollectorArgs() - - -def test_oob_redfish_collector_args_uri_alias(): - args = OobRedfishCollectorArgs(uri=" /events ", rf_event_log_uri="/other") - assert args.resolved_event_log_uri() == "/events" - - -def 
test_oob_redfish_collector_args_assembly_requires_both_template_and_devices(): - with pytest.raises(ValidationError): - OobRedfishCollectorArgs( - rf_event_log_uri=EVENT_URI, - rf_assembly_uri_template="/redfish/v1/Chassis/{device}/Assembly", - ) - with pytest.raises(ValidationError): - OobRedfishCollectorArgs( - rf_event_log_uri=EVENT_URI, - rf_chassis_devices=["C1"], - ) - - -def test_oob_redfish_collector_args_reference_time_requires_operator(): - with pytest.raises(ValidationError): - OobRedfishCollectorArgs( - rf_event_log_uri=EVENT_URI, - reference_time="2026-05-17", - ) - - -def test_oob_redfish_collector_args_accepts_iso_date_and_datetime(): - date_args = OobRedfishCollectorArgs( - rf_event_log_uri=EVENT_URI, - reference_time="2026-05-17", - time_operator=">=", - ) - assert date_args.reference_time == "2026-05-17" - - -def test_time_utils_iso_validation_and_comparison(): - assert is_valid_iso_datetime("2026-05-17") - assert satisfies_time_check("2026-05-18", "2026-05-17", ">") - assert compare_iso_datetime("2026-05-17T13:01:00", "2026-05-17T13:01:00", "==") - - -def test_oob_redfish_plugin_wiring(): - assert issubclass(OobRedfishPlugin, OOBandDataPlugin) - assert OobRedfishPlugin.DATA_MODEL is OobRedfishDataModel - assert OobRedfishPlugin.COLLECTOR is OobRedfishCollector - assert OobRedfishPlugin.COLLECTOR_ARGS is OobRedfishCollectorArgs - assert OobRedfishPlugin.CONNECTION_TYPE is RedfishConnectionManager - assert OobRedfishPlugin.ANALYZER is None - - -def test_stub_collector_no_args(stub_oob_redfish_collector): - result, data = stub_oob_redfish_collector.collect_data() - assert result.status == ExecutionStatus.NOT_RAN - assert "required" in result.message.lower() - assert data is None - - -def test_stub_collector_success_minimal(stub_oob_redfish_collector): - args = OobRedfishCollectorArgs(rf_event_log_uri=EVENT_URI) - result, data = stub_oob_redfish_collector.collect_data(args=args) - assert result.status == ExecutionStatus.OK - assert data is not None - 
assert data.collected_data == {"events": []} - - -def test_collector_satisfies_reference_time_helper(stub_oob_redfish_collector): - args = OobRedfishCollectorArgs( - rf_event_log_uri=EVENT_URI, - reference_time="2026-05-17", - time_operator=">=", - ) - assert stub_oob_redfish_collector.satisfies_reference_time("2026-05-18", args) - assert not stub_oob_redfish_collector.satisfies_reference_time("2026-05-16", args) - - -def test_oob_redfish_device_info_fields(): - info = OobRedfishDeviceInfo( - board_product_name="Board-A", - board_serial_number="BSN-1", - product_version="1.0", - ) - assert info.board_product_name == "Board-A" - assert info.product_version == "1.0" - - -def test_oob_redfish_result_reporting_versions(): - version_fields = build_oob_redfish_reporting_version_fields( - plugin_name="example_oob_redfish", - plugin_version="0.1.0", - node_scraper_version="1.2.3", - isa_version="9.8.7", - ) - result = OobRedfishResult(node="node-1", **version_fields) - assert result.plugin_name == "example_oob_redfish" - assert result.reporter_extensions["isa_version"] == "9.8.7" - - -def test_oob_redfish_data_model_log_model(tmp_path): - model = OobRedfishDataModel( - collected_data={"events": [{"id": 1}]}, - artifacts={"events.json": [{"id": 1}]}, - ) - model.log_model(str(tmp_path)) - assert (tmp_path / "events.json").is_file() - assert (tmp_path / "oob_redfish_data.json").is_file() diff --git a/test/unit/plugin/test_serviceability_collector.py b/test/unit/plugin/test_serviceability_collector.py new file mode 100644 index 00000000..d7496288 --- /dev/null +++ b/test/unit/plugin/test_serviceability_collector.py @@ -0,0 +1,329 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import json +from typing import Any, Optional + +import pytest +from pydantic import ValidationError + +from nodescraper.connection.redfish import ( + RF_MEMBERS, + RF_MEMBERS_COUNT, + RedfishGetResult, +) +from nodescraper.enums import ExecutionStatus +from nodescraper.models import CollectorArgs +from nodescraper.plugins.serviceability import ( + DeviceInfo, + Mi3xxCollectorArgs, + ServiceabilityAnalyzerArgs, + ServiceabilityDataModel, + ServiceabilityPluginBase, +) +from nodescraper.plugins.serviceability.serviceability_collector import ( + ServiceabilityCollectorBase, +) + +EVENT_URI = "/redfish/v1/Systems/1/LogServices/SEL/Entries" + + +class _StubServiceabilityCollector(ServiceabilityCollectorBase[Mi3xxCollectorArgs]): + def filter_event_members( + self, + members: list[Any], + args: Mi3xxCollectorArgs, + ) -> list[Any]: + return members + + def is_cper_event(self, event: dict) -> bool: + return False + + def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: + return {} + + def parse_assembly_entry( + self, + designation: str, + assembly_member_entry: dict[str, Any], + args: Mi3xxCollectorArgs, + ) -> DeviceInfo: + return DeviceInfo(name=designation, serial_number=assembly_member_entry.get("SerialNumber")) + + def extract_component_details( + self, + firmware_inventory_payload: dict[str, Any], + args: Mi3xxCollectorArgs, + ) -> Optional[str]: + return firmware_inventory_payload.get("Details") + + +@pytest.fixture +def stub_serviceability_collector(system_info, redfish_conn_mock): + redfish_conn_mock.base_url = "https://bmc.example/redfish/v1" + return _StubServiceabilityCollector( + system_info=system_info, + connection=redfish_conn_mock, + log_path="/tmp/serviceability.log", + ) + + +def test_mi3xx_collector_args_requires_event_log_uri(): + with pytest.raises(ValidationError): + Mi3xxCollectorArgs() + + +def 
test_mi3xx_collector_args_uri_alias_prefers_uri_over_rf_event_log_uri(): + args = Mi3xxCollectorArgs(uri=" /events ", rf_event_log_uri="/other") + assert args.resolved_event_log_uri() == "/events" + + +def test_mi3xx_collector_args_assembly_requires_both_template_and_devices(): + with pytest.raises(ValidationError): + Mi3xxCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_assembly_uri_template="/redfish/v1/Chassis/{device}/Assembly", + ) + with pytest.raises(ValidationError): + Mi3xxCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_chassis_devices=["C1"], + ) + + +def test_mi3xx_collector_args_assembly_template_must_include_device_placeholder(): + with pytest.raises(ValidationError): + Mi3xxCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_assembly_uri_template="/redfish/v1/Chassis/C1/Assembly", + rf_chassis_devices=["C1"], + ) + + +def test_mi3xx_collector_args_assembly_optional_when_omitted(): + args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI) + assert args.rf_assembly_uri_template is None + assert args.rf_chassis_devices is None + + +def test_serviceability_plugin_base_wiring(): + assert ServiceabilityPluginBase.DATA_MODEL is ServiceabilityDataModel + assert ServiceabilityPluginBase.COLLECTOR is ServiceabilityCollectorBase + assert getattr(ServiceabilityPluginBase, "COLLECTOR_ARGS", CollectorArgs) is CollectorArgs + assert ServiceabilityPluginBase.ANALYZER_ARGS is ServiceabilityAnalyzerArgs + assert ServiceabilityPluginBase.ANALYZER is None + + +def test_stub_collector_no_args(stub_serviceability_collector): + result, data = stub_serviceability_collector.collect_data() + assert result.status == ExecutionStatus.NOT_RAN + assert "required" in result.message.lower() + assert data is None + + +def test_stub_collector_event_log_get_fails(stub_serviceability_collector, redfish_conn_mock): + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=False, + error="timeout", + status_code=None, + ) + args = 
Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.ERROR + assert EVENT_URI in result.message + assert data is None + + +def test_stub_collector_success_minimal(stub_serviceability_collector, redfish_conn_mock): + members = [{"Id": "1"}] + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=True, + data={RF_MEMBERS: members}, + status_code=200, + ) + args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.rf_events == members + assert EVENT_URI in data.responses + assert data.bmc_host == "bmc.example" + assert data.log_path == "/tmp/serviceability.log" + redfish_conn_mock.run_get_paged.assert_called_once() + + +def test_stub_collector_filter_raises_maps_to_error( + stub_serviceability_collector, redfish_conn_mock +): + class _BadFilter(_StubServiceabilityCollector): + def filter_event_members(self, members, args): + raise ValueError("bad filter") + + collector = _BadFilter( + system_info=stub_serviceability_collector.system_info, + connection=redfish_conn_mock, + ) + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=True, + data={RF_MEMBERS: []}, + status_code=200, + ) + args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = collector.collect_data(args=args) + assert result.status == ExecutionStatus.ERROR + assert "Event filter failed" in result.message + assert data is None + + +def test_stub_collector_assembly_and_firmware_paths( + stub_serviceability_collector, redfish_conn_mock +): + tpl = "/redfish/v1/Chassis/{device}/Assembly" + asm_uri = tpl.format(device="C1") + fw_uri = "/redfish/v1/UpdateService/FirmwareInventory" + + def run_get_side_effect(path: str, *_args, **_kwargs): + if path == EVENT_URI: + 
return RedfishGetResult( + path=EVENT_URI, + success=True, + data={RF_MEMBERS: []}, + status_code=200, + ) + if path == asm_uri: + return RedfishGetResult( + path=asm_uri, + success=True, + data={"Assemblies": [{"SerialNumber": "SN-ASM"}]}, + status_code=200, + ) + if path == fw_uri: + return RedfishGetResult( + path=fw_uri, + success=True, + data={"Details": "fw-summary"}, + status_code=200, + ) + raise AssertionError(f"unexpected Redfish GET path: {path!r}") + + redfish_conn_mock.run_get.side_effect = run_get_side_effect + + def run_get_paged_forbidden(*_args, **_kwargs): + raise AssertionError("run_get_paged must not run when follow_next_link=False") + + redfish_conn_mock.run_get_paged.side_effect = run_get_paged_forbidden + + args = Mi3xxCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_assembly_uri_template=tpl, + rf_chassis_devices=["C1"], + rf_firmware_bundle_uri=fw_uri, + follow_next_link=False, + ) + result, data = stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert "C1" in data.assembly_info + assert data.assembly_info["C1"].serial_number == "SN-ASM" + assert data.component_details == "fw-summary" + assert asm_uri in data.responses + + +def test_stub_collector_top_when_count_exceeds_top_uses_skip_and_paged( + stub_serviceability_collector, redfish_conn_mock +): + probe = RedfishGetResult( + path=f"{EVENT_URI}?$top=1", + success=True, + data={RF_MEMBERS_COUNT: 100}, + status_code=200, + ) + window = RedfishGetResult( + path=f"{EVENT_URI}?$skip=90", + success=True, + data={RF_MEMBERS: [{"Id": "last"}]}, + status_code=200, + ) + redfish_conn_mock.run_get.return_value = probe + redfish_conn_mock.run_get_paged.return_value = window + args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI, top=10) + result, data = stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.rf_events == [{"Id": "last"}] + 
redfish_conn_mock.run_get.assert_called_once() + assert "?$top=1" in redfish_conn_mock.run_get.call_args[0][0] + redfish_conn_mock.run_get_paged.assert_called_once_with( + f"{EVENT_URI}?$skip=90", max_pages=args.max_pages + ) + + +def test_stub_collector_top_when_count_within_top_fetches_full_log( + stub_serviceability_collector, redfish_conn_mock +): + probe = RedfishGetResult( + path=f"{EVENT_URI}?$top=1", + success=True, + data={RF_MEMBERS_COUNT: 3}, + status_code=200, + ) + full = RedfishGetResult( + path=EVENT_URI, + success=True, + data={RF_MEMBERS: [{"Id": "a"}, {"Id": "b"}]}, + status_code=200, + ) + redfish_conn_mock.run_get.return_value = probe + redfish_conn_mock.run_get_paged.return_value = full + args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI, top=50) + result, data = stub_serviceability_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert len(data.rf_events) == 2 + redfish_conn_mock.run_get_paged.assert_called_once_with(EVENT_URI, max_pages=args.max_pages) + + +def test_serviceability_data_model_log_model_writes_json(tmp_path): + model = ServiceabilityDataModel( + responses={"/x": {"ok": True}}, + cper_data={"slot": {"raw": "data"}}, + ) + model.log_model(str(tmp_path)) + responses_file = tmp_path / "redfish_responses.json" + cper_file = tmp_path / "cper_data.json" + assert responses_file.is_file() + assert cper_file.is_file() + assert json.loads(responses_file.read_text(encoding="utf-8")) == {"/x": {"ok": True}} + assert json.loads(cper_file.read_text(encoding="utf-8")) == {"slot": {"raw": "data"}} + + +def test_serviceability_data_model_log_model_skips_cper_when_empty(tmp_path): + model = ServiceabilityDataModel(responses={}) + model.log_model(str(tmp_path)) + assert (tmp_path / "redfish_responses.json").is_file() + assert not (tmp_path / "cper_data.json").exists() From af9a02d23c32a8fe2d0990aec30229dd7b6eebac Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 26 May 2026 
14:32:47 -0500 Subject: [PATCH 05/39] rename --- .../plugins/serviceability/analyzer_args.py | 8 +++--- .../serviceability/mi3xx/mi3xx_analyzer.py | 5 ++-- .../plugins/serviceability/se_adapter.py | 8 +++--- .../plugins/serviceability/se_models.py | 6 ++-- .../plugins/serviceability/se_runner.py | 28 +++++++++---------- .../serviceability/serviceability_data.py | 2 +- .../plugins/serviceability/time_utils.py | 2 +- 7 files changed, 29 insertions(+), 30 deletions(-) diff --git a/nodescraper/plugins/serviceability/analyzer_args.py b/nodescraper/plugins/serviceability/analyzer_args.py index c20366db..d9fa09bb 100644 --- a/nodescraper/plugins/serviceability/analyzer_args.py +++ b/nodescraper/plugins/serviceability/analyzer_args.py @@ -40,13 +40,13 @@ class ServiceabilityAnalyzerArgs(AnalyzerArgs): engine_backend: EngineBackend = Field( default="python", description=( - "How to invoke the SE: 'python' (serviceability_engine bindings), " + "How to invoke the SE: 'python' (service_hub bindings), " "'cli' (external analyze subcommand), or 'subprocess' (--input/--output protocol)." 
), ) engine_python_module: str = Field( - default="serviceability_engine", - description="Python package providing ServiceabilityEngine bindings (python backend).", + default="service_hub", + description="Python package providing ServiceHub bindings (python backend).", ) engine_executable: Optional[str] = Field( default=None, @@ -91,7 +91,7 @@ def _require_engine_config_when_running(self) -> ServiceabilityAnalyzerArgs: if self.skip_engine: return self if not self.afid_sag_path: - raise ValueError("afid_sag_path is required when running the serviceability engine.") + raise ValueError("afid_sag_path is required when running Service Hub.") if self.engine_backend == "python": return self has_exe = self.engine_executable is not None diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py index cd67bb58..d74f297a 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py @@ -40,7 +40,7 @@ class Mi3xxAnalyzer(DataAnalyzer[ServiceabilityDataModel, ServiceabilityAnalyzerArgs]): - """Build AFID events from collected data and run the serviceability engine.""" + """Build AFID events from collected data and run Service Hub.""" DATA_MODEL = ServiceabilityDataModel @@ -82,7 +82,6 @@ def analyze_data( data.serviceability = block self.result.status = ExecutionStatus.OK self.result.message = ( - f"Serviceability engine: {len(block.solution)} solution(s) " - f"from {len(events)} event(s)" + f"Service Hub: {len(block.solution)} solution(s) " f"from {len(events)} event(s)" ) return self.result diff --git a/nodescraper/plugins/serviceability/se_adapter.py b/nodescraper/plugins/serviceability/se_adapter.py index 37b5d74c..4b4c7a2e 100644 --- a/nodescraper/plugins/serviceability/se_adapter.py +++ b/nodescraper/plugins/serviceability/se_adapter.py @@ -23,7 +23,7 @@ # SOFTWARE. 
# ############################################################################### -"""Map node-scraper serviceability models to/from the AMD serviceability-engine API.""" +"""Map node-scraper serviceability models to/from the AMD Service Hub API.""" from __future__ import annotations from collections import defaultdict @@ -38,7 +38,7 @@ def afid_events_to_engine_input(afid_events: list[AfidEvent]) -> list[dict[str, Any]]: - """Convert plugin AFID events to serviceability-engine wire-format dicts. + """Convert plugin AFID events to Service Hub wire-format dicts. The engine triages on (afid, location, count). Duplicate (afid, unit) pairs are merged by summing counts. Timestamp is preserved only on the plugin side. @@ -58,7 +58,7 @@ def recommendations_from_report_dict( *, solution_tiers: tuple[str, ...] = _DEFAULT_SOLUTION_TIERS, ) -> list[dict[str, Any]]: - """Derive grouped recommendations from an :func:`serviceability_engine.api.analyze` report.""" + """Derive grouped recommendations from an :func:`service_hub.api.analyze` report.""" if "recommendations" in report: return list(report["recommendations"]) @@ -132,6 +132,6 @@ def _build_solution_reasoning( sag_pid = report.get("sag_pid") or "unknown" sag_revision = report.get("sag_revision") or "unknown" return ( - f"Serviceability engine (SAG {sag_pid} rev {sag_revision}): " + f"Service Hub (SAG {sag_pid} rev {sag_revision}): " f"{len(solutions)} recommendation(s) from {len(afid_events)} input event(s)." 
) diff --git a/nodescraper/plugins/serviceability/se_models.py b/nodescraper/plugins/serviceability/se_models.py index 75919fc3..f5fc54bb 100644 --- a/nodescraper/plugins/serviceability/se_models.py +++ b/nodescraper/plugins/serviceability/se_models.py @@ -31,7 +31,7 @@ class AfidEvent(BaseModel): - """Serviceability engine input: one AFID occurrence on a serviceable unit.""" + """Service Hub input: one AFID occurrence on a serviceable unit.""" afid: int = Field(description="AMD Fault ID.") serviceable_unit: str = Field( @@ -51,7 +51,7 @@ def _strip_serviceable_unit(cls, value: str) -> str: class ServiceabilitySolution(BaseModel): - """Serviceability engine output: recommended action for an AFID.""" + """Service Hub output: recommended action for an AFID.""" afid: int serviceable_unit: List[str] = Field( @@ -67,7 +67,7 @@ class ServiceabilityBlock(BaseModel): afid_events: List[AfidEvent] = Field( default_factory=list, - description="Input events passed to the serviceability engine.", + description="Input events passed to Service Hub.", ) solution: List[ServiceabilitySolution] = Field( default_factory=list, diff --git a/nodescraper/plugins/serviceability/se_runner.py b/nodescraper/plugins/serviceability/se_runner.py index df28426f..0fda2e5e 100644 --- a/nodescraper/plugins/serviceability/se_runner.py +++ b/nodescraper/plugins/serviceability/se_runner.py @@ -23,7 +23,7 @@ # SOFTWARE. 
# ############################################################################### -"""Run the AMD serviceability engine (Python API, CLI, or custom subprocess).""" +"""Run the AMD Service Hub (Python API, CLI, or custom subprocess).""" from __future__ import annotations import importlib @@ -41,7 +41,7 @@ class SeRunError(RuntimeError): - """Raised when the serviceability engine fails or returns invalid output.""" + """Raised when Service Hub fails or returns invalid output.""" def resolve_engine_command( @@ -64,7 +64,7 @@ def resolve_engine_command( def run_se( *, engine_backend: EngineBackend = "python", - engine_python_module: str = "serviceability_engine", + engine_python_module: str = "service_hub", engine_executable: Optional[str] = None, engine_entry_point: Optional[str] = None, afid_events: list[AfidEvent], @@ -114,11 +114,11 @@ def _run_se_python( try: se = importlib.import_module(engine_python_module) SagDocument = se.SagDocument - ServiceabilityEngine = se.ServiceabilityEngine + ServiceHub = se.ServiceHub EventRecord = se.EventRecord except (ImportError, AttributeError) as exc: raise SeRunError( - f"Cannot import {engine_python_module} bindings — install serviceability-engine " + f"Cannot import {engine_python_module} bindings — install service-hub " f"and build the Python extension (uv build)." 
) from exc @@ -133,10 +133,10 @@ def _run_se_python( ) for item in wire_events ] - analysis = ServiceabilityEngine(sag).analyze(records) + analysis = ServiceHub(sag).analyze(records) report = analysis.to_dict() except Exception as exc: - raise SeRunError(f"Serviceability engine analyze() failed: {exc}") from exc + raise SeRunError(f"Service Hub analyze() failed: {exc}") from exc return serviceability_block_from_engine(afid_events, report) @@ -176,7 +176,7 @@ def _run_se_cli( try: report = json.loads(completed.stdout or "{}") except json.JSONDecodeError as exc: - raise SeRunError(f"Invalid JSON from serviceability engine CLI: {exc}") from exc + raise SeRunError(f"Invalid JSON from Service Hub CLI: {exc}") from exc from .se_adapter import recommendations_from_report_dict @@ -226,11 +226,11 @@ def _run_se_subprocess( _run_subprocess(argv, timeout_seconds=timeout_seconds) if not output_path.is_file(): - raise SeRunError(f"Serviceability engine did not write output file: {output_path}") + raise SeRunError(f"Service Hub did not write output file: {output_path}") try: raw = json.loads(output_path.read_text(encoding="utf-8")) except json.JSONDecodeError as exc: - raise SeRunError(f"Invalid JSON from serviceability engine: {exc}") from exc + raise SeRunError(f"Invalid JSON from Service Hub: {exc}") from exc block = ServiceabilityBlock.model_validate(raw) if not block.afid_events: @@ -241,7 +241,7 @@ def _run_se_subprocess( def _run_subprocess(argv: list[str], *, timeout_seconds: int) -> subprocess.CompletedProcess: exe = Path(argv[0]) if not exe.is_file() and not _command_on_path(argv[0]): - raise SeRunError(f"Serviceability engine not found or not executable: {argv[0]!r}") + raise SeRunError(f"Service Hub not found or not executable: {argv[0]!r}") try: completed = subprocess.run( argv, @@ -251,15 +251,15 @@ def _run_subprocess(argv: list[str], *, timeout_seconds: int) -> subprocess.Comp check=False, ) except subprocess.TimeoutExpired as exc: - raise 
SeRunError(f"Serviceability engine timed out after {timeout_seconds}s") from exc + raise SeRunError(f"Service Hub timed out after {timeout_seconds}s") from exc except OSError as exc: - raise SeRunError(f"Failed to start serviceability engine: {exc}") from exc + raise SeRunError(f"Failed to start Service Hub: {exc}") from exc if completed.returncode != 0: stderr = (completed.stderr or "").strip() stdout = (completed.stdout or "").strip() detail = stderr or stdout or f"exit code {completed.returncode}" - raise SeRunError(f"Serviceability engine failed: {detail}") + raise SeRunError(f"Service Hub failed: {detail}") return completed diff --git a/nodescraper/plugins/serviceability/serviceability_data.py b/nodescraper/plugins/serviceability/serviceability_data.py index 68a7daea..0c387940 100644 --- a/nodescraper/plugins/serviceability/serviceability_data.py +++ b/nodescraper/plugins/serviceability/serviceability_data.py @@ -72,7 +72,7 @@ class ServiceabilityDataModel(DataModel): bmc_host: Optional[str] = None afid_events: List[AfidEvent] = Field( default_factory=list, - description="Serviceability engine input; built during analysis when not pre-filled.", + description="Service Hub input; built during analysis when not pre-filled.", ) serviceability: Optional[ServiceabilityBlock] = Field( default=None, diff --git a/nodescraper/plugins/serviceability/time_utils.py b/nodescraper/plugins/serviceability/time_utils.py index 5653f4a9..166bca14 100644 --- a/nodescraper/plugins/serviceability/time_utils.py +++ b/nodescraper/plugins/serviceability/time_utils.py @@ -50,7 +50,7 @@ def is_valid_iso_datetime(value: str) -> bool: def normalize_se_timestamp(value: str) -> str: - """Normalize a timestamp to the serviceability engine wire format. + """Normalize a timestamp to the Service Hub wire format. Accepts ISO-8601 (``2026-05-07T12:50:42``) and SE-style strings with a space separator (``2026-05-07 12:50:42.096-07:00``). 
From a5bfaac2517c672dcf7a9925c7df733182ac6196 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 27 May 2026 11:40:50 -0500 Subject: [PATCH 06/39] cleanup + updates --- nodescraper/interfaces/dataanalyzertask.py | 2 +- nodescraper/interfaces/dataplugin.py | 6 +- .../plugins/serviceability/__init__.py | 45 ++- .../plugins/serviceability/analyzer_args.py | 63 +--- .../plugins/serviceability/mi3xx/__init__.py | 24 +- .../serviceability/mi3xx/mi3xx_analyzer.py | 34 ++- .../serviceability/mi3xx/mi3xx_collector.py | 14 +- .../mi3xx/mi3xx_collector_args.py | 12 +- .../serviceability/mi3xx/mi3xx_data.py | 12 +- .../mi3xx/serviceability_plugin_mi3xx.py | 19 +- .../plugins/serviceability/se_adapter.py | 145 ++++----- .../plugins/serviceability/se_models.py | 12 +- .../plugins/serviceability/se_runner.py | 276 +++++------------- .../taskresulthooks/filesystemloghook.py | 6 +- nodescraper/utils.py | 31 +- .../unit/plugin/fixtures/afid_sag_sample.json | 8 + .../plugin/fixtures/mock_python_engine.py | 40 +++ test/unit/plugin/serviceability_dummy_data.py | 22 ++ test/unit/plugin/test_mi3xx_collector.py | 213 ++++++++++++++ test/unit/plugin/test_se_runner.py | 257 ++++++++++++++++ .../plugin/test_serviceability_collector.py | 76 +++-- 21 files changed, 844 insertions(+), 473 deletions(-) create mode 100644 test/unit/plugin/fixtures/afid_sag_sample.json create mode 100644 test/unit/plugin/fixtures/mock_python_engine.py create mode 100644 test/unit/plugin/serviceability_dummy_data.py create mode 100644 test/unit/plugin/test_mi3xx_collector.py create mode 100644 test/unit/plugin/test_se_runner.py diff --git a/nodescraper/interfaces/dataanalyzertask.py b/nodescraper/interfaces/dataanalyzertask.py index 0e6b3b06..fd6cc284 100644 --- a/nodescraper/interfaces/dataanalyzertask.py +++ b/nodescraper/interfaces/dataanalyzertask.py @@ -99,7 +99,7 @@ def wrapper( result = analyzer.result result.finalize(analyzer.logger) - analyzer._run_hooks(result) + analyzer._run_hooks(result, 
data=data) return result diff --git a/nodescraper/interfaces/dataplugin.py b/nodescraper/interfaces/dataplugin.py index ed632fb4..43bc3d83 100644 --- a/nodescraper/interfaces/dataplugin.py +++ b/nodescraper/interfaces/dataplugin.py @@ -44,7 +44,7 @@ SystemInfo, TaskResult, ) -from nodescraper.utils import pascal_to_snake +from nodescraper.utils import resolve_log_dir_name from .connectionmanager import TConnectArg, TConnectionManager from .task import SystemCompatibilityError @@ -412,8 +412,8 @@ def find_datamodel_path_in_run(cls, run_path: str) -> Optional[str]: return None collector_dir = os.path.join( run_path, - pascal_to_snake(cls.__name__), - pascal_to_snake(collector_cls.__name__), + resolve_log_dir_name(cls.__name__), + resolve_log_dir_name(collector_cls.__name__), ) if not os.path.isdir(collector_dir): return None diff --git a/nodescraper/plugins/serviceability/__init__.py b/nodescraper/plugins/serviceability/__init__.py index ae190bca..36671691 100644 --- a/nodescraper/plugins/serviceability/__init__.py +++ b/nodescraper/plugins/serviceability/__init__.py @@ -26,23 +26,21 @@ from .afid_events import build_afid_events_from_data from .analyzer_args import ServiceabilityAnalyzerArgs from .mi3xx import ( - Mi3xxAnalyzer, - Mi3xxCollector, - Mi3xxCollectorArgs, - Mi3xxDataModel, - Mi3xxDeviceInfo, - Mi3xxResult, + MI3XXAnalyzer, + MI3XXCollector, + MI3XXCollectorArgs, + MI3XXDataModel, + MI3XXDeviceInfo, + MI3XXResult, ServiceabilityPluginMI3XX, build_mi3xx_reporting_version_fields, ) -from .se_adapter import afid_events_to_engine_input, serviceability_block_from_engine -from .se_models import ( - AfidEvent, - SeInputPayload, - ServiceabilityBlock, - ServiceabilitySolution, +from .se_adapter import ( + format_serviceability_solution_lines, + serviceability_block_from_service_result, ) -from .se_runner import EngineBackend, SeRunError, resolve_engine_command, run_se +from .se_models import AfidEvent, ServiceabilityBlock, ServiceabilitySolution +from .se_runner 
import SeRunError, run_service_engine from .serviceability_collector import ServiceabilityCollectorBase from .serviceability_data import ( DeviceInfo, @@ -62,14 +60,12 @@ __all__ = [ "AfidEvent", "DeviceInfo", - "EngineBackend", - "Mi3xxAnalyzer", - "Mi3xxCollector", - "Mi3xxCollectorArgs", - "Mi3xxDataModel", - "Mi3xxDeviceInfo", - "Mi3xxResult", - "SeInputPayload", + "MI3XXAnalyzer", + "MI3XXCollector", + "MI3XXCollectorArgs", + "MI3XXDataModel", + "MI3XXDeviceInfo", + "MI3XXResult", "SeRunError", "ServiceabilityAnalyzerArgs", "ServiceabilityBlock", @@ -80,15 +76,14 @@ "ServiceabilityResult", "ServiceabilitySolution", "TimeOperator", - "afid_events_to_engine_input", "build_afid_events_from_data", - "serviceability_block_from_engine", "build_mi3xx_reporting_version_fields", "compare_iso_datetime", + "format_serviceability_solution_lines", "is_valid_iso_datetime", "normalize_se_timestamp", "parse_iso_datetime", - "resolve_engine_command", - "run_se", + "run_service_engine", + "serviceability_block_from_service_result", "satisfies_time_check", ] diff --git a/nodescraper/plugins/serviceability/analyzer_args.py b/nodescraper/plugins/serviceability/analyzer_args.py index d9fa09bb..679743dd 100644 --- a/nodescraper/plugins/serviceability/analyzer_args.py +++ b/nodescraper/plugins/serviceability/analyzer_args.py @@ -25,62 +25,39 @@ ############################################################################### from __future__ import annotations -from typing import List, Literal, Optional +from typing import Optional from pydantic import Field, field_validator, model_validator from nodescraper.models import AnalyzerArgs -EngineBackend = Literal["python", "cli", "subprocess"] - class ServiceabilityAnalyzerArgs(AnalyzerArgs): - """Analyzer args for serviceability plugins.""" + """Analyzer args for MI3XX serviceability (Python engine via plugin config).""" - engine_backend: EngineBackend = Field( - default="python", + engine_python_module: Optional[str] = Field( + 
default=None, description=( - "How to invoke the SE: 'python' (service_hub bindings), " - "'cli' (external analyze subcommand), or 'subprocess' (--input/--output protocol)." + "Importable Python module providing a service engine class with " + "get_service_info(rf_events, cper_data=...)." ), ) - engine_python_module: str = Field( - default="service_hub", - description="Python package providing ServiceHub bindings (python backend).", - ) - engine_executable: Optional[str] = Field( - default=None, - description="Path to the SE binary (cli or subprocess backends).", - ) - engine_entry_point: Optional[str] = Field( + engine_display_name: Optional[str] = Field( default=None, - description=( - "Command for cli/subprocess backends: executable path or argv prefix on PATH. " - "Required when engine_backend is 'cli' or 'subprocess'." - ), + description="Optional label for analyzer status messages.", ) afid_sag_path: Optional[str] = Field( default=None, description="Path to AFID_SAG.json.", ) - engine_extra_args: List[str] = Field( - default_factory=list, - description="Extra CLI arguments (cli/subprocess backends).", - ) - engine_timeout_seconds: int = Field( - default=600, - ge=1, - le=86_400, - description="Subprocess timeout (cli/subprocess backends).", - ) skip_engine: bool = Field( default=False, - description="If True, only build afid_events without running the SE.", + description="If True, only build afid_events without running the service engine.", ) - @field_validator("engine_executable", "engine_entry_point", "afid_sag_path") + @field_validator("afid_sag_path", "engine_python_module", "engine_display_name") @classmethod - def _strip_optional_paths(cls, value: Optional[str]) -> Optional[str]: + def _strip_optional_strings(cls, value: Optional[str]) -> Optional[str]: if value is None: return None text = str(value).strip() @@ -91,19 +68,7 @@ def _require_engine_config_when_running(self) -> ServiceabilityAnalyzerArgs: if self.skip_engine: return self if not 
self.afid_sag_path: - raise ValueError("afid_sag_path is required when running Service Hub.") - if self.engine_backend == "python": - return self - has_exe = self.engine_executable is not None - has_entry = self.engine_entry_point is not None - if has_exe and has_entry: - raise ValueError( - "Provide only one of engine_executable or engine_entry_point " - "for cli/subprocess backends." - ) - if not has_exe and not has_entry: - raise ValueError( - "engine_executable or engine_entry_point is required when " - "engine_backend is 'cli' or 'subprocess'." - ) + raise ValueError("afid_sag_path is required when running the service engine.") + if not self.engine_python_module: + raise ValueError("engine_python_module is required when running the service engine.") return self diff --git a/nodescraper/plugins/serviceability/mi3xx/__init__.py b/nodescraper/plugins/serviceability/mi3xx/__init__.py index 25e83a07..b97928b3 100644 --- a/nodescraper/plugins/serviceability/mi3xx/__init__.py +++ b/nodescraper/plugins/serviceability/mi3xx/__init__.py @@ -23,24 +23,24 @@ # SOFTWARE. 
# ############################################################################### -from .mi3xx_analyzer import Mi3xxAnalyzer -from .mi3xx_collector import Mi3xxCollector -from .mi3xx_collector_args import Mi3xxCollectorArgs +from .mi3xx_analyzer import MI3XXAnalyzer +from .mi3xx_collector import MI3XXCollector +from .mi3xx_collector_args import MI3XXCollectorArgs from .mi3xx_data import ( - Mi3xxDataModel, - Mi3xxDeviceInfo, - Mi3xxResult, + MI3XXDataModel, + MI3XXDeviceInfo, + MI3XXResult, build_mi3xx_reporting_version_fields, ) from .serviceability_plugin_mi3xx import ServiceabilityPluginMI3XX __all__ = [ - "Mi3xxAnalyzer", - "Mi3xxCollector", - "Mi3xxCollectorArgs", - "Mi3xxDataModel", - "Mi3xxDeviceInfo", - "Mi3xxResult", + "MI3XXAnalyzer", + "MI3XXCollector", + "MI3XXCollectorArgs", + "MI3XXDataModel", + "MI3XXDeviceInfo", + "MI3XXResult", "ServiceabilityPluginMI3XX", "build_mi3xx_reporting_version_fields", ] diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py index d74f297a..ab001184 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py @@ -7,7 +7,7 @@ # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # @@ -32,15 +32,18 @@ from nodescraper.models import TaskResult from nodescraper.plugins.serviceability.afid_events import build_afid_events_from_data from nodescraper.plugins.serviceability.analyzer_args import ServiceabilityAnalyzerArgs +from 
nodescraper.plugins.serviceability.se_adapter import ( + format_serviceability_solution_lines, +) from nodescraper.plugins.serviceability.se_models import ServiceabilityBlock -from nodescraper.plugins.serviceability.se_runner import SeRunError, run_se +from nodescraper.plugins.serviceability.se_runner import SeRunError, run_service_engine from nodescraper.plugins.serviceability.serviceability_data import ( ServiceabilityDataModel, ) -class Mi3xxAnalyzer(DataAnalyzer[ServiceabilityDataModel, ServiceabilityAnalyzerArgs]): - """Build AFID events from collected data and run Service Hub.""" +class MI3XXAnalyzer(DataAnalyzer[ServiceabilityDataModel, ServiceabilityAnalyzerArgs]): + """Build AFID events from collected data and run the configured service engine.""" DATA_MODEL = ServiceabilityDataModel @@ -61,18 +64,17 @@ def analyze_data( data.serviceability = ServiceabilityBlock(afid_events=events) self.result.status = ExecutionStatus.OK self.result.message = f"Built {len(events)} AFID event(s); engine skipped" + self._log_serviceability_solutions(data.serviceability) return self.result try: - block = run_se( - engine_backend=args.engine_backend, - engine_python_module=args.engine_python_module, - engine_executable=args.engine_executable, - engine_entry_point=args.engine_entry_point, + block = run_service_engine( + engine_python_module=args.engine_python_module, # type: ignore[arg-type] + engine_display_name=args.engine_display_name, afid_events=events, afid_sag_path=args.afid_sag_path, # type: ignore[arg-type] - extra_args=args.engine_extra_args or None, - timeout_seconds=args.engine_timeout_seconds, + rf_events=data.rf_events, + cper_data=data.cper_data or None, ) except (SeRunError, ValueError) as exc: self.result.status = ExecutionStatus.ERROR @@ -80,8 +82,16 @@ def analyze_data( return self.result data.serviceability = block + self._log_serviceability_solutions(block) + engine_label = args.engine_display_name or args.engine_python_module self.result.status = 
ExecutionStatus.OK self.result.message = ( - f"Service Hub: {len(block.solution)} solution(s) " f"from {len(events)} event(s)" + f"{engine_label}: {len(block.solution)} solution(s) " + f"from {len(data.rf_events)} Redfish event(s)" ) return self.result + + def _log_serviceability_solutions(self, block: ServiceabilityBlock) -> None: + parent = self.parent or self.__class__.__name__ + for line in format_serviceability_solution_lines(block): + self.logger.info("(%s) %s", parent, line) diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py index 8f73941c..63e23e21 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py @@ -33,18 +33,18 @@ from nodescraper.plugins.serviceability.serviceability_data import DeviceInfo from nodescraper.plugins.serviceability.time_utils import satisfies_time_check -from .mi3xx_collector_args import Mi3xxCollectorArgs +from .mi3xx_collector_args import MI3XXCollectorArgs _EVENT_TIMESTAMP_KEYS = ("Created", "EventTimestamp", "Timestamp") -class Mi3xxCollector(ServiceabilityCollectorBase[Mi3xxCollectorArgs]): - """MI3xx OOB Redfish serviceability collector.""" +class MI3XXCollector(ServiceabilityCollectorBase[MI3XXCollectorArgs]): + """MI3XX OOB Redfish serviceability collector.""" def satisfies_reference_time( self, candidate: str, - args: Mi3xxCollectorArgs, + args: MI3XXCollectorArgs, ) -> bool: """Test a timestamp against optional reference-time filter settings.""" if args.reference_time is None or args.time_operator is None: @@ -54,7 +54,7 @@ def satisfies_reference_time( def filter_event_members( self, members: list[Any], - args: Mi3xxCollectorArgs, + args: MI3XXCollectorArgs, ) -> list[Any]: filtered: list[Any] = [] for member in members: @@ -78,7 +78,7 @@ def parse_assembly_entry( self, designation: str, assembly_member_entry: dict[str, Any], - args: Mi3xxCollectorArgs, + 
args: MI3XXCollectorArgs, ) -> DeviceInfo: return DeviceInfo( name=assembly_member_entry.get("Name") or designation, @@ -91,7 +91,7 @@ def parse_assembly_entry( def extract_component_details( self, firmware_inventory_payload: dict[str, Any], - args: Mi3xxCollectorArgs, + args: MI3XXCollectorArgs, ) -> Optional[str]: details = firmware_inventory_payload.get("Details") if details is not None: diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector_args.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector_args.py index ae7555d7..1e95a81b 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector_args.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector_args.py @@ -36,11 +36,11 @@ ) -class Mi3xxCollectorArgs(CollectorArgs): - """MI3xx OOB Redfish serviceability collector arguments.""" +class MI3XXCollectorArgs(CollectorArgs): + """MI3XX OOB Redfish serviceability collector arguments.""" uri: Optional[str] = Field( - default=None, + default="/redfish/v1/Systems/UBB/LogServices/EventLog/Entries", description="Optional alias for ``rf_event_log_uri`` (non-empty string).", ) rf_event_log_uri: Optional[str] = Field( @@ -99,7 +99,7 @@ def _validate_reference_time_iso(cls, value: Optional[str]) -> Optional[str]: return text @model_validator(mode="after") - def _require_event_log_uri(self) -> Mi3xxCollectorArgs: + def _require_event_log_uri(self) -> MI3XXCollectorArgs: if not self.resolved_event_log_uri(): raise ValueError( "Provide a non-empty rf_event_log_uri or uri for the event log collection." 
@@ -107,7 +107,7 @@ def _require_event_log_uri(self) -> Mi3xxCollectorArgs: return self @model_validator(mode="after") - def _assembly_consistency(self) -> Mi3xxCollectorArgs: + def _assembly_consistency(self) -> MI3XXCollectorArgs: has_tpl = bool( self.rf_assembly_uri_template and "{device}" in self.rf_assembly_uri_template ) @@ -120,7 +120,7 @@ def _assembly_consistency(self) -> Mi3xxCollectorArgs: return self @model_validator(mode="after") - def _reference_time_requires_operator(self) -> Mi3xxCollectorArgs: + def _reference_time_requires_operator(self) -> MI3XXCollectorArgs: has_ref = self.reference_time is not None has_op = self.time_operator is not None if has_ref != has_op: diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_data.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_data.py index 6c9c268f..17a60eaa 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_data.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_data.py @@ -34,7 +34,7 @@ from nodescraper.models import DataModel -class Mi3xxDeviceInfo(BaseModel): +class MI3XXDeviceInfo(BaseModel): """Device identity with separate board and product fields.""" board_product_name: Optional[str] = Field( @@ -78,7 +78,7 @@ class Mi3xxDeviceInfo(BaseModel): ) -class Mi3xxResult(BaseModel): +class MI3XXResult(BaseModel): """Structured serviceability report output.""" node: Optional[str] = None @@ -136,14 +136,14 @@ def build_mi3xx_reporting_version_fields( } -class Mi3xxDataModel(DataModel): +class MI3XXDataModel(DataModel): """Collected OOB Redfish serviceability data model.""" collected_data: Dict[str, Any] = Field( default_factory=dict, description="Arbitrary keyed payloads from the collector implementation.", ) - device_info: Dict[str, Mi3xxDeviceInfo] = Field( + device_info: Dict[str, MI3XXDeviceInfo] = Field( default_factory=dict, description="Optional device identity keyed by implementer-defined labels.", ) @@ -156,7 +156,7 @@ class Mi3xxDataModel(DataModel): description="Optional 
host or service endpoint label (not necessarily a BMC).", ) log_path: Optional[str] = None - result: Optional[Mi3xxResult] = None + result: Optional[MI3XXResult] = None def log_model(self, log_path: str) -> None: """Write artifact files and a JSON summary under the log directory. @@ -174,7 +174,7 @@ def log_model(self, log_path: str) -> None: artifact_path = os.path.join(log_path, str(filename).strip()) with open(artifact_path, "w", encoding="utf-8") as handle: json.dump(payload, handle, indent=2) - summary_path = os.path.join(log_path, "mi3xx_data.json") + summary_path = os.path.join(log_path, "MI3XX_data.json") with open(summary_path, "w", encoding="utf-8") as handle: json.dump( self.model_dump( diff --git a/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py b/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py index ee0c510b..2f38783f 100644 --- a/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py +++ b/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py @@ -29,16 +29,21 @@ from nodescraper.plugins.serviceability.serviceability_plugin_base import ( ServiceabilityPluginBase, ) +from nodescraper.utils import register_log_dir_name -from .mi3xx_analyzer import Mi3xxAnalyzer -from .mi3xx_collector import Mi3xxCollector -from .mi3xx_collector_args import Mi3xxCollectorArgs +from .mi3xx_analyzer import MI3XXAnalyzer +from .mi3xx_collector import MI3XXCollector +from .mi3xx_collector_args import MI3XXCollectorArgs + +register_log_dir_name("ServiceabilityPluginMI3XX", "serviceability_plugin_MI3XX") +register_log_dir_name("MI3XXCollector", "MI3XX_collector") +register_log_dir_name("MI3XXAnalyzer", "MI3XX_analyzer") class ServiceabilityPluginMI3XX(ServiceabilityPluginBase): - """MI3xx OOB Redfish serviceability plugin.""" + """MI3XX OOB Redfish serviceability plugin.""" DATA_MODEL = ServiceabilityDataModel - COLLECTOR = Mi3xxCollector - ANALYZER = Mi3xxAnalyzer - COLLECTOR_ARGS = 
Mi3xxCollectorArgs + COLLECTOR = MI3XXCollector + ANALYZER = MI3XXAnalyzer + COLLECTOR_ARGS = MI3XXCollectorArgs diff --git a/nodescraper/plugins/serviceability/se_adapter.py b/nodescraper/plugins/serviceability/se_adapter.py index 4b4c7a2e..243b2d7d 100644 --- a/nodescraper/plugins/serviceability/se_adapter.py +++ b/nodescraper/plugins/serviceability/se_adapter.py @@ -23,7 +23,7 @@ # SOFTWARE. # ############################################################################### -"""Map node-scraper serviceability models to/from the AMD Service Hub API.""" +"""Map serviceability plugin models to/from Python service engine results.""" from __future__ import annotations from collections import defaultdict @@ -31,107 +31,76 @@ from .se_models import AfidEvent, ServiceabilityBlock, ServiceabilitySolution -_DEFAULT_SOLUTION_TIERS = ( - "primary_fru_events", - "secondary_actions", -) - -def afid_events_to_engine_input(afid_events: list[AfidEvent]) -> list[dict[str, Any]]: - """Convert plugin AFID events to Service Hub wire-format dicts. - - The engine triages on (afid, location, count). Duplicate (afid, unit) pairs - are merged by summing counts. Timestamp is preserved only on the plugin side. 
- """ - counts: dict[tuple[str, str], int] = defaultdict(int) - for event in afid_events: - key = (str(event.afid), event.serviceable_unit) - counts[key] += 1 - return [ - {"afid": afid, "location": location, "count": count} - for (afid, location), count in sorted(counts.items()) - ] +def format_serviceability_solution_lines(block: ServiceabilityBlock) -> list[str]: + """Human-readable lines for logging or console output.""" + lines: list[str] = [] + if block.solution_reasoning: + lines.append(block.solution_reasoning) + if not block.solution: + lines.append("No service actions recommended.") + return lines + for index, solution in enumerate(block.solution, start=1): + units = ", ".join(solution.serviceable_unit) + lines.append( + f"[{index}] AFID {solution.afid}, " + f"service action {solution.service_action_num}, " + f"units: [{units}]" + ) + return lines -def recommendations_from_report_dict( - report: dict[str, Any], +def serviceability_block_from_service_result( + afid_events: list[AfidEvent], + result: Any, *, - solution_tiers: tuple[str, ...] 
= _DEFAULT_SOLUTION_TIERS, -) -> list[dict[str, Any]]: - """Derive grouped recommendations from an :func:`service_hub.api.analyze` report.""" - if "recommendations" in report: - return list(report["recommendations"]) - + engine_label: str = "Service engine", + rf_event_count: int = 0, +) -> ServiceabilityBlock: + """Build a :class:`ServiceabilityBlock` from an engine result with ``service_info``.""" grouped: dict[tuple[int, int], list[str]] = defaultdict(list) - for tier in solution_tiers: - for row in report.get(tier, []): - if not isinstance(row, dict): + service_info = getattr(result, "service_info", None) or {} + for designation, afid_map in service_info.items(): + if not isinstance(afid_map, dict): + continue + unit = str(designation).strip() if designation is not None else "" + for afid_raw, info in afid_map.items(): + if not isinstance(info, dict): continue - afid = int(row.get("afid", 0)) - location = str(row.get("location", "")).strip() - action_num = _action_num_from_row(row) - if not location or action_num is None: + san_raw = info.get("service_action_number") + if san_raw is None: continue - key = (afid, action_num) - if location not in grouped[key]: - grouped[key].append(location) - - return [ - { - "afid": afid, - "locations": locations, - "service_action_num": action_num, - } - for (afid, action_num), locations in sorted(grouped.items()) - ] - + try: + afid = int(afid_raw) + san = int(san_raw) + except (TypeError, ValueError): + continue + key = (afid, san) + if unit and unit not in grouped[key]: + grouped[key].append(unit) -def serviceability_block_from_engine( - afid_events: list[AfidEvent], - report: dict[str, Any], - *, - recommendations: list[dict[str, Any]] | None = None, -) -> ServiceabilityBlock: - """Build the ANC ``serviceability`` block from an engine analysis report.""" - recs = ( - recommendations if recommendations is not None else recommendations_from_report_dict(report) - ) solutions = [ ServiceabilitySolution( - 
afid=int(item["afid"]), - serviceable_unit=list(item["locations"]), - service_action_num=int(item["service_action_num"]), + afid=afid, + serviceable_unit=units, + service_action_num=san, ) - for item in recs + for (afid, san), units in sorted(grouped.items()) ] - reasoning = _build_solution_reasoning(afid_events, solutions, report) + metadata = getattr(result, "afid_sag_metadata", None) or {} + version_info = ( + getattr(result, "engine_version_info", None) or getattr(result, "version_info", None) or {} + ) + sag_pid = metadata.get("sag_pid") or metadata.get("pid") or "unknown" + sag_revision = metadata.get("sag_revision") or metadata.get("revision") or "unknown" + engine_version = version_info.get("version") or version_info.get("engine_version") + version_suffix = f", engine {engine_version}" if engine_version else "" + reasoning = ( + f"{engine_label} (SAG {sag_pid} rev {sag_revision}{version_suffix}): " + f"{len(solutions)} recommendation(s) from {rf_event_count} Redfish event(s)." + ) return ServiceabilityBlock( afid_events=list(afid_events), solution=solutions, solution_reasoning=reasoning, ) - - -def _action_num_from_row(row: dict[str, Any]) -> int | None: - if "service_action_num" in row: - return int(row["service_action_num"]) - service_action = row.get("service_action") - if isinstance(service_action, dict) and "id" in service_action: - return int(service_action["id"]) - afid_entry = row.get("afid_entry") - if isinstance(afid_entry, dict) and "service_action_num" in afid_entry: - return int(afid_entry["service_action_num"]) - return None - - -def _build_solution_reasoning( - afid_events: list[AfidEvent], - solutions: list[ServiceabilitySolution], - report: dict[str, Any], -) -> str: - sag_pid = report.get("sag_pid") or "unknown" - sag_revision = report.get("sag_revision") or "unknown" - return ( - f"Service Hub (SAG {sag_pid} rev {sag_revision}): " - f"{len(solutions)} recommendation(s) from {len(afid_events)} input event(s)." 
- ) diff --git a/nodescraper/plugins/serviceability/se_models.py b/nodescraper/plugins/serviceability/se_models.py index f5fc54bb..344ef7c7 100644 --- a/nodescraper/plugins/serviceability/se_models.py +++ b/nodescraper/plugins/serviceability/se_models.py @@ -31,7 +31,7 @@ class AfidEvent(BaseModel): - """Service Hub input: one AFID occurrence on a serviceable unit.""" + """One AFID occurrence on a serviceable unit.""" afid: int = Field(description="AMD Fault ID.") serviceable_unit: str = Field( @@ -51,7 +51,7 @@ def _strip_serviceable_unit(cls, value: str) -> str: class ServiceabilitySolution(BaseModel): - """Service Hub output: recommended action for an AFID.""" + """Recommended service action for an AFID.""" afid: int serviceable_unit: List[str] = Field( @@ -67,7 +67,7 @@ class ServiceabilityBlock(BaseModel): afid_events: List[AfidEvent] = Field( default_factory=list, - description="Input events passed to Service Hub.", + description="Summarized AFID events from collected data.", ) solution: List[ServiceabilitySolution] = Field( default_factory=list, @@ -77,9 +77,3 @@ class ServiceabilityBlock(BaseModel): default=None, description="Human-readable summary of how the engine reached its conclusions.", ) - - -class SeInputPayload(BaseModel): - """JSON written to the SE ``--input`` file.""" - - afid_events: List[AfidEvent] = Field(default_factory=list) diff --git a/nodescraper/plugins/serviceability/se_runner.py b/nodescraper/plugins/serviceability/se_runner.py index 0fda2e5e..aeec1eb7 100644 --- a/nodescraper/plugins/serviceability/se_runner.py +++ b/nodescraper/plugins/serviceability/se_runner.py @@ -23,247 +23,109 @@ # SOFTWARE. 
# ############################################################################### -"""Run the AMD Service Hub (Python API, CLI, or custom subprocess).""" +"""Invoke a configured Python service engine against collected Redfish events.""" from __future__ import annotations import importlib -import json -import shlex -import subprocess -import tempfile +import inspect from pathlib import Path -from typing import Literal, Optional +from typing import Any, Optional, Type -from .se_adapter import afid_events_to_engine_input, serviceability_block_from_engine -from .se_models import AfidEvent, SeInputPayload, ServiceabilityBlock +from .se_adapter import serviceability_block_from_service_result +from .se_models import AfidEvent, ServiceabilityBlock -EngineBackend = Literal["python", "cli", "subprocess"] +_ENGINE_METHOD = "get_service_info" class SeRunError(RuntimeError): - """Raised when Service Hub fails or returns invalid output.""" + """Raised when the service engine fails or returns invalid output.""" -def resolve_engine_command( +def run_service_engine( *, - engine_executable: Optional[str] = None, - engine_entry_point: Optional[str] = None, -) -> list[str]: - """Build the argv prefix for a subprocess or CLI-backed SE invocation.""" - has_exe = bool(engine_executable and str(engine_executable).strip()) - has_entry = bool(engine_entry_point and str(engine_entry_point).strip()) - if has_exe and has_entry: - raise ValueError("Provide only one of engine_executable or engine_entry_point.") - if not has_exe and not has_entry: - raise ValueError("Provide engine_executable or engine_entry_point.") - if has_exe: - return [str(engine_executable).strip()] - return shlex.split(str(engine_entry_point).strip()) - - -def run_se( - *, - engine_backend: EngineBackend = "python", - engine_python_module: str = "service_hub", - engine_executable: Optional[str] = None, - engine_entry_point: Optional[str] = None, + engine_python_module: str, + engine_display_name: Optional[str] = None, 
afid_events: list[AfidEvent], afid_sag_path: str, - extra_args: Optional[list[str]] = None, - timeout_seconds: int = 600, - work_dir: Optional[str] = None, + rf_events: list[Any], + cper_data: Optional[dict[str, Any]] = None, ) -> ServiceabilityBlock: - """Run the SE and return a :class:`ServiceabilityBlock`.""" + """Run a Python service engine and return a :class:`ServiceabilityBlock`.""" sag_path = Path(afid_sag_path) if not sag_path.is_file(): raise SeRunError(f"AFID_SAG file not found: {afid_sag_path}") - if engine_backend == "python": - return _run_se_python( - engine_python_module=engine_python_module, - afid_events=afid_events, - afid_sag_path=str(sag_path), - ) - if engine_backend == "cli": - return _run_se_cli( - engine_executable=engine_executable, - engine_entry_point=engine_entry_point, - afid_events=afid_events, - afid_sag_path=str(sag_path), - extra_args=extra_args, - timeout_seconds=timeout_seconds, - work_dir=work_dir, - ) - return _run_se_subprocess( - engine_executable=engine_executable, - engine_entry_point=engine_entry_point, - afid_events=afid_events, - afid_sag_path=str(sag_path), - extra_args=extra_args, - timeout_seconds=timeout_seconds, - work_dir=work_dir, - ) - - -def _run_se_python( - *, - engine_python_module: str, - afid_events: list[AfidEvent], - afid_sag_path: str, -) -> ServiceabilityBlock: - try: - se = importlib.import_module(engine_python_module) - SagDocument = se.SagDocument - ServiceHub = se.ServiceHub - EventRecord = se.EventRecord - except (ImportError, AttributeError) as exc: + if not rf_events: raise SeRunError( - f"Cannot import {engine_python_module} bindings — install service-hub " - f"and build the Python extension (uv build)." - ) from exc + "Collected Redfish events are required; re-run collection or use skip_engine." 
+ ) - wire_events = afid_events_to_engine_input(afid_events) + label = engine_display_name or engine_python_module try: - sag = SagDocument.from_file(afid_sag_path) - records = [ - EventRecord( - afid=str(item["afid"]), - location=str(item["location"]), - count=int(item["count"]), - ) - for item in wire_events - ] - analysis = ServiceHub(sag).analyze(records) - report = analysis.to_dict() - except Exception as exc: - raise SeRunError(f"Service Hub analyze() failed: {exc}") from exc - - return serviceability_block_from_engine(afid_events, report) + mod = importlib.import_module(engine_python_module) + except ImportError as exc: + raise SeRunError(f"Cannot import {engine_python_module}: {exc}") from exc - -def _run_se_cli( - *, - engine_executable: Optional[str], - engine_entry_point: Optional[str], - afid_events: list[AfidEvent], - afid_sag_path: str, - extra_args: Optional[list[str]], - timeout_seconds: int, - work_dir: Optional[str], -) -> ServiceabilityBlock: - """Invoke an external engine CLI ``analyze --sag … --input …`` and map stdout JSON.""" - command = resolve_engine_command( - engine_executable=engine_executable, - engine_entry_point=engine_entry_point, - ) - wire_events = afid_events_to_engine_input(afid_events) - - with tempfile.TemporaryDirectory(prefix="nodescraper_se_cli_", dir=work_dir) as tmp: - input_path = Path(tmp) / "events.json" - input_path.write_text(json.dumps(wire_events, indent=2), encoding="utf-8") - argv = [ - *command, - "analyze", - "--sag", - afid_sag_path, - "--input", - str(input_path), - ] - if extra_args: - argv.extend(extra_args) - completed = _run_subprocess(argv, timeout_seconds=timeout_seconds) + engine_cls = _resolve_engine_class(mod) try: - report = json.loads(completed.stdout or "{}") - except json.JSONDecodeError as exc: - raise SeRunError(f"Invalid JSON from Service Hub CLI: {exc}") from exc + instance = engine_cls(afid_sag=afid_sag_path) + analyze = getattr(instance, _ENGINE_METHOD) + result = analyze( + list(rf_events), 
+ cper_data=dict(cper_data) if cper_data else None, + ) + except Exception as exc: + raise SeRunError(f"{label} {_ENGINE_METHOD}() failed: {exc}") from exc - from .se_adapter import recommendations_from_report_dict + if result is None: + return ServiceabilityBlock( + afid_events=list(afid_events), + solution=[], + solution_reasoning=f"{label}: no service actions after event filtering.", + ) - return serviceability_block_from_engine( + return serviceability_block_from_service_result( afid_events, - report, - recommendations=recommendations_from_report_dict(report), - ) - - -def _run_se_subprocess( - *, - engine_executable: Optional[str], - engine_entry_point: Optional[str], - afid_events: list[AfidEvent], - afid_sag_path: str, - extra_args: Optional[list[str]], - timeout_seconds: int, - work_dir: Optional[str], -) -> ServiceabilityBlock: - """Custom subprocess protocol: ``--input`` / ``--output`` / ``--afid-sag``.""" - command = resolve_engine_command( - engine_executable=engine_executable, - engine_entry_point=engine_entry_point, + result, + engine_label=label, + rf_event_count=len(rf_events), ) - payload = SeInputPayload(afid_events=afid_events) - - with tempfile.TemporaryDirectory(prefix="nodescraper_se_", dir=work_dir) as tmp: - tmp_path = Path(tmp) - input_path = tmp_path / "se_input.json" - output_path = tmp_path / "se_output.json" - input_path.write_text( - json.dumps(payload.model_dump(mode="json"), indent=2), - encoding="utf-8", - ) - argv = [ - *command, - "--input", - str(input_path), - "--output", - str(output_path), - "--afid-sag", - str(Path(afid_sag_path).resolve()), - ] - if extra_args: - argv.extend(extra_args) - _run_subprocess(argv, timeout_seconds=timeout_seconds) - if not output_path.is_file(): - raise SeRunError(f"Service Hub did not write output file: {output_path}") - try: - raw = json.loads(output_path.read_text(encoding="utf-8")) - except json.JSONDecodeError as exc: - raise SeRunError(f"Invalid JSON from Service Hub: {exc}") from exc - 
block = ServiceabilityBlock.model_validate(raw) - if not block.afid_events: - block.afid_events = list(afid_events) - return block +def _is_engine_class(obj: Any) -> bool: + return inspect.isclass(obj) and callable(getattr(obj, _ENGINE_METHOD, None)) -def _run_subprocess(argv: list[str], *, timeout_seconds: int) -> subprocess.CompletedProcess: - exe = Path(argv[0]) - if not exe.is_file() and not _command_on_path(argv[0]): - raise SeRunError(f"Service Hub not found or not executable: {argv[0]!r}") - try: - completed = subprocess.run( - argv, - capture_output=True, - text=True, - timeout=timeout_seconds, - check=False, - ) - except subprocess.TimeoutExpired as exc: - raise SeRunError(f"Service Hub timed out after {timeout_seconds}s") from exc - except OSError as exc: - raise SeRunError(f"Failed to start Service Hub: {exc}") from exc +def _resolve_engine_class(mod: Any) -> Type[Any]: + """Find the engine class in ``mod`` that implements ``get_service_info``.""" + package = mod.__name__ + candidates: list[Type[Any]] = [] + seen: set[int] = set() - if completed.returncode != 0: - stderr = (completed.stderr or "").strip() - stdout = (completed.stdout or "").strip() - detail = stderr or stdout or f"exit code {completed.returncode}" - raise SeRunError(f"Service Hub failed: {detail}") - return completed + def add_candidate(obj: Any) -> None: + if not _is_engine_class(obj): + return + key = id(obj) + if key in seen: + return + seen.add(key) + candidates.append(obj) + for name in getattr(mod, "__all__", []) or []: + add_candidate(getattr(mod, name, None)) -def _command_on_path(name: str) -> bool: - from shutil import which + for _, obj in inspect.getmembers(mod, inspect.isclass): + obj_module = getattr(obj, "__module__", "") + if obj_module == package or obj_module.startswith(f"{package}."): + add_candidate(obj) - return which(name) is not None + if len(candidates) == 1: + return candidates[0] + if not candidates: + raise SeRunError( + f"No class with {_ENGINE_METHOD}() found 
in {package}; " + "check engine_python_module in analysis_args." + ) + names = ", ".join(cls.__name__ for cls in candidates) + raise SeRunError(f"Multiple classes with {_ENGINE_METHOD}() in {package}: {names}.") diff --git a/nodescraper/taskresulthooks/filesystemloghook.py b/nodescraper/taskresulthooks/filesystemloghook.py index 831e3fbe..50184b4e 100644 --- a/nodescraper/taskresulthooks/filesystemloghook.py +++ b/nodescraper/taskresulthooks/filesystemloghook.py @@ -28,7 +28,7 @@ from nodescraper.interfaces.taskresulthook import TaskResultHook from nodescraper.models import DataModel, TaskResult -from nodescraper.utils import pascal_to_snake +from nodescraper.utils import resolve_log_dir_name class FileSystemLogHook(TaskResultHook): @@ -43,9 +43,9 @@ def process_result(self, task_result: TaskResult, data: Optional[DataModel] = No """Log task result to the filesystem (single events.json per directory).""" log_path = self.log_base_path if task_result.parent: - log_path = os.path.join(log_path, pascal_to_snake(task_result.parent)) + log_path = os.path.join(log_path, resolve_log_dir_name(task_result.parent)) if task_result.task: - log_path = os.path.join(log_path, pascal_to_snake(task_result.task)) + log_path = os.path.join(log_path, resolve_log_dir_name(task_result.task)) task_result.log_result(log_path) diff --git a/nodescraper/utils.py b/nodescraper/utils.py index 3b9edf34..910f608f 100644 --- a/nodescraper/utils.py +++ b/nodescraper/utils.py @@ -187,18 +187,35 @@ def get_unique_filename(directory, filename) -> str: count += 1 -def pascal_to_snake(input_str: str) -> str: - """Convert PascalCase to snake_case +_LOG_DIR_NAME_OVERRIDES: dict[str, str] = {} - Args: - input_str (str): string to convert - Returns: - str: converted string +def register_log_dir_name(class_name: str, log_dir_name: str) -> None: + """Register a filesystem log directory name for a task or plugin class.""" + _LOG_DIR_NAME_OVERRIDES[class_name] = log_dir_name + + +def 
resolve_log_dir_name(class_name: str) -> str: + """Map a class name to its log directory (override or snake_case).""" + if class_name in _LOG_DIR_NAME_OVERRIDES: + return _LOG_DIR_NAME_OVERRIDES[class_name] + return pascal_to_snake(class_name) + + +def pascal_to_snake(input_str: str) -> str: + """Convert PascalCase to snake_case. + + Handles embedded acronyms with digits (e.g. ``ServiceabilityPluginMI3XX``, + ``MI3XXCollector``) without splitting into single-letter segments. """ + if not input_str: + return "" if input_str.isupper(): return input_str.lower() - return ("_").join(re.split("(?<=.)(?=[A-Z])", input_str)).lower() + normalized = re.sub(r"([A-Z][A-Z0-9]+)([A-Z][a-z])", r"\1_\2", input_str) + normalized = re.sub(r"([a-z])([A-Z][A-Z0-9]+)", r"\1_\2", normalized) + normalized = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", normalized) + return normalized.lower() def bytes_to_human_readable(input_bytes: int) -> str: diff --git a/test/unit/plugin/fixtures/afid_sag_sample.json b/test/unit/plugin/fixtures/afid_sag_sample.json new file mode 100644 index 00000000..952999e6 --- /dev/null +++ b/test/unit/plugin/fixtures/afid_sag_sample.json @@ -0,0 +1,8 @@ +{ + "9001": { + "service_action_num": 99 + }, + "9002": { + "service_action_num": 88 + } +} diff --git a/test/unit/plugin/fixtures/mock_python_engine.py b/test/unit/plugin/fixtures/mock_python_engine.py new file mode 100644 index 00000000..c45c4803 --- /dev/null +++ b/test/unit/plugin/fixtures/mock_python_engine.py @@ -0,0 +1,40 @@ +"""Mock Python service engine for unit tests.""" + +from __future__ import annotations + +from types import SimpleNamespace +from typing import Any, Optional + +from ..serviceability_dummy_data import ( + DUMMY_ENGINE_VERSION, + DUMMY_SAG_PID, + DUMMY_SAG_REVISION, + DUMMY_SERVICE_ACTION_NUM, + DUMMY_UNIT_A, +) + + +class MockServiceEngine: + def __init__(self, afid_sag: str) -> None: + self.afid_sag = afid_sag + + def get_service_info( + self, + rf_events: list[dict[str, Any]], + cper_data: 
Optional[dict[str, Any]] = None, + ) -> SimpleNamespace: + del cper_data + service_info: dict[str, dict[str, dict[str, str]]] = {} + for event in rf_events: + afid = event.get("Afid") + unit = event.get("serviceable_unit", DUMMY_UNIT_A) + if afid is None: + continue + service_info.setdefault(str(unit), {})[str(afid)] = { + "service_action_number": str(DUMMY_SERVICE_ACTION_NUM), + } + return SimpleNamespace( + service_info=service_info, + afid_sag_metadata={"sag_pid": DUMMY_SAG_PID, "sag_revision": DUMMY_SAG_REVISION}, + engine_version_info={"version": DUMMY_ENGINE_VERSION}, + ) diff --git a/test/unit/plugin/serviceability_dummy_data.py b/test/unit/plugin/serviceability_dummy_data.py new file mode 100644 index 00000000..c68b521f --- /dev/null +++ b/test/unit/plugin/serviceability_dummy_data.py @@ -0,0 +1,22 @@ +"""Shared dummy values for serviceability unit tests (not production data).""" + +DUMMY_AFID_A = 9001 +DUMMY_AFID_B = 9002 +DUMMY_AFID_C = 9003 +DUMMY_SERVICE_ACTION_NUM = 99 +DUMMY_UNIT_A = "dummy_unit_a" +DUMMY_UNIT_B = "dummy_unit_b" +DUMMY_UNIT_C = "dummy_unit_c" +DUMMY_DESIGNATION_A = "DUMMY_SLOT_A" +DUMMY_DESIGNATION_B = "DUMMY_SLOT_B" +DUMMY_EVENT_URI = "/redfish/v1/Systems/Dummy/LogServices/DummyEventLog/Entries" +DUMMY_EVENT_URI_ALT = "/redfish/v1/Systems/Dummy/LogServices/DummyEventLog/EntriesAlt" +DUMMY_TIMESTAMP = "2000-01-01T12:00:00+00:00" +DUMMY_TIMESTAMP_EARLIER = "1999-12-31T12:00:00+00:00" +DUMMY_TIMESTAMP_LATER = "2000-01-02T12:00:00+00:00" +DUMMY_RF_EVENT_COUNT = 2 +DUMMY_SAG_PID = "dummy-sag-pid" +DUMMY_SAG_REVISION = "dummy-rev-0" +DUMMY_ENGINE_VERSION = "0.0.0-dummy" +DUMMY_BMC_HOST = "dummy-bmc.example" +DUMMY_OEM_VENDOR = "DummyVendor" diff --git a/test/unit/plugin/test_mi3xx_collector.py b/test/unit/plugin/test_mi3xx_collector.py new file mode 100644 index 00000000..b89b1b71 --- /dev/null +++ b/test/unit/plugin/test_mi3xx_collector.py @@ -0,0 +1,213 @@ +############################################################################### 
+# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import pytest +from pydantic import ValidationError + +from nodescraper.connection.redfish import RF_MEMBERS, RedfishGetResult +from nodescraper.enums import ExecutionStatus +from nodescraper.plugins.serviceability import ( + MI3XXAnalyzer, + MI3XXCollector, + MI3XXCollectorArgs, + MI3XXDataModel, + MI3XXDeviceInfo, + MI3XXResult, + ServiceabilityDataModel, + ServiceabilityPluginBase, + ServiceabilityPluginMI3XX, + build_mi3xx_reporting_version_fields, + compare_iso_datetime, + is_valid_iso_datetime, + satisfies_time_check, +) +from test.unit.plugin.serviceability_dummy_data import ( + DUMMY_BMC_HOST, + DUMMY_EVENT_URI, + DUMMY_EVENT_URI_ALT, + DUMMY_TIMESTAMP_EARLIER, + DUMMY_TIMESTAMP_LATER, +) + +EVENT_URI = DUMMY_EVENT_URI + + +@pytest.fixture +def mi3xx_collector(system_info, redfish_conn_mock): + redfish_conn_mock.base_url = f"https://{DUMMY_BMC_HOST}/redfish/v1" + return MI3XXCollector( + system_info=system_info, + connection=redfish_conn_mock, + log_path="/tmp/mi3xx.log", + ) + + +def test_mi3xx_collector_args_default_event_log_uri(): + args = MI3XXCollectorArgs() + uri = args.resolved_event_log_uri() + assert uri.startswith("/redfish/") + assert "EventLog" in uri + + +def test_mi3xx_collector_args_requires_event_log_uri(): + with pytest.raises(ValidationError): + MI3XXCollectorArgs(uri="", rf_event_log_uri="") + + +def test_mi3xx_collector_args_uri_alias(): + args = MI3XXCollectorArgs(uri=f" {DUMMY_EVENT_URI_ALT} ", rf_event_log_uri=DUMMY_EVENT_URI) + assert args.resolved_event_log_uri() == DUMMY_EVENT_URI_ALT + + +def test_mi3xx_collector_args_assembly_requires_both_template_and_devices(): + with pytest.raises(ValidationError): + MI3XXCollectorArgs( + rf_event_log_uri=EVENT_URI, + rf_assembly_uri_template="/redfish/v1/Chassis/{device}/Assembly", + ) + with pytest.raises(ValidationError): + MI3XXCollectorArgs( + rf_event_log_uri=EVENT_URI, + 
rf_chassis_devices=["dummy-chassis"], + ) + + +def test_mi3xx_collector_args_reference_time_requires_operator(): + with pytest.raises(ValidationError): + MI3XXCollectorArgs( + rf_event_log_uri=EVENT_URI, + reference_time="2000-01-01", + ) + + +def test_mi3xx_collector_args_accepts_iso_date_and_datetime(): + date_args = MI3XXCollectorArgs( + rf_event_log_uri=EVENT_URI, + reference_time="2000-01-01", + time_operator=">=", + ) + assert date_args.reference_time == "2000-01-01" + + +def test_time_utils_iso_validation_and_comparison(): + assert is_valid_iso_datetime("2000-01-01") + assert satisfies_time_check("2000-01-02", "2000-01-01", ">") + assert compare_iso_datetime("2000-01-01T00:00:00", "2000-01-01T00:00:00", "==") + + +def test_serviceability_plugin_mi3xx_wiring(): + assert issubclass(ServiceabilityPluginMI3XX, ServiceabilityPluginBase) + assert ServiceabilityPluginMI3XX.DATA_MODEL is ServiceabilityDataModel + assert ServiceabilityPluginMI3XX.COLLECTOR is MI3XXCollector + assert ServiceabilityPluginMI3XX.COLLECTOR_ARGS is MI3XXCollectorArgs + assert ServiceabilityPluginMI3XX.ANALYZER is MI3XXAnalyzer + + +def test_mi3xx_collector_no_args(mi3xx_collector): + result, data = mi3xx_collector.collect_data() + assert result.status == ExecutionStatus.NOT_RAN + assert "required" in result.message.lower() + assert data is None + + +def test_mi3xx_collector_success_minimal(mi3xx_collector, redfish_conn_mock): + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=True, + data={RF_MEMBERS: [{"Id": "dummy-1", "Created": DUMMY_TIMESTAMP_LATER}]}, + status_code=200, + ) + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = mi3xx_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert len(data.rf_events) == 1 + assert data.bmc_host == DUMMY_BMC_HOST + assert data.log_path == "/tmp/mi3xx.log" + + +def 
test_mi3xx_collector_satisfies_reference_time_helper(mi3xx_collector): + args = MI3XXCollectorArgs( + rf_event_log_uri=EVENT_URI, + reference_time="2000-01-01", + time_operator=">=", + ) + assert mi3xx_collector.satisfies_reference_time(DUMMY_TIMESTAMP_LATER, args) + assert not mi3xx_collector.satisfies_reference_time(DUMMY_TIMESTAMP_EARLIER, args) + + +def test_mi3xx_collector_filters_events_by_reference_time(mi3xx_collector, redfish_conn_mock): + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=True, + data={ + RF_MEMBERS: [ + {"Id": "dummy-1", "Created": DUMMY_TIMESTAMP_LATER}, + {"Id": "dummy-2", "Created": DUMMY_TIMESTAMP_EARLIER}, + ] + }, + status_code=200, + ) + args = MI3XXCollectorArgs( + rf_event_log_uri=EVENT_URI, + reference_time="2000-01-01", + time_operator=">=", + ) + result, data = mi3xx_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert [event["Id"] for event in data.rf_events] == ["dummy-1"] + + +def test_mi3xx_device_info_fields(): + info = MI3XXDeviceInfo( + board_product_name="dummy-board", + board_serial_number="dummy-serial-001", + product_version="0.0-dummy", + ) + assert info.board_product_name == "dummy-board" + assert info.product_version == "0.0-dummy" + + +def test_mi3xx_result_reporting_versions(): + version_fields = build_mi3xx_reporting_version_fields( + plugin_name="dummy_plugin", + plugin_version="0.0-dummy", + node_scraper_version="0.0-dummy", + dummy_engine_version="0.0-dummy", + ) + result = MI3XXResult(node="dummy-node", **version_fields) + assert result.plugin_name == "dummy_plugin" + assert result.reporter_extensions["dummy_engine_version"] == "0.0-dummy" + + +def test_mi3xx_data_model_log_model(tmp_path): + model = MI3XXDataModel( + collected_data={"events": [{"id": 1}]}, + artifacts={"events.json": [{"id": 1}]}, + ) + model.log_model(str(tmp_path)) + assert (tmp_path / "events.json").is_file() + assert (tmp_path / 
"MI3XX_data.json").is_file() diff --git a/test/unit/plugin/test_se_runner.py b/test/unit/plugin/test_se_runner.py new file mode 100644 index 00000000..cbb5e714 --- /dev/null +++ b/test/unit/plugin/test_se_runner.py @@ -0,0 +1,257 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import json +from pathlib import Path +from types import SimpleNamespace + +import pytest +from pydantic import ValidationError + +from nodescraper.enums import ExecutionStatus +from nodescraper.plugins.serviceability import ( + AfidEvent, + MI3XXAnalyzer, + SeRunError, + ServiceabilityAnalyzerArgs, + ServiceabilityBlock, + ServiceabilityDataModel, + build_afid_events_from_data, + format_serviceability_solution_lines, + normalize_se_timestamp, + run_service_engine, + serviceability_block_from_service_result, +) +from nodescraper.plugins.serviceability.se_models import ServiceabilitySolution +from test.unit.plugin.serviceability_dummy_data import ( + DUMMY_AFID_A, + DUMMY_AFID_B, + DUMMY_AFID_C, + DUMMY_DESIGNATION_A, + DUMMY_DESIGNATION_B, + DUMMY_ENGINE_VERSION, + DUMMY_OEM_VENDOR, + DUMMY_RF_EVENT_COUNT, + DUMMY_SAG_PID, + DUMMY_SAG_REVISION, + DUMMY_SERVICE_ACTION_NUM, + DUMMY_TIMESTAMP, + DUMMY_UNIT_A, + DUMMY_UNIT_B, + DUMMY_UNIT_C, +) + +FIXTURES = Path(__file__).resolve().parent / "fixtures" +AFID_SAG = FIXTURES / "afid_sag_sample.json" +EXAMPLE_EVENTS = [ + AfidEvent(afid=DUMMY_AFID_A, serviceable_unit=DUMMY_UNIT_A, time=DUMMY_TIMESTAMP), + AfidEvent(afid=DUMMY_AFID_B, serviceable_unit=DUMMY_UNIT_B, time=DUMMY_TIMESTAMP), + AfidEvent(afid=DUMMY_AFID_C, serviceable_unit=DUMMY_UNIT_C, time=DUMMY_TIMESTAMP), +] + + +def test_afid_event_requires_non_empty_serviceable_unit(): + with pytest.raises(ValidationError): + AfidEvent(afid=1, serviceable_unit=" ", time=DUMMY_TIMESTAMP) + + +def test_normalize_se_timestamp_preserves_engine_format(): + sample = "2000-01-01 12:00:00.000+00:00" + assert normalize_se_timestamp(sample) == sample + + +def test_analyzer_args_require_engine_config(): + with pytest.raises(ValidationError): + ServiceabilityAnalyzerArgs() + with pytest.raises(ValidationError, match="engine_python_module"): + ServiceabilityAnalyzerArgs(afid_sag_path=str(AFID_SAG)) + 
args = ServiceabilityAnalyzerArgs( + engine_python_module="dummy.test.module", + afid_sag_path=str(AFID_SAG), + ) + assert args.engine_python_module == "dummy.test.module" + + +def test_format_serviceability_solution_lines(): + block = ServiceabilityBlock( + afid_events=EXAMPLE_EVENTS[:1], + solution=[ + ServiceabilitySolution( + afid=DUMMY_AFID_A, + serviceable_unit=[DUMMY_DESIGNATION_A, DUMMY_DESIGNATION_B], + service_action_num=DUMMY_SERVICE_ACTION_NUM, + ) + ], + solution_reasoning="Dummy test reasoning.", + ) + lines = format_serviceability_solution_lines(block) + assert lines[0] == "Dummy test reasoning." + assert f"AFID {DUMMY_AFID_A}" in lines[1] + assert DUMMY_DESIGNATION_A in lines[1] + + +def test_serviceability_block_from_service_result(): + result = SimpleNamespace( + service_info={ + DUMMY_DESIGNATION_A: { + str(DUMMY_AFID_A): { + "service_action_number": str(DUMMY_SERVICE_ACTION_NUM), + "error_category": "dummy_category", + "error_type": "dummy_type", + "title": "Dummy service action", + } + }, + DUMMY_DESIGNATION_B: { + str(DUMMY_AFID_A): { + "service_action_number": str(DUMMY_SERVICE_ACTION_NUM), + "error_category": "dummy_category", + "error_type": "dummy_type", + "title": "Dummy service action", + } + }, + }, + afid_sag_metadata={"sag_pid": DUMMY_SAG_PID, "sag_revision": DUMMY_SAG_REVISION}, + engine_version_info={"version": DUMMY_ENGINE_VERSION}, + ) + block = serviceability_block_from_service_result( + EXAMPLE_EVENTS[:1], + result, + engine_label="Dummy test engine", + rf_event_count=DUMMY_RF_EVENT_COUNT, + ) + assert len(block.solution) == 1 + assert block.solution[0].afid == DUMMY_AFID_A + assert block.solution[0].service_action_num == DUMMY_SERVICE_ACTION_NUM + assert set(block.solution[0].serviceable_unit) == {DUMMY_DESIGNATION_A, DUMMY_DESIGNATION_B} + assert f"{DUMMY_RF_EVENT_COUNT} Redfish event(s)" in block.solution_reasoning + assert "Dummy test engine" in block.solution_reasoning + + +def 
test_resolve_engine_class_finds_package_export(): + import types + + submodule = types.ModuleType("fake_engine.impl") + submodule.__dict__["EngineImpl"] = type( + "EngineImpl", + (), + {"get_service_info": lambda self, rf_events, cper_data=None: None}, + ) + package = types.ModuleType("fake_engine") + package.EngineImpl = submodule.EngineImpl # type: ignore[attr-defined] + package.__all__ = ["EngineImpl"] + + from nodescraper.plugins.serviceability.se_runner import _resolve_engine_class + + assert _resolve_engine_class(package) is submodule.EngineImpl + + +def test_run_service_engine_with_mock_module(): + rf_events = [ + {"Afid": DUMMY_AFID_A, "serviceable_unit": DUMMY_UNIT_A, "Created": DUMMY_TIMESTAMP}, + {"Afid": DUMMY_AFID_C, "serviceable_unit": DUMMY_UNIT_C, "Created": DUMMY_TIMESTAMP}, + ] + block = run_service_engine( + engine_python_module="test.unit.plugin.fixtures.mock_python_engine", + afid_events=EXAMPLE_EVENTS[:2], + afid_sag_path=str(AFID_SAG), + rf_events=rf_events, + ) + assert len(block.solution) == 2 + assert block.solution[0].afid == DUMMY_AFID_A + assert block.solution[0].service_action_num == DUMMY_SERVICE_ACTION_NUM + + +def test_run_service_engine_missing_sag_raises(): + with pytest.raises(SeRunError, match="AFID_SAG"): + run_service_engine( + engine_python_module="test.unit.plugin.fixtures.mock_python_engine", + afid_events=EXAMPLE_EVENTS, + afid_sag_path="/nonexistent/dummy_afid_sag.json", + rf_events=[{"Afid": DUMMY_AFID_A}], + ) + + +def test_build_afid_events_from_rf_members(): + data = ServiceabilityDataModel( + rf_events=[ + { + "Afid": DUMMY_AFID_A, + "serviceable_unit": DUMMY_UNIT_A, + "Created": DUMMY_TIMESTAMP, + }, + { + "Oem": { + DUMMY_OEM_VENDOR: { + "Afid": DUMMY_AFID_B, + "serviceable_unit": DUMMY_UNIT_B, + } + }, + "EventTimestamp": DUMMY_TIMESTAMP, + }, + ] + ) + events = build_afid_events_from_data(data) + assert len(events) == 2 + assert events[0].afid == DUMMY_AFID_A + assert events[1].afid == DUMMY_AFID_B + + +def 
test_mi3xx_analyzer_runs_python_engine(system_info): + data = ServiceabilityDataModel( + rf_events=[ + { + "Afid": DUMMY_AFID_A, + "serviceable_unit": DUMMY_UNIT_A, + "Created": DUMMY_TIMESTAMP, + }, + { + "Afid": DUMMY_AFID_C, + "serviceable_unit": DUMMY_UNIT_C, + "Created": DUMMY_TIMESTAMP, + }, + ] + ) + analyzer = MI3XXAnalyzer(system_info=system_info) + args = ServiceabilityAnalyzerArgs( + engine_python_module="test.unit.plugin.fixtures.mock_python_engine", + afid_sag_path=str(AFID_SAG), + ) + result = analyzer.analyze_data(data, args=args) + assert result.status == ExecutionStatus.OK + assert data.serviceability is not None + assert len(data.serviceability.solution) == 2 + + +def test_mi3xx_analyzer_writes_serviceability_json(tmp_path, system_info): + data = ServiceabilityDataModel( + afid_events=EXAMPLE_EVENTS[:1], + serviceability=ServiceabilityBlock( + afid_events=EXAMPLE_EVENTS[:1], + solution=[], + ), + ) + data.log_model(str(tmp_path)) + payload = json.loads((tmp_path / "serviceability.json").read_text(encoding="utf-8")) + assert payload["afid_events"][0]["afid"] == DUMMY_AFID_A diff --git a/test/unit/plugin/test_serviceability_collector.py b/test/unit/plugin/test_serviceability_collector.py index d7496288..da31e491 100644 --- a/test/unit/plugin/test_serviceability_collector.py +++ b/test/unit/plugin/test_serviceability_collector.py @@ -38,7 +38,7 @@ from nodescraper.models import CollectorArgs from nodescraper.plugins.serviceability import ( DeviceInfo, - Mi3xxCollectorArgs, + MI3XXCollectorArgs, ServiceabilityAnalyzerArgs, ServiceabilityDataModel, ServiceabilityPluginBase, @@ -46,15 +46,16 @@ from nodescraper.plugins.serviceability.serviceability_collector import ( ServiceabilityCollectorBase, ) +from test.unit.plugin.serviceability_dummy_data import DUMMY_BMC_HOST, DUMMY_EVENT_URI -EVENT_URI = "/redfish/v1/Systems/1/LogServices/SEL/Entries" +EVENT_URI = DUMMY_EVENT_URI -class 
_StubServiceabilityCollector(ServiceabilityCollectorBase[Mi3xxCollectorArgs]): +class _StubServiceabilityCollector(ServiceabilityCollectorBase[MI3XXCollectorArgs]): def filter_event_members( self, members: list[Any], - args: Mi3xxCollectorArgs, + args: MI3XXCollectorArgs, ) -> list[Any]: return members @@ -68,21 +69,21 @@ def parse_assembly_entry( self, designation: str, assembly_member_entry: dict[str, Any], - args: Mi3xxCollectorArgs, + args: MI3XXCollectorArgs, ) -> DeviceInfo: return DeviceInfo(name=designation, serial_number=assembly_member_entry.get("SerialNumber")) def extract_component_details( self, firmware_inventory_payload: dict[str, Any], - args: Mi3xxCollectorArgs, + args: MI3XXCollectorArgs, ) -> Optional[str]: return firmware_inventory_payload.get("Details") @pytest.fixture def stub_serviceability_collector(system_info, redfish_conn_mock): - redfish_conn_mock.base_url = "https://bmc.example/redfish/v1" + redfish_conn_mock.base_url = f"https://{DUMMY_BMC_HOST}/redfish/v1" return _StubServiceabilityCollector( system_info=system_info, connection=redfish_conn_mock, @@ -90,40 +91,53 @@ def stub_serviceability_collector(system_info, redfish_conn_mock): ) +def test_mi3xx_collector_args_default_event_log_uri(): + args = MI3XXCollectorArgs() + uri = args.resolved_event_log_uri() + assert uri.startswith("/redfish/") + assert "EventLog" in uri + + def test_mi3xx_collector_args_requires_event_log_uri(): with pytest.raises(ValidationError): - Mi3xxCollectorArgs() + MI3XXCollectorArgs(uri="", rf_event_log_uri="") def test_mi3xx_collector_args_uri_alias_prefers_uri_over_rf_event_log_uri(): - args = Mi3xxCollectorArgs(uri=" /events ", rf_event_log_uri="/other") - assert args.resolved_event_log_uri() == "/events" + args = MI3XXCollectorArgs( + uri=" /redfish/v1/Systems/Dummy/LogServices/DummyEventLog/EntriesAlt ", + rf_event_log_uri="/redfish/v1/Systems/Dummy/LogServices/DummyEventLog/Entries", + ) + assert ( + args.resolved_event_log_uri() + == 
"/redfish/v1/Systems/Dummy/LogServices/DummyEventLog/EntriesAlt" + ) def test_mi3xx_collector_args_assembly_requires_both_template_and_devices(): with pytest.raises(ValidationError): - Mi3xxCollectorArgs( + MI3XXCollectorArgs( rf_event_log_uri=EVENT_URI, rf_assembly_uri_template="/redfish/v1/Chassis/{device}/Assembly", ) with pytest.raises(ValidationError): - Mi3xxCollectorArgs( + MI3XXCollectorArgs( rf_event_log_uri=EVENT_URI, - rf_chassis_devices=["C1"], + rf_chassis_devices=["dummy-chassis"], ) def test_mi3xx_collector_args_assembly_template_must_include_device_placeholder(): with pytest.raises(ValidationError): - Mi3xxCollectorArgs( + MI3XXCollectorArgs( rf_event_log_uri=EVENT_URI, - rf_assembly_uri_template="/redfish/v1/Chassis/C1/Assembly", - rf_chassis_devices=["C1"], + rf_assembly_uri_template="/redfish/v1/Chassis/dummy-chassis/Assembly", + rf_chassis_devices=["dummy-chassis"], ) def test_mi3xx_collector_args_assembly_optional_when_omitted(): - args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI) + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) assert args.rf_assembly_uri_template is None assert args.rf_chassis_devices is None @@ -150,7 +164,7 @@ def test_stub_collector_event_log_get_fails(stub_serviceability_collector, redfi error="timeout", status_code=None, ) - args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI) + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) result, data = stub_serviceability_collector.collect_data(args=args) assert result.status == ExecutionStatus.ERROR assert EVENT_URI in result.message @@ -165,13 +179,13 @@ def test_stub_collector_success_minimal(stub_serviceability_collector, redfish_c data={RF_MEMBERS: members}, status_code=200, ) - args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI) + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) result, data = stub_serviceability_collector.collect_data(args=args) assert result.status == ExecutionStatus.OK assert data is not None assert data.rf_events == members assert 
EVENT_URI in data.responses - assert data.bmc_host == "bmc.example" + assert data.bmc_host == DUMMY_BMC_HOST assert data.log_path == "/tmp/serviceability.log" redfish_conn_mock.run_get_paged.assert_called_once() @@ -193,7 +207,7 @@ def filter_event_members(self, members, args): data={RF_MEMBERS: []}, status_code=200, ) - args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI) + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) result, data = collector.collect_data(args=args) assert result.status == ExecutionStatus.ERROR assert "Event filter failed" in result.message @@ -204,7 +218,7 @@ def test_stub_collector_assembly_and_firmware_paths( stub_serviceability_collector, redfish_conn_mock ): tpl = "/redfish/v1/Chassis/{device}/Assembly" - asm_uri = tpl.format(device="C1") + asm_uri = tpl.format(device="dummy-chassis") fw_uri = "/redfish/v1/UpdateService/FirmwareInventory" def run_get_side_effect(path: str, *_args, **_kwargs): @@ -219,14 +233,14 @@ def run_get_side_effect(path: str, *_args, **_kwargs): return RedfishGetResult( path=asm_uri, success=True, - data={"Assemblies": [{"SerialNumber": "SN-ASM"}]}, + data={"Assemblies": [{"SerialNumber": "dummy-asm-serial"}]}, status_code=200, ) if path == fw_uri: return RedfishGetResult( path=fw_uri, success=True, - data={"Details": "fw-summary"}, + data={"Details": "dummy-fw-summary"}, status_code=200, ) raise AssertionError(f"unexpected Redfish GET path: {path!r}") @@ -238,19 +252,19 @@ def run_get_paged_forbidden(*_args, **_kwargs): redfish_conn_mock.run_get_paged.side_effect = run_get_paged_forbidden - args = Mi3xxCollectorArgs( + args = MI3XXCollectorArgs( rf_event_log_uri=EVENT_URI, rf_assembly_uri_template=tpl, - rf_chassis_devices=["C1"], + rf_chassis_devices=["dummy-chassis"], rf_firmware_bundle_uri=fw_uri, follow_next_link=False, ) result, data = stub_serviceability_collector.collect_data(args=args) assert result.status == ExecutionStatus.OK assert data is not None - assert "C1" in data.assembly_info - assert 
data.assembly_info["C1"].serial_number == "SN-ASM" - assert data.component_details == "fw-summary" + assert "dummy-chassis" in data.assembly_info + assert data.assembly_info["dummy-chassis"].serial_number == "dummy-asm-serial" + assert data.component_details == "dummy-fw-summary" assert asm_uri in data.responses @@ -271,7 +285,7 @@ def test_stub_collector_top_when_count_exceeds_top_uses_skip_and_paged( ) redfish_conn_mock.run_get.return_value = probe redfish_conn_mock.run_get_paged.return_value = window - args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI, top=10) + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI, top=10) result, data = stub_serviceability_collector.collect_data(args=args) assert result.status == ExecutionStatus.OK assert data is not None @@ -300,7 +314,7 @@ def test_stub_collector_top_when_count_within_top_fetches_full_log( ) redfish_conn_mock.run_get.return_value = probe redfish_conn_mock.run_get_paged.return_value = full - args = Mi3xxCollectorArgs(rf_event_log_uri=EVENT_URI, top=50) + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI, top=50) result, data = stub_serviceability_collector.collect_data(args=args) assert result.status == ExecutionStatus.OK assert data is not None From 9ace7e6fea33e98e68b319e4f3109c515f5f3924 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 27 May 2026 12:37:58 -0500 Subject: [PATCH 07/39] updates --- nodescraper/interfaces/datacollectortask.py | 1 - nodescraper/interfaces/task.py | 23 ++- nodescraper/models/event.py | 19 ++- nodescraper/plugins/inband/rocm/rocmdata.py | 55 ++++--- .../plugins/serviceability/analyzer_args.py | 22 ++- .../plugins/serviceability/cper_decode.py | 145 ++++++++++++++++++ .../serviceability/mi3xx/mi3xx_analyzer.py | 58 ++++++- .../serviceability/mi3xx/mi3xx_collector.py | 56 ++++++- .../serviceability_collector.py | 8 +- .../serviceability/serviceability_data.py | 7 + test/unit/plugin/test_mi3xx_collector.py | 33 ++++ .../plugin/test_serviceability_collector.py | 2 +- 12 
files changed, 389 insertions(+), 40 deletions(-) create mode 100644 nodescraper/plugins/serviceability/cper_decode.py diff --git a/nodescraper/interfaces/datacollectortask.py b/nodescraper/interfaces/datacollectortask.py index 020bf053..18308a98 100644 --- a/nodescraper/interfaces/datacollectortask.py +++ b/nodescraper/interfaces/datacollectortask.py @@ -151,7 +151,6 @@ def __init__( Args: system_info (SystemInfo): system info object for target system for data collection system_interaction (SystemInteraction): enum to indicate the type of actions that can be performed when interacting with the system - event_reporter (str, optional): Described the reporter of the event. Defaults to DEFAULT_EVENT_REPORTER. logger (Optional[logging.Logger], optional): python logger object. Defaults to None. log_path (Optional[str], optional): file system log path. Defaults to None. """ diff --git a/nodescraper/interfaces/task.py b/nodescraper/interfaces/task.py index 8855a48a..3696673a 100644 --- a/nodescraper/interfaces/task.py +++ b/nodescraper/interfaces/task.py @@ -73,8 +73,10 @@ def __init__( if session_id is not None: try: uuid.UUID(str(session_id)) - except (ValueError, TypeError, AttributeError): - raise ValueError("session_id must be a valid UUID") from None + except (ValueError, TypeError, AttributeError) as e: + raise ValueError( + f"session_id must be a valid UUID string, got: {session_id}" + ) from e self.session_id: Optional[str] = str(session_id) if session_id is not None else None self.result: TaskResult = self._init_result() @@ -166,7 +168,22 @@ def _log_event( ) if console_log: - self.logger.log(getattr(logging, priority.name, logging.INFO), description) + level = getattr(logging, priority.name, logging.INFO) + prefix = "" + if data: + et = data.get("exception_type") + if et: + prefix = f"[{et}] " + self.logger.log(level, "%s%s", prefix, description) + if data: + tb = data.get("traceback") + if tb: + tb_text = "".join(tb) if isinstance(tb, list) else str(tb) + if 
tb_text.strip(): + self.logger.log(level, "Traceback:\n%s", tb_text.rstrip()) + det = data.get("details") + if det and not tb: + self.logger.log(level, "Details: %s", det) self.result.events.append(event) diff --git a/nodescraper/models/event.py b/nodescraper/models/event.py index 25315ef2..7c959d4e 100644 --- a/nodescraper/models/event.py +++ b/nodescraper/models/event.py @@ -114,13 +114,21 @@ def validate_category(cls, category: Optional[Union[str, Enum]]) -> str: @field_validator("priority", mode="before") @classmethod def validate_priority(cls, priority: Union[str, int, EventPriority]) -> EventPriority: - """Allow priority as EventPriority, enum name string, or IntEnum value (unknown int -> ERROR). + """Allow priority via :class:`EventPriority`, name string, or integer value. + + Integer values use :class:`~enum.IntEnum` construction (same numeric scale as + ``EventPriority``). Values outside the enum (e.g. foreign severity codes) map + to :attr:`EventPriority.ERROR`. Booleans are rejected (``bool`` is a subclass + of ``int`` in Python). Args: - priority: EventPriority, name string, integer matching a level, or unknown int (maps to ERROR). + priority: Enum, member name, or integer severity. Raises: - ValueError: if priority string is invalid, or if a boolean is passed. + ValueError: if *priority* is a boolean or an invalid string name. + + Returns: + Resolved :class:`EventPriority`. 
""" if type(priority) is bool: raise ValueError("priority must not be a boolean") @@ -138,7 +146,10 @@ def validate_priority(cls, priority: Union[str, int, EventPriority]) -> EventPri ) from e if isinstance(priority, EventPriority): return priority - raise ValueError("priority must be an EventPriority or its name as a string") + raise ValueError( + "priority must be an EventPriority, its name as a string, or an int " + "(unknown ints map to ERROR)" + ) @field_serializer("priority") def serialize_priority(self, priority: EventPriority, _info) -> str: diff --git a/nodescraper/plugins/inband/rocm/rocmdata.py b/nodescraper/plugins/inband/rocm/rocmdata.py index cd1b0537..eb1794c3 100644 --- a/nodescraper/plugins/inband/rocm/rocmdata.py +++ b/nodescraper/plugins/inband/rocm/rocmdata.py @@ -24,18 +24,24 @@ # ############################################################################### import re -from typing import ClassVar, List, Optional +from typing import List, Optional -from pydantic import computed_field, field_validator +from pydantic import field_validator from nodescraper.models import DataModel -_ROCM_VERSION_RE = re.compile(r"^(\d+(?:\.\d+){0,3})(?:-(\d+)(?:-gfx\w+(?:;gfx\w+)*)?)?$") +# e.g. 
7.13.0, 7.13.0-123, 7.13.0-123-gfx942, 7.13.0-123-gfx942;gfx950 +_ROCM_VERSION_RE = re.compile(r"^\d+(?:\.\d+){0,3}(?:-\d+)?(?:-gfx\d+(?:;gfx\d+)*)?$") +_ROCM_BUILD_NUMBER_RE = re.compile(r"^\d+(?:\.\d+){0,3}-(\d+)") -class RocmDataModel(DataModel): - ROCM_VERSION_FILENAME: ClassVar[str] = "version-rocm" +def _validate_rocm_version_string(rocm_version: str) -> str: + if not _ROCM_VERSION_RE.match(rocm_version): + raise ValueError(f"ROCm version has invalid format: {rocm_version}") + return rocm_version + +class RocmDataModel(DataModel): rocm_version: str rocm_sub_versions: dict[str, str] = {} rocminfo: List[str] = [] @@ -47,28 +53,33 @@ class RocmDataModel(DataModel): clinfo: List[str] = [] kfd_proc: List[str] = [] - @staticmethod - def _validate_version_string(version: str) -> str: - if not _ROCM_VERSION_RE.match(version): - raise ValueError(f"ROCm version has invalid format: {version}") - return version - @field_validator("rocm_version") @classmethod def validate_rocm_version(cls, rocm_version: str) -> str: - return cls._validate_version_string(rocm_version) + """ + Validate the ROCm version format. + + Args: + rocm_version (str): The ROCm version string to validate. + + Raises: + ValueError: If the ROCm version does not match the expected format. + + Returns: + str: The validated ROCm version string. 
+ """ + return _validate_rocm_version_string(rocm_version) @field_validator("rocm_sub_versions") @classmethod - def validate_rocm_sub_versions(cls, sub_versions: dict[str, str]) -> dict[str, str]: - for version in sub_versions.values(): - cls._validate_version_string(version) - return sub_versions + def validate_rocm_sub_versions(cls, rocm_sub_versions: dict[str, str]) -> dict[str, str]: + for value in rocm_sub_versions.values(): + _validate_rocm_version_string(value) + return rocm_sub_versions - @computed_field + @property def build_number(self) -> Optional[str]: - """Build tag from version-rocm sub-version, or rocm_version when absent.""" - rocm_version = self.rocm_sub_versions.get(self.ROCM_VERSION_FILENAME, self.rocm_version) - if "-" in rocm_version: - return rocm_version.split("-")[1] - return None + """ROCm package build number from version-rocm sub-version or rocm_version.""" + version_str = self.rocm_sub_versions.get("version-rocm") or self.rocm_version + match = _ROCM_BUILD_NUMBER_RE.match(version_str) + return match.group(1) if match else None diff --git a/nodescraper/plugins/serviceability/analyzer_args.py b/nodescraper/plugins/serviceability/analyzer_args.py index 679743dd..8d5deea1 100644 --- a/nodescraper/plugins/serviceability/analyzer_args.py +++ b/nodescraper/plugins/serviceability/analyzer_args.py @@ -54,8 +54,28 @@ class ServiceabilityAnalyzerArgs(AnalyzerArgs): default=False, description="If True, only build afid_events without running the service engine.", ) + cper_decode_module: Optional[str] = Field( + default=None, + description=( + "Import path of the Python module that decodes CPER blobs (e.g. " + "vendor.package.cdump_analyzer). Required when collected events " + "include CPER attachments to decode before running the service engine." + ), + ) + cper_decode_method: str = Field( + default="analyze_cper", + description=( + "Name of the callable on cper_decode_module. 
It must accept a " + "binary file-like CPER payload and return (return_code, decode_dict)." + ), + ) - @field_validator("afid_sag_path", "engine_python_module", "engine_display_name") + @field_validator( + "afid_sag_path", + "engine_python_module", + "engine_display_name", + "cper_decode_module", + ) @classmethod def _strip_optional_strings(cls, value: Optional[str]) -> Optional[str]: if value is None: diff --git a/nodescraper/plugins/serviceability/cper_decode.py b/nodescraper/plugins/serviceability/cper_decode.py new file mode 100644 index 00000000..6982407a --- /dev/null +++ b/nodescraper/plugins/serviceability/cper_decode.py @@ -0,0 +1,145 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +"""Decode collected CPER attachments via a configured Python decode module.""" +from __future__ import annotations + +import base64 +import binascii +import importlib +import io +import logging +from typing import Any, Callable, Optional + + +class CperDecodeError(RuntimeError): + """Raised when the configured CPER decode module cannot be loaded or decoding fails.""" + + +def _load_decode_callable( + cper_decode_module: str, + cper_decode_method: str, +) -> Callable[[io.BytesIO], tuple[int, Any]]: + """Import a decode callable from analysis_args (module + method name).""" + try: + module = importlib.import_module(cper_decode_module) + except ImportError as exc: + raise CperDecodeError( + f"Cannot import cper_decode_module {cper_decode_module!r}: {exc}" + ) from exc + + decode_fn = getattr(module, cper_decode_method, None) + if decode_fn is None: + raise CperDecodeError( + f"Module {cper_decode_module!r} has no callable {cper_decode_method!r}" + ) + if not callable(decode_fn): + raise CperDecodeError(f"{cper_decode_module!r}.{cper_decode_method!r} is not callable") + return decode_fn + + +def count_ras_err_entries(decode_payload: Any) -> int: + """Count RasErr* keys in a decoded CPER triage_result dict.""" + if not isinstance(decode_payload, dict): + return 0 + triage_result = decode_payload.get("triage_result", {}) + if not isinstance(triage_result, dict): + return 0 + return sum(1 for key in triage_result if str(key).startswith("RasErr")) + + +def decode_cper_raw_attachments( + cper_raw: dict[str, str], + *, + cper_decode_module: str, + cper_decode_method: str = "analyze_cper", + logger: Optional[logging.Logger] = None, +) -> dict[str, Any]: + """Decode base64 CPER blobs keyed by Redfish event Id. + + The decode callable must accept a binary file-like object and return + ``(return_code, decode_dict)``. 
Results are passed to the service engine as + ``cper_data``; the engine does not perform CPER decoding itself. + + Returns ``{event_id: {"return_code": int, "decode": dict}}``. + """ + if not cper_raw: + return {} + + decode_fn = _load_decode_callable(cper_decode_module, cper_decode_method) + + decoded: dict[str, Any] = {} + errors: list[str] = [] + + for event_id, payload_b64 in cper_raw.items(): + try: + raw = base64.b64decode(payload_b64, validate=True) + except (binascii.Error, ValueError) as exc: + errors.append(f"event {event_id}: invalid base64 ({exc})") + continue + + try: + return_code, decode_payload = decode_fn(io.BytesIO(raw)) + except Exception as exc: # noqa: BLE001 + msg = f"event {event_id}: {exc}" + errors.append(msg) + if logger is not None: + logger.warning("CPER decode failed for Redfish event %s: %s", event_id, exc) + continue + + if return_code != 0: + errors.append(f"event {event_id}: decode return code {return_code}") + + decoded[str(event_id)] = { + "return_code": return_code, + "decode": decode_payload, + } + if logger is not None: + ras_count = count_ras_err_entries(decode_payload) + if return_code == 0: + logger.info( + "CPER decoded for Redfish event %s (return_code=0, %d RasErr entr%s)", + event_id, + ras_count, + "y" if ras_count == 1 else "ies", + ) + else: + logger.warning( + "CPER decoded for Redfish event %s with non-zero return_code=%s " + "(%d RasErr entr%s)", + event_id, + return_code, + ras_count, + "y" if ras_count == 1 else "ies", + ) + + if errors and not decoded: + raise CperDecodeError("; ".join(errors)) + + if logger is not None and errors: + for msg in errors: + logger.warning("CPER decode issue: %s", msg) + + return decoded diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py index ab001184..b8fc8373 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py @@ 
-32,6 +32,10 @@ from nodescraper.models import TaskResult from nodescraper.plugins.serviceability.afid_events import build_afid_events_from_data from nodescraper.plugins.serviceability.analyzer_args import ServiceabilityAnalyzerArgs +from nodescraper.plugins.serviceability.cper_decode import ( + CperDecodeError, + decode_cper_raw_attachments, +) from nodescraper.plugins.serviceability.se_adapter import ( format_serviceability_solution_lines, ) @@ -67,6 +71,51 @@ def analyze_data( self._log_serviceability_solutions(data.serviceability) return self.result + parent = self.parent or self.__class__.__name__ + cper_data = data.cper_data or {} + if data.cper_raw and not cper_data: + if not args.cper_decode_module: + self.logger.warning( + "(%s) %d CPER attachment(s) collected but cper_decode_module is " + "not set in analysis_args; skipping CPER decode", + parent, + len(data.cper_raw), + ) + else: + self.logger.info( + "(%s) Decoding %d CPER attachment(s) via %s.%s", + parent, + len(data.cper_raw), + args.cper_decode_module, + args.cper_decode_method, + ) + try: + cper_data = decode_cper_raw_attachments( + data.cper_raw, + cper_decode_module=args.cper_decode_module, + cper_decode_method=args.cper_decode_method, + logger=self.logger, + ) + data.cper_data = cper_data + self.logger.info( + "(%s) CPER decode finished: %d of %d attachment(s) decoded", + parent, + len(cper_data), + len(data.cper_raw), + ) + except CperDecodeError as exc: + self.logger.warning( + "(%s) %s; continuing without decoded CPER", + parent, + exc, + ) + elif cper_data: + self.logger.info( + "(%s) Using %d pre-decoded CPER record(s) from collection", + parent, + len(cper_data), + ) + try: block = run_service_engine( engine_python_module=args.engine_python_module, # type: ignore[arg-type] @@ -74,7 +123,7 @@ def analyze_data( afid_events=events, afid_sag_path=args.afid_sag_path, # type: ignore[arg-type] rf_events=data.rf_events, - cper_data=data.cper_data or None, + cper_data=cper_data or None, ) except 
(SeRunError, ValueError) as exc: self.result.status = ExecutionStatus.ERROR @@ -85,9 +134,14 @@ def analyze_data( self._log_serviceability_solutions(block) engine_label = args.engine_display_name or args.engine_python_module self.result.status = ExecutionStatus.OK + cper_summary = "" + if cper_data: + cper_summary = f", {len(cper_data)} decoded CPER(s)" + elif data.cper_raw: + cper_summary = f", {len(data.cper_raw)} CPER attachment(s) not decoded" self.result.message = ( f"{engine_label}: {len(block.solution)} solution(s) " - f"from {len(data.rf_events)} Redfish event(s)" + f"from {len(data.rf_events)} Redfish event(s){cper_summary}" ) return self.result diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py index 63e23e21..44594aee 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py @@ -25,6 +25,7 @@ ############################################################################### from __future__ import annotations +import base64 from typing import Any, Optional from nodescraper.plugins.serviceability.serviceability_collector import ( @@ -67,12 +68,63 @@ def filter_event_members( return filtered def is_cper_event(self, event: dict) -> bool: + if "CPER" in event: + return True + if str(event.get("DiagnosticDataType", "")).upper() == "CPER": + return True + if event.get("AdditionalDataURI"): + return True message_id = str(event.get("MessageId", "")).lower() message = str(event.get("Message", "")).lower() return "cper" in message_id or "cper" in message or "diagnostic" in message_id - def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: - return {} + def collect_cper_attachments(self, rf_events: list[Any]) -> dict[str, str]: + """Fetch CPER binaries from BMC; decoding runs in the analyzer.""" + parent = self.parent or self.__class__.__name__ + attachments: dict[str, str] = {} + for event 
in rf_events: + if not isinstance(event, dict) or not self.is_cper_event(event): + continue + uri = event.get("AdditionalDataURI") + event_id = event.get("Id") + if not uri or not event_id: + continue + + try: + resp = self.connection.get_response(uri) + except Exception as exc: # noqa: BLE001 + self.logger.warning( + "(%s) Failed to fetch CPER attachment for event %s: %s", + parent, + event_id, + exc, + ) + continue + if not resp.ok: + self.logger.warning( + "(%s) Failed to fetch CPER attachment for event %s: HTTP %s", + parent, + event_id, + resp.status_code, + ) + continue + + size_bytes = len(resp.content) + attachments[str(event_id)] = base64.b64encode(resp.content).decode("ascii") + self.logger.info( + "(%s) Fetched CPER attachment for Redfish event %s (%d bytes)", + parent, + event_id, + size_bytes, + ) + + if attachments: + self.logger.info( + "(%s) Collected %d CPER attachment(s) for analyzer decode", + parent, + len(attachments), + ) + return attachments def parse_assembly_entry( self, diff --git a/nodescraper/plugins/serviceability/serviceability_collector.py b/nodescraper/plugins/serviceability/serviceability_collector.py index 961afdf9..3278c113 100644 --- a/nodescraper/plugins/serviceability/serviceability_collector.py +++ b/nodescraper/plugins/serviceability/serviceability_collector.py @@ -76,8 +76,8 @@ def is_cper_event(self, event: dict) -> bool: """Return whether a Redfish event entry should be treated as diagnostic-backed.""" @abc.abstractmethod - def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: - """Fetch and decode diagnostic attachments for qualifying events (subclass-defined).""" + def collect_cper_attachments(self, rf_events: list[Any]) -> dict[str, str]: + """Fetch CPER binary attachments for qualifying events (base64 by event Id).""" @abc.abstractmethod def parse_assembly_entry( @@ -151,13 +151,13 @@ def collect_data( entry = assemblies[0] assembly_info[device] = self.parse_assembly_entry(device, entry, svc_args) - 
cper_data = self.collect_cper_data(filtered_members or []) + cper_raw = self.collect_cper_attachments(filtered_members or []) data = ServiceabilityDataModel( responses=responses, rf_events=filtered_members or [], assembly_info=assembly_info, - cper_data=cper_data, + cper_raw=cper_raw, component_details=self._fetch_component_details(responses, svc_args), log_path=self._log_path, bmc_host=bmc_host, diff --git a/nodescraper/plugins/serviceability/serviceability_data.py b/nodescraper/plugins/serviceability/serviceability_data.py index 0c387940..b275c579 100644 --- a/nodescraper/plugins/serviceability/serviceability_data.py +++ b/nodescraper/plugins/serviceability/serviceability_data.py @@ -66,6 +66,13 @@ class ServiceabilityDataModel(DataModel): responses: dict[str, Any] = {} rf_events: list[Any] = [] assembly_info: Dict[str, DeviceInfo] = {} + cper_raw: Dict[str, str] = Field( + default_factory=dict, + description=( + "Base64-encoded CPER attachment bytes keyed by Redfish event Id; " + "populated during collection and decoded in the analyzer." 
+ ), + ) cper_data: Dict[str, Any] = {} component_details: Optional[str] = None log_path: Optional[str] = None diff --git a/test/unit/plugin/test_mi3xx_collector.py b/test/unit/plugin/test_mi3xx_collector.py index b89b1b71..91ff0ed0 100644 --- a/test/unit/plugin/test_mi3xx_collector.py +++ b/test/unit/plugin/test_mi3xx_collector.py @@ -158,6 +158,39 @@ def test_mi3xx_collector_satisfies_reference_time_helper(mi3xx_collector): assert not mi3xx_collector.satisfies_reference_time(DUMMY_TIMESTAMP_EARLIER, args) +def test_mi3xx_collector_fetches_cper_attachments(mi3xx_collector, redfish_conn_mock): + import base64 + from unittest.mock import MagicMock + + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=True, + data={ + RF_MEMBERS: [ + { + "Id": "cper-evt-1", + "Created": DUMMY_TIMESTAMP_LATER, + "DiagnosticDataType": "CPER", + "AdditionalDataURI": "/redfish/v1/Systems/UBB/LogServices/EventLog/Attachments/1", + } + ] + }, + status_code=200, + ) + response = MagicMock() + response.ok = True + response.status_code = 200 + response.content = b"\x01\x02dummy-cper" + redfish_conn_mock.get_response.return_value = response + + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = mi3xx_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.cper_raw["cper-evt-1"] == base64.b64encode(b"\x01\x02dummy-cper").decode("ascii") + assert data.cper_data == {} + + def test_mi3xx_collector_filters_events_by_reference_time(mi3xx_collector, redfish_conn_mock): redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( path=EVENT_URI, diff --git a/test/unit/plugin/test_serviceability_collector.py b/test/unit/plugin/test_serviceability_collector.py index da31e491..603cf08a 100644 --- a/test/unit/plugin/test_serviceability_collector.py +++ b/test/unit/plugin/test_serviceability_collector.py @@ -62,7 +62,7 @@ def filter_event_members( def is_cper_event(self, event: 
dict) -> bool: return False - def collect_cper_data(self, rf_events: list[Any]) -> dict[str, Any]: + def collect_cper_attachments(self, rf_events: list[Any]) -> dict[str, str]: return {} def parse_assembly_entry( From eeb98889178e670787ac1f0622db2685a9bbad17 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 27 May 2026 12:42:36 -0500 Subject: [PATCH 08/39] undid rocmdata changes --- nodescraper/plugins/inband/rocm/rocmdata.py | 55 +++++++++------------ 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/nodescraper/plugins/inband/rocm/rocmdata.py b/nodescraper/plugins/inband/rocm/rocmdata.py index eb1794c3..cd1b0537 100644 --- a/nodescraper/plugins/inband/rocm/rocmdata.py +++ b/nodescraper/plugins/inband/rocm/rocmdata.py @@ -24,24 +24,18 @@ # ############################################################################### import re -from typing import List, Optional +from typing import ClassVar, List, Optional -from pydantic import field_validator +from pydantic import computed_field, field_validator from nodescraper.models import DataModel -# e.g. 
7.13.0, 7.13.0-123, 7.13.0-123-gfx942, 7.13.0-123-gfx942;gfx950 -_ROCM_VERSION_RE = re.compile(r"^\d+(?:\.\d+){0,3}(?:-\d+)?(?:-gfx\d+(?:;gfx\d+)*)?$") -_ROCM_BUILD_NUMBER_RE = re.compile(r"^\d+(?:\.\d+){0,3}-(\d+)") - - -def _validate_rocm_version_string(rocm_version: str) -> str: - if not _ROCM_VERSION_RE.match(rocm_version): - raise ValueError(f"ROCm version has invalid format: {rocm_version}") - return rocm_version +_ROCM_VERSION_RE = re.compile(r"^(\d+(?:\.\d+){0,3})(?:-(\d+)(?:-gfx\w+(?:;gfx\w+)*)?)?$") class RocmDataModel(DataModel): + ROCM_VERSION_FILENAME: ClassVar[str] = "version-rocm" + rocm_version: str rocm_sub_versions: dict[str, str] = {} rocminfo: List[str] = [] @@ -53,33 +47,28 @@ class RocmDataModel(DataModel): clinfo: List[str] = [] kfd_proc: List[str] = [] + @staticmethod + def _validate_version_string(version: str) -> str: + if not _ROCM_VERSION_RE.match(version): + raise ValueError(f"ROCm version has invalid format: {version}") + return version + @field_validator("rocm_version") @classmethod def validate_rocm_version(cls, rocm_version: str) -> str: - """ - Validate the ROCm version format. - - Args: - rocm_version (str): The ROCm version string to validate. - - Raises: - ValueError: If the ROCm version does not match the expected format. - - Returns: - str: The validated ROCm version string. 
- """ - return _validate_rocm_version_string(rocm_version) + return cls._validate_version_string(rocm_version) @field_validator("rocm_sub_versions") @classmethod - def validate_rocm_sub_versions(cls, rocm_sub_versions: dict[str, str]) -> dict[str, str]: - for value in rocm_sub_versions.values(): - _validate_rocm_version_string(value) - return rocm_sub_versions + def validate_rocm_sub_versions(cls, sub_versions: dict[str, str]) -> dict[str, str]: + for version in sub_versions.values(): + cls._validate_version_string(version) + return sub_versions - @property + @computed_field def build_number(self) -> Optional[str]: - """ROCm package build number from version-rocm sub-version or rocm_version.""" - version_str = self.rocm_sub_versions.get("version-rocm") or self.rocm_version - match = _ROCM_BUILD_NUMBER_RE.match(version_str) - return match.group(1) if match else None + """Build tag from version-rocm sub-version, or rocm_version when absent.""" + rocm_version = self.rocm_sub_versions.get(self.ROCM_VERSION_FILENAME, self.rocm_version) + if "-" in rocm_version: + return rocm_version.split("-")[1] + return None From 89e10dd774d4f1c009082afc5d2dcb85d46b9696 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 12 Jun 2026 09:48:30 -0500 Subject: [PATCH 09/39] first pass at picking up oob for PLUGIN_DOC --- .github/workflows/update-plugin-docs.yml | 1 - docs/PLUGIN_DOC.md | 206 +++++++++++- docs/generate_plugin_doc_bundle.py | 301 +++++++++++++++--- .../generic_collection_collector.py | 5 + .../{ => inband}/regex_search/__init__.py | 56 ++-- .../regex_search/analyzer_args.py | 100 +++--- .../regex_search/regex_search_analyzer.py | 204 ++++++------ .../regex_search/regex_search_data.py | 214 ++++++------- .../regex_search/regex_search_plugin.py | 143 ++++----- .../bmc_archive/bmc_archive_collector.py | 5 + .../redfish_endpoint/endpoint_analyzer.py | 6 + .../redfish_endpoint/endpoint_collector.py | 7 + .../redfish_oem_diag/oem_diag_analyzer.py | 5 + 
.../redfish_oem_diag/oem_diag_collector.py | 6 + .../unit/plugin/test_regex_search_analyzer.py | 14 +- 15 files changed, 866 insertions(+), 407 deletions(-) rename nodescraper/plugins/{ => inband}/regex_search/__init__.py (97%) rename nodescraper/plugins/{ => inband}/regex_search/analyzer_args.py (97%) rename nodescraper/plugins/{ => inband}/regex_search/regex_search_analyzer.py (97%) rename nodescraper/plugins/{ => inband}/regex_search/regex_search_data.py (97%) rename nodescraper/plugins/{ => inband}/regex_search/regex_search_plugin.py (87%) diff --git a/.github/workflows/update-plugin-docs.yml b/.github/workflows/update-plugin-docs.yml index a4da869c..97c1e48a 100644 --- a/.github/workflows/update-plugin-docs.yml +++ b/.github/workflows/update-plugin-docs.yml @@ -37,7 +37,6 @@ jobs: run: | source venv/bin/activate python docs/generate_plugin_doc_bundle.py \ - --package nodescraper.plugins.inband \ --output docs/PLUGIN_DOC.md \ --update-readme-help diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md index 0ca1366f..e0d84df8 100644 --- a/docs/PLUGIN_DOC.md +++ b/docs/PLUGIN_DOC.md @@ -1,6 +1,6 @@ # Plugin Documentation -# Plugin Table +# IB Plugins | Plugin | Collection | Analyzer Args | Collection Args | DataModel | Collector | Analyzer | | --- | --- | --- | --- | --- | --- | --- | @@ -24,6 +24,7 @@ | PciePlugin | lspci -d {vendor_id}: -nn
lspci -x
lspci -xxxx
lspci -PP
lspci -PP -d {vendor_id}:{dev_id}
lspci -PP -D -d {vendor_id}:{dev_id}
lspci -PP -D
lspci -vvv
lspci -vvvt | **Analyzer Args:**
- `exp_speed`: int — Expected PCIe link speed (generation 1–5).
- `exp_width`: int — Expected PCIe link width in lanes (1–16).
- `exp_sriov_count`: int — Expected SR-IOV virtual function count.
- `exp_gpu_count_override`: Optional[int] — Override expected GPU count for validation.
- `exp_max_payload_size`: Union[Dict[int, int], int, NoneType] — Expected max payload size: int for all devices, or dict keyed by device ID.
- `exp_max_rd_req_size`: Union[Dict[int, int], int, NoneType] — Expected max read request size: int for all devices, or dict keyed by device ID.
- `exp_ten_bit_tag_req_en`: Union[Dict[int, int], int, NoneType] — Expected 10-bit tag request enable: int for all devices, or dict keyed by device ID. | - | [PcieDataModel](#PcieDataModel-Model) | [PcieCollector](#Collector-Class-PcieCollector) | [PcieAnalyzer](#Data-Analyzer-Class-PcieAnalyzer) | | ProcessPlugin | top -b -n 1
rocm-smi --showpids
top -b -n 1 -o %CPU | **Analyzer Args:**
- `max_kfd_processes`: int — Maximum allowed number of KFD (Kernel Fusion Driver) processes; 0 disables the check.
- `max_cpu_usage`: float — Maximum allowed CPU usage (percent) for process checks. | **Collection Args:**
- `top_n_process`: int — Number of top processes by CPU usage to collect (e.g. for top -b -n 1 -o %CPU). | [ProcessDataModel](#ProcessDataModel-Model) | [ProcessCollector](#Collector-Class-ProcessCollector) | [ProcessAnalyzer](#Data-Analyzer-Class-ProcessAnalyzer) | | RdmaPlugin | rdma link -j
rdma dev
rdma link
rdma statistic -j | - | - | [RdmaDataModel](#RdmaDataModel-Model) | [RdmaCollector](#Collector-Class-RdmaCollector) | [RdmaAnalyzer](#Data-Analyzer-Class-RdmaAnalyzer) | +| RegexSearchPlugin | - | - | - | [RegexSearchData](#RegexSearchData-Model) | - | [RegexSearchAnalyzer](#Data-Analyzer-Class-RegexSearchAnalyzer) | | RocmPlugin | {rocm_path}/opencl/bin/*/clinfo
env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'
ls /sys/class/kfd/kfd/proc/
grep -i -E 'rocm' /etc/ld.so.conf.d/*
{rocm_path}/bin/rocminfo
ls -v -d {rocm_path}*
ls -v -d {rocm_path}-[3-7]* | tail -1
ldconfig -p | grep -i -E 'rocm'
grep . -H -r -i {rocm_path}/.info/* | **Analyzer Args:**
- `exp_rocm`: Union[str, list] — Expected ROCm version string(s) to match (e.g. from rocminfo).
- `exp_rocm_latest`: str — Expected 'latest' ROCm path or version string for versioned installs.
- `exp_rocm_sub_versions`: dict[str, Union[str, list]] — Map sub-version name (e.g. version_rocm) to expected string or list of allowed strings. | **Collection Args:**
- `rocm_path`: str — Base path to ROCm installation (e.g. /opt/rocm). Used for rocminfo, clinfo, and version discovery. | [RocmDataModel](#RocmDataModel-Model) | [RocmCollector](#Collector-Class-RocmCollector) | [RocmAnalyzer](#Data-Analyzer-Class-RocmAnalyzer) | | StoragePlugin | sh -c 'df -lH -B1 | grep -v 'boot''
wmic LogicalDisk Where DriveType="3" Get DeviceId,Size,FreeSpace | - | **Collection Args:**
- `skip_sudo`: bool — If True, do not use sudo when running df and related storage commands. | [StorageDataModel](#StorageDataModel-Model) | [StorageCollector](#Collector-Class-StorageCollector) | [StorageAnalyzer](#Data-Analyzer-Class-StorageAnalyzer) | | SysSettingsPlugin | cat /sys/{}
ls -1 /sys/{}
ls -l /sys/{} | **Analyzer Args:**
- `checks`: Optional[list[nodescraper.plugins.inband.sys_settings.analyzer_args.SysfsCheck]] — List of sysfs checks (path, expected values or pattern, display name). | **Collection Args:**
- `paths`: list[str] — Sysfs paths to read (cat). Paths with '*' are collected with ls -l (e.g. class/net/*/device).
- `directory_paths`: list[str] — Sysfs paths to list (ls -1); used for checks that match entry names by regex. | [SysSettingsDataModel](#SysSettingsDataModel-Model) | [SysSettingsCollector](#Collector-Class-SysSettingsCollector) | [SysSettingsAnalyzer](#Data-Analyzer-Class-SysSettingsAnalyzer) | @@ -31,6 +32,14 @@ | SyslogPlugin | ls -1 /var/log/syslog* 2>/dev/null | grep -E '^/var/log/syslog(\.[0-9]+(\.gz)?)?$' || true
ls -1 /var/log/messages* 2>/dev/null | grep -E '^/var/log/messages(\.[0-9]+(\.gz)?)?$' || true | - | - | [SyslogData](#SyslogData-Model) | [SyslogCollector](#Collector-Class-SyslogCollector) | - | | UptimePlugin | uptime | - | - | [UptimeDataModel](#UptimeDataModel-Model) | [UptimeCollector](#Collector-Class-UptimeCollector) | - | +# OOB Plugins + +| Plugin | Collection | Analyzer Args | Collection Args | DataModel | Collector | Analyzer | +| --- | --- | --- | --- | --- | --- | --- | +| OobBmcArchivePlugin | SSH (BMC) shell: tar+gzip archives for each path in collection_args (see PathSpec entries).
Uses sudo on the BMC when collection_args paths require elevated access. | - | **Collection Args:**
- `paths`: list[nodescraper.plugins.ooband.bmc_archive.collector_args.PathSpec] — Named BMC paths to archive with tar czf -. Configure in plugin config under plugins.OobBmcArchivePlugin.collection_ar...
- `sudo`: bool — Default sudo setting for paths that do not specify sudo.
- `timeout`: int — Default per-path tar timeout in seconds.
- `skip_if_missing`: bool — Skip paths that do not exist on the BMC instead of failing collection.
- `ignore_failed_read`: bool — When true, pass GNU tar's --ignore-failed-read when the remote tar supports it. | [BmcArchiveDataModel](#BmcArchiveDataModel-Model) | [BmcArchiveCollector](#Collector-Class-BmcArchiveCollector) | - | +| RedfishEndpointPlugin | Redfish GET: explicit paths from collection_args.uris (parallel when max_workers>1).
Optional paged GET following Members@odata.nextLink when follow_next_link is true.
Redfish GET tree: when discover_tree is true, walks from api_root using @odata.id / Members links (depth and endpoint caps from collection_args). | For each entry in analysis_args.checks, reads JSON paths in collected responses and compares values to constraints (eq, min/max, anyOf, regex, etc.).
URI key "*" runs checks against every collected response body.
**Analyzer Args:**
- `checks`: dict[str, dict[str, Union[int, float, str, bool, dict[str, Any]]]] — Map: URI or '*' -> { property_path: constraint }. URI keys must match a key in the collected responses (exact match).... | **Collection Args:**
- `uris`: list[str] — Redfish URIs to GET. Ignored when discover_tree is True.
- `discover_tree`: bool — If True, discover endpoints from the BMC Redfish tree (service root and links) instead of using uris.
- `tree_max_depth`: int — When discover_tree is True: max traversal depth (1=service root only, 2=root + collections, 3=+ members).
- `tree_max_endpoints`: int — When discover_tree is True: max endpoints to discover (0=no limit).
- `max_workers`: int — Max concurrent GETs (1=sequential). Use >1 for async endpoint fetches.
- `follow_next_link`: bool — If True, follow Members@odata.nextLink pagination for each URI and merge all pages into a single response.
- `max_pages`: int — When follow_next_link is True: safety cap on the number of pages to follow per URI (default 200). | [RedfishEndpointDataModel](#RedfishEndpointDataModel-Model) | [RedfishEndpointCollector](#Collector-Class-RedfishEndpointCollector) | [RedfishEndpointAnalyzer](#Data-Analyzer-Class-RedfishEndpointAnalyzer) | +| RedfishOemDiagPlugin | Redfish LogService.CollectDiagnosticData for each entry in collection_args.oem_diagnostic_types (collection_args.log_service_path selects the LogService).
Optional binary archives under the plugin log path when log_path is set. | Summarizes success/failure per OEM diagnostic type from collected results.
When analysis_args.require_all_success is true, fails the run if any type failed collection.
**Analyzer Args:**
- `require_all_success`: bool — If True, analysis fails when any OEM type collection failed. | **Collection Args:**
- `log_service_path`: str — Redfish path to the LogService (e.g. DiagLogs).
- `oem_diagnostic_types_allowable`: Optional[list[str]] — Allowable OEM diagnostic types for this architecture/BMC. When set, used for validation and as default for oem_diagno...
- `oem_diagnostic_types`: list[str] — OEM diagnostic types to collect. When empty and oem_diagnostic_types_allowable is set, defaults to that list.
- `task_timeout_s`: int — Max seconds to wait for each BMC task. | [RedfishOemDiagDataModel](#RedfishOemDiagDataModel-Model) | [RedfishOemDiagCollector](#Collector-Class-RedfishOemDiagCollector) | [RedfishOemDiagAnalyzer](#Data-Analyzer-Class-RedfishOemDiagAnalyzer) | + # Collectors ## Collector Class AmdSmiCollector @@ -947,6 +956,70 @@ UptimeDataModel - uptime +## Collector Class BmcArchiveCollector + +### Description + +Archive BMC directories over SSH using tar czf - . + +**Bases**: ['InBandDataCollector'] + +**Link to code**: [bmc_archive_collector.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/ooband/bmc_archive/bmc_archive_collector.py) + +### Class Variables + +- **SUPPORTED_OS_FAMILY**: `{, }` +- **REMOTE_ARCHIVE_TEMPLATE**: `/tmp/node_scraper_{name}.tar.gz` +- **_tar_ignore_failed_read_supported**: `None` + +### Provides Data + +BmcArchiveDataModel + +### Documented collection + +- SSH (BMC) shell: tar+gzip archives for each path in collection_args (see PathSpec entries). +- Uses sudo on the BMC when collection_args paths require elevated access. + +## Collector Class RedfishEndpointCollector + +### Description + +Collects Redfish endpoint responses for URIs specified in config. + +**Bases**: ['RedfishDataCollector'] + +**Link to code**: [endpoint_collector.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py) + +### Provides Data + +RedfishEndpointDataModel + +### Documented collection + +- Redfish GET: explicit paths from collection_args.uris (parallel when max_workers>1). +- Optional paged GET following Members@odata.nextLink when follow_next_link is true. +- Redfish GET tree: when discover_tree is true, walks from api_root using @odata.id / Members links (depth and endpoint caps from collection_args). + +## Collector Class RedfishOemDiagCollector + +### Description + +Collects Redfish OEM diagnostic logs (e.g. 
JournalControl, AllLogs) via LogService.CollectDiagnosticData. + +**Bases**: ['RedfishDataCollector'] + +**Link to code**: [oem_diag_collector.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/ooband/redfish_oem_diag/oem_diag_collector.py) + +### Provides Data + +RedfishOemDiagDataModel + +### Documented collection + +- Redfish LogService.CollectDiagnosticData for each entry in collection_args.oem_diagnostic_types (collection_args.log_service_path selects the LogService). +- Optional binary archives under the plugin log path when log_path is set. + # Data Models ## AmdSmiDataModel Model @@ -1286,6 +1359,22 @@ Data model for RDMA (Remote Direct Memory Access) statistics and link informatio - **dev_list**: `list[nodescraper.plugins.inband.rdma.rdmadata.RdmaDevice]` - **link_list_text**: `list[nodescraper.plugins.inband.rdma.rdmadata.RdmaLinkText]` +## RegexSearchData Model + +### Description + +Loaded file or directory contents passed to the analyzer (via --data). + +**Link to code**: [regex_search_data.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/regex_search/regex_search_data.py) + +**Bases**: ['DataModel'] + +### Model annotations and fields + +- **content**: `str` +- **data_root**: `str` +- **files**: `dict[str, str]` + ## RocmDataModel Model **Link to code**: [rocmdata.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/rocm/rocmdata.py) @@ -1378,6 +1467,49 @@ Data model for in band syslog logs - **current_time**: `str` - **uptime**: `str` +## BmcArchiveDataModel Model + +### Description + +Collected BMC directory archives. 
+ +**Link to code**: [bmc_archive_data.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/ooband/bmc_archive/bmc_archive_data.py) + +**Bases**: ['DataModel'] + +### Model annotations and fields + +- **results**: `list[nodescraper.plugins.ooband.bmc_archive.bmc_archive_data.ArchiveCollectionResult]` +- **archives**: `list[nodescraper.connection.inband.inband.BinaryFileArtifact]` + +## RedfishEndpointDataModel Model + +### Description + +Collected Redfish endpoint responses: URI -> JSON body. + +**Link to code**: [endpoint_data.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/ooband/redfish_endpoint/endpoint_data.py) + +**Bases**: ['DataModel'] + +### Model annotations and fields + +- **responses**: `dict[str, dict]` + +## RedfishOemDiagDataModel Model + +### Description + +Collected Redfish OEM diagnostic log results: OEM type -> result (success, error, metadata). + +**Link to code**: [oem_diag_data.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/ooband/redfish_oem_diag/oem_diag_data.py) + +**Bases**: ['DataModel'] + +### Model annotations and fields + +- **results**: `dict[str, nodescraper.plugins.ooband.redfish_oem_diag.oem_diag_data.OemDiagTypeResult]` + # Data Analyzers ## Data Analyzer Class AmdSmiAnalyzer @@ -1709,6 +1841,20 @@ Check RDMA statistics for errors (RoCE and other RDMA error counters). **Link to code**: [rdma_analyzer.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/rdma/rdma_analyzer.py) +## Data Analyzer Class RegexSearchAnalyzer + +### Description + +Run user-provided regexes against text loaded from --data (file or directory). 
+ +**Bases**: ['RegexAnalyzer'] + +**Link to code**: [regex_search_analyzer.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/regex_search/regex_search_analyzer.py) + +### Class Variables + +- **ERROR_REGEX**: `[]` + ## Data Analyzer Class RocmAnalyzer ### Description @@ -1753,6 +1899,36 @@ Check sysctl matches expected sysctl details **Link to code**: [sysctl_analyzer.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/sysctl/sysctl_analyzer.py) +## Data Analyzer Class RedfishEndpointAnalyzer + +### Description + +Checks Redfish endpoint responses against configured thresholds and key/value rules. + +**Bases**: ['DataAnalyzer'] + +**Link to code**: [endpoint_analyzer.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/ooband/redfish_endpoint/endpoint_analyzer.py) + +### Documented analysis + +- For each entry in analysis_args.checks, reads JSON paths in collected responses and compares values to constraints (eq, min/max, anyOf, regex, etc.). +- URI key "*" runs checks against every collected response body. + +## Data Analyzer Class RedfishOemDiagAnalyzer + +### Description + +Analyzes Redfish OEM diagnostic log collection results. + +**Bases**: ['DataAnalyzer'] + +**Link to code**: [oem_diag_analyzer.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/ooband/redfish_oem_diag/oem_diag_analyzer.py) + +### Documented analysis + +- Summarizes success/failure per OEM diagnostic type from collected results. +- When analysis_args.require_all_success is true, fails the run if any type failed collection. + # Analyzer Args ## Analyzer Args Class AmdSmiAnalyzerArgs @@ -2021,3 +2197,31 @@ Sysfs settings for analysis via a list of checks (path, expected values, name). - **exp_vm_dirty_ratio**: `Optional[int]` — Expected vm.dirty_ratio value. - **exp_vm_dirty_writeback_centisecs**: `Optional[int]` — Expected vm.dirty_writeback_centisecs value. 
- **exp_kernel_numa_balancing**: `Optional[int]` — Expected kernel.numa_balancing value. + +## Analyzer Args Class RedfishEndpointAnalyzerArgs + +### Description + +Analyzer args for config-driven Redfish checks. + +**Bases**: ['AnalyzerArgs'] + +**Link to code**: [analyzer_args.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/ooband/redfish_endpoint/analyzer_args.py) + +### Annotations / fields + +- **checks**: `dict[str, dict[str, Union[int, float, str, bool, dict[str, Any]]]]` — Map: URI or '*' -> { property_path: constraint }. URI keys must match a key in the collected responses (exact match). Use '*' as the key to apply the inner constraints to every collected response body. Property paths use '/' for nesting and indices, e.g. 'Status/Health', 'PowerControl/0/PowerConsumedWatts'. Constraints: 'eq' — value must equal the given literal (int, float, str, bool). 'min' — value must be numeric and >= the given number. 'max' — value must be numeric and <= the given number. 'anyOf' — value must be in the given list (OR; any match passes). Example: { "/redfish/v1/Systems/1": { "Status/Health": { "anyOf": ["OK", "Warning"] }, "PowerState": "On" }, "*": { "Status/Health": { "anyOf": ["OK"] } } }. + +## Analyzer Args Class RedfishOemDiagAnalyzerArgs + +### Description + +Analyzer args for Redfish OEM diagnostic log results. + +**Bases**: ['AnalyzerArgs'] + +**Link to code**: [analyzer_args.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/ooband/redfish_oem_diag/analyzer_args.py) + +### Annotations / fields + +- **require_all_success**: `bool` — If True, analysis fails when any OEM type collection failed. 
diff --git a/docs/generate_plugin_doc_bundle.py b/docs/generate_plugin_doc_bundle.py index b7676b6a..fb47a297 100644 --- a/docs/generate_plugin_doc_bundle.py +++ b/docs/generate_plugin_doc_bundle.py @@ -26,10 +26,8 @@ """ Usage python generate_plugin_doc_bundle.py \ - --package /home/alexbara/node-scraper/nodescraper/plugins/inband \ - --output PLUGIN_DOC.md \ + --output docs/PLUGIN_DOC.md \ --update-readme-help - """ import argparse import importlib @@ -44,7 +42,10 @@ LINK_BASE_DEFAULT = "https://github.com/amd/node-scraper/blob/HEAD/" REL_ROOT_DEFAULT = "nodescraper/plugins/inband" -DEFAULT_ROOT_PACKAGE = "nodescraper.plugins" +# Default packages scanned for plugin tables (IB: full inband tree; OOB: ooband). +PACKAGE_IB_INBAND = "nodescraper.plugins.inband" +PACKAGE_OOB = "nodescraper.plugins.ooband" +DEFAULT_PACKAGES = (PACKAGE_IB_INBAND, PACKAGE_OOB) def get_attr(obj: Any, name: str, default: Any = None) -> Any: @@ -182,6 +183,54 @@ def find_inband_plugin_base(): return get_attr(base_mod, "InBandDataPlugin") +def find_oob_plugin_bases() -> tuple[type, ...]: + """Return OOB plugin base classes under ``nodescraper.plugins.ooband`` (Redfish + BMC SSH).""" + base_mod = importlib.import_module("nodescraper.base") + oob = get_attr(base_mod, "OOBandDataPlugin") + oob_ssh = get_attr(base_mod, "OOBSSHDataPlugin") + bases = [b for b in (oob, oob_ssh) if b is not None] + return tuple(bases) + + +def is_concrete_plugin_class(cls: type) -> bool: + if not inspect.isclass(cls): + return False + return not bool(get_attr(cls, "__abstractmethods__", set())) + + +def all_subclasses_union(bases: Iterable[type]) -> set[type]: + """All distinct concrete descendants across one or more base classes (transitive).""" + merged: set[type] = set() + for base in bases: + merged |= all_subclasses_single(base) + return merged + + +def all_subclasses_single(cls: type) -> set[type]: + seen, out, work = set(), set(), [cls] + while work: + parent = work.pop() + for sub in parent.__subclasses__(): 
+ if sub not in seen: + seen.add(sub) + out.add(sub) + work.append(sub) + return out + + +def plugins_for_package_prefix(base_classes: Iterable[type], package_prefix: str) -> List[type]: + """Non-abstract plugin classes under ``base_classes`` whose ``__module__`` starts with *package_prefix*.""" + found: List[type] = [] + for cls in all_subclasses_union(base_classes): + mod = getattr(cls, "__module__", "") or "" + if not mod.startswith(package_prefix): + continue + if not is_concrete_plugin_class(cls): + continue + found.append(cls) + return found + + def link_anchor(obj: Any, kind: str) -> str: if obj is None or not inspect.isclass(obj): return "-" @@ -228,6 +277,126 @@ def add_cmd(s: Any): return cmds +# Optional human-readable bullets for plugins without CMD_* shell snippets (e.g. Redfish). +DOCUMENTATION_COLLECTION_ITEMS_ATTR = "DOCUMENTATION_COLLECTION_ITEMS" +DOCUMENTATION_ANALYSIS_ITEMS_ATTR = "DOCUMENTATION_ANALYSIS_ITEMS" + + +def _documentation_lines_for_attr(cls: Any, attr_name: str) -> List[str]: + if cls is None or not inspect.isclass(cls): + return [] + raw = get_attr(cls, attr_name, None) + if raw is None: + return [] + if isinstance(raw, str): + return [ln.strip() for ln in raw.splitlines() if ln.strip()] + if isinstance(raw, (list, tuple)): + return [str(x).strip() for x in raw if isinstance(x, str) and str(x).strip()] + return [] + + +def merge_unique_lines(*line_groups: Iterable[str]) -> List[str]: + """Concatenate line groups, dropping exact duplicates while preserving order.""" + seen: set[str] = set() + out: List[str] = [] + for group in line_groups: + for line in group: + if line not in seen: + seen.add(line) + out.append(line) + return out + + +def extract_collection_lines_for_table(plugin_cls: type, collector_cls: Any) -> List[str]: + """Shell CMD_* lines plus optional DOCUMENTATION_COLLECTION_ITEMS (collector then plugin).""" + cmd_lines: List[str] = [] + if inspect.isclass(collector_cls): + cmd_lines = 
extract_cmds_from_classvars(collector_cls) + doc_collector = _documentation_lines_for_attr( + collector_cls, DOCUMENTATION_COLLECTION_ITEMS_ATTR + ) + doc_plugin = _documentation_lines_for_attr(plugin_cls, DOCUMENTATION_COLLECTION_ITEMS_ATTR) + return merge_unique_lines(cmd_lines, doc_collector, doc_plugin) + + +def extract_analysis_doc_lines_for_table(plugin_cls: type, analyzer_cls: Any) -> List[str]: + """Optional DOCUMENTATION_ANALYSIS_ITEMS (analyzer then plugin) for the analyzer column.""" + doc_an = _documentation_lines_for_attr(analyzer_cls, DOCUMENTATION_ANALYSIS_ITEMS_ATTR) + doc_pl = _documentation_lines_for_attr(plugin_cls, DOCUMENTATION_ANALYSIS_ITEMS_ATTR) + return merge_unique_lines(doc_an, doc_pl) + + +def iter_plugin_collector_classes(plugin_cls: type) -> List[type]: + """Return collector class(es) for a plugin (supports tuple COLLECTOR via DataPlugin.get_collector_classes).""" + gcs = getattr(plugin_cls, "get_collector_classes", None) + if callable(gcs): + try: + return [c for c in gcs() if inspect.isclass(c)] + except Exception: + return [] + return [] + + +def collector_has_table_collection_coverage(plugin_cls: type, collector_cls: type) -> bool: + """True if the plugin table Collection cell would be non-empty from CMD_* or documentation lines.""" + if extract_cmds_from_classvars(collector_cls): + return True + if _documentation_lines_for_attr(collector_cls, DOCUMENTATION_COLLECTION_ITEMS_ATTR): + return True + if _documentation_lines_for_attr(plugin_cls, DOCUMENTATION_COLLECTION_ITEMS_ATTR): + return True + return False + + +def analyzer_has_table_analysis_coverage( + plugin_cls: type, analyzer_cls: type, analyzer_args_cls: Any +) -> bool: + """True if the Analyzer Args table cell would be non-empty from regex/args extraction or doc lines.""" + if _documentation_lines_for_attr(analyzer_cls, DOCUMENTATION_ANALYSIS_ITEMS_ATTR): + return True + if _documentation_lines_for_attr(plugin_cls, DOCUMENTATION_ANALYSIS_ITEMS_ATTR): + return True + if 
extract_regexes_and_args_from_analyzer(analyzer_cls, analyzer_args_cls): + return True + return False + + +def collect_plugin_doc_table_coverage_messages(plugins: List[type]) -> List[str]: + """Messages for plugins whose generated table would show '-' for collection or analysis unjustifiably.""" + msgs: List[str] = [] + for p in plugins: + pname = p.__name__ + for c in iter_plugin_collector_classes(p): + if not collector_has_table_collection_coverage(p, c): + msgs.append( + f"{pname}: collector {c.__name__} has no CMD_* command strings and no " + f"{DOCUMENTATION_COLLECTION_ITEMS_ATTR} on the collector or plugin." + ) + an = get_attr(p, "ANALYZER", None) + aargs = get_attr(p, "ANALYZER_ARGS", None) + if inspect.isclass(an) and not analyzer_has_table_analysis_coverage(p, an, aargs): + msgs.append( + f"{pname}: analyzer {an.__name__} has no extractable analyzer table content " + f"(built-in regexes / *REGEX* attrs / analyzer args fields) and no " + f"{DOCUMENTATION_ANALYSIS_ITEMS_ATTR} on the analyzer or plugin." 
+ ) + return msgs + + +def emit_plugin_doc_coverage_warnings(msgs: List[str], *, strict: bool) -> None: + if not msgs: + return + sys.stderr.write("PLUGIN_DOC.md table coverage warnings:\n") + for m in msgs: + sys.stderr.write(f" WARNING: {m}\n") + if strict: + sys.stderr.write( + f"error: {len(msgs)} plugin documentation coverage warning(s) " + "(--strict-plugin-doc-coverage)\n" + ) + sys.exit(1) + + def extract_regexes_and_args_from_analyzer( analyzer_cls: type, args_cls: Optional[type] ) -> List[str]: @@ -454,14 +623,14 @@ def generate_plugin_table_rows(plugins: List[type]) -> List[List[str]]: an = get_attr(p, "ANALYZER", None) args = get_attr(p, "ANALYZER_ARGS", None) collector_args_cls = get_attr(p, "COLLECTOR_ARGS", None) - cmds: List[str] = [] - if inspect.isclass(col): - cmds = extract_cmds_from_classvars(col) + cmds = extract_collection_lines_for_table(p, col) - # Extract regexes and args from analyzer - regex_and_args = [] + # Extract regexes and args from analyzer; optional DOCUMENTATION_ANALYSIS_* lines first + regex_and_args: List[str] = extract_analysis_doc_lines_for_table( + p, an if inspect.isclass(an) else None + ) if inspect.isclass(an): - regex_and_args = extract_regexes_and_args_from_analyzer(an, args) + regex_and_args.extend(extract_regexes_and_args_from_analyzer(an, args)) # Extract collection args from collector args class collection_args_lines = extract_collection_args_from_collector_args(collector_args_cls) @@ -504,7 +673,13 @@ def render_collector_section(col: type, link_base: str, rel_root: Optional[str]) _url = setup_link(col, link_base, rel_root) s += md_kv("Link to code", f"[{Path(_url).name}]({_url})") - exclude = {"__doc__", "__module__", "__weakref__", "__dict__"} + exclude = { + "__doc__", + "__module__", + "__weakref__", + "__dict__", + DOCUMENTATION_COLLECTION_ITEMS_ATTR, + } cv = class_vars_dump(col, exclude) if cv: s += md_header("Class Variables", 3) + md_list(cv) @@ -516,6 +691,10 @@ def render_collector_section(col: type, 
link_base: str, rel_root: Optional[str]) if cmds: s += md_header("Commands", 3) + md_list(cmds) + doc_coll = _documentation_lines_for_attr(col, DOCUMENTATION_COLLECTION_ITEMS_ATTR) + if doc_coll: + s += md_header("Documented collection", 3) + md_list(doc_coll) + return s @@ -529,11 +708,21 @@ def render_analyzer_section(an: type, link_base: str, rel_root: Optional[str]) - _url = setup_link(an, link_base, rel_root) s += md_kv("Link to code", f"[{Path(_url).name}]({_url})") - exclude = {"__doc__", "__module__", "__weakref__", "__dict__"} + exclude = { + "__doc__", + "__module__", + "__weakref__", + "__dict__", + DOCUMENTATION_ANALYSIS_ITEMS_ATTR, + } cv = class_vars_dump(an, exclude) if cv: s += md_header("Class Variables", 3) + md_list(cv) + doc_an = _documentation_lines_for_attr(an, DOCUMENTATION_ANALYSIS_ITEMS_ATTR) + if doc_an: + s += md_header("Documented analysis", 3) + md_list(doc_an) + # Add regex patterns if present (pass None for args_cls since we don't have context here) regex_info = extract_regexes_and_args_from_analyzer(an, None) if regex_info: @@ -648,7 +837,15 @@ def main(): description="Generate Plugin Table and detail sections with setup_link + rel-root." ) ap.add_argument( - "--package", default=DEFAULT_ROOT_PACKAGE, help="Dotted package or filesystem path" + "--package", + action="append", + dest="packages", + default=None, + metavar="PKG", + help=( + "Dotted package or filesystem path to import in addition to the default plugin " + f"packages ({', '.join(DEFAULT_PACKAGES)}). Repeatable." 
+ ), ) ap.add_argument("--output", default="PLUGIN_DOC.md", help="Output Markdown file") ap.add_argument( @@ -661,31 +858,57 @@ def main(): default=None, help="Path to README.md (default: README.md in current working directory)", ) + ap.add_argument( + "--strict-plugin-doc-coverage", + action="store_true", + help=( + "Exit with status 1 if any plugin lacks CMD_* / DOCUMENTATION_COLLECTION_ITEMS " + "for collectors or lacks analyzer table content / DOCUMENTATION_ANALYSIS_ITEMS " + "when an analyzer is defined." + ), + ) args = ap.parse_args() - root = args.package - root_path = Path(root) - if os.sep in root or root_path.exists(): - root = dotted_from_path(root_path) - base = find_inband_plugin_base() - import_all_modules(root) - - def all_subclasses(cls: Type) -> set[type]: - seen, out, work = set(), set(), [cls] - while work: - parent = work.pop() - for sub in parent.__subclasses__(): - if sub not in seen: - seen.add(sub) - out.add(sub) - work.append(sub) - return out - - plugins = [c for c in all_subclasses(base) if c is not base] - plugins = [c for c in plugins if not get_attr(c, "__abstractmethods__", set())] - plugins.sort(key=lambda c: f"{c.__module__}.{c.__name__}".lower()) - - rows = generate_plugin_table_rows(plugins) + normalized_extra: List[str] = [] + if args.packages: + for root in args.packages: + root_path = Path(root) + if os.sep in root or root_path.exists(): + root = dotted_from_path(root_path) + normalized_extra.append(root) + + # Always import core plugin trees so IB/OOB tables are complete; append optional extras. 
+ to_import: List[str] = [] + seen_pkg: set[str] = set() + for pkg in list(DEFAULT_PACKAGES) + normalized_extra: + if pkg not in seen_pkg: + seen_pkg.add(pkg) + to_import.append(pkg) + + for pkg in to_import: + import_all_modules(pkg) + + inband_base = find_inband_plugin_base() + oob_bases = find_oob_plugin_bases() + + ib_plugins = sorted( + plugins_for_package_prefix((inband_base,), PACKAGE_IB_INBAND), + key=lambda c: f"{c.__module__}.{c.__name__}".lower(), + ) + oob_plugins = sorted( + plugins_for_package_prefix(oob_bases, PACKAGE_OOB), + key=lambda c: f"{c.__module__}.{c.__name__}".lower(), + ) + plugins = sorted( + set(ib_plugins) | set(oob_plugins), + key=lambda c: f"{c.__module__}.{c.__name__}".lower(), + ) + + coverage_msgs = collect_plugin_doc_table_coverage_messages(plugins) + emit_plugin_doc_coverage_warnings(coverage_msgs, strict=args.strict_plugin_doc_coverage) + + ib_rows = generate_plugin_table_rows(ib_plugins) + oob_rows = generate_plugin_table_rows(oob_plugins) headers = [ "Plugin", "Collection", @@ -718,8 +941,10 @@ def all_subclasses(cls: Type) -> set[type]: out = [] out.append(md_header("Plugin Documentation", 1)) - out.append(md_header("Plugin Table", 1)) - out.append(render_table(headers, rows)) + out.append(md_header("IB Plugins", 1)) + out.append(render_table(headers, ib_rows)) + out.append(md_header("OOB plugins", 1)) + out.append(render_table(headers, oob_rows)) if collectors: out.append(md_header("Collectors", 1)) diff --git a/nodescraper/plugins/generic_collection/generic_collection_collector.py b/nodescraper/plugins/generic_collection/generic_collection_collector.py index 873f572a..1c15462b 100644 --- a/nodescraper/plugins/generic_collection/generic_collection_collector.py +++ b/nodescraper/plugins/generic_collection/generic_collection_collector.py @@ -41,6 +41,11 @@ class GenericCollectionCollector( DATA_MODEL = GenericCollectionDataModel SUPPORTED_OS_FAMILY: set[OSFamily] = {OSFamily.WINDOWS, OSFamily.LINUX, OSFamily.UNKNOWN} + 
DOCUMENTATION_COLLECTION_ITEMS: tuple[str, ...] = ( + "Runs each command from collection_args.commands on the target (in-band host or BMC over OOB SSH).", + "Commands are user-configured; there are no fixed CMD_* class fields.", + ) + def collect_data( self, args: Optional[GenericCollectionCollectorArgs] = None ) -> tuple[TaskResult, Optional[GenericCollectionDataModel]]: diff --git a/nodescraper/plugins/regex_search/__init__.py b/nodescraper/plugins/inband/regex_search/__init__.py similarity index 97% rename from nodescraper/plugins/regex_search/__init__.py rename to nodescraper/plugins/inband/regex_search/__init__.py index 708b6b04..b8ee4a8e 100644 --- a/nodescraper/plugins/regex_search/__init__.py +++ b/nodescraper/plugins/inband/regex_search/__init__.py @@ -1,28 +1,28 @@ -############################################################################### -# -# MIT License -# -# Copyright (c) 2026 Advanced Micro Devices, Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -############################################################################### -from .regex_search_plugin import RegexSearchPlugin - -__all__ = ["RegexSearchPlugin"] +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from .regex_search_plugin import RegexSearchPlugin + +__all__ = ["RegexSearchPlugin"] diff --git a/nodescraper/plugins/regex_search/analyzer_args.py b/nodescraper/plugins/inband/regex_search/analyzer_args.py similarity index 97% rename from nodescraper/plugins/regex_search/analyzer_args.py rename to nodescraper/plugins/inband/regex_search/analyzer_args.py index b30acb7e..254d6a13 100644 --- a/nodescraper/plugins/regex_search/analyzer_args.py +++ b/nodescraper/plugins/inband/regex_search/analyzer_args.py @@ -1,50 +1,50 @@ -############################################################################### -# -# MIT License -# -# Copyright (c) 2026 Advanced Micro Devices, Inc. -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
-# -############################################################################### -from typing import Any, Optional - -from pydantic import Field - -from nodescraper.models import AnalyzerArgs - - -class RegexSearchAnalyzerArgs(AnalyzerArgs): - """Arguments for RegexSearchAnalyzer (dict items match Dmesg-style error_regex).""" - - error_regex: Optional[list[dict[str, Any]]] = Field( - default=None, - description=( - "Regex patterns to search for; each dict may include regex (str), message, " - "event_category, event_priority (same as Dmesg analyzer error_regex). " - ), - ) - interval_to_collapse_event: int = Field( - default=60, - description="Seconds within which repeated events are collapsed into one.", - ) - num_timestamps: int = Field( - default=3, - description="Number of timestamps to include per event in output.", - ) +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from typing import Any, Optional + +from pydantic import Field + +from nodescraper.models import AnalyzerArgs + + +class RegexSearchAnalyzerArgs(AnalyzerArgs): + """Arguments for RegexSearchAnalyzer (dict items match Dmesg-style error_regex).""" + + error_regex: Optional[list[dict[str, Any]]] = Field( + default=None, + description=( + "Regex patterns to search for; each dict may include regex (str), message, " + "event_category, event_priority (same as Dmesg analyzer error_regex). " + ), + ) + interval_to_collapse_event: int = Field( + default=60, + description="Seconds within which repeated events are collapsed into one.", + ) + num_timestamps: int = Field( + default=3, + description="Number of timestamps to include per event in output.", + ) diff --git a/nodescraper/plugins/regex_search/regex_search_analyzer.py b/nodescraper/plugins/inband/regex_search/regex_search_analyzer.py similarity index 97% rename from nodescraper/plugins/regex_search/regex_search_analyzer.py rename to nodescraper/plugins/inband/regex_search/regex_search_analyzer.py index 0b4384f4..85da6501 100644 --- a/nodescraper/plugins/regex_search/regex_search_analyzer.py +++ b/nodescraper/plugins/inband/regex_search/regex_search_analyzer.py @@ -1,102 +1,102 @@ -############################################################################### -# -# MIT License -# -# Copyright (c) 2026 Advanced Micro Devices, Inc. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -############################################################################### -import os -from typing import Optional, Union - -from nodescraper.base.regexanalyzer import ErrorRegex, RegexAnalyzer, RegexEvent -from nodescraper.enums import ExecutionStatus -from nodescraper.models import TaskResult - -from .analyzer_args import RegexSearchAnalyzerArgs -from .regex_search_data import RegexSearchData - - -class RegexSearchAnalyzer(RegexAnalyzer[RegexSearchData, RegexSearchAnalyzerArgs]): - """Run user-provided regexes against text loaded from --data (file or directory).""" - - DATA_MODEL = RegexSearchData - - ERROR_REGEX: list[ErrorRegex] = [] - - def _build_regex_event( - self, regex_obj: ErrorRegex, match: Union[str, list[str]], source: str - ) -> RegexEvent: - """Augment the default event text with a file path when the origin is a concrete path. 
- - Args: - regex_obj: Metadata for the rule that produced the match. - match: Substring or grouped capture text from the pattern. - source: Origin label, or an absolute path when matching per file. - - Returns: - Match record with an extended description when a path-like source is present. - """ - event = super()._build_regex_event(regex_obj, match, source) - if source and source != "regex_search": - event.description = f"{regex_obj.message} [file: {source}]" - return event - - def analyze_data( - self, - data: RegexSearchData, - args: Optional[RegexSearchAnalyzerArgs] = None, - ) -> TaskResult: - """Scan loaded inputs with the given patterns, or mark the task not run if inputs are incomplete. - - Args: - data: Aggregated and per-file text loaded from the user data path. - args: Optional pattern list and timing knobs; omitted or empty patterns skip work. - - Returns: - Work outcome with match events, or a not-run status when patterns are absent. - """ - if args is None or not args.error_regex: - self.result.status = ExecutionStatus.NOT_RAN - self.result.message = "Analysis args need to be provided for the analyzer to run" - return self.result - - final_regex = self._convert_and_extend_error_regex(args.error_regex, []) - - if data.files: - for rel_path in sorted(data.files.keys()): - file_content = data.files[rel_path] - abs_source = os.path.normpath(os.path.join(data.data_root, rel_path)) - self.result.events += self.check_all_regexes( - content=file_content, - source=abs_source, - error_regex=final_regex, - num_timestamps=args.num_timestamps, - interval_to_collapse_event=args.interval_to_collapse_event, - ) - else: - self.result.events += self.check_all_regexes( - content=data.content, - source=data.data_root or "regex_search", - error_regex=final_regex, - num_timestamps=args.num_timestamps, - interval_to_collapse_event=args.interval_to_collapse_event, - ) - return self.result +############################################################################### +# +# 
MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import os +from typing import Optional, Union + +from nodescraper.base.regexanalyzer import ErrorRegex, RegexAnalyzer, RegexEvent +from nodescraper.enums import ExecutionStatus +from nodescraper.models import TaskResult + +from .analyzer_args import RegexSearchAnalyzerArgs +from .regex_search_data import RegexSearchData + + +class RegexSearchAnalyzer(RegexAnalyzer[RegexSearchData, RegexSearchAnalyzerArgs]): + """Run user-provided regexes against text loaded from --data (file or directory).""" + + DATA_MODEL = RegexSearchData + + ERROR_REGEX: list[ErrorRegex] = [] + + def _build_regex_event( + self, regex_obj: ErrorRegex, match: Union[str, list[str]], source: str + ) -> RegexEvent: + """Augment the default event text with a file path when the origin is a concrete path. + + Args: + regex_obj: Metadata for the rule that produced the match. + match: Substring or grouped capture text from the pattern. + source: Origin label, or an absolute path when matching per file. + + Returns: + Match record with an extended description when a path-like source is present. + """ + event = super()._build_regex_event(regex_obj, match, source) + if source and source != "regex_search": + event.description = f"{regex_obj.message} [file: {source}]" + return event + + def analyze_data( + self, + data: RegexSearchData, + args: Optional[RegexSearchAnalyzerArgs] = None, + ) -> TaskResult: + """Scan loaded inputs with the given patterns, or mark the task not run if inputs are incomplete. + + Args: + data: Aggregated and per-file text loaded from the user data path. + args: Optional pattern list and timing knobs; omitted or empty patterns skip work. + + Returns: + Work outcome with match events, or a not-run status when patterns are absent. 
+ """ + if args is None or not args.error_regex: + self.result.status = ExecutionStatus.NOT_RAN + self.result.message = "Analysis args need to be provided for the analyzer to run" + return self.result + + final_regex = self._convert_and_extend_error_regex(args.error_regex, []) + + if data.files: + for rel_path in sorted(data.files.keys()): + file_content = data.files[rel_path] + abs_source = os.path.normpath(os.path.join(data.data_root, rel_path)) + self.result.events += self.check_all_regexes( + content=file_content, + source=abs_source, + error_regex=final_regex, + num_timestamps=args.num_timestamps, + interval_to_collapse_event=args.interval_to_collapse_event, + ) + else: + self.result.events += self.check_all_regexes( + content=data.content, + source=data.data_root or "regex_search", + error_regex=final_regex, + num_timestamps=args.num_timestamps, + interval_to_collapse_event=args.interval_to_collapse_event, + ) + return self.result diff --git a/nodescraper/plugins/regex_search/regex_search_data.py b/nodescraper/plugins/inband/regex_search/regex_search_data.py similarity index 97% rename from nodescraper/plugins/regex_search/regex_search_data.py rename to nodescraper/plugins/inband/regex_search/regex_search_data.py index a12b2841..1e094d45 100644 --- a/nodescraper/plugins/regex_search/regex_search_data.py +++ b/nodescraper/plugins/inband/regex_search/regex_search_data.py @@ -1,107 +1,107 @@ -############################################################################### -# -# MIT License -# -# Copyright (c) 2026 Advanced Micro Devices, Inc. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# -############################################################################### -import os -from pathlib import Path -from typing import Union - -from pydantic import Field - -from nodescraper.models import DataModel -from nodescraper.utils import get_unique_filename - - -class RegexSearchData(DataModel): - """Loaded file or directory contents passed to the analyzer (via --data).""" - - content: str - data_root: str = "" - files: dict[str, str] = Field(default_factory=dict) - - def log_model(self, log_path: str) -> None: - """Persist the aggregated text payload as one log file under the given base path. - - Args: - log_path: Directory where the log file should be written. - - Returns: - None. 
- """ - log_name = os.path.join(log_path, get_unique_filename(log_path, "regex_search_source.log")) - with open(log_name, "w", encoding="utf-8") as log_file: - log_file.write(self.content) - - @classmethod - def import_model(cls, model_input: Union[dict, str]) -> "RegexSearchData": - """Import datamodel. - - Args: - model_input: Keyed fields for direct validation, or a path string to load from disk. - - Returns: - Instance with content, root path, and per-file bodies filled in. - """ - if isinstance(model_input, dict): - return cls.model_validate(model_input) - if isinstance(model_input, str): - return cls._from_filesystem_path(model_input) - raise ValueError("Invalid input for regex search data") - - @classmethod - def _from_filesystem_path(cls, path: str) -> "RegexSearchData": - """Read one file or every file under a directory into a merged view plus a path-to-text map. - - Args: - path: Absolute or resolvable path to a file or directory. - - Returns: - Instance built from the read text and discovered relative paths. 
- - """ - path = os.path.abspath(path) - if not os.path.exists(path): - raise FileNotFoundError(f"Path not found: {path}") - if os.path.isfile(path): - text = Path(path).read_text(encoding="utf-8", errors="replace") - rel = os.path.basename(path) - data_root = os.path.dirname(path) or os.path.abspath(os.path.curdir) - return cls(content=text, data_root=data_root, files={rel: text}) - if os.path.isdir(path): - files: dict[str, str] = {} - parts: list[str] = [] - for root, _dirs, filenames in os.walk(path): - for name in sorted(filenames): - fp = os.path.join(root, name) - if not os.path.isfile(fp): - continue - rel = os.path.relpath(fp, path) - try: - text = Path(fp).read_text(encoding="utf-8", errors="replace") - except OSError: - continue - files[rel] = text - parts.append(f"===== {rel} =====\n{text}") - return cls(content="\n".join(parts), data_root=path, files=files) - raise ValueError(f"Unsupported path type: {path}") +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +import os +from pathlib import Path +from typing import Union + +from pydantic import Field + +from nodescraper.models import DataModel +from nodescraper.utils import get_unique_filename + + +class RegexSearchData(DataModel): + """Loaded file or directory contents passed to the analyzer (via --data).""" + + content: str + data_root: str = "" + files: dict[str, str] = Field(default_factory=dict) + + def log_model(self, log_path: str) -> None: + """Persist the aggregated text payload as one log file under the given base path. + + Args: + log_path: Directory where the log file should be written. + + Returns: + None. + """ + log_name = os.path.join(log_path, get_unique_filename(log_path, "regex_search_source.log")) + with open(log_name, "w", encoding="utf-8") as log_file: + log_file.write(self.content) + + @classmethod + def import_model(cls, model_input: Union[dict, str]) -> "RegexSearchData": + """Import datamodel. + + Args: + model_input: Keyed fields for direct validation, or a path string to load from disk. + + Returns: + Instance with content, root path, and per-file bodies filled in. + """ + if isinstance(model_input, dict): + return cls.model_validate(model_input) + if isinstance(model_input, str): + return cls._from_filesystem_path(model_input) + raise ValueError("Invalid input for regex search data") + + @classmethod + def _from_filesystem_path(cls, path: str) -> "RegexSearchData": + """Read one file or every file under a directory into a merged view plus a path-to-text map. + + Args: + path: Absolute or resolvable path to a file or directory. 
+ + Returns: + Instance built from the read text and discovered relative paths. + + """ + path = os.path.abspath(path) + if not os.path.exists(path): + raise FileNotFoundError(f"Path not found: {path}") + if os.path.isfile(path): + text = Path(path).read_text(encoding="utf-8", errors="replace") + rel = os.path.basename(path) + data_root = os.path.dirname(path) or os.path.abspath(os.path.curdir) + return cls(content=text, data_root=data_root, files={rel: text}) + if os.path.isdir(path): + files: dict[str, str] = {} + parts: list[str] = [] + for root, _dirs, filenames in os.walk(path): + for name in sorted(filenames): + fp = os.path.join(root, name) + if not os.path.isfile(fp): + continue + rel = os.path.relpath(fp, path) + try: + text = Path(fp).read_text(encoding="utf-8", errors="replace") + except OSError: + continue + files[rel] = text + parts.append(f"===== {rel} =====\n{text}") + return cls(content="\n".join(parts), data_root=path, files=files) + raise ValueError(f"Unsupported path type: {path}") diff --git a/nodescraper/plugins/regex_search/regex_search_plugin.py b/nodescraper/plugins/inband/regex_search/regex_search_plugin.py similarity index 87% rename from nodescraper/plugins/regex_search/regex_search_plugin.py rename to nodescraper/plugins/inband/regex_search/regex_search_plugin.py index 36d650c6..3b923550 100644 --- a/nodescraper/plugins/regex_search/regex_search_plugin.py +++ b/nodescraper/plugins/inband/regex_search/regex_search_plugin.py @@ -1,76 +1,67 @@ -############################################################################### -# -# MIT License -# -# Copyright (c) 2026 Advanced Micro Devices, Inc. 
-# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. 
-# -############################################################################### -from typing import Optional, Union - -from nodescraper.connection.inband import InBandConnectionManager, SSHConnectionParams -from nodescraper.enums import EventPriority -from nodescraper.interfaces import DataPlugin -from nodescraper.models import CollectorArgs, TaskResult - -from .analyzer_args import RegexSearchAnalyzerArgs -from .regex_search_analyzer import RegexSearchAnalyzer -from .regex_search_data import RegexSearchData - - -class RegexSearchPlugin( - DataPlugin[ - InBandConnectionManager, - SSHConnectionParams, - RegexSearchData, - CollectorArgs, - RegexSearchAnalyzerArgs, - ] -): - """Analyzer-only plugin: search user regexes against a file or directory (--data).""" - - DATA_MODEL = RegexSearchData - ANALYZER = RegexSearchAnalyzer - - def analyze( - self, - max_event_priority_level: Optional[Union[EventPriority, str]] = EventPriority.CRITICAL, - analysis_args: Optional[Union[RegexSearchAnalyzerArgs, dict]] = None, - data: Optional[Union[str, dict, RegexSearchData]] = None, - ) -> TaskResult: - if analysis_args is None: - missing_error_regex = True - elif isinstance(analysis_args, RegexSearchAnalyzerArgs): - missing_error_regex = not bool(analysis_args.error_regex) - elif isinstance(analysis_args, dict): - er = analysis_args.get("error_regex") - missing_error_regex = er is None or er == [] - else: - missing_error_regex = True - if missing_error_regex: - self.logger.warning( - "RegexSearchPlugin: analysis args need to be provided for the analyzer to run " - "(e.g. --error-regex for each pattern)." - ) - return super().analyze( - max_event_priority_level=max_event_priority_level, - analysis_args=analysis_args, - data=data, - ) +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +from typing import Optional, Union + +from nodescraper.base import InBandDataPlugin +from nodescraper.enums import EventPriority +from nodescraper.models import CollectorArgs, TaskResult + +from .analyzer_args import RegexSearchAnalyzerArgs +from .regex_search_analyzer import RegexSearchAnalyzer +from .regex_search_data import RegexSearchData + + +class RegexSearchPlugin(InBandDataPlugin[RegexSearchData, CollectorArgs, RegexSearchAnalyzerArgs]): + """Analyzer-only plugin: search user regexes against a file or directory (--data).""" + + DATA_MODEL = RegexSearchData + ANALYZER = RegexSearchAnalyzer + + def analyze( + self, + max_event_priority_level: Optional[Union[EventPriority, str]] = EventPriority.CRITICAL, + analysis_args: Optional[Union[RegexSearchAnalyzerArgs, dict]] = None, + data: Optional[Union[str, dict, RegexSearchData]] = None, + ) -> TaskResult: + if analysis_args is None: + missing_error_regex = True + elif isinstance(analysis_args, RegexSearchAnalyzerArgs): + missing_error_regex = not bool(analysis_args.error_regex) + elif isinstance(analysis_args, dict): + er = analysis_args.get("error_regex") + missing_error_regex = er is None or er == [] + else: + missing_error_regex = True + if missing_error_regex: + self.logger.warning( + "RegexSearchPlugin: analysis args need to be provided for the analyzer to run " + "(e.g. --error-regex for each pattern)." 
+ ) + return super().analyze( + max_event_priority_level=max_event_priority_level, + analysis_args=analysis_args, + data=data, + ) diff --git a/nodescraper/plugins/ooband/bmc_archive/bmc_archive_collector.py b/nodescraper/plugins/ooband/bmc_archive/bmc_archive_collector.py index 547ba80d..722122ca 100644 --- a/nodescraper/plugins/ooband/bmc_archive/bmc_archive_collector.py +++ b/nodescraper/plugins/ooband/bmc_archive/bmc_archive_collector.py @@ -41,6 +41,11 @@ class BmcArchiveCollector(InBandDataCollector[BmcArchiveDataModel, BmcArchiveCol DATA_MODEL = BmcArchiveDataModel SUPPORTED_OS_FAMILY = {OSFamily.LINUX, OSFamily.UNKNOWN} + DOCUMENTATION_COLLECTION_ITEMS: tuple[str, ...] = ( + "SSH (BMC) shell: tar+gzip archives for each path in collection_args (see PathSpec entries).", + "Uses sudo on the BMC when collection_args paths require elevated access.", + ) + REMOTE_ARCHIVE_TEMPLATE = "/tmp/node_scraper_{name}.tar.gz" # None until first probe in a run; collect_data resets so each collection re-probes. _tar_ignore_failed_read_supported: Optional[bool] = None diff --git a/nodescraper/plugins/ooband/redfish_endpoint/endpoint_analyzer.py b/nodescraper/plugins/ooband/redfish_endpoint/endpoint_analyzer.py index 59dd7a8d..1e43a71a 100644 --- a/nodescraper/plugins/ooband/redfish_endpoint/endpoint_analyzer.py +++ b/nodescraper/plugins/ooband/redfish_endpoint/endpoint_analyzer.py @@ -89,6 +89,12 @@ class RedfishEndpointAnalyzer(DataAnalyzer[RedfishEndpointDataModel, RedfishEndp DATA_MODEL = RedfishEndpointDataModel + DOCUMENTATION_ANALYSIS_ITEMS: tuple[str, ...] 
= ( + "For each entry in analysis_args.checks, reads JSON paths in collected responses and " + "compares values to constraints (eq, min/max, anyOf, regex, etc.).", + 'URI key "*" runs checks against every collected response body.', + ) + def analyze_data( self, data: RedfishEndpointDataModel, diff --git a/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py b/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py index e0878c1a..2a6715c6 100644 --- a/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py +++ b/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py @@ -152,6 +152,13 @@ class RedfishEndpointCollector( DATA_MODEL = RedfishEndpointDataModel + DOCUMENTATION_COLLECTION_ITEMS: tuple[str, ...] = ( + "Redfish GET: explicit paths from collection_args.uris (parallel when max_workers>1).", + "Optional paged GET following Members@odata.nextLink when follow_next_link is true.", + "Redfish GET tree: when discover_tree is true, walks from api_root using @odata.id / " + "Members links (depth and endpoint caps from collection_args).", + ) + def collect_data( self, args: Optional[RedfishEndpointCollectorArgs] = None ) -> tuple[TaskResult, Optional[RedfishEndpointDataModel]]: diff --git a/nodescraper/plugins/ooband/redfish_oem_diag/oem_diag_analyzer.py b/nodescraper/plugins/ooband/redfish_oem_diag/oem_diag_analyzer.py index c54d9e2f..11aaa1e8 100644 --- a/nodescraper/plugins/ooband/redfish_oem_diag/oem_diag_analyzer.py +++ b/nodescraper/plugins/ooband/redfish_oem_diag/oem_diag_analyzer.py @@ -38,6 +38,11 @@ class RedfishOemDiagAnalyzer(DataAnalyzer[RedfishOemDiagDataModel, RedfishOemDia DATA_MODEL = RedfishOemDiagDataModel + DOCUMENTATION_ANALYSIS_ITEMS: tuple[str, ...] 
= ( + "Summarizes success/failure per OEM diagnostic type from collected results.", + "When analysis_args.require_all_success is true, fails the run if any type failed collection.", + ) + def analyze_data( self, data: RedfishOemDiagDataModel, diff --git a/nodescraper/plugins/ooband/redfish_oem_diag/oem_diag_collector.py b/nodescraper/plugins/ooband/redfish_oem_diag/oem_diag_collector.py index b406ef38..f2e3d1d2 100644 --- a/nodescraper/plugins/ooband/redfish_oem_diag/oem_diag_collector.py +++ b/nodescraper/plugins/ooband/redfish_oem_diag/oem_diag_collector.py @@ -43,6 +43,12 @@ class RedfishOemDiagCollector( DATA_MODEL = RedfishOemDiagDataModel + DOCUMENTATION_COLLECTION_ITEMS: tuple[str, ...] = ( + "Redfish LogService.CollectDiagnosticData for each entry in collection_args.oem_diagnostic_types " + "(collection_args.log_service_path selects the LogService).", + "Optional binary archives under the plugin log path when log_path is set.", + ) + def __init__(self, *args: Any, **kwargs: Any) -> None: self.log_path = kwargs.pop("log_path", None) super().__init__(*args, **kwargs) diff --git a/test/unit/plugin/test_regex_search_analyzer.py b/test/unit/plugin/test_regex_search_analyzer.py index ac018ee1..e93b93da 100644 --- a/test/unit/plugin/test_regex_search_analyzer.py +++ b/test/unit/plugin/test_regex_search_analyzer.py @@ -28,10 +28,16 @@ import tempfile from nodescraper.enums.executionstatus import ExecutionStatus -from nodescraper.plugins.regex_search.analyzer_args import RegexSearchAnalyzerArgs -from nodescraper.plugins.regex_search.regex_search_analyzer import RegexSearchAnalyzer -from nodescraper.plugins.regex_search.regex_search_data import RegexSearchData -from nodescraper.plugins.regex_search.regex_search_plugin import RegexSearchPlugin +from nodescraper.plugins.inband.regex_search.analyzer_args import ( + RegexSearchAnalyzerArgs, +) +from nodescraper.plugins.inband.regex_search.regex_search_analyzer import ( + RegexSearchAnalyzer, +) +from 
nodescraper.plugins.inband.regex_search.regex_search_data import RegexSearchData +from nodescraper.plugins.inband.regex_search.regex_search_plugin import ( + RegexSearchPlugin, +) EXPECTED_MISSING_ANALYSIS_MSG = "Analysis args need to be provided for the analyzer to run" From b901cd867431b65d1a6faad62967f008bcbe950e Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 12 Jun 2026 10:05:03 -0500 Subject: [PATCH 10/39] fixes for doc --- docs/PLUGIN_DOC.md | 4 ++-- docs/generate_plugin_doc_bundle.py | 9 +++++++-- .../plugins/inband/regex_search/regex_search_plugin.py | 9 +++++++++ .../plugins/ooband/redfish_endpoint/collector_args.py | 5 ++++- .../ooband/redfish_endpoint/endpoint_collector.py | 6 +++--- 5 files changed, 25 insertions(+), 8 deletions(-) diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md index e0d84df8..94dc5227 100644 --- a/docs/PLUGIN_DOC.md +++ b/docs/PLUGIN_DOC.md @@ -24,7 +24,7 @@ | PciePlugin | lspci -d {vendor_id}: -nn
lspci -x
lspci -xxxx
lspci -PP
lspci -PP -d {vendor_id}:{dev_id}
lspci -PP -D -d {vendor_id}:{dev_id}
lspci -PP -D
lspci -vvv
lspci -vvvt | **Analyzer Args:**
- `exp_speed`: int — Expected PCIe link speed (generation 1–5).
- `exp_width`: int — Expected PCIe link width in lanes (1–16).
- `exp_sriov_count`: int — Expected SR-IOV virtual function count.
- `exp_gpu_count_override`: Optional[int] — Override expected GPU count for validation.
- `exp_max_payload_size`: Union[Dict[int, int], int, NoneType] — Expected max payload size: int for all devices, or dict keyed by device ID.
- `exp_max_rd_req_size`: Union[Dict[int, int], int, NoneType] — Expected max read request size: int for all devices, or dict keyed by device ID.
- `exp_ten_bit_tag_req_en`: Union[Dict[int, int], int, NoneType] — Expected 10-bit tag request enable: int for all devices, or dict keyed by device ID. | - | [PcieDataModel](#PcieDataModel-Model) | [PcieCollector](#Collector-Class-PcieCollector) | [PcieAnalyzer](#Data-Analyzer-Class-PcieAnalyzer) | | ProcessPlugin | top -b -n 1
rocm-smi --showpids
top -b -n 1 -o %CPU | **Analyzer Args:**
- `max_kfd_processes`: int — Maximum allowed number of KFD (Kernel Fusion Driver) processes; 0 disables the check.
- `max_cpu_usage`: float — Maximum allowed CPU usage (percent) for process checks. | **Collection Args:**
- `top_n_process`: int — Number of top processes by CPU usage to collect (e.g. for top -b -n 1 -o %%CPU). | [ProcessDataModel](#ProcessDataModel-Model) | [ProcessCollector](#Collector-Class-ProcessCollector) | [ProcessAnalyzer](#Data-Analyzer-Class-ProcessAnalyzer) | | RdmaPlugin | rdma link -j
rdma dev
rdma link
rdma statistic -j | - | - | [RdmaDataModel](#RdmaDataModel-Model) | [RdmaCollector](#Collector-Class-RdmaCollector) | [RdmaAnalyzer](#Data-Analyzer-Class-RdmaAnalyzer) | -| RegexSearchPlugin | - | - | - | [RegexSearchData](#RegexSearchData-Model) | - | [RegexSearchAnalyzer](#Data-Analyzer-Class-RegexSearchAnalyzer) | +| RegexSearchPlugin | No COLLECTOR: data is the text loaded from --data (file or directory) or equivalent input. | Runs RegexSearchAnalyzer: user-defined patterns via analysis_args.error_regex (same shape as Dmesg).
Emits regex match events with optional per-file source in the description when scanning directories.
**Analyzer Args:**
- `error_regex`: Optional[list[dict[str, Any]]] — Regex patterns to search for; each dict may include regex (str), message, event_category, event_priority (same as Dmesg analyzer error_regex).
- `interval_to_collapse_event`: int — Seconds within which repeated events are collapsed into one.
- `num_timestamps`: int — Number of timestamps to include per event in output. | - | [RegexSearchData](#RegexSearchData-Model) | - | [RegexSearchAnalyzer](#Data-Analyzer-Class-RegexSearchAnalyzer) | | RocmPlugin | {rocm_path}/opencl/bin/*/clinfo
env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'
ls /sys/class/kfd/kfd/proc/
grep -i -E 'rocm' /etc/ld.so.conf.d/*
{rocm_path}/bin/rocminfo
ls -v -d {rocm_path}*
ls -v -d {rocm_path}-[3-7]* | tail -1
ldconfig -p | grep -i -E 'rocm'
grep . -H -r -i {rocm_path}/.info/* | **Analyzer Args:**
- `exp_rocm`: Union[str, list] — Expected ROCm version string(s) to match (e.g. from rocminfo).
- `exp_rocm_latest`: str — Expected 'latest' ROCm path or version string for versioned installs.
- `exp_rocm_sub_versions`: dict[str, Union[str, list]] — Map sub-version name (e.g. version_rocm) to expected string or list of allowed strings. | **Collection Args:**
- `rocm_path`: str — Base path to ROCm installation (e.g. /opt/rocm). Used for rocminfo, clinfo, and version discovery. | [RocmDataModel](#RocmDataModel-Model) | [RocmCollector](#Collector-Class-RocmCollector) | [RocmAnalyzer](#Data-Analyzer-Class-RocmAnalyzer) | | StoragePlugin | sh -c 'df -lH -B1 | grep -v 'boot''
wmic LogicalDisk Where DriveType="3" Get DeviceId,Size,FreeSpace | - | **Collection Args:**
- `skip_sudo`: bool — If True, do not use sudo when running df and related storage commands. | [StorageDataModel](#StorageDataModel-Model) | [StorageCollector](#Collector-Class-StorageCollector) | [StorageAnalyzer](#Data-Analyzer-Class-StorageAnalyzer) | | SysSettingsPlugin | cat /sys/{}
ls -1 /sys/{}
ls -l /sys/{} | **Analyzer Args:**
- `checks`: Optional[list[nodescraper.plugins.inband.sys_settings.analyzer_args.SysfsCheck]] — List of sysfs checks (path, expected values or pattern, display name). | **Collection Args:**
- `paths`: list[str] — Sysfs paths to read (cat). Paths with '*' are collected with ls -l (e.g. class/net/*/device).
- `directory_paths`: list[str] — Sysfs paths to list (ls -1); used for checks that match entry names by regex. | [SysSettingsDataModel](#SysSettingsDataModel-Model) | [SysSettingsCollector](#Collector-Class-SysSettingsCollector) | [SysSettingsAnalyzer](#Data-Analyzer-Class-SysSettingsAnalyzer) | @@ -37,7 +37,7 @@ | Plugin | Collection | Analyzer Args | Collection Args | DataModel | Collector | Analyzer | | --- | --- | --- | --- | --- | --- | --- | | OobBmcArchivePlugin | SSH (BMC) shell: tar+gzip archives for each path in collection_args (see PathSpec entries).
Uses sudo on the BMC when collection_args paths require elevated access. | - | **Collection Args:**
- `paths`: list[nodescraper.plugins.ooband.bmc_archive.collector_args.PathSpec] — Named BMC paths to archive with tar czf -. Configure in plugin config under plugins.OobBmcArchivePlugin.collection_ar...
- `sudo`: bool — Default sudo setting for paths that do not specify sudo.
- `timeout`: int — Default per-path tar timeout in seconds.
- `skip_if_missing`: bool — Skip paths that do not exist on the BMC instead of failing collection.
- `ignore_failed_read`: bool — When true, pass GNU tar's --ignore-failed-read when the remote tar supports it. | [BmcArchiveDataModel](#BmcArchiveDataModel-Model) | [BmcArchiveCollector](#Collector-Class-BmcArchiveCollector) | - | -| RedfishEndpointPlugin | Redfish GET: explicit paths from collection_args.uris (parallel when max_workers>1).
Optional paged GET following Members@odata.nextLink when follow_next_link is true.
Redfish GET tree: when discover_tree is true, walks from api_root using @odata.id / Members links (depth and endpoint caps from collection_args). | For each entry in analysis_args.checks, reads JSON paths in collected responses and compares values to constraints (eq, min/max, anyOf, regex, etc.).
URI key "*" runs checks against every collected response body.
**Analyzer Args:**
- `checks`: dict[str, dict[str, Union[int, float, str, bool, dict[str, Any]]]] — Map: URI or '*' -> { property_path: constraint }. URI keys must match a key in the collected responses (exact match).... | **Collection Args:**
- `uris`: list[str] — Redfish URIs to GET. Ignored when discover_tree is True.
- `discover_tree`: bool — If True, discover endpoints from the BMC Redfish tree (service root and links) instead of using uris.
- `tree_max_depth`: int — When discover_tree is True: max traversal depth (1=service root only, 2=root + collections, 3=+ members).
- `tree_max_endpoints`: int — When discover_tree is True: max endpoints to discover (0=no limit).
- `max_workers`: int — Max concurrent GETs (1=sequential). Use >1 for async endpoint fetches.
- `follow_next_link`: bool — If True, follow Members@odata.nextLink pagination for each URI and merge all pages into a single response.
- `max_pages`: int — When follow_next_link is True: safety cap on the number of pages to follow per URI (default 200). | [RedfishEndpointDataModel](#RedfishEndpointDataModel-Model) | [RedfishEndpointCollector](#Collector-Class-RedfishEndpointCollector) | [RedfishEndpointAnalyzer](#Data-Analyzer-Class-RedfishEndpointAnalyzer) | +| RedfishEndpointPlugin | Redfish GET: explicit paths from collection_args.uris (parallel when max_workers>1).
Optional paged GET following the Members collection OData nextLink field when follow_next_link is true.
Redfish GET tree: when discover_tree is true, walks from api_root using OData resource id links and Members navigation (depth and endpoint caps from collection_args). | For each entry in analysis_args.checks, reads JSON paths in collected responses and compares values to constraints (eq, min/max, anyOf, regex, etc.).
URI key "*" runs checks against every collected response body.
**Analyzer Args:**
- `checks`: dict[str, dict[str, Union[int, float, str, bool, dict[str, Any]]]] — Map: URI or '*' -> { property_path: constraint }. URI keys must match a key in the collected responses (exact match).... | **Collection Args:**
- `uris`: list[str] — Redfish URIs to GET. Ignored when discover_tree is True.
- `discover_tree`: bool — If True, discover endpoints from the BMC Redfish tree (service root and links) instead of using uris.
- `tree_max_depth`: int — When discover_tree is True: max traversal depth (1=service root only, 2=root + collections, 3=+ members).
- `tree_max_endpoints`: int — When discover_tree is True: max endpoints to discover (0=no limit).
- `max_workers`: int — Max concurrent GETs (1=sequential). Use >1 for async endpoint fetches.
- `follow_next_link`: bool — If True, follow Redfish Members collection OData nextLink pagination for each URI and merge all pages into a single response.
- `max_pages`: int — When follow_next_link is True: safety cap on the number of pages to follow per URI (default 200). | [RedfishEndpointDataModel](#RedfishEndpointDataModel-Model) | [RedfishEndpointCollector](#Collector-Class-RedfishEndpointCollector) | [RedfishEndpointAnalyzer](#Data-Analyzer-Class-RedfishEndpointAnalyzer) | | RedfishOemDiagPlugin | Redfish LogService.CollectDiagnosticData for each entry in collection_args.oem_diagnostic_types (collection_args.log_service_path selects the LogService).
Optional binary archives under the plugin log path when log_path is set. | Summarizes success/failure per OEM diagnostic type from collected results.
When analysis_args.require_all_success is true, fails the run if any type failed collection.
**Analyzer Args:**
- `require_all_success`: bool — If True, analysis fails when any OEM type collection failed. | **Collection Args:**
- `log_service_path`: str — Redfish path to the LogService (e.g. DiagLogs).
- `oem_diagnostic_types_allowable`: Optional[list[str]] — Allowable OEM diagnostic types for this architecture/BMC. When set, used for validation and as default for oem_diagno...
- `oem_diagnostic_types`: list[str] — OEM diagnostic types to collect. When empty and oem_diagnostic_types_allowable is set, defaults to that list.
- `task_timeout_s`: int — Max seconds to wait for each BMC task. | [RedfishOemDiagDataModel](#RedfishOemDiagDataModel-Model) | [RedfishOemDiagCollector](#Collector-Class-RedfishOemDiagCollector) | [RedfishOemDiagAnalyzer](#Data-Analyzer-Class-RedfishOemDiagAnalyzer) | # Collectors diff --git a/docs/generate_plugin_doc_bundle.py b/docs/generate_plugin_doc_bundle.py index fb47a297..96042329 100644 --- a/docs/generate_plugin_doc_bundle.py +++ b/docs/generate_plugin_doc_bundle.py @@ -504,7 +504,8 @@ def escape_table_cell(s: str) -> str: """ if not s: return s - return s.replace("|", "|").replace("\n", " ").replace("\r", " ") + # Avoid @ in cells (e.g. OData property names) being turned into mail/mention links in Outlook/HTML viewers. + return s.replace("|", "|").replace("@", "@").replace("\n", " ").replace("\r", " ") def md_header(text: str, level: int = 2) -> str: @@ -847,7 +848,11 @@ def main(): f"packages ({', '.join(DEFAULT_PACKAGES)}). Repeatable." ), ) - ap.add_argument("--output", default="PLUGIN_DOC.md", help="Output Markdown file") + ap.add_argument( + "--output", + default="docs/PLUGIN_DOC.md", + help="Output Markdown file (default: docs/PLUGIN_DOC.md under repo root)", + ) ap.add_argument( "--update-readme-help", action="store_true", diff --git a/nodescraper/plugins/inband/regex_search/regex_search_plugin.py b/nodescraper/plugins/inband/regex_search/regex_search_plugin.py index 3b923550..c33359f1 100644 --- a/nodescraper/plugins/inband/regex_search/regex_search_plugin.py +++ b/nodescraper/plugins/inband/regex_search/regex_search_plugin.py @@ -39,6 +39,15 @@ class RegexSearchPlugin(InBandDataPlugin[RegexSearchData, CollectorArgs, RegexSe DATA_MODEL = RegexSearchData ANALYZER = RegexSearchAnalyzer + ANALYZER_ARGS = RegexSearchAnalyzerArgs + + DOCUMENTATION_COLLECTION_ITEMS: tuple[str, ...] = ( + "No COLLECTOR: data is the text loaded from --data (file or directory) or equivalent input.", + ) + DOCUMENTATION_ANALYSIS_ITEMS: tuple[str, ...] 
= ( + "Runs RegexSearchAnalyzer: user-defined patterns via analysis_args.error_regex (same shape as Dmesg).", + "Emits regex match events with optional per-file source in the description when scanning directories.", + ) def analyze( self, diff --git a/nodescraper/plugins/ooband/redfish_endpoint/collector_args.py b/nodescraper/plugins/ooband/redfish_endpoint/collector_args.py index 189c5edf..6583075e 100644 --- a/nodescraper/plugins/ooband/redfish_endpoint/collector_args.py +++ b/nodescraper/plugins/ooband/redfish_endpoint/collector_args.py @@ -59,7 +59,10 @@ class RedfishEndpointCollectorArgs(CollectorArgs): ) follow_next_link: bool = Field( default=False, - description="If True, follow Members@odata.nextLink pagination for each URI and merge all pages into a single response.", + description=( + "If True, follow Redfish Members collection OData nextLink pagination for each URI " + "and merge all pages into a single response." + ), ) max_pages: int = Field( default=200, diff --git a/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py b/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py index 2a6715c6..37bd839b 100644 --- a/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py +++ b/nodescraper/plugins/ooband/redfish_endpoint/endpoint_collector.py @@ -154,9 +154,9 @@ class RedfishEndpointCollector( DOCUMENTATION_COLLECTION_ITEMS: tuple[str, ...] 
= ( "Redfish GET: explicit paths from collection_args.uris (parallel when max_workers>1).", - "Optional paged GET following Members@odata.nextLink when follow_next_link is true.", - "Redfish GET tree: when discover_tree is true, walks from api_root using @odata.id / " - "Members links (depth and endpoint caps from collection_args).", + "Optional paged GET following the Members collection OData nextLink field when follow_next_link is true.", + "Redfish GET tree: when discover_tree is true, walks from api_root using OData resource id links and " + "Members navigation (depth and endpoint caps from collection_args).", ) def collect_data( From 4ffe6b6c30e5764749006d35d15ae3a2794bf28a Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 12 Jun 2026 10:07:50 -0500 Subject: [PATCH 11/39] fixes for doc --- docs/PLUGIN_DOC.md | 2 +- nodescraper/plugins/inband/regex_search/regex_search_plugin.py | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md index 94dc5227..e15ce92a 100644 --- a/docs/PLUGIN_DOC.md +++ b/docs/PLUGIN_DOC.md @@ -24,7 +24,7 @@ | PciePlugin | lspci -d {vendor_id}: -nn
lspci -x
lspci -xxxx
lspci -PP
lspci -PP -d {vendor_id}:{dev_id}
lspci -PP -D -d {vendor_id}:{dev_id}
lspci -PP -D
lspci -vvv
lspci -vvvt | **Analyzer Args:**
- `exp_speed`: int — Expected PCIe link speed (generation 1–5).
- `exp_width`: int — Expected PCIe link width in lanes (1–16).
- `exp_sriov_count`: int — Expected SR-IOV virtual function count.
- `exp_gpu_count_override`: Optional[int] — Override expected GPU count for validation.
- `exp_max_payload_size`: Union[Dict[int, int], int, NoneType] — Expected max payload size: int for all devices, or dict keyed by device ID.
- `exp_max_rd_req_size`: Union[Dict[int, int], int, NoneType] — Expected max read request size: int for all devices, or dict keyed by device ID.
- `exp_ten_bit_tag_req_en`: Union[Dict[int, int], int, NoneType] — Expected 10-bit tag request enable: int for all devices, or dict keyed by device ID. | - | [PcieDataModel](#PcieDataModel-Model) | [PcieCollector](#Collector-Class-PcieCollector) | [PcieAnalyzer](#Data-Analyzer-Class-PcieAnalyzer) | | ProcessPlugin | top -b -n 1
rocm-smi --showpids
top -b -n 1 -o %CPU | **Analyzer Args:**
- `max_kfd_processes`: int — Maximum allowed number of KFD (Kernel Fusion Driver) processes; 0 disables the check.
- `max_cpu_usage`: float — Maximum allowed CPU usage (percent) for process checks. | **Collection Args:**
- `top_n_process`: int — Number of top processes by CPU usage to collect (e.g. for top -b -n 1 -o %%CPU). | [ProcessDataModel](#ProcessDataModel-Model) | [ProcessCollector](#Collector-Class-ProcessCollector) | [ProcessAnalyzer](#Data-Analyzer-Class-ProcessAnalyzer) | | RdmaPlugin | rdma link -j
rdma dev
rdma link
rdma statistic -j | - | - | [RdmaDataModel](#RdmaDataModel-Model) | [RdmaCollector](#Collector-Class-RdmaCollector) | [RdmaAnalyzer](#Data-Analyzer-Class-RdmaAnalyzer) | -| RegexSearchPlugin | No COLLECTOR: data is the text loaded from --data (file or directory) or equivalent input. | Runs RegexSearchAnalyzer: user-defined patterns via analysis_args.error_regex (same shape as Dmesg).
Emits regex match events with optional per-file source in the description when scanning directories.
**Analyzer Args:**
- `error_regex`: Optional[list[dict[str, Any]]] — Regex patterns to search for; each dict may include regex (str), message, event_category, event_priority (same as Dmesg analyzer error_regex).
- `interval_to_collapse_event`: int — Seconds within which repeated events are collapsed into one.
- `num_timestamps`: int — Number of timestamps to include per event in output. | - | [RegexSearchData](#RegexSearchData-Model) | - | [RegexSearchAnalyzer](#Data-Analyzer-Class-RegexSearchAnalyzer) | +| RegexSearchPlugin | - | Runs RegexSearchAnalyzer: user-defined patterns via analysis_args.error_regex (same shape as Dmesg).
Emits regex match events with optional per-file source in the description when scanning directories.
**Analyzer Args:**
- `error_regex`: Optional[list[dict[str, Any]]] — Regex patterns to search for; each dict may include regex (str), message, event_category, event_priority (same as Dmesg analyzer error_regex).
- `interval_to_collapse_event`: int — Seconds within which repeated events are collapsed into one.
- `num_timestamps`: int — Number of timestamps to include per event in output. | - | [RegexSearchData](#RegexSearchData-Model) | - | [RegexSearchAnalyzer](#Data-Analyzer-Class-RegexSearchAnalyzer) | | RocmPlugin | {rocm_path}/opencl/bin/*/clinfo
env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'
ls /sys/class/kfd/kfd/proc/
grep -i -E 'rocm' /etc/ld.so.conf.d/*
{rocm_path}/bin/rocminfo
ls -v -d {rocm_path}*
ls -v -d {rocm_path}-[3-7]* | tail -1
ldconfig -p | grep -i -E 'rocm'
grep . -H -r -i {rocm_path}/.info/* | **Analyzer Args:**
- `exp_rocm`: Union[str, list] — Expected ROCm version string(s) to match (e.g. from rocminfo).
- `exp_rocm_latest`: str — Expected 'latest' ROCm path or version string for versioned installs.
- `exp_rocm_sub_versions`: dict[str, Union[str, list]] — Map sub-version name (e.g. version_rocm) to expected string or list of allowed strings. | **Collection Args:**
- `rocm_path`: str — Base path to ROCm installation (e.g. /opt/rocm). Used for rocminfo, clinfo, and version discovery. | [RocmDataModel](#RocmDataModel-Model) | [RocmCollector](#Collector-Class-RocmCollector) | [RocmAnalyzer](#Data-Analyzer-Class-RocmAnalyzer) | | StoragePlugin | sh -c 'df -lH -B1 | grep -v 'boot''
wmic LogicalDisk Where DriveType="3" Get DeviceId,Size,FreeSpace | - | **Collection Args:**
- `skip_sudo`: bool — If True, do not use sudo when running df and related storage commands. | [StorageDataModel](#StorageDataModel-Model) | [StorageCollector](#Collector-Class-StorageCollector) | [StorageAnalyzer](#Data-Analyzer-Class-StorageAnalyzer) | | SysSettingsPlugin | cat /sys/{}
ls -1 /sys/{}
ls -l /sys/{} | **Analyzer Args:**
- `checks`: Optional[list[nodescraper.plugins.inband.sys_settings.analyzer_args.SysfsCheck]] — List of sysfs checks (path, expected values or pattern, display name). | **Collection Args:**
- `paths`: list[str] — Sysfs paths to read (cat). Paths with '*' are collected with ls -l (e.g. class/net/*/device).
- `directory_paths`: list[str] — Sysfs paths to list (ls -1); used for checks that match entry names by regex. | [SysSettingsDataModel](#SysSettingsDataModel-Model) | [SysSettingsCollector](#Collector-Class-SysSettingsCollector) | [SysSettingsAnalyzer](#Data-Analyzer-Class-SysSettingsAnalyzer) | diff --git a/nodescraper/plugins/inband/regex_search/regex_search_plugin.py b/nodescraper/plugins/inband/regex_search/regex_search_plugin.py index c33359f1..2a101ff8 100644 --- a/nodescraper/plugins/inband/regex_search/regex_search_plugin.py +++ b/nodescraper/plugins/inband/regex_search/regex_search_plugin.py @@ -41,9 +41,6 @@ class RegexSearchPlugin(InBandDataPlugin[RegexSearchData, CollectorArgs, RegexSe ANALYZER = RegexSearchAnalyzer ANALYZER_ARGS = RegexSearchAnalyzerArgs - DOCUMENTATION_COLLECTION_ITEMS: tuple[str, ...] = ( - "No COLLECTOR: data is the text loaded from --data (file or directory) or equivalent input.", - ) DOCUMENTATION_ANALYSIS_ITEMS: tuple[str, ...] = ( "Runs RegexSearchAnalyzer: user-defined patterns via analysis_args.error_regex (same shape as Dmesg).", "Emits regex match events with optional per-file source in the description when scanning directories.", From 23ba68b4d22d6eba90299e94043f1cf8f4a84360 Mon Sep 17 00:00:00 2001 From: Ignatious Johnson Date: Sat, 13 Jun 2026 20:49:23 -0400 Subject: [PATCH 12/39] fix(pcie): handle unknown PCIe capability IDs without crashing The PCIe collector raised an unhandled ValueError when the config space exposed a capability ID not modeled in CapabilityEnum / ExtendedCapabilityEnum (e.g. 0x2F / IDE on MI300X), aborting the entire PCIe plugin run. - Add ExtendedCapabilityEnum.IDE = 0x2F (Integrity and Data Encryption) - Guard the enum conversion in get_cap_cfg so any unknown cap id is skipped with a warning instead of taking down the whole collection. 
Fixes #224 Co-authored-by: Cursor --- .../plugins/inband/pcie/pcie_collector.py | 18 ++++++++++++++---- nodescraper/plugins/inband/pcie/pcie_data.py | 1 + 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/nodescraper/plugins/inband/pcie/pcie_collector.py b/nodescraper/plugins/inband/pcie/pcie_collector.py index eb3bb5f7..624122ec 100755 --- a/nodescraper/plugins/inband/pcie/pcie_collector.py +++ b/nodescraper/plugins/inband/pcie/pcie_collector.py @@ -489,10 +489,20 @@ def get_cap_cfg( for cap_id, cap_addr in cap_data.items(): if cap_id == 0: continue - if cap_addr >= 0x100: - cap_enum: Enum = ExtendedCapabilityEnum(cap_id) - else: - cap_enum = CapabilityEnum(cap_id) + cap_type = ExtendedCapabilityEnum if cap_addr >= 0x100 else CapabilityEnum + try: + cap_enum: Enum = cap_type(cap_id) + except ValueError: + # Unknown / not-yet-modeled capability id. Skip it instead of + # aborting the whole collection so one new cap id can't take + # down the entire PCIe plugin. + self.logger.warning( + "Skipping unknown %s id 0x%X at offset 0x%X", + cap_type.__name__, + cap_id, + cap_addr, + ) + continue cap_cls = self.get_cap_struct(cap_enum) if cap_cls is None: continue diff --git a/nodescraper/plugins/inband/pcie/pcie_data.py b/nodescraper/plugins/inband/pcie/pcie_data.py index 83a03403..70da6375 100644 --- a/nodescraper/plugins/inband/pcie/pcie_data.py +++ b/nodescraper/plugins/inband/pcie/pcie_data.py @@ -157,6 +157,7 @@ class ExtendedCapabilityEnum(Enum): ALT_PROTOCOL = 0x002B # Alternate Protocol Extended Capability SFI = 0x002C # System Firmware Intermediary (SFI)Extended Capability DOE = 0x2E # 0x2e Data Object Exchange + IDE = 0x2F # 0x2f NOTE(review): PCIe spec / Linux pci_regs.h list 0x2F as Device 3 (IDE is 0x30); confirm name INT_DOE = 0x30 # 0x30 Integrity and Data Encryption From 1e235c1e68389cecf8ac41fd901f8ccfc35a3a65 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 15 Jun 2026 15:02:59 -0500 Subject: [PATCH 13/39] updates --- nodescraper/interfaces/datacollectortask.py | 3 +-
.../plugins/serviceability/__init__.py | 4 +- .../plugins/serviceability/analyzer_args.py | 96 ++++++++-- .../serviceability/mi3xx/mi3xx_analyzer.py | 19 +- .../plugins/serviceability/se_adapter.py | 100 ++++++++-- .../plugins/serviceability/se_models.py | 16 +- .../plugins/serviceability/se_runner.py | 107 ++++++++--- test/unit/mock_python_engine.py | 43 +++++ test/unit/plugin/test_se_runner.py | 172 ++++++++++++++++-- 9 files changed, 477 insertions(+), 83 deletions(-) create mode 100644 test/unit/mock_python_engine.py diff --git a/nodescraper/interfaces/datacollectortask.py b/nodescraper/interfaces/datacollectortask.py index 3c30a6ea..60826b16 100644 --- a/nodescraper/interfaces/datacollectortask.py +++ b/nodescraper/interfaces/datacollectortask.py @@ -204,7 +204,8 @@ def __init_subclass__(cls, **kwargs) -> None: if not issubclass(cls.DATA_MODEL, DataModel): raise TypeError(f"DATA_MODEL must be a subclass of DataModel in {cls.__name__}") if hasattr(cls, "collect_data"): - cls.collect_data = collect_decorator(cls.collect_data) + if "collect_data" in vars(cls): + cls.collect_data = collect_decorator(cls.collect_data) else: raise TypeError(f"Data collector {cls.__name__} must implement collect_data") diff --git a/nodescraper/plugins/serviceability/__init__.py b/nodescraper/plugins/serviceability/__init__.py index 36671691..c5e9f857 100644 --- a/nodescraper/plugins/serviceability/__init__.py +++ b/nodescraper/plugins/serviceability/__init__.py @@ -40,7 +40,7 @@ serviceability_block_from_service_result, ) from .se_models import AfidEvent, ServiceabilityBlock, ServiceabilitySolution -from .se_runner import SeRunError, run_service_engine +from .se_runner import SeRunError, run_service_hub from .serviceability_collector import ServiceabilityCollectorBase from .serviceability_data import ( DeviceInfo, @@ -83,7 +83,7 @@ "is_valid_iso_datetime", "normalize_se_timestamp", "parse_iso_datetime", - "run_service_engine", + "run_service_hub", 
"serviceability_block_from_service_result", "satisfies_time_check", ] diff --git a/nodescraper/plugins/serviceability/analyzer_args.py b/nodescraper/plugins/serviceability/analyzer_args.py index 8d5deea1..2aa27ccd 100644 --- a/nodescraper/plugins/serviceability/analyzer_args.py +++ b/nodescraper/plugins/serviceability/analyzer_args.py @@ -25,7 +25,7 @@ ############################################################################### from __future__ import annotations -from typing import Optional +from typing import Any, Optional from pydantic import Field, field_validator, model_validator @@ -33,14 +33,11 @@ class ServiceabilityAnalyzerArgs(AnalyzerArgs): - """Analyzer args for MI3XX serviceability (Python engine via plugin config).""" + """Analyzer args for serviceability plugins that run a configurable Python hub.""" engine_python_module: Optional[str] = Field( default=None, - description=( - "Importable Python module providing a service engine class with " - "get_service_info(rf_events, cper_data=...)." - ), + description="Import path for the hub module (class implements engine_analyze_method); hub_options forwards kwargs.", ) engine_display_name: Optional[str] = Field( default=None, @@ -48,27 +45,86 @@ class ServiceabilityAnalyzerArgs(AnalyzerArgs): ) afid_sag_path: Optional[str] = Field( default=None, - description="Path to AFID_SAG.json.", + description="Path to hub config (e.g. 
AFID_SAG.json); passed as engine_init_path_kwarg.", + ) + engine_init_path_kwarg: str = Field( + default="afid_sag", + description="Hub __init__ keyword that receives afid_sag_path.", + ) + engine_analyze_method: str = Field( + default="get_service_info", + description="Hub method called with rf_events first (default get_service_info).", ) skip_engine: bool = Field( default=False, - description="If True, only build afid_events without running the service engine.", + description="If True, only build afid_events without running the service hub.", ) cper_decode_module: Optional[str] = Field( default=None, - description=( - "Import path of the Python module that decodes CPER blobs (e.g. " - "vendor.package.cdump_analyzer). Required when collected events " - "include CPER attachments to decode before running the service engine." - ), + description="Module import path for CPER decoding when events include CPER attachments.", ) cper_decode_method: str = Field( default="analyze_cper", - description=( - "Name of the callable on cper_decode_module. It must accept a " - "binary file-like CPER payload and return (return_code, decode_dict)." 
- ), + description="Callable on cper_decode_module: file-like CPER in, (return_code, decode_dict) out.", + ) + hub_options: Optional[dict[str, Any]] = Field( + default=None, + description="Extra kwargs for hub __init__ and analyze; collected cper_data overrides cper_data key.", + ) + from_ac_cycle: int = Field( + default=-1, + ge=-1, + description="from_ac_cycle kwarg for the hub analyze call (merged after hub_options).", + ) + from_date: Optional[str] = Field( + default=None, + description="Optional from_date for the hub analyze call (merged after hub_options).", ) + designation_serials: Optional[dict[str, str]] = Field( + default=None, + description="Optional designation_serials for the hub analyze call (merged after hub_options).", + ) + suppress_service_actions: Optional[list[str]] = Field( + default=None, + description="Optional suppress_service_actions for the hub analyze call (merged after hub_options).", + ) + + def resolved_hub_options(self) -> dict[str, Any]: + """Merge hub_options with from_ac_cycle, from_date, designation_serials, and suppress_service_actions.""" + merged = dict(self.hub_options or {}) + merged["from_ac_cycle"] = self.from_ac_cycle + if self.from_date is not None: + merged["from_date"] = self.from_date + if self.designation_serials is not None: + merged["designation_serials"] = self.designation_serials + if self.suppress_service_actions is not None: + merged["suppress_service_actions"] = self.suppress_service_actions + return merged + + @field_validator("engine_analyze_method", "engine_init_path_kwarg") + @classmethod + def _strip_non_empty_hub_hooks(cls, value: str) -> str: + text = str(value).strip() + if not text: + raise ValueError("must not be empty") + return text + + @field_validator("hub_options", mode="before") + @classmethod + def _none_empty_hub_options(cls, value: object) -> Optional[dict[str, Any]]: + if value is None: + return None + if isinstance(value, dict) and not value: + return None + return value # type: 
ignore[return-value] + + @field_validator("from_date", mode="before") + @classmethod + def _strip_from_date(cls, value: object) -> Optional[str]: + if value is None: + return None + text = str(value).strip() + return text or None @field_validator( "afid_sag_path", @@ -84,11 +140,11 @@ def _strip_optional_strings(cls, value: Optional[str]) -> Optional[str]: return text or None @model_validator(mode="after") - def _require_engine_config_when_running(self) -> ServiceabilityAnalyzerArgs: + def _require_hub_config_when_running(self) -> ServiceabilityAnalyzerArgs: if self.skip_engine: return self if not self.afid_sag_path: - raise ValueError("afid_sag_path is required when running the service engine.") + raise ValueError("afid_sag_path is required when running the service hub.") if not self.engine_python_module: - raise ValueError("engine_python_module is required when running the service engine.") + raise ValueError("engine_python_module is required when running the service hub.") return self diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py index b8fc8373..0424e8e2 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py @@ -40,14 +40,14 @@ format_serviceability_solution_lines, ) from nodescraper.plugins.serviceability.se_models import ServiceabilityBlock -from nodescraper.plugins.serviceability.se_runner import SeRunError, run_service_engine +from nodescraper.plugins.serviceability.se_runner import SeRunError, run_service_hub from nodescraper.plugins.serviceability.serviceability_data import ( ServiceabilityDataModel, ) class MI3XXAnalyzer(DataAnalyzer[ServiceabilityDataModel, ServiceabilityAnalyzerArgs]): - """Build AFID events from collected data and run the configured service engine.""" + """Build AFID events from collected data and run the configured service hub.""" DATA_MODEL = ServiceabilityDataModel 
@@ -67,7 +67,7 @@ def analyze_data( if args.skip_engine: data.serviceability = ServiceabilityBlock(afid_events=events) self.result.status = ExecutionStatus.OK - self.result.message = f"Built {len(events)} AFID event(s); engine skipped" + self.result.message = f"Built {len(events)} AFID event(s); hub skipped" self._log_serviceability_solutions(data.serviceability) return self.result @@ -117,13 +117,16 @@ def analyze_data( ) try: - block = run_service_engine( + block = run_service_hub( engine_python_module=args.engine_python_module, # type: ignore[arg-type] engine_display_name=args.engine_display_name, afid_events=events, afid_sag_path=args.afid_sag_path, # type: ignore[arg-type] rf_events=data.rf_events, cper_data=cper_data or None, + hub_options=args.resolved_hub_options(), + engine_analyze_method=args.engine_analyze_method, + engine_init_path_kwarg=args.engine_init_path_kwarg, ) except (SeRunError, ValueError) as exc: self.result.status = ExecutionStatus.ERROR @@ -139,9 +142,15 @@ def analyze_data( cper_summary = f", {len(cper_data)} decoded CPER(s)" elif data.cper_raw: cper_summary = f", {len(data.cper_raw)} CPER attachment(s) not decoded" + ver_bits: list[str] = [] + if block.hub_version: + ver_bits.append(f"hub {block.hub_version}") + if block.afid_sag_file_version: + ver_bits.append(f"AFID_SAG {block.afid_sag_file_version}") + ver_suffix = f" [{'; '.join(ver_bits)}]" if ver_bits else "" self.result.message = ( f"{engine_label}: {len(block.solution)} solution(s) " - f"from {len(data.rf_events)} Redfish event(s){cper_summary}" + f"from {len(data.rf_events)} Redfish event(s){cper_summary}{ver_suffix}" ) return self.result diff --git a/nodescraper/plugins/serviceability/se_adapter.py b/nodescraper/plugins/serviceability/se_adapter.py index 243b2d7d..0e31135a 100644 --- a/nodescraper/plugins/serviceability/se_adapter.py +++ b/nodescraper/plugins/serviceability/se_adapter.py @@ -23,30 +23,80 @@ # SOFTWARE. 
# ############################################################################### -"""Map serviceability plugin models to/from Python service engine results.""" +"""Map serviceability plugin models to/from Python service hub results.""" from __future__ import annotations from collections import defaultdict -from typing import Any +from typing import Any, Optional from .se_models import AfidEvent, ServiceabilityBlock, ServiceabilitySolution +def _hub_version_display(version_info: Any) -> Optional[str]: + """Pick a single hub version string from common hub result version dict layouts.""" + if not isinstance(version_info, dict) or not version_info: + return None + primary = ( + version_info.get("isa_version") + or version_info.get("version") + or version_info.get("engine_version") + or version_info.get("VERSION") + ) + if primary is None: + return None + text = str(primary).strip() + if not text: + return None + bd = version_info.get("build_date") + if bd and str(bd).strip(): + return f"{text} (build {str(bd).strip()})" + return text + + +def _afid_sag_file_version_display(metadata: Any) -> Optional[str]: + """Build a short AFID_SAG file identity string from hub ``afid_sag_metadata``.""" + if not isinstance(metadata, dict) or not metadata: + return None + pid = metadata.get("sag_pid") or metadata.get("pid") + rev = metadata.get("sag_revision") or metadata.get("revision") + extra = ( + metadata.get("sag_version") + or metadata.get("file_version") + or metadata.get("schema_version") + ) + parts: list[str] = [] + if pid and str(pid).strip(): + parts.append(f"PID {str(pid).strip()}") + if rev and str(rev).strip(): + parts.append(f"revision {str(rev).strip()}") + if extra and str(extra).strip(): + ex = str(extra).strip() + if ex not in (str(pid or "").strip(), str(rev or "").strip()): + parts.append(f"version {ex}") + if not parts: + return None + return ", ".join(parts) + + def format_serviceability_solution_lines(block: ServiceabilityBlock) -> list[str]: 
"""Human-readable lines for logging or console output.""" lines: list[str] = [] if block.solution_reasoning: lines.append(block.solution_reasoning) + if block.hub_version: + lines.append(f"Hub version: {block.hub_version}") + if block.afid_sag_file_version: + lines.append(f"AFID_SAG file: {block.afid_sag_file_version}") if not block.solution: lines.append("No service actions recommended.") return lines for index, solution in enumerate(block.solution, start=1): units = ", ".join(solution.serviceable_unit) - lines.append( - f"[{index}] AFID {solution.afid}, " - f"service action {solution.service_action_num}, " - f"units: [{units}]" - ) + title = (solution.service_action_title or "").strip() + action = f"service action {solution.service_action_num}" + if title: + action = f"{action} ({title})" + lines.append(f"[{index}] AFID {solution.afid}, {action}, units: [{units}]") return lines @@ -54,12 +104,22 @@ def serviceability_block_from_service_result( afid_events: list[AfidEvent], result: Any, *, - engine_label: str = "Service engine", + engine_label: str = "Service hub", rf_event_count: int = 0, ) -> ServiceabilityBlock: - """Build a :class:`ServiceabilityBlock` from an engine result with ``service_info``.""" + """Build a :class:`ServiceabilityBlock` from a hub result with ``service_info``.""" grouped: dict[tuple[int, int], list[str]] = defaultdict(list) + titles: dict[tuple[int, int], str] = {} service_info = getattr(result, "service_info", None) or {} + + def _action_title(info: dict[str, Any]) -> str: + raw = info.get("title") or info.get("service_action") or info.get("ServiceAction") + if raw is None: + return "" + if isinstance(raw, dict): + return str(raw.get("title") or raw.get("text") or raw.get("name") or "").strip() + return str(raw).strip() + for designation, afid_map in service_info.items(): if not isinstance(afid_map, dict): continue @@ -78,29 +138,33 @@ def serviceability_block_from_service_result( key = (afid, san) if unit and unit not in grouped[key]: 
grouped[key].append(unit) + label = _action_title(info) + if label and key not in titles: + titles[key] = label solutions = [ ServiceabilitySolution( afid=afid, serviceable_unit=units, service_action_num=san, + service_action_title=titles.get((afid, san)), ) for (afid, san), units in sorted(grouped.items()) ] metadata = getattr(result, "afid_sag_metadata", None) or {} version_info = ( - getattr(result, "engine_version_info", None) or getattr(result, "version_info", None) or {} - ) - sag_pid = metadata.get("sag_pid") or metadata.get("pid") or "unknown" - sag_revision = metadata.get("sag_revision") or metadata.get("revision") or "unknown" - engine_version = version_info.get("version") or version_info.get("engine_version") - version_suffix = f", engine {engine_version}" if engine_version else "" - reasoning = ( - f"{engine_label} (SAG {sag_pid} rev {sag_revision}{version_suffix}): " - f"{len(solutions)} recommendation(s) from {rf_event_count} Redfish event(s)." + getattr(result, "engine_version_info", None) + or getattr(result, "isa_version_info", None) + or getattr(result, "version_info", None) + or {} ) + hub_version = _hub_version_display(version_info) + afid_sag_file_version = _afid_sag_file_version_display(metadata) + reasoning = f"{engine_label}: {len(solutions)} recommendation(s) from {rf_event_count} Redfish event(s)." 
return ServiceabilityBlock( afid_events=list(afid_events), solution=solutions, solution_reasoning=reasoning, + hub_version=hub_version, + afid_sag_file_version=afid_sag_file_version, ) diff --git a/nodescraper/plugins/serviceability/se_models.py b/nodescraper/plugins/serviceability/se_models.py index 344ef7c7..60c34083 100644 --- a/nodescraper/plugins/serviceability/se_models.py +++ b/nodescraper/plugins/serviceability/se_models.py @@ -60,6 +60,10 @@ class ServiceabilitySolution(BaseModel): service_action_num: int = Field( description="Service action number from AFID_SAG.json.", ) + service_action_title: Optional[str] = Field( + default=None, + description=("Short service action label from the hub."), + ) class ServiceabilityBlock(BaseModel): @@ -71,9 +75,17 @@ class ServiceabilityBlock(BaseModel): ) solution: List[ServiceabilitySolution] = Field( default_factory=list, - description="Engine output: recommended service actions.", + description="Hub output: recommended service actions.", ) solution_reasoning: Optional[str] = Field( default=None, - description="Human-readable summary of how the engine reached its conclusions.", + description="Human-readable summary of recommendations (counts and hub label).", + ) + hub_version: Optional[str] = Field( + default=None, + description="Service hub package/build version string when the hub returned it.", + ) + afid_sag_file_version: Optional[str] = Field( + default=None, + description="AFID_SAG.json identity/revision string when the hub returned metadata.", ) diff --git a/nodescraper/plugins/serviceability/se_runner.py b/nodescraper/plugins/serviceability/se_runner.py index aeec1eb7..c141b6ec 100644 --- a/nodescraper/plugins/serviceability/se_runner.py +++ b/nodescraper/plugins/serviceability/se_runner.py @@ -23,25 +23,71 @@ # SOFTWARE. 
# ############################################################################### -"""Invoke a configured Python service engine against collected Redfish events.""" +"""Invoke a configured Python service hub against collected Redfish events.""" from __future__ import annotations import importlib import inspect from pathlib import Path -from typing import Any, Optional, Type +from typing import Any, Callable, Optional, Type from .se_adapter import serviceability_block_from_service_result from .se_models import AfidEvent, ServiceabilityBlock -_ENGINE_METHOD = "get_service_info" + +def _signature_accepts_var_keyword(sig: inspect.Signature) -> bool: + return any(p.kind == inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values()) + + +def _instantiate_hub( + hub_cls: Type[Any], + config_path: str, + init_path_kwarg: str, + hub_options: Optional[dict[str, Any]], +) -> Any: + """Construct the hub with ``config_path`` under ``init_path_kwarg``, plus matching options.""" + init_sig = inspect.signature(hub_cls.__init__) + kwargs: dict[str, Any] = {init_path_kwarg: config_path} + if not hub_options: + return hub_cls(**kwargs) + if _signature_accepts_var_keyword(init_sig): + merged = dict(hub_options) + merged[init_path_kwarg] = config_path + return hub_cls(**merged) + for key, val in hub_options.items(): + if key in init_sig.parameters: + kwargs[key] = val + kwargs[init_path_kwarg] = config_path + return hub_cls(**kwargs) + + +def _call_hub_analyze( + analyze: Callable[..., Any], + rf_events: list[Any], + cper_data: Optional[dict[str, Any]], + hub_options: Optional[dict[str, Any]], +) -> Any: + """Invoke the hub analyze callable with ``cper_data`` and per-parameter ``hub_options``.""" + sig = inspect.signature(analyze) + params = sig.parameters + eo = dict(hub_options or {}) + + if _signature_accepts_var_keyword(sig): + if "cper_data" in params: + eo["cper_data"] = dict(cper_data) if cper_data else None + return analyze(list(rf_events), **eo) + + kw = {k: v for k, v in 
eo.items() if k in params} + if "cper_data" in params: + kw["cper_data"] = dict(cper_data) if cper_data else None + return analyze(list(rf_events), **kw) class SeRunError(RuntimeError): - """Raised when the service engine fails or returns invalid output.""" + """Raised when the service hub fails or returns invalid output.""" -def run_service_engine( +def run_service_hub( *, engine_python_module: str, engine_display_name: Optional[str] = None, @@ -49,11 +95,21 @@ def run_service_engine( afid_sag_path: str, rf_events: list[Any], cper_data: Optional[dict[str, Any]] = None, + hub_options: Optional[dict[str, Any]] = None, + engine_analyze_method: str = "get_service_info", + engine_init_path_kwarg: str = "afid_sag", ) -> ServiceabilityBlock: - """Run a Python service engine and return a :class:`ServiceabilityBlock`.""" + """Run the configured Python service hub and return a :class:`ServiceabilityBlock`. + + The runner imports ``engine_python_module``, picks the unique class that implements + ``engine_analyze_method``, constructs it with the config file path passed as + ``engine_init_path_kwarg``, then calls the analyze method with ``rf_events`` and any + ``hub_options`` keys that match the method signature (plus ``cper_data`` when + supported). Result mapping is handled by :func:`serviceability_block_from_service_result`. 
+ """ sag_path = Path(afid_sag_path) if not sag_path.is_file(): - raise SeRunError(f"AFID_SAG file not found: {afid_sag_path}") + raise SeRunError(f"Hub config file not found: {afid_sag_path}") if not rf_events: raise SeRunError( @@ -66,17 +122,24 @@ def run_service_engine( except ImportError as exc: raise SeRunError(f"Cannot import {engine_python_module}: {exc}") from exc - engine_cls = _resolve_engine_class(mod) + hub_cls = _resolve_hub_class(mod, engine_analyze_method) try: - instance = engine_cls(afid_sag=afid_sag_path) - analyze = getattr(instance, _ENGINE_METHOD) - result = analyze( - list(rf_events), - cper_data=dict(cper_data) if cper_data else None, + instance = _instantiate_hub( + hub_cls, + afid_sag_path, + engine_init_path_kwarg, + hub_options, + ) + analyze = getattr(instance, engine_analyze_method) + result = _call_hub_analyze( + analyze, + rf_events, + cper_data, + hub_options, ) except Exception as exc: - raise SeRunError(f"{label} {_ENGINE_METHOD}() failed: {exc}") from exc + raise SeRunError(f"{label} {engine_analyze_method}() failed: {exc}") from exc if result is None: return ServiceabilityBlock( @@ -93,18 +156,18 @@ def run_service_engine( ) -def _is_engine_class(obj: Any) -> bool: - return inspect.isclass(obj) and callable(getattr(obj, _ENGINE_METHOD, None)) +def _is_hub_class(obj: Any, analyze_method: str = "get_service_info") -> bool: + return inspect.isclass(obj) and callable(getattr(obj, analyze_method, None)) -def _resolve_engine_class(mod: Any) -> Type[Any]: - """Find the engine class in ``mod`` that implements ``get_service_info``.""" +def _resolve_hub_class(mod: Any, analyze_method: str = "get_service_info") -> Type[Any]: + """Find the hub class in ``mod`` that implements ``analyze_method``.""" package = mod.__name__ candidates: list[Type[Any]] = [] seen: set[int] = set() def add_candidate(obj: Any) -> None: - if not _is_engine_class(obj): + if not _is_hub_class(obj, analyze_method): return key = id(obj) if key in seen: @@ -124,8 +187,8 
@@ def add_candidate(obj: Any) -> None: return candidates[0] if not candidates: raise SeRunError( - f"No class with {_ENGINE_METHOD}() found in {package}; " - "check engine_python_module in analysis_args." + f"No class with {analyze_method}() found in {package}; " + "check engine_python_module and engine_analyze_method in analysis_args." ) names = ", ".join(cls.__name__ for cls in candidates) - raise SeRunError(f"Multiple classes with {_ENGINE_METHOD}() in {package}: {names}.") + raise SeRunError(f"Multiple classes with {analyze_method}() in {package}: {names}.") diff --git a/test/unit/mock_python_engine.py b/test/unit/mock_python_engine.py new file mode 100644 index 00000000..515eea38 --- /dev/null +++ b/test/unit/mock_python_engine.py @@ -0,0 +1,43 @@ +"""Mock Python service engine for unit tests.""" + +from __future__ import annotations + +from types import SimpleNamespace +from typing import Any, Optional + +from serviceability_dummy_data import ( + DUMMY_ENGINE_VERSION, + DUMMY_SAG_PID, + DUMMY_SAG_REVISION, + DUMMY_SERVICE_ACTION_NUM, + DUMMY_SERVICE_ACTION_TITLE, + DUMMY_UNIT_A, +) + + +class MockServiceEngine: + def __init__(self, afid_sag: str) -> None: + self.afid_sag = afid_sag + + def get_service_info( + self, + rf_events: list[dict[str, Any]], + cper_data: Optional[dict[str, Any]] = None, + **kwargs: Any, + ) -> SimpleNamespace: + del cper_data, kwargs + service_info: dict[str, dict[str, dict[str, str]]] = {} + for event in rf_events: + afid = event.get("Afid") + unit = event.get("serviceable_unit", DUMMY_UNIT_A) + if afid is None: + continue + service_info.setdefault(str(unit), {})[str(afid)] = { + "service_action_number": str(DUMMY_SERVICE_ACTION_NUM), + "title": DUMMY_SERVICE_ACTION_TITLE, + } + return SimpleNamespace( + service_info=service_info, + afid_sag_metadata={"sag_pid": DUMMY_SAG_PID, "sag_revision": DUMMY_SAG_REVISION}, + engine_version_info={"version": DUMMY_ENGINE_VERSION}, + ) diff --git a/test/unit/plugin/test_se_runner.py 
b/test/unit/plugin/test_se_runner.py index fd5132f4..d6fdf0d9 100644 --- a/test/unit/plugin/test_se_runner.py +++ b/test/unit/plugin/test_se_runner.py @@ -26,6 +26,7 @@ import json from pathlib import Path from types import SimpleNamespace +from typing import Any import pytest from pydantic import ValidationError @@ -58,7 +59,7 @@ build_afid_events_from_data, format_serviceability_solution_lines, normalize_se_timestamp, - run_service_engine, + run_service_hub, serviceability_block_from_service_result, ) from nodescraper.plugins.serviceability.se_models import ServiceabilitySolution @@ -77,12 +78,12 @@ def test_afid_event_requires_non_empty_serviceable_unit(): AfidEvent(afid=1, serviceable_unit=" ", time=DUMMY_TIMESTAMP) -def test_normalize_se_timestamp_preserves_engine_format(): +def test_normalize_se_timestamp_preserves_format_value(): sample = "2000-01-01 12:00:00.000+00:00" assert normalize_se_timestamp(sample) == sample -def test_analyzer_args_require_engine_config(): +def test_analyzer_args_require_hub_config(): with pytest.raises(ValidationError): ServiceabilityAnalyzerArgs() with pytest.raises(ValidationError, match="engine_python_module"): @@ -94,6 +95,24 @@ def test_analyzer_args_require_engine_config(): assert args.engine_python_module == "dummy.test.module" +def test_resolved_hub_options_explicit_fields_override_options_bag(): + args = ServiceabilityAnalyzerArgs( + engine_python_module="dummy.test.module", + afid_sag_path=str(AFID_SAG), + engine_options={"from_ac_cycle": 9, "extra": 1}, + from_ac_cycle=3, + from_date="2025-01-01", + designation_serials={"U": "S"}, + suppress_service_actions=["99"], + ) + merged = args.resolved_hub_options() + assert merged["from_ac_cycle"] == 3 + assert merged["from_date"] == "2025-01-01" + assert merged["designation_serials"] == {"U": "S"} + assert merged["suppress_service_actions"] == ["99"] + assert merged["extra"] == 1 + + def test_format_serviceability_solution_lines(): block = ServiceabilityBlock( 
afid_events=EXAMPLE_EVENTS[:1], @@ -102,14 +121,20 @@ def test_format_serviceability_solution_lines(): afid=DUMMY_AFID_A, serviceable_unit=[DUMMY_DESIGNATION_A, DUMMY_DESIGNATION_B], service_action_num=DUMMY_SERVICE_ACTION_NUM, + service_action_title="RMA", ) ], solution_reasoning="Dummy test reasoning.", + hub_version="1.0.0-test", + afid_sag_file_version="PID sag-1, revision rev-a", ) lines = format_serviceability_solution_lines(block) assert lines[0] == "Dummy test reasoning." - assert f"AFID {DUMMY_AFID_A}" in lines[1] - assert DUMMY_DESIGNATION_A in lines[1] + assert lines[1] == "Hub version: 1.0.0-test" + assert lines[2] == "AFID_SAG file: PID sag-1, revision rev-a" + assert f"AFID {DUMMY_AFID_A}" in lines[3] + assert DUMMY_DESIGNATION_A in lines[3] + assert "service action 99 (RMA)" in lines[3] def test_serviceability_block_from_service_result(): @@ -144,12 +169,34 @@ def test_serviceability_block_from_service_result(): assert len(block.solution) == 1 assert block.solution[0].afid == DUMMY_AFID_A assert block.solution[0].service_action_num == DUMMY_SERVICE_ACTION_NUM + assert block.solution[0].service_action_title == "Dummy service action" assert set(block.solution[0].serviceable_unit) == {DUMMY_DESIGNATION_A, DUMMY_DESIGNATION_B} + assert block.hub_version == DUMMY_ENGINE_VERSION + assert block.afid_sag_file_version is not None + assert DUMMY_SAG_PID in block.afid_sag_file_version + assert DUMMY_SAG_REVISION in block.afid_sag_file_version assert f"{DUMMY_RF_EVENT_COUNT} Redfish event(s)" in block.solution_reasoning assert "Dummy test engine" in block.solution_reasoning -def test_resolve_engine_class_finds_package_export(): +def test_serviceability_block_from_service_result_isa_version_info(): + result = SimpleNamespace( + service_info={}, + afid_sag_metadata={"sag_pid": DUMMY_SAG_PID, "sag_revision": DUMMY_SAG_REVISION}, + isa_version_info={"VERSION": "1.2.3"}, + ) + block = serviceability_block_from_service_result( + EXAMPLE_EVENTS[:1], + result, + 
engine_label="ISA", + rf_event_count=1, + ) + assert block.hub_version == "1.2.3" + assert block.afid_sag_file_version is not None + assert DUMMY_SAG_PID in block.afid_sag_file_version + + +def test_resolve_hub_class_finds_package_export(): import types submodule = types.ModuleType("fake_engine.impl") @@ -162,17 +209,17 @@ def test_resolve_engine_class_finds_package_export(): package.EngineImpl = submodule.EngineImpl # type: ignore[attr-defined] package.__all__ = ["EngineImpl"] - from nodescraper.plugins.serviceability.se_runner import _resolve_engine_class + from nodescraper.plugins.serviceability.se_runner import _resolve_hub_class - assert _resolve_engine_class(package) is submodule.EngineImpl + assert _resolve_hub_class(package) is submodule.EngineImpl -def test_run_service_engine_with_mock_module(): +def test_run_service_hub_with_mock_module(): rf_events = [ {"Afid": DUMMY_AFID_A, "serviceable_unit": DUMMY_UNIT_A, "Created": DUMMY_TIMESTAMP}, {"Afid": DUMMY_AFID_C, "serviceable_unit": DUMMY_UNIT_C, "Created": DUMMY_TIMESTAMP}, ] - block = run_service_engine( + block = run_service_hub( engine_python_module="mock_python_engine", afid_events=EXAMPLE_EVENTS[:2], afid_sag_path=str(AFID_SAG), @@ -183,9 +230,107 @@ def test_run_service_engine_with_mock_module(): assert block.solution[0].service_action_num == DUMMY_SERVICE_ACTION_NUM -def test_run_service_engine_missing_sag_raises(): - with pytest.raises(SeRunError, match="AFID_SAG"): - run_service_engine( +def test_run_service_hub_custom_analyze_method_and_path_kwarg(): + import sys + import types + + init_log: list[tuple[str, bool]] = [] + analyze_log: list[Any] = [] + + class AltEngine: + def __init__(self, rulebook_path: str, debug: bool = False) -> None: + init_log.append((rulebook_path, debug)) + + def analyze_events(self, rf_events, cper_data=None): + analyze_log.append((list(rf_events), cper_data)) + return None + + mod = types.ModuleType("alt_service_engine") + mod.AltEngine = AltEngine + mod.__all__ = 
["AltEngine"] + sys.modules["alt_service_engine"] = mod + try: + run_service_hub( + engine_python_module="alt_service_engine", + afid_events=EXAMPLE_EVENTS[:1], + afid_sag_path=str(AFID_SAG), + rf_events=[{"Afid": 1}], + cper_data={"k": 1}, + engine_options={"debug": True}, + engine_analyze_method="analyze_events", + engine_init_path_kwarg="rulebook_path", + ) + finally: + del sys.modules["alt_service_engine"] + + assert init_log[0][0] == str(AFID_SAG) + assert init_log[0][1] is True + assert analyze_log[0][1] == {"k": 1} + + +def test_run_service_hub_accepts_engine_options(): + rf_events = [ + {"Afid": DUMMY_AFID_A, "serviceable_unit": DUMMY_UNIT_A, "Created": DUMMY_TIMESTAMP}, + ] + block = run_service_hub( + engine_python_module="mock_python_engine", + afid_events=EXAMPLE_EVENTS[:1], + afid_sag_path=str(AFID_SAG), + rf_events=rf_events, + engine_options={"reporting_level": "verbose"}, + ) + assert len(block.solution) == 1 + + +def test_run_service_hub_forwards_instinct_shaped_engine_options(): + from instinct_shaped_engine import clear_last_call, get_last_call + + clear_last_call() + rf_events = [ + {"Afid": DUMMY_AFID_A, "serviceable_unit": DUMMY_UNIT_A, "Created": DUMMY_TIMESTAMP}, + ] + run_service_hub( + engine_python_module="instinct_shaped_engine", + afid_events=EXAMPLE_EVENTS[:1], + afid_sag_path=str(AFID_SAG), + rf_events=rf_events, + cper_data={"decoded": True}, + engine_options={ + "from_ac_cycle": 2, + "from_date": "2024-06-01", + "designation_serials": {"GPU0": "SN1"}, + "suppress_service_actions": ["42"], + }, + ) + got = get_last_call() + assert got["from_ac_cycle"] == 2 + assert got["from_date"] == "2024-06-01" + assert got["cper_data"] == {"decoded": True} + assert got["designation_serials"] == {"GPU0": "SN1"} + assert got["suppress_service_actions"] == ["42"] + + +def test_run_service_hub_collected_cper_overrides_engine_options_cper_data(): + from instinct_shaped_engine import clear_last_call, get_last_call + + clear_last_call() + rf_events = [ 
+ {"Afid": DUMMY_AFID_A, "serviceable_unit": DUMMY_UNIT_A, "Created": DUMMY_TIMESTAMP}, + ] + run_service_hub( + engine_python_module="instinct_shaped_engine", + afid_events=EXAMPLE_EVENTS[:1], + afid_sag_path=str(AFID_SAG), + rf_events=rf_events, + cper_data={"from_collector": 1}, + engine_options={"cper_data": {"from_options": 2}, "from_ac_cycle": 0}, + ) + assert get_last_call()["cper_data"] == {"from_collector": 1} + + +def test_run_service_hub_missing_sag_raises(): + with pytest.raises(SeRunError, match="Engine config file not found"): + run_service_hub( engine_python_module="mock_python_engine", afid_events=EXAMPLE_EVENTS, afid_sag_path="/nonexistent/dummy_afid_sag.json", @@ -237,6 +382,7 @@ def test_mi3xx_analyzer_runs_python_engine(system_info): args = ServiceabilityAnalyzerArgs( engine_python_module="mock_python_engine", afid_sag_path=str(AFID_SAG), + engine_options={"include_raw_events": False}, ) result = analyzer.analyze_data(data, args=args) assert result.status == ExecutionStatus.OK From 27bf25d1c8fa836747681ab667de870a15b1758f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 16 Jun 2026 00:38:09 +0000 Subject: [PATCH 14/39] docs: Update plugin documentation [automated] --- docs/PLUGIN_DOC.md | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md index e15ce92a..e1b06be4 100644 --- a/docs/PLUGIN_DOC.md +++ b/docs/PLUGIN_DOC.md @@ -24,7 +24,7 @@ | PciePlugin | lspci -d {vendor_id}: -nn
lspci -x
lspci -xxxx
lspci -PP
lspci -PP -d {vendor_id}:{dev_id}
lspci -PP -D -d {vendor_id}:{dev_id}
lspci -PP -D
lspci -vvv
lspci -vvvt | **Analyzer Args:**
- `exp_speed`: int — Expected PCIe link speed (generation 1–5).
- `exp_width`: int — Expected PCIe link width in lanes (1–16).
- `exp_sriov_count`: int — Expected SR-IOV virtual function count.
- `exp_gpu_count_override`: Optional[int] — Override expected GPU count for validation.
- `exp_max_payload_size`: Union[Dict[int, int], int, NoneType] — Expected max payload size: int for all devices, or dict keyed by device ID.
- `exp_max_rd_req_size`: Union[Dict[int, int], int, NoneType] — Expected max read request size: int for all devices, or dict keyed by device ID.
- `exp_ten_bit_tag_req_en`: Union[Dict[int, int], int, NoneType] — Expected 10-bit tag request enable: int for all devices, or dict keyed by device ID. | - | [PcieDataModel](#PcieDataModel-Model) | [PcieCollector](#Collector-Class-PcieCollector) | [PcieAnalyzer](#Data-Analyzer-Class-PcieAnalyzer) | | ProcessPlugin | top -b -n 1
rocm-smi --showpids
top -b -n 1 -o %CPU | **Analyzer Args:**
- `max_kfd_processes`: int — Maximum allowed number of KFD (Kernel Fusion Driver) processes; 0 disables the check.
- `max_cpu_usage`: float — Maximum allowed CPU usage (percent) for process checks. | **Collection Args:**
- `top_n_process`: int — Number of top processes by CPU usage to collect (e.g. for top -b -n 1 -o %%CPU). | [ProcessDataModel](#ProcessDataModel-Model) | [ProcessCollector](#Collector-Class-ProcessCollector) | [ProcessAnalyzer](#Data-Analyzer-Class-ProcessAnalyzer) | | RdmaPlugin | rdma link -j
rdma dev
rdma link
rdma statistic -j | - | - | [RdmaDataModel](#RdmaDataModel-Model) | [RdmaCollector](#Collector-Class-RdmaCollector) | [RdmaAnalyzer](#Data-Analyzer-Class-RdmaAnalyzer) | -| RegexSearchPlugin | - | Runs RegexSearchAnalyzer: user-defined patterns via analysis_args.error_regex (same shape as Dmesg).
Emits regex match events with optional per-file source in the description when scanning directories.
**Analyzer Args:**
- `error_regex`: Optional[list[dict[str, Any]]] — Regex patterns to search for; each dict may include regex (str), message, event_category, event_priority (same as Dmesg analyzer error_regex).
- `interval_to_collapse_event`: int — Seconds within which repeated events are collapsed into one.
- `num_timestamps`: int — Number of timestamps to include per event in output. | - | [RegexSearchData](#RegexSearchData-Model) | - | [RegexSearchAnalyzer](#Data-Analyzer-Class-RegexSearchAnalyzer) | +| RegexSearchPlugin | - | Runs RegexSearchAnalyzer: user-defined patterns via analysis_args.error_regex (same shape as Dmesg).
Emits regex match events with optional per-file source in the description when scanning directories.
**Analyzer Args:**
- `error_regex`: Optional[list[dict[str, Any]]] — Regex patterns to search for; each dict may include regex (str), message, event_category, event_priority (same as Dme...
- `interval_to_collapse_event`: int — Seconds within which repeated events are collapsed into one.
- `num_timestamps`: int — Number of timestamps to include per event in output. | - | [RegexSearchData](#RegexSearchData-Model) | - | [RegexSearchAnalyzer](#Data-Analyzer-Class-RegexSearchAnalyzer) | | RocmPlugin | {rocm_path}/opencl/bin/*/clinfo
env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'
ls /sys/class/kfd/kfd/proc/
grep -i -E 'rocm' /etc/ld.so.conf.d/*
{rocm_path}/bin/rocminfo
ls -v -d {rocm_path}*
ls -v -d {rocm_path}-[3-7]* | tail -1
ldconfig -p | grep -i -E 'rocm'
grep . -H -r -i {rocm_path}/.info/* | **Analyzer Args:**
- `exp_rocm`: Union[str, list] — Expected ROCm version string(s) to match (e.g. from rocminfo).
- `exp_rocm_latest`: str — Expected 'latest' ROCm path or version string for versioned installs.
- `exp_rocm_sub_versions`: dict[str, Union[str, list]] — Map sub-version name (e.g. version_rocm) to expected string or list of allowed strings. | **Collection Args:**
- `rocm_path`: str — Base path to ROCm installation (e.g. /opt/rocm). Used for rocminfo, clinfo, and version discovery. | [RocmDataModel](#RocmDataModel-Model) | [RocmCollector](#Collector-Class-RocmCollector) | [RocmAnalyzer](#Data-Analyzer-Class-RocmAnalyzer) | | StoragePlugin | sh -c 'df -lH -B1 | grep -v 'boot''
wmic LogicalDisk Where DriveType="3" Get DeviceId,Size,FreeSpace | - | **Collection Args:**
- `skip_sudo`: bool — If True, do not use sudo when running df and related storage commands. | [StorageDataModel](#StorageDataModel-Model) | [StorageCollector](#Collector-Class-StorageCollector) | [StorageAnalyzer](#Data-Analyzer-Class-StorageAnalyzer) | | SysSettingsPlugin | cat /sys/{}
ls -1 /sys/{}
ls -l /sys/{} | **Analyzer Args:**
- `checks`: Optional[list[nodescraper.plugins.inband.sys_settings.analyzer_args.SysfsCheck]] — List of sysfs checks (path, expected values or pattern, display name). | **Collection Args:**
- `paths`: list[str] — Sysfs paths to read (cat). Paths with '*' are collected with ls -l (e.g. class/net/*/device).
- `directory_paths`: list[str] — Sysfs paths to list (ls -1); used for checks that match entry names by regex. | [SysSettingsDataModel](#SysSettingsDataModel-Model) | [SysSettingsCollector](#Collector-Class-SysSettingsCollector) | [SysSettingsAnalyzer](#Data-Analyzer-Class-SysSettingsAnalyzer) | @@ -37,7 +37,7 @@ | Plugin | Collection | Analyzer Args | Collection Args | DataModel | Collector | Analyzer | | --- | --- | --- | --- | --- | --- | --- | | OobBmcArchivePlugin | SSH (BMC) shell: tar+gzip archives for each path in collection_args (see PathSpec entries).
Uses sudo on the BMC when collection_args paths require elevated access. | - | **Collection Args:**
- `paths`: list[nodescraper.plugins.ooband.bmc_archive.collector_args.PathSpec] — Named BMC paths to archive with tar czf -. Configure in plugin config under plugins.OobBmcArchivePlugin.collection_ar...
- `sudo`: bool — Default sudo setting for paths that do not specify sudo.
- `timeout`: int — Default per-path tar timeout in seconds.
- `skip_if_missing`: bool — Skip paths that do not exist on the BMC instead of failing collection.
- `ignore_failed_read`: bool — When true, pass GNU tar's --ignore-failed-read when the remote tar supports it. | [BmcArchiveDataModel](#BmcArchiveDataModel-Model) | [BmcArchiveCollector](#Collector-Class-BmcArchiveCollector) | - | -| RedfishEndpointPlugin | Redfish GET: explicit paths from collection_args.uris (parallel when max_workers>1).
Optional paged GET following the Members collection OData nextLink field when follow_next_link is true.
Redfish GET tree: when discover_tree is true, walks from api_root using OData resource id links and Members navigation (depth and endpoint caps from collection_args). | For each entry in analysis_args.checks, reads JSON paths in collected responses and compares values to constraints (eq, min/max, anyOf, regex, etc.).
URI key "*" runs checks against every collected response body.
**Analyzer Args:**
- `checks`: dict[str, dict[str, Union[int, float, str, bool, dict[str, Any]]]] — Map: URI or '*' -> { property_path: constraint }. URI keys must match a key in the collected responses (exact match).... | **Collection Args:**
- `uris`: list[str] — Redfish URIs to GET. Ignored when discover_tree is True.
- `discover_tree`: bool — If True, discover endpoints from the BMC Redfish tree (service root and links) instead of using uris.
- `tree_max_depth`: int — When discover_tree is True: max traversal depth (1=service root only, 2=root + collections, 3=+ members).
- `tree_max_endpoints`: int — When discover_tree is True: max endpoints to discover (0=no limit).
- `max_workers`: int — Max concurrent GETs (1=sequential). Use >1 for async endpoint fetches.
- `follow_next_link`: bool — If True, follow Redfish Members collection OData nextLink pagination for each URI and merge all pages into a single response.
- `max_pages`: int — When follow_next_link is True: safety cap on the number of pages to follow per URI (default 200). | [RedfishEndpointDataModel](#RedfishEndpointDataModel-Model) | [RedfishEndpointCollector](#Collector-Class-RedfishEndpointCollector) | [RedfishEndpointAnalyzer](#Data-Analyzer-Class-RedfishEndpointAnalyzer) | +| RedfishEndpointPlugin | Redfish GET: explicit paths from collection_args.uris (parallel when max_workers>1).
Optional paged GET following the Members collection OData nextLink field when follow_next_link is true.
Redfish GET tree: when discover_tree is true, walks from api_root using OData resource id links and Members navigation (depth and endpoint caps from collection_args). | For each entry in analysis_args.checks, reads JSON paths in collected responses and compares values to constraints (eq, min/max, anyOf, regex, etc.).
URI key "*" runs checks against every collected response body.
**Analyzer Args:**
- `checks`: dict[str, dict[str, Union[int, float, str, bool, dict[str, Any]]]] — Map: URI or '*' -> { property_path: constraint }. URI keys must match a key in the collected responses (exact match).... | **Collection Args:**
- `uris`: list[str] — Redfish URIs to GET. Ignored when discover_tree is True.
- `discover_tree`: bool — If True, discover endpoints from the BMC Redfish tree (service root and links) instead of using uris.
- `tree_max_depth`: int — When discover_tree is True: max traversal depth (1=service root only, 2=root + collections, 3=+ members).
- `tree_max_endpoints`: int — When discover_tree is True: max endpoints to discover (0=no limit).
- `max_workers`: int — Max concurrent GETs (1=sequential). Use >1 for async endpoint fetches.
- `follow_next_link`: bool — If True, follow Redfish Members collection OData nextLink pagination for each URI and merge all pages into a single r...
- `max_pages`: int — When follow_next_link is True: safety cap on the number of pages to follow per URI (default 200). | [RedfishEndpointDataModel](#RedfishEndpointDataModel-Model) | [RedfishEndpointCollector](#Collector-Class-RedfishEndpointCollector) | [RedfishEndpointAnalyzer](#Data-Analyzer-Class-RedfishEndpointAnalyzer) | | RedfishOemDiagPlugin | Redfish LogService.CollectDiagnosticData for each entry in collection_args.oem_diagnostic_types (collection_args.log_service_path selects the LogService).
Optional binary archives under the plugin log path when log_path is set. | Summarizes success/failure per OEM diagnostic type from collected results.
When analysis_args.require_all_success is true, fails the run if any type failed collection.
**Analyzer Args:**
- `require_all_success`: bool — If True, analysis fails when any OEM type collection failed. | **Collection Args:**
- `log_service_path`: str — Redfish path to the LogService (e.g. DiagLogs).
- `oem_diagnostic_types_allowable`: Optional[list[str]] — Allowable OEM diagnostic types for this architecture/BMC. When set, used for validation and as default for oem_diagno...
- `oem_diagnostic_types`: list[str] — OEM diagnostic types to collect. When empty and oem_diagnostic_types_allowable is set, defaults to that list.
- `task_timeout_s`: int — Max seconds to wait for each BMC task. | [RedfishOemDiagDataModel](#RedfishOemDiagDataModel-Model) | [RedfishOemDiagCollector](#Collector-Class-RedfishOemDiagCollector) | [RedfishOemDiagAnalyzer](#Data-Analyzer-Class-RedfishOemDiagAnalyzer) | # Collectors @@ -998,8 +998,8 @@ RedfishEndpointDataModel ### Documented collection - Redfish GET: explicit paths from collection_args.uris (parallel when max_workers>1). -- Optional paged GET following Members@odata.nextLink when follow_next_link is true. -- Redfish GET tree: when discover_tree is true, walks from api_root using @odata.id / Members links (depth and endpoint caps from collection_args). +- Optional paged GET following the Members collection OData nextLink field when follow_next_link is true. +- Redfish GET tree: when discover_tree is true, walks from api_root using OData resource id links and Members navigation (depth and endpoint caps from collection_args). ## Collector Class RedfishOemDiagCollector @@ -2149,6 +2149,22 @@ Arguments for PCIe analyzer - **max_kfd_processes**: `int` — Maximum allowed number of KFD (Kernel Fusion Driver) processes; 0 disables the check. - **max_cpu_usage**: `float` — Maximum allowed CPU usage (percent) for process checks. +## Analyzer Args Class RegexSearchAnalyzerArgs + +### Description + +Arguments for RegexSearchAnalyzer (dict items match Dmesg-style error_regex). + +**Bases**: ['AnalyzerArgs'] + +**Link to code**: [analyzer_args.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/regex_search/analyzer_args.py) + +### Annotations / fields + +- **error_regex**: `Optional[list[dict[str, Any]]]` — Regex patterns to search for; each dict may include regex (str), message, event_category, event_priority (same as Dmesg analyzer error_regex). +- **interval_to_collapse_event**: `int` — Seconds within which repeated events are collapsed into one. +- **num_timestamps**: `int` — Number of timestamps to include per event in output. 
+ ## Analyzer Args Class RocmAnalyzerArgs **Bases**: ['AnalyzerArgs'] From bf6bb2f3b1edf6782ff1c1084c57ff937ac82a01 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 16 Jun 2026 08:51:43 -0500 Subject: [PATCH 15/39] utest fix --- nodescraper/interfaces/plugin.py | 4 ++-- test/unit/framework/common/shared_utils.py | 19 +++++++++++++++++-- test/unit/plugin/test_se_runner.py | 20 ++++++++++---------- 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/nodescraper/interfaces/plugin.py b/nodescraper/interfaces/plugin.py index 06959b54..9e22d346 100644 --- a/nodescraper/interfaces/plugin.py +++ b/nodescraper/interfaces/plugin.py @@ -26,7 +26,7 @@ import abc import inspect import logging -from typing import Callable, Generic, Optional, Type, Union +from typing import Any, Callable, Generic, Optional, Type, Union from nodescraper.constants import DEFAULT_EVENT_REPORTER, DEFAULT_LOGGER from nodescraper.models import PluginResult, SystemInfo @@ -125,7 +125,7 @@ def _update_queue(self, queue_item: tuple) -> None: self.queue_callback(queue_item) @abc.abstractmethod - def run(self, **kwargs) -> PluginResult: + def run(self, **kwargs: Any) -> PluginResult: """Plugin run function Returns: diff --git a/test/unit/framework/common/shared_utils.py b/test/unit/framework/common/shared_utils.py index 05e77af3..7ba16c16 100644 --- a/test/unit/framework/common/shared_utils.py +++ b/test/unit/framework/common/shared_utils.py @@ -23,7 +23,7 @@ # SOFTWARE. # ############################################################################### -from typing import Any, Optional +from typing import Any, Dict, List, Optional from unittest.mock import MagicMock from nodescraper.constants import DEFAULT_EVENT_REPORTER @@ -87,12 +87,27 @@ class DummyDataModel(DataModel): some_version: str = "0" +# Module-level defaults so ``run`` signatures stay stable for ConfigBuilder tests. 
+_TEST_PLUGIN_A_LIST_DEFAULT: List[Any] = [1] +_TEST_PLUGIN_A_DICT_DEFAULT: Dict[str, Any] = {} +_TEST_PLUGIN_A_MODEL_DEFAULT = TestModelArg() + + class TestPluginA(PluginInterface[MockConnectionManager, None]): CONNECTION_TYPE = MockConnectionManager ANALYZER_ARGS = TestModelArg - def run(self, **kwargs: Any) -> PluginResult: + def run( + self, + test_bool_arg: bool = True, + test_str_arg: str = "test", + test_list_arg: List[Any] = _TEST_PLUGIN_A_LIST_DEFAULT, + test_dict_arg: Dict[str, Any] = _TEST_PLUGIN_A_DICT_DEFAULT, + test_model_arg: TestModelArg = _TEST_PLUGIN_A_MODEL_DEFAULT, + **kwargs: Any, + ) -> PluginResult: + _ = kwargs return PluginResult( source="testA", status=ExecutionStatus.ERROR, diff --git a/test/unit/plugin/test_se_runner.py b/test/unit/plugin/test_se_runner.py index d6fdf0d9..01f8c4bc 100644 --- a/test/unit/plugin/test_se_runner.py +++ b/test/unit/plugin/test_se_runner.py @@ -99,7 +99,7 @@ def test_resolved_hub_options_explicit_fields_override_options_bag(): args = ServiceabilityAnalyzerArgs( engine_python_module="dummy.test.module", afid_sag_path=str(AFID_SAG), - engine_options={"from_ac_cycle": 9, "extra": 1}, + hub_options={"from_ac_cycle": 9, "extra": 1}, from_ac_cycle=3, from_date="2025-01-01", designation_serials={"U": "S"}, @@ -256,7 +256,7 @@ def analyze_events(self, rf_events, cper_data=None): afid_sag_path=str(AFID_SAG), rf_events=[{"Afid": 1}], cper_data={"k": 1}, - engine_options={"debug": True}, + hub_options={"debug": True}, engine_analyze_method="analyze_events", engine_init_path_kwarg="rulebook_path", ) @@ -268,7 +268,7 @@ def analyze_events(self, rf_events, cper_data=None): assert analyze_log[0][1] == {"k": 1} -def test_run_service_hub_accepts_engine_options(): +def test_run_service_hub_accepts_hub_options(): rf_events = [ {"Afid": DUMMY_AFID_A, "serviceable_unit": DUMMY_UNIT_A, "Created": DUMMY_TIMESTAMP}, ] @@ -277,12 +277,12 @@ def test_run_service_hub_accepts_engine_options(): afid_events=EXAMPLE_EVENTS[:1], 
afid_sag_path=str(AFID_SAG), rf_events=rf_events, - engine_options={"reporting_level": "verbose"}, + hub_options={"reporting_level": "verbose"}, ) assert len(block.solution) == 1 -def test_run_service_hub_forwards_instinct_shaped_engine_options(): +def test_run_service_hub_forwards_full_hub_options_kwargs(): from instinct_shaped_engine import clear_last_call, get_last_call clear_last_call() @@ -295,7 +295,7 @@ def test_run_service_hub_forwards_instinct_shaped_engine_options(): afid_sag_path=str(AFID_SAG), rf_events=rf_events, cper_data={"decoded": True}, - engine_options={ + hub_options={ "from_ac_cycle": 2, "from_date": "2024-06-01", "designation_serials": {"GPU0": "SN1"}, @@ -310,7 +310,7 @@ def test_run_service_hub_forwards_instinct_shaped_engine_options(): assert got["suppress_service_actions"] == ["42"] -def test_run_service_hub_collected_cper_overrides_engine_options_cper_data(): +def test_run_service_hub_collected_cper_overrides_hub_options_cper_data(): from instinct_shaped_engine import clear_last_call, get_last_call clear_last_call() @@ -323,13 +323,13 @@ def test_run_service_hub_collected_cper_overrides_engine_options_cper_data(): afid_sag_path=str(AFID_SAG), rf_events=rf_events, cper_data={"from_collector": 1}, - engine_options={"cper_data": {"from_options": 2}, "from_ac_cycle": 0}, + hub_options={"cper_data": {"from_options": 2}, "from_ac_cycle": 0}, ) assert get_last_call()["cper_data"] == {"from_collector": 1} def test_run_service_hub_missing_sag_raises(): - with pytest.raises(SeRunError, match="Engine config file not found"): + with pytest.raises(SeRunError, match="Hub config file not found"): run_service_hub( engine_python_module="mock_python_engine", afid_events=EXAMPLE_EVENTS, @@ -382,7 +382,7 @@ def test_mi3xx_analyzer_runs_python_engine(system_info): args = ServiceabilityAnalyzerArgs( engine_python_module="mock_python_engine", afid_sag_path=str(AFID_SAG), - engine_options={"include_raw_events": False}, + hub_options={"include_raw_events": 
False}, ) result = analyzer.analyze_data(data, args=args) assert result.status == ExecutionStatus.OK From dafa092821757a15c977b2446c8a88500f4a2d4b Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 16 Jun 2026 08:58:08 -0500 Subject: [PATCH 16/39] utest fix --- nodescraper/configbuilder.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/nodescraper/configbuilder.py b/nodescraper/configbuilder.py index 7823b95a..bc8f1b8a 100644 --- a/nodescraper/configbuilder.py +++ b/nodescraper/configbuilder.py @@ -24,6 +24,7 @@ # ############################################################################### import enum +import inspect import logging from typing import Any, Optional, Type, Union @@ -64,9 +65,17 @@ def gen_config(self, plugin_names: list[str]) -> PluginConfig: @classmethod def _build_plugin_config(cls, plugin_class: Type[PluginInterface]) -> dict: type_map = TypeUtils.get_func_arg_types(plugin_class.run, plugin_class) + run_sig = inspect.signature(plugin_class.run) config = {} for arg, arg_data in type_map.items(): + param = run_sig.parameters.get(arg) + # abstraction level for the ServiceabilityPlugin to allow kwargs for hub call + if param is not None and param.kind in ( + inspect.Parameter.VAR_KEYWORD, + inspect.Parameter.VAR_POSITIONAL, + ): + continue cls._update_config(arg, arg_data, config) return config From 3c1e1592960d5c16df160d730d649eb2712aad49 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 16 Jun 2026 11:32:36 -0500 Subject: [PATCH 17/39] doc updates --- docs/PLUGIN_DOC.md | 83 ++++++++++++++++++++++++++++-- docs/generate_plugin_doc_bundle.py | 19 ++++--- 2 files changed, 90 insertions(+), 12 deletions(-) diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md index e15ce92a..88c06e42 100644 --- a/docs/PLUGIN_DOC.md +++ b/docs/PLUGIN_DOC.md @@ -4,6 +4,7 @@ | Plugin | Collection | Analyzer Args | Collection Args | DataModel | Collector | Analyzer | | --- | --- | --- | --- | --- | --- | --- | +| GenericCollectionPlugin | Runs 
each command from collection_args.commands on the target (in-band host or BMC over OOB SSH).
Commands are user-configured; there are no fixed CMD_* class fields. | **Analyzer Args:**
- `checks`: list[nodescraper.plugins.generic_collection.analyzer_args.CommandCheck] — Per-command validation rules keyed by collected command name. | **Collection Args:**
- `commands`: list[nodescraper.plugins.generic_collection.collector_args.CommandSpec] — Named commands to run. Each entry must include 'name' and 'command'. Prefer small textual stdout; see class docstring...
- `sudo`: bool — Default sudo setting for commands that do not specify sudo.
- `timeout`: int — Default per-command timeout in seconds.
- `include_stdout`: bool — Default: include each command's stdout in collected results for analysis. When false, stdout is omitted from stored r... | [GenericCollectionDataModel](#GenericCollectionDataModel-Model) | [GenericCollectionCollector](#Collector-Class-GenericCollectionCollector) | [GenericAnalyzer](#Data-Analyzer-Class-GenericAnalyzer) | | AmdSmiPlugin | bad-pages
firmware --json
list --json
metric -g all
partition --json
process --json
ras --cper --folder={folder}
ras --afid --cper-file {cper_file}
static -g all --json
static -g {gpu_id} --json
topology
version --json
xgmi -l
xgmi -m | **Analyzer Args:**
- `check_static_data`: bool — If True, run static data checks (e.g. driver version, partition mode).
- `expected_gpu_processes`: Optional[int] — Expected number of GPU processes.
- `expected_max_power`: Optional[int] — Expected maximum power value (e.g. watts).
- `expected_power_management`: Optional[str] — Expected amd-smi metric power_management value per GPU (e.g. DISABLED for active/full power, ENABLED for power-manage...
- `expected_driver_version`: Optional[str] — Expected AMD driver version string.
- `expected_memory_partition_mode`: Optional[str] — Expected memory partition mode (e.g. sp3, dp).
- `expected_compute_partition_mode`: Optional[str] — Expected compute partition mode.
- `expected_firmware_versions`: Optional[dict[str, str]] — Expected firmware versions keyed by amd-smi fw_id (e.g. PLDM_BUNDLE).
- `l0_to_recovery_count_error_threshold`: Optional[int] — L0-to-recovery count above which an error is raised.
- `l0_to_recovery_count_warning_threshold`: Optional[int] — L0-to-recovery count above which a warning is raised.
- `vendorid_ep`: Optional[str] — Expected endpoint vendor ID (e.g. for PCIe).
- `vendorid_ep_vf`: Optional[str] — Expected endpoint VF vendor ID.
- `devid_ep`: Optional[str] — Expected endpoint device ID.
- `devid_ep_vf`: Optional[str] — Expected endpoint VF device ID.
- `sku_name`: Optional[str] — Expected SKU name string for GPU.
- `expected_xgmi_speed`: Optional[list[float]] — Expected xGMI speed value(s) (e.g. link rate).
- `analysis_range_start`: Optional[datetime.datetime] — Start of time range for time-windowed analysis.
- `analysis_range_end`: Optional[datetime.datetime] — End of time range for time-windowed analysis. | **Collection Args:**
- `analysis_firmware_ids`: Optional[list[str]] — amd-smi fw_id values to record in analysis_ref.firmware_versions
- `cper_file_path`: Optional[str] — Path to CPER folder or file for RAS AFID collection (ras --afid --cper-file). | [AmdSmiDataModel](#AmdSmiDataModel-Model) | [AmdSmiCollector](#Collector-Class-AmdSmiCollector) | [AmdSmiAnalyzer](#Data-Analyzer-Class-AmdSmiAnalyzer) | | BiosPlugin | sh -c 'cat /sys/devices/virtual/dmi/id/bios_version'
wmic bios get SMBIOSBIOSVersion /Value | **Analyzer Args:**
- `exp_bios_version`: list[str] — Expected BIOS version(s) to match against collected value (str or list).
- `regex_match`: bool — If True, match exp_bios_version as regex; otherwise exact match. | - | [BiosDataModel](#BiosDataModel-Model) | [BiosCollector](#Collector-Class-BiosCollector) | [BiosAnalyzer](#Data-Analyzer-Class-BiosAnalyzer) | | CmdlinePlugin | cat /proc/cmdline | **Analyzer Args:**
- `required_cmdline`: Union[str, List] — Command-line parameters that must be present (e.g. 'pci=bfsort').
- `banned_cmdline`: Union[str, List] — Command-line parameters that must not be present.
- `os_overrides`: Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig] — Per-OS overrides for required_cmdline and banned_cmdline (keyed by OS identifier).
- `platform_overrides`: Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig] — Per-platform overrides for required_cmdline and banned_cmdline (keyed by platform). | - | [CmdlineDataModel](#CmdlineDataModel-Model) | [CmdlineCollector](#Collector-Class-CmdlineCollector) | [CmdlineAnalyzer](#Data-Analyzer-Class-CmdlineAnalyzer) | @@ -24,7 +25,7 @@ | PciePlugin | lspci -d {vendor_id}: -nn
lspci -x
lspci -xxxx
lspci -PP
lspci -PP -d {vendor_id}:{dev_id}
lspci -PP -D -d {vendor_id}:{dev_id}
lspci -PP -D
lspci -vvv
lspci -vvvt | **Analyzer Args:**
- `exp_speed`: int — Expected PCIe link speed (generation 1–5).
- `exp_width`: int — Expected PCIe link width in lanes (1–16).
- `exp_sriov_count`: int — Expected SR-IOV virtual function count.
- `exp_gpu_count_override`: Optional[int] — Override expected GPU count for validation.
- `exp_max_payload_size`: Union[Dict[int, int], int, NoneType] — Expected max payload size: int for all devices, or dict keyed by device ID.
- `exp_max_rd_req_size`: Union[Dict[int, int], int, NoneType] — Expected max read request size: int for all devices, or dict keyed by device ID.
- `exp_ten_bit_tag_req_en`: Union[Dict[int, int], int, NoneType] — Expected 10-bit tag request enable: int for all devices, or dict keyed by device ID. | - | [PcieDataModel](#PcieDataModel-Model) | [PcieCollector](#Collector-Class-PcieCollector) | [PcieAnalyzer](#Data-Analyzer-Class-PcieAnalyzer) | | ProcessPlugin | top -b -n 1
rocm-smi --showpids
top -b -n 1 -o %CPU | **Analyzer Args:**
- `max_kfd_processes`: int — Maximum allowed number of KFD (Kernel Fusion Driver) processes; 0 disables the check.
- `max_cpu_usage`: float — Maximum allowed CPU usage (percent) for process checks. | **Collection Args:**
- `top_n_process`: int — Number of top processes by CPU usage to collect (e.g. for top -b -n 1 -o %%CPU). | [ProcessDataModel](#ProcessDataModel-Model) | [ProcessCollector](#Collector-Class-ProcessCollector) | [ProcessAnalyzer](#Data-Analyzer-Class-ProcessAnalyzer) | | RdmaPlugin | rdma link -j
rdma dev
rdma link
rdma statistic -j | - | - | [RdmaDataModel](#RdmaDataModel-Model) | [RdmaCollector](#Collector-Class-RdmaCollector) | [RdmaAnalyzer](#Data-Analyzer-Class-RdmaAnalyzer) | -| RegexSearchPlugin | - | Runs RegexSearchAnalyzer: user-defined patterns via analysis_args.error_regex (same shape as Dmesg).
Emits regex match events with optional per-file source in the description when scanning directories.
**Analyzer Args:**
- `error_regex`: Optional[list[dict[str, Any]]] — Regex patterns to search for; each dict may include regex (str), message, event_category, event_priority (same as Dmesg analyzer error_regex).
- `interval_to_collapse_event`: int — Seconds within which repeated events are collapsed into one.
- `num_timestamps`: int — Number of timestamps to include per event in output. | - | [RegexSearchData](#RegexSearchData-Model) | - | [RegexSearchAnalyzer](#Data-Analyzer-Class-RegexSearchAnalyzer) | +| RegexSearchPlugin | - | Runs RegexSearchAnalyzer: user-defined patterns via analysis_args.error_regex (same shape as Dmesg).
Emits regex match events with optional per-file source in the description when scanning directories.
**Analyzer Args:**
- `error_regex`: Optional[list[dict[str, Any]]] — Regex patterns to search for; each dict may include regex (str), message, event_category, event_priority (same as Dme...
- `interval_to_collapse_event`: int — Seconds within which repeated events are collapsed into one.
- `num_timestamps`: int — Number of timestamps to include per event in output. | - | [RegexSearchData](#RegexSearchData-Model) | - | [RegexSearchAnalyzer](#Data-Analyzer-Class-RegexSearchAnalyzer) | | RocmPlugin | {rocm_path}/opencl/bin/*/clinfo
env | grep -Ei 'rocm|hsa|hip|mpi|openmp|ucx|miopen'
ls /sys/class/kfd/kfd/proc/
grep -i -E 'rocm' /etc/ld.so.conf.d/*
{rocm_path}/bin/rocminfo
ls -v -d {rocm_path}*
ls -v -d {rocm_path}-[3-7]* | tail -1
ldconfig -p | grep -i -E 'rocm'
grep . -H -r -i {rocm_path}/.info/* | **Analyzer Args:**
- `exp_rocm`: Union[str, list] — Expected ROCm version string(s) to match (e.g. from rocminfo).
- `exp_rocm_latest`: str — Expected 'latest' ROCm path or version string for versioned installs.
- `exp_rocm_sub_versions`: dict[str, Union[str, list]] — Map sub-version name (e.g. version_rocm) to expected string or list of allowed strings. | **Collection Args:**
- `rocm_path`: str — Base path to ROCm installation (e.g. /opt/rocm). Used for rocminfo, clinfo, and version discovery. | [RocmDataModel](#RocmDataModel-Model) | [RocmCollector](#Collector-Class-RocmCollector) | [RocmAnalyzer](#Data-Analyzer-Class-RocmAnalyzer) | | StoragePlugin | sh -c 'df -lH -B1 | grep -v 'boot''
wmic LogicalDisk Where DriveType="3" Get DeviceId,Size,FreeSpace | - | **Collection Args:**
- `skip_sudo`: bool — If True, do not use sudo when running df and related storage commands. | [StorageDataModel](#StorageDataModel-Model) | [StorageCollector](#Collector-Class-StorageCollector) | [StorageAnalyzer](#Data-Analyzer-Class-StorageAnalyzer) | | SysSettingsPlugin | cat /sys/{}
ls -1 /sys/{}
ls -l /sys/{} | **Analyzer Args:**
- `checks`: Optional[list[nodescraper.plugins.inband.sys_settings.analyzer_args.SysfsCheck]] — List of sysfs checks (path, expected values or pattern, display name). | **Collection Args:**
- `paths`: list[str] — Sysfs paths to read (cat). Paths with '*' are collected with ls -l (e.g. class/net/*/device).
- `directory_paths`: list[str] — Sysfs paths to list (ls -1); used for checks that match entry names by regex. | [SysSettingsDataModel](#SysSettingsDataModel-Model) | [SysSettingsCollector](#Collector-Class-SysSettingsCollector) | [SysSettingsAnalyzer](#Data-Analyzer-Class-SysSettingsAnalyzer) | @@ -36,12 +37,36 @@ | Plugin | Collection | Analyzer Args | Collection Args | DataModel | Collector | Analyzer | | --- | --- | --- | --- | --- | --- | --- | +| OobGenericCollectionPlugin | Runs each command from collection_args.commands on the target (in-band host or BMC over OOB SSH).
Commands are user-configured; there are no fixed CMD_* class fields. | **Analyzer Args:**
- `checks`: list[nodescraper.plugins.generic_collection.analyzer_args.CommandCheck] — Per-command validation rules keyed by collected command name. | **Collection Args:**
- `commands`: list[nodescraper.plugins.generic_collection.collector_args.CommandSpec] — Named commands to run. Each entry must include 'name' and 'command'. Prefer small textual stdout; see class docstring...
- `sudo`: bool — Default sudo setting for commands that do not specify sudo.
- `timeout`: int — Default per-command timeout in seconds.
- `include_stdout`: bool — Default: include each command's stdout in collected results for analysis. When false, stdout is omitted from stored r... | [GenericCollectionDataModel](#GenericCollectionDataModel-Model) | [GenericCollectionCollector](#Collector-Class-GenericCollectionCollector) | [GenericAnalyzer](#Data-Analyzer-Class-GenericAnalyzer) | | OobBmcArchivePlugin | SSH (BMC) shell: tar+gzip archives for each path in collection_args (see PathSpec entries).
Uses sudo on the BMC when collection_args paths require elevated access. | - | **Collection Args:**
- `paths`: list[nodescraper.plugins.ooband.bmc_archive.collector_args.PathSpec] — Named BMC paths to archive with tar czf -. Configure in plugin config under plugins.OobBmcArchivePlugin.collection_ar...
- `sudo`: bool — Default sudo setting for paths that do not specify sudo.
- `timeout`: int — Default per-path tar timeout in seconds.
- `skip_if_missing`: bool — Skip paths that do not exist on the BMC instead of failing collection.
- `ignore_failed_read`: bool — When true, pass GNU tar's --ignore-failed-read when the remote tar supports it. | [BmcArchiveDataModel](#BmcArchiveDataModel-Model) | [BmcArchiveCollector](#Collector-Class-BmcArchiveCollector) | - | -| RedfishEndpointPlugin | Redfish GET: explicit paths from collection_args.uris (parallel when max_workers>1).
Optional paged GET following the Members collection OData nextLink field when follow_next_link is true.
Redfish GET tree: when discover_tree is true, walks from api_root using OData resource id links and Members navigation (depth and endpoint caps from collection_args). | For each entry in analysis_args.checks, reads JSON paths in collected responses and compares values to constraints (eq, min/max, anyOf, regex, etc.).
URI key "*" runs checks against every collected response body.
**Analyzer Args:**
- `checks`: dict[str, dict[str, Union[int, float, str, bool, dict[str, Any]]]] — Map: URI or '*' -> { property_path: constraint }. URI keys must match a key in the collected responses (exact match).... | **Collection Args:**
- `uris`: list[str] — Redfish URIs to GET. Ignored when discover_tree is True.
- `discover_tree`: bool — If True, discover endpoints from the BMC Redfish tree (service root and links) instead of using uris.
- `tree_max_depth`: int — When discover_tree is True: max traversal depth (1=service root only, 2=root + collections, 3=+ members).
- `tree_max_endpoints`: int — When discover_tree is True: max endpoints to discover (0=no limit).
- `max_workers`: int — Max concurrent GETs (1=sequential). Use >1 for async endpoint fetches.
- `follow_next_link`: bool — If True, follow Redfish Members collection OData nextLink pagination for each URI and merge all pages into a single response.
- `max_pages`: int — When follow_next_link is True: safety cap on the number of pages to follow per URI (default 200). | [RedfishEndpointDataModel](#RedfishEndpointDataModel-Model) | [RedfishEndpointCollector](#Collector-Class-RedfishEndpointCollector) | [RedfishEndpointAnalyzer](#Data-Analyzer-Class-RedfishEndpointAnalyzer) | +| RedfishEndpointPlugin | Redfish GET: explicit paths from collection_args.uris (parallel when max_workers>1).
Optional paged GET following the Members collection OData nextLink field when follow_next_link is true.
Redfish GET tree: when discover_tree is true, walks from api_root using OData resource id links and Members navigation (depth and endpoint caps from collection_args). | For each entry in analysis_args.checks, reads JSON paths in collected responses and compares values to constraints (eq, min/max, anyOf, regex, etc.).
URI key "*" runs checks against every collected response body.
**Analyzer Args:**
- `checks`: dict[str, dict[str, Union[int, float, str, bool, dict[str, Any]]]] — Map: URI or '*' -> { property_path: constraint }. URI keys must match a key in the collected responses (exact match).... | **Collection Args:**
- `uris`: list[str] — Redfish URIs to GET. Ignored when discover_tree is True.
- `discover_tree`: bool — If True, discover endpoints from the BMC Redfish tree (service root and links) instead of using uris.
- `tree_max_depth`: int — When discover_tree is True: max traversal depth (1=service root only, 2=root + collections, 3=+ members).
- `tree_max_endpoints`: int — When discover_tree is True: max endpoints to discover (0=no limit).
- `max_workers`: int — Max concurrent GETs (1=sequential). Use >1 for async endpoint fetches.
- `follow_next_link`: bool — If True, follow Redfish Members collection OData nextLink pagination for each URI and merge all pages into a single r...
- `max_pages`: int — When follow_next_link is True: safety cap on the number of pages to follow per URI (default 200). | [RedfishEndpointDataModel](#RedfishEndpointDataModel-Model) | [RedfishEndpointCollector](#Collector-Class-RedfishEndpointCollector) | [RedfishEndpointAnalyzer](#Data-Analyzer-Class-RedfishEndpointAnalyzer) | | RedfishOemDiagPlugin | Redfish LogService.CollectDiagnosticData for each entry in collection_args.oem_diagnostic_types (collection_args.log_service_path selects the LogService).
Optional binary archives under the plugin log path when log_path is set. | Summarizes success/failure per OEM diagnostic type from collected results.
When analysis_args.require_all_success is true, fails the run if any type failed collection.
**Analyzer Args:**
- `require_all_success`: bool — If True, analysis fails when any OEM type collection failed. | **Collection Args:**
- `log_service_path`: str — Redfish path to the LogService (e.g. DiagLogs).
- `oem_diagnostic_types_allowable`: Optional[list[str]] — Allowable OEM diagnostic types for this architecture/BMC. When set, used for validation and as default for oem_diagno...
- `oem_diagnostic_types`: list[str] — OEM diagnostic types to collect. When empty and oem_diagnostic_types_allowable is set, defaults to that list.
- `task_timeout_s`: int — Max seconds to wait for each BMC task. | [RedfishOemDiagDataModel](#RedfishOemDiagDataModel-Model) | [RedfishOemDiagCollector](#Collector-Class-RedfishOemDiagCollector) | [RedfishOemDiagAnalyzer](#Data-Analyzer-Class-RedfishOemDiagAnalyzer) | # Collectors +## Collector Class GenericCollectionCollector + +### Description + +Run user-configured shell commands and report per-command success. + +**Bases**: ['InBandDataCollector'] + +**Link to code**: [generic_collection_collector.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/generic_collection/generic_collection_collector.py) + +### Class Variables + +- **SUPPORTED_OS_FAMILY**: `{, , }` + +### Provides Data + +GenericCollectionDataModel + +### Documented collection + +- Runs each command from collection_args.commands on the target (in-band host or BMC over OOB SSH). +- Commands are user-configured; there are no fixed CMD_* class fields. + ## Collector Class AmdSmiCollector ### Description @@ -998,8 +1023,8 @@ RedfishEndpointDataModel ### Documented collection - Redfish GET: explicit paths from collection_args.uris (parallel when max_workers>1). -- Optional paged GET following Members@odata.nextLink when follow_next_link is true. -- Redfish GET tree: when discover_tree is true, walks from api_root using @odata.id / Members links (depth and endpoint caps from collection_args). +- Optional paged GET following the Members collection OData nextLink field when follow_next_link is true. +- Redfish GET tree: when discover_tree is true, walks from api_root using OData resource id links and Members navigation (depth and endpoint caps from collection_args). ## Collector Class RedfishOemDiagCollector @@ -1022,6 +1047,20 @@ RedfishOemDiagDataModel # Data Models +## GenericCollectionDataModel Model + +### Description + +Results for each command configured in collection_args. 
+ +**Link to code**: [generic_collection_data.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/generic_collection/generic_collection_data.py) + +**Bases**: ['DataModel'] + +### Model annotations and fields + +- **results**: `list[nodescraper.plugins.generic_collection.generic_collection_data.CommandCollectionResult]` + ## AmdSmiDataModel Model ### Description @@ -1512,6 +1551,16 @@ Collected Redfish OEM diagnostic log results: OEM type -> result (success, error # Data Analyzers +## Data Analyzer Class GenericAnalyzer + +### Description + +Validate generic collection command results against analysis_args checks. + +**Bases**: ['DataAnalyzer'] + +**Link to code**: [generic_analyzer.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/generic_collection/generic_analyzer.py) + ## Data Analyzer Class AmdSmiAnalyzer ### Description @@ -1931,6 +1980,16 @@ Analyzes Redfish OEM diagnostic log collection results. # Analyzer Args +## Analyzer Args Class GenericAnalyzerArgs + +**Bases**: ['AnalyzerArgs'] + +**Link to code**: [analyzer_args.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/generic_collection/analyzer_args.py) + +### Annotations / fields + +- **checks**: `list[nodescraper.plugins.generic_collection.analyzer_args.CommandCheck]` — Per-command validation rules keyed by collected command name. + ## Analyzer Args Class AmdSmiAnalyzerArgs **Bases**: ['AnalyzerArgs'] @@ -2149,6 +2208,22 @@ Arguments for PCIe analyzer - **max_kfd_processes**: `int` — Maximum allowed number of KFD (Kernel Fusion Driver) processes; 0 disables the check. - **max_cpu_usage**: `float` — Maximum allowed CPU usage (percent) for process checks. +## Analyzer Args Class RegexSearchAnalyzerArgs + +### Description + +Arguments for RegexSearchAnalyzer (dict items match Dmesg-style error_regex). 
+ +**Bases**: ['AnalyzerArgs'] + +**Link to code**: [analyzer_args.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/inband/regex_search/analyzer_args.py) + +### Annotations / fields + +- **error_regex**: `Optional[list[dict[str, Any]]]` — Regex patterns to search for; each dict may include regex (str), message, event_category, event_priority (same as Dmesg analyzer error_regex). +- **interval_to_collapse_event**: `int` — Seconds within which repeated events are collapsed into one. +- **num_timestamps**: `int` — Number of timestamps to include per event in output. + ## Analyzer Args Class RocmAnalyzerArgs **Bases**: ['AnalyzerArgs'] diff --git a/docs/generate_plugin_doc_bundle.py b/docs/generate_plugin_doc_bundle.py index 96042329..4d873ca5 100644 --- a/docs/generate_plugin_doc_bundle.py +++ b/docs/generate_plugin_doc_bundle.py @@ -42,10 +42,13 @@ LINK_BASE_DEFAULT = "https://github.com/amd/node-scraper/blob/HEAD/" REL_ROOT_DEFAULT = "nodescraper/plugins/inband" -# Default packages scanned for plugin tables (IB: full inband tree; OOB: ooband). -PACKAGE_IB_INBAND = "nodescraper.plugins.inband" -PACKAGE_OOB = "nodescraper.plugins.ooband" -DEFAULT_PACKAGES = (PACKAGE_IB_INBAND, PACKAGE_OOB) +# Import and document every concrete plugin under nodescraper.plugins (inband, ooband, +# generic_collection, regex_search, serviceability, …). +PACKAGE_PLUGINS_ROOT = "nodescraper.plugins" +# ``plugins_for_package_prefix`` matches on ``cls.__module__``; keep the trailing dot so +# ``nodescraper.plugins`` itself does not match every module starting with that string. +PLUGIN_MODULE_PREFIX = f"{PACKAGE_PLUGINS_ROOT}." 
+DEFAULT_PACKAGES = (PACKAGE_PLUGINS_ROOT,) def get_attr(obj: Any, name: str, default: Any = None) -> Any: @@ -184,7 +187,7 @@ def find_inband_plugin_base(): def find_oob_plugin_bases() -> tuple[type, ...]: - """Return OOB plugin base classes under ``nodescraper.plugins.ooband`` (Redfish + BMC SSH).""" + """Return OOB plugin base classes (Redfish + BMC SSH) used to discover OOB plugins.""" base_mod = importlib.import_module("nodescraper.base") oob = get_attr(base_mod, "OOBandDataPlugin") oob_ssh = get_attr(base_mod, "OOBSSHDataPlugin") @@ -882,7 +885,7 @@ def main(): root = dotted_from_path(root_path) normalized_extra.append(root) - # Always import core plugin trees so IB/OOB tables are complete; append optional extras. + # Always import the full nodescraper.plugins tree; append optional extras. to_import: List[str] = [] seen_pkg: set[str] = set() for pkg in list(DEFAULT_PACKAGES) + normalized_extra: @@ -897,11 +900,11 @@ def main(): oob_bases = find_oob_plugin_bases() ib_plugins = sorted( - plugins_for_package_prefix((inband_base,), PACKAGE_IB_INBAND), + plugins_for_package_prefix((inband_base,), PLUGIN_MODULE_PREFIX), key=lambda c: f"{c.__module__}.{c.__name__}".lower(), ) oob_plugins = sorted( - plugins_for_package_prefix(oob_bases, PACKAGE_OOB), + plugins_for_package_prefix(oob_bases, PLUGIN_MODULE_PREFIX), key=lambda c: f"{c.__module__}.{c.__name__}".lower(), ) plugins = sorted( From 9ac384ddbc9cf9a075b896ad6d96ecb537074c12 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 17 Jun 2026 11:51:46 -0500 Subject: [PATCH 18/39] Parse AFID from Oem.AMDFieldIdentifiers[]+ resolve the serviceable unit from Links.OriginOfCondition so OpenBMC UBB log entries feed hub correctly. 
--- .../plugins/serviceability/afid_events.py | 50 +++++++- .../plugin/test_afid_events_bmc_schema.py | 110 ++++++++++++++++++ 2 files changed, 157 insertions(+), 3 deletions(-) create mode 100644 test/unit/plugin/test_afid_events_bmc_schema.py diff --git a/nodescraper/plugins/serviceability/afid_events.py b/nodescraper/plugins/serviceability/afid_events.py index 2138c0cf..a84af503 100644 --- a/nodescraper/plugins/serviceability/afid_events.py +++ b/nodescraper/plugins/serviceability/afid_events.py @@ -100,10 +100,33 @@ def _extract_afid(payload: dict[str, Any]) -> Optional[int]: oem = payload.get("Oem") if isinstance(oem, dict): for vendor_payload in oem.values(): - if isinstance(vendor_payload, dict): + found = _extract_afid_from_oem_fragment(vendor_payload) + if found is not None: + return found + return None + + +def _extract_afid_from_oem_fragment(vendor_payload: Any) -> Optional[int]: + """Resolve AFID from one ``Oem`` property value (dict or list of dicts, e.g. ``AMDFieldIdentifiers``).""" + if isinstance(vendor_payload, dict): + for key in _AFID_KEYS: + if key in vendor_payload and vendor_payload[key] is not None: + return int(vendor_payload[key]) + elif isinstance(vendor_payload, list): + for item in vendor_payload: + if isinstance(item, dict): for key in _AFID_KEYS: - if key in vendor_payload and vendor_payload[key] is not None: - return int(vendor_payload[key]) + if key in item and item[key] is not None: + return int(item[key]) + return None + + +def _origin_dict_to_unit(value: Any) -> Optional[str]: + if not isinstance(value, dict): + return None + odata_id = value.get("@odata.id") or value.get("odata.id") + if odata_id: + return _unit_from_odata_id(str(odata_id)) return None @@ -119,6 +142,18 @@ def _extract_serviceable_unit(payload: dict[str, Any]) -> Optional[str]: text = str(value).strip() if text: return _unit_from_odata_id(text) if "/" in text else text + + links = payload.get("Links") or payload.get("links") + if isinstance(links, dict): + ooc = 
( + links.get("OriginOfCondition") + or links.get("originOfCondition") + or links.get("OriginofCondition") + ) + unit = _origin_dict_to_unit(ooc) + if unit: + return unit + oem = payload.get("Oem") if isinstance(oem, dict): for vendor_payload in oem.values(): @@ -128,6 +163,15 @@ def _extract_serviceable_unit(payload: dict[str, Any]) -> Optional[str]: ) if unit is not None and str(unit).strip(): return str(unit).strip() + elif isinstance(vendor_payload, list): + for item in vendor_payload: + if not isinstance(item, dict): + continue + su = item.get("ServiceableUnits") or item.get("serviceable_units") + if isinstance(su, list) and su: + u = _origin_dict_to_unit(su[0]) + if u: + return u return None diff --git a/test/unit/plugin/test_afid_events_bmc_schema.py b/test/unit/plugin/test_afid_events_bmc_schema.py new file mode 100644 index 00000000..7c54364f --- /dev/null +++ b/test/unit/plugin/test_afid_events_bmc_schema.py @@ -0,0 +1,110 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +############################################################################### +"""AFID / serviceable unit extraction for OpenBMC-style LogEntry payloads.""" +from __future__ import annotations + +from nodescraper.plugins.serviceability.afid_events import ( + _afid_event_from_rf_member, + build_afid_events_from_data, +) +from nodescraper.plugins.serviceability.serviceability_data import ( + ServiceabilityDataModel, +) + +# Shape from after_clear_rma_case.json: AFID under Oem.AMDFieldIdentifiers[], OOC under Links. 
+_SAMPLE_LOG_ENTRY = { + "@odata.id": "/redfish/v1/Systems/UBB/LogServices/EventLog/Entries/1", + "Created": "2026-06-16T20:25:22+00:00", + "Id": "1", + "Links": { + "OriginOfCondition": { + "@odata.id": "/redfish/v1/Chassis/OAM_7", + } + }, + "Oem": { + "AMDFieldIdentifiers": [ + { + "AFID": 22, + "Description": "On-die ECC, Uncorrected, Non-fatal", + "ServiceableUnits": [ + {"@odata.id": "/redfish/v1/Chassis/OAM_7"}, + ], + "ServiceableUnits@odata.count": 1, + } + ], + "AMDFieldIdentifiers@Members.count": 1, + }, +} + + +def test_afid_event_from_openbmc_log_entry_with_links_and_amd_field_identifiers(): + ev = _afid_event_from_rf_member(_SAMPLE_LOG_ENTRY) + assert ev is not None + assert ev.afid == 22 + assert ev.serviceable_unit == "OAM_7" + assert "2026-06-16" in ev.time + + +def test_serviceable_unit_from_oem_serviceable_units_when_no_links(): + member = { + "Created": "2026-06-16T20:25:22+00:00", + "Oem": { + "AMDFieldIdentifiers": [ + { + "AFID": 23, + "ServiceableUnits": [ + {"@odata.id": "/redfish/v1/Chassis/OAM_3"}, + ], + } + ], + }, + } + ev = _afid_event_from_rf_member(member) + assert ev is not None + assert ev.afid == 23 + assert ev.serviceable_unit == "OAM_3" + + +# Minimal slice of smci350 command_artifacts.json first CPER row (Links + AMDFieldIdentifiers[]). 
+_SMCI350_STYLE_ENTRY = { + "Created": "2026-06-16T18:53:21+00:00", + "Id": "1", + "Links": { + "OriginOfCondition": {"@odata.id": "/redfish/v1/Chassis/OAM_2"}, + }, + "Oem": { + "AMDFieldIdentifiers": [ + { + "AFID": 25, + "Description": "All Other HBM, Fatal", + "ServiceableUnits": [{"@odata.id": "/redfish/v1/Chassis/OAM_2"}], + "ServiceableUnits@odata.count": 1, + } + ], + "AMDFieldIdentifiers@Members.count": 1, + }, +} + + +def test_afid_event_smci350_style_fatal_hbm_entry(): + ev = _afid_event_from_rf_member(_SMCI350_STYLE_ENTRY) + assert ev is not None + assert ev.afid == 25 + assert ev.serviceable_unit == "OAM_2" + + +def test_build_afid_events_from_data_includes_openbmc_entries(): + data = ServiceabilityDataModel( + rf_events=[_SAMPLE_LOG_ENTRY, _SMCI350_STYLE_ENTRY], + cper_data={}, + ) + events = build_afid_events_from_data(data) + assert len(events) == 2 + by_afid_oam = {(e.afid, e.serviceable_unit) for e in events} + assert (22, "OAM_7") in by_afid_oam + assert (25, "OAM_2") in by_afid_oam From e593159025aeaadf8c89cae89061d86b7b171677 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Wed, 17 Jun 2026 13:47:06 -0500 Subject: [PATCH 19/39] pyproject.toml updates --- pyproject.toml | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1d40c1a8..9e24d056 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,12 +1,23 @@ [project] name = "amd-node-scraper" dynamic = ["version"] -description = "A framework for automated error detection and data collection" +description = "Automated data collection and analysis for system debug." 
authors = [{ name = "AMD" }] readme = "README.md" requires-python = ">=3.9" +license = { text = "MIT" } -keywords = [] +keywords = [ + "amd", + "debug", + "diagnostics", + "dmesg", + "redfish", + "scraping", + "systems", + "oob", + "in-band", +] classifiers = ["Topic :: Software Development"] @@ -36,8 +47,10 @@ dev = [ [project.urls] homepage = "https://github.com/amd/node-scraper" -documentation = "https://github.com/amd/node-scraper" +documentation = "https://github.com/amd/node-scraper/blob/main/docs/PLUGIN_DOC.md" repository = "https://github.com/amd/node-scraper" +changelog = "https://github.com/amd/node-scraper/releases" +issues = "https://github.com/amd/node-scraper/issues" [build-system] requires = ["setuptools==78.1.1", "setuptools-scm==8.1.0"] From 03d0d1a1a5f29dbdc0f694dc362bf22a2e99c730 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Thu, 18 Jun 2026 16:27:07 -0500 Subject: [PATCH 20/39] updates --- .../serviceability/mi3xx/mi3xx_analyzer.py | 20 +- .../plugins/serviceability/se_adapter.py | 178 +++++++++++++++++- .../plugins/serviceability/se_models.py | 13 +- .../serviceability_collector.py | 77 +++++++- 4 files changed, 273 insertions(+), 15 deletions(-) diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py index 0424e8e2..931366df 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py @@ -25,7 +25,9 @@ ############################################################################### from __future__ import annotations -from typing import Optional +from typing import Any, ClassVar, Optional + +from pydantic import BaseModel, Field from nodescraper.enums import ExecutionStatus from nodescraper.interfaces import DataAnalyzer @@ -46,6 +48,14 @@ ) +class AfidSagMetadataArtifact(BaseModel): + """Hub AFID_SAG metadata snapshot; written to ``afid_sag_metadata.json``.""" + + ARTIFACT_LOG_BASENAME: 
ClassVar[str] = "afid_sag_metadata" + + metadata: dict[str, Any] = Field(default_factory=dict) + + class MI3XXAnalyzer(DataAnalyzer[ServiceabilityDataModel, ServiceabilityAnalyzerArgs]): """Build AFID events from collected data and run the configured service hub.""" @@ -134,6 +144,7 @@ def analyze_data( return self.result data.serviceability = block + self._append_afid_sag_metadata_artifact(block) self._log_serviceability_solutions(block) engine_label = args.engine_display_name or args.engine_python_module self.result.status = ExecutionStatus.OK @@ -154,6 +165,13 @@ def analyze_data( ) return self.result + def _append_afid_sag_metadata_artifact(self, block: ServiceabilityBlock) -> None: + if block.afid_sag_metadata is None: + return + self.result.artifacts.append( + AfidSagMetadataArtifact(metadata=dict(block.afid_sag_metadata)) + ) + def _log_serviceability_solutions(self, block: ServiceabilityBlock) -> None: parent = self.parent or self.__class__.__name__ for line in format_serviceability_solution_lines(block): diff --git a/nodescraper/plugins/serviceability/se_adapter.py b/nodescraper/plugins/serviceability/se_adapter.py index 0e31135a..04321c82 100644 --- a/nodescraper/plugins/serviceability/se_adapter.py +++ b/nodescraper/plugins/serviceability/se_adapter.py @@ -26,11 +26,34 @@ """Map serviceability plugin models to/from Python service hub results.""" from __future__ import annotations +import json from collections import defaultdict -from typing import Any, Optional +from typing import Any, Dict, List, Optional, Tuple from .se_models import AfidEvent, ServiceabilityBlock, ServiceabilitySolution +# Hub payload keys commonly holding a one-line human summary (not raw OEM metadata). +_SUMMARY_VALUE_KEYS: Tuple[str, ...] = ( + "short_service", + "short_service_info", + "summary", + "message", + "title", + "recommendation", + "solution", + "service_recommendation", + "action", +) +_UNIT_LABEL_KEYS: Tuple[str, ...] 
= ( + "oem", + "OEM", + "unit", + "serviceable_unit", + "designation", + "chassis", + "device", +) + def _hub_version_display(version_info: Any) -> Optional[str]: """Pick a single hub version string from common hub result version dict layouts.""" @@ -78,9 +101,151 @@ def _afid_sag_file_version_display(metadata: Any) -> Optional[str]: return ", ".join(parts) +def _human_summary_line_from_hub_value(value: Any) -> Optional[str]: + """Pick a single human-readable line from a hub fragment (string, number, or dict).""" + if value is None: + return None + if isinstance(value, str): + text = value.strip() + return text or None + if isinstance(value, (int, float)) and not isinstance(value, bool): + return str(value).strip() or None + if isinstance(value, dict): + for key in _SUMMARY_VALUE_KEYS: + if key not in value: + continue + got = _human_summary_line_from_hub_value(value[key]) + if got: + return got + for key in ("service_action", "ServiceAction"): + if key not in value: + continue + raw = value[key] + if isinstance(raw, dict): + inner = ( + raw.get("title") + or raw.get("text") + or raw.get("name") + or raw.get("service_action") + ) + if isinstance(inner, str) and inner.strip(): + return inner.strip() + got = _human_summary_line_from_hub_value(raw) + if got: + return got + else: + s = str(raw).strip() + if s: + return s + for alt in ("text", "name", "description", "details"): + if isinstance(value.get(alt), str) and str(value[alt]).strip(): + return str(value[alt]).strip() + return None + text = str(value).strip() + return text or None + + +def _unit_label_from_short_service_item(item: dict[str, Any]) -> str: + for key in _UNIT_LABEL_KEYS: + raw = item.get(key) + if raw is not None and str(raw).strip(): + return str(raw).strip() + return "" + + +def _maybe_unwrap_outer_unit_map(d: dict[str, Any]) -> dict[str, Any]: + """If the hub wraps {wrapper: {unit: {...}}}, return the inner unit map.""" + if len(d) != 1: + return d + _, inner = next(iter(d.items())) + if 
isinstance(inner, dict) and inner and all(isinstance(v, dict) for v in inner.values()): + return inner + return d + + +def _merged_short_service_lines_from_unit_messages(entries: List[Tuple[str, str]]) -> List[str]: + """Group (unit, message) rows by message; merge units when the message is identical.""" + by_message: dict[str, list[str]] = defaultdict(list) + for unit, msg in entries: + if not msg: + continue + by_message[msg].append(unit or "") + + lines: list[str] = [] + for msg in sorted(by_message.keys(), key=lambda m: (-len(by_message[m]), m.lower())): + units = sorted({u for u in by_message[msg] if u}) + if len(units) <= 1: + u = units[0] if units else "" + lines.append(f"{msg} ({u})" if u else msg) + else: + lines.append(f"{msg} — OEMs/units: {', '.join(units)}") + return lines + + +def _format_short_service_info_for_block(raw: Any) -> Optional[str]: + """Turn hub ``short_service_info`` into multiline log/LLM text (no JSON dump of unit maps).""" + if raw is None: + return None + if isinstance(raw, str): + text = raw.strip() + return text or None + if isinstance(raw, (list, tuple)): + if raw and all(isinstance(x, dict) for x in raw): + entries: list[tuple[str, str]] = [] + for item in raw: + assert isinstance(item, dict) + unit = _unit_label_from_short_service_item(item) + msg = _human_summary_line_from_hub_value( + item + ) or _human_summary_line_from_hub_value(item.get("short_service_info")) + if msg: + entries.append((unit, msg)) + lines = _merged_short_service_lines_from_unit_messages(entries) + out = "\n".join(lines).strip() + return out or None + parts = [str(x).strip() for x in raw if x is not None and str(x).strip()] + return "\n".join(parts) if parts else None + if isinstance(raw, dict): + d = _maybe_unwrap_outer_unit_map(raw) + if d and all(isinstance(v, dict) for v in d.values()): + entries = [] + for unit_key, inner in d.items(): + msg = _human_summary_line_from_hub_value(inner) + if msg: + entries.append((str(unit_key).strip(), msg)) + lines = 
_merged_short_service_lines_from_unit_messages(entries) + out = "\n".join(lines).strip() + if out: + return out + flat_lines: list[str] = [] + for key in sorted(d.keys(), key=lambda x: str(x).lower()): + val = d[key] + if isinstance(val, dict): + msg = _human_summary_line_from_hub_value(val) + if msg: + flat_lines.append(f"{key}: {msg}") + elif val is not None and str(val).strip(): + flat_lines.append(f"{key}: {str(val).strip()}") + if flat_lines: + return "\n".join(flat_lines) + try: + compact = json.dumps(d, sort_keys=True) + except TypeError: + compact = str(d) + compact = compact.strip() + return compact or None + text = str(raw).strip() + return text or None + + def format_serviceability_solution_lines(block: ServiceabilityBlock) -> list[str]: """Human-readable lines for logging or console output.""" lines: list[str] = [] + if block.short_service_info: + lines.append("short_service_info:") + for part in block.short_service_info.splitlines(): + lines.append(f" {part}" if part else " ") + lines.append("") if block.solution_reasoning: lines.append(block.solution_reasoning) if block.hub_version: @@ -107,7 +272,7 @@ def serviceability_block_from_service_result( engine_label: str = "Service hub", rf_event_count: int = 0, ) -> ServiceabilityBlock: - """Build a :class:`ServiceabilityBlock` from a hub result with ``service_info``.""" + """Build a ``ServiceabilityBlock`` from a hub result with ``service_info``.""" grouped: dict[tuple[int, int], list[str]] = defaultdict(list) titles: dict[tuple[int, int], str] = {} service_info = getattr(result, "service_info", None) or {} @@ -151,7 +316,8 @@ def _action_title(info: dict[str, Any]) -> str: ) for (afid, san), units in sorted(grouped.items()) ] - metadata = getattr(result, "afid_sag_metadata", None) or {} + raw_metadata = getattr(result, "afid_sag_metadata", None) + metadata: Dict[str, Any] = raw_metadata if isinstance(raw_metadata, dict) else {} version_info = ( getattr(result, "engine_version_info", None) or 
getattr(result, "isa_version_info", None) @@ -161,10 +327,16 @@ def _action_title(info: dict[str, Any]) -> str: hub_version = _hub_version_display(version_info) afid_sag_file_version = _afid_sag_file_version_display(metadata) reasoning = f"{engine_label}: {len(solutions)} recommendation(s) from {rf_event_count} Redfish event(s)." + meta_out: Optional[dict[str, Any]] = dict(metadata) if isinstance(raw_metadata, dict) else None + short_service_info = _format_short_service_info_for_block( + getattr(result, "short_service_info", None) + ) return ServiceabilityBlock( afid_events=list(afid_events), solution=solutions, solution_reasoning=reasoning, hub_version=hub_version, afid_sag_file_version=afid_sag_file_version, + afid_sag_metadata=meta_out, + short_service_info=short_service_info, ) diff --git a/nodescraper/plugins/serviceability/se_models.py b/nodescraper/plugins/serviceability/se_models.py index 60c34083..8a3f50f3 100644 --- a/nodescraper/plugins/serviceability/se_models.py +++ b/nodescraper/plugins/serviceability/se_models.py @@ -25,7 +25,7 @@ ############################################################################### from __future__ import annotations -from typing import List, Optional +from typing import Any, List, Optional from pydantic import BaseModel, Field, field_validator @@ -89,3 +89,14 @@ class ServiceabilityBlock(BaseModel): default=None, description="AFID_SAG.json identity/revision string when the hub returned metadata.", ) + afid_sag_metadata: Optional[dict[str, Any]] = Field( + default=None, + description="Hub-reported AFID_SAG metadata dict when the engine exposes afid_sag_metadata.", + ) + short_service_info: Optional[str] = Field( + default=None, + description=( + "Brief hub summary derived from short_service_info (human-readable lines; " + "per-unit dict payloads are collapsed, identical messages merged with unit lists)." 
+ ), + ) diff --git a/nodescraper/plugins/serviceability/serviceability_collector.py b/nodescraper/plugins/serviceability/serviceability_collector.py index 3278c113..0ad28643 100644 --- a/nodescraper/plugins/serviceability/serviceability_collector.py +++ b/nodescraper/plugins/serviceability/serviceability_collector.py @@ -26,17 +26,50 @@ from __future__ import annotations import abc -from typing import Any, Generic, Optional, Protocol, TypeVar, cast +from typing import Any, ClassVar, Generic, Literal, Optional, Protocol, TypeVar, cast from urllib.parse import urlparse +from pydantic import BaseModel, Field + from nodescraper.base import RedfishDataCollector -from nodescraper.connection.redfish import RF_MEMBERS, RF_MEMBERS_COUNT +from nodescraper.connection.redfish import ( + RF_MEMBERS, + RF_MEMBERS_COUNT, + RedfishGetResult, +) from nodescraper.enums import ExecutionStatus from nodescraper.models import CollectorArgs, TaskResult from .serviceability_data import DeviceInfo, ServiceabilityDataModel +class ServiceabilityUriManifestArtifact(BaseModel): + """Resolved Redfish URIs for this serviceability run (``serviceability_uri_manifest.json``).""" + + ARTIFACT_LOG_BASENAME: ClassVar[str] = "serviceability_uri_manifest" + + artifact_kind: Literal["ServiceabilityUriManifest"] = "ServiceabilityUriManifest" + event_log_uri: str + assembly_get_uris: list[str] = Field(default_factory=list) + firmware_inventory_uri: Optional[str] = None + + +class FirmwareInventoryArtifact(BaseModel): + """Firmware inventory Redfish GET; written to ``firmware_inventory.json`` with path, success, data, error, and status_code fields (same layout as a Redfish GET artifact row).""" + + ARTIFACT_LOG_BASENAME: ClassVar[str] = "firmware_inventory" + + path: str + success: bool + data: Optional[dict[str, Any]] = None + error: Optional[str] = None + status_code: Optional[int] = None + + @classmethod + def from_redfish_get(cls, res: RedfishGetResult) -> FirmwareInventoryArtifact: + return 
cls.model_validate(res.model_dump(mode="python")) + + class _ServiceabilityCollectArg(Protocol): follow_next_link: bool max_pages: int @@ -98,7 +131,7 @@ def extract_component_details( def _fetch_event_log(self, args: TServiceabilityCollectArg, uri: str): if args.follow_next_link: - return self._run_redfish_get_paged(uri, max_pages=args.max_pages) + return self._run_redfish_get_paged(uri, max_pages=args.max_pages, log_artifact=True) return self._run_redfish_get(uri, log_artifact=True) def collect_data( @@ -111,6 +144,11 @@ def collect_data( svc_args = cast(TServiceabilityCollectArg, args) event_uri = svc_args.resolved_event_log_uri() + self.logger.info( + "Serviceability: event log Redfish URI %s (follow_next_link=%s)", + event_uri, + svc_args.follow_next_link, + ) if svc_args.top is not None: res = self._fetch_top(svc_args, svc_args.top, svc_args.max_pages) else: @@ -134,11 +172,18 @@ def collect_data( return self.result, None assembly_info: dict[str, DeviceInfo] = {} + assembly_get_uris: list[str] = [] tpl = svc_args.rf_assembly_uri_template devices = svc_args.rf_chassis_devices if tpl and devices: for device in devices: uri_asm = tpl.format(device=device) + assembly_get_uris.append(uri_asm) + self.logger.info( + "Serviceability: assembly Redfish GET %s (chassis designation=%s)", + uri_asm, + device, + ) assembly_res = self._run_redfish_get(uri_asm, log_artifact=True) if not assembly_res.success or assembly_res.data is None: continue @@ -153,31 +198,43 @@ def collect_data( cper_raw = self.collect_cper_attachments(filtered_members or []) + component_details, firmware_uri_used = self._fetch_component_details(responses, svc_args) + data = ServiceabilityDataModel( responses=responses, rf_events=filtered_members or [], assembly_info=assembly_info, cper_raw=cper_raw, - component_details=self._fetch_component_details(responses, svc_args), + component_details=component_details, log_path=self._log_path, bmc_host=bmc_host, ) + self.result.artifacts.append( + 
ServiceabilityUriManifestArtifact( + event_log_uri=event_uri, + assembly_get_uris=assembly_get_uris, + firmware_inventory_uri=firmware_uri_used, + ) + ) self.result.status = ExecutionStatus.OK self.result.message = f"Collected {len(members)} event log member(s)" return self.result, data def _fetch_component_details( self, responses: dict[str, Any], args: TServiceabilityCollectArg - ) -> Optional[str]: + ) -> tuple[Optional[str], Optional[str]]: + """Return ``(component_details, firmware_uri)``; firmware_uri is set when a GET was attempted.""" fw_uri = args.rf_firmware_bundle_uri if not fw_uri or not str(fw_uri).strip(): - return None + return None, None fw_uri = str(fw_uri).strip() - fw_res = self._run_redfish_get(fw_uri, log_artifact=True) + self.logger.info("Serviceability: firmware inventory Redfish GET %s", fw_uri) + fw_res = self._run_redfish_get(fw_uri, log_artifact=False) + self.result.artifacts.append(FirmwareInventoryArtifact.from_redfish_get(fw_res)) if not fw_res.success or fw_res.data is None: - return None + return None, fw_uri responses[fw_res.path] = fw_res.data - return self.extract_component_details(fw_res.data, args) + return self.extract_component_details(fw_res.data, args), fw_uri def _fetch_top(self, args: TServiceabilityCollectArg, top: int, max_pages: int): event_uri = args.resolved_event_log_uri() @@ -193,5 +250,5 @@ def _fetch_top(self, args: TServiceabilityCollectArg, top: int, max_pages: int): skip = count - top skip_uri = f"{event_uri}?$skip={skip}" if args.follow_next_link: - return self._run_redfish_get_paged(skip_uri, max_pages=max_pages) + return self._run_redfish_get_paged(skip_uri, max_pages=max_pages, log_artifact=True) return self._run_redfish_get(skip_uri, log_artifact=True) From 0bbbb376bec1d507c9d144784f79dfd7780c6c58 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 19 Jun 2026 10:53:44 -0500 Subject: [PATCH 21/39] embedding task result hooks from invocation --- nodescraper/cli/cli.py | 6 ++++ nodescraper/cli/embed.py 
| 19 +++++++++-- nodescraper/cli/invocation.py | 11 +++++- nodescraper/pluginexecutor.py | 12 ++++++- test/unit/cli/test_cli_embed_api.py | 7 +++- test/unit/framework/test_plugin_executor.py | 37 +++++++++++++++++++-- 6 files changed, 84 insertions(+), 8 deletions(-) diff --git a/nodescraper/cli/cli.py b/nodescraper/cli/cli.py index 129ef136..2043a152 100644 --- a/nodescraper/cli/cli.py +++ b/nodescraper/cli/cli.py @@ -32,6 +32,7 @@ import platform import sys import uuid +from collections.abc import Sequence from typing import Optional import nodescraper @@ -64,6 +65,7 @@ from nodescraper.connection.redfish.redfish_params import RedfishConnectionParams from nodescraper.constants import DEFAULT_LOGGER from nodescraper.enums import ExecutionStatus, SystemInteractionLevel, SystemLocation +from nodescraper.interfaces import TaskResultHook from nodescraper.models import SystemInfo from nodescraper.pluginexecutor import PluginExecutor from nodescraper.pluginregistry import PluginRegistry @@ -461,6 +463,7 @@ def main( arg_input: Optional[list[str]] = None, *, host_cli_args: Optional[argparse.Namespace] = None, + embed_default_task_result_hooks: Optional[Sequence[TaskResultHook]] = None, ): """Main entry point for the CLI @@ -468,6 +471,8 @@ def main( arg_input (Optional[list[str]], optional): list of args to parse. Defaults to None. host_cli_args: Optional namespace from an embedding host (e.g. detect-errors) for code that calls get_plugin_run_invocation during the plugin queue. + embed_default_task_result_hooks: Optional hooks prepended for embedded runs (see + :func:`nodescraper.cli.embed.run_cli_return_code`). 
""" if arg_input is None: arg_input = sys.argv[1:] @@ -642,6 +647,7 @@ def main( timestamp=timestamp, sname=sname, host_cli_args=host_cli_args, + embed_default_task_result_hooks=embed_default_task_result_hooks, session_id=str(uuid.uuid4()), ) diff --git a/nodescraper/cli/embed.py b/nodescraper/cli/embed.py index 60d94515..f82f0f53 100644 --- a/nodescraper/cli/embed.py +++ b/nodescraper/cli/embed.py @@ -27,9 +27,11 @@ from __future__ import annotations import argparse +from collections.abc import Sequence from typing import Optional from nodescraper.cli.cli import get_cli_top_level_subcommands +from nodescraper.interfaces import TaskResultHook CLI_TOP_LEVEL_SUBCOMMANDS = get_cli_top_level_subcommands() @@ -45,29 +47,38 @@ def run_cli_return_code( argv: list[str], *, host_cli_args: Optional[argparse.Namespace] = None, + embed_default_task_result_hooks: Optional[Sequence[TaskResultHook]] = None, ) -> int: """Run nodescraper in-process; same behavior as :func:`run_main_return_code`. Args: argv: Tokens after the program name. host_cli_args: Optional host namespace forwarded to :func:`nodescraper.cli.cli.main`. + embed_default_task_result_hooks: Optional hooks prepended to every plugin run and connection + manager for this embed (e.g. host event logging). Merged in :class:`PluginExecutor`. Returns: Integer exit code (``SystemExit`` is mapped, not raised). """ - return run_main_return_code(argv, host_cli_args=host_cli_args) + return run_main_return_code( + argv, + host_cli_args=host_cli_args, + embed_default_task_result_hooks=embed_default_task_result_hooks, + ) def run_main_return_code( arg_input: list[str], *, host_cli_args: Optional[argparse.Namespace] = None, + embed_default_task_result_hooks: Optional[Sequence[TaskResultHook]] = None, ) -> int: """Run :func:`nodescraper.cli.cli.main` and map ``SystemExit`` to an exit code. Args: arg_input: Tokens after the program name. host_cli_args: Optional host namespace for embedded runs. 
+ embed_default_task_result_hooks: Optional default task-result hooks for this embed. Returns: Integer exit code. @@ -75,7 +86,11 @@ def run_main_return_code( from nodescraper.cli.cli import main try: - main(arg_input, host_cli_args=host_cli_args) + main( + arg_input, + host_cli_args=host_cli_args, + embed_default_task_result_hooks=embed_default_task_result_hooks, + ) except SystemExit as exc: code = exc.code if code is None: diff --git a/nodescraper/cli/invocation.py b/nodescraper/cli/invocation.py index ee59e4a6..c5d5d853 100644 --- a/nodescraper/cli/invocation.py +++ b/nodescraper/cli/invocation.py @@ -28,11 +28,13 @@ import argparse import logging +from collections.abc import Iterator, Sequence from contextlib import contextmanager from contextvars import ContextVar from dataclasses import dataclass -from typing import Iterator, Optional +from typing import Optional +from nodescraper.interfaces.taskresulthook import TaskResultHook from nodescraper.models import PluginConfig, SystemInfo from nodescraper.models.pluginresult import PluginResult from nodescraper.pluginexecutor import PluginExecutor @@ -72,6 +74,7 @@ class PluginRunInvocation: sname: str host_cli_args: Optional[argparse.Namespace] = None session_id: Optional[str] = None + embed_default_task_result_hooks: tuple[TaskResultHook, ...] = () def run_plugin_queue_with_invocation( @@ -86,8 +89,12 @@ def run_plugin_queue_with_invocation( sname: str, host_cli_args: Optional[argparse.Namespace] = None, session_id: Optional[str] = None, + embed_default_task_result_hooks: Optional[Sequence[TaskResultHook]] = None, ) -> list[PluginResult]: """Constructs the plugin executor, binds invocation context, and runs the plugin queue.""" + embed_hooks_tuple: tuple[TaskResultHook, ...] 
= ( + tuple(embed_default_task_result_hooks) if embed_default_task_result_hooks else () + ) inv = PluginRunInvocation( plugin_reg=plugin_reg, parsed_args=parsed_args, @@ -99,6 +106,7 @@ def run_plugin_queue_with_invocation( sname=sname, host_cli_args=host_cli_args, session_id=session_id, + embed_default_task_result_hooks=embed_hooks_tuple, ) plugin_executor = PluginExecutor( logger=logger, @@ -108,6 +116,7 @@ def run_plugin_queue_with_invocation( log_path=log_path, plugin_registry=plugin_reg, session_id=session_id, + embed_default_task_result_hooks=embed_hooks_tuple, ) with plugin_run_invocation_scope(inv): return plugin_executor.run_queue() diff --git a/nodescraper/pluginexecutor.py b/nodescraper/pluginexecutor.py index 4f3febed..df135301 100644 --- a/nodescraper/pluginexecutor.py +++ b/nodescraper/pluginexecutor.py @@ -30,6 +30,7 @@ import logging import uuid from collections import deque +from collections.abc import Sequence from typing import Optional, Type, Union from pydantic import BaseModel @@ -38,6 +39,7 @@ from nodescraper.connection.oob_ssh import OobSshConnectionManager from nodescraper.constants import DEFAULT_LOGGER from nodescraper.interfaces import ConnectionManager, DataPlugin, PluginInterface +from nodescraper.interfaces.taskresulthook import TaskResultHook from nodescraper.models import PluginConfig, SystemInfo from nodescraper.models.pluginresult import PluginResult from nodescraper.pluginregistry import PluginRegistry @@ -57,6 +59,7 @@ def __init__( plugin_registry: Optional[PluginRegistry] = None, log_path: Optional[str] = None, session_id: Optional[str] = None, + embed_default_task_result_hooks: Optional[Sequence[TaskResultHook]] = None, ): if logger is None: @@ -89,7 +92,12 @@ def __init__( self.log_path = log_path - self.connection_result_hooks = [] + self.embed_default_task_result_hooks: list[TaskResultHook] = ( + list(embed_default_task_result_hooks) if embed_default_task_result_hooks else [] + ) + + self.connection_result_hooks: 
list[TaskResultHook] = [] + self.connection_result_hooks.extend(self.embed_default_task_result_hooks) if log_path: self.connection_result_hooks.append(FileSystemLogHook(log_base_path=log_path)) @@ -178,6 +186,8 @@ def run_queue(self) -> list[PluginResult]: "log_path": self.log_path, "session_id": self.session_id, } + if self.embed_default_task_result_hooks: + init_payload["task_result_hooks"] = list(self.embed_default_task_result_hooks) if plugin_class.CONNECTION_TYPE: if issubclass(plugin_class, OOBSSHDataPlugin): diff --git a/test/unit/cli/test_cli_embed_api.py b/test/unit/cli/test_cli_embed_api.py index db44f6cf..e260f667 100644 --- a/test/unit/cli/test_cli_embed_api.py +++ b/test/unit/cli/test_cli_embed_api.py @@ -53,7 +53,12 @@ def test_run_cli_return_code_and_run_main_return_code_delegate( ) -> None: calls: list[list[str]] = [] - def fake_main(arg_input: list[str], *, host_cli_args=None) -> None: + def fake_main( + arg_input: list[str], + *, + host_cli_args=None, + embed_default_task_result_hooks=None, + ) -> None: calls.append(list(arg_input)) raise SystemExit(7) diff --git a/test/unit/framework/test_plugin_executor.py b/test/unit/framework/test_plugin_executor.py index fe9a8954..ceec44a2 100644 --- a/test/unit/framework/test_plugin_executor.py +++ b/test/unit/framework/test_plugin_executor.py @@ -24,16 +24,17 @@ # ############################################################################### import pytest -from framework.common.shared_utils import DummyDataModel, MockConnectionManager from pydantic import BaseModel +from framework.common.shared_utils import DummyDataModel, MockConnectionManager from nodescraper.enums import ExecutionStatus from nodescraper.enums.eventpriority import EventPriority from nodescraper.enums.systeminteraction import SystemInteractionLevel -from nodescraper.interfaces import PluginInterface -from nodescraper.models import PluginConfig, PluginResult +from nodescraper.interfaces import PluginInterface, TaskResultHook +from 
nodescraper.models import PluginConfig, PluginResult, TaskResult from nodescraper.pluginexecutor import PluginExecutor from nodescraper.pluginregistry import PluginRegistry +from nodescraper.taskresulthooks import FileSystemLogHook class DummyArgs(BaseModel): @@ -186,3 +187,33 @@ def test_connection_manager_from_plugin_when_not_in_registry(): assert len(results) == 1 assert results[0].source == "testB" assert results[0].status == ExecutionStatus.OK + + +class _CaptureEmbedHook(TaskResultHook): + def process_result(self, task_result: TaskResult, **kwargs) -> None: + pass + + +def test_embed_default_task_result_hooks_order_before_filesystem_log(plugin_registry, tmp_path): + embed = _CaptureEmbedHook() + executor = PluginExecutor( + plugin_configs=[PluginConfig(plugins={"TestPluginB": {}})], + plugin_registry=plugin_registry, + log_path=str(tmp_path), + embed_default_task_result_hooks=[embed], + ) + assert executor.connection_result_hooks[0] is embed + assert isinstance(executor.connection_result_hooks[1], FileSystemLogHook) + + +def test_embed_default_task_result_hooks_reach_connection_manager(plugin_registry, tmp_path): + embed = _CaptureEmbedHook() + executor = PluginExecutor( + plugin_configs=[PluginConfig(plugins={"TestPluginB": {}})], + plugin_registry=plugin_registry, + log_path=str(tmp_path), + embed_default_task_result_hooks=[embed], + ) + executor.run_queue() + cm = next(iter(executor.connection_library.values())) + assert cm.task_result_hooks[0] is embed From 45b2ce577a8afe78567737beddca4eb77b9f9b98 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 19 Jun 2026 12:36:07 -0500 Subject: [PATCH 22/39] avoiding decode calls when cper is decoded with event --- .../serviceability/mi3xx/mi3xx_analyzer.py | 47 ++++++- .../serviceability/mi3xx/mi3xx_collector.py | 11 ++ .../serviceability/mi3xx/mi3xx_cper_utils.py | 117 ++++++++++++++++++ test/unit/plugin/test_mi3xx_collector.py | 80 ++++++++++++ test/unit/plugin/test_mi3xx_cper_utils.py | 108 ++++++++++++++++ 
test/unit/serviceability_dummy_data.py | 23 ++++ 6 files changed, 380 insertions(+), 6 deletions(-) create mode 100644 nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py create mode 100644 test/unit/plugin/test_mi3xx_cper_utils.py create mode 100644 test/unit/serviceability_dummy_data.py diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py index 931366df..e0eab28c 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py @@ -47,6 +47,8 @@ ServiceabilityDataModel, ) +from .mi3xx_cper_utils import RF_CPER_AFID_MIN, should_skip_cper_fetch_or_decode + class AfidSagMetadataArtifact(BaseModel): """Hub AFID_SAG metadata snapshot; written to ``afid_sag_metadata.json``.""" @@ -83,25 +85,35 @@ def analyze_data( parent = self.parent or self.__class__.__name__ cper_data = data.cper_data or {} - if data.cper_raw and not cper_data: + cper_raw_to_decode = self._cper_raw_needing_decode(data) + skipped_cper = len(data.cper_raw or {}) - len(cper_raw_to_decode) + if skipped_cper: + self.logger.info( + "(%s) Skipping CPER decode for %d CPER attachment(s); Redfish log " + "already has usable ACA fields (AFID<%s or no serial on decode)", + parent, + skipped_cper, + RF_CPER_AFID_MIN, + ) + if cper_raw_to_decode and not cper_data: if not args.cper_decode_module: self.logger.warning( "(%s) %d CPER attachment(s) collected but cper_decode_module is " "not set in analysis_args; skipping CPER decode", parent, - len(data.cper_raw), + len(cper_raw_to_decode), ) else: self.logger.info( "(%s) Decoding %d CPER attachment(s) via %s.%s", parent, - len(data.cper_raw), + len(cper_raw_to_decode), args.cper_decode_module, args.cper_decode_method, ) try: cper_data = decode_cper_raw_attachments( - data.cper_raw, + cper_raw_to_decode, cper_decode_module=args.cper_decode_module, cper_decode_method=args.cper_decode_method, logger=self.logger, 
@@ -111,7 +123,7 @@ def analyze_data( "(%s) CPER decode finished: %d of %d attachment(s) decoded", parent, len(cper_data), - len(data.cper_raw), + len(cper_raw_to_decode), ) except CperDecodeError as exc: self.logger.warning( @@ -151,8 +163,10 @@ def analyze_data( cper_summary = "" if cper_data: cper_summary = f", {len(cper_data)} decoded CPER(s)" + elif cper_raw_to_decode: + cper_summary = f", {len(cper_raw_to_decode)} CPER attachment(s) not decoded" elif data.cper_raw: - cper_summary = f", {len(data.cper_raw)} CPER attachment(s) not decoded" + cper_summary = f", {len(data.cper_raw)} CPER attachment(s) omitted (ACA on log entry)" ver_bits: list[str] = [] if block.hub_version: ver_bits.append(f"hub {block.hub_version}") @@ -165,6 +179,27 @@ def analyze_data( ) return self.result + @staticmethod + def _cper_raw_needing_decode(data: ServiceabilityDataModel) -> dict[str, str]: + """Subset of ``cper_raw`` that still needs configured CPER decode (not already on the log).""" + raw = data.cper_raw or {} + if not raw: + return {} + by_id: dict[str, dict[str, Any]] = {} + for member in data.rf_events: + if not isinstance(member, dict): + continue + eid = member.get("Id") + if eid is not None: + by_id[str(eid)] = member + out: dict[str, str] = {} + for event_id, blob in raw.items(): + ev = by_id.get(str(event_id)) + if ev is not None and should_skip_cper_fetch_or_decode(ev): + continue + out[str(event_id)] = blob + return out + def _append_afid_sag_metadata_artifact(self, block: ServiceabilityBlock) -> None: if block.afid_sag_metadata is None: return diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py index 44594aee..8921796c 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py @@ -35,6 +35,7 @@ from nodescraper.plugins.serviceability.time_utils import satisfies_time_check from .mi3xx_collector_args import 
MI3XXCollectorArgs +from .mi3xx_cper_utils import RF_CPER_AFID_MIN, should_skip_cper_fetch_or_decode _EVENT_TIMESTAMP_KEYS = ("Created", "EventTimestamp", "Timestamp") @@ -90,6 +91,16 @@ def collect_cper_attachments(self, rf_events: list[Any]) -> dict[str, str]: if not uri or not event_id: continue + if should_skip_cper_fetch_or_decode(event): + self.logger.info( + "(%s) Skipping CPER attachment fetch for Redfish event %s " + "(ACA decode already on log entry; AFID<%s check or no serial)", + parent, + event_id, + RF_CPER_AFID_MIN, + ) + continue + try: resp = self.connection.get_response(uri) except Exception as exc: # noqa: BLE001 diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py new file mode 100644 index 00000000..fe9661dc --- /dev/null +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py @@ -0,0 +1,117 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +############################################################################### +from __future__ import annotations + +from typing import Any + +# Redfish CPER (RF) style AFIDs start at this value; lower values are in-band / +# OEM-field AFIDs already reflected on the log entry. +RF_CPER_AFID_MIN = 10000 + +_SERIAL_KEYS = ("SerialNumber", "serial_number", "UbbSerial", "ubb_serial") + + +def event_afids_from_oem(event: dict[str, Any]) -> list[int]: + """AFIDs from ``Oem.AMDFieldIdentifiers`` (or similar list-of-dicts).""" + oem = event.get("Oem") + if not isinstance(oem, dict): + return [] + raw = oem.get("AMDFieldIdentifiers") + if not isinstance(raw, list): + return [] + out: list[int] = [] + for item in raw: + if not isinstance(item, dict): + continue + for key in ("AFID", "Afid", "afid"): + if key in item and item[key] is not None: + try: + out.append(int(item[key])) + except (TypeError, ValueError): + pass + break + return out + + +def _err_data_arr_entries(event: dict[str, Any]) -> list[dict[str, Any]]: + oem = event.get("Oem") + if not isinstance(oem, dict): + return [] + arr = oem.get("ErrDataArr") + if not isinstance(arr, list): + return [] + return [e for e in arr if isinstance(e, dict)] + + +def event_has_aca_decode(event: dict[str, Any]) -> bool: + """True when the log entry includes ACA-style ``DecodedData`` under ``ErrDataArr``.""" + for entry in _err_data_arr_entries(event): + decoded = entry.get("DecodedData") + if isinstance(decoded, dict) and decoded: + return True + return False + + +def _nonempty_serial_in_mapping(obj: Any) -> bool: + if not isinstance(obj, dict): + return False + for key in _SERIAL_KEYS: + val = obj.get(key) + if val is not None and str(val).strip(): + return 
True + return False + + +def event_aca_includes_serial(event: dict[str, Any]) -> bool: + """Serial (or UBB serial) present on any ``ErrDataArr`` row (typically ``MetaData``).""" + for entry in _err_data_arr_entries(event): + meta = entry.get("MetaData") + if _nonempty_serial_in_mapping(meta): + return True + decoded = entry.get("DecodedData") + if _nonempty_serial_in_mapping(decoded): + return True + return False + + +def should_skip_cper_fetch_or_decode(event: dict[str, Any]) -> bool: + """Whether to omit CPER binary fetch and configured CPER decode for this Redfish member. + + Skip when: + + * Every OEM-listed AFID is below ``RF_CPER_AFID_MIN`` (non-RF CPER range), + ACA ``DecodedData`` is present, and a serial is present on the entry; or + * ACA ``DecodedData`` is present but no serial — the CPER blob does not add + actionable identity beyond what is already missing from the log. + """ + if not event_has_aca_decode(event): + return False + if not event_aca_includes_serial(event): + return True + afids = event_afids_from_oem(event) + if not afids: + return False + return all(afid < RF_CPER_AFID_MIN for afid in afids) diff --git a/test/unit/plugin/test_mi3xx_collector.py b/test/unit/plugin/test_mi3xx_collector.py index d6aef464..625d1165 100644 --- a/test/unit/plugin/test_mi3xx_collector.py +++ b/test/unit/plugin/test_mi3xx_collector.py @@ -50,6 +50,7 @@ is_valid_iso_datetime, satisfies_time_check, ) +from nodescraper.plugins.serviceability.mi3xx.mi3xx_cper_utils import RF_CPER_AFID_MIN EVENT_URI = DUMMY_EVENT_URI @@ -201,6 +202,85 @@ def test_mi3xx_collector_fetches_cper_attachments(mi3xx_collector, redfish_conn_ assert data.cper_data == {} +def test_mi3xx_collector_skips_cper_when_aca_serial_and_low_afids( + mi3xx_collector, redfish_conn_mock +): + redfish_conn_mock.get_response.reset_mock() + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=True, + data={ + RF_MEMBERS: [ + { + "Id": "cper-evt-skip", + "Created": 
DUMMY_TIMESTAMP_LATER, + "DiagnosticDataType": "CPER", + "AdditionalDataURI": "/redfish/v1/Systems/UBB/LogServices/EventLog/Attachments/1", + "Oem": { + "AMDFieldIdentifiers": [{"AFID": 22}], + "ErrDataArr": [ + { + "DecodedData": {"error_type": "On-die ECC"}, + "MetaData": {"SerialNumber": "692545012569"}, + } + ], + }, + } + ] + }, + status_code=200, + ) + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = mi3xx_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.cper_raw == {} + redfish_conn_mock.get_response.assert_not_called() + + +def test_mi3xx_collector_fetches_cper_when_rf_afid(mi3xx_collector, redfish_conn_mock): + import base64 + from unittest.mock import MagicMock + + redfish_conn_mock.get_response.reset_mock() + redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( + path=EVENT_URI, + success=True, + data={ + RF_MEMBERS: [ + { + "Id": "cper-evt-rf", + "Created": DUMMY_TIMESTAMP_LATER, + "DiagnosticDataType": "CPER", + "AdditionalDataURI": "/redfish/v1/Systems/UBB/LogServices/EventLog/Attachments/2", + "Oem": { + "AMDFieldIdentifiers": [{"AFID": RF_CPER_AFID_MIN}], + "ErrDataArr": [ + { + "DecodedData": {"error_type": "x"}, + "MetaData": {"SerialNumber": "692545012569"}, + } + ], + }, + } + ] + }, + status_code=200, + ) + response = MagicMock() + response.ok = True + response.status_code = 200 + response.content = b"\xaa\xbb" + redfish_conn_mock.get_response.return_value = response + + args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) + result, data = mi3xx_collector.collect_data(args=args) + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.cper_raw["cper-evt-rf"] == base64.b64encode(b"\xaa\xbb").decode("ascii") + redfish_conn_mock.get_response.assert_called_once() + + def test_mi3xx_collector_filters_events_by_reference_time(mi3xx_collector, redfish_conn_mock): redfish_conn_mock.run_get_paged.return_value = 
RedfishGetResult( path=EVENT_URI, diff --git a/test/unit/plugin/test_mi3xx_cper_utils.py b/test/unit/plugin/test_mi3xx_cper_utils.py new file mode 100644 index 00000000..e5de352d --- /dev/null +++ b/test/unit/plugin/test_mi3xx_cper_utils.py @@ -0,0 +1,108 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### +import pytest + +from nodescraper.plugins.serviceability.mi3xx.mi3xx_cper_utils import ( + RF_CPER_AFID_MIN, + event_aca_includes_serial, + event_afids_from_oem, + event_has_aca_decode, + should_skip_cper_fetch_or_decode, +) + +_DUMMY_META_SERIAL = "DUMMY-GPU-SERIAL-0001" +_DUMMY_DECODED_FIELD = "dummy_error_type" + + +def _oem_err_row(*, serial: bool = True, decoded: bool = True): + meta = {"SerialNumber": _DUMMY_META_SERIAL} if serial else {"GpuFw": "dummy-fw"} + dec = {"error_type": _DUMMY_DECODED_FIELD} if decoded else {} + return {"DecodedData": dec, "MetaData": meta} + + +def test_skip_when_afids_below_threshold_and_aca_has_serial(): + event = { + "Oem": { + "AMDFieldIdentifiers": [{"AFID": 22}], + "ErrDataArr": [_oem_err_row()], + } + } + assert event_afids_from_oem(event) == [22] + assert should_skip_cper_fetch_or_decode(event) is True + + +def test_no_skip_when_rf_range_afid_even_with_aca_serial(): + event = { + "Oem": { + "AMDFieldIdentifiers": [{"AFID": RF_CPER_AFID_MIN}], + "ErrDataArr": [_oem_err_row()], + } + } + assert should_skip_cper_fetch_or_decode(event) is False + + +def test_skip_when_aca_decode_without_serial(): + event = { + "Oem": { + "AMDFieldIdentifiers": [{"AFID": RF_CPER_AFID_MIN}], + "ErrDataArr": [_oem_err_row(serial=False)], + } + } + assert event_has_aca_decode(event) is True + assert event_aca_includes_serial(event) is False + assert should_skip_cper_fetch_or_decode(event) is True + + +def test_no_skip_when_no_err_data_decoded(): + event = { + "Oem": { + "AMDFieldIdentifiers": [{"AFID": 22}], + } + } + assert should_skip_cper_fetch_or_decode(event) is False + + +def test_no_skip_when_aca_serial_but_no_afid_list(): + event = { + "Oem": { + "ErrDataArr": [_oem_err_row()], + } + } + assert event_afids_from_oem(event) == [] + assert should_skip_cper_fetch_or_decode(event) is False + + +@pytest.mark.parametrize( + "afids,expect_skip", + [ + ([22, 28], 
True), + ([22, RF_CPER_AFID_MIN], False), + ], +) +def test_skip_requires_all_afids_below_rf_threshold(afids, expect_skip): + identifiers = [{"AFID": a} for a in afids] + event = {"Oem": {"AMDFieldIdentifiers": identifiers, "ErrDataArr": [_oem_err_row()]}} + assert should_skip_cper_fetch_or_decode(event) is expect_skip diff --git a/test/unit/serviceability_dummy_data.py b/test/unit/serviceability_dummy_data.py new file mode 100644 index 00000000..0542c866 --- /dev/null +++ b/test/unit/serviceability_dummy_data.py @@ -0,0 +1,23 @@ +"""Shared dummy values for serviceability unit tests (not production data).""" + +DUMMY_AFID_A = 9001 +DUMMY_AFID_B = 9002 +DUMMY_AFID_C = 9003 +DUMMY_SERVICE_ACTION_NUM = 99 +DUMMY_SERVICE_ACTION_TITLE = "Dummy service action" +DUMMY_UNIT_A = "dummy_unit_a" +DUMMY_UNIT_B = "dummy_unit_b" +DUMMY_UNIT_C = "dummy_unit_c" +DUMMY_DESIGNATION_A = "DUMMY_SLOT_A" +DUMMY_DESIGNATION_B = "DUMMY_SLOT_B" +DUMMY_EVENT_URI = "/redfish/v1/Systems/Dummy/LogServices/DummyEventLog/Entries" +DUMMY_EVENT_URI_ALT = "/redfish/v1/Systems/Dummy/LogServices/DummyEventLog/EntriesAlt" +DUMMY_TIMESTAMP = "2000-01-01T12:00:00+00:00" +DUMMY_TIMESTAMP_EARLIER = "1999-12-31T12:00:00+00:00" +DUMMY_TIMESTAMP_LATER = "2000-01-02T12:00:00+00:00" +DUMMY_RF_EVENT_COUNT = 2 +DUMMY_SAG_PID = "dummy-sag-pid" +DUMMY_SAG_REVISION = "dummy-rev-0" +DUMMY_ENGINE_VERSION = "0.0.0-dummy" +DUMMY_BMC_HOST = "dummy-bmc.example" +DUMMY_OEM_VENDOR = "DummyVendor" From af1dae1cbb12f9ce1c6530096b02a52aab02d6d2 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 19 Jun 2026 12:59:32 -0500 Subject: [PATCH 23/39] updates --- nodescraper/cli/cli.py | 12 ++++---- nodescraper/cli/embed.py | 18 ++++++------ nodescraper/cli/invocation.py | 17 ++++++----- nodescraper/pluginexecutor.py | 16 +++++------ test/unit/cli/test_cli_embed_api.py | 2 +- test/unit/framework/test_plugin_executor.py | 32 ++++++--------------- 6 files changed, 40 insertions(+), 57 deletions(-) diff --git 
a/nodescraper/cli/cli.py b/nodescraper/cli/cli.py index 2043a152..30dc8792 100644 --- a/nodescraper/cli/cli.py +++ b/nodescraper/cli/cli.py @@ -32,7 +32,7 @@ import platform import sys import uuid -from collections.abc import Sequence +from collections.abc import Callable, Sequence from typing import Optional import nodescraper @@ -65,8 +65,8 @@ from nodescraper.connection.redfish.redfish_params import RedfishConnectionParams from nodescraper.constants import DEFAULT_LOGGER from nodescraper.enums import ExecutionStatus, SystemInteractionLevel, SystemLocation -from nodescraper.interfaces import TaskResultHook from nodescraper.models import SystemInfo +from nodescraper.models.pluginresult import PluginResult from nodescraper.pluginexecutor import PluginExecutor from nodescraper.pluginregistry import PluginRegistry @@ -463,7 +463,7 @@ def main( arg_input: Optional[list[str]] = None, *, host_cli_args: Optional[argparse.Namespace] = None, - embed_default_task_result_hooks: Optional[Sequence[TaskResultHook]] = None, + plugin_run_result_hooks: Optional[Sequence[Callable[[PluginResult], None]]] = None, ): """Main entry point for the CLI @@ -471,8 +471,8 @@ def main( arg_input (Optional[list[str]], optional): list of args to parse. Defaults to None. host_cli_args: Optional namespace from an embedding host (e.g. detect-errors) for code that calls get_plugin_run_invocation during the plugin queue. - embed_default_task_result_hooks: Optional hooks prepended for embedded runs (see - :func:`nodescraper.cli.embed.run_cli_return_code`). + plugin_run_result_hooks: Optional callbacks invoked with each plugin's :class:`PluginResult` + after ``run()`` completes (used by embedded hosts such as error-scraper). 
""" if arg_input is None: arg_input = sys.argv[1:] @@ -647,8 +647,8 @@ def main( timestamp=timestamp, sname=sname, host_cli_args=host_cli_args, - embed_default_task_result_hooks=embed_default_task_result_hooks, session_id=str(uuid.uuid4()), + plugin_run_result_hooks=plugin_run_result_hooks, ) log_system_info(log_path, system_info, logger) diff --git a/nodescraper/cli/embed.py b/nodescraper/cli/embed.py index f82f0f53..b1e91c37 100644 --- a/nodescraper/cli/embed.py +++ b/nodescraper/cli/embed.py @@ -27,11 +27,11 @@ from __future__ import annotations import argparse -from collections.abc import Sequence +from collections.abc import Callable, Sequence from typing import Optional from nodescraper.cli.cli import get_cli_top_level_subcommands -from nodescraper.interfaces import TaskResultHook +from nodescraper.models.pluginresult import PluginResult CLI_TOP_LEVEL_SUBCOMMANDS = get_cli_top_level_subcommands() @@ -47,15 +47,15 @@ def run_cli_return_code( argv: list[str], *, host_cli_args: Optional[argparse.Namespace] = None, - embed_default_task_result_hooks: Optional[Sequence[TaskResultHook]] = None, + plugin_run_result_hooks: Optional[Sequence[Callable[[PluginResult], None]]] = None, ) -> int: """Run nodescraper in-process; same behavior as :func:`run_main_return_code`. Args: argv: Tokens after the program name. host_cli_args: Optional host namespace forwarded to :func:`nodescraper.cli.cli.main`. - embed_default_task_result_hooks: Optional hooks prepended to every plugin run and connection - manager for this embed (e.g. host event logging). Merged in :class:`PluginExecutor`. + plugin_run_result_hooks: Optional callbacks invoked with each + :class:`~nodescraper.models.pluginresult.PluginResult` after a plugin finishes (embed hosts). Returns: Integer exit code (``SystemExit`` is mapped, not raised). 
@@ -63,7 +63,7 @@ def run_cli_return_code( return run_main_return_code( argv, host_cli_args=host_cli_args, - embed_default_task_result_hooks=embed_default_task_result_hooks, + plugin_run_result_hooks=plugin_run_result_hooks, ) @@ -71,14 +71,14 @@ def run_main_return_code( arg_input: list[str], *, host_cli_args: Optional[argparse.Namespace] = None, - embed_default_task_result_hooks: Optional[Sequence[TaskResultHook]] = None, + plugin_run_result_hooks: Optional[Sequence[Callable[[PluginResult], None]]] = None, ) -> int: """Run :func:`nodescraper.cli.cli.main` and map ``SystemExit`` to an exit code. Args: arg_input: Tokens after the program name. host_cli_args: Optional host namespace for embedded runs. - embed_default_task_result_hooks: Optional default task-result hooks for this embed. + plugin_run_result_hooks: Optional per-plugin result callbacks for embedded runs. Returns: Integer exit code. @@ -89,7 +89,7 @@ def run_main_return_code( main( arg_input, host_cli_args=host_cli_args, - embed_default_task_result_hooks=embed_default_task_result_hooks, + plugin_run_result_hooks=plugin_run_result_hooks, ) except SystemExit as exc: code = exc.code diff --git a/nodescraper/cli/invocation.py b/nodescraper/cli/invocation.py index c5d5d853..9edc7214 100644 --- a/nodescraper/cli/invocation.py +++ b/nodescraper/cli/invocation.py @@ -28,13 +28,12 @@ import argparse import logging -from collections.abc import Iterator, Sequence +from collections.abc import Callable, Sequence from contextlib import contextmanager from contextvars import ContextVar from dataclasses import dataclass -from typing import Optional +from typing import Iterator, Optional -from nodescraper.interfaces.taskresulthook import TaskResultHook from nodescraper.models import PluginConfig, SystemInfo from nodescraper.models.pluginresult import PluginResult from nodescraper.pluginexecutor import PluginExecutor @@ -74,7 +73,7 @@ class PluginRunInvocation: sname: str host_cli_args: Optional[argparse.Namespace] = None 
session_id: Optional[str] = None - embed_default_task_result_hooks: tuple[TaskResultHook, ...] = () + plugin_run_result_hooks: tuple[Callable[[PluginResult], None], ...] = () def run_plugin_queue_with_invocation( @@ -89,11 +88,11 @@ def run_plugin_queue_with_invocation( sname: str, host_cli_args: Optional[argparse.Namespace] = None, session_id: Optional[str] = None, - embed_default_task_result_hooks: Optional[Sequence[TaskResultHook]] = None, + plugin_run_result_hooks: Optional[Sequence[Callable[[PluginResult], None]]] = None, ) -> list[PluginResult]: """Constructs the plugin executor, binds invocation context, and runs the plugin queue.""" - embed_hooks_tuple: tuple[TaskResultHook, ...] = ( - tuple(embed_default_task_result_hooks) if embed_default_task_result_hooks else () + hooks_tuple: tuple[Callable[[PluginResult], None], ...] = ( + tuple(plugin_run_result_hooks) if plugin_run_result_hooks else () ) inv = PluginRunInvocation( plugin_reg=plugin_reg, @@ -106,7 +105,7 @@ def run_plugin_queue_with_invocation( sname=sname, host_cli_args=host_cli_args, session_id=session_id, - embed_default_task_result_hooks=embed_hooks_tuple, + plugin_run_result_hooks=hooks_tuple, ) plugin_executor = PluginExecutor( logger=logger, @@ -116,7 +115,7 @@ def run_plugin_queue_with_invocation( log_path=log_path, plugin_registry=plugin_reg, session_id=session_id, - embed_default_task_result_hooks=embed_hooks_tuple, + plugin_run_result_hooks=hooks_tuple, ) with plugin_run_invocation_scope(inv): return plugin_executor.run_queue() diff --git a/nodescraper/pluginexecutor.py b/nodescraper/pluginexecutor.py index df135301..bb22b8a9 100644 --- a/nodescraper/pluginexecutor.py +++ b/nodescraper/pluginexecutor.py @@ -30,7 +30,7 @@ import logging import uuid from collections import deque -from collections.abc import Sequence +from collections.abc import Callable, Sequence from typing import Optional, Type, Union from pydantic import BaseModel @@ -59,7 +59,7 @@ def __init__( plugin_registry: 
Optional[PluginRegistry] = None, log_path: Optional[str] = None, session_id: Optional[str] = None, - embed_default_task_result_hooks: Optional[Sequence[TaskResultHook]] = None, + plugin_run_result_hooks: Optional[Sequence[Callable[[PluginResult], None]]] = None, ): if logger is None: @@ -92,12 +92,11 @@ def __init__( self.log_path = log_path - self.embed_default_task_result_hooks: list[TaskResultHook] = ( - list(embed_default_task_result_hooks) if embed_default_task_result_hooks else [] + self.plugin_run_result_hooks: list[Callable[[PluginResult], None]] = ( + list(plugin_run_result_hooks) if plugin_run_result_hooks else [] ) self.connection_result_hooks: list[TaskResultHook] = [] - self.connection_result_hooks.extend(self.embed_default_task_result_hooks) if log_path: self.connection_result_hooks.append(FileSystemLogHook(log_base_path=log_path)) @@ -186,8 +185,6 @@ def run_queue(self) -> list[PluginResult]: "log_path": self.log_path, "session_id": self.session_id, } - if self.embed_default_task_result_hooks: - init_payload["task_result_hooks"] = list(self.embed_default_task_result_hooks) if plugin_class.CONNECTION_TYPE: if issubclass(plugin_class, OOBSSHDataPlugin): @@ -273,7 +270,10 @@ def run_queue(self) -> list[PluginResult]: continue self.logger.info("-" * 50) - plugin_results.append(plugin_inst.run(**run_payload)) + plugin_result = plugin_inst.run(**run_payload) + plugin_results.append(plugin_result) + for hook in self.plugin_run_result_hooks: + hook(plugin_result) except Exception as e: self.logger.exception( "Unexpected exception when running plugin %s: %s", plugin_name, e diff --git a/test/unit/cli/test_cli_embed_api.py b/test/unit/cli/test_cli_embed_api.py index e260f667..54b95043 100644 --- a/test/unit/cli/test_cli_embed_api.py +++ b/test/unit/cli/test_cli_embed_api.py @@ -57,7 +57,7 @@ def fake_main( arg_input: list[str], *, host_cli_args=None, - embed_default_task_result_hooks=None, + plugin_run_result_hooks=None, ) -> None: 
calls.append(list(arg_input)) raise SystemExit(7) diff --git a/test/unit/framework/test_plugin_executor.py b/test/unit/framework/test_plugin_executor.py index ceec44a2..6af854db 100644 --- a/test/unit/framework/test_plugin_executor.py +++ b/test/unit/framework/test_plugin_executor.py @@ -30,11 +30,10 @@ from nodescraper.enums import ExecutionStatus from nodescraper.enums.eventpriority import EventPriority from nodescraper.enums.systeminteraction import SystemInteractionLevel -from nodescraper.interfaces import PluginInterface, TaskResultHook -from nodescraper.models import PluginConfig, PluginResult, TaskResult +from nodescraper.interfaces import PluginInterface +from nodescraper.models import PluginConfig, PluginResult from nodescraper.pluginexecutor import PluginExecutor from nodescraper.pluginregistry import PluginRegistry -from nodescraper.taskresulthooks import FileSystemLogHook class DummyArgs(BaseModel): @@ -189,31 +188,16 @@ def test_connection_manager_from_plugin_when_not_in_registry(): assert results[0].status == ExecutionStatus.OK -class _CaptureEmbedHook(TaskResultHook): - def process_result(self, task_result: TaskResult, **kwargs) -> None: - pass +def test_plugin_run_result_hooks_called_after_each_plugin(plugin_registry): + seen: list[str] = [] + def hook(res: PluginResult) -> None: + seen.append(res.source) -def test_embed_default_task_result_hooks_order_before_filesystem_log(plugin_registry, tmp_path): - embed = _CaptureEmbedHook() executor = PluginExecutor( plugin_configs=[PluginConfig(plugins={"TestPluginB": {}})], plugin_registry=plugin_registry, - log_path=str(tmp_path), - embed_default_task_result_hooks=[embed], - ) - assert executor.connection_result_hooks[0] is embed - assert isinstance(executor.connection_result_hooks[1], FileSystemLogHook) - - -def test_embed_default_task_result_hooks_reach_connection_manager(plugin_registry, tmp_path): - embed = _CaptureEmbedHook() - executor = PluginExecutor( - 
plugin_configs=[PluginConfig(plugins={"TestPluginB": {}})], - plugin_registry=plugin_registry, - log_path=str(tmp_path), - embed_default_task_result_hooks=[embed], + plugin_run_result_hooks=[hook], ) executor.run_queue() - cm = next(iter(executor.connection_library.values())) - assert cm.task_result_hooks[0] is embed + assert seen == ["TestPluginB"] From ff07a0d59da91771b49e56f3220b9dc7be5e7954 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Fri, 19 Jun 2026 13:36:48 -0500 Subject: [PATCH 24/39] utest fix --- test/unit/framework/test_plugin_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/framework/test_plugin_executor.py b/test/unit/framework/test_plugin_executor.py index 6af854db..4d686a00 100644 --- a/test/unit/framework/test_plugin_executor.py +++ b/test/unit/framework/test_plugin_executor.py @@ -200,4 +200,4 @@ def hook(res: PluginResult) -> None: plugin_run_result_hooks=[hook], ) executor.run_queue() - assert seen == ["TestPluginB"] + assert seen == ["testB"] From fe31fad85547a51c883e3a1d2571aeee419e5580 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 22 Jun 2026 09:53:26 -0500 Subject: [PATCH 25/39] amdsmi plugin update --- .../plugins/inband/amdsmi/amdsmidata.py | 20 ++++++++++-- test/unit/plugin/test_amdsmi_data.py | 32 ++++++++++++++++++- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index fd603028..3b8aae3c 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -523,6 +523,9 @@ class StaticCacheInfoItem(AmdSmiBaseModel): na_validator = field_validator("cache_size", mode="before")(na_to_none) +_STATIC_CLOCK_FREQ_LEVEL_VALIDATOR_FIELDS = tuple(f"Level_{i}" for i in range(16)) + + class StaticFrequencyLevels(AmdSmiBaseModel): """Static clock frequency levels; each level is normalized to ``ValueUnit``.""" @@ -534,8 +537,21 @@ class 
StaticFrequencyLevels(AmdSmiBaseModel): Level_0: ValueUnit = Field(..., alias="Level 0") Level_1: Optional[ValueUnit] = Field(default=None, alias="Level 1") Level_2: Optional[ValueUnit] = Field(default=None, alias="Level 2") - - _level_value_unit = field_validator("Level_0", "Level_1", "Level_2", mode="before")( + Level_3: Optional[ValueUnit] = Field(default=None, alias="Level 3") + Level_4: Optional[ValueUnit] = Field(default=None, alias="Level 4") + Level_5: Optional[ValueUnit] = Field(default=None, alias="Level 5") + Level_6: Optional[ValueUnit] = Field(default=None, alias="Level 6") + Level_7: Optional[ValueUnit] = Field(default=None, alias="Level 7") + Level_8: Optional[ValueUnit] = Field(default=None, alias="Level 8") + Level_9: Optional[ValueUnit] = Field(default=None, alias="Level 9") + Level_10: Optional[ValueUnit] = Field(default=None, alias="Level 10") + Level_11: Optional[ValueUnit] = Field(default=None, alias="Level 11") + Level_12: Optional[ValueUnit] = Field(default=None, alias="Level 12") + Level_13: Optional[ValueUnit] = Field(default=None, alias="Level 13") + Level_14: Optional[ValueUnit] = Field(default=None, alias="Level 14") + Level_15: Optional[ValueUnit] = Field(default=None, alias="Level 15") + + _level_value_unit = field_validator(*_STATIC_CLOCK_FREQ_LEVEL_VALIDATOR_FIELDS, mode="before")( coerce_value_unit_input ) diff --git a/test/unit/plugin/test_amdsmi_data.py b/test/unit/plugin/test_amdsmi_data.py index 9e28fbb9..f6c4f750 100644 --- a/test/unit/plugin/test_amdsmi_data.py +++ b/test/unit/plugin/test_amdsmi_data.py @@ -23,7 +23,7 @@ # SOFTWARE. 
# ############################################################################### -"""Unit tests for amd-smi pydantic models (ROCm 7.13 / legacy JSON shapes).""" +"""Unit tests for amd-smi pydantic models (legacy JSON, ROCm 7.2+ / AMD-SMI 26.2+).""" from typing import Any, Optional @@ -341,6 +341,36 @@ def test_static_frequency_levels_optional_levels(): assert levels.Level_2 is not None and levels.Level_2.value == 1300 +def test_static_frequency_levels_accepts_level_three_plus(): + """ROCm 7.2+ / AMD-SMI 26.2+ may expose additional DPM levels (e.g. Level 3).""" + levels = StaticFrequencyLevels.model_validate( + { + "Level 0": "400 MHz", + "Level 1": "800 MHz", + "Level 2": "1000 MHz", + "Level 3": "1143 MHz", + } + ) + assert levels.Level_3 is not None + assert levels.Level_3.value == 1143 + assert levels.Level_3.unit == "MHz" + + +def test_static_frequency_levels_legacy_amd_smi_three_levels_only(): + """Legacy static JSON: only Level 0–2 (no Level 3+ keys).""" + levels = StaticFrequencyLevels.model_validate( + { + "Level 0": {"value": 500, "unit": "MHz"}, + "Level 1": "900 MHz", + "Level 2": "1300 MHz", + } + ) + assert levels.Level_0.value == 500 + assert levels.Level_2 is not None and levels.Level_2.value == 1300 + assert levels.Level_3 is None + assert levels.Level_15 is None + + def test_static_limit_legacy_max_power(): """Legacy flat max_power field still resolves.""" limit = StaticLimit.model_validate(DUMMY_LIMIT_LEGACY) From 1f996d4a357135921972049761e3c30ee922aaf7 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 22 Jun 2026 10:04:22 -0500 Subject: [PATCH 26/39] updates --- .../plugins/inband/amdsmi/amdsmidata.py | 22 ++++++++++++++ test/unit/plugin/test_amdsmi_data.py | 29 +++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 3b8aae3c..1ef3e4fd 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ 
b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -525,6 +525,8 @@ class StaticCacheInfoItem(AmdSmiBaseModel): _STATIC_CLOCK_FREQ_LEVEL_VALIDATOR_FIELDS = tuple(f"Level_{i}" for i in range(16)) +_STATIC_FREQ_LEVEL_JSON_KEY_RE = re.compile(r"^level[\s_]*(\d+)\s*$", re.IGNORECASE) + class StaticFrequencyLevels(AmdSmiBaseModel): """Static clock frequency levels; each level is normalized to ``ValueUnit``.""" @@ -534,6 +536,26 @@ class StaticFrequencyLevels(AmdSmiBaseModel): extra="forbid", ) + @model_validator(mode="before") + @classmethod + def _normalize_level_entries(cls, data: Any) -> Any: + """Map amd-smi DPM key spellings to canonical ``Level {n}``, drop non-level keys, ignore index > 15.""" + if not isinstance(data, dict): + return data + out: dict[str, Any] = {} + for raw_key, val in data.items(): + n: Optional[int] = None + if isinstance(raw_key, int): + n = int(raw_key) + elif isinstance(raw_key, str): + m = _STATIC_FREQ_LEVEL_JSON_KEY_RE.match(raw_key.strip()) + if m: + n = int(m.group(1)) + if n is None or n < 0 or n > 15: + continue + out[f"Level {n}"] = val + return out + Level_0: ValueUnit = Field(..., alias="Level 0") Level_1: Optional[ValueUnit] = Field(default=None, alias="Level 1") Level_2: Optional[ValueUnit] = Field(default=None, alias="Level 2") diff --git a/test/unit/plugin/test_amdsmi_data.py b/test/unit/plugin/test_amdsmi_data.py index f6c4f750..1f8c5529 100644 --- a/test/unit/plugin/test_amdsmi_data.py +++ b/test/unit/plugin/test_amdsmi_data.py @@ -371,6 +371,35 @@ def test_static_frequency_levels_legacy_amd_smi_three_levels_only(): assert levels.Level_15 is None +def test_static_frequency_levels_accepts_amd_smi_key_spelling_variants(): + """ROCm / AMD-SMI JSON may use LEVEL_N / Level0 style keys instead of ``Level N``.""" + levels = StaticFrequencyLevels.model_validate( + { + "LEVEL_0": "400 MHz", + "level_1": "800 MHz", + "Level2": "1000 MHz", + "Level 3": "1143 MHz", + } + ) + assert levels.Level_0.value == 400 + assert levels.Level_1 is 
not None and levels.Level_1.value == 800 + assert levels.Level_2 is not None and levels.Level_2.value == 1000 + assert levels.Level_3 is not None and levels.Level_3.value == 1143 + + +def test_static_frequency_levels_drops_unknown_keys_and_high_indices(): + """Non-level keys are ignored; DPM indices above 15 are dropped (model stores 0–15 only).""" + levels = StaticFrequencyLevels.model_validate( + { + "Level 0": "100 MHz", + "num_states": 99, + "Level 16": "9999 MHz", + } + ) + assert levels.Level_0.value == 100 + assert levels.Level_1 is None + + def test_static_limit_legacy_max_power(): """Legacy flat max_power field still resolves.""" limit = StaticLimit.model_validate(DUMMY_LIMIT_LEGACY) From bc60a72ef7b21f24bb50cf7b603d850a80f3e9d9 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 22 Jun 2026 10:17:23 -0500 Subject: [PATCH 27/39] updates --- .../plugins/inband/amdsmi/amdsmi_collector.py | 11 +++++------ .../plugins/inband/amdsmi/amdsmidata.py | 9 +++++++-- test/unit/plugin/test_amdsmi_data.py | 19 +++++++++++++++++++ 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index ef60995a..8f702b12 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -1037,6 +1037,8 @@ def _parse_limit(self, data: Optional[object]) -> Optional[StaticLimit]: def _parse_current_level(self, data: dict) -> Optional[int]: """Extract current DPM level index from static clock JSON.""" cur_raw = data.get("current") + if cur_raw is None: + cur_raw = data.get("current_level") if cur_raw is None: cur_raw = data.get("current level") if isinstance(cur_raw, (int, float)): @@ -1290,14 +1292,10 @@ def _parse_clock(self, data: dict) -> Optional[StaticClockData]: if not isinstance(data, dict): return None - current = self._parse_current_level(data) freq_levels_raw = data.get("frequency_levels") if 
isinstance(freq_levels_raw, dict) and freq_levels_raw: try: - levels = StaticFrequencyLevels.model_validate(freq_levels_raw) - return StaticClockData.model_validate( - {"frequency_levels": levels, "current level": current} - ) + return StaticClockData.model_validate(data) except ValidationError as err: self._log_event( category=EventCategory.APPLICATION, @@ -1306,6 +1304,7 @@ def _parse_clock(self, data: dict) -> Optional[StaticClockData]: priority=EventPriority.WARNING, ) + current = self._parse_current_level(data) freqs_raw = data.get("frequency") if not isinstance(freqs_raw, list) or not freqs_raw: return None @@ -1342,7 +1341,7 @@ def _fmt(n: Optional[int]) -> Optional[str]: {"Level 0": level0, "Level 1": level1, "Level 2": level2} ) - # Use the alias "current level" as defined in the model + # current_level accepts legacy "current level" / "current" keys via StaticClockData return StaticClockData.model_validate( {"frequency_levels": levels, "current level": current} ) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 1ef3e4fd..18f83695 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -581,10 +581,15 @@ def _normalize_level_entries(cls, data: Any) -> Any: class StaticClockData(BaseModel): model_config = ConfigDict( populate_by_name=True, + extra="ignore", ) frequency_levels: StaticFrequencyLevels - - current_level: Optional[int] = Field(..., alias="current level") + current_level: Optional[int] = Field( + default=None, + validation_alias=AliasChoices("current level", "current_level", "current"), + serialization_alias="current level", + ) + current_frequency: Optional[str] = None na_validator = field_validator("current_level", mode="before")(na_to_none) diff --git a/test/unit/plugin/test_amdsmi_data.py b/test/unit/plugin/test_amdsmi_data.py index 1f8c5529..fe684edc 100644 --- a/test/unit/plugin/test_amdsmi_data.py +++ 
b/test/unit/plugin/test_amdsmi_data.py @@ -450,6 +450,25 @@ def test_static_clock_frequency_levels_json(): assert clock.frequency_levels.Level_1.value == 900 +def test_static_clock_mi300_amd_smi_26_json_shape(): + """ROCm 7.2 / AMD-SMI 26.x clock domains use current_level, current_frequency, and Level N strings.""" + raw = { + "current_level": 0, + "current_frequency": "132MHz", + "frequency_levels": { + "Level 0": "132 MHz", + "Level 1": "500 MHz", + "Level 2": "2100 MHz", + }, + } + clock = StaticClockData.model_validate(raw) + assert clock.current_level == 0 + assert clock.current_frequency == "132MHz" + assert clock.frequency_levels.Level_0.value == 132 + assert clock.frequency_levels.Level_2 is not None + assert clock.frequency_levels.Level_2.value == 2100 + + def test_amdsmi_data_model_dummy_metric_round_trip(): """Full dummy metric payload validates and preserves key ROCm 7.13 fields.""" metric = AmdSmiMetric.model_validate(dummy_metric_dict()) From a7a36243418c1587ad25fed91dd3e3de8991a6c6 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 22 Jun 2026 10:29:40 -0500 Subject: [PATCH 28/39] updates --- .../plugins/inband/amdsmi/amdsmi_collector.py | 11 +++-- .../plugins/inband/amdsmi/amdsmidata.py | 31 +----------- test/unit/plugin/test_amdsmi_data.py | 48 ------------------- 3 files changed, 8 insertions(+), 82 deletions(-) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py index 8f702b12..ef60995a 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmi_collector.py @@ -1037,8 +1037,6 @@ def _parse_limit(self, data: Optional[object]) -> Optional[StaticLimit]: def _parse_current_level(self, data: dict) -> Optional[int]: """Extract current DPM level index from static clock JSON.""" cur_raw = data.get("current") - if cur_raw is None: - cur_raw = data.get("current_level") if cur_raw is None: cur_raw = data.get("current level") if 
isinstance(cur_raw, (int, float)): @@ -1292,10 +1290,14 @@ def _parse_clock(self, data: dict) -> Optional[StaticClockData]: if not isinstance(data, dict): return None + current = self._parse_current_level(data) freq_levels_raw = data.get("frequency_levels") if isinstance(freq_levels_raw, dict) and freq_levels_raw: try: - return StaticClockData.model_validate(data) + levels = StaticFrequencyLevels.model_validate(freq_levels_raw) + return StaticClockData.model_validate( + {"frequency_levels": levels, "current level": current} + ) except ValidationError as err: self._log_event( category=EventCategory.APPLICATION, @@ -1304,7 +1306,6 @@ def _parse_clock(self, data: dict) -> Optional[StaticClockData]: priority=EventPriority.WARNING, ) - current = self._parse_current_level(data) freqs_raw = data.get("frequency") if not isinstance(freqs_raw, list) or not freqs_raw: return None @@ -1341,7 +1342,7 @@ def _fmt(n: Optional[int]) -> Optional[str]: {"Level 0": level0, "Level 1": level1, "Level 2": level2} ) - # current_level accepts legacy "current level" / "current" keys via StaticClockData + # Use the alias "current level" as defined in the model return StaticClockData.model_validate( {"frequency_levels": levels, "current level": current} ) diff --git a/nodescraper/plugins/inband/amdsmi/amdsmidata.py b/nodescraper/plugins/inband/amdsmi/amdsmidata.py index 18f83695..3b8aae3c 100644 --- a/nodescraper/plugins/inband/amdsmi/amdsmidata.py +++ b/nodescraper/plugins/inband/amdsmi/amdsmidata.py @@ -525,8 +525,6 @@ class StaticCacheInfoItem(AmdSmiBaseModel): _STATIC_CLOCK_FREQ_LEVEL_VALIDATOR_FIELDS = tuple(f"Level_{i}" for i in range(16)) -_STATIC_FREQ_LEVEL_JSON_KEY_RE = re.compile(r"^level[\s_]*(\d+)\s*$", re.IGNORECASE) - class StaticFrequencyLevels(AmdSmiBaseModel): """Static clock frequency levels; each level is normalized to ``ValueUnit``.""" @@ -536,26 +534,6 @@ class StaticFrequencyLevels(AmdSmiBaseModel): extra="forbid", ) - @model_validator(mode="before") - @classmethod - 
def _normalize_level_entries(cls, data: Any) -> Any: - """Map amd-smi DPM key spellings to canonical ``Level {n}``, drop non-level keys, ignore index > 15.""" - if not isinstance(data, dict): - return data - out: dict[str, Any] = {} - for raw_key, val in data.items(): - n: Optional[int] = None - if isinstance(raw_key, int): - n = int(raw_key) - elif isinstance(raw_key, str): - m = _STATIC_FREQ_LEVEL_JSON_KEY_RE.match(raw_key.strip()) - if m: - n = int(m.group(1)) - if n is None or n < 0 or n > 15: - continue - out[f"Level {n}"] = val - return out - Level_0: ValueUnit = Field(..., alias="Level 0") Level_1: Optional[ValueUnit] = Field(default=None, alias="Level 1") Level_2: Optional[ValueUnit] = Field(default=None, alias="Level 2") @@ -581,15 +559,10 @@ def _normalize_level_entries(cls, data: Any) -> Any: class StaticClockData(BaseModel): model_config = ConfigDict( populate_by_name=True, - extra="ignore", ) frequency_levels: StaticFrequencyLevels - current_level: Optional[int] = Field( - default=None, - validation_alias=AliasChoices("current level", "current_level", "current"), - serialization_alias="current level", - ) - current_frequency: Optional[str] = None + + current_level: Optional[int] = Field(..., alias="current level") na_validator = field_validator("current_level", mode="before")(na_to_none) diff --git a/test/unit/plugin/test_amdsmi_data.py b/test/unit/plugin/test_amdsmi_data.py index fe684edc..f6c4f750 100644 --- a/test/unit/plugin/test_amdsmi_data.py +++ b/test/unit/plugin/test_amdsmi_data.py @@ -371,35 +371,6 @@ def test_static_frequency_levels_legacy_amd_smi_three_levels_only(): assert levels.Level_15 is None -def test_static_frequency_levels_accepts_amd_smi_key_spelling_variants(): - """ROCm / AMD-SMI JSON may use LEVEL_N / Level0 style keys instead of ``Level N``.""" - levels = StaticFrequencyLevels.model_validate( - { - "LEVEL_0": "400 MHz", - "level_1": "800 MHz", - "Level2": "1000 MHz", - "Level 3": "1143 MHz", - } - ) - assert 
levels.Level_0.value == 400 - assert levels.Level_1 is not None and levels.Level_1.value == 800 - assert levels.Level_2 is not None and levels.Level_2.value == 1000 - assert levels.Level_3 is not None and levels.Level_3.value == 1143 - - -def test_static_frequency_levels_drops_unknown_keys_and_high_indices(): - """Non-level keys are ignored; DPM indices above 15 are dropped (model stores 0–15 only).""" - levels = StaticFrequencyLevels.model_validate( - { - "Level 0": "100 MHz", - "num_states": 99, - "Level 16": "9999 MHz", - } - ) - assert levels.Level_0.value == 100 - assert levels.Level_1 is None - - def test_static_limit_legacy_max_power(): """Legacy flat max_power field still resolves.""" limit = StaticLimit.model_validate(DUMMY_LIMIT_LEGACY) @@ -450,25 +421,6 @@ def test_static_clock_frequency_levels_json(): assert clock.frequency_levels.Level_1.value == 900 -def test_static_clock_mi300_amd_smi_26_json_shape(): - """ROCm 7.2 / AMD-SMI 26.x clock domains use current_level, current_frequency, and Level N strings.""" - raw = { - "current_level": 0, - "current_frequency": "132MHz", - "frequency_levels": { - "Level 0": "132 MHz", - "Level 1": "500 MHz", - "Level 2": "2100 MHz", - }, - } - clock = StaticClockData.model_validate(raw) - assert clock.current_level == 0 - assert clock.current_frequency == "132MHz" - assert clock.frequency_levels.Level_0.value == 132 - assert clock.frequency_levels.Level_2 is not None - assert clock.frequency_levels.Level_2.value == 2100 - - def test_amdsmi_data_model_dummy_metric_round_trip(): """Full dummy metric payload validates and preserves key ROCm 7.13 fields.""" metric = AmdSmiMetric.model_validate(dummy_metric_dict()) From d3e52caf26de6745fe56a07817fc55beda034290 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 22 Jun 2026 13:32:46 -0500 Subject: [PATCH 29/39] ruff fix --- test/unit/framework/test_plugin_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/test/unit/framework/test_plugin_executor.py b/test/unit/framework/test_plugin_executor.py index 4d686a00..494551ce 100644 --- a/test/unit/framework/test_plugin_executor.py +++ b/test/unit/framework/test_plugin_executor.py @@ -24,9 +24,9 @@ # ############################################################################### import pytest +from framework.common.shared_utils import DummyDataModel, MockConnectionManager from pydantic import BaseModel -from framework.common.shared_utils import DummyDataModel, MockConnectionManager from nodescraper.enums import ExecutionStatus from nodescraper.enums.eventpriority import EventPriority from nodescraper.enums.systeminteraction import SystemInteractionLevel From a7dd97d98c1e0629ab9010a54db267c842435a64 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 22 Jun 2026 13:36:49 -0500 Subject: [PATCH 30/39] renames --- .../plugins/serviceability/analyzer_args.py | 26 +-- .../plugins/serviceability/cper_decode.py | 4 +- .../serviceability/mi3xx/mi3xx_analyzer.py | 14 +- .../plugins/serviceability/se_adapter.py | 6 +- .../plugins/serviceability/se_models.py | 2 +- .../plugins/serviceability/se_runner.py | 34 ++-- test/unit/mock_python_engine.py | 6 +- .../plugin/test_afid_events_bmc_schema.py | 118 +++++-------- test/unit/plugin/test_mi3xx_collector.py | 75 +++------ test/unit/plugin/test_mi3xx_cper_utils.py | 40 ++--- test/unit/plugin/test_se_runner.py | 40 ++--- test/unit/serviceability_dummy_data.py | 159 +++++++++++++++++- 12 files changed, 308 insertions(+), 216 deletions(-) diff --git a/nodescraper/plugins/serviceability/analyzer_args.py b/nodescraper/plugins/serviceability/analyzer_args.py index 2aa27ccd..639822cc 100644 --- a/nodescraper/plugins/serviceability/analyzer_args.py +++ b/nodescraper/plugins/serviceability/analyzer_args.py @@ -35,27 +35,27 @@ class ServiceabilityAnalyzerArgs(AnalyzerArgs): """Analyzer args for serviceability plugins that run a configurable Python hub.""" - engine_python_module: Optional[str] 
= Field( + hub_python_module: Optional[str] = Field( default=None, - description="Import path for the hub module (class implements engine_analyze_method); hub_options forwards kwargs.", + description="Import path for the hub module (class implements hub_analyze_method); hub_options forwards kwargs.", ) - engine_display_name: Optional[str] = Field( + hub_display_name: Optional[str] = Field( default=None, description="Optional label for analyzer status messages.", ) afid_sag_path: Optional[str] = Field( default=None, - description="Path to hub config (e.g. AFID_SAG.json); passed as engine_init_path_kwarg.", + description="Path to hub config (e.g. AFID_SAG.json); passed as hub_init_path_kwarg.", ) - engine_init_path_kwarg: str = Field( + hub_init_path_kwarg: str = Field( default="afid_sag", description="Hub __init__ keyword that receives afid_sag_path.", ) - engine_analyze_method: str = Field( + hub_analyze_method: str = Field( default="get_service_info", description="Hub method called with rf_events first (default get_service_info).", ) - skip_engine: bool = Field( + skip_hub: bool = Field( default=False, description="If True, only build afid_events without running the service hub.", ) @@ -101,7 +101,7 @@ def resolved_hub_options(self) -> dict[str, Any]: merged["suppress_service_actions"] = self.suppress_service_actions return merged - @field_validator("engine_analyze_method", "engine_init_path_kwarg") + @field_validator("hub_analyze_method", "hub_init_path_kwarg") @classmethod def _strip_non_empty_hub_hooks(cls, value: str) -> str: text = str(value).strip() @@ -128,8 +128,8 @@ def _strip_from_date(cls, value: object) -> Optional[str]: @field_validator( "afid_sag_path", - "engine_python_module", - "engine_display_name", + "hub_python_module", + "hub_display_name", "cper_decode_module", ) @classmethod @@ -141,10 +141,10 @@ def _strip_optional_strings(cls, value: Optional[str]) -> Optional[str]: @model_validator(mode="after") def _require_hub_config_when_running(self) 
-> ServiceabilityAnalyzerArgs: - if self.skip_engine: + if self.skip_hub: return self if not self.afid_sag_path: raise ValueError("afid_sag_path is required when running the service hub.") - if not self.engine_python_module: - raise ValueError("engine_python_module is required when running the service hub.") + if not self.hub_python_module: + raise ValueError("hub_python_module is required when running the service hub.") return self diff --git a/nodescraper/plugins/serviceability/cper_decode.py b/nodescraper/plugins/serviceability/cper_decode.py index 6982407a..d4e9b20e 100644 --- a/nodescraper/plugins/serviceability/cper_decode.py +++ b/nodescraper/plugins/serviceability/cper_decode.py @@ -80,8 +80,8 @@ def decode_cper_raw_attachments( """Decode base64 CPER blobs keyed by Redfish event Id. The decode callable must accept a binary file-like object and return - ``(return_code, decode_dict)``. Results are passed to the service engine as - ``cper_data``; the engine does not perform CPER decoding itself. + ``(return_code, decode_dict)``. Results are passed to the service hub as + ``cper_data``; the hub does not perform CPER decoding itself. Returns ``{event_id: {"return_code": int, "decode": dict}}``. 
""" diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py index e0eab28c..6150398e 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py @@ -76,7 +76,7 @@ def analyze_data( events = data.afid_events or build_afid_events_from_data(data) data.afid_events = events - if args.skip_engine: + if args.skip_hub: data.serviceability = ServiceabilityBlock(afid_events=events) self.result.status = ExecutionStatus.OK self.result.message = f"Built {len(events)} AFID event(s); hub skipped" @@ -140,15 +140,15 @@ def analyze_data( try: block = run_service_hub( - engine_python_module=args.engine_python_module, # type: ignore[arg-type] - engine_display_name=args.engine_display_name, + hub_python_module=args.hub_python_module, # type: ignore[arg-type] + hub_display_name=args.hub_display_name, afid_events=events, afid_sag_path=args.afid_sag_path, # type: ignore[arg-type] rf_events=data.rf_events, cper_data=cper_data or None, hub_options=args.resolved_hub_options(), - engine_analyze_method=args.engine_analyze_method, - engine_init_path_kwarg=args.engine_init_path_kwarg, + hub_analyze_method=args.hub_analyze_method, + hub_init_path_kwarg=args.hub_init_path_kwarg, ) except (SeRunError, ValueError) as exc: self.result.status = ExecutionStatus.ERROR @@ -158,7 +158,7 @@ def analyze_data( data.serviceability = block self._append_afid_sag_metadata_artifact(block) self._log_serviceability_solutions(block) - engine_label = args.engine_display_name or args.engine_python_module + hub_label = args.hub_display_name or args.hub_python_module self.result.status = ExecutionStatus.OK cper_summary = "" if cper_data: @@ -174,7 +174,7 @@ def analyze_data( ver_bits.append(f"AFID_SAG {block.afid_sag_file_version}") ver_suffix = f" [{'; '.join(ver_bits)}]" if ver_bits else "" self.result.message = ( - f"{engine_label}: {len(block.solution)} 
solution(s) " + f"{hub_label}: {len(block.solution)} solution(s) " f"from {len(data.rf_events)} Redfish event(s){cper_summary}{ver_suffix}" ) return self.result diff --git a/nodescraper/plugins/serviceability/se_adapter.py b/nodescraper/plugins/serviceability/se_adapter.py index 04321c82..bea1d4a0 100644 --- a/nodescraper/plugins/serviceability/se_adapter.py +++ b/nodescraper/plugins/serviceability/se_adapter.py @@ -269,7 +269,7 @@ def serviceability_block_from_service_result( afid_events: list[AfidEvent], result: Any, *, - engine_label: str = "Service hub", + hub_label: str = "Service hub", rf_event_count: int = 0, ) -> ServiceabilityBlock: """Build a ``ServiceabilityBlock`` from a hub result with ``service_info``.""" @@ -326,7 +326,9 @@ def _action_title(info: dict[str, Any]) -> str: ) hub_version = _hub_version_display(version_info) afid_sag_file_version = _afid_sag_file_version_display(metadata) - reasoning = f"{engine_label}: {len(solutions)} recommendation(s) from {rf_event_count} Redfish event(s)." + reasoning = ( + f"{hub_label}: {len(solutions)} recommendation(s) from {rf_event_count} Redfish event(s)." 
+ ) meta_out: Optional[dict[str, Any]] = dict(metadata) if isinstance(raw_metadata, dict) else None short_service_info = _format_short_service_info_for_block( getattr(result, "short_service_info", None) diff --git a/nodescraper/plugins/serviceability/se_models.py b/nodescraper/plugins/serviceability/se_models.py index 8a3f50f3..6aa855a3 100644 --- a/nodescraper/plugins/serviceability/se_models.py +++ b/nodescraper/plugins/serviceability/se_models.py @@ -91,7 +91,7 @@ class ServiceabilityBlock(BaseModel): ) afid_sag_metadata: Optional[dict[str, Any]] = Field( default=None, - description="Hub-reported AFID_SAG metadata dict when the engine exposes afid_sag_metadata.", + description="Hub-reported AFID_SAG metadata dict when the hub exposes afid_sag_metadata.", ) short_service_info: Optional[str] = Field( default=None, diff --git a/nodescraper/plugins/serviceability/se_runner.py b/nodescraper/plugins/serviceability/se_runner.py index c141b6ec..6ff8b60e 100644 --- a/nodescraper/plugins/serviceability/se_runner.py +++ b/nodescraper/plugins/serviceability/se_runner.py @@ -89,21 +89,21 @@ class SeRunError(RuntimeError): def run_service_hub( *, - engine_python_module: str, - engine_display_name: Optional[str] = None, + hub_python_module: str, + hub_display_name: Optional[str] = None, afid_events: list[AfidEvent], afid_sag_path: str, rf_events: list[Any], cper_data: Optional[dict[str, Any]] = None, hub_options: Optional[dict[str, Any]] = None, - engine_analyze_method: str = "get_service_info", - engine_init_path_kwarg: str = "afid_sag", + hub_analyze_method: str = "get_service_info", + hub_init_path_kwarg: str = "afid_sag", ) -> ServiceabilityBlock: """Run the configured Python service hub and return a :class:`ServiceabilityBlock`. 
- The runner imports ``engine_python_module``, picks the unique class that implements - ``engine_analyze_method``, constructs it with the config file path passed as - ``engine_init_path_kwarg``, then calls the analyze method with ``rf_events`` and any + The runner imports ``hub_python_module``, picks the unique class that implements + ``hub_analyze_method``, constructs it with the config file path passed as + ``hub_init_path_kwarg``, then calls the analyze method with ``rf_events`` and any ``hub_options`` keys that match the method signature (plus ``cper_data`` when supported). Result mapping is handled by :func:`serviceability_block_from_service_result`. """ @@ -113,25 +113,25 @@ def run_service_hub( if not rf_events: raise SeRunError( - "Collected Redfish events are required; re-run collection or use skip_engine." + "Collected Redfish events are required; re-run collection or use skip_hub." ) - label = engine_display_name or engine_python_module + label = hub_display_name or hub_python_module try: - mod = importlib.import_module(engine_python_module) + mod = importlib.import_module(hub_python_module) except ImportError as exc: - raise SeRunError(f"Cannot import {engine_python_module}: {exc}") from exc + raise SeRunError(f"Cannot import {hub_python_module}: {exc}") from exc - hub_cls = _resolve_hub_class(mod, engine_analyze_method) + hub_cls = _resolve_hub_class(mod, hub_analyze_method) try: instance = _instantiate_hub( hub_cls, afid_sag_path, - engine_init_path_kwarg, + hub_init_path_kwarg, hub_options, ) - analyze = getattr(instance, engine_analyze_method) + analyze = getattr(instance, hub_analyze_method) result = _call_hub_analyze( analyze, rf_events, @@ -139,7 +139,7 @@ def run_service_hub( hub_options, ) except Exception as exc: - raise SeRunError(f"{label} {engine_analyze_method}() failed: {exc}") from exc + raise SeRunError(f"{label} {hub_analyze_method}() failed: {exc}") from exc if result is None: return ServiceabilityBlock( @@ -151,7 +151,7 @@ def 
run_service_hub( return serviceability_block_from_service_result( afid_events, result, - engine_label=label, + hub_label=label, rf_event_count=len(rf_events), ) @@ -188,7 +188,7 @@ def add_candidate(obj: Any) -> None: if not candidates: raise SeRunError( f"No class with {analyze_method}() found in {package}; " - "check engine_python_module and engine_analyze_method in analysis_args." + "check hub_python_module and hub_analyze_method in analysis_args." ) names = ", ".join(cls.__name__ for cls in candidates) raise SeRunError(f"Multiple classes with {analyze_method}() in {package}: {names}.") diff --git a/test/unit/mock_python_engine.py b/test/unit/mock_python_engine.py index 515eea38..f48a7e43 100644 --- a/test/unit/mock_python_engine.py +++ b/test/unit/mock_python_engine.py @@ -1,4 +1,4 @@ -"""Mock Python service engine for unit tests.""" +"""Mock Python service hub for unit tests.""" from __future__ import annotations @@ -6,7 +6,7 @@ from typing import Any, Optional from serviceability_dummy_data import ( - DUMMY_ENGINE_VERSION, + DUMMY_HUB_VERSION, DUMMY_SAG_PID, DUMMY_SAG_REVISION, DUMMY_SERVICE_ACTION_NUM, @@ -39,5 +39,5 @@ def get_service_info( return SimpleNamespace( service_info=service_info, afid_sag_metadata={"sag_pid": DUMMY_SAG_PID, "sag_revision": DUMMY_SAG_REVISION}, - engine_version_info={"version": DUMMY_ENGINE_VERSION}, + engine_version_info={"version": DUMMY_HUB_VERSION}, ) diff --git a/test/unit/plugin/test_afid_events_bmc_schema.py b/test/unit/plugin/test_afid_events_bmc_schema.py index 7c54364f..8529577c 100644 --- a/test/unit/plugin/test_afid_events_bmc_schema.py +++ b/test/unit/plugin/test_afid_events_bmc_schema.py @@ -4,10 +4,41 @@ # # Copyright (c) 2026 Advanced Micro Devices, Inc. 
# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# ############################################################################### """AFID / serviceable unit extraction for OpenBMC-style LogEntry payloads.""" from __future__ import annotations +from serviceability_dummy_data import ( + DUMMY_AFID_A, + DUMMY_AFID_BELOW_RF, + DUMMY_AFID_FATAL_HBM, + DUMMY_TIMESTAMP, + DUMMY_UNIT_A, + DUMMY_UNIT_B, + DUMMY_UNIT_C, + dummy_fatal_hbm_log_entry, + dummy_openbmc_log_entry, + dummy_openbmc_log_entry_serviceable_units_only, +) + from nodescraper.plugins.serviceability.afid_events import ( _afid_event_from_rf_member, build_afid_events_from_data, @@ -16,95 +47,36 @@ ServiceabilityDataModel, ) -# Shape from after_clear_rma_case.json: AFID under Oem.AMDFieldIdentifiers[], OOC under Links. 
-_SAMPLE_LOG_ENTRY = { - "@odata.id": "/redfish/v1/Systems/UBB/LogServices/EventLog/Entries/1", - "Created": "2026-06-16T20:25:22+00:00", - "Id": "1", - "Links": { - "OriginOfCondition": { - "@odata.id": "/redfish/v1/Chassis/OAM_7", - } - }, - "Oem": { - "AMDFieldIdentifiers": [ - { - "AFID": 22, - "Description": "On-die ECC, Uncorrected, Non-fatal", - "ServiceableUnits": [ - {"@odata.id": "/redfish/v1/Chassis/OAM_7"}, - ], - "ServiceableUnits@odata.count": 1, - } - ], - "AMDFieldIdentifiers@Members.count": 1, - }, -} - def test_afid_event_from_openbmc_log_entry_with_links_and_amd_field_identifiers(): - ev = _afid_event_from_rf_member(_SAMPLE_LOG_ENTRY) + ev = _afid_event_from_rf_member(dummy_openbmc_log_entry()) assert ev is not None - assert ev.afid == 22 - assert ev.serviceable_unit == "OAM_7" - assert "2026-06-16" in ev.time + assert ev.afid == DUMMY_AFID_BELOW_RF + assert ev.serviceable_unit == DUMMY_UNIT_A + assert DUMMY_TIMESTAMP[:10] in ev.time def test_serviceable_unit_from_oem_serviceable_units_when_no_links(): - member = { - "Created": "2026-06-16T20:25:22+00:00", - "Oem": { - "AMDFieldIdentifiers": [ - { - "AFID": 23, - "ServiceableUnits": [ - {"@odata.id": "/redfish/v1/Chassis/OAM_3"}, - ], - } - ], - }, - } - ev = _afid_event_from_rf_member(member) + ev = _afid_event_from_rf_member(dummy_openbmc_log_entry_serviceable_units_only()) assert ev is not None - assert ev.afid == 23 - assert ev.serviceable_unit == "OAM_3" - - -# Minimal slice of smci350 command_artifacts.json first CPER row (Links + AMDFieldIdentifiers[]). 
-_SMCI350_STYLE_ENTRY = { - "Created": "2026-06-16T18:53:21+00:00", - "Id": "1", - "Links": { - "OriginOfCondition": {"@odata.id": "/redfish/v1/Chassis/OAM_2"}, - }, - "Oem": { - "AMDFieldIdentifiers": [ - { - "AFID": 25, - "Description": "All Other HBM, Fatal", - "ServiceableUnits": [{"@odata.id": "/redfish/v1/Chassis/OAM_2"}], - "ServiceableUnits@odata.count": 1, - } - ], - "AMDFieldIdentifiers@Members.count": 1, - }, -} + assert ev.afid == DUMMY_AFID_A + assert ev.serviceable_unit == DUMMY_UNIT_B -def test_afid_event_smci350_style_fatal_hbm_entry(): - ev = _afid_event_from_rf_member(_SMCI350_STYLE_ENTRY) +def test_afid_event_fatal_hbm_log_entry(): + ev = _afid_event_from_rf_member(dummy_fatal_hbm_log_entry()) assert ev is not None - assert ev.afid == 25 - assert ev.serviceable_unit == "OAM_2" + assert ev.afid == DUMMY_AFID_FATAL_HBM + assert ev.serviceable_unit == DUMMY_UNIT_C def test_build_afid_events_from_data_includes_openbmc_entries(): data = ServiceabilityDataModel( - rf_events=[_SAMPLE_LOG_ENTRY, _SMCI350_STYLE_ENTRY], + rf_events=[dummy_openbmc_log_entry(), dummy_fatal_hbm_log_entry()], cper_data={}, ) events = build_afid_events_from_data(data) assert len(events) == 2 by_afid_oam = {(e.afid, e.serviceable_unit) for e in events} - assert (22, "OAM_7") in by_afid_oam - assert (25, "OAM_2") in by_afid_oam + assert (DUMMY_AFID_BELOW_RF, DUMMY_UNIT_A) in by_afid_oam + assert (DUMMY_AFID_FATAL_HBM, DUMMY_UNIT_C) in by_afid_oam diff --git a/test/unit/plugin/test_mi3xx_collector.py b/test/unit/plugin/test_mi3xx_collector.py index 625d1165..1cddc2f3 100644 --- a/test/unit/plugin/test_mi3xx_collector.py +++ b/test/unit/plugin/test_mi3xx_collector.py @@ -27,10 +27,17 @@ from pydantic import ValidationError from serviceability_dummy_data import ( DUMMY_BMC_HOST, + DUMMY_CPER_BYTES_BASIC, + DUMMY_CPER_BYTES_RF, + DUMMY_CPER_EVENT_ID_BASIC, + DUMMY_CPER_EVENT_ID_RF, DUMMY_EVENT_URI, DUMMY_EVENT_URI_ALT, DUMMY_TIMESTAMP_EARLIER, DUMMY_TIMESTAMP_LATER, + 
dummy_cper_basic_member, + dummy_cper_rf_member, + dummy_cper_skip_member, ) from nodescraper.connection.redfish import RF_MEMBERS, RedfishGetResult @@ -50,7 +57,6 @@ is_valid_iso_datetime, satisfies_time_check, ) -from nodescraper.plugins.serviceability.mi3xx.mi3xx_cper_utils import RF_CPER_AFID_MIN EVENT_URI = DUMMY_EVENT_URI @@ -176,29 +182,22 @@ def test_mi3xx_collector_fetches_cper_attachments(mi3xx_collector, redfish_conn_ redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( path=EVENT_URI, success=True, - data={ - RF_MEMBERS: [ - { - "Id": "cper-evt-1", - "Created": DUMMY_TIMESTAMP_LATER, - "DiagnosticDataType": "CPER", - "AdditionalDataURI": "/redfish/v1/Systems/UBB/LogServices/EventLog/Attachments/1", - } - ] - }, + data={RF_MEMBERS: [dummy_cper_basic_member()]}, status_code=200, ) response = MagicMock() response.ok = True response.status_code = 200 - response.content = b"\x01\x02dummy-cper" + response.content = DUMMY_CPER_BYTES_BASIC redfish_conn_mock.get_response.return_value = response args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) result, data = mi3xx_collector.collect_data(args=args) assert result.status == ExecutionStatus.OK assert data is not None - assert data.cper_raw["cper-evt-1"] == base64.b64encode(b"\x01\x02dummy-cper").decode("ascii") + assert data.cper_raw[DUMMY_CPER_EVENT_ID_BASIC] == base64.b64encode( + DUMMY_CPER_BYTES_BASIC + ).decode("ascii") assert data.cper_data == {} @@ -209,25 +208,7 @@ def test_mi3xx_collector_skips_cper_when_aca_serial_and_low_afids( redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( path=EVENT_URI, success=True, - data={ - RF_MEMBERS: [ - { - "Id": "cper-evt-skip", - "Created": DUMMY_TIMESTAMP_LATER, - "DiagnosticDataType": "CPER", - "AdditionalDataURI": "/redfish/v1/Systems/UBB/LogServices/EventLog/Attachments/1", - "Oem": { - "AMDFieldIdentifiers": [{"AFID": 22}], - "ErrDataArr": [ - { - "DecodedData": {"error_type": "On-die ECC"}, - "MetaData": {"SerialNumber": "692545012569"}, 
- } - ], - }, - } - ] - }, + data={RF_MEMBERS: [dummy_cper_skip_member()]}, status_code=200, ) args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) @@ -246,38 +227,22 @@ def test_mi3xx_collector_fetches_cper_when_rf_afid(mi3xx_collector, redfish_conn redfish_conn_mock.run_get_paged.return_value = RedfishGetResult( path=EVENT_URI, success=True, - data={ - RF_MEMBERS: [ - { - "Id": "cper-evt-rf", - "Created": DUMMY_TIMESTAMP_LATER, - "DiagnosticDataType": "CPER", - "AdditionalDataURI": "/redfish/v1/Systems/UBB/LogServices/EventLog/Attachments/2", - "Oem": { - "AMDFieldIdentifiers": [{"AFID": RF_CPER_AFID_MIN}], - "ErrDataArr": [ - { - "DecodedData": {"error_type": "x"}, - "MetaData": {"SerialNumber": "692545012569"}, - } - ], - }, - } - ] - }, + data={RF_MEMBERS: [dummy_cper_rf_member()]}, status_code=200, ) response = MagicMock() response.ok = True response.status_code = 200 - response.content = b"\xaa\xbb" + response.content = DUMMY_CPER_BYTES_RF redfish_conn_mock.get_response.return_value = response args = MI3XXCollectorArgs(rf_event_log_uri=EVENT_URI) result, data = mi3xx_collector.collect_data(args=args) assert result.status == ExecutionStatus.OK assert data is not None - assert data.cper_raw["cper-evt-rf"] == base64.b64encode(b"\xaa\xbb").decode("ascii") + assert data.cper_raw[DUMMY_CPER_EVENT_ID_RF] == base64.b64encode(DUMMY_CPER_BYTES_RF).decode( + "ascii" + ) redfish_conn_mock.get_response.assert_called_once() @@ -319,11 +284,11 @@ def test_mi3xx_result_reporting_versions(): plugin_name="dummy_plugin", plugin_version="0.0-dummy", node_scraper_version="0.0-dummy", - dummy_engine_version="0.0-dummy", + dummy_hub_version="0.0-dummy", ) result = MI3XXResult(node="dummy-node", **version_fields) assert result.plugin_name == "dummy_plugin" - assert result.reporter_extensions["dummy_engine_version"] == "0.0-dummy" + assert result.reporter_extensions["dummy_hub_version"] == "0.0-dummy" def test_mi3xx_data_model_log_model(tmp_path): diff --git 
a/test/unit/plugin/test_mi3xx_cper_utils.py b/test/unit/plugin/test_mi3xx_cper_utils.py index e5de352d..b156b930 100644 --- a/test/unit/plugin/test_mi3xx_cper_utils.py +++ b/test/unit/plugin/test_mi3xx_cper_utils.py @@ -24,41 +24,37 @@ # ############################################################################### import pytest +from serviceability_dummy_data import ( + DUMMY_AFID_B, + DUMMY_AFID_BELOW_RF, + DUMMY_RF_CPER_AFID, + dummy_aca_err_row, +) from nodescraper.plugins.serviceability.mi3xx.mi3xx_cper_utils import ( - RF_CPER_AFID_MIN, event_aca_includes_serial, event_afids_from_oem, event_has_aca_decode, should_skip_cper_fetch_or_decode, ) -_DUMMY_META_SERIAL = "DUMMY-GPU-SERIAL-0001" -_DUMMY_DECODED_FIELD = "dummy_error_type" - - -def _oem_err_row(*, serial: bool = True, decoded: bool = True): - meta = {"SerialNumber": _DUMMY_META_SERIAL} if serial else {"GpuFw": "dummy-fw"} - dec = {"error_type": _DUMMY_DECODED_FIELD} if decoded else {} - return {"DecodedData": dec, "MetaData": meta} - def test_skip_when_afids_below_threshold_and_aca_has_serial(): event = { "Oem": { - "AMDFieldIdentifiers": [{"AFID": 22}], - "ErrDataArr": [_oem_err_row()], + "AMDFieldIdentifiers": [{"AFID": DUMMY_AFID_BELOW_RF}], + "ErrDataArr": [dummy_aca_err_row()], } } - assert event_afids_from_oem(event) == [22] + assert event_afids_from_oem(event) == [DUMMY_AFID_BELOW_RF] assert should_skip_cper_fetch_or_decode(event) is True def test_no_skip_when_rf_range_afid_even_with_aca_serial(): event = { "Oem": { - "AMDFieldIdentifiers": [{"AFID": RF_CPER_AFID_MIN}], - "ErrDataArr": [_oem_err_row()], + "AMDFieldIdentifiers": [{"AFID": DUMMY_RF_CPER_AFID}], + "ErrDataArr": [dummy_aca_err_row()], } } assert should_skip_cper_fetch_or_decode(event) is False @@ -67,8 +63,8 @@ def test_no_skip_when_rf_range_afid_even_with_aca_serial(): def test_skip_when_aca_decode_without_serial(): event = { "Oem": { - "AMDFieldIdentifiers": [{"AFID": RF_CPER_AFID_MIN}], - "ErrDataArr": 
[_oem_err_row(serial=False)], + "AMDFieldIdentifiers": [{"AFID": DUMMY_RF_CPER_AFID}], + "ErrDataArr": [dummy_aca_err_row(serial=False)], } } assert event_has_aca_decode(event) is True @@ -79,7 +75,7 @@ def test_skip_when_aca_decode_without_serial(): def test_no_skip_when_no_err_data_decoded(): event = { "Oem": { - "AMDFieldIdentifiers": [{"AFID": 22}], + "AMDFieldIdentifiers": [{"AFID": DUMMY_AFID_BELOW_RF}], } } assert should_skip_cper_fetch_or_decode(event) is False @@ -88,7 +84,7 @@ def test_no_skip_when_no_err_data_decoded(): def test_no_skip_when_aca_serial_but_no_afid_list(): event = { "Oem": { - "ErrDataArr": [_oem_err_row()], + "ErrDataArr": [dummy_aca_err_row()], } } assert event_afids_from_oem(event) == [] @@ -98,11 +94,11 @@ def test_no_skip_when_aca_serial_but_no_afid_list(): @pytest.mark.parametrize( "afids,expect_skip", [ - ([22, 28], True), - ([22, RF_CPER_AFID_MIN], False), + ([DUMMY_AFID_BELOW_RF, DUMMY_AFID_B], True), + ([DUMMY_AFID_BELOW_RF, DUMMY_RF_CPER_AFID], False), ], ) def test_skip_requires_all_afids_below_rf_threshold(afids, expect_skip): identifiers = [{"AFID": a} for a in afids] - event = {"Oem": {"AMDFieldIdentifiers": identifiers, "ErrDataArr": [_oem_err_row()]}} + event = {"Oem": {"AMDFieldIdentifiers": identifiers, "ErrDataArr": [dummy_aca_err_row()]}} assert should_skip_cper_fetch_or_decode(event) is expect_skip diff --git a/test/unit/plugin/test_se_runner.py b/test/unit/plugin/test_se_runner.py index 01f8c4bc..554f0ccc 100644 --- a/test/unit/plugin/test_se_runner.py +++ b/test/unit/plugin/test_se_runner.py @@ -36,7 +36,7 @@ DUMMY_AFID_C, DUMMY_DESIGNATION_A, DUMMY_DESIGNATION_B, - DUMMY_ENGINE_VERSION, + DUMMY_HUB_VERSION, DUMMY_OEM_VENDOR, DUMMY_RF_EVENT_COUNT, DUMMY_SAG_PID, @@ -86,18 +86,18 @@ def test_normalize_se_timestamp_preserves_format_value(): def test_analyzer_args_require_hub_config(): with pytest.raises(ValidationError): ServiceabilityAnalyzerArgs() - with pytest.raises(ValidationError, match="engine_python_module"): 
+ with pytest.raises(ValidationError, match="hub_python_module"): ServiceabilityAnalyzerArgs(afid_sag_path=str(AFID_SAG)) args = ServiceabilityAnalyzerArgs( - engine_python_module="dummy.test.module", + hub_python_module="dummy.test.module", afid_sag_path=str(AFID_SAG), ) - assert args.engine_python_module == "dummy.test.module" + assert args.hub_python_module == "dummy.test.module" def test_resolved_hub_options_explicit_fields_override_options_bag(): args = ServiceabilityAnalyzerArgs( - engine_python_module="dummy.test.module", + hub_python_module="dummy.test.module", afid_sag_path=str(AFID_SAG), hub_options={"from_ac_cycle": 9, "extra": 1}, from_ac_cycle=3, @@ -158,12 +158,12 @@ def test_serviceability_block_from_service_result(): }, }, afid_sag_metadata={"sag_pid": DUMMY_SAG_PID, "sag_revision": DUMMY_SAG_REVISION}, - engine_version_info={"version": DUMMY_ENGINE_VERSION}, + engine_version_info={"version": DUMMY_HUB_VERSION}, ) block = serviceability_block_from_service_result( EXAMPLE_EVENTS[:1], result, - engine_label="Dummy test engine", + hub_label="Dummy test hub", rf_event_count=DUMMY_RF_EVENT_COUNT, ) assert len(block.solution) == 1 @@ -171,12 +171,12 @@ def test_serviceability_block_from_service_result(): assert block.solution[0].service_action_num == DUMMY_SERVICE_ACTION_NUM assert block.solution[0].service_action_title == "Dummy service action" assert set(block.solution[0].serviceable_unit) == {DUMMY_DESIGNATION_A, DUMMY_DESIGNATION_B} - assert block.hub_version == DUMMY_ENGINE_VERSION + assert block.hub_version == DUMMY_HUB_VERSION assert block.afid_sag_file_version is not None assert DUMMY_SAG_PID in block.afid_sag_file_version assert DUMMY_SAG_REVISION in block.afid_sag_file_version assert f"{DUMMY_RF_EVENT_COUNT} Redfish event(s)" in block.solution_reasoning - assert "Dummy test engine" in block.solution_reasoning + assert "Dummy test hub" in block.solution_reasoning def test_serviceability_block_from_service_result_isa_version_info(): @@ -188,7 
+188,7 @@ def test_serviceability_block_from_service_result_isa_version_info(): block = serviceability_block_from_service_result( EXAMPLE_EVENTS[:1], result, - engine_label="ISA", + hub_label="ISA", rf_event_count=1, ) assert block.hub_version == "1.2.3" @@ -220,7 +220,7 @@ def test_run_service_hub_with_mock_module(): {"Afid": DUMMY_AFID_C, "serviceable_unit": DUMMY_UNIT_C, "Created": DUMMY_TIMESTAMP}, ] block = run_service_hub( - engine_python_module="mock_python_engine", + hub_python_module="mock_python_engine", afid_events=EXAMPLE_EVENTS[:2], afid_sag_path=str(AFID_SAG), rf_events=rf_events, @@ -251,14 +251,14 @@ def analyze_events(self, rf_events, cper_data=None): sys.modules["alt_service_engine"] = mod try: run_service_hub( - engine_python_module="alt_service_engine", + hub_python_module="alt_service_engine", afid_events=EXAMPLE_EVENTS[:1], afid_sag_path=str(AFID_SAG), rf_events=[{"Afid": 1}], cper_data={"k": 1}, hub_options={"debug": True}, - engine_analyze_method="analyze_events", - engine_init_path_kwarg="rulebook_path", + hub_analyze_method="analyze_events", + hub_init_path_kwarg="rulebook_path", ) finally: del sys.modules["alt_service_engine"] @@ -273,7 +273,7 @@ def test_run_service_hub_accepts_hub_options(): {"Afid": DUMMY_AFID_A, "serviceable_unit": DUMMY_UNIT_A, "Created": DUMMY_TIMESTAMP}, ] block = run_service_hub( - engine_python_module="mock_python_engine", + hub_python_module="mock_python_engine", afid_events=EXAMPLE_EVENTS[:1], afid_sag_path=str(AFID_SAG), rf_events=rf_events, @@ -290,7 +290,7 @@ def test_run_service_hub_forwards_full_hub_options_kwargs(): {"Afid": DUMMY_AFID_A, "serviceable_unit": DUMMY_UNIT_A, "Created": DUMMY_TIMESTAMP}, ] run_service_hub( - engine_python_module="instinct_shaped_engine", + hub_python_module="instinct_shaped_engine", afid_events=EXAMPLE_EVENTS[:1], afid_sag_path=str(AFID_SAG), rf_events=rf_events, @@ -318,7 +318,7 @@ def test_run_service_hub_collected_cper_overrides_hub_options_cper_data(): {"Afid": 
DUMMY_AFID_A, "serviceable_unit": DUMMY_UNIT_A, "Created": DUMMY_TIMESTAMP}, ] run_service_hub( - engine_python_module="instinct_shaped_engine", + hub_python_module="instinct_shaped_engine", afid_events=EXAMPLE_EVENTS[:1], afid_sag_path=str(AFID_SAG), rf_events=rf_events, @@ -331,7 +331,7 @@ def test_run_service_hub_collected_cper_overrides_hub_options_cper_data(): def test_run_service_hub_missing_sag_raises(): with pytest.raises(SeRunError, match="Hub config file not found"): run_service_hub( - engine_python_module="mock_python_engine", + hub_python_module="mock_python_engine", afid_events=EXAMPLE_EVENTS, afid_sag_path="/nonexistent/dummy_afid_sag.json", rf_events=[{"Afid": DUMMY_AFID_A}], @@ -363,7 +363,7 @@ def test_build_afid_events_from_rf_members(): assert events[1].afid == DUMMY_AFID_B -def test_mi3xx_analyzer_runs_python_engine(system_info): +def test_mi3xx_analyzer_runs_python_hub(system_info): data = ServiceabilityDataModel( rf_events=[ { @@ -380,7 +380,7 @@ def test_mi3xx_analyzer_runs_python_engine(system_info): ) analyzer = MI3XXAnalyzer(system_info=system_info) args = ServiceabilityAnalyzerArgs( - engine_python_module="mock_python_engine", + hub_python_module="mock_python_engine", afid_sag_path=str(AFID_SAG), hub_options={"include_raw_events": False}, ) diff --git a/test/unit/serviceability_dummy_data.py b/test/unit/serviceability_dummy_data.py index 0542c866..379727d1 100644 --- a/test/unit/serviceability_dummy_data.py +++ b/test/unit/serviceability_dummy_data.py @@ -1,8 +1,40 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### """Shared dummy values for serviceability unit tests (not production data).""" +from __future__ import annotations + +from typing import Any + DUMMY_AFID_A = 9001 DUMMY_AFID_B = 9002 DUMMY_AFID_C = 9003 +DUMMY_AFID_BELOW_RF = 22 +DUMMY_AFID_FATAL_HBM = 25 +DUMMY_RF_CPER_AFID = 10000 DUMMY_SERVICE_ACTION_NUM = 99 DUMMY_SERVICE_ACTION_TITLE = "Dummy service action" DUMMY_UNIT_A = "dummy_unit_a" @@ -12,12 +44,137 @@ DUMMY_DESIGNATION_B = "DUMMY_SLOT_B" DUMMY_EVENT_URI = "/redfish/v1/Systems/Dummy/LogServices/DummyEventLog/Entries" DUMMY_EVENT_URI_ALT = "/redfish/v1/Systems/Dummy/LogServices/DummyEventLog/EntriesAlt" +DUMMY_EVENT_LOG_BASE = "/redfish/v1/Systems/Dummy/LogServices/DummyEventLog" +DUMMY_CPER_ATTACHMENT_URI_1 = f"{DUMMY_EVENT_LOG_BASE}/Attachments/1" +DUMMY_CPER_ATTACHMENT_URI_2 = f"{DUMMY_EVENT_LOG_BASE}/Attachments/2" DUMMY_TIMESTAMP = "2000-01-01T12:00:00+00:00" DUMMY_TIMESTAMP_EARLIER = "1999-12-31T12:00:00+00:00" DUMMY_TIMESTAMP_LATER = "2000-01-02T12:00:00+00:00" DUMMY_RF_EVENT_COUNT = 2 DUMMY_SAG_PID = "dummy-sag-pid" DUMMY_SAG_REVISION = "dummy-rev-0" -DUMMY_ENGINE_VERSION = "0.0.0-dummy" +DUMMY_HUB_VERSION = "0.0.0-dummy" DUMMY_BMC_HOST = "dummy-bmc.example" DUMMY_OEM_VENDOR = "DummyVendor" +DUMMY_GPU_SERIAL_NUMBER = "DUMMY-GPU-SERIAL-0001" +DUMMY_DECODED_ERROR_TYPE = "dummy_error_type" +DUMMY_RF_EVENT_ID_1 = "dummy-rf-evt-1" +DUMMY_RF_EVENT_ID_2 = "dummy-rf-evt-2" +DUMMY_CPER_EVENT_ID_BASIC = "dummy-cper-evt-1" +DUMMY_CPER_EVENT_ID_SKIP = "dummy-cper-evt-skip" +DUMMY_CPER_EVENT_ID_RF = "dummy-cper-evt-rf" +DUMMY_CPER_BYTES_BASIC = b"\x01\x02dummy-cper" +DUMMY_CPER_BYTES_RF = b"\xaa\xbb" + + +def dummy_chassis_uri(unit: str) -> str: + return f"/redfish/v1/Chassis/{unit}" + + +def dummy_aca_err_row(*, serial: bool = True, decoded: bool = True) -> dict[str, Any]: + meta = {"SerialNumber": DUMMY_GPU_SERIAL_NUMBER} if serial else {"GpuFw": "dummy-fw"} + decoded_data = 
{"error_type": DUMMY_DECODED_ERROR_TYPE} if decoded else {} + return {"DecodedData": decoded_data, "MetaData": meta} + + +def dummy_cper_rf_member() -> dict[str, Any]: + """RF-range AFID with ACA decode + serial (CPER attachment fetch expected).""" + return { + "Id": DUMMY_CPER_EVENT_ID_RF, + "Created": DUMMY_TIMESTAMP_LATER, + "DiagnosticDataType": "CPER", + "AdditionalDataURI": DUMMY_CPER_ATTACHMENT_URI_2, + "Oem": { + "AMDFieldIdentifiers": [{"AFID": DUMMY_RF_CPER_AFID}], + "ErrDataArr": [dummy_aca_err_row()], + }, + } + + +def dummy_cper_skip_member() -> dict[str, Any]: + """Low AFID with ACA decode + serial (CPER attachment fetch skipped).""" + return { + "Id": DUMMY_CPER_EVENT_ID_SKIP, + "Created": DUMMY_TIMESTAMP_LATER, + "DiagnosticDataType": "CPER", + "AdditionalDataURI": DUMMY_CPER_ATTACHMENT_URI_1, + "Oem": { + "AMDFieldIdentifiers": [{"AFID": DUMMY_AFID_BELOW_RF}], + "ErrDataArr": [ + { + "DecodedData": {"error_type": "dummy_on_die_ecc"}, + "MetaData": {"SerialNumber": DUMMY_GPU_SERIAL_NUMBER}, + } + ], + }, + } + + +def dummy_cper_basic_member() -> dict[str, Any]: + """CPER event without OEM ACA block (attachment fetch expected).""" + return { + "Id": DUMMY_CPER_EVENT_ID_BASIC, + "Created": DUMMY_TIMESTAMP_LATER, + "DiagnosticDataType": "CPER", + "AdditionalDataURI": DUMMY_CPER_ATTACHMENT_URI_1, + } + + +def dummy_openbmc_log_entry() -> dict[str, Any]: + """OpenBMC-style LogEntry with Links OOC and AMDFieldIdentifiers[].""" + return { + "@odata.id": f"{DUMMY_EVENT_URI}/1", + "Created": DUMMY_TIMESTAMP, + "Id": DUMMY_RF_EVENT_ID_1, + "Links": { + "OriginOfCondition": {"@odata.id": dummy_chassis_uri(DUMMY_UNIT_A)}, + }, + "Oem": { + "AMDFieldIdentifiers": [ + { + "AFID": DUMMY_AFID_BELOW_RF, + "Description": "dummy on-die ECC, uncorrected, non-fatal", + "ServiceableUnits": [{"@odata.id": dummy_chassis_uri(DUMMY_UNIT_A)}], + "ServiceableUnits@odata.count": 1, + } + ], + "AMDFieldIdentifiers@Members.count": 1, + }, + } + + +def 
dummy_openbmc_log_entry_serviceable_units_only() -> dict[str, Any]: + """LogEntry with ServiceableUnits only (no Links OOC).""" + return { + "Created": DUMMY_TIMESTAMP, + "Oem": { + "AMDFieldIdentifiers": [ + { + "AFID": DUMMY_AFID_A, + "ServiceableUnits": [{"@odata.id": dummy_chassis_uri(DUMMY_UNIT_B)}], + } + ], + }, + } + + +def dummy_fatal_hbm_log_entry() -> dict[str, Any]: + """Minimal CPER-style row with Links + AMDFieldIdentifiers[].""" + return { + "Created": DUMMY_TIMESTAMP_LATER, + "Id": DUMMY_RF_EVENT_ID_2, + "Links": { + "OriginOfCondition": {"@odata.id": dummy_chassis_uri(DUMMY_UNIT_C)}, + }, + "Oem": { + "AMDFieldIdentifiers": [ + { + "AFID": DUMMY_AFID_FATAL_HBM, + "Description": "dummy fatal HBM", + "ServiceableUnits": [{"@odata.id": dummy_chassis_uri(DUMMY_UNIT_C)}], + "ServiceableUnits@odata.count": 1, + } + ], + "AMDFieldIdentifiers@Members.count": 1, + }, + } From f8e7e47e25fb84bc2069e4d8dbe91f03cb8e53a4 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 22 Jun 2026 17:08:35 -0500 Subject: [PATCH 31/39] Add instinct_shaped_engine test helper for hub options forwarding --- test/unit/instinct_shaped_engine.py | 68 +++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 test/unit/instinct_shaped_engine.py diff --git a/test/unit/instinct_shaped_engine.py b/test/unit/instinct_shaped_engine.py new file mode 100644 index 00000000..6fa7f234 --- /dev/null +++ b/test/unit/instinct_shaped_engine.py @@ -0,0 +1,68 @@ +############################################################################### +# +# MIT License +# +# Copyright (c) 2026 Advanced Micro Devices, Inc. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +############################################################################### + +from __future__ import annotations + +from typing import Any, Optional + +__all__ = ["InstinctShapedEngine"] + +_LAST_CALL: dict[str, Any] = {} + + +def clear_last_call() -> None: + _LAST_CALL.clear() + + +def get_last_call() -> dict[str, Any]: + return dict(_LAST_CALL) + + +class InstinctShapedEngine: + """Mirrors keyword parameters of ``InstinctServiceAssistant.get_service_info``.""" + + def __init__(self, afid_sag: str) -> None: + self.afid_sag = afid_sag + + def get_service_info( + self, + rf_events: list[Any], + from_ac_cycle: int = -1, + from_date: Optional[str] = None, + cper_data: Optional[dict[str, Any]] = None, + designation_serials: Optional[dict[str, str]] = None, + suppress_service_actions: Optional[list[str]] = None, + ) -> None: + _LAST_CALL.clear() + _LAST_CALL.update( + from_ac_cycle=from_ac_cycle, + from_date=from_date, + cper_data=cper_data, + designation_serials=designation_serials, + suppress_service_actions=suppress_service_actions, + rf_len=len(rf_events), + ) + return None From fb3d52c2424cab39b855b8f7cbe6812a07491e38 Mon Sep 17 00:00:00 2001 From: "Chen, Richard" Date: Mon, 22 Jun 2026 15:31:46 -0700 Subject: [PATCH 32/39] Add py.typed file --- nodescraper/py.typed | 0 pyproject.toml | 3 +++ 2 files changed, 3 insertions(+) create mode 100644 nodescraper/py.typed diff --git a/nodescraper/py.typed b/nodescraper/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/pyproject.toml b/pyproject.toml index 9e24d056..8de848a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,6 +62,9 @@ include = ['nodescraper'] [tool.setuptools] include-package-data = true +[tool.setuptools.package-data] +nodescraper = ["py.typed"] + [project.scripts] node-scraper = "nodescraper.cli:cli_entry" From af8a3f3e756f66fe639f7ab71fbb8e43d24d0a35 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Mon, 22 Jun 2026 18:25:42 -0500 Subject: [PATCH 33/39] updated for 
documentation to include serviceability plugin --- .mypy.ini | 1 + .pre-commit-config.yaml | 2 +- docs/PLUGIN_DOC.md | 90 +++++++++++++++++++ docs/generate_plugin_doc_bundle.py | 2 +- .../mi3xx/serviceability_plugin_mi3xx.py | 4 +- pyproject.toml | 2 + 6 files changed, 98 insertions(+), 3 deletions(-) diff --git a/.mypy.ini b/.mypy.ini index f9d68f19..cf6c2344 100644 --- a/.mypy.ini +++ b/.mypy.ini @@ -1,5 +1,6 @@ [mypy] # Global mypy configuration +mypy_path = test/unit [mypy-nodescraper.base.regexanalyzer] ignore_errors = True diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9919bb08..85a64e4f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,5 +28,5 @@ repos: rev: v1.15.0 hooks: - id: mypy - args: [--install-types, --non-interactive, --allow-redefinition] + args: [--install-types, --non-interactive, --explicit-package-bases, --allow-redefinition] language: system diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md index 88c06e42..80d4e012 100644 --- a/docs/PLUGIN_DOC.md +++ b/docs/PLUGIN_DOC.md @@ -41,6 +41,8 @@ | OobBmcArchivePlugin | SSH (BMC) shell: tar+gzip archives for each path in collection_args (see PathSpec entries).
Uses sudo on the BMC when collection_args paths require elevated access. | - | **Collection Args:**
- `paths`: list[nodescraper.plugins.ooband.bmc_archive.collector_args.PathSpec] — Named BMC paths to archive with tar czf -. Configure in plugin config under plugins.OobBmcArchivePlugin.collection_ar...
- `sudo`: bool — Default sudo setting for paths that do not specify sudo.
- `timeout`: int — Default per-path tar timeout in seconds.
- `skip_if_missing`: bool — Skip paths that do not exist on the BMC instead of failing collection.
- `ignore_failed_read`: bool — When true, pass GNU tar's --ignore-failed-read when the remote tar supports it. | [BmcArchiveDataModel](#BmcArchiveDataModel-Model) | [BmcArchiveCollector](#Collector-Class-BmcArchiveCollector) | - | | RedfishEndpointPlugin | Redfish GET: explicit paths from collection_args.uris (parallel when max_workers>1).
Optional paged GET following the Members collection OData nextLink field when follow_next_link is true.
Redfish GET tree: when discover_tree is true, walks from api_root using OData resource id links and Members navigation (depth and endpoint caps from collection_args). | For each entry in analysis_args.checks, reads JSON paths in collected responses and compares values to constraints (eq, min/max, anyOf, regex, etc.).
URI key "*" runs checks against every collected response body.
**Analyzer Args:**
- `checks`: dict[str, dict[str, Union[int, float, str, bool, dict[str, Any]]]] — Map: URI or '*' -> { property_path: constraint }. URI keys must match a key in the collected responses (exact match).... | **Collection Args:**
- `uris`: list[str] — Redfish URIs to GET. Ignored when discover_tree is True.
- `discover_tree`: bool — If True, discover endpoints from the BMC Redfish tree (service root and links) instead of using uris.
- `tree_max_depth`: int — When discover_tree is True: max traversal depth (1=service root only, 2=root + collections, 3=+ members).
- `tree_max_endpoints`: int — When discover_tree is True: max endpoints to discover (0=no limit).
- `max_workers`: int — Max concurrent GETs (1=sequential). Use >1 for async endpoint fetches.
- `follow_next_link`: bool — If True, follow Redfish Members collection OData nextLink pagination for each URI and merge all pages into a single r...
- `max_pages`: int — When follow_next_link is True: safety cap on the number of pages to follow per URI (default 200). | [RedfishEndpointDataModel](#RedfishEndpointDataModel-Model) | [RedfishEndpointCollector](#Collector-Class-RedfishEndpointCollector) | [RedfishEndpointAnalyzer](#Data-Analyzer-Class-RedfishEndpointAnalyzer) | | RedfishOemDiagPlugin | Redfish LogService.CollectDiagnosticData for each entry in collection_args.oem_diagnostic_types (collection_args.log_service_path selects the LogService).
Optional binary archives under the plugin log path when log_path is set. | Summarizes success/failure per OEM diagnostic type from collected results.
When analysis_args.require_all_success is true, fails the run if any type failed collection.
**Analyzer Args:**
- `require_all_success`: bool — If True, analysis fails when any OEM type collection failed. | **Collection Args:**
- `log_service_path`: str — Redfish path to the LogService (e.g. DiagLogs).
- `oem_diagnostic_types_allowable`: Optional[list[str]] — Allowable OEM diagnostic types for this architecture/BMC. When set, used for validation and as default for oem_diagno...
- `oem_diagnostic_types`: list[str] — OEM diagnostic types to collect. When empty and oem_diagnostic_types_allowable is set, defaults to that list.
- `task_timeout_s`: int — Max seconds to wait for each BMC task. | [RedfishOemDiagDataModel](#RedfishOemDiagDataModel-Model) | [RedfishOemDiagCollector](#Collector-Class-RedfishOemDiagCollector) | [RedfishOemDiagAnalyzer](#Data-Analyzer-Class-RedfishOemDiagAnalyzer) | +| ServiceabilityPluginMI3XX | - | **Analyzer Args:**
- `hub_python_module`: Optional[str] — Import path for the hub module (class implements hub_analyze_method); hub_options forwards kwargs.
- `hub_display_name`: Optional[str] — Optional label for analyzer status messages.
- `afid_sag_path`: Optional[str] — Path to hub config (e.g. AFID_SAG.json); passed as hub_init_path_kwarg.
- `hub_init_path_kwarg`: str — Hub __init__ keyword that receives afid_sag_path.
- `hub_analyze_method`: str — Hub method called with rf_events first (default get_service_info).
- `skip_hub`: bool — If True, only build afid_events without running the service hub.
- `cper_decode_module`: Optional[str] — Module import path for CPER decoding when events include CPER attachments.
- `cper_decode_method`: str — Callable on cper_decode_module: file-like CPER in, (return_code, decode_dict) out.
- `hub_options`: Optional[dict[str, Any]] — Extra kwargs for hub __init__ and analyze; collected cper_data overrides cper_data key.
- `from_ac_cycle`: int — from_ac_cycle kwarg for the hub analyze call (merged after hub_options).
- `from_date`: Optional[str] — Optional from_date for the hub analyze call (merged after hub_options).
- `designation_serials`: Optional[dict[str, str]] — Optional designation_serials for the hub analyze call (merged after hub_options).
- `suppress_service_actions`: Optional[list[str]] — Optional suppress_service_actions for the hub analyze call (merged after hub_options). | **Collection Args:**
- `uri`: Optional[str] — Optional alias for ``rf_event_log_uri``. When both ``uri`` and ``rf_event_log_uri`` are explicitly set to non-empty v...
- `rf_event_log_uri`: str — Redfish URI for the event log ``Entries`` collection.
- `rf_chassis_devices`: Optional[List[str]] — Chassis designations for Assembly GETs; required with ``rf_assembly_uri_template``.
- `rf_assembly_uri_template`: Optional[str] — Redfish URI template containing ``{device}`` for each chassis Assembly resource.
- `rf_firmware_bundle_uri`: Optional[str] — Redfish URI for firmware bundle inventory when subclasses extract component details.
- `follow_next_link`: bool — If True, follow Members@odata.nextLink up to max_pages; else single GET.
- `max_pages`: int — Safety cap on the number of pages when following event log pagination.
- `top`: Optional[int] — Most recent N entries via $skip after count probe; None collects full window.
- `reference_time`: Optional[str] — Optional ISO-8601 date or date-time used with time_operator (e.g. 2026-05-17 or 2026-05-17T13:01:00).
- `time_operator`: Optional[Literal['>', '>=', '<', '<=', '==']] — Comparison operator applied when reference_time is set. | [ServiceabilityDataModel](#ServiceabilityDataModel-Model) | [MI3XXCollector](#Collector-Class-MI3XXCollector) | [MI3XXAnalyzer](#Data-Analyzer-Class-MI3XXAnalyzer) | +| ServiceabilityPluginBase | - | - | - | [ServiceabilityDataModel](#ServiceabilityDataModel-Model) | [ServiceabilityCollectorBase](#Collector-Class-ServiceabilityCollectorBase) | - | # Collectors @@ -1045,6 +1047,34 @@ RedfishOemDiagDataModel - Redfish LogService.CollectDiagnosticData for each entry in collection_args.oem_diagnostic_types (collection_args.log_service_path selects the LogService). - Optional binary archives under the plugin log path when log_path is set. +## Collector Class MI3XXCollector + +### Description + +MI3XX OOB Redfish serviceability collector. + +**Bases**: ['ServiceabilityCollectorBase'] + +**Link to code**: [mi3xx_collector.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py) + +### Provides Data + +ServiceabilityDataModel + +## Collector Class ServiceabilityCollectorBase + +### Description + +OOB Redfish collection skeleton; subclasses implement filtering, CPER handling, and JSON parsing. + +**Bases**: ['RedfishDataCollector', 'Generic'] + +**Link to code**: [serviceability_collector.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/serviceability/serviceability_collector.py) + +### Provides Data + +ServiceabilityDataModel + # Data Models ## GenericCollectionDataModel Model @@ -1549,6 +1579,30 @@ Collected Redfish OEM diagnostic log results: OEM type -> result (success, error - **results**: `dict[str, nodescraper.plugins.ooband.redfish_oem_diag.oem_diag_data.OemDiagTypeResult]` +## ServiceabilityDataModel Model + +### Description + +Collected Redfish responses and intermediate serviceability fields. 
+ +**Link to code**: [serviceability_data.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/serviceability/serviceability_data.py) + +**Bases**: ['DataModel'] + +### Model annotations and fields + +- **responses**: `dict[str, Any]` +- **rf_events**: `list[Any]` +- **assembly_info**: `Dict[str, DeviceInfo]` +- **cper_raw**: `Dict[str, str]` +- **cper_data**: `Dict[str, Any]` +- **component_details**: `Optional[str]` +- **log_path**: `Optional[str]` +- **bmc_host**: `Optional[str]` +- **afid_events**: `List[AfidEvent]` +- **serviceability**: `Optional[ServiceabilityBlock]` +- **result**: `Optional[ServiceabilityResult]` + # Data Analyzers ## Data Analyzer Class GenericAnalyzer @@ -1978,6 +2032,16 @@ Analyzes Redfish OEM diagnostic log collection results. - Summarizes success/failure per OEM diagnostic type from collected results. - When analysis_args.require_all_success is true, fails the run if any type failed collection. +## Data Analyzer Class MI3XXAnalyzer + +### Description + +Build AFID events from collected data and run the configured service hub. + +**Bases**: ['DataAnalyzer'] + +**Link to code**: [mi3xx_analyzer.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py) + # Analyzer Args ## Analyzer Args Class GenericAnalyzerArgs @@ -2300,3 +2364,29 @@ Analyzer args for Redfish OEM diagnostic log results. ### Annotations / fields - **require_all_success**: `bool` — If True, analysis fails when any OEM type collection failed. + +## Analyzer Args Class ServiceabilityAnalyzerArgs + +### Description + +Analyzer args for serviceability plugins that run a configurable Python hub. 
+ +**Bases**: ['AnalyzerArgs'] + +**Link to code**: [analyzer_args.py](https://github.com/amd/node-scraper/blob/HEAD/nodescraper/plugins/serviceability/analyzer_args.py) + +### Annotations / fields + +- **hub_python_module**: `Optional[str]` — Import path for the hub module (class implements hub_analyze_method); hub_options forwards kwargs. +- **hub_display_name**: `Optional[str]` — Optional label for analyzer status messages. +- **afid_sag_path**: `Optional[str]` — Path to hub config (e.g. AFID_SAG.json); passed as hub_init_path_kwarg. +- **hub_init_path_kwarg**: `str` — Hub __init__ keyword that receives afid_sag_path. +- **hub_analyze_method**: `str` — Hub method called with rf_events first (default get_service_info). +- **skip_hub**: `bool` — If True, only build afid_events without running the service hub. +- **cper_decode_module**: `Optional[str]` — Module import path for CPER decoding when events include CPER attachments. +- **cper_decode_method**: `str` — Callable on cper_decode_module: file-like CPER in, (return_code, decode_dict) out. +- **hub_options**: `Optional[dict[str, Any]]` — Extra kwargs for hub __init__ and analyze; collected cper_data overrides cper_data key. +- **from_ac_cycle**: `int` — from_ac_cycle kwarg for the hub analyze call (merged after hub_options). +- **from_date**: `Optional[str]` — Optional from_date for the hub analyze call (merged after hub_options). +- **designation_serials**: `Optional[dict[str, str]]` — Optional designation_serials for the hub analyze call (merged after hub_options). +- **suppress_service_actions**: `Optional[list[str]]` — Optional suppress_service_actions for the hub analyze call (merged after hub_options). 
diff --git a/docs/generate_plugin_doc_bundle.py b/docs/generate_plugin_doc_bundle.py index 4d873ca5..cd9897b0 100644 --- a/docs/generate_plugin_doc_bundle.py +++ b/docs/generate_plugin_doc_bundle.py @@ -41,7 +41,7 @@ from typing import Any, Iterable, List, Optional, Type LINK_BASE_DEFAULT = "https://github.com/amd/node-scraper/blob/HEAD/" -REL_ROOT_DEFAULT = "nodescraper/plugins/inband" +REL_ROOT_DEFAULT = "nodescraper/plugins" # Import and document every concrete plugin under nodescraper.plugins (inband, ooband, # generic_collection, regex_search, serviceability, …). PACKAGE_PLUGINS_ROOT = "nodescraper.plugins" diff --git a/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py b/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py index 2f38783f..d578d949 100644 --- a/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py +++ b/nodescraper/plugins/serviceability/mi3xx/serviceability_plugin_mi3xx.py @@ -23,6 +23,7 @@ # SOFTWARE. # ############################################################################### +from nodescraper.plugins.serviceability.analyzer_args import ServiceabilityAnalyzerArgs from nodescraper.plugins.serviceability.serviceability_data import ( ServiceabilityDataModel, ) @@ -41,9 +42,10 @@ class ServiceabilityPluginMI3XX(ServiceabilityPluginBase): - """MI3XX OOB Redfish serviceability plugin.""" + """MI3XX OOB Redfish serviceability: BMC event log, CPER attachments, and service hub analysis.""" DATA_MODEL = ServiceabilityDataModel COLLECTOR = MI3XXCollector ANALYZER = MI3XXAnalyzer COLLECTOR_ARGS = MI3XXCollectorArgs + ANALYZER_ARGS = ServiceabilityAnalyzerArgs diff --git a/pyproject.toml b/pyproject.toml index e3f0220a..8cf05b74 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ dev = [ "pytest-cov", "mypy", "types-paramiko", + "types-requests", "types-setuptools", ] @@ -83,6 +84,7 @@ ignore = ["E501", "N806"] [tool.mypy] python_version = "3.9" mypy_path = ["test/unit"] 
+explicit_package_bases = true [tool.setuptools_scm] version_scheme = "post-release" From 7ee4f921298d030d4c8cd41b0f4775901f51c030 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 23 Jun 2026 09:05:38 -0500 Subject: [PATCH 34/39] addresses reviews --- .../serviceability/mi3xx/mi3xx_analyzer.py | 6 +- .../serviceability/mi3xx/mi3xx_collector.py | 24 +++---- .../serviceability/mi3xx/mi3xx_cper_utils.py | 69 +++++++++++-------- test/unit/instinct_shaped_engine.py | 1 - test/unit/plugin/test_mi3xx_collector.py | 18 +++++ test/unit/plugin/test_mi3xx_cper_utils.py | 37 +++++++++- test/unit/serviceability_dummy_data.py | 3 + 7 files changed, 111 insertions(+), 47 deletions(-) diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py index 6150398e..b3e2644d 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_analyzer.py @@ -47,7 +47,7 @@ ServiceabilityDataModel, ) -from .mi3xx_cper_utils import RF_CPER_AFID_MIN, should_skip_cper_fetch_or_decode +from .mi3xx_cper_utils import CPER_METHOD_AFID_MAX, should_skip_cper_fetch_or_decode class AfidSagMetadataArtifact(BaseModel): @@ -90,10 +90,10 @@ def analyze_data( if skipped_cper: self.logger.info( "(%s) Skipping CPER decode for %d CPER attachment(s); Redfish log " - "already has usable ACA fields (AFID<%s or no serial on decode)", + "already has usable ACA fields (CPER-method AFID<=%s or no serial on decode)", parent, skipped_cper, - RF_CPER_AFID_MIN, + CPER_METHOD_AFID_MAX, ) if cper_raw_to_decode and not cper_data: if not args.cper_decode_module: diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py index 8921796c..d155f14a 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_collector.py @@ -35,13 +35,14 @@ from 
nodescraper.plugins.serviceability.time_utils import satisfies_time_check from .mi3xx_collector_args import MI3XXCollectorArgs -from .mi3xx_cper_utils import RF_CPER_AFID_MIN, should_skip_cper_fetch_or_decode +from .mi3xx_cper_utils import CPER_METHOD_AFID_MAX, should_skip_cper_fetch_or_decode _EVENT_TIMESTAMP_KEYS = ("Created", "EventTimestamp", "Timestamp") class MI3XXCollector(ServiceabilityCollectorBase[MI3XXCollectorArgs]): - """MI3XX OOB Redfish serviceability collector.""" + """Collect MI3XX BMC Redfish data: event log members (with pagination), firmware inventory, + CPER attachment bytes for qualifying events, and optional assembly/chassis metadata.""" def satisfies_reference_time( self, @@ -69,15 +70,12 @@ def filter_event_members( return filtered def is_cper_event(self, event: dict) -> bool: - if "CPER" in event: - return True - if str(event.get("DiagnosticDataType", "")).upper() == "CPER": - return True - if event.get("AdditionalDataURI"): - return True - message_id = str(event.get("MessageId", "")).lower() - message = str(event.get("Message", "")).lower() - return "cper" in message_id or "cper" in message or "diagnostic" in message_id + """True when the log entry is a Redfish CPER attachment event.""" + return ( + "CPER" in event + and str(event.get("DiagnosticDataType", "")).upper() == "CPER" + and bool(event.get("AdditionalDataURI")) + ) def collect_cper_attachments(self, rf_events: list[Any]) -> dict[str, str]: """Fetch CPER binaries from BMC; decoding runs in the analyzer.""" @@ -94,10 +92,10 @@ def collect_cper_attachments(self, rf_events: list[Any]) -> dict[str, str]: if should_skip_cper_fetch_or_decode(event): self.logger.info( "(%s) Skipping CPER attachment fetch for Redfish event %s " - "(ACA decode already on log entry; AFID<%s check or no serial)", + "(ACA decode already on log entry; CPER-method AFID<=%s or no serial)", parent, event_id, - RF_CPER_AFID_MIN, + CPER_METHOD_AFID_MAX, ) continue diff --git 
a/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py index fe9661dc..7aa047a9 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py @@ -27,29 +27,42 @@ from typing import Any -# Redfish CPER (RF) style AFIDs start at this value; lower values are in-band / -# OEM-field AFIDs already reflected on the log entry. -RF_CPER_AFID_MIN = 10000 +# CPER-method AFIDs <= 34; Redfish-method AFIDs >= 10000. +CPER_METHOD_AFID_MAX = 34 +REDFISH_METHOD_AFID_MIN = 10000 _SERIAL_KEYS = ("SerialNumber", "serial_number", "UbbSerial", "ubb_serial") -def event_afids_from_oem(event: dict[str, Any]) -> list[int]: - """AFIDs from ``Oem.AMDFieldIdentifiers`` (or similar list-of-dicts).""" +def _oem_dict(event: dict[str, Any]) -> dict[str, Any]: oem = event.get("Oem") - if not isinstance(oem, dict): - return [] - raw = oem.get("AMDFieldIdentifiers") - if not isinstance(raw, list): - return [] + return oem if isinstance(oem, dict) else {} + + +def _oem_list_field(oem: dict[str, Any], key: str) -> list[Any]: + """Return a list field from ``Oem`` or nested ``Oem.AMD`` (BMC layout varies).""" + raw = oem.get(key) + if isinstance(raw, list): + return raw + amd = oem.get("AMD") + if isinstance(amd, dict): + nested = amd.get(key) + if isinstance(nested, list): + return nested + return [] + + +def event_afids_from_oem(event: dict[str, Any]) -> list[int]: + """AFIDs from ``Oem.AMDFieldIdentifiers`` or ``Oem.AMD.AMDFieldIdentifiers``.""" + raw = _oem_list_field(_oem_dict(event), "AMDFieldIdentifiers") out: list[int] = [] for item in raw: if not isinstance(item, dict): continue for key in ("AFID", "Afid", "afid"): - if key in item and item[key] is not None: + if (v := item.get(key)) is not None: try: - out.append(int(item[key])) + out.append(int(v)) except (TypeError, ValueError): pass break @@ -57,12 +70,8 @@ def event_afids_from_oem(event: 
dict[str, Any]) -> list[int]: def _err_data_arr_entries(event: dict[str, Any]) -> list[dict[str, Any]]: - oem = event.get("Oem") - if not isinstance(oem, dict): - return [] - arr = oem.get("ErrDataArr") - if not isinstance(arr, list): - return [] + """``ErrDataArr`` rows from ``Oem.ErrDataArr`` or ``Oem.AMD.ErrDataArr``.""" + arr = _oem_list_field(_oem_dict(event), "ErrDataArr") return [e for e in arr if isinstance(e, dict)] @@ -86,24 +95,30 @@ def _nonempty_serial_in_mapping(obj: Any) -> bool: def event_aca_includes_serial(event: dict[str, Any]) -> bool: - """Serial (or UBB serial) present on any ``ErrDataArr`` row (typically ``MetaData``).""" + """Serial (or UBB serial) present on any ``ErrDataArr`` row ``MetaData``.""" for entry in _err_data_arr_entries(event): - meta = entry.get("MetaData") - if _nonempty_serial_in_mapping(meta): - return True - decoded = entry.get("DecodedData") - if _nonempty_serial_in_mapping(decoded): + if _nonempty_serial_in_mapping(entry.get("MetaData")): return True return False +def is_cper_method_afid(afid: int) -> bool: + """True for CPER-method AFIDs (<= ``CPER_METHOD_AFID_MAX``), including on RF log entries.""" + return afid <= CPER_METHOD_AFID_MAX + + +def is_redfish_method_afid(afid: int) -> bool: + """True for Redfish-method AFIDs in the 10k range.""" + return afid >= REDFISH_METHOD_AFID_MIN + + def should_skip_cper_fetch_or_decode(event: dict[str, Any]) -> bool: """Whether to omit CPER binary fetch and configured CPER decode for this Redfish member. Skip when: - * Every OEM-listed AFID is below ``RF_CPER_AFID_MIN`` (non-RF CPER range), - ACA ``DecodedData`` is present, and a serial is present on the entry; or + * Every OEM-listed AFID is CPER-method (<= ``CPER_METHOD_AFID_MAX``; may match + in-band CPER AFIDs), ACA ``DecodedData`` is present, and serial is on the entry; or * ACA ``DecodedData`` is present but no serial — the CPER blob does not add actionable identity beyond what is already missing from the log. 
""" @@ -114,4 +129,4 @@ def should_skip_cper_fetch_or_decode(event: dict[str, Any]) -> bool: afids = event_afids_from_oem(event) if not afids: return False - return all(afid < RF_CPER_AFID_MIN for afid in afids) + return all(is_cper_method_afid(afid) for afid in afids) diff --git a/test/unit/instinct_shaped_engine.py b/test/unit/instinct_shaped_engine.py index 6fa7f234..b5989a24 100644 --- a/test/unit/instinct_shaped_engine.py +++ b/test/unit/instinct_shaped_engine.py @@ -23,7 +23,6 @@ # SOFTWARE. # ############################################################################### - from __future__ import annotations from typing import Any, Optional diff --git a/test/unit/plugin/test_mi3xx_collector.py b/test/unit/plugin/test_mi3xx_collector.py index 1cddc2f3..96a9d556 100644 --- a/test/unit/plugin/test_mi3xx_collector.py +++ b/test/unit/plugin/test_mi3xx_collector.py @@ -175,6 +175,24 @@ def test_mi3xx_collector_satisfies_reference_time_helper(mi3xx_collector): assert not mi3xx_collector.satisfies_reference_time(DUMMY_TIMESTAMP_EARLIER, args) +def test_mi3xx_collector_is_cper_event_requires_cper_block_type_and_uri(mi3xx_collector): + assert mi3xx_collector.is_cper_event(dummy_cper_basic_member()) + assert not mi3xx_collector.is_cper_event( + { + "Id": "non-cper", + "AdditionalDataURI": DUMMY_EVENT_URI, + "MessageId": "ResourceEvent.1.2.1.ResourceErrorsDetectedOEM", + } + ) + assert not mi3xx_collector.is_cper_event( + { + "Id": "partial-cper", + "CPER": {"NotificationType": "dummy"}, + "DiagnosticDataType": "CPER", + } + ) + + def test_mi3xx_collector_fetches_cper_attachments(mi3xx_collector, redfish_conn_mock): import base64 from unittest.mock import MagicMock diff --git a/test/unit/plugin/test_mi3xx_cper_utils.py b/test/unit/plugin/test_mi3xx_cper_utils.py index b156b930..e4e2965e 100644 --- a/test/unit/plugin/test_mi3xx_cper_utils.py +++ b/test/unit/plugin/test_mi3xx_cper_utils.py @@ -25,16 +25,19 @@ 
############################################################################### import pytest from serviceability_dummy_data import ( - DUMMY_AFID_B, DUMMY_AFID_BELOW_RF, + DUMMY_AFID_FATAL_HBM, DUMMY_RF_CPER_AFID, dummy_aca_err_row, ) from nodescraper.plugins.serviceability.mi3xx.mi3xx_cper_utils import ( + CPER_METHOD_AFID_MAX, event_aca_includes_serial, event_afids_from_oem, event_has_aca_decode, + is_cper_method_afid, + is_redfish_method_afid, should_skip_cper_fetch_or_decode, ) @@ -50,6 +53,34 @@ def test_skip_when_afids_below_threshold_and_aca_has_serial(): assert should_skip_cper_fetch_or_decode(event) is True +def test_event_afids_from_oem_nested_amd_block(): + event = { + "Oem": { + "AMD": { + "AMDFieldIdentifiers": [{"AFID": DUMMY_AFID_BELOW_RF}], + "ErrDataArr": [dummy_aca_err_row()], + } + } + } + assert event_afids_from_oem(event) == [DUMMY_AFID_BELOW_RF] + assert event_has_aca_decode(event) is True + assert should_skip_cper_fetch_or_decode(event) is True + + +def test_err_data_arr_entries_nested_amd_block(): + event = {"Oem": {"AMD": {"ErrDataArr": [dummy_aca_err_row()]}}} + assert event_has_aca_decode(event) is True + assert event_aca_includes_serial(event) is True + + +def test_afid_method_ranges(): + assert is_cper_method_afid(DUMMY_AFID_BELOW_RF) + assert is_cper_method_afid(CPER_METHOD_AFID_MAX) + assert not is_cper_method_afid(CPER_METHOD_AFID_MAX + 1) + assert is_redfish_method_afid(DUMMY_RF_CPER_AFID) + assert not is_redfish_method_afid(DUMMY_AFID_BELOW_RF) + + def test_no_skip_when_rf_range_afid_even_with_aca_serial(): event = { "Oem": { @@ -94,11 +125,11 @@ def test_no_skip_when_aca_serial_but_no_afid_list(): @pytest.mark.parametrize( "afids,expect_skip", [ - ([DUMMY_AFID_BELOW_RF, DUMMY_AFID_B], True), + ([DUMMY_AFID_BELOW_RF, DUMMY_AFID_FATAL_HBM], True), ([DUMMY_AFID_BELOW_RF, DUMMY_RF_CPER_AFID], False), ], ) -def test_skip_requires_all_afids_below_rf_threshold(afids, expect_skip): +def test_skip_requires_all_afids_cper_method(afids, 
expect_skip): identifiers = [{"AFID": a} for a in afids] event = {"Oem": {"AMDFieldIdentifiers": identifiers, "ErrDataArr": [dummy_aca_err_row()]}} assert should_skip_cper_fetch_or_decode(event) is expect_skip diff --git a/test/unit/serviceability_dummy_data.py b/test/unit/serviceability_dummy_data.py index 379727d1..22e883e8 100644 --- a/test/unit/serviceability_dummy_data.py +++ b/test/unit/serviceability_dummy_data.py @@ -82,6 +82,7 @@ def dummy_cper_rf_member() -> dict[str, Any]: return { "Id": DUMMY_CPER_EVENT_ID_RF, "Created": DUMMY_TIMESTAMP_LATER, + "CPER": {"NotificationType": "dummy-notification-type"}, "DiagnosticDataType": "CPER", "AdditionalDataURI": DUMMY_CPER_ATTACHMENT_URI_2, "Oem": { @@ -96,6 +97,7 @@ def dummy_cper_skip_member() -> dict[str, Any]: return { "Id": DUMMY_CPER_EVENT_ID_SKIP, "Created": DUMMY_TIMESTAMP_LATER, + "CPER": {"NotificationType": "dummy-notification-type"}, "DiagnosticDataType": "CPER", "AdditionalDataURI": DUMMY_CPER_ATTACHMENT_URI_1, "Oem": { @@ -115,6 +117,7 @@ def dummy_cper_basic_member() -> dict[str, Any]: return { "Id": DUMMY_CPER_EVENT_ID_BASIC, "Created": DUMMY_TIMESTAMP_LATER, + "CPER": {"NotificationType": "dummy-notification-type"}, "DiagnosticDataType": "CPER", "AdditionalDataURI": DUMMY_CPER_ATTACHMENT_URI_1, } From 782d1562838fef3272e6b8bbff0c155e8c7672ba Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 23 Jun 2026 11:53:37 -0500 Subject: [PATCH 35/39] addressed reviews --- .../serviceability/mi3xx/mi3xx_cper_utils.py | 63 ++++++++++--------- test/unit/plugin/test_mi3xx_cper_utils.py | 19 ++++++ 2 files changed, 51 insertions(+), 31 deletions(-) diff --git a/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py b/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py index 7aa047a9..bdc4ce15 100644 --- a/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py +++ b/nodescraper/plugins/serviceability/mi3xx/mi3xx_cper_utils.py @@ -27,34 +27,36 @@ from typing import Any -# CPER-method 
AFIDs <= 34; Redfish-method AFIDs >= 10000. +# CPER-method AFIDs <= 34; MI3XX Redfish-method AFIDs 10000–10999. CPER_METHOD_AFID_MAX = 34 REDFISH_METHOD_AFID_MIN = 10000 +REDFISH_METHOD_AFID_MAX = 10999 _SERIAL_KEYS = ("SerialNumber", "serial_number", "UbbSerial", "ubb_serial") -def _oem_dict(event: dict[str, Any]) -> dict[str, Any]: - oem = event.get("Oem") - return oem if isinstance(oem, dict) else {} +def get_amd_oem_dict(event: dict[str, Any]) -> dict[str, Any]: + """Return the AMD OEM payload dict for a Redfish log member. + + BMC layouts vary: fields may live on Oem directly or under Oem.AMD. + When AMD is absent, returns Oem; when present, returns AMD if it is a dict. + """ + if not isinstance(oem := event.get("Oem"), dict): + return {} + if (amd := oem.get("AMD")) is None: + return oem + return amd if isinstance(amd, dict) else {} -def _oem_list_field(oem: dict[str, Any], key: str) -> list[Any]: - """Return a list field from ``Oem`` or nested ``Oem.AMD`` (BMC layout varies).""" - raw = oem.get(key) - if isinstance(raw, list): - return raw - amd = oem.get("AMD") - if isinstance(amd, dict): - nested = amd.get(key) - if isinstance(nested, list): - return nested - return [] +def _oem_list_field(oem_dict: dict[str, Any], key: str) -> list[Any]: + """Return a list field from the resolved AMD OEM dict.""" + raw = oem_dict.get(key) + return raw if isinstance(raw, list) else [] def event_afids_from_oem(event: dict[str, Any]) -> list[int]: - """AFIDs from ``Oem.AMDFieldIdentifiers`` or ``Oem.AMD.AMDFieldIdentifiers``.""" - raw = _oem_list_field(_oem_dict(event), "AMDFieldIdentifiers") + """AFIDs from Oem.AMDFieldIdentifiers or Oem.AMD.AMDFieldIdentifiers.""" + raw = _oem_list_field(get_amd_oem_dict(event), "AMDFieldIdentifiers") out: list[int] = [] for item in raw: if not isinstance(item, dict): @@ -70,13 +72,13 @@ def event_afids_from_oem(event: dict[str, Any]) -> list[int]: def _err_data_arr_entries(event: dict[str, Any]) -> list[dict[str, Any]]: - 
"""``ErrDataArr`` rows from ``Oem.ErrDataArr`` or ``Oem.AMD.ErrDataArr``.""" - arr = _oem_list_field(_oem_dict(event), "ErrDataArr") + """ErrDataArr rows from Oem.ErrDataArr or Oem.AMD.ErrDataArr.""" + arr = _oem_list_field(get_amd_oem_dict(event), "ErrDataArr") return [e for e in arr if isinstance(e, dict)] def event_has_aca_decode(event: dict[str, Any]) -> bool: - """True when the log entry includes ACA-style ``DecodedData`` under ``ErrDataArr``.""" + """True when the log entry includes ACA-style DecodedData under ErrDataArr.""" for entry in _err_data_arr_entries(event): decoded = entry.get("DecodedData") if isinstance(decoded, dict) and decoded: @@ -95,21 +97,20 @@ def _nonempty_serial_in_mapping(obj: Any) -> bool: def event_aca_includes_serial(event: dict[str, Any]) -> bool: - """Serial (or UBB serial) present on any ``ErrDataArr`` row ``MetaData``.""" - for entry in _err_data_arr_entries(event): - if _nonempty_serial_in_mapping(entry.get("MetaData")): - return True - return False + """Serial (or UBB serial) present on any ErrDataArr row MetaData.""" + return any( + _nonempty_serial_in_mapping(entry.get("MetaData")) for entry in _err_data_arr_entries(event) + ) def is_cper_method_afid(afid: int) -> bool: - """True for CPER-method AFIDs (<= ``CPER_METHOD_AFID_MAX``), including on RF log entries.""" + """True for CPER-method AFIDs (<= CPER_METHOD_AFID_MAX), including on RF log entries.""" return afid <= CPER_METHOD_AFID_MAX def is_redfish_method_afid(afid: int) -> bool: - """True for Redfish-method AFIDs in the 10k range.""" - return afid >= REDFISH_METHOD_AFID_MIN + """True for MI3XX Redfish-method AFIDs in the 10k range (10000–10999).""" + return REDFISH_METHOD_AFID_MIN <= afid <= REDFISH_METHOD_AFID_MAX def should_skip_cper_fetch_or_decode(event: dict[str, Any]) -> bool: @@ -117,9 +118,9 @@ def should_skip_cper_fetch_or_decode(event: dict[str, Any]) -> bool: Skip when: - * Every OEM-listed AFID is CPER-method (<= ``CPER_METHOD_AFID_MAX``; may match - in-band 
CPER AFIDs), ACA ``DecodedData`` is present, and serial is on the entry; or - * ACA ``DecodedData`` is present but no serial — the CPER blob does not add + * Every OEM-listed AFID is CPER-method (<= CPER_METHOD_AFID_MAX; may match + in-band CPER AFIDs), ACA DecodedData is present, and serial is on the entry; or + * ACA DecodedData is present but no serial — the CPER blob does not add actionable identity beyond what is already missing from the log. """ if not event_has_aca_decode(event): diff --git a/test/unit/plugin/test_mi3xx_cper_utils.py b/test/unit/plugin/test_mi3xx_cper_utils.py index e4e2965e..105ca203 100644 --- a/test/unit/plugin/test_mi3xx_cper_utils.py +++ b/test/unit/plugin/test_mi3xx_cper_utils.py @@ -33,15 +33,31 @@ from nodescraper.plugins.serviceability.mi3xx.mi3xx_cper_utils import ( CPER_METHOD_AFID_MAX, + REDFISH_METHOD_AFID_MAX, + REDFISH_METHOD_AFID_MIN, event_aca_includes_serial, event_afids_from_oem, event_has_aca_decode, + get_amd_oem_dict, is_cper_method_afid, is_redfish_method_afid, should_skip_cper_fetch_or_decode, ) +def test_get_amd_oem_dict_layouts(): + flat = {"Oem": {"AMDFieldIdentifiers": [{"AFID": 1}]}} + assert get_amd_oem_dict(flat) == {"AMDFieldIdentifiers": [{"AFID": 1}]} + + nested = {"Oem": {"AMD": {"ErrDataArr": []}}} + assert get_amd_oem_dict(nested) == {"ErrDataArr": []} + + assert get_amd_oem_dict({}) == {} + assert get_amd_oem_dict({"Oem": None}) == {} + assert get_amd_oem_dict({"Oem": "bad"}) == {} + assert get_amd_oem_dict({"Oem": {"AMD": "bad"}}) == {} + + def test_skip_when_afids_below_threshold_and_aca_has_serial(): event = { "Oem": { @@ -78,6 +94,9 @@ def test_afid_method_ranges(): assert is_cper_method_afid(CPER_METHOD_AFID_MAX) assert not is_cper_method_afid(CPER_METHOD_AFID_MAX + 1) assert is_redfish_method_afid(DUMMY_RF_CPER_AFID) + assert is_redfish_method_afid(REDFISH_METHOD_AFID_MAX) + assert not is_redfish_method_afid(REDFISH_METHOD_AFID_MIN - 1) + assert not is_redfish_method_afid(REDFISH_METHOD_AFID_MAX + 
1) assert not is_redfish_method_afid(DUMMY_AFID_BELOW_RF) From e6b27960bd407955325c3c89ce45990311fa284a Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 23 Jun 2026 15:45:19 -0500 Subject: [PATCH 36/39] SAG file details printout --- .../plugins/serviceability/se_adapter.py | 16 +++------ .../plugins/serviceability/se_models.py | 2 +- test/unit/mock_python_engine.py | 7 +++- test/unit/plugin/test_se_runner.py | 33 ++++++++++++++----- test/unit/serviceability_dummy_data.py | 1 + 5 files changed, 37 insertions(+), 22 deletions(-) diff --git a/nodescraper/plugins/serviceability/se_adapter.py b/nodescraper/plugins/serviceability/se_adapter.py index bea1d4a0..8d0478b7 100644 --- a/nodescraper/plugins/serviceability/se_adapter.py +++ b/nodescraper/plugins/serviceability/se_adapter.py @@ -77,25 +77,19 @@ def _hub_version_display(version_info: Any) -> Optional[str]: def _afid_sag_file_version_display(metadata: Any) -> Optional[str]: - """Build a short AFID_SAG file identity string from hub ``afid_sag_metadata``.""" + """Build AFID_SAG file identity string (pid, revision, variant) from hub metadata.""" if not isinstance(metadata, dict) or not metadata: return None pid = metadata.get("sag_pid") or metadata.get("pid") - rev = metadata.get("sag_revision") or metadata.get("revision") - extra = ( - metadata.get("sag_version") - or metadata.get("file_version") - or metadata.get("schema_version") - ) + rev = metadata.get("revision") + variant = metadata.get("sag_variant") or metadata.get("variant") parts: list[str] = [] if pid and str(pid).strip(): parts.append(f"PID {str(pid).strip()}") if rev and str(rev).strip(): parts.append(f"revision {str(rev).strip()}") - if extra and str(extra).strip(): - ex = str(extra).strip() - if ex not in (str(pid or "").strip(), str(rev or "").strip()): - parts.append(f"version {ex}") + if variant and str(variant).strip(): + parts.append(f"variant {str(variant).strip()}") if not parts: return None return ", ".join(parts) diff --git 
a/nodescraper/plugins/serviceability/se_models.py b/nodescraper/plugins/serviceability/se_models.py index 6aa855a3..addef3ae 100644 --- a/nodescraper/plugins/serviceability/se_models.py +++ b/nodescraper/plugins/serviceability/se_models.py @@ -87,7 +87,7 @@ class ServiceabilityBlock(BaseModel): ) afid_sag_file_version: Optional[str] = Field( default=None, - description="AFID_SAG.json identity/revision string when the hub returned metadata.", + description="AFID_SAG.json pid/revision/variant string when the hub returned metadata.", ) afid_sag_metadata: Optional[dict[str, Any]] = Field( default=None, diff --git a/test/unit/mock_python_engine.py b/test/unit/mock_python_engine.py index f48a7e43..d9954026 100644 --- a/test/unit/mock_python_engine.py +++ b/test/unit/mock_python_engine.py @@ -9,6 +9,7 @@ DUMMY_HUB_VERSION, DUMMY_SAG_PID, DUMMY_SAG_REVISION, + DUMMY_SAG_VARIANT, DUMMY_SERVICE_ACTION_NUM, DUMMY_SERVICE_ACTION_TITLE, DUMMY_UNIT_A, @@ -38,6 +39,10 @@ def get_service_info( } return SimpleNamespace( service_info=service_info, - afid_sag_metadata={"sag_pid": DUMMY_SAG_PID, "sag_revision": DUMMY_SAG_REVISION}, + afid_sag_metadata={ + "sag_pid": DUMMY_SAG_PID, + "revision": DUMMY_SAG_REVISION, + "sag_variant": DUMMY_SAG_VARIANT, + }, engine_version_info={"version": DUMMY_HUB_VERSION}, ) diff --git a/test/unit/plugin/test_se_runner.py b/test/unit/plugin/test_se_runner.py index 554f0ccc..9fa40ba6 100644 --- a/test/unit/plugin/test_se_runner.py +++ b/test/unit/plugin/test_se_runner.py @@ -41,6 +41,7 @@ DUMMY_RF_EVENT_COUNT, DUMMY_SAG_PID, DUMMY_SAG_REVISION, + DUMMY_SAG_VARIANT, DUMMY_SERVICE_ACTION_NUM, DUMMY_TIMESTAMP, DUMMY_UNIT_A, @@ -126,12 +127,17 @@ def test_format_serviceability_solution_lines(): ], solution_reasoning="Dummy test reasoning.", hub_version="1.0.0-test", - afid_sag_file_version="PID sag-1, revision rev-a", + afid_sag_file_version=( + f"PID {DUMMY_SAG_PID}, revision {DUMMY_SAG_REVISION}, variant {DUMMY_SAG_VARIANT}" + ), ) lines = 
format_serviceability_solution_lines(block) assert lines[0] == "Dummy test reasoning." assert lines[1] == "Hub version: 1.0.0-test" - assert lines[2] == "AFID_SAG file: PID sag-1, revision rev-a" + assert ( + lines[2] + == f"AFID_SAG file: PID {DUMMY_SAG_PID}, revision {DUMMY_SAG_REVISION}, variant {DUMMY_SAG_VARIANT}" + ) assert f"AFID {DUMMY_AFID_A}" in lines[3] assert DUMMY_DESIGNATION_A in lines[3] assert "service action 99 (RMA)" in lines[3] @@ -157,7 +163,11 @@ def test_serviceability_block_from_service_result(): } }, }, - afid_sag_metadata={"sag_pid": DUMMY_SAG_PID, "sag_revision": DUMMY_SAG_REVISION}, + afid_sag_metadata={ + "sag_pid": DUMMY_SAG_PID, + "revision": DUMMY_SAG_REVISION, + "sag_variant": DUMMY_SAG_VARIANT, + }, engine_version_info={"version": DUMMY_HUB_VERSION}, ) block = serviceability_block_from_service_result( @@ -172,9 +182,9 @@ def test_serviceability_block_from_service_result(): assert block.solution[0].service_action_title == "Dummy service action" assert set(block.solution[0].serviceable_unit) == {DUMMY_DESIGNATION_A, DUMMY_DESIGNATION_B} assert block.hub_version == DUMMY_HUB_VERSION - assert block.afid_sag_file_version is not None - assert DUMMY_SAG_PID in block.afid_sag_file_version - assert DUMMY_SAG_REVISION in block.afid_sag_file_version + assert block.afid_sag_file_version == ( + f"PID {DUMMY_SAG_PID}, revision {DUMMY_SAG_REVISION}, variant {DUMMY_SAG_VARIANT}" + ) assert f"{DUMMY_RF_EVENT_COUNT} Redfish event(s)" in block.solution_reasoning assert "Dummy test hub" in block.solution_reasoning @@ -182,7 +192,11 @@ def test_serviceability_block_from_service_result(): def test_serviceability_block_from_service_result_isa_version_info(): result = SimpleNamespace( service_info={}, - afid_sag_metadata={"sag_pid": DUMMY_SAG_PID, "sag_revision": DUMMY_SAG_REVISION}, + afid_sag_metadata={ + "sag_pid": DUMMY_SAG_PID, + "revision": DUMMY_SAG_REVISION, + "sag_variant": DUMMY_SAG_VARIANT, + }, isa_version_info={"VERSION": "1.2.3"}, ) block = 
serviceability_block_from_service_result( @@ -192,8 +206,9 @@ def test_serviceability_block_from_service_result_isa_version_info(): rf_event_count=1, ) assert block.hub_version == "1.2.3" - assert block.afid_sag_file_version is not None - assert DUMMY_SAG_PID in block.afid_sag_file_version + assert block.afid_sag_file_version == ( + f"PID {DUMMY_SAG_PID}, revision {DUMMY_SAG_REVISION}, variant {DUMMY_SAG_VARIANT}" + ) def test_resolve_hub_class_finds_package_export(): diff --git a/test/unit/serviceability_dummy_data.py b/test/unit/serviceability_dummy_data.py index 22e883e8..06c78d2e 100644 --- a/test/unit/serviceability_dummy_data.py +++ b/test/unit/serviceability_dummy_data.py @@ -53,6 +53,7 @@ DUMMY_RF_EVENT_COUNT = 2 DUMMY_SAG_PID = "dummy-sag-pid" DUMMY_SAG_REVISION = "dummy-rev-0" +DUMMY_SAG_VARIANT = "dummy-variant-0" DUMMY_HUB_VERSION = "0.0.0-dummy" DUMMY_BMC_HOST = "dummy-bmc.example" DUMMY_OEM_VENDOR = "DummyVendor" From 8731f0af094348acee5d771fd2f2dfa086954909 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 23 Jun 2026 16:05:57 -0500 Subject: [PATCH 37/39] SAG file details printout --- nodescraper/plugins/serviceability/se_adapter.py | 2 +- test/unit/mock_python_engine.py | 2 +- test/unit/plugin/test_se_runner.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nodescraper/plugins/serviceability/se_adapter.py b/nodescraper/plugins/serviceability/se_adapter.py index 8d0478b7..3db9394d 100644 --- a/nodescraper/plugins/serviceability/se_adapter.py +++ b/nodescraper/plugins/serviceability/se_adapter.py @@ -82,7 +82,7 @@ def _afid_sag_file_version_display(metadata: Any) -> Optional[str]: return None pid = metadata.get("sag_pid") or metadata.get("pid") rev = metadata.get("revision") - variant = metadata.get("sag_variant") or metadata.get("variant") + variant = metadata.get("variant") parts: list[str] = [] if pid and str(pid).strip(): parts.append(f"PID {str(pid).strip()}") diff --git a/test/unit/mock_python_engine.py 
b/test/unit/mock_python_engine.py index d9954026..09e38a7e 100644 --- a/test/unit/mock_python_engine.py +++ b/test/unit/mock_python_engine.py @@ -42,7 +42,7 @@ def get_service_info( afid_sag_metadata={ "sag_pid": DUMMY_SAG_PID, "revision": DUMMY_SAG_REVISION, - "sag_variant": DUMMY_SAG_VARIANT, + "variant": DUMMY_SAG_VARIANT, }, engine_version_info={"version": DUMMY_HUB_VERSION}, ) diff --git a/test/unit/plugin/test_se_runner.py b/test/unit/plugin/test_se_runner.py index 9fa40ba6..025aef25 100644 --- a/test/unit/plugin/test_se_runner.py +++ b/test/unit/plugin/test_se_runner.py @@ -166,7 +166,7 @@ def test_serviceability_block_from_service_result(): afid_sag_metadata={ "sag_pid": DUMMY_SAG_PID, "revision": DUMMY_SAG_REVISION, - "sag_variant": DUMMY_SAG_VARIANT, + "variant": DUMMY_SAG_VARIANT, }, engine_version_info={"version": DUMMY_HUB_VERSION}, ) @@ -195,7 +195,7 @@ def test_serviceability_block_from_service_result_isa_version_info(): afid_sag_metadata={ "sag_pid": DUMMY_SAG_PID, "revision": DUMMY_SAG_REVISION, - "sag_variant": DUMMY_SAG_VARIANT, + "variant": DUMMY_SAG_VARIANT, }, isa_version_info={"VERSION": "1.2.3"}, ) From 1409d337db34ebb23e6786ad196d0eea1718d315 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Tue, 23 Jun 2026 21:53:16 +0000 Subject: [PATCH 38/39] docs: Update plugin documentation [automated] --- docs/PLUGIN_DOC.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md index 80d4e012..cf7cb371 100644 --- a/docs/PLUGIN_DOC.md +++ b/docs/PLUGIN_DOC.md @@ -1051,7 +1051,8 @@ RedfishOemDiagDataModel ### Description -MI3XX OOB Redfish serviceability collector. +Collect MI3XX BMC Redfish data: event log members (with pagination), firmware inventory, + CPER attachment bytes for qualifying events, and optional assembly/chassis metadata. 
**Bases**: ['ServiceabilityCollectorBase'] From ebea0dc5cd7f2effaa6e4c1a653ee45f8207a627 Mon Sep 17 00:00:00 2001 From: Alexandra Bara Date: Tue, 23 Jun 2026 18:27:45 -0500 Subject: [PATCH 39/39] added early mention of plugin doc table --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9b3b25ae..dce86aef 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Node Scraper Node Scraper is a tool which performs automated data collection and analysis for the purposes of -system debug. +system debug. For details on what data is collected and analyzed, see the [plugin reference table](docs/PLUGIN_DOC.md). ## Table of Contents - [Installation](#installation)