From 0967ac6025efa877dc6b073837fec80f59f98eac Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Thu, 30 Apr 2015 14:13:52 -0400 Subject: [PATCH] Implement event log retrieval from BMCs BMCs retain historical event data in the SEL. Implement code to read through the SEL. It also passes the processed data to the OEM framework for further processing since OEMs may define a number of events. Note that it is not necessarily the OEM of the system that defines the OEM decode of the event. A timestamped OEM event may contain a different OEM id. This permits things like the system, the OS, agents, et al. to use the SEL to store various things. Change-Id: Ibfb07146b1dfa0ce06df863e805b5a30f17d2f18 --- pyghmi/ipmi/command.py | 59 ++++ pyghmi/ipmi/events.py | 475 +++++++++++++++++++++++++++++++ pyghmi/ipmi/private/constants.py | 180 +++++++++--- pyghmi/ipmi/sdr.py | 2 +- 4 files changed, 671 insertions(+), 45 deletions(-) create mode 100644 pyghmi/ipmi/events.py diff --git a/pyghmi/ipmi/command.py b/pyghmi/ipmi/command.py index bee7f43a..cd8ddcbc 100644 --- a/pyghmi/ipmi/command.py +++ b/pyghmi/ipmi/command.py @@ -19,6 +19,7 @@ import pyghmi.constants as const import pyghmi.exceptions as exc +import pyghmi.ipmi.events as sel import pyghmi.ipmi.fru as fru from pyghmi.ipmi.oem.lookup import get_oem_handler from pyghmi.ipmi.private import session @@ -296,6 +297,31 @@ class Command(object): raise exc.IpmiException(response['error']) return {'bootdev': bootdev} + def xraw_command(self, netfn, command, bridge_request=(), data=(), + delay_xmit=None): + """Send raw ipmi command to BMC, raising exception on error + + This is identical to raw_command, except it raises exceptions + on IPMI errors and returns data as a buffer. This is the recommended + function to use. The response['data'] being a buffer allows + traditional indexed access as well as works nicely with + struct.unpack_from when certain data is coming back. 
+ + :param netfn: Net function number + :param command: Command value + :param bridge_request: The target slave address and channel number for + the bridge request. + :param data: Command data as a tuple or list + :returns: dict -- The response from IPMI device + """ + rsp = self.ipmi_session.raw_command(netfn=netfn, command=command, + bridge_request=bridge_request, + data=data, delay_xmit=delay_xmit) + if 'error' in rsp: + raise exc.IpmiException(rsp['error'], rsp['code']) + rsp['data'] = buffer(bytearray(rsp['data'])) + return rsp + def raw_command(self, netfn, command, bridge_request=(), data=(), delay_xmit=None): """Send raw ipmi command to BMC @@ -364,6 +390,39 @@ class Command(object): if 'error' in response: raise exc.IpmiException(response['error']) + def init_sdr(self): + """Initialize SDR + + Do the appropriate action to have a relevant sensor description + repository for the current management controller + """ + # For now, return current sdr if it exists and still connected + # future, check SDR timestamp for continued relevance + # further future, optionally support a cache directory/file + # to store cached copies for given device id, product id, mfg id, + # sdr timestamp, our data version revision, aux firmware revision, + # and oem defined field + if self._sdr is None: + self._sdr = sdr.SDR(self) + return self._sdr + + def get_event_log(self, clear=False): + """Retrieve the log of events, optionally clearing + + The contents of the SEL are returned as an iterable. Timestamps + are given as local time, ISO 8601 (whether the target has an accurate + clock or not). Timestamps may be omitted for events that cannot be + given a timestamp, leaving only the raw timecode to provide relative + time information. clear set to true will result in the log being + cleared as it is returned. This allows an atomic fetch and clear + behavior so that no log entries will be lost between the fetch and + clear actions. 
There is no 'clear_event_log' function to encourage + users to create code that is not at risk for losing events. + + :param clear: Whether to remove the SEL entries from the target BMC + """ + return sel.EventHandler(self.init_sdr()).fetch_sel(self, clear) + def get_inventory_descriptions(self): """Retrieve list of things that could be inventoried diff --git a/pyghmi/ipmi/events.py b/pyghmi/ipmi/events.py new file mode 100644 index 00000000..ad04d93b --- /dev/null +++ b/pyghmi/ipmi/events.py @@ -0,0 +1,475 @@ +# vim: tabstop=4 shiftwidth=4 softtabstop=4 + +# Copyright 2015 Lenovo +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# __author__ = 'jjohnson2@lenovo.com' + +import pyghmi.constants as pygconst +import pyghmi.exceptions as pygexc +import pyghmi.ipmi.private.constants as ipmiconst +import struct +import time + + +psucfg_errors = { + 0: 'Vendor mismatch', + 1: 'Revision mismatch', + 2: 'Processor missing', # e.g. pluggable CPU VRMs... 
+ 3: 'Insufficient power', + 4: 'Voltage mismatch', +} + +firmware_progress = { + 0: 'Unspecified', + 1: 'Memory initialization', + 2: 'Disk initialization', + 3: 'Non-primary Processor initialization', + 4: 'User authentication', + 5: 'Entering setup', + 6: 'USB initialization', + 7: 'PCI initialization', + 8: 'Option ROM initialization', + 9: 'Video initialization', + 0xa: 'Cache initialization', + 0xb: 'SMBus initialization', + 0xc: 'Keyboard initialization', + 0xd: 'Embedded controller initialization', + 0xe: 'Docking station attachment', + 0xf: 'Docking station enabled', + 0x10: 'Docking station ejection', + 0x11: 'Docking station disabled', + 0x12: 'Waking OS', + 0x13: 'Starting OS boot', + 0x14: 'Baseboard initialization', + 0x16: 'Floppy initialization', + 0x17: 'Keyboard test', + 0x18: 'Pointing device test', + 0x19: 'Primary processor initialization', +} + +firmware_errors = { + 0: 'Unspecified', + 1: 'No memory installed', + 2: 'All memory failed', + 3: 'Unrecoverable disk failure', + 4: 'Unrecoverable board failure', + 5: 'Unrecoverable diskette failure', + 6: 'Unrecoverable storage controller failure', + 7: 'Unrecoverable keyboard failure', # Keyboard error, press + # any key to continue.. 
+ 8: 'Removable boot media not found', + 9: 'Video adapter failure', + 0xa: 'No video device', + 0xb: 'Firmware corruption detected', + 0xc: 'CPU voltage mismatch', + 0xd: 'CPU speed mismatch', +} + +auxlog_actions = { + 0: 'entry added', + 1: 'entry added (could not map to standard)', + 2: 'entry added with corresponding standard events', + 3: 'log cleared', + 4: 'log disabled', + 5: 'log enabled', +} + +restart_causes = { + 0: 'Unknown', + 1: 'Remote request', + 2: 'Reset button', + 3: 'Power button', + 4: 'Watchdog', + 5: 'OEM', + 6: 'Power restored', + 7: 'Power restored', + 8: 'Reset due to event', + 9: 'Cycle due to event', + 0xa: 'OS reset', + 0xb: 'Timer wake', +} + +slot_types = { + 0: 'PCI', + 1: 'Drive Array', + 2: 'External connector', + 3: 'Docking', + 4: 'Other', + 5: 'Entity ID', + 6: 'AdvancedTCA', + 7: 'Memory', + 8: 'Fan', + 9: 'PCIe', + 10: 'SCSI', + 11: 'SATA/SAS', +} + +power_states = { + 0: 'S0', + 1: 'S1', + 2: 'S2', + 3: 'S3', + 4: 'S4', + 5: 'S5', + 6: 'S4 or S5', + 7: 'G3', + 8: 'S1, S2, or S3', + 9: 'G1', + 0xa: 'S5', + 0xb: 'on', + 0xc: 'off', +} + +watchdog_boot_phases = { + 1: 'Firmware', + 2: 'Firmware', + 3: 'OS Load', + 4: 'OS', + 5: 'OEM', +} + +version_changes = { + 1: 'Device ID', + 2: 'Management controller firmware', + 3: 'Management controller revision', + 4: 'Management conroller manufacturer', + 5: 'IPMI version', + 6: 'Management controller firmware', + 7: 'Management controller boot block', + 8: 'Management controller firmware', + 9: 'System Firmware (UEFI/BIOS)', + 0xa: 'SMBIOS', + 0xb: 'OS', + 0xc: 'OS Loader', + 0xd: 'Diagnostics', + 0xe: 'Management agent', + 0xf: 'Management application', + 0x10: 'Management middleware', + 0x11: 'FPGA', + 0x12: 'FRU', + 0x13: 'FRU', + 0x14: 'Equivalent FRU', + 0x15: 'Updated FRU', + 0x16: 'Older FRU', + 0x17: 'Hardware (switch/jumper)', +} + +fru_states = { + 0: 'Normal', + 1: 'Externally requested', + 2: 'Latch', + 3: 'Hot swap', + 4: 'Internal action', + 5: 'Lost communication', + 
6: 'Lost communication', + 7: 'Unexpected removal', + 8: 'Operator', + 9: 'Unable to compute IPMB address', + 0xa: 'Unexpected deactivation', +} + + +def decode_eventdata(sensor_type, offset, eventdata, sdr): + """Decode extra event data from an alert or log + + Provide a textual summary of eventdata per descriptions in + Table 42-3 of the specification. This is for sensor specific + offset events only. + + :param sensor_type: The sensor type number from the event + :param offset: Sensor specific offset + :param eventdata: The three bytes from the log or alert + """ + if sensor_type == 5 and offset == 4: # link loss, indicates which port + return 'Port {0}'.format(eventdata[1]) + elif sensor_type == 8 and offset == 6: # PSU cfg error + errtype = eventdata[2] & 0b1111 + return psucfg_errors.get(errtype, 'Unknown') + elif sensor_type == 0xc and offset == 8: # Memory spare + return 'Module {0}'.format(eventdata[2]) + elif sensor_type == 0xf: + if offset == 0: # firmware error + return firmware_errors.get(eventdata[1], 'Unknown') + elif offset in (1, 2): + return firmware_progress.get(eventdata[1], 'Unknown') + elif sensor_type == 0x10: + if offset == 0: # Correctable error logging on a specific memory part + return 'Module {0}'.format(eventdata[1]) + elif offset == 1: + return 'Reading type {0:02X}h, offset {1:02X}h'.format( + eventdata[1], eventdata[2] & 0b1111) + elif offset == 5: + return '{0}%'.format(eventdata[2]) + elif offset == 6: + return 'Processor {0}'.format(eventdata[1]) + elif sensor_type == 0x12: + if offset == 3: + action = (eventdata[1] & 0b1111000) >> 4 + return auxlog_actions.get(action, 'Unknown') + elif offset == 4: + sysactions = [] + if eventdata[1] & 0b1 << 5: + sysactions.append('NMI') + if eventdata[1] & 0b1 << 4: + sysactions.append('OEM action') + if eventdata[1] & 0b1 << 3: + sysactions.append('Power Cycle') + if eventdata[1] & 0b1 << 2: + sysactions.append('Reset') + if eventdata[1] & 0b1 << 1: + sysactions.append('Power Down') + if 
eventdata[1] & 0b1: + sysactions.append('Alert') + return ','.join(sysactions) + elif offset == 5: # Clock change event, either before or after + if eventdata[1] & 0b10000000: + return 'After' + else: + return 'Before' + elif sensor_type == 0x19 and offset == 0: + return 'Requested {0] while {1}'.format(eventdata[1], eventdata[2]) + elif sensor_type == 0x1d and offset == 7: + return restart_causes.get(eventdata[1], 'Unknown') + elif sensor_type == 0x21 and offset == 0x9: + return '{0} {1}'.format(slot_types.get(eventdata[1], 'Unknown'), + eventdata[2]) + + elif sensor_type == 0x23: + phase = eventdata[1] & 0b1111 + return watchdog_boot_phases.get(phase, 'Unknown') + elif sensor_type == 0x28: + if offset == 4: + return 'Sensor {0}'.format(eventdata[1]) + elif offset == 5: + islogical = (eventdata[1] & 0b10000000) + if islogical: + if eventdata[2] in sdr.fru: + return sdr.fru[eventdata[2]].fru_name + else: + return 'FRU {0}'.format(eventdata[2]) + elif sensor_type == 0x2a and offset == 3: + return 'User {0}'.format(eventdata[1]) + elif sensor_type == 0x2b: + return version_changes.get(eventdata[1], 'Unknown') + elif sensor_type == 0x2c: + cause = (eventdata[1] & 0b11110000) >> 4 + cause = fru_states.get(cause, 'Unknown') + oldstate = eventdata[1] & 0b1111 + if oldstate != offset: + try: + cause += '(change from {0})'.format( + ipmiconst.sensor_type_offsets[0x2c][oldstate]['desc']) + except KeyError: + pass + + +def _fix_sel_time(records, ipmicmd): + timefetched = False + rsp = None + while not timefetched: + try: + rsp = ipmicmd.xraw_command(netfn=0xa, command=0x48) + timefetched = True + except pygexc.IpmiException as pi: + if pi.ipmicode == 0x81: + continue + raise + # The specification declares an epoch and all that, but we really don't + # care. 
We instead just focus on differences from the 'present' + nowtime = struct.unpack_from('> 6 + byte3type = (event_data[0] & 0b110000) >> 4 + if byte2type == 1: + event['triggered_value'] = event_data[1] + evtoffset = event_data[0] & 0b1111 + if event_type <= 0xc: + # use generic offset decode for event description + event['entity_type'] = ipmiconst.sensor_type_codes.get( + sensor_type, '') + evreading = ipmiconst.generic_type_offsets.get( + event_type, {}).get(evtoffset, {}) + event['description'] = evreading.get('desc', '') + event['severity'] = evreading.get('severity', pygconst.Health.Ok) + elif event_type == 0x6f: + event['entity_type'] = ipmiconst.sensor_type_codes.get( + sensor_type, '') + evreading = ipmiconst.sensor_type_offsets.get( + sensor_type, {}).get(evtoffset, {}) + event['description'] = evreading.get('desc', '') + event['severity'] = evreading.get('severity', pygconst.Health.Ok) + if event_type == 1: # threshold + if byte3type == 1: + event['threshold_value'] = event_data[2] + if 3 in (byte2type, byte3type) or event_type == 0x6f: + # sensor specific decode, see sdr module... + # 2 - 0xc: generic discrete, 0x6f, sensor specific + additionaldata = decode_eventdata( + eventdata[3], evtoffset, event_data, self._sdr) + if additionaldata: + event['description'] = ': '.join((event['description'], + additionaldata)) + + def _sel_decode(self, origselentry): + selentry = bytearray(origselentry) + event = {} + if selentry[2] == 2 or (0xc0 <= selentry[2] <= 0xdf): + # Either standard, or at least the timestamp is standard + event['timecode'] = struct.unpack_from('= 0xe0: + # In this class of OEM message, all bytes are OEM, interpretation + # is wholly left up to the OEM layer, using the OEM ID of the BMC + event['oemdata'] = selentry[3:] + return event + + def _fetch_entries(self, ipmicmd, startat, targetlist, rsvid=0): + curr = startat + endat = curr + while curr != 0xffff: + endat = curr + reqdata = bytearray(struct.pack('