From 0b10c240bd02ee48dfde71380c3b3b2009e4bd92 Mon Sep 17 00:00:00 2001 From: Jarrod Johnson Date: Fri, 5 Jun 2026 09:44:10 -0400 Subject: [PATCH] Fix issues with IPMI session management Do not continue waiting when session is broken. Do not call _timedout without releasing the lock first. Properly await on relog with bad rakp4. If an accounting issue pushes logontries too far without touching zero, then still recognize retries were exhausted. Timeout on missing RAKP2 if retries were already exhausted. --- .../aiohmi/ipmi/private/session.py | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/confluent_server/aiohmi/ipmi/private/session.py b/confluent_server/aiohmi/ipmi/private/session.py index d2eeefe8..746c958e 100644 --- a/confluent_server/aiohmi/ipmi/private/session.py +++ b/confluent_server/aiohmi/ipmi/private/session.py @@ -549,7 +549,7 @@ class Session(object): self.socketchecking.release() await self.login() if not self.async_: - while self.logging: + while self.logging and not self.broken: await Session.wait_for_rsp() if self.broken: raise exc.IpmiException(self.errormsg) @@ -792,9 +792,9 @@ class Session(object): self.waiting_sessions.pop(self, None) if not self.lastpayload and not self.logging: return - await self._timedout() finally: WAITING_SESSIONS.release() + await self._timedout() finally: self.awaitingresponse = False @@ -1668,7 +1668,7 @@ class Session(object): if data[1] == 2 and self.logontries: # if we retried RAKP3 because # RAKP4 got dropped, BMC can consider it done and we must # restart - self._relog() + await self._relog() return # ignore 15 value if we are retrying. 
# xCAT did but I can't recall why exactly @@ -1804,7 +1804,7 @@ class Session(object): self.nowait = True self.timeout += 1 if self.timeout > self.maxtimeout: - if not self.logontries: + if self.logontries <= 0: response = {'error': 'timeout', 'code': 0xffff} if self.ipmicallback: await self.ipmicallback(response) @@ -1834,8 +1834,20 @@ class Session(object): or self.sessioncontext == 'EXPECTINGRAKP4'): # If we can't be sure which RAKP was dropped or if RAKP3/4 was just # delayed, the most reliable thing to do is rewind and start over - # bmcs do not take kindly to receiving RAKP1 or RAKP3 twice - await self._relog() + # bmcs do not take kindly to receiving RAKP1 or RAKP3 twice. + # Only do this while we still have login attempts budgeted; + # otherwise each lost RAKP would spawn a fresh RAKP1 forever + # (_relog resets self.timeout, so the timeout budget above never + # trips during the RAKP phase). + if self.logontries > 0: + await self._relog() + else: + response = {'error': 'timeout', 'code': 0xffff} + if self.ipmicallback: + await self.ipmicallback(response) + self.nowait = False + await self._mark_broken() + return else: # in IPMI case, the only recourse is to act as if the packet is # idempotent. SOL has more sophisticated retry handling # the biggest risks are reset sp which is often fruitless to retry