From f139904c3eb41b97a218a97d4ff7644aff45338b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Vin=C3=ADcius=20Ferr=C3=A3o?= <2031761+viniciusferrao@users.noreply.github.com>
Date: Sat, 2 May 2026 19:10:01 -0300
Subject: [PATCH] fix: invalidate NodeRange caches inherited across fork

xcatd forks child processes to handle plugin requests. The child
inherits NodeRange.pm's module-level caches (@allnodeset, %allgrphash,
@grplist) with their timestamps from the parent. If the parent had
populated these caches within the past 5 seconds, the child reuses
stale data that does not reflect database changes committed by other
requests that completed between cache population and the fork.

This causes non-deterministic failures in group-definition regression
tests (chdef_group, mkdef_group, rmdef_group) where lsdef -s runs
noderange expansion inside the forked plugin process and hits the
inherited stale cache that predates the mkdef -t group commit.

Track the PID at cache-build time and treat any cache built by a
different PID as expired, forcing a fresh database read in children.
--- perl-xCAT/xCAT/NodeRange.pm | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/perl-xCAT/xCAT/NodeRange.pm b/perl-xCAT/xCAT/NodeRange.pm index e28bac035..f79fcec5e 100644 --- a/perl-xCAT/xCAT/NodeRange.pm +++ b/perl-xCAT/xCAT/NodeRange.pm @@ -41,8 +41,11 @@ my $glstamp = 0; my $allnodesetstamp = 0; my $allgrphashstamp = 0; my %allgrphash; -my $retaincache = 0; -my $recurselevel = 0; +my $retaincache = 0; +my $recurselevel = 0; +my $nodeset_pid = 0; +my $grplist_pid = 0; +my $allgrphash_pid = 0; my @cachedcolumns; @@ -74,6 +77,9 @@ sub reset_db { #workaround, something seems to be trying to use a corrupted reference to grptab #this allows init_dbworker to reset the object $grptab = 0; + $nodeset_pid = 0; + $grplist_pid = 0; + $allgrphash_pid = 0; } sub nodesbycriteria { @@ -200,7 +206,8 @@ sub nodesbycriteria { sub expandatom { my $atom = shift; if ($recurselevel > 4096) { die "NodeRange seems to be hung on evaluating $atom, recursion limit hit"; } - unless (scalar(@allnodeset) and (($allnodesetstamp + 5) > time())) { #Build a cache of all nodes, some corner cases will perform worse, but by and large it will do better. We could do tests to see where the breaking points are, and predict how many atoms we have to evaluate to mitigate, for now, implement the strategy that keeps performance from going completely off the rails + unless (scalar(@allnodeset) and (($allnodesetstamp + 5) > time()) and $nodeset_pid == $$) { #Build a cache of all nodes, some corner cases will perform worse, but by and large it will do better. 
We could do tests to see where the breaking points are, and predict how many atoms we have to evaluate to mitigate, for now, implement the strategy that keeps performance from going completely off the rails + $nodeset_pid = $$; $allnodesetstamp = time(); $nodelist->_set_use_cache(1); @allnodeset = $nodelist->getAllAttribs('node', 'groups'); @@ -234,7 +241,8 @@ sub expandatom { unless ($grptab) { $grptab = xCAT::Table->new('nodegroup'); } - if ($grptab and (($glstamp < (time() - 5)) or (not $didgrouplist and not scalar @grplist))) { + if ($grptab and (($glstamp < (time() - 5)) or $grplist_pid != $$ or (not $didgrouplist and not scalar @grplist))) { + $grplist_pid = $$; $didgrouplist = 1; $glstamp = time(); my $grplist_ptr = $grptab->getAllEntries(); @@ -270,7 +278,8 @@ sub expandatom { # The atom is not a dynamic node group, is it a static node group??? if (!$isdynamicgrp) { - unless (scalar %allgrphash and (time() < ($allgrphashstamp + 5))) { #build a group membership cache + unless (scalar %allgrphash and (time() < ($allgrphashstamp + 5)) and $allgrphash_pid == $$) { #build a group membership cache + $allgrphash_pid = $$; $allgrphashstamp = time(); %allgrphash = (); my $nlent; @@ -488,6 +497,9 @@ sub retain_cache { #A semi private operation to be used *ONLY* in the interestin unless ($retaincache) { #take a call to retain_cache(0) to also mean that any existing #cache must be zapped if ($nodelist) { $nodelist->_build_cache(1); } + $nodeset_pid = 0; + $grplist_pid = 0; + $allgrphash_pid = 0; $glstamp = 0; $allnodesetstamp = 0; $allgrphashstamp = 0;