From b80dc7d11ce2e6a3428c76065c5ccdab2f8d8011 Mon Sep 17 00:00:00 2001 From: dev Date: Thu, 4 Aug 2005 18:31:58 +0000 Subject: [PATCH] add MAXNODEPERJOB bounds checking git-svn-id: svn://opensvn.adaptivecomputing.com/maui/trunk@16 3f5042e3-fb1d-0410-be18-d6ca2573e517 --- include/moab.h | 3 ++- src/moab/MPBSI.c | 45 ++++++++++++++++++++++++++++++++++++--------- src/moab/MSched.c | 32 ++++++++++++++++++++++++++++++-- 3 files changed, 68 insertions(+), 12 deletions(-) diff --git a/include/moab.h b/include/moab.h index f92c0d4..5d32416 100644 --- a/include/moab.h +++ b/include/moab.h @@ -1059,7 +1059,8 @@ enum { mjfSPViolation, mjfIgnNodePolicies, mjfNoRMStart, - mjfGlobalQueue }; + mjfGlobalQueue, + mjfIsExiting }; enum { mjifNONE = 0, diff --git a/src/moab/MPBSI.c b/src/moab/MPBSI.c index 238a623..0ab5394 100644 --- a/src/moab/MPBSI.c +++ b/src/moab/MPBSI.c @@ -3,11 +3,11 @@ /* Contains: * * int MPBSInitialize(R,SC) * * int MPBSWorkloadQuery(R,JCount,SC) * - * int __MPBSJobGetState(Name,Status,PJob) * + * int __MPBSJobGetState(Name,Status,PJob,IE) * * int MPBSClusterQuery(R,RCount,SC) * * int __MPBSGetNodeState(Name,State,PNode) * * int MPBSJobStart(J,R,Msg,SC) * - * int MPBSCancelJob(J,Message,R) * + * int MPBSJobCancel(J,Message,R) * * int MPBSJobMigrate(J,R,NL,Msg,SC) * * int MPBSJobSubmit(String,R,J,JobName,Msg,SC) * * int MPBSNodeLoad(N,PNode,State,RMIndex) * @@ -89,7 +89,7 @@ int __MPBSSystemQuery(mrm_t *,int *); int MPBSJobUpdate(struct batch_status *,mjob_t *,short *,int); int MPBSNodeLoad(mnode_t *,struct batch_status *,int,mrm_t *); int MPBSNodeUpdate(mnode_t *,struct batch_status *,enum MNodeStateEnum,mrm_t *); -int __MPBSJobGetState(struct batch_status *,mrm_t *,char *,enum MJobStateEnum *); +int __MPBSJobGetState(struct batch_status *,mrm_t *,char *,enum MJobStateEnum *,mbool_t *); int __MPBSGetNodeState(char *,enum MNodeStateEnum *,struct batch_status *); int MPBSQueryMOM(mnode_t *,mrm_t *,char *,int *); int MPBSGetClassInfo(mnode_t *N,char 
C[][MAX_MNAME],char A[][MAX_MNAME]); @@ -545,6 +545,8 @@ int MPBSWorkloadQuery( mjob_t *JNext; + mbool_t IsExiting; + const char *FName = "MPBSWorkloadQuery"; DBG(1,fPBS) DPrint("%s(%s,JCount,SC)\n", @@ -585,7 +587,7 @@ int MPBSWorkloadQuery( else ErrMsg = NULL; - if (ErrMsg == NULL) + if (pbs_errno == 0) { DBG(3,fPBS) DPrint("INFO: queue is empty\n"); } @@ -630,9 +632,12 @@ int MPBSWorkloadQuery( RMJID[0] = '\0'; - if (__MPBSJobGetState(cur_job,R,RMJID,&Status) == FAILURE) + if (__MPBSJobGetState(cur_job,R,RMJID,&Status,&IsExiting) == FAILURE) break; + if (IsExiting == TRUE) + J->Flags |= (1 << mjfIsExiting); + MJobGetName(NULL,RMJID,R,SJID,sizeof(SJID),mjnShortName); switch (Status) @@ -830,13 +835,17 @@ int __MPBSJobGetState( struct batch_status *PJob, /* I */ mrm_t *R, /* I */ char *JobName, /* O (optional) */ - enum MJobStateEnum *Status) /* O */ + enum MJobStateEnum *Status, /* O */ + mbool_t *IsExiting) { struct attrl *AP; *Status = mjsNONE; + if (IsExiting != NULL) + *IsExiting = FALSE; + if ((JobName != NULL) && (JobName[0] == '\0')) { strcpy(JobName,PJob->name); @@ -887,7 +896,10 @@ int __MPBSJobGetState( break; case 'E': /* differences between 'exiting' and 'completed?' */ - + + if (IsExiting != NULL) + *IsExiting = TRUE; + *Status = mjsRunning; break; @@ -2097,6 +2109,11 @@ int MPBSJobCancel( (R != NULL) ? R->Name : "NULL", (Message != NULL) ? 
Message : "NULL"); + if (J->Flags & (1 << mjfIsExiting)) + { + return(SUCCESS); + } + if (MSched.PreemptPolicy == mppCheckpoint) { return(MPBSJobCkpt(J,R,TRUE,Message,SC)); @@ -3314,6 +3331,8 @@ int MPBSJobLoad( tpbsa_t TA; + mbool_t IsExiting; + const char *FName = "MPBSJobLoad"; DBG(2,fPBS) DPrint("%s(%s,%s,J,TaskList,%d)\n", @@ -3329,7 +3348,7 @@ int MPBSJobLoad( memset(&TA,0,sizeof(TA)); - if (__MPBSJobGetState(PJob,&MRM[RMIndex],NULL,&J->State) == FAILURE) + if (__MPBSJobGetState(PJob,&MRM[RMIndex],NULL,&J->State,&IsExiting) == FAILURE) { DBG(1,fPBS) DPrint("ALERT: cannot get job state info for job '%s'\n", J->Name); @@ -3339,6 +3358,9 @@ int MPBSJobLoad( return(FAILURE); } + if (IsExiting == TRUE) + J->Flags |= (1 << mjfIsExiting); + /* add resource requirements information */ if (MReqCreate(J,NULL,&RQ,FALSE) == FAILURE) @@ -3558,6 +3580,8 @@ int MPBSJobUpdate( int MaxJobMem; int MaxJobSwap; + mbool_t IsExiting; + const char *FName = "MPBSJobUpdate"; DBG(2,fPBS) DPrint("%s(%s,%s,TaskList,%d)\n", @@ -3577,7 +3601,7 @@ int MPBSJobUpdate( TaskList[0] = -1; - if (__MPBSJobGetState(PJob,&MRM[RMIndex],NULL,&J->State) == FAILURE) + if (__MPBSJobGetState(PJob,&MRM[RMIndex],NULL,&J->State,&IsExiting) == FAILURE) { DBG(1,fPBS) DPrint("ALERT: cannot get job state info for job '%s'\n", J->Name); @@ -3585,6 +3609,9 @@ int MPBSJobUpdate( return(FAILURE); } + if (IsExiting == TRUE) + J->Flags |= (1 << mjfIsExiting); + RQ = J->Req[0]; TaskCount = 1; diff --git a/src/moab/MSched.c b/src/moab/MSched.c index edc0b7b..bbfb44d 100644 --- a/src/moab/MSched.c +++ b/src/moab/MSched.c @@ -6186,6 +6186,9 @@ int MJobDistributeTasks( for (nindex = 0;RQ->NodeList[nindex].N != NULL;nindex++) { + if (nindex >= MMAX_NODE_PER_JOB) + break; + MaxTPN = MAX(MaxTPN,RQ->NodeList[nindex].TC); } @@ -6197,6 +6200,9 @@ int MJobDistributeTasks( for (nindex = 0;RQ->NodeList[nindex].N != NULL;nindex++) { + if (nindex >= MMAX_NODE_PER_JOB) + break; + if (Overflow == 0) break; @@ -6223,6 +6229,9 @@ int 
MJobDistributeTasks( for (nindex = 0;RQ->NodeList[nindex].N != NULL;nindex++) { + if (nindex >= MMAX_NODE_PER_JOB) + break; + if (index == RQ->NodeCount) { DBG(6,fSCHED) DPrint("INFO: nodecount %d reached\n", @@ -6290,6 +6299,9 @@ int MJobDistributeTasks( for (nindex = sindex;RQ->NodeList[nindex].N != NULL;nindex++) { + if (nindex >= MMAX_NODE_PER_JOB) + break; + if ((int)RQ->NodeList[nindex].TC > MaxTPN) { MaxTPN = RQ->NodeList[nindex].TC; @@ -6314,6 +6326,9 @@ int MJobDistributeTasks( for (nindex = 0;nindex < AllocIndex;nindex++) { + if (nindex >= MMAX_NODE_PER_JOB) + break; + if (RQ->BlockingFactor != 1) { /* blocking factor specification removes single step constraint */ @@ -6351,6 +6366,9 @@ int MJobDistributeTasks( for (nindex = AllocIndex - 1;nindex > 0;nindex--) { + if (nindex >= MMAX_NODE_PER_JOB) + break; + if (tmpNodeList[nindex].TC != tmpNodeList[nindex - 1].TC) { tmpNodeList[nindex - 1].TC--; @@ -6369,6 +6387,9 @@ int MJobDistributeTasks( for (nindex = AllocIndex - 1;nindex >= 0;nindex--) { + if (nindex >= MMAX_NODE_PER_JOB) + break; + tmpNodeList[nindex].TC--; TasksAvail--; @@ -6403,6 +6424,9 @@ int MJobDistributeTasks( for (nindex = 0;RQ->NodeList[nindex].N != NULL;nindex++) { + if (nindex >= MMAX_NODE_PER_JOB) + break; + if (RQ->NodeList[nindex].TC >= TPN) { tmpNodeCount --; @@ -6432,6 +6456,9 @@ int MJobDistributeTasks( for (nindex = 0;RQ->NodeList[nindex].N != NULL;nindex++) { + if (nindex >= MMAX_NODE_PER_JOB) + break; + tmpNodeList[index].N = RQ->NodeList[nindex].N; if (RQ->NodeCount > 0) @@ -6515,8 +6542,6 @@ int MJobDistributeTasks( } /* END else (R->Type == rmLL) */ } /* END for (rqindex) */ - - tmpNodeList[tmpNodeCount].N = NULL; if (NodeList != NULL) @@ -6626,6 +6651,9 @@ int MJobDistributeTasks( for (nindex1 = 0;NodeList[nindex1].N != NULL;nindex1++) { + if (nindex1 >= MMAX_NODE_PER_JOB) + break; + if (NodeList[nindex1].TC == 0) continue;