From 3bd46ea84f55df1ffbc6eae9d445fd9cae5090d7 Mon Sep 17 00:00:00 2001 From: Jeffrey Altman Date: Fri, 7 Oct 2005 03:25:09 +0000 Subject: [PATCH] STABLE140-windows-busy-vs-offline-20051006 Discovered a failure in the state machine. There was no method of distinguishing between all servers being Down (which is handled by the background thread) and all volumes being offline (perhaps due to a move). (cherry picked from commit 465273f096b1a1720ca07638537fd4c65d7d1d8d) --- src/WINNT/afsd/cm.h | 2 ++ src/WINNT/afsd/cm_conn.c | 70 +++++++++++++++++++++++++++++++++------- 2 files changed, 61 insertions(+), 11 deletions(-) diff --git a/src/WINNT/afsd/cm.h b/src/WINNT/afsd/cm.h index 91f3efef2..eb5a6c736 100644 --- a/src/WINNT/afsd/cm.h +++ b/src/WINNT/afsd/cm.h @@ -251,4 +251,6 @@ int RXAFS_Lookup (struct rx_connection *, #define CM_ERROR_TIDIPC (CM_ERROR_BASE+44) #define CM_ERROR_TOO_MANY_SYMLINKS (CM_ERROR_BASE+45) #define CM_ERROR_PATH_NOT_COVERED (CM_ERROR_BASE+46) +/* 47 and 48 are reserved for the byte range lock support */ +#define CM_ERROR_ALLDOWN (CM_ERROR_BASE+49) #endif /* __CM_H_ENV__ */ diff --git a/src/WINNT/afsd/cm_conn.c b/src/WINNT/afsd/cm_conn.c index ccb31d1e0..d33a8e32d 100644 --- a/src/WINNT/afsd/cm_conn.c +++ b/src/WINNT/afsd/cm_conn.c @@ -233,13 +233,54 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, } } - else if (errorCode == CM_ERROR_ALLOFFLINE) { - osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLOFFLINE."); + else if (errorCode == CM_ERROR_ALLDOWN) { + osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLDOWN."); /* Servers marked DOWN will be restored by the background daemon * thread as they become available. */ } + else if (errorCode == CM_ERROR_ALLOFFLINE) { + if (timeLeft > 7) { + osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLOFFLINE."); + thrd_Sleep(5000); + + if (fidp) { /* Not a VLDB call */ + if (!serversp) { + code = cm_GetServerList(fidp, userp, reqp, &serverspp); + if (code == 0) { + serversp = *serverspp; + free_svr_list = 1; + } + } + if (serversp) { + lock_ObtainWrite(&cm_serverLock); + for (tsrp = serversp; tsrp; tsrp=tsrp->next) + tsrp->status = not_busy; + lock_ReleaseWrite(&cm_serverLock); + if (free_svr_list) { + cm_FreeServerList(&serversp); + *serverspp = serversp; + } + retry = 1; + } + + cm_ForceUpdateVolume(fidp, userp, reqp); + } else { /* VLDB call */ + if (serversp) { + lock_ObtainWrite(&cm_serverLock); + for (tsrp = serversp; tsrp; tsrp=tsrp->next) + tsrp->status = not_busy; + lock_ReleaseWrite(&cm_serverLock); + if (free_svr_list) { + cm_FreeServerList(&serversp); + *serverspp = serversp; + } + } + } + } + } + /* if all servers are busy, mark them non-busy and start over */ else if (errorCode == CM_ERROR_ALLBUSY) { osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLBUSY."); @@ -465,12 +506,17 @@ long cm_ConnByMServers(cm_serverRef_t *serversp, cm_user_t *usersp, cm_serverRef_t *tsrp; cm_server_t *tsp; long firstError = 0; - int someBusy = 0, someOffline = 0, allBusy = 1, allDown = 1; + int someBusy = 0, someOffline = 0, allOffline = 1, allBusy = 1, allDown = 1; long timeUsed, timeLeft, hardTimeLeft; #ifdef DJGPP struct timeval now; #endif /* DJGPP */ + if (serversp == NULL) { + osi_Log1(afsd_logp, "cm_ConnByMServers returning 0x%x", CM_ERROR_NOSUCHVOLUME); + return CM_ERROR_NOSUCHVOLUME; + } + *connpp = NULL; #ifndef DJGPP @@ -490,13 +536,15 @@ long cm_ConnByMServers(cm_serverRef_t *serversp, cm_user_t *usersp, cm_GetServerNoLock(tsp); lock_ReleaseWrite(&cm_serverLock); if (!(tsp->flags & CM_SERVERFLAG_DOWN)) { + allDown = 0; if (tsrp->status == busy) { - allDown = 0; + allOffline = 0; someBusy = 1; } else if (tsrp->status == offline) { - someOffline = 1; + allBusy = 0; + someOffline = 1; } else { - allDown = 0; + allOffline = 0; allBusy = 0; code = cm_ConnByServer(tsp, usersp, connpp); if (code == 0) { /* cm_CBS only returns 0 */ @@ -523,15 +571,15 @@ long cm_ConnByMServers(cm_serverRef_t *serversp, cm_user_t *usersp, lock_ObtainWrite(&cm_serverLock); cm_PutServerNoLock(tsp); } - lock_ReleaseWrite(&cm_serverLock); + if (firstError == 0) { - if (serversp == NULL) - firstError = CM_ERROR_NOSUCHVOLUME; - else if (allDown) - firstError = CM_ERROR_ALLOFFLINE; + if (allDown) + firstError = CM_ERROR_ALLDOWN; else if (allBusy) firstError = CM_ERROR_ALLBUSY; + else if (allOffline || (someBusy && someOffline)) + firstError = CM_ERROR_ALLOFFLINE; else { osi_Log0(afsd_logp, "cm_ConnByMServers returning impossible error TIMEDOUT"); firstError = CM_ERROR_TIMEDOUT; -- 2.39.5