From: Jeffrey Altman Date: Fri, 23 Jul 2004 22:55:23 +0000 (+0000) Subject: allserversdown-20040723 X-Git-Tag: openafs-devel-1_3_66~18 X-Git-Url: https://git.michaelhowe.org/gitweb/?a=commitdiff_plain;h=d4b14b4e947b44c8e167c81d07554ca52bc814ff;p=packages%2Fo%2Fopenafs.git allserversdown-20040723 When all servers associated with a volume get into the CM_SERVERFLAG_DOWN state, the error CM_ERROR_ALLOFFLINE would be returned by cm_ConnByMServers. cm_Analyze was then supposed to pause for 5 seconds and then reset the volume information. Unfortunately, although it called cm_ForceUpdateVolume, cm_ForceUpdateVolume does not reset the CM_SERVERFLAG_DOWN state on the servers. Therefore, when cm_ForceUpdateVolume called cm_ConnByMServers, it would again be given CM_ERROR_ALLOFFLINE. In other words, there was no way out of the state. cm_Analyze will now clear the CM_SERVERFLAG_DOWN flag and reset the server status from busy to not_busy after its 5-second wait. This will allow cm_ForceUpdateVolume to actually reset the volume information, and refresh it if servers for the volume are newly accessible. Also, added some missing locking calls in cm_Analyze. In cm_ConnByMServers, change the error reporting to return CM_ERROR_ALLBUSY only when all servers are busy or down; return CM_ERROR_ALLOFFLINE when all servers are down; and only return CM_ERROR_NOSUCHVOLUME if the server list for the volume is empty. In all other cases return CM_ERROR_TIMEDOUT. --- diff --git a/src/WINNT/afsd/cm_conn.c b/src/WINNT/afsd/cm_conn.c index b3e22c0e3..ec0876a64 100644 --- a/src/WINNT/afsd/cm_conn.c +++ b/src/WINNT/afsd/cm_conn.c @@ -201,17 +201,33 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, osi_Log0(afsd_logp, "cm_Analyze passed CM_ERROR_ALLOFFLINE."); thrd_Sleep(5000); /* cm_ForceUpdateVolume marks all servers as non_busy */ - cm_ForceUpdateVolume(fidp, userp, reqp); + /* No it doesn't. It won't do anything if all of the + * the servers are marked as DOWN. 
So clear the DOWN + * flag and reset the busy state as well. + */ + cm_GetServerList(fidp, userp, reqp, &serversp); + lock_ObtainWrite(&cm_serverLock); + for (tsrp = serversp; tsrp; tsrp=tsrp->next) { + tsrp->server->flags &= ~CM_SERVERFLAG_DOWN; + if (tsrp->status == busy) + tsrp->status = not_busy; + } + lock_ReleaseWrite(&cm_serverLock); + + if (fidp != NULL) + cm_ForceUpdateVolume(fidp, userp, reqp); retry = 1; } /* if all servers are busy, mark them non-busy and start over */ if (errorCode == CM_ERROR_ALLBUSY) { cm_GetServerList(fidp, userp, reqp, &serversp); + lock_ObtainWrite(&cm_serverLock); for (tsrp = serversp; tsrp; tsrp=tsrp->next) { if (tsrp->status == busy) tsrp->status = not_busy; } + lock_ReleaseWrite(&cm_serverLock); thrd_Sleep(5000); retry = 1; } @@ -219,6 +235,7 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, /* special codes: VBUSY and VRESTARTING */ if (errorCode == VBUSY || errorCode == VRESTARTING) { cm_GetServerList(fidp, userp, reqp, &serversp); + lock_ObtainWrite(&cm_serverLock); for (tsrp = serversp; tsrp; tsrp=tsrp->next) { if (tsrp->server == serverp && tsrp->status == not_busy) { @@ -226,6 +243,7 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, break; } } + lock_ReleaseWrite(&cm_serverLock); retry = 1; } @@ -328,7 +346,7 @@ long cm_ConnByMServers(cm_serverRef_t *serversp, cm_user_t *usersp, cm_serverRef_t *tsrp; cm_server_t *tsp; long firstError = 0; - int someBusy = 0, someOffline = 0, allDown = 1; + int someBusy = 0, someOffline = 0, allBusy = 1, allDown = 1; long timeUsed, timeLeft, hardTimeLeft; #ifdef DJGPP struct timeval now; @@ -360,6 +378,7 @@ long cm_ConnByMServers(cm_serverRef_t *serversp, cm_user_t *usersp, else if (tsrp->status == offline) someOffline = 1; else { + allBusy = 0; code = cm_ConnByServer(tsp, usersp, connpp); if (code == 0) { cm_PutServer(tsp); @@ -389,16 +408,17 @@ long cm_ConnByMServers(cm_serverRef_t *serversp, cm_user_t *usersp, lock_ReleaseWrite(&cm_serverLock); if 
(firstError == 0) { - if (someBusy) + if (allBusy) firstError = CM_ERROR_ALLBUSY; - else if (someOffline) + else if (allDown) firstError = CM_ERROR_ALLOFFLINE; - else if (!allDown && serversp) - firstError = CM_ERROR_TIMEDOUT; - /* Only return CM_ERROR_NOSUCHVOLUME if there are no - servers for this volume */ - else + else if (serversp == NULL) + /* Only return CM_ERROR_NOSUCHVOLUME if there are no + * servers for this volume + */ firstError = CM_ERROR_NOSUCHVOLUME; + else + firstError = CM_ERROR_TIMEDOUT; } osi_Log1(afsd_logp, "cm_ConnByMServers returning %x", firstError); return firstError;