From 8bb35234aa8469ba93a2171fced503a875c2f1f6 Mon Sep 17 00:00:00 2001 From: Jeffrey Altman Date: Thu, 12 Feb 2009 21:30:03 +0000 Subject: [PATCH] windows-vmoved-vnovol-failover-20090212 LICENSE MIT FIXES 124276 A readonly volume with multiple instances was being marked alldown when one of the instances resulted in VNOVOL because the vldb and the indicated file server were out of sync. The cache manager would then attempt to re-obtain the volume location info from the vldb for the entire timeout period. This patch: adds trace logging to cm_Analyze indicating which server references are being deleted and which remain in response to a VNOVOL/VMOVED error. adds trace logging to cm_UpdateVolumeStatusInt showing how the new volume status value is determined. corrects cm_Analyze() so that it doesn't corrupt the server reference list during cm_FreeServerList() calls; doesn't orphan the server reference list by setting *serverpp to NULL; and only re-obtains the volume info list once per request. --- src/WINNT/afsd/cm_conn.c | 57 ++++++++++++++++++++++++++++++++------ src/WINNT/afsd/cm_conn.h | 1 + src/WINNT/afsd/cm_volume.c | 26 +++++++++++++++-- 3 files changed, 73 insertions(+), 11 deletions(-) diff --git a/src/WINNT/afsd/cm_conn.c b/src/WINNT/afsd/cm_conn.c index 1d4e0fd15..693b7feae 100644 --- a/src/WINNT/afsd/cm_conn.c +++ b/src/WINNT/afsd/cm_conn.c @@ -345,8 +345,9 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, } lock_ReleaseWrite(&cm_serverLock); if (free_svr_list) { - cm_FreeServerList(&serversp, 0); - *serverspp = serversp = NULL; + cm_FreeServerList(serverspp, 0); + serverspp = NULL; + serversp = NULL; free_svr_list = 0; } @@ -420,8 +421,9 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, } if (free_svr_list) { - cm_FreeServerList(&serversp, 0); - *serverspp = serversp = NULL; + cm_FreeServerList(serverspp, 0); + serverspp = NULL; + serversp = NULL; free_svr_list = 0; } retry = 1; @@ -434,6 +436,10 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, char addr[16]; char *format; DWORD msgID; + + /* In case of timeout */ + reqp->volumeError = errorCode; + switch ( errorCode ) { case VNOVOL: msgID = MSG_SERVER_REPORTS_VNOVOL; @@ -473,11 +479,14 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, LogEvent(EVENTLOG_WARNING_TYPE, msgID, addr, fidp->volume); } - /* Mark server offline for this volume */ + /* + * Mark server offline for this volume or delete the volume + * from the server list if it was moved or is not present. + */ if (!serversp && fidp) { code = cm_GetServerList(fidp, userp, reqp, &serverspp); if (code == 0) { - serversp = *serverspp = NULL; + serversp = *serverspp; free_svr_list = 1; } } @@ -486,23 +495,39 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, for (tsrp = serversp; tsrp; tsrp=tsrp->next) { if (tsrp->status == srv_deleted) continue; + + sprintf(addr, "%d.%d.%d.%d", + ((tsrp->server->addr.sin_addr.s_addr & 0xff)), + ((tsrp->server->addr.sin_addr.s_addr & 0xff00)>> 8), + ((tsrp->server->addr.sin_addr.s_addr & 0xff0000)>> 16), + ((tsrp->server->addr.sin_addr.s_addr & 0xff000000)>> 24)); + if (tsrp->server == serverp) { /* REDIRECT */ if (errorCode == VMOVED || errorCode == VNOVOL) { + osi_Log2(afsd_logp, "volume %d not present on server %s", + fidp->volume, osi_LogSaveString(afsd_logp,addr)); tsrp->status = srv_deleted; if (fidp) cm_RemoveVolumeFromServer(serverp, fidp->volume); } else { + osi_Log2(afsd_logp, "volume %d instance on server %s marked offline", + fidp->volume, osi_LogSaveString(afsd_logp,addr)); tsrp->status = srv_offline; } + /* break; */ + } else { + osi_Log3(afsd_logp, "volume %d exists on server %s with status %u", + fidp->volume, osi_LogSaveString(afsd_logp,addr), tsrp->status); } } lock_ReleaseWrite(&cm_serverLock); /* Free the server list before cm_ForceUpdateVolume is called */ if (free_svr_list) { - cm_FreeServerList(&serversp, 0); - *serverspp = serversp = NULL; + cm_FreeServerList(serverspp, 0); + serverspp = NULL; + serversp = NULL; free_svr_list = 0; } @@ -513,10 +538,14 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, if (code == 0) statep = cm_VolumeStateByID(volp, fidp->volume); - if (errorCode == VMOVED || errorCode == VNOVOL) { + if ((errorCode == VMOVED || errorCode == VNOVOL) && + !(reqp->flags & CM_REQ_VOLUME_UPDATED)) + { code = cm_ForceUpdateVolume(fidp, userp, reqp); if (code) timeLeft = 0; /* prevent a retry on failure */ + else + reqp->flags |= CM_REQ_VOLUME_UPDATED; osi_Log3(afsd_logp, "cm_Analyze called cm_ForceUpdateVolume cell %u vol %u code 0x%x", fidp->cell, fidp->volume, code); } @@ -799,6 +828,16 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, if (connp) cm_PutConn(connp); + /* + * clear the volume updated flag if we succeed. + * this way the flag will not prevent a subsequent volume + * from being updated if necessary. + */ + if (errorCode == 0) + { + reqp->flags &= ~CM_REQ_VOLUME_UPDATED; + } + /* retry until we fail to find a connection */ return retry; } diff --git a/src/WINNT/afsd/cm_conn.h b/src/WINNT/afsd/cm_conn.h index 68b464ce2..d5f41e279 100644 --- a/src/WINNT/afsd/cm_conn.h +++ b/src/WINNT/afsd/cm_conn.h @@ -55,6 +55,7 @@ typedef struct cm_req { #define CM_REQ_NORETRY 0x1 #define CM_REQ_NEW_CONN_FORCED 0x2 #define CM_REQ_SOURCE_SMB 0x4 +#define CM_REQ_VOLUME_UPDATED 0x8 /* * Vice2 error codes diff --git a/src/WINNT/afsd/cm_volume.c b/src/WINNT/afsd/cm_volume.c index 5f8fe841b..e107e1664 100644 --- a/src/WINNT/afsd/cm_volume.c +++ b/src/WINNT/afsd/cm_volume.c @@ -1283,6 +1283,7 @@ cm_UpdateVolumeStatusInt(cm_volume_t *volp, struct cm_vol_state *statep) cm_serverRef_t *tsrp; cm_server_t *tsp; int someBusy = 0, someOffline = 0, allOffline = 1, allBusy = 1, allDown = 1; + char addr[16]; if (!volp || !statep) { #ifdef DEBUG @@ -1293,29 +1294,50 @@ cm_UpdateVolumeStatusInt(cm_volume_t *volp, struct cm_vol_state *statep) lock_ObtainWrite(&cm_serverLock); for (tsrp = statep->serversp; tsrp; tsrp=tsrp->next) { - if (tsrp->status == srv_deleted) - continue; tsp = tsrp->server; + sprintf(addr, "%d.%d.%d.%d", + ((tsp->addr.sin_addr.s_addr & 0xff)), + ((tsp->addr.sin_addr.s_addr & 0xff00)>> 8), + ((tsp->addr.sin_addr.s_addr & 0xff0000)>> 16), + ((tsp->addr.sin_addr.s_addr & 0xff000000)>> 24)); + + if (tsrp->status == srv_deleted) { + osi_Log2(afsd_logp, "cm_UpdateVolumeStatusInt volume %d server reference %s deleted", + statep->ID, osi_LogSaveString(afsd_logp,addr)); + continue; + } if (tsp) { cm_GetServerNoLock(tsp); if (!(tsp->flags & CM_SERVERFLAG_DOWN)) { allDown = 0; if (tsrp->status == srv_busy) { + osi_Log2(afsd_logp, "cm_UpdateVolumeStatusInt volume %d server reference %s busy", + statep->ID, osi_LogSaveString(afsd_logp,addr)); allOffline = 0; someBusy = 1; } else if (tsrp->status == srv_offline) { + osi_Log2(afsd_logp, "cm_UpdateVolumeStatusInt volume %d server reference %s offline", + statep->ID, osi_LogSaveString(afsd_logp,addr)); allBusy = 0; someOffline = 1; } else { + osi_Log2(afsd_logp, "cm_UpdateVolumeStatusInt volume %d server reference %s online", + statep->ID, osi_LogSaveString(afsd_logp,addr)); allOffline = 0; allBusy = 0; } + } else { + osi_Log2(afsd_logp, "cm_UpdateVolumeStatusInt volume %d server reference %s online", + statep->ID, osi_LogSaveString(afsd_logp,addr)); } cm_PutServerNoLock(tsp); } } lock_ReleaseWrite(&cm_serverLock); + osi_Log5(afsd_logp, "cm_UpdateVolumeStatusInt allDown %d allBusy %d someBusy %d someOffline %d allOffline %d", + allDown, allBusy, someBusy, someOffline, allOffline); + if (allDown) newStatus = vl_alldown; else if (allBusy || (someBusy && someOffline)) -- 2.39.5