]> git.michaelhowe.org Git - packages/o/openafs.git/commitdiff
DEVEL15-windows-vmoved-vnovol-failover-20090212
authorJeffrey Altman <jaltman@secure-endpoints.com>
Thu, 12 Feb 2009 21:30:16 +0000 (21:30 +0000)
committerJeffrey Altman <jaltman@secure-endpoints.com>
Thu, 12 Feb 2009 21:30:16 +0000 (21:30 +0000)
LICENSE MIT
FIXES 124276

A readonly volume with multiple instances was being marked alldown
when one of the instances resulted in VNOVOL because the vldb and
the indicated file server were out of sync.  The cache manager would
then attempt to re-obtain the volume location info from the vldb
for the entire timeout period.

This patch:

adds trace logging to cm_Analyze indicating which server references
are being deleted and which remain in response to a VNOVOL/VMOVED error.

adds trace logging to cm_UpdateVolumeStatusInt showing how the
new volume status value is determined.

corrects cm_Analyze() so that it doesn't corrupt the server reference
list during cm_FreeServerList() calls; doesn't orphan the server reference
list by setting *serverpp to NULL; and only re-obtains the volume info
list once per request.

(cherry picked from commit 8bb35234aa8469ba93a2171fced503a875c2f1f6)

src/WINNT/afsd/cm_conn.c
src/WINNT/afsd/cm_conn.h
src/WINNT/afsd/cm_volume.c

index f89efbe4fcad55c720b1c23889e8a168e7650cea..cabdf90292ae8c3523987cd1d53e82c3f2854b5b 100644 (file)
@@ -345,8 +345,9 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
                         }
                         lock_ReleaseWrite(&cm_serverLock);
                         if (free_svr_list) {
-                            cm_FreeServerList(&serversp, 0);
-                            *serverspp = serversp = NULL;
+                            cm_FreeServerList(serverspp, 0);
+                            serverspp = NULL;
+                            serversp = NULL;
                             free_svr_list = 0;
                         }
 
@@ -420,8 +421,9 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
         }
 
         if (free_svr_list) {
-            cm_FreeServerList(&serversp, 0);
-            *serverspp = serversp = NULL;
+            cm_FreeServerList(serverspp, 0);
+            serverspp = NULL;
+            serversp = NULL;
             free_svr_list = 0;
         }
         retry = 1;
@@ -434,6 +436,10 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
         char addr[16];
         char *format;
        DWORD msgID;
+
+        /* In case of timeout */
+        reqp->volumeError = errorCode;
+
         switch ( errorCode ) {
         case VNOVOL:
            msgID = MSG_SERVER_REPORTS_VNOVOL;
@@ -473,11 +479,14 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
             LogEvent(EVENTLOG_WARNING_TYPE, msgID, addr, fidp->volume);
         }
 
-        /* Mark server offline for this volume */
+        /* 
+         * Mark server offline for this volume or delete the volume
+         * from the server list if it was moved or is not present.
+         */
         if (!serversp && fidp) {
             code = cm_GetServerList(fidp, userp, reqp, &serverspp);
             if (code == 0) {
-                serversp = *serverspp = NULL;
+                serversp = *serverspp;
                 free_svr_list = 1;
             }
         }
@@ -486,23 +495,39 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
         for (tsrp = serversp; tsrp; tsrp=tsrp->next) {
             if (tsrp->status == srv_deleted)
                 continue;
+
+            sprintf(addr, "%d.%d.%d.%d",
+                     ((tsrp->server->addr.sin_addr.s_addr & 0xff)),
+                     ((tsrp->server->addr.sin_addr.s_addr & 0xff00)>> 8),
+                     ((tsrp->server->addr.sin_addr.s_addr & 0xff0000)>> 16),
+                     ((tsrp->server->addr.sin_addr.s_addr & 0xff000000)>> 24)); 
+
             if (tsrp->server == serverp) {
                 /* REDIRECT */
                 if (errorCode == VMOVED || errorCode == VNOVOL) {
+                    osi_Log2(afsd_logp, "volume %d not present on server %s", 
+                             fidp->volume, osi_LogSaveString(afsd_logp,addr));
                     tsrp->status = srv_deleted;
                     if (fidp)
                         cm_RemoveVolumeFromServer(serverp, fidp->volume);
                 } else {
+                    osi_Log2(afsd_logp, "volume %d instance on server %s marked offline", 
+                             fidp->volume, osi_LogSaveString(afsd_logp,addr));
                     tsrp->status = srv_offline;
                 }
+                /* break; */
+            } else {
+                osi_Log3(afsd_logp, "volume %d exists on server %s with status %u", 
+                         fidp->volume, osi_LogSaveString(afsd_logp,addr), tsrp->status);
             }
         }   
         lock_ReleaseWrite(&cm_serverLock);
 
         /* Free the server list before cm_ForceUpdateVolume is called */
         if (free_svr_list) {
-            cm_FreeServerList(&serversp, 0);
-            *serverspp = serversp = NULL;
+            cm_FreeServerList(serverspp, 0);
+            serverspp = NULL;
+            serversp = NULL;
             free_svr_list = 0;
         }
 
@@ -513,10 +538,14 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
             if (code == 0)
                 statep = cm_VolumeStateByID(volp, fidp->volume);
 
-            if (errorCode == VMOVED || errorCode == VNOVOL) {
+            if ((errorCode == VMOVED || errorCode == VNOVOL) &&
+                !(reqp->flags & CM_REQ_VOLUME_UPDATED)) 
+            {
                 code = cm_ForceUpdateVolume(fidp, userp, reqp);
                 if (code) 
                     timeLeft = 0;   /* prevent a retry on failure */
+                else
+                    reqp->flags |= CM_REQ_VOLUME_UPDATED;
                 osi_Log3(afsd_logp, "cm_Analyze called cm_ForceUpdateVolume cell %u vol %u code 0x%x",
                         fidp->cell, fidp->volume, code);
             }
@@ -799,6 +828,16 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp,
     if (connp)
         cm_PutConn(connp);
 
+    /* 
+     * clear the volume updated flag if we succeed.
+     * this way the flag will not prevent a subsequent volume 
+     * from being updated if necessary.
+     */
+    if (errorCode == 0)
+    {
+        reqp->flags &= ~CM_REQ_VOLUME_UPDATED;
+    }
+
     /* retry until we fail to find a connection */
     return retry;
 }
index 68b464ce28b0636929d3a67172e90fbbde91a21e..d5f41e279070c374f89eade4b1a299089a4a9b7c 100644 (file)
@@ -55,6 +55,7 @@ typedef struct cm_req {
 #define        CM_REQ_NORETRY          0x1
 #define CM_REQ_NEW_CONN_FORCED  0x2
 #define CM_REQ_SOURCE_SMB       0x4
+#define CM_REQ_VOLUME_UPDATED   0x8
 
 /*
  * Vice2 error codes
index 5f8fe841bd4cb87ce48b5726242d98f2d9b0de71..e107e16640c1c9d93f9ca92c1dd069c9e4ffdded 100644 (file)
@@ -1283,6 +1283,7 @@ cm_UpdateVolumeStatusInt(cm_volume_t *volp, struct cm_vol_state *statep)
     cm_serverRef_t *tsrp;
     cm_server_t *tsp;
     int someBusy = 0, someOffline = 0, allOffline = 1, allBusy = 1, allDown = 1;
+    char addr[16];
 
     if (!volp || !statep) {
 #ifdef DEBUG
@@ -1293,29 +1294,50 @@ cm_UpdateVolumeStatusInt(cm_volume_t *volp, struct cm_vol_state *statep)
 
     lock_ObtainWrite(&cm_serverLock);
     for (tsrp = statep->serversp; tsrp; tsrp=tsrp->next) {
-        if (tsrp->status == srv_deleted)
-            continue;
         tsp = tsrp->server;
+        sprintf(addr, "%d.%d.%d.%d", 
+                 ((tsp->addr.sin_addr.s_addr & 0xff)),
+                 ((tsp->addr.sin_addr.s_addr & 0xff00)>> 8),
+                 ((tsp->addr.sin_addr.s_addr & 0xff0000)>> 16),
+                 ((tsp->addr.sin_addr.s_addr & 0xff000000)>> 24)); 
+
+        if (tsrp->status == srv_deleted) {
+            osi_Log2(afsd_logp, "cm_UpdateVolumeStatusInt volume %d server reference %s deleted", 
+                     statep->ID, osi_LogSaveString(afsd_logp,addr));
+            continue;
+        }
         if (tsp) {
             cm_GetServerNoLock(tsp);
             if (!(tsp->flags & CM_SERVERFLAG_DOWN)) {
                 allDown = 0;
                 if (tsrp->status == srv_busy) {
+                    osi_Log2(afsd_logp, "cm_UpdateVolumeStatusInt volume %d server reference %s busy", 
+                              statep->ID, osi_LogSaveString(afsd_logp,addr));
                     allOffline = 0;
                     someBusy = 1;
                 } else if (tsrp->status == srv_offline) {
+                    osi_Log2(afsd_logp, "cm_UpdateVolumeStatusInt volume %d server reference %s offline", 
+                              statep->ID, osi_LogSaveString(afsd_logp,addr));
                     allBusy = 0;
                     someOffline = 1;
                 } else {
+                    osi_Log2(afsd_logp, "cm_UpdateVolumeStatusInt volume %d server reference %s online", 
+                              statep->ID, osi_LogSaveString(afsd_logp,addr));
                     allOffline = 0;
                     allBusy = 0;
                 }
+            } else {
+                osi_Log2(afsd_logp, "cm_UpdateVolumeStatusInt volume %d server reference %s online", 
+                          statep->ID, osi_LogSaveString(afsd_logp,addr));
             }
             cm_PutServerNoLock(tsp);
         }
     }   
     lock_ReleaseWrite(&cm_serverLock);
 
+    osi_Log5(afsd_logp, "cm_UpdateVolumeStatusInt allDown %d allBusy %d someBusy %d someOffline %d allOffline %d", 
+             allDown, allBusy, someBusy, someOffline, allOffline);
+
     if (allDown)
        newStatus = vl_alldown;
     else if (allBusy || (someBusy && someOffline))