From: Jeffrey Altman Date: Tue, 16 Jun 2009 05:59:22 +0000 (+0000) Subject: windows-conn-analyze-20090615 X-Git-Tag: openafs-devel-1_5_61~218 X-Git-Url: https://git.michaelhowe.org/gitweb/?a=commitdiff_plain;h=82d9807e2246997ac73930c91d1ad4312084cc57;p=packages%2Fo%2Fopenafs.git windows-conn-analyze-20090615 LICENSE MIT When processing RX_CALL_TIMEOUT, if there is no fid specified, it's a vldb rpc and we should retry. If there is a fid and the volume is a readonly volume, then we should also retry. Only fail if the request has no alternate server to look at. When processing RX_CALL_DEAD, log the fact that the call is dead. When processing any other RX error between -2 and -64, do not force a new connection to the same server. Mark the server down and retry with a new server if possible. --- diff --git a/src/WINNT/afsd/cm_conn.c b/src/WINNT/afsd/cm_conn.c index 4d28832fc..39b8a2029 100644 --- a/src/WINNT/afsd/cm_conn.c +++ b/src/WINNT/afsd/cm_conn.c @@ -612,7 +612,6 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, */ if (serverp) { - /* Log server being offline for this volume */ sprintf(addr, "%d.%d.%d.%d", ((serverp->addr.sin_addr.s_addr & 0xff)), ((serverp->addr.sin_addr.s_addr & 0xff00)>> 8), @@ -620,17 +619,46 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24)); LogEvent(EVENTLOG_WARNING_TYPE, MSG_RX_HARD_DEAD_TIME_EXCEEDED, addr); - osi_Log1(afsd_logp, "cm_Analyze: hardDeadTime or idleDeadtime exceeded addr[%s]", osi_LogSaveString(afsd_logp,addr)); reqp->tokenIdleErrorServp = serverp; reqp->idleError++; + + if (timeLeft > 2) { + if (!fidp) { /* vldb */ + retry = 1; + } else { /* file */ + cm_volume_t *volp = cm_GetVolumeByFID(fidp); + if (volp) { + if (fidp->volume == cm_GetROVolumeID(volp)) + retry = 1; + cm_PutVolume(volp); + } + } + } } } else if (errorCode >= -64 && errorCode < 0) { /* mark server as down */ + sprintf(addr, "%d.%d.%d.%d", + ((serverp->addr.sin_addr.s_addr & 
0xff)), + ((serverp->addr.sin_addr.s_addr & 0xff00)>> 8), + ((serverp->addr.sin_addr.s_addr & 0xff0000)>> 16), + ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24)); + + if (errorCode == RX_CALL_DEAD) + osi_Log2(afsd_logp, "cm_Analyze: Rx Call Dead addr[%s] forcedNew[%s]", + osi_LogSaveString(afsd_logp,addr), + (reqp->flags & CM_REQ_NEW_CONN_FORCED ? "yes" : "no")); + else + osi_Log3(afsd_logp, "cm_Analyze: Rx Misc Error[%d] addr[%s] forcedNew[%s]", + errorCode, + osi_LogSaveString(afsd_logp,addr), + (reqp->flags & CM_REQ_NEW_CONN_FORCED ? "yes" : "no")); + lock_ObtainMutex(&serverp->mx); - if (reqp->flags & CM_REQ_NEW_CONN_FORCED) { + if (errorCode != RX_CALL_DEAD || + (reqp->flags & CM_REQ_NEW_CONN_FORCED)) { if (!(serverp->flags & CM_SERVERFLAG_DOWN)) { serverp->flags |= CM_SERVERFLAG_DOWN; serverp->downTime = time(NULL); @@ -665,18 +693,18 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, retry = 1; } } else if (errorCode >= ERROR_TABLE_BASE_U && errorCode < ERROR_TABLE_BASE_U + 256) { - /* - * We received a ubik error. its possible that the server we are - * communicating with has a corrupted database or is partitioned - * from the rest of the servers and another server might be able - * to answer our query. Therefore, we will retry the request - * and force the use of another server. - */ - if (serverp) { - reqp->tokenIdleErrorServp = serverp; - reqp->tokenError = errorCode; - retry = 1; - } + /* + * We received a ubik error. its possible that the server we are + * communicating with has a corrupted database or is partitioned + * from the rest of the servers and another server might be able + * to answer our query. Therefore, we will retry the request + * and force the use of another server. + */ + if (serverp) { + reqp->tokenIdleErrorServp = serverp; + reqp->tokenError = errorCode; + retry = 1; + } } else if (errorCode == VICECONNBAD || errorCode == VICETOKENDEAD) { cm_ForceNewConnections(serverp); if ( timeLeft > 2 )