From 427b27a87fd34035f855ff8b1b9ec3ad618829c8 Mon Sep 17 00:00:00 2001 From: Jeffrey Altman Date: Sat, 19 Sep 2009 13:52:32 -0400 Subject: [PATCH] Windows: Do not mark server down due to Store / Fetch Data protocol error When performing a StoreData or FetchData operation there are several data validation checks performed to ensure that the lengths of data obtained with rx_Write and rx_Read are consistent with the RXAFS_FetchData and RXAFS_StoreData protocol operations. When an inconsistency is detected the cache manager terminates the call and returns an error to the caller which is passed to cm_Analyze(). The cache manager was returning -1 as the error code which is equivalent to RX_CALL_DEAD which in turn will result in the server being marked down. This commit makes the following changes: . add trace logging to permit monitoring this case . instead of returning -1 return either RX_PROTOCOL_ERROR or RX_EOF depending on the situation . in cm_Analyze do not mark a server as down for rx errors other than RX_CALL_DEAD. Instead, force a new connection and retry until the request timeout limit is reached. LICENSE MIT Reviewed-on: http://gerrit.openafs.org/470 Reviewed-by: Derrick Brashear Reviewed-by: Jeffrey Altman Tested-by: Jeffrey Altman --- src/WINNT/afsd/cm_conn.c | 16 ++++++++++------ src/WINNT/afsd/cm_dcache.c | 25 ++++++++++++++++--------- 2 files changed, 26 insertions(+), 15 deletions(-) diff --git a/src/WINNT/afsd/cm_conn.c b/src/WINNT/afsd/cm_conn.c index 5c6b76488..2939ba121 100644 --- a/src/WINNT/afsd/cm_conn.c +++ b/src/WINNT/afsd/cm_conn.c @@ -469,7 +469,6 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, else if (errorCode == VNOVOL || errorCode == VMOVED || errorCode == VOFFLINE || errorCode == VSALVAGE || errorCode == VNOSERVICE || errorCode == VIO) { - char addr[16]; char *format; DWORD msgID; @@ -676,11 +675,11 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, } else if (errorCode >= -64 && errorCode < 0) { /* mark server as down */ - sprintf(addr, "%d.%d.%d.%d", + sprintf(addr, "%d.%d.%d.%d", ((serverp->addr.sin_addr.s_addr & 0xff)), ((serverp->addr.sin_addr.s_addr & 0xff00)>> 8), ((serverp->addr.sin_addr.s_addr & 0xff0000)>> 16), - ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24)); + ((serverp->addr.sin_addr.s_addr & 0xff000000)>> 24)); if (errorCode == RX_CALL_DEAD) osi_Log2(afsd_logp, "cm_Analyze: Rx Call Dead addr[%s] forcedNew[%s]", @@ -693,15 +692,20 @@ cm_Analyze(cm_conn_t *connp, cm_user_t *userp, cm_req_t *reqp, (reqp->flags & CM_REQ_NEW_CONN_FORCED ? "yes" : "no")); lock_ObtainMutex(&serverp->mx); - if (errorCode != RX_CALL_DEAD || + if (errorCode == RX_CALL_DEAD && (reqp->flags & CM_REQ_NEW_CONN_FORCED)) { if (!(serverp->flags & CM_SERVERFLAG_DOWN)) { serverp->flags |= CM_SERVERFLAG_DOWN; serverp->downTime = time(NULL); } } else { - reqp->flags |= CM_REQ_NEW_CONN_FORCED; - forcing_new = 1; + if (reqp->flags & CM_REQ_NEW_CONN_FORCED) { + reqp->tokenIdleErrorServp = serverp; + reqp->tokenError = errorCode; + } else { + reqp->flags |= CM_REQ_NEW_CONN_FORCED; + forcing_new = 1; + } } lock_ReleaseMutex(&serverp->mx); cm_ForceNewConnections(serverp); diff --git a/src/WINNT/afsd/cm_dcache.c b/src/WINNT/afsd/cm_dcache.c index 2b223387e..ab5411e88 100644 --- a/src/WINNT/afsd/cm_dcache.c +++ b/src/WINNT/afsd/cm_dcache.c @@ -232,7 +232,7 @@ long cm_BufWrite(void *vscp, osi_hyper_t *offsetp, long length, long flags, temp = rx_Write(rxcallp, bufferp, wbytes); if (temp != wbytes) { osi_Log3(afsd_logp, "rx_Write failed bp 0x%p, %d != %d",bufp,temp,wbytes); - code = -1; + code = (rxcallp->error < 0) ? rxcallp->error : RX_PROTOCOL_ERROR; break; } else { osi_Log2(afsd_logp, "rx_Write succeeded bp 0x%p, %d",bufp,temp); @@ -1609,10 +1609,13 @@ long cm_GetBuffer(cm_scache_t *scp, cm_buf_t *bufp, int *cpffp, cm_user_t *userp if (temp == sizeof(afs_int32)) { nbytes = ntohl(nbytes); FillInt64(length_found, nbytes_hi, nbytes); - if (length_found > biod.length) - code = (rxcallp->error < 0) ? rxcallp->error : -1; + if (length_found > biod.length) { + osi_Log0(afsd_logp, "cm_GetBuffer length_found > biod.length"); + code = (rxcallp->error < 0) ? rxcallp->error : RX_PROTOCOL_ERROR; + } } else { - code = (rxcallp->error < 0) ? rxcallp->error : -1; + osi_Log1(afsd_logp, "cm_GetBuffer rx_Read32 returns %d != 4", temp); + code = (rxcallp->error < 0) ? rxcallp->error : RX_PROTOCOL_ERROR; } } /* for the moment, nbytes_hi will always be 0 if code == 0 @@ -1629,11 +1632,15 @@ long cm_GetBuffer(cm_scache_t *scp, cm_buf_t *bufp, int *cpffp, cm_user_t *userp temp = rx_Read32(rxcallp, &nbytes); if (temp == sizeof(afs_int32)) { nbytes = ntohl(nbytes); - if (nbytes > biod.length) - code = (rxcallp->error < 0) ? rxcallp->error : -1; + if (nbytes > biod.length) { + osi_Log0(afsd_logp, "cm_GetBuffer length_found > biod.length"); + code = (rxcallp->error < 0) ? rxcallp->error : RX_PROTOCOL_ERROR; + } + } + else { + osi_Log1(afsd_logp, "cm_GetBuffer rx_Read32 returns %d != 4", temp); + code = (rxcallp->error < 0) ? rxcallp->error : RX_PROTOCOL_ERROR; } - else - code = (rxcallp->error < 0) ? rxcallp->error : -1; } #endif @@ -1663,7 +1670,7 @@ long cm_GetBuffer(cm_scache_t *scp, cm_buf_t *bufp, int *cpffp, cm_user_t *userp rbytes = (nbytes > cm_data.buf_blockSize? cm_data.buf_blockSize : nbytes); temp = rx_Read(rxcallp, bufferp, rbytes); if (temp < rbytes) { - code = (rxcallp->error < 0) ? rxcallp->error : -1; + code = (rxcallp->error < 0) ? rxcallp->error : RX_EOF; break; } -- 2.39.5