From 4de3c373c0389f5548a08ded232d6c847ec149e9 Mon Sep 17 00:00:00 2001 From: Derrick Brashear Date: Wed, 30 Apr 2008 20:08:04 +0000 Subject: [PATCH] DEVEL15-client-idledeadtime-support-20080430 LICENSE IPL10 in the event a server is "melting down" and not responding with data to RPCs this adds the ability to time out the calls rather than letting keepalives effectively hang the client forever. does not mark the server down since... it's not down. Side effect: adds similar tracking for authentication errors so tokens need not be discarded if one server's clock has drifted. (cherry picked from commit 0a2c144307d2471450c0660e141906a1f7b1ea36) --- src/afs/afs.h | 5 ++ src/afs/afs_analyze.c | 113 +++++++++++++++++++++++++++++++++++---- src/afs/afs_call.c | 1 + src/afs/afs_conn.c | 8 ++- src/afs/afs_error.c | 16 ++++++ src/afs/afs_prototypes.h | 2 +- src/afs/afs_server.c | 8 +-- 7 files changed, 138 insertions(+), 15 deletions(-) diff --git a/src/afs/afs.h b/src/afs/afs.h index 6cde840f7..b3f41c246 100644 --- a/src/afs/afs.h +++ b/src/afs/afs.h @@ -86,11 +86,13 @@ extern int afs_shuttingdown; #define AFS_NRXPACKETS 80 #define AFS_RXDEADTIME 50 #define AFS_HARDDEADTIME 120 +#define AFS_IDLEDEADTIME 50 #define AFS_BLKBITS 12 #define AFS_BLKSIZE (1 << AFS_BLKBITS) extern afs_int32 afs_rx_deadtime; extern afs_int32 afs_rx_harddead; +extern afs_int32 afs_rx_idledead; struct sysname_info { char *name; @@ -186,6 +188,9 @@ struct vrequest { char volumeError; /* encountered a missing or busy volume */ char networkError; /* encountered network problems */ char permWriteError; /* fileserver returns permenent error. */ + char tokenError; /* a token error other than expired. 
*/ + char idleError; /* the server idled too long */ + char skipserver[MAXHOSTS]; }; #define VOLMISSING 1 #define VOLBUSY 2 diff --git a/src/afs/afs_analyze.c b/src/afs/afs_analyze.c index c03fdeee8..6ee02e189 100644 --- a/src/afs/afs_analyze.c +++ b/src/afs/afs_analyze.c @@ -211,6 +211,66 @@ VLDB_Same(struct VenusFid *afid, struct vrequest *areq) return (changed ? DIFFERENT : SAME); } /*VLDB_Same */ +/*------------------------------------------------------------------------ + * afs_BlackListOnce + * + * Description: + * Mark a server as invalid for further attempts of this request only. + * + * Arguments: + * areq : The request record associated with this operation. + * afid : The FID of the file involved in the action. This argument + * may be null if none was involved. + * tsp : pointer to a server struct for the server we wish to + * blacklist. + * + * Returns: + * Non-zero value if further servers are available to try, + * zero otherwise. + * + * Environment: + * This routine is typically called in situations where we believe + * one server out of a pool may have an error condition. + * + * Side Effects: + * As advertised. + * + * NOTE: + * The afs_Conn* routines use the list of invalidated servers to + * avoid reusing a server marked as invalid for this request. 
+ *------------------------------------------------------------------------*/
+static afs_int32
+afs_BlackListOnce(struct vrequest *areq, struct VenusFid *afid,
+                  struct server *tsp)
+{
+    struct volume *tvp = NULL;
+    afs_int32 i;
+    afs_int32 serversleft = 0;
+
+    /*
+     * No FID (or no volume for it) means there is no server list to
+     * walk, so tvp must start out NULL and we report nothing left.
+     */
+    if (afid)
+        tvp = afs_FindVolume(afid, READ_LOCK);
+    if (tvp) {
+        for (i = 0; i < MAXHOSTS; i++) {
+            if (!tvp->serverHost[i])
+                continue;       /* empty slot: never a candidate */
+            /* Skip the failing server and every server already down. */
+            if (tvp->serverHost[i] == tsp ||
+                (tvp->serverHost[i]->addr->sa_flags & SRVR_ISDOWN)) {
+                areq->skipserver[i] = 1;
+            }
+            /* A populated, unskipped slot is a server worth retrying. */
+            if (areq->skipserver[i] == 0)
+                serversleft = 1;
+        }
+        afs_PutVolume(tvp, READ_LOCK);
+    }
+    return serversleft;
+}
+

 /*------------------------------------------------------------------------
  * EXPORTED afs_Analyze
@@ -254,7 +314,9 @@ afs_Analyze(register struct conn *aconn, afs_int32 acode,
     struct server *tsp;
     struct volume *tvp;
     afs_int32 shouldRetry = 0;
+    afs_int32 serversleft = 1;
     struct afs_stats_RPCErrors *aerrP;
+    afs_int32 markeddown;

     AFS_STATCNT(afs_Analyze);
     afs_Trace4(afs_iclSetp, CM_TRACE_ANALYZE, ICL_TYPE_INT32, op,
@@ -378,10 +440,33 @@ afs_Analyze(register struct conn *aconn, afs_int32 acode,
             acode = 455;
 #endif /* AFS_64BIT_CLIENT */
         if ((acode < 0) && (acode != VRESTARTING)) {
-            afs_ServerDown(sa);
-            ForceNewConnections(sa); /*multi homed clients lock:afs_xsrvAddr? */
+            if (acode == RX_CALL_TIMEOUT) {
+                serversleft = afs_BlackListOnce(areq, afid, tsp);
+                areq->idleError++;
+                if (serversleft) {
+                    shouldRetry = 1;
+                } else {
+                    shouldRetry = 0;
+                }
+                /* By doing this, we avoid ever marking a server down
+                 * in an idle timeout case. That's because the server is
+                 * still responding and may only be letting a single vnode
+                 * time out. We otherwise risk having the server continually
+                 * be marked down, then up, then down again...
+                 */
+                goto out;
+            }
+            markeddown = afs_ServerDown(sa);
+            ForceNewConnections(sa); /**multi homed clients lock:afs_xsrvAddr?
*/ if (aerrP) (aerrP->err_Server)++; +#if 0 + /* retry *once* when the server is timed out in case of NAT */ + if (markeddown && acode == RX_CALL_DEAD) { + aconn->forceConnectFS = 1; + shouldRetry = 1; + } +#endif } if (acode == VBUSY || acode == VRESTARTING) { @@ -412,7 +497,6 @@ afs_Analyze(register struct conn *aconn, afs_int32 acode, || (acode & ~0xff) == ERROR_TABLE_BASE_RXK) { /* any rxkad error is treated as token expiration */ struct unixuser *tu; - /* * I'm calling these errors protection errors, since they involve * faulty authentication. @@ -431,11 +515,22 @@ afs_Analyze(register struct conn *aconn, afs_int32 acode, ("afs: Tokens for user of AFS id %d for cell %s have expired\n", tu->vid, aconn->srvr->server->cell->cellName); } else { - aconn->forceConnectFS = 0; /* don't check until new tokens set */ - aconn->user->states |= UTokensBad; - afs_warnuser - ("afs: Tokens for user of AFS id %d for cell %s are discarded (rxkad error=%d)\n", - tu->vid, aconn->srvr->server->cell->cellName, acode); + serversleft = afs_BlackListOnce(areq, afid, tsp); + areq->tokenError++; + + if (serversleft) { + afs_warnuser + ("afs: Tokens for user of AFS id %d for cell %s: rxkad error=%d\n", + tu->vid, aconn->srvr->server->cell->cellName, acode); + shouldRetry = 1; + } else { + areq->tokenError = 0; + aconn->forceConnectFS = 0; /* don't check until new tokens set */ + aconn->user->states |= UTokensBad; + afs_warnuser + ("afs: Tokens for user of AFS id %d for cell %s are discarded (rxkad error=%d)\n", + tu->vid, aconn->srvr->server->cell->cellName, acode); + } } afs_PutUser(tu, READ_LOCK); } else { @@ -531,7 +626,7 @@ afs_Analyze(register struct conn *aconn, afs_int32 acode, VSleep(1); /* Just a hack for desperate times. 
*/ shouldRetry = 1; } - +out: /* now unlock the connection and return */ afs_PutConn(aconn, locktype); return (shouldRetry); diff --git a/src/afs/afs_call.c b/src/afs/afs_call.c index 4246bacfb..5ebce8e99 100644 --- a/src/afs/afs_call.c +++ b/src/afs/afs_call.c @@ -72,6 +72,7 @@ char afs_cachebasedir[1024]; afs_int32 afs_rx_deadtime = AFS_RXDEADTIME; afs_int32 afs_rx_harddead = AFS_HARDDEADTIME; +afs_int32 afs_rx_idledead = AFS_IDLEDEADTIME; static int afscall_set_rxpck_received = 0; diff --git a/src/afs/afs_conn.c b/src/afs/afs_conn.c index c883d7213..bae3f0135 100644 --- a/src/afs/afs_conn.c +++ b/src/afs/afs_conn.c @@ -83,7 +83,9 @@ afs_Conn(register struct VenusFid *afid, register struct vrequest *areq, /* First is always lowest rank, if it's up */ if ((tv->status[0] == not_busy) && tv->serverHost[0] - && !(tv->serverHost[0]->addr->sa_flags & SRVR_ISDOWN)) + && !(tv->serverHost[0]->addr->sa_flags & SRVR_ISDOWN) && + !(((areq->idleError > 0) || (areq->tokenError > 0)) + && (areq->skipserver[0] == 1))) lowp = tv->serverHost[0]->addr; /* Otherwise we look at all of them. 
There are seven levels of @@ -95,6 +97,9 @@ afs_Conn(register struct VenusFid *afid, register struct vrequest *areq, */ for (notbusy = not_busy; (!lowp && (notbusy <= end_not_busy)); notbusy++) { for (i = 0; i < MAXHOSTS && tv->serverHost[i]; i++) { + if (((areq->tokenError > 0)||(areq->idleError > 0)) + && (areq->skipserver[i] == 1)) + continue; if (tv->status[i] != notbusy) { if (tv->status[i] == rd_busy || tv->status[i] == rdwr_busy) { if (!areq->busyCount) @@ -234,6 +239,7 @@ afs_ConnBySA(struct srvAddr *sap, unsigned short aport, afs_int32 acell, if (service == 52) { rx_SetConnHardDeadTime(tc->id, afs_rx_harddead); } + rx_SetConnIdleDeadTime(tc->id, afs_rx_idledead); tc->forceConnectFS = 0; /* apparently we're appropriately connected now */ if (csec) diff --git a/src/afs/afs_error.c b/src/afs/afs_error.c index 35c92b5f8..626c1bbd4 100644 --- a/src/afs/afs_error.c +++ b/src/afs/afs_error.c @@ -209,10 +209,19 @@ et_to_sys_error(afs_int32 in) void afs_CopyError(register struct vrequest *afrom, register struct vrequest *ato) { + int i = 0; AFS_STATCNT(afs_CopyError); if (!afrom->initd) return; afs_FinalizeReq(ato); + while (i < MAXHOSTS) { + ato->skipserver[i] = afrom->skipserver[i]; + i++; + } + if (afrom->tokenError) + ato->tokenError = afrom->tokenError; + if (afrom->idleError) + ato->idleError = afrom->idleError; if (afrom->accessError) ato->accessError = 1; if (afrom->volumeError) @@ -227,10 +236,17 @@ afs_CopyError(register struct vrequest *afrom, register struct vrequest *ato) void afs_FinalizeReq(register struct vrequest *areq) { + int i = 0; AFS_STATCNT(afs_FinalizeReq); if (areq->initd) return; + while (i < MAXHOSTS) { + areq->skipserver[i] = 0; + i++; + } areq->busyCount = 0; + areq->idleError = 0; + areq->tokenError = 0; areq->accessError = 0; areq->volumeError = 0; areq->networkError = 0; diff --git a/src/afs/afs_prototypes.h b/src/afs/afs_prototypes.h index 182db7733..ba4d5e343 100644 --- a/src/afs/afs_prototypes.h +++ b/src/afs/afs_prototypes.h @@ 
-746,7 +746,7 @@ extern struct server *afs_GetServer(afs_uint32 * aserver, afs_int32 nservers, afs_int32 addr_uniquifier); extern void ForceAllNewConnections(void); extern void afs_MarkServerUpOrDown(struct srvAddr *sa, int a_isDown); -extern void afs_ServerDown(struct srvAddr *sa); +extern afs_int32 afs_ServerDown(struct srvAddr *sa); extern void afs_CountServers(void); extern void afs_CheckServers(int adown, struct cell *acellp); extern unsigned int afs_random(void); diff --git a/src/afs/afs_server.c b/src/afs/afs_server.c index 17d27ec29..abf485ea6 100644 --- a/src/afs/afs_server.c +++ b/src/afs/afs_server.c @@ -239,14 +239,14 @@ afs_MarkServerUpOrDown(struct srvAddr *sa, int a_isDown) } /*MarkServerUpOrDown */ -void +afs_int32 afs_ServerDown(struct srvAddr *sa) { register struct server *aserver = sa->server; AFS_STATCNT(ServerDown); - if (aserver->flags & SRVR_ISDOWN || sa->sa_flags & SRVADDR_ISDOWN) - return; + if (aserver->flags & SRVR_ISDOWN || sa->sa_flags & SRVADDR_ISDOWN) + return 0; afs_MarkServerUpOrDown(sa, SRVR_ISDOWN); if (sa->sa_portal == aserver->cell->vlport) print_internet_address @@ -254,7 +254,7 @@ afs_ServerDown(struct srvAddr *sa) else print_internet_address("afs: Lost contact with file server ", sa, "", 1); - + return 1; } /*ServerDown */ -- 2.39.5