STABLE14-client-idledeadtime-support-20080430

author Derrick Brashear <shadow@dementia.org>

Sun, 29 Jun 2008 04:26:03 +0000 (04:26 +0000)

committer Derrick Brashear <shadow@dementia.org>

Sun, 29 Jun 2008 04:26:03 +0000 (04:26 +0000)
author Derrick Brashear <shadow@dementia.org>
Sun, 29 Jun 2008 04:26:03 +0000 (04:26 +0000)
committer Derrick Brashear <shadow@dementia.org>
Sun, 29 Jun 2008 04:26:03 +0000 (04:26 +0000)
diff --git a/src/afs/afs.h b/src/afs/afs.h

index 1eb146af3ff974540336bdba69852a83197b94a7..9df1f35fee028ec8221359e9e03bb6a0564a322d 100644 (file)
--- a/src/afs/afs.h
+++ b/src/afs/afs.h
@@ -86,11 +86,13 @@ extern int afs_shuttingdown;
  #define        AFS_NRXPACKETS  80
  #define        AFS_RXDEADTIME  50
  #define AFS_HARDDEADTIME       120
+#define        AFS_IDLEDEADTIME        50
  #define AFS_BLKBITS    12
  #define AFS_BLKSIZE    (1 << AFS_BLKBITS)
  
  extern afs_int32 afs_rx_deadtime;
  extern afs_int32 afs_rx_harddead;
+extern afs_int32 afs_rx_idledead;
  
  struct sysname_info {
      char *name;
@@ -186,6 +188,9 @@ struct vrequest {
      char volumeError;          /* encountered a missing or busy volume */
      char networkError;         /* encountered network problems */
      char permWriteError;       /* fileserver returns permenent error. */
+    char tokenError;            /* a token error other than expired. */
+    char idleError;             /* the server idled too long */
+    char skipserver[MAXHOSTS];
  };
  #define VOLMISSING 1
  #define VOLBUSY 2
diff --git a/src/afs/afs_analyze.c b/src/afs/afs_analyze.c

index 486907fa3bfe8fa0b4ae943bd7a7acba16ae6a11..4e45671a54ce399ae5bb1519506ca9425b3b8fc7 100644 (file)
--- a/src/afs/afs_analyze.c
+++ b/src/afs/afs_analyze.c
@@ -215,10 +215,19 @@ et_to_sys_error(afs_int32 in)
  void
  afs_CopyError(register struct vrequest *afrom, register struct vrequest *ato)
  {
+    int i = 0;
      AFS_STATCNT(afs_CopyError);
      if (!afrom->initd)
         return;
      afs_FinalizeReq(ato);
+    while (i < MAXHOSTS) {
+       ato->skipserver[i] = afrom->skipserver[i];
+       i++;
+    }
+    if (afrom->tokenError)
+       ato->tokenError = afrom->tokenError;
+    if (afrom->idleError)
+       ato->idleError = afrom->idleError;
      if (afrom->accessError)
         ato->accessError = 1;
      if (afrom->volumeError)
@@ -233,10 +242,17 @@ afs_CopyError(register struct vrequest *afrom, register struct vrequest *ato)
  void
  afs_FinalizeReq(register struct vrequest *areq)
  {
+    int i = 0;
      AFS_STATCNT(afs_FinalizeReq);
      if (areq->initd)
         return;
+    while (i < MAXHOSTS) {
+       areq->skipserver[i] = 0;
+       i++;
+    }
      areq->busyCount = 0;
+    areq->idleError = 0;
+    areq->tokenError = 0;
      areq->accessError = 0;
      areq->volumeError = 0;
      areq->networkError = 0;
@@ -425,6 +441,66 @@ VLDB_Same(struct VenusFid *afid, struct vrequest *areq)
      return (changed ? DIFFERENT : SAME);
  }                              /*VLDB_Same */
  
+/*------------------------------------------------------------------------
+ * afs_BlackListOnce
+ *
+ * Description:
+ *     Mark a server as invalid for further attempts of this request only.
+ *     Sites of the same volume that are flagged down are skipped as well.
+ *
+ * Arguments:
+ *     areq  : The request record associated with this operation.
+ *     afid  : The FID of the file involved in the action.  This argument
+ *             may be null if none was involved.
+ *     tsp   : Pointer to the server struct for the server we wish to
+ *             blacklist.
+ *
+ * Returns:
+ *     Non-zero value if further servers are available to try,
+ *     zero otherwise.
+ *
+ * Environment:
+ *     This routine is typically called in situations where we believe
+ *     one server out of a pool may have an error condition.
+ *
+ * Side Effects:
+ *     Sets entries of areq->skipserver[].
+ *
+ * NOTE:
+ *     The afs_Conn* routines use the list of invalidated servers to
+ *     avoid reusing a server marked as invalid for this request.
+ *------------------------------------------------------------------------*/
+static afs_int32
+afs_BlackListOnce(struct vrequest *areq, struct VenusFid *afid,
+                 struct server *tsp)
+{
+    struct volume *tvp = NULL; /* stays NULL when afid is null */
+    afs_int32 i;
+    afs_int32 serversleft = 0;
+
+    if (afid)
+       tvp = afs_FindVolume(afid, READ_LOCK);
+    if (tvp) {
+       for (i = 0; i < MAXHOSTS; i++) {
+           if (!tvp->serverHost[i])
+               continue;       /* empty slot: nothing to try here */
+           if (tvp->serverHost[i] == tsp ||
+               (tvp->serverHost[i]->addr->sa_flags & SRVR_ISDOWN))
+               areq->skipserver[i] = 1;
+           if (areq->skipserver[i] == 0)
+               serversleft = 1;
+       }
+       afs_PutVolume(tvp, READ_LOCK);
+       return serversleft;
+    }
+    /* No volume to consult: report any slot not yet marked as skipped. */
+    for (i = 0; i < MAXHOSTS; i++) {
+       if (areq->skipserver[i] == 0)
+           return 1;
+    }
+    return 0;
+}
+
  
  /*------------------------------------------------------------------------
   * EXPORTED afs_Analyze
@@ -468,7 +544,9 @@ afs_Analyze(register struct conn *aconn, afs_int32 acode,
      struct server *tsp;
      struct volume *tvp;
      afs_int32 shouldRetry = 0;
+    afs_int32 serversleft = 1;
      struct afs_stats_RPCErrors *aerrP;
+    afs_int32 markeddown;
  
      AFS_STATCNT(afs_Analyze);
      afs_Trace4(afs_iclSetp, CM_TRACE_ANALYZE, ICL_TYPE_INT32, op,
@@ -592,10 +670,33 @@ afs_Analyze(register struct conn *aconn, afs_int32 acode,
         acode = 455;
  #endif /* AFS_64BIT_CLIENT */
      if ((acode < 0) && (acode != VRESTARTING)) {
-       afs_ServerDown(sa);
-       ForceNewConnections(sa);        /*multi homed clients lock:afs_xsrvAddr? */
+       if (acode == RX_CALL_TIMEOUT) {
+           serversleft = afs_BlackListOnce(areq, afid, tsp);
+           areq->idleError++;
+           if (serversleft) {
+               shouldRetry = 1;
+           } else {
+               shouldRetry = 0;
+           }
+           /* By doing this, we avoid ever marking a server down
+            * in an idle timeout case. That's because the server is 
+            * still responding and may only be letting a single vnode
+            * time out. We otherwise risk having the server continually
+            * be marked down, then up, then down again... 
+            */
+           goto out;
+       } 
+       markeddown = afs_ServerDown(sa);
+       ForceNewConnections(sa); /**multi homed clients lock:afs_xsrvAddr? */
         if (aerrP)
             (aerrP->err_Server)++;
+#if 0
+       /* retry *once* when the server is timed out in case of NAT */
+       if (markeddown && acode == RX_CALL_DEAD) {
+           aconn->forceConnectFS = 1;
+           shouldRetry = 1;
+       }
+#endif
      }
  
      if (acode == VBUSY || acode == VRESTARTING) {
@@ -626,7 +727,6 @@ afs_Analyze(register struct conn *aconn, afs_int32 acode,
                || (acode & ~0xff) == ERROR_TABLE_BASE_RXK) {
         /* any rxkad error is treated as token expiration */
         struct unixuser *tu;
-
         /*
          * I'm calling these errors protection errors, since they involve
          * faulty authentication.
@@ -645,11 +745,22 @@ afs_Analyze(register struct conn *aconn, afs_int32 acode,
                     ("afs: Tokens for user of AFS id %d for cell %s have expired\n",
                      tu->vid, aconn->srvr->server->cell->cellName);
             } else {
-               aconn->forceConnectFS = 0;      /* don't check until new tokens set */
-               aconn->user->states |= UTokensBad;
-               afs_warnuser
-                   ("afs: Tokens for user of AFS id %d for cell %s are discarded (rxkad error=%d)\n",
-                    tu->vid, aconn->srvr->server->cell->cellName, acode);
+               serversleft = afs_BlackListOnce(areq, afid, tsp);
+               areq->tokenError++;
+
+               if (serversleft) {
+                   afs_warnuser
+                       ("afs: Tokens for user of AFS id %d for cell %s: rxkad error=%d\n",
+                        tu->vid, aconn->srvr->server->cell->cellName, acode);
+                   shouldRetry = 1;
+               } else {
+                   areq->tokenError = 0;
+                   aconn->forceConnectFS = 0;  /* don't check until new tokens set */
+                   aconn->user->states |= UTokensBad;
+                   afs_warnuser
+                       ("afs: Tokens for user of AFS id %d for cell %s are discarded (rxkad error=%d)\n",
+                        tu->vid, aconn->srvr->server->cell->cellName, acode);
+               }
             }
             afs_PutUser(tu, READ_LOCK);
         } else {
@@ -745,7 +856,7 @@ afs_Analyze(register struct conn *aconn, afs_int32 acode,
         VSleep(1);              /* Just a hack for desperate times. */
         shouldRetry = 1;
      }
-
+out:
      /* now unlock the connection and return */
      afs_PutConn(aconn, locktype);
      return (shouldRetry);
diff --git a/src/afs/afs_call.c b/src/afs/afs_call.c

index 12be70e95897dae89e5a25094c08675a75b62603..57b1b8a9e15cc1f517d42119ead427dc182f9ada 100644 (file)
--- a/src/afs/afs_call.c
+++ b/src/afs/afs_call.c
@@ -110,6 +110,7 @@ char afs_cachebasedir[1024];
  
  afs_int32 afs_rx_deadtime = AFS_RXDEADTIME;
  afs_int32 afs_rx_harddead = AFS_HARDDEADTIME;
+afs_int32 afs_rx_idledead = AFS_IDLEDEADTIME;
  
  static int
    Afscall_icl(long opcode, long p1, long p2, long p3, long p4, long *retval);
diff --git a/src/afs/afs_conn.c b/src/afs/afs_conn.c

index c883d7213f53cc6e7185d6159669f2afbdef4d8d..bae3f01354ca347d4b23ab0126df5e8c72684b87 100644 (file)
--- a/src/afs/afs_conn.c
+++ b/src/afs/afs_conn.c
@@ -83,7 +83,9 @@ afs_Conn(register struct VenusFid *afid, register struct vrequest *areq,
  
      /* First is always lowest rank, if it's up */
      if ((tv->status[0] == not_busy) && tv->serverHost[0]
-       && !(tv->serverHost[0]->addr->sa_flags & SRVR_ISDOWN))
+       && !(tv->serverHost[0]->addr->sa_flags & SRVR_ISDOWN) &&
+       !(((areq->idleError > 0) || (areq->tokenError > 0))
+         && (areq->skipserver[0] == 1)))
         lowp = tv->serverHost[0]->addr;
  
      /* Otherwise we look at all of them. There are seven levels of
@@ -95,6 +97,9 @@ afs_Conn(register struct VenusFid *afid, register struct vrequest *areq,
       */
      for (notbusy = not_busy; (!lowp && (notbusy <= end_not_busy)); notbusy++) {
         for (i = 0; i < MAXHOSTS && tv->serverHost[i]; i++) {
+           if (((areq->tokenError > 0)||(areq->idleError > 0)) 
+               && (areq->skipserver[i] == 1))
+               continue;
             if (tv->status[i] != notbusy) {
                 if (tv->status[i] == rd_busy || tv->status[i] == rdwr_busy) {
                     if (!areq->busyCount)
@@ -234,6 +239,7 @@ afs_ConnBySA(struct srvAddr *sap, unsigned short aport, afs_int32 acell,
         if (service == 52) {
             rx_SetConnHardDeadTime(tc->id, afs_rx_harddead);
         }
+       rx_SetConnIdleDeadTime(tc->id, afs_rx_idledead);
  
         tc->forceConnectFS = 0; /* apparently we're appropriately connected now */
         if (csec)
diff --git a/src/afs/afs_prototypes.h b/src/afs/afs_prototypes.h

index ac2c2fe753407dcc8879cdc3d059bcd0c0695322..f83484bb1f10450a4e9df8bc46a158234d5caaa1 100644 (file)
--- a/src/afs/afs_prototypes.h
+++ b/src/afs/afs_prototypes.h
@@ -700,7 +700,7 @@ extern struct server *afs_GetServer(afs_uint32 * aserver, afs_int32 nservers,
                                     afs_int32 addr_uniquifier);
  extern void ForceAllNewConnections(void);
  extern void afs_MarkServerUpOrDown(struct srvAddr *sa, int a_isDown);
-extern void afs_ServerDown(struct srvAddr *sa);
+extern afs_int32 afs_ServerDown(struct srvAddr *sa);
  extern void afs_CountServers(void);
  extern void afs_CheckServers(int adown, struct cell *acellp);
  extern unsigned int afs_random(void);
diff --git a/src/afs/afs_server.c b/src/afs/afs_server.c

index fc0355c6b47c305f6eb164997f8d2c15fa75a9e7..f277d55405c5c8a085e5535c6c348f4320cefad9 100644 (file)
--- a/src/afs/afs_server.c
+++ b/src/afs/afs_server.c
@@ -239,14 +239,14 @@ afs_MarkServerUpOrDown(struct srvAddr *sa, int a_isDown)
  }                              /*MarkServerUpOrDown */
  
  
-void
+afs_int32
  afs_ServerDown(struct srvAddr *sa)
  {
      register struct server *aserver = sa->server;
  
      AFS_STATCNT(ServerDown);
-    if (aserver->flags & SRVR_ISDOWN || sa->sa_flags & SRVADDR_ISDOWN)
-       return;
+    if (aserver->flags & SRVR_ISDOWN || sa->sa_flags & SRVADDR_ISDOWN) 
+       return 0;
      afs_MarkServerUpOrDown(sa, SRVR_ISDOWN);
      if (sa->sa_portal == aserver->cell->vlport)
         print_internet_address
@@ -254,7 +254,7 @@ afs_ServerDown(struct srvAddr *sa)
      else
         print_internet_address("afs: Lost contact with file server ", sa, "",
                                1);
-
+    return 1;
  }                              /*ServerDown */
author	Derrick Brashear <shadow@dementia.org>
	Sun, 29 Jun 2008 04:26:03 +0000 (04:26 +0000)
committer	Derrick Brashear <shadow@dementia.org>
	Sun, 29 Jun 2008 04:26:03 +0000 (04:26 +0000)
src/afs/afs.h		patch \| blob \| history
src/afs/afs_analyze.c		patch \| blob \| history
src/afs/afs_call.c		patch \| blob \| history
src/afs/afs_conn.c		patch \| blob \| history
src/afs/afs_prototypes.h		patch \| blob \| history
src/afs/afs_server.c		patch \| blob \| history