dafs-20060317

author Tom Keiser <tkeiser@sinenomine.net>

Fri, 17 Mar 2006 19:54:26 +0000 (19:54 +0000)

committer Derrick Brashear <shadow@dementia.org>

Fri, 17 Mar 2006 19:54:26 +0000 (19:54 +0000)
author Tom Keiser <tkeiser@sinenomine.net>
Fri, 17 Mar 2006 19:54:26 +0000 (19:54 +0000)
committer Derrick Brashear <shadow@dementia.org>
Fri, 17 Mar 2006 19:54:26 +0000 (19:54 +0000)
diff --git a/Makefile.in b/Makefile.in

index 7e8033d60ade151fa9703982fb260c378c68c6ec..209d9b272bad620de87b874c3681a631dcac7de0 100644 (file)
--- a/Makefile.in
+++ b/Makefile.in
@@ -213,6 +213,24 @@ sgiefs:
  vol: cmd comerr dir afs sgiefs
         ${COMPILE_PART1} vol ${COMPILE_PART2}
  
+tsalvaged: vol libafsrpc libafsauthent cmd util
+       set -x; \
+       if test "@DEMAND_ATTACH@" = "yes" ; then \
+               case ${SYS_NAME} in \
+               alpha_dux*|sgi_*|sun*_5*|rs_aix*|*linux*|hp_ux11*|ia64_hpux*|*fbsd*|*nbsd2*) \
+                       ${COMPILE_PART1} tsalvaged ${COMPILE_PART2} ;; \
+               *_darwin_[1-6][0-9]) \
+                       echo Not building MT tsalvaged for ${SYS_NAME} ;; \
+               *_darwin_*) \
+                       ${COMPILE_PART1} tsalvaged  ${COMPILE_PART2} ;; \
+               *) \
+                       echo Not building MT tsalvaged for ${SYS_NAME} ;; \
+               esac \
+       else \
+               echo skipping tsalvaged ; \
+       fi
+
+
  vlserver: cmd comerr vol audit vlserver_depinstall
         ${COMPILE_PART1} vlserver ${COMPILE_PART2}
  
@@ -569,13 +587,13 @@ jafs: libjafs
  jafsadm: libjafsadm
  
  finale: project cmd comerr afsd butc tbutc @ENABLE_KERNEL_MODULE@ libuafs audit kauth log package \
-       ptserver scout bu_utils ubik uss bozo vfsck volser tvolser \
+       ptserver scout bu_utils ubik uss bozo vfsck volser tvolser tsalvaged \
         venus update xstat afsmonitor dauth rxdebug libafsrpc \
         libafsauthent shlibafsrpc shlibafsauthent libadmin login man-pages
         ${COMPILE_PART1} finale ${COMPILE_PART2}
  
  finale_nolibafs: project cmd comerr afsd butc tbutc libuafs audit kauth log package \
-       ptserver scout bu_utils ubik uss bozo vfsck volser tvolser \
+       ptserver scout bu_utils ubik uss bozo vfsck volser tvolser tsalvaged \
         venus update xstat afsmonitor dauth rxdebug libafsrpc \
         libafsauthent shlibafsrpc shlibafsauthent libadmin login man-pages
         ${COMPILE_PART1} finale ${COMPILE_PART2}
@@ -633,6 +651,7 @@ clean2:
         -${COMPILE_PART1} tviced ${COMPILE_CLEAN}
         -${COMPILE_PART1} volser ${COMPILE_CLEAN}
         -${COMPILE_PART1} tvolser ${COMPILE_CLEAN}
+       -${COMPILE_PART1} tsalvaged ${COMPILE_CLEAN}
         -${COMPILE_PART1} venus ${COMPILE_CLEAN}
         -${COMPILE_PART1} venus/test ${COMPILE_CLEAN}
         -${COMPILE_PART1} afsd ${COMPILE_CLEAN}
@@ -791,6 +810,7 @@ distclean: clean
         src/tests/Makefile \
         src/tests/run-tests \
         src/tests/OpenAFS/Dirpath.pm \
+       src/tsalvaged/Makefile \
         src/tsm41/Makefile \
         src/tviced/Makefile \
         src/tvolser/Makefile \
diff --git a/acinclude.m4 b/acinclude.m4

index c9b8417dd7bc9c3662573e8853dd97c52ddeea65..d33fec3f103e20274a01deaf47d25e547788263b 100644 (file)
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -33,6 +33,8 @@ AC_ARG_ENABLE( fast-restart,
  [  --enable-fast-restart               enable fast startup of file server without salvaging],, enable_fast_restart="no")
  AC_ARG_ENABLE( bitmap-later,
  [  --enable-bitmap-later               enable fast startup of file server by not reading bitmap till needed],, enable_bitmap_later="no")
+AC_ARG_ENABLE( demand-attach-fs,
+[  --enable-demand-attach-fs           enable Demand Attach Fileserver (please see documentation)],, enable_demand_attach_fs="no")
  AC_ARG_ENABLE( full-vos-listvol-switch,
  [  --disable-full-vos-listvol-switch    disable vos full listvol switch for formatted output],, enable_full_vos_listvol_switch="yes")
  AC_ARG_WITH(dux-kernel-headers,
@@ -948,6 +950,20 @@ if test "$enable_bitmap_later" = "yes"; then
         AC_DEFINE(BITMAP_LATER, 1, [define if you want to salvager to check bitmasks later])
  fi
  
+if test "$enable_demand_attach_fs" = "yes"; then
+       AC_DEFINE(DEMAND_ATTACH_ENABLE, 1, [define if you want the demand attach fileserver])
+       DEMAND_ATTACH="yes"
+else
+       DEMAND_ATTACH="no"
+fi
+AC_SUBST(DEMAND_ATTACH)
+
+if test "$enable_fast_restart" = "yes" &&
+   test "$enable_demand_attach_fs" = "yes" ; then
+       AC_MSG_ERROR([The Demand Attach and Fast Restart extensions are mutually exclusive.  Demand Attach fileservers automatically salvage volumes in the background, thereby making Fast Restart pointless.])
+       exit 1
+fi
+
  if test "$enable_full_vos_listvol_switch" = "yes"; then
         AC_DEFINE(FULL_LISTVOL_SWITCH, 1, [define if you want to want listvol switch])
  fi
diff --git a/configure.in b/configure.in

index e96a93be9b97fbbd02aff09885030992a63364ae..c20cce9f2c40eeee7daa4cf0bad69d32eff41614 100644 (file)
--- a/configure.in
+++ b/configure.in
@@ -106,6 +106,7 @@ src/tbutc/Makefile \
  src/tests/Makefile \
  src/tests/run-tests \
  src/tests/OpenAFS/Dirpath.pm \
+src/tsalvaged/Makefile \
  src/tsm41/Makefile \
  src/tviced/Makefile \
  src/tvolser/Makefile \
diff --git a/src/auth/Makefile.in b/src/auth/Makefile.in

index 33797066b1f71740388826f2c11100e5c6d76703..975775badbf7c0008132ff2b7ec0bab7ca0f4800 100644 (file)
--- a/src/auth/Makefile.in
+++ b/src/auth/Makefile.in
@@ -96,7 +96,7 @@ test:
         cd test; $(MAKE)
  
  clean:
-       $(RM) -f *.o *.a copyauth setkey auth.h cellconfig.h acfg_errors.c ktc_errors.c core\
+       $(RM) -f *.o *.a copyauth setkey auth.h cellconfig.h acfg_errors.c ktc_errors.c core \
         AFS_component_version_number.c
  
  include ../config/Makefile.version
diff --git a/src/bozo/bos.c b/src/bozo/bos.c

index ad5a00f4f8e1f82e8537a2508cfd833207654478..cca66c03a66d0feceaf3acb0142858a53e35c777 100644 (file)
--- a/src/bozo/bos.c
+++ b/src/bozo/bos.c
@@ -52,10 +52,12 @@ static DoStat();
  
  #include "bosint.h"
  
-#define MRAFS_OFFSET  9
-#define ADDPARMOFFSET 26
+/* command offsets for bos salvage command */
+#define MRAFS_OFFSET  10
+#define ADDPARMOFFSET 27
  
-static struct SalvageParms {
+/* MR-AFS salvage parameters */
+struct MRAFSSalvageParms {
      afs_int32 Optdebug;
      afs_int32 Optnowrite;
      afs_int32 Optforce;
@@ -74,7 +76,7 @@ static struct SalvageParms {
      afs_int32 OptLogLevel;
      afs_int32 OptRxDebug;
      afs_uint32 OptResidencies;
-} mrafsParm;
+};
  
  /* dummy routine for the audit work.  It should do nothing since audits */
  /* occur at the server level and bos is not a server. */
@@ -1224,17 +1226,11 @@ StopServer(as)
  
  #define PARMBUFFERSSIZE 32
  
-static
-DoSalvage(aconn, aparm1, aparm2, aoutName, showlog, parallel, atmpDir,
-         orphans)
-     struct rx_connection *aconn;
-     char *aoutName;
-     char *aparm1;
-     char *aparm2;
-     afs_int32 showlog;
-     char *parallel;
-     char *atmpDir;
-     char *orphans;
+static afs_int32
+DoSalvage(struct rx_connection * aconn, char * aparm1, char * aparm2, 
+         char * aoutName, afs_int32 showlog, char * parallel, 
+         char * atmpDir, char * orphans, int dafs, 
+         struct MRAFSSalvageParms * mrafsParm)
  {
      register afs_int32 code;
      char *parms[6];
@@ -1285,19 +1281,43 @@ DoSalvage(aconn, aparm1, aparm2, aoutName, showlog, parallel, atmpDir,
         parms[code] = "";
      if (!aparm2)
         aparm2 = "";
+
      /* MUST pass canonical (wire-format) salvager path to bosserver */
-    strncpy(tbuffer, AFSDIR_CANONICAL_SERVER_SALVAGER_FILEPATH, BOZO_BSSIZE);
      if (*aparm2 != 0) {
-       if ((strlen(tbuffer) + 1 + strlen(partName) + 1 + strlen(aparm2) +
-            1) > BOZO_BSSIZE) {
-           printf("bos: command line too big\n");
-           return (E2BIG);
+       /* single volume salvage */
+       if (dafs) {
+           /* for DAFS, we call the salvageserver binary with special options.
+            * in this mode, it simply uses SALVSYNC to tell the currently
+            * running salvageserver to offline and salvage the volume in question */
+           strncpy(tbuffer, AFSDIR_CANONICAL_SERVER_SALSRV_FILEPATH, BOZO_BSSIZE);
+
+           if ((strlen(tbuffer) + 9 + strlen(partName) + 1 + strlen(aparm2) +
+                1) > BOZO_BSSIZE) {
+               printf("bos: command line too big\n");
+               return (E2BIG);
+           }
+
+           strcat(tbuffer, " -client ");
+           strcat(tbuffer, partName);
+           strcat(tbuffer, " ");
+           strcat(tbuffer, aparm2);
+       } else {
+           strncpy(tbuffer, AFSDIR_CANONICAL_SERVER_SALVAGER_FILEPATH, BOZO_BSSIZE);
+
+           if ((strlen(tbuffer) + 1 + strlen(partName) + 1 + strlen(aparm2) +
+                1) > BOZO_BSSIZE) {
+               printf("bos: command line too big\n");
+               return (E2BIG);
+           }
+
+           strcat(tbuffer, " ");
+           strcat(tbuffer, partName);
+           strcat(tbuffer, " ");
+           strcat(tbuffer, aparm2);
         }
-       strcat(tbuffer, " ");
-       strcat(tbuffer, partName);
-       strcat(tbuffer, " ");
-       strcat(tbuffer, aparm2);
      } else {
+       /* partition salvage */
+       strncpy(tbuffer, AFSDIR_CANONICAL_SERVER_SALVAGER_FILEPATH, BOZO_BSSIZE);
         if ((strlen(tbuffer) + 4 + strlen(partName) + 1) > BOZO_BSSIZE) {
             printf("bos: command line too big\n");
             return (E2BIG);
@@ -1306,75 +1326,82 @@ DoSalvage(aconn, aparm1, aparm2, aoutName, showlog, parallel, atmpDir,
         strcat(tbuffer, partName);
      }
  
-    /* add the parallel option if given */
-    if (parallel != NULL) {
-       if ((strlen(tbuffer) + 11 + strlen(parallel) + 1) > BOZO_BSSIZE) {
-           printf("bos: command line too big\n");
-           return (E2BIG);
+    /* For DAFS, specifying a single volume does not result in a standard
+     * salvager call.  Instead, it simply results in a SALVSYNC call to the
+     * online salvager daemon.  This interface does not give us the same rich
+     * set of call flags.  Thus, we skip these steps for DAFS single-volume 
+     * calls */
+    if (!dafs || (*aparm2 == 0)) {
+       /* add the parallel option if given */
+       if (parallel != NULL) {
+           if ((strlen(tbuffer) + 11 + strlen(parallel) + 1) > BOZO_BSSIZE) {
+               printf("bos: command line too big\n");
+               return (E2BIG);
+           }
+           strcat(tbuffer, " -parallel ");
+           strcat(tbuffer, parallel);
         }
-       strcat(tbuffer, " -parallel ");
-       strcat(tbuffer, parallel);
-    }
  
-    /* add the tmpdir option if given */
-    if (atmpDir != NULL) {
-       if ((strlen(tbuffer) + 9 + strlen(atmpDir) + 1) > BOZO_BSSIZE) {
-           printf("bos: command line too big\n");
-           return (E2BIG);
+       /* add the tmpdir option if given */
+       if (atmpDir != NULL) {
+           if ((strlen(tbuffer) + 9 + strlen(atmpDir) + 1) > BOZO_BSSIZE) {
+               printf("bos: command line too big\n");
+               return (E2BIG);
+           }
+           strcat(tbuffer, " -tmpdir ");
+           strcat(tbuffer, atmpDir);
         }
-       strcat(tbuffer, " -tmpdir ");
-       strcat(tbuffer, atmpDir);
-    }
  
-    /* add the orphans option if given */
-    if (orphans != NULL) {
-       if ((strlen(tbuffer) + 10 + strlen(orphans) + 1) > BOZO_BSSIZE) {
-           printf("bos: command line too big\n");
-           return (E2BIG);
+       /* add the orphans option if given */
+       if (orphans != NULL) {
+           if ((strlen(tbuffer) + 10 + strlen(orphans) + 1) > BOZO_BSSIZE) {
+               printf("bos: command line too big\n");
+               return (E2BIG);
+           }
+           strcat(tbuffer, " -orphans ");
+           strcat(tbuffer, orphans);
+       }
+
+       if (mrafsParm->Optdebug)
+           strcat(tbuffer, " -debug");
+       if (mrafsParm->Optnowrite)
+           strcat(tbuffer, " -nowrite");
+       if (mrafsParm->Optforce)
+           strcat(tbuffer, " -force");
+       if (mrafsParm->Optoktozap)
+           strcat(tbuffer, " -oktozap");
+       if (mrafsParm->Optrootfiles)
+           strcat(tbuffer, " -rootfiles");
+       if (mrafsParm->Optsalvagedirs)
+           strcat(tbuffer, " -salvagedirs");
+       if (mrafsParm->Optblockreads)
+           strcat(tbuffer, " -blockreads");
+       if (mrafsParm->OptListResidencies)
+           strcat(tbuffer, " -ListResidencies");
+       if (mrafsParm->OptSalvageRemote)
+           strcat(tbuffer, " -SalvageRemote");
+       if (mrafsParm->OptSalvageArchival)
+           strcat(tbuffer, " -SalvageArchival");
+       if (mrafsParm->OptIgnoreCheck)
+           strcat(tbuffer, " -IgnoreCheck");
+       if (mrafsParm->OptForceOnLine)
+           strcat(tbuffer, " -ForceOnLine");
+       if (mrafsParm->OptUseRootDirACL)
+           strcat(tbuffer, " -UseRootDirACL");
+       if (mrafsParm->OptTraceBadLinkCounts)
+           strcat(tbuffer, " -TraceBadLinkCounts");
+       if (mrafsParm->OptDontAskFS)
+           strcat(tbuffer, " -DontAskFS");
+       if (mrafsParm->OptLogLevel) {   /* cast: %ld expects a long */
+           sprintf(pbuffer, " -LogLevel %ld", (long)mrafsParm->OptLogLevel);
+           strcat(tbuffer, pbuffer);
+       }
+       if (mrafsParm->OptRxDebug)
+           strcat(tbuffer, " -rxdebug");
+       if (mrafsParm->OptResidencies) {        /* cast: %lu expects an unsigned long */
+           sprintf(pbuffer, " -Residencies %lu", (unsigned long)mrafsParm->OptResidencies);
+           strcat(tbuffer, pbuffer);
         }
-       strcat(tbuffer, " -orphans ");
-       strcat(tbuffer, orphans);
-    }
-
-    if (mrafsParm.Optdebug)
-       strcat(tbuffer, " -debug");
-    if (mrafsParm.Optnowrite)
-       strcat(tbuffer, " -nowrite");
-    if (mrafsParm.Optforce)
-       strcat(tbuffer, " -force");
-    if (mrafsParm.Optoktozap)
-       strcat(tbuffer, " -oktozap");
-    if (mrafsParm.Optrootfiles)
-       strcat(tbuffer, " -rootfiles");
-    if (mrafsParm.Optsalvagedirs)
-       strcat(tbuffer, " -salvagedirs");
-    if (mrafsParm.Optblockreads)
-       strcat(tbuffer, " -blockreads");
-    if (mrafsParm.OptListResidencies)
-       strcat(tbuffer, " -ListResidencies");
-    if (mrafsParm.OptSalvageRemote)
-       strcat(tbuffer, " -SalvageRemote");
-    if (mrafsParm.OptSalvageArchival)
-       strcat(tbuffer, " -SalvageArchival");
-    if (mrafsParm.OptIgnoreCheck)
-       strcat(tbuffer, " -IgnoreCheck");
-    if (mrafsParm.OptForceOnLine)
-       strcat(tbuffer, " -ForceOnLine");
-    if (mrafsParm.OptUseRootDirACL)
-       strcat(tbuffer, " -UseRootDirACL");
-    if (mrafsParm.OptTraceBadLinkCounts)
-       strcat(tbuffer, " -TraceBadLinkCounts");
-    if (mrafsParm.OptDontAskFS)
-       strcat(tbuffer, " -DontAskFS");
-    if (mrafsParm.OptLogLevel) {
-       sprintf(pbuffer, " -LogLevel %ld", mrafsParm.OptLogLevel);
-       strcat(tbuffer, pbuffer);
-    }
-    if (mrafsParm.OptRxDebug)
-       strcat(tbuffer, " -rxdebug");
-    if (mrafsParm.OptResidencies) {
-       sprintf(pbuffer, " -Residencies %lu", mrafsParm.OptResidencies);
-       strcat(tbuffer, pbuffer);
      }
  
      parms[0] = tbuffer;
@@ -1481,22 +1508,36 @@ SalvageCmd(as)
      char tname[BOZO_BSSIZE];
      afs_int32 newID;
      extern struct ubik_client *cstruct;
-    afs_int32 curGoal, showlog = 0, mrafs = 0;
+    afs_int32 curGoal, showlog = 0, dafs = 0, mrafs = 0;
      char *parallel;
      char *tmpDir;
      char *orphans;
      char *tp;
+    char * serviceName;
+    struct MRAFSSalvageParms mrafsParm;
  
      memset(&mrafsParm, 0, sizeof(mrafsParm));
  
      /* parm 0 is machine name, 1 is partition, 2 is volume, 3 is -all flag */
      tconn = GetConn(as, 0);
  
-    /* Find out whether fileserver is running MR-AFS (has a scanner instance) */
-    /* XXX this should really be done some other way, potentially by RPC */
      tp = &tname[0];
-    if (code = BOZO_GetInstanceParm(tconn, "fs", 3, &tp) == 0)
-       mrafs = 1;
+
+    /* find out whether fileserver is running demand attach fs: a bnode
+     * instance named "dafs" that answers for parm 0 means it is */
+    code = (BOZO_GetInstanceParm(tconn, "dafs", 0, &tp) == 0);
+    dafs = code ? 1 : 0;
+    serviceName = dafs ? "dafs" : "fs";
+
+    /* Find out whether fileserver is running MR-AFS (has a scanner instance);
+     * the parm index probed is 4 for "dafs" and 3 for "fs" */
+    /* XXX this should really be done some other way, potentially by RPC */
+    /* NOTE(review): code holds the comparison result (1 or 0), not the RPC
+     * return value -- confirm that is intended */
+    code = (BOZO_GetInstanceParm(tconn, serviceName, dafs ? 4 : 3, &tp)
+           == 0);
+    if (code)
+       mrafs = 1;
  
      /* we can do a volume, a partition or the whole thing, but not mixtures
       * thereof */
@@ -1542,6 +1583,14 @@ SalvageCmd(as)
         orphans = as->parms[8].items->data;
      }
  
+    if (dafs) {
+       if (!as->parms[9].items) { /* -forceDAFS flag */
+           printf("This is a demand attach fileserver.  Are you sure you want to proceed with a manual salvage?\n");
+           printf("must specify -forceDAFS flag in order to proceed.\n");
+           return EINVAL;
+       }
+    }
+
      if (mrafs) {
         if (as->parms[MRAFS_OFFSET].items)
             mrafsParm.Optdebug = 1;
@@ -1597,7 +1646,7 @@ SalvageCmd(as)
      } else {
         int stop = 0;
  
-       for (i = 9; i < ADDPARMOFFSET; i++) {
+       for (i = MRAFS_OFFSET; i < ADDPARMOFFSET; i++) {
             if (as->parms[i].items) {
                 printf(" %s only possible for MR-AFS fileserver.\n",
                        as->parms[i].name);
@@ -1610,12 +1659,12 @@ SalvageCmd(as)
  
      if (as->parms[4].items) {
         /* salvage whole enchilada */
-       curGoal = GetServerGoal(tconn, "fs");
+       curGoal = GetServerGoal(tconn, serviceName);
         if (curGoal == BSTAT_NORMAL) {
-           printf("bos: shutting down fs.\n");
-           code = BOZO_SetTStatus(tconn, "fs", BSTAT_SHUTDOWN);
+           printf("bos: shutting down '%s'.\n", serviceName);
+           code = BOZO_SetTStatus(tconn, serviceName, BSTAT_SHUTDOWN);
             if (code) {
-               printf("bos: failed to stop 'fs' (%s)\n", em(code));
+               printf("bos: failed to stop '%s' (%s)\n", serviceName, em(code));
                 return code;
             }
             code = BOZO_WaitAll(tconn); /* wait for shutdown to complete */
@@ -1626,12 +1675,12 @@ SalvageCmd(as)
         /* now do the salvage operation */
         printf("Starting salvage.\n");
         rc = DoSalvage(tconn, NULL, NULL, outName, showlog, parallel, tmpDir,
-                      orphans);
+                      orphans, dafs, &mrafsParm);
         if (curGoal == BSTAT_NORMAL) {
-           printf("bos: restarting fs.\n");
-           code = BOZO_SetTStatus(tconn, "fs", BSTAT_NORMAL);
+           printf("bos: restarting %s.\n", serviceName);
+           code = BOZO_SetTStatus(tconn, serviceName, BSTAT_NORMAL);
             if (code) {
-               printf("bos: failed to restart 'fs' (%s)\n", em(code));
+               printf("bos: failed to restart '%s' (%s)\n", serviceName, em(code));
                 return code;
             }
         }
@@ -1651,13 +1700,13 @@ SalvageCmd(as)
                    as->parms[1].items->data);
             return -1;
         }
-       curGoal = GetServerGoal(tconn, "fs");
+       curGoal = GetServerGoal(tconn, serviceName);
         /* salvage a whole partition (specified by parms[1]) */
         if (curGoal == BSTAT_NORMAL) {
-           printf("bos: shutting down fs.\n");
-           code = BOZO_SetTStatus(tconn, "fs", BSTAT_SHUTDOWN);
+           printf("bos: shutting down '%s'.\n", serviceName);
+           code = BOZO_SetTStatus(tconn, serviceName, BSTAT_SHUTDOWN);
             if (code) {
-               printf("bos: can't stop 'fs' (%s)\n", em(code));
+               printf("bos: can't stop '%s' (%s)\n", serviceName, em(code));
                 return code;
             }
             code = BOZO_WaitAll(tconn); /* wait for shutdown to complete */
@@ -1668,12 +1717,12 @@ SalvageCmd(as)
         /* now do the salvage operation */
         printf("Starting salvage.\n");
         rc = DoSalvage(tconn, as->parms[1].items->data, NULL, outName,
-                      showlog, parallel, tmpDir, orphans);
+                      showlog, parallel, tmpDir, orphans, dafs, &mrafsParm);
         if (curGoal == BSTAT_NORMAL) {
-           printf("bos: restarting fs.\n");
-           code = BOZO_SetTStatus(tconn, "fs", BSTAT_NORMAL);
+           printf("bos: restarting '%s'.\n", serviceName);
+           code = BOZO_SetTStatus(tconn, serviceName, BSTAT_NORMAL);
             if (code) {
-               printf("bos: failed to restart 'fs' (%s)\n", em(code));
+               printf("bos: failed to restart '%s' (%s)\n", serviceName, em(code));
                 return code;
             }
         }
@@ -1723,7 +1772,7 @@ SalvageCmd(as)
         }
         printf("Starting salvage.\n");
         rc = DoSalvage(tconn, as->parms[1].items->data, tname, outName,
-                      showlog, parallel, tmpDir, orphans);
+                      showlog, parallel, tmpDir, orphans, dafs, &mrafsParm);
         if (rc)
             return rc;
      }
@@ -2153,6 +2202,8 @@ main(argc, argv)
                 "directory to place tmp files");
      cmd_AddParm(ts, "-orphans", CMD_SINGLE, CMD_OPTIONAL,
                 "ignore | remove | attach");
+    cmd_AddParm(ts, "-forceDAFS", CMD_FLAG, CMD_OPTIONAL,
+               "(DAFS) force salvage of demand attach fileserver");
      cmd_AddParm(ts, "-debug", CMD_FLAG, CMD_OPTIONAL,
                 "(MR-AFS) Run in Debugging mode");
      cmd_AddParm(ts, "-nowrite", CMD_FLAG, CMD_OPTIONAL,
diff --git a/src/bozo/bosserver.c b/src/bozo/bosserver.c

index 635a6810e46428144ddb5cc6e1fa2be7fbfcf29c..2351eeb0664411b7b411d63866a66e153c3a6ae2 100644 (file)
--- a/src/bozo/bosserver.c
+++ b/src/bozo/bosserver.c
@@ -51,7 +51,7 @@ RCSID
  #define BOZO_LWP_STACKSIZE     16000
  extern int BOZO_ExecuteRequest();
  extern int RXSTATS_ExecuteRequest();
-extern struct bnode_ops fsbnode_ops, ezbnode_ops, cronbnode_ops;
+extern struct bnode_ops fsbnode_ops, dafsbnode_ops, ezbnode_ops, cronbnode_ops;
  
  void bozo_Log();
  
@@ -895,6 +895,7 @@ main(int argc, char **argv, char **envp)
      }
  
      bnode_Register("fs", &fsbnode_ops, 3);
+    bnode_Register("dafs", &dafsbnode_ops, 4);
      bnode_Register("simple", &ezbnode_ops, 1);
      bnode_Register("cron", &cronbnode_ops, 2);
  
diff --git a/src/bozo/fsbnodeops.c b/src/bozo/fsbnodeops.c

index 2ac65e46210cdae8c388fc99e6d98e172176a2df..e38670e80e96baf438717d2874472e6f99515e90 100644 (file)
--- a/src/bozo/fsbnodeops.c
+++ b/src/bozo/fsbnodeops.c
@@ -41,13 +41,6 @@ RCSID
  #include <afs/afsutil.h>
  #include "bnode.h"
  
-static int fs_timeout(), fs_getstat(), fs_setstat(), fs_delete();
-static int fs_procexit(), fs_getstring(), fs_getparm(), fs_restartp();
-static int fs_hascore();
-struct bnode *fs_create();
-
-static SetNeedsClock();
-static NudgeProcs();
  
  static int emergency = 0;
  
@@ -76,49 +69,105 @@ static int emergency = 0;
      The needsSalvage flag is cleared when the salvager exits.
  */
  
-struct bnode_ops fsbnode_ops = {
-    fs_create,
-    fs_timeout,
-    fs_getstat,
-    fs_setstat,
-    fs_delete,
-    fs_procexit,
-    fs_getstring,
-    fs_getparm,
-    fs_restartp,
-    fs_hascore,
-};
-
  struct fsbnode {
      struct bnode b;
      afs_int32 timeSDStarted;   /* time shutdown operation started */
      char *filecmd;             /* command to start primary file server */
      char *volcmd;              /* command to start secondary vol server */
+    char *salsrvcmd;            /* command to start salvageserver (demand attach fs) */
      char *salcmd;              /* command to start salvager */
      char *scancmd;             /* command to start scanner (MR-AFS) */
      struct bnode_proc *fileProc;       /* process for file server */
      struct bnode_proc *volProc;        /* process for vol server */
+    struct bnode_proc *salsrvProc;     /* process for salvageserver (demand attach fs) */
      struct bnode_proc *salProc;        /* process for salvager */
      struct bnode_proc *scanProc;       /* process for scanner (MR-AFS) */
      afs_int32 lastFileStart;   /* last start for file */
      afs_int32 lastVolStart;    /* last start for vol */
+    afs_int32 lastSalsrvStart; /* last start for salvageserver (demand attach fs) */
      afs_int32 lastScanStart;   /* last start for scanner (MR-AFS) */
      char fileRunning;          /* file process is running */
      char volRunning;           /* volser is running */
+    char salsrvRunning;                /* salvageserver is running (demand attach fs) */
      char salRunning;           /* salvager is running */
      char scanRunning;          /* scanner is running (MR_AFS) */
      char fileSDW;              /* file shutdown wait */
      char volSDW;               /* vol shutdown wait */
+    char salsrvSDW;            /* salvageserver shutdown wait (demand attach fs) */
      char salSDW;               /* waiting for the salvager to shutdown */
      char scanSDW;              /* scanner shutdown wait (MR_AFS) */
      char fileKillSent;         /* kill signal has been sent */
      char volKillSent;
+    char salsrvKillSent;        /* kill signal has been sent (demand attach fs) */
      char salKillSent;
      char scanKillSent;         /* kill signal has been sent (MR_AFS) */
      char needsSalvage;         /* salvage before running */
      char needsClock;           /* do we need clock ticks */
  };
  
+
+
+struct bnode * fs_create(char *ainstance, char *afilecmd, char *avolcmd, 
+                        char *asalcmd, char *ascancmd);
+struct bnode * dafs_create(char *ainstance, char *afilecmd, char *avolcmd, 
+                          char * asalsrvcmd, char *asalcmd, char *ascancmd);
+
+static int fs_hascore(register struct ezbnode *abnode);
+static int fs_restartp(register struct fsbnode *abnode);
+static int SetSalFlag(register struct fsbnode *abnode, register int aflag);
+static int RestoreSalFlag(register struct fsbnode *abnode);
+static int fs_delete(struct fsbnode *abnode);
+static int fs_timeout(struct fsbnode *abnode);
+static int fs_getstat(struct fsbnode *abnode, afs_int32 * astatus);
+static int fs_setstat(register struct fsbnode *abnode, afs_int32 astatus);
+static int fs_procexit(struct fsbnode *abnode, struct bnode_proc *aproc);
+static int fs_getstring(struct fsbnode *abnode, char *abuffer, afs_int32 alen);
+
+
+static int fs_getparm(struct fsbnode *abnode, afs_int32 aindex, 
+                     char *abuffer, afs_int32 alen);
+static int dafs_getparm(struct fsbnode *abnode, afs_int32 aindex, 
+                       char *abuffer, afs_int32 alen);
+
+#ifdef AFS_NT40_ENV
+static void AppendExecutableExtension(char *cmd);
+#else
+#define AppendExecutableExtension(x)   /* no-op when AFS_NT40_ENV is unset */
+#endif
+
+static void SetNeedsClock(register struct fsbnode *ab);
+static int NudgeProcs(register struct fsbnode *abnode);
+
+
+
+struct bnode_ops fsbnode_ops = {
+    fs_create,
+    fs_timeout,
+    fs_getstat,
+    fs_setstat,
+    fs_delete,
+    fs_procexit,
+    fs_getstring,
+    fs_getparm,
+    fs_restartp,
+    fs_hascore,
+};
+
+/* demand attach fs bnode ops */
+struct bnode_ops dafsbnode_ops = {
+    dafs_create,
+    fs_timeout,
+    fs_getstat,
+    fs_setstat,
+    fs_delete,
+    fs_procexit,
+    fs_getstring,
+    dafs_getparm,
+    fs_restartp,
+    fs_hascore,
+};
+
+
  /* Function to tell whether this bnode has a core file or not.  You might
   * think that this could be in bnode.c, and decide what core files to check
   * for based on the bnode's coreName property, but that doesn't work because
@@ -140,6 +189,11 @@ fs_hascore(register struct ezbnode *abnode)
      if (access(tbuffer, 0) == 0)
         return 1;
  
+    /* see if salvageserver left a core file */
+    bnode_CoreName(abnode, "salsrv", tbuffer);
+    if (access(tbuffer, 0) == 0)
+       return 1;
+
      /* see if salvager left a core file */
      bnode_CoreName(abnode, "salv", tbuffer);
      if (access(tbuffer, 0) == 0)
@@ -198,6 +252,25 @@ fs_restartp(register struct fsbnode *abnode)
      if (code)
         return code;
  
+    if (abnode->salsrvcmd) {    /* only in demand attach fs */
+       /* Same staleness test for the salvageserver binary: compare its
+        * ctime with the salvageserver's own last start time. */
+       code = bnode_ParseLine(abnode->salsrvcmd, &tt);
+       if (code || !tt)
+           return 0;           /* command line did not parse */
+       code = stat(tt->key, &tstat);
+       if (code) {
+           bnode_FreeTokens(tt);
+           return 0;           /* stat failed */
+       }
+       code = (tstat.st_ctime > abnode->lastSalsrvStart) ? 1 : 0;
+       bnode_FreeTokens(tt);
+       /* Report a newer binary now; the MR-AFS scanner check below
+        * reassigns code and would otherwise discard this result. */
+       if (code)
+           return code;
+    }
+
      if (abnode->scancmd) {     /* Only in MR-AFS */
         /* now do same for scancmd (MR-AFS) */
         code = bnode_ParseLine(abnode->scancmd, &tt);
@@ -228,14 +301,17 @@ SetSalFlag(register struct fsbnode *abnode, register int aflag)
      char tbuffer[AFSDIR_PATH_MAX];
      int fd;
  
-    abnode->needsSalvage = aflag;
-    strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/",
-              SALFILE, abnode->b.name, NULL);
-    if (aflag) {
-       fd = open(tbuffer, O_CREAT | O_TRUNC | O_RDWR, 0666);
-       close(fd);
-    } else {
-       unlink(tbuffer);
+    /* don't use the salvage flag for demand attach fs */
+    if (abnode->salsrvcmd == NULL) {
+       abnode->needsSalvage = aflag;
+       strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/",
+                  SALFILE, abnode->b.name, NULL);
+       if (aflag) {
+           fd = open(tbuffer, O_CREAT | O_TRUNC | O_RDWR, 0666);
+           close(fd);
+       } else {
+           unlink(tbuffer);
+       }
      }
      return 0;
  }
@@ -246,13 +322,18 @@ RestoreSalFlag(register struct fsbnode *abnode)
  {
      char tbuffer[AFSDIR_PATH_MAX];
  
-    strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/",
-              SALFILE, abnode->b.name, NULL);
-    if (access(tbuffer, 0) == 0) {
-       /* file exists, so need to salvage */
-       abnode->needsSalvage = 1;
-    } else {
+    /* never set needs salvage flag for demand attach fs */
+    if (abnode->salsrvcmd != NULL) {
         abnode->needsSalvage = 0;
+    } else {
+       strcompose(tbuffer, AFSDIR_PATH_MAX, AFSDIR_SERVER_LOCAL_DIRPATH, "/",
+                  SALFILE, abnode->b.name, NULL);
+       if (access(tbuffer, 0) == 0) {
+           /* file exists, so need to salvage */
+           abnode->needsSalvage = 1;
+       } else {
+           abnode->needsSalvage = 0;
+       }
      }
      return 0;
  }
@@ -272,6 +353,8 @@ fs_delete(struct fsbnode *abnode)
      free(abnode->filecmd);
      free(abnode->volcmd);
      free(abnode->salcmd);
+    if (abnode->salsrvcmd)
+       free(abnode->salsrvcmd);
      if (abnode->scancmd)
         free(abnode->scancmd);
      free(abnode);
@@ -304,95 +387,235 @@ fs_create(char *ainstance, char *afilecmd, char *avolcmd, char *asalcmd,
      char *fileCmdpath, *volCmdpath, *salCmdpath, *scanCmdpath;
      int bailout = 0;
  
-    fileCmdpath = volCmdpath = salCmdpath = NULL;
+    te = fileCmdpath = volCmdpath = salCmdpath = scanCmdpath = NULL;
  
      /* construct local paths from canonical (wire-format) paths */
      if (ConstructLocalBinPath(afilecmd, &fileCmdpath)) {
         bozo_Log("BNODE: command path invalid '%s'\n", afilecmd);
         bailout = 1;
+       goto done;
      }
      if (ConstructLocalBinPath(avolcmd, &volCmdpath)) {
         bozo_Log("BNODE: command path invalid '%s'\n", avolcmd);
         bailout = 1;
+       goto done;
      }
      if (ConstructLocalBinPath(asalcmd, &salCmdpath)) {
         bozo_Log("BNODE: command path invalid '%s'\n", asalcmd);
         bailout = 1;
+       goto done;
      }
  
      if (ascancmd && strlen(ascancmd)) {
         if (ConstructLocalBinPath(ascancmd, &scanCmdpath)) {
             bozo_Log("BNODE: command path invalid '%s'\n", ascancmd);
             bailout = 1;
+           goto done;
         }
      }
  
      if (!bailout) {
         sscanf(fileCmdpath, "%s", cmdname);
-#ifdef AFS_NT40_ENV
         AppendExecutableExtension(cmdname);
-#endif
         if (stat(cmdname, &tstat)) {
             bozo_Log("BNODE: file server binary '%s' not found\n", cmdname);
             bailout = 1;
+           goto done;
         }
  
         sscanf(volCmdpath, "%s", cmdname);
-#ifdef AFS_NT40_ENV
         AppendExecutableExtension(cmdname);
-#endif
         if (stat(cmdname, &tstat)) {
             bozo_Log("BNODE: volume server binary '%s' not found\n", cmdname);
             bailout = 1;
+           goto done;
         }
  
         sscanf(salCmdpath, "%s", cmdname);
-#ifdef AFS_NT40_ENV
         AppendExecutableExtension(cmdname);
-#endif
         if (stat(cmdname, &tstat)) {
             bozo_Log("BNODE: salvager binary '%s' not found\n", cmdname);
             bailout = 1;
+           goto done;
         }
  
         if (ascancmd && strlen(ascancmd)) {
             sscanf(scanCmdpath, "%s", cmdname);
-#ifdef AFS_NT40_ENV
             AppendExecutableExtension(cmdname);
-#endif
             if (stat(cmdname, &tstat)) {
                 bozo_Log("BNODE: scanner binary '%s' not found\n", cmdname);
                 bailout = 1;
+               goto done;
             }
         }
      }
  
+    te = (struct fsbnode *)malloc(sizeof(struct fsbnode));
+    if (te == NULL) {
+       bailout = 1;
+       goto done;
+    }
+    memset(te, 0, sizeof(struct fsbnode));
+    te->filecmd = fileCmdpath;
+    te->volcmd = volCmdpath;
+    te->salsrvcmd = NULL;
+    te->salcmd = salCmdpath;
+    if (ascancmd && strlen(ascancmd))
+       te->scancmd = scanCmdpath;
+    else
+       te->scancmd = NULL;
+    if (bnode_InitBnode(te, &fsbnode_ops, ainstance) != 0) {
+       bailout = 1;
+       goto done;
+    }
+    bnode_SetTimeout(te, POLLTIME);    /* ask for timeout activations every 10 seconds */
+    RestoreSalFlag(te);                /* restore needsSalvage flag based on file's existence */
+    SetNeedsClock(te);         /* compute needsClock field */
+
+ done:
      if (bailout) {
-       free(fileCmdpath);
-       free(volCmdpath);
-       free(salCmdpath);
+       if (te)
+           free(te);
+       if (fileCmdpath)
+           free(fileCmdpath);
+       if (volCmdpath)
+           free(volCmdpath);
+       if (salCmdpath)
+           free(salCmdpath);
+       if (scanCmdpath)
+           free(scanCmdpath);
         return NULL;
      }
  
+    return (struct bnode *)te;
+}
+
+/* create a demand attach fs bnode; returns NULL (after freeing every path and the bnode) when a command path is invalid, a binary is missing, or allocation / bnode init fails */
+struct bnode *
+dafs_create(char *ainstance, char *afilecmd, char *avolcmd, 
+           char * asalsrvcmd, char *asalcmd, char *ascancmd)
+{
+    struct stat tstat;
+    register struct fsbnode *te = NULL;        /* set here, not in the char * chain below: the pointer types differ */
+    char cmdname[AFSDIR_PATH_MAX];
+    char *fileCmdpath, *volCmdpath, *salsrvCmdpath, *salCmdpath, *scanCmdpath;
+    int bailout = 0;
+
+    fileCmdpath = volCmdpath = salsrvCmdpath = salCmdpath = scanCmdpath = NULL;
+
+    /* construct local paths from canonical (wire-format) paths */
+    if (ConstructLocalBinPath(afilecmd, &fileCmdpath)) {
+       bozo_Log("BNODE: command path invalid '%s'\n", afilecmd);
+       bailout = 1;
+       goto done;
+    }
+    if (ConstructLocalBinPath(avolcmd, &volCmdpath)) {
+       bozo_Log("BNODE: command path invalid '%s'\n", avolcmd);
+       bailout = 1;
+       goto done;
+    }
+    if (ConstructLocalBinPath(asalsrvcmd, &salsrvCmdpath)) {
+       bozo_Log("BNODE: command path invalid '%s'\n", asalsrvcmd);
+       bailout = 1;
+       goto done;
+    }
+    if (ConstructLocalBinPath(asalcmd, &salCmdpath)) {
+       bozo_Log("BNODE: command path invalid '%s'\n", asalcmd);
+       bailout = 1;
+       goto done;
+    }
+
+    if (ascancmd && strlen(ascancmd)) {
+       if (ConstructLocalBinPath(ascancmd, &scanCmdpath)) {
+           bozo_Log("BNODE: command path invalid '%s'\n", ascancmd);
+           bailout = 1;
+           goto done;
+       }
+    }
+
+    if (!bailout) {
+       sscanf(fileCmdpath, "%s", cmdname);
+       AppendExecutableExtension(cmdname);
+       if (stat(cmdname, &tstat)) {
+           bozo_Log("BNODE: file server binary '%s' not found\n", cmdname);
+           bailout = 1;
+           goto done;
+       }
+
+       sscanf(volCmdpath, "%s", cmdname);
+       AppendExecutableExtension(cmdname);
+       if (stat(cmdname, &tstat)) {
+           bozo_Log("BNODE: volume server binary '%s' not found\n", cmdname);
+           bailout = 1;
+           goto done;
+       }
+
+       sscanf(salsrvCmdpath, "%s", cmdname);
+       AppendExecutableExtension(cmdname);
+       if (stat(cmdname, &tstat)) {
+           bozo_Log("BNODE: salvageserver binary '%s' not found\n", cmdname);
+           bailout = 1;
+           goto done;
+       }
+
+       sscanf(salCmdpath, "%s", cmdname);
+       AppendExecutableExtension(cmdname);
+       if (stat(cmdname, &tstat)) {
+           bozo_Log("BNODE: salvager binary '%s' not found\n", cmdname);
+           bailout = 1;
+           goto done;
+       }
+
+       if (ascancmd && strlen(ascancmd)) {
+           sscanf(scanCmdpath, "%s", cmdname);
+           AppendExecutableExtension(cmdname);
+           if (stat(cmdname, &tstat)) {
+               bozo_Log("BNODE: scanner binary '%s' not found\n", cmdname);
+               bailout = 1;
+               goto done;
+           }
+       }
+    }
+
      te = (struct fsbnode *)malloc(sizeof(struct fsbnode));
+    if (te == NULL) {
+       bailout = 1;
+       goto done;
+    }
      memset(te, 0, sizeof(struct fsbnode));
      te->filecmd = fileCmdpath;
      te->volcmd = volCmdpath;
+    te->salsrvcmd = salsrvCmdpath;
      te->salcmd = salCmdpath;
      if (ascancmd && strlen(ascancmd))
         te->scancmd = scanCmdpath;
      else
         te->scancmd = NULL;
-    if (bnode_InitBnode(te, &fsbnode_ops, ainstance) != 0) {
-       free(te);
-       free(fileCmdpath);
-       free(volCmdpath);
-       free(salCmdpath);
-       return NULL;
+    if (bnode_InitBnode(te, &dafsbnode_ops, ainstance) != 0) {
+       bailout = 1;
+       goto done;
      }
      bnode_SetTimeout(te, POLLTIME);    /* ask for timeout activations every 10 seconds */
      RestoreSalFlag(te);                /* restore needsSalvage flag based on file's existence */
      SetNeedsClock(te);         /* compute needsClock field */
+
+ done:
+    if (bailout) {
+       if (te)
+           free(te);
+       if (fileCmdpath)
+           free(fileCmdpath);
+       if (volCmdpath)
+           free(volCmdpath);
+       if (salsrvCmdpath)
+           free(salsrvCmdpath);
+       if (salCmdpath)
+           free(salCmdpath);
+       if (scanCmdpath)
+           free(scanCmdpath);
+       return NULL;
+    }
+
      return (struct bnode *)te;
  }
  
@@ -431,6 +654,15 @@ fs_timeout(struct fsbnode *abnode)
                  FSSDTIME);
         }
      }
+    if (abnode->salsrvSDW) {
+       if (!abnode->salsrvKillSent && now - abnode->timeSDStarted > SDTIME) {
+           bnode_StopProc(abnode->salsrvProc, SIGKILL);
+           abnode->salsrvKillSent = 1;
+           bozo_Log
+               ("bos shutdown: salvageserver failed to shutdown within %d seconds\n",
+                SDTIME);
+       }
+    }
      if (abnode->scanSDW) {
         if (!abnode->scanKillSent && now - abnode->timeSDStarted > SDTIME) {
             bnode_StopProc(abnode->scanProc, SIGKILL);
@@ -449,15 +681,17 @@ fs_getstat(struct fsbnode *abnode, afs_int32 * astatus)
  {
      register afs_int32 temp;
      if (abnode->volSDW || abnode->fileSDW || abnode->salSDW
-       || abnode->scanSDW)
+       || abnode->scanSDW || abnode->salsrvSDW)
         temp = BSTAT_SHUTTINGDOWN;
      else if (abnode->salRunning)
         temp = BSTAT_NORMAL;
      else if (abnode->volRunning && abnode->fileRunning
-            && (!abnode->scancmd || abnode->scanRunning))
+            && (!abnode->scancmd || abnode->scanRunning)
+            && (!abnode->salsrvcmd || abnode->salsrvRunning))
         temp = BSTAT_NORMAL;
      else if (!abnode->salRunning && !abnode->volRunning
-            && !abnode->fileRunning && !abnode->scanRunning)
+            && !abnode->fileRunning && !abnode->scanRunning
+            && !abnode->salsrvRunning)
         temp = BSTAT_SHUTDOWN;
      else
         temp = BSTAT_STARTINGUP;
@@ -508,6 +742,11 @@ fs_procexit(struct fsbnode *abnode, struct bnode_proc *aproc)
         abnode->scanRunning = 0;
         abnode->scanSDW = 0;
         abnode->scanKillSent = 0;
+    } else if (aproc == abnode->salsrvProc) {
+       abnode->salsrvProc = 0;
+       abnode->salsrvRunning = 0;
+       abnode->salsrvSDW = 0;
+       abnode->salsrvKillSent = 0;
      }
  
      /* now restart anyone who needs to restart */
@@ -515,14 +754,15 @@ fs_procexit(struct fsbnode *abnode, struct bnode_proc *aproc)
  }
  
  /* make sure we're periodically checking the state if we need to */
-static int
+static void
  SetNeedsClock(register struct fsbnode *ab)
  {
      if (ab->b.goal == 1 && ab->fileRunning && ab->volRunning
-       && (!ab->scancmd || ab->scanRunning))
+       && (!ab->scancmd || ab->scanRunning)
+       && (!ab->salsrvcmd || ab->salsrvRunning))
         ab->needsClock = 0;     /* running normally */
      else if (ab->b.goal == 0 && !ab->fileRunning && !ab->volRunning
-            && !ab->salRunning && !ab->scanRunning)
+            && !ab->salRunning && !ab->scanRunning && !ab->salsrvRunning)
         ab->needsClock = 0;     /* halted normally */
      else
         ab->needsClock = 1;     /* other */
@@ -562,6 +802,18 @@ NudgeProcs(register struct fsbnode *abnode)
                     abnode->volRunning = 1;
                 }
             }
+           if (abnode->salsrvcmd) {
+               if (!abnode->salsrvRunning) {
+                   abnode->lastSalsrvStart = FT_ApproxTime();
+                   code =
+                       bnode_NewProc(abnode, abnode->salsrvcmd, "salsrv",
+                                     &tp);
+                   if (code == 0) {
+                       abnode->salsrvProc = tp;
+                       abnode->salsrvRunning = 1;
+                   }
+               }
+           }
             if (abnode->scancmd) {
                 if (!abnode->scanRunning) {
                     abnode->lastScanStart = FT_ApproxTime();
@@ -576,7 +828,8 @@ NudgeProcs(register struct fsbnode *abnode)
             }
         } else {                /* file is not running */
             /* see how to start */
-           if (!abnode->needsSalvage) {
+           /* for demand attach fs, needsSalvage flag is ignored */
+           if (!abnode->needsSalvage || abnode->salsrvcmd) {
                 /* no crash apparent, just start up normally */
                 if (!abnode->fileRunning) {
                     abnode->lastFileStart = FT_ApproxTime();
@@ -596,6 +849,16 @@ NudgeProcs(register struct fsbnode *abnode)
                         abnode->volRunning = 1;
                     }
                 }
+               if (abnode->salsrvcmd && !abnode->salsrvRunning) {
+                   abnode->lastSalsrvStart = FT_ApproxTime();
+                   code =
+                       bnode_NewProc(abnode, abnode->salsrvcmd, "salsrv",
+                                     &tp);
+                   if (code == 0) {
+                       abnode->salsrvProc = tp;
+                       abnode->salsrvRunning = 1;
+                   }
+               }
                 if (abnode->scancmd && !abnode->scanRunning) {
                     abnode->lastScanStart = FT_ApproxTime();
                     code =
@@ -656,6 +919,11 @@ NudgeProcs(register struct fsbnode *abnode)
             abnode->volSDW = 1;
             abnode->timeSDStarted = now;
         }
+       if (abnode->salsrvRunning && !abnode->salsrvSDW) {
+           bnode_StopProc(abnode->salsrvProc, SIGTERM);
+           abnode->salsrvSDW = 1;
+           abnode->timeSDStarted = now;
+       }
         if (abnode->scanRunning && !abnode->scanSDW) {
             bnode_StopProc(abnode->scanProc, SIGTERM);
             abnode->scanSDW = 1;
@@ -724,3 +992,22 @@ fs_getparm(struct fsbnode *abnode, afs_int32 aindex, char *abuffer,
         return BZDOM;
      return 0;
  }
+
+static int
+dafs_getparm(struct fsbnode *abnode, afs_int32 aindex, char *abuffer,
+            afs_int32 alen)    /* NOTE(review): alen is never read in this function */
+{      /* copy the command string selected by aindex into abuffer; 0 on success, BZDOM for any other index */
+    if (aindex == 0)   /* index 0: filecmd */
+       strcpy(abuffer, abnode->filecmd);       /* NOTE(review): unbounded copy; confirm callers size abuffer adequately */
+    else if (aindex == 1)      /* index 1: volcmd */
+       strcpy(abuffer, abnode->volcmd);
+    else if (aindex == 2)      /* index 2: salsrvcmd */
+       strcpy(abuffer, abnode->salsrvcmd);
+    else if (aindex == 3)      /* index 3: salcmd */
+       strcpy(abuffer, abnode->salcmd);
+    else if (aindex == 4 && abnode->scancmd)   /* index 4: scancmd, only when it is non-NULL */
+       strcpy(abuffer, abnode->scancmd);
+    else
+       return BZDOM;   /* index out of range, or index 4 with no scancmd */
+    return 0;
+}
diff --git a/src/cf/osconf.m4 b/src/cf/osconf.m4

index 9fe6161d8bd8c08d96780637159d938f6658b9eb..22daf81e3e20612ab0740f69bf956862223347c4 100644 (file)
--- a/src/cf/osconf.m4
+++ b/src/cf/osconf.m4
@@ -971,6 +971,18 @@ case $AFS_SYSNAME in
         ;;
  esac
  
+
+
+dnl pthreads fixes
+case $AFS_SYSNAME in
+dnl we'll go ahead and turn on XOPEN2K and ISO_C99
+dnl if this causes problems, we should scale back to _XOPEN_SOURCE=500
+       *linux*)
+               MT_CFLAGS="${MT_CFLAGS} -D_XOPEN_SOURCE=600 -D_BSD_SOURCE"
+       ;;
+esac
+
+
  dnl Disable the default for debugging/optimization if not enabled
  if test "x$enable_debug_kernel" = "xno"; then
    KERN_DBG=
diff --git a/src/config/param.rs_aix51.h b/src/config/param.rs_aix51.h

index ecfe978c4eba67242e9acb6821741ab1df0495f6..cd49793baed6599abd33c2c261c235c3c487f3a0 100644 (file)
--- a/src/config/param.rs_aix51.h
+++ b/src/config/param.rs_aix51.h
@@ -25,8 +25,6 @@
  #ifdef AFS_NAMEI_ENV
  #define AFS_64BIT_IOPS_ENV     1
  #endif
-#define BITMAP_LATER           1
-#define FAST_RESTART           1
  
  #define AFS_HAVE_FLOCK_SYSID    1
  
diff --git a/src/config/param.rs_aix52.h b/src/config/param.rs_aix52.h

index 0ee9986ec99d42d9e4658187ae516bd429ea2375..b20bb378dcbcae1cdbda0dd3b14daf6d1a456991 100644 (file)
--- a/src/config/param.rs_aix52.h
+++ b/src/config/param.rs_aix52.h
@@ -26,8 +26,6 @@
  #ifdef AFS_NAMEI_ENV
  #define AFS_64BIT_IOPS_ENV     1
  #endif
-#define BITMAP_LATER           1
-#define FAST_RESTART           1
  
  #define AFS_HAVE_FLOCK_SYSID    1
  
diff --git a/src/config/param.rs_aix53.h b/src/config/param.rs_aix53.h

index ba4f151f3cd30209cc36e4041469bf63422e9352..ecfb3671a29cacb7453c02984c3d2e62dbfa39e1 100644 (file)
--- a/src/config/param.rs_aix53.h
+++ b/src/config/param.rs_aix53.h
@@ -27,8 +27,6 @@
  #ifdef AFS_NAMEI_ENV
  #define AFS_64BIT_IOPS_ENV     1
  #endif
-#define BITMAP_LATER           1
-#define FAST_RESTART           1
  
  #define AFS_HAVE_FLOCK_SYSID    1
  
diff --git a/src/config/stds.h b/src/config/stds.h

index 7b256b6735e317906cd1c0c05ec0b530fc6759a0..9266b0c7f632a89658037ebb868493ea10bfd350 100644 (file)
--- a/src/config/stds.h
+++ b/src/config/stds.h
@@ -56,8 +56,16 @@ typedef unsigned __int64 afs_uint64;
  typedef long long afs_int64;
  typedef unsigned long long afs_uint64;
  #endif
-#define ZeroInt64(a)       (a) = 0
+#define ZeroInt64(a)       (a = 0)
  #define AssignInt64(a, b) *(b) = (a) 
+#define IncInt64(a) (*(a))++
+#define IncUInt64(a) (*(a))++
+#define DecInt64(a) (*(a))--
+#define DecUInt64(a) (*(a))--
+#define GTInt64(a,b) ((a) > (b))
+#define GEInt64(a,b) ((a) >= (b))
+#define LEInt64(a,b) ((a) <= (b))
+#define LTInt64(a,b) ((a) < (b))
  #define AddInt64(a,b,c) *(c) = (afs_int64)(a) + (afs_int64)(b)
  #define AddUInt64(a,b,c) *(c) = (afs_uint64)(a) + (afs_uint64)(b)
  #define SubtractInt64(a,b,c) *(c) = (afs_int64)(a) - (afs_int64)(b)
@@ -83,8 +91,16 @@ struct u_Int64 {
      afs_uint32 low;
  };
  typedef struct u_Int64 afs_uint64;
-#define ZeroInt64(a) (a).high = (a).low = 0
+#define ZeroInt64(a) ((a).high = (a).low = 0)
  #define AssignInt64(a, b) (b)->high = (a).high; (b)->low = (a).low
+#define IncInt64(a) ((++((a)->low)) ? 0 : (a)->high++ )
+#define IncUInt64(a) ((++((a)->low)) ? 0 : (a)->high++ )
+#define DecInt64(a) (((a)->low)-- ? 0 : (a)->high-- )
+#define DecUInt64(a) (((a)->low)-- ? 0 : (a)->high-- )
+#define GTInt64(a,b) (((a).high > (b).high) || (((a).high == (b).high) && ((a).low > (b).low)))
+#define GEInt64(a,b) (((a).high > (b).high) || (((a).high == (b).high) && ((a).low >= (b).low)))
+#define LEInt64(a,b) (((a).high < (b).high) || (((a).high == (b).high) && ((a).low <= (b).low)))
+#define LTInt64(a,b) (((a).high < (b).high) || (((a).high == (b).high) && ((a).low < (b).low)))
  #define CompareInt64(a,b) (((afs_int32)(a).high - (afs_int32)(b).high) || (((a).high == (b).high) && ((a).low - (b).low))) 
  #define AddInt64(a, b, c) {  afs_int64 _a, _b; _a = a; _b = b; (c)->low = _a.low + _b.low; (c)->high = _a.high + _b.high + ((c)->low < _b.low); } 
  #define SubtractInt64(a, b, c) { afs_int64 _a, _b; _a = a; _b = b; (c)->low = _a.low - _b.low;  (c)->high = _a.high - _b.high - (_a.low < _b.low); } 
@@ -246,4 +262,9 @@ struct afsUUID {
  };
  typedef struct afsUUID afsUUID;
  
+/* for now, demand attach fileserver is only supported on unix pthreads builds */
+#if defined(DEMAND_ATTACH_ENABLE) && defined(AFS_PTHREAD_ENV) && !defined(AFS_NT40_ENV)
+#define AFS_DEMAND_ATTACH_FS 1
+#endif
+
  #endif /* OPENAFS_CONFIG_AFS_STDS_H */
diff --git a/src/rx/rx_queue.h b/src/rx/rx_queue.h

index fcd813c407718c794615f3021fa0e582795d2afb..1e930a676534f73f6bbe803c68461550018cd73f 100644 (file)
--- a/src/rx/rx_queue.h
+++ b/src/rx/rx_queue.h
@@ -78,6 +78,13 @@ for (n=0, queue_Scan(&myqueue, qe, nqe, myelement), n++) {}
  #define _RXQSP(q1,q2,i,a,b,c,d,x,y) if (!queue_IsEnd(q1,i->c)) \
      (((y->b->a=q2->a)->b=y->b), ((x->a->b=q2)->a=x->a), ((i->c=q1)->d=i))
  
+/* This one moves a chain of elements from (s) to (e) from its
+ * current position to either before or after element (i)
+ * if (a,b,x,y) is (prev,next,s,e) then chain is moved before (i)
+ * if (a,b,x,y) is (next,prev,e,s) then chain is moved after (i) */
+#define _RXQMV(i, s, e, a, b, x, y) if (i->a != y) \
+    (((e->next->prev=s->prev)->next=e->next), ((i->a->b=x)->a=i->a), ((y->b=i)->a=y))
+
  /* Basic remove operation.  Doesn't update the queue item to indicate it's been removed */
  #define _RXQR(i) ((_RXQ(i)->prev->next=_RXQ(i)->next)->prev=_RXQ(i)->prev)
  
@@ -120,6 +127,12 @@ for (n=0, queue_Scan(&myqueue, qe, nqe, myelement), n++) {}
  #define queue_Replace(q1,q2) if (queue_IsEmpty(q2)) queue_Init(q1); else \
      (*_RXQ(q1) = *_RXQ(q2), _RXQ(q1)->next->prev = _RXQ(q1)->prev->next = _RXQ(q1), queue_Init(q2))
  
+/* move a chain of elements beginning at (s) and ending at (e) before node (i) */
+#define queue_MoveChainBefore(i, s, e) _RXQMV(_RXQ(i),_RXQ(s),_RXQ(e),prev,next,_RXQ(s),_RXQ(e))
+
+/* move a chain of elements beginning at (s) and ending at (e) after node (i) */
+#define queue_MoveChainAfter(i, s, e) _RXQMV(_RXQ(i),_RXQ(s),_RXQ(e),next,prev,_RXQ(e),_RXQ(s))
+
  /* Remove a queue element (*i) from it's queue.  The next field is 0'd, so that any further use of this q entry will hopefully cause a core dump.  Multiple removes of the same queue item are not supported */
  #define queue_Remove(i) (_RXQR(i), _RXQ(i)->next = 0)
  
@@ -155,6 +168,10 @@ for (n=0, queue_Scan(&myqueue, qe, nqe, myelement), n++) {}
  /* Returns false if the item was removed from a queue OR is uninitialized (zero) */
  #define queue_IsOnQueue(i) (_RXQ(i)->next != 0)
  
+/* Returns true if the item was removed from a queue OR is uninitialized (zero) */
+/* Returns false if the queue item is currently in a queue */
+#define queue_IsNotOnQueue(i) (_RXQ(i)->next == 0)
+
  /* Returns true if the queue item (i) is the first element of the queue (q) */
  #define queue_IsFirst(q,i) (_RXQ(q)->first == _RXQ(i))
  
@@ -164,6 +181,9 @@ for (n=0, queue_Scan(&myqueue, qe, nqe, myelement), n++) {}
  /* Returns true if the queue item (i) is the end of the queue (q), that is, i is the head of the queue */
  #define queue_IsEnd(q,i) (_RXQ(q) == _RXQ(i))
  
+/* Returns false if the queue item (i) is the end of the queue (q), that is, i is the head of the queue */
+#define queue_IsNotEnd(q,i) (_RXQ(q) != _RXQ(i))
+
  /* Prototypical loop to scan an entire queue forwards.  q is the queue
   * head, qe is the loop variable, next is a variable used to store the
   * queue entry for the next iteration of the loop, s is the user's
@@ -180,12 +200,24 @@ for (n=0, queue_Scan(&myqueue, qe, nqe, myelement), n++) {}
         !queue_IsEnd(q, qe);                            \
         (qe) = (next), next = queue_Next(qe, s)
  
+/* similar to queue_Scan except start at element 'start' instead of the beginning */
+#define        queue_ScanFrom(q, start, qe, next, s)      \
+    (qe) = (struct s*)(start), next = queue_Next(qe, s);  \
+       !queue_IsEnd(q, qe);                               \
+       (qe) = (next), next = queue_Next(qe, s)
+
  /* This is similar to queue_Scan, but scans from the end of the queue to the beginning.  Next is the previous queue entry.  */
  #define        queue_ScanBackwards(q, qe, prev, s)             \
      (qe) = queue_Last(q, s), prev = queue_Prev(qe, s); \
         !queue_IsEnd(q, qe);                            \
         (qe) = prev, prev = queue_Prev(qe, s)
  
+/* This is similar to queue_ScanBackwards, but starts at element 'start' instead of the end.  prev holds the previous queue entry.  */
+#define        queue_ScanBackwardsFrom(q, start, qe, prev, s)  \
+    (qe) = (struct s*)(start), prev = queue_Prev(qe, s);       \
+       !queue_IsEnd(q, qe);                                    \
+       (qe) = prev, prev = queue_Prev(qe, s)
+
  #define queue_Count(q, qe, nqe, s, n)                  \
      for (n=0, queue_Scan(q, qe, nqe, s), n++) {}
  #endif /* _RX_QUEUE_ */
diff --git a/src/tsalvaged/Makefile.in b/src/tsalvaged/Makefile.in

new file mode 100644 (file)

index 0000000..1f4ccc6
--- /dev/null
+++ b/src/tsalvaged/Makefile.in
@@ -0,0 +1,200 @@
+# Copyright 2000, International Business Machines Corporation and others.
+# All Rights Reserved.
+# 
+# This software has been released under the terms of the IBM Public
+# License.  For details, see the LICENSE file in the top-level source
+# directory or online at http://www.openafs.org/dl/license10.html
+#
+# Portions Copyright (c) 2003 Apple Computer, Inc.
+# Portions Copyright (c) 2006 Sine Nomine Associates
+
+srcdir=@srcdir@
+include @TOP_OBJDIR@/src/config/Makefile.config
+
+CC=${MT_CC}
+CFLAGS=${COMMON_CFLAGS} -I.. -DNINTERFACE ${MT_CFLAGS} -DRXDEBUG -DFSSYNC_BUILD_CLIENT \
+       -DSALVSYNC_BUILD_SERVER -DSALVSYNC_BUILD_CLIENT
+
+CCRULE=${CC} ${CFLAGS} -c $?
+
+VICED=../viced
+VLSERVER=../vlserver
+LWP=../lwp
+LIBACL=../libacl
+UTIL=../util
+DIR=../dir
+VOL=../vol
+FSINT=../fsint
+
+SALVAGEDOBJS=salvaged.o vol-salvage.o physio.o
+
+DIROBJS=buffer.o dir.o salvage.o
+
+LWPOBJS=lock.o threadname.o
+
+UTILOBJS=assert.o uuid.o serverLog.o fileutil.o netutils.o dirpath.o volparse.o flipbase64.o softsig.o fstab.o
+
+VLIBOBJS=vnode.o volume.o vutil.o partition.o fssync-client.o \
+        clone.o nuke.o devname.o listinodes.o ihandle.o \
+        namei_ops.o salvsync-server.o salvsync-client.o \
+        daemon_com.o
+
+OBJECTS= ${SALVAGEDOBJS} ${UTILOBJS} ${VLIBOBJS} ${DIROBJS} ${LWPOBJS}
+
+FSSDEBUG_OBJS = fssync-debug.o physio.o common.o ${UTILOBJS} ${VLIBOBJS} ${DIROBJS} ${LWPOBJS}
+
+SSSDEBUG_OBJS = salvsync-debug.o physio.o common.o ${UTILOBJS} ${VLIBOBJS} ${DIROBJS} ${LWPOBJS}
+
+LIBS=${TOP_LIBDIR}/libafsauthent.a ${TOP_LIBDIR}/libafsrpc.a ${TOP_LIBDIR}/util.a ${TOP_LIBDIR}/libcmd.a
+
+INSTALL_TARGS = ${DESTDIR}${afssrvlibexecdir}/salvageserver \
+               ${DESTDIR}${afssrvsbindir}/fssync-debug \
+               ${DESTDIR}${afssrvsbindir}/salvsync-debug
+
+DEST_TARGS =   ${DEST}/root.server/usr/afs/bin/salvageserver \
+               ${DEST}/root.server/usr/afs/bin/fssync-debug \
+               ${DEST}/root.server/usr/afs/bin/salvsync-debug
+
+all: salvageserver fssync-debug salvsync-debug
+
+salvaged.o: ${VOL}/salvaged.c
+       ${CCRULE}
+
+vol-salvage.o: ${VOL}/vol-salvage.c
+       ${CCRULE}
+
+physio.o: ${VOL}/physio.c
+       ${CCRULE}
+
+fssync-debug.o: ${VOL}/fssync-debug.c
+       ${CCRULE}
+
+salvsync-debug.o: salvsync-debug.c
+       ${CCRULE}
+
+assert.o: ${UTIL}/assert.c
+       ${CCRULE}
+
+uuid.o: ${UTIL}/uuid.c
+       ${CCRULE}
+
+serverLog.o: ${UTIL}/serverLog.c
+       ${CCRULE}
+
+fileutil.o: ${UTIL}/fileutil.c
+       ${CCRULE}
+
+volparse.o: ${UTIL}/volparse.c
+       ${CCRULE}
+
+flipbase64.o: ${UTIL}/flipbase64.c
+       ${CCRULE}
+
+netutils.o: ${UTIL}/netutils.c
+       ${CCRULE}
+
+dirpath.o: ${UTIL}/dirpath.c
+       ${CCRULE}
+
+softsig.o: ${UTIL}/softsig.c
+       ${CCRULE}
+
+buffer.o: ${DIR}/buffer.c
+       ${CCRULE}
+
+dir.o: ${DIR}/dir.c
+       ${CCRULE}
+
+salvage.o: ${DIR}/salvage.c
+       ${CCRULE}
+
+lock.o: ${LWP}/lock.c
+       ${CCRULE}
+
+threadname.o: ${LWP}/threadname.c
+       ${CCRULE}
+
+vnode.o: ${VOL}/vnode.c
+       ${CCRULE}
+
+volume.o: ${VOL}/volume.c
+       ${CCRULE}
+
+vutil.o: ${VOL}/vutil.c
+       ${CCRULE}
+
+partition.o: ${VOL}/partition.c
+       ${CCRULE}
+
+fssync-client.o: ${VOL}/fssync-client.c
+       ${CCRULE}
+
+salvsync-server.o: ${VOL}/salvsync-server.c
+       ${CCRULE}
+
+salvsync-client.o: ${VOL}/salvsync-client.c
+       ${CCRULE}
+
+daemon_com.o: ${VOL}/daemon_com.c
+       ${CCRULE}
+
+clone.o: ${VOL}/clone.c
+       ${CCRULE}
+
+nuke.o: ${VOL}/nuke.c
+       ${CCRULE}
+
+devname.o: ${VOL}/devname.c
+       ${CCRULE}
+
+# only for darwin?
+fstab.o: ${UTIL}/fstab.c
+       ${CCRULE}
+
+common.o: ${VOL}/common.c
+       ${CCRULE}
+
+listinodes.o: ${VOL}/listinodes.c
+       ${CCRULE}
+
+ihandle.o: ${VOL}/ihandle.c
+       ${CCRULE}
+
+namei_ops.o: ${VOL}/namei_ops.c
+       ${CCRULE}
+
+salvageserver: ${OBJECTS} ${LIBS}
+       ${CC} ${LDFLAGS} -o salvageserver ${OBJECTS} ${LIBS} ${MT_LIBS} ${XLIBS}
+
+fssync-debug: ${FSSDEBUG_OBJS} ${LIBS}
+       ${CC} ${LDFLAGS} -o fssync-debug ${FSSDEBUG_OBJS} ${LIBS} ${MT_LIBS} ${XLIBS}
+
+salvsync-debug: ${SSSDEBUG_OBJS} ${LIBS}
+       ${CC} ${LDFLAGS} -o salvsync-debug ${SSSDEBUG_OBJS} ${LIBS} ${MT_LIBS} ${XLIBS}
+
+${DEST}/root.server/usr/afs/bin/salvageserver: salvageserver
+       ${INSTALL} -ns $? $@
+
+${DEST}/root.server/usr/afs/bin/fssync-debug: fssync-debug
+       ${INSTALL} -s $? $@
+
+${DEST}/root.server/usr/afs/bin/salvsync-debug: salvsync-debug
+       ${INSTALL} -s $? $@
+
+install: ${INSTALL_TARGS}
+
+clean:
+       $(RM) -f *.o salvageserver fssync-debug salvsync-debug core AFS_component_version_number.c
+
+include ../config/Makefile.version
+
+${DESTDIR}${afssrvlibexecdir}/salvageserver: salvageserver
+       ${INSTALL} -ns $? $@
+
+${DESTDIR}${afssrvsbindir}/fssync-debug: fssync-debug
+       ${INSTALL} -s $? $@
+
+${DESTDIR}${afssrvsbindir}/salvsync-debug: salvsync-debug
+       ${INSTALL} -s $? $@
+
+dest: ${DEST_TARGS}
diff --git a/src/tsalvaged/salvsync-debug.c b/src/tsalvaged/salvsync-debug.c

new file mode 100644 (file)

index 0000000..4d4949a
--- /dev/null
+++ b/src/tsalvaged/salvsync-debug.c
@@ -0,0 +1,475 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/* Main program file. Define globals. */
+#define MAIN 1
+
+/*
+ * salvsync debug tool
+ */
+
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <dirent.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <errno.h>
+#ifdef AFS_NT40_ENV
+#include <io.h>
+#include <WINNT/afsevent.h>
+#else
+#include <sys/param.h>
+#include <sys/file.h>
+#ifndef ITIMER_REAL
+#include <sys/time.h>
+#endif /* ITIMER_REAL */
+#endif
+#include <rx/xdr.h>
+#include <afs/afsint.h>
+#include <afs/assert.h>
+
+
+#include <fcntl.h>
+
+#ifndef AFS_NT40_ENV
+#include <afs/osi_inode.h>
+#endif
+
+#include <afs/cmd.h>
+#include <afs/afsutil.h>
+#include <afs/fileutil.h>
+
+#include "nfs.h"
+#include "lwp.h"
+#include "lock.h"
+#include "ihandle.h"
+#include "vnode.h"
+#include "volume.h"
+#include "partition.h"
+#include "daemon_com.h"
+#include "salvsync.h"
+#ifdef AFS_NT40_ENV
+#include <pthread.h>
+#endif
+
+int VolumeChanged; /* hack to make dir package happy */
+
+
+#ifndef AFS_DEMAND_ATTACH_FS
+/*
+ * Fallback entry point for builds without AFS_DEMAND_ATTACH_FS:
+ * report that the tool is unavailable and return failure.
+ */
+int
+main(int argc, char ** argv)
+{
+    static const char unsupported[] =
+       "*** salvsync-debug is only supported for OpenAFS builds with the demand-attach fileserver extension\n";
+
+    fputs(unsupported, stderr);
+    return -1;
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+
+/* arguments of one salvage request, filled in by common_salv_prolog() */
+struct salv_state {
+    afs_uint32 prio;           /* from -priority; 0 when the option is omitted */
+    afs_uint32 volume;         /* from -volumeid */
+    char partName[16];         /* from -partition; all zero bytes when omitted */
+};
+
+/* per-invocation state handed from the subcommand handlers to do_salvop() */
+struct state {
+    afs_int32 reason;          /* from -reason; passed to SALVSYNC_SalvageVolume() */
+    struct salv_state * sop;   /* allocated by common_salv_prolog() */
+};
+
+static int common_prolog(struct cmd_syndesc *, struct state *);
+static int common_salv_prolog(struct cmd_syndesc *, struct state *);
+
+static int do_salvop(struct state *, afs_int32 command, SYNC_response * res);
+
+static char * response_code_to_string(afs_int32);
+static char * command_code_to_string(afs_int32);
+static char * reason_code_to_string(afs_int32);
+static char * program_type_to_string(afs_int32);
+static char * state_code_to_string(afs_int32);
+
+
+static int OpStats(struct cmd_syndesc * as, char * rock);
+static int OpSalvage(struct cmd_syndesc * as, char * rock);
+static int OpCancel(struct cmd_syndesc * as, char * rock);
+static int OpCancelAll(struct cmd_syndesc * as, char * rock);
+static int OpRaisePrio(struct cmd_syndesc * as, char * rock);
+static int OpQuery(struct cmd_syndesc * as, char * rock);
+
+
+#ifndef AFS_NT40_ENV
+#include "AFS_component_version_number.c"
+#endif
+#define MAX_ARGS 128
+
+/*
+ * Options shared by every subcommand (-reason, -programtype) are registered
+ * starting at a fixed parameter slot, so common_prolog() can read them back
+ * as as->parms[COMMON_PARMS_OFFSET] and as->parms[COMMON_PARMS_OFFSET+1].
+ *
+ * NOTE: the macro expands to several statements; use it only as a
+ * stand-alone statement, never as the body of an unbraced if/else.
+ */
+#define COMMON_PARMS_OFFSET    13
+#define COMMON_PARMS(ts) \
+    cmd_Seek(ts, COMMON_PARMS_OFFSET); \
+    cmd_AddParm(ts, "-reason", CMD_SINGLE, CMD_OPTIONAL, "sync protocol reason code"); \
+    cmd_AddParm(ts, "-programtype", CMD_SINGLE, CMD_OPTIONAL, "program type code")
+
+/*
+ * Options describing the salvage target (-volumeid, -partition, -priority),
+ * registered at slots COMMON_SALV_PARMS_OFFSET .. COMMON_SALV_PARMS_OFFSET+2
+ * and read back by common_salv_prolog().  Same multi-statement caveat as
+ * COMMON_PARMS.
+ */
+#define COMMON_SALV_PARMS_OFFSET    10
+#define COMMON_SALV_PARMS(ts) \
+    cmd_Seek(ts, COMMON_SALV_PARMS_OFFSET); \
+    cmd_AddParm(ts, "-volumeid", CMD_SINGLE, 0, "volume id"); \
+    cmd_AddParm(ts, "-partition", CMD_SINGLE, CMD_OPTIONAL, "partition name"); \
+    cmd_AddParm(ts, "-priority", CMD_SINGLE, CMD_OPTIONAL, "priority")
+
+/* full option set for subcommands that operate on one volume */
+#define SALV_PARMS_DECL(ts) \
+    COMMON_SALV_PARMS(ts); \
+    COMMON_PARMS(ts)
+
+/* option set for subcommands that take no volume arguments */
+#define COMMON_PARMS_DECL(ts) \
+    COMMON_PARMS(ts)
+
+/*
+ * Entry point: register the salvsync-debug subcommands with the cmd
+ * package and dispatch on the command line.
+ *
+ * Exits with status 2 when the AFS server directory paths cannot be
+ * initialized; otherwise exits with the value returned by cmd_Dispatch().
+ */
+int
+main(int argc, char **argv)
+{
+    struct cmd_syndesc *ts;
+    int err = 0;
+
+    /* Initialize directory paths */
+    if (!(initAFSDirPath() & AFSDIR_SERVER_PATHS_OK)) {
+#ifdef AFS_NT40_ENV
+       ReportErrorEventAlt(AFSEVT_SVR_NO_INSTALL_DIR, 0, argv[0], 0);
+#endif
+       fprintf(stderr, "%s: Unable to obtain AFS server directory.\n",
+               argv[0]);
+       exit(2);
+    }
+
+
+    /* subcommands without a volume argument take only the common options */
+    ts = cmd_CreateSyntax("stats", OpStats, 0, "get salvageserver statistics (SALVSYNC_NOP opcode)");
+    COMMON_PARMS_DECL(ts);
+    cmd_CreateAlias(ts, "nop");
+
+    /* per-volume subcommands additionally take -volumeid/-partition/-priority */
+    ts = cmd_CreateSyntax("salvage", OpSalvage, 0, "schedule a salvage (SALVSYNC_SALVAGE opcode)");
+    SALV_PARMS_DECL(ts);
+
+    ts = cmd_CreateSyntax("cancel", OpCancel, 0, "cancel a salvage (SALVSYNC_CANCEL opcode)");
+    SALV_PARMS_DECL(ts);
+
+    ts = cmd_CreateSyntax("raiseprio", OpRaisePrio, 0, "raise a salvage priority (SALVSYNC_RAISEPRIO opcode)");
+    SALV_PARMS_DECL(ts);
+    cmd_CreateAlias(ts, "rp");
+
+    ts = cmd_CreateSyntax("query", OpQuery, 0, "query salvage status (SALVSYNC_QUERY opcode)");
+    SALV_PARMS_DECL(ts);
+    cmd_CreateAlias(ts, "qry");
+
+    ts = cmd_CreateSyntax("kill", OpCancelAll, 0, "cancel all scheduled salvages (SALVSYNC_CANCELALL opcode)");
+    COMMON_PARMS_DECL(ts);
+
+    err = cmd_Dispatch(argc, argv);
+    exit(err);
+}
+
+/*
+ * Setup shared by every subcommand handler.
+ *
+ * Initializes the volume and dir packages, records the -reason and
+ * -programtype options in *state / programType, and connects to the
+ * salvageserver via VConnectSALV().
+ *
+ * as     parsed command line for the subcommand
+ * state  caller-owned state; callers pass it uninitialized, so every
+ *        field this function is responsible for is always written
+ *
+ * Always returns 0.
+ */
+static int
+common_prolog(struct cmd_syndesc * as, struct state * state)
+{
+    register struct cmd_item *ti;
+
+#ifdef AFS_NT40_ENV
+    if (afs_winsockInit() < 0) {
+       Exit(1);
+    }
+#endif
+
+    VInitVolumePackage(debugUtility, 1, 1,
+                      DONT_CONNECT_FS, 0);
+    DInit(1);
+
+    if ((ti = as->parms[COMMON_PARMS_OFFSET].items)) { /* -reason */
+       state->reason = atoi(ti->data);
+    } else {
+       /* without this the reason sent on the wire would be stack garbage */
+       state->reason = SALVSYNC_WHATEVER;
+    }
+    if ((ti = as->parms[COMMON_PARMS_OFFSET+1].items)) {       /* -programtype */
+       /* accept a symbolic name, or fall back to a raw numeric value */
+       if (!strcmp(ti->data, "fileServer")) {
+           programType = fileServer;
+       } else if (!strcmp(ti->data, "volumeUtility")) {
+           programType = volumeUtility;
+       } else if (!strcmp(ti->data, "salvager")) {
+           programType = salvager;
+       } else if (!strcmp(ti->data, "salvageServer")) {
+           programType = salvageServer;
+       } else {
+           programType = (ProgramType) atoi(ti->data);
+       }
+    }
+
+    VConnectSALV();
+
+    return 0;
+}
+
+/*
+ * Allocate state->sop and fill it from the per-volume options
+ * (-volumeid, -partition, -priority).
+ *
+ * A missing -volumeid is reported on stderr but is not treated as fatal;
+ * the volume id is then left at 0 (from calloc).  Omitted -partition and
+ * -priority default to an empty name and 0 respectively.
+ *
+ * Aborts via assert() if the allocation fails.  Always returns 0.
+ */
+static int
+common_salv_prolog(struct cmd_syndesc * as, struct state * state)
+{
+    register struct cmd_item *ti;
+
+    state->sop = (struct salv_state *) calloc(1, sizeof(struct salv_state));
+    assert(state->sop != NULL);
+
+    if ((ti = as->parms[COMMON_SALV_PARMS_OFFSET].items)) {    /* -volumeid */
+       /* volume ids are unsigned 32-bit; atoi() overflows above INT_MAX */
+       state->sop->volume = (afs_uint32) strtoul(ti->data, NULL, 10);
+    } else {
+       fprintf(stderr, "required argument -volumeid not given\n");
+    }
+
+    if ((ti = as->parms[COMMON_SALV_PARMS_OFFSET+1].items)) {  /* -partition */
+       strlcpy(state->sop->partName, ti->data, sizeof(state->sop->partName));
+    } else {
+       memset(state->sop->partName, 0, sizeof(state->sop->partName));
+    }
+
+    if ((ti = as->parms[COMMON_SALV_PARMS_OFFSET+2].items)) {  /* -priority */
+       state->sop->prio = (afs_uint32) strtoul(ti->data, NULL, 10);
+    } else {
+       state->sop->prio = 0;
+    }
+
+    return 0;
+}
+
+/*
+ * Send one SALVSYNC request for the volume described by state->sop,
+ * print the protocol result on stderr and the returned salvage state on
+ * stdout, then disconnect from the salvageserver.
+ *
+ * state    filled in by common_prolog() and common_salv_prolog()
+ * command  SALVSYNC_* opcode to send
+ * res      optional caller-supplied response buffer; when NULL a local
+ *          buffer with a SALVSYNC_response_hdr payload is used
+ *
+ * Returns the code returned by SALVSYNC_SalvageVolume().
+ */
+static int
+do_salvop(struct state * state, afs_int32 command, SYNC_response * res)
+{
+    afs_int32 code;
+    SALVSYNC_response_hdr hdr_l, *hdr;
+    SYNC_response res_l;
+
+    if (!res) {
+       /* zero the local buffers so a failed call cannot make us print
+        * uninitialized stack contents below */
+       memset(&res_l, 0, sizeof(res_l));
+       memset(&hdr_l, 0, sizeof(hdr_l));
+       res = &res_l;
+       res->payload.len = sizeof(hdr_l);
+       res->payload.buf = hdr = &hdr_l;
+    } else {
+       hdr = (SALVSYNC_response_hdr *) res->payload.buf;
+    }
+
+    fprintf(stderr, "calling SALVSYNC_SalvageVolume with command code %d (%s)\n", 
+           command, command_code_to_string(command));
+
+    code = SALVSYNC_SalvageVolume(state->sop->volume,
+                                 state->sop->partName,
+                                 command,
+                                 state->reason,
+                                 state->sop->prio,
+                                 res);
+
+    switch (code) {
+    case SYNC_OK:
+    case SYNC_DENIED:
+       break;
+    default:
+       fprintf(stderr, "possible sync protocol error. return code was %d\n", code);
+    }
+
+    fprintf(stderr, "SALVSYNC_SalvageVolume returned %d (%s)\n", code, response_code_to_string(code));
+    fprintf(stderr, "protocol response code was %d (%s)\n", 
+           res->hdr.response, response_code_to_string(res->hdr.response));
+    fprintf(stderr, "protocol reason code was %d (%s)\n", 
+           res->hdr.reason, reason_code_to_string(res->hdr.reason));
+
+    printf("state = {\n");
+    /* a caller-supplied response may carry no payload buffer */
+    if (hdr != NULL) {
+       if (res->hdr.flags & SALVSYNC_FLAG_VOL_STATS_VALID) {
+           printf("\tstate = %d (%s)\n",
+                  hdr->state, state_code_to_string(hdr->state));
+           printf("\tprio = %d\n", hdr->prio);
+       }
+       printf("\tsq_len = %d\n", hdr->sq_len);
+       printf("\tpq_len = %d\n", hdr->pq_len);
+    }
+    printf("}\n");
+
+    VDisconnectSALV();
+
+    /* the function is declared int; falling off the end was undefined */
+    return code;
+}
+
+/*
+ * Map a SYNC_* response code to its symbolic name.
+ * Unrecognized codes yield "**UNKNOWN**".
+ */
+static char *
+response_code_to_string(afs_int32 response)
+{
+    static const struct {
+       afs_int32 code;
+       char * name;
+    } known[] = {
+       { SYNC_OK,          "SYNC_OK" },
+       { SYNC_DENIED,      "SYNC_DENIED" },
+       { SYNC_COM_ERROR,   "SYNC_COM_ERROR" },
+       { SYNC_BAD_COMMAND, "SYNC_BAD_COMMAND" },
+       { SYNC_FAILED,      "SYNC_FAILED" },
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+       if (known[i].code == response)
+           return known[i].name;
+    }
+    return "**UNKNOWN**";
+}
+
+/*
+ * Map a sync protocol command code to its symbolic name.
+ * Unrecognized codes yield "**UNKNOWN**".
+ */
+static char *
+command_code_to_string(afs_int32 command)
+{
+    switch (command) {
+    case SYNC_COM_CHANNEL_CLOSE:
+       return "SYNC_COM_CHANNEL_CLOSE";
+    case SALVSYNC_NOP:
+       return "SALVSYNC_NOP";
+    case SALVSYNC_SALVAGE:
+       return "SALVSYNC_SALVAGE";
+    case SALVSYNC_CANCEL:
+       return "SALVSYNC_CANCEL";
+    case SALVSYNC_RAISEPRIO:
+       return "SALVSYNC_RAISEPRIO";
+    case SALVSYNC_QUERY:
+       return "SALVSYNC_QUERY";
+    case SALVSYNC_CANCELALL:
+       /* printed name must match the opcode's real spelling */
+       return "SALVSYNC_CANCELALL";
+    default:
+       return "**UNKNOWN**";
+    }
+}
+
+/*
+ * Map a SALVSYNC reason code to its symbolic name.
+ * Unrecognized codes yield "**UNKNOWN**".
+ */
+static char *
+reason_code_to_string(afs_int32 reason)
+{
+    if (reason == SALVSYNC_WHATEVER)
+       return "SALVSYNC_WHATEVER";
+    if (reason == SALVSYNC_ERROR)
+       return "SALVSYNC_ERROR";
+    if (reason == SALVSYNC_OPERATOR)
+       return "SALVSYNC_OPERATOR";
+    if (reason == SALVSYNC_SHUTDOWN)
+       return "SALVSYNC_SHUTDOWN";
+    if (reason == SALVSYNC_NEEDED)
+       return "SALVSYNC_NEEDED";
+
+    return "**UNKNOWN**";
+}
+
+/*
+ * Map a ProgramType value to its symbolic name.
+ * Unrecognized values yield "**UNKNOWN**".
+ */
+static char *
+program_type_to_string(afs_int32 type)
+{
+    char * name = "**UNKNOWN**";
+
+    switch ((ProgramType)type) {
+    case fileServer:
+       name = "fileServer";
+       break;
+    case volumeUtility:
+       name = "volumeUtility";
+       break;
+    case salvager:
+       name = "salvager";
+       break;
+    case salvageServer:
+       name = "salvageServer";
+       break;
+    default:
+       break;
+    }
+
+    return name;
+}
+
+/*
+ * Map a SALVSYNC_STATE_* value to its symbolic name.
+ * Unrecognized values yield "**UNKNOWN**".
+ */
+static char *
+state_code_to_string(afs_int32 state)
+{
+    static const struct {
+       afs_int32 code;
+       char * name;
+    } known[] = {
+       { SALVSYNC_STATE_UNKNOWN,   "SALVSYNC_STATE_UNKNOWN" },
+       { SALVSYNC_STATE_QUEUED,    "SALVSYNC_STATE_QUEUED" },
+       { SALVSYNC_STATE_SALVAGING, "SALVSYNC_STATE_SALVAGING" },
+       { SALVSYNC_STATE_ERROR,     "SALVSYNC_STATE_ERROR" },
+       { SALVSYNC_STATE_DONE,      "SALVSYNC_STATE_DONE" },
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+       if (known[i].code == state)
+           return known[i].name;
+    }
+    return "**UNKNOWN**";
+}
+
+/*
+ * "stats" / "nop" subcommand: send SALVSYNC_NOP and print the reply.
+ * Always returns 0.
+ */
+static int
+OpStats(struct cmd_syndesc * as, char * rock)
+{
+    struct state st;
+
+    common_prolog(as, &st);
+    common_salv_prolog(as, &st);
+    do_salvop(&st, SALVSYNC_NOP, NULL);
+
+    return 0;
+}
+
+/*
+ * "salvage" subcommand: send SALVSYNC_SALVAGE for the given volume and
+ * print the reply.  Always returns 0.
+ */
+static int
+OpSalvage(struct cmd_syndesc * as, char * rock)
+{
+    struct state st;
+
+    common_prolog(as, &st);
+    common_salv_prolog(as, &st);
+    do_salvop(&st, SALVSYNC_SALVAGE, NULL);
+
+    return 0;
+}
+
+/*
+ * "cancel" subcommand: send SALVSYNC_CANCEL for the given volume and
+ * print the reply.  Always returns 0.
+ */
+static int
+OpCancel(struct cmd_syndesc * as, char * rock)
+{
+    struct state st;
+
+    common_prolog(as, &st);
+    common_salv_prolog(as, &st);
+    do_salvop(&st, SALVSYNC_CANCEL, NULL);
+
+    return 0;
+}
+
+/*
+ * "kill" subcommand: send SALVSYNC_CANCELALL and print the reply.
+ * Always returns 0.
+ */
+static int
+OpCancelAll(struct cmd_syndesc * as, char * rock)
+{
+    struct state st;
+
+    common_prolog(as, &st);
+    common_salv_prolog(as, &st);
+    do_salvop(&st, SALVSYNC_CANCELALL, NULL);
+
+    return 0;
+}
+
+/*
+ * "raiseprio" / "rp" subcommand: send SALVSYNC_RAISEPRIO for the given
+ * volume and print the reply.  Always returns 0.
+ */
+static int
+OpRaisePrio(struct cmd_syndesc * as, char * rock)
+{
+    struct state st;
+
+    common_prolog(as, &st);
+    common_salv_prolog(as, &st);
+    do_salvop(&st, SALVSYNC_RAISEPRIO, NULL);
+
+    return 0;
+}
+
+/*
+ * "query" / "qry" subcommand: send SALVSYNC_QUERY for the given volume
+ * and print the reply.  Always returns 0.
+ */
+static int
+OpQuery(struct cmd_syndesc * as, char * rock)
+{
+    struct state st;
+
+    common_prolog(as, &st);
+    common_salv_prolog(as, &st);
+    do_salvop(&st, SALVSYNC_QUERY, NULL);
+
+    return 0;
+}
+
+#endif /* AFS_DEMAND_ATTACH_FS */
diff --git a/src/tviced/Makefile.in b/src/tviced/Makefile.in

index b10e1a4ca883a17985d38ddb920ce84d77218358..68363fc543156954cefdcd2dc179bf440f6ca4d4 100644 (file)
--- a/src/tviced/Makefile.in
+++ b/src/tviced/Makefile.in
@@ -11,7 +11,7 @@ srcdir=@srcdir@
  include @TOP_OBJDIR@/src/config/Makefile.config
  
  CC=${MT_CC}
-CFLAGS=${COMMON_CFLAGS} -I.. -DNINTERFACE ${MT_CFLAGS} -DRXDEBUG
+CFLAGS=${COMMON_CFLAGS} -I.. -DNINTERFACE ${MT_CFLAGS} -DRXDEBUG -DFSSYNC_BUILD_SERVER -DSALVSYNC_BUILD_CLIENT
  
  CCRULE=${CC} ${CFLAGS} -c $?
  
@@ -24,7 +24,7 @@ DIR=../dir
  VOL=../vol
  FSINT=../fsint
  
-VICEDOBJS=viced.o afsfileprocs.o host.o physio.o callback.o    
+VICEDOBJS=viced.o afsfileprocs.o host.o physio.o callback.o serialize_state.o  
  
  VLSERVEROBJS=vldbint.cs.o vldbint.xdr.o
  
@@ -36,18 +36,20 @@ UTILOBJS=assert.o uuid.o serverLog.o fileutil.o netutils.o dirpath.o volparse.o
  
  DIROBJS=buffer.o dir.o salvage.o
  
-VOLOBJS= vnode.o volume.o vutil.o partition.o fssync.o purge.o \
+VOLOBJS= vnode.o volume.o vutil.o partition.o fssync-server.o \
          clone.o devname.o common.o ihandle.o listinodes.o namei_ops.o \
-        fstab.o
+        fstab.o salvsync-client.o daemon_com.o
  
  FSINTOBJS= afsaux.o afscbint.cs.o afsint.ss.o afsint.xdr.o
  
  objects= ${VICEDOBJS} ${VLSERVEROBJS} ${LWPOBJS} ${LIBACLOBJS} \
          ${UTILOBJS} ${DIROBJS} ${VOLOBJS} ${FSINTOBJS}
  
+SDBGOBJS = state_analyzer.o uuid.o dirpath.o fileutil.o ${TOP_LIBDIR}/util.a
+
  LIBS=${TOP_LIBDIR}/libafsauthent.a ${TOP_LIBDIR}/libafsrpc.a ${TOP_LIBDIR}/util.a
  
-all: fileserver
+all: fileserver state_analyzer
  
  viced.o: ${VICED}/viced.c
         ${CCRULE}
@@ -64,6 +66,9 @@ physio.o: ${VICED}/physio.c
  callback.o: ${VICED}/callback.c
         ${CCRULE}
  
+serialize_state.o: ./serialize_state.c
+       ${CCRULE}
+
  assert.o: ${UTIL}/assert.c
         ${CCRULE}
  
@@ -130,10 +135,16 @@ vutil.o: ${VOL}/vutil.c
  partition.o: ${VOL}/partition.c
         ${CCRULE}
  
-fssync.o: ${VOL}/fssync.c
+fssync-server.o: ${VOL}/fssync-server.c
+       ${CCRULE}
+
+fssync-client.o: ${VOL}/fssync-client.c
+       ${CCRULE}
+
+salvsync-client.o: ${VOL}/salvsync-client.c
         ${CCRULE}
  
-purge.o: ${VOL}/purge.c
+daemon_com.o: ${VOL}/daemon_com.c
         ${CCRULE}
  
  clone.o: ${VOL}/clone.c
@@ -179,21 +190,33 @@ afsint.ss.o: ${FSINT}/afsint.ss.c
  afsint.xdr.o: ${FSINT}/afsint.xdr.c
         ${CCRULE}
  
+state_analyzer.o: state_analyzer.c
+       ${CCRULE}
+
  fileserver: ${objects} ${LIBS}
         ${CC} ${LDFLAGS} -o fileserver ${objects} ${LIBS} ${MT_LIBS} ${XLIBS}
  
+state_analyzer: ${SDBGOBJS}
+       ${CC} ${LDFLAGS} -o state_analyzer ${SDBGOBJS} ${MT_LIBS} ${XLIBS}
+
  ${DEST}/root.server/usr/afs/bin/fileserver: fileserver
         ${INSTALL} -ns $? $@
  
-install: ${DESTDIR}${afssrvlibexecdir}/fileserver
+${DEST}/root.server/usr/afs/bin/state_analyzer: state_analyzer
+       ${INSTALL} $? $@
+
+install: ${DESTDIR}${afssrvlibexecdir}/fileserver ${DESTDIR}${afssrvsbindir}/state_analyzer
  
  clean:
-       $(RM) -f *.o fileserver core AFS_component_version_number.c
+       $(RM) -f *.o fileserver state_analyzer core AFS_component_version_number.c
  
  include ../config/Makefile.version
  
  ${DESTDIR}${afssrvlibexecdir}/fileserver: fileserver
         ${INSTALL} -ns $? $@
  
-dest: ${DEST}/root.server/usr/afs/bin/fileserver
+${DESTDIR}${afssrvsbindir}/state_analyzer: state_analyzer
+       ${INSTALL} $? $@
+
+dest: ${DEST}/root.server/usr/afs/bin/fileserver ${DEST}/root.server/usr/afs/bin/state_analyzer
  
diff --git a/src/tviced/NTMakefile b/src/tviced/NTMakefile

index e9e2c270e9e438daf71ae05a7199a1860bff717c..e58c5cc226a72410e3791fb6fc256c9e1796eec5 100644 (file)
--- a/src/tviced/NTMakefile
+++ b/src/tviced/NTMakefile
@@ -5,7 +5,7 @@
  # License.  For details, see the LICENSE file in the top-level source
  # directory or online at http://www.openafs.org/dl/license10.html
  
-AFSDEV_AUXCDEFINES = -DAFS_PTHREAD_ENV -DRXDEBUG
+AFSDEV_AUXCDEFINES = -DAFS_PTHREAD_ENV -DRXDEBUG -DFSSYNC_BUILD_SERVER
  
  RELDIR=tviced
  !INCLUDE ..\config\NTMakefile.$(SYS_NAME)
diff --git a/src/tviced/serialize_state.c b/src/tviced/serialize_state.c

new file mode 100644 (file)

index 0000000..c1b4583
--- /dev/null
+++ b/src/tviced/serialize_state.c
@@ -0,0 +1,1120 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ * demand attach fs
+ * fileserver state serialization
+ */
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <stdio.h>
+#include <stdlib.h>            /* for malloc() */
+#include <time.h>              /* ANSI standard location for time stuff */
+#ifdef AFS_NT40_ENV
+#include <fcntl.h>
+#include <io.h>
+#else
+#include <sys/time.h>
+#include <sys/file.h>
+#endif
+#ifdef HAVE_STRING_H
+#include <string.h>
+#else
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#endif
+#include <afs/assert.h>
+#include <sys/stat.h>
+
+#include <afs/stds.h>
+
+#include <rx/xdr.h>
+#include <lwp.h>
+#include <lock.h>
+#include <afs/afsint.h>
+#include <afs/rxgen_consts.h>
+#include <afs/nfs.h>
+#include <afs/errors.h>
+#include <afs/ihandle.h>
+#include <afs/vnode.h>
+#include <afs/volume.h>
+#include <afs/acl.h>
+#include <afs/ptclient.h>
+#include <afs/prs_fs.h>
+#include <afs/auth.h>
+#include <afs/afsutil.h>
+#include <rx/rx.h>
+#include <afs/cellconfig.h>
+#include <stdlib.h>
+
+#include "../viced/viced_prototypes.h"
+#include "../viced/viced.h"
+#include "../viced/host.h"
+#include "../viced/callback.h"
+#include "serialize_state.h"
+
+/*@+fcnmacros +macrofcndecl@*/
+#ifdef O_LARGEFILE
+#ifdef S_SPLINT_S
+extern off64_t afs_lseek(int FD, off64_t O, int F);
+#endif /*S_SPLINT_S */
+#define afs_lseek(FD, O, F)    lseek64(FD, (off64_t)(O), F)
+#define afs_stat               stat64
+#define afs_fstat              fstat64
+#define afs_open               open64
+#define afs_fopen              fopen64
+#define afs_ftruncate           ftruncate64
+#define afs_mmap                mmap64
+#ifdef AFS_AIX_ENV
+extern void * mmap64();  /* ugly hack since aix build env appears to be somewhat broken */
+#endif
+#else /* !O_LARGEFILE */
+#ifdef S_SPLINT_S
+extern off_t afs_lseek(int FD, off_t O, int F);
+#endif /*S_SPLINT_S */
+#define afs_lseek(FD, O, F)    lseek(FD, (off_t)(O), F)
+#define afs_stat               stat
+#define afs_fstat              fstat
+#define afs_open               open
+#define afs_fopen              fopen
+#define afs_ftruncate           ftruncate
+#define afs_mmap                mmap
+#endif /* !O_LARGEFILE */
+/*@=fcnmacros =macrofcndecl@*/
+
+
+#ifdef AFS_DEMAND_ATTACH_FS
+
+/*
+ * demand attach fs
+ * state dump routines
+ *
+ * in order to make state dump/restore as fast as possible,
+ * we use memory mapped files
+ *
+ * if this causes problems on certain platforms, the APIs
+ * have been written so that it will be very simple to go
+ * back to standard I/O for just those poorly written platforms
+ */
+#define FS_STATE_USE_MMAP
+
+
+#ifdef FS_STATE_USE_MMAP
+#define FS_STATE_INIT_FILESIZE (8 * 1024 * 1024)  /* truncate to 8MB initially */
+#include <sys/mman.h>
+#endif
+
+static int fs_stateCreateDump(struct fs_dump_state * state);
+static int fs_stateLoadDump(struct fs_dump_state * state);
+static int fs_stateInvalidateDump(struct fs_dump_state * state);
+static int fs_stateCommitDump(struct fs_dump_state * state);
+static int fs_stateCloseDump(struct fs_dump_state * state);
+
+#ifdef FS_STATE_USE_MMAP
+static int fs_stateSizeFile(struct fs_dump_state * state);
+static int fs_stateResizeFile(struct fs_dump_state * state, size_t min_add);
+static int fs_stateTruncateFile(struct fs_dump_state * state);
+
+static int fs_stateMapFile(struct fs_dump_state * state);
+static int fs_stateUnmapFile(struct fs_dump_state * state);
+
+static int fs_stateIncCursor(struct fs_dump_state * state, size_t len);
+static int fs_stateCheckIOSafety(struct fs_dump_state * state,
+                                size_t len);
+#endif
+
+static int fs_stateFillHeader(struct fs_state_header * hdr);
+static int fs_stateCheckHeader(struct fs_state_header * hdr);
+
+static int fs_stateAlloc(struct fs_dump_state * state);
+static int fs_stateFree(struct fs_dump_state * state);
+
+extern afsUUID FS_HostUUID;
+extern char cml_version_number[];
+
+/*
+ * demand attach fs
+ * save all fileserver state
+ *
+ * Holds H_LOCK for the whole operation.  Optionally verifies the host and
+ * callback tables first; a verification failure does not stop the dump,
+ * but the dump is then committed with its header marked invalid.
+ *
+ * Returns 0 on success, 1 if any step failed (including a failed
+ * pre-save verification, even when the dump itself was written).
+ */
+int
+fs_stateSave(void)
+{
+    int ret = 0, verified = 1;
+    struct fs_dump_state state;
+
+    /* save and restore need to be atomic wrt other host package operations */
+    H_LOCK; 
+
+    ViceLog(0, ("fs_stateSave: commencing fileserver state dump\n"));
+
+    if (fs_stateAlloc(&state)) {
+       ViceLog(0, ("fs_stateSave: memory allocation failed; dump aborted\n"));
+       ret = 1;
+       goto done;
+    }
+
+    /* XXX
+     * on busy servers, these checks will inevitably fail since stuff drops H_LOCK
+     * all over the place (with structs left in inconsistent states) while RPCs to
+     * clients happen (grumble, grumble, the host package needs to be rewritten...)
+     *
+     * the current hack is to force the background threads that deal with host and
+     * callback state offline early in the shutdown process, do VShutdown, come
+     * back and wait for those threads to die, THEN do the state dump
+     *
+     * BUT, this still has one flaw -- what do we do about rx worker threads that
+     * are blocked in the host package making an RPC call to a cm???
+     *
+     * perhaps we need a refcounter that keeps track of threads blocked in rpc calls
+     * with H_LOCK dropped (and the host struct likely left in an inconsistent state)
+     *
+     * or better yet, we need to associate a state machine with each host object
+     * (kind of like demand attach Volume structures).
+     *
+     * sigh. I suspect we'll need to revisit this issue
+     */
+
+    if (fs_state.options.fs_state_verify_before_save) {
+       ViceLog(0, ("fs_stateSave: performing internal consistency checks before proceeding with state dump\n"));
+
+       /* both checks always run; either failure clears 'verified' */
+       if (h_stateVerify(&state)) {
+           ViceLog(0, ("fs_stateSave: error: host table consistency checks failed; state dump will not be marked clean\n"));
+           verified = 0;
+           ret = 1;
+       }
+
+       if (cb_stateVerify(&state)) {
+           ViceLog(0, ("fs_stateSave: error: callback table consistency checks failed; state dump will not be marked clean\n"));
+           verified = 0;
+           ret = 1;
+       }
+
+       /* if a consistency check asserted the bail flag, reset it */
+       state.bail = 0;
+
+       ViceLog(0, ("fs_stateSave: proceeding with dump\n"));
+    }
+
+    if (fs_stateCreateDump(&state)) {
+       ViceLog(0, ("fs_stateSave: error: dump create failed\n"));
+       ret = 1;
+       goto done;
+    }
+
+    if (h_stateSave(&state)) {
+       ViceLog(0, ("fs_stateSave: error: host state dump failed\n"));
+       ret = 1;
+       goto done;
+    }
+
+    if (cb_stateSave(&state)) {
+       ViceLog(0, ("fs_stateSave: error: callback state dump failed\n"));
+       ret = 1;
+       goto done;
+    }
+
+    /* fs_stateCommitDump() writes hdr->valid = 0 when bail is set */
+    if (!verified) {
+       state.bail = 1;
+    }
+
+    if (fs_stateCommitDump(&state)) {
+       ViceLog(0, ("fs_stateSave: error: dump commit failed\n"));
+       ret = 1; 
+       goto done;
+    }
+
+    if (verified) {
+       ViceLog(0, ("fs_stateSave: fileserver state dump completed successfully\n"));
+    } else {
+       ViceLog(0, ("fs_stateSave: fileserver state dump completed, but not marked clean.\n"));
+       ViceLog(0, ("fs_stateSave: please save a copy of '%s' for use by technical support\n",
+                   state.fn));
+    }
+
+ done:
+    /* NOTE(review): assumes fs_stateAlloc() leaves state.fd negative until a
+     * dump file is opened, including on its own failure path -- confirm */
+    if (state.fd >= 0)
+       fs_stateCloseDump(&state);
+    fs_stateFree(&state);
+    H_UNLOCK;
+    return ret;
+}
+
+/*
+ * demand attach fs
+ * restore all fileserver state
+ *
+ * this function must appear as one atomic operation to the host and callback
+ * packages, hence H_LOCK is held for the entirety of the process.
+ *
+ * The on-disk dump is invalidated as soon as it has been loaded.  Failure to
+ * load or invalidate the dump returns 1 with no state restored.  Once the
+ * in-memory tables have started to be rebuilt, any failure terminates the
+ * process rather than running with partially restored state.
+ *
+ * Returns 0 on success, 1 if the dump could not be loaded or invalidated.
+ */
+int
+fs_stateRestore(void)
+{
+    int ret = 0;
+    struct fs_dump_state state;
+
+    /* save and restore need to be atomic wrt other host package operations */
+    H_LOCK;
+
+    ViceLog(0, ("fs_stateRestore: commencing fileserver state restore\n"));
+
+    if (fs_stateAlloc(&state)) {
+       ViceLog(0, ("fs_stateRestore: memory allocation failed\n"));
+       ret = 1;
+       goto done;
+    }
+
+    if (fs_stateLoadDump(&state)) {
+       ViceLog(0, ("fs_stateRestore: failed to load dump file '%s'\n", state.fn));
+       ret = 1;
+       goto done;
+    }
+
+    if (fs_stateInvalidateDump(&state)) {
+       ViceLog(0, ("fs_stateRestore: failed to invalidate dump file '%s'\n", state.fn));
+       ret = 1;
+       goto done;
+    }
+
+
+    /* do_host_restore is only set by fs_stateLoadDump() for recent dumps.
+     * NOTE(review): the fatal paths below exit with status 0 -- confirm
+     * that this is what the process supervisor expects */
+    if (state.flags.do_host_restore) {
+       if (h_stateRestore(&state)) {
+           ViceLog(0, ("fs_stateRestore: error: host state restore failed. exiting to avoid further corruption\n"));
+           exit(0);
+       }
+       ViceLog(0, ("fs_stateRestore: host table restored\n"));
+
+       if (cb_stateRestore(&state)) {
+           ViceLog(0, ("fs_stateRestore: error: callback state restore failed. exiting to avoid further corruption\n"));
+           exit(0);
+       }
+       ViceLog(0, ("fs_stateRestore: FileEntry and CallBack tables restored\n"));
+
+       if (h_stateRestoreIndices(&state)) {
+           ViceLog(0, ("fs_stateRestore: error: host index remapping failed. exiting to avoid further corruption\n"));
+           exit(0);
+       }
+       ViceLog(0, ("fs_stateRestore: host table indices remapped\n"));
+
+       if (cb_stateRestoreIndices(&state)) {
+           ViceLog(0, ("fs_stateRestore: error: callback index remapping failed. exiting to avoid further corruption\n"));
+           exit(0);
+       }
+       ViceLog(0, ("fs_stateRestore: FileEntry and CallBack indices remapped\n"));
+    }
+
+    ViceLog(0, ("fs_stateRestore: restore phase complete\n"));
+
+    if (fs_state.options.fs_state_verify_after_restore) {
+       ViceLog(0, ("fs_stateRestore: beginning state verification phase\n"));
+
+       if (state.flags.do_host_restore) {
+           if (h_stateVerify(&state)) {
+               ViceLog(0, ("fs_stateRestore: error: host table consistency checks failed; exiting to avoid further corruption\n"));
+               exit(0);
+           }
+
+           if (cb_stateVerify(&state)) {
+               ViceLog(0, ("fs_stateRestore: error: callback table consistency checks failed; exiting to avoid further corruption\n"));
+               exit(0);
+           }
+       }
+
+       ViceLog(0, ("fs_stateRestore: fileserver state verification complete\n"));
+    }
+
+    ViceLog(0, ("fs_stateRestore: restore was successful\n"));
+
+ done:
+    if (state.fd >= 0) {
+       /* invalidate again on every exit path so the dump is never reused */
+       fs_stateInvalidateDump(&state);
+       fs_stateCloseDump(&state);
+    }
+    fs_stateFree(&state);
+    H_UNLOCK;
+    return ret;
+}
+
+static int
+fs_stateCreateDump(struct fs_dump_state * state)
+{
+    int fd, ret = 0;
+    char savedump[MAXPATHLEN];
+    struct afs_stat status;
+
+    afs_snprintf(savedump, sizeof(savedump), "%s.old", state->fn);
+
+    if (afs_stat(state->fn, &status) == 0) {
+       renamefile(state->fn, savedump);
+    }
+
+    if (((fd = afs_open(state->fn, 
+                       O_RDWR | O_CREAT | O_TRUNC, 
+                       S_IRUSR | S_IWUSR)) == -1) ||
+       (afs_fstat(fd, &status) == -1)) {
+       ViceLog(0, ("fs_stateCreateDump: failed to create state dump file '%s'\n",
+                   state->fn));
+       ret = 1;
+       goto done;
+    }
+
+    state->fd = fd;
+    state->mode = FS_STATE_DUMP_MODE;
+    memset(state->hdr, 0, sizeof(struct fs_state_header));
+    fs_stateIncEOF(state, sizeof(struct fs_state_header));
+
+#ifdef FS_STATE_USE_MMAP
+    if (fs_stateSizeFile(state)) {
+       ViceLog(0, ("fs_stateCreateDump: failed to resize state dump file '%s'\n",
+                   state->fn));
+       ret = 1;
+       goto done;
+    }
+
+    if (fs_stateMapFile(state)) {
+       ViceLog(0, ("fs_stateCreateDump: failed to memory map state dump file '%s'\n",
+                   state->fn));
+       ret = 1;
+       goto done;
+    }
+#endif
+
+    ret = fs_stateInvalidateDump(state);
+
+ done:
+    return ret;
+}
+
+/*
+ * Mark the on-disk dump as invalid: write a copy of the current header
+ * with valid = 0 at offset zero, then sync it to disk.
+ *
+ * Returns 0 on success, 1 on failure (in the mmap build also when the
+ * file is not currently mapped).
+ */
+static int
+fs_stateInvalidateDump(struct fs_dump_state * state)
+{
+    afs_uint64 offset;
+    struct fs_state_header bogus;
+
+#ifdef FS_STATE_USE_MMAP
+    if (state->mmap.map == NULL) {
+       return 1;
+    }
+#endif
+
+    ZeroInt64(offset);
+    memcpy(&bogus, state->hdr, sizeof(bogus));
+    bogus.valid = 0;
+
+    /* write a bogus header to flag dump in progress */
+    if (fs_stateWriteHeader(state, &offset, &bogus, sizeof(bogus))) {
+       ViceLog(0, ("fs_stateInvalidateDump: failed to invalidate old dump file header '%s'\n",
+                   state->fn));
+       return 1;
+    }
+
+    if (fs_stateSync(state)) {
+       ViceLog(0, ("fs_stateInvalidateDump: failed to sync changes to disk\n"));
+       return 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Finish a dump: trim the file (mmap build), flush the data, then build
+ * the real header, write it at offset zero and flush again.  When
+ * state->bail is set the header is written with valid = 0.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+static int
+fs_stateCommitDump(struct fs_dump_state * state)
+{
+    afs_uint64 offset;
+
+    ZeroInt64(offset);
+
+#ifdef FS_STATE_USE_MMAP
+    if (fs_stateTruncateFile(state)) {
+       ViceLog(0, ("fs_stateCommitDump: failed to truncate dump file to proper size\n"));
+       return 1;
+    }
+#endif
+
+    /* the data must reach disk _before_ the header that declares it valid */
+    if (fs_stateSync(state)) {
+       ViceLog(0, ("fs_stateCommitDump: failed to sync changes to disk\n"));
+       return 1;
+    }
+
+#ifdef FS_STATE_USE_MMAP
+    /* XXX madvise may not exist on all platforms, so
+     * we may need to add some ifdefs at some point... */
+    madvise((((char *)state->mmap.map) + sizeof(struct fs_state_header)), 
+           state->mmap.size - sizeof(struct fs_state_header), 
+           MADV_DONTNEED);
+#endif
+
+    /* build the header, and write it to disk */
+    fs_stateFillHeader(state->hdr);
+    if (state->bail) {
+       state->hdr->valid = 0;
+    }
+
+    if (fs_stateWriteHeader(state, &offset, state->hdr, sizeof(struct fs_state_header))) {
+       ViceLog(0, ("fs_stateCommitDump: failed to write header to dump file '%s'\n",
+                   state->fn));
+       return 1;
+    }
+
+    if (fs_stateSync(state)) {
+       ViceLog(0, ("fs_stateCommitDump: failed to sync new header to disk\n"));
+       return 1;
+    }
+
+    return 0;
+}
+
+static int
+fs_stateLoadDump(struct fs_dump_state * state)
+{
+    afs_uint64 z;
+    int fd, ret = 0;
+    struct afs_stat status;
+    afs_int32 now = FT_ApproxTime();
+
+    ZeroInt64(z);
+
+    if ((fd = afs_open(state->fn, O_RDWR)) == -1 ||
+       (afs_fstat(fd, &status) == -1)) {
+       ViceLog(0, ("fs_stateLoadDump: failed to load state dump file '%s'\n",
+                   state->fn));
+       ret = 1;
+       goto done;
+    }
+    state->fd = fd;
+    state->mode = FS_STATE_LOAD_MODE;
+    state->file_len = status.st_size;
+
+#ifdef FS_STATE_USE_MMAP
+    if (fs_stateMapFile(state)) {
+       ViceLog(0, ("fs_stateLoadDump: failed to memory map state dump file '%s'\n",
+                   state->fn));
+       ret = 1;
+       goto done;
+    }
+#endif
+
+    if (fs_stateReadHeader(state, &z, state->hdr, sizeof(struct fs_state_header))) {
+       ViceLog(0, ("fs_stateLoadDump: failed to read header from dump file '%s'\n",
+                   state->fn));
+       ret = 1;
+       goto done;
+    }
+
+    /* check the validity of the header */
+    if (fs_stateCheckHeader(state->hdr)) {
+       ViceLog(1, ("fs_stateLoadDump: header failed validity checks; not restoring '%s'\n",
+                   state->fn));
+       ret = 1;
+       goto done;
+    }
+
+    if ((state->hdr->timestamp + HOST_STATE_VALID_WINDOW) >= now) {
+       state->flags.do_host_restore = 1;
+    } else {
+       ViceLog(0, ("fs_stateLoadDump: warning: dump is too old for host and callback restore; skipping those steps\n"));
+    }
+
+ done:
+    return ret;
+}
+
+/*
+ * Release the OS resources behind a dump state: drop the memory mapping
+ * (when FS_STATE_USE_MMAP is defined) and close the file descriptor.
+ *
+ * fs_stateAlloc() initializes state->fd to -1, so a descriptor is only
+ * closed when one was actually opened, and it is reset to -1 afterwards
+ * so that a second call cannot close an unrelated descriptor.
+ *
+ * Always returns 0.
+ */
+static int
+fs_stateCloseDump(struct fs_dump_state * state)
+{
+#ifdef FS_STATE_USE_MMAP
+    fs_stateUnmapFile(state);
+#endif
+    if (state->fd != -1) {
+       close(state->fd);
+       state->fd = -1;
+    }
+    return 0;
+}
+
+/*
+ * Append len bytes from buf at the current position of the dump file.
+ *
+ * mmap build: the file is grown and remapped first if the write would
+ * run past the end of the mapping, then the bytes are copied in and the
+ * cursor is advanced.  Otherwise a single write() on state->fd is used
+ * and anything other than a complete write counts as failure.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+int
+fs_stateWrite(struct fs_dump_state * state,
+             void * buf, size_t len)
+{
+#ifdef FS_STATE_USE_MMAP
+    /* the resize is only attempted when the safety check trips */
+    if (fs_stateCheckIOSafety(state, len) &&
+       fs_stateResizeFile(state, len)) {
+       ViceLog(0, ("fs_stateWrite: could not resize dump file '%s'\n",
+                   state->fn));
+       return 1;
+    }
+
+    memcpy(state->mmap.cursor, buf, len);
+    fs_stateIncCursor(state, len);
+#else
+    if (write(state->fd, buf, len) != len) {
+       ViceLog(0, ("fs_stateWrite: write failed\n"));
+       return 1;
+    }
+#endif
+
+    return 0;
+}
+
+/*
+ * Read len bytes from the current position of the dump file into buf.
+ *
+ * mmap build: refuses to read past the end of the mapping, otherwise
+ * copies the bytes out and advances the cursor.  Otherwise a single
+ * read() on state->fd is used and a short read counts as failure.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+int
+fs_stateRead(struct fs_dump_state * state,
+            void * buf, size_t len)
+{
+#ifdef FS_STATE_USE_MMAP
+    if (fs_stateCheckIOSafety(state, len)) {
+       ViceLog(0, ("fs_stateRead: read beyond EOF for dump file '%s'\n",
+                   state->fn));
+       return 1;
+    }
+
+    memcpy(buf, state->mmap.cursor, len);
+    fs_stateIncCursor(state, len);
+#else
+    if (read(state->fd, buf, len) != len) {
+       ViceLog(0, ("fs_stateRead: read failed\n"));
+       return 1;
+    }
+#endif
+
+    return 0;
+}
+
+/*
+ * Gather-write niov buffers at the current position of the dump file.
+ *
+ * The total length is the sum of all iov_len fields.  mmap build: the
+ * file is grown once for the whole request if needed, then each buffer
+ * is copied in turn.  Otherwise a single writev() is issued and anything
+ * other than a complete write counts as failure.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+int
+fs_stateWriteV(struct fs_dump_state * state,
+              struct iovec * iov, int niov)
+{
+    int i, ret = 0;
+    size_t len = 0;
+
+    for (i=0; i < niov; i++) {
+       len += iov[i].iov_len;
+    }
+
+#ifdef FS_STATE_USE_MMAP
+    if (fs_stateCheckIOSafety(state, len)) {
+       if (fs_stateResizeFile(state, len)) {
+           /* log under this function's own name so failures can be
+            * told apart from those of fs_stateWrite() */
+           ViceLog(0, ("fs_stateWriteV: could not resize dump file '%s'\n",
+                       state->fn));
+           ret = 1;
+           goto done;
+       }
+    }
+
+    for (i=0; i < niov; i++) {
+       memcpy(state->mmap.cursor, iov[i].iov_base, iov[i].iov_len);
+       fs_stateIncCursor(state, iov[i].iov_len);
+    }
+#else
+    if (writev(state->fd, iov, niov) != len) {
+       ViceLog(0, ("fs_stateWriteV: write failed\n"));
+       ret = 1;
+       goto done;
+    }
+#endif
+
+ done:
+    return ret;
+}
+
+/*
+ * Scatter-read into niov buffers from the current position of the dump
+ * file.
+ *
+ * The total length is the sum of all iov_len fields.  mmap build: the
+ * whole request is rejected if it would run past the end of the
+ * mapping, otherwise each buffer is filled in turn.  Otherwise a single
+ * readv() is issued and a short read counts as failure.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+int
+fs_stateReadV(struct fs_dump_state * state,
+             struct iovec * iov, int niov)
+{
+    int i, ret = 0;
+    size_t len = 0;
+
+    for (i=0; i < niov; i++) {
+       len += iov[i].iov_len;
+    }
+
+#ifdef FS_STATE_USE_MMAP
+    if (fs_stateCheckIOSafety(state, len)) {
+       /* log under this function's own name so failures can be told
+        * apart from those of fs_stateRead() */
+       ViceLog(0, ("fs_stateReadV: read beyond EOF for dump file '%s'\n",
+                   state->fn));
+       ret = 1;
+       goto done;
+    }
+
+    for (i=0; i < niov; i++) {
+       memcpy(iov[i].iov_base, state->mmap.cursor, iov[i].iov_len);
+       fs_stateIncCursor(state, iov[i].iov_len);
+    }
+#else
+    if (readv(state->fd, iov, niov) != len) {
+       ViceLog(0, ("fs_stateReadV: read failed\n"));
+       ret = 1;
+       goto done;
+    }
+#endif
+
+ done:
+    return ret;
+}
+
+/*
+ * Position the dump file at *offset and write len bytes of hdr there.
+ *
+ * Returns 0 on success, 1 if either the seek or the write fails.
+ */
+int
+fs_stateWriteHeader(struct fs_dump_state * state,
+                   afs_uint64 * offset,
+                   void * hdr, size_t len)
+{
+    if (fs_stateSeek(state, offset)) {
+       ViceLog(0, ("fs_stateWriteHeader: could not seek to correct position in dump file '%s'\n",
+                   state->fn));
+       return 1;
+    }
+
+    if (fs_stateWrite(state, hdr, len)) {
+       ViceLog(0, ("fs_stateWriteHeader: write failed\n"));
+       return 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Position the dump file at *offset and read len bytes into hdr.
+ *
+ * Returns 0 on success, 1 if either the seek or the read fails.
+ */
+int
+fs_stateReadHeader(struct fs_dump_state * state,
+                  afs_uint64 * offset,
+                  void * hdr, size_t len)
+{
+    if (fs_stateSeek(state, offset)) {
+       ViceLog(0, ("fs_stateReadHeader: could not seek to correct position in dump file '%s'\n",
+                   state->fn));
+       return 1;
+    }
+
+    if (fs_stateRead(state, hdr,len)) {
+       ViceLog(0, ("fs_stateReadHeader: read failed\n"));
+       return 1;
+    }
+
+    return 0;
+}
+
+#ifdef FS_STATE_USE_MMAP
+/*
+ * Give the dump file its initial size: record FS_STATE_INIT_FILESIZE in
+ * state->file_len and truncate the file to that length.
+ *
+ * Returns 0 on success, 1 if the truncate fails.
+ */
+static int
+fs_stateSizeFile(struct fs_dump_state * state)
+{
+    state->file_len = FS_STATE_INIT_FILESIZE;
+    return (afs_ftruncate(state->fd, state->file_len) != 0) ? 1 : 0;
+}
+
+/*
+ * Grow the dump file so that at least min_add more bytes fit, and remap
+ * it.
+ *
+ * The file is extended by the smallest multiple of
+ * FS_STATE_INIT_FILESIZE that is strictly larger than min_add.  The
+ * write position in effect before the call is preserved across the
+ * unmap/remap cycle: fs_stateMapFile() rewinds the cursor and offset to
+ * the start of the mapping, which would otherwise make the write that
+ * triggered the resize land on top of the beginning of the file.
+ *
+ * This function is only compiled when FS_STATE_USE_MMAP is defined (see
+ * the enclosing #ifdef), so no inner guards are needed.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+static int
+fs_stateResizeFile(struct fs_dump_state * state, size_t min_add)
+{
+    int ret = 0;
+    afs_foff_t inc;
+    afs_foff_t saved_offset;
+
+    /* remember where we are before the mapping goes away */
+    saved_offset = state->mmap.offset;
+
+    fs_stateUnmapFile(state);
+
+    inc = ((min_add / FS_STATE_INIT_FILESIZE)+1) * FS_STATE_INIT_FILESIZE;
+    state->file_len += inc;
+
+    if (afs_ftruncate(state->fd, state->file_len) != 0) {
+       ViceLog(0, ("fs_stateResizeFile: truncate failed\n"));
+       ret = 1;
+       goto done;
+    }
+
+    if (fs_stateMapFile(state)) {
+       ViceLog(0, ("fs_stateResizeFile: remapping memory mapped file failed\n"));
+       ret = 1;
+       goto done;
+    }
+
+    /* the new mapping may live at a different address, so rebuild the
+     * cursor from the new base rather than reusing the old pointer */
+    state->mmap.offset = saved_offset;
+    state->mmap.cursor = (void *) (((char *) state->mmap.map) + saved_offset);
+
+ done:
+    return ret;
+}
+
+/*
+ * Cut the dump file back to state->eof_offset.
+ *
+ * Without AFS_LARGEFILE_ENV only the low 32 bits of the offset can be
+ * passed to afs_ftruncate().  An offset with a non-zero high word is
+ * therefore rejected, as fs_stateSeek() does, instead of silently
+ * truncating the file to the wrong (much smaller) length.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+static int
+fs_stateTruncateFile(struct fs_dump_state * state)
+{
+    int ret = 0;
+
+#ifdef AFS_LARGEFILE_ENV
+    if (afs_ftruncate(state->fd, state->eof_offset) != 0) {
+       ret = 1;
+    }
+#else
+    afs_uint32 hi, lo;
+    SplitInt64(state->eof_offset, hi, lo);
+    if (hi) {
+       /* offset is not representable here; leave the file untouched */
+       ret = 1;
+    } else if (afs_ftruncate(state->fd, lo) != 0) {
+       ret = 1;
+    }
+#endif
+
+    return ret;
+}
+#endif
+
+#ifdef FS_STATE_USE_MMAP
+static int
+fs_stateMapFile(struct fs_dump_state * state)
+{
+    int ret = 0, flags;
+
+    switch(state->mode) {
+    case FS_STATE_LOAD_MODE:
+       flags = PROT_READ | PROT_WRITE;   /* loading involves a header invalidation */
+       break;
+    case FS_STATE_DUMP_MODE:
+       flags = PROT_WRITE;
+       break;
+    default:
+       ViceLog(0, ("fs_stateMapFile: invalid dump state mode\n"));
+       return 1;
+    }
+
+    state->mmap.map = afs_mmap(NULL, 
+                              state->file_len, 
+                              flags, 
+                              MAP_SHARED,
+                              state->fd, 
+                              0);
+
+    if (state->mmap.map == MAP_FAILED) {
+       state->mmap.size = 0;
+       state->mmap.map = NULL;
+       ViceLog(0, ("fs_stateMapFile: failed to memory map file '%s'\n",
+                   state->fn));
+       ret = 1;
+       goto done;
+    }
+
+    state->mmap.size = state->file_len;
+    state->mmap.cursor = state->mmap.map;
+    state->mmap.offset = 0;
+
+    /* for state loading, accesses will be sequential, so let's give
+     * the VM subsystem a heads up */
+    if (state->mode == FS_STATE_LOAD_MODE) {
+       /* XXX madvise may not exist on all platforms, so
+        * we may need to add some ifdefs at some point... */
+       flags = MADV_SEQUENTIAL | MADV_WILLNEED;
+#ifdef AFS_SUN510_ENV
+       flags |= MADV_ACCESS_LWP;   /* added in solaris 9 12/02 */
+#endif
+       madvise(state->mmap.map, state->mmap.size, flags);
+    }
+
+ done:
+    return ret;
+}
+
+/*
+ * Drop the current memory mapping of the dump file, if there is one.
+ *
+ * After a successful unmap the map pointer, cursor and size are cleared
+ * so that no stale address can be dereferenced or unmapped a second
+ * time.  mmap.offset is deliberately left alone: it records the logical
+ * file position, which stays meaningful across a remap.
+ *
+ * Returns 0 on success (or when nothing was mapped), 1 if munmap fails.
+ */
+static int
+fs_stateUnmapFile(struct fs_dump_state * state)
+{
+    int ret = 0;
+
+    if (state->mmap.map == NULL) {
+       /* nothing mapped: fs_stateMapFile() never ran or it failed */
+       goto done;
+    }
+
+    if (munmap(state->mmap.map, state->mmap.size) == -1) {
+       ViceLog(0, ("fs_stateUnmapFile: failed to unmap dump file '%s'\n",
+                   state->fn));
+       ret = 1;
+       goto done;
+    }
+
+    state->mmap.map = NULL;
+    state->mmap.cursor = NULL;
+    state->mmap.size = 0;
+
+ done:
+    return ret;
+}
+#endif /* FS_STATE_USE_MMAP */
+
+#ifdef FS_STATE_USE_MMAP
+/*
+ * Flush the memory mapped dump file to stable storage (mmap build).
+ *
+ * Issues a synchronous msync() over the whole mapping.  The result is
+ * reported to the caller, which already checks the return value of this
+ * function before writing dependent metadata.
+ *
+ * Returns 0 on success, 1 if msync fails.
+ */
+int
+fs_stateSync(struct fs_dump_state * state)
+{
+    int ret = 0;
+
+    if (msync(state->mmap.map, state->mmap.size, MS_SYNC) == -1)
+       ret = 1;
+
+    return ret;
+}
+#else /* !FS_STATE_USE_MMAP */
+/*
+ * Flush the dump file to stable storage (file descriptor build).
+ *
+ * Returns 0 on success, 1 if fsync fails.
+ */
+int
+fs_stateSync(struct fs_dump_state * state)
+{
+    return (fsync(state->fd) == -1) ? 1 : 0;
+}
+#endif /* !FS_STATE_USE_MMAP */
+
+/*
+ * Advance the recorded end-of-file offset of the dump by len bytes.
+ *
+ * len is widened into a 64-bit value with a zero high word and added to
+ * state->eof_offset in place.  Always returns 0.
+ */
+int
+fs_stateIncEOF(struct fs_dump_state * state, afs_int32 len)
+{
+    afs_uint64 delta;
+
+    FillInt64(delta, 0, len);
+    AddUInt64(state->eof_offset, delta, &state->eof_offset);
+
+    return 0;
+}
+
+#ifdef FS_STATE_USE_MMAP
+/*
+ * Move the mmap position forward by len bytes, keeping the numeric
+ * offset and the cursor pointer in step.  Always returns 0.
+ */
+static int
+fs_stateIncCursor(struct fs_dump_state * state, size_t len)
+{
+    state->mmap.offset += len;
+
+    /* byte-wise pointer arithmetic needs a char pointer */
+    state->mmap.cursor = (void *) (((char *) state->mmap.cursor) + len);
+
+    return 0;
+}
+
+/*
+ * Report whether an access of len bytes at the current mmap offset
+ * would run past the end of the mapping.
+ *
+ * Returns 1 when the access is unsafe, 0 when it fits.
+ */
+static int
+fs_stateCheckIOSafety(struct fs_dump_state * state, size_t len)
+{
+    return ((state->mmap.offset + len) > state->mmap.size) ? 1 : 0;
+}
+#endif /* FS_STATE_USE_MMAP */
+
+#ifdef FS_STATE_USE_MMAP
+/*
+ * Set the current position of the memory mapped dump file to *offset
+ * (mmap build).
+ *
+ * Both the cursor pointer and the numeric offset are updated.  No bounds
+ * check against mmap.size is made here; fs_stateRead() and
+ * fs_stateWrite() call fs_stateCheckIOSafety() before touching memory.
+ *
+ * Returns 0 on success.  Without AFS_LARGEFILE_ENV an offset whose high
+ * word is non-zero yields 1, but note that the cursor and offset are
+ * still moved using the low word in that case.
+ */
+int
+fs_stateSeek(struct fs_dump_state * state, afs_uint64 * offset)
+{
+    int ret = 0;
+    char * p;
+    afs_uint32 hi, lo;
+
+    /* split once; which halves are used depends on the build flags below */
+    SplitInt64(*offset, hi, lo);
+
+    /* update cursor */
+    p = (char *) state->mmap.map;
+#ifdef AFS_64BIT_ENV
+    p += *offset;
+#else
+    p += lo;
+#endif
+    state->mmap.cursor = (void *) p;
+
+    /* update offset */
+#ifdef AFS_LARGEFILE_ENV
+    state->mmap.offset = *offset;
+#else
+    /* the high word cannot be represented in this configuration */
+    if (hi)
+       ret = 1;
+    state->mmap.offset = lo;
+#endif
+
+    return ret;
+}
+#else /* !FS_STATE_USE_MMAP */
+/*
+ * Set the current position of the dump file to *offset (file descriptor
+ * build).
+ *
+ * Without AFS_LARGEFILE_ENV only the low 32 bits can be passed to
+ * afs_lseek(), so an offset with a non-zero high word is rejected
+ * before any seek is attempted.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+int
+fs_stateSeek(struct fs_dump_state * state, afs_uint64 * offset)
+{
+    int ret = 0;
+#ifndef AFS_LARGEFILE_ENV
+    afs_uint32 high, low;
+
+    SplitInt64(*offset, high, low);
+    if (high) {
+       /* return directly: this function has no cleanup label to jump
+        * to, and one used only in this branch would be unused in the
+        * large file build */
+       return 1;
+    }
+
+    if (afs_lseek(state->fd, low, SEEK_SET) == -1)
+       ret = 1;
+#else
+    if (afs_lseek(state->fd, *offset, SEEK_SET) == -1)
+       ret = 1;
+#endif
+    return ret;
+}
+#endif /* !FS_STATE_USE_MMAP */
+
+/*
+ * Populate the fields of the main dump header that describe this
+ * fileserver build and instance.
+ *
+ * Sets the version stamp, system name id (0xFFFFFFFF when SYS_NAME_ID is
+ * not defined), save timestamp, server UUID, the valid flag, the
+ * endianness and detailed-stats sanity bytes, and the server version
+ * string.  A warning is logged if the version string does not fit.
+ * Fields not listed here are left untouched.
+ *
+ * Always returns 0.
+ */
+static int
+fs_stateFillHeader(struct fs_state_header * hdr)
+{
+    size_t needed;
+
+    /* format identification */
+    hdr->stamp.magic = FS_STATE_MAGIC;
+    hdr->stamp.version = FS_STATE_VERSION;
+
+    /* build-time properties used as sanity checks on reload */
+#ifdef SYS_NAME_ID
+    hdr->sys_name = SYS_NAME_ID;
+#else
+    hdr->sys_name = 0xFFFFFFFF;
+#endif
+#ifdef AFSBIG_ENDIAN
+    hdr->endianness = 1;
+#else
+    hdr->endianness = 0;
+#endif
+#ifdef FS_STATS_DETAILED
+    hdr->stats_detailed = 1;
+#else
+    hdr->stats_detailed = 0;
+#endif
+
+    /* properties of this particular server and this particular save */
+    hdr->timestamp = FT_ApproxTime();
+    hdr->server_uuid = FS_HostUUID;
+    hdr->valid = 1;
+
+    /* strlcpy returns the length it wanted to copy; a value that does
+     * not fit in the destination means the string was cut short */
+    needed = strlcpy(hdr->server_version_string, cml_version_number,
+                    sizeof(hdr->server_version_string));
+    if (needed >= sizeof(hdr->server_version_string)) {
+       ViceLog(0, ("fs_stateFillHeader: WARNING -- cml_version_number field truncated\n"));
+    }
+
+    return 0;
+}
+
+/*
+ * Decide whether a main dump header read from disk may be restored by
+ * this fileserver.
+ *
+ * The checks run in a fixed order and stop at the first one that fails,
+ * so at most one rejection reason is logged: valid flag, endianness,
+ * magic number, format version, detailed-stats configuration, server
+ * UUID.  A differing server version string only produces a warning.
+ *
+ * Returns 0 when the header is acceptable, 1 otherwise.
+ */
+static int
+fs_stateCheckHeader(struct fs_state_header * hdr)
+{
+    if (!hdr->valid) {
+       ViceLog(0, ("fs_stateCheckHeader: dump was previously flagged invalid\n"));
+       return 1;
+    }
+
+    /* the dump must have been written with this build's byte order */
+#ifdef AFSBIG_ENDIAN
+    if (!hdr->endianness)
+#else /* AFSLITTLE_ENDIAN */
+    if (hdr->endianness)
+#endif /* AFSLITTLE_ENDIAN */
+    {
+       ViceLog(0, ("fs_stateCheckHeader: wrong endianness\n"));
+       return 1;
+    }
+
+    if (hdr->stamp.magic != FS_STATE_MAGIC) {
+       ViceLog(0, ("fs_stateCheckHeader: invalid dump header\n"));
+       return 1;
+    }
+
+    if (hdr->stamp.version != FS_STATE_VERSION) {
+       ViceLog(0, ("fs_stateCheckHeader: unknown dump format version number\n"));
+       return 1;
+    }
+
+    /* the dump must match this build's FS_STATS_DETAILED setting */
+#ifdef FS_STATS_DETAILED
+    if (!hdr->stats_detailed)
+#else /* FS_STATS_DETAILED */
+    if (hdr->stats_detailed)
+#endif /* FS_STATS_DETAILED */
+    {
+       ViceLog(0, ("fs_stateCheckHeader: wrong config flags\n"));
+       return 1;
+    }
+
+    if (!afs_uuid_equal(&hdr->server_uuid, &FS_HostUUID)) {
+       ViceLog(0, ("fs_stateCheckHeader: server UUID does not match this server's UUID\n"));
+       return 1;
+    }
+
+    /* the cml_version_string is included for informational purposes only.  If someone ever
+     * wants to limit state dump reloading based upon the contents of this string, just
+     * uncomment the following code.  uncommenting this code is _strongly discouraged_ because
+     * we already make use of the version stamps in the various dump headers to deal with
+     * data structure version incompatibilities.
+    if (strncmp(hdr->server_version_string, cml_version_number, 
+               sizeof(hdr->server_version_string)) != 0) {
+       ViceLog(0, ("fs_stateCheckHeader: dump from different server version\n"));
+       return 1;
+    }
+    */
+
+    /* every hard check passed; a version mismatch is worth a note only */
+    if (strncmp(hdr->server_version_string, cml_version_number, 
+               sizeof(hdr->server_version_string)) != 0) {
+       ViceLog(0, ("fs_stateCheckHeader: dump from different server version ; attempting state reload anyway\n"));
+    }
+
+    return 0;
+}
+
+/*
+ * Initialize a dump state object and allocate its header buffers.
+ *
+ * The whole structure is zeroed, fd is set to -1 (no file open) and fn
+ * to the fileserver state file path.  The header buffers are obtained
+ * with calloc() rather than malloc(): fs_stateFillHeader() only sets
+ * some of the fs_state_header fields, yet the entire structure is
+ * written to the dump file, so zero-filling keeps the reserved and
+ * padding bytes from carrying whatever the heap happened to contain.
+ *
+ * Returns 0 on success, 1 if any allocation failed.  Buffers that were
+ * allocated before a failure are left in place for fs_stateFree().
+ */
+static int
+fs_stateAlloc(struct fs_dump_state * state)
+{
+    int ret = 0;
+    memset(state, 0, sizeof(struct fs_dump_state));
+    state->fd = -1;
+    state->fn = AFSDIR_SERVER_FSSTATE_FILEPATH;
+    state->hdr = (struct fs_state_header *)
+      calloc(1, sizeof(struct fs_state_header));
+    state->h_hdr = (struct host_state_header *)
+      calloc(1, sizeof(struct host_state_header));
+    state->cb_hdr = (struct callback_state_header *)
+      calloc(1, sizeof(struct callback_state_header));
+    state->cb_timeout_hdr = (struct callback_state_timeout_header *)
+      calloc(1, sizeof(struct callback_state_timeout_header));
+    state->cb_fehash_hdr = (struct callback_state_fehash_header *)
+      calloc(1, sizeof(struct callback_state_fehash_header));
+    if ((state->hdr == NULL) || (state->h_hdr == NULL) || (state->cb_hdr == NULL) ||
+       (state->cb_timeout_hdr == NULL) || (state->cb_fehash_hdr == NULL))
+       ret = 1;
+    return ret;
+}
+
+/*
+ * Release every heap buffer owned by a dump state: the five header
+ * buffers and the three index map arrays.  The state structure itself
+ * is not freed and its pointers are not cleared.
+ *
+ * Always returns 0.
+ */
+static int
+fs_stateFree(struct fs_dump_state * state)
+{
+    /* free(NULL) is defined to do nothing, so members that were never
+     * allocated need no special casing */
+
+    /* header buffers */
+    free(state->hdr);
+    free(state->h_hdr);
+    free(state->cb_hdr);
+    free(state->cb_timeout_hdr);
+    free(state->cb_fehash_hdr);
+
+    /* old-index to new-index translation tables */
+    free(state->h_map.entries);
+    free(state->fe_map.entries);
+    free(state->cb_map.entries);
+
+    return 0;
+}
+
+#endif /* AFS_DEMAND_ATTACH_FS */
diff --git a/src/tviced/serialize_state.h b/src/tviced/serialize_state.h

new file mode 100644 (file)

index 0000000..c1a08c0
--- /dev/null
+++ b/src/tviced/serialize_state.h
@@ -0,0 +1,311 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ * demand attach fs
+ * fileserver state serialization
+ */
+
+#ifndef _AFS_TVICED_SERIALIZE_STATE_H
+#define _AFS_TVICED_SERIALIZE_STATE_H
+
+#ifdef AFS_DEMAND_ATTACH_FS
+
+#define FS_STATE_MAGIC 0x62FA841C
+#define FS_STATE_VERSION 2
+
+#define HOST_STATE_MAGIC 0x7B8C9DAE
+#define HOST_STATE_VERSION 2
+
+#define HOST_STATE_ENTRY_MAGIC 0xA8B9CADB
+
+#define CALLBACK_STATE_MAGIC 0x89DE67BC
+#define CALLBACK_STATE_VERSION 1
+
+#define CALLBACK_STATE_TIMEOUT_MAGIC 0x99DD5511
+#define CALLBACK_STATE_FEHASH_MAGIC 0x77BB33FF
+#define CALLBACK_STATE_ENTRY_MAGIC 0x54637281
+
+#define ACTIVE_VOLUME_STATE_MAGIC 0xAC7557CA
+#define ACTIVE_VOLUME_STATE_VERSION 1
+
+#define ACTIVE_VOLUME_STATE_AVEHASH_MAGIC 0xBADDF00D
+
+#define HOST_STATE_VALID_WINDOW 1800 /* 30 minutes */
+
+/*
+ * on-disk structures
+ */
+/* magic/version pair that opens the versioned on-disk headers below
+ * (two afs_uint32 fields, 8 bytes) */
+struct disk_version_stamp {
+    afs_uint32 magic;                 /* identifies the kind of header, e.g. FS_STATE_MAGIC */
+    afs_uint32 version;               /* format revision, e.g. FS_STATE_VERSION */
+};
+
+/* 1024 byte header structure
+ *
+ * main header of a fileserver state dump; read and written at offset
+ * zero of the dump file.  the reserved arrays pad the layout so that
+ * fields can be added later without changing the overall size. */
+struct fs_state_header {
+    struct disk_version_stamp stamp;  /* version stamp */
+    afs_uint32 timestamp;             /* timestamp of save */
+    afs_uint32 sys_name;              /* sys name id for this machine */
+    afsUUID server_uuid;              /* server's UUID */
+    byte valid;                       /* whether header contents are valid */
+    byte endianness;                  /* endianness sanity check (0 for LE, 1 for BE) */
+    byte stats_detailed;              /* fs stats detailed sanity check */
+    byte padding1[1];                 /* padding */
+    afs_uint32 reserved1[23];         /* for expansion */
+    afs_uint64 avol_offset;           /* offset of active volumes structure */
+    afs_uint64 h_offset;              /* offset of host_state_header structure */
+    afs_uint64 cb_offset;             /* offset of callback_state_header structure */
+    afs_uint64 vlru_offset;           /* offset of vlru state structure */
+    afs_uint32 reserved2[56];         /* for expansion */
+    char server_version_string[128];  /* version string from AFS_component_version_number.c */
+    afs_uint32 reserved3[128];        /* for expansion */
+};
+
+/*
+ * host package serialization
+ */
+
+/* 256 byte header for the host state data
+ * (8 byte stamp + 2 counters + 60 reserved words) */
+struct host_state_header {
+    struct disk_version_stamp stamp;  /* host state version stamp */
+    afs_uint32 records;               /* number of stored host records */
+    afs_uint32 index_max;             /* max index value encountered */
+    afs_uint32 reserved[60];          /* for expansion */
+};
+
+/* 32 byte host entry header
+ * (eight afs_uint32 words; presumably one precedes each stored host
+ * record -- confirm against h_stateSave in host.c) */
+struct host_state_entry_header {
+    afs_uint32 magic;         /* stamp */
+    afs_uint32 len;           /* number of bytes in this record */
+    afs_uint32 interfaces;    /* number of interfaces included in record */
+    afs_uint32 hcps;          /* number of hcps entries in record */
+    afs_uint32 reserved[4];   /* for expansion */
+};
+
+/* 36 byte host entry structure
+ *
+ * the size is the same whether or not FS_STATS_DETAILED is set: the
+ * InSameNetwork byte and the padding byte occupy the same slot.
+ * NOTE(review): this uses "#if FS_STATS_DETAILED" while
+ * serialize_state.c tests the macro with "#ifdef" -- confirm that the
+ * macro is always defined to a non-zero value when it is defined. */
+struct hostDiskEntry {
+    afs_uint32 host;           /* IP address of host interface that is
+                                * currently being used, in network
+                                * byte order */
+    afs_uint16 port;           /* port address of host */
+    afs_uint16 hostFlags;       /*  bit map */
+    byte Console;              /* XXXX This host is a console */
+    byte hcpsfailed;           /* Retry the cps call next time */
+    byte hcps_valid;            /* prlist_val not null */
+#if FS_STATS_DETAILED
+    byte InSameNetwork;                /*Is host's addr in the same network as
+                                * the File Server's? */
+#else
+    byte padding1[1];          /* for padding */
+#endif                         /* FS_STATS_DETAILED */
+    afs_uint32 hcps_len;        /* length of hcps */
+    afs_uint32 LastCall;       /* time of last call from host */
+    afs_uint32 ActiveCall;     /* time of any call but gettime */
+    afs_uint32 cpsCall;                /* time of last cps call from this host */
+    afs_uint32 cblist;         /* Call back list for this host */
+    afs_uint32 index;           /* index for correlating w/ callback dumps */
+};
+
+/*
+ * callback package serialization
+ */
+
+/* 512 byte header
+ *
+ * top-level header of the callback state section; the three offsets at
+ * the end locate the sub-sections of that section within the dump file */
+struct callback_state_header {
+    struct disk_version_stamp stamp;    /* callback state version stamp */
+    afs_uint32 nFEs;                    /* number of FileEntry records */
+    afs_uint32 nCBs;                    /* number of CallBack records */
+    afs_uint32 fe_max;                  /* max FileEntry index */
+    afs_uint32 cb_max;                  /* max CallBack index */
+    afs_int32 tfirst;                   /* first valid timeout */
+    afs_uint32 reserved[115];           /* for expansion */
+    afs_uint64 timeout_offset;          /* offset of timeout queue heads */
+    afs_uint64 fehash_offset;           /* offset of file entry hash buckets */
+    afs_uint64 fe_offset;               /* offset of first file entry */
+};
+
+/* 32 byte header
+ * describes the block of callback timeout records (eight afs_uint32 words) */
+struct callback_state_timeout_header {
+    afs_uint32 magic;         /* magic number for timeout header */
+    afs_uint32 len;           /* total length of header and timeout records */
+    afs_uint32 records;       /* number of timeout records */
+    afs_uint32 reserved[5];   /* for expansion */
+};
+
+/* 32 byte header
+ * describes the block of file entry hash bucket heads (eight afs_uint32 words) */
+struct callback_state_fehash_header {
+    afs_uint32 magic;         /* magic number for fehash header */
+    afs_uint32 len;           /* total length of header and fehash bucket heads */
+    afs_uint32 records;       /* number of hash buckets */
+    afs_uint32 reserved[5];   /* for expansion */
+};
+
+/* 32 byte header
+ * describes one file entry record and its callbacks (eight afs_uint32 words) */
+struct callback_state_entry_header {
+    afs_uint32 magic;         /* magic number for FE entry */
+    afs_uint32 len;           /* number of bytes in this record */
+    afs_uint32 nCBs;          /* number of callbacks for this FE */
+    afs_uint32 reserved[5];   /* for expansion */
+};
+
+/* a struct FileEntry as stored in the dump, paired with an index value
+ * (presumably the entry's index at save time, cf. fe_OldToNew -- confirm
+ * in callback.c) */
+struct FEDiskEntry {
+    struct FileEntry fe;      /* the file entry itself, stored as-is */
+    afs_uint32 index;         /* index associated with this entry */
+};
+
+/* a struct CallBack as stored in the dump, paired with an index value
+ * (presumably the callback's index at save time, cf. cb_OldToNew --
+ * confirm in callback.c) */
+struct CBDiskEntry {
+    struct CallBack cb;       /* the callback itself, stored as-is */
+    afs_uint32 index;         /* index associated with this callback */
+};
+
+/*
+ * active volumes state serialization
+ *
+ * these structures are meant to support
+ * automated salvaging of active volumes
+ * in the event of a fileserver crash
+ */
+
+/* 512 byte header
+ *
+ * top-level header of the active volume state section.
+ * NOTE(review): the field name "update_timetamp" is misspelled; it is
+ * kept as-is because renaming it would change the interface. */
+struct active_volume_state_header {
+    struct disk_version_stamp stamp;    /* active volume state version stamp */
+    afs_uint32 nAVEs;                   /* number of ActiveVolumeEntry records */
+    afs_uint32 init_timestamp;          /* timestamp of AVE initialization */
+    afs_uint32 update_timetamp;         /* timestamp of last AVE update */
+    afs_uint32 reserved[119];           /* for expansion */
+    afs_uint64 avehash_offset;          /* offset of active volume entry hash buckets */
+    afs_uint64 ave_offset;              /* offset of first active volume entry */
+};
+
+/* 32 byte header
+ * describes the block of active volume entry hash bucket heads
+ * (eight afs_uint32 words) */
+struct active_volume_state_avehash_header {
+    afs_uint32 magic;         /* magic number for avehash header */
+    afs_uint32 len;           /* total length of header and avehash bucket heads */
+    afs_uint32 records;       /* number of hash buckets */
+    afs_uint32 reserved[5];   /* for expansion */
+};
+
+typedef afs_uint32 active_volume_state_avehash_entry;
+
+/* active volume entry (three afs_uint32 words, 12 bytes) */
+struct AVDiskEntry {
+    afs_uint32 volume;        /* volume this entry refers to (presumably the volume id -- confirm) */
+    afs_uint32 partition;     /* partition this entry refers to (presumably a partition id -- confirm) */
+    afs_uint32 hash_next;     /* link to the next entry in the same hash chain (encoding not shown here) */
+};
+
+
+/*
+ * dump runtime state
+ */
+/* one old-index to new-index translation.  the same entry type backs
+ * the h_map, fe_map and cb_map tables of struct fs_dump_state, so
+ * despite the wording of the field comments it is not limited to hosts. */
+struct idx_map_entry_t {
+    afs_uint32 old_idx;                    /* host hash id from last runtime */
+    afs_uint32 new_idx;                    /* host hash id for this runtime */
+};
+
+
+/* verification process sanity check constants
+ *
+ * make them fairly large so we don't get 
+ * false positives 
+ */
+#define FS_STATE_H_MAX_UUID_HASH_CHAIN_LEN    100000     /* max elements in a host uuid-hash chain */
+#define FS_STATE_H_MAX_ADDR_HASH_CHAIN_LEN    2000000    /* max elements in a host ipv4-hash chain */
+#define FS_STATE_FE_MAX_HASH_CHAIN_LEN        100000     /* max elements in a FE fid-hash chain */
+#define FS_STATE_FCB_MAX_LIST_LEN             100000     /* max elements in a per-FE CB list */
+#define FS_STATE_HCB_MAX_LIST_LEN             100000     /* max elements in a per-host CB list */
+#define FS_STATE_TCB_MAX_LIST_LEN             100000     /* max elements in a per-timeout CB list */
+
+
+/*
+ * main state serialization state structure
+ */
+
+/* in-memory bookkeeping for one dump or load of the fileserver state
+ * file.  set up by fs_stateAlloc() and torn down by fs_stateFree() in
+ * serialize_state.c; this is runtime state, not an on-disk structure. */
+struct fs_dump_state {
+    enum {
+       FS_STATE_DUMP_MODE,                /* writing a new dump */
+       FS_STATE_LOAD_MODE                 /* reading an existing dump */
+    } mode;
+    struct {
+       byte do_host_restore;              /* whether host restore should be done */
+       byte some_steps_skipped;           /* whether some steps were skipped */
+       byte warnings_generated;           /* whether any warnings were generated during restore */
+    } flags;
+    afs_fsize_t file_len;                  /* current length of the dump file; used as the mapping length */
+    int fd;                                /* fd of the current dump file */
+    int bail;                              /* non-zero if something went wrong */
+    char * fn;                             /* name of the current dump file */
+    struct {                               /* memory map of dump file */
+       void * map;                        /* base address of the mapping, NULL if mapping failed */
+       void * cursor;                     /* address of the current read/write position */
+       afs_foff_t offset;                 /* current position as a byte offset from map */
+       afs_fsize_t size;                  /* length of the mapping in bytes */
+    } mmap;
+    struct fs_state_header * hdr;          /* main header */
+    struct host_state_header * h_hdr;      /* header for host state data */
+    struct callback_state_header * cb_hdr; /* header for callback state data */
+    struct callback_state_timeout_header * cb_timeout_hdr; /* header for callback timeout records */
+    struct callback_state_fehash_header * cb_fehash_hdr;   /* header for file entry hash buckets */
+    afs_uint64 eof_offset;                 /* current end of file offset */
+    struct {                               /* host index translation table */
+       int len;                           /* number of host entries in map */
+       struct idx_map_entry_t * entries;
+    } h_map;
+    struct {                               /* file entry index translation table */
+       int len;                           /* number of entries in map */
+       struct idx_map_entry_t * entries;
+    } fe_map;
+    struct {                               /* callback index translation table */
+       int len;                           /* number of entries in map */
+       struct idx_map_entry_t * entries;
+    } cb_map;
+};
+
+
+/* prototypes */
+
+/* serialize_state.c */
+extern int fs_stateWrite(struct fs_dump_state * state,
+                        void * buf, size_t len);
+extern int fs_stateRead(struct fs_dump_state * state,
+                       void * buf, size_t len);
+extern int fs_stateWriteV(struct fs_dump_state * state,
+                         struct iovec * iov, int niov);
+extern int fs_stateReadV(struct fs_dump_state * state,
+                        struct iovec * iov, int niov);
+extern int fs_stateSync(struct fs_dump_state * state);
+extern int fs_stateWriteHeader(struct fs_dump_state * state,
+                              afs_uint64 * offset,
+                              void * hdr, size_t len);
+extern int fs_stateReadHeader(struct fs_dump_state * state,
+                             afs_uint64 * offset,
+                             void * hdr, size_t len);
+extern int fs_stateIncEOF(struct fs_dump_state * state,
+                         afs_int32 len);
+extern int fs_stateSeek(struct fs_dump_state * state,
+                       afs_uint64 * offset);
+
+/* host.c */
+extern int h_stateSave(struct fs_dump_state * state);
+extern int h_stateRestore(struct fs_dump_state * state);
+extern int h_stateRestoreIndices(struct fs_dump_state * state);
+extern int h_stateVerify(struct fs_dump_state * state);
+extern int h_OldToNew(struct fs_dump_state * state, afs_uint32 old, afs_uint32 * new);
+
+/* callback.c */
+extern int cb_stateSave(struct fs_dump_state * state);
+extern int cb_stateRestore(struct fs_dump_state * state);
+extern int cb_stateRestoreIndices(struct fs_dump_state * state);
+extern int cb_stateVerify(struct fs_dump_state * state);
+extern int cb_stateVerifyHCBList(struct fs_dump_state * state, struct host * host);
+extern int fe_OldToNew(struct fs_dump_state * state, afs_uint32 old, afs_uint32 * new);
+extern int cb_OldToNew(struct fs_dump_state * state, afs_uint32 old, afs_uint32 * new);
+
+#endif /* AFS_DEMAND_ATTACH_FS */
+#endif /* _AFS_TVICED_SERIALIZE_STATE_H */
diff --git a/src/tviced/state_analyzer.c b/src/tviced/state_analyzer.c

new file mode 100644 (file)

index 0000000..ae8c3ff
--- /dev/null
+++ b/src/tviced/state_analyzer.c
@@ -0,0 +1,2004 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ * demand attach fs
+ * fileserver state serialization
+ *
+ * state analyzer
+ */
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <stdio.h>
+#include <errno.h>
+#include <sys/file.h>
+#include <netdb.h>
+#include <netinet/in.h>
+#include <time.h>
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#else
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#endif
+
+#include <afs/stds.h>
+#include <rx/xdr.h>
+#include <afs/assert.h>
+#include <lwp.h>
+#include <lock.h>
+#include <afs/afsint.h>
+#include <afs/rxgen_consts.h>
+#include <afs/nfs.h>
+#include <afs/errors.h>
+#include <afs/ihandle.h>
+#include <afs/vnode.h>
+#include <afs/volume.h>
+#ifdef AFS_ATHENA_STDENV
+#include <krb.h>
+#endif
+#include <afs/acl.h>
+#include <afs/ptclient.h>
+#include <afs/prs_fs.h>
+#include <afs/auth.h>
+#include <afs/afsutil.h>
+#include <rx/rx.h>
+#include <afs/cellconfig.h>
+#include <stdlib.h>
+#include "../util/afsutil_prototypes.h"
+#include "../viced/viced.h"
+#include "../viced/host.h"
+#include "../viced/callback.h"
+#include "serialize_state.h"
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+/*
+ * large-file shims: when O_LARGEFILE is defined the afs_* names expand to
+ * the explicit 64-bit calls (lseek64, stat64, fstat64, open64, fopen64,
+ * mmap64); otherwise they expand to the plain calls.  The extern
+ * declarations of afs_lseek are only seen by splint (S_SPLINT_S).
+ */
+/*@+fcnmacros +macrofcndecl@*/
+#ifdef O_LARGEFILE
+#ifdef S_SPLINT_S
+extern off64_t afs_lseek(int FD, off64_t O, int F);
+#endif /*S_SPLINT_S */
+#define afs_lseek(FD, O, F)    lseek64(FD, (off64_t)(O), F)
+#define afs_stat               stat64
+#define afs_fstat              fstat64
+#define afs_open               open64
+#define afs_fopen              fopen64
+#define afs_mmap                mmap64
+#ifdef AFS_AIX_ENV
+extern void * mmap64();  /* ugly hack since aix build env appears to be somewhat broken */
+#endif
+#else /* !O_LARGEFILE */
+#ifdef S_SPLINT_S
+extern off_t afs_lseek(int FD, off_t O, int F);
+#endif /*S_SPLINT_S */
+#define afs_lseek(FD, O, F)    lseek(FD, (off_t)(O), F)
+#define afs_stat               stat
+#define afs_fstat              fstat
+#define afs_open               open
+#define afs_fopen              fopen
+#define afs_mmap                mmap
+#endif /* !O_LARGEFILE */
+/*@=fcnmacros =macrofcndecl@*/
+
+
+#ifndef AFS_DEMAND_ATTACH_FS
+/*
+ * stub entry point for builds without demand attach support:
+ * report that the tool is unavailable and fail.
+ */
+int
+main (int argc, char ** argv)
+{
+    const char * self = "state analyzer";
+
+    if (argv[0]) {
+        self = argv[0];
+    }
+
+    fprintf(stderr, "%s is only supported for demand attach fileservers\n",
+            self);
+    return 1;
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+
+static void usage(char * prog);
+static int openFile(char * path);
+static void initState(void);
+
+static void banner(void);
+static void prompt(void);
+
+static void print_help(void);
+static void print_global_help(void);
+static void print_h_help(void);
+static void print_fe_help(void);
+static void print_cb_help(void);
+
+static void dump_hdr(void);
+static void dump_h_hdr(void);
+static void dump_cb_hdr(void);
+
+static void dump_cb_timeout(void);
+static void dump_cb_fehash(void);
+
+static void dump_all_hes(void);
+static void dump_all_fes(void);
+static void dump_all_cbs(void);
+
+static void dump_he(afs_uint32 idx);
+static void dump_fe(afs_uint32 idx);
+static void dump_cb(afs_uint32 idx);
+static void dump_this_he(void);
+static void dump_this_fe(void);
+static void dump_this_cb(void);
+static void dump_next_he(void);
+static void dump_next_fe(void);
+static void dump_next_cb(void);
+static void dump_prev_he(void);
+static void dump_prev_fe(void);
+static void dump_prev_cb(void);
+static void dump_first_he(void);
+static void dump_first_fe(void);
+static void dump_first_cb(void);
+static void dump_last_he(void);
+static void dump_last_fe(void);
+static void dump_last_cb(void);
+static void dump_he_hdr(void);
+static void dump_he_entry(void);
+static void dump_he_interfaces(void);
+static void dump_he_hcps(void);
+static void dump_fe_hdr(void);
+static void dump_fe_entry(void);
+static void dump_cb_entry(void);
+
+static void hexdump_map(afs_uint32 offset, afs_uint32 len);
+
+static int get_hdr(void);
+static int get_h_hdr(void);
+static int get_cb_hdr(void);
+static int get_cb_timeout_hdr(void);
+static int get_cb_timeout(void);
+static int get_cb_fehash_hdr(void);
+static int get_cb_fehash(void);
+static int get_he(afs_uint32 idx);
+static int get_he_hdr(void);
+static int get_he_entry(void);
+static int get_fe(afs_uint32 idx);
+static int get_fe_hdr(void);
+static int get_fe_entry(void);
+static int get_cb(afs_uint32 idx);
+static int get_cb_entry(void);
+
+static int find_fe_by_index(afs_uint32 idx);
+static int find_cb_by_index(afs_uint32 idx);
+static int find_fe_by_fid(afs_uint32 vol, afs_uint32 vn, afs_uint32 uniq);
+
+
+/* descriptor of the opened state dump file; -1 while no file is open */
+static int dump_fd = -1;
+/* base address of the read-only mapping of the dump file; NULL until mapped */
+static void * map = NULL;
+/* length of the mapping in bytes, taken from the dump file's st_size */
+static size_t map_len;
+
+/*
+ * top-level headers of the state dump, plus where each one lives in the
+ * mapping.  The *_valid flags for hdr, h_hdr and cb_hdr are cleared by
+ * initState().
+ * NOTE(review): presumably the get_*() loaders (defined later in this file)
+ * fill these in and set the *_valid flags -- confirm there.
+ */
+static struct {
+    struct fs_state_header hdr;
+    struct host_state_header h_hdr;
+    struct callback_state_header cb_hdr;
+    struct callback_state_timeout_header timeout_hdr;
+    struct callback_state_fehash_header fehash_hdr;
+    afs_uint32 * timeout;      /* timeout queue heads, printed by dump_cb_timeout() */
+    afs_uint32 * fehash;       /* FE hash buckets, printed by dump_cb_fehash() */
+
+    /* pointers into the memory map */
+    void * hdr_p;
+    void * h_hdr_p;
+    void * cb_hdr_p;
+    void * timeout_hdr_p;
+    void * timeout_p;
+    void * fehash_hdr_p;
+    void * fehash_p;
+
+    byte hdr_valid;
+    byte h_hdr_valid;
+    byte cb_hdr_valid;
+    byte timeout_hdr_valid;
+    byte fehash_hdr_valid;
+} hdrs;
+
+/*
+ * the host entry currently selected in the host menu.
+ * hdr and he are what dump_he_hdr() / dump_he_entry() print; idx is the
+ * index shown in the "h(N)>" prompt.
+ */
+static struct {
+    void * fh;         /* NOTE(review): presumably the first host entry in the map -- confirm */
+    void * cursor;     /* address of the selected entry; reported by dump_he() */
+    void * ifp;        /* NOTE(review): presumably the entry's interface list -- confirm */
+    void * hcps;       /* NOTE(review): presumably the entry's host CPS data -- confirm */
+    struct host_state_entry_header hdr;
+    struct hostDiskEntry he;
+    afs_uint32 idx;
+    byte hdr_valid;
+    byte he_valid;
+} he_cursor;
+
+/*
+ * array of host entry addresses; reset to NULL by initState().
+ * NOTE(review): presumably indexed by host entry number and filled lazily
+ * by the get_he() path -- confirm where it is populated.
+ */
+static struct {
+    void ** cursor;
+} he_cache;
+
+/*
+ * the FileEntry currently selected in the FE menu.
+ * hdr.nCBs bounds the CB menu (see dump_all_cbs() and friends); idx is the
+ * index shown in the "fe(N)>" prompt.
+ */
+static struct {
+    void * ffe;        /* NOTE(review): presumably the first FE in the map -- confirm */
+    void * cursor;     /* address of the selected FE; reported by dump_fe() */
+    void * fcb;        /* NOTE(review): presumably the first CB of the selected FE -- confirm */
+    struct callback_state_entry_header hdr;
+    struct FEDiskEntry fe;
+    afs_uint32 idx;
+    byte hdr_valid;
+    byte fe_valid;
+} fe_cursor;
+
+/*
+ * array of FileEntry addresses; reset to NULL by initState().
+ * NOTE(review): presumably indexed by FE number and filled by the get_fe()
+ * path -- confirm where it is populated.
+ */
+static struct {
+    void ** cursor;
+} fe_cache;
+
+/*
+ * the CallBack currently selected in the CB menu; idx is the index shown
+ * in the "fe(N):cb(M)>" prompt.
+ */
+static struct {
+    void * cursor;     /* address of the selected CB; reported by dump_cb() */
+    struct CBDiskEntry cb;
+    afs_uint32 idx;
+    byte cb_valid;
+} cb_cursor;
+
+/*
+ * array of CallBack addresses.
+ * NOTE(review): unlike he_cache/fe_cache this is not reset by initState();
+ * it relies on static zero-initialisation -- confirm that is intended.
+ */
+static struct {
+    void ** cursor;
+} cb_cache;
+
+/*
+ * print a one-line usage summary to stderr.
+ *
+ * prog -- program name to show (normally argv[0]); a fixed placeholder is
+ *         printed when it is NULL
+ */
+static void
+usage(char * prog)
+{
+    /* the format consumes one string argument; it must actually be passed */
+    fprintf(stderr, "usage: %s [<state dump file>]\n",
+            prog ? prog : "state_analyzer");
+}
+
+/*
+ * entry point: show the banner, validate the command line, map the dump
+ * file (the single optional argument, or the default fileserver state path)
+ * and hand control to the interactive prompt.
+ *
+ * returns 0 after the prompt exits, 1 on a usage error or if the dump file
+ * could not be opened.
+ */
+int
+main(int argc, char ** argv)
+{
+    int want_help;
+    int failed;
+
+    banner();
+
+    want_help = (argc == 2) && (strcmp(argv[1], "-h") == 0);
+    if (want_help || argc > 2) {
+        usage(argv[0]);
+        return 1;
+    }
+
+    initState();
+
+    if (argc < 2) {
+        failed = openFile(AFSDIR_SERVER_FSSTATE_FILEPATH);
+    } else {
+        failed = openFile(argv[1]);
+    }
+    if (failed) {
+        return 1;
+    }
+
+    prompt();
+    return 0;
+}
+
+
+/*
+ * open the state dump at path and map it read-only.
+ *
+ * on success dump_fd, map and map_len describe the open file and its
+ * mapping.  on failure the descriptor is closed again, dump_fd is reset to
+ * -1 and map is left NULL.
+ *
+ * returns 0 on success, 1 on failure.
+ */
+static int
+openFile(char * path)
+{
+    struct afs_stat status;
+    void * addr;
+
+    /* NOTE(review): the mapping below is PROT_READ only, so O_RDONLY would
+     * probably suffice here; O_RDWR is kept to preserve existing behaviour */
+    dump_fd = afs_open(path, O_RDWR);
+    if (dump_fd == -1) {
+        fprintf(stderr, "dump file '%s' failed to open: %s\n",
+                path, strerror(errno));
+        return 1;
+    }
+
+    printf("opened dump file '%s'\n", path);
+
+    if (afs_fstat(dump_fd, &status) == -1) {
+        fprintf(stderr, "failed to stat file: %s\n", strerror(errno));
+        goto error;
+    }
+
+    map_len = status.st_size;
+
+    /*
+     * keep the result in a local until it is known to be good: MAP_FAILED
+     * is not NULL, so storing it in the global would make a later
+     * "if (map)" test treat the failure value as a live mapping.
+     */
+    addr = afs_mmap(NULL, map_len, PROT_READ, MAP_SHARED, dump_fd, 0);
+    if (addr == MAP_FAILED) {
+        fprintf(stderr, "failed to mmap file: %s\n", strerror(errno));
+        goto error;
+    }
+    map = addr;
+
+    /* size_t and pointers need their own conversions, not %d / %x */
+    printf("mapped %lu bytes at %p\n", (unsigned long) map_len, map);
+    return 0;
+
+ error:
+    close(dump_fd);
+    dump_fd = -1;
+    return 1;
+}
+
+/*
+ * reset the analyzer's bookkeeping: mark the three top-level headers as
+ * not loaded and clear the cursor and cache pointers.
+ */
+static void
+initState(void)
+{
+    /* no header has been read yet */
+    hdrs.cb_hdr_valid = 0;
+    hdrs.h_hdr_valid = 0;
+    hdrs.hdr_valid = 0;
+
+    /* nothing is selected in any menu */
+    cb_cursor.cursor = NULL;
+    fe_cursor.cursor = NULL;
+    he_cursor.cursor = NULL;
+
+    fe_cursor.fcb = NULL;
+    fe_cursor.ffe = NULL;
+    he_cursor.fh = NULL;
+
+    /* lookup caches start out empty */
+    fe_cache.cursor = NULL;
+    he_cache.cursor = NULL;
+}
+
+/*
+ * write the program identification lines to stderr.
+ */
+static void
+banner(void)
+{
+    static const char * const lines[] = {
+        "demand attach fs\n",
+        "fileserver state analyzer\n",
+        "version 0.1\n"
+    };
+    size_t n;
+
+    for (n = 0; n < sizeof(lines) / sizeof(lines[0]); n++) {
+        fputs(lines[n], stderr);
+    }
+}
+
+#define PROGNAME "fs state analyzer"
+
+/*
+ * interactive command loop.
+ *
+ * reads command lines from stdin, splits them into whitespace separated
+ * tokens and dispatches each token against the current menu mode (global,
+ * host, FE or CB).  a menu name ("h", "fe", "cb", "fs") followed by a
+ * command applies that menu to that one command only; a menu name on its
+ * own switches the prompt to that menu.  an empty line repeats the previous
+ * line.
+ *
+ * returns when "exit" is entered, when "quit" is entered at the global
+ * menu, or when stdin reaches end-of-file.
+ */
+static void
+prompt(void)
+{
+    char input[256];
+    char prev_input[256];
+    char * tok = NULL;
+    afs_uint32 x, y, z;
+    enum {
+        PR_GLOBAL_MODE,
+        PR_H_MODE,
+        PR_FE_MODE,
+        PR_CB_MODE
+    } mode = PR_GLOBAL_MODE, next_mode;
+
+    next_mode = mode;
+    input[0] = prev_input[0] = '\0';
+
+    while (1) {
+        if (!tok) {
+            /*
+             * the previous line is used up.  commands that bail out early
+             * skip the reset at the bottom of the loop, so drop any
+             * one-shot menu prefix here before prompting again.
+             */
+            mode = next_mode;
+
+            switch(mode) {
+            case PR_GLOBAL_MODE:
+                printf(PROGNAME "> ");
+                break;
+            case PR_H_MODE:
+                printf(PROGNAME ": h(%u)> ", he_cursor.idx);
+                break;
+            case PR_FE_MODE:
+                printf(PROGNAME ": fe(%u)> ", fe_cursor.idx);
+                break;
+            case PR_CB_MODE:
+                printf(PROGNAME ": fe(%u):cb(%u)> ", fe_cursor.idx, cb_cursor.idx);
+                break;
+            default:
+                fprintf(stderr, "prompt state broken; aborting\n");
+                return;
+            }
+            fflush(stdout);
+
+            /*
+             * fgets() bounds the read to the buffer (gets() cannot) and
+             * reports end-of-file, which ends the session instead of
+             * spinning on stale input.
+             */
+            if (fgets(input, sizeof(input), stdin) == NULL) {
+                printf("\n");
+                return;
+            }
+            input[strcspn(input, "\r\n")] = '\0';
+
+            if (!strcmp(input, "")) {
+                /* repeat last command */
+                if (!strcmp(prev_input, "")) {
+                    continue;
+                }
+                strlcpy(input, prev_input, sizeof(input));
+            } else {
+                /* save command for repetition */
+                strlcpy(prev_input, input, sizeof(prev_input));
+            }
+
+            tok = strtok(input, " \t");
+        }
+        while (tok && !strcmp(tok, ";")) {
+            tok = strtok(NULL, "; \t");
+        }
+
+        if (!tok) {
+            continue;
+        }
+
+        if (!strcasecmp(tok, "exit")) {
+            return;
+        } else if (!strcasecmp(tok, "quit")) {
+            switch(mode) {
+            case PR_CB_MODE:
+                next_mode = PR_FE_MODE;
+                break;
+            case PR_FE_MODE:
+            case PR_H_MODE:
+                next_mode = PR_GLOBAL_MODE;
+                break;
+            default:
+                return;
+            }
+        } else if (!strcasecmp(tok, "h")) {
+            tok = strtok(NULL, " \t");
+            mode = PR_H_MODE;
+            if (!tok) {
+                /* bare menu name: make the switch permanent */
+                next_mode = mode;
+            }
+            continue;
+        } else if (!strcasecmp(tok, "fe")) {
+            tok = strtok(NULL, " \t");
+            mode = PR_FE_MODE;
+            if (!tok) {
+                next_mode = mode;
+            }
+            continue;
+        } else if (!strcasecmp(tok, "fs")) {
+            tok = strtok(NULL, " \t");
+            mode = PR_GLOBAL_MODE;
+            if (!tok) {
+                next_mode = mode;
+            }
+            continue;
+        } else if (!strcasecmp(tok, "cb")) {
+            tok = strtok(NULL, " \t");
+            mode = PR_CB_MODE;
+            if (!tok) {
+                next_mode = mode;
+            }
+            continue;
+        } else if (!strcasecmp(tok, "help")) {
+            switch(mode) {
+            case PR_H_MODE:
+                print_h_help();
+                break;
+            case PR_FE_MODE:
+                print_fe_help();
+                break;
+            case PR_CB_MODE:
+                print_cb_help();
+                break;
+            default:
+                print_global_help();
+            }
+            print_help();
+        } else if (!strcasecmp(tok, "hexdump")) {
+            tok = strtok(NULL, " \t");
+            if (!tok) {
+                hexdump_map(0, map_len);
+                continue;
+            }
+            if (sscanf(tok, "%u", &x) != 1) {
+                fprintf(stderr, "hexdump parse error 1\n");
+                tok = NULL;
+                continue;
+            }
+            tok = strtok(NULL, " \t");
+            if (!tok) {
+                hexdump_map(x, map_len - x);
+                continue;
+            }
+            if (sscanf(tok, "%u", &y) != 1) {
+                fprintf(stderr, "hexdump parse error 2\n");
+                /* discard the rest of the line rather than running the
+                 * unparsable argument as a command */
+                tok = NULL;
+                continue;
+            }
+            hexdump_map(x,y);
+        } else if (!strcasecmp(tok, "hdr")) {
+            switch(mode) {
+            case PR_H_MODE:
+                dump_h_hdr();
+                break;
+            case PR_FE_MODE:
+                dump_cb_hdr();
+                break;
+            case PR_CB_MODE:
+                dump_this_fe();
+                break;
+            default:
+                dump_hdr();
+            }
+        } else if (!strcasecmp(tok, "this")) {
+            switch(mode) {
+            case PR_H_MODE:
+                dump_this_he();
+                break;
+            case PR_FE_MODE:
+                dump_this_fe();
+                break;
+            case PR_CB_MODE:
+                dump_this_cb();
+                break;
+            default:
+                fprintf(stderr, "command not valid for this mode\n");
+            }
+        } else if (!strcasecmp(tok, "next")) {
+            switch(mode) {
+            case PR_H_MODE:
+                dump_next_he();
+                break;
+            case PR_FE_MODE:
+                dump_next_fe();
+                break;
+            case PR_CB_MODE:
+                dump_next_cb();
+                break;
+            default:
+                fprintf(stderr, "command not valid for this mode\n");
+            }
+        } else if (!strcasecmp(tok, "prev")) {
+            switch(mode) {
+            case PR_H_MODE:
+                dump_prev_he();
+                break;
+            case PR_FE_MODE:
+                dump_prev_fe();
+                break;
+            case PR_CB_MODE:
+                dump_prev_cb();
+                break;
+            default:
+                fprintf(stderr, "command not valid for this mode\n");
+            }
+        } else if (!strcasecmp(tok, "first")) {
+            switch(mode) {
+            case PR_H_MODE:
+                dump_first_he();
+                break;
+            case PR_FE_MODE:
+                dump_first_fe();
+                break;
+            case PR_CB_MODE:
+                dump_first_cb();
+                break;
+            default:
+                fprintf(stderr, "command not valid for this mode\n");
+            }
+        } else if (!strcasecmp(tok, "last")) {
+            switch(mode) {
+            case PR_H_MODE:
+                dump_last_he();
+                break;
+            case PR_FE_MODE:
+                dump_last_fe();
+                break;
+            case PR_CB_MODE:
+                dump_last_cb();
+                break;
+            default:
+                fprintf(stderr, "command not valid for this mode\n");
+            }
+        } else if (!strcasecmp(tok, "dump")) {
+            switch(mode) {
+            case PR_H_MODE:
+                dump_all_hes();
+                break;
+            case PR_FE_MODE:
+                dump_all_fes();
+                break;
+            case PR_CB_MODE:
+                dump_all_cbs();
+                break;
+            default:
+                fprintf(stderr, "command not valid for this mode\n");
+            }
+        } else if (!strcasecmp(tok, "find")) {
+            tok = strtok(NULL, " \t");
+            if (!tok || strcasecmp(tok, "by")) {
+                /* report the offending token before discarding the line */
+                fprintf(stderr, "find syntax error 1 (%s)\n",
+                        (tok) ? tok : "nil");
+                tok = NULL;
+                continue;
+            }
+            tok = strtok(NULL, " \t");
+            if (!tok) {
+                fprintf(stderr, "find syntax error 2\n");
+                continue;
+            }
+            switch(mode) {
+            case PR_H_MODE:
+                fprintf(stderr, "not implemented yet\n");
+                break;
+            case PR_FE_MODE:
+                if (!strcasecmp(tok, "index")) {
+                    tok = strtok(NULL, " \t");
+                    if (!tok || sscanf(tok, "%u", &x) != 1) {
+                        tok = NULL;
+                        fprintf(stderr, "find syntax error 3\n");
+                        continue;
+                    }
+                    if (find_fe_by_index(x)) {
+                        fprintf(stderr, "find returned no results\n");
+                    }
+                } else if (!strcasecmp(tok, "fid")) {
+                    /* accept "(vol,vnode,unique)" as well as bare numbers */
+                    tok = strtok(NULL, "(), \t");
+                    if (!tok || sscanf(tok, "%u", &x) != 1) {
+                        tok = NULL;
+                        fprintf(stderr, "find syntax error 4\n");
+                        continue;
+                    }
+                    tok = strtok(NULL, "(), \t");
+                    if (!tok || sscanf(tok, "%u", &y) != 1) {
+                        tok = NULL;
+                        fprintf(stderr, "find syntax error 5\n");
+                        continue;
+                    }
+                    tok = strtok(NULL, "(), \t");
+                    if (!tok || sscanf(tok, "%u", &z) != 1) {
+                        tok = NULL;
+                        fprintf(stderr, "find syntax error 6\n");
+                        continue;
+                    }
+                    if (find_fe_by_fid(x,y,z)) {
+                        fprintf(stderr, "find returned no results\n");
+                    }
+                } else {
+                    fprintf(stderr, "unsupported filter type\n");
+                }
+                break;
+            case PR_CB_MODE:
+                if (!strcasecmp(tok, "index")) {
+                    tok = strtok(NULL, " \t");
+                    if (!tok || sscanf(tok, "%u", &x) != 1) {
+                        tok = NULL;
+                        fprintf(stderr, "find syntax error 3\n");
+                        continue;
+                    }
+                    if (find_cb_by_index(x)) {
+                        fprintf(stderr, "find returned no results\n");
+                    }
+                } else {
+                    fprintf(stderr, "unsupported filter type\n");
+                }
+                break;
+            default:
+                fprintf(stderr, "find not supported for this menu\n");
+            }
+        } else if (!strcspn(tok, "0123456789")) {
+            /* token starts with a digit: treat it as an entry index */
+            if (sscanf(tok, "%u", &x) == 1) {
+                switch(mode) {
+                case PR_H_MODE:
+                    dump_he(x);
+                    break;
+                case PR_FE_MODE:
+                    dump_fe(x);
+                    break;
+                case PR_CB_MODE:
+                    dump_cb(x);
+                    break;
+                default:
+                    fprintf(stderr, "command not available from this menu\n");
+                }
+            } else {
+                fprintf(stderr, "input parse error ('%s')\n", tok);
+            }
+        } else if (mode == PR_FE_MODE && !strcmp(tok, "timeout")) {
+            dump_cb_timeout();
+        } else if (mode == PR_FE_MODE && !strcmp(tok, "hash")) {
+            dump_cb_fehash();
+        } else {
+            fprintf(stderr, "unknown command\n");
+        }
+        tok = strtok(NULL, " \t");
+        mode = next_mode;
+    }
+}
+
+/*
+ * print the commands that are available from every menu.
+ */
+static void
+print_help(void)
+{
+    static const char * const common_help[] = {
+        "\th <...>  -- host menu commands\n",
+        "\tfe <...> -- FileEntry menu commands\n",
+        "\tcb <...> -- CallBack menu commands\n",
+        "\thexdump [<offset> [<len>]]\n\t\t -- hex dump the raw data\n",
+        "\tquit     -- quit this menu\n",
+        "\texit     -- exit the debugger\n",
+        "\thelp     -- this help message\n"
+    };
+    size_t n;
+
+    for (n = 0; n < sizeof(common_help) / sizeof(common_help[0]); n++) {
+        fputs(common_help[n], stdout);
+    }
+}
+
+/*
+ * print the commands specific to the global menu.
+ */
+static void
+print_global_help(void)
+{
+    fputs("\thdr      -- display the fs_state_header struct\n", stdout);
+}
+
+/*
+ * print the commands specific to the host menu.
+ */
+static void
+print_h_help(void)
+{
+    static const char * const host_help[] = {
+        "\thdr      -- display the host_state_header struct\n",
+        "\tfirst    -- display the first host\n",
+        "\tprev     -- display the previous host\n",
+        "\tthis     -- display this host\n",
+        "\tnext     -- display the next host\n",
+        "\tlast     -- display the last host\n",
+        "\tdump     -- display all hosts\n"
+    };
+    size_t n;
+
+    for (n = 0; n < sizeof(host_help) / sizeof(host_help[0]); n++) {
+        fputs(host_help[n], stdout);
+    }
+}
+
+/*
+ * print the commands specific to the FileEntry menu.
+ */
+static void
+print_fe_help(void)
+{
+    static const char * const fe_help[] = {
+        "\thdr      -- display the callback_state_header struct\n",
+        "\tfirst    -- display the first FE\n",
+        "\tprev     -- display the previous FE\n",
+        "\tthis     -- display this FE\n",
+        "\tnext     -- display the next FE\n",
+        "\tlast     -- display the last FE\n",
+        "\tdump     -- display all FEs\n",
+        "\ttimeout  -- display the timeout queue heads\n",
+        "\thash   -- display the file entry hash buckets\n",
+        "\tfind by index <id>\n\t\t -- find an fe by its array index\n",
+        "\tfind by fid <(vol,vnode,unique)>\n\t\t -- find an fe by its AFSFid\n"
+    };
+    size_t n;
+
+    for (n = 0; n < sizeof(fe_help) / sizeof(fe_help[0]); n++) {
+        fputs(fe_help[n], stdout);
+    }
+}
+
+/*
+ * print the commands specific to the CallBack menu.
+ */
+static void
+print_cb_help(void)
+{
+    static const char * const cb_help[] = {
+        "\thdr      -- display the callback_state_entry_header struct\n",
+        "\tfirst    -- display the first CB\n",
+        "\tprev     -- display the previous CB\n",
+        "\tthis     -- display this CB\n",
+        "\tnext     -- display the next CB\n",
+        "\tlast     -- display the last CB\n",
+        "\tdump     -- display all CBs\n"
+    };
+    size_t n;
+
+    for (n = 0; n < sizeof(cb_help) / sizeof(cb_help[0]); n++) {
+        fputs(cb_help[n], stdout);
+    }
+}
+
+#define DPFTB0 "\t"
+#define DPFTB1 "\t\t"
+#define DPFTB2 "\t\t\t"
+
+#define DPFOFF(addr) \
+    do { \
+        char * _p = (char *)addr; \
+        char * _m = (char *)map; \
+        printf("loading structure from address 0x%x (offset %u)\n", \
+               addr, _p-_m); \
+    } while (0)
+
+/*
+ * output helpers.  in every family the first macro takes the indent string
+ * explicitly and the numbered variants bind it to DPFTB0/DPFTB1/DPFTB2.
+ */
+
+/* structs: DPFSO* opens "name = {", DPFSC* prints the closing brace */
+#define DPFSO(T, name) printf(T "%s = {\n", name)
+#define DPFSO0(name) DPFSO(DPFTB0, name)
+#define DPFSO1(name) DPFSO(DPFTB1, name)
+#define DPFSC(T) printf(T "}\n")
+#define DPFSC0 DPFSC(DPFTB0)
+#define DPFSC1 DPFSC(DPFTB1)
+
+/*
+ * arrays: DPFAO* opens the array, DPFAE prints one element followed by a
+ * comma and tab, DPFALE prints the last element without a separator,
+ * DPFAN/DPFA1/DPFA2 start a continuation row, DPFAC* closes the array.
+ * `type' is a printf conversion without the leading '%'.
+ */
+#define DPFAO(T1, T2, name) printf(T1 "%s =\n" T2 "{ ", name)
+#define DPFAO0(name) DPFAO(DPFTB0, DPFTB1, name)
+#define DPFAO1(name) DPFAO(DPFTB1, DPFTB2, name)
+#define DPFAC0 printf(" }\n")
+#define DPFAC1 DPFAC0
+#define DPFA1 printf(DPFTB1 "  ")
+#define DPFA2 printf(DPFTB2 "  ")
+#define DPFAN printf("\n")
+#define DPFALE(type, var) printf("%" type, var)
+#define DPFAE(type, var) printf("%" type ",\t", var)
+
+/* normal vars: "name = value"; `type' is a printf conversion without '%' */
+#define DPFV(T, name, type, var) printf(T "%s = %" type "\n", name, var)
+#define DPFV1(name, type, var) DPFV(DPFTB1, name, type, var)
+#define DPFV2(name, type, var) DPFV(DPFTB2, name, type, var)
+
+/* hex: "name = 0x..." */
+#define DPFX(T, name, var) printf(T "%s = 0x%x\n", name, var)
+#define DPFX1(name, var) DPFX(DPFTB1, name, var)
+#define DPFX2(name, var) DPFX(DPFTB2, name, var)
+
+/* strings: "name = "value"" */
+#define DPFS(T, name, var) printf(T "%s = \"%s\"\n", name, var)
+#define DPFS1(name, var) DPFS(DPFTB1, name, var)
+#define DPFS2(name, var) DPFS(DPFTB2, name, var)
+
+/*
+ * time: print "name = "<ctime text>"" without ctime's trailing newline.
+ *
+ * the value is copied into a real time_t first: the fields passed in come
+ * from dump structures and may be narrower than time_t (TODO confirm their
+ * declared types), in which case handing &(var) to ctime() would read past
+ * the field.  a NULL result from ctime() is reported instead of being
+ * dereferenced.
+ */
+#define DPFT(T, name, var) \
+    do { \
+        time_t _t = (time_t)(var); \
+        char * _s = ctime(&_t); \
+        if (_s == NULL) { \
+            _s = "(invalid time)"; \
+        } \
+        printf(T "%s = \"%.*s\"\n", name, (int)strcspn(_s, "\r\n"), _s); \
+    } while(0)
+#define DPFT1(name, var) DPFT(DPFTB1, name, var)
+#define DPFT2(name, var) DPFT(DPFTB2, name, var)
+
+/*
+ * print the fs_state_header found at the start of the mapping, then warn
+ * on stderr if its magic or version does not match this build.
+ * prints nothing if get_hdr() reports failure.
+ */
+static void
+dump_hdr(void)
+{
+    afs_uint32 off_hi, off_lo;
+    char uuid_text[40];
+
+    if (get_hdr() != 0) {
+        return;
+    }
+
+    DPFOFF(map);
+    DPFSO0("fs_state_header");
+
+    /* version stamp */
+    DPFSO1("stamp");
+    DPFX2("magic", hdrs.hdr.stamp.magic);
+    DPFV2("version", "u", hdrs.hdr.stamp.version);
+    DPFSC1;
+
+    DPFT1("timestamp", hdrs.hdr.timestamp);
+    DPFV1("sys_name", "u", hdrs.hdr.sys_name);
+
+    afsUUID_to_string(&hdrs.hdr.server_uuid, uuid_text, sizeof(uuid_text));
+    DPFS1("server_uuid", uuid_text);
+
+    DPFV1("valid", "d", hdrs.hdr.valid);
+    DPFV1("endianness", "d", hdrs.hdr.endianness);
+    DPFV1("stats_detailed", "d", hdrs.hdr.stats_detailed);
+
+    /* 64-bit offsets are shown as separate high and low words */
+    SplitInt64(hdrs.hdr.h_offset, off_hi, off_lo);
+    DPFSO1("h_offset");
+    DPFV2("hi", "u", off_hi);
+    DPFV2("lo", "u", off_lo);
+    DPFSC1;
+
+    SplitInt64(hdrs.hdr.cb_offset, off_hi, off_lo);
+    DPFSO1("cb_offset");
+    DPFV2("hi", "u", off_hi);
+    DPFV2("lo", "u", off_lo);
+    DPFSC1;
+
+    DPFS1("server_version_string", hdrs.hdr.server_version_string);
+    DPFSC0;
+
+    if (!(hdrs.hdr.stamp.magic == FS_STATE_MAGIC)) {
+        fprintf(stderr, "* magic check failed\n");
+    }
+    if (!(hdrs.hdr.stamp.version == FS_STATE_VERSION)) {
+        fprintf(stderr, "* version check failed\n");
+    }
+}
+
+/*
+ * print the host_state_header, then warn on stderr if its magic or version
+ * does not match this build.  prints nothing if get_h_hdr() reports failure.
+ */
+static void
+dump_h_hdr(void)
+{
+    if (get_h_hdr())
+       return;
+
+    DPFOFF(hdrs.h_hdr_p);
+    DPFSO0("host_state_header");
+    DPFSO1("stamp");
+    DPFX2("magic", hdrs.h_hdr.stamp.magic);
+    DPFV2("version", "u", hdrs.h_hdr.stamp.version);
+    DPFSC1;
+    DPFV1("records", "u", hdrs.h_hdr.records);
+    DPFV1("index_max", "u", hdrs.h_hdr.index_max);
+    DPFSC0;
+
+    /* the header is printed even when these checks fail, so a damaged
+     * dump can still be inspected */
+    if (hdrs.h_hdr.stamp.magic != HOST_STATE_MAGIC) {
+       fprintf(stderr, "* magic check failed\n");
+    }
+    if (hdrs.h_hdr.stamp.version != HOST_STATE_VERSION) {
+       fprintf(stderr, "* version check failed\n");
+    }
+}
+
+/*
+ * print the callback_state_header, then warn on stderr if its magic or
+ * version does not match this build.
+ * prints nothing if get_cb_hdr() reports failure.
+ */
+static void
+dump_cb_hdr(void)
+{
+    afs_uint32 off_hi, off_lo;
+
+    if (get_cb_hdr() != 0) {
+        return;
+    }
+
+    DPFOFF(hdrs.cb_hdr_p);
+    DPFSO0("callback_state_header");
+
+    /* version stamp */
+    DPFSO1("stamp");
+    DPFX2("magic", hdrs.cb_hdr.stamp.magic);
+    DPFV2("version", "u", hdrs.cb_hdr.stamp.version);
+    DPFSC1;
+
+    /* entry counts and limits */
+    DPFV1("nFEs", "u", hdrs.cb_hdr.nFEs);
+    DPFV1("nCBs", "u", hdrs.cb_hdr.nCBs);
+    DPFV1("fe_max", "u", hdrs.cb_hdr.fe_max);
+    DPFV1("cb_max", "u", hdrs.cb_hdr.cb_max);
+    DPFV1("tfirst", "d", hdrs.cb_hdr.tfirst);
+
+    /* 64-bit offsets are shown as separate high and low words */
+    SplitInt64(hdrs.cb_hdr.timeout_offset, off_hi, off_lo);
+    DPFSO1("timeout_offset");
+    DPFV2("hi", "u", off_hi);
+    DPFV2("lo", "u", off_lo);
+    DPFSC1;
+
+    SplitInt64(hdrs.cb_hdr.fehash_offset, off_hi, off_lo);
+    DPFSO1("fehash_offset");
+    DPFV2("hi", "u", off_hi);
+    DPFV2("lo", "u", off_lo);
+    DPFSC1;
+
+    SplitInt64(hdrs.cb_hdr.fe_offset, off_hi, off_lo);
+    DPFSO1("fe_offset");
+    DPFV2("hi", "u", off_hi);
+    DPFV2("lo", "u", off_lo);
+    DPFSC1;
+
+    DPFSC0;
+
+    if (!(hdrs.cb_hdr.stamp.magic == CALLBACK_STATE_MAGIC)) {
+        fprintf(stderr, "* magic check failed\n");
+    }
+    if (!(hdrs.cb_hdr.stamp.version == CALLBACK_STATE_VERSION)) {
+        fprintf(stderr, "* version check failed\n");
+    }
+}
+
+/*
+ * print the callback timeout header followed by the timeout array, eight
+ * values per row.  prints nothing if any of the loaders reports failure.
+ *
+ * NOTE(review): exactly 128 slots are printed regardless of
+ * hdrs.timeout_hdr.records -- confirm the array always has 128 entries.
+ */
+static void
+dump_cb_timeout(void)
+{
+    int slot;
+
+    /* each loader runs only if the previous one succeeded */
+    if (get_cb_hdr() || get_cb_timeout_hdr() || get_cb_timeout()) {
+        return;
+    }
+
+    DPFOFF(hdrs.timeout_hdr_p);
+    DPFSO0("callback_state_timeout_header");
+    DPFX1("magic", hdrs.timeout_hdr.magic);
+    DPFV1("len", "u", hdrs.timeout_hdr.len);
+    DPFV1("records", "u", hdrs.timeout_hdr.records);
+    DPFSC0;
+
+    if (!(hdrs.timeout_hdr.magic == CALLBACK_STATE_TIMEOUT_MAGIC)) {
+        fprintf(stderr, "* magic check failed\n");
+    }
+
+    DPFOFF(hdrs.timeout_p);
+    DPFAO0("timeout");
+    slot = 0;
+    while (slot < 127) {
+        DPFAE("u", hdrs.timeout[slot]);
+        if ((slot % 8) == 7) {
+            /* row is full: start a new, indented one */
+            DPFAN;
+            DPFA1;
+        }
+        slot++;
+    }
+    /* final slot is printed without a trailing separator */
+    DPFALE("u", hdrs.timeout[127]);
+    DPFAC0;
+}
+
+/*
+ * print the FE hash header followed by the hash bucket array, eight values
+ * per row.  prints nothing if any of the loaders reports failure.
+ *
+ * a header that reports zero records yields an empty array; without this
+ * "records - 1" would wrap (or go negative) and the loop and final element
+ * access would read outside the table.
+ */
+static void
+dump_cb_fehash(void)
+{
+    afs_uint32 i;
+    afs_uint32 nrec;
+
+    if (get_cb_hdr())
+        return;
+
+    if (get_cb_fehash_hdr())
+        return;
+
+    if (get_cb_fehash())
+        return;
+
+    DPFOFF(hdrs.fehash_hdr_p);
+    DPFSO0("callback_state_fehash_header");
+    DPFX1("magic", hdrs.fehash_hdr.magic);
+    DPFV1("len", "u", hdrs.fehash_hdr.len);
+    DPFV1("records", "u", hdrs.fehash_hdr.records);
+    DPFSC0;
+
+    if (hdrs.fehash_hdr.magic != CALLBACK_STATE_FEHASH_MAGIC) {
+        fprintf(stderr, "* magic check failed\n");
+    }
+
+    /* NOTE(review): assumes records is an unsigned 32-bit count, as its
+     * "%u" formatting above suggests -- confirm against the header struct */
+    nrec = hdrs.fehash_hdr.records;
+
+    DPFOFF(hdrs.fehash_p);
+    DPFAO0("fehash");
+    for (i = 0; i < nrec; i++) {
+        if (i + 1 == nrec) {
+            /* last bucket: no trailing separator */
+            DPFALE("u", hdrs.fehash[i]);
+            break;
+        }
+        DPFAE("u", hdrs.fehash[i]);
+        if ((i % 8) == 7) {
+            DPFAN;
+            DPFA1;
+        }
+    }
+    DPFAC0;
+}
+
+/*
+ * dump every host entry, from index 0 up to the record count in the
+ * host_state_header.
+ */
+static void
+dump_all_hes(void)
+{
+    int rec = 0;
+
+    if (get_h_hdr() != 0) {
+        fprintf(stderr, "error getting host_state_header\n");
+        return;
+    }
+
+    while (rec < hdrs.h_hdr.records) {
+        dump_he(rec);
+        rec++;
+    }
+}
+
+/*
+ * dump every FileEntry, from index 0 up to the nFEs count in the
+ * callback_state_header.
+ */
+static void
+dump_all_fes(void)
+{
+    int rec = 0;
+
+    if (get_cb_hdr() != 0) {
+        fprintf(stderr, "error getting callback_state_header\n");
+        return;
+    }
+
+    while (rec < hdrs.cb_hdr.nFEs) {
+        dump_fe(rec);
+        rec++;
+    }
+}
+
+/*
+ * dump every CallBack of the currently selected FileEntry, from index 0 up
+ * to the nCBs count in that FE's entry header.
+ */
+static void
+dump_all_cbs(void)
+{
+    int rec = 0;
+
+    if (get_fe_hdr() != 0) {
+        fprintf(stderr, "error getting callback_state_entry_header\n");
+        return;
+    }
+
+    while (rec < fe_cursor.hdr.nCBs) {
+        dump_cb(rec);
+        rec++;
+    }
+}
+
+/*
+ * select host entry idx via get_he() and print its entry header, disk
+ * entry, interface list and hcps data.  reports an error on stderr and
+ * prints nothing else if get_he() fails.
+ */
+static void
+dump_he(afs_uint32 idx)
+{
+    if (get_he(idx)) {
+        /* idx is unsigned: %u keeps large indices from printing negative */
+        fprintf(stderr, "error getting he %u\n", idx);
+        return;
+    }
+
+    DPFOFF(he_cursor.cursor);
+    dump_he_hdr();
+    dump_he_entry();
+    dump_he_interfaces();
+    dump_he_hcps();
+}
+
+/*
+ * select FileEntry idx via get_fe() and print its entry header and disk
+ * entry.  reports an error on stderr and prints nothing else if get_fe()
+ * fails.
+ */
+static void
+dump_fe(afs_uint32 idx)
+{
+    if (get_fe(idx)) {
+        /* idx is unsigned: %u keeps large indices from printing negative */
+        fprintf(stderr, "error getting fe %u\n", idx);
+        return;
+    }
+
+    DPFOFF(fe_cursor.cursor);
+    dump_fe_hdr();
+    dump_fe_entry();
+}
+
+/*
+ * select CallBack idx via get_cb() and print its disk entry.  reports an
+ * error on stderr and prints nothing else if get_cb() fails.
+ */
+static void
+dump_cb(afs_uint32 idx)
+{
+    if (get_cb(idx)) {
+        /* idx is unsigned: %u keeps large indices from printing negative */
+        fprintf(stderr, "error getting cb %u\n", idx);
+        return;
+    }
+
+    DPFOFF(cb_cursor.cursor);
+    dump_cb_entry();
+}
+
+/* re-display the host entry the host cursor currently points at */
+static void
+dump_this_he(void)
+{
+    afs_uint32 current = he_cursor.idx;
+
+    dump_he(current);
+}
+
+/* re-display the FileEntry the FE cursor currently points at */
+static void
+dump_this_fe(void)
+{
+    afs_uint32 current = fe_cursor.idx;
+
+    dump_fe(current);
+}
+
+/* re-display the CallBack the CB cursor currently points at */
+static void
+dump_this_cb(void)
+{
+    afs_uint32 current = cb_cursor.idx;
+
+    dump_cb(current);
+}
+
+/*
+ * display the host entry after the current one, or complain if the cursor
+ * is already at the last record.
+ */
+static void
+dump_next_he(void)
+{
+    afs_uint32 next_idx;
+
+    if (get_h_hdr() != 0) {
+        fprintf(stderr, "error getting host_state_header\n");
+        return;
+    }
+
+    next_idx = he_cursor.idx + 1;
+    if (next_idx < hdrs.h_hdr.records) {
+        dump_he(next_idx);
+    } else {
+        fprintf(stderr, "no more HEs\n");
+    }
+}
+
+/*
+ * display the FileEntry after the current one, or complain if the cursor
+ * is already at the last FE.
+ */
+static void
+dump_next_fe(void)
+{
+    afs_uint32 next_idx;
+
+    if (get_cb_hdr() != 0) {
+        fprintf(stderr, "error getting callback_state_header\n");
+        return;
+    }
+
+    next_idx = fe_cursor.idx + 1;
+    if (next_idx < hdrs.cb_hdr.nFEs) {
+        dump_fe(next_idx);
+    } else {
+        fprintf(stderr, "no more FEs\n");
+    }
+}
+
+/*
+ * display the CallBack after the current one within the selected FE, or
+ * complain if the cursor is already at that FE's last CB.
+ */
+static void
+dump_next_cb(void)
+{
+    afs_uint32 next_idx;
+
+    if (get_fe_hdr() != 0) {
+        fprintf(stderr, "error getting callback_state_entry_header\n");
+        return;
+    }
+
+    next_idx = cb_cursor.idx + 1;
+    if (next_idx < fe_cursor.hdr.nCBs) {
+        dump_cb(next_idx);
+    } else {
+        fprintf(stderr, "no more CBs\n");
+    }
+}
+
+/*
+ * display the host entry before the current one, or complain if the cursor
+ * is already at index 0.
+ */
+static void
+dump_prev_he(void)
+{
+    if (he_cursor.idx != 0) {
+        dump_he(he_cursor.idx - 1);
+    } else {
+        fprintf(stderr, "no more HEs\n");
+    }
+}
+
+/*
+ * display the FileEntry before the current one, or complain if the cursor
+ * is already at index 0.
+ */
+static void
+dump_prev_fe(void)
+{
+    if (fe_cursor.idx != 0) {
+        dump_fe(fe_cursor.idx - 1);
+    } else {
+        fprintf(stderr, "no more FEs\n");
+    }
+}
+
+/*
+ * display the CallBack before the current one, or complain if the cursor
+ * is already at index 0.
+ */
+static void
+dump_prev_cb(void)
+{
+    if (cb_cursor.idx != 0) {
+        dump_cb(cb_cursor.idx - 1);
+    } else {
+        fprintf(stderr, "no more CBs\n");
+    }
+}
+
+/*
+ * display FileEntry 0, or complain if the callback_state_header reports
+ * that the dump holds no FEs.
+ */
+static void
+dump_first_fe(void)
+{
+    if (get_cb_hdr() != 0) {
+        fprintf(stderr, "error getting callback_state_header\n");
+        return;
+    }
+
+    if (hdrs.cb_hdr.nFEs == 0) {
+        fprintf(stderr, "no FEs present\n");
+    } else {
+        dump_fe(0);
+    }
+}
+
+/*
+ * display host entry 0, or complain if the host_state_header reports that
+ * the dump holds no host records.
+ */
+static void
+dump_first_he(void)
+{
+    if (get_h_hdr() != 0) {
+        fprintf(stderr, "error getting host_state_header\n");
+        return;
+    }
+
+    if (hdrs.h_hdr.records == 0) {
+        fprintf(stderr, "no HEs present\n");
+    } else {
+        dump_he(0);
+    }
+}
+
+/*
+ * display CallBack 0 of the selected FileEntry, or complain if that FE's
+ * entry header reports no CBs.
+ */
+static void
+dump_first_cb(void)
+{
+    if (get_fe_hdr() != 0) {
+        fprintf(stderr, "error getting callback_state_entry_header\n");
+        return;
+    }
+
+    if (fe_cursor.hdr.nCBs == 0) {
+        fprintf(stderr, "no CBs present\n");
+    } else {
+        dump_cb(0);
+    }
+}
+
+/*
+ * display the highest-indexed host entry, or complain if the
+ * host_state_header reports that the dump holds no host records.
+ */
+static void
+dump_last_he(void)
+{
+    if (get_h_hdr() != 0) {
+        fprintf(stderr, "error getting host_state_header\n");
+        return;
+    }
+
+    if (hdrs.h_hdr.records == 0) {
+        fprintf(stderr, "no HEs present\n");
+    } else {
+        dump_he(hdrs.h_hdr.records - 1);
+    }
+}
+
+/*
+ * Dump the final file entry (index nFEs-1), provided the
+ * callback_state_header can be loaded and reports at least one file entry.
+ */
+static void
+dump_last_fe(void)
+{
+    if (get_cb_hdr() != 0) {
+       fprintf(stderr, "error getting callback_state_header\n");
+       return;
+    }
+
+    if (hdrs.cb_hdr.nFEs != 0) {
+       dump_fe(hdrs.cb_hdr.nFEs-1);
+       return;
+    }
+
+    fprintf(stderr, "no FEs present\n");
+}
+
+/*
+ * Dump the final callback entry (index nCBs-1) of the currently selected
+ * file entry, provided its header reports at least one callback.
+ */
+static void
+dump_last_cb(void)
+{
+    if (get_fe_hdr() != 0) {
+       fprintf(stderr, "error getting callback_state_entry_header\n");
+       return;
+    }
+
+    if (fe_cursor.hdr.nCBs != 0) {
+       dump_cb(fe_cursor.hdr.nCBs-1);
+       return;
+    }
+
+    fprintf(stderr, "no CBs present\n");
+}
+
+/*
+ * Print the fields of the host_state_entry_header currently held in
+ * he_cursor.hdr (magic, len, interfaces, hcps) through the DPF* output
+ * macros, which are defined outside this part of the file.
+ *
+ * The header is not re-read here; whatever was last loaded into
+ * he_cursor.hdr is what gets printed.
+ */
+static void
+dump_he_hdr(void)
+{
+    DPFSO0("host_state_entry_header");
+    DPFX1("magic", he_cursor.hdr.magic);
+    DPFV1("len", "u", he_cursor.hdr.len);
+    DPFV1("interfaces", "u", he_cursor.hdr.interfaces);
+    DPFV1("hcps", "u", he_cursor.hdr.hcps);
+    DPFSC0;
+
+    /* warn (on stderr) when the entry does not carry the expected magic */
+    if (he_cursor.hdr.magic != HOST_STATE_ENTRY_MAGIC) {
+       fprintf(stderr, "* magic check failed\n");
+    }
+}
+
+/*
+ * Print the fields of the hostDiskEntry currently held in he_cursor.he
+ * through the DPF* output macros (defined outside this part of the file).
+ */
+static void
+dump_he_entry(void)
+{
+    DPFSO0("hostDiskEntry");
+    DPFS1("host", afs_inet_ntoa(he_cursor.he.host));
+    DPFV1("port", "u", he_cursor.he.port);
+    DPFX1("hostFlags", he_cursor.he.hostFlags);
+    DPFV1("Console", "u", he_cursor.he.Console);
+    DPFV1("hcpsfailed", "u", he_cursor.he.hcpsfailed);
+    DPFV1("hcps_valid", "u", he_cursor.he.hcps_valid);
+    /*
+     * InSameNetwork is only printed when the state file's top-level header
+     * has stats_detailed set.  A build without FS_STATS_DETAILED has no
+     * InSameNetwork member, so the value is taken from padding1[0] instead.
+     */
+    if (hdrs.hdr.stats_detailed) {
+#ifdef FS_STATS_DETAILED
+       DPFV1("InSameNetwork", "u", he_cursor.he.InSameNetwork);
+#else
+       DPFV1("InSameNetwork", "u", he_cursor.he.padding1[0]);
+#endif
+    }
+    DPFV1("hcps_len", "u", he_cursor.he.hcps_len);
+    DPFT1("LastCall", he_cursor.he.LastCall);
+    DPFT1("ActiveCall", he_cursor.he.ActiveCall);
+    DPFT1("cpsCall", he_cursor.he.cpsCall);
+    DPFV1("cblist", "u", he_cursor.he.cblist);
+    DPFV1("index", "u", he_cursor.he.index);
+    DPFSC0;
+}
+
+/*
+ * Dump the Interface record (uuid plus address/port list) that
+ * he_cursor.ifp points at for the current host entry.
+ *
+ * The record length is derived from the interface count in the host entry
+ * header.  That count comes straight from the state file, so the record is
+ * checked against the end of the memory map before it is copied into a
+ * private buffer; a record that does not fit is reported and skipped.
+ *
+ * Nothing is printed when the header reports zero interfaces.
+ */
+static void
+dump_he_interfaces(void)
+{
+    char temp_str[40];
+    struct Interface * ifp;
+    char * base = (char *) map;
+    char * src = (char *) he_cursor.ifp;
+    size_t avail, len;
+    int i;
+
+    if (!he_cursor.hdr.interfaces)
+       return;
+
+    /* the record has to start inside the map ... */
+    if ((src < base) || ((size_t)(src - base) > map_len)) {
+       fprintf(stderr, "Interface struct lies outside of memory map\n");
+       return;
+    }
+    avail = map_len - (size_t)(src - base);
+
+    /* ... and end inside it; phrased as a division so nothing can wrap */
+    if ((avail < sizeof(struct Interface)) ||
+       ((he_cursor.hdr.interfaces - 1) >
+        ((avail - sizeof(struct Interface)) / sizeof(struct AddrPort)))) {
+       fprintf(stderr, "Interface struct extends beyond end of memory map\n");
+       return;
+    }
+
+    len = sizeof(struct Interface) + ((he_cursor.hdr.interfaces-1)*sizeof(struct AddrPort));
+    ifp = (struct Interface *) malloc(len);
+    assert(ifp != NULL);
+
+    memcpy(ifp, src, len);
+
+    DPFSO0("Interface");
+    DPFV1("numberOfInterfaces", "u", ifp->numberOfInterfaces);
+
+    afsUUID_to_string(&ifp->uuid, temp_str, sizeof(temp_str));
+    DPFS1("uuid", temp_str);
+    /* iterate by the header's count: that is what sized the copy above */
+    for (i = 0; i < he_cursor.hdr.interfaces; i++) {
+       snprintf(temp_str, sizeof(temp_str), "interface[%d]", i);
+       DPFSO1(temp_str);
+       DPFS2("addr", afs_inet_ntoa(ifp->interface[i].addr));
+       DPFV2("port", "u", ifp->interface[i].port);
+       DPFSC1;
+    }
+
+    DPFSC0;
+
+    if (he_cursor.hdr.interfaces != ifp->numberOfInterfaces) {
+       fprintf(stderr, "* interface count mismatch between header and Interface struct\n");
+    }
+    free(ifp);
+}
+
+/*
+ * Dump the host CPS array that he_cursor.hcps points at for the current
+ * host entry.
+ *
+ * The element count comes from the host entry header in the state file, so
+ * the array is checked against the end of the memory map before it is
+ * copied into a private buffer.  Nothing is printed when the count is zero.
+ */
+static void
+dump_he_hcps(void)
+{
+    afs_int32 * hcps;
+    char * base = (char *) map;
+    char * src = (char *) he_cursor.hcps;
+    size_t len;
+    int i;
+
+    if (!he_cursor.hdr.hcps)
+       return;
+
+    /* division form keeps a huge count from wrapping the byte length */
+    if ((src < base) || ((size_t)(src - base) > map_len) ||
+       (he_cursor.hdr.hcps >
+        ((map_len - (size_t)(src - base)) / sizeof(afs_uint32)))) {
+       fprintf(stderr, "hcps array extends beyond end of memory map\n");
+       return;
+    }
+
+    len = (he_cursor.hdr.hcps)*sizeof(afs_uint32);
+    hcps = (afs_int32 *) malloc(len);
+    assert(hcps != NULL);
+    memcpy(hcps, src, len);
+
+    DPFSO0("hcps");
+    DPFAO1("prlist_val");
+    /* all but the last element; DPFAN/DPFA2 are issued after every 8th one */
+    for (i = 0; i < he_cursor.hdr.hcps - 1; i++) {
+       DPFAE("d", hcps[i]);
+       if ((i % 8) == 7) {
+           DPFAN;
+           DPFA2;
+       }
+    }
+    /* the final element goes through the "last element" macro */
+    DPFALE("d", hcps[he_cursor.hdr.hcps-1]);
+    DPFAC1;
+    DPFSC0;
+    free(hcps);
+}
+
+/*
+ * Print the fields of the callback_state_entry_header currently held in
+ * fe_cursor.hdr (magic, len, nCBs) through the DPF* output macros, which
+ * are defined outside this part of the file.
+ */
+static void
+dump_fe_hdr(void)
+{
+    DPFSO0("callback_state_entry_header");
+    DPFX1("magic", fe_cursor.hdr.magic);
+    DPFV1("len", "u", fe_cursor.hdr.len);
+    DPFV1("nCBs", "u", fe_cursor.hdr.nCBs);
+    DPFSC0;
+
+    /* warn (on stderr) when the entry does not carry the expected magic */
+    if (fe_cursor.hdr.magic != CALLBACK_STATE_ENTRY_MAGIC) {
+       fprintf(stderr, "* magic check failed\n");
+    }
+}
+
+/*
+ * Print the FEDiskEntry currently held in fe_cursor.fe: the embedded "fe"
+ * member's fields as a nested group, followed by the entry's index.
+ * Output goes through the DPF* macros (defined outside this part of the
+ * file).
+ */
+static void
+dump_fe_entry(void)
+{
+    DPFSO0("FEDiskEntry");
+    DPFSO1("fe");
+    DPFV2("vnode", "u", fe_cursor.fe.fe.vnode);
+    DPFV2("unique", "u", fe_cursor.fe.fe.unique);
+    DPFV2("volid", "u", fe_cursor.fe.fe.volid);
+    DPFV2("fnext", "u", fe_cursor.fe.fe.fnext);
+    DPFV2("ncbs", "u", fe_cursor.fe.fe.ncbs);
+    DPFV2("firstcb", "u", fe_cursor.fe.fe.firstcb);
+    DPFV2("status", "u", fe_cursor.fe.fe.status);
+    DPFSC1;
+    DPFV1("index", "u", fe_cursor.fe.index);
+    DPFSC0;
+}
+
+/*
+ * Print the CBDiskEntry currently held in cb_cursor.cb: the embedded "cb"
+ * member's fields as a nested group, followed by the entry's index.
+ * Output goes through the DPF* macros (defined outside this part of the
+ * file).
+ */
+static void
+dump_cb_entry(void)
+{
+    DPFSO0("CBDiskEntry");
+    DPFSO1("cb");
+    DPFV2("cnext", "u", cb_cursor.cb.cb.cnext);
+    DPFV2("fhead", "u", cb_cursor.cb.cb.fhead);
+    /* thead and status are cast to afs_uint32 to match the "u" format */
+    DPFV2("thead", "u", (afs_uint32)cb_cursor.cb.cb.thead);
+    DPFV2("status", "u", (afs_uint32)cb_cursor.cb.cb.status);
+    DPFV2("hhead", "u", cb_cursor.cb.cb.hhead);
+    DPFV2("tprev", "u", cb_cursor.cb.cb.tprev);
+    DPFV2("tnext", "u", cb_cursor.cb.cb.tnext);
+    DPFV2("hprev", "u", cb_cursor.cb.cb.hprev);
+    DPFV2("hnext", "u", cb_cursor.cb.cb.hnext);
+    DPFSC1;
+    DPFV1("index", "u", cb_cursor.cb.index);
+    DPFSC0;
+}
+
+/*
+ * Hex dump formatting helpers used by hexdump_map():
+ *   DPFHMS       two-space gap, used between the two halves of a row
+ *   DPFHS        four spaces: the width of one DPFHD cell, used as padding
+ *   DPFHN(off)   start a new row: newline, decimal offset, tab
+ *   DPFHD(x)     one byte as two upper-case hex digits plus two spaces
+ *   DPFHE        terminate the dump with a newline
+ */
+#define DPFHMS printf("  ")
+#define DPFHS printf("    ")
+#define DPFHN(offset) printf("\n%u\t", offset)
+#define DPFHD(x) printf("%02X  ", x)
+#define DPFHE printf("\n")
+
+/*
+ * Hex dump len bytes of the memory map starting at byte offset `offset'.
+ *
+ * Output is laid out 16 bytes per row, each row prefixed by the offset of
+ * its first byte, with an extra gap after the eighth column.  When the
+ * dump does not start on a 16-byte boundary the first row is padded so the
+ * bytes land in their proper columns.
+ *
+ * Nothing is printed for len == 0.  A range that does not fit inside the
+ * map is reported on stderr and not dumped.
+ */
+static void
+hexdump_map(afs_uint32 offset, afs_uint32 len)
+{
+    afs_uint32 i;
+    unsigned char * p = (unsigned char *)map;
+
+    if (!len)
+       return;
+
+    /*
+     * Compare against the space left after offset rather than forming
+     * offset + len, which can wrap around and slip past the check.
+     */
+    if ((offset > map_len) || (len > (map_len - offset))) {
+       fprintf(stderr, "offset + length exceeds memory map size (%u + %u > %u)\n",
+               offset, len, map_len);
+       return;
+    }
+
+    p += offset;
+    DPFOFF(p);
+
+    /*
+     * A partial first row needs its own header and padding.  An aligned
+     * start gets its header from the loop below, so emitting one here as
+     * well would print the row label twice.
+     */
+    if (offset % 16) {
+       DPFHN(offset);
+       for (i = offset % 16; i > 0; i--) {
+           DPFHS;
+       }
+       /* starting past column 8: the loop will not emit the mid-row gap */
+       if ((offset % 16) > 8) {
+           DPFHMS;
+       }
+    }
+
+    for (i=0; i < len; i++, p++, offset++) {
+       if (!(offset % 16)) {
+           DPFHN(offset);
+       } else if (!(offset % 8)) {
+           DPFHMS;
+       }
+       DPFHD(*p);
+    }
+    DPFHE;
+}
+
+/*
+ * Make sure hdrs.hdr holds a copy of the fs_state_header found at the very
+ * start of the memory map.  The copy is made once and then reused.
+ *
+ * Returns 0 on success, 1 when the map is too small to hold the header.
+ */
+static int
+get_hdr(void)
+{
+    /* already cached by an earlier call */
+    if (hdrs.hdr_valid)
+       return 0;
+
+    if (map_len < sizeof(struct fs_state_header)) {
+       fprintf(stderr, "corrupt state dump: fs_state_header larger than memory map\n");
+       return 1;
+    }
+
+    memcpy(&hdrs.hdr, map, sizeof(hdrs.hdr));
+    hdrs.hdr_p = map;
+    hdrs.hdr_valid = 1;
+
+    return 0;
+}
+
+/*
+ * Make sure hdrs.h_hdr holds a copy of the host_state_header, which lives
+ * at the offset given by fs_state_header.h_offset, and point he_cursor.fh
+ * at the first byte following that header (where get_he() expects host
+ * entry 0).
+ *
+ * The copy is made once; later calls return immediately.
+ *
+ * Returns 0 on success, 1 when the top-level header cannot be read or the
+ * offset does not leave room for a host_state_header inside the map.
+ */
+static int
+get_h_hdr(void)
+{
+    char * buf;
+    afs_uint32 hi, lo;
+
+    if (hdrs.h_hdr_valid)
+       return 0;
+
+    if (get_hdr())
+       return 1;
+
+    SplitInt64(hdrs.hdr.h_offset, hi, lo);
+
+    /* only offsets that fit in the low 32 bits are supported */
+    if (hi) {
+       fprintf(stderr, "hi offset bits set in h_offset; can't get host_state_header\n");
+       return 1;
+    }
+    if ((lo >= map_len) || 
+       ((lo + sizeof(struct host_state_header)) > map_len) ||
+       (lo + sizeof(struct host_state_header) < lo)) {
+       fprintf(stderr, "h_offset puts host_state_header beyond end of memory map\n");
+       return 1;
+    }
+
+    buf = (char *) map;
+    buf += lo;
+    memcpy(&hdrs.h_hdr, buf, sizeof(struct host_state_header));
+    hdrs.h_hdr_p = buf;
+    /*
+     * Record that the copy is usable.  Without this the early return at
+     * the top never fires and the header is re-validated and re-copied on
+     * every call.
+     */
+    hdrs.h_hdr_valid = 1;
+    buf += sizeof(struct host_state_header);
+    he_cursor.fh = (void *)buf;
+    return 0;
+}
+
+/*
+ * Make sure hdrs.cb_hdr holds a copy of the callback_state_header, which
+ * lives at the offset given by fs_state_header.cb_offset, and point
+ * fe_cursor.ffe at the offset given by that header's fe_offset (where
+ * get_fe() expects file entry 0).
+ *
+ * The header is only marked valid once BOTH offsets have been checked, so
+ * a state file with a bad fe_offset keeps failing on every call instead of
+ * reporting success with fe_cursor.ffe never set.
+ *
+ * Returns 0 on success, 1 on any validation failure.
+ */
+static int
+get_cb_hdr(void)
+{
+    char * buf;
+    afs_uint32 hi, lo;
+    size_t remaining, fe_size;
+
+    if (hdrs.cb_hdr_valid)
+       return 0;
+
+    if (get_hdr())
+       return 1;
+
+    SplitInt64(hdrs.hdr.cb_offset, hi, lo);
+
+    /* only offsets that fit in the low 32 bits are supported */
+    if (hi) {
+       fprintf(stderr, "hi offset bits set in cb_offset; can't get callback_state_header\n");
+       return 1;
+    }
+    if ((lo >= map_len) || 
+       ((lo + sizeof(struct callback_state_header)) > map_len) ||
+       (lo + sizeof(struct callback_state_header) < lo)) {
+       fprintf(stderr, "cb_offset puts callback_state_header beyond end of memory map\n");
+       return 1;
+    }
+
+    buf = (char *) map;
+    buf += lo;
+    memcpy(&hdrs.cb_hdr, buf, sizeof(struct callback_state_header));
+    hdrs.cb_hdr_p = buf;
+
+    SplitInt64(hdrs.cb_hdr.fe_offset, hi, lo);
+
+    if (hi) {
+       fprintf(stderr, "hi offset bits set in fe_offset; can't get callback_state_entry_header\n");
+       return 1;
+    }
+
+    /*
+     * The FE area must hold nFEs (entry header + FEDiskEntry) records plus
+     * nCBs CBDiskEntry records.  The counts come from the file, so the
+     * test divides the space that is left instead of multiplying the
+     * counts out, which could wrap and slip past the check.
+     */
+    fe_size = sizeof(struct callback_state_entry_header) + sizeof(struct FEDiskEntry);
+    remaining = (lo <= map_len) ? (map_len - lo) : 0;
+    if ((lo > map_len) ||
+       (hdrs.cb_hdr.nFEs > (remaining / fe_size)) ||
+       (hdrs.cb_hdr.nCBs >
+        ((remaining - (hdrs.cb_hdr.nFEs * fe_size)) / sizeof(struct CBDiskEntry)))) {
+       fprintf(stderr, "fe_offset puts callback_state_entry_header beyond end of memory map\n");
+       return 1;
+    }
+
+    buf = (char *) map;
+    buf += lo;
+    fe_cursor.ffe = (void *)buf;
+
+    /* everything checked out: later calls may use the early return */
+    hdrs.cb_hdr_valid = 1;
+
+    return 0;
+}
+
+/*
+ * Make sure hdrs.timeout_hdr holds a copy of the
+ * callback_state_timeout_header found at callback_state_header's
+ * timeout_offset, and point hdrs.timeout_p at the first byte after it.
+ *
+ * Returns 0 on success (or when already cached), 1 on failure.
+ */
+static int
+get_cb_timeout_hdr(void)
+{
+    char * hdr_pos;
+    afs_uint32 hi, lo;
+
+    if (hdrs.timeout_hdr_valid)
+       return 0;
+
+    if (get_cb_hdr())
+       return 1;
+
+    SplitInt64(hdrs.cb_hdr.timeout_offset, hi, lo);
+
+    /* only offsets that fit in the low 32 bits are supported */
+    if (hi != 0) {
+       fprintf(stderr, "hi offset bits set in timeout_offset; can't get callback_state_timeout_header\n");
+       return 1;
+    }
+
+    /* the whole header has to fit between lo and the end of the map */
+    if ((lo >= map_len) ||
+       ((lo + sizeof(struct callback_state_timeout_header)) > map_len) ||
+       (lo + sizeof(struct callback_state_timeout_header) < lo)) {
+       fprintf(stderr, "timeout_offset puts callback_state_timeout_header beyond end of memory map\n");
+       return 1;
+    }
+
+    hdr_pos = ((char *) map) + lo;
+    memcpy(&hdrs.timeout_hdr, hdr_pos, sizeof(struct callback_state_timeout_header));
+    hdrs.timeout_hdr_p = hdr_pos;
+    hdrs.timeout_hdr_valid = 1;
+    hdrs.timeout_p = hdr_pos + sizeof(struct callback_state_timeout_header);
+
+    return 0;
+}
+
+/*
+ * Make sure hdrs.timeout holds a heap copy of the afs_uint32 table that
+ * follows the callback_state_timeout_header; the table length is the
+ * header's records field.  The copy is made once and then reused.
+ *
+ * The record count comes from the state file, so the table is checked
+ * against the end of the memory map before it is copied.
+ *
+ * Returns 0 on success, 1 when the header cannot be loaded or the table
+ * does not fit inside the map.
+ */
+static int
+get_cb_timeout(void)
+{
+    char * base = (char *) map;
+    char * src;
+    size_t nrec;
+
+    if (hdrs.timeout)
+       return 0;
+
+    if (get_cb_timeout_hdr())
+       return 1;
+
+    src = (char *) hdrs.timeout_p;
+    nrec = hdrs.timeout_hdr.records;
+
+    /* division form keeps a huge record count from wrapping the length */
+    if ((src < base) || ((size_t)(src - base) > map_len) ||
+       (nrec > ((map_len - (size_t)(src - base)) / sizeof(afs_uint32)))) {
+       fprintf(stderr, "callback timeout table extends beyond end of memory map\n");
+       return 1;
+    }
+
+    /*
+     * Always allocate at least one slot: calloc() with a zero count is
+     * allowed to return NULL, which would trip the assert below for a
+     * state file that legitimately has no records.
+     */
+    hdrs.timeout = (afs_uint32 *) calloc(nrec ? nrec : 1, sizeof(afs_uint32));
+    assert(hdrs.timeout != NULL);
+    memcpy(hdrs.timeout, src, nrec * sizeof(afs_uint32));
+    return 0;
+}
+
+/*
+ * Make sure hdrs.fehash_hdr holds a copy of the
+ * callback_state_fehash_header found at callback_state_header's
+ * fehash_offset, and point hdrs.fehash_p at the first byte after it.
+ *
+ * Returns 0 on success (or when already cached), 1 on failure.
+ */
+static int
+get_cb_fehash_hdr(void)
+{
+    char * buf;
+    afs_uint32 hi, lo;
+
+    if (hdrs.fehash_hdr_valid)
+       return 0;
+
+    if (get_cb_hdr())
+       return 1;
+
+    SplitInt64(hdrs.cb_hdr.fehash_offset, hi, lo);
+
+    /* only offsets that fit in the low 32 bits are supported */
+    if (hi) {
+       fprintf(stderr, "hi offset bits set in fehash_offset; can't get callback_state_fehash_header\n");
+       return 1;
+    }
+    if ((lo >= map_len) || 
+       ((lo + sizeof(struct callback_state_fehash_header)) > map_len) ||
+       (lo + sizeof(struct callback_state_fehash_header) < lo)) {
+       /* name the field that was actually checked (fehash_offset) */
+       fprintf(stderr, "fehash_offset puts callback_state_fehash_header beyond end of memory map\n");
+       return 1;
+    }
+
+    buf = (char *) map;
+    buf += lo;
+    memcpy(&hdrs.fehash_hdr, buf, sizeof(struct callback_state_fehash_header));
+    hdrs.fehash_hdr_p = buf;
+    hdrs.fehash_hdr_valid = 1;
+    buf += sizeof(struct callback_state_fehash_header);
+    hdrs.fehash_p = buf;
+
+    return 0;
+}
+
+/*
+ * Make sure hdrs.fehash holds a heap copy of the afs_uint32 table that
+ * follows the callback_state_fehash_header; the table length is the
+ * header's records field.  The copy is made once and then reused.
+ *
+ * The record count comes from the state file, so the table is checked
+ * against the end of the memory map before it is copied.
+ *
+ * Returns 0 on success, 1 when the header cannot be loaded or the table
+ * does not fit inside the map.
+ */
+static int
+get_cb_fehash(void)
+{
+    char * base = (char *) map;
+    char * src;
+    size_t nrec;
+
+    if (hdrs.fehash)
+       return 0;
+
+    if (get_cb_fehash_hdr())
+       return 1;
+
+    src = (char *) hdrs.fehash_p;
+    nrec = hdrs.fehash_hdr.records;
+
+    /* division form keeps a huge record count from wrapping the length */
+    if ((src < base) || ((size_t)(src - base) > map_len) ||
+       (nrec > ((map_len - (size_t)(src - base)) / sizeof(afs_uint32)))) {
+       fprintf(stderr, "callback fehash table extends beyond end of memory map\n");
+       return 1;
+    }
+
+    /*
+     * Always allocate at least one slot: calloc() with a zero count is
+     * allowed to return NULL, which would trip the assert below for a
+     * state file that legitimately has no records.
+     */
+    hdrs.fehash = (afs_uint32 *) calloc(nrec ? nrec : 1, sizeof(afs_uint32));
+    assert(hdrs.fehash != NULL);
+    memcpy(hdrs.fehash, src, nrec * sizeof(afs_uint32));
+    return 0;
+}
+
+/*
+ * Position the host entry cursor on entry idx and load that entry's header
+ * and hostDiskEntry into he_cursor.
+ *
+ * Entries are variable length (each header carries its own len), so entry
+ * idx can only be located by stepping from entry idx-1.  he_cache.cursor[]
+ * remembers the map address of every entry visited so far; missing
+ * predecessors are filled in by calling this function recursively.
+ *
+ * Returns 0 on success, 1 when the host_state_header cannot be loaded, idx
+ * is out of range, or a header/entry read fails.
+ *
+ * NOTE(review): the entry addresses derived from hdr.len are not checked
+ * against the end of the memory map here, and the return value of the
+ * recursive get_he(i) calls is ignored.
+ */
+static int
+get_he(afs_uint32 idx)
+{
+    int i;
+    char * p;
+
+    if (get_h_hdr())
+       return 1;
+
+    if (idx >= hdrs.h_hdr.records)
+       return 1;
+
+    /* cursor already sits on a fully loaded entry idx: nothing to do */
+    if (he_cursor.idx == idx && he_cursor.hdr_valid && he_cursor.he_valid)
+       return 0;
+
+    he_cursor.hdr_valid = he_cursor.he_valid = 0;
+
+    /* lazily create the address cache, one slot per record */
+    if (he_cache.cursor == NULL) {
+       he_cache.cursor = (void **) calloc(hdrs.h_hdr.records, sizeof(void *));
+       assert(he_cache.cursor != NULL);
+    }
+
+    /* predecessor not located yet: visit every uncached entry below idx */
+    if (idx && he_cache.cursor[idx-1] == NULL) {
+       for (i = 0; i < idx; i++) {
+           if (he_cache.cursor[i] == NULL) {
+               get_he(i);
+           }
+       }
+    }
+
+    if (!idx) {
+       /* entry 0 starts right after the host_state_header */
+       he_cursor.cursor = he_cursor.fh;
+    } else if (he_cursor.cursor == he_cache.cursor[idx-1]) {
+       /*
+        * The cursor is on entry idx-1; step over it using the len in
+        * he_cursor.hdr, which is taken to still describe that entry.
+        */
+       p = (char *)he_cursor.cursor;
+       p += he_cursor.hdr.len;
+       he_cursor.cursor = (void *)p;
+    } else {
+       /* jump to the cached address of idx-1, reload its header, step */
+       he_cursor.cursor = he_cache.cursor[idx-1];
+       if (get_he_hdr())
+           return 1;
+       p = (char *)he_cursor.cursor;
+       p += he_cursor.hdr.len;
+       he_cursor.cursor = (void *)p;
+    }
+
+    he_cursor.idx = idx;
+    he_cache.cursor[idx] = he_cursor.cursor;
+
+    if (get_he_hdr())
+       return 1;
+    if (get_he_entry())
+       return 1;
+
+    return 0;
+}
+
+/*
+ * Copy the host_state_entry_header found at he_cursor.cursor into
+ * he_cursor.hdr and mark it valid.
+ *
+ * Always returns 0.  NOTE(review): he_cursor.cursor is used as-is; it is
+ * not checked for NULL or against the bounds of the memory map.
+ */
+static int
+get_he_hdr(void)
+{
+    memcpy(&he_cursor.hdr, he_cursor.cursor, sizeof(struct host_state_entry_header));
+    he_cursor.hdr_valid = 1;
+    return 0;
+}
+
+/*
+ * Load the hostDiskEntry that follows the entry header at he_cursor.cursor
+ * into he_cursor.he, then record where the trailing variable-length data
+ * starts: he_cursor.ifp right after the disk entry, and he_cursor.hcps
+ * after the Interface record (or at the same spot when the header reports
+ * no interfaces).
+ *
+ * Returns 0 on success, 1 when the entry header could not be loaded.
+ */
+static int
+get_he_entry(void)
+{
+    char * entry;
+    char * tail;
+
+    /* the header supplies the interface count used below */
+    if (!he_cursor.hdr_valid && get_he_hdr())
+       return 1;
+
+    entry = ((char *) he_cursor.cursor) + sizeof(struct host_state_entry_header);
+    memcpy(&he_cursor.he, entry, sizeof(struct hostDiskEntry));
+    he_cursor.he_valid = 1;
+
+    tail = entry + sizeof(struct hostDiskEntry);
+    he_cursor.ifp = (void *)tail;
+
+    /* skip the Interface record, if there is one, to reach the hcps data */
+    if (he_cursor.hdr.interfaces) {
+       tail += sizeof(struct Interface) + ((he_cursor.hdr.interfaces-1)*sizeof(struct AddrPort));
+    }
+    he_cursor.hcps = (void *)tail;
+
+    return 0;
+}
+
+/*
+ * Position the file entry cursor on entry idx and load that entry's header
+ * and FEDiskEntry into fe_cursor.
+ *
+ * Entries are variable length (each header carries its own len), so entry
+ * idx can only be located by stepping from entry idx-1.  fe_cache.cursor[]
+ * remembers the map address of every entry visited so far; missing
+ * predecessors are filled in by calling this function recursively.
+ *
+ * Every call, successful or not, first invalidates the cached callback
+ * entry in cb_cursor.
+ *
+ * Returns 0 on success, 1 when the callback_state_header cannot be loaded,
+ * idx is out of range, or a header/entry read fails.
+ *
+ * NOTE(review): the entry addresses derived from hdr.len are not checked
+ * against the end of the memory map here, and the return value of the
+ * recursive get_fe(i) calls is ignored.
+ */
+static int
+get_fe(afs_uint32 idx)
+{
+    int i;
+    char * p;
+
+    cb_cursor.cb_valid = 0;
+
+    if (get_cb_hdr())
+       return 1;
+
+    if (idx >= hdrs.cb_hdr.nFEs)
+       return 1;
+
+    /* cursor already sits on a fully loaded entry idx: nothing to do */
+    if (fe_cursor.idx == idx && fe_cursor.hdr_valid && fe_cursor.fe_valid)
+       return 0;
+
+    fe_cursor.hdr_valid = fe_cursor.fe_valid = 0;
+
+    /* lazily create the address cache, one slot per file entry */
+    if (fe_cache.cursor == NULL) {
+       fe_cache.cursor = (void **) calloc(hdrs.cb_hdr.nFEs, sizeof(void *));
+       assert(fe_cache.cursor != NULL);
+    }
+
+    /* predecessor not located yet: visit every uncached entry below idx */
+    if (idx && fe_cache.cursor[idx-1] == NULL) {
+       for (i = 0; i < idx; i++) {
+           if (fe_cache.cursor[i] == NULL) {
+               get_fe(i);
+           }
+       }
+    }
+
+    if (!idx) {
+       /* entry 0 sits at the address recorded by get_cb_hdr() */
+       fe_cursor.cursor = fe_cursor.ffe;
+    } else if (fe_cursor.cursor == fe_cache.cursor[idx-1]) {
+       /*
+        * The cursor is on entry idx-1; step over it using the len in
+        * fe_cursor.hdr, which is taken to still describe that entry.
+        */
+       p = (char *)fe_cursor.cursor;
+       p += fe_cursor.hdr.len;
+       fe_cursor.cursor = (void *)p;
+    } else {
+       /* jump to the cached address of idx-1, reload its header, step */
+       fe_cursor.cursor = fe_cache.cursor[idx-1];
+       if (get_fe_hdr())
+           return 1;
+       p = (char *)fe_cursor.cursor;
+       p += fe_cursor.hdr.len;
+       fe_cursor.cursor = (void *)p;
+    }
+
+    fe_cursor.idx = idx;
+    fe_cache.cursor[idx] = fe_cursor.cursor;
+
+    if (get_fe_hdr())
+       return 1;
+    if (get_fe_entry())
+       return 1;
+
+    return 0;
+}
+
+/*
+ * Copy the callback_state_entry_header found at fe_cursor.cursor into
+ * fe_cursor.hdr and mark it valid.
+ *
+ * Always returns 0.  NOTE(review): fe_cursor.cursor is used as-is; it is
+ * not checked for NULL or against the bounds of the memory map.
+ */
+static int
+get_fe_hdr(void)
+{
+    memcpy(&fe_cursor.hdr, fe_cursor.cursor, sizeof(struct callback_state_entry_header));
+    fe_cursor.hdr_valid = 1;
+    return 0;
+}
+
+/*
+ * Load the FEDiskEntry that follows the entry header at fe_cursor.cursor
+ * into fe_cursor.fe, and point fe_cursor.fcb at the first byte after it
+ * (where get_cb() expects this file entry's callback array).
+ *
+ * Returns 0 on success, 1 when the entry header could not be loaded.
+ */
+static int
+get_fe_entry(void)
+{
+    char * entry;
+
+    if (!fe_cursor.hdr_valid && get_fe_hdr())
+       return 1;
+
+    entry = ((char *) fe_cursor.cursor) + sizeof(struct callback_state_entry_header);
+    memcpy(&fe_cursor.fe, entry, sizeof(struct FEDiskEntry));
+    fe_cursor.fe_valid = 1;
+
+    fe_cursor.fcb = (void *)(entry + sizeof(struct FEDiskEntry));
+    return 0;
+}
+
+/*
+ * Position the callback cursor on callback idx of the currently selected
+ * file entry and load that CBDiskEntry into cb_cursor.
+ *
+ * Callback entries are fixed size and stored back to back starting at
+ * fe_cursor.fcb, so the address is computed directly from idx.  Both the
+ * callback count and the file entry position come from the state file, so
+ * the computed slot is checked against the memory map before it is read.
+ *
+ * Returns 0 on success, 1 when the file entry cannot be loaded, idx is out
+ * of range, or the slot does not lie inside the map.
+ */
+static int
+get_cb(afs_uint32 idx)
+{
+    char * base = (char *) map;
+    char * p;
+
+    /* (re)load the selected file entry; this also clears cb_valid */
+    if (get_fe(fe_cursor.idx))
+       return 1;
+
+    if (idx >= fe_cursor.hdr.nCBs)
+       return 1;
+
+    if (idx == cb_cursor.idx && cb_cursor.cb_valid)
+       return 0;
+
+    cb_cursor.cb_valid = 0;
+
+    p = (char *)fe_cursor.fcb;
+    p += idx * sizeof(struct CBDiskEntry);
+
+    /* refuse to read a slot that is not entirely inside the map */
+    if ((p < base) || ((size_t)(p - base) > map_len) ||
+       ((map_len - (size_t)(p - base)) < sizeof(struct CBDiskEntry))) {
+       fprintf(stderr, "CBDiskEntry extends beyond end of memory map\n");
+       return 1;
+    }
+
+    cb_cursor.cursor = (void *)p;
+
+    cb_cursor.idx = idx;
+
+    if (get_cb_entry())
+       return 1;
+
+    return 0;
+}
+
+/*
+ * Copy the CBDiskEntry found at cb_cursor.cursor into cb_cursor.cb and
+ * mark it valid.
+ *
+ * Always returns 0.  NOTE(review): cb_cursor.cursor is used as-is; it is
+ * not checked for NULL or against the bounds of the memory map.
+ */
+static int
+get_cb_entry(void)
+{
+    memcpy(&cb_cursor.cb, cb_cursor.cursor, sizeof(struct CBDiskEntry));
+    cb_cursor.cb_valid = 1;
+    return 0;
+}
+
+/*
+ * Scan the host entries in file order for the first one whose stored
+ * index field equals idx and hand it to dump_this_he().
+ *
+ * Returns 0 when a match was found, 1 when the host_state_header or an
+ * entry could not be loaded, or when no entry matches.
+ */
+static int
+find_he_by_index(afs_uint32 idx)
+{
+    int i;
+
+    if (get_h_hdr())
+       return 1;
+
+    for (i = 0; i < hdrs.h_hdr.records; i++) {
+       if (get_he(i)) {
+           fprintf(stderr, "error getting he %d\n", i);
+           return 1;
+       }
+       /* the cursor now sits on entry i, which is what gets dumped */
+       if (he_cursor.he.index == idx) {
+           dump_this_he();
+           return 0;
+       }
+    }
+
+    return 1;
+}
+
+/*
+ * Scan the file entries in file order for the first one whose stored
+ * index field equals idx and hand it to dump_this_fe().
+ *
+ * Returns 0 when a match was found, 1 when the callback_state_header or an
+ * entry could not be loaded, or when no entry matches.
+ */
+static int
+find_fe_by_index(afs_uint32 idx)
+{
+    int i;
+
+    if (get_cb_hdr())
+       return 1;
+
+    for (i = 0; i < hdrs.cb_hdr.nFEs; i++) {
+       if (get_fe(i)) {
+           fprintf(stderr, "error getting fe %d\n", i);
+           return 1;
+       }
+       /* the cursor now sits on entry i, which is what gets dumped */
+       if (fe_cursor.fe.index == idx) {
+           dump_this_fe();
+           return 0;
+       }
+    }
+
+    return 1;
+}
+
+/*
+ * Scan the file entries in file order for the first one whose
+ * volid/vnode/unique triple matches the arguments and hand it to
+ * dump_this_fe().
+ *
+ * Returns 0 when a match was found, 1 when the callback_state_header or an
+ * entry could not be loaded, or when no entry matches.
+ */
+static int
+find_fe_by_fid(afs_uint32 volid, afs_uint32 vnode, afs_uint32 unique)
+{
+    int i;
+
+    if (get_cb_hdr())
+       return 1;
+
+    for (i = 0; i < hdrs.cb_hdr.nFEs; i++) {
+       if (get_fe(i)) {
+           fprintf(stderr, "error getting fe %d\n", i);
+           return 1;
+       }
+       if (fe_cursor.fe.fe.unique != unique)
+           continue;
+       if (fe_cursor.fe.fe.volid != volid)
+           continue;
+       if (fe_cursor.fe.fe.vnode != vnode)
+           continue;
+
+       /* all three components match; the cursor sits on this entry */
+       dump_this_fe();
+       return 0;
+    }
+
+    return 1;
+}
+
+/*
+ * Scan the callbacks of the currently selected file entry for the first
+ * one whose stored index field equals idx and hand it to dump_this_cb().
+ *
+ * Returns 0 when a match was found, 1 when the file entry header or a
+ * callback could not be loaded, or when no callback matches.
+ */
+static int
+find_cb_by_index(afs_uint32 idx)
+{
+    int i;
+
+    if (get_fe_hdr())
+       return 1;
+
+    for (i = 0; i < fe_cursor.hdr.nCBs; i++) {
+       if (get_cb(i)) {
+           fprintf(stderr, "error getting cb %d\n", i);
+           return 1;
+       }
+       /* the cursor now sits on callback i, which is what gets dumped */
+       if (cb_cursor.cb.index == idx) {
+           dump_this_cb();
+           return 0;
+       }
+    }
+
+    return 1;
+}
+
+#endif /* AFS_DEMAND_ATTACH_FS */
diff --git a/src/tvolser/Makefile.in b/src/tvolser/Makefile.in

index 8b8b1a7578d4e47cd5c7b0462ced3da94a5376e1..bfeb3a24af9d9dbf37b22a935ba0b01b30127f35 100644 (file)
--- a/src/tvolser/Makefile.in
+++ b/src/tvolser/Makefile.in
@@ -10,7 +10,7 @@ include @TOP_OBJDIR@/src/config/Makefile.config
  HELPER_SPLINT=@HELPER_SPLINT@
  
  CC=${MT_CC}
-CFLAGS=${COMMON_CFLAGS} -I.. -DNINTERFACE ${MT_CFLAGS} -DRXDEBUG
+CFLAGS=${COMMON_CFLAGS} -I.. -DNINTERFACE ${MT_CFLAGS} -DRXDEBUG -DFSSYNC_BUILD_CLIENT
  
  CCRULE=${CC} ${CFLAGS} -c $?
  
@@ -36,8 +36,9 @@ UTILOBJS=assert.o uuid.o serverLog.o fileutil.o netutils.o dirpath.o volparse.o
  
  DIROBJS=buffer.o dir.o salvage.o
  
-VOLOBJS= vnode.o volume.o vutil.o partition.o fssync.o purge.o \
-        clone.o devname.o common.o ihandle.o listinodes.o namei_ops.o nuke.o
+VOLOBJS= vnode.o volume.o vutil.o partition.o fssync-client.o purge.o \
+        clone.o devname.o common.o ihandle.o listinodes.o \
+        namei_ops.o nuke.o salvsync-client.o daemon_com.o
  
  FSINTOBJS=# afsaux.o afscbint.cs.o afsint.ss.o afsint.xdr.o
  
@@ -138,7 +139,13 @@ partition.o: ${VOL}/partition.c
  nuke.o: ${VOL}/nuke.c
         ${COMPILE}
  
-fssync.o: ${VOL}/fssync.c
+fssync-client.o: ${VOL}/fssync-client.c
+       ${COMPILE}
+
+salvsync-client.o: ${VOL}/salvsync-client.c
+       ${COMPILE}
+
+daemon_com.o: ${VOL}/daemon_com.c
         ${COMPILE}
  
  purge.o: ${VOL}/purge.c
diff --git a/src/util/Makefile.in b/src/util/Makefile.in

index 7b8c36e3ea9f7e6bee6a12d09981fcc5fd5a5e95..ccf3446695eaeb9109eae8e6b0355673bf0e66dd 100644 (file)
--- a/src/util/Makefile.in
+++ b/src/util/Makefile.in
@@ -13,7 +13,7 @@ HELPER_SPLINT=@HELPER_SPLINT@
  objects = assert.o base64.o casestrcpy.o ktime.o volparse.o hostparse.o \
          hputil.o kreltime.o isathing.o get_krbrlm.o uuid.o serverLog.o \
          dirpath.o fileutil.o netutils.o flipbase64.o fstab.o \
-        afs_atomlist.o afs_lhash.o snprintf.o strlcat.o strlcpy.o \
+        afs_atomlist.o afs_lhash.o snprintf.o strlcat.o strlcpy.o strnlen.o \
          daemon.o rxkstats.o ${REGEX_OBJ}
  
  includes = \
@@ -134,6 +134,9 @@ strlcat.o: ${srcdir}/strlcat.c ${includes}
  strlcpy.o: ${srcdir}/strlcpy.c ${includes}
         ${CCOBJ} ${CFLAGS} -c ${srcdir}/strlcpy.c
  
+strnlen.o: ${srcdir}/strnlen.c ${includes}
+       ${CCOBJ} ${CFLAGS} -c ${srcdir}/strnlen.c
+
  daemon.o: ${srcdir}/daemon.c ${includes}
         ${CCOBJ} ${CFLAGS} -c ${srcdir}/daemon.c
  
diff --git a/src/util/afsutil_prototypes.h b/src/util/afsutil_prototypes.h

index 89f05365e629111004059682af881e96fce16140..2848da36419bb67d6eb6397f9ba5ba09b4157536 100644 (file)
--- a/src/util/afsutil_prototypes.h
+++ b/src/util/afsutil_prototypes.h
@@ -173,6 +173,9 @@ extern size_t strlcpy(char *dst, const char *src, size_t siz);
  extern size_t strlcat(char *dst, const char *src, size_t siz);
  #endif
  
+/* strn */
+extern size_t afs_strnlen(char * buf, size_t len);
+
  
  /* sys.c */
  
@@ -184,6 +187,10 @@ extern void afs_htonuuid(afsUUID * uuidp);
  extern void afs_ntohuuid(afsUUID * uuidp);
  extern afs_int32 afs_uuid_create(afsUUID * uuid);
  extern u_short afs_uuid_hash(afsUUID * uuid);
+#if !defined(KERNEL) && !defined(UKERNEL)
+extern int afsUUID_from_string(const char *str, afsUUID * uuid);
+extern int afsUUID_to_string(const afsUUID * uuid, char *str, size_t strsz);
+#endif
  
  /* volparse.c */
  extern afs_int32 volutil_GetPartitionID(char *aname);
diff --git a/src/util/dirpath.c b/src/util/dirpath.c

index ff856f9523b1642cea05524e9db0be1ff4fe2882..1e9d78da7699b3685965666906448cac6716bd24 100644 (file)
--- a/src/util/dirpath.c
+++ b/src/util/dirpath.c
@@ -292,10 +292,17 @@ initDirPathArray(void)
      pathp = dirPathArray[AFSDIR_SERVER_SLVGLOG_FILEPATH_ID];
      AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_LOGS_DIR, AFSDIR_SLVGLOG_FILE);
  
+    pathp = dirPathArray[AFSDIR_SERVER_SALSRVLOG_FILEPATH_ID];
+    AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_LOGS_DIR, AFSDIR_SALSRVLOG_FILE);
+
      pathp = dirPathArray[AFSDIR_SERVER_SALVAGER_FILEPATH_ID];
      AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_SERVER_BIN_DIR,
                            AFSDIR_SALVAGER_FILE);
  
+    pathp = dirPathArray[AFSDIR_SERVER_SALSRV_FILEPATH_ID];
+    AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_SERVER_BIN_DIR,
+                          AFSDIR_SALSRV_FILE);
+
      pathp = dirPathArray[AFSDIR_SERVER_SLVGLOCK_FILEPATH_ID];
      AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_LOCAL_DIR, AFSDIR_SLVGLOCK_FILE);
  
@@ -368,6 +375,9 @@ initDirPathArray(void)
      pathp = dirPathArray[AFSDIR_SERVER_KRB_EXCL_FILEPATH_ID];
      AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_SERVER_ETC_DIR, AFSDIR_KRB_EXCL_FILE);
  
+    pathp = dirPathArray[AFSDIR_SERVER_FSSTATE_FILEPATH_ID];
+    AFSDIR_SERVER_FILEPATH(pathp, AFSDIR_LOCAL_DIR, AFSDIR_FSSTATE_FILE);
+
      /* client file paths */
  #ifdef AFS_NT40_ENV
      strcpy(dirPathArray[AFSDIR_CLIENT_THISCELL_FILEPATH_ID],
diff --git a/src/util/dirpath.hin b/src/util/dirpath.hin

index 23590ad4a92ca2775248529b1e947c2ba0836eca..ae1c46a78ec44b394f22c3cdd046fe12782c6191 100644 (file)
--- a/src/util/dirpath.hin
+++ b/src/util/dirpath.hin
@@ -135,7 +135,9 @@ ConstructLocalLogPath(const char *cpath,
  #define AFSDIR_VLOG_FILE        "VLLog"
  #define AFSDIR_CORE_FILE        "core"
  #define AFSDIR_SLVGLOG_FILE     "SalvageLog"
+#define AFSDIR_SALSRVLOG_FILE   "SalsrvLog"
  #define AFSDIR_SALVAGER_FILE    "salvager"
+#define AFSDIR_SALSRV_FILE      "salvageserver"
  #define AFSDIR_SLVGLOCK_FILE    "salvage.lock"
  #define AFSDIR_BOZCONF_FILE     "BosConfig"
  #define AFSDIR_BOZCONFNEW_FILE  "BosConfig.new"
@@ -155,6 +157,8 @@ ConstructLocalLogPath(const char *cpath,
  #define AFSDIR_FILELOG_FILE     "FileLog"
  #define AFSDIR_MIGRATE_LOGNAME  "wtlog."
  
+#define AFSDIR_FSSTATE_FILE     "fsstate.dat"
+
  #define AFSDIR_CELLSERVDB_FILE_NTCLIENT  "afsdcell.ini"
  
  #define AFSDIR_NETINFO_FILE     "NetInfo"
@@ -194,9 +198,15 @@ AFSDIR_CANONICAL_SERVER_AFS_DIRPATH "/local"
  #define AFSDIR_CANONICAL_SERVER_SALVAGER_FILEPATH \
  AFSDIR_CANONICAL_SERVER_BIN_DIRPATH "/" AFSDIR_SALVAGER_FILE
  
+#define AFSDIR_CANONICAL_SERVER_SALSRV_FILEPATH \
+AFSDIR_CANONICAL_SERVER_BIN_DIRPATH "/" AFSDIR_SALSRV_FILE
+
  #define AFSDIR_CANONICAL_SERVER_SLVGLOG_FILEPATH \
  AFSDIR_CANONICAL_SERVER_LOGS_DIRPATH "/" AFSDIR_SLVGLOG_FILE
  
+#define AFSDIR_CANONICAL_SERVER_SALSRVLOG_FILEPATH \
+AFSDIR_CANONICAL_SERVER_LOGS_DIRPATH "/" AFSDIR_SALSRVLOG_FILE
+
  
  /* ---------------------  Local path macros ---------------------- */
  
@@ -264,6 +274,9 @@ typedef enum afsdir_id {
        AFSDIR_SERVER_BIN_FILE_DIRPATH_ID,
        AFSDIR_CLIENT_CELLALIAS_FILEPATH_ID,
        AFSDIR_SERVER_KRB_EXCL_FILEPATH_ID,
+      AFSDIR_SERVER_SALSRV_FILEPATH_ID,
+      AFSDIR_SERVER_SALSRVLOG_FILEPATH_ID,
+      AFSDIR_SERVER_FSSTATE_FILEPATH_ID,
        AFSDIR_PATHSTRING_MAX } afsdir_id_t;
  
  /* getDirPath() returns a pointer to a string from an internal array of path strings 
@@ -310,7 +323,9 @@ const char *getDirPath(afsdir_id_t string_id);
  #define AFSDIR_SERVER_VLOG_FILEPATH getDirPath(AFSDIR_SERVER_VLOG_FILEPATH_ID)
  #define AFSDIR_SERVER_CORELOG_FILEPATH getDirPath(AFSDIR_SERVER_CORELOG_FILEPATH_ID)
  #define AFSDIR_SERVER_SLVGLOG_FILEPATH getDirPath(AFSDIR_SERVER_SLVGLOG_FILEPATH_ID)
+#define AFSDIR_SERVER_SALSRVLOG_FILEPATH getDirPath(AFSDIR_SERVER_SALSRVLOG_FILEPATH_ID)
  #define AFSDIR_SERVER_SALVAGER_FILEPATH getDirPath(AFSDIR_SERVER_SALVAGER_FILEPATH_ID)
+#define AFSDIR_SERVER_SALSRV_FILEPATH getDirPath(AFSDIR_SERVER_SALSRV_FILEPATH_ID)
  #define AFSDIR_SERVER_BOZCONF_FILEPATH getDirPath(AFSDIR_SERVER_BOZCONF_FILEPATH_ID)
  #define AFSDIR_SERVER_BOZCONFNEW_FILEPATH getDirPath(AFSDIR_SERVER_BOZCONFNEW_FILEPATH_ID)
  #define AFSDIR_SERVER_BOZINIT_FILEPATH getDirPath(AFSDIR_SERVER_BOZINIT_FILEPATH_ID)
@@ -332,6 +347,7 @@ const char *getDirPath(afsdir_id_t string_id);
  #define AFSDIR_SERVER_THRESHOLD_CONSTANTS_FILEPATH getDirPath(AFSDIR_SERVER_THRESHOLD_CONSTANTS_FILEPATH_ID)
  #define AFSDIR_SERVER_MIGRATELOG_FILEPATH getDirPath(AFSDIR_SERVER_MIGRATELOG_FILEPATH_ID)
  #define AFSDIR_SERVER_KRB_EXCL_FILEPATH getDirPath(AFSDIR_SERVER_KRB_EXCL_FILEPATH_ID)
+#define AFSDIR_SERVER_FSSTATE_FILEPATH getDirPath(AFSDIR_SERVER_FSSTATE_FILEPATH_ID)
  
  /* client file paths */
  #define AFSDIR_CLIENT_THISCELL_FILEPATH getDirPath(AFSDIR_CLIENT_THISCELL_FILEPATH_ID)
diff --git a/src/util/dirpath_nt.h b/src/util/dirpath_nt.h

index b0c62bc392ec06fb0b5fd813c86a45982db9ce81..1d49d8155a29cebe6838ec0639e642c53525614d 100644 (file)
--- a/src/util/dirpath_nt.h
+++ b/src/util/dirpath_nt.h
@@ -126,7 +126,9 @@ extern int
  #define AFSDIR_VLOG_FILE        "VLLog"
  #define AFSDIR_CORE_FILE        "core"
  #define AFSDIR_SLVGLOG_FILE     "SalvageLog"
+#define AFSDIR_SALSRVLOG_FILE   "SalsrvLog"
  #define AFSDIR_SALVAGER_FILE    "salvager"
+#define AFSDIR_SALSRV_FILE      "salvageserver"
  #define AFSDIR_SLVGLOCK_FILE    "salvage.lock"
  #define AFSDIR_BOZCONF_FILE     "BosConfig"
  #define AFSDIR_BOZCONFNEW_FILE  "BosConfig.new"
@@ -146,6 +148,8 @@ extern int
  #define AFSDIR_FILELOG_FILE     "FileLog"
  #define AFSDIR_MIGRATE_LOGNAME  "wtlog."
  
+#define AFSDIR_FSSTATE_FILE     "fsstate.dat"
+
  #ifdef COMMENT
  #define AFSDIR_CELLSERVDB_FILE_NTCLIENT  "afsdcell.ini"
  #else
@@ -189,9 +193,15 @@ AFSDIR_LOCAL_DIR
  #define AFSDIR_CANONICAL_SERVER_SALVAGER_FILEPATH \
  AFSDIR_CANONICAL_SERVER_BIN_DIRPATH "/" AFSDIR_SALVAGER_FILE
  
+#define AFSDIR_CANONICAL_SERVER_SALSRV_FILEPATH \
+AFSDIR_CANONICAL_SERVER_BIN_DIRPATH "/" AFSDIR_SALSRV_FILE
+
  #define AFSDIR_CANONICAL_SERVER_SLVGLOG_FILEPATH \
  AFSDIR_CANONICAL_SERVER_LOGS_DIRPATH "/" AFSDIR_SLVGLOG_FILE
  
+#define AFSDIR_CANONICAL_SERVER_SALSRVLOG_FILEPATH \
+AFSDIR_CANONICAL_SERVER_LOGS_DIRPATH "/" AFSDIR_SALSRVLOG_FILE
+
  
  /* ---------------------  Local path macros ---------------------- */
  
@@ -259,6 +269,9 @@ typedef enum afsdir_id {
      AFSDIR_SERVER_BIN_FILE_DIRPATH_ID,
      AFSDIR_CLIENT_CELLALIAS_FILEPATH_ID,
      AFSDIR_SERVER_KRB_EXCL_FILEPATH_ID,
+    AFSDIR_SERVER_SALSRV_FILEPATH_ID,
+    AFSDIR_SERVER_SALSRVLOG_FILEPATH_ID,
+    AFSDIR_SERVER_FSSTATE_FILEPATH_ID,
      AFSDIR_PATHSTRING_MAX
  } afsdir_id_t;
  
@@ -306,7 +319,9 @@ const char *getDirPath(afsdir_id_t string_id);
  #define AFSDIR_SERVER_VLOG_FILEPATH getDirPath(AFSDIR_SERVER_VLOG_FILEPATH_ID)
  #define AFSDIR_SERVER_CORELOG_FILEPATH getDirPath(AFSDIR_SERVER_CORELOG_FILEPATH_ID)
  #define AFSDIR_SERVER_SLVGLOG_FILEPATH getDirPath(AFSDIR_SERVER_SLVGLOG_FILEPATH_ID)
+#define AFSDIR_SERVER_SALSRVLOG_FILEPATH getDirPath(AFSDIR_SERVER_SALSRVLOG_FILEPATH_ID)
  #define AFSDIR_SERVER_SALVAGER_FILEPATH getDirPath(AFSDIR_SERVER_SALVAGER_FILEPATH_ID)
+#define AFSDIR_SERVER_SALSRV_FILEPATH getDirPath(AFSDIR_SERVER_SALSRV_FILEPATH_ID)
  #define AFSDIR_SERVER_BOZCONF_FILEPATH getDirPath(AFSDIR_SERVER_BOZCONF_FILEPATH_ID)
  #define AFSDIR_SERVER_BOZCONFNEW_FILEPATH getDirPath(AFSDIR_SERVER_BOZCONFNEW_FILEPATH_ID)
  #define AFSDIR_SERVER_BOZINIT_FILEPATH getDirPath(AFSDIR_SERVER_BOZINIT_FILEPATH_ID)
@@ -328,6 +343,7 @@ const char *getDirPath(afsdir_id_t string_id);
  #define AFSDIR_SERVER_THRESHOLD_CONSTANTS_FILEPATH getDirPath(AFSDIR_SERVER_THRESHOLD_CONSTANTS_FILEPATH_ID)
  #define AFSDIR_SERVER_MIGRATELOG_FILEPATH getDirPath(AFSDIR_SERVER_MIGRATELOG_FILEPATH_ID)
  #define AFSDIR_SERVER_KRB_EXCL_FILEPATH getDirPath(AFSDIR_SERVER_KRB_EXCL_FILEPATH_ID)
+#define AFSDIR_SERVER_FSSTATE_FILEPATH getDirPath(AFSDIR_SERVER_FSSTATE_FILEPATH_ID)
  
  /* client file paths */
  #define AFSDIR_CLIENT_THISCELL_FILEPATH getDirPath(AFSDIR_CLIENT_THISCELL_FILEPATH_ID)
diff --git a/src/util/errors.h b/src/util/errors.h

index aa805d27ca4d7d2b0fa0473644f298baac85a559..bc16dd6eb4efd95e3c3bef32210b46ed3bf12f7c 100644 (file)
--- a/src/util/errors.h
+++ b/src/util/errors.h
@@ -50,6 +50,7 @@
                                  * to THIS server to find out where */
  #define VIO            112     /* Vnode temporarily unaccessible, but not known 
                                  * to be permanently bad. */
+#define VSALVAGING      113     /* Volume is being salvaged (demand attach fs) */
  #define VRESTRICTED     120    /* Volume is restricted from using one or more
                                  * of the given residencies; do a
                                  * vos examine to find out the current
diff --git a/src/util/strnlen.c b/src/util/strnlen.c

new file mode 100644 (file)

index 0000000..6c350df
--- /dev/null
+++ b/src/util/strnlen.c
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/* strnlen.c - fixed length string length */
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <sys/types.h>
+#include <stdarg.h>
+#include <ctype.h>
+
+
+/*
+ * Length of the string in buf, looking at no more than len bytes.
+ *
+ * Returns the number of bytes that precede the first NUL, or len when no
+ * NUL occurs within the first len bytes.  buf[len] is never read.
+ */
+size_t
+afs_strnlen(char * buf, size_t len)
+{
+    size_t n = 0;
+
+    while ((n < len) && (buf[n] != '\0'))
+       n++;
+
+    return n;
+}
+
diff --git a/src/viced/Makefile.in b/src/viced/Makefile.in

index 1b7d23f597ead4cdce4c4cccdaa780df6d58ba7b..6de76052eb6c703c487e57dcfac053310f3d3a90 100644 (file)
--- a/src/viced/Makefile.in
+++ b/src/viced/Makefile.in
@@ -50,6 +50,7 @@ headers=${TOP_INCDIR}/lwp.h           \
         ${TOP_INCDIR}/afs/afsint.h      \
         viced.h                         \
         host.h                          \
+       callback.h                      \
         fs_stats.h
  
  objects=viced.o                \
diff --git a/src/viced/NTMakefile b/src/viced/NTMakefile

index 125d1ca6aab42dd533d0010658915d231e245706..0ffb6b7e954f334c01990552505c37435373785d 100644 (file)
--- a/src/viced/NTMakefile
+++ b/src/viced/NTMakefile
@@ -5,6 +5,8 @@
  # License.  For details, see the LICENSE file in the top-level source
  # directory or online at http://www.openafs.org/dl/license10.html
  
+AFSDEV_AUXCDEFINES = -DFSSYNC_BUILD_SERVER
+
  RELDIR=viced
  !INCLUDE ..\config\NTMakefile.$(SYS_NAME)
  !INCLUDE ..\config\NTMakefile.version
diff --git a/src/viced/afsfileprocs.c b/src/viced/afsfileprocs.c

index 4743a2cb99d53a571c198e7012d337ca034ef122..429a7de3a230c4a34f95c49f1a54111aade5f2ab 100644 (file)
--- a/src/viced/afsfileprocs.c
+++ b/src/viced/afsfileprocs.c
@@ -112,6 +112,7 @@ RCSID
  #include "viced_prototypes.h"
  #include "viced.h"
  #include "host.h"
+#include "callback.h"
  #include <afs/unified_afs.h>
  #include <afs/audit.h>
  #include <afs/afsutil.h>
@@ -209,7 +210,7 @@ extern afs_int32 readonlyServer;
  /*
   * Externals used by the xstat code.
   */
-extern int VolumeCacheSize, VolumeGets, VolumeReplacements;
+extern VolPkgStats VStats;
  extern int CEs, CEBlocks;
  
  extern int HTs, HTBlocks;
@@ -438,7 +439,7 @@ static afs_int32
  CheckVnode(AFSFid * fid, Volume ** volptr, Vnode ** vptr, int lock)
  {
      int fileCode = 0;
-    int errorCode = -1;
+    afs_int32 local_errorCode, errorCode = -1;
      static struct timeval restartedat = { 0, 0 };
  
      if (fid->Volume == 0 || fid->Vnode == 0)   /* not: || fid->Unique == 0) */
@@ -448,7 +449,7 @@ CheckVnode(AFSFid * fid, Volume ** volptr, Vnode ** vptr, int lock)
  
         while (1) {
             errorCode = 0;
-           *volptr = VGetVolume(&errorCode, (afs_int32) fid->Volume);
+           *volptr = VGetVolume(&local_errorCode, &errorCode, (afs_int32) fid->Volume);
             if (!errorCode) {
                 assert(*volptr);
                 break;
@@ -525,8 +526,10 @@ CheckVnode(AFSFid * fid, Volume ** volptr, Vnode ** vptr, int lock)
                     }
                 }
             }
-           /* allow read operations on busy volume */
-           else if (errorCode == VBUSY && lock == READ_LOCK) {
+           /* allow read operations on busy volume. 
+            * must check local_errorCode because demand attach fs
+            * can have local_errorCode == VSALVAGING, errorCode == VBUSY */
+           else if (local_errorCode == VBUSY && lock == READ_LOCK) {
                 errorCode = 0;
                 break;
             } else if (errorCode)
@@ -1151,6 +1154,8 @@ CopyOnWrite(Vnode * targetptr, Volume * volptr)
                          wrlen, errno));
  #ifdef FAST_RESTART            /* if running in no-salvage, don't core the server */
                 ViceLog(0, ("CopyOnWrite failed: taking volume offline\n"));
+#elif defined(AFS_DEMAND_ATTACH_FS)
+               ViceLog(0, ("CopyOnWrite failed: requesting salvage\n"));
  #else /* Avoid further corruption and try to get a core. */
                 assert(0);
  #endif
@@ -5564,7 +5569,7 @@ SRXAFS_XStatsVersion(struct rx_call * a_call, afs_int32 * a_versionP)
  static void
  FillPerfValues(struct afs_PerfStats *a_perfP)
  {                              /*FillPerfValues */
-
+    afs_uint32 hi, lo;
      int dir_Buffers;           /*# buffers in use by dir package */
      int dir_Calls;             /*# read calls in dir package */
      int dir_IOs;               /*# I/O ops in dir package */
@@ -5582,9 +5587,11 @@ FillPerfValues(struct afs_PerfStats *a_perfP)
      a_perfP->vcache_S_Gets = VnodeClassInfo[vSmall].gets;
      a_perfP->vcache_S_Reads = VnodeClassInfo[vSmall].reads;
      a_perfP->vcache_S_Writes = VnodeClassInfo[vSmall].writes;
-    a_perfP->vcache_H_Entries = VolumeCacheSize;
-    a_perfP->vcache_H_Gets = VolumeGets;
-    a_perfP->vcache_H_Replacements = VolumeReplacements;
+    a_perfP->vcache_H_Entries = VStats.hdr_cache_size;
+    SplitInt64(VStats.hdr_gets, hi, lo);
+    a_perfP->vcache_H_Gets = lo;
+    SplitInt64(VStats.hdr_loads, hi, lo);
+    a_perfP->vcache_H_Replacements = lo;
  
      /*
       * Directory section.
diff --git a/src/viced/callback.c b/src/viced/callback.c

index 8c3040dc965c2b3c6ed881e72c8587d577804be8..44b45235763f28efb37816c3d6631a51db622f56 100644 (file)
--- a/src/viced/callback.c
+++ b/src/viced/callback.c
@@ -5,6 +5,8 @@
   * This software has been released under the terms of the IBM Public
   * License.  For details, see the LICENSE file in the top-level source
   * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
   */
  
  /*
@@ -120,94 +122,24 @@ RCSID
  
  #include <afs/ptclient.h>      /* need definition of prlist for host.h */
  #include "host.h"
+#include "callback.h"
+#ifdef AFS_DEMAND_ATTACH_FS
+#include "../tviced/serialize_state.h"
+#endif /* AFS_DEMAND_ATTACH_FS */
+
  
  extern afsUUID FS_HostUUID;
  extern int hostCount;
-int ShowProblems = 1;
-
-/* Maximum number of call backs to break at once, single fid */
-/* There is some debate as to just how large this value should be */
-/* Ideally, it would be very very large, but I am afraid that the */
-/* cache managers will all send in their responses simultaneously, */
-/* thereby swamping the file server.  As a result, something like */
-/* 10 or 15 might be a better bet. */
-#define MAX_CB_HOSTS   10
-
-/* max time to break a callback, otherwise client is dead or net is hosed */
-#define MAXCBT 25
-
-#define u_byte unsigned char
+static int ShowProblems = 1;
  
  struct cbcounters cbstuff;
  
-struct cbstruct {
-    struct host *hp;
-    afs_uint32 thead;
-};
-
-struct FileEntry {
-    afs_uint32 vnode;
-    afs_uint32 unique;
-    afs_uint32 volid;
-    afs_uint32 fnext;
-    afs_uint32 ncbs;
-    afs_uint32 firstcb;
-    afs_uint32 status;
-    afs_uint32 spare;
-} *FE;                         /* Don't use FE[0] */
-#define FE_LATER 0x1
-
-struct CallBack {
-    afs_uint32 cnext;          /* Next call back entry */
-    afs_uint32 fhead;          /* Head of this call back chain */
-    u_byte thead;              /* Head of timeout chain */
-    u_byte status;             /* Call back status; see definitions, below */
-    afs_uint32 hhead;          /* Head of host table chain */
-    afs_uint32 tprev, tnext;   /* Timeout chain */
-    afs_uint32 hprev, hnext;   /* Chain from host table */
-    unsigned short spare;      /* make it a multiple of 32 bits. */
-} *CB;                         /* Don't use CB[0] */
-
-/* status values for status field of CallBack structure */
-#define CB_NORMAL   1          /* Normal call back */
-#define CB_DELAYED  2          /* Delayed call back due to rpc problems.
-                                * The call back entry will be added back to the
-                                * host list at the END of the list, so that
-                                * searching backwards in the list will find all
-                                * the (consecutive)host. delayed call back entries */
-#define CB_VOLUME   3          /* Callback for a volume */
-#define CB_BULK     4          /* Normal callbacks, handed out from FetchBulkStatus */
-
-/* call back indices to pointers, and vice-versa */
-#define itocb(i)    ((i)?CB+(i):0)
-#define cbtoi(cbp)  (!(cbp)?0:(cbp)-CB)
-
-/* file entry indices to pointers, and vice-versa */
-#define itofe(i)    ((i)?FE+(i):0)
-#define fetoi(fep)  (!(fep)?0:(fep)-FE)
-
-/* Timeouts:  there are 128 possible timeout values in effect at any
- * given time.  Each timeout represents timeouts in an interval of 128
- * seconds.  So the maximum timeout for a call back is 128*128=16384
- * seconds, or 4 1/2 hours.  The timeout cleanup stuff is called only
- * if space runs out or by the file server every 5 minutes.  This 5
- * minute slack should be allowed for--so a maximum time of 4 hours
- * is safer.
- *
- * Timeouts must be chosen to correspond to an exact multiple
- * of 128, because all times are truncated to a 128 multiple, and
- * timed out if the current truncated time is <= to the truncated time
- * corresponding to the timeout queue.
- */
+static struct FileEntry * FE = NULL;    /* don't use FE[0] */
+static struct CallBack * CB = NULL;     /* don't use CB[0] */
  
-/* Unix time to Call Back time, and vice-versa.  Call back time is
-   in units of 128 seconds, corresponding to time queues. */
-#define CBtime(uxtime) ((uxtime)>>7)
-#define UXtime(cbtime) ((cbtime)<<7)
+static struct CallBack * CBfree = NULL;
+static struct FileEntry * FEfree = NULL;
  
-/* Given a Unix time, compute the closest Unix time that corresponds to
-   a time queue, rounding up */
-#define TimeCeiling(uxtime)    (((uxtime)+127)&~127)
  
  /* Time to live for call backs depends upon number of users of the file.
   * TimeOuts is indexed by this number/8 (using TimeOut macro).  Times
@@ -229,52 +161,17 @@ static int TimeOuts[] = {
  /* minimum time given for a call back */
  static int MinTimeOut = (7 * 60);
  
-#define TimeOutCutoff   ((sizeof(TimeOuts)/sizeof(TimeOuts[0]))*8)
-#define TimeOut(nusers)  ((nusers)>=TimeOutCutoff? MinTimeOut: TimeOuts[(nusers)>>3])
-
-/* time out at server is 3 minutes more than ws */
-#define ServerBias       (3*60)
-
  /* Heads of CB queues; a timeout index is 1+index into this array */
-static afs_uint32 timeout[128];
-
-/* Convert cbtime to timeout queue index */
-#define TIndex(cbtime)  (((cbtime)&127)+1)
-
-/* Convert cbtime to pointer to timeout queue head */
-#define THead(cbtime)  (&timeout[TIndex(cbtime)-1])
+static afs_uint32 timeout[CB_NUM_TIMEOUT_QUEUES];
  
  static afs_int32 tfirst;       /* cbtime of oldest unexpired call back time queue */
  
-/* Normalize index into timeout array so that two such indices will be
-   ordered correctly, so that they can be compared to see which times
-   sooner, or so that the difference in time out times between them
-   can be computed. */
-#define TNorm(index)   ((index)<TIndex(tfirst)?(index)+128:(index))
-
-/* This converts a timeout index into the actual time it will expire */
-#define TIndexToTime(index) (UXtime(TNorm(index) - TIndex(tfirst) + tfirst))
-
-
-/* Convert pointer to timeout queue head to index, and vice versa */
-#define ttoi(t)                ((t-timeout)+1)
-#define itot(i)                ((timeout)+(i-1))
  
  /* 16 byte object get/free routines */
  struct object {
      struct object *next;
  };
  
-struct VCBParams {
-    struct cbstruct cba[MAX_CB_HOSTS]; /* re-entrant storage */
-    unsigned int ncbas;
-    afs_uint32 thead;          /* head of timeout queue for youngest callback */
-    struct AFSFid *fid;
-};
-
-struct CallBack *CBfree = 0;
-struct FileEntry *FEfree = 0;
-
  /* Prototypes for static routines */
  static struct FileEntry *FindFE(register AFSFid * fid);
  static struct CallBack *iGetCB(register int *nused);
@@ -308,12 +205,11 @@ static int ClearHostCallbacks_r(struct host *hp, int locked);
  #define FreeCB(cb) iFreeCB((struct CallBack *)cb, &cbstuff.nCBs)
  #define FreeFE(fe) iFreeFE((struct FileEntry *)fe, &cbstuff.nFEs)
  
+
  /* Other protos - move out sometime */
  void PrintCB(register struct CallBack *cb, afs_uint32 now);
  
-#define VHASH 512              /* Power of 2 */
-static afs_uint32 HashTable[VHASH];    /* File entry hash table */
-#define VHash(volume, unique) (((volume)+(unique))&(VHASH-1))
+static afs_uint32 HashTable[FEHASH_SIZE];      /* File entry hash table */
  
  static struct FileEntry *
  FindFE(register AFSFid * fid)
@@ -322,7 +218,7 @@ FindFE(register AFSFid * fid)
      register int fei;
      register struct FileEntry *fe;
  
-    hash = VHash(fid->Volume, fid->Unique);
+    hash = FEHash(fid->Volume, fid->Unique);
      for (fei = HashTable[hash]; fei; fei = fe->fnext) {
         fe = itofe(fei);
         if (fe->volid == fid->Volume && fe->unique == fid->Unique
@@ -421,11 +317,11 @@ HAdd(register struct CallBack *cb, register struct host *host)
      if (!host->cblist) {
         host->cblist = cb->hnext = cb->hprev = cbtoi(cb);
      } else {
-       register struct CallBack *hhp = itocb(host->cblist);
+       register struct CallBack *fcb = itocb(host->cblist);
  
-       cb->hprev = hhp->hprev;
-       cb->hnext = host->cblist;
-       hhp->hprev = (itocb(hhp->hprev)->hnext = cbtoi(cb));
+       cb->hprev = fcb->hprev;
+       cb->hnext = cbtoi(fcb);
+       fcb->hprev = (itocb(fcb->hprev)->hnext = cbtoi(cb));
      }
      return 0;
  }
@@ -475,7 +371,7 @@ CDel(struct CallBack *cb, int deletefe)
  /* N.B.  This one also deletes the CB, and also possibly parent FE, so
   * make sure that it is not on any other list before calling this
   * routine */
-int Ccdelpt = 0, CcdelB = 0;
+static int Ccdelpt = 0, CcdelB = 0;
  
  static int
  CDelPtr(register struct FileEntry *fe, register afs_uint32 * cbp,
@@ -522,7 +418,7 @@ static int
  FDel(register struct FileEntry *fe)
  {
      register int fei = fetoi(fe);
-    register afs_uint32 *p = &HashTable[VHash(fe->volid, fe->unique)];
+    register afs_uint32 *p = &HashTable[FEHash(fe->volid, fe->unique)];
  
      while (*p && *p != fei)
         p = &itofe(*p)->fnext;
@@ -532,6 +428,7 @@ FDel(register struct FileEntry *fe)
      return 0;
  }
  
+/* initialize the callback package */
  int
  InitCallBack(int nblks)
  {
@@ -539,19 +436,21 @@ InitCallBack(int nblks)
      tfirst = CBtime(FT_ApproxTime());
      /* N.B. The "-1", below, is because
       * FE[0] and CB[0] are not used--and not allocated */
-    FE = ((struct FileEntry *)(calloc(nblks, sizeof(struct FileEntry)))) - 1;
+    FE = ((struct FileEntry *)(calloc(nblks, sizeof(struct FileEntry))));
      if (!FE) {
         ViceLog(0, ("Failed malloc in InitCallBack\n"));
         assert(0);
      }
+    FE--;  /* FE[0] is supposed to point to junk */
      cbstuff.nFEs = nblks;
      while (cbstuff.nFEs)
         FreeFE(&FE[cbstuff.nFEs]);      /* This is correct */
-    CB = ((struct CallBack *)(calloc(nblks, sizeof(struct CallBack)))) - 1;
+    CB = ((struct CallBack *)(calloc(nblks, sizeof(struct CallBack))));
      if (!CB) {
         ViceLog(0, ("Failed malloc in InitCallBack\n"));
         assert(0);
      }
+    CB--;  /* CB[0] is supposed to point to junk */
      cbstuff.nCBs = nblks;
      while (cbstuff.nCBs)
         FreeCB(&CB[cbstuff.nCBs]);      /* This is correct */
@@ -696,7 +595,7 @@ AddCallBack1_r(struct host *host, AFSFid * fid, afs_uint32 * thead, int type,
         fe->unique = fid->Unique;
         fe->ncbs = 0;
         fe->status = 0;
-       hash = VHash(fid->Volume, fid->Unique);
+       hash = FEHash(fid->Volume, fid->Unique);
         fe->fnext = HashTable[hash];
         HashTable[hash] = fetoi(fe);
      }
@@ -1302,7 +1201,7 @@ BreakVolumeCallBacks(afs_uint32 volume)
  
      H_LOCK;
      fid.Volume = volume, fid.Vnode = fid.Unique = 0;
-    for (hash = 0; hash < VHASH; hash++) {
+    for (hash = 0; hash < FEHASH_SIZE; hash++) {
         for (feip = &HashTable[hash]; (fe = itofe(*feip));) {
             if (fe->volid == volume) {
                 register struct CallBack *cbnext;
@@ -1360,7 +1259,7 @@ int
  BreakVolumeCallBacksLater(afs_uint32 volume)
  {
      int hash;
-    afs_int32 *feip;
+    afs_uint32 *feip;
      struct FileEntry *fe;
      struct CallBack *cb;
      struct host *host;
@@ -1368,7 +1267,7 @@ BreakVolumeCallBacksLater(afs_uint32 volume)
  
      ViceLog(25, ("Setting later on volume %u\n", volume));
      H_LOCK;
-    for (hash = 0; hash < VHASH; hash++) {
+    for (hash = 0; hash < FEHASH_SIZE; hash++) {
         for (feip = &HashTable[hash]; (fe = itofe(*feip)) != NULL; ) {
             if (fe->volid == volume) {
                 register struct CallBack *cbnext;
@@ -1381,7 +1280,7 @@ BreakVolumeCallBacksLater(afs_uint32 volume)
                 FSYNC_LOCK;
                 fe->status |= FE_LATER;
                 FSYNC_UNLOCK;
-               found++;
+               found = 1;
             }
             feip = &fe->fnext;
         }
@@ -1408,7 +1307,7 @@ BreakLaterCallBacks(void)
  {
      struct AFSFid fid;
      int hash;
-    afs_int32 *feip;
+    afs_uint32 *feip;
      struct CallBack *cb;
      struct FileEntry *fe = NULL;
      struct FileEntry *myfe = NULL;
@@ -1424,7 +1323,7 @@ BreakLaterCallBacks(void)
      /* Pick the first volume we see to clean up */
      fid.Volume = fid.Vnode = fid.Unique = 0;
  
-    for (hash = 0; hash < VHASH; hash++) {
+    for (hash = 0; hash < FEHASH_SIZE; hash++) {
         for (feip = &HashTable[hash]; (fe = itofe(*feip)) != NULL; ) {
             if (fe && (fe->status & FE_LATER)
                 && (fid.Volume == 0 || fid.Volume == fe->volid)) {
@@ -1775,6 +1674,973 @@ PrintCallBackStats(void)
  
  #ifndef INTERPRET_DUMP
  
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * demand attach fs
+ * callback state serialization
+ */
+static int cb_stateSaveTimeouts(struct fs_dump_state * state);
+static int cb_stateSaveFEHash(struct fs_dump_state * state);
+static int cb_stateSaveFEs(struct fs_dump_state * state);
+static int cb_stateSaveFE(struct fs_dump_state * state, struct FileEntry * fe);
+static int cb_stateRestoreTimeouts(struct fs_dump_state * state);
+static int cb_stateRestoreFEHash(struct fs_dump_state * state);
+static int cb_stateRestoreFEs(struct fs_dump_state * state);
+static int cb_stateRestoreFE(struct fs_dump_state * state);
+static int cb_stateRestoreCBs(struct fs_dump_state * state, struct FileEntry * fe, 
+                             struct iovec * iov, int niovecs);
+
+static int cb_stateVerifyFEHash(struct fs_dump_state * state);
+static int cb_stateVerifyFE(struct fs_dump_state * state, struct FileEntry * fe);
+static int cb_stateVerifyFCBList(struct fs_dump_state * state, struct FileEntry * fe);
+static int cb_stateVerifyTimeoutQueues(struct fs_dump_state * state);
+
+static int cb_stateFEToDiskEntry(struct FileEntry *, struct FEDiskEntry *);
+static int cb_stateDiskEntryToFE(struct fs_dump_state * state,
+                                struct FEDiskEntry *, struct FileEntry *);
+
+static int cb_stateCBToDiskEntry(struct CallBack *, struct CBDiskEntry *);
+static int cb_stateDiskEntryToCB(struct fs_dump_state * state,
+                                struct CBDiskEntry *, struct CallBack *);
+
+static int cb_stateFillHeader(struct callback_state_header * hdr);
+static int cb_stateCheckHeader(struct callback_state_header * hdr);
+
+static int cb_stateAllocMap(struct fs_dump_state * state);
+
+/*
+ * Dump the callback package state through the given dump state handle.
+ *
+ * state->eof_offset is recorded as hdr->cb_offset, a zeroed callback
+ * header is written there, and then the timeout, FE hash and FE sections
+ * are written in that order.  Only after all of them succeed is the
+ * filled-in header written over the zeroed one.
+ *
+ * Returns 0 on success, 1 as soon as any step reports failure.
+ */
+int
+cb_stateSave(struct fs_dump_state * state)
+{
+    AssignInt64(state->eof_offset, &state->hdr->cb_offset);
+
+    /* invalidate callback state header */
+    memset(state->cb_hdr, 0, sizeof(struct callback_state_header));
+    if (fs_stateWriteHeader(state, &state->hdr->cb_offset, state->cb_hdr,
+                           sizeof(struct callback_state_header)))
+       return 1;
+
+    fs_stateIncEOF(state, sizeof(struct callback_state_header));
+
+    /* dump timeout state, then fe hashtable state, then callback state */
+    if (cb_stateSaveTimeouts(state) ||
+       cb_stateSaveFEHash(state) ||
+       cb_stateSaveFEs(state))
+       return 1;
+
+    /* write the callback state header to disk */
+    cb_stateFillHeader(state->cb_hdr);
+    if (fs_stateWriteHeader(state, &state->hdr->cb_offset, state->cb_hdr,
+                           sizeof(struct callback_state_header)))
+       return 1;
+
+    return 0;
+}
+
+/*
+ * Load the callback package state back from a dump.
+ *
+ * Steps run in a fixed order and stop at the first failure: read the
+ * callback header at hdr->cb_offset, check it, allocate the index maps,
+ * then restore the timeout table, the FE hash table, and the FEs/CBs.
+ * On success tfirst is taken from the header that was read.
+ *
+ * Returns 0 on success, 1 on any failure (tfirst is then left untouched).
+ */
+int
+cb_stateRestore(struct fs_dump_state * state)
+{
+    if (fs_stateReadHeader(state, &state->hdr->cb_offset, state->cb_hdr,
+                          sizeof(struct callback_state_header)) ||
+       cb_stateCheckHeader(state->cb_hdr) ||
+       cb_stateAllocMap(state) ||
+       cb_stateRestoreTimeouts(state) ||
+       cb_stateRestoreFEHash(state) ||
+       cb_stateRestoreFEs(state))      /* restore FEs and CBs from disk */
+       return 1;
+
+    /* restore the cbtime of the oldest unexpired timeout queue */
+    tfirst = state->cb_hdr->tfirst;
+
+    return 0;
+}
+
+/*
+ * Rewrite every stored index in the restored callback structures.
+ *
+ * Each index field is passed through fe_OldToNew / cb_OldToNew /
+ * h_OldToNew (presumably mapping an index recorded in the dump to the
+ * index now in use -- confirm against their definitions).  Covered are:
+ * the FileEntry links, the CallBack links, the timeout queue heads and
+ * the FE hash table heads.  Map slots whose new_idx is 0 are skipped.
+ *
+ * Returns 0 on success, 1 at the first translation that fails.
+ */
+int
+cb_stateRestoreIndices(struct fs_dump_state * state)
+{
+    int idx;
+    struct FileEntry * fe;
+    struct CallBack * cb;
+
+    /* restore indices in the FileEntry structures: fnext, then firstcb */
+    for (idx = 1; idx < state->fe_map.len; idx++) {
+       if (!state->fe_map.entries[idx].new_idx)
+           continue;
+
+       fe = itofe(state->fe_map.entries[idx].new_idx);
+       if (fe_OldToNew(state, fe->fnext, &fe->fnext) ||
+           cb_OldToNew(state, fe->firstcb, &fe->firstcb))
+           return 1;
+    }
+
+    /* restore indices in the CallBack structures, in field order:
+     * cnext, fhead, hhead, tprev, tnext, hprev, hnext */
+    for (idx = 1; idx < state->cb_map.len; idx++) {
+       if (!state->cb_map.entries[idx].new_idx)
+           continue;
+
+       cb = itocb(state->cb_map.entries[idx].new_idx);
+       if (cb_OldToNew(state, cb->cnext, &cb->cnext) ||
+           fe_OldToNew(state, cb->fhead, &cb->fhead) ||
+           h_OldToNew(state, cb->hhead, &cb->hhead) ||
+           cb_OldToNew(state, cb->tprev, &cb->tprev) ||
+           cb_OldToNew(state, cb->tnext, &cb->tnext) ||
+           cb_OldToNew(state, cb->hprev, &cb->hprev) ||
+           cb_OldToNew(state, cb->hnext, &cb->hnext))
+           return 1;
+    }
+
+    /* restore the timeout queue head indices */
+    for (idx = 0; idx < state->cb_timeout_hdr->records; idx++) {
+       if (cb_OldToNew(state, timeout[idx], &timeout[idx]))
+           return 1;
+    }
+
+    /* restore the FE hash table queue heads */
+    for (idx = 0; idx < state->cb_fehash_hdr->records; idx++) {
+       if (fe_OldToNew(state, HashTable[idx], &HashTable[idx]))
+           return 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Run the callback state consistency checks.
+ *
+ * The FE hash check runs first, then the timeout queue check; the second
+ * one runs even when the first has already failed.
+ *
+ * Returns 0 if both checks pass, 1 if either reports a problem.
+ */
+int
+cb_stateVerify(struct fs_dump_state * state)
+{
+    int fehash_bad, queues_bad;
+
+    fehash_bad = cb_stateVerifyFEHash(state);
+    queues_bad = cb_stateVerifyTimeoutQueues(state);
+
+    return (fehash_bad || queues_bad) ? 1 : 0;
+}
+
+/*
+ * Check every chain of the FE hash table.
+ *
+ * For each bucket the chain is followed through fnext.  An index above
+ * cbstuff.nblks, or a chain longer than FS_STATE_FE_MAX_HASH_CHAIN_LEN,
+ * is logged and ends the walk of that bucket only; later buckets are
+ * still examined.  Every entry reached is passed to cb_stateVerifyFE.
+ *
+ * Returns 0 if nothing was flagged, 1 otherwise.
+ */
+static int
+cb_stateVerifyFEHash(struct fs_dump_state * state)
+{
+    int bad = 0, bucket;
+    struct FileEntry * fe;
+    afs_uint32 fei, depth;
+
+    for (bucket = 0; bucket < FEHASH_SIZE; bucket++) {
+       depth = 0;
+       fei = HashTable[bucket];
+       while ((fe = itofe(fei)) != NULL) {
+           /* range-check the index before fe is dereferenced */
+           if (fei > cbstuff.nblks) {
+               ViceLog(0, ("cb_stateVerifyFEHash: error: index out of range (fei=%d)\n", fei));
+               bad = 1;
+               break;
+           }
+           if (cb_stateVerifyFE(state, fe))
+               bad = 1;
+           if (depth > FS_STATE_FE_MAX_HASH_CHAIN_LEN) {
+               ViceLog(0, ("cb_stateVerifyFEHash: error: hash chain %d length exceeds %d; assuming there's a loop\n",
+                           bucket, FS_STATE_FE_MAX_HASH_CHAIN_LEN));
+               bad = 1;
+               break;
+           }
+           depth++;
+           fei = fe->fnext;
+       }
+    }
+
+    return bad;
+}
+
+/*
+ * Check a single FileEntry.
+ *
+ * firstcb and ncbs must be both zero or both non-zero, and the entry's
+ * callback list must pass cb_stateVerifyFCBList.  Both tests always run
+ * and each failure is logged.
+ *
+ * Returns 0 if the entry is consistent, 1 otherwise.
+ */
+static int
+cb_stateVerifyFE(struct fs_dump_state * state, struct FileEntry * fe)
+{
+    int bad = 0;
+    int has_head = (fe->firstcb != 0);
+    int has_count = (fe->ncbs != 0);
+
+    if (has_head != has_count) {
+       ViceLog(0, ("cb_stateVerifyFE: error: fe->firstcb does not agree with fe->ncbs (fei=%d, fe->firstcb=%d, fe->ncbs=%d)\n",
+                   fetoi(fe), fe->firstcb, fe->ncbs));
+       bad = 1;
+    }
+
+    if (cb_stateVerifyFCBList(state, fe)) {
+       ViceLog(0, ("cb_stateVerifyFE: error: FCBList failed verification (fei=%d)\n", fetoi(fe)));
+       bad = 1;
+    }
+
+    return bad;
+}
+
+/*
+ * Check the callback list that hangs off one FileEntry.
+ *
+ * The list starts at fe->firstcb and is followed through cnext.  An
+ * index above cbstuff.nblks or a list longer than
+ * FS_STATE_FCB_MAX_LIST_LEN aborts the check at once (the length
+ * comparison below is then not performed).  A callback whose fhead is
+ * not this entry is logged but the walk continues.  Finally the number
+ * of callbacks visited must equal fe->ncbs.
+ *
+ * Returns 0 if the list is consistent, 1 otherwise.
+ */
+static int
+cb_stateVerifyFCBList(struct fs_dump_state * state, struct FileEntry * fe)
+{
+    int bad = 0;
+    afs_uint32 cbi, fei, seen = 0;
+    struct CallBack * cb;
+
+    fei = fetoi(fe);
+    cbi = fe->firstcb;
+
+    while ((cb = itocb(cbi)) != NULL) {
+       /* range-check the index before cb is dereferenced */
+       if (cbi > cbstuff.nblks) {
+           ViceLog(0, ("cb_stateVerifyFCBList: error: list index out of range (cbi=%d, ncbs=%d)\n",
+                       cbi, cbstuff.nblks));
+           return 1;
+       }
+       if (cb->fhead != fei) {
+           ViceLog(0, ("cb_stateVerifyFCBList: error: cb->fhead != fei (fei=%d, cb->fhead=%d)\n",
+                       fei, cb->fhead));
+           bad = 1;
+       }
+       if (seen > FS_STATE_FCB_MAX_LIST_LEN) {
+           ViceLog(0, ("cb_stateVerifyFCBList: error: list length exceeds %d (fei=%d); assuming there's a loop\n",
+                       FS_STATE_FCB_MAX_LIST_LEN, fei));
+           return 1;
+       }
+       seen++;
+       cbi = cb->cnext;
+    }
+
+    if (fe->ncbs != seen) {
+       ViceLog(0, ("cb_stateVerifyFCBList: error: list length mismatch (len=%d, fe->ncbs=%d)\n",
+                   seen, fe->ncbs));
+       bad = 1;
+    }
+
+    return bad;
+}
+
+/*
+ * Check the circular list of callbacks that hangs off one host.
+ *
+ * The walk starts at host->cblist and follows hnext until it arrives
+ * back at the head.  Every callback visited must:
+ *   - have an index no larger than cbstuff.nblks,
+ *   - name this host in hhead (logged, walk continues),
+ *   - have non-zero hprev and hnext, both within range,
+ *   - be the hprev of the callback its hnext points at.
+ * A walk longer than FS_STATE_HCB_MAX_LIST_LEN is treated as a loop.
+ * Apart from the hhead mismatch, any failure ends the check at once.
+ *
+ * Returns 0 if the list is consistent (or empty), 1 otherwise.
+ */
+int
+cb_stateVerifyHCBList(struct fs_dump_state * state, struct host * host)
+{
+    int ret = 0;
+    afs_uint32 hi, chain_len, cbi;
+    struct CallBack *cb, *ncb;
+
+    hi = h_htoi(host);
+    chain_len = 0;
+
+    /* ncb is assigned in the body before the loop advance reads it; every
+     * path that skips the assignment leaves the loop via break or goto */
+    for (cbi = host->cblist, cb = itocb(cbi);
+        cb;
+        cbi = cb->hnext, cb = ncb) {
+       if (chain_len && (host->cblist == cbi)) {
+           /* we've wrapped around the circular list, and everything looks ok */
+           break;
+       }
+       /* the list head comes straight from the host structure, so it must
+        * be range-checked before cb is dereferenced; later indices were
+        * already checked as the previous element's hnext */
+       if (cbi > cbstuff.nblks) {
+           ViceLog(0, ("cb_stateVerifyHCBList: error: list index out of range (cbi=%d, h->index=%d)\n",
+                       cbi, hi));
+           ret = 1;
+           goto done;
+       }
+       if (cb->hhead != hi) {
+           ViceLog(0, ("cb_stateVerifyHCBList: error: incorrect cb->hhead (cbi=%d, h->index=%d, cb->hhead=%d)\n",
+                       cbi, hi, cb->hhead));
+           ret = 1;
+       }
+       if (!cb->hprev || !cb->hnext) {
+           ViceLog(0, ("cb_stateVerifyHCBList: error: null index in circular list (cbi=%d, h->index=%d)\n",
+                       cbi, hi));
+           ret = 1;
+           goto done;
+       }
+       if ((cb->hprev > cbstuff.nblks) ||
+           (cb->hnext > cbstuff.nblks)) {
+           ViceLog(0, ("cb_stateVerifyHCBList: error: list index out of range (cbi=%d, h->index=%d, cb->hprev=%d, cb->hnext=%d, nCBs=%d)\n",
+                       cbi, hi, cb->hprev, cb->hnext, cbstuff.nblks));
+           ret = 1;
+           goto done;
+       }
+       ncb = itocb(cb->hnext);
+       if (cbi != ncb->hprev) {
+           ViceLog(0, ("cb_stateVerifyHCBList: error: corrupt linked list (cbi=%d, h->index=%d)\n",
+                       cbi, hi));
+           ret = 1;
+           goto done;
+       }
+       if (chain_len > FS_STATE_HCB_MAX_LIST_LEN) {
+           ViceLog(0, ("cb_stateVerifyHCBList: error: list length exceeds %d (h->index=%d); assuming there's a loop\n",
+                       FS_STATE_HCB_MAX_LIST_LEN, hi));
+           ret = 1;
+           goto done;
+       }
+       chain_len++;
+    }
+
+ done:
+    return ret;
+}
+
+/*
+ * Check every timeout queue.
+ *
+ * Each of the CB_NUM_TIMEOUT_QUEUES queues is a circular list linked
+ * through tnext/tprev.  The walk of one queue ends cleanly when it
+ * arrives back at the head.  It is abandoned (and the next queue is
+ * tried) on: an index above cbstuff.nblks, a zero or out-of-range
+ * tprev/tnext, a successor whose tprev is not the current entry, or more
+ * than FS_STATE_TCB_MAX_LIST_LEN entries.  A thead that designates a
+ * different queue is logged without stopping the walk.
+ *
+ * Returns 0 if nothing was flagged, 1 otherwise.
+ */
+static int
+cb_stateVerifyTimeoutQueues(struct fs_dump_state * state)
+{
+    int bad = 0, q;
+    afs_uint32 cbi, seen;
+    struct CallBack *cb, *ncb;
+
+    for (q = 0; q < CB_NUM_TIMEOUT_QUEUES; q++) {
+       seen = 0;
+       cbi = timeout[q];
+       cb = itocb(cbi);
+
+       while (cb) {
+           if (seen && (cbi == timeout[q])) {
+               /* we've wrapped around the circular list, and everything looks ok */
+               break;
+           }
+           if (cbi > cbstuff.nblks) {
+               ViceLog(0, ("cb_stateVerifyTimeoutQueues: error: list index out of range (cbi=%d, tindex=%d)\n",
+                           cbi, q));
+               bad = 1;
+               break;
+           }
+           if (itot(cb->thead) != &timeout[q]) {
+               ViceLog(0, ("cb_stateVerifyTimeoutQueues: error: cb->thead points to wrong timeout queue (tindex=%d, cbi=%d, cb->thead=%d)\n",
+                           q, cbi, cb->thead));
+               bad = 1;
+           }
+           if (!cb->tprev || !cb->tnext) {
+               ViceLog(0, ("cb_stateVerifyTimeoutQueues: null index in circular list (cbi=%d, tindex=%d)\n",
+                           cbi, q));
+               bad = 1;
+               break;
+           }
+           if ((cb->tprev > cbstuff.nblks) ||
+               (cb->tnext > cbstuff.nblks)) {
+               ViceLog(0, ("cb_stateVerifyTimeoutQueues: list index out of range (cbi=%d, tindex=%d, cb->tprev=%d, cb->tnext=%d, nCBs=%d)\n",
+                           cbi, q, cb->tprev, cb->tnext, cbstuff.nblks));
+               bad = 1;
+               break;
+           }
+           ncb = itocb(cb->tnext);
+           if (cbi != ncb->tprev) {
+               ViceLog(0, ("cb_stateVerifyTimeoutQueues: corrupt linked list (cbi=%d, tindex=%d)\n",
+                           cbi, q));
+               bad = 1;
+               break;
+           }
+           if (seen > FS_STATE_TCB_MAX_LIST_LEN) {
+               ViceLog(0, ("cb_stateVerifyTimeoutQueues: list length exceeds %d (tindex=%d); assuming there's a loop\n",
+                           FS_STATE_TCB_MAX_LIST_LEN, q));
+               bad = 1;
+               break;
+           }
+           seen++;
+
+           /* advance: the index comes from the current entry, the pointer
+            * from the successor already looked up above */
+           cbi = cb->tnext;
+           cb = ncb;
+       }
+    }
+
+    return bad;
+}
+
+/*
+ * Write the timeout queue section of the callback state dump.
+ *
+ * state->eof_offset is recorded as cb_hdr->timeout_offset.  The section
+ * header is cleared and filled in with its magic, the number of queues
+ * (CB_NUM_TIMEOUT_QUEUES) and the section length (header plus one
+ * afs_uint32 per queue).  Header and the timeout[] array are then
+ * written with a single vectored write at that offset, and the dump EOF
+ * is advanced by the section length.
+ *
+ * Returns 0 on success, 1 if the seek or the write fails.
+ */
+static int
+cb_stateSaveTimeouts(struct fs_dump_state * state)
+{
+    int ret = 0;
+    struct iovec iov[2];
+
+    AssignInt64(state->eof_offset, &state->cb_hdr->timeout_offset);
+
+    /* clear using the timeout header's own size; this header is the one
+     * written out below with sizeof(struct callback_state_timeout_header) */
+    memset(state->cb_timeout_hdr, 0, sizeof(struct callback_state_timeout_header));
+    state->cb_timeout_hdr->magic = CALLBACK_STATE_TIMEOUT_MAGIC;
+    state->cb_timeout_hdr->records = CB_NUM_TIMEOUT_QUEUES;
+    state->cb_timeout_hdr->len = sizeof(struct callback_state_timeout_header) +
+       (state->cb_timeout_hdr->records * sizeof(afs_uint32));
+
+    /* iov[0] = section header, iov[1] = the raw timeout queue heads */
+    iov[0].iov_base = (char *)state->cb_timeout_hdr;
+    iov[0].iov_len = sizeof(struct callback_state_timeout_header);
+    iov[1].iov_base = (char *)timeout;
+    iov[1].iov_len = sizeof(timeout);
+
+    if (fs_stateSeek(state, &state->cb_hdr->timeout_offset)) {
+       ret = 1;
+       goto done;
+    }
+
+    if (fs_stateWriteV(state, iov, 2)) {
+       ret = 1;
+       goto done;
+    }
+
+    fs_stateIncEOF(state, state->cb_timeout_hdr->len);
+
+ done:
+    return ret;
+}
+
+/*
+ * Read the timeout queue section of the callback state dump.
+ *
+ * The section header is read from cb_hdr->timeout_offset and rejected
+ * unless its magic is CALLBACK_STATE_TIMEOUT_MAGIC, its record count is
+ * CB_NUM_TIMEOUT_QUEUES, and its length equals the header size plus one
+ * afs_uint32 per record.  The queue heads are then read into timeout[].
+ *
+ * Returns 0 on success, 1 on a read failure or a header mismatch.
+ */
+static int
+cb_stateRestoreTimeouts(struct fs_dump_state * state)
+{
+    int nbytes;
+
+    if (fs_stateReadHeader(state, &state->cb_hdr->timeout_offset,
+                          state->cb_timeout_hdr,
+                          sizeof(struct callback_state_timeout_header)))
+       return 1;
+
+    if (state->cb_timeout_hdr->magic != CALLBACK_STATE_TIMEOUT_MAGIC ||
+       state->cb_timeout_hdr->records != CB_NUM_TIMEOUT_QUEUES)
+       return 1;
+
+    /* payload size implied by the record count */
+    nbytes = state->cb_timeout_hdr->records * sizeof(afs_uint32);
+
+    if (state->cb_timeout_hdr->len !=
+       (sizeof(struct callback_state_timeout_header) + nbytes))
+       return 1;
+
+    if (fs_stateRead(state, timeout, nbytes))
+       return 1;
+
+    return 0;
+}
+
+/*
+ * Write the FileEntry hash table section of the state dump.
+ *
+ * The section starts at the current end of the dump; its header records
+ * the magic, FEHASH_SIZE as the record count, and the total length
+ * (header plus one afs_uint32 per bucket).  Header and HashTable[] go
+ * out in a single vectored write.
+ *
+ * Returns 0 on success, 1 if the seek or the write fails.
+ */
+static int
+cb_stateSaveFEHash(struct fs_dump_state * state)
+{
+    struct iovec vec[2];
+
+    AssignInt64(state->eof_offset, &state->cb_hdr->fehash_offset);
+
+    memset(state->cb_fehash_hdr, 0, sizeof(struct callback_state_fehash_header));
+    state->cb_fehash_hdr->magic = CALLBACK_STATE_FEHASH_MAGIC;
+    state->cb_fehash_hdr->records = FEHASH_SIZE;
+    state->cb_fehash_hdr->len = sizeof(struct callback_state_fehash_header) +
+       (state->cb_fehash_hdr->records * sizeof(afs_uint32));
+
+    vec[0].iov_base = (char *)state->cb_fehash_hdr;
+    vec[0].iov_len = sizeof(struct callback_state_fehash_header);
+    vec[1].iov_base = (char *)HashTable;
+    vec[1].iov_len = sizeof(HashTable);
+
+    /* position at the section start, then emit header + table */
+    if (fs_stateSeek(state, &state->cb_hdr->fehash_offset)) {
+       return 1;
+    }
+    if (fs_stateWriteV(state, vec, 2)) {
+       return 1;
+    }
+
+    fs_stateIncEOF(state, state->cb_fehash_hdr->len);
+    return 0;
+}
+
+/*
+ * Read the FileEntry hash table section back from the state dump.
+ *
+ * The header must carry the expected magic, exactly FEHASH_SIZE records,
+ * and a length equal to the header plus one afs_uint32 per record before
+ * the payload is read into HashTable[].
+ *
+ * Returns 0 on success, 1 on any read or validation failure.
+ */
+static int
+cb_stateRestoreFEHash(struct fs_dump_state * state)
+{
+    int payload_len;
+
+    if (fs_stateReadHeader(state, &state->cb_hdr->fehash_offset,
+                          state->cb_fehash_hdr,
+                          sizeof(struct callback_state_fehash_header))) {
+       return 1;
+    }
+
+    if (state->cb_fehash_hdr->magic != CALLBACK_STATE_FEHASH_MAGIC) {
+       return 1;
+    }
+    if (state->cb_fehash_hdr->records != FEHASH_SIZE) {
+       return 1;
+    }
+
+    payload_len = state->cb_fehash_hdr->records * sizeof(afs_uint32);
+    if (state->cb_fehash_hdr->len !=
+       (sizeof(struct callback_state_fehash_header) + payload_len)) {
+       return 1;
+    }
+
+    if (fs_stateRead(state, HashTable, payload_len)) {
+       return 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Save every FileEntry reachable from the FE hash table.
+ *
+ * Marks the current end of the dump as the start of the FE section, then
+ * walks each hash bucket's fnext chain and hands every entry to
+ * cb_stateSaveFE().  Stops at the first failure.
+ *
+ * Returns 0 on success, 1 as soon as one entry fails to save.
+ */
+static int
+cb_stateSaveFEs(struct fs_dump_state * state)
+{
+    register int bucket, idx;
+    register struct FileEntry *entry;
+
+    AssignInt64(state->eof_offset, &state->cb_hdr->fe_offset);
+
+    for (bucket = 0; bucket < FEHASH_SIZE; bucket++) {
+       idx = HashTable[bucket];
+       while (idx) {           /* index 0 terminates the chain */
+           entry = itofe(idx);
+           if (cb_stateSaveFE(state, entry)) {
+               return 1;
+           }
+           idx = entry->fnext;
+       }
+    }
+
+    return 0;
+}
+
+/*
+ * Restore all FileEntry records from the state dump.
+ *
+ * Calls cb_stateRestoreFE() once per entry counted in the callback
+ * header's nFEs field (sampled once, up front).
+ *
+ * Returns 0 on success, 1 as soon as one entry fails to restore.
+ */
+static int
+cb_stateRestoreFEs(struct fs_dump_state * state)
+{
+    int remaining;
+
+    for (remaining = state->cb_hdr->nFEs; remaining > 0; remaining--) {
+       if (cb_stateRestoreFE(state)) {
+           return 1;
+       }
+    }
+
+    return 0;
+}
+
+/*
+ * Serialize one FileEntry followed by every CallBack on its per-FE list.
+ *
+ * Layout written: entry header, FEDiskEntry, then hdr.nCBs CBDiskEntry
+ * records.  Records are staged in a 16-slot iovec and flushed with
+ * fs_stateWriteV() whenever the batch is full or the last callback has
+ * been staged; the first batch also carries the entry header and the FE
+ * record.  The callback count is only known once the list has been
+ * walked, so after any flush the finished header is written again at the
+ * entry's starting offset.
+ *
+ * Also maintains fe_max, cb_max, nFEs and nCBs in the callback state
+ * header.
+ *
+ * Returns 0 on success, 1 on any conversion or I/O failure.
+ */
+static int
+cb_stateSaveFE(struct fs_dump_state * state, struct FileEntry * fe)
+{
+    int ret = 0, iovcnt, cbi, written = 0;
+    afs_uint32 fei;
+    struct callback_state_entry_header hdr;
+    struct FEDiskEntry fedsk;
+    struct CBDiskEntry cbdsk[16];
+    struct iovec iov[16];
+    struct CallBack *cb;
+
+    /* track the largest FE index seen */
+    fei = fetoi(fe);
+    if (fei > state->cb_hdr->fe_max) {
+       state->cb_hdr->fe_max = fei;
+    }
+
+    memset(&hdr, 0, sizeof(struct callback_state_entry_header));
+
+    if (cb_stateFEToDiskEntry(fe, &fedsk)) {
+       ret = 1;
+       goto done;
+    }
+
+    iov[0].iov_base = (char *)&hdr;
+    iov[0].iov_len = sizeof(hdr);
+    iov[1].iov_base = (char *)&fedsk;
+    iov[1].iov_len = sizeof(struct FEDiskEntry);
+    iovcnt = 2;
+
+    /* iovcnt is the next free slot in both iov[] and cbdsk[].  It is reset
+     * on every flush, so neither 16-element array can be overrun however
+     * many callbacks hang off this FE, and each write covers exactly the
+     * records staged since the previous one. */
+    for (cbi = fe->firstcb, cb = itocb(cbi);
+        cb != NULL;
+        cbi = cb->cnext, cb = itocb(cbi), hdr.nCBs++) {
+       if (cbi > state->cb_hdr->cb_max) {
+           state->cb_hdr->cb_max = cbi;
+       }
+       if (cb_stateCBToDiskEntry(cb, &cbdsk[iovcnt])) {
+           ret = 1;
+           goto done;
+       }
+       cbdsk[iovcnt].index = cbi;
+       iov[iovcnt].iov_base = (char *)&cbdsk[iovcnt];
+       iov[iovcnt].iov_len = sizeof(struct CBDiskEntry);
+       iovcnt++;
+       /* flush when the batch is full or this was the last callback */
+       if ((iovcnt == 16) || (!cb->cnext)) {
+           if (fs_stateWriteV(state, iov, iovcnt)) {
+               ret = 1;
+               goto done;
+           }
+           written = 1;
+           iovcnt = 0;
+       }
+    }
+
+    hdr.magic = CALLBACK_STATE_ENTRY_MAGIC;
+    hdr.len = sizeof(hdr) + sizeof(struct FEDiskEntry) +
+       (hdr.nCBs * sizeof(struct CBDiskEntry));
+
+    if (!written) {
+       /* no callbacks: header + FE record are still staged in iov[0..1] */
+       if (fs_stateWriteV(state, iov, iovcnt)) {
+           ret = 1;
+           goto done;
+       }
+    } else {
+       /* the header that went out with the first batch was incomplete;
+        * overwrite it in place at the start of this entry */
+       if (fs_stateWriteHeader(state, &state->eof_offset, &hdr, sizeof(hdr))) {
+           ret = 1;
+           goto done;
+       }
+    }
+
+    fs_stateIncEOF(state, hdr.len);
+
+    if (written) {
+       /* the header rewrite moved the file position; return to the new EOF */
+       if (fs_stateSeek(state, &state->eof_offset)) {
+           ret = 1;
+           goto done;
+       }
+    }
+
+    state->cb_hdr->nFEs++;
+    state->cb_hdr->nCBs += hdr.nCBs;
+
+ done:
+    return ret;
+}
+
+/*
+ * Restore one FileEntry and its CallBack records from the state dump.
+ *
+ * Reads the entry header and FEDiskEntry at the current file position,
+ * checks the entry magic, takes a free FileEntry from GetFE() and fills
+ * it via cb_stateDiskEntryToFE().  The hdr.nCBs CBDiskEntry records that
+ * follow are read in batches of at most 16 and each batch is passed to
+ * cb_stateRestoreCBs().
+ *
+ * Returns 0 on success, 1 on any read, validation or allocation failure.
+ */
+static int
+cb_stateRestoreFE(struct fs_dump_state * state)
+{
+    int ret = 0, iovcnt, nCBs;
+    struct callback_state_entry_header hdr;
+    struct FEDiskEntry fedsk;
+    struct CBDiskEntry cbdsk[16];
+    struct iovec iov[16];
+    struct FileEntry * fe;
+
+    iov[0].iov_base = (char *)&hdr;
+    iov[0].iov_len = sizeof(hdr);
+    iov[1].iov_base = (char *)&fedsk;
+    iov[1].iov_len = sizeof(fedsk);
+    iovcnt = 2;
+
+    if (fs_stateReadV(state, iov, iovcnt)) {
+       ret = 1;
+       goto done;
+    }
+
+    if (hdr.magic != CALLBACK_STATE_ENTRY_MAGIC) {
+       ret = 1;
+       goto done;
+    }
+
+    fe = GetFE();
+    if (fe == NULL) {
+       ViceLog(0, ("cb_stateRestoreFE: ran out of free FileEntry structures\n"));
+       ret = 1;
+       goto done;
+    }
+
+    if (cb_stateDiskEntryToFE(state, &fedsk, fe)) {
+       ret = 1;
+       goto done;
+    }
+
+    if (hdr.nCBs) {
+       /* iovcnt is the next free slot in both iov[] and cbdsk[]; it goes
+        * back to zero after each batch so an entry with more than 16
+        * callbacks cannot run off the end of either array */
+       for (iovcnt = 0, nCBs = 0;
+            nCBs < hdr.nCBs;
+            nCBs++) {
+           iov[iovcnt].iov_base = (char *)&cbdsk[iovcnt];
+           iov[iovcnt].iov_len = sizeof(struct CBDiskEntry);
+           iovcnt++;
+           /* read when the batch is full or this is the final record */
+           if ((iovcnt == 16) || (nCBs == hdr.nCBs - 1)) {
+               if (fs_stateReadV(state, iov, iovcnt)) {
+                   ret = 1;
+                   goto done;
+               }
+               if (cb_stateRestoreCBs(state, fe, iov, iovcnt)) {
+                   ret = 1;
+                   goto done;
+               }
+               iovcnt = 0;
+           }
+       }
+    }
+
+ done:
+    return ret;
+}
+
+/*
+ * Restore a batch of CallBack records that were just read from disk.
+ *
+ * Each of the niovecs iovec elements is expected to point at a
+ * CBDiskEntry.  For every one, a free CallBack is taken from GetCB() and
+ * filled in by cb_stateDiskEntryToCB().
+ *
+ * fe is accepted for the caller's convenience but is not consulted here.
+ *
+ * Returns 0 on success, 1 if the free list is exhausted or a disk entry
+ * is rejected.
+ */
+static int
+cb_stateRestoreCBs(struct fs_dump_state * state, struct FileEntry * fe,
+                  struct iovec * iov, int niovecs)
+{
+    int ret = 0, idx;
+    register struct CallBack * cb;
+    struct CBDiskEntry * cbdsk;
+
+    for (idx = 0; idx < niovecs; idx++) {
+       cbdsk = (struct CBDiskEntry *) iov[idx].iov_base;
+       if ((cb = GetCB()) == NULL) {
+           ViceLog(0, ("cb_stateRestoreCBs: ran out of free CallBack structures\n"));
+           ret = 1;
+           goto done;
+       }
+       if (cb_stateDiskEntryToCB(state, cbdsk, cb)) {
+           ViceLog(0, ("cb_stateRestoreCBs: corrupt CallBack disk entry\n"));
+           ret = 1;
+           goto done;
+       }
+    }
+
+ done:
+    return ret;
+}
+
+
+/*
+ * Stamp a callback state header for writing: current value of tfirst
+ * plus the callback state magic number and format version.
+ * Always returns 0.
+ */
+static int
+cb_stateFillHeader(struct callback_state_header * hdr)
+{
+    hdr->tfirst = tfirst;
+    hdr->stamp.version = CALLBACK_STATE_VERSION;
+    hdr->stamp.magic = CALLBACK_STATE_MAGIC;
+
+    return 0;
+}
+
+/*
+ * Validate a callback state header read from disk.
+ *
+ * The header is rejected when its magic or version does not match, or
+ * when it claims more FEs or CBs than cbstuff.nblks (in which case a
+ * message is logged).
+ *
+ * Returns 0 if the header is acceptable, 1 otherwise.
+ */
+static int
+cb_stateCheckHeader(struct callback_state_header * hdr)
+{
+    if (hdr->stamp.magic != CALLBACK_STATE_MAGIC) {
+       return 1;
+    }
+
+    if (hdr->stamp.version != CALLBACK_STATE_VERSION) {
+       return 1;
+    }
+
+    if ((hdr->nFEs > cbstuff.nblks) || (hdr->nCBs > cbstuff.nblks)) {
+       ViceLog(0, ("cb_stateCheckHeader: saved callback state larger than callback memory allocation\n"));
+       return 1;
+    }
+
+    return 0;
+}
+
+/* disk entry conversion routines */
+
+/*
+ * Build the on-disk form of a FileEntry: a byte copy of the in-memory
+ * structure plus the entry's index in the FE array.
+ * Always returns 0.
+ */
+static int
+cb_stateFEToDiskEntry(struct FileEntry * in, struct FEDiskEntry * out)
+{
+    memcpy(&out->fe, in, sizeof(*in));
+    out->index = fetoi(in);
+
+    return 0;
+}
+
+/*
+ * Load an on-disk FileEntry into an in-memory one and record the mapping
+ * from its saved index to its new index.
+ *
+ * The structure contents are copied first; the saved index is then
+ * checked against the FE index map (zero and anything at or beyond
+ * fe_map.len are rejected) and, if valid, the old->new mapping is stored.
+ *
+ * Returns 0 on success, 1 if the saved index is out of range.
+ */
+static int
+cb_stateDiskEntryToFE(struct fs_dump_state * state,
+                     struct FEDiskEntry * in, struct FileEntry * out)
+{
+    int ret = 0;
+
+    memcpy(out, &in->fe, sizeof(struct FileEntry));
+
+    /* setup FE map entry */
+    if (!in->index || (in->index >= state->fe_map.len)) {
+       /* newline-terminated so the next log record starts on its own line */
+       ViceLog(0, ("cb_stateDiskEntryToFE: index (%d) out of range\n",
+                   in->index));
+       ret = 1;
+       goto done;
+    }
+    state->fe_map.entries[in->index].old_idx = in->index;
+    state->fe_map.entries[in->index].new_idx = fetoi(out);
+
+ done:
+    return ret;
+}
+
+/*
+ * Build the on-disk form of a CallBack: a byte copy of the in-memory
+ * structure plus the callback's index in the CB array.
+ * Always returns 0.
+ */
+static int
+cb_stateCBToDiskEntry(struct CallBack * in, struct CBDiskEntry * out)
+{
+    memcpy(&out->cb, in, sizeof(*in));
+    out->index = cbtoi(in);
+
+    return 0;
+}
+
+/*
+ * Load an on-disk CallBack into an in-memory one and record the mapping
+ * from its saved index to its new index.
+ *
+ * The structure contents are copied unconditionally; the saved index is
+ * then range-checked against the CB index map before the mapping is
+ * stored.
+ *
+ * Returns 0 on success, 1 if the saved index is zero or out of range.
+ */
+static int
+cb_stateDiskEntryToCB(struct fs_dump_state * state,
+                     struct CBDiskEntry * in, struct CallBack * out)
+{
+    memcpy(out, &in->cb, sizeof(*out));
+
+    /* setup CB map entry */
+    if (!in->index || (in->index >= state->cb_map.len)) {
+       ViceLog(0, ("cb_stateDiskEntryToCB: index (%d) out of range\n",
+                   in->index));
+       return 1;
+    }
+
+    state->cb_map.entries[in->index].old_idx = in->index;
+    state->cb_map.entries[in->index].new_idx = cbtoi(out);
+
+    return 0;
+}
+
+/* index map routines */
+
+/*
+ * Allocate the zero-filled old-index -> new-index maps for FEs and CBs.
+ * Each map gets one slot more than the largest saved index (fe_max or
+ * cb_max), so it can be indexed directly by a saved index.
+ *
+ * Returns 0 if both allocations succeed, 1 otherwise.
+ */
+static int
+cb_stateAllocMap(struct fs_dump_state * state)
+{
+    state->fe_map.len = state->cb_hdr->fe_max + 1;
+    state->cb_map.len = state->cb_hdr->cb_max + 1;
+
+    state->fe_map.entries = (struct idx_map_entry_t *)
+       calloc(state->fe_map.len, sizeof(struct idx_map_entry_t));
+    state->cb_map.entries = (struct idx_map_entry_t *)
+       calloc(state->cb_map.len, sizeof(struct idx_map_entry_t));
+
+    if ((state->fe_map.entries == NULL) || (state->cb_map.entries == NULL)) {
+       return 1;
+    }
+
+    return 0;
+}
+
+/*
+ * Translate a saved FileEntry index into its post-restore index.
+ *
+ * An old index of 0 maps to 0.  Otherwise the index must be inside the
+ * FE map and the map slot must have been filled in for that same index;
+ * both failures are logged.
+ *
+ * Returns 0 and stores the result in *new on success; returns 1 and
+ * leaves *new untouched on failure.
+ */
+int
+fe_OldToNew(struct fs_dump_state * state, afs_uint32 old, afs_uint32 * new)
+{
+    /* FEs use a one-based indexing system, so old==0 implies no mapping */
+    if (old == 0) {
+       *new = 0;
+       return 0;
+    }
+
+    if (old >= state->fe_map.len) {
+       ViceLog(0, ("fe_OldToNew: index %d is out of range\n", old));
+       return 1;
+    }
+
+    /* sanity check: the slot must describe this very index */
+    if (state->fe_map.entries[old].old_idx != old) {
+       ViceLog(0, ("fe_OldToNew: index %d points to an invalid FileEntry record\n", old));
+       return 1;
+    }
+
+    *new = state->fe_map.entries[old].new_idx;
+    return 0;
+}
+
+/*
+ * Translate a saved CallBack index into its post-restore index.
+ *
+ * An old index of 0 maps to 0.  Otherwise the index must be inside the
+ * CB map and the map slot must have been filled in for that same index;
+ * both failures are logged.
+ *
+ * Returns 0 and stores the result in *new on success; returns 1 and
+ * leaves *new untouched on failure.
+ */
+int
+cb_OldToNew(struct fs_dump_state * state, afs_uint32 old, afs_uint32 * new)
+{
+    /* CBs use a one-based indexing system, so old==0 implies no mapping */
+    if (old == 0) {
+       *new = 0;
+       return 0;
+    }
+
+    if (old >= state->cb_map.len) {
+       ViceLog(0, ("cb_OldToNew: index %d is out of range\n", old));
+       return 1;
+    }
+
+    /* sanity check: the slot must describe this very index */
+    if (state->cb_map.entries[old].old_idx != old) {
+       ViceLog(0, ("cb_OldToNew: index %d points to an invalid CallBack record\n", old));
+       return 1;
+    }
+
+    *new = state->cb_map.entries[old].new_idx;
+    return 0;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
  int
  DumpCallBackState(void)
  {
@@ -1807,7 +2673,7 @@ DumpCallBackState(void)
      return 0;
  }
  
-#endif
+#endif /* !INTERPRET_DUMP */
  
  #ifdef INTERPRET_DUMP
  
@@ -1931,7 +2797,7 @@ main(int argc, char **argv)
         struct CallBack *cb;
         struct FileEntry *fe;
  
-       for (hash = 0; hash < VHASH; hash++) {
+       for (hash = 0; hash < FEHASH_SIZE; hash++) {
             for (feip = &HashTable[hash]; fe = itofe(*feip);) {
                 if (!vol || (fe->volid == vol)) {
                     register struct CallBack *cbnext;
@@ -2201,6 +3067,15 @@ MultiProbeAlternateAddress_r(struct host *host)
                  H_UNLOCK;
              }
          }
+#ifdef AFS_DEMAND_ATTACH_FS
+       /* try to bail ASAP if the fileserver is shutting down */
+       FS_STATE_RDLOCK;
+       if (fs_state.mode == FS_MODE_SHUTDOWN) {
+           FS_STATE_UNLOCK;
+           multi_Abort;
+       }
+       FS_STATE_UNLOCK;
+#endif
      }
      multi_End_Ignore;
      H_LOCK;
diff --git a/src/viced/callback.h b/src/viced/callback.h

new file mode 100644 (file)

index 0000000..2f4cca8
--- /dev/null
+++ b/src/viced/callback.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright 2000, International Business Machines Corporation and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
+ */
+
+#ifndef _AFS_VICED_CALLBACK_H
+#define _AFS_VICED_CALLBACK_H
+
+/* Maximum number of call backs to break at once, single fid
+ * There is some debate as to just how large this value should be
+ * Ideally, it would be very very large, but I am afraid that the
+ * cache managers will all send in their responses simultaneously,
+ * thereby swamping the file server.  As a result, something like
+ * 10 or 15 might be a better bet.
+ */
+#define MAX_CB_HOSTS   10
+
+/* max time to break a callback, otherwise client is dead or net is hosed */
+#define MAXCBT 25
+
+#define u_byte unsigned char
+
+struct cbcounters {            /* counters for the callback package; the one instance is cbstuff */
+    afs_int32 DeleteFiles;
+    afs_int32 DeleteCallBacks;
+    afs_int32 BreakCallBacks;
+    afs_int32 AddCallBacks;
+    afs_int32 GotSomeSpaces;
+    afs_int32 DeleteAllCallBacks;
+    afs_int32 nFEs, nCBs, nblks;       /* NOTE(review): nblks looks like the allocation size; saved state with more FEs or CBs than nblks is rejected on restore -- confirm */
+    afs_int32 CBsTimedOut;
+    afs_int32 nbreakers;
+    afs_int32 GSS1, GSS2, GSS3, GSS4, GSS5;    /* presumably GotSomeSpaces sub-counters -- TODO confirm */
+};
+extern struct cbcounters cbstuff;
+
+struct cbstruct {
+    struct host *hp;
+    afs_uint32 thead;
+};
+
+/* structure MUST be multiple of 8 bytes, otherwise the casts to
+ * struct object will have alignment issues on *P64 userspaces */
+struct FileEntry {
+    afs_uint32 vnode;
+    afs_uint32 unique;
+    afs_uint32 volid;
+    afs_uint32 fnext;           /* index of next FE in hash chain */
+    afs_uint32 ncbs;            /* number of callbacks for this FE */
+    afs_uint32 firstcb;         /* index of first cb in per-FE list */
+    afs_uint32 status;          /* status bits for this FE */
+    afs_uint32 spare;
+};
+#define FE_LATER 0x1
+
+/* structure MUST be multiple of 8 bytes, otherwise the casts to
+ * struct object will have alignment issues on *P64 userspaces */
+struct CallBack {
+    afs_uint32 cnext;          /* index of next cb in per-FE list */
+    afs_uint32 fhead;          /* index of associated FE */
+    u_byte thead;              /* Head of timeout chain */
+    u_byte status;             /* Call back status; see definitions, below */
+    unsigned short spare;      /* ensure proper alignment */
+    afs_uint32 hhead;          /* Head of host table chain */
+    afs_uint32 tprev, tnext;   /* per-timeout circular list of callbacks */
+    afs_uint32 hprev, hnext;   /* per-host circular list of callbacks */
+};
+
+struct VCBParams {
+    struct cbstruct cba[MAX_CB_HOSTS]; /* re-entrant storage */
+    unsigned int ncbas;
+    afs_uint32 thead;          /* head of timeout queue for youngest callback */
+    struct AFSFid *fid;
+};
+
+
+/* callback hash macros */
+#define FEHASH_SIZE 512                /* Power of 2 */
+#define FEHASH_MASK (FEHASH_SIZE-1)
+#define FEHash(volume, unique) (((volume)+(unique))&(FEHASH_MASK))
+
+#define CB_NUM_TIMEOUT_QUEUES 128
+
+
+/* status values for status field of CallBack structure */
+#define CB_NORMAL   1          /* Normal call back */
+#define CB_DELAYED  2          /* Delayed call back due to rpc problems.
+                                * The call back entry will be added back to the
+                                * host list at the END of the list, so that
+                                * searching backwards in the list will find all
+                                * the host's (consecutive) delayed call back entries */
+#define CB_VOLUME   3          /* Callback for a volume */
+#define CB_BULK     4          /* Normal callbacks, handed out from FetchBulkStatus */
+
+/* call back indices to pointers, and vice-versa */
+#define itocb(i)    ((i)?CB+(i):0)
+#define cbtoi(cbp)  (!(cbp)?0:(cbp)-CB)
+
+/* file entry indices to pointers, and vice-versa */
+#define itofe(i)    ((i)?FE+(i):0)
+#define fetoi(fep)  (!(fep)?0:(fep)-FE)
+
+/* Timeouts:  there are 128 possible timeout values in effect at any
+ * given time.  Each timeout represents timeouts in an interval of 128
+ * seconds.  So the maximum timeout for a call back is 128*128=16384
+ * seconds, or 4 1/2 hours.  The timeout cleanup stuff is called only
+ * if space runs out or by the file server every 5 minutes.  This 5
+ * minute slack should be allowed for--so a maximum time of 4 hours
+ * is safer.
+ *
+ * Timeouts must be chosen to correspond to an exact multiple
+ * of 128, because all times are truncated to a 128 multiple, and
+ * timed out if the current truncated time is <= to the truncated time
+ * corresponding to the timeout queue.
+ */
+
+/* Unix time to Call Back time, and vice-versa.  Call back time is
+   in units of 128 seconds, corresponding to time queues. */
+#define CBtime(uxtime) ((uxtime)>>7)
+#define UXtime(cbtime) ((cbtime)<<7)
+
+/* Given a Unix time, compute the closest Unix time that corresponds to
+   a time queue, rounding up */
+#define TimeCeiling(uxtime)    (((uxtime)+127)&~127)
+
+#define TimeOutCutoff   ((sizeof(TimeOuts)/sizeof(TimeOuts[0]))*8)
+#define TimeOut(nusers)  ((nusers)>=TimeOutCutoff? MinTimeOut: TimeOuts[(nusers)>>3])
+
+/* time out at server is 3 minutes more than ws */
+#define ServerBias       (3*60)
+
+/* Convert cbtime to timeout queue index */
+#define TIndex(cbtime)  (((cbtime)&127)+1)
+
+/* Convert cbtime to pointer to timeout queue head */
+#define THead(cbtime)  (&timeout[TIndex(cbtime)-1])
+
+/* Normalize index into timeout array so that two such indices will be
+   ordered correctly, so that they can be compared to see which times
+   out sooner, or so that the difference in time out times between them
+   can be computed. */
+#define TNorm(index)   ((index)<TIndex(tfirst)?(index)+128:(index))
+
+/* This converts a timeout index into the actual time it will expire */
+#define TIndexToTime(index) (UXtime(TNorm(index) - TIndex(tfirst) + tfirst))
+
+
+/* Convert pointer to timeout queue head to index, and vice versa.
+ * Indices are one-based.  Arguments are parenthesized so that passing an
+ * expression (e.g. a conditional) expands with the intended precedence. */
+#define ttoi(t)                (((t)-timeout)+1)
+#define itot(i)                ((timeout)+((i)-1))
+
+#endif /* _AFS_VICED_CALLBACK_H */
diff --git a/src/viced/host.c b/src/viced/host.c

index 092a18da1baad17f21494ad8f71b23d103a6c9a3..5f2f940decaba54100c8ed8afec6fadcc8434b65 100644 (file)
--- a/src/viced/host.c
+++ b/src/viced/host.c
@@ -5,6 +5,8 @@
   * This software has been released under the terms of the IBM Public
   * License.  For details, see the LICENSE file in the top-level source
   * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
   */
  
  #include <afsconfig.h>
@@ -59,7 +61,11 @@ RCSID
  #include "viced_prototypes.h"
  #include "viced.h"
  #include "host.h"
-
+#include "callback.h"
+#ifdef AFS_DEMAND_ATTACH_FS
+#include "../util/afsutil_prototypes.h"
+#include "../tviced/serialize_state.h"
+#endif /* AFS_DEMAND_ATTACH_FS */
  
  #ifdef AFS_PTHREAD_ENV
  pthread_mutex_t host_glock_mutex;
@@ -83,6 +89,13 @@ int hostCount = 0;           /* number of hosts in hostList */
  int rxcon_ident_key;
  int rxcon_client_key;
  
+static struct rx_securityClass *sc = NULL;
+
+static void h_SetupCallbackConn_r(struct host * host);
+static void h_AddHostToHashTable_r(afs_uint32 addr, afs_uint16 port, struct host * host);
+static void h_AddHostToUuidHashTable_r(afsUUID * uuid, struct host * host);
+static int h_DeleteHostFromHashTableByAddr_r(afs_uint32 addr, afs_uint16 port, struct host *host);
+
  #define CESPERBLOCK 73
  struct CEBlock {               /* block of CESPERBLOCK file entries */
      struct client entry[CESPERBLOCK];
@@ -232,9 +245,9 @@ GetHT()
  {
      register struct host *entry;
  
-    if (HTFree == 0)
+    if (HTFree == NULL)
         GetHTBlock();
-    assert(HTFree != 0);
+    assert(HTFree != NULL);
      entry = HTFree;
      HTFree = entry->next;
      HTs++;
@@ -448,7 +461,7 @@ h_gethostcps_r(register struct host *host, register afs_int32 now)
         free(host->hcps.prlist_val);    /* this is for hostaclRefresh */
      host->hcps.prlist_val = NULL;
      host->hcps.prlist_len = 0;
-    slept ? (host->cpsCall = FT_ApproxTime()) : (host->cpsCall = now);
+    host->cpsCall = slept ? (FT_ApproxTime()) : (now);
  
      H_UNLOCK;
      code = pr_GetHostCPS(ntohl(host->host), &host->hcps);
@@ -533,7 +546,6 @@ h_Alloc_r(register struct rx_connection *r_con)
  {
      struct servent *serverentry;
      struct host *host;
-    static struct rx_securityClass *sc = 0;
      afs_int32 now;
  #if FS_STATS_DETAILED
      afs_uint32 newHostAddr_HBO;        /*New host IP addr, in host byte order */
@@ -544,7 +556,7 @@ h_Alloc_r(register struct rx_connection *r_con)
      host->host = rxr_HostOf(r_con);
      host->port = rxr_PortOf(r_con);
  
-    hashInsert_r(host->host, host->port, host);
+    h_AddHostToHashTable_r(host->host, host->port, host);
  
      if (consolePort == 0) {    /* find the portal number for console */
  #if    defined(AFS_OSF_ENV)
@@ -561,24 +573,17 @@ h_Alloc_r(register struct rx_connection *r_con)
         host->Console = 1;
      /* Make a callback channel even for the console, on the off chance that it
       * makes a request that causes a break call back.  It shouldn't. */
-    {
-       if (!sc)
-           sc = rxnull_NewClientSecurityObject();
-       host->callback_rxcon =
-           rx_NewConnection(host->host, host->port, 1, sc, 0);
-       rx_SetConnDeadTime(host->callback_rxcon, 50);
-       rx_SetConnHardDeadTime(host->callback_rxcon, AFS_HARDDEADTIME);
-    }
+    h_SetupCallbackConn_r(host);
      now = host->LastCall = host->cpsCall = host->ActiveCall = FT_ApproxTime();
      host->hostFlags = 0;
      host->hcps.prlist_val = NULL;
      host->hcps.prlist_len = 0;
-    host->interface = 0;
+    host->interface = NULL;
  #ifdef undef
      host->hcpsfailed = 0;      /* save cycles */
      h_gethostcps(host);                /* do this under host hold/lock */
  #endif
-    host->FirstClient = 0;
+    host->FirstClient = NULL;
      h_Hold_r(host);
      h_Lock_r(host);
      h_InsertList_r(host);      /* update global host List */
@@ -596,6 +601,20 @@ h_Alloc_r(register struct rx_connection *r_con)
  }                              /*h_Alloc_r */
  
  
+
+/*
+ * Create the callback rx connection for a host and store it in
+ * host->callback_rxcon.  The shared null security object is created on
+ * first use.  A channel is made even for the console, on the off chance
+ * that it makes a request that causes a break call back.  It shouldn't.
+ */
+static void
+h_SetupCallbackConn_r(struct host * host)
+{
+    struct rx_connection *conn;
+
+    if (sc == NULL) {
+       sc = rxnull_NewClientSecurityObject();
+    }
+
+    conn = rx_NewConnection(host->host, host->port, 1, sc, 0);
+    host->callback_rxcon = conn;
+
+    rx_SetConnDeadTime(conn, 50);
+    rx_SetConnHardDeadTime(conn, AFS_HARDDEADTIME);
+}
+
  /* Lookup a host given an IP address and UDP port number. */
  /* hostaddr and hport are in network order */
  /* Note: host should be released by caller if 0 == *heldp and non-null */
@@ -833,7 +852,7 @@ h_FreeConnection(struct rx_connection *tcon)
      if (client) {
         H_LOCK;
         if (client->tcon == tcon)
-           client->tcon = (struct rx_connection *)0;
+           client->tcon = NULL;
         H_UNLOCK;
      }
      return 0;
@@ -878,8 +897,11 @@ h_Enumerate(int (*proc) (), char *param)
      H_UNLOCK;
      for (i = 0; i < count; i++) {
         held[i] = (*proc) (list[i], held[i], param);
-       if (!held[i])
+       if (!H_ENUMERATE_ISSET_HELD(held[i]))
             h_Release(list[i]); /* this might free up the host */
+       /* bail out of the enumeration early */
+       if (H_ENUMERATE_ISSET_BAIL(held[i]))
+           break;
      }
      free((void *)list);
      free((void *)held);
@@ -908,17 +930,19 @@ h_Enumerate_r(int (*proc) (), struct host *enumstart, char *param)
         h_Hold_r(enumstart); 
      for (host = enumstart; host; host = next, held = nheld) {
         next = host->next;
-       if (next && !(nheld = h_Held_r(next)))
+       if (next && !(nheld = h_Held_r(next)) && !H_ENUMERATE_ISSET_BAIL(held))
             h_Hold_r(next);
         held = (*proc) (host, held, param);
-       if (!held)
+       if (!H_ENUMERATE_ISSET_HELD(held))
             h_Release_r(host); /* this might free up the host */
+       if (H_ENUMERATE_ISSET_BAIL(held))
+           break;
      }
  }                              /*h_Enumerate_r */
  
  /* inserts a new HashChain structure corresponding to this UUID */
-void
-hashInsertUuid_r(struct afsUUID *uuid, struct host *host)
+static void
+h_AddHostToUuidHashTable_r(struct afsUUID *uuid, struct host *host)
  {
      int index;
      struct h_hashChain *chain;
@@ -929,7 +953,7 @@ hashInsertUuid_r(struct afsUUID *uuid, struct host *host)
      /* insert into beginning of list for this bucket */
      chain = (struct h_hashChain *)malloc(sizeof(struct h_hashChain));
      if (!chain) {
-       ViceLog(0, ("Failed malloc in hashInsertUuid_r\n"));
+       ViceLog(0, ("Failed malloc in h_AddHostToUuidHashTable_r\n"));
         assert(0);
      }
      assert(chain);
@@ -940,8 +964,8 @@ hashInsertUuid_r(struct afsUUID *uuid, struct host *host)
  
  
  /* inserts a new HashChain structure corresponding to this address */
-void
-hashInsert_r(afs_uint32 addr, afs_uint16 port, struct host *host)
+static void
+h_AddHostToHashTable_r(afs_uint32 addr, afs_uint16 port, struct host *host)
  {
      int index;
      struct h_hashChain *chain;
@@ -952,7 +976,7 @@ hashInsert_r(afs_uint32 addr, afs_uint16 port, struct host *host)
      /* insert into beginning of list for this bucket */
      chain = (struct h_hashChain *)malloc(sizeof(struct h_hashChain));
      if (!chain) {
-       ViceLog(0, ("Failed malloc in hashInsert_r\n"));
+       ViceLog(0, ("Failed malloc in h_AddHostToHashTable_r\n"));
         assert(0);
      }
      chain->hostPtr = host;
@@ -1017,7 +1041,7 @@ addInterfaceAddr_r(struct host *host, afs_uint32 addr, afs_uint16 port)
      /*
       * Create a hash table entry for this address
       */
-    hashInsert_r(addr, port, host);
+    h_AddHostToHashTable_r(addr, port, host);
  
      return 0;
  }
@@ -1072,7 +1096,7 @@ removeInterfaceAddr_r(struct host *host, afs_uint32 addr, afs_uint16 port)
      /*
       * Remove the hash table entry for this address
       */
-    hashDelete_r(addr, port, host);
+    h_DeleteHostFromHashTableByAddr_r(addr, port, host);
  
      return 0;
  }
@@ -1394,7 +1418,7 @@ h_GetHost_r(struct rx_connection *tcon)
                     /* the new host is held and locked */
                 } else {
                     /* This really is a new host */
-                   hashInsertUuid_r(&identP->uuid, host);
+                   h_AddHostToUuidHashTable_r(&identP->uuid, host);
                     cb_conn = host->callback_rxcon;
                     rx_GetConnection(cb_conn);          
                     H_UNLOCK;
@@ -1735,7 +1759,7 @@ h_FindClient_r(struct rx_connection *tcon)
             client->authClass = authClass;      /* rx only */
             client->sid = rxr_CidOf(tcon);
             client->VenusEpoch = rxr_GetEpoch(tcon);
-           client->CPS.prlist_val = 0;
+           client->CPS.prlist_val = NULL;
             client->CPS.prlist_len = 0;
             h_Unlock_r(host);
         }
@@ -2134,6 +2158,540 @@ h_DumpHosts()
  
  }                              /*h_DumpHosts */
  
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * demand attach fs
+ * host state serialization
+ */
+static int h_stateFillHeader(struct host_state_header * hdr);
+static int h_stateCheckHeader(struct host_state_header * hdr);
+static int h_stateAllocMap(struct fs_dump_state * state);
+static int h_stateSaveHost(register struct host * host, int held, struct fs_dump_state * state);
+static int h_stateRestoreHost(struct fs_dump_state * state);
+static int h_stateRestoreIndex(struct host * h, int held, struct fs_dump_state * state);
+static int h_stateVerifyHost(struct host * h, int held, struct fs_dump_state * state);
+static int h_stateVerifyAddrHash(struct fs_dump_state * state, struct host * h, afs_uint32 addr, afs_uint16 port);
+static int h_stateVerifyUuidHash(struct fs_dump_state * state, struct host * h);
+static void h_hostToDiskEntry_r(struct host * in, struct hostDiskEntry * out);
+static void h_diskEntryToHost_r(struct hostDiskEntry * in, struct host * out);
+
+
+/*
+ * demand attach fs
+ * host state serialization
+ *
+ * Save all host state to disk for fast startup.
+ *
+ * The current eof_offset is recorded as the host section offset, a zeroed
+ * (invalid) host state header is written there, every host on hostList is
+ * handed to h_stateSaveHost via h_Enumerate_r, and finally the header is
+ * filled in and rewritten at the same offset.
+ *
+ * Returns state->bail (non-zero on failure).
+ */
+int
+h_stateSave(struct fs_dump_state * state)
+{
+    const size_t hdr_len = sizeof(struct host_state_header);
+
+    AssignInt64(state->eof_offset, &state->hdr->h_offset);
+
+    /* XXX debug */
+    ViceLog(0, ("h_stateSave:  hostCount=%d\n", hostCount));
+
+    /* an all-zero header fails h_stateCheckHeader until the real one lands */
+    memset(state->h_hdr, 0, hdr_len);
+
+    if (fs_stateWriteHeader(state, &state->hdr->h_offset, state->h_hdr,
+                           hdr_len)) {
+       state->bail = 1;
+       return state->bail;
+    }
+    fs_stateIncEOF(state, hdr_len);
+
+    /* dump each host; the callback sets state->bail on failure */
+    h_Enumerate_r(h_stateSaveHost, hostList, (char *)state);
+
+    if (!state->bail) {
+       h_stateFillHeader(state->h_hdr);
+
+       /* write the real header to disk */
+       state->bail = fs_stateWriteHeader(state, &state->hdr->h_offset,
+                                         state->h_hdr, hdr_len);
+    }
+
+    return state->bail;
+}
+
+/* demand attach fs
+ * host state serialization
+ *
+ * Restore all host state from disk for fast startup.
+ *
+ * Reads and validates the host state header, allocates the host index
+ * map, then restores h_hdr->records host entries one at a time.
+ *
+ * Returns state->bail (non-zero on failure).
+ */
+int
+h_stateRestore(struct fs_dump_state * state)
+{
+    int remaining;
+
+    /* read the header, validate it, and size the index map; stop at the
+     * first step that fails */
+    if (fs_stateReadHeader(state, &state->hdr->h_offset, state->h_hdr,
+                          sizeof(struct host_state_header))
+       || h_stateCheckHeader(state->h_hdr)
+       || h_stateAllocMap(state)) {
+       state->bail = 1;
+       return state->bail;
+    }
+
+    /* one h_stateRestoreHost call per record counted in the header */
+    for (remaining = state->h_hdr->records; remaining > 0; remaining--) {
+       if (h_stateRestoreHost(state) != 0) {
+           state->bail = 1;
+           break;
+       }
+    }
+
+    return state->bail;
+}
+
+/* run h_stateRestoreIndex over every host on hostList; returns state->bail */
+int
+h_stateRestoreIndices(struct fs_dump_state * state)
+{
+    char *rock = (char *)state;
+
+    h_Enumerate_r(h_stateRestoreIndex, hostList, rock);
+    return state->bail;
+}
+
+/*
+ * h_Enumerate_r callback: translate the host's saved cblist index through
+ * cb_OldToNew, storing the result back into h->cblist.
+ * Returns held, or H_ENUMERATE_BAIL(held) if the translation fails.
+ */
+static int
+h_stateRestoreIndex(struct host * h, int held, struct fs_dump_state * state)
+{
+    int rc = held;
+
+    if (cb_OldToNew(state, h->cblist, &h->cblist)) {
+       rc = H_ENUMERATE_BAIL(held);
+    }
+    return rc;
+}
+
+/* run h_stateVerifyHost over every host on hostList; returns state->bail */
+int
+h_stateVerify(struct fs_dump_state * state)
+{
+    char *rock = (char *)state;
+
+    h_Enumerate_r(h_stateVerifyHost, hostList, rock);
+    return state->bail;
+}
+
+/*
+ * h_Enumerate_r callback: verify one host's hash table entries and its
+ * callback list.
+ *
+ * A host with an interface list must have every interface address in the
+ * address hash and its uuid in the uuid hash; a host without one must have
+ * its primary host/port in the address hash.  Any failure sets state->bail
+ * but enumeration continues; only a NULL host pointer bails out at once.
+ */
+static int
+h_stateVerifyHost(struct host * h, int held, struct fs_dump_state * state)
+{
+    struct Interface *ifs;
+    int n;
+
+    if (!h) {
+       ViceLog(0, ("h_stateVerifyHost: error: NULL host pointer in linked list\n"));
+       return H_ENUMERATE_BAIL(held);
+    }
+
+    ifs = h->interface;
+    if (!ifs) {
+       /* no interface list: only the primary address is hashed */
+       if (h_stateVerifyAddrHash(state, h, h->host, h->port)) {
+           state->bail = 1;
+       }
+    } else {
+       /* walk the interface array from the last entry down to the first */
+       n = ifs->numberOfInterfaces;
+       while (--n >= 0) {
+           if (h_stateVerifyAddrHash(state, h, ifs->interface[n].addr,
+                                     ifs->interface[n].port)) {
+               state->bail = 1;
+           }
+       }
+       if (h_stateVerifyUuidHash(state, h)) {
+           state->bail = 1;
+       }
+    }
+
+    if (cb_stateVerifyHCBList(state, h)) {
+       state->bail = 1;
+    }
+
+    return held;
+}
+
+/*
+ * Check that (addr, port) is present in the address hash table.
+ *
+ * Returns 1 on a hard error: a NULL host pointer in the chain, a chain
+ * longer than FS_STATE_H_MAX_ADDR_HASH_CHAIN_LEN, or a missing entry while
+ * in FS_STATE_LOAD_MODE.  An entry that points at a different host, or a
+ * missing entry in any other mode, only logs a warning and sets
+ * state->flags.warnings_generated; the function then returns 0.
+ */
+static int
+h_stateVerifyAddrHash(struct fs_dump_state * state, struct host * h, afs_uint32 addr, afs_uint16 port)
+{
+    int bucket = h_HashIndex(addr);
+    struct h_hashChain *link = hostHashTable[bucket];
+    char addrbuf[16];
+    int walked = 0;
+
+    while (link != NULL) {
+       struct host *owner = link->hostPtr;
+
+       if (owner == NULL) {
+           afs_inet_ntoa_r(addr, addrbuf);
+           ViceLog(0, ("h_stateVerifyAddrHash: error: addr hash chain has NULL host ptr (lookup addr %s)\n", addrbuf));
+           return 1;
+       }
+
+       if ((link->addr == addr) && (link->port == port)) {
+           /* found the entry; a mismatched owner is only a warning */
+           if (owner != h) {
+               ViceLog(0, ("h_stateVerifyAddrHash: warning: addr hash entry points to different host struct (%d, %d)\n", 
+                           h->index, owner->index));
+               state->flags.warnings_generated = 1;
+           }
+           return 0;
+       }
+
+       if (walked > FS_STATE_H_MAX_ADDR_HASH_CHAIN_LEN) {
+           ViceLog(0, ("h_stateVerifyAddrHash: error: hash chain length exceeds %d; assuming there's a loop\n",
+                       FS_STATE_H_MAX_ADDR_HASH_CHAIN_LEN));
+           return 1;
+       }
+       walked++;
+       link = link->next;
+    }
+
+    /* ran off the end of the chain without a match */
+    afs_inet_ntoa_r(addr, addrbuf);
+    if (state->mode == FS_STATE_LOAD_MODE) {
+       ViceLog(0, ("h_stateVerifyAddrHash: error: addr %s not found in hash\n", addrbuf));
+       return 1;
+    }
+
+    ViceLog(0, ("h_stateVerifyAddrHash: warning: addr %s not found in hash\n", addrbuf));
+    state->flags.warnings_generated = 1;
+    return 0;
+}
+
+/*
+ * Check that the uuid of h's interface list is present in the uuid hash
+ * table.  The caller must ensure h->interface is non-NULL; it is
+ * dereferenced unconditionally.
+ *
+ * Returns 1 on a hard error: a NULL host pointer in the chain, a chain
+ * longer than FS_STATE_H_MAX_UUID_HASH_CHAIN_LEN, or a missing entry while
+ * in FS_STATE_LOAD_MODE.  An entry that points at a different host, or a
+ * missing entry in any other mode, only logs a warning and sets
+ * state->flags.warnings_generated; the function then returns 0.
+ */
+static int
+h_stateVerifyUuidHash(struct fs_dump_state * state, struct host * h)
+{
+    afsUUID * wanted = &h->interface->uuid;
+    int bucket = h_UuidHashIndex(wanted);
+    struct h_hashChain *link = hostUuidHashTable[bucket];
+    char uuidbuf[40];
+    int walked = 0;
+
+    while (link != NULL) {
+       struct host *owner = link->hostPtr;
+
+       if (owner == NULL) {
+           afsUUID_to_string(wanted, uuidbuf, sizeof(uuidbuf));
+           ViceLog(0, ("h_stateVerifyUuidHash: error: uuid hash chain has NULL host ptr (lookup uuid %s)\n", uuidbuf));
+           return 1;
+       }
+
+       if (owner->interface &&
+           afs_uuid_equal(&owner->interface->uuid, wanted)) {
+           /* found the entry; a mismatched owner is only a warning */
+           if (owner != h) {
+               ViceLog(0, ("h_stateVerifyUuidHash: warning: uuid hash entry points to different host struct (%d, %d)\n", 
+                           h->index, owner->index));
+               state->flags.warnings_generated = 1;
+           }
+           return 0;
+       }
+
+       if (walked > FS_STATE_H_MAX_UUID_HASH_CHAIN_LEN) {
+           ViceLog(0, ("h_stateVerifyUuidHash: error: hash chain length exceeds %d; assuming there's a loop\n",
+                       FS_STATE_H_MAX_UUID_HASH_CHAIN_LEN));
+           return 1;
+       }
+       walked++;
+       link = link->next;
+    }
+
+    /* ran off the end of the chain without a match */
+    afsUUID_to_string(wanted, uuidbuf, sizeof(uuidbuf));
+    if (state->mode == FS_STATE_LOAD_MODE) {
+       ViceLog(0, ("h_stateVerifyUuidHash: error: uuid %s not found in hash\n", uuidbuf));
+       return 1;
+    }
+
+    ViceLog(0, ("h_stateVerifyUuidHash: warning: uuid %s not found in hash\n", uuidbuf));
+    state->flags.warnings_generated = 1;
+    return 0;
+}
+
+/*
+ * Stamp the host state header with the current magic and version.
+ * Only the stamp is touched; other header fields are left as they are.
+ * Always returns 0.
+ */
+static int
+h_stateFillHeader(struct host_state_header * hdr)
+{
+    hdr->stamp.magic = HOST_STATE_MAGIC;
+    hdr->stamp.version = HOST_STATE_VERSION;
+    /* the function is declared int: return a defined value instead of
+     * falling off the end */
+    return 0;
+}
+
+/*
+ * Check the contents of the host state header structure.
+ * Returns 0 when both the magic and the version match the values this
+ * build writes, 1 otherwise (after logging the reason).
+ */
+static int
+h_stateCheckHeader(struct host_state_header * hdr)
+{
+    int ret=0;
+
+    /* log under this function's real name so the messages can be found */
+    if (hdr->stamp.magic != HOST_STATE_MAGIC) {
+       ViceLog(0, ("h_stateCheckHeader: invalid state header\n"));
+       ret = 1;
+    }
+    else if (hdr->stamp.version != HOST_STATE_VERSION) {
+       ViceLog(0, ("h_stateCheckHeader: unknown version number\n"));
+       ret = 1;
+    }
+    return ret;
+}
+
+/*
+ * Allocate the zero-filled host index mapping table, with one entry for
+ * each index from 0 through h_hdr->index_max.
+ * Returns 0 on success, 1 if the allocation fails.
+ */
+static int
+h_stateAllocMap(struct fs_dump_state * state)
+{
+    struct idx_map_entry_t *table;
+
+    state->h_map.len = state->h_hdr->index_max + 1;
+    table = (struct idx_map_entry_t *)
+       calloc(state->h_map.len, sizeof(struct idx_map_entry_t));
+    state->h_map.entries = table;
+
+    if (table == NULL) {
+       return 1;
+    }
+    return 0;
+}
+
+/*
+ * Function called by h_Enumerate to save a host to disk.
+ *
+ * Writes one record made of a host_state_entry_header, a hostDiskEntry,
+ * then (if present) a copy of the host's Interface structure and a copy
+ * of its host CPS list.  On success the dump EOF is advanced by the record
+ * length and h_hdr->records is incremented; h_hdr->index_max tracks the
+ * largest host index seen.
+ *
+ * Returns held, or H_ENUMERATE_BAIL(held) once state->bail is set.
+ */
+static int
+h_stateSaveHost(register struct host * host, int held, struct fs_dump_state * state)
+{
+    int if_len=0, hcps_len=0;
+    struct hostDiskEntry hdsk;
+    struct host_state_entry_header hdr;
+    struct Interface * ifp = NULL;
+    afs_int32 * hcps = NULL;
+    struct iovec iov[4];
+    int iovcnt = 2;            /* slots 0 and 1 are the two fixed headers */
+
+    /* zero both on-disk structures so padding and any field not set below
+     * is written as zero rather than stack garbage */
+    memset(&hdr, 0, sizeof(hdr));
+    memset(&hdsk, 0, sizeof(hdsk));
+
+    if (state->h_hdr->index_max < host->index) {
+       state->h_hdr->index_max = host->index;
+    }
+
+    h_hostToDiskEntry_r(host, &hdsk);
+    if (host->interface) {
+       /* struct Interface already holds one AddrPort; add the rest */
+       if_len = sizeof(struct Interface) + 
+           ((host->interface->numberOfInterfaces-1) * sizeof(struct AddrPort));
+       ifp = (struct Interface *) malloc(if_len);
+       assert(ifp != NULL);
+       memcpy(ifp, host->interface, if_len);
+       hdr.interfaces = host->interface->numberOfInterfaces;
+       iov[iovcnt].iov_base = (char *) ifp;
+       iov[iovcnt].iov_len = if_len;
+       iovcnt++;
+    }
+    /* a valid but empty CPS is described by hdsk.hcps_valid alone; skip
+     * the copy so malloc(0) returning NULL cannot trip the assert */
+    if (host->hcps.prlist_val && host->hcps.prlist_len > 0) {
+       hdr.hcps = host->hcps.prlist_len;
+       hcps_len = hdr.hcps * sizeof(afs_int32);
+       hcps = (afs_int32 *) malloc(hcps_len);
+       assert(hcps != NULL);
+       memcpy(hcps, host->hcps.prlist_val, hcps_len);
+       iov[iovcnt].iov_base = (char *) hcps;
+       iov[iovcnt].iov_len = hcps_len;
+       iovcnt++;
+    }
+
+    hdr.len = sizeof(struct host_state_entry_header) + 
+       sizeof(struct hostDiskEntry) + if_len + hcps_len;
+    hdr.magic = HOST_STATE_ENTRY_MAGIC;
+
+    iov[0].iov_base = (char *) &hdr;
+    iov[0].iov_len = sizeof(hdr);
+    iov[1].iov_base = (char *) &hdsk;
+    iov[1].iov_len = sizeof(struct hostDiskEntry);
+    
+    if (fs_stateWriteV(state, iov, iovcnt)) {
+       ViceLog(0, ("h_stateSaveHost: failed to save host %d\n", host->index));
+       state->bail = 1;
+    } else {
+       /* only account for records that actually reached the dump */
+       fs_stateIncEOF(state, hdr.len);
+       state->h_hdr->records++;
+    }
+
+    if (ifp)
+       free(ifp);
+    if (hcps)
+       free(hcps);
+    if (state->bail) {
+       return H_ENUMERATE_BAIL(held);
+    }
+    return held;
+}
+
+/*
+ * Restore one host record from the state dump.
+ *
+ * Reads the per-entry header and checks its magic and length fields, reads
+ * the hostDiskEntry plus the optional Interface and host CPS payloads,
+ * then fills a host structure obtained from GetHT(), adds it to the
+ * address/uuid hash tables and the host list, and records the old->new
+ * index mapping in state->h_map.
+ *
+ * Everything read from the dump is validated before it is used as a
+ * length or an array index, and before the new host is allocated.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+static int
+h_stateRestoreHost(struct fs_dump_state * state)
+{
+    int ifp_len=0, hcps_len=0, bail=0;
+    struct host_state_entry_header hdr;
+    struct hostDiskEntry hdsk;
+    struct host *host = NULL;
+    struct Interface *ifp = NULL;
+    afs_int32 * hcps = NULL;
+    struct iovec iov[3];
+    int iovcnt = 1;            /* slot 0 is always the hostDiskEntry */
+
+    if (fs_stateRead(state, &hdr, sizeof(hdr))) {
+       ViceLog(0, ("h_stateRestoreHost: failed to read host entry header from dump file '%s'\n",
+                   state->fn));
+       bail = 1;
+       goto done;
+    }
+
+    if (hdr.magic != HOST_STATE_ENTRY_MAGIC) {
+       ViceLog(0, ("h_stateRestoreHost: fileserver state dump file '%s' is corrupt.\n",
+                   state->fn));
+       bail = 1;
+       goto done;
+    }
+
+    /* derive the payload sizes from the counts in the entry header */
+    if (hdr.interfaces) {
+       ifp_len = sizeof(struct Interface) +
+           ((hdr.interfaces-1) * sizeof(struct AddrPort));
+    }
+    if (hdr.hcps) {
+       hcps_len = hdr.hcps * sizeof(afs_int32);
+    }
+
+    /* cross-check the counts against the record length before any buffer
+     * is sized from them */
+    if ((ifp_len + hcps_len + sizeof(hdsk) + sizeof(hdr)) != hdr.len) {
+       ViceLog(0, ("h_stateRestoreHost: host entry header length fields are inconsistent\n"));
+       bail = 1;
+       goto done;
+    }
+
+    iov[0].iov_base = (char *) &hdsk;
+    iov[0].iov_len = sizeof(struct hostDiskEntry);
+
+    if (hdr.interfaces) {
+       ifp = (struct Interface *) malloc(ifp_len);
+       assert(ifp != NULL);
+       iov[iovcnt].iov_base = (char *) ifp;
+       iov[iovcnt].iov_len = ifp_len;
+       iovcnt++;
+    }
+    if (hdr.hcps) {
+       hcps = (afs_int32 *) malloc(hcps_len);
+       assert(hcps != NULL);
+       iov[iovcnt].iov_base = (char *) hcps;
+       iov[iovcnt].iov_len = hcps_len;
+       iovcnt++;
+    }
+
+    if (fs_stateReadV(state, iov, iovcnt)) {
+       ViceLog(0, ("h_stateRestoreHost: failed to read host entry\n"));
+       bail = 1;
+       goto done;
+    }
+
+    /* the saved index is used below to index h_map.entries; reject values
+     * outside the table instead of writing past it */
+    if (hdsk.index >= state->h_map.len) {
+       ViceLog(0, ("h_stateRestoreHost: host index %u is out of range\n",
+                   (unsigned int) hdsk.index));
+       bail = 1;
+       goto done;
+    }
+
+    /* the buffer was sized from hdr.interfaces; the count stored inside it
+     * drives the loop below, so the two must agree */
+    if (ifp && (ifp->numberOfInterfaces != hdr.interfaces)) {
+       ViceLog(0, ("h_stateRestoreHost: interface count in entry header does not match interface list\n"));
+       bail = 1;
+       goto done;
+    }
+
+    if (!hdr.hcps && hdsk.hcps_valid) {
+       /* valid, zero-length host cps ; does this ever happen? */
+       hcps = (afs_int32 *) malloc(sizeof(afs_int32));
+       assert(hcps != NULL);
+    }
+
+    host = GetHT();
+    assert(host != NULL);
+
+    if (ifp) {
+       host->interface = ifp;
+    }
+    if (hcps) {
+       host->hcps.prlist_val = hcps;
+       host->hcps.prlist_len = hdr.hcps;
+    }
+
+    h_diskEntryToHost_r(&hdsk, host);
+    h_SetupCallbackConn_r(host);
+
+    if (ifp) {
+       int i;
+       for (i = ifp->numberOfInterfaces-1; i >= 0; i--) {
+           h_AddHostToHashTable_r(ifp->interface[i].addr, 
+                                  ifp->interface[i].port, host);
+       }
+       h_AddHostToUuidHashTable_r(&ifp->uuid, host);
+    } else {
+       h_AddHostToHashTable_r(host->host, host->port, host);
+    }
+    h_InsertList_r(host);
+
+    /* setup host id map entry */
+    state->h_map.entries[hdsk.index].old_idx = hdsk.index;
+    state->h_map.entries[hdsk.index].new_idx = host->index;
+
+ done:
+    /* on failure the payload buffers were never handed to a host */
+    if (bail) {
+       if (ifp)
+           free(ifp);
+       if (hcps)
+           free(hcps);
+    }
+    return bail;
+}
+
+/*
+ * Copy the persistent fields of an in-memory host into its on-disk form.
+ * Fields of *out that are not assigned here are left untouched.
+ */
+static void
+h_hostToDiskEntry_r(struct host * in, struct hostDiskEntry * out)
+{
+    /* bookkeeping fields: saved, but not copied back by
+     * h_diskEntryToHost_r on restore */
+    out->index = in->index;
+    out->hcps_len = in->hcps.prlist_len;
+    out->hcps_valid = (in->hcps.prlist_val != NULL);
+
+    /* identity and flags */
+    out->host = in->host;
+    out->port = in->port;
+    out->hostFlags = in->hostFlags;
+    out->Console = in->Console;
+    out->hcpsfailed = in->hcpsfailed;
+#ifdef FS_STATS_DETAILED
+    out->InSameNetwork = in->InSameNetwork;
+#endif
+
+    /* call timestamps and callback list index */
+    out->LastCall = in->LastCall;
+    out->ActiveCall = in->ActiveCall;
+    out->cpsCall = in->cpsCall;
+    out->cblist = in->cblist;
+}
+
+/*
+ * Copy the persistent fields of an on-disk host entry back into an
+ * in-memory host.  The saved index, hcps_len and hcps_valid fields are
+ * deliberately not copied here.
+ */
+static void
+h_diskEntryToHost_r(struct hostDiskEntry * in, struct host * out)
+{
+    /* identity and flags */
+    out->host = in->host;
+    out->port = in->port;
+    out->hostFlags = in->hostFlags;
+    out->Console = in->Console;
+    out->hcpsfailed = in->hcpsfailed;
+#ifdef FS_STATS_DETAILED
+    out->InSameNetwork = in->InSameNetwork;
+#endif
+
+    /* call timestamps and callback list index */
+    out->LastCall = in->LastCall;
+    out->ActiveCall = in->ActiveCall;
+    out->cpsCall = in->cpsCall;
+    out->cblist = in->cblist;
+}
+
+/*
+ * index translation routines
+ *
+ * Translate a host index saved in the dump (old) into the index of the
+ * restored host, using the map filled in by h_stateRestoreHost.
+ * On success stores the result in *new and returns 0.  Returns 1, leaving
+ * *new untouched, if old is outside the map or its slot does not record
+ * old as its own old index.
+ */
+int
+h_OldToNew(struct fs_dump_state * state, afs_uint32 old, afs_uint32 * new)
+{
+    int ret = 0;
+
+    /* hosts use a zero-based index, so old==0 is valid */
+
+    /* old is unsigned: print it with %u so large bogus values are not
+     * shown as negative numbers */
+    if (old >= state->h_map.len) {
+       ViceLog(0, ("h_OldToNew: index %u is out of range\n", old));
+       ret = 1;
+    } else if (state->h_map.entries[old].old_idx != old) { /* sanity check */
+       ViceLog(0, ("h_OldToNew: index %u points to an invalid host record\n", old));
+       ret = 1;
+    } else {
+       *new = state->h_map.entries[old].new_idx;
+    }
+
+    return ret;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
  
  /*
   * This counts the number of workstations, the number of active workstations,
@@ -2348,13 +2906,23 @@ static struct AFSFid zerofid;
   * Since it can serialize them, and pile up, it should be a separate LWP
   * from other events.
   */
-int
+static int
  CheckHost(register struct host *host, int held)
  {
      register struct client *client;
      struct rx_connection *cb_conn = NULL;
      int code;
  
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* kill the checkhost lwp ASAP during shutdown */
+    FS_STATE_RDLOCK;
+    if (fs_state.mode == FS_MODE_SHUTDOWN) {
+       FS_STATE_UNLOCK;
+       return H_ENUMERATE_BAIL(held);
+    }
+    FS_STATE_UNLOCK;
+#endif
+
      /* Host is held by h_Enumerate */
      H_LOCK;
      for (client = host->FirstClient; client; client = client->next) {
@@ -2455,7 +3023,7 @@ CheckHost(register struct host *host, int held)
   * This routine is called roughly every 5 minutes.
   */
  void
-h_CheckHosts()
+h_CheckHosts(void)
  {
      afs_uint32 now = FT_ApproxTime();
  
@@ -2570,7 +3138,7 @@ initInterfaceAddr_r(struct host *host, struct interfaceAddr *interf)
  /* deleted a HashChain structure for this address and host */
  /* returns 1 on success */
  static int
-hashDelete_r(afs_uint32 addr, afs_uint16 port, struct host *host)
+h_DeleteHostFromHashTableByAddr_r(afs_uint32 addr, afs_uint16 port, struct host *host)
  {
      int flag;
      register struct h_hashChain **hp, *th;
diff --git a/src/viced/host.h b/src/viced/host.h

index bd17cfd156bfdaabc67c3054cd0ee7fe29d0dc0c..60df3bcea794b2485ddf790f2791fb6c2c05618c 100644 (file)
--- a/src/viced/host.h
+++ b/src/viced/host.h
@@ -5,8 +5,13 @@
   * This software has been released under the terms of the IBM Public
   * License.  For details, see the LICENSE file in the top-level source
   * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
   */
  
+#ifndef _AFS_VICED_HOST_H
+#define _AFS_VICED_HOST_H
+
  #include "fs_stats.h"          /*File Server stats package */
  
  #ifdef AFS_PTHREAD_ENV
@@ -59,6 +64,7 @@ struct Interface {
      struct AddrPort interface[1];/* there are actually more than one here */
      /* in network byte order */
  };
+
  struct host {
      struct host *next, *prev;  /* linked list of all hosts */
      struct rx_connection *callback_rxcon;      /* rx callback connection */
@@ -85,7 +91,7 @@ struct host {
      struct client *FirstClient;        /* first connection from host */
      afs_uint32 cpsCall;                /* time of last cps call from this host */
      struct Interface *interface;       /* all alternate addr for client */
-    afs_uint32 cblist;         /* Call back list for this host */
+    afs_uint32 cblist;         /* index of a cb in the per-host circular CB list */
      /*
       * These don't get zeroed, keep them at the end. If index doesn't
       * follow an unsigned short then we need to pad to ensure that
@@ -142,6 +148,7 @@ struct client {
  /* Don't zero the lock */
  #define CLIENT_TO_ZERO(C)      ((int)(((char *)(&((C)->lock))-(char *)(C))))
  
+
  /*
   * key for the client structure stored in connection specific data
   */
@@ -245,6 +252,19 @@ extern void h_CheckHosts();
  struct Interface *MultiVerifyInterface_r();
  extern int initInterfaceAddr_r(struct host *host, struct interfaceAddr *interf);
  
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * demand attach fs
+ * state serialization
+ */
+extern int h_SaveState(void);
+extern int h_RestoreState(void);
+#endif
+
+#define H_ENUMERATE_BAIL(held)        ((held)|0x80000000)
+#define H_ENUMERATE_ISSET_BAIL(held)  ((held)&0x80000000)
+#define H_ENUMERATE_ISSET_HELD(held)  ((held)&0x7FFFFFFF)
+
  struct host *(hosttableptrs[h_MAXHOSTTABLES]); /* Used by h_itoh */
  #define h_htoi(host) ((host)->index)   /* index isn't zeroed, no need to lock */
  #define h_itoh(hostindex) (hosttableptrs[(hostindex)>>h_HTSHIFT]+((hostindex)&(h_HTSPERBLOCK-1)))
@@ -269,4 +289,4 @@ struct host *(hosttableptrs[h_MAXHOSTTABLES]);      /* Used by h_itoh */
  #define HFE_LATER                       0x80   /* host has FE_LATER callbacks */
  #define HERRORTRANS                    0x100   /* do error translation */
  
-
+#endif /* _AFS_VICED_HOST_H */
diff --git a/src/viced/viced.c b/src/viced/viced.c

index 1202d933a443e2b4dc8f69686e740a1eb21ef78e..1c7296bf229f6f4274700b574b77a388405098b4 100644 (file)
--- a/src/viced/viced.c
+++ b/src/viced/viced.c
@@ -5,6 +5,8 @@
   * This software has been released under the terms of the IBM Public
   * License.  For details, see the LICENSE file in the top-level source
   * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
   */
  
  /*  viced.c    - File Server main loop                                  */
@@ -215,6 +217,27 @@ afsUUID FS_HostUUID;
  
  static void FlagMsg();
  
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * demand attach fs
+ * fileserver mode support
+ *
+ * during fileserver shutdown, we have to track the graceful shutdown of
+ * certain background threads before we are allowed to dump state to
+ * disk
+ */
+struct fs_state fs_state = 
+    { FS_MODE_NORMAL, 
+      0, 
+      0, 
+      0, 
+      0,
+      { 1,1,1,1 },
+      PTHREAD_COND_INITIALIZER,
+      PTHREAD_RWLOCK_INITIALIZER
+    };
+#endif /* AFS_DEMAND_ATTACH_FS */
+
  /*
   * Home for the performance statistics.
   */
@@ -420,13 +443,31 @@ FiveMinuteCheckLWP()
  
      ViceLog(1, ("Starting five minute check process\n"));
      setThreadId("FiveMinuteCheckLWP");
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    FS_STATE_WRLOCK;
+    while (fs_state.mode == FS_MODE_NORMAL) {
+       fs_state.FiveMinuteLWP_tranquil = 1;
+       FS_STATE_UNLOCK;
+#else
      while (1) {
+#endif
+
  #ifdef AFS_PTHREAD_ENV
         sleep(fiveminutes);
  #else /* AFS_PTHREAD_ENV */
         IOMGR_Sleep(fiveminutes);
  #endif /* AFS_PTHREAD_ENV */
  
+#ifdef AFS_DEMAND_ATTACH_FS
+       FS_STATE_WRLOCK;
+       if (fs_state.mode != FS_MODE_NORMAL) {
+           break;
+       }
+       fs_state.FiveMinuteLWP_tranquil = 0;
+       FS_STATE_UNLOCK;
+#endif
+
         /* close the log so it can be removed */
         ReOpenLog(AFSDIR_SERVER_FILELOG_FILEPATH);      /* don't trunc, just append */
         ViceLog(2, ("Cleaning up timed out callbacks\n"));
@@ -452,7 +493,17 @@ FiveMinuteCheckLWP()
                          afs_ctime(&now, tbuffer, sizeof(tbuffer))));
             }
         }
+#ifdef AFS_DEMAND_ATTACH_FS
+       FS_STATE_WRLOCK;
+#endif
      }
+#ifdef AFS_DEMAND_ATTACH_FS
+    fs_state.FiveMinuteLWP_tranquil = 1;
+    FS_LOCK;
+    assert(pthread_cond_broadcast(&fs_state.worker_done_cv)==0);
+    FS_UNLOCK;
+    FS_STATE_UNLOCK;
+#endif
  }                              /*FiveMinuteCheckLWP */
  
  
@@ -460,20 +511,50 @@ FiveMinuteCheckLWP()
   * other 5 minute activities because it may be delayed by timeouts when
   * it probes the workstations
   */
+
  static void
  HostCheckLWP()
  {
      ViceLog(1, ("Starting Host check process\n"));
      setThreadId("HostCheckLWP");
-    while (1) {
+#ifdef AFS_DEMAND_ATTACH_FS
+    FS_STATE_WRLOCK;
+    while (fs_state.mode == FS_MODE_NORMAL) {
+       fs_state.HostCheckLWP_tranquil = 1;
+       FS_STATE_UNLOCK;
+#else
+    while(1) {
+#endif
+
  #ifdef AFS_PTHREAD_ENV
         sleep(fiveminutes);
  #else /* AFS_PTHREAD_ENV */
         IOMGR_Sleep(fiveminutes);
  #endif /* AFS_PTHREAD_ENV */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+       FS_STATE_WRLOCK;
+       if (fs_state.mode != FS_MODE_NORMAL) {
+           break;
+       }
+       fs_state.HostCheckLWP_tranquil = 0;
+       FS_STATE_UNLOCK;
+#endif
+
         ViceLog(2, ("Checking for dead venii & clients\n"));
         h_CheckHosts();
+
+#ifdef AFS_DEMAND_ATTACH_FS
+       FS_STATE_WRLOCK;
+#endif
      }
+#ifdef AFS_DEMAND_ATTACH_FS
+    fs_state.HostCheckLWP_tranquil = 1;
+    FS_LOCK;
+    assert(pthread_cond_broadcast(&fs_state.worker_done_cv)==0);
+    FS_UNLOCK;
+    FS_STATE_UNLOCK;
+#endif
  }                              /*HostCheckLWP */
  
  /* This LWP does fsync checks every 5 minutes:  it should not be used for
@@ -496,7 +577,14 @@ FsyncCheckLWP()
      assert(pthread_mutex_init(&fsync_glock_mutex, NULL) == 0);
  #endif
  
-    while (1) {
+#ifdef AFS_DEMAND_ATTACH_FS
+    FS_STATE_WRLOCK;
+    while (fs_state.mode == FS_MODE_NORMAL) {
+       fs_state.FsyncCheckLWP_tranquil = 1;
+       FS_STATE_UNLOCK;
+#else
+    while(1) {
+#endif
         FSYNC_LOCK;
  #ifdef AFS_PTHREAD_ENV
         /* rounding is fine */
@@ -513,11 +601,31 @@ FsyncCheckLWP()
             ViceLog(0, ("LWP_WaitProcess returned %d\n", code));
  #endif /* AFS_PTHREAD_ENV */
         FSYNC_UNLOCK;
+
+#ifdef AFS_DEMAND_ATTACH_FS
+       FS_STATE_WRLOCK;
+       if (fs_state.mode != FS_MODE_NORMAL) {
+           break;
+       }
+       fs_state.FsyncCheckLWP_tranquil = 0;
+       FS_STATE_UNLOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
         ViceLog(2, ("Checking for fsync events\n"));
         do {
             code = BreakLaterCallBacks();
         } while (code != 0);
+#ifdef AFS_DEMAND_ATTACH_FS
+       FS_STATE_WRLOCK;
+#endif
      }
+#ifdef AFS_DEMAND_ATTACH_FS
+    fs_state.FsyncCheckLWP_tranquil = 1;
+    FS_LOCK;
+    assert(pthread_cond_broadcast(&fs_state.worker_done_cv)==0);
+    FS_UNLOCK;
+    FS_STATE_UNLOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
  }
  
  /*------------------------------------------------------------------------
@@ -604,6 +712,11 @@ PrintCounters()
             ("Vice was last started at %s\n",
              afs_ctime(&StartTime, tbuffer, sizeof(tbuffer))));
  
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* XXX perhaps set extended stats verbosity flags
+     * based upon LogLevel ?? */
+    VPrintExtendedCacheStats(VOL_STATS_PER_CHAIN2);
+#endif
      VPrintCacheStats();
      VPrintDiskStats();
      DStat(&dirbuff, &dircall, &dirio);
@@ -656,6 +769,16 @@ ShutDownAndCore(int dopanic)
      time_t now = time(0);
      char tbuffer[32];
  
+    /* do not allow new requests to be served from now on; all new requests
+     * are returned with an error code of RX_RESTARTING (transient failure) */
+    rx_SetRxTranquil();                /* dhruba */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    FS_STATE_WRLOCK;
+    fs_state.mode = FS_MODE_SHUTDOWN;
+    FS_STATE_UNLOCK;
+#endif
+
      ViceLog(0,
             ("Shutting down file server at %s",
              afs_ctime(&now, tbuffer, sizeof(tbuffer))));
@@ -671,11 +794,34 @@ ShutDownAndCore(int dopanic)
      if (!dopanic)
         PrintCounters();
  
-    /* do not allows new reqests to be served from now on, all new requests
-     * are returned with an error code of RX_RESTARTING ( transient failure ) */
-    rx_SetRxTranquil();                /* dhruba */
+    /* shut down volume package */
      VShutdown();
  
+#ifdef AFS_DEMAND_ATTACH_FS
+    if (fs_state.options.fs_state_save) {
+       /* 
+        * demand attach fs
+        * save fileserver state to disk */
+
+       /* make sure background threads have finished all of their asynchronous 
+        * work on host and callback structures */
+       FS_STATE_RDLOCK;
+       while (!fs_state.FiveMinuteLWP_tranquil ||
+              !fs_state.HostCheckLWP_tranquil ||
+              !fs_state.FsyncCheckLWP_tranquil) {
+           FS_LOCK;
+           FS_STATE_UNLOCK;
+           ViceLog(0, ("waiting for background host/callback threads to quiesce before saving fileserver state...\n"));
+           assert(pthread_cond_wait(&fs_state.worker_done_cv, &fileproc_glock_mutex) == 0);
+           FS_UNLOCK;
+           FS_STATE_RDLOCK;
+       }
+
+       /* ok. it should now be fairly safe. let's do the state dump */
+       fs_stateSave();
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
      if (debugFile) {
         rx_PrintStats(debugFile);
         fflush(debugFile);
@@ -715,7 +861,7 @@ ShutDown(void)
  static void
  FlagMsg()
  {
-    char buffer[1024];
+    char buffer[2048];
  
      /* default supports help flag */
  
@@ -743,8 +889,18 @@ FlagMsg()
      strcat(buffer, "[-rxdbg (enable rx debugging)] ");
      strcat(buffer, "[-rxdbge (enable rxevent debugging)] ");
      strcat(buffer, "[-rxmaxmtu <bytes>] ");
-#if AFS_PTHREAD_ENV
-    strcat(buffer, "[-vattachpar <number of volume attach threads>] ");
+#ifdef AFS_DEMAND_ATTACH_FS
+    strcat(buffer, "[-fs-state-dont-save (disable state save during shutdown)] ");
+    strcat(buffer, "[-fs-state-dont-restore (disable state restore during startup)] ");
+    strcat(buffer, "[-fs-state-verify <none|save|restore|both> (default is both)] ");
+    strcat(buffer, "[-vattachpar <max number of volume attach/shutdown threads> (default is 1)] ");
+    strcat(buffer, "[-vhashsize <log(2) of number of volume hash buckets> (default is 8)] ");
+    strcat(buffer, "[-vlrudisable (disable VLRU functionality)] ");
+    strcat(buffer, "[-vlruthresh <minutes before unused volumes become eligible for soft detach> (default is 2 hours)] ");
+    strcat(buffer, "[-vlruinterval <seconds between VLRU scans> (default is 2 minutes)] ");
+    strcat(buffer, "[-vlrumax <max volumes to soft detach in one VLRU scan> (default is 8)] ");
+#elif AFS_PTHREAD_ENV
+    strcat(buffer, "[-vattachpar <number of volume attach threads> (default is 1)] ");
  #endif
  #ifdef AFS_AIX32_ENV
      strcat(buffer, "[-m <min percentage spare in partition>] ");
@@ -945,11 +1101,62 @@ ParseArgs(int argc, char *argv[])
  #ifdef AFS_PTHREAD_ENV
         } else if (!strcmp(argv[i], "-vattachpar")) {
              if ((i + 1) >= argc) {
-               fprintf(stderr, "missing argument for -vattachpar\n"); 
+               fprintf(stderr, "missing argument for %s\n", argv[i]); 
                 return -1; 
             }
             vol_attach_threads = atoi(argv[++i]);
  #endif /* AFS_PTHREAD_ENV */
+#ifdef AFS_DEMAND_ATTACH_FS
+       } else if (!strcmp(argv[i], "-fs-state-dont-save")) {
+           fs_state.options.fs_state_save = 0;
+       } else if (!strcmp(argv[i], "-fs-state-dont-restore")) {
+           fs_state.options.fs_state_restore = 0;
+       } else if (!strcmp(argv[i], "-fs-state-verify")) {
+            if ((i + 1) >= argc) {
+               fprintf(stderr, "missing argument for %s\n", argv[i]); 
+               return -1; 
+           }
+           i++;
+           if (!strcmp(argv[i], "none")) {
+               fs_state.options.fs_state_verify_before_save = 0;
+               fs_state.options.fs_state_verify_after_restore = 0;
+           } else if (!strcmp(argv[i], "save")) {
+               fs_state.options.fs_state_verify_after_restore = 0;
+           } else if (!strcmp(argv[i], "restore")) {
+               fs_state.options.fs_state_verify_before_save = 0;
+           } else if (!strcmp(argv[i], "both")) {
+               /* default */
+           } else {
+               fprintf(stderr, "invalid argument for %s\n", argv[i-1]);
+               return -1;
+           }
+       } else if (!strcmp(argv[i], "-vhashsize")) {
+            if ((i + 1) >= argc) {
+               fprintf(stderr, "missing argument for %s\n", argv[i]); 
+               return -1; 
+           }
+           VSetVolHashSize(atoi(argv[++i]));
+       } else if (!strcmp(argv[i], "-vlrudisable")) {
+           VLRU_SetOptions(VLRU_SET_ENABLED, 0);
+       } else if (!strcmp(argv[i], "-vlruthresh")) {
+            if ((i + 1) >= argc) {
+               fprintf(stderr, "missing argument for %s\n", argv[i]); 
+               return -1; 
+           }
+           VLRU_SetOptions(VLRU_SET_THRESH, 60*atoi(argv[++i]));
+       } else if (!strcmp(argv[i], "-vlruinterval")) {
+            if ((i + 1) >= argc) {
+               fprintf(stderr, "missing argument for %s\n", argv[i]); 
+               return -1; 
+           }
+           VLRU_SetOptions(VLRU_SET_INTERVAL, atoi(argv[++i]));
+       } else if (!strcmp(argv[i], "-vlrumax")) {
+            if ((i + 1) >= argc) {
+               fprintf(stderr, "missing argument for %s\n", argv[i]); 
+               return -1; 
+           }
+           VLRU_SetOptions(VLRU_SET_MAX, atoi(argv[++i]));
+#endif /* AFS_DEMAND_ATTACH_FS */
         } else if (!strcmp(argv[i], "-s")) {
             Sawsmall = 1;
              if ((i + 1) >= argc) {
@@ -1923,6 +2130,15 @@ main(int argc, char *argv[])
         exit(1);
      }
  
+#ifdef AFS_DEMAND_ATTACH_FS
+    if (fs_state.options.fs_state_restore) {
+       /*
+        * demand attach fs
+        * restore fileserver state */
+       fs_stateRestore();
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
      /*
       * We are done calling fopen/fdopen. It is safe to use a large
       * of the file descriptor cache.
diff --git a/src/viced/viced.h b/src/viced/viced.h

index 3b230e5311941ba11507cb912e3d6c73674c152f..d8c837cad822f5f446b8d581646776d9684f18cd 100644 (file)
--- a/src/viced/viced.h
+++ b/src/viced/viced.h
@@ -5,6 +5,8 @@
   * This software has been released under the terms of the IBM Public
   * License.  For details, see the LICENSE file in the top-level source
   * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
   */
  
  /*  file.h     - include file for the File Server                      */
@@ -20,6 +22,9 @@
   * Start with clean version to sync test and dev trees.
   * */
  
+#ifndef _AFS_VICED_VICED_H
+#define _AFS_VICED_VICED_H
+
  #include <afs/afssyscalls.h>
  #include <afs/afsutil.h>
  #include "fs_stats.h"          /*Defs for xstat-based statistics */
@@ -46,18 +51,6 @@ typedef struct DirHandle {
  } DirHandle;
  
  
-struct cbcounters {
-    int DeleteFiles;
-    int DeleteCallBacks;
-    int BreakCallBacks;
-    int AddCallBacks;
-    int GotSomeSpaces;
-    int DeleteAllCallBacks;
-    int nFEs, nCBs, nblks;
-    int CBsTimedOut;
-    int nbreakers;
-    int GSS1, GSS2, GSS3, GSS4, GSS5;
-};
  
  #define MAXCNTRS (AFS_HIGHEST_OPCODE+1)
  
@@ -219,3 +212,46 @@ extern pthread_mutex_t fsync_glock_mutex;
  #define FSYNC_LOCK
  #define FSYNC_UNLOCK
  #endif /* AFS_PTHREAD_ENV */
+
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * demand attach fs
+ * fileserver mode support
+ */
+/* Process-wide fileserver state for demand attach builds; the single
+ * instance is the global `fs_state` declared below this struct. */
+struct fs_state {
+    volatile int mode;         /* presumably one of the FS_MODE_* values -- TODO confirm */
+    volatile byte FiveMinuteLWP_tranquil;      /* five minute check thread is shutdown or sleeping */
+    volatile byte HostCheckLWP_tranquil;       /* host check thread is shutdown or sleeping */
+    volatile byte FsyncCheckLWP_tranquil;      /* fsync check thread is shutdown or sleeping */
+    volatile byte salvsync_fatal_error;        /* fatal error with salvsync comm */
+
+    /* some command-line options we use in
+     * various places
+     *
+     * these fields are immutable once we
+     * go multithreaded */
+    struct {
+       byte fs_state_save;
+       byte fs_state_restore;                  /* when set, main() calls fs_stateRestore() at startup */
+       byte fs_state_verify_before_save;       /* cleared by the "none" and "restore" verify option values */
+       byte fs_state_verify_after_restore;     /* cleared by the "none" and "save" verify option values */
+    } options;
+
+    pthread_cond_t worker_done_cv;     /* NOTE(review): presumably signalled as worker threads finish -- confirm */
+    pthread_rwlock_t state_lock;       /* taken through the FS_STATE_RDLOCK/WRLOCK/UNLOCK macros */
+};
+
+extern struct fs_state fs_state;
+
+/* this lock is defined to be directly above FS_LOCK in the locking hierarchy */
+/* Each macro performs the rwlock operation on fs_state.state_lock inside
+ * assert(), so a non-zero return from the pthread call trips the assertion.
+ * NOTE(review): the lock call itself is the assert() argument; with an assert
+ * implementation that compiles away under NDEBUG the lock operation would
+ * disappear with it -- confirm which assert is in scope where these are used. */
+#define FS_STATE_RDLOCK  assert(pthread_rwlock_rdlock(&fs_state.state_lock) == 0)
+#define FS_STATE_WRLOCK  assert(pthread_rwlock_wrlock(&fs_state.state_lock) == 0)
+#define FS_STATE_UNLOCK  assert(pthread_rwlock_unlock(&fs_state.state_lock) == 0)
+
+#define FS_MODE_NORMAL    0
+#define FS_MODE_SHUTDOWN  1
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+#endif /* _AFS_VICED_VICED_H */
diff --git a/src/viced/viced_prototypes.h b/src/viced/viced_prototypes.h

index df11f8aa5bdf9d9a697cbe9d0bdd4627511e5d28..556d3500c5b00b7b835e8987ed718be7702a68af 100644 (file)
--- a/src/viced/viced_prototypes.h
+++ b/src/viced/viced_prototypes.h
@@ -1,4 +1,27 @@
+/*
+ * Copyright 2000, International Business Machines Corporation and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+#ifndef _AFS_VICED_VICED_PROTOTYPES_H
+#define _AFS_VICED_VICED_PROTOTYPES_H
+
  extern int sendBufSize;
  afs_int32 sys_error_to_et(afs_int32 in);
  void init_sys_error_to_et(void);
+  
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * demand attach fs
+ * fileserver state serialization
+ */
+extern int fs_stateSave(void);
+extern int fs_stateRestore(void);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
  
+#endif /* _AFS_VICED_VICED_PROTOTYPES_H */
diff --git a/src/vol/Makefile.in b/src/vol/Makefile.in

index 114a3049971b6fda9258448b985054756af2a9ea..33131a060000df9c28053483b138ca34b4307fa8 100644 (file)
--- a/src/vol/Makefile.in
+++ b/src/vol/Makefile.in
@@ -16,22 +16,23 @@ LIBS=${TOP_LIBDIR}/libcmd.a vlib.a ${TOP_LIBDIR}/util.a \
         ${TOP_LIBDIR}/libsys.a ${TOP_LIBDIR}/libdir.a \
         ${TOP_LIBDIR}/liblwp.a  ${TOP_LIBDIR}/libacl.a
  
-CFLAGS = ${COMMON_CFLAGS} -D${SYS_NAME} ${FSINCLUDES} ${XCFLAGS} ${ARCHFLAGS}
+CFLAGS = ${COMMON_CFLAGS} -D${SYS_NAME} ${FSINCLUDES} ${XCFLAGS} ${ARCHFLAGS} -DFSSYNC_BUILD_SERVER -DFSSYNC_BUILD_CLIENT
  
-PUBLICHEADERS=nfs.h vnode.h viceinode.h volume.h voldefs.h partition.h\
-       fssync.h ihandle.h namei_ops.h
+PUBLICHEADERS=nfs.h vnode.h viceinode.h volume.h voldefs.h partition.h \
+       fssync.h ihandle.h namei_ops.h salvsync.h daemon_com.h
  
-VLIBOBJS=vnode.o volume.o vutil.o partition.o fssync.o purge.o \
-        clone.o nuke.o devname.o listinodes.o common.o ihandle.o \
-        namei_ops.o
+VLIBOBJS=vnode.o volume.o vutil.o partition.o fssync-server.o fssync-client.o \
+        clone.o nuke.o devname.o listinodes.o common.o ihandle.o purge.o \
+        namei_ops.o salvsync-server.o salvsync-client.o daemon_com.o
  
-OBJECTS=${VLIBOBJS} physio.o vol-salvage.o vol-info.o vol-dump.o vol-bless.o
+OBJECTS=${VLIBOBJS} physio.o vol-salvage.o vol-info.o vol-dump.o vol-bless.o fssync-debug.o
  
  all: gi \
         ${TOP_LIBDIR}/vlib.a \
         ${TOP_LIBDIR}/libvlib.a \
         salvager \
         volinfo \
+       fssync-debug \
         $(FS_CONV_OSF40D) \
         $(XFS_SIZE_CHECK) \
         $(FS_CONV_SOL26) \
@@ -42,6 +43,8 @@ all: gi \
         ${TOP_INCDIR}/afs/voldefs.h \
         ${TOP_INCDIR}/afs/partition.h \
         ${TOP_INCDIR}/afs/fssync.h \
+       ${TOP_INCDIR}/afs/salvsync.h \
+       ${TOP_INCDIR}/afs/daemon_com.h \
         ${TOP_INCDIR}/afs/ihandle.h \
         ${TOP_INCDIR}/afs/namei_ops.h
  
@@ -53,6 +56,7 @@ install: \
         ${DESTDIR}${libdir}/afs/libvlib.a \
         ${DESTDIR}${afssrvlibexecdir}/salvager \
         ${DESTDIR}${afssrvsbindir}/volinfo \
+       ${DESTDIR}${afssrvsbindir}/fssync-debug \
         $(install_FS_CONV_OSF40D) \
         $(install_XFS_SIZE_CHECK) \
         $(install_FS_CONV_SOL26) \
@@ -63,6 +67,8 @@ install: \
         ${DESTDIR}${includedir}/afs/voldefs.h \
         ${DESTDIR}${includedir}/afs/partition.h \
         ${DESTDIR}${includedir}/afs/fssync.h \
+       ${DESTDIR}${includedir}/afs/salvsync.h \
+       ${DESTDIR}${includedir}/afs/daemon_com.h \
         ${DESTDIR}${includedir}/afs/ihandle.h \
         ${DESTDIR}${includedir}/afs/namei_ops.h
  
@@ -72,6 +78,11 @@ ${DEST}/root.server/usr/afs/bin/salvager: salvager
  ${DEST}/root.server/usr/afs/bin/volinfo: volinfo
         ${INSTALL} -s $? $@
  
+${DEST}/root.server/usr/afs/bin/fssync-debug: fssync-debug
+       if test "@DEMAND_ATTACH@" = "no"; then \
+               ${INSTALL} -s $? $@ ; \
+       fi
+
  ${DEST}/lib/afs/vlib.a: vlib.a
         ${INSTALL} $? $@
  
@@ -117,6 +128,12 @@ ${DEST}/include/afs/partition.h: partition.h
  ${DEST}/include/afs/fssync.h: fssync.h
         ${INSTALL} $? $@
  
+${DEST}/include/afs/salvsync.h: salvsync.h
+       ${INSTALL} $? $@
+
+${DEST}/include/afs/daemon_com.h: daemon_com.h
+       ${INSTALL} $? $@
+
  ${DEST}/include/afs/ihandle.h: ihandle.h
         ${INSTALL} $? $@
  
@@ -129,6 +146,8 @@ ${DEST}/include/afs/namei_ops.h: namei_ops.h
  ${OBJECTS}: ${PUBLICHEADERS} ${TOP_INCDIR}/lwp.h ${TOP_INCDIR}/lock.h ${TOP_INCDIR}/afs/afsint.h vutils.h salvage.h AFS_component_version_number.c
  
  vol-salvage.o vutil.o: volinodes.h
+vol-salvage.o salvager.o: vol-salvage.h
+vol-salvage.o: salvsync.h daemon_com.h
  
  vlib.a:        ${VLIBOBJS} AFS_component_version_number.o
         $(RM) -f $@
@@ -136,8 +155,8 @@ vlib.a:     ${VLIBOBJS} AFS_component_version_number.o
         $(RANLIB) $@
  
  # new salvager:  remove references to /vice by linking with novice.o
-salvager: vol-salvage.o physio.o vlib.a
-       ${CC} ${LDFLAGS} -o salvager vol-salvage.o physio.o ${LIBS} ${XLIBS}
+salvager: vol-salvage.o physio.o vlib.a salvager.o ${LIBS}
+       ${CC} ${LDFLAGS} -o salvager vol-salvage.o physio.o salvager.o ${LIBS} ${XLIBS}
  
  vol-salvage: vol-salvage.o
  vol-info: vol-info.o physio.o ihandle.o
@@ -167,13 +186,16 @@ volinfo: vol-info.o physio.o ihandle.o ${LIBS}
         ${CC} ${CFLAGS} -o volinfo vol-info.o physio.o \
                 ihandle.o ${LIBS} ${XLIBS}
  
+fssync-debug: fssync-debug.o physio.o AFS_component_version_number.c ${LIBS}
+       ${CC} ${LDFLAGS} -o fssync-debug fssync-debug.o physio.o ${LIBS} ${XLIBS}
+
  vol-bless: vol-bless.o physio.o ihandle.o ${LIBS}
         ${CC} ${CFLAGS} -o vol-bless vol-bless.o physio.o ${LIBS} ${XLIBS}
  
-fs_conv_dux40D: fs_conv_411.o
+fs_conv_dux40D: fs_conv_411.o ${LIBS}
         ${CC} ${CFLAGS} ${TOP_LIBDIR}/libcmd.a -o fs_conv_dux40D fs_conv_411.o  ${LIBS} ${XLIBS}
  
-fs_conv_sol26: fs_conv_411.o vlib.a 
+fs_conv_sol26: fs_conv_411.o ${LIBS}
         ${CC} ${CFLAGS} ${TOP_LIBDIR}/libcmd.a -o fs_conv_sol26 fs_conv_411.o  ${LIBS} ${XLIBS}
  
  fs_conv_411.o: fs_conv_411.c AFS_component_version_number.c
@@ -211,6 +233,11 @@ ${DESTDIR}${afssrvlibexecdir}/salvager: salvager
  ${DESTDIR}${afssrvsbindir}/volinfo: volinfo
         ${INSTALL} -s $? $@
  
+${DESTDIR}${afssrvsbindir}/fssync-debug: fssync-debug
+       if test "@DEMAND_ATTACH@" = "no" ; then \
+               ${INSTALL} -s $? $@ ; \
+       fi
+
  ${DESTDIR}${includedir}/afs/nfs.h: nfs.h
         ${INSTALL} $? $@
  
@@ -253,6 +280,18 @@ ${DESTDIR}${includedir}/afs/fssync.h: fssync.h
  ${TOP_INCDIR}/afs/fssync.h: fssync.h
         ${INSTALL} $? $@
  
+${DESTDIR}${includedir}/afs/salvsync.h: salvsync.h
+       ${INSTALL} $? $@
+
+${TOP_INCDIR}/afs/salvsync.h: salvsync.h
+       ${INSTALL} $? $@
+
+${DESTDIR}${includedir}/afs/daemon_com.h: daemon_com.h
+       ${INSTALL} $? $@
+
+${TOP_INCDIR}/afs/daemon_com.h: daemon_com.h
+       ${INSTALL} $? $@
+
  ${DESTDIR}${includedir}/afs/ihandle.h: ihandle.h
         ${INSTALL} $? $@
  
@@ -265,11 +304,24 @@ ${DESTDIR}${includedir}/afs/namei_ops.h: namei_ops.h
  ${TOP_INCDIR}/afs/namei_ops.h: namei_ops.h
         ${INSTALL} $? $@
  
+${DESTDIR}${includedir}/afs/salvage.h: salvage.h
+       ${INSTALL} $? $@
+
+${TOP_INCDIR}/afs/salvage.h: salvage.h
+       ${INSTALL} $? $@
+
+${DESTDIR}${includedir}/afs/vol-salvage.h: vol-salvage.h
+       ${INSTALL} $? $@
+
+${TOP_INCDIR}/afs/vol-salvage.h: vol-salvage.h
+       ${INSTALL} $? $@
+
  dest: \
         ${DEST}/lib/afs/vlib.a \
         ${DEST}/lib/afs/libvlib.a \
         ${DEST}/root.server/usr/afs/bin/salvager \
         ${DEST}/root.server/usr/afs/bin/volinfo \
+       ${DEST}/root.server/usr/afs/bin/fssync-debug \
         $(dest_FS_CONV_OSF40D) \
         $(dest_XFS_SIZE_CHECK) \
         $(dest_FS_CONV_SOL26) \
@@ -280,12 +332,14 @@ dest: \
         ${DEST}/include/afs/voldefs.h \
         ${DEST}/include/afs/partition.h \
         ${DEST}/include/afs/fssync.h \
+       ${DEST}/include/afs/salvsync.h \
+       ${DEST}/include/afs/daemon_com.h \
         ${DEST}/include/afs/ihandle.h \
         ${DEST}/include/afs/namei_ops.h
  
  check-splint::
         sh $(HELPER_SPLINT) $(CFLAGS) \
-           vnode.c volume.c vutil.c partition.c fssync.c purge.c \
+           vnode.c volume.c vutil.c partition.c fssync-server.c fssync-client.c \
             clone.c nuke.c devname.c listinodes.c common.c ihandle.c \
-           namei_ops.c \
-           physio.c vol-salvage.c vol-info.c vol-bless.c
+           namei_ops.c salvsync-server.c salvsync-client.c daemon_com.c purge.c \
+           physio.c vol-salvage.c vol-info.c vol-bless.c fssync-debug.c
diff --git a/src/vol/NTMakefile b/src/vol/NTMakefile

index e09db2b73405c13c0dd7e7a9648926c34c2015e1..096026fe7ae768966846ca65a94aeaff4040b594 100644 (file)
--- a/src/vol/NTMakefile
+++ b/src/vol/NTMakefile
@@ -5,6 +5,8 @@
  # License.  For details, see the LICENSE file in the top-level source
  # directory or online at http://www.openafs.org/dl/license10.html
  
+AFSDEV_AUXCDEFINES = -DFSSYNC_BUILD_SERVER -DFSSYNC_BUILD_CLIENT
+
  RELDIR=vol
  !INCLUDE ..\config\NTMakefile.$(SYS_NAME)
  !INCLUDE ..\config\NTMakefile.version
diff --git a/src/vol/daemon_com.c b/src/vol/daemon_com.c

new file mode 100644 (file)

index 0000000..26bddbf
--- /dev/null
+++ b/src/vol/daemon_com.c
@@ -0,0 +1,473 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ * localhost interprocess communication for servers
+ *
+ * currently handled by a localhost socket
+ * (yes, this needs to be replaced someday)
+ */
+
+#ifndef _WIN32
+#define FD_SETSIZE 65536
+#endif
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <sys/types.h>
+#include <stdio.h>
+#ifdef AFS_NT40_ENV
+#include <winsock2.h>
+#include <time.h>
+#else
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <sys/time.h>
+#endif
+#include <errno.h>
+#include <assert.h>
+#include <signal.h>
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#else
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#endif
+
+
+#include <rx/xdr.h>
+#include <afs/afsint.h>
+#include "nfs.h"
+#include <afs/errors.h>
+#include "daemon_com.h"
+#include "lwp.h"
+#include "lock.h"
+#include <afs/afssyscalls.h>
+#include "ihandle.h"
+#include "vnode.h"
+#include "volume.h"
+#include "partition.h"
+#include <rx/rx_queue.h>
+
+/*@printflike@*/ extern void Log(const char *format, ...);
+
+#ifdef osi_Assert
+#undef osi_Assert
+#endif
+#define osi_Assert(e) (void)(e)
+
+int (*V_BreakVolumeCallbacks) ();
+
+#define MAXHANDLERS    4       /* Up to 4 clients; must be at least 2, so that
+                                * move = dump+restore can run on single server */
+
+#define MAX_BIND_TRIES 5       /* Number of times to retry socket bind */
+
+/* forward declarations of the file-private helpers; the return types match
+ * the definitions further down in this file */
+static int getport(SYNC_client_state * state, struct sockaddr_in *addr);
+static afs_int32 SYNC_ask_internal(SYNC_client_state * state, SYNC_command * com, SYNC_response * res);
+
+/* daemon com SYNC client interface */
+
+/*
+ * Establish the client connection to the SYNC server on localhost.
+ *
+ * state: client state; state->port selects the server and state->fd
+ *        receives the connected socket.
+ *
+ * Returns 1 if the channel is (or already was) connected, 0 once every
+ * delay in the backoff schedule has been used up.  On failure the socket
+ * is closed and state->fd is reset to -1, which is the condition SYNC_ask
+ * tests to detect a failed connect.
+ */
+int
+SYNC_connect(SYNC_client_state * state)
+{
+    struct sockaddr_in addr;
+    /* I can't believe the following is needed for localhost connections!! */
+    static time_t backoff[] =
+       { 3, 3, 3, 5, 5, 5, 7, 15, 16, 24, 32, 40, 48, 0 };
+    time_t *timeout = &backoff[0];
+
+    if (state->fd >= 0) {
+       return 1;
+    }
+
+    for (;;) {
+       state->fd = getport(state, &addr);
+       if (connect(state->fd, (struct sockaddr *)&addr, sizeof(addr)) >= 0)
+           return 1;
+       if (!*timeout)
+           break;              /* a zero entry terminates the backoff schedule */
+       if (!(*timeout & 1))    /* only log on the even-valued delays */
+           Log("SYNC_connect temporary failure (will retry)\n");
+       SYNC_disconnect(state);
+       sleep(*timeout++);      /* sleep the current delay, then advance */
+    }
+    /* report connect()'s errno before closing the socket can overwrite it */
+    perror("SYNC_connect failed (giving up!)");
+    /* do not leave an unconnected descriptor behind in state->fd */
+    SYNC_disconnect(state);
+    return 0;
+}
+
+/*
+ * Close the client socket, if one is open, and mark the channel as
+ * disconnected (state->fd == -1).  No protocol-level shutdown message is
+ * sent.  Always returns 0.
+ */
+int
+SYNC_disconnect(SYNC_client_state * state)
+{
+    /* nothing to close when already disconnected; avoids a close(-1) */
+    if (state->fd >= 0) {
+#ifdef AFS_NT40_ENV
+       closesocket(state->fd);
+#else
+       close(state->fd);
+#endif
+    }
+    state->fd = -1;
+    return 0;
+}
+
+/*
+ * Gracefully close the channel: send SYNC_COM_CHANNEL_CLOSE to the server,
+ * log anything unexpected about its answer, then close the socket
+ * regardless of the outcome.
+ *
+ * Returns SYNC_OK immediately when the channel is not open; otherwise
+ * returns the code produced by SYNC_ask for the close request.
+ *
+ * Side effect: state->retry_limit and state->hard_timeout are set to 0 so
+ * the close request is attempted exactly once.
+ */
+afs_int32
+SYNC_closeChannel(SYNC_client_state * state)
+{
+    afs_int32 code;
+    SYNC_command com;
+    SYNC_response res;
+    SYNC_PROTO_BUF_DECL(ores);
+
+    if (state->fd == -1)
+       return SYNC_OK;
+
+    memset(&com, 0, sizeof(com));
+    memset(&res, 0, sizeof(res));
+
+    /* give the response a full-size payload buffer to land in */
+    res.payload.len = SYNC_PROTO_MAX_LEN;
+    res.payload.buf = ores;
+
+    /* header-only command: no payload */
+    com.hdr.command = SYNC_COM_CHANNEL_CLOSE;
+    com.hdr.command_len = sizeof(SYNC_command_hdr);
+
+    /* in case the other end dropped, don't do any retries */
+    state->retry_limit = 0;
+    state->hard_timeout = 0;
+
+    code = SYNC_ask(state, &com, &res);
+
+    if (code == SYNC_OK) {
+       if (res.hdr.response != SYNC_OK) {
+           Log("SYNC_closeChannel:  channel shutdown request denied; closing socket anyway\n");
+       } else if (!(res.hdr.flags & SYNC_FLAG_CHANNEL_SHUTDOWN)) {
+           Log("SYNC_closeChannel:  channel shutdown request mishandled by server\n");
+       }
+    } else {
+       /* newline added so this entry does not run into the next log line */
+       Log("SYNC_closeChannel: channel communications problem\n");
+    }
+
+    SYNC_disconnect(state);
+
+    return code;
+}
+
+/*
+ * Drop the current socket and connect afresh.
+ * Returns whatever SYNC_connect returns for the new attempt.
+ */
+int
+SYNC_reconnect(SYNC_client_state * state)
+{
+    int connected;
+
+    SYNC_disconnect(state);
+    connected = SYNC_connect(state);
+
+    return connected;
+}
+
+/*
+ * private function to fill in the sockaddr struct for us
+ *
+ * Creates a TCP socket and fills *addr with the loopback address
+ * (127.0.0.1) and state->port.  Returns the new socket descriptor; the
+ * assertion fires if the socket could not be created.
+ */
+static int
+getport(SYNC_client_state * state, struct sockaddr_in *addr)
+{
+    int sd;
+
+    memset(addr, 0, sizeof(*addr));
+    /* keep the socket() call outside assert() so the socket is still
+     * created when assertions are compiled out (NDEBUG) */
+    sd = socket(AF_INET, SOCK_STREAM, 0);
+    assert(sd >= 0);
+#ifdef STRUCT_SOCKADDR_HAS_SA_LEN
+    addr->sin_len = sizeof(struct sockaddr_in);
+#endif
+    addr->sin_addr.s_addr = htonl(0x7f000001);
+    addr->sin_family = AF_INET;        /* was localhost->h_addrtype */
+    addr->sin_port = htons(state->port);       /* XXXX htons not _really_ necessary */
+
+    return sd;
+}
+
+afs_int32
+SYNC_ask(SYNC_client_state * state, SYNC_command * com, SYNC_response * res)
+{
+    int tries;
+    afs_uint32 now, timeout, code=SYNC_OK;
+
+    if (state->fatal_error) {
+       return SYNC_COM_ERROR;
+    }
+
+    if (state->fd == -1) {
+       SYNC_connect(state);
+    }
+
+    if (state->fd == -1) {
+       state->fatal_error = 1;
+       return SYNC_COM_ERROR;
+    }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    com->hdr.flags |= SYNC_FLAG_DAFS_EXTENSIONS;
+#endif
+
+    now = FT_ApproxTime();
+    timeout = now + state->hard_timeout;
+    for (tries = 0; 
+        (tries <= state->retry_limit) && (now <= timeout);
+        tries++, now = FT_ApproxTime()) {
+       code = SYNC_ask_internal(state, com, res);
+       if (code == SYNC_OK) {
+           break;
+       } else if (code == SYNC_BAD_COMMAND) {
+           Log("SYNC_ask: protocol mismatch; make sure fileserver, volserver, salvageserver and salvager are same version\n");
+           break;
+       } else if (code == SYNC_COM_ERROR) {
+           Log("SYNC_ask: protocol communications failure; attempting reconnect to server\n");
+           SYNC_reconnect(state);
+           /* try again */
+       } else {
+           /* unknown (probably protocol-specific) response code, pass it up to the caller, and let them deal with it */
+           break;
+       }
+    }
+
+    if (code == SYNC_COM_ERROR) {
+       Log("SYNC_ask: fatal protocol error; disabling sync protocol to server running on port %d until next server restart\n", 
+           state->port);
+       state->fatal_error = 1;
+    }
+
+    return code;
+}
+
+/*
+ * Perform one request/response exchange on an already-open channel.
+ *
+ * The command header and (when com->payload.len is non-zero) payload are
+ * serialized into a local buffer and sent with a single write/send; the
+ * response is then collected with a single readv/recv into res->hdr and
+ * res->payload.buf (capacity res->payload.len).  res->recv_len records the
+ * number of bytes received.
+ *
+ * Returns res->hdr.response, which is forced to SYNC_COM_ERROR whenever the
+ * exchange fails or the message framing is inconsistent.
+ */
+static afs_int32
+SYNC_ask_internal(SYNC_client_state * state, SYNC_command * com, SYNC_response * res)
+{
+    int n;
+    SYNC_PROTO_BUF_DECL(buf);
+#ifndef AFS_NT40_ENV
+    int iovcnt;
+    struct iovec iov[2];
+#endif
+
+    if (state->fd == -1) {
+       Log("SYNC_ask:  invalid sync file descriptor\n");
+       res->hdr.response = SYNC_COM_ERROR;
+       goto done;
+    }
+
+    if (com->hdr.command_len > SYNC_PROTO_MAX_LEN) {
+       Log("SYNC_ask:  internal SYNC buffer too small; please file a bug\n");
+       res->hdr.response = SYNC_COM_ERROR;
+       goto done;
+    }
+
+    /* a command can never be shorter than its own header; without this
+     * check the payload length computed below would wrap around */
+    if (com->hdr.command_len < sizeof(com->hdr)) {
+       Log("SYNC_ask:  command_len field in command header inconsistent\n");
+       res->hdr.response = SYNC_COM_ERROR;
+       goto done;
+    }
+
+    com->hdr.proto_version = state->proto_version;
+
+    memcpy(buf, &com->hdr, sizeof(com->hdr));
+    if (com->payload.len) {
+       memcpy(buf + sizeof(com->hdr), com->payload.buf, 
+              com->hdr.command_len - sizeof(com->hdr));
+    }
+
+#ifdef AFS_NT40_ENV
+    n = send(state->fd, buf, com->hdr.command_len, 0);
+    if (n != com->hdr.command_len) {
+       Log("SYNC_ask:  write failed\n");
+       res->hdr.response = SYNC_COM_ERROR;
+       goto done;
+    }
+
+    n = recv(state->fd, buf, SYNC_PROTO_MAX_LEN, 0);
+    if (n == 0 || (n < 0 && WSAEINTR != WSAGetLastError())) {
+       Log("SYNC_ask:  No response\n");
+       res->hdr.response = SYNC_COM_ERROR;
+       goto done;
+    }
+#else /* !AFS_NT40_ENV */
+    n = write(state->fd, buf, com->hdr.command_len);
+    if (com->hdr.command_len != n) {
+       Log("SYNC_ask: write failed\n");
+       res->hdr.response = SYNC_COM_ERROR;
+       goto done;
+    }
+
+    /* receive the response */
+    iov[0].iov_base = (char *)&res->hdr;
+    iov[0].iov_len = sizeof(res->hdr);
+    if (res->payload.len) {
+       iov[1].iov_base = (char *)res->payload.buf;
+       iov[1].iov_len = res->payload.len;
+       iovcnt = 2;
+    } else {
+       iovcnt = 1;
+    }
+    n = readv(state->fd, iov, iovcnt);
+    if (n == 0 || (n < 0 && errno != EINTR)) {
+       Log("SYNC_ask: No response\n");
+       res->hdr.response = SYNC_COM_ERROR;
+       goto done;
+    }
+#endif /* !AFS_NT40_ENV */
+
+    res->recv_len = n;
+
+    /* n can still be negative here (interrupted read); test the sign
+     * explicitly instead of letting it be compared as an unsigned value */
+    if (n < 0 || (size_t)n < sizeof(res->hdr)) {
+       Log("SYNC_ask:  response too short\n");
+       res->hdr.response = SYNC_COM_ERROR;
+       goto done;
+    }
+#ifdef AFS_NT40_ENV
+    memcpy(&res->hdr, buf, sizeof(res->hdr));
+#endif
+
+    if ((n - sizeof(res->hdr)) > res->payload.len) {
+       Log("SYNC_ask:  response too long\n");
+       res->hdr.response = SYNC_COM_ERROR;
+       goto done;
+    }
+#ifdef AFS_NT40_ENV
+    memcpy(res->payload.buf, buf + sizeof(res->hdr), n - sizeof(res->hdr));
+#endif
+
+    /* the length the server claims must match what actually arrived */
+    if (res->hdr.response_len != n) {
+       Log("SYNC_ask:  length field in response inconsistent\n");
+       res->hdr.response = SYNC_COM_ERROR;
+       goto done;
+    }
+    if (res->hdr.response == SYNC_DENIED) {
+       Log("SYNC_ask: negative response\n");
+    }
+
+  done:
+    return res->hdr.response;
+}
+
+
+/* 
+ * daemon com SYNC server-side interfaces 
+ */
+
+/*
+ * get a command
+ *
+ * Reads one command from fd with a single readv/recv: the header goes into
+ * com->hdr and any payload into com->payload.buf, whose capacity the caller
+ * supplies in com->payload.len.  com->recv_len records the number of bytes
+ * received.
+ *
+ * Returns SYNC_OK on success, or SYNC_COM_ERROR when nothing could be read
+ * or the message is shorter than a header or longer than the buffers.
+ */
+afs_int32
+SYNC_getCom(int fd, SYNC_command * com)
+{
+    int n;
+    afs_int32 code = SYNC_OK;
+#ifdef AFS_NT40_ENV
+    SYNC_PROTO_BUF_DECL(buf);
+#else
+    struct iovec iov[2];
+    int iovcnt;
+#endif
+
+#ifdef AFS_NT40_ENV
+    n = recv(fd, buf, SYNC_PROTO_MAX_LEN, 0);
+
+    if (n == 0 || (n < 0 && WSAEINTR != WSAGetLastError())) {
+       Log("SYNC_getCom:  error receiving command\n");
+       code = SYNC_COM_ERROR;
+       goto done;
+    }
+#else /* !AFS_NT40_ENV */
+    iov[0].iov_base = (char *)&com->hdr;
+    iov[0].iov_len = sizeof(com->hdr);
+    if (com->payload.len) {
+       iov[1].iov_base = (char *)com->payload.buf;
+       iov[1].iov_len = com->payload.len;
+       iovcnt = 2;
+    } else {
+       iovcnt = 1;
+    }
+
+    n = readv(fd, iov, iovcnt);
+    if (n == 0 || (n < 0 && errno != EINTR)) {
+       Log("SYNC_getCom:  error receiving command\n");
+       code = SYNC_COM_ERROR;
+       goto done;
+    }
+#endif /* !AFS_NT40_ENV */
+
+    com->recv_len = n;
+
+    /* n can still be negative here (interrupted read); test the sign
+     * explicitly instead of letting it be compared as an unsigned value */
+    if (n < 0 || (size_t)n < sizeof(com->hdr)) {
+       Log("SYNC_getCom:  command too short\n");
+       code = SYNC_COM_ERROR;
+       goto done;
+    }
+#ifdef AFS_NT40_ENV
+    memcpy(&com->hdr, buf, sizeof(com->hdr));
+#endif
+
+    if ((n - sizeof(com->hdr)) > com->payload.len) {
+       Log("SYNC_getCom:  command too long\n");
+       code = SYNC_COM_ERROR;
+       goto done;
+    }
+#ifdef AFS_NT40_ENV
+    memcpy(com->payload.buf, buf + sizeof(com->hdr), n - sizeof(com->hdr));
+#endif
+
+ done:
+    return code;
+}
+
+/* put a response */
+afs_int32
+SYNC_putRes(int fd, SYNC_response * res)
+{
+    int n;
+    afs_int32 code = SYNC_OK;
+    SYNC_PROTO_BUF_DECL(buf);
+
+    if (res->hdr.response_len > (sizeof(res->hdr) + res->payload.len)) {
+       Log("SYNC_putRes:  response_len field in response header inconsistent\n");
+       code = SYNC_COM_ERROR;
+       goto done;
+    }
+
+    if (res->hdr.response_len > SYNC_PROTO_MAX_LEN) {
+       Log("SYNC_putRes:  internal SYNC buffer too small; please file a bug\n");
+       code = SYNC_COM_ERROR;
+       goto done;
+    }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    res->hdr.flags |= SYNC_FLAG_DAFS_EXTENSIONS;
+#endif
+
+    memcpy(buf, &res->hdr, sizeof(res->hdr));
+    if (res->payload.len) {
+       memcpy(buf + sizeof(res->hdr), res->payload.buf, 
+              res->hdr.response_len - sizeof(res->hdr));
+    }
+
+#ifdef AFS_NT40_ENV
+    n = send(fd, buf, res->hdr.response_len, 0);
+#else /* !AFS_NT40_ENV */
+    n = write(fd, buf, res->hdr.response_len);
+#endif /* !AFS_NT40_ENV */
+
+    if (res->hdr.response_len != n) {
+       Log("SYNC_putRes: write failed\n");
+       res->hdr.response = SYNC_COM_ERROR;
+       goto done;
+    }
+
+ done:
+    return code;
+}
+
+/* return 0 for legal (null-terminated) string,
+ * 1 for illegal (unterminated) string
+ *
+ * buf: start of the string field inside a protocol message
+ * len: size of that field in bytes
+ *
+ * The string counts as unterminated when afs_strnlen reports the full
+ * field length, i.e. no terminator was found within len bytes. */
+int
+SYNC_verifyProtocolString(char * buf, size_t len)
+{
+    size_t s_len;
+
+    s_len = afs_strnlen(buf, len);
+
+    return (s_len == len) ? 1 : 0;
+}
diff --git a/src/vol/daemon_com.h b/src/vol/daemon_com.h

new file mode 100644 (file)

index 0000000..8464367
--- /dev/null
+++ b/src/vol/daemon_com.h
@@ -0,0 +1,141 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+#ifndef _AFS_VOL_DAEMON_COM_H
+#define _AFS_VOL_DAEMON_COM_H
+
+/* 
+ * SYNC protocol constants
+ */
+
+/* SYNC protocol command codes
+ *
+ * command codes 0-65535 are reserved for
+ * global SYNC package command codes
+ */
+#define SYNC_COM_CODE_USER_BASE 65536
+#define SYNC_COM_CODE_DECL(code) (SYNC_COM_CODE_USER_BASE+(code))
+
+/* general command codes */
+#define SYNC_COM_CHANNEL_CLOSE 0
+
+
+/* SYNC protocol response codes
+ *
+ * response codes 0-65535 are reserved for 
+ * global SYNC package response codes
+ */
+#define SYNC_RES_CODE_USER_BASE 65536
+#define SYNC_RES_CODE_DECL(code) (SYNC_RES_CODE_USER_BASE+(code))
+
+/* general response codes */
+#define SYNC_OK                0   /* sync call returned ok */
+#define SYNC_DENIED            1   /* sync request denied by server */
+#define SYNC_COM_ERROR         2   /* sync protocol communications error */
+#define SYNC_BAD_COMMAND       3   /* sync command code not implemented by server */
+#define SYNC_FAILED            4   /* sync server-side procedure failed */
+
+
+/* SYNC protocol reason codes
+ *
+ * reason codes 0-65535 are reserved for
+ * global SYNC package reason codes
+ */
+#define SYNC_REASON_CODE_USER_BASE 65536
+#define SYNC_REASON_CODE_DECL(code) (SYNC_REASON_CODE_USER_BASE+(code))
+
+/* general reason codes */
+#define SYNC_REASON_NONE                 0
+#define SYNC_REASON_MALFORMED_PACKET     1
+
+
+/* SYNC protocol flags
+ *
+ * flag bits 0-7 are reserved for
+ * global SYNC package flags
+ */
+#define SYNC_FLAG_CODE_USER_BASE 8
+#define SYNC_FLAG_CODE_DECL(code) (1 << (SYNC_FLAG_CODE_USER_BASE+(code)))
+
+/* general flag codes */
+#define SYNC_FLAG_CHANNEL_SHUTDOWN   0x1
+#define SYNC_FLAG_DAFS_EXTENSIONS    0x2   /* signal that other end of socket is compiled
+                                           * with demand attach extensions */
+
+/* SYNC protocol response buffers */
+#define SYNC_PROTO_MAX_LEN     768  /* maximum size of sync protocol message */
+
+/* use a large type to get proper buffer alignment so we can safely cast the pointer */
+#define SYNC_PROTO_BUF_DECL(buf) \
+    afs_int64 _##buf##_l[SYNC_PROTO_MAX_LEN/sizeof(afs_int64)]; \
+    char * buf = (char *)(_##buf##_l)
+
+
+/* client-side state object: one per connection to a SYNC server */
+typedef struct SYNC_client_state {
+    int fd;                     /* socket to the server; -1 when disconnected */
+    afs_uint16 port;            /* server port on localhost, host byte order */
+    afs_uint32 proto_version;   /* copied into every outgoing command header */
+    int retry_limit;            /* max number of times for SYNC_ask to retry */
+    afs_int32 hard_timeout;     /* upper limit on time to keep trying */
+    byte fatal_error;           /* fatal error on this client conn */
+} SYNC_client_state;
+
+/* wire types */
+/* fixed-size header sent at the front of every command */
+typedef struct SYNC_command_hdr {
+    afs_uint32 proto_version;   /* sync protocol version */
+    afs_int32 programType;      /* type of program issuing the request */
+    afs_int32 command;          /* request type */
+    afs_int32 reason;           /* reason for request */
+    afs_uint32 command_len;     /* entire length of command */
+    afs_uint32 flags;           /* SYNC_FLAG_* bits */
+} SYNC_command_hdr;
+
+/* fixed-size header sent at the front of every response */
+typedef struct SYNC_response_hdr {
+    afs_uint32 proto_version;    /* sync protocol version */
+    afs_uint32 response_len;    /* entire length of response */
+    afs_int32 response;         /* response code */
+    afs_int32 reason;           /* reason for response */
+    afs_uint32 flags;           /* SYNC_FLAG_* bits */
+} SYNC_response_hdr;
+
+
+/* user-visible types */
+/* a command as handled in memory: wire header plus a caller-owned payload buffer */
+typedef struct SYNC_command {
+    SYNC_command_hdr hdr;
+    struct {
+       afs_uint32 len;         /* when receiving: capacity of buf; when sending: non-zero means buf holds payload */
+       void * buf;             /* payload storage supplied by the caller */
+    } payload;
+    afs_int32 recv_len;         /* byte count of the last receive, set by SYNC_getCom */
+} SYNC_command;
+
+/* a response as handled in memory: wire header plus a caller-owned payload buffer */
+typedef struct SYNC_response {
+    SYNC_response_hdr hdr;
+    struct {
+       afs_uint32 len;         /* when receiving: capacity of buf; when sending: non-zero means buf holds payload */
+       void * buf;             /* payload storage supplied by the caller */
+    } payload;
+    afs_int32 recv_len;         /* byte count of the last receive, set by SYNC_ask */
+} SYNC_response;
+
+
+/* client-side prototypes */
+extern afs_int32 SYNC_ask(SYNC_client_state *, SYNC_command * com, SYNC_response * res);
+extern int SYNC_connect(SYNC_client_state *);             /* setup the channel */
+extern int SYNC_disconnect(SYNC_client_state *);          /* just close the socket */
+extern afs_int32 SYNC_closeChannel(SYNC_client_state *);  /* do a graceful channel close */
+extern int SYNC_reconnect(SYNC_client_state *);           /* do a reconnect after a protocol error, or from a forked child */
+
+/* server-side prototypes */
+extern afs_int32 SYNC_getCom(int fd, SYNC_command * com);    /* receive one command; SYNC_OK or SYNC_COM_ERROR */
+extern afs_int32 SYNC_putRes(int fd, SYNC_response * res);   /* send one response; SYNC_OK or SYNC_COM_ERROR */
+extern int SYNC_verifyProtocolString(char * buf, size_t len); /* 0 if NUL-terminated within len, else 1 */
+
+#endif /* _AFS_VOL_DAEMON_COM_H */
diff --git a/src/vol/fssync-client.c b/src/vol/fssync-client.c

new file mode 100644 (file)

index 0000000..205a089
--- /dev/null
+++ b/src/vol/fssync-client.c
@@ -0,0 +1,222 @@
+/*
+ * Copyright 2000, International Business Machines Corporation and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
+ */
+
+/*
+       System:         VICE-TWO
+       Module:         fssync.c
+       Institution:    The Information Technology Center, Carnegie-Mellon University
+
+ */
+#ifdef notdef
+
+/* All this is going away in early 1989 */
+int newVLDB;                   /* Compatibility flag */
+
+#endif
+static int newVLDB = 1;
+
+
+#ifndef AFS_PTHREAD_ENV
+#define USUAL_PRIORITY (LWP_MAX_PRIORITY - 2)
+
+/*
+ * stack size increased from 8K because the HP machine seemed to have trouble
+ * with the smaller stack
+ */
+#define USUAL_STACK_SIZE       (24 * 1024)
+#endif /* !AFS_PTHREAD_ENV */
+
+/*
+   fssync-client.c
+   File server synchronization with external volume utilities.
+   client-side implementation
+ */
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <sys/types.h>
+#include <stdio.h>
+#ifdef AFS_NT40_ENV
+#include <winsock2.h>
+#include <time.h>
+#else
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <sys/time.h>
+#endif
+#include <errno.h>
+#ifdef AFS_PTHREAD_ENV
+#include <assert.h>
+#else /* AFS_PTHREAD_ENV */
+#include <afs/assert.h>
+#endif /* AFS_PTHREAD_ENV */
+#include <signal.h>
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#else
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#endif
+
+
+#include <rx/xdr.h>
+#include <afs/afsint.h>
+#include "nfs.h"
+#include <afs/errors.h>
+#include "daemon_com.h"
+#include "fssync.h"
+#include "lwp.h"
+#include "lock.h"
+#include <afs/afssyscalls.h>
+#include "ihandle.h"
+#include "vnode.h"
+#include "volume.h"
+#include "partition.h"
+
+#ifdef FSSYNC_BUILD_CLIENT
+
+/*@printflike@*/ extern void Log(const char *format, ...);
+
+#ifdef osi_Assert
+#undef osi_Assert
+#endif
+#define osi_Assert(e) (void)(e)
+
+extern int LogLevel;
+
+static SYNC_client_state fssync_state = { -1, 2040, FSYNC_PROTO_VERSION, 5, 120 };
+
+#ifdef AFS_PTHREAD_ENV
+/*
+ * vol_fsync_mutex is taken around every SYNC_ask() on fssync_state (see
+ * FSYNC_askfs).  vol_fsync_mutex_init records whether FSYNC_clientInit()
+ * has already initialized the mutex.
+ */
+static pthread_mutex_t vol_fsync_mutex;
+static volatile int vol_fsync_mutex_init = 0;
+
+/*
+ * The pthread calls are made outside of assert(): with the standard
+ * <assert.h> an NDEBUG build discards the whole assert() argument, which
+ * would silently remove the locking itself.
+ */
+#define VFSYNC_LOCK \
+    do { \
+        int vfsync_rc_ = pthread_mutex_lock(&vol_fsync_mutex); \
+        assert(vfsync_rc_ == 0); \
+        (void)vfsync_rc_; \
+    } while (0)
+#define VFSYNC_UNLOCK \
+    do { \
+        int vfsync_rc_ = pthread_mutex_unlock(&vol_fsync_mutex); \
+        assert(vfsync_rc_ == 0); \
+        (void)vfsync_rc_; \
+    } while (0)
+#else
+/* LWP build: no locking is performed */
+#define VFSYNC_LOCK
+#define VFSYNC_UNLOCK
+#endif
+
+/*
+ * Prepare the FSSYNC client side and open the channel.
+ *
+ * In pthread builds the client mutex is initialized on the first call.
+ * Returns whatever SYNC_connect() returns for fssync_state.
+ */
+int
+FSYNC_clientInit(void)
+{
+#ifdef AFS_PTHREAD_ENV
+    /* this is safe since it gets called with VOL_LOCK held, or before we go multithreaded */
+    if (!vol_fsync_mutex_init) {
+        /*
+         * Keep the call outside assert(): an NDEBUG build drops the
+         * assert() argument and would leave the mutex uninitialized.
+         */
+        int rc = pthread_mutex_init(&vol_fsync_mutex, NULL);
+
+        assert(rc == 0);
+        (void)rc;
+        vol_fsync_mutex_init = 1;
+    }
+#endif
+    return SYNC_connect(&fssync_state);
+}
+
+/*
+ * Tear down the FSSYNC client channel via SYNC_closeChannel().
+ * The result of the close is intentionally not reported to the caller.
+ */
+void
+FSYNC_clientFinis(void)
+{
+    (void)SYNC_closeChannel(&fssync_state);
+}
+
+/*
+ * Re-establish the FSSYNC client channel through SYNC_reconnect() and hand
+ * its result back unchanged.
+ */
+int
+FSYNC_clientChildProcReconnect(void)
+{
+    int code;
+
+    code = SYNC_reconnect(&fssync_state);
+    return code;
+}
+
+/* fsync client interface */
+/*
+ * fsync client interface.
+ *
+ * Sends com over the shared fssync_state channel (serialized by
+ * VFSYNC_LOCK), stores the reply in res and returns the SYNC_ask() code
+ * unchanged.  Anything other than SYNC_OK / SYNC_FAILED is logged.
+ */
+afs_int32
+FSYNC_askfs(SYNC_command * com, SYNC_response * res)
+{
+    afs_int32 result;
+
+    VFSYNC_LOCK;
+    result = SYNC_ask(&fssync_state, com, res);
+    VFSYNC_UNLOCK;
+
+    /* ordinary outcomes: nothing to log */
+    if (result == SYNC_OK || result == SYNC_FAILED) {
+        return result;
+    }
+
+    if (result == SYNC_COM_ERROR || result == SYNC_BAD_COMMAND) {
+        Log("FSYNC_askfs: fatal FSSYNC protocol error; volume management functionality disabled until next fileserver restart\n");
+    } else if (result == SYNC_DENIED) {
+        Log("FSYNC_askfs: FSSYNC request denied for reason=%d\n", res->hdr.reason);
+    } else {
+        Log("FSYNC_askfs: unknown protocol response %d\n", result);
+    }
+
+    return result;
+}
+
+/*
+ * Build a SYNC_command around an opaque extended header and send it.
+ *
+ * ext_hdr / ext_len  payload bytes and their length
+ * command, reason    values copied into the command header
+ * res_in             where to put the reply; when NULL a local scratch
+ *                    response with no payload buffer is used instead
+ *
+ * Returns the FSYNC_askfs() result.
+ */
+afs_int32
+FSYNC_GenericOp(void * ext_hdr, size_t ext_len,
+             int command, int reason,
+             SYNC_response * res_in)
+{
+    SYNC_response scratch;
+    SYNC_response *reply = res_in;
+    SYNC_command request;
+
+    if (reply == NULL) {
+        /* caller does not care about the reply */
+        scratch.payload.buf = NULL;
+        scratch.payload.len = 0;
+        reply = &scratch;
+    }
+
+    memset(&request, 0, sizeof(request));
+
+    request.payload.buf = ext_hdr;
+    request.payload.len = ext_len;
+    request.hdr.command_len = sizeof(request.hdr) + ext_len;
+    request.hdr.reason = reason;
+    request.hdr.command = command;
+    request.hdr.programType = programType;
+
+    return FSYNC_askfs(&request, reply);
+}
+
+/*
+ * Issue a volume operation: packs the volume id and (optional) partition
+ * name into a zeroed FSSYNC_VolOp_hdr and forwards it through
+ * FSYNC_GenericOp().  A NULL partition leaves partName all zeroes.
+ */
+afs_int32
+FSYNC_VolOp(VolumeId volume, char * partition,
+           int command, int reason,
+           SYNC_response * res)
+{
+    FSSYNC_VolOp_hdr ext;
+
+    memset(&ext, 0, sizeof(ext));
+    ext.volume = volume;
+
+    if (partition != NULL) {
+        strlcpy(ext.partName, partition, sizeof(ext.partName));
+    }
+
+    return FSYNC_GenericOp(&ext, sizeof(ext), command, reason, res);
+}
+
+/*
+ * Issue a statistics operation: scom is sent as the extended header of a
+ * generic FSSYNC request.  Returns the FSYNC_GenericOp() result.
+ */
+afs_int32
+FSYNC_StatsOp(FSSYNC_StatsOp_hdr * scom, int command, int reason,
+             SYNC_response * res)
+{
+    const size_t scom_len = sizeof(FSSYNC_StatsOp_hdr);
+
+    return FSYNC_GenericOp(scom, scom_len, command, reason, res);
+}
+
+
+#endif /* FSSYNC_BUILD_CLIENT */
diff --git a/src/vol/fssync-debug.c b/src/vol/fssync-debug.c

new file mode 100644 (file)

index 0000000..194204e
--- /dev/null
+++ b/src/vol/fssync-debug.c
@@ -0,0 +1,1148 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/* Main program file. Define globals. */
+#define MAIN 1
+
+/*
+ * fssync administration tool
+ */
+
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <dirent.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <errno.h>
+#ifdef AFS_NT40_ENV
+#include <io.h>
+#include <WINNT/afsevent.h>
+#else
+#include <sys/param.h>
+#include <sys/file.h>
+#ifndef ITIMER_REAL
+#include <sys/time.h>
+#endif /* ITIMER_REAL */
+#endif
+#include <rx/xdr.h>
+#include <afs/afsint.h>
+#include <afs/assert.h>
+
+
+#include <fcntl.h>
+
+#ifndef AFS_NT40_ENV
+#include <afs/osi_inode.h>
+#endif
+
+#include <afs/cmd.h>
+#include <afs/afsutil.h>
+#include <afs/fileutil.h>
+
+#include "nfs.h"
+#include "lwp.h"
+#include "lock.h"
+#include "ihandle.h"
+#include "vnode.h"
+#include "volume.h"
+#include "partition.h"
+#include "daemon_com.h"
+#include "fssync.h"
+#ifdef AFS_NT40_ENV
+#include <pthread.h>
+#endif
+
+int VolumeChanged; /* hack to make dir package happy */
+
+
+struct volop_state {   /* target of a volume operation, filled in by common_volop_prolog() */
+    afs_uint32 volume; /* parsed from the -volumeid switch */
+    char partName[16]; /* copied from -partition; all zeroes when the switch is absent */
+};
+
+struct state {         /* per-invocation settings shared by the subcommand handlers */
+    afs_int32 reason;  /* parsed from the -reason switch by common_prolog() */
+    struct volop_state * vop;  /* allocated by common_volop_prolog() */
+};
+
+static int common_prolog(struct cmd_syndesc *, struct state *);
+static int common_volop_prolog(struct cmd_syndesc *, struct state *);
+
+static int do_volop(struct state *, afs_int32 command, SYNC_response * res);
+
+static char * response_code_to_string(afs_int32);
+static char * command_code_to_string(afs_int32);
+static char * reason_code_to_string(afs_int32);
+static char * program_type_to_string(afs_int32);
+
+static int VolOnline(struct cmd_syndesc * as, char * rock);
+static int VolOffline(struct cmd_syndesc * as, char * rock);
+static int VolMode(struct cmd_syndesc * as, char * rock);
+static int VolDetach(struct cmd_syndesc * as, char * rock);
+static int VolBreakCBKs(struct cmd_syndesc * as, char * rock);
+static int VolMove(struct cmd_syndesc * as, char * rock);
+static int VolList(struct cmd_syndesc * as, char * rock);
+static int VolQuery(struct cmd_syndesc * as, char * rock);
+static int VolHdrQuery(struct cmd_syndesc * as, char * rock);
+static int VolOpQuery(struct cmd_syndesc * as, char * rock);
+static int StatsQuery(struct cmd_syndesc * as, char * rock);
+
+
+static void print_vol_stats_general(VolPkgStats * stats);
+static void print_vol_stats_viceP(struct DiskPartitionStats * stats);
+static void print_vol_stats_hash(struct VolumeHashChainStats * stats);
+#ifdef AFS_DEMAND_ATTACH_FS
+static void print_vol_stats_hdr(struct volume_hdr_LRU_stats * stats);
+#endif
+
+#ifndef AFS_NT40_ENV
+#include "AFS_component_version_number.c"
+#endif
+#define MAX_ARGS 128
+
+/*
+ * Parameter registration helpers.
+ *
+ * The *_OFFSET values are the cmd_syndesc.parms[] slots the prolog
+ * functions read back: -reason / -programtype live at COMMON_PARMS_OFFSET
+ * and +1, -volumeid / -partition at COMMON_VOLOP_PARMS_OFFSET and +1.
+ *
+ * Every multi-statement macro is wrapped in do { } while (0) so that it
+ * behaves as a single statement (e.g. under an unbraced if/else).
+ */
+#define COMMON_PARMS_OFFSET    12
+#define COMMON_PARMS(ts) \
+    do { \
+        cmd_Seek(ts, COMMON_PARMS_OFFSET); \
+        cmd_AddParm(ts, "-reason", CMD_SINGLE, CMD_OPTIONAL, "sync protocol reason code"); \
+        cmd_AddParm(ts, "-programtype", CMD_SINGLE, CMD_OPTIONAL, "program type code"); \
+    } while (0)
+
+#define COMMON_VOLOP_PARMS_OFFSET    10
+#define COMMON_VOLOP_PARMS(ts) \
+    do { \
+        cmd_Seek(ts, COMMON_VOLOP_PARMS_OFFSET); \
+        cmd_AddParm(ts, "-volumeid", CMD_SINGLE, 0, "volume id"); \
+        cmd_AddParm(ts, "-partition", CMD_SINGLE, CMD_OPTIONAL, "partition name"); \
+    } while (0)
+
+/* first slot used by subcommand-specific switches (see the "stats" command) */
+#define CUSTOM_PARMS_OFFSET 1
+
+
+/* switches for a volume-operation subcommand: volume selection + common switches */
+#define VOLOP_PARMS_DECL(ts) \
+    do { \
+        COMMON_VOLOP_PARMS(ts); \
+        COMMON_PARMS(ts); \
+    } while (0)
+/* switches for a subcommand that only takes the common switches */
+#define COMMON_PARMS_DECL(ts) \
+    COMMON_PARMS(ts)
+
+/*
+ * Entry point of fssync-debug: registers every subcommand with the cmd
+ * package and dispatches on the command line.  Exits with status 2 when
+ * the AFS server directories cannot be determined, otherwise with the
+ * cmd_Dispatch() result.
+ */
+int
+main(int argc, char **argv)
+{
+    /*
+     * Volume-operation subcommands, in registration order.  All of them
+     * take the same switches, so they are described by a table.
+     */
+    static struct {
+        char *name;
+        int (*proc) (struct cmd_syndesc *, char *);
+        char *help;
+        char *alias;           /* NULL when the subcommand has no alias */
+    } volops[] = {
+        {"online", VolOnline,
+         "bring a volume online (FSYNC_VOL_ON opcode)", NULL},
+        {"offline", VolOffline,
+         "take a volume offline (FSYNC_VOL_OFF opcode)", NULL},
+        {"mode", VolMode,
+         "change volume attach mode (FSYNC_VOL_NEEDVOLUME opcode)", "needvolume"},
+        {"detach", VolDetach,
+         "detach a volume (FSYNC_VOL_DONE opcode)", NULL},
+        {"callback", VolBreakCBKs,
+         "break callbacks for volume (FSYNC_VOL_BREAKCBKS opcode)", "cbk"},
+        {"move", VolMove,
+         "set volume moved flag (FSYNC_VOL_MOVE opcode)", NULL},
+        {"list", VolList,
+         "sync local volume list (FSYNC_VOL_LISTVOLUMES opcode)", "ls"},
+        {"query", VolQuery,
+         "get volume structure (FSYNC_VOL_QUERY opcode)", "qry"},
+        {"header", VolHdrQuery,
+         "get volume disk data structure (FSYNC_VOL_QUERY_HDR opcode)", "hdr"},
+        {"volop", VolOpQuery,
+         "get pending volume operation info (FSYNC_VOL_QUERY_VOP opcode)", "vop"},
+    };
+    struct cmd_syndesc *ts;
+    int err = 0;
+    int i;
+    extern char cml_version_number[];
+
+    /* Initialize directory paths */
+    if (!(initAFSDirPath() & AFSDIR_SERVER_PATHS_OK)) {
+#ifdef AFS_NT40_ENV
+        ReportErrorEventAlt(AFSEVT_SVR_NO_INSTALL_DIR, 0, argv[0], 0);
+#endif
+        fprintf(stderr, "%s: Unable to obtain AFS server directory.\n",
+                argv[0]);
+        exit(2);
+    }
+
+    for (i = 0; i < (int)(sizeof(volops) / sizeof(volops[0])); i++) {
+        ts = cmd_CreateSyntax(volops[i].name, volops[i].proc, 0, volops[i].help);
+        VOLOP_PARMS_DECL(ts);
+        if (volops[i].alias != NULL) {
+            cmd_CreateAlias(ts, volops[i].alias);
+        }
+    }
+
+    /* "stats" has its own switches ahead of the common ones */
+    ts = cmd_CreateSyntax("stats", StatsQuery, 0, "see 'stats help' for more information");
+    cmd_Seek(ts, CUSTOM_PARMS_OFFSET);
+    cmd_AddParm(ts, "-cmd", CMD_SINGLE, 0, "subcommand");
+    cmd_AddParm(ts, "-arg1", CMD_SINGLE, CMD_OPTIONAL, "arg1");
+    cmd_AddParm(ts, "-arg2", CMD_SINGLE, CMD_OPTIONAL, "arg2");
+    COMMON_PARMS_DECL(ts);
+
+    err = cmd_Dispatch(argc, argv);
+    exit(err);
+}
+
+/*
+ * Start-up shared by every subcommand.
+ *
+ * Initializes the volume and dir packages without connecting to the
+ * fileserver, applies the -reason and -programtype switches, then
+ * connects with VConnectFS().
+ *
+ * state->reason is always assigned: callers pass an uninitialized stack
+ * struct, so without a default the reason sent to the server would be
+ * indeterminate whenever -reason is omitted.
+ *
+ * Always returns 0.
+ */
+static int
+common_prolog(struct cmd_syndesc * as, struct state * state)
+{
+    struct cmd_item *ti;
+
+#ifdef AFS_NT40_ENV
+    if (afs_winsockInit() < 0) {
+        Exit(1);
+    }
+#endif
+
+    VInitVolumePackage(debugUtility, 1, 1,
+                       DONT_CONNECT_FS, 0);
+    DInit(1);
+
+    if ((ti = as->parms[COMMON_PARMS_OFFSET].items)) { /* -reason */
+        state->reason = atoi(ti->data);
+    } else {
+        state->reason = FSYNC_WHATEVER;
+    }
+
+    if ((ti = as->parms[COMMON_PARMS_OFFSET+1].items)) {       /* -programtype */
+        /* symbolic names first; anything else is taken as a number */
+        if (!strcmp(ti->data, "fileServer")) {
+            programType = fileServer;
+        } else if (!strcmp(ti->data, "volumeUtility")) {
+            programType = volumeUtility;
+        } else if (!strcmp(ti->data, "salvager")) {
+            programType = salvager;
+        } else if (!strcmp(ti->data, "salvageServer")) {
+            programType = salvageServer;
+        } else {
+            programType = (ProgramType) atoi(ti->data);
+        }
+    }
+
+    VConnectFS();
+
+    return 0;
+}
+
+/*
+ * Start-up shared by the volume-operation subcommands.
+ *
+ * Allocates state->vop (zero-filled) and fills it from the -volumeid and
+ * -partition switches.
+ *
+ * Returns 0 on success, -1 when -volumeid was not supplied (a message is
+ * printed to stderr and the volume id stays 0).
+ */
+static int
+common_volop_prolog(struct cmd_syndesc * as, struct state * state)
+{
+    struct cmd_item *ti;
+    int code = 0;
+
+    state->vop = (struct volop_state *) calloc(1, sizeof(struct volop_state));
+    assert(state->vop != NULL);
+
+    if ((ti = as->parms[COMMON_VOLOP_PARMS_OFFSET].items)) {   /* -volumeid */
+        /*
+         * The id is an unsigned 32-bit quantity; atoi() has undefined
+         * behavior for values above INT_MAX, so parse it as unsigned.
+         */
+        state->vop->volume = (afs_uint32) strtoul(ti->data, NULL, 10);
+    } else {
+        fprintf(stderr, "required argument -volumeid not given\n");
+        code = -1;
+    }
+
+    if ((ti = as->parms[COMMON_VOLOP_PARMS_OFFSET+1].items)) { /* -partition */
+        strlcpy(state->vop->partName, ti->data, sizeof(state->vop->partName));
+    }
+    /* otherwise partName is already all zeroes from calloc() */
+
+    return code;
+}
+
+/*
+ * Send one volume operation to the fileserver and report the outcome on
+ * stderr, then disconnect with VDisconnectFS().
+ *
+ * state    volume / partition / reason to send
+ * command  FSYNC opcode
+ * res      where to store the reply; when NULL a local response backed by
+ *          a local protocol buffer is used
+ *
+ * Returns the FSYNC_VolOp() result code.
+ */
+static int
+do_volop(struct state * state, afs_int32 command, SYNC_response * res)
+{
+    afs_int32 code;
+    SYNC_PROTO_BUF_DECL(res_buf);
+    SYNC_response res_l;
+
+    if (!res) {
+        memset(&res_l, 0, sizeof(res_l));
+        res = &res_l;
+        res->payload.len = SYNC_PROTO_MAX_LEN;
+        res->payload.buf = res_buf;
+    }
+
+    /*
+     * Nothing visible here guarantees that the header is filled in when
+     * the request fails before a reply arrives, so seed the fields that
+     * are printed below rather than reading indeterminate memory.
+     */
+    res->hdr.response = SYNC_COM_ERROR;
+    res->hdr.reason = SYNC_REASON_NONE;
+
+    fprintf(stderr, "calling FSYNC_VolOp with command code %d (%s)\n",
+            command, command_code_to_string(command));
+
+    code = FSYNC_VolOp(state->vop->volume,
+                       state->vop->partName,
+                       command,
+                       state->reason,
+                       res);
+
+    switch (code) {
+    case SYNC_OK:
+    case SYNC_DENIED:
+        break;
+    default:
+        fprintf(stderr, "possible sync protocol error. return code was %d\n", code);
+    }
+
+    fprintf(stderr, "FSYNC_VolOp returned %d (%s)\n", code, response_code_to_string(code));
+    fprintf(stderr, "protocol response code was %d (%s)\n",
+            res->hdr.response, response_code_to_string(res->hdr.response));
+    fprintf(stderr, "protocol reason code was %d (%s)\n",
+            res->hdr.reason, reason_code_to_string(res->hdr.reason));
+
+    VDisconnectFS();
+
+    /* the function is declared int; falling off the end left the result undefined */
+    return code;
+}
+
+/*
+ * Map a SYNC response code to its symbolic name; codes not listed yield
+ * "**UNKNOWN**".  The returned string is a literal and must not be freed.
+ */
+static char *
+response_code_to_string(afs_int32 response)
+{
+    static const struct {
+        afs_int32 code;
+        char *name;
+    } names[] = {
+        {SYNC_OK, "SYNC_OK"},
+        {SYNC_DENIED, "SYNC_DENIED"},
+        {SYNC_COM_ERROR, "SYNC_COM_ERROR"},
+        {SYNC_BAD_COMMAND, "SYNC_BAD_COMMAND"},
+        {SYNC_FAILED, "SYNC_FAILED"},
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
+        if (names[i].code == response) {
+            return names[i].name;
+        }
+    }
+    return "**UNKNOWN**";
+}
+
+/*
+ * Map a SYNC/FSYNC command code to its symbolic name; codes not listed
+ * yield "**UNKNOWN**".  The returned string is a literal.
+ */
+static char *
+command_code_to_string(afs_int32 command)
+{
+    static const struct {
+        afs_int32 code;
+        char *name;
+    } names[] = {
+        {SYNC_COM_CHANNEL_CLOSE, "SYNC_COM_CHANNEL_CLOSE"},
+        {FSYNC_VOL_ON, "FSYNC_VOL_ON"},
+        {FSYNC_VOL_OFF, "FSYNC_VOL_OFF"},
+        {FSYNC_VOL_LISTVOLUMES, "FSYNC_VOL_LISTVOLUMES"},
+        {FSYNC_VOL_NEEDVOLUME, "FSYNC_VOL_NEEDVOLUME"},
+        {FSYNC_VOL_MOVE, "FSYNC_VOL_MOVE"},
+        {FSYNC_VOL_BREAKCBKS, "FSYNC_VOL_BREAKCBKS"},
+        {FSYNC_VOL_DONE, "FSYNC_VOL_DONE"},
+        {FSYNC_VOL_QUERY, "FSYNC_VOL_QUERY"},
+        {FSYNC_VOL_QUERY_HDR, "FSYNC_VOL_QUERY_HDR"},
+        {FSYNC_VOL_QUERY_VOP, "FSYNC_VOL_QUERY_VOP"},
+        {FSYNC_VOL_STATS_GENERAL, "FSYNC_VOL_STATS_GENERAL"},
+        {FSYNC_VOL_STATS_VICEP, "FSYNC_VOL_STATS_VICEP"},
+        {FSYNC_VOL_STATS_HASH, "FSYNC_VOL_STATS_HASH"},
+        {FSYNC_VOL_STATS_HDR, "FSYNC_VOL_STATS_HDR"},
+        {FSYNC_VOL_STATS_VLRU, "FSYNC_VOL_STATS_VLRU"},
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
+        if (names[i].code == command) {
+            return names[i].name;
+        }
+    }
+    return "**UNKNOWN**";
+}
+
+/*
+ * Map a SYNC/FSYNC reason code to its symbolic name; codes not listed
+ * yield "**UNKNOWN**".  The returned string is a literal.
+ */
+static char *
+reason_code_to_string(afs_int32 reason)
+{
+    static const struct {
+        afs_int32 code;
+        char *name;
+    } names[] = {
+        {SYNC_REASON_NONE, "SYNC_REASON_NONE"},
+        {SYNC_REASON_MALFORMED_PACKET, "SYNC_REASON_MALFORMED_PACKET"},
+        {FSYNC_WHATEVER, "FSYNC_WHATEVER"},
+        {FSYNC_SALVAGE, "FSYNC_SALVAGE"},
+        {FSYNC_MOVE, "FSYNC_MOVE"},
+        {FSYNC_OPERATOR, "FSYNC_OPERATOR"},
+        {FSYNC_EXCLUSIVE, "FSYNC_EXCLUSIVE"},
+        {FSYNC_UNKNOWN_VOLID, "FSYNC_UNKNOWN_VOLID"},
+        {FSYNC_HDR_NOT_ATTACHED, "FSYNC_HDR_NOT_ATTACHED"},
+        {FSYNC_NO_PENDING_VOL_OP, "FSYNC_NO_PENDING_VOL_OP"},
+        {FSYNC_VOL_PKG_ERROR, "FSYNC_VOL_PKG_ERROR"},
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
+        if (names[i].code == reason) {
+            return names[i].name;
+        }
+    }
+    return "**UNKNOWN**";
+}
+
+/*
+ * Map a program type (as carried in a command header) to its symbolic
+ * name; values not listed yield "**UNKNOWN**".
+ */
+static char *
+program_type_to_string(afs_int32 type)
+{
+    char *name = "**UNKNOWN**";
+
+    switch ((ProgramType)type) {
+    case fileServer:
+        name = "fileServer";
+        break;
+    case volumeUtility:
+        name = "volumeUtility";
+        break;
+    case salvager:
+        name = "salvager";
+        break;
+    case salvageServer:
+        name = "salvageServer";
+        break;
+    case debugUtility:
+        name = "debugUtility";
+        break;
+    default:
+        break;
+    }
+
+    return name;
+}
+
+/*
+ * Shared body of the subcommands that just send one FSYNC opcode for a
+ * volume and print the generic outcome.
+ *
+ * The state struct is given defined values before the prologs run (the
+ * reason would otherwise be indeterminate when -reason is omitted), and
+ * the volop_state allocated by common_volop_prolog() is released before
+ * returning.  Always returns 0, like the handlers it replaces.
+ */
+static int
+run_simple_volop(struct cmd_syndesc * as, afs_int32 command)
+{
+    struct state state;
+
+    state.reason = FSYNC_WHATEVER;
+    state.vop = NULL;
+
+    common_prolog(as, &state);
+    common_volop_prolog(as, &state);
+
+    do_volop(&state, command, NULL);
+
+    free(state.vop);
+    return 0;
+}
+
+/* "online": bring a volume online (FSYNC_VOL_ON) */
+static int
+VolOnline(struct cmd_syndesc * as, char * rock)
+{
+    return run_simple_volop(as, FSYNC_VOL_ON);
+}
+
+/* "offline": take a volume offline (FSYNC_VOL_OFF) */
+static int
+VolOffline(struct cmd_syndesc * as, char * rock)
+{
+    return run_simple_volop(as, FSYNC_VOL_OFF);
+}
+
+/* "mode": change volume attach mode (FSYNC_VOL_NEEDVOLUME) */
+static int
+VolMode(struct cmd_syndesc * as, char * rock)
+{
+    return run_simple_volop(as, FSYNC_VOL_NEEDVOLUME);
+}
+
+/* "detach": detach a volume (FSYNC_VOL_DONE) */
+static int
+VolDetach(struct cmd_syndesc * as, char * rock)
+{
+    return run_simple_volop(as, FSYNC_VOL_DONE);
+}
+
+/* "callback": break callbacks for a volume (FSYNC_VOL_BREAKCBKS) */
+static int
+VolBreakCBKs(struct cmd_syndesc * as, char * rock)
+{
+    return run_simple_volop(as, FSYNC_VOL_BREAKCBKS);
+}
+
+/* "move": set the volume moved flag (FSYNC_VOL_MOVE) */
+static int
+VolMove(struct cmd_syndesc * as, char * rock)
+{
+    return run_simple_volop(as, FSYNC_VOL_MOVE);
+}
+
+/* "list": sync the local volume list (FSYNC_VOL_LISTVOLUMES) */
+static int
+VolList(struct cmd_syndesc * as, char * rock)
+{
+    return run_simple_volop(as, FSYNC_VOL_LISTVOLUMES);
+}
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * Map a demand-attach volume state to its symbolic name; states not
+ * listed yield "**UNKNOWN**".  The returned string is a literal.
+ */
+static char *
+vol_state_to_string(VolState state)
+{
+    static const struct {
+        VolState state;
+        char *name;
+    } names[] = {
+        {VOL_STATE_UNATTACHED, "VOL_STATE_UNATTACHED"},
+        {VOL_STATE_PREATTACHED, "VOL_STATE_PREATTACHED"},
+        {VOL_STATE_ATTACHING, "VOL_STATE_ATTACHING"},
+        {VOL_STATE_ATTACHED, "VOL_STATE_ATTACHED"},
+        {VOL_STATE_UPDATING, "VOL_STATE_UPDATING"},
+        {VOL_STATE_GET_BITMAP, "VOL_STATE_GET_BITMAP"},
+        {VOL_STATE_HDR_LOADING, "VOL_STATE_HDR_LOADING"},
+        {VOL_STATE_HDR_ATTACHING, "VOL_STATE_HDR_ATTACHING"},
+        {VOL_STATE_SHUTTING_DOWN, "VOL_STATE_SHUTTING_DOWN"},
+        {VOL_STATE_GOING_OFFLINE, "VOL_STATE_GOING_OFFLINE"},
+        {VOL_STATE_OFFLINING, "VOL_STATE_OFFLINING"},
+        {VOL_STATE_DETACHING, "VOL_STATE_DETACHING"},
+        {VOL_STATE_SALVSYNC_REQ, "VOL_STATE_SALVSYNC_REQ"},
+        {VOL_STATE_SALVAGING, "VOL_STATE_SALVAGING"},
+        {VOL_STATE_ERROR, "VOL_STATE_ERROR"},
+        {VOL_STATE_FREED, "VOL_STATE_FREED"},
+    };
+    size_t i;
+
+    for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
+        if (names[i].state == state) {
+            return names[i].name;
+        }
+    }
+    return "**UNKNOWN**";
+}
+
+/*
+ * Render the attach flag bits as "NAME | NAME | ..." in flag-table order.
+ *
+ * Returns a pointer to a static buffer that is overwritten by the next
+ * call; an empty string means none of the known bits is set.
+ */
+static char *
+vol_flags_to_string(afs_uint16 flags)
+{
+    static const struct {
+        afs_uint32 bit;
+        char *name;
+    } flag_names[] = {
+        {VOL_HDR_ATTACHED, "VOL_HDR_ATTACHED"},
+        {VOL_HDR_LOADED, "VOL_HDR_LOADED"},
+        {VOL_HDR_IN_LRU, "VOL_HDR_IN_LRU"},
+        {VOL_IN_HASH, "VOL_IN_HASH"},
+        {VOL_ON_VBYP_LIST, "VOL_ON_VBYP_LIST"},
+        {VOL_IS_BUSY, "VOL_IS_BUSY"},
+        {VOL_ON_VLRU, "VOL_ON_VLRU"},
+        {VOL_HDR_DONTSALV, "VOL_HDR_DONTSALV"},
+    };
+    /*
+     * All eight names plus seven " | " separators need 130 characters,
+     * which did not fit the previous 128-byte buffer.
+     */
+    static char str[256];
+    size_t i;
+    int count = 0;
+
+    str[0] = '\0';
+
+    for (i = 0; i < sizeof(flag_names) / sizeof(flag_names[0]); i++) {
+        if (!(flags & flag_names[i].bit)) {
+            continue;
+        }
+        if (count) {
+            strlcat(str, " | ", sizeof(str));
+        }
+        strlcat(str, flag_names[i].name, sizeof(str));
+        /*
+         * Counted for every flag: the old code skipped this after
+         * VOL_ON_VLRU, so "VOL_ON_VLRU" and "VOL_HDR_DONTSALV" ran
+         * together when they were the only flags set.
+         */
+        count++;
+    }
+
+    return str;
+}
+
+/*
+ * Map a VLRU queue index to its symbolic name; indexes not listed yield
+ * "**UNKNOWN**".  The returned string is a literal.
+ */
+static char *
+vlru_idx_to_string(int idx)
+{
+    char *name = "**UNKNOWN**";
+
+    switch (idx) {
+    case VLRU_QUEUE_NEW:
+        name = "VLRU_QUEUE_NEW";
+        break;
+    case VLRU_QUEUE_MID:
+        name = "VLRU_QUEUE_MID";
+        break;
+    case VLRU_QUEUE_OLD:
+        name = "VLRU_QUEUE_OLD";
+        break;
+    case VLRU_QUEUE_CANDIDATE:
+        name = "VLRU_QUEUE_CANDIDATE";
+        break;
+    case VLRU_QUEUE_HELD:
+        name = "VLRU_QUEUE_HELD";
+        break;
+    case VLRU_QUEUE_INVALID:
+        name = "VLRU_QUEUE_INVALID";
+        break;
+    default:
+        break;
+    }
+
+    return name;
+}
+#endif
+
+/*
+ * "query" subcommand: sends FSYNC_VOL_QUERY and, when the response code
+ * is SYNC_OK, copies a Volume structure out of the reply payload and
+ * dumps it on stdout.  Always returns 0.
+ *
+ * Handle/pointer members are printed with %p; they are addresses in the
+ * server's address space and are only meaningful as opaque values.
+ */
+static int
+VolQuery(struct cmd_syndesc * as, char * rock)
+{
+    struct state state;
+    SYNC_PROTO_BUF_DECL(res_buf);
+    SYNC_response res;
+    Volume v;
+    unsigned int hi, lo;
+
+    /*
+     * Seed the header: nothing visible here guarantees it is filled in
+     * when the request fails before a reply arrives, and it is read
+     * below.  SYNC_COM_ERROR keeps the dump from running in that case.
+     */
+    memset(&res, 0, sizeof(res));
+    res.hdr.response = SYNC_COM_ERROR;
+    res.hdr.response_len = sizeof(res.hdr);
+    res.payload.buf = res_buf;
+    res.payload.len = SYNC_PROTO_MAX_LEN;
+
+    common_prolog(as, &state);
+    common_volop_prolog(as, &state);
+
+    do_volop(&state, FSYNC_VOL_QUERY, &res);
+
+    if (res.hdr.response == SYNC_OK) {
+        memcpy(&v, res.payload.buf, sizeof(Volume));
+
+        printf("volume = {\n");
+        printf("\thashid          = %u\n", v.hashid);
+        printf("\theader          = %p\n", (void *) v.header);
+        printf("\tdevice          = %d\n", v.device);
+        printf("\tpartition       = %p\n", (void *) v.partition);
+        printf("\tlinkHandle      = %p\n", (void *) v.linkHandle);
+        printf("\tnextVnodeUnique = %u\n", v.nextVnodeUnique);
+        printf("\tdiskDataHandle  = %p\n", (void *) v.diskDataHandle);
+        printf("\tvnodeHashOffset = %u\n", v.vnodeHashOffset);
+        printf("\tshuttingDown    = %d\n", v.shuttingDown);
+        printf("\tgoingOffline    = %d\n", v.goingOffline);
+        printf("\tcacheCheck      = %u\n", v.cacheCheck);
+        printf("\tnUsers          = %d\n", v.nUsers);
+        printf("\tneedsPutBack    = %d\n", v.needsPutBack);
+        printf("\tspecialStatus   = %d\n", v.specialStatus);
+        printf("\tupdateTime      = %u\n", v.updateTime);
+
+        printf("\tvnodeIndex[vSmall] = {\n");
+        printf("\t\thandle       = %p\n", (void *) v.vnodeIndex[vSmall].handle);
+        printf("\t\tbitmap       = %p\n", (void *) v.vnodeIndex[vSmall].bitmap);
+        printf("\t\tbitmapSize   = %u\n", v.vnodeIndex[vSmall].bitmapSize);
+        printf("\t\tbitmapOffset = %u\n", v.vnodeIndex[vSmall].bitmapOffset);
+        printf("\t}\n");
+        printf("\tvnodeIndex[vLarge] = {\n");
+        printf("\t\thandle       = %p\n", (void *) v.vnodeIndex[vLarge].handle);
+        printf("\t\tbitmap       = %p\n", (void *) v.vnodeIndex[vLarge].bitmap);
+        printf("\t\tbitmapSize   = %u\n", v.vnodeIndex[vLarge].bitmapSize);
+        printf("\t\tbitmapOffset = %u\n", v.vnodeIndex[vLarge].bitmapOffset);
+        printf("\t}\n");
+#ifdef AFS_DEMAND_ATTACH_FS
+        if (res.hdr.flags & SYNC_FLAG_DAFS_EXTENSIONS) {
+            /* updateTime was already printed above; it is not repeated here */
+            printf("\tattach_state    = %s\n", vol_state_to_string(v.attach_state));
+            printf("\tattach_flags    = %s\n", vol_flags_to_string(v.attach_flags));
+            printf("\tnWaiters        = %d\n", v.nWaiters);
+            printf("\tchainCacheCheck = %d\n", v.chainCacheCheck);
+
+            /* online salvage structure */
+            printf("\tsalvage = {\n");
+            printf("\t\tprio      = %u\n", v.salvage.prio);
+            printf("\t\treason    = %d\n", v.salvage.reason);
+            printf("\t\trequested = %d\n", v.salvage.requested);
+            printf("\t\tscheduled = %d\n", v.salvage.scheduled);
+            printf("\t}\n");
+
+            /* statistics structure */
+            printf("\tstats = {\n");
+
+            printf("\t\thash_lookups = {\n");
+            SplitInt64(v.stats.hash_lookups,hi,lo);
+            printf("\t\t\thi = %u\n", hi);
+            printf("\t\t\tlo = %u\n", lo);
+            printf("\t\t}\n");
+
+            printf("\t\thash_short_circuits = {\n");
+            SplitInt64(v.stats.hash_short_circuits,hi,lo);
+            printf("\t\t\thi = %u\n", hi);
+            printf("\t\t\tlo = %u\n", lo);
+            printf("\t\t}\n");
+
+            printf("\t\thdr_loads = {\n");
+            SplitInt64(v.stats.hdr_loads,hi,lo);
+            printf("\t\t\thi = %u\n", hi);
+            printf("\t\t\tlo = %u\n", lo);
+            printf("\t\t}\n");
+
+            printf("\t\thdr_gets = {\n");
+            SplitInt64(v.stats.hdr_gets,hi,lo);
+            printf("\t\t\thi = %u\n", hi);
+            printf("\t\t\tlo = %u\n", lo);
+            printf("\t\t}\n");
+
+            printf("\t\tattaches         = %u\n", v.stats.attaches);
+            printf("\t\tsoft_detaches    = %u\n", v.stats.soft_detaches);
+            printf("\t\tsalvages         = %u\n", v.stats.salvages);
+            printf("\t\tvol_ops          = %u\n", v.stats.vol_ops);
+
+            printf("\t\tlast_attach      = %u\n", v.stats.last_attach);
+            printf("\t\tlast_get         = %u\n", v.stats.last_get);
+            printf("\t\tlast_promote     = %u\n", v.stats.last_promote);
+            printf("\t\tlast_hdr_get     = %u\n", v.stats.last_hdr_get);
+            printf("\t\tlast_salvage     = %u\n", v.stats.last_salvage);
+            printf("\t\tlast_salvage_req = %u\n", v.stats.last_salvage_req);
+            printf("\t\tlast_vol_op      = %u\n", v.stats.last_vol_op);
+            printf("\t}\n");
+
+            /* VLRU state */
+            printf("\tvlru = {\n");
+            printf("\t\tidx = %d (%s)\n",
+                   v.vlru.idx, vlru_idx_to_string(v.vlru.idx));
+            printf("\t}\n");
+
+            /* volume op state */
+            printf("\tpending_vol_op  = %p\n", (void *) v.pending_vol_op);
+        }
+#else /* !AFS_DEMAND_ATTACH_FS */
+        if (res.hdr.flags & SYNC_FLAG_DAFS_EXTENSIONS) {
+            printf("*** server asserted demand attach extensions. fssync-debug not built to\n");
+            printf("*** recognize those extensions. please recompile fssync-debug if you need\n");
+            printf("*** to dump dafs extended state\n");
+        }
+#endif /* !AFS_DEMAND_ATTACH_FS */
+        printf("}\n");
+    }
+
+    return 0;
+}
+
+/*
+ * "header" subcommand: sends FSYNC_VOL_QUERY_HDR and, when the response
+ * code is SYNC_OK, copies a VolumeDiskData structure out of the reply
+ * payload and dumps it on stdout.  Always returns 0.
+ */
+static int
+VolHdrQuery(struct cmd_syndesc * as, char * rock)
+{
+    struct state state;
+    SYNC_PROTO_BUF_DECL(res_buf);
+    SYNC_response res;
+    VolumeDiskData v;
+    int i;
+
+    /*
+     * Seed the header: nothing visible here guarantees it is filled in
+     * when the request fails before a reply arrives, and it is read
+     * below.  SYNC_COM_ERROR keeps the dump from running in that case.
+     */
+    memset(&res, 0, sizeof(res));
+    res.hdr.response = SYNC_COM_ERROR;
+    res.hdr.response_len = sizeof(res.hdr);
+    res.payload.buf = res_buf;
+    res.payload.len = SYNC_PROTO_MAX_LEN;
+
+    common_prolog(as, &state);
+    common_volop_prolog(as, &state);
+
+    do_volop(&state, FSYNC_VOL_QUERY_HDR, &res);
+
+    if (res.hdr.response == SYNC_OK) {
+        memcpy(&v, res.payload.buf, sizeof(VolumeDiskData));
+
+        printf("VolumeDiskData = {\n");
+        printf("\tstamp = {\n");
+        printf("\t\tmagic   = 0x%x\n", v.stamp.magic);
+        printf("\t\tversion = %u\n", v.stamp.version);
+        printf("\t}\n");
+
+        printf("\tid               = %u\n", v.id);
+        printf("\tname             = '%s'\n", v.name);
+        printf("\tinUse            = %d\n", v.inUse);
+        printf("\tinService        = %d\n", v.inService);
+        printf("\tblessed          = %d\n", v.blessed);
+        printf("\tneedsSalvaged    = %d\n", v.needsSalvaged);
+        printf("\tuniquifier       = %u\n", v.uniquifier);
+        printf("\ttype             = %d\n", v.type);
+        printf("\tparentId         = %u\n", v.parentId);
+        printf("\tcloneId          = %u\n", v.cloneId);
+        printf("\tbackupId         = %u\n", v.backupId);
+        printf("\trestoredFromId   = %u\n", v.restoredFromId);
+        printf("\tneedsCallback    = %d\n", v.needsCallback);
+        printf("\tdestroyMe        = %d\n", v.destroyMe);
+        printf("\tdontSalvage      = %d\n", v.dontSalvage);
+        printf("\tmaxquota         = %d\n", v.maxquota);
+        printf("\tminquota         = %d\n", v.minquota);
+        printf("\tmaxfiles         = %d\n", v.maxfiles);
+        printf("\taccountNumber    = %u\n", v.accountNumber);
+        printf("\towner            = %u\n", v.owner);
+        printf("\tfilecount        = %d\n", v.filecount);
+        printf("\tdiskused         = %d\n", v.diskused);
+        printf("\tdayUse           = %d\n", v.dayUse);
+        for (i = 0; i < 7; i++) {
+            printf("\tweekUse[%d]       = %d\n", i, v.weekUse[i]);
+        }
+        printf("\tdayUseDate       = %u\n", v.dayUseDate);
+        printf("\tcreationDate     = %u\n", v.creationDate);
+        printf("\taccessDate       = %u\n", v.accessDate);
+        printf("\tupdateDate       = %u\n", v.updateDate);
+        printf("\texpirationDate   = %u\n", v.expirationDate);
+        printf("\tbackupDate       = %u\n", v.backupDate);
+        printf("\tcopyDate         = %u\n", v.copyDate);
+#ifdef OPENAFS_VOL_STATS
+        printf("\tstat_initialized = %d\n", v.stat_initialized);
+#else
+        /* label corrected from "mtd": the member being printed is motd */
+        printf("\tmotd             = '%s'\n", v.motd);
+#endif
+        printf("}\n");
+    }
+
+    return 0;
+}
+
+/*
+ * "volop" subcommand: sends FSYNC_VOL_QUERY_VOP and, when the response
+ * code is SYNC_OK, copies an FSSYNC_VolOp_info structure out of the reply
+ * payload and dumps it on stdout.  A notice is printed whenever the
+ * response header lacks SYNC_FLAG_DAFS_EXTENSIONS.  Always returns 0.
+ */
+static int
+VolOpQuery(struct cmd_syndesc * as, char * rock)
+{
+    struct state state;
+    SYNC_PROTO_BUF_DECL(res_buf);
+    SYNC_response res;
+    FSSYNC_VolOp_info vop;
+
+    /*
+     * Seed the header: nothing visible here guarantees it is filled in
+     * when the request fails before a reply arrives, and both the flags
+     * and the response code are read below.
+     */
+    memset(&res, 0, sizeof(res));
+    res.hdr.response = SYNC_COM_ERROR;
+    res.hdr.response_len = sizeof(res.hdr);
+    res.payload.buf = res_buf;
+    res.payload.len = SYNC_PROTO_MAX_LEN;
+
+    common_prolog(as, &state);
+    common_volop_prolog(as, &state);
+
+    do_volop(&state, FSYNC_VOL_QUERY_VOP, &res);
+
+    if (!(res.hdr.flags & SYNC_FLAG_DAFS_EXTENSIONS)) {
+        printf("*** file server not compiled with demand attach extensions.\n");
+        printf("*** pending volume operation metadata not available.\n");
+    }
+
+    if (res.hdr.response == SYNC_OK) {
+        memcpy(&vop, res.payload.buf, sizeof(FSSYNC_VolOp_info));
+
+        printf("pending_vol_op = {\n");
+
+        printf("\tcom = {\n");
+        printf("\t\tproto_version  = %u\n", vop.com.proto_version);
+        printf("\t\tprogramType    = %d (%s)\n",
+               vop.com.programType, program_type_to_string(vop.com.programType));
+        printf("\t\tcommand        = %d (%s)\n",
+               vop.com.command, command_code_to_string(vop.com.command));
+        printf("\t\treason         = %d (%s)\n",
+               vop.com.reason, reason_code_to_string(vop.com.reason));
+        printf("\t\tcommand_len    = %u\n", vop.com.command_len);
+        printf("\t\tflags          = 0x%x\n", vop.com.flags);
+        printf("\t}\n");
+
+        printf("\tvop = {\n");
+        printf("\t\tvolume         = %u\n", vop.vop.volume);
+        /* only print partName when a terminator lies inside the array */
+        if (afs_strnlen(vop.vop.partName, sizeof(vop.vop.partName)) <
+            sizeof(vop.vop.partName)) {
+            printf("\t\tpartName       = '%s'\n", vop.vop.partName);
+        } else {
+            printf("\t\tpartName       = (illegal string)\n");
+        }
+        printf("\t}\n");
+
+        printf("}\n");
+    }
+
+    return 0;
+}
+
+static int
+StatsQuery(struct cmd_syndesc * as, char * rock)
+{
+    afs_int32 code;
+    int command;
+    struct cmd_item *ti;
+    struct state state;
+    SYNC_PROTO_BUF_DECL(res_buf);
+    SYNC_response res;
+    FSSYNC_StatsOp_hdr scom;
+    union {
+       void * ptr;
+       struct VolPkgStats * vol_stats;
+       struct VolumeHashChainStats * hash_stats;
+#ifdef AFS_DEMAND_ATTACH_FS
+       struct volume_hdr_LRU_stats * hdr_stats;
+#endif
+       struct DiskPartitionStats * vicep_stats;
+    } sres;
+
+    sres.ptr = res_buf;
+    res.hdr.response_len = sizeof(res.hdr);
+    res.payload.buf = res_buf;
+    res.payload.len = SYNC_PROTO_MAX_LEN;
+
+    if ((ti = as->parms[CUSTOM_PARMS_OFFSET].items)) { /* -subcommand */
+       if (!strcasecmp(ti->data, "vicep")) {
+           command = FSYNC_VOL_STATS_VICEP;
+       } else if (!strcasecmp(ti->data, "hash")) {
+           command = FSYNC_VOL_STATS_HASH;
+#ifdef AFS_DEMAND_ATTACH_FS
+       } else if (!strcasecmp(ti->data, "hdr")) {
+           command = FSYNC_VOL_STATS_HDR;
+       } else if (!strcasecmp(ti->data, "vlru")) {
+           command = FSYNC_VOL_STATS_VLRU;
+#endif
+       } else if (!strcasecmp(ti->data, "pkg")) {
+           command = FSYNC_VOL_STATS_GENERAL;
+       } else if (!strcasecmp(ti->data, "help")) {
+           fprintf(stderr, "fssync-debug stats subcommands:\n");
+           fprintf(stderr, "\tpkg\tgeneral volume package stats\n");
+           fprintf(stderr, "\tvicep\tvice partition stats\n");
+           fprintf(stderr, "\thash\tvolume hash chain stats\n");
+#ifdef AFS_DEMAND_ATTACH_FS
+           fprintf(stderr, "\thdr\tvolume header cache stats\n");
+           fprintf(stderr, "\tvlru\tvlru generation stats\n");
+#endif
+           exit(0);
+       } else {
+           fprintf(stderr, "invalid stats subcommand");
+           exit(1);
+       }
+    } else {
+       command = FSYNC_VOL_STATS_GENERAL;
+    }
+
+    if ((ti = as->parms[CUSTOM_PARMS_OFFSET+1].items)) {       /* -arg1 */
+       switch (command) {
+       case FSYNC_VOL_STATS_VICEP:
+           strlcpy(scom.args.partName, ti->data, sizeof(state.vop->partName));
+           break;
+       case FSYNC_VOL_STATS_HASH:
+           scom.args.hash_bucket = atoi(ti->data);
+           break;
+       case FSYNC_VOL_STATS_VLRU:
+           scom.args.vlru_generation = atoi(ti->data);
+           break;
+       default:
+           fprintf(stderr, "unrecognized arguments\n");
+           exit(1);
+       }
+    } else {
+       switch (command) {
+       case FSYNC_VOL_STATS_VICEP:
+       case FSYNC_VOL_STATS_HASH:
+       case FSYNC_VOL_STATS_VLRU:
+           fprintf(stderr, "this subcommand requires more parameters\n");
+           exit(1);
+       }
+    }
+
+    common_prolog(as, &state);
+
+    fprintf(stderr, "calling FSYNC_askfs with command code %d (%s)\n", 
+           command, command_code_to_string(command));
+
+    code = FSYNC_StatsOp(&scom, command, FSYNC_WHATEVER, &res);
+
+    switch (code) {
+    case SYNC_OK:
+    case SYNC_DENIED:
+       break;
+    default:
+       fprintf(stderr, "possible sync protocol error. return code was %d\n", code);
+    }
+
+    fprintf(stderr, "FSYNC_VolOp returned %d (%s)\n", code, response_code_to_string(code));
+    fprintf(stderr, "protocol response code was %d (%s)\n", 
+           res.hdr.response, response_code_to_string(res.hdr.response));
+    fprintf(stderr, "protocol reason code was %d (%s)\n", 
+           res.hdr.reason, reason_code_to_string(res.hdr.reason));
+
+    VDisconnectFS();
+
+    if (res.hdr.response == SYNC_OK) {
+       switch (command) {
+       case FSYNC_VOL_STATS_GENERAL:
+           print_vol_stats_general(sres.vol_stats);
+           break;
+       case FSYNC_VOL_STATS_VICEP:
+           print_vol_stats_viceP(sres.vicep_stats);
+           break;
+       case FSYNC_VOL_STATS_HASH:
+           print_vol_stats_hash(sres.hash_stats);
+           break;
+#ifdef AFS_DEMAND_ATTACH_FS
+       case FSYNC_VOL_STATS_HDR:
+           print_vol_stats_hdr(sres.hdr_stats);
+           break;
+#endif /* AFS_DEMAND_ATTACH_FS */
+       }
+    }
+
+    return 0;
+}
+
+static void
+print_vol_stats_general(VolPkgStats * stats)
+{
+    int i;
+    afs_uint32 hi, lo;
+
+    printf("VolPkgStats = {\n");
+#ifdef AFS_DEMAND_ATTACH_FS
+    for (i = 0; i < VOL_STATE_COUNT; i++) {
+       printf("\tvol_state_count[%s] = %d\n", 
+              vol_state_to_string(i),
+              stats->state_levels[i]);
+    }
+
+    SplitInt64(stats->hash_looks, hi, lo);
+    printf("\thash_looks = {\n");
+    printf("\t\thi = %u\n", hi);
+    printf("\t\tlo = %u\n", lo);
+    printf("\t}\n");
+
+    SplitInt64(stats->hash_reorders, hi, lo);
+    printf("\thash_reorders = {\n");
+    printf("\t\thi = %u\n", hi);
+    printf("\t\tlo = %u\n", lo);
+    printf("\t}\n");
+
+    SplitInt64(stats->salvages, hi, lo);
+    printf("\tsalvages = {\n");
+    printf("\t\thi = %u\n", hi);
+    printf("\t\tlo = %u\n", lo);
+    printf("\t}\n");
+
+    SplitInt64(stats->vol_ops, hi, lo);
+    printf("\tvol_ops = {\n");
+    printf("\t\thi = %u\n", hi);
+    printf("\t\tlo = %u\n", lo);
+    printf("\t}\n");
+#endif
+    SplitInt64(stats->hdr_loads, hi, lo);
+    printf("\thdr_loads = {\n");
+    printf("\t\thi = %u\n", hi);
+    printf("\t\tlo = %u\n", lo);
+    printf("\t}\n");
+
+    SplitInt64(stats->hdr_gets, hi, lo);
+    printf("\thdr_gets = {\n");
+    printf("\t\thi = %u\n", hi);
+    printf("\t\tlo = %u\n", lo);
+    printf("\t}\n");
+
+    SplitInt64(stats->attaches, hi, lo);
+    printf("\tattaches = {\n");
+    printf("\t\thi = %u\n", hi);
+    printf("\t\tlo = %u\n", lo);
+    printf("\t}\n");
+
+    SplitInt64(stats->soft_detaches, hi, lo);
+    printf("\tsoft_detaches = {\n");
+    printf("\t\thi = %u\n", hi);
+    printf("\t\tlo = %u\n", lo);
+    printf("\t}\n");
+
+    printf("\thdr_cache_size = %d\n", stats->hdr_cache_size);
+           
+    printf("}\n");
+}
+
+/*
+ * Dump a DiskPartitionStats structure to stdout; vol_list_len is
+ * included only when built with AFS_DEMAND_ATTACH_FS.
+ */
+static void
+print_vol_stats_viceP(struct DiskPartitionStats * stats)
+{
+    fputs("DiskPartitionStats = {\n", stdout);
+
+    /* the four fields that are printed in every build */
+    printf("\tfree = %d\n"
+          "\tminFree = %d\n"
+          "\ttotalUsable = %d\n"
+          "\tf_files = %d\n",
+          stats->free,
+          stats->minFree,
+          stats->totalUsable,
+          stats->f_files);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    printf("\tvol_list_len = %d\n", stats->vol_list_len);
+#endif
+
+    fputs("}\n", stdout);
+}
+
+/*
+ * Dump a VolumeHashChainStats structure to stdout.  The cache-check,
+ * busy flag and 64-bit chain counters are printed only when built with
+ * AFS_DEMAND_ATTACH_FS.
+ */
+static void
+print_vol_stats_hash(struct VolumeHashChainStats * stats)
+{
+    afs_uint32 hi, lo;
+
+    /* label the dump with the structure actually being printed */
+    printf("VolumeHashChainStats = {\n");
+    printf("\ttable_size = %d\n", stats->table_size);
+    printf("\tchain_len = %d\n", stats->chain_len);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    printf("\tchain_cacheCheck = %d\n", stats->chain_cacheCheck);
+    printf("\tchain_busy = %d\n", stats->chain_busy);
+
+    /* 64-bit counters are shown as separate 32-bit halves */
+    SplitInt64(stats->chain_looks, hi, lo);
+    printf("\tchain_looks = {\n");
+    printf("\t\thi = %u\n", hi);
+    printf("\t\tlo = %u\n", lo);
+    printf("\t}\n");
+
+    SplitInt64(stats->chain_gets, hi, lo);
+    printf("\tchain_gets = {\n");
+    printf("\t\thi = %u\n", hi);
+    printf("\t\tlo = %u\n", lo);
+    printf("\t}\n");
+
+    SplitInt64(stats->chain_reorders, hi, lo);
+    printf("\tchain_reorders = {\n");
+    printf("\t\thi = %u\n", hi);
+    printf("\t\tlo = %u\n", lo);
+    printf("\t}\n");
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    printf("}\n");
+}
+
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * Dump a volume_hdr_LRU_stats structure (free / used / attached counts)
+ * to stdout.
+ */
+static void
+print_vol_stats_hdr(struct volume_hdr_LRU_stats * stats)
+{
+    fputs("volume_hdr_LRU_stats = {\n", stdout);
+    printf("\tfree = %d\n"
+          "\tused = %d\n"
+          "\tattached = %d\n",
+          stats->free,
+          stats->used,
+          stats->attached);
+    fputs("}\n", stdout);
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
diff --git a/src/vol/fssync-server.c b/src/vol/fssync-server.c

new file mode 100644 (file)

index 0000000..44494ca
--- /dev/null
+++ b/src/vol/fssync-server.c
@@ -0,0 +1,1179 @@
+/*
+ * Copyright 2000, International Business Machines Corporation and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
+ */
+
+/*
+       System:         VICE-TWO
+       Module:         fssync.c
+       Institution:    The Information Technology Center, Carnegie-Mellon University
+
+ */
+#ifdef notdef
+
+/* All this is going away in early 1989 */
+int newVLDB;                   /* Compatibility flag */
+
+#endif
+static int newVLDB = 1;
+
+
+#ifndef AFS_PTHREAD_ENV
+#define USUAL_PRIORITY (LWP_MAX_PRIORITY - 2)
+
+/*
+ * stack size increased from 8K because the HP machine seemed to have trouble
+ * with the smaller stack
+ */
+#define USUAL_STACK_SIZE       (24 * 1024)
+#endif /* !AFS_PTHREAD_ENV */
+
+/*
+   fssync-server.c
+   File server synchronization with external volume utilities.
+   server-side implementation
+ */
+
+/* This controls the size of an fd_set; it must be defined early before
+ * the system headers define that type and the macros that operate on it.
+ * Its value should be as large as the maximum file descriptor limit we
+ * are likely to run into on any platform.  Right now, that is 65536
+ * which is the default hard fd limit on Solaris 9 */
+#ifndef _WIN32
+#define FD_SETSIZE 65536
+#endif
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <sys/types.h>
+#include <stdio.h>
+#ifdef AFS_NT40_ENV
+#include <winsock2.h>
+#include <time.h>
+#else
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <sys/time.h>
+#endif
+#include <errno.h>
+#ifdef AFS_PTHREAD_ENV
+#include <assert.h>
+#else /* AFS_PTHREAD_ENV */
+#include <afs/assert.h>
+#endif /* AFS_PTHREAD_ENV */
+#include <signal.h>
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#else
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#endif
+
+
+#include <rx/xdr.h>
+#include <afs/afsint.h>
+#include "nfs.h"
+#include <afs/errors.h>
+#include "daemon_com.h"
+#include "fssync.h"
+#include "lwp.h"
+#include "lock.h"
+#include <afs/afssyscalls.h>
+#include "ihandle.h"
+#include "vnode.h"
+#include "volume.h"
+#include "partition.h"
+
+
+#ifdef FSSYNC_BUILD_SERVER
+
+/*@printflike@*/ extern void Log(const char *format, ...);
+
+#ifdef osi_Assert
+#undef osi_Assert
+#endif
+#define osi_Assert(e) (void)(e)
+
+int (*V_BreakVolumeCallbacks) ();      /* optional hook; when non-NULL it is
+                                        * called with a volume id by the
+                                        * move and break-callbacks handlers,
+                                        * with VOL_LOCK dropped */
+
+#define MAXHANDLERS    4       /* Up to 4 clients; must be at least 2, so that
+                                * move = dump+restore can run on single server */
+#define MAXOFFLINEVOLUMES 128  /* This needs to be as big as the maximum
+                                * number that would be offline for 1 operation.
+                                * Current winner is salvage, which needs all
+                                * cloned read-only copies offline when salvaging
+                                * a single read-write volume */
+
+#define MAX_BIND_TRIES 5       /* Maximum number of bind() attempts made by
+                                * FSYNC_sync before it gives up */
+
+
+
+static struct offlineInfo OfflineVolumes[MAXHANDLERS][MAXOFFLINEVOLUMES];      /* one
+                                * row per handler, selected with FindHandler(fd);
+                                * a slot whose volumeID is 0 is free */
+
+static int AcceptSd = -1;      /* Socket used by server for accepting connections */
+
+static int getport();
+
+/* Forward declarations */
+static void FSYNC_sync();
+static void FSYNC_newconnection();
+static void FSYNC_com();
+static void FSYNC_Drop();
+static void AcceptOn();
+static void AcceptOff();
+static void InitHandler();
+static void CallHandler(fd_set * fdsetp);
+static int AddHandler();
+static int FindHandler();
+static int FindHandler_r();
+static int RemoveHandler();
+static void GetHandler(fd_set * fdsetp, int *maxfdp);
+
+extern int LogLevel;
+
+static afs_int32 FSYNC_com_VolOp(int fd, SYNC_command * com, SYNC_response * res);
+
+static afs_int32 FSYNC_com_VolOn(FSSYNC_VolOp_command * com, SYNC_response * res);
+static afs_int32 FSYNC_com_VolOff(FSSYNC_VolOp_command * com, SYNC_response * res);
+static afs_int32 FSYNC_com_VolMove(FSSYNC_VolOp_command * com, SYNC_response * res);
+static afs_int32 FSYNC_com_VolBreakCBKs(FSSYNC_VolOp_command * com, SYNC_response * res);
+static afs_int32 FSYNC_com_VolDone(FSSYNC_VolOp_command * com, SYNC_response * res);
+static afs_int32 FSYNC_com_VolQuery(FSSYNC_VolOp_command * com, SYNC_response * res);
+static afs_int32 FSYNC_com_VolHdrQuery(FSSYNC_VolOp_command * com, SYNC_response * res);
+#ifdef AFS_DEMAND_ATTACH_FS
+static afs_int32 FSYNC_com_VolOpQuery(FSSYNC_VolOp_command * com, SYNC_response * res);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+static afs_int32 FSYNC_com_StatsOp(int fd, SYNC_command * com, SYNC_response * res);
+
+static afs_int32 FSYNC_com_StatsOpGeneral(FSSYNC_StatsOp_command * scom, SYNC_response * res);
+static afs_int32 FSYNC_com_StatsOpViceP(FSSYNC_StatsOp_command * scom, SYNC_response * res);
+static afs_int32 FSYNC_com_StatsOpHash(FSSYNC_StatsOp_command * scom, SYNC_response * res);
+static afs_int32 FSYNC_com_StatsOpHdr(FSSYNC_StatsOp_command * scom, SYNC_response * res);
+static afs_int32 FSYNC_com_StatsOpVLRU(FSSYNC_StatsOp_command * scom, SYNC_response * res);
+
+
+static void FSYNC_com_to_info(FSSYNC_VolOp_command * vcom, FSSYNC_VolOp_info * info);
+
+
+/*
+ * This lock controls access to the handler array. The overhead
+ * is minimal in non-preemptive environments.
+ */
+struct Lock FSYNC_handler_lock;
+
+/*
+ * Initialize the FSSYNC server: set up the handler-table lock and start
+ * FSYNC_sync as a detached pthread (AFS_PTHREAD_ENV) or as an LWP.
+ *
+ * The creation calls are made outside of assert() so that they still run
+ * when assertions are compiled out (NDEBUG with <assert.h>); only the
+ * result check is asserted.
+ */
+void
+FSYNC_fsInit(void)
+{
+    int rc;
+#ifdef AFS_PTHREAD_ENV
+    pthread_t tid;
+    pthread_attr_t tattr;
+#else /* AFS_PTHREAD_ENV */
+    PROCESS pid;
+#endif /* AFS_PTHREAD_ENV */
+
+    Lock_Init(&FSYNC_handler_lock);
+
+#ifdef AFS_PTHREAD_ENV
+    rc = pthread_attr_init(&tattr);
+    assert(rc == 0);
+    rc = pthread_attr_setdetachstate(&tattr, PTHREAD_CREATE_DETACHED);
+    assert(rc == 0);
+    rc = pthread_create(&tid, &tattr, FSYNC_sync, NULL);
+    assert(rc == 0);
+#else /* AFS_PTHREAD_ENV */
+    rc = LWP_CreateProcess(FSYNC_sync, USUAL_STACK_SIZE, USUAL_PRIORITY,
+                          (void *)0, "FSYNC_sync", &pid);
+    assert(rc == LWP_SUCCESS);
+#endif /* AFS_PTHREAD_ENV */
+}
+
+static fd_set FSYNC_readfds;
+
+/*
+ * Create the TCP socket used for FSSYNC and fill in the address it is to
+ * be bound to: 127.0.0.1 (0x7f000001), port 2040.
+ *
+ * addr - out: zeroed, then set to the loopback address and port
+ *
+ * Returns the new socket descriptor.  The socket() call is made outside
+ * of assert() so it is not lost when assertions are compiled out.
+ */
+static int
+getport(struct sockaddr_in *addr)
+{
+    int sd;
+
+    memset(addr, 0, sizeof(*addr));
+    sd = socket(AF_INET, SOCK_STREAM, 0);
+    assert(sd >= 0);
+#ifdef STRUCT_SOCKADDR_HAS_SA_LEN
+    addr->sin_len = sizeof(struct sockaddr_in);
+#endif
+    addr->sin_addr.s_addr = htonl(0x7f000001);
+    addr->sin_family = AF_INET;        /* was localhost->h_addrtype */
+    addr->sin_port = htons(2040);      /* XXXX htons not _really_ necessary */
+
+    return sd;
+}
+
+
+/*
+ * Body of the FSSYNC server thread/LWP started by FSYNC_fsInit.
+ *
+ * Waits until VInit is non-zero, creates and binds the listening socket
+ * (retrying bind up to MAX_BIND_TRIES times, 5 seconds apart), installs
+ * the accept handler, and then loops forever selecting on the registered
+ * descriptors and dispatching the ready ones through CallHandler().
+ * Never returns.
+ */
+static void
+FSYNC_sync()
+{
+    struct sockaddr_in addr;
+    int on = 1;
+    extern int VInit;
+    int code;
+    int numTries;
+#ifdef AFS_PTHREAD_ENV
+    int tid;
+#endif
+
+#ifndef AFS_NT40_ENV
+    (void)signal(SIGPIPE, SIG_IGN);
+#endif
+
+#ifdef AFS_PTHREAD_ENV
+    /* set our 'thread-id' so that the host hold table works */
+    MUTEX_ENTER(&rx_stats_mutex);      /* protects rxi_pthread_hinum */
+    tid = ++rxi_pthread_hinum;
+    MUTEX_EXIT(&rx_stats_mutex);
+    /* NOTE(review): int stored directly in a pointer-sized slot */
+    pthread_setspecific(rx_thread_id_key, (void *)tid);
+    Log("Set thread id %d for FSYNC_sync\n", tid);
+#endif /* AFS_PTHREAD_ENV */
+
+    while (!VInit) {
+       /* Let somebody else run until level > 0.  That doesn't mean that
+        * all volumes have been attached. */
+#ifdef AFS_PTHREAD_ENV
+       pthread_yield();
+#else /* AFS_PTHREAD_ENV */
+       LWP_DispatchProcess();
+#endif /* AFS_PTHREAD_ENV */
+    }
+    AcceptSd = getport(&addr);
+    /* Reuseaddr needed because system inexplicably leaves crud lying around */
+    code =
+       setsockopt(AcceptSd, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
+                  sizeof(on));
+    if (code)
+       Log("FSYNC_sync: setsockopt failed with (%d)\n", errno);
+
+    /* code holds the result of the last bind attempt when the loop ends */
+    for (numTries = 0; numTries < MAX_BIND_TRIES; numTries++) {
+       if ((code =
+            bind(AcceptSd, (struct sockaddr *)&addr, sizeof(addr))) == 0)
+           break;
+       Log("FSYNC_sync: bind failed with (%d), will sleep and retry\n",
+           errno);
+       sleep(5);
+    }
+    assert(!code);
+    /* NOTE(review): the return value of listen() is not checked */
+    listen(AcceptSd, 100);
+    InitHandler();
+    AcceptOn();
+    for (;;) {
+       int maxfd;
+       /* rebuild the read set from the handler table on every pass */
+       GetHandler(&FSYNC_readfds, &maxfd);
+       /* Note: check for >= 1 below is essential since IOMGR_select
+        * doesn't have exactly same semantics as select.
+        */
+#ifdef AFS_PTHREAD_ENV
+       if (select(maxfd + 1, &FSYNC_readfds, NULL, NULL, NULL) >= 1)
+#else /* AFS_PTHREAD_ENV */
+       if (IOMGR_Select(maxfd + 1, &FSYNC_readfds, NULL, NULL, NULL) >= 1)
+#endif /* AFS_PTHREAD_ENV */
+           CallHandler(&FSYNC_readfds);
+    }
+}
+
+/*
+ * Accept a pending connection on the listening socket `afd' and register
+ * FSYNC_com as the handler for the new descriptor.
+ *
+ * If the first AddHandler() call fails, AcceptOff() is called and the
+ * registration is retried once (presumably AcceptOff releases the slot
+ * held by the accept socket -- confirm against AcceptOff).  The retry is
+ * made outside of assert() so it still happens when assertions are
+ * compiled out.
+ */
+static void
+FSYNC_newconnection(int afd)
+{
+    struct sockaddr_in other;
+    int junk, fd;
+    junk = sizeof(other);
+    fd = accept(afd, (struct sockaddr *)&other, &junk);
+    if (fd == -1) {
+       Log("FSYNC_newconnection:  accept failed, errno==%d\n", errno);
+       assert(1 == 2);
+    } else if (!AddHandler(fd, FSYNC_com)) {
+       int added;
+
+       AcceptOff();
+       added = AddHandler(fd, FSYNC_com);
+       assert(added);
+    }
+}
+
+/*
+ * Process one command arriving on an fssync file descriptor (fd).
+ *
+ * Reads a command with SYNC_getCom(), checks the protocol version,
+ * dispatches on the command code with VOL_LOCK held, and sends the
+ * response with SYNC_putRes().  The connection is dropped when the read
+ * fails or when the response carries SYNC_FLAG_CHANNEL_SHUTDOWN.
+ *
+ * FS_cnt counts the calls made to this function.
+ */
+afs_int32 FS_cnt = 0;
+static void
+FSYNC_com(int fd)
+{
+    SYNC_command com;
+    SYNC_response res;
+    SYNC_PROTO_BUF_DECL(com_buf);
+    SYNC_PROTO_BUF_DECL(res_buf);
+
+    memset(&res.hdr, 0, sizeof(res.hdr));
+
+    /* attach the local payload buffers to the command and the response */
+    com.payload.buf = (void *)com_buf;
+    com.payload.len = SYNC_PROTO_MAX_LEN;
+    res.hdr.response_len = sizeof(res.hdr);
+    res.hdr.proto_version = FSYNC_PROTO_VERSION;
+    res.payload.len = SYNC_PROTO_MAX_LEN;
+    res.payload.buf = (void *)res_buf;
+
+    FS_cnt++;
+    if (SYNC_getCom(fd, &com)) {
+       Log("FSYNC_com:  read failed; dropping connection (cnt=%d)\n", FS_cnt);
+       FSYNC_Drop(fd);
+       return;
+    }
+
+    /* a version mismatch is answered and then the channel is shut down */
+    if (com.hdr.proto_version != FSYNC_PROTO_VERSION) {
+       Log("FSYNC_com:  invalid protocol version (%u)\n", com.hdr.proto_version);
+       res.hdr.response = SYNC_COM_ERROR;
+       res.hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN;
+       goto respond;
+    }
+
+    /* the sub-handlers run with VOL_LOCK held */
+    VOL_LOCK;
+    switch (com.hdr.command) {
+    case FSYNC_VOL_ON:
+    case FSYNC_VOL_OFF:
+    case FSYNC_VOL_LISTVOLUMES:
+    case FSYNC_VOL_NEEDVOLUME:
+    case FSYNC_VOL_MOVE:
+    case FSYNC_VOL_BREAKCBKS:
+    case FSYNC_VOL_DONE:
+    case FSYNC_VOL_QUERY:
+    case FSYNC_VOL_QUERY_HDR:
+    case FSYNC_VOL_QUERY_VOP:
+       res.hdr.response = FSYNC_com_VolOp(fd, &com, &res);
+       break;
+    case FSYNC_VOL_STATS_GENERAL:
+    case FSYNC_VOL_STATS_VICEP:
+    case FSYNC_VOL_STATS_HASH:
+    case FSYNC_VOL_STATS_HDR:
+    case FSYNC_VOL_STATS_VLRU:
+       res.hdr.response = FSYNC_com_StatsOp(fd, &com, &res);
+       break;
+    case SYNC_COM_CHANNEL_CLOSE:
+       res.hdr.response = SYNC_OK;
+       res.hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN;
+       break;
+    default:
+       res.hdr.response = SYNC_BAD_COMMAND;
+       break;
+    }
+    VOL_UNLOCK;
+
+ respond:
+    /* the response is sent before the descriptor is dropped */
+    SYNC_putRes(fd, &res);
+    if (res.hdr.flags & SYNC_FLAG_CHANNEL_SHUTDOWN) {
+       FSYNC_Drop(fd);
+    }
+}
+
+static afs_int32
+FSYNC_com_VolOp(int fd, SYNC_command * com, SYNC_response * res)
+{
+    int i;
+    afs_int32 code = SYNC_OK;
+    FSSYNC_VolOp_command vcom;
+
+    if (com->recv_len != (sizeof(com->hdr) + sizeof(FSSYNC_VolOp_hdr))) {
+       res->hdr.reason = SYNC_REASON_MALFORMED_PACKET;
+       res->hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN;
+       return SYNC_COM_ERROR;
+    }
+
+    vcom.hdr = &com->hdr;
+    vcom.vop = (FSSYNC_VolOp_hdr *) com->payload.buf;
+    vcom.com = com;
+
+    vcom.volumes = OfflineVolumes[FindHandler(fd)];
+    for (vcom.v = NULL, i = 0; i < MAXOFFLINEVOLUMES; i++) {
+       if ((vcom.volumes[i].volumeID == vcom.vop->volume) &&
+           (strncmp(vcom.volumes[i].partName, vcom.vop->partName,
+                    sizeof(vcom.volumes[i].partName)) == 0)) {
+           vcom.v = &vcom.volumes[i];
+           break;
+       }
+    }
+
+    switch (com->hdr.command) {
+    case FSYNC_VOL_ON:
+       code = FSYNC_com_VolOn(&vcom, res);
+       break;
+    case FSYNC_VOL_OFF:
+    case FSYNC_VOL_NEEDVOLUME:
+       code = FSYNC_com_VolOff(&vcom, res);
+       break;
+    case FSYNC_VOL_LISTVOLUMES:
+       code = SYNC_OK;
+       break;
+    case FSYNC_VOL_MOVE:
+       code = FSYNC_com_VolMove(&vcom, res);
+       break;
+    case FSYNC_VOL_BREAKCBKS:
+       code = FSYNC_com_VolBreakCBKs(&vcom, res);
+       break;
+    case FSYNC_VOL_DONE:
+       code = FSYNC_com_VolDone(&vcom, res);
+       break;
+    case FSYNC_VOL_QUERY:
+       code = FSYNC_com_VolQuery(&vcom, res);
+       break;
+    case FSYNC_VOL_QUERY_HDR:
+       code = FSYNC_com_VolHdrQuery(&vcom, res);
+       break;
+#ifdef AFS_DEMAND_ATTACH_FS
+    case FSYNC_VOL_QUERY_VOP:
+       code = FSYNC_com_VolOpQuery(&vcom, res);
+       break;
+#endif /* AFS_DEMAND_ATTACH_FS */
+    default:
+       code = SYNC_BAD_COMMAND;
+    }
+
+    return code;
+}
+
+/*
+ * Handle FSYNC_VOL_ON: bring a volume back after an external operation.
+ *
+ * Clears the caller's offline-volume slot (if any) and then pre-attaches
+ * (AFS_DEMAND_ATTACH_FS) or attaches the volume named by the request's
+ * partition name and volume id.  Under demand attach, any pending volume
+ * operation on the volume is deregistered.
+ *
+ * Returns SYNC_OK on success, SYNC_FAILED with reason
+ * SYNC_REASON_MALFORMED_PACKET when the partition name fails
+ * SYNC_verifyProtocolString(), or SYNC_DENIED with the volume package
+ * error code as the reason when the attach call reports an error.
+ */
+static afs_int32
+FSYNC_com_VolOn(FSSYNC_VolOp_command * vcom, SYNC_response * res)
+{
+    afs_int32 code = SYNC_OK;
+    char tvolName[VMAXPATHLEN];
+    Volume * vp;
+    Error error;
+
+    if (SYNC_verifyProtocolString(vcom->vop->partName, sizeof(vcom->vop->partName))) {
+       res->hdr.reason = SYNC_REASON_MALFORMED_PACKET;
+       code = SYNC_FAILED;
+       goto done;
+    }
+
+    /*
+      This is where a detached volume gets reattached. However in the
+      special case where the volume is merely busy, it is already
+      attached and it is only necessary to clear the busy flag. See
+      defect #2080 for details.
+    */
+
+    /* is the volume already attached? */
+#ifdef notdef
+    /*
+     * XXX With the following enabled we had bizarre problems where the backup id would
+     * be reset to 0; that was due to the interaction between fileserver/volserver in that they
+     * both keep volumes in memory and the changes wouldn't be made to the fileserver. Some of
+     * the problems were due to refcnt changes as result of VGetVolume/VPutVolume which would call
+     * VOffline, etc. when we don't want to; someday the whole #2080 issue should be revisited to
+     * be done right XXX
+     */
+    vp = VGetVolume_r(&error, vcom->vop->volume);
+    if (vp) {
+       /* yep, is the BUSY flag set? */
+       if (vp->specialStatus == VBUSY) {
+
+           /* yep, clear BUSY flag */
+
+           vp->specialStatus = 0;
+           /* make sure vol is online */
+           if (vcom->v) {
+               vcom->v->volumeID = 0;
+               V_inUse(vp) = 1;        /* online */
+           }
+           VPutVolume_r(vp);
+           break;
+       }
+       VPutVolume_r(vp);
+    }
+#endif /* notdef */
+
+    /* so, we need to attach the volume */
+
+    /* release the caller's tracking slot before attaching */
+    if (vcom->v)
+       vcom->v->volumeID = 0;
+    /* build "/" followed by the VFORMAT rendering of the volume id */
+    tvolName[0] = '/';
+    snprintf(&tvolName[1], sizeof(tvolName)-1, VFORMAT, vcom->vop->volume);
+    tvolName[sizeof(tvolName)-1] = '\0';
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    vp = VPreAttachVolumeByName_r(&error, vcom->vop->partName, tvolName,
+                                 V_VOLUPD);
+    if (vp && vp->pending_vol_op) {
+       VDeregisterVolOp_r(vp, vp->pending_vol_op);
+    }
+#else /* AFS_DEMAND_ATTACH_FS */
+    vp = VAttachVolumeByName_r(&error, vcom->vop->partName, tvolName,
+                              V_VOLUPD);
+    if (vp)
+       VPutVolume_r(vp);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    /* hand the volume package error back to the client as the reason */
+    if (error) {
+       code = SYNC_DENIED;
+       res->hdr.reason = error;
+    }
+
+ done:
+    return code;
+}
+
+/*
+ * Handle FSYNC_VOL_OFF and FSYNC_VOL_NEEDVOLUME: check a volume out to an
+ * external program.
+ *
+ * Unless the requestor is a debugUtility, the request is recorded in the
+ * caller's offline-volume table (reusing the matching entry or taking a
+ * free slot); the request is denied when no slot is available.  If the
+ * volume is known to the volume package it is either left online
+ * (VVolOpLeaveOnline_r) or taken offline with VOffline_r, optionally
+ * being marked VBUSY first.  Under AFS_DEMAND_ATTACH_FS the request is
+ * additionally filtered by program type and pending volume operation,
+ * and the operation is registered on the volume.
+ *
+ * Returns SYNC_OK, SYNC_FAILED (malformed partition name) or SYNC_DENIED.
+ */
+static afs_int32
+FSYNC_com_VolOff(FSSYNC_VolOp_command * vcom, SYNC_response * res)
+{
+    FSSYNC_VolOp_info info;
+    afs_int32 code = SYNC_OK;
+    int i;
+    Volume * vp, * nvp;
+    Error error;
+
+    if (SYNC_verifyProtocolString(vcom->vop->partName, sizeof(vcom->vop->partName))) {
+       res->hdr.reason = SYNC_REASON_MALFORMED_PACKET;
+       code = SYNC_FAILED;
+       goto done;
+    }
+
+    /* not already offline, we need to find a slot for newly offline volume */
+    if (vcom->hdr->programType == debugUtility) {
+       /* debug utilities do not have their operations tracked */
+       vcom->v = NULL;
+    } else {
+       if (!vcom->v) {
+           /* a slot with volumeID 0 is free */
+           for (i = 0; i < MAXOFFLINEVOLUMES; i++) {
+               if (vcom->volumes[i].volumeID == 0) {
+                   vcom->v = &vcom->volumes[i];
+                   break;
+               }
+           }
+       }
+       if (!vcom->v) {
+           goto deny;
+       }
+    }
+
+    FSYNC_com_to_info(vcom, &info);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    vp = VLookupVolume_r(&error, vcom->vop->volume, NULL);
+#else
+    vp = VGetVolume_r(&error, vcom->vop->volume);
+#endif
+
+    if (vp) {
+       if ((vcom->vop->partName[0] != 0) &&
+           (strncmp(vcom->vop->partName, vp->partition->name, 
+                   sizeof(vcom->vop->partName)) != 0)) {
+           /* volume on desired partition is not online, so we
+            * should treat this as an offline volume.
+            */
+#ifndef AFS_DEMAND_ATTACH_FS
+           VPutVolume_r(vp);
+#endif
+           vp = NULL;
+           goto done;
+       }
+    }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    if (vp) {
+       ProgramType type = (ProgramType) vcom->hdr->programType;
+
+       /* do initial filtering of requests */
+
+       /* enforce mutual exclusion for volume ops */
+       if (vp->pending_vol_op) {
+           if (vp->pending_vol_op->com.programType != type) {
+               Log("volume %u already checked out\n", vp->hashid);
+               /* XXX debug */
+               Log("vp->vop = { com = { ver=%u, prog=%d, com=%d, reason=%d, len=%u, flags=0x%x }, vop = { vol=%u, part='%s' } }\n",
+                   vp->pending_vol_op->com.proto_version, 
+                   vp->pending_vol_op->com.programType,
+                   vp->pending_vol_op->com.command,
+                   vp->pending_vol_op->com.reason,
+                   vp->pending_vol_op->com.command_len,
+                   vp->pending_vol_op->com.flags,
+                   vp->pending_vol_op->vop.volume,
+                   vp->pending_vol_op->vop.partName );
+               Log("vcom = { com = { ver=%u, prog=%d, com=%d, reason=%d, len=%u, flags=0x%x } , vop = { vol=%u, part='%s' } }\n",
+                   vcom->hdr->proto_version,
+                   vcom->hdr->programType,
+                   vcom->hdr->command,
+                   vcom->hdr->reason,
+                   vcom->hdr->command_len,
+                   vcom->hdr->flags,
+                   vcom->vop->volume,
+                   vcom->vop->partName);
+               res->hdr.reason = FSYNC_EXCLUSIVE;
+               goto deny;
+           } else {
+               /* same program type asking again: warn but continue */
+               Log("warning: volume %u recursively checked out by programType id %d\n",
+                   vp->hashid, vcom->hdr->programType);
+           }
+       }
+
+       /* filter based upon requestor
+        *
+        * volume utilities are not allowed to check out volumes
+        * which are in an error state
+        *
+        * unknown utility programs will be denied on principle
+        */
+       switch (type) {
+       case salvageServer:
+       case debugUtility:
+           /* give the salvageserver lots of liberty */
+           break;
+       case volumeUtility:
+           if ((V_attachState(vp) == VOL_STATE_ERROR) ||
+               (V_attachState(vp) == VOL_STATE_SALVAGING)) {
+               goto deny;
+           }
+           break;
+       default:
+           Log("bad program type passed to FSSYNC\n");
+           goto deny;
+       }
+
+       /* short circuit for offline volume states
+        * so we can avoid I/O penalty of attachment */
+       switch (V_attachState(vp)) {
+       case VOL_STATE_UNATTACHED:
+       case VOL_STATE_PREATTACHED:
+       case VOL_STATE_SALVAGING:
+       case VOL_STATE_ERROR:
+           /* register the volume operation metadata with the volume
+            *
+            * if the volume is currently pre-attached, attach2()
+            * will evaluate the vol op metadata to determine whether
+            * attaching the volume would be safe */
+           VRegisterVolOp_r(vp, &info);
+           goto done;
+       default:
+           break;
+       }
+
+       /* convert to heavyweight ref */
+       nvp = VGetVolumeByVp_r(&error, vp);
+
+       /* register the volume operation metadata with the volume */
+       VRegisterVolOp_r(vp, &info);
+
+       /* NOTE(review): the vol op registered just above is not
+        * deregistered on this deny path -- confirm intended */
+       if (!nvp) {
+           Log("FSYNC_com_VolOff: failed to get heavyweight reference to volume %u\n",
+               vcom->vop->volume);
+           res->hdr.reason = FSYNC_VOL_PKG_ERROR;
+           goto deny;
+       }
+       vp = nvp;
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    if (vp) {
+       if (VVolOpLeaveOnline_r(vp, &info)) {
+           VUpdateVolume_r(&error, vp, VOL_UPDATE_WAIT);       /* At least get volume stats right */
+           if (LogLevel) {
+               Log("FSYNC: Volume %u (%s) was left on line for an external %s request\n", 
+                   V_id(vp), V_name(vp), 
+                   vcom->hdr->reason == V_CLONE ? "clone" : 
+                   vcom->hdr->reason == V_READONLY ? "readonly" : 
+                   vcom->hdr->reason == V_DUMP ? "dump" : 
+                   "UNKNOWN");
+           }
+           VPutVolume_r(vp);
+       } else {
+           if (VVolOpSetVBusy_r(vp, &info)) {
+               vp->specialStatus = VBUSY;
+           }
+
+           /* remember what volume we got, so we can keep track of how
+            * many volumes the volserver or whatever is using.  This
+            * branch is only reached with a non-NULL vp.
+            */
+           if (vcom->v) {
+               vcom->v->volumeID = vcom->vop->volume;
+               strlcpy(vcom->v->partName, vp->partition->name, sizeof(vcom->v->partName));
+           }
+
+           VOffline_r(vp, "A volume utility is running.");
+           vp = NULL;
+       }
+    }
+
+ done:
+    return code;
+
+ deny:
+    /* res->hdr.reason is set only by some of the paths that jump here */
+    return SYNC_DENIED;
+}
+
+/*
+ * Handle FSYNC_VOL_MOVE: mark the volume as moved and break its callbacks.
+ *
+ * The request's "reason" field carries the site the volume moved to.
+ * specialStatus is set to VMOVED so that VBUSY is no longer sent back.
+ * Callbacks are always broken, because there is no way to tell here
+ * whether a full break is needed, and an extra break on a move within the
+ * same server costs little.  VOL_LOCK is dropped around the hook call.
+ *
+ * Always returns SYNC_OK.
+ */
+static afs_int32
+FSYNC_com_VolMove(FSSYNC_VolOp_command * vcom, SYNC_response * res)
+{
+    Error ec;
+    Volume * volp;
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    volp = VLookupVolume_r(&ec, vcom->vop->volume, NULL);
+    if (volp)
+       volp->specialStatus = VMOVED;
+#else
+    volp = VGetVolume_r(&ec, vcom->vop->volume);
+    if (volp) {
+       volp->specialStatus = VMOVED;
+       VPutVolume_r(volp);
+    }
+#endif
+
+    /* nothing more to do when no callback-break hook is installed */
+    if (!V_BreakVolumeCallbacks)
+       return SYNC_OK;
+
+    Log("fssync: volume %u moved to %x; breaking all call backs\n",
+       vcom->vop->volume, vcom->hdr->reason);
+    VOL_UNLOCK;
+    (*V_BreakVolumeCallbacks) (vcom->vop->volume);
+    VOL_LOCK;
+
+    return SYNC_OK;
+}
+
+/*
+ * Handle FSYNC_VOL_DONE.  This call is made only after a volume has been
+ * deleted, so the volume is not put back online: its entry is simply
+ * removed from the caller's OfflineVolumes row and, under
+ * AFS_DEMAND_ATTACH_FS, any pending volume operation is deregistered.
+ *
+ * Always returns SYNC_OK.
+ */
+static afs_int32
+FSYNC_com_VolDone(FSSYNC_VolOp_command * vcom, SYNC_response * res)
+{
+#ifdef AFS_DEMAND_ATTACH_FS
+    Error ec;
+    Volume * volp;
+#endif
+
+    if (vcom->v != NULL) {
+       /* volumeID 0 marks the slot as free again */
+       vcom->v->volumeID = 0;
+    }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    volp = VLookupVolume_r(&ec, vcom->vop->volume, NULL);
+    if (volp != NULL) {
+       if (volp->pending_vol_op != NULL) {
+           VDeregisterVolOp_r(volp, volp->pending_vol_op);
+       }
+    }
+#endif
+
+    return SYNC_OK;
+}
+
+/*
+ * Break all callbacks on the volume named in the request (used when the
+ * volume is being restored).
+ *
+ * Nothing happens when no hook is installed in V_BreakVolumeCallbacks.
+ *
+ * vcom - decoded volume-operation command
+ * res  - response being built; not touched by this handler
+ *
+ * Always returns SYNC_OK.
+ */
+static afs_int32
+FSYNC_com_VolBreakCBKs(FSSYNC_VolOp_command * vcom, SYNC_response * res)
+{
+    if (!V_BreakVolumeCallbacks) {
+       return SYNC_OK;
+    }
+
+    Log("fssync: breaking all call backs for volume %u\n",
+       vcom->vop->volume);
+
+    /* the hook runs between VOL_UNLOCK and VOL_LOCK */
+    VOL_UNLOCK;
+    (*V_BreakVolumeCallbacks) (vcom->vop->volume);
+    VOL_LOCK;
+
+    return SYNC_OK;
+}
+
+/*
+ * Answer a volume query with a raw copy of the in-memory Volume object.
+ *
+ * On success sizeof(Volume) bytes are copied into res->payload.buf and
+ * res->hdr.response_len is increased by the same amount.  The payload
+ * buffer is asserted to be large enough.
+ *
+ * vcom - decoded volume-operation command
+ * res  - response being built
+ *
+ * Returns SYNC_OK, or SYNC_FAILED with reason FSYNC_UNKNOWN_VOLID when the
+ * volume lookup comes back empty.
+ */
+static afs_int32
+FSYNC_com_VolQuery(FSSYNC_VolOp_command * vcom, SYNC_response * res)
+{
+    Error ec;
+    Volume * vol;
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    vol = VLookupVolume_r(&ec, vcom->vop->volume, NULL);
+#else /* !AFS_DEMAND_ATTACH_FS */
+    vol = VGetVolume_r(&ec, vcom->vop->volume);
+#endif /* !AFS_DEMAND_ATTACH_FS */
+
+    if (vol == NULL) {
+       res->hdr.reason = FSYNC_UNKNOWN_VOLID;
+       return SYNC_FAILED;
+    }
+
+    assert(sizeof(Volume) <= res->payload.len);
+    memcpy(res->payload.buf, vol, sizeof(Volume));
+    res->hdr.response_len += sizeof(Volume);
+
+#ifndef AFS_DEMAND_ATTACH_FS
+    VPutVolume_r(vol);
+#endif
+
+    return SYNC_OK;
+}
+
+/*
+ * Answer a volume-header query with a copy of the volume's disk data.
+ *
+ * On success sizeof(VolumeDiskData) bytes taken from V_disk(vp) are copied
+ * into res->payload.buf and res->hdr.response_len is increased by the same
+ * amount.  The payload buffer is asserted to be large enough.
+ *
+ * vcom - decoded volume-operation command
+ * res  - response being built
+ *
+ * Returns SYNC_OK on success.  Returns SYNC_FAILED with reason
+ * FSYNC_UNKNOWN_VOLID when the volume lookup comes back empty, or with
+ * reason FSYNC_HDR_NOT_ATTACHED when the volume exists but its header is
+ * not usable.
+ */
+static afs_int32
+FSYNC_com_VolHdrQuery(FSSYNC_VolOp_command * vcom, SYNC_response * res)
+{
+    afs_int32 code = SYNC_OK;
+    Error error;
+    Volume * vp;
+    int hdr_ok = 0;
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    vp = VLookupVolume_r(&error, vcom->vop->volume, NULL);
+    /* the header must be present and flagged both attached and loaded */
+    if (vp &&
+       (vp->header != NULL) &&
+       (V_attachFlags(vp) & VOL_HDR_ATTACHED) &&
+       (V_attachFlags(vp) & VOL_HDR_LOADED)) {
+       hdr_ok = 1;
+    }
+#else /* !AFS_DEMAND_ATTACH_FS */
+    vp = VGetVolume_r(&error, vcom->vop->volume);
+    if (vp && vp->header) {
+       hdr_ok = 1;
+    }
+#endif /* !AFS_DEMAND_ATTACH_FS */
+
+    if (hdr_ok) {
+       assert(sizeof(VolumeDiskData) <= res->payload.len);
+       memcpy(res->payload.buf, &V_disk(vp), sizeof(VolumeDiskData));
+       res->hdr.response_len += sizeof(VolumeDiskData);
+    } else {
+       if (vp) {
+           res->hdr.reason = FSYNC_HDR_NOT_ATTACHED;
+       } else {
+           res->hdr.reason = FSYNC_UNKNOWN_VOLID;
+       }
+       code = SYNC_FAILED;
+    }
+
+#ifndef AFS_DEMAND_ATTACH_FS
+    /* Release the volume obtained from VGetVolume_r on every path, not
+     * just on success; otherwise a volume found without a header would
+     * never be put back. */
+    if (vp) {
+       VPutVolume_r(vp);
+    }
+#endif
+
+    return code;
+}
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * Report the volume operation currently pending on a volume.
+ *
+ * On success sizeof(FSSYNC_VolOp_info) bytes are copied from
+ * vp->pending_vol_op into res->payload.buf and res->hdr.response_len is
+ * increased by the same amount.  The payload buffer is asserted to be
+ * large enough.
+ *
+ * vcom - decoded volume-operation command
+ * res  - response being built
+ *
+ * Returns SYNC_OK on success.  Returns SYNC_FAILED with reason
+ * FSYNC_UNKNOWN_VOLID when the volume lookup comes back empty, or with
+ * reason FSYNC_NO_PENDING_VOL_OP when the volume has nothing pending.
+ */
+static afs_int32
+FSYNC_com_VolOpQuery(FSSYNC_VolOp_command * vcom, SYNC_response * res)
+{
+    Error ec;
+    Volume * vol;
+
+    vol = VLookupVolume_r(&ec, vcom->vop->volume, NULL);
+
+    if (vol == NULL) {
+       res->hdr.reason = FSYNC_UNKNOWN_VOLID;
+       return SYNC_FAILED;
+    }
+    if (vol->pending_vol_op == NULL) {
+       res->hdr.reason = FSYNC_NO_PENDING_VOL_OP;
+       return SYNC_FAILED;
+    }
+
+    assert(sizeof(FSSYNC_VolOp_info) <= res->payload.len);
+    memcpy(res->payload.buf, vol->pending_vol_op, sizeof(FSSYNC_VolOp_info));
+    res->hdr.response_len += sizeof(FSSYNC_VolOp_info);
+
+    return SYNC_OK;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/*
+ * Validate a statistics request and dispatch it to the matching handler.
+ *
+ * The request must consist of exactly the command header followed by one
+ * FSSYNC_StatsOp_hdr; any other length is treated as a malformed packet
+ * and the response is flagged for channel shutdown.
+ *
+ * fd  - descriptor the request arrived on (not used here)
+ * com - received command; its payload is interpreted as FSSYNC_StatsOp_hdr
+ * res - response being built
+ *
+ * Returns SYNC_COM_ERROR for a malformed packet, SYNC_BAD_COMMAND for a
+ * command code not handled in this build, otherwise whatever the selected
+ * handler returns.
+ */
+static afs_int32
+FSYNC_com_StatsOp(int fd, SYNC_command * com, SYNC_response * res)
+{
+    afs_int32 code = SYNC_OK;
+    FSSYNC_StatsOp_command scom;
+
+    if (com->recv_len != (sizeof(com->hdr) + sizeof(FSSYNC_StatsOp_hdr))) {
+       res->hdr.reason = SYNC_REASON_MALFORMED_PACKET;
+       res->hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN;
+       return SYNC_COM_ERROR;
+    }
+
+    /* bundle the pieces of the request for the per-command handlers */
+    scom.hdr = &com->hdr;
+    scom.sop = (FSSYNC_StatsOp_hdr *) com->payload.buf;
+    scom.com = com;
+
+    switch (com->hdr.command) {
+    case FSYNC_VOL_STATS_GENERAL:
+       code = FSYNC_com_StatsOpGeneral(&scom, res);
+       break;
+#ifdef AFS_DEMAND_ATTACH_FS
+       /* statistics for the following subsystems are only tracked
+        * for demand attach fileservers */
+    case FSYNC_VOL_STATS_VICEP:
+       code = FSYNC_com_StatsOpViceP(&scom, res);
+       break;
+    case FSYNC_VOL_STATS_HASH:
+       code = FSYNC_com_StatsOpHash(&scom, res);
+       break;
+    case FSYNC_VOL_STATS_HDR:
+       code = FSYNC_com_StatsOpHdr(&scom, res);
+       break;
+    case FSYNC_VOL_STATS_VLRU:
+       code = FSYNC_com_StatsOpVLRU(&scom, res);
+       break;
+#endif /* AFS_DEMAND_ATTACH_FS */
+    default:
+       code = SYNC_BAD_COMMAND;
+       break;
+    }
+
+    return code;
+}
+
+/*
+ * Return the general volume-package statistics.
+ *
+ * The whole VStats object is copied into res->payload.buf and
+ * res->hdr.response_len is increased by sizeof(VStats).
+ * NOTE(review): unlike the volume query handlers, the payload buffer size
+ * is not checked here -- presumably it is always large enough; confirm.
+ *
+ * scom - decoded statistics command (not used here)
+ * res  - response being built
+ *
+ * Always returns SYNC_OK.
+ */
+static afs_int32
+FSYNC_com_StatsOpGeneral(FSSYNC_StatsOp_command * scom, SYNC_response * res)
+{
+    memcpy(res->payload.buf, &VStats, sizeof(VStats));
+    res->hdr.response_len += sizeof(VStats);
+
+    return SYNC_OK;
+}
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * Return statistics for one disk partition.
+ *
+ * The partition is named by scom->sop->args.partName, which is first
+ * checked with SYNC_verifyProtocolString.  On success a
+ * struct DiskPartitionStats is filled in directly inside res->payload.buf
+ * and res->hdr.response_len is increased by its size.
+ *
+ * scom - decoded statistics command
+ * res  - response being built
+ *
+ * Returns SYNC_OK on success.  Returns SYNC_FAILED with reason
+ * SYNC_REASON_MALFORMED_PACKET when the name fails verification, or
+ * SYNC_FAILED (no reason code is set on this path) when VGetPartition_r
+ * does not find the partition.
+ */
+static afs_int32
+FSYNC_com_StatsOpViceP(FSSYNC_StatsOp_command * scom, SYNC_response * res)
+{
+    struct DiskPartition * part;
+    struct DiskPartitionStats * out;
+
+    if (SYNC_verifyProtocolString(scom->sop->args.partName, sizeof(scom->sop->args.partName))) {
+       res->hdr.reason = SYNC_REASON_MALFORMED_PACKET;
+       return SYNC_FAILED;
+    }
+
+    part = VGetPartition_r(scom->sop->args.partName, 0);
+    if (!part) {
+       return SYNC_FAILED;
+    }
+
+    out = (struct DiskPartitionStats *) res->payload.buf;
+    out->free = part->free;
+    out->totalUsable = part->totalUsable;
+    out->minFree = part->minFree;
+    out->f_files = part->f_files;
+    out->vol_list_len = part->vol_list.len;
+
+    res->hdr.response_len += sizeof(struct DiskPartitionStats);
+
+    return SYNC_OK;
+}
+
+/*
+ * Return statistics for one chain of the volume hash table.
+ *
+ * The chain is selected by scom->sop->args.hash_bucket.  On success a
+ * struct VolumeHashChainStats is filled in directly inside
+ * res->payload.buf and res->hdr.response_len is increased by its size.
+ * NOTE(review): only the upper bound of hash_bucket is checked; if that
+ * field has a signed type a negative value would get through -- confirm
+ * its declaration.
+ *
+ * scom - decoded statistics command
+ * res  - response being built
+ *
+ * Returns SYNC_OK, or SYNC_FAILED when hash_bucket is not below
+ * VolumeHashTable.Size.
+ */
+static afs_int32
+FSYNC_com_StatsOpHash(FSSYNC_StatsOp_command * scom, SYNC_response * res)
+{
+    struct VolumeHashChainHead * chain;
+    struct VolumeHashChainStats * out;
+
+    if (scom->sop->args.hash_bucket >= VolumeHashTable.Size) {
+       return SYNC_FAILED;
+    }
+
+    chain = &VolumeHashTable.Table[scom->sop->args.hash_bucket];
+    out = (struct VolumeHashChainStats *) res->payload.buf;
+
+    /* table-wide figure */
+    out->table_size = VolumeHashTable.Size;
+
+    /* per-chain figures */
+    out->chain_len = chain->len;
+    out->chain_cacheCheck = chain->cacheCheck;
+    out->chain_busy = chain->busy;
+    AssignInt64(chain->looks, &out->chain_looks);
+    AssignInt64(chain->gets, &out->chain_gets);
+    AssignInt64(chain->reorders, &out->chain_reorders);
+
+    res->hdr.response_len += sizeof(struct VolumeHashChainStats);
+
+    return SYNC_OK;
+}
+
+/*
+ * Return the statistics kept for the volume header LRU.
+ *
+ * volume_hdr_LRU.stats is copied into res->payload.buf and
+ * res->hdr.response_len is increased by its size.
+ *
+ * scom - decoded statistics command (not used here)
+ * res  - response being built
+ *
+ * Always returns SYNC_OK.
+ */
+static afs_int32
+FSYNC_com_StatsOpHdr(FSSYNC_StatsOp_command * scom, SYNC_response * res)
+{
+    memcpy(res->payload.buf, &volume_hdr_LRU.stats, sizeof(volume_hdr_LRU.stats));
+    res->hdr.response_len += sizeof(volume_hdr_LRU.stats);
+
+    return SYNC_OK;
+}
+
+/*
+ * Placeholder for VLRU statistics: nothing is written to the response and
+ * every request is answered with SYNC_BAD_COMMAND.
+ *
+ * scom - decoded statistics command (not used here)
+ * res  - response being built (not touched)
+ */
+static afs_int32
+FSYNC_com_StatsOpVLRU(FSSYNC_StatsOp_command * scom, SYNC_response * res)
+{
+    return SYNC_BAD_COMMAND;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/*
+ * Snapshot a volume-operation command into an FSSYNC_VolOp_info.
+ *
+ * Copies sizeof(SYNC_command_hdr) bytes from vcom->hdr into info->com and
+ * sizeof(FSSYNC_VolOp_hdr) bytes from vcom->vop into info->vop, so that
+ * info no longer refers to the command's own buffers.
+ *
+ * vcom - decoded volume-operation command (source)
+ * info - structure to fill in (destination)
+ */
+static void
+FSYNC_com_to_info(FSSYNC_VolOp_command * vcom, FSSYNC_VolOp_info * info)
+{
+    memcpy(&info->com, vcom->hdr, sizeof(SYNC_command_hdr));
+    memcpy(&info->vop, vcom->vop, sizeof(FSSYNC_VolOp_hdr));
+}
+
+/*
+ * Tear down the connection on descriptor fd.
+ *
+ * Every volume still recorded in this connection's OfflineVolumes row is
+ * re-attached by name (and immediately put back), and its entry is
+ * cleared.  The handler slot for fd is then released, the descriptor is
+ * closed, and accepting of new connections is switched back on in case it
+ * had been turned off.
+ *
+ * fd - descriptor of the connection being dropped; it must currently be
+ *      registered as a handler (FindHandler asserts otherwise)
+ */
+static void
+FSYNC_Drop(int fd)
+{
+    struct offlineInfo *p;
+    int i;
+    Error error;
+    char tvolName[VMAXPATHLEN];
+
+    /* the volume work below is done between VOL_LOCK and VOL_UNLOCK */
+    VOL_LOCK;
+    p = OfflineVolumes[FindHandler(fd)];
+    for (i = 0; i < MAXOFFLINEVOLUMES; i++) {
+       /* a zero volumeID marks an unused entry */
+       if (p[i].volumeID) {
+
+           Volume *vp;
+
+           /* build "/" followed by the VFORMAT rendering of the id */
+           tvolName[0] = '/';
+           sprintf(&tvolName[1], VFORMAT, p[i].volumeID);
+           vp = VAttachVolumeByName_r(&error, p[i].partName, tvolName,
+                                      V_VOLUPD);
+           if (vp)
+               VPutVolume_r(vp);
+           /* the entry is cleared whether or not the attach worked */
+           p[i].volumeID = 0;
+       }
+    }
+    VOL_UNLOCK;
+    RemoveHandler(fd);
+#ifdef AFS_NT40_ENV
+    closesocket(fd);
+#else
+    close(fd);
+#endif
+    /* a handler slot is free again, so make sure we are accepting */
+    AcceptOn();
+}
+
+static int AcceptHandler = -1; /* handler id for accept, if turned on */
+
+/*
+ * Start watching the listening socket (AcceptSd) for new connections,
+ * unless that is already being done.
+ *
+ * AcceptHandler records the handler slot used for AcceptSd, or -1 while
+ * accepting is switched off.
+ */
+static void
+AcceptOn()
+{
+    if (AcceptHandler == -1) {
+       int added;
+
+       /* Register outside of assert(): the registration has to happen
+        * even in builds where assert() is compiled out. */
+       added = AddHandler(AcceptSd, FSYNC_newconnection);
+       assert(added);
+       AcceptHandler = FindHandler(AcceptSd);
+    }
+}
+
+/*
+ * Stop watching the listening socket (AcceptSd) for new connections, if
+ * it is currently being watched.
+ *
+ * AcceptHandler is reset to -1 to record that accepting is switched off.
+ */
+static void
+AcceptOff()
+{
+    if (AcceptHandler != -1) {
+       int removed;
+
+       /* Deregister outside of assert(): the removal has to happen even
+        * in builds where assert() is compiled out. */
+       removed = RemoveHandler(AcceptSd);
+       assert(removed);
+       AcceptHandler = -1;
+    }
+}
+
+/* The multiple FD handling code. */
+
+static int HandlerFD[MAXHANDLERS];
+static int (*HandlerProc[MAXHANDLERS]) ();
+
+/*
+ * Reset the handler table: every slot gets descriptor -1 (free) and a
+ * null handler procedure.  Done under the handler write lock.
+ */
+static void
+InitHandler()
+{
+    int slot = 0;
+
+    ObtainWriteLock(&FSYNC_handler_lock);
+    while (slot < MAXHANDLERS) {
+       HandlerFD[slot] = -1;
+       HandlerProc[slot] = 0;
+       slot++;
+    }
+    ReleaseWriteLock(&FSYNC_handler_lock);
+}
+
+/*
+ * Invoke the handler procedure of every registered descriptor that is
+ * set in *fdsetp, passing the descriptor as the only argument.
+ *
+ * fdsetp - descriptor set to test each registered descriptor against
+ */
+static void
+CallHandler(fd_set * fdsetp)
+{
+    register int i;
+    ObtainReadLock(&FSYNC_handler_lock);
+    for (i = 0; i < MAXHANDLERS; i++) {
+       if (HandlerFD[i] >= 0 && FD_ISSET(HandlerFD[i], fdsetp)) {
+           /* The read lock is dropped for the duration of the call and
+            * retaken afterwards -- presumably because handlers may end
+            * up in AddHandler/RemoveHandler, which take the write lock.
+            * NOTE(review): HandlerProc[i] and HandlerFD[i] are read
+            * after the lock has been released. */
+           ReleaseReadLock(&FSYNC_handler_lock);
+           (*HandlerProc[i]) (HandlerFD[i]);
+           ObtainReadLock(&FSYNC_handler_lock);
+       }
+    }
+    ReleaseReadLock(&FSYNC_handler_lock);
+}
+
+/*
+ * Register aproc as the handler for descriptor afd in the first free
+ * slot of the handler table (a slot is free when its descriptor is -1).
+ * Done under the handler write lock.
+ *
+ * afd   - descriptor to watch
+ * aproc - procedure to call for that descriptor
+ *
+ * Returns 1 when a slot was claimed, 0 when the table is full.
+ */
+static int
+AddHandler(int afd, int (*aproc) ())
+{
+    int slot;
+    int added = 0;
+
+    ObtainWriteLock(&FSYNC_handler_lock);
+    for (slot = 0; slot < MAXHANDLERS; slot++) {
+       if (HandlerFD[slot] == -1) {
+           HandlerFD[slot] = afd;
+           HandlerProc[slot] = aproc;
+           added = 1;
+           break;
+       }
+    }
+    ReleaseWriteLock(&FSYNC_handler_lock);
+
+    return added;
+}
+
+/*
+ * Look up the handler slot registered for descriptor afd, taking the
+ * handler read lock for the duration of the search.
+ *
+ * A descriptor that is not registered is a fatal condition: the function
+ * asserts.  The trailing -1 is only reached when that assert does not
+ * stop the program.
+ *
+ * Returns the slot index.
+ */
+static int
+FindHandler(register int afd)
+{
+    int slot;
+    int found = -1;
+
+    ObtainReadLock(&FSYNC_handler_lock);
+    for (slot = 0; slot < MAXHANDLERS; slot++) {
+       if (HandlerFD[slot] == afd) {
+           found = slot;
+           break;
+       }
+    }
+    ReleaseReadLock(&FSYNC_handler_lock);
+
+    if (found == -1) {
+       assert(1 == 2);
+    }
+    return found;
+}
+
+/*
+ * Same lookup as FindHandler, but without touching FSYNC_handler_lock;
+ * RemoveHandler calls it while already holding the write lock.
+ *
+ * A descriptor that is not registered is a fatal condition: the function
+ * asserts.  The trailing -1 is only reached when that assert does not
+ * stop the program.
+ *
+ * Returns the slot index.
+ */
+static int
+FindHandler_r(register int afd)
+{
+    int slot = 0;
+
+    while (slot < MAXHANDLERS) {
+       if (HandlerFD[slot] == afd) {
+           return slot;
+       }
+       slot++;
+    }
+
+    assert(1 == 2);
+    return -1;
+}
+
+/*
+ * Release the handler slot registered for descriptor afd by marking it
+ * free (descriptor -1).  Done under the handler write lock.
+ *
+ * afd - descriptor whose handler is to be removed
+ *
+ * Returns 1 when the slot was released.  Returns 0 when no slot is
+ * registered for afd; that can only be observed in builds where the
+ * assert inside FindHandler_r is compiled out.
+ */
+static int
+RemoveHandler(register int afd)
+{
+    int slot;
+    int removed = 0;
+
+    ObtainWriteLock(&FSYNC_handler_lock);
+    slot = FindHandler_r(afd);
+    /* FindHandler_r yields -1 for an unknown descriptor when its assert
+     * is inactive; never use that as an array index. */
+    if (slot >= 0) {
+       HandlerFD[slot] = -1;
+       removed = 1;
+    }
+    ReleaseWriteLock(&FSYNC_handler_lock);
+
+    return removed;
+}
+
+/*
+ * Build the descriptor set covering every registered handler.
+ *
+ * fdsetp - cleared, then filled with every registered descriptor
+ * maxfdp - receives the highest registered descriptor, or -1 when the
+ *          table is empty
+ *
+ * The table is scanned under the handler read lock.
+ */
+static void
+GetHandler(fd_set * fdsetp, int *maxfdp)
+{
+    int slot;
+    int highest = -1;
+
+    FD_ZERO(fdsetp);
+    ObtainReadLock(&FSYNC_handler_lock);
+    for (slot = 0; slot < MAXHANDLERS; slot++) {
+       if (HandlerFD[slot] == -1) {
+           continue;           /* free slot */
+       }
+       FD_SET(HandlerFD[slot], fdsetp);
+       if (HandlerFD[slot] > highest) {
+           highest = HandlerFD[slot];
+       }
+    }
+    *maxfdp = highest;
+    ReleaseReadLock(&FSYNC_handler_lock);
+}
+
+#endif /* FSSYNC_BUILD_SERVER */
diff --git a/src/vol/fssync.c b/src/vol/fssync.c

deleted file mode 100644 (file)

index 714aaf5..0000000
--- a/src/vol/fssync.c
+++ /dev/null
@@ -1,751 +0,0 @@
-/*
- * Copyright 2000, International Business Machines Corporation and others.
- * All Rights Reserved.
- * 
- * This software has been released under the terms of the IBM Public
- * License.  For details, see the LICENSE file in the top-level source
- * directory or online at http://www.openafs.org/dl/license10.html
- */
-
-/*
-       System:         VICE-TWO
-       Module:         fssync.c
-       Institution:    The Information Technology Center, Carnegie-Mellon University
-
- */
-#ifdef notdef
-
-/* All this is going away in early 1989 */
-int newVLDB;                   /* Compatibility flag */
-
-#endif
-static int newVLDB = 1;
-
-
-#ifndef AFS_PTHREAD_ENV
-#define USUAL_PRIORITY (LWP_MAX_PRIORITY - 2)
-
-/*
- * stack size increased from 8K because the HP machine seemed to have trouble
- * with the smaller stack
- */
-#define USUAL_STACK_SIZE       (24 * 1024)
-#endif /* !AFS_PTHREAD_ENV */
-
-/*
-   fsync.c
-   File server synchronization with external volume utilities.
- */
-
-/* This controls the size of an fd_set; it must be defined early before
- * the system headers define that type and the macros that operate on it.
- * Its value should be as large as the maximum file descriptor limit we
- * are likely to run into on any platform.  Right now, that is 65536
- * which is the default hard fd limit on Solaris 9 */
-#ifndef _WIN32
-#define FD_SETSIZE 65536
-#endif
-
-#include <afsconfig.h>
-#include <afs/param.h>
-
-RCSID
-    ("$Header$");
-
-#include <sys/types.h>
-#include <stdio.h>
-#ifdef AFS_NT40_ENV
-#include <winsock2.h>
-#include <time.h>
-#else
-#include <sys/param.h>
-#include <sys/socket.h>
-#include <netinet/in.h>
-#include <netdb.h>
-#include <sys/time.h>
-#endif
-#include <errno.h>
-#ifdef AFS_PTHREAD_ENV
-#include <assert.h>
-#else /* AFS_PTHREAD_ENV */
-#include <afs/assert.h>
-#endif /* AFS_PTHREAD_ENV */
-#include <signal.h>
-
-#ifdef HAVE_STRING_H
-#include <string.h>
-#else
-#ifdef HAVE_STRINGS_H
-#include <strings.h>
-#endif
-#endif
-
-
-#include <rx/xdr.h>
-#include <afs/afsint.h>
-#include "nfs.h"
-#include <afs/errors.h>
-#include "fssync.h"
-#include "lwp.h"
-#include "lock.h"
-#include <afs/afssyscalls.h>
-#include "ihandle.h"
-#include "vnode.h"
-#include "volume.h"
-#include "partition.h"
-
-/*@printflike@*/ extern void Log(const char *format, ...);
-
-#ifdef osi_Assert
-#undef osi_Assert
-#endif
-#define osi_Assert(e) (void)(e)
-
-int (*V_BreakVolumeCallbacks) ();
-
-#define MAXHANDLERS    4       /* Up to 4 clients; must be at least 2, so that
-                                * move = dump+restore can run on single server */
-#define MAXOFFLINEVOLUMES 128  /* This needs to be as big as the maximum
-                                * number that would be offline for 1 operation.
-                                * Current winner is salvage, which needs all
-                                * cloned read-only copies offline when salvaging
-                                * a single read-write volume */
-
-#define MAX_BIND_TRIES 5       /* Number of times to retry socket bind */
-
-
-struct offlineInfo {
-    VolumeId volumeID;
-    char partName[16];
-};
-
-static struct offlineInfo OfflineVolumes[MAXHANDLERS][MAXOFFLINEVOLUMES];
-
-static FS_sd = -1;             /* Client socket for talking to file server */
-static AcceptSd = -1;          /* Socket used by server for accepting connections */
-
-static int getport();
-
-struct command {
-    bit32 command;
-    bit32 reason;
-    VolumeId volume;
-    char partName[16];         /* partition name, e.g. /vicepa */
-};
-
-/* Forward declarations */
-static void FSYNC_sync();
-static void FSYNC_newconnection();
-static void FSYNC_com();
-static void FSYNC_Drop();
-static void AcceptOn();
-static void AcceptOff();
-static void InitHandler();
-static void CallHandler(fd_set * fdsetp);
-static int AddHandler();
-static int FindHandler();
-static int FindHandler_r();
-static int RemoveHandler();
-static void GetHandler(fd_set * fdsetp, int *maxfdp);
-
-extern int LogLevel;
-
-/*
- * This lock controls access to the handler array. The overhead
- * is minimal in non-preemptive environments.
- */
-struct Lock FSYNC_handler_lock;
-
-int
-FSYNC_clientInit(void)
-{
-    struct sockaddr_in addr;
-    /* I can't believe the following is needed for localhost connections!! */
-    static time_t backoff[] =
-       { 3, 3, 3, 5, 5, 5, 7, 15, 16, 24, 32, 40, 48, 0 };
-    time_t *timeout = &backoff[0];
-
-    for (;;) {
-       FS_sd = getport(&addr);
-       if (connect(FS_sd, (struct sockaddr *)&addr, sizeof(addr)) >= 0)
-           return 1;
-       if (!*timeout)
-           break;
-       if (!(*timeout & 1))
-           Log("FSYNC_clientInit temporary failure (will retry)");
-       FSYNC_clientFinis();
-       sleep(*timeout++);
-    }
-    perror("FSYNC_clientInit failed (giving up!)");
-    return 0;
-}
-
-void
-FSYNC_clientFinis(void)
-{
-#ifdef AFS_NT40_ENV
-    closesocket(FS_sd);
-#else
-    close(FS_sd);
-#endif
-    FS_sd = -1;
-}
-
-int
-FSYNC_askfs(VolumeId volume, char *partName, int com, int reason)
-{
-    byte response;
-    struct command command;
-    int n;
-    command.volume = volume;
-    command.command = com;
-    command.reason = reason;
-    if (partName)
-       strcpy(command.partName, partName);
-    else
-       command.partName[0] = 0;
-    assert(FS_sd != -1);
-    VFSYNC_LOCK;
-#ifdef AFS_NT40_ENV
-    if (send(FS_sd, (char *)&command, sizeof(command), 0) != sizeof(command)) {
-       printf("FSYNC_askfs: write to file server failed\n");
-       response = FSYNC_DENIED;
-       goto done;
-    }
-    while ((n = recv(FS_sd, &response, 1, 0)) != 1) {
-       if (n == 0 || WSAEINTR != WSAGetLastError()) {
-           printf("FSYNC_askfs: No response from file server\n");
-           response = FSYNC_DENIED;
-           goto done;
-       }
-    }
-#else
-    if (write(FS_sd, &command, sizeof(command)) != sizeof(command)) {
-       printf("FSYNC_askfs: write to file server failed\n");
-       response = FSYNC_DENIED;
-       goto done;
-    }
-    while ((n = read(FS_sd, &response, 1)) != 1) {
-       if (n == 0 || errno != EINTR) {
-           printf("FSYNC_askfs: No response from file server\n");
-           response = FSYNC_DENIED;
-           goto done;
-       }
-    }
-#endif
-    if (response == 0) {
-       printf
-           ("FSYNC_askfs: negative response from file server; volume %u, command %d\n",
-            command.volume, (int)command.command);
-    }
-  done:
-    VFSYNC_UNLOCK;
-    return response;
-}
-
-void
-FSYNC_fsInit(void)
-{
-#ifdef AFS_PTHREAD_ENV
-    pthread_t tid;
-    pthread_attr_t tattr;
-    assert(pthread_attr_init(&tattr) == 0);
-    assert(pthread_attr_setdetachstate(&tattr, PTHREAD_CREATE_DETACHED) == 0);
-    assert(pthread_create(&tid, &tattr, FSYNC_sync, NULL) == 0);
-#else /* AFS_PTHREAD_ENV */
-    PROCESS pid;
-    assert(LWP_CreateProcess
-          (FSYNC_sync, USUAL_STACK_SIZE, USUAL_PRIORITY, (void *)0,
-           "FSYNC_sync", &pid) == LWP_SUCCESS);
-#endif /* AFS_PTHREAD_ENV */
-}
-
-static int
-getport(struct sockaddr_in *addr)
-{
-    int sd;
-
-    memset(addr, 0, sizeof(*addr));
-    assert((sd = socket(AF_INET, SOCK_STREAM, 0)) >= 0);
-#ifdef STRUCT_SOCKADDR_HAS_SA_LEN
-    addr->sin_len = sizeof(struct sockaddr_in);
-#endif
-    addr->sin_addr.s_addr = htonl(0x7f000001);
-    addr->sin_family = AF_INET;        /* was localhost->h_addrtype */
-    addr->sin_port = htons(2040);      /* XXXX htons not _really_ neccessary */
-
-    return sd;
-}
-
-static fd_set FSYNC_readfds;
-
-static void
-FSYNC_sync()
-{
-    struct sockaddr_in addr;
-    int on = 1;
-    extern VInit;
-    int code;
-    int numTries;
-#ifdef AFS_PTHREAD_ENV
-    int tid;
-#endif
-
-#ifndef AFS_NT40_ENV
-    (void)signal(SIGPIPE, SIG_IGN);
-#endif
-
-#ifdef AFS_PTHREAD_ENV
-    /* set our 'thread-id' so that the host hold table works */
-    MUTEX_ENTER(&rx_stats_mutex);      /* protects rxi_pthread_hinum */
-    tid = ++rxi_pthread_hinum;
-    MUTEX_EXIT(&rx_stats_mutex);
-    pthread_setspecific(rx_thread_id_key, (void *)tid);
-    Log("Set thread id %d for FSYNC_sync\n", tid);
-#endif /* AFS_PTHREAD_ENV */
-
-    while (!VInit) {
-       /* Let somebody else run until level > 0.  That doesn't mean that 
-        * all volumes have been attached. */
-#ifdef AFS_PTHREAD_ENV
-       pthread_yield();
-#else /* AFS_PTHREAD_ENV */
-       LWP_DispatchProcess();
-#endif /* AFS_PTHREAD_ENV */
-    }
-    AcceptSd = getport(&addr);
-    /* Reuseaddr needed because system inexplicably leaves crud lying around */
-    code =
-       setsockopt(AcceptSd, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
-                  sizeof(on));
-    if (code)
-       Log("FSYNC_sync: setsockopt failed with (%d)\n", errno);
-
-    for (numTries = 0; numTries < MAX_BIND_TRIES; numTries++) {
-       if ((code =
-            bind(AcceptSd, (struct sockaddr *)&addr, sizeof(addr))) == 0)
-           break;
-       Log("FSYNC_sync: bind failed with (%d), will sleep and retry\n",
-           errno);
-       sleep(5);
-    }
-    assert(!code);
-    listen(AcceptSd, 100);
-    InitHandler();
-    AcceptOn();
-    for (;;) {
-       int maxfd;
-       GetHandler(&FSYNC_readfds, &maxfd);
-       /* Note: check for >= 1 below is essential since IOMGR_select
-        * doesn't have exactly same semantics as select.
-        */
-#ifdef AFS_PTHREAD_ENV
-       if (select(maxfd + 1, &FSYNC_readfds, NULL, NULL, NULL) >= 1)
-#else /* AFS_PTHREAD_ENV */
-       if (IOMGR_Select(maxfd + 1, &FSYNC_readfds, NULL, NULL, NULL) >= 1)
-#endif /* AFS_PTHREAD_ENV */
-           CallHandler(&FSYNC_readfds);
-    }
-}
-
-static void
-FSYNC_newconnection(int afd)
-{
-    struct sockaddr_in other;
-    int junk, fd;
-    junk = sizeof(other);
-    fd = accept(afd, (struct sockaddr *)&other, &junk);
-    if (fd == -1) {
-       Log("FSYNC_newconnection:  accept failed, errno==%d\n", errno);
-       assert(1 == 2);
-    } else if (!AddHandler(fd, FSYNC_com)) {
-       AcceptOff();
-       assert(AddHandler(fd, FSYNC_com));
-    }
-}
-
-/*
-#define TEST2081
-*/
-
-afs_int32 FS_cnt = 0;
-static void
-FSYNC_com(int fd)
-{
-    byte rc = FSYNC_OK;
-    int n, i;
-    Error error;
-    struct command command;
-    int leaveonline;
-    register struct offlineInfo *volumes, *v;
-    Volume *vp;
-    char tvolName[VMAXPATHLEN];
-
-    FS_cnt++;
-#ifdef AFS_NT40_ENV
-    n = recv(fd, &command, sizeof(command), 0);
-#else
-    n = read(fd, &command, sizeof(command));
-#endif
-    if (n <= 0) {
-       FSYNC_Drop(fd);
-       return;
-    }
-    if (n < sizeof(command)) {
-       Log("FSYNC_com:  partial read (%d instead of %d); dropping connection (cnt=%d)\n", n, sizeof(command), FS_cnt);
-       FSYNC_Drop(fd);
-       return;
-    }
-    VATTACH_LOCK;
-    VOL_LOCK;
-    volumes = OfflineVolumes[FindHandler(fd)];
-    for (v = 0, i = 0; i < MAXOFFLINEVOLUMES; i++) {
-       if (volumes[i].volumeID == command.volume
-           && strcmp(volumes[i].partName, command.partName) == 0) {
-           v = &volumes[i];
-           break;
-       }
-    }
-    switch (command.command) {
-    case FSYNC_DONE:
-       /* don't try to put online, this call is made only after deleting
-        * a volume, in which case we want to remove the vol # from the
-        * OfflineVolumes array only */
-       if (v)
-           v->volumeID = 0;
-       break;
-    case FSYNC_ON:
-
-/*
-This is where a detatched volume gets reattached. However in the
-special case where the volume is merely busy, it is already
-attatched and it is only necessary to clear the busy flag. See
-defect #2080 for details.
-*/
-
-       /* is the volume already attatched? */
-#ifdef notdef
-/*
- * XXX With the following enabled we had bizarre problems where the backup id would
- * be reset to 0; that was due to the interaction between fileserver/volserver in that they
- * both keep volumes in memory and the changes wouldn't be made to the fileserver. Some of
- * the problems were due to refcnt changes as result of VGetVolume/VPutVolume which would call
- * VOffline, etc. when we don't want to; someday the whole #2080 issue should be revisited to
- * be done right XXX
- */
-       vp = VGetVolume_r(&error, command.volume);
-       if (vp) {
-           /* yep, is the BUSY flag set? */
-           if (vp->specialStatus == VBUSY) {
-/* test harness for defect #2081 */
-
-#ifdef TEST2081
-               /*
-                * test #2081 by releasing TEST.2081,
-                * so leave it alone here, zap it after
-                */
-
-               if (strcmp(vp->header->diskstuff.name, "TEST.2081") == 0)
-                   break;
-#endif
-               /* yep, clear BUSY flag */
-
-               vp->specialStatus = 0;
-               /* make sure vol is online */
-               if (v) {
-                   v->volumeID = 0;
-                   V_inUse(vp) = 1;    /* online */
-               }
-               VPutVolume_r(vp);
-               break;
-           }
-           VPutVolume_r(vp);
-       }
-#endif
-
-       /* so, we need to attach the volume */
-
-       if (v)
-           v->volumeID = 0;
-       tvolName[0] = '/';
-       sprintf(&tvolName[1], VFORMAT, command.volume);
-
-       vp = VAttachVolumeByName_r(&error, command.partName, tvolName,
-                                  V_VOLUPD);
-       if (vp)
-           VPutVolume_r(vp);
-       break;
-    case FSYNC_OFF:
-    case FSYNC_NEEDVOLUME:{
-           leaveonline = 0;
-           /* not already offline, we need to find a slot for newly offline volume */
-           if (!v) {
-               for (i = 0; i < MAXOFFLINEVOLUMES; i++) {
-                   if (volumes[i].volumeID == 0) {
-                       v = &volumes[i];
-                       break;
-                   }
-               }
-           }
-           if (!v) {
-               rc = FSYNC_DENIED;
-               break;
-           }
-           vp = VGetVolume_r(&error, command.volume);
-           if (vp) {
-               if (command.partName[0] != 0
-                   && strcmp(command.partName, vp->partition->name) != 0) {
-                   /* volume on desired partition is not online, so we
-                    * should treat this as an offline volume.
-                    */
-                   VPutVolume_r(vp);
-                   vp = (Volume *) 0;
-               }
-           }
-           if (vp) {
-               leaveonline = (command.command == FSYNC_NEEDVOLUME
-                              && (command.reason == V_READONLY
-                                  || (!VolumeWriteable(vp)
-                                      && (command.reason == V_CLONE
-                                          || command.reason == V_DUMP))
-                              )
-                   );
-               if (!leaveonline) {
-                   if (command.command == FSYNC_NEEDVOLUME
-                       && (command.reason == V_CLONE
-                           || command.reason == V_DUMP)) {
-                       vp->specialStatus = VBUSY;
-                   }
-                   /* remember what volume we got, so we can keep track of how
-                    * many volumes the volserver or whatever is using.  Note that
-                    * vp is valid since leaveonline is only set when vp is valid.
-                    */
-                   v->volumeID = command.volume;
-                   strcpy(v->partName, vp->partition->name);
-                   if (!V_inUse(vp)) {
-                       /* in this case, VOffline just returns sans decrementing
-                        * ref count.  We could try to fix it, but it has lots of
-                        * weird callers.
-                        */
-                       VPutVolume_r(vp);
-                   } else {
-                       VOffline_r(vp, "A volume utility is running.");
-                   }
-                   vp = 0;
-               } else {
-                   VUpdateVolume_r(&error, vp);        /* At least get volume stats right */
-                   if (LogLevel) {
-                       Log("FSYNC: Volume %u (%s) was left on line for an external %s request\n", V_id(vp), V_name(vp), command.reason == V_CLONE ? "clone" : command.reason == V_READONLY ? "readonly" : command.reason == V_DUMP ? "dump" : "UNKNOWN");
-                   }
-               }
-               if (vp)
-                   VPutVolume_r(vp);
-           }
-           rc = FSYNC_OK;
-           break;
-       }
-    case FSYNC_MOVEVOLUME:
-       /* Yuch:  the "reason" for the move is the site it got moved to... */
-       /* still set specialStatus so we stop sending back VBUSY.
-        * also should still break callbacks.  Note that I don't know
-        * how to tell if we should break all or not, so we just do it
-        * since it doesn't matter much if we do an extra break
-        * volume callbacks on a volume move within the same server */
-       vp = VGetVolume_r(&error, command.volume);
-       if (vp) {
-           vp->specialStatus = VMOVED;
-           VPutVolume_r(vp);
-       }
-
-       if (V_BreakVolumeCallbacks) {
-           Log("fssync: volume %u moved to %x; breaking all call backs\n",
-               command.volume, command.reason);
-           VOL_UNLOCK;
-           VATTACH_UNLOCK;
-           (*V_BreakVolumeCallbacks) (command.volume);
-           VATTACH_LOCK;
-           VOL_LOCK;
-       }
-       break;
-    case FSYNC_RESTOREVOLUME:
-       /* if the volume is being restored, break all callbacks on it */
-       if (V_BreakVolumeCallbacks) {
-           Log("fssync: volume %u restored; breaking all call backs\n",
-               command.volume);
-           VOL_UNLOCK;
-           VATTACH_UNLOCK;
-           (*V_BreakVolumeCallbacks) (command.volume);
-           VATTACH_LOCK;
-           VOL_LOCK;
-       }
-       break;
-    default:
-       rc = FSYNC_DENIED;
-       break;
-    }
-    VOL_UNLOCK;
-    VATTACH_UNLOCK;
-#ifdef AFS_NT40_ENV
-    (void)send(fd, &rc, 1, 0);
-#else
-    (void)write(fd, &rc, 1);
-#endif
-}
-
-static void
-FSYNC_Drop(int fd)
-{
-    struct offlineInfo *p;
-    register i;
-    Error error;
-    char tvolName[VMAXPATHLEN];
-
-    VATTACH_LOCK;
-    VOL_LOCK;
-    p = OfflineVolumes[FindHandler(fd)];
-    for (i = 0; i < MAXOFFLINEVOLUMES; i++) {
-       if (p[i].volumeID) {
-           Volume *vp;
-
-           tvolName[0] = '/';
-           sprintf(&tvolName[1], VFORMAT, p[i].volumeID);
-           vp = VAttachVolumeByName_r(&error, p[i].partName, tvolName,
-                                      V_VOLUPD);
-           if (vp)
-               VPutVolume_r(vp);
-           p[i].volumeID = 0;
-       }
-    }
-    VOL_UNLOCK;
-    VATTACH_UNLOCK;
-    RemoveHandler(fd);
-#ifdef AFS_NT40_ENV
-    closesocket(fd);
-#else
-    close(fd);
-#endif
-    AcceptOn();
-}
-
-static int AcceptHandler = -1; /* handler id for accept, if turned on */
-
-static void
-AcceptOn()
-{
-    if (AcceptHandler == -1) {
-       assert(AddHandler(AcceptSd, FSYNC_newconnection));
-       AcceptHandler = FindHandler(AcceptSd);
-    }
-}
-
-static void
-AcceptOff()
-{
-    if (AcceptHandler != -1) {
-       assert(RemoveHandler(AcceptSd));
-       AcceptHandler = -1;
-    }
-}
-
-/* The multiple FD handling code. */
-
-static int HandlerFD[MAXHANDLERS];
-static int (*HandlerProc[MAXHANDLERS]) ();
-
-static void
-InitHandler()
-{
-    register int i;
-    ObtainWriteLock(&FSYNC_handler_lock);
-    for (i = 0; i < MAXHANDLERS; i++) {
-       HandlerFD[i] = -1;
-       HandlerProc[i] = 0;
-    }
-    ReleaseWriteLock(&FSYNC_handler_lock);
-}
-
-static void
-CallHandler(fd_set * fdsetp)
-{
-    register int i;
-    ObtainReadLock(&FSYNC_handler_lock);
-    for (i = 0; i < MAXHANDLERS; i++) {
-       if (HandlerFD[i] >= 0 && FD_ISSET(HandlerFD[i], fdsetp)) {
-           ReleaseReadLock(&FSYNC_handler_lock);
-           (*HandlerProc[i]) (HandlerFD[i]);
-           ObtainReadLock(&FSYNC_handler_lock);
-       }
-    }
-    ReleaseReadLock(&FSYNC_handler_lock);
-}
-
-static int
-AddHandler(int afd, int (*aproc) ())
-{
-    register int i;
-    ObtainWriteLock(&FSYNC_handler_lock);
-    for (i = 0; i < MAXHANDLERS; i++)
-       if (HandlerFD[i] == -1)
-           break;
-    if (i >= MAXHANDLERS) {
-       ReleaseWriteLock(&FSYNC_handler_lock);
-       return 0;
-    }
-    HandlerFD[i] = afd;
-    HandlerProc[i] = aproc;
-    ReleaseWriteLock(&FSYNC_handler_lock);
-    return 1;
-}
-
-static int
-FindHandler(register int afd)
-{
-    register int i;
-    ObtainReadLock(&FSYNC_handler_lock);
-    for (i = 0; i < MAXHANDLERS; i++)
-       if (HandlerFD[i] == afd) {
-           ReleaseReadLock(&FSYNC_handler_lock);
-           return i;
-       }
-    ReleaseReadLock(&FSYNC_handler_lock);      /* just in case */
-    assert(1 == 2);
-    return -1;                 /* satisfy compiler */
-}
-
-static int
-FindHandler_r(register int afd)
-{
-    register int i;
-    for (i = 0; i < MAXHANDLERS; i++)
-       if (HandlerFD[i] == afd) {
-           return i;
-       }
-    assert(1 == 2);
-    return -1;                 /* satisfy compiler */
-}
-
-static int
-RemoveHandler(register int afd)
-{
-    ObtainWriteLock(&FSYNC_handler_lock);
-    HandlerFD[FindHandler_r(afd)] = -1;
-    ReleaseWriteLock(&FSYNC_handler_lock);
-    return 1;
-}
-
-static void
-GetHandler(fd_set * fdsetp, int *maxfdp)
-{
-    register int i;
-    register int maxfd = -1;
-    FD_ZERO(fdsetp);
-    ObtainReadLock(&FSYNC_handler_lock);       /* just in case */
-    for (i = 0; i < MAXHANDLERS; i++)
-       if (HandlerFD[i] != -1) {
-           FD_SET(HandlerFD[i], fdsetp);
-           if (maxfd < HandlerFD[i])
-               maxfd = HandlerFD[i];
-       }
-    *maxfdp = maxfd;
-    ReleaseReadLock(&FSYNC_handler_lock);      /* just in case */
-}
diff --git a/src/vol/fssync.h b/src/vol/fssync.h

index af5ab02c7105e2d7ab31e633b6e622c9826a180f..873b274970c1da0c0f25693683d0b5acb1e57f06 100644 (file)
--- a/src/vol/fssync.h
+++ b/src/vol/fssync.h
@@ -5,6 +5,8 @@
   * This software has been released under the terms of the IBM Public
   * License.  For details, see the LICENSE file in the top-level source
   * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
   */
  
  /*
@@ -14,38 +16,117 @@
  
   */
  
+#ifndef __fssync_h_
+#define __fssync_h_
  
-/* FSYNC commands */
  
-#define FSYNC_ON               1       /* Volume online */
-#define FSYNC_OFF              2       /* Volume offline */
-#define FSYNC_LISTVOLUMES      3       /* Update local volume list */
-#define FSYNC_NEEDVOLUME       4       /* Put volume in whatever mode (offline, or whatever)
-                                        * best fits the attachment mode provided in reason */
-#define FSYNC_MOVEVOLUME       5       /* Generate temporary relocation information
-                                        * for this volume to another site, to be used
-                                        * if this volume disappears */
-#define        FSYNC_RESTOREVOLUME     6       /* Break all the callbacks on this volume since                                   it is being restored */
-#define FSYNC_DONE             7       /* Done with this volume (used after a delete).
-                                        * Don't put online, but remove from list */
+#define FSYNC_PROTO_VERSION     2
  
  
-/* Reasons (these could be communicated to venus or converted to messages) */
+/* FSYNC command codes */
+#define FSYNC_VOL_ON           SYNC_COM_CODE_DECL(0)   /* Volume online */
+#define FSYNC_VOL_OFF          SYNC_COM_CODE_DECL(1)   /* Volume offline */
+#define FSYNC_VOL_LISTVOLUMES  SYNC_COM_CODE_DECL(2)   /* Update local volume list */
+#define FSYNC_VOL_NEEDVOLUME   SYNC_COM_CODE_DECL(3)   /* Put volume in whatever mode (offline, or whatever)
+                                                        * best fits the attachment mode provided in reason */
+#define FSYNC_VOL_MOVE         SYNC_COM_CODE_DECL(4)   /* Generate temporary relocation information
+                                                        * for this volume to another site, to be used
+                                                        * if this volume disappears */
+#define        FSYNC_VOL_BREAKCBKS     SYNC_COM_CODE_DECL(5)   /* Break all the callbacks on this volume */
+#define FSYNC_VOL_DONE         SYNC_COM_CODE_DECL(6)   /* Done with this volume (used after a delete).
+                                                        * Don't put online, but remove from list */
+#define FSYNC_VOL_QUERY         SYNC_COM_CODE_DECL(7)   /* query the volume state */
+#define FSYNC_VOL_QUERY_HDR     SYNC_COM_CODE_DECL(8)   /* query the volume disk data structure */
+#define FSYNC_VOL_QUERY_VOP     SYNC_COM_CODE_DECL(9)   /* query the volume for pending vol op info */
+#define FSYNC_VOL_STATS_GENERAL SYNC_COM_CODE_DECL(10)  /* query the general volume package statistics */
+#define FSYNC_VOL_STATS_VICEP   SYNC_COM_CODE_DECL(11)  /* query the per-partition volume package stats */
+#define FSYNC_VOL_STATS_HASH    SYNC_COM_CODE_DECL(12)  /* query the per hash-chain volume package stats */
+#define FSYNC_VOL_STATS_HDR     SYNC_COM_CODE_DECL(13)  /* query the volume header cache statistics */
+#define FSYNC_VOL_STATS_VLRU    SYNC_COM_CODE_DECL(14)  /* query the VLRU statistics */
  
-#define FSYNC_WHATEVER         0       /* XXXX */
-#define FSYNC_SALVAGE          1       /* volume is being salvaged */
-#define FSYNC_MOVE             2       /* volume is being moved */
-#define FSYNC_OPERATOR         3       /* operator forced volume offline */
+/* FSYNC reason codes */
+#define FSYNC_WHATEVER         SYNC_REASON_CODE_DECL(0)  /* XXXX */
+#define FSYNC_SALVAGE          SYNC_REASON_CODE_DECL(1)  /* volume is being salvaged */
+#define FSYNC_MOVE             SYNC_REASON_CODE_DECL(2)  /* volume is being moved */
+#define FSYNC_OPERATOR         SYNC_REASON_CODE_DECL(3)  /* operator forced volume offline */
+#define FSYNC_EXCLUSIVE         SYNC_REASON_CODE_DECL(4)  /* somebody else has the volume offline */
+#define FSYNC_UNKNOWN_VOLID     SYNC_REASON_CODE_DECL(5)  /* volume id not known by fileserver */
+#define FSYNC_HDR_NOT_ATTACHED  SYNC_REASON_CODE_DECL(6)  /* volume header not currently attached */
+#define FSYNC_NO_PENDING_VOL_OP SYNC_REASON_CODE_DECL(7)  /* no volume operation pending */
+#define FSYNC_VOL_PKG_ERROR     SYNC_REASON_CODE_DECL(8)  /* error in the volume package */
  
+/* FSYNC response codes */
  
-/* Replies (1 byte) */
+/* FSYNC flag codes */
  
-#define FSYNC_DENIED           0
-#define FSYNC_OK               1
  
  
-/* Prototypes from fssync.c */
-void FSYNC_clientFinis(void);
-int FSYNC_clientInit(void);
-void FSYNC_fsInit(void);
-int FSYNC_askfs(VolumeId volume, char *partName, int com, int reason);
+struct offlineInfo {
+    afs_uint32 volumeID;
+    char partName[16];
+};
+
+typedef struct FSSYNC_VolOp_hdr {
+    afs_uint32 volume;          /* volume id associated with request */
+    char partName[16];         /* partition name, e.g. /vicepa */
+} FSSYNC_VolOp_hdr;
+
+typedef struct FSSYNC_VolOp_command {
+    SYNC_command_hdr * hdr;
+    FSSYNC_VolOp_hdr * vop;
+    SYNC_command * com;
+    struct offlineInfo * v;
+    struct offlineInfo * volumes;
+} FSSYNC_VolOp_command;
+
+typedef struct FSSYNC_VolOp_info {
+    SYNC_command_hdr com;
+    FSSYNC_VolOp_hdr vop;
+} FSSYNC_VolOp_info;
+
+
+typedef struct FSSYNC_StatsOp_hdr {
+    union {
+       afs_uint32 vlru_generation;
+       afs_uint32 hash_bucket;
+       char partName[16];
+    } args;
+} FSSYNC_StatsOp_hdr;
+
+typedef struct FSSYNC_StatsOp_command {
+    SYNC_command_hdr * hdr;
+    FSSYNC_StatsOp_hdr * sop;
+    SYNC_command * com;
+} FSSYNC_StatsOp_command;
+
+
+
+/*
+ * common interfaces
+ */
+extern void FSYNC_Init(void);
+
+/* 
+ * fsync client interfaces 
+ */
+extern void FSYNC_clientFinis(void);
+extern int FSYNC_clientInit(void);
+extern int FSYNC_clientChildProcReconnect(void);
+
+/* generic low-level interface */
+extern afs_int32 FSYNC_askfs(SYNC_command * com, SYNC_response * res);
+
+/* generic higher-level interface */
+extern afs_int32 FSYNC_GenericOp(void * ext_hdr, size_t ext_len,
+                                int command, int reason,
+                                SYNC_response * res);
+
+/* volume operations interface */
+extern afs_int32 FSYNC_VolOp(VolumeId volume, char *partName, int com, int reason, 
+                            SYNC_response * res);
+
+/* statistics query interface */
+extern afs_int32 FSYNC_StatsOp(FSSYNC_StatsOp_hdr * scom, int command, int reason,
+                              SYNC_response * res_in);
+
+#endif /* __fssync_h_ */
diff --git a/src/vol/nuke.c b/src/vol/nuke.c

index f787b5ae39783419d2b1daf11cf959735e6bd5f5..5b52e46a06b42c3a459f7a0767a967eae794781f 100644 (file)
--- a/src/vol/nuke.c
+++ b/src/vol/nuke.c
@@ -41,6 +41,7 @@ RCSID
  #include "partition.h"
  #include "viceinode.h"
  #include "salvage.h"
+#include "daemon_com.h"
  #include "fssync.h"
  
  #ifdef O_LARGEFILE
diff --git a/src/vol/partition.c b/src/vol/partition.c

index f8aa3a81dd59dcb4d803c02164b42fcf37f18b87..9eea9f577d36edadaf88f674b04751d74f2d581b 100644 (file)
--- a/src/vol/partition.c
+++ b/src/vol/partition.c
@@ -7,6 +7,7 @@
   * directory or online at http://www.openafs.org/dl/license10.html
   *
   * Portions Copyright (c) 2003 Apple Computer, Inc.
+ * Portions Copyright (c) 2006 Sine Nomine Associates
   */
  
  /*
@@ -189,6 +190,14 @@ RCSID
  int aixlow_water = 8;          /* default 8% */
  struct DiskPartition *DiskPartitionList;
  
+#ifdef AFS_DEMAND_ATTACH_FS
+static struct DiskPartition *DiskPartitionTable[VOLMAXPARTS+1];
+
+static struct DiskPartition * VLookupPartition_r(char * path);
+static void AddPartitionToTable_r(struct DiskPartition *);
+static void DeletePartitionFromTable_r(struct DiskPartition *);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
  #ifdef AFS_SGI_XFS_IOPS_ENV
  /* Verify that the on disk XFS inodes on the partition are large enough to
   * hold the AFS attribute. Returns -1 if the attribute can't be set or is
@@ -225,8 +234,16 @@ VerifyXFSInodeSize(char *part, char *fstype)
      }
      return code;
  }
-#endif
+#endif /* AFS_SGI_XFS_IOPS_ENV */
  
+int                            /* one-time init of partition package state; always returns 0 */
+VInitPartitionPackage(void)
+{
+#ifdef AFS_DEMAND_ATTACH_FS     /* must match the guard under which DiskPartitionTable is declared */
+    memset(&DiskPartitionTable, 0, sizeof(DiskPartitionTable));        /* no partitions registered yet */
+#endif /* AFS_DEMAND_ATTACH_FS */
+    return 0;
+}
  
  static void
  VInitPartition_r(char *path, char *devname, Device dev)
@@ -245,6 +262,7 @@ VInitPartition_r(char *path, char *devname, Device dev)
      dp->next = 0;
      dp->name = (char *)malloc(strlen(path) + 1);
      strncpy(dp->name, path, strlen(path) + 1);
+    dp->index = volutil_GetPartitionID(path);
  #if defined(AFS_NAMEI_ENV) && !defined(AFS_NT40_ENV)
      /* Create a lockfile for the partition, of the form /vicepa/Lock/vicepa */
      dp->devName = (char *)malloc(2 * strlen(path) + 6);
@@ -254,7 +272,7 @@ VInitPartition_r(char *path, char *devname, Device dev)
      mkdir(dp->devName, 0700);
      strcat(dp->devName, path);
      close(afs_open(dp->devName, O_RDWR | O_CREAT, 0600));
-    dp->device = volutil_GetPartitionID(path);
+    dp->device = dp->index;
  #else
      dp->devName = (char *)malloc(strlen(devname) + 1);
      strncpy(dp->devName, devname, strlen(devname) + 1);
@@ -268,6 +286,11 @@ VInitPartition_r(char *path, char *devname, Device dev)
         (void)namei_ViceREADME(VPartitionPath(dp));
  #endif
      VSetPartitionDiskUsage_r(dp);
+#ifdef AFS_DEMAND_ATTACH_FS
+    AddPartitionToTable_r(dp);
+    queue_Init(&dp->vol_list);
+    assert(pthread_cond_init(&dp->vol_list.cv, NULL) == 0);
+#endif /* AFS_DEMAND_ATTACH_FS */
  }
  
  static void
@@ -352,7 +375,7 @@ VCheckPartition(char *part, char *devname)
         return -1;
  #endif
  #endif /* AFS_NAMEI_ENV */
-#endif
+#endif /* !AFS_LINUX20_ENV && !AFS_NT40_ENV */
  
  #if defined(AFS_DUX40_ENV) && !defined(AFS_NAMEI_ENV)
      if (status.st_ino != ROOTINO) {
@@ -825,10 +848,14 @@ struct DiskPartition *
  VGetPartition_r(char *name, int abortp)
  {
      register struct DiskPartition *dp;
+#ifdef AFS_DEMAND_ATTACH_FS
+    dp = VLookupPartition_r(name);
+#else /* AFS_DEMAND_ATTACH_FS */
      for (dp = DiskPartitionList; dp; dp = dp->next) {
         if (strcmp(dp->name, name) == 0)
             break;
      }
+#endif /* AFS_DEMAND_ATTACH_FS */
      if (abortp)
         assert(dp != NULL);
      return dp;
@@ -1234,3 +1261,60 @@ VUnlockPartition(char *name)
      VUnlockPartition_r(name);
      VOL_UNLOCK;
  }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* XXX not sure this will work on AFS_NT40_ENV
+ * needs to be tested!
+ */
+struct DiskPartition * 
+VGetPartitionById_r(afs_int32 id, int abortp)
+{
+    struct DiskPartition * dp = NULL;
+
+    if ((id >= 0) && (id <= VOLMAXPARTS)) {
+       dp = DiskPartitionTable[id];
+    }
+
+    if (abortp) {
+       assert(dp != NULL);
+    }
+    return dp;
+}
+
+struct DiskPartition *         /* locking wrapper around VGetPartitionById_r */
+VGetPartitionById(afs_int32 id, int abortp)
+{
+    struct DiskPartition * dp; /* result of the lookup; NULL is passed through to the caller */
+
+    VOL_LOCK;
+    dp = VGetPartitionById_r(id, abortp);      /* _r variant does the range check and optional assert */
+    VOL_UNLOCK;
+
+    return dp;
+}
+
+static struct DiskPartition * 
+VLookupPartition_r(char * path)
+{
+    afs_int32 id = volutil_GetPartitionID(path);
+
+    if (id < 0 || id > VOLMAXPARTS)
+       return NULL;
+
+    return DiskPartitionTable[id];
+}
+
+static void 
+AddPartitionToTable_r(struct DiskPartition * dp)
+{
+    assert(dp->index >= 0 && dp->index <= VOLMAXPARTS);
+    DiskPartitionTable[dp->index] = dp;
+}
+
+static void 
+DeletePartitionFromTable_r(struct DiskPartition * dp)
+{
+    assert(dp->index >= 0 && dp->index <= VOLMAXPARTS);
+    DiskPartitionTable[dp->index] = NULL;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
diff --git a/src/vol/partition.h b/src/vol/partition.h

index 547ec94c188d688c99079fc9f28d4afb6d3e7445..7d869dfae99d7ef821c472d8a4d00c85dbaf5467 100644 (file)
--- a/src/vol/partition.h
+++ b/src/vol/partition.h
@@ -5,6 +5,8 @@
   * This software has been released under the terms of the IBM Public
   * License.  For details, see the LICENSE file in the top-level source
   * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
   */
  
  /*
@@ -27,6 +29,7 @@
  #define        AFS_RDSKDEV     "/dev/r"
  #endif
  
+
  /* All Vice partitions on a server will have the following name prefix */
  #define VICE_PARTITION_PREFIX  "/vicep"
  #define VICE_PREFIX_SIZE       (sizeof(VICE_PARTITION_PREFIX)-1)
@@ -53,6 +56,7 @@ struct DiskPartition {
      char *name;                        /* Mounted partition name */
      char *devName;             /* Device mounted on */
      Device device;             /* device number */
+    afs_int32 index;            /* partition index (0<=x<=VOLMAXPARTS) */
      int lock_fd;               /* File descriptor of this partition if locked; otherwise -1;
                                  * Not used by the file server */
      int free;                  /* Total number of blocks (1K) presumed
@@ -77,7 +81,26 @@ struct DiskPartition {
                                  * from the superblock */
      int flags;
      int f_files;               /* total number of files in this partition */
+#ifdef AFS_DEMAND_ATTACH_FS
+    struct {
+       struct rx_queue head;   /* list of volumes on this partition (VByPList) */
+       afs_uint32 len;         /* length of volume list */
+       int busy;               /* asynch vol list op in progress */
+       pthread_cond_t cv;      /* vol_list.busy change cond var */
+    } vol_list;
+#endif /* AFS_DEMAND_ATTACH_FS */
+};
+
+struct DiskPartitionStats {
+    afs_int32 free;
+    afs_int32 totalUsable;
+    afs_int32 minFree;
+    afs_int32 f_files;
+#ifdef AFS_DEMAND_ATTACH_FS
+    afs_int32 vol_list_len;
+#endif
  };
+
  #define        PART_DONTUPDATE 1
  #define PART_DUPLICATE  2      /* NT - used if we find more than one partition 
                                  * using the same drive. Will be dumped before
@@ -93,7 +116,12 @@ extern int VValidVPTEntry(struct vptab *vptp);
  struct Volume;                 /* Potentially forward definition */
  
  extern struct DiskPartition *DiskPartitionList;
-extern struct DiskPartition *VGetPartition();
+extern struct DiskPartition *VGetPartition(char * name, int abortp);
+extern struct DiskPartition *VGetPartition_r(char * name, int abortp);
+#ifdef AFS_DEMAND_ATTACH_FS
+extern struct DiskPartition *VGetPartitionById(afs_int32 index, int abortp);
+extern struct DiskPartition *VGetPartitionById_r(afs_int32 index, int abortp);
+#endif
  extern int VAttachPartitions(void);
  extern void VLockPartition(char *name);
  extern void VLockPartition_r(char *name);
@@ -108,3 +136,4 @@ extern void VAdjustDiskUsage(Error * ec, struct Volume *vp,
                              afs_sfsize_t blocks, afs_sfsize_t checkBlocks);
  extern int VDiskUsage(struct Volume *vp, afs_sfsize_t blocks);
  extern void VPrintDiskStats(void);
+extern int VInitPartitionPackage(void);
diff --git a/src/vol/purge.c b/src/vol/purge.c

index 01bb22efa33780022a3d610ace9ea4a38323d327..4b13fcf2bc456a9ff066f524126660d112f27a44 100644 (file)
--- a/src/vol/purge.c
+++ b/src/vol/purge.c
@@ -52,11 +52,16 @@ RCSID
  #include "volume.h"
  #include "viceinode.h"
  #include "partition.h"
+#include "daemon_com.h"
  #include "fssync.h"
  
  /* forward declarations */
-void PurgeIndex_r(Volume * vp, VnodeClass class);
-void PurgeHeader_r(Volume * vp);
+static int ObliterateRegion(Volume * avp, VnodeClass aclass, StreamHandle_t * afile,
+                           afs_int32 * aoffset);
+static void PurgeIndex(Volume * vp, VnodeClass class);
+static void PurgeIndex_r(Volume * vp, VnodeClass class);
+static void PurgeHeader_r(Volume * vp);
+static void PurgeHeader(Volume * vp);
  
  void
  VPurgeVolume_r(Error * ec, Volume * vp)
@@ -78,7 +83,7 @@ VPurgeVolume_r(Error * ec, Volume * vp)
      /*
       * Call the fileserver to break all call backs for that volume
       */
-    FSYNC_askfs(V_id(vp), tpartp->name, FSYNC_RESTOREVOLUME, 0);
+    FSYNC_VolOp(V_id(vp), tpartp->name, FSYNC_VOL_BREAKCBKS, 0, NULL);
  }
  
  void
@@ -161,7 +166,7 @@ ObliterateRegion(Volume * avp, VnodeClass aclass, StreamHandle_t * afile,
      return -1;
  }
  
-void
+static void
  PurgeIndex(Volume * vp, VnodeClass class)
  {
      VOL_LOCK;
@@ -169,7 +174,7 @@ PurgeIndex(Volume * vp, VnodeClass class)
      VOL_UNLOCK;
  }
  
-void
+static void
  PurgeIndex_r(Volume * vp, VnodeClass class)
  {
      StreamHandle_t *ifile;
@@ -199,7 +204,7 @@ PurgeIndex_r(Volume * vp, VnodeClass class)
      FDH_CLOSE(fdP);
  }
  
-void
+static void
  PurgeHeader(Volume * vp)
  {
      VOL_LOCK;
@@ -207,7 +212,7 @@ PurgeHeader(Volume * vp)
      VOL_UNLOCK;
  }
  
-void
+static void
  PurgeHeader_r(Volume * vp)
  {
      IH_REALLYCLOSE(V_diskDataHandle(vp));
diff --git a/src/vol/salvage.h b/src/vol/salvage.h

index a18a24574cd75abc8e5313d715b9d7928a8d96ee..ce5353907000974502db5c709d3baf15cd37a718 100644 (file)
--- a/src/vol/salvage.h
+++ b/src/vol/salvage.h
@@ -14,6 +14,9 @@
  
   */
  
+#ifndef __salvage_h_
+#define __salvage_h_
+
  #include <afs/afssyscalls.h>
  /* Definition of DirHandle for salvager.  Not the same as for the file server */
  
@@ -24,3 +27,5 @@ typedef struct DirHandle {
      IHandle_t *dirh_handle;
      afs_int32 dirh_cacheCheck;
  } DirHandle;
+
+#endif /* __salvage_h_ */
diff --git a/src/vol/salvaged.c b/src/vol/salvaged.c

new file mode 100644 (file)

index 0000000..d5b318b
--- /dev/null
+++ b/src/vol/salvaged.c
@@ -0,0 +1,738 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/* 
+ * demand attach fs
+ * online salvager daemon
+ */
+
+/* Main program file. Define globals. */
+#define MAIN 1
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <dirent.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <errno.h>
+#ifdef AFS_NT40_ENV
+#include <io.h>
+#include <WINNT/afsevent.h>
+#else
+#include <sys/param.h>
+#include <sys/file.h>
+#ifndef ITIMER_REAL
+#include <sys/time.h>
+#endif /* ITIMER_REAL */
+#endif
+#if    defined(AFS_AIX_ENV) || defined(AFS_SUN4_ENV)
+#define WCOREDUMP(x)   (x & 0200)
+#endif
+#include <rx/xdr.h>
+#include <afs/afsint.h>
+#include <afs/assert.h>
+#if !defined(AFS_SGI_ENV) && !defined(AFS_NT40_ENV)
+#if defined(AFS_VFSINCL_ENV)
+#include <sys/vnode.h>
+#ifdef AFS_SUN5_ENV
+#include <sys/fs/ufs_inode.h>
+#else
+#if defined(AFS_DARWIN_ENV) || defined(AFS_XBSD_ENV)
+#include <ufs/ufs/dinode.h>
+#include <ufs/ffs/fs.h>
+#else
+#include <ufs/inode.h>
+#endif
+#endif
+#else /* AFS_VFSINCL_ENV */
+#ifdef AFS_OSF_ENV
+#include <ufs/inode.h>
+#else /* AFS_OSF_ENV */
+#if !defined(AFS_LINUX20_ENV) && !defined(AFS_XBSD_ENV)
+#include <sys/inode.h>
+#endif
+#endif
+#endif /* AFS_VFSINCL_ENV */
+#endif /* AFS_SGI_ENV */
+#ifdef AFS_AIX_ENV
+#include <sys/vfs.h>
+#include <sys/lockf.h>
+#else
+#ifdef AFS_HPUX_ENV
+#include <unistd.h>
+#include <checklist.h>
+#else
+#if defined(AFS_SGI_ENV)
+#include <unistd.h>
+#include <fcntl.h>
+#include <mntent.h>
+#else
+#if    defined(AFS_SUN_ENV) || defined(AFS_SUN5_ENV)
+#ifdef   AFS_SUN5_ENV
+#include <unistd.h>
+#include <sys/mnttab.h>
+#include <sys/mntent.h>
+#else
+#include <mntent.h>
+#endif
+#else
+#endif /* AFS_SGI_ENV */
+#endif /* AFS_HPUX_ENV */
+#endif
+#endif
+#include <fcntl.h>
+#ifndef AFS_NT40_ENV
+#include <afs/osi_inode.h>
+#endif
+#include <afs/cmd.h>
+#include <afs/afsutil.h>
+#include <afs/fileutil.h>
+#include <afs/procmgmt.h>      /* signal(), kill(), wait(), etc. */
+#ifndef AFS_NT40_ENV
+#include <syslog.h>
+#endif
+
+#include "nfs.h"
+#include "lwp.h"
+#include "lock.h"
+#include <afs/afssyscalls.h>
+#include "ihandle.h"
+#include "vnode.h"
+#include "volume.h"
+#include "partition.h"
+#include "daemon_com.h"
+#include "fssync.h"
+#include "salvsync.h"
+#include "viceinode.h"
+#include "salvage.h"
+#include "volinodes.h"         /* header magic number, etc. stuff */
+#include "vol-salvage.h"
+#ifdef AFS_NT40_ENV
+#include <pthread.h>
+#endif
+
+
+#if !defined(AFS_DEMAND_ATTACH_FS)
+#error "online salvager only supported for demand attach fileserver"
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+#if defined(AFS_NT40_ENV)
+#error "online salvager not supported on NT"
+#endif /* AFS_NT40_ENV */
+
+
+/* Forward declarations */
+/*@printflike@*/ void Log(const char *format, ...);
+/*@printflike@*/ void Abort(const char *format, ...);
+
+
+/*@+fcnmacros +macrofcndecl@*/
+#ifdef O_LARGEFILE
+#define afs_fopen      fopen64
+#else /* !O_LARGEFILE */
+#define afs_fopen      fopen
+#endif /* !O_LARGEFILE */
+/*@=fcnmacros =macrofcndecl@*/
+
+
+
+static volatile int current_workers = 0;
+static volatile struct rx_queue pending_q;
+static pthread_mutex_t worker_lock;
+static pthread_cond_t worker_cv;
+
+static void * SalvageChildReaperThread(void *);
+static int DoSalvageVolume(struct SalvageQueueNode * node, int slot);
+
+static void SalvageServer(void);
+static void SalvageClient(VolumeId vid, char * pname);
+
+static int Reap_Child(char * prog, int * pid, int * status);
+
+static void * SalvageLogCleanupThread(void *);
+static int SalvageLogCleanup(int pid);
+
+struct log_cleanup_node {
+    struct rx_queue q;
+    int pid;
+};
+
+struct {
+    struct rx_queue queue_head;
+    pthread_cond_t queue_change_cv;
+} log_cleanup_queue;
+
+
+#define DEFAULT_PARALLELISM 4 /* allow 4 parallel salvage workers by default */
+
+static int                     /* cmd_Dispatch handler for "initcmd"; always returns 0 */
+handleit(struct cmd_syndesc *as)
+{
+    register struct cmd_item *ti;
+    char pname[100], *temp;
+    afs_int32 seenpart = 0, seenvol = 0, vid = 0, seenany = 0;
+    struct DiskPartition *partP;
+
+
+#ifdef AFS_SGI_VNODE_GLUE
+    if (afs_init_kernel_config(-1) < 0) {
+       printf
+           ("Can't determine NUMA configuration, not starting salvager.\n");
+       exit(1);
+    }
+#endif
+
+    /* parms[] indices follow the order of the cmd_AddParm calls in main() */
+    if (as->parms[2].items)    /* -debug */
+       debug = 1;
+    if (as->parms[3].items)    /* -nowrite */
+       Testing = 1;
+    if (as->parms[4].items)    /* -inodes */
+       ListInodeOption = 1;
+    if (as->parms[5].items)    /* -oktozap */
+       OKToZap = 1;
+    if (as->parms[6].items)    /* -rootinodes */
+       ShowRootFiles = 1;
+    if (as->parms[8].items)    /* -ForceReads */
+       forceR = 1;
+    if ((ti = as->parms[9].items)) {   /* -Parallel # */
+       temp = ti->data;
+       if (strncmp(temp, "all", 3) == 0) {
+           PartsPerDisk = 1;
+           temp += 3;          /* an optional count may follow the "all" prefix */
+       }
+       if (strlen(temp) != 0) {
+           Parallel = atoi(temp);
+           if (Parallel < 1)
+               Parallel = 1;
+           if (Parallel > MAXPARALLEL) {
+               printf("Setting parallel salvages to maximum of %d \n",
+                      MAXPARALLEL);
+               Parallel = MAXPARALLEL;
+           }
+       }
+    } else {
+       Parallel = MIN(DEFAULT_PARALLELISM, MAXPARALLEL);
+    }
+    if ((ti = as->parms[10].items)) {  /* -tmpdir */
+       DIR *dirp;
+
+       tmpdir = ti->data;
+       dirp = opendir(tmpdir); /* only probing that the directory can be opened */
+       if (!dirp) {
+           printf
+               ("Can't open temporary placeholder dir %s; using current partition \n",
+                tmpdir);
+           tmpdir = NULL;
+       } else
+           closedir(dirp);
+    }
+    if ((ti = as->parms[11].items))    /* -showlog */
+       ShowLog = 1;
+    if ((ti = as->parms[12].items)) {  /* -orphans */
+       if (Testing)
+           orphans = ORPH_IGNORE;
+       else if (strcmp(ti->data, "remove") == 0
+                || strcmp(ti->data, "r") == 0)
+           orphans = ORPH_REMOVE;
+       else if (strcmp(ti->data, "attach") == 0
+                || strcmp(ti->data, "a") == 0)
+           orphans = ORPH_ATTACH;
+    }
+#ifndef AFS_NT40_ENV           /* ignore options on NT */
+    if ((ti = as->parms[13].items)) {  /* -syslog */
+       useSyslog = 1;
+       ShowLog = 0;
+    }
+    if ((ti = as->parms[14].items)) {  /* -syslogfacility */
+       useSyslogFacility = atoi(ti->data);
+    }
+
+    if ((ti = as->parms[15].items)) {  /* -datelogs */
+       TimeStampLogFile(AFSDIR_SERVER_SALSRVLOG_FILEPATH);
+    }
+#endif
+
+    if ((ti = as->parms[16].items)) {   /* -client */
+       if ((ti = as->parms[0].items)) {        /* -partition */
+           seenpart = 1;
+           strlcpy(pname, ti->data, sizeof(pname));
+       }
+       if ((ti = as->parms[1].items)) {        /* -volumeid */
+           seenvol = 1;
+           vid = (afs_int32) strtoul(ti->data, NULL, 10);      /* ids are unsigned 32-bit; atoi overflows above INT_MAX */
+       }
+
+       if (!seenpart || !seenvol) {
+           printf("You must specify '-partition' and '-volumeid' with the '-client' option\n");
+           exit(-1);
+       }
+
+       SalvageClient(vid, pname);
+
+    } else {  /* salvageserver mode */
+       SalvageServer();
+    }
+    return (0);
+}
+
+
+#ifndef AFS_NT40_ENV
+#include "AFS_component_version_number.c"
+#endif
+#define MAX_ARGS 128
+#ifdef AFS_NT40_ENV
+char *save_args[MAX_ARGS];
+int n_save_args = 0;
+pthread_t main_thread;
+#endif
+
+static char commandLine[150];
+
+int
+main(int argc, char **argv)
+{
+    struct cmd_syndesc *ts;
+    int err = 0;
+
+    int i;
+    extern char cml_version_number[];
+
+#ifdef AFS_AIX32_ENV
+    /*
+     * The following signal action for AIX is necessary so that in case of a 
+     * crash (i.e. core is generated) we can include the user's data section 
+     * in the core dump. Unfortunately, by default, only a partial core is
+     * generated which, in many cases, isn't too useful.
+     */
+    struct sigaction nsa;
+
+    sigemptyset(&nsa.sa_mask);
+    nsa.sa_handler = SIG_DFL;
+    nsa.sa_flags = SA_FULLDUMP;
+    sigaction(SIGABRT, &nsa, NULL);
+    sigaction(SIGSEGV, &nsa, NULL);
+#endif
+
+    /* Initialize directory paths */
+    if (!(initAFSDirPath() & AFSDIR_SERVER_PATHS_OK)) {
+#ifdef AFS_NT40_ENV
+       ReportErrorEventAlt(AFSEVT_SVR_NO_INSTALL_DIR, 0, argv[0], 0);
+#endif
+       fprintf(stderr, "%s: Unable to obtain AFS server directory.\n",
+               argv[0]);
+       exit(2);
+    }
+#ifdef AFS_NT40_ENV
+    main_thread = pthread_self();
+    if (spawnDatap && spawnDataLen) {
+       /* This is a child per partition salvager. Don't setup log or
+        * try to lock the salvager lock.
+        */
+       if (nt_SetupPartitionSalvage(spawnDatap, spawnDataLen) < 0)
+           exit(3);
+    } else {
+#endif
+       for (commandLine[0] = '\0', i = 0; i < argc; i++) {
+           if (i > 0)
+               strlcat(commandLine, " ", sizeof(commandLine));
+           strlcat(commandLine, argv[i], sizeof(commandLine));
+       }
+
+#ifndef AFS_NT40_ENV
+       if (geteuid() != 0) {
+           printf("Salvager must be run as root.\n");
+           fflush(stdout);
+           Exit(0);
+       }
+#endif
+
+       /* bad for normal help flag processing, but can do nada */
+
+#ifdef AFS_NT40_ENV
+    }
+#endif
+
+    ts = cmd_CreateSyntax("initcmd", handleit, 0, "initialize the program");
+    cmd_AddParm(ts, "-partition", CMD_SINGLE, CMD_OPTIONAL,
+               "Name of partition to salvage");
+    cmd_AddParm(ts, "-volumeid", CMD_SINGLE, CMD_OPTIONAL,
+               "Volume Id to salvage");
+    cmd_AddParm(ts, "-debug", CMD_FLAG, CMD_OPTIONAL,
+               "Run in Debugging mode");
+    cmd_AddParm(ts, "-nowrite", CMD_FLAG, CMD_OPTIONAL,
+               "Run readonly/test mode");
+    cmd_AddParm(ts, "-inodes", CMD_FLAG, CMD_OPTIONAL,
+               "Just list affected afs inodes - debugging flag");
+    cmd_AddParm(ts, "-oktozap", CMD_FLAG, CMD_OPTIONAL,
+               "Give permission to destroy bogus inodes/volumes - debugging flag");
+    cmd_AddParm(ts, "-rootinodes", CMD_FLAG, CMD_OPTIONAL,
+               "Show inodes owned by root - debugging flag");
+    cmd_AddParm(ts, "-salvagedirs", CMD_FLAG, CMD_OPTIONAL,
+               "Force rebuild/salvage of all directories");
+    cmd_AddParm(ts, "-blockreads", CMD_FLAG, CMD_OPTIONAL,
+               "Read smaller blocks to handle IO/bad blocks");
+    cmd_AddParm(ts, "-parallel", CMD_SINGLE, CMD_OPTIONAL,
+               "# of max parallel partition salvaging");
+    cmd_AddParm(ts, "-tmpdir", CMD_SINGLE, CMD_OPTIONAL,
+               "Name of dir to place tmp files ");
+    cmd_AddParm(ts, "-showlog", CMD_FLAG, CMD_OPTIONAL,
+               "Show log file upon completion");
+    cmd_AddParm(ts, "-orphans", CMD_SINGLE, CMD_OPTIONAL,
+               "ignore | remove | attach");
+
+    /* note - syslog isn't avail on NT, but if we make it conditional, have
+     * to deal with screwy offsets for cmd params */
+    cmd_AddParm(ts, "-syslog", CMD_FLAG, CMD_OPTIONAL,
+               "Write salvage log to syslogs");
+    cmd_AddParm(ts, "-syslogfacility", CMD_SINGLE, CMD_OPTIONAL,
+               "Syslog facility number to use");
+    cmd_AddParm(ts, "-datelogs", CMD_FLAG, CMD_OPTIONAL,
+               "Include timestamp in logfile filename");
+
+    cmd_AddParm(ts, "-client", CMD_FLAG, CMD_OPTIONAL,
+               "Use SALVSYNC to ask salvageserver to salvage a volume");
+
+    err = cmd_Dispatch(argc, argv);
+    Exit(err);
+}
+
+static void                    /* -client mode: request a salvage over SALVSYNC, then poll until it finishes */
+SalvageClient(VolumeId vid, char * pname)
+{
+    int done = 0;
+    afs_int32 code;
+    SYNC_response res;
+    SALVSYNC_response_hdr sres;        /* wired into res.payload below; presumably filled in by each query -- TODO confirm */
+
+    VInitVolumePackage(volumeUtility, 5, 5, DONT_CONNECT_FS, 0);
+    SALVSYNC_clientInit();
+    
+    code = SALVSYNC_SalvageVolume(vid, pname, SALVSYNC_SALVAGE, SALVSYNC_OPERATOR, 0, NULL);   /* initial request, no response buffer */
+    if (code != SYNC_OK) {
+       goto sync_error;
+    }
+
+    res.payload.buf = (void *) &sres;
+    res.payload.len = sizeof(sres);
+
+    while(!done) {
+       sleep(2);               /* poll interval between status queries */
+       code = SALVSYNC_SalvageVolume(vid, pname, SALVSYNC_QUERY, SALVSYNC_WHATEVER, 0, &res);
+       if (code != SYNC_OK) {
+           goto sync_error;
+       }
+       switch (sres.state) {   /* any state not listed here keeps polling */
+       case SALVSYNC_STATE_ERROR:
+           printf("salvageserver reports salvage ended in an error; check log files for more details\n");
+       case SALVSYNC_STATE_DONE:       /* FALLTHROUGH from the error case: all three states end the loop */
+       case SALVSYNC_STATE_UNKNOWN:
+           done = 1;
+       }
+    }
+    SALVSYNC_clientFinis();
+    return;
+
+ sync_error:
+    if (code == SYNC_DENIED) {
+       printf("salvageserver refused to salvage volume %u on partition %s\n",
+              vid, pname);
+    } else if (code == SYNC_BAD_COMMAND) {
+       printf("SALVSYNC protocol mismatch; please make sure fileserver, volserver, salvageserver and salvager are same version\n");
+    } else if (code == SYNC_COM_ERROR) {
+       printf("SALVSYNC communications error\n");
+    }                          /* NOTE(review): any other code exits below without printing a message */
+    SALVSYNC_clientFinis();
+    exit(-1);
+}
+
+/* Table of running salvage children, indexed by slot: child_slot[i] is the
+ * pid of the child occupying slot i, or 0 when the slot is free.  Allocated
+ * (Parallel entries, zero-filled) by SalvageServer(). */
+static int * child_slot;
+
+/*
+ * Main loop of the online salvage server; never returns.
+ *
+ * Sets up the salvageserver log, takes the salvage lock, initializes the
+ * volume package and the worker bookkeeping, starts the detached reaper
+ * and log-cleanup threads, and then forks one child per work item handed
+ * out by SALVSYNC_getWork().  The number of children that may run before
+ * this thread blocks is bounded by Parallel.
+ */
+static void
+SalvageServer(void)
+{
+    int pid, ret;
+    struct SalvageQueueNode * node;
+    pthread_t tid;
+    pthread_attr_t attrs;
+    int slot;
+
+    /* All entries to the log will be appended.  Useful if there are
+     * multiple salvagers appending to the log.
+     */
+
+    CheckLogFile(AFSDIR_SERVER_SALSRVLOG_FILEPATH);
+#ifndef AFS_NT40_ENV
+#ifdef AFS_LINUX20_ENV
+    fcntl(fileno(logFile), F_SETFL, O_APPEND); /* Isn't this redundant? */
+#else
+    fcntl(fileno(logFile), F_SETFL, FAPPEND);  /* Isn't this redundant? */
+#endif
+#endif
+    setlinebuf(logFile);
+
+    fprintf(logFile, "%s\n", cml_version_number);
+    Log("Starting OpenAFS Online Salvage Server %s (%s)\n", SalvageVersion, commandLine);
+    
+    /* Get and hold a lock for the duration of the salvage to make sure
+     * that no other salvage runs at the same time.  The routine
+     * VInitVolumePackage (called below) makes sure that a file server or
+     * other volume utilities don't interfere with the salvage.
+     */
+    
+    /* even demand attach online salvager
+     * still needs this because we don't want
+     * a stand-alone salvager to conflict with
+     * the salvager daemon */
+    ObtainSalvageLock();
+
+    /* one slot per permitted concurrent child; 0 marks a free slot */
+    child_slot = (int *) malloc(Parallel * sizeof(int));
+    assert(child_slot != NULL);
+    memset(child_slot, 0, Parallel * sizeof(int));
+           
+    /* initialize things */
+    VInitVolumePackage(salvageServer, 5, 5,
+                      1, 0);
+    DInit(10);
+    queue_Init(&pending_q);
+    queue_Init(&log_cleanup_queue);
+    assert(pthread_mutex_init(&worker_lock, NULL) == 0);
+    assert(pthread_cond_init(&worker_cv, NULL) == 0);
+    assert(pthread_cond_init(&log_cleanup_queue.queue_change_cv, NULL) == 0);
+    assert(pthread_attr_init(&attrs) == 0);
+
+    /* start up the reaper and log cleaner threads; both are detached,
+     * so tid is reused and never joined */
+    assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
+    assert(pthread_create(&tid, 
+                         &attrs, 
+                         &SalvageChildReaperThread,
+                         NULL) == 0);
+    assert(pthread_create(&tid, 
+                         &attrs, 
+                         &SalvageLogCleanupThread,
+                         NULL) == 0);
+
+    /* loop forever serving requests */
+    while (1) {
+       node = SALVSYNC_getWork();
+       assert(node != NULL);
+
+       /* child_slot is scanned and updated under VOL_LOCK; the reaper
+        * thread takes the same lock before it clears a slot */
+       VOL_LOCK;
+       /* find a slot */
+       for (slot = 0; slot < Parallel; slot++) {
+         if (!child_slot[slot])
+           break;
+       }
+       assert (slot < Parallel);
+
+       pid = Fork();
+       if (pid == 0) {
+           /* child: release the inherited lock, run the salvage and
+            * exit with DoSalvageVolume()'s result */
+           VOL_UNLOCK;
+           ret = DoSalvageVolume(node, slot);
+           Exit(ret);
+       } else if (pid < 0) {
+           /* fork failed: hand the node back to SALVSYNC.
+            * NOTE(review): the second argument is presumably a failure
+            * status -- confirm against SALVSYNC_doneWork */
+           VOL_UNLOCK;
+           SALVSYNC_doneWork(node, 1);
+       } else {
+           /* parent: record the child's pid before releasing VOL_LOCK */
+           child_slot[slot] = pid;
+           node->pid = pid;
+           VOL_UNLOCK;
+           
+           assert(pthread_mutex_lock(&worker_lock) == 0);
+           current_workers++;
+           
+           /* let the reaper thread know another worker was spawned */
+           assert(pthread_cond_broadcast(&worker_cv) == 0);
+           
+           /* if we're overquota, wait for the reaper */
+           while (current_workers >= Parallel) {
+               assert(pthread_cond_wait(&worker_cv, &worker_lock) == 0);
+           }
+           assert(pthread_mutex_unlock(&worker_lock) == 0);
+       }
+    }
+}
+
+/*
+ * Body of a forked salvage child: salvage the single volume described by
+ * node, logging to a per-pid file "<SalvageLog path>.<pid>".
+ *
+ * node - work item; node->command.sop supplies the volume id and the
+ *        partition name
+ * slot - slot index assigned by the parent (not used here)
+ *
+ * Returns 0 after the salvage has run, 1 if the volume id or partition
+ * is rejected.
+ */
+static int
+DoSalvageVolume(struct SalvageQueueNode * node, int slot)
+{
+    char logPath[AFSDIR_PATH_MAX];
+    char * partName = node->command.sop.partName;
+    struct DiskPartition * part;
+
+    VChildProcReconnectFS();
+
+    /* The parent's logFile handle is deliberately left alone: another
+     * thread may have held the lock on that FILE structure at the
+     * moment fork was called. */
+
+    afs_snprintf(logPath, sizeof(logPath), "%s.%d",
+                AFSDIR_SERVER_SLVGLOG_FILEPATH, getpid());
+    logFile = afs_fopen(logPath, "a");
+    if (logFile == NULL) {
+       /* could not open the per-child log; fall back to stdout */
+       logFile = stdout;
+       ShowLog = 0;
+    }
+
+    if (node->command.sop.volume <= 0) {
+       Log("salvageServer: invalid volume id specified; salvage aborted\n");
+       return 1;
+    }
+
+    part = VGetPartition(partName, 0);
+    if (part == NULL) {
+       Log("salvageServer: Unknown or unmounted partition %s; salvage aborted\n",
+           partName);
+       return 1;
+    }
+
+    /* Salvage individual volume; don't notify fs */
+    SalvageFileSys1(part, node->command.sop.volume);
+
+    VDisconnectFS();
+    fclose(logFile);
+
+    return 0;
+}
+
+
+/*
+ * Detached thread that reaps exited salvage children.
+ *
+ * For every child collected by Reap_Child() it frees the child's entry in
+ * child_slot, queues the child's pid on log_cleanup_queue, decrements
+ * current_workers, reports the pid through SALVSYNC_doneWorkByPid() and
+ * broadcasts on worker_cv.  worker_lock is held at the top of every loop
+ * iteration and released only around the allocation, the wait for a child
+ * and the slot update.  The loop never exits.
+ *
+ * NOTE(review): code, found, qp and nqp are declared but never used here.
+ */
+static void *
+SalvageChildReaperThread(void * args)
+{
+    int slot, pid, status, code, found;
+    struct SalvageQueueNode *qp, *nqp;
+    struct log_cleanup_node * cleanup;
+
+    assert(pthread_mutex_lock(&worker_lock) == 0);
+
+    /* loop reaping our children */
+    while (1) {
+       /* wait() won't block unless we have children, so
+        * block on the cond var if we're childless */
+       while (current_workers == 0) {
+           assert(pthread_cond_wait(&worker_cv, &worker_lock) == 0);
+       }
+
+       assert(pthread_mutex_unlock(&worker_lock) == 0);
+
+       /* allocated before reaping; may be NULL, in which case the log
+        * merge for this child is simply not queued (see below) */
+       cleanup = (struct log_cleanup_node *) malloc(sizeof(struct log_cleanup_node));
+
+       while (Reap_Child("salvageserver", &pid, &status) < 0) {
+           /* try to prevent livelock if something goes wrong */
+           sleep(1);
+       }
+
+       /* release the slot that was recorded for this pid */
+       VOL_LOCK;
+       for (slot = 0; slot < Parallel; slot++) {
+           if (child_slot[slot] == pid)
+               break;
+       }
+       assert(slot < Parallel);
+       child_slot[slot] = 0;
+       VOL_UNLOCK;
+
+       /* re-acquired here and kept across the loop back-edge */
+       assert(pthread_mutex_lock(&worker_lock) == 0);
+
+       if (cleanup) {
+           /* hand the pid to the log cleanup thread; the queue is only
+            * touched with worker_lock held */
+           cleanup->pid = pid;
+           queue_Append(&log_cleanup_queue, cleanup);
+           assert(pthread_cond_signal(&log_cleanup_queue.queue_change_cv) == 0);
+       }
+
+       /* ok, we've reaped a child */
+       current_workers--;
+       /* NOTE(review): always passes 0 regardless of the child's exit
+        * status -- confirm that is what SALVSYNC_doneWorkByPid expects */
+       SALVSYNC_doneWorkByPid(pid, 0);
+       assert(pthread_cond_broadcast(&worker_cv) == 0);
+    }
+
+    return NULL;
+}
+
+/*
+ * Wait for any child process to terminate and log abnormal terminations.
+ *
+ * prog   - program name used in the log messages
+ * pid    - out: pid of the reaped child (written only on success)
+ * status - out: raw wait status as stored by wait()
+ *
+ * Returns the pid of the reaped child, or the negative value returned by
+ * wait() on failure (in which case *pid is left untouched).
+ */
+static int
+Reap_Child(char *prog, int * pid, int * status)
+{
+    int ret;
+    ret = wait(status);
+
+    if (ret >= 0) {
+       *pid = ret;
+       /* WCOREDUMP is only defined for a child that was killed by a
+        * signal, so test WIFSIGNALED first */
+       if (WIFSIGNALED(*status) && WCOREDUMP(*status))
+           Log("\"%s\" core dumped!\n", prog);
+       if (WIFSIGNALED(*status) != 0 || WEXITSTATUS(*status) != 0)
+           Log("\"%s\" (pid=%d) terminated abnormally!\n", prog, ret);
+    } else {
+       Log("wait returned -1\n");
+    }
+    return ret;
+}
+
+/*
+ * thread to combine salvager child logs
+ * back into the main salvageserver log
+ *
+ * Sleeps on log_cleanup_queue.queue_change_cv until the reaper thread has
+ * queued a node, then drains the queue, calling SalvageLogCleanup() for
+ * each queued pid.  log_cleanup_queue is only examined or modified with
+ * worker_lock held; the lock is dropped while a log is being copied.
+ */
+static void *
+SalvageLogCleanupThread(void * arg)
+{
+    struct log_cleanup_node * cleanup;
+
+    assert(pthread_mutex_lock(&worker_lock) == 0);
+
+    while (1) {
+       while (queue_IsEmpty(&log_cleanup_queue)) {
+           assert(pthread_cond_wait(&log_cleanup_queue.queue_change_cv, &worker_lock) == 0);
+       }
+
+       while (queue_IsNotEmpty(&log_cleanup_queue)) {
+           cleanup = queue_First(&log_cleanup_queue, log_cleanup_node);
+           queue_Remove(cleanup);
+           /* the node is off the queue now, so the file copy can run
+            * without holding worker_lock */
+           assert(pthread_mutex_unlock(&worker_lock) == 0);
+           SalvageLogCleanup(cleanup->pid);
+           free(cleanup);
+           assert(pthread_mutex_lock(&worker_lock) == 0);
+       }           
+    }
+
+    /* not reached: the loop above never terminates */
+    assert(pthread_mutex_unlock(&worker_lock) == 0);
+    return NULL;
+}
+
+#define LOG_XFER_BUF_SIZE 65536
+/*
+ * Append the per-child log "<SalvageLog path>.<pid>" to logFile and
+ * remove the per-child file.
+ *
+ * pid - pid of the salvage child whose log should be merged
+ *
+ * Returns 0 when the whole file was copied, 1 if the file could not be
+ * opened or a read error cut the copy short.
+ *
+ * The transfer buffer is static, so this routine must not be run by more
+ * than one thread at a time.
+ */
+static int
+SalvageLogCleanup(int pid)
+{
+    int pidlog, len;
+    char fn[AFSDIR_PATH_MAX];
+    static char buf[LOG_XFER_BUF_SIZE];
+
+    afs_snprintf(fn, sizeof(fn), "%s.%d", 
+                AFSDIR_SERVER_SLVGLOG_FILEPATH, pid);
+    
+
+    pidlog = open(fn, O_RDONLY);
+    /* remove the name right away; an open descriptor keeps the data
+     * readable until close() */
+    unlink(fn);
+    if (pidlog < 0)
+       return 1;
+
+    /* read() returns 0 at end of file and -1 on error; stop on both so
+     * a negative length is never handed to fwrite() */
+    while ((len = read(pidlog, buf, LOG_XFER_BUF_SIZE)) > 0) {
+       fwrite(buf, len, 1, logFile);
+    }
+
+    close(pidlog);
+
+    return (len < 0) ? 1 : 0;
+}
diff --git a/src/vol/salvager.c b/src/vol/salvager.c

new file mode 100644 (file)

index 0000000..4af0daa
--- /dev/null
+++ b/src/vol/salvager.c
@@ -0,0 +1,499 @@
+/*
+ * Copyright 2000, International Business Machines Corporation and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ *      System:                VICE-TWO
+ *      Module:                salvager.c
+ *      Institution:   The Information Technology Center, Carnegie-Mellon University
+ */
+
+
+/* Main program file. Define globals. */
+#define MAIN 1
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <dirent.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <errno.h>
+#ifdef AFS_NT40_ENV
+#include <io.h>
+#include <WINNT/afsevent.h>
+#else
+#include <sys/param.h>
+#include <sys/file.h>
+#ifndef ITIMER_REAL
+#include <sys/time.h>
+#endif /* ITIMER_REAL */
+#endif
+#if    defined(AFS_AIX_ENV) || defined(AFS_SUN4_ENV)
+#define WCOREDUMP(x)   (x & 0200)
+#endif
+#include <rx/xdr.h>
+#include <afs/afsint.h>
+#include <afs/assert.h>
+#if !defined(AFS_SGI_ENV) && !defined(AFS_NT40_ENV)
+#if defined(AFS_VFSINCL_ENV)
+#include <sys/vnode.h>
+#ifdef AFS_SUN5_ENV
+#include <sys/fs/ufs_inode.h>
+#else
+#if defined(AFS_DARWIN_ENV) || defined(AFS_XBSD_ENV)
+#include <ufs/ufs/dinode.h>
+#include <ufs/ffs/fs.h>
+#else
+#include <ufs/inode.h>
+#endif
+#endif
+#else /* AFS_VFSINCL_ENV */
+#ifdef AFS_OSF_ENV
+#include <ufs/inode.h>
+#else /* AFS_OSF_ENV */
+#if !defined(AFS_LINUX20_ENV) && !defined(AFS_XBSD_ENV)
+#include <sys/inode.h>
+#endif
+#endif
+#endif /* AFS_VFSINCL_ENV */
+#endif /* AFS_SGI_ENV */
+#ifdef AFS_AIX_ENV
+#include <sys/vfs.h>
+#include <sys/lockf.h>
+#else
+#ifdef AFS_HPUX_ENV
+#include <unistd.h>
+#include <checklist.h>
+#else
+#if defined(AFS_SGI_ENV)
+#include <unistd.h>
+#include <fcntl.h>
+#include <mntent.h>
+#else
+#if    defined(AFS_SUN_ENV) || defined(AFS_SUN5_ENV)
+#ifdef   AFS_SUN5_ENV
+#include <unistd.h>
+#include <sys/mnttab.h>
+#include <sys/mntent.h>
+#else
+#include <mntent.h>
+#endif
+#else
+#endif /* AFS_SGI_ENV */
+#endif /* AFS_HPUX_ENV */
+#endif
+#endif
+#include <fcntl.h>
+#ifndef AFS_NT40_ENV
+#include <afs/osi_inode.h>
+#endif
+#include <afs/cmd.h>
+#include <afs/afsutil.h>
+#include <afs/fileutil.h>
+#include <afs/procmgmt.h>      /* signal(), kill(), wait(), etc. */
+#ifndef AFS_NT40_ENV
+#include <syslog.h>
+#endif
+
+#include "nfs.h"
+#include "lwp.h"
+#include "lock.h"
+#include <afs/afssyscalls.h>
+#include "ihandle.h"
+#include "vnode.h"
+#include "volume.h"
+#include "partition.h"
+#include "daemon_com.h"
+#include "fssync.h"
+#include "salvsync.h"
+#include "viceinode.h"
+#include "salvage.h"
+#include "volinodes.h"         /* header magic number, etc. stuff */
+#include "vol-salvage.h"
+#ifdef AFS_NT40_ENV
+#include <pthread.h>
+#endif
+
+
+/* Set to 1 by main() after the salvager log has been set up; when it is
+ * non-zero, handleit() calls ObtainSalvageLock() before initializing the
+ * volume package. */
+static int get_salvage_lock = 0;
+
+
+/* Forward declarations */
+/*@printflike@*/ void Log(const char *format, ...);
+/*@printflike@*/ void Abort(const char *format, ...);
+
+
+/*
+ * Command handler registered by main() through cmd_CreateSyntax().
+ *
+ * Translates the parsed options in as->parms[] (indices follow the order
+ * of the cmd_AddParm() calls in main()) into the salvager's global
+ * settings, optionally takes the salvage lock, initializes the volume
+ * package and then salvages either every partition, one partition, or a
+ * single volume.
+ *
+ * Returns 0 on completion; fatal conditions terminate the process through
+ * exit()/Exit() instead of returning.
+ */
+static int
+handleit(struct cmd_syndesc *as)
+{
+    register struct cmd_item *ti;
+    char pname[100], *temp;
+    afs_int32 seenpart = 0, seenvol = 0, vid = 0, seenany = 0;
+    struct DiskPartition *partP;
+
+#ifdef AFS_SGI_VNODE_GLUE
+    if (afs_init_kernel_config(-1) < 0) {
+       printf
+           ("Can't determine NUMA configuration, not starting salvager.\n");
+       exit(1);
+    }
+#endif
+
+#ifdef FAST_RESTART
+    {
+       afs_int32 i;
+       for (i = 0; i < CMD_MAXPARMS; i++) {
+           if (as->parms[i].items) {
+               seenany = 1;
+               break;
+           }
+       }
+    }
+    if (!seenany) {
+       char *msg =
+           "Exiting immediately without salvage. Look into the FileLog to find volumes which really need to be salvaged!";
+
+       /* never pass a non-literal string as the format argument */
+       if (useSyslog)
+           Log("%s", msg);
+       else
+           printf("%s\n", msg);
+
+       Exit(0);
+    }
+#endif /* FAST_RESTART */
+    if ((ti = as->parms[0].items)) {   /* -partition */
+       seenpart = 1;
+       /* strncpy does not terminate the destination when the source
+        * fills it, so copy one byte less and terminate explicitly */
+       strncpy(pname, ti->data, sizeof(pname) - 1);
+       pname[sizeof(pname) - 1] = '\0';
+    }
+    if ((ti = as->parms[1].items)) {   /* -volumeid */
+       if (!seenpart) {
+           printf
+               ("You must also specify '-partition' option with the '-volumeid' option\n");
+           exit(-1);
+       }
+       seenvol = 1;
+       /* NOTE(review): atoi() cannot represent ids above INT_MAX; such
+        * ids end up rejected by the vid <= 0 check below -- confirm
+        * whether that is acceptable */
+       vid = atoi(ti->data);
+    }
+    if (as->parms[2].items)    /* -debug */
+       debug = 1;
+    if (as->parms[3].items)    /* -nowrite */
+       Testing = 1;
+    if (as->parms[4].items)    /* -inodes */
+       ListInodeOption = 1;
+    if (as->parms[5].items)    /* -force */
+       ForceSalvage = 1;
+    if (as->parms[6].items)    /* -oktozap */
+       OKToZap = 1;
+    if (as->parms[7].items)    /* -rootinodes */
+       ShowRootFiles = 1;
+    if (as->parms[8].items)    /* -salvagedirs */
+       RebuildDirs = 1;
+    if (as->parms[9].items)    /* -blockreads */
+       forceR = 1;
+    if ((ti = as->parms[10].items)) {  /* -parallel [all]<n> */
+       temp = ti->data;
+       if (strncmp(temp, "all", 3) == 0) {
+           PartsPerDisk = 1;
+           temp += 3;
+       }
+       if (strlen(temp) != 0) {
+           Parallel = atoi(temp);
+           if (Parallel < 1)
+               Parallel = 1;
+           if (Parallel > MAXPARALLEL) {
+               printf("Setting parallel salvages to maximum of %d \n",
+                      MAXPARALLEL);
+               Parallel = MAXPARALLEL;
+           }
+       }
+    }
+    if ((ti = as->parms[11].items)) {  /* -tmpdir */
+       DIR *dirp;
+
+       /* only accept the directory if it can actually be opened */
+       tmpdir = ti->data;
+       dirp = opendir(tmpdir);
+       if (!dirp) {
+           printf
+               ("Can't open temporary placeholder dir %s; using current partition \n",
+                tmpdir);
+           tmpdir = NULL;
+       } else
+           closedir(dirp);
+    }
+    if ((ti = as->parms[12].items))    /* -showlog */
+       ShowLog = 1;
+    if ((ti = as->parms[13].items)) {  /* -showsuid */
+       Testing = 1;
+       ShowSuid = 1;
+       Showmode = 1;
+    }
+    if ((ti = as->parms[14].items)) {  /* -showmounts */
+       Testing = 1;
+       Showmode = 1;
+       ShowMounts = 1;
+    }
+    if ((ti = as->parms[15].items)) {  /* -orphans */
+       if (Testing)
+           orphans = ORPH_IGNORE;
+       else if (strcmp(ti->data, "remove") == 0
+                || strcmp(ti->data, "r") == 0)
+           orphans = ORPH_REMOVE;
+       else if (strcmp(ti->data, "attach") == 0
+                || strcmp(ti->data, "a") == 0)
+           orphans = ORPH_ATTACH;
+    }
+#ifndef AFS_NT40_ENV           /* ignore options on NT */
+    if ((ti = as->parms[16].items)) {  /* -syslog */
+       useSyslog = 1;
+       ShowLog = 0;
+    }
+    if ((ti = as->parms[17].items)) {  /* -syslogfacility */
+       useSyslogFacility = atoi(ti->data);
+    }
+
+    if ((ti = as->parms[18].items)) {  /* -datelogs */
+       TimeStampLogFile(AFSDIR_SERVER_SLVGLOG_FILEPATH);
+    }
+#endif
+
+#ifdef FAST_RESTART
+    if ((ti = as->parms[19].items)) {  /* -DontSalvage */
+       char *msg =
+           "Exiting immediately without salvage. Look into the FileLog to find volumes which really need to be salvaged!";
+
+       if (useSyslog)
+           Log("%s", msg);
+       else
+           printf("%s\n", msg);
+       Exit(0);
+    }
+#elif defined(DEMAND_ATTACH_ENABLE)
+    /* single-volume salvage is refused unless -forceDAFS was given */
+    if (seenvol && !as->parms[19].items) {
+       char * msg =
+           "The standalone salvager cannot be run concurrently with a Demand Attach Fileserver.  Please use 'salvageserver -client <partition> <volume id>' to manually schedule volume salvages with the salvageserver (new versions of 'bos salvage' automatically do this for you).  Or, if you insist on using the standalone salvager, add the -forceDAFS flag to your salvager command line.";
+
+       if (useSyslog)
+           Log("%s", msg);
+       else
+           printf("%s\n", msg);
+       Exit(1);
+    }
+#endif
+
+    if (get_salvage_lock) {
+       ObtainSalvageLock();
+    }
+
+    /* Note:  if seenvol we initialize this as a standard volume utility:  this has the
+     * implication that the file server may be running; negotiations have to be made with
+     * the file server in this case to take the read write volume and associated read-only
+     * volumes off line before salvaging */
+#ifdef AFS_NT40_ENV
+    if (seenvol) {
+       if (afs_winsockInit() < 0) {
+           ReportErrorEventAlt(AFSEVT_SVR_WINSOCK_INIT_FAILED, 0,
+                               AFSDIR_SALVAGER_FILE, 0);
+           Log("Failed to initailize winsock, exiting.\n");
+           Exit(1);
+       }
+    }
+#endif
+    VInitVolumePackage(seenvol ? volumeUtility : salvager, 5, 5,
+                      DONT_CONNECT_FS, 0);
+    DInit(10);
+#ifdef AFS_NT40_ENV
+    if (myjob.cj_number != NOT_CHILD) {
+       if (!seenpart) {
+           seenpart = 1;
+           /* bounded copy: pname is a fixed-size buffer */
+           strncpy(pname, myjob.cj_part, sizeof(pname) - 1);
+           pname[sizeof(pname) - 1] = '\0';
+       }
+    }
+#endif
+    if (seenpart == 0) {
+       /* no partition given: salvage every known partition */
+       for (partP = DiskPartitionList; partP; partP = partP->next) {
+           SalvageFileSysParallel(partP);
+       }
+       SalvageFileSysParallel(0);
+    } else {
+       partP = VGetPartition(pname, 0);
+       if (!partP) {
+           Log("salvage: Unknown or unmounted partition %s; salvage aborted\n", pname);
+           Exit(1);
+       }
+       if (!seenvol)
+           SalvageFileSys(partP, 0);
+       else {
+           /* Salvage individual volume */
+           if (vid <= 0) {
+               Log("salvage: invalid volume id specified; salvage aborted\n");
+               Exit(1);
+           }
+           SalvageFileSys(partP, vid);
+       }
+    }
+    return (0);
+}
+
+
+#ifndef AFS_NT40_ENV
+#include "AFS_component_version_number.c"
+#endif
+#define MAX_ARGS 128
+#ifdef AFS_NT40_ENV
+char *save_args[MAX_ARGS];
+int n_save_args = 0;
+pthread_t main_thread;
+#endif
+
+/*
+ * Entry point of the standalone salvager.
+ *
+ * Resolves the AFS server directories, opens the salvager log and writes
+ * the start banner (unless running as an NT per-partition child), then
+ * registers the command-line syntax and dispatches to handleit().  The
+ * process terminates through Exit() with the value returned by
+ * cmd_Dispatch().
+ */
+int
+main(int argc, char **argv)
+{
+    struct cmd_syndesc *ts;
+    int err = 0;
+    char commandLine[150];
+
+    int i;
+    extern char cml_version_number[];
+
+#ifdef AFS_AIX32_ENV
+    /*
+     * The following signal action for AIX is necessary so that in case of a 
+     * crash (i.e. core is generated) we can include the user's data section 
+     * in the core dump. Unfortunately, by default, only a partial core is
+     * generated which, in many cases, isn't too useful.
+     */
+    struct sigaction nsa;
+
+    sigemptyset(&nsa.sa_mask);
+    nsa.sa_handler = SIG_DFL;
+    nsa.sa_flags = SA_FULLDUMP;
+    sigaction(SIGABRT, &nsa, NULL);
+    sigaction(SIGSEGV, &nsa, NULL);
+#endif
+
+    /* Initialize directory paths */
+    if (!(initAFSDirPath() & AFSDIR_SERVER_PATHS_OK)) {
+#ifdef AFS_NT40_ENV
+       ReportErrorEventAlt(AFSEVT_SVR_NO_INSTALL_DIR, 0, argv[0], 0);
+#endif
+       fprintf(stderr, "%s: Unable to obtain AFS server directory.\n",
+               argv[0]);
+       exit(2);
+    }
+#ifdef AFS_NT40_ENV
+    main_thread = pthread_self();
+    if (spawnDatap && spawnDataLen) {
+       /* This is a child per partition salvager. Don't setup log or
+        * try to lock the salvager lock.
+        */
+       if (nt_SetupPartitionSalvage(spawnDatap, spawnDataLen) < 0)
+           exit(3);
+    } else {
+#endif
+       /* Rebuild the command line for the log banner.  commandLine is a
+        * fixed-size buffer, so every append is bounded; an over-long
+        * command line is truncated instead of overrunning the stack. */
+       for (commandLine[0] = '\0', i = 0; i < argc; i++) {
+           if (i > 0)
+               strncat(commandLine, " ",
+                       sizeof(commandLine) - strlen(commandLine) - 1);
+           strncat(commandLine, argv[i],
+                   sizeof(commandLine) - strlen(commandLine) - 1);
+       }
+
+       /* All entries to the log will be appended.  Useful if there are
+        * multiple salvagers appending to the log.
+        */
+
+       CheckLogFile(AFSDIR_SERVER_SLVGLOG_FILEPATH);
+#ifndef AFS_NT40_ENV
+#ifdef AFS_LINUX20_ENV
+       fcntl(fileno(logFile), F_SETFL, O_APPEND);      /* Isn't this redundant? */
+#else
+       fcntl(fileno(logFile), F_SETFL, FAPPEND);       /* Isn't this redundant? */
+#endif
+#endif
+       setlinebuf(logFile);
+
+#ifndef AFS_NT40_ENV
+       if (geteuid() != 0) {
+           printf("Salvager must be run as root.\n");
+           fflush(stdout);
+           Exit(0);
+       }
+#endif
+
+       /* bad for normal help flag processing, but can do nada */
+
+       fprintf(logFile, "%s\n", cml_version_number);
+       Log("STARTING AFS SALVAGER %s (%s)\n", SalvageVersion, commandLine);
+
+       /* Get and hold a lock for the duration of the salvage to make sure
+        * that no other salvage runs at the same time.  The routine
+        * VInitVolumePackage (called below) makes sure that a file server or
+        * other volume utilities don't interfere with the salvage.
+        */
+       get_salvage_lock = 1;
+#ifdef AFS_NT40_ENV
+    }
+#endif
+
+    /* The order of the cmd_AddParm() calls fixes the as->parms[] indices
+     * that handleit() reads; keep the two in step. */
+    ts = cmd_CreateSyntax("initcmd", handleit, 0, "initialize the program");
+    cmd_AddParm(ts, "-partition", CMD_SINGLE, CMD_OPTIONAL,
+               "Name of partition to salvage");
+    cmd_AddParm(ts, "-volumeid", CMD_SINGLE, CMD_OPTIONAL,
+               "Volume Id to salvage");
+    cmd_AddParm(ts, "-debug", CMD_FLAG, CMD_OPTIONAL,
+               "Run in Debugging mode");
+    cmd_AddParm(ts, "-nowrite", CMD_FLAG, CMD_OPTIONAL,
+               "Run readonly/test mode");
+    cmd_AddParm(ts, "-inodes", CMD_FLAG, CMD_OPTIONAL,
+               "Just list affected afs inodes - debugging flag");
+    cmd_AddParm(ts, "-force", CMD_FLAG, CMD_OPTIONAL, "Force full salvaging");
+    cmd_AddParm(ts, "-oktozap", CMD_FLAG, CMD_OPTIONAL,
+               "Give permission to destroy bogus inodes/volumes - debugging flag");
+    cmd_AddParm(ts, "-rootinodes", CMD_FLAG, CMD_OPTIONAL,
+               "Show inodes owned by root - debugging flag");
+    cmd_AddParm(ts, "-salvagedirs", CMD_FLAG, CMD_OPTIONAL,
+               "Force rebuild/salvage of all directories");
+    cmd_AddParm(ts, "-blockreads", CMD_FLAG, CMD_OPTIONAL,
+               "Read smaller blocks to handle IO/bad blocks");
+    cmd_AddParm(ts, "-parallel", CMD_SINGLE, CMD_OPTIONAL,
+               "# of max parallel partition salvaging");
+    cmd_AddParm(ts, "-tmpdir", CMD_SINGLE, CMD_OPTIONAL,
+               "Name of dir to place tmp files ");
+    cmd_AddParm(ts, "-showlog", CMD_FLAG, CMD_OPTIONAL,
+               "Show log file upon completion");
+    cmd_AddParm(ts, "-showsuid", CMD_FLAG, CMD_OPTIONAL,
+               "Report on suid/sgid files");
+    cmd_AddParm(ts, "-showmounts", CMD_FLAG, CMD_OPTIONAL,
+               "Report on mountpoints");
+    cmd_AddParm(ts, "-orphans", CMD_SINGLE, CMD_OPTIONAL,
+               "ignore | remove | attach");
+
+    /* note - syslog isn't avail on NT, but if we make it conditional, have
+     * to deal with screwy offsets for cmd params */
+    cmd_AddParm(ts, "-syslog", CMD_FLAG, CMD_OPTIONAL,
+               "Write salvage log to syslogs");
+    cmd_AddParm(ts, "-syslogfacility", CMD_SINGLE, CMD_OPTIONAL,
+               "Syslog facility number to use");
+    cmd_AddParm(ts, "-datelogs", CMD_FLAG, CMD_OPTIONAL,
+               "Include timestamp in logfile filename");
+#ifdef FAST_RESTART
+    cmd_AddParm(ts, "-DontSalvage", CMD_FLAG, CMD_OPTIONAL,
+               "Don't salvage. This my be set in BosConfig to let the fileserver restart immediately after a crash. Bad volumes will be taken offline");
+#elif defined(DEMAND_ATTACH_ENABLE)
+    cmd_AddParm(ts, "-forceDAFS", CMD_FLAG, CMD_OPTIONAL,
+               "For Demand Attach Fileserver, permit a manual volume salvage outside of the salvageserver");
+#endif /* FAST_RESTART */
+    err = cmd_Dispatch(argc, argv);
+    Exit(err);
+}
+
diff --git a/src/vol/salvsync-client.c b/src/vol/salvsync-client.c

new file mode 100644 (file)

index 0000000..7ed96d6
--- /dev/null
+++ b/src/vol/salvsync-client.c
@@ -0,0 +1,172 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ * salvsync-client.c
+ *
+ * OpenAFS demand attach fileserver
+ * Salvage server synchronization with fileserver.
+ */
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <sys/types.h>
+#include <stdio.h>
+#ifdef AFS_NT40_ENV
+#include <winsock2.h>
+#include <time.h>
+#else
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <sys/time.h>
+#endif
+#include <errno.h>
+#include <assert.h>
+#include <signal.h>
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#else
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#endif
+
+
+#include <rx/xdr.h>
+#include <afs/afsint.h>
+#include "nfs.h"
+#include <afs/errors.h>
+#include "salvsync.h"
+#include "lwp.h"
+#include "lock.h"
+#include <afs/afssyscalls.h>
+#include "ihandle.h"
+#include "vnode.h"
+#include "volume.h"
+#include "partition.h"
+#include <rx/rx_queue.h>
+
+/*@printflike@*/ extern void Log(const char *format, ...);
+
+#ifdef osi_Assert
+#undef osi_Assert
+#endif
+#define osi_Assert(e) (void)(e)
+
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * SALVSYNC is a feature specific to the demand attach fileserver
+ */
+
+extern int LogLevel;
+extern int VInit;
+extern pthread_mutex_t vol_salvsync_mutex;
+
+/* Connection state passed to every SYNC_* call made from this file.
+ * NOTE(review): positional initializer -- the meaning of each value is
+ * fixed by the declaration of SYNC_client_state, which is not visible
+ * here; presumably descriptor, port, protocol version and retry/timeout
+ * settings -- confirm against that declaration. */
+static SYNC_client_state salvsync_client_state = { -1, 2041, SALVSYNC_PROTO_VERSION, 5, 120 };
+
+/*
+ * client-side routines
+ */
+
+/* Connect this file's SALVSYNC client state; returns whatever
+ * SYNC_connect() reports. */
+int
+SALVSYNC_clientInit(void)
+{
+    SYNC_client_state * state = &salvsync_client_state;
+
+    return SYNC_connect(state);
+}
+
+/* Close the SALVSYNC client channel; always returns 1. */
+int
+SALVSYNC_clientFinis(void)
+{
+    SYNC_client_state * state = &salvsync_client_state;
+
+    SYNC_closeChannel(state);
+
+    return 1;
+}
+
+/* Re-establish the SALVSYNC client channel; returns whatever
+ * SYNC_reconnect() reports. */
+int
+SALVSYNC_clientReconnect(void)
+{
+    SYNC_client_state * state = &salvsync_client_state;
+
+    return SYNC_reconnect(state);
+}
+
+/*
+ * Send one SALVSYNC command and collect the response.
+ *
+ * com - command to send
+ * res - response buffer filled in by SYNC_ask(); res->hdr.reason is read
+ *       when the request is denied
+ *
+ * The SYNC_ask() call is bracketed by VSALVSYNC_LOCK / VSALVSYNC_UNLOCK.
+ * Returns the code produced by SYNC_ask() unchanged; every outcome other
+ * than SYNC_OK and SYNC_FAILED is also written to the log.
+ */
+afs_int32
+SALVSYNC_askSalv(SYNC_command * com, SYNC_response * res)
+{
+    afs_int32 result;
+
+    VSALVSYNC_LOCK;
+    result = SYNC_ask(&salvsync_client_state, com, res);
+    VSALVSYNC_UNLOCK;
+
+    if (result == SYNC_OK || result == SYNC_FAILED) {
+       /* ordinary outcomes: nothing to log */
+    } else if (result == SYNC_COM_ERROR || result == SYNC_BAD_COMMAND) {
+       Log("SALVSYNC_askSalv: fatal SALVSYNC protocol error; online salvager functionality disabled until next fileserver restart\n");
+    } else if (result == SYNC_DENIED) {
+       Log("SALVSYNC_askSalv: SALVSYNC request denied for reason=%d\n", res->hdr.reason);
+    } else {
+       Log("SALVSYNC_askSalv: unknown protocol response %d\n", result);
+    }
+
+    return result;
+}
+
+/*
+ * Build a SALVSYNC command for one volume and send it with
+ * SALVSYNC_askSalv().
+ *
+ * volume   - id of the volume the command refers to
+ * partName - partition name copied into the command; NULL sends an
+ *            empty name
+ * command  - value stored in the command header's command field
+ * reason   - value stored in the command header's reason field
+ * prio     - value stored in the command payload's prio field
+ * res_in   - caller-supplied response buffer, or NULL to use a scratch
+ *            response that is discarded on return
+ *
+ * Returns the code from SALVSYNC_askSalv().
+ */
+afs_int32
+SALVSYNC_SalvageVolume(VolumeId volume, char *partName, int command, int reason,
+                      afs_uint32 prio, SYNC_response * res_in)
+{
+    SYNC_command request;
+    SALVSYNC_command_hdr req_body;
+    SYNC_response scratch_res;
+    SALVSYNC_response_hdr scratch_body;
+    SYNC_response * reply = res_in;
+
+    /* no caller buffer: point a zeroed local response at a local payload */
+    if (!reply) {
+       memset(&scratch_res, 0, sizeof(scratch_res));
+       memset(&scratch_body, 0, sizeof(scratch_body));
+       scratch_res.payload.buf = (void *) &scratch_body;
+       scratch_res.payload.len = sizeof(scratch_body);
+       reply = &scratch_res;
+    }
+
+    /* payload: volume, priority and partition name */
+    memset(&req_body, 0, sizeof(req_body));
+    req_body.volume = volume;
+    req_body.prio = prio;
+    if (partName) {
+       strlcpy(req_body.partName, partName, sizeof(req_body.partName));
+    } else {
+       req_body.partName[0] = '\0';
+    }
+
+    /* header plus a reference to the payload built above */
+    memset(&request, 0, sizeof(request));
+    request.hdr.command = command;
+    request.hdr.reason = reason;
+    request.hdr.command_len = sizeof(request.hdr) + sizeof(req_body);
+    request.payload.buf = (void *) &req_body;
+    request.payload.len = sizeof(req_body);
+
+    return SALVSYNC_askSalv(&request, reply);
+}
+
+#endif /* AFS_DEMAND_ATTACH_FS */
diff --git a/src/vol/salvsync-server.c b/src/vol/salvsync-server.c

new file mode 100644 (file)

index 0000000..d9e083b
--- /dev/null
+++ b/src/vol/salvsync-server.c
@@ -0,0 +1,1009 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ * salvsync-server.c
+ *
+ * OpenAFS demand attach fileserver
+ * Salvage server synchronization with fileserver.
+ */
+
+/* This controls the size of an fd_set; it must be defined early before
+ * the system headers define that type and the macros that operate on it.
+ * Its value should be as large as the maximum file descriptor limit we
+ * are likely to run into on any platform.  Right now, that is 65536
+ * which is the default hard fd limit on Solaris 9 */
+#ifndef _WIN32
+#define FD_SETSIZE 65536
+#endif
+
+#include <afsconfig.h>
+#include <afs/param.h>
+
+RCSID
+    ("$Header$");
+
+#include <sys/types.h>
+#include <stdio.h>
+#ifdef AFS_NT40_ENV
+#include <winsock2.h>
+#include <time.h>
+#else
+#include <sys/param.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <netdb.h>
+#include <sys/time.h>
+#endif
+#include <errno.h>
+#include <assert.h>
+#include <signal.h>
+
+#ifdef HAVE_STRING_H
+#include <string.h>
+#else
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#endif
+
+
+#include <rx/xdr.h>
+#include <afs/afsint.h>
+#include "nfs.h"
+#include <afs/errors.h>
+#include "salvsync.h"
+#include "lwp.h"
+#include "lock.h"
+#include <afs/afssyscalls.h>
+#include "ihandle.h"
+#include "vnode.h"
+#include "volume.h"
+#include "partition.h"
+#include <rx/rx_queue.h>
+
+#if !defined(offsetof)
+#include <stddef.h>
+#endif
+
+/*@printflike@*/ extern void Log(const char *format, ...);
+
+#ifdef osi_Assert
+#undef osi_Assert
+#endif
+#define osi_Assert(e) (void)(e)
+
+#define MAXHANDLERS    4       /* Up to 4 clients; must be at least 2, so that
+                                * move = dump+restore can run on single server */
+
+#define MAX_BIND_TRIES 5       /* Number of times to retry socket bind */
+
+
+
+/* Forward declarations */
+static void * SALVSYNC_syncThread(void *);
+static void SALVSYNC_newconnection(int fd);
+static void SALVSYNC_com(int fd);
+static void SALVSYNC_Drop(int fd);
+static void AcceptOn(void);
+static void AcceptOff(void);
+static void InitHandler(void);
+static void CallHandler(fd_set * fdsetp);
+static int AddHandler(int afd, void (*aproc) (int));
+static int FindHandler(register int afd);
+static int FindHandler_r(register int afd);
+static int RemoveHandler(register int afd);
+static void GetHandler(fd_set * fdsetp, int *maxfdp);
+
+
+/*
+ * This lock controls access to the handler array.
+ */
+struct Lock SALVSYNC_handler_lock;
+
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * SALVSYNC is a feature specific to the demand attach fileserver
+ */
+
+static int AddToSalvageQueue(struct SalvageQueueNode * node);
+static void DeleteFromSalvageQueue(struct SalvageQueueNode * node);
+static void AddToPendingQueue(struct SalvageQueueNode * node);
+static void DeleteFromPendingQueue(struct SalvageQueueNode * node);
+static struct SalvageQueueNode * LookupPendingCommand(SALVSYNC_command_hdr * qry);
+static struct SalvageQueueNode * LookupPendingCommandByPid(int pid);
+static void RaiseCommandPrio(struct SalvageQueueNode * node, SALVSYNC_command_hdr * com);
+
+static struct SalvageQueueNode * LookupNode(VolumeId vid, char * partName);
+static struct SalvageQueueNode * LookupNodeByCommand(SALVSYNC_command_hdr * qry);
+static void AddNodeToHash(struct SalvageQueueNode * node);
+static void DeleteNodeFromHash(struct SalvageQueueNode * node);
+
+static afs_int32 SALVSYNC_com_Salvage(SALVSYNC_command * com, SALVSYNC_response * res);
+static afs_int32 SALVSYNC_com_Cancel(SALVSYNC_command * com, SALVSYNC_response * res);
+static afs_int32 SALVSYNC_com_RaisePrio(SALVSYNC_command * com, SALVSYNC_response * res);
+static afs_int32 SALVSYNC_com_Query(SALVSYNC_command * com, SALVSYNC_response * res);
+static afs_int32 SALVSYNC_com_CancelAll(SALVSYNC_command * com, SALVSYNC_response * res);
+
+
+extern int LogLevel;
+extern int VInit;
+extern pthread_mutex_t vol_salvsync_mutex;
+
+static int AcceptSd = -1;              /* Socket used by server for accepting connections */
+
+
+/* be careful about rearranging elements in this structure.
+ * element placement has been optimized for locality of reference
+ * in SALVSYNC_getWork() */
+struct SalvageQueue {
+    /* total number of queued nodes, summed over all partitions */
+    volatile int total_len;
+    volatile afs_int32 last_insert;    /* id of last partition to have a salvage node insert */
+    /* number of queued nodes per partition id */
+    volatile int len[VOLMAXPARTS+1];
+    /* one queue of SalvageQueueNode per partition id */
+    volatile struct rx_queue part[VOLMAXPARTS+1];
+    /* broadcast whenever a node is added to or removed from a partition queue */
+    pthread_cond_t cv;
+};
+static struct SalvageQueue salvageQueue;  /* volumes waiting to be salvaged */
+
+/*
+ * Generic counted queue head; used for the pending queue and for each
+ * bucket of SalvageHashTable.
+ */
+struct QueueHead {
+    /* queue head; kept as first member so the struct can be passed to queue_* macros */
+    volatile struct rx_queue q;
+    /* number of elements currently on the queue */
+    volatile int len;
+    /* broadcast by the pending-queue add/delete routines */
+    pthread_cond_t queue_change_cv;
+};
+static struct QueueHead pendingQueue;  /* volumes being salvaged */
+
+/* XXX
+ * whether a partition has a salvage in progress
+ *
+ * the salvager code only permits one salvage per partition at a time
+ *
+ * the following hack tries to keep salvaged parallelism high by
+ * only permitting one salvage dispatch per partition at a time
+ *
+ * unfortunately, the parallel salvager currently
+ * has a rather braindead routine that won't permit
+ * multiple salvages on the same "device".  this
+ * function happens to break pretty badly on lvm, raid luns, etc.
+ *
+ * this hack isn't good enough to stop the device limiting code from
+ * crippling performance.  someday that code needs to be rewritten
+ */
+static int partition_salvaging[VOLMAXPARTS+1];
+
+#define VSHASH_SIZE 64
+#define VSHASH_MASK (VSHASH_SIZE-1)
+#define VSHASH(vid) ((vid)&VSHASH_MASK)
+
+static struct QueueHead  SalvageHashTable[VSHASH_SIZE];
+
+/*
+ * Look up the salvage node for a (volume id, partition name) pair in the
+ * volume-id hash table.
+ *
+ * vid      - volume id; selects the hash bucket via VSHASH()
+ * partName - partition name, compared over at most the size of the
+ *            node's partName field
+ *
+ * Returns the matching node, or NULL when no such node is hashed.
+ * The parameter type matches the forward declaration (VolumeId).
+ */
+static struct SalvageQueueNode *
+LookupNode(VolumeId vid, char * partName)
+{
+    struct rx_queue *qp, *nqp;
+    struct SalvageQueueNode *vsp;
+    int idx = VSHASH(vid);
+
+    for (queue_Scan(&SalvageHashTable[idx], qp, nqp, rx_queue)) {
+       /* nodes are linked through hash_chain, so step back to the containing node */
+       vsp = (struct SalvageQueueNode *)((char *)qp - offsetof(struct SalvageQueueNode, hash_chain));
+       if ((vsp->command.sop.volume == vid) &&
+           !strncmp(vsp->command.sop.partName, partName, sizeof(vsp->command.sop.partName))) {
+           return vsp;
+       }
+    }
+
+    /* scanned the whole bucket without a match */
+    return NULL;
+}
+
+/*
+ * Convenience wrapper: look up the node named by the volume id and
+ * partition name carried in a SALVSYNC command header.
+ * Returns whatever LookupNode() returns (node or NULL).
+ */
+static struct SalvageQueueNode *
+LookupNodeByCommand(SALVSYNC_command_hdr * qry)
+{
+    struct SalvageQueueNode *found;
+
+    found = LookupNode(qry->volume, qry->partName);
+    return found;
+}
+
+/*
+ * Link a node into the hash bucket for its volume id and bump the bucket
+ * length.  A node whose hash_chain is already on a queue is left alone.
+ */
+static void
+AddNodeToHash(struct SalvageQueueNode * node)
+{
+    int bucket = VSHASH(node->command.sop.volume);
+
+    if (!queue_IsOnQueue(&node->hash_chain)) {
+       queue_Append(&SalvageHashTable[bucket], &node->hash_chain);
+       SalvageHashTable[bucket].len++;
+    }
+}
+
+/*
+ * Unlink a node from the hash bucket for its volume id and decrement the
+ * bucket length.  A node whose hash_chain is not on a queue is left alone.
+ */
+static void
+DeleteNodeFromHash(struct SalvageQueueNode * node)
+{
+    int bucket = VSHASH(node->command.sop.volume);
+
+    if (!queue_IsNotOnQueue(&node->hash_chain)) {
+       queue_Remove(&node->hash_chain);
+       SalvageHashTable[bucket].len--;
+    }
+}
+
+/*
+ * Initialize the salvage server side of SALVSYNC: the per-partition salvage
+ * queues, the pending queue, the per-partition in-progress counters and the
+ * volume-id hash table; then start the detached SALVSYNC_syncThread.
+ *
+ * The pthread calls are made outside of assert() so that they are still
+ * executed when assertions are compiled out (NDEBUG); assert() only checks
+ * their result.
+ */
+void
+SALVSYNC_salvInit(void)
+{
+    int i;
+    int code;
+    pthread_t tid;
+    pthread_attr_t tattr;
+
+    /* initialize the queues */
+    code = pthread_cond_init(&salvageQueue.cv, NULL);
+    assert(code == 0);
+    for (i = 0; i <= VOLMAXPARTS; i++) {
+       queue_Init(&salvageQueue.part[i]);
+       salvageQueue.len[i] = 0;
+    }
+    code = pthread_cond_init(&pendingQueue.queue_change_cv, NULL);
+    assert(code == 0);
+    queue_Init(&pendingQueue);
+    salvageQueue.total_len = pendingQueue.len = 0;
+    salvageQueue.last_insert = -1;
+    memset(partition_salvaging, 0, sizeof(partition_salvaging));
+
+    /* initialize the volume-id hash buckets */
+    for (i = 0; i < VSHASH_SIZE; i++) {
+       code = pthread_cond_init(&SalvageHashTable[i].queue_change_cv, NULL);
+       assert(code == 0);
+       SalvageHashTable[i].len = 0;
+       queue_Init(&SalvageHashTable[i]);
+    }
+
+    /* start the salvsync thread */
+    code = pthread_attr_init(&tattr);
+    assert(code == 0);
+    code = pthread_attr_setdetachstate(&tattr, PTHREAD_CREATE_DETACHED);
+    assert(code == 0);
+    code = pthread_create(&tid, &tattr, SALVSYNC_syncThread, NULL);
+    assert(code == 0);
+}
+
+/*
+ * Create the TCP socket used for SALVSYNC and fill in the address it is
+ * to be bound to (127.0.0.1, port 2041).
+ *
+ * addr - out: zeroed, then filled with the loopback address and port
+ *
+ * Returns the new socket descriptor.  socket() is called outside of
+ * assert() so the socket is still created when assertions are compiled out.
+ */
+static int
+getport(struct sockaddr_in *addr)
+{
+    int sd;
+
+    memset(addr, 0, sizeof(*addr));
+    sd = socket(AF_INET, SOCK_STREAM, 0);
+    assert(sd >= 0);
+#ifdef STRUCT_SOCKADDR_HAS_SA_LEN
+    addr->sin_len = sizeof(struct sockaddr_in);
+#endif
+    addr->sin_addr.s_addr = htonl(0x7f000001); /* 127.0.0.1 */
+    addr->sin_family = AF_INET;        /* was localhost->h_addrtype */
+    addr->sin_port = htons(2041);      /* sin_port is kept in network byte order */
+
+    return sd;
+}
+
+/* read set rebuilt by GetHandler() before every select() in the sync thread */
+static fd_set SALVSYNC_readfds;
+
+/*
+ * Body of the SALVSYNC server thread (started by SALVSYNC_salvInit).
+ *
+ * Creates and binds the listening socket, installs the accept handler and
+ * then loops forever: build the read set from the handler table, select(),
+ * and dispatch ready descriptors through CallHandler().
+ *
+ * args is unused.  The function only returns NULL nominally; the service
+ * loop has no exit.
+ */
+static void *
+SALVSYNC_syncThread(void * args)
+{
+    struct sockaddr_in addr;
+    int on = 1;
+    int code;
+    int numTries;
+    int tid;
+
+#ifndef AFS_NT40_ENV
+    /* ignore SIGPIPE (non-NT builds only) */
+    (void)signal(SIGPIPE, SIG_IGN);
+#endif
+
+    /* set our 'thread-id' so that the host hold table works */
+    MUTEX_ENTER(&rx_stats_mutex);      /* protects rxi_pthread_hinum */
+    tid = ++rxi_pthread_hinum;
+    MUTEX_EXIT(&rx_stats_mutex);
+    pthread_setspecific(rx_thread_id_key, (void *)tid);
+    Log("Set thread id %d for SALVSYNC_syncThread\n", tid);
+
+    AcceptSd = getport(&addr);
+    /* Reuseaddr needed because system inexplicably leaves crud lying around */
+    code =
+       setsockopt(AcceptSd, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
+                  sizeof(on));
+    if (code)
+       Log("SALVSYNC_sync: setsockopt failed with (%d)\n", errno);
+
+    /* retry bind up to MAX_BIND_TRIES times, sleeping 5 seconds after each failure */
+    for (numTries = 0; numTries < MAX_BIND_TRIES; numTries++) {
+       if ((code =
+            bind(AcceptSd, (struct sockaddr *)&addr, sizeof(addr))) == 0)
+           break;
+       Log("SALVSYNC_sync: bind failed with (%d), will sleep and retry\n",
+           errno);
+       sleep(5);
+    }
+    /* give up (abort) if every bind attempt failed */
+    assert(!code);
+    /* NOTE(review): the return value of listen() is not checked */
+    listen(AcceptSd, 100);
+    InitHandler();
+    AcceptOn();
+
+    for (;;) {
+       int maxfd;
+       GetHandler(&SALVSYNC_readfds, &maxfd);
+       /* Only dispatch when at least one descriptor is ready; errors and
+        * zero returns simply rebuild the set and try again.
+        * NOTE(review): the original remark here referred to IOMGR_select,
+        * but this thread calls select() directly.
+        */
+       if (select(maxfd + 1, &SALVSYNC_readfds, NULL, NULL, NULL) >= 1)
+           CallHandler(&SALVSYNC_readfds);
+    }
+
+    return NULL;
+}
+
+/*
+ * Handler for the listening socket: accept one connection and register
+ * SALVSYNC_com as the handler for the new descriptor.
+ *
+ * If the handler table is full, the accept handler itself is removed
+ * (AcceptOff) to free a slot for the new connection; SALVSYNC_Drop()
+ * re-enables accepting once a connection is closed.
+ *
+ * A failed accept() is logged and then aborts via assert.
+ */
+static void
+SALVSYNC_newconnection(int afd)
+{
+    struct sockaddr_in other;
+    int junk, fd;
+    int added;
+
+    junk = sizeof(other);
+    fd = accept(afd, (struct sockaddr *)&other, &junk);
+    if (fd == -1) {
+       Log("SALVSYNC_newconnection:  accept failed, errno==%d\n", errno);
+       assert(1 == 2);
+    } else if (!AddHandler(fd, SALVSYNC_com)) {
+       AcceptOff();
+       /* register outside of assert() so it still happens under NDEBUG */
+       added = AddHandler(fd, SALVSYNC_com);
+       assert(added);
+    }
+}
+
+/*
+ * Process one command from a salvsync file descriptor (fd).
+ *
+ * Reads a command, validates protocol version and message length,
+ * dispatches it to the matching SALVSYNC_com_* routine under VOL_LOCK,
+ * and writes the response back.  The connection is dropped when the read
+ * fails or when the response carries SYNC_FLAG_CHANNEL_SHUTDOWN.
+ *
+ * The command, response and response payload are zeroed up front: the
+ * response flags are only ever OR-ed into, SALVSYNC_NOP sets no volume
+ * fields, and the error paths skip the queue statistics, so without the
+ * memset uninitialized stack bytes would be sent to the peer and could
+ * spuriously trigger a channel shutdown.
+ */
+static afs_int32 SALV_cnt = 0;
+static void
+SALVSYNC_com(int fd)
+{
+    SYNC_command com;
+    SYNC_response res;
+    SALVSYNC_response_hdr sres_hdr;
+    SALVSYNC_command scom;
+    SALVSYNC_response sres;
+    SYNC_PROTO_BUF_DECL(buf);
+
+    memset(&com, 0, sizeof(com));
+    memset(&res, 0, sizeof(res));
+    memset(&sres_hdr, 0, sizeof(sres_hdr));
+
+    com.payload.buf = (void *)buf;
+    com.payload.len = SYNC_PROTO_MAX_LEN;
+    res.payload.buf = (void *) &sres_hdr;
+    res.payload.len = sizeof(sres_hdr);
+    res.hdr.response_len = sizeof(res.hdr) + sizeof(sres_hdr);
+    res.hdr.proto_version = SALVSYNC_PROTO_VERSION;
+
+    /* pointer bundles handed to the per-command routines */
+    scom.hdr = &com.hdr;
+    scom.sop = (SALVSYNC_command_hdr *) buf;
+    scom.com = &com;
+    sres.hdr = &res.hdr;
+    sres.sop = &sres_hdr;
+    sres.res = &res;
+
+    SALV_cnt++;
+    if (SYNC_getCom(fd, &com)) {
+       Log("SALVSYNC_com:  read failed; dropping connection (cnt=%d)\n", SALV_cnt);
+       SALVSYNC_Drop(fd);
+       return;
+    }
+
+    if (com.hdr.proto_version != SALVSYNC_PROTO_VERSION) {
+       Log("SALVSYNC_com:  invalid protocol version (%u)\n", com.hdr.proto_version);
+       res.hdr.response = SYNC_COM_ERROR;
+       res.hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN;
+       goto respond;
+    }
+
+    if (com.recv_len != (sizeof(com.hdr) + sizeof(SALVSYNC_command_hdr))) {
+       Log("SALVSYNC_com:  invalid protocol message length (%u)\n", com.recv_len);
+       res.hdr.response = SYNC_COM_ERROR;
+       res.hdr.reason = SYNC_REASON_MALFORMED_PACKET;
+       res.hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN;
+       goto respond;
+    }
+
+    VOL_LOCK;
+    switch (com.hdr.command) {
+    case SALVSYNC_NOP:
+       /* just return stats */
+       res.hdr.response = SYNC_OK;
+       break;
+    case SALVSYNC_SALVAGE:
+       /* schedule a salvage */
+       res.hdr.response = SALVSYNC_com_Salvage(&scom, &sres);
+       break;
+    case SALVSYNC_CANCEL:
+       /* cancel a salvage */
+       res.hdr.response = SALVSYNC_com_Cancel(&scom, &sres);
+       break;
+    case SALVSYNC_CANCELALL:
+       /* cancel all queued salvages */
+       res.hdr.response = SALVSYNC_com_CancelAll(&scom, &sres);
+       break;
+    case SALVSYNC_RAISEPRIO:
+       /* raise the priority of a salvage */
+       res.hdr.response = SALVSYNC_com_RaisePrio(&scom, &sres);
+       break;
+    case SALVSYNC_QUERY:
+       /* query whether a volume is done salvaging */
+       res.hdr.response = SALVSYNC_com_Query(&scom, &sres);
+       break;
+    case SYNC_COM_CHANNEL_CLOSE:
+       res.hdr.response = SYNC_OK;
+       res.hdr.flags |= SYNC_FLAG_CHANNEL_SHUTDOWN;
+       break;
+    default:
+       res.hdr.response = SYNC_BAD_COMMAND;
+       break;
+    }
+
+    /* every dispatched response carries the current queue lengths */
+    sres_hdr.sq_len = salvageQueue.total_len;
+    sres_hdr.pq_len = pendingQueue.len;
+    VOL_UNLOCK;
+
+ respond:
+    SYNC_putRes(fd, &res);
+    if (res.hdr.flags & SYNC_FLAG_CHANNEL_SHUTDOWN) {
+       SALVSYNC_Drop(fd);
+    }
+}
+
+/*
+ * SALVSYNC_SALVAGE handler: schedule a salvage for the volume named in
+ * the command.
+ *
+ * - Unknown volume: allocate a node, hash it and queue it.
+ * - Known volume whose last salvage finished (DONE/ERROR) or whose queued
+ *   request was cancelled (UNKNOWN): refresh the stored command, reset its
+ *   priority to 0 and queue it again.  Cancelling leaves the node in the
+ *   hash table in state UNKNOWN, so without this case a cancelled volume
+ *   could never be scheduled again.
+ * - Known volume that is QUEUED or SALVAGING: nothing to do.
+ *
+ * Returns SYNC_OK, SYNC_FAILED (malformed partition name) or SYNC_DENIED
+ * (allocation failure or the node could not be queued).  On paths that
+ * reach the end with a node, the response reports its state and priority.
+ */
+static afs_int32
+SALVSYNC_com_Salvage(SALVSYNC_command * com, SALVSYNC_response * res)
+{
+    afs_int32 code = SYNC_OK;
+    struct SalvageQueueNode * node;
+
+    if (SYNC_verifyProtocolString(com->sop->partName, sizeof(com->sop->partName))) {
+       code = SYNC_FAILED;
+       res->hdr->reason = SYNC_REASON_MALFORMED_PACKET;
+       goto done;
+    }
+
+    node = LookupNodeByCommand(com->sop);
+
+    /* schedule a salvage for this volume */
+    if (node != NULL) {
+       switch (node->state) {
+       case SALVSYNC_STATE_UNKNOWN:
+       case SALVSYNC_STATE_ERROR:
+       case SALVSYNC_STATE_DONE:
+           memcpy(&node->command.com, com->hdr, sizeof(SYNC_command_hdr));
+           memcpy(&node->command.sop, com->sop, sizeof(SALVSYNC_command_hdr));
+           node->command.sop.prio = 0;
+           if (AddToSalvageQueue(node)) {
+               code = SYNC_DENIED;
+           }
+           break;
+       default:
+           /* already queued or being salvaged */
+           break;
+       }
+    } else {
+       node = (struct SalvageQueueNode *) malloc(sizeof(struct SalvageQueueNode));
+       if (node == NULL) {
+           code = SYNC_DENIED;
+           goto done;
+       }
+       memset(node, 0, sizeof(struct SalvageQueueNode));
+       memcpy(&node->command.com, com->hdr, sizeof(SYNC_command_hdr));
+       memcpy(&node->command.sop, com->sop, sizeof(SALVSYNC_command_hdr));
+       AddNodeToHash(node);
+       if (AddToSalvageQueue(node)) {
+           /* roll back */
+           DeleteNodeFromHash(node);
+           free(node);
+           node = NULL;
+           code = SYNC_DENIED;
+           goto done;
+       }
+    }
+
+    res->hdr->flags |= SALVSYNC_FLAG_VOL_STATS_VALID;
+    res->sop->state = node->state;
+    res->sop->prio = node->command.sop.prio;
+
+ done:
+    return code;
+}
+
+/*
+ * SALVSYNC_CANCEL handler.
+ *
+ * Reports the state and priority the volume had when the request arrived
+ * and, if it was still QUEUED, removes it from the salvage queue.  A volume
+ * with no node is reported as UNKNOWN with priority 0.
+ *
+ * Returns SYNC_FAILED for a malformed partition name, otherwise SYNC_OK.
+ */
+static afs_int32
+SALVSYNC_com_Cancel(SALVSYNC_command * com, SALVSYNC_response * res)
+{
+    struct SalvageQueueNode * target;
+
+    if (SYNC_verifyProtocolString(com->sop->partName, sizeof(com->sop->partName))) {
+       res->hdr->reason = SYNC_REASON_MALFORMED_PACKET;
+       return SYNC_FAILED;
+    }
+
+    target = LookupNodeByCommand(com->sop);
+    if (target != NULL) {
+       res->hdr->flags |= SALVSYNC_FLAG_VOL_STATS_VALID;
+       res->sop->prio = target->command.sop.prio;
+       res->sop->state = target->state;
+       /* only requests still waiting on the queue can be withdrawn */
+       if (target->state == SALVSYNC_STATE_QUEUED) {
+           DeleteFromSalvageQueue(target);
+       }
+    } else {
+       res->sop->state = SALVSYNC_STATE_UNKNOWN;
+       res->sop->prio = 0;
+    }
+
+    return SYNC_OK;
+}
+
+/*
+ * SALVSYNC_CANCELALL handler: walk every partition on DiskPartitionList and
+ * remove every node from that partition's salvage queue.
+ * Neither argument is examined.  Always returns SYNC_OK.
+ */
+static afs_int32
+SALVSYNC_com_CancelAll(SALVSYNC_command * com, SALVSYNC_response * res)
+{
+    struct DiskPartition * part;
+    struct SalvageQueueNode * cur, * following;
+
+    part = DiskPartitionList;
+    while (part) {
+       for (queue_Scan(&salvageQueue.part[part->index], cur, following, SalvageQueueNode)) {
+           DeleteFromSalvageQueue(cur);
+       }
+       part = part->next;
+    }
+
+    return SYNC_OK;
+}
+
+/*
+ * SALVSYNC_RAISEPRIO handler: raise the priority of a salvage.
+ *
+ * - No node for the volume: schedule a salvage, then look the node up again.
+ * - QUEUED: reposition the node according to the new priority.
+ * - SALVAGING: nothing to do.
+ * - UNKNOWN/ERROR/DONE: schedule a new salvage.  UNKNOWN is the state a
+ *   node is left in after its queued request was cancelled; it is handled
+ *   like a volume that has no outstanding request, matching the no-node case.
+ *
+ * Returns SYNC_FAILED for a malformed partition name, otherwise SYNC_OK or
+ * the code from SALVSYNC_com_Salvage().  The response reports the node's
+ * state and priority, or UNKNOWN/0 when there is no node.
+ */
+static afs_int32
+SALVSYNC_com_RaisePrio(SALVSYNC_command * com, SALVSYNC_response * res)
+{
+    afs_int32 code = SYNC_OK;
+    struct SalvageQueueNode * node;
+
+    if (SYNC_verifyProtocolString(com->sop->partName, sizeof(com->sop->partName))) {
+       code = SYNC_FAILED;
+       res->hdr->reason = SYNC_REASON_MALFORMED_PACKET;
+       goto done;
+    }
+
+    node = LookupNodeByCommand(com->sop);
+
+    /* raise the priority of a salvage */
+    if (node == NULL) {
+       code = SALVSYNC_com_Salvage(com, res);
+       node = LookupNodeByCommand(com->sop);
+    } else {
+       switch (node->state) {
+       case SALVSYNC_STATE_QUEUED:
+           RaiseCommandPrio(node, com->sop);
+           break;
+       case SALVSYNC_STATE_SALVAGING:
+           break;
+       case SALVSYNC_STATE_UNKNOWN:
+       case SALVSYNC_STATE_ERROR:
+       case SALVSYNC_STATE_DONE:
+           code = SALVSYNC_com_Salvage(com, res);
+           break;
+       default:
+           break;
+       }
+    }
+
+    if (node == NULL) {
+       res->sop->prio = 0;
+       res->sop->state = SALVSYNC_STATE_UNKNOWN;
+    } else {
+       res->hdr->flags |= SALVSYNC_FLAG_VOL_STATS_VALID;
+       res->sop->prio = node->command.sop.prio;
+       res->sop->state = node->state;
+    }
+
+ done:
+    return code;
+}
+
+/*
+ * SALVSYNC_QUERY handler: report the salvage state and priority recorded
+ * for a volume, or UNKNOWN/0 when the volume has no node.
+ *
+ * Returns SYNC_FAILED for a malformed partition name, otherwise SYNC_OK.
+ */
+static afs_int32
+SALVSYNC_com_Query(SALVSYNC_command * com, SALVSYNC_response * res)
+{
+    struct SalvageQueueNode * found;
+
+    if (SYNC_verifyProtocolString(com->sop->partName, sizeof(com->sop->partName))) {
+       res->hdr->reason = SYNC_REASON_MALFORMED_PACKET;
+       return SYNC_FAILED;
+    }
+
+    found = LookupNodeByCommand(com->sop);
+    if (found != NULL) {
+       res->hdr->flags |= SALVSYNC_FLAG_VOL_STATS_VALID;
+       res->sop->state = found->state;
+       res->sop->prio = found->command.sop.prio;
+    } else {
+       res->sop->state = SALVSYNC_STATE_UNKNOWN;
+       res->sop->prio = 0;
+    }
+
+    return SYNC_OK;
+}
+
+/*
+ * Tear down one client connection: unregister its handler, close the
+ * descriptor, and make sure the accept handler is installed again.
+ */
+static void
+SALVSYNC_Drop(int fd)
+{
+    RemoveHandler(fd);
+#ifndef AFS_NT40_ENV
+    close(fd);
+#else
+    closesocket(fd);
+#endif
+    /* a handler slot was just freed, so accepting can resume */
+    AcceptOn();
+}
+
+static int AcceptHandler = -1; /* handler id for accept, if turned on */
+
+/*
+ * Install SALVSYNC_newconnection as the handler for the listening socket,
+ * unless it is already installed.  AddHandler() is called outside of
+ * assert() so that the handler is still installed when assertions are
+ * compiled out.
+ */
+static void
+AcceptOn(void)
+{
+    int added;
+
+    if (AcceptHandler == -1) {
+       added = AddHandler(AcceptSd, SALVSYNC_newconnection);
+       assert(added);
+       AcceptHandler = FindHandler(AcceptSd);
+    }
+}
+
+/*
+ * Remove the handler for the listening socket, if one is installed.
+ * RemoveHandler() is called outside of assert() so that the handler is
+ * still removed when assertions are compiled out.
+ */
+static void
+AcceptOff(void)
+{
+    int removed;
+
+    if (AcceptHandler != -1) {
+       removed = RemoveHandler(AcceptSd);
+       assert(removed);
+       AcceptHandler = -1;
+    }
+}
+
+/* The multiple FD handling code. */
+
+static int HandlerFD[MAXHANDLERS];
+static void (*HandlerProc[MAXHANDLERS]) (int);
+
+/*
+ * Mark every slot of the handler table as free (fd -1, no procedure).
+ * Runs under the handler write lock.
+ */
+static void
+InitHandler(void)
+{
+    register int slot = 0;
+
+    ObtainWriteLock(&SALVSYNC_handler_lock);
+    while (slot < MAXHANDLERS) {
+       HandlerFD[slot] = -1;
+       HandlerProc[slot] = NULL;
+       slot++;
+    }
+    ReleaseWriteLock(&SALVSYNC_handler_lock);
+}
+
+/*
+ * Invoke the registered procedure of every handler slot whose descriptor
+ * is set in *fdsetp.  The table is scanned under the read lock, which is
+ * dropped around each callback and re-taken afterwards.
+ */
+static void
+CallHandler(fd_set * fdsetp)
+{
+    register int slot;
+
+    ObtainReadLock(&SALVSYNC_handler_lock);
+    for (slot = 0; slot < MAXHANDLERS; slot++) {
+       /* skip free slots and descriptors that are not ready */
+       if (HandlerFD[slot] < 0 || !FD_ISSET(HandlerFD[slot], fdsetp))
+           continue;
+       ReleaseReadLock(&SALVSYNC_handler_lock);
+       (*HandlerProc[slot]) (HandlerFD[slot]);
+       ObtainReadLock(&SALVSYNC_handler_lock);
+    }
+    ReleaseReadLock(&SALVSYNC_handler_lock);
+}
+
+/*
+ * Register aproc as the handler for descriptor afd in the first free slot
+ * of the handler table.
+ * Returns 1 on success, 0 when all MAXHANDLERS slots are in use.
+ */
+static int
+AddHandler(int afd, void (*aproc) (int))
+{
+    register int slot;
+
+    ObtainWriteLock(&SALVSYNC_handler_lock);
+    for (slot = 0; slot < MAXHANDLERS; slot++) {
+       if (HandlerFD[slot] == -1) {
+           HandlerFD[slot] = afd;
+           HandlerProc[slot] = aproc;
+           ReleaseWriteLock(&SALVSYNC_handler_lock);
+           return 1;
+       }
+    }
+    /* table is full */
+    ReleaseWriteLock(&SALVSYNC_handler_lock);
+    return 0;
+}
+
+/*
+ * Return the index of the handler slot registered for descriptor afd,
+ * taking the handler read lock for the search.
+ * A descriptor that is not registered trips an assertion.
+ */
+static int
+FindHandler(register int afd)
+{
+    register int slot;
+
+    ObtainReadLock(&SALVSYNC_handler_lock);
+    for (slot = 0; slot < MAXHANDLERS; slot++) {
+       if (HandlerFD[slot] == afd)
+           break;
+    }
+    ReleaseReadLock(&SALVSYNC_handler_lock);
+
+    if (slot < MAXHANDLERS)
+       return slot;
+
+    assert(1 == 2);
+    return -1;                 /* satisfy compiler */
+}
+
+/*
+ * Same search as FindHandler(), but without taking the handler lock
+ * itself.  A descriptor that is not registered trips an assertion.
+ */
+static int
+FindHandler_r(register int afd)
+{
+    register int slot = 0;
+
+    while (slot < MAXHANDLERS) {
+       if (HandlerFD[slot] == afd)
+           return slot;
+       slot++;
+    }
+    assert(1 == 2);
+    return -1;                 /* satisfy compiler */
+}
+
+/*
+ * Free the handler slot registered for descriptor afd (under the write
+ * lock).  Always returns 1; an unregistered descriptor asserts inside
+ * FindHandler_r().
+ */
+static int
+RemoveHandler(register int afd)
+{
+    int slot;
+
+    ObtainWriteLock(&SALVSYNC_handler_lock);
+    slot = FindHandler_r(afd);
+    HandlerFD[slot] = -1;
+    ReleaseWriteLock(&SALVSYNC_handler_lock);
+    return 1;
+}
+
+/*
+ * Build the read set for select(): clear *fdsetp, add every registered
+ * descriptor, and store the largest one (or -1 if none) in *maxfdp.
+ */
+static void
+GetHandler(fd_set * fdsetp, int *maxfdp)
+{
+    register int slot;
+    register int highest = -1;
+
+    FD_ZERO(fdsetp);
+    ObtainReadLock(&SALVSYNC_handler_lock);    /* just in case */
+    for (slot = 0; slot < MAXHANDLERS; slot++) {
+       if (HandlerFD[slot] == -1)
+           continue;           /* free slot */
+       FD_SET(HandlerFD[slot], fdsetp);
+       if (HandlerFD[slot] > highest)
+           highest = HandlerFD[slot];
+    }
+    *maxfdp = highest;
+    ReleaseReadLock(&SALVSYNC_handler_lock);   /* just in case */
+}
+
+/*
+ * Append a node to the salvage queue of the partition named in its
+ * command, update the queue counters, mark the node QUEUED and wake any
+ * waiter on the salvage queue condition variable.
+ *
+ * Returns 0 on success, 1 if the partition name does not map to a valid
+ * id or the partition is not known to VGetPartitionById_r().
+ *
+ * pthread_cond_broadcast() is called outside of assert() so that waiters
+ * are still woken when assertions are compiled out.
+ */
+static int
+AddToSalvageQueue(struct SalvageQueueNode * node)
+{
+    afs_int32 id;
+    int code;
+
+    id = volutil_GetPartitionID(node->command.sop.partName);
+    if (id < 0 || id > VOLMAXPARTS) {
+       return 1;
+    }
+    if (!VGetPartitionById_r(id, 0)) {
+       /* don't enqueue salvage requests for unmounted partitions */
+       return 1;
+    }
+    queue_Append(&salvageQueue.part[id], node);
+    salvageQueue.len[id]++;
+    salvageQueue.total_len++;
+    salvageQueue.last_insert = id;
+    node->partition_id = id;
+    node->state = SALVSYNC_STATE_QUEUED;
+    code = pthread_cond_broadcast(&salvageQueue.cv);
+    assert(code == 0);
+    return 0;
+}
+
+/*
+ * Remove a node from its partition's salvage queue, if it is on a queue:
+ * update the counters, set the node state to UNKNOWN and broadcast on the
+ * salvage queue condition variable.
+ *
+ * pthread_cond_broadcast() is called outside of assert() so that it still
+ * runs when assertions are compiled out.
+ */
+static void
+DeleteFromSalvageQueue(struct SalvageQueueNode * node)
+{
+    int code;
+
+    if (queue_IsOnQueue(node)) {
+       queue_Remove(node);
+       salvageQueue.len[node->partition_id]--;
+       salvageQueue.total_len--;
+       node->state = SALVSYNC_STATE_UNKNOWN;
+       code = pthread_cond_broadcast(&salvageQueue.cv);
+       assert(code == 0);
+    }
+}
+
+/*
+ * Append a node to the pending queue (volumes being salvaged), mark it
+ * SALVAGING and broadcast the queue change.
+ *
+ * pthread_cond_broadcast() is called outside of assert() so that it still
+ * runs when assertions are compiled out.
+ */
+static void
+AddToPendingQueue(struct SalvageQueueNode * node)
+{
+    int code;
+
+    queue_Append(&pendingQueue, node);
+    pendingQueue.len++;
+    node->state = SALVSYNC_STATE_SALVAGING;
+    code = pthread_cond_broadcast(&pendingQueue.queue_change_cv);
+    assert(code == 0);
+}
+
+/*
+ * Remove a node from the pending queue, if it is on a queue: decrement
+ * the queue length, set the node state to UNKNOWN and broadcast the queue
+ * change.
+ *
+ * pthread_cond_broadcast() is called outside of assert() so that it still
+ * runs when assertions are compiled out.
+ */
+static void
+DeleteFromPendingQueue(struct SalvageQueueNode * node)
+{
+    int code;
+
+    if (queue_IsOnQueue(node)) {
+       queue_Remove(node);
+       pendingQueue.len--;
+       node->state = SALVSYNC_STATE_UNKNOWN;
+       code = pthread_cond_broadcast(&pendingQueue.queue_change_cv);
+       assert(code == 0);
+    }
+}
+
+/*
+ * Search the pending queue for the node whose volume id and partition
+ * name match the query.  Returns the node, or NULL if none matches.
+ */
+static struct SalvageQueueNode *
+LookupPendingCommand(SALVSYNC_command_hdr * qry)
+{
+    struct SalvageQueueNode * cur, * following;
+
+    for (queue_Scan(&pendingQueue, cur, following, SalvageQueueNode)) {
+       if (cur->command.sop.volume != qry->volume)
+           continue;
+       if (strncmp(cur->command.sop.partName, qry->partName,
+                   sizeof(qry->partName)) == 0)
+           return cur;
+    }
+
+    return NULL;
+}
+
+/*
+ * Search the pending queue for the node whose pid field equals pid.
+ * Returns the node, or NULL if none matches.
+ */
+static struct SalvageQueueNode *
+LookupPendingCommandByPid(int pid)
+{
+    struct SalvageQueueNode * cur, * following;
+
+    for (queue_Scan(&pendingQueue, cur, following, SalvageQueueNode)) {
+       if (cur->pid == pid)
+           return cur;
+    }
+
+    return NULL;
+}
+
+
+/* raise the priority of a previously scheduled salvage
+ *
+ * node - a node that must currently be on a queue (asserted)
+ * com  - command header carrying the new priority
+ *
+ * Stores the new priority in the node and repositions it within its
+ * partition's salvage queue. */
+static void
+RaiseCommandPrio(struct SalvageQueueNode * node, SALVSYNC_command_hdr * com)
+{
+    struct SalvageQueueNode *np, *nnp;
+    afs_int32 id;
+
+    assert(queue_IsOnQueue(node));
+
+    node->command.sop.prio = com->prio;
+    id = node->partition_id;
+    if (queue_First(&salvageQueue.part[id], SalvageQueueNode)->command.sop.prio < com->prio) {
+       /* new priority beats the current head: move node to the front */
+       queue_Remove(node);
+       queue_Prepend(&salvageQueue.part[id], node);
+    } else {
+       /* walk toward the head, starting at node, looking for the first
+        * entry with a strictly higher priority */
+       for (queue_ScanBackwardsFrom(&salvageQueue.part[id], node, np, nnp, SalvageQueueNode)) {
+           if (np->command.sop.prio > com->prio)
+               break;
+       }
+       if (queue_IsEnd(&salvageQueue.part[id], np)) {
+           /* nothing ahead of node has a higher priority: move to the front */
+           queue_Remove(node);
+           queue_Prepend(&salvageQueue.part[id], node);
+       } else if (node != np) {
+           /* re-insert directly behind the higher-priority entry */
+           queue_Remove(node);
+           queue_InsertAfter(np, node);
+       }
+    }
+}
+
+/* this will need to be rearchitected if we ever want more than one thread
+ * to wait for new salvage nodes */
+/*
+ * Block until a salvage request is queued, then pick one, move it from
+ * the salvage queue to the pending queue and return it.
+ *
+ * Selection order:
+ *   1. if every queued request is on the partition of the last insert,
+ *      take the head of that partition's queue;
+ *   2. otherwise, starting at next_part_sched, take the first partition
+ *      that has queued requests and no salvage in progress;
+ *   3. otherwise take the first partition that has any queued request.
+ *
+ * The returned node has pid reset to 0 and state SALVAGING, and the
+ * in-progress counter of its partition has been incremented.
+ *
+ * pthread_cond_wait() is called outside of assert() so that the wait
+ * still happens when assertions are compiled out.
+ */
+struct SalvageQueueNode * 
+SALVSYNC_getWork(void)
+{
+    int i, ret;
+    struct DiskPartition * dp = NULL, * fdp;
+    static afs_int32 next_part_sched = 0;
+    struct SalvageQueueNode *node = NULL;
+
+    VOL_LOCK;
+
+    /*
+     * wait for work to be scheduled
+     * if there are no disk partitions, just sit in this wait loop forever
+     */
+    while (!salvageQueue.total_len || !DiskPartitionList) {
+       ret = pthread_cond_wait(&salvageQueue.cv, &vol_glock_mutex);
+       assert(ret == 0);
+    }
+
+
+    /* 
+     * short circuit for simple case where only one partition has
+     * scheduled salvages
+     */
+    if (salvageQueue.last_insert >= 0 && salvageQueue.last_insert <= VOLMAXPARTS &&
+       (salvageQueue.total_len == salvageQueue.len[salvageQueue.last_insert])) {
+       node = queue_First(&salvageQueue.part[salvageQueue.last_insert], SalvageQueueNode);
+       goto have_node;
+    }
+
+
+    /* 
+     * ok, more than one partition has scheduled salvages.
+     * now search for partitions with scheduled salvages, but no pending salvages. 
+     */
+    dp = VGetPartitionById_r(next_part_sched, 0);
+    if (!dp) {
+       dp = DiskPartitionList;
+    }
+    fdp = dp;
+
+    /* one full lap around the partition list, starting and ending at fdp */
+    for (i=0 ; 
+        !i || dp != fdp ; 
+        dp = (dp->next) ? dp->next : DiskPartitionList, i++ ) {
+       if (!partition_salvaging[dp->index] && salvageQueue.len[dp->index]) {
+           node = queue_First(&salvageQueue.part[dp->index], SalvageQueueNode);
+           goto have_node;
+       }
+    }
+
+
+    /*
+     * all partitions with scheduled salvages have at least one pending.
+     * now do an exhaustive search for a scheduled salvage.
+     */
+    dp = fdp;
+
+    for (i=0 ; 
+        !i || dp != fdp ; 
+        dp = (dp->next) ? dp->next : DiskPartitionList, i++ ) {
+       if (salvageQueue.len[dp->index]) {
+           node = queue_First(&salvageQueue.part[dp->index], SalvageQueueNode);
+           goto have_node;
+       }
+    }
+
+    /* we should never reach this line */
+    assert(1==2);
+
+ have_node:
+    assert(node != NULL);
+    node->pid = 0;
+    partition_salvaging[node->partition_id]++;
+    DeleteFromSalvageQueue(node);
+    AddToPendingQueue(node);
+
+    /* dp stays NULL on the short-circuit path, which leaves the rotation untouched */
+    if (dp) {
+       /* update next_part_sched field */
+       if (dp->next) {
+           next_part_sched = dp->next->index;
+       } else if (DiskPartitionList) {
+           next_part_sched = DiskPartitionList->index;
+       } else {
+           next_part_sched = -1;
+       }
+    }
+
+    VOL_UNLOCK;
+    return node;
+}
+
+/*
+ * Record the completion of a salvage: take the node off the pending
+ * queue, decrement the in-progress counter of its partition (when the
+ * partition id is in range) and set the final state, DONE for result 0
+ * and ERROR otherwise.  The _r variant does no locking itself.
+ */
+static void
+SALVSYNC_doneWork_r(struct SalvageQueueNode * node, int result)
+{
+    afs_int32 part;
+
+    DeleteFromPendingQueue(node);
+
+    part = node->partition_id;
+    if (!(part < 0 || part > VOLMAXPARTS)) {
+       partition_salvaging[part]--;
+    }
+
+    node->state = (result == 0) ? SALVSYNC_STATE_DONE : SALVSYNC_STATE_ERROR;
+}
+
+/*
+ * Locked wrapper around SALVSYNC_doneWork_r(): record that the salvage
+ * for node has finished with the given result (0 means success).
+ */
+void 
+SALVSYNC_doneWork(struct SalvageQueueNode * node, int result)
+{
+    VOL_LOCK;
+    {
+       SALVSYNC_doneWork_r(node, result);
+    }
+    VOL_UNLOCK;
+}
+
+/*
+ * Record the completion of the pending salvage whose node carries the
+ * given pid.  Does nothing if no pending node has that pid.
+ */
+void
+SALVSYNC_doneWorkByPid(int pid, int result)
+{
+    struct SalvageQueueNode * pending;
+
+    VOL_LOCK;
+    pending = LookupPendingCommandByPid(pid);
+    if (pending) {
+       SALVSYNC_doneWork_r(pending, result);
+    }
+    VOL_UNLOCK;
+}
+
+#endif /* AFS_DEMAND_ATTACH_FS */
diff --git a/src/vol/salvsync.h b/src/vol/salvsync.h

new file mode 100644 (file)

index 0000000..6611df6
--- /dev/null
+++ b/src/vol/salvsync.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2006, Sine Nomine Associates and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ * demand attach fs
+ * salvage server interface
+ */
+#ifndef _AFS_VOL_SALVSYNC_H
+#define _AFS_VOL_SALVSYNC_H
+
+#ifdef AFS_DEMAND_ATTACH_FS
+#include "daemon_com.h"
+
+
+#define SALVSYNC_PROTO_VERSION        1
+
+
+/* SALVSYNC command codes */
+#define SALVSYNC_NOP            SYNC_COM_CODE_DECL(0)   /* just return stats */
+#define SALVSYNC_SALVAGE       SYNC_COM_CODE_DECL(1)   /* schedule a salvage */
+#define SALVSYNC_CANCEL         SYNC_COM_CODE_DECL(2)   /* Cancel a salvage */
+#define SALVSYNC_RAISEPRIO      SYNC_COM_CODE_DECL(3)   /* move a salvage operation to
+                                                        * the head of the work queue */
+#define SALVSYNC_QUERY          SYNC_COM_CODE_DECL(4)   /* query the status of a salvage */
+#define SALVSYNC_CANCELALL      SYNC_COM_CODE_DECL(5)   /* cancel all pending salvages */
+
+/* SALVSYNC reason codes */
+#define SALVSYNC_WHATEVER      SYNC_REASON_CODE_DECL(0)  /* XXXX */
+#define SALVSYNC_ERROR         SYNC_REASON_CODE_DECL(1)  /* volume is in error state */
+#define SALVSYNC_OPERATOR      SYNC_REASON_CODE_DECL(2)  /* operator forced salvage */
+#define SALVSYNC_SHUTDOWN       SYNC_REASON_CODE_DECL(3)  /* cancel due to shutdown */
+#define SALVSYNC_NEEDED         SYNC_REASON_CODE_DECL(4)  /* needsSalvaged flag set */
+
+/* SALVSYNC response codes */
+
+/* SALVSYNC flags */
+#define SALVSYNC_FLAG_VOL_STATS_VALID SYNC_FLAG_CODE_DECL(0) /* volume stats in response are valid */
+
+/* SALVSYNC command state fields */
+#define SALVSYNC_STATE_UNKNOWN        0         /* unknown state */
+#define SALVSYNC_STATE_QUEUED         1         /* salvage request on queue */
+#define SALVSYNC_STATE_SALVAGING      2         /* salvage is happening now */
+#define SALVSYNC_STATE_ERROR          3         /* salvage ended in an error */
+#define SALVSYNC_STATE_DONE           4         /* last salvage ended successfully */
+
+
+/* SALVSYNC-specific command payload; follows the generic SYNC command header */
+typedef struct SALVSYNC_command_hdr {
+    afs_uint32 prio;           /* requested priority of the salvage */
+    afs_uint32 volume;         /* id of the volume the command refers to */
+    char partName[16];         /* partition name, e.g. /vicepa */
+} SALVSYNC_command_hdr;
+
+/* SALVSYNC-specific response payload; follows the generic SYNC response header */
+typedef struct SALVSYNC_response_hdr {
+    afs_int32 state;           /* SALVSYNC_STATE_* of the volume's salvage node */
+    afs_int32 prio;            /* priority recorded for the volume's salvage node */
+    afs_int32 sq_len;          /* total length of the salvage queue */
+    afs_int32 pq_len;          /* length of the pending queue */
+} SALVSYNC_response_hdr;
+
+/* bundle of pointers describing one received command; passed to the command handlers */
+typedef struct SALVSYNC_command {
+    SYNC_command_hdr * hdr;    /* generic SYNC command header */
+    SALVSYNC_command_hdr * sop;        /* SALVSYNC payload in the receive buffer */
+    SYNC_command * com;                /* the enclosing SYNC command */
+} SALVSYNC_command;
+
+/* bundle of pointers describing the response being built; passed to the command handlers */
+typedef struct SALVSYNC_response {
+    SYNC_response_hdr * hdr;   /* generic SYNC response header */
+    SALVSYNC_response_hdr * sop;       /* SALVSYNC response payload */
+    SYNC_response * res;       /* the enclosing SYNC response */
+} SALVSYNC_response;
+
+/* by-value copy of a command (generic header plus SALVSYNC payload), as stored in a salvage node */
+typedef struct SALVSYNC_command_info {
+    SYNC_command_hdr com;
+    SALVSYNC_command_hdr sop;
+} SALVSYNC_command_info;
+
+/* one salvage request, tracked by the salvage server */
+struct SalvageQueueNode {
+    struct rx_queue q;         /* links the node on a partition salvage queue or the pending queue */
+    struct rx_queue hash_chain;        /* links the node in the volume-id hash table */
+    afs_uint32 state;          /* one of SALVSYNC_STATE_* */
+    struct SALVSYNC_command_info command;      /* copy of the command that requested the salvage */
+    afs_int32 partition_id;    /* partition id resolved from the command's partName when queued */
+    int pid;                   /* reset to 0 when handed out as work; key for SALVSYNC_doneWorkByPid */
+};
+
+
+/* Prototypes from salvsync.c */
+
+/* online salvager client interfaces */
+extern int SALVSYNC_clientFinis(void);
+extern int SALVSYNC_clientInit(void);
+extern int SALVSYNC_clientReconnect(void);
+extern afs_int32 SALVSYNC_askSalv(SYNC_command * com, SYNC_response * res);
+extern afs_int32 SALVSYNC_SalvageVolume(VolumeId volume, char *partName, int com, int reason,
+                                       afs_uint32 prio, SYNC_response * res);
+
+/* salvage server interfaces */
+extern void SALVSYNC_salvInit(void);
+extern struct SalvageQueueNode * SALVSYNC_getWork(void);
+extern void SALVSYNC_doneWork(struct SalvageQueueNode *, int result);
+extern void SALVSYNC_doneWorkByPid(int pid, int result);
+
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+#endif /* _AFS_VOL_SALVSYNC_H */
diff --git a/src/vol/test/listVicepx.c b/src/vol/test/listVicepx.c

index 7cb53d7d42f7d36cdd0e468c8d8d37c4a79501a9..7e9307ee1a356b0c5880e42565b33f5f72a71c82 100644 (file)
--- a/src/vol/test/listVicepx.c
+++ b/src/vol/test/listVicepx.c
@@ -102,6 +102,7 @@ RCSID
  #include "afs/assert.h"
  #include "filesignal.h"
  #include "vutils.h"
+#include "daemon_com.h"
  #include "fssync.h"
  #include <afs/auxinode.h>
  #include <afs/dir.h>
diff --git a/src/vol/test/updateDirInode.c b/src/vol/test/updateDirInode.c

index 1ebbcda15c215d5c2ddf9c778f9eb56afb746842..ff2d6b27d035e20f6f5b4c748532cd9f070bca65 100644 (file)
--- a/src/vol/test/updateDirInode.c
+++ b/src/vol/test/updateDirInode.c
@@ -102,6 +102,7 @@ RCSID
  #include "afs/assert.h"
  #include "filesignal.h"
  #include "vutils.h"
+#include "daemon_com.h"
  #include "fssync.h"
  #include <afs/auxinode.h>
  #include <afs/dir.h>
diff --git a/src/vol/vnode.c b/src/vol/vnode.c

index c9a6c0c58cdf25e604b8aee7d29c9c3bd706255b..75e90bd6ac68f52adeff08a74613736c1ad17a38 100644 (file)
--- a/src/vol/vnode.c
+++ b/src/vol/vnode.c
@@ -5,6 +5,8 @@
   * This software has been released under the terms of the IBM Public
   * License.  For details, see the LICENSE file in the top-level source
   * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
   */
  
  /*
@@ -46,6 +48,7 @@ RCSID
  #include "vnode.h"
  #include "volume.h"
  #include "partition.h"
+#include "salvsync.h"
  #if defined(AFS_SGI_ENV)
  #include "sys/types.h"
  #include "fcntl.h"
@@ -73,8 +76,8 @@ RCSID
  struct VnodeClassInfo VnodeClassInfo[nVNODECLASSES];
  
  private int moveHash(register Vnode * vnp, bit32 newHash);
-void StickOnLruChain_r(register Vnode * vnp,
-                      register struct VnodeClassInfo *vcp);
+private void StickOnLruChain_r(register Vnode * vnp,
+                              register struct VnodeClassInfo *vcp);
  
  #define BAD_IGET       -1000
  
@@ -162,6 +165,83 @@ private Vnode *VnodeHashTable[VNODE_HASH_TABLE_SIZE];
  #define VNODE_HASH(volumeptr,vnodenumber)\
      ((volumeptr->vnodeHashOffset + vnodenumber)&(VNODE_HASH_TABLE_SIZE-1))
  
+/*
+ * Secondary vnode hash keyed by volume id: VNVOLUME_HASH(id) picks the chain in
+ * VnodeHashByVolumeTable by masking id with VolumeHashTable.Mask (argument parenthesized). */
+#define VNVOLUME_HASH(volumeId) ((volumeId)&(VolumeHashTable.Mask))
+
+#include "rx/rx_queue.h"
+typedef struct VnodeHashByVolumeChainHead {
+    struct rx_queue queue;     /* chain linkage; NOTE(review): appears to need to stay first, since the head itself is handed to queue_Init/queue_Append/queue_Scan -- confirm */
+    int len;                   /* bumped on every queue_Append to this chain, dropped on every queue_Remove */
+    /* someday we could put a per-chain lock here... */
+#ifdef AFS_DEMAND_ATTACH_FS
+    int busy;                  /* set by VCloseVnodeFiles_r/VReleaseVnodeFiles_r while they walk the chain with VOL_LOCK dropped */
+    pthread_cond_t chain_busy_cv;      /* waited on (with vol_glock_mutex) while busy is set; broadcast when busy is cleared */
+#endif /* AFS_DEMAND_ATTACH_FS */
+} VnodeHashByVolumeChainHead;
+private VnodeHashByVolumeChainHead *VnodeHashByVolumeTable = NULL;     /* calloc'd by VInitVnHashByVolume with VolumeHashTable.Size entries */
+
+void
+VInitVnHashByVolume(void)
+{
+    VnodeHashByVolumeChainHead *head;
+    /* one zero-filled chain head per VolumeHashTable.Size entry; allocation failure is fatal */
+    VnodeHashByVolumeTable = (VnodeHashByVolumeChainHead *)
+       calloc(VolumeHashTable.Size, sizeof(VnodeHashByVolumeChainHead));
+    assert(VnodeHashByVolumeTable != NULL);
+
+    for (head = VnodeHashByVolumeTable; head < VnodeHashByVolumeTable + VolumeHashTable.Size; head++) {
+       queue_Init(head);
+#ifdef AFS_DEMAND_ATTACH_FS
+       assert(pthread_cond_init(&head->chain_busy_cv, NULL) == 0);
+#endif /* AFS_DEMAND_ATTACH_FS */
+    }
+}
+
+static void
+AddToVnHashByVolumeTable(register Vnode * vnp)
+{
+    VnodeHashByVolumeChainHead * chain;
+
+    /* nothing to do when the vnode is already linked onto a chain */
+    if (!queue_IsOnQueue(vnp)) {
+       chain = &VnodeHashByVolumeTable[VNVOLUME_HASH(vnp->volumePtr->hashid)];
+
+#ifdef AFS_DEMAND_ATTACH_FS
+       /* sleep on the chain's cv for as long as the chain is flagged busy */
+       while (chain->busy) {
+           assert(pthread_cond_wait(&chain->chain_busy_cv, &vol_glock_mutex) == 0);
+       }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+       queue_Append(chain, vnp);
+       chain->len++;
+    }
+}
+
+/* unlink vnp from its by-volume chain; for demand-attach, caller MUST hold a ref count on vp */
+static void
+DeleteFromVnHashByVolumeTable(register Vnode * vnp)
+{
+    VnodeHashByVolumeChainHead * chain;
+
+    /* only a vnode that is currently linked can be unlinked */
+    if (queue_IsOnQueue(vnp)) {
+       chain = &VnodeHashByVolumeTable[VNVOLUME_HASH(vnp->volumePtr->hashid)];
+
+#ifdef AFS_DEMAND_ATTACH_FS
+       /* sleep on the chain's cv for as long as the chain is flagged busy */
+       while (chain->busy) {
+           assert(pthread_cond_wait(&chain->chain_busy_cv, &vol_glock_mutex) == 0);
+       }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+       queue_Remove(vnp);
+       chain->len--;
+    }
+}
+
  /* Code to invalidate a vnode entry.  Called when we've damaged a vnode, and want
      to prevent future VGetVnode's from applying to it.  Leaves it in the same hash bucket
      but that shouldn't be important.  */
@@ -305,7 +385,7 @@ VAllocVnode_r(Error * ec, Volume * vp, VnodeType type)
         unique = vp->nextVnodeUnique++;
  
      if (vp->nextVnodeUnique > V_uniquifier(vp)) {
-       VUpdateVolume_r(ec, vp);
+       VUpdateVolume_r(ec, vp, VOL_UPDATE_WAIT);
         if (*ec)
             return NULL;
      }
@@ -317,7 +397,8 @@ VAllocVnode_r(Error * ec, Volume * vp, VnodeType type)
      }
  
      /* Find a slot in the bit map */
-    bitNumber = VAllocBitmapEntry_r(ec, vp, &vp->vnodeIndex[class]);
+    bitNumber = VAllocBitmapEntry_r(ec, vp, &vp->vnodeIndex[class],
+                                   VOL_ALLOC_BITMAP_WAIT);
      if (*ec)
         return NULL;
      vnodeNumber = bitNumberToVnodeNumber(bitNumber, class);
@@ -376,7 +457,6 @@ VAllocVnode_r(Error * ec, Volume * vp, VnodeType type)
         vnp->volumePtr = vp;
         vnp->cacheCheck = vp->cacheCheck;
         vnp->nUsers = 1;
-       moveHash(vnp, newHash);
         /* This will never block */
         ObtainWriteLock(&vnp->lock);
  #ifdef AFS_PTHREAD_ENV
@@ -391,18 +471,33 @@ VAllocVnode_r(Error * ec, Volume * vp, VnodeType type)
             FdHandle_t *fdP;
             off_t off = vnodeIndexOffset(vcp, vnodeNumber);
  
+           /* XXX we have a potential race here if two threads
+            * allocate new vnodes at the same time, and they
+            * both decide it's time to extend the index
+            * file size... */
+
             VOL_UNLOCK;
             fdP = IH_OPEN(ihP);
-           if (fdP == NULL)
-               Abort("VAllocVnode: can't open index file!\n");
-           if ((size = FDH_SIZE(fdP)) < 0)
-               Abort("VAllocVnode: can't stat index file!\n");
-           if (FDH_SEEK(fdP, off, SEEK_SET) < 0)
-               Abort("VAllocVnode: can't seek on index file!\n");
-           if (off < size) {
-               if (FDH_READ(fdP, &vnp->disk, vcp->diskSize) == vcp->diskSize) {
-                   if (vnp->disk.type != vNull)
-                       Abort("VAllocVnode:  addled bitmap or index!\n");
+           if (fdP == NULL) {
+               Log("VAllocVnode: can't open index file!\n");
+               goto error_encountered;
+           }
+           if ((size = FDH_SIZE(fdP)) < 0) {
+               Log("VAllocVnode: can't stat index file!\n");
+               goto error_encountered;
+           }
+           if (FDH_SEEK(fdP, off, SEEK_SET) < 0) {
+               Log("VAllocVnode: can't seek on index file!\n");
+               goto error_encountered;
+           }
+           if (off + vcp->diskSize <= size) {
+               if (FDH_READ(fdP, &vnp->disk, vcp->diskSize) != vcp->diskSize) {
+                   Log("VAllocVnode: can't read index file!\n");
+                   goto error_encountered;
+               }
+               if (vnp->disk.type != vNull) {
+                   Log("VAllocVnode:  addled bitmap or index!\n");
+                   goto error_encountered;
                 }
             } else {
                 /* growing file - grow in a reasonable increment */
@@ -414,9 +509,28 @@ VAllocVnode_r(Error * ec, Volume * vp, VnodeType type)
                 free(buf);
             }
             FDH_CLOSE(fdP);
+           fdP = NULL;
             VOL_LOCK;
+           goto sane;
+
+       error_encountered:
+#ifdef AFS_DEMAND_ATTACH_FS
+           VOL_LOCK;
+           VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+           if (fdP)
+               FDH_CLOSE(fdP);
+           VInvalidateVnode_r(vnp);
+           StickOnLruChain_r(vnp, vcp);
+           return NULL;
+#else
+           assert(1 == 2);
+#endif
+
         }
+    sane:
         VNLog(4, 2, vnodeNumber, (afs_int32) vnp);
+       AddToVnHashByVolumeTable(vnp);
+       moveHash(vnp, newHash);
      }
  
      VNLog(5, 1, (afs_int32) vnp);
@@ -510,6 +624,8 @@ VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
         vcp->reads++;
         vnp = VGetFreeVnode_r(vcp);
         /* Remove it from the old hash chain */
+       if (vnp->volumePtr)
+           DeleteFromVnHashByVolumeTable(vnp);
         moveHash(vnp, newHash);
         /* Remove it from the LRU chain */
         if (vnp == vcp->lruHead)
@@ -525,6 +641,7 @@ VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
         vnp->volumePtr = vp;
         vnp->cacheCheck = vp->cacheCheck;
         vnp->nUsers = 1;
+       AddToVnHashByVolumeTable(vnp);
  
         /* This will never block */
         ObtainWriteLock(&vnp->lock);
@@ -540,11 +657,21 @@ VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
         if (fdP == NULL) {
             Log("VGetVnode: can't open index dev=%u, i=%s\n", vp->device,
                 PrintInode(NULL, vp->vnodeIndex[class].handle->ih_ino));
+#ifdef AFS_DEMAND_ATTACH_FS
+           VOL_LOCK;
+           VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+           VOL_UNLOCK;
+#endif
             *ec = VIO;
             mlkReason = 9;
         } else if (FDH_SEEK(fdP, vnodeIndexOffset(vcp, vnodeNumber), SEEK_SET)
                    < 0) {
             Log("VGetVnode: can't seek on index file vn=%u\n", vnodeNumber);
+#ifdef AFS_DEMAND_ATTACH_FS
+           VOL_LOCK;
+           VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+           VOL_UNLOCK;
+#endif
             *ec = VIO;
             mlkReason = 10;
             FDH_REALLYCLOSE(fdP);
@@ -564,8 +691,18 @@ VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
              * is not allocated */
             if (n == -1 && errno == EIO) {
                 Log("VGetVnode: Couldn't read vnode %u, volume %u (%s); volume needs salvage\n", vnodeNumber, V_id(vp), V_name(vp));
-               VForceOffline_r(vp);
+#ifdef AFS_DEMAND_ATTACH_FS
+               if (programType == fileServer) {
+                   VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+                   *ec = VSALVAGING;
+               } else {
+                   VForceOffline_r(vp, 0);
+                   *ec = VSALVAGE;
+               }
+#else
+               VForceOffline_r(vp, 0);
                 *ec = VSALVAGE;
+#endif
                 mlkReason = 4;
             } else {
                 mlkReason = 5;
@@ -603,9 +740,19 @@ VGetVnode_r(Error * ec, Volume * vp, VnodeId vnodeNumber, int locktype)
                     *ec = VNOVNODE;
                 } else {
                     Log("VGetVnode: Bad magic number, vnode %u, volume %u (%s); volume needs salvage\n", vnodeNumber, V_id(vp), V_name(vp));
+#ifdef AFS_DEMAND_ATTACH_FS
+                   if (programType == fileServer) {
+                       VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+                       *ec = VSALVAGING;
+                   } else {
+                       vp->goingOffline = 1;
+                       *ec = VSALVAGE;
+                   }
+#else
                     vp->goingOffline = 1;       /* used to call VOffline, but that would mess
                                                  * up the volume ref count if called here */
                     *ec = VSALVAGE;
+#endif
                     mlkReason = 7;
                 }
                 VInvalidateVnode_r(vnp);
@@ -728,20 +875,27 @@ VPutVnode_r(Error * ec, register Vnode * vnp)
  
             /* The vnode has been changed. Write it out to disk */
             if (!V_inUse(vp)) {
+#ifdef AFS_DEMAND_ATTACH_FS
+               VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+               *ec = VSALVAGING;
+#else
                 assert(V_needsSalvaged(vp));
                 *ec = VSALVAGE;
+#endif
             } else {
                 IHandle_t *ihP = vp->vnodeIndex[class].handle;
                 FdHandle_t *fdP;
                 VOL_UNLOCK;
                 fdP = IH_OPEN(ihP);
-               if (fdP == NULL)
-                   Abort("VPutVnode: can't open index file!\n");
+               if (fdP == NULL) {
+                   Log("VPutVnode: can't open index file!\n");
+                   goto error_encountered;
+               }
                 offset = vnodeIndexOffset(vcp, vnp->vnodeNumber);
                 if (FDH_SEEK(fdP, offset, SEEK_SET) < 0) {
-                   Abort
-                       ("VPutVnode: can't seek on index file! fdp=0x%x offset=%d, errno=%d\n",
-                        fdP, offset, errno);
+                   Log("VPutVnode: can't seek on index file! fdp=0x%x offset=%d, errno=%d\n",
+                       fdP, offset, errno);
+                   goto error_encountered;
                 }
                 code = FDH_WRITE(fdP, &vnp->disk, vcp->diskSize);
                 if (code != vcp->diskSize) {
@@ -756,8 +910,13 @@ VPutVnode_r(Error * ec, register Vnode * vnp)
                         *ec = VIO;
                     } else {
                         Log("VPutVnode: Couldn't write vnode %u, volume %u (%s) (error %d)\n", vnp->vnodeNumber, V_id(vnp->volumePtr), V_name(vnp->volumePtr), code);
-                       VForceOffline_r(vp);
+#ifdef AFS_DEMAND_ATTACH_FS
+                       VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+                       *ec = VSALVAGING;
+#else
+                       VForceOffline_r(vp, 0);
                         *ec = VSALVAGE;
+#endif
                     }
                     VOL_UNLOCK;
                     FDH_REALLYCLOSE(fdP);
@@ -765,6 +924,23 @@ VPutVnode_r(Error * ec, register Vnode * vnp)
                     FDH_CLOSE(fdP);
                 }
                 VOL_LOCK;
+               goto sane;
+
+           error_encountered:
+#ifdef AFS_DEMAND_ATTACH_FS
+               /* XXX instead of dumping core, let's try to request a salvage
+                * and just fail the putvnode */
+               if (fdP)
+                   FDH_CLOSE(fdP);
+               VOL_LOCK;
+               VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+               *ec = VSALVAGING;
+               goto done;
+#else
+               assert(1 == 2);
+#endif
+
+           sane:
                 /* If the vnode is to be deleted, and we wrote the vnode out,
                  * free its bitmap entry. Do after the vnode is written so we
                  * don't allocate from bitmap before the vnode is written
@@ -787,6 +963,7 @@ VPutVnode_r(Error * ec, register Vnode * vnp)
                  vnp);
      }
  
+ done:
      /* Do not look at disk portion of vnode after this point; it may
       * have been deleted above */
      if (vnp->nUsers-- == 1)
@@ -865,19 +1042,28 @@ VVnodeWriteToRead_r(Error * ec, register Vnode * vnp)
  
         /* The inode has been changed.  Write it out to disk */
         if (!V_inUse(vp)) {
+#ifdef AFS_DEMAND_ATTACH_FS
+           VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+           *ec = VSALVAGING;
+#else
             assert(V_needsSalvaged(vp));
             *ec = VSALVAGE;
+#endif
         } else {
             IHandle_t *ihP = vp->vnodeIndex[class].handle;
             FdHandle_t *fdP;
             off_t off = vnodeIndexOffset(vcp, vnp->vnodeNumber);
             VOL_UNLOCK;
             fdP = IH_OPEN(ihP);
-           if (fdP == NULL)
-               Abort("VPutVnode: can't open index file!\n");
+           if (fdP == NULL) {
+               Log("VPutVnode: can't open index file!\n");
+               goto error_encountered;
+           }
             code = FDH_SEEK(fdP, off, SEEK_SET);
-           if (code < 0)
-               Abort("VPutVnode: can't seek on index file!\n");
+           if (code < 0) {
+               Log("VPutVnode: can't seek on index file!\n");
+               goto error_encountered;
+           }
             code = FDH_WRITE(fdP, &vnp->disk, vcp->diskSize);
             if (code != vcp->diskSize) {
                 /*
@@ -892,14 +1078,33 @@ VVnodeWriteToRead_r(Error * ec, register Vnode * vnp)
                     *ec = VIO;
                 } else {
                     Log("VPutVnode: Couldn't write vnode %u, volume %u (%s)\n", vnp->vnodeNumber, V_id(vnp->volumePtr), V_name(vnp->volumePtr));
-                   VForceOffline_r(vp);
+#ifdef AFS_DEMAND_ATTACH_FS
+                   VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+                   *ec = VSALVAGING;
+#else
+                   VForceOffline_r(vp, 0);
                     *ec = VSALVAGE;
+#endif
                 }
                 VOL_UNLOCK;
             }
             FDH_CLOSE(fdP);
             VOL_LOCK;
+           goto sane;
+
+       error_encountered:
+#ifdef AFS_DEMAND_ATTACH_FS
+           if (fdP)
+               FDH_CLOSE(fdP);
+           VOL_LOCK;
+           VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+           *ec = VSALVAGING;
+#else
+           assert(1 == 2);
+#endif
+
         }
+    sane:
         vcp->writes++;
         vnp->changed_newTime = vnp->changed_oldTime = 0;
      }
@@ -931,7 +1136,7 @@ moveHash(register Vnode * vnp, bit32 newHash)
      return 0;
  }
  
-void
+private void
  StickOnLruChain_r(register Vnode * vnp, register struct VnodeClassInfo *vcp)
  {
      /* Add it to the circular LRU list */
@@ -950,8 +1155,10 @@ StickOnLruChain_r(register Vnode * vnp, register struct VnodeClassInfo *vcp)
         vcp->lruHead = vnp->lruNext;
      /* If caching is turned off, set volumeptr to NULL to invalidate the
       * entry */
-    if (!TrustVnodeCacheEntry)
+    if (!TrustVnodeCacheEntry) {
+       DeleteFromVnHashByVolumeTable(vnp);
         vnp->volumePtr = NULL;
+    }
  }
  
  /* VCloseVnodeFiles - called when a volume is going off line. All open
@@ -962,15 +1169,30 @@ void
  VCloseVnodeFiles_r(Volume * vp)
  {
      int i;
-    Vnode *vnp;
+    Vnode *vnp, *nvnp;
+    VnodeHashByVolumeChainHead * head;
  
-    for (i = 0; i < VNODE_HASH_TABLE_SIZE; i++) {
-       for (vnp = VnodeHashTable[i]; vnp; vnp = vnp->hashNext) {
-           if (vnp->volumePtr == vp) {
-               IH_REALLYCLOSE(vnp->handle);
-           }
+    head = &VnodeHashByVolumeTable[VNVOLUME_HASH(vp->hashid)];        /* only the chain this volume hashes to is walked */
+#ifdef AFS_DEMAND_ATTACH_FS
+    while (head->busy) {       /* wait until no other scan has this chain flagged busy */
+       assert(pthread_cond_wait(&head->chain_busy_cv, &vol_glock_mutex) == 0);
+    }
+
+    head->busy = 1;            /* Add/DeleteFromVnHashByVolumeTable wait while this is set */
+    VOL_UNLOCK;                        /* the walk below runs without the volume lock */
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    for (queue_Scan(head, vnp, nvnp, Vnode)) {
+       if (vnp->volumePtr == vp) {     /* skip chain entries whose volumePtr is a different volume */
+           IH_REALLYCLOSE(vnp->handle);
          }
      }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    VOL_LOCK;
+    head->busy = 0;
+    assert(pthread_cond_broadcast(&head->chain_busy_cv) == 0); /* wake every thread waiting on busy */
+#endif /* AFS_DEMAND_ATTACH_FS */
  }
  
  /* VReleaseVnodeFiles - called when a volume is going detached. All open
@@ -981,13 +1203,29 @@ void
  VReleaseVnodeFiles_r(Volume * vp)
  {
      int i;
-    Vnode *vnp;
+    Vnode *vnp, *nvnp;
+    VnodeHashByVolumeChainHead * head;
  
-    for (i = 0; i < VNODE_HASH_TABLE_SIZE; i++) {
-       for (vnp = VnodeHashTable[i]; vnp; vnp = vnp->hashNext) {
-           if (vnp->volumePtr == vp) {
-               IH_RELEASE(vnp->handle);
-           }
+    head = &VnodeHashByVolumeTable[VNVOLUME_HASH(vp->hashid)];        /* only the chain this volume hashes to is walked */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    while (head->busy) {       /* wait until no other scan has this chain flagged busy */
+       assert(pthread_cond_wait(&head->chain_busy_cv, &vol_glock_mutex) == 0);
+    }
+
+    head->busy = 1;            /* Add/DeleteFromVnHashByVolumeTable wait while this is set */
+    VOL_UNLOCK;                        /* the walk below runs without the volume lock */
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    for (queue_Scan(head, vnp, nvnp, Vnode)) {
+       if (vnp->volumePtr == vp) {     /* skip chain entries whose volumePtr is a different volume */
+           IH_RELEASE(vnp->handle);
          }
      }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    VOL_LOCK;
+    head->busy = 0;
+    assert(pthread_cond_broadcast(&head->chain_busy_cv) == 0); /* wake every thread waiting on busy */
+#endif /* AFS_DEMAND_ATTACH_FS */
  }
diff --git a/src/vol/vnode.h b/src/vol/vnode.h

index 9446f793209baafd8d241a55278e198e30ac8372..618cb83635fc0f86b8eb37cc1fb22545c2248b87 100644 (file)
--- a/src/vol/vnode.h
+++ b/src/vol/vnode.h
@@ -118,6 +118,7 @@ typedef struct VnodeDiskObject {
  #define SIZEOF_LARGEDISKVNODE  256
  
  typedef struct Vnode {
+    struct rx_queue vid_hash;   /* for vnode by volume id hash */
      struct Vnode *hashNext;    /* Next vnode on hash conflict chain */
      struct Vnode *lruNext;     /* Less recently used vnode than this one */
      struct Vnode *lruPrev;     /* More recently used vnode than this one */
@@ -216,3 +217,4 @@ extern Vnode *VAllocVnode(Error * ec, struct Volume *vp, VnodeType type);
  extern Vnode *VAllocVnode_r(Error * ec, struct Volume *vp, VnodeType type);
  /*extern VFreeVnode();*/
  extern Vnode *VGetFreeVnode_r(struct VnodeClassInfo *vcp);
+extern void VInitVnHashByVolume(void);
diff --git a/src/vol/vol-salvage.c b/src/vol/vol-salvage.c

index 04eb2694f987b3017b8ff4b3a61e091a61ea4809..eaaf6b96e2c7e5f3d14fa14e50f88d87904f63f2 100644 (file)
--- a/src/vol/vol-salvage.c
+++ b/src/vol/vol-salvage.c
@@ -83,11 +83,6 @@ Vnodes with 0 inode pointers in RW volumes are now deleted.
  */
  
  
-#define SalvageVersion "2.4"
-
-/* Main program file. Define globals. */
-#define MAIN 1
-
  #include <afsconfig.h>
  #include <afs/param.h>
  
@@ -186,10 +181,13 @@ RCSID
  #include "vnode.h"
  #include "volume.h"
  #include "partition.h"
+#include "daemon_com.h"
  #include "fssync.h"
+#include "salvsync.h"
  #include "viceinode.h"
  #include "salvage.h"
  #include "volinodes.h"         /* header magic number, etc. stuff */
+#include "vol-salvage.h"
  #ifdef AFS_NT40_ENV
  #include <pthread.h>
  #endif
@@ -221,10 +219,6 @@ extern void *calloc();
  #endif
  static char *TimeStamp(time_t clock, int precision);
  
-#define ORPH_IGNORE 0
-#define ORPH_REMOVE 1
-#define ORPH_ATTACH 2
-
  
  int debug;                     /* -d flag */
  int Testing = 0;               /* -n flag */
@@ -251,7 +245,7 @@ int OKToZap;                        /* -o flag */
  int ForceSalvage;              /* If salvage should occur despite the DONT_SALVAGE flag
                                  * in the volume header */
  
-static FILE *logFile = 0;      /* one of {/usr/afs/logs,/vice/file}/SalvageLog */
+FILE *logFile = 0;     /* one of {/usr/afs/logs,/vice/file}/SalvageLog */
  
  #define ROOTINODE      2       /* Root inode of a 4.2 Unix file system
                                  * partition */
@@ -279,201 +273,30 @@ int VolumeChanged;               /* Set by any routine which would change the volume in
  
  VolumeDiskData VolInfo;                /* A copy of the last good or salvaged volume header dealt with */
  
-struct InodeSummary {          /* Inode summary file--an entry for each
-                                * volume in the inode file for a partition */
-    VolId volumeId;            /* Volume id */
-    VolId RWvolumeId;          /* RW volume associated */
-    int index;                 /* index into inode file (0, 1, 2 ...) */
-    int nInodes;               /* Number of inodes for this volume */
-    int nSpecialInodes;                /* Number of special inodes, i.e.  volume
-                                * header, index, etc.  These are all
-                                * marked (viceinode.h) and will all be sorted
-                                * to the beginning of the information for
-                                * this volume.  Read-only volumes should
-                                * ONLY have special inodes (all the other
-                                * inodes look as if they belong to the
-                                * original RW volume). */
-    Unique maxUniquifier;      /* The maximum uniquifier found in all the inodes.
-                                * This is only useful for RW volumes and is used
-                                * to compute a new volume uniquifier in the event
-                                * that the header needs to be recreated. The inode
-                                * uniquifier may be a truncated version of vnode
-                                * uniquifier (AFS_3DISPARES). The real maxUniquifer
-                                * is from the vnodes and later calcuated from it */
-    struct VolumeSummary *volSummary;
-    /* Either a pointer to the original volume
-     * header summary, or constructed summary
-     * information */
-} *inodeSummary;
-#define readOnly(isp)  ((isp)->volumeId != (isp)->RWvolumeId)
  int nVolumesInInodeFile;       /* Number of read-write volumes summarized */
  int inodeFd;                   /* File descriptor for inode file */
  
  
-struct VolumeSummary {         /* Volume summary an entry for each
-                                * volume in a volume directory.
-                                * Assumption: one volume directory per
-                                * partition */
-    char *fileName;            /* File name on the partition for the volume
-                                * header */
-    struct VolumeHeader header;
-    /* volume number, rw volume number, inode
-     * numbers of each major component of
-     * the volume */
-    IHandle_t *volumeInfoHandle;
-    byte wouldNeedCallback;    /* set if the file server should issue
-                                * call backs for all the files in this volume when
-                                * the volume goes back on line */
-};
-
-struct VnodeInfo {
-    IHandle_t *handle;         /* Inode containing this index */
-    int nVnodes;               /* Total number of vnodes in index */
-    int nAllocatedVnodes;      /* Total number actually used */
-    int volumeBlockCount;      /* Total number of blocks used by volume */
-    Inode *inodes;             /* Directory only */
-    struct VnodeEssence {
-       short count;            /* Number of references to vnode; MUST BE SIGNED */
-       unsigned claimed:1;     /* Set when a parent directory containing an entry
-                                * referencing this vnode is found.  The claim
-                                * is that the parent in "parent" can point to
-                                * this vnode, and no other */
-       unsigned changed:1;     /* Set if any parameters (other than the count)
-                                * in the vnode change.   It is determined if the
-                                * link count has changed by noting whether it is
-                                * 0 after scanning all directories */
-       unsigned salvaged:1;    /* Set if this directory vnode has already been salvaged. */
-       unsigned todelete:1;    /* Set if this vnode is to be deleted (should not be claimed) */
-       afs_fsize_t blockCount;
-       /* Number of blocks (1K) used by this vnode,
-        * approximately */
-       VnodeId parent;         /* parent in vnode */
-       Unique unique;          /* Must match entry! */
-       char *name;             /* Name of directory entry */
-       int modeBits;           /* File mode bits */
-       Inode InodeNumber;      /* file's inode */
-       int type;               /* File type */
-       int author;             /* File author */
-       int owner;              /* File owner */
-       int group;              /* File group */
-    } *vnodes;
-} vnodeInfo[nVNODECLASSES];
-
-struct DirSummary {
-    struct DirHandle dirHandle;
-    VnodeId vnodeNumber;
-    Unique unique;
-    unsigned haveDot, haveDotDot;
-    VolumeId rwVid;
-    int copied;                        /* If the copy-on-write stuff has been applied */
-    VnodeId parent;
-    char *name;
-    char *vname;
-    IHandle_t *ds_linkH;
-};
+struct VnodeInfo vnodeInfo[nVNODECLASSES];
  
  
  struct VolumeSummary *volumeSummaryp;  /* Holds all the volumes in a part */
  int nVolumes;                  /* Number of volumes (read-write and read-only)
                                  * in volume summary */
  
-#ifdef AFS_NT40_ENV
-/* For NT, we can fork the per partition salvagers to gain the required
- * safety against Aborts. But there's too many complex data structures at
- * the per volume salvager layer to easilty copy the data across.
- * childJobNumber is resset from -1 to the job number if this is a
- * per partition child of the main salvager. This information is passed
- * out-of-band in the extra data area setup for the now unused parent/child
- * data transfer.
- */
-#define SALVAGER_MAGIC 0x00BBaaDD
-#define NOT_CHILD -1           /* job numbers start at 0 */
-/* If new options need to be passed to child, add them here. */
-typedef struct {
-    int cj_magic;
-    int cj_number;
-    char cj_part[32];
-} childJob_t;
+char *tmpdir = 0;      /* defined here (initializer makes this the definition, so no "extern") */
  
+
+#ifdef AFS_NT40_ENV
  /* Child job this process is running. */
  childJob_t myjob = { SALVAGER_MAGIC, NOT_CHILD, "" };
-
-int nt_SalvagePartition(char *partName, int jobn);
-int nt_SetupPartitionSalvage(void *datap, int len);
-
-typedef struct {
-    struct InodeSummary *svgp_inodeSummaryp;
-    int svgp_count;
-} SVGParms_t;
-#define canfork 0
-#else
-#define canfork 1
-#endif
+#endif /* AFS_NT40_ENV */
  
  
  
  /* Forward declarations */
  /*@printflike@*/ void Log(const char *format, ...);
  /*@printflike@*/ void Abort(const char *format, ...);
-void Exit(int code);
-int Fork(void);
-int Wait(char *prog);
-char *ToString(char *s);
-void AskOffline(VolumeId volumeId);
-void AskOnline(VolumeId volumeId, char *partition);
-void CheckLogFile(void);
-#ifndef AFS_NT40_ENV
-void TimeStampLogFile(void);
-#endif
-void ClearROInUseBit(struct VolumeSummary *summary);
-void CopyAndSalvage(register struct DirSummary *dir);
-int CopyInode(Device device, Inode inode1, Inode inode2, int rwvolume);
-void CopyOnWrite(register struct DirSummary *dir);
-void CountVolumeInodes(register struct ViceInodeInfo *ip, int maxInodes,
-                      register struct InodeSummary *summary);
-void DeleteExtraVolumeHeaderFile(register struct VolumeSummary *vsp);
-void DistilVnodeEssence(VolumeId vid, VnodeClass class, Inode ino,
-                       Unique * maxu);
-int GetInodeSummary(char *path, VolumeId singleVolumeNumber);
-void GetVolumeSummary(VolumeId singleVolumeNumber);
-void JudgeEntry(struct DirSummary *dir, char *name, VnodeId vnodeNumber,
-               Unique unique);
-void MaybeZapVolume(register struct InodeSummary *isp, char *message,
-                   int deleteMe, int check);
-void ObtainSalvageLock(void);
-void PrintInodeList(void);
-void PrintInodeSummary(void);
-void PrintVolumeSummary(void);
-int QuickCheck(register struct InodeSummary *isp, int nVols);
-void RemoveTheForce(char *path);
-void SalvageDir(char *name, VolumeId rwVid, struct VnodeInfo *dirVnodeInfo,
-               IHandle_t * alinkH, int i, struct DirSummary *rootdir,
-               int *rootdirfound);
-void SalvageFileSysParallel(struct DiskPartition *partP);
-void SalvageFileSys(struct DiskPartition *partP, VolumeId singleVolumeNumber);
-void SalvageFileSys1(struct DiskPartition *partP,
-                    VolumeId singleVolumeNumber);
-int SalvageHeader(register struct stuff *sp, struct InodeSummary *isp,
-                 int check, int *deleteMe);
-int SalvageIndex(Inode ino, VnodeClass class, int RW,
-                register struct ViceInodeInfo *ip, int nInodes,
-                struct VolumeSummary *volSummary, int check);
-int SalvageVnodes(register struct InodeSummary *rwIsp,
-                 register struct InodeSummary *thisIsp,
-                 register struct ViceInodeInfo *inodes, int check);
-int SalvageVolume(register struct InodeSummary *rwIsp, IHandle_t * alinkH);
-void DoSalvageVolumeGroup(register struct InodeSummary *isp, int nVols);
-#ifdef AFS_NT40_ENV
-void SalvageVolumeGroup(register struct InodeSummary *isp, int nVols);
-#else
-#define SalvageVolumeGroup DoSalvageVolumeGroup
-#endif
-int SalvageVolumeHeaderFile(register struct InodeSummary *isp,
-                           register struct ViceInodeInfo *inodes, int RW,
-                           int check, int *deleteMe);
-void showlog(void);
-int UseTheForceLuke(char *path);
-
  static int IsVnodeOrphaned(VnodeId vnode);
  
  /* Uniquifier stored in the Inode */
@@ -500,207 +323,6 @@ BadError(register int aerror)
  }
  
  
-char *tmpdir = 0;
-static int
-handleit(struct cmd_syndesc *as)
-{
-    register struct cmd_item *ti;
-    char pname[100], *temp;
-    afs_int32 seenpart = 0, seenvol = 0, vid = 0, seenany = 0;
-    struct DiskPartition *partP;
-
-#ifdef AFS_SGI_VNODE_GLUE
-    if (afs_init_kernel_config(-1) < 0) {
-       printf
-           ("Can't determine NUMA configuration, not starting salvager.\n");
-       exit(1);
-    }
-#endif
-
-#ifdef FAST_RESTART
-    {
-       afs_int32 i;
-       for (i = 0; i < CMD_MAXPARMS; i++) {
-           if (as->parms[i].items) {
-               seenany = 1;
-               break;
-           }
-       }
-    }
-    if (!seenany) {
-       char *msg =
-           "Exiting immediately without salvage. Look into the FileLog to find volumes which really need to be salvaged!";
-
-       if (useSyslog)
-           Log(msg);
-       else
-           printf("%s\n", msg);
-
-       Exit(0);
-    }
-#endif /* FAST_RESTART */
-    if ((ti = as->parms[0].items)) {   /* -partition */
-       seenpart = 1;
-       strncpy(pname, ti->data, 100);
-    }
-    if ((ti = as->parms[1].items)) {   /* -volumeid */
-       if (!seenpart) {
-           printf
-               ("You must also specify '-partition' option with the '-volumeid' option\n");
-           exit(-1);
-       }
-       seenvol = 1;
-       vid = atoi(ti->data);
-    }
-    if (as->parms[2].items)    /* -debug */
-       debug = 1;
-    if (as->parms[3].items)    /* -nowrite */
-       Testing = 1;
-    if (as->parms[4].items)    /* -inodes */
-       ListInodeOption = 1;
-    if (as->parms[5].items)    /* -force */
-       ForceSalvage = 1;
-    if (as->parms[6].items)    /* -oktozap */
-       OKToZap = 1;
-    if (as->parms[7].items)    /* -rootinodes */
-       ShowRootFiles = 1;
-    if (as->parms[8].items)    /* -RebuildDirs */
-       RebuildDirs = 1;
-    if (as->parms[9].items)    /* -ForceReads */
-       forceR = 1;
-    if ((ti = as->parms[10].items)) {  /* -Parallel # */
-       temp = ti->data;
-       if (strncmp(temp, "all", 3) == 0) {
-           PartsPerDisk = 1;
-           temp += 3;
-       }
-       if (strlen(temp) != 0) {
-           Parallel = atoi(temp);
-           if (Parallel < 1)
-               Parallel = 1;
-           if (Parallel > MAXPARALLEL) {
-               printf("Setting parallel salvages to maximum of %d \n",
-                      MAXPARALLEL);
-               Parallel = MAXPARALLEL;
-           }
-       }
-    }
-    if ((ti = as->parms[11].items)) {  /* -tmpdir */
-       DIR *dirp;
-
-       tmpdir = ti->data;
-       dirp = opendir(tmpdir);
-       if (!dirp) {
-           printf
-               ("Can't open temporary placeholder dir %s; using current partition \n",
-                tmpdir);
-           tmpdir = NULL;
-       } else
-           closedir(dirp);
-    }
-    if ((ti = as->parms[12].items))    /* -showlog */
-       ShowLog = 1;
-    if ((ti = as->parms[13].items)) {  /* -log */
-       Testing = 1;
-       ShowSuid = 1;
-       Showmode = 1;
-    }
-    if ((ti = as->parms[14].items)) {  /* -showmounts */
-       Testing = 1;
-       Showmode = 1;
-       ShowMounts = 1;
-    }
-    if ((ti = as->parms[15].items)) {  /* -orphans */
-       if (Testing)
-           orphans = ORPH_IGNORE;
-       else if (strcmp(ti->data, "remove") == 0
-                || strcmp(ti->data, "r") == 0)
-           orphans = ORPH_REMOVE;
-       else if (strcmp(ti->data, "attach") == 0
-                || strcmp(ti->data, "a") == 0)
-           orphans = ORPH_ATTACH;
-    }
-#ifndef AFS_NT40_ENV           /* ignore options on NT */
-    if ((ti = as->parms[16].items)) {  /* -syslog */
-       useSyslog = 1;
-       ShowLog = 0;
-    }
-    if ((ti = as->parms[17].items)) {  /* -syslogfacility */
-       useSyslogFacility = atoi(ti->data);
-    }
-
-    if ((ti = as->parms[18].items)) {  /* -datelogs */
-       TimeStampLogFile();
-    }
-#endif
-
-#ifdef FAST_RESTART
-    if (ti = as->parms[19].items) {    /* -DontSalvage */
-       char *msg =
-           "Exiting immediately without salvage. Look into the FileLog to find volumes which really need to be salvaged!";
-
-       if (useSyslog)
-           Log(msg);
-       else
-           printf("%s\n", msg);
-       Exit(0);
-    }
-#endif /* FAST_RESTART */
-
-    /* Note:  if seemvol we initialize this as a standard volume utility:  this has the
-     * implication that the file server may be running; negotations have to be made with
-     * the file server in this case to take the read write volume and associated read-only
-     * volumes off line before salvaging */
-#ifdef AFS_NT40_ENV
-    if (seenvol) {
-       if (afs_winsockInit() < 0) {
-           ReportErrorEventAlt(AFSEVT_SVR_WINSOCK_INIT_FAILED, 0,
-                               AFSDIR_SALVAGER_FILE, 0);
-           Log("Failed to initailize winsock, exiting.\n");
-           Exit(1);
-       }
-    }
-#endif
-    VInitVolumePackage(seenvol ? volumeUtility : salvager, 5, 5,
-                      DONT_CONNECT_FS, 0);
-    DInit(10);
-#ifdef AFS_NT40_ENV
-    if (myjob.cj_number != NOT_CHILD) {
-       if (!seenpart) {
-           seenpart = 1;
-           (void)strcpy(pname, myjob.cj_part);
-       }
-    }
-#endif
-    if (seenpart == 0) {
-       for (partP = DiskPartitionList; partP; partP = partP->next) {
-           SalvageFileSysParallel(partP);
-       }
-       SalvageFileSysParallel(0);
-    } else {
-       partP = VGetPartition(pname, 0);
-       if (!partP) {
-           Log("salvage: Unknown or unmounted partition %s; salvage aborted\n", pname);
-           Exit(1);
-       }
-       if (!seenvol)
-           SalvageFileSys(partP, 0);
-       else {
-           /* Salvage individual volume */
-           if (vid <= 0) {
-               Log("salvage: invalid volume id specified; salvage aborted\n");
-               Exit(1);
-           }
-           SalvageFileSys(partP, vid);
-       }
-    }
-    return (0);
-}
-
-
-#ifndef AFS_NT40_ENV
-#include "AFS_component_version_number.c"
-#endif
  #define MAX_ARGS 128
  #ifdef AFS_NT40_ENV
  char *save_args[MAX_ARGS];
@@ -708,143 +330,6 @@ int n_save_args = 0;
  pthread_t main_thread;
  #endif
  
-int
-main(int argc, char **argv)
-{
-    struct cmd_syndesc *ts;
-    int err = 0;
-    char commandLine[150];
-
-    int i;
-    extern char cml_version_number[];
-
-#ifdef AFS_AIX32_ENV
-    /*
-     * The following signal action for AIX is necessary so that in case of a 
-     * crash (i.e. core is generated) we can include the user's data section 
-     * in the core dump. Unfortunately, by default, only a partial core is
-     * generated which, in many cases, isn't too useful.
-     */
-    struct sigaction nsa;
-
-    sigemptyset(&nsa.sa_mask);
-    nsa.sa_handler = SIG_DFL;
-    nsa.sa_flags = SA_FULLDUMP;
-    sigaction(SIGABRT, &nsa, NULL);
-    sigaction(SIGSEGV, &nsa, NULL);
-#endif
-
-    /* Initialize directory paths */
-    if (!(initAFSDirPath() & AFSDIR_SERVER_PATHS_OK)) {
-#ifdef AFS_NT40_ENV
-       ReportErrorEventAlt(AFSEVT_SVR_NO_INSTALL_DIR, 0, argv[0], 0);
-#endif
-       fprintf(stderr, "%s: Unable to obtain AFS server directory.\n",
-               argv[0]);
-       exit(2);
-    }
-#ifdef AFS_NT40_ENV
-    main_thread = pthread_self();
-    if (spawnDatap && spawnDataLen) {
-       /* This is a child per partition salvager. Don't setup log or
-        * try to lock the salvager lock.
-        */
-       if (nt_SetupPartitionSalvage(spawnDatap, spawnDataLen) < 0)
-           exit(3);
-    } else {
-#endif
-       for (commandLine[0] = '\0', i = 0; i < argc; i++) {
-           if (i > 0)
-               strcat(commandLine, " ");
-           strcat(commandLine, argv[i]);
-       }
-
-       /* All entries to the log will be appended.  Useful if there are
-        * multiple salvagers appending to the log.
-        */
-
-       CheckLogFile();
-#ifndef AFS_NT40_ENV
-#ifdef AFS_LINUX20_ENV
-       fcntl(fileno(logFile), F_SETFL, O_APPEND);      /* Isn't this redundant? */
-#else
-       fcntl(fileno(logFile), F_SETFL, FAPPEND);       /* Isn't this redundant? */
-#endif
-#endif
-       setlinebuf(logFile);
-
-#ifndef AFS_NT40_ENV
-       if (geteuid() != 0) {
-           printf("Salvager must be run as root.\n");
-           fflush(stdout);
-           Exit(0);
-       }
-#endif
-
-       /* bad for normal help flag processing, but can do nada */
-
-       fprintf(logFile, "%s\n", cml_version_number);
-       Log("STARTING AFS SALVAGER %s (%s)\n", SalvageVersion, commandLine);
-
-       /* Get and hold a lock for the duration of the salvage to make sure
-        * that no other salvage runs at the same time.  The routine
-        * VInitVolumePackage (called below) makes sure that a file server or
-        * other volume utilities don't interfere with the salvage.
-        */
-       ObtainSalvageLock();
-#ifdef AFS_NT40_ENV
-    }
-#endif
-
-    ts = cmd_CreateSyntax("initcmd", handleit, 0, "initialize the program");
-    cmd_AddParm(ts, "-partition", CMD_SINGLE, CMD_OPTIONAL,
-               "Name of partition to salvage");
-    cmd_AddParm(ts, "-volumeid", CMD_SINGLE, CMD_OPTIONAL,
-               "Volume Id to salvage");
-    cmd_AddParm(ts, "-debug", CMD_FLAG, CMD_OPTIONAL,
-               "Run in Debugging mode");
-    cmd_AddParm(ts, "-nowrite", CMD_FLAG, CMD_OPTIONAL,
-               "Run readonly/test mode");
-    cmd_AddParm(ts, "-inodes", CMD_FLAG, CMD_OPTIONAL,
-               "Just list affected afs inodes - debugging flag");
-    cmd_AddParm(ts, "-force", CMD_FLAG, CMD_OPTIONAL, "Force full salvaging");
-    cmd_AddParm(ts, "-oktozap", CMD_FLAG, CMD_OPTIONAL,
-               "Give permission to destroy bogus inodes/volumes - debugging flag");
-    cmd_AddParm(ts, "-rootinodes", CMD_FLAG, CMD_OPTIONAL,
-               "Show inodes owned by root - debugging flag");
-    cmd_AddParm(ts, "-salvagedirs", CMD_FLAG, CMD_OPTIONAL,
-               "Force rebuild/salvage of all directories");
-    cmd_AddParm(ts, "-blockreads", CMD_FLAG, CMD_OPTIONAL,
-               "Read smaller blocks to handle IO/bad blocks");
-    cmd_AddParm(ts, "-parallel", CMD_SINGLE, CMD_OPTIONAL,
-               "# of max parallel partition salvaging");
-    cmd_AddParm(ts, "-tmpdir", CMD_SINGLE, CMD_OPTIONAL,
-               "Name of dir to place tmp files ");
-    cmd_AddParm(ts, "-showlog", CMD_FLAG, CMD_OPTIONAL,
-               "Show log file upon completion");
-    cmd_AddParm(ts, "-showsuid", CMD_FLAG, CMD_OPTIONAL,
-               "Report on suid/sgid files");
-    cmd_AddParm(ts, "-showmounts", CMD_FLAG, CMD_OPTIONAL,
-               "Report on mountpoints");
-    cmd_AddParm(ts, "-orphans", CMD_SINGLE, CMD_OPTIONAL,
-               "ignore | remove | attach");
-
-    /* note - syslog isn't avail on NT, but if we make it conditional, have
-     * to deal with screwy offsets for cmd params */
-    cmd_AddParm(ts, "-syslog", CMD_FLAG, CMD_OPTIONAL,
-               "Write salvage log to syslogs");
-    cmd_AddParm(ts, "-syslogfacility", CMD_SINGLE, CMD_OPTIONAL,
-               "Syslog facility number to use");
-    cmd_AddParm(ts, "-datelogs", CMD_FLAG, CMD_OPTIONAL,
-               "Include timestamp in logfile filename");
-
-#ifdef FAST_RESTART
-    cmd_AddParm(ts, "-DontSalvage", CMD_FLAG, CMD_OPTIONAL,
-               "Don't salvage. This my be set in BosConfig to let the fileserver restart immediately after a crash. Bad volumes will be taken offline");
-#endif /* FAST_RESTART */
-    err = cmd_Dispatch(argc, argv);
-    Exit(err);
-}
  
  /* Get the salvage lock if not already held. Hold until process exits. */
  void
@@ -1249,7 +734,8 @@ SalvageFileSys1(struct DiskPartition *partP, VolumeId singleVolumeNumber)
         ForceSalvage = UseTheForceLuke(fileSysPath);
  
      if (singleVolumeNumber) {
-       if (!VConnectFS()) {
+       /* salvageserver already setup fssync conn for us */
+       if ((programType != salvageServer) && !VConnectFS()) {
             Abort("Couldn't connect to file server\n");
         }
         AskOffline(singleVolumeNumber);
@@ -2554,7 +2040,7 @@ SalvageIndex(Inode ino, VnodeClass class, int RW,
                      * if no such match, take the first determined by our sort
                      * order */
                     register struct ViceInodeInfo *lip = ip;
-                   register lnInodes = nInodes;
+                   register int lnInodes = nInodes;
                     while (lnInodes
                            && lip->u.vnode.vnodeNumber == vnodeNumber) {
                         if (VNDISK_GET_INO(vnode) == lip->inodeNumber) {
@@ -3628,8 +3114,38 @@ MaybeZapVolume(register struct InodeSummary *isp, char *message, int deleteMe,
  void
  AskOffline(VolumeId volumeId)
  {
-    if (FSYNC_askfs(volumeId, NULL, FSYNC_OFF, FSYNC_SALVAGE) == FSYNC_DENIED) {
-       Log("AskOffline:  file server denied offline request; a general salvage is required.\n");
+    afs_int32 code, i;
+
+    for (i = 0; i < 3; i++) {
+       code = FSYNC_VolOp(volumeId, NULL, FSYNC_VOL_OFF, FSYNC_SALVAGE, NULL);
+
+       if (code == SYNC_OK) {
+           break;
+       } else if (code == SYNC_DENIED) {
+#ifdef DEMAND_ATTACH_ENABLE
+           Log("AskOffline:  file server denied offline request; a general salvage may be required.\n");
+#else
+           Log("AskOffline:  file server denied offline request; a general salvage is required.\n");
+#endif
+           Abort("Salvage aborted\n");
+       } else if (code == SYNC_BAD_COMMAND) {
+           Log("AskOffline:  fssync protocol mismatch (bad command word '%d'); salvage aborting.\n",
+               FSYNC_VOL_OFF);
+#ifdef DEMAND_ATTACH_ENABLE
+           Log("AskOffline:  please make sure fileserver, volserver, salvageserver and salvager binaries are same version.\n");
+#else
+           Log("AskOffline:  please make sure fileserver, volserver and salvager binaries are same version.\n");
+#endif
+           Abort("Salvage aborted\n");
+       } else if (i < 2) {
+           /* try it again */
+           Log("AskOffline:  request for fileserver to take volume offline failed; trying again...\n");
+           FSYNC_clientFinis();
+           FSYNC_clientInit();
+       }
+    }
+    if (code != SYNC_OK) {
+       Log("AskOffline:  request for fileserver to take volume offline failed; salvage aborting.\n");
         Abort("Salvage aborted\n");
      }
  }
@@ -3637,8 +3153,30 @@ AskOffline(VolumeId volumeId)
  void
  AskOnline(VolumeId volumeId, char *partition)
  {
-    if (FSYNC_askfs(volumeId, partition, FSYNC_ON, 0) == FSYNC_DENIED) {
-       Log("AskOnline:  file server denied online request to volume %u partition %s\n", volumeId, partition);
+    afs_int32 code, i;
+
+    for (i = 0; i < 3; i++) {
+       code = FSYNC_VolOp(volumeId, partition, FSYNC_VOL_ON, FSYNC_WHATEVER, NULL);
+
+       if (code == SYNC_OK) {
+           break;
+       } else if (code == SYNC_DENIED) {
+           Log("AskOnline:  file server denied online request to volume %u partition %s; trying again...\n", volumeId, partition);
+       } else if (code == SYNC_BAD_COMMAND) {
+           Log("AskOnline:  fssync protocol mismatch (bad command word '%d')\n",
+               FSYNC_VOL_ON);
+#ifdef DEMAND_ATTACH_ENABLE
+           Log("AskOnline:  please make sure fileserver, volserver, salvageserver and salvager binaries are same version.\n");
+#else
+           Log("AskOnline:  please make sure fileserver, volserver and salvager binaries are same version.\n");
+#endif
+           break;
+       } else if (i < 2) {
+           /* try it again */
+           Log("AskOnline:  request for fileserver to take volume offline failed; trying again...\n");
+           FSYNC_clientFinis();
+           FSYNC_clientInit();
+       }
      }
  }
  
@@ -3772,7 +3310,7 @@ TimeStamp(time_t clock, int precision)
  }
  
  void
-CheckLogFile(void)
+CheckLogFile(char * log_path)
  {
      char oldSlvgLog[AFSDIR_PATH_MAX];
  
@@ -3783,11 +3321,11 @@ CheckLogFile(void)
      }
  #endif
  
-    strcpy(oldSlvgLog, AFSDIR_SERVER_SLVGLOG_FILEPATH);
+    strcpy(oldSlvgLog, log_path);
      strcat(oldSlvgLog, ".old");
      if (!logFile) {
-       renamefile(AFSDIR_SERVER_SLVGLOG_FILEPATH, oldSlvgLog);
-       logFile = afs_fopen(AFSDIR_SERVER_SLVGLOG_FILEPATH, "a");
+       renamefile(log_path, oldSlvgLog);
+       logFile = afs_fopen(log_path, "a");
  
         if (!logFile) {         /* still nothing, use stdout */
             logFile = stdout;
@@ -3801,7 +3339,7 @@ CheckLogFile(void)
  
  #ifndef AFS_NT40_ENV
  void
-TimeStampLogFile(void)
+TimeStampLogFile(char * log_path)
  {
      char stampSlvgLog[AFSDIR_PATH_MAX];
      struct tm *lt;
@@ -3811,13 +3349,13 @@ TimeStampLogFile(void)
      lt = localtime(&now);
      (void)afs_snprintf(stampSlvgLog, sizeof stampSlvgLog,
                        "%s.%04d-%02d-%02d.%02d:%02d:%02d",
-                      AFSDIR_SERVER_SLVGLOG_FILEPATH, lt->tm_year + 1900,
+                      log_path, lt->tm_year + 1900,
                        lt->tm_mon + 1, lt->tm_mday, lt->tm_hour, lt->tm_min,
                        lt->tm_sec);
  
      /* try to link the logfile to a timestamped filename */
      /* if it fails, oh well, nothing we can do */
-    link(AFSDIR_SERVER_SLVGLOG_FILEPATH, stampSlvgLog);
+    link(log_path, stampSlvgLog);
  }
  #endif
  
@@ -3937,7 +3475,7 @@ UseTheForceLuke(char *path)
   *
   * NOTE:
   *     The VRMIX fsck will not muck with the filesystem it is supposedly
- *     fixing and create a "FORCESAVAGE" file (by design).  Instead, we
+ *     fixing and create a "FORCESALVAGE" file (by design).  Instead, we
   *     muck directly with the root inode, which is within the normal
   *     domain of fsck.
   *     ListViceInodes() has a side effect of setting ForceSalvage if
diff --git a/src/vol/vol-salvage.h b/src/vol/vol-salvage.h

new file mode 100644 (file)

index 0000000..c95ce24
--- /dev/null
+++ b/src/vol/vol-salvage.h
@@ -0,0 +1,282 @@
+/*
+ * Copyright 2000, International Business Machines Corporation and others.
+ * All Rights Reserved.
+ * 
+ * This software has been released under the terms of the IBM Public
+ * License.  For details, see the LICENSE file in the top-level source
+ * directory or online at http://www.openafs.org/dl/license10.html
+ */
+
+/*
+ *      Module:                vol-salvage.h
+ */
+
+#ifndef __vol_salvage_h_
+#define __vol_salvage_h_
+
+#define SalvageVersion "2.4"
+
+#include "salvage.h"
+#include "volinodes.h"
+
+/* salvager data structures */
+struct InodeSummary {          /* Inode summary file--an entry for each
+                                * volume in the inode file for a partition */
+    VolId volumeId;            /* Volume id */
+    VolId RWvolumeId;          /* RW volume associated */
+    int index;                 /* index into inode file (0, 1, 2 ...) */
+    int nInodes;               /* Number of inodes for this volume */
+    int nSpecialInodes;                /* Number of special inodes, i.e.  volume
+                                * header, index, etc.  These are all
+                                * marked (viceinode.h) and will all be sorted
+                                * to the beginning of the information for
+                                * this volume.  Read-only volumes should
+                                * ONLY have special inodes (all the other
+                                * inodes look as if they belong to the
+                                * original RW volume). */
+    Unique maxUniquifier;      /* The maximum uniquifier found in all the inodes.
+                                * This is only useful for RW volumes and is used
+                                * to compute a new volume uniquifier in the event
+                                * that the header needs to be recreated. The inode
+                                * uniquifier may be a truncated version of vnode
+                                * uniquifier (AFS_3DISPARES). The real maxUniquifier
+                                * is from the vnodes and later calculated from it */
+    struct VolumeSummary *volSummary;
+    /* Either a pointer to the original volume
+     * header summary, or constructed summary
+     * information */
+} *inodeSummary;
+#define readOnly(isp)  ((isp)->volumeId != (isp)->RWvolumeId)
+
+struct VolumeSummary {         /* Volume summary an entry for each
+                                * volume in a volume directory.
+                                * Assumption: one volume directory per
+                                * partition */
+    char *fileName;            /* File name on the partition for the volume
+                                * header */
+    struct VolumeHeader header;
+    /* volume number, rw volume number, inode
+     * numbers of each major component of
+     * the volume */
+    IHandle_t *volumeInfoHandle;
+    byte wouldNeedCallback;    /* set if the file server should issue
+                                * call backs for all the files in this volume when
+                                * the volume goes back on line */
+};
+
+struct VnodeInfo {
+    IHandle_t *handle;         /* Inode containing this index */
+    int nVnodes;               /* Total number of vnodes in index */
+    int nAllocatedVnodes;      /* Total number actually used */
+    int volumeBlockCount;      /* Total number of blocks used by volume */
+    Inode *inodes;             /* Directory only */
+    struct VnodeEssence {
+       short count;            /* Number of references to vnode; MUST BE SIGNED */
+       unsigned claimed:1;     /* Set when a parent directory containing an entry
+                                * referencing this vnode is found.  The claim
+                                * is that the parent in "parent" can point to
+                                * this vnode, and no other */
+       unsigned changed:1;     /* Set if any parameters (other than the count)
+                                * in the vnode change.   It is determined if the
+                                * link count has changed by noting whether it is
+                                * 0 after scanning all directories */
+       unsigned salvaged:1;    /* Set if this directory vnode has already been salvaged. */
+       unsigned todelete:1;    /* Set if this vnode is to be deleted (should not be claimed) */
+       afs_fsize_t blockCount;
+       /* Number of blocks (1K) used by this vnode,
+        * approximately */
+       VnodeId parent;         /* parent in vnode */
+       Unique unique;          /* Must match entry! */
+       char *name;             /* Name of directory entry */
+       int modeBits;           /* File mode bits */
+       Inode InodeNumber;      /* file's inode */
+       int type;               /* File type */
+       int author;             /* File author */
+       int owner;              /* File owner */
+       int group;              /* File group */
+    } *vnodes;
+};
+
+struct DirSummary {
+    struct DirHandle dirHandle;
+    VnodeId vnodeNumber;
+    Unique unique;
+    unsigned haveDot, haveDotDot;
+    VolumeId rwVid;
+    int copied;                        /* If the copy-on-write stuff has been applied */
+    VnodeId parent;
+    char *name;
+    char *vname;
+    IHandle_t *ds_linkH;
+};
+
+#define ORPH_IGNORE 0
+#define ORPH_REMOVE 1
+#define ORPH_ATTACH 2
+
+
+/* command line options */
+extern int debug;                      /* -d flag */
+extern int Testing;                    /* -n flag */
+extern int ListInodeOption;            /* -i flag */
+extern int ShowRootFiles;              /* -r flag */
+extern int RebuildDirs;                        /* -sal flag */
+extern int Parallel;                   /* -para X flag */
+extern int PartsPerDisk;               /* Salvage up to 8 partitions on same disk sequentially */
+extern int forceR;                     /* -b flag */
+extern int ShowLog;                    /* -showlog flag */
+extern int ShowSuid;                   /* -showsuid flag */
+extern int ShowMounts;                 /* -showmounts flag */
+extern int orphans;                    /* -orphans option */
+extern int Showmode;
+
+#ifndef AFS_NT40_ENV
+extern int useSyslog;                  /* -syslog flag */
+extern int useSyslogFacility;          /* -syslogfacility option */
+#endif
+
+#define        MAXPARALLEL     32
+
+extern int OKToZap;                    /* -o flag */
+extern int ForceSalvage;               /* If salvage should occur despite the DONT_SALVAGE flag
+                                        * in the volume header */
+
+
+#define ROOTINODE      2       /* Root inode of a 4.2 Unix file system
+                                * partition */
+extern Device fileSysDevice;   /* The device number of the current
+                                * partition being salvaged */
+#ifdef AFS_NT40_ENV
+extern char fileSysPath[8];
+#else
+extern char *fileSysPath;      /* The path of the mounted partition currently
+                                * being salvaged, i.e. the directory
+                                * containing the volume headers */
+#endif /* AFS_NT40_ENV */
+extern char *fileSysPathName;  /* NT needs this to make name pretty in log. */
+extern IHandle_t *VGLinkH;     /* Link handle for current volume group. */
+extern int VGLinkH_cnt;                /* # of references to lnk handle. */
+extern struct DiskPartition *fileSysPartition; /* Partition  being salvaged */
+#ifndef AFS_NT40_ENV
+extern char *fileSysDeviceName;        /* The block device where the file system
+                                * being salvaged was mounted */
+extern char *filesysfulldev;
+#endif /* AFS_NT40_ENV */
+extern int VolumeChanged;      /* Set by any routine which would change the volume in
+                                * a way which would require callbacks to be broken if the
+                                * volume was put back on line by an active file server */
+
+extern VolumeDiskData VolInfo; /* A copy of the last good or salvaged volume header dealt with */
+
+extern int nVolumesInInodeFile;        /* Number of read-write volumes summarized */
+extern int inodeFd;            /* File descriptor for inode file */
+
+
+extern struct VnodeInfo vnodeInfo[nVNODECLASSES];
+
+
+extern struct VolumeSummary *volumeSummaryp;   /* Holds all the volumes in a part */
+extern int nVolumes;           /* Number of volumes (read-write and read-only)
+                                * in volume summary */
+
+extern char * tmpdir;
+extern FILE *logFile;          /* one of {/usr/afs/logs,/vice/file}/SalvageLog */
+
+
+#ifdef AFS_NT40_ENV
+/* For NT, we can fork the per partition salvagers to gain the required
+ * safety against Aborts. But there's too many complex data structures at
+ * the per volume salvager layer to easily copy the data across.
+ * childJobNumber is reset from -1 to the job number if this is a
+ * per partition child of the main salvager. This information is passed
+ * out-of-band in the extra data area setup for the now unused parent/child
+ * data transfer.
+ */
+#define SALVAGER_MAGIC 0x00BBaaDD
+#define NOT_CHILD -1           /* job numbers start at 0 */
+/* If new options need to be passed to child, add them here. */
+typedef struct {
+    int cj_magic;
+    int cj_number;
+    char cj_part[32];
+} childJob_t;
+
+/* Child job this process is running. */
+extern childJob_t myjob = { SALVAGER_MAGIC, NOT_CHILD, "" };
+
+extern int nt_SalvagePartition(char *partName, int jobn);
+extern int nt_SetupPartitionSalvage(void *datap, int len);
+
+typedef struct {
+    struct InodeSummary *svgp_inodeSummaryp;
+    int svgp_count;
+} SVGParms_t;
+#define canfork 0
+#else /* AFS_NT40_ENV */
+#define canfork 1
+#endif /* AFS_NT40_ENV */
+
+
+/* prototypes */
+extern void Exit(int code);
+extern int Fork(void);
+extern int Wait(char *prog);
+extern char *ToString(char *s);
+extern void AskOffline(VolumeId volumeId);
+extern void AskOnline(VolumeId volumeId, char *partition);
+extern void CheckLogFile(char * log_path);
+#ifndef AFS_NT40_ENV
+extern void TimeStampLogFile(char * log_path);
+#endif
+extern void ClearROInUseBit(struct VolumeSummary *summary);
+extern void CopyAndSalvage(register struct DirSummary *dir);
+extern int CopyInode(Device device, Inode inode1, Inode inode2, int rwvolume);
+extern void CopyOnWrite(register struct DirSummary *dir);
+extern void CountVolumeInodes(register struct ViceInodeInfo *ip, int maxInodes,
+                      register struct InodeSummary *summary);
+extern void DeleteExtraVolumeHeaderFile(register struct VolumeSummary *vsp);
+extern void DistilVnodeEssence(VolumeId vid, VnodeClass class, Inode ino,
+                              Unique * maxu);
+extern int GetInodeSummary(char *path, VolumeId singleVolumeNumber);
+extern void GetVolumeSummary(VolumeId singleVolumeNumber);
+extern void JudgeEntry(struct DirSummary *dir, char *name, VnodeId vnodeNumber,
+                      Unique unique);
+extern void MaybeZapVolume(register struct InodeSummary *isp, char *message,
+                          int deleteMe, int check);
+extern void ObtainSalvageLock(void);
+extern void PrintInodeList(void);
+extern void PrintInodeSummary(void);
+extern void PrintVolumeSummary(void);
+extern int QuickCheck(register struct InodeSummary *isp, int nVols);
+extern void RemoveTheForce(char *path);
+extern void SalvageDir(char *name, VolumeId rwVid, struct VnodeInfo *dirVnodeInfo,
+                      IHandle_t * alinkH, int i, struct DirSummary *rootdir,
+                      int *rootdirfound);
+extern void SalvageFileSysParallel(struct DiskPartition *partP);
+extern void SalvageFileSys(struct DiskPartition *partP, VolumeId singleVolumeNumber);
+extern void SalvageFileSys1(struct DiskPartition *partP,
+                           VolumeId singleVolumeNumber);
+extern int SalvageHeader(register struct stuff *sp, struct InodeSummary *isp,
+                        int check, int *deleteMe);
+extern int SalvageIndex(Inode ino, VnodeClass class, int RW,
+                       register struct ViceInodeInfo *ip, int nInodes,
+                       struct VolumeSummary *volSummary, int check);
+extern int SalvageVnodes(register struct InodeSummary *rwIsp,
+                        register struct InodeSummary *thisIsp,
+                        register struct ViceInodeInfo *inodes, int check);
+extern int SalvageVolume(register struct InodeSummary *rwIsp, IHandle_t * alinkH);
+extern void DoSalvageVolumeGroup(register struct InodeSummary *isp, int nVols);
+#ifdef AFS_NT40_ENV
+extern void SalvageVolumeGroup(register struct InodeSummary *isp, int nVols);
+#else
+#define SalvageVolumeGroup DoSalvageVolumeGroup
+#endif
+extern int SalvageVolumeHeaderFile(register struct InodeSummary *isp,
+                                  register struct ViceInodeInfo *inodes, int RW,
+                                  int check, int *deleteMe);
+extern void showlog(void);
+extern int UseTheForceLuke(char *path);
+
+
+
+#endif /* __vol_salvage_h_ */
diff --git a/src/vol/voldefs.h b/src/vol/voldefs.h

index 2094a0ca045e0cee4b1a34264b2ddd834f031317..b546be24f35c9783dd09b18786b351e68df1c5a7 100644 (file)
--- a/src/vol/voldefs.h
+++ b/src/vol/voldefs.h
@@ -25,6 +25,9 @@
  #define ROVOL                  1
  #define BACKVOL                        2
  
+/* maximum number of Vice partitions */
+#define        VOLMAXPARTS     255
+
  /* All volumes will have a volume header name in this format */
  #if    defined(AFS_AIX_ENV) || defined(AFS_HPUX_ENV)
  /* Note that <afs/param.h> must have been included before we get here... */
diff --git a/src/vol/volinodes.h b/src/vol/volinodes.h

index cb72b9c0b6c62ee3477b3adbfda4284740a83892..37b00fef6b726b232aab00d025709fd777af23f6 100644 (file)
--- a/src/vol/volinodes.h
+++ b/src/vol/volinodes.h
@@ -14,6 +14,9 @@
  
   */
  
+#ifndef __volinodes_h_
+#define __volinodes_h_
+
  /* Used by vutil.c and salvager.c */
  
  private struct VolumeHeader tempHeader;
@@ -56,3 +59,5 @@ LINKTABLEMAGIC, LINKTABLEVERSION}, VI_LINKTABLE,
  #define MAXINODETYPE VI_LINKTABLE
  
  Volume *VWaitAttachVolume();
+
+#endif /* __volinodes_h_ */
diff --git a/src/vol/volume.c b/src/vol/volume.c

index 7eb8854e86033ddcc5db87039a373cdfbecce234..fae9f87b56f5175ecdb6c776e911fbb2e1dc7d1d 100644 (file)
--- a/src/vol/volume.c
+++ b/src/vol/volume.c
@@ -5,6 +5,8 @@
   * This software has been released under the terms of the IBM Public
   * License.  For details, see the LICENSE file in the top-level source
   * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
   */
  
  /* 1/1/89: NB:  this stuff is all going to be replaced.  Don't take it too seriously */
@@ -121,6 +123,9 @@ RCSID
  #ifdef AFS_NT40_ENV
  #include <io.h>
  #endif
+#include "daemon_com.h"
+#include "fssync.h"
+#include "salvsync.h"
  #include "vnode.h"
  #include "volume.h"
  #include "partition.h"
@@ -130,11 +135,15 @@ RCSID
  #include "afs/assert.h"
  #endif /* AFS_PTHREAD_ENV */
  #include "vutils.h"
-#include "fssync.h"
+#include <dir/dir.h>
  #ifndef AFS_NT40_ENV
  #include <unistd.h>
  #endif
  
+#if !defined(offsetof)
+#include <stddef.h>
+#endif
+
  #ifdef O_LARGEFILE
  #define afs_stat       stat64
  #define afs_fstat      fstat64
@@ -147,14 +156,16 @@ RCSID
  
  #ifdef AFS_PTHREAD_ENV
  pthread_mutex_t vol_glock_mutex;
-pthread_mutex_t vol_attach_mutex;
-pthread_mutex_t vol_fsync_mutex;
  pthread_mutex_t vol_trans_mutex;
  pthread_cond_t vol_put_volume_cond;
  pthread_cond_t vol_sleep_cond;
  int vol_attach_threads = 1;
  #endif /* AFS_PTHREAD_ENV */
  
+#ifdef AFS_DEMAND_ATTACH_FS
+pthread_mutex_t vol_salvsync_mutex;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
  #ifdef AFS_OSF_ENV
  extern void *calloc(), *realloc();
  #endif
@@ -162,12 +173,18 @@ extern void *calloc(), *realloc();
  /*@printflike@*/ extern void Log(const char *format, ...);
  
  /* Forward declarations */
-static Volume *attach2(Error * ec, char *path,
+static Volume *attach2(Error * ec, VolId vid, char *path,
                        register struct VolumeHeader *header,
-                      struct DiskPartition *partp, int isbusy);
+                      struct DiskPartition *partp, Volume * vp, 
+                      int isbusy, int mode);
+static void ReallyFreeVolume(Volume * vp);
+#ifdef AFS_DEMAND_ATTACH_FS
  static void FreeVolume(Volume * vp);
+#else /* !AFS_DEMAND_ATTACH_FS */
+#define FreeVolume(vp) ReallyFreeVolume(vp)
  static void VScanUpdateList(void);
-static void InitLRU(int howMany);
+#endif /* !AFS_DEMAND_ATTACH_FS */
+static void VInitVolumeHeaderCache(afs_uint32 howMany);
  static int GetVolumeHeader(register Volume * vp);
  static void ReleaseVolumeHeader(register struct volHeader *hd);
  static void FreeVolumeHeader(register Volume * vp);
@@ -175,22 +192,72 @@ static void AddVolumeToHashTable(register Volume * vp, int hashid);
  static void DeleteVolumeFromHashTable(register Volume * vp);
  static int VHold(Volume * vp);
  static int VHold_r(Volume * vp);
-static void GetBitmap(Error * ec, Volume * vp, VnodeClass class);
+static void VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class);
  static void GetVolumePath(Error * ec, VolId volumeId, char **partitionp,
                           char **namep);
  static void VReleaseVolumeHandles_r(Volume * vp);
  static void VCloseVolumeHandles_r(Volume * vp);
+static void LoadVolumeHeader(Error * ec, Volume * vp);
+static int VCheckOffline(register Volume * vp);
+static int VCheckDetach(register Volume * vp);
+static Volume * GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flags);
+static int VolumeExternalName_r(VolumeId volumeId, char * name, size_t len);
  
  int LogLevel;                  /* Vice loglevel--not defined as extern so that it will be
                                  * defined when not linked with vice, XXXX */
  ProgramType programType;       /* The type of program using the package */
  
  
+/* extended volume package statistics */
+VolPkgStats VStats;
+
+
  #define VOLUME_BITMAP_GROWSIZE 16      /* bytes, => 128vnodes */
                                         /* Must be a multiple of 4 (1 word) !! */
-#define VOLUME_HASH_TABLE_SIZE 128     /* Must be a power of 2!! */
-#define VOLUME_HASH(volumeId) (volumeId&(VOLUME_HASH_TABLE_SIZE-1))
-private Volume *VolumeHashTable[VOLUME_HASH_TABLE_SIZE];
+
+/* this parameter needs to be tunable at runtime.
+ * 128 was really inadequate for largish servers -- at 16384 volumes this
+ * puts average chain length at 128, thus an average of 65 derefs to find a volptr.
+ * talk about bad spatial locality...
+ *
+ * an AVL or splay tree might work a lot better, but we'll just increase
+ * the default hash table size for now
+ */
+#define DEFAULT_VOLUME_HASH_SIZE 256   /* Must be a power of 2!! */
+#define DEFAULT_VOLUME_HASH_MASK (DEFAULT_VOLUME_HASH_SIZE-1)
+#define VOLUME_HASH(volumeId) (volumeId&(VolumeHashTable.Mask))
+
+/*
+ * turn volume hash chains into partially ordered lists.
+ * when the threshold is exceeded between two adjacent elements,
+ * perform a chain rebalancing operation.
+ *
+ * keep the threshold high in order to keep cache line invalidates
+ * low "enough" on SMPs
+ */
+#define VOLUME_HASH_REORDER_THRESHOLD 200
+
+/*
+ * when possible, don't just reorder single elements, but reorder
+ * entire chains of elements at once.  a chain of elements that
+ * exceed the element previous to the pivot by at least CHAIN_THRESH 
+ * accesses are moved in front of the chain whose elements have at
+ * least CHAIN_THRESH fewer accesses than the pivot element
+ */
+#define VOLUME_HASH_REORDER_CHAIN_THRESH (VOLUME_HASH_REORDER_THRESHOLD / 2)
+
+#include "rx/rx_queue.h"
+
+
+VolumeHashTable_t VolumeHashTable = {
+    DEFAULT_VOLUME_HASH_SIZE,
+    DEFAULT_VOLUME_HASH_MASK,
+    NULL
+};
+
+
+static void VInitVolumeHash(void);
+
  
  #ifndef AFS_HAVE_FFS
  /* This macro is used where an ffs() call does not exist. Was in util/ffs.c */
@@ -211,7 +278,6 @@ ffs(x)
  #endif /* !AFS_HAVE_FFS */
  
  #ifdef AFS_PTHREAD_ENV
-#include "rx/rx_queue.h"
  typedef struct diskpartition_queue_t {
      struct rx_queue queue;
      struct DiskPartition * diskP;
@@ -224,9 +290,120 @@ typedef struct vinitvolumepackage_thread_t {
  static void * VInitVolumePackageThread(void * args);
  #endif /* AFS_PTHREAD_ENV */
  
-struct Lock vol_listLock;      /* Lock obtained when listing volumes:  prevents a volume from being missed if the volume is attached during a list volumes */
+static int VAttachVolumesByPartition(struct DiskPartition *diskP, 
+                                    int * nAttached, int * nUnattached);
+
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* demand attach fileserver extensions */
+
+/* XXX
+ * in the future we will support serialization of VLRU state into the fs_state
+ * disk dumps
+ *
+ * these structures are the beginning of that effort
+ */
+struct VLRU_DiskHeader {
+    struct versionStamp stamp;            /* magic and structure version number */
+    afs_uint32 mtime;                     /* time of dump to disk */
+    afs_uint32 num_records;               /* number of VLRU_DiskEntry records */
+};
+
+struct VLRU_DiskEntry {
+    afs_uint32 vid;                       /* volume ID */
+    afs_uint32 idx;                       /* generation */
+    afs_uint32 last_get;                  /* timestamp of last get */
+};
+
+struct VLRU_StartupQueue {
+    struct VLRU_DiskEntry * entry;
+    int num_entries;
+    int next_idx;
+};
+
+/*
+ * shared state for the parallel shutdown performed by VShutdown_r.
+ * a pointer to one instance is passed to every VShutdownThread worker
+ * and to ShutdownController.  the per-partition arrays are indexed by
+ * DiskPartition->device.
+ */
+typedef struct vshutdown_thread_t {
+    struct rx_queue q;                    /* head of the queue of diskpartition_queue_t work items */
+    pthread_mutex_t lock;                 /* held by VShutdown_r while spawning workers and waiting on master_cv */
+    pthread_cond_t cv;                    /* broadcast by VShutdown_r after it advances pass to 1 */
+    pthread_cond_t master_cv;             /* VShutdown_r waits here until n_threads_complete reaches n_threads */
+    int n_threads;                        /* number of worker threads (set from vol_attach_threads) */
+    int n_threads_complete;               /* compared against n_threads; reset to 0 after pass 0 */
+    int vol_remaining;                    /* presumably volumes still to be shut down -- TODO confirm */
+    int schedule_version;                 /* presumably changes with each new schedule -- TODO confirm */
+    int pass;                             /* current shutdown pass (0-3); shutdown is over once it reaches 4 */
+    byte n_parts;                         /* number of entries counted on DiskPartitionList */
+    byte n_parts_done_pass;               /* presumably partitions finished with the current pass -- TODO confirm */
+    byte part_thread_target[VOLMAXPARTS+1];   /* per-partition value logged as thread_target by ShutdownController */
+    byte part_done_pass[VOLMAXPARTS+1];       /* per-partition value logged as done_pass by ShutdownController */
+    struct rx_queue * part_pass_head[VOLMAXPARTS+1]; /* per-partition cursor; starts at the first vol_list element */
+    int stats[4][VOLMAXPARTS+1];          /* [pass][device] counters logged by VShutdown_r at the end */
+} vshutdown_thread_t;
+static void * VShutdownThread(void * args);
+
+
+static Volume * VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode);
+static int VCheckFree(Volume * vp);
+
+/* VByP List */
+static void AddVolumeToVByPList_r(Volume * vp);
+static void DeleteVolumeFromVByPList_r(Volume * vp);
+static void VVByPListBeginExclusive_r(struct DiskPartition * dp);
+static void VVByPListEndExclusive_r(struct DiskPartition * dp);
+static void VVByPListWait_r(struct DiskPartition * dp);
+
+/* online salvager */
+static int VCheckSalvage(register Volume * vp);
+static int VUpdateSalvagePriority_r(Volume * vp);
+static int VScheduleSalvage_r(Volume * vp);
+static int VCancelSalvage_r(Volume * vp, int reason);
+
+/* Volume hash table */
+static void VReorderHash_r(VolumeHashChainHead * head, Volume * pp, Volume * vp);
+static void VHashBeginExclusive_r(VolumeHashChainHead * head);
+static void VHashEndExclusive_r(VolumeHashChainHead * head);
+static void VHashWait_r(VolumeHashChainHead * head);
+
+/* Volume state machine */
+static void VCreateReservation_r(Volume * vp);
+static void VCancelReservation_r(Volume * vp);
+static void VWaitStateChange_r(Volume * vp);
+static void VWaitExclusiveState_r(Volume * vp);
+static int IsExclusiveState(VolState state);
+static int IsErrorState(VolState state);
+static int IsValidState(VolState state);
+
+/* shutdown */
+static int ShutdownVByPForPass_r(struct DiskPartition * dp, int pass);
+static int ShutdownVolumeWalk_r(struct DiskPartition * dp, int pass,
+                               struct rx_queue ** idx);
+static void ShutdownController(vshutdown_thread_t * params);
+static void ShutdownCreateSchedule(vshutdown_thread_t * params);
+
+/* VLRU */
+static void VLRU_ComputeConstants(void);
+static void VInitVLRU(void);
+static void VLRU_Init_Node_r(volatile Volume * vp);
+static void VLRU_Add_r(volatile Volume * vp);
+static void VLRU_Delete_r(volatile Volume * vp);
+static void VLRU_UpdateAccess_r(volatile Volume * vp);
+static void * VLRU_ScannerThread(void * args);
+static void VLRU_Scan_r(int idx);
+static void VLRU_Promote_r(int idx);
+static void VLRU_Demote_r(int idx);
+static void VLRU_SwitchQueues(volatile Volume * vp, int new_idx, int append);
+
+/* soft detach */
+static int VCheckSoftDetach(volatile Volume * vp, afs_uint32 thresh);
+static int VCheckSoftDetachCandidate(volatile Volume * vp, afs_uint32 thresh);
+static int VSoftDetachVolume_r(volatile Volume * vp, afs_uint32 thresh);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+struct Lock vol_listLock;      /* Lock obtained when listing volumes:  
+                                * prevents a volume from being missed 
+                                * if the volume is attached during a 
+                                * list volumes */
  
-extern struct Lock FSYNC_handler_lock;
  
  static int TimeZoneCorrection; /* Number of seconds west of GMT */
  
@@ -247,12 +424,16 @@ bit32 VolumeCacheCheck;           /* Incremented everytime a volume goes on line--
                                  * vnode will be invalidated
                                  * access only with VOL_LOCK held */
  
-int VolumeCacheSize = 200, VolumeGets = 0, VolumeReplacements = 0, Vlooks = 0;
  
  
+
+/***************************************************/
+/* Startup routines                                */
+/***************************************************/
+
  int
-VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes,
-                  int connect, int volcache)
+VInitVolumePackage(ProgramType pt, afs_uint32 nLargeVnodes, afs_uint32 nSmallVnodes,
+                  int connect, afs_uint32 volcache)
  {
      int errors = 0;            /* Number of errors while finding vice partitions. */
      struct timeval tv;
@@ -260,10 +441,24 @@ VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes,
  
      programType = pt;
  
+#ifdef AFS_DEMAND_ATTACH_FS
+    memset(&VStats, 0, sizeof(VStats));
+    VStats.hdr_cache_size = 200;
+#endif
+
+    VInitPartitionPackage();
+    VInitVolumeHash();
+    VInitVnHashByVolume();
+#ifdef AFS_DEMAND_ATTACH_FS
+    if (programType == fileServer) {
+       VInitVLRU();
+    } else {
+       VLRU_SetOptions(VLRU_SET_ENABLED, 0);
+    }
+#endif
+
  #ifdef AFS_PTHREAD_ENV
      assert(pthread_mutex_init(&vol_glock_mutex, NULL) == 0);
-    assert(pthread_mutex_init(&vol_attach_mutex, NULL) == 0);
-    assert(pthread_mutex_init(&vol_fsync_mutex, NULL) == 0);
      assert(pthread_mutex_init(&vol_trans_mutex, NULL) == 0);
      assert(pthread_cond_init(&vol_put_volume_cond, NULL) == 0);
      assert(pthread_cond_init(&vol_sleep_cond, NULL) == 0);
@@ -271,25 +466,41 @@ VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes,
      IOMGR_Initialize();
  #endif /* AFS_PTHREAD_ENV */
      Lock_Init(&vol_listLock);
-    Lock_Init(&FSYNC_handler_lock);
+
      srandom(time(0));          /* For VGetVolumeInfo */
      gettimeofday(&tv, &tz);
      TimeZoneCorrection = tz.tz_minuteswest * 60;
  
+#ifdef AFS_DEMAND_ATTACH_FS
+    assert(pthread_mutex_init(&vol_salvsync_mutex, NULL) == 0);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
      /* Ok, we have done enough initialization that fileserver can 
       * start accepting calls, even though the volumes may not be 
       * available just yet.
       */
      VInit = 1;
  
+#if defined(AFS_DEMAND_ATTACH_FS) && defined(SALVSYNC_BUILD_SERVER)
+    if (programType == salvageServer) {
+       SALVSYNC_salvInit();
+    }
+#endif /* AFS_DEMAND_ATTACH_FS && SALVSYNC_BUILD_SERVER */
+#ifdef FSSYNC_BUILD_SERVER
      if (programType == fileServer) {
-       /* File server or "stand" */
         FSYNC_fsInit();
      }
+#endif
+#if defined(AFS_DEMAND_ATTACH_FS) && defined(SALVSYNC_BUILD_CLIENT)
+    if (programType == fileServer) {
+       /* establish a connection to the salvager at this point */
+       assert(VConnectSALV() != 0);
+    }
+#endif /* AFS_DEMAND_ATTACH_FS && SALVSYNC_BUILD_CLIENT */
  
-    if (volcache > VolumeCacheSize)
-       VolumeCacheSize = volcache;
-    InitLRU(VolumeCacheSize);
+    if (volcache > VStats.hdr_cache_size)
+       VStats.hdr_cache_size = volcache;
+    VInitVolumeHeaderCache(VStats.hdr_cache_size);
  
      VInitVnodes(vLarge, nLargeVnodes);
      VInitVnodes(vSmall, nSmallVnodes);
@@ -304,7 +515,7 @@ VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes,
  #ifdef AFS_PTHREAD_ENV
         struct vinitvolumepackage_thread_t params;
         struct diskpartition_queue_t * dpq;
-       int i, len;
+       int i, threads, parts;
         pthread_t tid;
         pthread_attr_t attrs;
  
@@ -313,29 +524,56 @@ VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes,
         params.n_threads_complete = 0;
  
         /* create partition work queue */
-       for (len=0, diskP = DiskPartitionList; diskP; diskP = diskP->next, len++) {
+       for (parts=0, diskP = DiskPartitionList; diskP; diskP = diskP->next, parts++) {
             dpq = (diskpartition_queue_t *) malloc(sizeof(struct diskpartition_queue_t));
             assert(dpq != NULL);
             dpq->diskP = diskP;
             queue_Prepend(&params,dpq);
         }
  
-       assert(pthread_attr_init(&attrs) == 0);
-       assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
+       threads = MIN(parts, vol_attach_threads);
  
-       len = MIN(len, vol_attach_threads);
-       
-       VOL_LOCK;
-       for (i=0; i < len; i++) {
-           assert(pthread_create
-                  (&tid, &attrs, &VInitVolumePackageThread,
-                   &params) == 0);
-       }
+       if (threads > 1) {
+           /* spawn off a bunch of initialization threads */
+           assert(pthread_attr_init(&attrs) == 0);
+           assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
  
-       while(params.n_threads_complete < len) {
-           pthread_cond_wait(&params.thread_done_cv,&vol_glock_mutex);
+           Log("VInitVolumePackage: beginning parallel fileserver startup\n");
+#ifdef AFS_DEMAND_ATTACH_FS
+           Log("VInitVolumePackage: using %d threads to pre-attach volumes on %d partitions\n",
+               threads, parts);
+#else /* AFS_DEMAND_ATTACH_FS */
+           Log("VInitVolumePackage: using %d threads to attach volumes on %d partitions\n",
+               threads, parts);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+           VOL_LOCK;
+           for (i=0; i < threads; i++) {
+               assert(pthread_create
+                      (&tid, &attrs, &VInitVolumePackageThread,
+                       &params) == 0);
+           }
+
+           while(params.n_threads_complete < threads) {
+               pthread_cond_wait(&params.thread_done_cv,&vol_glock_mutex);
+           }
+           VOL_UNLOCK;
+
+           assert(pthread_attr_destroy(&attrs) == 0);
+       } else {
+           /* if we're only going to run one init thread, don't bother creating
+            * another LWP */
+           Log("VInitVolumePackage: beginning single-threaded fileserver startup\n");
+#ifdef AFS_DEMAND_ATTACH_FS
+           Log("VInitVolumePackage: using 1 thread to pre-attach volumes on %d partition(s)\n",
+               parts);
+#else /* AFS_DEMAND_ATTACH_FS */
+           Log("VInitVolumePackage: using 1 thread to attach volumes on %d partition(s)\n",
+               parts);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+           VInitVolumePackageThread(&params);
         }
-       VOL_UNLOCK;
  
         assert(pthread_cond_destroy(&params.thread_done_cv) == 0);
  
@@ -346,44 +584,28 @@ VInitVolumePackage(ProgramType pt, int nLargeVnodes, int nSmallVnodes,
         /* Attach all the volumes in this partition */
         for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
             int nAttached = 0, nUnattached = 0;
-           Log("Partition %s: attaching volumes\n", diskP->name);
-           dirp = opendir(VPartitionPath(diskP));
-           assert(dirp);
-           while ((dp = readdir(dirp))) {
-               char *p;
-               p = strrchr(dp->d_name, '.');
-               if (p != NULL && strcmp(p, VHDREXT) == 0) {
-                   Error error;
-                   Volume *vp;
-                   vp = VAttachVolumeByName(&error, diskP->name, dp->d_name,
-                                            V_VOLUPD);
-                   (*(vp ? &nAttached : &nUnattached))++;
-                   if (error == VOFFLINE)
-                       Log("Volume %d stays offline (/vice/offline/%s exists)\n", VolumeNumber(dp->d_name), dp->d_name);
-                   else if (LogLevel >= 5) {
-                       Log("Partition %s: attached volume %d (%s)\n",
-                           diskP->name, VolumeNumber(dp->d_name),
-                           dp->d_name);
-                   }
-                   if (vp) {
-                       VPutVolume(vp);
-                   }
-               }
-           }
-           Log("Partition %s: attached %d volumes; %d volumes not attached\n", diskP->name, nAttached, nUnattached);
-           closedir(dirp);
+           assert(VAttachVolumesByPartition(diskP, &nAttached, &nUnattached) == 0);
         }
  #endif /* AFS_PTHREAD_ENV */
      }
  
      VInit = 2;                 /* Initialized, and all volumes have been attached */
+#ifdef FSSYNC_BUILD_CLIENT
      if (programType == volumeUtility && connect) {
         if (!VConnectFS()) {
             Log("Unable to connect to file server; aborted\n");
-           Lock_Destroy(&FSYNC_handler_lock);
             exit(1);
         }
      }
+#ifdef AFS_DEMAND_ATTACH_FS
+    else if (programType == salvageServer) {
+       if (!VConnectFS()) {
+           Log("Unable to connect to file server; aborted\n");
+           exit(1);
+       }
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+#endif /* FSSYNC_BUILD_CLIENT */
      return 0;
  }
  
@@ -412,32 +634,8 @@ VInitVolumePackageThread(void * args) {
         diskP = dpq->diskP;
         free(dpq);
  
-       Log("Partition %s: attaching volumes\n", diskP->name);
-       dirp = opendir(VPartitionPath(diskP));
-       assert(dirp);
-       while ((dp = readdir(dirp))) {
-           char *p;
-           p = strrchr(dp->d_name, '.');
-           if (p != NULL && strcmp(p, VHDREXT) == 0) {
-               Error error;
-               Volume *vp;
-               vp = VAttachVolumeByName(&error, diskP->name, dp->d_name,
-                                        V_VOLUPD);
-               (*(vp ? &nAttached : &nUnattached))++;
-               if (error == VOFFLINE)
-                   Log("Volume %d stays offline (/vice/offline/%s exists)\n", VolumeNumber(dp->d_name), dp->d_name);
-               else if (LogLevel >= 5) {
-                   Log("Partition %s: attached volume %d (%s)\n",
-                       diskP->name, VolumeNumber(dp->d_name),
-                       dp->d_name);
-               }
-               if (vp) {
-                   VPutVolume(vp);
-               }
-           }
-       }
-       Log("Partition %s: attached %d volumes; %d volumes not attached\n", diskP->name, nAttached, nUnattached);
-       closedir(dirp);
+       assert(VAttachVolumesByPartition(diskP, &nAttached, &nUnattached) == 0);
+
         VOL_LOCK;
      }
  
@@ -448,46 +646,114 @@ VInitVolumePackageThread(void * args) {
  }
  #endif /* AFS_PTHREAD_ENV */
  
-/* This must be called by any volume utility which needs to run while the
-   file server is also running.  This is separated from VInitVolumePackage so
-   that a utility can fork--and each of the children can independently
-   initialize communication with the file server */
-int
-VConnectFS(void)
+/*
+ * attach all volumes on a given disk partition
+ *
+ * scans the directory returned by VPartitionPath(diskP) and, for every
+ * entry whose final '.'-suffix equals VHDREXT, attaches the volume by
+ * name in V_VOLUPD mode (pre-attaches it when AFS_DEMAND_ATTACH_FS is
+ * defined).
+ *
+ * *nAttached / *nUnattached are incremented, never reset, so the caller
+ * must initialize them.
+ *
+ * returns 1 if the partition directory cannot be opened, 0 otherwise.
+ */
+static int
+VAttachVolumesByPartition(struct DiskPartition *diskP, int * nAttached, int * nUnattached)
  {
-    int retVal;
-    VOL_LOCK;
-    retVal = VConnectFS_r();
-    VOL_UNLOCK;
-    return retVal;
-}
+  DIR * dirp;
+  struct dirent * dp;
+  int ret = 0;                 /* never modified below; the success path always returns 0 */
+
+  Log("Partition %s: attaching volumes\n", diskP->name);
+  dirp = opendir(VPartitionPath(diskP));
+  if (!dirp) {
+    Log("opendir on Partition %s failed!\n", diskP->name);
+    return 1;
+  }
+
+  while ((dp = readdir(dirp))) {
+    char *p;
+    /* only entries whose last extension matches VHDREXT are considered */
+    p = strrchr(dp->d_name, '.');
+    if (p != NULL && strcmp(p, VHDREXT) == 0) {
+      Error error;
+      Volume *vp;
+#ifdef AFS_DEMAND_ATTACH_FS
+      vp = VPreAttachVolumeByName(&error, diskP->name, dp->d_name,
+                                  V_VOLUPD);
+#else /* AFS_DEMAND_ATTACH_FS */
+      vp = VAttachVolumeByName(&error, diskP->name, dp->d_name,
+                              V_VOLUPD);
+#endif /* AFS_DEMAND_ATTACH_FS */
+      /* a NULL vp counts as "not attached" */
+      (*(vp ? nAttached : nUnattached))++;
+      /* NOTE(review): the LogLevel >= 5 message below is emitted whenever
+       * error != VOFFLINE, including when vp is NULL -- confirm that
+       * reporting "attached" for a failed attach is intended */
+      if (error == VOFFLINE)
+       Log("Volume %d stays offline (/vice/offline/%s exists)\n", VolumeNumber(dp->d_name), dp->d_name);
+      else if (LogLevel >= 5) {
+       Log("Partition %s: attached volume %d (%s)\n",
+           diskP->name, VolumeNumber(dp->d_name),
+           dp->d_name);
+      }
+#if !defined(AFS_DEMAND_ATTACH_FS)
+      /* non-demand-attach builds hand the volume back with VPutVolume */
+      if (vp) {
+       VPutVolume(vp);
+      }
+#endif /* !AFS_DEMAND_ATTACH_FS */
+    }
+  }
  
-int
-VConnectFS_r(void)
-{
-    int rc;
-    assert(VInit == 2 && programType == volumeUtility);
-    rc = FSYNC_clientInit();
-    if (rc)
-       VInit = 3;
-    return rc;
+  Log("Partition %s: attached %d volumes; %d volumes not attached\n", diskP->name, *nAttached, *nUnattached);
+  closedir(dirp);
+  return ret;
  }
  
-void
-VDisconnectFS_r(void)
-{
-    assert(programType == volumeUtility);
-    FSYNC_clientFinis();
-    VInit = 2;
-}
  
-void
-VDisconnectFS(void)
-{
-    VOL_LOCK;
-    VDisconnectFS_r();
-    VOL_UNLOCK;
-}
+/***************************************************/
+/* Shutdown routines                               */
+/***************************************************/
+
+/*
+ * demand attach fs
+ * highly multithreaded volume package shutdown
+ *
+ * with the demand attach fileserver extensions,
+ * VShutdown has been modified to be multithreaded.
+ * In order to achieve optimal use of many threads,
+ * the shutdown code involves one control thread and
+ * n shutdown worker threads.  The control thread
+ * periodically examines the number of volumes available
+ * for shutdown on each partition, and produces a worker
+ * thread allocation schedule.  The idea is to eliminate
+ * redundant scheduling computation on the workers by
+ * having a single master scheduler.
+ *
+ * The scheduler's objectives are:
+ * (1) fairness
+ *   each partition with volumes remaining gets allocated
+ *   at least 1 thread (assuming sufficient threads)
+ * (2) performance
+ *   threads are allocated proportional to the number of
+ *   volumes remaining to be offlined.  This ensures that
+ *   the OS I/O scheduler has many requests to elevator
+ *   seek on partitions that will (presumably) take the
+ *   longest amount of time (from now) to finish shutdown
+ * (3) keep threads busy
+ *   when there are extra threads, they are assigned to
+ *   partitions using a simple round-robin algorithm
+ *
+ * In the future, we may wish to add the ability to adapt
+ * to the relative performance patterns of each disk
+ * partition.
+ *
+ *
+ * demand attach fs
+ * multi-step shutdown process
+ *
+ * demand attach shutdown is a four-step process. Each
+ * shutdown "pass" shuts down increasingly more difficult
+ * volumes.  The main purpose is to achieve better cache
+ * utilization during shutdown.
+ *
+ * pass 0
+ *   shutdown volumes in the unattached, pre-attached
+ *   and error states
+ * pass 1
+ *   shutdown attached volumes with cached volume headers
+ * pass 2
+ *   shutdown all volumes in non-exclusive states
+ * pass 3
+ *   shutdown all remaining volumes
+ */
  
  void
  VShutdown_r(void)
@@ -495,36 +761,139 @@ VShutdown_r(void)
      int i;
      register Volume *vp, *np;
      register afs_int32 code;
+#ifdef AFS_DEMAND_ATTACH_FS
+    struct DiskPartition * diskP;
+    struct diskpartition_queue_t * dpq;
+    vshutdown_thread_t params;
+    pthread_t tid;
+    pthread_attr_t attrs;
+
+    /*
+     * demand attach path: when vol_attach_threads > 1, spawn that many
+     * detached VShutdownThread workers and drive them through passes 0-3;
+     * otherwise shut each partition down in-line via VShutdownByPartition_r.
+     */
+    memset(&params, 0, sizeof(vshutdown_thread_t));
+
+    /* count the partitions; the for loop body is intentionally empty */
+    for (params.n_parts=0, diskP = DiskPartitionList;
+        diskP; diskP = diskP->next, params.n_parts++);
+
+    Log("VShutdown:  shutting down on-line volumes on %d partition%s...\n", 
+       params.n_parts, params.n_parts > 1 ? "s" : "");
+
+    if (vol_attach_threads > 1) {
+       /* prepare for parallel shutdown */
+       params.n_threads = vol_attach_threads;
+       assert(pthread_mutex_init(&params.lock, NULL) == 0);
+       assert(pthread_cond_init(&params.cv, NULL) == 0);
+       assert(pthread_cond_init(&params.master_cv, NULL) == 0);
+       assert(pthread_attr_init(&attrs) == 0);
+       assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
+       queue_Init(&params);
+
+       /* setup the basic partition information structures for
+        * parallel shutdown */
+       for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+           /* XXX debug */
+           struct rx_queue * qp, * nqp;
+           Volume * vp;
+           int count = 0;
+
+           /* take the partition's volume list exclusively; it is released
+            * by VVByPListEndExclusive_r after the workers are done */
+           VVByPListWait_r(diskP);
+           VVByPListBeginExclusive_r(diskP);
+
+           /* XXX debug */
+           for (queue_Scan(&diskP->vol_list, qp, nqp, rx_queue)) {
+               vp = (Volume *)((char *)qp - offsetof(Volume, vol_list));
+               if (vp->header)
+                   count++;
+           }
+           Log("VShutdown: partition %s has %d volumes with attached headers\n",
+               VPartitionPath(diskP), count);
+               
+
+           /* build up the pass 0 shutdown work queue */
+           dpq = (struct diskpartition_queue_t *) malloc(sizeof(struct diskpartition_queue_t));
+           assert(dpq != NULL);
+           dpq->diskP = diskP;
+           queue_Prepend(&params, dpq);
+
+           params.part_pass_head[diskP->device] = queue_First(&diskP->vol_list, rx_queue);
+       }
+
+       Log("VShutdown:  beginning parallel fileserver shutdown\n");
+       Log("VShutdown:  using %d threads to offline volumes on %d partition%s\n",
+           vol_attach_threads, params.n_parts, params.n_parts > 1 ? "s" : "" );
+
+       /* do pass 0 shutdown */
+       assert(pthread_mutex_lock(&params.lock) == 0);
+       for (i=0; i < params.n_threads; i++) {
+           assert(pthread_create
+                  (&tid, &attrs, &VShutdownThread,
+                   &params) == 0);
+       }
+       
+       /* wait for all the pass 0 shutdowns to complete */
+       while (params.n_threads_complete < params.n_threads) {
+           assert(pthread_cond_wait(&params.master_cv, &params.lock) == 0);
+       }
+       /* reset the completion count and release the workers into pass 1 */
+       params.n_threads_complete = 0;
+       params.pass = 1;
+       assert(pthread_cond_broadcast(&params.cv) == 0);
+       assert(pthread_mutex_unlock(&params.lock) == 0);
+
+       Log("VShutdown:  pass 0 completed using the 1 thread per partition algorithm\n");
+       Log("VShutdown:  starting passes 1 through 3 using finely-granular mp-fast algorithm\n");
+
+       /* run the parallel shutdown scheduler. it will drop the glock internally */
+       ShutdownController(&params);
+       
+       /* wait for all the workers to finish pass 3 and terminate
+        *
+        * NOTE(review): this waits on params.cv using vol_glock_mutex, whereas
+        * the broadcast on params.cv above is issued under params.lock.  POSIX
+        * leaves concurrent waits on one condition variable with different
+        * mutexes undefined -- confirm which mutex VShutdownThread pairs with
+        * params.cv. */
+       while (params.pass < 4) {
+           assert(pthread_cond_wait(&params.cv, &vol_glock_mutex) == 0);
+       }
+       
+       assert(pthread_attr_destroy(&attrs) == 0);
+       assert(pthread_cond_destroy(&params.cv) == 0);
+       assert(pthread_cond_destroy(&params.master_cv) == 0);
+       assert(pthread_mutex_destroy(&params.lock) == 0);
+
+       /* drop the VByPList exclusive reservations */
+       for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+           VVByPListEndExclusive_r(diskP);
+           Log("VShutdown:  %s stats : (pass[0]=%d, pass[1]=%d, pass[2]=%d, pass[3]=%d)\n",
+               VPartitionPath(diskP),
+               params.stats[0][diskP->device],
+               params.stats[1][diskP->device],
+               params.stats[2][diskP->device],
+               params.stats[3][diskP->device]);
+       }
+
+       Log("VShutdown:  shutdown finished using %d threads\n", params.n_threads);
+    } else {
+       /* if we're only going to run one shutdown thread, don't bother creating
+        * another LWP */
+       Log("VShutdown:  beginning single-threaded fileserver shutdown\n");
+
+       for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+           VShutdownByPartition_r(diskP);
+       }
+    }
  
+    Log("VShutdown:  complete.\n");
+#else /* AFS_DEMAND_ATTACH_FS */
      Log("VShutdown:  shutting down on-line volumes...\n");
-    for (i = 0; i < VOLUME_HASH_TABLE_SIZE; i++) {
+    for (i = 0; i < VolumeHashTable.Size; i++) {
         /* try to hold first volume in the hash table */
-       for (vp = VolumeHashTable[i]; vp; vp = vp->hashNext) {
+       for (queue_Scan(&VolumeHashTable.Table[i],vp,np,Volume)) {
             code = VHold_r(vp);
-           if (code == 0)
-               break;          /* got it */
-           /* otherwise we go around again, trying another volume */
-       }
-       while (vp) {
-           if (LogLevel >= 5)
-               Log("VShutdown:  Attempting to take volume %u offline.\n",
-                   vp->hashid);
-           /* first compute np before releasing vp, in case vp disappears
-            * after releasing.  Hold it, so it doesn't disapear.  If we
-            * can't hold it, try the next one in the chain.  Invariant
-            * at the top of this loop is that vp is held (has extra ref count).
-            */
-           for (np = vp->hashNext; np; np = np->hashNext) {
-               code = VHold_r(np);
-               if (code == 0)
-                   break;      /* got it */
+           if (code == 0) {
+               if (LogLevel >= 5)
+                   Log("VShutdown:  Attempting to take volume %u offline.\n",
+                       vp->hashid);
+               
+               /* next, take the volume offline (drops reference count) */
+               VOffline_r(vp, "File server was shut down");
             }
-           /* next, take the volume offline (drops reference count) */
-           VOffline_r(vp, "File server was shut down");
-           vp = np;            /* next guy to try */
         }
      }
      Log("VShutdown:  complete.\n");
+#endif /* AFS_DEMAND_ATTACH_FS */
  }
  
  void
@@ -535,56 +904,576 @@ VShutdown(void)
      VOL_UNLOCK;
  }
  
-
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * demand attach fs
+ * shutdown control thread
+ */
  static void
-ReadHeader(Error * ec, IHandle_t * h, char *to, int size, bit32 magic,
-          bit32 version)
+ShutdownController(vshutdown_thread_t * params)
  {
-    struct versionStamp *vsn;
-    FdHandle_t *fdP;
+    /* XXX debug */
+    struct DiskPartition * diskP;
+    Device id;
+    vshutdown_thread_t shadow;
  
-    *ec = 0;
-    if (h == NULL) {
-       *ec = VSALVAGE;
-       return;
+    ShutdownCreateSchedule(params);
+
+    while ((params->pass < 4) &&
+          (params->n_threads_complete < params->n_threads)) {
+       /* recompute schedule once per second */
+
+       memcpy(&shadow, params, sizeof(vshutdown_thread_t));
+
+       VOL_UNLOCK;
+       /* XXX debug */
+       Log("ShutdownController:  schedule version=%d, vol_remaining=%d, pass=%d\n",
+           shadow.schedule_version, shadow.vol_remaining, shadow.pass);
+       Log("ShutdownController:  n_threads_complete=%d, n_parts_done_pass=%d\n",
+           shadow.n_threads_complete, shadow.n_parts_done_pass);
+       for (diskP = DiskPartitionList; diskP; diskP=diskP->next) {
+           id = diskP->device;
+           Log("ShutdownController:  part[%d] : (len=%d, thread_target=%d, done_pass=%d, pass_head=%p)\n",
+               id, 
+               diskP->vol_list.len,
+               shadow.part_thread_target[id], 
+               shadow.part_done_pass[id], 
+               shadow.part_pass_head[id]);
+       }
+
+       sleep(1);
+       VOL_LOCK;
+
+       ShutdownCreateSchedule(params);
      }
+}
  
-    fdP = IH_OPEN(h);
-    if (fdP == NULL) {
-       *ec = VSALVAGE;
-       return;
+/* create the shutdown thread work schedule.
+ * this scheduler tries to implement fairness
+ * by allocating at least 1 thread to each 
+ * partition with volumes to be shutdown,
+ * and then it attempts to allocate remaining
+ * threads based upon the amount of work left
+ */
+static void
+ShutdownCreateSchedule(vshutdown_thread_t * params)
+{
+    struct DiskPartition * diskP;
+    int sum, thr_workload, thr_left;
+    int part_residue[VOLMAXPARTS+1];
+    Device id;
+
+    /* compute the total number of outstanding volumes */
+    sum = 0;
+    for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+       sum += diskP->vol_list.len;
      }
+    
+    params->schedule_version++;
+    params->vol_remaining = sum;
  
-    if (FDH_SEEK(fdP, 0, SEEK_SET) < 0) {
-       *ec = VSALVAGE;
-       FDH_REALLYCLOSE(fdP);
+    if (!sum)
         return;
+
+    /* compute average per-thread workload */
+    thr_workload = sum / params->n_threads;
+    if (sum % params->n_threads)
+       thr_workload++;
+
+    thr_left = params->n_threads;
+    memset(&part_residue, 0, sizeof(part_residue));
+
+    /* for fairness, give every partition with volumes remaining
+     * at least one thread */
+    for (diskP = DiskPartitionList; diskP && thr_left; diskP = diskP->next) {
+       id = diskP->device;
+       if (diskP->vol_list.len) {
+           params->part_thread_target[id] = 1;
+           thr_left--;
+       } else {
+           params->part_thread_target[id] = 0;
+       }
      }
-    vsn = (struct versionStamp *)to;
-    if (FDH_READ(fdP, to, size) != size || vsn->magic != magic) {
-       *ec = VSALVAGE;
-       FDH_REALLYCLOSE(fdP);
-       return;
+
+    if (thr_left && thr_workload) {
+       /* compute length-weighted workloads */
+       int delta;
+
+       for (diskP = DiskPartitionList; diskP && thr_left; diskP = diskP->next) {
+           id = diskP->device;
+           delta = (diskP->vol_list.len / thr_workload) -
+               params->part_thread_target[id];
+           if (delta < 0) {
+               continue;
+           }
+           if (delta < thr_left) {
+               params->part_thread_target[id] += delta;
+               thr_left -= delta;
+           } else {
+               params->part_thread_target[id] += thr_left;
+               thr_left = 0;
+               break;
+           }
+       }
      }
-    FDH_CLOSE(fdP);
  
-    /* Check is conditional, in case caller wants to inspect version himself */
-    if (version && vsn->version != version) {
-       *ec = VSALVAGE;
+    if (thr_left) {
+       /* try to assign any leftover threads to partitions that
+        * had volume lengths closer to needing thread_target+1 */
+       int max_residue, max_id;
+
+       /* compute the residues */
+       for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+           id = diskP->device;
+           part_residue[id] = diskP->vol_list.len - 
+               (params->part_thread_target[id] * thr_workload);
+       }
+
+       /* now try to allocate remaining threads to partitions with the
+        * highest residues */
+       while (thr_left) {
+           max_residue = 0;
+           for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+               id = diskP->device;
+               if (part_residue[id] > max_residue) {
+                   max_residue = part_residue[id];
+                   max_id = id;
+               }
+           }
+
+           if (!max_residue) {
+               break;
+           }
+
+           params->part_thread_target[max_id]++;
+           thr_left--;
+           part_residue[max_id] = 0;
+       }
+    }
+
+    if (thr_left) {
+       /* punt and give any remaining threads equally to each partition */
+       int alloc;
+       if (thr_left >= params->n_parts) {
+           alloc = thr_left / params->n_parts;
+           for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+               id = diskP->device;
+               params->part_thread_target[id] += alloc;
+               thr_left -= alloc;
+           }
+       }
+
+       /* finish off the last of the threads */
+       for (diskP = DiskPartitionList; thr_left && diskP; diskP = diskP->next) {
+           id = diskP->device;
+           params->part_thread_target[id]++;
+           thr_left--;
+       }
      }
  }
  
-/* VolumeHeaderToDisk
- * Allows for storing 64 bit inode numbers in on-disk volume header
- * file.
- */
-void
-VolumeHeaderToDisk(VolumeDiskHeader_t * dh, VolumeHeader_t * h)
+/* worker thread for parallel shutdown */
+static void *
+VShutdownThread(void * args)
  {
+    struct rx_queue *qp;
+    Volume * vp;
+    vshutdown_thread_t * params;
+    int part, code, found, pass, schedule_version_save, count;
+    struct DiskPartition *diskP;
+    struct diskpartition_queue_t * dpq;
+    Device id;
  
-    memset((char *)dh, 0, sizeof(VolumeDiskHeader_t));
-    dh->stamp = h->stamp;
-    dh->id = h->id;
+    params = (vshutdown_thread_t *) args;
+
+    /* acquire the shutdown pass 0 lock */
+    assert(pthread_mutex_lock(&params->lock) == 0);
+
+    /* if there's still pass 0 work to be done,
+     * get a work entry, and do a pass 0 shutdown */
+    if (queue_IsNotEmpty(params)) {
+       dpq = queue_First(params, diskpartition_queue_t);
+       queue_Remove(dpq);
+       assert(pthread_mutex_unlock(&params->lock) == 0);
+       diskP = dpq->diskP;
+       free(dpq);
+       id = diskP->device;
+
+       count = 0;
+       while (ShutdownVolumeWalk_r(diskP, 0, &params->part_pass_head[id]))
+           count++;
+       params->stats[0][diskP->device] = count;
+       assert(pthread_mutex_lock(&params->lock) == 0);
+    }
+
+    params->n_threads_complete++;
+    if (params->n_threads_complete == params->n_threads) {
+      /* notify control thread that all workers have completed pass 0 */
+      assert(pthread_cond_signal(&params->master_cv) == 0);
+    }
+    while (params->pass == 0) {
+      assert(pthread_cond_wait(&params->cv, &params->lock) == 0);
+    }
+
+    /* switch locks */
+    assert(pthread_mutex_unlock(&params->lock) == 0);
+    VOL_LOCK;
+
+    pass = params->pass;
+    assert(pass > 0);
+
+    /* now escalate through the more complicated shutdowns */
+    while (pass <= 3) {
+       schedule_version_save = params->schedule_version;
+       found = 0;
+       /* find a disk partition to work on */
+       for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+           id = diskP->device;
+           if (params->part_thread_target[id] && !params->part_done_pass[id]) {
+               params->part_thread_target[id]--;
+               found = 1;
+               break;
+           }
+       }
+       
+       if (!found) {
+           /* hmm. for some reason the controller thread couldn't find anything for 
+            * us to do. let's see if there's anything we can do */
+           for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+               id = diskP->device;
+               if (diskP->vol_list.len && !params->part_done_pass[id]) {
+                   found = 1;
+                   break;
+               } else if (!params->part_done_pass[id]) {
+                   params->part_done_pass[id] = 1;
+                   params->n_parts_done_pass++;
+                   if (pass == 3) {
+                       Log("VShutdown:  done shutting down volumes on partition %s.\n",
+                           VPartitionPath(diskP));
+                   }
+               }
+           }
+       }
+       
+       /* do work on this partition until either the controller
+        * creates a new schedule, or we run out of things to do
+        * on this partition */
+       if (found) {
+           count = 0;
+           while (!params->part_done_pass[id] &&
+                  (schedule_version_save == params->schedule_version)) {
+               /* ShutdownVolumeWalk_r will drop the glock internally */
+               if (!ShutdownVolumeWalk_r(diskP, pass, &params->part_pass_head[id])) {
+                   if (!params->part_done_pass[id]) {
+                       params->part_done_pass[id] = 1;
+                       params->n_parts_done_pass++;
+                       if (pass == 3) {
+                           Log("VShutdown:  done shutting down volumes on partition %s.\n",
+                               VPartitionPath(diskP));
+                       }
+                   }
+                   break;
+               }
+               count++;
+           }
+
+           params->stats[pass][id] += count;
+       } else {
+           /* ok, everyone is done this pass, proceed */
+
+           /* barrier lock */
+           params->n_threads_complete++;
+           while (params->pass == pass) {
+               if (params->n_threads_complete == params->n_threads) {
+                   /* we are the last thread to complete, so we will
+                    * reinitialize worker pool state for the next pass */
+                   params->n_threads_complete = 0;
+                   params->n_parts_done_pass = 0;
+                   params->pass++;
+                   for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+                       id = diskP->device;
+                       params->part_done_pass[id] = 0;
+                       params->part_pass_head[id] = queue_First(&diskP->vol_list, rx_queue);
+                   }
+
+                   /* compute a new thread schedule before releasing all the workers */
+                   ShutdownCreateSchedule(params);
+
+                   /* wake up all the workers */
+                   assert(pthread_cond_broadcast(&params->cv) == 0);
+
+                   VOL_UNLOCK;
+                   Log("VShutdown:  pass %d completed using %d threads on %d partitions\n",
+                       pass, params->n_threads, params->n_parts);
+                   VOL_LOCK;
+               } else {
+                   assert(pthread_cond_wait(&params->cv, &vol_glock_mutex) == 0);
+               }
+           }
+           pass = params->pass;
+       }
+       
+       /* for fairness */
+       VOL_UNLOCK;
+       pthread_yield();
+       VOL_LOCK;
+    }
+
+    VOL_UNLOCK;
+
+    return NULL;
+}
+
+/* shut down all volumes on a given disk partition
+ *
+ * dp is the partition whose volume list is to be emptied.  Shutdown
+ * passes 0 through 3 are run back to back on dp by the calling thread,
+ * bracketed by VVByPListBeginExclusive_r / VVByPListEndExclusive_r,
+ * and a per-pass summary is logged afterwards.
+ *
+ * note that this function will not allow mp-fast
+ * shutdown of a partition
+ *
+ * always returns 0 */
+int
+VShutdownByPartition_r(struct DiskPartition * dp)
+{
+    int pass;
+    int retVal = 0;    /* nothing below reports failure; previously this
+                        * was returned without ever being assigned */
+    int pass_stats[4];
+    int total;
+
+    /* wait for other exclusive ops to finish */
+    VVByPListWait_r(dp);
+
+    /* begin exclusive access */
+    VVByPListBeginExclusive_r(dp);
+
+    /* pick the low-hanging fruit first,
+     * then do the complicated ones last
+     * (has the advantage of keeping
+     *  in-use volumes up until the bitter end) */
+    for (pass = 0, total=0; pass < 4; pass++) {
+       pass_stats[pass] = ShutdownVByPForPass_r(dp, pass);
+       total += pass_stats[pass];
+    }
+
+    /* end exclusive access */
+    VVByPListEndExclusive_r(dp);
+
+    Log("VShutdownByPartition:  shut down %d volumes on %s (pass[0]=%d, pass[1]=%d, pass[2]=%d, pass[3]=%d)\n",
+       total, VPartitionPath(dp), pass_stats[0], pass_stats[1], pass_stats[2], pass_stats[3]);
+
+    return retVal;
+}
+
+/* run a single shutdown pass over partition dp
+ *
+ * the pass number selects which volumes are eligible:
+ * 0 to only "shutdown" {pre,un}attached and error state volumes
+ * 1 to also shutdown attached volumes w/ volume header loaded
+ * 2 to also shutdown attached volumes w/o volume header loaded
+ * 3 to also shutdown exclusive state volumes
+ *
+ * caller MUST hold exclusive access on the hash chain
+ * because we drop vol_glock_mutex internally
+ *
+ * this function is reentrant for passes 1--3
+ * (e.g. multiple threads can cooperate to
+ *  shutdown a partition mp-fast)
+ *
+ * pass 0 is not scalable because the volume state data is
+ * synchronized by vol_glock mutex, and the locking overhead
+ * is too high to drop the lock long enough to do linked list
+ * traversal
+ *
+ * returns the number of volumes shut down during this pass
+ */
+static int
+ShutdownVByPForPass_r(struct DiskPartition * dp, int pass)
+{
+    struct rx_queue * cursor;
+    int n_shutdown;
+
+    /* start at the head of the partition's volume list; each successful
+     * walk step shuts down one volume and advances the cursor */
+    cursor = queue_First(&dp->vol_list, rx_queue);
+    for (n_shutdown = 0;
+        ShutdownVolumeWalk_r(dp, pass, &cursor);
+        n_shutdown++) {
+       continue;
+    }
+
+    return n_shutdown;
+}
+
+/* conditionally shutdown one volume on partition dp
+ *
+ * scanning of dp->vol_list resumes at *idx.  each volume is tested
+ * against the rules for (pass); one that does not qualify is skipped
+ * (the break statements below leave the switch, not the scan loop).
+ * for the first volume that qualifies, *idx is advanced to the element
+ * after it, the volume is taken off the partition list and handed to
+ * VShutdownVolume_r.
+ *
+ * the switch cases fall through on purpose: a lower pass number applies
+ * its own test and then every test of the higher passes as well.
+ *
+ * returns 1 if a volume was shutdown in this pass,
+ * 0 otherwise (the scan ended without a match; *idx is left unchanged) */
+static int
+ShutdownVolumeWalk_r(struct DiskPartition * dp, int pass,
+                    struct rx_queue ** idx)
+{
+    struct rx_queue *qp, *nqp;
+    Volume * vp;
+
+    qp = *idx;
+
+    for (queue_ScanFrom(&dp->vol_list, qp, qp, nqp, rx_queue)) {
+       vp = (Volume *) (((char *)qp) - offsetof(Volume, vol_list)); /* list link -> enclosing Volume */
+       
+       switch (pass) {
+       case 0:
+           if ((V_attachState(vp) != VOL_STATE_UNATTACHED) &&
+               (V_attachState(vp) != VOL_STATE_ERROR) &&
+               (V_attachState(vp) != VOL_STATE_PREATTACHED)) {
+               break; /* pass 0 only handles unattached, error and pre-attached volumes */
+           } /* fall through */
+       case 1:
+           if ((V_attachState(vp) == VOL_STATE_ATTACHED) &&
+               (vp->header == NULL)) {
+               break; /* attached without a loaded header: left for pass 2 */
+           } /* fall through */
+       case 2:
+           if (IsExclusiveState(V_attachState(vp))) {
+               break; /* exclusive-state volumes are left for pass 3 */
+           } /* fall through */
+       case 3:
+           *idx = nqp; /* remember where to resume before vp leaves the list */
+           DeleteVolumeFromVByPList_r(vp);
+           VShutdownVolume_r(vp);
+           vp = NULL;
+           return 1;
+       }
+    }
+
+    return 0;
+}
+
+/*
+ * shutdown a specific volume
+ *
+ * a reservation on vp is held from entry to exit
+ * (VCreateReservation_r / VCancelReservation_r).  after waiting for
+ * any exclusive state to clear, the action depends on the attach state:
+ *   SALVAGING                 cancel the salvage, then move to UNATTACHED
+ *   PREATTACHED, ERROR        move to UNATTACHED
+ *   UNATTACHED                nothing to do
+ *   GOING_OFFLINE,
+ *   SHUTTING_DOWN, ATTACHED   VHold_r, and if that succeeds, VOffline_r
+ * any other state is left untouched (the switch has no default case).
+ *
+ * always returns 0
+ */
+/* caller MUST NOT hold a heavyweight ref on vp */
+int
+VShutdownVolume_r(Volume * vp)
+{
+    int code;
+
+    VCreateReservation_r(vp);
+
+    if (LogLevel >= 5) {
+       Log("VShutdownVolume_r:  vid=%u, device=%d, state=%hu\n",
+           vp->hashid, vp->partition->device, V_attachState(vp));
+    }
+
+    /* wait for other blocking ops to finish */
+    VWaitExclusiveState_r(vp);
+
+    assert(IsValidState(V_attachState(vp)));
+    
+    switch(V_attachState(vp)) {
+    case VOL_STATE_SALVAGING:
+       /* make sure salvager knows we don't want
+        * the volume back */
+       VCancelSalvage_r(vp, SALVSYNC_SHUTDOWN); /* fall through */
+    case VOL_STATE_PREATTACHED:
+    case VOL_STATE_ERROR:
+       VChangeState_r(vp, VOL_STATE_UNATTACHED); /* fall through */
+    case VOL_STATE_UNATTACHED:
+       break;
+    case VOL_STATE_GOING_OFFLINE:
+    case VOL_STATE_SHUTTING_DOWN:
+    case VOL_STATE_ATTACHED:
+       code = VHold_r(vp);
+       if (!code) { /* a failed hold is ignored: the volume is simply not taken offline here */
+           if (LogLevel >= 5)
+               Log("VShutdown:  Attempting to take volume %u offline.\n",
+                   vp->hashid);
+
+           /* take the volume offline (drops reference count) */
+           VOffline_r(vp, "File server was shut down");
+       }
+       break;
+    }
+    
+    VCancelReservation_r(vp);
+    vp = NULL;
+    return 0;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+/***************************************************/
+/* Header I/O routines                             */
+/***************************************************/
+
+/* load an on-disk structure and validate its version stamp
+ *
+ *  ec       receives 0 on success, VSALVAGE on any failure
+ *  h        inode handle to read from; NULL counts as a failure
+ *  to       destination buffer; its leading bytes are interpreted
+ *           as a struct versionStamp
+ *  size     number of bytes to read into (to), starting at offset 0
+ *  magic    magic number the stamp must carry
+ *  version  stamp version to insist on, or 0 to skip that check
+ */
+static void
+ReadHeader(Error * ec, IHandle_t * h, char *to, int size, bit32 magic,
+          bit32 version)
+{
+    struct versionStamp *stamp = (struct versionStamp *)to;
+    FdHandle_t *fd;
+
+    *ec = 0;
+
+    /* no handle, or a handle that cannot be opened */
+    if (h == NULL || (fd = IH_OPEN(h)) == NULL) {
+       *ec = VSALVAGE;
+       return;
+    }
+
+    /* a failed seek, a short read and a wrong magic number are all
+     * handled the same way; the checks run strictly in that order */
+    if (FDH_SEEK(fd, 0, SEEK_SET) < 0
+       || FDH_READ(fd, to, size) != size
+       || stamp->magic != magic) {
+       *ec = VSALVAGE;
+       FDH_REALLYCLOSE(fd);
+       return;
+    }
+    FDH_CLOSE(fd);
+
+    /* the version test is optional so that a caller may pass 0 and
+     * inspect the version on its own */
+    if (version != 0 && stamp->version != version) {
+       *ec = VSALVAGE;
+    }
+}
+
+/* write V_disk(vp) to offset 0 of the volume's disk data handle
+ *
+ *  ec  receives 0 on success, VSALVAGE if the handle cannot be
+ *      opened, the seek fails, or the write is short
+ *  vp  volume whose V_disk() image is written out
+ */
+void
+WriteVolumeHeader_r(Error * ec, Volume * vp)
+{
+    IHandle_t *handle = V_diskDataHandle(vp);
+    FdHandle_t *fd;
+
+    *ec = 0;
+
+    fd = IH_OPEN(handle);
+    if (fd == NULL) {
+       *ec = VSALVAGE;
+       return;
+    }
+
+    /* seek first, then write; either failure takes the same exit */
+    if (FDH_SEEK(fd, 0, SEEK_SET) < 0
+       || FDH_WRITE(fd, (char *)&V_disk(vp), sizeof(V_disk(vp)))
+           != sizeof(V_disk(vp))) {
+       *ec = VSALVAGE;
+       FDH_REALLYCLOSE(fd);
+       return;
+    }
+    FDH_CLOSE(fd);
+}
+
+/* VolumeHeaderToDisk
+ * Allows for storing 64 bit inode numbers in on-disk volume header
+ * file.
+ */
+/* convert in-memory representation of a volume header to the
+ * on-disk representation of a volume header */
+void
+VolumeHeaderToDisk(VolumeDiskHeader_t * dh, VolumeHeader_t * h)
+{
+
+    memset((char *)dh, 0, sizeof(VolumeDiskHeader_t));
+    dh->stamp = h->stamp;
+    dh->id = h->id;
      dh->parent = h->parent;
  
  #ifdef AFS_64BIT_IOPS_ENV
@@ -607,8 +1496,10 @@ VolumeHeaderToDisk(VolumeDiskHeader_t * dh, VolumeHeader_t * h)
  }
  
  /* DiskToVolumeHeader
- * Reads volume header file from disk, convering 64 bit inodes
- * if required. Makes the assumption that AFS has *always* 
+ * Converts an on-disk representation of a volume header to
+ * the in-memory representation of a volume header.
+ *
+ * Makes the assumption that AFS has *always* 
   * zero'd the volume header file so that high parts of inode
   * numbers are 0 in older (SGI EFS) volume header files.
   */
@@ -642,127 +1533,372 @@ DiskToVolumeHeader(VolumeHeader_t * h, VolumeDiskHeader_t * dh)
  }
  
  
-void
-WriteVolumeHeader_r(ec, vp)
-     Error *ec;
-     Volume *vp;
-{
-    IHandle_t *h = V_diskDataHandle(vp);
-    FdHandle_t *fdP;
-
-    *ec = 0;
-
-    fdP = IH_OPEN(h);
-    if (fdP == NULL) {
-       *ec = VSALVAGE;
-       return;
-    }
-    if (FDH_SEEK(fdP, 0, SEEK_SET) < 0) {
-       *ec = VSALVAGE;
-       FDH_REALLYCLOSE(fdP);
-       return;
-    }
-    if (FDH_WRITE(fdP, (char *)&V_disk(vp), sizeof(V_disk(vp)))
-       != sizeof(V_disk(vp))) {
-       *ec = VSALVAGE;
-       FDH_REALLYCLOSE(fdP);
-       return;
-    }
-    FDH_CLOSE(fdP);
-}
+/***************************************************/
+/* Volume Attachment routines                      */
+/***************************************************/
  
-/* Attach an existing volume, given its pathname, and return a
-   pointer to the volume header information.  The volume also
-   normally goes online at this time.  An offline volume
-   must be reattached to make it go online */
+#ifdef AFS_DEMAND_ATTACH_FS
+/* pre-attach a volume given its path 
+ *
+ * a pre-attached volume will only have its partition
+ * and hashid fields initialized
+ *
+ * at first call to VGetVolume, the volume will be
+ * fully attached
+ */
  Volume *
-VAttachVolumeByName(Error * ec, char *partition, char *name, int mode)
+VPreAttachVolumeByName(Error * ec, char *partition, char *name, int mode)
  {
-    Volume *retVal;
-    VATTACH_LOCK;
+    Volume * vp;
      VOL_LOCK;
-    retVal = VAttachVolumeByName_r(ec, partition, name, mode);
+    vp = VPreAttachVolumeByName_r(ec, partition, name, mode);
      VOL_UNLOCK;
-    VATTACH_UNLOCK;
-    return retVal;
+    return vp;
  }
  
  Volume *
-VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
+VPreAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
  {
-    register Volume *vp;
+    register Volume *vp = NULL;
      int fd, n;
      struct afs_stat status;
-    struct VolumeDiskHeader diskHeader;
-    struct VolumeHeader iheader;
      struct DiskPartition *partp;
      char path[64];
      int isbusy = 0;
+    VolId volumeId;
      *ec = 0;
-    if (programType == volumeUtility) {
-       assert(VInit == 3);
-       VLockPartition_r(partition);
-    }
-    if (programType == fileServer) {
-       vp = VGetVolume_r(ec, VolumeNumber(name));
-       if (vp) {
-           if (V_inUse(vp))
-               return vp;
-           if (vp->specialStatus == VBUSY)
-               isbusy = 1;
-           VDetachVolume_r(ec, vp);
-           if (*ec) {
-               Log("VAttachVolume: Error detaching volume (%s)\n", name);
-           }
-       }
-    }
+
+    assert(programType == fileServer);
  
      if (!(partp = VGetPartition_r(partition, 0))) {
         *ec = VNOVOL;
-       Log("VAttachVolume: Error getting partition (%s)\n", partition);
-       goto done;
+       Log("VPreAttachVolume:  Error getting partition (%s)\n", partition);
+       return NULL;
      }
  
-    *ec = 0;
-    strcpy(path, VPartitionPath(partp));
-    strcat(path, "/");
-    strcat(path, name);
+    volumeId = VolumeNumber(name);
+
+    vp = VLookupVolume_r(ec, volumeId, NULL);
+    if (*ec) {
+       return NULL;
+    }
+
+    return VPreAttachVolumeById_r(ec, partp, vp, volumeId);
+}
+
+/* pre-attach a volume given its partition and volume id
+ *
+ * if vp == NULL, then a new vp is created
+ * if vp != NULL, then we assume it is already on the hash chain
+ *
+ *  ec     set to 0 on entry; afterwards holds whatever error the
+ *         second VLookupVolume_r call reports, if any
+ *  partp  partition the volume is linked to
+ *  vp     existing Volume for vid, or NULL to allocate one
+ *  vid    volume id, stored as vp->hashid
+ *
+ * vol_glock_mutex is dropped while a new Volume is allocated and is
+ * re-acquired before the hash table is consulted again; if another
+ * thread installed a Volume for vid in the meantime, ours is discarded
+ * and theirs is returned.
+ *
+ * returns the pre-attached Volume, or NULL if *ec is set
+ */
+Volume *
+VPreAttachVolumeById_r(Error * ec, struct DiskPartition * partp,
+                      Volume * vp, int vid)
+{
+    Volume *nvp = NULL;
+
+    *ec = 0;
+
+    /* check to see if pre-attach already happened */
+    if (vp &&
+       (V_attachState(vp) != VOL_STATE_UNATTACHED) &&
+       !IsErrorState(V_attachState(vp))) {
+       goto done;
+    } else if (vp) {
+       /* we're re-attaching a volume; clear out some old state */
+       memset(&vp->salvage, 0, sizeof(struct VolumeOnlineSalvage));
+    } else {
+       /* if we need to allocate a new Volume struct,
+        * go ahead and drop the vol glock, otherwise
+        * do the basic setup synchronised, as it's
+        * probably not worth dropping the lock */
+       VOL_UNLOCK;
+
+       /* allocate the volume structure */
+       vp = nvp = (Volume *) malloc(sizeof(Volume));
+       assert(vp != NULL);
+       memset(vp, 0, sizeof(Volume));
+       assert(pthread_cond_init(&V_attachCV(vp), NULL) == 0);
+    }
+
+    /* link the volume with its associated vice partition */
+    vp->device = partp->device;
+    vp->partition = partp;
+    vp->hashid = vid;
+
+    /* if we dropped the lock, reacquire the lock,
+     * check for pre-attach races, and then add
+     * the volume to the hash table */
+    if (nvp) {
+       VOL_LOCK;
+       nvp = VLookupVolume_r(ec, vid, NULL);
+       if (*ec) {
+           /* our Volume was never put on any list, so nothing can be
+            * waiting on its condition variable; release it before
+            * the memory goes away */
+           assert(pthread_cond_destroy(&V_attachCV(vp)) == 0);
+           free(vp);
+           vp = NULL;
+           goto done;
+       } else if (nvp) { /* race detected */
+           /* same cleanup as above: discard our private copy and
+            * hand back the one that won the race */
+           assert(pthread_cond_destroy(&V_attachCV(vp)) == 0);
+           free(vp);
+           vp = nvp;
+           goto done;
+       } else {
+         /* hack to make up for VChangeState_r() decrementing
+          * the old state counter */
+         VStats.state_levels[0]++;
+       }
+    }
+
+    /* put pre-attached volume onto the hash table
+     * and bring it up to the pre-attached state */
+    AddVolumeToHashTable(vp, vp->hashid);
+    AddVolumeToVByPList_r(vp);
+    VLRU_Init_Node_r(vp);
+    VChangeState_r(vp, VOL_STATE_PREATTACHED);
+
+    if (LogLevel >= 5)
+       Log("VPreAttachVolumeById_r:  volume %u pre-attached\n", vp->hashid);
+
+  done:
+    if (*ec)
+       return NULL;
+    else
+       return vp;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/* Attach an existing volume, identified by partition and header file
+   name, and return a pointer to the volume header information.  The
+   volume also normally goes online at this time.  An offline volume
+   must be reattached to make it go online.
+
+   This is the locking entry point: it takes VOL_LOCK, delegates to
+   VAttachVolumeByName_r, and releases the lock before returning that
+   function's result unchanged. */
+Volume *
+VAttachVolumeByName(Error * ec, char *partition, char *name, int mode)
+{
+    Volume *attached;
+
+    VOL_LOCK;
+    attached = VAttachVolumeByName_r(ec, partition, name, mode);
+    VOL_UNLOCK;
+
+    return attached;
+}
+
+Volume *
+VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
+{
+    register Volume *vp = NULL, *svp = NULL;
+    int fd, n;
+    struct afs_stat status;
+    struct VolumeDiskHeader diskHeader;
+    struct VolumeHeader iheader;
+    struct DiskPartition *partp;
+    char path[64];
+    int isbusy = 0;
+    VolId volumeId;
+#ifdef AFS_DEMAND_ATTACH_FS
+    VolumeStats stats_save;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    *ec = 0;
+   
+    volumeId = VolumeNumber(name);
+
+    if (!(partp = VGetPartition_r(partition, 0))) {
+       *ec = VNOVOL;
+       Log("VAttachVolume: Error getting partition (%s)\n", partition);
+       goto done;
+    }
+
+    if (programType == volumeUtility) {
+       assert(VInit == 3);
+       VLockPartition_r(partition);
+    } else if (programType == fileServer) {
+#ifdef AFS_DEMAND_ATTACH_FS
+       /* lookup the volume in the hash table */
+       vp = VLookupVolume_r(ec, volumeId, NULL);
+       if (*ec) {
+           return NULL;
+       }
+
+       if (vp) {
+           /* save any counters that are supposed to
+            * be monotonically increasing over the
+            * lifetime of the fileserver */
+           memcpy(&stats_save, &vp->stats, sizeof(VolumeStats));
+       } else {
+           memset(&stats_save, 0, sizeof(VolumeStats));
+       }
+
+       /* if there's something in the hash table, and it's not
+        * in the pre-attach state, then we may need to detach
+        * it before proceeding */
+       if (vp && (V_attachState(vp) != VOL_STATE_PREATTACHED)) {
+           VCreateReservation_r(vp);
+           VWaitExclusiveState_r(vp);
+
+           /* at this point state must be one of:
+            *   UNATTACHED,
+            *   ATTACHED,
+            *   SHUTTING_DOWN,
+            *   GOING_OFFLINE,
+            *   SALVAGING,
+            *   ERROR
+            */
+
+           if (vp->specialStatus == VBUSY)
+               isbusy = 1;
+           
+           /* if it's already attached, see if we can return it */
+           if (V_attachState(vp) == VOL_STATE_ATTACHED) {
+               VGetVolumeByVp_r(ec, vp);
+               if (V_inUse(vp)) {
+                   VCancelReservation_r(vp);
+                   return vp;
+               }
+
+               /* otherwise, we need to detach, and attempt to re-attach */
+               VDetachVolume_r(ec, vp);
+               if (*ec) {
+                   Log("VAttachVolume: Error detaching old volume instance (%s)\n", name);
+               }
+           } else {
+               /* if it isn't fully attached, delete from the hash tables,
+                  and let the refcounter handle the rest */
+               DeleteVolumeFromHashTable(vp);
+               DeleteVolumeFromVByPList_r(vp);
+           }
+
+           VCancelReservation_r(vp);
+           vp = NULL;
+       }
+
+       /* pre-attach volume if it hasn't been done yet */
+       if (!vp || 
+           (V_attachState(vp) == VOL_STATE_UNATTACHED) ||
+           (V_attachState(vp) == VOL_STATE_ERROR)) {
+           svp = vp;
+           vp = VPreAttachVolumeById_r(ec, partp, vp, volumeId);
+           if (*ec) {
+               return NULL;
+           }
+       }
+
+       assert(vp != NULL);
+
+       /* handle pre-attach races 
+        *
+        * multiple threads can race to pre-attach a volume,
+        * but we can't let them race beyond that
+        * 
+        * our solution is to let the first thread to bring
+        * the volume into an exclusive state win; the other
+        * threads just wait until it finishes bringing the
+        * volume online, and then they do a vgetvolumebyvp
+        */
+       if (svp && (svp != vp)) {
+           /* wait for other exclusive ops to finish */
+           VCreateReservation_r(vp);
+           VWaitExclusiveState_r(vp);
+
+           /* get a heavyweight ref, kill the lightweight ref, and return */
+           VGetVolumeByVp_r(ec, vp);
+           VCancelReservation_r(vp);
+           return vp;
+       }
+
+       /* at this point, we are chosen as the thread to do
+        * demand attachment for this volume. all other threads
+        * doing a getvolume on vp->hashid will block until we finish */
+
+       /* make sure any old header cache entries are invalidated
+        * before proceeding */
+       FreeVolumeHeader(vp);
+
+       VChangeState_r(vp, VOL_STATE_ATTACHING);
+
+       /* restore any saved counters */
+       memcpy(&vp->stats, &stats_save, sizeof(VolumeStats));
+#else /* AFS_DEMAND_ATTACH_FS */
+       vp = VGetVolume_r(ec, volumeId);
+       if (vp) {
+           if (V_inUse(vp))
+               return vp;
+           if (vp->specialStatus == VBUSY)
+               isbusy = 1;
+           VDetachVolume_r(ec, vp);
+           if (*ec) {
+               Log("VAttachVolume: Error detaching volume (%s)\n", name);
+           }
+           vp = NULL;
+       }
+#endif /* AFS_DEMAND_ATTACH_FS */
+    }
+
+    *ec = 0;
+    strcpy(path, VPartitionPath(partp));
+
      VOL_UNLOCK;
+
+    strcat(path, "/");
+    strcat(path, name);
      if ((fd = afs_open(path, O_RDONLY)) == -1 || afs_fstat(fd, &status) == -1) {
         Log("VAttachVolume: Failed to open %s (errno %d)\n", path, errno);
         if (fd > -1)
             close(fd);
-       VOL_LOCK;
         *ec = VNOVOL;
+       VOL_LOCK;
         goto done;
      }
      n = read(fd, &diskHeader, sizeof(diskHeader));
      close(fd);
-    VOL_LOCK;
      if (n != sizeof(diskHeader)
         || diskHeader.stamp.magic != VOLUMEHEADERMAGIC) {
         Log("VAttachVolume: Error reading volume header %s\n", path);
         *ec = VSALVAGE;
+       VOL_LOCK;
         goto done;
      }
      if (diskHeader.stamp.version != VOLUMEHEADERVERSION) {
         Log("VAttachVolume: Volume %s, version number is incorrect; volume needs salvaged\n", path);
         *ec = VSALVAGE;
+       VOL_LOCK;
         goto done;
      }
  
      DiskToVolumeHeader(&iheader, &diskHeader);
+#ifdef FSSYNC_BUILD_CLIENT
      if (programType == volumeUtility && mode != V_SECRETLY && mode != V_PEEK) {
-       if (FSYNC_askfs(iheader.id, partition, FSYNC_NEEDVOLUME, mode)
-           == FSYNC_DENIED) {
+        VOL_LOCK;
+       if (FSYNC_VolOp(iheader.id, partition, FSYNC_VOL_NEEDVOLUME, mode, NULL)
+           != SYNC_OK) {
             Log("VAttachVolume: attach of volume %u apparently denied by file server\n", iheader.id);
             *ec = VNOVOL;       /* XXXX */
             goto done;
         }
+       VOL_UNLOCK;
+    }
+#endif
+
+    if (!vp) {
+      vp = (Volume *) calloc(1, sizeof(Volume));
+      assert(vp != NULL);
+      vp->device = partp->device;
+      vp->partition = partp;
+#ifdef AFS_DEMAND_ATTACH_FS
+      assert(pthread_cond_init(&V_attachCV(vp), NULL) == 0);
+#endif /* AFS_DEMAND_ATTACH_FS */
      }
  
-    vp = attach2(ec, path, &iheader, partp, isbusy);
+    /* attach2 is entered without any locks, and returns
+     * with vol_glock_mutex held */
+    vp = attach2(ec, volumeId, path, &iheader, partp, vp, isbusy, mode);
+
      if (programType == volumeUtility && vp) {
+#ifdef AFS_DEMAND_ATTACH_FS
+       /* for dafs, we should tell the fileserver, except for V_PEEK
+         * where we know it is not necessary */
+       if (mode == V_PEEK) {
+           vp->needsPutBack = 0;
+       } else {
+           vp->needsPutBack = 1;
+       }
+#else /* !AFS_DEMAND_ATTACH_FS */
         /* duplicate computation in fssync.c about whether the server
          * takes the volume offline or not.  If the volume isn't
          * offline, we must not return it when we detach the volume,
@@ -772,6 +1908,7 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
             vp->needsPutBack = 0;
         else
             vp->needsPutBack = 1;
+#endif /* !AFS_DEMAND_ATTACH_FS */
      }
      /* OK, there's a problem here, but one that I don't know how to
       * fix right now, and that I don't think should arise often.
@@ -784,10 +1921,13 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
       * for all of that to happen, but if it does, probably the right
       * fix is for the server to allow the return of readonly volumes
       * that it doesn't think are really checked out. */
+#ifdef FSSYNC_BUILD_CLIENT
      if (programType == volumeUtility && vp == NULL &&
         mode != V_SECRETLY && mode != V_PEEK) {
-       FSYNC_askfs(iheader.id, partition, FSYNC_ON, 0);
-    } else if (programType == fileServer && vp) {
+       FSYNC_VolOp(iheader.id, partition, FSYNC_VOL_ON, 0, NULL);
+    } else 
+#endif
+    if (programType == fileServer && vp) {
         V_needsCallback(vp) = 0;
  #ifdef notdef
         if (VInit >= 2 && V_BreakVolumeCallbacks) {
@@ -795,7 +1935,7 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
             (*V_BreakVolumeCallbacks) (V_id(vp));
         }
  #endif
-       VUpdateVolume_r(ec, vp);
+       VUpdateVolume_r(ec, vp, 0);
         if (*ec) {
             Log("VAttachVolume: Error updating volume\n");
             if (vp)
@@ -803,7 +1943,8 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
             goto done;
         }
         if (VolumeWriteable(vp) && V_dontSalvage(vp) == 0) {
-           /* This is a hack: by temporarily settint the incore
+#ifndef AFS_DEMAND_ATTACH_FS
+           /* This is a hack: by temporarily setting the incore
              * dontSalvage flag ON, the volume will be put back on the
              * Update list (with dontSalvage OFF again).  It will then
              * come back in N minutes with DONT_SALVAGE eventually
@@ -812,6 +1953,7 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
              * offline without DONT SALVAGE having been set also
              * eventually get it set */
             V_dontSalvage(vp) = DONT_SALVAGE;
+#endif /* !AFS_DEMAND_ATTACH_FS */
             VAddToVolumeUpdateList_r(ec, vp);
             if (*ec) {
                 Log("VAttachVolume: Error adding volume to update list\n");
@@ -828,25 +1970,196 @@ VAttachVolumeByName_r(Error * ec, char *partition, char *name, int mode)
      if (programType == volumeUtility) {
         VUnlockPartition_r(partition);
      }
-    if (*ec)
+    if (*ec) {
+#ifdef AFS_DEMAND_ATTACH_FS
+       if (vp) {
+           V_attachState(vp) = VOL_STATE_ERROR;
+           assert(pthread_cond_broadcast(&V_attachCV(vp)) == 0);
+       }
+#endif /* AFS_DEMAND_ATTACH_FS */
         return NULL;
-    else
+    } else {
         return vp;
+    }
  }
  
-private Volume *
-attach2(Error * ec, char *path, register struct VolumeHeader * header,
-       struct DiskPartition * partp, int isbusy)
+#ifdef AFS_DEMAND_ATTACH_FS
+/* VAttachVolumeByVp_r
+ *
+ * finish attaching a volume that is
+ * in a less than fully attached state
+ */
+/* caller MUST hold a ref count on vp */
+static Volume *
+VAttachVolumeByVp_r(Error * ec, Volume * vp, int mode)
  {
-    register Volume *vp;
+    char name[VMAXPATHLEN];
+    int fd, n, reserve = 0;
+    struct afs_stat status;
+    struct VolumeDiskHeader diskHeader;
+    struct VolumeHeader iheader;
+    struct DiskPartition *partp;
+    char path[64];
+    int isbusy = 0;
+    VolId volumeId;
+    Volume * nvp;
+    VolumeStats stats_save;
+    *ec = 0;
  
-    VOL_UNLOCK;
+    /* volume utility should never call AttachByVp */
+    assert(programType == fileServer);
+   
+    volumeId = vp->hashid;
+    partp = vp->partition;
+    VolumeExternalName_r(volumeId, name, sizeof(name));
+
+
+    /* if another thread is performing a blocking op, wait */
+    VWaitExclusiveState_r(vp);
+
+    memcpy(&stats_save, &vp->stats, sizeof(VolumeStats));
+
+    /* if it's already attached, see if we can return it */
+    if (V_attachState(vp) == VOL_STATE_ATTACHED) {
+       VGetVolumeByVp_r(ec, vp);
+       if (V_inUse(vp)) {
+           return vp;
+       } else {
+           if (vp->specialStatus == VBUSY)
+               isbusy = 1;
+           VDetachVolume_r(ec, vp);
+           if (*ec) {
+               Log("VAttachVolume: Error detaching volume (%s)\n", name);
+           }
+           vp = NULL;
+       }
+    }
  
-    vp = (Volume *) calloc(1, sizeof(Volume));
+    /* pre-attach volume if it hasn't been done yet */
+    if (!vp || 
+       (V_attachState(vp) == VOL_STATE_UNATTACHED) ||
+       (V_attachState(vp) == VOL_STATE_ERROR)) {
+       nvp = VPreAttachVolumeById_r(ec, partp, vp, volumeId);
+       if (*ec) {
+           return NULL;
+       }
+       if (nvp != vp) {
+           reserve = 1;
+           VCreateReservation_r(nvp);
+           vp = nvp;
+       }
+    }
+    
      assert(vp != NULL);
+    VChangeState_r(vp, VOL_STATE_ATTACHING);
+
+    /* restore monotonically increasing stats */
+    memcpy(&vp->stats, &stats_save, sizeof(VolumeStats));
+
+    *ec = 0;
+
+
+    /* compute path to disk header, 
+     * read in header, 
+     * and verify magic and version stamps */
+    strcpy(path, VPartitionPath(partp));
+
+    VOL_UNLOCK;
+
+    strcat(path, "/");
+    strcat(path, name);
+    if ((fd = afs_open(path, O_RDONLY)) == -1 || afs_fstat(fd, &status) == -1) {
+       Log("VAttachVolume: Failed to open %s (errno %d)\n", path, errno);
+       if (fd > -1)
+           close(fd);
+       *ec = VNOVOL;
+       VOL_LOCK;
+       goto done;
+    }
+    n = read(fd, &diskHeader, sizeof(diskHeader));
+    close(fd);
+    if (n != sizeof(diskHeader)
+       || diskHeader.stamp.magic != VOLUMEHEADERMAGIC) {
+       Log("VAttachVolume: Error reading volume header %s\n", path);
+       *ec = VSALVAGE;
+       VOL_LOCK;
+       goto done;
+    }
+    if (diskHeader.stamp.version != VOLUMEHEADERVERSION) {
+       Log("VAttachVolume: Volume %s, version number is incorrect; volume needs salvaged\n", path);
+       *ec = VSALVAGE;
+       VOL_LOCK;
+       goto done;
+    }
+
+    /* convert on-disk header format to in-memory header format */
+    DiskToVolumeHeader(&iheader, &diskHeader);
+
+    /* do volume attach
+     *
+     * NOTE: attach2 is entered without any locks, and returns
+     * with vol_glock_mutex held */
+    vp = attach2(ec, volumeId, path, &iheader, partp, vp, isbusy, mode);
+
+    if (*ec || vp == NULL) {
+       goto done;
+    }
+
+    V_needsCallback(vp) = 0;
+    VUpdateVolume_r(ec, vp, 0);
+    if (*ec) {
+       Log("VAttachVolume: Error updating volume %u\n", vp->hashid);
+       VPutVolume_r(vp);
+       goto done;
+    }
+    if (VolumeWriteable(vp) && V_dontSalvage(vp) == 0) {
+#ifndef AFS_DEMAND_ATTACH_FS
+       /* This is a hack: by temporarily setting the incore
+        * dontSalvage flag ON, the volume will be put back on the
+        * Update list (with dontSalvage OFF again).  It will then
+        * come back in N minutes with DONT_SALVAGE eventually
+        * set.  This is the way that volumes that have never had
+        * it set get it set; or that volumes that have been
+        * offline without DONT SALVAGE having been set also
+        * eventually get it set */
+       V_dontSalvage(vp) = DONT_SALVAGE;
+#endif /* !AFS_DEMAND_ATTACH_FS */
+       VAddToVolumeUpdateList_r(ec, vp);
+       if (*ec) {
+           Log("VAttachVolume: Error adding volume %u to update list\n", vp->hashid);
+           if (vp)
+               VPutVolume_r(vp);
+           goto done;
+       }
+    }
+    if (LogLevel)
+       Log("VOnline:  volume %u (%s) attached and online\n", V_id(vp),
+           V_name(vp));
+  done:
+    if (reserve) {
+       VCancelReservation_r(nvp);
+       reserve = 0;
+    }
+    if (*ec && (*ec != VOFFLINE) && (*ec != VSALVAGE)) {
+       if (vp && !IsErrorState(V_attachState(vp))) {
+           VChangeState_r(vp, VOL_STATE_ERROR);
+       }
+       return NULL;
+    } else {
+       return vp;
+    }
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/*
+ * called without any locks held
+ * returns with vol_glock_mutex held
+ */
+private Volume * 
+attach2(Error * ec, VolId volumeId, char *path, register struct VolumeHeader * header,
+       struct DiskPartition * partp, register Volume * vp, int isbusy, int mode)
+{
      vp->specialStatus = (byte) (isbusy ? VBUSY : 0);
-    vp->device = partp->device;
-    vp->partition = partp;
      IH_INIT(vp->vnodeIndex[vLarge].handle, partp->device, header->parent,
             header->largeVnodeIndex);
      IH_INIT(vp->vnodeIndex[vSmall].handle, partp->device, header->parent,
@@ -857,8 +2170,15 @@ attach2(Error * ec, char *path, register struct VolumeHeader * header,
      vp->shuttingDown = 0;
      vp->goingOffline = 0;
      vp->nUsers = 1;
+#ifdef AFS_DEMAND_ATTACH_FS
+    vp->stats.last_attach = FT_ApproxTime();
+    vp->stats.attaches++;
+#endif
  
      VOL_LOCK;
+#ifdef AFS_DEMAND_ATTACH_FS
+    IncUInt64(&VStats.attaches);
+#endif
      vp->cacheCheck = ++VolumeCacheCheck;
      /* just in case this ever rolls over */
      if (!vp->cacheCheck)
@@ -866,15 +2186,76 @@ attach2(Error * ec, char *path, register struct VolumeHeader * header,
      GetVolumeHeader(vp);
      VOL_UNLOCK;
  
+#if defined(AFS_DEMAND_ATTACH_FS) && defined(FSSYNC_BUILD_CLIENT)
+    /* demand attach changes the V_PEEK mechanism
+     *
+     * we can now suck the current disk data structure over
+     * the fssync interface without going to disk
+     *
+     * (technically, we don't need to restrict this feature
+     *  to demand attach fileservers.  However, I'm trying
+     *  to limit the number of common code changes)
+     */
+    if (programType != fileServer && mode == V_PEEK) {
+       SYNC_response res;
+       res.payload.len = sizeof(VolumeDiskData);
+       res.payload.buf = &vp->header->diskstuff;
+
+       if (FSYNC_VolOp(volumeId,
+                       VPartitionPath(partp),
+                       FSYNC_VOL_QUERY_HDR,
+                       FSYNC_WHATEVER,
+                       &res) == SYNC_OK) {
+           goto disk_header_loaded;
+       }
+    }
+#endif /* AFS_DEMAND_ATTACH_FS && FSSYNC_BUILD_CLIENT */
      (void)ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp),
                      sizeof(V_disk(vp)), VOLUMEINFOMAGIC, VOLUMEINFOVERSION);
  
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* update stats */
      VOL_LOCK;
+    IncUInt64(&VStats.hdr_loads);
+    IncUInt64(&vp->stats.hdr_loads);
+    VOL_UNLOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
+    
      if (*ec) {
         Log("VAttachVolume: Error reading diskDataHandle vol header %s; error=%u\n", path, *ec);
      }
+
+ disk_header_loaded:
+
+#ifdef AFS_DEMAND_ATTACH_FS
      if (!*ec) {
-       struct IndexFileHeader iHead;
+
+       /* check for pending volume operations */
+       if (vp->pending_vol_op) {
+           /* see if the pending volume op requires exclusive access */
+           if (!VVolOpLeaveOnline_r(vp, vp->pending_vol_op)) {
+               /* mark the volume down */
+               *ec = VOFFLINE;
+               VChangeState_r(vp, VOL_STATE_UNATTACHED);
+               if (V_offlineMessage(vp)[0] == '\0')
+                   strlcpy(V_offlineMessage(vp),
+                           "A volume utility is running.", 
+                           sizeof(V_offlineMessage(vp)));
+               V_offlineMessage(vp)[sizeof(V_offlineMessage(vp)) - 1] = '\0';
+
+               /* check to see if we should set the specialStatus flag */
+               if (VVolOpSetVBusy_r(vp, vp->pending_vol_op)) {
+                   vp->specialStatus = VBUSY;
+               }
+           }
+       }
+
+       V_attachFlags(vp) |= VOL_HDR_LOADED;
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    if (!*ec) {
+       struct IndexFileHeader iHead;
  
  #if OPENAFS_VOL_STATS
         /*
@@ -887,65 +2268,117 @@ attach2(Error * ec, char *path, register struct VolumeHeader * header,
             V_stat_initialized(vp) = 1;
         }
  #endif /* OPENAFS_VOL_STATS */
-       VOL_UNLOCK;
+
         (void)ReadHeader(ec, vp->vnodeIndex[vSmall].handle,
                          (char *)&iHead, sizeof(iHead),
                          SMALLINDEXMAGIC, SMALLINDEXVERSION);
-       VOL_LOCK;
+
         if (*ec) {
             Log("VAttachVolume: Error reading smallVnode vol header %s; error=%u\n", path, *ec);
         }
      }
+
      if (!*ec) {
         struct IndexFileHeader iHead;
-       VOL_UNLOCK;
+
         (void)ReadHeader(ec, vp->vnodeIndex[vLarge].handle,
                          (char *)&iHead, sizeof(iHead),
                          LARGEINDEXMAGIC, LARGEINDEXVERSION);
-       VOL_LOCK;
+
         if (*ec) {
             Log("VAttachVolume: Error reading largeVnode vol header %s; error=%u\n", path, *ec);
         }
      }
+
  #ifdef AFS_NAMEI_ENV
      if (!*ec) {
         struct versionStamp stamp;
-       VOL_UNLOCK;
+
         (void)ReadHeader(ec, V_linkHandle(vp), (char *)&stamp,
                          sizeof(stamp), LINKTABLEMAGIC, LINKTABLEVERSION);
-       VOL_LOCK;
+
         if (*ec) {
             Log("VAttachVolume: Error reading namei vol header %s; error=%u\n", path, *ec);
         }
      }
-#endif
+#endif /* AFS_NAMEI_ENV */
+
+#if defined(AFS_DEMAND_ATTACH_FS)
+    if (*ec && ((*ec != VOFFLINE) || (V_attachState(vp) != VOL_STATE_UNATTACHED))) {
+        VOL_LOCK;
+       if (programType == fileServer) {
+           VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
+           vp->nUsers = 0;
+           *ec = VSALVAGING;
+       } else {
+           Log("VAttachVolume: Error attaching volume %s; volume needs salvage; error=%u\n", path, *ec);
+           FreeVolume(vp);
+           *ec = VSALVAGE;
+       }
+       return NULL;
+    } else if (*ec) {
+       /* volume operation in progress */
+       VOL_LOCK;
+       return NULL;
+    }
+#else /* AFS_DEMAND_ATTACH_FS */
      if (*ec) {
         Log("VAttachVolume: Error attaching volume %s; volume needs salvage; error=%u\n", path, *ec);
+        VOL_LOCK;
         FreeVolume(vp);
         return NULL;
      }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
      if (V_needsSalvaged(vp)) {
         if (vp->specialStatus)
             vp->specialStatus = 0;
-       Log("VAttachVolume: volume salvage flag is ON for %s; volume needs salvage\n", path);
-       *ec = VSALVAGE;
+        VOL_LOCK;
+#if defined(AFS_DEMAND_ATTACH_FS)
+       if (programType == fileServer) {
+           VRequestSalvage_r(vp, SALVSYNC_NEEDED, VOL_SALVAGE_INVALIDATE_HEADER);
+           vp->nUsers = 0;
+           *ec = VSALVAGING;
+       } else {
+           Log("VAttachVolume: volume salvage flag is ON for %s; volume needs salvage\n", path);
+           FreeVolume(vp);
+           *ec = VSALVAGE;
+       }
+#else /* AFS_DEMAND_ATTACH_FS */
         FreeVolume(vp);
+       *ec = VSALVAGE;
+#endif /* AFS_DEMAND_ATTACH_FS */
         return NULL;
      }
+
+    VOL_LOCK;
      if (programType == fileServer) {
  #ifndef FAST_RESTART
         if (V_inUse(vp) && VolumeWriteable(vp)) {
             if (!V_needsSalvaged(vp)) {
                 V_needsSalvaged(vp) = 1;
-               VUpdateVolume_r(ec, vp);
+               VUpdateVolume_r(ec, vp, 0);
             }
-           FreeVolume(vp);
+#if defined(AFS_DEMAND_ATTACH_FS)
+           VRequestSalvage_r(vp, SALVSYNC_NEEDED, VOL_SALVAGE_INVALIDATE_HEADER);
+           vp->nUsers = 0;
+           *ec = VSALVAGING;
+#else /* AFS_DEMAND_ATTACH_FS */
             Log("VAttachVolume: volume %s needs to be salvaged; not attached.\n", path);
+           FreeVolume(vp);
             *ec = VSALVAGE;
+#endif /* AFS_DEMAND_ATTACH_FS */
             return NULL;
         }
  #endif /* FAST_RESTART */
+
         if (V_destroyMe(vp) == DESTROY_ME) {
+#if defined(AFS_DEMAND_ATTACH_FS)
+           /* schedule a salvage so the volume goes away on disk */
+           VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
+           VChangeState_r(vp, VOL_STATE_ERROR);
+           vp->nUsers = 0;
+#endif /* AFS_DEMAND_ATTACH_FS */
             FreeVolume(vp);
             Log("VAttachVolume: volume %s is junk; it should be destroyed at next salvage\n", path);
             *ec = VNOVOL;
@@ -953,18 +2386,21 @@ attach2(Error * ec, char *path, register struct VolumeHeader * header,
         }
      }
  
-    AddVolumeToHashTable(vp, V_id(vp));
      vp->nextVnodeUnique = V_uniquifier(vp);
      vp->vnodeIndex[vSmall].bitmap = vp->vnodeIndex[vLarge].bitmap = NULL;
  #ifndef BITMAP_LATER
      if (programType == fileServer && VolumeWriteable(vp)) {
         int i;
         for (i = 0; i < nVNODECLASSES; i++) {
-           VOL_UNLOCK;
-           GetBitmap(ec, vp, i);
-           VOL_LOCK;
+           VGetBitmap_r(ec, vp, i);
             if (*ec) {
+#ifdef AFS_DEMAND_ATTACH_FS
+               VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
+               vp->nUsers = 0;
+               *ec = VSALVAGING;
+#else /* AFS_DEMAND_ATTACH_FS */
                 FreeVolume(vp);
+#endif /* AFS_DEMAND_ATTACH_FS */
                 Log("VAttachVolume: error getting bitmap for volume (%s)\n",
                     path);
                 return NULL;
@@ -982,6 +2418,12 @@ attach2(Error * ec, char *path, register struct VolumeHeader * header,
         }
      }
  
+    AddVolumeToHashTable(vp, V_id(vp));
+#ifdef AFS_DEMAND_ATTACH_FS
+    AddVolumeToVByPList_r(vp);
+    VLRU_Add_r(vp);
+    VChangeState_r(vp, VOL_STATE_ATTACHED);
+#endif
      return vp;
  }
  
@@ -994,11 +2436,9 @@ Volume *
  VAttachVolume(Error * ec, VolumeId volumeId, int mode)
  {
      Volume *retVal;
-    VATTACH_LOCK;
      VOL_LOCK;
      retVal = VAttachVolume_r(ec, volumeId, mode);
      VOL_UNLOCK;
-    VATTACH_UNLOCK;
      return retVal;
  }
  
@@ -1028,21 +2468,39 @@ VAttachVolume_r(Error * ec, VolumeId volumeId, int mode)
   * we still guarantee we won't context swap, but the ref count won't be
   * incremented (otherwise we'd violate the invariant).
   */
+/* NOTE: with the demand attach fileserver extensions, the global lock
+ * is dropped within VHold */
+#ifdef AFS_DEMAND_ATTACH_FS
  static int
  VHold_r(register Volume * vp)
  {
      Error error;
  
-    if (vp->nUsers == 0 && !GetVolumeHeader(vp)) {
-       VolumeReplacements++;
-       ReadHeader(&error, V_diskDataHandle(vp), (char *)&V_disk(vp),
-                  sizeof(V_disk(vp)), VOLUMEINFOMAGIC, VOLUMEINFOVERSION);
-       if (error)
-           return error;
+    VCreateReservation_r(vp);
+    VWaitExclusiveState_r(vp);
+
+    LoadVolumeHeader(&error, vp);
+    if (error) {
+       VCancelReservation_r(vp);
+       return error;
      }
      vp->nUsers++;
+    VCancelReservation_r(vp);
+    return 0;
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+static int
+VHold_r(register Volume * vp)
+{
+    Error error;
+
+    LoadVolumeHeader(&error, vp);
+    if (error)
+       return error;
+    vp->nUsers++;
      return 0;
  }
+#endif /* AFS_DEMAND_ATTACH_FS */
  
  static int
  VHold(register Volume * vp)
@@ -1054,59 +2512,26 @@ VHold(register Volume * vp)
      return retVal;
  }
  
-void
-VTakeOffline_r(register Volume * vp)
-{
-    assert(vp->nUsers > 0);
-    assert(programType == fileServer);
-    vp->goingOffline = 1;
-    V_needsSalvaged(vp) = 1;
-}
  
-void
-VTakeOffline(register Volume * vp)
-{
-    VOL_LOCK;
-    VTakeOffline_r(vp);
-    VOL_UNLOCK;
-}
+/***************************************************/
+/* get and put volume routines                     */
+/***************************************************/
  
  void
  VPutVolume_r(register Volume * vp)
  {
      assert(--vp->nUsers >= 0);
      if (vp->nUsers == 0) {
+       VCheckOffline(vp);
         ReleaseVolumeHeader(vp->header);
-       if (vp->goingOffline) {
-           Error error;
-           assert(programType == fileServer);
-           vp->goingOffline = 0;
-           V_inUse(vp) = 0;
-           VUpdateVolume_r(&error, vp);
-           VCloseVolumeHandles_r(vp);
-           if (LogLevel) {
-               Log("VOffline: Volume %u (%s) is now offline", V_id(vp),
-                   V_name(vp));
-               if (V_offlineMessage(vp)[0])
-                   Log(" (%s)", V_offlineMessage(vp));
-               Log("\n");
-           }
-#ifdef AFS_PTHREAD_ENV
-           assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
-#else /* AFS_PTHREAD_ENV */
-           LWP_NoYieldSignal(VPutVolume);
-#endif /* AFS_PTHREAD_ENV */
-       }
-       if (vp->shuttingDown) {
-           VReleaseVolumeHandles_r(vp);
-           FreeVolume(vp);
-           if (programType == fileServer)
-#ifdef AFS_PTHREAD_ENV
-               assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
-#else /* AFS_PTHREAD_ENV */
-               LWP_NoYieldSignal(VPutVolume);
-#endif /* AFS_PTHREAD_ENV */
+#ifdef AFS_DEMAND_ATTACH_FS
+       if (!VCheckDetach(vp)) {
+           VCheckSalvage(vp);
+           VCheckFree(vp);
         }
+#else /* AFS_DEMAND_ATTACH_FS */
+       VCheckDetach(vp);
+#endif /* AFS_DEMAND_ATTACH_FS */
      }
  }
  
@@ -1118,15 +2543,16 @@ VPutVolume(register Volume * vp)
      VOL_UNLOCK;
  }
  
+
  /* Get a pointer to an attached volume.  The pointer is returned regardless
     of whether or not the volume is in service or on/off line.  An error
     code, however, is returned with an indication of the volume's status */
  Volume *
-VGetVolume(Error * ec, VolId volumeId)
+VGetVolume(Error * ec, Error * client_ec, VolId volumeId)
  {
      Volume *retVal;
      VOL_LOCK;
-    retVal = VGetVolume_r(ec, volumeId);
+    retVal = GetVolume(ec, client_ec, volumeId, NULL, 0);
      VOL_UNLOCK;
      return retVal;
  }
@@ -1134,22 +2560,69 @@ VGetVolume(Error * ec, VolId volumeId)
  Volume *
  VGetVolume_r(Error * ec, VolId volumeId)
  {
-    Volume *vp;
-    unsigned short V0 = 0, V1 = 0, V2 = 0, V3 = 0, V4 = 0, V5 = 0, V6 =
+    return GetVolume(ec, NULL, volumeId, NULL, 0);
+}
+
+/* try to get a volume we've previously looked up */
+/* for demand attach fs, caller MUST NOT hold a ref count on vp */
+Volume * 
+VGetVolumeByVp_r(Error * ec, Volume * vp)
+{
+    return GetVolume(ec, NULL, vp->hashid, vp, 0);
+}
+
+/* private interface for getting a volume handle
+ * volumeId must be provided.
+ * client_ec is optional; if non-NULL it receives the error code for the client
+ * hint is an optional parameter to speed up hash lookups
+ * flags is not used at this time */
+/* for demand attach fs, caller MUST NOT hold a ref count on hint */
+static Volume *
+GetVolume(Error * ec, Error * client_ec, VolId volumeId, Volume * hint, int flags)
+{
+    Volume *vp = hint;
+    /* pull this profiling/debugging code out of regular builds */
+#ifdef notdef
+#define VGET_CTR_INC(x) x++
+    unsigned short V0 = 0, V1 = 0, V2 = 0, V3 = 0, V5 = 0, V6 =
         0, V7 = 0, V8 = 0, V9 = 0;
      unsigned short V10 = 0, V11 = 0, V12 = 0, V13 = 0, V14 = 0, V15 = 0;
+#else
+#define VGET_CTR_INC(x)
+#endif
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    Volume *avp, * rvp = hint;
+
+    if (rvp) {
+       VCreateReservation_r(rvp);
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
  
      for (;;) {
         *ec = 0;
-       V0++;
-       for (vp = VolumeHashTable[VOLUME_HASH(volumeId)];
-            vp && vp->hashid != volumeId; vp = vp->hashNext)
-           Vlooks++;
+       if (client_ec)
+           *client_ec = 0;
+       VGET_CTR_INC(V0);
+
+       vp = VLookupVolume_r(ec, volumeId, vp);
+       if (*ec) {
+           vp = NULL;
+           break;
+       }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+       if (rvp && (rvp != vp)) {
+           /* break reservation on old vp */
+           VCancelReservation_r(rvp);
+           rvp = NULL;
+       }
+#endif /* AFS_DEMAND_ATTACH_FS */
  
         if (!vp) {
-           V1++;
+           VGET_CTR_INC(V1);
             if (VInit < 2) {
-               V2++;
+               VGET_CTR_INC(V2);
                 /* Until we have reached an initialization level of 2
                  * we don't know whether this volume exists or not.
                  * We can't sleep and retry later because before a volume
@@ -1164,99 +2637,255 @@ VGetVolume_r(Error * ec, VolId volumeId)
             break;
         }
  
-       V3++;
-       VolumeGets++;
-       if (vp->nUsers == 0 && !GetVolumeHeader(vp)) {
-           V5++;
-           VolumeReplacements++;
-           ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp),
-                      sizeof(V_disk(vp)), VOLUMEINFOMAGIC,
-                      VOLUMEINFOVERSION);
+       VGET_CTR_INC(V3);
+       IncUInt64(&VStats.hdr_gets);
+       
+#ifdef AFS_DEMAND_ATTACH_FS
+       /* block if someone else is performing an exclusive op on this volume */
+       if (rvp != vp) {
+           rvp = vp;
+           VCreateReservation_r(rvp);
+       }
+       VWaitExclusiveState_r(vp);
+
+       /* short circuit with VNOVOL in the following circumstances:
+        *
+        *   VOL_STATE_ERROR
+        *   VOL_STATE_SHUTTING_DOWN
+        */
+       if ((V_attachState(vp) == VOL_STATE_ERROR) ||
+           (V_attachState(vp) == VOL_STATE_SHUTTING_DOWN)) {
+           *ec = VNOVOL;
+           vp = NULL;
+           break;
+       }
+
+       /* allowable states:
+        *   UNATTACHED
+        *   PREATTACHED
+        *   ATTACHED
+        *   GOING_OFFLINE
+        *   SALVAGING
+        */
+
+       if (vp->salvage.requested) {
+           VUpdateSalvagePriority_r(vp);
+       }
+
+       if (V_attachState(vp) == VOL_STATE_PREATTACHED) {
+           avp = VAttachVolumeByVp_r(ec, vp, 0);
+           if (avp) {
+               if (vp != avp) {
+                   /* VAttachVolumeByVp_r can return a pointer
+                    * != the vp passed to it under certain
+                    * conditions; make sure we don't leak
+                    * reservations if that happens */
+                   vp = avp;
+                   VCancelReservation_r(rvp);
+                   rvp = avp;
+                   VCreateReservation_r(rvp);
+               }
+               VPutVolume_r(avp);
+           }
             if (*ec) {
-               V6++;
-               /* Only log the error if it was a totally unexpected error.  Simply
-                * a missing inode is likely to be caused by the volume being deleted */
-               if (errno != ENXIO || LogLevel)
-                   Log("Volume %u: couldn't reread volume header\n",
-                       vp->hashid);
+               int endloop = 0;
+               switch (*ec) {
+               case VSALVAGING:
+                   break;
+               case VOFFLINE:
+                   if (!vp->pending_vol_op) {
+                       endloop = 1;
+                   }
+                   break;
+               default:
+                   *ec = VNOVOL;
+                   endloop = 1;
+               }
+               if (endloop) {
+                   vp = NULL;
+                   break;
+               }
+           }
+       }
+
+       if ((V_attachState(vp) == VOL_STATE_SALVAGING) ||
+           (*ec == VSALVAGING)) {
+           if (client_ec) {
+               /* see CheckVnode() in afsfileprocs.c for an explanation
+                * of this error code logic */
+               afs_uint32 now = FT_ApproxTime();
+               if ((vp->stats.last_salvage + (10 * 60)) >= now) {
+                   *client_ec = VBUSY;
+               } else {
+                   *client_ec = VRESTARTING;
+               }
+           }
+           *ec = VSALVAGING;
+           vp = NULL;
+           break;
+       }
+
+       if (vp->pending_vol_op && !VVolOpLeaveOnline_r(vp, vp->pending_vol_op)) {
+           if (client_ec) {
+               /* see CheckVnode() in afsfileprocs.c for an explanation
+                * of this error code logic */
+               afs_uint32 now = FT_ApproxTime();
+               if ((vp->stats.last_vol_op + (10 * 60)) >= now) {
+                   *client_ec = VBUSY;
+               } else {
+                   *client_ec = VRESTARTING;
+               }
+           }
+           *ec = VOFFLINE;
+           vp = NULL;
+           break;
+       }
+
+       if (V_attachState(vp) == VOL_STATE_UNATTACHED) {
+           *ec = VOFFLINE;
+           vp = NULL;
+           break;
+       }
+#endif /* AFS_DEMAND_ATTACH_FS */
+       
+       LoadVolumeHeader(ec, vp);
+       if (*ec) {
+           VGET_CTR_INC(V6);
+           /* Only log the error if it was a totally unexpected error.  Simply
+            * a missing inode is likely to be caused by the volume being deleted */
+           if (errno != ENXIO || LogLevel)
+               Log("Volume %u: couldn't reread volume header\n",
+                   vp->hashid);
+#ifdef AFS_DEMAND_ATTACH_FS
+           if (programType == fileServer) {
+               VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
+               *ec = VSALVAGING;
+           } else {
                 FreeVolume(vp);
                 vp = NULL;
-               break;
             }
+#else /* AFS_DEMAND_ATTACH_FS */
+           FreeVolume(vp);
+           vp = NULL;
+#endif /* AFS_DEMAND_ATTACH_FS */
+           break;
         }
-       V7++;
+
+       VGET_CTR_INC(V7);
         if (vp->shuttingDown) {
-           V8++;
+           VGET_CTR_INC(V8);
             *ec = VNOVOL;
             vp = NULL;
             break;
         }
+
         if (programType == fileServer) {
-           V9++;
+           VGET_CTR_INC(V9);
             if (vp->goingOffline) {
-               V10++;
-#ifdef AFS_PTHREAD_ENV
-               pthread_cond_wait(&vol_put_volume_cond, &vol_glock_mutex);
+               VGET_CTR_INC(V10);
+#ifdef AFS_DEMAND_ATTACH_FS
+               /* wait for the volume to go offline */
+               if (V_attachState(vp) == VOL_STATE_GOING_OFFLINE) {
+                   VWaitStateChange_r(vp);
+               }
+#elif defined(AFS_PTHREAD_ENV)
+               assert(pthread_cond_wait(&vol_put_volume_cond, &vol_glock_mutex) == 0);
  #else /* AFS_PTHREAD_ENV */
                 LWP_WaitProcess(VPutVolume);
  #endif /* AFS_PTHREAD_ENV */
                 continue;
             }
             if (vp->specialStatus) {
-               V11++;
+               VGET_CTR_INC(V11);
                 *ec = vp->specialStatus;
             } else if (V_inService(vp) == 0 || V_blessed(vp) == 0) {
-               V12++;
+               VGET_CTR_INC(V12);
                 *ec = VNOVOL;
             } else if (V_inUse(vp) == 0) {
-               V13++;
+               VGET_CTR_INC(V13);
                 *ec = VOFFLINE;
             } else {
-               V14++;
+               VGET_CTR_INC(V14);
             }
         }
         break;
      }
-    V15++;
+    VGET_CTR_INC(V15);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* bump nUsers whenever a volume is returned; *ec may still be set */
+    if (vp) {
+       vp->nUsers++;
+       VLRU_UpdateAccess_r(vp);
+    }
+    if (rvp) {
+       VCancelReservation_r(rvp);
+       rvp = NULL;
+    }
+    if (client_ec && !*client_ec) {
+       *client_ec = *ec;
+    }
+#else /* AFS_DEMAND_ATTACH_FS */
      /* if no error, bump nUsers */
-    if (vp)
+    if (vp) {
         vp->nUsers++;
+    }
+    if (client_ec) {
+       *client_ec = *ec;
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
  
      assert(vp || *ec);
      return vp;
  }
  
  
-/* For both VForceOffline and VOffline, we close all relevant handles.
- * For VOffline, if we re-attach the volume, the files may possible be
- * different than before. 
- */
-static void
-VReleaseVolumeHandles_r(Volume * vp)
+/***************************************************/
+/* Volume offline/detach routines                  */
+/***************************************************/
+
+/* caller MUST hold a heavyweight ref on vp (i.e. vp->nUsers > 0) */
+#ifdef AFS_DEMAND_ATTACH_FS
+void
+VTakeOffline_r(register Volume * vp)
  {
-    DFlushVolume(V_id(vp));
-    VReleaseVnodeFiles_r(vp);
+    assert(vp->nUsers > 0);
+    assert(programType == fileServer);
  
-    /* Too time consuming and unnecessary for the volserver */
-    if (programType != volumeUtility) {
-       IH_CONDSYNC(vp->vnodeIndex[vLarge].handle);
-       IH_CONDSYNC(vp->vnodeIndex[vSmall].handle);
-       IH_CONDSYNC(vp->diskDataHandle);
-#ifdef AFS_NT40_ENV
-       IH_CONDSYNC(vp->linkHandle);
-#endif /* AFS_NT40_ENV */
-    }
+    VCreateReservation_r(vp);
+    VWaitExclusiveState_r(vp);
  
-    IH_RELEASE(vp->vnodeIndex[vLarge].handle);
-    IH_RELEASE(vp->vnodeIndex[vSmall].handle);
-    IH_RELEASE(vp->diskDataHandle);
-    IH_RELEASE(vp->linkHandle);
+    vp->goingOffline = 1;
+    V_needsSalvaged(vp) = 1;
+
+    VRequestSalvage_r(vp, SALVSYNC_ERROR, 0);
+    VCancelReservation_r(vp);
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+void
+VTakeOffline_r(register Volume * vp)
+{
+    assert(vp->nUsers > 0);
+    assert(programType == fileServer);
+
+    vp->goingOffline = 1;
+    V_needsSalvaged(vp) = 1;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+void
+VTakeOffline(register Volume * vp)
+{
+    VOL_LOCK;
+    VTakeOffline_r(vp);
+    VOL_UNLOCK;
  }
  
  /* Force the volume offline, set the salvage flag.  No further references to
   * the volume through the volume package will be honored. */
+/* for demand attach, caller MUST hold ref count on vp */
  void
-VForceOffline_r(Volume * vp)
+VForceOffline_r(Volume * vp, int flags)
  {
      Error error;
      if (!V_inUse(vp))
@@ -1267,7 +2896,17 @@ VForceOffline_r(Volume * vp)
      V_inUse(vp) = 0;
      vp->goingOffline = 0;
      V_needsSalvaged(vp) = 1;
-    VUpdateVolume_r(&error, vp);
+    if (!(flags & VOL_FORCEOFF_NOUPDATE)) {
+       VUpdateVolume_r(&error, vp, VOL_UPDATE_WAIT | VOL_UPDATE_NOFORCEOFF);
+    }
+#ifdef AFS_DEMAND_ATTACH_FS
+#ifdef SALVSYNC_BUILD_CLIENT
+    if (programType == fileServer) {
+       VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
+    }
+#endif
+    VChangeState_r(vp, VOL_STATE_ERROR);
+#endif /* AFS_DEMAND_ATTACH_FS */
  #ifdef AFS_PTHREAD_ENV
      assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
  #else /* AFS_PTHREAD_ENV */
@@ -1275,14 +2914,13 @@ VForceOffline_r(Volume * vp)
  #endif /* AFS_PTHREAD_ENV */
  
      VReleaseVolumeHandles_r(vp);
-
  }
  
  void
  VForceOffline(Volume * vp)
  {
      VOL_LOCK;
-    VForceOffline_r(vp);
+    VForceOffline_r(vp, 0);
      VOL_UNLOCK;
  }
  
@@ -1295,6 +2933,7 @@ VOffline_r(Volume * vp, char *message)
  {
      Error error;
      VolumeId vid = V_id(vp);
+
      assert(programType != volumeUtility);
      if (!V_inUse(vp)) {
         VPutVolume_r(vp);
@@ -1303,11 +2942,24 @@ VOffline_r(Volume * vp, char *message)
      if (V_offlineMessage(vp)[0] == '\0')
         strncpy(V_offlineMessage(vp), message, sizeof(V_offlineMessage(vp)));
      V_offlineMessage(vp)[sizeof(V_offlineMessage(vp)) - 1] = '\0';
+
      vp->goingOffline = 1;
+#ifdef AFS_DEMAND_ATTACH_FS
+    VChangeState_r(vp, VOL_STATE_GOING_OFFLINE);
+    VCreateReservation_r(vp);
+    VPutVolume_r(vp);
+
+    /* wait for the volume to go offline */
+    if (V_attachState(vp) == VOL_STATE_GOING_OFFLINE) {
+       VWaitStateChange_r(vp);
+    }
+    VCancelReservation_r(vp);
+#else /* AFS_DEMAND_ATTACH_FS */
      VPutVolume_r(vp);
      vp = VGetVolume_r(&error, vid);    /* Wait for it to go offline */
      if (vp)                    /* In case it was reattached... */
         VPutVolume_r(vp);
+#endif /* AFS_DEMAND_ATTACH_FS */
  }
  
  void
@@ -1318,36 +2970,12 @@ VOffline(Volume * vp, char *message)
      VOL_UNLOCK;
  }
  
-/* For VDetachVolume, we close all cached file descriptors, but keep
- * the Inode handles in case we need to read from a busy volume.
- */
-static void
-VCloseVolumeHandles_r(Volume * vp)
-{
-    DFlushVolume(V_id(vp));
-    VCloseVnodeFiles_r(vp);
-
-    /* Too time consuming and unnecessary for the volserver */
-    if (programType != volumeUtility) {
-       IH_CONDSYNC(vp->vnodeIndex[vLarge].handle);
-       IH_CONDSYNC(vp->vnodeIndex[vSmall].handle);
-       IH_CONDSYNC(vp->diskDataHandle);
-#ifdef AFS_NT40_ENV
-       IH_CONDSYNC(vp->linkHandle);
-#endif /* AFS_NT40_ENV */
-    }
-
-    IH_REALLYCLOSE(vp->vnodeIndex[vLarge].handle);
-    IH_REALLYCLOSE(vp->vnodeIndex[vSmall].handle);
-    IH_REALLYCLOSE(vp->diskDataHandle);
-    IH_REALLYCLOSE(vp->linkHandle);
-}
-
  /* This gets used for the most part by utility routines that don't want
   * to keep all the volume headers around.  Generally, the file server won't
   * call this routine, because then the offline message in the volume header
- * (or other information) will still be available to clients. For NAMEI, also
- * close the file handles.
+ * (or other information) won't be available to clients. For NAMEI, also
+ * close the file handles.  However, the fileserver does call this during
+ * an attach following a volume operation.
   */
  void
  VDetachVolume_r(Error * ec, Volume * vp)
@@ -1365,9 +2993,18 @@ VDetachVolume_r(Error * ec, Volume * vp)
      volume = V_id(vp);
      DeleteVolumeFromHashTable(vp);
      vp->shuttingDown = 1;
+#ifdef AFS_DEMAND_ATTACH_FS
+    DeleteVolumeFromVByPList_r(vp);
+    VLRU_Delete_r(vp);
+    VChangeState_r(vp, VOL_STATE_SHUTTING_DOWN);
+#endif /* AFS_DEMAND_ATTACH_FS */
      VPutVolume_r(vp);
      /* Will be detached sometime in the future--this is OK since volume is offline */
  
+    /* XXX the following code should really be moved to VCheckDetach() since the volume
+     * is not technically detached until the refcounts reach zero
+     */
+#ifdef FSSYNC_BUILD_CLIENT
      if (programType == volumeUtility && notifyServer) {
         /* 
          * Note:  The server is not notified in the case of a bogus volume 
@@ -1378,19 +3015,26 @@ VDetachVolume_r(Error * ec, Volume * vp)
          * would be two instances of the same volume, one of them bogus, 
          * which the file server would attempt to put on line 
          */
-       if (useDone)
+       if (useDone) {
             /* don't put online */
-           FSYNC_askfs(volume, tpartp->name, FSYNC_DONE, 0);
-       else {
+           FSYNC_VolOp(volume, tpartp->name, FSYNC_VOL_DONE, 0, NULL);
+       } else {
             /* fs can use it again */
-           FSYNC_askfs(volume, tpartp->name, FSYNC_ON, 0);
+           FSYNC_VolOp(volume, tpartp->name, FSYNC_VOL_ON, 0, NULL);
+
+           /* XXX this code path is only hit by volume utilities, thus
+            * V_BreakVolumeCallbacks will always be NULL.  if we really
+            * want to break callbacks in this path we need to use FSYNC_VolOp() */
+#ifdef notdef
             /* Dettaching it so break all callbacks on it */
             if (V_BreakVolumeCallbacks) {
                 Log("volume %u detached; breaking all call backs\n", volume);
                 (*V_BreakVolumeCallbacks) (volume);
             }
+#endif
         }
      }
+#endif /* FSSYNC_BUILD_CLIENT */
  }
  
  void
@@ -1402,134 +3046,148 @@ VDetachVolume(Error * ec, Volume * vp)
  }
  
  
-VnodeId
-VAllocBitmapEntry_r(Error * ec, Volume * vp, register struct vnodeIndex
-                   *index)
+/***************************************************/
+/* Volume fd/inode handle closing routines         */
+/***************************************************/
+
+/* For VDetachVolume, we close all cached file descriptors, but keep
+ * the Inode handles in case we need to read from a busy volume.
+ */
+/* for demand attach, caller MUST hold ref count on vp */
+static void
+VCloseVolumeHandles_r(Volume * vp)
  {
-    register byte *bp, *ep;
-    *ec = 0;
-    /* This test is probably redundant */
-    if (!VolumeWriteable(vp)) {
-       *ec = (bit32) VREADONLY;
-       return 0;
-    }
-#ifdef BITMAP_LATER
-    if ((programType == fileServer) && !index->bitmap) {
-       int i;
-       int wasVBUSY = 0;
-       if (vp->specialStatus == VBUSY) {
-           if (vp->goingOffline) {     /* vos dump waiting for the volume to
-                                        * go offline. We probably come here
-                                        * from AddNewReadableResidency */
-               wasVBUSY = 1;
-           } else {
-               VOL_UNLOCK;
-               while (vp->specialStatus == VBUSY)
-#ifdef AFS_PTHREAD_ENV
-                   sleep(2);
-#else /* AFS_PTHREAD_ENV */
-                   IOMGR_Sleep(2);
-#endif /* AFS_PTHREAD_ENV */
-               VOL_LOCK;
-           }
-       }
-       if (!index->bitmap) {
-           vp->specialStatus = VBUSY;  /* Stop anyone else from using it. */
-           for (i = 0; i < nVNODECLASSES; i++) {
-               VOL_UNLOCK;
-               GetBitmap(ec, vp, i);
-               VOL_LOCK;
-               if (*ec) {
-                   vp->specialStatus = 0;
-                   vp->shuttingDown = 1;       /* Let who has it free it. */
-                   return NULL;
-               }
-           }
-           if (!wasVBUSY)
-               vp->specialStatus = 0;  /* Allow others to have access. */
-       }
-    }
-#endif /* BITMAP_LATER */
-    bp = index->bitmap + index->bitmapOffset;
-    ep = index->bitmap + index->bitmapSize;
-    while (bp < ep) {
-       if ((*(bit32 *) bp) != (bit32) 0xffffffff) {
-           int o;
-           index->bitmapOffset = (afs_uint32) (bp - index->bitmap);
-           while (*bp == 0xff)
-               bp++;
-           o = ffs(~*bp) - 1;  /* ffs is documented in BSTRING(3) */
-           *bp |= (1 << o);
-           return (VnodeId) ((bp - index->bitmap) * 8 + o);
-       }
-       bp += sizeof(bit32) /* i.e. 4 */ ;
+#ifdef AFS_DEMAND_ATTACH_FS
+    VolState state_save;
+
+    state_save = VChangeState_r(vp, VOL_STATE_OFFLINING);
+#endif
+
+    /* demand attach fs
+     *
+     * XXX need to investigate whether we can perform
+     * DFlushVolume outside of vol_glock_mutex... 
+     *
+     * VCloseVnodeFiles_r drops the glock internally */
+    DFlushVolume(V_id(vp));
+    VCloseVnodeFiles_r(vp);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    VOL_UNLOCK;
+#endif
+
+    /* Too time consuming and unnecessary for the volserver */
+    if (programType != volumeUtility) {
+       IH_CONDSYNC(vp->vnodeIndex[vLarge].handle);
+       IH_CONDSYNC(vp->vnodeIndex[vSmall].handle);
+       IH_CONDSYNC(vp->diskDataHandle);
+#ifdef AFS_NT40_ENV
+       IH_CONDSYNC(vp->linkHandle);
+#endif /* AFS_NT40_ENV */
      }
-    /* No bit map entry--must grow bitmap */
-    bp = (byte *)
-       realloc(index->bitmap, index->bitmapSize + VOLUME_BITMAP_GROWSIZE);
-    assert(bp != NULL);
-    index->bitmap = bp;
-    bp += index->bitmapSize;
-    memset(bp, 0, VOLUME_BITMAP_GROWSIZE);
-    index->bitmapOffset = index->bitmapSize;
-    index->bitmapSize += VOLUME_BITMAP_GROWSIZE;
-    *bp = 1;
-    return index->bitmapOffset * 8;
-}
  
-VnodeId
-VAllocBitmapEntry(Error * ec, Volume * vp, register struct vnodeIndex * index)
-{
-    VnodeId retVal;
+    IH_REALLYCLOSE(vp->vnodeIndex[vLarge].handle);
+    IH_REALLYCLOSE(vp->vnodeIndex[vSmall].handle);
+    IH_REALLYCLOSE(vp->diskDataHandle);
+    IH_REALLYCLOSE(vp->linkHandle);
+
+#ifdef AFS_DEMAND_ATTACH_FS
      VOL_LOCK;
-    retVal = VAllocBitmapEntry_r(ec, vp, index);
-    VOL_UNLOCK;
-    return retVal;
+    VChangeState_r(vp, state_save);
+#endif
  }
  
-void
-VFreeBitMapEntry_r(Error * ec, register struct vnodeIndex *index,
-                  unsigned bitNumber)
+/* For both VForceOffline and VOffline, we close all relevant handles.
+ * For VOffline, if we re-attach the volume, the files may possibly be
+ * different than before. 
+ */
+/* for demand attach, caller MUST hold a ref count on vp */
+static void
+VReleaseVolumeHandles_r(Volume * vp)
  {
-    unsigned int offset;
-    *ec = 0;
-#ifdef BITMAP_LATER
-    if (!index->bitmap)
-       return;
-#endif /* BITMAP_LATER */
-    offset = bitNumber >> 3;
-    if (offset >= index->bitmapSize) {
-       *ec = VNOVNODE;
-       return;
+#ifdef AFS_DEMAND_ATTACH_FS
+    VolState state_save;
+
+    state_save = VChangeState_r(vp, VOL_STATE_DETACHING);
+#endif
+
+    /* XXX need to investigate whether we can perform
+     * DFlushVolume outside of vol_glock_mutex... */
+    DFlushVolume(V_id(vp));
+
+    VReleaseVnodeFiles_r(vp); /* releases the glock internally */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    VOL_UNLOCK;
+#endif
+
+    /* Too time consuming and unnecessary for the volserver */
+    if (programType != volumeUtility) {
+       IH_CONDSYNC(vp->vnodeIndex[vLarge].handle);
+       IH_CONDSYNC(vp->vnodeIndex[vSmall].handle);
+       IH_CONDSYNC(vp->diskDataHandle);
+#ifdef AFS_NT40_ENV
+       IH_CONDSYNC(vp->linkHandle);
+#endif /* AFS_NT40_ENV */
      }
-    if (offset < index->bitmapOffset)
-       index->bitmapOffset = offset & ~3;      /* Truncate to nearest bit32 */
-    *(index->bitmap + offset) &= ~(1 << (bitNumber & 0x7));
-}
  
-void
-VFreeBitMapEntry(Error * ec, register struct vnodeIndex *index,
-                unsigned bitNumber)
-{
+    IH_RELEASE(vp->vnodeIndex[vLarge].handle);
+    IH_RELEASE(vp->vnodeIndex[vSmall].handle);
+    IH_RELEASE(vp->diskDataHandle);
+    IH_RELEASE(vp->linkHandle);
+
+#ifdef AFS_DEMAND_ATTACH_FS
      VOL_LOCK;
-    VFreeBitMapEntry_r(ec, index, bitNumber);
-    VOL_UNLOCK;
+    VChangeState_r(vp, state_save);
+#endif
  }
  
+
+/***************************************************/
+/* Volume write and fsync routines                 */
+/***************************************************/
+
  void
-VUpdateVolume_r(Error * ec, Volume * vp)
+VUpdateVolume_r(Error * ec, Volume * vp, int flags)
  {
+#ifdef AFS_DEMAND_ATTACH_FS
+    VolState state_save;
+
+    if (flags & VOL_UPDATE_WAIT) {
+       VCreateReservation_r(vp);
+       VWaitExclusiveState_r(vp);
+    }
+#endif
+
      *ec = 0;
      if (programType == fileServer)
         V_uniquifier(vp) =
             (V_inUse(vp) ? V_nextVnodeUnique(vp) +
              200 : V_nextVnodeUnique(vp));
-    /*printf("Writing volume header for '%s'\n", V_name(vp)); */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    state_save = VChangeState_r(vp, VOL_STATE_UPDATING);
+    VOL_UNLOCK;
+#endif
+
      WriteVolumeHeader_r(ec, vp);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    VOL_LOCK;
+    VChangeState_r(vp, state_save);
+    if (flags & VOL_UPDATE_WAIT) {
+       VCancelReservation_r(vp);
+    }
+#endif
+
      if (*ec) {
         Log("VUpdateVolume: error updating volume header, volume %u (%s)\n",
             V_id(vp), V_name(vp));
-       VForceOffline_r(vp);
+       /* force the volume offline without retrying the header
+        * write, while preventing infinite recursion */
+       if (!(flags & VOL_UPDATE_NOFORCEOFF)) {
+           VForceOffline_r(vp, VOL_FORCEOFF_NOUPDATE);
+       }
      }
  }
  
@@ -1537,22 +3195,38 @@ void
  VUpdateVolume(Error * ec, Volume * vp)
  {
      VOL_LOCK;
-    VUpdateVolume_r(ec, vp);
+    VUpdateVolume_r(ec, vp, VOL_UPDATE_WAIT);
      VOL_UNLOCK;
  }
  
  void
-VSyncVolume_r(Error * ec, Volume * vp)
+VSyncVolume_r(Error * ec, Volume * vp, int flags)
  {
      FdHandle_t *fdP;
-    VUpdateVolume_r(ec, vp);
-    if (!ec) {
-       int code;
+    int code;
+#ifdef AFS_DEMAND_ATTACH_FS
+    VolState state_save;
+#endif
+
+    if (flags & VOL_SYNC_WAIT) {
+       VUpdateVolume_r(ec, vp, VOL_UPDATE_WAIT);
+    } else {
+       VUpdateVolume_r(ec, vp, 0);
+    }
+    if (!*ec) {
+#ifdef AFS_DEMAND_ATTACH_FS
+       state_save = VChangeState_r(vp, VOL_STATE_UPDATING);
+       VOL_UNLOCK;
+#endif
         fdP = IH_OPEN(V_diskDataHandle(vp));
         assert(fdP != NULL);
         code = FDH_SYNC(fdP);
         assert(code == 0);
         FDH_CLOSE(fdP);
+#ifdef AFS_DEMAND_ATTACH_FS
+       VOL_LOCK;
+       VChangeState_r(vp, state_save);
+#endif
      }
  }
  
@@ -1560,369 +3234,2141 @@ void
  VSyncVolume(Error * ec, Volume * vp)
  {
      VOL_LOCK;
-    VSyncVolume_r(ec, vp);
+    VSyncVolume_r(ec, vp, VOL_SYNC_WAIT);
      VOL_UNLOCK;
  }
  
+
+/***************************************************/
+/* Volume deallocation routines                    */
+/***************************************************/
+
+#ifdef AFS_DEMAND_ATTACH_FS
  static void
  FreeVolume(Volume * vp)
+{
+    /* free the heap space, iff it's safe.
+     * otherwise, pull it out of the hash table, so it
+     * will get deallocated when all refs to it go away */
+    if (!VCheckFree(vp)) {
+       DeleteVolumeFromHashTable(vp);
+       DeleteVolumeFromVByPList_r(vp);
+
+       /* make sure we invalidate the header cache entry */
+       FreeVolumeHeader(vp);
+    }
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+static void
+ReallyFreeVolume(Volume * vp)
  {
      int i;
      if (!vp)
         return;
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* debug */
+    VChangeState_r(vp, VOL_STATE_FREED);
+    if (vp->pending_vol_op)
+       free(vp->pending_vol_op);
+#endif /* AFS_DEMAND_ATTACH_FS */
      for (i = 0; i < nVNODECLASSES; i++)
         if (vp->vnodeIndex[i].bitmap)
             free(vp->vnodeIndex[i].bitmap);
      FreeVolumeHeader(vp);
+#ifndef AFS_DEMAND_ATTACH_FS
      DeleteVolumeFromHashTable(vp);
+#endif /* AFS_DEMAND_ATTACH_FS */
      free(vp);
  }
  
-static void
-GetBitmap(Error * ec, Volume * vp, VnodeClass class)
+/* check to see if we should shut down this volume
+ * returns 1 if volume was freed, 0 otherwise */
+#ifdef AFS_DEMAND_ATTACH_FS
+static int
+VCheckDetach(register Volume * vp)
  {
-    StreamHandle_t *file;
-    int nVnodes;
-    int size;
-    struct VnodeClassInfo *vcp = &VnodeClassInfo[class];
-    struct vnodeIndex *vip = &vp->vnodeIndex[class];
-    struct VnodeDiskObject *vnode;
-    unsigned int unique = 0;
-    FdHandle_t *fdP;
-#ifdef BITMAP_LATER
-    byte *BitMap = 0;
-#endif /* BITMAP_LATER */
+    int ret = 0;
  
-    *ec = 0;
+    if (vp->nUsers || vp->nWaiters)
+       return ret;
  
-    fdP = IH_OPEN(vip->handle);
-    assert(fdP != NULL);
-    file = FDH_FDOPEN(fdP, "r");
-    assert(file != NULL);
-    vnode = (VnodeDiskObject *) malloc(vcp->diskSize);
-    assert(vnode != NULL);
-    size = OS_SIZE(fdP->fd_fd);
-    assert(size != -1);
-    nVnodes = (size <= vcp->diskSize ? 0 : size - vcp->diskSize)
-       >> vcp->logSize;
-    vip->bitmapSize = ((nVnodes / 8) + 10) / 4 * 4;    /* The 10 is a little extra so
-                                                        * a few files can be created in this volume,
-                                                        * the whole thing is rounded up to nearest 4
-                                                        * bytes, because the bit map allocator likes
-                                                        * it that way */
-#ifdef BITMAP_LATER
-    BitMap = (byte *) calloc(1, vip->bitmapSize);
-    assert(BitMap != NULL);
-#else /* BITMAP_LATER */
-    vip->bitmap = (byte *) calloc(1, vip->bitmapSize);
-    assert(vip->bitmap != NULL);
-    vip->bitmapOffset = 0;
-#endif /* BITMAP_LATER */
-    if (STREAM_SEEK(file, vcp->diskSize, 0) != -1) {
-       int bitNumber = 0;
-       for (bitNumber = 0; bitNumber < nVnodes + 100; bitNumber++) {
-           if (STREAM_READ(vnode, vcp->diskSize, 1, file) != 1)
-               break;
-           if (vnode->type != vNull) {
-               if (vnode->vnodeMagic != vcp->magic) {
-                   Log("GetBitmap: addled vnode index in volume %s; volume needs salvage\n", V_name(vp));
-                   *ec = VSALVAGE;
-                   break;
-               }
-#ifdef BITMAP_LATER
-               *(BitMap + (bitNumber >> 3)) |= (1 << (bitNumber & 0x7));
-#else /* BITMAP_LATER */
-               *(vip->bitmap + (bitNumber >> 3)) |= (1 << (bitNumber & 0x7));
-#endif /* BITMAP_LATER */
-               if (unique <= vnode->uniquifier)
-                   unique = vnode->uniquifier + 1;
-           }
-#ifndef AFS_PTHREAD_ENV
-           if ((bitNumber & 0x00ff) == 0x0ff) {        /* every 256 iterations */
-               IOMGR_Poll();
-           }
-#endif /* !AFS_PTHREAD_ENV */
+    if (vp->shuttingDown) {
+       ret = 1;
+       VReleaseVolumeHandles_r(vp);
+       VCheckSalvage(vp);
+       ReallyFreeVolume(vp);
+       if (programType == fileServer) {
+           assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
         }
      }
-    if (vp->nextVnodeUnique < unique) {
-       Log("GetBitmap: bad volume uniquifier for volume %s; volume needs salvage\n", V_name(vp));
-       *ec = VSALVAGE;
+    return ret;
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+static int
+VCheckDetach(register Volume * vp)
+{
+    int ret = 0;
+
+    if (vp->nUsers)
+       return ret;
+
+    if (vp->shuttingDown) {
+       ret = 1;
+       VReleaseVolumeHandles_r(vp);
+       ReallyFreeVolume(vp);
+       if (programType == fileServer) {
+#if defined(AFS_PTHREAD_ENV)
+           assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
+#else /* AFS_PTHREAD_ENV */
+           LWP_NoYieldSignal(VPutVolume);
+#endif /* AFS_PTHREAD_ENV */
+       }
      }
-    /* Paranoia, partly justified--I think fclose after fdopen
-     * doesn't seem to close fd.  In any event, the documentation
-     * doesn't specify, so it's safer to close it twice.
-     */
-    STREAM_CLOSE(file);
-    FDH_CLOSE(fdP);
-    free(vnode);
-#ifdef BITMAP_LATER
-    /* There may have been a racing condition with some other thread, both
-     * creating the bitmaps for this volume. If the other thread was faster
-     * the pointer to bitmap should already be filled and we can free ours.
-     */
-    if (vip->bitmap == NULL) {
-       vip->bitmap = BitMap;
-       vip->bitmapOffset = 0;
-    } else
-       free((byte *) BitMap);
-#endif /* BITMAP_LATER */
+    return ret;
  }
+#endif /* AFS_DEMAND_ATTACH_FS */
  
-static void
-GetVolumePath(Error * ec, VolId volumeId, char **partitionp, char **namep)
+/* check to see if we should offline this volume
+ * return 1 if volume went offline, 0 otherwise */
+#ifdef AFS_DEMAND_ATTACH_FS
+static int
+VCheckOffline(register Volume * vp)
  {
-    static char partition[VMAXPATHLEN], name[VMAXPATHLEN];
-    char path[VMAXPATHLEN];
-    int found = 0;
-    struct DiskPartition *dp;
+    Volume * rvp = NULL;
+    int ret = 0;
  
-    *ec = 0;
-    name[0] = '/';
-    (void)afs_snprintf(&name[1], (sizeof name) - 1, VFORMAT, volumeId);
-    for (dp = DiskPartitionList; dp; dp = dp->next) {
-       struct afs_stat status;
-       strcpy(path, VPartitionPath(dp));
-       strcat(path, name);
-       if (afs_stat(path, &status) == 0) {
-           strcpy(partition, dp->name);
-           found = 1;
-           break;
+    if (vp->goingOffline && !vp->nUsers) {
+       Error error;
+       assert(programType == fileServer);
+       assert((V_attachState(vp) != VOL_STATE_ATTACHED) &&
+              (V_attachState(vp) != VOL_STATE_FREED) &&
+              (V_attachState(vp) != VOL_STATE_PREATTACHED) &&
+              (V_attachState(vp) != VOL_STATE_UNATTACHED));
+
+       /* valid states:
+        *
+        * VOL_STATE_GOING_OFFLINE
+        * VOL_STATE_SHUTTING_DOWN
+        * IsErrorState(V_attachState(vp))
+        * IsExclusiveState(V_attachState(vp))
+        */
+
+       VCreateReservation_r(vp);
+       VChangeState_r(vp, VOL_STATE_OFFLINING);
+
+       ret = 1;
+       /* must clear the goingOffline flag before we drop the glock */
+       vp->goingOffline = 0;
+       V_inUse(vp) = 0;
+
+       VLRU_Delete_r(vp);
+
+       /* perform async operations */
+       VUpdateVolume_r(&error, vp, 0);
+       VCloseVolumeHandles_r(vp);
+
+       /* invalidate the volume header cache entry */
+       FreeVolumeHeader(vp);
+
+       if (LogLevel) {
+           Log("VOffline: Volume %u (%s) is now offline", V_id(vp),
+               V_name(vp));
+           if (V_offlineMessage(vp)[0])
+               Log(" (%s)", V_offlineMessage(vp));
+           Log("\n");
+       }
+
+       /* if nothing changed state to error or salvaging,
+        * drop state to unattached */
+       if (!IsErrorState(V_attachState(vp))) {
+           VChangeState_r(vp, VOL_STATE_UNATTACHED);
         }
+       VCancelReservation_r(vp);
      }
-    if (!found) {
-       *ec = VNOVOL;
-       *partitionp = *namep = NULL;
-    } else {
-       *partitionp = partition;
-       *namep = name;
+    return ret;
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+static int
+VCheckOffline(register Volume * vp)
+{
+    Volume * rvp = NULL;
+    int ret = 0;
+
+    if (vp->goingOffline && !vp->nUsers) {
+       Error error;
+       assert(programType == fileServer);
+
+       ret = 1;
+       vp->goingOffline = 0;
+       V_inUse(vp) = 0;
+       VUpdateVolume_r(&error, vp, 0);
+       VCloseVolumeHandles_r(vp);
+       FreeVolumeHeader(vp);
+       if (LogLevel) {
+           Log("VOffline: Volume %u (%s) is now offline", V_id(vp),
+               V_name(vp));
+           if (V_offlineMessage(vp)[0])
+               Log(" (%s)", V_offlineMessage(vp));
+           Log("\n");
+       }
+#ifdef AFS_PTHREAD_ENV
+       assert(pthread_cond_broadcast(&vol_put_volume_cond) == 0);
+#else /* AFS_PTHREAD_ENV */
+       LWP_NoYieldSignal(VPutVolume);
+#endif /* AFS_PTHREAD_ENV */
      }
+    return ret;
  }
+#endif /* AFS_DEMAND_ATTACH_FS */
  
-int
-VolumeNumber(char *name)
+/***************************************************/
+/* demand attach fs ref counting routines          */
+/***************************************************/
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* the following two functions handle reference counting for
+ * asynchronous operations on volume structs.
+ *
+ * their purpose is to prevent a VDetachVolume or VShutdown
+ * from free()ing the Volume struct during an async i/o op */
+
+/* register with the async volume op ref counter */
+static void
+VCreateReservation_r(Volume * vp)
  {
-    if (*name == '/')
-       name++;
-    return atoi(name + 1);
+    vp->nWaiters++;
  }
  
-char *
-VolumeExternalName(VolumeId volumeId)
+/* unregister with the async volume op ref counter */
+static void
+VCancelReservation_r(Volume * vp)
  {
-    static char name[VMAXPATHLEN];
-    (void)afs_snprintf(name, sizeof name, VFORMAT, volumeId);
-    return name;
+    assert(--vp->nWaiters >= 0);
+    if (vp->nWaiters == 0) {
+       VCheckOffline(vp);
+       if (!VCheckDetach(vp)) {
+           VCheckSalvage(vp);
+           VCheckFree(vp);
+       }
+    }
  }
  
-#if OPENAFS_VOL_STATS
-#define OneDay (86400)         /* 24 hours' worth of seconds */
-#else
-#define OneDay (24*60*60)      /* 24 hours */
-#endif /* OPENAFS_VOL_STATS */
+/* check to see if we should free this volume now
+ * return 1 if volume was freed, 0 otherwise */
+static int
+VCheckFree(Volume * vp)
+{
+    int ret = 0;
+    if ((vp->nUsers == 0) &&
+       (vp->nWaiters == 0) &&
+       !(V_attachFlags(vp) & (VOL_IN_HASH | 
+                              VOL_ON_VBYP_LIST | 
+                              VOL_IS_BUSY |
+                              VOL_ON_VLRU))) {
+       ReallyFreeVolume(vp);
+       ret = 1;
+    }
+    return ret;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
  
-#define Midnight(date) ((date-TimeZoneCorrection)/OneDay*OneDay+TimeZoneCorrection)
  
-/*------------------------------------------------------------------------
- * [export] VAdjustVolumeStatistics
- *
- * Description:
- *     If we've passed midnight, we need to update all the day use
- *     statistics as well as zeroing the detailed volume statistics
- *     (if we are implementing them).
- *
- * Arguments:
- *     vp : Pointer to the volume structure describing the lucky
- *             volume being considered for update.
- *
- * Returns:
- *     0 (always!)
- *
- * Environment:
- *     Nothing interesting.
- *
- * Side Effects:
- *     As described.
- *------------------------------------------------------------------------*/
+/***************************************************/
+/* online volume operations routines               */
+/***************************************************/
  
+#ifdef AFS_DEMAND_ATTACH_FS
  int
-VAdjustVolumeStatistics_r(register Volume * vp)
+VRegisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
  {
-    unsigned int now = FT_ApproxTime();
-
-    if (now - V_dayUseDate(vp) > OneDay) {
-       register ndays, i;
-
-       ndays = (now - V_dayUseDate(vp)) / OneDay;
-       for (i = 6; i > ndays - 1; i--)
-           V_weekUse(vp)[i] = V_weekUse(vp)[i - ndays];
-       for (i = 0; i < ndays - 1 && i < 7; i++)
-           V_weekUse(vp)[i] = 0;
-       if (ndays <= 7)
-           V_weekUse(vp)[ndays - 1] = V_dayUse(vp);
-       V_dayUse(vp) = 0;
-       V_dayUseDate(vp) = Midnight(now);
+    FSSYNC_VolOp_info * info;
  
-#if OPENAFS_VOL_STATS
-       /*
-        * All we need to do is bzero the entire VOL_STATS_BYTES of
-        * the detailed volume statistics area.
-        */
-       memset((char *)(V_stat_area(vp)), 0, VOL_STATS_BYTES);
-#endif /* OPENAFS_VOL_STATS */
-    }
+    /* attach a vol op info node to the volume struct */
+    info = (FSSYNC_VolOp_info *) malloc(sizeof(FSSYNC_VolOp_info));
+    assert(info != NULL);
+    memcpy(info, vopinfo, sizeof(FSSYNC_VolOp_info));
+    vp->pending_vol_op = info;
  
-    /*It's been more than a day of collection */
-    /*
-     * Always return happily.
-     */
-    return (0);
-}                              /*VAdjustVolumeStatistics */
+    /* update stats */
+    vp->stats.last_vol_op = FT_ApproxTime();
+    vp->stats.vol_ops++;
+    IncUInt64(&VStats.vol_ops);
  
-int
-VAdjustVolumeStatistics(register Volume * vp)
-{
-    int retVal;
-    VOL_LOCK;
-    retVal = VAdjustVolumeStatistics_r(vp);
-    VOL_UNLOCK;
-    return retVal;
+    return 0;
  }
  
-void
-VBumpVolumeUsage_r(register Volume * vp)
+int
+VDeregisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
  {
-    unsigned int now = FT_ApproxTime();
-    if (now - V_dayUseDate(vp) > OneDay)
-       VAdjustVolumeStatistics_r(vp);
-    /*
-     * Save the volume header image to disk after every 128 bumps to dayUse.
-     */
-    if ((V_dayUse(vp)++ & 127) == 0) {
-       Error error;
-       VUpdateVolume_r(&error, vp);
+    if (vp->pending_vol_op) {
+       free(vp->pending_vol_op);
+       vp->pending_vol_op = NULL;
      }
+    return 0;
  }
+#endif /* AFS_DEMAND_ATTACH_FS */
  
-void
-VBumpVolumeUsage(register Volume * vp)
+int
+VVolOpLeaveOnline_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
  {
-    VOL_LOCK;
-    VBumpVolumeUsage_r(vp);
-    VOL_UNLOCK;
+    return (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&
+           (vopinfo->com.reason == V_READONLY ||
+            (!VolumeWriteable(vp) &&
+             (vopinfo->com.reason == V_CLONE ||
+              vopinfo->com.reason == V_DUMP))));
  }
  
-void
-VSetDiskUsage_r(void)
+int
+VVolOpSetVBusy_r(Volume * vp, FSSYNC_VolOp_info * vopinfo)
  {
-    static int FifteenMinuteCounter = 0;
+    return (vopinfo->com.command == FSYNC_VOL_NEEDVOLUME &&
+           (vopinfo->com.reason == V_CLONE ||
+            vopinfo->com.reason == V_DUMP));
+}
  
-    while (VInit < 2) {
-       /* NOTE: Don't attempt to access the partitions list until the
-        * initialization level indicates that all volumes are attached,
-        * which implies that all partitions are initialized. */
-#ifdef AFS_PTHREAD_ENV
-       sleep(10);
-#else /* AFS_PTHREAD_ENV */
-       IOMGR_Sleep(10);
-#endif /* AFS_PTHREAD_ENV */
-    }
  
-    VResetDiskUsage_r();
-    if (++FifteenMinuteCounter == 3) {
-       FifteenMinuteCounter = 0;
-       VScanUpdateList();
+/***************************************************/
+/* online salvager routines                        */
+/***************************************************/
+#if defined(AFS_DEMAND_ATTACH_FS)
+#define SALVAGE_PRIO_UPDATE_INTERVAL 3      /* number of seconds between prio updates */
+#define SALVAGE_COUNT_MAX 16                /* number of online salvages we
+                                            * allow before moving the volume
+                                            * into a permanent error state
+                                            *
+                                            * once this threshold is reached,
+                                            * the operator will have to manually
+                                            * issue a 'bos salvage' to bring
+                                            * the volume back online
+                                            */
+
+/* check to see if we should salvage this volume
+ * returns 1 if salvage scheduled, 0 otherwise */
+static int
+VCheckSalvage(register Volume * vp)
+{
+    int ret = 0;
+#ifdef SALVSYNC_BUILD_CLIENT
+    if (vp->nUsers || vp->nWaiters)
+       return ret;
+    if (vp->salvage.requested) {
+       VScheduleSalvage_r(vp);
+       ret = 1;
      }
+#endif /* SALVSYNC_BUILD_CLIENT */
+    return ret;
  }
  
-void
-VSetDiskUsage(void)
+/*
+ * request that a salvage be performed once
+ * ref counts reach zero
+ */
+int
+VRequestSalvage_r(Volume * vp, int reason, int flags)
  {
-    VOL_LOCK;
-    VSetDiskUsage_r();
-    VOL_UNLOCK;
+#ifdef SALVSYNC_BUILD_CLIENT
+    if (programType != fileServer)
+       return 1;
+
+    if (!vp->salvage.requested) {
+       vp->salvage.requested = 1;
+       vp->salvage.reason = reason;
+       vp->stats.last_salvage = FT_ApproxTime();
+       if (flags & VOL_SALVAGE_INVALIDATE_HEADER) {
+           ReleaseVolumeHeader(vp->header);
+       }
+       if (vp->stats.salvages < SALVAGE_COUNT_MAX) {
+           VChangeState_r(vp, VOL_STATE_SALVAGING);
+       } else {
+           Log("VRequestSalvage: volume %u online salvaged too many times; forced offline.\n", vp->hashid);
+           VChangeState_r(vp, VOL_STATE_ERROR);
+       }
+    }
+#endif /* SALVSYNC_BUILD_CLIENT */
+    return 0;
  }
  
-/* The number of minutes that a volume hasn't been updated before the
- * "Dont salvage" flag in the volume header will be turned on */
-#define SALVAGE_INTERVAL       (10*60)
+/*
+ * update salvage priority
+ */
+static int
+VUpdateSalvagePriority_r(Volume * vp)
+{
+    int code, ret=0;
+    afs_uint32 now;
  
-static VolumeId *UpdateList;   /* Pointer to array of Volume ID's */
-static int nUpdatedVolumes;    /* Updated with entry in UpdateList, salvage after crash flag on */
-static int updateSize;         /* number of entries possible */
-#define UPDATE_LIST_SIZE 100   /* size increment */
+#ifdef SALVSYNC_BUILD_CLIENT
+    vp->salvage.prio++;
+    now = FT_ApproxTime();
  
-void
-VAddToVolumeUpdateList_r(Error * ec, Volume * vp)
-{
-    *ec = 0;
-    vp->updateTime = FT_ApproxTime();
-    if (V_dontSalvage(vp) == 0)
-       return;
-    V_dontSalvage(vp) = 0;
-    VSyncVolume_r(ec, vp);
-    if (*ec)
-       return;
-    if (!UpdateList) {
-       updateSize = UPDATE_LIST_SIZE;
-       UpdateList = (VolumeId *) malloc(sizeof(VolumeId) * updateSize);
-    } else {
-       if (nUpdatedVolumes == updateSize) {
-           updateSize += UPDATE_LIST_SIZE;
-           UpdateList =
-               (VolumeId *) realloc(UpdateList,
-                                    sizeof(VolumeId) * updateSize);
+    /* update the salvageserver priority queue occasionally so that
+     * frequently requested volumes get moved to the head of the queue 
+     */
+    if ((vp->salvage.scheduled) &&
+       (vp->stats.last_salvage_req < (now-SALVAGE_PRIO_UPDATE_INTERVAL))) {
+       code = SALVSYNC_SalvageVolume(vp->hashid,
+                                     VPartitionPath(vp->partition),
+                                     SALVSYNC_RAISEPRIO,
+                                     vp->salvage.reason,
+                                     vp->salvage.prio,
+                                     NULL);
+       vp->stats.last_salvage_req = now;
+       if (code != SYNC_OK) {
+           ret = 1;
         }
      }
-    assert(UpdateList != NULL);
-    UpdateList[nUpdatedVolumes++] = V_id(vp);
+#endif /* SALVSYNC_BUILD_CLIENT */
+    return ret;
  }
  
-static void
-VScanUpdateList(void)
+
+/*
+ * schedule a salvage with the salvage server
+ */
+static int
+VScheduleSalvage_r(Volume * vp)
  {
-    register int i, gap;
-    register Volume *vp;
-    Error error;
+    int code, ret=0;
+#ifdef SALVSYNC_BUILD_CLIENT
+    VolState state_save;
+    char partName[16];
+
+    if (vp->nWaiters || vp->nUsers) {
+       return 1;
+    }
+
+    /* prevent endless salvage,attach,salvage,attach,... loops */
+    if (vp->stats.salvages >= SALVAGE_COUNT_MAX)
+       return 1;
+
+    if (!vp->salvage.scheduled) {
+       /* if we haven't previously scheduled a salvage, do so now 
+        *
+        * set the volume to an exclusive state and drop the lock
+        * around the SALVSYNC call
+        */
+       strlcpy(partName, VPartitionPath(vp->partition), sizeof(partName));
+       state_save = VChangeState_r(vp, VOL_STATE_SALVSYNC_REQ);
+       V_attachFlags(vp) |= VOL_IS_BUSY;
+       VOL_UNLOCK;
+
+       /* can't use V_id() since there's no guarantee
+        * we have the disk data header at this point */
+       code = SALVSYNC_SalvageVolume(vp->hashid,
+                                     partName,
+                                     SALVSYNC_SALVAGE,
+                                     vp->salvage.reason,
+                                     vp->salvage.prio,
+                                     NULL);
+       VOL_LOCK;
+       VChangeState_r(vp, state_save);
+       V_attachFlags(vp) &= ~(VOL_IS_BUSY);
+
+       if (code == SYNC_OK) {
+           vp->salvage.scheduled = 1;
+           vp->stats.salvages++;
+           vp->stats.last_salvage_req = FT_ApproxTime();
+           IncUInt64(&VStats.salvages);
+       } else {
+           ret = 1;
+           switch(code) {
+           case SYNC_BAD_COMMAND:
+           case SYNC_COM_ERROR:
+               break;
+           case SYNC_DENIED:
+               Log("VScheduleSalvage_r:  SALVSYNC request denied\n");
+               break;
+           default:
+               Log("VScheduleSalvage_r:  SALVSYNC unknown protocol error\n");
+               break;
+           }
+       }
+    }
+#endif /* SALVSYNC_BUILD_CLIENT */
+    return ret;
+}
+
+/*
+ * ask the salvage server to cancel a previously scheduled salvage
+ *
+ * nothing is sent unless vp->salvage.scheduled is set, and the
+ * scheduled flag is cleared only when the SALVSYNC call returns SYNC_OK
+ *
+ * returns 0 when there was nothing to cancel or the cancel succeeded
+ * (and always when SALVSYNC_BUILD_CLIENT is not defined), 1 otherwise
+ */
+static int
+VCancelSalvage_r(Volume * vp, int reason)
+{
+    int rc, failed = 0;
+
+#ifdef SALVSYNC_BUILD_CLIENT
+    if (vp->salvage.scheduled) {
+       rc = SALVSYNC_SalvageVolume(vp->hashid,
+                                   VPartitionPath(vp->partition),
+                                   SALVSYNC_CANCEL, reason, 0, NULL);
+       if (rc != SYNC_OK) {
+           failed = 1;
+       } else {
+           vp->salvage.scheduled = 0;
+       }
+    }
+#endif /* SALVSYNC_BUILD_CLIENT */
+    return failed;
+}
+
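+/* SALVSYNC client connection management.  These routines set up, tear
+   down and re-establish this process's SALVSYNC client connection.  They
+   must not be called from the salvageserver or from a volume utility (the
+   _r variants assert this).  Connection setup is kept separate from
+   VInitVolumePackage, presumably so that a forked child can initialize
+   its own connection independently -- TODO confirm */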
+#ifdef SALVSYNC_BUILD_CLIENT
+/* locking wrapper: runs VConnectSALV_r between VOL_LOCK and VOL_UNLOCK
+ * and hands its return value back to the caller */
+int
+VConnectSALV(void)
+{
+    int code;
+
+    VOL_LOCK;
+    code = VConnectSALV_r();
+    VOL_UNLOCK;
+
+    return code;
+}
+
+/* initialize the SALVSYNC client.
+ * asserts that this process is neither the salvageserver nor a
+ * volume utility; returns whatever SALVSYNC_clientInit returns */
+int
+VConnectSALV_r(void)
+{
+    int code;
+
+    assert((programType != salvageServer) &&
+          (programType != volumeUtility));
+    code = SALVSYNC_clientInit();
+    return code;
+}
+
+/* locking wrapper: runs VDisconnectSALV_r between VOL_LOCK and
+ * VOL_UNLOCK and returns its result.
+ *
+ * the result of the _r call must be captured here; returning the
+ * local without assigning it would hand the caller an indeterminate
+ * value */
+int
+VDisconnectSALV(void)
+{
+    int retVal;
+    VOL_LOCK;
+    retVal = VDisconnectSALV_r();
+    VOL_UNLOCK;
+    return retVal;
+}
+
+/* shut down the SALVSYNC client.
+ * asserts that this process is neither the salvageserver nor a
+ * volume utility; returns whatever SALVSYNC_clientFinis returns */
+int
+VDisconnectSALV_r(void)
+{
+    int code;
+
+    assert((programType != salvageServer) &&
+          (programType != volumeUtility));
+    code = SALVSYNC_clientFinis();
+    return code;
+}
+
+/* locking wrapper: runs VReconnectSALV_r between VOL_LOCK and
+ * VOL_UNLOCK and hands its return value back to the caller */
+int
+VReconnectSALV(void)
+{
+    int code;
+
+    VOL_LOCK;
+    code = VReconnectSALV_r();
+    VOL_UNLOCK;
+
+    return code;
+}
+
+/* re-establish the SALVSYNC client connection.
+ * asserts that this process is neither the salvageserver nor a
+ * volume utility; returns whatever SALVSYNC_clientReconnect returns */
+int
+VReconnectSALV_r(void)
+{
+    int code;
+
+    assert((programType != salvageServer) &&
+          (programType != volumeUtility));
+    code = SALVSYNC_clientReconnect();
+    return code;
+}
+#endif /* SALVSYNC_BUILD_CLIENT */
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+/***************************************************/
+/* FSSYNC routines                                 */
+/***************************************************/
+
+/* This must be called by any volume utility which needs to run while the
+   file server is also running.  This is separated from VInitVolumePackage so
+   that a utility can fork--and each of the children can independently
+   initialize communication with the file server */
+#ifdef FSSYNC_BUILD_CLIENT
+/* locking wrapper: runs VConnectFS_r between VOL_LOCK and VOL_UNLOCK
+ * and hands its return value back to the caller */
+int
+VConnectFS(void)
+{
+    int code;
+
+    VOL_LOCK;
+    code = VConnectFS_r();
+    VOL_UNLOCK;
+
+    return code;
+}
+
+/* initialize the FSSYNC client.
+ *
+ * asserts that VInit is 2 and that this process is neither the
+ * fileserver nor the salvager.  when FSYNC_clientInit returns nonzero,
+ * VInit is advanced to 3.  the FSYNC_clientInit result is returned
+ * unchanged */
+int
+VConnectFS_r(void)
+{
+    int connected;
+
+    assert((VInit == 2) &&
+          (programType != fileServer) &&
+          (programType != salvager));
+
+    connected = FSYNC_clientInit();
+    if (connected != 0) {
+       VInit = 3;
+    }
+    return connected;
+}
+
+/* shut down the FSSYNC client and drop VInit back to 2.
+ * asserts that this process is neither the fileserver nor the salvager */
+void
+VDisconnectFS_r(void)
+{
+    assert((programType != fileServer) &&
+          (programType != salvager));
+
+    FSYNC_clientFinis();
+
+    /* undo the VInit = 3 transition made by a successful VConnectFS_r */
+    VInit = 2;
+}
+
+/* locking wrapper: runs VDisconnectFS_r between VOL_LOCK and VOL_UNLOCK */
+void
+VDisconnectFS(void)
+{
+    VOL_LOCK;
+
+    VDisconnectFS_r();
+
+    VOL_UNLOCK;
+}
+
+/* thin pass-through to FSYNC_clientChildProcReconnect; its return
+ * value is handed back unchanged */
+static int
+VChildProcReconnectFS_r(void)
+{
+    int code = FSYNC_clientChildProcReconnect();
+
+    return code;
+}
+
+/* locking wrapper: runs VChildProcReconnectFS_r between VOL_LOCK and
+ * VOL_UNLOCK and hands its return value back to the caller */
+int
+VChildProcReconnectFS(void)
+{
+    int code;
+
+    VOL_LOCK;
+    code = VChildProcReconnectFS_r();
+    VOL_UNLOCK;
+
+    return code;
+}
+#endif /* FSSYNC_BUILD_CLIENT */
+
+
+/***************************************************/
+/* volume bitmap routines                          */
+/***************************************************/
+
+/*
+ * allocate a free bit in a vnode index bitmap and return its bit number
+ *
+ * ec    -- set to 0 on success; VREADONLY if the volume is not writeable;
+ *          under BITMAP_LATER, set by VGetBitmap_r on failure (replaced
+ *          by VSALVAGING for demand attach fs)
+ * vp    -- volume that owns the index
+ * index -- vnode index whose bitmap is searched (and grown if full)
+ * flags -- demand attach fs only, see below
+ *
+ * returns the allocated bit number; 0 is returned on every error path,
+ * so callers should test *ec rather than the return value
+ *
+ * For demand attach fs, flags parameter controls
+ * locking behavior.  If (flags & VOL_ALLOC_BITMAP_WAIT)
+ * is set, then this function will create a reservation
+ * and block on any other exclusive operations.  Otherwise,
+ * this function assumes the caller already has exclusive
+ * access to vp, and we just change the volume state.
+ *
+ * For demand attach fs the glock is dropped while the bitmap is
+ * searched/grown and re-taken before returning.
+ */
+VnodeId
+VAllocBitmapEntry_r(Error * ec, Volume * vp, 
+                   struct vnodeIndex *index, int flags)
+{
+    VnodeId ret;
+    register byte *bp, *ep;
+#ifdef AFS_DEMAND_ATTACH_FS
+    VolState state_save;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    *ec = 0;
+
+    /* This test is probably redundant */
+    if (!VolumeWriteable(vp)) {
+       *ec = (bit32) VREADONLY;
+       return 0;
+    }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    if (flags & VOL_ALLOC_BITMAP_WAIT) {
+       VCreateReservation_r(vp);
+       VWaitExclusiveState_r(vp);
+    }
+    state_save = VChangeState_r(vp, VOL_STATE_GET_BITMAP);
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+#ifdef BITMAP_LATER
+    /* the fileserver may not have loaded the bitmaps yet; load them now */
+    if ((programType == fileServer) && !index->bitmap) {
+       int i;
+#ifndef AFS_DEMAND_ATTACH_FS
+       /* demand attach fs uses the volume state to avoid races.
+        * specialStatus field is not used at all */
+       int wasVBUSY = 0;
+       if (vp->specialStatus == VBUSY) {
+           if (vp->goingOffline) {     /* vos dump waiting for the volume to
+                                        * go offline. We probably come here
+                                        * from AddNewReadableResidency */
+               wasVBUSY = 1;
+           } else {
+               /* somebody else holds the volume busy; poll until they finish */
+               while (vp->specialStatus == VBUSY) {
+#ifdef AFS_PTHREAD_ENV
+                   VOL_UNLOCK;
+                   sleep(2);
+                   VOL_LOCK;
+#else /* AFS_PTHREAD_ENV */
+                   IOMGR_Sleep(2);
+#endif /* AFS_PTHREAD_ENV */
+               }
+           }
+       }
+#endif /* !AFS_DEMAND_ATTACH_FS */
+
+       /* re-check: the bitmap may have appeared while we were waiting */
+       if (!index->bitmap) {
+#ifndef AFS_DEMAND_ATTACH_FS
+           vp->specialStatus = VBUSY;  /* Stop anyone else from using it. */
+#endif /* !AFS_DEMAND_ATTACH_FS */
+           for (i = 0; i < nVNODECLASSES; i++) {
+               VGetBitmap_r(ec, vp, i);
+               if (*ec) {
+#ifdef AFS_DEMAND_ATTACH_FS
+                   VRequestSalvage_r(vp, SALVSYNC_ERROR, VOL_SALVAGE_INVALIDATE_HEADER);
+                   *ec = VSALVAGING;
+#else /* AFS_DEMAND_ATTACH_FS */
+                   DeleteVolumeFromHashTable(vp);
+                   vp->shuttingDown = 1;       /* Let who has it free it. */
+                   vp->specialStatus = 0;
+#endif /* AFS_DEMAND_ATTACH_FS */
+                   /* VnodeId is an integer type; 0 is the error return */
+                   ret = 0;
+                   goto done;
+               }
+           }
+#ifndef AFS_DEMAND_ATTACH_FS
+           if (!wasVBUSY)
+               vp->specialStatus = 0;  /* Allow others to have access. */
+#endif /* !AFS_DEMAND_ATTACH_FS */
+       }
+    }
+#endif /* BITMAP_LATER */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    VOL_UNLOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
+    /* scan a 32-bit word at a time, starting at the lowest offset that
+     * may still contain a free bit */
+    bp = index->bitmap + index->bitmapOffset;
+    ep = index->bitmap + index->bitmapSize;
+    while (bp < ep) {
+       if ((*(bit32 *) bp) != (bit32) 0xffffffff) {
+           int o;
+           index->bitmapOffset = (afs_uint32) (bp - index->bitmap);
+           /* find the first byte of this word with a clear bit */
+           while (*bp == 0xff)
+               bp++;
+           o = ffs(~*bp) - 1;  /* ffs is documented in BSTRING(3) */
+           *bp |= (1 << o);
+           ret = (VnodeId) ((bp - index->bitmap) * 8 + o);
+#ifdef AFS_DEMAND_ATTACH_FS
+           VOL_LOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
+           goto done;
+       }
+       bp += sizeof(bit32) /* i.e. 4 */ ;
+    }
+    /* No bit map entry--must grow bitmap */
+    bp = (byte *)
+       realloc(index->bitmap, index->bitmapSize + VOLUME_BITMAP_GROWSIZE);
+    assert(bp != NULL);
+    index->bitmap = bp;
+    bp += index->bitmapSize;
+    memset(bp, 0, VOLUME_BITMAP_GROWSIZE);
+    index->bitmapOffset = index->bitmapSize;
+    index->bitmapSize += VOLUME_BITMAP_GROWSIZE;
+    /* take the first bit of the newly added region */
+    *bp = 1;
+    ret = index->bitmapOffset * 8;
+#ifdef AFS_DEMAND_ATTACH_FS
+    VOL_LOCK;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+ done:
+#ifdef AFS_DEMAND_ATTACH_FS
+    VChangeState_r(vp, state_save);
+    if (flags & VOL_ALLOC_BITMAP_WAIT) {
+       VCancelReservation_r(vp);
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+    return ret;
+}
+
+/* locking wrapper: calls VAllocBitmapEntry_r with VOL_ALLOC_BITMAP_WAIT
+ * between VOL_LOCK and VOL_UNLOCK and returns the allocated entry */
+VnodeId
+VAllocBitmapEntry(Error * ec, Volume * vp, register struct vnodeIndex * index)
+{
+    VnodeId entry;
+
+    VOL_LOCK;
+    entry = VAllocBitmapEntry_r(ec, vp, index, VOL_ALLOC_BITMAP_WAIT);
+    VOL_UNLOCK;
+
+    return entry;
+}
+
+/* clear one bit in a vnode index bitmap
+ *
+ * *ec is set to 0, or to VNOVNODE when bitNumber lies beyond the end of
+ * the bitmap.  under BITMAP_LATER a missing bitmap is silently ignored.
+ * bitmapOffset is pulled back when the freed bit lies below it */
+void
+VFreeBitMapEntry_r(Error * ec, register struct vnodeIndex *index,
+                  unsigned bitNumber)
+{
+    unsigned int byteIdx = bitNumber >> 3;     /* byte holding the bit */
+
+    *ec = 0;
+#ifdef BITMAP_LATER
+    if (index->bitmap == NULL)
+       return;
+#endif /* BITMAP_LATER */
+
+    if (byteIdx >= index->bitmapSize) {
+       *ec = VNOVNODE;
+       return;
+    }
+
+    if (byteIdx < index->bitmapOffset) {
+       /* Truncate to nearest bit32 */
+       index->bitmapOffset = byteIdx & ~3;
+    }
+    index->bitmap[byteIdx] &= ~(1 << (bitNumber & 0x7));
+}
+
+/* locking wrapper: runs VFreeBitMapEntry_r between VOL_LOCK and VOL_UNLOCK */
+void
+VFreeBitMapEntry(Error * ec, register struct vnodeIndex *index,
+                unsigned bitNumber)
+{
+    VOL_LOCK;
+
+    VFreeBitMapEntry_r(ec, index, bitNumber);
+
+    VOL_UNLOCK;
+}
+
+/* build the in-memory allocation bitmap for one vnode class of a volume
+ * by reading that class's vnode index file.
+ *
+ * ec    -- set to 0, or to VSALVAGE when a vnode with a bad magic number
+ *          is found or the volume's nextVnodeUnique is smaller than a
+ *          uniquifier seen in the index
+ * vp    -- volume whose index is read
+ * class -- vnode class selecting both VnodeClassInfo[] and vp->vnodeIndex[]
+ *
+ * this function will drop the glock internally.
+ * for old pthread fileservers, this is safe thanks to vbusy.
+ *
+ * for demand attach fs, caller must have already called
+ * VCreateReservation_r and VWaitExclusiveState_r */
+static void
+VGetBitmap_r(Error * ec, Volume * vp, VnodeClass class)
+{
+    StreamHandle_t *file;
+    int nVnodes;
+    int size;
+    struct VnodeClassInfo *vcp = &VnodeClassInfo[class];
+    struct vnodeIndex *vip = &vp->vnodeIndex[class];
+    struct VnodeDiskObject *vnode;
+    unsigned int unique = 0;
+    FdHandle_t *fdP;
+#ifdef BITMAP_LATER
+    byte *BitMap = 0;
+#endif /* BITMAP_LATER */
+#ifdef AFS_DEMAND_ATTACH_FS
+    VolState state_save;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    *ec = 0;
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    state_save = VChangeState_r(vp, VOL_STATE_GET_BITMAP);
+#endif /* AFS_DEMAND_ATTACH_FS */
+    /* the index file is read without the glock held */
+    VOL_UNLOCK;
+
+    fdP = IH_OPEN(vip->handle);
+    assert(fdP != NULL);
+    file = FDH_FDOPEN(fdP, "r");
+    assert(file != NULL);
+    vnode = (VnodeDiskObject *) malloc(vcp->diskSize);
+    assert(vnode != NULL);
+    size = OS_SIZE(fdP->fd_fd);
+    assert(size != -1);
+    /* the first diskSize bytes of the file are skipped (see the seek below);
+     * the remainder is converted to a vnode count by shifting by logSize */
+    nVnodes = (size <= vcp->diskSize ? 0 : size - vcp->diskSize)
+       >> vcp->logSize;
+    vip->bitmapSize = ((nVnodes / 8) + 10) / 4 * 4;    /* The 10 is a little extra so
+                                                        * a few files can be created in this volume;
+                                                        * the integer divide/multiply then truncates
+                                                        * the total to a multiple of 4 bytes, because
+                                                        * the bit map allocator scans 4 bytes at a time */
+#ifdef BITMAP_LATER
+    /* build into a private buffer; it is installed (or discarded) below,
+     * after the glock has been re-taken */
+    BitMap = (byte *) calloc(1, vip->bitmapSize);
+    assert(BitMap != NULL);
+#else /* BITMAP_LATER */
+    vip->bitmap = (byte *) calloc(1, vip->bitmapSize);
+    assert(vip->bitmap != NULL);
+    vip->bitmapOffset = 0;
+#endif /* BITMAP_LATER */
+    if (STREAM_SEEK(file, vcp->diskSize, 0) != -1) {
+       int bitNumber = 0;
+       /* one bit per vnode slot; the loop ends at the first short read */
+       for (bitNumber = 0; bitNumber < nVnodes + 100; bitNumber++) {
+           if (STREAM_READ(vnode, vcp->diskSize, 1, file) != 1)
+               break;
+           if (vnode->type != vNull) {
+               if (vnode->vnodeMagic != vcp->magic) {
+                   Log("GetBitmap: addled vnode index in volume %s; volume needs salvage\n", V_name(vp));
+                   *ec = VSALVAGE;
+                   break;
+               }
+               /* slot is in use: mark it allocated */
+#ifdef BITMAP_LATER
+               *(BitMap + (bitNumber >> 3)) |= (1 << (bitNumber & 0x7));
+#else /* BITMAP_LATER */
+               *(vip->bitmap + (bitNumber >> 3)) |= (1 << (bitNumber & 0x7));
+#endif /* BITMAP_LATER */
+               /* track one more than the largest uniquifier seen */
+               if (unique <= vnode->uniquifier)
+                   unique = vnode->uniquifier + 1;
+           }
+#ifndef AFS_PTHREAD_ENV
+           if ((bitNumber & 0x00ff) == 0x0ff) {        /* every 256 iterations */
+               IOMGR_Poll();
+           }
+#endif /* !AFS_PTHREAD_ENV */
+       }
+    }
+    /* the volume's next uniquifier must not be below any value in use */
+    if (vp->nextVnodeUnique < unique) {
+       Log("GetBitmap: bad volume uniquifier for volume %s; volume needs salvage\n", V_name(vp));
+       *ec = VSALVAGE;
+    }
+    /* Paranoia, partly justified--I think fclose after fdopen
+     * doesn't seem to close fd.  In any event, the documentation
+     * doesn't specify, so it's safer to close it twice.
+     */
+    STREAM_CLOSE(file);
+    FDH_CLOSE(fdP);
+    free(vnode);
+
+    VOL_LOCK;
+#ifdef BITMAP_LATER
+    /* There may have been a racing condition with some other thread, both
+     * creating the bitmaps for this volume. If the other thread was faster
+     * the pointer to bitmap should already be filled and we can free ours.
+     */
+    if (vip->bitmap == NULL) {
+       vip->bitmap = BitMap;
+       vip->bitmapOffset = 0;
+    } else
+       free((byte *) BitMap);
+#endif /* BITMAP_LATER */
+#ifdef AFS_DEMAND_ATTACH_FS
+    VChangeState_r(vp, state_save);
+#endif /* AFS_DEMAND_ATTACH_FS */
+}
+
+
+/***************************************************/
+/* demand attach fs state machine routines         */
+/***************************************************/
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* block on the volume's attach CV until V_attachState(vp) differs from
+ * the state observed on entry.
+ *
+ * asserts that the caller holds a reference (nWaiters or nUsers nonzero)
+ * and that the volume has not ended up in VOL_STATE_FREED.
+ *
+ * NOTE(review): the pthread_cond_wait call lives inside assert(), so this
+ * relies on assert() never being compiled out -- confirm */
+static void
+VWaitStateChange_r(Volume * vp)
+{
+    VolState entry_state = V_attachState(vp);
+
+    assert(vp->nWaiters || vp->nUsers);
+
+    /* always wait at least once, then keep waiting across wakeups
+     * that did not change the state */
+    for (;;) {
+       assert(pthread_cond_wait(&V_attachCV(vp), &vol_glock_mutex) == 0);
+       if (V_attachState(vp) != entry_state)
+           break;
+    }
+
+    assert(V_attachState(vp) != VOL_STATE_FREED);
+}
+
+/* block on the volume's attach CV for as long as the volume is in a
+ * state for which IsExclusiveState returns true; returns immediately
+ * when it is not.
+ *
+ * asserts that the caller holds a reference (nWaiters or nUsers nonzero)
+ * and that the volume has not ended up in VOL_STATE_FREED.
+ *
+ * NOTE(review): the pthread_cond_wait call lives inside assert(), so this
+ * relies on assert() never being compiled out -- confirm */
+static void
+VWaitExclusiveState_r(Volume * vp)
+{
+    assert(vp->nWaiters || vp->nUsers);
+
+    for (;;) {
+       if (!IsExclusiveState(V_attachState(vp)))
+           break;
+       assert(pthread_cond_wait(&V_attachCV(vp), &vol_glock_mutex) == 0);
+    }
+
+    assert(V_attachState(vp) != VOL_STATE_FREED);
+}
+
+/* move a volume to new_state, keep the per-state population counters
+ * in VStats.state_levels in step, and broadcast on the volume's attach
+ * CV so that waiters re-examine the state.
+ *
+ * returns the state the volume was in before the call */
+VolState
+VChangeState_r(Volume * vp, VolState new_state)
+{
+    VolState prev = V_attachState(vp);
+
+    /* XXX profiling need to make sure these counters
+     * don't kill performance... */
+    VStats.state_levels[prev]--;
+    VStats.state_levels[new_state]++;
+
+    V_attachState(vp) = new_state;
+
+    assert(pthread_cond_broadcast(&V_attachCV(vp)) == 0);
+
+    return prev;
+}
+
+/* returns 1 when the given state is one that requires
+ * exclusive access without holding glock, 0 for every other state */
+static int
+IsExclusiveState(VolState state)
+{
+    int exclusive;
+
+    switch (state) {
+    case VOL_STATE_UPDATING:
+    case VOL_STATE_ATTACHING:
+    case VOL_STATE_GET_BITMAP:
+    case VOL_STATE_HDR_LOADING:
+    case VOL_STATE_HDR_ATTACHING:
+    case VOL_STATE_OFFLINING:
+    case VOL_STATE_DETACHING:
+       exclusive = 1;
+       break;
+    default:
+       exclusive = 0;
+       break;
+    }
+    return exclusive;
+}
+
+/* tell caller whether V_attachState is an error condition */
+static int
+IsErrorState(VolState state)
+{
+    switch (state) {
+    case VOL_STATE_ERROR:
+    case VOL_STATE_SALVAGING:
+       return 1;
+    }
+    return 0;
+}
+
+/* returns 1 when the given state lies in [0, VOL_STATE_COUNT) and is
+ * not VOL_STATE_FREED, 0 otherwise */
+static int
+IsValidState(VolState state)
+{
+    if (state < 0)
+       return 0;
+    if (state >= VOL_STATE_COUNT)
+       return 0;
+    if (state == VOL_STATE_FREED)
+       return 0;
+    return 1;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+/***************************************************/
+/* Volume Path and Volume Number utility routines  */
+/***************************************************/
+
+/* locate the partition that holds a volume's header file
+ *
+ * ec         -- set to 0 on success, VNOVOL when no partition has the file
+ * volumeId   -- volume to look for
+ * partitionp -- out: name of the partition on which the file was found
+ * namep      -- out: "/" followed by the VFORMAT-formatted volume id
+ *
+ * both out pointers refer to static buffers inside this function, so the
+ * results are overwritten by the next call; both are set to NULL on
+ * failure.
+ *
+ * every string is built with the bounded afs_snprintf so that an
+ * unexpectedly long partition path or name cannot run past the
+ * VMAXPATHLEN buffers */
+static void
+GetVolumePath(Error * ec, VolId volumeId, char **partitionp, char **namep)
+{
+    static char partition[VMAXPATHLEN], name[VMAXPATHLEN];
+    char path[VMAXPATHLEN];
+    int found = 0;
+    struct DiskPartition *dp;
+
+    *ec = 0;
+    name[0] = '/';
+    (void)afs_snprintf(&name[1], (sizeof name) - 1, VFORMAT, volumeId);
+    /* probe each partition for <partition path><name> */
+    for (dp = DiskPartitionList; dp; dp = dp->next) {
+       struct afs_stat status;
+       (void)afs_snprintf(path, sizeof path, "%s%s", VPartitionPath(dp), name);
+       if (afs_stat(path, &status) == 0) {
+           (void)afs_snprintf(partition, sizeof partition, "%s", dp->name);
+           found = 1;
+           break;
+       }
+    }
+    if (!found) {
+       *ec = VNOVOL;
+       *partitionp = *namep = NULL;
+    } else {
+       *partitionp = partition;
+       *namep = name;
+    }
+}
+
+/* parse the volume number out of a volume header file name.
+ *
+ * an optional leading '/' is skipped, then exactly one more character
+ * (presumably the letter that VFORMAT puts in front of the digits --
+ * verify against VFORMAT), and the rest is converted with atoi */
+int
+VolumeNumber(char *name)
+{
+    char *digits = name;
+
+    if (digits[0] == '/')
+       digits++;
+    digits++;
+
+    return atoi(digits);
+}
+
+/* format a volume id with VFORMAT into a static buffer and return it.
+ * the buffer is shared by all callers, so the result is overwritten by
+ * the next call */
+char *
+VolumeExternalName(VolumeId volumeId)
+{
+    static char extName[VMAXPATHLEN];
+
+    (void)afs_snprintf(extName, sizeof extName, VFORMAT, volumeId);
+
+    return extName;
+}
+
+/* format a volume id with VFORMAT into the caller's buffer of len bytes;
+ * returns whatever afs_snprintf returns */
+static int
+VolumeExternalName_r(VolumeId volumeId, char * name, size_t len)
+{
+    int written;
+
+    written = afs_snprintf(name, len, VFORMAT, volumeId);
+    return written;
+}
+
+
+/***************************************************/
+/* Volume Usage Statistics routines                */
+/***************************************************/
+
+#if OPENAFS_VOL_STATS
+#define OneDay (86400)         /* 24 hours' worth of seconds */
+#else
+#define OneDay (24*60*60)      /* 24 hours */
+#endif /* OPENAFS_VOL_STATS */
+
+#define Midnight(date) ((date-TimeZoneCorrection)/OneDay*OneDay+TimeZoneCorrection)
+
+/*------------------------------------------------------------------------
+ * [export] VAdjustVolumeStatistics_r
+ *
+ * Description:
+ *     If more than OneDay has elapsed since the volume's dayUseDate,
+ *     roll the day use counter into the week use history and, if
+ *     OPENAFS_VOL_STATS is enabled, zero the detailed volume
+ *     statistics area.
+ *
+ * Arguments:
+ *     vp : Pointer to the volume structure describing the lucky
+ *             volume being considered for update.
+ *
+ * Returns:
+ *     0 (always!)
+ *
+ * Environment:
+ *     Nothing interesting.
+ *
+ * Side Effects:
+ *     As described.
+ *------------------------------------------------------------------------*/
+
+int
+VAdjustVolumeStatistics_r(register Volume * vp)
+{
+    unsigned int now = FT_ApproxTime();
+
+    if (now - V_dayUseDate(vp) > OneDay) {
+       register int ndays, i;
+
+       /* whole days elapsed since the last rollover */
+       ndays = (now - V_dayUseDate(vp)) / OneDay;
+       /* age the existing history by ndays slots, working from the
+        * oldest slot down so no entry is overwritten before it is moved */
+       for (i = 6; i > ndays - 1; i--)
+           V_weekUse(vp)[i] = V_weekUse(vp)[i - ndays];
+       /* days with no recorded activity in between get zero */
+       for (i = 0; i < ndays - 1 && i < 7; i++)
+           V_weekUse(vp)[i] = 0;
+       /* the counter being closed out lands ndays-1 slots back, if that
+        * is still inside the 7 entry window */
+       if (ndays <= 7)
+           V_weekUse(vp)[ndays - 1] = V_dayUse(vp);
+       V_dayUse(vp) = 0;
+       V_dayUseDate(vp) = Midnight(now);
+
+#if OPENAFS_VOL_STATS
+       /*
+        * All we need to do is bzero the entire VOL_STATS_BYTES of
+        * the detailed volume statistics area.
+        */
+       memset((char *)(V_stat_area(vp)), 0, VOL_STATS_BYTES);
+#endif /* OPENAFS_VOL_STATS */
+    }
+
+    /*It's been more than a day of collection */
+    /*
+     * Always return happily.
+     */
+    return (0);
+}                              /*VAdjustVolumeStatistics_r */
+
+/* locking wrapper: runs VAdjustVolumeStatistics_r between VOL_LOCK and
+ * VOL_UNLOCK and hands its return value back to the caller */
+int
+VAdjustVolumeStatistics(register Volume * vp)
+{
+    int code;
+
+    VOL_LOCK;
+    code = VAdjustVolumeStatistics_r(vp);
+    VOL_UNLOCK;
+
+    return code;
+}
+
+/* count one more use of the volume today.
+ *
+ * rolls the daily statistics over first when more than OneDay has
+ * passed since V_dayUseDate.  the volume header is written out via
+ * VUpdateVolume_r whenever the day use counter, before being bumped, is
+ * a multiple of 128; the error from that update is not reported */
+void
+VBumpVolumeUsage_r(register Volume * vp)
+{
+    Error error;
+    unsigned int now = FT_ApproxTime();
+
+    if (now - V_dayUseDate(vp) > OneDay) {
+       VAdjustVolumeStatistics_r(vp);
+    }
+
+    /*
+     * Save the volume header image to disk after every 128 bumps to dayUse.
+     */
+    if ((V_dayUse(vp)++ & 127) == 0) {
+       VUpdateVolume_r(&error, vp, VOL_UPDATE_WAIT);
+    }
+}
+
+/* locking wrapper: runs VBumpVolumeUsage_r between VOL_LOCK and VOL_UNLOCK */
+void
+VBumpVolumeUsage(register Volume * vp)
+{
+    VOL_LOCK;
+
+    VBumpVolumeUsage_r(vp);
+
+    VOL_UNLOCK;
+}
+
+/* refresh the disk usage information via VResetDiskUsage_r.
+ *
+ * sleeps in 10 second steps until VInit reaches 2.  when not built for
+ * demand attach fs, every third call also runs VScanUpdateList (the
+ * counter name suggests calls arrive about five minutes apart --
+ * TODO confirm against the caller) */
+void
+VSetDiskUsage_r(void)
+{
+#ifndef AFS_DEMAND_ATTACH_FS
+    static int scanTick = 0;
+#endif
+
+    /* NOTE: Don't attempt to access the partitions list until the
+     * initialization level indicates that all volumes are attached,
+     * which implies that all partitions are initialized. */
+    while (VInit < 2) {
+#ifdef AFS_PTHREAD_ENV
+       sleep(10);
+#else /* AFS_PTHREAD_ENV */
+       IOMGR_Sleep(10);
+#endif /* AFS_PTHREAD_ENV */
+    }
+
+    VResetDiskUsage_r();
+
+#ifndef AFS_DEMAND_ATTACH_FS
+    scanTick++;
+    if (scanTick == 3) {
+       scanTick = 0;
+       VScanUpdateList();
+    }
+#endif /* !AFS_DEMAND_ATTACH_FS */
+}
+
+/* locking wrapper: runs VSetDiskUsage_r between VOL_LOCK and VOL_UNLOCK */
+void
+VSetDiskUsage(void)
+{
+    VOL_LOCK;
+
+    VSetDiskUsage_r();
+
+    VOL_UNLOCK;
+}
+
+
+/***************************************************/
+/* Volume Update List routines                     */
+/***************************************************/
+
+/* The length of time, in seconds (i.e. 10 minutes), that a volume hasn't
+ * been updated before the "Dont salvage" flag in the volume header will
+ * be turned on */
+#define SALVAGE_INTERVAL       (10*60)
+
+/*
+ * demand attach fs
+ *
+ * volume update list functionality has been moved into the VLRU
+ * the DONT_SALVAGE flag is now set during VLRU demotion
+ */
+
+#ifndef AFS_DEMAND_ATTACH_FS
+static VolumeId *UpdateList = NULL;    /* Pointer to array of Volume ID's */
+static int nUpdatedVolumes = 0;                /* Updated with entry in UpdateList, salvage after crash flag on */
+static int updateSize = 0;             /* number of entries possible */
+#define UPDATE_LIST_SIZE 128           /* initial size increment (must be a power of 2!) */
+#endif /* !AFS_DEMAND_ATTACH_FS */
+
+/* record that a volume has just been modified
+ *
+ * ec -- set to 0, then to whatever VSyncVolume_r reports
+ * vp -- volume that was updated
+ *
+ * vp->updateTime is always refreshed.  if the volume header's dontSalvage
+ * flag is set it is cleared and the header is synced to disk.
+ *
+ * demand attach fs: the VOL_HDR_DONTSALV attach flag is cleared as well.
+ *
+ * otherwise: unless the sync failed, the volume id is appended to
+ * UpdateList so that VScanUpdateList can find it later.  the list doubles
+ * in size when full.  the new size is committed only after realloc has
+ * succeeded, so updateSize never exceeds the real allocation; if doubling
+ * would exceed 524288 entries a warning is logged and the volume is not
+ * recorded.
+ */
+void
+VAddToVolumeUpdateList_r(Error * ec, Volume * vp)
+{
+    *ec = 0;
+    vp->updateTime = FT_ApproxTime();
+    if (V_dontSalvage(vp) == 0)
+       return;
+    V_dontSalvage(vp) = 0;
+    VSyncVolume_r(ec, vp, 0);
+#ifdef AFS_DEMAND_ATTACH_FS
+    V_attachFlags(vp) &= ~(VOL_HDR_DONTSALV);
+#else /* !AFS_DEMAND_ATTACH_FS */
+    if (*ec)
+       return;
+    if (UpdateList == NULL) {
+       updateSize = UPDATE_LIST_SIZE;
+       UpdateList = (VolumeId *) malloc(sizeof(VolumeId) * updateSize);
+    } else if (nUpdatedVolumes == updateSize) {
+       /* list is full: double it */
+       int newSize = updateSize << 1;
+       VolumeId *newList;
+
+       if (newSize > 524288) {
+           Log("warning: there is likely a bug in the volume update scanner\n");
+           return;
+       }
+       newList = (VolumeId *) realloc(UpdateList,
+                                      sizeof(VolumeId) * newSize);
+       assert(newList != NULL);
+       UpdateList = newList;
+       updateSize = newSize;
+    }
+    assert(UpdateList != NULL);
+    UpdateList[nUpdatedVolumes++] = V_id(vp);
+#endif /* !AFS_DEMAND_ATTACH_FS */
+}
+
+#ifndef AFS_DEMAND_ATTACH_FS
+static void
+VScanUpdateList(void)
+{
+    register int i, gap;
+    register Volume *vp;
+    Error error;
      afs_uint32 now = FT_ApproxTime();
      /* Be careful with this code, since it works with interleaved calls to AddToVolumeUpdateList */
      for (i = gap = 0; i < nUpdatedVolumes; i++) {
+       if (gap)
+           UpdateList[i - gap] = UpdateList[i];
+
+       /* XXX this routine needlessly messes up the Volume LRU by
+        * breaking the LRU temporal-locality assumptions.....
+        * we should use a special volume header allocator here */
         vp = VGetVolume_r(&error, UpdateList[i - gap] = UpdateList[i]);
         if (error) {
             gap++;
         } else if (vp->nUsers == 1 && now - vp->updateTime > SALVAGE_INTERVAL) {
             V_dontSalvage(vp) = DONT_SALVAGE;
-           VUpdateVolume_r(&error, vp);        /* No need to fsync--not critical */
+           VUpdateVolume_r(&error, vp, 0);     /* No need to fsync--not critical */
             gap++;
         }
-       if (vp)
+
+       if (vp) {
+           VPutVolume_r(vp);
+       }
+
+#ifndef AFS_PTHREAD_ENV
+       IOMGR_Poll();
+#endif /* !AFS_PTHREAD_ENV */
+    }
+    nUpdatedVolumes -= gap;
+}
+#endif /* !AFS_DEMAND_ATTACH_FS */
+
+
+/***************************************************/
+/* Volume LRU routines                             */
+/***************************************************/
+
+/* demand attach fs
+ * volume LRU
+ *
+ * with demand attach fs, we attempt to soft detach(1)
+ * volumes which have not been accessed in a long time
+ * in order to speed up fileserver shutdown
+ *
+ * (1) by soft detach we mean a process very similar
+ *     to VOffline, except the final state of the 
+ *     Volume will be VOL_STATE_PREATTACHED, instead
+ *     of the usual VOL_STATE_UNATTACHED
+ */
+#ifdef AFS_DEMAND_ATTACH_FS
+
+/* implementation is reminiscent of a generational GC
+ *
+ * queue 0 is newly attached volumes. this queue is
+ * sorted by attach timestamp
+ *
+ * queue 1 is volumes that have been around a bit
+ * longer than queue 0. this queue is sorted by
+ * attach timestamp
+ *
+ * queue 2 is volumes that have been around the longest.
+ * this queue is unsorted
+ *
+ * queue 3 is volumes that have been marked as
+ * candidates for soft detachment. this queue is
+ * unsorted
+ */
+#define VLRU_GENERATIONS  3   /* number of generations in VLRU */
+#define VLRU_QUEUES       5   /* total number of VLRU queues */
+/* one VLRU queue together with its bookkeeping.
+ * VInitVLRU initializes every queue with queue_Init, len = 0, busy = 0
+ * and pthread_cond_init on cv */
+struct VLRU_q {
+    volatile struct rx_queue q;        /* queue head; kept as the first member
+                                * (VInitVLRU passes the struct's own
+                                * address to queue_Init) */
+    volatile int len;          /* presumably the number of volumes on
+                                * this queue -- TODO confirm */
+    volatile int busy;         /* presumably nonzero while a thread has the
+                                * queue exclusively (see
+                                * VLRU_BeginExclusive_r) -- TODO confirm */
+    pthread_cond_t cv;         /* presumably what VLRU_Wait_r blocks on
+                                * -- TODO confirm */
+};
+/* the volume LRU: its queues, the timing configuration computed by
+ * VLRU_ComputeConstants, and the state of the scanner */
+struct VLRU {
+    struct VLRU_q q[VLRU_QUEUES];      /* the VLRU queues themselves */
+
+    /* VLRU config */
+    afs_uint32 promotion_interval[VLRU_GENERATIONS-1];  /* interval between promotions */
+    afs_uint32 scan_interval[VLRU_GENERATIONS+1];       /* interval between scans for candidates */
+
+    /* state */
+    int next_idx;
+    afs_uint32 last_promotion[VLRU_GENERATIONS-1];      /* timestamp of last promotion scan */
+    afs_uint32 last_scan[VLRU_GENERATIONS+1];           /* timestamp of last detach scan */
+
+    int scanner_state;                                  /* state of scanner thread
+                                                         * (one of VLRU_SCANNER_STATE_*) */
+    pthread_cond_t cv;                                  /* state transition CV */
+};
+
+static struct VLRU volume_LRU;
+
+/* valid scanner states */
+#define VLRU_SCANNER_STATE_OFFLINE        0
+#define VLRU_SCANNER_STATE_ONLINE         1
+#define VLRU_SCANNER_STATE_SHUTTING_DOWN  2
+#define VLRU_SCANNER_STATE_PAUSING        3
+#define VLRU_SCANNER_STATE_PAUSED         4
+
+/* vlru disk data header stuff */
+#define VLRU_DISK_MAGIC      0x7a8b9cad
+#define VLRU_DISK_VERSION    1
+
+/* vlru default expiration time (for eventual fs state serialization of vlru data) */
+#define VLRU_DUMP_EXPIRATION_TIME   (60*60*24*7)  /* expire vlru data after 1 week */
+
+
+static afs_uint32 VLRU_offline_thresh = VLRU_DEFAULT_OFFLINE_THRESH;
+static afs_uint32 VLRU_offline_interval = VLRU_DEFAULT_OFFLINE_INTERVAL;
+static afs_uint32 VLRU_offline_max = VLRU_DEFAULT_OFFLINE_MAX;
+static afs_uint32 VLRU_enabled = 1;
+
+/* queue synchronization routines */
+static void VLRU_BeginExclusive_r(struct VLRU_q * q);
+static void VLRU_EndExclusive_r(struct VLRU_q * q);
+static void VLRU_Wait_r(struct VLRU_q * q);
+
+/* set one of the VLRU parameters and recompute the derived timing
+ * constants
+ *
+ * valid options are:
+ *  VLRU_SET_THRESH -- set the period of inactivity after
+ *    which volumes are eligible for being detached
+ *  VLRU_SET_INTERVAL -- the time interval between calls
+ *    to the volume LRU "garbage collector"
+ *  VLRU_SET_MAX -- the max number of volumes to deallocate
+ *    in one GC pass
+ *  VLRU_SET_ENABLED -- stored in VLRU_enabled; VInitVLRU
+ *    does nothing when this is zero
+ *
+ * any other option value is ignored, but the constants are
+ * recomputed regardless
+ */
+void
+VLRU_SetOptions(int option, afs_uint32 val)
+{
+    switch (option) {
+    case VLRU_SET_THRESH:
+       VLRU_offline_thresh = val;
+       break;
+    case VLRU_SET_INTERVAL:
+       VLRU_offline_interval = val;
+       break;
+    case VLRU_SET_MAX:
+       VLRU_offline_max = val;
+       break;
+    case VLRU_SET_ENABLED:
+       VLRU_enabled = val;
+       break;
+    default:
+       break;
+    }
+    VLRU_ComputeConstants();
+}
+
+/* compute the VLRU internal timing parameters based upon the user's inputs
+ *
+ * reads VLRU_offline_thresh and VLRU_offline_interval and fills in
+ * volume_LRU.scan_interval[] and volume_LRU.promotion_interval[].
+ *
+ * VLRU_offline_interval can be set to 0 through VLRU_SetOptions, so the
+ * thresh/interval ratio is only evaluated when the interval is nonzero;
+ * a zero interval is treated as an unbounded ratio (i.e. > 16) */
+static void
+VLRU_ComputeConstants(void)
+{
+    /* the || short-circuits, so the division never sees a zero divisor */
+    int large_factor = (VLRU_offline_interval == 0) ||
+       ((VLRU_offline_thresh / VLRU_offline_interval) > 16);
+
+    /* compute the candidate scan interval */
+    volume_LRU.scan_interval[VLRU_QUEUE_CANDIDATE] = VLRU_offline_interval;
+
+    /* compute the promotion intervals */
+    volume_LRU.promotion_interval[VLRU_QUEUE_NEW] = VLRU_offline_thresh * 2;
+    volume_LRU.promotion_interval[VLRU_QUEUE_MID] = VLRU_offline_thresh * 4;
+
+    if (large_factor) {
+       /* compute the gen 0 scan interval */
+       volume_LRU.scan_interval[VLRU_QUEUE_NEW] = VLRU_offline_thresh / 8;
+    } else {
+       /* compute the gen 0 scan interval */
+       volume_LRU.scan_interval[VLRU_QUEUE_NEW] = VLRU_offline_interval * 2;
+    }
+}
+
+/* bring up the volume LRU
+ *
+ * when the VLRU is disabled this only logs that fact.  otherwise every
+ * VLRU queue is reset to empty and not busy, the timing constants are
+ * derived from the current tunables, the configuration is logged, and
+ * (in the fileserver only) a detached scanner thread is started. */
+static void
+VInitVLRU(void)
+{
+    pthread_attr_t attrs;
+    pthread_t tid;
+    struct VLRU_q *q;
+    int qi;
+
+    if (!VLRU_enabled) {
+       Log("VLRU: disabled\n");
+       return;
+    }
+
+    /* every queue starts out empty and idle */
+    for (qi = 0; qi < VLRU_QUEUES; qi++) {
+       q = &volume_LRU.q[qi];
+       queue_Init(q);
+       q->len = 0;
+       q->busy = 0;
+       assert(pthread_cond_init(&q->cv, NULL) == 0);
+    }
+
+    /* derive the scan and promotion intervals from the tunables */
+    VLRU_ComputeConstants();
+
+    /* XXX put inside LogLevel check? */
+    Log("VLRU: starting scanner with the following configuration parameters:\n");
+    Log("VLRU:  offlining volumes after minimum of %d seconds of inactivity\n", VLRU_offline_thresh);
+    Log("VLRU:  running VLRU soft detach pass every %d seconds\n", VLRU_offline_interval);
+    Log("VLRU:  taking up to %d volumes offline per pass\n", VLRU_offline_max);
+    Log("VLRU:  scanning generation 0 for inactive volumes every %d seconds\n", volume_LRU.scan_interval[0]);
+    Log("VLRU:  scanning for promotion/demotion between generations 0 and 1 every %d seconds\n", volume_LRU.promotion_interval[0]);
+    Log("VLRU:  scanning for promotion/demotion between generations 1 and 2 every %d seconds\n", volume_LRU.promotion_interval[1]);
+
+    /* the scanner is marked offline until its thread flips the state */
+    volume_LRU.scanner_state = VLRU_SCANNER_STATE_OFFLINE;
+
+    /* only the fileserver runs a scanner thread */
+    if (programType != fileServer)
+       return;
+
+    assert(pthread_cond_init(&volume_LRU.cv, NULL) == 0);
+    assert(pthread_attr_init(&attrs) == 0);
+    assert(pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED) == 0);
+    assert(pthread_create(&tid, &attrs, &VLRU_ScannerThread, NULL) == 0);
+}
+
+/* prepare a volume's VLRU linkage
+ *
+ * no-op when the VLRU is disabled.  otherwise asserts that vp is not
+ * currently on a VLRU queue and marks its queue index as invalid. */
+static void
+VLRU_Init_Node_r(volatile Volume * vp)
+{
+    if (VLRU_enabled) {
+       assert(queue_IsNotOnQueue(&vp->vlru));
+       vp->vlru.idx = VLRU_QUEUE_INVALID;
+    }
+}
+
+/* add volume to VLRU
+ * now supports adding to queues other
+ * than new for vlru state restore
+ *
+ * the destination queue is taken from vp->vlru.idx; an index outside
+ * the valid range selects the new queue.  the volume is prepended to
+ * that queue, flagged VOL_ON_VLRU, and its last_promote timestamp is
+ * reset to the current time.
+ *
+ * no-op when the VLRU is disabled or vp is already on a VLRU queue.
+ *
+ * caller MUST hold a ref count on vp */
+static void
+VLRU_Add_r(volatile Volume * vp)
+{
+    int idx;
+
+    if (!VLRU_enabled)
+       return;
+
+    if (queue_IsOnQueue(&vp->vlru))
+       return;
+
+    /* work out the destination first so that we wait on the queue we
+     * are actually going to modify; waiting on the new queue only would
+     * let us prepend onto a queue another thread holds exclusively */
+    idx = vp->vlru.idx;
+    if ((idx < 0) || (idx >= VLRU_QUEUE_INVALID)) {
+       idx = VLRU_QUEUE_NEW;
+    }
+
+    VLRU_Wait_r(&volume_LRU.q[idx]);
+
+    /* repeat check since VLRU_Wait_r may have dropped
+     * the glock */
+    if (queue_IsNotOnQueue(&vp->vlru)) {
+       vp->vlru.idx = idx;
+       queue_Prepend(&volume_LRU.q[idx], &vp->vlru);
+       volume_LRU.q[idx].len++;
+       V_attachFlags(vp) |= VOL_ON_VLRU;
+       vp->stats.last_promote = FT_ApproxTime();
+    }
+}
+
+/* take a volume off whichever VLRU queue it is on
+ *
+ * no-op when the VLRU is disabled, when vp is not on a queue, or when
+ * its queue index turns out to be invalid.  on removal the queue length
+ * is decremented, the index is invalidated and VOL_ON_VLRU is cleared.
+ *
+ * caller MUST hold a ref count on vp */
+static void
+VLRU_Delete_r(volatile Volume * vp)
+{
+    int qidx;
+
+    if (!VLRU_enabled || queue_IsNotOnQueue(&vp->vlru))
+       return;
+
+    /* handle races: waiting may drop the glock, so keep waiting until
+     * the queue we waited for is still the one vp claims to be on */
+    for (;;) {
+       qidx = vp->vlru.idx;
+       if (qidx == VLRU_QUEUE_INVALID)
+           return;
+       VLRU_Wait_r(&volume_LRU.q[qidx]);
+       if (qidx == vp->vlru.idx)
+           break;
+    }
+
+    /* unlink the volume and fix up the bookkeeping */
+    queue_Remove(&vp->vlru);
+    volume_LRU.q[qidx].len--;
+    vp->vlru.idx = VLRU_QUEUE_INVALID;
+    V_attachFlags(vp) &= ~(VOL_ON_VLRU);
+}
+
+/* signal that volume was just accessed.
+ *
+ * refreshes vp->stats.last_get and, if the volume currently sits on
+ * the soft detach candidate queue, appends it to the new queue.
+ * does nothing when the VLRU is disabled or vp is not on a VLRU queue.
+ *
+ * caller MUST hold a ref count on vp */
+static void
+VLRU_UpdateAccess_r(volatile Volume * vp)
+{
+    afs_uint32 live_interval;  /* NOTE(review): never referenced in this function */
+    Volume * rvp = NULL;       /* non-NULL once the reservation and queue locks are taken */
+
+    if (!VLRU_enabled)
+       return;
+
+    if (queue_IsNotOnQueue(&vp->vlru))
+       return;
+
+    assert(V_attachFlags(vp) & VOL_ON_VLRU);
+
+    /* update the access timestamp */
+    vp->stats.last_get = FT_ApproxTime();
+
+    /*
+     * if the volume is on the soft detach candidate
+     * list, we need to safely move it back to a
+     * regular generation.  this has to be done
+     * carefully so we don't race against the scanner
+     * thread.
+     */
+
+    /* if this volume is on the soft detach candidate queue,
+     * then grab exclusive access to the necessary queues.
+     * the new queue is taken before the candidate queue, the
+     * same order VLRU_Scan_r uses for a new-queue scan. */
+    if (vp->vlru.idx == VLRU_QUEUE_CANDIDATE) {
+       rvp = vp;
+       VCreateReservation_r(rvp);
+
+       VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_NEW]);
+       VLRU_BeginExclusive_r(&volume_LRU.q[VLRU_QUEUE_NEW]);
+       VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
+       VLRU_BeginExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
+    }
+
+    /* make sure multiple threads don't race to update:
+     * the index is re-read because the waits above may have
+     * dropped the glock */
+    if (vp->vlru.idx == VLRU_QUEUE_CANDIDATE) {
+       VLRU_SwitchQueues(vp, VLRU_QUEUE_NEW, 1);
+    }
+
+    if (rvp) {
+      VLRU_EndExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
+      VLRU_EndExclusive_r(&volume_LRU.q[VLRU_QUEUE_NEW]);
+      VCancelReservation_r(rvp);
+    }
+}
+
+/* move a volume from its current VLRU queue to queue new_idx
+ *
+ * a nonzero append places the volume with queue_Append, zero places it
+ * with queue_Prepend.  both queue lengths and vp->vlru.idx are updated.
+ * no-op when vp is not on a queue. */
+static void
+VLRU_SwitchQueues(volatile Volume * vp, int new_idx, int append)
+{
+    struct VLRU_q *dest;
+
+    if (queue_IsNotOnQueue(&vp->vlru))
+       return;
+
+    /* leave the old queue */
+    volume_LRU.q[vp->vlru.idx].len--;
+    queue_Remove(&vp->vlru);
+
+    /* join the requested generational queue at the requested end */
+    dest = &volume_LRU.q[new_idx];
+    if (!append) {
+       queue_Prepend(dest, &vp->vlru);
+    } else {
+       queue_Append(dest, &vp->vlru);
+    }
+
+    vp->vlru.idx = new_idx;
+    dest->len++;
+}
+
+/* VLRU GC thread
+ *
+ * entry point of the detached thread created by VInitVLRU.  args is
+ * not used and the return value is always NULL.
+ *
+ * after an initial start-up delay the thread loops until
+ * volume_LRU.scanner_state becomes VLRU_SCANNER_STATE_SHUTTING_DOWN.
+ * each iteration works out, without the glock, which operation is due
+ * next (a scan of the new or candidate queue, or a promotion/demotion
+ * pass), sleeps until it is due, and then runs it with the glock held.
+ * a PAUSING request is acknowledged by switching to PAUSED and waiting
+ * on volume_LRU.cv until the state changes again.  on exit the state
+ * is set to OFFLINE and volume_LRU.cv is broadcast. */
+static void *
+VLRU_ScannerThread(void * args)
+{
+    afs_uint32 now, min_delay, delay;
+    int i, min_idx, min_op, overdue, state;
+
+    /* set t=0 for promotion cycle to be
+     * fileserver startup */
+    now = FT_ApproxTime();
+    for (i=0; i < VLRU_GENERATIONS-1; i++) {
+       volume_LRU.last_promotion[i] = now;
+    }
+
+    /* don't start the scanner until VLRU_offline_thresh
+     * plus a small delay for VInitVolumePackage to finish
+     * has gone by */
+
+    sleep(VLRU_offline_thresh + 60);
+
+    /* set t=0 for scan cycle to be now */
+    now = FT_ApproxTime();
+    for (i=0; i < VLRU_GENERATIONS+1; i++) {
+       volume_LRU.last_scan[i] = now;
+    }
+
+    VOL_LOCK;
+    if (volume_LRU.scanner_state == VLRU_SCANNER_STATE_OFFLINE) {
+       volume_LRU.scanner_state = VLRU_SCANNER_STATE_ONLINE;
+    }
+
+    /* invariant: the glock is held at the top of every iteration and
+     * when the loop exits */
+    while ((state = volume_LRU.scanner_state) != VLRU_SCANNER_STATE_SHUTTING_DOWN) {
+       /* check to see if we've been asked to pause */
+       if (volume_LRU.scanner_state == VLRU_SCANNER_STATE_PAUSING) {
+           volume_LRU.scanner_state = VLRU_SCANNER_STATE_PAUSED;
+           assert(pthread_cond_broadcast(&volume_LRU.cv) == 0);
+           do {
+               assert(pthread_cond_wait(&volume_LRU.cv, &vol_glock_mutex) == 0);
+           } while (volume_LRU.scanner_state == VLRU_SCANNER_STATE_PAUSED);
+       }
+
+       /* scheduling can happen outside the glock */
+       VOL_UNLOCK;
+
+       /* figure out what is next on the schedule */
+
+       /* figure out a potential schedule for the new generation first */
+       overdue = 0;
+       min_delay = volume_LRU.scan_interval[0] + volume_LRU.last_scan[0] - now;
+       min_idx = 0;
+       min_op = 0;
+       if (min_delay > volume_LRU.scan_interval[0]) {
+           /* unsigned overflow -- we're overdue to run this scan */
+           min_delay = 0;
+           overdue = 1;
+       }
+
+       /* if we're not overdue for gen 0, figure out schedule for candidate gen */
+       if (!overdue) {
+           i = VLRU_QUEUE_CANDIDATE;
+           delay = volume_LRU.scan_interval[i] + volume_LRU.last_scan[i] - now;
+           if (delay < min_delay) {
+               min_delay = delay;
+               min_idx = i;
+           }
+           if (delay > volume_LRU.scan_interval[i]) {
+               /* unsigned overflow -- we're overdue to run this scan.
+                * there is deliberately no break here: this is not a
+                * loop of its own, so a break would leave the main
+                * while loop with the glock dropped and take the
+                * scanner down.  setting overdue already skips the
+                * promotion scheduling below. */
+               min_delay = 0;
+               min_idx = i;
+               overdue = 1;
+           }
+       }
+
+       /* if we're still not overdue for something, figure out schedules for promotions */
+       for (i=0; !overdue && i < VLRU_GENERATIONS-1; i++) {
+           delay = volume_LRU.promotion_interval[i] + volume_LRU.last_promotion[i] - now;
+           if (delay < min_delay) {
+               min_delay = delay;
+               min_idx = i;
+               min_op = 1;
+           }
+           if (delay > volume_LRU.promotion_interval[i]) {
+               /* unsigned overflow -- we're overdue to run this promotion */
+               min_delay = 0;
+               min_idx = i;
+               min_op = 1;
+               overdue = 1;
+               break;
+           }
+       }
+
+       /* sleep as needed */
+       if (min_delay) {
+           sleep(min_delay);
+       }
+
+       /* do whatever is next: min_op selects a promotion/demotion pass
+        * over a scan, min_idx selects the queue */
+       VOL_LOCK;
+       if (min_op) {
+           VLRU_Promote_r(min_idx);
+           VLRU_Demote_r(min_idx+1);
+       } else {
+           VLRU_Scan_r(min_idx);
+       }
+       now = FT_ApproxTime();
+    }
+
+    Log("VLRU scanner asked to go offline (scanner_state=%d)\n", state);
+
+    /* signal that scanner is down */
+    volume_LRU.scanner_state = VLRU_SCANNER_STATE_OFFLINE;
+    assert(pthread_cond_broadcast(&volume_LRU.cv) == 0);
+    VOL_UNLOCK;
+    return NULL;
+}
+
+/* run the promotions
+ *
+ * walks queue idx with queue_ScanBackwards and moves onto queue idx+1
+ * every volume whose last_promote is at least promotion_interval[idx]
+ * seconds old and whose last_get is not older than its last_promote.
+ * runs of adjacent eligible volumes are moved as a single chain.
+ *
+ * called with the glock held.  both queues are marked busy, the glock
+ * is dropped for the walk, and it is held again on return.
+ *
+ * NOTE(review): last_promote of the moved volumes is not refreshed
+ * here -- confirm whether that is intended.
+ */
+static void
+VLRU_Promote_r(int idx)
+{
+    int len, chaining, promote;
+    afs_uint32 now, thresh;
+    struct rx_queue *qp, *nqp;
+    Volume * vp, *start, *end;
+
+    /* get exclusive access to two chains, and drop the glock */
+    VLRU_Wait_r(&volume_LRU.q[idx]);
+    VLRU_BeginExclusive_r(&volume_LRU.q[idx]);
+    VLRU_Wait_r(&volume_LRU.q[idx+1]);
+    VLRU_BeginExclusive_r(&volume_LRU.q[idx+1]);
+    VOL_UNLOCK;
+
+    thresh = volume_LRU.promotion_interval[idx];
+    now = FT_ApproxTime();
+
+    len = chaining = 0;        /* len counts promoted volumes; chaining is set while a run is pending */
+    for (queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue)) {
+       vp = (Volume *)((char *)qp - offsetof(Volume, vlru));   /* recover the Volume from its embedded vlru node */
+       promote = (((vp->stats.last_promote + thresh) <= now) &&
+                  (vp->stats.last_get >= vp->stats.last_promote));
+
+       if (chaining) {
+           if (promote) {
+               vp->vlru.idx++;
+               len++;
+               start = vp;     /* extend the pending chain to include vp */
+           } else {
+               /* promote and prepend chain */
+               queue_MoveChainAfter(&volume_LRU.q[idx+1], &start->vlru, &end->vlru);
+               chaining = 0;
+           }
+       } else {
+           if (promote) {
+               vp->vlru.idx++;
+               len++;
+               chaining = 1;
+               start = end = vp;       /* begin a new single-element chain */
+           }
+       }
+    }
+
+    if (chaining) {
+       /* promote and prepend: flush the chain still pending when the walk ended */
+       queue_MoveChainAfter(&volume_LRU.q[idx+1], &start->vlru, &end->vlru);
+    }
+
+    if (len) {
+       volume_LRU.q[idx].len -= len;
+       volume_LRU.q[idx+1].len += len;
+    }
+
+    /* release exclusive access to the two chains */
+    VOL_LOCK;
+    volume_LRU.last_promotion[idx] = now;
+    VLRU_EndExclusive_r(&volume_LRU.q[idx+1]);
+    VLRU_EndExclusive_r(&volume_LRU.q[idx]);
+}
+
+/* run the demotions
+ *
+ * idx must be the mid or old queue.  walks queue idx with
+ * queue_ScanBackwards and moves down to queue idx-1 every volume whose
+ * last_promote is at least promotion_interval[idx-1] seconds old and
+ * whose last_get is older than that same interval.  runs of adjacent
+ * eligible volumes are moved as a single chain.
+ *
+ * demoted volumes that are in VOL_STATE_ATTACHED, not yet flagged
+ * VOL_HDR_DONTSALV and whose updateTime is older than SALVAGE_INTERVAL
+ * are remembered in a scratch vector; once the glock is held again they
+ * are re-checked, marked DONT_SALVAGE and written out with
+ * VUpdateVolume_r.  if the scratch vector cannot be allocated this
+ * step is skipped.
+ *
+ * called with the glock held.  the glock is dropped for the queue walk
+ * and held again on return.
+ */
+static void
+VLRU_Demote_r(int idx)
+{
+    Error ec;
+    int len, chaining, demote;
+    afs_uint32 now, thresh;
+    struct rx_queue *qp, *nqp;
+    Volume * vp, *start, *end;
+    Volume ** salv_flag_vec = NULL;
+    int salv_vec_offset = 0;   /* number of entries used in salv_flag_vec */
+
+    assert(idx == VLRU_QUEUE_MID || idx == VLRU_QUEUE_OLD);
+
+    /* get exclusive access to two chains, and drop the glock */
+    VLRU_Wait_r(&volume_LRU.q[idx-1]);
+    VLRU_BeginExclusive_r(&volume_LRU.q[idx-1]);
+    VLRU_Wait_r(&volume_LRU.q[idx]);
+    VLRU_BeginExclusive_r(&volume_LRU.q[idx]);
+    VOL_UNLOCK;
+
+    /* no big deal if this allocation fails */
+    if (volume_LRU.q[idx].len) {
+       salv_flag_vec = (Volume **) malloc(volume_LRU.q[idx].len * sizeof(Volume *));
+    }
+
+    now = FT_ApproxTime();
+    thresh = volume_LRU.promotion_interval[idx-1];
+
+    len = chaining = 0;        /* len counts demoted volumes; chaining is set while a run is pending */
+    for (queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue)) {
+       vp = (Volume *)((char *)qp - offsetof(Volume, vlru));   /* recover the Volume from its embedded vlru node */
+       demote = (((vp->stats.last_promote + thresh) <= now) &&
+                 (vp->stats.last_get < (now - thresh)));
+
+       /* we now do volume update list DONT_SALVAGE flag setting during
+        * demotion passes */
+       if (salv_flag_vec &&
+           !(V_attachFlags(vp) & VOL_HDR_DONTSALV) &&
+           demote && 
+           (vp->updateTime < (now - SALVAGE_INTERVAL)) &&
+           (V_attachState(vp) == VOL_STATE_ATTACHED)) {
+           salv_flag_vec[salv_vec_offset++] = vp;
+           VCreateReservation_r(vp);   /* NOTE(review): runs after the VOL_UNLOCK above -- confirm this is safe without the glock */
+       }
+
+       if (chaining) {
+           if (demote) {
+               vp->vlru.idx--;
+               len++;
+               start = vp;     /* extend the pending chain to include vp */
+           } else {
+               /* demote and append chain */
+               queue_MoveChainBefore(&volume_LRU.q[idx-1], &start->vlru, &end->vlru);
+               chaining = 0;
+           }
+       } else {
+           if (demote) {
+               vp->vlru.idx--;
+               len++;
+               chaining = 1;
+               start = end = vp;       /* begin a new single-element chain */
+           }
+       }
+    }
+
+    if (chaining) {    /* flush the chain still pending when the walk ended */
+       queue_MoveChainBefore(&volume_LRU.q[idx-1], &start->vlru, &end->vlru);
+    }
+
+    if (len) {
+       volume_LRU.q[idx].len -= len;
+       volume_LRU.q[idx-1].len += len;
+    }
+
+    /* release exclusive access to the two chains */
+    VOL_LOCK;
+    VLRU_EndExclusive_r(&volume_LRU.q[idx]);
+    VLRU_EndExclusive_r(&volume_LRU.q[idx-1]);
+
+    /* now go back and set the DONT_SALVAGE flags as appropriate.
+     * the eligibility test is repeated because the volume may have
+     * changed since it was recorded during the walk. */
+    if (salv_flag_vec) {
+       int i;
+       for (i = 0; i < salv_vec_offset; i++) {
+           vp = salv_flag_vec[i];
+           if (!(V_attachFlags(vp) & VOL_HDR_DONTSALV) &&
+               (vp->updateTime < (now - SALVAGE_INTERVAL)) &&
+               (V_attachState(vp) == VOL_STATE_ATTACHED)) {
+               ec = VHold_r(vp);
+               if (!ec) {
+                   V_attachFlags(vp) |= VOL_HDR_DONTSALV;
+                   V_dontSalvage(vp) = DONT_SALVAGE;
+                   VUpdateVolume_r(&ec, vp, 0);
+                   VPutVolume_r(vp);
+               }
+           }
+           VCancelReservation_r(vp);   /* pairs with the reservation taken during the walk */
+       }
+       free(salv_flag_vec);
+    }
+}
+
+/* run a pass of the VLRU GC scanner
+ *
+ * idx must be VLRU_QUEUE_NEW or VLRU_QUEUE_CANDIDATE.
+ *
+ * candidate queue: the queue is walked with queue_ScanBackwards while
+ * the glock is dropped; for each volume whose last_get is at or before
+ * (now - VLRU_offline_thresh) the glock is retaken around a call to
+ * VCheckSoftDetach.  the walk stops once VLRU_offline_max volumes have
+ * been detached in this pass.
+ *
+ * new queue: the candidate queue is locked as well, the new queue is
+ * walked with the glock held, idle volumes are handed to
+ * VCheckSoftDetachCandidate, and the glock is briefly released every
+ * 128 iterations.
+ *
+ * called with the glock held; it is held again on return, and
+ * last_scan[idx] is set to the time the pass started.
+ */
+static void
+VLRU_Scan_r(int idx)
+{
+    afs_uint32 now, thresh;
+    struct rx_queue *qp, *nqp;
+    volatile Volume * vp;
+    int i, locked = 1; /* locked tracks whether the glock is held after the walk */
+
+    assert(idx == VLRU_QUEUE_NEW || idx == VLRU_QUEUE_CANDIDATE);
+
+    /* gain exclusive access to the idx VLRU */
+    VLRU_Wait_r(&volume_LRU.q[idx]);
+    VLRU_BeginExclusive_r(&volume_LRU.q[idx]);
+
+    if (idx != VLRU_QUEUE_CANDIDATE) {
+       /* gain exclusive access to the candidate VLRU */
+       VLRU_Wait_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
+       VLRU_BeginExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
+    }
+
+    now = FT_ApproxTime();
+    thresh = now - VLRU_offline_thresh;        /* volumes last accessed at or before this are idle */
+
+    /* perform candidate selection and soft detaching */
+    if (idx == VLRU_QUEUE_CANDIDATE) {
+       /* soft detach some volumes from the candidate pool */
+       VOL_UNLOCK;
+       locked = 0;
+
+       for (i=0,queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue)) {
+           vp = (Volume *)((char *)qp - offsetof(Volume, vlru));       /* recover the Volume from its embedded vlru node */
+           if (i >= VLRU_offline_max) {
+               break;
+           }
+           /* check timestamp to see if it's a candidate for soft detaching */
+           if (vp->stats.last_get <= thresh) {
+               VOL_LOCK;
+               if (VCheckSoftDetach(vp, thresh))
+                   i++;        /* i counts successful soft detaches */
+               VOL_UNLOCK;
+           }
+       }
+    } else {
+       /* scan for volumes to become soft detach candidates */
+       for (i=1,queue_ScanBackwards(&volume_LRU.q[idx], qp, nqp, rx_queue),i++) {
+           vp = (Volume *)((char *)qp - offsetof(Volume, vlru));       /* recover the Volume from its embedded vlru node */
+
+           /* check timestamp to see if it's a candidate for soft detaching */
+           if (vp->stats.last_get <= thresh) {
+               VCheckSoftDetachCandidate(vp, thresh);
+           }
+
+           if (!(i&0x7f)) {   /* lock coarsening optimization: true every 128th iteration */
+               VOL_UNLOCK;
+               pthread_yield();
+               VOL_LOCK;
+           }
+       }
+    }
+
+    /* relinquish exclusive access to the VLRU chains */
+    if (!locked) {
+       VOL_LOCK;
+    }
+    volume_LRU.last_scan[idx] = now;
+    if (idx != VLRU_QUEUE_CANDIDATE) {
+       VLRU_EndExclusive_r(&volume_LRU.q[VLRU_QUEUE_CANDIDATE]);
+    }
+    VLRU_EndExclusive_r(&volume_LRU.q[idx]);
+}
+
+/* soft detach vp if nobody is using it and it has been idle long enough
+ *
+ * returns 0 without touching the volume when it has users or waiters,
+ * or when its last_get is newer than thresh; otherwise returns whatever
+ * VSoftDetachVolume_r returns.
+ *
+ * caller MUST NOT hold a ref count on vp */
+static int
+VCheckSoftDetach(volatile Volume * vp, afs_uint32 thresh)
+{
+    /* somebody is using or waiting for the volume */
+    if (vp->nUsers || vp->nWaiters)
+       return 0;
+
+    /* accessed too recently */
+    if (vp->stats.last_get > thresh)
+       return 0;
+
+    return VSoftDetachVolume_r(vp, thresh);
+}
+
+/* move vp from the new queue to the soft detach candidate queue if it
+ * is unused and has been idle long enough
+ *
+ * returns 1 when the volume was moved, 0 otherwise.  asserts that a
+ * volume without users or waiters is on the new queue. */
+static int
+VCheckSoftDetachCandidate(volatile Volume * vp, afs_uint32 thresh)
+{
+    int cur_idx;
+
+    /* somebody is using or waiting for the volume */
+    if (vp->nUsers || vp->nWaiters)
+       return 0;
+
+    cur_idx = vp->vlru.idx;
+    assert(cur_idx == VLRU_QUEUE_NEW);
+
+    /* accessed too recently */
+    if (vp->stats.last_get > thresh)
+       return 0;
+
+    /* relink onto the head of the candidate pool */
+    queue_Remove(&vp->vlru);
+    queue_Prepend(&volume_LRU.q[VLRU_QUEUE_CANDIDATE], &vp->vlru);
+    vp->vlru.idx = VLRU_QUEUE_CANDIDATE;
+
+    /* keep both queue lengths in step */
+    volume_LRU.q[VLRU_QUEUE_NEW].len--;
+    volume_LRU.q[VLRU_QUEUE_CANDIDATE].len++;
+
+    return 1;
+}
+
+
+/* mark a VLRU queue as busy
+ *
+ * asserts that the queue is not already busy. */
+static void
+VLRU_BeginExclusive_r(struct VLRU_q * q)
+{
+    assert(!q->busy);
+    q->busy = 1;
+}
+
+/* clear a VLRU queue's busy mark and wake every thread waiting on it
+ *
+ * asserts that the queue is currently busy. */
+static void
+VLRU_EndExclusive_r(struct VLRU_q * q)
+{
+    assert(q->busy != 0);
+    q->busy = 0;
+
+    /* release anybody blocked in VLRU_Wait_r on this queue */
+    assert(pthread_cond_broadcast(&q->cv) == 0);
+}
+
+/* block until a VLRU queue is no longer busy
+ *
+ * waits on the queue's condition variable together with
+ * vol_glock_mutex for as long as the queue is marked busy. */
+static void
+VLRU_Wait_r(struct VLRU_q * q)
+{
+    for (;;) {
+       if (!q->busy)
+           break;
+       assert(pthread_cond_wait(&q->cv, &vol_glock_mutex) == 0);
+    }
+}
+
+/* demand attach fs
+ * volume soft detach
+ *
+ * vp must be on the soft detach candidate queue (asserted).
+ *
+ * returns 1 when the volume was taken offline and put back into the
+ * pre-attached state, 0 in every other case: accessed after thresh,
+ * has users or waiters, is in an exclusive state, is in one of the
+ * states listed in the switch below (then it is only dropped from the
+ * VLRU), VHold_r failed, or another thread raced with us.
+ *
+ * caller MUST NOT hold a ref count on vp */
+static int
+VSoftDetachVolume_r(volatile Volume * vp, afs_uint32 thresh)
+{
+    afs_uint32 ts_save;        /* last_get as sampled on entry, used to detect a racing access */
+    int ret = 0;
+
+    assert(vp->vlru.idx == VLRU_QUEUE_CANDIDATE);
+
+    ts_save = vp->stats.last_get;
+    if (ts_save > thresh)
+       return 0;
+
+    if (vp->nUsers || vp->nWaiters)
+       return 0;
+
+    if (IsExclusiveState(V_attachState(vp))) {
+       return 0;
+    }
+
+    switch (V_attachState(vp)) {
+    case VOL_STATE_UNATTACHED:
+    case VOL_STATE_PREATTACHED:
+    case VOL_STATE_ERROR:
+    case VOL_STATE_GOING_OFFLINE:
+    case VOL_STATE_SHUTTING_DOWN:
+    case VOL_STATE_SALVAGING:
+       volume_LRU.q[vp->vlru.idx].len--;
+
+       /* create and cancel a reservation to
+        * give the volume an opportunity to
+        * be deallocated */
+       VCreateReservation_r(vp);
+       queue_Remove(&vp->vlru);
+       vp->vlru.idx = VLRU_QUEUE_INVALID;
+       V_attachFlags(vp) &= ~(VOL_ON_VLRU);
+       VCancelReservation_r(vp);
+       return 0;
+    }
+
+    /* hold the volume and take it offline.
+     * no need for reservations, as VHold_r
+     * takes care of that internally. */
+    if (VHold_r(vp) == 0) {
+       /* vhold drops the glock, so now we should
+        * check to make sure we aren't racing against
+        * other threads.  if we are racing, offlining vp
+        * would be wasteful, and block the scanner for a while
+        */
+       if (vp->nWaiters || 
+           (vp->nUsers > 1) ||
+           (vp->shuttingDown) ||
+           (vp->goingOffline) ||
+           (vp->stats.last_get != ts_save)) {
+           /* looks like we're racing someone else. bail */
             VPutVolume_r(vp);
-#ifndef AFS_PTHREAD_ENV
-       IOMGR_Poll();
-#endif /* !AFS_PTHREAD_ENV */
+           vp = NULL;
+       } else {
+           /* pull it off the VLRU */
+           assert(vp->vlru.idx == VLRU_QUEUE_CANDIDATE);
+           volume_LRU.q[VLRU_QUEUE_CANDIDATE].len--;
+           queue_Remove(&vp->vlru);
+           vp->vlru.idx = VLRU_QUEUE_INVALID;
+           V_attachFlags(vp) &= ~(VOL_ON_VLRU);
+
+           /* take it offline */
+           VOffline_r(vp, "volume has been soft detached");
+
+           /* invalidate the volume header cache */
+           FreeVolumeHeader(vp);
+
+           /* update stats */
+           IncUInt64(&VStats.soft_detaches);
+           vp->stats.soft_detaches++;
+
+           /* put in pre-attached state so demand
+            * attacher can work on it */
+           VChangeState_r(vp, VOL_STATE_PREATTACHED);
+           ret = 1;
+       }
      }
-    nUpdatedVolumes -= gap;
+    return ret;
  }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
  
  /***************************************************/
-/* Add on routines to manage a volume header cache */
+/* Volume Header Cache routines                    */
  /***************************************************/
  
-static struct volHeader *volumeLRU;
+struct volume_hdr_LRU_t volume_hdr_LRU;
  
  /* Allocate a bunch of headers; string them together */
  static void
-InitLRU(int howMany)
+VInitVolumeHeaderCache(afs_uint32 howMany)
  {
      register struct volHeader *hp;
      if (programType != fileServer)
         return;
+    queue_Init(&volume_hdr_LRU);
+#ifdef AFS_DEMAND_ATTACH_FS
+    volume_hdr_LRU.stats.free = 0;
+    volume_hdr_LRU.stats.used = howMany;       /* each ReleaseVolumeHeader call below moves one header from used to free */
+    volume_hdr_LRU.stats.attached = 0;
+#endif
      hp = (struct volHeader *)(calloc(howMany, sizeof(struct volHeader)));     /* NOTE(review): result is not checked for NULL before use */
      while (howMany--)
         ReleaseVolumeHeader(hp++);
  }
  
+#ifdef AFS_DEMAND_ATTACH_FS
  /* Get a volume header from the LRU list; update the old one if necessary */
  /* Returns 1 if there was already a header, which is removed from the LRU list */
+/* caller MUST hold a ref count on vp */
  static int
  GetVolumeHeader(register Volume * vp)
  {
@@ -1931,54 +5377,211 @@ GetVolumeHeader(register Volume * vp)
      int old;
      static int everLogged = 0;
  
+    /* XXX debug 9/19/05 we've apparently got
+     * a ref counting bug somewhere that's
+     * breaking the nUsers == 0 => header on LRU
+     * assumption */
+    if (vp->header && queue_IsNotOnQueue(vp->header)) {
+       Log("nUsers == 0, but header not on LRU\n");
+       return 1;
+    }
+
      old = (vp->header != NULL);        /* old == volume already has a header */
+
      if (programType != fileServer) {
+       /* for volume utilities, we allocate volHeaders as needed */
         if (!vp->header) {
             hd = (struct volHeader *)calloc(1, sizeof(*vp->header));
             assert(hd != NULL);
             vp->header = hd;
             hd->back = vp;
+           V_attachFlags(vp) |= VOL_HDR_ATTACHED;
         }
      } else {
         if (old) {
+           /* the header we previously dropped in the lru is
+            * still available. pull it off the lru and return */
             hd = vp->header;
-           if (volumeLRU == hd)
-               volumeLRU = hd->next;
+           queue_Remove(hd);
             assert(hd->back == vp);
         } else {
-           if (volumeLRU)
-               /* not currently in use and least recently used */
-               hd = volumeLRU->prev;
-           else {
-               hd = (struct volHeader *)calloc(1, sizeof(*vp->header));
-               /* make it look like single elt LRU */
-               hd->prev = hd->next = hd;
+           /* we need to grab a new element off the LRU */
+           if (queue_IsNotEmpty(&volume_hdr_LRU)) {
+               /* grab an element and pull off of LRU */
+               hd = queue_First(&volume_hdr_LRU, volHeader);
+               queue_Remove(hd);
+           } else {
+               /* LRU is empty, so allocate a new volHeader 
+                * this is probably indicative of a leak, so let the user know */
+               hd = (struct volHeader *)calloc(1, sizeof(struct volHeader));
+               assert(hd != NULL);
                 if (!everLogged) {
                     Log("****Allocated more volume headers, probably leak****\n");
                     everLogged = 1;
                 }
+               volume_hdr_LRU.stats.free++;
             }
             if (hd->back) {
+               VolState vp_save, back_save;
+               /* this header used to belong to someone else. 
+                * we'll need to check if the header needs to
+                * be sync'd out to disk */
+
+               /* if hd->back were in an exclusive state, then
+                * its volHeader would not be on the LRU... */
+               assert(!IsExclusiveState(V_attachState(hd->back)));
+
                 if (hd->diskstuff.inUse) {
+                   /* volume was in use, so we'll need to sync
+                    * its header to disk */
+                   back_save = VChangeState_r(hd->back, VOL_STATE_UPDATING);
+                   vp_save = VChangeState_r(vp, VOL_STATE_HDR_ATTACHING);
+                   VCreateReservation_r(hd->back);
+                   VOL_UNLOCK;
+
                     WriteVolumeHeader_r(&error, hd->back);
                     /* Ignore errors; catch them later */
+
+                   VOL_LOCK;
+               }
+
+               V_attachFlags(hd->back) &= ~(VOL_HDR_ATTACHED | VOL_HDR_LOADED | VOL_HDR_IN_LRU);
+               hd->back->header = NULL;
+
+               if (hd->diskstuff.inUse) {
+                   VChangeState_r(hd->back, back_save);
+                   VCancelReservation_r(hd->back);
+                   VChangeState_r(vp, vp_save);
                 }
-               hd->back->header = 0;
+           } else {
+               volume_hdr_LRU.stats.attached++;
             }
             hd->back = vp;
             vp->header = hd;
+           V_attachFlags(vp) |= VOL_HDR_ATTACHED;
+       }
+       volume_hdr_LRU.stats.free--;
+       volume_hdr_LRU.stats.used++;
+    }
+    IncUInt64(&VStats.hdr_gets);
+    IncUInt64(&vp->stats.hdr_gets);
+    vp->stats.last_hdr_get = FT_ApproxTime();
+    return old;
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+/* Get a volume header from the LRU list; update the old one if necessary */
+/* Returns 1 if there was already a header, which is removed from the LRU list
+ *
+ * Outside the fileserver a header is simply allocated on first use.
+ * In the fileserver a volume without a header takes the first entry of
+ * volume_hdr_LRU (or a freshly allocated header when the list is
+ * empty); if that header still belongs to another volume and is marked
+ * inUse it is written back to disk before being reassigned. */
+static int
+GetVolumeHeader(register Volume * vp)
+{
+    Error error;
+    register struct volHeader *hd;
+    int old;
+    static int everLogged = 0; /* the "allocated more headers" warning is logged only once */
+
+    old = (vp->header != NULL);        /* old == volume already has a header */
+
+    if (programType != fileServer) {
+       /* for volume utilities, we allocate volHeaders as needed */
+       if (!vp->header) {
+           hd = (struct volHeader *)calloc(1, sizeof(*vp->header));
+           assert(hd != NULL);
+           vp->header = hd;
+           hd->back = vp;
         }
-       if (hd->next) {         /* hd->next != 0 --> in LRU chain (we zero it later) */
-           hd->prev->next = hd->next;  /* pull hd out of LRU list */
-           hd->next->prev = hd->prev;  /* if hd only element, this is noop */
+    } else {
+       /* for the fileserver, we keep a volume header cache */
+       if (old) {
+           /* the header we previously dropped in the lru is
+            * still available. pull it off the lru and return */
+           hd = vp->header;
+           queue_Remove(hd);
+           assert(hd->back == vp);
+       } else {
+           /* we need to grab a new element off the LRU */
+           if (queue_IsNotEmpty(&volume_hdr_LRU)) {
+               /* grab an element */
+               hd = queue_First(&volume_hdr_LRU, volHeader);
+               queue_Remove(hd);
+           } else {
+               /* LRU is empty, so allocate a new volHeader 
+                * this is probably indicative of a leak, so let the user know */
+               hd = (struct volHeader *)calloc(1, sizeof(struct volHeader));
+               assert(hd != NULL);
+               if (!everLogged) {
+                   Log("****Allocated more volume headers, probably leak****\n");
+                   everLogged = 1;
+               }
+           }
+           if (hd->back) {
+               /* this header used to belong to someone else. 
+                * we'll need to check if the header needs to
+                * be sync'd out to disk */
+
+               if (hd->diskstuff.inUse) {
+                   WriteVolumeHeader_r(&error, hd->back);
+                   /* Ignore errors; catch them later */
+               }
+               hd->back->header = NULL;        /* the previous owner no longer has a cached header */
+           }
+           hd->back = vp;
+           vp->header = hd;
         }
-       hd->next = hd->prev = 0;
-       /* if not in LRU chain, next test won't be true */
-       if (hd == volumeLRU)    /* last header item, turn into empty list */
-           volumeLRU = NULL;
      }
      return old;
  }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+/* make sure a volume header is attached to
+ * vp, and has the correct data loaded from
+ * disk. */
+#ifdef AFS_DEMAND_ATTACH_FS
+/* demand-attach variant
+ *
+ * when vp has no users and GetVolumeHeader reports that it had no
+ * header yet, the on-disk volume info is read into V_disk(vp) with the
+ * glock dropped and the volume in VOL_STATE_HDR_LOADING; on success
+ * VOL_HDR_LOADED is set.  the previous state is restored afterwards.
+ * on a read error the header is handed back to ReleaseVolumeHeader.
+ * *ec is always initialized to 0 first.
+ *
+ * caller MUST hold a ref count on vp */
+static void
+LoadVolumeHeader(Error * ec, Volume * vp)
+{
+    VolState prior_state;
+
+    *ec = 0;
+
+    if (vp->nUsers == 0) {
+       if (!GetVolumeHeader(vp)) {
+           IncUInt64(&VStats.hdr_loads);
+           prior_state = VChangeState_r(vp, VOL_STATE_HDR_LOADING);
+           VOL_UNLOCK;
+
+           /* disk i/o happens without the glock */
+           ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp),
+                      sizeof(V_disk(vp)), VOLUMEINFOMAGIC,
+                      VOLUMEINFOVERSION);
+           IncUInt64(&vp->stats.hdr_loads);
+
+           VOL_LOCK;
+           if (*ec == 0) {
+               V_attachFlags(vp) |= VOL_HDR_LOADED;
+           }
+           VChangeState_r(vp, prior_state);
+       }
+    }
+
+    if (*ec != 0) {
+       /* maintain (nUsers==0) => header in LRU invariant */
+       ReleaseVolumeHeader(vp->header);
+    }
+}
+#else /* AFS_DEMAND_ATTACH_FS */
+/* non-demand-attach variant
+ *
+ * when vp has no users and GetVolumeHeader reports that it had no
+ * header yet, the on-disk volume info is read into V_disk(vp).  on a
+ * read error the header is handed back to ReleaseVolumeHeader.
+ * *ec is always initialized to 0 first. */
+static void
+LoadVolumeHeader(Error * ec, Volume * vp)
+{
+    *ec = 0;
+
+    if (vp->nUsers == 0) {
+       if (!GetVolumeHeader(vp)) {
+           IncUInt64(&VStats.hdr_loads);
+
+           ReadHeader(ec, V_diskDataHandle(vp), (char *)&V_disk(vp),
+                      sizeof(V_disk(vp)), VOLUMEINFOMAGIC,
+                      VOLUMEINFOVERSION);
+       }
+    }
+
+    if (*ec != 0) {
+       /* maintain (nUsers==0) => header in LRU invariant */
+       ReleaseVolumeHeader(vp->header);
+    }
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
  
  /* Put it at the top of the LRU chain */
  static void
@@ -1986,18 +5589,22 @@ ReleaseVolumeHeader(register struct volHeader *hd)
  {
      if (programType != fileServer)
         return;
-    if (!hd || hd->next)       /* no header, or header already released */
+    if (!hd || queue_IsOnQueue(hd))    /* no header, or header already released */
         return;
-    if (!volumeLRU) {
-       hd->next = hd->prev = hd;
-    } else {
-       hd->prev = volumeLRU->prev;
-       hd->next = volumeLRU;
-       hd->prev->next = hd->next->prev = hd;
+    queue_Append(&volume_hdr_LRU, hd);
+#ifdef AFS_DEMAND_ATTACH_FS
+    if (hd->back) {
+       V_attachFlags(hd->back) |= VOL_HDR_IN_LRU;
      }
-    volumeLRU = hd;
+    volume_hdr_LRU.stats.free++;
+    volume_hdr_LRU.stats.used--;
+#endif
  }
  
+/* for fileserver, return header to LRU, and
+ * invalidate it as a cache entry.
+ *
+ * for volume utilities, free the heap space */
  static void
  FreeVolumeHeader(register Volume * vp)
  {
@@ -2006,57 +5613,349 @@ FreeVolumeHeader(register Volume * vp)
         return;
      if (programType == fileServer) {
         ReleaseVolumeHeader(hd);
-       hd->back = 0;
+       hd->back = NULL;
      } else {
         free(hd);
      }
-    vp->header = 0;
+#ifdef AFS_DEMAND_ATTACH_FS
+    V_attachFlags(vp) &= ~(VOL_HDR_ATTACHED | VOL_HDR_IN_LRU | VOL_HDR_LOADED);
+    volume_hdr_LRU.stats.attached--;
+#endif
+    vp->header = NULL;
  }
  
  
  /***************************************************/
-/* Routines to add volume to hash chain, delete it */
+/* Volume Hash Table routines                      */
  /***************************************************/
  
+/* Set the number of volume hash buckets to 2^logsize.
+ *
+ * logsize must lie in [6, 14], and the request is only accepted
+ * while VInit is zero.
+ *
+ * returns 0 on success, -1 when the request is rejected */
+int
+VSetVolHashSize(int logsize)
+{
+    /* 64 to 16384 hash buckets seems like a reasonable range */
+    if (logsize < 6 || logsize > 14)
+       return -1;
+
+    /* we can't yet support runtime modification of this
+     * parameter. we'll need a configuration rwlock to
+     * make runtime modification feasible.... */
+    if (VInit)
+       return -1;
+
+    VolumeHashTable.Size = 1 << logsize;
+    VolumeHashTable.Mask = VolumeHashTable.Size - 1;
+    return 0;
+}
+
+/* Allocate VolumeHashTable.Table with VolumeHashTable.Size zeroed
+ * chain heads and initialize each head's queue (plus, for demand
+ * attach, its chain_busy_cv condition variable). */
+static void
+VInitVolumeHash(void)
+{
+    int bucket;
+    VolumeHashChainHead *chain;
+
+    VolumeHashTable.Table =
+       (VolumeHashChainHead *) calloc(VolumeHashTable.Size,
+                                      sizeof(VolumeHashChainHead));
+    assert(VolumeHashTable.Table != NULL);
+
+    for (bucket = 0; bucket < VolumeHashTable.Size; bucket++) {
+       chain = &VolumeHashTable.Table[bucket];
+       queue_Init(chain);
+#ifdef AFS_DEMAND_ATTACH_FS
+       assert(pthread_cond_init(&chain->chain_busy_cv, NULL) == 0);
+#endif /* AFS_DEMAND_ATTACH_FS */
+    }
+}
+
+/* for demand-attach, caller MUST hold a ref count on vp */
  static void
  AddVolumeToHashTable(register Volume * vp, int hashid)
  {
-    int hash = VOLUME_HASH(hashid);
+    VolumeHashChainHead * head;
+
+    if (queue_IsOnQueue(vp))
+       return;
+
+    head = &VolumeHashTable.Table[VOLUME_HASH(hashid)];
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* wait for the hash chain to become available */
+    VHashWait_r(head);
+
+    V_attachFlags(vp) |= VOL_IN_HASH;
+    vp->chainCacheCheck = ++head->cacheCheck;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    head->len++;
      vp->hashid = hashid;
-    vp->hashNext = VolumeHashTable[hash];
-    VolumeHashTable[hash] = vp;
+    queue_Append(head, vp);
      vp->vnodeHashOffset = VolumeHashOffset_r();
  }
  
+/* for demand-attach, caller MUST hold a ref count on vp */
  static void
  DeleteVolumeFromHashTable(register Volume * vp)
  {
-    int hash = VOLUME_HASH(vp->hashid);
-    if (VolumeHashTable[hash] == vp)
-       VolumeHashTable[hash] = vp->hashNext;
-    else {
-       Volume *tvp = VolumeHashTable[hash];
-       if (tvp == NULL)
-           return;
-       while (tvp->hashNext && tvp->hashNext != vp)
-           tvp = tvp->hashNext;
-       if (tvp->hashNext == NULL)
-           return;
-       tvp->hashNext = vp->hashNext;
+    VolumeHashChainHead * head;
+
+    if (!queue_IsOnQueue(vp))
+       return;
+
+    head = &VolumeHashTable.Table[VOLUME_HASH(vp->hashid)];
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* wait for the hash chain to become available */
+    VHashWait_r(head);
+
+    V_attachFlags(vp) &= ~(VOL_IN_HASH);
+    head->cacheCheck++;
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    head->len--;
+    queue_Remove(vp);
+    /* do NOT reset hashid to zero, as the online
+     * salvager package may need to know the volume id
+     * after the volume is removed from the hash */
+}
+
+/* - look up a volume id in the hash table
+ * - occasionally rebalance hash chains
+ * - update lookup statistics accordingly
+ *
+ * returns the Volume whose hashid equals volumeId, or NULL when the
+ * chain selected by VOLUME_HASH(volumeId) holds no such volume.
+ * *ec is set to 0 and is not written again in this function.
+ */
+/* the hint parameter allows us to short-circuit on
+ * DEMAND_ATTACH_FS if the cacheChecks match between
+ * the hash chain head and hint
+ * caller MUST hold a refcount on hint
+ *
+ * note: on a cacheCheck match hint is returned as-is; hint->hashid
+ * is not compared against volumeId here */
+Volume *
+VLookupVolume_r(Error * ec, VolId volumeId, Volume * hint)
+{
+    register int looks = 0;
+    Volume * vp, *np, *pp;
+    VolumeHashChainHead * head;
+    *ec = 0;
+
+    head = &VolumeHashTable.Table[VOLUME_HASH(volumeId)];
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* wait for the hash chain to become available */
+    VHashWait_r(head);
+
+    /* check to see if we can short circuit without walking the hash chain */
+    if (hint && (hint->chainCacheCheck == head->cacheCheck)) {
+       IncUInt64(&hint->stats.hash_short_circuits);
+       return hint;
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    /* someday we need to either do per-chain locks, RWlocks,
+     * or both for volhash access. 
+     * (and move to a data structure with better cache locality) */
+
+    /* search the chain for this volume id;
+     * looks counts the elements visited, including a match */
+    for(queue_Scan(head, vp, np, Volume)) {
+       looks++;
+       if ((vp->hashid == volumeId)) {
+           break;
+       }
+    }
+
+    if (queue_IsEnd(head, vp)) {
+       vp = NULL;
+    }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* update hash chain statistics:
+     * looks is added to both the per-chain and the global counter,
+     * and the per-chain gets counter goes up by one per call */
+    {
+       afs_uint64 lks;
+       FillInt64(lks, 0, looks);
+       AddUInt64(head->looks, lks, &head->looks);
+       AddUInt64(VStats.hash_looks, lks, &VStats.hash_looks);
+       IncUInt64(&head->gets);
+    }
+
+    if (vp) {
+       afs_uint64 thresh;
+       IncUInt64(&vp->stats.hash_lookups);
+
+       /* for demand attach fileserver, we permit occasional hash chain reordering
+        * so that frequently looked up volumes move towards the head of the chain
+        *
+        * a reorder is triggered once vp's lookup count is at least
+        * VOLUME_HASH_REORDER_THRESHOLD above that of its predecessor pp;
+        * nothing is done when vp is already first in the chain */
+       pp = queue_Prev(vp, Volume);
+       if (!queue_IsEnd(head, pp)) {
+           FillInt64(thresh, 0, VOLUME_HASH_REORDER_THRESHOLD);
+           AddUInt64(thresh, pp->stats.hash_lookups, &thresh);
+           if (GEInt64(vp->stats.hash_lookups, thresh)) {
+               VReorderHash_r(head, pp, vp);
+           }
+       }
+
+       /* update the short-circuit cache check */
+       vp->chainCacheCheck = head->cacheCheck;
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */    
+
+    return vp;
+}
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* perform volume hash chain reordering.
+ *
+ * advance a subchain beginning at vp ahead of
+ * the adjacent subchain ending at pp
+ *
+ * head is the chain containing both volumes, and pp is the element
+ * immediately before vp (that is how VLookupVolume_r calls this).
+ *
+ * the chain is marked busy and the VOL lock is released while the
+ * chain is scanned and spliced; the lock is re-taken before the
+ * counters and head->cacheCheck are updated and the busy mark cleared */
+static void
+VReorderHash_r(VolumeHashChainHead * head, Volume * pp, Volume * vp)
+{
+    Volume *tp, *np, *lp;
+    afs_uint64 move_thresh;
+
+    /* this should never be called if the chain is already busy, so
+     * no need to wait for other exclusive chain ops to finish */
+
+    /* this is a rather heavy set of operations,
+     * so let's set the chain busy flag and drop
+     * the vol_glock */
+    VHashBeginExclusive_r(head);
+    VOL_UNLOCK;
+
+    /* scan forward in the chain from vp looking for the last element
+     * in the chain we want to advance
+     *
+     * move_thresh is pp's lookup count plus
+     * VOLUME_HASH_REORDER_CHAIN_THRESH; the scan stops at the first
+     * element below it (or at the end of the chain), and lp is the
+     * element just before that stopping point */
+    FillInt64(move_thresh, 0, VOLUME_HASH_REORDER_CHAIN_THRESH);
+    AddUInt64(move_thresh, pp->stats.hash_lookups, &move_thresh);
+    for(queue_ScanFrom(head, vp, tp, np, Volume)) {
+       if (LTInt64(tp->stats.hash_lookups, move_thresh)) {
+           break;
+       }
+    }
+    lp = queue_Prev(tp, Volume);
+
+    /* scan backwards from pp to determine where to splice and
+     * insert the subchain we're advancing
+     *
+     * the scan stops at the first element above move_thresh (or at
+     * the chain head); tp then becomes the element after that point */
+    for(queue_ScanBackwardsFrom(head, pp, tp, np, Volume)) {
+       if (GTInt64(tp->stats.hash_lookups, move_thresh)) {
+           break;
+       }
+    }
+    tp = queue_Next(tp, Volume);
+
+    /* rebalance chain(vp,...,lp) ahead of chain(tp,...,pp) */
+    queue_MoveChainBefore(tp,vp,lp);
+
+    VOL_LOCK;
+    IncUInt64(&VStats.hash_reorders);
+    head->cacheCheck++;
+    IncUInt64(&head->reorders);
+
+    /* wake up any threads waiting for the hash chain */
+    VHashEndExclusive_r(head);
+}
+
+
+/* demand-attach fs volume hash
+ * asynchronous exclusive operations */
+
+/* mark the hash chain busy on behalf of the caller;
+ * asserts that the chain is not already marked busy */
+static void
+VHashBeginExclusive_r(VolumeHashChainHead * head)
+{
+    assert(!head->busy);
+    head->busy = 1;
+}
+
+/* clear the busy mark on the hash chain and wake every
+ * thread waiting on its chain_busy_cv;
+ * asserts that the chain is currently marked busy */
+static void
+VHashEndExclusive_r(VolumeHashChainHead * head)
+{
+    assert(head->busy != 0);
+    head->busy = 0;
+    assert(0 == pthread_cond_broadcast(&head->chain_busy_cv));
+}
+
+/* return once the hash chain is no longer marked busy;
+ * while it is, sleep on chain_busy_cv using vol_glock_mutex */
+static void
+VHashWait_r(VolumeHashChainHead * head)
+{
+    for (;;) {
+       if (!head->busy)
+           break;
+       assert(pthread_cond_wait(&head->chain_busy_cv, &vol_glock_mutex) == 0);
+    }
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
+/***************************************************/
+/* Volume by Partition List routines               */
+/***************************************************/
+
+/*
+ * demand attach fileserver adds a
+ * linked list of volumes to each
+ * partition object, thus allowing
+ * for quick enumeration of all
+ * volumes on a partition
+ */
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* append vp to the volume list of vp->partition, set
+ * VOL_ON_VBYP_LIST and bump the list length;
+ * does nothing when vp->vol_list is already on a queue */
+static void
+AddVolumeToVByPList_r(Volume * vp)
+{
+    if (!queue_IsNotOnQueue(&vp->vol_list))
+       return;
+
+    queue_Append(&vp->partition->vol_list, &vp->vol_list);
+    V_attachFlags(vp) |= VOL_ON_VBYP_LIST;
+    vp->partition->vol_list.len++;
+}
+
+/* unlink vp from its partition's volume list, clear
+ * VOL_ON_VBYP_LIST and drop the list length;
+ * does nothing when vp->vol_list is not on a queue */
+static void
+DeleteVolumeFromVByPList_r(Volume * vp)
+{
+    if (!queue_IsOnQueue(&vp->vol_list))
+       return;
+
+    queue_Remove(&vp->vol_list);
+    V_attachFlags(vp) &= ~(VOL_ON_VBYP_LIST);
+    vp->partition->vol_list.len--;
+}
+
+/* mark the partition's volume list busy on behalf of the caller;
+ * asserts that the list is not already marked busy */
+static void
+VVByPListBeginExclusive_r(struct DiskPartition * dp)
+{
+    assert(!dp->vol_list.busy);
+    dp->vol_list.busy = 1;
+}
+
+/* clear the busy mark on the partition's volume list and wake
+ * every thread waiting on vol_list.cv;
+ * asserts that the list is currently marked busy */
+static void
+VVByPListEndExclusive_r(struct DiskPartition * dp)
+{
+    assert(dp->vol_list.busy != 0);
+    dp->vol_list.busy = 0;
+    assert(0 == pthread_cond_broadcast(&dp->vol_list.cv));
+}
+
+/* wait for another thread to finish its exclusive ops */
+static void
+VVByPListWait_r(struct DiskPartition * dp)
+{
+    while (dp->vol_list.busy) {
+       assert(pthread_cond_wait(&dp->vol_list.cv, &vol_glock_mutex) == 0);
      }
-    vp->hashid = 0;
  }
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+/***************************************************/
+/* Volume Cache Statistics routines                */
+/***************************************************/
  
  void
  VPrintCacheStats_r(void)
  {
+    afs_uint32 get_hi, get_lo, load_hi, load_lo;
      register struct VnodeClassInfo *vcp;
      vcp = &VnodeClassInfo[vLarge];
      Log("Large vnode cache, %d entries, %d allocs, %d gets (%d reads), %d writes\n", vcp->cacheSize, vcp->allocs, vcp->gets, vcp->reads, vcp->writes);
      vcp = &VnodeClassInfo[vSmall];
      Log("Small vnode cache,%d entries, %d allocs, %d gets (%d reads), %d writes\n", vcp->cacheSize, vcp->allocs, vcp->gets, vcp->reads, vcp->writes);
+    SplitInt64(VStats.hdr_gets, get_hi, get_lo);
+    SplitInt64(VStats.hdr_loads, load_hi, load_lo);
      Log("Volume header cache, %d entries, %d gets, %d replacements\n",
-       VolumeCacheSize, VolumeGets, VolumeReplacements);
+       VStats.hdr_cache_size, get_lo, load_lo);
  }
  
  void
@@ -2067,3 +5966,259 @@ VPrintCacheStats(void)
      VOL_UNLOCK;
  }
  
+#ifdef AFS_DEMAND_ATTACH_FS
+/* convert the 64-bit counter at *x into a double:
+ * (high 32 bits) * c32 + (low 32 bits) */
+static double
+UInt64ToDouble(afs_uint64 * x)
+{
+    /* weight applied to the high word (4 * 1.073741824e9) */
+    static double c32 = 4.0 * 1.073741824 * 1000000000.0;
+    afs_uint32 hi, lo;
+    double result;
+
+    SplitInt64(*x, hi, lo);
+    result = ((double) hi) * c32;
+    result = result + ((double) lo);
+    return result;
+}
+
+/* Format the non-negative value x as a decimal integer string.
+ *
+ * The value is split into three groups of nine decimal digits
+ * (units of 10^18, 10^9 and 1) and leading all-zero groups are
+ * omitted, so small values print without padding.
+ *
+ * x    value to print
+ * buf  destination buffer
+ * len  size of buf in bytes
+ *
+ * returns buf; buf is left untouched when it is NULL or len <= 0 */
+static char *
+DoubleToPrintable(double x, char * buf, int len)
+{
+    static double billion = 1000000000.0;
+    afs_uint32 y[3];
+
+    /* without room for a terminator nothing can be written safely;
+     * the buf[len-1] store below would land outside the buffer */
+    if (buf == NULL || len <= 0)
+       return buf;
+
+    y[0] = (afs_uint32) (x / (billion * billion));
+    y[1] = (afs_uint32) ((x - (((double)y[0]) * billion * billion)) / billion);
+    y[2] = (afs_uint32) (x - ((((double)y[0]) * billion * billion) + (((double)y[1]) * billion)));
+
+    /* the groups are unsigned, so print them with %u rather than %d */
+    if (y[0]) {
+       snprintf(buf, len, "%u%09u%09u", y[0], y[1], y[2]);
+    } else if (y[1]) {
+       snprintf(buf, len, "%u%09u", y[1], y[2]);
+    } else {
+       snprintf(buf, len, "%u", y[2]);
+    }
+
+    /* belt and braces for snprintf variants that do not terminate on truncation */
+    buf[len-1] = '\0';
+    return buf;
+}
+
+/* Log volume hash table statistics and per-partition volume counts.
+ *
+ * flags is a mask of:
+ *   VOL_STATS_PER_CHAIN   one summary line per hash chain
+ *   VOL_STATS_PER_CHAIN2  per-chain min/max/avg/total, which walks
+ *                         every element of each non-empty chain
+ * Global min/max/avg/total figures are always logged.
+ *
+ * Entered and exited with the VOL lock held.  The lock is dropped
+ * while each chain (marked busy for the duration) is examined and
+ * while results are logged. */
+static void
+VPrintExtendedCacheStats_r(int flags)
+{
+    int i;
+    struct stats {
+       double min;
+       double max;
+       double sum;
+       double avg;
+    };
+    struct stats looks, gets, reorders, len;
+    struct stats ch_looks, ch_gets, ch_reorders;
+    char pr_buf[4][32];
+    VolumeHashChainHead *head;
+    Volume *vp, *np;
+
+    /* zero out stats */
+    memset(&looks, 0, sizeof(struct stats));
+    memset(&gets, 0, sizeof(struct stats));
+    memset(&reorders, 0, sizeof(struct stats));
+    memset(&len, 0, sizeof(struct stats));
+    memset(&ch_looks, 0, sizeof(struct stats));
+    memset(&ch_gets, 0, sizeof(struct stats));
+    memset(&ch_reorders, 0, sizeof(struct stats));
+
+    for (i = 0; i < VolumeHashTable.Size; i++) {
+       head = &VolumeHashTable.Table[i];
+
+       /* take the chain exclusively so it can be read with the
+        * VOL lock released */
+       VHashWait_r(head);
+       VHashBeginExclusive_r(head);
+       VOL_UNLOCK;
+
+       ch_looks.sum    = UInt64ToDouble(&head->looks);
+       ch_gets.sum     = UInt64ToDouble(&head->gets);
+       ch_reorders.sum = UInt64ToDouble(&head->reorders);
+
+       /* update global statistics */
+       {
+           looks.sum    += ch_looks.sum;
+           gets.sum     += ch_gets.sum;
+           reorders.sum += ch_reorders.sum;
+           len.sum      += (double)head->len;
+
+           if (i == 0) {
+               /* first chain seeds every min/max */
+               len.min      = (double) head->len;
+               len.max      = (double) head->len;
+               looks.min    = ch_looks.sum;
+               looks.max    = ch_looks.sum;
+               gets.min     = ch_gets.sum;
+               gets.max     = ch_gets.sum;
+               reorders.min = ch_reorders.sum;
+               reorders.max = ch_reorders.sum;
+           } else {
+               if (((double)head->len) < len.min)
+                   len.min = (double) head->len;
+               if (((double)head->len) > len.max)
+                   len.max = (double) head->len;
+               if (ch_looks.sum < looks.min)
+                   looks.min = ch_looks.sum;
+               else if (ch_looks.sum > looks.max)
+                   looks.max = ch_looks.sum;
+               if (ch_gets.sum < gets.min)
+                   gets.min = ch_gets.sum;
+               else if (ch_gets.sum > gets.max)
+                   gets.max = ch_gets.sum;
+               if (ch_reorders.sum < reorders.min)
+                   reorders.min = ch_reorders.sum;
+               else if (ch_reorders.sum > reorders.max)
+                   reorders.max = ch_reorders.sum;
+           }
+       }
+
+       if ((flags & VOL_STATS_PER_CHAIN2) && queue_IsNotEmpty(head)) {
+           /* compute detailed per-chain stats */
+           struct stats hdr_loads, hdr_gets;
+           double v_looks, v_loads, v_gets;
+
+           /* initialize stats with data from first element in chain */
+           vp = queue_First(head, Volume);
+           v_looks = UInt64ToDouble(&vp->stats.hash_lookups);
+           v_loads = UInt64ToDouble(&vp->stats.hdr_loads);
+           v_gets  = UInt64ToDouble(&vp->stats.hdr_gets);
+           ch_gets.min = ch_gets.max = v_looks;
+           hdr_loads.min = hdr_loads.max = v_loads;
+           hdr_gets.min = hdr_gets.max = v_gets;
+           /* the scan below starts at the second element, so the
+            * totals must start from the first element's values */
+           hdr_loads.sum = v_loads;
+           hdr_gets.sum = v_gets;
+
+           vp = queue_Next(vp, Volume);
+
+           /* pull in stats from remaining elements in chain */
+           for (queue_ScanFrom(head, vp, vp, np, Volume)) {
+               v_looks = UInt64ToDouble(&vp->stats.hash_lookups);
+               v_loads = UInt64ToDouble(&vp->stats.hdr_loads);
+               v_gets  = UInt64ToDouble(&vp->stats.hdr_gets);
+
+               hdr_loads.sum += v_loads;
+               hdr_gets.sum += v_gets;
+
+               if (v_looks < ch_gets.min)
+                   ch_gets.min = v_looks;
+               else if (v_looks > ch_gets.max)
+                   ch_gets.max = v_looks;
+
+               if (v_loads < hdr_loads.min)
+                   hdr_loads.min = v_loads;
+               else if (v_loads > hdr_loads.max)
+                   hdr_loads.max = v_loads;
+
+               if (v_gets < hdr_gets.min)
+                   hdr_gets.min = v_gets;
+               else if (v_gets > hdr_gets.max)
+                   hdr_gets.max = v_gets;
+           }
+
+           /* compute per-chain averages
+            * (head->len is non-zero here: the chain is not empty) */
+           ch_gets.avg = ch_gets.sum / ((double)head->len);
+           hdr_loads.avg = hdr_loads.sum / ((double)head->len);
+           hdr_gets.avg = hdr_gets.sum / ((double)head->len);
+
+           /* dump per-chain stats */
+           Log("Volume hash chain %d : len=%d, looks=%s, reorders=%s\n",
+               i, head->len,
+               DoubleToPrintable(ch_looks.sum, pr_buf[0], sizeof(pr_buf[0])),
+               DoubleToPrintable(ch_reorders.sum, pr_buf[1], sizeof(pr_buf[1])));
+           Log("\tVolume gets : min=%s, max=%s, avg=%s, total=%s\n",
+               DoubleToPrintable(ch_gets.min, pr_buf[0], sizeof(pr_buf[0])),
+               DoubleToPrintable(ch_gets.max, pr_buf[1], sizeof(pr_buf[1])),
+               DoubleToPrintable(ch_gets.avg, pr_buf[2], sizeof(pr_buf[2])),
+               DoubleToPrintable(ch_gets.sum, pr_buf[3], sizeof(pr_buf[3])));
+           Log("\tHDR gets : min=%s, max=%s, avg=%s, total=%s\n",
+               DoubleToPrintable(hdr_gets.min, pr_buf[0], sizeof(pr_buf[0])),
+               DoubleToPrintable(hdr_gets.max, pr_buf[1], sizeof(pr_buf[1])),
+               DoubleToPrintable(hdr_gets.avg, pr_buf[2], sizeof(pr_buf[2])),
+               DoubleToPrintable(hdr_gets.sum, pr_buf[3], sizeof(pr_buf[3])));
+           Log("\tHDR loads : min=%s, max=%s, avg=%s, total=%s\n",
+               DoubleToPrintable(hdr_loads.min, pr_buf[0], sizeof(pr_buf[0])),
+               DoubleToPrintable(hdr_loads.max, pr_buf[1], sizeof(pr_buf[1])),
+               DoubleToPrintable(hdr_loads.avg, pr_buf[2], sizeof(pr_buf[2])),
+               DoubleToPrintable(hdr_loads.sum, pr_buf[3], sizeof(pr_buf[3])));
+       } else if (flags & VOL_STATS_PER_CHAIN) {
+           /* dump simple per-chain stats */
+           Log("Volume hash chain %d : len=%d, looks=%s, gets=%s, reorders=%s\n",
+               i, head->len,
+               DoubleToPrintable(ch_looks.sum, pr_buf[0], sizeof(pr_buf[0])),
+               DoubleToPrintable(ch_gets.sum, pr_buf[1], sizeof(pr_buf[1])),
+               DoubleToPrintable(ch_reorders.sum, pr_buf[2], sizeof(pr_buf[2])));
+       }
+
+       VOL_LOCK;
+       VHashEndExclusive_r(head);
+    }
+
+    /* the summary below only touches local totals and
+     * VolumeHashTable.Size, so log it with the lock released */
+    VOL_UNLOCK;
+
+    /* compute global averages */
+    len.avg      = len.sum      / ((double)VolumeHashTable.Size);
+    looks.avg    = looks.sum    / ((double)VolumeHashTable.Size);
+    gets.avg     = gets.sum     / ((double)VolumeHashTable.Size);
+    reorders.avg = reorders.sum / ((double)VolumeHashTable.Size);
+
+    /* dump global stats */
+    Log("Volume hash summary: %d buckets\n", VolumeHashTable.Size);
+    Log(" chain length : min=%s, max=%s, avg=%s, total=%s\n",
+       DoubleToPrintable(len.min, pr_buf[0], sizeof(pr_buf[0])),
+       DoubleToPrintable(len.max, pr_buf[1], sizeof(pr_buf[1])),
+       DoubleToPrintable(len.avg, pr_buf[2], sizeof(pr_buf[2])),
+       DoubleToPrintable(len.sum, pr_buf[3], sizeof(pr_buf[3])));
+    Log(" looks : min=%s, max=%s, avg=%s, total=%s\n",
+       DoubleToPrintable(looks.min, pr_buf[0], sizeof(pr_buf[0])),
+       DoubleToPrintable(looks.max, pr_buf[1], sizeof(pr_buf[1])),
+       DoubleToPrintable(looks.avg, pr_buf[2], sizeof(pr_buf[2])),
+       DoubleToPrintable(looks.sum, pr_buf[3], sizeof(pr_buf[3])));
+    Log(" gets : min=%s, max=%s, avg=%s, total=%s\n",
+       DoubleToPrintable(gets.min, pr_buf[0], sizeof(pr_buf[0])),
+       DoubleToPrintable(gets.max, pr_buf[1], sizeof(pr_buf[1])),
+       DoubleToPrintable(gets.avg, pr_buf[2], sizeof(pr_buf[2])),
+       DoubleToPrintable(gets.sum, pr_buf[3], sizeof(pr_buf[3])));
+    Log(" reorders : min=%s, max=%s, avg=%s, total=%s\n",
+       DoubleToPrintable(reorders.min, pr_buf[0], sizeof(pr_buf[0])),
+       DoubleToPrintable(reorders.max, pr_buf[1], sizeof(pr_buf[1])),
+       DoubleToPrintable(reorders.avg, pr_buf[2], sizeof(pr_buf[2])),
+       DoubleToPrintable(reorders.sum, pr_buf[3], sizeof(pr_buf[3])));
+
+    /* print extended disk related statistics */
+    {
+       struct DiskPartition * diskP;
+       afs_uint32 vol_count[VOLMAXPARTS+1];
+       byte part_exists[VOLMAXPARTS+1];
+       Device id;
+
+       memset(vol_count, 0, sizeof(vol_count));
+       memset(part_exists, 0, sizeof(part_exists));
+
+       /* snapshot the per-partition volume counts under the lock */
+       VOL_LOCK;
+
+       for (diskP = DiskPartitionList; diskP; diskP = diskP->next) {
+           id = diskP->device;
+           /* never index the fixed-size arrays with an out-of-range id */
+           if ((afs_uint32) id > VOLMAXPARTS)
+               continue;
+           vol_count[id] = diskP->vol_list.len;
+           part_exists[id] = 1;
+       }
+
+       VOL_UNLOCK;
+       for (i = 0; i <= VOLMAXPARTS; i++) {
+           if (part_exists[i]) {
+               /* NOTE(review): VGetPartitionById_r() is called here
+                * after VOL_UNLOCK -- confirm that is safe for this
+                * lookup */
+               diskP = VGetPartitionById_r(i, 0);
+               if (diskP) {
+                   /* report the count captured under the lock rather
+                    * than re-reading the live list length */
+                   Log("Partition %s has %d online volumes\n",
+                       VPartitionPath(diskP), (int) vol_count[i]);
+               }
+           }
+       }
+       /* the caller expects the lock to be held on return */
+       VOL_LOCK;
+    }
+
+}
+
+/* locking wrapper: takes the VOL lock, runs
+ * VPrintExtendedCacheStats_r(flags), then releases the lock */
+void
+VPrintExtendedCacheStats(int flags)
+{
+    VOL_LOCK;
+    VPrintExtendedCacheStats_r(flags);
+    VOL_UNLOCK;
+}
+#endif /* AFS_DEMAND_ATTACH_FS */
diff --git a/src/vol/volume.h b/src/vol/volume.h

index c66a09b7c5f4d97ea56c8b7ace4819ead90e3faa..09190bc3109d8e6ed6605df5a756e1d6eeaf54ab 100644 (file)
--- a/src/vol/volume.h
+++ b/src/vol/volume.h
@@ -5,6 +5,8 @@
   * This software has been released under the terms of the IBM Public
   * License.  For details, see the LICENSE file in the top-level source
   * directory or online at http://www.openafs.org/dl/license10.html
+ *
+ * Portions Copyright (c) 2006 Sine Nomine Associates
   */
  
  /*
@@ -24,44 +26,44 @@
  #define VolumeWriteable2(vol)          (vol.type == readwriteVolume)
  typedef bit32 FileOffset;      /* Offset in this file */
  #define Date afs_uint32
+#include "daemon_com.h"
+#include "fssync.h"
  
  #ifdef AFS_PTHREAD_ENV
  #include <assert.h>
  #include <pthread.h>
  extern pthread_mutex_t vol_glock_mutex;
-extern pthread_mutex_t vol_attach_mutex;
-extern pthread_mutex_t vol_fsync_mutex;
  extern pthread_mutex_t vol_trans_mutex;
  extern pthread_cond_t vol_put_volume_cond;
  extern pthread_cond_t vol_sleep_cond;
  extern int vol_attach_threads;
-/* this lock has been deprecated */
-#define VATTACH_LOCK
-#define VATTACH_UNLOCK
  #define VOL_LOCK \
      assert(pthread_mutex_lock(&vol_glock_mutex) == 0)
  #define VOL_UNLOCK \
      assert(pthread_mutex_unlock(&vol_glock_mutex) == 0)
-#define VFSYNC_LOCK \
-    assert(pthread_mutex_lock(&vol_fsync_mutex) == 0)
-#define VFSYNC_UNLOCK \
-    assert(pthread_mutex_unlock(&vol_fsync_mutex) == 0)
+#define VSALVSYNC_LOCK \
+    assert(pthread_mutex_lock(&vol_salvsync_mutex) == 0)
+#define VSALVSYNC_UNLOCK \
+    assert(pthread_mutex_unlock(&vol_salvsync_mutex) == 0)
  #define VTRANS_LOCK \
      assert(pthread_mutex_lock(&vol_trans_mutex) == 0)
  #define VTRANS_UNLOCK \
      assert(pthread_mutex_unlock(&vol_trans_mutex) == 0)
  #else /* AFS_PTHREAD_ENV */
-#define VATTACH_LOCK
-#define VATTACH_UNLOCK
  #define VOL_LOCK
  #define VOL_UNLOCK
-#define VFSYNC_LOCK
-#define VFSYNC_UNLOCK
+#define VSALVSYNC_LOCK
+#define VSALVSYNC_UNLOCK
  #define VTRANS_LOCK
  #define VTRANS_UNLOCK
  #endif /* AFS_PTHREAD_ENV */
  
-typedef enum { fileServer, volumeUtility, salvager } ProgramType;
+typedef enum { fileServer,       /* the fileserver process */
+              volumeUtility,    /* volserver, or a single volume salvager (non-dafs) */
+              salvager,         /* standalone whole-partition salvager */
+              salvageServer,    /* dafs online salvager */
+              debugUtility      /* fssync-debug or similar utility */
+} ProgramType;
  extern ProgramType programType;        /* The type of program using the package */
  
  /* Some initialization parameters for the volume package */
@@ -76,6 +78,70 @@ struct versionStamp {                /* Version stamp for critical volume files */
                                  * that created this file */
  };
  
+#ifdef AFS_DEMAND_ATTACH_FS
+/*
+ * demand attach fs
+ * volume state machine
+ *
+ * these must be contiguous in order for IsValidState() to work correctly 
+ */
+#define VOL_STATE_UNATTACHED     0       /* volume is unattached */
+#define VOL_STATE_PREATTACHED    1       /* volume has been pre-attached */
+#define VOL_STATE_ATTACHING      2       /* volume is transitioning to fully attached */
+#define VOL_STATE_ATTACHED       3       /* volume has been fully attached */
+#define VOL_STATE_UPDATING       4       /* volume is updating on-disk structures */
+#define VOL_STATE_GET_BITMAP     5       /* volume is getting bitmap entries */
+#define VOL_STATE_HDR_LOADING    6       /* volume is loading disk header */
+#define VOL_STATE_HDR_ATTACHING  7       /* volume is getting a header from the LRU */
+#define VOL_STATE_SHUTTING_DOWN  8       /* volume is shutting down */
+#define VOL_STATE_GOING_OFFLINE  9       /* volume is going offline */
+#define VOL_STATE_OFFLINING      10      /* volume is transitioning to offline */
+#define VOL_STATE_DETACHING      11      /* volume is transitioning to detached */
+#define VOL_STATE_SALVSYNC_REQ   12      /* volume is blocked on a salvsync request */
+#define VOL_STATE_SALVAGING      13      /* volume is being salvaged */
+#define VOL_STATE_ERROR          14      /* volume is in an error state */
+#define VOL_STATE_FREED          15      /* debugging aid */
+
+#define VOL_STATE_COUNT          16      /* total number of valid states */
+
+/* V_attachFlags bits */
+#define VOL_HDR_ATTACHED   0x1     /* volume header is attached to Volume struct */
+#define VOL_HDR_LOADED     0x2     /* volume header contents are valid */
+#define VOL_HDR_IN_LRU     0x4     /* volume header is in LRU */
+#define VOL_IN_HASH        0x8     /* volume is in hash table */
+#define VOL_ON_VBYP_LIST   0x10    /* volume is on VByP list */
+#define VOL_IS_BUSY        0x20    /* volume is not to be free()d */
+#define VOL_ON_VLRU        0x40    /* volume is on the VLRU */
+#define VOL_HDR_DONTSALV   0x80    /* volume header DONTSALVAGE flag is set */
+
+/* VPrintExtendedCacheStats flags */
+#define VOL_STATS_PER_CHAIN   0x1  /* compute simple per-chain stats */
+#define VOL_STATS_PER_CHAIN2  0x2  /* compute per-chain stats that require scanning
+                                   * every element of the chain */
+
+/* VLRU_SetOptions options */
+#define VLRU_SET_THRESH       1
+#define VLRU_SET_INTERVAL     2
+#define VLRU_SET_MAX          3
+#define VLRU_SET_ENABLED      4
+
+/* valid VLRU queue names */
+#define VLRU_QUEUE_NEW 0            /* LRU queue for new volumes */
+#define VLRU_QUEUE_MID 1            /* survivor generation */
+#define VLRU_QUEUE_OLD 2            /* old generation */
+#define VLRU_QUEUE_CANDIDATE 3      /* soft detach candidate pool */
+#define VLRU_QUEUE_HELD 4           /* volumes which are not allowed
+                                    * to be soft detached */
+#define VLRU_QUEUE_INVALID 5        /* invalid queue id */
+
+/* default scanner timing parameters */
+#define VLRU_DEFAULT_OFFLINE_THRESH (60*60*2) /* 2 hours */
+#define VLRU_DEFAULT_OFFLINE_INTERVAL (60*2) /* 2 minutes */
+#define VLRU_DEFAULT_OFFLINE_MAX 8 /* 8 volumes */
+
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+
  /* Magic numbers and version stamps for each type of file */
  #define VOLUMEHEADERMAGIC      ((bit32)0x88a1bb3c)
  #define VOLUMEINFOMAGIC                ((bit32)0x78a1b2c5)
@@ -297,8 +363,144 @@ typedef struct VolumeDiskData {
  /**************************************/
  /* Memory resident volume information */
  /**************************************/
+
+/* global volume package stats */
+typedef struct VolPkgStats {
+#ifdef AFS_DEMAND_ATTACH_FS
+    /*
+     * demand attach fs
+     * extended volume package statistics
+     */
+
+    /* levels */
+    afs_uint32 state_levels[VOL_STATE_COUNT];
+
+    /* counters */
+    afs_uint64 hash_looks;           /* number of hash chain element traversals */
+    afs_uint64 hash_reorders;        /* number of hash chain reorders */
+    afs_uint64 salvages;             /* online salvages since fileserver start */
+    afs_uint64 vol_ops;              /* volume operations since fileserver start */
+#endif /* AFS_DEMAND_ATTACH_FS */
+
+    afs_uint64 hdr_loads;            /* header loads from disk */
+    afs_uint64 hdr_gets;             /* header pulls out of LRU */
+    afs_uint64 attaches;             /* volume attaches since fileserver start */
+    afs_uint64 soft_detaches;        /* soft detach ops since fileserver start */
+
+    /* configuration parameters */
+    afs_uint32 hdr_cache_size;       /* size of volume header cache */
+} VolPkgStats;
+extern VolPkgStats VStats;
+
+/*
+ * volume header cache supporting structures
+ */
+#ifdef AFS_DEMAND_ATTACH_FS
+struct volume_hdr_LRU_stats {
+    afs_uint32 free;
+    afs_uint32 used;
+    afs_uint32 attached;
+};
+#endif
+
+struct volume_hdr_LRU_t {
+    struct rx_queue lru;
+#ifdef AFS_DEMAND_ATTACH_FS
+    struct volume_hdr_LRU_stats stats;
+#endif
+};
+extern struct volume_hdr_LRU_t volume_hdr_LRU;
+
+/*
+ * volume hash chain supporting structures
+ *
+ * one VolumeHashChainHead exists per hash bucket; Volume structures
+ * are appended to and removed from the chain by the hash table routines
+ */
+typedef struct VolumeHashChainHead {
+    struct rx_queue queue;     /* queue head for the chain */
+    int len;                   /* chain length; adjusted on every add/delete */
+    /* someday we could put a per-chain lock here... */
+#ifdef AFS_DEMAND_ATTACH_FS
+    int busy;                  /* non-zero between VHashBeginExclusive_r and VHashEndExclusive_r */
+    int cacheCheck;            /* bumped on add, delete and reorder; compared with Volume.chainCacheCheck */
+
+    /* per-chain statistics */
+    afs_uint64 looks;          /* elements visited by VLookupVolume_r scans */
+    afs_uint64 gets;           /* VLookupVolume_r calls that walked this chain */
+    afs_uint64 reorders;       /* VReorderHash_r calls on this chain */
+
+    pthread_cond_t chain_busy_cv; /* waited on in VHashWait_r; broadcast in VHashEndExclusive_r */
+#endif /* AFS_DEMAND_ATTACH_FS */
+} VolumeHashChainHead;
+
+typedef struct VolumeHashTable {
+    int Size;                     /* bucket count; VSetVolHashSize sets it to 1 << logsize */
+    int Mask;                     /* Size - 1, as set by VSetVolHashSize */
+    VolumeHashChainHead * Table;  /* array of Size chain heads, allocated in VInitVolumeHash */
+} VolumeHashTable_t;
+extern VolumeHashTable_t VolumeHashTable;
+
+struct VolumeHashChainStats {
+    afs_int32 table_size;
+    afs_int32 chain_len;
+#ifdef AFS_DEMAND_ATTACH_FS
+    afs_int32 chain_cacheCheck;
+    afs_int32 chain_busy;
+    afs_uint64 chain_looks;
+    afs_uint64 chain_gets;
+    afs_uint64 chain_reorders;
+#endif
+};
+
+
+#ifdef AFS_DEMAND_ATTACH_FS
+/* demand attach fs
+ * extended per-volume statistics 
+ *
+ * please note that this structure lives across the entire
+ * lifetime of the fileserver process
+ */
+typedef struct VolumeStats {
+    /* counters */
+    afs_uint64 hash_lookups;         /* hash table lookups */
+    afs_uint64 hash_short_circuits;  /* short circuited hash lookups (due to cacheCheck) */
+    afs_uint64 hdr_loads;            /* header loads from disk */
+    afs_uint64 hdr_gets;             /* header pulls out of LRU */
+    afs_uint16 attaches;             /* attaches of this volume since fileserver start */
+    afs_uint16 soft_detaches;        /* soft detaches of this volume */
+    afs_uint16 salvages;             /* online salvages since fileserver start */
+    afs_uint16 vol_ops;              /* volume operations since fileserver start */
+
+    /* timestamps */
+    afs_uint32 last_attach;      /* unix timestamp of last VAttach */
+    afs_uint32 last_get;         /* unix timestamp of last VGet/VHold */
+    afs_uint32 last_promote;     /* unix timestamp of last VLRU promote/demote */
+    afs_uint32 last_hdr_get;     /* unix timestamp of last GetVolumeHeader() */
+    afs_uint32 last_salvage;     /* unix timestamp of last initiation of an online salvage */
+    afs_uint32 last_salvage_req; /* unix timestamp of last SALVSYNC request */
+    afs_uint32 last_vol_op;      /* unix timestamp of last volume operation */
+} VolumeStats;
+
+/* demand attach fs
+ * online salvager state */
+typedef struct VolumeOnlineSalvage {
+    afs_uint32 prio;            /* number of VGetVolume's since salvage requested */
+    int reason;                 /* reason for requesting online salvage */
+    byte requested;             /* flag specifying that salvage should be scheduled */
+    byte scheduled;             /* flag specifying whether online salvage scheduled */
+    byte reserved[2];           /* padding */
+} VolumeOnlineSalvage;
+
+/* demand attach fs
+ * volume LRU state */
+typedef struct VolumeVLRUState {
+    struct rx_queue lru;        /* VLRU queue pointers */
+    int idx;                    /* VLRU generation index */
+} VolumeVLRUState;
+
+typedef afs_uint16 VolState;    /* attachment state type */
+#endif /* AFS_DEMAND_ATTACH_FS */
+
  typedef struct Volume {
-    struct Volume *hashNext;   /* Next in hash resolution table */
+    struct rx_queue q;          /* Volume hash chain pointers */
      VolumeId hashid;           /* Volume number -- for hash table lookup */
      struct volHeader *header;  /* Cached disk data */
      Device device;             /* Unix device for the volume */
@@ -339,10 +541,23 @@ typedef struct Volume {
      afs_uint32 updateTime;     /* Time that this volume was put on the updated
                                  * volume list--the list of volumes that will be
                                  * salvaged should the file server crash */
+#ifdef AFS_DEMAND_ATTACH_FS
+    VolState attach_state;      /* what stage of attachment has been completed */
+    afs_uint16 attach_flags;    /* flags related to attachment state */
+    pthread_cond_t attach_cv;   /* state change condition variable */
+    short nWaiters;             /* volume package internal ref count */
+    int chainCacheCheck;        /* Volume hash chain cache check */
+    struct rx_queue vol_list;   /* per-partition volume list (VByPList) */
+
+    VolumeOnlineSalvage salvage;  /* online salvager state */
+    VolumeStats stats;            /* per-volume statistics */
+    VolumeVLRUState vlru;         /* state specific to the VLRU */
+    FSSYNC_VolOp_info * pending_vol_op;  /* fssync command info for any pending vol ops */
+#endif /* AFS_DEMAND_ATTACH_FS */
  } Volume;
  
  struct volHeader {
-    struct volHeader *prev, *next;     /* LRU pointers */
+    struct rx_queue lru;
      VolumeDiskData diskstuff;  /* General volume info read from disk */
      Volume *back;              /* back pointer to current volume structure */
  };
@@ -356,6 +571,11 @@ struct volHeader {
  #define V_vnodeIndex(vp)       ((vp)->vnodeIndex)
  #define V_nextVnodeUnique(vp)  ((vp)->nextVnodeUnique)
  #define V_linkHandle(vp)       ((vp)->linkHandle)
+#ifdef AFS_DEMAND_ATTACH_FS
+#define V_attachState(vp)       ((vp)->attach_state)
+#define V_attachFlags(vp)       ((vp)->attach_flags)
+#define V_attachCV(vp)          ((vp)->attach_cv)
+#endif /* AFS_DEMAND_ATTACH_FS */
  
  /* N.B. V_id must be this, rather than vp->id, or some programs will break, probably */
  #define V_stamp(vp)            ((vp)->header->diskstuff.stamp)
@@ -414,7 +634,7 @@ struct volHeader {
  
  extern char *VSalvageMessage;  /* Canonical message when a volume is forced
                                  * offline */
-extern Volume *VGetVolume(Error * ec, VolId volumeId);
+extern Volume *VGetVolume(Error * ec, Error * client_ec, VolId volumeId);
  extern Volume *VGetVolume_r(Error * ec, VolId volumeId);
  extern void VPutVolume(Volume *);
  extern void VPutVolume_r(Volume *);
@@ -422,6 +642,9 @@ extern void VOffline(Volume * vp, char *message);
  extern void VOffline_r(Volume * vp, char *message);
  extern int VConnectFS(void);
  extern int VConnectFS_r(void);
+extern void VDisconnectFS(void);
+extern void VDisconnectFS_r(void);
+extern int VChildProcReconnectFS(void);
  extern Volume *VAttachVolume(Error * ec, VolumeId volumeId, int mode);
  extern Volume *VAttachVolume_r(Error * ec, VolumeId volumeId, int mode);
  extern Volume *VCreateVolume(Error * ec, char *partname, VolId volumeId,
@@ -431,7 +654,7 @@ extern Volume *VCreateVolume_r(Error * ec, char *partname, VolId volumeId,
  extern VnodeId VAllocBitmapEntry(Error * ec, Volume * vp,
                                  struct vnodeIndex *index);
  extern VnodeId VAllocBitmapEntry_r(Error * ec, Volume * vp,
-                                  struct vnodeIndex *index);
+                                  struct vnodeIndex *index, int flags);
  extern void VFreeBitMapEntry(Error * ec, register struct vnodeIndex *index,
                              unsigned bitNumber);
  extern void VFreeBitMapEntry_r(Error * ec, register struct vnodeIndex *index,
@@ -444,13 +667,13 @@ extern Volume *VAttachVolumeByName_r(Error * ec, char *partition, char *name,
                                      int mode);
  extern void VShutdown(void);
  extern void VUpdateVolume(Error * ec, Volume * vp);
-extern void VUpdateVolume_r(Error * ec, Volume * vp);
+extern void VUpdateVolume_r(Error * ec, Volume * vp, int flags);
  extern void VAddToVolumeUpdateList(Error * ec, Volume * vp);
  extern void VAddToVolumeUpdateList_r(Error * ec, Volume * vp);
  extern void VDetachVolume(Error * ec, Volume * vp);
  extern void VDetachVolume_r(Error * ec, Volume * vp);
  extern void VForceOffline(Volume * vp);
-extern void VForceOffline_r(Volume * vp);
+extern void VForceOffline_r(Volume * vp, int flags);
  extern void VBumpVolumeUsage(register Volume * vp);
  extern void VBumpVolumeUsage_r(register Volume * vp);
  extern void VSetDiskUsage(void);
@@ -459,12 +682,41 @@ extern void VReleaseVnodeFiles_r(Volume * vp);
  extern void VCloseVnodeFiles_r(Volume * vp);
  extern struct DiskPartition *VGetPartition(char *name, int abortp);
  extern struct DiskPartition *VGetPartition_r(char *name, int abortp);
-extern int VInitVolumePackage(ProgramType pt, int nLargeVnodes,
-                             int nSmallVnodes, int connect, int volcache);
+extern int VInitVolumePackage(ProgramType pt, afs_uint32 nLargeVnodes,
+                             afs_uint32 nSmallVnodes, int connect, afs_uint32 volcache);
  extern void DiskToVolumeHeader(VolumeHeader_t * h, VolumeDiskHeader_t * dh);
  extern void VolumeHeaderToDisk(VolumeDiskHeader_t * dh, VolumeHeader_t * h);
  extern void VTakeOffline_r(register Volume * vp);
  extern void VTakeOffline(register Volume * vp);
+extern Volume * VLookupVolume_r(Error * ec, VolId volumeId, Volume * hint);
+
+#ifdef AFS_DEMAND_ATTACH_FS
+extern Volume *VPreAttachVolumeByName(Error * ec, char *partition, char *name, 
+                                      int mode);
+extern Volume *VPreAttachVolumeByName_r(Error * ec, char *partition, char *name,
+                                    int mode);
+extern Volume *VPreAttachVolumeById_r(Error * ec, struct DiskPartition * partp, 
+                                     Volume * vp, int volume_id);
+extern Volume *VGetVolumeByVp_r(Error * ec, Volume * vp);
+extern int VShutdownByPartition_r(struct DiskPartition * dp);
+extern int VShutdownVolume_r(Volume * vp);
+extern int VConnectSALV(void);
+extern int VConnectSALV_r(void);
+extern int VReconnectSALV(void);
+extern int VReconnectSALV_r(void);
+extern int VDisconnectSALV(void);
+extern int VDisconnectSALV_r(void);
+extern void VPrintExtendedCacheStats(int flags);
+extern void VPrintExtendedCacheStats_r(int flags);
+extern VolState VChangeState_r(Volume * vp, VolState new_state);
+extern void VLRU_SetOptions(int option, afs_uint32 val);
+extern int VSetVolHashSize(int logsize);
+extern int VRequestSalvage_r(Volume * vp, int reason, int flags);
+extern int VRegisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo);
+extern int VDeregisterVolOp_r(Volume * vp, FSSYNC_VolOp_info * vopinfo);
+#endif /* AFS_DEMAND_ATTACH_FS */
+extern int VVolOpLeaveOnline_r(Volume * vp, FSSYNC_VolOp_info * vopinfo);
+extern int VVolOpSetVBusy_r(Volume * vp, FSSYNC_VolOp_info * vopinfo);
  
  
  /* Naive formula relating number of file size to number of 1K blocks in file */
@@ -500,6 +752,26 @@ extern void VTakeOffline(register Volume * vp);
                                  * getting the most recent data. */
  
  
+
+/* VUpdateVolume_r flags */
+#define VOL_UPDATE_WAIT          0x1  /* for demand attach, wait for other exclusive ops to end */
+#define VOL_UPDATE_NOFORCEOFF    0x2  /* don't force offline on failure; this prevents infinite
+                                      * recursion between VUpdateVolume_r and VForceOffline_r */
+
+/* VForceOffline_r flags */
+#define VOL_FORCEOFF_NOUPDATE    0x1  /* don't force update on forceoff; this prevents infinite
+                                      * recursion between VUpdateVolume_r and VForceOffline_r */
+
+/* VSyncVolume_r flags */
+#define VOL_SYNC_WAIT            0x1  /* for demand attach, wait for other exclusive ops to end */
+
+/* VAllocBitmapEntry_r flags */
+#define VOL_ALLOC_BITMAP_WAIT    0x1  /* for demand attach, wait for other exclusive ops to end */
+
+/* VRequestSalvage_r flags */
+#define VOL_SALVAGE_INVALIDATE_HEADER 0x1 /* for demand attach fs, invalidate volume header cache */
+
+
  #if    defined(NEARINODE_HINT)
  #define V_pref(vp,nearInode)  nearInodeHash(V_id(vp),(nearInode)); (nearInode) %= V_partition(vp)->f_files
  #else
diff --git a/src/volser/NTMakefile b/src/volser/NTMakefile

index 5e6fa35e939ea413bbdd46ed94ebaae9f8e6da46..ded4d7363443397eca9f74cfebd55daf9409f245 100644 (file)
--- a/src/volser/NTMakefile
+++ b/src/volser/NTMakefile
@@ -5,6 +5,8 @@
  # License.  For details, see the LICENSE file in the top-level source
  # directory or online at http://www.openafs.org/dl/license10.html
  
+AFSDEV_AUXCDEFINES = -DFSSYNC_BUILD_CLIENT
+
  RELDIR=volser
  !INCLUDE ..\config\NTMakefile.$(SYS_NAME)
  !INCLUDE ..\config\NTMakefile.version
diff --git a/src/volser/dumpstuff.c b/src/volser/dumpstuff.c

index 911c35ae44ef1c5d903a65c49c5475f3f324bb11..fc16c527931de79413f58daeb9b0d12d3d6095ed 100644 (file)
--- a/src/volser/dumpstuff.c
+++ b/src/volser/dumpstuff.c
@@ -51,6 +51,7 @@ RCSID
  #include <afs/volume.h>
  #include <afs/partition.h>
  #include "dump.h"
+#include <afs/daemon_com.h>
  #include <afs/fssync.h>
  #include <afs/acl.h>
  #include "volser.h"
diff --git a/src/volser/volprocs.c b/src/volser/volprocs.c

index ae1664fd27b496ae1c19f9de418d018b14d93a24..5bba7c10f502bb18a973a2044daebb73bb07f763 100644 (file)
--- a/src/volser/volprocs.c
+++ b/src/volser/volprocs.c
@@ -61,6 +61,7 @@ RCSID
  #include <afs/volume.h>
  #include <afs/partition.h>
  #include "vol.h"
+#include <afs/daemon_com.h>
  #include <afs/fssync.h>
  #include <afs/acl.h>
  #include "afs/audit.h"
@@ -844,7 +845,7 @@ VolReClone(struct rx_call *acid, afs_int32 atrans, afs_int32 cloneId)
  
      {
         struct DiskPartition *tpartp = originalvp->partition;
-       FSYNC_askfs(cloneId, tpartp->name, FSYNC_RESTOREVOLUME, 0);
+       FSYNC_VolOp(cloneId, tpartp->name, FSYNC_VOL_BREAKCBKS, 0, NULL);
      }
      return 0;
  
@@ -1355,8 +1356,7 @@ VolRestore(struct rx_call *acid, afs_int32 atrans, afs_int32 aflags,
      DFlushVolume(V_parentId(tt->volume)); /* Ensure dir buffers get dropped */
  
      code = RestoreVolume(acid, tt->volume, (aflags & 1), cookie);      /* last is incrementalp */
-    FSYNC_askfs(tt->volid, NULL, FSYNC_RESTOREVOLUME, 0l);     /*break call backs on the
-                                                                * restored volume */
+    FSYNC_VolOp(tt->volid, NULL, FSYNC_VOL_BREAKCBKS, 0l, NULL);
      tt->rxCallPtr = (struct rx_call *)0;
      tcode = TRELE(tt);
  
@@ -1422,7 +1422,7 @@ VolSetForwarding(struct rx_call *acid, afs_int32 atid, afs_int32 anewsite)
      }
      strcpy(tt->lastProcName, "SetForwarding");
      tt->rxCallPtr = acid;
-    FSYNC_askfs(tt->volid, NULL, FSYNC_MOVEVOLUME, anewsite);
+    FSYNC_VolOp(tt->volid, NULL, FSYNC_VOL_MOVE, anewsite, NULL);
      tt->rxCallPtr = (struct rx_call *)0;
      if (TRELE(tt))
         return VOLSERTRELE_ERROR;
@@ -1672,6 +1672,9 @@ XVolListPartitions(struct rx_call *acid, struct partEntries *pEntries)
  
      /* Only report attached partitions */
      for (i = 0; i < VOLMAXPARTS; i++) {
+#ifdef AFS_DEMAND_ATTACH_FS
+       dp = VGetPartitionById(i, 0);
+#else
         if (i < 26) {
             namehead[6] = i + 'a';
             namehead[7] = '\0';
@@ -1682,6 +1685,7 @@ XVolListPartitions(struct rx_call *acid, struct partEntries *pEntries)
             namehead[8] = '\0';
         }
         dp = VGetPartition(namehead, 0);
+#endif
         if (dp)
             partList.partId[j++] = i;
      }
@@ -1792,7 +1796,7 @@ VolListOneVolume(struct rx_call *acid, afs_int32 partid, afs_int32
                 pntr->volid = volid;
                 goto drop;
             }
-           tv = VAttachVolumeByName(&error, pname, volname, V_READONLY);
+           tv = VAttachVolumeByName(&error, pname, volname, V_PEEK);
             if (error) {
                 pntr->status = 0;       /*things are messed up */
                 strcpy(pntr->name, volname);
@@ -2007,7 +2011,7 @@ VolXListOneVolume(struct rx_call *a_rxCidP, afs_int32 a_partID,
             /*
              * Attach the volume, give up on the volume if we can't.
              */
-           tv = VAttachVolumeByName(&error, pname, volname, V_READONLY);
+           tv = VAttachVolumeByName(&error, pname, volname, V_PEEK);
             if (error) {
                 xInfoP->status = 0;     /*things are messed up */
                 strcpy(xInfoP->name, volname);
@@ -2819,7 +2823,7 @@ SAFSVolConvertROtoRWvolume(struct rx_call *acid, afs_int32 partId,
         return EIO;
      }
      close(fd);
-    FSYNC_askfs(volumeId, pname, FSYNC_RESTOREVOLUME, 0);
+    FSYNC_VolOp(volumeId, pname, FSYNC_VOL_BREAKCBKS, 0, NULL);
  
      for (dp = DiskPartitionList; dp && strcmp(dp->name, pname);
          dp = dp->next);
@@ -2854,8 +2858,8 @@ SAFSVolConvertROtoRWvolume(struct rx_call *acid, afs_int32 partId,
      if (unlink(opath) < 0) {
         Log("1 SAFS_VolConvertROtoRWvolume: Couldn't unlink RO header, error = %d\n", error);
      }
-    FSYNC_askfs(volumeId, pname, FSYNC_DONE, 0);
-    FSYNC_askfs(h.id, pname, FSYNC_ON, 0);
+    FSYNC_VolOp(volumeId, pname, FSYNC_VOL_DONE, 0, NULL);
+    FSYNC_VolOp(h.id, pname, FSYNC_VOL_ON, 0, NULL);
      return 0;
  #else /* AFS_NAMEI_ENV */
      return EINVAL;
diff --git a/src/volser/volser.p.h b/src/volser/volser.p.h

index 9e5b015c7cff4b79ed66e575b0cd2d9e7052abe6..e0111f0e4aad275ca2b5d65a0652c42bbe82fee2 100644 (file)
--- a/src/volser/volser.p.h
+++ b/src/volser/volser.p.h
@@ -15,6 +15,8 @@
  #include <pthread.h>
  #endif
  
+#include <afs/voldefs.h>
+
  /* vflags, representing state of the volume */
  #define        VTDeleteOnSalvage       1       /* delete on next salvage */
  #define        VTOutOfService          2       /* never put this volume online */
@@ -110,7 +112,6 @@ extern struct volser_trans *QI_GlobalWriteTrans;
  #define INVALID_BID 0
  #define VOLSER_MAXVOLNAME 65
  #define VOLSER_OLDMAXVOLNAME 32
-#define        VOLMAXPARTS     255
  
  /*flags used for interfacing with the  backup system */
  struct volDescription {                /*used for interfacing with the backup system */
author	Tom Keiser <tkeiser@sinenomine.net>
	Fri, 17 Mar 2006 19:54:26 +0000 (19:54 +0000)
committer	Derrick Brashear <shadow@dementia.org>
	Fri, 17 Mar 2006 19:54:26 +0000 (19:54 +0000)
Makefile.in		patch \| blob \| history
acinclude.m4		patch \| blob \| history
configure.in		patch \| blob \| history
src/auth/Makefile.in		patch \| blob \| history
src/bozo/bos.c		patch \| blob \| history
src/bozo/bosserver.c		patch \| blob \| history
src/bozo/fsbnodeops.c		patch \| blob \| history
src/cf/osconf.m4		patch \| blob \| history
src/config/param.rs_aix51.h		patch \| blob \| history
src/config/param.rs_aix52.h		patch \| blob \| history
src/config/param.rs_aix53.h		patch \| blob \| history
src/config/stds.h		patch \| blob \| history
src/rx/rx_queue.h		patch \| blob \| history
src/tsalvaged/Makefile.in	[new file with mode: 0644]	patch \| blob
src/tsalvaged/salvsync-debug.c	[new file with mode: 0644]	patch \| blob
src/tviced/Makefile.in		patch \| blob \| history
src/tviced/NTMakefile		patch \| blob \| history
src/tviced/serialize_state.c	[new file with mode: 0644]	patch \| blob
src/tviced/serialize_state.h	[new file with mode: 0644]	patch \| blob
src/tviced/state_analyzer.c	[new file with mode: 0644]	patch \| blob
src/tvolser/Makefile.in		patch \| blob \| history
src/util/Makefile.in		patch \| blob \| history
src/util/afsutil_prototypes.h		patch \| blob \| history
src/util/dirpath.c		patch \| blob \| history
src/util/dirpath.hin		patch \| blob \| history
src/util/dirpath_nt.h		patch \| blob \| history
src/util/errors.h		patch \| blob \| history
src/util/strnlen.c	[new file with mode: 0644]	patch \| blob
src/viced/Makefile.in		patch \| blob \| history
src/viced/NTMakefile		patch \| blob \| history
src/viced/afsfileprocs.c		patch \| blob \| history
src/viced/callback.c		patch \| blob \| history
src/viced/callback.h	[new file with mode: 0644]	patch \| blob
src/viced/host.c		patch \| blob \| history
src/viced/host.h		patch \| blob \| history
src/viced/viced.c		patch \| blob \| history
src/viced/viced.h		patch \| blob \| history
src/viced/viced_prototypes.h		patch \| blob \| history
src/vol/Makefile.in		patch \| blob \| history
src/vol/NTMakefile		patch \| blob \| history
src/vol/daemon_com.c	[new file with mode: 0644]	patch \| blob
src/vol/daemon_com.h	[new file with mode: 0644]	patch \| blob
src/vol/fssync-client.c	[new file with mode: 0644]	patch \| blob
src/vol/fssync-debug.c	[new file with mode: 0644]	patch \| blob
src/vol/fssync-server.c	[new file with mode: 0644]	patch \| blob
src/vol/fssync.c	[deleted file]	patch \| blob \| history
src/vol/fssync.h		patch \| blob \| history
src/vol/nuke.c		patch \| blob \| history
src/vol/partition.c		patch \| blob \| history
src/vol/partition.h		patch \| blob \| history
src/vol/purge.c		patch \| blob \| history
src/vol/salvage.h		patch \| blob \| history
src/vol/salvaged.c	[new file with mode: 0644]	patch \| blob
src/vol/salvager.c	[new file with mode: 0644]	patch \| blob
src/vol/salvsync-client.c	[new file with mode: 0644]	patch \| blob
src/vol/salvsync-server.c	[new file with mode: 0644]	patch \| blob
src/vol/salvsync.h	[new file with mode: 0644]	patch \| blob
src/vol/test/listVicepx.c		patch \| blob \| history
src/vol/test/updateDirInode.c		patch \| blob \| history
src/vol/vnode.c		patch \| blob \| history
src/vol/vnode.h		patch \| blob \| history
src/vol/vol-salvage.c		patch \| blob \| history
src/vol/vol-salvage.h	[new file with mode: 0644]	patch \| blob
src/vol/voldefs.h		patch \| blob \| history
src/vol/volinodes.h		patch \| blob \| history
src/vol/volume.c		patch \| blob \| history
src/vol/volume.h		patch \| blob \| history
src/volser/NTMakefile		patch \| blob \| history
src/volser/dumpstuff.c		patch \| blob \| history
src/volser/volprocs.c		patch \| blob \| history
src/volser/volser.p.h		patch \| blob \| history