From: Andrew Deason
Date: Wed, 29 Jul 2009 15:56:34 +0000 (-0400)
Subject: DAFS: avoid shutdown hang during salvage
X-Git-Tag: openafs-devel-1_5_63~55
X-Git-Url: https://git.michaelhowe.org/gitweb/?a=commitdiff_plain;h=64bad257d66b7d70ebdb62a10217818730e2e0a6;p=packages%2Fo%2Fopenafs.git

DAFS: avoid shutdown hang during salvage

Right now, when the fileserver shuts down, it tries to cancel any
demand-salvages in progress. This causes the fileserver to hang, since,
during shutdown, the salvageserver may already be gone.

This avoids cancelling in-progress salvages, and allows salvages to
continue after the fileserver has shut down. To do this, the
salvageserver now closes SALVSYNC sockets in spawned children, so the
children don't get SALVSYNC requests directed at them, since they can't
handle them. The salvageserver also now periodically scans for logs of
salvaging children that have terminated, and incorporates them into
SalsrvLog, just like it does for its own children.

The salvageserver children that actually perform salvages now also set
the inUse volume header field to their programType, so the fileserver
knows not to schedule new salvages for that volume in the event of a
fileserver restart during a salvage.

FIXES 124486

Reviewed-on: http://gerrit.openafs.org/279
Reviewed-by: Derrick Brashear
Tested-by: Derrick Brashear
---

diff --git a/src/vol/salvaged.c b/src/vol/salvaged.c
index 6af6013a3..c382200a0 100644
--- a/src/vol/salvaged.c
+++ b/src/vol/salvaged.c
@@ -165,6 +165,9 @@ static int Reap_Child(char * prog, int * pid, int * status);
 static void * SalvageLogCleanupThread(void *);
 static int SalvageLogCleanup(int pid);
 
+static void * SalvageLogScanningThread(void *);
+static void ScanLogs(struct rx_queue *log_watch_queue);
+
 struct log_cleanup_node {
     struct rx_queue q;
     int pid;
@@ -524,6 +527,10 @@ SalvageServer(void)
                           &attrs,
                           &SalvageLogCleanupThread,
                           NULL) == 0);
+    assert(pthread_create(&tid,
+                          &attrs,
+                          &SalvageLogScanningThread,
+                          NULL) == 0);
 
     /* loop forever serving requests */
     while (1) {
@@ -745,3 +752,115 @@ SalvageLogCleanup(int pid)
 
     return 0;
 }
+
+/* wake up every five minutes to see if a non-child salvage has finished */
+#define SALVAGE_SCAN_POLL_INTERVAL 300
+
+/**
+ * Thread to look for SalvageLog.$pid files that are not from our child
+ * worker salvagers, and notify SalvageLogCleanupThread to clean them
+ * up. This can happen if we restart during salvages, or the
+ * salvageserver crashes or something.
+ *
+ * @param arg unused
+ *
+ * @return always NULL
+ */
+static void *
+SalvageLogScanningThread(void * arg)
+{
+    struct rx_queue log_watch_queue;
+    struct log_cleanup_node * cleanup;
+
+    queue_Init(&log_watch_queue);
+
+    {
+        DIR *dp;
+        struct dirent *dirp;
+        char prefix[AFSDIR_PATH_MAX];
+        size_t prefix_len;
+
+        afs_snprintf(prefix, sizeof(prefix), "%s.", AFSDIR_SLVGLOG_FILE);
+        prefix_len = strlen(prefix);
+
+        dp = opendir(AFSDIR_LOGS_DIR);
+        assert(dp);
+
+        while ((dirp = readdir(dp)) != NULL) {
+            pid_t pid;
+            struct log_cleanup_node *cleanup;
+            int i;
+
+            if (strncmp(dirp->d_name, prefix, prefix_len) != 0) {
+                /* not a salvage logfile; skip */
+                continue;
+            }
+
+            errno = 0;
+            pid = strtol(dirp->d_name + prefix_len, NULL, 10);
+
+            if (errno != 0) {
+                /* file is SalvageLog.<something> but isn't
+                 * a pid, so skip */
+                continue;
+            }
+
+            VOL_LOCK;
+            for (i = 0; i < Parallel; ++i) {
+                if (pid == child_slot[i]) {
+                    break;
+                }
+            }
+            VOL_UNLOCK;
+            if (i < Parallel) {
+                /* this pid is one of our children, so the reaper thread
+                 * will take care of it; skip */
+                continue;
+            }
+
+            cleanup =
+                (struct log_cleanup_node *) malloc(sizeof(struct log_cleanup_node));
+            cleanup->pid = pid;
+
+            queue_Append(&log_watch_queue, cleanup);
+        }
+
+        closedir(dp);
+    }
+
+    ScanLogs(&log_watch_queue);
+
+    while (queue_IsNotEmpty(&log_watch_queue)) {
+        sleep(SALVAGE_SCAN_POLL_INTERVAL);
+        ScanLogs(&log_watch_queue);
+    }
+
+    return NULL;
+}
+
+/**
+ * look through log_watch_queue, and if any processes are not still
+ * running, hand them off to the SalvageLogCleanupThread
+ *
+ * @param log_watch_queue a queue of PIDs that we should clean up if
+ * that PID has died
+ */
+static void
+ScanLogs(struct rx_queue *log_watch_queue)
+{
+    struct log_cleanup_node *cleanup, *next;
+
+    assert(pthread_mutex_lock(&worker_lock) == 0);
+
+    for (queue_Scan(log_watch_queue, cleanup, next, log_cleanup_node)) {
+        /* if a process is still running, assume it's the salvage process
+         * still going, and keep waiting for it */
+        if (kill(cleanup->pid, 0) < 0 && errno == ESRCH) {
+            queue_Remove(cleanup);
+            queue_Append(&log_cleanup_queue, cleanup);
+            assert(pthread_cond_signal(&log_cleanup_queue.queue_change_cv) == 0);
+        }
+    }
+
+    assert(pthread_mutex_unlock(&worker_lock) == 0);
+}
diff --git a/src/vol/salvsync-server.c b/src/vol/salvsync-server.c
index bdc8bd0ff..6eb46af67 100644
--- a/src/vol/salvsync-server.c
+++ b/src/vol/salvsync-server.c
@@ -189,6 +189,9 @@ static struct QueueHead pendingQueue;  /* volumes being salvaged */
  */
 static int partition_salvaging[VOLMAXPARTS+1];
 
+static int HandlerFD[MAXHANDLERS];
+static void (*HandlerProc[MAXHANDLERS]) (int);
+
 #define VSHASH_SIZE 64
 #define VSHASH_MASK (VSHASH_SIZE-1)
 #define VSHASH(vid) ((vid)&VSHASH_MASK)
@@ -292,6 +295,21 @@ SALVSYNC_salvInit(void)
     assert(pthread_create(&tid, &tattr, SALVSYNC_syncThread, NULL) == 0);
 }
 
+static void
+CleanFDs(void)
+{
+    int i;
+    for (i = 0; i < MAXHANDLERS; ++i) {
+        if (HandlerFD[i] >= 0) {
+            SALVSYNC_Drop(HandlerFD[i]);
+        }
+    }
+
+    /* just in case we were in AcceptOff mode, and thus this fd wouldn't
+     * have a handler */
+    close(salvsync_server_state.fd);
+    salvsync_server_state.fd = -1;
+}
 
 static fd_set SALVSYNC_readfds;
 
@@ -304,6 +322,11 @@ SALVSYNC_syncThread(void * args)
     int tid;
     SYNC_server_state_t * state = &salvsync_server_state;
 
+    /* when we fork, the child needs to close the salvsync server sockets,
+     * otherwise, it may get salvsync requests, instead of the parent
+     * salvageserver */
+    assert(pthread_atfork(NULL, NULL, CleanFDs) == 0);
+
     SYNC_getAddr(&state->endpoint, &state->addr);
     SYNC_cleanupSock(state);
 
@@ -362,6 +385,12 @@ SALVSYNC_com(osi_socket fd)
     SALVSYNC_command scom;
     SALVSYNC_response sres;
     SYNC_PROTO_BUF_DECL(buf);
+
+    memset(&com, 0, sizeof(com));
+    memset(&res, 0, sizeof(res));
+    memset(&scom, 0, sizeof(scom));
+    memset(&sres, 0, sizeof(sres));
+    memset(&sres_hdr, 0, sizeof(sres));
 
     com.payload.buf = (void *)buf;
     com.payload.len = SYNC_PROTO_MAX_LEN;
@@ -758,9 +787,6 @@ AcceptOff(void)
 
 /* The multiple FD handling code. */
 
-static int HandlerFD[MAXHANDLERS];
-static void (*HandlerProc[MAXHANDLERS]) (int);
-
 static void
 InitHandler(void)
 {
diff --git a/src/vol/vol-salvage.c b/src/vol/vol-salvage.c
index 2f5866c7c..51bdd158f 100644
--- a/src/vol/vol-salvage.c
+++ b/src/vol/vol-salvage.c
@@ -3237,6 +3237,44 @@ AskOffline(VolumeId volumeId, char * partition)
        Log("AskOffline: request for fileserver to take volume offline failed; salvage aborting.\n");
        Abort("Salvage aborted\n");
     }
+
+#ifdef AFS_DEMAND_ATTACH_FS
+    /* set inUse = programType in the volume header. We do this in case
+     * the fileserver restarts/crashes while we are salvaging.
+     * Otherwise, the fileserver could attach the volume again on
+     * startup while we are salvaging, which would be very bad, or
+     * schedule another salvage while we are salvaging, which would be
+     * annoying. */
+    if (!Testing) {
+        int fd;
+        IHandle_t *h;
+        char name[VMAXPATHLEN];
+        struct VolumeHeader header;
+        struct VolumeDiskHeader diskHeader;
+        struct VolumeDiskData volHeader;
+
+        afs_snprintf(name, sizeof(name), "%s/" VFORMAT, fileSysPathName,
+                     afs_printable_uint32_lu(volumeId));
+
+        fd = afs_open(name, O_RDONLY);
+        assert(fd >= 0);
+        assert(read(fd, &diskHeader, sizeof(diskHeader)) == sizeof(diskHeader));
+        assert(diskHeader.stamp.magic == VOLUMEHEADERMAGIC);
+        close(fd);
+
+        DiskToVolumeHeader(&header, &diskHeader);
+
+        IH_INIT(h, fileSysDevice, header.parent, header.volumeInfo);
+        assert(IH_IREAD(h, 0, (char*)&volHeader, sizeof(volHeader)) == sizeof(volHeader));
+        assert(volHeader.stamp.magic == VOLUMEINFOMAGIC);
+
+        volHeader.inUse = programType;
+
+        assert(IH_IWRITE(h, 0, (char*)&volHeader, sizeof(volHeader)) == sizeof(volHeader));
+
+        IH_RELEASE(h);
+    }
+#endif /* AFS_DEMAND_ATTACH_FS */
 }
 
 void
diff --git a/src/vol/volume.c b/src/vol/volume.c
index d5ed2fc7a..b440e9f7f 100644
--- a/src/vol/volume.c
+++ b/src/vol/volume.c
@@ -353,7 +353,6 @@ static void VVByPListWait_r(struct DiskPartition64 * dp);
 static int VCheckSalvage(register Volume * vp);
 static int VUpdateSalvagePriority_r(Volume * vp);
 static int VScheduleSalvage_r(Volume * vp);
-static int VCancelSalvage_r(Volume * vp, int reason);
 
 /* Volume hash table */
 static void VReorderHash_r(VolumeHashChainHead * head, Volume * pp, Volume * vp);
@@ -1344,9 +1343,10 @@ VShutdownVolume_r(Volume * vp)
 
     switch(V_attachState(vp)) {
     case VOL_STATE_SALVAGING:
-        /* make sure salvager knows we don't want
-         * the volume back */
-        VCancelSalvage_r(vp, SALVSYNC_SHUTDOWN);
+        /* Leave salvaging volumes alone. Any in-progress salvages will
+         * continue working after viced shuts down. This is intentional.
+         */
+
     case VOL_STATE_PREATTACHED:
     case VOL_STATE_ERROR:
         VChangeState_r(vp, VOL_STATE_UNATTACHED);
@@ -4008,6 +4008,35 @@ VRequestSalvage_r(Error * ec, Volume * vp, int reason, int flags)
         vp->salvage.requested = 1;
         vp->salvage.reason = reason;
         vp->stats.last_salvage = FT_ApproxTime();
+        if (VIsSalvager(V_inUse(vp))) {
+            Log("VRequestSalvage: volume %u appears to be salvaging, but we\n", vp->hashid);
+            Log("  didn't request a salvage. Forcing it offline waiting for the\n");
+            Log("  salvage to finish; if you are sure no salvage is running,\n");
+            Log("  run a salvage manually.\n");
+
+            /* make sure neither VScheduleSalvage_r nor
+             * VUpdateSalvagePriority_r try to schedule another salvage */
+            vp->salvage.requested = vp->salvage.scheduled = 0;
+
+            /* these stats aren't correct, but doing this makes them
+             * slightly closer to being correct */
+            vp->stats.salvages++;
+            vp->stats.last_salvage_req = FT_ApproxTime();
+            IncUInt64(&VStats.salvages);
+
+            VChangeState_r(vp, VOL_STATE_ERROR);
+            *ec = VSALVAGE;
+            code = 1;
+
+        } else if (vp->stats.salvages < SALVAGE_COUNT_MAX) {
+            VChangeState_r(vp, VOL_STATE_SALVAGING);
+            *ec = VSALVAGING;
+        } else {
+            Log("VRequestSalvage: volume %u online salvaged too many times; forced offline.\n", vp->hashid);
+            VChangeState_r(vp, VOL_STATE_ERROR);
+            *ec = VSALVAGE;
+            code = 1;
+        }
         if (flags & VOL_SALVAGE_INVALIDATE_HEADER) {
             /* Instead of ReleaseVolumeHeader, we do FreeVolumeHeader()
              * so that the the next VAttachVolumeByVp_r() invocation
@@ -4017,15 +4046,6 @@
              */
             FreeVolumeHeader(vp);
         }
-        if (vp->stats.salvages < SALVAGE_COUNT_MAX) {
-            VChangeState_r(vp, VOL_STATE_SALVAGING);
-            *ec = VSALVAGING;
-        } else {
-            Log("VRequestSalvage: volume %u online salvaged too many times; forced offline.\n", vp->hashid);
-            VChangeState_r(vp, VOL_STATE_ERROR);
-            *ec = VSALVAGE;
-            code = 1;
-        }
     }
     return code;
 }
@@ -4186,57 +4206,6 @@ VScheduleSalvage_r(Volume * vp)
     return ret;
 }
 
-/**
- * ask salvageserver to cancel a scheduled salvage operation.
- *
- * @param[in] vp      pointer to volume object
- * @param[in] reason  SALVSYNC protocol reason code
- *
- * @return operation status
- *    @retval 0 success
- *    @retval 1 request failed
- *
- * @pre VOL_LOCK is held.
- *
- * @post salvageserver is sent a request to cancel the volume salvage.
- *       volume is transitioned to a hard error state.
- *
- * @internal volume package internal use only.
- */
-static int
-VCancelSalvage_r(Volume * vp, int reason)
-{
-    int code, ret = 0;
-
-#ifdef SALVSYNC_BUILD_CLIENT
-    if (vp->salvage.scheduled) {
-        VChangeState_r(vp, VOL_STATE_SALVSYNC_REQ);
-        VOL_UNLOCK;
-
-        /* can't use V_id() since there's no guarantee
-         * we have the disk data header at this point */
-        code = SALVSYNC_SalvageVolume(vp->hashid,
-                                      VPartitionPath(vp->partition),
-                                      SALVSYNC_CANCEL,
-                                      reason,
-                                      0,
-                                      NULL);
-
-        VOL_LOCK;
-        VChangeState_r(vp, VOL_STATE_ERROR);
-
-        if (code == SYNC_OK) {
-            vp->salvage.scheduled = 0;
-            vp->salvage.requested = 0;
-        } else {
-            ret = 1;
-        }
-    }
-#endif /* SALVSYNC_BUILD_CLIENT */
-    return ret;
-}
-
-
 #ifdef SALVSYNC_BUILD_CLIENT
 /**
  * connect to the salvageserver SYNC service.
diff --git a/src/vol/volume_inline.h b/src/vol/volume_inline.h
index ef54b4577..a26a6dccd 100644
--- a/src/vol/volume_inline.h
+++ b/src/vol/volume_inline.h
@@ -11,6 +11,26 @@
 #define _AFS_VOL_VOLUME_INLINE_H 1
 
 #include "volume.h"
 
+/**
+ * tell caller whether the given program type represents a salvaging
+ * program.
+ *
+ * @param type  program type enumeration
+ *
+ * @return whether program state is a salvager
+ *   @retval 0  type is a non-salvaging program
+ *   @retval 1  type is a salvaging program
+ */
+static_inline int
+VIsSalvager(ProgramType type)
+{
+    switch(type) {
+    case salvager:
+    case salvageServer:
+        return 1;
+    }
+    return 0;
+}
+
 /***************************************************/
 /* demand attach fs state machine routines         */
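
Note: the ScanLogs() routine added above relies on the standard POSIX
kill(pid, 0) liveness check: signal 0 delivers nothing but still performs the
existence and permission checks, so ESRCH reliably means the process is gone.
A minimal standalone sketch of that idiom follows (not part of the patch; the
pid value and helper name are purely illustrative):

#include <errno.h>
#include <signal.h>
#include <stdio.h>
#include <sys/types.h>

/* Return 1 if the process no longer exists, 0 if it is still running
 * (or if we merely lack permission to signal it, which means it exists). */
static int
process_has_exited(pid_t pid)
{
    return (kill(pid, 0) < 0 && errno == ESRCH) ? 1 : 0;
}

int
main(void)
{
    pid_t pid = 12345;  /* hypothetical pid parsed from a SalvageLog.<pid> name */

    if (process_has_exited(pid))
        printf("pid %d has exited; its salvage log can be collected\n", (int)pid);
    else
        printf("pid %d still appears to be running; keep polling\n", (int)pid);
    return 0;
}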