From 69a28651099e3c3ff9582c3dee04db1db47ac3f7 Mon Sep 17 00:00:00 2001 From: Andrew Deason Date: Thu, 29 Sep 2011 14:49:53 -0500 Subject: [PATCH] DAFS: Do not serialize state for invalid hosts When we serialize host information for DAFS during shutdown, we have no guarantee that the host is in a valid state when we look at it. This can result in a host being saved to disk when we are waiting for the host to respond to an RPC, and so the information about the host is invalid. For example, we can save a host that has the HWHO_INPROGRESS flag set, and when it is restored later, this can cause odd behavior since the flag is set but no thread is actually waiting for the host to respond. So instead, during state serialization, try to determine if a host may be in an invalid state, and simply skip the host if it may. Reviewed-on: http://gerrit.openafs.org/5528 Tested-by: BuildBot Reviewed-by: Derrick Brashear (cherry picked from commit 5c6bd04211d587efde4b0915a62273aafb2d306b) Change-Id: I9bf8cfec80ff9e626777375e94743ac621b52cb3 Reviewed-on: http://gerrit.openafs.org/5546 Tested-by: BuildBot Reviewed-by: Derrick Brashear --- src/tviced/serialize_state.c | 14 ++++++------ src/viced/host.c | 41 ++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/src/tviced/serialize_state.c b/src/tviced/serialize_state.c index 146ba0d8a..cc2962326 100644 --- a/src/tviced/serialize_state.c +++ b/src/tviced/serialize_state.c @@ -182,13 +182,13 @@ fs_stateSave(void) * BUT, this still has one flaw -- what do we do about rx worker threads that * are blocked in the host package making an RPC call to a cm??? * - * perhaps we need a refcounter that keeps track of threads blocked in rpc calls - * with H_LOCK dropped (and the host struct likely left in an inconsistent state) - * - * or better yet, we need to associate a state machine with each host object - * (kind of like demand attach Volume structures). - * - * sigh. I suspect we'll need to revisit this issue + * currently we try to detect if a host struct is in an inconsistent state + * when we go to save it to disk, and just skip the hosts that we think may + * be inconsistent (see h_isBusy_r in host.c). This has the problem of causing + * more InitCallBackState's when we come back up, but the number of hosts in + * such a state should be small. In the future, we could try to lock hosts + * (with some deadline so we don't wait forever) before serializing, but at + * least for now it does not seem worth the trouble. */ if (fs_state.options.fs_state_verify_before_save) { diff --git a/src/viced/host.c b/src/viced/host.c index a93b93bd5..2d9d74046 100644 --- a/src/viced/host.c +++ b/src/viced/host.c @@ -2937,6 +2937,37 @@ static int h_stateVerifyUuidHash(struct fs_dump_state * state, struct host * h); static void h_hostToDiskEntry_r(struct host * in, struct hostDiskEntry * out); static void h_diskEntryToHost_r(struct hostDiskEntry * in, struct host * out); +/** + * Is this host busy? + * + * This is just a hint and should not be trusted; this should probably only be + * used by the host state serialization code when trying to detect if a host + * can be sanely serialized to disk or not. If this function returns 1, the + * host may be in an invalid state and thus should not be saved to disk. + */ +static int +h_isBusy_r(struct host *host) +{ + struct Lock *hostLock = &host->lock; + int locked = 0; + + LOCK_LOCK(hostLock); + if (hostLock->excl_locked || hostLock->readers_reading) { + locked = 1; + } + LOCK_UNLOCK(hostLock); + + if (locked) { + return 1; + } + + if ((host->hostFlags & HWHO_INPROGRESS) || !(host->hostFlags & ALTADDR)) { + /* We shouldn't hit this if the host wasn't locked, but just in case... */ + return 1; + } + + return 0; +} /* this procedure saves all host state to disk for fast startup */ int @@ -3258,6 +3289,16 @@ h_stateSaveHost(struct host * host, void* rock) struct iovec iov[4]; int iovcnt = 2; + if (h_isBusy_r(host)) { + char hoststr[16]; + ViceLog(1, ("Not saving host %s:%d to disk; host appears busy\n", + afs_inet_ntoa_r(host->host, hoststr), (int)ntohs(host->port))); + /* Make sure we don't try to save callbacks to disk for this host, or + * we'll get confused on restore */ + DeleteAllCallBacks_r(host, 1); + return 0; + } + memset(&hdr, 0, sizeof(hdr)); if (state->h_hdr->index_max < host->index) { -- 2.39.5