From 6dd052cbab09c95e97c910421cfaf68713e906c5 Mon Sep 17 00:00:00 2001 From: Andrew Deason Date: Thu, 26 Apr 2018 12:27:12 -0500 Subject: [PATCH] afs: Stop looking for dcaches on Get*DSlot errors In various places in the code, we'll be looking for a dslot, calling afs_GetValidDSlot (or afs_GetUnusedDSlot) in a loop. In a few places, we currently keep looking for the dslot when we get an error back, since afs_GetValidDSlot may return successfully for other slots, and we might find the dslot we're looking for. This behavior was introduced in a few commits, including: - commit 2679af76 (afs: Traverse discard/free dslot list if errors) - commit 00fd34a6 (afs: Handle easy GetValidDSlot errors) - commit 9a558660 (afs: Cope with afs_GetValidDSlot errors) This behavior means that if afs_GetValidDSlot/afs_GetUnusedDSlot returns an error for a particular dcache slot, but other slots are okay, then we may still find the dcache we're looking for. However, by far the most common reason that afs_GetValidDSlot/afs_GetUnusedDSlot fails is because our disk cache is completely unusable; it is very rare that only a few slots cannot be used, but others are fine (this would mean that the disk cache was corrupted in oddly specific ways, or there are small isolated errors in the underlying disk). So continuing the dcache search in these situations is not very useful. On Linux, this is most commonly seen by the underlying disk cache i/o calls returning -EINTR, which can happen if a SIGKILL signal is pending for the current process when we try to do the i/o. In this situation, all attempts to read in a dslot from disk will fail; trying other slots or waiting will not improve the situation. Depending on which specific code path encounters an afs_Get*DSlot error, we can then flood the log with "disk cache read error in CacheItems" messages emitted from afs_UFSGetDSlot, since we keep calling afs_Get*DSlot in our loop. The worst offender of this is usually afs_GetDSlotFromList via afs_AllocDCache, since we end up calling afs_GetUnusedDSlot for every single dslot in the free and discard lists. However, our other call sites that are looking for dcaches for a specific file can still generate quite a few of these messages, since we'll end up calling afs_GetValidDSlot for every slot in a dcache hash chain. So to avoid flooding the log in these situations, change most callers of afs_GetValidDSlot and afs_GetUnusedDSlot to stop on the first error, and act like we never found a dcache that we were looking for. This commit also adjusts one caller in afs_ProcessOpCreate, which was not handling errors from afs_GetValidDSlot at all, and changes FlushVolumeData to be able to return error codes. Reviewed-on: https://gerrit.openafs.org/13034 Reviewed-by: Benjamin Kaduk Tested-by: BuildBot (cherry picked from commit 12f4fd2901fee8bf27c2cec97efd3d242c6ff025) Change-Id: I2a9865e510be39d1b5bcb9280419630036c00bef Reviewed-on: https://gerrit.openafs.org/13191 Tested-by: BuildBot Reviewed-by: Andrew Deason Reviewed-by: Joe Gorse Reviewed-by: Mark Vitale Reviewed-by: Michael Meffie Reviewed-by: Benjamin Kaduk --- src/afs/afs_dcache.c | 35 +++++++++++++++++++---------------- src/afs/afs_disconnected.c | 24 +++++++++++++++--------- src/afs/afs_pioctl.c | 13 +++++++------ src/afs/afs_segments.c | 2 +- 4 files changed, 42 insertions(+), 32 deletions(-) diff --git a/src/afs/afs_dcache.c b/src/afs/afs_dcache.c index 6dd79a58b..303bfca28 100644 --- a/src/afs/afs_dcache.c +++ b/src/afs/afs_dcache.c @@ -1133,7 +1133,7 @@ afs_GetDSlotFromList(afs_int32 *indexp) { struct dcache *tdc; - for ( ; *indexp != NULLIDX; indexp = &afs_dvnextTbl[*indexp]) { + if (*indexp != NULLIDX) { tdc = afs_GetUnusedDSlot(*indexp); if (tdc) { osi_Assert(tdc->refCount == 1); @@ -1409,9 +1409,9 @@ afs_TryToSmush(struct vcache *avc, afs_ucred_t *acred, int sync) tdc = afs_GetValidDSlot(index); if (!tdc) { /* afs_TryToSmush is best-effort; we may not actually discard - * everything, so failure to discard a dcache due to an i/o + * everything, so failure to discard dcaches due to an i/o * error is okay. */ - continue; + break; } if (!FidCmp(&tdc->f.fid, &avc->f.fid)) { if (sync) { @@ -1502,13 +1502,14 @@ afs_DCacheMissingChunks(struct vcache *avc) i = afs_dvnextTbl[index]; if (afs_indexUnique[index] == avc->f.fid.Fid.Unique) { tdc = afs_GetValidDSlot(index); - if (tdc) { - if (!FidCmp(&tdc->f.fid, &avc->f.fid)) { - totalChunks--; - } - ReleaseReadLock(&tdc->tlock); - afs_PutDCache(tdc); - } + if (!tdc) { + break; + } + if (!FidCmp(&tdc->f.fid, &avc->f.fid)) { + totalChunks--; + } + ReleaseReadLock(&tdc->tlock); + afs_PutDCache(tdc); } } ReleaseWriteLock(&afs_xdcache); @@ -1561,7 +1562,8 @@ afs_FindDCache(struct vcache *avc, afs_size_t abyte) /* afs_FindDCache is best-effort; we may not find the given * file/offset, so if we cannot find the given dcache due to * i/o errors, that is okay. */ - continue; + index = NULLIDX; + break; } ReleaseReadLock(&tdc->tlock); if (!FidCmp(&tdc->f.fid, &avc->f.fid) && chunk == tdc->f.chunk) { @@ -1908,12 +1910,13 @@ afs_GetDCache(struct vcache *avc, afs_size_t abyte, if (afs_indexUnique[index] == avc->f.fid.Fid.Unique) { tdc = afs_GetValidDSlot(index); if (!tdc) { - /* we got an i/o error when trying to get the given dslot, - * but do not bail out just yet; it is possible the dcache - * we're looking for is elsewhere, so it doesn't matter if - * we can't load this one. */ + /* we got an i/o error when trying to get the given dslot. + * it's possible the dslot we're looking for is elsewhere, + * but most likely the disk cache is currently unusable, so + * all afs_GetValidDSlot calls will fail, so just bail out. */ dslot_error = 1; - continue; + index = NULLIDX; + break; } ReleaseReadLock(&tdc->tlock); /* diff --git a/src/afs/afs_disconnected.c b/src/afs/afs_disconnected.c index fd6abe2c5..928133885 100644 --- a/src/afs/afs_disconnected.c +++ b/src/afs/afs_disconnected.c @@ -72,13 +72,15 @@ afs_FindDCacheByFid(struct VenusFid *afid) for (index = afs_dvhashTbl[i]; index != NULLIDX;) { if (afs_indexUnique[index] == afid->Fid.Unique) { tdc = afs_GetValidDSlot(index); - if (tdc) { - ReleaseReadLock(&tdc->tlock); - if (!FidCmp(&tdc->f.fid, afid)) { - break; /* leaving refCount high for caller */ - } - afs_PutDCache(tdc); - } + if (!tdc) { + index = NULLIDX; + break; + } + ReleaseReadLock(&tdc->tlock); + if (!FidCmp(&tdc->f.fid, afid)) { + break; /* leaving refCount high for caller */ + } + afs_PutDCache(tdc); } index = afs_dvnextTbl[index]; } @@ -859,6 +861,11 @@ afs_ProcessOpCreate(struct vcache *avc, struct vrequest *areq, for (index = afs_dvhashTbl[hash]; index != NULLIDX; index = hash) { hash = afs_dvnextTbl[index]; tdc = afs_GetValidDSlot(index); + if (!tdc) { + ReleaseWriteLock(&afs_xdcache); + code = EIO; + goto end; + } ReleaseReadLock(&tdc->tlock); if (afs_indexUnique[index] == avc->f.fid.Fid.Unique) { if (!FidCmp(&tdc->f.fid, &avc->f.fid)) { @@ -879,8 +886,7 @@ afs_ProcessOpCreate(struct vcache *avc, struct vrequest *areq, memcpy(&tdc->f.fid, &newFid, sizeof(struct VenusFid)); } /* if fid match */ } /* if uniquifier match */ - if (tdc) - afs_PutDCache(tdc); + afs_PutDCache(tdc); } /* for all dcaches in this hash bucket */ ReleaseWriteLock(&afs_xdcache); diff --git a/src/afs/afs_pioctl.c b/src/afs/afs_pioctl.c index 6c8da541c..cfc79a2b7 100644 --- a/src/afs/afs_pioctl.c +++ b/src/afs/afs_pioctl.c @@ -3444,7 +3444,7 @@ DECL_PIOCTL(PSetCellStatus) return 0; } -static void +static int FlushVolumeData(struct VenusFid *afid, afs_ucred_t * acred) { afs_int32 i; @@ -3455,6 +3455,7 @@ FlushVolumeData(struct VenusFid *afid, afs_ucred_t * acred) afs_int32 cell = 0; afs_int32 volume = 0; struct afs_q *tq, *uq; + int code = 0; #ifdef AFS_DARWIN80_ENV vnode_t vp; #endif @@ -3523,7 +3524,8 @@ FlushVolumeData(struct VenusFid *afid, afs_ucred_t * acred) continue; /* never had any data */ tdc = afs_GetValidDSlot(i); if (!tdc) { - continue; + code = EIO; + break; } if (tdc->refCount <= 1) { /* too high, in use by running sys call */ ReleaseReadLock(&tdc->tlock); @@ -3564,6 +3566,7 @@ FlushVolumeData(struct VenusFid *afid, afs_ucred_t * acred) /* probably, a user is doing this, probably, because things are screwed up. * maybe it's the dnlc's fault? */ osi_dnlc_purge(); + return code; } /*! @@ -3594,8 +3597,7 @@ DECL_PIOCTL(PFlushVolumeData) if (!afs_resourceinit_flag) /* afs daemons haven't started yet */ return EIO; /* Inappropriate ioctl for device */ - FlushVolumeData(&avc->f.fid, *acred); - return 0; + return FlushVolumeData(&avc->f.fid, *acred); } /*! @@ -3625,8 +3627,7 @@ DECL_PIOCTL(PFlushAllVolumeData) if (!afs_resourceinit_flag) /* afs daemons haven't started yet */ return EIO; /* Inappropriate ioctl for device */ - FlushVolumeData(NULL, *acred); - return 0; + return FlushVolumeData(NULL, *acred); } /*! diff --git a/src/afs/afs_segments.c b/src/afs/afs_segments.c index 5a32da4b0..fc1903436 100644 --- a/src/afs/afs_segments.c +++ b/src/afs/afs_segments.c @@ -380,7 +380,7 @@ afs_StoreAllSegments(struct vcache *avc, struct vrequest *areq, * be updated so we don't be able to use that cached * chunk in the future. That's inefficient, but not * an error. */ - continue; + break; } ReleaseReadLock(&tdc->tlock); -- 2.39.5