From 95b857399d71cb1f6619e625bff256f8c4c72c6a Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Wed, 22 Apr 2015 15:06:12 -0300 Subject: [PATCH] Linux: mmap: Apply recursion check only to recursion cases The CPageWrite flag was originally added to prevent a scenario where a thread doing "writepage" would realize that the cache was too full and that some of its contents need to be written back to the server. Before writing back it would ask the OS to flush any dirty VM associated with the vcache entries that are to be written, to make sure the data is not stale. This flush could itself trigger writeback, leading to deadly recursion. One such scenario is a process doing mmap writes to a file larger than the cache. With some kernel versions and some callers of writepage, this can cause the mapping to be marked as being in an error state, leading to EIO errors passed back to user space. Make the recursion check more specific to only bail when the calling thread is one that was originally seen writing. A list of current writers is maintained instead of a single state flag. This lets other threads (like the flusher thread) go on with writeback to the same file, and limits the WRITEPAGE_ACTIVATE return case to call sites that can deal with it. In testing this helps avoid EIO errors when writing large chunks of data through mmap. Thanks to Yadav Yadavendra for extensive analysis and testing. Change-Id: Ic3136d7050c62e3ffac5e52441171f322b60fe86 Reviewed-on: http://gerrit.openafs.org/11124 Reviewed-by: Daria Brashear Tested-by: BuildBot --- src/afs/LINUX/osi_vcache.c | 3 +++ src/afs/LINUX/osi_vm.c | 2 +- src/afs/LINUX/osi_vnodeops.c | 44 ++++++++++++++++++++++++++++++++---- src/afs/afs.h | 15 +++++++++--- src/afs/afs_vcache.c | 20 ++++++++++++++++ 5 files changed, 76 insertions(+), 8 deletions(-) diff --git a/src/afs/LINUX/osi_vcache.c b/src/afs/LINUX/osi_vcache.c index 391e7d455..e64265f76 100644 --- a/src/afs/LINUX/osi_vcache.c +++ b/src/afs/LINUX/osi_vcache.c @@ -125,6 +125,9 @@ osi_NewVnode(void) tvc->v = ip; #endif + INIT_LIST_HEAD(&tvc->pagewriters); + spin_lock_init(&tvc->pagewriter_lock); + return tvc; } diff --git a/src/afs/LINUX/osi_vm.c b/src/afs/LINUX/osi_vm.c index d9e767d2d..2bea0209b 100644 --- a/src/afs/LINUX/osi_vm.c +++ b/src/afs/LINUX/osi_vm.c @@ -86,7 +86,7 @@ osi_VM_StoreAllSegments(struct vcache *avc) { struct inode *ip = AFSTOV(avc); - if (avc->f.states & CPageWrite) + if (!list_empty(&avc->pagewriters)) return; /* someone already writing */ /* filemap_fdatasync() only exported in 2.4.5 and above */ diff --git a/src/afs/LINUX/osi_vnodeops.c b/src/afs/LINUX/osi_vnodeops.c index 5b9d5aa48..62abc471d 100644 --- a/src/afs/LINUX/osi_vnodeops.c +++ b/src/afs/LINUX/osi_vnodeops.c @@ -2578,10 +2578,27 @@ out: * locked */ static inline int afs_linux_prepare_writeback(struct vcache *avc) { - if (avc->f.states & CPageWrite) { - return AOP_WRITEPAGE_ACTIVATE; + pid_t pid; + struct pagewriter *pw; + + pid = MyPidxx2Pid(MyPidxx); + /* Prevent recursion into the writeback code */ + spin_lock(&avc->pagewriter_lock); + list_for_each_entry(pw, &avc->pagewriters, link) { + if (pw->writer == pid) { + spin_unlock(&avc->pagewriter_lock); + return AOP_WRITEPAGE_ACTIVATE; + } } - avc->f.states |= CPageWrite; + spin_unlock(&avc->pagewriter_lock); + + /* Add ourselves to writer list */ + pw = osi_Alloc(sizeof(struct pagewriter)); + pw->writer = pid; + spin_lock(&avc->pagewriter_lock); + list_add_tail(&pw->link, &avc->pagewriters); + spin_unlock(&avc->pagewriter_lock); + return 0; } @@ -2600,7 +2617,26 @@ afs_linux_dopartialwrite(struct vcache *avc, cred_t *credp) { static inline void afs_linux_complete_writeback(struct vcache *avc) { - avc->f.states &= ~CPageWrite; + struct pagewriter *pw, *store; + pid_t pid; + struct list_head tofree; + + INIT_LIST_HEAD(&tofree); + pid = MyPidxx2Pid(MyPidxx); + /* Remove ourselves from writer list */ + spin_lock(&avc->pagewriter_lock); + list_for_each_entry_safe(pw, store, &avc->pagewriters, link) { + if (pw->writer == pid) { + list_del(&pw->link); + /* osi_Free may sleep so we need to defer it */ + list_add_tail(&pw->link, &tofree); + } + } + spin_unlock(&avc->pagewriter_lock); + list_for_each_entry_safe(pw, store, &tofree, link) { + list_del(&pw->link); + osi_Free(pw, sizeof(struct pagewriter)); + } } /* Writeback a given page syncronously. Called with no AFS locks held */ diff --git a/src/afs/afs.h b/src/afs/afs.h index 2e2799c23..49dee4322 100644 --- a/src/afs/afs.h +++ b/src/afs/afs.h @@ -652,9 +652,7 @@ struct SimpleLocks { #define CBulkStat 0x00020000 /* loaded by a bulk stat, and not ref'd since */ #define CUnlinkedDel 0x00040000 #define CVFlushed 0x00080000 -#ifdef AFS_LINUX22_ENV -#define CPageWrite 0x00200000 /* to detect vm deadlock - linux */ -#elif defined(AFS_SGI_ENV) +#if defined(AFS_SGI_ENV) #define CWritingUFS 0x00200000 /* to detect vm deadlock - used by sgi */ #elif defined(AFS_DARWIN80_ENV) #define CEvent 0x00200000 /* to preclude deadlock when sending events */ @@ -950,8 +948,19 @@ struct vcache { void *vpacRock; /* used to read or write in visible partitions */ #endif afs_uint32 lastBRLWarnTime; /* last time we warned about byte-range locks */ +#ifdef AFS_LINUX26_ENV + spinlock_t pagewriter_lock; + struct list_head pagewriters; /* threads that are writing vm pages */ +#endif }; +#ifdef AFS_LINUX26_ENV +struct pagewriter { + struct list_head link; + pid_t writer; +}; +#endif + #define DONT_CHECK_MODE_BITS 0 #define CHECK_MODE_BITS 1 #define CMB_ALLOW_EXEC_AS_READ 2 /* For the NFS xlator */ diff --git a/src/afs/afs_vcache.c b/src/afs/afs_vcache.c index 45cde7f7f..03dffd2d4 100644 --- a/src/afs/afs_vcache.c +++ b/src/afs/afs_vcache.c @@ -184,6 +184,26 @@ afs_FlushVCache(struct vcache *avc, int *slept) /* remove entry from the volume hash table */ QRemove(&avc->vhashq); +#if defined(AFS_LINUX26_ENV) + { + struct pagewriter *pw, *store; + struct list_head tofree; + + INIT_LIST_HEAD(&tofree); + spin_lock(&avc->pagewriter_lock); + list_for_each_entry_safe(pw, store, &avc->pagewriters, link) { + list_del(&pw->link); + /* afs_osi_Free may sleep so we need to defer it */ + list_add_tail(&pw->link, &tofree); + } + spin_unlock(&avc->pagewriter_lock); + list_for_each_entry_safe(pw, store, &tofree, link) { + list_del(&pw->link); + afs_osi_Free(pw, sizeof(struct pagewriter)); + } + } +#endif + if (avc->mvid) osi_FreeSmallSpace(avc->mvid); avc->mvid = (struct VenusFid *)0; -- 2.39.5