From e4c8f208d66ff5f0e07e0eabc63f4601e6d941d9 Mon Sep 17 00:00:00 2001
From: Eygene Ryabinkin
Date: Tue, 17 Jan 2012 18:46:41 +0400
Subject: [PATCH 2/2] NULLFS: fix panics when lowervp is locked with LK_SHARED

null_nodeget() is designed to be racy when the underlying vnode is
locked in shared mode.  But a thread that loses the race will vrele(9)
the null vnode it created with v_vnlock pointing to the native v_lock
instead of lowervp's lock.  This makes null_reclaim() panic, because it
tries to lock vp->v_lock exclusively while VOP_RECLAIM is already
called with v_vnlock held.

The bug only shows up when the underlying vnode is locked LK_SHARED;
this is the case when an NFS client mounts with readdir+ enabled and
the server side uses nullfs as the local filesystem.

We could teach null_reclaim() to cope with this case, but I am not a
big fan of that:
 - it would create a tight dependency between null_nodeget() and
   null_reclaim(); this bug proved that to be a bad idea;
 - every racing thread would still go through the full vnode machinery
   independently, only for one thread to succeed and the others to push
   their freshly created vnodes into the recycling path.

The new code is conditionalized on the locking state of the lower
vnode, so it adds no overhead for exclusively locked lower vnodes
(currently the majority of cases), yet correctly handles shared locks
entirely within nullfs.

Nullfs hash locking is now done per lowervp hash bucket: NNULLNODECACHE
mutexes protect each bucket set independently.  This lowers lock
contention in null_nodeget() without adding side effects.

Signed-off-by: Eygene Ryabinkin
---
 sys/fs/nullfs/null_subr.c  | 550 +++++++++++++++++++++++++++++++++++---------
 sys/fs/nullfs/null_vnops.c |  11 +-
 2 files changed, 448 insertions(+), 113 deletions(-)

diff --git a/sys/fs/nullfs/null_subr.c b/sys/fs/nullfs/null_subr.c
index aea98f3..bd12b00 100644
--- a/sys/fs/nullfs/null_subr.c
+++ b/sys/fs/nullfs/null_subr.c
@@ -50,25 +50,111 @@
 #define NNULLNODECACHE 16
 
 /*
- * Null layer cache:
- * Each cache entry holds a reference to the lower vnode
- * along with a pointer to the alias vnode.  When an
- * entry is added the lower vnode is VREF'd.  When the
- * alias is removed the lower vnode is vrele'd.
+ * Holds the number of waiters and the wait state for
+ * the new nullfs node allocation (combination of mountpoint
+ * and lower vnode).
  */
+struct null_waiters {
+	short count;
+	short flags;
+	struct mount *mp;
+	struct vnode *lowervp;
+	LIST_ENTRY(null_waiters) entries;
+#define	WAITERS_OWNED	0x01	/* Have active allocation thread */
+};
+
+/* Macros to manipulate waiters ownership */
+#define	GRAB_WAITERS(w)		((w)->flags |= WAITERS_OWNED)
+#define	RELEASE_WAITERS(w)	((w)->flags &= ~WAITERS_OWNED)
+
+
+/*
+ * Null layer caches
+ * =================
+ *
+ * We have two caches:
+ *  - null vnode cache that holds fully-initialized vnodes;
+ *  - waiters cache: it identifies vnodes that are currently
+ *    being initialized and counts the number of threads that
+ *    wait for the creation to be completed.
+ *
+ * Each null cache entry holds a reference to the lower vnode
+ * and a reference to the nullfs mount point this vnode belongs to;
+ * these two form the lookup key for the cache.
+ *
+ * All caches are technically hashes (see hashinit(9)); each
+ * hash bucket is protected with its own mutex that is taken
+ * from the mutex array null_hashmtx[].
Each mutex protects: + * - null vnode cache bucket; + * - the corresponding waiters cache bucket and the + * contents of each waiter structure inside it: + * we can't afford to have mutex for every single waiter list. + * + * + * Reference counting for vnodes is done in the following way: + * + * - when the lowervp is associated with the nullfs vnode + * for the first time (e.g. when new nullfs vnode is created), + * it is vref'ed (more precisely, it should already be vref'ed + * before it is passed to null_nodeget); + * + * - when the nullfs vnode is referenced (taken from the cache) + * more times, the associated lowervp is not vref'ed; + * + * - when the nullfs vnode is reclaimed (thus, the last reference + * to it was gone), lowervp is released. + * + * Put simply, nullfs vnode is properly refcounted and its lowervp + * is referenced once when nullfs node is created and released + * once when nullfs node is completely destroyed. + * + */ + +#define NULL_NHASH_IDX(vp) \ + ((((uintptr_t)vp)>>LOG2_SIZEVNODE) & null_node_hash) #define NULL_NHASH(vp) \ - (&null_node_hashtbl[(((uintptr_t)vp)>>LOG2_SIZEVNODE) & null_node_hash]) + (&null_node_hashtbl[NULL_NHASH_IDX(vp)]) + +#define NULL_WHASH(vp) \ + (&null_waiters_hashtbl[NULL_NHASH_IDX(vp)]) + +#define NULL_MHASH(vp) \ + (&null_hashmtx[NULL_NHASH_IDX(vp)]) static LIST_HEAD(null_node_hashhead, null_node) *null_node_hashtbl; -static u_long null_node_hash; -struct mtx null_hashmtx; +static LIST_HEAD(null_waiters_hashhead, null_waiters) *null_waiters_hashtbl; +static u_long null_node_hash, null_waiters_hash; + +struct mtx null_hashmtx[NNULLNODECACHE]; + static MALLOC_DEFINE(M_NULLFSHASH, "nullfs_hash", "NULLFS hash table"); MALLOC_DEFINE(M_NULLFSNODE, "nullfs_node", "NULLFS vnode private part"); -static struct vnode * null_hashget(struct mount *, struct vnode *); -static struct vnode * null_hashins(struct mount *, struct null_node *); + +static struct vnode * null_hashlookup(struct null_node_hashhead *_hd, + struct mount *_mp, struct vnode *_lowervp); +static void null_hashinsert(struct null_node_hashhead *_hd, + struct null_node *_xp); + +static struct null_waiters * +null_waiterslookup(struct null_waiters_hashhead *_hd, + struct mount *_mp, struct vnode *_lowervp); +static inline void null_waitersinsert(struct null_waiters_hashhead *_hd, + struct null_waiters *_entry); +static inline void null_waitersremove(struct null_waiters *_entry); + +static inline void waiters_ref(struct null_waiters *_waiters); +static inline void waiters_unref(struct null_waiters *_waiters); +static inline void waiters_alloc_thread_failed(struct null_waiters *_waiters); + +static int +null_do_waiters(struct null_waiters_hashhead *_waiters_bucket, + struct null_node_hashhead *_node_bucket, + struct mount *_mp, struct vnode *_lowervp, struct mtx *_hash_mtx, + struct null_waiters **_waitersp, struct vnode **_vpp); + /* * Initialise cache headers @@ -77,10 +163,17 @@ int nullfs_init(vfsp) struct vfsconf *vfsp; { + size_t i; NULLFSDEBUG("nullfs_init\n"); /* printed during system boot */ - null_node_hashtbl = hashinit(NNULLNODECACHE, M_NULLFSHASH, &null_node_hash); - mtx_init(&null_hashmtx, "nullhs", NULL, MTX_DEF); + null_node_hashtbl = hashinit(NNULLNODECACHE, + M_NULLFSHASH, &null_node_hash); + null_waiters_hashtbl = hashinit(NNULLNODECACHE, + M_NULLFSHASH, &null_waiters_hash); + for (i = 0; i < NNULLNODECACHE; i++) { + mtx_init(&null_hashmtx[i], "nullhs", + NULL, MTX_DEF); + } return (0); } @@ -88,86 +181,147 @@ int nullfs_uninit(vfsp) struct vfsconf *vfsp; { - - 
mtx_destroy(&null_hashmtx); - hashdestroy(null_node_hashtbl, M_NULLFSHASH, null_node_hash); + size_t i; + + for (i = 0; i < NNULLNODECACHE; i++) + mtx_destroy(&null_hashmtx[i]); + hashdestroy(null_node_hashtbl, + M_NULLFSHASH, null_node_hash); + hashdestroy(null_waiters_hashtbl, + M_NULLFSHASH, null_waiters_hash); return (0); } + + +/* + * Operations with null node hash + */ + + /* - * Return a VREF'ed alias for lower vnode if already exists, else 0. - * Lower vnode should be locked on entry and will be left locked on exit. + * Looks up the given vnode/mountpoint combination in vnode hash. + * + * Must be called with locked lowervp and acquired mutex for the + * given null hashtable bucket. */ static struct vnode * -null_hashget(mp, lowervp) - struct mount *mp; - struct vnode *lowervp; +null_hashlookup(struct null_node_hashhead *hd, struct mount *mp, + struct vnode *lowervp) { - struct null_node_hashhead *hd; struct null_node *a; struct vnode *vp; - ASSERT_VOP_LOCKED(lowervp, "null_hashget"); + ASSERT_VOP_LOCKED(lowervp, "null_hashlookup"); - /* - * Find hash base, and then search the (two-way) linked - * list looking for a null_node structure which is referencing - * the lower vnode. If found, the increment the null_node - * reference count (but NOT the lower vnode's VREF counter). - */ - hd = NULL_NHASH(lowervp); - mtx_lock(&null_hashmtx); LIST_FOREACH(a, hd, null_hash) { - if (a->null_lowervp == lowervp && NULLTOV(a)->v_mount == mp) { + if (a->null_lowervp == lowervp && + NULLTOV(a)->v_mount == mp) { /* * Since we have the lower node locked the nullfs - * node can not be in the process of recycling. If - * it had been recycled before we grabed the lower - * lock it would not have been found on the hash. + * node can not be in the process of recycling. + * If it had been recycled before we grabbed + * the lower lock it would not have been found + * on the hash. */ vp = NULLTOV(a); - vref(vp); - mtx_unlock(&null_hashmtx); return (vp); } } - mtx_unlock(&null_hashmtx); return (NULLVP); } + /* - * Act like null_hashget, but add passed null_node to hash if no existing - * node found. + * Inserts the given null vnode to the hashtable. + * + * Must be called with locked lowervp and acquired mutex for the + * given null hashtable bucket. */ -static struct vnode * -null_hashins(mp, xp) - struct mount *mp; - struct null_node *xp; +static void +null_hashinsert(struct null_node_hashhead *hd, struct null_node *xp) { - struct null_node_hashhead *hd; - struct null_node *oxp; - struct vnode *ovp; - - hd = NULL_NHASH(xp->null_lowervp); - mtx_lock(&null_hashmtx); - LIST_FOREACH(oxp, hd, null_hash) { - if (oxp->null_lowervp == xp->null_lowervp && - NULLTOV(oxp)->v_mount == mp) { - /* - * See null_hashget for a description of this - * operation. - */ - ovp = NULLTOV(oxp); - vref(ovp); - mtx_unlock(&null_hashmtx); - return (ovp); - } - } + ASSERT_VOP_LOCKED(xp->null_lowervp, "null_hashinsert"); + LIST_INSERT_HEAD(hd, xp, null_hash); - mtx_unlock(&null_hashmtx); - return (NULLVP); } + +/* + * Remove node from hash. + * + * Must be called with non-locked mutex + * for the given null hashtable bucket. + */ +void +null_hashrem(struct null_node *xp) +{ + struct mtx *hash_mtx = NULL_MHASH(xp->null_lowervp); + + mtx_lock(hash_mtx); + LIST_REMOVE(xp, null_hash); + mtx_unlock(hash_mtx); +} + + + +/* + * Operations with null waiters hash. + */ + + +/* + * Looks up the given vnode/mountpoint combination in + * the vnode waiters hash. 
+ * + * Must be called with locked lowervp and acquired mutex for the + * given null hashtable bucket. + */ +static struct null_waiters * +null_waiterslookup(struct null_waiters_hashhead *hd, + struct mount *mp, struct vnode *lowervp) +{ + struct null_waiters *a; + + ASSERT_VOP_LOCKED(lowervp, "null_waiterslookup"); + + LIST_FOREACH(a, hd, entries) { + if (a->lowervp == lowervp && a->mp == mp) + return (a); + } + return (NULL); +} + + +/* + * Inserts waiters descriptor into the hash bucket. + * + * Must be called with locked lowervp and acquired mutex for the + * given null hashtable bucket. + */ +static inline void +null_waitersinsert(struct null_waiters_hashhead *hd, + struct null_waiters *entry) +{ + ASSERT_VOP_LOCKED(entry->lowervp, "null_waitersinsert"); + + LIST_INSERT_HEAD(hd, entry, entries); +} + + +/* + * Remove waiters from hash. + * + * Must be called with acquired mutex + * for the given null hashtable bucket. + */ +static inline void +null_waitersremove(struct null_waiters *entry) +{ + LIST_REMOVE(entry, entries); +} + + static void null_insmntque_dtr(struct vnode *vp, void *xp) { @@ -182,41 +336,176 @@ null_insmntque_dtr(struct vnode *vp, void *xp) vput(vp); } + +/* + * A helper for null_nodeget to deal with waiters machinery. + * + * Checks if we have to wait for another thread to allocate + * the new null vnode or to allocate it ourselves (become the + * allocating thread). + * + * The passed hash_mtx must correspond to the mutex that protects + * the given waiters_bucket and must be locked. + * + * Non-zero return code means failure; code from errno(2) will + * be returned in this case. + * + * For zero return code: + * + * - non-NULL *vpp means that we had found the valid null vnode + * in the cache (after waiting for the master allocating thread), + * so it can be returned to our caller. + * + * - otherwise *waitersp will hold the pointer to the valid + * waiters object and we are the allocating thread by ourself; + * waiters count for this lowervp/mp combo will be increased + * in this case. + */ +static int +null_do_waiters(struct null_waiters_hashhead *waiters_bucket, + struct null_node_hashhead *node_bucket, + struct mount *mp, struct vnode *lowervp, struct mtx *hash_mtx, + struct null_waiters **waitersp, struct vnode **vpp) +{ + struct null_waiters *waiters; + + *waitersp = NULL; + *vpp = NULL; + + waiters = null_waiterslookup(waiters_bucket, mp, lowervp); + if (waiters == NULL) { + /* Allocate new waiter. */ + waiters = (struct null_waiters *)malloc(sizeof(*waiters), + M_NULLFSNODE, M_NOWAIT); + if (waiters == NULL) + return (ENOMEM); + waiters->flags = WAITERS_OWNED; + waiters->count = -1; + waiters->mp = mp; + waiters->lowervp = lowervp; + waiters_ref(waiters); + null_waitersinsert(waiters_bucket, waiters); + } else { + /* Avoid lost wakeups by bounding our sleep */ + int timo = (hz < 100 ? 1 : hz / 100); + + waiters_ref(waiters); + while(waiters->flags & WAITERS_OWNED) { + mtx_sleep(waiters, hash_mtx, 0, + "0_nget", timo); + } + + *vpp = null_hashlookup(node_bucket, mp, lowervp); + if (*vpp != NULL) { + waiters_unref(waiters); + return (0); + } + /* + * The allocating thread had failed. + * Become the allocating thread. + */ + GRAB_WAITERS(waiters); + } + + *waitersp = waiters; + return (0); +} + + /* * Make a new or get existing nullfs node. - * Vp is the alias vnode, lowervp is the lower vnode. + * + * The lowervp assumed to be locked and having "spare" reference + * (being vref'ed by the caller of null_nodeget). 
This routine will + * vrele(lowervp) if nullfs node was taken from hash, in accordance + * with the refcounting rules outlined above at "Null layer caches". * - * The lowervp assumed to be locked and having "spare" reference. This routine - * vrele lowervp if nullfs node was taken from hash. Otherwise it "transfers" - * the caller's "spare" reference to created nullfs vnode. + * In the case of creating new nullfs node we will not put multiple + * threads that want to create the same node into the race in that + * every thread will do the initialization on its own and the winner + * will be determined at the time of new node insertion into the null + * vnode hash. Instead, the first thread that will result in creation + * of the new node will be allowed to do it and the other ones will + * wait for it (see null_do_waiters() for the implementation). + * + * This simplifies the handling of race losers, since in the former + * case they will need to release the created nullfs nodes that are + * not completely initialized (they can't hold the reference to the + * lowervp * and can't use its v_lock as v_vnlock), so this will + * require the additional processing in null_reclaim and, possibly, + * other routines. + * + * + * IMPLEMENTATION DETAILS + * + * The first allocating thread should make the others to wait + * for the successful allocation. And it must also vref() + * the new node for the waiters before it will return to avoid the + * cases when waiters will get the vnode that was already released + * or is in the process of being recycled. Alternatively, all + * waiters must be waked up before null_nodeget of the allocating + * thread will return, but that's hard to accomplish, because + * we can't rely that the wakeup will be instantly delivered. */ int -null_nodeget(mp, lowervp, vpp) - struct mount *mp; - struct vnode *lowervp; - struct vnode **vpp; +null_nodeget(struct mount *mp, struct vnode *lowervp, + struct vnode **vpp) { + struct null_waiters *waiters; struct null_node *xp; struct vnode *vp; - int error; + struct mtx *hash_mtx; + struct null_node_hashhead *node_bucket; + struct null_waiters_hashhead *waiters_bucket; + int i, error; + +#define WAITERS_ALLOC_THREAD_FAILED(w, mtx) do { \ + if ((w)) { \ + mtx_lock((mtx)); \ + waiters_alloc_thread_failed((w)); \ + mtx_unlock((mtx)); \ + } \ +} while (0) ASSERT_VOP_LOCKED(lowervp, "lowervp"); - KASSERT(lowervp->v_usecount >= 1, ("Unreferenced vnode %p\n", lowervp)); + KASSERT(lowervp->v_usecount >= 1, + ("Unreferenced vnode %p\n", lowervp)); + + node_bucket = NULL_NHASH(lowervp); + waiters_bucket = NULL_WHASH(lowervp); + hash_mtx = NULL_MHASH(lowervp); + + mtx_lock(hash_mtx); - /* Lookup the hash firstly */ - *vpp = null_hashget(mp, lowervp); + /* Do hash lookup and return the existing vnode (if any) */ + *vpp = null_hashlookup(node_bucket, mp, lowervp); if (*vpp != NULL) { + mtx_unlock(hash_mtx); + vref(*vpp); vrele(lowervp); return (0); } /* - * We do not serialize vnode creation, instead we will check for - * duplicates later, when adding new vnode to hash. - * - * Note that duplicate can only appear in hash if the lowervp is - * locked LK_SHARED. + * Since lowervp is passed to us locked, we can't have + * another active thread racing for the same lowervp + * if the lock is exclusive. Thus we can skip + * the waiters dance if lowervp locked with LK_EXCLUSIVE. 
*/ + if (VOP_ISLOCKED(lowervp) == LK_EXCLUSIVE) { + waiters = NULL; + } else { + int rc = + null_do_waiters(waiters_bucket, node_bucket, + mp, lowervp, hash_mtx, &waiters, vpp); + if (rc != 0 || *vpp != NULL) { + mtx_unlock(hash_mtx); + vrele(lowervp); + return (rc); + } + } + + mtx_unlock(hash_mtx); /* * Do the MALLOC before the getnewvnode since doing so afterward @@ -226,8 +515,14 @@ null_nodeget(mp, lowervp, vpp) xp = malloc(sizeof(struct null_node), M_NULLFSNODE, M_WAITOK); + #if 0 + /* DEBUG: slow down our code to allow races to come up */ + pause("0pause", hz / 10); + #endif + error = getnewvnode("null", mp, &null_vnodeops, &vp); if (error) { + WAITERS_ALLOC_THREAD_FAILED(waiters, hash_mtx); vput(lowervp); free(xp, M_NULLFSNODE); return (error); @@ -241,36 +536,27 @@ null_nodeget(mp, lowervp, vpp) if (vp->v_vnlock == NULL) panic("null_nodeget: Passed a NULL vnlock.\n"); error = insmntque1(vp, mp, null_insmntque_dtr, xp); - if (error != 0) + if (error != 0) { + /* null_insmntque_dtr will free the stuff for us */ + WAITERS_ALLOC_THREAD_FAILED(waiters, hash_mtx); return (error); - /* - * Atomically insert our new node into the hash or vget existing - * if someone else has beaten us to it. - */ - *vpp = null_hashins(mp, xp); - if (*vpp != NULL) { - vrele(lowervp); - vp->v_vnlock = &vp->v_lock; - xp->null_lowervp = NULL; - vrele(vp); - return (0); } - *vpp = vp; - - return (0); -} -/* - * Remove node from hash. - */ -void -null_hashrem(xp) - struct null_node *xp; -{ + mtx_lock(hash_mtx); + null_hashinsert(node_bucket, xp); + if (waiters) { + /* insmntque1() vref's our vp, so count only waiters */ + for (i = 0; i < waiters->count; i++) + vref(vp); + RELEASE_WAITERS(waiters); + wakeup(waiters); + waiters_unref(waiters); + } + mtx_unlock(hash_mtx); - mtx_lock(&null_hashmtx); - LIST_REMOVE(xp, null_hash); - mtx_unlock(&null_hashmtx); + *vpp = vp; + return (0); +#undef WAITERS_ALLOC_THREAD_FAILED } #ifdef DIAGNOSTIC @@ -311,3 +597,53 @@ null_checkvp(vp, fil, lno) return (a->null_lowervp); } #endif + + +/* + * Increments waiters count. + * + * Mutex from null_hashmtx that correspond to this waiters + * structure must be locked. + */ +static inline void +waiters_ref(struct null_waiters *waiters) +{ + waiters->count++; +} + + +/* + * Decrements waiters count and destroys the structure if + * all waiters are gone. + * + * Mutex from null_hashmtx that correspond to this waiters + * structure must be locked. + */ +static inline void +waiters_unref(struct null_waiters *waiters) +{ + waiters->count--; + if (waiters->count == -1) { + null_waitersremove(waiters); + free((void *)waiters, M_NULLFSNODE); + } +} + + +/* + * Called when allocating thread fails. + * + * Signals waiters of this allocation that master thread + * was failed. Since waiting threads will become the + * allocating thread, we can wakeup only single thread. + * + * Mutex from null_hashmtx that correspond to this waiters + * structure must be locked. 
+ */ +static inline void +waiters_alloc_thread_failed(struct null_waiters *waiters) +{ + RELEASE_WAITERS(waiters); + waiters_unref(waiters); + wakeup_one(waiters); +} diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c index e0645bd..02e20c9 100644 --- a/sys/fs/nullfs/null_vnops.c +++ b/sys/fs/nullfs/null_vnops.c @@ -701,8 +701,10 @@ null_reclaim(struct vop_reclaim_args *ap) struct null_node *xp = VTONULL(vp); struct vnode *lowervp = xp->null_lowervp; - if (lowervp) - null_hashrem(xp); + if (lowervp == NULL) + panic("null_reclaim: reclaiming a node with no lowervp"); + + null_hashrem(xp); /* * Use the interlock to protect the clearing of v_data to * prevent faults in null_lock(). @@ -713,10 +715,7 @@ null_reclaim(struct vop_reclaim_args *ap) vp->v_object = NULL; vp->v_vnlock = &vp->v_lock; VI_UNLOCK(vp); - if (lowervp) - vput(lowervp); - else - panic("null_reclaim: reclaiming a node with no lowervp"); + vput(lowervp); free(xp, M_NULLFSNODE); return (0); -- 1.7.8.3
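
A side note for reviewers: the ownership handoff implemented by
null_do_waiters()/null_nodeget() above is the core of the change, so here
is a minimal userland sketch of the same protocol.  It is an illustration
only, not something to apply anywhere: pthread mutex/condvar calls stand
in for mtx_lock()/mtx_sleep()/wakeup(), the per-bucket mutex array is
collapsed into a single lock, the "allocation" is faked, and every name
in the sketch is made up rather than taken from the kernel sources.  The
shape of the handshake is the same: the first thread for a given
(mp, lowervp) key takes ownership and allocates, later arrivals sleep
until ownership is dropped and then re-check the cache.

/*
 * Hypothetical userland model of the waiters handshake; not kernel code.
 * Build with: cc -o waiters_demo waiters_demo.c -lpthread
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define OWNED	0x01			/* same role as WAITERS_OWNED */

static struct waiters_model {
	pthread_mutex_t	mtx;		/* stands in for the bucket mutex */
	pthread_cond_t	cv;		/* stands in for mtx_sleep()/wakeup() */
	int		flags;		/* OWNED while an allocator is at work */
	int		node_ready;	/* models the vnode appearing in the hash */
} w = {
	.mtx = PTHREAD_MUTEX_INITIALIZER,
	.cv = PTHREAD_COND_INITIALIZER,
};

static void *
nodeget(void *arg)
{
	int id = (int)(intptr_t)arg;

	pthread_mutex_lock(&w.mtx);
	if (!w.node_ready && !(w.flags & OWNED)) {
		/* First thread for this key: become the allocating thread. */
		w.flags |= OWNED;
		pthread_mutex_unlock(&w.mtx);

		/* ... the expensive allocation runs unlocked here ... */

		pthread_mutex_lock(&w.mtx);
		w.node_ready = 1;		/* like null_hashinsert() */
		w.flags &= ~OWNED;		/* like RELEASE_WAITERS() */
		pthread_cond_broadcast(&w.cv);	/* like wakeup(waiters) */
		printf("thread %d: allocated the node\n", id);
	} else {
		/* Later arrivals: wait until the allocator drops ownership. */
		while (w.flags & OWNED)
			pthread_cond_wait(&w.cv, &w.mtx);
		if (w.node_ready)
			printf("thread %d: found the node in the cache\n", id);
		else
			printf("thread %d: allocator failed, would take over\n", id);
	}
	pthread_mutex_unlock(&w.mtx);
	return (NULL);
}

int
main(void)
{
	pthread_t tid[4];
	int i;

	for (i = 0; i < 4; i++)
		pthread_create(&tid[i], NULL, nodeget, (void *)(intptr_t)i);
	for (i = 0; i < 4; i++)
		pthread_join(tid[i], NULL);
	return (0);
}

Two deliberate simplifications: the condition is re-checked under the
mutex, so the bounded mtx_sleep() timeout the patch uses as a guard
against lost wakeups is not needed here, and a waiter that finds the
allocator failed would, in the real code, grab ownership itself
(GRAB_WAITERS) and retry the allocation instead of merely reporting the
failure.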