From: Trond Myklebust NFSv4: Basic code for recovering file OPEN state after a server reboot. --- fs/nfs/inode.c | 2 fs/nfs/nfs4proc.c | 235 ++++++++++++++++++++++++++++++++++++++++--------- fs/nfs/nfs4state.c | 129 ++++++++++++++++++++++---- include/linux/nfs_fs.h | 14 ++ 4 files changed, 317 insertions(+), 63 deletions(-) diff -puN fs/nfs/inode.c~nfs-24-state_recovery fs/nfs/inode.c --- 25/fs/nfs/inode.c~nfs-24-state_recovery 2004-01-14 02:09:56.000000000 -0800 +++ 25-akpm/fs/nfs/inode.c 2004-01-14 02:09:56.000000000 -0800 @@ -1448,6 +1448,8 @@ static int nfs4_fill_super(struct super_ clp->cl_cred = rpcauth_lookupcred(clnt->cl_auth, 0); memcpy(clp->cl_ipaddr, server->ip_addr, sizeof(clp->cl_ipaddr)); } + if (list_empty(&clp->cl_superblocks)) + clear_bit(NFS4CLNT_OK, &clp->cl_state); list_add_tail(&server->nfs4_siblings, &clp->cl_superblocks); clnt = rpc_clone_client(clp->cl_rpcclient); server->nfs4_state = clp; diff -puN fs/nfs/nfs4proc.c~nfs-24-state_recovery fs/nfs/nfs4proc.c --- 25/fs/nfs/nfs4proc.c~nfs-24-state_recovery 2004-01-14 02:09:56.000000000 -0800 +++ 25-akpm/fs/nfs/nfs4proc.c 2004-01-14 02:09:56.000000000 -0800 @@ -48,9 +48,12 @@ #define NFSDBG_FACILITY NFSDBG_PROC +#define NFS4_POLL_RETRY_TIME (15*HZ) + #define GET_OP(cp,name) &cp->ops[cp->req_nops].u.name #define OPNUM(cp) cp->ops[cp->req_nops].opnum +static int nfs4_async_handle_error(struct rpc_task *, struct nfs_server *); extern u32 *nfs4_decode_dirent(u32 *p, struct nfs_entry *entry, int plus); extern struct rpc_procinfo nfs4_procedures[]; @@ -532,7 +535,6 @@ nfs4_do_open(struct inode *dir, struct q struct nfs_openargs o_arg = { .fh = NFS_FH(dir), .share_access = flags & (FMODE_READ|FMODE_WRITE), - .clientid = NFS_SERVER(dir)->nfs4_state->cl_clientid, .opentype = (flags & O_CREAT) ? NFS4_OPEN_CREATE : NFS4_OPEN_NOCREATE, .createmode = (flags & O_EXCL) ? NFS4_CREATE_EXCLUSIVE : NFS4_CREATE_UNCHECKED, .name = name, @@ -553,6 +555,7 @@ nfs4_do_open(struct inode *dir, struct q .rpc_cred = cred, }; +retry: status = -ENOMEM; if (!(sp = nfs4_get_state_owner(NFS_SERVER(dir), cred))) { dprintk("nfs4_do_open: nfs4_get_state_owner failed!\n"); @@ -569,6 +572,7 @@ nfs4_do_open(struct inode *dir, struct q down(&sp->so_sema); o_arg.seqid = sp->so_seqid; o_arg.id = sp->so_id; + o_arg.clientid = NFS_SERVER(dir)->nfs4_state->cl_clientid, status = rpc_call_sync(server->client, &msg, 0); nfs4_increment_seqid(status, sp); @@ -623,6 +627,9 @@ out_up: nfs4_put_open_state(state); if (inode) iput(inode); + status = nfs4_handle_error(server, status); + if (!status) + goto retry; out: return ERR_PTR(status); } @@ -651,7 +658,9 @@ nfs4_do_setattr(struct nfs_server *serve .rpc_argp = &arg, .rpc_resp = &res, }; + int status; +retry: fattr->valid = 0; if (state) @@ -659,7 +668,13 @@ nfs4_do_setattr(struct nfs_server *serve else memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); - return(rpc_call_sync(server->client, &msg, 0)); + status = rpc_call_sync(server->client, &msg, 0); + if (status) { + status = nfs4_handle_error(server, status); + if (!status) + goto retry; + } + return status; } /* @@ -707,48 +722,12 @@ static int nfs4_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr) { - struct nfs4_client *clp; struct nfs4_compound compound; struct nfs4_op ops[4]; unsigned char * p; struct qstr q; int status; - clp = server->nfs4_state; - - down_write(&clp->cl_sem); - /* Has the clientid already been initialized? */ - if (clp->cl_state != NFS4CLNT_NEW) - /* Yep, so just read the root attributes and the lease time. */ - goto no_setclientid; - - /* - * SETCLIENTID. - * Until delegations are imported, we don't bother setting the program - * number and port to anything meaningful. - */ - if ((status = nfs4_proc_setclientid(clp, 0, 0))) - goto out_unlock; - - /* - * SETCLIENTID_CONFIRM, plus root filehandle. - * We also get the lease time here. - */ - if ((status = nfs4_proc_setclientid_confirm(clp))) - goto out_unlock; - - /* - * Now that we have instantiated the clientid and determined - * the lease time, we can initialize the renew daemon for this - * server. - * FIXME: we only need one renewd daemon per server. - */ - nfs4_schedule_state_renewal(clp); - clp->cl_state = NFS4CLNT_OK; - -no_setclientid: - up_write(&clp->cl_sem); - /* * Now we do a separate LOOKUP for each component of the mount path. * The LOOKUPs are done separately so that we can conveniently @@ -787,9 +766,6 @@ no_setclientid: } break; } - return status; -out_unlock: - up_write(&clp->cl_sem); out: return status; } @@ -1411,12 +1387,30 @@ nfs4_proc_pathconf(struct nfs_server *se } static void +nfs4_restart_read(struct rpc_task *task) +{ + struct nfs_read_data *data = (struct nfs_read_data *)task->tk_calldata; + struct nfs_page *req; + + rpc_restart_call(task); + req = nfs_list_entry(data->pages.next); + if (req->wb_state) + memcpy(&data->args.stateid, &req->wb_state->stateid, sizeof(data->args.stateid)); + else + memcpy(&data->args.stateid, &zero_stateid, sizeof(data->args.stateid)); +} + +static void nfs4_read_done(struct rpc_task *task) { struct nfs_read_data *data = (struct nfs_read_data *) task->tk_calldata; struct inode *inode = data->inode; struct nfs_fattr *fattr = data->res.fattr; + if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { + task->tk_action = nfs4_restart_read; + return; + } if (task->tk_status > 0) renew_lease(NFS_SERVER(inode), data->timestamp); /* Check cache consistency */ @@ -1485,11 +1479,29 @@ nfs4_write_refresh_inode(struct inode *i } static void +nfs4_restart_write(struct rpc_task *task) +{ + struct nfs_write_data *data = (struct nfs_write_data *)task->tk_calldata; + struct nfs_page *req; + + rpc_restart_call(task); + req = nfs_list_entry(data->pages.next); + if (req->wb_state) + memcpy(&data->args.stateid, &req->wb_state->stateid, sizeof(data->args.stateid)); + else + memcpy(&data->args.stateid, &zero_stateid, sizeof(data->args.stateid)); +} + +static void nfs4_write_done(struct rpc_task *task) { struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; struct inode *inode = data->inode; + if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { + task->tk_action = nfs4_restart_write; + return; + } if (task->tk_status >= 0) renew_lease(NFS_SERVER(inode), data->timestamp); nfs4_write_refresh_inode(inode, data->res.fattr); @@ -1552,8 +1564,13 @@ static void nfs4_commit_done(struct rpc_task *task) { struct nfs_write_data *data = (struct nfs_write_data *) task->tk_calldata; + struct inode *inode = data->inode; - nfs4_write_refresh_inode(data->inode, data->res.fattr); + if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) { + task->tk_action = nfs4_restart_write; + return; + } + nfs4_write_refresh_inode(inode, data->res.fattr); /* Call back common NFS writeback processing */ nfs_commit_done(task); } @@ -1599,6 +1616,14 @@ renew_done(struct rpc_task *task) { struct nfs4_client *clp = (struct nfs4_client *)task->tk_msg.rpc_argp; unsigned long timestamp = (unsigned long)task->tk_calldata; + + if (task->tk_status < 0) { + switch (task->tk_status) { + case -NFS4ERR_STALE_CLIENTID: + nfs4_schedule_state_recovery(clp); + return; + } + } spin_lock(&clp->cl_lock); if (time_before(clp->cl_last_renewal,timestamp)) clp->cl_last_renewal = timestamp; @@ -1617,6 +1642,25 @@ nfs4_proc_async_renew(struct nfs4_client return rpc_call_async(clp->cl_rpcclient, &msg, 0, renew_done, (void *)jiffies); } +int +nfs4_proc_renew(struct nfs4_client *clp) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], + .rpc_argp = clp, + .rpc_cred = clp->cl_cred, + }; + unsigned long now = jiffies; + int status; + + status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); + spin_lock(&clp->cl_lock); + if (time_before(clp->cl_last_renewal,now)) + clp->cl_last_renewal = now; + spin_unlock(&clp->cl_lock); + return status; +} + /* * We will need to arrange for the VFS layer to provide an atomic open. * Until then, this open method is prone to inefficiency and race conditions @@ -1697,6 +1741,113 @@ nfs4_request_init(struct nfs_page *req, req->wb_cred = get_rpccred(state->owner->so_cred); } +static int +nfs4_async_handle_error(struct rpc_task *task, struct nfs_server *server) +{ + struct nfs4_client *clp = server->nfs4_state; + + if (!clp) + return 0; + switch(task->tk_status) { + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL, NULL); + nfs4_schedule_state_recovery(clp); + task->tk_status = 0; + return -EAGAIN; + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + rpc_delay(task, NFS4_POLL_RETRY_TIME); + task->tk_status = 0; + return -EAGAIN; + } + return 0; +} + +int +nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs4_client *clp) +{ + DEFINE_WAIT(wait); + sigset_t oldset; + int interruptible, res; + + might_sleep(); + + rpc_clnt_sigmask(clnt, &oldset); + interruptible = TASK_UNINTERRUPTIBLE; + if (clnt->cl_intr) + interruptible = TASK_INTERRUPTIBLE; + do { + res = 0; + prepare_to_wait(&clp->cl_waitq, &wait, interruptible); + nfs4_schedule_state_recovery(clp); + if (test_bit(NFS4CLNT_OK, &clp->cl_state) && + !test_bit(NFS4CLNT_SETUP_STATE, &clp->cl_state)) + break; + if (clnt->cl_intr && signalled()) { + res = -ERESTARTSYS; + break; + } + schedule(); + } while(!test_bit(NFS4CLNT_OK, &clp->cl_state)); + finish_wait(&clp->cl_waitq, &wait); + rpc_clnt_sigunmask(clnt, &oldset); + return res; +} + +static int +nfs4_delay(struct rpc_clnt *clnt) +{ + sigset_t oldset; + int res = 0; + + might_sleep(); + + rpc_clnt_sigmask(clnt, &oldset); + if (clnt->cl_intr) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(NFS4_POLL_RETRY_TIME); + if (signalled()) + res = -ERESTARTSYS; + } else { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(NFS4_POLL_RETRY_TIME); + } + rpc_clnt_sigunmask(clnt, &oldset); + return res; +} + +/* This is the error handling routine for processes that are allowed + * to sleep. + */ +int +nfs4_handle_error(struct nfs_server *server, int errorcode) +{ + struct nfs4_client *clp = server->nfs4_state; + int ret = errorcode; + + switch(errorcode) { + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_EXPIRED: + ret = nfs4_wait_clnt_recover(server->client, clp); + break; + case -NFS4ERR_GRACE: + case -NFS4ERR_DELAY: + ret = nfs4_delay(server->client); + break; + default: + if (errorcode <= -1000) { + printk(KERN_WARNING "%s could not handle NFSv4 error %d\n", + __FUNCTION__, -errorcode); + ret = -EIO; + } + } + /* We failed to handle the error */ + return ret; +} + static int nfs4_request_compatible(struct nfs_page *req, struct file *filp, struct page *page) diff -puN fs/nfs/nfs4state.c~nfs-24-state_recovery fs/nfs/nfs4state.c --- 25/fs/nfs/nfs4state.c~nfs-24-state_recovery 2004-01-14 02:09:56.000000000 -0800 +++ 25-akpm/fs/nfs/nfs4state.c 2004-01-14 02:09:56.000000000 -0800 @@ -56,6 +56,7 @@ nfs4_stateid one_stateid = static LIST_HEAD(nfs4_clientid_list); +static void nfs4_recover_state(void *); extern void nfs4_renew_state(void *); void @@ -98,9 +99,12 @@ nfs4_alloc_client(struct in_addr *addr) INIT_LIST_HEAD(&clp->cl_unused); spin_lock_init(&clp->cl_lock); atomic_set(&clp->cl_count, 1); + INIT_WORK(&clp->cl_recoverd, nfs4_recover_state, clp); INIT_WORK(&clp->cl_renewd, nfs4_renew_state, clp); INIT_LIST_HEAD(&clp->cl_superblocks); - clp->cl_state = NFS4CLNT_NEW; + init_waitqueue_head(&clp->cl_waitq); + INIT_RPC_WAITQ(&clp->cl_rpcwaitq, "NFS4 client"); + clp->cl_state = 1 << NFS4CLNT_NEW; } return clp; } @@ -155,6 +159,9 @@ nfs4_put_client(struct nfs4_client *clp) return; list_del(&clp->cl_servers); spin_unlock(&state_spinlock); + BUG_ON(!list_empty(&clp->cl_superblocks)); + wake_up_all(&clp->cl_waitq); + rpc_wake_up(&clp->cl_rpcwaitq); nfs4_kill_renewd(clp); nfs4_free_client(clp); } @@ -175,6 +182,7 @@ nfs4_client_grab_unused(struct nfs4_clie atomic_inc(&sp->so_count); sp->so_cred = cred; list_move(&sp->so_list, &clp->cl_state_owners); + sp->so_generation = clp->cl_generation; clp->cl_nunused--; } return sp; @@ -215,13 +223,17 @@ nfs4_get_state_owner(struct nfs_server * new->so_client = clp; new->so_id = nfs4_alloc_lockowner_id(clp); new->so_cred = cred; + new->so_generation = clp->cl_generation; sp = new; new = NULL; } spin_unlock(&clp->cl_lock); if (new) kfree(new); - if (!sp) + if (sp) { + if (!test_bit(NFS4CLNT_OK, &clp->cl_state)) + nfs4_wait_clnt_recover(server->client, clp); + } else put_rpccred(cred); return sp; } @@ -353,6 +365,7 @@ nfs4_put_open_state(struct nfs4_state *s { struct inode *inode = state->inode; struct nfs4_state_owner *owner = state->owner; + int status = 0; if (!atomic_dec_and_lock(&state->count, &inode->i_lock)) return; @@ -360,8 +373,16 @@ nfs4_put_open_state(struct nfs4_state *s spin_unlock(&inode->i_lock); down(&owner->so_sema); list_del(&state->open_states); - if (state->state != 0) - nfs4_do_close(inode, state); + if (state->state != 0) { + do { + status = nfs4_do_close(inode, state); + if (!status) + break; + up(&owner->so_sema); + status = nfs4_handle_error(NFS_SERVER(inode), status); + down(&owner->so_sema); + } while (!status); + } up(&owner->so_sema); iput(inode); nfs4_free_open_state(state); @@ -392,41 +413,81 @@ struct reclaimer_args { * State recovery routine */ void -nfs4_recover_state(struct nfs4_client *clp) +nfs4_recover_state(void *data) { + struct nfs4_client *clp = (struct nfs4_client *)data; struct reclaimer_args args = { .clp = clp, }; + might_sleep(); + init_completion(&args.complete); down_read(&clp->cl_sem); - if (kernel_thread(reclaimer, &args, CLONE_KERNEL) < 0) + if (test_and_set_bit(NFS4CLNT_SETUP_STATE, &clp->cl_state)) goto out_failed; + if (kernel_thread(reclaimer, &args, CLONE_KERNEL) < 0) + goto out_failed_clear; wait_for_completion(&args.complete); return; +out_failed_clear: + smp_mb__before_clear_bit(); + clear_bit(NFS4CLNT_SETUP_STATE, &clp->cl_state); + smp_mb__after_clear_bit(); + wake_up_all(&clp->cl_waitq); + rpc_wake_up(&clp->cl_rpcwaitq); out_failed: up_read(&clp->cl_sem); } -static void +/* + * Schedule a state recovery attempt + */ +void +nfs4_schedule_state_recovery(struct nfs4_client *clp) +{ + if (!clp) + return; + smp_mb__before_clear_bit(); + clear_bit(NFS4CLNT_OK, &clp->cl_state); + smp_mb__after_clear_bit(); + schedule_work(&clp->cl_recoverd); +} + +static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp) { struct nfs4_state *state; - int status; + int status = 0; list_for_each_entry(state, &sp->so_states, open_states) { status = nfs4_open_reclaim(sp, state); - if (status) { - /* - * Open state on this file cannot be recovered - * All we can do is revert to using the zero stateid. - */ - memset(state->stateid.data, 0, + if (status >= 0) + continue; + switch (status) { + default: + printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", + __FUNCTION__, status); + case -NFS4ERR_EXPIRED: + case -NFS4ERR_NO_GRACE: + case -NFS4ERR_RECLAIM_BAD: + case -NFS4ERR_RECLAIM_CONFLICT: + /* + * Open state on this file cannot be recovered + * All we can do is revert to using the zero stateid. + */ + memset(state->stateid.data, 0, sizeof(state->stateid.data)); - /* Mark the file as being 'closed' */ - state->state = 0; + /* Mark the file as being 'closed' */ + state->state = 0; + break; + case -NFS4ERR_STALE_CLIENTID: + goto out_err; } } + return 0; +out_err: + return status; } static int @@ -435,6 +496,7 @@ reclaimer(void *ptr) struct reclaimer_args *args = (struct reclaimer_args *)ptr; struct nfs4_client *clp = args->clp; struct nfs4_state_owner *sp; + int generation; int status; daemonize("%u.%u.%u.%u-reclaim", NIPQUAD(clp->cl_addr)); @@ -445,29 +507,58 @@ reclaimer(void *ptr) /* Are there any NFS mounts out there? */ if (list_empty(&clp->cl_superblocks)) goto out; + if (!test_bit(NFS4CLNT_NEW, &clp->cl_state)) { + status = nfs4_proc_renew(clp); + if (status == 0) { + set_bit(NFS4CLNT_OK, &clp->cl_state); + goto out; + } + } status = nfs4_proc_setclientid(clp, 0, 0); if (status) goto out_error; status = nfs4_proc_setclientid_confirm(clp); if (status) goto out_error; + generation = ++(clp->cl_generation); + clear_bit(NFS4CLNT_NEW, &clp->cl_state); + set_bit(NFS4CLNT_OK, &clp->cl_state); + up_read(&clp->cl_sem); + nfs4_schedule_state_renewal(clp); +restart_loop: spin_lock(&clp->cl_lock); list_for_each_entry(sp, &clp->cl_state_owners, so_list) { + if (sp->so_generation - generation <= 0) + continue; atomic_inc(&sp->so_count); spin_unlock(&clp->cl_lock); down(&sp->so_sema); - nfs4_reclaim_open_state(sp); + if (sp->so_generation - generation < 0) { + smp_rmb(); + sp->so_generation = clp->cl_generation; + status = nfs4_reclaim_open_state(sp); + } up(&sp->so_sema); nfs4_put_state_owner(sp); - spin_lock(&clp->cl_lock); + if (status < 0) { + if (status == -NFS4ERR_STALE_CLIENTID) + nfs4_schedule_state_recovery(clp); + goto out; + } + goto restart_loop; } spin_unlock(&clp->cl_lock); out: - up_read(&clp->cl_sem); + smp_mb__before_clear_bit(); + clear_bit(NFS4CLNT_SETUP_STATE, &clp->cl_state); + smp_mb__after_clear_bit(); + wake_up_all(&clp->cl_waitq); + rpc_wake_up(&clp->cl_rpcwaitq); return 0; out_error: printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %u.%u.%u.%u\n", NIPQUAD(clp->cl_addr.s_addr)); + up_read(&clp->cl_sem); goto out; } diff -puN include/linux/nfs_fs.h~nfs-24-state_recovery include/linux/nfs_fs.h --- 25/include/linux/nfs_fs.h~nfs-24-state_recovery 2004-01-14 02:09:56.000000000 -0800 +++ 25-akpm/include/linux/nfs_fs.h 2004-01-14 02:09:56.000000000 -0800 @@ -465,6 +465,7 @@ extern void * nfs_root_data(void); enum nfs4_client_state { NFS4CLNT_OK = 0, NFS4CLNT_NEW, + NFS4CLNT_SETUP_STATE, }; /* @@ -475,7 +476,8 @@ struct nfs4_client { struct in_addr cl_addr; /* Server identifier */ u64 cl_clientid; /* constant */ nfs4_verifier cl_confirm; - enum nfs4_client_state cl_state; + unsigned long cl_state; + long cl_generation; u32 cl_lockowner_id; @@ -499,6 +501,10 @@ struct nfs4_client { unsigned long cl_lease_time; unsigned long cl_last_renewal; struct work_struct cl_renewd; + struct work_struct cl_recoverd; + + wait_queue_head_t cl_waitq; + struct rpc_wait_queue cl_rpcwaitq; /* Our own IP address, as a null-terminated string. * This is used to generate the clientid, and the callback address. @@ -523,6 +529,7 @@ struct nfs4_state_owner { u32 so_seqid; /* protected by so_sema */ unsigned int so_flags; /* protected by so_sema */ atomic_t so_count; + long so_generation; struct rpc_cred *so_cred; /* Associated cred */ struct list_head so_states; @@ -556,7 +563,9 @@ extern int nfs4_proc_setclientid(struct extern int nfs4_proc_setclientid_confirm(struct nfs4_client *); extern int nfs4_open_reclaim(struct nfs4_state_owner *, struct nfs4_state *); extern int nfs4_proc_async_renew(struct nfs4_client *); +extern int nfs4_proc_renew(struct nfs4_client *); extern int nfs4_do_close(struct inode *, struct nfs4_state *); +extern int nfs4_wait_clnt_recover(struct rpc_clnt *, struct nfs4_client *); /* nfs4renewd.c */ extern void nfs4_schedule_state_renewal(struct nfs4_client *); @@ -573,7 +582,8 @@ extern void nfs4_put_state_owner(struct extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *); extern void nfs4_put_open_state(struct nfs4_state *); extern void nfs4_increment_seqid(int status, struct nfs4_state_owner *sp); -extern void nfs4_recover_state(struct nfs4_client *); +extern int nfs4_handle_error(struct nfs_server *, int); +extern void nfs4_schedule_state_recovery(struct nfs4_client *); struct nfs4_mount_data; #else _