From: Linus Torvalds Date: Mon, 6 Apr 2009 20:25:56 +0000 (-0700) Subject: Merge branch 'for-2.6.30' of git://linux-nfs.org/~bfields/linux X-Git-Url: http://www.pilppa.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=a63856252d2112e7c452696037a86ceb12f47f80;hp=-c;p=linux-2.6-omap-h63xx.git Merge branch 'for-2.6.30' of git://linux-nfs.org/~bfields/linux * 'for-2.6.30' of git://linux-nfs.org/~bfields/linux: (81 commits) nfsd41: define nfsd4_set_statp as noop for !CONFIG_NFSD_V4 nfsd41: define NFSD_DRC_SIZE_SHIFT in set_max_drc nfsd41: Documentation/filesystems/nfs41-server.txt nfsd41: CREATE_EXCLUSIVE4_1 nfsd41: SUPPATTR_EXCLCREAT attribute nfsd41: support for 3-word long attribute bitmask nfsd: dynamically skip encoded fattr bitmap in _nfsd4_verify nfsd41: pass writable attrs mask to nfsd4_decode_fattr nfsd41: provide support for minor version 1 at rpc level nfsd41: control nfsv4.1 svc via /proc/fs/nfsd/versions nfsd41: add OPEN4_SHARE_ACCESS_WANT nfs4_stateid bmap nfsd41: access_valid nfsd41: clientid handling nfsd41: check encode size for sessions maxresponse cached nfsd41: stateid handling nfsd: pass nfsd4_compound_state* to nfs4_preprocess_{state,seq}id_op nfsd41: destroy_session operation nfsd41: non-page DRC for solo sequence responses nfsd41: Add a create session replay cache nfsd41: create_session operation ... --- a63856252d2112e7c452696037a86ceb12f47f80 diff --combined fs/nfsd/nfsctl.c index a4ed8644d69,a9b8c75bf0b..af16849d243 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@@ -60,6 -60,7 +60,7 @@@ enum NFSD_FO_UnlockFS, NFSD_Threads, NFSD_Pool_Threads, + NFSD_Pool_Stats, NFSD_Versions, NFSD_Ports, NFSD_MaxBlkSize, @@@ -172,6 -173,16 +173,16 @@@ static const struct file_operations exp .owner = THIS_MODULE, }; + extern int nfsd_pool_stats_open(struct inode *inode, struct file *file); + + static struct file_operations pool_stats_operations = { + .open = nfsd_pool_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, + .owner = THIS_MODULE, + }; + /*----------------------------------------------------------------------------*/ /* * payload - write methods @@@ -781,8 -792,9 +792,9 @@@ out_free static ssize_t __write_versions(struct file *file, char *buf, size_t size) { char *mesg = buf; - char *vers, sign; + char *vers, *minorp, sign; int len, num; + unsigned minor; ssize_t tlen = 0; char *sep; @@@ -803,9 -815,20 +815,20 @@@ do { sign = *vers; if (sign == '+' || sign == '-') - num = simple_strtol((vers+1), NULL, 0); + num = simple_strtol((vers+1), &minorp, 0); else - num = simple_strtol(vers, NULL, 0); + num = simple_strtol(vers, &minorp, 0); + if (*minorp == '.') { + if (num < 4) + return -EINVAL; + minor = simple_strtoul(minorp+1, NULL, 0); + if (minor == 0) + return -EINVAL; + if (nfsd_minorversion(minor, sign == '-' ? + NFSD_CLEAR : NFSD_SET) < 0) + return -EINVAL; + goto next; + } switch(num) { case 2: case 3: @@@ -815,6 -838,7 +838,7 @@@ default: return -EINVAL; } + next: vers += len + 1; tlen += len; } while ((len = qword_get(&mesg, vers, size)) > 0); @@@ -833,6 -857,13 +857,13 @@@ num); sep = " "; } + if (nfsd_vers(4, NFSD_AVAIL)) + for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++) + len += sprintf(buf+len, " %c4.%u", + (nfsd_vers(4, NFSD_TEST) && + nfsd_minorversion(minor, NFSD_TEST)) ? + '+' : '-', + minor); len += sprintf(buf+len, "\n"); return len; } @@@ -938,12 -969,10 +969,12 @@@ static ssize_t __write_ports(struct fil char transport[16]; int port; if (sscanf(buf, "%15s %4d", transport, &port) == 2) { + if (port < 1 || port > 65535) + return -EINVAL; err = nfsd_create_serv(); if (!err) { err = svc_create_xprt(nfsd_serv, - transport, port, + transport, PF_INET, port, SVC_SOCK_ANONYMOUS); if (err == -ENOENT) /* Give a reasonable perror msg for @@@ -962,7 -991,7 +993,7 @@@ char transport[16]; int port; if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) { - if (port == 0) + if (port < 1 || port > 65535) return -EINVAL; if (nfsd_serv) { xprt = svc_find_xprt(nfsd_serv, transport, @@@ -1248,6 -1277,7 +1279,7 @@@ static int nfsd_fill_super(struct super [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO}, [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, diff --combined fs/nfsd/nfssvc.c index 7c09852be71,469c931cca9..cbba4a93578 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@@ -22,6 -22,7 +22,7 @@@ #include #include #include + #include #include #include @@@ -40,9 -41,6 +41,6 @@@ extern struct svc_program nfsd_program; static int nfsd(void *vrqstp); struct timeval nfssvc_boot; - static atomic_t nfsd_busy; - static unsigned long nfsd_last_call; - static DEFINE_SPINLOCK(nfsd_call_lock); /* * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members @@@ -123,6 -121,8 +121,8 @@@ struct svc_program nfsd_program = }; + u32 nfsd_supported_minorversion; + int nfsd_vers(int vers, enum vers_op change) { if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) @@@ -149,6 -149,28 +149,28 @@@ } return 0; } + + int nfsd_minorversion(u32 minorversion, enum vers_op change) + { + if (minorversion > NFSD_SUPPORTED_MINOR_VERSION) + return -1; + switch(change) { + case NFSD_SET: + nfsd_supported_minorversion = minorversion; + break; + case NFSD_CLEAR: + if (minorversion == 0) + return -1; + nfsd_supported_minorversion = minorversion - 1; + break; + case NFSD_TEST: + return minorversion <= nfsd_supported_minorversion; + case NFSD_AVAIL: + return minorversion <= NFSD_SUPPORTED_MINOR_VERSION; + } + return 0; + } + /* * Maximum number of nfsd processes */ @@@ -200,6 -222,28 +222,28 @@@ void nfsd_reset_versions(void } } + /* + * Each session guarantees a negotiated per slot memory cache for replies + * which in turn consumes memory beyond the v2/v3/v4.0 server. A dedicated + * NFSv4.1 server might want to use more memory for a DRC than a machine + * with mutiple services. + * + * Impose a hard limit on the number of pages for the DRC which varies + * according to the machines free pages. This is of course only a default. + * + * For now this is a #defined shift which could be under admin control + * in the future. + */ + static void set_max_drc(void) + { + /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */ + #define NFSD_DRC_SIZE_SHIFT 7 + nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages() + >> NFSD_DRC_SIZE_SHIFT; + nfsd_serv->sv_drc_pages_used = 0; + dprintk("%s svc_drc_max_pages %u\n", __func__, + nfsd_serv->sv_drc_max_pages); + } int nfsd_create_serv(void) { @@@ -227,11 -271,13 +271,12 @@@ nfsd_max_blksize /= 2; } - atomic_set(&nfsd_busy, 0); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, - AF_INET, nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; + else + set_max_drc(); do_gettimeofday(&nfssvc_boot); /* record boot time */ return err; @@@ -243,7 -289,7 +288,7 @@@ static int nfsd_init_socks(int port if (!list_empty(&nfsd_serv->sv_permsocks)) return 0; - error = svc_create_xprt(nfsd_serv, "udp", port, + error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port, SVC_SOCK_DEFAULTS); if (error < 0) return error; @@@ -252,7 -298,7 +297,7 @@@ if (error < 0) return error; - error = svc_create_xprt(nfsd_serv, "tcp", port, + error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port, SVC_SOCK_DEFAULTS); if (error < 0) return error; @@@ -375,26 -421,6 +420,6 @@@ nfsd_svc(unsigned short port, int nrser return error; } - static inline void - update_thread_usage(int busy_threads) - { - unsigned long prev_call; - unsigned long diff; - int decile; - - spin_lock(&nfsd_call_lock); - prev_call = nfsd_last_call; - nfsd_last_call = jiffies; - decile = busy_threads*10/nfsdstats.th_cnt; - if (decile>0 && decile <= 10) { - diff = nfsd_last_call - prev_call; - if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP) - nfsdstats.th_usage[decile-1] -= NFSD_USAGE_WRAP; - if (decile == 10) - nfsdstats.th_fullcnt++; - } - spin_unlock(&nfsd_call_lock); - } /* * This is the NFS server kernel thread @@@ -403,6 -429,7 +428,6 @@@ static in nfsd(void *vrqstp) { struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp; - struct fs_struct *fsp; int err, preverr = 0; /* Lock module and set up kernel thread */ @@@ -411,11 -438,13 +436,11 @@@ /* At this point, the thread shares current->fs * with the init process. We need to create files with a * umask of 0 instead of init's umask. */ - fsp = copy_fs_struct(current->fs); - if (!fsp) { + if (unshare_fs_struct() < 0) { printk("Unable to start nfsd thread: out of memory\n"); goto out; } - exit_fs(current); - current->fs = fsp; + current->fs->umask = 0; /* @@@ -460,8 -489,6 +485,6 @@@ continue; } - update_thread_usage(atomic_read(&nfsd_busy)); - atomic_inc(&nfsd_busy); /* Lock the export hash tables for reading. */ exp_readlock(); @@@ -470,8 -497,6 +493,6 @@@ /* Unlock export hash tables */ exp_readunlock(); - update_thread_usage(atomic_read(&nfsd_busy)); - atomic_dec(&nfsd_busy); } /* Clear signals before calling svc_exit_thread() */ @@@ -539,6 -564,10 +560,10 @@@ nfsd_dispatch(struct svc_rqst *rqstp, _ + rqstp->rq_res.head[0].iov_len; rqstp->rq_res.head[0].iov_len += sizeof(__be32); + /* NFSv4.1 DRC requires statp */ + if (rqstp->rq_vers == 4) + nfsd4_set_statp(rqstp, statp); + /* Now call the procedure handler, and encode NFS status. */ nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); nfserr = map_new_errors(rqstp->rq_vers, nfserr); @@@ -570,3 -599,10 +595,10 @@@ nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); return 1; } + + int nfsd_pool_stats_open(struct inode *inode, struct file *file) + { + if (nfsd_serv == NULL) + return -ENODEV; + return svc_pool_stats_open(nfsd_serv, file); + } diff --combined fs/nfsd/vfs.c index 78376b6c023,8790571b30f..ab93fcfef25 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@@ -356,7 -356,7 +356,7 @@@ nfsd_setattr(struct svc_rqst *rqstp, st put_write_access(inode); goto out_nfserr; } - DQUOT_INIT(inode); + vfs_dq_init(inode); } /* sanitize the mode change */ @@@ -366,8 -366,9 +366,9 @@@ } /* Revoke setuid/setgid on chown */ - if (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || - ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)) { + if (!S_ISDIR(inode->i_mode) && + (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || + ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) { iap->ia_valid |= ATTR_KILL_PRIV; if (iap->ia_valid & ATTR_MODE) { /* we're setting mode too, just clear the s*id bits */ @@@ -723,7 -724,7 +724,7 @@@ nfsd_open(struct svc_rqst *rqstp, struc else flags = O_WRONLY|O_LARGEFILE; - DQUOT_INIT(inode); + vfs_dq_init(inode); } *filp = dentry_open(dget(dentry), mntget(fhp->fh_export->ex_path.mnt), flags, cred); @@@ -960,7 -961,7 +961,7 @@@ static void kill_suid(struct dentry *de static __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, - unsigned long cnt, int *stablep) + unsigned long *cnt, int *stablep) { struct svc_export *exp; struct dentry *dentry; @@@ -974,7 -975,7 +975,7 @@@ err = nfserr_perm; if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && - (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) + (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt))) goto out; #endif @@@ -998,18 -999,15 +999,18 @@@ if (!EX_ISSYNC(exp)) stable = 0; - if (stable && !EX_WGATHER(exp)) + if (stable && !EX_WGATHER(exp)) { + spin_lock(&file->f_lock); file->f_flags |= O_SYNC; + spin_unlock(&file->f_lock); + } /* Write the data. */ oldfs = get_fs(); set_fs(KERNEL_DS); host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); set_fs(oldfs); if (host_err >= 0) { - nfsdstats.io_write += cnt; + nfsdstats.io_write += host_err; fsnotify_modify(file->f_path.dentry); } @@@ -1054,9 -1052,10 +1055,10 @@@ } dprintk("nfsd: write complete host_err=%d\n", host_err); - if (host_err >= 0) + if (host_err >= 0) { err = 0; - else + *cnt = host_err; + } else err = nfserrno(host_err); out: return err; @@@ -1098,7 -1097,7 +1100,7 @@@ out */ __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, - loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, + loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, int *stablep) { __be32 err = 0; @@@ -1179,6 -1178,21 +1181,21 @@@ nfsd_create_setattr(struct svc_rqst *rq return 0; } + /* HPUX client sometimes creates a file in mode 000, and sets size to 0. + * setting size to 0 may fail for some specific file systems by the permission + * checking which requires WRITE permission but the mode is 000. + * we ignore the resizing(to 0) on the just new created file, since the size is + * 0 after file created. + * + * call this only after vfs_create() is called. + * */ + static void + nfsd_check_ignore_resizing(struct iattr *iap) + { + if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0)) + iap->ia_valid &= ~ATTR_SIZE; + } + /* * Create a file (regular, directory, device, fifo); UNIX sockets * not yet implemented. @@@ -1274,6 -1288,8 +1291,8 @@@ nfsd_create(struct svc_rqst *rqstp, str switch (type) { case S_IFREG: host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); + if (!host_err) + nfsd_check_ignore_resizing(iap); break; case S_IFDIR: host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); @@@ -1427,6 -1443,8 +1446,8 @@@ nfsd_create_v3(struct svc_rqst *rqstp, /* setattr will sync the child (or not) */ } + nfsd_check_ignore_resizing(iap); + if (createmode == NFS3_CREATE_EXCLUSIVE) { /* Cram the verifier into atime/mtime */ iap->ia_valid = ATTR_MTIME|ATTR_ATIME diff --combined include/linux/sunrpc/svc.h index d3a4c023193,d209c630a4a..2a30775959e --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@@ -24,6 -24,15 +24,15 @@@ */ typedef int (*svc_thread_fn)(void *); + /* statistics for svc_pool structures */ + struct svc_pool_stats { + unsigned long packets; + unsigned long sockets_queued; + unsigned long threads_woken; + unsigned long overloads_avoided; + unsigned long threads_timedout; + }; + /* * * RPC service thread pool. @@@ -41,6 -50,8 +50,8 @@@ struct svc_pool struct list_head sp_sockets; /* pending sockets */ unsigned int sp_nrthreads; /* # of threads in pool */ struct list_head sp_all_threads; /* all server threads */ + int sp_nwaking; /* number of threads woken but not yet active */ + struct svc_pool_stats sp_stats; /* statistics on pool operation */ } ____cacheline_aligned_in_smp; /* @@@ -69,6 -80,7 +80,6 @@@ struct svc_serv struct list_head sv_tempsocks; /* all temporary sockets */ int sv_tmpcnt; /* count of temporary sockets */ struct timer_list sv_temptimer; /* timer for aging temporary sockets */ - sa_family_t sv_family; /* listener's address family */ char * sv_name; /* service name */ @@@ -83,6 -95,8 +94,8 @@@ struct module * sv_module; /* optional module to count when * adding threads */ svc_thread_fn sv_function; /* main function for threads */ + unsigned int sv_drc_max_pages; /* Total pages for DRC */ + unsigned int sv_drc_pages_used;/* DRC pages used */ }; /* @@@ -218,6 -232,7 +231,7 @@@ struct svc_rqst struct svc_cred rq_cred; /* auth info */ void * rq_xprt_ctxt; /* transport specific context ptr */ struct svc_deferred_req*rq_deferred; /* deferred request we are replaying */ + int rq_usedeferral; /* use deferral */ size_t rq_xprt_hlen; /* xprt header len */ struct xdr_buf rq_arg; @@@ -263,6 -278,7 +277,7 @@@ * cache pages */ wait_queue_head_t rq_wait; /* synchronization */ struct task_struct *rq_task; /* service thread */ + int rq_waking; /* 1 if thread is being woken */ }; /* @@@ -384,19 -400,20 +399,20 @@@ struct svc_procedure /* * Function prototypes. */ -struct svc_serv *svc_create(struct svc_program *, unsigned int, sa_family_t, +struct svc_serv *svc_create(struct svc_program *, unsigned int, void (*shutdown)(struct svc_serv *)); struct svc_rqst *svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool); void svc_exit_thread(struct svc_rqst *); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, - sa_family_t, void (*shutdown)(struct svc_serv *), + void (*shutdown)(struct svc_serv *), svc_thread_fn, struct module *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); + int svc_pool_stats_open(struct svc_serv *serv, struct file *file); void svc_destroy(struct svc_serv *); int svc_process(struct svc_rqst *); -int svc_register(const struct svc_serv *, const unsigned short, - const unsigned short); +int svc_register(const struct svc_serv *, const int, + const unsigned short, const unsigned short); void svc_wake_up(struct svc_serv *); void svc_reserve(struct svc_rqst *rqstp, int space); diff --combined net/sunrpc/svc.c index 9b49a6ab8de,45984cbe1bf..8847add6ca1 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@@ -312,12 -312,13 +312,12 @@@ svc_pool_map_set_cpumask(struct task_st switch (m->mode) { case SVC_POOL_PERCPU: { - set_cpus_allowed_ptr(task, &cpumask_of_cpu(node)); + set_cpus_allowed_ptr(task, cpumask_of(node)); break; } case SVC_POOL_PERNODE: { - node_to_cpumask_ptr(nodecpumask, node); - set_cpus_allowed_ptr(task, nodecpumask); + set_cpus_allowed_ptr(task, cpumask_of_node(node)); break; } } @@@ -358,7 -359,7 +358,7 @@@ svc_pool_for_cpu(struct svc_serv *serv */ static struct svc_serv * __svc_create(struct svc_program *prog, unsigned int bufsize, int npools, - sa_family_t family, void (*shutdown)(struct svc_serv *serv)) + void (*shutdown)(struct svc_serv *serv)) { struct svc_serv *serv; unsigned int vers; @@@ -367,6 -368,7 +367,6 @@@ if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL))) return NULL; - serv->sv_family = family; serv->sv_name = prog->pg_name; serv->sv_program = prog; serv->sv_nrthreads = 1; @@@ -425,21 -427,21 +425,21 @@@ struct svc_serv * svc_create(struct svc_program *prog, unsigned int bufsize, - sa_family_t family, void (*shutdown)(struct svc_serv *serv)) + void (*shutdown)(struct svc_serv *serv)) { - return __svc_create(prog, bufsize, /*npools*/1, family, shutdown); + return __svc_create(prog, bufsize, /*npools*/1, shutdown); } EXPORT_SYMBOL_GPL(svc_create); struct svc_serv * svc_create_pooled(struct svc_program *prog, unsigned int bufsize, - sa_family_t family, void (*shutdown)(struct svc_serv *serv), + void (*shutdown)(struct svc_serv *serv), svc_thread_fn func, struct module *mod) { struct svc_serv *serv; unsigned int npools = svc_pool_map_get(); - serv = __svc_create(prog, bufsize, npools, family, shutdown); + serv = __svc_create(prog, bufsize, npools, shutdown); if (serv != NULL) { serv->sv_function = func; @@@ -717,6 -719,8 +717,6 @@@ svc_exit_thread(struct svc_rqst *rqstp } EXPORT_SYMBOL_GPL(svc_exit_thread); -#ifdef CONFIG_SUNRPC_REGISTER_V4 - /* * Register an "inet" protocol family netid with the local * rpcbind daemon via an rpcbind v4 SET request. @@@ -731,13 -735,12 +731,13 @@@ static int __svc_rpcb_register4(const u const unsigned short protocol, const unsigned short port) { - struct sockaddr_in sin = { + const struct sockaddr_in sin = { .sin_family = AF_INET, .sin_addr.s_addr = htonl(INADDR_ANY), .sin_port = htons(port), }; - char *netid; + const char *netid; + int error; switch (protocol) { case IPPROTO_UDP: @@@ -747,23 -750,13 +747,23 @@@ netid = RPCBIND_NETID_TCP; break; default: - return -EPROTONOSUPPORT; + return -ENOPROTOOPT; } - return rpcb_v4_register(program, version, - (struct sockaddr *)&sin, netid); + error = rpcb_v4_register(program, version, + (const struct sockaddr *)&sin, netid); + + /* + * User space didn't support rpcbind v4, so retry this + * registration request with the legacy rpcbind v2 protocol. + */ + if (error == -EPROTONOSUPPORT) + error = rpcb_register(program, version, protocol, port); + + return error; } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) /* * Register an "inet6" protocol family netid with the local * rpcbind daemon via an rpcbind v4 SET request. @@@ -778,13 -771,12 +778,13 @@@ static int __svc_rpcb_register6(const u const unsigned short protocol, const unsigned short port) { - struct sockaddr_in6 sin6 = { + const struct sockaddr_in6 sin6 = { .sin6_family = AF_INET6, .sin6_addr = IN6ADDR_ANY_INIT, .sin6_port = htons(port), }; - char *netid; + const char *netid; + int error; switch (protocol) { case IPPROTO_UDP: @@@ -794,22 -786,12 +794,22 @@@ netid = RPCBIND_NETID_TCP6; break; default: - return -EPROTONOSUPPORT; + return -ENOPROTOOPT; } - return rpcb_v4_register(program, version, - (struct sockaddr *)&sin6, netid); + error = rpcb_v4_register(program, version, + (const struct sockaddr *)&sin6, netid); + + /* + * User space didn't support rpcbind version 4, so we won't + * use a PF_INET6 listener. + */ + if (error == -EPROTONOSUPPORT) + error = -EAFNOSUPPORT; + + return error; } +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ /* * Register a kernel RPC service via rpcbind version 4. @@@ -817,43 -799,69 +817,43 @@@ * Returns zero on success; a negative errno value is returned * if any error occurs. */ -static int __svc_register(const u32 program, const u32 version, - const sa_family_t family, +static int __svc_register(const char *progname, + const u32 program, const u32 version, + const int family, const unsigned short protocol, const unsigned short port) { - int error; + int error = -EAFNOSUPPORT; switch (family) { - case AF_INET: - return __svc_rpcb_register4(program, version, + case PF_INET: + error = __svc_rpcb_register4(program, version, protocol, port); - case AF_INET6: + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case PF_INET6: error = __svc_rpcb_register6(program, version, protocol, port); - if (error < 0) - return error; - - /* - * Work around bug in some versions of Linux rpcbind - * which don't allow registration of both inet and - * inet6 netids. - * - * Error return ignored for now. - */ - __svc_rpcb_register4(program, version, - protocol, port); - return 0; +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ } - return -EAFNOSUPPORT; -} - -#else /* CONFIG_SUNRPC_REGISTER_V4 */ - -/* - * Register a kernel RPC service via rpcbind version 2. - * - * Returns zero on success; a negative errno value is returned - * if any error occurs. - */ -static int __svc_register(const u32 program, const u32 version, - sa_family_t family, - const unsigned short protocol, - const unsigned short port) -{ - if (family != AF_INET) - return -EAFNOSUPPORT; - - return rpcb_register(program, version, protocol, port); + if (error < 0) + printk(KERN_WARNING "svc: failed to register %sv%u RPC " + "service (errno %d).\n", progname, version, -error); + return error; } -#endif /* CONFIG_SUNRPC_REGISTER_V4 */ - /** * svc_register - register an RPC service with the local portmapper * @serv: svc_serv struct for the service to register + * @family: protocol family of service's listener socket * @proto: transport protocol number to advertise * @port: port to advertise * - * Service is registered for any address in serv's address family + * Service is registered for any address in the passed-in protocol family */ -int svc_register(const struct svc_serv *serv, const unsigned short proto, - const unsigned short port) +int svc_register(const struct svc_serv *serv, const int family, + const unsigned short proto, const unsigned short port) { struct svc_program *progp; unsigned int i; @@@ -871,15 -879,15 +871,15 @@@ i, proto == IPPROTO_UDP? "udp" : "tcp", port, - serv->sv_family, + family, progp->pg_vers[i]->vs_hidden? " (but not telling portmap)" : ""); if (progp->pg_vers[i]->vs_hidden) continue; - error = __svc_register(progp->pg_prog, i, - serv->sv_family, proto, port); + error = __svc_register(progp->pg_name, progp->pg_prog, + i, family, proto, port); if (error < 0) break; } @@@ -888,31 -896,38 +888,31 @@@ return error; } -#ifdef CONFIG_SUNRPC_REGISTER_V4 - +/* + * If user space is running rpcbind, it should take the v4 UNSET + * and clear everything for this [program, version]. If user space + * is running portmap, it will reject the v4 UNSET, but won't have + * any "inet6" entries anyway. So a PMAP_UNSET should be sufficient + * in this case to clear all existing entries for [program, version]. + */ static void __svc_unregister(const u32 program, const u32 version, const char *progname) { - struct sockaddr_in6 sin6 = { - .sin6_family = AF_INET6, - .sin6_addr = IN6ADDR_ANY_INIT, - .sin6_port = 0, - }; int error; - error = rpcb_v4_register(program, version, - (struct sockaddr *)&sin6, ""); - dprintk("svc: %s(%sv%u), error %d\n", - __func__, progname, version, error); -} - -#else /* CONFIG_SUNRPC_REGISTER_V4 */ + error = rpcb_v4_register(program, version, NULL, ""); -static void __svc_unregister(const u32 program, const u32 version, - const char *progname) -{ - int error; + /* + * User space didn't support rpcbind v4, so retry this + * request with the legacy rpcbind v2 protocol. + */ + if (error == -EPROTONOSUPPORT) + error = rpcb_register(program, version, 0, 0); - error = rpcb_register(program, version, 0, 0); dprintk("svc: %s(%sv%u), error %d\n", __func__, progname, version, error); } -#endif /* CONFIG_SUNRPC_REGISTER_V4 */ - /* * All netids, bind addresses and ports registered for [program, version] * are removed from the local rpcbind database (if the service is not @@@ -1008,6 -1023,8 +1008,8 @@@ svc_process(struct svc_rqst *rqstp rqstp->rq_res.tail[0].iov_len = 0; /* Will be turned off only in gss privacy case: */ rqstp->rq_splice_ok = 1; + /* Will be turned off only when NFSv4 Sessions are used */ + rqstp->rq_usedeferral = 1; /* Setup reply header */ rqstp->rq_xprt->xpt_ops->xpo_prep_reply_hdr(rqstp); @@@ -1078,7 -1095,6 +1080,6 @@@ procp = versp->vs_proc + proc; if (proc >= versp->vs_nproc || !procp->pc_func) goto err_bad_proc; - rqstp->rq_server = serv; rqstp->rq_procinfo = procp; /* Syntactic check complete */ diff --combined net/sunrpc/svc_xprt.c index 2819ee093f3,600d0918e3a..c200d92e57e --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@@ -14,6 -14,8 +14,8 @@@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT + #define SVC_MAX_WAKING 5 + static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt); static int svc_deferred_recv(struct svc_rqst *rqstp); static struct cache_deferred_req *svc_defer(struct cache_req *req); @@@ -161,9 -163,7 +163,9 @@@ EXPORT_SYMBOL_GPL(svc_xprt_init) static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl, struct svc_serv *serv, - unsigned short port, int flags) + const int family, + const unsigned short port, + int flags) { struct sockaddr_in sin = { .sin_family = AF_INET, @@@ -178,12 -178,12 +180,12 @@@ struct sockaddr *sap; size_t len; - switch (serv->sv_family) { - case AF_INET: + switch (family) { + case PF_INET: sap = (struct sockaddr *)&sin; len = sizeof(sin); break; - case AF_INET6: + case PF_INET6: sap = (struct sockaddr *)&sin6; len = sizeof(sin6); break; @@@ -194,8 -194,7 +196,8 @@@ return xcl->xcl_ops->xpo_create(serv, sap, len, flags); } -int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port, +int svc_create_xprt(struct svc_serv *serv, const char *xprt_name, + const int family, const unsigned short port, int flags) { struct svc_xprt_class *xcl; @@@ -212,7 -211,7 +214,7 @@@ goto err; spin_unlock(&svc_xprt_class_lock); - newxprt = __svc_xpo_create(xcl, serv, port, flags); + newxprt = __svc_xpo_create(xcl, serv, family, port, flags); if (IS_ERR(newxprt)) { module_put(xcl->xcl_owner); return PTR_ERR(newxprt); @@@ -301,6 -300,7 +303,7 @@@ void svc_xprt_enqueue(struct svc_xprt * struct svc_pool *pool; struct svc_rqst *rqstp; int cpu; + int thread_avail; if (!(xprt->xpt_flags & ((1<sp_lock); - if (!list_empty(&pool->sp_threads) && - !list_empty(&pool->sp_sockets)) - printk(KERN_ERR - "svc_xprt_enqueue: " - "threads and transports both waiting??\n"); - if (test_bit(XPT_DEAD, &xprt->xpt_flags)) { /* Don't enqueue dead transports */ dprintk("svc: transport %p is dead, not enqueued\n", xprt); goto out_unlock; } + pool->sp_stats.packets++; + /* Mark transport as busy. It will remain in this state until * the provider calls svc_xprt_received. We update XPT_BUSY * atomically because it also guards against trying to enqueue @@@ -356,7 -352,15 +355,15 @@@ } process: - if (!list_empty(&pool->sp_threads)) { + /* Work out whether threads are available */ + thread_avail = !list_empty(&pool->sp_threads); /* threads are asleep */ + if (pool->sp_nwaking >= SVC_MAX_WAKING) { + /* too many threads are runnable and trying to wake up */ + thread_avail = 0; + pool->sp_stats.overloads_avoided++; + } + + if (thread_avail) { rqstp = list_entry(pool->sp_threads.next, struct svc_rqst, rq_list); @@@ -371,11 -375,15 +378,15 @@@ svc_xprt_get(xprt); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); + rqstp->rq_waking = 1; + pool->sp_nwaking++; + pool->sp_stats.threads_woken++; BUG_ON(xprt->xpt_pool != pool); wake_up(&rqstp->rq_wait); } else { dprintk("svc: transport %p put into queue\n", xprt); list_add_tail(&xprt->xpt_ready, &pool->sp_sockets); + pool->sp_stats.sockets_queued++; BUG_ON(xprt->xpt_pool != pool); } @@@ -588,6 -596,7 +599,7 @@@ int svc_recv(struct svc_rqst *rqstp, lo int pages; struct xdr_buf *arg; DECLARE_WAITQUEUE(wait, current); + long time_left; dprintk("svc: server %p waiting for data (to = %ld)\n", rqstp, timeout); @@@ -636,6 -645,11 +648,11 @@@ return -EINTR; spin_lock_bh(&pool->sp_lock); + if (rqstp->rq_waking) { + rqstp->rq_waking = 0; + pool->sp_nwaking--; + BUG_ON(pool->sp_nwaking < 0); + } xprt = svc_xprt_dequeue(pool); if (xprt) { rqstp->rq_xprt = xprt; @@@ -668,12 -682,14 +685,14 @@@ add_wait_queue(&rqstp->rq_wait, &wait); spin_unlock_bh(&pool->sp_lock); - schedule_timeout(timeout); + time_left = schedule_timeout(timeout); try_to_freeze(); spin_lock_bh(&pool->sp_lock); remove_wait_queue(&rqstp->rq_wait, &wait); + if (!time_left) + pool->sp_stats.threads_timedout++; xprt = rqstp->rq_xprt; if (!xprt) { @@@ -958,7 -974,7 +977,7 @@@ static struct cache_deferred_req *svc_d struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); struct svc_deferred_req *dr; - if (rqstp->rq_arg.page_len) + if (rqstp->rq_arg.page_len || !rqstp->rq_usedeferral) return NULL; /* if more than a page, give up FIXME */ if (rqstp->rq_deferred) { dr = rqstp->rq_deferred; @@@ -1036,13 -1052,7 +1055,13 @@@ static struct svc_deferred_req *svc_def return dr; } -/* +/** + * svc_find_xprt - find an RPC transport instance + * @serv: pointer to svc_serv to search + * @xcl_name: C string containing transport's class name + * @af: Address family of transport's local address + * @port: transport's IP port number + * * Return the transport instance pointer for the endpoint accepting * connections/peer traffic from the specified transport class, * address family and port. @@@ -1051,14 -1061,14 +1070,14 @@@ * wild-card, and will result in matching the first transport in the * service's list that has a matching class name. */ -struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name, - int af, int port) +struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name, + const sa_family_t af, const unsigned short port) { struct svc_xprt *xprt; struct svc_xprt *found = NULL; /* Sanity check the args */ - if (!serv || !xcl_name) + if (serv == NULL || xcl_name == NULL) return found; spin_lock_bh(&serv->sv_lock); @@@ -1067,7 -1077,7 +1086,7 @@@ continue; if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family) continue; - if (port && port != svc_xprt_local_port(xprt)) + if (port != 0 && port != svc_xprt_local_port(xprt)) continue; found = xprt; svc_xprt_get(xprt); @@@ -1112,3 -1122,93 +1131,93 @@@ int svc_xprt_names(struct svc_serv *ser return totlen; } EXPORT_SYMBOL_GPL(svc_xprt_names); + + + /*----------------------------------------------------------------------------*/ + + static void *svc_pool_stats_start(struct seq_file *m, loff_t *pos) + { + unsigned int pidx = (unsigned int)*pos; + struct svc_serv *serv = m->private; + + dprintk("svc_pool_stats_start, *pidx=%u\n", pidx); + + lock_kernel(); + /* bump up the pseudo refcount while traversing */ + svc_get(serv); + unlock_kernel(); + + if (!pidx) + return SEQ_START_TOKEN; + return (pidx > serv->sv_nrpools ? NULL : &serv->sv_pools[pidx-1]); + } + + static void *svc_pool_stats_next(struct seq_file *m, void *p, loff_t *pos) + { + struct svc_pool *pool = p; + struct svc_serv *serv = m->private; + + dprintk("svc_pool_stats_next, *pos=%llu\n", *pos); + + if (p == SEQ_START_TOKEN) { + pool = &serv->sv_pools[0]; + } else { + unsigned int pidx = (pool - &serv->sv_pools[0]); + if (pidx < serv->sv_nrpools-1) + pool = &serv->sv_pools[pidx+1]; + else + pool = NULL; + } + ++*pos; + return pool; + } + + static void svc_pool_stats_stop(struct seq_file *m, void *p) + { + struct svc_serv *serv = m->private; + + lock_kernel(); + /* this function really, really should have been called svc_put() */ + svc_destroy(serv); + unlock_kernel(); + } + + static int svc_pool_stats_show(struct seq_file *m, void *p) + { + struct svc_pool *pool = p; + + if (p == SEQ_START_TOKEN) { + seq_puts(m, "# pool packets-arrived sockets-enqueued threads-woken overloads-avoided threads-timedout\n"); + return 0; + } + + seq_printf(m, "%u %lu %lu %lu %lu %lu\n", + pool->sp_id, + pool->sp_stats.packets, + pool->sp_stats.sockets_queued, + pool->sp_stats.threads_woken, + pool->sp_stats.overloads_avoided, + pool->sp_stats.threads_timedout); + + return 0; + } + + static const struct seq_operations svc_pool_stats_seq_ops = { + .start = svc_pool_stats_start, + .next = svc_pool_stats_next, + .stop = svc_pool_stats_stop, + .show = svc_pool_stats_show, + }; + + int svc_pool_stats_open(struct svc_serv *serv, struct file *file) + { + int err; + + err = seq_open(file, &svc_pool_stats_seq_ops); + if (!err) + ((struct seq_file *) file->private_data)->private = serv; + return err; + } + EXPORT_SYMBOL(svc_pool_stats_open); + + /*----------------------------------------------------------------------------*/ diff --combined net/sunrpc/svcsock.c index 9d504234af4,7a2a90fb2e0..af3198814c1 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@@ -345,7 -345,6 +345,6 @@@ static void svc_sock_setbufsize(struct lock_sock(sock->sk); sock->sk->sk_sndbuf = snd * 2; sock->sk->sk_rcvbuf = rcv * 2; - sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; release_sock(sock->sk); #endif } @@@ -797,23 -796,6 +796,6 @@@ static int svc_tcp_recvfrom(struct svc_ test_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags), test_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags)); - if (test_and_clear_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags)) - /* sndbuf needs to have room for one request - * per thread, otherwise we can stall even when the - * network isn't a bottleneck. - * - * We count all threads rather than threads in a - * particular pool, which provides an upper bound - * on the number of threads which will access the socket. - * - * rcvbuf just needs to be able to hold a few requests. - * Normally they will be removed from the queue - * as soon a a complete request arrives. - */ - svc_sock_setbufsize(svsk->sk_sock, - (serv->sv_nrthreads+3) * serv->sv_max_mesg, - 3 * serv->sv_max_mesg); - clear_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); /* Receive data. If we haven't got the record length yet, get @@@ -1061,15 -1043,6 +1043,6 @@@ static void svc_tcp_init(struct svc_soc tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; - /* initialise setting must have enough space to - * receive and respond to one request. - * svc_tcp_recvfrom will re-adjust if necessary - */ - svc_sock_setbufsize(svsk->sk_sock, - 3 * svsk->sk_xprt.xpt_server->sv_max_mesg, - 3 * svsk->sk_xprt.xpt_server->sv_max_mesg); - - set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); if (sk->sk_state != TCP_ESTABLISHED) set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags); @@@ -1110,6 -1083,7 +1083,6 @@@ static struct svc_sock *svc_setup_socke struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); - int val; dprintk("svc: svc_setup_socket %p\n", sock); if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) { @@@ -1121,7 -1095,7 +1094,7 @@@ /* Register socket with portmapper */ if (*errp >= 0 && pmap_register) - *errp = svc_register(serv, inet->sk_protocol, + *errp = svc_register(serv, inet->sk_family, inet->sk_protocol, ntohs(inet_sk(inet)->sport)); if (*errp < 0) { @@@ -1139,9 -1113,27 +1112,15 @@@ /* Initialize the socket */ if (sock->type == SOCK_DGRAM) svc_udp_init(svsk, serv); - else + else { + /* initialise setting must have enough space to + * receive and respond to one request. + */ + svc_sock_setbufsize(svsk->sk_sock, 4 * serv->sv_max_mesg, + 4 * serv->sv_max_mesg); svc_tcp_init(svsk, serv); + } - /* - * We start one listener per sv_serv. We want AF_INET - * requests to be automatically shunted to our AF_INET6 - * listener using a mapped IPv4 address. Make sure - * no-one starts an equivalent IPv4 listener, which - * would steal our incoming connections. - */ - val = 0; - if (serv->sv_family == AF_INET6) - kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, - (char *)&val, sizeof(val)); - dprintk("svc: svc_setup_socket created %p (inet %p)\n", svsk, svsk->sk_sk); @@@ -1209,8 -1201,6 +1188,8 @@@ static struct svc_xprt *svc_create_sock struct sockaddr_storage addr; struct sockaddr *newsin = (struct sockaddr *)&addr; int newlen; + int family; + int val; RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk("svc: svc_create_socket(%s, %d, %s)\n", @@@ -1222,35 -1212,14 +1201,35 @@@ "sockets supported\n"); return ERR_PTR(-EINVAL); } + type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; + switch (sin->sa_family) { + case AF_INET6: + family = PF_INET6; + break; + case AF_INET: + family = PF_INET; + break; + default: + return ERR_PTR(-EINVAL); + } - error = sock_create_kern(sin->sa_family, type, protocol, &sock); + error = sock_create_kern(family, type, protocol, &sock); if (error < 0) return ERR_PTR(error); svc_reclassify_socket(sock); + /* + * If this is an PF_INET6 listener, we want to avoid + * getting requests from IPv4 remotes. Those should + * be shunted to a PF_INET listener via rpcbind. + */ + val = 1; + if (family == PF_INET6) + kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY, + (char *)&val, sizeof(val)); + if (type == SOCK_STREAM) sock->sk->sk_reuse = 1; /* allow address reuse */ error = kernel_bind(sock, sin, len);