]> www.pilppa.org Git - linux-2.6-omap-h63xx.git/blob - fs/proc/base.c
4b74dba69a6d6761aeed6595ba543fbcfc76730a
[linux-2.6-omap-h63xx.git] / fs / proc / base.c
1 /*
2  *  linux/fs/proc/base.c
3  *
4  *  Copyright (C) 1991, 1992 Linus Torvalds
5  *
6  *  proc base directory handling functions
7  *
8  *  1999, Al Viro. Rewritten. Now it covers the whole per-process part.
9  *  Instead of using magical inumbers to determine the kind of object
10  *  we allocate and fill in-core inodes upon lookup. They don't even
11  *  go into icache. We cache the reference to task_struct upon lookup too.
12  *  Eventually it should become a filesystem of its own. We don't use the
13  *  rest of procfs anymore.
14  *
15  *
16  *  Changelog:
17  *  17-Jan-2005
18  *  Allan Bezerra
19  *  Bruna Moreira <bruna.moreira@indt.org.br>
20  *  Edjard Mota <edjard.mota@indt.org.br>
21  *  Ilias Biris <ilias.biris@indt.org.br>
22  *  Mauricio Lin <mauricio.lin@indt.org.br>
23  *
24  *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
25  *
26  *  A new process specific entry (smaps) included in /proc. It shows the
27  *  size of rss for each memory area. The maps entry lacks information
28  *  about physical memory size (rss) for each mapped file, i.e.,
29  *  rss information for executables and library files.
30  *  This additional information is useful for any tools that need to know
31  *  about physical memory consumption for a process specific library.
32  *
33  *  Changelog:
34  *  21-Feb-2005
35  *  Embedded Linux Lab - 10LE Instituto Nokia de Tecnologia - INdT
36  *  Pud inclusion in the page table walking.
37  *
38  *  ChangeLog:
39  *  10-Mar-2005
40  *  10LE Instituto Nokia de Tecnologia - INdT:
41  *  A better way to walk through the page table, as suggested by Hugh Dickins.
42  *
43  *  Simo Piiroinen <simo.piiroinen@nokia.com>:
44  *  Smaps information related to shared, private, clean and dirty pages.
45  *
46  *  Paul Mundt <paul.mundt@nokia.com>:
47  *  Overall revision about smaps.
48  */
49
50 #include <asm/uaccess.h>
51
52 #include <linux/errno.h>
53 #include <linux/time.h>
54 #include <linux/proc_fs.h>
55 #include <linux/stat.h>
56 #include <linux/init.h>
57 #include <linux/capability.h>
58 #include <linux/file.h>
59 #include <linux/fdtable.h>
60 #include <linux/string.h>
61 #include <linux/seq_file.h>
62 #include <linux/namei.h>
63 #include <linux/mnt_namespace.h>
64 #include <linux/mm.h>
65 #include <linux/rcupdate.h>
66 #include <linux/kallsyms.h>
67 #include <linux/resource.h>
68 #include <linux/module.h>
69 #include <linux/mount.h>
70 #include <linux/security.h>
71 #include <linux/ptrace.h>
72 #include <linux/tracehook.h>
73 #include <linux/cgroup.h>
74 #include <linux/cpuset.h>
75 #include <linux/audit.h>
76 #include <linux/poll.h>
77 #include <linux/nsproxy.h>
78 #include <linux/oom.h>
79 #include <linux/elf.h>
80 #include <linux/pid_namespace.h>
81 #include "internal.h"
82
83 /* NOTE:
84  *      Implementing inode permission operations in /proc is almost
85  *      certainly an error.  Permission checks need to happen during
86  *      each system call not at open time.  The reason is that most of
87  *      what we wish to check for permissions in /proc varies at runtime.
88  *
89  *      The classic example of a problem is opening file descriptors
90  *      in /proc for a task before it execs a suid executable.
91  */
92
93 struct pid_entry {
94         char *name;
95         int len;
96         mode_t mode;
97         const struct inode_operations *iop;
98         const struct file_operations *fop;
99         union proc_op op;
100 };
101
/*
 * Generic struct pid_entry initializer.  The name argument must be a string
 * literal (or array), because the stored length is computed with sizeof.
 */
#define NOD(name_, mode_, iop_, fop_, op_) {            \
        .name = (name_),                                \
        .len  = sizeof(name_) - 1,                      \
        .mode = mode_,                                  \
        .iop  = iop_,                                   \
        .fop  = fop_,                                   \
        .op   = op_,                                    \
}
110
/*
 * Directory entry: uses proc_<otype>_inode_operations and
 * proc_<otype>_operations, with an empty proc_op.
 */
#define DIR(name_, mode_, otype_)                                               \
        NOD(name_, (S_IFDIR|(mode_)),                                           \
                &proc_##otype_##_inode_operations, &proc_##otype_##_operations, \
                {} )
/*
 * Symbolic-link entry (mode S_IFLNK|S_IRWXUGO): resolved through
 * proc_<otype>_link, with no file operations.
 */
#define LNK(name_, otype_)                                      \
        NOD(name_, (S_IFLNK|S_IRWXUGO),                         \
                &proc_pid_link_inode_operations, NULL,          \
                { .proc_get_link = &proc_##otype_##_link } )
/* Regular-file entry served by its own proc_<otype>_operations table. */
#define REG(name_, mode_, otype_)                       \
        NOD(name_, (S_IFREG|(mode_)), NULL,             \
                &proc_##otype_##_operations, {})
/*
 * Regular-file entry served by proc_info_file_operations; the content is
 * produced by the proc_<otype> read callback.
 */
#define INF(name_, mode_, otype_)                       \
        NOD(name_, (S_IFREG|(mode_)),                   \
                NULL, &proc_info_file_operations,       \
                { .proc_read = &proc_##otype_ } )
/*
 * Regular-file entry served by proc_single_file_operations; the content is
 * produced by the proc_<otype> show callback.
 */
#define ONE(name_, mode_, otype_)                       \
        NOD(name_, (S_IFREG|(mode_)),                   \
                NULL, &proc_single_file_operations,     \
                { .proc_show = &proc_##otype_ } )
130
131 /*
132  * Count the number of hardlinks for the pid_entry table, excluding the .
133  * and .. links.
134  */
135 static unsigned int pid_entry_count_dirs(const struct pid_entry *entries,
136         unsigned int n)
137 {
138         unsigned int i;
139         unsigned int count;
140
141         count = 0;
142         for (i = 0; i < n; ++i) {
143                 if (S_ISDIR(entries[i].mode))
144                         ++count;
145         }
146
147         return count;
148 }
149
/*
 * Global integer flag, exported so that modules can reference it.
 * NOTE(review): it is not read anywhere in this part of the file; its
 * consumers live elsewhere.
 */
int maps_protect;
EXPORT_SYMBOL(maps_protect);
152
153 static struct fs_struct *get_fs_struct(struct task_struct *task)
154 {
155         struct fs_struct *fs;
156         task_lock(task);
157         fs = task->fs;
158         if(fs)
159                 atomic_inc(&fs->count);
160         task_unlock(task);
161         return fs;
162 }
163
164 static int get_nr_threads(struct task_struct *tsk)
165 {
166         /* Must be called with the rcu_read_lock held */
167         unsigned long flags;
168         int count = 0;
169
170         if (lock_task_sighand(tsk, &flags)) {
171                 count = atomic_read(&tsk->signal->count);
172                 unlock_task_sighand(tsk, &flags);
173         }
174         return count;
175 }
176
177 static int proc_cwd_link(struct inode *inode, struct path *path)
178 {
179         struct task_struct *task = get_proc_task(inode);
180         struct fs_struct *fs = NULL;
181         int result = -ENOENT;
182
183         if (task) {
184                 fs = get_fs_struct(task);
185                 put_task_struct(task);
186         }
187         if (fs) {
188                 read_lock(&fs->lock);
189                 *path = fs->pwd;
190                 path_get(&fs->pwd);
191                 read_unlock(&fs->lock);
192                 result = 0;
193                 put_fs_struct(fs);
194         }
195         return result;
196 }
197
198 static int proc_root_link(struct inode *inode, struct path *path)
199 {
200         struct task_struct *task = get_proc_task(inode);
201         struct fs_struct *fs = NULL;
202         int result = -ENOENT;
203
204         if (task) {
205                 fs = get_fs_struct(task);
206                 put_task_struct(task);
207         }
208         if (fs) {
209                 read_lock(&fs->lock);
210                 *path = fs->root;
211                 path_get(&fs->root);
212                 read_unlock(&fs->lock);
213                 result = 0;
214                 put_fs_struct(fs);
215         }
216         return result;
217 }
218
219 /*
220  * Return zero if current may access user memory in @task, -error if not.
221  */
222 static int check_mem_permission(struct task_struct *task)
223 {
224         /*
225          * A task can always look at itself, in case it chooses
226          * to use system calls instead of load instructions.
227          */
228         if (task == current)
229                 return 0;
230
231         /*
232          * If current is actively ptrace'ing, and would also be
233          * permitted to freshly attach with ptrace now, permit it.
234          */
235         if (task_is_stopped_or_traced(task)) {
236                 int match;
237                 rcu_read_lock();
238                 match = (tracehook_tracer_task(task) == current);
239                 rcu_read_unlock();
240                 if (match && ptrace_may_access(task, PTRACE_MODE_ATTACH))
241                         return 0;
242         }
243
244         /*
245          * Noone else is allowed.
246          */
247         return -EPERM;
248 }
249
250 struct mm_struct *mm_for_maps(struct task_struct *task)
251 {
252         struct mm_struct *mm = get_task_mm(task);
253         if (!mm)
254                 return NULL;
255         down_read(&mm->mmap_sem);
256         task_lock(task);
257         if (task->mm != mm)
258                 goto out;
259         if (task->mm != current->mm &&
260             __ptrace_may_access(task, PTRACE_MODE_READ) < 0)
261                 goto out;
262         task_unlock(task);
263         return mm;
264 out:
265         task_unlock(task);
266         up_read(&mm->mmap_sem);
267         mmput(mm);
268         return NULL;
269 }
270
271 static int proc_pid_cmdline(struct task_struct *task, char * buffer)
272 {
273         int res = 0;
274         unsigned int len;
275         struct mm_struct *mm = get_task_mm(task);
276         if (!mm)
277                 goto out;
278         if (!mm->arg_end)
279                 goto out_mm;    /* Shh! No looking before we're done */
280
281         len = mm->arg_end - mm->arg_start;
282  
283         if (len > PAGE_SIZE)
284                 len = PAGE_SIZE;
285  
286         res = access_process_vm(task, mm->arg_start, buffer, len, 0);
287
288         // If the nul at the end of args has been overwritten, then
289         // assume application is using setproctitle(3).
290         if (res > 0 && buffer[res-1] != '\0' && len < PAGE_SIZE) {
291                 len = strnlen(buffer, res);
292                 if (len < res) {
293                     res = len;
294                 } else {
295                         len = mm->env_end - mm->env_start;
296                         if (len > PAGE_SIZE - res)
297                                 len = PAGE_SIZE - res;
298                         res += access_process_vm(task, mm->env_start, buffer+res, len, 0);
299                         res = strnlen(buffer, res);
300                 }
301         }
302 out_mm:
303         mmput(mm);
304 out:
305         return res;
306 }
307
308 static int proc_pid_auxv(struct task_struct *task, char *buffer)
309 {
310         int res = 0;
311         struct mm_struct *mm = get_task_mm(task);
312         if (mm) {
313                 unsigned int nwords = 0;
314                 do
315                         nwords += 2;
316                 while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */
317                 res = nwords * sizeof(mm->saved_auxv[0]);
318                 if (res > PAGE_SIZE)
319                         res = PAGE_SIZE;
320                 memcpy(buffer, mm->saved_auxv, res);
321                 mmput(mm);
322         }
323         return res;
324 }
325
326
327 #ifdef CONFIG_KALLSYMS
328 /*
329  * Provides a wchan file via kallsyms in a proper one-value-per-file format.
330  * Returns the resolved symbol.  If that fails, simply return the address.
331  */
332 static int proc_pid_wchan(struct task_struct *task, char *buffer)
333 {
334         unsigned long wchan;
335         char symname[KSYM_NAME_LEN];
336
337         wchan = get_wchan(task);
338
339         if (lookup_symbol_name(wchan, symname) < 0)
340                 return sprintf(buffer, "%lu", wchan);
341         else
342                 return sprintf(buffer, "%s", symname);
343 }
344 #endif /* CONFIG_KALLSYMS */
345
346 #ifdef CONFIG_SCHEDSTATS
347 /*
348  * Provides /proc/PID/schedstat
349  */
350 static int proc_pid_schedstat(struct task_struct *task, char *buffer)
351 {
352         return sprintf(buffer, "%llu %llu %lu\n",
353                         task->sched_info.cpu_time,
354                         task->sched_info.run_delay,
355                         task->sched_info.pcount);
356 }
357 #endif
358
359 #ifdef CONFIG_LATENCYTOP
360 static int lstats_show_proc(struct seq_file *m, void *v)
361 {
362         int i;
363         struct inode *inode = m->private;
364         struct task_struct *task = get_proc_task(inode);
365
366         if (!task)
367                 return -ESRCH;
368         seq_puts(m, "Latency Top version : v0.1\n");
369         for (i = 0; i < 32; i++) {
370                 if (task->latency_record[i].backtrace[0]) {
371                         int q;
372                         seq_printf(m, "%i %li %li ",
373                                 task->latency_record[i].count,
374                                 task->latency_record[i].time,
375                                 task->latency_record[i].max);
376                         for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
377                                 char sym[KSYM_NAME_LEN];
378                                 char *c;
379                                 if (!task->latency_record[i].backtrace[q])
380                                         break;
381                                 if (task->latency_record[i].backtrace[q] == ULONG_MAX)
382                                         break;
383                                 sprint_symbol(sym, task->latency_record[i].backtrace[q]);
384                                 c = strchr(sym, '+');
385                                 if (c)
386                                         *c = 0;
387                                 seq_printf(m, "%s ", sym);
388                         }
389                         seq_printf(m, "\n");
390                 }
391
392         }
393         put_task_struct(task);
394         return 0;
395 }
396
397 static int lstats_open(struct inode *inode, struct file *file)
398 {
399         return single_open(file, lstats_show_proc, inode);
400 }
401
402 static ssize_t lstats_write(struct file *file, const char __user *buf,
403                             size_t count, loff_t *offs)
404 {
405         struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
406
407         if (!task)
408                 return -ESRCH;
409         clear_all_latency_tracing(task);
410         put_task_struct(task);
411
412         return count;
413 }
414
415 static const struct file_operations proc_lstats_operations = {
416         .open           = lstats_open,
417         .read           = seq_read,
418         .write          = lstats_write,
419         .llseek         = seq_lseek,
420         .release        = single_release,
421 };
422
423 #endif
424
425 /* The badness from the OOM killer */
426 unsigned long badness(struct task_struct *p, unsigned long uptime);
427 static int proc_oom_score(struct task_struct *task, char *buffer)
428 {
429         unsigned long points;
430         struct timespec uptime;
431
432         do_posix_clock_monotonic_gettime(&uptime);
433         read_lock(&tasklist_lock);
434         points = badness(task, uptime.tv_sec);
435         read_unlock(&tasklist_lock);
436         return sprintf(buffer, "%lu\n", points);
437 }
438
/*
 * Human-readable description of one resource limit: the label printed in
 * the first column and the unit printed in the last column (NULL when the
 * limit has no unit).  Both point at string literals and are never
 * modified, hence const.
 */
struct limit_names {
        const char *name;
        const char *unit;
};

/*
 * Labels and units for every resource limit, indexed by RLIMIT_* number.
 * RLIMIT_CPU is expressed in seconds (see getrlimit(2)), not milliseconds.
 */
static const struct limit_names lnames[RLIM_NLIMITS] = {
        [RLIMIT_CPU] = {"Max cpu time", "seconds"},
        [RLIMIT_FSIZE] = {"Max file size", "bytes"},
        [RLIMIT_DATA] = {"Max data size", "bytes"},
        [RLIMIT_STACK] = {"Max stack size", "bytes"},
        [RLIMIT_CORE] = {"Max core file size", "bytes"},
        [RLIMIT_RSS] = {"Max resident set", "bytes"},
        [RLIMIT_NPROC] = {"Max processes", "processes"},
        [RLIMIT_NOFILE] = {"Max open files", "files"},
        [RLIMIT_MEMLOCK] = {"Max locked memory", "bytes"},
        [RLIMIT_AS] = {"Max address space", "bytes"},
        [RLIMIT_LOCKS] = {"Max file locks", "locks"},
        [RLIMIT_SIGPENDING] = {"Max pending signals", "signals"},
        [RLIMIT_MSGQUEUE] = {"Max msgqueue size", "bytes"},
        [RLIMIT_NICE] = {"Max nice priority", NULL},
        [RLIMIT_RTPRIO] = {"Max realtime priority", NULL},
        [RLIMIT_RTTIME] = {"Max realtime timeout", "us"},
};
462
463 /* Display limits for a process */
464 static int proc_pid_limits(struct task_struct *task, char *buffer)
465 {
466         unsigned int i;
467         int count = 0;
468         unsigned long flags;
469         char *bufptr = buffer;
470
471         struct rlimit rlim[RLIM_NLIMITS];
472
473         rcu_read_lock();
474         if (!lock_task_sighand(task,&flags)) {
475                 rcu_read_unlock();
476                 return 0;
477         }
478         memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS);
479         unlock_task_sighand(task, &flags);
480         rcu_read_unlock();
481
482         /*
483          * print the file header
484          */
485         count += sprintf(&bufptr[count], "%-25s %-20s %-20s %-10s\n",
486                         "Limit", "Soft Limit", "Hard Limit", "Units");
487
488         for (i = 0; i < RLIM_NLIMITS; i++) {
489                 if (rlim[i].rlim_cur == RLIM_INFINITY)
490                         count += sprintf(&bufptr[count], "%-25s %-20s ",
491                                          lnames[i].name, "unlimited");
492                 else
493                         count += sprintf(&bufptr[count], "%-25s %-20lu ",
494                                          lnames[i].name, rlim[i].rlim_cur);
495
496                 if (rlim[i].rlim_max == RLIM_INFINITY)
497                         count += sprintf(&bufptr[count], "%-20s ", "unlimited");
498                 else
499                         count += sprintf(&bufptr[count], "%-20lu ",
500                                          rlim[i].rlim_max);
501
502                 if (lnames[i].unit)
503                         count += sprintf(&bufptr[count], "%-10s\n",
504                                          lnames[i].unit);
505                 else
506                         count += sprintf(&bufptr[count], "\n");
507         }
508
509         return count;
510 }
511
512 /************************************************************************/
513 /*                       Here the fs part begins                        */
514 /************************************************************************/
515
516 /* permission checks */
517 static int proc_fd_access_allowed(struct inode *inode)
518 {
519         struct task_struct *task;
520         int allowed = 0;
521         /* Allow access to a task's file descriptors if it is us or we
522          * may use ptrace attach to the process and find out that
523          * information.
524          */
525         task = get_proc_task(inode);
526         if (task) {
527                 allowed = ptrace_may_access(task, PTRACE_MODE_READ);
528                 put_task_struct(task);
529         }
530         return allowed;
531 }
532
533 static int proc_setattr(struct dentry *dentry, struct iattr *attr)
534 {
535         int error;
536         struct inode *inode = dentry->d_inode;
537
538         if (attr->ia_valid & ATTR_MODE)
539                 return -EPERM;
540
541         error = inode_change_ok(inode, attr);
542         if (!error)
543                 error = inode_setattr(inode, attr);
544         return error;
545 }
546
547 static const struct inode_operations proc_def_inode_operations = {
548         .setattr        = proc_setattr,
549 };
550
551 static int mounts_open_common(struct inode *inode, struct file *file,
552                               const struct seq_operations *op)
553 {
554         struct task_struct *task = get_proc_task(inode);
555         struct nsproxy *nsp;
556         struct mnt_namespace *ns = NULL;
557         struct fs_struct *fs = NULL;
558         struct path root;
559         struct proc_mounts *p;
560         int ret = -EINVAL;
561
562         if (task) {
563                 rcu_read_lock();
564                 nsp = task_nsproxy(task);
565                 if (nsp) {
566                         ns = nsp->mnt_ns;
567                         if (ns)
568                                 get_mnt_ns(ns);
569                 }
570                 rcu_read_unlock();
571                 if (ns)
572                         fs = get_fs_struct(task);
573                 put_task_struct(task);
574         }
575
576         if (!ns)
577                 goto err;
578         if (!fs)
579                 goto err_put_ns;
580
581         read_lock(&fs->lock);
582         root = fs->root;
583         path_get(&root);
584         read_unlock(&fs->lock);
585         put_fs_struct(fs);
586
587         ret = -ENOMEM;
588         p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
589         if (!p)
590                 goto err_put_path;
591
592         file->private_data = &p->m;
593         ret = seq_open(file, op);
594         if (ret)
595                 goto err_free;
596
597         p->m.private = p;
598         p->ns = ns;
599         p->root = root;
600         p->event = ns->event;
601
602         return 0;
603
604  err_free:
605         kfree(p);
606  err_put_path:
607         path_put(&root);
608  err_put_ns:
609         put_mnt_ns(ns);
610  err:
611         return ret;
612 }
613
614 static int mounts_release(struct inode *inode, struct file *file)
615 {
616         struct proc_mounts *p = file->private_data;
617         path_put(&p->root);
618         put_mnt_ns(p->ns);
619         return seq_release(inode, file);
620 }
621
622 static unsigned mounts_poll(struct file *file, poll_table *wait)
623 {
624         struct proc_mounts *p = file->private_data;
625         struct mnt_namespace *ns = p->ns;
626         unsigned res = 0;
627
628         poll_wait(file, &ns->poll, wait);
629
630         spin_lock(&vfsmount_lock);
631         if (p->event != ns->event) {
632                 p->event = ns->event;
633                 res = POLLERR;
634         }
635         spin_unlock(&vfsmount_lock);
636
637         return res;
638 }
639
640 static int mounts_open(struct inode *inode, struct file *file)
641 {
642         return mounts_open_common(inode, file, &mounts_op);
643 }
644
645 static const struct file_operations proc_mounts_operations = {
646         .open           = mounts_open,
647         .read           = seq_read,
648         .llseek         = seq_lseek,
649         .release        = mounts_release,
650         .poll           = mounts_poll,
651 };
652
653 static int mountinfo_open(struct inode *inode, struct file *file)
654 {
655         return mounts_open_common(inode, file, &mountinfo_op);
656 }
657
658 static const struct file_operations proc_mountinfo_operations = {
659         .open           = mountinfo_open,
660         .read           = seq_read,
661         .llseek         = seq_lseek,
662         .release        = mounts_release,
663         .poll           = mounts_poll,
664 };
665
666 static int mountstats_open(struct inode *inode, struct file *file)
667 {
668         return mounts_open_common(inode, file, &mountstats_op);
669 }
670
671 static const struct file_operations proc_mountstats_operations = {
672         .open           = mountstats_open,
673         .read           = seq_read,
674         .llseek         = seq_lseek,
675         .release        = mounts_release,
676 };
677
678 #define PROC_BLOCK_SIZE (3*1024)                /* 4K page size but our output routines use some slack for overruns */
679
680 static ssize_t proc_info_read(struct file * file, char __user * buf,
681                           size_t count, loff_t *ppos)
682 {
683         struct inode * inode = file->f_path.dentry->d_inode;
684         unsigned long page;
685         ssize_t length;
686         struct task_struct *task = get_proc_task(inode);
687
688         length = -ESRCH;
689         if (!task)
690                 goto out_no_task;
691
692         if (count > PROC_BLOCK_SIZE)
693                 count = PROC_BLOCK_SIZE;
694
695         length = -ENOMEM;
696         if (!(page = __get_free_page(GFP_TEMPORARY)))
697                 goto out;
698
699         length = PROC_I(inode)->op.proc_read(task, (char*)page);
700
701         if (length >= 0)
702                 length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
703         free_page(page);
704 out:
705         put_task_struct(task);
706 out_no_task:
707         return length;
708 }
709
710 static const struct file_operations proc_info_file_operations = {
711         .read           = proc_info_read,
712 };
713
714 static int proc_single_show(struct seq_file *m, void *v)
715 {
716         struct inode *inode = m->private;
717         struct pid_namespace *ns;
718         struct pid *pid;
719         struct task_struct *task;
720         int ret;
721
722         ns = inode->i_sb->s_fs_info;
723         pid = proc_pid(inode);
724         task = get_pid_task(pid, PIDTYPE_PID);
725         if (!task)
726                 return -ESRCH;
727
728         ret = PROC_I(inode)->op.proc_show(m, ns, pid, task);
729
730         put_task_struct(task);
731         return ret;
732 }
733
734 static int proc_single_open(struct inode *inode, struct file *filp)
735 {
736         int ret;
737         ret = single_open(filp, proc_single_show, NULL);
738         if (!ret) {
739                 struct seq_file *m = filp->private_data;
740
741                 m->private = inode;
742         }
743         return ret;
744 }
745
746 static const struct file_operations proc_single_file_operations = {
747         .open           = proc_single_open,
748         .read           = seq_read,
749         .llseek         = seq_lseek,
750         .release        = single_release,
751 };
752
753 static int mem_open(struct inode* inode, struct file* file)
754 {
755         file->private_data = (void*)((long)current->self_exec_id);
756         return 0;
757 }
758
759 static ssize_t mem_read(struct file * file, char __user * buf,
760                         size_t count, loff_t *ppos)
761 {
762         struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
763         char *page;
764         unsigned long src = *ppos;
765         int ret = -ESRCH;
766         struct mm_struct *mm;
767
768         if (!task)
769                 goto out_no_task;
770
771         if (check_mem_permission(task))
772                 goto out;
773
774         ret = -ENOMEM;
775         page = (char *)__get_free_page(GFP_TEMPORARY);
776         if (!page)
777                 goto out;
778
779         ret = 0;
780  
781         mm = get_task_mm(task);
782         if (!mm)
783                 goto out_free;
784
785         ret = -EIO;
786  
787         if (file->private_data != (void*)((long)current->self_exec_id))
788                 goto out_put;
789
790         ret = 0;
791  
792         while (count > 0) {
793                 int this_len, retval;
794
795                 this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
796                 retval = access_process_vm(task, src, page, this_len, 0);
797                 if (!retval || check_mem_permission(task)) {
798                         if (!ret)
799                                 ret = -EIO;
800                         break;
801                 }
802
803                 if (copy_to_user(buf, page, retval)) {
804                         ret = -EFAULT;
805                         break;
806                 }
807  
808                 ret += retval;
809                 src += retval;
810                 buf += retval;
811                 count -= retval;
812         }
813         *ppos = src;
814
815 out_put:
816         mmput(mm);
817 out_free:
818         free_page((unsigned long) page);
819 out:
820         put_task_struct(task);
821 out_no_task:
822         return ret;
823 }
824
/*
 * Writing through the mem file is disabled: mem_write is defined as NULL,
 * so the function below is compiled out and the .write slot of
 * proc_mem_operations is NULL.
 */
#define mem_write NULL

#ifndef mem_write
/* This is a security hazard */
/*
 * write() handler for the mem file: copy @count bytes from user space into
 * the task's memory at address *@ppos, one page at a time.
 *
 * Returns the number of bytes written, -ESRCH if the task is gone or
 * access is refused, -ENOMEM if no bounce page is available, -EFAULT if
 * the user buffer is unreadable, or -EIO if nothing could be written.
 */
static ssize_t mem_write(struct file * file, const char __user *buf,
                         size_t count, loff_t *ppos)
{
        struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
        unsigned long dst = *ppos;
        char *page;
        int copied = -ESRCH;

        if (!task)
                goto out_no_task;

        if (check_mem_permission(task))
                goto out;

        page = (char *)__get_free_page(GFP_TEMPORARY);
        if (!page) {
                copied = -ENOMEM;
                goto out;
        }

        copied = 0;
        while (count > 0) {
                int chunk, done;

                chunk = (count > PAGE_SIZE) ? PAGE_SIZE : count;
                if (copy_from_user(page, buf, chunk)) {
                        copied = -EFAULT;
                        break;
                }
                done = access_process_vm(task, dst, page, chunk, 1);
                if (!done) {
                        if (!copied)
                                copied = -EIO;
                        break;
                }
                copied += done;
                buf += done;
                dst += done;
                count -= done;
        }
        *ppos = dst;
        free_page((unsigned long) page);
out:
        put_task_struct(task);
out_no_task:
        return copied;
}
#endif
877
878 loff_t mem_lseek(struct file *file, loff_t offset, int orig)
879 {
880         switch (orig) {
881         case 0:
882                 file->f_pos = offset;
883                 break;
884         case 1:
885                 file->f_pos += offset;
886                 break;
887         default:
888                 return -EINVAL;
889         }
890         force_successful_syscall_return();
891         return file->f_pos;
892 }
893
894 static const struct file_operations proc_mem_operations = {
895         .llseek         = mem_lseek,
896         .read           = mem_read,
897         .write          = mem_write,
898         .open           = mem_open,
899 };
900
901 static ssize_t environ_read(struct file *file, char __user *buf,
902                         size_t count, loff_t *ppos)
903 {
904         struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
905         char *page;
906         unsigned long src = *ppos;
907         int ret = -ESRCH;
908         struct mm_struct *mm;
909
910         if (!task)
911                 goto out_no_task;
912
913         if (!ptrace_may_access(task, PTRACE_MODE_READ))
914                 goto out;
915
916         ret = -ENOMEM;
917         page = (char *)__get_free_page(GFP_TEMPORARY);
918         if (!page)
919                 goto out;
920
921         ret = 0;
922
923         mm = get_task_mm(task);
924         if (!mm)
925                 goto out_free;
926
927         while (count > 0) {
928                 int this_len, retval, max_len;
929
930                 this_len = mm->env_end - (mm->env_start + src);
931
932                 if (this_len <= 0)
933                         break;
934
935                 max_len = (count > PAGE_SIZE) ? PAGE_SIZE : count;
936                 this_len = (this_len > max_len) ? max_len : this_len;
937
938                 retval = access_process_vm(task, (mm->env_start + src),
939                         page, this_len, 0);
940
941                 if (retval <= 0) {
942                         ret = retval;
943                         break;
944                 }
945
946                 if (copy_to_user(buf, page, retval)) {
947                         ret = -EFAULT;
948                         break;
949                 }
950
951                 ret += retval;
952                 src += retval;
953                 buf += retval;
954                 count -= retval;
955         }
956         *ppos = src;
957
958         mmput(mm);
959 out_free:
960         free_page((unsigned long) page);
961 out:
962         put_task_struct(task);
963 out_no_task:
964         return ret;
965 }
966
/* Read-only file operations table: only environ_read() is provided. */
static const struct file_operations proc_environ_operations = {
        .read           = environ_read,
};
970
971 static ssize_t oom_adjust_read(struct file *file, char __user *buf,
972                                 size_t count, loff_t *ppos)
973 {
974         struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
975         char buffer[PROC_NUMBUF];
976         size_t len;
977         int oom_adjust;
978
979         if (!task)
980                 return -ESRCH;
981         oom_adjust = task->oomkilladj;
982         put_task_struct(task);
983
984         len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
985
986         return simple_read_from_buffer(buf, count, ppos, buffer, len);
987 }
988
989 static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
990                                 size_t count, loff_t *ppos)
991 {
992         struct task_struct *task;
993         char buffer[PROC_NUMBUF], *end;
994         int oom_adjust;
995
996         memset(buffer, 0, sizeof(buffer));
997         if (count > sizeof(buffer) - 1)
998                 count = sizeof(buffer) - 1;
999         if (copy_from_user(buffer, buf, count))
1000                 return -EFAULT;
1001         oom_adjust = simple_strtol(buffer, &end, 0);
1002         if ((oom_adjust < OOM_ADJUST_MIN || oom_adjust > OOM_ADJUST_MAX) &&
1003              oom_adjust != OOM_DISABLE)
1004                 return -EINVAL;
1005         if (*end == '\n')
1006                 end++;
1007         task = get_proc_task(file->f_path.dentry->d_inode);
1008         if (!task)
1009                 return -ESRCH;
1010         if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) {
1011                 put_task_struct(task);
1012                 return -EACCES;
1013         }
1014         task->oomkilladj = oom_adjust;
1015         put_task_struct(task);
1016         if (end - buffer == 0)
1017                 return -EIO;
1018         return end - buffer;
1019 }
1020
/* File operations table pairing oom_adjust_read() with oom_adjust_write(). */
static const struct file_operations proc_oom_adjust_operations = {
        .read           = oom_adjust_read,
        .write          = oom_adjust_write,
};
1025
1026 #ifdef CONFIG_AUDITSYSCALL
1027 #define TMPBUFLEN 21
1028 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
1029                                   size_t count, loff_t *ppos)
1030 {
1031         struct inode * inode = file->f_path.dentry->d_inode;
1032         struct task_struct *task = get_proc_task(inode);
1033         ssize_t length;
1034         char tmpbuf[TMPBUFLEN];
1035
1036         if (!task)
1037                 return -ESRCH;
1038         length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1039                                 audit_get_loginuid(task));
1040         put_task_struct(task);
1041         return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1042 }
1043
1044 static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
1045                                    size_t count, loff_t *ppos)
1046 {
1047         struct inode * inode = file->f_path.dentry->d_inode;
1048         char *page, *tmp;
1049         ssize_t length;
1050         uid_t loginuid;
1051
1052         if (!capable(CAP_AUDIT_CONTROL))
1053                 return -EPERM;
1054
1055         if (current != pid_task(proc_pid(inode), PIDTYPE_PID))
1056                 return -EPERM;
1057
1058         if (count >= PAGE_SIZE)
1059                 count = PAGE_SIZE - 1;
1060
1061         if (*ppos != 0) {
1062                 /* No partial writes. */
1063                 return -EINVAL;
1064         }
1065         page = (char*)__get_free_page(GFP_TEMPORARY);
1066         if (!page)
1067                 return -ENOMEM;
1068         length = -EFAULT;
1069         if (copy_from_user(page, buf, count))
1070                 goto out_free_page;
1071
1072         page[count] = '\0';
1073         loginuid = simple_strtoul(page, &tmp, 10);
1074         if (tmp == page) {
1075                 length = -EINVAL;
1076                 goto out_free_page;
1077
1078         }
1079         length = audit_set_loginuid(current, loginuid);
1080         if (likely(length == 0))
1081                 length = count;
1082
1083 out_free_page:
1084         free_page((unsigned long) page);
1085         return length;
1086 }
1087
/* File operations table pairing proc_loginuid_read() with proc_loginuid_write(). */
static const struct file_operations proc_loginuid_operations = {
        .read           = proc_loginuid_read,
        .write          = proc_loginuid_write,
};
1092
1093 static ssize_t proc_sessionid_read(struct file * file, char __user * buf,
1094                                   size_t count, loff_t *ppos)
1095 {
1096         struct inode * inode = file->f_path.dentry->d_inode;
1097         struct task_struct *task = get_proc_task(inode);
1098         ssize_t length;
1099         char tmpbuf[TMPBUFLEN];
1100
1101         if (!task)
1102                 return -ESRCH;
1103         length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
1104                                 audit_get_sessionid(task));
1105         put_task_struct(task);
1106         return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
1107 }
1108
/* Read-only file operations table: only proc_sessionid_read() is provided. */
static const struct file_operations proc_sessionid_operations = {
        .read           = proc_sessionid_read,
};
1112 #endif
1113
1114 #ifdef CONFIG_FAULT_INJECTION
1115 static ssize_t proc_fault_inject_read(struct file * file, char __user * buf,
1116                                       size_t count, loff_t *ppos)
1117 {
1118         struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
1119         char buffer[PROC_NUMBUF];
1120         size_t len;
1121         int make_it_fail;
1122
1123         if (!task)
1124                 return -ESRCH;
1125         make_it_fail = task->make_it_fail;
1126         put_task_struct(task);
1127
1128         len = snprintf(buffer, sizeof(buffer), "%i\n", make_it_fail);
1129
1130         return simple_read_from_buffer(buf, count, ppos, buffer, len);
1131 }
1132
1133 static ssize_t proc_fault_inject_write(struct file * file,
1134                         const char __user * buf, size_t count, loff_t *ppos)
1135 {
1136         struct task_struct *task;
1137         char buffer[PROC_NUMBUF], *end;
1138         int make_it_fail;
1139
1140         if (!capable(CAP_SYS_RESOURCE))
1141                 return -EPERM;
1142         memset(buffer, 0, sizeof(buffer));
1143         if (count > sizeof(buffer) - 1)
1144                 count = sizeof(buffer) - 1;
1145         if (copy_from_user(buffer, buf, count))
1146                 return -EFAULT;
1147         make_it_fail = simple_strtol(buffer, &end, 0);
1148         if (*end == '\n')
1149                 end++;
1150         task = get_proc_task(file->f_dentry->d_inode);
1151         if (!task)
1152                 return -ESRCH;
1153         task->make_it_fail = make_it_fail;
1154         put_task_struct(task);
1155         if (end - buffer == 0)
1156                 return -EIO;
1157         return end - buffer;
1158 }
1159
/*
 * File operations table pairing proc_fault_inject_read() with
 * proc_fault_inject_write().
 */
static const struct file_operations proc_fault_inject_operations = {
        .read           = proc_fault_inject_read,
        .write          = proc_fault_inject_write,
};
1164 #endif
1165
1166
1167 #ifdef CONFIG_SCHED_DEBUG
1168 /*
1169  * Print out various scheduling related per-task fields:
1170  */
1171 static int sched_show(struct seq_file *m, void *v)
1172 {
1173         struct inode *inode = m->private;
1174         struct task_struct *p;
1175
1176         WARN_ON(!inode);
1177
1178         p = get_proc_task(inode);
1179         if (!p)
1180                 return -ESRCH;
1181         proc_sched_show_task(p, m);
1182
1183         put_task_struct(p);
1184
1185         return 0;
1186 }
1187
1188 static ssize_t
1189 sched_write(struct file *file, const char __user *buf,
1190             size_t count, loff_t *offset)
1191 {
1192         struct inode *inode = file->f_path.dentry->d_inode;
1193         struct task_struct *p;
1194
1195         WARN_ON(!inode);
1196
1197         p = get_proc_task(inode);
1198         if (!p)
1199                 return -ESRCH;
1200         proc_sched_set_task(p);
1201
1202         put_task_struct(p);
1203
1204         return count;
1205 }
1206
1207 static int sched_open(struct inode *inode, struct file *filp)
1208 {
1209         int ret;
1210
1211         ret = single_open(filp, sched_show, NULL);
1212         if (!ret) {
1213                 struct seq_file *m = filp->private_data;
1214
1215                 m->private = inode;
1216         }
1217         return ret;
1218 }
1219
/*
 * File operations table: a seq_file opened through sched_open() (read and
 * seek via seq_read()/seq_lseek(), released with single_release()), plus
 * sched_write() as the write handler.
 */
static const struct file_operations proc_pid_sched_operations = {
        .open           = sched_open,
        .read           = seq_read,
        .write          = sched_write,
        .llseek         = seq_lseek,
        .release        = single_release,
};
1227
1228 #endif
1229
1230 /*
1231  * We added or removed a vma mapping the executable. The vmas are only mapped
1232  * during exec and are not mapped with the mmap system call.
1233  * Callers must hold down_write() on the mm's mmap_sem for these
1234  */
1235 void added_exe_file_vma(struct mm_struct *mm)
1236 {
1237         mm->num_exe_file_vmas++;
1238 }
1239
1240 void removed_exe_file_vma(struct mm_struct *mm)
1241 {
1242         mm->num_exe_file_vmas--;
1243         if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
1244                 fput(mm->exe_file);
1245                 mm->exe_file = NULL;
1246         }
1247
1248 }
1249
1250 void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
1251 {
1252         if (new_exe_file)
1253                 get_file(new_exe_file);
1254         if (mm->exe_file)
1255                 fput(mm->exe_file);
1256         mm->exe_file = new_exe_file;
1257         mm->num_exe_file_vmas = 0;
1258 }
1259
1260 struct file *get_mm_exe_file(struct mm_struct *mm)
1261 {
1262         struct file *exe_file;
1263
1264         /* We need mmap_sem to protect against races with removal of
1265          * VM_EXECUTABLE vmas */
1266         down_read(&mm->mmap_sem);
1267         exe_file = mm->exe_file;
1268         if (exe_file)
1269                 get_file(exe_file);
1270         up_read(&mm->mmap_sem);
1271         return exe_file;
1272 }
1273
1274 void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
1275 {
1276         /* It's safe to write the exe_file pointer without exe_file_lock because
1277          * this is called during fork when the task is not yet in /proc */
1278         newmm->exe_file = get_mm_exe_file(oldmm);
1279 }
1280
1281 static int proc_exe_link(struct inode *inode, struct path *exe_path)
1282 {
1283         struct task_struct *task;
1284         struct mm_struct *mm;
1285         struct file *exe_file;
1286
1287         task = get_proc_task(inode);
1288         if (!task)
1289                 return -ENOENT;
1290         mm = get_task_mm(task);
1291         put_task_struct(task);
1292         if (!mm)
1293                 return -ENOENT;
1294         exe_file = get_mm_exe_file(mm);
1295         mmput(mm);
1296         if (exe_file) {
1297                 *exe_path = exe_file->f_path;
1298                 path_get(&exe_file->f_path);
1299                 fput(exe_file);
1300                 return 0;
1301         } else
1302                 return -ENOENT;
1303 }
1304
1305 static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
1306 {
1307         struct inode *inode = dentry->d_inode;
1308         int error = -EACCES;
1309
1310         /* We don't need a base pointer in the /proc filesystem */
1311         path_put(&nd->path);
1312
1313         /* Are we allowed to snoop on the tasks file descriptors? */
1314         if (!proc_fd_access_allowed(inode))
1315                 goto out;
1316
1317         error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
1318         nd->last_type = LAST_BIND;
1319 out:
1320         return ERR_PTR(error);
1321 }
1322
1323 static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
1324 {
1325         char *tmp = (char*)__get_free_page(GFP_TEMPORARY);
1326         char *pathname;
1327         int len;
1328
1329         if (!tmp)
1330                 return -ENOMEM;
1331
1332         pathname = d_path(path, tmp, PAGE_SIZE);
1333         len = PTR_ERR(pathname);
1334         if (IS_ERR(pathname))
1335                 goto out;
1336         len = tmp + PAGE_SIZE - 1 - pathname;
1337
1338         if (len > buflen)
1339                 len = buflen;
1340         if (copy_to_user(buffer, pathname, len))
1341                 len = -EFAULT;
1342  out:
1343         free_page((unsigned long)tmp);
1344         return len;
1345 }
1346
1347 static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
1348 {
1349         int error = -EACCES;
1350         struct inode *inode = dentry->d_inode;
1351         struct path path;
1352
1353         /* Are we allowed to snoop on the tasks file descriptors? */
1354         if (!proc_fd_access_allowed(inode))
1355                 goto out;
1356
1357         error = PROC_I(inode)->op.proc_get_link(inode, &path);
1358         if (error)
1359                 goto out;
1360
1361         error = do_proc_readlink(&path, buffer, buflen);
1362         path_put(&path);
1363 out:
1364         return error;
1365 }
1366
/*
 * Inode operations table for link inodes: readlink and follow_link are
 * served by proc_pid_readlink() and proc_pid_follow_link(); setattr goes
 * through proc_setattr().
 */
static const struct inode_operations proc_pid_link_inode_operations = {
        .readlink       = proc_pid_readlink,
        .follow_link    = proc_pid_follow_link,
        .setattr        = proc_setattr,
};
1372
1373
1374 /* building an inode */
1375
1376 static int task_dumpable(struct task_struct *task)
1377 {
1378         int dumpable = 0;
1379         struct mm_struct *mm;
1380
1381         task_lock(task);
1382         mm = task->mm;
1383         if (mm)
1384                 dumpable = get_dumpable(mm);
1385         task_unlock(task);
1386         if(dumpable == 1)
1387                 return 1;
1388         return 0;
1389 }
1390
1391
1392 static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task)
1393 {
1394         struct inode * inode;
1395         struct proc_inode *ei;
1396
1397         /* We need a new inode */
1398
1399         inode = new_inode(sb);
1400         if (!inode)
1401                 goto out;
1402
1403         /* Common stuff */
1404         ei = PROC_I(inode);
1405         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
1406         inode->i_op = &proc_def_inode_operations;
1407
1408         /*
1409          * grab the reference to task.
1410          */
1411         ei->pid = get_task_pid(task, PIDTYPE_PID);
1412         if (!ei->pid)
1413                 goto out_unlock;
1414
1415         inode->i_uid = 0;
1416         inode->i_gid = 0;
1417         if (task_dumpable(task)) {
1418                 inode->i_uid = task->euid;
1419                 inode->i_gid = task->egid;
1420         }
1421         security_task_to_inode(task, inode);
1422
1423 out:
1424         return inode;
1425
1426 out_unlock:
1427         iput(inode);
1428         return NULL;
1429 }
1430
1431 static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
1432 {
1433         struct inode *inode = dentry->d_inode;
1434         struct task_struct *task;
1435         generic_fillattr(inode, stat);
1436
1437         rcu_read_lock();
1438         stat->uid = 0;
1439         stat->gid = 0;
1440         task = pid_task(proc_pid(inode), PIDTYPE_PID);
1441         if (task) {
1442                 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1443                     task_dumpable(task)) {
1444                         stat->uid = task->euid;
1445                         stat->gid = task->egid;
1446                 }
1447         }
1448         rcu_read_unlock();
1449         return 0;
1450 }
1451
1452 /* dentry stuff */
1453
1454 /*
1455  *      Exceptional case: normally we are not allowed to unhash a busy
1456  * directory. In this case, however, we can do it - no aliasing problems
1457  * due to the way we treat inodes.
1458  *
1459  * Rewrite the inode's ownerships here because the owning task may have
1460  * performed a setuid(), etc.
1461  *
1462  * Before the /proc/pid/status file was created the only way to read
1463  * the effective uid of a /process was to stat /proc/pid.  Reading
1464  * /proc/pid/status is slow enough that procps and other packages
1465  * kept stating /proc/pid.  To keep the rules in /proc simple I have
1466  * made this apply to all per process world readable and executable
1467  * directories.
1468  */
1469 static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
1470 {
1471         struct inode *inode = dentry->d_inode;
1472         struct task_struct *task = get_proc_task(inode);
1473         if (task) {
1474                 if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
1475                     task_dumpable(task)) {
1476                         inode->i_uid = task->euid;
1477                         inode->i_gid = task->egid;
1478                 } else {
1479                         inode->i_uid = 0;
1480                         inode->i_gid = 0;
1481                 }
1482                 inode->i_mode &= ~(S_ISUID | S_ISGID);
1483                 security_task_to_inode(task, inode);
1484                 put_task_struct(task);
1485                 return 1;
1486         }
1487         d_drop(dentry);
1488         return 0;
1489 }
1490
1491 static int pid_delete_dentry(struct dentry * dentry)
1492 {
1493         /* Is the task we represent dead?
1494          * If so, then don't put the dentry on the lru list,
1495          * kill it immediately.
1496          */
1497         return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
1498 }
1499
/*
 * Dentry operations table: revalidation through pid_revalidate() and
 * deletion policy through pid_delete_dentry().
 */
static struct dentry_operations pid_dentry_operations =
{
        .d_revalidate   = pid_revalidate,
        .d_delete       = pid_delete_dentry,
};
1505
1506 /* Lookups */
1507
/*
 * Callback type used by proc_fill_cache() and proc_lookupfd_common() to
 * populate a dentry for a task.  Arguments are the parent directory inode,
 * the dentry to fill, the task, and an opaque pointer whose meaning is
 * defined by the callback (the fd instantiators read an unsigned fd number
 * through it).  proc_fill_cache() treats a NULL return as "use the dentry
 * that was passed in" and a non-NULL return as the result to use instead.
 */
typedef struct dentry *instantiate_t(struct inode *, struct dentry *,
                                struct task_struct *, const void *);
1510
1511 /*
1512  * Fill a directory entry.
1513  *
1514  * If possible create the dcache entry and derive our inode number and
1515  * file type from dcache entry.
1516  *
1517  * Since all of the proc inode numbers are dynamically generated, the inode
1518  * numbers do not exist until the inode is cache.  This means creating the
1519  * the dcache entry in readdir is necessary to keep the inode numbers
1520  * reported by readdir in sync with the inode numbers reported
1521  * by stat.
1522  */
1523 static int proc_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
1524         char *name, int len,
1525         instantiate_t instantiate, struct task_struct *task, const void *ptr)
1526 {
1527         struct dentry *child, *dir = filp->f_path.dentry;
1528         struct inode *inode;
1529         struct qstr qname;
1530         ino_t ino = 0;
1531         unsigned type = DT_UNKNOWN;
1532
1533         qname.name = name;
1534         qname.len  = len;
1535         qname.hash = full_name_hash(name, len);
1536
1537         child = d_lookup(dir, &qname);
1538         if (!child) {
1539                 struct dentry *new;
1540                 new = d_alloc(dir, &qname);
1541                 if (new) {
1542                         child = instantiate(dir->d_inode, new, task, ptr);
1543                         if (child)
1544                                 dput(new);
1545                         else
1546                                 child = new;
1547                 }
1548         }
1549         if (!child || IS_ERR(child) || !child->d_inode)
1550                 goto end_instantiate;
1551         inode = child->d_inode;
1552         if (inode) {
1553                 ino = inode->i_ino;
1554                 type = inode->i_mode >> 12;
1555         }
1556         dput(child);
1557 end_instantiate:
1558         if (!ino)
1559                 ino = find_inode_number(dir, &qname);
1560         if (!ino)
1561                 ino = 1;
1562         return filldir(dirent, name, len, filp->f_pos, ino, type);
1563 }
1564
1565 static unsigned name_to_int(struct dentry *dentry)
1566 {
1567         const char *name = dentry->d_name.name;
1568         int len = dentry->d_name.len;
1569         unsigned n = 0;
1570
1571         if (len > 1 && *name == '0')
1572                 goto out;
1573         while (len-- > 0) {
1574                 unsigned c = *name++ - '0';
1575                 if (c > 9)
1576                         goto out;
1577                 if (n >= (~0U-9)/10)
1578                         goto out;
1579                 n *= 10;
1580                 n += c;
1581         }
1582         return n;
1583 out:
1584         return ~0U;
1585 }
1586
1587 #define PROC_FDINFO_MAX 64
1588
1589 static int proc_fd_info(struct inode *inode, struct path *path, char *info)
1590 {
1591         struct task_struct *task = get_proc_task(inode);
1592         struct files_struct *files = NULL;
1593         struct file *file;
1594         int fd = proc_fd(inode);
1595
1596         if (task) {
1597                 files = get_files_struct(task);
1598                 put_task_struct(task);
1599         }
1600         if (files) {
1601                 /*
1602                  * We are not taking a ref to the file structure, so we must
1603                  * hold ->file_lock.
1604                  */
1605                 spin_lock(&files->file_lock);
1606                 file = fcheck_files(files, fd);
1607                 if (file) {
1608                         if (path) {
1609                                 *path = file->f_path;
1610                                 path_get(&file->f_path);
1611                         }
1612                         if (info)
1613                                 snprintf(info, PROC_FDINFO_MAX,
1614                                          "pos:\t%lli\n"
1615                                          "flags:\t0%o\n",
1616                                          (long long) file->f_pos,
1617                                          file->f_flags);
1618                         spin_unlock(&files->file_lock);
1619                         put_files_struct(files);
1620                         return 0;
1621                 }
1622                 spin_unlock(&files->file_lock);
1623                 put_files_struct(files);
1624         }
1625         return -ENOENT;
1626 }
1627
1628 static int proc_fd_link(struct inode *inode, struct path *path)
1629 {
1630         return proc_fd_info(inode, path, NULL);
1631 }
1632
1633 static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
1634 {
1635         struct inode *inode = dentry->d_inode;
1636         struct task_struct *task = get_proc_task(inode);
1637         int fd = proc_fd(inode);
1638         struct files_struct *files;
1639
1640         if (task) {
1641                 files = get_files_struct(task);
1642                 if (files) {
1643                         rcu_read_lock();
1644                         if (fcheck_files(files, fd)) {
1645                                 rcu_read_unlock();
1646                                 put_files_struct(files);
1647                                 if (task_dumpable(task)) {
1648                                         inode->i_uid = task->euid;
1649                                         inode->i_gid = task->egid;
1650                                 } else {
1651                                         inode->i_uid = 0;
1652                                         inode->i_gid = 0;
1653                                 }
1654                                 inode->i_mode &= ~(S_ISUID | S_ISGID);
1655                                 security_task_to_inode(task, inode);
1656                                 put_task_struct(task);
1657                                 return 1;
1658                         }
1659                         rcu_read_unlock();
1660                         put_files_struct(files);
1661                 }
1662                 put_task_struct(task);
1663         }
1664         d_drop(dentry);
1665         return 0;
1666 }
1667
/*
 * Dentry operations table for per-fd entries: revalidation through
 * tid_fd_revalidate(), deletion policy shared with pid_delete_dentry().
 */
static struct dentry_operations tid_fd_dentry_operations =
{
        .d_revalidate   = tid_fd_revalidate,
        .d_delete       = pid_delete_dentry,
};
1673
1674 static struct dentry *proc_fd_instantiate(struct inode *dir,
1675         struct dentry *dentry, struct task_struct *task, const void *ptr)
1676 {
1677         unsigned fd = *(const unsigned *)ptr;
1678         struct file *file;
1679         struct files_struct *files;
1680         struct inode *inode;
1681         struct proc_inode *ei;
1682         struct dentry *error = ERR_PTR(-ENOENT);
1683
1684         inode = proc_pid_make_inode(dir->i_sb, task);
1685         if (!inode)
1686                 goto out;
1687         ei = PROC_I(inode);
1688         ei->fd = fd;
1689         files = get_files_struct(task);
1690         if (!files)
1691                 goto out_iput;
1692         inode->i_mode = S_IFLNK;
1693
1694         /*
1695          * We are not taking a ref to the file structure, so we must
1696          * hold ->file_lock.
1697          */
1698         spin_lock(&files->file_lock);
1699         file = fcheck_files(files, fd);
1700         if (!file)
1701                 goto out_unlock;
1702         if (file->f_mode & 1)
1703                 inode->i_mode |= S_IRUSR | S_IXUSR;
1704         if (file->f_mode & 2)
1705                 inode->i_mode |= S_IWUSR | S_IXUSR;
1706         spin_unlock(&files->file_lock);
1707         put_files_struct(files);
1708
1709         inode->i_op = &proc_pid_link_inode_operations;
1710         inode->i_size = 64;
1711         ei->op.proc_get_link = proc_fd_link;
1712         dentry->d_op = &tid_fd_dentry_operations;
1713         d_add(dentry, inode);
1714         /* Close the race of the process dying before we return the dentry */
1715         if (tid_fd_revalidate(dentry, NULL))
1716                 error = NULL;
1717
1718  out:
1719         return error;
1720 out_unlock:
1721         spin_unlock(&files->file_lock);
1722         put_files_struct(files);
1723 out_iput:
1724         iput(inode);
1725         goto out;
1726 }
1727
1728 static struct dentry *proc_lookupfd_common(struct inode *dir,
1729                                            struct dentry *dentry,
1730                                            instantiate_t instantiate)
1731 {
1732         struct task_struct *task = get_proc_task(dir);
1733         unsigned fd = name_to_int(dentry);
1734         struct dentry *result = ERR_PTR(-ENOENT);
1735
1736         if (!task)
1737                 goto out_no_task;
1738         if (fd == ~0U)
1739                 goto out;
1740
1741         result = instantiate(dir, dentry, task, &fd);
1742 out:
1743         put_task_struct(task);
1744 out_no_task:
1745         return result;
1746 }
1747
/*
 * Shared readdir helper for the fd directories.
 *
 * filp->f_pos encodes the position: 0 is ".", 1 is "..", and any value
 * n >= 2 stands for fd number n - 2.  Each open fd is emitted through
 * proc_fill_cache() using @instantiate to create missing dentries.
 *
 * Returns 0, or -ENOENT if the task can no longer be looked up.  A filldir
 * callback that reports failure just stops the iteration.
 */
static int proc_readfd_common(struct file * filp, void * dirent,
                              filldir_t filldir, instantiate_t instantiate)
{
        struct dentry *dentry = filp->f_path.dentry;
        struct inode *inode = dentry->d_inode;
        struct task_struct *p = get_proc_task(inode);
        unsigned int fd, ino;
        int retval;
        struct files_struct * files;

        retval = -ENOENT;
        if (!p)
                goto out_no_task;
        retval = 0;

        fd = filp->f_pos;
        switch (fd) {
                case 0:
                        if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
                                goto out;
                        filp->f_pos++;
                        /* fall through */
                case 1:
                        ino = parent_ino(dentry);
                        if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
                                goto out;
                        filp->f_pos++;
                        /* fall through */
                default:
                        files = get_files_struct(p);
                        if (!files)
                                goto out;
                        rcu_read_lock();
                        /* f_pos is offset by the two "." and ".." slots */
                        for (fd = filp->f_pos-2;
                             fd < files_fdtable(files)->max_fds;
                             fd++, filp->f_pos++) {
                                char name[PROC_NUMBUF];
                                int len;

                                if (!fcheck_files(files, fd))
                                        continue;
                                /*
                                 * The RCU read lock is released around
                                 * proc_fill_cache() (presumably because it
                                 * may sleep -- confirm) and retaken before
                                 * the fd table is looked at again.
                                 */
                                rcu_read_unlock();

                                len = snprintf(name, sizeof(name), "%d", fd);
                                if (proc_fill_cache(filp, dirent, filldir,
                                                    name, len, instantiate,
                                                    p, &fd) < 0) {
                                        /* keep lock state balanced for the unlock below */
                                        rcu_read_lock();
                                        break;
                                }
                                rcu_read_lock();
                        }
                        rcu_read_unlock();
                        put_files_struct(files);
        }
out:
        put_task_struct(p);
out_no_task:
        return retval;
}
1806
1807 static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry,
1808                                     struct nameidata *nd)
1809 {
1810         return proc_lookupfd_common(dir, dentry, proc_fd_instantiate);
1811 }
1812
1813 static int proc_readfd(struct file *filp, void *dirent, filldir_t filldir)
1814 {
1815         return proc_readfd_common(filp, dirent, filldir, proc_fd_instantiate);
1816 }
1817
1818 static ssize_t proc_fdinfo_read(struct file *file, char __user *buf,
1819                                       size_t len, loff_t *ppos)
1820 {
1821         char tmp[PROC_FDINFO_MAX];
1822         int err = proc_fd_info(file->f_path.dentry->d_inode, NULL, tmp);
1823         if (!err)
1824                 err = simple_read_from_buffer(buf, len, ppos, tmp, strlen(tmp));
1825         return err;
1826 }
1827
/*
 * File operations table: opened via nonseekable_open(), read via
 * proc_fdinfo_read().
 */
static const struct file_operations proc_fdinfo_file_operations = {
        .open           = nonseekable_open,
        .read           = proc_fdinfo_read,
};
1832
/*
 * Directory file operations table: entries are listed by proc_readfd();
 * read is wired to generic_read_dir().
 */
static const struct file_operations proc_fd_operations = {
        .read           = generic_read_dir,
        .readdir        = proc_readfd,
};
1837
1838 /*
1839  * /proc/pid/fd needs a special permission handler so that a process can still
1840  * access /proc/self/fd after it has executed a setuid().
1841  */
1842 static int proc_fd_permission(struct inode *inode, int mask,
1843                                 struct nameidata *nd)
1844 {
1845         int rv;
1846
1847         rv = generic_permission(inode, mask, NULL);
1848         if (rv == 0)
1849                 return 0;
1850         if (task_pid(current) == proc_pid(inode))
1851                 rv = 0;
1852         return rv;
1853 }
1854
/*
 * proc directories can do almost nothing..
 *
 * Inode operations table: lookup via proc_lookupfd(), permission checks via
 * proc_fd_permission(), setattr via proc_setattr().
 */
static const struct inode_operations proc_fd_inode_operations = {
        .lookup         = proc_lookupfd,
        .permission     = proc_fd_permission,
        .setattr        = proc_setattr,
};
1863
1864 static struct dentry *proc_fdinfo_instantiate(struct inode *dir,
1865         struct dentry *dentry, struct task_struct *task, const void *ptr)
1866 {
1867         unsigned fd = *(unsigned *)ptr;
1868         struct inode *inode;
1869         struct proc_inode *ei;
1870         struct dentry *error = ERR_PTR(-ENOENT);
1871
1872         inode = proc_pid_make_inode(dir->i_sb, task);
1873         if (!inode)
1874                 goto out;
1875         ei = PROC_I(inode);
1876         ei->fd = fd;
1877         inode->i_mode = S_IFREG | S_IRUSR;
1878         inode->i_fop = &proc_fdinfo_file_operations;
1879         dentry->d_op = &tid_fd_dentry_operations;
1880         d_add(dentry, inode);
1881         /* Close the race of the process dying before we return the dentry */
1882         if (tid_fd_revalidate(dentry, NULL))
1883                 error = NULL;
1884
1885  out:
1886         return error;
1887 }
1888
1889 static struct dentry *proc_lookupfdinfo(struct inode *dir,
1890                                         struct dentry *dentry,
1891                                         struct nameidata *nd)
1892 {
1893         return proc_lookupfd_common(dir, dentry, proc_fdinfo_instantiate);
1894 }
1895
1896 static int proc_readfdinfo(struct file *filp, void *dirent, filldir_t filldir)
1897 {
1898         return proc_readfd_common(filp, dirent, filldir,
1899                                   proc_fdinfo_instantiate);
1900 }
1901
1902 static const struct file_operations proc_fdinfo_operations = {
1903         .read           = generic_read_dir,
1904         .readdir        = proc_readfdinfo,
1905 };
1906
1907 /*
1908  * proc directories can do almost nothing..
1909  */
1910 static const struct inode_operations proc_fdinfo_inode_operations = {
1911         .lookup         = proc_lookupfdinfo,
1912         .setattr        = proc_setattr,
1913 };
1914
1915
1916 static struct dentry *proc_pident_instantiate(struct inode *dir,
1917         struct dentry *dentry, struct task_struct *task, const void *ptr)
1918 {
1919         const struct pid_entry *p = ptr;
1920         struct inode *inode;
1921         struct proc_inode *ei;
1922         struct dentry *error = ERR_PTR(-EINVAL);
1923
1924         inode = proc_pid_make_inode(dir->i_sb, task);
1925         if (!inode)
1926                 goto out;
1927
1928         ei = PROC_I(inode);
1929         inode->i_mode = p->mode;
1930         if (S_ISDIR(inode->i_mode))
1931                 inode->i_nlink = 2;     /* Use getattr to fix if necessary */
1932         if (p->iop)
1933                 inode->i_op = p->iop;
1934         if (p->fop)
1935                 inode->i_fop = p->fop;
1936         ei->op = p->op;
1937         dentry->d_op = &pid_dentry_operations;
1938         d_add(dentry, inode);
1939         /* Close the race of the process dying before we return the dentry */
1940         if (pid_revalidate(dentry, NULL))
1941                 error = NULL;
1942 out:
1943         return error;
1944 }
1945
1946 static struct dentry *proc_pident_lookup(struct inode *dir, 
1947                                          struct dentry *dentry,
1948                                          const struct pid_entry *ents,
1949                                          unsigned int nents)
1950 {
1951         struct inode *inode;
1952         struct dentry *error;
1953         struct task_struct *task = get_proc_task(dir);
1954         const struct pid_entry *p, *last;
1955
1956         error = ERR_PTR(-ENOENT);
1957         inode = NULL;
1958
1959         if (!task)
1960                 goto out_no_task;
1961
1962         /*
1963          * Yes, it does not scale. And it should not. Don't add
1964          * new entries into /proc/<tgid>/ without very good reasons.
1965          */
1966         last = &ents[nents - 1];
1967         for (p = ents; p <= last; p++) {
1968                 if (p->len != dentry->d_name.len)
1969                         continue;
1970                 if (!memcmp(dentry->d_name.name, p->name, p->len))
1971                         break;
1972         }
1973         if (p > last)
1974                 goto out;
1975
1976         error = proc_pident_instantiate(dir, dentry, task, p);
1977 out:
1978         put_task_struct(task);
1979 out_no_task:
1980         return error;
1981 }
1982
1983 static int proc_pident_fill_cache(struct file *filp, void *dirent,
1984         filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
1985 {
1986         return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
1987                                 proc_pident_instantiate, task, p);
1988 }
1989
1990 static int proc_pident_readdir(struct file *filp,
1991                 void *dirent, filldir_t filldir,
1992                 const struct pid_entry *ents, unsigned int nents)
1993 {
1994         int i;
1995         struct dentry *dentry = filp->f_path.dentry;
1996         struct inode *inode = dentry->d_inode;
1997         struct task_struct *task = get_proc_task(inode);
1998         const struct pid_entry *p, *last;
1999         ino_t ino;
2000         int ret;
2001
2002         ret = -ENOENT;
2003         if (!task)
2004                 goto out_no_task;
2005
2006         ret = 0;
2007         i = filp->f_pos;
2008         switch (i) {
2009         case 0:
2010                 ino = inode->i_ino;
2011                 if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
2012                         goto out;
2013                 i++;
2014                 filp->f_pos++;
2015                 /* fall through */
2016         case 1:
2017                 ino = parent_ino(dentry);
2018                 if (filldir(dirent, "..", 2, i, ino, DT_DIR) < 0)
2019                         goto out;
2020                 i++;
2021                 filp->f_pos++;
2022                 /* fall through */
2023         default:
2024                 i -= 2;
2025                 if (i >= nents) {
2026                         ret = 1;
2027                         goto out;
2028                 }
2029                 p = ents + i;
2030                 last = &ents[nents - 1];
2031                 while (p <= last) {
2032                         if (proc_pident_fill_cache(filp, dirent, filldir, task, p) < 0)
2033                                 goto out;
2034                         filp->f_pos++;
2035                         p++;
2036                 }
2037         }
2038
2039         ret = 1;
2040 out:
2041         put_task_struct(task);
2042 out_no_task:
2043         return ret;
2044 }
2045
2046 #ifdef CONFIG_SECURITY
2047 static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
2048                                   size_t count, loff_t *ppos)
2049 {
2050         struct inode * inode = file->f_path.dentry->d_inode;
2051         char *p = NULL;
2052         ssize_t length;
2053         struct task_struct *task = get_proc_task(inode);
2054
2055         if (!task)
2056                 return -ESRCH;
2057
2058         length = security_getprocattr(task,
2059                                       (char*)file->f_path.dentry->d_name.name,
2060                                       &p);
2061         put_task_struct(task);
2062         if (length > 0)
2063                 length = simple_read_from_buffer(buf, count, ppos, p, length);
2064         kfree(p);
2065         return length;
2066 }
2067
2068 static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
2069                                    size_t count, loff_t *ppos)
2070 {
2071         struct inode * inode = file->f_path.dentry->d_inode;
2072         char *page;
2073         ssize_t length;
2074         struct task_struct *task = get_proc_task(inode);
2075
2076         length = -ESRCH;
2077         if (!task)
2078                 goto out_no_task;
2079         if (count > PAGE_SIZE)
2080                 count = PAGE_SIZE;
2081
2082         /* No partial writes. */
2083         length = -EINVAL;
2084         if (*ppos != 0)
2085                 goto out;
2086
2087         length = -ENOMEM;
2088         page = (char*)__get_free_page(GFP_TEMPORARY);
2089         if (!page)
2090                 goto out;
2091
2092         length = -EFAULT;
2093         if (copy_from_user(page, buf, count))
2094                 goto out_free;
2095
2096         length = security_setprocattr(task,
2097                                       (char*)file->f_path.dentry->d_name.name,
2098                                       (void*)page, count);
2099 out_free:
2100         free_page((unsigned long) page);
2101 out:
2102         put_task_struct(task);
2103 out_no_task:
2104         return length;
2105 }
2106
2107 static const struct file_operations proc_pid_attr_operations = {
2108         .read           = proc_pid_attr_read,
2109         .write          = proc_pid_attr_write,
2110 };
2111
2112 static const struct pid_entry attr_dir_stuff[] = {
2113         REG("current",    S_IRUGO|S_IWUGO, pid_attr),
2114         REG("prev",       S_IRUGO,         pid_attr),
2115         REG("exec",       S_IRUGO|S_IWUGO, pid_attr),
2116         REG("fscreate",   S_IRUGO|S_IWUGO, pid_attr),
2117         REG("keycreate",  S_IRUGO|S_IWUGO, pid_attr),
2118         REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr),
2119 };
2120
2121 static int proc_attr_dir_readdir(struct file * filp,
2122                              void * dirent, filldir_t filldir)
2123 {
2124         return proc_pident_readdir(filp,dirent,filldir,
2125                                    attr_dir_stuff,ARRAY_SIZE(attr_dir_stuff));
2126 }
2127
2128 static const struct file_operations proc_attr_dir_operations = {
2129         .read           = generic_read_dir,
2130         .readdir        = proc_attr_dir_readdir,
2131 };
2132
2133 static struct dentry *proc_attr_dir_lookup(struct inode *dir,
2134                                 struct dentry *dentry, struct nameidata *nd)
2135 {
2136         return proc_pident_lookup(dir, dentry,
2137                                   attr_dir_stuff, ARRAY_SIZE(attr_dir_stuff));
2138 }
2139
2140 static const struct inode_operations proc_attr_dir_inode_operations = {
2141         .lookup         = proc_attr_dir_lookup,
2142         .getattr        = pid_getattr,
2143         .setattr        = proc_setattr,
2144 };
2145
2146 #endif
2147
2148 #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
2149 static ssize_t proc_coredump_filter_read(struct file *file, char __user *buf,
2150                                          size_t count, loff_t *ppos)
2151 {
2152         struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
2153         struct mm_struct *mm;
2154         char buffer[PROC_NUMBUF];
2155         size_t len;
2156         int ret;
2157
2158         if (!task)
2159                 return -ESRCH;
2160
2161         ret = 0;
2162         mm = get_task_mm(task);
2163         if (mm) {
2164                 len = snprintf(buffer, sizeof(buffer), "%08lx\n",
2165                                ((mm->flags & MMF_DUMP_FILTER_MASK) >>
2166                                 MMF_DUMP_FILTER_SHIFT));
2167                 mmput(mm);
2168                 ret = simple_read_from_buffer(buf, count, ppos, buffer, len);
2169         }
2170
2171         put_task_struct(task);
2172
2173         return ret;
2174 }
2175
2176 static ssize_t proc_coredump_filter_write(struct file *file,
2177                                           const char __user *buf,
2178                                           size_t count,
2179                                           loff_t *ppos)
2180 {
2181         struct task_struct *task;
2182         struct mm_struct *mm;
2183         char buffer[PROC_NUMBUF], *end;
2184         unsigned int val;
2185         int ret;
2186         int i;
2187         unsigned long mask;
2188
2189         ret = -EFAULT;
2190         memset(buffer, 0, sizeof(buffer));
2191         if (count > sizeof(buffer) - 1)
2192                 count = sizeof(buffer) - 1;
2193         if (copy_from_user(buffer, buf, count))
2194                 goto out_no_task;
2195
2196         ret = -EINVAL;
2197         val = (unsigned int)simple_strtoul(buffer, &end, 0);
2198         if (*end == '\n')
2199                 end++;
2200         if (end - buffer == 0)
2201                 goto out_no_task;
2202
2203         ret = -ESRCH;
2204         task = get_proc_task(file->f_dentry->d_inode);
2205         if (!task)
2206                 goto out_no_task;
2207
2208         ret = end - buffer;
2209         mm = get_task_mm(task);
2210         if (!mm)
2211                 goto out_no_mm;
2212
2213         for (i = 0, mask = 1; i < MMF_DUMP_FILTER_BITS; i++, mask <<= 1) {
2214                 if (val & mask)
2215                         set_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
2216                 else
2217                         clear_bit(i + MMF_DUMP_FILTER_SHIFT, &mm->flags);
2218         }
2219
2220         mmput(mm);
2221  out_no_mm:
2222         put_task_struct(task);
2223  out_no_task:
2224         return ret;
2225 }
2226
2227 static const struct file_operations proc_coredump_filter_operations = {
2228         .read           = proc_coredump_filter_read,
2229         .write          = proc_coredump_filter_write,
2230 };
2231 #endif
2232
2233 /*
2234  * /proc/self:
2235  */
2236 static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
2237                               int buflen)
2238 {
2239         struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2240         pid_t tgid = task_tgid_nr_ns(current, ns);
2241         char tmp[PROC_NUMBUF];
2242         if (!tgid)
2243                 return -ENOENT;
2244         sprintf(tmp, "%d", tgid);
2245         return vfs_readlink(dentry,buffer,buflen,tmp);
2246 }
2247
2248 static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
2249 {
2250         struct pid_namespace *ns = dentry->d_sb->s_fs_info;
2251         pid_t tgid = task_tgid_nr_ns(current, ns);
2252         char tmp[PROC_NUMBUF];
2253         if (!tgid)
2254                 return ERR_PTR(-ENOENT);
2255         sprintf(tmp, "%d", task_tgid_nr_ns(current, ns));
2256         return ERR_PTR(vfs_follow_link(nd,tmp));
2257 }
2258
2259 static const struct inode_operations proc_self_inode_operations = {
2260         .readlink       = proc_self_readlink,
2261         .follow_link    = proc_self_follow_link,
2262 };
2263
2264 /*
2265  * proc base
2266  *
2267  * These are the directory entries in the root directory of /proc
2268  * that properly belong to the /proc filesystem, as they describe
2269  * describe something that is process related.
2270  */
2271 static const struct pid_entry proc_base_stuff[] = {
2272         NOD("self", S_IFLNK|S_IRWXUGO,
2273                 &proc_self_inode_operations, NULL, {}),
2274 };
2275
2276 /*
2277  *      Exceptional case: normally we are not allowed to unhash a busy
2278  * directory. In this case, however, we can do it - no aliasing problems
2279  * due to the way we treat inodes.
2280  */
2281 static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
2282 {
2283         struct inode *inode = dentry->d_inode;
2284         struct task_struct *task = get_proc_task(inode);
2285         if (task) {
2286                 put_task_struct(task);
2287                 return 1;
2288         }
2289         d_drop(dentry);
2290         return 0;
2291 }
2292
2293 static struct dentry_operations proc_base_dentry_operations =
2294 {
2295         .d_revalidate   = proc_base_revalidate,
2296         .d_delete       = pid_delete_dentry,
2297 };
2298
2299 static struct dentry *proc_base_instantiate(struct inode *dir,
2300         struct dentry *dentry, struct task_struct *task, const void *ptr)
2301 {
2302         const struct pid_entry *p = ptr;
2303         struct inode *inode;
2304         struct proc_inode *ei;
2305         struct dentry *error = ERR_PTR(-EINVAL);
2306
2307         /* Allocate the inode */
2308         error = ERR_PTR(-ENOMEM);
2309         inode = new_inode(dir->i_sb);
2310         if (!inode)
2311                 goto out;
2312
2313         /* Initialize the inode */
2314         ei = PROC_I(inode);
2315         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
2316
2317         /*
2318          * grab the reference to the task.
2319          */
2320         ei->pid = get_task_pid(task, PIDTYPE_PID);
2321         if (!ei->pid)
2322                 goto out_iput;
2323
2324         inode->i_uid = 0;
2325         inode->i_gid = 0;
2326         inode->i_mode = p->mode;
2327         if (S_ISDIR(inode->i_mode))
2328                 inode->i_nlink = 2;
2329         if (S_ISLNK(inode->i_mode))
2330                 inode->i_size = 64;
2331         if (p->iop)
2332                 inode->i_op = p->iop;
2333         if (p->fop)
2334                 inode->i_fop = p->fop;
2335         ei->op = p->op;
2336         dentry->d_op = &proc_base_dentry_operations;
2337         d_add(dentry, inode);
2338         error = NULL;
2339 out:
2340         return error;
2341 out_iput:
2342         iput(inode);
2343         goto out;
2344 }
2345
2346 static struct dentry *proc_base_lookup(struct inode *dir, struct dentry *dentry)
2347 {
2348         struct dentry *error;
2349         struct task_struct *task = get_proc_task(dir);
2350         const struct pid_entry *p, *last;
2351
2352         error = ERR_PTR(-ENOENT);
2353
2354         if (!task)
2355                 goto out_no_task;
2356
2357         /* Lookup the directory entry */
2358         last = &proc_base_stuff[ARRAY_SIZE(proc_base_stuff) - 1];
2359         for (p = proc_base_stuff; p <= last; p++) {
2360                 if (p->len != dentry->d_name.len)
2361                         continue;
2362                 if (!memcmp(dentry->d_name.name, p->name, p->len))
2363                         break;
2364         }
2365         if (p > last)
2366                 goto out;
2367
2368         error = proc_base_instantiate(dir, dentry, task, p);
2369
2370 out:
2371         put_task_struct(task);
2372 out_no_task:
2373         return error;
2374 }
2375
2376 static int proc_base_fill_cache(struct file *filp, void *dirent,
2377         filldir_t filldir, struct task_struct *task, const struct pid_entry *p)
2378 {
2379         return proc_fill_cache(filp, dirent, filldir, p->name, p->len,
2380                                 proc_base_instantiate, task, p);
2381 }
2382
2383 #ifdef CONFIG_TASK_IO_ACCOUNTING
2384 static int do_io_accounting(struct task_struct *task, char *buffer, int whole)
2385 {
2386         u64 rchar, wchar, syscr, syscw;
2387         struct task_io_accounting ioac;
2388
2389         if (!whole) {
2390                 rchar = task->rchar;
2391                 wchar = task->wchar;
2392                 syscr = task->syscr;
2393                 syscw = task->syscw;
2394                 memcpy(&ioac, &task->ioac, sizeof(ioac));
2395         } else {
2396                 unsigned long flags;
2397                 struct task_struct *t = task;
2398                 rchar = wchar = syscr = syscw = 0;
2399                 memset(&ioac, 0, sizeof(ioac));
2400
2401                 rcu_read_lock();
2402                 do {
2403                         rchar += t->rchar;
2404                         wchar += t->wchar;
2405                         syscr += t->syscr;
2406                         syscw += t->syscw;
2407
2408                         ioac.read_bytes += t->ioac.read_bytes;
2409                         ioac.write_bytes += t->ioac.write_bytes;
2410                         ioac.cancelled_write_bytes +=
2411                                         t->ioac.cancelled_write_bytes;
2412                         t = next_thread(t);
2413                 } while (t != task);
2414                 rcu_read_unlock();
2415
2416                 if (lock_task_sighand(task, &flags)) {
2417                         struct signal_struct *sig = task->signal;
2418
2419                         rchar += sig->rchar;
2420                         wchar += sig->wchar;
2421                         syscr += sig->syscr;
2422                         syscw += sig->syscw;
2423
2424                         ioac.read_bytes += sig->ioac.read_bytes;
2425                         ioac.write_bytes += sig->ioac.write_bytes;
2426                         ioac.cancelled_write_bytes +=
2427                                         sig->ioac.cancelled_write_bytes;
2428
2429                         unlock_task_sighand(task, &flags);
2430                 }
2431         }
2432
2433         return sprintf(buffer,
2434                         "rchar: %llu\n"
2435                         "wchar: %llu\n"
2436                         "syscr: %llu\n"
2437                         "syscw: %llu\n"
2438                         "read_bytes: %llu\n"
2439                         "write_bytes: %llu\n"
2440                         "cancelled_write_bytes: %llu\n",
2441                         (unsigned long long)rchar,
2442                         (unsigned long long)wchar,
2443                         (unsigned long long)syscr,
2444                         (unsigned long long)syscw,
2445                         (unsigned long long)ioac.read_bytes,
2446                         (unsigned long long)ioac.write_bytes,
2447                         (unsigned long long)ioac.cancelled_write_bytes);
2448 }
2449
/* I/O accounting for @task alone (do_io_accounting with whole == 0). */
static int proc_tid_io_accounting(struct task_struct *task, char *buffer)
{
	const int whole = 0;

	return do_io_accounting(task, buffer, whole);
}
2454
/* I/O accounting summed over @task's threads (do_io_accounting with whole == 1). */
static int proc_tgid_io_accounting(struct task_struct *task, char *buffer)
{
	const int whole = 1;

	return do_io_accounting(task, buffer, whole);
}
2459 #endif /* CONFIG_TASK_IO_ACCOUNTING */
2460
2461 /*
2462  * Thread groups
2463  */
2464 static const struct file_operations proc_task_operations;
2465 static const struct inode_operations proc_task_inode_operations;
2466
2467 static const struct pid_entry tgid_base_stuff[] = {
2468         DIR("task",       S_IRUGO|S_IXUGO, task),
2469         DIR("fd",         S_IRUSR|S_IXUSR, fd),
2470         DIR("fdinfo",     S_IRUSR|S_IXUSR, fdinfo),
2471 #ifdef CONFIG_NET
2472         DIR("net",        S_IRUGO|S_IXUGO, net),
2473 #endif
2474         REG("environ",    S_IRUSR, environ),
2475         INF("auxv",       S_IRUSR, pid_auxv),
2476         ONE("status",     S_IRUGO, pid_status),
2477         INF("limits",     S_IRUSR, pid_limits),
2478 #ifdef CONFIG_SCHED_DEBUG
2479         REG("sched",      S_IRUGO|S_IWUSR, pid_sched),
2480 #endif
2481         INF("cmdline",    S_IRUGO, pid_cmdline),
2482         ONE("stat",       S_IRUGO, tgid_stat),
2483         ONE("statm",      S_IRUGO, pid_statm),
2484         REG("maps",       S_IRUGO, maps),
2485 #ifdef CONFIG_NUMA
2486         REG("numa_maps",  S_IRUGO, numa_maps),
2487 #endif
2488         REG("mem",        S_IRUSR|S_IWUSR, mem),
2489         LNK("cwd",        cwd),
2490         LNK("root",       root),
2491         LNK("exe",        exe),
2492         REG("mounts",     S_IRUGO, mounts),
2493         REG("mountinfo",  S_IRUGO, mountinfo),
2494         REG("mountstats", S_IRUSR, mountstats),
2495 #ifdef CONFIG_PROC_PAGE_MONITOR
2496         REG("clear_refs", S_IWUSR, clear_refs),
2497         REG("smaps",      S_IRUGO, smaps),
2498         REG("pagemap",    S_IRUSR, pagemap),
2499 #endif
2500 #ifdef CONFIG_SECURITY
2501         DIR("attr",       S_IRUGO|S_IXUGO, attr_dir),
2502 #endif
2503 #ifdef CONFIG_KALLSYMS
2504         INF("wchan",      S_IRUGO, pid_wchan),
2505 #endif
2506 #ifdef CONFIG_SCHEDSTATS
2507         INF("schedstat",  S_IRUGO, pid_schedstat),
2508 #endif
2509 #ifdef CONFIG_LATENCYTOP
2510         REG("latency",  S_IRUGO, lstats),
2511 #endif
2512 #ifdef CONFIG_PROC_PID_CPUSET
2513         REG("cpuset",     S_IRUGO, cpuset),
2514 #endif
2515 #ifdef CONFIG_CGROUPS
2516         REG("cgroup",  S_IRUGO, cgroup),
2517 #endif
2518         INF("oom_score",  S_IRUGO, oom_score),
2519         REG("oom_adj",    S_IRUGO|S_IWUSR, oom_adjust),
2520 #ifdef CONFIG_AUDITSYSCALL
2521         REG("loginuid",   S_IWUSR|S_IRUGO, loginuid),
2522         REG("sessionid",  S_IRUGO, sessionid),
2523 #endif
2524 #ifdef CONFIG_FAULT_INJECTION
2525         REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
2526 #endif
2527 #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE)
2528         REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter),
2529 #endif
2530 #ifdef CONFIG_TASK_IO_ACCOUNTING
2531         INF("io",       S_IRUGO, tgid_io_accounting),
2532 #endif
2533 };
2534
2535 static int proc_tgid_base_readdir(struct file * filp,
2536                              void * dirent, filldir_t filldir)
2537 {
2538         return proc_pident_readdir(filp,dirent,filldir,
2539                                    tgid_base_stuff,ARRAY_SIZE(tgid_base_stuff));
2540 }
2541
2542 static const struct file_operations proc_tgid_base_operations = {
2543         .read           = generic_read_dir,
2544         .readdir        = proc_tgid_base_readdir,
2545 };
2546
2547 static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
2548         return proc_pident_lookup(dir, dentry,
2549                                   tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff));
2550 }
2551
2552 static const struct inode_operations proc_tgid_base_inode_operations = {
2553         .lookup         = proc_tgid_base_lookup,
2554         .getattr        = pid_getattr,
2555         .setattr        = proc_setattr,
2556 };
2557
2558 static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid)
2559 {
2560         struct dentry *dentry, *leader, *dir;
2561         char buf[PROC_NUMBUF];
2562         struct qstr name;
2563
2564         name.name = buf;
2565         name.len = snprintf(buf, sizeof(buf), "%d", pid);
2566         dentry = d_hash_and_lookup(mnt->mnt_root, &name);
2567         if (dentry) {
2568                 if (!(current->flags & PF_EXITING))
2569                         shrink_dcache_parent(dentry);
2570                 d_drop(dentry);
2571                 dput(dentry);
2572         }
2573
2574         if (tgid == 0)
2575                 goto out;
2576
2577         name.name = buf;
2578         name.len = snprintf(buf, sizeof(buf), "%d", tgid);
2579         leader = d_hash_and_lookup(mnt->mnt_root, &name);
2580         if (!leader)
2581                 goto out;
2582
2583         name.name = "task";
2584         name.len = strlen(name.name);
2585         dir = d_hash_and_lookup(leader, &name);
2586         if (!dir)
2587                 goto out_put_leader;
2588
2589         name.name = buf;
2590         name.len = snprintf(buf, sizeof(buf), "%d", pid);
2591         dentry = d_hash_and_lookup(dir, &name);
2592         if (dentry) {
2593                 shrink_dcache_parent(dentry);
2594                 d_drop(dentry);
2595                 dput(dentry);
2596         }
2597
2598         dput(dir);
2599 out_put_leader:
2600         dput(leader);
2601 out:
2602         return;
2603 }
2604
2605 /**
2606  * proc_flush_task -  Remove dcache entries for @task from the /proc dcache.
2607  * @task: task that should be flushed.
2608  *
2609  * When flushing dentries from proc, one needs to flush them from global
2610  * proc (proc_mnt) and from all the namespaces' procs this task was seen
2611  * in. This call is supposed to do all of this job.
2612  *
2613  * Looks in the dcache for
2614  * /proc/@pid
2615  * /proc/@tgid/task/@pid
2616  * if either directory is present flushes it and all of it'ts children
2617  * from the dcache.
2618  *
2619  * It is safe and reasonable to cache /proc entries for a task until
2620  * that task exits.  After that they just clog up the dcache with
2621  * useless entries, possibly causing useful dcache entries to be
2622  * flushed instead.  This routine is proved to flush those useless
2623  * dcache entries at process exit time.
2624  *
2625  * NOTE: This routine is just an optimization so it does not guarantee
2626  *       that no dcache entries will exist at process exit time it
2627  *       just makes it very unlikely that any will persist.
2628  */
2629
2630 void proc_flush_task(struct task_struct *task)
2631 {
2632         int i;
2633         struct pid *pid, *tgid = NULL;
2634         struct upid *upid;
2635
2636         pid = task_pid(task);
2637         if (thread_group_leader(task))
2638                 tgid = task_tgid(task);
2639
2640         for (i = 0; i <= pid->level; i++) {
2641                 upid = &pid->numbers[i];
2642                 proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr,
2643                         tgid ? tgid->numbers[i].nr : 0);
2644         }
2645
2646         upid = &pid->numbers[pid->level];
2647         if (upid->nr == 1)
2648                 pid_ns_release_proc(upid->ns);
2649 }
2650
2651 static struct dentry *proc_pid_instantiate(struct inode *dir,
2652                                            struct dentry * dentry,
2653                                            struct task_struct *task, const void *ptr)
2654 {
2655         struct dentry *error = ERR_PTR(-ENOENT);
2656         struct inode *inode;
2657
2658         inode = proc_pid_make_inode(dir->i_sb, task);
2659         if (!inode)
2660                 goto out;
2661
2662         inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
2663         inode->i_op = &proc_tgid_base_inode_operations;
2664         inode->i_fop = &proc_tgid_base_operations;
2665         inode->i_flags|=S_IMMUTABLE;
2666
2667         inode->i_nlink = 2 + pid_entry_count_dirs(tgid_base_stuff,
2668                 ARRAY_SIZE(tgid_base_stuff));
2669
2670         dentry->d_op = &pid_dentry_operations;
2671
2672         d_add(dentry, inode);
2673         /* Close the race of the process dying before we return the dentry */
2674         if (pid_revalidate(dentry, NULL))
2675                 error = NULL;
2676 out:
2677         return error;
2678 }
2679
2680 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2681 {
2682         struct dentry *result = ERR_PTR(-ENOENT);
2683         struct task_struct *task;
2684         unsigned tgid;
2685         struct pid_namespace *ns;
2686
2687         result = proc_base_lookup(dir, dentry);
2688         if (!IS_ERR(result) || PTR_ERR(result) != -ENOENT)
2689                 goto out;
2690
2691         tgid = name_to_int(dentry);
2692         if (tgid == ~0U)
2693                 goto out;
2694
2695         ns = dentry->d_sb->s_fs_info;
2696         rcu_read_lock();
2697         task = find_task_by_pid_ns(tgid, ns);
2698         if (task)
2699                 get_task_struct(task);
2700         rcu_read_unlock();
2701         if (!task)
2702                 goto out;
2703
2704         result = proc_pid_instantiate(dir, dentry, task, NULL);
2705         put_task_struct(task);
2706 out:
2707         return result;
2708 }
2709
2710 /*
2711  * Find the first task with tgid >= tgid
2712  *
2713  */
2714 struct tgid_iter {
2715         unsigned int tgid;
2716         struct task_struct *task;
2717 };
2718 static struct tgid_iter next_tgid(struct pid_namespace *ns, struct tgid_iter iter)
2719 {
2720         struct pid *pid;
2721
2722         if (iter.task)
2723                 put_task_struct(iter.task);
2724         rcu_read_lock();
2725 retry:
2726         iter.task = NULL;
2727         pid = find_ge_pid(iter.tgid, ns);
2728         if (pid) {
2729                 iter.tgid = pid_nr_ns(pid, ns);
2730                 iter.task = pid_task(pid, PIDTYPE_PID);
2731                 /* What we to know is if the pid we have find is the
2732                  * pid of a thread_group_leader.  Testing for task
2733                  * being a thread_group_leader is the obvious thing
2734                  * todo but there is a window when it fails, due to
2735                  * the pid transfer logic in de_thread.
2736                  *
2737                  * So we perform the straight forward test of seeing
2738                  * if the pid we have found is the pid of a thread
2739                  * group leader, and don't worry if the task we have
2740                  * found doesn't happen to be a thread group leader.
2741                  * As we don't care in the case of readdir.
2742                  */
2743                 if (!iter.task || !has_group_leader_pid(iter.task)) {
2744                         iter.tgid += 1;
2745                         goto retry;
2746                 }
2747                 get_task_struct(iter.task);
2748         }
2749         rcu_read_unlock();
2750         return iter;
2751 }
2752
/*
 * Directory offset that corresponds to tgid 0: proc_pid_readdir() maps a
 * tgid to f_pos by adding this, leaving the positions below it for the
 * slots before FIRST_PROCESS_ENTRY and the proc_base_stuff entries.
 */
#define TGID_OFFSET (FIRST_PROCESS_ENTRY + ARRAY_SIZE(proc_base_stuff))
2754
2755 static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
2756         struct tgid_iter iter)
2757 {
2758         char name[PROC_NUMBUF];
2759         int len = snprintf(name, sizeof(name), "%d", iter.tgid);
2760         return proc_fill_cache(filp, dirent, filldir, name, len,
2761                                 proc_pid_instantiate, iter.task, NULL);
2762 }
2763
2764 /* for the /proc/ directory itself, after non-process stuff has been done */
2765 int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
2766 {
2767         unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
2768         struct task_struct *reaper = get_proc_task(filp->f_path.dentry->d_inode);
2769         struct tgid_iter iter;
2770         struct pid_namespace *ns;
2771
2772         if (!reaper)
2773                 goto out_no_task;
2774
2775         for (; nr < ARRAY_SIZE(proc_base_stuff); filp->f_pos++, nr++) {
2776                 const struct pid_entry *p = &proc_base_stuff[nr];
2777                 if (proc_base_fill_cache(filp, dirent, filldir, reaper, p) < 0)
2778                         goto out;
2779         }
2780
2781         ns = filp->f_dentry->d_sb->s_fs_info;
2782         iter.task = NULL;
2783         iter.tgid = filp->f_pos - TGID_OFFSET;
2784         for (iter = next_tgid(ns, iter);
2785              iter.task;
2786              iter.tgid += 1, iter = next_tgid(ns, iter)) {
2787                 filp->f_pos = iter.tgid + TGID_OFFSET;
2788                 if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) {
2789                         put_task_struct(iter.task);
2790                         goto out;
2791                 }
2792         }
2793         filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
2794 out:
2795         put_task_struct(reaper);
2796 out_no_task:
2797         return 0;
2798 }
2799
2800 /*
2801  * Tasks
2802  */
2803 static const struct pid_entry tid_base_stuff[] = {
2804         DIR("fd",        S_IRUSR|S_IXUSR, fd),
2805         DIR("fdinfo",    S_IRUSR|S_IXUSR, fdinfo),
2806         REG("environ",   S_IRUSR, environ),
2807         INF("auxv",      S_IRUSR, pid_auxv),
2808         ONE("status",    S_IRUGO, pid_status),
2809         INF("limits",    S_IRUSR, pid_limits),
2810 #ifdef CONFIG_SCHED_DEBUG
2811         REG("sched",     S_IRUGO|S_IWUSR, pid_sched),
2812 #endif
2813         INF("cmdline",   S_IRUGO, pid_cmdline),
2814         ONE("stat",      S_IRUGO, tid_stat),
2815         ONE("statm",     S_IRUGO, pid_statm),
2816         REG("maps",      S_IRUGO, maps),
2817 #ifdef CONFIG_NUMA
2818         REG("numa_maps", S_IRUGO, numa_maps),
2819 #endif
2820         REG("mem",       S_IRUSR|S_IWUSR, mem),
2821         LNK("cwd",       cwd),
2822         LNK("root",      root),
2823         LNK("exe",       exe),
2824         REG("mounts",    S_IRUGO, mounts),
2825         REG("mountinfo",  S_IRUGO, mountinfo),
2826 #ifdef CONFIG_PROC_PAGE_MONITOR
2827         REG("clear_refs", S_IWUSR, clear_refs),
2828         REG("smaps",     S_IRUGO, smaps),
2829         REG("pagemap",    S_IRUSR, pagemap),
2830 #endif
2831 #ifdef CONFIG_SECURITY
2832         DIR("attr",      S_IRUGO|S_IXUGO, attr_dir),
2833 #endif
2834 #ifdef CONFIG_KALLSYMS
2835         INF("wchan",     S_IRUGO, pid_wchan),
2836 #endif
2837 #ifdef CONFIG_SCHEDSTATS
2838         INF("schedstat", S_IRUGO, pid_schedstat),
2839 #endif
2840 #ifdef CONFIG_LATENCYTOP
2841         REG("latency",  S_IRUGO, lstats),
2842 #endif
2843 #ifdef CONFIG_PROC_PID_CPUSET
2844         REG("cpuset",    S_IRUGO, cpuset),
2845 #endif
2846 #ifdef CONFIG_CGROUPS
2847         REG("cgroup",  S_IRUGO, cgroup),
2848 #endif
2849         INF("oom_score", S_IRUGO, oom_score),
2850         REG("oom_adj",   S_IRUGO|S_IWUSR, oom_adjust),
2851 #ifdef CONFIG_AUDITSYSCALL
2852         REG("loginuid",  S_IWUSR|S_IRUGO, loginuid),
2853         REG("sessionid",  S_IRUSR, sessionid),
2854 #endif
2855 #ifdef CONFIG_FAULT_INJECTION
2856         REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
2857 #endif
2858 #ifdef CONFIG_TASK_IO_ACCOUNTING
2859         INF("io",       S_IRUGO, tid_io_accounting),
2860 #endif
2861 };
2862
2863 static int proc_tid_base_readdir(struct file * filp,
2864                              void * dirent, filldir_t filldir)
2865 {
2866         return proc_pident_readdir(filp,dirent,filldir,
2867                                    tid_base_stuff,ARRAY_SIZE(tid_base_stuff));
2868 }
2869
2870 static struct dentry *proc_tid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
2871         return proc_pident_lookup(dir, dentry,
2872                                   tid_base_stuff, ARRAY_SIZE(tid_base_stuff));
2873 }
2874
2875 static const struct file_operations proc_tid_base_operations = {
2876         .read           = generic_read_dir,
2877         .readdir        = proc_tid_base_readdir,
2878 };
2879
2880 static const struct inode_operations proc_tid_base_inode_operations = {
2881         .lookup         = proc_tid_base_lookup,
2882         .getattr        = pid_getattr,
2883         .setattr        = proc_setattr,
2884 };
2885
2886 static struct dentry *proc_task_instantiate(struct inode *dir,
2887         struct dentry *dentry, struct task_struct *task, const void *ptr)
2888 {
2889         struct dentry *error = ERR_PTR(-ENOENT);
2890         struct inode *inode;
2891         inode = proc_pid_make_inode(dir->i_sb, task);
2892
2893         if (!inode)
2894                 goto out;
2895         inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
2896         inode->i_op = &proc_tid_base_inode_operations;
2897         inode->i_fop = &proc_tid_base_operations;
2898         inode->i_flags|=S_IMMUTABLE;
2899
2900         inode->i_nlink = 2 + pid_entry_count_dirs(tid_base_stuff,
2901                 ARRAY_SIZE(tid_base_stuff));
2902
2903         dentry->d_op = &pid_dentry_operations;
2904
2905         d_add(dentry, inode);
2906         /* Close the race of the process dying before we return the dentry */
2907         if (pid_revalidate(dentry, NULL))
2908                 error = NULL;
2909 out:
2910         return error;
2911 }
2912
2913 static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
2914 {
2915         struct dentry *result = ERR_PTR(-ENOENT);
2916         struct task_struct *task;
2917         struct task_struct *leader = get_proc_task(dir);
2918         unsigned tid;
2919         struct pid_namespace *ns;
2920
2921         if (!leader)
2922                 goto out_no_task;
2923
2924         tid = name_to_int(dentry);
2925         if (tid == ~0U)
2926                 goto out;
2927
2928         ns = dentry->d_sb->s_fs_info;
2929         rcu_read_lock();
2930         task = find_task_by_pid_ns(tid, ns);
2931         if (task)
2932                 get_task_struct(task);
2933         rcu_read_unlock();
2934         if (!task)
2935                 goto out;
2936         if (!same_thread_group(leader, task))
2937                 goto out_drop_task;
2938
2939         result = proc_task_instantiate(dir, dentry, task, NULL);
2940 out_drop_task:
2941         put_task_struct(task);
2942 out:
2943         put_task_struct(leader);
2944 out_no_task:
2945         return result;
2946 }
2947
2948 /*
2949  * Find the first tid of a thread group to return to user space.
2950  *
2951  * Usually this is just the thread group leader, but if the users
2952  * buffer was too small or there was a seek into the middle of the
2953  * directory we have more work todo.
2954  *
2955  * In the case of a short read we start with find_task_by_pid.
2956  *
2957  * In the case of a seek we start with the leader and walk nr
2958  * threads past it.
2959  */
2960 static struct task_struct *first_tid(struct task_struct *leader,
2961                 int tid, int nr, struct pid_namespace *ns)
2962 {
2963         struct task_struct *pos;
2964
2965         rcu_read_lock();
2966         /* Attempt to start with the pid of a thread */
2967         if (tid && (nr > 0)) {
2968                 pos = find_task_by_pid_ns(tid, ns);
2969                 if (pos && (pos->group_leader == leader))
2970                         goto found;
2971         }
2972
2973         /* If nr exceeds the number of threads there is nothing todo */
2974         pos = NULL;
2975         if (nr && nr >= get_nr_threads(leader))
2976                 goto out;
2977
2978         /* If we haven't found our starting place yet start
2979          * with the leader and walk nr threads forward.
2980          */
2981         for (pos = leader; nr > 0; --nr) {
2982                 pos = next_thread(pos);
2983                 if (pos == leader) {
2984                         pos = NULL;
2985                         goto out;
2986                 }
2987         }
2988 found:
2989         get_task_struct(pos);
2990 out:
2991         rcu_read_unlock();
2992         return pos;
2993 }
2994
2995 /*
2996  * Find the next thread in the thread list.
2997  * Return NULL if there is an error or no next thread.
2998  *
2999  * The reference to the input task_struct is released.
3000  */
3001 static struct task_struct *next_tid(struct task_struct *start)
3002 {
3003         struct task_struct *pos = NULL;
3004         rcu_read_lock();
3005         if (pid_alive(start)) {
3006                 pos = next_thread(start);
3007                 if (thread_group_leader(pos))
3008                         pos = NULL;
3009                 else
3010                         get_task_struct(pos);
3011         }
3012         rcu_read_unlock();
3013         put_task_struct(start);
3014         return pos;
3015 }
3016
3017 static int proc_task_fill_cache(struct file *filp, void *dirent, filldir_t filldir,
3018         struct task_struct *task, int tid)
3019 {
3020         char name[PROC_NUMBUF];
3021         int len = snprintf(name, sizeof(name), "%d", tid);
3022         return proc_fill_cache(filp, dirent, filldir, name, len,
3023                                 proc_task_instantiate, task, NULL);
3024 }
3025
3026 /* for the /proc/TGID/task/ directories */
3027 static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir)
3028 {
3029         struct dentry *dentry = filp->f_path.dentry;
3030         struct inode *inode = dentry->d_inode;
3031         struct task_struct *leader = NULL;
3032         struct task_struct *task;
3033         int retval = -ENOENT;
3034         ino_t ino;
3035         int tid;
3036         unsigned long pos = filp->f_pos;  /* avoiding "long long" filp->f_pos */
3037         struct pid_namespace *ns;
3038
3039         task = get_proc_task(inode);
3040         if (!task)
3041                 goto out_no_task;
3042         rcu_read_lock();
3043         if (pid_alive(task)) {
3044                 leader = task->group_leader;
3045                 get_task_struct(leader);
3046         }
3047         rcu_read_unlock();
3048         put_task_struct(task);
3049         if (!leader)
3050                 goto out_no_task;
3051         retval = 0;
3052
3053         switch (pos) {
3054         case 0:
3055                 ino = inode->i_ino;
3056                 if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0)
3057                         goto out;
3058                 pos++;
3059                 /* fall through */
3060         case 1:
3061                 ino = parent_ino(dentry);
3062                 if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0)
3063                         goto out;
3064                 pos++;
3065                 /* fall through */
3066         }
3067
3068         /* f_version caches the tgid value that the last readdir call couldn't
3069          * return. lseek aka telldir automagically resets f_version to 0.
3070          */
3071         ns = filp->f_dentry->d_sb->s_fs_info;
3072         tid = (int)filp->f_version;
3073         filp->f_version = 0;
3074         for (task = first_tid(leader, tid, pos - 2, ns);
3075              task;
3076              task = next_tid(task), pos++) {
3077                 tid = task_pid_nr_ns(task, ns);
3078                 if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) {
3079                         /* returning this tgid failed, save it as the first
3080                          * pid for the next readir call */
3081                         filp->f_version = (u64)tid;
3082                         put_task_struct(task);
3083                         break;
3084                 }
3085         }
3086 out:
3087         filp->f_pos = pos;
3088         put_task_struct(leader);
3089 out_no_task:
3090         return retval;
3091 }
3092
3093 static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
3094 {
3095         struct inode *inode = dentry->d_inode;
3096         struct task_struct *p = get_proc_task(inode);
3097         generic_fillattr(inode, stat);
3098
3099         if (p) {
3100                 rcu_read_lock();
3101                 stat->nlink += get_nr_threads(p);
3102                 rcu_read_unlock();
3103                 put_task_struct(p);
3104         }
3105
3106         return 0;
3107 }
3108
3109 static const struct inode_operations proc_task_inode_operations = {
3110         .lookup         = proc_task_lookup,
3111         .getattr        = proc_task_getattr,
3112         .setattr        = proc_setattr,
3113 };
3114
3115 static const struct file_operations proc_task_operations = {
3116         .read           = generic_read_dir,
3117         .readdir        = proc_task_readdir,
3118 };