On 06/19, Oleg Nesterov wrote:It's completely untested and could be split into three patches. But I think that it solves the issues found so far, and that it will work with Eric's unshare(CLONE_NEWPID) too. What do you think about this approach? Thanks, Louis On 20/06/10 20:06 +0200, Oleg Nesterov wrote: This patch does four things: - defer pid_ns_release_proc()->mntput() to a worqueue context, so that pid_ns_release_proc() can be called in atomic context; - introduce pid_ns->nr_pids, so that we can count the number of pids allocated by alloc_pidmap(); - move the call to pid_ns_prepare_proc() to alloc_pid(), where we know when the first pid of a namespace is allocated; - move the call to pid_ns_release_proc() to free_pid(), where we are now able to know when the last pid of a namespace is detached. This solves the missing mntput() in copy_process() cleanup path, since free_pid() is called to cleanup alloc_pid(). This solves the multi-threaded init doing exec issue, since all sub-threads including former leader have called proc_flush_task() when the last pid is detached. This solves the EXIT_DEAD tasks issue for the same reason. Signed-off-by: Louis Rilling <louis.rilling@kerlabs.com> --- fs/proc/base.c | 4 ---- fs/proc/root.c | 11 ++++++++++- include/linux/pid_namespace.h | 3 +++ kernel/fork.c | 6 ------ kernel/pid.c | 16 +++++++++++++--- kernel/pid_namespace.c | 1 + 6 files changed, 27 insertions(+), 14 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index acb7ef8..455b109 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2745,10 +2745,6 @@ void proc_flush_task(struct task_struct *task) proc_flush_task_mnt(upid->ns->proc_mnt, upid->nr, tgid->numbers[i].nr); } - - upid = &pid->numbers[pid->level]; - if (upid->nr == 1) - pid_ns_release_proc(upid->ns); } static struct dentry *proc_pid_instantiate(struct inode *dir, diff --git a/fs/proc/root.c b/fs/proc/root.c index 4258384..9876cd9 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -215,7 +215,16 @@ int pid_ns_prepare_proc(struct pid_namespace *ns) return 0; } -void pid_ns_release_proc(struct pid_namespace *ns) +static void do_pid_ns_release_proc(struct work_struct *work) { + struct pid_namespace *ns; + + ns = container_of(work, struct pid_namespace, release_proc_work); mntput(ns->proc_mnt); } + +void pid_ns_release_proc(struct pid_namespace *ns) +{ + INIT_WORK(&ns->release_proc_work, do_pid_ns_release_proc); + schedule_work(&ns->release_proc_work); +} diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 38d1032..1010733 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -4,6 +4,7 @@ #include <linux/sched.h> #include <linux/mm.h> #include <linux/threads.h> +#include <linux/workqueue.h> #include <linux/nsproxy.h> #include <linux/kref.h> @@ -19,6 +20,7 @@ struct bsd_acct_struct; struct pid_namespace { struct kref kref; struct pidmap pidmap[PIDMAP_ENTRIES]; + atomic_t nr_pids; int last_pid; struct task_struct *child_reaper; struct kmem_cache *pid_cachep; @@ -26,6 +28,7 @@ struct pid_namespace { struct pid_namespace *parent; #ifdef CONFIG_PROC_FS struct vfsmount *proc_mnt; + struct work_struct release_proc_work; #endif #ifdef CONFIG_BSD_PROCESS_ACCT struct bsd_acct_struct *bacct; diff --git a/kernel/fork.c b/kernel/fork.c index b6cce14..b063a9c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1154,12 +1154,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, pid = alloc_pid(p->nsproxy->pid_ns); if (!pid) goto bad_fork_cleanup_io; - - if (clone_flags & CLONE_NEWPID) { - retval = pid_ns_prepare_proc(p->nsproxy->pid_ns); - if (retval < 0) - goto bad_fork_free_pid; - } } p->pid = pid_nr(pid); diff --git a/kernel/pid.c b/kernel/pid.c index e9fd8c1..fdb73e1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -34,6 +34,7 @@ #include <linux/bootmem.h> #include <linux/hash.h> #include <linux/pid_namespace.h> +#include <linux/proc_fs.h> #include <linux/init_task.h> #include <linux/syscalls.h> @@ -112,7 +113,7 @@ EXPORT_SYMBOL(is_container_init); static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); -static void free_pidmap(struct upid *upid) +static bool free_pidmap(struct upid *upid) { int nr = upid->nr; struct pidmap *map = upid->ns->pidmap + nr / BITS_PER_PAGE; @@ -120,6 +121,7 @@ static void free_pidmap(struct upid *upid) clear_bit(offset, map->page); atomic_inc(&map->nr_free); + return atomic_dec_and_test(&upid->ns->nr_pids); } static int alloc_pidmap(struct pid_namespace *pid_ns) @@ -154,6 +156,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns) do { if (!test_and_set_bit(offset, map->page)) { atomic_dec(&map->nr_free); + atomic_inc(&pid_ns->nr_pids); pid_ns->last_pid = pid; return pid; } @@ -226,6 +229,7 @@ static void delayed_put_pid(struct rcu_head *rhp) void free_pid(struct pid *pid) { /* We can be called with write_lock_irq(&tasklist_lock) held */ + struct upid *upid; int i; unsigned long flags; @@ -234,8 +238,11 @@ void free_pid(struct pid *pid) hlist_del_rcu(&pid->numbers[i].pid_chain); spin_unlock_irqrestore(&pidmap_lock, flags); - for (i = 0; i <= pid->level; i++) - free_pidmap(pid->numbers + i); + for (i = 0; i <= pid->level; i++) { + upid = pid->numbers + i; + if (free_pidmap(upid)) + pid_ns_release_proc(upid->ns); + } call_rcu(&pid->rcu, delayed_put_pid); } @@ -276,6 +283,9 @@ struct pid *alloc_pid(struct pid_namespace *ns) &pid_hash[pid_hashfn(upid->nr, upid->ns)]); spin_unlock_irq(&pidmap_lock); + if (pid->numbers[pid->level].nr == 1) + pid_ns_prepare_proc(ns); + out: return pid; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index a5aff94..beba2b4 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -92,6 +92,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p set_bit(0, ns->pidmap[0].page); atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); + atomic_set(&ns->nr_pids, 0); for (i = 1; i < PIDMAP_ENTRIES; i++) atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); -- 1.5.6.5 --
| Greg KH | Og dreams of kernels |
| Jens Axboe | [PATCH 31/33] Fusion: sg chaining support |
| Arnd Bergmann | Re: finding your own dead "CONFIG_" variables |
| Mark Brown | [PATCH 2/2] Subject: natsemi: Allow users to disable workaround for DspCfg reset |
| Tony Breeds | [LGUEST] Look in object dir for .config |
git: | |
| Brian Downing | Re: Git in a Nutshell guide |
| John Benes | Re: master has some toys |
| Matthias Lederhofer | [PATCH 4/7] introduce GIT_WORK_TREE to specify the work tree |
| Alexander Sulfrian | [RFC/PATCH] RE: git calls SSH_ASKPASS even if DISPLAY is not set |
| Junio C Hamano | Re: Rss produced by git is not valid xml? |
| Linux Kernel Mailing List | iSeries: fix section mismatch in iseries_veth |
| Linux Kernel Mailing List | ixbge: remove TX lock and redo TX accounting. |
| L |
