[Crash-utility] [PATCH] Show missing tasks in ps

Dave Anderson anderson at redhat.com
Tue Nov 30 20:33:08 UTC 2010


----- "Michael Holzheu" <holzheu at linux.vnet.ibm.com> wrote:

> Hi Dave,
> 
> I got an s390x dump of a Linux 2.6.36 system, where a task (kmcheck, pid=44) is
> missing in the ps output. I debugged the problem and I think that I found the
> reason:
> 
> It looks like crash does not walk the linked list of the pid hash table
> to the end if it finds a NULL pointer in the pid.tasks[PIDTYPE_PID=0]
> array. Unfortunately, this condition is true for the struct pid that comes
> before our lost task in the linked list. Therefore crash does not find our
> task.

That sounds similar to the fix Bob Montgomery made in 5.0.7:

     - Fix for the potential to miss one or more tasks in 2.6.23 and earlier
       kernels, presumably due to catching an entry in the kernel's pid_hash[]
       chain in transition.  Without the patch, the task will simply not be
       seen in the gathered task list.
       (bob.montgomery at hp.com)

where this was his patch posting -- which fixed refresh_hlist_task_table_v2():

  [Crash-utility] Missing PID 1 is crash problem with losing tasks
  https://www.redhat.com/archives/crash-utility/2010-August/msg00049.html

and where your patch fixes refresh_hlist_task_table_v3().

I'll give it a test run...

Thanks,
  Dave
 

> The attached patch seems to fix this problem.
> 
> Here is my crash debug log with the 2.6.36 dump:
> ---------------------------------------------
> Task "kmcheck" is in hash slot 2941 in the linked list at position 2:
> 
> crash> print pid_hash[2941]
> $4 = {
>   first = 0x3f5fb7f8
> }
> 
> crash> upid
> struct upid {
>     int nr;
>     struct pid_namespace *ns;
>     struct hlist_node pid_chain;
> }
> SIZE: 32
> 
> crash> upid.pid_chain
> struct upid {
>   [16] struct hlist_node pid_chain;
> }
> 
> crash> eval 0x3f5fb7f8 - 16
> hexadecimal: 3f5fb7e8  
> 
> crash> upid 3f5fb7e8   <<<<---- the first upid in the list
> struct upid {
>   nr = 565, 
>   ns = 0x81d8f8, 
>   pid_chain = {
>     next = 0x3edea2b0, 
>     pprev = 0x96554e8
>   }
> }
> 
> crash> pid
> struct pid {
>     atomic_t count;
>     unsigned int level;
>     struct hlist_head tasks[3];
>     struct rcu_head rcu;
>     struct upid numbers[1];
> }
> SIZE: 80
> 
> crash> pid.numbers
> struct pid {
>   [48] struct upid numbers[1];
> }
> 
> crash> eval 3f5fb7e8 - 48
> hexadecimal: 3f5fb7b8  
> 
> crash> pid 3f5fb7b8
> struct pid {
>   count = {
>     counter = 1
>   }, 
>   level = 0, 
>   tasks = {{
>       first = 0x0 <<<----------- tasks[0] is NULL
>     }, {
>       first = 0x3d488620
>     }, {
>       first = 0x0
>     }}, 
>   rcu = {
>     next = 0x5a5a5a5a5a5a5a5a, 
>     func = 0x5a5a5a5a5a5a5a5a
>   }, 
>   numbers = {{
>       nr = 565, 
>       ns = 0x81d8f8, 
>       pid_chain = {
>         next = 0x3edea2b0,  <<<--------- Pointer to second element in list
>         pprev = 0x96554e8
>       }
>     }}
> }
> 
> crash> eval 0x3edea2b0 - 16
> hexadecimal: 3edea2a0   <<<-- The second upid in the list
> 
> crash> upid 0x3edea2a0
> struct upid {
>   nr = 44,                 <<<--- Our missing pid=44 (kmcheck)
>   ns = 0x81d8f8, 
>   pid_chain = {
>     next = 0x0, 
>     pprev = 0x3f5fb7f8
>   }
> }
> 
> crash> eval 0x3edea2a0 - 48
> hexadecimal: 3edea270  
> 
> crash> pid 3edea270
> struct pid {
>   count = {
>     counter = 5
>   }, 
>   level = 0, 
>   tasks = {{
>       first = 0x3e799908   <<<--- Pointer to our task_struct.pids
>     }, {
>       first = 0x0
>     }, {
>       first = 0x0
>     }}, 
>   rcu = {
>     next = 0x5a5a5a5a5a5a5a5a, 
>     func = 0x5a5a5a5a5a5a5a5a
>   }, 
>   numbers = {{
>       nr = 44, 
>       ns = 0x81d8f8, 
>       pid_chain = {
>         next = 0x0, 
>         pprev = 0x3f5fb7f8
>       }
>     }}
> }
> 
> crash> task_struct.pids
> struct task_struct {
>    [712] struct pid_link pids[3];
> }
> 
> crash> eval 0x3e799908 - 712
> hexadecimal: 3e799640  
> 
> crash> task_struct 3e799640 | grep comm
>   comm = "kmcheck\000\000\000\000\000\000\000\000", <<<--- here it is
> ---
>  task.c |    4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
> 
> --- a/task.c
> +++ b/task.c
> @@ -2006,7 +2006,7 @@ do_chained:
>                  }
>  
>  		if (pid_tasks_0 == 0)
> -			continue;
> +			goto chain_next;
>  
>  		next = pid_tasks_0 - OFFSET(task_struct_pids);
>  
> @@ -2042,7 +2042,7 @@ do_chained:
>  		}
>  
>  		cnt++;
> -
> +chain_next:
>  		if (pnext) {
>  			kpp = pnext;
>  			upid = pnext - OFFSET(upid_pid_chain);
> 
> 
> --
> Crash-utility mailing list
> Crash-utility at redhat.com
> https://www.redhat.com/mailman/listinfo/crash-utility




More information about the Crash-utility mailing list