记一次pthread_cancel_init段错误分析(bpftrace)

Posted rtoax

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了记一次pthread_cancel_init段错误分析(bpftrace)相关的知识,希望对你有一定的参考价值。

记一次pthread_cancel_init段错误分析(bpftrace)


rtoax
2021年5月17日


1. coredump栈

gdb app coredump.dat

[...]

#15 0x00000000004fceba in SignalHandler (sigid=11)
    at /work/workspace/VOS_platform/workspace/os/../../platform/vos/vos_signal.c:80
#16 <signal handler called>
#17 0x00007ff8f170cd72 in strcmp () from /lib64/ld-linux-x86-64.so.2
#18 0x00007ff8f16fbae2 in _dl_map_object () from /lib64/ld-linux-x86-64.so.2
#19 0x00007ff8f1707254 in dl_open_worker () from /lib64/ld-linux-x86-64.so.2
#20 0x00007ff8f1702784 in _dl_catch_error () from /lib64/ld-linux-x86-64.so.2
#21 0x00007ff8f1706b3b in _dl_open () from /lib64/ld-linux-x86-64.so.2
#22 0x00007ff8ef0136d2 in do_dlopen () from /lib64/libc.so.6
#23 0x00007ff8f1702784 in _dl_catch_error () from /lib64/ld-linux-x86-64.so.2
#24 0x00007ff8ef013792 in __libc_dlopen_mode () from /lib64/libc.so.6
#25 0x00007ff8f14e6d73 in pthread_cancel_init () from /lib64/libpthread.so.0
#26 0x00007ff8f14e3b57 in pthread_cancel () from /lib64/libpthread.so.0

[...]

此调用栈的详情:

1.1. frame 26: pthread_cancel

int
__pthread_cancel (pthread_t th)
{
  [...]
#ifdef SHARED
  pthread_cancel_init ();
#endif
  [...]
}

1.2. frame 25: pthread_cancel_init

void
__attribute_noinline__
pthread_cancel_init (void)
{
  [...]
  handle = __libc_dlopen (LIBGCC_S_SO);
  [...]
}

1.3. frame 24: __libc_dlopen_mode

__libc_dlopen:

/* Convenience wrapper: forwards NAME to __libc_dlopen_mode with the mode
   RTLD_NOW | __RTLD_DLOPEN.  */
#define __libc_dlopen(name) \
  __libc_dlopen_mode (name, RTLD_NOW | __RTLD_DLOPEN)

__libc_dlopen_mode

void *
__libc_dlopen_mode (const char *name, int mode)
{
  [...]
#ifdef SHARED
  return (dlerror_run (do_dlopen, &args) ? NULL : (void *) args.map);
#else
  [...]
}

1.4. frame 23: _dl_catch_error

dlerror_run

static int
dlerror_run (void (*operate) (void *), void *args)
{
  [...]
  int result = (GLRO(dl_catch_error) (&objname, &last_errstring, &malloced,
				      operate, args)
  [...]
}

_dl_catch_error

int
internal_function
_dl_catch_error (const char **objname, const char **errstring,
		 bool *mallocedp, void (*operate) (void *), void *args)
{
  [...]
  if (__builtin_expect (errcode, 0) == 0)
    {
      [...]
      (*operate) (args);
      [...]
      return 0;
    }
  [...]
}

1.5. frame 22: do_dlopen

static void
do_dlopen (void *ptr)
{
  struct do_dlopen_args *args = (struct do_dlopen_args *) ptr;
  /* Open and relocate the shared object.  */
  args->map = GLRO(dl_open) (args->name, args->mode, args->caller_dlopen,
			     __LM_ID_CALLER, __libc_argc, __libc_argv,
			     __environ);
}

1.6. frame 21: _dl_open

void *
_dl_open (const char *file, int mode, const void *caller_dlopen, Lmid_t nsid,
	  int argc, char *argv[], char *env[])
{
  [...]
  int errcode = _dl_catch_error (&objname, &errstring, &malloced,
				 dl_open_worker, &args);
  [...]
}

1.7. frame 20: _dl_catch_error

同上。

1.8. frame 19: dl_open_worker

static void
dl_open_worker (void *a)
{
  [...]
  struct link_map *new;
  args->map = new = _dl_map_object (call_map, file, lt_loaded, 0,
				    mode | __RTLD_CALLMAP, args->nsid);
  [...]
}

1.9. frame 18: _dl_map_object

struct link_map *
internal_function
_dl_map_object (struct link_map *loader, const char *name,
		int type, int trace_mode, int mode, Lmid_t nsid)
{
  [...]
    /* Look for this name among those already loaded.  */
  for (l = GL(dl_ns)[nsid]._ns_loaded; l; l = l->l_next)
    {
    [...]
	  soname = ((const char *) D_PTR (l, l_info[DT_STRTAB])
		    + l->l_info[DT_SONAME]->d_un.d_val);
	  if (strcmp (name, soname) != 0)
	    continue;
    [...]
    }
  [...]
}

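1.10. frame 17: strcmp

从调用栈看,段错误(SIGSEGV,sigid=11)发生在 strcmp 内部,对应上面 _dl_map_object 中的 strcmp (name, soname) 调用;frame 16 的 <signal handler called> 即由此触发。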

2. 正常的pthread_cancel

通过对用户态调用栈进行分析,正常情况下 pthread_cancel 的代码路径为:

    strcmp+0
    _dl_check_map_versions+312
    dl_open_worker+1153
    _dl_catch_error+100
    do_dlopen+66
    start_thread+197

3. 分析

运行测试程序 a.out(源码见 4.1 节 test.c),各线程打印的栈、堆地址以及进程的内存映射如下:

./a.out
Stack: 00000000908eadb7, 0x007ffc908eadb7.
Heap:  00000000008c9010, 0x000000008c9010.

Stack: 000000005625bf07, 0x007f975625bf07.
Heap:  00000000500008c0, 0x007f97500008c0.

Stack: 0000000055a5af07, 0x007f9755a5af07.
Heap:  00000000480008c0, 0x007f97480008c0.
cat /proc/$(pidof a.out)/maps
[...]
008c9000-008ea000 rw-p 00000000 00:00 0                                  [heap]
7f9748000000-7f9748021000 rw-p 00000000 00:00 0 
7f9748021000-7f974c000000 ---p 00000000 00:00 0 
7f9750000000-7f9750021000 rw-p 00000000 00:00 0 
7f9750021000-7f9754000000 ---p 00000000 00:00 0 
7f9755045000-7f975505a000 r-xp 00000000 fd:00 5053616                    /usr/lib64/libgcc_s-4.8.5-20150702.so.1
[...]

使用 bpftrace 脚本(源码见 4.2 节 uprobe.bt)跟踪到的输出:

_dl_check_map_versions(link map 0x00007f9750000b30, verbose 0, trace_mode 0)

_dl_check_map_versions在glibc中的几个调用:

  /* So far, so good.  Now check the versions.  */
  for (unsigned int i = 0; i < new->l_searchlist.r_nlist; ++i)
    if (new->l_searchlist.r_list[i]->l_real->l_versions == NULL)
      (void) _dl_check_map_versions (new->l_searchlist.r_list[i]->l_real,
				     0, 0);
int
internal_function
_dl_check_all_versions (struct link_map *map, int verbose, int trace_mode)
{
  struct link_map *l;
  int result = 0;

  for (l = map; l != NULL; l = l->l_next)
    result |= (! l->l_faked
	       && _dl_check_map_versions (l, verbose, trace_mode));

  return result;
}

4. 本文用到的所有代码

4.1. test.c

#include <inttypes.h>
#include <malloc.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Thread handles: t_2 is created by main() and runs t2(); t_1 is created and
 * then cancelled by t2() and runs t1(). */
pthread_t t_1, t_2;

/*
 * CHECK_STACK(): print the address of a fresh automatic variable, i.e. a
 * location on the calling thread's stack.
 *
 * Output: "Stack: <low 32 bits, 16 zero-padded hex digits>, <full address,
 * 0x-prefixed, zero-padded to 16 characters>."
 *
 * The address is converted to an integer before printing: passing a pointer
 * to %x, or combining the '0' flag with %p, is undefined behaviour.
 * Wrapped in do { } while (0) so it acts as a single statement and can be
 * used more than once in the same scope.
 */
#define CHECK_STACK() \
	do { \
		char stack_probe_ = 'a'; \
		uintptr_t stack_addr_ = (uintptr_t)&stack_probe_; \
		printf("Stack: %016x, %#016" PRIxPTR ".\n", \
		       (unsigned int)stack_addr_, stack_addr_); \
	} while (0)

/*
 * CHECK_HEAP(): allocate 1024 bytes, print the returned address in the same
 * two-field format as CHECK_STACK(), then free the allocation.
 * Prints a failure notice instead when malloc() returns NULL.
 */
#define CHECK_HEAP() \
	do { \
		char *heap_probe_ = malloc(1024); \
		if (heap_probe_ != NULL) { \
			uintptr_t heap_addr_ = (uintptr_t)heap_probe_; \
			printf("Heap:  %016x, %#016" PRIxPTR ".\n", \
			       (unsigned int)heap_addr_, heap_addr_); \
		} else { \
			printf("Heap:  allocation failed.\n"); \
		} \
		free(heap_probe_); \
	} while (0)

/*
 * Worker thread body: prints its stack and heap addresses once, then prints
 * "t1 running." once per second in an endless loop.
 *
 * It never returns by itself; t2() ends it with pthread_cancel().
 * arg is unused.
 */
void *t1(void *arg) {
	(void)arg;

	CHECK_STACK();
	CHECK_HEAP();

	for (;;) {
		sleep(1);
		printf("t1 running.\n");
	}
}

/*
 * Controller thread body: prints its stack and heap addresses, then loops:
 * start t1, wait 10 seconds, cancel it, wait 3 seconds, repeat.
 *
 * The loop ends only if t1 cannot be created. arg is unused.
 */
void *t2(void *arg) {
	(void)arg;

	CHECK_STACK();
	CHECK_HEAP();

	for (;;) {
		int err = pthread_create(&t_1, NULL, t1, NULL);
		if (err != 0) {
			/* t_1 is not a valid handle here; cancelling it would be undefined. */
			fprintf(stderr, "pthread_create(t1) failed: %d\n", err);
			break;
		}

		sleep(10);
		printf("cancel t1.\n");
		pthread_cancel(t_1);
		/* Join the cancelled thread so its resources are released before
		 * the next iteration creates a new one. */
		pthread_join(t_1, NULL);
		printf("t2 exit.\n");
		sleep(3);
	}
	pthread_exit(NULL);
}

/*
 * Entry point: prints the main thread's stack and heap addresses, starts the
 * controller thread t2 and waits for it.
 *
 * Returns 0 once t2 has been joined, 1 if t2 could not be created.
 */
int main(void) {
	CHECK_STACK();
	CHECK_HEAP();

	int err = pthread_create(&t_2, NULL, t2, NULL);
	if (err != 0) {
		/* Do not join a thread that was never created. */
		fprintf(stderr, "pthread_create(t2) failed: %d\n", err);
		return 1;
	}

	pthread_join(t_2, NULL);
	return 0;
}

4.2. uprobe.bt

#!/usr/bin/env bpftrace
/*
 *  Analyze the CuUpApp coredump
 *	Rong Tao, 2021-05-17
 */

//#include <link.h>

// Print a banner and the column header once when tracing starts.
BEGIN
{
	printf("Uprobe pthread... Hit Ctrl-C to end.\n");
	printf("%-6s %-16s %-6s\n", "PID", "COMM", "TID");
}

// Fires on entry to _dl_map_object in the dynamic loader, for processes
// named "a.out" only. Prints pid, comm, the second argument as a string and
// the fifth and sixth arguments (hex and decimal respectively).
uprobe:/lib64/ld-linux-x86-64.so.2:_dl_map_object
/comm == "a.out"/
{
	printf("%-6d %-16s %s( ,%s, , ,%x, %d)\n",
			pid, comm, "_dl_map_object", str(arg1), arg4, arg5);
}
// Probe on the dynamic loader's strcmp for processes named "a.out".
// The body is currently disabled: uncomment the printf to log both string
// arguments, or the map line to count calls per user stack.
uprobe:/lib64/ld-linux-x86-64.so.2:strcmp
/comm == "a.out"/
{
//	printf("%-6d %-16s %s(%s ,%s)\n",
//			pid, comm, "strcmp", str(arg0), str(arg1));
//	@[ustack] = count();
}

// Fires on entry to _dl_check_map_versions in the dynamic loader, for
// processes named "a.out" only. Prints pid, comm and the first three
// arguments: the link map address (hex), verbose and trace_mode.
uprobe:/lib64/ld-linux-x86-64.so.2:_dl_check_map_versions
/comm == "a.out"/
{
	printf("%-6d %-16s %s(link map 0x%016lx, verbose %d, trace_mode %d)\n",
			pid, comm, "_dl_check_map_versions", arg0, arg1, arg2);
//	@[ustack] = count();
}

// Print a closing line when tracing stops.
END
{
	printf("exit.\n");
}

以上是关于记一次pthread_cancel_init段错误分析(bpftrace)的主要内容,如果未能解决你的问题,请参考以下文章

记一次pg_rman备份postgresql数据库报段错误的处理过程

记一次pg_rman备份postgresql数据库报段错误的处理过程

记一次pthread_key_create导致的__nptl_deallocate_tsd段错误

记一次ora-1652错误的解决过程

记一次FreeRTOS错误配置导致无法进入临界区

记一次Redis错误排查经历(redis cluster 节点重启后无限同步问题)