TDengine crash时生成core文件的方法

TDengine Database产品提供服务端和客户端两个部分,其中服务端目前仅支持Linux环境,客户端通过提供库的方式,支持用户在Linux环境或Windows环境运行。

TDengine Database服务端或客户端由于bug的存在,可能会出现crash的情况。发生crash时,需要能生成core文件,以便更高效地分析原因,从而快速地定位并解决bug。

Linux环境生成core的方法

在Linux环境中,生成core的两个条件:

1、core file size不为0。

该值缺省是0,表示不开启core的生成,需要修改成不为0的值;

2、设置保存core文件的目录有写权限。

缺省是在运行程序的当前目录下,或systemctl启动的程序则在“/”根目录下。生成的core文件名为core,且多次发生crash时,只会保存最早一次core文件。

TDengine Database可以使用两种方式来实现。

一、shell命令方式

1、设置不限制core file size

ulimit -c unlimited

2、设置生成core文件的目录

sudo sysctl -w kernel.core_pattern='/<corefile_dir>/core-%e-%p'

其中 corefile_dir : 生成core后保存的路径(需要提前建好)。%e 和 %p是在名称中增加程序名和pid。这样就可以保存多次core文件。

缺点:这样会将系统环境都修改了,用户其他程序crash时,生成core文件也会保存到指定的路径中!!!!

如果不修改kernel.core_pattern参数,只让生成core文件名称中增加pid,可以将 kernel.core_uses_pid修改成1。

sudo sysctl kernel.core_uses_pid=1

注:启动程序时的用户权限,需要能有<corefile_dir>目录的写权限。

二、代码API方式

可以使用系统提供的函数来完成上述参数的设置。

代码示例如下:

  // 1. set ulimit -c unlimited
  // Raise the core file size limit (RLIMIT_CORE) of the current process so that
  // a crash is allowed to write a core file.
  struct rlimit rlim;
  struct rlimit rlim_new;
  if (getrlimit(RLIMIT_CORE, &rlim) == 0) {
    // rlim_t is an unsigned integer type that may be wider than int, so cast
    // explicitly and print with %llu instead of %d.
    pPrint("the old unlimited para: rlim_cur=%llu, rlim_max=%llu",
           (unsigned long long)rlim.rlim_cur, (unsigned long long)rlim.rlim_max);
    rlim_new.rlim_cur = RLIM_INFINITY;
    rlim_new.rlim_max = RLIM_INFINITY;
    if (setrlimit(RLIMIT_CORE, &rlim_new) != 0) {
      pPrint("set unlimited fail, error: %s", strerror(errno));
      // Could not lift the limit entirely: fall back to raising the soft limit
      // up to the existing hard limit (best effort, result intentionally ignored).
      rlim_new.rlim_cur = rlim.rlim_max;
      rlim_new.rlim_max = rlim.rlim_max;
      (void)setrlimit(RLIMIT_CORE, &rlim_new);
    }
  }

  // Read the limit back to log the value that is actually in effect.
  if (getrlimit(RLIMIT_CORE, &rlim) == 0) {
    pPrint("the new unlimited para: rlim_cur=%llu, rlim_max=%llu",
           (unsigned long long)rlim.rlim_cur, (unsigned long long)rlim.rlim_max);
  }

  // 2. set pid into core file name
  struct __sysctl_args args;
  int     old_usespid = 0;
  size_t  old_len     = 0;
  int     new_usespid = 1;
  size_t  new_len     = sizeof(new_usespid);  
  int name[] = {CTL_KERN, KERN_CORE_USES_PID};  
  memset(&args, 0, sizeof(struct __sysctl_args));
  args.name    = name;
  args.nlen    = sizeof(name)/sizeof(name[0]);
  args.oldval  = &old_usespid;
  args.oldlenp = &old_len;
  args.newval  = &new_usespid;
  args.newlen  = new_len;  
  old_len = sizeof(old_usespid);  
  if (syscall(SYS__sysctl, &args) == -1) {
      pPrint("_sysctl(kern_core_uses_pid) set fail: %s", strerror(errno));
  }  
  pPrint("The old core_uses_pid[%d]: %d", old_len, old_usespid);
  old_usespid = 0;
  old_len     = 0;
  memset(&args, 0, sizeof(struct __sysctl_args));
  args.name    = name;
  args.nlen    = sizeof(name)/sizeof(name[0]);
  args.oldval  = &old_usespid;
  args.oldlenp = &old_len;  
  old_len = sizeof(old_usespid);  
  if (syscall(SYS__sysctl, &args) == -1) {
      pPrint("_sysctl(kern_core_uses_pid) get fail: %s", strerror(errno));
  }  
  pPrint("The new core_uses_pid[%d]: %d", old_len, old_usespid);

设置kernel.core_pattern的函数示例:

  // 3. create the path for saving core file
  int status; 
  char coredump_dir[32] = "/var/log/taosdump";
  if (opendir(coredump_dir) == NULL) {
    status = mkdir(coredump_dir, S_IRWXU | S_IRWXG | S_IRWXO); 
    if (status) {
      pPrint("mkdir fail, error: %s\n", strerror(errno));
    }
  }

  // 4. set kernel.core_pattern
   struct __sysctl_args args;
   char    old_corefile[128];
   size_t  old_len;
   char    new_corefile[128] = "/var/log/taosdump/core-%e-%p";
   size_t  new_len = sizeof(new_corefile);   
   int name[] = {CTL_KERN, KERN_CORE_PATTERN};
   memset(&args, 0, sizeof(struct __sysctl_args));
   args.name    = name;
   args.nlen    = sizeof(name)/sizeof(name[0]);
   args.oldval  = old_corefile;
   args.oldlenp = &old_len;
   args.newval  = new_corefile;
   args.newlen  = new_len;
   old_len = sizeof(old_corefile);
   if (syscall(SYS__sysctl, &args) == -1) {
       pPrint("_sysctl(kern_core_pattern) set fail: %s", strerror(errno));
   }   
   pPrint("The old kern_core_pattern: %*s\n", old_len, old_corefile);
   memset(&args, 0, sizeof(struct __sysctl_args));
   args.name    = name;
   args.nlen    = sizeof(name)/sizeof(name[0]);
   args.oldval  = old_corefile;
   args.oldlenp = &old_len;   
   old_len = sizeof(old_corefile);
   if (syscall(SYS__sysctl, &args) == -1) {
       pPrint("_sysctl(kern_core_pattern) get fail: %s", strerror(errno));
   }   
   pPrint("The new kern_core_pattern: %*s\n", old_len, old_corefile);

当在Linux环境出现crash时,获得core文件以及对应的应用程序,然后使用gdb进行分析,示例如下:

plum@plum-VirtualBox:~/git/TDinternal/debug/build/bin$ sudo gdb ./taos core-taos-22675
GNU gdb (Ubuntu 7.11.1-0ubuntu1~16.5) 7.11.1
Copyright (C) 2016 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "x86_64-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ./taos...done.
Core was generated by `/home/plum/git/TDengine/debug/build/bin/taos -c /etc/taos'.
(gdb) bt
#0  0x00000000004261d4 in tsParseOneRowData (str=0x980, pDataBlocks=0x7fffd8033ca0, schema=0x7fffd8034884, spd=0x7fffedcd2660, error=0x79fbf0 "", timePrec=0)
    at /home/plum/git/TDinternal/community/src/client/src/tscParseInsert.c:411
#1  0x00000000004266f8 in tsParseValues (str=0x7fffedcd23b0, pDataBlock=0x7fffd8033ca0, pMeterMeta=0x7fffd803483c, maxRows=14, spd=0x7fffedcd2660, error=0x79fbf0 "")
    at /home/plum/git/TDinternal/community/src/client/src/tscParseInsert.c:508
#2  0x0000000000426d8b in doParseInsertStatement (pSql=0x797dd0, pTableHashList=0x7fffd80342e0, str=0x7fffedcd23b0, spd=0x7fffedcd2660, totalNum=0x7fffedcd23cc)
    at /home/plum/git/TDinternal/community/src/client/src/tscParseInsert.c:640
#3  0x00000000004281da in doParserInsertSql (pSql=0x797dd0, str=0x7fffd8034c64 "0, '123', '11111\\'9911111');") at /home/plum/git/TDinternal/community/src/client/src/tscParseInsert.c:1036
#4  0x00000000004284b9 in tsParseInsertSql (pSql=0x797dd0, sql=0x7fffd8034c30 "insert into test.demo (ts, task_id, message) values(0, '123', '11111\\'9911111');", acct=0x7074a2 "0", db=0x7074c2 "0.test")
    at /home/plum/git/TDinternal/community/src/client/src/tscParseInsert.c:1103
#5  0x00000000004285f2 in tsParseSql (pSql=0x797dd0, acct=0x7074a2 "0", db=0x7074c2 "0.test", multiVnodeInsertion=false) at /home/plum/git/TDinternal/community/src/client/src/tscParseInsert.c:1127
#6  0x0000000000455572 in taos_query_imp (pObj=0x707450, pSql=0x797dd0) at /home/plum/git/TDinternal/community/src/client/src/tscSql.c:235
#7  0x0000000000455907 in taos_query (taos=0x707450, sqlstr=0x7fffd80008c0 "insert into test.demo (ts, task_id, message) values(0, '123', '11111\\'9911111');")
    at /home/plum/git/TDinternal/community/src/client/src/tscSql.c:293
#8  0x0000000000414b01 in shellRunCommandOnServer (con=0x707450, command=0x7fffd80008c0 "insert into test.demo (ts, task_id, message) values(0, '123', '11111\\'9911111');")
    at /home/plum/git/TDinternal/community/src/kit/shell/src/shellEngine.c:253
#9  0x00000000004149ad in shellRunCommand (con=0x707450, command=0x7fffd80008c0 "insert into test.demo (ts, task_id, message) values(0, '123', '11111\\'9911111');")
    at /home/plum/git/TDinternal/community/src/kit/shell/src/shellEngine.c:215
#10 0x0000000000417b75 in shellLoopQuery (arg=0x707450) at /home/plum/git/TDinternal/community/src/kit/shell/src/shellLinux.c:291
#11 0x00007ffff7bc16ba in start_thread (arg=0x7fffedcd4700) at pthread_create.c:333
#12 0x00007ffff73e641d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109

Windows环境生成core的方法

一、设置生成dmp文件方式

通过改注册表的设置让操作系统在程序crash的时候自动生成dump,并放到特定的目录下

增加注册表HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows\Windows Error Reporting\LocalDumps

添加项如下图:

TDengine 时序数据库 - TDengine crash时生成core文件的方法 coredump regedit

其中DumpType代表的含义是:0 = Create a custom dump ,1 = Mini dump ,2 = Full dump

程序崩溃后,就会在c:\TDengine目录下生成dump文件。

二、使用VS调试dmp文件

用VS打开dmp文件。测试时dmp文件是本地产生的,因此VS会依据dmp文件自行找到exe,pdb和源代码的路径。因此直接点击调试,程序会在出错代码行中断。

TDengine 时序数据库 - TDengine crash时生成core文件的方法 coredump debug

但若dump文件是exe在另一台机器上产生的,则我们最好把exe,pdb,dmp放到同一文件夹下,必须保证pdb与出问题的exe是同一时间生成的,用VS打开dump文件后还需要设置符号表文件路径和源代码路径:

(1) 当把pdb文件与dmp文件放入同一目录下时,就不需设置其路径,否则需要设置

工具->选项->调试->符号:

TDengine 时序数据库 - TDengine crash时生成core文件的方法 coredump symbol

(2) 还需设置源代码路径:

属性->调试源代码:

TDengine 时序数据库 - TDengine crash时生成core文件的方法 coredump code

这样点击“使用仅限本机进行调试”,即可在出错代码行中断:

TDengine 时序数据库 - TDengine crash时生成core文件的方法 coredump debug2