当前位置: 首页 > 工具软件 > rr-project > 使用案例 >

Mozilla rr

龙俊美
2023-12-01

Mozilla rr快速学习

比GDB更加强大的调试工具

说明:程序运行中,经常有一些段错误无法稳定复现,只在某些特定条件下才会触发(如多线程的执行顺序、gc操作等),复现和调试都非常困难。rr做的事情是:只要用rr把出错的那次运行记录下来,就可以无限次回放调试,并且每次回放都和段错误发生时的情况一模一样

官网:https://rr-project.org/

github:https://github.com/mozilla/rr

安装:

快速入门

通过以下test.c 简单了解下rr, main函数创建执行6个线程,打印出线程创建的顺序,执行的顺序以及指针

#include<pthread.h>
#include<stdio.h>
#include<stdlib.h>

#define N 6

void *thread(void *p) /* Thread entry point: prints the int that p points to and the address p itself. */
{
    int *id = p; /* p is the argument given to pthread_create; line count kept so test.c:25 in the gdb session stays valid */
    printf("thread id:%d, p:%p\n", *id, (void *)id); /* %p requires a void * argument */
    return NULL; /* a void * function must return a value; no result is reported */
}

int main() /* Creates N threads, handing each one its own heap-allocated copy of the loop index. */
{
    pthread_t threads[N];
    int i;
    int *p;

    for(i=0;i < N;i++)
    {
        printf("create id:%d\n", i);
        p = (int *) malloc(sizeof(int)); /* one int per thread; never freed, and the result is not checked for NULL */
        *p = 0; /* initial store, immediately overwritten on the next line */
        *p = i; /* test.c:25 - the store the watchpoint in the gdb session stops at */
        pthread_create(threads+i, NULL, thread, p); /* thread i receives p as its argument; return value is not checked */
    }

    pthread_exit(NULL); /* terminates only the main thread, so the created threads can still run and print */
    return 0; /* not reached: pthread_exit does not return */
}

编译执行,可以看到由于线程执行的顺序不一致导致每次打印的信息都不一样

[root@localhost rrtest]# gcc test.c -o test  -lpthread -g
[root@localhost rrtest]# ./test 
create id:0
create id:1
create id:2
create id:3
create id:4
create id:5
thread id:0, p:0x20ba670
thread id:3, p:0x20baa30
thread id:2, p:0x20ba8f0
thread id:1, p:0x20ba7b0
thread id:4, p:0x20bab70
thread id:5, p:0x20bacb0
[root@localhost rrtest]# ./test 
create id:0
create id:1
create id:2
create id:3
thread id:1, p:0x11f37b0
create id:4
thread id:3, p:0x11f3a30
thread id:2, p:0x11f38f0
thread id:0, p:0x11f3670
create id:5
thread id:4, p:0x11f3b70
thread id:5, p:0x11f3cb0

现在换成rr执行一次,并保留,再进行回放。

执行rr record ./test记录程序运行,再执行rr replay 进行回放,可以看到每次回放都和刚才记录的执行顺序一模一样,打印的指针也一模一样

[root@localhost rrtest]# rr record ./test 
rr: Saving execution to trace directory `/root/.local/share/rr/test-0'.
create id:0
create id:1
thread id:0, p:0x6f2670
create id:2
thread id:2, p:0x6f28f0
create id:3
thread id:3, p:0x6f2a30
create id:4
create id:5
thread id:4, p:0x6f2b70
thread id:1, p:0x6f27b0
thread id:5, p:0x6f2cb0
[root@localhost rrtest]# rr replay
0x00007fe2bc5f4f60 in _start () from /lib64/ld-linux-x86-64.so.2
(rr) c
Continuing.
create id:0
create id:1
thread id:0, p:0x6f2670
create id:2
thread id:2, p:0x6f28f0
create id:3
thread id:3, p:0x6f2a30
create id:4
create id:5
thread id:4, p:0x6f2b70
thread id:1, p:0x6f27b0
thread id:5, p:0x6f2cb0
[New Thread 9457.9463]

Thread 2 received signal SIGKILL, Killed.
[Switching to Thread 9457.9463]
0x0000000070000002 in ?? ()
(rr) run
The program being debugged has been started already.
Start it from the beginning? (y or n) y
Starting program: /root/.local/share/rr/test-0/mmap_hardlink_3_test 

Program stopped.
0x00007fe2bc5f4f60 in _start () from /lib64/ld-linux-x86-64.so.2
(rr) c
Continuing.
create id:0
create id:1
thread id:0, p:0x6f2670
create id:2
thread id:2, p:0x6f28f0
create id:3
thread id:3, p:0x6f2a30
create id:4
create id:5
thread id:4, p:0x6f2b70
thread id:1, p:0x6f27b0
thread id:5, p:0x6f2cb0
[New Thread 9457.9463]

Thread 2 received signal SIGKILL, Killed.
[Switching to Thread 9457.9463]
0x0000000070000002 in ?? ()

可以看到rr的作用:完整记录程序的一次运行,之后可以反复回放。只要在出现段错误的那次运行中做了记录,就可以不断地回放调试。

并且在rr下可以使用gdb的功能, 如打断点调试之类的

(rr) b pthread_exit
Breakpoint 2 at 0x400500
(rr) c
Continuing.
create id:0
create id:1
thread id:0, p:0x6f2670
create id:2
thread id:2, p:0x6f28f0
create id:3
thread id:3, p:0x6f2a30
create id:4
create id:5
thread id:4, p:0x6f2b70
[New Thread 9457.9459]
[New Thread 9457.9463]

Thread 1 hit Breakpoint 2, __pthread_exit (value=0x0) at pthread_exit.c:25
25	{
(rr)

另外rr还有一个很强大的功能: watch -l 配合gdb反向执行(reverse-continue),可以观察到某个值的变化情况,如对0x6f2a30这个指针指向的值进行观察

(rr) c
Continuing.
create id:0
create id:1
thread id:0, p:0x6f2670
create id:2
thread id:2, p:0x6f28f0
create id:3
thread id:3, p:0x6f2a30
create id:4
create id:5
thread id:4, p:0x6f2b70
thread id:1, p:0x6f27b0
thread id:5, p:0x6f2cb0
[New Thread 9457.9463]

Thread 2 received signal SIGKILL, Killed.
[Switching to Thread 9457.9463]
0x0000000070000002 in ?? ()
(rr) watch -l *(int *)0x6f2a30
Hardware watchpoint 1: -location *(int *)0x6f2a30
(rr) reverse-continue 
Continuing.
[New Thread 9457.9457]
warning: Corrupted shared library list: 0x6f2e10 != 0x7fe2bc81a990
[New Thread 9457.9459]
[Switching to Thread 9457.9457]

Thread 3 hit Hardware watchpoint 1: -location *(int *)0x6f2a30

Old value = 3
New value = 0
0x000000000040067c in main () at test.c:25
25	        *p = i;
(rr) reverse-continue 
Continuing.

Thread 3 hit Hardware watchpoint 1: -location *(int *)0x6f2a30

Old value = 0
New value = <unreadable>
0x0000000070000000 in ?? ()

rr 调试nginx

rr调试nginx必须加上-w,详见这个issue https://github.com/mozilla/rr/issues/2058

先在一个终端执行rr record -w 启动nginx,然后发起curl请求触发nginx的coredump,最后再执行killall nginx

[root@localhost luajit-bug-report]# rr record -w ./nginx/sbin/nginx -p . -c nginx.conf
[root@localhost luajit-bug-report]# killall nginx
[root@localhost luajit-bug-report]# tail -1 logs/error.log 
2019/03/04 14:07:17 [alert] 10361#0: worker process 10362 exited on signal 6 (core dumped)
[root@localhost luajit-bug-report]# rr ps
PID	PPID	EXIT	CMD
10360	--	0	./nginx/sbin/nginx -p . -c nginx.conf
10361	10360	0	(forked without exec)
10362	10361	-6	(forked without exec)
10363	10361	0	(forked without exec)
10373	10361	0	(forked without exec)

观察日志可以看到nginx有一个pid为10362的worker进程coredump退出了,再执行rr ps 可以看到pid为10362的进程异常退出(EXIT为-6),接着就可以调试了

rr replay -f 10362
(rr) c
(rr) bt

转载于:https://my.oschina.net/u/4001231/blog/3017627

 类似资料:

相关阅读

相关文章

相关问答