1. 现象

    gdb对OpenIPMI core的分析表明 domain_id 未被初始化好就被使用,具体信息如下:

    [Thread debugging using libthread_db enabled]
    > Using host libthread_db library "/lib64/libthread_db.so.1".
    > Core was generated by `/usr/lsd/bin/tcmeoer'.
    > Program terminated with signal 11, Segmentation fault.
    > #0  0x00007f2e87731ab2 in ipmi_domain_iterate_entities () from
    > /lib64/libOpenIPMI.so.0
    > Missing separate debuginfos, use: debuginfo-install
    > OpenIPMI-libs-2.0.19-11.el7.x86_64 gdbm-1.10-8.el7.x86_64
    > glib2-2.46.2-4.el7.x86_64 glibc-2.17-78.el7.x86_64
    > hiredis-0.12.1-1.el7.x86_64 keyutils-libs-1.5.8-3.el7.x86_64
    > krb5-libs-1.12.2-14.el7.x86_64 libcom_err-1.42.9-7.el7.x86_64
    > libffi-3.0.13-11.el7.x86_64 libgcc-4.8.3-9.el7.x86_64
    > libselinux-2.2.2-6.el7.x86_64 libstdc++-4.8.3-9.el7.x86_64
    > openssl-libs-1.0.1e-42.el7.x86_64 pciutils-libs-3.2.1-4.el7.x86_64
    > pcre-8.32-14.el7.x86_64 python-libs-2.7.5-16.el7.x86_64
    > xz-libs-5.1.2-9alpha.el7.x86_64 zlib-1.2.7-13.el7.x86_64
    > (gdb) bt
    > #0  0x00007f2e87731ab2 in ipmi_domain_iterate_entities () at
    > /lib64/libOpenIPMI.so.0
    > #1  0x00000000004e47e3 in sensor_check_timeout (cb_data=0x0,
    > id=0x2ed5ce0) at src/sysmon.c:300
    > #2  0x00007f2e874e7735 in process_timers () at
    > /lib64/libOpenIPMIposix.so.0
    > #3  0x00007f2e874e8d0b in sel_select_loop () at
    > /lib64/libOpenIPMIposix.so.0
    > #4  0x00007f2e872df12d in operation_loop () at
    > /lib64/libOpenIPMIpthread.so.0
    > #5  0x00000000004e4a40 in sensor_thread (data=0x2ee8100) at
    > src/sysmon.c:416
    > #6  0x00007f2e88ac8df5 in start_thread () at /lib64/libpthread.so.0
    > #7  0x00007f2e85df61ad in clone () at /lib64/libc.so.6
    > (gdb) p domain_id.domain
    > $1 = (ipmi_domain_t *) 0x0
    > (gdb) p handle_entity
    > $2 = {void (ipmi_entity_t *, void *)} 0x4e477a <handle_entity>
    >
    > == related source code is as below -==
    > static void handle_entity(ipmi_entity_t * ent, void *b_data)


    其中domain_id使用的地方在c文件头就有定义:
    ipmi_domain_id_t domain_id;
    由于它是文件作用域的全局变量,如果没有显式初始化的话,它的成员默认为0(即 domain_id.domain 为 NULL)。这个domain_id 会被后面的timer回调周期性地使用:

    /*
     * Timer callback. Walks the entities of the global domain, calling
     * handle_entity for each one, then re-arms check_timer with itself as
     * the handler so the check repeats after check_timeout.
     * Neither cb_data nor id is used in the body.
     */
    void sensor_check_timeout(void *cb_data, os_hnd_timer_id_t * id)
    {

        /* NOTE: domain_id.domain is passed on without a NULL check. */
        ipmi_domain_iterate_entities(domain_id.domain, handle_entity, NULL);

        /* Re-schedule this same callback; the return value is not checked. */
        os_hnd->start_timer(os_hnd,
                            check_timer, &check_timeout, sensor_check_timeout, NULL);
    }

    其中 ipmi_domain_iterate_entities() 要求第一个参数 domain_id.domain 不为空,否则访问 domain->entities 时会产生段错误(segmentation fault)。

    /*
     * Passes domain->entities, handler and cb_data to
     * ipmi_entities_iterate_entities(). Always returns 0.
     * The domain pointer is dereferenced without a NULL check in this body.
     */
    int ipmi_domain_iterate_entities(ipmi_domain_t      *domain,
                     ipmi_entity_ptr_cb handler,
                     void               *cb_data)
    {
        /* NOTE(review): presumably asserts the domain lock is held; macro is not shown here. */
        CHECK_DOMAIN_LOCK(domain);

        /* Reads domain->entities, so a NULL domain faults on this line. */
        ipmi_entities_iterate_entities(domain->entities, handler, cb_data);
        return 0;
    }

    正常情况下,domain_id.domain 是在OpenIPMI初始化的时候,由回调函数setup_done()来初始化的,如下面的代码所示:
    rv = ipmi_open_domain("", &con, 1, setup_done, NULL, NULL, NULL, NULL, 0, &domain_id);
        if (rv) {
            LOGLIB_TRACE( "ipmi_init_domain: %s\n", strerror(rv));
            pthread_mutex_unlock(&ipmi_mutex);
            return -4;
        }
    /*
     * Callback handed to ipmi_open_domain(). Stores the id of the given
     * domain in the global domain_id and registers entity_change as the
     * domain's entity-update handler.
     * err, conn_num, port_num, still_connected and user_data are not used
     * in the body.
     */
    void
    setup_done(ipmi_domain_t * domain,
               int err,
               unsigned int conn_num,
               unsigned int port_num, int still_connected, void *user_data)
    {
        int rv;


        /* Only from this point on does the global domain_id refer to the domain. */
        domain_id = ipmi_domain_convert_to_id(domain);
        LOGLIB_TRACE("domain: %p", domain);

        /* Register the callback function entity_change. When a new entity
           is created, entity_change is called */
        rv = ipmi_domain_add_entity_update_handler(domain, entity_change, domain);
        if (rv) {
            /* Registration failure is only logged; there is no other recovery. */
            LOGLIB_ERROR("ipmi_domain_add_entity_update_handler return error: %d\n", rv);
            return;
        }
    }


    2. 原因分析

    根据日志,并没有发现有异常提前退出的地方:

     rv = ipmi_parse_args(&curr_arg, argc, argv, &args);
        if (rv) {
            LOGLIB_ERROR(
                    "Error parsing command arguments, argument %d: %s\n",
                    curr_arg, strerror(rv));
            usage();
            return -2;
        }

        pthread_mutex_lock(&ipmi_mutex);
        rv = ipmi_args_setup_con(args, os_hnd, NULL, &con);
        if (rv) {
            LOGLIB_ERROR( "ipmi_ip_setup_con: %s", strerror(rv));
            pthread_mutex_unlock(&ipmi_mutex);
            return -3;
        }

        rv = ipmi_open_domain("", &con, 1, setup_done, NULL, NULL, NULL, NULL, 0, &domain_id);
        if (rv) {
            LOGLIB_TRACE( "ipmi_init_domain: %s\n", strerror(rv));
            pthread_mutex_unlock(&ipmi_mutex);
            return -4;
        }

        LOGLIB_INFO("domain: %p", domain_id.domain);

        rv = init_alert_shmbuf(ACCESS_EVENTS_MEMORY);
        if (rv != 0) {
            LOGLIB_ERROR( "init_alert_shmbuf: %x\n", rv);
            pthread_mutex_unlock(&ipmi_mutex);
            leave();
        }

        rv = os_hnd->alloc_timer(os_hnd, &check_timer);
        if (rv) {
            LOGLIB_ERROR( "alloc_timer: %x\n", rv);
            fprintf(stdout, "alloc_timer: %x\n", rv);
            pthread_mutex_unlock(&ipmi_mutex);
            //rv = ipmi_args_setup_con(args, os_hnd, sel, &con);
            leave();
        }
     /*
         * Wait until setup_done() thread has been ready, otherwise later timer
         * thread sensor_check_timeout may core
         */
        while(domain_id.domain == NULL) {
            LOGLIB_INFO("Waiting ipmi smi connection ready...\n");
            sleep(5);
        }

        /* Extend the delay to pool BMC sensors to ensure the setup_done() finished */
        check_timeout.tv_sec = 10;
        check_timeout.tv_usec = 0;
        os_hnd->start_timer(os_hnd,
                            check_timer, &check_timeout, sensor_check_timeout, NULL);

        rv = os_hnd->create_thread(os_hnd, 0, sensor_thread, os_hnd);
        if (rv) {
            LOGLIB_ERROR( "create sensor_thread: %x\n", rv);
            pthread_mutex_unlock(&ipmi_mutex);
            return -5;
        }

      上面所有可能异常退出的地方都没有发现打印信息,因此可以判断程序没有异常退出。奇怪的是上面gdb信息表明,domain_id.domain这个指针为空,就说明它还没有被初始化。但是setup_done()早已作为回调函数交给 ipmi_open_domain() 了,难道它还没有被执行,timer线程就已经先执行了?

    3. 代码证实

    通过研究ipmi_open_domain()函数源代码,可以看到这个函数最后一个参数为空和不为空的处理差异:

    为空的时候: domain_id被放入到一个初始化队列里面,这样就导致setup_done()可能延迟很晚执行。
    之所以这么说,是因为ipmi_open_domain()会调用:
    rv = ipmi_domain_add_connect_change_handler(domain,
                                con_change_handler,
                                con_change_cb_data);

    而 ipmi_domain_add_connect_change_handler()实现如下:
    /*
     * Adds handler together with cb_data to domain->con_change_handlers.
     * Returns 0 when locked_list_add() returns non-zero, ENOMEM otherwise.
     * The handler is only stored here; this function does not invoke it.
     */
    int ipmi_domain_add_connect_change_handler(ipmi_domain_t      *domain,
                           ipmi_domain_con_cb handler,
                           void               *cb_data)
    {
        if (locked_list_add(domain->con_change_handlers, handler, cb_data))
        return 0;
        else
        return ENOMEM;
    }


可以看到回调函数被放到一个list里面,被统一的scheduler调度执行。因此很可能在ipmi_open_domain()函数返回之后, setup_done()都还没有得到执行。


而当ipmi_open_domain()最后一个参数不为空的时候:

domain_id会在ipmi_open_domain()返回之前初始化完成,这个可以由ipmi_open_domain()中的这两行证实:

if (new_domain)
    *new_domain = ipmi_domain_convert_to_id(domain);

通过上面的分析,再结合我们自己写的openIPMI代码,可以看到确实存在timer线程已经开始执行但setup_done()还没有开始执行的可能,这就会导致段错误。

4.实验验证

根据上面的分析,把上面ipmi_open_domain()的最后一个参数设置成了我们期望用到的domain_id的地址,理论上就不会再出现上面的段错误了。修改完代码之后,反复进行同样的测试,整晚上都没有重现那个问题。

5.经验教训

虽然上面的这部分代码是参考标准的 OpenIPMI-2.0.21/sample/sample2.c 实现的,但实际在BMC比较繁忙的时候还是会出现段错误。可见,开源的代码并不能(当然也没有义务)保证没有隐患。另外,在调试这个问题的过程中,刚开始由于对OpenIPMI源代码有太多的恐惧感和陌生感,迟迟没有深入阅读被引用到的ipmi_open_domain()的具体实现,导致花费了比较多的时间去猜测和尝试,但总达不到看完了ipmi_open_domain()之后恍然大悟的快感。因此今后碰到类似问题,一定要深入源代码、勇于阅读相关的代码,哪怕对它再不熟悉,它的执行逻辑也一定是确定的,而我们的猜测却可能是不确定的。