多线程random_r比单线程版本慢

辛才俊

2023-03-14

问题内容：

以下程序与此处描述的程序基本相同。当我使用两个线程（NTHREADS == 2）编译并运行该程序时，得到以下运行时间：

real        0m14.120s
user        0m25.570s
sys         0m0.050s

而当仅使用一个线程（NTHREADS == 1）运行时，尽管只用到了一个内核，运行时间反而明显更短。

real        0m4.705s
user        0m4.660s
sys         0m0.010s

我的系统是双核的，我知道random_r是线程安全的，并且我很确定它是非阻塞的。如果在没有random_r的情况下运行同一程序，并且使用余弦和正弦值的计算作为替换，则双线程版本的运行时间约为预期的1/2。

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NTHREADS 2
#define PRNG_BUFSZ 8
#define ITERATIONS 1000000000

/*
 * Thread entry point: draws ITERATIONS / NTHREADS values from the
 * generator state passed in and prints the last value drawn.
 *
 * arg: pointer to the struct random_data this thread draws from.
 * Returns NULL; the thread has no result value.
 */
void* thread_run(void* arg) {
    struct random_data* state = (struct random_data*)arg;
    int totalIterations = ITERATIONS / NTHREADS;
    int r1 = 0; /* defined value to print even if the loop body never runs */
    int i;

    for (i = 0; i < totalIterations; i++) {
        random_r(state, &r1);
    }
    printf("%i\n", r1);

    /* A void* function must return a value; pthreads stores it for pthread_join. */
    return NULL;
}

/*
 * Starts NTHREADS worker threads running thread_run, each with its own
 * random_r() generator state, and waits for all of them to finish.
 *
 * Returns EXIT_SUCCESS when every thread was created and joined, and
 * EXIT_FAILURE when an allocation, a generator initialisation or a thread
 * creation fails.
 */
int main(int argc, char** argv) {
    struct random_data* rand_states = calloc(NTHREADS, sizeof *rand_states);
    char* rand_statebufs = calloc(NTHREADS, PRNG_BUFSZ);
    pthread_t* thread_ids = calloc(NTHREADS, sizeof *thread_ids);
    int status = EXIT_SUCCESS;
    int started = 0;
    int t;

    if (rand_states == NULL || rand_statebufs == NULL || thread_ids == NULL) {
        status = EXIT_FAILURE;
        goto cleanup;
    }

    /* create threads */
    for (t = 0; t < NTHREADS; t++) {
        /*
         * Thread t gets its own PRNG_BUFSZ-byte slice of the state buffer;
         * indexing by t alone would make the slices overlap.
         * The states of consecutive threads are still adjacent in memory.
         */
        if (initstate_r(random(), &rand_statebufs[t * PRNG_BUFSZ], PRNG_BUFSZ, &rand_states[t]) != 0 ||
            pthread_create(&thread_ids[t], NULL, &thread_run, &rand_states[t]) != 0) {
            status = EXIT_FAILURE;
            break;
        }
        started++;
    }

    /* Only join the threads that were actually created. */
    for (t = 0; t < started; t++) {
        pthread_join(thread_ids[t], NULL);
    }

cleanup:
    free(thread_ids);
    free(rand_states);
    free(rand_statebufs);
    return status;
}

我很困惑：既然random_r本来就是为多线程应用程序准备的，为什么在生成随机数时，双线程版本的性能反而比单线程版本差这么多？

问题答案：

只需做一个非常简单的改动，让各线程的数据在内存中彼此隔开：

/*
 * Allocate 64 times as many elements as there are threads in both arrays,
 * so that the entry used by thread t can sit at index t*64.
 */
struct random_data* rand_states = (struct random_data*)calloc(NTHREADS * 64, sizeof(struct random_data));
char* rand_statebufs = (char*)calloc(NTHREADS*64, PRNG_BUFSZ);
pthread_t* thread_ids;
int t = 0;
thread_ids = (pthread_t*)calloc(NTHREADS, sizeof(pthread_t));
/* Create the threads; thread t uses element t*64 of each array. */
for (t = 0; t < NTHREADS; t++) {
    initstate_r(random(), &rand_statebufs[t*64], PRNG_BUFSZ, &rand_states[t*64]);
    pthread_create(&thread_ids[t], NULL, &thread_run, &rand_states[t*64]);
}

这一改动使程序在我的双核计算机上的运行时间大大缩短。

这证实了该改动想要验证的猜测——您在两个不同的线程中修改位于同一高速缓存行上的值，因此产生了高速缓存争用。如果您还不了解这一点并且有时间的话，赫伯·萨特（Herb Sutter）的演讲“机器体系结构——您的编程语言从未告诉过您的事情”值得一看，他从大约1:20处开始演示伪共享（false sharing）。

确定您的缓存行大小，并让每个线程的数据都按该大小对齐。

将线程的所有数据整理到一个结构中，然后对齐它会更干净一些：

#define CACHE_LINE_SIZE 64

/*
 * Per-thread generator data: the random_r() state, its state buffer, and
 * trailing padding that rounds the two up to a multiple of CACHE_LINE_SIZE.
 */
struct thread_data {
    struct random_data random_data;
    char statebuf[PRNG_BUFSZ];
    /*
     * Pad modulo the cache line rather than subtracting from it, so the
     * array length stays in 1..CACHE_LINE_SIZE even when the two members
     * above take CACHE_LINE_SIZE bytes or more. A payload that is already
     * an exact multiple gets one full extra line of padding.
     */
    char padding[CACHE_LINE_SIZE - ( sizeof ( struct random_data ) + PRNG_BUFSZ ) % CACHE_LINE_SIZE];
};

int main ( int argc, char** argv )
{
    printf ( "%zd\n", sizeof ( struct thread_data ) );

    void* apointer;

    if ( posix_memalign ( &apointer, sizeof ( struct thread_data ), NTHREADS * sizeof ( struct thread_data ) ) )
        exit ( 1 );

    struct thread_data* thread_states = apointer;

    memset ( apointer, 0, NTHREADS * sizeof ( struct thread_data ) );

    pthread_t* thread_ids;

    int t = 0;

    thread_ids = ( pthread_t* ) calloc ( NTHREADS, sizeof ( pthread_t ) );

    /* create threads */
    for ( t = 0; t < NTHREADS; t++ ) {
        initstate_r ( random(), thread_states[t].statebuf, PRNG_BUFSZ, &thread_states[t].random_data );
        pthread_create ( &thread_ids[t], NULL, &thread_run, &thread_states[t].random_data );
    }

    for ( t = 0; t < NTHREADS; t++ ) {
        pthread_join ( thread_ids[t], NULL );
    }

    free ( thread_ids );
    free ( thread_states );
}

CACHE_LINE_SIZE 为 64 时的结果：

refugio:$ gcc -O3 -o bin/nixuz_random_r src/nixuz_random_r.c -lpthread
refugio:$ time bin/nixuz_random_r 
64
63499495
944240966

real    0m1.278s
user    0m2.540s
sys 0m0.000s

或者，您可以把大小设为缓存行大小的两倍并使用malloc——额外的填充可确保被修改的内存位于不同的缓存行上，因为malloc（如果我没记错的话）是按16字节而不是64字节对齐的。

（上面的运行时间较短，是因为我把ITERATIONS减小到了原来的十分之一，并不是因为我的机器快得离谱）

多线程random_r比单线程版本慢

相关阅读

相关文章

相关问答

相关工具

相关文档