lib pthread:单线程性能与多线程

Posted

技术标签:

【中文标题】lib pthread:单线程性能与多线程【英文标题】:lib pthread: single threaded performance vs multithreaded 【发布时间】:2012-12-19 08:43:37 【问题描述】:

(注意:我看到了这个帖子:C: performance of pthread, low than single thread。如果这是同一个问题,请告诉我。)

我正在学习 pthread 库。我为同一个 C 程序编写了两个版本。该程序接收一大批 BAM 文件,并使用 samtools 库统计每个文件中的记录数。

这是单线程程序:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "bam.h"

/** maximum number of threads (declared here but not referenced anywhere in this single-threaded listing) */
static const int MAX_COUNT_THREADS=4;

/** Work item describing one file to scan; handed to scan_bam(). */
struct Param
{
    /** path of the BAM file to scan (main() stores an argv entry here) */
    char* filename;
};


/**
 * Print one result line to stdout: the file name, a tab, and the record count.
 *
 * @param filename path of the BAM file that was scanned
 * @param count    number of records counted in that file
 */
static void printCount(const char* filename,unsigned long count)
{
    /* %lu rather than %ld: count is unsigned long, and a mismatched
       conversion specifier is undefined behavior */
    fprintf(stdout,"%s\t%lu\n",filename,count);
}

/**
 * Count the records of one BAM file and report the total via printCount().
 *
 * A start and an end timestamp are logged to stderr. If the file cannot be
 * opened, the whole process is terminated with EXIT_FAILURE.
 *
 * @param ptr a heap-allocated struct Param; it is freed here before returning
 * @return always NULL
 */
static void* scan_bam(void* ptr)
{
    unsigned long count=0;
    struct Param* params=(struct Param*)ptr;

    bamFile in=bam_open(params->filename, "r");
    bam_header_t *header=NULL;
    bam1_t *b=bam_init1();

    time_t rawtime;
    time(&rawtime);

    /* the string returned by ctime() already ends with '\n' */
    fprintf(stderr,"STARTING : %s %s",params->filename,ctime(&rawtime));
    if(in==0)
    {
        fprintf(stderr,"Cannot read %s.\n",params->filename);
        exit(EXIT_FAILURE);
    }

    /* the header is read but never inspected; presumably it has to be
       consumed before the first record can be read -- confirm against bam.h */
    header=bam_header_read(in);
    while(bam_read1(in, b) > 0)
    {
        ++count;
    }
    bam_destroy1(b);
    bam_header_destroy(header);
    bam_close(in);

    printCount(params->filename,count);

    time(&rawtime);
    fprintf(stderr,"end for %s %s",params->filename,ctime(&rawtime));
    free(params);
    return NULL;
}


int main(int argc,char** argv)
    
    int optind=1;

    while(optind<argc)
    
    struct Param* params=(struct Param*)malloc(sizeof(struct Param));

    if(params==0)
        
        fprintf(stderr,"Out of memory.\n");
        exit(EXIT_FAILURE);
        
    params->filename=argv[optind++];
    scan_bam(params);
    
    return EXIT_SUCCESS;
    

下面是多线程程序。该程序最多可以使用 5 个线程,用一个条件变量及其互斥锁来统计正在运行的线程数,并在有线程结束时通知主线程可以启动一个新线程。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <time.h>
#include "bam.h"

/**
 * Abort the program when the expression `a` is not zero.
 *
 * Prints the source location and the offending value to stderr, then exits
 * with EXIT_FAILURE. The argument is evaluated exactly once, so passing a
 * call with side effects (e.g. pthread_mutex_lock(...)) is safe.
 */
#define VERIFY_ZERO(a) do { \
    const int verify_zero_ret_ = (a); \
    if (verify_zero_ret_ != 0) { \
        fprintf(stderr,"Test failed at %s line %d (ret=%d).\n",__FILE__,__LINE__,verify_zero_ret_);\
        exit(EXIT_FAILURE);\
    } \
} while(0)



/** maximum number of threads: the threshold main() compares globals.number_of_threads against before starting another worker */
static const int MAX_COUNT_THREADS=5;

/** Work item for one worker thread; allocated by main(), freed by scan_bam(). */
struct Param
{
    /** thread identifier slot for the worker that processes this item */
    pthread_t thread;
    /** path of the BAM file to scan (main() stores an argv entry here) */
    char* filename;
};

/** State shared between main() and the worker threads. */
struct GLOBALS
{
    /** lock to print */
    pthread_mutex_t mutex_print;
    /** condition: wait for free thread */
    pthread_cond_t  accept_new_thread;
    /** condition lock: guards number_of_threads and pairs with accept_new_thread */
    pthread_mutex_t  accept_new_thread_lock;
    /** number of running threads */
    int number_of_threads;
};

/** The single shared instance; initializers are listed in field order. */
static struct GLOBALS globals=
{
    PTHREAD_MUTEX_INITIALIZER,
    PTHREAD_COND_INITIALIZER,
    PTHREAD_MUTEX_INITIALIZER,
    0
};


/**
 * Print one result line to stdout while holding globals.mutex_print, so that
 * lines written by different threads are not interleaved.
 *
 * Exits the process (via VERIFY_ZERO) if locking or unlocking fails.
 *
 * @param filename path of the BAM file that was scanned
 * @param count    number of records counted in that file
 */
static void printCount(const char* filename,unsigned long count)
{
    int ret=pthread_mutex_lock(&globals.mutex_print);
    VERIFY_ZERO(ret);
    /* %lu rather than %ld: count is unsigned long, and a mismatched
       conversion specifier is undefined behavior */
    fprintf(stdout,"%s\t%lu\n",filename,count);
    ret=pthread_mutex_unlock(&globals.mutex_print);
    VERIFY_ZERO(ret);
}

static void* scan_bam(void* ptr)
    
    unsigned long count=0;
    struct Param* params=(struct Param*)ptr;
    bamFile in=bam_open(params->filename, "r") ;
    bam_header_t *header= NULL;
    bam1_t *b=bam_init1();
    time_t rawtime;
    time ( &rawtime );

    fprintf(stderr,"STARTING : %s %s",params->filename,ctime(&rawtime));
    if(in==0)
        
        fprintf(stderr,"Cannot read %s.\n",params->filename);
        exit(EXIT_FAILURE);
        
    header= bam_header_read(in);
    while((bam_read1(in, b)) > 0)
        
        ++count;

        
    bam_destroy1(b);
    bam_header_destroy(header);
           bam_close(in);


    printCount(params->filename,count);

    time ( &rawtime );
    fprintf(stderr,"end1 for %s %s",params->filename,ctime(&rawtime));
    pthread_mutex_lock( &globals.accept_new_thread_lock);
    globals.number_of_threads--;
    pthread_cond_signal(&globals.accept_new_thread);
    pthread_mutex_unlock(&globals.accept_new_thread_lock);
    time ( &rawtime );
    fprintf(stderr,"end2 for %s %s",params->filename,ctime(&rawtime));
    free(params);
    return NULL;
    


int main(int argc,char** argv)
    
    int optind=1;

    while(optind<argc)
    
    struct Param* params=(struct Param*)malloc(sizeof(struct Param));
    if(params==0)
        
        fprintf(stderr,"Out of memory.\n");
        exit(EXIT_FAILURE);
        
    pthread_mutex_lock(&globals.accept_new_thread_lock);
    while (globals.number_of_threads > MAX_COUNT_THREADS)
        
            pthread_cond_wait(&globals.accept_new_thread, &globals.accept_new_thread_lock);
        
    globals.number_of_threads++;
    pthread_mutex_unlock(&globals.accept_new_thread_lock);

    params->filename=argv[optind++];
    fprintf(stderr,"creating %s\n",params->filename);
    pthread_create (&(params->thread), NULL, scan_bam,params);
    pthread_detach(params->thread);

    
    pthread_mutex_lock(&globals.accept_new_thread_lock);
    while (globals.number_of_threads > 0)
    
        pthread_cond_wait(&globals.accept_new_thread, &globals.accept_new_thread_lock);
    
    pthread_mutex_unlock(&globals.accept_new_thread_lock);
    pthread_cond_destroy(&globals.accept_new_thread);
    return EXIT_SUCCESS;
    

编译运行多线程程序

gcc -O3 -Wall jeter.c -pthread -I/usr/local/package/samtools-0.1.18 -L/usr/local/package/samtools-0.1.18/  -lbam -lz
$ time (find .// -name "*recal.bam" | grep Item1[0-9] | xargs ./a.out )
creating ./Item10/recal.bam
creating ./Item11/recal.bam
creating ./Item12/recal.bam
creating ./Item13/recal.bam
creating ./Item14/recal.bam
creating ./Item15/recal.bam
STARTING : ./Item10/recal.bam Tue Dec 18 15:12:48 2012
STARTING : ./Item11/recal.bam Tue Dec 18 15:12:48 2012
STARTING : ./Item12/recal.bam Tue Dec 18 15:12:48 2012
STARTING : ./Item14/recal.bam Tue Dec 18 15:12:48 2012
STARTING : ./Item13/recal.bam Tue Dec 18 15:12:48 2012
STARTING : ./Item15/recal.bam Tue Dec 18 15:12:48 2012
./Item10/recal.bam    185784310
end1 for ./Item10/recal.bam Tue Dec 18 15:38:16 2012
end2 for ./Item10/recal.bam Tue Dec 18 15:38:16 2012
creating ./Item16/recal.bam
STARTING : ./Item16/recal.bam Tue Dec 18 15:38:16 2012
./Item11/recal.bam    204408906
end1 for ./Item11/recal.bam Tue Dec 18 15:41:52 2012
end2 for ./Item11/recal.bam Tue Dec 18 15:41:52 2012
creating ./Item17/recal.bam
STARTING : ./Item17/recal.bam Tue Dec 18 15:41:52 2012
./Item12/recal.bam    207766317
end1 for ./Item12/recal.bam Tue Dec 18 15:42:17 2012
end2 for ./Item12/recal.bam Tue Dec 18 15:42:17 2012
creating ./Item18/recal.bam
STARTING : ./Item18/recal.bam Tue Dec 18 15:42:17 2012
./Item15/recal.bam    224957522
end1 for ./Item15/recal.bam Tue Dec 18 15:44:54 2012
end2 for ./Item15/recal.bam Tue Dec 18 15:44:54 2012
creating ./Item19/recal.bam
STARTING : ./Item19/recal.bam Tue Dec 18 15:44:54 2012
./Item13/recal.bam    224548326
end1 for ./Item13/recal.bam Tue Dec 18 15:45:32 2012
end2 for ./Item13/recal.bam Tue Dec 18 15:45:32 2012
./Item14/recal.bam    241267346
end1 for ./Item14/recal.bam Tue Dec 18 15:48:28 2012
end2 for ./Item14/recal.bam Tue Dec 18 15:48:28 2012
./Item16/recal.bam    227446579
end1 for ./Item16/recal.bam Tue Dec 18 16:12:15 2012
end2 for ./Item16/recal.bam Tue Dec 18 16:12:15 2012
./Item17/recal.bam    215307379
end1 for ./Item17/recal.bam Tue Dec 18 16:13:05 2012
end2 for ./Item17/recal.bam Tue Dec 18 16:13:05 2012
./Item18/recal.bam    225914723
end1 for ./Item18/recal.bam Tue Dec 18 16:13:48 2012
end2 for ./Item18/recal.bam Tue Dec 18 16:13:48 2012
./Item19/recal.bam    225509630
end1 for ./Item19/recal.bam Tue Dec 18 16:14:06 2012
end2 for ./Item19/recal.bam Tue Dec 18 16:14:06 2012

real    61m17.560s
user    66m0.476s
sys    4m5.980s

编译运行单线程程序

$ gcc -O3 -Wall jeter2.c -I/usr/local/package/samtools-0.1.18 -L/usr/local/package/samtools-0.1.18/  -lbam -lz

time (find .// -name "*recal.bam" | grep Item1[0-9] | xargs ./a.out )
STARTING : ./Item10/recal.bam Tue Dec 18 16:15:25 2012
./Item10/recal.bam    185784310
end for ./Item10/recal.bam Tue Dec 18 16:20:43 2012
STARTING : ./Item11/recal.bam Tue Dec 18 16:20:43 2012
./Item11/recal.bam    204408906
end for ./Item11/recal.bam Tue Dec 18 16:26:20 2012
STARTING : ./Item12/recal.bam Tue Dec 18 16:26:20 2012
./Item12/recal.bam    207766317
end for ./Item12/recal.bam Tue Dec 18 16:31:56 2012
STARTING : ./Item13/recal.bam Tue Dec 18 16:31:56 2012
./Item13/recal.bam    224548326
end for ./Item13/recal.bam Tue Dec 18 16:38:05 2012
STARTING : ./Item14/recal.bam Tue Dec 18 16:38:05 2012
./Item14/recal.bam    241267346
end for ./Item14/recal.bam Tue Dec 18 16:44:59 2012
STARTING : ./Item15/recal.bam Tue Dec 18 16:44:59 2012
./Item15/recal.bam    224957522
end for ./Item15/recal.bam Tue Dec 18 16:50:56 2012
STARTING : ./Item16/recal.bam Tue Dec 18 16:50:56 2012
./Item16/recal.bam    227446579
end for ./Item16/recal.bam Tue Dec 18 16:58:07 2012
STARTING : ./Item17/recal.bam Tue Dec 18 16:58:07 2012
./Item17/recal.bam    215307379
end for ./Item17/recal.bam Tue Dec 18 17:04:58 2012
STARTING : ./Item18/recal.bam Tue Dec 18 17:04:58 2012
./Item18/recal.bam    225914723
end for ./Item18/recal.bam Tue Dec 18 17:11:31 2012
STARTING : ./Item19/recal.bam Tue Dec 18 17:11:31 2012
./Item19/recal.bam    225509630
end for ./Item19/recal.bam Tue Dec 18 17:18:19 2012

.

real    62m54.503s
user    53m39.529s
sys    3m44.580s

这两个程序都运行了约 1 小时。也就是说,多线程(MT)程序并不比单线程程序快。为什么?有没有办法让这段代码运行得更快?

【问题讨论】:

这里的问题可能出在文件系统上。在多线程程序中,您试图同时读取多个文件,而且很可能是从同一块磁盘读取,这样效率不高。 @JoachimPileborg - 我希望有人能针对这类磁盘操作给出一些实际的统计数据。发帖者总是说使用多个线程要么无济于事,要么反而会拖慢速度,但我从没见过任何数字或图表。如果有人真的用一块本地机械硬盘、一块本地 SSD 和一个网络磁盘(也许再加上几种组合)来实测一下,那就太好了。 【参考方案1】:

看起来 I/O 操作(从文件中读取)在您的程序中占主导地位,因此很可能您不会从线程中获得太多好处,无论它做得多么好。

还要注意,多线程版本实际上要快一点:您需要比较的是 real(实际耗时)时间。user(用户态)时间之所以在多线程下更大,是因为它把所有线程在用户模式下花费的时间累加在了一起;sys(内核态)时间也是如此。

【讨论】:

+1,尽管我要补充一点,如果文件位于不同的磁盘上,尤其是众所周知的延迟和容易断开连接的网络磁盘,则可以实现更大的加速。另外,我想知道如果重新设计应用程序以池化线程并避免痛苦的持续创建/终止/销毁线程会产生什么影响?【参考方案2】:

首先我想澄清一点:

多线程并不一定意味着您的程序会运行得更快!

这可能意味着,但这取决于您的程序可以真正执行的并行度。

我在您的 MT 代码中看到您正在等待条件变量,这意味着那里可能存在争用点,并且这里没有真正的并行性。

当你的线程每个都可以做一大堆工作而不必等待其他线程时,实际上可以通过线程获得真正的加速。如果它们可以完全独立完成,您可能会得到一些加速。

其次,加速效果还取决于伪共享(false sharing)之类的因素。假设是多核 CPU:内存中有一个数组,线程 A 需要该数组中的元素 2,而同时运行的另一个线程 B 需要元素 4,每个元素的大小例如为 sizeof(int),那么您就会遇到缓存问题。假设高速缓存行是 64k,这两个元素就落在同一条缓存行里,于是任何一个线程每次写入数组时,另一个核上的缓存都需要更新。这就是伪共享,它可能会导致相当严重的减速。

还有其他原因导致您的 MT 程序无法比单线程程序加速。你应该问的最终问题是:

我可以让线程做不依赖于任何其他线程的工作吗?

从你让你的线程做的所有等待来看,似乎没有。

【讨论】:

我明白了。感谢您提供有用的答案。稍后我会验证最佳答案。

以上是关于lib pthread:单线程性能与多线程的主要内容,如果未能解决你的问题,请参考以下文章

单线程与多线程

Redis单线程与多线程模型

Redis单线程与多线程模型

单线程与多线程的区别

操作系统基础知识之————单线程(Thread)与多线程的区别

爬虫热身——性能相关