memcpy speed

David Gobbi dgobbi at irus.rri.on.ca
Sun Mar 26 13:06:15 EST 2000


Well, I decided to run a benchmark of memcpy() versus 
loops of the form

j = count;
while (--j >= 0) 
{
  *cp1++ = *cp2++;
}

and loops of the form

for (j = 0; j < count; j++)
{
  *cp1++ = *cp2++;
}


Depending on the architecture and the data type, you need to copy
at least 32 bytes or memcpy is much slower than copying
the data in a loop.  For small copies, looping is often more than
5 times faster than using memcpy!

Also, with gcc, the 'j = count; while (--j >= 0)' form of looping is
around 15% to 30% faster than 'for (j = 0; j < count; j++)' form
for copying less than 16 bytes.  Under IRIX CC, the two forms
give identical results.  

If you are doing anything more CPU-intensive than a copy inside
the loop, I doubt that there is a significant difference between
the two forms.


Here are the actual numbers: the processors should have been working
entirely in cache during these benchmarks.


/*
times below are given in 0.01 seconds per 100000 kbytes copied

note that count = size/sizeof(type)

Celeron 366 (overclocked to 450) RedHat 6.1
g++ -O2

size      char  short    int double memcpy

 1         140    n/a    n/a    n/a   1048
 2         105     80    n/a    n/a    524
 4          87     59     29    n/a    267
 8         108     47     26     14    291
 16         88     55     22     13    151
 32         79     45     26     11     83
 64         75     40     22     13     48
 128        72     37     20     11     30
 256        71     36     19     11     26
 512        71     37     19      9     15
 1024       70     35     18      9      9


again, using 'for (i = 0; i < count; i++)'
instead of 'j = count; while (--j >= 0)'

size      char  short    int double memcpy

 1         188    n/a    n/a    n/a   1099
 2         129     94    n/a    n/a    549
 4         100     64     40    n/a    274
 8         116     55     32     23    307
 16        134     64     24     19    153
 32        126     55     29     19     83
 64        121     51     24     26     48
 128        73     49     25     21     30
 256        71     48     18     20     26
 512        71     48     31     19     16
 1024       70     46     30     19     11
*/

/*
sparc Ultra 1 ???MHz solaris 7
gcc -O2

size      char  short    int double memcpy

 1         258    n/a    n/a    n/a   1208
 2         388    130    n/a    n/a    788
 4         352    264     65    n/a    550
 8         261    182     96     78    341
 16        200    123     86     92    262
 32        169     92     53     80    207
 64        160     90     52     74     80
 128       159     81     42     74     54
 256       156     80     41     73     38
 512       155     79     36     70     59
 1024      156     77     39     73     52


again, using 'for (i = 0; i < count; i++)'
instead of 'j = count; while (--j >= 0)'

size      char  short    int double memcpy

 1         306    n/a    n/a    n/a   1252
 2         472    167    n/a    n/a    814
 4         354    254     81    n/a    566
 8         235    168    121     89    340
 16        187    120     78     93    244
 32        188     96     50     81    205
 64        160     89     51     81     85
 128       166     81     48     75     52
 256       158     82     44     72     35
 512       159     80     42     72     58
 1024      160     77     40     74     55
*/

/* 
MIPS R1 225MHz IRIX6.5
CC -O2 -OPT:Olimit=2434

size      char  short    int double memcpy

 1          91    n/a    n/a    n/a   1184
 2          92    254    n/a    n/a    478
 4          91    319    119    n/a    308
 8          93     80    115     63    113
 16        109     68     36     63     91
 32         96     66     28     16     62
 64         93     57     29     18     34
 128        92     68     26     14     25
 256        91     69     25     13     19
 512        92     47     34     12     15
 1024       91     46     34     11     14

again, using 'for (i = 0; i < count; i++)'
instead of 'j = count; while (--j >= 0)'

 1          91    n/a    n/a    n/a   1184
 2          95    243    n/a    n/a    569
 4          91    321    126    n/a    319
 8          91     75    182     64    114
 16        110     65     39     66     90
 32         95     56     31     17     63
 64         93     55     27     16     36
 128        92     69     34     13     24
 256        92     47     25     12     19
 512        92     47     30     11     15
 1024       92     46     23     11     14
*/

/*
MIPS R1 250MHz IRIX6.5
gcc -O2

size      char  short    int double memcpy

 1         209    n/a    n/a    n/a   1183
 2         341    114    n/a    n/a    614
 4         194    171     52    n/a    342
 8         211    103     91     29    148
 16        154     83     51     46    108
 32        144     75     42     30     73
 64        140     72     38     22     38
 128       184     70     36     19     27
 256       138     70     35     18     20
 512       137     92     34     23     16
 1024      136     68     34     18     14

again, using 'for (i = 0; i < count; i++)'
instead of 'j = count; while (--j >= 0)'

 1        1594    n/a    n/a    n/a   1184
 2         797    797    n/a    n/a    616
 4         546    398    398    n/a    341
 8         342    273    199    199    148
 16        239    170    136    101    106
 32        227    119     92     69     72
 64        162     94     59     43     37
 128       150    103     47     34     28
 256       143     75     41     24     20
 512       140     71     49     20     16
 1024      184     70     36     19     15
*/


--
  David Gobbi, MSc                    dgobbi at irus.rri.on.ca
  Advanced Imaging Research Group
  Robarts Research Institute, University of Western Ontario
-------------- next part --------------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

// Benchmark kernel: copies `size` bytes from vp2 to vp1 one element of
// type T at a time, and repeats that copy 100000*1024/size times so that
// every call moves the same total number of bytes regardless of `size`.
// Both loops count down with a pre-decrement test, which is the loop
// form this benchmark is measuring.
template<class T> 
inline static void copymem(T *vp1, T *vp2, size_t size)
{
  // number of T-sized elements moved by one pass over the buffer
  const int elementCount = size/sizeof(T);

  for (int pass = 100000*1024/size; --pass >= 0; )
    {
    // restart at the beginning of both buffers for every pass
    T *dst = vp1;
    T *src = vp2;
    for (int remaining = elementCount; --remaining >= 0; )
      {
      *dst++ = *src++;
      }
    }
}

/*
 * benchin_report(SUFFIX): print the time elapsed between the clock()
 * samples c1 and c2 to stderr, in units of 0.01 seconds, followed by the
 * string literal SUFFIX.  The divisor is derived from CLOCKS_PER_SEC
 * instead of being hard-coded, so the units stay correct on platforms
 * where CLOCKS_PER_SEC is not 1000000.
 */
#define benchin_report(SUFFIX) \
  fprintf(stderr," %6ld" SUFFIX,(long)((c2-c1)/(CLOCKS_PER_SEC/100)))

/*
 * benchin_typed(TYPE,MEMSIZE): time copymem() when it copies MEMSIZE
 * bytes as elements of TYPE.  The first copymem() call is an untimed
 * warm-up; only the second call is bracketed by clock().  When MEMSIZE
 * is smaller than one TYPE element the column is printed as "n/a".
 */
#define benchin_typed(TYPE,MEMSIZE) \
do { \
  if ((size_t)(MEMSIZE) >= sizeof(TYPE)) \
    { \
    copymem((TYPE *)vp1,(TYPE *)vp2,(MEMSIZE)); \
    c1 = clock(); \
    copymem((TYPE *)vp1,(TYPE *)vp2,(MEMSIZE)); \
    c2 = clock(); \
    benchin_report(""); \
    } \
  else \
    { \
    fprintf(stderr,"    n/a"); \
    } \
} while (0)

/*
 * benchin(MEMSIZE): print one row of the results table to stderr: the
 * block size followed by the timings for the char, short, int and double
 * copy loops and for memcpy().
 *
 * The macro uses these variables from the scope where it is expanded:
 *   c1, c2   - receive the clock() samples
 *   j        - loop counter for the memcpy() runs
 *   vp1, vp2 - destination and source buffers, at least MEMSIZE bytes each
 *
 * The body is wrapped in do { } while (0) so that "benchin(n);" is a
 * single statement, and MEMSIZE is parenthesized wherever it is used in
 * an expression.
 */
#define benchin(MEMSIZE) \
do { \
  fprintf(stderr," %-6d",(int)(MEMSIZE)); \
  \
  benchin_typed(char,MEMSIZE); \
  benchin_typed(short,MEMSIZE); \
  benchin_typed(int,MEMSIZE); \
  benchin_typed(double,MEMSIZE); \
  \
  /* untimed warm-up run of the memcpy() loop */ \
  for (j = 0; j < 100000*1024/(MEMSIZE); j++) \
    { \
    memcpy(vp1,vp2,(MEMSIZE)); \
    } \
  c1 = clock(); \
  j = 100000*1024/(MEMSIZE); \
  /* pre-decrement: without it j never changes and the loop never ends */ \
  while (--j >= 0) \
    { \
    memcpy(vp1,vp2,(MEMSIZE)); \
    } \
  c2 = clock(); \
  \
  benchin_report("\n"); \
} while (0)

/*
 * Benchmark driver: allocates two 1024-byte buffers and runs benchin()
 * for every power-of-two block size from 1 to 1024 bytes, producing one
 * table row per size.  Returns 0 on success, 1 if the buffers cannot be
 * allocated.
 */
int main(int argc, char *argv[])
{
  long c1,c2;       /* clock() samples, written and read by benchin() */
  int j;            /* loop counter used by benchin() */
  void *vp1,*vp2;   /* copy destination and copy source */

  (void)argc;       /* the program takes no arguments */
  (void)argv;

  vp1 = malloc(1024);
  vp2 = malloc(1024);
  if (vp1 == NULL || vp2 == NULL)
    {
    fprintf(stderr,"unable to allocate the benchmark buffers\n");
    free(vp1);
    free(vp2);
    return 1;
    }

  /* give both buffers defined contents so that the copy loops never
     read uninitialized memory */
  memset(vp1,0,1024);
  memset(vp2,0,1024);

  /* the header goes to stderr, the same stream that benchin() writes
     the rows to, so that the table stays together when redirected */
  fprintf(stderr,"size      char  short    int double memcpy\n\n");

  benchin(1);
  benchin(2);
  benchin(4);
  benchin(8);
  benchin(16);
  benchin(32);
  benchin(64);
  benchin(128);
  benchin(256);
  benchin(512);
  benchin(1024);

  free(vp1);
  free(vp2);
  return 0;
}

/*
times below are given in 0.01 seconds per 100000 kbytes copied

Celeron 366 (overclocked to 450) RedHat 6.1
g++ -O2

size      char  short    int double memcpy

 1         140    n/a    n/a    n/a   1048
 2         105     80    n/a    n/a    524
 4          87     59     29    n/a    267
 8         108     47     26     14    291
 16         88     55     22     13    151
 32         79     45     26     11     83
 64         75     40     22     13     48
 128        72     37     20     11     30
 256        71     36     19     11     26
 512        71     37     19      9     15
 1024       70     35     18      9      9


again, using 'for (i = 0; i < count; i++)'
instead of 'j = count; while (--j >= 0)'

size      char  short    int double memcpy

 1         188    n/a    n/a    n/a   1099
 2         129     94    n/a    n/a    549
 4         100     64     40    n/a    274
 8         116     55     32     23    307
 16        134     64     24     19    153
 32        126     55     29     19     83
 64        121     51     24     26     48
 128        73     49     25     21     30
 256        71     48     18     20     26
 512        71     48     31     19     16
 1024       70     46     30     19     11
*/

/*
sparc Ultra 1 ???MHz solaris 7
gcc -O2

size      char  short    int double memcpy

 1         258    n/a    n/a    n/a   1208
 2         388    130    n/a    n/a    788
 4         352    264     65    n/a    550
 8         261    182     96     78    341
 16        200    123     86     92    262
 32        169     92     53     80    207
 64        160     90     52     74     80
 128       159     81     42     74     54
 256       156     80     41     73     38
 512       155     79     36     70     59
 1024      156     77     39     73     52


again, using 'for (i = 0; i < count; i++)'
instead of 'j = count; while (--j >= 0)'

size      char  short    int double memcpy

 1         306    n/a    n/a    n/a   1252
 2         472    167    n/a    n/a    814
 4         354    254     81    n/a    566
 8         235    168    121     89    340
 16        187    120     78     93    244
 32        188     96     50     81    205
 64        160     89     51     81     85
 128       166     81     48     75     52
 256       158     82     44     72     35
 512       159     80     42     72     58
 1024      160     77     40     74     55
*/

/* 
MIPS R1 225MHz IRIX6.5
CC -O2 -OPT:Olimit=2434

size      char  short    int double memcpy

 1          91    n/a    n/a    n/a   1184
 2          92    254    n/a    n/a    478
 4          91    319    119    n/a    308
 8          93     80    115     63    113
 16        109     68     36     63     91
 32         96     66     28     16     62
 64         93     57     29     18     34
 128        92     68     26     14     25
 256        91     69     25     13     19
 512        92     47     34     12     15
 1024       91     46     34     11     14

again, using 'for (i = 0; i < count; i++)'
instead of 'j = count; while (--j >= 0)'

 1          91    n/a    n/a    n/a   1184
 2          95    243    n/a    n/a    569
 4          91    321    126    n/a    319
 8          91     75    182     64    114
 16        110     65     39     66     90
 32         95     56     31     17     63
 64         93     55     27     16     36
 128        92     69     34     13     24
 256        92     47     25     12     19
 512        92     47     30     11     15
 1024       92     46     23     11     14
*/

/*
MIPS R1 250MHz IRIX6.5
gcc -O2

size      char  short    int double memcpy

 1         209    n/a    n/a    n/a   1183
 2         341    114    n/a    n/a    614
 4         194    171     52    n/a    342
 8         211    103     91     29    148
 16        154     83     51     46    108
 32        144     75     42     30     73
 64        140     72     38     22     38
 128       184     70     36     19     27
 256       138     70     35     18     20
 512       137     92     34     23     16
 1024      136     68     34     18     14

again, using 'for (i = 0; i < count; i++)'
instead of 'j = count; while (--j >= 0)'

 1        1594    n/a    n/a    n/a   1184
 2         797    797    n/a    n/a    616
 4         546    398    398    n/a    341
 8         342    273    199    199    148
 16        239    170    136    101    106
 32        227    119     92     69     72
 64        162     94     59     43     37
 128       150    103     47     34     28
 256       143     75     41     24     20
 512       140     71     49     20     16
 1024      184     70     36     19     15
*/


More information about the vtk-developers mailing list