memcpy speed
David Gobbi
dgobbi at irus.rri.on.ca
Sun Mar 26 13:06:15 EST 2000
Well, I decided to run a benchmark of memcpy() versus
loops of the form
j = count;
while (--j >= 0)
{
*cp1++ = *cp2++;
}
and loops of the form
for (j = 0; j < count; j++)
{
*cp1++ = *cp2++;
}
Depending on the architecture and the data type, you need to copy
at least 32 bytes or memcpy is much slower than copying
the data in a loop. There is often a factor of >5 improvement
in looping over using memcpy!
Also, with gcc, the 'j = count; while (--j >= 0)' form of looping is
around 15% to 30% faster than 'for (j = 0; j < count; j++)' form
for copying less than 16 bytes. Under IRIX CC, the two forms
give identical results.
If you are doing anything more CPU-intensive than a copy inside
the loop, I doubt that there is a significant difference between
the two forms.
Here are the actual numbers: the processors should have been working
entirely in cache during these benchmarks.
/*
times below are given in 0.01 seconds per 100000 kbytes
note that count = size/sizeof(type)
Celeron 366 (overclocked to 450) RedHat 6.1
g++ -O2
size char short int double memcpy
1 140 n/a n/a n/a 1048
2 105 80 n/a n/a 524
4 87 59 29 n/a 267
8 108 47 26 14 291
16 88 55 22 13 151
32 79 45 26 11 83
64 75 40 22 13 48
128 72 37 20 11 30
256 71 36 19 11 26
512 71 37 19 9 15
1024 70 35 18 9 9
again, using 'for (i = 0; i < count; i++)'
instead of 'j = count; while (--j >= 0)'
size char short int double memcpy
1 188 n/a n/a n/a 1099
2 129 94 n/a n/a 549
4 100 64 40 n/a 274
8 116 55 32 23 307
16 134 64 24 19 153
32 126 55 29 19 83
64 121 51 24 26 48
128 73 49 25 21 30
256 71 48 18 20 26
512 71 48 31 19 16
1024 70 46 30 19 11
*/
/*
sparc Ultra 1 ???MHz solaris 7
gcc -O2
size char short int double memcpy
1 258 n/a n/a n/a 1208
2 388 130 n/a n/a 788
4 352 264 65 n/a 550
8 261 182 96 78 341
16 200 123 86 92 262
32 169 92 53 80 207
64 160 90 52 74 80
128 159 81 42 74 54
256 156 80 41 73 38
512 155 79 36 70 59
1024 156 77 39 73 52
again, using 'for (i = 0; i < count; i++)'
instead of 'j = count; while (--j >= 0)'
size char short int double memcpy
1 306 n/a n/a n/a 1252
2 472 167 n/a n/a 814
4 354 254 81 n/a 566
8 235 168 121 89 340
16 187 120 78 93 244
32 188 96 50 81 205
64 160 89 51 81 85
128 166 81 48 75 52
256 158 82 44 72 35
512 159 80 42 72 58
1024 160 77 40 74 55
*/
/*
MIPS R1 225MHz IRIX6.5
CC -O2 -OPT:Olimit=2434
size char short int double memcpy
1 91 n/a n/a n/a 1184
2 92 254 n/a n/a 478
4 91 319 119 n/a 308
8 93 80 115 63 113
16 109 68 36 63 91
32 96 66 28 16 62
64 93 57 29 18 34
128 92 68 26 14 25
256 91 69 25 13 19
512 92 47 34 12 15
1024 91 46 34 11 14
again, using 'for (i = 0; i < count; i++)'
instead of 'j = count; while (--j >= 0)'
1 91 n/a n/a n/a 1184
2 95 243 n/a n/a 569
4 91 321 126 n/a 319
8 91 75 182 64 114
16 110 65 39 66 90
32 95 56 31 17 63
64 93 55 27 16 36
128 92 69 34 13 24
256 92 47 25 12 19
512 92 47 30 11 15
1024 92 46 23 11 14
*/
/*
MIPS R1 250MHz IRIX6.5
gcc -O2
size char short int double memcpy
1 209 n/a n/a n/a 1183
2 341 114 n/a n/a 614
4 194 171 52 n/a 342
8 211 103 91 29 148
16 154 83 51 46 108
32 144 75 42 30 73
64 140 72 38 22 38
128 184 70 36 19 27
256 138 70 35 18 20
512 137 92 34 23 16
1024 136 68 34 18 14
again, using 'for (i = 0; i < count; i++)'
instead of 'j = count; while (--j >= 0)'
1 1594 n/a n/a n/a 1184
2 797 797 n/a n/a 616
4 546 398 398 n/a 341
8 342 273 199 199 148
16 239 170 136 101 106
32 227 119 92 69 72
64 162 94 59 43 37
128 150 103 47 34 28
256 143 75 41 24 20
512 140 71 49 20 16
1024 184 70 36 19 15
*/
--
David Gobbi, MSc dgobbi at irus.rri.on.ca
Advanced Imaging Research Group
Robarts Research Institute, University of Western Ontario
-------------- next part --------------
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
// Benchmark kernel: repeatedly copies `size` bytes from vp2 to vp1 using
// element-by-element assignment of type T.
//
//   vp1  - destination buffer (written)
//   vp2  - source buffer (read)
//   size - number of bytes copied per pass; also used as a divisor, so it
//          must be nonzero
//
// The pass count is 100000*1024/size, so each call moves roughly
// 100000 kbytes in total regardless of `size`.
template<class T>
inline static void copymem(T *vp1, T *vp2, size_t size)
{
int i;
// number of passes over the buffer
int j = 100000*1024/size;
while (--j >= 0)
{
// restart at the beginning of both buffers on every pass
T *sp1 = (T *)vp1;
T *sp2 = (T *)vp2;
// element count for this pass (integer division truncates)
i = size/sizeof(T);
// count-down loop form: this is the copy loop being timed
while (--i >= 0)
{
*sp1++ = *sp2++;
}
}
}
/*
 * benchin(MEMSIZE): print one row of the benchmark table to stderr.
 *
 * For a block of MEMSIZE bytes, times copymem<char>, copymem<short>,
 * copymem<int> and copymem<double>, and finally a memcpy() loop.  Each
 * variant is run once untimed as a warm-up and then once between two
 * clock() calls.  Element types larger than MEMSIZE print "n/a".
 *
 * Expects these names in the expanding scope: long c1, c2; int j;
 * void *vp1, *vp2 (each pointing at a buffer of at least MEMSIZE bytes).
 *
 * The reported value is (c2-c1)/10000; that equals hundredths of a second
 * only where CLOCKS_PER_SEC is 1000000.
 *
 * Fix: the timed memcpy loop previously read `while (j >= 0)` without ever
 * decrementing j, so it never terminated.  It now counts down with
 * `while (--j >= 0)`, the same form copymem uses.
 */
#define benchin(MEMSIZE) \
do { \
  fprintf(stderr," %-6d",MEMSIZE); \
  \
  /* char: untimed warm-up pass, then the timed pass */ \
  copymem((char *)vp1,(char *)vp2,MEMSIZE); \
  c1 = clock(); \
  copymem((char *)vp1,(char *)vp2,MEMSIZE); \
  c2 = clock(); \
  fprintf(stderr," %6ld",(c2-c1)/10000); \
  \
  if ((MEMSIZE) >= sizeof(short)) \
    { \
    copymem((short *)vp1,(short *)vp2,MEMSIZE); \
    c1 = clock(); \
    copymem((short *)vp1,(short *)vp2,MEMSIZE); \
    c2 = clock(); \
    fprintf(stderr," %6ld",(c2-c1)/10000); \
    } \
  else \
    { \
    fprintf(stderr," n/a"); \
    } \
  \
  if ((MEMSIZE) >= sizeof(int)) \
    { \
    copymem((int *)vp1,(int *)vp2,MEMSIZE); \
    c1 = clock(); \
    copymem((int *)vp1,(int *)vp2,MEMSIZE); \
    c2 = clock(); \
    fprintf(stderr," %6ld",(c2-c1)/10000); \
    } \
  else \
    { \
    fprintf(stderr," n/a"); \
    } \
  \
  if ((MEMSIZE) >= sizeof(double)) \
    { \
    copymem((double *)vp1,(double *)vp2,MEMSIZE); \
    c1 = clock(); \
    copymem((double *)vp1,(double *)vp2,MEMSIZE); \
    c2 = clock(); \
    fprintf(stderr," %6ld",(c2-c1)/10000); \
    } \
  else \
    { \
    fprintf(stderr," n/a"); \
    } \
  \
  /* memcpy: untimed warm-up pass */ \
  for (j = 0; j < 100000*1024/(MEMSIZE); j++) \
    { \
    memcpy(vp1,vp2,MEMSIZE); \
    } \
  /* memcpy: timed pass; j must be decremented or the loop never ends */ \
  c1 = clock(); \
  j = 100000*1024/(MEMSIZE); \
  while (--j >= 0) \
    { \
    memcpy(vp1,vp2,MEMSIZE); \
    } \
  c2 = clock(); \
  \
  fprintf(stderr," %6ld\n",(c2-c1)/10000); \
} while (0)
/*
 * Benchmark driver: allocates two 1024-byte buffers and prints one table
 * row per block size from 1 to 1024 bytes via the benchin() macro.
 *
 * The locals c1, c2, j, vp1 and vp2 are referenced by name from inside
 * benchin(), so their names and types must stay as they are.
 *
 * Returns 0 on success, EXIT_FAILURE if a buffer cannot be allocated.
 */
int main(int argc, char *argv[])
{
  long c1,c2;
  int j;
  void *vp1,*vp2;

  (void)argc;
  (void)argv;

  /* calloc rather than malloc: the source buffer is read as short/int/double,
     so it must hold initialized values. */
  vp1 = calloc(1024, 1);
  vp2 = calloc(1024, 1);
  if (vp1 == NULL || vp2 == NULL)
    {
    fprintf(stderr,"failed to allocate benchmark buffers\n");
    free(vp1);
    free(vp2);
    return EXIT_FAILURE;
    }

  /* The header goes to stderr, the same stream benchin() writes the rows to,
     so the table stays together when output is redirected. */
  fprintf(stderr,"size char short int double memcpy\n\n");

  benchin(1);
  benchin(2);
  benchin(4);
  benchin(8);
  benchin(16);
  benchin(32);
  benchin(64);
  benchin(128);
  benchin(256);
  benchin(512);
  benchin(1024);

  free(vp1);
  free(vp2);
  return 0;
}
/*
times below are given in 0.01 seconds per 100000 kbytes copied
Celeron 366 (overclocked to 450) RedHat 6.1
g++ -O2
size char short int double memcpy
1 140 n/a n/a n/a 1048
2 105 80 n/a n/a 524
4 87 59 29 n/a 267
8 108 47 26 14 291
16 88 55 22 13 151
32 79 45 26 11 83
64 75 40 22 13 48
128 72 37 20 11 30
256 71 36 19 11 26
512 71 37 19 9 15
1024 70 35 18 9 9
again, using 'for (i = 0; i < count; i++)'
instead of 'j = count; while (--j >= 0)'
size char short int double memcpy
1 188 n/a n/a n/a 1099
2 129 94 n/a n/a 549
4 100 64 40 n/a 274
8 116 55 32 23 307
16 134 64 24 19 153
32 126 55 29 19 83
64 121 51 24 26 48
128 73 49 25 21 30
256 71 48 18 20 26
512 71 48 31 19 16
1024 70 46 30 19 11
*/
/*
sparc Ultra 1 ???MHz solaris 7
gcc -O2
size char short int double memcpy
1 258 n/a n/a n/a 1208
2 388 130 n/a n/a 788
4 352 264 65 n/a 550
8 261 182 96 78 341
16 200 123 86 92 262
32 169 92 53 80 207
64 160 90 52 74 80
128 159 81 42 74 54
256 156 80 41 73 38
512 155 79 36 70 59
1024 156 77 39 73 52
again, using 'for (i = 0; i < count; i++)'
instead of 'j = count; while (--j >= 0)'
size char short int double memcpy
1 306 n/a n/a n/a 1252
2 472 167 n/a n/a 814
4 354 254 81 n/a 566
8 235 168 121 89 340
16 187 120 78 93 244
32 188 96 50 81 205
64 160 89 51 81 85
128 166 81 48 75 52
256 158 82 44 72 35
512 159 80 42 72 58
1024 160 77 40 74 55
*/
/*
MIPS R1 225MHz IRIX6.5
CC -O2 -OPT:Olimit=2434
size char short int double memcpy
1 91 n/a n/a n/a 1184
2 92 254 n/a n/a 478
4 91 319 119 n/a 308
8 93 80 115 63 113
16 109 68 36 63 91
32 96 66 28 16 62
64 93 57 29 18 34
128 92 68 26 14 25
256 91 69 25 13 19
512 92 47 34 12 15
1024 91 46 34 11 14
again, using 'for (i = 0; i < count; i++)'
instead of 'j = count; while (--j >= 0)'
1 91 n/a n/a n/a 1184
2 95 243 n/a n/a 569
4 91 321 126 n/a 319
8 91 75 182 64 114
16 110 65 39 66 90
32 95 56 31 17 63
64 93 55 27 16 36
128 92 69 34 13 24
256 92 47 25 12 19
512 92 47 30 11 15
1024 92 46 23 11 14
*/
/*
MIPS R1 250MHz IRIX6.5
gcc -O2
size char short int double memcpy
1 209 n/a n/a n/a 1183
2 341 114 n/a n/a 614
4 194 171 52 n/a 342
8 211 103 91 29 148
16 154 83 51 46 108
32 144 75 42 30 73
64 140 72 38 22 38
128 184 70 36 19 27
256 138 70 35 18 20
512 137 92 34 23 16
1024 136 68 34 18 14
again, using 'for (i = 0; i < count; i++)'
instead of 'j = count; while (--j >= 0)'
1 1594 n/a n/a n/a 1184
2 797 797 n/a n/a 616
4 546 398 398 n/a 341
8 342 273 199 199 148
16 239 170 136 101 106
32 227 119 92 69 72
64 162 94 59 43 37
128 150 103 47 34 28
256 143 75 41 24 20
512 140 71 49 20 16
1024 184 70 36 19 15
*/
More information about the vtk-developers
mailing list