10
#include "lib/memcpy.c"
15
# include <cuda_runtime.h>
23
/* Size in bytes of each benchmark buffer: 64 MiB, typed as long. */
#define SIZE (64L * 1024L * 1024L)
27
/*
 * Copy `size` bytes from `src` to `dst` with the compiler's built-in memcpy.
 *
 * dst:  destination buffer, at least `size` bytes.
 * src:  source buffer, at least `size` bytes; must not overlap `dst`.
 * size: number of bytes to copy.
 */
void fast_memcpy_gcc(void *dst, void *src, size_t size) {
    __builtin_memcpy(dst, src, size);
}
31
/*
 * Copy `size` bytes from `src` to `dst` with the C library's memcpy.
 *
 * dst:  destination buffer, at least `size` bytes.
 * src:  source buffer, at least `size` bytes; must not overlap `dst`.
 * size: number of bytes to copy.
 */
void fast_memcpy_glibc(void *dst, void *src, size_t size) {
    memcpy(dst, src, size);
}
35
/*
 * Copy `size` bytes from `src` to `dst` with bcopy().
 *
 * dst:  destination buffer, at least `size` bytes.
 * src:  source buffer, at least `size` bytes.
 * size: number of bytes to copy.
 */
void fast_memcpy_bcopy(void *dst, void *src, size_t size) {
    /* bcopy takes (source, destination, length) -- the opposite pointer order
     * from memcpy -- so the arguments are swapped relative to our signature. */
    bcopy(src, dst, size);
}
39
/*
 * Copy `size` bytes from `src` to `dst` by delegating to fast_memcpy(), which
 * this file pulls in from lib/memcpy.c.
 *
 * dst:  destination buffer.
 * src:  source buffer.
 * size: number of bytes to copy.
 */
void fast_memcpy_vik(void *dst, void *src, size_t size) {
    fast_memcpy(dst, src, size);
}
43
/*
 * Copy variant based on the _mm_stream_load_si128 intrinsic.
 *
 * Visible here: dst_end is set to dst + size (arithmetic on void * relies on
 * the GNU C extension), one 16-byte value is loaded from src with a streaming
 * load and written to dst with an ordinary 128-bit store.
 *
 * NOTE(review): the loop that advances dst/src up to dst_end is not visible in
 * this excerpt -- confirm how a size that is not a multiple of 16 is handled.
 */
void fast_memcpy_intrinsic(void *dst, void *src, size_t size) {
44
char *dst_end=dst+size;
46
/* 128-bit streaming load from the source buffer. */
__m128i res = _mm_stream_load_si128((__m128i *)src);
47
/* Plain 128-bit store into the destination buffer. */
*((__m128i *)dst)=res;
53
#ifndef __INTEL_COMPILER
54
/*
 * Copy variant implemented with inline assembly (going by its name, a
 * `rep movs` string copy).
 *
 * NOTE(review): the instruction template and the output operand list are not
 * visible in this excerpt; only the input operands and the clobber list are.
 */
void fast_memcpy_repmovs(void *dst, void *src, size_t size) {
55
__asm__ __volatile__ (
63
/* Commented-out instruction; it is not part of the assembled template. */
// "shr $2, %%rcx \n\t"
67
/* Input operands: both pointers and the byte count. */
: "p" (dst), "p" (src), "r" (size)
68
/* NOTE(review): the clobbers name the 32-bit esi/edi/ecx registers while the
 * disabled line above refers to rcx -- confirm the intended register width,
 * and whether a "memory" clobber is needed for the copied data. */
: "%esi", "%edi", "%ecx"
73
/*
 * Copy variant written as inline SSE assembly: movdqa loads, movntps stores.
 *
 * The visible instruction group handles 128 bytes: it issues prefetchnta for
 * offsets 128..224 past the current position on the %0 side, loads eight
 * 16-byte values from (%0,%rax) into xmm0-xmm7, stores them to (%1,%rax), and
 * then advances rax by 128.
 *
 * NOTE(review): the loop label/branch, the output operand list and the clobber
 * list are not visible in this excerpt. If no outputs are declared, %0 is
 * `dst` and %1 is `src`, which would make the data flow from `dst` to `src`
 * -- confirm the operand numbering against the parameter names. Also confirm
 * how size % 128 trailing bytes are handled.
 */
void fast_memcpy_dqa_ntps(void *dst, void *src, size_t size) {
74
/* Number of whole 128-byte chunks in the request. */
size_t sse_size = (size / 128);
76
__asm__ __volatile__ (
81
/* Prefetch the next 128 bytes on the load side, 32 bytes apart. */
"prefetchnta 128(%0,%%rax) \n\t"
82
"prefetchnta 160(%0,%%rax) \n\t"
83
"prefetchnta 192(%0,%%rax) \n\t"
84
"prefetchnta 224(%0,%%rax) \n\t"
87
/* Load 8 x 16 bytes from the %0 buffer at offset rax. */
"movdqa (%0,%%rax), %%xmm0 \n\t"
88
"movdqa 16(%0,%%rax), %%xmm1 \n\t"
89
"movdqa 32(%0,%%rax), %%xmm2 \n\t"
90
"movdqa 48(%0,%%rax), %%xmm3 \n\t"
91
"movdqa 64(%0,%%rax), %%xmm4 \n\t"
92
"movdqa 80(%0,%%rax), %%xmm5 \n\t"
93
"movdqa 96(%0,%%rax), %%xmm6 \n\t"
94
"movdqa 112(%0,%%rax), %%xmm7 \n\t"
96
/* Store the same 8 x 16 bytes to the %1 buffer at offset rax. */
"movntps %%xmm0, (%1,%%rax) \n\t"
97
"movntps %%xmm1, 16(%1,%%rax) \n\t"
98
"movntps %%xmm2, 32(%1,%%rax) \n\t"
99
"movntps %%xmm3, 48(%1,%%rax) \n\t"
100
"movntps %%xmm4, 64(%1,%%rax) \n\t"
101
"movntps %%xmm5, 80(%1,%%rax) \n\t"
102
"movntps %%xmm6, 96(%1,%%rax) \n\t"
103
"movntps %%xmm7, 112(%1,%%rax) \n\t"
105
/* Advance the byte offset to the next 128-byte chunk. */
"add $128, %%rax \n\t"
112
/* Input operands: both pointers and the chunk count. */
: "p" (dst), "p" (src), "r" (sse_size)
117
/*
 * Copy variant written as inline SSE assembly: movntdqa loads, movntps stores.
 *
 * The visible instruction group handles 128 bytes: it issues prefetchnta for
 * offsets 128..224 past the current position on the %0 side, loads eight
 * 16-byte values from (%0,%rax) into xmm0-xmm7, stores them to (%1,%rax), and
 * then advances rax by 128.
 *
 * NOTE(review): the loop label/branch, the output operand list and the clobber
 * list are not visible in this excerpt. If no outputs are declared, %0 is
 * `dst` and %1 is `src`, which would make the data flow from `dst` to `src`
 * -- confirm the operand numbering against the parameter names. Also confirm
 * how size % 128 trailing bytes are handled.
 */
void fast_memcpy_ntdqa_ntps(void *dst, void *src, size_t size) {
118
/* Number of whole 128-byte chunks in the request. */
size_t sse_size = (size / 128);
120
__asm__ __volatile__ (
125
/* Prefetch the next 128 bytes on the load side, 32 bytes apart. */
"prefetchnta 128(%0,%%rax) \n\t"
126
"prefetchnta 160(%0,%%rax) \n\t"
127
"prefetchnta 192(%0,%%rax) \n\t"
128
"prefetchnta 224(%0,%%rax) \n\t"
131
/* Load 8 x 16 bytes from the %0 buffer at offset rax. */
"movntdqa (%0,%%rax), %%xmm0 \n\t"
132
"movntdqa 16(%0,%%rax), %%xmm1 \n\t"
133
"movntdqa 32(%0,%%rax), %%xmm2 \n\t"
134
"movntdqa 48(%0,%%rax), %%xmm3 \n\t"
135
"movntdqa 64(%0,%%rax), %%xmm4 \n\t"
136
"movntdqa 80(%0,%%rax), %%xmm5 \n\t"
137
"movntdqa 96(%0,%%rax), %%xmm6 \n\t"
138
"movntdqa 112(%0,%%rax), %%xmm7 \n\t"
140
/* Store the same 8 x 16 bytes to the %1 buffer at offset rax. */
"movntps %%xmm0, (%1,%%rax) \n\t"
141
"movntps %%xmm1, 16(%1,%%rax) \n\t"
142
"movntps %%xmm2, 32(%1,%%rax) \n\t"
143
"movntps %%xmm3, 48(%1,%%rax) \n\t"
144
"movntps %%xmm4, 64(%1,%%rax) \n\t"
145
"movntps %%xmm5, 80(%1,%%rax) \n\t"
146
"movntps %%xmm6, 96(%1,%%rax) \n\t"
147
"movntps %%xmm7, 112(%1,%%rax) \n\t"
149
/* Advance the byte offset to the next 128-byte chunk. */
"add $128, %%rax \n\t"
156
/* Input operands: both pointers and the chunk count. */
: "p" (dst), "p" (src), "r" (sse_size)
162
/*
 * Copy variant written as inline SSE assembly: movntdqa loads, movntdq stores.
 *
 * The visible instruction group handles 128 bytes: it issues prefetchnta for
 * offsets 128..224 past the current position on the %0 side, loads eight
 * 16-byte values from (%0,%rax) into xmm0-xmm7, stores them to (%1,%rax), and
 * then advances rax by 128.
 *
 * NOTE(review): the loop label/branch, the output operand list and the clobber
 * list are not visible in this excerpt. If no outputs are declared, %0 is
 * `dst` and %1 is `src`, which would make the data flow from `dst` to `src`
 * -- confirm the operand numbering against the parameter names. Also confirm
 * how size % 128 trailing bytes are handled.
 */
void fast_memcpy_ntdqa_ntdq(void *dst, void *src, size_t size) {
163
/* Number of whole 128-byte chunks in the request. */
size_t sse_size = (size / 128);
165
__asm__ __volatile__ (
170
/* Prefetch the next 128 bytes on the load side, 32 bytes apart. */
"prefetchnta 128(%0,%%rax) \n\t"
171
"prefetchnta 160(%0,%%rax) \n\t"
172
"prefetchnta 192(%0,%%rax) \n\t"
173
"prefetchnta 224(%0,%%rax) \n\t"
176
/* Load 8 x 16 bytes from the %0 buffer at offset rax. */
"movntdqa (%0,%%rax), %%xmm0 \n\t"
177
"movntdqa 16(%0,%%rax), %%xmm1 \n\t"
178
"movntdqa 32(%0,%%rax), %%xmm2 \n\t"
179
"movntdqa 48(%0,%%rax), %%xmm3 \n\t"
180
"movntdqa 64(%0,%%rax), %%xmm4 \n\t"
181
"movntdqa 80(%0,%%rax), %%xmm5 \n\t"
182
"movntdqa 96(%0,%%rax), %%xmm6 \n\t"
183
"movntdqa 112(%0,%%rax), %%xmm7 \n\t"
185
/* Store the same 8 x 16 bytes to the %1 buffer at offset rax. */
"movntdq %%xmm0, (%1,%%rax) \n\t"
186
"movntdq %%xmm1, 16(%1,%%rax) \n\t"
187
"movntdq %%xmm2, 32(%1,%%rax) \n\t"
188
"movntdq %%xmm3, 48(%1,%%rax) \n\t"
189
"movntdq %%xmm4, 64(%1,%%rax) \n\t"
190
"movntdq %%xmm5, 80(%1,%%rax) \n\t"
191
"movntdq %%xmm6, 96(%1,%%rax) \n\t"
192
"movntdq %%xmm7, 112(%1,%%rax) \n\t"
194
/* Advance the byte offset to the next 128-byte chunk. */
"add $128, %%rax \n\t"
201
/* Input operands: both pointers and the chunk count. */
: "p" (dst), "p" (src), "r" (sse_size)
207
/*
 * Copy variant written as inline SSE assembly: movaps for both loads and
 * stores.
 *
 * The visible instruction group handles 128 bytes: it issues prefetchnta for
 * offsets 128..224 past the current position on the %0 side, loads eight
 * 16-byte values from (%0,%rax) into xmm0-xmm7, stores them to (%1,%rax), and
 * then advances rax by 128.
 *
 * NOTE(review): the loop label/branch, the output operand list and the clobber
 * list are not visible in this excerpt. If no outputs are declared, %0 is
 * `dst` and %1 is `src`, which would make the data flow from `dst` to `src`
 * -- confirm the operand numbering against the parameter names. Also confirm
 * how size % 128 trailing bytes are handled.
 */
void fast_memcpy_aps(void *dst, void *src, size_t size) {
208
/* Number of whole 128-byte chunks in the request. */
size_t sse_size = (size / 128);
210
__asm__ __volatile__ (
215
/* Prefetch the next 128 bytes on the load side, 32 bytes apart. */
"prefetchnta 128(%0,%%rax) \n\t"
216
"prefetchnta 160(%0,%%rax) \n\t"
217
"prefetchnta 192(%0,%%rax) \n\t"
218
"prefetchnta 224(%0,%%rax) \n\t"
221
/* Load 8 x 16 bytes from the %0 buffer at offset rax. */
"movaps (%0,%%rax), %%xmm0 \n\t"
222
"movaps 16(%0,%%rax), %%xmm1 \n\t"
223
"movaps 32(%0,%%rax), %%xmm2 \n\t"
224
"movaps 48(%0,%%rax), %%xmm3 \n\t"
225
"movaps 64(%0,%%rax), %%xmm4 \n\t"
226
"movaps 80(%0,%%rax), %%xmm5 \n\t"
227
"movaps 96(%0,%%rax), %%xmm6 \n\t"
228
"movaps 112(%0,%%rax), %%xmm7 \n\t"
230
/* Store the same 8 x 16 bytes to the %1 buffer at offset rax. */
"movaps %%xmm0, (%1,%%rax) \n\t"
231
"movaps %%xmm1, 16(%1,%%rax) \n\t"
232
"movaps %%xmm2, 32(%1,%%rax) \n\t"
233
"movaps %%xmm3, 48(%1,%%rax) \n\t"
234
"movaps %%xmm4, 64(%1,%%rax) \n\t"
235
"movaps %%xmm5, 80(%1,%%rax) \n\t"
236
"movaps %%xmm6, 96(%1,%%rax) \n\t"
237
"movaps %%xmm7, 112(%1,%%rax) \n\t"
239
/* Advance the byte offset to the next 128-byte chunk. */
"add $128, %%rax \n\t"
246
/* Input operands: both pointers and the chunk count. */
: "p" (dst), "p" (src), "r" (sse_size)
251
/*
 * Copy variant written as inline AVX assembly: vmovdqa loads, vmovntps stores.
 *
 * The visible instruction group handles 512 bytes: it loads sixteen 32-byte
 * values from (%0,%rax) into ymm0-ymm15, stores them to (%1,%rax), and then
 * advances rax by 512. The prefetchnta lines are commented out in this
 * variant.
 *
 * NOTE(review): the loop label/branch, the output operand list and the clobber
 * list are not visible in this excerpt. If no outputs are declared, %0 is
 * `dst` and %1 is `src`, which would make the data flow from `dst` to `src`
 * -- confirm the operand numbering against the parameter names. Also confirm
 * how size % 512 trailing bytes are handled.
 */
void fast_memcpy_avx(void *dst, void *src, size_t size) {
252
/* Number of whole 512-byte chunks (the name is kept from the SSE variants). */
size_t sse_size = (size / 512);
254
__asm__ __volatile__ (
259
/* "prefetchnta 128(%0,%%rax) \n\t"
260
"prefetchnta 160(%0,%%rax) \n\t"
261
"prefetchnta 192(%0,%%rax) \n\t"
262
"prefetchnta 224(%0,%%rax) \n\t"*/
265
/* Load 16 x 32 bytes from the %0 buffer at offset rax. */
"vmovdqa (%0,%%rax), %%ymm0 \n\t"
266
"vmovdqa 32(%0,%%rax), %%ymm1 \n\t"
267
"vmovdqa 64(%0,%%rax), %%ymm2 \n\t"
268
"vmovdqa 96(%0,%%rax), %%ymm3 \n\t"
269
"vmovdqa 128(%0,%%rax), %%ymm4 \n\t"
270
"vmovdqa 160(%0,%%rax), %%ymm5 \n\t"
271
"vmovdqa 192(%0,%%rax), %%ymm6 \n\t"
272
"vmovdqa 224(%0,%%rax), %%ymm7 \n\t"
274
"vmovdqa 256(%0,%%rax), %%ymm8 \n\t"
275
"vmovdqa 288(%0,%%rax), %%ymm9 \n\t"
276
"vmovdqa 320(%0,%%rax), %%ymm10 \n\t"
277
"vmovdqa 352(%0,%%rax), %%ymm11 \n\t"
278
"vmovdqa 384(%0,%%rax), %%ymm12 \n\t"
279
"vmovdqa 416(%0,%%rax), %%ymm13 \n\t"
280
"vmovdqa 448(%0,%%rax), %%ymm14 \n\t"
281
"vmovdqa 480(%0,%%rax), %%ymm15 \n\t"
283
/* Store the same 16 x 32 bytes to the %1 buffer at offset rax. */
"vmovntps %%ymm0, (%1,%%rax) \n\t"
284
"vmovntps %%ymm1, 32(%1,%%rax) \n\t"
285
"vmovntps %%ymm2, 64(%1,%%rax) \n\t"
286
"vmovntps %%ymm3, 96(%1,%%rax) \n\t"
287
"vmovntps %%ymm4, 128(%1,%%rax) \n\t"
288
"vmovntps %%ymm5, 160(%1,%%rax) \n\t"
289
"vmovntps %%ymm6, 192(%1,%%rax) \n\t"
290
"vmovntps %%ymm7, 224(%1,%%rax) \n\t"
292
"vmovntps %%ymm8, 256(%1,%%rax) \n\t"
293
"vmovntps %%ymm9, 288(%1,%%rax) \n\t"
294
"vmovntps %%ymm10, 320(%1,%%rax) \n\t"
295
"vmovntps %%ymm11, 352(%1,%%rax) \n\t"
296
"vmovntps %%ymm12, 384(%1,%%rax) \n\t"
297
"vmovntps %%ymm13, 416(%1,%%rax) \n\t"
298
"vmovntps %%ymm14, 448(%1,%%rax) \n\t"
299
"vmovntps %%ymm15, 480(%1,%%rax) \n\t"
301
/* Advance the byte offset to the next 512-byte chunk. */
"add $512, %%rax \n\t"
308
/* Input operands: both pointers and the chunk count. */
: "p" (dst), "p" (src), "r" (sse_size)
317
/* Allocator callback: stores the address of a new `size`-byte buffer in *data. */
typedef void (*malloc_function)(void **data, size_t size);
318
/* Copy callback with the same parameter order as the fast_memcpy_* routines. */
typedef void (*memcpy_function)(void *dst, void *src, size_t size);
319
/* Release callback: receives the buffer and the size it was requested with. */
typedef void (*free_function)(void* data, size_t size);
324
/*
 * NOTE(review): the lines that open the enclosing declaration are not visible
 * in this excerpt; the members below are reached through `struct test_t`
 * pointers elsewhere in this file.
 */
/* Allocator used for the buffer handed to the copy routine as `src`. */
malloc_function src_malloc;
325
/* Allocator used for the buffer handed to the copy routine as `dst`. */
malloc_function dst_malloc;
326
/* Copy routine under test. */
memcpy_function memcpy;
327
/* Release functions paired with src_malloc / dst_malloc. */
free_function src_free;
328
free_function dst_free;
330
/* Handshake flags; run_memcpy busy-waits on `ready` and `done`.
 * NOTE(review): where `start` is set and read is not visible here. */
int ready, start, done;
334
/*
 * Worker thread body: allocates a source and a destination buffer through the
 * context's callbacks, runs the configured copy routine ITERS times, then
 * frees both buffers.
 *
 * ctx: per-thread test description (callbacks and buffer size).
 *
 * NOTE(review): the local declarations (src, dst, i), the ready/start/done
 * signalling and the return statement are not visible in this excerpt.
 */
void *memcpy_thread(volatile struct test_t *ctx) {
338
ctx->src_malloc(&src, ctx->size);
339
ctx->dst_malloc(&dst, ctx->size);
344
/* Timed section: ITERS back-to-back copies of ctx->size bytes. */
for (i = 0; i < ITERS; i++) {
345
ctx->memcpy(dst, src, ctx->size);
349
/* Buffers are released in the reverse order of their allocation. */
ctx->dst_free(dst, ctx->size);
350
ctx->src_free(src, ctx->size);
354
/*
 * Run one test configuration on `threads` worker threads and timestamp the
 * copy phase.
 *
 * threads: number of worker threads to create.
 * test:    template description; copied into one private context per thread.
 * start:   receives the gettimeofday() timestamp taken after every worker has
 *          raised its `ready` flag.
 * end:     receives the timestamp taken after every worker has raised `done`.
 *
 * NOTE(review): several lines (local declarations, loop bodies, closing
 * braces) are not visible in this excerpt.
 */
void run_memcpy(size_t threads, struct test_t *test, struct timeval *start, struct timeval *end) {
357
/* Per-thread contexts and handles live in variable-length arrays. */
volatile struct test_t ctx[threads];
358
pthread_t th[threads];
360
for (i = 0; i < threads; i++) {
361
memcpy((void*)&ctx[i], test, sizeof(struct test_t));
362
/* memcpy_thread takes a typed pointer, hence the function-pointer cast. */
err = pthread_create(&th[i], NULL, (void *(*)(void*))&memcpy_thread, (void*)&ctx[i]);
363
/* NOTE(review): pthread_create reports failure through its return value
 * rather than errno, so perror() may print an unrelated message; execution
 * also continues after a failure -- confirm this is intended. */
if (err) perror("pthread failed");
366
/* Busy-wait until every worker has signalled `ready`. */
for (i = 0; i < threads; i++) {
367
while (!ctx[i].ready);
370
/* NOTE(review): the body of this loop is not visible in this excerpt. */
for (i = 0; i < threads; i++) {
374
gettimeofday(start, NULL);
376
/* Busy-wait until every worker has signalled `done`. */
for (i = 0; i < threads; i++) {
377
while (!ctx[i].done);
380
gettimeofday(end, NULL);
382
for (i = 0; i < threads; i++) {
383
pthread_join(th[i], NULL);
387
/*
 * Return a CPU count derived from the affinity mask of the current process.
 *
 * NOTE(review): two counting strategies are visible below (CPU_COUNT and a
 * manual scan); the declarations, error handling and whatever selects between
 * the two are not visible in this excerpt.
 */
int hw_get_cpu_count(void) {
393
/* Fetch the set of CPUs this process is allowed to run on. */
err = sched_getaffinity(getpid(), sizeof(mask), &mask);
397
/* Strategy 1: number of CPUs set in the mask. */
cpu_count = CPU_COUNT(&mask);
399
/* Strategy 2: index of the first CPU that is NOT in the mask, i.e. the
 * length of the leading run of allowed CPUs (CPUs set after a gap in the
 * mask are not counted). */
for (cpu_count = 0; cpu_count < CPU_SETSIZE; cpu_count++) {
400
if (!CPU_ISSET(cpu_count, &mask)) break;
409
/*
 * Allocate `size` bytes of host memory aligned to 4096 bytes.
 *
 * a:    receives the new buffer, or NULL when the allocation fails.
 * size: number of bytes requested.
 */
void ALLOC_std(void **a, size_t size) {
    /* posix_memalign reports failure only through its return value and does
     * not promise a value for *a in that case, so normalise it to NULL. */
    if (posix_memalign(a, 4096, size) != 0) {
        *a = NULL;
    }
}
413
void FREE_std(void *a, size_t size) {
418
void ALLOC_cuda(void **a, size_t size) {
419
cudaHostAlloc(a, SIZE, cudaHostAllocPortable);
422
void FREE_cuda(void *a, size_t size) {
426
void ALLOC_cudawc(void **a, size_t size) {
427
cudaHostAlloc(a, SIZE, cudaHostAllocPortable|cudaHostAllocWriteCombined);
430
void FREE_cudawc(void *a, size_t size) {
433
#endif /* USE_CUDA */
435
/*
 * PTEST(memtest, mem_in, mem_out, count): run fast_memcpy_<memtest> on `count`
 * threads through run_memcpy() and print the aggregate throughput.
 *
 * Expects `test` (struct test_t) and `tv1`/`tv2` (struct timeval) to be in
 * scope at the expansion site. The src callbacks are ALLOC_/FREE_<mem_in> and
 * the dst callbacks are ALLOC_/FREE_<mem_out>.
 *
 * The printed figure is count * SIZE * ITERS bytes divided by the elapsed
 * microseconds, scaled to per-second and divided by 1024^3.
 *
 * NOTE(review): `count` is printed with %u, while the PTESTS variants pass
 * expressions built from `cpus`, which main declares as size_t -- confirm the
 * argument type matches the format.
 */
#define PTEST(memtest, mem_in, mem_out, count) \
436
memset(&test, 0, sizeof(struct test_t)); \
438
test.src_malloc = ALLOC_##mem_in; \
439
test.dst_malloc = ALLOC_##mem_out; \
440
test.src_free = FREE_##mem_in; \
441
test.dst_free = &FREE_##mem_out; \
442
test.memcpy = fast_memcpy_##memtest; \
443
run_memcpy(count, &test, &tv1, &tv2); \
444
printf("Test %s (%u threads, %s => %s): %.2lf GB/s\n", #memtest, count, #mem_in, #mem_out, 1000000. * count * SIZE * ITERS / ((tv2.tv_sec - tv1.tv_sec) * 1000000 + tv2.tv_usec - tv1.tv_usec) / (1024 * 1024 * 1024));
448
/*
 * PTESTS(test, mem_in, mem_out, count): threaded test driver. Three alternative
 * definitions are visible; the preprocessor conditionals that choose between
 * them are not visible in this excerpt. None of the three uses `count`.
 *
 * Variant 1: a single PTEST run with THREADS threads.
 */
# define PTESTS(test, mem_in, mem_out, count) \
449
PTEST(test, mem_in, mem_out, THREADS)
451
/*
 * Variant 2: PTEST runs with i = 2, 4, 8, ... threads while i < cpus, then an
 * extra run with cpus/2 threads when cpus is even and the loop did not end
 * exactly at i == cpus, then a final run with cpus threads. Uses `i` and
 * `cpus` from the expansion site.
 */
# define PTESTS(test, mem_in, mem_out, count) \
452
for (i = 2; i < cpus; i *= 2) { \
453
PTEST(test, mem_in, mem_out, i) \
455
if ((i != cpus)&&((cpus%2)==0)) { \
456
PTEST(test, mem_in, mem_out, cpus/2) \
458
PTEST(test, mem_in, mem_out, cpus)
459
# endif /* THREADS count */
461
/* Variant 3: expands to nothing. */
# define PTESTS(test, mem_in, mem_out, count)
464
/*
 * OpenMP test driver: calls `test` ITERS times, once per `size`-byte slice of
 * the two buffers, with the iterations distributed by `omp parallel for`.
 *
 * test: copy routine to exercise.
 * dst:  first buffer; must hold at least ITERS * size bytes.
 * src:  second buffer; must hold at least ITERS * size bytes.
 * size: bytes per slice.
 *
 * NOTE(review): the declaration of `i` and the closing braces are not visible
 * in this excerpt.
 */
void mptest(memcpy_function test, void *dst, void *src, size_t size) {
467
#pragma omp parallel for
468
for (i = 0; i < ITERS; i++) {
469
/* Slice i of each buffer (arithmetic on void * is a GNU C extension). */
test(dst + i * size, src + i * size, size);
474
/*
 * MPTEST(test, mem_in, mem_out): OpenMP variant of the benchmark. Allocates
 * SIZE * ITERS bytes for `a` (ALLOC_<mem_in>) and `b` (ALLOC_<mem_out>), times
 * one mptest() call, frees both buffers, prints the throughput and then runs
 * PTESTS.
 *
 * Expects `a`, `b`, `tv1`, `tv2` and `cpus` at the expansion site.
 *
 * NOTE(review): mptest receives `a` (mem_in) as dst and `b` (mem_out) as src,
 * while the message is labelled "mem_in => mem_out" -- confirm the intended
 * copy direction.
 */
# define MPTEST(test, mem_in, mem_out) \
475
ALLOC_##mem_in(&a, SIZE * ITERS); \
476
ALLOC_##mem_out(&b, SIZE * ITERS); \
477
gettimeofday(&tv1, NULL); \
478
mptest(fast_memcpy_##test, a, b, SIZE); \
479
gettimeofday(&tv2, NULL); \
480
FREE_##mem_out(b, SIZE * ITERS); \
481
FREE_##mem_in(a, SIZE * ITERS); \
482
printf("Test %s (%s => %s): %.2lf GB/s\n", #test, #mem_in, #mem_out, 1000000. * SIZE * ITERS / ((tv2.tv_sec - tv1.tv_sec) * 1000000 + tv2.tv_usec - tv1.tv_usec) / (1024 * 1024 * 1024));\
483
PTESTS(test, mem_in, mem_out, cpus)
485
/* Alternative definition that expands to nothing; the conditional selecting
 * it is not visible in this excerpt. */
# define MPTEST(test, mem_in, mem_out)
486
#endif /* MULTIMODE */
488
/*
 * TEST(test, mem_in, mem_out): single-threaded benchmark of fast_memcpy_<test>.
 * Allocates SIZE bytes for `a` (ALLOC_<mem_in>) and `b` (ALLOC_<mem_out>),
 * times ITERS calls of the copy routine, frees both buffers, prints the
 * throughput, then expands MPTEST and PTESTS for the same combination.
 *
 * Expects `a`, `b`, `i`, `tv1`, `tv2` and `cpus` at the expansion site.
 *
 * NOTE(review): the copy routine receives `a` (mem_in) as dst and `b`
 * (mem_out) as src, while the message is labelled "mem_in => mem_out" --
 * confirm the intended copy direction. Also, the non-empty MPTEST definition
 * already ends with PTESTS, so PTESTS may run twice per TEST -- confirm.
 */
#define TEST(test, mem_in, mem_out) \
489
ALLOC_##mem_in(&a, SIZE); \
490
ALLOC_##mem_out(&b, SIZE); \
491
gettimeofday(&tv1, NULL); \
492
for (i = 0; i < ITERS; i++) { \
493
fast_memcpy_##test(a, b, SIZE); \
495
gettimeofday(&tv2, NULL); \
496
FREE_##mem_out(b, SIZE); \
497
FREE_##mem_in(a, SIZE); \
498
printf("Test %s (%s => %s): %.2lf GB/s\n", #test, #mem_in, #mem_out, 1000000. * SIZE * ITERS / ((tv2.tv_sec - tv1.tv_sec) * 1000000 + tv2.tv_usec - tv1.tv_usec) / (1024 * 1024 * 1024));\
499
MPTEST(test, mem_in, mem_out) \
500
PTESTS(test, mem_in, mem_out, cpus)
504
/*
 * TESTS(test): expand TEST for all nine ordered pairings of the std, cuda and
 * cudawc allocators.
 */
# define TESTS(test) \
505
TEST(test, std, std) \
506
TEST(test, cuda, cuda) \
507
TEST(test, cudawc, cudawc) \
508
TEST(test, std, cuda) \
509
TEST(test, cuda, std) \
510
TEST(test, std, cudawc) \
511
TEST(test, cudawc, std) \
512
TEST(test, cuda, cudawc) \
513
TEST(test, cudawc, cuda)
515
/* Alternative definition; its body and the conditional that selects it are
 * not visible in this excerpt. */
# define TESTS(test) \
517
#endif /* USE_CUDA */
521
size_t cpus = hw_get_cpu_count();
523
struct timeval tv1, tv2;
527
cudaHostAlloc(&a, SIZE, cudaHostAllocPortable);//4);//cudaHostAllocWriteCombined);
528
#endif /* USE_CUDA */
536
#ifndef __INTEL_COMPILER