pastebin - collaborative debugging tool
rovema.kpaste.net RSS


x86_rdtscp2_perf.c - compare execution time x86 RDTSCP instruction vs. |sched_getcpu()|
Posted by Anonymous on Sat 26th Nov 2022 15:28
raw | new post
view followups (newest first): x86_rdtscp2_perf.c - compare execution time x86 RDTSCP instruction vs. |sched_getcpu()| by Anonymous

  1. rmainz@derfwpc5131:~/work/x86_rdtscp/try3_perf$ cat x86_rdtscp2_perf.c
  2. /*
  3.  * x86_rdtscp2_perf.c - compare the execution time of the x86 RDTSCP
  4.  * instruction vs. |sched_getcpu()|
  5.  *
  6.  * Compile with:
  7.  * $ gcc -std=gnu17 -m32 -Wall x86_rdtscp2_perf.c -lpthread
  8.  *
  9.  * Written by Roland Mainz <roland.mainz@nrubsig.org>
  10.  *
  11.  */
  12.  
  13. #define _XOPEN_SOURCE 700
  14. #define _GNU_SOURCE 1
  15.  
  16. #include <stdio.h>
  17. #include <stdlib.h>
  18. #include <errno.h>
  19.  
  20. #include <pthread.h>
  21. #include <sched.h>
  22.  
  23. #include <x86intrin.h>
  24.  
  25.  
  /*
   * Nanoseconds per second. This is a |long long| constant on purpose:
   * with an |unsigned long| constant the elapsed-time arithmetic is done
   * in 32 bits on -m32 builds and wraps for intervals of ~4.29s or more.
   */
  #define NSECPERSEC (1000000000LL)

  /* Number of iterations of each timed loop */
  #define NUM_ITERATIONS (100000000L)

  /* Set to 0 to skip pinning the thread to the CPU it starts on */
  #define USE_FIXED_CPU_NUMBER 1


  /*
   * timespec_diff_ns - return |stop| minus |start| in nanoseconds
   *
   * Both fields are widened to |long long| before subtracting, so a
   * negative nanosecond difference (|stop->tv_nsec < start->tv_nsec|)
   * is simply compensated by the seconds term.
   */
  static
  long long timespec_diff_ns(const struct timespec *start,
          const struct timespec *stop)
  {
          long long sec_diff  = (long long)stop->tv_sec  - (long long)start->tv_sec;
          long long nsec_diff = (long long)stop->tv_nsec - (long long)start->tv_nsec;

          return (sec_diff * NSECPERSEC) + nsec_diff;
  }


  /*
   * get_timestamp - store the current time in |*ts|, exit on failure
   *
   * Uses |CLOCK_MONOTONIC| because we measure intervals; |CLOCK_REALTIME|
   * can jump when the wall clock is adjusted while the test is running.
   */
  static
  void get_timestamp(struct timespec *ts)
  {
          if (clock_gettime(CLOCK_MONOTONIC, ts) == -1) {
                  perror("clock gettime");
                  exit(EXIT_FAILURE);
          }
  }


  /*
   * main - time NUM_ITERATIONS executions of the x86 RDTSCP instruction
   * and of |sched_getcpu()| and print the elapsed time (in seconds) of
   * each loop.
   *
   * Command line arguments are not used.
   * Returns |EXIT_SUCCESS|; exits with |EXIT_FAILURE| if the clock
   * cannot be read.
   */
  int main(int ac, char *av[])
  {
          (void)ac;
          (void)av;

          (void)puts("#start.");

          unsigned int A = 0;

  #if USE_FIXED_CPU_NUMBER
          int cpu = sched_getcpu();

          if (cpu < 0) {
                  /* do not feed a negative CPU number into |CPU_SET()| */
                  perror("sched_getcpu");
          }
          else {
                  cpu_set_t       cpuset;
                  int             res;

                  CPU_ZERO(&cpuset);
                  CPU_SET(cpu, &cpuset);

                  res = pthread_setaffinity_np(pthread_self(),
                          sizeof(cpuset), &cpuset);
                  if (res == 0) {
                          (void)printf("# pthread_setaffinity_np(), "
                                  "thread fixed to cpu %d\n",
                                  cpu);
                  }
                  else {
                          /*
                           * |pthread_setaffinity_np()| returns the error
                           * number and does not set |errno|, so hand the
                           * return value to |perror()| via |errno|.
                           */
                          errno = res;
                          perror("pthread_setaffinity_np");
                  }
          }
  #endif /* USE_FIXED_CPU_NUMBER */

          struct timespec start, stop;
          long long accum;
          /*
           * Sink for the results of the timed calls. Unsigned, because
           * the sum of the 64bit TSC values exceeds the range of a
           * signed |long long| and is meant to wrap.
           */
          unsigned long long dummy = 0ULL;

          /* test 1: x86 RDTSCP instruction */
          get_timestamp(&start);

          for (long l = 0 ; l < NUM_ITERATIONS ; l++) {
                  dummy += _rdtscp(&A);
                  dummy += A;
          }

          get_timestamp(&stop);

          accum = timespec_diff_ns(&start, &stop);
          (void)printf("# time needed for x86 rdtscp instruction:\t%20.20f\n",
                  (double)((long double)accum/(long double)NSECPERSEC));

          /* test 2: |sched_getcpu()| */
          get_timestamp(&start);

          for (long l = 0 ; l < NUM_ITERATIONS ; l++) {
                  dummy += sched_getcpu();
          }

          get_timestamp(&stop);

          accum = timespec_diff_ns(&start, &stop);
          (void)printf("# time needed for |sched_getcpu()|:      \t%20.20f\n",
                  (double)((long double)accum/(long double)NSECPERSEC));

          (void)dummy;

          (void)puts("#done.");

          return EXIT_SUCCESS;
  }
  112. rmainz@derfwpc5131:~/work/x86_rdtscp/try3_perf$ cat Makefile
  113. #
  114. # Makefile for x86_rdtscp2_perf.c - compare execution time x86 RDTSCP
  115. # instruction vs. |sched_getcpu()|
  116. #
  117.  
  118. all: \
  119.         x86_rdtscp2_perf_32 \
  120.         x86_rdtscp2_perf_64
  121.  
  122. # explicitly use -O0 to disable peephole optimisations
  123. x86_rdtscp2_perf_32: x86_rdtscp2_perf.c
  124.         gcc -std=gnu17 -m32 -Wall -g -O0 x86_rdtscp2_perf.c -lpthread -o x86_rdtscp2_perf_32
  125.  
  126. x86_rdtscp2_perf_64: x86_rdtscp2_perf.c
  127.         gcc -std=gnu17 -m64 -Wall -g -O0 x86_rdtscp2_perf.c -lpthread -o x86_rdtscp2_perf_64
  128.  
  129.  
  130. tests: \
  131.         x86_rdtscp2_perf_32 \
  132.         x86_rdtscp2_perf_64
  133.         ksh93 -c 'set -o xtrace ; time ./x86_rdtscp2_perf_32 ; time ./x86_rdtscp2_perf_64'
  134.  
  135. clean:
  136.         rm -f \
  137.                 x86_rdtscp2_perf_32 \
  138.                 x86_rdtscp2_perf_64
  139. # EOF.
  140. rmainz@derfwpc5131:~/work/x86_rdtscp/try3_perf$ make tests
  141. ksh93 -c 'set -o xtrace ; time ./x86_rdtscp2_perf_32 ; time ./x86_rdtscp2_perf_64'
  142. + ./x86_rdtscp2_perf_32
  143. #start.
  144. # pthread_setaffinity_np(), thread fixed to cpu 0
  145. # time needed for x86 rdtscp instruction:       1.29958247999999998434
  146. # time needed for |sched_getcpu()|:             3.39503941599999992107
  147. #done.
  148.  
  149. real    0m34.77s
  150. user    0m21.45s
  151. sys     0m13.29s
  152. + ./x86_rdtscp2_perf_64
  153. #start.
  154. # pthread_setaffinity_np(), thread fixed to cpu 6
  155. # time needed for x86 rdtscp instruction:       1.20903293999999994490
  156. # time needed for |sched_getcpu()|:             1.38745017499999989674
  157. #done.
  158.  
  159. real    0m2.60s
  160. user    0m2.60s
  161. sys     0m0.00s

Submit a correction or amendment below (click here to make a fresh posting)
After submitting an amendment, you'll be able to view the differences between the old and new posts easily.

Syntax highlighting:

To highlight particular lines, prefix each line with {%HIGHLIGHT}




All content is user-submitted.
The administrators of this site (kpaste.net) are not responsible for their content.
Abuse reports should be emailed to us at