pastebin - collaborative debugging tool
rovema.kpaste.net RSS


x86_rdtscp2_perf.c - compare execution time x86 RDTSCP instruction vs. |sched_getcpu()|
Posted by Anonymous on Sat 26th Nov 2022 15:49
raw | new post
view followups (newest first): x86_rdtscp2_perf.c - compare execution time x86 RDTSCP instruction vs. |sched_getcpu()| by Anonymous
modification of post by Anonymous (view diff)

  1. rmainz@derfwpc5131:~/work/x86_rdtscp/try3_perf$ make tests
  2. gcc -std=gnu17 -m32 -Wall -g -O0 x86_rdtscp2_perf.c -lpthread -o x86_rdtscp2_perf_32
  3. gcc -std=gnu17 -m64 -Wall -g -O0 x86_rdtscp2_perf.c -lpthread -o x86_rdtscp2_perf_64
  4. ksh93 -c 'set -o xtrace ; \
  5.         cat x86_rdtscp2_perf.c ; \
  6.         cat Makefile ; \
  7.         time ./x86_rdtscp2_perf_32 ; \
  8.         time ./x86_rdtscp2_perf_64'
  9. + cat x86_rdtscp2_perf.c
  10. /*
  11.  * x86_rdtscp2_perf.c - compare execution time x86 RDTSCP instruction
  12.  * vs. |sched_getcpu()|
  13.  *
  14.  * Compile with:
  15.  * $ gcc -std=gnu17 -m32 -Wall x86_rdtscp2_perf.c -lpthread
  16.  *
  17.  * Written by Roland Mainz <roland.mainz@nrubsig.org>
  18.  *
  19.  */
  20.  
  21. #define _XOPEN_SOURCE 700
  22. #define _GNU_SOURCE 1
  23.  
  24. #include <stdio.h>
  25. #include <stdlib.h>
  26. #include <errno.h>
  27.  
  28. #include <pthread.h>
  29. #include <sched.h>
  30.  
  31. #include <x86intrin.h>
  32.  
  33.  
  34. int main(int ac, char *av[])
  35. {
  36.         (void)puts("#start.");
  37.  
  38.         unsigned int A = 0;
  39.  
  40. #define USE_FIXED_CPU_NUMBER 1
  41. #if USE_FIXED_CPU_NUMBER
  42.         int             cpu;
  43.         cpu_set_t       cpuset;
  44.         pthread_t       thread;
  45.  
  46.         thread = pthread_self();
  47.         cpu = sched_getcpu();
  48.  
  49.         CPU_ZERO(&cpuset);
  50.         CPU_SET(cpu, &cpuset);
  51.  
  52.         if (pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset) == 0) {
  53.                 (void)printf("# pthread_setaffinity_np(), "
  54.                         "thread fixed to cpu %d\n",
  55.                         cpu);
  56.         }
  57.         else {
  58.                 /*
  59.                  * If you specify a CPU number beyond the maximum number
  60.                  * of CPUs, then |pthread_setaffinity_np()| can have a
  61.                  * |errno == 0|
  62.                  */
  63.                 perror("pthread_setaffinity_np");
  64.         }
  65. #endif /* USE_FIXED_CPU_NUMBER */
  66.  
  67.         struct timespec start, stop;
  68.         long long accum;
  69.         long long dummy = 0LL; /* use dummy variable to defeat optimiser */
  70.  
  71.         if (clock_gettime(CLOCK_REALTIME, &start) == -1) {
  72.                 perror("clock gettime");
  73.                 exit(EXIT_FAILURE);
  74.         }
  75.  
  76.         for (long l = 0 ; l < 100000000 ; l++) {
  77.                 dummy += _rdtscp(&A);
  78.                 dummy += A;
  79.         }
  80.  
  81.         if (clock_gettime(CLOCK_REALTIME, &stop) == -1 ) {
  82.                 perror("clock gettime");
  83.                 exit(EXIT_FAILURE);
  84.         }
  85.  
  86. #define NSECPERSEC (1000000000LL)
  87.         accum = (stop.tv_sec - start.tv_sec)*NSECPERSEC +
  88.                 (stop.tv_nsec - start.tv_nsec);
  89.         (void)printf("# time needed for x86 rdtscp instruction:\t%20.20f\n",
  90.                 (double)(((long double)accum)/((long double)NSECPERSEC)));
  91.  
  92.  
  93.  
  94.         if (clock_gettime(CLOCK_REALTIME, &start) == -1) {
  95.                 perror( "clock gettime" );
  96.                 exit(EXIT_FAILURE);
  97.         }
  98.  
  99.         for (long l = 0 ; l < 100000000 ; l++) {
  100.                 dummy += sched_getcpu();
  101.         }
  102.  
  103.  
  104.         if (clock_gettime(CLOCK_REALTIME, &stop) == -1) {
  105.                 perror("clock gettime");
  106.                 exit(EXIT_FAILURE);
  107.         }
  108.  
  109.         accum = (stop.tv_sec - start.tv_sec)*NSECPERSEC +
  110.                 (stop.tv_nsec - start.tv_nsec);
  111.         (void)printf("# time needed for |sched_getcpu()|:      \t%20.20f\n",
  112.                 (double)(((long double)accum)/((long double)NSECPERSEC)));
  113.  
  114.         (void)dummy;
  115.  
  116.         (void)puts("#done.");
  117.  
  118.         return EXIT_SUCCESS;
  119. }
  120. + cat Makefile
  121. #
  122. # Makefile for x86_rdtscp2_perf.c - compare execution time x86 RDTSCP
  123. # instruction vs. |sched_getcpu()|
  124. #
  125.  
  126. all: \
  127.         x86_rdtscp2_perf_32 \
  128.         x86_rdtscp2_perf_64
  129.  
  130. # explicitly use -O0 to disable peephole optimisations
  131. x86_rdtscp2_perf_32: x86_rdtscp2_perf.c
  132.         gcc -std=gnu17 -m32 -Wall -g -O0 x86_rdtscp2_perf.c -lpthread -o x86_rdtscp2_perf_32
  133.  
  134. x86_rdtscp2_perf_64: x86_rdtscp2_perf.c
  135.         gcc -std=gnu17 -m64 -Wall -g -O0 x86_rdtscp2_perf.c -lpthread -o x86_rdtscp2_perf_64
  136.  
  137.  
  138. tests: \
  139.         x86_rdtscp2_perf_32 \
  140.         x86_rdtscp2_perf_64
  141.         ksh93 -c 'set -o xtrace ; \
  142.                 cat x86_rdtscp2_perf.c ; \
  143.                 cat Makefile ; \
  144.                 time ./x86_rdtscp2_perf_32 ; \
  145.                 time ./x86_rdtscp2_perf_64'
  146.  
  147. clean:
  148.         rm -f \
  149.                 x86_rdtscp2_perf_32 \
  150.                 x86_rdtscp2_perf_64
  151. # EOF.
  152. + ./x86_rdtscp2_perf_32
  153. #start.
  154. # pthread_setaffinity_np(), thread fixed to cpu 1
  155. # time needed for x86 rdtscp instruction:       1.32610883200000007065
  156. # time needed for |sched_getcpu()|:             33.35152474899999930358
  157. #done.
  158.  
  159. real    0m34.68s
  160. user    0m22.13s
  161. sys     0m12.53s
  162. + ./x86_rdtscp2_perf_64
  163. #start.
  164. # pthread_setaffinity_np(), thread fixed to cpu 0
  165. # time needed for x86 rdtscp instruction:       1.23842487100000009370
  166. # time needed for |sched_getcpu()|:             1.95299446999999992691
  167. #done.
  168.  
  169. real    0m3.19s
  170. user    0m3.19s
  171. sys     0m0.00s

Submit a correction or amendment below (click here to make a fresh posting)
After submitting an amendment, you'll be able to view the differences between the old and new posts easily.

Syntax highlighting:

To highlight particular lines, prefix each line with {%HIGHLIGHT}




All content is user-submitted.
The administrators of this site (kpaste.net) are not responsible for their content.
Abuse reports should be emailed to us at