reference, declarationdefinition
definition → references, declarations, derived classes, virtual overrides
reference to multiple definitions → definitions
unreferenced
    1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84
   85
   86
   87
   88
   89
   90
   91
   92
   93
   94
   95
   96
   97
   98
   99
  100
  101
  102
  103
  104
  105
  106
  107
  108
  109
  110
  111
  112
  113
  114
  115
  116
  117
  118
  119
  120
// Mini-benchmark for tsan VTS worst case performance
// Idea:
// 1) Spawn M + N threads (M >> N)
//    We'll call the 'M' threads as 'garbage threads'.
// 2) Make sure all threads have created thus no TIDs were reused
// 3) Join the garbage threads
// 4) Do many sync operations on the remaining N threads
//
// It turns out that due to O(M+N) VTS complexity the (4) is much slower with
// when N is large.
//
// Some numbers:
// a) clang++ native O1 with n_iterations=200kk takes
//      5s regardless of M
//    clang++ tsanv2 O1 with n_iterations=20kk takes
//      23.5s with M=200
//      11.5s with M=1
//    i.e. tsanv2 is ~23x to ~47x slower than native, depends on M.
// b) g++ native O1 with n_iterations=200kk takes
//      5.5s regardless of M
//    g++ tsanv1 O1 with n_iterations=2kk takes
//      39.5s with M=200
//      20.5s with M=1
//    i.e. tsanv1 is ~370x to ~720x slower than native, depends on M.

#include <assert.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

class __attribute__((aligned(64))) Mutex {
 public:
  Mutex()  { pthread_mutex_init(&m_, NULL); }
  ~Mutex() { pthread_mutex_destroy(&m_); }
  void Lock() { pthread_mutex_lock(&m_); }
  void Unlock() { pthread_mutex_unlock(&m_); }

 private:
  pthread_mutex_t m_;
};

const int kNumMutexes = 1024;
Mutex mutexes[kNumMutexes];

int n_threads, n_iterations;

pthread_barrier_t all_threads_ready, main_threads_ready;

void* GarbageThread(void *unused) {
  pthread_barrier_wait(&all_threads_ready);
  return 0;
}

void *Thread(void *arg) {
  long idx = (long)arg;
  pthread_barrier_wait(&all_threads_ready);

  // Wait for the main thread to join the garbage threads.
  pthread_barrier_wait(&main_threads_ready);

  printf("Thread %ld go!\n", idx);
  int offset = idx * kNumMutexes / n_threads;
  for (int i = 0; i < n_iterations; i++) {
    mutexes[(offset + i) % kNumMutexes].Lock();
    mutexes[(offset + i) % kNumMutexes].Unlock();
  }
  printf("Thread %ld done\n", idx);
  return 0;
}

int main(int argc, char **argv) {
  int n_garbage_threads;
  if (argc == 1) {
    n_threads = 2;
    n_garbage_threads = 200;
    n_iterations = 20000000;
  } else if (argc == 4) {
    n_threads = atoi(argv[1]);
    assert(n_threads > 0 && n_threads <= 32);
    n_garbage_threads = atoi(argv[2]);
    assert(n_garbage_threads > 0 && n_garbage_threads <= 16000);
    n_iterations = atoi(argv[3]);
  } else {
    printf("Usage: %s n_threads n_garbage_threads n_iterations\n", argv[0]);
    return 1;
  }
  printf("%s: n_threads=%d n_garbage_threads=%d n_iterations=%d\n",
         __FILE__, n_threads, n_garbage_threads, n_iterations);

  pthread_barrier_init(&all_threads_ready, NULL, n_garbage_threads + n_threads + 1);
  pthread_barrier_init(&main_threads_ready, NULL, n_threads + 1);

  pthread_t *t = new pthread_t[n_threads];
  {
    pthread_t *g_t = new pthread_t[n_garbage_threads];
    for (int i = 0; i < n_garbage_threads; i++) {
      int status = pthread_create(&g_t[i], 0, GarbageThread, NULL);
      assert(status == 0);
    }
    for (int i = 0; i < n_threads; i++) {
      int status = pthread_create(&t[i], 0, Thread, (void*)i);
      assert(status == 0);
    }
    pthread_barrier_wait(&all_threads_ready);
    printf("All threads started! Killing the garbage threads.\n");
    for (int i = 0; i < n_garbage_threads; i++) {
      pthread_join(g_t[i], 0);
    }
    delete [] g_t;
  }
  printf("Resuming the main threads.\n");
  pthread_barrier_wait(&main_threads_ready);


  for (int i = 0; i < n_threads; i++) {
    pthread_join(t[i], 0);
  }
  delete [] t;
  return 0;
}