reference, declaration → definition
definition → references, declarations, derived classes, virtual overrides
reference to multiple definitions → definitions
unreferenced
    1
    2
    3
    4
    5
    6
    7
    8
    9
   10
   11
   12
   13
   14
   15
   16
   17
   18
   19
   20
   21
   22
   23
   24
   25
   26
   27
   28
   29
   30
   31
   32
   33
   34
   35
   36
   37
   38
   39
   40
   41
   42
   43
   44
   45
   46
   47
   48
   49
   50
   51
   52
   53
   54
   55
   56
   57
   58
   59
   60
   61
   62
   63
   64
   65
   66
   67
   68
   69
   70
   71
   72
   73
   74
   75
   76
   77
   78
   79
   80
   81
   82
   83
   84
   85
   86
   87
#include <stdio.h>
#include <vector>
#include <pthread.h>
#include <malloc.h>
#include <algorithm>

using namespace std;

// Number of threads started (and joined) by standalone_malloc_test().
const size_t kNumThreds = 16;
// Exclusive upper bound of the per-thread loop in MallocThread (2^23); also
// used there to pre-size the bookkeeping vector and to pick the point at
// which the malloc/free ratio changes.
const size_t kNumIters = 1 << 23;

// Empty volatile inline-asm statement that takes |arg| as a register input
// and declares a "memory" clobber. No instructions are emitted; under GCC's
// extended-asm rules the compiler has to assume the statement may read or
// write memory, so it is kept and memory accesses are not cached across it.
inline void break_optimization(void *arg) {
  __asm__ __volatile__("" : : "r" (arg) : "memory");
}

// Allocation stress loop with a pthread start-routine signature.
//
// |t| is not dereferenced: its pointer value is reinterpreted as the thread
// index. Only the thread with index 0 writes progress and summary lines to
// stderr.
//
// For i in [1, kNumIters) each iteration either allocates a block with
// new char[] or frees one previously allocated block:
//   * while i <= kNumIters / 4: 60% allocations, 40% frees;
//   * afterwards: odd i allocates, even i frees (50% / 50%).
// Block sizes are 1..200 bytes, except that when i is a multiple of 10001
// the size is multiplied by 4096.
//
// Every block still live when the loop ends is freed before returning.
// Always returns 0.
__attribute__((noinline))
static void *MallocThread(void *t) {
  size_t total_malloced = 0, total_freed = 0;
  size_t max_in_use = 0;
  size_t tid = reinterpret_cast<size_t>(t);
  // Live blocks together with their sizes. Reserved up front so that the
  // bookkeeping vector itself never reallocates inside the loop.
  std::vector<std::pair<char *, size_t> > allocated;
  allocated.reserve(kNumIters);
  for (size_t i = 1; i < kNumIters; i++) {
    // Progress line at each quarter of the run. size_t is printed with %zu;
    // %ld expects a signed long and does not match the argument type.
    if ((i % (kNumIters / 4)) == 0 && tid == 0)
      fprintf(stderr, "   T[%zu] iter %zu\n", tid, i);
    bool allocate = (i % 5) <= 2;  // 60% malloc, 40% free
    if (i > kNumIters / 4)
      allocate = i % 2;  // then switch to 50% malloc, 50% free
    if (allocate) {
      size_t size = 1 + (i % 200);
      if ((i % 10001) == 0)
        size *= 4096;  // occasional large block
      total_malloced += size;
      char *x = new char[size];
      // Touch the first, last and middle bytes of the new block.
      x[0] = x[size - 1] = x[size / 2] = 0;
      allocated.push_back(std::make_pair(x, size));
      max_in_use = std::max(max_in_use, total_malloced - total_freed);
    } else {
      if (allocated.empty()) continue;
      size_t slot = i % allocated.size();
      char *p = allocated[slot].first;
      p[0] = 0;  // emulate last user touch of the block
      size_t size = allocated[slot].second;
      total_freed += size;
      // Remove the entry in O(1): move the last entry into the vacated slot.
      std::swap(allocated[slot], allocated.back());
      allocated.pop_back();
      delete [] p;
    }
  }
  // Summary in units of 2^20 bytes (values are shifted right by 20).
  if (tid == 0)
    fprintf(stderr, "   T[%zu] total_malloced: %zuM in use %zuM max %zuM\n",
           tid, total_malloced >> 20, (total_malloced - total_freed) >> 20,
           max_in_use >> 20);
  // Release everything that is still allocated.
  for (size_t i = 0; i < allocated.size(); i++)
    delete [] allocated[i].first;
  return 0;
}

// Recursive template: DeepStack<depth>::run() calls DeepStack<depth - 1>::run()
// and so on down to the DeepStack<0> specialization, so the work done there
// executes beneath `depth` nested run() calls.
template <int depth>
struct DeepStack {
  // pthread-start-compatible entry point. |t| is forwarded unchanged to the
  // next level; the callee's result is discarded and 0 is always returned.
  // noinline asks the compiler to keep each level as a real call.
  __attribute__((noinline))
  static void *run(void *t) {
    break_optimization(0);
    DeepStack<depth - 1>::run(t);
    // NOTE(review): the call placed after the recursion presumably keeps the
    // recursive call from being emitted as a tail call -- confirm intent.
    break_optimization(0);
    return 0;
  }
};

// Base case that terminates the DeepStack recursion.
template<>
struct DeepStack<0> {
  // Runs MallocThread(t), ignores its result, and returns 0.
  static void *run(void *t) {
    MallocThread(t);
    return 0;
  }
};

// Build with -Dstandalone_malloc_test=main to make it a separate program.
int standalone_malloc_test() {
  pthread_t t[kNumThreds];
  for (size_t i = 0; i < kNumThreds; i++)
    pthread_create(&t[i], 0, DeepStack<200>::run, reinterpret_cast<void *>(i));
  for (size_t i = 0; i < kNumThreds; i++)
    pthread_join(t[i], 0);
  malloc_stats();
  return 0;
}