/* WARNING: auto-generated (by sortbench/benchgen.py); do not edit */

#include <unistd.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <algorithm>
#include "cpucycles.h"
#include "avx2sort.h"
/* Element type of the arrays handed to the sort under test. */
using sorttype = int32_t;
/* Signed 64-bit-or-wider integer used for indices, sizes and cycle counts. */
using num = long long;

/* Upper bound on the array length that is benchmarked. */
#define N 131072
/* Number of cycle-count differences reported per array length; the buffers
   below are dimensioned for TIMINGS+1 runs. */
#define TIMINGS 64

/* Random source values for the current array length. */
alignas(4096) sorttype r[N];
/* One index permutation per run (TIMINGS+1 rows of N indices). */
alignas(4096) sorttype perm[TIMINGS+1][N];
/* Input/output storage for all runs; sized for a per-run stride of up to N+1. */
alignas(4096) sorttype x[(TIMINGS+1)*(N+1)];
/* Scratch buffer for the reference result. */
alignas(4096) sorttype y[N];

/* Cycle-counter samples, one per run. */
alignas(4096) num t[TIMINGS+1];

int main()
{
  printf("cpucycles_version %s\n",cpucycles_version());
  printf("cpucycles_implementation %s\n",cpucycles_implementation());
  srandom(getpid());

  for (num loop = 0;loop < 2;++loop)
    for (num j = 0;j <= TIMINGS;++j)
      t[j] = cpucycles();
  printf("overhead");
  for (num i = 0;i < TIMINGS;++i)
    printf(" %lld",t[i+1]-t[i]);
  printf("\n");

  for (num j = 0;j <= TIMINGS;++j)
    for (num i = 0;i < N;++i)
      perm[j][i] = i;

  num randomized = 0;
  for (num npos = 0;;++npos) {
    num nposlow = npos%24;
    num nposhigh = npos/24;
    num n = round(exp2(nposhigh+round(27.75001*nposlow)/665));
    if (n > N) break;
    num npad = n+!(n&1);

    for (;randomized < n;++randomized)
      for (num j = 0;j <= TIMINGS;++j) {
        num pos = random()%(randomized+1);
        sorttype tmp = perm[j][pos];
        perm[j][pos] = perm[j][randomized];
        perm[j][randomized] = tmp;
      }

    for (num i = 0;i < n;++i) {
      r[i] = 0;
      for (num loop = 0;loop < 8;++loop)
        r[i] = (r[i]^(r[i]<<15))+random();
    }
    for (num j = 0;j <= TIMINGS;++j)
      for (num i = 0;i < n;++i)
        x[j*npad+i] = r[perm[j][i]];

    num sum = 0;
    for (num j = 0;j <= TIMINGS;++j)
      for (num i = 0;i < n;++i)
        sum += x[j*npad+i];

    for (num j = 0;j <= TIMINGS;++j)
      t[j] = cpucycles();

    for (num j = 0;j <= TIMINGS;++j) {
      sorttype *y;
      t[j] = cpucycles();
      y = x+j*npad;
      avx2::quicksort(y,n);
    }

    for (num i = 0;i < n;++i)
      y[i] = r[i];
    std::sort(y,y+n);
    for (num j = 0;j <= TIMINGS;++j)
      for (num i = 0;i < n;++i)
        assert(y[i] == x[j*npad+i]);

    for (num j = 0;j <= TIMINGS;++j)
      for (num i = 0;i < n;++i)
        sum -= x[j*npad+i];
    assert(sum == 0);

    printf("%lld",n);
    for (num i = 0;i < TIMINGS;++i)
      printf(" %lld",t[i+1]-t[i]);
    printf("\n");
    fflush(stdout);
  }
  return 0;
}
