-rw-r--r-- 1258 djbsort-20260127/float32/avx2useint32/sort.c raw
/* WARNING: auto-generated (by autogen/useint); do not edit */
#include <immintrin.h>
/* Eight packed 32-bit integers held in one 256-bit AVX2 register. */
typedef __m256i int32x8;
/* Load 8 int32 values from z; the unaligned-load intrinsic is used, so z needs no 32-byte alignment. */
#define int32x8_load(z) _mm256_loadu_si256((__m256i *) (z))
/* Store the 8 int32 lanes of i to z (unaligned store). */
#define int32x8_store(z,i) _mm256_storeu_si256((__m256i *) (z),(i))
/* Alias for the intrinsic that replicates one 32-bit value into all 8 lanes. */
#define int32x8_broadcast _mm256_set1_epi32
/*
 * Per-lane mask: the arithmetic shift by 31 yields all-ones for negative
 * lanes and zero otherwise; the logical shift by 1 then clears the top bit,
 * leaving 0x7fffffff for negative lanes and 0 for the rest.
 * XORing a lane with its mask flips its low 31 bits iff the lane is negative.
 */
#define int32x8_floatmask(y) _mm256_srli_epi32(_mm256_srai_epi32(y,31),1)
#include "djbsort.h"
#include "float32_sort.h"
#include "crypto_int32.h"
/*
 * In place, XOR every negative element of y[0..n-1] with 0x7fffffff
 * (i.e. flip its low 31 bits); non-negative elements are left unchanged.
 * The sign bit is never touched, so applying this twice restores the input.
 */
static void flip_negative_low31(int32_t *y,long long n)
{
  long long pos = 0;

  /* Bulk pass: two 8-lane vectors (16 elements) per iteration. */
  while (pos+16 <= n) {
    int32_t *lo = y+pos;
    int32_t *hi = lo+8;
    int32x8 a = int32x8_load(lo);
    int32x8 b = int32x8_load(hi);
    a ^= int32x8_floatmask(a);
    b ^= int32x8_floatmask(b);
    int32x8_store(lo,a);
    int32x8_store(hi,b);
    pos += 16;
  }

  /* Scalar tail for the remaining (fewer than 16) elements. */
  while (pos < n) {
    int32_t v = y[pos];
    /* presumably all-ones for negative v and zero otherwise, mirroring the vector mask */
    uint32_t flip = ((uint32_t) crypto_int32_negative_mask(v)) >> 1;
    y[pos] = v ^ flip;
    ++pos;
  }
}

/*
 * Sort the n floats at x in place by reusing the int32 sorter.
 *
 * The array is reinterpreted as int32 bit patterns.  Negative patterns get
 * their low 31 bits flipped, the result is sorted with djbsort_int32, and
 * the same flip is applied again to undo the transformation.
 * Assuming float is IEEE-754 binary32, the flip makes signed-integer order
 * agree with numeric order of the original floats.
 *
 * x: array of n floats, overwritten with the sorted result.
 * n: element count; when n <= 0 neither flip pass touches any element.
 */
void float32_sort(float *x,long long n)
{
  int32_t *bits = (int32_t *) x;

  flip_negative_low31(bits,n);
  djbsort_int32(bits,n);
  flip_negative_low31(bits,n);
}