214 lines
6.8 KiB
C
214 lines
6.8 KiB
C
/* bench-phi-knc.c: benchmarks and tests for the Xeon PHI Knights Corner optimizations. */
|
|
|
|
#include <immintrin.h>
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#include <stdint.h>
|
|
|
|
/* For clock_gettime() and CLOCK_MONOTONIC. */
|
|
#include <unistd.h>
|
|
#include <time.h>
|
|
|
|
/* For memcpy */
|
|
#include <string.h>
|
|
|
|
/* Include the increasingly inaccurately named header for our F32 dot product code. */
|
|
#include "ggml-phi-knc.h"
|
|
|
|
/* include the header for our Q8K_Q5K dot product code. */
|
|
#include "ggml-phi-knc-dot_q5_K_q8_K.h"
|
|
|
|
// largest Float32 vectors to get the dot product of.
|
|
#define F32_MAXVEC 1024768
|
|
// how many benchmarks we will run in total.
|
|
#define F32_RUNCOUNT 12
|
|
#define F32_ITEMS_PER_RUN {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}
|
|
|
|
int main(void)
|
|
{
|
|
int vecRuns[F32_RUNCOUNT] = F32_ITEMS_PER_RUN;
|
|
|
|
// seed the random number generator.
|
|
srand(time(NULL));
|
|
|
|
// Run benchmarks for our F32 dot product functions. Benchmark them against a naieve implementation.
|
|
for (uint8_t runCount = 0; runCount < F32_RUNCOUNT; ++runCount)
|
|
{
|
|
struct timespec start, middle, end;
|
|
double vector_time;
|
|
double scalar_time;
|
|
float scalar = 0.0f;
|
|
float vector = 0.0f;
|
|
|
|
// Generate random input vector of [-1, 1] values.
|
|
float vec1[F32_MAXVEC] __attribute__((aligned(64)));
|
|
for (int i = 0; i < vecRuns[runCount]; i++)
|
|
vec1[i] = 2 * (0.5 - rand() / (float)RAND_MAX);
|
|
|
|
// Generate a second random input vector of [-1, 1] values.
|
|
float vec2[F32_MAXVEC] __attribute__((aligned(64)));
|
|
for (int i = 0; i < vecRuns[runCount]; i++)
|
|
vec2[i] = 2 * (0.5 - rand() / (float)RAND_MAX);
|
|
|
|
// on your mark..
|
|
clock_gettime(CLOCK_MONOTONIC, &start);
|
|
|
|
// call dot product
|
|
ggml_vec_dot_f32(vecRuns[runCount], &vector, 0, vec1, 0, vec2, 0, 0);
|
|
|
|
// save the middle point..
|
|
clock_gettime(CLOCK_MONOTONIC, &middle);
|
|
|
|
// do the same work by hand;
|
|
for (int i = 0; i < vecRuns[runCount]; ++i)
|
|
scalar += vec1[i]*vec2[i];
|
|
|
|
clock_gettime(CLOCK_MONOTONIC, &end);
|
|
|
|
printf("vector\tvs\tscalar (%d items)\n", vecRuns[runCount]);
|
|
printf("%.9f\tvs\t%.9f\n", vector, scalar);
|
|
|
|
vector_time = middle.tv_sec - start.tv_sec;
|
|
vector_time += (middle.tv_nsec - start.tv_nsec) / 1000000000.0;
|
|
|
|
scalar_time = end.tv_sec - middle.tv_sec;
|
|
scalar_time += (end.tv_nsec - middle.tv_nsec) / 1000000000.0;
|
|
|
|
printf("%.9f\tvs\t%.9f\n", vector_time, scalar_time);
|
|
}
|
|
|
|
fflush(stdout);
|
|
|
|
// Generate a random input vector of 256 4 bit values.
|
|
uint8x16_t q4[8];
|
|
uint8_t * q4ptr = (uint8_t *)q4;
|
|
for (int i = 0; i < 128; i++)
|
|
q4ptr[i] = rand() && 0xFF;
|
|
|
|
// Generate a random input vector of 256 1 bit values.
|
|
uint8x16_t q1[2];
|
|
uint8_t * q1ptr = (uint8_t *)q1;
|
|
for (int i = 0; i < 32; i++)
|
|
q1ptr[i] = rand() && 0xFF;
|
|
|
|
// Get our reference, unshifted result.
|
|
uint8x16_t q5[16];
|
|
GGML_5bit_Unpack_Unaligned(q4, (uint8_t *)q1, q5);
|
|
|
|
printf("successfully got a Q5.\n");
|
|
|
|
// Perform alignment tests, for GGML_5bit_Unpack_Unaligned.
|
|
// Try to run GGML_5bit_Unpack_Unaligned with all possible misalignments, and get it to fail.
|
|
for (uint8_t shiftCount = 1; shiftCount < 16; ++shiftCount)
|
|
{
|
|
uint8x16_t q5new[16];
|
|
uint8x16_t q4Shifted[9];
|
|
|
|
// create an off-by-shiftCount copy of q4.
|
|
q4ptr = ((uint8_t *)q4Shifted) + shiftCount;
|
|
memcpy (q4ptr, q4, 128);
|
|
|
|
// call the unaligned form of this function:
|
|
GGML_5bit_Unpack_Unaligned((uint8x16_t *)q4ptr, (uint8_t *)q1, q5new);
|
|
|
|
for (uint32_t byteCount = 0; byteCount < 256; ++byteCount)
|
|
{
|
|
if ( ((uint8_t *)q5new)[byteCount] != ((uint8_t *)q5)[byteCount] )
|
|
{
|
|
printf("whoops!\nshiftCount: %d\nbyteCount: %d\n", shiftCount, byteCount);
|
|
exit (-1);
|
|
}
|
|
}
|
|
|
|
printf("Got a Q5 offset by %d\n", shiftCount);
|
|
}
|
|
|
|
// Generate a random input vector of 256 8 bit values.
|
|
int8x16_t q8[16];
|
|
int8_t * q8ptr = (int8_t *)q8;
|
|
for (int i = 0; i < 256; i++)
|
|
q8ptr[i] = rand() && 0xFF;
|
|
|
|
// Generate eight random scales, one for each pair of sums.
|
|
uint8_t scale[8];
|
|
for (int i = 0; i < 8; i++)
|
|
scale[i] = rand() && 0xFF;
|
|
|
|
// Generate a random X scale.
|
|
float rndScaleX = 2 * (0.5 - rand() / (float)RAND_MAX);
|
|
ggml_fp16_t scaleX = GGML_PHI_FP32_TO_FP16(rndScaleX);
|
|
|
|
// Display the random X scale. Verifies FP32_TO_FP16_TO_FP32 is working.
|
|
printf("rndScaleX: %f\n", rndScaleX);
|
|
printf("scaleX: %x\n", scaleX);
|
|
printf("newScaleX: %f\n", GGML_PHI_FP16_TO_FP32(scaleX));
|
|
|
|
// Generate a random Y scale.
|
|
float scaleY = 2 * (0.5 - rand() / (float)RAND_MAX);
|
|
printf("scaleY: %f\n", scaleY);
|
|
|
|
// Create a place for our golden result.
|
|
float32x16_t res;
|
|
|
|
// Clear res.
|
|
GGML_F32x16_VEC_ZERO(&res);
|
|
|
|
// Generate an initial result, to compare to.
|
|
GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned (q8, q5, scale, scaleX, scaleY, &res);
|
|
|
|
// Generate a sum of the result.
|
|
float sum = 0.0f;
|
|
for (int l = 0; l < 16; ++l) sum += ((float *)&res)[l];
|
|
|
|
printf("Got a res: %f\n", sum);
|
|
|
|
// Perform alignment tests, for GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned.
|
|
// try to run GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned with all possible mis-alignments, and get it to fail.
|
|
for (uint8_t shiftCount = 1; shiftCount < 16; ++shiftCount)
|
|
{
|
|
float32x16_t resNew1;
|
|
int8x16_t q8Shifted[17];
|
|
|
|
// Create an off-by-shiftCount copy of q8.
|
|
q8ptr = ((int8_t *)q8Shifted)+shiftCount;
|
|
memcpy (q8ptr, q8, 256);
|
|
|
|
// Clear resNew.
|
|
GGML_F32x16_VEC_ZERO(&resNew1);
|
|
|
|
// Call the unaligned form of this function:
|
|
GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned ((int8x16_t *)q8ptr, q5, scale, scaleX, scaleY, &resNew1);
|
|
|
|
// check the result against our reference.
|
|
for (uint32_t floatCount = 0; floatCount < 64; ++floatCount)
|
|
{
|
|
if ( ((int8_t *)&resNew1)[floatCount] != ((int8_t *)&res)[floatCount] )
|
|
{
|
|
printf("whoops!\nshiftCount: %d\nfloatCount: %d\n", shiftCount, floatCount);
|
|
for (uint32_t row = 0; row < 16 ; ++row)
|
|
{
|
|
for (int col1 = 0; col1 < 4; ++col1)
|
|
{
|
|
printf("%2.2x\t", ((int8_t *)&resNew1)[(4*row)+col1]);
|
|
}
|
|
printf(" vs ");
|
|
for (int col2 = 0; col2 < 4; ++col2)
|
|
{
|
|
printf("%2.2x\t", ((int8_t *)&res)[(4*row)+col2]);
|
|
}
|
|
printf ("\n");
|
|
}
|
|
exit (-1);
|
|
}
|
|
}
|
|
|
|
// Generate a sum of our new result.
|
|
float sumf = 0.0f;
|
|
for (int l = 0; l < 16; ++l) sumf += ((float *)&resNew1)[l];
|
|
|
|
printf("Got a res from a Q8 offset by %d: %f\n", ((uint64_t) q8ptr) & 0x3F, sumf);
|
|
}
|
|
|
|
return 0;
|
|
}
|