llama.cpp/bench-phi-knc.c

214 lines
6.8 KiB
C

/* bench-phi-knc.c: benchmarks and tests for the Xeon PHI Knights Corner optimizations. */
#include <immintrin.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
/* For clock_gettime() and CLOCK_MONOTONIC. */
#include <unistd.h>
#include <time.h>
/* For memcpy */
#include <string.h>
/* Include the increasingly inaccurately named header for our F32 dot product code. */
#include "ggml-phi-knc.h"
/* include the header for our Q8K_Q5K dot product code. */
#include "ggml-phi-knc-dot_q5_K_q8_K.h"
// Largest Float32 vector, in elements, that we take the dot product of.
// Also used as the size of the two benchmark input buffers in main().
#define F32_MAXVEC 1024768
// How many F32 benchmark runs are performed in total (the length of the vecRuns array in main()).
#define F32_RUNCOUNT 12
// Number of vector elements used by each run, in run order; initializes vecRuns in main().
#define F32_ITEMS_PER_RUN {10, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 1024768}
/* Returns a pseudo-random float in [-1, 1], derived from rand(). */
static float bench_random_unit(void)
{
    return 2 * (0.5 - rand() / (float)RAND_MAX);
}

/* Returns the time elapsed between two clock_gettime() samples, in seconds. */
static double bench_elapsed_seconds(const struct timespec *from, const struct timespec *to)
{
    double seconds = to->tv_sec - from->tv_sec;
    seconds += (to->tv_nsec - from->tv_nsec) / 1000000000.0;
    return seconds;
}

/*
 * Benchmark and self-test driver.
 *
 * 1. Times ggml_vec_dot_f32() against a plain scalar loop for each vector
 *    length listed in F32_ITEMS_PER_RUN, printing both results and timings.
 * 2. Calls GGML_5bit_Unpack_Unaligned() on a reference input, then again on
 *    copies of that input shifted by 1..15 bytes, and exits with -1 if any
 *    output byte differs from the reference.
 * 3. Does the same for
 *    GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned(), shifting
 *    the Q8 input by 1..15 bytes and dumping both results on mismatch.
 *
 * Returns 0 when every check passes.
 */
int main(void)
{
    int vecRuns[F32_RUNCOUNT] = F32_ITEMS_PER_RUN;

    /* static: the two F32_MAXVEC buffers total roughly 8 MB, which is too
     * much to place on the stack safely. */
    static float vec1[F32_MAXVEC] __attribute__((aligned(64)));
    static float vec2[F32_MAXVEC] __attribute__((aligned(64)));

    // Seed the random number generator.
    srand((unsigned int)time(NULL));

    // Run benchmarks for our F32 dot product functions. Benchmark them against a naive implementation.
    for (int runCount = 0; runCount < F32_RUNCOUNT; ++runCount)
    {
        const int items = vecRuns[runCount];
        struct timespec start, middle, end;
        float scalar = 0.0f;
        float vector = 0.0f;

        // Generate two random input vectors of [-1, 1] values.
        for (int i = 0; i < items; i++)
            vec1[i] = bench_random_unit();
        for (int i = 0; i < items; i++)
            vec2[i] = bench_random_unit();

        // On your mark..
        clock_gettime(CLOCK_MONOTONIC, &start);
        // Call the dot product under test.
        ggml_vec_dot_f32(items, &vector, 0, vec1, 0, vec2, 0, 0);
        // Save the middle point..
        clock_gettime(CLOCK_MONOTONIC, &middle);
        // Do the same work by hand.
        for (int i = 0; i < items; ++i)
            scalar += vec1[i] * vec2[i];
        clock_gettime(CLOCK_MONOTONIC, &end);

        printf("vector\tvs\tscalar (%d items)\n", items);
        printf("%.9f\tvs\t%.9f\n", vector, scalar);
        printf("%.9f\tvs\t%.9f\n",
               bench_elapsed_seconds(&start, &middle),
               bench_elapsed_seconds(&middle, &end));
    }
    fflush(stdout);

    // Generate a random input vector of 256 4 bit values (128 random bytes).
    uint8x16_t q4[8];
    uint8_t *q4ptr = (uint8_t *)q4;
    for (int i = 0; i < 128; i++)
        q4ptr[i] = (uint8_t)(rand() & 0xFF); // bitwise AND: keep the low byte of rand().

    // Generate a random input vector of 256 1 bit values (32 random bytes).
    uint8x16_t q1[2];
    uint8_t *q1ptr = (uint8_t *)q1;
    for (int i = 0; i < 32; i++)
        q1ptr[i] = (uint8_t)(rand() & 0xFF);

    // Get our reference, unshifted result.
    uint8x16_t q5[16];
    GGML_5bit_Unpack_Unaligned(q4, (uint8_t *)q1, q5);
    printf("successfully got a Q5.\n");

    // Perform alignment tests, for GGML_5bit_Unpack_Unaligned.
    // Try to run GGML_5bit_Unpack_Unaligned with all possible misalignments, and get it to fail.
    for (uint8_t shiftCount = 1; shiftCount < 16; ++shiftCount)
    {
        uint8x16_t q5new[16];
        // One spare element so a copy shifted by up to 15 bytes still fits.
        uint8x16_t q4Shifted[9];

        // Create an off-by-shiftCount copy of q4.
        q4ptr = ((uint8_t *)q4Shifted) + shiftCount;
        memcpy(q4ptr, q4, 128);

        // Call the unaligned form of this function:
        GGML_5bit_Unpack_Unaligned((uint8x16_t *)q4ptr, (uint8_t *)q1, q5new);

        // Every output byte must match the reference.
        for (int byteCount = 0; byteCount < 256; ++byteCount)
        {
            if (((uint8_t *)q5new)[byteCount] != ((uint8_t *)q5)[byteCount])
            {
                printf("whoops!\nshiftCount: %d\nbyteCount: %d\n", shiftCount, byteCount);
                exit(-1);
            }
        }
        printf("Got a Q5 offset by %d\n", shiftCount);
    }

    // Generate a random input vector of 256 8 bit values.
    int8x16_t q8[16];
    int8_t *q8ptr = (int8_t *)q8;
    for (int i = 0; i < 256; i++)
        q8ptr[i] = (int8_t)(rand() & 0xFF);

    // Generate eight random scales, one for each pair of sums.
    uint8_t scale[8];
    for (int i = 0; i < 8; i++)
        scale[i] = (uint8_t)(rand() & 0xFF);

    // Generate a random X scale.
    float rndScaleX = bench_random_unit();
    ggml_fp16_t scaleX = GGML_PHI_FP32_TO_FP16(rndScaleX);

    // Display the random X scale. Verifies FP32_TO_FP16_TO_FP32 is working.
    printf("rndScaleX: %f\n", rndScaleX);
    printf("scaleX: %x\n", scaleX);
    printf("newScaleX: %f\n", GGML_PHI_FP16_TO_FP32(scaleX));

    // Generate a random Y scale.
    float scaleY = bench_random_unit();
    printf("scaleY: %f\n", scaleY);

    // Create a place for our golden result, and clear it.
    float32x16_t res;
    GGML_F32x16_VEC_ZERO(&res);

    // Generate an initial result, to compare to.
    GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned(q8, q5, scale, scaleX, scaleY, &res);

    // Generate a sum of the result.
    float sum = 0.0f;
    for (int l = 0; l < 16; ++l)
        sum += ((float *)&res)[l];
    printf("Got a res: %f\n", sum);

    // Perform alignment tests, for GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned.
    // Try to run it with all possible mis-alignments, and get it to fail.
    for (uint8_t shiftCount = 1; shiftCount < 16; ++shiftCount)
    {
        float32x16_t resNew1;
        // One spare element so a copy shifted by up to 15 bytes still fits.
        int8x16_t q8Shifted[17];

        // Create an off-by-shiftCount copy of q8.
        q8ptr = ((int8_t *)q8Shifted) + shiftCount;
        memcpy(q8ptr, q8, 256);

        // Clear resNew1.
        GGML_F32x16_VEC_ZERO(&resNew1);

        // Call the unaligned form of this function:
        GGML_8X_2xI8x16_2xI8x16_MUL_2xI16x16_S_FMA_I32x16_Unaligned((int8x16_t *)q8ptr, q5, scale, scaleX, scaleY, &resNew1);

        // Check the result against our reference, byte by byte (64 bytes are compared).
        const uint8_t *newBytes = (const uint8_t *)&resNew1;
        const uint8_t *refBytes = (const uint8_t *)&res;
        for (int floatCount = 0; floatCount < 64; ++floatCount)
        {
            if (newBytes[floatCount] != refBytes[floatCount])
            {
                printf("whoops!\nshiftCount: %d\nfloatCount: %d\n", shiftCount, floatCount);
                // Dump both results, four bytes per row. Bytes are read as
                // unsigned so that values >= 0x80 print as two hex digits.
                for (int row = 0; row < 16; ++row)
                {
                    for (int col1 = 0; col1 < 4; ++col1)
                    {
                        printf("%2.2x\t", (unsigned int)newBytes[(4 * row) + col1]);
                    }
                    printf(" vs ");
                    for (int col2 = 0; col2 < 4; ++col2)
                    {
                        printf("%2.2x\t", (unsigned int)refBytes[(4 * row) + col2]);
                    }
                    printf("\n");
                }
                exit(-1);
            }
        }

        // Generate a sum of our new result.
        float sumf = 0.0f;
        for (int l = 0; l < 16; ++l)
            sumf += ((float *)&resNew1)[l];
        // Report the low six bits of the shifted buffer's address.
        printf("Got a res from a Q8 offset by %d: %f\n", (int)((uintptr_t)q8ptr & 0x3F), sumf);
    }
    return 0;
}