diff --git a/docs/ops.md b/docs/ops.md index b395d2315c..2b2770cb76 100644 --- a/docs/ops.md +++ b/docs/ops.md @@ -32,7 +32,7 @@ Legend: | CONV_TRANSPOSE_1D | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | CONV_TRANSPOSE_2D | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | | COS | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | ❌ | ❌ | -| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | +| COUNT_EQUAL | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | | CPY | ❌ | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | 🟡 | ❌ | ❌ | | CROSS_ENTROPY_LOSS | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | | CROSS_ENTROPY_LOSS_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | diff --git a/docs/ops/Metal.csv b/docs/ops/Metal.csv index 5f7450e91f..02fd75fdbf 100644 --- a/docs/ops/Metal.csv +++ b/docs/ops/Metal.csv @@ -965,6 +965,7 @@ "Metal","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,1,2560],ne_kernel=[3,3,1,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","1","yes","Metal" "Metal","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[12,12,2,2560],ne_kernel=[3,3,2,2560],s0=1,s1=1,p0=1,p1=1,d0=1,d1=1,is_2D=1","support","1","yes","Metal" "Metal","IM2COL","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[5,5,1,32],ne_kernel=[3,4,1,32],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","1","yes","Metal" +"Metal","IM2COL","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[2,2,1536,729],ne_kernel=[2,2,1536,4096],s0=1,s1=1,p0=0,p1=0,d0=1,d1=1,is_2D=1","support","1","yes","Metal" "Metal","IM2COL_3D","type_input=f32,type_kernel=f32,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","Metal" "Metal","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f32,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","Metal" "Metal","IM2COL_3D","type_input=f32,type_kernel=f16,dst_type=f16,ne_input=[10,10,10,9],ne_kernel=[3,3,3,1],IC=3,s0=1,s1=1,s2=1,p0=1,p1=1,p2=1,d0=1,d1=1,d2=1,v=0","support","0","no","Metal" @@ -4964,8 +4965,9 @@ "Metal","CONV_TRANSPOSE_1D","ne_input=[2,1,1,1],ne_kernel=[3,1,1,1],s0=1,p0=0,d0=1","support","1","yes","Metal" "Metal","CONV_TRANSPOSE_2D","ne_input=[3,2,3,1],ne_kernel=[2,2,1,3],stride=1","support","1","yes","Metal" "Metal","CONV_TRANSPOSE_2D","ne_input=[10,10,9,1],ne_kernel=[3,3,1,9],stride=2","support","1","yes","Metal" -"Metal","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","0","no","Metal" -"Metal","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","0","no","Metal" +"Metal","CONV_TRANSPOSE_2D","ne_input=[129,63,35,1],ne_kernel=[3,3,48,35],stride=1","support","1","yes","Metal" +"Metal","COUNT_EQUAL","type=f32,ne=[4,500,1,1]","support","1","yes","Metal" +"Metal","COUNT_EQUAL","type=f32,ne=[4,5000,1,1]","support","1","yes","Metal" "Metal","ARGMAX","type=f32,ne=[32,1,1,1]","support","1","yes","Metal" "Metal","ARGMAX","type=f32,ne=[32,513,1,1]","support","1","yes","Metal" "Metal","ARGMAX","type=f32,ne=[100,10,1,1]","support","1","yes","Metal" @@ -5715,15 +5717,15 @@ "Metal","L2_NORM","type=f32,ne=[64,5,4,3]","support","1","yes","Metal" "Metal","RMS_NORM","type=f32,ne=[64,5,4,3],v=0,eps=0.000001,inplace=1","support","1","yes","Metal" "Metal","L2_NORM","type=f32,ne=[64,5,4,3]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[3,1024,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[4,1536,1,1],ne_b=[3,1536,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[8,1536,1,1],ne_b=[3,1536,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[4,1536,4,1],ne_b=[3,1536,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[3,2048,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[3,2048,1,1]","support","1","yes","Metal" -"Metal","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[3,2048,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[3,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[6,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[3,1024,4,1],ne_b=[3,1024,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[3,1536,1,1],ne_b=[3,1536,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[6,1536,1,1],ne_b=[3,1536,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[3,1536,4,1],ne_b=[3,1536,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[3,2048,1,1],ne_b=[3,2048,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[6,2048,1,1],ne_b=[3,2048,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[3,2048,4,1],ne_b=[3,2048,1,1]","support","1","yes","Metal" "Metal","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[4,1024,1,1]","support","1","yes","Metal" "Metal","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[4,1024,1,1]","support","1","yes","Metal" "Metal","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[4,1024,1,1]","support","1","yes","Metal" @@ -5733,6 +5735,15 @@ "Metal","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[4,2048,1,1]","support","1","yes","Metal" "Metal","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[4,2048,1,1]","support","1","yes","Metal" "Metal","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[4,2048,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[9,1024,1,1],ne_b=[9,1024,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[18,1024,1,1],ne_b=[9,1024,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[9,1024,4,1],ne_b=[9,1024,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[9,1536,1,1],ne_b=[9,1536,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[18,1536,1,1],ne_b=[9,1536,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[9,1536,4,1],ne_b=[9,1536,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[9,2048,1,1],ne_b=[9,2048,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[18,2048,1,1],ne_b=[9,2048,1,1]","support","1","yes","Metal" +"Metal","SSM_CONV","type=f32,ne_a=[9,2048,4,1],ne_b=[9,2048,1,1]","support","1","yes","Metal" "Metal","SSM_SCAN","type=f32,d_state=16,head_dim=1,n_head=1024,n_group=1,n_seq_tokens=32,n_seqs=4","support","1","yes","Metal" "Metal","SSM_SCAN","type=f32,d_state=128,head_dim=64,n_head=16,n_group=2,n_seq_tokens=32,n_seqs=4","support","1","yes","Metal" "Metal","SSM_SCAN","type=f32,d_state=256,head_dim=64,n_head=8,n_group=2,n_seq_tokens=32,n_seqs=4","support","1","yes","Metal" @@ -8916,6 +8927,8 @@ "Metal","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=0,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=0.000000,inplace=0","support","1","yes","Metal" "Metal","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=1,m_prec=f32,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","1","yes","Metal" "Metal","SOFT_MAX","type=f32,ne=[32,2,32,1],mask=1,sinks=1,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","1","yes","Metal" +"Metal","SOFT_MAX","type=f32,ne=[200001,2,3,1],mask=1,sinks=1,m_prec=f32,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","1","yes","Metal" +"Metal","SOFT_MAX","type=f32,ne=[200001,2,3,1],mask=1,sinks=1,m_prec=f16,nr23=[1,1],scale=0.100000,max_bias=8.000000,inplace=0","support","1","yes","Metal" "Metal","SOFT_MAX_BACK","type=f32,ne=[16,16,1,1],scale=1.000000,max_bias=0.000000","support","0","no","Metal" "Metal","SOFT_MAX_BACK","type=f32,ne=[15,15,1,1],scale=1.000000,max_bias=0.000000","support","0","no","Metal" "Metal","SOFT_MAX_BACK","type=f32,ne=[16,16,2,3],scale=1.000000,max_bias=0.000000","support","0","no","Metal" @@ -9542,311 +9555,311 @@ "Metal","ARGSORT","type=f32,ne=[2048,2,1,3],order=1","support","1","yes","Metal" "Metal","ARGSORT","type=f32,ne=[2049,2,1,3],order=1","support","1","yes","Metal" "Metal","ARGSORT","type=f32,ne=[2,8,8192,1],order=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[12,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[13,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[13,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[15,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[15,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[15,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=100","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=500","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=1023","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=9999","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=1","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=2","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=3","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=7","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=15","support","1","yes","Metal" -"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=15","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[12,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[13,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[13,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[15,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[15,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[15,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[19,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[27,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[43,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[64,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[75,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[128,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[139,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[256,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[267,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[512,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[523,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1035,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2059,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4096,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[4107,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8192,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[8203,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16395,1,2,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32768,1,1,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[32779,1,2,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65536,1,1,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[65547,1,2,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131072,1,1,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[131083,1,2,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262144,1,1,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[262155,1,2,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=100,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=500,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=1023,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524288,1,1,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[524299,1,2,1],k=9999,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=1,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=2,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=3,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=7,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16,10,10,10],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[60,10,10,10],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1023,2,1,3],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1024,2,1,3],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[1025,2,1,3],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[16384,1,1,1],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2047,2,1,3],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2048,2,1,3],k=15,ties=0","support","1","yes","Metal" +"Metal","TOP_K","type=f32,ne=[2049,2,1,3],k=15,ties=0","support","1","yes","Metal" "Metal","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=0","support","1","yes","Metal" "Metal","UPSCALE","type=f32,ne=[512,512,3,2],scale_factor=2,mode=nearest,transpose=1","support","1","yes","Metal" "Metal","UPSCALE","type=f32,ne=[2,5,7,11],ne_tgt=[5,7,11,13],mode=nearest,flags=none","support","1","yes","Metal" @@ -9891,8 +9904,9 @@ "Metal","GROUP_NORM","type=f32,ne=[64,64,320,1],num_groups=32,eps=0.000001","support","1","yes","Metal" "Metal","GROUP_NORM","type=f32,ne=[9,9,1280,1],num_groups=32,eps=0.000001","support","1","yes","Metal" "Metal","ACC","type=f32,ne_a=[256,17,1,1],ne_b=[256,16,1,1]","support","1","yes","Metal" -"Metal","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1","support","1","yes","Metal" -"Metal","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[512,512,1,1],pad_0=1,pad_1=1,circular=0","support","1","yes","Metal" +"Metal","PAD","type=f32,ne_a=[33,17,2,1],pad_0=4,pad_1=3,circular=1","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[512,512,3,1],lp0=1,rp0=1,lp1=1,rp1=1,lp2=1,rp2=1,lp3=1,rp3=1,v=0,circular=0","support","0","no","Metal" "Metal","PAD_REFLECT_1D","type=f32,ne_a=[512,34,2,1],pad_0=10,pad_1=9","support","1","yes","Metal" "Metal","PAD_REFLECT_1D","type=f32,ne_a=[3000,384,4,1],pad_0=10,pad_1=9","support","1","yes","Metal" "Metal","ROLL","shift0=3,shift1=-2,shift3=1,shift4=-1","support","0","no","Metal" @@ -9923,17 +9937,41 @@ "Metal","FILL","type=f32,ne=[303,207,11,3],c=2.000000","support","1","yes","Metal" "Metal","FILL","type=f32,ne=[800,600,4,4],c=-152.000000","support","1","yes","Metal" "Metal","FILL","type=f32,ne=[2048,512,2,2],c=3.500000","support","1","yes","Metal" +"Metal","DIAG","type=f32,ne=[10,1,4,3]","support","0","no","Metal" +"Metal","DIAG","type=f32,ne=[79,1,19,13]","support","0","no","Metal" +"Metal","DIAG","type=f32,ne=[256,1,8,16]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[10,10,4,3],ne_rhs=[3,10,4,3]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[11,11,1,1],ne_rhs=[5,11,1,1]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[17,17,2,4],ne_rhs=[9,17,2,4]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[30,30,7,1],ne_rhs=[8,30,7,1]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[42,42,5,2],ne_rhs=[10,42,5,2]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[10,64,2,2]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[64,64,2,2],ne_rhs=[64,64,2,2]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[79,79,5,3],ne_rhs=[417,79,5,3]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,2],ne_rhs=[32,128,4,2]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[80,80,2,8]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[79,80,2,8]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[80,80,2,8],ne_rhs=[81,80,2,8]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[80,80,8,8]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[79,80,8,8]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[80,80,8,8],ne_rhs=[81,80,8,8]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[84,84,4,4],ne_rhs=[32,84,4,4]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[95,95,8,8],ne_rhs=[40,95,8,8]","support","0","no","Metal" "Metal","SOLVE_TRI","type=f32,ne_lhs=[100,100,4,4],ne_rhs=[41,100,4,4]","support","0","no","Metal" -"Metal","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0","support","1","yes","Metal" -"Metal","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0","support","0","no","Metal" -"Metal","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1","support","1","yes","Metal" -"Metal","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[31,128,4,4]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,4],ne_rhs=[32,128,4,4]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[128,128,3,4],ne_rhs=[32,128,3,4]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[128,128,4,1],ne_rhs=[32,128,4,1]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[200,64,4,4]","support","0","no","Metal" +"Metal","SOLVE_TRI","type=f32,ne_lhs=[64,64,4,4],ne_rhs=[384,64,4,4]","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=0","support","1","yes","Metal" +"Metal","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=0","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=0,circular=1","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=0,circular=1","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=0","support","1","yes","Metal" +"Metal","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=0","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[512,512,1,1],lp0=0,rp0=1,lp1=0,rp1=1,lp2=0,rp2=0,lp3=0,rp3=0,v=1,circular=1","support","0","no","Metal" +"Metal","PAD","type=f32,ne_a=[11,22,33,44],lp0=1,rp0=2,lp1=3,rp1=4,lp2=5,rp2=6,lp3=7,rp3=8,v=1,circular=1","support","0","no","Metal" "Metal","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f32,permute=[0,1,2,3]","support","1","yes","Metal" "Metal","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=f16,permute=[0,1,2,3]","support","1","yes","Metal" "Metal","FLASH_ATTN_EXT","hsk=40,hsv=40,nh=4,nr23=[1,1],kv=113,nb=1,mask=1,sinks=1,max_bias=0.000000,logit_softcap=0.000000,prec=f32,type_KV=bf16,permute=[0,1,2,3]","support","1","yes","Metal" diff --git a/ggml/src/ggml-metal/ggml-metal-device.cpp b/ggml/src/ggml-metal/ggml-metal-device.cpp index 680904d132..b0734797f1 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.cpp +++ b/ggml/src/ggml-metal/ggml-metal-device.cpp @@ -1684,3 +1684,60 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_sgd(ggm return res; } + +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_memset(ggml_metal_library_t lib, const ggml_tensor * op) { + GGML_ASSERT(op->type == GGML_TYPE_I64); + + char base[256]; + char name[256]; + + snprintf(base, 256, "kernel_memset_%s", ggml_type_name(op->type)); + snprintf(name, 256, "%s", base); + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr); + } + + return res; +} + +ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_count_equal(ggml_metal_library_t lib, const ggml_tensor * op) { + assert(op->op == GGML_OP_COUNT_EQUAL); + + GGML_TENSOR_LOCALS(int64_t, ne0, op->src[0], ne); + + GGML_ASSERT(op->src[0]->type == op->src[1]->type); + GGML_ASSERT(op->src[0]->type == GGML_TYPE_I32); + GGML_ASSERT(op->type == GGML_TYPE_I64); + + // note: the kernel only supports i32 output due to metal atomic add only supporting atomic_int + GGML_ASSERT(ggml_nelements(op->src[0]) < (1LL << 31)); + + char base[256]; + char name[256]; + + int nsg = 1; + while (32*nsg < ne00 && nsg < 32) { + nsg *= 2; + } + + snprintf(base, 256, "kernel_count_equal_%s", ggml_type_name(op->src[0]->type)); + snprintf(name, 256, "%s_nsg=%d", base, nsg); + + ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name); + if (!res.pipeline) { + ggml_metal_cv_t cv = ggml_metal_cv_init(); + + ggml_metal_cv_set_int16(cv, nsg, FC_COUNT_EQUAL + 0); + + res = ggml_metal_library_compile_pipeline(lib, base, name, cv); + + ggml_metal_cv_free(cv); + } + + res.smem = 32 * sizeof(int32_t); + res.nsg = nsg; + + return res; +} diff --git a/ggml/src/ggml-metal/ggml-metal-device.h b/ggml/src/ggml-metal/ggml-metal-device.h index 0a8b9211a7..d983b666ca 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.h +++ b/ggml/src/ggml-metal/ggml-metal-device.h @@ -147,6 +147,8 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_arange struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_timestep_embedding(ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_adamw (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_opt_step_sgd (ggml_metal_library_t lib, const struct ggml_tensor * op); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_memset (ggml_metal_library_t lib, const struct ggml_tensor * op); +struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_count_equal (ggml_metal_library_t lib, const struct ggml_tensor * op); struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_flash_attn_ext_pad( ggml_metal_library_t lib, diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m index f24270bb1c..59badd0043 100644 --- a/ggml/src/ggml-metal/ggml-metal-device.m +++ b/ggml/src/ggml-metal/ggml-metal-device.m @@ -1023,6 +1023,11 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te return has_simdgroup_reduction && ggml_is_contiguous_rows(op->src[0]); case GGML_OP_L2_NORM: return has_simdgroup_reduction && (op->ne[0] % 4 == 0 && ggml_is_contiguous_1(op->src[0])); + case GGML_OP_COUNT_EQUAL: + return has_simdgroup_reduction && + op->src[0]->type == GGML_TYPE_I32 && + op->src[1]->type == GGML_TYPE_I32 && + op->type == GGML_TYPE_I64; case GGML_OP_ARGMAX: return has_simdgroup_reduction; case GGML_OP_NORM: diff --git a/ggml/src/ggml-metal/ggml-metal-impl.h b/ggml/src/ggml-metal/ggml-metal-impl.h index 8944b07e90..d3b0e732ec 100644 --- a/ggml/src/ggml-metal/ggml-metal-impl.h +++ b/ggml/src/ggml-metal/ggml-metal-impl.h @@ -78,6 +78,7 @@ #define FC_MUL_MM 700 #define FC_ROPE 800 #define FC_SSM_CONV 900 +#define FC_COUNT_EQUAL 1000 // op-specific constants #define OP_FLASH_ATTN_EXT_NQPTG 8 @@ -894,6 +895,25 @@ typedef struct { float step; } ggml_metal_kargs_arange; +typedef struct { + int64_t val; +} ggml_metal_kargs_memset; + +typedef struct { + int32_t ne00; + int32_t ne01; + int32_t ne02; + int32_t ne03; + uint64_t nb00; + uint64_t nb01; + uint64_t nb02; + uint64_t nb03; + uint64_t nb10; + uint64_t nb11; + uint64_t nb12; + uint64_t nb13; +} ggml_metal_kargs_count_equal; + typedef struct { int32_t k0; int32_t k1; diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp index e99c1763f6..acf2aa9184 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.cpp +++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp @@ -448,7 +448,11 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) { { n_fuse = ggml_metal_op_opt_step_sgd(ctx, idx); } break; - default: + case GGML_OP_COUNT_EQUAL: + { + n_fuse = ggml_metal_op_count_equal(ctx, idx); + } break; + default: { GGML_LOG_ERROR("%s: error: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(node->op)); GGML_ABORT("fatal error"); @@ -4090,3 +4094,64 @@ int ggml_metal_op_opt_step_sgd(ggml_metal_op_t ctx, int idx) { return 1; } + +int ggml_metal_op_count_equal(ggml_metal_op_t ctx, int idx) { + ggml_tensor * op = ctx->node(idx); + + ggml_metal_library_t lib = ctx->lib; + ggml_metal_encoder_t enc = ctx->enc; + + GGML_TENSOR_LOCALS(int32_t, ne0, op->src[0], ne); + GGML_TENSOR_LOCALS(uint64_t, nb0, op->src[0], nb); + GGML_TENSOR_LOCALS(uint64_t, nb1, op->src[1], nb); + + { + ggml_metal_kargs_memset args = { /*.val =*/ 0 }; + + auto pipeline = ggml_metal_library_get_pipeline_memset(lib, op); + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op), 1); + + ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, 1, 1, 1); + } + + ggml_metal_op_concurrency_reset(ctx); + + { + ggml_metal_kargs_count_equal args = { + /*.ne00 =*/ ne00, + /*.ne01 =*/ ne01, + /*.ne02 =*/ ne02, + /*.ne03 =*/ ne03, + /*.nb00 =*/ nb00, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.nb10 =*/ nb10, + /*.nb11 =*/ nb11, + /*.nb12 =*/ nb12, + /*.nb13 =*/ nb13, + }; + + auto pipeline = ggml_metal_library_get_pipeline_count_equal(lib, op); + + const size_t smem = pipeline.smem; + + const int nth = 32*pipeline.nsg; + + GGML_ASSERT(nth <= ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)); + + ggml_metal_encoder_set_pipeline(enc, pipeline); + ggml_metal_encoder_set_bytes(enc, &args, sizeof(args), 0); + ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[0]), 1); + ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op->src[1]), 2); + ggml_metal_encoder_set_buffer(enc, ggml_metal_get_buffer_id(op), 3); + + ggml_metal_encoder_set_threadgroup_memory_size(enc, smem, 0); + ggml_metal_encoder_dispatch_threadgroups(enc, ne01, ne02, ne03, nth, 1, 1); + } + + return 1; +} diff --git a/ggml/src/ggml-metal/ggml-metal-ops.h b/ggml/src/ggml-metal/ggml-metal-ops.h index 902b544523..c1025d3567 100644 --- a/ggml/src/ggml-metal/ggml-metal-ops.h +++ b/ggml/src/ggml-metal/ggml-metal-ops.h @@ -87,6 +87,7 @@ int ggml_metal_op_leaky_relu (ggml_metal_op_t ctx, int idx); int ggml_metal_op_tri (ggml_metal_op_t ctx, int idx); int ggml_metal_op_opt_step_adamw (ggml_metal_op_t ctx, int idx); int ggml_metal_op_opt_step_sgd (ggml_metal_op_t ctx, int idx); +int ggml_metal_op_count_equal (ggml_metal_op_t ctx, int idx); #ifdef __cplusplus } diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 3154beff9b..67b30e0d93 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -1790,6 +1790,7 @@ kernel void kernel_op_sum_f32( return; } + // TODO: become function constant const uint nsg = (ntg.x + 31) / 32; float sumf = 0; @@ -9914,3 +9915,75 @@ kernel void kernel_opt_step_sgd_f32( x[gid] = x[gid] * (1.0f - pars[0] * pars[1]) - pars[0] * g[gid]; } + +template +kernel void kernel_memset( + constant ggml_metal_kargs_fill & args, + device T * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = args.val; +} + +typedef decltype(kernel_memset) kernel_memset_t; + +template [[host_name("kernel_memset_i64")]] kernel kernel_memset_t kernel_memset; + +constant short FC_count_equal_nsg [[function_constant(FC_COUNT_EQUAL + 0)]]; + +template +kernel void kernel_count_equal( + constant ggml_metal_kargs_count_equal & args, + device const char * src0, + device const char * src1, + device atomic_int * dst, + threadgroup int32_t * shmem_i32 [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + ushort3 tpitg[[thread_position_in_threadgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort3 ntg[[threads_per_threadgroup]]) { + const short NSG = FC_count_equal_nsg; + + const int i3 = tgpig.z; + const int i2 = tgpig.y; + const int i1 = tgpig.x; + + if (i3 >= args.ne03 || i2 >= args.ne02 || i1 >= args.ne01) { + return; + } + + int sum = 0; + + device const char * base0 = src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03; + device const char * base1 = src1 + i1*args.nb11 + i2*args.nb12 + i3*args.nb13; + + for (int64_t i0 = tpitg.x; i0 < args.ne00; i0 += ntg.x) { + const T v0 = *(device const T *)(base0 + i0*args.nb00); + const T v1 = *(device const T *)(base1 + i0*args.nb10); + sum += (v0 == v1); + } + + sum = simd_sum(sum); + + if (tiisg == 0) { + shmem_i32[sgitg] = sum; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + if (sgitg == 0) { + float v = 0.0f; + if (tpitg.x < NSG) { + v = shmem_i32[tpitg.x]; + } + + float total = simd_sum(v); + if (tpitg.x == 0) { + atomic_fetch_add_explicit(dst, (int32_t) total, memory_order_relaxed); + } + } +} + +typedef decltype(kernel_count_equal) kernel_count_equal_t; + +template [[host_name("kernel_count_equal_i32")]] kernel kernel_count_equal_t kernel_count_equal;