- for (int j = 0; j * OutputSimdWidth < OutputDimensions; ++j)
- vec_add_dpbusd_32x4(outptr[j], in0, col0[j], in1, col1[j], in2, col2[j], in3, col3[j]);
+ for (IndexType k = 0; k < NumRegs; ++k)
+ vec_add_dpbusd_32x4(outs[k], in0, col0[k], in1, col1[k], in2, col2[k], in3, col3[k]);