+%endmacro
+
+; Load rows of 4 words from the given operands and transpose them into m0..m3.
+; in:  4 rows (mmsize==8, %1..%4) or 8 rows (otherwise, %1..%8) of 4 words
+; out: 4 rows of 4 or 8 words in m0..m3; in the 8-row case, word j of m<i>
+;      is word i of the row passed as argument j+1
+; clobbers: m4, m5, m6, m7
+%macro TRANSPOSE4x8W_LOAD 8
+%if mmsize==8
+    ; 8-byte registers: only the first four rows are forwarded
+    TRANSPOSE4x4W_LOAD %1, %2, %3, %4
+%else
+    ; rows 0-3: m0 = columns 0|1, m2 = columns 2|3 (4 words per column)
+    movq       m0, %1
+    movq       m1, %3
+    movq       m2, %2
+    movq       m3, %4
+    punpcklwd  m0, m2              ; words of rows 0/1 interleaved
+    punpcklwd  m1, m3              ; words of rows 2/3 interleaved
+    punpckhdq  m2, m0, m1          ; m0 is still needed, so write the high half to m2
+    punpckldq  m0, m1
+
+    ; rows 4-7: m4 = columns 0|1, m6 = columns 2|3 (4 words per column)
+    movq       m4, %5
+    movq       m5, %7
+    movq       m6, %6
+    movq       m7, %8
+    punpcklwd  m4, m6              ; words of rows 4/5 interleaved
+    punpcklwd  m5, m7              ; words of rows 6/7 interleaved
+    punpckhdq  m6, m4, m5
+    punpckldq  m4, m5
+
+    ; join the upper-row and lower-row halves of each column;
+    ; the odd column is extracted first because the low merge overwrites its source
+    punpckhqdq m1, m0, m4          ; column 1
+    punpcklqdq m0, m4              ; column 0
+    punpckhqdq m3, m2, m6          ; column 3
+    punpcklqdq m2, m6              ; column 2
+%endif
+%endmacro
+
+; in: 2 rows of 4/8 words in m1..m2
+; out: 4/8 rows of 2 words in %1..%8
+; clobbers: m0, m1
+%macro TRANSPOSE8x2W_STORE 8
+%if mmsize==8
+ TRANSPOSE4x2W_STORE %1, %2, %3, %4