hi, i am programmer from china.
i find the miscalculation about InvertMatrix4s.
i try a test
RM := InvertMatrix4s1(M2);
M := RM;
MulMatrix4s1(RM, M2, M);
but RM is not identity matrix
so i think it is a miscalculation.
In addition, i'd love to provide my sse version, i test that it is right.
function InvertMatrix4s2(M: TMatrix4s): TMatrix4s;
begin
asm
movlps xmm7, [eax]
movhps xmm7, [eax+$10] // tmp7 xmm0
// 4 tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
movlps xmm1, [eax+$20]
movhps xmm1, [eax+$30] // row1 xmm1
// 5 row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
movaps xmm0, xmm7
shufps xmm0, xmm1, $88
shufps xmm1, xmm7, $DD
// 6 row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
// 7 row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
movlps xmm7, [eax+$8]
movhps xmm7, [eax+$18]
// 8 tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
movlps xmm3, [eax+$28]
movhps xmm3, [eax+$38]
// 9 row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
movaps xmm2, xmm7
shufps xmm2, xmm3, $88
shufps xmm3, xmm7, $DD
// 10 row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
// 11 row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
// -----------------------------------------------
movaps xmm7, xmm2
mulps xmm7, xmm3
// 12 tmp1 = _mm_mul_ps(row2, row3);
shufps xmm7, xmm7, $B1
// 13 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
movaps xmm4, xmm1
mulps xmm4, xmm7
// 14 minor0 = _mm_mul_ps(row1, tmp1);
movaps xmm5, xmm0
mulps xmm5, xmm7
// 15 minor1 = _mm_mul_ps(row0, tmp1);
shufps xmm7, xmm7, $4E
// 16 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
movaps xmm6, xmm1
mulps xmm6, xmm7
subps xmm6, xmm4
movaps xmm4, xmm6
// 17 minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
movaps xmm6, xmm0
mulps xmm6, xmm7
subps xmm6, xmm5
movaps xmm5, xmm6
// 18 minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
shufps xmm5, xmm5, $4E
// 19 minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
movups [edx], xmm4
movups [edx+$10], xmm5
// -----------------------------------------------
movaps xmm7, xmm1
mulps xmm7, xmm2
// 20 tmp1 = _mm_mul_ps(row1, row2);
shufps xmm7, xmm7, $B1
// 21 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
movaps xmm6, xmm3
mulps xmm6, xmm7
addps xmm4, xmm6
// 22 minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
movaps xmm5, xmm0
mulps xmm5, xmm7
// 23 minor3 = _mm_mul_ps(row0, tmp1);
shufps xmm7, xmm7, $4E
// 24 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
movaps xmm6, xmm3
mulps xmm6, xmm7
subps xmm4, xmm6
// 25 minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
movaps xmm6, xmm0
mulps xmm6, xmm7
subps xmm6, xmm5
movaps xmm5, xmm6
// 26 minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
shufps xmm5, xmm5, $4E
// 27 minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
movups [edx], xmm4
movups [edx+$30], xmm5
// -----------------------------------------------
movaps xmm7, xmm1
shufps xmm7, xmm7, $4E
mulps xmm7, xmm3
// 28 tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
shufps xmm7, xmm7, $B1
// 29 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
shufps xmm2, xmm2, $4E
// 30 row2 = _mm_shuffle_ps(row2, row2, 0x4E);
movaps xmm6, xmm2
mulps xmm6, xmm7
addps xmm4, xmm6
// 31 minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
movaps xmm5, xmm0
mulps xmm5, xmm7
// 32 minor2 = _mm_mul_ps(row0, tmp1);
shufps xmm7, xmm7, $4E
// 33 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
movaps xmm6, xmm2
mulps xmm6, xmm7
subps xmm4, xmm6
// 34 minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
movaps xmm6, xmm0
mulps xmm6, xmm7
subps xmm6, xmm5
movaps xmm5, xmm6
// 35 minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
shufps xmm5, xmm5, $4E
// 36 minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
movups [edx], xmm4
movups [edx+$20], xmm5
// -----------------------------------------------
movaps xmm7, xmm0
mulps xmm7, xmm1
// 37 tmp1 = _mm_mul_ps(row0, row1);
shufps xmm7, xmm7, $B1
// 38 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
movaps xmm6, xmm3
mulps xmm6, xmm7
addps xmm5, xmm6
// 39 minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
movaps xmm6, xmm2
mulps xmm6, xmm7
movups xmm4, [edx+$30]
subps xmm6, xmm4
movaps xmm4, xmm6
// 40 minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
shufps xmm7, xmm7, $4E
// 41 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
movaps xmm6, xmm3
mulps xmm6, xmm7
subps xmm6, xmm5
movaps xmm5, xmm6
// 42 minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
movaps xmm6, xmm2
mulps xmm6, xmm7
subps xmm4, xmm6
// 43 minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
movups [edx+$20], xmm5
movups [edx+$30], xmm4
// -----------------------------------------------
movaps xmm7, xmm0
mulps xmm7, xmm3
// 44 tmp1 = _mm_mul_ps(row0, row3);
shufps xmm7, xmm7, $B1
// 45 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
movaps xmm6, xmm2
mulps xmm6, xmm7
movups xmm4, [edx+$10]
subps xmm4, xmm6
// 46 minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
movaps xmm6, xmm1
mulps xmm6, xmm7
addps xmm5, xmm6
// 47 minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
shufps xmm7, xmm7, $4E
// 48 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
movaps xmm6, xmm2
mulps xmm6, xmm7
addps xmm4, xmm6
// 49 minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
movaps xmm6, xmm1
mulps xmm6, xmm7
subps xmm5, xmm6
// 50 minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
movups [edx+$10], xmm4
movups [edx+$20], xmm5
// -----------------------------------------------
movaps xmm7, xmm0
mulps xmm7, xmm2
// 51 tmp1 = _mm_mul_ps(row0, row2);
shufps xmm7, xmm7, $B1
// 52 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
movaps xmm6, xmm3
mulps xmm6, xmm7
addps xmm4, xmm6
// 53 minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
movaps xmm6, xmm1
mulps xmm6, xmm7
movups xmm5, [edx+$30]
subps xmm5, xmm6
// 54 minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
shufps xmm7, xmm7, $4E
// 55 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
movaps xmm6, xmm3
mulps xmm6, xmm7
subps xmm4, xmm6
// 56 minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
movaps xmm6, xmm1
mulps xmm6, xmm7
addps xmm5, xmm6
// 57 minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
movups [edx+$10], xmm4
movups [edx+$30], xmm5
// -----------------------------------------------
movaps xmm6, xmm0
movups xmm4, [edx]
mulps xmm6, xmm4
// 58 det = _mm_mul_ps(row0, minor0);
movaps xmm7, xmm6
shufps xmm7, xmm7, $4E
addps xmm6, xmm7
// 59 det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
movaps xmm7, xmm6
shufps xmm7, xmm7, $B1
addss xmm6, xmm7
// 60 det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
rcpss xmm7, xmm6
// 61 tmp1 = _mm_rcp_ss(det);
movaps xmm5, xmm7
mulss xmm5, xmm5
mulss xmm6, xmm5
addss xmm7, xmm7
subss xmm7, xmm6
// 62 det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
shufps xmm7, xmm7, $00
// 63 det = _mm_shuffle_ps(det, det, 0x00);
mulps xmm4, xmm7
// 64 minor0 = _mm_mul_ps(det, minor0);
movups [edx], xmm4
// 65 _mm_storel_pi((__m64*)(src), minor0);
// 66 _mm_storeh_pi((__m64*)(src+2), minor0);
movups xmm4, [edx+$10]
mulps xmm4, xmm7
// 67 minor1 = _mm_mul_ps(det, minor1);
movups [edx+$10], xmm4
// 68 _mm_storel_pi((__m64*)(src+4), minor1);
// 69 _mm_storeh_pi((__m64*)(src+6), minor1);
movups xmm4, [edx+$20]
mulps xmm4, xmm7
// 70 minor2 = _mm_mul_ps(det, minor2);
movups [edx+$20], xmm4
// 71 _mm_storel_pi((__m64*)(src+

, minor2);
// 72 _mm_storeh_pi((__m64*)(src+10), minor2);
movups xmm4, [edx+$30]
mulps xmm4, xmm7
// 73 minor3 = _mm_mul_ps(det, minor3);
movups [edx+$30], xmm4
// 74 _mm_storel_pi((__m64*)(src+12), minor3);
// 75 _mm_storeh_pi((__m64*)(src+14), minor3);
end;
end;
it saves about 25% time calculated.
excuse me my poor english.