CAST II Game Engine Community

Please login or register.

Login with username, password and session length
Advanced search  

Author Topic: Miscalculation About InvertMatrix4s  (Read 840 times)

0 Members and 1 Guest are viewing this topic.

admin

  • Administrator
  • Full Member
  • *****
  • Posts: 192
    • CAST II Engine
Re: Miscalculation About InvertMatrix4s
« Reply #5 on: January 02, 2010, 06:42:19 AM »

Wow! Thank you! There really was an error! And in some cases it caused incorrect behaviour in editor.
Here is corrected and slightly optimized version which is 10%-15% slower then your fast SSE version. Not bad for pure pascal.:)
Code: [Select]
function InvertMatrix4s(const M: TMatrix4s): TMatrix4s;
var det, OneOverDet: Single;

  function MatDet3x3(a, b, c,
                     d, e, f,
                     g, h, i: Single): Single; {$IFDEF INLINE} inline; {$ENDIF}
  begin
    Result := a*e*i + b*f*g + c*d*h - a*f*h - b*d*i - c*e*g;
  end;

begin
  det := MatDet(M);
  if det <> 0 then begin
    OneOverDet := 1/Det;
    Result._11 :=  matdet3x3(M.A[05], M.A[09], M.A[13], M.A[06], M.A[10], M.A[14], M.A[07], M.A[11], M.A[15])*OneOverDet;
    Result._12 := -matdet3x3(M.A[01], M.A[09], M.A[13], M.A[02], M.A[10], M.A[14], M.A[03], M.A[11], M.A[15])*OneOverDet;
    Result._13 :=  matdet3x3(M.A[01], M.A[05], M.A[13], M.A[02], M.A[06], M.A[14], M.A[03], M.A[07], M.A[15])*OneOverDet;
    Result._14 := -matdet3x3(M.A[01], M.A[05], M.A[09], M.A[02], M.A[06], M.A[10], M.A[03], M.A[07], M.A[11])*OneOverDet;

    Result._21 := -matdet3x3(M.A[04], M.A[08], M.A[12], M.A[06], M.A[10], M.A[14], M.A[07], M.A[11], M.A[15])*OneOverDet;
    Result._22 :=  matdet3x3(M.A[00], M.A[08], M.A[12], M.A[02], M.A[10], M.A[14], M.A[03], M.A[11], M.A[15])*OneOverDet;
    Result._23 := -matdet3x3(M.A[00], M.A[04], M.A[12], M.A[02], M.A[06], M.A[14], M.A[03], M.A[07], M.A[15])*OneOverDet;
    Result._24 :=  matdet3x3(M.A[00], M.A[04], M.A[08], M.A[02], M.A[06], M.A[10], M.A[03], M.A[07], M.A[11])*OneOverDet;

    Result._31 :=  matdet3x3(M.A[04], M.A[08], M.A[12], M.A[05], M.A[09], M.A[13], M.A[07], M.A[11], M.A[15])*OneOverDet;
    Result._32 := -matdet3x3(M.A[00], M.A[08], M.A[12], M.A[01], M.A[09], M.A[13], M.A[03], M.A[11], M.A[15])*OneOverDet;
    Result._33 :=  matdet3x3(M.A[00], M.A[04], M.A[12], M.A[01], M.A[05], M.A[13], M.A[03], M.A[07], M.A[15])*OneOverDet;
    Result._34 := -matdet3x3(M.A[00], M.A[04], M.A[08], M.A[01], M.A[05], M.A[09], M.A[03], M.A[07], M.A[11])*OneOverDet;

    Result._41 := -matdet3x3(M.A[04], M.A[08], M.A[12], M.A[05], M.A[09], M.A[13], M.A[06], M.A[10], M.A[14])*OneOverDet;
    Result._42 :=  matdet3x3(M.A[00], M.A[08], M.A[12], M.A[01], M.A[09], M.A[13], M.A[02], M.A[10], M.A[14])*OneOverDet;
    Result._43 := -matdet3x3(M.A[00], M.A[04], M.A[12], M.A[01], M.A[05], M.A[13], M.A[02], M.A[06], M.A[14])*OneOverDet;
    Result._44 :=  matdet3x3(M.A[00], M.A[04], M.A[08], M.A[01], M.A[05], M.A[09], M.A[02], M.A[06], M.A[10])*OneOverDet;
  end else Result := IdentityMatrix4s;
end;
Logged

xlnrony

  • Newbie
  • *
  • Posts: 3
Re: Miscalculation About InvertMatrix4s
« Reply #4 on: January 01, 2010, 07:41:26 PM »

Memo1
(1.74 0.00 0.00 0.00) (1.24 1.00 0.00 0.00) (0.59 0.00 1.00 0.00) (0.55 0.00 0.00 1.00)-- 475022
(1.00 0.00 0.00 0.00) (0.00 1.00 0.00 0.00) (0.00 0.00 1.00 0.00) (0.00 0.00 0.00 1.00)-- 272033
this is my result tested
Logged

xlnrony

  • Newbie
  • *
  • Posts: 3
Re: Miscalculation About InvertMatrix4s
« Reply #3 on: January 01, 2010, 06:09:53 PM »

it is my test unit
Logged

admin

  • Administrator
  • Full Member
  • *****
  • Posts: 192
    • CAST II Engine
Re: Miscalculation About InvertMatrix4s
« Reply #2 on: January 01, 2010, 11:40:20 AM »

Can you please give an example matrix which causes wrong results with InvertMatrix4s()?
I've tested some and all is OK.

Anyway, thanks for the optimized version. Matrix inversions is not a bottleneck yet but...
Logged

xlnrony

  • Newbie
  • *
  • Posts: 3
Miscalculation About InvertMatrix4s
« Reply #1 on: January 01, 2010, 05:47:27 AM »

hi, i am programmer from china.
i find the miscalculation about InvertMatrix4s.

i try a test
RM := InvertMatrix4s1(M2);
  M := RM;
  MulMatrix4s1(RM, M2, M);
but RM is not identity matrix
so i think it is a miscalculation.

In addition, i'd love to provide my sse version, i test that it is right.
function InvertMatrix4s2(M: TMatrix4s): TMatrix4s;
begin
asm
  movlps xmm7, [eax]
  movhps xmm7, [eax+$10]                // tmp7 xmm0
// 4 tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
  movlps xmm1, [eax+$20]
  movhps xmm1, [eax+$30]                // row1 xmm1
// 5 row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
  movaps xmm0, xmm7
  shufps xmm0, xmm1, $88
  shufps xmm1, xmm7, $DD
// 6 row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
// 7 row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
  movlps xmm7, [eax+$8]
  movhps xmm7, [eax+$18]
// 8 tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
  movlps xmm3, [eax+$28]
  movhps xmm3, [eax+$38]
// 9 row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
  movaps xmm2, xmm7
  shufps xmm2, xmm3, $88
  shufps xmm3, xmm7, $DD
// 10 row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
// 11 row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
// -----------------------------------------------
  movaps xmm7, xmm2
  mulps  xmm7, xmm3
// 12 tmp1 = _mm_mul_ps(row2, row3);
  shufps xmm7, xmm7, $B1
// 13 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
  movaps xmm4, xmm1
  mulps  xmm4, xmm7
// 14 minor0 = _mm_mul_ps(row1, tmp1);
  movaps xmm5, xmm0
  mulps  xmm5, xmm7
// 15 minor1 = _mm_mul_ps(row0, tmp1);
  shufps xmm7, xmm7, $4E
// 16 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
  movaps xmm6, xmm1
  mulps  xmm6, xmm7
  subps  xmm6, xmm4
  movaps xmm4, xmm6
// 17 minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
  movaps xmm6, xmm0
  mulps  xmm6, xmm7
  subps  xmm6, xmm5
  movaps xmm5, xmm6
// 18 minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
  shufps xmm5, xmm5, $4E
// 19 minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
  movups [edx], xmm4
  movups [edx+$10], xmm5
// -----------------------------------------------
  movaps xmm7, xmm1
  mulps  xmm7, xmm2
// 20 tmp1 = _mm_mul_ps(row1, row2);
  shufps xmm7, xmm7, $B1
// 21 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
  movaps xmm6, xmm3
  mulps  xmm6, xmm7
  addps  xmm4, xmm6
// 22 minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
  movaps xmm5, xmm0
  mulps  xmm5, xmm7
// 23 minor3 = _mm_mul_ps(row0, tmp1);
  shufps xmm7, xmm7, $4E
// 24 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
  movaps xmm6, xmm3
  mulps  xmm6, xmm7
  subps  xmm4, xmm6
// 25 minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
  movaps xmm6, xmm0
  mulps  xmm6, xmm7
  subps  xmm6, xmm5
  movaps xmm5, xmm6
// 26 minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
  shufps xmm5, xmm5, $4E
// 27 minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
  movups [edx], xmm4
  movups [edx+$30], xmm5
// -----------------------------------------------
  movaps xmm7, xmm1
  shufps xmm7, xmm7, $4E
  mulps  xmm7, xmm3
// 28 tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
  shufps xmm7, xmm7, $B1
// 29 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
  shufps xmm2, xmm2, $4E
// 30 row2 = _mm_shuffle_ps(row2, row2, 0x4E);
  movaps xmm6, xmm2
  mulps  xmm6, xmm7
  addps  xmm4, xmm6
// 31 minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
  movaps xmm5, xmm0
  mulps  xmm5, xmm7
// 32 minor2 = _mm_mul_ps(row0, tmp1);
  shufps xmm7, xmm7, $4E
// 33 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
  movaps xmm6, xmm2
  mulps  xmm6, xmm7
  subps  xmm4, xmm6
// 34 minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
  movaps xmm6, xmm0
  mulps  xmm6, xmm7
  subps  xmm6, xmm5
  movaps xmm5, xmm6
// 35 minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
  shufps xmm5, xmm5, $4E
// 36 minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
  movups [edx], xmm4
  movups [edx+$20], xmm5
// -----------------------------------------------
  movaps xmm7, xmm0
  mulps  xmm7, xmm1
// 37 tmp1 = _mm_mul_ps(row0, row1);
  shufps xmm7, xmm7, $B1
// 38 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
  movaps xmm6, xmm3
  mulps  xmm6, xmm7
  addps  xmm5, xmm6
// 39 minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
  movaps xmm6, xmm2
  mulps  xmm6, xmm7
  movups xmm4, [edx+$30]
  subps  xmm6, xmm4
  movaps xmm4, xmm6
// 40 minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
  shufps xmm7, xmm7, $4E
// 41 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
  movaps xmm6, xmm3
  mulps  xmm6, xmm7
  subps  xmm6, xmm5
  movaps xmm5, xmm6
// 42 minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
  movaps xmm6, xmm2
  mulps  xmm6, xmm7
  subps  xmm4, xmm6
// 43 minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
  movups [edx+$20], xmm5
  movups [edx+$30], xmm4
// -----------------------------------------------
  movaps xmm7, xmm0
  mulps  xmm7, xmm3
// 44 tmp1 = _mm_mul_ps(row0, row3);
  shufps xmm7, xmm7, $B1
// 45 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
  movaps xmm6, xmm2
  mulps  xmm6, xmm7
  movups xmm4, [edx+$10]
  subps  xmm4, xmm6
// 46 minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
  movaps xmm6, xmm1
  mulps  xmm6, xmm7
  addps  xmm5, xmm6
// 47 minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
  shufps xmm7, xmm7, $4E
// 48 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
  movaps xmm6, xmm2
  mulps  xmm6, xmm7
  addps  xmm4, xmm6
// 49 minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
  movaps xmm6, xmm1
  mulps  xmm6, xmm7
  subps  xmm5, xmm6
// 50 minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
  movups [edx+$10], xmm4
  movups [edx+$20], xmm5
// -----------------------------------------------
  movaps xmm7, xmm0
  mulps  xmm7, xmm2
// 51 tmp1 = _mm_mul_ps(row0, row2);
  shufps xmm7, xmm7, $B1
// 52 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
  movaps xmm6, xmm3
  mulps  xmm6, xmm7
  addps  xmm4, xmm6
// 53 minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
  movaps xmm6, xmm1
  mulps  xmm6, xmm7
  movups xmm5, [edx+$30]
  subps  xmm5, xmm6
// 54 minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
  shufps xmm7, xmm7, $4E
// 55 tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
  movaps xmm6, xmm3
  mulps  xmm6, xmm7
  subps  xmm4, xmm6
// 56 minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
  movaps xmm6, xmm1
  mulps  xmm6, xmm7
  addps  xmm5, xmm6
// 57 minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
  movups [edx+$10], xmm4
  movups [edx+$30], xmm5
// -----------------------------------------------
  movaps xmm6, xmm0
  movups xmm4, [edx]
  mulps  xmm6, xmm4
// 58 det = _mm_mul_ps(row0, minor0);
  movaps xmm7, xmm6
  shufps xmm7, xmm7, $4E
  addps  xmm6, xmm7
// 59 det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
  movaps xmm7, xmm6
  shufps xmm7, xmm7, $B1
  addss  xmm6, xmm7
// 60 det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
  rcpss  xmm7, xmm6
// 61 tmp1 = _mm_rcp_ss(det);
  movaps xmm5, xmm7
  mulss  xmm5, xmm5
  mulss  xmm6, xmm5
  addss  xmm7, xmm7
  subss  xmm7, xmm6
// 62 det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
  shufps xmm7, xmm7, $00
// 63 det = _mm_shuffle_ps(det, det, 0x00);
  mulps  xmm4, xmm7
// 64 minor0 = _mm_mul_ps(det, minor0);
  movups [edx], xmm4
// 65 _mm_storel_pi((__m64*)(src), minor0);
// 66 _mm_storeh_pi((__m64*)(src+2), minor0);
  movups xmm4, [edx+$10]
  mulps  xmm4, xmm7
// 67 minor1 = _mm_mul_ps(det, minor1);
  movups [edx+$10], xmm4
// 68 _mm_storel_pi((__m64*)(src+4), minor1);
// 69 _mm_storeh_pi((__m64*)(src+6), minor1);
  movups xmm4, [edx+$20]
  mulps  xmm4, xmm7
// 70 minor2 = _mm_mul_ps(det, minor2);
  movups [edx+$20], xmm4
// 71 _mm_storel_pi((__m64*)(src+ 8), minor2);
// 72 _mm_storeh_pi((__m64*)(src+10), minor2);
  movups xmm4, [edx+$30]
  mulps  xmm4, xmm7
// 73 minor3 = _mm_mul_ps(det, minor3);
  movups [edx+$30], xmm4
// 74 _mm_storel_pi((__m64*)(src+12), minor3);
// 75 _mm_storeh_pi((__m64*)(src+14), minor3);
end;
end;

it saves about 25% time calculated.
excuse me my poor english.

Logged
 

+ Quick Reply

With Quick-Reply you can write a post when viewing a topic without loading a new page. You can still use bulletin board code and smileys as you would in a normal post.

Warning: this topic has not been posted in for at least 120 days.
Unless you're sure you want to reply, please consider starting a new topic.

Name: Email:
Verification:
Type the letters shown in the picture
Listen to the letters / Request another image
Type the letters shown in the picture:

Please enter the number 234 in the field: