summary | refs | log | tree | commit | diff | stats
path: root/libraries/atlas/atlas.patch
blob: dea4dcc0b2eeb7a5ba5c70180507a7a8fcde619d (plain)
diff -rupN ATLAS/CONFIG/src/backend/archinfo_x86.c atlas-3.8.3/CONFIG/src/backend/archinfo_x86.c
--- ATLAS/CONFIG/src/backend/archinfo_x86.c	2009-02-18 19:47:37.000000000 +0100
+++ atlas-3.8.3/CONFIG/src/backend/archinfo_x86.c	2009-11-12 13:47:23.777451677 +0100
@@ -320,7 +320,7 @@ enum MACHTYPE Chip2Mach(enum CHIP chip, 
          iret = IntP4;
          break;
       case 3:
-      case 4:
+      case 4: ; case 6:
          iret = IntP4E;
          break;
       default:
diff -rupN ATLAS/include/atlas_lvl3.h atlas-3.8.3/include/atlas_lvl3.h
--- ATLAS/include/atlas_lvl3.h	2009-02-18 19:47:35.000000000 +0100
+++ atlas-3.8.3/include/atlas_lvl3.h	2009-11-12 13:52:49.308496090 +0100
@@ -126,7 +126,7 @@
 #define CPAT Mjoin(C_ATL_, PRE);
 
 #ifndef ATL_MaxMalloc
-   #define ATL_MaxMalloc 67108864
+   #define ATL_MaxMalloc XXX_MaxMalloc_XXX
 #endif
 
 typedef void (*MAT2BLK)(int, int, const TYPE*, int, TYPE*, const SCALAR);
diff -rupN ATLAS/src/blas/gemm/ATL_cmmJITcp.c atlas-3.8.3/src/blas/gemm/ATL_cmmJITcp.c
--- ATLAS/src/blas/gemm/ATL_cmmJITcp.c	2009-02-18 19:47:44.000000000 +0100
+++ atlas-3.8.3/src/blas/gemm/ATL_cmmJITcp.c	2009-11-12 12:44:34.816529051 +0100
@@ -268,7 +268,8 @@ static void Mjoin(PATL,mmK)
    {
       NBmm0 = NBmm1 = NBmmX = Mjoin(PATLU,pKBmm);
       if (SCALAR_IS_ZERO(beta))
-         Mjoin(PATL,gezero)(M, N, C, ldc);
+         /* Mjoin(PATL,gezero)(M, N, C, ldc); */
+         { Mjoin(PATLU,gezero)(M, N, pC, ldpc); Mjoin(PATLU,gezero)(M, N, pC+ipc, ldpc); }
    }
    if (nblk)
    {
diff -rupN ATLAS/src/blas/gemm/ATL_gereal2cplx.c atlas-3.8.3/src/blas/gemm/ATL_gereal2cplx.c
--- ATLAS/src/blas/gemm/ATL_gereal2cplx.c	2009-02-18 19:47:44.000000000 +0100
+++ atlas-3.8.3/src/blas/gemm/ATL_gereal2cplx.c	2009-11-12 12:49:49.331651677 +0100
@@ -43,7 +43,53 @@ void Mjoin(PATL,gereal2cplx)
    const int ldc2 = (ldc-M)<<1;
    int i, j;
 
-   if (ialp == ATL_rzero && ibet == ATL_rzero)
+/*
+ * Cannot read C if BETA is 0
+ */
+   if (rbet == ATL_rzero && ibet == ATL_rzero)
+   {
+      if (ialp == ATL_rzero)  /* alpha is a real number */
+      {
+         if (ralp == ATL_rone) /* alpha = 1.0 */
+         {
+            for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2)
+            {
+               for (i=0; i < M; i++, C += 2)
+               {
+                  *C = R[i];
+                  C[1] = I[i];
+               }
+            }
+         }
+         else
+         {
+            for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2)
+            {
+               for (i=0; i < M; i++, C += 2)
+               {
+                  *C = ralp * R[i];
+                  C[1] = ralp * I[i];
+               }
+            }
+         }
+      }
+      else                   /* alpha is a complex number */
+      {
+         for (j=0; j < N; j++, R += ldr, I += ldi, C += ldc2)
+         {
+            for (i=0; i < M; i++, C += 2)
+            {
+               ra = R[i]; ia = I[i];
+               C[0] = ralp * ra - ialp * ia;
+               C[1] = ralp * ia + ialp * ra;
+            }
+         }
+      }
+   }
+/*
+ * If alpha and beta are both real numbers
+ */
+   else if (ialp == ATL_rzero && ibet == ATL_rzero)
    {
       if (ralp == ATL_rone && rbet == ATL_rone)
       {
diff -rupN ATLAS/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c atlas-3.8.3/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c
--- ATLAS/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c	2009-02-18 19:48:26.000000000 +0100
+++ atlas-3.8.3/tune/blas/gemm/CASES/ATL_smm14x1x84_sseCU.c	2009-11-12 12:35:50.453038827 +0100
@@ -27,6 +27,13 @@
  * POSSIBILITY OF SUCH DAMAGE.
  *
  */
+#if KB > 84
+   #error "KB cannot exceed 84!"
+#endif
+#if (KB/4)*4 != KB
+   #error "KB must be a multiple of 4!"
+#endif
+
 #ifndef ATL_GAS_x8664
    #error "This kernel requires x86-64 assembly!"
 #endif
@@ -58,25 +65,25 @@
  * Integer register usage shown be these defines
  */
 #define pA      %rcx
-#define pA10	%rbx
-#define ldab	%rbp
-#define mldab	%rdx
+#define pA10    %rbx
+#define ldab    %rbp
+#define mldab   %rdx
 #define mldab5  %rax
 #define pB      %rdi
 #define pC      %rsi
 #define incCn   %r10
 #define stM     %r9
 #define stN     %r11
-#define pfA	%r8
-#define pA5 	pA
-#define pB0	pB
+#define pfA %r8
+#define pA5     pA
+#define pB0 pB
 #if MB == 0
-   #define	stM0	%r12
-   #define	incAm	%r13
+   #define  stM0    %r12
+   #define  incAm   %r13
 #endif
 /*       rax     used in 32/64 conversion */
 
-#define NBso	(KB*4)
+#define NBso    (KB*4)
 #define MBKBso  (MB*KB*4)
 #define NB2so   (NBso+NBso)
 #define NB3so   (NBso+NBso+NBso)
@@ -95,22 +102,22 @@
 /*
  * SSE2 register usage shown be these defines
  */
-#define rA0	%xmm0
-#define rB0	%xmm1
-#define rC0	%xmm2
-#define rC1	%xmm3
-#define rC2	%xmm4
-#define rC3	%xmm5
-#define rC4	%xmm6
-#define rC5	%xmm7
-#define rC6	%xmm8
-#define rC7	%xmm9
-#define rC8	%xmm10
-#define rC9	%xmm11
-#define rC10	%xmm12
-#define rC11	%xmm13
-#define rC12	%xmm14
-#define rC13	%xmm15
+#define rA0 %xmm0
+#define rB0 %xmm1
+#define rC0 %xmm2
+#define rC1 %xmm3
+#define rC2 %xmm4
+#define rC3 %xmm5
+#define rC4 %xmm6
+#define rC5 %xmm7
+#define rC6 %xmm8
+#define rC7 %xmm9
+#define rC8 %xmm10
+#define rC9 %xmm11
+#define rC10    %xmm12
+#define rC11    %xmm13
+#define rC12    %xmm14
+#define rC13    %xmm15
 /*
  * Prefetch defines
  */
@@ -127,99 +134,99 @@
 #if MB != 0
    #define incAm $MBKBso-NB14so+176
 #endif
-	.text
+    .text
 .global ATL_asmdecor(ATL_USERMM)
 ATL_asmdecor(ATL_USERMM):
 /*
  *      Save callee-saved iregs
  */
-	movq	%rbp, -8(%rsp)
-	movq	%rbx, -16(%rsp)
+    movq    %rbp, -8(%rsp)
+    movq    %rbx, -16(%rsp)
 #if MB == 0
-	movq	%r12, -32(%rsp)
-	movq	%r13, -40(%rsp)
+    movq    %r12, -32(%rsp)
+    movq    %r13, -40(%rsp)
 #endif
 #ifdef BETAX
    #define BOF -56
-	movss	%xmm1, BOF(%rsp)
-	movss	%xmm1, BOF+4(%rsp)
-	movss	%xmm1, BOF+8(%rsp)
-	movss	%xmm1, BOF+12(%rsp)
+    movss   %xmm1, BOF(%rsp)
+    movss   %xmm1, BOF+4(%rsp)
+    movss   %xmm1, BOF+8(%rsp)
+    movss   %xmm1, BOF+12(%rsp)
 #endif
 /*
  *      pA already comes in right reg
  *      Initialize pB = B; pC = C; NBso = NB * sizeof;
  */
-	movq	%rsi, stN
-	movq	%rdi, %rax
-	movq	16(%rsp), pC
-			prefC((pC))
-			prefC(64(pC))
-	movq	%r9, pB
-			prefB((pB))
-			prefB(64(pB))
-	movq	%rax, stM
+    movq    %rsi, stN
+    movq    %rdi, %rax
+    movq    16(%rsp), pC
+            prefC((pC))
+            prefC(64(pC))
+    movq    %r9, pB
+            prefB((pB))
+            prefB(64(pB))
+    movq    %rax, stM
 /*
  *      stM = pA + NBNBso;  stN = pB + NBNBso;
  */
 #if MB == 0
-	movq	stM, pfA
-	imulq	$NBso, pfA
-			prefB(128(pB))
-	movq	pfA, incAm
-	addq	pA5, pfA
-	addq	$176-NB14so, incAm
+    movq    stM, pfA
+    imulq   $NBso, pfA
+            prefB(128(pB))
+    movq    pfA, incAm
+    addq    pA5, pfA
+    addq    $176-NB14so, incAm
 #else
-	movq	$MBKBso, pfA
-	addq	pA5, pfA
-			prefB(128(pB))
+    movq    $MBKBso, pfA
+    addq    pA5, pfA
+            prefB(128(pB))
 #endif
 /*
  *      convert ldc to 64 bits, and then set incCn = (ldc - MB)*sizeof
  */
-	movl	24(%rsp), %eax
-	cltq
-	movq	%rax, incCn
-	subq	stM, incCn
-	addq	$14, incCn
+    movl    24(%rsp), %eax
+    cltq
+    movq    %rax, incCn
+    subq    stM, incCn
+    addq    $14, incCn
 #ifdef SREAL
-	shl	$2, incCn
+    shl $2, incCn
 #else
-	shl	$3, incCn
-			prefC(128(pC))
-			prefC(192(pC))
+    shl $3, incCn
+            prefC(128(pC))
+            prefC(192(pC))
 #endif
 /*
  *      Find M/14 if MB is not set
  */
 #if MB == 0
-	cmp	$84, stM
-	jne	MB_LT84
-/*      movq	$84/14, stM */
-	movq	$6, stM
+    cmp $84, stM
+    jne MB_LT84
+/*      movq    $84/14, stM */
+    movq    $6, stM
 MBFOUND:
-	subq	$1, stM
-	movq	stM, stM0
+    subq    $1, stM
+    movq    stM, stM0
 #endif
-	addq	$120, pA5
-	addq	$120, pB0
-	movq	$KB*4, ldab
-	movq	$-KB*5*4, mldab5
-	movq	$-KB*4, mldab
-	subq	mldab5, pA5
-	lea	KB*4(pA5, ldab,4), pA10
-/*	movq	$NB, stN */
+    addq    $120, pA5
+    addq    $120, pB0
+    movq    $KB*4, ldab
+    movq    $-KB*5*4, mldab5
+    movq    $-KB*4, mldab
+    subq    mldab5, pA5
+    lea KB*4(pA5, ldab,4), pA10
+/*  movq    $NB, stN */
 
 UNLOOP:
 #if MB == 0
-	movq	stM0, stM
-	cmp	$0, stM
-	je	MLAST
+    movq    stM0, stM
+    cmp $0, stM
+    je  MLAST
 #else
    #ifdef ATL_DivAns
-	movq	$ATL_DivAns-1, stM
+    movq    $ATL_DivAns-1, stM
    #else
-	movq	$MB/14-1, stM
+    movq    $MB/14-1, stM
    #endif
 #endif
 #if MB == 0 || MB > 14
@@ -227,992 +234,992 @@ UMLOOP:
 /*
  *      rC[0-13] = pC[0-13] * beta
  */
-	ALIGN16
+    ALIGN16
 /*UKLOOP: */
 #ifdef BETA1
-	movaps	0-120(pA10,mldab5,2), rC0
-	movaps	0-120(pB0), rB0
-	mulps	rB0, rC0
-	addss	(pC), rC0
-	movaps	0-120(pA5, mldab,4), rC1
-	mulps	rB0, rC1
-	addss	CMUL(4)(pC), rC1
-	movaps	0-120(pA10, mldab,8), rC2
-	mulps	rB0, rC2
-	addss	CMUL(8)(pC), rC2
-	movaps	0-120(pA5, mldab,2), rC3
-	mulps	rB0, rC3
-	addss	CMUL(12)(pC), rC3
-	movaps	0-120(pA5, mldab), rC4
-	mulps	rB0, rC4
-	addss	CMUL(16)(pC), rC4
-	movaps	0-120(pA5), rC5
-	mulps	rB0, rC5
-	addss	CMUL(20)(pC), rC5
-	movaps	0-120(pA5, ldab), rC6
-	mulps	rB0, rC6
-	addss	CMUL(24)(pC), rC6
-	movaps	0-120(pA5, ldab,2), rC7
-	mulps	rB0, rC7
-	addss	CMUL(28)(pC), rC7
-	movaps	0-120(pA10, mldab,2), rC8
-	mulps	rB0, rC8
-	addss	CMUL(32)(pC), rC8
-	movaps	0-120(pA5,ldab,4), rC9
-	mulps	rB0, rC9
-	addss	CMUL(36)(pC), rC9
-	movaps	0-120(pA10), rC10
-	mulps	rB0, rC10
-	addss	CMUL(40)(pC), rC10
-	movaps	0-120(pA10,ldab), rC11
-	mulps	rB0, rC11
-	addss	CMUL(44)(pC), rC11
-	movaps	0-120(pA10,ldab,2), rC12
-	mulps	rB0, rC12
-	addss	CMUL(48)(pC), rC12
-	movaps	0-120(pA5,ldab,8), rC13
-	mulps	rB0, rC13
-	addss	CMUL(52)(pC), rC13
+    movaps  0-120(pA10,mldab5,2), rC0
+    movaps  0-120(pB0), rB0
+    mulps   rB0, rC0
+    addss   (pC), rC0
+    movaps  0-120(pA5, mldab,4), rC1
+    mulps   rB0, rC1
+    addss   CMUL(4)(pC), rC1
+    movaps  0-120(pA10, mldab,8), rC2
+    mulps   rB0, rC2
+    addss   CMUL(8)(pC), rC2
+    movaps  0-120(pA5, mldab,2), rC3
+    mulps   rB0, rC3
+    addss   CMUL(12)(pC), rC3
+    movaps  0-120(pA5, mldab), rC4
+    mulps   rB0, rC4
+    addss   CMUL(16)(pC), rC4
+    movaps  0-120(pA5), rC5
+    mulps   rB0, rC5
+    addss   CMUL(20)(pC), rC5
+    movaps  0-120(pA5, ldab), rC6
+    mulps   rB0, rC6
+    addss   CMUL(24)(pC), rC6
+    movaps  0-120(pA5, ldab,2), rC7
+    mulps   rB0, rC7
+    addss   CMUL(28)(pC), rC7
+    movaps  0-120(pA10, mldab,2), rC8
+    mulps   rB0, rC8
+    addss   CMUL(32)(pC), rC8
+    movaps  0-120(pA5,ldab,4), rC9
+    mulps   rB0, rC9
+    addss   CMUL(36)(pC), rC9
+    movaps  0-120(pA10), rC10
+    mulps   rB0, rC10
+    addss   CMUL(40)(pC), rC10
+    movaps  0-120(pA10,ldab), rC11
+    mulps   rB0, rC11
+    addss   CMUL(44)(pC), rC11
+    movaps  0-120(pA10,ldab,2), rC12
+    mulps   rB0, rC12
+    addss   CMUL(48)(pC), rC12
+    movaps  0-120(pA5,ldab,8), rC13
+    mulps   rB0, rC13
+    addss   CMUL(52)(pC), rC13
 #else
-	movaps	0-120(pA10,mldab5,2), rC0
-	movaps	0-120(pB0), rC13
-	mulps	rC13, rC0
-	movaps	0-120(pA5, mldab,4), rC1
-	mulps	rC13, rC1
-	movaps	0-120(pA10, mldab,8), rC2
-	mulps	rC13, rC2
-	movaps	0-120(pA5, mldab,2), rC3
-	mulps	rC13, rC3
-	movaps	0-120(pA5, mldab), rC4
-	mulps	rC13, rC4
-	movaps	0-120(pA5), rC5
-	mulps	rC13, rC5
-	movaps	0-120(pA5, ldab), rC6
-	mulps	rC13, rC6
-	movaps	0-120(pA5, ldab,2), rC7
-	mulps	rC13, rC7
-	movaps	0-120(pA10, mldab,2), rC8
-	mulps	rC13, rC8
-	movaps	0-120(pA5,ldab,4), rC9
-	mulps	rC13, rC9
-	movaps	0-120(pA10), rC10
-	mulps	rC13, rC10
-	movaps	0-120(pA10,ldab), rC11
-	mulps	rC13, rC11
-	movaps	0-120(pA10,ldab,2), rC12
-	mulps	rC13, rC12
-	mulps 	0-120(pA5,ldab,8), rC13
+    movaps  0-120(pA10,mldab5,2), rC0
+    movaps  0-120(pB0), rC13
+    mulps   rC13, rC0
+    movaps  0-120(pA5, mldab,4), rC1
+    mulps   rC13, rC1
+    movaps  0-120(pA10, mldab,8), rC2
+    mulps   rC13, rC2
+    movaps  0-120(pA5, mldab,2), rC3
+    mulps   rC13, rC3
+    movaps  0-120(pA5, mldab), rC4
+    mulps   rC13, rC4
+    movaps  0-120(pA5), rC5
+    mulps   rC13, rC5
+    movaps  0-120(pA5, ldab), rC6
+    mulps   rC13, rC6
+    movaps  0-120(pA5, ldab,2), rC7
+    mulps   rC13, rC7
+    movaps  0-120(pA10, mldab,2), rC8
+    mulps   rC13, rC8
+    movaps  0-120(pA5,ldab,4), rC9
+    mulps   rC13, rC9
+    movaps  0-120(pA10), rC10
+    mulps   rC13, rC10
+    movaps  0-120(pA10,ldab), rC11
+    mulps   rC13, rC11
+    movaps  0-120(pA10,ldab,2), rC12
+    mulps   rC13, rC12
+    mulps   0-120(pA5,ldab,8), rC13
 #endif
 
 #if KB > 4
-	movaps	16-120(pA10,mldab5,2), rA0
-	movaps	16-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	16-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	16-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	16-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	16-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	16-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	16-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	16-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	16-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	16-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	16-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	16-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	16-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	16-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  16-120(pA10,mldab5,2), rA0
+    movaps  16-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  16-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  16-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  16-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  16-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  16-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  16-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  16-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  16-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  16-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  16-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  16-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  16-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   16-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 8
-	movaps	32-120(pA10,mldab5,2), rA0
-	movaps	32-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	32-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	32-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	32-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	32-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	32-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	32-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	32-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	32-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	32-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	32-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	32-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	32-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	32-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  32-120(pA10,mldab5,2), rA0
+    movaps  32-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  32-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  32-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  32-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  32-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  32-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  32-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  32-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  32-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  32-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  32-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  32-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  32-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   32-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 12
-	movaps	48-120(pA10,mldab5,2), rA0
-	movaps	48-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	48-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	48-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	48-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	48-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	48-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	48-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	48-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	48-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	48-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	48-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	48-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	48-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	48-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  48-120(pA10,mldab5,2), rA0
+    movaps  48-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  48-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  48-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  48-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  48-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  48-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  48-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  48-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  48-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  48-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  48-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  48-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  48-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   48-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 16
-	movaps	64-120(pA10,mldab5,2), rA0
-	movaps	64-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	64-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	64-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	64-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	64-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	64-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	64-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	64-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	64-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	64-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	64-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	64-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	64-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	64-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  64-120(pA10,mldab5,2), rA0
+    movaps  64-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  64-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  64-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  64-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  64-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  64-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  64-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  64-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  64-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  64-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  64-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  64-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  64-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   64-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 20
-	movaps	80-120(pA10,mldab5,2), rA0
-	movaps	80-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	80-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	80-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	80-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	80-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	80-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	80-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	80-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	80-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	80-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	80-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	80-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	80-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	80-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  80-120(pA10,mldab5,2), rA0
+    movaps  80-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  80-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  80-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  80-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  80-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  80-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  80-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  80-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  80-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  80-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  80-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  80-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  80-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   80-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 24
-	movaps	96-120(pA10,mldab5,2), rA0
-	movaps	96-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	96-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	96-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	96-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	96-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	96-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	96-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	96-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	96-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	96-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	96-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	96-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	96-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	96-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  96-120(pA10,mldab5,2), rA0
+    movaps  96-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  96-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  96-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  96-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  96-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  96-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  96-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  96-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  96-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  96-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  96-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  96-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  96-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   96-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 28
-	movaps	112-120(pA10,mldab5,2), rA0
-	movaps	112-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	112-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	112-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	112-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	112-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	112-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	112-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	112-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	112-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	112-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	112-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	112-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	112-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	112-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  112-120(pA10,mldab5,2), rA0
+    movaps  112-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  112-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  112-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  112-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  112-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  112-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  112-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  112-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  112-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  112-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  112-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  112-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  112-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   112-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 #ifndef SREAL
-					pref2((pfA))
-					pref2(64(pfA))
+                    pref2((pfA))
+                    pref2(64(pfA))
 #endif
 
 #if KB > 32
-	movaps	128-120(pA10,mldab5,2), rA0
-	movaps	128-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	128-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	128-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	128-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	128-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	128-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	128-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	128-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	128-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	128-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	128-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	128-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	128-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	128-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  128-120(pA10,mldab5,2), rA0
+    movaps  128-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  128-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  128-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  128-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  128-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  128-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  128-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  128-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  128-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  128-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  128-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  128-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  128-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   128-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 36
-	movaps	144-120(pA10,mldab5,2), rA0
-	movaps	144-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	144-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	144-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	144-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	144-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	144-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	144-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	144-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	144-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	144-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	144-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	144-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	144-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	144-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  144-120(pA10,mldab5,2), rA0
+    movaps  144-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  144-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  144-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  144-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  144-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  144-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  144-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  144-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  144-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  144-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  144-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  144-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  144-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   144-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 40
-	movaps	160-120(pA10,mldab5,2), rA0
-	movaps	160-120(pB0), rB0
-	mulps	rB0, rA0
-				addq $176, pB0
-	addps	rA0, rC0
-	movaps	160-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	160-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	160-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	160-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	160-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	160-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	160-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	160-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	160-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	160-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	160-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	160-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-				addq $176, pA10
-	addps	rA0, rC12
-	mulps	160-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
-				addq $176, pA5
+    movaps  160-120(pA10,mldab5,2), rA0
+    movaps  160-120(pB0), rB0
+    mulps   rB0, rA0
+                addq $176, pB0
+    addps   rA0, rC0
+    movaps  160-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  160-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  160-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  160-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  160-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  160-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  160-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  160-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  160-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  160-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  160-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  160-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+                addq $176, pA10
+    addps   rA0, rC12
+    mulps   160-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
+                addq $176, pA5
 #else
-				addq $176, pB0
-				addq $176, pA10
-				addq $176, pA5
+                addq $176, pB0
+                addq $176, pA10
+                addq $176, pA5
 #endif
 
 #if KB > 44
-	movaps	0-120(pA10,mldab5,2), rA0
-	movaps	0-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	0-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	0-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	0-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	0-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	0-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	0-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	0-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	0-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	0-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	0-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	0-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	0-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	0-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  0-120(pA10,mldab5,2), rA0
+    movaps  0-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  0-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  0-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  0-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  0-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  0-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  0-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  0-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  0-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  0-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  0-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  0-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  0-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   0-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 48
-	movaps	16-120(pA10,mldab5,2), rA0
-	movaps	16-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	16-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	16-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	16-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	16-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	16-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	16-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	16-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	16-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	16-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	16-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	16-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	16-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	16-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  16-120(pA10,mldab5,2), rA0
+    movaps  16-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  16-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  16-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  16-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  16-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  16-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  16-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  16-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  16-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  16-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  16-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  16-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  16-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   16-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 52
-	movaps	32-120(pA10,mldab5,2), rA0
-	movaps	32-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	32-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	32-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	32-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	32-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	32-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	32-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	32-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	32-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	32-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	32-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	32-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	32-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	32-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  32-120(pA10,mldab5,2), rA0
+    movaps  32-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  32-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  32-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  32-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  32-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  32-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  32-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  32-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  32-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  32-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  32-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  32-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  32-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   32-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 56
-	movaps	48-120(pA10,mldab5,2), rA0
-	movaps	48-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	48-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	48-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	48-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	48-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	48-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	48-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	48-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	48-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	48-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	48-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	48-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	48-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	48-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  48-120(pA10,mldab5,2), rA0
+    movaps  48-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  48-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  48-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  48-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  48-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  48-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  48-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  48-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  48-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  48-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  48-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  48-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  48-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   48-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 60
-	movaps	64-120(pA10,mldab5,2), rA0
-	movaps	64-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	64-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	64-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	64-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	64-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	64-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	64-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	64-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	64-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	64-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	64-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	64-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	64-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	64-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  64-120(pA10,mldab5,2), rA0
+    movaps  64-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  64-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  64-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  64-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  64-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  64-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  64-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  64-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  64-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  64-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  64-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  64-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  64-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   64-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 64
-	movaps	80-120(pA10,mldab5,2), rA0
-	movaps	80-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	80-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	80-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	80-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	80-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	80-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	80-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	80-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	80-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	80-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	80-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	80-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	80-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	80-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  80-120(pA10,mldab5,2), rA0
+    movaps  80-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  80-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  80-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  80-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  80-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  80-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  80-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  80-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  80-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  80-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  80-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  80-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  80-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   80-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 68
-	movaps	96-120(pA10,mldab5,2), rA0
-	movaps	96-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	96-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	96-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	96-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	96-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	96-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	96-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	96-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	96-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	96-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	96-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	96-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	96-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	96-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  96-120(pA10,mldab5,2), rA0
+    movaps  96-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  96-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  96-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  96-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  96-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  96-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  96-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  96-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  96-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  96-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  96-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  96-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  96-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   96-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 72
-	movaps	112-120(pA10,mldab5,2), rA0
-	movaps	112-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	112-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	112-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	112-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	112-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	112-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	112-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	112-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	112-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	112-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	112-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	112-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	112-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	112-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  112-120(pA10,mldab5,2), rA0
+    movaps  112-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  112-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  112-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  112-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  112-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  112-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  112-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  112-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  112-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  112-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  112-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  112-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  112-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   112-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 76
-	movaps	128-120(pA10,mldab5,2), rA0
-	movaps	128-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	128-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	128-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	128-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	128-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	128-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	128-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	128-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	128-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	128-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	128-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	128-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	128-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	128-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  128-120(pA10,mldab5,2), rA0
+    movaps  128-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  128-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  128-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  128-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  128-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  128-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  128-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  128-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  128-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  128-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  128-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  128-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  128-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   128-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 80
-	movaps	144-120(pA10,mldab5,2), rA0
-	movaps	144-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	144-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	144-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	144-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	144-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	144-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	144-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	144-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	144-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	144-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	144-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	144-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	144-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	144-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  144-120(pA10,mldab5,2), rA0
+    movaps  144-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  144-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  144-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  144-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  144-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  144-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  144-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  144-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  144-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  144-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  144-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  144-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  144-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   144-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 /*UKLOOP */
@@ -1220,234 +1227,234 @@ UMLOOP:
  *      Get these bastard things summed up correctly
  */
 
-					/* rC0 = c0a    c0b    c0c    c0d */
-					/* rC1 = c1a    c1b    c1c    c1d */
-					/* rC2 = c2a    c2b    c2c    c2d */
-					/* rC3 = c3a    c3b    c3c    c3d */
+                    /* rC0 = c0a    c0b    c0c    c0d */
+                    /* rC1 = c1a    c1b    c1c    c1d */
+                    /* rC2 = c2a    c2b    c2c    c2d */
+                    /* rC3 = c3a    c3b    c3c    c3d */
 /* */
-	movaps		rC2, rB0	/* rB0 = c2a    c2b    c2c    c2d */
-						prefC((pC))
-						prefC(64(pC))
-	movaps		rC0, rA0	/* rA0 = c0a    c0b    c0c    c0d */
-	unpckhps	rC3, rB0	/* rB0 = c2c    c3c    c2d    c3d */
-	unpckhps	rC1, rA0	/* rA0 = c0c    c1c    c0d    c1d */
-	unpcklps	rC3, rC2	/* rC2 = c2a    c3a    c2b    c3b */
-	movlhps		rB0, rC3	/* rC3 = c3a    c3b    c2c    c3c */
-	unpcklps	rC1, rC0	/* rC0 = c0a    c1a    c0b    c1b */
-	movhlps		rA0, rC3	/* rC3 = c0d    c1d    c2c    c3c */
-	movlhps		rC2, rA0	/* rA0 = c0c    c1c    c2a    c3a */
-	movhlps		rC0, rB0	/* rB0 = c0b    c1b    c2d    c3d */
-	addps		rA0, rC3	/* rC3 = c0cd   c1cd   c2ac   c3ac */
-	movlhps		rC0, rC1	/* rC1 = c1a    c1b    c0a    c1a */
-	movhlps		rC1, rC2	/* rC2 = c0a    c1a    c2b    c3b */
-	movaps		rC4, rA0	/* rA0 = c4a    c4b    c4c    c4d */
-	addps		rB0, rC2	/* rC2 = c0ab   c1ab   c2bd   c3bd */
-	movaps		rC6, rB0	/* rB0 = c6a    c6b    c6c    c6d */
-	addps		rC2, rC3	/* rC3 = c0abcd c1abcd c2bdac c3bdac */
-
-
-					/* rC4 = c4a    c4b    c4c    c4d */
-					/* rC5 = c5a    c5b    c5c    c5d */
-					/* rC6 = c6a    c6b    c6c    c6d */
-					/* rC7 = c7a    c7b    c7c    c7d */
-					/* rC8  = c08a    c08b    c08c    c08d */
-					/* rC9  = c09a    c09b    c09c    c09d */
-					/* rC10 = c10a    c10b    c10c    c10d */
-					/* rC11 = c11a    c11b    c11c    c11d */
-					/* rC12 = c12a    c12b    c12c    c12d */
-					/* rC13 = c13a    c13b    c13c    c13d */
+    movaps      rC2, rB0    /* rB0 = c2a    c2b    c2c    c2d */
+                        prefC((pC))
+                        prefC(64(pC))
+    movaps      rC0, rA0    /* rA0 = c0a    c0b    c0c    c0d */
+    unpckhps    rC3, rB0    /* rB0 = c2c    c3c    c2d    c3d */
+    unpckhps    rC1, rA0    /* rA0 = c0c    c1c    c0d    c1d */
+    unpcklps    rC3, rC2    /* rC2 = c2a    c3a    c2b    c3b */
+    movlhps     rB0, rC3    /* rC3 = c3a    c3b    c2c    c3c */
+    unpcklps    rC1, rC0    /* rC0 = c0a    c1a    c0b    c1b */
+    movhlps     rA0, rC3    /* rC3 = c0d    c1d    c2c    c3c */
+    movlhps     rC2, rA0    /* rA0 = c0c    c1c    c2a    c3a */
+    movhlps     rC0, rB0    /* rB0 = c0b    c1b    c2d    c3d */
+    addps       rA0, rC3    /* rC3 = c0cd   c1cd   c2ac   c3ac */
+    movlhps     rC0, rC1    /* rC1 = c1a    c1b    c0a    c1a */
+    movhlps     rC1, rC2    /* rC2 = c0a    c1a    c2b    c3b */
+    movaps      rC4, rA0    /* rA0 = c4a    c4b    c4c    c4d */
+    addps       rB0, rC2    /* rC2 = c0ab   c1ab   c2bd   c3bd */
+    movaps      rC6, rB0    /* rB0 = c6a    c6b    c6c    c6d */
+    addps       rC2, rC3    /* rC3 = c0abcd c1abcd c2bdac c3bdac */
+
+
+                    /* rC4 = c4a    c4b    c4c    c4d */
+                    /* rC5 = c5a    c5b    c5c    c5d */
+                    /* rC6 = c6a    c6b    c6c    c6d */
+                    /* rC7 = c7a    c7b    c7c    c7d */
+                    /* rC8  = c08a    c08b    c08c    c08d */
+                    /* rC9  = c09a    c09b    c09c    c09d */
+                    /* rC10 = c10a    c10b    c10c    c10d */
+                    /* rC11 = c11a    c11b    c11c    c11d */
+                    /* rC12 = c12a    c12b    c12c    c12d */
+                    /* rC13 = c13a    c13b    c13c    c13d */
 /* */
-	movaps		rC10, rC0	/* rC0 = c10a    c10b    c10c    c10d */
-						prefC(128(pC))
+    movaps      rC10, rC0   /* rC0 = c10a    c10b    c10c    c10d */
+                        prefC(128(pC))
 #ifdef SREAL
-						pref2((pfA))
+                        pref2((pfA))
 #else
-						prefC(192(pC))
+                        prefC(192(pC))
 #endif
-	movaps		rC8 , rC1	/* rC1 = c08a    c08b    c08c    c08d */
-	movaps          rC12, rC2 	/* rC2  = c12a    c12b    c12c    c12d */
-	unpckhps	rC7, rB0	/* rB0 = c6c    c7c    c6d    c7d */
-	unpckhps	rC5, rA0	/* rA0 = c4c    c5c    c4d    c5d */
-	unpcklps	rC7, rC6	/* rC6 = c6a    c7a    c6b    c7b */
-	unpckhps	rC11, rC0	/* rC0 = c10c    c11c    c10d    c11d */
-	unpckhps	rC9 , rC1	/* rC1 = c08c    c09c    c08d    c09d */
-	movlhps		rB0, rC7	/* rC7 = c7a    c7b    c6c    c7c */
-	unpcklps	rC5, rC4	/* rC4 = c4a    c5a    c4b    c5b */
-	movhlps		rA0, rC7	/* rC7 = c4d    c5d    c6c    c7c */
-	movlhps		rC6, rA0	/* rA0 = c4c    c5c    c6a    c7a */
-	unpcklps	rC11, rC10	/* rC10 = c10a    c11a    c10b    c11b */
-	movhlps		rC4, rB0	/* rB0 = c4b    c5b    c6d    c7d */
-	movlhps		rC0, rC11	/* rC11 = c11a    c11b    c10c    c11c */
-	addps		rA0, rC7	/* rC7 = c4cd   c5cd   c6ac   c7ac */
+    movaps      rC8 , rC1   /* rC1 = c08a    c08b    c08c    c08d */
+    movaps          rC12, rC2   /* rC2  = c12a    c12b    c12c    c12d */
+    unpckhps    rC7, rB0    /* rB0 = c6c    c7c    c6d    c7d */
+    unpckhps    rC5, rA0    /* rA0 = c4c    c5c    c4d    c5d */
+    unpcklps    rC7, rC6    /* rC6 = c6a    c7a    c6b    c7b */
+    unpckhps    rC11, rC0   /* rC0 = c10c    c11c    c10d    c11d */
+    unpckhps    rC9 , rC1   /* rC1 = c08c    c09c    c08d    c09d */
+    movlhps     rB0, rC7    /* rC7 = c7a    c7b    c6c    c7c */
+    unpcklps    rC5, rC4    /* rC4 = c4a    c5a    c4b    c5b */
+    movhlps     rA0, rC7    /* rC7 = c4d    c5d    c6c    c7c */
+    movlhps     rC6, rA0    /* rA0 = c4c    c5c    c6a    c7a */
+    unpcklps    rC11, rC10  /* rC10 = c10a    c11a    c10b    c11b */
+    movhlps     rC4, rB0    /* rB0 = c4b    c5b    c6d    c7d */
+    movlhps     rC0, rC11   /* rC11 = c11a    c11b    c10c    c11c */
+    addps       rA0, rC7    /* rC7 = c4cd   c5cd   c6ac   c7ac */
 #ifdef BETAX
    #ifdef SREAL
-			movups	(pC), rA0
-	movlhps		rC4, rC5	/* rC5 = c5a    c5b    c4a    c5a */
-			movups	16(pC), rC4
-	unpcklps	rC9 , rC8 	/* rC8  = c08a    c09a    c08b    c09b */
-	movhlps		rC1, rC11	/* rC11 = c08d    c09d    c10c    c11c */
-	movlhps		rC10, rC1	/* rC1 = c08c    c09c    c10a    c11a */
-	movhlps		rC5, rC6	/* rC6 = c4a    c5a    c6b    c7b */
-			movups	32(pC), rC5
-	movhlps		rC8 , rC0	/* rC0 = c08b    c09b    c10d    c11d */
-	unpcklps	rC13, rC2	/* rC2  = c12a    c13a    c12b    c13b */
-	addps		rC1, rC11	/* rC11 = c08cd   c09cd   c10ac   c11ac */
-			movlps	48(pC), rC1
-	addps		rB0, rC6	/* rC6 = c4ab   c5ab   c6bd   c7bd */
-	movlhps		rC8 , rC9 	/* rC9  = c09a    c09b    c08a    c09a */
-	unpckhps	rC13, rC12	/* rC12 = c12c    c13c    c12d    c13d */
-	movhlps		rC9 , rC10	/* rC10 = c08a    c09a    c10b    c11b */
-	addps		rC6, rC7	/* rC7 = c4abcd c5abcd c6bdac c7bdac */
-						pref2(64(pfA))
-			mulps	BOF(%rsp), rA0
-	addps		rC0, rC10	/* rC10 = c08ab   c09ab   c10bd   c11bd */
-			mulps	BOF(%rsp), rC4
-	addps		rC2, rC12	/* rC12 = c12ac   c13ac   c12bd   c13bd */
-			mulps	BOF(%rsp), rC5
-	addps		rC10, rC11	/* rC11 = c08abcd c09abcd c10bdac c11bdac */
-			mulps	BOF(%rsp), rC1
+            movups  (pC), rA0
+    movlhps     rC4, rC5    /* rC5 = c5a    c5b    c4a    c5a */
+            movups  16(pC), rC4
+    unpcklps    rC9 , rC8   /* rC8  = c08a    c09a    c08b    c09b */
+    movhlps     rC1, rC11   /* rC11 = c08d    c09d    c10c    c11c */
+    movlhps     rC10, rC1   /* rC1 = c08c    c09c    c10a    c11a */
+    movhlps     rC5, rC6    /* rC6 = c4a    c5a    c6b    c7b */
+            movups  32(pC), rC5
+    movhlps     rC8 , rC0   /* rC0 = c08b    c09b    c10d    c11d */
+    unpcklps    rC13, rC2   /* rC2  = c12a    c13a    c12b    c13b */
+    addps       rC1, rC11   /* rC11 = c08cd   c09cd   c10ac   c11ac */
+            movlps  48(pC), rC1
+    addps       rB0, rC6    /* rC6 = c4ab   c5ab   c6bd   c7bd */
+    movlhps     rC8 , rC9   /* rC9  = c09a    c09b    c08a    c09a */
+    unpckhps    rC13, rC12  /* rC12 = c12c    c13c    c12d    c13d */
+    movhlps     rC9 , rC10  /* rC10 = c08a    c09a    c10b    c11b */
+    addps       rC6, rC7    /* rC7 = c4abcd c5abcd c6bdac c7bdac */
+                        pref2(64(pfA))
+            mulps   BOF(%rsp), rA0
+    addps       rC0, rC10   /* rC10 = c08ab   c09ab   c10bd   c11bd */
+            mulps   BOF(%rsp), rC4
+    addps       rC2, rC12   /* rC12 = c12ac   c13ac   c12bd   c13bd */
+            mulps   BOF(%rsp), rC5
+    addps       rC10, rC11  /* rC11 = c08abcd c09abcd c10bdac c11bdac */
+            mulps   BOF(%rsp), rC1
 
 /* */
 
-	movhlps		rC12, rC13	/* rC13 = c12bd   c13bd   X       X */
-			addps	rA0, rC3
-						addq	$68, pfA
-	addps		rC13, rC12	/* rC12 = c12abcd c13abcd X       X */
-			addps	rC4, rC7
-			addps	rC5, rC11
-			addps	rC1, rC12
+    movhlps     rC12, rC13  /* rC13 = c12bd   c13bd   X       X */
+            addps   rA0, rC3
+                        addq    $68, pfA
+    addps       rC13, rC12  /* rC12 = c12abcd c13abcd X       X */
+            addps   rC4, rC7
+            addps   rC5, rC11
+            addps   rC1, rC12
    #else  /* BETA = X, complex type */
-			movups	(pC), rA0
-	movlhps		rC4, rC5	/* rC5 = c5a    c5b    c4a    c5a */
-			movups	16(pC), rC4
-	unpcklps	rC9 , rC8 	/* rC8  = c08a    c09a    c08b    c09b */
- 			shufps	$0x88, rC4, rA0    /* rA0 = c0 c1 c2 c3 */
-			movups	32(pC), rC4        /* rC4 = c4 X  c5 X */
-	movhlps		rC1, rC11	/* rC11 = c08d    c09d    c10c    c11c */
-	movlhps		rC10, rC1	/* rC1 = c08c    c09c    c10a    c11a */
-	movhlps		rC5, rC6	/* rC6 = c4a    c5a    c6b    c7b */
-			movups	48(pC), rC5        /* rC5 = c6 X c7 X */
-	movhlps		rC8 , rC0	/* rC0 = c08b    c09b    c10d    c11d */
-	unpcklps	rC13, rC2	/* rC2  = c12a    c13a    c12b    c13b */
-	addps		rC1, rC11	/* rC11 = c08cd   c09cd   c10ac   c11ac */
- 			shufps	$0x88, rC5, rC4    /* rC4 = c4 c5 c6 c7 */
-			movups	64(pC), rC5	   /* rC5 = c8 X  c9 X */
-			movups	80(pC), rC1        /* rC1 = c10 X c11 X */
-	addps		rB0, rC6	/* rC6 = c4ab   c5ab   c6bd   c7bd */
- 			shufps	$0x88, rC1, rC5    /* rC5 = c8 c9 c10 c11 */
-	movlhps		rC8 , rC9 	/* rC9  = c09a    c09b    c08a    c09a */
-			movss	96(pC), rC1
-	unpckhps	rC13, rC12	/* rC12 = c12c    c13c    c12d    c13d */
-			movss	104(pC), rB0
-	movhlps		rC9 , rC10	/* rC10 = c08a    c09a    c10b    c11b */
-			unpcklps	rB0, rC1
-	addps		rC6, rC7	/* rC7 = c4abcd c5abcd c6bdac c7bdac */
-						prefC(256(pC))
-			mulps	BOF(%rsp), rA0
-	addps		rC0, rC10	/* rC10 = c08ab   c09ab   c10bd   c11bd */
-			mulps	BOF(%rsp), rC4
-	addps		rC2, rC12	/* rC12 = c12ac   c13ac   c12bd   c13bd */
-			mulps	BOF(%rsp), rC5
-	addps		rC10, rC11	/* rC11 = c08abcd c09abcd c10bdac c11bdac */
-			mulps	BOF(%rsp), rC1
+            movups  (pC), rA0
+    movlhps     rC4, rC5    /* rC5 = c5a    c5b    c4a    c5a */
+            movups  16(pC), rC4
+    unpcklps    rC9 , rC8   /* rC8  = c08a    c09a    c08b    c09b */
+            shufps  $0x88, rC4, rA0    /* rA0 = c0 c1 c2 c3 */
+            movups  32(pC), rC4        /* rC4 = c4 X  c5 X */
+    movhlps     rC1, rC11   /* rC11 = c08d    c09d    c10c    c11c */
+    movlhps     rC10, rC1   /* rC1 = c08c    c09c    c10a    c11a */
+    movhlps     rC5, rC6    /* rC6 = c4a    c5a    c6b    c7b */
+            movups  48(pC), rC5        /* rC5 = c6 X c7 X */
+    movhlps     rC8 , rC0   /* rC0 = c08b    c09b    c10d    c11d */
+    unpcklps    rC13, rC2   /* rC2  = c12a    c13a    c12b    c13b */
+    addps       rC1, rC11   /* rC11 = c08cd   c09cd   c10ac   c11ac */
+            shufps  $0x88, rC5, rC4    /* rC4 = c4 c5 c6 c7 */
+            movups  64(pC), rC5    /* rC5 = c8 X  c9 X */
+            movups  80(pC), rC1        /* rC1 = c10 X c11 X */
+    addps       rB0, rC6    /* rC6 = c4ab   c5ab   c6bd   c7bd */
+            shufps  $0x88, rC1, rC5    /* rC5 = c8 c9 c10 c11 */
+    movlhps     rC8 , rC9   /* rC9  = c09a    c09b    c08a    c09a */
+            movss   96(pC), rC1
+    unpckhps    rC13, rC12  /* rC12 = c12c    c13c    c12d    c13d */
+            movss   104(pC), rB0
+    movhlps     rC9 , rC10  /* rC10 = c08a    c09a    c10b    c11b */
+            unpcklps    rB0, rC1
+    addps       rC6, rC7    /* rC7 = c4abcd c5abcd c6bdac c7bdac */
+                        prefC(256(pC))
+            mulps   BOF(%rsp), rA0
+    addps       rC0, rC10   /* rC10 = c08ab   c09ab   c10bd   c11bd */
+            mulps   BOF(%rsp), rC4
+    addps       rC2, rC12   /* rC12 = c12ac   c13ac   c12bd   c13bd */
+            mulps   BOF(%rsp), rC5
+    addps       rC10, rC11  /* rC11 = c08abcd c09abcd c10bdac c11bdac */
+            mulps   BOF(%rsp), rC1
 
 /* */
 
-	movhlps		rC12, rC13	/* rC13 = c12bd   c13bd   X       X */
-			addps	rA0, rC3
-						prefC(192(pC))
-						addq	$68, pfA
-	addps		rC13, rC12	/* rC12 = c12abcd c13abcd X       X */
-			addps	rC4, rC7
-			addps	rC5, rC11
-			addps	rC1, rC12
+    movhlps     rC12, rC13  /* rC13 = c12bd   c13bd   X       X */
+            addps   rA0, rC3
+                        prefC(192(pC))
+                        addq    $68, pfA
+    addps       rC13, rC12  /* rC12 = c12abcd c13abcd X       X */
+            addps   rC4, rC7
+            addps   rC5, rC11
+            addps   rC1, rC12
    #endif
 
 #else
-	movlhps		rC4, rC5	/* rC5 = c5a    c5b    c4a    c5a */
-	unpcklps	rC9 , rC8 	/* rC8  = c08a    c09a    c08b    c09b */
-	movhlps		rC1, rC11	/* rC11 = c08d    c09d    c10c    c11c */
-	movlhps		rC10, rC1	/* rC1 = c08c    c09c    c10a    c11a */
-	movhlps		rC5, rC6	/* rC6 = c4a    c5a    c6b    c7b */
-	movhlps		rC8 , rC0	/* rC0 = c08b    c09b    c10d    c11d */
-	unpcklps	rC13, rC2	/* rC2  = c12a    c13a    c12b    c13b */
-	addps		rC1, rC11	/* rC11 = c08cd   c09cd   c10ac   c11ac */
-	addps		rB0, rC6	/* rC6 = c4ab   c5ab   c6bd   c7bd */
-	movlhps		rC8 , rC9 	/* rC9  = c09a    c09b    c08a    c09a */
-	unpckhps	rC13, rC12	/* rC12 = c12c    c13c    c12d    c13d */
-	movhlps		rC9 , rC10	/* rC10 = c08a    c09a    c10b    c11b */
-	addps		rC6, rC7	/* rC7 = c4abcd c5abcd c6bdac c7bdac */
+    movlhps     rC4, rC5    /* rC5 = c5a    c5b    c4a    c5a */
+    unpcklps    rC9 , rC8   /* rC8  = c08a    c09a    c08b    c09b */
+    movhlps     rC1, rC11   /* rC11 = c08d    c09d    c10c    c11c */
+    movlhps     rC10, rC1   /* rC1 = c08c    c09c    c10a    c11a */
+    movhlps     rC5, rC6    /* rC6 = c4a    c5a    c6b    c7b */
+    movhlps     rC8 , rC0   /* rC0 = c08b    c09b    c10d    c11d */
+    unpcklps    rC13, rC2   /* rC2  = c12a    c13a    c12b    c13b */
+    addps       rC1, rC11   /* rC11 = c08cd   c09cd   c10ac   c11ac */
+    addps       rB0, rC6    /* rC6 = c4ab   c5ab   c6bd   c7bd */
+    movlhps     rC8 , rC9   /* rC9  = c09a    c09b    c08a    c09a */
+    unpckhps    rC13, rC12  /* rC12 = c12c    c13c    c12d    c13d */
+    movhlps     rC9 , rC10  /* rC10 = c08a    c09a    c10b    c11b */
+    addps       rC6, rC7    /* rC7 = c4abcd c5abcd c6bdac c7bdac */
    #ifdef SREAL
-						pref2(64(pfA))
+                        pref2(64(pfA))
    #else
-						prefC(256(pC))
+                        prefC(256(pC))
    #endif
-	addps		rC0, rC10	/* rC10 = c08ab   c09ab   c10bd   c11bd */
-	addps		rC2, rC12	/* rC12 = c12ac   c13ac   c12bd   c13bd */
-	addps		rC10, rC11	/* rC11 = c08abcd c09abcd c10bdac c11bdac */
+    addps       rC0, rC10   /* rC10 = c08ab   c09ab   c10bd   c11bd */
+    addps       rC2, rC12   /* rC12 = c12ac   c13ac   c12bd   c13bd */
+    addps       rC10, rC11  /* rC11 = c08abcd c09abcd c10bdac c11bdac */
 
 /* */
 
-	movhlps		rC12, rC13	/* rC13 = c12bd   c13bd   X       X */
+    movhlps     rC12, rC13  /* rC13 = c12bd   c13bd   X       X */
    #ifndef SREAL
-						prefC(192(pC))
+                        prefC(192(pC))
    #endif
-						addq	$68, pfA
-	addps		rC13, rC12	/* rC12 = c12abcd c13abcd X       X */
+                        addq    $68, pfA
+    addps       rC13, rC12  /* rC12 = c12abcd c13abcd X       X */
 
 #endif
 /*
  *      Write results back to C;  pC += 14;
  */
 #ifdef SREAL
-	movups	rC3, (pC)
-	movups	rC7, 16(pC)
-	movups	rC11, 32(pC)
-	movlps	rC12, 48(pC)
-	addq	$56, pC
+    movups  rC3, (pC)           /* unaligned 16-byte store: results 0..3 */
+    movups  rC7, 16(pC)         /* results 4..7 */
+    movups  rC11, 32(pC)        /* results 8..11 */
+    movlps  rC12, 48(pC)        /* low 8 bytes only: results 12,13; upper lanes of rC12 are don't-care */
+    addq    $56, pC             /* pC += 14 results * 4 bytes */
 #else
-	movss	rC3, (pC)
-	movss	rC7, 32(pC)
-	movhlps	rC3, rC0
-	movhlps	rC7, rC6
-	movss	rC0, 16(pC)
-	movss	rC6, 48(pC)
-	shufps	$0x55, rC3, rC3
-	shufps	$0x55, rC7, rC7
-	movss	rC3, 8(pC)
-	movss	rC7, 40(pC)
-	shufps	$0x55, rC0, rC0
-	shufps	$0x55, rC6, rC6
-	movss	rC0, 24(pC)
-	movss	rC6, 56(pC)
-
-	movss	rC11, 64(pC)
-	movhlps	rC11, rC2
-	movss	rC12, 96(pC)
-	movss	rC2, 80(pC)
-	shufps	$0x55, rC11, rC11
-	shufps	$0x55, rC12, rC12
-	movss	rC11, 72(pC)
-	shufps	$0x55, rC2, rC2
-	movss	rC12, 104(pC)
-	movss	rC2, 88(pC)
+    movss   rC3, (pC)           /* scalar stores, 8 bytes apart: result i goes to byte offset 8*i */
+    movss   rC7, 32(pC)         /* rC7 lane 0 = result 4 */
+    movhlps rC3, rC0            /* rC0 low half = rC3 lanes 2,3 */
+    movhlps rC7, rC6            /* rC6 low half = rC7 lanes 2,3 */
+    movss   rC0, 16(pC)         /* result 2 */
+    movss   rC6, 48(pC)         /* result 6 */
+    shufps  $0x55, rC3, rC3     /* 0x55 with src==dst broadcasts lane 1 into every lane */
+    shufps  $0x55, rC7, rC7
+    movss   rC3, 8(pC)          /* result 1 */
+    movss   rC7, 40(pC)         /* result 5 */
+    shufps  $0x55, rC0, rC0     /* lane 1 of rC0 is the original rC3 lane 3 */
+    shufps  $0x55, rC6, rC6     /* lane 1 of rC6 is the original rC7 lane 3 */
+    movss   rC0, 24(pC)         /* result 3 */
+    movss   rC6, 56(pC)         /* result 7 */
+
+    movss   rC11, 64(pC)        /* result 8 */
+    movhlps rC11, rC2           /* rC2 low half = rC11 lanes 2,3 */
+    movss   rC12, 96(pC)        /* result 12 */
+    movss   rC2, 80(pC)         /* result 10 */
+    shufps  $0x55, rC11, rC11   /* broadcast lane 1 */
+    shufps  $0x55, rC12, rC12   /* broadcast lane 1 */
+    movss   rC11, 72(pC)        /* result 9 */
+    shufps  $0x55, rC2, rC2     /* lane 1 of rC2 is the original rC11 lane 3 */
+    movss   rC12, 104(pC)       /* result 13 */
+    movss   rC2, 88(pC)         /* result 11 */
 
-	addq	$112, pC
+    addq    $112, pC            /* pC += 14 results * 8-byte stride */
 #endif
 /*
  *      Write results back to C
  */
-		addq	$NB14so-176, pA5
-		addq	$NB14so-176, pA10
-		subq	$176, pB0
+        addq    $NB14so-176, pA5    /* NB14so is defined outside this excerpt; presumably the byte size of 14 rows of A - TODO confirm */
+        addq    $NB14so-176, pA10   /* same step as pA5, so the two A base pointers stay a fixed distance apart */
+        subq    $176, pB0           /* move pB0 back 176 bytes; presumably undoes K-loop advances - TODO confirm */
 /*
  *      pC += 14;  pA += 14*NB; pB -= NB;
  */
 /*
  *      while (pA != stM);
  */
-	subq	$1, stM
-	jne	UMLOOP
+    subq    $1, stM             /* stM is decremented as a countdown; subq sets ZF when it reaches 0 */
+    jne UMLOOP                  /* repeat the UMLOOP body while stM != 0 */
 #endif
 
 /*
@@ -1459,994 +1466,994 @@ MLAST:
 #endif
 /*UKLOOP: */
 #ifdef BETA1
-	movaps	0-120(pA10,mldab5,2), rC0
-	movaps	0-120(pB0), rB0
-	mulps	rB0, rC0
-	addss	(pC), rC0
-	movaps	0-120(pA5, mldab,4), rC1
-	mulps	rB0, rC1
-	addss	CMUL(4)(pC), rC1
-	movaps	0-120(pA10, mldab,8), rC2
-	mulps	rB0, rC2
-	addss	CMUL(8)(pC), rC2
-	movaps	0-120(pA5, mldab,2), rC3
-	mulps	rB0, rC3
-	addss	CMUL(12)(pC), rC3
-	movaps	0-120(pA5, mldab), rC4
-	mulps	rB0, rC4
-	addss	CMUL(16)(pC), rC4
-	movaps	0-120(pA5), rC5
-	mulps	rB0, rC5
-	addss	CMUL(20)(pC), rC5
-	movaps	0-120(pA5, ldab), rC6
-	mulps	rB0, rC6
-	addss	CMUL(24)(pC), rC6
-	movaps	0-120(pA5, ldab,2), rC7
-	mulps	rB0, rC7
-	addss	CMUL(28)(pC), rC7
-	movaps	0-120(pA10, mldab,2), rC8
-	mulps	rB0, rC8
-	addss	CMUL(32)(pC), rC8
-	movaps	0-120(pA5,ldab,4), rC9
-	mulps	rB0, rC9
-	addss	CMUL(36)(pC), rC9
-	movaps	0-120(pA10), rC10
-	mulps	rB0, rC10
-	addss	CMUL(40)(pC), rC10
-	movaps	0-120(pA10,ldab), rC11
-	mulps	rB0, rC11
-	addss	CMUL(44)(pC), rC11
-	movaps	0-120(pA10,ldab,2), rC12
-	mulps	rB0, rC12
-	addss	CMUL(48)(pC), rC12
-	movaps	0-120(pA5,ldab,8), rC13
-	mulps	rB0, rC13
-	addss	CMUL(52)(pC), rC13
+    movaps  0-120(pA10,mldab5,2), rC0
+    movaps  0-120(pB0), rB0
+    mulps   rB0, rC0
+    addss   (pC), rC0
+    movaps  0-120(pA5, mldab,4), rC1
+    mulps   rB0, rC1
+    addss   CMUL(4)(pC), rC1
+    movaps  0-120(pA10, mldab,8), rC2
+    mulps   rB0, rC2
+    addss   CMUL(8)(pC), rC2
+    movaps  0-120(pA5, mldab,2), rC3
+    mulps   rB0, rC3
+    addss   CMUL(12)(pC), rC3
+    movaps  0-120(pA5, mldab), rC4
+    mulps   rB0, rC4
+    addss   CMUL(16)(pC), rC4
+    movaps  0-120(pA5), rC5
+    mulps   rB0, rC5
+    addss   CMUL(20)(pC), rC5
+    movaps  0-120(pA5, ldab), rC6
+    mulps   rB0, rC6
+    addss   CMUL(24)(pC), rC6
+    movaps  0-120(pA5, ldab,2), rC7
+    mulps   rB0, rC7
+    addss   CMUL(28)(pC), rC7
+    movaps  0-120(pA10, mldab,2), rC8
+    mulps   rB0, rC8
+    addss   CMUL(32)(pC), rC8
+    movaps  0-120(pA5,ldab,4), rC9
+    mulps   rB0, rC9
+    addss   CMUL(36)(pC), rC9
+    movaps  0-120(pA10), rC10
+    mulps   rB0, rC10
+    addss   CMUL(40)(pC), rC10
+    movaps  0-120(pA10,ldab), rC11
+    mulps   rB0, rC11
+    addss   CMUL(44)(pC), rC11
+    movaps  0-120(pA10,ldab,2), rC12
+    mulps   rB0, rC12
+    addss   CMUL(48)(pC), rC12
+    movaps  0-120(pA5,ldab,8), rC13
+    mulps   rB0, rC13
+    addss   CMUL(52)(pC), rC13
 #else
-	movaps	0-120(pA10,mldab5,2), rC0
-	movaps	0-120(pB0), rC13
-	mulps	rC13, rC0
-	movaps	0-120(pA5, mldab,4), rC1
-	mulps	rC13, rC1
-	movaps	0-120(pA10, mldab,8), rC2
-	mulps	rC13, rC2
-	movaps	0-120(pA5, mldab,2), rC3
-	mulps	rC13, rC3
-	movaps	0-120(pA5, mldab), rC4
-	mulps	rC13, rC4
-	movaps	0-120(pA5), rC5
-	mulps	rC13, rC5
-	movaps	0-120(pA5, ldab), rC6
-	mulps	rC13, rC6
-	movaps	0-120(pA5, ldab,2), rC7
-	mulps	rC13, rC7
-	movaps	0-120(pA10, mldab,2), rC8
-	mulps	rC13, rC8
-	movaps	0-120(pA5,ldab,4), rC9
-	mulps	rC13, rC9
-	movaps	0-120(pA10), rC10
-	mulps	rC13, rC10
-	movaps	0-120(pA10,ldab), rC11
-	mulps	rC13, rC11
-	movaps	0-120(pA10,ldab,2), rC12
-	mulps	rC13, rC12
-	mulps 	0-120(pA5,ldab,8), rC13
+    movaps  0-120(pA10,mldab5,2), rC0
+    movaps  0-120(pB0), rC13
+    mulps   rC13, rC0
+    movaps  0-120(pA5, mldab,4), rC1
+    mulps   rC13, rC1
+    movaps  0-120(pA10, mldab,8), rC2
+    mulps   rC13, rC2
+    movaps  0-120(pA5, mldab,2), rC3
+    mulps   rC13, rC3
+    movaps  0-120(pA5, mldab), rC4
+    mulps   rC13, rC4
+    movaps  0-120(pA5), rC5
+    mulps   rC13, rC5
+    movaps  0-120(pA5, ldab), rC6
+    mulps   rC13, rC6
+    movaps  0-120(pA5, ldab,2), rC7
+    mulps   rC13, rC7
+    movaps  0-120(pA10, mldab,2), rC8
+    mulps   rC13, rC8
+    movaps  0-120(pA5,ldab,4), rC9
+    mulps   rC13, rC9
+    movaps  0-120(pA10), rC10
+    mulps   rC13, rC10
+    movaps  0-120(pA10,ldab), rC11
+    mulps   rC13, rC11
+    movaps  0-120(pA10,ldab,2), rC12
+    mulps   rC13, rC12
+    mulps   0-120(pA5,ldab,8), rC13
 #endif
 
 #if KB > 4
-	movaps	16-120(pA10,mldab5,2), rA0
-	movaps	16-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	16-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	16-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	16-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	16-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	16-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	16-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	16-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	16-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	16-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	16-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	16-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	16-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	16-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  16-120(pA10,mldab5,2), rA0
+    movaps  16-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  16-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  16-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  16-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  16-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  16-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  16-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  16-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  16-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  16-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  16-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  16-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  16-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   16-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 8
-	movaps	32-120(pA10,mldab5,2), rA0
-	movaps	32-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	32-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	32-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	32-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	32-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	32-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	32-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	32-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	32-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	32-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	32-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	32-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	32-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	32-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  32-120(pA10,mldab5,2), rA0
+    movaps  32-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  32-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  32-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  32-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  32-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  32-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  32-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  32-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  32-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  32-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  32-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  32-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  32-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   32-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 12
-	movaps	48-120(pA10,mldab5,2), rA0
-	movaps	48-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	48-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	48-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	48-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	48-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	48-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	48-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	48-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	48-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	48-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	48-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	48-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	48-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	48-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  48-120(pA10,mldab5,2), rA0
+    movaps  48-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  48-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  48-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  48-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  48-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  48-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  48-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  48-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  48-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  48-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  48-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  48-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  48-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   48-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 16
-	movaps	64-120(pA10,mldab5,2), rA0
-	movaps	64-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	64-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	64-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	64-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	64-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	64-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	64-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	64-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	64-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	64-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	64-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	64-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	64-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	64-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  64-120(pA10,mldab5,2), rA0
+    movaps  64-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  64-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  64-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  64-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  64-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  64-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  64-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  64-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  64-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  64-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  64-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  64-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  64-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   64-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 20
-	movaps	80-120(pA10,mldab5,2), rA0
-	movaps	80-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	80-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	80-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	80-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	80-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	80-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	80-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	80-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	80-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	80-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	80-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	80-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	80-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	80-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  80-120(pA10,mldab5,2), rA0
+    movaps  80-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  80-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  80-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  80-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  80-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  80-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  80-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  80-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  80-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  80-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  80-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  80-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  80-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   80-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 24
-	movaps	96-120(pA10,mldab5,2), rA0
-	movaps	96-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	96-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	96-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	96-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	96-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	96-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	96-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	96-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	96-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	96-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	96-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	96-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	96-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	96-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  96-120(pA10,mldab5,2), rA0
+    movaps  96-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  96-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  96-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  96-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  96-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  96-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  96-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  96-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  96-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  96-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  96-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  96-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  96-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   96-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 28
-	movaps	112-120(pA10,mldab5,2), rA0
-	movaps	112-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	112-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	112-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	112-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	112-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	112-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	112-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	112-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	112-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	112-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	112-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	112-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	112-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	112-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  112-120(pA10,mldab5,2), rA0
+    movaps  112-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  112-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  112-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  112-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  112-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  112-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  112-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  112-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  112-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  112-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  112-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  112-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  112-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   112-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 32
-	movaps	128-120(pA10,mldab5,2), rA0
-	movaps	128-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	128-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	128-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	128-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	128-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	128-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	128-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	128-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	128-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	128-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	128-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	128-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	128-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	128-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  128-120(pA10,mldab5,2), rA0
+    movaps  128-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  128-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  128-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  128-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  128-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  128-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  128-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  128-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  128-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  128-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  128-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  128-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  128-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   128-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 36
-	movaps	144-120(pA10,mldab5,2), rA0
-	movaps	144-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	144-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	144-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	144-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	144-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	144-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	144-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	144-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	144-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	144-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	144-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	144-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	144-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	144-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  144-120(pA10,mldab5,2), rA0
+    movaps  144-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  144-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  144-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  144-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  144-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  144-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  144-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  144-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  144-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  144-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  144-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  144-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  144-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   144-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
-						prefB((pB,ldab))
-						prefB(64(pB,ldab))
+                        prefB((pB,ldab))
+                        prefB(64(pB,ldab))
 
 #if KB > 40
-	movaps	160-120(pA10,mldab5,2), rA0
-	movaps	160-120(pB0), rB0
-	mulps	rB0, rA0
-				addq $176, pB0
-	addps	rA0, rC0
-	movaps	160-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	160-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	160-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	160-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	160-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	160-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	160-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	160-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	160-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	160-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	160-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	160-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-				addq $176, pA10
-	addps	rA0, rC12
-	mulps	160-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
-				addq $176, pA5
+    movaps  160-120(pA10,mldab5,2), rA0
+    movaps  160-120(pB0), rB0
+    mulps   rB0, rA0
+                addq $176, pB0
+    addps   rA0, rC0
+    movaps  160-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  160-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  160-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  160-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  160-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  160-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  160-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  160-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  160-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  160-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  160-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  160-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+                addq $176, pA10
+    addps   rA0, rC12
+    mulps   160-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
+                addq $176, pA5
 #else
-				addq $176, pB0
-				addq $176, pA10
-				addq $176, pA5
+                addq $176, pB0
+                addq $176, pA10
+                addq $176, pA5
 #endif
 
 #if KB > 44
-	movaps	0-120(pA10,mldab5,2), rA0
-	movaps	0-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	0-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	0-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	0-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	0-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	0-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	0-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	0-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	0-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	0-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	0-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	0-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	0-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	0-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  0-120(pA10,mldab5,2), rA0
+    movaps  0-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  0-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  0-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  0-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  0-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  0-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  0-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  0-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  0-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  0-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  0-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  0-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  0-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   0-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 48
-	movaps	16-120(pA10,mldab5,2), rA0
-	movaps	16-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	16-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	16-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	16-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	16-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	16-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	16-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	16-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	16-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	16-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	16-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	16-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	16-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	16-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  16-120(pA10,mldab5,2), rA0
+    movaps  16-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  16-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  16-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  16-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  16-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  16-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  16-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  16-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  16-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  16-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  16-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  16-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  16-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   16-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 52
-	movaps	32-120(pA10,mldab5,2), rA0
-	movaps	32-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	32-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	32-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	32-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	32-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	32-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	32-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	32-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	32-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	32-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	32-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	32-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	32-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	32-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  32-120(pA10,mldab5,2), rA0
+    movaps  32-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  32-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  32-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  32-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  32-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  32-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  32-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  32-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  32-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  32-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  32-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  32-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  32-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   32-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 56
-	movaps	48-120(pA10,mldab5,2), rA0
-	movaps	48-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	48-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	48-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	48-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	48-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	48-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	48-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	48-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	48-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	48-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	48-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	48-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	48-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	48-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  48-120(pA10,mldab5,2), rA0
+    movaps  48-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  48-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  48-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  48-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  48-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  48-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  48-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  48-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  48-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  48-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  48-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  48-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  48-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   48-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 60
-	movaps	64-120(pA10,mldab5,2), rA0
-	movaps	64-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	64-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	64-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	64-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	64-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	64-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	64-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	64-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	64-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	64-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	64-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	64-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	64-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	64-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  64-120(pA10,mldab5,2), rA0
+    movaps  64-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  64-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  64-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  64-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  64-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  64-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  64-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  64-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  64-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  64-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  64-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  64-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  64-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   64-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
-						prefB(128-176(pB,ldab))
-						prefB(192-176(pB,ldab))
+                        prefB(128-176(pB,ldab))
+                        prefB(192-176(pB,ldab))
 
 #if KB > 64
-	movaps	80-120(pA10,mldab5,2), rA0
-	movaps	80-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	80-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	80-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	80-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	80-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	80-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	80-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	80-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	80-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	80-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	80-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	80-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	80-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	80-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  80-120(pA10,mldab5,2), rA0
+    movaps  80-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  80-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  80-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  80-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  80-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  80-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  80-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  80-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  80-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  80-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  80-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  80-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  80-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   80-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 68
-	movaps	96-120(pA10,mldab5,2), rA0
-	movaps	96-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	96-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	96-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	96-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	96-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	96-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	96-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	96-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	96-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	96-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	96-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	96-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	96-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	96-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  96-120(pA10,mldab5,2), rA0
+    movaps  96-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  96-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  96-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  96-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  96-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  96-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  96-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  96-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  96-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  96-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  96-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  96-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  96-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   96-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 72
-	movaps	112-120(pA10,mldab5,2), rA0
-	movaps	112-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	112-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	112-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	112-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	112-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	112-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	112-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	112-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	112-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	112-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	112-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	112-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	112-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	112-120(pA5,ldab,8), rB0
-						prefC((pC))
-						prefC((pC,incCn))
-	addps	rB0, rC13
+    movaps  112-120(pA10,mldab5,2), rA0
+    movaps  112-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  112-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  112-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  112-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  112-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  112-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  112-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  112-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  112-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  112-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  112-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  112-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  112-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   112-120(pA5,ldab,8), rB0
+                        prefC((pC))
+                        prefC((pC,incCn))
+    addps   rB0, rC13
 #else
-						prefC((pC))
-						prefC((pC,incCn))
+                        prefC((pC))
+                        prefC((pC,incCn))
 #endif
 
 #if KB > 76
-	movaps	128-120(pA10,mldab5,2), rA0
-	movaps	128-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	128-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	128-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	128-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	128-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	128-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	128-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	128-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	128-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	128-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	128-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	128-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	128-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	128-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  128-120(pA10,mldab5,2), rA0
+    movaps  128-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  128-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  128-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  128-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  128-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  128-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  128-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  128-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  128-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  128-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  128-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  128-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  128-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   128-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 #if KB > 80
-	movaps	144-120(pA10,mldab5,2), rA0
-	movaps	144-120(pB0), rB0
-	mulps	rB0, rA0
-	addps	rA0, rC0
-	movaps	144-120(pA5, mldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC1
-	movaps	144-120(pA10, mldab,8), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC2
-	movaps	144-120(pA5, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC3
-	movaps	144-120(pA5, mldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC4
-	movaps	144-120(pA5), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC5
-	movaps	144-120(pA5, ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC6
-	movaps	144-120(pA5, ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC7
-	movaps	144-120(pA10, mldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC8
-	movaps	144-120(pA5,ldab,4), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC9
-	movaps	144-120(pA10), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC10
-	movaps	144-120(pA10,ldab), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC11
-	movaps	144-120(pA10,ldab,2), rA0
-	mulps	rB0, rA0
-	addps	rA0, rC12
-	mulps	144-120(pA5,ldab,8), rB0
-	addps	rB0, rC13
+    movaps  144-120(pA10,mldab5,2), rA0
+    movaps  144-120(pB0), rB0
+    mulps   rB0, rA0
+    addps   rA0, rC0
+    movaps  144-120(pA5, mldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC1
+    movaps  144-120(pA10, mldab,8), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC2
+    movaps  144-120(pA5, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC3
+    movaps  144-120(pA5, mldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC4
+    movaps  144-120(pA5), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC5
+    movaps  144-120(pA5, ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC6
+    movaps  144-120(pA5, ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC7
+    movaps  144-120(pA10, mldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC8
+    movaps  144-120(pA5,ldab,4), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC9
+    movaps  144-120(pA10), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC10
+    movaps  144-120(pA10,ldab), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC11
+    movaps  144-120(pA10,ldab,2), rA0
+    mulps   rB0, rA0
+    addps   rA0, rC12
+    mulps   144-120(pA5,ldab,8), rB0
+    addps   rB0, rC13
 #endif
 
 /*UKLOOP */
@@ -2454,202 +2461,202 @@ MLAST:
  *      Get these bastard things summed up correctly
  */
 
-					/* rC0 = c0a    c0b    c0c    c0d */
-					/* rC1 = c1a    c1b    c1c    c1d */
-					/* rC2 = c2a    c2b    c2c    c2d */
-					/* rC3 = c3a    c3b    c3c    c3d */
+                    /* rC0 = c0a    c0b    c0c    c0d */
+                    /* rC1 = c1a    c1b    c1c    c1d */
+                    /* rC2 = c2a    c2b    c2c    c2d */
+                    /* rC3 = c3a    c3b    c3c    c3d */
 /* */
-	movaps		rC2, rB0	/* rB0 = c2a    c2b    c2c    c2d */
-						prefC(64(pC,incCn))
-						prefB(256-176(pB,ldab))
-	movaps		rC0, rA0	/* rA0 = c0a    c0b    c0c    c0d */
-	unpckhps	rC3, rB0	/* rB0 = c2c    c3c    c2d    c3d */
-	unpckhps	rC1, rA0	/* rA0 = c0c    c1c    c0d    c1d */
-	unpcklps	rC3, rC2	/* rC2 = c2a    c3a    c2b    c3b */
-	movlhps		rB0, rC3	/* rC3 = c3a    c3b    c2c    c3c */
-	unpcklps	rC1, rC0	/* rC0 = c0a    c1a    c0b    c1b */
-	movhlps		rA0, rC3	/* rC3 = c0d    c1d    c2c    c3c */
-	movlhps		rC2, rA0	/* rA0 = c0c    c1c    c2a    c3a */
-	movhlps		rC0, rB0	/* rB0 = c0b    c1b    c2d    c3d */
-	addps		rA0, rC3	/* rC3 = c0cd   c1cd   c2ac   c3ac */
-	movlhps		rC0, rC1	/* rC1 = c1a    c1b    c0a    c1a */
-	movhlps		rC1, rC2	/* rC2 = c0a    c1a    c2b    c3b */
-	movaps		rC4, rA0	/* rA0 = c4a    c4b    c4c    c4d */
-	addps		rB0, rC2	/* rC2 = c0ab   c1ab   c2bd   c3bd */
-	movaps		rC6, rB0	/* rB0 = c6a    c6b    c6c    c6d */
-	addps		rC2, rC3	/* rC3 = c0abcd c1abcd c2bdac c3bdac */
-
-
-					/* rC4 = c4a    c4b    c4c    c4d */
-					/* rC5 = c5a    c5b    c5c    c5d */
-					/* rC6 = c6a    c6b    c6c    c6d */
-					/* rC7 = c7a    c7b    c7c    c7d */
-					/* rC8  = c08a    c08b    c08c    c08d */
-					/* rC9  = c09a    c09b    c09c    c09d */
-					/* rC10 = c10a    c10b    c10c    c10d */
-					/* rC11 = c11a    c11b    c11c    c11d */
-					/* rC12 = c12a    c12b    c12c    c12d */
-					/* rC13 = c13a    c13b    c13c    c13d */
+    movaps      rC2, rB0    /* rB0 = c2a    c2b    c2c    c2d */
+                        prefC(64(pC,incCn))
+                        prefB(256-176(pB,ldab))
+    movaps      rC0, rA0    /* rA0 = c0a    c0b    c0c    c0d */
+    unpckhps    rC3, rB0    /* rB0 = c2c    c3c    c2d    c3d */
+    unpckhps    rC1, rA0    /* rA0 = c0c    c1c    c0d    c1d */
+    unpcklps    rC3, rC2    /* rC2 = c2a    c3a    c2b    c3b */
+    movlhps     rB0, rC3    /* rC3 = c3a    c3b    c2c    c3c */
+    unpcklps    rC1, rC0    /* rC0 = c0a    c1a    c0b    c1b */
+    movhlps     rA0, rC3    /* rC3 = c0d    c1d    c2c    c3c */
+    movlhps     rC2, rA0    /* rA0 = c0c    c1c    c2a    c3a */
+    movhlps     rC0, rB0    /* rB0 = c0b    c1b    c2d    c3d */
+    addps       rA0, rC3    /* rC3 = c0cd   c1cd   c2ac   c3ac */
+    movlhps     rC0, rC1    /* rC1 = c1a    c1b    c0a    c1a */
+    movhlps     rC1, rC2    /* rC2 = c0a    c1a    c2b    c3b */
+    movaps      rC4, rA0    /* rA0 = c4a    c4b    c4c    c4d */
+    addps       rB0, rC2    /* rC2 = c0ab   c1ab   c2bd   c3bd */
+    movaps      rC6, rB0    /* rB0 = c6a    c6b    c6c    c6d */
+    addps       rC2, rC3    /* rC3 = c0abcd c1abcd c2bdac c3bdac */
+
+
+                    /* rC4 = c4a    c4b    c4c    c4d */
+                    /* rC5 = c5a    c5b    c5c    c5d */
+                    /* rC6 = c6a    c6b    c6c    c6d */
+                    /* rC7 = c7a    c7b    c7c    c7d */
+                    /* rC8  = c08a    c08b    c08c    c08d */
+                    /* rC9  = c09a    c09b    c09c    c09d */
+                    /* rC10 = c10a    c10b    c10c    c10d */
+                    /* rC11 = c11a    c11b    c11c    c11d */
+                    /* rC12 = c12a    c12b    c12c    c12d */
+                    /* rC13 = c13a    c13b    c13c    c13d */
 /* */
-	movaps		rC10, rC0	/* rC0 = c10a    c10b    c10c    c10d */
-	movaps		rC8 , rC1	/* rC1 = c08a    c08b    c08c    c08d */
-	movaps          rC12, rC2 	/* rC2  = c12a    c12b    c12c    c12d */
-	unpckhps	rC7, rB0	/* rB0 = c6c    c7c    c6d    c7d */
-	unpckhps	rC5, rA0	/* rA0 = c4c    c5c    c4d    c5d */
-	unpcklps	rC7, rC6	/* rC6 = c6a    c7a    c6b    c7b */
-	unpckhps	rC11, rC0	/* rC0 = c10c    c11c    c10d    c11d */
-	unpckhps	rC9 , rC1	/* rC1 = c08c    c09c    c08d    c09d */
-	movlhps		rB0, rC7	/* rC7 = c7a    c7b    c6c    c7c */
-	unpcklps	rC5, rC4	/* rC4 = c4a    c5a    c4b    c5b */
-	movhlps		rA0, rC7	/* rC7 = c4d    c5d    c6c    c7c */
-	movlhps		rC6, rA0	/* rA0 = c4c    c5c    c6a    c7a */
-	unpcklps	rC11, rC10	/* rC10 = c10a    c11a    c10b    c11b */
-	movhlps		rC4, rB0	/* rB0 = c4b    c5b    c6d    c7d */
-	movlhps		rC0, rC11	/* rC11 = c11a    c11b    c10c    c11c */
-	addps		rA0, rC7	/* rC7 = c4cd   c5cd   c6ac   c7ac */
+    movaps      rC10, rC0   /* rC0 = c10a    c10b    c10c    c10d */
+    movaps      rC8 , rC1   /* rC1 = c08a    c08b    c08c    c08d */
+    movaps          rC12, rC2   /* rC2  = c12a    c12b    c12c    c12d */
+    unpckhps    rC7, rB0    /* rB0 = c6c    c7c    c6d    c7d */
+    unpckhps    rC5, rA0    /* rA0 = c4c    c5c    c4d    c5d */
+    unpcklps    rC7, rC6    /* rC6 = c6a    c7a    c6b    c7b */
+    unpckhps    rC11, rC0   /* rC0 = c10c    c11c    c10d    c11d */
+    unpckhps    rC9 , rC1   /* rC1 = c08c    c09c    c08d    c09d */
+    movlhps     rB0, rC7    /* rC7 = c7a    c7b    c6c    c7c */
+    unpcklps    rC5, rC4    /* rC4 = c4a    c5a    c4b    c5b */
+    movhlps     rA0, rC7    /* rC7 = c4d    c5d    c6c    c7c */
+    movlhps     rC6, rA0    /* rA0 = c4c    c5c    c6a    c7a */
+    unpcklps    rC11, rC10  /* rC10 = c10a    c11a    c10b    c11b */
+    movhlps     rC4, rB0    /* rB0 = c4b    c5b    c6d    c7d */
+    movlhps     rC0, rC11   /* rC11 = c11a    c11b    c10c    c11c */
+    addps       rA0, rC7    /* rC7 = c4cd   c5cd   c6ac   c7ac */
 #ifdef BETAX
    #ifdef SREAL
-			movups	(pC), rA0
-	movlhps		rC4, rC5	/* rC5 = c5a    c5b    c4a    c5a */
-			movups	16(pC), rC4
-	unpcklps	rC9 , rC8 	/* rC8  = c08a    c09a    c08b    c09b */
-	movhlps		rC1, rC11	/* rC11 = c08d    c09d    c10c    c11c */
-	movlhps		rC10, rC1	/* rC1 = c08c    c09c    c10a    c11a */
-	movhlps		rC5, rC6	/* rC6 = c4a    c5a    c6b    c7b */
-			movups	32(pC), rC5
-	movhlps		rC8 , rC0	/* rC0 = c08b    c09b    c10d    c11d */
-	unpcklps	rC13, rC2	/* rC2  = c12a    c13a    c12b    c13b */
-	addps		rC1, rC11	/* rC11 = c08cd   c09cd   c10ac   c11ac */
-			movlps	48(pC), rC1
-	addps		rB0, rC6	/* rC6 = c4ab   c5ab   c6bd   c7bd */
-	movlhps		rC8 , rC9 	/* rC9  = c09a    c09b    c08a    c09a */
-	unpckhps	rC13, rC12	/* rC12 = c12c    c13c    c12d    c13d */
-	movhlps		rC9 , rC10	/* rC10 = c08a    c09a    c10b    c11b */
-	addps		rC6, rC7	/* rC7 = c4abcd c5abcd c6bdac c7bdac */
-			mulps	BOF(%rsp), rA0
-	addps		rC0, rC10	/* rC10 = c08ab   c09ab   c10bd   c11bd */
-			mulps	BOF(%rsp), rC4
-	addps		rC2, rC12	/* rC12 = c12ac   c13ac   c12bd   c13bd */
-			mulps	BOF(%rsp), rC5
-	addps		rC10, rC11	/* rC11 = c08abcd c09abcd c10bdac c11bdac */
-			mulps	BOF(%rsp), rC1
+            movups  (pC), rA0
+    movlhps     rC4, rC5    /* rC5 = c5a    c5b    c4a    c5a */
+            movups  16(pC), rC4
+    unpcklps    rC9 , rC8   /* rC8  = c08a    c09a    c08b    c09b */
+    movhlps     rC1, rC11   /* rC11 = c08d    c09d    c10c    c11c */
+    movlhps     rC10, rC1   /* rC1 = c08c    c09c    c10a    c11a */
+    movhlps     rC5, rC6    /* rC6 = c4a    c5a    c6b    c7b */
+            movups  32(pC), rC5
+    movhlps     rC8 , rC0   /* rC0 = c08b    c09b    c10d    c11d */
+    unpcklps    rC13, rC2   /* rC2  = c12a    c13a    c12b    c13b */
+    addps       rC1, rC11   /* rC11 = c08cd   c09cd   c10ac   c11ac */
+            movlps  48(pC), rC1
+    addps       rB0, rC6    /* rC6 = c4ab   c5ab   c6bd   c7bd */
+    movlhps     rC8 , rC9   /* rC9  = c09a    c09b    c08a    c09a */
+    unpckhps    rC13, rC12  /* rC12 = c12c    c13c    c12d    c13d */
+    movhlps     rC9 , rC10  /* rC10 = c08a    c09a    c10b    c11b */
+    addps       rC6, rC7    /* rC7 = c4abcd c5abcd c6bdac c7bdac */
+            mulps   BOF(%rsp), rA0
+    addps       rC0, rC10   /* rC10 = c08ab   c09ab   c10bd   c11bd */
+            mulps   BOF(%rsp), rC4
+    addps       rC2, rC12   /* rC12 = c12ac   c13ac   c12bd   c13bd */
+            mulps   BOF(%rsp), rC5
+    addps       rC10, rC11  /* rC11 = c08abcd c09abcd c10bdac c11bdac */
+            mulps   BOF(%rsp), rC1
 
 /* */
 
-	movhlps		rC12, rC13	/* rC13 = c12bd   c13bd   X       X */
-			addps	rA0, rC3
-	addps		rC13, rC12	/* rC12 = c12abcd c13abcd X       X */
-			addps	rC4, rC7
-			addps	rC5, rC11
-						prefB(320-176(pB,ldab))
-			addps	rC1, rC12
+    movhlps     rC12, rC13  /* rC13 = c12bd   c13bd   X       X */
+            addps   rA0, rC3
+    addps       rC13, rC12  /* rC12 = c12abcd c13abcd X       X */
+            addps   rC4, rC7
+            addps   rC5, rC11
+                        prefB(320-176(pB,ldab))
+            addps   rC1, rC12
    #else  /* BETA = X, complex type */
-			movups	(pC), rA0
-	movlhps		rC4, rC5	/* rC5 = c5a    c5b    c4a    c5a */
-			movups	16(pC), rC4
-	unpcklps	rC9 , rC8 	/* rC8  = c08a    c09a    c08b    c09b */
- 			shufps	$0x88, rC4, rA0    /* rA0 = c0 c1 c2 c3 */
-			movups	32(pC), rC4        /* rC4 = c4 X  c5 X */
-	movhlps		rC1, rC11	/* rC11 = c08d    c09d    c10c    c11c */
-	movlhps		rC10, rC1	/* rC1 = c08c    c09c    c10a    c11a */
-	movhlps		rC5, rC6	/* rC6 = c4a    c5a    c6b    c7b */
-			movups	48(pC), rC5        /* rC5 = c6 X c7 X */
-	movhlps		rC8 , rC0	/* rC0 = c08b    c09b    c10d    c11d */
-	unpcklps	rC13, rC2	/* rC2  = c12a    c13a    c12b    c13b */
-	addps		rC1, rC11	/* rC11 = c08cd   c09cd   c10ac   c11ac */
- 			shufps	$0x88, rC5, rC4    /* rC4 = c4 c5 c6 c7 */
-			movups	64(pC), rC5	   /* rC5 = c8 X  c9 X */
-			movups	80(pC), rC1        /* rC1 = c10 X c11 X */
-	addps		rB0, rC6	/* rC6 = c4ab   c5ab   c6bd   c7bd */
- 			shufps	$0x88, rC1, rC5    /* rC5 = c8 c9 c10 c11 */
-	movlhps		rC8 , rC9 	/* rC9  = c09a    c09b    c08a    c09a */
-			movss	96(pC), rC1
-	unpckhps	rC13, rC12	/* rC12 = c12c    c13c    c12d    c13d */
-			movss	104(pC), rB0
-	movhlps		rC9 , rC10	/* rC10 = c08a    c09a    c10b    c11b */
-			unpcklps	rB0, rC1
-	addps		rC6, rC7	/* rC7 = c4abcd c5abcd c6bdac c7bdac */
-			mulps	BOF(%rsp), rA0
-	addps		rC0, rC10	/* rC10 = c08ab   c09ab   c10bd   c11bd */
-			mulps	BOF(%rsp), rC4
-	addps		rC2, rC12	/* rC12 = c12ac   c13ac   c12bd   c13bd */
-			mulps	BOF(%rsp), rC5
-	addps		rC10, rC11	/* rC11 = c08abcd c09abcd c10bdac c11bdac */
-			mulps	BOF(%rsp), rC1
+            movups  (pC), rA0
+    movlhps     rC4, rC5    /* rC5 = c5a    c5b    c4a    c5a */
+            movups  16(pC), rC4
+    unpcklps    rC9 , rC8   /* rC8  = c08a    c09a    c08b    c09b */
+            shufps  $0x88, rC4, rA0    /* rA0 = c0 c1 c2 c3 */
+            movups  32(pC), rC4        /* rC4 = c4 X  c5 X */
+    movhlps     rC1, rC11   /* rC11 = c08d    c09d    c10c    c11c */
+    movlhps     rC10, rC1   /* rC1 = c08c    c09c    c10a    c11a */
+    movhlps     rC5, rC6    /* rC6 = c4a    c5a    c6b    c7b */
+            movups  48(pC), rC5        /* rC5 = c6 X c7 X */
+    movhlps     rC8 , rC0   /* rC0 = c08b    c09b    c10d    c11d */
+    unpcklps    rC13, rC2   /* rC2  = c12a    c13a    c12b    c13b */
+    addps       rC1, rC11   /* rC11 = c08cd   c09cd   c10ac   c11ac */
+            shufps  $0x88, rC5, rC4    /* rC4 = c4 c5 c6 c7 */
+            movups  64(pC), rC5    /* rC5 = c8 X  c9 X */
+            movups  80(pC), rC1        /* rC1 = c10 X c11 X */
+    addps       rB0, rC6    /* rC6 = c4ab   c5ab   c6bd   c7bd */
+            shufps  $0x88, rC1, rC5    /* rC5 = c8 c9 c10 c11 */
+    movlhps     rC8 , rC9   /* rC9  = c09a    c09b    c08a    c09a */
+            movss   96(pC), rC1
+    unpckhps    rC13, rC12  /* rC12 = c12c    c13c    c12d    c13d */
+            movss   104(pC), rB0
+    movhlps     rC9 , rC10  /* rC10 = c08a    c09a    c10b    c11b */
+            unpcklps    rB0, rC1
+    addps       rC6, rC7    /* rC7 = c4abcd c5abcd c6bdac c7bdac */
+            mulps   BOF(%rsp), rA0
+    addps       rC0, rC10   /* rC10 = c08ab   c09ab   c10bd   c11bd */
+            mulps   BOF(%rsp), rC4
+    addps       rC2, rC12   /* rC12 = c12ac   c13ac   c12bd   c13bd */
+            mulps   BOF(%rsp), rC5
+    addps       rC10, rC11  /* rC11 = c08abcd c09abcd c10bdac c11bdac */
+            mulps   BOF(%rsp), rC1
 
 /* */
 
-	movhlps		rC12, rC13	/* rC13 = c12bd   c13bd   X       X */
-			addps	rA0, rC3
-	addps		rC13, rC12	/* rC12 = c12abcd c13abcd X       X */
-			addps	rC4, rC7
-			addps	rC5, rC11
-						prefB(320-176(pB,ldab))
-			addps	rC1, rC12
+    movhlps     rC12, rC13  /* rC13 = c12bd   c13bd   X       X */
+            addps   rA0, rC3
+    addps       rC13, rC12  /* rC12 = c12abcd c13abcd X       X */
+            addps   rC4, rC7
+            addps   rC5, rC11
+                        prefB(320-176(pB,ldab))
+            addps   rC1, rC12
    #endif
 
 #else
-	movlhps		rC4, rC5	/* rC5 = c5a    c5b    c4a    c5a */
-	unpcklps	rC9 , rC8 	/* rC8  = c08a    c09a    c08b    c09b */
-	movhlps		rC1, rC11	/* rC11 = c08d    c09d    c10c    c11c */
-	movlhps		rC10, rC1	/* rC1 = c08c    c09c    c10a    c11a */
-	movhlps		rC5, rC6	/* rC6 = c4a    c5a    c6b    c7b */
-	movhlps		rC8 , rC0	/* rC0 = c08b    c09b    c10d    c11d */
-	unpcklps	rC13, rC2	/* rC2  = c12a    c13a    c12b    c13b */
-	addps		rC1, rC11	/* rC11 = c08cd   c09cd   c10ac   c11ac */
-	addps		rB0, rC6	/* rC6 = c4ab   c5ab   c6bd   c7bd */
-	movlhps		rC8 , rC9 	/* rC9  = c09a    c09b    c08a    c09a */
-	unpckhps	rC13, rC12	/* rC12 = c12c    c13c    c12d    c13d */
-	movhlps		rC9 , rC10	/* rC10 = c08a    c09a    c10b    c11b */
-	addps		rC6, rC7	/* rC7 = c4abcd c5abcd c6bdac c7bdac */
-	addps		rC0, rC10	/* rC10 = c08ab   c09ab   c10bd   c11bd */
-	addps		rC2, rC12	/* rC12 = c12ac   c13ac   c12bd   c13bd */
-	addps		rC10, rC11	/* rC11 = c08abcd c09abcd c10bdac c11bdac */
+    movlhps     rC4, rC5    /* rC5 = c5a    c5b    c4a    c5a */
+    unpcklps    rC9 , rC8   /* rC8  = c08a    c09a    c08b    c09b */
+    movhlps     rC1, rC11   /* rC11 = c08d    c09d    c10c    c11c */
+    movlhps     rC10, rC1   /* rC1 = c08c    c09c    c10a    c11a */
+    movhlps     rC5, rC6    /* rC6 = c4a    c5a    c6b    c7b */
+    movhlps     rC8 , rC0   /* rC0 = c08b    c09b    c10d    c11d */
+    unpcklps    rC13, rC2   /* rC2  = c12a    c13a    c12b    c13b */
+    addps       rC1, rC11   /* rC11 = c08cd   c09cd   c10ac   c11ac */
+    addps       rB0, rC6    /* rC6 = c4ab   c5ab   c6bd   c7bd */
+    movlhps     rC8 , rC9   /* rC9  = c09a    c09b    c08a    c09a */
+    unpckhps    rC13, rC12  /* rC12 = c12c    c13c    c12d    c13d */
+    movhlps     rC9 , rC10  /* rC10 = c08a    c09a    c10b    c11b */
+    addps       rC6, rC7    /* rC7 = c4abcd c5abcd c6bdac c7bdac */
+    addps       rC0, rC10   /* rC10 = c08ab   c09ab   c10bd   c11bd */
+    addps       rC2, rC12   /* rC12 = c12ac   c13ac   c12bd   c13bd */
+    addps       rC10, rC11  /* rC11 = c08abcd c09abcd c10bdac c11bdac */
 
 /* */
 
-	movhlps		rC12, rC13	/* rC13 = c12bd   c13bd   X       X */
-						prefB(320-176(pB,ldab))
-	addps		rC13, rC12	/* rC12 = c12abcd c13abcd X       X */
+    movhlps     rC12, rC13  /* rC13 = c12bd   c13bd   X       X */
+                        prefB(320-176(pB,ldab))
+    addps       rC13, rC12  /* rC12 = c12abcd c13abcd X       X */
 
 #endif
 /*
  *      Write results back to C;  pC += 14;
  */
 #ifdef SREAL
-	movups	rC3, (pC)
-	movups	rC7, 16(pC)
-	movups	rC11, 32(pC)
-	movlps	rC12, 48(pC)
-/*	addq	$56, pC */
+    movups  rC3, (pC)
+    movups  rC7, 16(pC)
+    movups  rC11, 32(pC)
+    movlps  rC12, 48(pC)
+/*  addq    $56, pC */
 #else
-	movss	rC3, (pC)
-	movss	rC7, 32(pC)
-	movhlps	rC3, rC0
-	movhlps	rC7, rC6
-	movss	rC0, 16(pC)
-	movss	rC6, 48(pC)
-	shufps	$0x55, rC3, rC3
-	shufps	$0x55, rC7, rC7
-	movss	rC3, 8(pC)
-	movss	rC7, 40(pC)
-	shufps	$0x55, rC0, rC0
-	shufps	$0x55, rC6, rC6
-	movss	rC0, 24(pC)
-	movss	rC6, 56(pC)
-
-	movss	rC11, 64(pC)
-	movhlps	rC11, rC2
-	movss	rC12, 96(pC)
-	movss	rC2, 80(pC)
-	shufps	$0x55, rC11, rC11
-	shufps	$0x55, rC12, rC12
-	movss	rC11, 72(pC)
-	shufps	$0x55, rC2, rC2
-	movss	rC12, 104(pC)
-	movss	rC2, 88(pC)
+    movss   rC3, (pC)
+    movss   rC7, 32(pC)
+    movhlps rC3, rC0
+    movhlps rC7, rC6
+    movss   rC0, 16(pC)
+    movss   rC6, 48(pC)
+    shufps  $0x55, rC3, rC3
+    shufps  $0x55, rC7, rC7
+    movss   rC3, 8(pC)
+    movss   rC7, 40(pC)
+    shufps  $0x55, rC0, rC0
+    shufps  $0x55, rC6, rC6
+    movss   rC0, 24(pC)
+    movss   rC6, 56(pC)
+
+    movss   rC11, 64(pC)
+    movhlps rC11, rC2
+    movss   rC12, 96(pC)
+    movss   rC2, 80(pC)
+    shufps  $0x55, rC11, rC11
+    shufps  $0x55, rC12, rC12
+    movss   rC11, 72(pC)
+    shufps  $0x55, rC2, rC2
+    movss   rC12, 104(pC)
+    movss   rC2, 88(pC)
 
-/*	addq	$112, pC */
+/*  addq    $112, pC */
 #endif
 /*
  *      Write results back to C
@@ -2660,55 +2667,55 @@ MLAST:
 /*
  *      while (pA != stM);
  */
-/*	subq	$1, stM */
-/*	jne	UMLOOP */
+/*  subq    $1, stM */
+/*  jne UMLOOP */
 /*
  *      pC += 14;  pA += 14*NB; pB -= NB;
  */
-/*	subq	$MBKBso-NB14so+176, pA5 */
-/*	subq	$MBKBso-NB14so+176, pA10 */
-	subq	incAm, pA5
-	subq	incAm, pA10
-	addq	$NBso-176, pB0
+/*  subq    $MBKBso-NB14so+176, pA5 */
+/*  subq    $MBKBso-NB14so+176, pA10 */
+    subq    incAm, pA5
+    subq    incAm, pA10
+    addq    $NBso-176, pB0
 /*
  *      while (pA != stM);
  */
-/*	subq	$1, stM */
-/*	jne	UMLOOP */
+/*  subq    $1, stM */
+/*  jne UMLOOP */
 /*
  *      pC += incCn;  pA -= NBNB;  pB += NB;
  */
-	addq	incCn, pC
+    addq    incCn, pC
 /*
  *      while (pB != stN);
  */
-	sub	$1, stN
-	jne	UNLOOP
+    sub $1, stN
+    jne UNLOOP
 
 /*
  *      Restore callee-saved iregs
  */
 DONE:
-	movq	-8(%rsp), %rbp
-	movq	-16(%rsp), %rbx
+    movq    -8(%rsp), %rbp
+    movq    -16(%rsp), %rbx
 #if MB == 0
-	movq	-32(%rsp), %r12
-	movq	-40(%rsp), %r13
+    movq    -32(%rsp), %r12
+    movq    -40(%rsp), %r13
 #endif
-	ret
+    ret
 #if MB == 0
 MB_LT84:
-	cmp	$70, stM
-	jne	MB_LT70
-/*	movq	$70/14, stM */
-	movq	$5, stM
-	jmp	MBFOUND
+    cmp $70, stM
+    jne MB_LT70
+/*  movq    $70/14, stM */
+    movq    $5, stM
+    jmp MBFOUND
 MB_LT70:
-	cmp	$56, stM
-	jne	MB_LT56
-/*	movq	$56/14, stM */
-	movq	$4, stM
-	jmp	MBFOUND
+    cmp $56, stM
+    jne MB_LT56
+/*  movq    $56/14, stM */
+    movq    $4, stM
+    jmp MBFOUND
 MB_LT56:
         cmp     $42, stM
         jne     MB_LT42
# scalsrch.c (GenMainRout): wrap the fprintf calls that emit the
# "if ( SCALAR_IS_ZERO(alpha) ) { ... return; }" branch in one C block comment,
# so the generated routine no longer contains that alpha == 0 early return.
# NOTE(review): original lines 754-755 sit between the two hunks and are not
# shown here; they end up inside the comment too - confirm they contain no
# comment terminator, and confirm the reason for disabling this branch.
diff -rupN ATLAS/tune/blas/level1/scalsrch.c atlas-3.8.3/tune/blas/level1/scalsrch.c
--- ATLAS/tune/blas/level1/scalsrch.c	2009-02-18 19:48:25.000000000 +0100
+++ atlas-3.8.3/tune/blas/level1/scalsrch.c	2009-11-12 13:45:48.141174024 +0100
@@ -747,7 +747,7 @@ void GenMainRout(char pre, int n, int *i
 /*
  * Handle all special alpha cases
  */
-   fprintf(fpout, "%sif ( SCALAR_IS_ZERO(alpha) )\n", spc);
+   /* fprintf(fpout, "%sif ( SCALAR_IS_ZERO(alpha) )\n", spc);
    fprintf(fpout, "%s{\n", spc);
    if (pre == 'c' || pre == 'z')
    {
@@ -756,7 +756,7 @@ void GenMainRout(char pre, int n, int *i
    }
    else fprintf(fpout, "%s   Mjoin(PATL,set)(N, ATL_rzero, X, incx);\n", spc);
    fprintf(fpout, "%s   return;\n", spc);
-   fprintf(fpout, "%s}\n", spc);
+   fprintf(fpout, "%s}\n", spc); */
    GenAlphCase(pre, spc, fpout, 1, n, ix, iy, ia, ib);
    GenAlphCase(pre, spc, fpout, -1, n, ix, iy, ia, ib);
    if (pre == 'c' || pre == 'z')