With help from Matthias Grundmann I wrote my first piece of VFP code. Here it is:
void MatrixMultiplyF(
MATRIXf &mOut,
const MATRIXf &mA,
const MATRIXf &mB)
{
#if 0
MATRIXf mRet;
/* Perform calculation on a dummy matrix (mRet) */
mRet.f[ 0] = mA.f[ 0]*mB.f[ 0] + mA.f[ 1]*mB.f[ 4] + mA.f[ 2]*mB.f[ 8] + mA.f[ 3]*mB.f[12];
mRet.f[ 1] = mA.f[ 0]*mB.f[ 1] + mA.f[ 1]*mB.f[ 5] + mA.f[ 2]*mB.f[ 9] + mA.f[ 3]*mB.f[13];
mRet.f[ 2] = mA.f[ 0]*mB.f[ 2] + mA.f[ 1]*mB.f[ 6] + mA.f[ 2]*mB.f[10] + mA.f[ 3]*mB.f[14];
mRet.f[ 3] = mA.f[ 0]*mB.f[ 3] + mA.f[ 1]*mB.f[ 7] + mA.f[ 2]*mB.f[11] + mA.f[ 3]*mB.f[15];
mRet.f[ 4] = mA.f[ 4]*mB.f[ 0] + mA.f[ 5]*mB.f[ 4] + mA.f[ 6]*mB.f[ 8] + mA.f[ 7]*mB.f[12];
mRet.f[ 5] = mA.f[ 4]*mB.f[ 1] + mA.f[ 5]*mB.f[ 5] + mA.f[ 6]*mB.f[ 9] + mA.f[ 7]*mB.f[13];
mRet.f[ 6] = mA.f[ 4]*mB.f[ 2] + mA.f[ 5]*mB.f[ 6] + mA.f[ 6]*mB.f[10] + mA.f[ 7]*mB.f[14];
mRet.f[ 7] = mA.f[ 4]*mB.f[ 3] + mA.f[ 5]*mB.f[ 7] + mA.f[ 6]*mB.f[11] + mA.f[ 7]*mB.f[15];
mRet.f[ 8] = mA.f[ 8]*mB.f[ 0] + mA.f[ 9]*mB.f[ 4] + mA.f[10]*mB.f[ 8] + mA.f[11]*mB.f[12];
mRet.f[ 9] = mA.f[ 8]*mB.f[ 1] + mA.f[ 9]*mB.f[ 5] + mA.f[10]*mB.f[ 9] + mA.f[11]*mB.f[13];
mRet.f[10] = mA.f[ 8]*mB.f[ 2] + mA.f[ 9]*mB.f[ 6] + mA.f[10]*mB.f[10] + mA.f[11]*mB.f[14];
mRet.f[11] = mA.f[ 8]*mB.f[ 3] + mA.f[ 9]*mB.f[ 7] + mA.f[10]*mB.f[11] + mA.f[11]*mB.f[15];
mRet.f[12] = mA.f[12]*mB.f[ 0] + mA.f[13]*mB.f[ 4] + mA.f[14]*mB.f[ 8] + mA.f[15]*mB.f[12];
mRet.f[13] = mA.f[12]*mB.f[ 1] + mA.f[13]*mB.f[ 5] + mA.f[14]*mB.f[ 9] + mA.f[15]*mB.f[13];
mRet.f[14] = mA.f[12]*mB.f[ 2] + mA.f[13]*mB.f[ 6] + mA.f[14]*mB.f[10] + mA.f[15]*mB.f[14];
mRet.f[15] = mA.f[12]*mB.f[ 3] + mA.f[13]*mB.f[ 7] + mA.f[14]*mB.f[11] + mA.f[15]*mB.f[15];
/* Copy result in pResultMatrix */
mOut = mRet;
#else
#if (TARGET_CPU_ARM)
const float* src_ptr1 = &mA.f[0];
const float* src_ptr2 = &mB.f[0];
float* dst_ptr = &mOut.f[0];
asm volatile(
// switch on ARM mode
// involves uncoditional jump and mode switch (opcode bx)
// the lowest bit in the address signals whether are (bit cleared)
// or tumb should be selected (bit set)
".align 4 \n\t"
"mov r0, pc \n\t"
"bx r0 \n\t"
".arm \n\t"
// set vector length to 4
// example fadds s8, s8, s16 means that the content s8 - s11
// is added to s16 - s19 and stored in s8 - s11
"fmrx r0, fpscr \n\t" // loads fpscr status reg to r4
"bic r0, r0, #0x00370000 \n\t" // bit clear stride and length
"orr r0, r0, #0x00030000 \n\t" // set length to 4 (11)
"fmxr fpscr, r0 \n\t" // upload r4 to fpscr
// Note: this stalls the FPU
// result[0][1][2][3] = mA.f[0][0][0][0] * mB.f[0][1][2][3]
// result[0][1][2][3] = result + mA.f[1][1][1][1] * mB.f[4][5][6][7]
// result[0][1][2][3] = result + mA.f[2][2][2][2] * mB.f[8][9][10][11]
// result[0][1][2][3] = result + mA.f[3][3][3][3] * mB.f[12][13][14][15]
// s0 - s31
// if Fd == s0 - s7 -> treated as scalar all the other treated like vector
// load the whole matrix into memory - transposed -> second operand first
"fldmias %2, {s8-s23} \n\t"
// load first column to scalar bank
"fldmias %1!, {s0 - s3} \n\t"
// first column times matrix
"fmuls s24, s8, s0 \n\t"
"fmacs s24, s12, s1 \n\t"
"fmacs s24, s16, s2 \n\t"
"fmacs s24, s20, s3 \n\t"
// save first column
"fstmias %0!, {s24-s27} \n\t"
// load second column to scalar bank
"fldmias %1!, {s4-s7} \n\t"
// second column times matrix
"fmuls s28, s8, s4 \n\t"
"fmacs s28, s12, s5 \n\t"
"fmacs s28, s16, s6 \n\t"
"fmacs s28, s20, s7 \n\t"
// save second column
"fstmias %0!, {s28-s31) \n\t"
// load third column to scalar bank
"fldmias %1!, {s0-s3} \n\t"
// third column times matrix
"fmuls s24, s8, s0 \n\t"
"fmacs s24, s12, s1 \n\t"
"fmacs s24, s16, s2 \n\t"
"fmacs s24, s20, s3 \n\t"
// save third column
"fstmias %0!, {s24-s27} \n\t"
// load fourth column to scalar bank
"fldmias %1!, {s4-s7} \n\t"
// fourth column times matrix
"fmuls s28, s8, s4 \n\t"
"fmacs s28, s12, s5 \n\t"
"fmacs s28, s16, s6 \n\t"
"fmacs s28, s20, s7 \n\t"
// save fourth column
"fstmias %0!, {s28-s31} \n\t"
// reset vector length to 1
"fmrx r0, fpscr \n\t" // loads fpscr status reg to r4
"bic r0, r0, #0x00370000 \n\t" // bit clear stride and length
"fmxr fpscr, r0 \n\t" // upload r4 to fpscr
// switch to tumb mode
// lower bit of destination is set to 1
"add r0, pc, #1 \n\t"
"bx r0 \n\t"
".thumb \n\t"
// binds variables to registers
: "=r" (dst_ptr), "=r" (src_ptr1), "=r" (src_ptr2)
: "0" (dst_ptr), "1" (src_ptr1), "2" (src_ptr2)
: "r0"
);
#endif
#endif
}