blog.: January 2009

Friday, January 16, 2009

-funroll-loops

In general, C is a lousy language for expressing this kind of parallelism on the SPU. The original loop that 'inspired' this nonsense looks something like :

for (j = 0; j < num_indexes; j += 3) {
   /* Three consecutive indexes are consumed per iteration.  Each index is
    * scaled by vertex_size and added to vertices, and the three resulting
    * pointers are handed to func() as separate arguments. */
   const float *v0 = (const float *) (vertices + indexes[j] * vertex_size);
   const float *v1 = (const float *) (vertices + indexes[j + 1] * vertex_size);
   const float *v2 = (const float *) (vertices + indexes[j + 2] * vertex_size);

   func(v0, v1, v2);
}

which is quite clear and straightforward to read, but with hidden complexity - the lack of quadword alignment, the way it is expressed as three separate multiply-adds, and the separation into three (unpacked) variables which are repacked inside func().

Unrolled 2

Longer than the other one, but with better odd/even balance and only one shuffle constant. Probably faster.

// Handle up to eight index triples per iteration: 24 two-byte indexes, i.e.
// three qwords of input.  Each triple ends up as one qword whose first three
// words are index * vertex_size + vertices, and that qword is passed to func().
// NOTE(review): the direct qword loads assume &indexes[j] is 16-byte aligned
// and that all three qwords are readable even on a short final pass - confirm.
for (j = 0; j < num_indexes; j += 24) {
   qword* lower_qword = (qword*)&indexes[j];
   qword indices0 = lower_qword[0];   // indexes j+0  .. j+7
   qword indices1 = lower_qword[1];   // indexes j+8  .. j+15
   qword indices2 = lower_qword[2];   // indexes j+16 .. j+23

   // Triples that live entirely inside one input qword: a byte shift left
   // (two bytes per index) brings the triple to the front of the qword.
   qword vs0 = indices0;                    // indexes 0,1,2
   qword vs1 = si_shlqbyi(indices0, 6);     // indexes 3,4,5
   qword vs3 = si_shlqbyi(indices1, 2);     // indexes 9,10,11
   qword vs4 = si_shlqbyi(indices1, 8);     // indexes 12,13,14
   qword vs6 = si_shlqbyi(indices2, 4);     // indexes 18,19,20
   qword vs7 = si_shlqbyi(indices2, 10);    // indexes 21,22,23

   // Triples that straddle two input qwords.  The head comes from a left
   // shift of the first qword, the tail from a right shift of the next one
   // (si_rotqmbyi with count|16 shifts right by 16 - count bytes), and
   // si_selb merges the two under a halfword mask.
   // NOTE(review): si_fsmh is handed a plain integer here; confirm the
   // toolchain accepts that, or wrap the constant with si_from_uint().

   // Indexes 6,7 from indices0 and index 8 from indices1; 0x20 selects
   // halfword 2 from tmp2b.
   qword tmp2a = si_shlqbyi(indices0, 12);
   qword tmp2b = si_rotqmbyi(indices1, 12|16);
   qword vs2 = si_selb(tmp2a, tmp2b, si_fsmh(0x20));

   // Index 15 from indices1 and indexes 16,17 from indices2; 0x60 selects
   // halfwords 1 and 2 from tmp5b.
   qword tmp5a = si_shlqbyi(indices1, 14);
   qword tmp5b = si_rotqmbyi(indices2, 14|16);
   qword vs5 = si_selb(tmp5a, tmp5b, si_fsmh(0x60));

   // Expand the three leading halfwords of each qword to uint positioning.
   // The same shuffle constant is used for every triple.
   vs0 = si_shufb(vs0, vs0, SHUFB8(0,A,0,B,0,C,0,0));
   vs1 = si_shufb(vs1, vs1, SHUFB8(0,A,0,B,0,C,0,0));
   vs2 = si_shufb(vs2, vs2, SHUFB8(0,A,0,B,0,C,0,0));
   vs3 = si_shufb(vs3, vs3, SHUFB8(0,A,0,B,0,C,0,0));
   vs4 = si_shufb(vs4, vs4, SHUFB8(0,A,0,B,0,C,0,0));
   vs5 = si_shufb(vs5, vs5, SHUFB8(0,A,0,B,0,C,0,0));
   vs6 = si_shufb(vs6, vs6, SHUFB8(0,A,0,B,0,C,0,0));
   vs7 = si_shufb(vs7, vs7, SHUFB8(0,A,0,B,0,C,0,0));

   // index * vertex_size + vertices for every word.
   // NOTE(review): vertex_sizes and verticess are presumably vertex_size and
   // vertices splatted across a qword outside this loop - confirm.
   vs0 = si_mpya(vs0, vertex_sizes, verticess);
   vs1 = si_mpya(vs1, vertex_sizes, verticess);
   vs2 = si_mpya(vs2, vertex_sizes, verticess);
   vs3 = si_mpya(vs3, vertex_sizes, verticess);
   vs4 = si_mpya(vs4, vertex_sizes, verticess);
   vs5 = si_mpya(vs5, vertex_sizes, verticess);
   vs6 = si_mpya(vs6, vertex_sizes, verticess);
   vs7 = si_mpya(vs7, vertex_sizes, verticess);

   // Dispatch on the number of indexes left.  Every case falls through on
   // purpose, so entering at case 3*k calls func() for triples k-1 down to 0.
   // default covers 24 or more remaining, and any count not listed below.
   switch(num_indexes - j) {
      default: func(vs7);   /* fallthrough */
      case 21: func(vs6);   /* fallthrough */
      case 18: func(vs5);   /* fallthrough */
      case 15: func(vs4);   /* fallthrough */
      case 12: func(vs3);   /* fallthrough */
      case 9:  func(vs2);   /* fallthrough */
      case 6:  func(vs1);   /* fallthrough */
      case 3:  func(vs0);
   }
}

Unrolled 1

Shortest form I've found so far. Not a good odd/even balance on the pipeline usage though.

// Handle up to eight index triples per iteration: 24 two-byte indexes, i.e.
// three qwords of input.  Unlike the shift-then-multiply form, every index is
// multiply-added first and the finished addresses are shuffled into triples.
// NOTE(review): the direct qword loads assume &indexes[j] is 16-byte aligned
// and that all three qwords are readable even on a short final pass - confirm.
for (j = 0; j < num_indexes; j += 24) {
   qword* lower_qword = (qword*)&indexes[j];
   qword i0 = lower_qword[0];
   qword i1 = lower_qword[1];
   qword i2 = lower_qword[2];

   // si_mpya only multiplies the low halfword of each word, so a second copy
   // shifted right by two bytes (one index) is needed to reach the indexes
   // that sit in the high halfwords.
   qword i0r = si_rotqmbyi(i0, -2);
   qword i1r = si_rotqmbyi(i1, -2);
   qword i2r = si_rotqmbyi(i2, -2);

   // index * vertex_size + vertices.  vN holds the results for the
   // odd-numbered indexes of input qword N, vNr those for the even-numbered.
   // NOTE(review): vertex_sizes and verticess are presumably vertex_size and
   // vertices splatted across a qword outside this loop - confirm.
   qword v0 = si_mpya(i0, vertex_sizes, verticess);
   qword v1 = si_mpya(i1, vertex_sizes, verticess);
   qword v2 = si_mpya(i2, vertex_sizes, verticess);
   qword v0r = si_mpya(i0r, vertex_sizes, verticess);
   qword v1r = si_mpya(i1r, vertex_sizes, verticess);
   qword v2r = si_mpya(i2r, vertex_sizes, verticess);

   // Interleave the odd and even results back into index order, one qword
   // per triple.  Little constant reuse here: almost every shuffle needs its
   // own pattern.  (This comment must not end in a backslash - that would
   // splice the next line into the comment and remove the vs7 declaration.)
   // NOTE(review): presumably uppercase letters in SHUFB4 pick words from the
   // first operand and lowercase from the second - confirm against the macro.
   qword vs7 = si_shufb(v2r, v2, SHUFB4(c,D,d,0));
   qword vs6 = si_shufb(v2r, v2, SHUFB4(B,b,C,0));
   // vs5 straddles input qwords 1 and 2, so it takes two shuffles.
   qword vs5 = si_shufb(v1, v2r, SHUFB4(D,a,0,0));
         vs5 = si_shufb(vs5, v2, SHUFB4(A,B,a,0));
   qword vs4 = si_shufb(v1, v1r, SHUFB4(c,C,d,0));
   qword vs3 = si_shufb(v1, v1r, SHUFB4(A,b,B,0));
   // vs2 straddles input qwords 0 and 1, so it takes two shuffles.
   qword vs2 = si_shufb(v0r, v0, SHUFB4(D,d,0,0));
         vs2 = si_shufb(vs2, v1r,SHUFB4(A,B,a,0));
   qword vs1 = si_shufb(v0r, v0, SHUFB4(b,C,c,0));
   qword vs0 = si_shufb(v0r, v0, SHUFB4(A,a,B,0));

   // Dispatch on the number of indexes left.  Every case falls through on
   // purpose, so entering at case 3*k calls func() for triples k-1 down to 0.
   // default covers 24 or more remaining, and any count not listed below.
   switch(num_indexes - j) {
      default: func(vs7);   /* fallthrough */
      case 21: func(vs6);   /* fallthrough */
      case 18: func(vs5);   /* fallthrough */
      case 15: func(vs4);   /* fallthrough */
      case 12: func(vs3);   /* fallthrough */
      case 9:  func(vs2);   /* fallthrough */
      case 6:  func(vs1);   /* fallthrough */
      case 3:  func(vs0);
   }
}

Thursday, January 15, 2009

SPU unaligned loads

Extract three adjacent ushorts from an arbitrary array location.

(Would do a lot better unrolled, I think)

// For each group of three adjacent ushort indexes, starting at an arbitrary
// (possibly unaligned) address, build one qword whose first three words are
// index * vertex_size + vertices and pass it to func().
for (j = 0; j < num_indexes; j += 3) {

   // Address of indexes[j] held in a qword. It may be unaligned; the
   // quadword loads below ignore the low four address bits.
   qword lower_qword = si_from_ptr(&indexes[j]);

   // Load qword containing indexes[j] and successor
   // NOTE(review): the second load reads the following 16 bytes even when
   // the triple fits entirely in the first qword - confirm that is readable.
   qword first = si_lqd(lower_qword, 0);
   qword second = si_lqd(lower_qword, 16);

   // Calculate &indexes[j]&15 - offset of index from 16 byte alignment
   qword offset = si_andi(lower_qword, 15);

   // Generate a mask to select the appropriate parts of first and second
   // form byte select mask from (1<<offset)-1: the last `offset` bytes of
   // the result are taken from second, the rest from first.
   qword one = si_from_uint(1);
   qword mask = si_fsmb(si_sf(one, si_shl(one, offset)));

   // Rotate first and second parts to desired locations
   // This is the key interesting bit, but I'd like to 
   // think this could be improved upon...
   // first is shifted left by offset bytes so indexes[j] lands at byte 0;
   // second is shifted right by 16 - offset bytes (count | 16) so that its
   // leading bytes follow directly after what remains of first.
   first = si_shlqby(first, offset);
   second = si_rotqmby(second, si_ori(offset, 16));

   // Store indexes[j],[j+1],[j+2] in is.
   qword is = si_selb(first, second, mask);

   // Expand is to uint positioning
   is = si_shufb(is, is, SHUFB8(0,A,0,B,0,C,0,0));

   // index * vertex_size + vertices for every word. Both splats are
   // loop-invariant.
   qword vs = si_mpya(is, (qword)spu_splats(vertex_size),
                      (qword)spu_splats((unsigned)vertices));

   func(vs);

}

I dig the big picture.

Wednesday, January 14, 2009

20090114

Cubular - I wonder how hard it would be to make one...

25c3 - Hours of entertainment.

Perpetual calendar - Just the thing to go with my binary clock.

No great archive in the sky - Backup. (note to self: backup).

Geeks Bearing Gifts - Want.

Twitterville

blog.