3D Hilbert curves in O(log(n)) optimized

Finally I've managed to implement an optimized version of the 3D Hilbert Index -> XYZ transform in logarithmic time. The crucial element was finding an efficient bitwise representation of the alternating group A4. In the end I settled on a fairly direct representation as a subgroup of S4, describing each permutation as a 4-tuple (a,b,c,d). Multiplication in this form (performing 32 multiplications at once, bitwise SIMD style) then becomes a simple bit selection:

#include <cstdint>

struct S4x32
{
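  // 32 permutations, one per lane: bit i of every field belongs to lane i.
  // The tuple element a is the image of 0 under the permutation, stored as
  // a 2-bit value with its high bit in a1 and its low bit in a0; likewise
  // b, c and d for the images of 1, 2 and 3.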
  uint32_t a1;
  uint32_t a0;

  uint32_t b1;
  uint32_t b0;

  uint32_t c1;
  uint32_t c0;

  uint32_t d1;
  uint32_t d0;
};


S4x32 mul(S4x32 x, S4x32 y)
{
  S4x32 res;

  // If x.a == 0, let res.a = y.a; else if x.a == 1, let res.a = y.b; else if x.a == 2, let res.a = y.c; else res.a = y.d. Same scheme for b, c and d.
  res.a0 = (y.a0 & ~x.a0 & ~x.a1) | (y.b0 & x.a0 & ~x.a1) | (y.c0 & ~x.a0 & x.a1) | (y.d0 & x.a0 & x.a1);
  res.a1 = (y.a1 & ~x.a0 & ~x.a1) | (y.b1 & x.a0 & ~x.a1) | (y.c1 & ~x.a0 & x.a1) | (y.d1 & x.a0 & x.a1);
  res.b0 = (y.a0 & ~x.b0 & ~x.b1) | (y.b0 & x.b0 & ~x.b1) | (y.c0 & ~x.b0 & x.b1) | (y.d0 & x.b0 & x.b1);
  res.b1 = (y.a1 & ~x.b0 & ~x.b1) | (y.b1 & x.b0 & ~x.b1) | (y.c1 & ~x.b0 & x.b1) | (y.d1 & x.b0 & x.b1);
  res.c0 = (y.a0 & ~x.c0 & ~x.c1) | (y.b0 & x.c0 & ~x.c1) | (y.c0 & ~x.c0 & x.c1) | (y.d0 & x.c0 & x.c1);
  res.c1 = (y.a1 & ~x.c0 & ~x.c1) | (y.b1 & x.c0 & ~x.c1) | (y.c1 & ~x.c0 & x.c1) | (y.d1 & x.c0 & x.c1);
  res.d0 = (y.a0 & ~x.d0 & ~x.d1) | (y.b0 & x.d0 & ~x.d1) | (y.c0 & ~x.d0 & x.d1) | (y.d0 & x.d0 & x.d1);
  res.d1 = (y.a1 & ~x.d0 & ~x.d1) | (y.b1 & x.d0 & ~x.d1) | (y.c1 & ~x.d0 & x.d1) | (y.d1 & x.d0 & x.d1);

  return res;
}
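
To make the lane encoding concrete, here's a small usage sketch; setLane is a hypothetical helper of my own, not part of the actual implementation. Note the composition convention that follows from the bit selection: mul(x, y) applies x first, then y.

#include <cassert>

// Hypothetical helper: stores one permutation (p0,p1,p2,p3) of {0,1,2,3}
// in lane i, splitting each 2-bit element across the two bit planes
void setLane(S4x32& s, int i, int p0, int p1, int p2, int p3)
{
  s.a1 |= uint32_t(p0 >> 1) << i;  s.a0 |= uint32_t(p0 & 1) << i;
  s.b1 |= uint32_t(p1 >> 1) << i;  s.b0 |= uint32_t(p1 & 1) << i;
  s.c1 |= uint32_t(p2 >> 1) << i;  s.c0 |= uint32_t(p2 & 1) << i;
  s.d1 |= uint32_t(p3 >> 1) << i;  s.d0 |= uint32_t(p3 & 1) << i;
}

int main()
{
  S4x32 x = {}, y = {};
  setLane(x, 0, 1, 0, 3, 2); // lane 0 of x: the permutation (1,0,3,2)
  setLane(y, 0, 2, 3, 0, 1); // lane 0 of y: the permutation (2,3,0,1)

  S4x32 r = mul(x, y); // lane 0: maps i to y[x[i]], giving (3,2,1,0)

  assert((r.a1 & 1) == 1 && (r.a0 & 1) == 1); // r.a == 3 in lane 0
  assert((r.b1 & 1) == 1 && (r.b0 & 1) == 0); // r.b == 2 in lane 0
  return 0;
}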

This is a lot less code than the autogenerated transform code of the original proof of concept implementation. Now there are some further symmetries we can immediately exploit. Since a permutation tuple contains each of the values 0 to 3 exactly once, and those four values XOR to zero, the last element is uniquely determined by the remaining ones via d = a ^ b ^ c, so we can get rid of the two d components.

Since we're in the subgroup A4, the third element also happens to be uniquely determined, as we only allow even permutations. After some logic optimizations, we arrive at a rather compact version:

struct A4x32
{
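  // Same lane layout as S4x32, but only a and b are stored: d is always
  // a ^ b ^ c, and within A4 c itself is determined by a and b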
  uint32_t a1;
  uint32_t a0;

  uint32_t b1;
  uint32_t b0;
};

A4x32 mul(A4x32 x, A4x32 y)
{
  A4x32 res;

  uint32_t yc1 = y.a0 ^ y.b0 ^ y.b1;
  uint32_t ys = y.a1 ^ y.b1;
  uint32_t yc0 = ~((ys & y.a0) ^ (~ys & y.b0));

  res.a0 = (y.a0 & ~(x.a0 ^ x.a1)) ^ (y.b0 & x.a0) ^ (yc0 & x.a1);
  res.a1 = (y.a1 & ~(x.a0 ^ x.a1)) ^ (y.b1 & x.a0) ^ (yc1 & x.a1);
  res.b0 = (y.a0 & ~(x.b0 ^ x.b1)) ^ (y.b0 & x.b0) ^ (yc0 & x.b1);
  res.b1 = (y.a1 & ~(x.b0 ^ x.b1)) ^ (y.b1 & x.b0) ^ (yc1 & x.b1);

  return res;
}
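
The recovery of the third element (yc1/yc0 above) looks like magic, so as a sanity check, here's a small brute-force sketch of my own that enumerates all 12 even permutations and confirms that c really does follow from a and b alone:

#include <algorithm>
#include <cassert>
#include <cstdint>

int main()
{
  int p[4] = { 0, 1, 2, 3 };
  do
  {
    // Keep only even permutations (the 12 elements of A4)
    int inversions = 0;
    for (int i = 0; i < 4; ++i)
      for (int j = i + 1; j < 4; ++j)
        inversions += p[i] > p[j];
    if (inversions % 2)
      continue;

    uint32_t a1 = p[0] >> 1, a0 = p[0] & 1;
    uint32_t b1 = p[1] >> 1, b0 = p[1] & 1;

    // Single-lane version of the recovery formulas from mul()
    uint32_t c1 = a0 ^ b0 ^ b1;
    uint32_t s = a1 ^ b1;
    uint32_t c0 = 1 ^ ((s & a0) ^ ((1 ^ s) & b0));

    assert(int(c1 * 2 + c0) == p[2]);
  } while (std::next_permutation(p, p + 4));
  return 0;
}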

With this key piece in place, implementing the remaining function is straightforward. After the prefix scan, we only need to bring the transforms back into a representation that lets us efficiently transform each octant in 3D. Luckily, it's relatively easy to convert from a permutation tuple to a 3D transformation matrix. Putting it all together, this is the result:

#include <immintrin.h> // for _pext_u64 (BMI2)

void hilbertTo3D_logarithmic(uint64_t hilbertIndex, uint32_t bits, uint32_t& x, uint32_t& y, uint32_t& z)
{
  // Deinterleave index bits - requires BMI2 pext; a portable fallback is sketched below
  uint32_t i0 = _pext_u64(hilbertIndex, 0x9249249249249249ull << 0);
  uint32_t i1 = _pext_u64(hilbertIndex, 0x9249249249249249ull << 1);
  uint32_t i2 = _pext_u64(hilbertIndex, 0x9249249249249249ull << 2);

  // Align to MSB
  i0 <<= 32 - bits;
  i1 <<= 32 - bits;
  i2 <<= 32 - bits;

  // Compute initial transform from index
  uint32_t j = ~(i1 ^ i2);
  uint32_t ta1 = (i2 & ~i1 & i0) | (j & ~i0);
  uint32_t ta0 = i0 | i1 | i2;
  uint32_t tb1 = (~i2 & i1 & ~i0) | (j & i0);
  uint32_t tb0 = i2 & i1 & i0;

  // Shift downwards, filling MSB with identity transform
  ta0 >>= 1;
  ta1 >>= 1;
  tb0 = (tb0 >> 1) | 0x80000000;
  tb1 >>= 1;

  // Prefix scan iteration
  auto fold = [&](uint32_t shift)
  {
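    // Combine the transform at every bit with the one `shift` positions
    // above it; after the doubling steps 1, 2, 4, 8, 16, each bit holds
    // the composition of all transforms at its own and higher positions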
    // Shift downwards
    uint32_t sa0 = ta0 >> shift;
    uint32_t sa1 = ta1 >> shift;
    uint32_t sb0 = (tb0 >> shift) | (~0u << (32 - shift));
    uint32_t sb1 = tb1 >> shift;

    // Recover third element of permutation which is uniquely determined in A4 by the first two elements
    uint32_t sc1 = sa0 ^ sb0 ^ sb1;
    uint32_t ss = sa1 ^ sb1;
    uint32_t sc0 = ~((ss & sa0) ^ (~ss & sb0));

    // Multiply permutations
    uint32_t ra0 = (sa0 & ~(ta0 ^ ta1)) ^ (sb0 & ta0) ^ (sc0 & ta1);
    uint32_t ra1 = (sa1 & ~(ta0 ^ ta1)) ^ (sb1 & ta0) ^ (sc1 & ta1);
    uint32_t rb0 = (sa0 & ~(tb0 ^ tb1)) ^ (sb0 & tb0) ^ (sc0 & tb1);
    uint32_t rb1 = (sa1 & ~(tb0 ^ tb1)) ^ (sb1 & tb0) ^ (sc1 & tb1);

    // Store current transform
    ta0 = ra0;
    ta1 = ra1;
    tb0 = rb0;
    tb1 = rb1;
  };

  // Apply prefix scan
  fold(1);
  fold(2);
  fold(4);
  fold(8);
  fold(16);

  // Recover untransformed octants: per 3-bit index digit this is the Gray
  // code digit ^ (digit >> 1), computed bitwise across all levels at once
  uint32_t o0 = i0 ^ i1;
  uint32_t o1 = i1 ^ i2;
  uint32_t o2 = i2;

  // Build transformation matrix columns from permutation
  uint32_t m0 = ~(ta1 ^ tb1);
  uint32_t m2 = ~(ta0 ^ tb0);
  uint32_t m1 = ~(m0 | m2);

  uint32_t ma = tb0 ^ m0;
  uint32_t mc = ta1 ^ m2;
  uint32_t mb = (~ta1 & ~ta0 & tb1) | (ta1 & ta0 & ~tb1) | (~ta0 & tb1 & tb0) | (ta0 & ~tb1 & ~tb0);

  // Apply transform to original octants
  x = (m0 & o0) ^ (m1 & o1) ^ (m2 & o2) ^ ma;
  y = (m2 & o0) ^ (m0 & o1) ^ (m1 & o2) ^ mb;
  z = (m1 & o0) ^ (m2 & o1) ^ (m0 & o2) ^ mc;

  // Undo MSB alignment
  x >>= 32 - bits;
  y >>= 32 - bits;
  z >>= 32 - bits;
}
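
A quick note on portability: the deinterleave step above needs the BMI2 pext instruction. Here's a simple drop-in replacement, a deliberately unoptimized sketch of my own (a faster shift-and-mask sequence is the usual alternative):

// Hypothetical portable stand-in for the _pext_u64 calls above: gathers
// every third bit of v, starting at bit 'start', into the low bits
uint32_t compactEvery3rd(uint64_t v, uint32_t start)
{
  uint32_t r = 0;
  for (uint32_t i = start; i < 64; i += 3)
    r |= uint32_t((v >> i) & 1) << ((i - start) / 3);
  return r;
}

With that, the three extractions become compactEvery3rd(hilbertIndex, 0) through compactEvery3rd(hilbertIndex, 2). And for completeness, calling the transform itself looks like:

uint32_t x, y, z;
hilbertTo3D_logarithmic(hilbertIndex, 21, x, y, z); // 63-bit index -> 3x21-bit coordinates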

The resulting code quite nicely demonstrates all of the symmetries inherent in the problem, so I consider it unlikely that the bit logic can be reduced much further. The instruction count per iteration is unfortunately still massive compared to the rather trivial O(n) version, so for small bit counts this method is slower in practice despite the lower asymptotic complexity. The cutoff point seems to be around transforming 63 -> 3x21 bits, where the logarithmic version starts being just barely faster on my machine. So it would mainly pay off when computing positions for massive Hilbert indices (>64 bits), which I consider a very rare use case.