I've taken a look at your code, and it's funny that you ran into some of the same issues I had too.
rainwarrior wrote: ↑Sun Aug 14, 2022 8:58 pm
That's 1x, 2x and 4x side by side. 1x is 130% CPU load per scanline, 2x is 90%, 4x is 70%.
I also need to bit shift a lot with the reciprocal table, which makes the thing kind of pointless, because per-component division (doing the division in hw 4 times) is actually faster since it doesn't require all the bit shifting. The only thing I can imagine that gives a really big performance boost there is a multiplication LUT that has the bit shifting pre-baked. It would also do away with the sign correction issue. But that would require a lot of ROM, of course.
You can avoid the bit shifts for the A and C multiplies, though: precision is good enough with A and C prescaled to the correct range. But with B and D it's not easy. I'd try saving one or two shifts per component by choosing the radix for cos(a)/sin(a) differently, for example (-128..127 instead of -256..256 should be decent enough and would also save half of your sine table size).
Apart from that, there's a number of small things you can do to improve performance.
you can move the sign checks out of your inner loop. basically what i said in my last post, but with your implementation, it is easier (you do not need to split the screen into regions for that because your numerators do not change). you can make your inner loop a macro and make a few different implementations for each combination of signs in your scale 0..3 values, then choose one via jump table.
if you do not want to do that for some reason (because of code size perhaps), you can still move your "lsr z:temp+4" line to the place where you have the no ops now
Code: Select all
; Read the value at $004216, divide it by 16, and negate the result when
; bit 0 of temp+4 is set. temp+4 is shifted first, before either load.
; NOTE(review): $004216 is presumably the SNES multiply-result register — confirm.
lsr z:temp+4        ; shift temp+4 right; its old bit 0 lands in carry
bcs :+              ; carry set -> jump to the negating path below
lda f:$004216       ; far-addressed read of $004216
lsr lsr lsr lsr     ; four right shifts (written on one line): value / 16
bra :++             ; skip over the negating path
:
lda f:$004216       ; same read and scaling as the non-negated path...
lsr lsr lsr lsr
eor #$ffff          ; ...then invert all 16 bits
ina                 ; and add 1, completing a two's-complement negation
:
also maybe
Code: Select all
lda temp + 4        ; load the value whose bit 0 is the sign flag (temp+4 itself is left unchanged)
and #1              ; keep only bit 0; the result is zero exactly when that bit is clear
Another thing, and I don't remember the exact reason (maybe a LoROM vs HiROM issue?): I don't think using far addressing is actually necessary. If you could use absolute addressing, or even move the zero page to somewhere where it can see the hardware registers, that could also save a few cycles.
If you made pv_zr a 24 bit variable and used 24+16 bit addition with carry (bcs + ina) for the interpolation, the bit shifts could be prebaked into pv_zr, maybe saving a few cycles over 4 times lsr. Or, a small saving can be gained by not reloading pv_zr for doing the interpolation: just do sbc pv_zr_inc once just before the loop, then you can do the adc + sta first, and then transfer to x.
another tiny saving, if you build the entire table in reverse (start with the last scanline and work towards the first) you can use y for the loop condition and don't need the additional line counter in temp + 2.
there's one other final thing I've thought of that can maybe work. It might sound ridiculous, but I've actually tried it out and it looks quite promising.
instead of 1/z LUT, you build 2 LUTs, each with ~1024 entries at least for decent accuracy, but more are better. The 2 tables you need are
- ln(x)
- e^x
This means if you precalculate ln(pv_scale) instead of pv_scale, and look up ln(z) in each scanline, you can find the matrix parameters basically by computing e^(ln(pv_scale) - ln(z)). Requiring only subtraction + table lookup per matrix component, but without giant multiplication LUT and also scaling nicely so that no thousands of bit shifts are required.
It doesn't work as well for me because I need to interpolate the numerators so I'd need 9 table lookups in total per scanline, but with your implementation, it could work much better because 4 of them can be precalculated.
demonstration:
Code: Select all
// Fixed-point scale factors used by the demonstration below.
// radix multiplies the values that are then clamped to the signed 16-bit
// range with clamps() (dx, dy, ax, ay, bx, by).
var radix = 128; // for signed 16 bit numerator
// radix2 multiplies az and bz, the terms that are summed into the divisor cz.
// NOTE(review): the code clamps these with clampu() (0..65535), not to 8 bits —
// confirm which width is intended.
var radix2 = 128; // for unsigned 8 bit denominator
/**
 * Round down to the nearest integer (thin wrapper over Math.floor).
 * @param {number} a
 * @returns {number}
 */
function floor(a) {
  return Math.floor(a);
}

/**
 * Cosine wrapper so the demo can call cos() without the Math prefix.
 * @param {number} a - Angle in radians.
 * @returns {number}
 */
function cos(a) {
  return Math.cos(a);
}

/**
 * Sine wrapper so the demo can call sin() without the Math prefix.
 * @param {number} a - Angle in radians.
 * @returns {number}
 */
function sin(a) {
  return Math.sin(a);
}

/**
 * Bidirectional 32-bit shift: a positive count shifts right (arithmetic),
 * zero or a negative count shifts left by the negated count.
 * @param {number} a - Value to shift.
 * @param {number} b - Shift count; sign selects the direction.
 * @returns {number}
 */
function shift(a, b) {
  if (b > 0) {
    return floor(a >> b);
  }
  return floor(a << -b);
}

/**
 * Drop the low 8 bits of a value: floor(a / 256).
 * @param {number} a
 * @returns {number}
 */
function highbyte(a) {
  const quotient = a / 256;
  return floor(quotient);
}

/**
 * Drop the low 4 bits of a value: floor(a / 16).
 * @param {number} a
 * @returns {number}
 */
function high10bit(a) {
  const quotient = a / 16;
  return floor(quotient);
}

/**
 * Shared step behind the clamp* helpers: pick lo when a is below it, hi when
 * a is above it, otherwise a itself, then round the pick down.
 * Plain comparisons are used (not Math.min/Math.max) so that NaN and -0
 * pass through exactly as they do in a ternary chain.
 * @param {number} a
 * @param {number} lo - Inclusive lower bound.
 * @param {number} hi - Inclusive upper bound.
 * @returns {number}
 */
function clampToRange(a, lo, hi) {
  let picked = a;
  if (a < lo) {
    picked = lo;
  } else if (a > hi) {
    picked = hi;
  }
  return floor(picked);
}

/**
 * Clamp to the unsigned 8-bit range 0..255 and round down.
 * @param {number} a
 * @returns {number}
 */
function clamp(a) {
  return clampToRange(a, 0, 255);
}

/**
 * Clamp to the signed 16-bit range -32768..32767 and round down.
 * @param {number} a
 * @returns {number}
 */
function clamps(a) {
  return clampToRange(a, -32768, 32767);
}

/**
 * Clamp to the unsigned 16-bit range 0..65535 and round down.
 * @param {number} a
 * @returns {number}
 */
function clampu(a) {
  return clampToRange(a, 0, 65535);
}

/**
 * Clamp to the 10-bit range 0..1023 and round down.
 * @param {number} a
 * @returns {number}
 */
function clamp2(a) {
  return clampToRange(a, 0, 1023);
}
/**
 * Simulated ln() table lookup for the numerator side of the division.
 * The input is reduced with high10bit(), multiplied by 64 before taking the
 * natural log, and the log is scaled by 64 and clamped with clamp2().
 * An input whose reduced value is 0 yields log(0) = -Infinity, which the
 * clamp turns into 0.
 * @param {number} a - Non-negative numerator.
 * @returns {number} Scaled logarithm as an integer table value.
 */
function lut_ln(a) {
  const reduced = high10bit(a);
  const scaledLog = Math.log(reduced * 64) * 64;
  return clamp2(scaledLog);
}
/**
 * Simulated ln() table lookup for the denominator side of the division.
 * The input is reduced with high10bit(), divided by 4 before taking the
 * natural log, and the log is scaled by 64 and clamped with clamp2().
 * Negative or -Infinity logs (reduced input below 4) clamp to 0.
 * @param {number} a - Non-negative denominator.
 * @returns {number} Scaled logarithm as an integer table value.
 */
function lut_ln2(a) {
  const reduced = high10bit(a);
  const scaledLog = Math.log(reduced / 4) * 64;
  return clamp2(scaledLog);
}
/**
 * Simulated e^x table lookup: undoes the *64 scaling applied by the ln
 * tables, raises e to that power, and clamps the result with clamp2().
 * @param {number} a - Difference of two scaled logarithms (may be negative).
 * @returns {number} Integer result after clamp2().
 */
function lut_epower(a) {
  const exponent = a / 64;
  const power = Math.E ** exponent;
  return clamp2(power);
}
/**
 * Approximate a / b for a non-negative a using the log/exp tables:
 * e^(ln(a) - ln(b)), with each step going through its simulated lookup.
 * @param {number} a - Non-negative numerator.
 * @param {number} b - Denominator.
 * @returns {number} Table-based quotient.
 */
function lndivide2(a, b) {
  const logNumerator = lut_ln(a);
  const logDenominator = lut_ln2(b);
  return lut_epower(logNumerator - logDenominator);
}
/**
 * Signed wrapper around lndivide2(): the table division only sees the
 * magnitude of a, and the sign is re-applied to the result afterwards.
 * @param {number} a - Numerator, any sign.
 * @param {number} b - Denominator.
 * @returns {number} Table-based quotient carrying the sign of a.
 */
function lndivide(a, b) {
  const isNegative = a < 0;
  const magnitude = lndivide2(isNegative ? -a : a, b);
  return isNegative ? -magnitude : magnitude;
}
// setup: camera parameters.
// var1, var2, var3 and framecount are not defined in this snippet; they are
// expected to be supplied by whatever runs it.
// Field of view in degrees.
var FOV = 90;
// 128 / tan(FOV / 2), with FOV converted from degrees to radians first.
// NOTE(review): 128 is presumably half of a 256-pixel-wide screen — confirm.
var forward = 128 / Math.tan(FOV * (Math.PI * 2 / 360) / 2);
// Yaw in radians: var1 degrees plus 0.1 degree per unit of framecount.
var yaw = (var1 + framecount * 0.1) * Math.PI / 180;
// Pitch in radians (var2 is treated as degrees).
var pitch = var2 * Math.PI / 180;
// Camera position: x follows framecount, y is fixed at 0, z comes from var3.
var camera_x = framecount;
var camera_y = 0;
var camera_z = var3;
// mode 7 stuff
// constant across the frame
// d: yaw direction (cos, sin) multiplied by camera_z.
var dx = cos(yaw) * camera_z;
var dy = sin(yaw) * camera_z;
// a: base terms, built from forward, yaw and pitch; az is the base of the divisor.
var ax = forward * -sin(yaw) * cos(pitch);
var ay = forward * cos(yaw) * cos(pitch);
var az = forward * sin(pitch);
// b: terms that are later multiplied by (scanline - 112) and added to a.
var bx = sin(yaw) * sin(pitch);
var by = -cos(yaw) * sin(pitch);
var bz = cos(pitch);
// scale values for subpixel precision
// and simulate fixed point math
// x/y terms: multiplied by radix and clamped to -32768..32767.
// z terms: multiplied by radix2 and clamped to 0..65535, so a negative az or bz becomes 0.
dx = clamps(dx * radix); dy = clamps(dy * radix);
ax = clamps(ax * radix); ay = clamps(ay * radix); az = clampu(az * radix2);
bx = clamps(bx * radix); by = clamps(by * radix); bz = clampu(bz * radix2);
// Camera x/y are rounded down to whole units.
camera_x = floor(camera_x); camera_y = floor(camera_y);
// per scanline
// scanline is not defined in this snippet; it is expected to be supplied by
// whatever runs it.
// Numerators (cx, cy) and divisor (cz) vary linearly with the distance from
// scanline 112.
// NOTE(review): 112 is presumably the vertical centre of a 224-line display — confirm.
var cx = clamps(ax + (scanline - 112) * bx);
var cy = clamps(ay + (scanline - 112) * by);
var cz = clampu(az + (scanline - 112) * bz);
// Rescale the camera position by 16 / camera_z (overwrites camera_x / camera_y in place).
camera_x = floor(camera_x * 16 / camera_z);
camera_y = floor(camera_y * 16 / camera_z);
// Each division goes through lndivide(), the log/exp table approximation.
// (radix2 / radix) compensates for differing scale factors; it is 1 when both are 128.
var point_center_x = camera_x + floor(lndivide(cx, cz)) * (radix2 / radix);
var point_center_y = camera_y + floor(lndivide(cy, cz)) * (radix2 / radix);
var offset_x = floor(lndivide(dx, cz)) * (radix2 / radix);
var offset_y = floor(lndivide(dy, cz)) * (radix2 / radix);
// Outputs. The m7* names are assigned without a declaration in this snippet.
// NOTE(review): presumably declared by the host environment — confirm; in
// strict mode these assignments would otherwise throw.
m7a = offset_x;
m7b = point_center_x;
m7c = offset_y;
m7d = point_center_y;
m7x = 0;
m7y = 0;
m7hofs = -128;
m7vofs = -camera_z - scanline;
// The top-level return means this snippet is meant to run as a function body.
return [m7a, m7b, m7c, m7d, m7x, m7y, m7hofs, m7vofs];