/**********************************************************************
This file is part of Crack dot Com's free source code release of
Golgotha. For information about compiling & licensing issues visit
this URL. If that doesn't help, contact Jonathan Clark at
golgotha_source@usa.net (Subject should have "GOLG" in it)
***********************************************************************/
#include "software/r1_software_globals.hh"
#include "software/inline_fpu.hh"
#include "software/amd3d/amd3d.h"
extern sw32 had_subdivisions;
//instead of using left_s, left_t, right_s, and right_t separately,
//the divides and multiplies are nicely vectorized by the amd3d unit,
//and storing them is a single quad store to an array of 2 values,
//rather than two dword stores to two separate variables (see the sketch below)
extern sw32 left_s_t[2];
extern sw32 right_s_t[2];
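//for reference, a rough C picture of the packing (both entries are 16.16 fixed point,
//so the integral texel coordinate is the value>>16 and the fraction is the low 16 bits):
//  sw32 left_s_int  = left_s_t[0]>>16;   sw32 left_t_int  = left_s_t[1]>>16;
//  sw32 left_s_frac = left_s_t[0]<<16;   sw32 left_t_frac = left_s_t[1]<<16;  //fraction moved to the high 16 bits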
extern float mmx0[2];
extern float mmx1[2];
extern float mmx2[2];
extern float mmx3[2];
extern float mmx4[2];
extern float mmx5[2];
extern float mmx6[2];
extern float mmx7[2];
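//for orientation, a rough (uncompiled) sketch of what this routine does - perspective
//correct s/t are only divided out every 16 pixels and stepped linearly in 16.16 fixed
//point in between; the hand-scheduled asm below interleaves these steps with the
//amd3d divides to hide their latencies:
//
//  float z = 1.f / left->ooz;
//  sw32 left_s = qftoi(left->soz * z) + cur_grads.s_adjust;      //16.16
//  sw32 left_t = qftoi(left->toz * z) + cur_grads.t_adjust;      //16.16
//  for (each of the width>>4 subdivisions)
//  {
//    advance ooz/soz/toz by the *dxspan gradients and recompute z, right_s, right_t;
//    dsdx = (right_s - left_s)>>4;  dtdx = (right_t - left_t)>>4;
//    for (16 pixels) { write the texel at (s>>16, t>>16); s += dsdx; t += dtdx; }
//    left_s = right_s;  left_t = right_t;
//  }
//  map the remaining width&15 pixels the same way, using the
//  inverse_leftover_lookup[] reciprocal table in place of the >>4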
void texture_scanline_perspective_unlit_amd3d(w16 *start_pixel,
sw32 start_x,
void *_left,//perspective_span *left,
sw32 width)
{
start_pixel = (w16 *)((w8 *)start_pixel + start_x);
perspective_span *left = (perspective_span *)_left;
_asm
{
//left_z = 1.f / left->ooz;
//left_s = qftoi(left->soz * left_z) + cur_grads.s_adjust;
//left_t = qftoi(left->toz * left_z) + cur_grads.t_adjust;
//sw32 had_subdivisions = width & (~15);
//num_subdivisions = width >> 4;
//num_leftover = width & 15;
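//had_subdivisions keeps width&(~15) so the leftover code at the bottom of the routine
//can tell whether any 16-pixel spans ran (and therefore whether the right-edge z is
//already being computed in mm7)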
mov edi,dword ptr [left]
mov eax,dword ptr [width]
movd mm0, dword ptr [edi]perspective_span.ooz
mov ebx,eax
pfrcp (m1, m0)
and eax,15
shr ebx,4
punpckldq mm0, mm0 //duplicate low 32bits of m0 into high 32 bits of m0
pfrcpit1 (m0, m1)
mov ecx,dword ptr [width]
movq mm2, qword ptr [edi]perspective_span.soz
mov dword ptr [num_leftover],eax
pfrcpit2 (m0, m1)
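//pfrcp gives a ~14-bit approximation of 1/ooz; pfrcpit1/pfrcpit2 are the two
//Newton-Raphson refinement steps that bring it to full 24-bit precision, leaving
//left_z = 1.f/left->ooz duplicated in both halves of mm0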
and ecx,(~15)
//mov eax,dword ptr [edi]perspective_span.l
mov dword ptr [num_subdivisions],ebx
pfmul (m2, m0)
mov dword ptr [had_subdivisions],ecx
//mov dword ptr [left_l],eax
//clear these out
mov dword ptr [dsdx_frac],0
//high 32 bits of mm2 - toz / ooz (aka t)
//low 32 bits of mm2 - soz / ooz (aka s)
pf2id (m3, m2)
mov dword ptr [dtdx_frac],0
//high 32 bits of mm3 - toz / ooz (aka t) - truncated ints
//low 32 bits of mm3 - soz / ooz (aka s) - truncated ints
paddd mm3, qword ptr [cur_grads]tri_gradients.s_adjust
//high 32 bits of mm3 - t + t_adjust
//low 32 bits of mm3 - s + s_adjust
movq qword ptr [left_s_t], mm3
}
if (num_subdivisions)
{
_asm
{
//ooz_right = left->ooz + (cur_grads.doozdxspan);
//soz_right = left->soz + (cur_grads.dsozdxspan);
//toz_right = left->toz + (cur_grads.dtozdxspan);
//edi still has dword ptr [left]
lea ebx,dword ptr [cur_grads]
nop
movd mm1, dword ptr [edi]perspective_span.ooz
mov esi,dword ptr [r1_software_texture_ptr]
movd mm3, dword ptr [ebx]tri_gradients.doozdxspan
mov eax,dword ptr [left_s_t] //left_s
shr esi,1
movq mm0, qword ptr [edi]perspective_span.soz
pfadd (m1, m3)
movq mm2, qword ptr [ebx]tri_gradients.dsozdxspan
sar eax,16 //get integral left_s into eax
mov edi,dword ptr [start_pixel]
pfrcp (m6, m1)
movq mm7,mm1
pfadd (m0, m2)
mov ebx,dword ptr [left_s_t+4] //left_t
//calculate the 1st right_z in mm7
sar ebx,16 //get integral left_t into ebx
punpckldq mm7, mm7 //duplicate low 32 bits of mm7 into high 32 bits of mm7
pfrcpit1 (m7, m6)
mov edx,dword ptr [left_s_t+4] //left_t
mov cl,byte ptr [r1_software_twidth_log2]
add esi,eax
pfrcpit2 (m7, m6)
//calculate starting fractional and integral values for s and t
//esi = (starting_s_coordinate >> 16) + ((starting_t_coordinate >> 16) << r1_software_twidth_log2)
//ecx = starting_s_coordinate << 16
//edx = starting_t_coordinate << 16
//some stuff has been moved up, interleaved w/the mmx code above
shl ebx,cl //multiply integral left_t by texture width
sal edx,16 //get fractional left_t into edx
mov ecx,dword ptr [left_s_t] //left_s
sal ecx,16
add esi,ebx
}
while (num_subdivisions)
{
_asm
{
//right_s = qftoi(soz_right * right_z);
//right_t = qftoi(toz_right * right_z);
//soz_right and toz_right are in mm0
//right_z is in mm7
pfmul (m7, m0)
pf2id (m7, m7)
movq qword ptr [right_s_t],mm7
//calculate ooz_right, soz_right, toz_right, and right_z for the end of the next span:
//if there are more subdivisions, that's the next 16-pixel span; if this is the last one
//and more than 1 pixel remains in the leftover span, calculate the end of that instead.
//if (num_subdivisions!=1)
//{
cmp dword ptr [num_subdivisions],1
je last_subdivision
//ooz_right += (cur_grads.doozdxspan);
//soz_right += (cur_grads.dsozdxspan);
//toz_right += (cur_grads.dtozdxspan);
pfadd (m0, m2)
pfadd (m1, m3)
jmp proceed_with_mapping
//}
//else
//if (num_leftover > 1)
//{
last_subdivision:
cmp dword ptr [num_leftover],1
jle proceed_with_mapping
//calculate the right_z for the end of the leftover span
//ooz_right += (cur_grads.doozdx * num_leftover);
//soz_right += (cur_grads.dsozdx * num_leftover);
//toz_right += (cur_grads.dtozdx * num_leftover);
movd mm2,dword ptr [num_leftover]
movd mm3, dword ptr [cur_grads]tri_gradients.dsozdx
pi2fd (m2, m2)
movd mm4, dword ptr [cur_grads]tri_gradients.dtozdx
pfmul (m3, m2)
movd mm5, dword ptr [cur_grads]tri_gradients.doozdx
pfmul (m4, m2)
pfmul (m5, m2)
pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3
pfadd (m0, m3)
pfadd (m1, m5)
//}
proceed_with_mapping:
//cap the right_s and right_t's so that they're valid
mov eax,dword ptr [right_s_t] //right_s
mov ebx,dword ptr [right_s_t+4] //right_t
add eax,dword ptr [cur_grads]tri_gradients.s_adjust
add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
//cap the right s and t
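//C equivalent of the clamp below (the asm treats s_mask/t_mask as the maximum legal s and t):
//  if (right_s < 0) right_s = 0; else if (right_s > s_mask) right_s = s_mask;
//  if (right_t < 0) right_t = 0; else if (right_t > t_mask) right_t = t_mask;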
cmp eax,0
jge cmp_eax_high
mov eax,0
jmp cmp_ebx_low
cmp_eax_high:
cmp eax,dword ptr [s_mask]
jle cmp_ebx_low
mov eax,dword ptr [s_mask]
cmp_ebx_low:
cmp ebx,0
jge cmp_ebx_high
mov ebx,0
jmp done_compare
cmp_ebx_high:
cmp ebx,dword ptr [t_mask]
jle done_compare
mov ebx,dword ptr [t_mask]
done_compare:
//store the right_s and right_t
//so they can be copied into left_s and left_t at the end of the 16-pixel span
//(they can't be copied now because we still have to calculate (right_s-left_s)>>4 and (right_t-left_t)>>4)
//calculate the next right_z in mm7
//unfortunately, if the span is a multiple of 16, and this is the last set of 16, it will
//calculate an unnecessary z. but it's best to keep this code mixed in w/the integer ops so
//that the amd3d instructions have their execution latencies covered
movq mm7, mm1
pfrcp (m6, m1)
mov dword ptr [right_s_t],eax //right_s
mov dword ptr [right_s_t+4],ebx //right_t
punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7
sub eax,dword ptr [left_s_t] //left_s
sar eax,4
push ebp
pfrcpit1 (m7, m6)
sub ebx,dword ptr [left_s_t+4] //left_t
sar ebx,4
mov word ptr [dsdx_frac+2],ax //this sets the upper 16 bits of dword ptr [dsdx_frac] to ((right_s-left_s)>>4)<<16
pfrcpit2 (m7, m6)
nop
sar eax,16
mov word ptr [dtdx_frac+2],bx //this sets the upper 16 bits of dword ptr [dtdx_frac] to ((right_t-left_t)>>4)<<16
sar ebx,16
mov cl,byte ptr [r1_software_twidth_log2]
shl ebx,cl
add eax,ebx
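//the fractional deltas sit in the top 16 bits of dsdx_frac/dtdx_frac so the per-pixel
//adds spill into the carry flag; the s_t_carry values below presumably exist to fold
//that carry into the combined integral texel step in eax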
//s_t_carry[1] = integral_dsdx + (integral_dtdx<<r1_software_twidth_log2)
//... (per-pixel inner loop for this 16-pixel span: step s/t, write texels, copy
// right_s_t into left_s_t, decrement num_subdivisions) ...
}
}
}
if (num_leftover > 1)
{
if (had_subdivisions==0)
{
//calculate the right_z for the end of span
//ooz_right = left->ooz + (cur_grads.doozdx * num_leftover);
//soz_right = left->soz + (cur_grads.dsozdx * num_leftover);
//toz_right = left->toz + (cur_grads.dtozdx * num_leftover);
_asm
{
movd mm2,dword ptr [num_leftover]
lea ebx,dword ptr [cur_grads]
movd mm3, dword ptr [ebx]tri_gradients.dsozdx
mov edi,dword ptr [left]
movd mm4, dword ptr [ebx]tri_gradients.dtozdx
pi2fd (m2, m2)
movd mm5, dword ptr [ebx]tri_gradients.doozdx
pfmul (m3, m2)
movq mm0, qword ptr [edi]perspective_span.soz
pfmul (m4, m2)
movd mm1, dword ptr [edi]perspective_span.ooz
pfmul (m5, m2)
pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3
pfadd (m1, m5) //ooz += doozdx*num_leftover
pfadd (m0, m3) //soz += dsozdx*num_leftover AND toz += dtozdx*num_leftover
//calculate the z at the right endpoint in mm7
movq mm7, mm1
pfrcp (m6, m1)
punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7
pfrcpit1 (m7, m6) //terrible stalls. oh well
pfrcpit2 (m7, m6)
}
}
else
{
//the correct ending right_z is already being calculated
//(see the if (num_subdivisions!=1) case above)
}
_asm
{
//calculate starting fractional and integral values for s and t
//calculate the right endpoint
//right_s = qftoi(soz_right * right_z) + cur_grads.s_adjust;
//right_t = qftoi(toz_right * right_z) + cur_grads.t_adjust;
//soz_right and toz_right are in mm0
//right_z is in mm7
pfmul (m7, m0) //calculate right_s and right_t
mov edi,dword ptr [start_pixel]
mov esi,dword ptr [r1_software_texture_ptr]
mov eax,dword ptr [left_s_t] //left_s
shr esi,1
pf2id (m7, m7) //truncate right_s and right_t
sar eax,16
mov ebx,dword ptr [left_s_t+4] //left_t
sar ebx,16
movq qword ptr [right_s_t],mm7
mov edx,dword ptr [left_s_t+4] //left_t
add esi,eax
mov cl,byte ptr [r1_software_twidth_log2]
shl ebx,cl
sal edx,16
mov ecx,dword ptr [left_s_t] //left_s
sal ecx,16
add esi,ebx
mov eax,dword ptr [right_s_t] //right_s
mov ebx,dword ptr [right_s_t+4] //right_t
add eax,dword ptr [cur_grads]tri_gradients.s_adjust
add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
//cap the right s and t
cmp eax,0
jge cmp_eax_high_2
mov eax,0
jmp cmp_ebx_low_2
cmp_eax_high_2:
cmp eax,dword ptr [s_mask]
jle cmp_ebx_low_2
mov eax,dword ptr [s_mask]
cmp_ebx_low_2:
cmp ebx,0
jge cmp_ebx_high_2
mov ebx,0
jmp done_compare_2
cmp_ebx_high_2:
cmp ebx,dword ptr [t_mask]
jle done_compare_2
mov ebx,dword ptr [t_mask]
done_compare_2:
//calculate the deltas (left to right)
//temp_dsdx = qftoi((float)(right_s - left_s) * inverse_leftover_lookup[num_leftover]);
//temp_dtdx = qftoi((float)(right_t - left_t) * inverse_leftover_lookup[num_leftover]);
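//inverse_leftover_lookup[n] is presumably a small table of 1.0f/n reciprocals, which
//spreads the 16.16 deltas across the leftover pixels without an integer divide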
sub eax,dword ptr [left_s_t] //left_s
sub ebx,dword ptr [left_s_t+4] //left_t
movd mm0,eax //temp_dsdx
push ebp
movd mm1,ebx //temp_dtdx
mov ebp, dword ptr [num_leftover]
pi2fd (m0, m0)
movd mm2, dword ptr [inverse_leftover_lookup + ebp*4]
pi2fd (m1, m1)
pfmul (m0, m2)
pfmul (m1, m2) //bad stalls here
pf2id (m0, m0)
pf2id (m1, m1)
movd eax, mm0 //temp_dsdx
movd ebx, mm1 //temp_dtdx
//calculate the fractional and integral delta vars
//s_t_carry[0] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2)
//s_t_carry[1] = (temp_dsdx>>16) + (((temp_dtdx>>16)+1)<<r1_software_twidth_log2)
//starting texel offset = (left_s_t[0]>>16) + ((left_s_t[1]>>16)<<r1_software_twidth_log2)