/**********************************************************************
This file is part of Crack dot Com's free source code release of
Golgotha. For information about compiling & licensing issues visit
this URL. If that doesn't help, contact Jonathan Clark at
golgotha_source@usa.net (Subject should have "GOLG" in it)
***********************************************************************/
#include "software/r1_software_globals.hh"
#include "software/inline_fpu.hh"
#include "software/amd3d/amd3d.h"
extern sw32 had_subdivisions;
//instead of using left_s, left_t, right_s, and right_t separately,
//the divides and multiplies are nicely vectorized by the amd3d unit,
//and storing them is a single quad store to an array of 2 values,
//rather than two dword stores to two separate variables (see the sketch below)
extern sw32 left_s_t[2];
extern sw32 right_s_t[2];
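//for reference, a rough C picture of the packing (both entries are 16.16 fixed point,
//so the integral texel coordinate is the value>>16 and the fraction is the low 16 bits):
//  sw32 left_s_int  = left_s_t[0]>>16;   sw32 left_t_int  = left_s_t[1]>>16;
//  sw32 left_s_frac = left_s_t[0]<<16;   sw32 left_t_frac = left_s_t[1]<<16;  //fraction moved to the high 16 bits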
extern float mmx0[2];
extern float mmx1[2];
extern float mmx2[2];
extern float mmx3[2];
extern float mmx4[2];
extern float mmx5[2];
extern float mmx6[2];
extern float mmx7[2];
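//for orientation, a rough (uncompiled) sketch of what this routine does - perspective
//correct s/t are only divided out every 16 pixels and stepped linearly in 16.16 fixed
//point in between; the hand-scheduled asm below interleaves these steps with the
//amd3d divides to hide their latencies:
//
//  float z = 1.f / left->ooz;
//  sw32 left_s = qftoi(left->soz * z) + cur_grads.s_adjust;      //16.16
//  sw32 left_t = qftoi(left->toz * z) + cur_grads.t_adjust;      //16.16
//  for (each of the width>>4 subdivisions)
//  {
//    advance ooz/soz/toz by the *dxspan gradients and recompute z, right_s, right_t;
//    dsdx = (right_s - left_s)>>4;  dtdx = (right_t - left_t)>>4;
//    for (16 pixels) { write the texel at (s>>16, t>>16); s += dsdx; t += dtdx; }
//    left_s = right_s;  left_t = right_t;
//  }
//  map the remaining width&15 pixels the same way, using the
//  inverse_leftover_lookup[] reciprocal table in place of the >>4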
void texture_scanline_perspective_unlit_amd3d(w16 *start_pixel,
sw32 start_x,
void *_left,//perspective_span *left,
sw32 width)
{
start_pixel = (w16 *)((w8 *)start_pixel + start_x);
perspective_span *left = (perspective_span *)_left;
_asm
{
//left_z = 1.f / left->ooz;
//left_s = qftoi(left->soz * left_z) + cur_grads.s_adjust;
//left_t = qftoi(left->toz * left_z) + cur_grads.t_adjust;
//sw32 had_subdivisions = width & (~15);
//num_subdivisions = width >> 4;
//num_leftover = width & 15;
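//had_subdivisions keeps width&(~15) so the leftover code at the bottom of the routine
//can tell whether any 16-pixel spans ran (and therefore whether the right-edge z is
//already being computed in mm7)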
mov edi,dword ptr [left]
mov eax,dword ptr [width]
movd mm0, dword ptr [edi]perspective_span.ooz
mov ebx,eax
pfrcp (m1, m0)
and eax,15
shr ebx,4
punpckldq mm0, mm0 //duplicate low 32bits of m0 into high 32 bits of m0
pfrcpit1 (m0, m1)
mov ecx,dword ptr [width]
movq mm2, qword ptr [edi]perspective_span.soz
mov dword ptr [num_leftover],eax
pfrcpit2 (m0, m1)
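//pfrcp gives a ~14-bit approximation of 1/ooz; pfrcpit1/pfrcpit2 are the two
//Newton-Raphson refinement steps that bring it to full 24-bit precision, leaving
//left_z = 1.f/left->ooz duplicated in both halves of mm0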
and ecx,(~15)
//mov eax,dword ptr [edi]perspective_span.l
mov dword ptr [num_subdivisions],ebx
pfmul (m2, m0)
mov dword ptr [had_subdivisions],ecx
//mov dword ptr [left_l],eax
//clear these out
mov dword ptr [dsdx_frac],0
//high 32 bits of mm2 - toz / ooz (aka t)
//low 32 bits of mm2 - soz / ooz (aka s)
pf2id (m3, m2)
mov dword ptr [dtdx_frac],0
//high 32 bits of mm3 - toz / ooz (aka t) - truncated ints
//low 32 bits of mm3 - soz / ooz (aka s) - truncated ints
paddd mm3, qword ptr [cur_grads]tri_gradients.s_adjust
//high 32 bits of mm3 - t + t_adjust
//low 32 bits of mm3 - s + s_adjust
movq qword ptr [left_s_t], mm3
}
if (num_subdivisions)
{
_asm
{
//ooz_right = left->ooz + (cur_grads.doozdxspan);
//soz_right = left->soz + (cur_grads.dsozdxspan);
//toz_right = left->toz + (cur_grads.dtozdxspan);
//edi still has dword ptr [left]
lea ebx,dword ptr [cur_grads]
nop
movd mm1, dword ptr [edi]perspective_span.ooz
mov esi,dword ptr [r1_software_texture_ptr]
movd mm3, dword ptr [ebx]tri_gradients.doozdxspan
mov eax,dword ptr [left_s_t] //left_s
shr esi,1
movq mm0, qword ptr [edi]perspective_span.soz
pfadd (m1, m3)
movq mm2, qword ptr [ebx]tri_gradients.dsozdxspan
sar eax,16 //get integral left_s into eax
mov edi,dword ptr [start_pixel]
pfrcp (m6, m1)
movq mm7,mm1
pfadd (m0, m2)
mov ebx,dword ptr [left_s_t+4] //left_t
//calculate the 1st right_z in mm7
sar ebx,16 //get integral left_t into ebx
punpckldq mm7, mm7 //duplicate low 32 bits of mm7 into high 32 bits of mm7
pfrcpit1 (m7, m6)
mov edx,dword ptr [left_s_t+4] //left_t
mov cl,byte ptr [r1_software_twidth_log2]
add esi,eax
pfrcpit2 (m7, m6)
//calculate starting fractional and integral values for s and t
//esi = (starting_s_coordinate >> 16) + ((starting_t_coordinate >> 16) << r1_software_twidth_log2)
//ecx = starting_s_coordinate << 16
//edx = starting_t_coordinate << 16
//some stuff has been moved up, interleaved w/the mmx code above
shl ebx,cl //multiply integral left_t by texture width
sal edx,16 //get fractional left_t into edx
mov ecx,dword ptr [left_s_t] //left_s
sal ecx,16
add esi,ebx
}
while (num_subdivisions)
{
_asm
{
//right_s = qftoi(soz_right * right_z);
//right_t = qftoi(toz_right * right_z);
//soz_right and toz_right are in mm0
//right_z is in mm7
pfmul (m7, m0)
pf2id (m7, m7)
movq qword ptr [right_s_t],mm7
//calculate ooz_right, soz_right, toz_right, and right_z for the end of the next span:
//if there are more subdivisions, that's the next 16-pixel span; if this is the last one
//and more than 1 pixel remains in the leftover span, calculate the end of that instead.
//if (num_subdivisions!=1)
//{
cmp dword ptr [num_subdivisions],1
je last_subdivision
//ooz_right += (cur_grads.doozdxspan);
//soz_right += (cur_grads.dsozdxspan);
//toz_right += (cur_grads.dtozdxspan);
pfadd (m0, m2)
pfadd (m1, m3)
jmp proceed_with_mapping
//}
//else
//if (num_leftover > 1)
//{
last_subdivision:
cmp dword ptr [num_leftover],1
jle proceed_with_mapping
//calculate the right_z for the end of the leftover span
//ooz_right += (cur_grads.doozdx * num_leftover);
//soz_right += (cur_grads.dsozdx * num_leftover);
//toz_right += (cur_grads.dtozdx * num_leftover);
movd mm2,dword ptr [num_leftover]
movd mm3, dword ptr [cur_grads]tri_gradients.dsozdx
pi2fd (m2, m2)
movd mm4, dword ptr [cur_grads]tri_gradients.dtozdx
pfmul (m3, m2)
movd mm5, dword ptr [cur_grads]tri_gradients.doozdx
pfmul (m4, m2)
pfmul (m5, m2)
pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3
pfadd (m0, m3)
pfadd (m1, m5)
//}
proceed_with_mapping:
//cap the right_s and right_t's so that they're valid
mov eax,dword ptr [right_s_t] //right_s
mov ebx,dword ptr [right_s_t+4] //right_t
add eax,dword ptr [cur_grads]tri_gradients.s_adjust
add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
//cap the right s and t
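//C equivalent of the clamp below (the asm treats s_mask/t_mask as the maximum legal s and t):
//  if (right_s < 0) right_s = 0; else if (right_s > s_mask) right_s = s_mask;
//  if (right_t < 0) right_t = 0; else if (right_t > t_mask) right_t = t_mask;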
cmp eax,0
jge cmp_eax_high
mov eax,0
jmp cmp_ebx_low
cmp_eax_high:
cmp eax,dword ptr [s_mask]
jle cmp_ebx_low
mov eax,dword ptr [s_mask]
cmp_ebx_low:
cmp ebx,0
jge cmp_ebx_high
mov ebx,0
jmp done_compare
cmp_ebx_high:
cmp ebx,dword ptr [t_mask]
jle done_compare
mov ebx,dword ptr [t_mask]
done_compare:
//store the right_s and right_t
//so they can be copied into left_s and left_t at the end of the 16-pixel span
//(they can't be copied now because we still have to calculate (right_s-left_s)>>4 and (right_t-left_t)>>4)
//calculate the next right_z in mm7
//unfortunately, if the span is a multiple of 16, and this is the last set of 16, it will
//calculate an unnecessary z. but it's best to keep this code mixed in w/the integer ops so
//that the amd3d instructions have their execution latencies covered
movq mm7, mm1
pfrcp (m6, m1)
mov dword ptr [right_s_t],eax //right_s
mov dword ptr [right_s_t+4],ebx //right_t
punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7
sub eax,dword ptr [left_s_t] //left_s
sar eax,4
push ebp
pfrcpit1 (m7, m6)
sub ebx,dword ptr [left_s_t+4] //left_t
sar ebx,4
mov word ptr [dsdx_frac+2],ax //this sets the upper 16 bits of dword ptr [dsdx_frac] to ((right_s-left_s)>>4)<<16
pfrcpit2 (m7, m6)
nop
sar eax,16
mov word ptr [dtdx_frac+2],bx //this sets the upper 16 bits of dword ptr [dtdx_frac] to ((right_t-left_t)>>4)<<16
sar ebx,16
mov cl,byte ptr [r1_software_twidth_log2]
shl ebx,cl
add eax,ebx
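//the fractional deltas sit in the top 16 bits of dsdx_frac/dtdx_frac so the per-pixel
//adds spill into the carry flag; the s_t_carry values below presumably exist to fold
//that carry into the combined integral texel step in eax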
//s_t_carry[1] = integral_dsdx + (integral_dtdx<<r1_software_twidth_log2)
//... (per-pixel inner loop for this 16-pixel span: step s/t, write texels, copy
// right_s_t into left_s_t, decrement num_subdivisions) ...
}
}
}
if (num_leftover > 1)
{
if (had_subdivisions==0)
{
//calculate the right_z for the end of span
//ooz_right = left->ooz + (cur_grads.doozdx * num_leftover);
//soz_right = left->soz + (cur_grads.dsozdx * num_leftover);
//toz_right = left->toz + (cur_grads.dtozdx * num_leftover);
_asm
{
movd mm2,dword ptr [num_leftover]
lea ebx,dword ptr [cur_grads]
movd mm3, dword ptr [ebx]tri_gradients.dsozdx
mov edi,dword ptr [left]
movd mm4, dword ptr [ebx]tri_gradients.dtozdx
pi2fd (m2, m2)
movd mm5, dword ptr [ebx]tri_gradients.doozdx
pfmul (m3, m2)
movq mm0, qword ptr [edi]perspective_span.soz
pfmul (m4, m2)
movd mm1, dword ptr [edi]perspective_span.ooz
pfmul (m5, m2)
pfacc (m3, m4) //gets dtozdx*num_leftover into high 32 bits of m3
pfadd (m1, m5) //ooz += doozdx*num_leftover
pfadd (m0, m3) //soz += dsozdx*num_leftover AND toz += dtozdx*num_leftover
//calculate the z at the right endpoint in mm7
movq mm7, mm1
pfrcp (m6, m1)
punpckldq mm7, mm7 //duplicate low 32bits of mm7 into high 32 bits of mm7
pfrcpit1 (m7, m6) //terrible stalls. oh well
pfrcpit2 (m7, m6)
}
}
else
{
//the correct ending right_z is already being calculated
//(see the if (num_subdivisions!=1) case above)
}
_asm
{
//calculate starting fractional and integral values for s and t
//calculate the right endpoint
//right_s = qftoi(soz_right * right_z) + cur_grads.s_adjust;
//right_t = qftoi(toz_right * right_z) + cur_grads.t_adjust;
//soz_right and toz_right are in mm0
//right_z is in mm7
pfmul (m7, m0) //calculate right_s and right_t
mov edi,dword ptr [start_pixel]
mov esi,dword ptr [r1_software_texture_ptr]
mov eax,dword ptr [left_s_t] //left_s
shr esi,1
pf2id (m7, m7) //truncate right_s and right_t
sar eax,16
mov ebx,dword ptr [left_s_t+4] //left_t
sar ebx,16
movq qword ptr [right_s_t],mm7
mov edx,dword ptr [left_s_t+4] //left_t
add esi,eax
mov cl,byte ptr [r1_software_twidth_log2]
shl ebx,cl
sal edx,16
mov ecx,dword ptr [left_s_t] //left_s
sal ecx,16
add esi,ebx
mov eax,dword ptr [right_s_t] //right_s
mov ebx,dword ptr [right_s_t+4] //right_t
add eax,dword ptr [cur_grads]tri_gradients.s_adjust
add ebx,dword ptr [cur_grads]tri_gradients.t_adjust
//cap the right s and t
cmp eax,0
jge cmp_eax_high_2
mov eax,0
jmp cmp_ebx_low_2
cmp_eax_high_2:
cmp eax,dword ptr [s_mask]
jle cmp_ebx_low_2
mov eax,dword ptr [s_mask]
cmp_ebx_low_2:
cmp ebx,0
jge cmp_ebx_high_2
mov ebx,0
jmp done_compare_2
cmp_ebx_high_2:
cmp ebx,dword ptr [t_mask]
jle done_compare_2
mov ebx,dword ptr [t_mask]
done_compare_2:
//calculate the deltas (left to right)
//temp_dsdx = qftoi((float)(right_s - left_s) * inverse_leftover_lookup[num_leftover]);
//temp_dtdx = qftoi((float)(right_t - left_t) * inverse_leftover_lookup[num_leftover]);
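//inverse_leftover_lookup[n] is presumably a small table of 1.0f/n reciprocals, which
//spreads the 16.16 deltas across the leftover pixels without an integer divide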
sub eax,dword ptr [left_s_t] //left_s
sub ebx,dword ptr [left_s_t+4] //left_t
movd mm0,eax //temp_dsdx
push ebp
movd mm1,ebx //temp_dtdx
mov ebp, dword ptr [num_leftover]
pi2fd (m0, m0)
movd mm2, dword ptr [inverse_leftover_lookup + ebp*4]
pi2fd (m1, m1)
pfmul (m0, m2)
pfmul (m1, m2) //bad stalls here
pf2id (m0, m0)
pf2id (m1, m1)
movd eax, mm0 //temp_dsdx
movd ebx, mm1 //temp_dtdx
//calculate the fractional and integral delta vars
//s_t_carry[0] = (temp_dsdx>>16) + ((temp_dtdx>>16)<<r1_software_twidth_log2)
//s_t_carry[1] = (temp_dsdx>>16) + (((temp_dtdx>>16)+1)<<r1_software_twidth_log2)
//starting texel offset = (left_s_t[0]>>16) + ((left_s_t[1]>>16)<<r1_software_twidth_log2)