;////////////////////////////////////////////////////////////////////////////////////////
;// Spherical Harmonic Dots 
;// Copyright (c) 2004   adresd <adresd_ps2dev@yahoo.com>
;// 
;// Licensed under the AFL v2.0. See the file LICENSE included with this
;// distribution for licensing terms.
;//
;// This is my attempt at a procedural object generator
;// it generates a dot pattern of a spherical harmonic object
;//
;// first builds a list of dots for one segment, then transforms that list
;// then kicks that while building another list.
;//
;// Note that this code is highly unoptimized, but it may still be useful to learn from.
;//

; VU data-memory layout. Every value is an address used with lq/ilwr/iaddiu
; relative to vi00 by the code below.
iScreenMatrix   = 0	;object->screen transformation matrix (4 entries, +0..+3, loaded into vf28-vf31)

iDU             = 12	;du in each member (TWOPI / (float)resolutioni)
iDV             = 13	;dv in each member (PI / (float)resolutionj)
iSCALE          = 14	;scale in all members
iMFX            = 15	;(mf_X) pow exponents: x=mf[1],y=mf[3],z=mf[5],w=mf[7]
iMF02           = 16	;(mf_02) angle multipliers for v: x=mf[0],z=mf[2]
iMF46           = 17	;(mf_46) angle multipliers for u: x=mf[4],z=mf[6]
iRESOLI         = 18	;resolution-i (int, read from the w field with ilwr.w)
iRESOLJ         = 19    ;resolution-j (int, read from the w field with ilwr.w)
iGIFTAGJ        = 20	;gif tag (for j verts), copied to the start of the kick buffer

vfData          = 30	;storage for generated verts (written by J_LOOP, read by rendercall)
vfKick          = 500	;storage for gifpkt (giftag followed by colour/xyz pairs)

; Exported labels marking the first instruction of the microprogram and the
; address just past its last instruction.
.global         g_Vu1_SparmGenDots
.global         g_Vu1_SparmGenDots_End
                
                .p2align 4  ; align what follows to 2^4 = 16 bytes (one quadword)
                .vu         ; switch the assembler to VU (upper/lower pair) mode

; vas-asm adaptation of vcl powf function by
; (c) Ian Stephenson
;
; ian@dctsystems.freeserve.co.uk
; adapted from vcl to vas-macro by adresd
;
; MYFAST_POW result,a,b,tmp - approximate pow(a,b) on all four fields at once.
;
; The float bit pattern of a is reinterpreted as an integer and scaled by
; 2^-23, then biased by -127 to give an approximate log2(a). That is
; multiplied by b, and the steps are undone (+127, *2^23, float->int) so the
; resulting bit pattern is an approximate float a^b.
;
;   result : destination register (receives the FTOI0 output)
;   a      : base; OVERWRITTEN by the macro (ITOF0 and MULI write back to it)
;   b      : exponent (read only)
;   tmp    : scratch register, clobbered
; Also clobbers the I register.
;
; NOTE(review): the immediates -127, 127 and 8388608 are written without a
; decimal point - confirm the assembler encodes them as floating-point values.
.macro MYFAST_POW result,a,b,tmp
	ITOF0	\a,\a					nop
	NOP						loi	0.00000011920928955078125	; 2^-23
	MULI	\a,\a,I				nop
	NOP						loi	-127
	ADDi	\tmp,\a,I				nop
	mul	\tmp,\tmp,\b			nop
	NOP						loi	127
	ADDi	\tmp,\tmp,I				nop
	NOP						loi	8388608	; 2^23
	MULi	\tmp,\tmp,I				nop
	FTOI0	\result,\tmp			nop
.endm

; g_Vu1_SparmGenDots - microprogram entry point.
; Loads the object->screen matrix into vf28-vf31, writes the GIF tag to the
; start of the kick buffer and reads the two integer loop limits:
;   vi08 = resolution-i (outer / u loop count)
;   vi09 = resolution-j (inner / v loop count)
; vi13 is used here as a temporary address register; later it holds the
; return address for bal/jr calls.
g_Vu1_SparmGenDots:
		nop                             lq          vf28, iScreenMatrix+0(vi00)
		nop                             lq          vf29, iScreenMatrix+1(vi00)
		nop                             lq          vf30, iScreenMatrix+2(vi00)
		nop                             lq          vf31, iScreenMatrix+3(vi00)
                                              
		nop                             iaddiu      vi02, vi00, vfKick      ; kick buffer write pointer
		nop                             iadd        vi03, vi00, vi02      ; xgkick address
		nop                             iaddiu      vi01, vi00, vfData      ; coord address
                
		nop                             lq.xyzw     vf06, iGIFTAGJ(vi00)     ; giftag
		nop                             sqi.xyzw    vf06, (vi02++)          ; store the giftag
		nop                             iaddiu      vi13, vi00, iRESOLI
		nop                             ilwr.w      vi08, (vi13)w	; this is u int limit
		nop                             iaddiu      vi13, vi00, iRESOLJ
		nop                             ilwr.w      vi09, (vi13)w	; this is v int limit
; sparmgen - load the per-object step sizes and scale, then start the u loop.
; Register roles for the loops that follow:
;   vf02 = u (all fields)      vf22 = v (all fields)
;   vf23 = du, vf24 = dv, vf25 = scale
;   vi10 = outer (i) counter, counts down from vi08
sparmgen:
		; vf02 is the u
		; vf22 is the v
		nop					lq.xyzw vf23, iDU(vi00)		; vf23 is du
		nop					lq.xyzw vf24, iDV(vi00)		; vf24 is dv
		nop					lq.xyzw vf25, iSCALE(vi00)	; vf25 is scale
		sub vf02,vf00,vf00		iadd vi10,vi00,vi08		; u = 0 (vf00-vf00), i counter = u limit
I_LOOP:
; Outer loop head: runs once per u step (vi10 counts the remaining steps).
; Prepares everything that depends only on u:
;   vf26 : x=costheta, z=sintheta                (theta = u)
;   vf27 : z=PbSin(mf[4]*u), w=PbCos(mf[6]*u)    (bases for the u pow terms)
; and resets v, the j counter and the vertex write pointer for the inner loop.
		sub vf22,vf00,vf00		iadd vi11,vi00,vi09		; v = 0 (vf00-vf00), j counter = v limit
		; vf26 is x=costheta,z=sintheta
;   sintheta = PbSin(u);
;   costheta = PbCos(u);
		nop					move vf19,vf02	; vf02 is u
		nop					bal vi13,doSinCos	; vf19 is dest, sin,cos,sin,cos
		nop					nop			; branch delay slot
		addy.x vf26x,vf00,vf19y		nop		; vf26.x = 0 + cos(u)
		addx.z vf26z,vf00,vf19x		nop		; vf26.z = 0 + sin(u)
; vf26 now contains x=costheta, z=sintheta

; powf(PbSin(mf[4]*u),mf[5]) 
; powf(PbCos(mf[6]*u),mf[7])
; PbSin(mf[4]*u)  PbCos(mf[6]*u)
		nop					lq.xyzw vf18, iMF46(vi00)	; vf18 is MF46
		nop					move vf19,vf02	; vf02 is u
		mul.xz vf19,vf19,vf18		nop ; vf18 x=mf[4],z=mf[6]
		nop					bal vi13,doSinCos	; vf19 is dest, sin,cos,sin,cos
		nop					nop			; branch delay slot
		addx.z vf27z,vf00,vf19x		nop		; vf27.z = 0 + sin(mf[4]*u)
; vf00.w is 1.0, so copying into a w field by adding to vf00 would give
; 1.0 + cos(mf[6]*u). Multiply by vf00.w instead (1.0 * value) to copy it
; unchanged; doSinCos uses the same idiom (mulz.w) for its own w copy.
		mulw.w vf27,vf00,vf19w		nop		; vf27.w = 1.0 * cos(mf[6]*u)
; vf27 now contains z=PbSin(mf[4]*u),w=PbCos(mf[6]*u)

; reset the vertex write pointer to the start of the output buffer
		nop                           iaddiu      vi01, vi00, vfData      ; coord address
		
J_LOOP:
; Inner loop body: generates one vertex for the current (u,v) and appends it
; to the vertex buffer at vi01.
;
; In:   vf22 = v, vf25 = scale, vf26 = x=costheta,z=sintheta,
;       vf27.z/.w = PbSin(mf[4]*u)/PbCos(mf[6]*u), vi01 = write pointer
; Out:  one quadword (x,y,z,1.0) stored at (vi01++)
; Uses: vf11, vf12, vf18, vf19, vf27.xy, P, I, vi13 (link), plus everything
;       doSinCos touches.
;
;      r = powf(PbSin(mf[0]*v),mf[1]);
;      r += powf(PbCos(mf[2]*v),mf[3]);
; PbSin(mf[0]*v)  PbCos(mf[2]*v)
		nop					lq.xyzw vf18, iMF02(vi00)	; vf18 is MF02
		nop					move vf19,vf22	; vf22 is v
		mul.xz vf19,vf19,vf18		nop ; vf18 x=mf[0],z=mf[2]
		nop					bal vi13,doSinCos	; vf19 is dest, sin,cos,sin,cos
		nop					nop			; branch delay slot
		addx.x vf27x,vf00,vf19x		nop		; vf27.x = 0 + sin(mf[0]*v)
		addw.y vf27y,vf00,vf19w		nop		; vf27.y = 0 + cos(mf[2]*v)
; vf27 now contains x=PbSin(mf[0]*v),y=PbCos(mf[2]*v)
; doSinCos clobbers vf18, so the exponents are (re)loaded only now
		nop					lq.xyzw vf18, iMFX(vi00)	; vf18 is MFX

; vf27 (in) should have x=PbSin(mf[0]*v),y=PbCos(mf[2]*v),z=PbSin(mf[4]*u),w=PbCos(mf[6]*u)
; vf18 (mf_X) should have x=mf[1],y=mf[3],z=mf[5],w=mf[7]
; seeing as we can do four 'pow's at same time, we will
; MYFAST_POW overwrites its base operand, so work on a copy to keep vf27.z/.w
; intact for the next iteration. vf19 is only scratch here.
		nop					move vf12,vf27
		MYFAST_POW vf11,vf12,vf18,vf19		
;MYFAST_POW vf11,vf27,vf18,vf19
; now add them all together
		
; r = sum of the four pow results (x+y+z+w of vf11)
		nop					esum P,vf11
		; result later in P reg
;      sinphi = PbSin(v);
;      cosphi = PbCos(v);
		nop					move vf19,vf22	; vf22 is v
		nop					bal vi13,doSinCos	; vf19 is dest, sin,cos,sin,cos
		nop					nop			; branch delay slot
; returns,vf19 x=sinphi,y=cosphi,z=sinphi,w=cosphi

; pickup result from P reg
		nop					waitp
		nop					mfp.xyzw vf11xyzw,P ; put P into vf11
; vf11 all fields now contain 'r'

;      p_outv->x = r * sinphi * costheta * scale;
;      p_outv->y = r * cosphi *            scale;
;      p_outv->z = r * sinphi * sintheta * scale;
;      p_outv->w = 1.0f;
; now multiply xyz by sincos vals	-	r * sinphi,cosphi,sinphi
		mul.xyz vf12,vf11,vf19		nop
; now multiply x,z by costheta,sintheta  - vf26 should have x=costheta,z=sintheta
		mul.xz  vf12,vf12,vf26		nop
; now multiply xyz by scale		-	r * scale
		mul.xyz vf12,vf12,vf25		nop	
; force w to exactly 1.0. vf00.w is 1.0, so "vf00.w + 1.0" would store 2.0;
; 1.0 * 1.0 gives the intended value.
; NOTE: this changes the stored w from 2.0 to 1.0, so after the perspective
; divide in rendercall the object appears twice as large as before for the
; same scale value.
		mulw.w vf12,vf00,vf00w		nop		; vf12.w = 1.0 * 1.0
; vf12 now holds the finished vertex (x,y,z,1.0)
		nop					sqi vf12,(vi01++)

; Inner loop tail: advance v by dv, loop while the j counter is non-zero, then
; hand the finished strip of vertices to rendercall.
J_ENDL:	; increment v
		add vf22,vf22,vf24		nop		; v += dv
		nop					iaddi vi11,vi11,-1
		; check limit, if under, continue
		nop					ibne vi11,vi00,J_LOOP
		nop					nop			; branch delay slot
	
		; render this strip of points
		; NOTE(review): J_LOOP stored vi09 verts but vi09-1 are rendered here -
		; confirm this matches the vertex count in the gif tag.
		nop					iadd vi05,vi00,vi09	; set num of points to do
		nop					iaddi vi05,vi05,-1
		nop					bal vi13,rendercall
		nop					nop			; branch delay slot

; Outer loop tail: advance u by du and loop while the i counter is non-zero.
; When all strips are done the microprogram ends (E bit); if it is started
; again it resumes after the end marker and jumps back to the entry point.
I_ENDL:	; increment u
		add vf02,vf02,vf23		nop		; u += du
		nop					iaddi vi10,vi10,-1
		; check limit, if under, continue
		nop					ibne vi10,vi00,I_LOOP
		nop					nop			; branch delay slot


            nop                             nop
            nop[e]                          nop			; end of microprogram
            nop                             nop
; go back to start, just in case we are triggered again
		nop					b g_Vu1_SparmGenDots
		nop					nop			; branch delay slot


; rendercall - transform the generated vertices to screen space, build the
; GIF packet in the kick buffer and kick it.
;
; In:    vi05 = number of verts to process. Must be non-zero: the counter is
;               decremented before it is tested, so 0 would not stop the loop
;               immediately.
;        vf28-vf31 = object->screen matrix
;        vi13 = return address (call with bal vi13,rendercall)
; Out:   packet at vfKick = giftag, then per vertex: colour, fixed-point xyz
; Uses:  vi01, vi02, vi03, vi05, vf05, vf06, vf07, vf08, ACC, Q, I
rendercall:                
; this must be called with the num of verts in vi05
;//////////////////////////////////////////////////////////////////////////////////              
		nop                           iaddiu      vi01, vi00, vfData      ; coord address (read pointer)
		nop                             iaddiu      vi02, vi00, vfKick         ; packet write pointer
		nop                             iadd        vi03, vi00, vi02    ; packet start, used by xgkick
		nop                             lq.xyzw     vf06, iGIFTAGJ(vi00)     ; giftag
		nop                             sqi.xyzw    vf06, (vi02++)          ; store the giftag
             ;//////////////////////////////////////////////////////////////////////////////////
                ;// Temporary for colors right now.
                ;// An upper instruction sharing a line with loi still sees the
                ;// previous I value (doSinCos depends on the same behaviour), so
                ;// addi.z below uses 250.0 and addi on the next line uses 1.0.
                mul  vf06,vf00,vf00			loi 250.0                ; vf06 = (0,0,0,1)
                addi.z      vf06,vf00,I         loi 1.0	; vf06.z = 250
                addi        vf07,vf00,I         nop	; vf07 = per-vertex colour step (only x,y are used)
poly_loop:      

;//////////////////////////////////////////////////////////////////////////////////
                nop                             lqi         vf05, (vi01++)          ; XYZ                                                      
                ;//////////////////////////////////////////////////////////////////////////////////
                ;// Project vertex to screen space.
                ;// vf05 = vf28*x + vf29*y + vf30*z + vf31*w, then divide by the resulting w
                                
                mulax     acc,  vf28, vf05x     nop                                 
                madday    acc,  vf29, vf05y     nop                                 
                maddaz    acc,  vf30, vf05z     nop                                 
                maddw     vf05, vf31, vf05w     nop                                 
                nop                             div         q, vf00w, vf05w         ; q = 1.0 / w

                ;// Temporary color stuff: x and y of the colour step up by 1 per vertex
                ;// (the 0.65 loaded into I here is not read anywhere in this routine)
                add.xy      vf06, vf06, vf07    nop
                ftoi0       vf08, vf06          loi         0.65
                nop                             sqi         vf08, (vi02++)          ; store colors

                nop                             waitq                               
                mulq        vf05, vf05, q       nop                                 ; perspective divide
                ftoi4       vf05, vf05          nop                                 ; fixedpoint for gif
                nop                             sqi         vf05, (vi02++)          ; store xyz
                nop                             iaddi       vi05, vi05, -1          ; one vertex done
                nop                             nop                                 ;
                nop                             ibne        vi05, vi00, poly_loop   ; loop check
                nop                             nop
                nop                             xgkick      vi03	; send the packet starting at vi03
                nop                             nop
	NOP						jr vi13		; return to caller
	NOP						nop			; branch delay slot

;//////////////////////////////////////////////////////////////////////////////////
; doSinCos - Returns the sin and cos of up to 2 angles, which must
; be contained in the X and Z elements of vf19.  The sin/cos pair
; is returned in the X/Y elements of vf19 for the first
; angle, and Z/W for the second one.
;
; input  vf19 : x = first angle, z = second angle
; output vf19 : x=sinx, y=cosx, z=sinz, w=cosz
; call with   : bal vi13,doSinCos   (returns through jr vi13)
;
; touches
; ACC,I
; vf14, vf15, vf16, vf17, vf18, vf19, vf20, vf21 
;
; Method: the angle is copied so each pair holds (angle - pi/2, angle); the
; values are range-reduced to a fraction of one cycle in -0.25..+0.25 and a
; 9th-order odd polynomial (k1 a + k2 a^3 + k3 a^5 + k4 a^7 + k5 a^9) is
; evaluated.  An upper instruction sharing a line with loi uses the I value
; loaded by the previous loi, which is why each constant appears one line
; before its use.
;--------------------------------------------------------------------
; Thanks to Colin Hughes (SCEE) for that one.
; Thanks to the playstation2 linux forums and site for it.
doSinCos:               
                mulz.w      vf19, vf00, vf19z   nop                     ; copy angle from z to w
                addx.y      vf19, vf00, vf19x   loi 1.570796            ; copy angle from x to y                                     
                subi.xz     vf19, vf19, i       nop                     ; phase difference for sin as cos ( pi/2 )
                abs         vf19, vf19          nop                     ; mirror cos around zero
                maxw        vf20, vf00, vf00w   loi -0.159155           ; initialise all 1s                                                                             
                mulai       acc,  vf19, i       loi 12582912.0          ; scale so single cycle is range 0 to -1 ( *-1/2pi )                                   
                msubai      acc,  vf20, i       nop                     ; apply bias to remove fractional part
                maddai      acc,  vf20, i       loi -0.159155           ; remove bias to leave original int part                                    
                msubai      acc,  vf19, i       loi 0.5                 ; apply original number to leave fraction range only                                             
                msubi       vf19, vf20, i       nop                     ; adjust range: -0.5 to +0.5
                abs         vf19, vf19          loi 0.25                ; clamp: 0 to +0.5                                         
                subi        vf19, vf19, i       nop                     ; adjust range: -0.25 to +0.25
                mul         vf21, vf19, vf19    loi -76.574959          ; a^2
                muli        vf14, vf19, i       loi -41.341675          ; k4 a   
                muli        vf16, vf19, i       loi 81.602226           ; k2 a
                muli        vf15, vf19, i       nop                     ; k3 a
                mul         vf18, vf21, vf21    nop                     ; a^4
                mul         vf14, vf14, vf21    nop                     ; k4 a^3
                mula        acc,  vf16, vf21    loi 39.710659           ; + k2 a^3                                    
                muli        vf16, vf19, i       nop                     ; k5 a
                mul         vf17, vf18, vf18    nop                     ; a^8
                madda       acc,  vf14, vf18    nop                     ; + k4 a^7
                madda       acc,  vf15, vf18    loi 6.283185            ; + k3 a^5
                maddai      acc,  vf19, i       nop                     ; + k1 a
                madd        vf19, vf16, vf17    nop                     ; + k5 a^9
	NOP						jr vi13		; return to caller
	NOP						nop			; branch delay slot
                
;//

                                
g_Vu1_SparmGenDots_End: