From 48cc350e21acd2b4b03c76937e2861af5271435a Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keith@tungstengraphics.com>
Date: Mon, 26 Aug 2002 22:16:18 +0000
Subject: merged r200-0-1-branch

---
 shared-core/radeon_cp.c    | 363 ++++++++++++++++++++++++++++++++--
 shared-core/radeon_drm.h   |  56 +++++-
 shared-core/radeon_drv.h   | 127 ++++++++----
 shared-core/radeon_state.c | 481 ++++++++++++++++++++++++++++++++++-----------
 4 files changed, 859 insertions(+), 168 deletions(-)

(limited to 'shared-core')

diff --git a/shared-core/radeon_cp.c b/shared-core/radeon_cp.c
index 8250c09b..01069e49 100644
--- a/shared-core/radeon_cp.c
+++ b/shared-core/radeon_cp.c
@@ -44,6 +44,266 @@
 
 
 /* CP microcode (from ATI) */
+static u32 R200_cp_microcode[][2] = {
+	{ 0x21007000, 0000000000 },        
+	{ 0x20007000, 0000000000 }, 
+	{ 0x000000ab, 0x00000004 },
+	{ 0x000000af, 0x00000004 },
+	{ 0x66544a49, 0000000000 },
+	{ 0x49494174, 0000000000 },
+	{ 0x54517d83, 0000000000 },
+	{ 0x498d8b64, 0000000000 },
+	{ 0x49494949, 0000000000 },
+	{ 0x49da493c, 0000000000 },
+	{ 0x49989898, 0000000000 },
+	{ 0xd34949d5, 0000000000 },
+	{ 0x9dc90e11, 0000000000 },
+	{ 0xce9b9b9b, 0000000000 },
+	{ 0x000f0000, 0x00000016 },
+	{ 0x352e232c, 0000000000 },
+	{ 0x00000013, 0x00000004 },
+	{ 0x000f0000, 0x00000016 },
+	{ 0x352e272c, 0000000000 },
+	{ 0x000f0001, 0x00000016 },
+	{ 0x3239362f, 0000000000 },
+	{ 0x000077ef, 0x00000002 },
+	{ 0x00061000, 0x00000002 },
+	{ 0x00000020, 0x0000001a },
+	{ 0x00004000, 0x0000001e },
+	{ 0x00061000, 0x00000002 },
+	{ 0x00000020, 0x0000001a },
+	{ 0x00004000, 0x0000001e },
+	{ 0x00061000, 0x00000002 },
+	{ 0x00000020, 0x0000001a },
+	{ 0x00004000, 0x0000001e },
+	{ 0x00000016, 0x00000004 },
+	{ 0x0003802a, 0x00000002 },
+	{ 0x040067e0, 0x00000002 },
+	{ 0x00000016, 0x00000004 },
+	{ 0x000077e0, 0x00000002 },
+	{ 0x00065000, 0x00000002 },
+	{ 0x000037e1, 0x00000002 },
+	{ 0x040067e1, 0x00000006 },
+	{ 0x000077e0, 0x00000002 },
+	{ 0x000077e1, 0x00000002 },
+	{ 0x000077e1, 0x00000006 },
+	{ 0xffffffff, 0000000000 },
+	{ 0x10000000, 0000000000 },
+	{ 0x0003802a, 0x00000002 },
+	{ 0x040067e0, 0x00000006 },
+	{ 0x00007675, 0x00000002 },
+	{ 0x00007676, 0x00000002 },
+	{ 0x00007677, 0x00000002 },
+	{ 0x00007678, 0x00000006 },
+	{ 0x0003802b, 0x00000002 },
+	{ 0x04002676, 0x00000002 },
+	{ 0x00007677, 0x00000002 },
+	{ 0x00007678, 0x00000006 },
+	{ 0x0000002e, 0x00000018 },
+	{ 0x0000002e, 0x00000018 },
+	{ 0000000000, 0x00000006 },
+	{ 0x0000002f, 0x00000018 },
+	{ 0x0000002f, 0x00000018 },
+	{ 0000000000, 0x00000006 },
+	{ 0x01605000, 0x00000002 },
+	{ 0x00065000, 0x00000002 },
+	{ 0x00098000, 0x00000002 },
+	{ 0x00061000, 0x00000002 },
+	{ 0x64c0603d, 0x00000004 },
+	{ 0x00080000, 0x00000016 },
+	{ 0000000000, 0000000000 },
+	{ 0x0400251d, 0x00000002 },
+	{ 0x00007580, 0x00000002 },
+	{ 0x00067581, 0x00000002 },
+	{ 0x04002580, 0x00000002 },
+	{ 0x00067581, 0x00000002 },
+	{ 0x00000046, 0x00000004 },
+	{ 0x00005000, 0000000000 },
+	{ 0x00061000, 0x00000002 },
+	{ 0x0000750e, 0x00000002 },
+	{ 0x00019000, 0x00000002 },
+	{ 0x00011055, 0x00000014 },
+	{ 0x00000055, 0x00000012 },
+	{ 0x0400250f, 0x00000002 },
+	{ 0x0000504a, 0x00000004 },
+	{ 0x00007565, 0x00000002 },
+	{ 0x00007566, 0x00000002 },
+	{ 0x00000051, 0x00000004 },
+	{ 0x01e655b4, 0x00000002 },
+	{ 0x4401b0dc, 0x00000002 },
+	{ 0x01c110dc, 0x00000002 },
+	{ 0x2666705d, 0x00000018 },
+	{ 0x040c2565, 0x00000002 },
+	{ 0x0000005d, 0x00000018 },
+	{ 0x04002564, 0x00000002 },
+	{ 0x00007566, 0x00000002 },
+	{ 0x00000054, 0x00000004 },
+	{ 0x00401060, 0x00000008 },
+	{ 0x00101000, 0x00000002 },
+	{ 0x000d80ff, 0x00000002 },
+	{ 0x00800063, 0x00000008 },
+	{ 0x000f9000, 0x00000002 },
+	{ 0x000e00ff, 0x00000002 },
+	{ 0000000000, 0x00000006 },
+	{ 0x00000080, 0x00000018 },
+	{ 0x00000054, 0x00000004 },
+	{ 0x00007576, 0x00000002 },
+	{ 0x00065000, 0x00000002 },
+	{ 0x00009000, 0x00000002 },
+	{ 0x00041000, 0x00000002 },
+	{ 0x0c00350e, 0x00000002 },
+	{ 0x00049000, 0x00000002 },
+	{ 0x00051000, 0x00000002 },
+	{ 0x01e785f8, 0x00000002 },
+	{ 0x00200000, 0x00000002 },
+	{ 0x00600073, 0x0000000c },
+	{ 0x00007563, 0x00000002 },
+	{ 0x006075f0, 0x00000021 },
+	{ 0x20007068, 0x00000004 },
+	{ 0x00005068, 0x00000004 },
+	{ 0x00007576, 0x00000002 },
+	{ 0x00007577, 0x00000002 },
+	{ 0x0000750e, 0x00000002 },
+	{ 0x0000750f, 0x00000002 },
+	{ 0x00a05000, 0x00000002 },
+	{ 0x00600076, 0x0000000c },
+	{ 0x006075f0, 0x00000021 },
+	{ 0x000075f8, 0x00000002 },
+	{ 0x00000076, 0x00000004 },
+	{ 0x000a750e, 0x00000002 },
+	{ 0x0020750f, 0x00000002 },
+	{ 0x00600079, 0x00000004 },
+	{ 0x00007570, 0x00000002 },
+	{ 0x00007571, 0x00000002 },
+	{ 0x00007572, 0x00000006 },
+	{ 0x00005000, 0x00000002 },
+	{ 0x00a05000, 0x00000002 },
+	{ 0x00007568, 0x00000002 },
+	{ 0x00061000, 0x00000002 },
+	{ 0x00000084, 0x0000000c },
+	{ 0x00058000, 0x00000002 },
+	{ 0x0c607562, 0x00000002 },
+	{ 0x00000086, 0x00000004 },
+	{ 0x00600085, 0x00000004 },
+	{ 0x400070dd, 0000000000 },
+	{ 0x000380dd, 0x00000002 },
+	{ 0x00000093, 0x0000001c },
+	{ 0x00065095, 0x00000018 },
+	{ 0x040025bb, 0x00000002 },
+	{ 0x00061096, 0x00000018 },
+	{ 0x040075bc, 0000000000 },
+	{ 0x000075bb, 0x00000002 },
+	{ 0x000075bc, 0000000000 },
+	{ 0x00090000, 0x00000006 },
+	{ 0x00090000, 0x00000002 },
+	{ 0x000d8002, 0x00000006 },
+	{ 0x00005000, 0x00000002 },
+	{ 0x00007821, 0x00000002 },
+	{ 0x00007800, 0000000000 },
+	{ 0x00007821, 0x00000002 },
+	{ 0x00007800, 0000000000 },
+	{ 0x01665000, 0x00000002 },
+	{ 0x000a0000, 0x00000002 },
+	{ 0x000671cc, 0x00000002 },
+	{ 0x0286f1cd, 0x00000002 },
+	{ 0x000000a3, 0x00000010 },
+	{ 0x21007000, 0000000000 },
+	{ 0x000000aa, 0x0000001c },
+	{ 0x00065000, 0x00000002 },
+	{ 0x000a0000, 0x00000002 },
+	{ 0x00061000, 0x00000002 },
+	{ 0x000b0000, 0x00000002 },
+	{ 0x38067000, 0x00000002 },
+	{ 0x000a00a6, 0x00000004 },
+	{ 0x20007000, 0000000000 },
+	{ 0x01200000, 0x00000002 },
+	{ 0x20077000, 0x00000002 },
+	{ 0x01200000, 0x00000002 },
+	{ 0x20007000, 0000000000 },
+	{ 0x00061000, 0x00000002 },
+	{ 0x0120751b, 0x00000002 },
+	{ 0x8040750a, 0x00000002 },
+	{ 0x8040750b, 0x00000002 },
+	{ 0x00110000, 0x00000002 },
+	{ 0x000380dd, 0x00000002 },
+	{ 0x000000bd, 0x0000001c },
+	{ 0x00061096, 0x00000018 },
+	{ 0x844075bd, 0x00000002 },
+	{ 0x00061095, 0x00000018 },
+	{ 0x840075bb, 0x00000002 },
+	{ 0x00061096, 0x00000018 },
+	{ 0x844075bc, 0x00000002 },
+	{ 0x000000c0, 0x00000004 },
+	{ 0x804075bd, 0x00000002 },
+	{ 0x800075bb, 0x00000002 },
+	{ 0x804075bc, 0x00000002 },
+	{ 0x00108000, 0x00000002 },
+	{ 0x01400000, 0x00000002 },
+	{ 0x006000c4, 0x0000000c },
+	{ 0x20c07000, 0x00000020 },
+	{ 0x000000c6, 0x00000012 },
+	{ 0x00800000, 0x00000006 },
+	{ 0x0080751d, 0x00000006 },
+	{ 0x000025bb, 0x00000002 },
+	{ 0x000040c0, 0x00000004 },
+	{ 0x0000775c, 0x00000002 },
+	{ 0x00a05000, 0x00000002 },
+	{ 0x00661000, 0x00000002 },
+	{ 0x0460275d, 0x00000020 },
+	{ 0x00004000, 0000000000 },
+	{ 0x00007999, 0x00000002 },
+	{ 0x00a05000, 0x00000002 },
+	{ 0x00661000, 0x00000002 },
+	{ 0x0460299b, 0x00000020 },
+	{ 0x00004000, 0000000000 },
+	{ 0x01e00830, 0x00000002 },
+	{ 0x21007000, 0000000000 },
+	{ 0x00005000, 0x00000002 },
+	{ 0x00038042, 0x00000002 },
+	{ 0x040025e0, 0x00000002 },
+	{ 0x000075e1, 0000000000 },
+	{ 0x00000001, 0000000000 },
+	{ 0x000380d9, 0x00000002 },
+	{ 0x04007394, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+	{ 0000000000, 0000000000 },
+};
+
+
 static u32 radeon_cp_microcode[][2] = {
 	{ 0x21007000, 0000000000 },
 	{ 0x20007000, 0000000000 },
@@ -345,6 +605,8 @@ static int radeon_do_pixcache_flush( drm_radeon_private_t *dev_priv )
 	u32 tmp;
 	int i;
 
+	dev_priv->stats.boxes |= RADEON_BOX_WAIT_IDLE;
+
 	tmp  = RADEON_READ( RADEON_RB2D_DSTCACHE_CTLSTAT );
 	tmp |= RADEON_RB2D_DC_FLUSH_ALL;
 	RADEON_WRITE( RADEON_RB2D_DSTCACHE_CTLSTAT, tmp );
@@ -369,6 +631,8 @@ static int radeon_do_wait_for_fifo( drm_radeon_private_t *dev_priv,
 {
 	int i;
 
+	dev_priv->stats.boxes |= RADEON_BOX_WAIT_IDLE;
+
 	for ( i = 0 ; i < dev_priv->usec_timeout ; i++ ) {
 		int slots = ( RADEON_READ( RADEON_RBBM_STATUS )
 			      & RADEON_RBBM_FIFOCNT_MASK );
@@ -387,6 +651,8 @@ static int radeon_do_wait_for_idle( drm_radeon_private_t *dev_priv )
 {
 	int i, ret;
 
+	dev_priv->stats.boxes |= RADEON_BOX_WAIT_IDLE;
+
 	ret = radeon_do_wait_for_fifo( dev_priv, 64 );
 	if ( ret ) return ret;
 
@@ -420,11 +686,26 @@ static void radeon_cp_load_microcode( drm_radeon_private_t *dev_priv )
 	radeon_do_wait_for_idle( dev_priv );
 
 	RADEON_WRITE( RADEON_CP_ME_RAM_ADDR, 0 );
-	for ( i = 0 ; i < 256 ; i++ ) {
-		RADEON_WRITE( RADEON_CP_ME_RAM_DATAH,
-			      radeon_cp_microcode[i][1] );
-		RADEON_WRITE( RADEON_CP_ME_RAM_DATAL,
-			      radeon_cp_microcode[i][0] );
+
+	if (dev_priv->is_r200)
+	{
+		DRM_INFO("Loading R200 Microcode\n");
+		for ( i = 0 ; i < 256 ; i++ ) 
+		{
+			RADEON_WRITE( RADEON_CP_ME_RAM_DATAH,
+				      R200_cp_microcode[i][1] );
+			RADEON_WRITE( RADEON_CP_ME_RAM_DATAL,
+				      R200_cp_microcode[i][0] );
+		}
+	}
+	else
+	{
+		for ( i = 0 ; i < 256 ; i++ ) {
+			RADEON_WRITE( RADEON_CP_ME_RAM_DATAH,
+				      radeon_cp_microcode[i][1] );
+			RADEON_WRITE( RADEON_CP_ME_RAM_DATAL,
+				      radeon_cp_microcode[i][0] );
+		}
 	}
 }
 
@@ -736,12 +1017,10 @@ static int radeon_do_init_cp( drm_device_t *dev, drm_radeon_init_t *init )
 		return DRM_ERR(EINVAL);
 	}
 
+	dev_priv->is_r200 = (init->func == RADEON_INIT_R200_CP);
+	dev_priv->do_boxes = 1;
 	dev_priv->cp_mode = init->cp_mode;
 
-	/* Simple idle check.
-	 */
-	atomic_set( &dev_priv->idle_count, 0 );
-
 	/* We don't support anything other than bus-mastering ring mode,
 	 * but the ring can be in either AGP or PCI space for the ring
 	 * read pointer.
@@ -1028,6 +1307,7 @@ int radeon_cp_init( DRM_IOCTL_ARGS )
 
 	switch ( init.func ) {
 	case RADEON_INIT_CP:
+	case RADEON_INIT_R200_CP:
 		return radeon_do_init_cp( dev, &init );
 	case RADEON_CLEANUP_CP:
 		return radeon_do_cleanup_cp( dev );
@@ -1169,6 +1449,14 @@ int radeon_fullscreen( DRM_IOCTL_ARGS )
  *   completed rendering.  
  *
  * KW:  It's also a good way to find free buffers quickly.
+ *
+ * KW: Ideally this loop wouldn't exist, and freelist_get wouldn't
+ * sleep.  However, bugs in older versions of radeon_accel.c mean that
+ * we essentially have to do this, else old clients will break.
+ * 
+ * However, it does leave open a potential deadlock where all the
+ * buffers are held by other clients, which can't release them because
+ * they can't get the lock.  
  */
 
 drm_buf_t *radeon_freelist_get( drm_device_t *dev )
@@ -1193,17 +1481,56 @@ drm_buf_t *radeon_freelist_get( drm_device_t *dev )
 			buf_priv = buf->dev_private;
 			if ( buf->pid == 0 || (buf->pending && 
 					       buf_priv->age <= done_age) ) {
+				dev_priv->stats.requested_bufs++;
 				buf->pending = 0;
 				return buf;
 			}
 			start = 0;
 		}
-		DRM_UDELAY( 1 );
+
+		if (t) {
+			DRM_UDELAY( 1 );
+			dev_priv->stats.freelist_loops++;
+		}
 	}
 
 	DRM_ERROR( "returning NULL!\n" );
 	return NULL;
 }
+#if 0
+drm_buf_t *radeon_freelist_get( drm_device_t *dev )
+{
+	drm_device_dma_t *dma = dev->dma;
+	drm_radeon_private_t *dev_priv = dev->dev_private;
+	drm_radeon_buf_priv_t *buf_priv;
+	drm_buf_t *buf;
+	int i, t;
+	int start;
+	u32 done_age = DRM_READ32(&dev_priv->scratch[1]);
+
+	if ( ++dev_priv->last_buf >= dma->buf_count )
+		dev_priv->last_buf = 0;
+
+	start = dev_priv->last_buf;
+	dev_priv->stats.freelist_loops++;
+	
+	for ( t = 0 ; t < 2 ; t++ ) {
+		for ( i = start ; i < dma->buf_count ; i++ ) {
+			buf = dma->buflist[i];
+			buf_priv = buf->dev_private;
+			if ( buf->pid == 0 || (buf->pending && 
+					       buf_priv->age <= done_age) ) {
+				dev_priv->stats.requested_bufs++;
+				buf->pending = 0;
+				return buf;
+			}
+		}
+		start = 0;
+	}
+
+	return NULL;
+}
+#endif
 
 void radeon_freelist_reset( drm_device_t *dev )
 {
@@ -1228,11 +1555,23 @@ int radeon_wait_ring( drm_radeon_private_t *dev_priv, int n )
 {
 	drm_radeon_ring_buffer_t *ring = &dev_priv->ring;
 	int i;
+	u32 last_head = GET_RING_HEAD(ring);
 
 	for ( i = 0 ; i < dev_priv->usec_timeout ; i++ ) {
-		radeon_update_ring_snapshot( ring );
+		u32 head = GET_RING_HEAD(ring);
+
+		ring->space = (head - ring->tail) * sizeof(u32);
+		if ( ring->space <= 0 )
+			ring->space += ring->size;
 		if ( ring->space > n )
 			return 0;
+		
+		dev_priv->stats.boxes |= RADEON_BOX_WAIT_IDLE;
+
+		if (head != last_head)
+			i = 0;
+		last_head = head;
+
 		DRM_UDELAY( 1 );
 	}
 
@@ -1251,7 +1590,7 @@ static int radeon_cp_get_buffers( drm_device_t *dev, drm_dma_t *d )
 
 	for ( i = d->granted_count ; i < d->request_count ; i++ ) {
 		buf = radeon_freelist_get( dev );
-		if ( !buf ) return DRM_ERR(EAGAIN);
+		if ( !buf ) return DRM_ERR(EBUSY); /* NOTE: broken client */
 
 		buf->pid = DRM_CURRENTPID;
 
diff --git a/shared-core/radeon_drm.h b/shared-core/radeon_drm.h
index 3802e46c..6469bfb8 100644
--- a/shared-core/radeon_drm.h
+++ b/shared-core/radeon_drm.h
@@ -89,7 +89,47 @@
 #define RADEON_EMIT_SE_ZBIAS_FACTOR                 18 /* zbias/2 */
 #define RADEON_EMIT_SE_TCL_OUTPUT_VTX_FMT           19 /* tcl/11 */
 #define RADEON_EMIT_SE_TCL_MATERIAL_EMMISSIVE_RED   20 /* material/17 */
-#define RADEON_MAX_STATE_PACKETS                    21
+#define R200_EMIT_PP_TXCBLEND_0                     21 /* tex0/4 */
+#define R200_EMIT_PP_TXCBLEND_1                     22 /* tex1/4 */
+#define R200_EMIT_PP_TXCBLEND_2                     23 /* tex2/4 */
+#define R200_EMIT_PP_TXCBLEND_3                     24 /* tex3/4 */
+#define R200_EMIT_PP_TXCBLEND_4                     25 /* tex4/4 */
+#define R200_EMIT_PP_TXCBLEND_5                     26 /* tex5/4 */
+#define R200_EMIT_PP_TXCBLEND_6                     27 /* /4 */
+#define R200_EMIT_PP_TXCBLEND_7                     28 /* /4 */
+#define R200_EMIT_TCL_LIGHT_MODEL_CTL_0             29 /* tcl/7 */
+#define R200_EMIT_TFACTOR_0                         30 /* tf/7 */
+#define R200_EMIT_VTX_FMT_0                         31 /* vtx/5 */
+#define R200_EMIT_VAP_CTL                           32 /* vap/1 */
+#define R200_EMIT_MATRIX_SELECT_0                   33 /* msl/5 */
+#define R200_EMIT_TEX_PROC_CTL_2                    34 /* tcg/5 */
+#define R200_EMIT_TCL_UCP_VERT_BLEND_CTL            35 /* tcl/1 */
+#define R200_EMIT_PP_TXFILTER_0                     36 /* tex0/6 */
+#define R200_EMIT_PP_TXFILTER_1                     37 /* tex1/6 */
+#define R200_EMIT_PP_TXFILTER_2                     38 /* tex2/6 */
+#define R200_EMIT_PP_TXFILTER_3                     39 /* tex3/6 */
+#define R200_EMIT_PP_TXFILTER_4                     40 /* tex4/6 */
+#define R200_EMIT_PP_TXFILTER_5                     41 /* tex5/6 */
+#define R200_EMIT_PP_TXOFFSET_0                     42 /* tex0/1 */
+#define R200_EMIT_PP_TXOFFSET_1                     43 /* tex1/1 */
+#define R200_EMIT_PP_TXOFFSET_2                     44 /* tex2/1 */
+#define R200_EMIT_PP_TXOFFSET_3                     45 /* tex3/1 */
+#define R200_EMIT_PP_TXOFFSET_4                     46 /* tex4/1 */
+#define R200_EMIT_PP_TXOFFSET_5                     47 /* tex5/1 */
+#define R200_EMIT_VTE_CNTL                          48 /* vte/1 */
+#define R200_EMIT_OUTPUT_VTX_COMP_SEL               49 /* vtx/1 */
+#define R200_EMIT_PP_TAM_DEBUG3                     50 /* tam/1 */
+#define R200_EMIT_PP_CNTL_X                         51 /* cst/1 */
+#define R200_EMIT_RB3D_DEPTHXY_OFFSET               52 /* cst/1 */
+#define R200_EMIT_RE_AUX_SCISSOR_CNTL               53 /* cst/1 */
+#define R200_EMIT_RE_SCISSOR_TL_0                   54 /* cst/2 */
+#define R200_EMIT_RE_SCISSOR_TL_1                   55 /* cst/2 */
+#define R200_EMIT_RE_SCISSOR_TL_2                   56 /* cst/2 */
+#define R200_EMIT_SE_VAP_CNTL_STATUS                57 /* cst/1 */
+#define R200_EMIT_SE_VTX_STATE_CNTL                 58 /* cst/1 */
+#define R200_EMIT_RE_POINTSIZE                      59 /* cst/1 */
+#define R200_EMIT_TCL_INPUT_VTX_VECTOR_ADDR_0       60 /* cst/4 */
+#define RADEON_MAX_STATE_PACKETS                    61
 
 
 /* Commands understood by cmd_buffer ioctl.  More can be added but
@@ -101,24 +141,25 @@
 #define RADEON_CMD_DMA_DISCARD 4 /* discard current dma buf */
 #define RADEON_CMD_PACKET3     5 /* emit hw packet */
 #define RADEON_CMD_PACKET3_CLIP 6 /* emit hw packet wrapped in cliprects */
+#define RADEON_CMD_SCALARS2     7 /* r200 stopgap */
 
 
 typedef union {
 	int i;
 	struct { 
-		char cmd_type, pad0, pad1, pad2;
+		unsigned char cmd_type, pad0, pad1, pad2;
 	} header;
 	struct { 
-		char cmd_type, packet_id, pad0, pad1;
+		unsigned char cmd_type, packet_id, pad0, pad1;
 	} packet;
 	struct { 
-		char cmd_type, offset, stride, count; 
+		unsigned char cmd_type, offset, stride, count; 
 	} scalars;
 	struct { 
-		char cmd_type, offset, stride, count; 
+		unsigned char cmd_type, offset, stride, count; 
 	} vectors;
 	struct { 
-		char cmd_type, buf_idx, pad0, pad1; 
+		unsigned char cmd_type, buf_idx, pad0, pad1; 
 	} dma;
 } drm_radeon_cmd_header_t;
 
@@ -327,7 +368,8 @@ typedef struct {
 typedef struct drm_radeon_init {
 	enum {
 		RADEON_INIT_CP    = 0x01,
-		RADEON_CLEANUP_CP = 0x02
+		RADEON_CLEANUP_CP = 0x02,
+		RADEON_INIT_R200_CP = 0x03,	
 	} func;
 	unsigned long sarea_priv_offset;
 	int is_pci;
diff --git a/shared-core/radeon_drv.h b/shared-core/radeon_drv.h
index 15c0d4dd..7c341b39 100644
--- a/shared-core/radeon_drv.h
+++ b/shared-core/radeon_drv.h
@@ -79,12 +79,25 @@ typedef struct drm_radeon_private {
 	int writeback_works;
 
 	int usec_timeout;
+
+	int is_r200;
+
 	int is_pci;
 	unsigned long phys_pci_gart;
 	dma_addr_t bus_pci_gart;
 
-	atomic_t idle_count;
-
+	struct {
+		u32 boxes;
+		int freelist_timeouts;
+		int freelist_loops;
+		int requested_bufs;
+		int last_frame_reads;
+		int last_clear_reads;
+		int clears;
+		int texture_uploads;
+	} stats;
+
+	int do_boxes;
 	int page_flipping;
 	int current_page;
 	u32 crtc_offset;
@@ -134,14 +147,6 @@ extern drm_buf_t *radeon_freelist_get( drm_device_t *dev );
 
 extern int radeon_wait_ring( drm_radeon_private_t *dev_priv, int n );
 
-static __inline__ void
-radeon_update_ring_snapshot( drm_radeon_ring_buffer_t *ring )
-{
-	ring->space = (GET_RING_HEAD(ring) - ring->tail) * sizeof(u32);
-	if ( ring->space <= 0 )
-		ring->space += ring->size;
-}
-
 extern int radeon_do_cp_idle( drm_radeon_private_t *dev_priv );
 extern int radeon_do_cleanup_cp( drm_device_t *dev );
 extern int radeon_do_cleanup_pageflip( drm_device_t *dev );
@@ -159,6 +164,14 @@ extern int radeon_cp_cmdbuf( DRM_IOCTL_ARGS );
 extern int radeon_cp_getparam( DRM_IOCTL_ARGS );
 extern int radeon_cp_flip( DRM_IOCTL_ARGS );
 
+/* Flags for stats.boxes
+ */
+#define RADEON_BOX_DMA_IDLE      0x1
+#define RADEON_BOX_RING_FULL     0x2
+#define RADEON_BOX_FLIP          0x4
+#define RADEON_BOX_WAIT_IDLE     0x8
+#define RADEON_BOX_TEXTURE_LOAD  0x10
+
 
 
 /* Register definitions, register access macros and drmAddMap constants
@@ -282,6 +295,7 @@ extern int radeon_cp_flip( DRM_IOCTL_ARGS );
 #	define RADEON_STENCIL_ENABLE		(1 << 7)
 #	define RADEON_Z_ENABLE			(1 << 8)
 #define RADEON_RB3D_DEPTHOFFSET		0x1c24
+#define RADEON_RB3D_DEPTHPITCH		0x1c28
 #define RADEON_RB3D_PLANEMASK		0x1d84
 #define RADEON_RB3D_STENCILREFMASK	0x1d7c
 #define RADEON_RB3D_ZCACHE_MODE		0x3250
@@ -513,6 +527,62 @@ extern int radeon_cp_flip( DRM_IOCTL_ARGS );
 #define RADEON_TXFORMAT_ARGB8888	6
 #define RADEON_TXFORMAT_RGBA8888	7
 
+#define R200_PP_TXCBLEND_0                0x2f00
+#define R200_PP_TXCBLEND_1                0x2f10
+#define R200_PP_TXCBLEND_2                0x2f20
+#define R200_PP_TXCBLEND_3                0x2f30
+#define R200_PP_TXCBLEND_4                0x2f40
+#define R200_PP_TXCBLEND_5                0x2f50
+#define R200_PP_TXCBLEND_6                0x2f60
+#define R200_PP_TXCBLEND_7                0x2f70
+#define R200_SE_TCL_LIGHT_MODEL_CTL_0     0x2268 
+#define R200_PP_TFACTOR_0                 0x2ee0
+#define R200_SE_VTX_FMT_0                 0x2088
+#define R200_SE_VAP_CNTL                  0x2080
+#define R200_SE_TCL_MATRIX_SEL_0          0x2230
+#define R200_SE_TCL_TEX_PROC_CTL_2        0x22a8 
+#define R200_SE_TCL_UCP_VERT_BLEND_CTL    0x22c0 
+#define R200_PP_TXFILTER_5                0x2ca0 
+#define R200_PP_TXFILTER_4                0x2c80 
+#define R200_PP_TXFILTER_3                0x2c60 
+#define R200_PP_TXFILTER_2                0x2c40 
+#define R200_PP_TXFILTER_1                0x2c20 
+#define R200_PP_TXFILTER_0                0x2c00 
+#define R200_PP_TXOFFSET_5                0x2d78
+#define R200_PP_TXOFFSET_4                0x2d60
+#define R200_PP_TXOFFSET_3                0x2d48
+#define R200_PP_TXOFFSET_2                0x2d30
+#define R200_PP_TXOFFSET_1                0x2d18
+#define R200_PP_TXOFFSET_0                0x2d00
+#define R200_RE_AUX_SCISSOR_CNTL          0x26f0
+#define R200_SE_VTE_CNTL                  0x20b0
+#define R200_SE_TCL_OUTPUT_VTX_COMP_SEL   0x2250
+#define R200_PP_TAM_DEBUG3                0x2d9c
+#define R200_PP_CNTL_X                    0x2cc4
+#define R200_SE_VAP_CNTL_STATUS           0x2140
+#define R200_RE_SCISSOR_TL_0              0x1cd8
+#define R200_RE_SCISSOR_TL_1              0x1ce0
+#define R200_RE_SCISSOR_TL_2              0x1ce8
+#define R200_RB3D_DEPTHXY_OFFSET          0x1d60 
+#define R200_RE_AUX_SCISSOR_CNTL          0x26f0
+#define R200_SE_VTX_STATE_CNTL            0x2180
+#define R200_RE_POINTSIZE                 0x2648
+#define R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0 0x2254
+
+
+#define SE_VAP_CNTL__TCL_ENA_MASK                          0x00000001
+#define SE_VAP_CNTL__FORCE_W_TO_ONE_MASK                   0x00010000
+#define SE_VAP_CNTL__VF_MAX_VTX_NUM__SHIFT                 0x00000012
+#define SE_VTE_CNTL__VTX_XY_FMT_MASK                       0x00000100
+#define SE_VTE_CNTL__VTX_Z_FMT_MASK                        0x00000200
+#define SE_VTX_FMT_0__VTX_Z0_PRESENT_MASK                  0x00000001
+#define SE_VTX_FMT_0__VTX_W0_PRESENT_MASK                  0x00000002
+#define SE_VTX_FMT_0__VTX_COLOR_0_FMT__SHIFT               0x0000000b
+#define R200_3D_DRAW_IMMD_2      0xC0003500
+#define R200_SE_VTX_FMT_1                 0x208c
+#define R200_RE_CNTL                      0x1c50 
+
+
 /* Constants */
 #define RADEON_MAX_USEC_TIMEOUT		100000	/* 100 ms */
 
@@ -620,30 +690,16 @@ do {									\
 	}								\
 } while (0)
 
+
+/* Perfbox functionality only.  
+ */
 #define RING_SPACE_TEST_WITH_RETURN( dev_priv )				\
 do {									\
-	drm_radeon_ring_buffer_t *ring = &dev_priv->ring; int i;	\
-	if ( ring->space < ring->high_mark ) {				\
-		for ( i = 0 ; i < dev_priv->usec_timeout ; i++ ) {	\
-			radeon_update_ring_snapshot( ring );		\
-			if ( ring->space >= ring->high_mark )		\
-				goto __ring_space_done;			\
-			DRM_UDELAY( 1 );				\
-		}							\
-		DRM_ERROR( "ring space check from memory failed, reading register...\n" );	\
-		/* If ring space check fails from RAM, try reading the	\
-		   register directly */					\
-		ring->space = 4 * ( RADEON_READ( RADEON_CP_RB_RPTR ) - ring->tail );	\
-		if ( ring->space <= 0 )					\
-			ring->space += ring->size;			\
-		if ( ring->space >= ring->high_mark )			\
-			goto __ring_space_done;				\
-									\
-		DRM_ERROR( "ring space check failed!\n" );		\
-		return DRM_ERR(EBUSY);				\
+	if (!(dev_priv->stats.boxes & RADEON_BOX_DMA_IDLE)) {		\
+		u32 head = GET_RING_HEAD(&dev_priv->ring);		\
+		if (head == dev_priv->ring.tail)			\
+			dev_priv->stats.boxes |= RADEON_BOX_DMA_IDLE;	\
 	}								\
- __ring_space_done:							\
-	;								\
 } while (0)
 
 #define VB_AGE_TEST_WITH_RETURN( dev_priv )				\
@@ -710,16 +766,15 @@ do {									\
 	}								\
 	if (((dev_priv->ring.tail + _nr) & mask) != write) {		\
 		DRM_ERROR( 						\
-			"ADVANCE_RING(): mismatch: nr: %x write: %x\n",	\
+			"ADVANCE_RING(): mismatch: nr: %x write: %x line: %d\n",	\
 			((dev_priv->ring.tail + _nr) & mask),		\
-			write);						\
+			write, __LINE__);						\
 	} else								\
 		dev_priv->ring.tail = write;				\
 } while (0)
 
 #define COMMIT_RING() do {					    \
-	radeon_flush_write_combine();					\
-	RADEON_WRITE( RADEON_CP_RB_WPTR, dev_priv->ring.tail );		\
+	RADEON_WRITE( RADEON_CP_RB_WPTR, dev_priv->ring.tail );		    \
 } while (0)
 
 #define OUT_RING( x ) do {						\
@@ -760,6 +815,4 @@ do {									\
 } while (0)
 
 
-#define RADEON_PERFORMANCE_BOXES	0
-
 #endif /* __RADEON_DRV_H__ */
diff --git a/shared-core/radeon_state.c b/shared-core/radeon_state.c
index 1cc6bde8..7f84e739 100644
--- a/shared-core/radeon_state.c
+++ b/shared-core/radeon_state.c
@@ -239,18 +239,50 @@ static struct {
 	{ RADEON_SE_ZBIAS_FACTOR,2,"RADEON_SE_ZBIAS_FACTOR" },
 	{ RADEON_SE_TCL_OUTPUT_VTX_FMT,11,"RADEON_SE_TCL_OUTPUT_VTX_FMT" },
 	{ RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED,17,"RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED" },
+	{ R200_PP_TXCBLEND_0, 4, "R200_PP_TXCBLEND_0" },
+	{ R200_PP_TXCBLEND_1, 4, "R200_PP_TXCBLEND_1" },
+	{ R200_PP_TXCBLEND_2, 4, "R200_PP_TXCBLEND_2" },
+	{ R200_PP_TXCBLEND_3, 4, "R200_PP_TXCBLEND_3" },
+	{ R200_PP_TXCBLEND_4, 4, "R200_PP_TXCBLEND_4" },
+	{ R200_PP_TXCBLEND_5, 4, "R200_PP_TXCBLEND_5" },
+	{ R200_PP_TXCBLEND_6, 4, "R200_PP_TXCBLEND_6" },
+	{ R200_PP_TXCBLEND_7, 4, "R200_PP_TXCBLEND_7" },
+	{ R200_SE_TCL_LIGHT_MODEL_CTL_0, 6, "R200_SE_TCL_LIGHT_MODEL_CTL_0" },
+	{ R200_PP_TFACTOR_0, 6, "R200_PP_TFACTOR_0" },
+	{ R200_SE_VTX_FMT_0, 4, "R200_SE_VTX_FMT_0" },
+	{ R200_SE_VAP_CNTL, 1, "R200_SE_VAP_CNTL" },
+	{ R200_SE_TCL_MATRIX_SEL_0, 5, "R200_SE_TCL_MATRIX_SEL_0" },
+	{ R200_SE_TCL_TEX_PROC_CTL_2, 5, "R200_SE_TCL_TEX_PROC_CTL_2" },
+	{ R200_SE_TCL_UCP_VERT_BLEND_CTL, 1, "R200_SE_TCL_UCP_VERT_BLEND_CTL" },
+	{ R200_PP_TXFILTER_0, 6, "R200_PP_TXFILTER_0" },
+	{ R200_PP_TXFILTER_1, 6, "R200_PP_TXFILTER_1" },
+	{ R200_PP_TXFILTER_2, 6, "R200_PP_TXFILTER_2" },
+	{ R200_PP_TXFILTER_3, 6, "R200_PP_TXFILTER_3" },
+	{ R200_PP_TXFILTER_4, 6, "R200_PP_TXFILTER_4" },
+	{ R200_PP_TXFILTER_5, 6, "R200_PP_TXFILTER_5" },
+	{ R200_PP_TXOFFSET_0, 1, "R200_PP_TXOFFSET_0" },
+	{ R200_PP_TXOFFSET_1, 1, "R200_PP_TXOFFSET_1" },
+	{ R200_PP_TXOFFSET_2, 1, "R200_PP_TXOFFSET_2" },
+	{ R200_PP_TXOFFSET_3, 1, "R200_PP_TXOFFSET_3" },
+	{ R200_PP_TXOFFSET_4, 1, "R200_PP_TXOFFSET_4" },
+	{ R200_PP_TXOFFSET_5, 1, "R200_PP_TXOFFSET_5" },
+	{ R200_SE_VTE_CNTL, 1, "R200_SE_VTE_CNTL" },
+	{ R200_SE_TCL_OUTPUT_VTX_COMP_SEL, 1, "R200_SE_TCL_OUTPUT_VTX_COMP_SEL" },
+	{ R200_PP_TAM_DEBUG3, 1, "R200_PP_TAM_DEBUG3" },
+	{ R200_PP_CNTL_X, 1, "R200_PP_CNTL_X" }, 
+	{ R200_RB3D_DEPTHXY_OFFSET, 1, "R200_RB3D_DEPTHXY_OFFSET" }, 
+	{ R200_RE_AUX_SCISSOR_CNTL, 1, "R200_RE_AUX_SCISSOR_CNTL" }, 
+	{ R200_RE_SCISSOR_TL_0, 2, "R200_RE_SCISSOR_TL_0" }, 
+	{ R200_RE_SCISSOR_TL_1, 2, "R200_RE_SCISSOR_TL_1" }, 
+	{ R200_RE_SCISSOR_TL_2, 2, "R200_RE_SCISSOR_TL_2" }, 
+	{ R200_SE_VAP_CNTL_STATUS, 1, "R200_SE_VAP_CNTL_STATUS" }, 
+	{ R200_SE_VTX_STATE_CNTL, 1, "R200_SE_VTX_STATE_CNTL" }, 
+	{ R200_RE_POINTSIZE, 1, "R200_RE_POINTSIZE" }, 
+	{ R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0, 4, "R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0" },
 };
 
 
 
-
-
-
-
-
-
-
-#if RADEON_PERFORMANCE_BOXES
 /* ================================================================
  * Performance monitoring functions
  */
@@ -259,10 +291,12 @@ static void radeon_clear_box( drm_radeon_private_t *dev_priv,
 			      int x, int y, int w, int h,
 			      int r, int g, int b )
 {
-	u32 pitch, offset;
 	u32 color;
 	RING_LOCALS;
 
+	x += dev_priv->sarea_priv->boxes[0].x1;
+	y += dev_priv->sarea_priv->boxes[0].y1;
+
 	switch ( dev_priv->color_fmt ) {
 	case RADEON_COLOR_FORMAT_RGB565:
 		color = (((r & 0xf8) << 8) |
@@ -275,8 +309,11 @@ static void radeon_clear_box( drm_radeon_private_t *dev_priv,
 		break;
 	}
 
-	offset = dev_priv->back_offset;
-	pitch = dev_priv->back_pitch >> 3;
+	BEGIN_RING( 4 );
+	RADEON_WAIT_UNTIL_3D_IDLE();		
+	OUT_RING( CP_PACKET0( RADEON_DP_WRITE_MASK, 0 ) );
+	OUT_RING( 0xffffffff );
+	ADVANCE_RING();
 
 	BEGIN_RING( 6 );
 
@@ -288,7 +325,12 @@ static void radeon_clear_box( drm_radeon_private_t *dev_priv,
 		  RADEON_ROP3_P |
 		  RADEON_GMC_CLR_CMP_CNTL_DIS );
 
-	OUT_RING( (pitch << 22) | (offset >> 5) );
+ 	if ( dev_priv->page_flipping && dev_priv->current_page == 1 ) { 
+		OUT_RING( dev_priv->front_pitch_offset );
+ 	} else {	 
+		OUT_RING( dev_priv->back_pitch_offset );
+ 	} 
+
 	OUT_RING( color );
 
 	OUT_RING( (x << 16) | y );
@@ -299,16 +341,57 @@ static void radeon_clear_box( drm_radeon_private_t *dev_priv,
 
 static void radeon_cp_performance_boxes( drm_radeon_private_t *dev_priv )
 {
-	if ( atomic_read( &dev_priv->idle_count ) == 0 ) {
-		radeon_clear_box( dev_priv, 64, 4, 8, 8, 0, 255, 0 );
-	} else {
-		atomic_set( &dev_priv->idle_count, 0 );
+	/* Collapse various things into a wait flag -- trying to
+	 * guess if userspase slept -- better just to have them tell us.
+	 */
+	if (dev_priv->stats.last_frame_reads > 1 ||
+	    dev_priv->stats.last_clear_reads > dev_priv->stats.clears) {
+		dev_priv->stats.boxes |= RADEON_BOX_WAIT_IDLE;
 	}
-}
 
-#endif
+	if (dev_priv->stats.freelist_loops) {
+		dev_priv->stats.boxes |= RADEON_BOX_WAIT_IDLE;
+	}
 
+	/* Purple box for page flipping
+	 */
+	if ( dev_priv->stats.boxes & RADEON_BOX_FLIP ) 
+		radeon_clear_box( dev_priv, 4, 4, 8, 8, 255, 0, 255 );
+
+	/* Red box if we have to wait for idle at any point
+	 */
+	if ( dev_priv->stats.boxes & RADEON_BOX_WAIT_IDLE ) 
+		radeon_clear_box( dev_priv, 16, 4, 8, 8, 255, 0, 0 );
+
+	/* Blue box: lost context?
+	 */
 
+	/* Yellow box for texture swaps
+	 */
+	if ( dev_priv->stats.boxes & RADEON_BOX_TEXTURE_LOAD ) 
+		radeon_clear_box( dev_priv, 40, 4, 8, 8, 255, 255, 0 );
+
+	/* Green box if hardware never idles (as far as we can tell)
+	 */
+	if ( !(dev_priv->stats.boxes & RADEON_BOX_DMA_IDLE) ) 
+		radeon_clear_box( dev_priv, 64, 4, 8, 8, 0, 255, 0 );
+
+
+	/* Draw bars indicating number of buffers allocated 
+	 * (not a great measure, easily confused)
+	 */
+	if (dev_priv->stats.requested_bufs) {
+		if (dev_priv->stats.requested_bufs > 100)
+			dev_priv->stats.requested_bufs = 100;
+
+		radeon_clear_box( dev_priv, 4, 16,  
+				  dev_priv->stats.requested_bufs, 4,
+				  196, 128, 128 );
+	}
+
+	memset( &dev_priv->stats, 0, sizeof(dev_priv->stats) );
+
+}
 /* ================================================================
  * CP command dispatch functions
  */
@@ -328,6 +411,8 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev,
 	RING_LOCALS;
 	DRM_DEBUG( "flags = 0x%x\n", flags );
 
+	dev_priv->stats.clears++;
+
 	if ( dev_priv->page_flipping && dev_priv->current_page == 1 ) {
 		unsigned int tmp = flags;
 
@@ -336,120 +421,251 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev,
 		if ( tmp & RADEON_BACK )  flags |= RADEON_FRONT;
 	}
 
+	if ( flags & (RADEON_FRONT | RADEON_BACK) ) {
+
+		BEGIN_RING( 4 );
+
+		/* Ensure the 3D stream is idle before doing a
+		 * 2D fill to clear the front or back buffer.
+		 */
+		RADEON_WAIT_UNTIL_3D_IDLE();
+		
+		OUT_RING( CP_PACKET0( RADEON_DP_WRITE_MASK, 0 ) );
+		OUT_RING( clear->color_mask );
+
+		ADVANCE_RING();
+
+		/* Make sure we restore the 3D state next time.
+		 */
+		dev_priv->sarea_priv->ctx_owner = 0;
+
+		for ( i = 0 ; i < nbox ; i++ ) {
+			int x = pbox[i].x1;
+			int y = pbox[i].y1;
+			int w = pbox[i].x2 - x;
+			int h = pbox[i].y2 - y;
+
+			DRM_DEBUG( "dispatch clear %d,%d-%d,%d flags 0x%x\n",
+				   x, y, w, h, flags );
+
+			if ( flags & RADEON_FRONT ) {
+				BEGIN_RING( 6 );
+				
+				OUT_RING( CP_PACKET3( RADEON_CNTL_PAINT_MULTI, 4 ) );
+				OUT_RING( RADEON_GMC_DST_PITCH_OFFSET_CNTL |
+					  RADEON_GMC_BRUSH_SOLID_COLOR |
+					  (dev_priv->color_fmt << 8) |
+					  RADEON_GMC_SRC_DATATYPE_COLOR |
+					  RADEON_ROP3_P |
+					  RADEON_GMC_CLR_CMP_CNTL_DIS );
+
+				OUT_RING( dev_priv->front_pitch_offset );
+				OUT_RING( clear->clear_color );
+				
+				OUT_RING( (x << 16) | y );
+				OUT_RING( (w << 16) | h );
+				
+				ADVANCE_RING();
+			}
+			
+			if ( flags & RADEON_BACK ) {
+				BEGIN_RING( 6 );
+				
+				OUT_RING( CP_PACKET3( RADEON_CNTL_PAINT_MULTI, 4 ) );
+				OUT_RING( RADEON_GMC_DST_PITCH_OFFSET_CNTL |
+					  RADEON_GMC_BRUSH_SOLID_COLOR |
+					  (dev_priv->color_fmt << 8) |
+					  RADEON_GMC_SRC_DATATYPE_COLOR |
+					  RADEON_ROP3_P |
+					  RADEON_GMC_CLR_CMP_CNTL_DIS );
+				
+				OUT_RING( dev_priv->back_pitch_offset );
+				OUT_RING( clear->clear_color );
+
+				OUT_RING( (x << 16) | y );
+				OUT_RING( (w << 16) | h );
+
+				ADVANCE_RING();
+			}
+		}
+	}
+
 	/* We have to clear the depth and/or stencil buffers by
 	 * rendering a quad into just those buffers.  Thus, we have to
 	 * make sure the 3D engine is configured correctly.
 	 */
-	if ( flags & (RADEON_DEPTH | RADEON_STENCIL) ) {
-		rb3d_cntl = depth_clear->rb3d_cntl;
+	if ( dev_priv->is_r200 &&
+	     (flags & (RADEON_DEPTH | RADEON_STENCIL)) ) {
 
-		if ( flags & RADEON_DEPTH ) {
-			rb3d_cntl |=  RADEON_Z_ENABLE;
-		} else {
-			rb3d_cntl &= ~RADEON_Z_ENABLE;
-		}
+		int tempPP_CNTL;
+		int tempRE_CNTL;
+		int tempRB3D_CNTL;
+		int tempRB3D_ZSTENCILCNTL;
+		int tempRB3D_STENCILREFMASK;
+		int tempRB3D_PLANEMASK;
+		int tempSE_CNTL;
+		int tempSE_VTE_CNTL;
+		int tempSE_VTX_FMT_0;
+		int tempSE_VTX_FMT_1;
+		int tempSE_VAP_CNTL;
+		int tempRE_AUX_SCISSOR_CNTL;
 
-		if ( flags & RADEON_STENCIL ) {
-			rb3d_cntl |=  RADEON_STENCIL_ENABLE;
-			rb3d_stencilrefmask = clear->depth_mask; /* misnamed field */
-		} else {
-			rb3d_cntl &= ~RADEON_STENCIL_ENABLE;
-			rb3d_stencilrefmask = 0x00000000;
-		}
-	}
+		tempPP_CNTL = 0;
+		tempRE_CNTL = 0;
 
-	for ( i = 0 ; i < nbox ; i++ ) {
-		int x = pbox[i].x1;
-		int y = pbox[i].y1;
-		int w = pbox[i].x2 - x;
-		int h = pbox[i].y2 - y;
+		tempRB3D_CNTL = depth_clear->rb3d_cntl;
+		tempRB3D_CNTL &= ~(1<<15); /* unset radeon magic flag */
 
-		DRM_DEBUG( "dispatch clear %d,%d-%d,%d flags 0x%x\n",
-			   x, y, w, h, flags );
+		tempRB3D_ZSTENCILCNTL = depth_clear->rb3d_zstencilcntl;
+		tempRB3D_STENCILREFMASK = 0x0;
 
-		if ( flags & (RADEON_FRONT | RADEON_BACK) ) {
-			BEGIN_RING( 4 );
+		tempSE_CNTL = depth_clear->se_cntl;
 
-			/* Ensure the 3D stream is idle before doing a
-			 * 2D fill to clear the front or back buffer.
-			 */
-			RADEON_WAIT_UNTIL_3D_IDLE();
 
-			OUT_RING( CP_PACKET0( RADEON_DP_WRITE_MASK, 0 ) );
-			OUT_RING( clear->color_mask );
 
-			ADVANCE_RING();
+		/* Disable TCL */
 
-			/* Make sure we restore the 3D state next time.
-			 */
-			dev_priv->sarea_priv->ctx_owner = 0;
-		}
+		tempSE_VAP_CNTL = (/* SE_VAP_CNTL__FORCE_W_TO_ONE_MASK |  */
+				   (0x9 << SE_VAP_CNTL__VF_MAX_VTX_NUM__SHIFT));
 
-		if ( flags & RADEON_FRONT ) {
-			BEGIN_RING( 6 );
+		tempRB3D_PLANEMASK = 0x0;
 
-			OUT_RING( CP_PACKET3( RADEON_CNTL_PAINT_MULTI, 4 ) );
-			OUT_RING( RADEON_GMC_DST_PITCH_OFFSET_CNTL |
-				  RADEON_GMC_BRUSH_SOLID_COLOR |
-				  (dev_priv->color_fmt << 8) |
-				  RADEON_GMC_SRC_DATATYPE_COLOR |
-				  RADEON_ROP3_P |
-				  RADEON_GMC_CLR_CMP_CNTL_DIS );
+		tempRE_AUX_SCISSOR_CNTL = 0x0;
 
-			OUT_RING( dev_priv->front_pitch_offset );
-			OUT_RING( clear->clear_color );
+		tempSE_VTE_CNTL =
+			SE_VTE_CNTL__VTX_XY_FMT_MASK |
+			SE_VTE_CNTL__VTX_Z_FMT_MASK;
 
-			OUT_RING( (x << 16) | y );
-			OUT_RING( (w << 16) | h );
+		/* Vertex format (X, Y, Z, W)*/
+		tempSE_VTX_FMT_0 =
+			SE_VTX_FMT_0__VTX_Z0_PRESENT_MASK |
+			SE_VTX_FMT_0__VTX_W0_PRESENT_MASK;
+		tempSE_VTX_FMT_1 = 0x0;
 
-			ADVANCE_RING();
+
+		/* 
+		 * Depth buffer specific enables 
+		 */
+		if (flags & RADEON_DEPTH) {
+			/* Enable depth buffer */
+			tempRB3D_CNTL |= RADEON_Z_ENABLE;
+		} else {
+			/* Disable depth buffer */
+			tempRB3D_CNTL &= ~RADEON_Z_ENABLE;
 		}
 
-		if ( flags & RADEON_BACK ) {
-			BEGIN_RING( 6 );
+		/* 
+		 * Stencil buffer specific enables
+		 */
+		if ( flags & RADEON_STENCIL ) {
+			tempRB3D_CNTL |=  RADEON_STENCIL_ENABLE;
+			tempRB3D_STENCILREFMASK = clear->depth_mask; 
+		} else {
+			tempRB3D_CNTL &= ~RADEON_STENCIL_ENABLE;
+			tempRB3D_STENCILREFMASK = 0x00000000;
+		}
 
-			OUT_RING( CP_PACKET3( RADEON_CNTL_PAINT_MULTI, 4 ) );
-			OUT_RING( RADEON_GMC_DST_PITCH_OFFSET_CNTL |
-				  RADEON_GMC_BRUSH_SOLID_COLOR |
-				  (dev_priv->color_fmt << 8) |
-				  RADEON_GMC_SRC_DATATYPE_COLOR |
-				  RADEON_ROP3_P |
-				  RADEON_GMC_CLR_CMP_CNTL_DIS );
+		BEGIN_RING( 26 );
+		RADEON_WAIT_UNTIL_2D_IDLE();
+
+		OUT_RING_REG( RADEON_PP_CNTL, tempPP_CNTL );
+		OUT_RING_REG( R200_RE_CNTL, tempRE_CNTL );
+		OUT_RING_REG( RADEON_RB3D_CNTL, tempRB3D_CNTL );
+		OUT_RING_REG( RADEON_RB3D_ZSTENCILCNTL,
+			      tempRB3D_ZSTENCILCNTL );
+		OUT_RING_REG( RADEON_RB3D_STENCILREFMASK, 
+			      tempRB3D_STENCILREFMASK );
+		OUT_RING_REG( RADEON_RB3D_PLANEMASK, tempRB3D_PLANEMASK );
+		OUT_RING_REG( RADEON_SE_CNTL, tempSE_CNTL );
+		OUT_RING_REG( R200_SE_VTE_CNTL, tempSE_VTE_CNTL );
+		OUT_RING_REG( R200_SE_VTX_FMT_0, tempSE_VTX_FMT_0 );
+		OUT_RING_REG( R200_SE_VTX_FMT_1, tempSE_VTX_FMT_1 );
+		OUT_RING_REG( R200_SE_VAP_CNTL, tempSE_VAP_CNTL );
+		OUT_RING_REG( R200_RE_AUX_SCISSOR_CNTL, 
+			      tempRE_AUX_SCISSOR_CNTL );
+		ADVANCE_RING();
 
-			OUT_RING( dev_priv->back_pitch_offset );
-			OUT_RING( clear->clear_color );
+		/* Make sure we restore the 3D state next time.
+		 */
+		dev_priv->sarea_priv->ctx_owner = 0;
 
-			OUT_RING( (x << 16) | y );
-			OUT_RING( (w << 16) | h );
+		for ( i = 0 ; i < nbox ; i++ ) {
+			
+			/* Funny that this should be required -- 
+			 *  sets top-left?
+			 */
+			radeon_emit_clip_rect( dev_priv,
+					       &sarea_priv->boxes[i] );
 
+			BEGIN_RING( 14 );
+			OUT_RING( CP_PACKET3( R200_3D_DRAW_IMMD_2, 12 ) );
+			OUT_RING( (RADEON_PRIM_TYPE_RECT_LIST |
+				   RADEON_PRIM_WALK_RING |
+				   (3 << RADEON_NUM_VERTICES_SHIFT)) );
+			OUT_RING( depth_boxes[i].ui[CLEAR_X1] );
+			OUT_RING( depth_boxes[i].ui[CLEAR_Y1] );
+			OUT_RING( depth_boxes[i].ui[CLEAR_DEPTH] );
+			OUT_RING( 0x3f800000 );
+			OUT_RING( depth_boxes[i].ui[CLEAR_X1] );
+			OUT_RING( depth_boxes[i].ui[CLEAR_Y2] );
+			OUT_RING( depth_boxes[i].ui[CLEAR_DEPTH] );
+			OUT_RING( 0x3f800000 );
+			OUT_RING( depth_boxes[i].ui[CLEAR_X2] );
+			OUT_RING( depth_boxes[i].ui[CLEAR_Y2] );
+			OUT_RING( depth_boxes[i].ui[CLEAR_DEPTH] );
+			OUT_RING( 0x3f800000 );
 			ADVANCE_RING();
 		}
+	} 
+	else if ( (flags & (RADEON_DEPTH | RADEON_STENCIL)) ) {
 
-		if ( flags & (RADEON_DEPTH | RADEON_STENCIL) ) {
+		rb3d_cntl = depth_clear->rb3d_cntl;
 
-			radeon_emit_clip_rect( dev_priv,
-					       &sarea_priv->boxes[i] );
+		if ( flags & RADEON_DEPTH ) {
+			rb3d_cntl |=  RADEON_Z_ENABLE;
+		} else {
+			rb3d_cntl &= ~RADEON_Z_ENABLE;
+		}
 
-			BEGIN_RING( 28 );
+		if ( flags & RADEON_STENCIL ) {
+			rb3d_cntl |=  RADEON_STENCIL_ENABLE;
+			rb3d_stencilrefmask = clear->depth_mask; /* misnamed field */
+		} else {
+			rb3d_cntl &= ~RADEON_STENCIL_ENABLE;
+			rb3d_stencilrefmask = 0x00000000;
+		}
 
-			RADEON_WAIT_UNTIL_2D_IDLE();
+		BEGIN_RING( 13 );
+		RADEON_WAIT_UNTIL_2D_IDLE();
 
-			OUT_RING( CP_PACKET0( RADEON_PP_CNTL, 1 ) );
-			OUT_RING( 0x00000000 );
-			OUT_RING( rb3d_cntl );
+		OUT_RING( CP_PACKET0( RADEON_PP_CNTL, 1 ) );
+		OUT_RING( 0x00000000 );
+		OUT_RING( rb3d_cntl );
+		
+		OUT_RING_REG( RADEON_RB3D_ZSTENCILCNTL,
+			      depth_clear->rb3d_zstencilcntl );
+		OUT_RING_REG( RADEON_RB3D_STENCILREFMASK,
+			      rb3d_stencilrefmask );
+		OUT_RING_REG( RADEON_RB3D_PLANEMASK,
+			      0x00000000 );
+		OUT_RING_REG( RADEON_SE_CNTL,
+			      depth_clear->se_cntl );
+		ADVANCE_RING();
 
-			OUT_RING_REG( RADEON_RB3D_ZSTENCILCNTL,
-				      depth_clear->rb3d_zstencilcntl );
-			OUT_RING_REG( RADEON_RB3D_STENCILREFMASK,
-				      rb3d_stencilrefmask );
-			OUT_RING_REG( RADEON_RB3D_PLANEMASK,
-				      0x00000000 );
-			OUT_RING_REG( RADEON_SE_CNTL,
-				      depth_clear->se_cntl );
+		/* Make sure we restore the 3D state next time.
+		 */
+		dev_priv->sarea_priv->ctx_owner = 0;
 
-			/* Radeon 7500 doesn't like vertices without
-			 * color.
+		for ( i = 0 ; i < nbox ; i++ ) {
+			
+			/* Funny that this should be required -- 
+			 *  sets top-left?
 			 */
+			radeon_emit_clip_rect( dev_priv,
+					       &sarea_priv->boxes[i] );
+
+			BEGIN_RING( 15 );
+
 			OUT_RING( CP_PACKET3( RADEON_3D_DRAW_IMMD, 13 ) );
 			OUT_RING( RADEON_VTX_Z_PRESENT |
 				  RADEON_VTX_PKCOLOR_PRESENT);
@@ -459,6 +675,7 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev,
 				   RADEON_VTX_FMT_RADEON_MODE |
 				   (3 << RADEON_NUM_VERTICES_SHIFT)) );
 
+
 			OUT_RING( depth_boxes[i].ui[CLEAR_X1] );
 			OUT_RING( depth_boxes[i].ui[CLEAR_Y1] );
 			OUT_RING( depth_boxes[i].ui[CLEAR_DEPTH] );
@@ -475,10 +692,6 @@ static void radeon_cp_dispatch_clear( drm_device_t *dev,
 			OUT_RING( 0x0 );
 
 			ADVANCE_RING();
-
-			/* Make sure we restore the 3D state next time.
-			 */
-			dev_priv->sarea_priv->ctx_owner = 0;
 		}
 	}
 
@@ -506,11 +719,12 @@ static void radeon_cp_dispatch_swap( drm_device_t *dev )
 	RING_LOCALS;
 	DRM_DEBUG( "\n" );
 
-#if RADEON_PERFORMANCE_BOXES
+
 	/* Do some trivial performance monitoring...
 	 */
-	radeon_cp_performance_boxes( dev_priv );
-#endif
+	if (dev_priv->do_boxes)
+		radeon_cp_performance_boxes( dev_priv );
+
 
 	/* Wait for the 3D stream to idle before dispatching the bitblt.
 	 * This will prevent data corruption between the two streams.
@@ -579,20 +793,21 @@ static void radeon_cp_dispatch_flip( drm_device_t *dev )
 {
 	drm_radeon_private_t *dev_priv = dev->dev_private;
 	RING_LOCALS;
-	DRM_DEBUG( "page=%d\n", dev_priv->current_page );
+	DRM_DEBUG( "%s: page=%d pfCurrentPage=%d\n", 
+		__FUNCTION__, 
+		dev_priv->current_page,
+		dev_priv->sarea_priv->pfCurrentPage);
 
-#if RADEON_PERFORMANCE_BOXES
 	/* Do some trivial performance monitoring...
 	 */
-	radeon_cp_performance_boxes( dev_priv );
-#endif
+	if (dev_priv->do_boxes) {
+		dev_priv->stats.boxes |= RADEON_BOX_FLIP;
+		radeon_cp_performance_boxes( dev_priv );
+	}
 
 	BEGIN_RING( 4 );
 
 	RADEON_WAIT_UNTIL_3D_IDLE();
-/*
-	RADEON_WAIT_UNTIL_PAGE_FLIPPED();
-*/
 	OUT_RING( CP_PACKET0( RADEON_CRTC_OFFSET, 0 ) );
 
 	if ( dev_priv->current_page == 0 ) {
@@ -847,6 +1062,8 @@ static int radeon_cp_dispatch_texture( drm_device_t *dev,
 	int ret = 0, i;
 	RING_LOCALS;
 
+	dev_priv->stats.boxes |= RADEON_BOX_TEXTURE_LOAD;
+
 	/* FIXME: Be smarter about this...
 	 */
 	buf = radeon_freelist_get( dev );
@@ -1611,6 +1828,30 @@ static __inline__ int radeon_emit_scalars(
 	return 0;
 }
 
+/* God this is ugly
+ */
+static __inline__ int radeon_emit_scalars2( 
+	drm_radeon_private_t *dev_priv,
+	drm_radeon_cmd_header_t header,
+	drm_radeon_cmd_buffer_t *cmdbuf )
+{
+	int sz = header.scalars.count;
+	int *data = (int *)cmdbuf->buf;
+	int start = ((unsigned int)header.scalars.offset) + 0x100;
+	int stride = header.scalars.stride;
+	RING_LOCALS;
+
+	BEGIN_RING( 3+sz );
+	OUT_RING( CP_PACKET0( RADEON_SE_TCL_SCALAR_INDX_REG, 0 ) );
+	OUT_RING( start | (stride << RADEON_SCAL_INDX_DWORD_STRIDE_SHIFT));
+	OUT_RING( CP_PACKET0_TABLE( RADEON_SE_TCL_SCALAR_DATA_REG, sz-1 ) );
+	OUT_RING_USER_TABLE( data, sz );
+	ADVANCE_RING();
+	cmdbuf->buf += sz * sizeof(int);
+	cmdbuf->bufsz -= sz * sizeof(int);
+	return 0;
+}
+
 static __inline__ int radeon_emit_vectors( 
 	drm_radeon_private_t *dev_priv,
 	drm_radeon_cmd_header_t header,
@@ -1775,6 +2016,7 @@ int radeon_cp_cmdbuf( DRM_IOCTL_ARGS )
 
 		switch (header.header.cmd_type) {
 		case RADEON_CMD_PACKET: 
+			DRM_DEBUG("RADEON_CMD_PACKET\n");
 			if (radeon_emit_packets( dev_priv, header, &cmdbuf )) {
 				DRM_ERROR("radeon_emit_packets failed\n");
 				return DRM_ERR(EINVAL);
@@ -1782,6 +2024,7 @@ int radeon_cp_cmdbuf( DRM_IOCTL_ARGS )
 			break;
 
 		case RADEON_CMD_SCALARS:
+			DRM_DEBUG("RADEON_CMD_SCALARS\n");
 			if (radeon_emit_scalars( dev_priv, header, &cmdbuf )) {
 				DRM_ERROR("radeon_emit_scalars failed\n");
 				return DRM_ERR(EINVAL);
@@ -1789,6 +2032,7 @@ int radeon_cp_cmdbuf( DRM_IOCTL_ARGS )
 			break;
 
 		case RADEON_CMD_VECTORS:
+			DRM_DEBUG("RADEON_CMD_VECTORS\n");
 			if (radeon_emit_vectors( dev_priv, header, &cmdbuf )) {
 				DRM_ERROR("radeon_emit_vectors failed\n");
 				return DRM_ERR(EINVAL);
@@ -1796,6 +2040,7 @@ int radeon_cp_cmdbuf( DRM_IOCTL_ARGS )
 			break;
 
 		case RADEON_CMD_DMA_DISCARD:
+			DRM_DEBUG("RADEON_CMD_DMA_DISCARD\n");
 			idx = header.dma.buf_idx;
 			if ( idx < 0 || idx >= dma->buf_count ) {
 				DRM_ERROR( "buffer index %d (of %d max)\n",
@@ -1813,6 +2058,7 @@ int radeon_cp_cmdbuf( DRM_IOCTL_ARGS )
 			break;
 
 		case RADEON_CMD_PACKET3:
+			DRM_DEBUG("RADEON_CMD_PACKET3\n");
 			if (radeon_emit_packet3( dev, &cmdbuf )) {
 				DRM_ERROR("radeon_emit_packet3 failed\n");
 				return DRM_ERR(EINVAL);
@@ -1820,12 +2066,20 @@ int radeon_cp_cmdbuf( DRM_IOCTL_ARGS )
 			break;
 
 		case RADEON_CMD_PACKET3_CLIP:
+			DRM_DEBUG("RADEON_CMD_PACKET3_CLIP\n");
 			if (radeon_emit_packet3_cliprect( dev, &cmdbuf, orig_nbox )) {
 				DRM_ERROR("radeon_emit_packet3_clip failed\n");
 				return DRM_ERR(EINVAL);
 			}
 			break;
 
+		case RADEON_CMD_SCALARS2:
+			DRM_DEBUG("RADEON_CMD_SCALARS2\n");
+			if (radeon_emit_scalars2( dev_priv, header, &cmdbuf )) {
+				DRM_ERROR("radeon_emit_scalars2 failed\n");
+				return DRM_ERR(EINVAL);
+			}
+			break;
 		default:
 			DRM_ERROR("bad cmd_type %d at %p\n", 
 				  header.header.cmd_type,
@@ -1835,6 +2089,7 @@ int radeon_cp_cmdbuf( DRM_IOCTL_ARGS )
 	}
 
 
+	DRM_DEBUG("DONE\n");
 	COMMIT_RING();
 	return 0;
 }
@@ -1863,12 +2118,14 @@ int radeon_cp_getparam( DRM_IOCTL_ARGS )
 		value = dev_priv->agp_buffers_offset;
 		break;
 	case RADEON_PARAM_LAST_FRAME:
+		dev_priv->stats.last_frame_reads++;
 		value = GET_SCRATCH( 0 );
 		break;
 	case RADEON_PARAM_LAST_DISPATCH:
 		value = GET_SCRATCH( 1 );
 		break;
 	case RADEON_PARAM_LAST_CLEAR:
+		dev_priv->stats.last_clear_reads++;
 		value = GET_SCRATCH( 2 );
 		break;
 	default:
-- 
cgit v1.2.3