From 068ea68b3f7ebd5efcfcc2f6ae417651423c8382 Mon Sep 17 00:00:00 2001
From: Rob Clark
Date: Fri, 13 Dec 2013 12:48:30 -0500
Subject: freedreno: add bo cache

Workloads which create many transient buffers cause significant CPU
overhead in buffer allocation, zeroing, cache maint, and mmap setup.
By caching and re-using existing buffers, the CPU overhead drops
significantly.  See:

http://bloggingthemonkey.blogspot.com/2013/09/freedreno-update-moar-fps.html

A simple time based policy is used for purging the cache.  Once the
kernel supports it, we could use madvise style API to handle memory
pressure scenarios a bit better.

Signed-off-by: Rob Clark
---
 freedreno/freedreno_bo.c | 151 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 144 insertions(+), 7 deletions(-)

(limited to 'freedreno/freedreno_bo.c')

diff --git a/freedreno/freedreno_bo.c b/freedreno/freedreno_bo.c
index 92c7dd79..8cea4ded 100644
--- a/freedreno/freedreno_bo.c
+++ b/freedreno/freedreno_bo.c
@@ -31,6 +31,8 @@
 
 static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
 
+static void bo_del(struct fd_bo *bo);
+
 /* set buffer name, and add to table, call w/ table_lock held: */
 static void set_name(struct fd_bo *bo, uint32_t name)
 {
@@ -68,24 +70,128 @@ static struct fd_bo * bo_from_handle(struct fd_device *dev,
 	bo->size = size;
 	bo->handle = handle;
 	atomic_set(&bo->refcnt, 1);
+	list_inithead(&bo->list);
 	/* add ourself into the handle table: */
 	drmHashInsert(dev->handle_table, handle, bo);
 	return bo;
 }
 
+/* Frees older cached buffers.  Called under table_lock */
+void fd_cleanup_bo_cache(struct fd_device *dev, time_t time)
+{
+	int i;
+
+	if (dev->time == time)
+		return;
+
+	for (i = 0; i < dev->num_buckets; i++) {
+		struct fd_bo_bucket *bucket = &dev->cache_bucket[i];
+		struct fd_bo *bo;
+
+		while (!LIST_IS_EMPTY(&bucket->list)) {
+			bo = LIST_ENTRY(struct fd_bo, bucket->list.next, list);
+
+			/* keep things in cache for at least 1 second: */
+			if (time && ((time - bo->free_time) <= 1))
+				break;
+
+			list_del(&bo->list);
+			bo_del(bo);
+		}
+	}
+
+	dev->time = time;
+}
+
+static struct fd_bo_bucket * get_bucket(struct fd_device *dev, uint32_t size)
+{
+	int i;
+
+	/* hmm, this is what intel does, but I suppose we could calculate our
+	 * way to the correct bucket size rather than looping..
+	 */
+	for (i = 0; i < dev->num_buckets; i++) {
+		struct fd_bo_bucket *bucket = &dev->cache_bucket[i];
+		if (bucket->size >= size) {
+			return bucket;
+		}
+	}
+
+	return NULL;
+}
+
+static int is_idle(struct fd_bo *bo)
+{
+	return fd_bo_cpu_prep(bo, NULL,
+			DRM_FREEDRENO_PREP_READ |
+			DRM_FREEDRENO_PREP_WRITE |
+			DRM_FREEDRENO_PREP_NOSYNC) == 0;
+}
+
+static struct fd_bo *find_in_bucket(struct fd_device *dev,
+		struct fd_bo_bucket *bucket, uint32_t flags)
+{
+	struct fd_bo *bo = NULL;
+
+	/* TODO .. if we had an ALLOC_FOR_RENDER flag like intel, we could
+	 * skip the busy check.. if it is only going to be a render target
+	 * then we probably don't need to stall..
+	 *
+	 * NOTE that intel takes ALLOC_FOR_RENDER bo's from the list tail
+	 * (MRU, since likely to be in GPU cache), rather than head (LRU)..
+	 */
+	pthread_mutex_lock(&table_lock);
+	while (!LIST_IS_EMPTY(&bucket->list)) {
+		bo = LIST_ENTRY(struct fd_bo, bucket->list.next, list);
+		if (0 /* TODO: if madvise tells us bo is gone... */) {
+			list_del(&bo->list);
+			bo_del(bo);
+			bo = NULL;
+			continue;
+		}
+		/* TODO check for compatible flags? */
+		if (is_idle(bo)) {
+			list_del(&bo->list);
+			break;
+		}
+		bo = NULL;
+		break;
+	}
+	pthread_mutex_unlock(&table_lock);
+
+	return bo;
+}
+
+
 struct fd_bo * fd_bo_new(struct fd_device *dev,
 		uint32_t size, uint32_t flags)
 {
 	struct fd_bo *bo = NULL;
+	struct fd_bo_bucket *bucket;
 	uint32_t handle;
 	int ret;
 
-	ret = dev->funcs->bo_new_handle(dev, ALIGN(size, 4096), flags, &handle);
+	size = ALIGN(size, 4096);
+	bucket = get_bucket(dev, size);
+
+	/* see if we can be green and recycle: */
+	if (bucket) {
+		size = bucket->size;
+		bo = find_in_bucket(dev, bucket, flags);
+		if (bo) {
+			atomic_set(&bo->refcnt, 1);
+			fd_device_ref(bo->dev);
+			return bo;
+		}
+	}
+
+	ret = dev->funcs->bo_new_handle(dev, size, flags, &handle);
 	if (ret)
 		return NULL;
 
 	pthread_mutex_lock(&table_lock);
 	bo = bo_from_handle(dev, size, handle);
+	bo->bo_reuse = 1;
 	pthread_mutex_unlock(&table_lock);
 
 	return bo;
@@ -144,30 +250,61 @@ struct fd_bo * fd_bo_ref(struct fd_bo *bo)
 
 void fd_bo_del(struct fd_bo *bo)
 {
-	struct fd_device *dev;
+	struct fd_device *dev = bo->dev;
 
 	if (!atomic_dec_and_test(&bo->refcnt))
 		return;
 
+	pthread_mutex_lock(&table_lock);
+
+	if (bo->bo_reuse) {
+		struct fd_bo_bucket *bucket = get_bucket(dev, bo->size);
+
+		/* see if we can be green and recycle: */
+		if (bucket) {
+			struct timespec time;
+
+			clock_gettime(CLOCK_MONOTONIC, &time);
+
+			bo->free_time = time.tv_sec;
+			list_addtail(&bo->list, &bucket->list);
+			fd_cleanup_bo_cache(dev, time.tv_sec);
+
+			/* bo's in the bucket cache don't have a ref and
+			 * don't hold a ref to the dev:
+			 */
+
+			goto out;
+		}
+	}
+
+	bo_del(bo);
+out:
+	fd_device_del_locked(dev);
+	pthread_mutex_unlock(&table_lock);
+}
+
+/* Called under table_lock */
+static void bo_del(struct fd_bo *bo)
+{
 	if (bo->map)
 		munmap(bo->map, bo->size);
 
+	/* TODO probably bo's in bucket list get removed from
+	 * handle table??
+	 */
+
 	if (bo->handle) {
 		struct drm_gem_close req = {
				.handle = bo->handle,
		};
-		pthread_mutex_lock(&table_lock);
		drmHashDelete(bo->dev->handle_table, bo->handle);
		if (bo->name)
			drmHashDelete(bo->dev->name_table, bo->name);
		drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &req);
-		pthread_mutex_unlock(&table_lock);
 	}
 
-	dev = bo->dev;
 	bo->funcs->destroy(bo);
-
-	fd_device_del(dev);
 }
 
 int fd_bo_get_name(struct fd_bo *bo, uint32_t *name)
--
cgit v1.2.3
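
For context, here is a minimal caller-side sketch of the allocation pattern this cache targets. It is not part of the patch: the device node path, error handling, buffer size, and flag choice are illustrative assumptions; only the public libdrm_freedreno entry points (fd_device_new, fd_bo_new, fd_bo_map, fd_bo_del, fd_device_del) are taken from the library API.

/*
 * Illustrative sketch: an application that churns through short-lived
 * buffers.  With the bucket cache above, fd_bo_del() parks the buffer
 * in a size bucket instead of freeing it, so a later fd_bo_new() of the
 * same (page-aligned) size can recycle an idle buffer and skip the
 * kernel allocation, zeroing, and mmap setup cost.
 */
#include <fcntl.h>
#include <string.h>
#include <freedreno_drmif.h>

int main(void)
{
	int fd = open("/dev/dri/card0", O_RDWR);   /* device node path is an assumption */
	struct fd_device *dev;
	int i;

	if (fd < 0)
		return 1;

	dev = fd_device_new(fd);
	if (!dev)
		return 1;

	for (i = 0; i < 1000; i++) {
		/* size is rounded up to 4096 and matched to a bucket */
		struct fd_bo *bo = fd_bo_new(dev, 16 * 1024,
				DRM_FREEDRENO_GEM_TYPE_KMEM);
		void *ptr = fd_bo_map(bo);         /* transient use of the buffer */

		if (ptr)
			memset(ptr, 0xaa, 16 * 1024);

		fd_bo_del(bo);   /* returns the bo to its bucket, not to the kernel */
	}

	fd_device_del(dev);
	return 0;
}

After the first iteration, later allocations are expected to be satisfied by find_in_bucket() as long as the previously freed buffer has become idle; buffers that sit unused in a bucket for more than about a second are released back to the kernel by fd_cleanup_bo_cache().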