From 068ea68b3f7ebd5efcfcc2f6ae417651423c8382 Mon Sep 17 00:00:00 2001
From: Rob Clark
Date: Fri, 13 Dec 2013 12:48:30 -0500
Subject: freedreno: add bo cache

Workloads which create many transient buffers cause significant CPU
overhead in buffer allocation, zeroing, cache maint, and mmap setup.
By caching and re-using existing buffers, the CPU overhead drops
significantly.  See:

http://bloggingthemonkey.blogspot.com/2013/09/freedreno-update-moar-fps.html

A simple time based policy is used for purging the cache.  Once the
kernel supports it, we could use madvise style API to handle memory
pressure scenarios a bit better.

Signed-off-by: Rob Clark
---
 freedreno/freedreno_bo.c | 151 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 144 insertions(+), 7 deletions(-)

(limited to 'freedreno/freedreno_bo.c')

diff --git a/freedreno/freedreno_bo.c b/freedreno/freedreno_bo.c
index 92c7dd79..8cea4ded 100644
--- a/freedreno/freedreno_bo.c
+++ b/freedreno/freedreno_bo.c
@@ -31,6 +31,8 @@
 
 static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
 
+static void bo_del(struct fd_bo *bo);
+
 /* set buffer name, and add to table, call w/ table_lock held: */
 static void set_name(struct fd_bo *bo, uint32_t name)
 {
@@ -68,24 +70,128 @@ static struct fd_bo * bo_from_handle(struct fd_device *dev,
 	bo->size = size;
 	bo->handle = handle;
 	atomic_set(&bo->refcnt, 1);
+	list_inithead(&bo->list);
 	/* add ourself into the handle table: */
 	drmHashInsert(dev->handle_table, handle, bo);
 	return bo;
 }
 
+/* Frees older cached buffers.  Called under table_lock */
+void fd_cleanup_bo_cache(struct fd_device *dev, time_t time)
+{
+	int i;
+
+	if (dev->time == time)
+		return;
+
+	for (i = 0; i < dev->num_buckets; i++) {
+		struct fd_bo_bucket *bucket = &dev->cache_bucket[i];
+		struct fd_bo *bo;
+
+		while (!LIST_IS_EMPTY(&bucket->list)) {
+			bo = LIST_ENTRY(struct fd_bo, bucket->list.next, list);
+
+			/* keep things in cache for at least 1 second: */
+			if (time && ((time - bo->free_time) <= 1))
+				break;
+
+			list_del(&bo->list);
+			bo_del(bo);
+		}
+	}
+
+	dev->time = time;
+}
+
+static struct fd_bo_bucket * get_bucket(struct fd_device *dev, uint32_t size)
+{
+	int i;
+
+	/* hmm, this is what intel does, but I suppose we could calculate our
+	 * way to the correct bucket size rather than looping..
+	 */
+	for (i = 0; i < dev->num_buckets; i++) {
+		struct fd_bo_bucket *bucket = &dev->cache_bucket[i];
+		if (bucket->size >= size) {
+			return bucket;
+		}
+	}
+
+	return NULL;
+}
+
+static int is_idle(struct fd_bo *bo)
+{
+	return fd_bo_cpu_prep(bo, NULL,
+			DRM_FREEDRENO_PREP_READ |
+			DRM_FREEDRENO_PREP_WRITE |
+			DRM_FREEDRENO_PREP_NOSYNC) == 0;
+}
+
+static struct fd_bo *find_in_bucket(struct fd_device *dev,
+		struct fd_bo_bucket *bucket, uint32_t flags)
+{
+	struct fd_bo *bo = NULL;
+
+	/* TODO .. if we had an ALLOC_FOR_RENDER flag like intel, we could
+	 * skip the busy check.. if it is only going to be a render target
+	 * then we probably don't need to stall..
+	 *
+	 * NOTE that intel takes ALLOC_FOR_RENDER bo's from the list tail
+	 * (MRU, since likely to be in GPU cache), rather than head (LRU)..
+	 */
+	pthread_mutex_lock(&table_lock);
+	while (!LIST_IS_EMPTY(&bucket->list)) {
+		bo = LIST_ENTRY(struct fd_bo, bucket->list.next, list);
+		if (0 /* TODO: if madvise tells us bo is gone... */) {
+			list_del(&bo->list);
+			bo_del(bo);
+			bo = NULL;
+			continue;
+		}
+		/* TODO check for compatible flags? */
+		if (is_idle(bo)) {
+			list_del(&bo->list);
+			break;
+		}
+		bo = NULL;
+		break;
+	}
+	pthread_mutex_unlock(&table_lock);
+
+	return bo;
+}
+
+
 struct fd_bo * fd_bo_new(struct fd_device *dev,
 		uint32_t size, uint32_t flags)
 {
 	struct fd_bo *bo = NULL;
+	struct fd_bo_bucket *bucket;
 	uint32_t handle;
 	int ret;
 
-	ret = dev->funcs->bo_new_handle(dev, ALIGN(size, 4096), flags, &handle);
+	size = ALIGN(size, 4096);
+	bucket = get_bucket(dev, size);
+
+	/* see if we can be green and recycle: */
+	if (bucket) {
+		size = bucket->size;
+		bo = find_in_bucket(dev, bucket, flags);
+		if (bo) {
+			atomic_set(&bo->refcnt, 1);
+			fd_device_ref(bo->dev);
+			return bo;
+		}
+	}
+
+	ret = dev->funcs->bo_new_handle(dev, size, flags, &handle);
 	if (ret)
 		return NULL;
 
 	pthread_mutex_lock(&table_lock);
 	bo = bo_from_handle(dev, size, handle);
+	bo->bo_reuse = 1;
 	pthread_mutex_unlock(&table_lock);
 
 	return bo;
@@ -144,30 +250,61 @@ struct fd_bo * fd_bo_ref(struct fd_bo *bo)
 
 void fd_bo_del(struct fd_bo *bo)
 {
-	struct fd_device *dev;
+	struct fd_device *dev = bo->dev;
 
 	if (!atomic_dec_and_test(&bo->refcnt))
 		return;
 
+	pthread_mutex_lock(&table_lock);
+
+	if (bo->bo_reuse) {
+		struct fd_bo_bucket *bucket = get_bucket(dev, bo->size);
+
+		/* see if we can be green and recycle: */
+		if (bucket) {
+			struct timespec time;
+
+			clock_gettime(CLOCK_MONOTONIC, &time);
+
+			bo->free_time = time.tv_sec;
+			list_addtail(&bo->list, &bucket->list);
+			fd_cleanup_bo_cache(dev, time.tv_sec);
+
+			/* bo's in the bucket cache don't have a ref and
+			 * don't hold a ref to the dev:
+			 */
+
+			goto out;
+		}
+	}
+
+	bo_del(bo);
+out:
+	fd_device_del_locked(dev);
+	pthread_mutex_unlock(&table_lock);
+}
+
+/* Called under table_lock */
+static void bo_del(struct fd_bo *bo)
+{
 	if (bo->map)
 		munmap(bo->map, bo->size);
 
+	/* TODO probably bo's in bucket list get removed from
+	 * handle table??
+	 */
+
 	if (bo->handle) {
 		struct drm_gem_close req = {
				.handle = bo->handle,
		};
-		pthread_mutex_lock(&table_lock);
		drmHashDelete(bo->dev->handle_table, bo->handle);
		if (bo->name)
			drmHashDelete(bo->dev->name_table, bo->name);
		drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &req);
-		pthread_mutex_unlock(&table_lock);
 	}
 
-	dev = bo->dev;
 	bo->funcs->destroy(bo);
-
-	fd_device_del(dev);
 }
 
 int fd_bo_get_name(struct fd_bo *bo, uint32_t *name)
--
cgit v1.2.3
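
For context, here is a minimal caller-side sketch of the allocation pattern this cache targets. It is not part of the patch: the device node path, error handling, buffer size, and flag choice are illustrative assumptions; only the public libdrm_freedreno entry points (fd_device_new, fd_bo_new, fd_bo_map, fd_bo_del, fd_device_del) are taken from the library API.

/*
 * Illustrative sketch: an application that churns through short-lived
 * buffers.  With the bucket cache above, fd_bo_del() parks the buffer
 * in a size bucket instead of freeing it, so a later fd_bo_new() of the
 * same (page-aligned) size can recycle an idle buffer and skip the
 * kernel allocation, zeroing, and mmap setup cost.
 */
#include <fcntl.h>
#include <string.h>
#include <freedreno_drmif.h>

int main(void)
{
	int fd = open("/dev/dri/card0", O_RDWR);   /* device node path is an assumption */
	struct fd_device *dev;
	int i;

	if (fd < 0)
		return 1;

	dev = fd_device_new(fd);
	if (!dev)
		return 1;

	for (i = 0; i < 1000; i++) {
		/* size is rounded up to 4096 and matched to a bucket */
		struct fd_bo *bo = fd_bo_new(dev, 16 * 1024,
				DRM_FREEDRENO_GEM_TYPE_KMEM);
		void *ptr = fd_bo_map(bo);         /* transient use of the buffer */

		if (ptr)
			memset(ptr, 0xaa, 16 * 1024);

		fd_bo_del(bo);   /* returns the bo to its bucket, not to the kernel */
	}

	fd_device_del(dev);
	return 0;
}

After the first iteration, later allocations are expected to be satisfied by find_in_bucket() as long as the previously freed buffer has become idle; buffers that sit unused in a bucket for more than about a second are released back to the kernel by fd_cleanup_bo_cache().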