From 068ea68b3f7ebd5efcfcc2f6ae417651423c8382 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Fri, 13 Dec 2013 12:48:30 -0500
Subject: freedreno: add bo cache

Workloads which create many transient buffers cause significant CPU
overhead in buffer allocation, zeroing, cache maintenance, and mmap
setup.  By caching and re-using existing buffers, the CPU overhead
drops significantly.  See:

http://bloggingthemonkey.blogspot.com/2013/09/freedreno-update-moar-fps.html

A simple time-based policy is used for purging the cache.  Once the
kernel supports it, we could use an madvise-style API to handle
memory-pressure scenarios a bit better.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 freedreno/freedreno_bo.c     | 151 +++++++++++++++++++++++++++++++++++++++++--
 freedreno/freedreno_device.c |  60 +++++++++++++++--
 freedreno/freedreno_priv.h   |  18 ++++++
 freedreno/kgsl/kgsl_bo.c     |  19 +++++-
 4 files changed, 235 insertions(+), 13 deletions(-)

diff --git a/freedreno/freedreno_bo.c b/freedreno/freedreno_bo.c
index 92c7dd79..8cea4ded 100644
--- a/freedreno/freedreno_bo.c
+++ b/freedreno/freedreno_bo.c
@@ -31,6 +31,8 @@
 
 static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
 
+static void bo_del(struct fd_bo *bo);
+
 /* set buffer name, and add to table, call w/ table_lock held: */
 static void set_name(struct fd_bo *bo, uint32_t name)
 {
@@ -68,24 +70,128 @@ static struct fd_bo * bo_from_handle(struct fd_device *dev,
 	bo->size = size;
 	bo->handle = handle;
 	atomic_set(&bo->refcnt, 1);
+	list_inithead(&bo->list);
 	/* add ourself into the handle table: */
 	drmHashInsert(dev->handle_table, handle, bo);
 	return bo;
 }
 
+/* Frees older cached buffers.  Called under table_lock */
+void fd_cleanup_bo_cache(struct fd_device *dev, time_t time)
+{
+	int i;
+
+	if (dev->time == time)
+		return;
+
+	for (i = 0; i < dev->num_buckets; i++) {
+		struct fd_bo_bucket *bucket = &dev->cache_bucket[i];
+		struct fd_bo *bo;
+
+		while (!LIST_IS_EMPTY(&bucket->list)) {
+			bo = LIST_ENTRY(struct fd_bo, bucket->list.next, list);
+
+			/* keep things in cache for at least 1 second: */
+			if (time && ((time - bo->free_time) <= 1))
+				break;
+
+			list_del(&bo->list);
+			bo_del(bo);
+		}
+	}
+
+	dev->time = time;
+}
+
+static struct fd_bo_bucket * get_bucket(struct fd_device *dev, uint32_t size)
+{
+	int i;
+
+	/* hmm, this is what intel does, but I suppose we could calculate our
+	 * way to the correct bucket size rather than looping..
+	 */
+	for (i = 0; i < dev->num_buckets; i++) {
+		struct fd_bo_bucket *bucket = &dev->cache_bucket[i];
+		if (bucket->size >= size) {
+			return bucket;
+		}
+	}
+
+	return NULL;
+}
+
+static int is_idle(struct fd_bo *bo)
+{
+	return fd_bo_cpu_prep(bo, NULL,
+			DRM_FREEDRENO_PREP_READ |
+			DRM_FREEDRENO_PREP_WRITE |
+			DRM_FREEDRENO_PREP_NOSYNC) == 0;
+}
+
+static struct fd_bo *find_in_bucket(struct fd_device *dev,
+		struct fd_bo_bucket *bucket, uint32_t flags)
+{
+	struct fd_bo *bo = NULL;
+
+	/* TODO .. if we had an ALLOC_FOR_RENDER flag like intel, we could
+	 * skip the busy check.. if it is only going to be a render target
+	 * then we probably don't need to stall..
+	 *
+	 * NOTE that intel takes ALLOC_FOR_RENDER bo's from the list tail
+	 * (MRU, since likely to be in GPU cache), rather than head (LRU)..
+	 */
+	pthread_mutex_lock(&table_lock);
+	while (!LIST_IS_EMPTY(&bucket->list)) {
+		bo = LIST_ENTRY(struct fd_bo, bucket->list.next, list);
+		if (0 /* TODO: if madvise tells us bo is gone... */) {
+			list_del(&bo->list);
+			bo_del(bo);
+			bo = NULL;
+			continue;
+		}
+		/* TODO check for compatible flags? */
+		if (is_idle(bo)) {
+			list_del(&bo->list);
+			break;
+		}
+		bo = NULL;
+		break;
+	}
+	pthread_mutex_unlock(&table_lock);
+
+	return bo;
+}
+
 struct fd_bo * fd_bo_new(struct fd_device *dev,
 		uint32_t size, uint32_t flags)
 {
 	struct fd_bo *bo = NULL;
+	struct fd_bo_bucket *bucket;
 	uint32_t handle;
 	int ret;
 
-	ret = dev->funcs->bo_new_handle(dev, ALIGN(size, 4096), flags, &handle);
+	size = ALIGN(size, 4096);
+	bucket = get_bucket(dev, size);
+
+	/* see if we can be green and recycle: */
+	if (bucket) {
+		size = bucket->size;
+		bo = find_in_bucket(dev, bucket, flags);
+		if (bo) {
+			atomic_set(&bo->refcnt, 1);
+			fd_device_ref(bo->dev);
+			return bo;
+		}
+	}
+
+	ret = dev->funcs->bo_new_handle(dev, size, flags, &handle);
 	if (ret)
 		return NULL;
 
 	pthread_mutex_lock(&table_lock);
 	bo = bo_from_handle(dev, size, handle);
+	bo->bo_reuse = 1;
 	pthread_mutex_unlock(&table_lock);
 
 	return bo;
@@ -144,30 +250,61 @@ struct fd_bo * fd_bo_ref(struct fd_bo *bo)
 
 void fd_bo_del(struct fd_bo *bo)
 {
-	struct fd_device *dev;
+	struct fd_device *dev = bo->dev;
 
 	if (!atomic_dec_and_test(&bo->refcnt))
 		return;
 
+	pthread_mutex_lock(&table_lock);
+
+	if (bo->bo_reuse) {
+		struct fd_bo_bucket *bucket = get_bucket(dev, bo->size);
+
+		/* see if we can be green and recycle: */
+		if (bucket) {
+			struct timespec time;
+
+			clock_gettime(CLOCK_MONOTONIC, &time);
+
+			bo->free_time = time.tv_sec;
+			list_addtail(&bo->list, &bucket->list);
+			fd_cleanup_bo_cache(dev, time.tv_sec);
+
+			/* bo's in the bucket cache don't have a ref and
+			 * don't hold a ref to the dev:
+			 */
+
+			goto out;
+		}
+	}
+
+	bo_del(bo);
+out:
+	fd_device_del_locked(dev);
+	pthread_mutex_unlock(&table_lock);
+}
+
+/* Called under table_lock */
+static void bo_del(struct fd_bo *bo)
+{
 	if (bo->map)
 		munmap(bo->map, bo->size);
 
+	/* TODO probably bo's in bucket list get removed from
+	 * handle table??
+	 */
+
 	if (bo->handle) {
 		struct drm_gem_close req = {
 				.handle = bo->handle,
 		};
-		pthread_mutex_lock(&table_lock);
 		drmHashDelete(bo->dev->handle_table, bo->handle);
 		if (bo->name)
 			drmHashDelete(bo->dev->name_table, bo->name);
 		drmIoctl(bo->dev->fd, DRM_IOCTL_GEM_CLOSE, &req);
-		pthread_mutex_unlock(&table_lock);
 	}
 
-	dev = bo->dev;
 	bo->funcs->destroy(bo);
-
-	fd_device_del(dev);
 }
 
 int fd_bo_get_name(struct fd_bo *bo, uint32_t *name)
diff --git a/freedreno/freedreno_device.c b/freedreno/freedreno_device.c
index 1e3d9df2..6486983d 100644
--- a/freedreno/freedreno_device.c
+++ b/freedreno/freedreno_device.c
@@ -39,6 +39,44 @@ static void * dev_table;
 struct fd_device * kgsl_device_new(int fd);
 struct fd_device * msm_device_new(int fd);
 
+static void
+add_bucket(struct fd_device *dev, int size)
+{
+	unsigned int i = dev->num_buckets;
+
+	assert(i < ARRAY_SIZE(dev->cache_bucket));
+
+	list_inithead(&dev->cache_bucket[i].list);
+	dev->cache_bucket[i].size = size;
+	dev->num_buckets++;
+}
+
+static void
+init_cache_buckets(struct fd_device *dev)
+{
+	unsigned long size, cache_max_size = 64 * 1024 * 1024;
+
+	/* OK, so power of two buckets was too wasteful of memory.
+	 * Give 3 other sizes between each power of two, to hopefully
+	 * cover things accurately enough.  (The alternative is
+	 * probably to just go for exact matching of sizes, and assume
+	 * that for things like composited window resize the tiled
+	 * width/height alignment and rounding of sizes to pages will
+	 * get us useful cache hit rates anyway)
+	 */
+	add_bucket(dev, 4096);
+	add_bucket(dev, 4096 * 2);
+	add_bucket(dev, 4096 * 3);
+
+	/* Initialize the linked lists for BO reuse cache. */
+	for (size = 4 * 4096; size <= cache_max_size; size *= 2) {
+		add_bucket(dev, size);
+		add_bucket(dev, size + size * 1 / 4);
+		add_bucket(dev, size + size * 2 / 4);
+		add_bucket(dev, size + size * 3 / 4);
+	}
+}
+
 static struct fd_device * fd_device_new_impl(int fd)
 {
 	struct fd_device *dev;
@@ -69,6 +107,7 @@ static struct fd_device * fd_device_new_impl(int fd)
 	dev->fd = fd;
 	dev->handle_table = drmHashCreate();
 	dev->name_table = drmHashCreate();
+	init_cache_buckets(dev);
 
 	return dev;
 }
@@ -102,14 +141,27 @@ struct fd_device * fd_device_ref(struct fd_device *dev)
 	return dev;
 }
 
+static void fd_device_del_impl(struct fd_device *dev)
+{
+	fd_cleanup_bo_cache(dev, 0);
+	drmHashDestroy(dev->handle_table);
+	drmHashDestroy(dev->name_table);
+	drmHashDelete(dev_table, dev->fd);
+	dev->funcs->destroy(dev);
+}
+
+void fd_device_del_locked(struct fd_device *dev)
+{
+	if (!atomic_dec_and_test(&dev->refcnt))
+		return;
+	fd_device_del_impl(dev);
+}
+
 void fd_device_del(struct fd_device *dev)
 {
 	if (!atomic_dec_and_test(&dev->refcnt))
 		return;
 	pthread_mutex_lock(&table_lock);
-	drmHashDestroy(dev->handle_table);
-	drmHashDestroy(dev->name_table);
-	drmHashDelete(dev_table, dev->fd);
+	fd_device_del_impl(dev);
 	pthread_mutex_unlock(&table_lock);
-	dev->funcs->destroy(dev);
 }
diff --git a/freedreno/freedreno_priv.h b/freedreno/freedreno_priv.h
index 69256f51..061d807e 100644
--- a/freedreno/freedreno_priv.h
+++ b/freedreno/freedreno_priv.h
@@ -59,6 +59,11 @@ struct fd_device_funcs {
 	void (*destroy)(struct fd_device *dev);
 };
 
+struct fd_bo_bucket {
+	uint32_t size;
+	struct list_head list;
+};
+
 struct fd_device {
 	int fd;
 	atomic_t refcnt;
@@ -75,8 +80,17 @@ struct fd_device {
 	void *handle_table, *name_table;
 
 	struct fd_device_funcs *funcs;
+
+	struct fd_bo_bucket cache_bucket[14 * 4];
+	int num_buckets;
+	time_t time;
 };
 
+void fd_cleanup_bo_cache(struct fd_device *dev, time_t time);
+
+/* for where @table_lock is already held: */
+void fd_device_del_locked(struct fd_device *dev);
+
 struct fd_pipe_funcs {
 	struct fd_ringbuffer * (*ringbuffer_new)(struct fd_pipe *pipe, uint32_t size);
 	int (*get_param)(struct fd_pipe *pipe, enum fd_param_id param, uint64_t *value);
@@ -120,6 +134,10 @@ struct fd_bo {
 	void *map;
 	atomic_t refcnt;
 	struct fd_bo_funcs *funcs;
+
+	int bo_reuse;
+	struct list_head list;   /* bucket-list entry */
+	time_t free_time;        /* time when added to bucket-list */
 };
 
 struct fd_bo *fd_bo_from_handle(struct fd_device *dev,
diff --git a/freedreno/kgsl/kgsl_bo.c b/freedreno/kgsl/kgsl_bo.c
index 0d019cb1..585851c7 100644
--- a/freedreno/kgsl/kgsl_bo.c
+++ b/freedreno/kgsl/kgsl_bo.c
@@ -80,9 +80,24 @@ static int kgsl_bo_offset(struct fd_bo *bo, uint64_t *offset)
 static int kgsl_bo_cpu_prep(struct fd_bo *bo, struct fd_pipe *pipe, uint32_t op)
 {
 	uint32_t timestamp = kgsl_bo_get_timestamp(to_kgsl_bo(bo));
-	if (timestamp) {
-		fd_pipe_wait(pipe, timestamp);
+
+	if (op & DRM_FREEDRENO_PREP_NOSYNC) {
+		uint32_t current;
+		int ret;
+
+		ret = kgsl_pipe_timestamp(to_kgsl_pipe(pipe), &current);
+		if (ret)
+			return ret;
+
+		if (timestamp > current)
+			return -EBUSY;
+
+		return 0;
 	}
+
+	if (timestamp)
+		fd_pipe_wait(pipe, timestamp);
+
 	return 0;
 }
-- 
cgit v1.2.3
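
[Editor's note, not part of the patch] A quick illustration of the reuse
path: with this change, fd_bo_del() parks an idle buffer in a size bucket
instead of issuing GEM_CLOSE, and a subsequent fd_bo_new() of the same
bucket-rounded size within the ~1 second cache window should get the
recycled bo back, skipping bo_new_handle() entirely. The sketch below is
an assumption-laden example, not upstream code: the device node path
/dev/dri/card0 and the DRM_FREEDRENO_GEM_TYPE_KMEM flags value are
illustrative choices; only public libdrm_freedreno entry points are used.

    #include <fcntl.h>
    #include <freedreno_drmif.h>

    int main(void)
    {
    	/* assumed device node; any node driven by freedreno works */
    	int fd = open("/dev/dri/card0", O_RDWR);
    	struct fd_device *dev = fd_device_new(fd);
    	struct fd_bo *a, *b;

    	if (!dev)
    		return 1;

    	/* 16KiB: get_bucket() lands this in the 4 * 4096 bucket */
    	a = fd_bo_new(dev, 16384, DRM_FREEDRENO_GEM_TYPE_KMEM);
    	fd_bo_del(a);	/* parked in the bucket, not GEM_CLOSE'd */

    	/* expected to hand 'a' back (cache hit) if it is idle and
    	 * was freed less than ~1s ago:
    	 */
    	b = fd_bo_new(dev, 16384, DRM_FREEDRENO_GEM_TYPE_KMEM);
    	fd_bo_del(b);

    	fd_device_del(dev);
    	return 0;
    }

For sizing intuition: init_cache_buckets() builds buckets at 4/8/12 KiB,
then four per power of two (16, 20, 24, 28 KiB; 32, 40, 48, 56 KiB; ...)
up to 64 MiB, i.e. 3 + 13 * 4 = 55 buckets, which fits the
cache_bucket[14 * 4] array declared in freedreno_priv.h.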