From f85fd1b42dc2d77266007c02144d4f4f524e4157 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Sat, 26 Jul 2008 19:28:14 -0700 Subject: intel-gem: Speed up tiled readpixels by tracking which pages have been flushed. This is around 3x or so speedup, since we would read wide rows at a time, and clflush each tile 8 times as a result. We'll want code related to this anyway when we do fault-based per-page clflushing for sw fallbacks. --- linux-core/i915_gem.c | 99 +++++++++++++++++++++++++++++++++++++------------- shared-core/i915_drv.h | 6 +++ 2 files changed, 79 insertions(+), 26 deletions(-) diff --git a/linux-core/i915_gem.c b/linux-core/i915_gem.c index 4087854c..eea2d488 100644 --- a/linux-core/i915_gem.c +++ b/linux-core/i915_gem.c @@ -36,6 +36,12 @@ static int i915_gem_object_set_domain(struct drm_gem_object *obj, uint32_t read_domains, uint32_t write_domain); +static int +i915_gem_object_set_domain_range(struct drm_gem_object *obj, + uint64_t offset, + uint64_t size, + uint32_t read_domains, + uint32_t write_domain); int i915_gem_set_domain(struct drm_gem_object *obj, struct drm_file *file_priv, @@ -136,32 +142,11 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data, mutex_lock(&dev->struct_mutex); - /* Do a partial equivalent of i915_gem_set_domain(CPU, 0), as - * we don't want to clflush whole objects to read a portion of them. - * - * The side effect of doing this is that repeated preads of the same - * contents would take extra clflush overhead, since we don't track - * flushedness on a page basis. - */ - if (obj->write_domain & ~(I915_GEM_DOMAIN_CPU|I915_GEM_DOMAIN_GTT)) { - ret = i915_gem_object_wait_rendering(obj); - if (ret) { - drm_gem_object_unreference(obj); - mutex_unlock(&dev->struct_mutex); - return ret; - } - } - if ((obj->read_domains & I915_GEM_DOMAIN_CPU) == 0) { - int first_page = args->offset / PAGE_SIZE; - int last_page = (args->offset + args->size - 1) / PAGE_SIZE; - - /* If we don't have the page list, the pages are unpinned - * and swappable, and thus should already be in the CPU domain. - */ - BUG_ON(obj_priv->page_list == NULL); - - drm_ttm_cache_flush(&obj_priv->page_list[first_page], - last_page - first_page + 1); + ret = i915_gem_object_set_domain_range(obj, args->offset, args->size, + I915_GEM_DOMAIN_CPU, 0); + if (ret != 0) { + drm_gem_object_unreference(obj); + mutex_unlock(&dev->struct_mutex); } offset = args->offset; @@ -1383,7 +1368,17 @@ i915_gem_object_set_domain(struct drm_gem_object *obj, if ((write_domain | flush_domains) != 0) obj->write_domain = write_domain; + + /* If we're invalidating the CPU domain, clear the per-page CPU + * domain list as well. + */ + if (obj_priv->page_cpu_valid != NULL && + (obj->read_domains & I915_GEM_DOMAIN_CPU) && + ((read_domains & I915_GEM_DOMAIN_CPU) == 0)) { + memset(obj_priv->page_cpu_valid, 0, obj->size / PAGE_SIZE); + } obj->read_domains = read_domains; + dev->invalidate_domains |= invalidate_domains; dev->flush_domains |= flush_domains; #if WATCH_BUF @@ -1395,6 +1390,57 @@ i915_gem_object_set_domain(struct drm_gem_object *obj, return 0; } +/** + * Set the read/write domain on a range of the object. + * + * Currently only implemented for CPU reads, otherwise drops to normal + * i915_gem_object_set_domain(). + */ +static int +i915_gem_object_set_domain_range(struct drm_gem_object *obj, + uint64_t offset, + uint64_t size, + uint32_t read_domains, + uint32_t write_domain) +{ + struct drm_i915_gem_object *obj_priv = obj->driver_private; + int ret, i; + + if (obj->read_domains & I915_GEM_DOMAIN_CPU) + return 0; + + if (read_domains != I915_GEM_DOMAIN_CPU || + write_domain != 0) + return i915_gem_object_set_domain(obj, + read_domains, write_domain); + + /* Wait on any GPU rendering to the object to be flushed. */ + if (obj->write_domain & ~(I915_GEM_DOMAIN_CPU | I915_GEM_DOMAIN_GTT)) { + ret = i915_gem_object_wait_rendering(obj); + if (ret) + return ret; + } + + if (obj_priv->page_cpu_valid == NULL) { + obj_priv->page_cpu_valid = drm_calloc(1, obj->size / PAGE_SIZE, + DRM_MEM_DRIVER); + } + + /* Flush the cache on any pages that are still invalid from the CPU's + * perspective. + */ + for (i = offset / PAGE_SIZE; i < (offset + size - 1) / PAGE_SIZE; i++) { + if (obj_priv->page_cpu_valid[i]) + continue; + + drm_ttm_cache_flush(obj_priv->page_list + i, 1); + + obj_priv->page_cpu_valid[i] = 1; + } + + return 0; +} + /** * Once all of the objects have been set in the proper domain, * perform the necessary flush and invalidate operations. @@ -2097,6 +2143,7 @@ void i915_gem_free_object(struct drm_gem_object *obj) i915_gem_object_unbind(obj); + drm_free(obj_priv->page_cpu_valid, 1, DRM_MEM_DRIVER); drm_free(obj->driver_private, 1, DRM_MEM_DRIVER); } diff --git a/shared-core/i915_drv.h b/shared-core/i915_drv.h index 99416e8d..7bb9e5bb 100644 --- a/shared-core/i915_drv.h +++ b/shared-core/i915_drv.h @@ -393,6 +393,12 @@ struct drm_i915_gem_object { /** Current tiling mode for the object. */ uint32_t tiling_mode; + + /** + * Flagging of which individual pages are valid in GEM_DOMAIN_CPU when + * GEM_DOMAIN_CPU is not in the object's read domain. + */ + uint8_t *page_cpu_valid; }; /** -- cgit v1.2.3