From 966c9907c040b4fe4b288b4a9d82598797aee743 Mon Sep 17 00:00:00 2001 From: Pauli Nieminen Date: Sat, 29 Aug 2009 12:08:57 +0300 Subject: libdrm_radeon: Optimize cs_gem_reloc to do less looping. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bo->referenced_in_cs is checked if bo is already in cs. Adding and removing reference in bo is done with atomic operations to allow parallel access to a bo from multiple contexts. cs->id generation code quarentees there is not duplicated ids which limits number of cs->ids to 32. If there is more cs objects rest will get id 0. V2: - Fix configure to check for atomics operations if libdrm_radeon is only selected. - Make atomic operations private to libdrm. This optimization decreases cs_write_reloc share of torcs profiling from 4.3% to 2.6%. Tested-by: Michel Dänzer Signed-off-by: Pauli Nieminen --- configure.ac | 31 +++++++++---- radeon/radeon_bo_gem.c | 9 ++++ radeon/radeon_bo_gem.h | 1 + radeon/radeon_cs.c | 6 +++ radeon/radeon_cs.h | 2 +- radeon/radeon_cs_gem.c | 123 ++++++++++++++++++++++++++++++++++++------------- radeon/radeon_cs_int.h | 1 + xf86atomic.h | 6 +++ 8 files changed, 138 insertions(+), 41 deletions(-) diff --git a/configure.ac b/configure.ac index 953a7586..b044c966 100644 --- a/configure.ac +++ b/configure.ac @@ -56,8 +56,8 @@ AC_ARG_ENABLE(intel, AC_ARG_ENABLE(radeon, AS_HELP_STRING([--disable-radeon], - [Enable support for radeon's KMS API (default: enabled)]), - [RADEON=$enableval], [RADEON=yes]) + [Enable support for radeon's KMS API (default: auto)]), + [RADEON=$enableval], [RADEON=auto]) AC_ARG_ENABLE(vmwgfx-experimental-api, AS_HELP_STRING([--enable-vmwgfx-experimental-api], @@ -173,7 +173,7 @@ if test "x$HAVE_LIBUDEV" = xyes; then fi AM_CONDITIONAL(HAVE_LIBUDEV, [test "x$HAVE_LIBUDEV" = xyes]) -if test "x$INTEL" != "xno"; then +if test "x$INTEL" != "xno" -o "x$RADEON" != "xno"; then # Check for atomic intrinsics AC_CACHE_CHECK([for native atomic primitives], drm_cv_atomic_primitives, [ @@ -206,13 +206,26 @@ if test "x$INTEL" != "xno"; then fi if test "x$drm_cv_atomic_primitives" = "xnone"; then - if test "x$INTEL" != "xauto"; then - AC_MSG_ERROR([libdrm_intel depends upon atomic operations, which were not found for your compiler/cpu. Try compiling with -march=native, or install the libatomics-op-dev package, or, failing both of those, disable support for Intel GPUs by passing --disable-intel to ./configure]) - else - INTEL=no - fi + if test "x$INTEL" != "xauto"; then + AC_MSG_ERROR([libdrm_intel depends upon atomic operations, which were not found for your compiler/cpu. Try compiling with -march=native, or install the libatomics-op-dev package, or, failing both of those, disable support for Intel GPUs by passing --disable-intel to ./configure]) + else + AC_MSG_WARN([Disabling libdrm_intel. It depends on atomic operations, which were not found for your compiler/cpu. Try compiling with -march=native, or install the libatomics-op-dev package.]) + INTEL=no + fi + if test "x$RADEON" != "xauto"; then + AC_MSG_ERROR([libdrm_radeon depends upon atomic operations, which were not found for your compiler/cpu. Try compiling with -march=native, or install the libatomics-op-dev package, or, failing both of those, disable support for Radeon support by passing --disable-radeon to ./configure]) + else + AC_MSG_WARN([Disabling libdrm_radeon. It depends on atomic operations, which were not found for your compiler/cpu. Try compiling with -march=native, or install the libatomics-op-dev package.]) + RADEON=no + fi + else - INTEL=yes + if test "x$INTEL" != "xno"; then + INTEL=yes + fi + if test "x$RADEON" != "xno"; then + RADEON=yes + fi fi fi diff --git a/radeon/radeon_bo_gem.c b/radeon/radeon_bo_gem.c index bc8058d8..081ccb9f 100644 --- a/radeon/radeon_bo_gem.c +++ b/radeon/radeon_bo_gem.c @@ -39,6 +39,7 @@ #include #include #include "xf86drm.h" +#include "xf86atomic.h" #include "drm.h" #include "radeon_drm.h" #include "radeon_bo.h" @@ -49,6 +50,7 @@ struct radeon_bo_gem { struct radeon_bo_int base; uint32_t name; int map_count; + atomic_t reloc_in_cs; void *priv_ptr; }; @@ -80,6 +82,7 @@ static struct radeon_bo *bo_open(struct radeon_bo_manager *bom, bo->base.domains = domains; bo->base.flags = flags; bo->base.ptr = NULL; + atomic_set(&bo->reloc_in_cs, 0); bo->map_count = 0; if (handle) { struct drm_gem_open open_arg; @@ -309,6 +312,12 @@ uint32_t radeon_gem_name_bo(struct radeon_bo *bo) return bo_gem->name; } +void *radeon_gem_get_reloc_in_cs(struct radeon_bo *bo) +{ + struct radeon_bo_gem *bo_gem = (struct radeon_bo_gem*)bo; + return &bo_gem->reloc_in_cs; +} + int radeon_gem_get_kernel_name(struct radeon_bo *bo, uint32_t *name) { struct radeon_bo_int *boi = (struct radeon_bo_int *)bo; diff --git a/radeon/radeon_bo_gem.h b/radeon/radeon_bo_gem.h index c56c58e9..0af8610b 100644 --- a/radeon/radeon_bo_gem.h +++ b/radeon/radeon_bo_gem.h @@ -38,6 +38,7 @@ struct radeon_bo_manager *radeon_bo_manager_gem_ctor(int fd); void radeon_bo_manager_gem_dtor(struct radeon_bo_manager *bom); uint32_t radeon_gem_name_bo(struct radeon_bo *bo); +void *radeon_gem_get_reloc_in_cs(struct radeon_bo *bo); int radeon_gem_set_domain(struct radeon_bo *bo, uint32_t read_domains, uint32_t write_domain); int radeon_gem_get_kernel_name(struct radeon_bo *bo, uint32_t *name); #endif diff --git a/radeon/radeon_cs.c b/radeon/radeon_cs.c index cc9be398..d0e922be 100644 --- a/radeon/radeon_cs.c +++ b/radeon/radeon_cs.c @@ -88,3 +88,9 @@ void radeon_cs_space_set_flush(struct radeon_cs *cs, void (*fn)(void *), void *d csi->space_flush_fn = fn; csi->space_flush_data = data; } + +uint32_t radeon_cs_get_id(struct radeon_cs *cs) +{ + struct radeon_cs_int *csi = (struct radeon_cs_int *)cs; + return csi->id; +} diff --git a/radeon/radeon_cs.h b/radeon/radeon_cs.h index 49d5d9a6..7f6ee68b 100644 --- a/radeon/radeon_cs.h +++ b/radeon/radeon_cs.h @@ -85,7 +85,7 @@ extern int radeon_cs_write_reloc(struct radeon_cs *cs, uint32_t read_domain, uint32_t write_domain, uint32_t flags); - +extern uint32_t radeon_cs_get_id(struct radeon_cs *cs); /* * add a persistent BO to the list * a persistent BO is one that will be referenced across flushes, diff --git a/radeon/radeon_cs_gem.c b/radeon/radeon_cs_gem.c index 45a219c3..28ef5f64 100644 --- a/radeon/radeon_cs_gem.c +++ b/radeon/radeon_cs_gem.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include "radeon_cs.h" @@ -41,6 +42,7 @@ #include "radeon_bo_gem.h" #include "drm.h" #include "xf86drm.h" +#include "xf86atomic.h" #include "radeon_drm.h" struct radeon_cs_manager_gem { @@ -68,6 +70,50 @@ struct cs_gem { struct radeon_bo_int **relocs_bo; }; +static pthread_mutex_t id_mutex = PTHREAD_MUTEX_INITIALIZER; +static uint32_t cs_id_source = 0; + +/** + * result is undefined if called with ~0 + */ +static uint32_t get_first_zero(const uint32_t n) +{ + /* __builtin_ctz returns number of trailing zeros. */ + return 1 << __builtin_ctz(~n); +} + +/** + * Returns a free id for cs. + * If there is no free id we return zero + **/ +static uint32_t generate_id(void) +{ + uint32_t r = 0; + pthread_mutex_lock( &id_mutex ); + /* check for free ids */ + if (cs_id_source != ~r) { + /* find first zero bit */ + r = get_first_zero(cs_id_source); + + /* set id as reserved */ + cs_id_source |= r; + } + pthread_mutex_unlock( &id_mutex ); + return r; +} + +/** + * Free the id for later reuse + **/ +static void free_id(uint32_t id) +{ + pthread_mutex_lock( &id_mutex ); + + cs_id_source &= ~id; + + pthread_mutex_unlock( &id_mutex ); +} + static struct radeon_cs_int *cs_gem_create(struct radeon_cs_manager *csm, uint32_t ndw) { @@ -90,6 +136,7 @@ static struct radeon_cs_int *cs_gem_create(struct radeon_cs_manager *csm, } csg->base.relocs_total_size = 0; csg->base.crelocs = 0; + csg->base.id = generate_id(); csg->nrelocs = 4096 / (4 * 4) ; csg->relocs_bo = (struct radeon_bo_int**)calloc(1, csg->nrelocs*sizeof(void*)); @@ -141,38 +188,45 @@ static int cs_gem_write_reloc(struct radeon_cs_int *cs, if (write_domain == RADEON_GEM_DOMAIN_CPU) { return -EINVAL; } - /* check if bo is already referenced */ - for(i = 0; i < cs->crelocs; i++) { - idx = i * RELOC_SIZE; - reloc = (struct cs_reloc_gem*)&csg->relocs[idx]; - if (reloc->handle == bo->handle) { - /* Check domains must be in read or write. As we check already - * checked that in argument one of the read or write domain was - * set we only need to check that if previous reloc as the read - * domain set then the read_domain should also be set for this - * new relocation. - */ - /* the DDX expects to read and write from same pixmap */ - if (write_domain && (reloc->read_domain & write_domain)) { - reloc->read_domain = 0; - reloc->write_domain = write_domain; - } else if (read_domain & reloc->write_domain) { - reloc->read_domain = 0; - } else { - if (write_domain != reloc->write_domain) - return -EINVAL; - if (read_domain != reloc->read_domain) - return -EINVAL; + /* use bit field hash function to determine + if this bo is for sure not in this cs.*/ + if ((atomic_read((atomic_t *)radeon_gem_get_reloc_in_cs(bo)) & cs->id)) { + /* check if bo is already referenced. + * Scanning from end to begin reduces cycles with mesa because + * it often relocates same shared dma bo again. */ + for(i = cs->crelocs; i != 0;) { + --i; + idx = i * RELOC_SIZE; + reloc = (struct cs_reloc_gem*)&csg->relocs[idx]; + if (reloc->handle == bo->handle) { + /* Check domains must be in read or write. As we check already + * checked that in argument one of the read or write domain was + * set we only need to check that if previous reloc as the read + * domain set then the read_domain should also be set for this + * new relocation. + */ + /* the DDX expects to read and write from same pixmap */ + if (write_domain && (reloc->read_domain & write_domain)) { + reloc->read_domain = 0; + reloc->write_domain = write_domain; + } else if (read_domain & reloc->write_domain) { + reloc->read_domain = 0; + } else { + if (write_domain != reloc->write_domain) + return -EINVAL; + if (read_domain != reloc->read_domain) + return -EINVAL; + } + + reloc->read_domain |= read_domain; + reloc->write_domain |= write_domain; + /* update flags */ + reloc->flags |= (flags & reloc->flags); + /* write relocation packet */ + radeon_cs_write_dword((struct radeon_cs *)cs, 0xc0001000); + radeon_cs_write_dword((struct radeon_cs *)cs, idx); + return 0; } - - reloc->read_domain |= read_domain; - reloc->write_domain |= write_domain; - /* update flags */ - reloc->flags |= (flags & reloc->flags); - /* write relocation packet */ - radeon_cs_write_dword((struct radeon_cs *)cs, 0xc0001000); - radeon_cs_write_dword((struct radeon_cs *)cs, idx); - return 0; } } /* new relocation */ @@ -203,6 +257,8 @@ static int cs_gem_write_reloc(struct radeon_cs_int *cs, reloc->flags = flags; csg->chunks[1].length_dw += RELOC_SIZE; radeon_bo_ref(bo); + /* bo might be referenced from another context so have to use atomic opertions */ + atomic_add((atomic_t *)radeon_gem_get_reloc_in_cs(bo), cs->id); cs->relocs_total_size += boi->size; radeon_cs_write_dword((struct radeon_cs *)cs, 0xc0001000); radeon_cs_write_dword((struct radeon_cs *)cs, idx); @@ -288,6 +344,8 @@ static int cs_gem_emit(struct radeon_cs_int *cs) &csg->cs, sizeof(struct drm_radeon_cs)); for (i = 0; i < csg->base.crelocs; i++) { csg->relocs_bo[i]->space_accounted = 0; + /* bo might be referenced from another context so have to use atomic opertions */ + atomic_dec((atomic_t *)radeon_gem_get_reloc_in_cs((struct radeon_bo*)csg->relocs_bo[i]), cs->id); radeon_bo_unref((struct radeon_bo *)csg->relocs_bo[i]); csg->relocs_bo[i] = NULL; } @@ -302,6 +360,7 @@ static int cs_gem_destroy(struct radeon_cs_int *cs) { struct cs_gem *csg = (struct cs_gem*)cs; + free_id(cs->id); free(csg->relocs_bo); free(cs->relocs); free(cs->packets); @@ -317,6 +376,8 @@ static int cs_gem_erase(struct radeon_cs_int *cs) if (csg->relocs_bo) { for (i = 0; i < csg->base.crelocs; i++) { if (csg->relocs_bo[i]) { + /* bo might be referenced from another context so have to use atomic opertions */ + atomic_dec((atomic_t *)radeon_gem_get_reloc_in_cs((struct radeon_bo*)csg->relocs_bo[i]), cs->id); radeon_bo_unref((struct radeon_bo *)csg->relocs_bo[i]); csg->relocs_bo[i] = NULL; } diff --git a/radeon/radeon_cs_int.h b/radeon/radeon_cs_int.h index 8ba76bf9..6cee5742 100644 --- a/radeon/radeon_cs_int.h +++ b/radeon/radeon_cs_int.h @@ -28,6 +28,7 @@ struct radeon_cs_int { int bo_count; void (*space_flush_fn)(void *); void *space_flush_data; + uint32_t id; }; /* cs functions */ diff --git a/xf86atomic.h b/xf86atomic.h index de8e220e..854187a0 100644 --- a/xf86atomic.h +++ b/xf86atomic.h @@ -50,6 +50,8 @@ typedef struct { # define atomic_set(x, val) ((x)->atomic = (val)) # define atomic_inc(x) ((void) __sync_fetch_and_add (&(x)->atomic, 1)) # define atomic_dec_and_test(x) (__sync_fetch_and_add (&(x)->atomic, -1) == 1) +# define atomic_add(x, v) ((void) __sync_add_and_fetch(&(x)->atomic, (v))) +# define atomic_dec(x, v) ((void) __sync_sub_and_fetch(&(x)->atomic, (v))) # define atomic_cmpxchg(x, oldv, newv) __sync_val_compare_and_swap (&(x)->atomic, oldv, newv) #endif @@ -66,6 +68,8 @@ typedef struct { # define atomic_read(x) AO_load_full(&(x)->atomic) # define atomic_set(x, val) AO_store_full(&(x)->atomic, (val)) # define atomic_inc(x) ((void) AO_fetch_and_add1_full(&(x)->atomic)) +# define atomic_add(x, v) ((void) AO_fetch_and_add_full(&(x)->atomic, (v))) +# define atomic_dec(x, v) ((void) AO_fetch_and_add_full(&(x)->atomic, -(v))) # define atomic_dec_and_test(x) (AO_fetch_and_sub1_full(&(x)->atomic) == 1) # define atomic_cmpxchg(x, oldv, newv) AO_compare_and_swap_full(&(x)->atomic, oldv, newv) @@ -82,6 +86,8 @@ typedef struct { uint_t atomic; } atomic_t; # define atomic_set(x, val) ((x)->atomic = (uint_t)(val)) # define atomic_inc(x) (atomic_inc_uint (&(x)->atomic)) # define atomic_dec_and_test(x) (atomic_dec_uint_nv(&(x)->atomic) == 1) +# define atomic_add(x, v) (atomic_add_uint(&(x)->atomic, (v))) +# define atomic_dec(x, v) (atomic_dec_uint(&(x)->atomic, (v))) # define atomic_cmpxchg(x, oldv, newv) atomic_cas_uint (&(x)->atomic, oldv, newv) #endif -- cgit v1.2.3