Luca Barbieri
2010-Feb-09 09:00 UTC
[Nouveau] [PATCH 1/3] Introduce nouveau_bo_wait for waiting on a BO with a GPU channel (v2)
Changes in v2:
- Addressed review comments

nouveau_bo_wait will make the GPU channel wait for the fence if
possible, otherwise falling back to waiting with the CPU using
ttm_bo_wait.

The nouveau_fence_sync function currently returns -ENOSYS, and is the
focus of the next patch.

Signed-off-by: Luca Barbieri <luca at luca-barbieri.com>
---
 drivers/gpu/drm/nouveau/nouveau_bo.c    |   68 ++++++++++++++++++++++++++++++-
 drivers/gpu/drm/nouveau/nouveau_drv.h   |    2 +
 drivers/gpu/drm/nouveau/nouveau_fence.c |    6 +++
 drivers/gpu/drm/nouveau/nouveau_gem.c   |   20 +++++----
 4 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c b/drivers/gpu/drm/nouveau/nouveau_bo.c
index 028719f..2da6acf 100644
--- a/drivers/gpu/drm/nouveau/nouveau_bo.c
+++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
@@ -35,6 +35,70 @@
 
 #include <linux/log2.h>
 
+int
+nouveau_bo_wait(struct ttm_buffer_object *bo, struct nouveau_channel *chan)
+{
+	int ret = 0;
+
+	if (likely(!bo->sync_obj))
+		return 0;
+
+	spin_lock(&bo->lock);
+	if (chan) {
+		struct nouveau_fence *new_fence;
+		struct nouveau_channel *waited_chan;
+
+		do {
+			struct nouveau_fence *prev_fence;
+			prev_fence = bo->sync_obj;
+
+			waited_chan = nouveau_fence_channel(prev_fence);
+			if (likely(!waited_chan || waited_chan == chan))
+				break;
+
+			nouveau_fence_ref(prev_fence);
+
+			ret = ttm_bo_wait(bo, false, false, true);
+			if (!ret)
+				goto unref_break;
+
+			if (unlikely(prev_fence != bo->sync_obj))
+				goto unref_continue;
+
+			spin_unlock(&bo->lock);
+			new_fence = nouveau_fence_sync(prev_fence, chan);
+			spin_lock(&bo->lock);
+
+			if (likely(!IS_ERR(new_fence))) {
+				if (likely(bo->sync_obj)) {
+					if (unlikely(bo->sync_obj != prev_fence)) {
+						nouveau_fence_unref((void **)&new_fence);
+						continue;
+					}
+					nouveau_fence_unref((void **)&bo->sync_obj);
+				}
+				bo->sync_obj = new_fence;
+				ret = 0;
+unref_break:
+				nouveau_fence_unref((void **)&prev_fence);
+				break;
+			}
+
+			if (unlikely(prev_fence != bo->sync_obj)) {
+unref_continue:
+				nouveau_fence_unref((void **)&prev_fence);
+				continue;
+			}
+
+			nouveau_fence_unref((void **)&prev_fence);
+			ret = ttm_bo_wait(bo, false, false, false);
+		} while (0);
+	} else
+		ret = ttm_bo_wait(bo, false, false, false);
+	spin_unlock(&bo->lock);
+	return ret;
+}
+
 static void
 nouveau_bo_del_ttm(struct ttm_buffer_object *bo)
 {
@@ -469,8 +533,10 @@ nouveau_bo_move_accel_cleanup(struct nouveau_channel *chan,
 
 	ret = ttm_bo_move_accel_cleanup(&nvbo->bo, fence, NULL,
 					evict, no_wait, new_mem);
+
+	/* TODO: this should be redundant, since we do the check in validate */
 	if (nvbo->channel && nvbo->channel != chan)
-		ret = nouveau_fence_wait(fence, NULL, false, false);
+		ret = nouveau_bo_wait(&nvbo->bo, nvbo->channel);
 	nouveau_fence_unref((void *)&fence);
 	return ret;
 }
diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index 64987a9..bb9024c 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -1111,6 +1111,7 @@ extern int nv04_crtc_create(struct drm_device *, int index);
 
 /* nouveau_bo.c */
 extern struct ttm_bo_driver nouveau_bo_driver;
+extern int nouveau_bo_wait(struct ttm_buffer_object *bo, struct nouveau_channel *chan);
 extern int nouveau_bo_new(struct drm_device *, struct nouveau_channel *,
 			  int size, int align, uint32_t flags,
 			  uint32_t tile_mode, uint32_t tile_flags,
@@ -1136,6 +1137,7 @@ extern int nouveau_fence_emit(struct nouveau_fence *);
 struct nouveau_channel *nouveau_fence_channel(struct nouveau_fence *);
 extern bool nouveau_fence_signalled(void *obj, void *arg);
 extern int nouveau_fence_wait(void *obj, void *arg, bool lazy, bool intr);
+extern struct nouveau_fence *nouveau_fence_sync(struct nouveau_fence *, struct nouveau_channel *);
 extern int nouveau_fence_flush(void *obj, void *arg);
 extern void nouveau_fence_unref(void **obj);
 extern void *nouveau_fence_ref(void *obj);
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index faddf53..9b1c2c3 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -56,6 +56,12 @@ nouveau_fence_del(struct kref *ref)
 	kfree(fence);
 }
 
+struct nouveau_fence*
+nouveau_fence_sync(struct nouveau_fence *waited_fence, struct nouveau_channel *chan)
+{
+	return ERR_PTR(-ENOSYS);
+}
+
 void
 nouveau_fence_update(struct nouveau_channel *chan)
 {
diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
index 34063c5..f73ac83 100644
--- a/drivers/gpu/drm/nouveau/nouveau_gem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
@@ -354,15 +354,6 @@ validate_list(struct nouveau_channel *chan, struct list_head *list,
 
 	list_for_each_entry(nvbo, list, entry) {
 		struct drm_nouveau_gem_pushbuf_bo *b = &pbbo[nvbo->pbbo_index];
-		struct nouveau_fence *prev_fence = nvbo->bo.sync_obj;
-
-		if (prev_fence && nouveau_fence_channel(prev_fence) != chan) {
-			spin_lock(&nvbo->bo.lock);
-			ret = ttm_bo_wait(&nvbo->bo, false, false, false);
-			spin_unlock(&nvbo->bo.lock);
-			if (unlikely(ret))
-				return ret;
-		}
 
 		ret = nouveau_gem_set_domain(nvbo->gem, b->read_domains,
 					     b->write_domains,
@@ -377,6 +368,17 @@ validate_list(struct nouveau_channel *chan, struct list_head *list,
 		if (unlikely(ret))
 			return ret;
 
+		/* we must wait *after* validation, since we do the move
+		   with the kernel channel.
+
+		   Note that this may spin/sleep on a fence
+		   TODO: is this a good idea, or should we bail and retry?
+		*/
+		ret = nouveau_bo_wait(&nvbo->bo, chan);
+		if (unlikely(ret))
+			return ret;
+
+
 		if (nvbo->bo.offset == b->presumed_offset &&
 		    ((nvbo->bo.mem.mem_type == TTM_PL_VRAM &&
 		      b->presumed_domain & NOUVEAU_GEM_DOMAIN_VRAM) ||
-- 
1.6.6.1.476.g01ddb
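As a reading aid, the control flow of nouveau_bo_wait() boils down to:
skip idle buffers, skip fences from the same channel, try to queue a
GPU-side wait, and only then block on the CPU. The standalone sketch
below models just that decision chain with hypothetical types and
helpers (it is not the kernel code path; it deliberately omits the
bo->lock handling and the fence refcounting the real function needs
because bo->sync_obj can change while the lock is dropped):

/* Simplified userspace model of the nouveau_bo_wait() decision logic. */
#include <stddef.h>
#include <stdio.h>

struct fence { int channel_id; };
struct buffer { struct fence *sync_obj; };

/* Stand-in for queueing a GPU-side wait: 0 on success, -1 if unsupported. */
static int gpu_sync(struct fence *f, int chan_id)
{
	(void)f;
	(void)chan_id;
	return -1;	/* e.g. hardware without semaphores */
}

/* Stand-in for blocking on the CPU, as ttm_bo_wait() would. */
static int cpu_wait(struct fence *f)
{
	(void)f;
	return 0;
}

static int buffer_wait(struct buffer *bo, int chan_id)
{
	struct fence *f = bo->sync_obj;

	if (!f)
		return 0;		/* idle buffer: nothing to do */
	if (f->channel_id == chan_id)
		return 0;		/* same channel: FIFO ordering is enough */
	if (gpu_sync(f, chan_id) == 0)
		return 0;		/* the GPU will wait, the CPU continues */
	return cpu_wait(f);		/* fallback path */
}

int main(void)
{
	struct fence f = { .channel_id = 1 };
	struct buffer bo = { .sync_obj = &f };

	printf("wait result: %d\n", buffer_wait(&bo, 2));
	return 0;
}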
Luca Barbieri
2010-Feb-09 09:00 UTC
[Nouveau] [PATCH 2/3] drm/nouveau: add lockless dynamic semaphore allocator (v2)
Changes in v2:
- Addressed review comments
- Fixed lockless algorithm (must not dec if negative in addition to if 0)
- Made spinlock irqsave (fences are completed in IRQs)

This patch adds code to allocate semaphores in a dynamic way using a
lockless algorithm.

1. Semaphore BOs

Semaphore BOs are BOs containing semaphores. Each is 4KB large and
contains 1024 4-byte semaphores. They are pinned and mapped.

Semaphore BOs are allocated on-demand and freed at device takedown.
Those that are not fully allocated are kept on a free list.
Each is assigned a handle.

DMA objects and references are created on demand for each channel
that needs to use a semaphore BO. Those objects and references are
automatically destroyed at channel destruction time.

Typically only a single semaphore BO will be used.

2. Semaphore allocation

Each semaphore BO contains a bitmask of free semaphores within the BO.
Allocation is done in a lockless fashion using a count of free
semaphores and the bitmask.

Semaphores are released once the fence on the waiting side has passed.
This is done by adding fields to nouveau_fence.

Semaphore values are zeroed when the semaphore BO is allocated, and
are afterwards only modified by the GPU. This is achieved by storing a
bitmask that allows alternating between the values 0 and 1 for a given
semaphore.

Signed-off-by: Luca Barbieri <luca at luca-barbieri.com>
---
 drivers/gpu/drm/nouveau/nouveau_drv.h   |    9 +
 drivers/gpu/drm/nouveau/nouveau_fence.c |  265 +++++++++++++++++++++++++++++++
 drivers/gpu/drm/nouveau/nouveau_state.c |    4 +
 3 files changed, 278 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index bb9024c..93e5427 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -621,6 +621,13 @@ struct drm_nouveau_private {
 	struct {
 		struct dentry *channel_root;
 	} debugfs;
+
+	struct {
+		spinlock_t free_list_lock;
+		struct nouveau_sem_bo *free_list;
+		uint32_t handles;
+		uint32_t max_handles;
+	} sem;
 };
 
 static inline struct drm_nouveau_private *
@@ -1142,6 +1149,8 @@ extern int nouveau_fence_flush(void *obj, void *arg);
 extern void nouveau_fence_unref(void **obj);
 extern void *nouveau_fence_ref(void *obj);
 extern void nouveau_fence_handler(struct drm_device *dev, int channel);
+extern void nouveau_fence_device_init(struct drm_device *dev);
+extern void nouveau_fence_device_takedown(struct drm_device *dev);
 
 /* nouveau_gem.c */
 extern int nouveau_gem_new(struct drm_device *, struct nouveau_channel *,
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 9b1c2c3..7157148 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -32,6 +32,13 @@
 
 #define USE_REFCNT (dev_priv->card_type >= NV_10)
 
+#define NOUVEAU_SEM_BO_SIZE PAGE_SIZE
+
+/* reading fences can be very expensive
+ * use a threshold that would only use up half a single sem_bo
+ */
+#define NOUVEAU_SEM_MIN_THRESHOLD (NOUVEAU_SEM_BO_SIZE / (NOUVEAU_MAX_CHANNEL_NR * 2))
+
 struct nouveau_fence {
 	struct nouveau_channel *channel;
 	struct kref refcount;
@@ -47,6 +54,240 @@ nouveau_fence(void *sync_obj)
 	return (struct nouveau_fence *)sync_obj;
 }
 
+struct nouveau_sem_bo {
+	struct nouveau_sem_bo *next;
+	struct nouveau_bo *bo;
+	uint32_t handle;
+
+	/* >= 0: num_free + 1 slots are free, sem_bo is or is about to be on free_list
+	   -1: all allocated, sem_bo is NOT on free_list
+	*/
+	atomic_t num_free;
+
+	DECLARE_BITMAP(free_slots, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
+	DECLARE_BITMAP(values, NOUVEAU_SEM_BO_SIZE / sizeof(uint32_t));
+	DECLARE_BITMAP(channels, NOUVEAU_MAX_CHANNEL_NR);
+};
+
+struct nouveau_sem {
+	struct nouveau_sem_bo *sem_bo;
+	unsigned num;
+	uint32_t value;
+};
+
+static struct nouveau_sem_bo*
+nouveau_sem_bo_alloc(struct drm_device *dev)
+{
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+	struct nouveau_sem_bo *sem_bo;
+	struct nouveau_bo *bo;
+	int flags = TTM_PL_FLAG_VRAM;
+	int ret;
+	bool is_iomem;
+	void *mem;
+	unsigned handle;
+
+	do {
+		handle = dev_priv->sem.handles;
+		if (handle >= dev_priv->sem.max_handles)
+			return NULL;
+	} while (cmpxchg(&dev_priv->sem.handles, handle, handle + 1) != handle);
+
+	sem_bo = kmalloc(sizeof(*sem_bo), GFP_KERNEL);
+	if (!sem_bo)
+		return NULL;
+
+	sem_bo->handle = NvSem + handle;
+
+	ret = nouveau_bo_new(dev, NULL, NOUVEAU_SEM_BO_SIZE, 0, flags,
+			     0, 0x0000, true, true, &bo);
+	if (ret)
+		goto out_free;
+
+	sem_bo->bo = bo;
+
+	ret = nouveau_bo_pin(bo, flags);
+	if (ret)
+		goto out_bo;
+
+	ret = nouveau_bo_map(bo);
+	if (ret)
+		goto out_unpin;
+
+	mem = ttm_kmap_obj_virtual(&bo->kmap, &is_iomem);
+	if (is_iomem)
+		memset_io((void __force __iomem *)mem, 0, NOUVEAU_SEM_BO_SIZE);
+	else
+		memset(mem, 0, NOUVEAU_SEM_BO_SIZE);
+
+	nouveau_bo_unmap(bo);
+
+	memset((void *)sem_bo->free_slots, 0xff, sizeof(sem_bo->free_slots));
+	memset((void *)sem_bo->values, 0xff, sizeof(sem_bo->values));
+	atomic_set(&sem_bo->num_free, sizeof(sem_bo->free_slots) * 8 - 1);
+
+	memset((void *)sem_bo->channels, 0, sizeof(sem_bo->channels));
+
+	return sem_bo;
+
+out_unpin:
+	nouveau_bo_unpin(sem_bo->bo);
+out_bo:
+	nouveau_bo_ref(NULL, &sem_bo->bo);
+out_free:
+	kfree(sem_bo);
+	return NULL;
+}
+
+static void
+nouveau_sem_bo_channel_dtor(struct drm_device *dev,
+			    struct nouveau_gpuobj *gpuobj) {
+	struct nouveau_sem_bo *sem_bo;
+	struct nouveau_channel *chan;
+
+	if (!gpuobj->priv)
+		return;
+
+	chan = gpuobj->im_channel;
+	sem_bo = gpuobj->priv;
+
+	clear_bit(chan->id, sem_bo->channels);
+	smp_wmb();
+}
+
+static int
+nouveau_sem_bo_channel_init(struct nouveau_sem_bo *sem_bo, struct nouveau_channel *chan)
+{
+	struct drm_device *dev = chan->dev;
+	struct nouveau_gpuobj *obj = NULL;
+	int ret;
+
+	if (test_bit(chan->id, sem_bo->channels))
+		return 0;
+
+	if (WARN_ON(sem_bo->bo->bo.mem.mem_type != TTM_PL_VRAM))
+		return -EINVAL;
+
+	ret = nouveau_gpuobj_dma_new(chan, NV_CLASS_DMA_IN_MEMORY,
+			sem_bo->bo->bo.mem.mm_node->start, NOUVEAU_SEM_BO_SIZE,
+			NV_DMA_ACCESS_RW, NV_DMA_TARGET_VIDMEM, &obj);
+	if (ret)
+		return ret;
+
+	obj->dtor = nouveau_sem_bo_channel_dtor;
+	obj->priv = sem_bo;
+
+	ret = nouveau_gpuobj_ref_add(dev, chan, sem_bo->handle, obj, NULL);
+	if (ret) {
+		nouveau_gpuobj_del(dev, &obj);
+		return ret;
+	}
+
+	set_bit(chan->id, sem_bo->channels);
+	smp_wmb();
+
+	return 0;
+}
+
+static void
+nouveau_sem_bo_free(struct nouveau_sem_bo *sem_bo)
+{
+	nouveau_bo_unpin(sem_bo->bo);
+	nouveau_bo_ref(NULL, &sem_bo->bo);
+	kfree(sem_bo);
+}
+
+static inline void
+nouveau_sem_bo_enqueue(struct drm_device *dev, struct nouveau_sem_bo *sem_bo)
+{
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dev_priv->sem.free_list_lock, flags);
+	sem_bo->next = dev_priv->sem.free_list;
+	dev_priv->sem.free_list = sem_bo;
+	spin_unlock_irqrestore(&dev_priv->sem.free_list_lock, flags);
+}
+
+static int
+nouveau_sem_alloc(struct drm_device *dev, struct nouveau_sem *sem)
+{
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+	struct nouveau_sem_bo *sem_bo = NULL;
+	int v;
+
+retry:
+	sem_bo = dev_priv->sem.free_list;
+	if (!sem_bo) {
+		sem_bo = nouveau_sem_bo_alloc(dev);
+		if (!sem_bo)
+			return -ENOMEM;
+
+		atomic_dec(&sem_bo->num_free);
+		nouveau_sem_bo_enqueue(dev, sem_bo);
+	} else {
+		int num_free;
+retry_num_free:
+		num_free = atomic_read(&sem_bo->num_free);
+		if (unlikely(num_free <= 0)) {
+			unsigned long flags;
+			if (unlikely(num_free < 0))
+				goto retry;
+
+			spin_lock_irqsave(&dev_priv->sem.free_list_lock, flags);
+			if (unlikely(sem_bo != dev_priv->sem.free_list)) {
+				spin_unlock_irqrestore(&dev_priv->sem.free_list_lock, flags);
+				goto retry;
+			}
+
+			dev_priv->sem.free_list = sem_bo->next;
+			/* Someone may have incremented the count in the meantime.
+			 * In this case, revert the above line and put it back on the free list.
+			 *
+			 * Note that we can't just decrement before removing from the list,
+			 * since otherwise an increment could put sem_bo in the free_list twice,
+			 * corrupting it.
+			 *
+			 * Note that num_free cannot already be -1 because we just checked that
+			 * sem_bo is still the head of the free list, and we are holding free_list_lock.
+			 *
+			 * atomic_dec_return is a memory barrier, so this is fine.
+			 */
+			if (atomic_dec_return(&sem_bo->num_free) >= 0)
+				dev_priv->sem.free_list = sem_bo;
+
+			spin_unlock_irqrestore(&dev_priv->sem.free_list_lock, flags);
+		} else if (unlikely(atomic_cmpxchg(&sem_bo->num_free, num_free, num_free - 1) != num_free))
+			goto retry_num_free;
+	}
+
+retry_bit:
+	v = find_first_bit(sem_bo->free_slots, sizeof(sem_bo->free_slots) * 8);
+
+	/* we reserved our bit by decrementing num_free, so this doesn't happen
+	   however, the first available bit may have been taken */
+	if (WARN_ON(v >= sizeof(sem_bo->free_slots) * 8))
+		goto retry;
+
+	if (unlikely(!test_and_clear_bit(v, sem_bo->free_slots)))
+		goto retry_bit;
+
+	sem->sem_bo = sem_bo;
+	sem->value = test_and_change_bit(v, sem_bo->values);
+	sem->num = v;
+
+	return 0;
+}
+
+static void
+nouveau_sem_release(struct drm_device *dev, struct nouveau_sem_bo *sem_bo, int i)
+{
+	set_bit(i, sem_bo->free_slots);
+
+	if (atomic_inc_and_test(&sem_bo->num_free))
+		nouveau_sem_bo_enqueue(dev, sem_bo);
+}
+
 static void
 nouveau_fence_del(struct kref *ref)
 {
@@ -266,3 +507,27 @@ nouveau_fence_fini(struct nouveau_channel *chan)
 	}
 }
 
+void
+nouveau_fence_device_init(struct drm_device *dev)
+{
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+	spin_lock_init(&dev_priv->sem.free_list_lock);
+	dev_priv->sem.free_list = NULL;
+	dev_priv->sem.handles = 0;
+	/* these are each pinned and 4KB, providing 1024 semaphores each
+	   we should need only one in normal circumstances */
+	dev_priv->sem.max_handles = 16;
+}
+
+void
+nouveau_fence_device_takedown(struct drm_device *dev)
+{
+	struct drm_nouveau_private *dev_priv = dev->dev_private;
+	struct nouveau_sem_bo *sem_bo, *next;
+	/* all the sem_bos allocated must be in the free list since all channels
+	 * and thus fences have already been terminated */
+	for (sem_bo = dev_priv->sem.free_list; sem_bo; sem_bo = next) {
+		next = sem_bo->next;
+		nouveau_sem_bo_free(sem_bo);
+	}
+}
diff --git a/drivers/gpu/drm/nouveau/nouveau_state.c b/drivers/gpu/drm/nouveau/nouveau_state.c
index 3586667..bd3c43a 100644
--- a/drivers/gpu/drm/nouveau/nouveau_state.c
+++ b/drivers/gpu/drm/nouveau/nouveau_state.c
@@ -413,6 +413,8 @@ nouveau_card_init(struct drm_device *dev)
 	if (ret)
 		goto out_mem;
 
+	nouveau_fence_device_init(dev);
+
 	/* PMC */
 	ret = engine->mc.init(dev);
 	if (ret)
@@ -533,6 +535,8 @@ static void nouveau_card_takedown(struct drm_device *dev)
 	engine->timer.takedown(dev);
 	engine->mc.takedown(dev);
 
+	nouveau_fence_device_takedown(dev);
+
 	mutex_lock(&dev->struct_mutex);
 	ttm_bo_clean_mm(&dev_priv->ttm.bdev, TTM_PL_VRAM);
 	ttm_bo_clean_mm(&dev_priv->ttm.bdev, TTM_PL_TT);
-- 
1.6.6.1.476.g01ddb
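The core of the allocator above is the pairing of an atomic free count
with a bitmap of free slots. The standalone C11 sketch below models
that scheme with illustrative names only (it leaves out the free list
of semaphore BOs and the -1 "not on the free list" encoding from the
patch); what it demonstrates is the changelog's fix: the count is only
taken with a compare-exchange that refuses to go below zero, and the
bitmap is searched only after a reservation has succeeded.

/* Standalone model of lockless slot allocation: atomic count + bitmap. */
#include <stdatomic.h>
#include <stdio.h>

#define NUM_SLOTS (8 * (int)sizeof(unsigned long))

struct sem_pool {
	atomic_int num_free;		/* slots still reservable */
	atomic_ulong free_slots;	/* bit set = slot free */
};

static void pool_init(struct sem_pool *p)
{
	atomic_init(&p->num_free, NUM_SLOTS);
	atomic_init(&p->free_slots, ~0UL);
}

/* Reserve a slot: first take a unit from the counter, then claim a bit. */
static int pool_alloc(struct sem_pool *p)
{
	int n = atomic_load(&p->num_free);

	do {
		if (n <= 0)
			return -1;	/* never go negative: nothing to claim */
	} while (!atomic_compare_exchange_weak(&p->num_free, &n, n - 1));

	for (;;) {
		unsigned long bits = atomic_load(&p->free_slots);
		int slot = __builtin_ctzl(bits);	/* a free bit must exist */
		unsigned long mask = 1UL << slot;

		/* Another thread may grab the same bit first; retry if so. */
		if (atomic_fetch_and(&p->free_slots, ~mask) & mask)
			return slot;
	}
}

static void pool_release(struct sem_pool *p, int slot)
{
	atomic_fetch_or(&p->free_slots, 1UL << slot);
	atomic_fetch_add(&p->num_free, 1);
}

int main(void)
{
	struct sem_pool pool;
	int a, b;

	pool_init(&pool);
	a = pool_alloc(&pool);
	b = pool_alloc(&pool);
	printf("allocated slots %d and %d\n", a, b);
	pool_release(&pool, a);
	pool_release(&pool, b);
	return 0;
}

Compile with any C11 compiler; __builtin_ctzl is a GCC/Clang builtin
used here for brevity in place of the kernel's find_first_bit().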
Luca Barbieri
2010-Feb-09 09:00 UTC
[Nouveau] [PATCH 3/3] Use semaphores for fully on-GPU interchannel synchronization (v2)
Changes in v2:
- Addressed review comments

This patch implements the nouveau_fence_sync interface introduced in
the previous patch using the dynamically allocated semaphores
introduced in the other previous patch.

This is tested on NV40, but should work on NV17-NV50 (earlier cards
will just fall back to CPU waiting).

Unlike a previously posted patch, this patch does not make any use of
software methods and is designed to do all work on the GPU, and to be
as fast as possible.

To perform inter-channel synchronization, commands are emitted on both
channels involved.

First, a semaphore is allocated, and a valid handle for it is inserted
in the channel if necessary. DMA_SEMAPHORE is set only if it differs
from the last one used; this is usually not the case, so SEMAPHORE
interrupts usually happen only once per channel. After that,
SEMAPHORE_OFFSET is set if changed, and then either ACQUIRE or RELEASE
is used.

On the waiting channel, a fence is also emitted. Once that fence
expires, the semaphore is released and can be reused for any purpose.

This results in synchronization taking place fully on the GPU, with no
CPU waiting necessary.

Signed-off-by: Luca Barbieri <luca at luca-barbieri.com>
---
 drivers/gpu/drm/nouveau/nouveau_drv.h   |    7 ++
 drivers/gpu/drm/nouveau/nouveau_fence.c |  136 +++++++++++++++++++++++++++++--
 2 files changed, 136 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_drv.h b/drivers/gpu/drm/nouveau/nouveau_drv.h
index 93e5427..d3aa20e 100644
--- a/drivers/gpu/drm/nouveau/nouveau_drv.h
+++ b/drivers/gpu/drm/nouveau/nouveau_drv.h
@@ -195,6 +195,8 @@ struct nouveau_channel {
 		uint32_t sequence;
 		uint32_t sequence_ack;
 		uint32_t last_sequence_irq;
+		atomic_t sem_count;
+		unsigned sem_threshold;
 	} fence;
 
 	/* DMA push buffer */
@@ -255,6 +257,11 @@ struct nouveau_channel {
 		char name[32];
 		struct drm_info_list info;
 	} debugfs;
+
+	struct {
+		unsigned handle;
+		unsigned num;
+	} sem;
 };
 
 struct nouveau_instmem_engine {
diff --git a/drivers/gpu/drm/nouveau/nouveau_fence.c b/drivers/gpu/drm/nouveau/nouveau_fence.c
index 7157148..b4b016f 100644
--- a/drivers/gpu/drm/nouveau/nouveau_fence.c
+++ b/drivers/gpu/drm/nouveau/nouveau_fence.c
@@ -46,6 +46,9 @@ struct nouveau_fence {
 	uint32_t sequence;
 	bool signalled;
+
+	struct nouveau_sem_bo *sem_bo;
+	int sem_num;
 };
 
 static inline struct nouveau_fence *
@@ -297,10 +300,123 @@ nouveau_fence_del(struct kref *ref)
 	kfree(fence);
 }
 
+static inline void
+nouveau_sem_emit(struct nouveau_channel *chan, struct nouveau_sem *sem, unsigned op)
+{
+	uint32_t handle = sem->sem_bo->handle;
+	if (chan->sem.handle != handle) {
+		BEGIN_RING(chan, NvSubSw, NV_SW_DMA_SEMAPHORE, 1);
+		OUT_RING(chan, handle);
+		chan->sem.handle = handle;
+	}
+	if (chan->sem.num != sem->num) {
+		BEGIN_RING(chan, NvSubSw, NV_SW_SEMAPHORE_OFFSET, 1);
+		OUT_RING(chan, sem->num << 2);
+		chan->sem.num = sem->num;
+	}
+	BEGIN_RING(chan, NvSubSw, op, 1);
+	OUT_RING(chan, sem->value);
+}
+
+/* Currently this ignores waited_fence->sequence and syncs the last fence on waited_fence->channel
+ * If a better GPU synchronization mechanism is discovered, then the actual fence may be used.
+ * Note that sem_fence is a fence on the *waiting* channel, used to free the semaphore.
+ */
 struct nouveau_fence*
 nouveau_fence_sync(struct nouveau_fence *waited_fence, struct nouveau_channel *chan)
 {
-	return ERR_PTR(-ENOSYS);
+	struct nouveau_channel *waited_chan;
+	struct drm_device *dev;
+	struct drm_nouveau_private *dev_priv;
+	struct nouveau_sem sem;
+	uint32_t handle;
+	int ret;
+	struct nouveau_fence *sem_fence;
+	unsigned long flags;
+
+	dev = chan->dev;
+	dev_priv = chan->dev->dev_private;
+
+	if (dev_priv->chipset < 0x17)
+		return ERR_PTR(-ENOSYS);
+
+	waited_chan = waited_fence->channel;
+
+	ret = RING_SPACE(chan, 6 + 2);
+	if (ret)
+		return ERR_PTR(ret);
+
+	ret = RING_SPACE(waited_chan, 6);
+	if (ret)
+		return ERR_PTR(ret);
+
+	/* try to reclaim semaphores when we hit the threshold
+	   this helps keeping a low number of active semaphores
+
+	   Note that in the DRI2 case this is never triggered
+	   since we wait for fences on both channels.
+
+	   However, if buffers were all different, this could be
+	   necessary.
+	*/
+	if (atomic_read(&chan->fence.sem_count) >= chan->fence.sem_threshold) {
+		spin_lock_irqsave(&chan->fence.lock, flags);
+		if (atomic_read(&chan->fence.sem_count) >= chan->fence.sem_threshold)
+			nouveau_fence_update(chan);
+		spin_unlock_irqrestore(&chan->fence.lock, flags);
+	}
+
+	ret = nouveau_fence_new(chan, &sem_fence, 0);
+	if (ret)
+		return ERR_PTR(ret);
+
+	ret = nouveau_sem_alloc(chan->dev, &sem);
+	if (ret) {
+		kfree(sem_fence);
+		return ERR_PTR(ret);
+	}
+
+	BUG_ON(!sem.sem_bo);
+
+	ret = nouveau_sem_bo_channel_init(sem.sem_bo, chan);
+	if (!ret)
+		ret = nouveau_sem_bo_channel_init(sem.sem_bo, waited_chan);
+	if (ret) {
+		nouveau_sem_release(dev, sem.sem_bo, sem.num);
+		kfree(sem_fence);
+		return ERR_PTR(ret);
+	}
+
+	handle = sem.sem_bo->handle;
+
+/*	NV_DEBUG(dev, "sync %i <- %i with %x:%i (sem %i/%i)\n", chan->id, waited_chan->id,
+		sem.sem_bo->handle, sem.num, atomic_read(&chan->fence.sem_count), chan->fence.sem_threshold); */
+
+	sem_fence->sem_bo = sem.sem_bo;
+	sem_fence->sem_num = sem.num;
+
+	atomic_inc(&chan->fence.sem_count);
+
+/* TODO: this should take the channel locks when they are added */
+	nouveau_sem_emit(chan, &sem, NV_SW_SEMAPHORE_ACQUIRE);
+
+	nouveau_fence_emit(sem_fence);
+
+	nouveau_sem_emit(waited_chan, &sem, NV_SW_SEMAPHORE_RELEASE);
+	FIRE_RING(waited_chan);
+	return sem_fence;
+}
+
+static void
+nouveau_fence_complete(struct nouveau_fence *fence)
+{
+	if (fence->sem_bo) {
+		nouveau_sem_release(fence->channel->dev, fence->sem_bo, fence->sem_num);
+		atomic_dec(&fence->channel->fence.sem_count);
+	}
+
+	fence->signalled = true;
+	list_del(&fence->entry);
+	kref_put(&fence->refcount, nouveau_fence_del);
 }
 
 void
@@ -310,6 +426,7 @@ nouveau_fence_update(struct nouveau_channel *chan)
 	struct list_head *entry, *tmp;
 	struct nouveau_fence *fence;
 	uint32_t sequence;
+	unsigned sem_threshold;
 
 	if (USE_REFCNT)
 		sequence = nvchan_rd32(chan, 0x48);
@@ -324,13 +441,16 @@ nouveau_fence_update(struct nouveau_channel *chan)
 		fence = list_entry(entry, struct nouveau_fence, entry);
 		sequence = fence->sequence;
 
-		fence->signalled = true;
-		list_del(&fence->entry);
-		kref_put(&fence->refcount, nouveau_fence_del);
+		nouveau_fence_complete(fence);
 
 		if (sequence == chan->fence.sequence_ack)
 			break;
 	}
+
+	sem_threshold = atomic_read(&chan->fence.sem_count) * 2;
+	if (sem_threshold < NOUVEAU_SEM_MIN_THRESHOLD)
+		sem_threshold = NOUVEAU_SEM_MIN_THRESHOLD;
+	chan->fence.sem_threshold = sem_threshold;
 }
 
 int
@@ -489,6 +609,10 @@ nouveau_fence_init(struct nouveau_channel *chan)
 {
 	INIT_LIST_HEAD(&chan->fence.pending);
 	spin_lock_init(&chan->fence.lock);
+	atomic_set(&chan->fence.sem_count, 0);
+	chan->fence.sem_threshold = NOUVEAU_SEM_MIN_THRESHOLD;
+	chan->sem.handle = 0;
+	chan->sem.num = ~0;
 	return 0;
 }
 
@@ -501,9 +625,7 @@ nouveau_fence_fini(struct nouveau_channel *chan)
 
 	list_for_each_safe(entry, tmp, &chan->fence.pending) {
 		fence = list_entry(entry, struct nouveau_fence, entry);
 
-		fence->signalled = true;
-		list_del(&fence->entry);
-		kref_put(&fence->refcount, nouveau_fence_del);
+		nouveau_fence_complete(fence);
 	}
 }
-- 
1.6.6.1.476.g01ddb
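To make the ACQUIRE/RELEASE pairing concrete, the toy program below
uses two threads to stand in for the two channels and a plain memory
word for one semaphore slot in the semaphore BO. It is only a
host-side model with made-up names (the real wait happens in the GPU
FIFO engine, which is the whole point of the patch), but it shows why
alternating the expected value between 0 and 1 lets a slot be reused
without the CPU ever having to reset it, which is what the values
bitmask in the previous patch tracks.

/* Host-side model of semaphore ACQUIRE/RELEASE between two channels. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint sem_slot;		/* one 4-byte semaphore in the sem BO */

struct sync_op { unsigned value; };	/* value chosen at allocation time */

static void semaphore_acquire(struct sync_op *op)	/* waiting channel */
{
	while (atomic_load(&sem_slot) != op->value)
		;	/* the GPU would stall the channel here, not the CPU */
}

static void semaphore_release(struct sync_op *op)	/* waited-on channel */
{
	atomic_store(&sem_slot, op->value);
}

static void *waited_channel(void *arg)
{
	/* ... commands the waiter depends on would execute here ... */
	semaphore_release(arg);
	return NULL;
}

int main(void)
{
	pthread_t t;
	struct sync_op op = { .value = 1 };	/* first use of this slot */

	pthread_create(&t, NULL, waited_channel, &op);
	semaphore_acquire(&op);			/* waiting channel blocks */
	pthread_join(t, NULL);
	printf("synchronized, slot now holds %u\n", atomic_load(&sem_slot));
	return 0;
}

Build with -pthread; the next user of the slot would acquire/release
with value 0, so no CPU write is ever needed to rearm it.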