Hi, Generally the imported buffers which have memory type TTM_PL_TT are mapped as small pages probably due to lack of big page allocation. But the platform device which also uses memory type TTM_PL_TT, like GK20A, can *allocate* big pages through the IOMMU hardware inside the SoC. This is an attempt to map the imported buffers as big pages in GMMU by the platform IOMMU. With some preparation work to map discrete small pages into big page(s) by the IOMMU, the GMMU eventually sees the imported buffer as chunks of big pages and does the mapping. And then we can probably do the compression on the imported buffer which is composed of non-contiguous small pages. The compbits related patches shall come later. I guess most of you won't like the change for the MMU code in this series. So please comment and guide me how to do this better. :) Thanks, Vince Vince Hsu (6): platform: specify the IOMMU physical translation bit instmem/gk20a: refer to IOMMU physical translation bit mmu: map small pages into big page(s) by IOMMU if possible drm: enable big page mapping for small pages when IOMMU is available mmu: gf100: share most of functions with GK20A mmu: gk20a: implement IOMMU mapping for big pages drm/nouveau/include/nvkm/subdev/mmu.h | 16 ++ drm/nouveau/nouveau_bo.c | 9 ++ drm/nouveau/nouveau_platform.c | 19 +++ drm/nouveau/nouveau_platform.h | 1 + drm/nouveau/nvkm/engine/device/gk104.c | 2 +- drm/nouveau/nvkm/subdev/instmem/gk20a.c | 13 +- drm/nouveau/nvkm/subdev/mmu/Kbuild | 1 + drm/nouveau/nvkm/subdev/mmu/base.c | 158 +++++++++++++++++++- drm/nouveau/nvkm/subdev/mmu/gf100.c | 28 +--- drm/nouveau/nvkm/subdev/mmu/gf100.h | 46 ++++++ drm/nouveau/nvkm/subdev/mmu/gk20a.c | 253 ++++++++++++++++++++++++++++++++ lib/include/nvif/os.h | 12 ++ 12 files changed, 526 insertions(+), 32 deletions(-) create mode 100644 drm/nouveau/nvkm/subdev/mmu/gf100.h create mode 100644 drm/nouveau/nvkm/subdev/mmu/gk20a.c -- 2.1.4
Vince Hsu
2015-Apr-16 11:06 UTC
[Nouveau] [PATCH 1/6] platform: specify the IOMMU physical translation bit
The IOMMU physical translation bit might vary with different SoCs. So add a variable to specify this bit for GK20A. Signed-off-by: Vince Hsu <vinceh at nvidia.com> --- drm/nouveau/nouveau_platform.c | 19 +++++++++++++++++++ drm/nouveau/nouveau_platform.h | 1 + 2 files changed, 20 insertions(+) diff --git a/drm/nouveau/nouveau_platform.c b/drm/nouveau/nouveau_platform.c index 775277f1edb0..0d002f73e356 100644 --- a/drm/nouveau/nouveau_platform.c +++ b/drm/nouveau/nouveau_platform.c @@ -25,6 +25,7 @@ #include <linux/module.h> #include <linux/platform_device.h> #include <linux/of.h> +#include <linux/of_device.h> #include <linux/reset.h> #include <linux/regulator/consumer.h> #include <linux/iommu.h> @@ -92,6 +93,22 @@ static int nouveau_platform_power_down(struct nouveau_platform_gpu *gpu) return 0; } +static unsigned long nouveau_platform_get_iommu_bit(struct device *dev) +{ + const struct of_device_id *match; + + match = of_match_device(dev->driver->of_match_table, dev); + if (!match) { + dev_warn(dev, "cannot find OF match for device\n"); + return 0; + } + + if (!strcmp(match->compatible, "nvidia,gk20a")) + return 34; + else + return 0; +} + static void nouveau_platform_probe_iommu(struct device *dev, struct nouveau_platform_gpu *gpu) { @@ -122,6 +139,8 @@ static void nouveau_platform_probe_iommu(struct device *dev, gpu->iommu.pgshift -= 1; } + gpu->iommu.phys_addr_bit = nouveau_platform_get_iommu_bit(dev); + err = iommu_attach_device(gpu->iommu.domain, dev); if (err) goto free_domain; diff --git a/drm/nouveau/nouveau_platform.h b/drm/nouveau/nouveau_platform.h index 392874cf4725..3e9bd7dc0092 100644 --- a/drm/nouveau/nouveau_platform.h +++ b/drm/nouveau/nouveau_platform.h @@ -53,6 +53,7 @@ struct nouveau_platform_gpu { struct nvkm_mm *mm; struct iommu_domain *domain; unsigned long pgshift; + unsigned long phys_addr_bit; } iommu; }; -- 2.1.4
Vince Hsu
2015-Apr-16 11:06 UTC
[Nouveau] [PATCH 2/6] instmem/gk20a: refer to IOMMU physical translation bit
Instead of hard-coding the translation bit in subdev driver, we refer to the platform data. Signed-off-by: Vince Hsu <vinceh at nvidia.com> --- drm/nouveau/nvkm/subdev/instmem/gk20a.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drm/nouveau/nvkm/subdev/instmem/gk20a.c b/drm/nouveau/nvkm/subdev/instmem/gk20a.c index dd0994d9ebfc..69ef5eae3279 100644 --- a/drm/nouveau/nvkm/subdev/instmem/gk20a.c +++ b/drm/nouveau/nvkm/subdev/instmem/gk20a.c @@ -89,6 +89,7 @@ struct gk20a_instmem_priv { struct nvkm_mm *mm; struct iommu_domain *domain; unsigned long iommu_pgshift; + unsigned long iommu_phys_addr_bit; /* Only used by DMA API */ struct dma_attrs attrs; @@ -169,8 +170,8 @@ gk20a_instobj_dtor_iommu(struct gk20a_instobj_priv *_node) r = list_first_entry(&_node->mem->regions, struct nvkm_mm_node, rl_entry); - /* clear bit 34 to unmap pages */ - r->offset &= ~BIT(34 - priv->iommu_pgshift); + /* clear IOMMU translation bit to unmap pages */ + r->offset &= ~BIT(priv->iommu_phys_addr_bit - priv->iommu_pgshift); /* Unmap pages from GPU address space and free them */ for (i = 0; i < _node->mem->size; i++) { @@ -298,8 +299,11 @@ gk20a_instobj_ctor_iommu(struct nvkm_object *parent, struct nvkm_object *engine, } } - /* Bit 34 tells that an address is to be resolved through the IOMMU */ - r->offset |= BIT(34 - priv->iommu_pgshift); + /* + * The iommu_phys_addr_bit tells that an address is to be resolved + * through the IOMMU + */ + r->offset |= BIT(priv->iommu_phys_addr_bit - priv->iommu_pgshift); node->base._mem.offset = ((u64)r->offset) << priv->iommu_pgshift; @@ -407,6 +411,7 @@ gk20a_instmem_ctor(struct nvkm_object *parent, struct nvkm_object *engine, priv->domain = plat->gpu->iommu.domain; priv->mm = plat->gpu->iommu.mm; priv->iommu_pgshift = plat->gpu->iommu.pgshift; + priv->iommu_phys_addr_bit = plat->gpu->iommu.phys_addr_bit; priv->mm_mutex = &plat->gpu->iommu.mutex; nv_info(priv, "using IOMMU\n"); -- 2.1.4
Vince Hsu
2015-Apr-16 11:06 UTC
[Nouveau] [PATCH 3/6] mmu: map small pages into big page(s) by IOMMU if possible
This patch implements a way to aggregate the small pages and make them be mapped as big page(s) by utilizing the platform IOMMU if supported. And then we can enable compression support for these big pages later. Signed-off-by: Vince Hsu <vinceh at nvidia.com> --- drm/nouveau/include/nvkm/subdev/mmu.h | 16 ++++ drm/nouveau/nvkm/subdev/mmu/base.c | 158 ++++++++++++++++++++++++++++++++-- lib/include/nvif/os.h | 12 +++ 3 files changed, 179 insertions(+), 7 deletions(-) diff --git a/drm/nouveau/include/nvkm/subdev/mmu.h b/drm/nouveau/include/nvkm/subdev/mmu.h index 3a5368776c31..3230d31a7971 100644 --- a/drm/nouveau/include/nvkm/subdev/mmu.h +++ b/drm/nouveau/include/nvkm/subdev/mmu.h @@ -22,6 +22,8 @@ struct nvkm_vma { struct nvkm_mm_node *node; u64 offset; u32 access; + struct list_head bp; + bool has_iommu_bp; }; struct nvkm_vm { @@ -37,6 +39,13 @@ struct nvkm_vm { u32 lpde; }; +struct nvkm_vm_bp_list { + struct list_head head; + u32 pde; + u32 pte; + void *priv; +}; + struct nvkm_mmu { struct nvkm_subdev base; @@ -45,6 +54,7 @@ struct nvkm_mmu { u32 pgt_bits; u8 spg_shift; u8 lpg_shift; + bool iommu_capable; int (*create)(struct nvkm_mmu *, u64 offset, u64 length, u64 mm_offset, struct nvkm_vm **); @@ -56,7 +66,12 @@ struct nvkm_mmu { u64 phys, u64 delta); void (*map_sg)(struct nvkm_vma *, struct nvkm_gpuobj *, struct nvkm_mem *, u32 pte, u32 cnt, dma_addr_t *); + void (*map_iommu)(struct nvkm_vma *, struct nvkm_gpuobj *, + struct nvkm_mem *, u32 pte, dma_addr_t *, void **); + void (*map_sg_iommu)(struct nvkm_vma *, struct nvkm_gpuobj *, + struct nvkm_mem *, u32 pte, struct sg_page_iter *, void **); void (*unmap)(struct nvkm_gpuobj *pgt, u32 pte, u32 cnt); + void (*unmap_iommu)(struct nvkm_vma *, void *); void (*flush)(struct nvkm_vm *); }; @@ -84,6 +99,7 @@ extern struct nvkm_oclass nv41_mmu_oclass; extern struct nvkm_oclass nv44_mmu_oclass; extern struct nvkm_oclass nv50_mmu_oclass; extern struct nvkm_oclass gf100_mmu_oclass; +extern struct nvkm_oclass 
gk20a_mmu_oclass; int nv04_vm_create(struct nvkm_mmu *, u64, u64, u64, struct nvkm_vm **); diff --git a/drm/nouveau/nvkm/subdev/mmu/base.c b/drm/nouveau/nvkm/subdev/mmu/base.c index 277b6ec04e24..747c836d9fa6 100644 --- a/drm/nouveau/nvkm/subdev/mmu/base.c +++ b/drm/nouveau/nvkm/subdev/mmu/base.c @@ -26,6 +26,43 @@ #include <core/gpuobj.h> +static int +nvkm_vm_map_pgt(struct nvkm_vm *vm, u32 pde, u32 type); + +static int +nvkm_vm_link_bp(struct nvkm_vma *vma, u32 pde, u32 pte, + struct nvkm_vm_pgt *vpgt, void *priv) +{ + struct nvkm_vm *vm = vma->vm; + struct nvkm_mmu *mmu = vm->mmu; + struct nvkm_vm_bp_list *list; + list = kzalloc(sizeof(*list), GFP_KERNEL); + if (!list) + return -ENOMEM; + + mutex_lock(&nv_subdev(mmu)->mutex); + + if (!vma->has_iommu_bp) { + INIT_LIST_HEAD(&vma->bp); + vma->has_iommu_bp = true; + } + list->pde = pde; + list->pte = pte; + list->priv = priv; + list_add_tail(&list->head, &vma->bp); + + mutex_unlock(&nv_subdev(mmu)->mutex); + + return 0; +} + +static void +nvkm_vm_unlink_bp(struct nvkm_vma *vma, struct nvkm_vm_bp_list *list) +{ + list_del(&list->head); + kfree(list); +} + void nvkm_vm_map_at(struct nvkm_vma *vma, u64 delta, struct nvkm_mem *node) { @@ -129,6 +166,48 @@ finish: } static void +nvkm_vm_map_sg_table_with_iommu(struct nvkm_vma *vma, u64 delta, u64 length, + struct nvkm_mem *mem) +{ + struct nvkm_vm *vm = vma->vm; + struct nvkm_mmu *mmu = vm->mmu; + int big = vma->node->type != mmu->spg_shift; + u32 offset = vma->node->offset + (delta >> 12); + u32 bits = vma->node->type - 12; + u32 pde = (offset >> mmu->pgt_bits) - vm->fpde; + u32 pte = (offset & ((1 << mmu->pgt_bits) - 1)) >> bits; + u32 max = 1 << (mmu->pgt_bits - bits); + struct sg_page_iter iter; + u32 bpoff, i; + u32 multiple = 1 << bits; + + i = 0; + for_each_sg_page(mem->sg->sgl, &iter, mem->sg->nents, 0) { + struct nvkm_gpuobj *pgt = vm->pgt[pde].obj[big]; + void *priv; + + bpoff = offset + i; + + pde = (bpoff >> mmu->pgt_bits) - vm->fpde; + pte = (bpoff & ((1 << 
mmu->pgt_bits) - 1)) >> bits; + pgt = vm->pgt[pde].obj[1]; + + mmu->map_sg_iommu(vma, pgt, mem, pte, &iter, &priv); + + nvkm_vm_link_bp(vma, pde, pte, &vm->pgt[pde], priv); + + i += multiple; + pte++; + if (unlikely(pte >= max)) { + pde++; + pte = 0; + } + } + + mmu->flush(vm); +} + +static void nvkm_vm_map_sg(struct nvkm_vma *vma, u64 delta, u64 length, struct nvkm_mem *mem) { @@ -166,15 +245,59 @@ nvkm_vm_map_sg(struct nvkm_vma *vma, u64 delta, u64 length, mmu->flush(vm); } +static void +nvkm_vm_map_sg_with_iommu(struct nvkm_vma *vma, u64 delta, u64 length, + struct nvkm_mem *mem) +{ + struct nvkm_vm *vm = vma->vm; + struct nvkm_mmu *mmu = vm->mmu; + dma_addr_t *list = mem->pages; + int big = vma->node->type != mmu->spg_shift; + u32 offset = vma->node->offset + (delta >> 12); + u32 bits = vma->node->type - 12; + u32 num = length >> vma->node->type; + u32 pde = (offset >> mmu->pgt_bits) - vm->fpde; + u32 pte = (offset & ((1 << mmu->pgt_bits) - 1)) >> bits; + u32 max = 1 << (mmu->pgt_bits - bits); + u32 multiple = 1 << bits; + + while (num) { + struct nvkm_gpuobj *pgt = vm->pgt[pde].obj[big]; + void *priv; + + mmu->map_iommu(vma, pgt, mem, pte, list, &priv); + + nvkm_vm_link_bp(vma, pde, pte, &vm->pgt[pde], priv); + + list += multiple; + num--; + pte++; + if (unlikely(pte >= max)) { + pde++; + pte = 0; + } + } + + mmu->flush(vm); +} + void nvkm_vm_map(struct nvkm_vma *vma, struct nvkm_mem *node) { - if (node->sg) - nvkm_vm_map_sg_table(vma, 0, node->size << 12, node); - else - if (node->pages) - nvkm_vm_map_sg(vma, 0, node->size << 12, node); - else + struct nvkm_vm *vm = vma->vm; + struct nvkm_mmu *mmu = vm->mmu; + + if (node->sg) { + if (mmu->iommu_capable && vma->node->type == mmu->lpg_shift) + nvkm_vm_map_sg_table_with_iommu(vma, 0, node->size << 12, node); + else + nvkm_vm_map_sg_table(vma, 0, node->size << 12, node); + } else if (node->pages) { + if (mmu->iommu_capable && vma->node->type == mmu->lpg_shift) + nvkm_vm_map_sg_with_iommu(vma, 0, node->size << 12, 
node); + else + nvkm_vm_map_sg(vma, 0, node->size << 12, node); + } else nvkm_vm_map_at(vma, 0, node); } @@ -214,9 +337,30 @@ nvkm_vm_unmap_at(struct nvkm_vma *vma, u64 delta, u64 length) } void +nvkm_vm_unmap_iommu(struct nvkm_vma *vma) +{ + struct nvkm_vm *vm = vma->vm; + struct nvkm_mmu *mmu = vm->mmu; + struct nvkm_vm_bp_list *list, *tmp; + + list_for_each_entry_safe(list, tmp, &vma->bp, head) { + struct nvkm_gpuobj *pgt = vm->pgt[list->pde].obj[1]; + + mmu->unmap(pgt, list->pte, 1); + mmu->unmap_iommu(vma, list->priv); + nvkm_vm_unlink_bp(vma, list); + } + + vma->has_iommu_bp = false; +} + +void nvkm_vm_unmap(struct nvkm_vma *vma) { - nvkm_vm_unmap_at(vma, 0, (u64)vma->node->length << 12); + if (vma->has_iommu_bp) + nvkm_vm_unmap_iommu(vma); + else + nvkm_vm_unmap_at(vma, 0, (u64)vma->node->length << 12); } static void diff --git a/lib/include/nvif/os.h b/lib/include/nvif/os.h index 275fa84ad003..f56a5e5d3a4d 100644 --- a/lib/include/nvif/os.h +++ b/lib/include/nvif/os.h @@ -88,6 +88,7 @@ typedef dma_addr_t resource_size_t; #define likely(a) (a) #define unlikely(a) (a) #define BIT(a) (1UL << (a)) +#define BIT_ULL(a) (1ULL << (a)) #define ERR_PTR(err) ((void *)(long)(err)) #define PTR_ERR(ptr) ((long)(ptr)) @@ -914,6 +915,17 @@ struct sg_table { #define sg_dma_address(a) 0ULL #define sg_dma_len(a) 0ULL +struct sg_page_iter { +}; + +#define sg_page_iter_dma_address(struct sg_page_iter *piter) 0ULL + +#define for_each_sg_page(sglist, piter, nents, pgoffset) \ + for (__sg_page_iter_start((piter), (sglist), (nents), (pgoffset)); \ + __sg_page_iter_next(piter);) +#define __sg_page_iter_start(a) (a) +#define __sg_page_iter_next(a) (false) + /****************************************************************************** * firmware *****************************************************************************/ -- 2.1.4
Vince Hsu
2015-Apr-16 11:06 UTC
[Nouveau] [PATCH 4/6] drm: enable big page mapping for small pages when IOMMU is available
Some platforms have IOMMU to map non-contiguous physical memory into contiguous GPU virtual address. We can use this feature to enable big pages mapping on scattered small pages. To achieve that, we also need changes in subdev/mmu as well. Signed-off-by: Vince Hsu <vinceh at nvidia.com> --- drm/nouveau/nouveau_bo.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drm/nouveau/nouveau_bo.c b/drm/nouveau/nouveau_bo.c index 77326e344dad..da76ee1121e4 100644 --- a/drm/nouveau/nouveau_bo.c +++ b/drm/nouveau/nouveau_bo.c @@ -221,6 +221,11 @@ nouveau_bo_new(struct drm_device *dev, int size, int align, if (drm->client.vm) { if (!(flags & TTM_PL_FLAG_TT) && size > 256 * 1024) nvbo->page_shift = drm->client.vm->mmu->lpg_shift; + + if ((flags & TTM_PL_FLAG_TT) && + drm->client.vm->mmu->iommu_capable && + (size % (1 << drm->client.vm->mmu->lpg_shift)) == 0) + nvbo->page_shift = drm->client.vm->mmu->lpg_shift; } nouveau_bo_fixup_align(nvbo, flags, &align, &size); @@ -1641,6 +1646,10 @@ nouveau_bo_vma_add(struct nouveau_bo *nvbo, struct nvkm_vm *vm, (nvbo->bo.mem.mem_type == TTM_PL_VRAM || nvbo->page_shift != vma->vm->mmu->lpg_shift)) nvkm_vm_map(vma, nvbo->bo.mem.mm_node); + else if (nvbo->bo.mem.mem_type == TTM_PL_TT && + vma->vm->mmu->iommu_capable && + nvbo->page_shift == vma->vm->mmu->lpg_shift) + nvkm_vm_map(vma, nvbo->bo.mem.mm_node); list_add_tail(&vma->head, &nvbo->vma_list); vma->refcount = 1; -- 2.1.4
Vince Hsu
2015-Apr-16 11:06 UTC
[Nouveau] [PATCH 5/6] mmu: gf100: share most of functions with GK20A
This patch moves all of the functions which can be shared with GK20A to public for later use. Signed-off-by: Vince Hsu <vinceh at nvidia.com> --- drm/nouveau/nvkm/subdev/mmu/gf100.c | 28 +++++++--------------- drm/nouveau/nvkm/subdev/mmu/gf100.h | 46 +++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 20 deletions(-) create mode 100644 drm/nouveau/nvkm/subdev/mmu/gf100.h diff --git a/drm/nouveau/nvkm/subdev/mmu/gf100.c b/drm/nouveau/nvkm/subdev/mmu/gf100.c index 294cda37f068..b067ded5d3be 100644 --- a/drm/nouveau/nvkm/subdev/mmu/gf100.c +++ b/drm/nouveau/nvkm/subdev/mmu/gf100.c @@ -29,6 +29,8 @@ #include <core/gpuobj.h> +#include "gf100.h" + struct gf100_mmu_priv { struct nvkm_mmu base; }; @@ -74,7 +76,7 @@ const u8 gf100_pte_storage_type_map[256] }; -static void +void gf100_vm_map_pgt(struct nvkm_gpuobj *pgd, u32 index, struct nvkm_gpuobj *pgt[2]) { u32 pde[2] = { 0, 0 }; @@ -88,21 +90,7 @@ gf100_vm_map_pgt(struct nvkm_gpuobj *pgd, u32 index, struct nvkm_gpuobj *pgt[2]) nv_wo32(pgd, (index * 8) + 4, pde[1]); } -static inline u64 -gf100_vm_addr(struct nvkm_vma *vma, u64 phys, u32 memtype, u32 target) -{ - phys >>= 8; - - phys |= 0x00000001; /* present */ - if (vma->access & NV_MEM_ACCESS_SYS) - phys |= 0x00000002; - - phys |= ((u64)target << 32); - phys |= ((u64)memtype << 36); - return phys; -} - -static void +void gf100_vm_map(struct nvkm_vma *vma, struct nvkm_gpuobj *pgt, struct nvkm_mem *mem, u32 pte, u32 cnt, u64 phys, u64 delta) { @@ -127,7 +115,7 @@ gf100_vm_map(struct nvkm_vma *vma, struct nvkm_gpuobj *pgt, } } -static void +void gf100_vm_map_sg(struct nvkm_vma *vma, struct nvkm_gpuobj *pgt, struct nvkm_mem *mem, u32 pte, u32 cnt, dma_addr_t *list) { @@ -144,7 +132,7 @@ gf100_vm_map_sg(struct nvkm_vma *vma, struct nvkm_gpuobj *pgt, } } -static void +void gf100_vm_unmap(struct nvkm_gpuobj *pgt, u32 pte, u32 cnt) { pte <<= 3; @@ -155,7 +143,7 @@ gf100_vm_unmap(struct nvkm_gpuobj *pgt, u32 pte, u32 cnt) } } -static void +void 
gf100_vm_flush(struct nvkm_vm *vm) { struct gf100_mmu_priv *priv = (void *)vm->mmu; @@ -191,7 +179,7 @@ gf100_vm_flush(struct nvkm_vm *vm) mutex_unlock(&nv_subdev(priv)->mutex); } -static int +int gf100_vm_create(struct nvkm_mmu *mmu, u64 offset, u64 length, u64 mm_offset, struct nvkm_vm **pvm) { diff --git a/drm/nouveau/nvkm/subdev/mmu/gf100.h b/drm/nouveau/nvkm/subdev/mmu/gf100.h new file mode 100644 index 000000000000..a66ca45bc755 --- /dev/null +++ b/drm/nouveau/nvkm/subdev/mmu/gf100.h @@ -0,0 +1,46 @@ +#ifndef __GF100_MMU_PRIV__ +#define __GF100_MMU_PRIV__ + +#include <subdev/mmu.h> + +struct nv04_mmu_priv { + struct nvkm_mmu base; + struct nvkm_vm *vm; + dma_addr_t null; + void *nullp; +}; + +int +gf100_vm_create(struct nvkm_mmu *mmu, u64 offset, u64 length, u64 mm_offset, + struct nvkm_vm **pvm); + +void +gf100_vm_map_pgt(struct nvkm_gpuobj *pgd, u32 index, + struct nvkm_gpuobj *pgt[2]); +void +gf100_vm_map(struct nvkm_vma *vma, struct nvkm_gpuobj *pgt, + struct nvkm_mem *mem, u32 pte, u32 cnt, u64 phys, u64 delta); +void +gf100_vm_map_sg(struct nvkm_vma *vma, struct nvkm_gpuobj *pgt, + struct nvkm_mem *mem, u32 pte, u32 cnt, dma_addr_t *list); +void +gf100_vm_unmap(struct nvkm_gpuobj *pgt, u32 pte, u32 cnt); + +void +gf100_vm_flush(struct nvkm_vm *vm); + +static inline u64 +gf100_vm_addr(struct nvkm_vma *vma, u64 phys, u32 memtype, u32 target) +{ + phys >>= 8; + + phys |= 0x00000001; /* present */ + if (vma->access & NV_MEM_ACCESS_SYS) + phys |= 0x00000002; + + phys |= ((u64)target << 32); + phys |= ((u64)memtype << 36); + return phys; +} + +#endif -- 2.1.4
Vince Hsu
2015-Apr-16 11:06 UTC
[Nouveau] [PATCH 6/6] mmu: gk20a: implement IOMMU mapping for big pages
This patch uses IOMMU to aggregate (probably) discrete small pages as larger big page(s) and map it to GMMU. Signed-off-by: Vince Hsu <vinceh at nvidia.com> --- drm/nouveau/nvkm/engine/device/gk104.c | 2 +- drm/nouveau/nvkm/subdev/mmu/Kbuild | 1 + drm/nouveau/nvkm/subdev/mmu/gk20a.c | 253 +++++++++++++++++++++++++++++++++ 3 files changed, 255 insertions(+), 1 deletion(-) create mode 100644 drm/nouveau/nvkm/subdev/mmu/gk20a.c diff --git a/drm/nouveau/nvkm/engine/device/gk104.c b/drm/nouveau/nvkm/engine/device/gk104.c index 6a9483f65d83..9ea48ba31c0d 100644 --- a/drm/nouveau/nvkm/engine/device/gk104.c +++ b/drm/nouveau/nvkm/engine/device/gk104.c @@ -172,7 +172,7 @@ gk104_identify(struct nvkm_device *device) device->oclass[NVDEV_SUBDEV_LTC ] = gk104_ltc_oclass; device->oclass[NVDEV_SUBDEV_IBUS ] = &gk20a_ibus_oclass; device->oclass[NVDEV_SUBDEV_INSTMEM] = gk20a_instmem_oclass; - device->oclass[NVDEV_SUBDEV_MMU ] = &gf100_mmu_oclass; + device->oclass[NVDEV_SUBDEV_MMU ] = &gk20a_mmu_oclass; device->oclass[NVDEV_SUBDEV_BAR ] = &gk20a_bar_oclass; device->oclass[NVDEV_ENGINE_DMAOBJ ] = gf110_dmaeng_oclass; device->oclass[NVDEV_ENGINE_FIFO ] = gk20a_fifo_oclass; diff --git a/drm/nouveau/nvkm/subdev/mmu/Kbuild b/drm/nouveau/nvkm/subdev/mmu/Kbuild index 012c9db687b2..141302a8e933 100644 --- a/drm/nouveau/nvkm/subdev/mmu/Kbuild +++ b/drm/nouveau/nvkm/subdev/mmu/Kbuild @@ -4,3 +4,4 @@ nvkm-y += nvkm/subdev/mmu/nv41.o nvkm-y += nvkm/subdev/mmu/nv44.o nvkm-y += nvkm/subdev/mmu/nv50.o nvkm-y += nvkm/subdev/mmu/gf100.o +nvkm-y += nvkm/subdev/mmu/gk20a.o diff --git a/drm/nouveau/nvkm/subdev/mmu/gk20a.c b/drm/nouveau/nvkm/subdev/mmu/gk20a.c new file mode 100644 index 000000000000..b444b73e208d --- /dev/null +++ b/drm/nouveau/nvkm/subdev/mmu/gk20a.c @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include <subdev/fb.h> +#include <subdev/ltc.h> +#include <subdev/mmu.h> + +#ifdef __KERNEL__ +#include <linux/iommu.h> +#include <nouveau_platform.h> +#endif + +#include "gf100.h" + +struct gk20a_mmu_priv { + struct nvkm_mmu base; +}; + +struct gk20a_mmu_iommu_mapping { + struct nvkm_mm_node *node; + u64 iova; +}; + +extern const u8 gf100_pte_storage_type_map[256]; + +static void +gk20a_vm_map(struct nvkm_vma *vma, struct nvkm_gpuobj *pgt, + struct nvkm_mem *mem, u32 pte, u64 list) +{ + u32 target = (vma->access & NV_MEM_ACCESS_NOSNOOP) ? 
7 : 5; + u64 phys; + + pte <<= 3; + phys = gf100_vm_addr(vma, list, mem->memtype, target); + + if (mem->tag) { + struct nvkm_ltc *ltc = nvkm_ltc(vma->vm->mmu); + u32 tag = mem->tag->offset; + phys |= (u64)tag << (32 + 12); + ltc->tags_clear(ltc, tag, 1); + } + + nv_wo32(pgt, pte + 0, lower_32_bits(phys)); + nv_wo32(pgt, pte + 4, upper_32_bits(phys)); +} + +static void +gk20a_vm_map_iommu(struct nvkm_vma *vma, struct nvkm_gpuobj *pgt, + struct nvkm_mem *mem, u32 pte, dma_addr_t *list, + void **priv) +{ + struct nvkm_vm *vm = vma->vm; + struct nvkm_mmu *mmu = vm->mmu; + struct nvkm_mm_node *node; + struct nouveau_platform_device *plat; + struct gk20a_mmu_iommu_mapping *p; + int npages = 1 << (mmu->lpg_shift - mmu->spg_shift); + int i, ret; + u64 addr; + + plat = nv_device_to_platform(nv_device(&mmu->base)); + + *priv = kzalloc(sizeof(struct gk20a_mmu_iommu_mapping), GFP_KERNEL); + if (!*priv) + return; + + mutex_lock(&plat->gpu->iommu.mutex); + ret = nvkm_mm_head(plat->gpu->iommu.mm, + 0, + 1, + npages, + npages, + (1 << mmu->lpg_shift) >> 12, + &node); + mutex_unlock(&plat->gpu->iommu.mutex); + if (ret) + return; + + for (i = 0; i < npages; i++, list++) { + ret = iommu_map(plat->gpu->iommu.domain, + (node->offset + i) << PAGE_SHIFT, + *list, + PAGE_SIZE, + IOMMU_READ | IOMMU_WRITE); + + if (ret < 0) + return; + + nv_trace(mmu, "IOMMU: IOVA=0x%016llx-> IOMMU -> PA=%016llx\n", + (u64)(node->offset + i) << PAGE_SHIFT, (u64)(*list)); + } + + addr = (u64)node->offset << PAGE_SHIFT; + addr |= BIT_ULL(plat->gpu->iommu.phys_addr_bit); + + gk20a_vm_map(vma, pgt, mem, pte, addr); + + p = *priv; + p->node = node; + p->iova = node->offset << PAGE_SHIFT; +} + +static void +gk20a_vm_map_sg_iommu(struct nvkm_vma *vma, struct nvkm_gpuobj *pgt, + struct nvkm_mem *mem, u32 pte, struct sg_page_iter *iter, + void **priv) +{ + struct nvkm_vm *vm = vma->vm; + struct nvkm_mmu *mmu = vm->mmu; + struct nvkm_mm_node *node; + struct nouveau_platform_device *plat; + struct 
gk20a_mmu_iommu_mapping *p; + int npages = 1 << (mmu->lpg_shift - mmu->spg_shift); + int i, ret; + u64 addr; + + plat = nv_device_to_platform(nv_device(&mmu->base)); + + *priv = kzalloc(sizeof(struct gk20a_mmu_iommu_mapping), GFP_KERNEL); + if (!*priv) + return; + + mutex_lock(&plat->gpu->iommu.mutex); + ret = nvkm_mm_head(plat->gpu->iommu.mm, + 0, + 1, + npages, + npages, + (1 << mmu->lpg_shift) >> 12, + &node); + mutex_unlock(&plat->gpu->iommu.mutex); + if (ret) + return; + + for (i = 0; i < npages; i++) { + dma_addr_t phys = sg_page_iter_dma_address(iter); + + ret = iommu_map(plat->gpu->iommu.domain, + (node->offset + i) << PAGE_SHIFT, + phys, + PAGE_SIZE, + IOMMU_READ | IOMMU_WRITE); + + if (ret < 0) + return; + + nv_trace(mmu, "IOMMU: IOVA=0x%016llx-> IOMMU -> PA=%016llx\n", + (u64)(node->offset + i) << PAGE_SHIFT, (u64)phys); + + if ((i < npages - 1) && !__sg_page_iter_next(iter)) { + nv_error(mmu, "failed to iterate sg table\n"); + return; + } + } + + addr = (u64)node->offset << PAGE_SHIFT; + addr |= BIT_ULL(plat->gpu->iommu.phys_addr_bit); + + gk20a_vm_map(vma, pgt, mem, pte, addr); + + p = *priv; + p->node = node; + p->iova = node->offset << PAGE_SHIFT; +} + +static void +gk20a_vm_unmap_iommu(struct nvkm_vma *vma, void *priv) +{ + struct nvkm_vm *vm = vma->vm; + struct nvkm_mmu *mmu = vm->mmu; + struct nouveau_platform_device *plat; + struct gk20a_mmu_iommu_mapping *p = priv; + int ret; + + plat = nv_device_to_platform(nv_device(&mmu->base)); + + ret = iommu_unmap(plat->gpu->iommu.domain, p->iova, + 1 << mmu->lpg_shift); + WARN(ret < 0, "failed to unmap IOMMU address 0x%16llx, ret=%d\n", + p->iova, ret); + + mutex_lock(&plat->gpu->iommu.mutex); + nvkm_mm_free(plat->gpu->iommu.mm, &p->node); + mutex_unlock(&plat->gpu->iommu.mutex); + + kfree(priv); +} + +static int +gk20a_mmu_ctor(struct nvkm_object *parent, struct nvkm_object *engine, + struct nvkm_oclass *oclass, void *data, u32 size, + struct nvkm_object **pobject) +{ + struct gk20a_mmu_priv *priv; + 
struct nouveau_platform_device *plat; + int ret; + + ret = nvkm_mmu_create(parent, engine, oclass, "VM", "vm", &priv); + *pobject = nv_object(priv); + if (ret) + return ret; + + plat = nv_device_to_platform(nv_device(parent)); + if (plat->gpu->iommu.domain) + priv->base.iommu_capable = true; + + priv->base.limit = 1ULL << 40; + priv->base.dma_bits = 40; + priv->base.pgt_bits = 27 - 12; + priv->base.spg_shift = 12; + priv->base.lpg_shift = 17; + priv->base.create = gf100_vm_create; + priv->base.map_pgt = gf100_vm_map_pgt; + priv->base.map = gf100_vm_map; + priv->base.map_sg = gf100_vm_map_sg; + priv->base.map_iommu = gk20a_vm_map_iommu; + priv->base.unmap_iommu = gk20a_vm_unmap_iommu; + priv->base.map_sg_iommu = gk20a_vm_map_sg_iommu; + priv->base.unmap = gf100_vm_unmap; + priv->base.flush = gf100_vm_flush; + + return 0; +} + +struct nvkm_oclass +gk20a_mmu_oclass = { + .handle = NV_SUBDEV(MMU, 0xea), + .ofuncs = &(struct nvkm_ofuncs) { + .ctor = gk20a_mmu_ctor, + .dtor = _nvkm_mmu_dtor, + .init = _nvkm_mmu_init, + .fini = _nvkm_mmu_fini, + }, +}; -- 2.1.4
Ilia Mirkin
2015-Apr-16 19:31 UTC
[Nouveau] [PATCH 6/6] mmu: gk20a: implement IOMMU mapping for big pages
Two questions -- (a) What's the perf impact of doing this? Less work for the GPU MMU but more work for the IOMMU... (b) Would it be a good idea to do this for desktop GPUs that are on CPUs with IOMMUs in them (VT-d and whatever the AMD one is)? Is there some sort of shared API for this stuff that you should be (or are?) using? -ilia On Thu, Apr 16, 2015 at 7:06 AM, Vince Hsu <vinceh at nvidia.com> wrote:> This patch uses IOMMU to aggregate (probably) discrete small pages as larger > big page(s) and map it to GMMU. > > Signed-off-by: Vince Hsu <vinceh at nvidia.com> > --- > drm/nouveau/nvkm/engine/device/gk104.c | 2 +- > drm/nouveau/nvkm/subdev/mmu/Kbuild | 1 + > drm/nouveau/nvkm/subdev/mmu/gk20a.c | 253 +++++++++++++++++++++++++++++++++ > 3 files changed, 255 insertions(+), 1 deletion(-) > create mode 100644 drm/nouveau/nvkm/subdev/mmu/gk20a.c > > diff --git a/drm/nouveau/nvkm/engine/device/gk104.c b/drm/nouveau/nvkm/engine/device/gk104.c > index 6a9483f65d83..9ea48ba31c0d 100644 > --- a/drm/nouveau/nvkm/engine/device/gk104.c > +++ b/drm/nouveau/nvkm/engine/device/gk104.c > @@ -172,7 +172,7 @@ gk104_identify(struct nvkm_device *device) > device->oclass[NVDEV_SUBDEV_LTC ] = gk104_ltc_oclass; > device->oclass[NVDEV_SUBDEV_IBUS ] = &gk20a_ibus_oclass; > device->oclass[NVDEV_SUBDEV_INSTMEM] = gk20a_instmem_oclass; > - device->oclass[NVDEV_SUBDEV_MMU ] = &gf100_mmu_oclass; > + device->oclass[NVDEV_SUBDEV_MMU ] = &gk20a_mmu_oclass; > device->oclass[NVDEV_SUBDEV_BAR ] = &gk20a_bar_oclass; > device->oclass[NVDEV_ENGINE_DMAOBJ ] = gf110_dmaeng_oclass; > device->oclass[NVDEV_ENGINE_FIFO ] = gk20a_fifo_oclass; > diff --git a/drm/nouveau/nvkm/subdev/mmu/Kbuild b/drm/nouveau/nvkm/subdev/mmu/Kbuild > index 012c9db687b2..141302a8e933 100644 > --- a/drm/nouveau/nvkm/subdev/mmu/Kbuild > +++ b/drm/nouveau/nvkm/subdev/mmu/Kbuild > @@ -4,3 +4,4 @@ nvkm-y += nvkm/subdev/mmu/nv41.o > nvkm-y += nvkm/subdev/mmu/nv44.o > nvkm-y += nvkm/subdev/mmu/nv50.o > nvkm-y += nvkm/subdev/mmu/gf100.o 
> +nvkm-y += nvkm/subdev/mmu/gk20a.o > diff --git a/drm/nouveau/nvkm/subdev/mmu/gk20a.c b/drm/nouveau/nvkm/subdev/mmu/gk20a.c > new file mode 100644 > index 000000000000..b444b73e208d > --- /dev/null > +++ b/drm/nouveau/nvkm/subdev/mmu/gk20a.c > @@ -0,0 +1,253 @@ > +/* > + * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. > + * > + * Permission is hereby granted, free of charge, to any person obtaining a > + * copy of this software and associated documentation files (the "Software"), > + * to deal in the Software without restriction, including without limitation > + * the rights to use, copy, modify, merge, publish, distribute, sublicense, > + * and/or sell copies of the Software, and to permit persons to whom the > + * Software is furnished to do so, subject to the following conditions: > + * > + * The above copyright notice and this permission notice shall be included in > + * all copies or substantial portions of the Software. > + * > + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR > + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER > + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING > + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER > + * DEALINGS IN THE SOFTWARE. 
> + */ > + > +#include <subdev/fb.h> > +#include <subdev/ltc.h> > +#include <subdev/mmu.h> > + > +#ifdef __KERNEL__ > +#include <linux/iommu.h> > +#include <nouveau_platform.h> > +#endif > + > +#include "gf100.h" > + > +struct gk20a_mmu_priv { > + struct nvkm_mmu base; > +}; > + > +struct gk20a_mmu_iommu_mapping { > + struct nvkm_mm_node *node; > + u64 iova; > +}; > + > +extern const u8 gf100_pte_storage_type_map[256]; > + > +static void > +gk20a_vm_map(struct nvkm_vma *vma, struct nvkm_gpuobj *pgt, > + struct nvkm_mem *mem, u32 pte, u64 list) > +{ > + u32 target = (vma->access & NV_MEM_ACCESS_NOSNOOP) ? 7 : 5; > + u64 phys; > + > + pte <<= 3; > + phys = gf100_vm_addr(vma, list, mem->memtype, target); > + > + if (mem->tag) { > + struct nvkm_ltc *ltc = nvkm_ltc(vma->vm->mmu); > + u32 tag = mem->tag->offset; > + phys |= (u64)tag << (32 + 12); > + ltc->tags_clear(ltc, tag, 1); > + } > + > + nv_wo32(pgt, pte + 0, lower_32_bits(phys)); > + nv_wo32(pgt, pte + 4, upper_32_bits(phys)); > +} > + > +static void > +gk20a_vm_map_iommu(struct nvkm_vma *vma, struct nvkm_gpuobj *pgt, > + struct nvkm_mem *mem, u32 pte, dma_addr_t *list, > + void **priv) > +{ > + struct nvkm_vm *vm = vma->vm; > + struct nvkm_mmu *mmu = vm->mmu; > + struct nvkm_mm_node *node; > + struct nouveau_platform_device *plat; > + struct gk20a_mmu_iommu_mapping *p; > + int npages = 1 << (mmu->lpg_shift - mmu->spg_shift); > + int i, ret; > + u64 addr; > + > + plat = nv_device_to_platform(nv_device(&mmu->base)); > + > + *priv = kzalloc(sizeof(struct gk20a_mmu_iommu_mapping), GFP_KERNEL); > + if (!*priv) > + return; > + > + mutex_lock(&plat->gpu->iommu.mutex); > + ret = nvkm_mm_head(plat->gpu->iommu.mm, > + 0, > + 1, > + npages, > + npages, > + (1 << mmu->lpg_shift) >> 12, > + &node); > + mutex_unlock(&plat->gpu->iommu.mutex); > + if (ret) > + return; > + > + for (i = 0; i < npages; i++, list++) { > + ret = iommu_map(plat->gpu->iommu.domain, > + (node->offset + i) << PAGE_SHIFT, > + *list, > + PAGE_SIZE, > + 
IOMMU_READ | IOMMU_WRITE); > + > + if (ret < 0) > + return; > + > + nv_trace(mmu, "IOMMU: IOVA=0x%016llx-> IOMMU -> PA=%016llx\n", > + (u64)(node->offset + i) << PAGE_SHIFT, (u64)(*list)); > + } > + > + addr = (u64)node->offset << PAGE_SHIFT; > + addr |= BIT_ULL(plat->gpu->iommu.phys_addr_bit); > + > + gk20a_vm_map(vma, pgt, mem, pte, addr); > + > + p = *priv; > + p->node = node; > + p->iova = node->offset << PAGE_SHIFT; > +} > + > +static void > +gk20a_vm_map_sg_iommu(struct nvkm_vma *vma, struct nvkm_gpuobj *pgt, > + struct nvkm_mem *mem, u32 pte, struct sg_page_iter *iter, > + void **priv) > +{ > + struct nvkm_vm *vm = vma->vm; > + struct nvkm_mmu *mmu = vm->mmu; > + struct nvkm_mm_node *node; > + struct nouveau_platform_device *plat; > + struct gk20a_mmu_iommu_mapping *p; > + int npages = 1 << (mmu->lpg_shift - mmu->spg_shift); > + int i, ret; > + u64 addr; > + > + plat = nv_device_to_platform(nv_device(&mmu->base)); > + > + *priv = kzalloc(sizeof(struct gk20a_mmu_iommu_mapping), GFP_KERNEL); > + if (!*priv) > + return; > + > + mutex_lock(&plat->gpu->iommu.mutex); > + ret = nvkm_mm_head(plat->gpu->iommu.mm, > + 0, > + 1, > + npages, > + npages, > + (1 << mmu->lpg_shift) >> 12, > + &node); > + mutex_unlock(&plat->gpu->iommu.mutex); > + if (ret) > + return; > + > + for (i = 0; i < npages; i++) { > + dma_addr_t phys = sg_page_iter_dma_address(iter); > + > + ret = iommu_map(plat->gpu->iommu.domain, > + (node->offset + i) << PAGE_SHIFT, > + phys, > + PAGE_SIZE, > + IOMMU_READ | IOMMU_WRITE); > + > + if (ret < 0) > + return; > + > + nv_trace(mmu, "IOMMU: IOVA=0x%016llx-> IOMMU -> PA=%016llx\n", > + (u64)(node->offset + i) << PAGE_SHIFT, (u64)phys); > + > + if ((i < npages - 1) && !__sg_page_iter_next(iter)) { > + nv_error(mmu, "failed to iterate sg table\n"); > + return; > + } > + } > + > + addr = (u64)node->offset << PAGE_SHIFT; > + addr |= BIT_ULL(plat->gpu->iommu.phys_addr_bit); > + > + gk20a_vm_map(vma, pgt, mem, pte, addr); > + > + p = *priv; > + p->node = node; 
> + p->iova = node->offset << PAGE_SHIFT; > +} > + > +static void > +gk20a_vm_unmap_iommu(struct nvkm_vma *vma, void *priv) > +{ > + struct nvkm_vm *vm = vma->vm; > + struct nvkm_mmu *mmu = vm->mmu; > + struct nouveau_platform_device *plat; > + struct gk20a_mmu_iommu_mapping *p = priv; > + int ret; > + > + plat = nv_device_to_platform(nv_device(&mmu->base)); > + > + ret = iommu_unmap(plat->gpu->iommu.domain, p->iova, > + 1 << mmu->lpg_shift); > + WARN(ret < 0, "failed to unmap IOMMU address 0x%16llx, ret=%d\n", > + p->iova, ret); > + > + mutex_lock(&plat->gpu->iommu.mutex); > + nvkm_mm_free(plat->gpu->iommu.mm, &p->node); > + mutex_unlock(&plat->gpu->iommu.mutex); > + > + kfree(priv); > +} > + > +static int > +gk20a_mmu_ctor(struct nvkm_object *parent, struct nvkm_object *engine, > + struct nvkm_oclass *oclass, void *data, u32 size, > + struct nvkm_object **pobject) > +{ > + struct gk20a_mmu_priv *priv; > + struct nouveau_platform_device *plat; > + int ret; > + > + ret = nvkm_mmu_create(parent, engine, oclass, "VM", "vm", &priv); > + *pobject = nv_object(priv); > + if (ret) > + return ret; > + > + plat = nv_device_to_platform(nv_device(parent)); > + if (plat->gpu->iommu.domain) > + priv->base.iommu_capable = true; > + > + priv->base.limit = 1ULL << 40; > + priv->base.dma_bits = 40; > + priv->base.pgt_bits = 27 - 12; > + priv->base.spg_shift = 12; > + priv->base.lpg_shift = 17; > + priv->base.create = gf100_vm_create; > + priv->base.map_pgt = gf100_vm_map_pgt; > + priv->base.map = gf100_vm_map; > + priv->base.map_sg = gf100_vm_map_sg; > + priv->base.map_iommu = gk20a_vm_map_iommu; > + priv->base.unmap_iommu = gk20a_vm_unmap_iommu; > + priv->base.map_sg_iommu = gk20a_vm_map_sg_iommu; > + priv->base.unmap = gf100_vm_unmap; > + priv->base.flush = gf100_vm_flush; > + > + return 0; > +} > + > +struct nvkm_oclass > +gk20a_mmu_oclass = { > + .handle = NV_SUBDEV(MMU, 0xea), > + .ofuncs = &(struct nvkm_ofuncs) { > + .ctor = gk20a_mmu_ctor, > + .dtor = _nvkm_mmu_dtor, > + 
.init = _nvkm_mmu_init, > + .fini = _nvkm_mmu_fini, > + }, > +}; > -- > 2.1.4 > > _______________________________________________ > Nouveau mailing list > Nouveau at lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/nouveau
Terje Bergstrom
2015-Apr-16 20:41 UTC
[Nouveau] [PATCH 1/6] platform: specify the IOMMU physical translation bit
On 04/16/2015 04:06 AM, Vince Hsu wrote:> diff --git a/drm/nouveau/nouveau_platform.h b/drm/nouveau/nouveau_platform.h > index 392874cf4725..3e9bd7dc0092 100644 > --- a/drm/nouveau/nouveau_platform.h > +++ b/drm/nouveau/nouveau_platform.h > @@ -53,6 +53,7 @@ struct nouveau_platform_gpu { > struct nvkm_mm *mm; > struct iommu_domain *domain; > unsigned long pgshift; > + unsigned long phys_addr_bit; > } iommu; > };The naming is a bit reversed - the bit set tells we use SMMU, and bit cleared tells we use physical addressing. So the name of the bit should be iommu_addr_bit. Terje
Alexandre Courbot
2015-Apr-17 06:25 UTC
[Nouveau] [PATCH 0/6] map big page by platform IOMMU
On Thu, Apr 16, 2015 at 8:06 PM, Vince Hsu <vinceh at nvidia.com> wrote:> Hi, > > Generally the the imported buffers which has memory type TTM_PL_TT are > mapped as small pages probably due to lack of big page allocation. But the > platform device which also use memory type TTM_PL_TT, like GK20A, canNit: GK20A can *only* allocate GPU memory from TTM_PL_TT. Trying to allocate from VRAM will result in an error.> *allocate* big page though the IOMMU hardware inside the SoC. This is a try > to map the imported buffers as big pages in GMMU by the platform IOMMU. With > some preparation work to map decreate small pages into big page(s) by IOMMUdecreate?> the GMMU eventually sees the imported buffer as chunks of big pages and does > the mapping. And then we can probably do the compression on teh imported > buffer which is composed of non-contiguous small pages. The compbits related > patches shall come later. > > I guess most of you won't like the change for the MMU code in this series. > So please comment and guide me how to do this better. 
:) > > Thanks, > Vince > > Vince Hsu (6): > platform: specify the IOMMU physical translation bit > instmem/gk20a: refer to IOMMU physical translation bit > mmu: map small pages into big pages(s) by IOMMU if possible > drm: enable big page mapping for small pages when IOMMU is available > mmu: gf100: share most of functions with GK20A > mmu: gk20a: implement IOMMU mapping for big pages > > drm/nouveau/include/nvkm/subdev/mmu.h | 16 ++ > drm/nouveau/nouveau_bo.c | 9 ++ > drm/nouveau/nouveau_platform.c | 19 +++ > drm/nouveau/nouveau_platform.h | 1 + > drm/nouveau/nvkm/engine/device/gk104.c | 2 +- > drm/nouveau/nvkm/subdev/instmem/gk20a.c | 13 +- > drm/nouveau/nvkm/subdev/mmu/Kbuild | 1 + > drm/nouveau/nvkm/subdev/mmu/base.c | 158 +++++++++++++++++++- > drm/nouveau/nvkm/subdev/mmu/gf100.c | 28 +--- > drm/nouveau/nvkm/subdev/mmu/gf100.h | 46 ++++++ > drm/nouveau/nvkm/subdev/mmu/gk20a.c | 253 ++++++++++++++++++++++++++++++++ > lib/include/nvif/os.h | 12 ++ > 12 files changed, 526 insertions(+), 32 deletions(-) > create mode 100644 drm/nouveau/nvkm/subdev/mmu/gf100.h > create mode 100644 drm/nouveau/nvkm/subdev/mmu/gk20a.c > > -- > 2.1.4 >
Alexandre Courbot
2015-Apr-17 06:26 UTC
[Nouveau] [PATCH 1/6] platform: specify the IOMMU physical translation bit
On Thu, Apr 16, 2015 at 8:06 PM, Vince Hsu <vinceh at nvidia.com> wrote:> The IOMMU physical translation bit might vary with different SoCs. So add > a variable to specify this bit for GK20A. > > Signed-off-by: Vince Hsu <vinceh at nvidia.com> > --- > drm/nouveau/nouveau_platform.c | 19 +++++++++++++++++++ > drm/nouveau/nouveau_platform.h | 1 + > 2 files changed, 20 insertions(+) > > diff --git a/drm/nouveau/nouveau_platform.c b/drm/nouveau/nouveau_platform.c > index 775277f1edb0..0d002f73e356 100644 > --- a/drm/nouveau/nouveau_platform.c > +++ b/drm/nouveau/nouveau_platform.c > @@ -25,6 +25,7 @@ > #include <linux/module.h> > #include <linux/platform_device.h> > #include <linux/of.h> > +#include <linux/of_device.h> > #include <linux/reset.h> > #include <linux/regulator/consumer.h> > #include <linux/iommu.h> > @@ -92,6 +93,22 @@ static int nouveau_platform_power_down(struct nouveau_platform_gpu *gpu) > return 0; > } > > +static unsigned long nouveau_platform_get_iommu_bit(struct device *dev) > +{ > + const struct of_device_id *match; > + > + match = of_match_device(dev->driver->of_match_table, dev); > + if (!match) { > + dev_warn(dev, "cannot find OF match for device\n"); > + return 0; > + } > + > + if (!strcmp(match->compatible, "nvidia,gk20a")) > + return 34; > + else > + return 0; > +}Instead of this function, you should probably use the data field of struct of_device_id. Define a local struct called, say, nouveau_platform_params containing (for now) a single iommu_addr_bit field and instanciate one for each entry of nouveau_platform_match. Then you can cast match->data and retrieve the field directly instead of using strcmp. I'd say this is then simple enough to do directly in nouveau_platform_probe_iommu() instead of having a function dedicated to it. 
It is also safer because when we add a new chip, we have to update nouveau_platform_match but might very well forget about your function, and will end up with bit 0 being set on all our GPU addresses and endless hours of fun figuring out how this happened. :) While I am at it: how about defining this as a u64 mask to set/unset on GPU addresses instead of just a bit? This is more flexible and again safer in case someone "omits" to specify the correct bit for a chip.
Alexandre Courbot
2015-Apr-17 06:26 UTC
[Nouveau] [PATCH 2/6] instmem/gk20a: refer to IOMMU physical translation bit
On Thu, Apr 16, 2015 at 8:06 PM, Vince Hsu <vinceh at nvidia.com> wrote:> Instead of hard-coding the translation bit in subdev driver, we refer to > the platform data. > > Signed-off-by: Vince Hsu <vinceh at nvidia.com> > --- > drm/nouveau/nvkm/subdev/instmem/gk20a.c | 13 +++++++++---- > 1 file changed, 9 insertions(+), 4 deletions(-) > > diff --git a/drm/nouveau/nvkm/subdev/instmem/gk20a.c b/drm/nouveau/nvkm/subdev/instmem/gk20a.c > index dd0994d9ebfc..69ef5eae3279 100644 > --- a/drm/nouveau/nvkm/subdev/instmem/gk20a.c > +++ b/drm/nouveau/nvkm/subdev/instmem/gk20a.c > @@ -89,6 +89,7 @@ struct gk20a_instmem_priv { > struct nvkm_mm *mm; > struct iommu_domain *domain; > unsigned long iommu_pgshift; > + unsigned long iommu_phys_addr_bit; > > /* Only used by DMA API */ > struct dma_attrs attrs; > @@ -169,8 +170,8 @@ gk20a_instobj_dtor_iommu(struct gk20a_instobj_priv *_node) > r = list_first_entry(&_node->mem->regions, struct nvkm_mm_node, > rl_entry); > > - /* clear bit 34 to unmap pages */ > - r->offset &= ~BIT(34 - priv->iommu_pgshift); > + /* clear IOMMU translation bit to unmap pages */ > + r->offset &= ~BIT(priv->iommu_phys_addr_bit - priv->iommu_pgshift); > > /* Unmap pages from GPU address space and free them */ > for (i = 0; i < _node->mem->size; i++) { > @@ -298,8 +299,11 @@ gk20a_instobj_ctor_iommu(struct nvkm_object *parent, struct nvkm_object *engine, > } > } > > - /* Bit 34 tells that an address is to be resolved through the IOMMU */ > - r->offset |= BIT(34 - priv->iommu_pgshift); > + /* > + * The iommu_phys_addr_bit tells that an address is to be resolved > + * through the IOMMU > + */ > + r->offset |= BIT(priv->iommu_phys_addr_bit - priv->iommu_pgshift); > > node->base._mem.offset = ((u64)r->offset) << priv->iommu_pgshift; > > @@ -407,6 +411,7 @@ gk20a_instmem_ctor(struct nvkm_object *parent, struct nvkm_object *engine, > priv->domain = plat->gpu->iommu.domain; > priv->mm = plat->gpu->iommu.mm; > priv->iommu_pgshift = plat->gpu->iommu.pgshift; > + 
priv->iommu_phys_addr_bit = plat->gpu->iommu.phys_addr_bit;

Looks good, but I think I would definitely prefer this to be a mask instead of a bit index, i.e.:

    r->offset &= ~(priv->iommu_addr_mask >> priv->iommu_pgshift);

and

    r->offset |= (priv->iommu_addr_mask >> priv->iommu_pgshift);
Alexandre Courbot
2015-Apr-17 06:26 UTC
[Nouveau] [PATCH 4/6] drm: enable big page mapping for small pages when IOMMU is available
On Thu, Apr 16, 2015 at 8:06 PM, Vince Hsu <vinceh at nvidia.com> wrote:> Some platforms have IOMMU to map non-contiguous physical memory into > contiguous GPU virtual address. We can use this feature to enable big pages > mapping on scattered small pages. To achieve that, we also need changes in > subdev/mmu as well. > > Signed-off-by: Vince Hsu <vinceh at nvidia.com> > --- > drm/nouveau/nouveau_bo.c | 9 +++++++++ > 1 file changed, 9 insertions(+) > > diff --git a/drm/nouveau/nouveau_bo.c b/drm/nouveau/nouveau_bo.c > index 77326e344dad..da76ee1121e4 100644 > --- a/drm/nouveau/nouveau_bo.c > +++ b/drm/nouveau/nouveau_bo.c > @@ -221,6 +221,11 @@ nouveau_bo_new(struct drm_device *dev, int size, int align, > if (drm->client.vm) { > if (!(flags & TTM_PL_FLAG_TT) && size > 256 * 1024) > nvbo->page_shift = drm->client.vm->mmu->lpg_shift; > + > + if ((flags & TTM_PL_FLAG_TT) && > + drm->client.vm->mmu->iommu_capable && > + (size % (1 << drm->client.vm->mmu->lpg_shift)) == 0) > + nvbo->page_shift = drm->client.vm->mmu->lpg_shift;I wonder if we should not just use the same size heuristics as for VRAM above? Here, unless your buffer size is an exact multiple of the big page size (128K for GK20A), you will not use big pages at all. In effect, many buffers will be rejected for this reason. A behavior like "if the buffer size of more than 256KB, increase the size of the buffer to the next multiple of 128K and use big pages" would probably yield better results.> } > > nouveau_bo_fixup_align(nvbo, flags, &align, &size); > @@ -1641,6 +1646,10 @@ nouveau_bo_vma_add(struct nouveau_bo *nvbo, struct nvkm_vm *vm, > (nvbo->bo.mem.mem_type == TTM_PL_VRAM || > nvbo->page_shift != vma->vm->mmu->lpg_shift)) > nvkm_vm_map(vma, nvbo->bo.mem.mm_node); > + else if (nvbo->bo.mem.mem_type == TTM_PL_TT && > + vma->vm->mmu->iommu_capable && > + nvbo->page_shift == vma->vm->mmu->lpg_shift) > + nvkm_vm_map(vma, nvbo->bo.mem.mm_node);Sorry, I don't understand why this is needed, could you explain?
Alexandre Courbot
2015-Apr-17 06:27 UTC
[Nouveau] [PATCH 5/6] mmu: gf100: share most of functions with GK20A
On Thu, Apr 16, 2015 at 8:06 PM, Vince Hsu <vinceh at nvidia.com> wrote:> This patch moves all of the functions which can be shared with GK20A to > public for later use. > > Signed-off-by: Vince Hsu <vinceh at nvidia.com> > --- > drm/nouveau/nvkm/subdev/mmu/gf100.c | 28 +++++++--------------- > drm/nouveau/nvkm/subdev/mmu/gf100.h | 46 +++++++++++++++++++++++++++++++++++++ > 2 files changed, 54 insertions(+), 20 deletions(-) > create mode 100644 drm/nouveau/nvkm/subdev/mmu/gf100.h > > diff --git a/drm/nouveau/nvkm/subdev/mmu/gf100.c b/drm/nouveau/nvkm/subdev/mmu/gf100.c > index 294cda37f068..b067ded5d3be 100644 > --- a/drm/nouveau/nvkm/subdev/mmu/gf100.c > +++ b/drm/nouveau/nvkm/subdev/mmu/gf100.c > @@ -29,6 +29,8 @@ > > #include <core/gpuobj.h> > > +#include "gf100.h" > + > struct gf100_mmu_priv { > struct nvkm_mmu base; > }; > @@ -74,7 +76,7 @@ const u8 gf100_pte_storage_type_map[256] > }; > > > -static void > +void > gf100_vm_map_pgt(struct nvkm_gpuobj *pgd, u32 index, struct nvkm_gpuobj *pgt[2]) > { > u32 pde[2] = { 0, 0 }; > @@ -88,21 +90,7 @@ gf100_vm_map_pgt(struct nvkm_gpuobj *pgd, u32 index, struct nvkm_gpuobj *pgt[2]) > nv_wo32(pgd, (index * 8) + 4, pde[1]); > } > > -static inline u64 > -gf100_vm_addr(struct nvkm_vma *vma, u64 phys, u32 memtype, u32 target) > -{ > - phys >>= 8; > - > - phys |= 0x00000001; /* present */ > - if (vma->access & NV_MEM_ACCESS_SYS) > - phys |= 0x00000002; > - > - phys |= ((u64)target << 32); > - phys |= ((u64)memtype << 36); > - return phys; > -} > - > -static void > +void > gf100_vm_map(struct nvkm_vma *vma, struct nvkm_gpuobj *pgt, > struct nvkm_mem *mem, u32 pte, u32 cnt, u64 phys, u64 delta) > { > @@ -127,7 +115,7 @@ gf100_vm_map(struct nvkm_vma *vma, struct nvkm_gpuobj *pgt, > } > } > > -static void > +void > gf100_vm_map_sg(struct nvkm_vma *vma, struct nvkm_gpuobj *pgt, > struct nvkm_mem *mem, u32 pte, u32 cnt, dma_addr_t *list) > { > @@ -144,7 +132,7 @@ gf100_vm_map_sg(struct nvkm_vma *vma, struct nvkm_gpuobj *pgt, > } > 
} > > -static void > +void > gf100_vm_unmap(struct nvkm_gpuobj *pgt, u32 pte, u32 cnt) > { > pte <<= 3; > @@ -155,7 +143,7 @@ gf100_vm_unmap(struct nvkm_gpuobj *pgt, u32 pte, u32 cnt) > } > } > > -static void > +void > gf100_vm_flush(struct nvkm_vm *vm) > { > struct gf100_mmu_priv *priv = (void *)vm->mmu; > @@ -191,7 +179,7 @@ gf100_vm_flush(struct nvkm_vm *vm) > mutex_unlock(&nv_subdev(priv)->mutex); > } > > -static int > +int > gf100_vm_create(struct nvkm_mmu *mmu, u64 offset, u64 length, u64 mm_offset, > struct nvkm_vm **pvm) > { > diff --git a/drm/nouveau/nvkm/subdev/mmu/gf100.h b/drm/nouveau/nvkm/subdev/mmu/gf100.h > new file mode 100644 > index 000000000000..a66ca45bc755 > --- /dev/null > +++ b/drm/nouveau/nvkm/subdev/mmu/gf100.h > @@ -0,0 +1,46 @@ > +#ifndef __GF100_MMU_PRIV__ > +#define __GF100_MMU_PRIV__ > + > +#include <subdev/mmu.h> > + > +struct nv04_mmu_priv { > + struct nvkm_mmu base; > + struct nvkm_vm *vm; > + dma_addr_t null; > + void *nullp; > +};This structure is already declared in nv04.h - it looks wrong to re-declare it here, especially since it seems to be unused in all your code?> + > +int > +gf100_vm_create(struct nvkm_mmu *mmu, u64 offset, u64 length, u64 mm_offset, > + struct nvkm_vm **pvm); > + > +void > +gf100_vm_map_pgt(struct nvkm_gpuobj *pgd, u32 index, > + struct nvkm_gpuobj *pgt[2]); > +void > +gf100_vm_map(struct nvkm_vma *vma, struct nvkm_gpuobj *pgt, > + struct nvkm_mem *mem, u32 pte, u32 cnt, u64 phys, u64 delta); > +void > +gf100_vm_map_sg(struct nvkm_vma *vma, struct nvkm_gpuobj *pgt, > + struct nvkm_mem *mem, u32 pte, u32 cnt, dma_addr_t *list); > +void > +gf100_vm_unmap(struct nvkm_gpuobj *pgt, u32 pte, u32 cnt); > + > +void > +gf100_vm_flush(struct nvkm_vm *vm); > + > +static inline u64 > +gf100_vm_addr(struct nvkm_vma *vma, u64 phys, u32 memtype, u32 target) > +{ > + phys >>= 8; > + > + phys |= 0x00000001; /* present */ > + if (vma->access & NV_MEM_ACCESS_SYS) > + phys |= 0x00000002; > + > + phys |= ((u64)target << 32); > 
+ phys |= ((u64)memtype << 36); > + return phys; > +} > + > +#endif > -- > 2.1.4 >
On 04/17/2015 02:25 PM, Alexandre Courbot wrote:> On Thu, Apr 16, 2015 at 8:06 PM, Vince Hsu <vinceh at nvidia.com> wrote: >> Hi, >> >> Generally the the imported buffers which has memory type TTM_PL_TT are >> mapped as small pages probably due to lack of big page allocation. But the >> platform device which also use memory type TTM_PL_TT, like GK20A, can > Nit: GK20A can *only* allocate GPU memory from TTM_PL_TT. Trying to > allocate from VRAM will result in an error.Yep.> >> *allocate* big page though the IOMMU hardware inside the SoC. This is a try >> to map the imported buffers as big pages in GMMU by the platform IOMMU. With >> some preparation work to map decreate small pages into big page(s) by IOMMU > decreate?It should be discrete. Sorry for the typo.> >> the GMMU eventually sees the imported buffer as chunks of big pages and does >> the mapping. And then we can probably do the compression on teh imported >> buffer which is composed of non-contiguous small pages. The compbits related >> patches shall come later. >> >> I guess most of you won't like the change for the MMU code in this series. >> So please comment and guide me how to do this better. 
:) >> >> Thanks, >> Vince >> >> Vince Hsu (6): >> platform: specify the IOMMU physical translation bit >> instmem/gk20a: refer to IOMMU physical translation bit >> mmu: map small pages into big pages(s) by IOMMU if possible >> drm: enable big page mapping for small pages when IOMMU is available >> mmu: gf100: share most of functions with GK20A >> mmu: gk20a: implement IOMMU mapping for big pages >> >> drm/nouveau/include/nvkm/subdev/mmu.h | 16 ++ >> drm/nouveau/nouveau_bo.c | 9 ++ >> drm/nouveau/nouveau_platform.c | 19 +++ >> drm/nouveau/nouveau_platform.h | 1 + >> drm/nouveau/nvkm/engine/device/gk104.c | 2 +- >> drm/nouveau/nvkm/subdev/instmem/gk20a.c | 13 +- >> drm/nouveau/nvkm/subdev/mmu/Kbuild | 1 + >> drm/nouveau/nvkm/subdev/mmu/base.c | 158 +++++++++++++++++++- >> drm/nouveau/nvkm/subdev/mmu/gf100.c | 28 +--- >> drm/nouveau/nvkm/subdev/mmu/gf100.h | 46 ++++++ >> drm/nouveau/nvkm/subdev/mmu/gk20a.c | 253 ++++++++++++++++++++++++++++++++ >> lib/include/nvif/os.h | 12 ++ >> 12 files changed, 526 insertions(+), 32 deletions(-) >> create mode 100644 drm/nouveau/nvkm/subdev/mmu/gf100.h >> create mode 100644 drm/nouveau/nvkm/subdev/mmu/gk20a.c >> >> -- >> 2.1.4 >>----------------------------------------------------------------------------------- This email message is for the sole use of the intended recipient(s) and may contain confidential information. Any unauthorized review, use, disclosure or distribution is prohibited. If you are not the intended recipient, please contact the sender by reply email and destroy all copies of the original message. -----------------------------------------------------------------------------------
Alexandre Courbot
2015-Apr-17 09:11 UTC
[Nouveau] [PATCH 3/6] mmu: map small pages into big pages(s) by IOMMU if possible
On Thu, Apr 16, 2015 at 8:06 PM, Vince Hsu <vinceh at nvidia.com> wrote:> This patch implements a way to aggregate the small pages and make them be > mapped as big page(s) by utilizing the platform IOMMU if supported. And then > we can enable compression support for these big pages later. > > Signed-off-by: Vince Hsu <vinceh at nvidia.com> > --- > drm/nouveau/include/nvkm/subdev/mmu.h | 16 ++++ > drm/nouveau/nvkm/subdev/mmu/base.c | 158 ++++++++++++++++++++++++++++++++--I believe (although I may have missed something) that this patch (and patch 6/6) can be rewritten such as these two files remain untouched. IOW, no new data structures (because the PTE will contain all the information you need), and no change to base.c (because IOMMU is chip-specific logic, although one may argue that the use of the IOMMU API makes it more generic). But let's review the extra data structures first:> lib/include/nvif/os.h | 12 +++ > 3 files changed, 179 insertions(+), 7 deletions(-) > > diff --git a/drm/nouveau/include/nvkm/subdev/mmu.h b/drm/nouveau/include/nvkm/subdev/mmu.h > index 3a5368776c31..3230d31a7971 100644 > --- a/drm/nouveau/include/nvkm/subdev/mmu.h > +++ b/drm/nouveau/include/nvkm/subdev/mmu.h > @@ -22,6 +22,8 @@ struct nvkm_vma { > struct nvkm_mm_node *node; > u64 offset; > u32 access; > + struct list_head bp; > + bool has_iommu_bp;Whether a chunk of memory is mapped through the IOMMU can be tested by checking if the IOMMU bit is set in the address recorded in the PTE. So has_iommu_bp looks redundant here.> }; > > struct nvkm_vm { > @@ -37,6 +39,13 @@ struct nvkm_vm { > u32 lpde; > }; > > +struct nvkm_vm_bp_list { > + struct list_head head; > + u32 pde; > + u32 pte; > + void *priv; > +}; > +Tracking the PDE and PTE of each memory chunk can probably be avoided if you change your unmapping strategy. 
Currently you are going through the list of nvkm_vm_bp_list, but you know your PDE and PTE are always going to be adjacent, since a nvkm_vma represents a contiguous block in the GPU VA. So when unmapping, you can simply check for each PTE entry whether the IOMMU bit is set, and unmap from the IOMMU space after unmapping from the GPU VA space, in a loop similar to that of nvkm_vm_unmap_at(). Then we only need priv. You are keeping the nvkm_mm_node of the IOMMU space into it, and you need it to free the IOMMU VA space. If only we could find another way to store it, we could get rid of the whole structure and associated list_head in nvkm_vma... I need to give it some more thoughts, and we will probably need to change a few things in base.c to make the hooks more flexible, so please give me some more time to think about it. :) I just wanted to share my thoughts so far in case this puts you on track.
Possibly Parallel Threads
- [PATCH 3/6] mmu: map small pages into big pages(s) by IOMMU if possible
- [PATCH 0/6] map big page by platform IOMMU
- [PATCH 6/6] mmu: gk20a: implement IOMMU mapping for big pages
- [drm-nouveau-mmu] question about potential NULL pointer dereference
- CUDA fixed VA allocations and sparse mappings