Si-Wei Liu
2023-Sep-09 13:31 UTC
[PATCH RFC v2 0/4] vdpa: decouple reset of iotlb mapping from device reset
In order to reduce needlessly high setup and teardown cost of iotlb mapping during live migration, it's crucial to decouple the vhost-vdpa iotlb abstraction from the virtio device life cycle, i.e. iotlb mappings should be left intact across virtio device reset [1]. For it to work, the on-chip IOMMU parent device should implement a separate .reset_map() operation callback to restore 1:1 DMA mapping without having to resort to the .reset() callback, which is mainly used to reset virtio specific device state. This new .reset_map() callback will be invoked only when the vhost-vdpa driver is to be removed and detached from the vdpa bus, such that other vdpa bus drivers, e.g. virtio-vdpa, can start with 1:1 DMA mapping when they are attached. For the context, those on-chip IOMMU parent devices, create the 1:1 DMA mapping at vdpa device add, and they would implicitly destroy the 1:1 mapping when the first .set_map or .dma_map callback is invoked. [1] Reducing vdpa migration downtime because of memory pin / maps https://www.mail-archive.com/qemu-devel at nongnu.org/msg953755.html --- RFC v2: - rebased on top of the "[PATCH RFC v2 0/3] vdpa: dedicated descriptor table group" series: https://lore.kernel.org/virtualization/1694248959-13369-1-git-send-email-si-wei.liu at oracle.com/ --- Si-Wei Liu (4): vdpa: introduce .reset_map operation callback vdpa/mlx5: implement .reset_map driver op vhost-vdpa: should restore 1:1 dma mapping before detaching driver vhost-vdpa: introduce IOTLB_PERSIST backend feature bit drivers/vdpa/mlx5/core/mlx5_vdpa.h | 1 + drivers/vdpa/mlx5/core/mr.c | 70 +++++++++++++++++++++++--------------- drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 +++++++--- drivers/vhost/vdpa.c | 32 ++++++++++++++++- include/linux/vdpa.h | 7 ++++ include/uapi/linux/vhost_types.h | 2 ++ 6 files changed, 96 insertions(+), 34 deletions(-) -- 1.8.3.1
Si-Wei Liu
2023-Sep-09 13:31 UTC
[PATCH RFC v2 1/4] vdpa: introduce .reset_map operation callback
On-chip IOMMU parent driver could use it to restore memory mapping to the initial state. Signed-off-by: Si-Wei Liu <si-wei.liu at oracle.com> --- include/linux/vdpa.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 17a4efa..daecf55 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -324,6 +324,12 @@ struct vdpa_map_file { * @iova: iova to be unmapped * @size: size of the area * Returns integer: success (0) or error (< 0) + * @reset_map: Reset device memory mapping (optional) + * Needed for device that using device + * specific DMA translation (on-chip IOMMU) + * @vdev: vdpa device + * @asid: address space identifier + * Returns integer: success (0) or error (< 0) * @get_vq_dma_dev: Get the dma device for a specific * virtqueue (optional) * @vdev: vdpa device @@ -401,6 +407,7 @@ struct vdpa_config_ops { u64 iova, u64 size, u64 pa, u32 perm, void *opaque); int (*dma_unmap)(struct vdpa_device *vdev, unsigned int asid, u64 iova, u64 size); + int (*reset_map)(struct vdpa_device *vdev, unsigned int asid); int (*set_group_asid)(struct vdpa_device *vdev, unsigned int group, unsigned int asid); struct device *(*get_vq_dma_dev)(struct vdpa_device *vdev, u16 idx); -- 1.8.3.1
Si-Wei Liu
2023-Sep-09 13:31 UTC
[PATCH RFC v2 2/4] vdpa/mlx5: implement .reset_map driver op
Today, mlx5_vdpa gets started by preallocate 1:1 DMA mapping at device creation time, while this 1:1 mapping will be implicitly destroyed when the first .set_map call is invoked. Everytime when the .reset callback is invoked, any mapping left behind will be dropped then reset back to the initial 1:1 DMA mapping. In order to reduce excessive memory mapping cost during live migration, it is desirable to decouple the vhost-vdpa iotlb abstraction from the virtio device life cycle, i.e. mappings should be left intact across virtio device reset. Leverage the .reset_map callback to reset memory mapping, then the device .reset routine can run free from having to clean up memory mappings. Signed-off-by: Si-Wei Liu <si-wei.liu at oracle.com> --- RFC v1 -> v2: - fix error path when both CVQ and DVQ fall in same asid --- drivers/vdpa/mlx5/core/mlx5_vdpa.h | 1 + drivers/vdpa/mlx5/core/mr.c | 70 +++++++++++++++++++++++--------------- drivers/vdpa/mlx5/net/mlx5_vnet.c | 18 +++++++--- 3 files changed, 56 insertions(+), 33 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h b/drivers/vdpa/mlx5/core/mlx5_vdpa.h index b53420e..5c9a25a 100644 --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h @@ -123,6 +123,7 @@ int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, unsigned int asid); void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev); void mlx5_vdpa_destroy_mr_asid(struct mlx5_vdpa_dev *mvdev, unsigned int asid); +int mlx5_vdpa_reset_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid); #define mlx5_vdpa_warn(__dev, format, ...) \ dev_warn((__dev)->mdev->device, "%s:%d:(pid %d) warning: " format, __func__, __LINE__, \ diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index 5a1971fc..ec2c7b4e1 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -489,21 +489,15 @@ static void destroy_user_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr } } -static void _mlx5_vdpa_destroy_cvq_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid) +static void _mlx5_vdpa_destroy_cvq_mr(struct mlx5_vdpa_dev *mvdev) { - if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] != asid) - return; - prune_iotlb(mvdev); } -static void _mlx5_vdpa_destroy_dvq_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid) +static void _mlx5_vdpa_destroy_dvq_mr(struct mlx5_vdpa_dev *mvdev) { struct mlx5_vdpa_mr *mr = &mvdev->mr; - if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] != asid) - return; - if (!mr->initialized) return; @@ -521,8 +515,10 @@ void mlx5_vdpa_destroy_mr_asid(struct mlx5_vdpa_dev *mvdev, unsigned int asid) mutex_lock(&mr->mkey_mtx); - _mlx5_vdpa_destroy_dvq_mr(mvdev, asid); - _mlx5_vdpa_destroy_cvq_mr(mvdev, asid); + if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] == asid) + _mlx5_vdpa_destroy_dvq_mr(mvdev); + if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] == asid) + _mlx5_vdpa_destroy_cvq_mr(mvdev); mutex_unlock(&mr->mkey_mtx); } @@ -534,25 +530,17 @@ void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev) } static int _mlx5_vdpa_create_cvq_mr(struct mlx5_vdpa_dev *mvdev, - struct vhost_iotlb *iotlb, - unsigned int asid) + struct vhost_iotlb *iotlb) { - if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] != asid) - return 0; - return dup_iotlb(mvdev, iotlb); } static int _mlx5_vdpa_create_dvq_mr(struct mlx5_vdpa_dev *mvdev, - struct vhost_iotlb *iotlb, - unsigned int asid) + struct vhost_iotlb *iotlb) { struct mlx5_vdpa_mr *mr = &mvdev->mr; int err; - if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] != asid) - return 0; - if (mr->initialized) return 0; @@ -574,18 +562,22 @@ static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, { int err; - err = _mlx5_vdpa_create_dvq_mr(mvdev, iotlb, asid); - if (err) - return err; - - err = _mlx5_vdpa_create_cvq_mr(mvdev, iotlb, asid); - if (err) - goto out_err; + if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] == asid) { + err = _mlx5_vdpa_create_dvq_mr(mvdev, iotlb); + if (err) + return err; + } + if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP] == asid) { + err = _mlx5_vdpa_create_cvq_mr(mvdev, iotlb); + if (err) + goto out_err; + } return 0; out_err: - _mlx5_vdpa_destroy_dvq_mr(mvdev, asid); + if (mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] == asid) + _mlx5_vdpa_destroy_dvq_mr(mvdev); return err; } @@ -601,6 +593,28 @@ int mlx5_vdpa_create_mr(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, return err; } +int mlx5_vdpa_reset_mr(struct mlx5_vdpa_dev *mvdev, unsigned int asid) +{ + struct mlx5_vdpa_mr *mr = &mvdev->mr; + int err = 0; + + if (asid != 0) + return 0; + + mutex_lock(&mr->mkey_mtx); + if (!mr->user_mr) + goto out; + _mlx5_vdpa_destroy_dvq_mr(mvdev); + if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { + err = _mlx5_vdpa_create_dvq_mr(mvdev, NULL); + if (err) + mlx5_vdpa_warn(mvdev, "create DMA MR failed\n"); + } +out: + mutex_unlock(&mr->mkey_mtx); + return err; +} + int mlx5_vdpa_handle_set_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb, bool *change_map, unsigned int asid) { diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 37be945..3cb5db6 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -2824,7 +2824,6 @@ static int mlx5_vdpa_reset(struct vdpa_device *vdev) unregister_link_notifier(ndev); teardown_driver(ndev); clear_vqs_ready(ndev); - mlx5_vdpa_destroy_mr(&ndev->mvdev); ndev->mvdev.status = 0; ndev->mvdev.suspended = false; ndev->cur_num_vqs = 0; @@ -2835,10 +2834,6 @@ static int mlx5_vdpa_reset(struct vdpa_device *vdev) init_group_to_asid_map(mvdev); ++mvdev->generation; - if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) { - if (mlx5_vdpa_create_mr(mvdev, NULL, 0)) - mlx5_vdpa_warn(mvdev, "create MR failed\n"); - } up_write(&ndev->reslock); return 0; @@ -2903,6 +2898,18 @@ static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid, return err; } +static int mlx5_vdpa_reset_map(struct vdpa_device *vdev, unsigned int asid) +{ + struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); + struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); + int err; + + down_write(&ndev->reslock); + err = mlx5_vdpa_reset_mr(mvdev, asid); + up_write(&ndev->reslock); + return err; +} + static struct device *mlx5_get_vq_dma_dev(struct vdpa_device *vdev, u16 idx) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); @@ -3162,6 +3169,7 @@ static int mlx5_set_group_asid(struct vdpa_device *vdev, u32 group, .set_config = mlx5_vdpa_set_config, .get_generation = mlx5_vdpa_get_generation, .set_map = mlx5_vdpa_set_map, + .reset_map = mlx5_vdpa_reset_map, .set_group_asid = mlx5_set_group_asid, .get_vq_dma_dev = mlx5_get_vq_dma_dev, .free = mlx5_vdpa_free, -- 1.8.3.1
Si-Wei Liu
2023-Sep-09 13:31 UTC
[PATCH RFC v2 3/4] vhost-vdpa: should restore 1:1 dma mapping before detaching driver
Devices with on-chip IOMMU may need to restore iotlb to 1:1 identity mapping from IOVA to PA. Before vhost-vdpa is going away, give them a chance to clean up and reset iotlb back to 1:1 identify mapping mode. This is done so that any vdpa bus driver may start with 1:1 identity mapping by default. Signed-off-by: Si-Wei Liu <si-wei.liu at oracle.com> --- drivers/vhost/vdpa.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index eabac06..71fbd559 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -131,6 +131,15 @@ static struct vhost_vdpa_as *vhost_vdpa_find_alloc_as(struct vhost_vdpa *v, return vhost_vdpa_alloc_as(v, asid); } +static void vhost_vdpa_reset_map(struct vhost_vdpa *v, u32 asid) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + + if (ops->reset_map) + ops->reset_map(vdpa, asid); +} + static int vhost_vdpa_remove_as(struct vhost_vdpa *v, u32 asid) { struct vhost_vdpa_as *as = asid_to_as(v, asid); @@ -140,6 +149,14 @@ static int vhost_vdpa_remove_as(struct vhost_vdpa *v, u32 asid) hlist_del(&as->hash_link); vhost_vdpa_iotlb_unmap(v, &as->iotlb, 0ULL, 0ULL - 1, asid); + /* + * Devices with on-chip IOMMU need to restore iotlb + * to 1:1 identity mapping before vhost-vdpa is going + * to be removed and detached from the device. Give + * them a chance to do so, as this cannot be done + * efficiently via the whole-range unmap call above. + */ + vhost_vdpa_reset_map(v, asid); kfree(as); return 0; -- 1.8.3.1
Si-Wei Liu
2023-Sep-09 13:31 UTC
[PATCH RFC v2 4/4] vhost-vdpa: introduce IOTLB_PERSIST backend feature bit
Userspace needs this feature flag to distinguish if vhost-vdpa iotlb in the kernel supports persistent IOTLB mapping across device reset. There are two cases that backend may claim this feature bit on: - parent device that has to work with platform IOMMU - parent device with on-chip IOMMU that has the expected .reset_map support in driver Signed-off-by: Si-Wei Liu <si-wei.liu at oracle.com> --- drivers/vhost/vdpa.c | 15 ++++++++++++++- include/uapi/linux/vhost_types.h | 2 ++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 71fbd559..bbb1092 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -414,6 +414,14 @@ static bool vhost_vdpa_has_desc_group(const struct vhost_vdpa *v) return ops->get_vq_desc_group; } +static bool vhost_vdpa_has_persistent_map(const struct vhost_vdpa *v) +{ + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + + return (!ops->set_map && !ops->dma_map) || ops->reset_map; +} + static long vhost_vdpa_get_features(struct vhost_vdpa *v, u64 __user *featurep) { struct vdpa_device *vdpa = v->vdpa; @@ -716,7 +724,8 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, if (features & ~(VHOST_VDPA_BACKEND_FEATURES | BIT_ULL(VHOST_BACKEND_F_DESC_ASID) | BIT_ULL(VHOST_BACKEND_F_SUSPEND) | - BIT_ULL(VHOST_BACKEND_F_RESUME))) + BIT_ULL(VHOST_BACKEND_F_RESUME) | + BIT_ULL(VHOST_BACKEND_F_IOTLB_PERSIST))) return -EOPNOTSUPP; if ((features & BIT_ULL(VHOST_BACKEND_F_SUSPEND)) && !vhost_vdpa_can_suspend(v)) @@ -729,6 +738,8 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, return -EINVAL; if ((features & BIT_ULL(VHOST_BACKEND_F_DESC_ASID)) && !vhost_vdpa_has_desc_group(v)) + if ((features & BIT_ULL(VHOST_BACKEND_F_IOTLB_PERSIST)) && + !vhost_vdpa_has_persistent_map(v)) return -EOPNOTSUPP; vhost_set_backend_features(&v->vdev, features); return 0; @@ -785,6 +796,8 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, features |= BIT_ULL(VHOST_BACKEND_F_RESUME); if (vhost_vdpa_has_desc_group(v)) features |= BIT_ULL(VHOST_BACKEND_F_DESC_ASID); + if (vhost_vdpa_has_persistent_map(v)) + features |= BIT_ULL(VHOST_BACKEND_F_IOTLB_PERSIST); if (copy_to_user(featurep, &features, sizeof(features))) r = -EFAULT; break; diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index 6acc604..0fdb6f0 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -186,5 +186,7 @@ struct vhost_vdpa_iova_range { * buffers may reside. Requires VHOST_BACKEND_F_IOTLB_ASID. */ #define VHOST_BACKEND_F_DESC_ASID 0x6 +/* IOTLB don't flush memory mapping across device reset */ +#define VHOST_BACKEND_F_IOTLB_PERSIST 0x7 #endif -- 1.8.3.1