On 8/8/2023 11:52 PM, Jason Wang wrote:
> On Wed, Aug 9, 2023 at 6:58 AM Si-Wei Liu <si-wei.liu at oracle.com> wrote:
>>
>>
>> On 8/7/2023 8:00 PM, Jason Wang wrote:
>>> On Fri, Aug 4, 2023 at 1:58 AM Si-Wei Liu <si-wei.liu at oracle.com> wrote:
>>>>
>>>> On 8/3/2023 1:03 AM, Jason Wang wrote:
>>>>> On Thu, Aug 3, 2023 at 1:13 AM Dragos Tatulea <dtatulea at nvidia.com> wrote:
>>>>>> The mr->initialized flag is shared between the
control vq and data vq
>>>>>> part of the mr init/uninit. But if the control vq and
data vq get placed
>>>>>> in different ASIDs, it can happen that initializing the
control vq will
>>>>>> prevent the data vq mr from being initialized.
>>>>>>
>>>>>> This patch consolidates the control and data vq init
parts into their
>>>>>> own init functions. The mr->initialized will now be
used for the data vq
>>>>>> only. The control vq currently doesn't need a flag.
>>>>>>
>>>>>> The uninitializing part is also taken care of:
mlx5_vdpa_destroy_mr got
>>>>>> split into data and control vq functions which are now
also ASID aware.
>>>>>>
>>>>>> Fixes: 8fcd20c30704 ("vdpa/mlx5: Support different
address spaces for control and data")
>>>>>> Signed-off-by: Dragos Tatulea <dtatulea at
nvidia.com>
>>>>>> Reviewed-by: Eugenio Pérez <eperezma at redhat.com>
>>>>>> Reviewed-by: Gal Pressman <gal at nvidia.com>
>>>>>> ---
>>>>>> drivers/vdpa/mlx5/core/mlx5_vdpa.h | 1 +
>>>>>> drivers/vdpa/mlx5/core/mr.c | 97
+++++++++++++++++++++---------
>>>>>> 2 files changed, 71 insertions(+), 27 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
>>>>>> index 25fc4120b618..a0420be5059f 100644
>>>>>> --- a/drivers/vdpa/mlx5/core/mlx5_vdpa.h
>>>>>> +++ b/drivers/vdpa/mlx5/core/mlx5_vdpa.h
>>>>>> @@ -31,6 +31,7 @@ struct mlx5_vdpa_mr {
>>>>>> struct list_head head;
>>>>>> unsigned long num_directs;
>>>>>> unsigned long num_klms;
>>>>>> + /* state of dvq mr */
>>>>>> bool initialized;
>>>>>>
>>>>>> /* serialize mkey creation and destruction
*/
>>>>>> diff --git a/drivers/vdpa/mlx5/core/mr.c
b/drivers/vdpa/mlx5/core/mr.c
>>>>>> index 03e543229791..4ae14a248a4b 100644
>>>>>> --- a/drivers/vdpa/mlx5/core/mr.c
>>>>>> +++ b/drivers/vdpa/mlx5/core/mr.c
>>>>>> @@ -489,60 +489,103 @@ static void
destroy_user_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_mr *mr
>>>>>> }
>>>>>> }
>>>>>>
>>>>>> -void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev)
>>>>>> +static void _mlx5_vdpa_destroy_cvq_mr(struct
mlx5_vdpa_dev *mvdev, unsigned int asid)
>>>>>> +{
>>>>>> + if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP]
!= asid)
>>>>>> + return;
>>>>>> +
>>>>>> + prune_iotlb(mvdev);
>>>>>> +}
>>>>>> +
>>>>>> +static void _mlx5_vdpa_destroy_dvq_mr(struct
mlx5_vdpa_dev *mvdev, unsigned int asid)
>>>>>> {
>>>>>> struct mlx5_vdpa_mr *mr = &mvdev->mr;
>>>>>>
>>>>>> - mutex_lock(&mr->mkey_mtx);
>>>>>> + if
(mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] != asid)
>>>>>> + return;
>>>>>> +
>>>>>> if (!mr->initialized)
>>>>>> - goto out;
>>>>>> + return;
>>>>>>
>>>>>> - prune_iotlb(mvdev);
>>>>>> if (mr->user_mr)
>>>>>> destroy_user_mr(mvdev, mr);
>>>>>> else
>>>>>> destroy_dma_mr(mvdev, mr);
>>>>>>
>>>>>> mr->initialized = false;
>>>>>> -out:
>>>>>> +}
>>>>>> +
>>>>>> +static void mlx5_vdpa_destroy_mr_asid(struct
mlx5_vdpa_dev *mvdev, unsigned int asid)
>>>>>> +{
>>>>>> + struct mlx5_vdpa_mr *mr = &mvdev->mr;
>>>>>> +
>>>>>> + mutex_lock(&mr->mkey_mtx);
>>>>>> +
>>>>>> + _mlx5_vdpa_destroy_dvq_mr(mvdev, asid);
>>>>>> + _mlx5_vdpa_destroy_cvq_mr(mvdev, asid);
>>>>>> +
>>>>>> mutex_unlock(&mr->mkey_mtx);
>>>>>> }
>>>>>>
>>>>>> -static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev
*mvdev,
>>>>>> - struct vhost_iotlb
*iotlb, unsigned int asid)
>>>>>> +void mlx5_vdpa_destroy_mr(struct mlx5_vdpa_dev *mvdev)
>>>>>> +{
>>>>>> + mlx5_vdpa_destroy_mr_asid(mvdev,
mvdev->group2asid[MLX5_VDPA_CVQ_GROUP]);
>>>>>> + mlx5_vdpa_destroy_mr_asid(mvdev,
mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP]);
>>>>>> +}
>>>>>> +
>>>>>> +static int _mlx5_vdpa_create_cvq_mr(struct
mlx5_vdpa_dev *mvdev,
>>>>>> + struct vhost_iotlb
*iotlb,
>>>>>> + unsigned int asid)
>>>>>> +{
>>>>>> + if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP]
!= asid)
>>>>>> + return 0;
>>>>>> +
>>>>>> + return dup_iotlb(mvdev, iotlb);
>>>>> This worries me as conceptually, there should be no
difference between
>>>>> dvq mr and cvq mr. The virtqueue should be loosely coupled
with mr.
>>>>>
>>>>> One example is that, if we only do dup_iotlb() but not try
to create
>>>>> dma mr here, we will break virtio-vdpa:
>>>> For this case, I guess we may need another way to support
virtio-vdpa
>>>> 1:1 mapping rather than overloading virtio device reset
semantics, see:
>>>>
>>>> https://www.mail-archive.com/qemu-devel@nongnu.org/msg953755.html
>>>>
>>>> > Conceptually, the address mapping is not a part of the
abstraction for
>>>> > a virtio device now. So resetting the memory mapping
during virtio
>>>> > device reset seems wrong.
>>>>
>>>> where we want to keep memory mapping intact across virtio
device reset
>>>> for best live migration latency/downtime. I wonder would it
work to
>>>> reset the mapping in vhost-vdpa life cycle out of virtio reset,
say
>>>> introduce a .reset_map() op to restore 1:1 mapping within
>>>> vhost_vdpa_remove_as() right after vhost_vdpa_iotlb_unmap()?
Then we can
>>>> move the iotlb reset logic to there without worry breaking
virtio-vdpa.
>>> It looks to me we don't need a new ops. We can simply do
set_map()
>>> twice
>> What does it mean, first set_map(0, -1ULL) with zero iotlb entry passed
>> in to destroy all iotlb mappings previously added, and second
set_map(0,
>> -1ULL) to restore 1:1 DMA MR? But userspace (maybe a buggy one but
>> doesn't do harm) apart from vhost-vdpa itself can do unmap twice
anyway,
>> this is supported today I think. Why there'll be such obscure
>> distinction, or what's the benefit to treat second .set_map() as
>> recreating 1:1 mapping?
> Ok, I think I miss some context. I agree that it's better to decouple
> memory mappings from the virtio reset. It helps to reduce the
> unnecessary memory transactions. It might require a new feature flag.
I agree with this. AFAICT QEMU would need to check this new feature flag to
make sure memory mappings are kept intact across reset; otherwise, to
avoid breaking older kernels, it has to recreate all the
mappings after reset as it does today.
> Regarding the method of restoring to 1:1 DMA MR, it might be dangerous
> for (buggy) vhost-vDPA devices. Since its userspace doesn't set up any
> mapping it can explore the kernel with that via CVQ?
Not sure I understand this proposal. The 1:1 DMA MR is first created at
vdpa device add, and gets destroyed implicitly when the first .set_map
or .dma_map call is made, which is only possible after the vhost-vdpa
module is loaded and bound to vdpa devices. Naturally the DMA MR should
be restored to how it was before when vhost-vdpa module is unloaded, or
if anything the 1:1 DMA MR creation can be deferred to until virtio-vdpa
is probed and bound to devices. Today vhost_vdpa_remove_as() as part of
the vhost-vdpa unload code path already gets all mappings purged through
vhost_vdpa_iotlb_unmap(0, -1ULL), and it should be pretty safe to
restore DMA MR via .reset_map() right after. Not sure what's the concern
here with buggy vhost-vdpa device?
Note that when vhost-vdpa is being unloaded there is not even a chance to probe
the kernel through CVQ, as the virtio features are not even negotiated at that
point. And it is even trickier to wait indefinitely for a CVQ response from the
device when trying to unload a module.
Regards,
-Siwei
>
> Thanks
>
>>> or do you mean it would be faster?
>> I think with .reset_map() we at least can avoid indefinite latency
>> hiccup from destroying and recreating 1:1 mapping with the unwarranted
>> 2nd unmap call. And .reset_map() should work with both .dma_map() and
>> .set_map() APIs with clear semantics.
>>
>> Regards,
>> -Siwei
>>> Thanks
>>>
>>>> Thanks,
>>>> -Siwei
>>>>
>>>>> commit 6f5312f801836e6af9bcbb0bdb44dc423e129206
>>>>> Author: Eli Cohen <elic at nvidia.com>
>>>>> Date: Wed Jun 2 11:58:54 2021 +0300
>>>>>
>>>>> vdpa/mlx5: Add support for running with virtio_vdpa
>>>>>
>>>>> In order to support running vdpa using vritio_vdpa
driver, we need to
>>>>> create a different kind of MR, one that has 1:1
mapping, since the
>>>>> addresses referring to virtqueues are dma addresses.
>>>>>
>>>>> We create the 1:1 MR in mlx5_vdpa_dev_add() only in
case firmware
>>>>> supports the general capability umem_uid_0. The
reason for that is that
>>>>> 1:1 MRs must be created with uid == 0 while
virtqueue objects can be
>>>>> created with uid == 0 only when the firmware
capability is on.
>>>>>
>>>>> If the set_map() callback is called with new
translations provided
>>>>> through iotlb, the driver will destroy the 1:1 MR
and create a regular
>>>>> one.
>>>>>
>>>>> Signed-off-by: Eli Cohen <elic at nvidia.com>
>>>>> Link: https://lore.kernel.org/r/20210602085854.62690-1-elic@nvidia.com
>>>>> Signed-off-by: Michael S. Tsirkin <mst at
redhat.com>
>>>>> Acked-by: Jason Wang <jasowang at redhat.com>
>>>>>
>>>>> Thanks
>>>>>
>>>>>
>>>>>> +}
>>>>>> +
>>>>>> +static int _mlx5_vdpa_create_dvq_mr(struct
mlx5_vdpa_dev *mvdev,
>>>>>> + struct vhost_iotlb
*iotlb,
>>>>>> + unsigned int asid)
>>>>>> {
>>>>>> struct mlx5_vdpa_mr *mr = &mvdev->mr;
>>>>>> int err;
>>>>>>
>>>>>> - if (mr->initialized)
>>>>>> + if
(mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] != asid)
>>>>>> return 0;
>>>>>>
>>>>>> - if
(mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] == asid) {
>>>>>> - if (iotlb)
>>>>>> - err = create_user_mr(mvdev,
iotlb);
>>>>>> - else
>>>>>> - err = create_dma_mr(mvdev, mr);
>>>>>> + if (mr->initialized)
>>>>>> + return 0;
>>>>>>
>>>>>> - if (err)
>>>>>> - return err;
>>>>>> - }
>>>>>> + if (iotlb)
>>>>>> + err = create_user_mr(mvdev, iotlb);
>>>>>> + else
>>>>>> + err = create_dma_mr(mvdev, mr);
>>>>>>
>>>>>> - if (mvdev->group2asid[MLX5_VDPA_CVQ_GROUP]
== asid) {
>>>>>> - err = dup_iotlb(mvdev, iotlb);
>>>>>> - if (err)
>>>>>> - goto out_err;
>>>>>> - }
>>>>>> + if (err)
>>>>>> + return err;
>>>>>>
>>>>>> mr->initialized = true;
>>>>>> +
>>>>>> + return 0;
>>>>>> +}
>>>>>> +
>>>>>> +static int _mlx5_vdpa_create_mr(struct mlx5_vdpa_dev
*mvdev,
>>>>>> + struct vhost_iotlb
*iotlb, unsigned int asid)
>>>>>> +{
>>>>>> + int err;
>>>>>> +
>>>>>> + err = _mlx5_vdpa_create_dvq_mr(mvdev, iotlb,
asid);
>>>>>> + if (err)
>>>>>> + return err;
>>>>>> +
>>>>>> + err = _mlx5_vdpa_create_cvq_mr(mvdev, iotlb,
asid);
>>>>>> + if (err)
>>>>>> + goto out_err;
>>>>>> +
>>>>>> return 0;
>>>>>>
>>>>>> out_err:
>>>>>> - if
(mvdev->group2asid[MLX5_VDPA_DATAVQ_GROUP] == asid) {
>>>>>> - if (iotlb)
>>>>>> - destroy_user_mr(mvdev, mr);
>>>>>> - else
>>>>>> - destroy_dma_mr(mvdev, mr);
>>>>>> - }
>>>>>> + _mlx5_vdpa_destroy_dvq_mr(mvdev, asid);
>>>>>>
>>>>>> return err;
>>>>>> }
>>>>>> --
>>>>>> 2.41.0
>>>>>>
>>>>> _______________________________________________
>>>>> Virtualization mailing list
>>>>> Virtualization at lists.linux-foundation.org
>>>>>
https://lists.linuxfoundation.org/mailman/listinfo/virtualization