thr3ads.net - Xen devel - [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall [May 2013]

If this information is useful, please help other people find it:
Share via:

Zhenzhong Duan

2013-May-08 08:17 UTC

[PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

Accelerated msix entry is initialized to zero when msixtbl_pt_register is
called. This doesn't match the value from qemu side, although pirq may already
be mapped and binded in qemu side. Kernel will get wrong value when reading
msix info.

Signed-off-by: Zhenzhong Duan <zhenzhong.duan@oracle.com>
Tested-by: Yuval Shaia <yuval.shaia@oracle.com>
---
 tools/libxc/xc_domain.c      |    7 ++++++-
 tools/libxc/xenctrl.h        |    4 +++-
 xen/arch/x86/hvm/vmsi.c      |   13 ++++++++++++-
 xen/drivers/passthrough/io.c |    3 ++-
 xen/include/public/domctl.h  |    2 ++
 xen/include/xen/pci.h        |    3 ++-
 6 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/tools/libxc/xc_domain.c b/tools/libxc/xc_domain.c
index bb71cca..f6fc8e4 100644
--- a/tools/libxc/xc_domain.c
+++ b/tools/libxc/xc_domain.c
@@ -1339,7 +1339,9 @@ int xc_domain_update_msi_irq(
     uint32_t gvec,
     uint32_t pirq,
     uint32_t gflags,
-    uint64_t gtable)
+    uint64_t gtable,
+    uint16_t entry_nr,
+    uint32_t msi_ad[3])
 {
     int rc;
     xen_domctl_bind_pt_irq_t *bind;
@@ -1356,6 +1358,9 @@ int xc_domain_update_msi_irq(
     bind->u.msi.gvec = gvec;
     bind->u.msi.gflags = gflags;
     bind->u.msi.gtable = gtable;
+    bind->u.msi.entry_nr = entry_nr;
+    if ( gtable )
+        memcpy(bind->u.msi.msi_ad, msi_ad, sizeof(uint32_t[3]));
 
     rc = do_domctl(xch, &domctl);
     return rc;
diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h
index 54a2d5a..f292443 100644
--- a/tools/libxc/xenctrl.h
+++ b/tools/libxc/xenctrl.h
@@ -1749,7 +1749,9 @@ int xc_domain_update_msi_irq(
     uint32_t gvec,
     uint32_t pirq,
     uint32_t gflags,
-    uint64_t gtable);
+    uint64_t gtable,
+    uint16_t entry_nr,
+    uint32_t msi_ad[3]);
 
 int xc_domain_unbind_msi_irq(xc_interface *xch,
                              uint32_t domid,
diff --git a/xen/arch/x86/hvm/vmsi.c b/xen/arch/x86/hvm/vmsi.c
index 36de312..06ea324 100644
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -168,6 +168,7 @@ struct msixtbl_entry
     struct { 
         uint32_t msi_ad[3];	/* Shadow of address low, high and data */
     } gentries[MAX_MSIX_ACC_ENTRIES];
+    unsigned long table_shadow[BITS_TO_LONGS(MAX_MSIX_ACC_ENTRIES)];
     struct rcu_head rcu;
 };
 
@@ -229,6 +230,9 @@ static int msixtbl_read(
         nr_entry = (address - entry->gtable) / PCI_MSIX_ENTRY_SIZE;
         if ( nr_entry >= MAX_MSIX_ACC_ENTRIES )
             goto out;
+        if( !test_bit(nr_entry, entry->table_shadow) )
+            goto out;
+
         index = offset / sizeof(uint32_t);
         *pval = entry->gentries[nr_entry].msi_ad[index];
     }
@@ -361,7 +365,8 @@ static void del_msixtbl_entry(struct msixtbl_entry *entry)
     call_rcu(&entry->rcu, free_msixtbl_entry);
 }
 
-int msixtbl_pt_register(struct domain *d, struct pirq *pirq, uint64_t gtable)
+int msixtbl_pt_register(struct domain *d, struct pirq *pirq, uint64_t gtable,
+                        uint16_t entry_nr, uint32_t msi_ad[3])
 {
     struct irq_desc *irq_desc;
     struct msi_desc *msi_desc;
@@ -408,6 +413,12 @@ int msixtbl_pt_register(struct domain *d, struct pirq
*pirq, uint64_t gtable)
 
 found:
     atomic_inc(&entry->refcnt);
+
+    if( entry_nr < MAX_MSIX_ACC_ENTRIES ) {
+        memcpy(entry->gentries[entry_nr].msi_ad, msi_ad,
sizeof(uint32_t[3]));
+        set_bit(entry_nr, entry->table_shadow);
+    }
+
     spin_unlock(&d->arch.hvm_domain.msixtbl_list_lock);
     r = 0;
 
diff --git a/xen/drivers/passthrough/io.c b/xen/drivers/passthrough/io.c
index 13002c0..17cb8c2 100644
--- a/xen/drivers/passthrough/io.c
+++ b/xen/drivers/passthrough/io.c
@@ -153,7 +153,8 @@ int pt_irq_create_bind(
             rc = pirq_guest_bind(d->vcpu[0], info, 0);
             if ( rc == 0 && pt_irq_bind->u.msi.gtable )
             {
-                rc = msixtbl_pt_register(d, info,
pt_irq_bind->u.msi.gtable);
+                rc = msixtbl_pt_register(d, info, pt_irq_bind->u.msi.gtable,
+                                         pt_irq_bind->u.msi.entry_nr,
pt_irq_bind->u.msi.msi_ad);
                 if ( unlikely(rc) )
                     pirq_guest_unbind(d, info);
             }
diff --git a/xen/include/public/domctl.h b/xen/include/public/domctl.h
index 4c5b2bb..4b160a0 100644
--- a/xen/include/public/domctl.h
+++ b/xen/include/public/domctl.h
@@ -504,6 +504,8 @@ struct xen_domctl_bind_pt_irq {
             uint8_t gvec;
             uint32_t gflags;
             uint64_aligned_t gtable;
+            uint16_t entry_nr;
+            uint32_t msi_ad[3];
         } msi;
     } u;
 };
diff --git a/xen/include/xen/pci.h b/xen/include/xen/pci.h
index ca72a99..d8e22a8 100644
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -151,7 +151,8 @@ int pci_find_next_cap(u16 seg, u8 bus, unsigned int devfn,
u8 pos, int cap);
 int pci_find_ext_capability(int seg, int bus, int devfn, int cap);
 
 struct pirq;
-int msixtbl_pt_register(struct domain *, struct pirq *, uint64_t gtable);
+int msixtbl_pt_register(struct domain *, struct pirq *, uint64_t gtable,
+                        uint16_t entry_nr, uint32_t msi_ad[3]);
 void msixtbl_pt_unregister(struct domain *, struct pirq *);
 void msixtbl_pt_cleanup(struct domain *d);
 
-- 
1.7.3

Jan Beulich

2013-May-08 09:39 UTC

head link

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

>>> On 08.05.13 at 10:17, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
> Accelerated msix entry is initialized to zero when msixtbl_pt_register is
> called. This doesn't match the value from qemu side, although pirq may
> already be mapped and binded in qemu side. Kernel will get wrong value
> when reading msix info.
> 
> Signed-off-by: Zhenzhong Duan <zhenzhong.duan@oracle.com>
> Tested-by: Yuval Shaia <yuval.shaia@oracle.com>
I appreciate this needing to change, but it is a no-go to expose an
implementation detail of the hypervisor (number of accelerated
entries being 3) through a hypercall interface (and even less so by
scattering around literal 3-s).

Please work towards a different solution, leaving the tool stack
agnostic to the number of accelerated entries. And if at all
possible, arrange for the patch to be split into tool stack and
hypervisor pieces, such that they can be applied independently
(and in either order).

Jan

Zhenzhong Duan

2013-May-08 10:00 UTC

head link

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

On 2013-05-08 17:39, Jan Beulich wrote:
>>>> On 08.05.13 at 10:17, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
>> Accelerated msix entry is initialized to zero when msixtbl_pt_register is
>> called. This doesn't match the value from qemu side, although pirq may
>> already be mapped and binded in qemu side. Kernel will get wrong value
>> when reading msix info.
>>
>> Signed-off-by: Zhenzhong Duan<zhenzhong.duan@oracle.com>
>> Tested-by: Yuval Shaia<yuval.shaia@oracle.com>
> I appreciate this needing to change, but it is a no-go to expose an
> implementation detail of the hypervisor (number of accelerated
> entries being 3) trough a hypercall interface (and even less so by
> scattering around literal 3-s).

I presume you mean msi_ad[3]. msi_ad[3] is addr_lo, addr_high and data.
Not related to accelerated entries count.

or others?

> Please work towards a different solution, leaving the tool stack
> agnostic to the number of accelerated entries. And if at all
> possible, arrange for the patch to be split into tool stack and
> hypervisor pieces, such that they can be applied independently
> (and in either order).

sure, will do it after above question is clear.

Regards
zduan

Jan Beulich

2013-May-08 12:03 UTC

head link

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

>>> On 08.05.13 at 12:00, Zhenzhong Duan
<zhenzhong.duan@oracle.com> wrote:
> On 2013-05-08 17:39, Jan Beulich wrote:
>>>>> On 08.05.13 at 10:17, Zhenzhong
Duan<zhenzhong.duan@oracle.com>  wrote:
>>> Accelerated msix entry is initialized to zero when
msixtbl_pt_register is
>>> called. This doesn''t match the value from qemu side,
although pirq may
>>> already
>>> be mapped and binded in qemu side. Kernel will get wrong value when
reading
>>> msix info.
>>>
>>> Signed-off-by: Zhenzhong Duan<zhenzhong.duan@oracle.com>
>>> Tested-by: Yuval Shaia<yuval.shaia@oracle.com>
>> I appreciate this needing to change, but it is a no-go to expose an
>> implementation detail of the hypervisor (number of accelerated
>> entries being 3) trough a hypercall interface (and even less so by
>> scattering around literal 3-s).
> I presume you mean msi_ad[3]. msi_ad[3] is addr_lo, addr_high and data.
> Not related to accelerated entries count.
> 
> or others?
Oh, right you are. But then nevertheless give this meaningful
names in the hypercall interface (e.g. addr_lo, addr_hi, and data,
or just [64-bit] addr and [32-bit] data) rather than following the
bad practice in vmsi.c.
>> Please work towards a different solution, leaving the tool stack
>> agnostic to the number of accelerated entries. And if at all
>> possible, arrange for the patch to be split into tool stack and
>> hypervisor pieces, such that they can be applied independently
>> (and in either order).
> sure, will do it after above question is clear.
With the above it's going to be difficult to split the two pieces.

But of course I still don't really understand why all of the sudden
this needs to be passed in rather than being under the full control
of the hypervisor at all times. Perhaps this is related to me not
understanding why the kernel would read these values at all:
There's no other place in the kernel where the message would
be read before first getting written (in fact, apart from the
use of __read_msi_msg() by the Xen code, there's only one
other user under arch/powerpc/, and there - according to the
accompanying comment - this is just to save away the data for
later use during resume).

Jan

Zhenzhong Duan

2013-May-09 03:02 UTC

head link

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

On 2013/5/8 20:03, Jan Beulich wrote:
>>>> On 08.05.13 at 12:00, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
>> On 2013-05-08 17:39, Jan Beulich wrote:
>>>>>> On 08.05.13 at 10:17, Zhenzhong
Duan<zhenzhong.duan@oracle.com>  wrote:
>>>> Accelerated msix entry is initialized to zero when
msixtbl_pt_register is
>>>> called. This doesn''t match the value from qemu side,
although pirq may
>>>> already
>>>> be mapped and binded in qemu side. Kernel will get wrong value
when reading
>>>> msix info.
>>>>
>>>> Signed-off-by: Zhenzhong Duan<zhenzhong.duan@oracle.com>
>>>> Tested-by: Yuval Shaia<yuval.shaia@oracle.com>
>>> I appreciate this needing to change, but it is a no-go to expose an
>>> implementation detail of the hypervisor (number of accelerated
>>> entries being 3) trough a hypercall interface (and even less so by
>>> scattering around literal 3-s).
>> I presume you mean msi_ad[3]. msi_ad[3] is addr_lo, addr_high and data.
>> Not related to accelerated entries count.
>>
>> or others?
> Oh, right you are. But then nevertheless give this meaningful
> names in the hypercall interface (e.g. addr_lo, addr_hi, and data,
> or just [64-bit] addr and [32-bit] data) rather than following the
> bad practice in vmsi.c.
>
>>> Please work towards a different solution, leaving the tool stack
>>> agnostic to the number of accelerated entries. And if at all
>>> possible, arrange for the patch to be split into tool stack and
>>> hypervisor pieces, such that they can be applied independently
>>> (and in either order).
>> sure, will do it after above question is clear.
> With the above it's going to be difficult to split the two pieces.

so, only change to a meaningful names without split patch, right?

> But of course I still don't really understand why all of the sudden
> this needs to be passed in rather than being under the full control
> of the hypervisor at all times. Perhaps this is related to me not
> understanding why the kernel would read these values at all:
> There's no other place in the kernel where the message would
> be read before first getting written (in fact, apart from the
> use of __read_msi_msg() by the Xen code, there's only one
> other user under arch/powerpc/, and there - according to the
> accompanying comment - this is just to save away the data for
> later use during resume).

There is a bug if msi_ad is not passed in.

when driver first load,

kernel.__read_msi_msg()
        (got all zero)
kernel.__write_msi_msg(pirq)
        (ioreq passed to qemu as no msixtbl_entry established yet)
qemu.pt_msi_update_one()
        xc_domain_update_msi_irq()
             (msixtbl_entry dynamicly allocated with msi_ad all zero)

then driver unload,
...
driver load again,

kernel.__read_msi_msg()
        (got all zero from xen as accelerated entry just established with all zero)
qemu.__write_msi_msg(a new pirq)

pirq would exhaust or fail to map and bind.

zduan

Jan Beulich

2013-May-09 19:05 UTC

head link

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

>>> Zhenzhong Duan <zhenzhong.duan@oracle.com> 05/09/13 5:02 AM >>>
>On 2013/5/8 20:03, Jan Beulich wrote:
>> But of course I still don't really understand why all of the sudden
>> this needs to be passed in rather than being under the full control
>> of the hypervisor at all times. Perhaps this is related to me not
>> understanding why the kernel would read these values at all:
>> There's no other place in the kernel where the message would
>> be read before first getting written (in fact, apart from the
>> use of __read_msi_msg() by the Xen code, there's only one
>> other user under arch/powerpc/, and there - according to the
>> accompanying comment - this is just to save away the data for
>> later use during resume).
>There is a bug if msi_ad is not passed in.
>
>when driver first load,
>
>kernel.__read_msi_msg()
>(got all zero)
But you don't even comment on the apparently bogus use of the function
here.
>kernel.__write_msi_msg(pirq)
>(ioreq passed to qemu as no msixtbl_entry established yet)
>qemu.pt_msi_update_one()
>xc_domain_update_msi_irq()
>(msixtbl_entry dynamicly allocated with msi_ad all zero)
>
>then driver unload,
>...
>driver load again,
>
>kernel.__read_msi_msg()
>(got all zero from xen as accelerated entry just established with all zero)
If all zeroes get returned, why would the flow here be different then above?
>qemu.__write_msi_msg(a new pirq)
>
>pirq would exhaust or fail to map and bind.
I'm afraid your replies are more confusing to me than clarifying...

Jan

Zhenzhong Duan

2013-May-10 02:49 UTC

head link

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

On 2013-05-10 03:05, Jan Beulich wrote:
>>>> Zhenzhong Duan <zhenzhong.duan@oracle.com> 05/09/13 5:02 AM >>>
>> On 2013/5/8 20:03, Jan Beulich wrote:
>>> But of course I still don''t really understand why all of
the sudden
>>> this needs to be passed in rather than being under the full control
>>> of the hypervisor at all times. Perhaps this is related to me not
>>> understanding why the kernel would read these values at all:
>>> There''s no other place in the kernel where the message
would
>>> be read before first getting written (in fact, apart from the
>>> use of __read_msi_msg() by the Xen code, there''s only one
>>> other user under arch/powerpc/, and there - according to the
>>> accompanying comment - this is just to save away the data for
>>> later use during resume).
>> There is a bug if msi_ad is not passed in.
>>
>> when driver first load,
>>
>> kernel.__read_msi_msg()
>> (got all zero)
> But you don't even comment on the apparently bogus use of the
> function here.

This pattern is used only when hvm_pirq is enabled. kernel need to check
XEN_PIRQ_MSI_DATA.
It's not a issue if data is 0 at first driver load, kernel will call
__write_msi_msg with pirq and XEN_PIRQ_MSI_DATA set.

>
>> kernel.__write_msi_msg(pirq)
>> (ioreq passed to qemu as no msixtbl_entry established yet)
>> qemu.pt_msi_update_one()
>> xc_domain_update_msi_irq()
>> (msixtbl_entry dynamicly allocated with msi_ad all zero)
>>
>> then driver unload,
>> ...
>> driver load again,
>>
>> kernel.__read_msi_msg()
>> (got all zero from xen as accelerated entry just established with all zero)
> If all zeroes get returned, why would the flow here be different then
> above?

Because pirq and related mapping and binding are not freed between
driver load-unload-load. They are freed when device detach.
We should try to use the last pirq.

Regards
zduan

Jan Beulich

2013-May-10 06:37 UTC

head link

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

>>> On 10.05.13 at 04:49, Zhenzhong Duan
<zhenzhong.duan@oracle.com> wrote:
> On 2013-05-10 03:05, Jan Beulich wrote:
>>>>> Zhenzhong Duan <zhenzhong.duan@oracle.com> 05/09/13
5:02 AM >>>
>>> On 2013/5/8 20:03, Jan Beulich wrote:
>>>> But of course I still don''t really understand why all
of the sudden
>>>> this needs to be passed in rather than being under the full
control
>>>> of the hypervisor at all times. Perhaps this is related to me
not
>>>> understanding why the kernel would read these values at all:
>>>> There''s no other place in the kernel where the message
would
>>>> be read before first getting written (in fact, apart from the
>>>> use of __read_msi_msg() by the Xen code, there''s only
one
>>>> other user under arch/powerpc/, and there - according to the
>>>> accompanying comment - this is just to save away the data for
>>>> later use during resume).
>>> There is a bug if msi_ad is not passed in.
>>>
>>> when driver first load,
>>>
>>> kernel.__read_msi_msg()
>>> (got all zero)
>> But you don''t even comment on the apparently bogus use of the
function here.
> This pattern is used only when hvm_pirq is enabled. kernel need to check 
> XEN_PIRQ_MSI_DATA.
> It''s not a issue if data is 0 at first driver load, kernel will
call
> __write_msi_msg with pirq and  XEN_PIRQ_MSI_DATA set.
But this doesn't make the use of __read_msi_msg() less bogus. It's
not clear on what basis this mechanism got invented in the first
place.
>>> kernel.__write_msi_msg(pirq)
>>> (ioreq passed to qemu as no msixtbl_entry established yet)
>>> qemu.pt_msi_update_one()
>>> xc_domain_update_msi_irq()
>>> (msixtbl_entry dynamicly allocated with msi_ad all zero)
>>>
>>> then driver unload,
>>> ...
>>> driver load again,
>>>
>>> kernel.__read_msi_msg()
>>> (got all zero from xen as accelerated entry just established with
all zero)
>> If all zeroes get returned, why would the flow here be different then
above?
> Because pirq and related mapping and binding are not freed between 
> driver load-unload-load. They are freed when device detach.
> We should try to use the last pirq.
But then you need to solve the problem generically, i.e. not just
for the driver reload case, but also for e.g. the kexec one (where
__read_msi_msg() returning other than all zeros wouldn't help you
as xen_irq_from_pirq() would then return -1, and you'd be back to
the same problem). IOW I think the prior IRQ needs to be freed
anyway rather than an attempt be made to reuse it.

Jan

Zhenzhong Duan

2013-May-10 07:39 UTC

head link

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

On 2013-05-10 14:37, Jan Beulich wrote:
>>>> On 10.05.13 at 04:49, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
>> On 2013-05-10 03:05, Jan Beulich wrote:
>>>>>> Zhenzhong Duan <zhenzhong.duan@oracle.com>
05/09/13 5:02 AM >>>
>>>> On 2013/5/8 20:03, Jan Beulich wrote:
>>>>> But of course I still don''t really understand why
all of the sudden
>>>>> this needs to be passed in rather than being under the full
control
>>>>> of the hypervisor at all times. Perhaps this is related to
me not
>>>>> understanding why the kernel would read these values at
all:
>>>>> There''s no other place in the kernel where the
message would
>>>>> be read before first getting written (in fact, apart from
the
>>>>> use of __read_msi_msg() by the Xen code, there''s
only one
>>>>> other user under arch/powerpc/, and there - according to
the
>>>>> accompanying comment - this is just to save away the data
for
>>>>> later use during resume).
>>>> There is a bug if msi_ad is not passed in.
>>>>
>>>> when driver first load,
>>>>
>>>> kernel.__read_msi_msg()
>>>> (got all zero)
>>> But you don''t even comment on the apparently bogus use of
the function here.
>> This pattern is used only when hvm_pirq is enabled. kernel need to
check
>> XEN_PIRQ_MSI_DATA.
>> It''s not a issue if data is 0 at first driver load, kernel
will call
>> __write_msi_msg with pirq and  XEN_PIRQ_MSI_DATA set.
> But this doesn't make the use of __read_msi_msg() less bogus. It's
> not clear on what basis this mechanism got invented in the first
> place.

It's there since hvm_irq introduced. But it works indeed.

>
>>>> kernel.__write_msi_msg(pirq)
>>>> (ioreq passed to qemu as no msixtbl_entry established yet)
>>>> qemu.pt_msi_update_one()
>>>> xc_domain_update_msi_irq()
>>>> (msixtbl_entry dynamicly allocated with msi_ad all zero)
>>>>
>>>> then driver unload,
>>>> ...
>>>> driver load again,
>>>>
>>>> kernel.__read_msi_msg()
>>>> (got all zero from xen as accelerated entry just established
with all zero)
>>> If all zeroes get returned, why would the flow here be different
then above?
>> Because pirq and related mapping and binding are not freed between
>> driver load-unload-load. They are freed when device detach.
>> We should try to use the last pirq.
> But then you need to solve the problem generically, i.e. not just
> for the driver reload case, but also for e.g. the kexec one (where
> __read_msi_msg() returning other than all zeros wouldn't help you
> as xen_irq_from_pirq() would then return -1, and you'd be back to
> the same problem.

No, not only kexec ones, it's driver unload that makes xen_irq_from_pirq
return -1. So there is also a bug in kernel side.
I have sent a patch about kernel. I think you miss it.
http://www.gossamer-threads.com/lists/xen/devel/281498

> IOW I think the prior IRQ needs to be freed
> anyway rather than an attempt be made to reuse it.

I have ever thought about this idea, but when to free the pirq is a problem.
When driver unload? qemu has no idea of if driver unloaded.
When msix entry masked? kernel mask and unmask msix entry
intermittently, especially when irqbalance enabled.

So based on above, I think it's better to reuse same pirq, only free it
when device detached.

Regards
zduan

Jan Beulich

2013-May-10 07:55 UTC

head link

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

>>> On 10.05.13 at 09:39, Zhenzhong Duan
<zhenzhong.duan@oracle.com> wrote:
> On 2013-05-10 14:37, Jan Beulich wrote:
>>>>> On 10.05.13 at 04:49, Zhenzhong Duan
<zhenzhong.duan@oracle.com> wrote:
>>> On 2013-05-10 03:05, Jan Beulich wrote:
>>>>>>> Zhenzhong Duan <zhenzhong.duan@oracle.com>
05/09/13 5:02 AM >>>
>>>>> On 2013/5/8 20:03, Jan Beulich wrote:
>>>>>> But of course I still don''t really understand
why all of the sudden
>>>>>> this needs to be passed in rather than being under the
full control
>>>>>> of the hypervisor at all times. Perhaps this is related
to me not
>>>>>> understanding why the kernel would read these values at
all:
>>>>>> There''s no other place in the kernel where the
message would
>>>>>> be read before first getting written (in fact, apart
from the
>>>>>> use of __read_msi_msg() by the Xen code,
there''s only one
>>>>>> other user under arch/powerpc/, and there - according
to the
>>>>>> accompanying comment - this is just to save away the
data for
>>>>>> later use during resume).
>>>>> There is a bug if msi_ad is not passed in.
>>>>>
>>>>> when driver first load,
>>>>>
>>>>> kernel.__read_msi_msg()
>>>>> (got all zero)
>>>> But you don''t even comment on the apparently bogus use
of the function here.
>>> This pattern is used only when hvm_pirq is enabled. kernel need to
check
>>> XEN_PIRQ_MSI_DATA.
>>> It''s not a issue if data is 0 at first driver load, kernel
will call
>>> __write_msi_msg with pirq and  XEN_PIRQ_MSI_DATA set.
>> But this doesn''t make the use of __read_msi_msg() less bogus.
It''s
>> not clear on what basis this mechanism got invented in the first
>> place.
> It''s there since hvm_irq introduced. But it works indeed.
But that doesn't in any way mean the concept is sound.
>>>>> kernel.__write_msi_msg(pirq)
>>>>> (ioreq passed to qemu as no msixtbl_entry established yet)
>>>>> qemu.pt_msi_update_one()
>>>>> xc_domain_update_msi_irq()
>>>>> (msixtbl_entry dynamicly allocated with msi_ad all zero)
>>>>>
>>>>> then driver unload,
>>>>> ...
>>>>> driver load again,
>>>>>
>>>>> kernel.__read_msi_msg()
>>>>> (got all zero from xen as accelerated entry just
established with all zero)
>>>> If all zeroes get returned, why would the flow here be
different then above?
>>> Because pirq and related mapping and binding are not freed between
>>> driver load-unload-load. They are freed when device detach.
>>> We should try to use the last pirq.
>> But then you need to solve the problem generically, i.e. not just
>> for the driver reload case, but also for e.g. the kexec one (where
>> __read_msi_msg() returning other than all zeros wouldn''t help
you
>> as xen_irq_from_pirq() would then return -1, and you''d be back
to
>> the same problem.
> No, not only kexec ones, it''s driver unload that makes
xen_irq_from_pirq
> return -1. So there is also a bug in kernel side.
> I have sent a patch about kernel. I think you miss it.
> http://www.gossamer-threads.com/lists/xen/devel/281498 
>> IOW I think the prior IRQ needs to be freed
>> anyway rather than an attempt be made to reuse it.
> I have ever thought about this idea, but when to free the pirq is a
problem.
> When driver unload? qemu has no idea of if driver unloaded.
But the kernel does, and hence could deal with this. As much as
the setup is being done when the driver gets loaded, cleanup
should be done when the driver gets unloaded. _If_ there
already is such an odd protocol between kernel and qemu, then
if that can't be dropped, it surely can be leveraged to also deal
with the cleanup side of things? No need to fiddle with the
hypervisor interfaces for something that it's not supposed to
know about anyway.
> When msix entry masked? kernel mask and unmask msix entry 
> intermittently, especially when irqbalance enabled.
> 
> So based on above, I think it''s better to reuse same pirq, only
free it
> when device detached.
I continue to disagree. Also from a theoretical perspective - if you
have a lot of devices that no driver is loaded for, you'd keep a lot
of IRQs allocated without any need.

Jan

Zhenzhong Duan

2013-May-10 08:22 UTC

head link

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

On 2013-05-10 15:55, Jan Beulich wrote:
>>>> On 10.05.13 at 09:39, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
>> On 2013-05-10 14:37, Jan Beulich wrote:
>>>>>> On 10.05.13 at 04:49, Zhenzhong Duan
<zhenzhong.duan@oracle.com> wrote:
>>>> On 2013-05-10 03:05, Jan Beulich wrote:
>>>>>>>> Zhenzhong Duan
<zhenzhong.duan@oracle.com> 05/09/13 5:02 AM >>>
>>>>>> On 2013/5/8 20:03, Jan Beulich wrote:
>>>>>>> But of course I still don''t really
understand why all of the sudden
>>>>>>> this needs to be passed in rather than being under
the full control
>>>>>>> of the hypervisor at all times. Perhaps this is
related to me not
>>>>>>> understanding why the kernel would read these
values at all:
>>>>>>> There''s no other place in the kernel where
the message would
>>>>>>> be read before first getting written (in fact,
apart from the
>>>>>>> use of __read_msi_msg() by the Xen code,
there''s only one
>>>>>>> other user under arch/powerpc/, and there -
according to the
>>>>>>> accompanying comment - this is just to save away
the data for
>>>>>>> later use during resume).
>>>>>> There is a bug if msi_ad is not passed in.
>>>>>>
>>>>>> when driver first load,
>>>>>>
>>>>>> kernel.__read_msi_msg()
>>>>>> (got all zero)
>>>>> But you don''t even comment on the apparently bogus
use of the function here.
>>>> This pattern is used only when hvm_pirq is enabled. kernel need
to check
>>>> XEN_PIRQ_MSI_DATA.
>>>> It''s not a issue if data is 0 at first driver load,
kernel will call
>>>> __write_msi_msg with pirq and  XEN_PIRQ_MSI_DATA set.
>>> But this doesn''t make the use of __read_msi_msg() less
bogus. It''s
>>> not clear on what basis this mechanism got invented in the first
>>> place.
>> It''s there since hvm_irq introduced. But it works indeed.
> But that doesn''t in any way mean the concept is sound.
>
>>>>>> kernel.__write_msi_msg(pirq)
>>>>>> (ioreq passed to qemu as no msixtbl_entry established
yet)
>>>>>> qemu.pt_msi_update_one()
>>>>>> xc_domain_update_msi_irq()
>>>>>> (msixtbl_entry dynamically allocated with msi_ad all zero)
>>>>>>
>>>>>> then driver unload,
>>>>>> ...
>>>>>> driver load again,
>>>>>>
>>>>>> kernel.__read_msi_msg()
>>>>>> (got all zero from xen as accelerated entry just
established with all zero)
>>>>> If all zeroes get returned, why would the flow here be
different then above?
>>>> Because pirq and related mapping and binding are not freed
between
>>>> driver load-unload-load. They are freed when device detach.
>>>> We should try to use the last pirq.
>>> But then you need to solve the problem generically, i.e. not just
>>> for the driver reload case, but also for e.g. the kexec one (where
>>> __read_msi_msg() returning other than all zeros wouldn''t
help you
>>> as xen_irq_from_pirq() would then return -1, and you''d be
back to
>>> the same problem.
>> No, not only kexec ones, it''s driver unload that makes
xen_irq_from_pirq
>> return -1. So there is also a bug in kernel side.
>> I have sent a patch about kernel. I think you miss it.
>> http://www.gossamer-threads.com/lists/xen/devel/281498
>>> IOW I think the prior IRQ needs to be freed
>>> anyway rather than an attempt be made to reuse it.
>> I have ever thought about this idea, but when to free the pirq is a
problem.
>> When driver unload? qemu has no idea of if driver unloaded.
> But the kernel does, and hence could deal with this. As much as
> the setup is being done when the driver gets loaded, cleanup
> should be done when the driver gets unloaded. _If_ there
> already is such an odd protocol between kernel and qemu, then
> if that can''t be dropped, it surely can be leveraged to also deal
> with the cleanup side of things? No need to fiddle with the
> hypervisor interfaces for something that it''s not supposed to
> know about anyway.
But I'm suspicious whether domU has authorization to call the unmap and
unbind hypercalls.
Looking at the kernel code, only dom0 does that.
>
>> When msix entry masked? kernel mask and unmask msix entry
>> intermittently, especially when irqbalance enabled.
>>
>> So based on above, I think it''s better to reuse same pirq,
only free it
>> when device detached.
> I continue to disagree. Also from a theoretical perspective - if you
> have a lot of devices that no driver is loaded for, you'd keep a lot
> of IRQs allocated without any need.
Sounds right, but why would you pass through those devices and not use
them? You will eventually use them.
For driver that reload often, this pattern will save some time of 
mapping and binding.
Also both xen and kernel have ability to allocate enough IRQs for each 
device.
If no driver is loaded for an IRQ, the interrupt will not be triggered
and there is no impact on the whole system.

Regards
zduan

Konrad Rzeszutek Wilk

2013-May-10 19:03 UTC

head link

Is: Telling QEMU to re-use PIRQ value Was: Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

On Fri, May 10, 2013 at 08:55:46AM +0100, Jan Beulich wrote:
> >>> On 10.05.13 at 09:39, Zhenzhong Duan <zhenzhong.duan@oracle.com> wrote:
> 
> > On 2013-05-10 14:37, Jan Beulich wrote:
> >>>>> On 10.05.13 at 04:49, Zhenzhong Duan
<zhenzhong.duan@oracle.com> wrote:
> >>> On 2013-05-10 03:05, Jan Beulich wrote:
> >>>>>>> Zhenzhong Duan
<zhenzhong.duan@oracle.com> 05/09/13 5:02 AM >>>
> >>>>> On 2013/5/8 20:03, Jan Beulich wrote:
> >>>>>> But of course I still don''t really
understand why all of the sudden
> >>>>>> this needs to be passed in rather than being under
the full control
> >>>>>> of the hypervisor at all times. Perhaps this is
related to me not
> >>>>>> understanding why the kernel would read these
values at all:
> >>>>>> There''s no other place in the kernel
where the message would
> >>>>>> be read before first getting written (in fact,
apart from the
> >>>>>> use of __read_msi_msg() by the Xen code,
there''s only one
> >>>>>> other user under arch/powerpc/, and there -
according to the
> >>>>>> accompanying comment - this is just to save away
the data for
> >>>>>> later use during resume).
> >>>>> There is a bug if msi_ad is not passed in.
> >>>>>
> >>>>> when driver first load,
> >>>>>
> >>>>> kernel.__read_msi_msg()
> >>>>> (got all zero)
> >>>> But you don''t even comment on the apparently
bogus use of the function here.
> >>> This pattern is used only when hvm_pirq is enabled. kernel
need to check
> >>> XEN_PIRQ_MSI_DATA.
> >>> It''s not a issue if data is 0 at first driver load,
kernel will call
> >>> __write_msi_msg with pirq and  XEN_PIRQ_MSI_DATA set.
> >> But this doesn''t make the use of __read_msi_msg() less
bogus. It''s
> >> not clear on what basis this mechanism got invented in the first
> >> place.
> > It''s there since hvm_irq introduced. But it works indeed.
> 
> But that doesn''t in any way mean the concept is sound.
> 
> >>>>> kernel.__write_msi_msg(pirq)
> >>>>> (ioreq passed to qemu as no msixtbl_entry established
yet)
> >>>>> qemu.pt_msi_update_one()
> >>>>> xc_domain_update_msi_irq()
> >>>>> (msixtbl_entry dynamically allocated with msi_ad all zero)
> >>>>>
> >>>>> then driver unload,
> >>>>> ...
> >>>>> driver load again,
> >>>>>
> >>>>> kernel.__read_msi_msg()
> >>>>> (got all zero from xen as accelerated entry just
established with all zero)
> >>>> If all zeroes get returned, why would the flow here be
different then above?
> >>> Because pirq and related mapping and binding are not freed
between
> >>> driver load-unload-load. They are freed when device detach.
> >>> We should try to use the last pirq.
> >> But then you need to solve the problem generically, i.e. not just
> >> for the driver reload case, but also for e.g. the kexec one (where
> >> __read_msi_msg() returning other than all zeros wouldn''t
help you
> >> as xen_irq_from_pirq() would then return -1, and you''d be
back to
> >> the same problem.
> > No, not only kexec ones, it''s driver unload that makes
xen_irq_from_pirq
> > return -1. So there is also a bug in kernel side.
> > I have sent a patch about kernel. I think you miss it.
> > http://www.gossamer-threads.com/lists/xen/devel/281498 
> >> IOW I think the prior IRQ needs to be freed
> >> anyway rather than an attempt be made to reuse it.
> > I have ever thought about this idea, but when to free the pirq is a
problem.
> > When driver unload? qemu has no idea of if driver unloaded.
> 
> But the kernel does, and hence could deal with this. As much as
> the setup is being done when the driver gets loaded, cleanup
> should be done when the driver gets unloaded. _If_ there
> already is such an odd protocol between kernel and qemu, then
> if that can''t be dropped, it surely can be leveraged to also deal
> with the cleanup side of things? No need to fiddle with the
I don't know if such a thing exists. Stefano, is there a way
to tell QEMU to re-use the PIRQ? Writing zero to the MSI?
> hypervisor interfaces for something that it''s not supposed to
> know about anyway.
> 
> > When msix entry masked? kernel mask and unmask msix entry 
> > intermittently, especially when irqbalance enabled.
> > 
> > So based on above, I think it''s better to reuse same pirq,
only free it
> > when device detached.
> 
> I continue to disagree. Also from a theoretical perspective - if you
> have a lot of devices that no driver is loaded for, you''d keep a
lot
> of IRQs allocated without any need.
The guest has to use PHYSDEVOP_get_free_pirq to allocate it. And
in this case we don't have a 'free_pirq' hypercall to release it.

The Linux Xen<->IRQ drivers drop all of the information they have on
the PIRQ once the driver is unloaded (rightly so - the driver afterwards
does not need the IRQ anymore and the PIRQ<->events connection has
been broken).

I wrote a tiny patch that needs improvements that would cache the
last seen BDF and PIRQ (that part is missing). That would allow us
to re-use the PIRQ and not call PHYSDEVOP_get_free_pirq until we
exhaust the allocation we have.

In other words - this can be fixed in the kernel.

But if there is a 'magic' value that can be written to QEMU to tell
it to re-use the PIRQ... that would be good too.
>
> Jan

Reasonably Related Threads

Search for more possibly parallel threads

Xen devel - May 2013 - [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

[PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

Is: Telling QEMU to re-use PIRQ value Was: Re: [PATCH 1/2] xen, libxc: init msix addr/data with value from qemu via hypercall

Reasonably Related Threads