jiangyiwen
2018-Dec-13 02:38 UTC
[PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
Hi Michael, On 2018/12/12 23:31, Michael S. Tsirkin wrote:> On Wed, Dec 12, 2018 at 05:31:39PM +0800, jiangyiwen wrote: >> Guest receive mergeable rx buffer, it can merge >> scatter rx buffer into a big buffer and then copy >> to user space. >> >> In addition, it also use iovec to replace buf in struct >> virtio_vsock_pkt, keep tx and rx consistency. The only >> difference is now tx still uses a segment of continuous >> physical memory to implement. >> >> Signed-off-by: Yiwen Jiang <jiangyiwen at huawei.com> >> --- >> drivers/vhost/vsock.c | 31 +++++++--- >> include/linux/virtio_vsock.h | 6 +- >> net/vmw_vsock/virtio_transport.c | 105 ++++++++++++++++++++++++++++---- >> net/vmw_vsock/virtio_transport_common.c | 59 ++++++++++++++---- >> 4 files changed, 166 insertions(+), 35 deletions(-) > > > This was supposed to be a guest patch, why is vhost changed here? >In mergeable rx buff cases, it need to scatter big packets into several buffers, so I add kvec variable in struct virtio_vsock_pkt, at the same time, in order to keep tx and rx consistency, I use kvec to replace variable buf, because vhost use the variable pkt->buf, so this patch caused vhost is changed.>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c >> index dc52b0f..c7ab0dd 100644 >> --- a/drivers/vhost/vsock.c >> +++ b/drivers/vhost/vsock.c >> @@ -179,6 +179,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq, >> size_t nbytes; >> size_t len; >> s16 headcount; >> + size_t remain_len; >> + int i; >> >> spin_lock_bh(&vsock->send_pkt_list_lock); >> if (list_empty(&vsock->send_pkt_list)) { >> @@ -221,11 +223,19 @@ static int get_rx_bufs(struct vhost_virtqueue *vq, >> break; >> } >> >> - nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter); >> - if (nbytes != pkt->len) { >> - virtio_transport_free_pkt(pkt); >> - vq_err(vq, "Faulted on copying pkt buf\n"); >> - break; >> + remain_len = pkt->len; >> + for (i = 0; i < pkt->nr_vecs; i++) { >> + int tmp_len; >> + >> + tmp_len = min(remain_len, pkt->vec[i].iov_len); >> + nbytes = copy_to_iter(pkt->vec[i].iov_base, tmp_len, &iov_iter); >> + if (nbytes != tmp_len) { >> + virtio_transport_free_pkt(pkt); >> + vq_err(vq, "Faulted on copying pkt buf\n"); >> + break; >> + } >> + >> + remain_len -= tmp_len; >> } >> >> vhost_add_used_n(vq, vq->heads, headcount); >> @@ -341,6 +351,7 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work) >> struct iov_iter iov_iter; >> size_t nbytes; >> size_t len; >> + void *buf; >> >> if (in != 0) { >> vq_err(vq, "Expected 0 input buffers, got %u\n", in); >> @@ -375,13 +386,17 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work) >> return NULL; >> } >> >> - pkt->buf = kmalloc(pkt->len, GFP_KERNEL); >> - if (!pkt->buf) { >> + buf = kmalloc(pkt->len, GFP_KERNEL); >> + if (!buf) { >> kfree(pkt); >> return NULL; >> } >> >> - nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter); >> + pkt->vec[0].iov_base = buf; >> + pkt->vec[0].iov_len = pkt->len; >> + pkt->nr_vecs = 1; >> + >> + nbytes = copy_from_iter(buf, pkt->len, &iov_iter); >> if (nbytes != pkt->len) { >> vq_err(vq, "Expected %u byte payload, got %zu bytes\n", >> pkt->len, nbytes); >> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h >> index da9e1fe..734eeed 100644 >> --- a/include/linux/virtio_vsock.h >> +++ b/include/linux/virtio_vsock.h >> @@ -13,6 +13,8 @@ >> #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) >> #define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL >> #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) >> +/* virtio_vsock_pkt + 
max_pkt_len(default MAX_PKT_BUF_SIZE) */ >> +#define VIRTIO_VSOCK_MAX_VEC_NUM ((VIRTIO_VSOCK_MAX_PKT_BUF_SIZE / PAGE_SIZE) + 1) >> >> /* Virtio-vsock feature */ >> #define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */ >> @@ -55,10 +57,12 @@ struct virtio_vsock_pkt { >> struct list_head list; >> /* socket refcnt not held, only use for cancellation */ >> struct vsock_sock *vsk; >> - void *buf; >> + struct kvec vec[VIRTIO_VSOCK_MAX_VEC_NUM]; >> + int nr_vecs; >> u32 len; >> u32 off; >> bool reply; >> + bool mergeable; >> }; >> >> struct virtio_vsock_pkt_info { >> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c >> index c4a465c..148b58a 100644 >> --- a/net/vmw_vsock/virtio_transport.c >> +++ b/net/vmw_vsock/virtio_transport.c >> @@ -155,8 +155,10 @@ static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock, >> >> sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); >> sgs[out_sg++] = &hdr; >> - if (pkt->buf) { >> - sg_init_one(&buf, pkt->buf, pkt->len); >> + if (pkt->len) { >> + /* Currently only support a segment of memory in tx */ >> + BUG_ON(pkt->vec[0].iov_len != pkt->len); >> + sg_init_one(&buf, pkt->vec[0].iov_base, pkt->vec[0].iov_len); >> sgs[out_sg++] = &buf; >> } >> >> @@ -304,23 +306,28 @@ static int fill_old_rx_buff(struct virtqueue *vq) >> struct virtio_vsock_pkt *pkt; >> struct scatterlist hdr, buf, *sgs[2]; >> int ret; >> + void *pkt_buf; >> >> pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); >> if (!pkt) >> return -ENOMEM; >> >> - pkt->buf = kmalloc(buf_len, GFP_KERNEL); >> - if (!pkt->buf) { >> + pkt_buf = kmalloc(buf_len, GFP_KERNEL); >> + if (!pkt_buf) { >> virtio_transport_free_pkt(pkt); >> return -ENOMEM; >> } >> >> + pkt->vec[0].iov_base = pkt_buf; >> + pkt->vec[0].iov_len = buf_len; >> + pkt->nr_vecs = 1; >> + >> pkt->len = buf_len; >> >> sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); >> sgs[0] = &hdr; >> >> - sg_init_one(&buf, pkt->buf, buf_len); >> + sg_init_one(&buf, pkt->vec[0].iov_base, buf_len); >> sgs[1] = &buf; >> ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL); >> if (ret) >> @@ -388,11 +395,78 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock) >> return val < virtqueue_get_vring_size(vq); >> } >> >> +static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq, >> + struct virtio_vsock *vsock, unsigned int *total_len) >> +{ >> + struct virtio_vsock_pkt *pkt; >> + u16 num_buf; >> + void *buf; >> + unsigned int len; >> + size_t vsock_hlen = sizeof(struct virtio_vsock_pkt); >> + >> + buf = virtqueue_get_buf(vq, &len); >> + if (!buf) >> + return NULL; >> + >> + *total_len = len; >> + vsock->rx_buf_nr--; >> + >> + if (unlikely(len < vsock_hlen)) { >> + put_page(virt_to_head_page(buf)); >> + return NULL; >> + } >> + >> + pkt = buf; >> + num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers); >> + if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) { >> + put_page(virt_to_head_page(buf)); >> + return NULL; >> + } > > So everything just stops going, and host and user don't even > know what the reason is. And not only that - the next > packet will be corrupted because we skipped the first one. > >I understand this case will not encountered unless the code has *BUG*, like Host send some problematic packages (shorten/longer than expected). 
In this case, I think we should ignore/drop these packets.> >> + >> + /* Initialize pkt residual structure */ >> + memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) - >> + sizeof(struct virtio_vsock_mrg_rxbuf_hdr)); >> + >> + pkt->mergeable = true; >> + pkt->len = le32_to_cpu(pkt->hdr.len); >> + if (!pkt->len) >> + return pkt; >> + >> + len -= vsock_hlen; >> + if (len) { >> + pkt->vec[pkt->nr_vecs].iov_base = buf + vsock_hlen; >> + pkt->vec[pkt->nr_vecs].iov_len = len; >> + /* Shared page with pkt, so get page in advance */ >> + get_page(virt_to_head_page(buf)); >> + pkt->nr_vecs++; >> + } >> + >> + while (--num_buf) { >> + buf = virtqueue_get_buf(vq, &len); >> + if (!buf) >> + goto err; >> + >> + *total_len += len; >> + vsock->rx_buf_nr--; >> + >> + pkt->vec[pkt->nr_vecs].iov_base = buf; >> + pkt->vec[pkt->nr_vecs].iov_len = len; >> + pkt->nr_vecs++; >> + } >> + >> + return pkt; >> +err: >> + virtio_transport_free_pkt(pkt); >> + return NULL; >> +} >> + >> static void virtio_transport_rx_work(struct work_struct *work) >> { >> struct virtio_vsock *vsock >> container_of(work, struct virtio_vsock, rx_work); >> struct virtqueue *vq; >> + size_t vsock_hlen = vsock->mergeable ? sizeof(struct virtio_vsock_pkt) : >> + sizeof(struct virtio_vsock_hdr); >> >> vq = vsock->vqs[VSOCK_VQ_RX]; >> >> @@ -412,21 +486,26 @@ static void virtio_transport_rx_work(struct work_struct *work) >> goto out; >> } >> >> - pkt = virtqueue_get_buf(vq, &len); >> - if (!pkt) { >> - break; >> - } >> + if (likely(vsock->mergeable)) { >> + pkt = receive_mergeable(vq, vsock, &len); >> + if (!pkt) >> + break; >> + } else { >> + pkt = virtqueue_get_buf(vq, &len); >> + if (!pkt) >> + break; >> > > So looking at it, this seems to be the main source of the gain. > But why does this require host/guest changes? > > > The way I see it: > - get a buffer and create an skb > - get the next one, check header matches, if yes > tack it on the skb as a fragment. If not then > don't, deliver previous one and queue the new one. > >Vhost change reason I explain as above, and I hope use kvec to instead buf, after all buf only can express a contiguous physical memory. 
Thanks, Yiwen.> >> - vsock->rx_buf_nr--; >> + vsock->rx_buf_nr--; >> + } >> >> /* Drop short/long packets */ >> - if (unlikely(len < sizeof(pkt->hdr) || >> - len > sizeof(pkt->hdr) + pkt->len)) { >> + if (unlikely(len < vsock_hlen || >> + len > vsock_hlen + pkt->len)) { >> virtio_transport_free_pkt(pkt); >> continue; >> } >> >> - pkt->len = len - sizeof(pkt->hdr); >> + pkt->len = len - vsock_hlen; >> virtio_transport_deliver_tap_pkt(pkt); >> virtio_transport_recv_pkt(pkt); >> } >> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c >> index 3ae3a33..123a8b6 100644 >> --- a/net/vmw_vsock/virtio_transport_common.c >> +++ b/net/vmw_vsock/virtio_transport_common.c >> @@ -44,6 +44,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void) >> { >> struct virtio_vsock_pkt *pkt; >> int err; >> + void *buf = NULL; >> >> pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); >> if (!pkt) >> @@ -62,12 +63,16 @@ static const struct virtio_transport *virtio_transport_get_ops(void) >> pkt->vsk = info->vsk; >> >> if (info->msg && len > 0) { >> - pkt->buf = kmalloc(len, GFP_KERNEL); >> - if (!pkt->buf) >> + buf = kmalloc(len, GFP_KERNEL); >> + if (!buf) >> goto out_pkt; >> - err = memcpy_from_msg(pkt->buf, info->msg, len); >> + err = memcpy_from_msg(buf, info->msg, len); >> if (err) >> goto out; >> + >> + pkt->vec[0].iov_base = buf; >> + pkt->vec[0].iov_len = len; >> + pkt->nr_vecs = 1; >> } >> >> trace_virtio_transport_alloc_pkt(src_cid, src_port, >> @@ -80,7 +85,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void) >> return pkt; >> >> out: >> - kfree(pkt->buf); >> + kfree(buf); >> out_pkt: >> kfree(pkt); >> return NULL; >> @@ -92,6 +97,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) >> struct virtio_vsock_pkt *pkt = opaque; >> struct af_vsockmon_hdr *hdr; >> struct sk_buff *skb; >> + int i; >> >> skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len, >> GFP_ATOMIC); >> @@ -134,7 +140,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) >> skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr)); >> >> if (pkt->len) { >> - skb_put_data(skb, pkt->buf, pkt->len); >> + for (i = 0; i < pkt->nr_vecs; i++) >> + skb_put_data(skb, pkt->vec[i].iov_base, pkt->vec[i].iov_len); >> } >> >> return skb; >> @@ -260,6 +267,9 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk, >> >> spin_lock_bh(&vvs->rx_lock); >> while (total < len && !list_empty(&vvs->rx_queue)) { >> + size_t copy_bytes, last_vec_total = 0, vec_off; >> + int i; >> + >> pkt = list_first_entry(&vvs->rx_queue, >> struct virtio_vsock_pkt, list); >> >> @@ -272,14 +282,28 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk, >> */ >> spin_unlock_bh(&vvs->rx_lock); >> >> - err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes); >> - if (err) >> - goto out; >> + for (i = 0; i < pkt->nr_vecs; i++) { >> + if (pkt->off > last_vec_total + pkt->vec[i].iov_len) { >> + last_vec_total += pkt->vec[i].iov_len; >> + continue; >> + } >> + >> + vec_off = pkt->off - last_vec_total; >> + copy_bytes = min(pkt->vec[i].iov_len - vec_off, bytes); >> + err = memcpy_to_msg(msg, pkt->vec[i].iov_base + vec_off, >> + copy_bytes); >> + if (err) >> + goto out; >> + >> + bytes -= copy_bytes; >> + pkt->off += copy_bytes; >> + total += copy_bytes; >> + last_vec_total += pkt->vec[i].iov_len; >> + if (!bytes) >> + break; >> + } >> >> spin_lock_bh(&vvs->rx_lock); >> - >> - total += bytes; >> - pkt->off += bytes; >> if (pkt->off == pkt->len) { >> 
virtio_transport_dec_rx_pkt(vvs, pkt); >> list_del(&pkt->list); >> @@ -1050,8 +1074,17 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) >> >> void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt) >> { >> - kfree(pkt->buf); >> - kfree(pkt); >> + int i; >> + >> + if (pkt->mergeable) { >> + for (i = 0; i < pkt->nr_vecs; i++) >> + put_page(virt_to_head_page(pkt->vec[i].iov_base)); >> + put_page(virt_to_head_page((void *)pkt)); >> + } else { >> + for (i = 0; i < pkt->nr_vecs; i++) >> + kfree(pkt->vec[i].iov_base); >> + kfree(pkt); >> + } >> } >> EXPORT_SYMBOL_GPL(virtio_transport_free_pkt); >> >> -- >> 1.8.3.1 >> > > . >
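As an aside for readers following the thread: the heart of the vhost change above is that a packet's payload is no longer one contiguous buf but an array of kvec segments that get copied out in order. Below is a minimal userspace sketch of that walk; struct seg and copy_segments() are made-up stand-ins for struct kvec and the copy_to_iter() loop in get_rx_bufs(), not the kernel code itself.

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-in for struct kvec. */
struct seg {
	void   *base;
	size_t  len;
};

/* Copy at most pkt_len bytes out of an array of segments into dst,
 * segment by segment, the way the patch walks pkt->vec[]. */
static size_t copy_segments(char *dst, size_t dst_size,
			    const struct seg *vec, int nr_vecs, size_t pkt_len)
{
	size_t remain = pkt_len < dst_size ? pkt_len : dst_size;
	size_t copied = 0;

	for (int i = 0; i < nr_vecs && remain; i++) {
		size_t chunk = vec[i].len < remain ? vec[i].len : remain;

		memcpy(dst + copied, vec[i].base, chunk);
		copied += chunk;
		remain -= chunk;
	}
	return copied;
}

int main(void)
{
	char a[] = "hello ", b[] = "mergeable ", c[] = "world";
	struct seg vec[] = {
		{ a, sizeof(a) - 1 },
		{ b, sizeof(b) - 1 },
		{ c, sizeof(c) - 1 },
	};
	char out[64] = { 0 };

	size_t n = copy_segments(out, sizeof(out) - 1, vec, 3, 21);
	printf("copied %zu bytes: \"%s\"\n", n, out);
	return 0;
}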
Michael S. Tsirkin
2018-Dec-13 14:29 UTC
[PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
On Thu, Dec 13, 2018 at 10:38:09AM +0800, jiangyiwen wrote:> Hi Michael, > > On 2018/12/12 23:31, Michael S. Tsirkin wrote: > > On Wed, Dec 12, 2018 at 05:31:39PM +0800, jiangyiwen wrote: > >> Guest receive mergeable rx buffer, it can merge > >> scatter rx buffer into a big buffer and then copy > >> to user space. > >> > >> In addition, it also use iovec to replace buf in struct > >> virtio_vsock_pkt, keep tx and rx consistency. The only > >> difference is now tx still uses a segment of continuous > >> physical memory to implement. > >> > >> Signed-off-by: Yiwen Jiang <jiangyiwen at huawei.com> > >> --- > >> drivers/vhost/vsock.c | 31 +++++++--- > >> include/linux/virtio_vsock.h | 6 +- > >> net/vmw_vsock/virtio_transport.c | 105 ++++++++++++++++++++++++++++---- > >> net/vmw_vsock/virtio_transport_common.c | 59 ++++++++++++++---- > >> 4 files changed, 166 insertions(+), 35 deletions(-) > > > > > > This was supposed to be a guest patch, why is vhost changed here? > > > > In mergeable rx buff cases, it need to scatter big packets into several > buffers, so I add kvec variable in struct virtio_vsock_pkt, at the same > time, in order to keep tx and rx consistency, I use kvec to replace > variable buf, because vhost use the variable pkt->buf, so this patch > caused vhost is changed.You'd want to split these patches imho.> >> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c > >> index dc52b0f..c7ab0dd 100644 > >> --- a/drivers/vhost/vsock.c > >> +++ b/drivers/vhost/vsock.c > >> @@ -179,6 +179,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq, > >> size_t nbytes; > >> size_t len; > >> s16 headcount; > >> + size_t remain_len; > >> + int i; > >> > >> spin_lock_bh(&vsock->send_pkt_list_lock); > >> if (list_empty(&vsock->send_pkt_list)) { > >> @@ -221,11 +223,19 @@ static int get_rx_bufs(struct vhost_virtqueue *vq, > >> break; > >> } > >> > >> - nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter); > >> - if (nbytes != pkt->len) { > >> - virtio_transport_free_pkt(pkt); > >> - vq_err(vq, "Faulted on copying pkt buf\n"); > >> - break; > >> + remain_len = pkt->len; > >> + for (i = 0; i < pkt->nr_vecs; i++) { > >> + int tmp_len; > >> + > >> + tmp_len = min(remain_len, pkt->vec[i].iov_len); > >> + nbytes = copy_to_iter(pkt->vec[i].iov_base, tmp_len, &iov_iter); > >> + if (nbytes != tmp_len) { > >> + virtio_transport_free_pkt(pkt); > >> + vq_err(vq, "Faulted on copying pkt buf\n"); > >> + break; > >> + } > >> + > >> + remain_len -= tmp_len; > >> } > >> > >> vhost_add_used_n(vq, vq->heads, headcount); > >> @@ -341,6 +351,7 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work) > >> struct iov_iter iov_iter; > >> size_t nbytes; > >> size_t len; > >> + void *buf; > >> > >> if (in != 0) { > >> vq_err(vq, "Expected 0 input buffers, got %u\n", in); > >> @@ -375,13 +386,17 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work) > >> return NULL; > >> } > >> > >> - pkt->buf = kmalloc(pkt->len, GFP_KERNEL); > >> - if (!pkt->buf) { > >> + buf = kmalloc(pkt->len, GFP_KERNEL); > >> + if (!buf) { > >> kfree(pkt); > >> return NULL; > >> } > >> > >> - nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter); > >> + pkt->vec[0].iov_base = buf; > >> + pkt->vec[0].iov_len = pkt->len; > >> + pkt->nr_vecs = 1; > >> + > >> + nbytes = copy_from_iter(buf, pkt->len, &iov_iter); > >> if (nbytes != pkt->len) { > >> vq_err(vq, "Expected %u byte payload, got %zu bytes\n", > >> pkt->len, nbytes); > >> diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h > >> index 
da9e1fe..734eeed 100644 > >> --- a/include/linux/virtio_vsock.h > >> +++ b/include/linux/virtio_vsock.h > >> @@ -13,6 +13,8 @@ > >> #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) > >> #define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL > >> #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) > >> +/* virtio_vsock_pkt + max_pkt_len(default MAX_PKT_BUF_SIZE) */ > >> +#define VIRTIO_VSOCK_MAX_VEC_NUM ((VIRTIO_VSOCK_MAX_PKT_BUF_SIZE / PAGE_SIZE) + 1) > >> > >> /* Virtio-vsock feature */ > >> #define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */ > >> @@ -55,10 +57,12 @@ struct virtio_vsock_pkt { > >> struct list_head list; > >> /* socket refcnt not held, only use for cancellation */ > >> struct vsock_sock *vsk; > >> - void *buf; > >> + struct kvec vec[VIRTIO_VSOCK_MAX_VEC_NUM]; > >> + int nr_vecs; > >> u32 len; > >> u32 off; > >> bool reply; > >> + bool mergeable; > >> }; > >> > >> struct virtio_vsock_pkt_info { > >> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c > >> index c4a465c..148b58a 100644 > >> --- a/net/vmw_vsock/virtio_transport.c > >> +++ b/net/vmw_vsock/virtio_transport.c > >> @@ -155,8 +155,10 @@ static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock, > >> > >> sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); > >> sgs[out_sg++] = &hdr; > >> - if (pkt->buf) { > >> - sg_init_one(&buf, pkt->buf, pkt->len); > >> + if (pkt->len) { > >> + /* Currently only support a segment of memory in tx */ > >> + BUG_ON(pkt->vec[0].iov_len != pkt->len); > >> + sg_init_one(&buf, pkt->vec[0].iov_base, pkt->vec[0].iov_len); > >> sgs[out_sg++] = &buf; > >> } > >> > >> @@ -304,23 +306,28 @@ static int fill_old_rx_buff(struct virtqueue *vq) > >> struct virtio_vsock_pkt *pkt; > >> struct scatterlist hdr, buf, *sgs[2]; > >> int ret; > >> + void *pkt_buf; > >> > >> pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); > >> if (!pkt) > >> return -ENOMEM; > >> > >> - pkt->buf = kmalloc(buf_len, GFP_KERNEL); > >> - if (!pkt->buf) { > >> + pkt_buf = kmalloc(buf_len, GFP_KERNEL); > >> + if (!pkt_buf) { > >> virtio_transport_free_pkt(pkt); > >> return -ENOMEM; > >> } > >> > >> + pkt->vec[0].iov_base = pkt_buf; > >> + pkt->vec[0].iov_len = buf_len; > >> + pkt->nr_vecs = 1; > >> + > >> pkt->len = buf_len; > >> > >> sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); > >> sgs[0] = &hdr; > >> > >> - sg_init_one(&buf, pkt->buf, buf_len); > >> + sg_init_one(&buf, pkt->vec[0].iov_base, buf_len); > >> sgs[1] = &buf; > >> ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL); > >> if (ret) > >> @@ -388,11 +395,78 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock) > >> return val < virtqueue_get_vring_size(vq); > >> } > >> > >> +static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq, > >> + struct virtio_vsock *vsock, unsigned int *total_len) > >> +{ > >> + struct virtio_vsock_pkt *pkt; > >> + u16 num_buf; > >> + void *buf; > >> + unsigned int len; > >> + size_t vsock_hlen = sizeof(struct virtio_vsock_pkt); > >> + > >> + buf = virtqueue_get_buf(vq, &len); > >> + if (!buf) > >> + return NULL; > >> + > >> + *total_len = len; > >> + vsock->rx_buf_nr--; > >> + > >> + if (unlikely(len < vsock_hlen)) { > >> + put_page(virt_to_head_page(buf)); > >> + return NULL; > >> + } > >> + > >> + pkt = buf; > >> + num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers); > >> + if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) { > >> + put_page(virt_to_head_page(buf)); > >> + return NULL; > >> + } > > > > So everything just stops going, and host and user 
don't even > > know what the reason is. And not only that - the next > > packet will be corrupted because we skipped the first one. > > > > > > I understand this case will not encountered unless the code has > *BUG*, like Host send some problematic packages (shorten/longer than > expected). In this case, I think we should ignore/drop these packets.If there's a specific packet length expected, e.g. in this case vector size, it needs to be negotiated between host and guest in some way.> > > >> + > >> + /* Initialize pkt residual structure */ > >> + memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) - > >> + sizeof(struct virtio_vsock_mrg_rxbuf_hdr)); > >> + > >> + pkt->mergeable = true; > >> + pkt->len = le32_to_cpu(pkt->hdr.len); > >> + if (!pkt->len) > >> + return pkt; > >> + > >> + len -= vsock_hlen; > >> + if (len) { > >> + pkt->vec[pkt->nr_vecs].iov_base = buf + vsock_hlen; > >> + pkt->vec[pkt->nr_vecs].iov_len = len; > >> + /* Shared page with pkt, so get page in advance */ > >> + get_page(virt_to_head_page(buf)); > >> + pkt->nr_vecs++; > >> + } > >> + > >> + while (--num_buf) { > >> + buf = virtqueue_get_buf(vq, &len); > >> + if (!buf) > >> + goto err; > >> + > >> + *total_len += len; > >> + vsock->rx_buf_nr--; > >> + > >> + pkt->vec[pkt->nr_vecs].iov_base = buf; > >> + pkt->vec[pkt->nr_vecs].iov_len = len; > >> + pkt->nr_vecs++; > >> + } > >> + > >> + return pkt; > >> +err: > >> + virtio_transport_free_pkt(pkt); > >> + return NULL; > >> +} > >> + > >> static void virtio_transport_rx_work(struct work_struct *work) > >> { > >> struct virtio_vsock *vsock > >> container_of(work, struct virtio_vsock, rx_work); > >> struct virtqueue *vq; > >> + size_t vsock_hlen = vsock->mergeable ? sizeof(struct virtio_vsock_pkt) : > >> + sizeof(struct virtio_vsock_hdr); > >> > >> vq = vsock->vqs[VSOCK_VQ_RX]; > >> > >> @@ -412,21 +486,26 @@ static void virtio_transport_rx_work(struct work_struct *work) > >> goto out; > >> } > >> > >> - pkt = virtqueue_get_buf(vq, &len); > >> - if (!pkt) { > >> - break; > >> - } > >> + if (likely(vsock->mergeable)) { > >> + pkt = receive_mergeable(vq, vsock, &len); > >> + if (!pkt) > >> + break; > >> + } else { > >> + pkt = virtqueue_get_buf(vq, &len); > >> + if (!pkt) > >> + break; > >> > > > > So looking at it, this seems to be the main source of the gain. > > But why does this require host/guest changes? > > > > > > The way I see it: > > - get a buffer and create an skb > > - get the next one, check header matches, if yes > > tack it on the skb as a fragment. If not then > > don't, deliver previous one and queue the new one. > > > > > > Vhost change reason I explain as above, and I hope use kvec > to instead buf, after all buf only can express a contiguous > physical memory. > > Thanks, > Yiwen.I got the reason but I am not yet convinced it's a good one. You don't necessarily need host to skip headers in all but the first chunk. 
Guest can do this just as well.> > > >> - vsock->rx_buf_nr--; > >> + vsock->rx_buf_nr--; > >> + } > >> > >> /* Drop short/long packets */ > >> - if (unlikely(len < sizeof(pkt->hdr) || > >> - len > sizeof(pkt->hdr) + pkt->len)) { > >> + if (unlikely(len < vsock_hlen || > >> + len > vsock_hlen + pkt->len)) { > >> virtio_transport_free_pkt(pkt); > >> continue; > >> } > >> > >> - pkt->len = len - sizeof(pkt->hdr); > >> + pkt->len = len - vsock_hlen; > >> virtio_transport_deliver_tap_pkt(pkt); > >> virtio_transport_recv_pkt(pkt); > >> } > >> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c > >> index 3ae3a33..123a8b6 100644 > >> --- a/net/vmw_vsock/virtio_transport_common.c > >> +++ b/net/vmw_vsock/virtio_transport_common.c > >> @@ -44,6 +44,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void) > >> { > >> struct virtio_vsock_pkt *pkt; > >> int err; > >> + void *buf = NULL; > >> > >> pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); > >> if (!pkt) > >> @@ -62,12 +63,16 @@ static const struct virtio_transport *virtio_transport_get_ops(void) > >> pkt->vsk = info->vsk; > >> > >> if (info->msg && len > 0) { > >> - pkt->buf = kmalloc(len, GFP_KERNEL); > >> - if (!pkt->buf) > >> + buf = kmalloc(len, GFP_KERNEL); > >> + if (!buf) > >> goto out_pkt; > >> - err = memcpy_from_msg(pkt->buf, info->msg, len); > >> + err = memcpy_from_msg(buf, info->msg, len); > >> if (err) > >> goto out; > >> + > >> + pkt->vec[0].iov_base = buf; > >> + pkt->vec[0].iov_len = len; > >> + pkt->nr_vecs = 1; > >> } > >> > >> trace_virtio_transport_alloc_pkt(src_cid, src_port, > >> @@ -80,7 +85,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void) > >> return pkt; > >> > >> out: > >> - kfree(pkt->buf); > >> + kfree(buf); > >> out_pkt: > >> kfree(pkt); > >> return NULL; > >> @@ -92,6 +97,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) > >> struct virtio_vsock_pkt *pkt = opaque; > >> struct af_vsockmon_hdr *hdr; > >> struct sk_buff *skb; > >> + int i; > >> > >> skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len, > >> GFP_ATOMIC); > >> @@ -134,7 +140,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) > >> skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr)); > >> > >> if (pkt->len) { > >> - skb_put_data(skb, pkt->buf, pkt->len); > >> + for (i = 0; i < pkt->nr_vecs; i++) > >> + skb_put_data(skb, pkt->vec[i].iov_base, pkt->vec[i].iov_len); > >> } > >> > >> return skb; > >> @@ -260,6 +267,9 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk, > >> > >> spin_lock_bh(&vvs->rx_lock); > >> while (total < len && !list_empty(&vvs->rx_queue)) { > >> + size_t copy_bytes, last_vec_total = 0, vec_off; > >> + int i; > >> + > >> pkt = list_first_entry(&vvs->rx_queue, > >> struct virtio_vsock_pkt, list); > >> > >> @@ -272,14 +282,28 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk, > >> */ > >> spin_unlock_bh(&vvs->rx_lock); > >> > >> - err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes); > >> - if (err) > >> - goto out; > >> + for (i = 0; i < pkt->nr_vecs; i++) { > >> + if (pkt->off > last_vec_total + pkt->vec[i].iov_len) { > >> + last_vec_total += pkt->vec[i].iov_len; > >> + continue; > >> + } > >> + > >> + vec_off = pkt->off - last_vec_total; > >> + copy_bytes = min(pkt->vec[i].iov_len - vec_off, bytes); > >> + err = memcpy_to_msg(msg, pkt->vec[i].iov_base + vec_off, > >> + copy_bytes); > >> + if (err) > >> + goto out; > >> + > >> + bytes -= copy_bytes; > >> + pkt->off += copy_bytes; 
> >> + total += copy_bytes; > >> + last_vec_total += pkt->vec[i].iov_len; > >> + if (!bytes) > >> + break; > >> + } > >> > >> spin_lock_bh(&vvs->rx_lock); > >> - > >> - total += bytes; > >> - pkt->off += bytes; > >> if (pkt->off == pkt->len) { > >> virtio_transport_dec_rx_pkt(vvs, pkt); > >> list_del(&pkt->list); > >> @@ -1050,8 +1074,17 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) > >> > >> void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt) > >> { > >> - kfree(pkt->buf); > >> - kfree(pkt); > >> + int i; > >> + > >> + if (pkt->mergeable) { > >> + for (i = 0; i < pkt->nr_vecs; i++) > >> + put_page(virt_to_head_page(pkt->vec[i].iov_base)); > >> + put_page(virt_to_head_page((void *)pkt)); > >> + } else { > >> + for (i = 0; i < pkt->nr_vecs; i++) > >> + kfree(pkt->vec[i].iov_base); > >> + kfree(pkt); > >> + } > >> } > >> EXPORT_SYMBOL_GPL(virtio_transport_free_pkt); > >> > >> -- > >> 1.8.3.1 > >> > > > > . > > >
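To make the num_buffers concern concrete: in mergeable mode the first rx buffer carries a count of how many buffers make up the packet, and the receiver has to gather and validate that chain. The snippet below is a simplified userspace model of that gather step (struct rx_buf, gather_mergeable() and MAX_VEC_NUM are illustrative stand-ins, assuming 4K pages and a 64K maximum packet); it only shows the counting and validation, not what the kernel does with a rejected chain. In the patch, a bad num_buffers drops only the first buffer while the rest stay queued, which is what leads to the corruption Michael points out.

#include <stdio.h>
#include <stdint.h>

#define MAX_VEC_NUM 17	/* (64K / 4K) + 1, as in the patch's define */

/* Hypothetical stand-in for one used rx buffer returned by the ring. */
struct rx_buf {
	uint16_t num_buffers;	/* only meaningful in the first buffer */
	uint32_t payload_len;
};

/* Gather a mergeable packet: the first buffer says how many buffers
 * belong to it; each buffer contributes payload. Returns the number of
 * buffers consumed, or -1 when the chain looks bogus. */
static int gather_mergeable(const struct rx_buf *ring, int avail,
			    uint32_t *total_payload)
{
	if (avail < 1)
		return -1;

	uint16_t num_buf = ring[0].num_buffers;
	if (num_buf == 0 || num_buf > MAX_VEC_NUM || num_buf > avail)
		return -1;	/* malformed or truncated chain */

	*total_payload = 0;
	for (int i = 0; i < num_buf; i++)
		*total_payload += ring[i].payload_len;

	return num_buf;
}

int main(void)
{
	struct rx_buf ring[] = {
		{ .num_buffers = 3, .payload_len = 4000 },
		{ .num_buffers = 0, .payload_len = 4096 },
		{ .num_buffers = 0, .payload_len = 1500 },
	};
	uint32_t total;
	int used = gather_mergeable(ring, 3, &total);

	printf("buffers used: %d, payload bytes: %u\n", used, total);
	return 0;
}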
jiangyiwen
2018-Dec-14 08:11 UTC
[PATCH v2 3/5] VSOCK: support receive mergeable rx buffer in guest
On 2018/12/13 22:29, Michael S. Tsirkin wrote:> On Thu, Dec 13, 2018 at 10:38:09AM +0800, jiangyiwen wrote: >> Hi Michael, >> >> On 2018/12/12 23:31, Michael S. Tsirkin wrote: >>> On Wed, Dec 12, 2018 at 05:31:39PM +0800, jiangyiwen wrote: >>>> Guest receive mergeable rx buffer, it can merge >>>> scatter rx buffer into a big buffer and then copy >>>> to user space. >>>> >>>> In addition, it also use iovec to replace buf in struct >>>> virtio_vsock_pkt, keep tx and rx consistency. The only >>>> difference is now tx still uses a segment of continuous >>>> physical memory to implement. >>>> >>>> Signed-off-by: Yiwen Jiang <jiangyiwen at huawei.com> >>>> --- >>>> drivers/vhost/vsock.c | 31 +++++++--- >>>> include/linux/virtio_vsock.h | 6 +- >>>> net/vmw_vsock/virtio_transport.c | 105 ++++++++++++++++++++++++++++---- >>>> net/vmw_vsock/virtio_transport_common.c | 59 ++++++++++++++---- >>>> 4 files changed, 166 insertions(+), 35 deletions(-) >>> >>> >>> This was supposed to be a guest patch, why is vhost changed here? >>> >> >> In mergeable rx buff cases, it need to scatter big packets into several >> buffers, so I add kvec variable in struct virtio_vsock_pkt, at the same >> time, in order to keep tx and rx consistency, I use kvec to replace >> variable buf, because vhost use the variable pkt->buf, so this patch >> caused vhost is changed. > > You'd want to split these patches imho. >Ok, I should do it.>>>> diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c >>>> index dc52b0f..c7ab0dd 100644 >>>> --- a/drivers/vhost/vsock.c >>>> +++ b/drivers/vhost/vsock.c >>>> @@ -179,6 +179,8 @@ static int get_rx_bufs(struct vhost_virtqueue *vq, >>>> size_t nbytes; >>>> size_t len; >>>> s16 headcount; >>>> + size_t remain_len; >>>> + int i; >>>> >>>> spin_lock_bh(&vsock->send_pkt_list_lock); >>>> if (list_empty(&vsock->send_pkt_list)) { >>>> @@ -221,11 +223,19 @@ static int get_rx_bufs(struct vhost_virtqueue *vq, >>>> break; >>>> } >>>> >>>> - nbytes = copy_to_iter(pkt->buf, pkt->len, &iov_iter); >>>> - if (nbytes != pkt->len) { >>>> - virtio_transport_free_pkt(pkt); >>>> - vq_err(vq, "Faulted on copying pkt buf\n"); >>>> - break; >>>> + remain_len = pkt->len; >>>> + for (i = 0; i < pkt->nr_vecs; i++) { >>>> + int tmp_len; >>>> + >>>> + tmp_len = min(remain_len, pkt->vec[i].iov_len); >>>> + nbytes = copy_to_iter(pkt->vec[i].iov_base, tmp_len, &iov_iter); >>>> + if (nbytes != tmp_len) { >>>> + virtio_transport_free_pkt(pkt); >>>> + vq_err(vq, "Faulted on copying pkt buf\n"); >>>> + break; >>>> + } >>>> + >>>> + remain_len -= tmp_len; >>>> } >>>> >>>> vhost_add_used_n(vq, vq->heads, headcount); >>>> @@ -341,6 +351,7 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work) >>>> struct iov_iter iov_iter; >>>> size_t nbytes; >>>> size_t len; >>>> + void *buf; >>>> >>>> if (in != 0) { >>>> vq_err(vq, "Expected 0 input buffers, got %u\n", in); >>>> @@ -375,13 +386,17 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work) >>>> return NULL; >>>> } >>>> >>>> - pkt->buf = kmalloc(pkt->len, GFP_KERNEL); >>>> - if (!pkt->buf) { >>>> + buf = kmalloc(pkt->len, GFP_KERNEL); >>>> + if (!buf) { >>>> kfree(pkt); >>>> return NULL; >>>> } >>>> >>>> - nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter); >>>> + pkt->vec[0].iov_base = buf; >>>> + pkt->vec[0].iov_len = pkt->len; >>>> + pkt->nr_vecs = 1; >>>> + >>>> + nbytes = copy_from_iter(buf, pkt->len, &iov_iter); >>>> if (nbytes != pkt->len) { >>>> vq_err(vq, "Expected %u byte payload, got %zu bytes\n", >>>> pkt->len, nbytes); >>>> diff --git 
a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h >>>> index da9e1fe..734eeed 100644 >>>> --- a/include/linux/virtio_vsock.h >>>> +++ b/include/linux/virtio_vsock.h >>>> @@ -13,6 +13,8 @@ >>>> #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) >>>> #define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL >>>> #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) >>>> +/* virtio_vsock_pkt + max_pkt_len(default MAX_PKT_BUF_SIZE) */ >>>> +#define VIRTIO_VSOCK_MAX_VEC_NUM ((VIRTIO_VSOCK_MAX_PKT_BUF_SIZE / PAGE_SIZE) + 1) >>>> >>>> /* Virtio-vsock feature */ >>>> #define VIRTIO_VSOCK_F_MRG_RXBUF 0 /* Host can merge receive buffers. */ >>>> @@ -55,10 +57,12 @@ struct virtio_vsock_pkt { >>>> struct list_head list; >>>> /* socket refcnt not held, only use for cancellation */ >>>> struct vsock_sock *vsk; >>>> - void *buf; >>>> + struct kvec vec[VIRTIO_VSOCK_MAX_VEC_NUM]; >>>> + int nr_vecs; >>>> u32 len; >>>> u32 off; >>>> bool reply; >>>> + bool mergeable; >>>> }; >>>> >>>> struct virtio_vsock_pkt_info { >>>> diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c >>>> index c4a465c..148b58a 100644 >>>> --- a/net/vmw_vsock/virtio_transport.c >>>> +++ b/net/vmw_vsock/virtio_transport.c >>>> @@ -155,8 +155,10 @@ static int virtio_transport_send_pkt_loopback(struct virtio_vsock *vsock, >>>> >>>> sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); >>>> sgs[out_sg++] = &hdr; >>>> - if (pkt->buf) { >>>> - sg_init_one(&buf, pkt->buf, pkt->len); >>>> + if (pkt->len) { >>>> + /* Currently only support a segment of memory in tx */ >>>> + BUG_ON(pkt->vec[0].iov_len != pkt->len); >>>> + sg_init_one(&buf, pkt->vec[0].iov_base, pkt->vec[0].iov_len); >>>> sgs[out_sg++] = &buf; >>>> } >>>> >>>> @@ -304,23 +306,28 @@ static int fill_old_rx_buff(struct virtqueue *vq) >>>> struct virtio_vsock_pkt *pkt; >>>> struct scatterlist hdr, buf, *sgs[2]; >>>> int ret; >>>> + void *pkt_buf; >>>> >>>> pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); >>>> if (!pkt) >>>> return -ENOMEM; >>>> >>>> - pkt->buf = kmalloc(buf_len, GFP_KERNEL); >>>> - if (!pkt->buf) { >>>> + pkt_buf = kmalloc(buf_len, GFP_KERNEL); >>>> + if (!pkt_buf) { >>>> virtio_transport_free_pkt(pkt); >>>> return -ENOMEM; >>>> } >>>> >>>> + pkt->vec[0].iov_base = pkt_buf; >>>> + pkt->vec[0].iov_len = buf_len; >>>> + pkt->nr_vecs = 1; >>>> + >>>> pkt->len = buf_len; >>>> >>>> sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); >>>> sgs[0] = &hdr; >>>> >>>> - sg_init_one(&buf, pkt->buf, buf_len); >>>> + sg_init_one(&buf, pkt->vec[0].iov_base, buf_len); >>>> sgs[1] = &buf; >>>> ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL); >>>> if (ret) >>>> @@ -388,11 +395,78 @@ static bool virtio_transport_more_replies(struct virtio_vsock *vsock) >>>> return val < virtqueue_get_vring_size(vq); >>>> } >>>> >>>> +static struct virtio_vsock_pkt *receive_mergeable(struct virtqueue *vq, >>>> + struct virtio_vsock *vsock, unsigned int *total_len) >>>> +{ >>>> + struct virtio_vsock_pkt *pkt; >>>> + u16 num_buf; >>>> + void *buf; >>>> + unsigned int len; >>>> + size_t vsock_hlen = sizeof(struct virtio_vsock_pkt); >>>> + >>>> + buf = virtqueue_get_buf(vq, &len); >>>> + if (!buf) >>>> + return NULL; >>>> + >>>> + *total_len = len; >>>> + vsock->rx_buf_nr--; >>>> + >>>> + if (unlikely(len < vsock_hlen)) { >>>> + put_page(virt_to_head_page(buf)); >>>> + return NULL; >>>> + } >>>> + >>>> + pkt = buf; >>>> + num_buf = le16_to_cpu(pkt->mrg_rxbuf_hdr.num_buffers); >>>> + if (!num_buf || num_buf > VIRTIO_VSOCK_MAX_VEC_NUM) { >>>> + put_page(virt_to_head_page(buf)); >>>> + return 
NULL; >>>> + } >>> >>> So everything just stops going, and host and user don't even >>> know what the reason is. And not only that - the next >>> packet will be corrupted because we skipped the first one. >>> >>> >> >> I understand this case will not encountered unless the code has >> *BUG*, like Host send some problematic packages (shorten/longer than >> expected). In this case, I think we should ignore/drop these packets. > > If there's a specific packet length expected, e.g. in this case vector > size, it needs to be negotiated between host and guest in some way. >Actually, I still keep the negotiated in the virtio_transport_rx_work(), in the receive_mergeable() I will calculate total len, and compared with the pkt->hdr.len(host packed).>>> >>>> + >>>> + /* Initialize pkt residual structure */ >>>> + memset(&pkt->work, 0, vsock_hlen - sizeof(struct virtio_vsock_hdr) - >>>> + sizeof(struct virtio_vsock_mrg_rxbuf_hdr)); >>>> + >>>> + pkt->mergeable = true; >>>> + pkt->len = le32_to_cpu(pkt->hdr.len); >>>> + if (!pkt->len) >>>> + return pkt; >>>> + >>>> + len -= vsock_hlen; >>>> + if (len) { >>>> + pkt->vec[pkt->nr_vecs].iov_base = buf + vsock_hlen; >>>> + pkt->vec[pkt->nr_vecs].iov_len = len; >>>> + /* Shared page with pkt, so get page in advance */ >>>> + get_page(virt_to_head_page(buf)); >>>> + pkt->nr_vecs++; >>>> + } >>>> + >>>> + while (--num_buf) { >>>> + buf = virtqueue_get_buf(vq, &len); >>>> + if (!buf) >>>> + goto err; >>>> + >>>> + *total_len += len; >>>> + vsock->rx_buf_nr--; >>>> + >>>> + pkt->vec[pkt->nr_vecs].iov_base = buf; >>>> + pkt->vec[pkt->nr_vecs].iov_len = len; >>>> + pkt->nr_vecs++; >>>> + } >>>> + >>>> + return pkt; >>>> +err: >>>> + virtio_transport_free_pkt(pkt); >>>> + return NULL; >>>> +} >>>> + >>>> static void virtio_transport_rx_work(struct work_struct *work) >>>> { >>>> struct virtio_vsock *vsock >>>> container_of(work, struct virtio_vsock, rx_work); >>>> struct virtqueue *vq; >>>> + size_t vsock_hlen = vsock->mergeable ? sizeof(struct virtio_vsock_pkt) : >>>> + sizeof(struct virtio_vsock_hdr); >>>> >>>> vq = vsock->vqs[VSOCK_VQ_RX]; >>>> >>>> @@ -412,21 +486,26 @@ static void virtio_transport_rx_work(struct work_struct *work) >>>> goto out; >>>> } >>>> >>>> - pkt = virtqueue_get_buf(vq, &len); >>>> - if (!pkt) { >>>> - break; >>>> - } >>>> + if (likely(vsock->mergeable)) { >>>> + pkt = receive_mergeable(vq, vsock, &len); >>>> + if (!pkt) >>>> + break; >>>> + } else { >>>> + pkt = virtqueue_get_buf(vq, &len); >>>> + if (!pkt) >>>> + break; >>>> >>> >>> So looking at it, this seems to be the main source of the gain. >>> But why does this require host/guest changes? >>> >>> >>> The way I see it: >>> - get a buffer and create an skb >>> - get the next one, check header matches, if yes >>> tack it on the skb as a fragment. If not then >>> don't, deliver previous one and queue the new one. >>> >>> >> >> Vhost change reason I explain as above, and I hope use kvec >> to instead buf, after all buf only can express a contiguous >> physical memory. >> >> Thanks, >> Yiwen. > > > I got the reason but I am not yet convinced it's a good one. > You don't necessarily need host to skip headers > in all but the first chunk. Guest can do this just as well. > >I understand what you mean, right, only waste some memory, but it can gain more convenient, like I don't need to skip headers and host don't need to transport total pkt. I will fix it in the later version. 
Thanks, Yiwen.> >>> >>>> - vsock->rx_buf_nr--; >>>> + vsock->rx_buf_nr--; >>>> + } >>>> >>>> /* Drop short/long packets */ >>>> - if (unlikely(len < sizeof(pkt->hdr) || >>>> - len > sizeof(pkt->hdr) + pkt->len)) { >>>> + if (unlikely(len < vsock_hlen || >>>> + len > vsock_hlen + pkt->len)) { >>>> virtio_transport_free_pkt(pkt); >>>> continue; >>>> } >>>> >>>> - pkt->len = len - sizeof(pkt->hdr); >>>> + pkt->len = len - vsock_hlen; >>>> virtio_transport_deliver_tap_pkt(pkt); >>>> virtio_transport_recv_pkt(pkt); >>>> } >>>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c >>>> index 3ae3a33..123a8b6 100644 >>>> --- a/net/vmw_vsock/virtio_transport_common.c >>>> +++ b/net/vmw_vsock/virtio_transport_common.c >>>> @@ -44,6 +44,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void) >>>> { >>>> struct virtio_vsock_pkt *pkt; >>>> int err; >>>> + void *buf = NULL; >>>> >>>> pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); >>>> if (!pkt) >>>> @@ -62,12 +63,16 @@ static const struct virtio_transport *virtio_transport_get_ops(void) >>>> pkt->vsk = info->vsk; >>>> >>>> if (info->msg && len > 0) { >>>> - pkt->buf = kmalloc(len, GFP_KERNEL); >>>> - if (!pkt->buf) >>>> + buf = kmalloc(len, GFP_KERNEL); >>>> + if (!buf) >>>> goto out_pkt; >>>> - err = memcpy_from_msg(pkt->buf, info->msg, len); >>>> + err = memcpy_from_msg(buf, info->msg, len); >>>> if (err) >>>> goto out; >>>> + >>>> + pkt->vec[0].iov_base = buf; >>>> + pkt->vec[0].iov_len = len; >>>> + pkt->nr_vecs = 1; >>>> } >>>> >>>> trace_virtio_transport_alloc_pkt(src_cid, src_port, >>>> @@ -80,7 +85,7 @@ static const struct virtio_transport *virtio_transport_get_ops(void) >>>> return pkt; >>>> >>>> out: >>>> - kfree(pkt->buf); >>>> + kfree(buf); >>>> out_pkt: >>>> kfree(pkt); >>>> return NULL; >>>> @@ -92,6 +97,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) >>>> struct virtio_vsock_pkt *pkt = opaque; >>>> struct af_vsockmon_hdr *hdr; >>>> struct sk_buff *skb; >>>> + int i; >>>> >>>> skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len, >>>> GFP_ATOMIC); >>>> @@ -134,7 +140,8 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) >>>> skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr)); >>>> >>>> if (pkt->len) { >>>> - skb_put_data(skb, pkt->buf, pkt->len); >>>> + for (i = 0; i < pkt->nr_vecs; i++) >>>> + skb_put_data(skb, pkt->vec[i].iov_base, pkt->vec[i].iov_len); >>>> } >>>> >>>> return skb; >>>> @@ -260,6 +267,9 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk, >>>> >>>> spin_lock_bh(&vvs->rx_lock); >>>> while (total < len && !list_empty(&vvs->rx_queue)) { >>>> + size_t copy_bytes, last_vec_total = 0, vec_off; >>>> + int i; >>>> + >>>> pkt = list_first_entry(&vvs->rx_queue, >>>> struct virtio_vsock_pkt, list); >>>> >>>> @@ -272,14 +282,28 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk, >>>> */ >>>> spin_unlock_bh(&vvs->rx_lock); >>>> >>>> - err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes); >>>> - if (err) >>>> - goto out; >>>> + for (i = 0; i < pkt->nr_vecs; i++) { >>>> + if (pkt->off > last_vec_total + pkt->vec[i].iov_len) { >>>> + last_vec_total += pkt->vec[i].iov_len; >>>> + continue; >>>> + } >>>> + >>>> + vec_off = pkt->off - last_vec_total; >>>> + copy_bytes = min(pkt->vec[i].iov_len - vec_off, bytes); >>>> + err = memcpy_to_msg(msg, pkt->vec[i].iov_base + vec_off, >>>> + copy_bytes); >>>> + if (err) >>>> + goto out; >>>> + >>>> + bytes -= copy_bytes; >>>> + pkt->off += copy_bytes; >>>> + total += 
copy_bytes; >>>> + last_vec_total += pkt->vec[i].iov_len; >>>> + if (!bytes) >>>> + break; >>>> + } >>>> >>>> spin_lock_bh(&vvs->rx_lock); >>>> - >>>> - total += bytes; >>>> - pkt->off += bytes; >>>> if (pkt->off == pkt->len) { >>>> virtio_transport_dec_rx_pkt(vvs, pkt); >>>> list_del(&pkt->list); >>>> @@ -1050,8 +1074,17 @@ void virtio_transport_recv_pkt(struct virtio_vsock_pkt *pkt) >>>> >>>> void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt) >>>> { >>>> - kfree(pkt->buf); >>>> - kfree(pkt); >>>> + int i; >>>> + >>>> + if (pkt->mergeable) { >>>> + for (i = 0; i < pkt->nr_vecs; i++) >>>> + put_page(virt_to_head_page(pkt->vec[i].iov_base)); >>>> + put_page(virt_to_head_page((void *)pkt)); >>>> + } else { >>>> + for (i = 0; i < pkt->nr_vecs; i++) >>>> + kfree(pkt->vec[i].iov_base); >>>> + kfree(pkt); >>>> + } >>>> } >>>> EXPORT_SYMBOL_GPL(virtio_transport_free_pkt); >>>> >>>> -- >>>> 1.8.3.1 >>>> >>> >>> . >>> >> > > . >
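For completeness, the dequeue-side hunk discussed in this thread walks pkt->vec[] using pkt->off to find where to resume copying into the user's msghdr via memcpy_to_msg(). Below is a self-contained userspace model of that offset walk; struct seg and copy_from_off() are hypothetical names chosen for illustration, not the kernel code itself.

#include <stdio.h>
#include <string.h>

struct seg {
	const char *base;
	size_t      len;
};

/* Copy up to "bytes" bytes starting at logical offset "off" from an
 * array of segments into dst, skipping segments the offset has already
 * passed. Returns the number of bytes actually copied. */
static size_t copy_from_off(char *dst, const struct seg *vec, int nr_vecs,
			    size_t off, size_t bytes)
{
	size_t seen = 0, copied = 0;

	for (int i = 0; i < nr_vecs && bytes; i++) {
		if (off >= seen + vec[i].len) {	/* offset is past this segment */
			seen += vec[i].len;
			continue;
		}

		size_t vec_off = off - seen;
		size_t chunk = vec[i].len - vec_off;

		if (chunk > bytes)
			chunk = bytes;
		memcpy(dst + copied, vec[i].base + vec_off, chunk);

		copied += chunk;
		bytes  -= chunk;
		off    += chunk;
		seen   += vec[i].len;
	}
	return copied;
}

int main(void)
{
	struct seg vec[] = {
		{ "0123456789", 10 },
		{ "abcdefghij", 10 },
	};
	char out[32] = { 0 };

	size_t n = copy_from_off(out, vec, 2, 7, 8);
	printf("copied %zu bytes: \"%s\"\n", n, out);	/* expect "789abcde" */
	return 0;
}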