Stefano Garzarella
2023-Apr-19 09:29 UTC
[PATCH RFC net-next v2 1/4] virtio/vsock: support dgram
On Fri, Apr 14, 2023 at 12:25:57AM +0000, Bobby Eshleman wrote:>This commit adds support for datagrams over virtio/vsock. > >Message boundaries are preserved on a per-skb and per-vq entry basis. >Messages are copied in whole from the user to an SKB, which in turn is >added to the scatterlist for the virtqueue in whole for the device. >Messages do not straddle skbs and they do not straddle packets. >Messages may be truncated by the receiving user if their buffer is >shorter than the message. > >Other properties of vsock datagrams: >- Datagrams self-throttle at the per-socket sk_sndbuf threshold. >- The same virtqueue is used as is used for streams and seqpacket flows >- Credits are not used for datagrams >- Packets are dropped silently by the device, which means the virtqueue > will still get kicked even during high packet loss, so long as the > socket does not exceed sk_sndbuf.So, IIUC the device will allocate the skb, then in virtio_transport_recv_dgram(), when we call sock_queue_rcv_skb(), if sk_rcvbuf is full, the skb is freed, right?> >Future work might include finding a way to reduce the virtqueue kick >rate for datagram flows with high packet loss. > >One outstanding issue with this commit is that it re-uses the stream >binding code and table, which means that there can not simultaneously be >VSOCK dgram and VSOCK stream/seqpacket of same port and CID. This should >be changed before undoing the RFC tag.Okay, thanks for mentioning!> >Signed-off-by: Bobby Eshleman <bobby.eshleman at bytedance.com> >--- > drivers/vhost/vsock.c | 2 +- > include/net/af_vsock.h | 1 + > include/uapi/linux/virtio_vsock.h | 1 + > net/vmw_vsock/af_vsock.c | 26 ++++- > net/vmw_vsock/virtio_transport.c | 2 +- > net/vmw_vsock/virtio_transport_common.c | 199 ++++++++++++++++++++++++++++---- > 6 files changed, 204 insertions(+), 27 deletions(-) > >diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c >index 6578db78f0ae..dff6ee1c479b 100644 >--- a/drivers/vhost/vsock.c >+++ b/drivers/vhost/vsock.c >@@ -921,7 +921,7 @@ static int __init vhost_vsock_init(void) > int ret; > > ret = vsock_core_register(&vhost_transport.transport, >- VSOCK_TRANSPORT_F_H2G); >+ VSOCK_TRANSPORT_F_H2G | VSOCK_TRANSPORT_F_DGRAM);IIRC we don't support multiple VSOCK_TRANSPORT_F_DGRAM transports, so I think we need to address also this case. The problem with stream was that vmci_transport uses MODULE_ALIAS_NETPROTO(PF_VSOCK). So, if a vsock socket is created before any transports loaded, vmci_transport is automatically loaded, then we can't laod any other transport providing F_DGRAM. Maybe we can move vsock_core_register(..., VSOCK_TRANSPORT_F_DGRAM) in vmci_vsock_transport_cb() for VMCI, but we need to think carefully about it.> if (ret < 0) > return ret; > >diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h >index 0e7504a42925..57af28fede19 100644 >--- a/include/net/af_vsock.h >+++ b/include/net/af_vsock.h >@@ -80,6 +80,7 @@ s64 vsock_stream_has_data(struct vsock_sock *vsk); > s64 vsock_stream_has_space(struct vsock_sock *vsk); > struct sock *vsock_create_connected(struct sock *parent); > void vsock_data_ready(struct sock *sk); >+int vsock_bind_stream(struct vsock_sock *vsk, struct sockaddr_vm *addr); > > /**** TRANSPORT ****/ > >diff --git a/include/uapi/linux/virtio_vsock.h b/include/uapi/linux/virtio_vsock.h >index 64738838bee5..331be28b1d30 100644 >--- a/include/uapi/linux/virtio_vsock.h >+++ b/include/uapi/linux/virtio_vsock.h >@@ -69,6 +69,7 @@ struct virtio_vsock_hdr { > enum virtio_vsock_type { > VIRTIO_VSOCK_TYPE_STREAM = 1, > VIRTIO_VSOCK_TYPE_SEQPACKET = 2, >+ VIRTIO_VSOCK_TYPE_DGRAM = 3, > }; > > enum virtio_vsock_op { >diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c >index 413407bb646c..46b3f35e3adc 100644 >--- a/net/vmw_vsock/af_vsock.c >+++ b/net/vmw_vsock/af_vsock.c >@@ -677,6 +677,19 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk, > return 0; > } > >+int vsock_bind_stream(struct vsock_sock *vsk, >+ struct sockaddr_vm *addr) >+{ >+ int retval; >+ >+ spin_lock_bh(&vsock_table_lock); >+ retval = __vsock_bind_connectible(vsk, addr); >+ spin_unlock_bh(&vsock_table_lock); >+ >+ return retval; >+} >+EXPORT_SYMBOL(vsock_bind_stream); >+ > static int __vsock_bind_dgram(struct vsock_sock *vsk, > struct sockaddr_vm *addr) > { >@@ -2453,11 +2466,16 @@ int vsock_core_register(const struct vsock_transport *t, int features) > } > > if (features & VSOCK_TRANSPORT_F_DGRAM) { >- if (t_dgram) { >- err = -EBUSY; >- goto err_busy; >+ /* XXX: always chose the G2H variant over others, support nesting later */Okay, please mention also in the commit description that we will fix this before removing RFC.>+ if (features & VSOCK_TRANSPORT_F_G2H) { >+ if (t_dgram) >+ pr_warn("vsock: preferring g2h transport for dgram\n"); >+ t_dgram = t; >+ } >+ >+ if (!t_dgram) { >+ t_dgram = t; > } >- t_dgram = t; > }Would be better to split this patch in 2 (changes in the core, changes in virtio/vhost-vsock).> > if (features & VSOCK_TRANSPORT_F_LOCAL) { >diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c >index e95df847176b..582c6c0f788f 100644 >--- a/net/vmw_vsock/virtio_transport.c >+++ b/net/vmw_vsock/virtio_transport.c >@@ -775,7 +775,7 @@ static int __init virtio_vsock_init(void) > return -ENOMEM; > > ret = vsock_core_register(&virtio_transport.transport, >- VSOCK_TRANSPORT_F_G2H); >+ VSOCK_TRANSPORT_F_G2H | VSOCK_TRANSPORT_F_DGRAM); > if (ret) > goto out_wq; > >diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c >index e4878551f140..925acface893 100644 >--- a/net/vmw_vsock/virtio_transport_common.c >+++ b/net/vmw_vsock/virtio_transport_common.c >@@ -37,6 +37,35 @@ virtio_transport_get_ops(struct vsock_sock *vsk) > return container_of(t, struct virtio_transport, transport); > } > >+/* Requires info->msg and info->vsk */ >+static struct sk_buff * >+virtio_transport_sock_alloc_send_skb(struct virtio_vsock_pkt_info *info, unsigned int size, >+ gfp_t mask, int *err) >+{ >+ struct sk_buff *skb; >+ struct sock *sk; >+ int noblock; >+ >+ if (size < VIRTIO_VSOCK_SKB_HEADROOM) { >+ *err = -EINVAL;What about using ERR_PTR, instead of passing *err?>+ return NULL; >+ } >+ >+ if (info->msg) >+ noblock = info->msg->msg_flags & MSG_DONTWAIT; >+ else >+ noblock = 1; >+ >+ sk = sk_vsock(info->vsk); >+ sk->sk_allocation = mask; >+ skb = sock_alloc_send_skb(sk, size, noblock, err); >+ if (!skb) >+ return NULL; >+ >+ skb_reserve(skb, VIRTIO_VSOCK_SKB_HEADROOM); >+ return skb; >+} >+ > /* Returns a new packet on success, otherwise returns NULL. > * > * If NULL is returned, errp is set to a negative errno. >@@ -47,7 +76,8 @@ virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info, > u32 src_cid, > u32 src_port, > u32 dst_cid, >- u32 dst_port) >+ u32 dst_port, >+ int *errp) > { > const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len; > struct virtio_vsock_hdr *hdr; >@@ -55,9 +85,21 @@ virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info, > void *payload; > int err; > >- skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL); >- if (!skb) >+ /* dgrams do not use credits, self-throttle according to sk_sndbuf >+ * using sock_alloc_send_skb. This helps avoid triggering the OOM. >+ */ >+ if (info->vsk && info->type == VIRTIO_VSOCK_TYPE_DGRAM) { >+ skb = virtio_transport_sock_alloc_send_skb(info, skb_len, GFP_KERNEL, &err); >+ } else { >+ skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL); >+ if (!skb) >+ err = -ENOMEM; >+ } >+ >+ if (!skb) { >+ *errp = err; > return NULL; >+ } > > hdr = virtio_vsock_hdr(skb); > hdr->type = cpu_to_le16(info->type); >@@ -102,6 +144,7 @@ virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info, > return skb; > > out: >+ *errp = err; > kfree_skb(skb); > return NULL; > } >@@ -183,7 +226,9 @@ EXPORT_SYMBOL_GPL(virtio_transport_deliver_tap_pkt); > > static u16 virtio_transport_get_type(struct sock *sk) > { >- if (sk->sk_type == SOCK_STREAM) >+ if (sk->sk_type == SOCK_DGRAM) >+ return VIRTIO_VSOCK_TYPE_DGRAM; >+ else if (sk->sk_type == SOCK_STREAM) > return VIRTIO_VSOCK_TYPE_STREAM; > else > return VIRTIO_VSOCK_TYPE_SEQPACKET; >@@ -239,11 +284,10 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, > > skb = virtio_transport_alloc_skb(info, skb_len, > src_cid, src_port, >- dst_cid, dst_port); >- if (!skb) { >- ret = -ENOMEM; >+ dst_cid, dst_port, >+ &ret); >+ if (!skb) > break; >- } > > virtio_transport_inc_tx_pkt(vvs, skb); > >@@ -588,7 +632,56 @@ virtio_transport_dgram_dequeue(struct vsock_sock *vsk, > struct msghdr *msg, > size_t len, int flags) > { >- return -EOPNOTSUPP; >+ struct sk_buff *skb; >+ struct sock *sk; >+ size_t bytes; >+ int err; >+ >+ if (flags & MSG_OOB || flags & MSG_ERRQUEUE) >+ return -EOPNOTSUPP; >+ >+ sk = sk_vsock(vsk); >+ err = 0; >+ >+ skb = skb_recv_datagram(sk, flags, &err); >+ if (!skb) >+ goto out; >+ >+ /* If the user buffer is too short then truncate the message and set >+ * MSG_TRUNC. The remainder will be discarded when the skb is freed. >+ */ >+ if (len < skb->len) { >+ bytes = len; >+ msg->msg_flags |= MSG_TRUNC; >+ } else { >+ bytes = skb->len; >+ } >+ >+ /* Copy to msg from skb->data. >+ * virtio_vsock_alloc_skb() should have already set >+ * the skb pointers correctly. That is, skb->data >+ * should not still be at skb->head. >+ */ >+ WARN_ON(skb->data == skb->head); >+ err = skb_copy_datagram_msg(skb, 0, msg, bytes); >+ if (err) >+ goto out; >+ >+ /* On success, return the number bytes copied to the user buffer */ >+ err = bytes; >+ >+ if (msg->msg_name) { >+ /* Provide the address of the sender. */ >+ DECLARE_SOCKADDR(struct sockaddr_vm *, vm_addr, msg->msg_name); >+ >+ vsock_addr_init(vm_addr, le64_to_cpu(virtio_vsock_hdr(skb)->src_cid), >+ le32_to_cpu(virtio_vsock_hdr(skb)->src_port)); >+ msg->msg_namelen = sizeof(*vm_addr); >+ } >+ >+out: >+ skb_free_datagram(&vsk->sk, skb); >+ return err; > } > EXPORT_SYMBOL_GPL(virtio_transport_dgram_dequeue); > >@@ -793,13 +886,13 @@ EXPORT_SYMBOL_GPL(virtio_transport_stream_allow); > int virtio_transport_dgram_bind(struct vsock_sock *vsk, > struct sockaddr_vm *addr) > { >- return -EOPNOTSUPP; >+ return vsock_bind_stream(vsk, addr); > } > EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind); > > bool virtio_transport_dgram_allow(u32 cid, u32 port) > { >- return false; >+ return true;Should we check if F_DGRAM is negotiated?> } > EXPORT_SYMBOL_GPL(virtio_transport_dgram_allow); > >@@ -835,7 +928,37 @@ virtio_transport_dgram_enqueue(struct vsock_sock *vsk, > struct msghdr *msg, > size_t dgram_len) > { >- return -EOPNOTSUPP; >+ struct virtio_vsock_pkt_info info = { >+ .op = VIRTIO_VSOCK_OP_RW, >+ .msg = msg, >+ .vsk = vsk, >+ .type = VIRTIO_VSOCK_TYPE_DGRAM, >+ }; >+ const struct virtio_transport *t_ops; >+ u32 src_cid, src_port; >+ struct sk_buff *skb; >+ int err; >+ >+ if (dgram_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) >+ return -EMSGSIZE; >+ >+ t_ops = virtio_transport_get_ops(vsk); >+ if (unlikely(!t_ops)) >+ return -EFAULT; >+ >+ src_cid = t_ops->transport.get_local_cid(); >+ src_port = vsk->local_addr.svm_port; >+ >+ skb = virtio_transport_alloc_skb(&info, dgram_len, >+ src_cid, src_port, >+ remote_addr->svm_cid, >+ remote_addr->svm_port, >+ &err); >+ >+ if (!skb) >+ return err; >+ >+ return t_ops->send_pkt(skb); > } > EXPORT_SYMBOL_GPL(virtio_transport_dgram_enqueue); > >@@ -892,6 +1015,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, > .reply = true, > }; > struct sk_buff *reply; >+ int err; > > /* Send RST only if the original pkt is not a RST pkt */ > if (le16_to_cpu(hdr->op) == VIRTIO_VSOCK_OP_RST) >@@ -904,9 +1028,10 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, > le64_to_cpu(hdr->dst_cid), > le32_to_cpu(hdr->dst_port), > le64_to_cpu(hdr->src_cid), >- le32_to_cpu(hdr->src_port)); >+ le32_to_cpu(hdr->src_port), >+ &err); > if (!reply) >- return -ENOMEM; >+ return err; > > return t->send_pkt(reply); > } >@@ -1126,6 +1251,25 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, > kfree_skb(skb); > } > >+/* This function takes ownership of the skb. >+ * >+ * It either places the skb on the sk_receive_queue or frees it. >+ */ >+static int >+virtio_transport_recv_dgram(struct sock *sk, struct sk_buff *skb)We don't use the return value, so this can be void.>+{ >+ int err; >+ >+ err = sock_queue_rcv_skb(sk, skb); >+ if (err < 0) { >+ kfree_skb(skb); >+ return err; >+ } >+ >+ sk->sk_data_ready(sk); >+ return 0; >+} >+ > static int > virtio_transport_recv_connected(struct sock *sk, > struct sk_buff *skb) >@@ -1289,7 +1433,8 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb, > static bool virtio_transport_valid_type(u16 type) > { > return (type == VIRTIO_VSOCK_TYPE_STREAM) || >- (type == VIRTIO_VSOCK_TYPE_SEQPACKET); >+ (type == VIRTIO_VSOCK_TYPE_SEQPACKET) || >+ (type == VIRTIO_VSOCK_TYPE_DGRAM); > } > > /* We are under the virtio-vsock's vsock->rx_lock or vhost-vsock's vq->mutex >@@ -1303,22 +1448,25 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, > struct vsock_sock *vsk; > struct sock *sk; > bool space_available; >+ u16 type; > > vsock_addr_init(&src, le64_to_cpu(hdr->src_cid), > le32_to_cpu(hdr->src_port)); > vsock_addr_init(&dst, le64_to_cpu(hdr->dst_cid), > le32_to_cpu(hdr->dst_port)); > >+ type = le16_to_cpu(hdr->type); >+ > trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port, > dst.svm_cid, dst.svm_port, > le32_to_cpu(hdr->len), >- le16_to_cpu(hdr->type), >+ type, > le16_to_cpu(hdr->op), > le32_to_cpu(hdr->flags), > le32_to_cpu(hdr->buf_alloc), > le32_to_cpu(hdr->fwd_cnt)); > >- if (!virtio_transport_valid_type(le16_to_cpu(hdr->type))) { >+ if (!virtio_transport_valid_type(type)) { > (void)virtio_transport_reset_no_sock(t, skb); > goto free_pkt; > } >@@ -1330,13 +1478,15 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, > if (!sk) { > sk = vsock_find_bound_socket(&dst); > if (!sk) { >- (void)virtio_transport_reset_no_sock(t, skb); >+ if (type != VIRTIO_VSOCK_TYPE_DGRAM) >+ (void)virtio_transport_reset_no_sock(t, skb); > goto free_pkt; > } > } > >- if (virtio_transport_get_type(sk) != le16_to_cpu(hdr->type)) { >- (void)virtio_transport_reset_no_sock(t, skb); >+ if (virtio_transport_get_type(sk) != type) { >+ if (type != VIRTIO_VSOCK_TYPE_DGRAM) >+ (void)virtio_transport_reset_no_sock(t, skb); > sock_put(sk); > goto free_pkt; > } >@@ -1352,12 +1502,18 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, > > /* Check if sk has been closed before lock_sock */ > if (sock_flag(sk, SOCK_DONE)) { >- (void)virtio_transport_reset_no_sock(t, skb); >+ if (type != VIRTIO_VSOCK_TYPE_DGRAM) >+ (void)virtio_transport_reset_no_sock(t, skb); > release_sock(sk); > sock_put(sk); > goto free_pkt; > } > >+ if (sk->sk_type == SOCK_DGRAM) { >+ virtio_transport_recv_dgram(sk, skb); >+ goto out; >+ } >+ > space_available = virtio_transport_space_update(sk, skb); > > /* Update CID in case it has changed after a transport reset event */ >@@ -1389,6 +1545,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, > break; > } > >+out: > release_sock(sk); > > /* Release refcnt obtained when we fetched this socket out of the > >-- >2.30.2 >