Mike Christie
2023-May-24 23:34 UTC
[PATCH 0/3] vhost-scsi: Fix IO hangs when using windows
The following patches were made over Linus's tree and fix an issue where windows guests will send iovecs with offset/lengths that result in IOs that are not aligned to 512. The LIO layer will then send them to Linux's block layer but it requires 512 byte alignment, so depending on the block driver being used we will get IO errors or hung IO. The following patches have vhost-scsi detect when windows sends these IOs and copy them to a bounce buffer. It then does some cleanup in the related code.
Mike Christie
2023-May-24 23:34 UTC
[PATCH 1/3] vhost-scsi: Fix alignment handling with windows
The linux block layer requires bios/requests to have lengths with a 512 byte alignment. Some drivers/layers like dm-crypt and the directi IO code will test for it and just fail. Other drivers like SCSI just assume the requirement is met and will end up in infinte retry loops. The problem for drivers like SCSI is that it uses functions like blk_rq_cur_sectors and blk_rq_sectors which divide the request's length by 512. If there's lefovers then it just gets dropped. But other code in the block/scsi layer may use blk_rq_bytes/blk_rq_cur_bytes and end up thinking there is still data left and try to retry the cmd. We can then end up getting stuck in retry loops where part of the block/scsi thinks there is data left, but other parts think we want to do IOs of zero length. Linux will always check for alignment, but windows will not. When vhost-scsi then translates the iovec it gets from a windows guest to a scatterlist, we can end up with sg items where the sg->length is not divisible by 512 due to the misaligned offset: sg[0].offset = 255; sg[0].length = 3841; sg... sg[N].offset = 0; sg[N].length = 255; When the lio backends then convert the SG to bios or other iovecs, we end up sending them with the same misaligned values and can hit the issues above. This just has us drop down to allocating a temp page and copying the data when this happens. Because this adds a check that needs to loop over the iovec in the main IO path, this patch adds an attribute that can be turned on for just windows. Signed-off-by: Mike Christie <michael.christie at oracle.com> --- drivers/vhost/scsi.c | 174 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 151 insertions(+), 23 deletions(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index bb10fa4bb4f6..dbad8fb579eb 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -25,6 +25,8 @@ #include <linux/fs.h> #include <linux/vmalloc.h> #include <linux/miscdevice.h> +#include <linux/blk_types.h> +#include <linux/bio.h> #include <asm/unaligned.h> #include <scsi/scsi_common.h> #include <scsi/scsi_proto.h> @@ -75,6 +77,9 @@ struct vhost_scsi_cmd { u32 tvc_prot_sgl_count; /* Saved unpacked SCSI LUN for vhost_scsi_target_queue_cmd() */ u32 tvc_lun; + u32 copied_iov:1; + const void *saved_iter_addr; + struct iov_iter saved_iter; /* Pointer to the SGL formatted memory from virtio-scsi */ struct scatterlist *tvc_sgl; struct scatterlist *tvc_prot_sgl; @@ -107,6 +112,7 @@ struct vhost_scsi_nexus { struct vhost_scsi_tpg { /* Vhost port target portal group tag for TCM */ u16 tport_tpgt; + bool check_io_alignment; /* Used to track number of TPG Port/Lun Links wrt to explict I_T Nexus shutdown */ int tv_tpg_port_count; /* Used for vhost_scsi device reference to tpg_nexus, protected by tv_tpg_mutex */ @@ -328,8 +334,13 @@ static void vhost_scsi_release_cmd_res(struct se_cmd *se_cmd) int i; if (tv_cmd->tvc_sgl_count) { - for (i = 0; i < tv_cmd->tvc_sgl_count; i++) - put_page(sg_page(&tv_cmd->tvc_sgl[i])); + for (i = 0; i < tv_cmd->tvc_sgl_count; i++) { + if (tv_cmd->copied_iov) + __free_page(sg_page(&tv_cmd->tvc_sgl[i])); + else + put_page(sg_page(&tv_cmd->tvc_sgl[i])); + } + kfree(tv_cmd->saved_iter_addr); } if (tv_cmd->tvc_prot_sgl_count) { for (i = 0; i < tv_cmd->tvc_prot_sgl_count; i++) @@ -502,6 +513,27 @@ static void vhost_scsi_evt_work(struct vhost_work *work) mutex_unlock(&vq->mutex); } +static int vhost_scsi_copy_sgl_to_iov(struct vhost_scsi_cmd *cmd) +{ + struct iov_iter *iter = &cmd->saved_iter; + struct scatterlist *sg = cmd->tvc_sgl; + struct page *page; + size_t len; + int i; + + for (i = 0; i < cmd->tvc_sgl_count; i++) { + page = sg_page(&sg[i]); + len = sg[i].length; + + if (copy_page_to_iter(page, 0, len, iter) != len) { + pr_err("Could not copy data. Error %lu\n", len); + return -1; + } + } + + return 0; +} + /* Fill in status and signal that we are done processing this command * * This is scheduled in the vhost work queue so we are called with the owner @@ -525,15 +557,20 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) pr_debug("%s tv_cmd %p resid %u status %#02x\n", __func__, cmd, se_cmd->residual_count, se_cmd->scsi_status); - memset(&v_rsp, 0, sizeof(v_rsp)); - v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq, se_cmd->residual_count); - /* TODO is status_qualifier field needed? */ - v_rsp.status = se_cmd->scsi_status; - v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq, - se_cmd->scsi_sense_length); - memcpy(v_rsp.sense, cmd->tvc_sense_buf, - se_cmd->scsi_sense_length); + + if (cmd->saved_iter_addr && vhost_scsi_copy_sgl_to_iov(cmd)) { + v_rsp.response = VIRTIO_SCSI_S_BAD_TARGET; + } else { + v_rsp.resid = cpu_to_vhost32(cmd->tvc_vq, + se_cmd->residual_count); + /* TODO is status_qualifier field needed? */ + v_rsp.status = se_cmd->scsi_status; + v_rsp.sense_len = cpu_to_vhost32(cmd->tvc_vq, + se_cmd->scsi_sense_length); + memcpy(v_rsp.sense, cmd->tvc_sense_buf, + se_cmd->scsi_sense_length); + } iov_iter_init(&iov_iter, ITER_DEST, cmd->tvc_resp_iov, cmd->tvc_in_iovs, sizeof(v_rsp)); @@ -682,7 +719,52 @@ vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, bool write, } static int -vhost_scsi_mapal(struct vhost_scsi_cmd *cmd, +vhost_scsi_copy_iov_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter, + struct scatterlist *sg, int sg_count) +{ + size_t len = iov_iter_count(iter); + unsigned int nbytes = 0; + struct page *page; + int i; + + if (cmd->tvc_data_direction == DMA_FROM_DEVICE) { + cmd->saved_iter_addr = dup_iter(&cmd->saved_iter, iter, + GFP_KERNEL); + if (!cmd->saved_iter_addr) + return -ENOMEM; + } + + for (i = 0; i < sg_count; i++) { + page = alloc_page(GFP_KERNEL); + if (!page) { + i--; + goto err; + } + + nbytes = min_t(unsigned int, PAGE_SIZE, len); + sg_set_page(&sg[i], page, nbytes, 0); + + if (cmd->tvc_data_direction == DMA_TO_DEVICE && + copy_page_from_iter(page, 0, nbytes, iter) != nbytes) + goto err; + + len -= nbytes; + } + + cmd->copied_iov = 1; + return 0; + +err: + pr_err("Could not read %u bytes\n", nbytes); + + for (; i >= 0; i--) + __free_page(sg_page(&sg[i])); + kfree(cmd->saved_iter_addr); + return -ENOMEM; +} + +static int +vhost_scsi_mapal(struct vhost_scsi_tpg *tpg, struct vhost_scsi_cmd *cmd, size_t prot_bytes, struct iov_iter *prot_iter, size_t data_bytes, struct iov_iter *data_iter) { @@ -703,10 +785,8 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd, ret = vhost_scsi_iov_to_sgl(cmd, write, prot_iter, cmd->tvc_prot_sgl, cmd->tvc_prot_sgl_count); - if (ret < 0) { - cmd->tvc_prot_sgl_count = 0; - return ret; - } + if (ret < 0) + goto map_fail; } sgl_count = vhost_scsi_calc_sgls(data_iter, data_bytes, VHOST_SCSI_PREALLOC_SGLS); @@ -717,14 +797,32 @@ vhost_scsi_mapal(struct vhost_scsi_cmd *cmd, cmd->tvc_sgl_count = sgl_count; pr_debug("%s data_sg %p data_sgl_count %u\n", __func__, cmd->tvc_sgl, cmd->tvc_sgl_count); + /* + * The block layer requires bios/requests to be a multiple of 512 bytes, + * but Windows can send us vecs that are misaligned. This can result + * in bios and later requests with misaligned sizes if we have to break + * up a cmd into multiple bios. + * + * We currently only break up a command into multiple bios if we hit + * the vec/seg limit, so check if our sgl_count is greater than the max + * and if a vec in the cmd has a misaligned size. + */ + if (tpg->check_io_alignment && + (!iov_iter_is_aligned(data_iter, 0, SECTOR_SIZE - 1) && + bio_max_segs(sgl_count) != sgl_count)) + ret = vhost_scsi_copy_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl, + cmd->tvc_sgl_count); + else + ret = vhost_scsi_iov_to_sgl(cmd, write, data_iter, + cmd->tvc_sgl, cmd->tvc_sgl_count); + if (ret) + goto map_fail; - ret = vhost_scsi_iov_to_sgl(cmd, write, data_iter, - cmd->tvc_sgl, cmd->tvc_sgl_count); - if (ret < 0) { - cmd->tvc_sgl_count = 0; - return ret; - } return 0; + +map_fail: + cmd->tvc_sgl_count = 0; + return ret; } static int vhost_scsi_to_tcm_attr(int attr) @@ -1077,7 +1175,7 @@ vhost_scsi_handle_vq(struct vhost_scsi *vs, struct vhost_virtqueue *vq) " %d\n", cmd, exp_data_len, prot_bytes, data_direction); if (data_direction != DMA_NONE) { - if (unlikely(vhost_scsi_mapal(cmd, prot_bytes, + if (unlikely(vhost_scsi_mapal(tpg, cmd, prot_bytes, &prot_iter, exp_data_len, &data_iter))) { vq_err(vq, "Failed to map iov to sgl\n"); @@ -2068,11 +2166,41 @@ static ssize_t vhost_scsi_tpg_attrib_fabric_prot_type_show( return sysfs_emit(page, "%d\n", tpg->tv_fabric_prot_type); } - CONFIGFS_ATTR(vhost_scsi_tpg_attrib_, fabric_prot_type); +static ssize_t +vhost_scsi_tpg_attrib_check_io_alignment_store(struct config_item *item, + const char *page, size_t count) +{ + struct se_portal_group *se_tpg = attrib_to_tpg(item); + struct vhost_scsi_tpg *tpg = container_of(se_tpg, struct vhost_scsi_tpg, + se_tpg); + bool val; + int ret; + + ret = kstrtobool(page, &val); + if (ret) + return ret; + + tpg->check_io_alignment = val; + return count; +} + +static ssize_t +vhost_scsi_tpg_attrib_check_io_alignment_show(struct config_item *item, + char *page) +{ + struct se_portal_group *se_tpg = attrib_to_tpg(item); + struct vhost_scsi_tpg *tpg = container_of(se_tpg, struct vhost_scsi_tpg, + se_tpg); + + return sysfs_emit(page, "%d\n", tpg->check_io_alignment); +} +CONFIGFS_ATTR(vhost_scsi_tpg_attrib_, check_io_alignment); + static struct configfs_attribute *vhost_scsi_tpg_attrib_attrs[] = { &vhost_scsi_tpg_attrib_attr_fabric_prot_type, + &vhost_scsi_tpg_attrib_attr_check_io_alignment, NULL, }; -- 2.25.1
The write arg that's passed to the mapping functions is not used so remove it. Signed-off-by: Mike Christie <michael.christie at oracle.com> --- drivers/vhost/scsi.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index dbad8fb579eb..382158b39740 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -649,10 +649,8 @@ vhost_scsi_get_cmd(struct vhost_virtqueue *vq, struct vhost_scsi_tpg *tpg, * Returns the number of scatterlist entries used or -errno on error. */ static int -vhost_scsi_map_to_sgl(struct vhost_scsi_cmd *cmd, - struct iov_iter *iter, - struct scatterlist *sgl, - bool write) +vhost_scsi_map_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter, + struct scatterlist *sgl) { struct page **pages = cmd->tvc_upages; struct scatterlist *sg = sgl; @@ -696,15 +694,14 @@ vhost_scsi_calc_sgls(struct iov_iter *iter, size_t bytes, int max_sgls) } static int -vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, bool write, - struct iov_iter *iter, +vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter, struct scatterlist *sg, int sg_count) { struct scatterlist *p = sg; int ret; while (iov_iter_count(iter)) { - ret = vhost_scsi_map_to_sgl(cmd, iter, sg, write); + ret = vhost_scsi_map_to_sgl(cmd, iter, sg); if (ret < 0) { while (p < sg) { struct page *page = sg_page(p++); @@ -769,7 +766,6 @@ vhost_scsi_mapal(struct vhost_scsi_tpg *tpg, struct vhost_scsi_cmd *cmd, size_t data_bytes, struct iov_iter *data_iter) { int sgl_count, ret; - bool write = (cmd->tvc_data_direction == DMA_FROM_DEVICE); if (prot_bytes) { sgl_count = vhost_scsi_calc_sgls(prot_iter, prot_bytes, @@ -782,8 +778,7 @@ vhost_scsi_mapal(struct vhost_scsi_tpg *tpg, struct vhost_scsi_cmd *cmd, pr_debug("%s prot_sg %p prot_sgl_count %u\n", __func__, cmd->tvc_prot_sgl, cmd->tvc_prot_sgl_count); - ret = vhost_scsi_iov_to_sgl(cmd, write, prot_iter, - cmd->tvc_prot_sgl, + ret = vhost_scsi_iov_to_sgl(cmd, prot_iter, cmd->tvc_prot_sgl, cmd->tvc_prot_sgl_count); if (ret < 0) goto map_fail; @@ -813,8 +808,8 @@ vhost_scsi_mapal(struct vhost_scsi_tpg *tpg, struct vhost_scsi_cmd *cmd, ret = vhost_scsi_copy_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl, cmd->tvc_sgl_count); else - ret = vhost_scsi_iov_to_sgl(cmd, write, data_iter, - cmd->tvc_sgl, cmd->tvc_sgl_count); + ret = vhost_scsi_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl, + cmd->tvc_sgl_count); if (ret) goto map_fail; -- 2.25.1
Rename vhost_scsi_iov_to_sgl to vhost_scsi_map_iov_to_sgl so it matches matches the naming style used for vhost_scsi_copy_iov_to_sgl. Signed-off-by: Mike Christie <michael.christie at oracle.com> --- drivers/vhost/scsi.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 382158b39740..a4d32b96e66a 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -694,8 +694,8 @@ vhost_scsi_calc_sgls(struct iov_iter *iter, size_t bytes, int max_sgls) } static int -vhost_scsi_iov_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter, - struct scatterlist *sg, int sg_count) +vhost_scsi_map_iov_to_sgl(struct vhost_scsi_cmd *cmd, struct iov_iter *iter, + struct scatterlist *sg, int sg_count) { struct scatterlist *p = sg; int ret; @@ -778,8 +778,9 @@ vhost_scsi_mapal(struct vhost_scsi_tpg *tpg, struct vhost_scsi_cmd *cmd, pr_debug("%s prot_sg %p prot_sgl_count %u\n", __func__, cmd->tvc_prot_sgl, cmd->tvc_prot_sgl_count); - ret = vhost_scsi_iov_to_sgl(cmd, prot_iter, cmd->tvc_prot_sgl, - cmd->tvc_prot_sgl_count); + ret = vhost_scsi_map_iov_to_sgl(cmd, prot_iter, + cmd->tvc_prot_sgl, + cmd->tvc_prot_sgl_count); if (ret < 0) goto map_fail; } @@ -808,8 +809,8 @@ vhost_scsi_mapal(struct vhost_scsi_tpg *tpg, struct vhost_scsi_cmd *cmd, ret = vhost_scsi_copy_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl, cmd->tvc_sgl_count); else - ret = vhost_scsi_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl, - cmd->tvc_sgl_count); + ret = vhost_scsi_map_iov_to_sgl(cmd, data_iter, cmd->tvc_sgl, + cmd->tvc_sgl_count); if (ret) goto map_fail; -- 2.25.1
Michael S. Tsirkin
2023-May-25 07:59 UTC
[PATCH 0/3] vhost-scsi: Fix IO hangs when using windows
On Wed, May 24, 2023 at 06:34:04PM -0500, Mike Christie wrote:> The following patches were made over Linus's tree and fix an issue > where windows guests will send iovecs with offset/lengths that result > in IOs that are not aligned to 512. The LIO layer will then send them > to Linux's block layer but it requires 512 byte alignment, so depending > on the block driver being used we will get IO errors or hung IO. > > The following patches have vhost-scsi detect when windows sends these > IOs and copy them to a bounce buffer. It then does some cleanup in > the related code.Normally with virtio we'd have a feature bit that allows skipping alignment checks. Teach linux to set it. Stefan, WDYT? -- MST
Possibly Parallel Threads
- [PATCH v2 0/2] vhost-scsi: Fix IO hangs when using windows
- [PATCH 00/12] block/bio, fs: convert put_page() to put_user_page*()
- [PATCH 00/12] block/bio, fs: convert put_page() to put_user_page*()
- [PATCH 0/2] vhost/scsi: Adjustments for five function implementations
- [PATCH 0/2] vhost/scsi: Adjustments for five function implementations