Thanos Makatos
2013-Jul-15 11:59 UTC
[PATCH 0 of 3] blktap3/misc: Merge from blktap2.5, add fixes/improvements, and build blktap3
This patch series imports bug-fix and functionality commits from blktap2.5, introduces some other bug fixes and code improvements, and hooks blktap3 into the Xen build system. Signed-off-by: Thanos Makatos <thanos.makatos@citrix.com>
This patch imports commits from blktap2.5 after blktap3 was forked from it in July 2011 (4212b833df6321fac4ccabb75f7f9a476553d6d0) until February 2013 (ce9f9ce8529ac7fc330371c8ec1efe018da637e0). There are a few more to be merged. The patch introduces (and builds) the following: * the part utility * the lvm utility * the tap-ctl utility * vhd-util-XXX utilities * the mirroring functionality (NBD) * the pause, unpause, and stats commands * various block drivers (mostly in unknown state) * bug fixes Signed-off-by: Thanos Makatos <thanos.makatos@citrix.com> diff --git a/tools/blktap3/Makefile b/tools/blktap3/Makefile --- a/tools/blktap3/Makefile +++ b/tools/blktap3/Makefile @@ -10,6 +10,8 @@ SUBDIRS-y += vhd SUBDIRS-y += control SUBDIRS-y += tapback SUBDIRS-y += drivers +SUBDIRS-y += part +SUBDIRS-y += lvm tags: ctags -R --language-force=C --c-kinds=+px diff --git a/tools/blktap3/control/Makefile b/tools/blktap3/control/Makefile --- a/tools/blktap3/control/Makefile +++ b/tools/blktap3/control/Makefile @@ -5,6 +5,8 @@ MAJOR = 3 MINOR = 0 LIBNAME = libblktapctl +IBIN = tap3-ctl + override CFLAGS += \ -I../include \ -DTAPDISK_BUILDDIR=''"../drivers"'' \ @@ -16,7 +18,7 @@ override CFLAGS += \ -Wextra \ -Werror -# FIXME cause trouble +# TODO cause trouble override CFLAGS += \ -Wno-type-limits \ -Wno-missing-field-initializers \ @@ -31,6 +33,9 @@ CTL_OBJS += tap-ctl-open.o CTL_OBJS += tap-ctl-close.o CTL_OBJS += tap-ctl-create.o CTL_OBJS += tap-ctl-destroy.o +CTL_OBJS += tap-ctl-pause.o +CTL_OBJS += tap-ctl-unpause.o +CTL_OBJS += tap-ctl-stats.o CTL_PICS = $(patsubst %.o,%.opic,$(CTL_OBJS)) @@ -50,15 +55,18 @@ build: $(IBIN) $(LIB_STATIC) $(LIB_SHARE $(CC) $(LDFLAGS) -fPIC -Wl,$(SONAME_LDFLAG) -Wl,$(LIB_SHARED) \ $(SHLIB_LDFLAGS) -rdynamic $^ -o $@ -install: $(LIB_STATIC) $(LIB_SHARED) - $(INSTALL_DIR) -p $(DESTDIR)$(SBINDIR) - # TODO Why install the static version? 
- #$(INSTALL_DATA) $(LIB_STATIC) $(DESTDIR)$(LIBDIR) +tap3-ctl: tap-ctl.o $(LIB_SHARED) + $(CC) $(LDFLAGS) -o $@ $^ + +install: $(LIB_SHARED) $(IBIN) + $(INSTALL_DIR) -p $(DESTDIR)$(BINDIR) $(INSTALL_PROG) $(LIB_SHARED) $(DESTDIR)$(LIBDIR) + $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(BINDIR) ldconfig clean: - rm -f $(CTL_OBJS) $(PICS) $(DEPS) $(LIB_STATIC) $(LIB_SHARED) + rm -f $(CTL_OBJS) $(PICS) $(DEPS) $(LIB_STATIC) $(LIB_SHARED) $(IBIN)\ + tap-ctl.o .PHONY: all build clean install diff --git a/tools/blktap3/control/tap-ctl-destroy.c b/tools/blktap3/control/tap-ctl-destroy.c --- a/tools/blktap3/control/tap-ctl-destroy.c +++ b/tools/blktap3/control/tap-ctl-destroy.c @@ -1,15 +1,29 @@ /* - * Copyright (C) 2012 Citrix Ltd. + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; version 2.1 only. with the special - * exception on linking described in file LICENSE. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU Lesser General Public License for more details. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ #include <stdio.h> diff --git a/tools/blktap3/control/tap-ctl-ipc.c b/tools/blktap3/control/tap-ctl-ipc.c --- a/tools/blktap3/control/tap-ctl-ipc.c +++ b/tools/blktap3/control/tap-ctl-ipc.c @@ -154,7 +154,8 @@ tap_ctl_send_and_receive(const int sfd, return err; } - if (tapdisk_message_is_rsp_paired(msg_type)) { + if (TAPDISK_MESSAGE_ERROR != message->type + && tapdisk_message_is_rsp_paired(msg_type)) { if (message->type - msg_type != 1) { err = EINVAL; EPRINTF("invalid response ''%s'' to message ''%s''\n", @@ -190,7 +191,7 @@ tap_ctl_connect(const char *name, int *s fd = socket(AF_UNIX, SOCK_STREAM, 0); if (fd == -1) { - EPRINTF("couldn''t create socket for %s: %d\n", name, errno); + EPRINTF("couldn''t create socket for %s: %s\n", name, strerror(errno)); return -errno; } @@ -200,9 +201,10 @@ tap_ctl_connect(const char *name, int *s err = connect(fd, (const struct sockaddr *)&saddr, sizeof(saddr)); if (err) { - EPRINTF("couldn''t connect to %s: %d\n", name, errno); + err = errno; + EPRINTF("couldn''t connect to %s: %s\n", name, strerror(err)); close(fd); - return -errno; + return -err; } *sfd = fd; @@ -229,6 +231,7 
@@ tap_ctl_connect_id(int id, int *sfd) } err = tap_ctl_connect(name, sfd); + free(name); return err; diff --git a/tools/blktap3/control/tap-ctl-open.c b/tools/blktap3/control/tap-ctl-open.c --- a/tools/blktap3/control/tap-ctl-open.c +++ b/tools/blktap3/control/tap-ctl-open.c @@ -45,7 +45,7 @@ tap_ctl_open(const int pid, const char * memset(&message, 0, sizeof(message)); message.type = TAPDISK_MESSAGE_OPEN; if (prt_path) { - if (strnlen(prt_path, TAPDISK_MESSAGE_OPEN) == TAPDISK_MESSAGE_OPEN) + if (strlen(prt_path) >= TAPDISK_MESSAGE_OPEN) return -ENAMETOOLONG; strcpy(message.u.params.prt_path, prt_path); } diff --git a/tools/blktap3/control/tap-ctl-pause.c b/tools/blktap3/control/tap-ctl-pause.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/control/tap-ctl-pause.c @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdio.h> +#include <errno.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <assert.h> + +#include "tap-ctl.h" + +int +tap_ctl_pause(const int id, const char *params, struct timeval *timeout) +{ + int err; + tapdisk_message_t message; + + assert(params); + + if (strnlen(params, TAPDISK_MESSAGE_MAX_PATH_LENGTH) + >= TAPDISK_MESSAGE_MAX_PATH_LENGTH) { + return ENAMETOOLONG; + } + + memset(&message, 0, sizeof(message)); + message.type = TAPDISK_MESSAGE_PAUSE; + + strncpy(message.u.params.path, params, TAPDISK_MESSAGE_MAX_PATH_LENGTH); + + err = tap_ctl_connect_send_and_receive(id, &message, timeout); + if (err) + return err; + + if (message.type == TAPDISK_MESSAGE_PAUSE_RSP + || message.type == TAPDISK_MESSAGE_ERROR) + err = message.u.response.error; + else { + err = EINVAL; + EPRINTF("got unexpected message ''%s'' from %d\n", + tapdisk_message_name(message.type), id); + } + + return err; +} diff --git a/tools/blktap3/control/tap-ctl-stats.c b/tools/blktap3/control/tap-ctl-stats.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/control/tap-ctl-stats.c @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2010, Citrix + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <sys/mman.h> +#include <assert.h> + +#include "tap-ctl.h" + +int +_tap_ctl_stats_connect_and_send(pid_t pid, const char *params) +{ + struct timeval timeout = { .tv_sec = 10, .tv_usec = 0 }; + tapdisk_message_t message; + int sfd, err; + + assert(params); + + if (strnlen(params, TAPDISK_MESSAGE_MAX_PATH_LENGTH) + >= TAPDISK_MESSAGE_MAX_PATH_LENGTH) { + return ENAMETOOLONG; + } + + memset(&message, 0, sizeof(message)); + message.type = TAPDISK_MESSAGE_STATS; + + strncpy(message.u.params.path, params, TAPDISK_MESSAGE_MAX_PATH_LENGTH); + + err = tap_ctl_connect_id(pid, &sfd); + if (err) + return err; + + + err = tap_ctl_write_message(sfd, &message, &timeout); + if (err) + return err; + + return sfd; +} + +ssize_t +tap_ctl_stats(pid_t pid, const char *params, char *buf, size_t size) +{ + tapdisk_message_t message; + int sfd, err; + size_t len; + + assert(params); + + sfd = _tap_ctl_stats_connect_and_send(pid, params); + if (sfd < 0) + return sfd; + + err = tap_ctl_read_message(sfd, &message, NULL); + if (err) + return err; + + len= message.u.info.length; + if (len < 0) { + err = len; + goto out; + } + if (size < len + 1) + len = size - 1; + + err = tap_ctl_read_raw(sfd, buf, len, NULL); + if (err) + goto out; + + buf[len] = 0; + +out: + close(sfd); + return err; +} + +int +tap_ctl_stats_fwrite(pid_t pid, const char *params, FILE *stream) +{ + tapdisk_message_t message; + int sfd = -1, prot, flags, err; + size_t len, bufsz; + char *buf = MAP_FAILED; + + assert(params); + + prot = PROT_READ|PROT_WRITE; + flags = MAP_ANONYMOUS|MAP_PRIVATE; + bufsz = sysconf(_SC_PAGE_SIZE); + + buf = mmap(NULL, bufsz, prot, flags, -1, 0); + if (buf == MAP_FAILED) { + buf = NULL; + err = -ENOMEM; + goto out; + } + + sfd = _tap_ctl_stats_connect_and_send(pid, params); + if (sfd < 0) { + err = sfd; + goto out; + } + + err = tap_ctl_read_message(sfd, &message, NULL); + if (err) + goto out; + + len = 
message.u.info.length; + if (len < 0) { + err = len; + goto out; + } + + while (len) { + fd_set rfds; + size_t in, out; + int n; + + FD_ZERO(&rfds); + FD_SET(sfd, &rfds); + + n = select(sfd + 1, &rfds, NULL, NULL, NULL); + if (n < 0) { + err = n; + goto out; + } + + in = read(sfd, buf, bufsz); + if (in <= 0) { + err = in; + goto out; + } + + len -= in; + + out = fwrite(buf, in, 1, stream); + if (out != 1) { + err = -errno; + goto out; + } + } + len = fwrite("\n", 1, 1, stream); + +out: + if (sfd >= 0) + close(sfd); + if (buf != MAP_FAILED) + munmap(buf, bufsz); + + return err; +} diff --git a/tools/blktap3/control/tap-ctl-unpause.c b/tools/blktap3/control/tap-ctl-unpause.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/control/tap-ctl-unpause.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdio.h> +#include <errno.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <getopt.h> +#include <assert.h> + +#include "tap-ctl.h" + +int +tap_ctl_unpause(const int id, const char *params1, const char *params2, + int flags, char *secondary) +{ + int err; + tapdisk_message_t message; + + assert(params1); + + memset(&message, 0, sizeof(message)); + message.type = TAPDISK_MESSAGE_RESUME; + message.u.resume.flags = flags; + + if (strnlen(params1, TAPDISK_MESSAGE_MAX_PATH_LENGTH) + >= TAPDISK_MESSAGE_MAX_PATH_LENGTH) { + /* TODO log error */ + return ENAMETOOLONG; + } + + strncpy(message.u.resume.params1, params1, TAPDISK_MESSAGE_MAX_PATH_LENGTH); + + if (params2) { + if (strnlen(params2, TAPDISK_MESSAGE_MAX_PATH_LENGTH) + >= TAPDISK_MESSAGE_MAX_PATH_LENGTH) { + /* TODO log error */ + return ENAMETOOLONG; + } + strncpy(message.u.resume.params2, params2, + TAPDISK_MESSAGE_MAX_PATH_LENGTH); + } else { + message.u.resume.params2[0] = ''\0''; + } + + if (secondary) { + if (strnlen(secondary, TAPDISK_MESSAGE_MAX_PATH_LENGTH) + >= TAPDISK_MESSAGE_MAX_PATH_LENGTH) { + /* TODO log error */ + return ENAMETOOLONG; + } + strncpy(message.u.resume.secondary, secondary, + TAPDISK_MESSAGE_MAX_PATH_LENGTH); + } else { + message.u.resume.secondary[0] = ''\0''; + } + + err = tap_ctl_connect_send_and_receive(id, &message, NULL); + if (err) + return err; + + if (message.type == TAPDISK_MESSAGE_RESUME_RSP + || 
message.type == TAPDISK_MESSAGE_ERROR) + err = message.u.response.error; + else { + err = EINVAL; + EPRINTF("got unexpected result ''%s'' from %d\n", + tapdisk_message_name(message.type), id); + } + + return err; +} diff --git a/tools/blktap3/control/tap-ctl.c b/tools/blktap3/control/tap-ctl.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/control/tap-ctl.c @@ -0,0 +1,707 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <getopt.h> +#include <signal.h> +#include <sys/time.h> + +#include "tap-ctl.h" + +typedef int (*tap_ctl_func_t) (int, char **); + +struct command { + char *name; + tap_ctl_func_t func; +}; + +static void +tap_cli_list_usage(FILE *stream) +{ + fprintf(stream, + "usage: list [-h] [-p pid] [-t type] [-f file]\n"); +} + +static void +tap_cli_list_row(tap_list_t *entry) +{ + char minor_str[10] = "-"; + char state_str[10] = "-"; + char pid_str[10] = "-"; + + if (entry->pid != -1) + sprintf(pid_str, "%d", entry->pid); + + if (entry->state != -1) + sprintf(state_str, "%#x", entry->state); + + printf("%8s %4s %4s %10s %s\n", + pid_str, minor_str, state_str, + entry->type ? : "-", entry->path ? : "-"); +} + +static void +tap_cli_list_dict(tap_list_t *entry) +{ + int d = 0; + + if (entry->pid != -1) { + if (d) putc('' '', stdout); + d = printf("pid=%d", entry->pid); + } + + if (entry->state != -1) { + if (d) putc('' '', stdout); + d = printf("state=%#x", entry->state); + } + + if (entry->type && entry->path) { + if (d) putc('' '', stdout); + d = printf("args=%s:%s", entry->type, entry->path); + } + + putc(''\n'', stdout); +} + +int +tap_cli_list(int argc, char **argv) +{ + struct tqh_tap_list list = TAILQ_HEAD_INITIALIZER(list); + int c, minor, tty, err; + const char *type, *file; + tap_list_t *entry; + pid_t pid; + + pid = -1; + minor = -1; + type = NULL; + file = NULL; + + while ((c = getopt(argc, argv, "m:p:t:f:h")) != -1) { + switch (c) { + case ''m'': + minor = atoi(optarg); + break; + case ''p'': + pid = atoi(optarg); + break; + case ''t'': + type = optarg; + break; + case ''f'': + file = optarg; + break; + case ''?'': + goto usage; + case ''h'': + tap_cli_list_usage(stdout); + return 0; + } + } + + if (pid != -1) + err = tap_ctl_list_pid(pid, &list); + else + err = tap_ctl_list(&list); + if (err) + return -err; + + tty = isatty(STDOUT_FILENO); + + 
tap_list_for_each_entry(entry, &list) { + if (pid >= 0 && entry->pid != pid) + continue; + + if (type && entry->type && strcmp(entry->type, type)) + continue; + + if (file && entry->path && strcmp(entry->path, file)) + continue; + + if (tty) + tap_cli_list_row(entry); + else + tap_cli_list_dict(entry); + } + + tap_ctl_list_free(&list); + + return 0; + +usage: + tap_cli_list_usage(stderr); + return EINVAL; +} + +static void +tap_cli_create_usage(FILE *stream) +{ + /* FIXME "stack on existing ..." */ + fprintf(stream, "usage: create <-a type:/path/to/file> [-R readonly] " + "[-e <minor> stack on existing tapdisk for the parent chain] " + "[-r turn on read caching into leaf node] [-2 <path> " + "use secondary image (in mirror mode if no -s)] [-s " + "fail over to the secondary image on ENOSPC]\n"); +} + +static int +tap_cli_create(int argc, char **argv) +{ + int c, err, flags; + char *args, *secondary, *prt_path; + + args = NULL; + secondary = NULL; + prt_path = NULL; + flags = 0; + + optind = 0; + while ((c = getopt(argc, argv, "a:Rd:e:r2:sh")) != -1) { + switch (c) { + case ''a'': + args = optarg; + break; + case ''R'': + flags |= TAPDISK_MESSAGE_FLAG_RDONLY; + break; + case ''r'': + flags |= TAPDISK_MESSAGE_FLAG_ADD_LCACHE; + break; + case ''e'': + flags |= TAPDISK_MESSAGE_FLAG_REUSE_PRT; + prt_path = optarg; + break; + case ''2'': + flags |= TAPDISK_MESSAGE_FLAG_SECONDARY; + secondary = optarg; + break; + case ''s'': + flags |= TAPDISK_MESSAGE_FLAG_STANDBY; + break; + case ''?'': + goto usage; + case ''h'': + tap_cli_create_usage(stdout); + return 0; + } + } + + if (!args) + goto usage; + + err = tap_ctl_create(args, flags, prt_path, secondary); + + return err; + +usage: + tap_cli_create_usage(stderr); + return EINVAL; +} + +static void +tap_cli_destroy_usage(FILE *stream) +{ + fprintf(stream, "usage: destroy <-p pid> <-a type:/path/to/file>\n"); +} + +static struct timeval* +tap_cli_timeout(const char *optarg) +{ + static struct timeval tv; + struct timeval now; 
+ + tv.tv_sec = atoi(optarg); + tv.tv_usec = 0; + + gettimeofday(&now, NULL); + timeradd(&tv, &now, &tv); + + return &tv; +} + +static int +tap_cli_destroy(int argc, char **argv) +{ + int c, pid; + struct timeval *timeout; + char *params; + + pid = -1; + params = NULL; + timeout = NULL; + + optind = 0; + while ((c = getopt(argc, argv, "p:a:t:h")) != -1) { + switch (c) { + case ''p'': + pid = atoi(optarg); + break; + case ''a'': + params = optarg; + break; + case ''t'': + timeout = tap_cli_timeout(optarg); + if (!timeout) + goto usage; + break; + case ''?'': + goto usage; + case ''h'': + tap_cli_destroy_usage(stdout); + return 0; + } + } + + if (pid == -1 || !params) + goto usage; + + return tap_ctl_destroy(pid, params, 0, timeout); + +usage: + tap_cli_destroy_usage(stderr); + return EINVAL; +} + +static void +tap_cli_spawn_usage(FILE *stream) +{ + fprintf(stream, "usage: spawn\n"); +} + +static int +tap_cli_spawn(int argc, char **argv) +{ + int c, tty; + pid_t pid; + + optind = 0; + while ((c = getopt(argc, argv, "h")) != -1) { + switch (c) { + case ''?'': + goto usage; + case ''h'': + tap_cli_spawn_usage(stdout); + return 0; + } + } + + pid = tap_ctl_spawn(); + if (pid < 0) + return pid; + + tty = isatty(STDOUT_FILENO); + if (tty) + printf("tapdisk spawned with pid %d\n", pid); + else + printf("%d\n", pid); + + return 0; + +usage: + tap_cli_spawn_usage(stderr); + return EINVAL; +} + +static void +tap_cli_close_usage(FILE *stream) +{ + fprintf(stream, "usage: close <-p pid> <-a type:/path/to/file> " + "[-f force]\n"); +} + +static int +tap_cli_close(int argc, char **argv) +{ + int c, pid, force; + char *params; + struct timeval *timeout; + + pid = -1; + params = NULL; + force = 0; + timeout = NULL; + + optind = 0; + while ((c = getopt(argc, argv, "p:a:ft:h")) != -1) { + switch (c) { + case ''p'': + pid = atoi(optarg); + break; + case ''a'': + params = optarg; + break; + case ''f'': + force = -1; + break; + case ''t'': + timeout = tap_cli_timeout(optarg); + if 
(!timeout) + goto usage; + break; + case ''?'': + goto usage; + case ''h'': + tap_cli_close_usage(stdout); + return 0; + } + } + + if (pid == -1 || !params) + goto usage; + + return tap_ctl_close(pid, params, force, timeout); + +usage: + tap_cli_close_usage(stderr); + return EINVAL; +} + +static void +tap_cli_pause_usage(FILE *stream) +{ + fprintf(stream, "usage: pause <-p pid> <-a type:/path/to/file>\n"); +} + +static int +tap_cli_pause(int argc, char **argv) +{ + int c, pid; + struct timeval *timeout; + char *params; + + pid = -1; + params = NULL; + timeout = NULL; + + optind = 0; + while ((c = getopt(argc, argv, "p:a:t:h")) != -1) { + switch (c) { + case ''p'': + pid = atoi(optarg); + break; + case ''a'': + params = optarg; + break; + case ''t'': + timeout = tap_cli_timeout(optarg); + if (!timeout) + goto usage; + case ''?'': + goto usage; + case ''h'': + tap_cli_pause_usage(stdout); + return 0; + } + } + + if (pid == -1 || !params) + goto usage; + + return tap_ctl_pause(pid, params, timeout); + +usage: + tap_cli_pause_usage(stderr); + return EINVAL; +} + +static void +tap_cli_unpause_usage(FILE *stream) +{ + fprintf(stream, "usage: unpause <-p pid> <-a type:/path/to/file> " + "[-b type:/path/to/file] [-2 secondary]\n"); +} + +int +tap_cli_unpause(int argc, char **argv) +{ + char *secondary, *params1, *params2; + int c, pid, flags; + + pid = -1; + params1 = NULL; + params2 = NULL; + secondary = NULL; + flags = 0; + + optind = 0; + while ((c = getopt(argc, argv, "p:a:b:2:h")) != -1) { + switch (c) { + case ''p'': + pid = atoi(optarg); + break; + case ''a'': + params1 = optarg; + break; + case ''b'': + params2 = optarg; + break; + case ''2'': + flags |= TAPDISK_MESSAGE_FLAG_SECONDARY; + secondary = optarg; + break; + case ''?'': + goto usage; + case ''h'': + tap_cli_unpause_usage(stdout); + return 0; + } + } + + if (pid == -1 || !params1) + goto usage; + + return tap_ctl_unpause(pid, params1, params2, flags, secondary); + +usage: + tap_cli_unpause_usage(stderr); + 
return EINVAL; +} + +static void +tap_cli_open_usage(FILE *stream) +{ + fprintf(stream, "usage: open <-p pid> <-a type:/path/to/file> " + "[-R readonly] [-e <type:/path/to/file> stack on existing tapdisk for " + "the parent chain] [-r turn on read caching into leaf node] [-2 " + "<path> use secondary image (in mirror mode if no -s)] [-s fail over " + "to the secondary image on ENOSPC]\n"); +} + +static int +tap_cli_open(int argc, char **argv) +{ + const char *params, *prt_params, *secondary; + int c, pid, flags; + + flags = 0; + pid = -1; + params = NULL; + prt_params = NULL; + secondary = NULL; + + optind = 0; + while ((c = getopt(argc, argv, "a:R:p:e:r2:sh")) != -1) { + switch (c) { + case ''p'': + pid = atoi(optarg); + break; + case ''a'': + params = optarg; + break; + case ''R'': + flags |= TAPDISK_MESSAGE_FLAG_RDONLY; + break; + case ''r'': + flags |= TAPDISK_MESSAGE_FLAG_ADD_LCACHE; + break; + case ''e'': + flags |= TAPDISK_MESSAGE_FLAG_REUSE_PRT; + prt_params = optarg; + break; + case ''2'': + flags |= TAPDISK_MESSAGE_FLAG_SECONDARY; + secondary = optarg; + break; + case ''s'': + flags |= TAPDISK_MESSAGE_FLAG_STANDBY; + break; + case ''?'': + goto usage; + case ''h'': + tap_cli_open_usage(stdout); + return 0; + } + } + + if (pid == -1 || !params) + goto usage; + + return tap_ctl_open(pid, params, flags, prt_params, secondary); + +usage: + tap_cli_open_usage(stderr); + return EINVAL; +} + +static void +tap_cli_stats_usage(FILE *stream) +{ + fprintf(stream, "usage: stats <-p pid> <-a type:/path/to/file>\n"); +} + +static int +tap_cli_stats(int argc, char **argv) +{ + pid_t pid; + int c, err; + char *params; + + pid = -1; + params = NULL; + + optind = 0; + while ((c = getopt(argc, argv, "p:a:h")) != -1) { + switch (c) { + case ''p'': + pid = atoi(optarg); + break; + case ''a'': + params = optarg; + break; + case ''?'': + goto usage; + case ''h'': + tap_cli_stats_usage(stdout); + return 0; + } + } + + if (pid == -1 || !params) + goto usage; + + err = 
tap_ctl_stats_fwrite(pid, params, stdout); + if (err) + return err; + + fprintf(stdout, "\n"); + + return 0; + +usage: + tap_cli_stats_usage(stderr); + return EINVAL; +} + +struct command commands[] = { + { .name = "list", .func = tap_cli_list }, + { .name = "create", .func = tap_cli_create }, + { .name = "destroy", .func = tap_cli_destroy }, + { .name = "spawn", .func = tap_cli_spawn }, + { .name = "open", .func = tap_cli_open }, + { .name = "close", .func = tap_cli_close }, + { .name = "pause", .func = tap_cli_pause }, + { .name = "unpause", .func = tap_cli_unpause }, + { .name = "stats", .func = tap_cli_stats }, +}; + +#define print_commands() \ + do { \ + int i, n; \ + n = sizeof(commands) / sizeof(struct command); \ + printf("COMMAND := { "); \ + printf("%s", commands[0].name); \ + for (i = 1; i < n; i++) \ + printf(" | %s", commands[i].name); \ + printf(" }\n"); \ + } while (0) + +void +help(void) +{ + printf("usage: tap-ctl COMMAND [OPTIONS]\n"); + print_commands(); + exit(0); +} + +struct command * +get_command(char *command) +{ + int i, n; + + if (strnlen(command, 25) >= 25) + return NULL; + + n = sizeof(commands) / sizeof (struct command); + + for (i = 0; i < n; i++) + if (!strcmp(command, commands[i].name)) + return &commands[i]; + + return NULL; +} + +int +main(int argc, char *argv[]) +{ + char **cargv; + struct command *cmd; + int cargc, i, cnt, ret; + +#ifdef CORE_DUMP + #include <sys/resource.h> + struct rlimit rlim; + rlim.rlim_cur = RLIM_INFINITY; + rlim.rlim_max = RLIM_INFINITY; + if (setrlimit(RLIMIT_CORE, &rlim) < 0) + PERROR("setrlimit failed"); +#endif + + signal(SIGPIPE, SIG_IGN); + + ret = 0; + + if (argc < 2) + help(); + + cargc = argc - 1; + cmd = get_command(argv[1]); + if (!cmd) { + EPRINTF("invalid COMMAND %s", argv[1]); + help(); + } + + cargv = malloc(sizeof(char *) * cargc); + if (!cargv) + exit(ENOMEM); + + cnt = 1; + cargv[0] = cmd->name; + for (i = 1; i < cargc; i++) { + char *arg = argv[i + (argc - cargc)]; + + if (!strcmp(arg, 
"--debug")) { + tap_ctl_debug = 1; + continue; + } + + cargv[cnt++] = arg; + } + + ret = cmd->func(cnt, cargv); + if (ret) { + /* TODO Some functions return +errno, others -errno, fix. */ + printf("%s\n", strerror(abs(ret))); + } + + free(cargv); + + return (ret >= 0 ? ret : -ret); +} diff --git a/tools/blktap3/control/tap-ctl.h b/tools/blktap3/control/tap-ctl.h --- a/tools/blktap3/control/tap-ctl.h +++ b/tools/blktap3/control/tap-ctl.h @@ -214,12 +214,19 @@ int tap_ctl_destroy(const int id, const /** * Pauses the VBD. */ -int tap_ctl_pause(const int id, const char * params, struct timeval +int tap_ctl_pause(const int pid, const char * params, struct timeval *timeout); /** * Unpauses the VBD + * + * @param pid the process ID of the tapdisk + * @param params1 VDI (type:/path/to/file) + * @param params2 new VDI to use (type:/path/to/file), optional + * @param flags TODO + * @param secondary TODO */ -int tap_ctl_unpause(const int id, const char * params); +int tap_ctl_unpause(const int pid, const char * params1, const char *params2, + int flags, char *secondary); ssize_t tap_ctl_stats(pid_t pid, const char * params, char *buf, size_t size); int tap_ctl_stats_fwrite(pid_t pid, const char * params, FILE * out); diff --git a/tools/blktap3/drivers/Makefile b/tools/blktap3/drivers/Makefile --- a/tools/blktap3/drivers/Makefile +++ b/tools/blktap3/drivers/Makefile @@ -11,7 +11,6 @@ LIBVHDDIR = $(BLKTAP_ROOT)/vhd/lib # FIXME tapdisk-client tapdisk-stream tapdisk-diff not in blktap2.5 IBIN = tapdisk3 LOCK_UTIL = lock-util -INST_DIR = $(SBINDIR) override CFLAGS += \ -fno-strict-aliasing \ @@ -78,15 +77,16 @@ TAP-OBJS-y += tapdisk-server.o TAP-OBJS-y += tapdisk-queue.o TAP-OBJS-y += tapdisk-filter.o TAP-OBJS-y += tapdisk-utils.o -TAP-OBJS-y += tapdisk-log.o +TAP-OBJS-y += tapdisk-log.o TAP-OBJS-y += io-optimize.o #TAP-OBJS-y += lock.o -#TAP-OBJS-y += tapdisk-blktap.o -TAP-OBJS-y += tapdisk-stats.o -TAP-OBJS-y += tapdisk-storage.o -TAP-OBJS-y += tapdisk-loglimit.o -TAP-OBJS-y += 
tapdisk-logfile.o -TAP-OBJS-y += tapdisk-syslog.o +TAP-OBJS-y += tapdisk-stats.o +TAP-OBJS-y += tapdisk-storage.o +TAP-OBJS-y += tapdisk-loglimit.o +TAP-OBJS-y += tapdisk-logfile.o +TAP-OBJS-y += tapdisk-syslog.o +TAP-OBJS-y += tapdisk-nbdserver.o +TAP-OBJS-y += tapdisk-fdreceiver.o #TAP-OBJS-y += $(PORTABLE-OBJS-y) LIBSRING := sring/libsring.a @@ -96,6 +96,13 @@ LIBSRING := sring/libsring.a BLK-OBJS-y := block-aio.o BLK-OBJS-y += block-vhd.o +BLK-OBJS-y += block-ram.o +BLK-OBJS-y += block-cache.o +BLK-OBJS-y += block-lcache.o +BLK-OBJS-y += block-llcache.o +BLK-OBJS-y += block-valve.o +BLK-OBJS-y += block-nbd.o +BLK-OBJS-y += block-vindex.o # FIXME The following exist in blktap2 but not in blktap2.5. #BLK-OBJS-y += aes.o #BLK-OBJS-y += md5.o @@ -125,8 +132,8 @@ lock-util: lock.c # FIXME img2qcow, qcow-create, qcow2raw not built so not installed # FIXME lock-util should be installed install: all - $(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR) - $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(INST_DIR) + $(INSTALL_DIR) -p $(DESTDIR)$(BINDIR) + $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(BINDIR) clean: subdirs-clean rm -rf .*.d *.o *~ xen TAGS $(IBIN) $(LIB) $(LOCK_UTIL) diff --git a/tools/blktap3/drivers/atomicio.c b/tools/blktap3/drivers/atomicio.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/atomicio.c @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2005 Anil Madhavapeddy. All rights reserved. + * Copyright (c) 1995,1999 Theo de Raadt. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'''' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdlib.h> +#include <errno.h> +#include "atomicio.h" + +/* + * ensure all of data on socket comes through. f==read || f==vwrite + */ +size_t +atomicio(f, fd, _s, n) + ssize_t (*f) (int, void *, size_t); + int fd; + void *_s; + size_t n; +{ + char *s = _s; + size_t pos = 0; + ssize_t res; + + while (n > pos) { + res = (f) (fd, s + pos, n - pos); + switch (res) { + case -1: + if (errno == EINTR || errno == EAGAIN) + continue; + return 0; + case 0: + errno = EPIPE; + return pos; + default: + pos += (size_t)res; + } + } + return (pos); +} + diff --git a/tools/blktap3/drivers/atomicio.h b/tools/blktap3/drivers/atomicio.h new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/atomicio.h @@ -0,0 +1,33 @@ +/* $OpenBSD: atomicio.h,v 1.6 2005/05/24 17:32:43 avsm Exp $ */ + +/* + * Copyright (c) 1995,1999 Theo de Raadt. All rights reserved. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'''' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Ensure all of data on socket comes through. 
f==read || f==vwrite + */ +size_t atomicio(ssize_t (*)(int, void *, size_t), int, void *, size_t); + +#define vwrite (ssize_t (*)(int, void *, size_t))write diff --git a/tools/blktap3/drivers/block-aio.c b/tools/blktap3/drivers/block-aio.c --- a/tools/blktap3/drivers/block-aio.c +++ b/tools/blktap3/drivers/block-aio.c @@ -240,7 +240,8 @@ void tdaio_queue_write(td_driver_t * dri td_complete_request(treq, -EBUSY); } -int tdaio_close(td_driver_t * driver) +int tdaio_close(td_driver_t * driver, + struct tqh_td_image_handle *head __attribute__((unused))) { struct tdaio_state *prv = (struct tdaio_state *) driver->data; diff --git a/tools/blktap3/drivers/block-cache.c b/tools/blktap3/drivers/block-cache.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/block-cache.c @@ -0,0 +1,795 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/mman.h> + +#include "tapdisk.h" +#include "tapdisk-utils.h" +#include "tapdisk-driver.h" +#include "tapdisk-server.h" +#include "tapdisk-interface.h" + +#ifdef DEBUG +#define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a) +#else +#define DBG(_f, _a...) ((void)0) +#endif + +#define WARN(_f, _a...) tlog_write(TLOG_WARN, _f, ##_a) + +#define RADIX_TREE_PAGE_SHIFT 12 /* 4K pages */ +#define RADIX_TREE_PAGE_SIZE (1 << RADIX_TREE_PAGE_SHIFT) + +#define RADIX_TREE_NODE_SHIFT 9 /* 512B nodes */ +#define RADIX_TREE_NODE_SIZE (1 << RADIX_TREE_NODE_SHIFT) +#define RADIX_TREE_NODE_MASK (RADIX_TREE_NODE_SIZE - 1) + +#define BLOCK_CACHE_NODES_PER_PAGE (1 << (RADIX_TREE_PAGE_SHIFT - RADIX_TREE_NODE_SHIFT)) + +#define BLOCK_CACHE_MAX_SIZE (10 << 20) /* 10MB cache */ +#define BLOCK_CACHE_REQUESTS (TAPDISK_DATA_REQUESTS << 3) +#define BLOCK_CACHE_PAGE_IDLETIME 60 + +typedef struct radix_tree radix_tree_t; +typedef struct radix_tree_node radix_tree_node_t; +typedef struct radix_tree_link radix_tree_link_t; +typedef struct radix_tree_leaf radix_tree_leaf_t; +typedef struct radix_tree_page radix_tree_page_t; + +typedef struct block_cache block_cache_t; +typedef struct block_cache_request block_cache_request_t; +typedef struct block_cache_stats block_cache_stats_t; + +struct radix_tree_page { + char 
*buf; + size_t size; + uint64_t sec; + radix_tree_link_t *owners[BLOCK_CACHE_NODES_PER_PAGE]; +}; + +struct radix_tree_leaf { + radix_tree_page_t *page; + char *buf; +}; + +struct radix_tree_link { + uint32_t time; + union { + radix_tree_node_t *next; + radix_tree_leaf_t leaf; + } u; +}; + +struct radix_tree_node { + int height; + radix_tree_link_t links[RADIX_TREE_NODE_SIZE]; +}; + +struct radix_tree { + int height; + uint64_t size; + uint32_t nodes; + radix_tree_node_t *root; + + block_cache_t *cache; +}; + +struct block_cache_request { + int err; + char *buf; + uint64_t secs; + td_request_t treq; + block_cache_t *cache; +}; + +struct block_cache_stats { + uint64_t reads; + uint64_t hits; + uint64_t misses; + uint64_t prunes; +}; + +struct block_cache { + int ptype; + char *name; + + uint64_t sectors; + + block_cache_request_t requests[BLOCK_CACHE_REQUESTS]; + block_cache_request_t *request_free_list[BLOCK_CACHE_REQUESTS]; + int requests_free; + + event_id_t timeout_id; + + radix_tree_t tree; + + block_cache_stats_t stats; +}; + +static inline uint64_t +radix_tree_calculate_size(int height) +{ + return (uint64_t)RADIX_TREE_NODE_SIZE << + (height * RADIX_TREE_NODE_SHIFT); +} + +static inline int +radix_tree_calculate_height(uint64_t sectors) +{ + int height; + uint64_t tree_size; + + height = 1; /* always allocate root node */ + tree_size = radix_tree_calculate_size(height); + while (sectors > tree_size) + tree_size = radix_tree_calculate_size(++height); + + return height; +} + +static inline int +radix_tree_index(radix_tree_node_t *node, uint64_t sector) +{ + return ((sector >> (node->height * RADIX_TREE_NODE_SHIFT)) & + RADIX_TREE_NODE_MASK); +} + +static inline int +radix_tree_node_contains_leaves(radix_tree_t *tree __attribute__((unused)), + radix_tree_node_t *node) +{ + return (node->height == 0); +} + +static inline int +radix_tree_node_is_root(radix_tree_t *tree, radix_tree_node_t *node) +{ + return (node->height == tree->height); +} + +static inline 
uint64_t +radix_tree_size(radix_tree_t *tree) +{ + return tree->size + tree->nodes * sizeof(radix_tree_node_t); +} + +static inline void +radix_tree_clear_link(radix_tree_link_t *link) +{ + if (link) + memset(link, 0, sizeof(radix_tree_link_t)); +} + +static inline radix_tree_node_t * +radix_tree_allocate_node(radix_tree_t *tree, int height) +{ + radix_tree_node_t *node; + + node = calloc(1, sizeof(radix_tree_node_t)); + if (!node) + return NULL; + + node->height = height; + tree->nodes++; + + return node; +} + +static inline radix_tree_node_t * +radix_tree_allocate_child_node(radix_tree_t *tree, radix_tree_node_t *parent) +{ + return radix_tree_allocate_node(tree, parent->height - 1); +} + +void +radix_tree_free_node(radix_tree_t *tree, radix_tree_node_t *node) +{ + if (!node) + return; + + free(node); + tree->nodes--; +} + +static inline radix_tree_page_t * +radix_tree_allocate_page(radix_tree_t *tree, + char *buf, uint64_t sec, size_t size) +{ + radix_tree_page_t *page; + + page = calloc(1, sizeof(radix_tree_page_t)); + if (!page) + return NULL; + + page->buf = buf; + page->sec = sec; + page->size = size; + tree->size += size; + + return page; +} + +static inline void +radix_tree_free_page(radix_tree_t *tree, radix_tree_page_t *page) +{ + int i; + + for (i = 0; i < page->size >> RADIX_TREE_NODE_SHIFT; i++) + DBG("%s: ejecting sector 0x%llx\n", + tree->cache->name, page->sec + i); + + tree->cache->stats.prunes += (page->size >> RADIX_TREE_NODE_SHIFT); + tree->size -= page->size; + free(page->buf); + free(page); +} + +/* + * remove a leaf and the shared radix_tree_page_t containing its buffer. + * leaves are deleted, nodes are not; gc will reap the nodes later. 
+ */ +static void +radix_tree_remove_page(radix_tree_t *tree, radix_tree_page_t *page) +{ + int i; + + if (!page) + return; + + for (i = 0; i < BLOCK_CACHE_NODES_PER_PAGE; i++) + radix_tree_clear_link(page->owners[i]); + + radix_tree_free_page(tree, page); +} + +static void +radix_tree_insert_leaf(radix_tree_t *tree __attribute__((unused)), + radix_tree_link_t *link, radix_tree_page_t *page, off_t off) +{ + int i; + + if (off + RADIX_TREE_NODE_SIZE > page->size) + return; + + for (i = 0; i < BLOCK_CACHE_NODES_PER_PAGE; i++) { + if (page->owners[i]) + continue; + + page->owners[i] = link; + link->u.leaf.page = page; + link->u.leaf.buf = page->buf + off; + + break; + } +} + +static char * +radix_tree_find_leaf(radix_tree_t *tree, uint64_t sector) +{ + int idx; + struct timeval now; + radix_tree_link_t *link; + radix_tree_node_t *node; + + node = tree->root; + gettimeofday(&now, NULL); + + do { + idx = radix_tree_index(node, sector); + link = node->links + idx; + link->time = now.tv_sec; + + if (radix_tree_node_contains_leaves(tree, node)) + return link->u.leaf.buf; + + if (!link->u.next) + return NULL; + + node = link->u.next; + } while (1); +} + +static char * +radix_tree_add_leaf(radix_tree_t *tree, uint64_t sector, + radix_tree_page_t *page, off_t off) +{ + int idx; + struct timeval now; + radix_tree_link_t *link; + radix_tree_node_t *node; + + node = tree->root; + gettimeofday(&now, NULL); + + do { + idx = radix_tree_index(node, sector); + link = node->links + idx; + link->time = now.tv_sec; + + if (radix_tree_node_contains_leaves(tree, node)) { + radix_tree_remove_page(tree, link->u.leaf.page); + radix_tree_insert_leaf(tree, link, page, off); + return link->u.leaf.buf; + } + + if (!link->u.next) { + link->u.next = radix_tree_allocate_child_node(tree, + node); + if (!link->u.next) + return NULL; + } + + node = link->u.next; + } while (1); +} + +static int +radix_tree_add_leaves(radix_tree_t *tree, char *buf, + uint64_t sector, uint64_t sectors) +{ + int i; + 
radix_tree_page_t *page; + + page = radix_tree_allocate_page(tree, buf, sector, + sectors << RADIX_TREE_NODE_SHIFT); + if (!page) + return -ENOMEM; + + for (i = 0; i < sectors; i++) + if (!radix_tree_add_leaf(tree, sector + i, + page, (i << RADIX_TREE_NODE_SHIFT))) + goto fail; + + return 0; + +fail: + page->buf = NULL; + radix_tree_remove_page(tree, page); + return -ENOMEM; +} + +static void +radix_tree_delete_branch(radix_tree_t *tree, radix_tree_node_t *node) +{ + int i; + radix_tree_link_t *link; + + if (!node) + return; + + for (i = 0; i < RADIX_TREE_NODE_SIZE; i++) { + link = node->links + i; + + if (radix_tree_node_contains_leaves(tree, node)) + radix_tree_remove_page(tree, link->u.leaf.page); + else + radix_tree_delete_branch(tree, link->u.next); + + radix_tree_clear_link(link); + } + + radix_tree_free_node(tree, node); +} + +static inline void +radix_tree_destroy(radix_tree_t *tree) +{ + radix_tree_delete_branch(tree, tree->root); + tree->root = NULL; +} + +/* + * returns 1 if @node is empty after pruning, 0 otherwise + */ +static int +radix_tree_prune_branch(radix_tree_t *tree, + radix_tree_node_t *node, uint32_t now) +{ + int i, empty; + radix_tree_link_t *link; + + empty = 1; + if (!node) + return empty; + + for (i = 0; i < RADIX_TREE_NODE_SIZE; i++) { + link = node->links + i; + + if (now - link->time < BLOCK_CACHE_PAGE_IDLETIME) { + if (radix_tree_node_contains_leaves(tree, node)) { + empty = 0; + continue; + } + + if (radix_tree_prune_branch(tree, link->u.next, now)) + radix_tree_clear_link(link); + else + empty = 0; + + continue; + } + + if (radix_tree_node_contains_leaves(tree, node)) + radix_tree_remove_page(tree, link->u.leaf.page); + else + radix_tree_delete_branch(tree, link->u.next); + + radix_tree_clear_link(link); + } + + if (empty && !radix_tree_node_is_root(tree, node)) + radix_tree_free_node(tree, node); + + return empty; +} + +/* + * walk tree and free any node that has been idle for too long + */ +static void 
+radix_tree_prune(radix_tree_t *tree) +{ + struct timeval now; + + if (!tree->root) + return; + + DPRINTF("tree %s has %"PRIu64" bytes\n", + tree->cache->name, tree->size); + + gettimeofday(&now, NULL); + radix_tree_prune_branch(tree, tree->root, now.tv_sec); + + DPRINTF("tree %s now has %"PRIu64" bytes\n", + tree->cache->name, tree->size); +} + +static inline int +radix_tree_initialize(radix_tree_t *tree, uint64_t sectors) +{ + tree->height = radix_tree_calculate_height(sectors); + tree->root = radix_tree_allocate_node(tree, tree->height); + if (!tree->root) + return -ENOMEM; + + return 0; +} + +static inline void +radix_tree_free(radix_tree_t *tree) +{ + radix_tree_destroy(tree); +} + +static void +block_cache_prune_event(event_id_t id __attribute__((unused)), + char mode __attribute__((unused)), void *private) +{ + radix_tree_t *tree; + block_cache_t *cache; + + cache = (block_cache_t *)private; + tree = &cache->tree; + + radix_tree_prune(tree); +} + +static inline block_cache_request_t * +block_cache_get_request(block_cache_t *cache) +{ + if (!cache->requests_free) + return NULL; + + return cache->request_free_list[--cache->requests_free]; +} + +static inline void +block_cache_put_request(block_cache_t *cache, block_cache_request_t *breq) +{ + memset(breq, 0, sizeof(block_cache_request_t)); + cache->request_free_list[cache->requests_free++] = breq; +} + +static int +block_cache_open(td_driver_t *driver, const char *name, td_flag_t flags) +{ + int i, err; + radix_tree_t *tree; + block_cache_t *cache; + + if (!td_flag_test(flags, TD_OPEN_RDONLY)) + return -EINVAL; + + if (driver->info.sector_size != RADIX_TREE_NODE_SIZE) + return -EINVAL; + + cache = (block_cache_t *)driver->data; + err = tapdisk_namedup(&cache->name, (char *)name); + if (err) + return -ENOMEM; + + cache->sectors = driver->info.size; + + tree = &cache->tree; + err = radix_tree_initialize(tree, cache->sectors); + if (err) + goto fail; + + tree->cache = cache; + cache->requests_free = 
BLOCK_CACHE_REQUESTS; + for (i = 0; i < BLOCK_CACHE_REQUESTS; i++) + cache->request_free_list[i] = cache->requests + i; + + cache->timeout_id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, + -1, /* dummy fd */ + BLOCK_CACHE_PAGE_IDLETIME << 1, + block_cache_prune_event, + cache); + if ((err = cache->timeout_id) < 0) /* propagate the failure; err was still 0 here */ + goto fail; + + DPRINTF("opening cache for %s, sectors: %"PRIu64", " + "tree: %p, height: %d\n", + cache->name, cache->sectors, tree, tree->height); + + if (mlockall(MCL_CURRENT | MCL_FUTURE)) + DPRINTF("mlockall failed: %d\n", -errno); + + return 0; + +fail: + free(cache->name); + radix_tree_free(&cache->tree); + return err; +} + +static int +block_cache_close(td_driver_t *driver, + struct tqh_td_image_handle *head __attribute__((unused))) +{ + radix_tree_t *tree; + block_cache_t *cache; + + cache = (block_cache_t *)driver->data; + tree = &cache->tree; + + DPRINTF("closing cache for %s\n", cache->name); + + tapdisk_server_unregister_event(cache->timeout_id); + radix_tree_free(tree); + free(cache->name); + + return 0; +} + +static inline uint64_t +block_cache_hash(block_cache_t *cache __attribute__((unused)), char *buf) +{ + int i, n; + uint64_t cksm, *data; + + return 0; + + cksm = 0; + data = (uint64_t *)buf; + n = RADIX_TREE_NODE_SIZE / sizeof(uint64_t); + + for (i = 0; i < n; i++) + cksm += data[i]; + + return ~cksm; +} + +static void +block_cache_hit(block_cache_t *cache, td_request_t treq, char *iov[]) +{ + int i; + off_t off; + + cache->stats.hits += treq.secs; + + for (i = 0; i < treq.secs; i++) { + DBG("%s: block cache hit: sec 0x%08llx, hash: 0x%08llx\n", + cache->name, treq.sec + i, block_cache_hash(cache, iov[i])); + + off = i << RADIX_TREE_NODE_SHIFT; + memcpy(treq.buf + off, iov[i], RADIX_TREE_NODE_SIZE); + } + + td_complete_request(treq, 0); +} + +static void +block_cache_populate_cache(td_request_t clone, int err) +{ + int i; + radix_tree_t *tree; + block_cache_t *cache; + block_cache_request_t *breq; + + breq = (block_cache_request_t 
*)clone.cb_data; + cache = breq->cache; + tree = &cache->tree; + breq->secs -= clone.secs; + breq->err = (breq->err ? breq->err : err); + + if (breq->secs) + return; + + if (breq->err) { + free(breq->buf); + goto out; + } + + for (i = 0; i < breq->treq.secs; i++) { + off_t off = i << RADIX_TREE_NODE_SHIFT; + DBG("%s: populating sec 0x%08llx\n", + cache->name, breq->treq.sec + i); + memcpy(breq->treq.buf + off, + breq->buf + off, RADIX_TREE_NODE_SIZE); + } + + if (radix_tree_add_leaves(tree, breq->buf, + breq->treq.sec, breq->treq.secs)) + free(breq->buf); + +out: + td_complete_request(breq->treq, breq->err); + block_cache_put_request(cache, breq); +} + +static void +block_cache_miss(block_cache_t *cache, td_request_t treq) +{ + void *buf; + size_t size; + td_request_t clone; + radix_tree_t *tree; + block_cache_request_t *breq; + + DBG("%s: block cache miss: sec 0x%08llx\n", cache->name, treq.sec); + + clone = treq; + tree = &cache->tree; + size = treq.secs << RADIX_TREE_NODE_SHIFT; + + cache->stats.misses += treq.secs; + + if (radix_tree_size(tree) + size >= BLOCK_CACHE_MAX_SIZE) + goto out; + + breq = block_cache_get_request(cache); + if (!breq) + goto out; + + if (posix_memalign(&buf, RADIX_TREE_NODE_SIZE, size)) { + block_cache_put_request(cache, breq); + goto out; + } + + breq->treq = treq; + breq->secs = treq.secs; + breq->err = 0; + breq->buf = buf; + breq->cache = cache; + + clone.buf = buf; + clone.cb = block_cache_populate_cache; + clone.cb_data = breq; + +out: + td_forward_request(clone); +} + +static void +block_cache_queue_read(td_driver_t *driver, td_request_t treq) +{ + int i; + radix_tree_t *tree; + block_cache_t *cache; + char *iov[BLOCK_CACHE_NODES_PER_PAGE]; + + cache = (block_cache_t *)driver->data; + tree = &cache->tree; + + cache->stats.reads += treq.secs; + + if (treq.secs > BLOCK_CACHE_NODES_PER_PAGE) + return td_forward_request(treq); + + for (i = 0; i < treq.secs; i++) { + iov[i] = radix_tree_find_leaf(tree, treq.sec + i); + if (!iov[i]) + 
return block_cache_miss(cache, treq); + } + + return block_cache_hit(cache, treq, iov); +} + +static void +block_cache_queue_write(td_driver_t *driver __attribute__((unused)), + td_request_t treq) +{ + td_complete_request(treq, -EPERM); +} + +static int +block_cache_get_parent_id(td_driver_t *driver __attribute__((unused)), + td_disk_id_t *id __attribute__((unused))) +{ + return -EINVAL; +} + +static int +block_cache_validate_parent(td_driver_t *driver, + td_driver_t *pdriver, td_flag_t flags __attribute__((unused))) +{ + if (!td_flag_test(pdriver->state, TD_DRIVER_RDONLY)) + return -EINVAL; + + if (strcmp(driver->name, pdriver->name)) + return -EINVAL; + + return 0; +} + +static void +block_cache_debug(td_driver_t *driver) +{ + block_cache_t *cache; + block_cache_stats_t *stats; + + cache = (block_cache_t *)driver->data; + stats = &cache->stats; + + WARN("BLOCK CACHE %s\n", cache->name); + WARN("reads: %"PRIu64", hits: %"PRIu64", " + "misses: %"PRIu64", prunes: %"PRIu64"\n", + stats->reads, stats->hits, stats->misses, stats->prunes); +} + +struct tap_disk tapdisk_block_cache = { + .disk_type = "tapdisk_block_cache", + .flags = 0, + .private_data_size = sizeof(block_cache_t), + .td_open = block_cache_open, + .td_close = block_cache_close, + .td_queue_read = block_cache_queue_read, + .td_queue_write = block_cache_queue_write, + .td_get_parent_id = block_cache_get_parent_id, + .td_validate_parent = block_cache_validate_parent, + .td_debug = block_cache_debug, +}; diff --git a/tools/blktap3/drivers/block-lcache.c b/tools/blktap3/drivers/block-lcache.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/block-lcache.c @@ -0,0 +1,380 @@ +/* + * Copyright (c) 2010, XenSource Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* Local persistent cache: write any sectors not found in the leaf back to the + * leaf. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <limits.h> +#include <sys/mman.h> +#include <sys/vfs.h> + +#include "vhd.h" +#include "tapdisk.h" +#include "tapdisk-utils.h" +#include "tapdisk-driver.h" +#include "tapdisk-server.h" +#include "tapdisk-interface.h" + +#define DEBUG 1 + +#ifdef DEBUG +#define DBG(_f, _a...) tlog_write(TLOG_DBG, _f, ##_a) +#else +#define DBG(_f, _a...) ((void)0) +#endif +#define WARN(_f, _a...) tlog_syslog(TLOG_WARN, "WARNING: "_f "in %s:%d", \ + ##_a, __func__, __LINE__) +#define INFO(_f, _a...) tlog_syslog(TLOG_INFO, _f, ##_a) +#define BUG() td_panic() +#define BUG_ON(_cond) if (unlikely(_cond)) { td_panic(); } +#define WARN_ON(_p) if (unlikely(_p)) { WARN("%s ", #_p); } + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +#define TD_LCACHE_MAX_REQ (MAX_REQUESTS*2) +#define TD_LCACHE_BUFSZ (MAX_SEGMENTS_PER_REQ * \ + sysconf(_SC_PAGE_SIZE)) + + +typedef struct lcache td_lcache_t; +typedef struct lcache_request td_lcache_req_t; + +struct lcache_request { + char *buf; + int err; + + td_request_t treq; + int secs; + + td_vbd_request_t vreq; + struct td_iovec iov; + + td_lcache_t *cache; +}; + +struct lcache { + char *name; + + td_lcache_req_t reqv[TD_LCACHE_MAX_REQ]; + td_lcache_req_t *free[TD_LCACHE_MAX_REQ]; + int n_free; + + char *buf; + size_t bufsz; + + int wr_en; + struct timeval ts; +}; + +static td_lcache_req_t * +lcache_alloc_request(td_lcache_t *cache) +{ + td_lcache_req_t *req = NULL; + + if (likely(cache->n_free)) + req = cache->free[--cache->n_free]; + + return req; +} + +static void +lcache_free_request(td_lcache_t *cache, td_lcache_req_t *req) +{ + BUG_ON(cache->n_free >= TD_LCACHE_MAX_REQ); + cache->free[cache->n_free++] = req; +} + +static void +lcache_destroy_buffers(td_lcache_t *cache) +{ + td_lcache_req_t *req; + + do { + req = lcache_alloc_request(cache); + if (req) + munmap(req->buf, TD_LCACHE_BUFSZ); + } while (req); 
+} + +static int +lcache_create_buffers(td_lcache_t *cache) +{ + int prot, flags, i, err; + + prot = PROT_READ|PROT_WRITE; + flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_LOCKED; + + cache->n_free = 0; + + for (i = 0; i < TD_LCACHE_MAX_REQ; i++) { + td_lcache_req_t *req = &cache->reqv[i]; + + req->buf = mmap(NULL, TD_LCACHE_BUFSZ, prot, flags, -1, 0); + if (req->buf == MAP_FAILED) { + req->buf = NULL; + err = -errno; + goto fail; + } + + lcache_free_request(cache, req); + } + + return 0; + +fail: + EPRINTF("Buffer init failure: %d", err); + lcache_destroy_buffers(cache); + return err; +} + +static int +lcache_close(td_driver_t *driver, + struct tqh_td_image_handle *head __attribute__((unused))) +{ + td_lcache_t *cache = driver->data; + + lcache_destroy_buffers(cache); + + free(cache->name); + + return 0; +} + +static int +lcache_open(td_driver_t *driver, const char *name, + td_flag_t flags __attribute__((unused))) +{ + td_lcache_t *cache = driver->data; + int err; + + err = tapdisk_namedup(&cache->name, (char *)name); + if (err) + goto fail; + + err = lcache_create_buffers(cache); + if (err) + goto fail; + + timerclear(&cache->ts); + cache->wr_en = 1; + + return 0; + +fail: + lcache_close(driver, NULL); + return err; +} + +/* + * NB. lcache->{wr_en,ts}: test free space in the caching SR before + * attempting to store our reads. VHD block allocation writes on Ext3 + * have the nasty property of blocking excessively after running out + * of space. We therefore enable/disable ourselves at a 1/s + * granularity, querying free space through statfs beforehand. 
+ */ + +static long +lcache_fs_bfree(const td_lcache_t *cache, long *bsize) +{ + struct statfs fst; + int err; + + err = statfs(cache->name, &fst); + if (err) + return err; + + if (likely(bsize)) + *bsize = fst.f_bsize; + + return MIN(fst.f_bfree, LONG_MAX); +} + +static int +__lcache_wr_enabled(const td_lcache_t *cache) +{ + long threshold = 2<<20; /* B */ + long bfree, bsz = 1; + int enable; + + bfree = lcache_fs_bfree(cache, &bsz); + enable = bfree > threshold / bsz; + + return enable; +} + +static int +lcache_wr_enabled(td_lcache_t *cache) +{ + const int timeout = 1; /* s */ + struct timeval now, delta; + + gettimeofday(&now, NULL); + timersub(&now, &cache->ts, &delta); + + if (delta.tv_sec >= timeout) { + cache->wr_en = __lcache_wr_enabled(cache); + cache->ts = now; + } + + return cache->wr_en; +} + +static void +__lcache_write_cb(td_vbd_request_t *vreq, int error, + void *token, int final __attribute__((unused))) +{ + td_lcache_req_t *req = containerof(vreq, td_lcache_req_t, vreq); + td_lcache_t *cache = token; + + if (error == -ENOSPC) + cache->wr_en = 0; + + lcache_free_request(cache, req); +} + +static void +lcache_store_read(td_lcache_t *cache, td_lcache_req_t *req) +{ + td_vbd_request_t *vreq; + struct td_iovec *iov; + td_vbd_t *vbd; + int err; + + iov = &req->iov; + iov->base = req->buf; + iov->secs = req->treq.secs; + + vreq = &req->vreq; + vreq->op = TD_OP_WRITE; + vreq->sec = req->treq.sec; + vreq->iov = iov; + vreq->iovcnt = 1; + vreq->cb = __lcache_write_cb; + vreq->token = cache; + + vbd = req->treq.vreq->vbd; + + err = tapdisk_vbd_queue_request(vbd, vreq); + BUG_ON(err); +} + +static void +lcache_complete_read(td_lcache_t *cache, td_lcache_req_t *req) +{ + if (likely(!req->err)) { + size_t sz = req->treq.secs << SECTOR_SHIFT; + memcpy(req->treq.buf, req->buf, sz); + } + + td_complete_request(req->treq, req->err); + + if (unlikely(req->err) || !lcache_wr_enabled(cache)) { + lcache_free_request(cache, req); + return; + } + + 
lcache_store_read(cache, req); +} + +static void +__lcache_read_cb(td_request_t treq, int err) +{ + td_lcache_req_t *req = treq.cb_data; + td_lcache_t *cache = req->cache; + + BUG_ON(req->secs < treq.secs); + req->secs -= treq.secs; + req->err = req->err ? : err; + + if (!req->secs) + lcache_complete_read(cache, req); +} + +static void +lcache_queue_read(td_driver_t *driver, td_request_t treq) +{ + td_lcache_t *cache = driver->data; + td_request_t clone; + td_lcache_req_t *req; + + req = lcache_alloc_request(cache); + if (!req) { + td_complete_request(treq, -EBUSY); + return; + } + + req->treq = treq; + req->cache = cache; + + req->secs = req->treq.secs; + req->err = 0; + + clone = treq; + clone.buf = req->buf; + clone.cb = __lcache_read_cb; + clone.cb_data = req; + + td_forward_request(clone); +} + +static int +lcache_get_parent_id(td_driver_t *driver __attribute__((unused)), + td_disk_id_t *id __attribute__((unused))) +{ + return -EINVAL; +} + +static int +lcache_validate_parent(td_driver_t *driver, + td_driver_t *pdriver, td_flag_t flags __attribute__((unused))) +{ + if (strcmp(driver->name, pdriver->name)) + return -EINVAL; + + return 0; +} + +struct tap_disk tapdisk_lcache = { + .disk_type = "tapdisk_lcache", + .flags = 0, + .private_data_size = sizeof(td_lcache_t), + .td_open = lcache_open, + .td_close = lcache_close, + .td_queue_read = lcache_queue_read, + .td_get_parent_id = lcache_get_parent_id, + .td_validate_parent = lcache_validate_parent, +}; diff --git a/tools/blktap3/drivers/block-llcache.c b/tools/blktap3/drivers/block-llcache.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/block-llcache.c @@ -0,0 +1,610 @@ +/* + * Copyright (c) 2010, XenSource Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <errno.h> + +#include "tapdisk.h" +#include "tapdisk-vbd.h" +#include "tapdisk-driver.h" +#include "tapdisk-interface.h" +#include "tapdisk-disktype.h" + +#define DBG(_f, _a...) tlog_syslog(TLOG_DBG, _f, ##_a) +#define INFO(_f, _a...) tlog_syslog(TLOG_INFO, _f, ##_a) +#define WARN(_f, _a...) 
tlog_syslog(TLOG_WARN, "WARNING: " _f " in %s:%d", \
+				##_a, __func__, __LINE__)
+
+#define BUG()           td_panic()
+/*
+ * NB. BUG_ON() and WARN_ON() expand to a bare if-block rather than
+ * do { } while (0), so that call sites written without a trailing
+ * semicolon keep compiling.
+ */
+#define BUG_ON(_cond)   if (unlikely(_cond)) { td_panic(); }
+/* Log the text of predicate _p when it holds. */
+#define WARN_ON(_p)     if (unlikely(_p)) { WARN("%s", #_p); }
+
+/*
+ * Fold the completion status @error of one sub-request into the status
+ * @curr accumulated so far. A non-zero @error is returned when nothing
+ * (or only -ENOSPC) has been recorded yet; in every other case the
+ * result is 0.
+ *
+ * NOTE(review): "every other case" includes @error == 0 with @curr
+ * already set, i.e. a later success clears an earlier failure. That
+ * looks unintended (returning @curr seems to be what was meant), but
+ * callers that reuse pooled requests would need their stale ->error
+ * fields reset first -- confirm before changing.
+ */
+int ll_write_error(int curr, int error)
+{
+	if (error && (!curr || curr == -ENOSPC))
+		return error;
+
+	return 0;
+}
+
+/*
+ * Log that writes are being redirected from @local to @shared because
+ * of @error (a negative errno value). @type is currently unused.
+ */
+void ll_log_switch(int type __attribute__((unused)), int error,
+		   td_image_t *local, td_image_t *shared)
+{
+	/* WARN() already prepends "WARNING: ". */
+	WARN("%s, on %s:%s. Switching to %s:%s.",
+	     strerror(-error),
+	     tapdisk_disk_types[local->type]->name, local->name,
+	     tapdisk_disk_types[shared->type]->name, shared->name);
+}
+
+/*
+ * LLP: Local leaf persistent cache
+ *      -- Persistent write caching in local storage.
+ *
+ *    VBD
+ *      \
+ *       +--r/w--> llp+vhd:/local/leaf
+ *        \
+ *         +--r/w--> vhd:/shared/leaf
+ *          \
+ *           +--r/o--> vhd:/shared/parent
+ *
+ * We drive two 'leaf' (r/w) images: One LOCAL (i.e. on local storage,
+ * unreliable and prone to out-of-space failures), and one SHARED
+ * (i.e. in shared storage with plenty of physical backing).
+ *
+ * All images are on a linear read chain: LOCAL inherits from SHARED,
+ * which inherits from a shared master image. This filter driver
+ * aggregates LOCAL. SHARED is our immediate parent, forced into R/W
+ * mode.
+ *
+ * Unless LOCAL failed, reads are issued to LOCAL, to save shared
+ * storage bandwidth. In case of failure, SHARED provides continued
+ * VDI consistency.
+ *
+ */
+enum {
+	LLP_MIRROR = 1,
+	/*
+	 * LLP_MIRROR:
+	 *
+	 * Writes are mirrored to both LOCAL and SHARED. Reads are
+	 * issued to LOCAL.
+	 *
+	 * Failures to write LOCAL are recoverable. The driver will
+	 * transition to LLP_SHARED.
+	 *
+	 * Failure to write SHARED is irrecoverable, and signaled to
+	 * the original issuer.
+	 */
+
+	LLP_SHARED = 2,
+	/*
+	 * LLP_SHARED:
+	 *
+	 * Writes are issued to SHARED only. As are reads.
+	 *
+	 * Failure to write SHARED is irrecoverable.
+	 */
+};
+
+typedef struct llpcache td_llpcache_t;
+typedef struct llpcache_request td_llpcache_req_t;
+#define TD_LLPCACHE_MAX_REQ (MAX_REQUESTS*2)
+
+/* One of the two clone writes issued on behalf of a mirrored write. */
+struct llpcache_vreq {
+	enum { LOCAL = 0, SHARED = 1 } target;
+	td_vbd_request_t vreq;
+};
+
+struct llpcache_request {
+	td_request_t treq;		/* original write, completed last */
+
+	struct td_iovec iov;
+	int error;			/* first error seen by a clone */
+
+	struct llpcache_vreq lvr[2];	/* indexed by LOCAL / SHARED */
+
+	unsigned int pending;		/* bit (1U << target) per clone in flight */
+	int mode;
+};
+
+struct llpcache {
+	td_image_t *local;
+	int mode;
+
+	td_llpcache_req_t reqv[TD_LLPCACHE_MAX_REQ];
+	td_llpcache_req_t *free[TD_LLPCACHE_MAX_REQ];
+	int n_free;
+};
+
+/* Pop a request from the free stack; NULL when the pool is exhausted. */
+static td_llpcache_req_t *
+llpcache_alloc_request(td_llpcache_t *s)
+{
+	td_llpcache_req_t *req = NULL;
+
+	if (likely(s->n_free))
+		req = s->free[--s->n_free];
+
+	return req;
+}
+
+/* Push @req back onto the free stack. */
+static void
+llpcache_free_request(td_llpcache_t *s, td_llpcache_req_t *req)
+{
+	BUG_ON(s->n_free >= TD_LLPCACHE_MAX_REQ);
+	s->free[s->n_free++] = req;
+}
+
+/*
+ * Completion of one clone write. -ENOSPC on LOCAL is absorbed and
+ * switches the driver to LLP_SHARED; any other error is recorded. The
+ * original treq completes once both clones are done.
+ */
+static void
+__llpcache_write_cb(td_vbd_request_t *vreq, int error,
+		    void *token, int final __attribute__((unused)))
+{
+	td_llpcache_t *s = token;
+	struct llpcache_vreq *lvr;
+	td_llpcache_req_t *req;
+	unsigned int mask;
+
+	lvr = containerof(vreq, struct llpcache_vreq, vreq);
+	req = containerof(lvr, td_llpcache_req_t, lvr[lvr->target]);
+
+	mask = 1U << lvr->target;
+	BUG_ON(!(req->pending & mask));
+
+	if (lvr->target == LOCAL && error == -ENOSPC) {
+		td_image_t *shared = TAILQ_NEXT(req->treq.image, entry);
+		ll_log_switch(DISK_TYPE_LLPCACHE, error,
+			      s->local, shared);
+		s->mode = LLP_SHARED;
+		error = 0;
+	}
+
+	req->pending &= ~mask;
+
+	/*
+	 * Keep the first real error. This is done inline instead of via
+	 * ll_write_error(), which returns 0 for a successful clone and
+	 * would thereby wipe an error recorded by the other clone.
+	 */
+	if (error && (!req->error || req->error == -ENOSPC))
+		req->error = error;
+
+	if (!req->pending) {
+		/* FIXME: Make sure this won't retry. */
+		td_complete_request(req->treq, req->error);
+		llpcache_free_request(s, req);
+	}
+}
+
+/*
+ * NB. Write mirroring. Lacking per-image queues, it's still a
+ * hack. But shall do for now:
+ *
+ *   1. Store the treq, thereby blocking the original vreq.
+ *   2. Reissue, as two clone vreqs. One local, one shared.
+ *   3. Clones seen again then get forwarded.
+ *   4. Treq completes after both vreqs.
+ *
+ * We can recognize clones by matching the vreq->token field.
+ */
+
+/*
+ * Queue the clone of @req aimed at @target (LOCAL or SHARED) on the
+ * VBD of the original request. Returns 0 and marks the clone pending,
+ * or records and returns the queueing error.
+ */
+static int
+llpcache_requeue_treq(td_llpcache_t *s, td_llpcache_req_t *req, int target)
+{
+	struct llpcache_vreq *lvr;
+	td_vbd_request_t *vreq;
+	int err;
+
+	lvr = &req->lvr[target];
+	lvr->target = target;
+
+	vreq = &lvr->vreq;
+	vreq->op = TD_OP_WRITE;
+	vreq->sec = req->treq.sec;
+	vreq->iov = &req->iov;
+	vreq->iovcnt = 1;
+	vreq->cb = __llpcache_write_cb;
+	vreq->token = s;
+
+	err = tapdisk_vbd_queue_request(req->treq.vreq->vbd, vreq);
+	if (err)
+		goto fail;
+
+	req->pending |= 1U << target;
+	return 0;
+
+fail:
+	req->error = req->error ? : err;
+	return err;
+}
+
+/*
+ * First sighting of a guest write: park it in a pooled request and
+ * reissue it as a LOCAL and a SHARED clone.
+ */
+static void
+llpcache_fork_write(td_llpcache_t *s, td_request_t treq)
+{
+	td_llpcache_req_t *req;
+	struct td_iovec *iov;
+	int err;
+
+	req = llpcache_alloc_request(s);
+	if (!req) {
+		td_complete_request(treq, -EBUSY);
+		return;
+	}
+
+	/*
+	 * Reset the whole pooled request. sizeof(req) would only cover
+	 * a pointer's worth of bytes and leave ->error from the
+	 * previous user in place.
+	 */
+	memset(req, 0, sizeof(*req));
+
+	req->treq = treq;
+
+	iov = &req->iov;
+	iov->base = treq.buf;
+	iov->secs = treq.secs;
+
+	err = llpcache_requeue_treq(s, req, LOCAL);
+	if (err)
+		goto fail;
+
+	err = llpcache_requeue_treq(s, req, SHARED);
+	if (err)
+		goto fail;
+
+	return;
+
+fail:
+	/* With a clone still in flight, its callback completes the treq. */
+	if (!req->pending) {
+		td_complete_request(treq, req->error);
+		llpcache_free_request(s, req);
+	}
+}
+
+/* A clone came back through the driver: route it to its target image. */
+static void
+llpcache_forward_write(td_llpcache_t *s, td_request_t treq)
+{
+	const td_vbd_request_t *vreq = treq.vreq;
+	struct llpcache_vreq *lvr;
+
+	lvr = containerof(vreq, struct llpcache_vreq, vreq);
+
+	switch (lvr->target) {
+	case SHARED:
+		td_forward_request(treq);
+		break;
+	case LOCAL:
+		td_queue_write(s->local, treq);
+		break;
+	default:
+		BUG();
+	}
+}
+
+/* Clones carry this driver's state as vreq token; all else is forked. */
+static void
+llpcache_queue_write(td_driver_t *driver, td_request_t treq)
+{
+	td_llpcache_t *s = driver->data;
+
+	if (treq.vreq->token == s)
+		llpcache_forward_write(s, treq);
+	else
+		llpcache_fork_write(s, treq);
+}
+
+static void
+llpcache_queue_read(td_driver_t *driver, td_request_t treq)
+{
+	td_llpcache_t *s = driver->data;
+
+	switch (s->mode) {
+	case LLP_MIRROR:
+		/* Mirroring: serve the read from LOCAL. */
+		td_queue_read(s->local, treq);
+		break;
+	case LLP_SHARED:
+		/* LOCAL was given up: pass the read down the chain. */
+		td_forward_request(treq);
+		break;	/* was missing, so every such read hit BUG() */
+	default:
+		BUG();
+	}
+}
+
+/* Close the aggregated LOCAL image, if one is open. Always returns 0. */
+static int
+llpcache_close(td_driver_t *driver, struct tqh_td_image_handle *head)
+{
+	td_llpcache_t *s = driver->data;
+
+	if (s->local) {
+		tapdisk_image_close(s->local, head);
+		s->local = NULL;
+	}
+
+	return 0;
+}
+
+/*
+ * Start in LLP_MIRROR mode, fill the request pool and open @name as the
+ * LOCAL VHD image. Returns 0 or the error from tapdisk_image_open().
+ */
+static int
+llpcache_open(td_driver_t *driver, const char *name, td_flag_t flags)
+{
+	td_llpcache_t *s = driver->data;
+	int i, err;
+
+	s->mode = LLP_MIRROR;
+
+	/* Do not depend on the private data having been zeroed. */
+	s->local = NULL;
+	s->n_free = 0;
+
+	for (i = 0; i < TD_LLPCACHE_MAX_REQ; i++)
+		llpcache_free_request(s, &s->reqv[i]);
+
+	err = tapdisk_image_open(DISK_TYPE_VHD, name, flags, &s->local);
+	if (err)
+		goto fail;
+
+	driver->info = s->local->driver->info;
+
+	return 0;
+
+fail:
+	llpcache_close(driver, NULL);
+	return err;
+}
+
+/*
+ * Report the parent of the aggregated image, with TD_OPEN_RDONLY
+ * cleared so the parent gets opened read/write.
+ *
+ * NOTE(review): tapdisk_llecache installs this handler as well, so
+ * driver->data is then a td_llecache_t accessed through ->local. That
+ * only works while both state structs start with their td_image_t
+ * pointer -- confirm this is intended.
+ */
+static int
+llcache_get_parent_id(td_driver_t *driver, td_disk_id_t *id)
+{
+	td_llpcache_t *s = driver->data;
+	int err;
+
+	err = td_get_parent_id(s->local, id);
+	if (!err)
+		id->flags &= ~TD_OPEN_RDONLY;
+
+	return err;
+}
+
+/* Parent validation is not implemented for the ll* drivers. */
+static int
+llcache_validate_parent(td_driver_t *driver __attribute__((unused)),
+			td_driver_t *pdriver __attribute__((unused)),
+			td_flag_t flags __attribute__((unused)))
+{
+	return -ENOSYS;
+}
+
+
+struct tap_disk tapdisk_llpcache = {
+	.disk_type                  = "tapdisk_llpcache",
+	.flags                      = 0,
+	.private_data_size          = sizeof(td_llpcache_t),
+	.td_open                    = llpcache_open,
+	.td_close                   = llpcache_close,
+	.td_queue_read              = llpcache_queue_read,
+	.td_queue_write             = llpcache_queue_write,
+	.td_get_parent_id           = llcache_get_parent_id,
+	.td_validate_parent         = llcache_validate_parent,
+};
+
+/*
+ * LLE: Local Leaf Ephemeral Cache
+ *      -- Non-persistent write caching in local storage.
+ * + * VBD + * \ + * +--r/w--> lle+vhd:/shared/leaf + * \ + * +--r/w--> vhd:/local/leaf + * \ + * +--r/o--> vhd:/shared/parent + * + * Note that LOCAL and SHARED chain order differs from LLP. Shared + * storage data masks local data. + * + * This means VDI state in shared storage state alone is + * inconsistent. Wherever local is unavailable, SHARED must be + * discarded too. + */ +enum { + LLE_LOCAL = 1, + /* + * LLE_LOCAL: + * + * Writes are forwarded to LOCAL only. As are reads. This + * reduces network overhead. + * + * Failure to write LOCAL is recoverable. The driver will + * transition to LLE_SHARED. + * + * Failure to write to shared are irrecoverable and signaled + * to the original issuer. + */ + + LLE_SHARED = 2, + /* + * LLE_SHARED: + * + * Writes are issued to SHARED. As are reads. + * + * Failure to write to SHARED is irrecoverable. + */ +}; + +typedef struct llecache td_llecache_t; +typedef struct llecache_request td_llecache_req_t; +#define TD_LLECACHE_MAX_REQ (MAX_REQUESTS*2) + +struct llecache_request { + td_llecache_t *s; + td_request_t treq; + int pending; + int error; +}; + +struct llecache { + td_image_t *shared; + int mode; + + td_llecache_req_t reqv[TD_LLECACHE_MAX_REQ]; + td_llecache_req_t *free[TD_LLECACHE_MAX_REQ]; + int n_free; +}; + +static td_llecache_req_t * +llecache_alloc_request(td_llecache_t *s) +{ + td_llecache_req_t *req = NULL; + + if (likely(s->n_free)) + req = s->free[--s->n_free]; + + return req; +} + +static void +llecache_free_request(td_llecache_t *s, td_llecache_req_t *req) +{ + BUG_ON(s->n_free >= TD_LLECACHE_MAX_REQ); + s->free[s->n_free++] = req; +} + +static int +llecache_close(td_driver_t *driver, struct tqh_td_image_handle *head) +{ + td_llecache_t *s = driver->data; + + if (s->shared) { + tapdisk_image_close(s->shared, head); + s->shared = NULL; + } + + return 0; +} + +static int +llecache_open(td_driver_t *driver, const char *name, td_flag_t flags) +{ + td_llecache_t *s = driver->data; + int i, err; + + s->mode 
= LLE_LOCAL;
+
+	for (i = 0; i < TD_LLECACHE_MAX_REQ; i++)
+		llecache_free_request(s, &s->reqv[i]);
+
+	err = tapdisk_image_open(DISK_TYPE_VHD, name, flags, &s->shared);
+	if (err)
+		goto fail;
+
+	driver->info = s->shared->driver->info;
+
+	return 0;
+
+fail:
+	llecache_close(driver, NULL);
+	return err;
+}
+
+/*
+ * Completion of (part of) a write forwarded to LOCAL. Once all sectors
+ * are accounted for, an -ENOSPC result switches the driver to
+ * LLE_SHARED and replays the original write there; anything else
+ * completes the original request with the accumulated status.
+ */
+static void
+__llecache_write_cb(td_request_t treq, int error)
+{
+	td_llecache_req_t *req = treq.cb_data;
+	td_llecache_t *s = req->s;
+
+	BUG_ON(req->pending < treq.secs);
+
+	req->pending -= treq.secs;
+
+	/*
+	 * Keep the first real error. This is done inline instead of via
+	 * ll_write_error(), which returns 0 for a successful segment
+	 * and would thereby wipe an error recorded by an earlier one.
+	 */
+	if (error && (!req->error || req->error == -ENOSPC))
+		req->error = error;
+
+	if (req->pending)
+		return;
+
+	if (req->error == -ENOSPC) {
+		ll_log_switch(DISK_TYPE_LLECACHE, req->error,
+			      treq.image, s->shared);
+
+		s->mode = LLE_SHARED;
+		td_queue_write(s->shared, req->treq);
+
+	} else
+		/* Report the accumulated status, not just the last segment's. */
+		td_complete_request(req->treq, req->error);
+
+	llecache_free_request(s, req);
+}
+
+/*
+ * Forward a write to LOCAL through a clone whose completion is
+ * intercepted by __llecache_write_cb().
+ */
+static void
+llecache_forward_write(td_llecache_t *s, td_request_t treq)
+{
+	td_llecache_req_t *req;
+	td_request_t clone;
+
+	req = llecache_alloc_request(s);
+	if (!req) {
+		td_complete_request(treq, -EBUSY);
+		return;
+	}
+
+	/*
+	 * Reset the whole pooled request. sizeof(req) would only cover
+	 * a pointer's worth of bytes, and ->error is not assigned below.
+	 */
+	memset(req, 0, sizeof(*req));
+
+	req->treq = treq;
+	req->pending = treq.secs;
+	req->s = s;
+
+	clone = treq;
+	clone.cb = __llecache_write_cb;
+	clone.cb_data = req;
+
+	td_forward_request(clone);
+}
+
+/* Writes go to LOCAL until it fails, then straight to SHARED. */
+static void
+llecache_queue_write(td_driver_t *driver, td_request_t treq)
+{
+	td_llecache_t *s = driver->data;
+
+	switch (s->mode) {
+	case LLE_LOCAL:
+		llecache_forward_write(s, treq);
+		break;
+	case LLE_SHARED:
+		td_queue_write(s->shared, treq);
+		break;
+	default:
+		/* as in llecache_queue_read(): never drop a request silently */
+		BUG();
+	}
+}
+
+/* Reads follow the same target as writes. */
+static void
+llecache_queue_read(td_driver_t *driver, td_request_t treq)
+{
+	td_llecache_t *s = driver->data;
+
+	switch (s->mode) {
+	case LLE_LOCAL:
+		td_forward_request(treq);
+		break;
+	case LLE_SHARED:
+		td_queue_read(s->shared, treq);
+		break;
+	default:
+		BUG();
+	}
+}
+
+struct tap_disk tapdisk_llecache = {
+	.disk_type                  = "tapdisk_llecache",
+	.flags                      = 0,
+	.private_data_size          = sizeof(td_llecache_t),
+	.td_open                    = 
llecache_open, + .td_close = llecache_close, + .td_queue_read = llecache_queue_read, + .td_queue_write = llecache_queue_write, + .td_get_parent_id = llcache_get_parent_id, + .td_validate_parent = llcache_validate_parent, +}; diff --git a/tools/blktap3/drivers/block-log.c b/tools/blktap3/drivers/block-log.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/block-log.c @@ -0,0 +1,692 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* Driver to sit on top of another disk and log writes, in order + * to synchronize two distinct disks + * + * On receipt of a control request it can export a list of dirty + * sectors in the following format: + * struct writerange { + * u64 sector; + * u32 count; + * } + * terminated by { 0, 0 } + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <stdio.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <sys/un.h> + +#include "log.h" +#include "tapdisk.h" +#include "tapdisk-server.h" +#include "tapdisk-driver.h" +#include "tapdisk-interface.h" + +#define MAX_CONNECTIONS 1 + +typedef struct poll_fd { + int fd; + event_id_t id; +} poll_fd_t; + +struct tdlog_state { + uint64_t size; + + void* writelog; + + char* ctlpath; + poll_fd_t ctl; + + int connected; + poll_fd_t connections[MAX_CONNECTIONS]; + + char* shmpath; + void* shm; + + log_sring_t* sring; + log_back_ring_t bring; +}; + +#define BDPRINTF(_f, _a...) syslog (LOG_DEBUG, "log: " _f "\n", ## _a) + +#define BWPRINTF(_f, _a...) 
syslog (LOG_WARNING, "log: " _f "\n", ## _a)
+
+static void ctl_accept(event_id_t, char, void *);
+static void ctl_request(event_id_t, char, void *);
+
+/* -- write log -- */
+
+/* large flat bitmaps don't scale particularly well either in size or scan
+ * time, but they'll do for now */
+#define BITS_PER_LONG (sizeof(unsigned long) * 8)
+#define BITS_TO_LONGS(bits) (((bits)+BITS_PER_LONG-1)/BITS_PER_LONG)
+
+#define BITMAP_ENTRY(_nr, _bmap) ((unsigned long*)(_bmap))[(_nr)/BITS_PER_LONG]
+#define BITMAP_SHIFT(_nr) ((_nr) % BITS_PER_LONG)
+
+/* bit numbers are sector numbers, so they are 64 bits wide like s->size */
+static inline int test_bit(uint64_t nr, void* bmap)
+{
+  return (BITMAP_ENTRY(nr, bmap) >> BITMAP_SHIFT(nr)) & 1;
+}
+
+static inline void clear_bit(uint64_t nr, void* bmap)
+{
+  BITMAP_ENTRY(nr, bmap) &= ~(1UL << BITMAP_SHIFT(nr));
+}
+
+static inline void set_bit(uint64_t nr, void* bmap)
+{
+  BITMAP_ENTRY(nr, bmap) |= (1UL << BITMAP_SHIFT(nr));
+}
+
+/* bytes needed for a bitmap of sz bits, rounded up to whole longs because
+ * BITMAP_ENTRY() accesses the map one unsigned long at a time */
+static inline uint64_t bitmap_size(uint64_t sz)
+{
+  return BITS_TO_LONGS(sz) * sizeof(unsigned long);
+}
+
+/* allocate a zeroed dirty bitmap with one bit per unit of s->size */
+static int writelog_create(struct tdlog_state *s)
+{
+  uint64_t bmsize;
+
+  bmsize = bitmap_size(s->size);
+
+  BDPRINTF("allocating %"PRIu64" bytes for dirty bitmap", bmsize);
+
+  if (!(s->writelog = calloc(bmsize, 1))) {
+    BWPRINTF("could not allocate dirty bitmap of size %"PRIu64, bmsize);
+    return -1;
+  }
+
+  return 0;
+}
+
+static int writelog_free(struct tdlog_state *s)
+{
+  /* free(NULL) is a no-op; clear the pointer so a second call is harmless */
+  free(s->writelog);
+  s->writelog = NULL;
+
+  return 0;
+}
+
+/* mark count sectors starting at sector as dirty. Sectors at or beyond
+ * s->size are not marked (the bitmap has no bits for them); -EINVAL is
+ * returned if any part of the range had to be dropped. */
+static int writelog_set(struct tdlog_state* s, uint64_t sector, int count)
+{
+  uint64_t i, n;
+  int rc = 0;
+
+  if (count <= 0)
+    return 0;
+
+  if (sector >= s->size)
+    return -EINVAL;
+
+  n = (uint64_t)count;
+  if (n > s->size - sector) {
+    n = s->size - sector;
+    rc = -EINVAL;
+  }
+
+  for (i = 0; i < n; i++)
+    set_bit(sector + i, s->writelog);
+
+  return rc;
+}
+
+/* clear bits [start, end); if end is 0 (or past the disk), clear to end
+ * of disk */
+int writelog_clear(struct tdlog_state* s, uint64_t start, uint64_t end)
+{
+  if (!end || end > s->size)
+    end = s->size;
+
+  /* clear bit by bit up to word boundaries; end is exclusive, so step
+   * back before clearing, and stop if the range closes inside a word */
+  while (start < end && BITMAP_SHIFT(start))
+    clear_bit(start++, s->writelog);
+  while (start < end && BITMAP_SHIFT(end))
+    clear_bit(--end, s->writelog);
+
+  /* both edges are now word aligned: wipe whole bytes in between */
+  if (start < end)
+    memset((char*)s->writelog + (start >> 3), 0, (end - start) >> 3);
+
+  return 0;
+}
+
+/* returns last block 
exported (may not be end of disk if shm region + * overflows) */ +static uint64_t writelog_export(struct tdlog_state* s) +{ + struct disk_range* range = s->shm; + uint64_t i = 0; + + BDPRINTF("sector count: %"PRIu64, s->size); + + for (i = 0; i < s->size; i++) { + if (test_bit(i, s->writelog)) { + /* range start */ + range->sector = i; + range->count = 1; + /* find end */ + for (i++; i < s->size && test_bit(i, s->writelog); i++) + range->count++; + + BDPRINTF("export: dirty extent %"PRIu64":%u", + range->sector, range->count); + range++; + + /* out of space in shared memory region */ + if ((void*)range >= bmend(s->shm)) { + BDPRINTF("out of space in shm region at sector %"PRIu64, i); + return i; + } + + /* undo forloop increment */ + i--; + } + } + + /* NULL-terminate range list */ + range->sector = 0; + range->count = 0; + + return i; +} + +/* -- communication channel -- */ + +/* remove FS special characters in up to len bytes of path */ +static inline void path_escape(char* path, size_t len) { + int i; + + for (i = 0; i < len && path[i]; i++) + if (strchr(":/", path[i])) + path[i] = ''_''; +} + +static char* ctl_makepath(const char* name, const char* ext) +{ + char* res; + char *file; + + file = strrchr(name, ''/''); + if (!file) { + BWPRINTF("invalid name %s\n", name); + return NULL; + } + + if (asprintf(&res, BLKTAP_CTRL_DIR "/log_%s.%s", file, ext) < 0) { + BWPRINTF("could not allocate path"); + return NULL; + } + + path_escape(res + strlen(BLKTAP_CTRL_DIR) + 5, strlen(file)); + + return res; +} + +static int shmem_open(struct tdlog_state* s, const char* name) +{ + int i, l, fd; + + /* device name -> path */ + if (asprintf(&s->shmpath, "/log_%s.wlog", name) < 0) { + BWPRINTF("could not allocate shm path"); + return -1; + } + + path_escape(s->shmpath + 5, strlen(name)); + + if ((fd = shm_open(s->shmpath, O_CREAT|O_RDWR, 0750)) < 0) { + BWPRINTF("could not open shared memory file %s: %s", s->shmpath, + strerror(errno)); + goto err; + } + if (ftruncate(fd, 
SHMSIZE) < 0) {
+    BWPRINTF("error truncating shmem to size %u", SHMSIZE);
+    close(fd);
+    goto err;
+  }
+
+  s->shm = mmap(NULL, SHMSIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
+  if (s->shm == MAP_FAILED) {
+    /* report before close() gets a chance to overwrite errno */
+    BWPRINTF("could not mmap write log shm: %s", strerror(errno));
+    close(fd);
+    goto err;
+  }
+  close(fd);
+  return 0;
+
+ err:
+  s->shm = NULL;
+  free(s->shmpath);
+  s->shmpath = NULL;
+  return -1;
+}
+
+/* unmap and unlink the shared memory region set up by shmem_open() */
+static int shmem_close(struct tdlog_state* s)
+{
+  if (s->shm) {
+    munmap(s->shm, SHMSIZE);
+    s->shm = NULL;
+  }
+
+  if (s->shmpath) {
+    shm_unlink(s->shmpath);
+    /* shmpath comes from asprintf() and is owned by us */
+    free(s->shmpath);
+    s->shmpath = NULL;
+  }
+
+  return 0;
+}
+
+/* control socket */
+
+/* create, bind and listen on the unix control socket for device name, and
+ * register ctl_accept() for incoming connections. Returns 0 or -1. */
+static int ctl_open(struct tdlog_state* s, const char* name)
+{
+  struct sockaddr_un saddr;
+  size_t pathlen;
+
+  if (!(s->ctlpath = ctl_makepath(name, "ctl")))
+    return -1;
+
+  if ((s->ctl.fd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
+    BWPRINTF("error opening control socket: %s", strerror(errno));
+    goto err;
+  }
+
+  /* sun_path is a small fixed array: refuse paths that do not fit
+   * together with their terminating NUL */
+  pathlen = strlen(s->ctlpath);
+  if (pathlen >= sizeof(saddr.sun_path)) {
+    BWPRINTF("control socket path %s is too long", s->ctlpath);
+    goto err_sock;
+  }
+
+  memset(&saddr, 0, sizeof(saddr));
+  saddr.sun_family = AF_UNIX;
+  memcpy(saddr.sun_path, s->ctlpath, pathlen);
+  if (unlink(s->ctlpath) && errno != ENOENT) {
+    BWPRINTF("error unlinking old socket path %s: %s", s->ctlpath,
+             strerror(errno));
+    goto err_sock;
+  }
+
+  if (bind(s->ctl.fd, (struct sockaddr*)&saddr, sizeof(saddr)) < 0) {
+    BWPRINTF("error binding control socket to %s: %s", s->ctlpath,
+             strerror(errno));
+    goto err_sock;
+  }
+
+  if (listen(s->ctl.fd, 1) < 0) {
+    BWPRINTF("error listening on control socket: %s", strerror(errno));
+    goto err_sock;
+  }
+
+  s->ctl.id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                            s->ctl.fd, 0, ctl_accept, s);
+  if (s->ctl.id < 0) {
+    /* failure is a negative id, presumably -errno: negate for strerror() */
+    BWPRINTF("error register event handler: %s", strerror(-s->ctl.id));
+    goto err_sock;
+  }
+
+  return 0;
+
+ err_sock:
+  close(s->ctl.fd);
+  s->ctl.fd = -1;
+ err:
+  free(s->ctlpath);
+  s->ctlpath = NULL;
+
+  return -1;
+}
+
+/* drop all control connections, then the listening socket and its path */
+static int ctl_close(struct tdlog_state* s)
+{
+  /* live entries are connections[0 .. connected-1]: step down before
+   * indexing, otherwise the slot one past the last entry is torn down */
+  while (s->connected) {
+    poll_fd_t* conn = &s->connections[--s->connected];
+
+    tapdisk_server_unregister_event(conn->id);
+    close(conn->fd);
+    conn->fd = -1;
+    conn->id = 0;
+  }
+
+  /* ctlpath is only set while ctl_open() has succeeded. Without it,
+   * ctl.fd may still be the 0 left by zero-initialisation, which must
+   * not be closed. */
+  if (s->ctlpath) {
+    if (s->ctl.fd >= 0) {
+      tapdisk_server_unregister_event(s->ctl.id);
+      close(s->ctl.fd);
+      s->ctl.fd = -1;
+      s->ctl.id = 0;
+    }
+
+    unlink(s->ctlpath);
+    free(s->ctlpath);
+    s->ctlpath = NULL;
+  }
+
+  /* XXX this must be fixed once requests are actually in flight */
+  /* could just drain the existing ring here first */
+  if (s->sring) {
+    SHARED_RING_INIT(s->sring);
+    BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE);
+  }
+
+  return 0;
+}
+
+/* walk list of open sockets, close matching fd */
+static int ctl_close_sock(struct tdlog_state* s, int fd)
+{
+  int i, last;
+
+  for (i = 0; i < s->connected; i++) {
+    if (s->connections[i].fd != fd)
+      continue;
+
+    tapdisk_server_unregister_event(s->connections[i].id);
+    close(s->connections[i].fd);
+
+    /* keep live entries contiguous: move the last one into the hole */
+    last = --s->connected;
+    s->connections[i] = s->connections[last];
+    s->connections[last].fd = -1;
+    s->connections[last].id = 0;
+    return 0;
+  }
+
+  BWPRINTF("requested to close unknown socket %d", fd);
+  return -1;
+}
+
+/* accept a control connection and register ctl_request() for it; further
+ * connections are refused while all connection slots are taken */
+static void ctl_accept(event_id_t id __attribute__((unused)),
+                       char mode __attribute__((unused)), void *private)
+{
+  struct tdlog_state* s = (struct tdlog_state *)private;
+  int fd;
+  event_id_t cid;
+
+  if ((fd = accept(s->ctl.fd, NULL, NULL)) < 0) {
+    BWPRINTF("error accepting control connection: %s", strerror(errno));
+    return;
+  }
+
+  if (s->connected >= MAX_CONNECTIONS) {
+    BWPRINTF("control session in progress, closing new connection");
+    close(fd);
+    return;
+  }
+
+  cid = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD,
+                                      fd, 0, ctl_request, s);
+  if (cid < 0) {
+    /* failure is a negative id, presumably -errno: negate for strerror() */
+    BWPRINTF("error registering connection event handler: %s", strerror(-cid));
+    close(fd);
+    return;
+  }
+
+  s->connections[s->connected].fd = fd;
+  s->connections[s->connected].id = cid;
+  s->connected++;
+}
+
+/* response format: 4 bytes shmsize, 0-terminated path */
+static int ctl_get_shmpath(struct tdlog_state* s, int fd)
+{
+  char msg[CTLRSPLEN_SHMP + 1];
+  uint32_t sz;
+  int rc;
+
+  BDPRINTF("ctl: sending 
shared memory parameters (size: %u, path: %s)", + SHMSIZE, s->shmpath); + + /* TMP: sanity-check shm */ + sz = 0xdeadbeef; + memcpy(s->shm, &sz, sizeof(sz)); + + sz = SHMSIZE; + memcpy(msg, &sz, sizeof(sz)); + snprintf(msg + sizeof(sz), sizeof(msg) - sizeof(sz), "%s", s->shmpath); + if ((rc = write(fd, msg, CTLRSPLEN_SHMP)) < 0) { + BWPRINTF("error writing shmpath: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int ctl_peek_writes(struct tdlog_state* s, int fd) +{ + int rc; + + BDPRINTF("ctl: peeking bitmap"); + + writelog_export(s); + + if ((rc = write(fd, "done", CTLRSPLEN_PEEK)) < 0) { + BWPRINTF("error writing peek ack: %s", strerror(errno)); + return -1; + } + + return 0; +} + +static int ctl_clear_writes(struct tdlog_state* s, int fd) +{ + int rc; + + BDPRINTF("ctl: clearing bitmap"); + + writelog_clear(s, 0, 0); + + if ((rc = write(fd, "done", CTLRSPLEN_CLEAR)) < 0) { + BWPRINTF("error writing clear ack: %s", strerror(errno)); + return -1; + } + + return 0; +} + +/* get dirty bitmap and clear it atomically */ +static int ctl_get_writes(struct tdlog_state* s, int fd) +{ + int rc; + + BDPRINTF("ctl: getting bitmap"); + + writelog_export(s); + writelog_clear(s, 0, 0); + + if ((rc = write(fd, "done", CTLRSPLEN_GET)) < 0) { + BWPRINTF("error writing get ack: %s", strerror(errno)); + return -1; + } + + return 0; +} + +/* get requests from ring */ +static int ctl_kick(struct tdlog_state* s, int fd) +{ + RING_IDX reqstart, reqend; + log_request_t req; + + /* XXX testing */ + RING_IDX rspstart, rspend; + log_response_t rsp; + struct log_ctlmsg msg; + int rc; + + reqstart = s->bring.req_cons; + reqend = s->sring->req_prod; + + BDPRINTF("ctl: ring kicked (start = %u, end = %u)", reqstart, reqend); + + while (reqstart != reqend) { + /* XXX actually submit these! 
*/ + memcpy(&req, RING_GET_REQUEST(&s->bring, reqstart), sizeof(req)); + BDPRINTF("ctl: read request %"PRIu64":%u", req.sector, req.count); + s->bring.req_cons = ++reqstart; + + rsp.sector = req.sector; + rsp.count = req.count; + memcpy(RING_GET_RESPONSE(&s->bring, s->bring.rsp_prod_pvt), &rsp, + sizeof(rsp)); + s->bring.rsp_prod_pvt++; + } + + RING_PUSH_RESPONSES(&s->bring); + memset(&msg, 0, sizeof(msg)); + memcpy(msg.msg, LOGCMD_KICK, 4); + if ((rc = write(fd, &msg, sizeof(msg))) < 0) { + BWPRINTF("error sending notify: %s", strerror(errno)); + return -1; + } else if (rc < sizeof(msg)) { + BWPRINTF("short notify write (%d/%zd)", rc, sizeof(msg)); + return -1; + } + + return 0; +} + +static int ctl_do_request(struct tdlog_state* s, int fd, struct log_ctlmsg* msg) +{ + if (!strncmp(msg->msg, LOGCMD_SHMP, 4)) { + return ctl_get_shmpath(s, fd); + } else if (!strncmp(msg->msg, LOGCMD_PEEK, 4)) { + return ctl_peek_writes(s, fd); + } else if (!strncmp(msg->msg, LOGCMD_CLEAR, 4)) { + return ctl_clear_writes(s, fd); + } else if (!strncmp(msg->msg, LOGCMD_GET, 4)) { + return ctl_get_writes(s, fd); + } else if (!strncmp(msg->msg, LOGCMD_KICK, 4)) { + return ctl_kick(s, fd); + } + + BWPRINTF("unknown control request %.4s", msg->msg); + return -1; +} + +static inline int ctl_find_connection(struct tdlog_state *s, event_id_t id) +{ + int i; + + for (i = 0; i < s->connected; i++) + if (s->connections[i].id == id) + return s->connections[i].fd; + + BWPRINTF("unrecognized event callback id %d", id); + return -1; +} + +static void ctl_request(event_id_t id, char mode, void *private) +{ + struct tdlog_state* s = (struct tdlog_state*)private; + struct log_ctlmsg msg; + int rc, i, fd = -1; + + fd = ctl_find_connection(s, id); + if (fd == -1) + return; + + if ((rc = read(fd, &msg, sizeof(msg))) < 0) { + BWPRINTF("error reading from ctl socket %d, closing: %s", fd, + strerror(errno)); + ctl_close_sock(s, fd); + return; + } else if (rc == 0) { + BDPRINTF("ctl_request: EOF, closing 
socket"); + ctl_close_sock(s, fd); + return; + } else if (rc < sizeof(msg)) { + BWPRINTF("short request received (%d/%zd bytes), ignoring", rc, + sizeof(msg)); + return; + } + + ctl_do_request(s, fd, &msg); +} + +/* -- interface -- */ + +static int tdlog_close(td_driver_t*); + +static int tdlog_open(td_driver_t* driver, const char* name, td_flag_t flags) +{ + struct tdlog_state* s = (struct tdlog_state*)driver->data; + int rc; + + memset(s, 0, sizeof(*s)); + + s->size = driver->info.size; + + if ((rc = writelog_create(s))) { + tdlog_close(driver); + return rc; + } + if ((rc = shmem_open(s, name))) { + tdlog_close(driver); + return rc; + } + if ((rc = ctl_open(s, name))) { + tdlog_close(driver); + return rc; + } + + s->sring = (log_sring_t*)sringstart(s->shm); + SHARED_RING_INIT(s->sring); + BACK_RING_INIT(&s->bring, s->sring, SRINGSIZE); + + BDPRINTF("opened ctl socket"); + + return 0; +} + +static int tdlog_close(td_driver_t* driver) +{ + struct tdlog_state* s = (struct tdlog_state*)driver->data; + + ctl_close(s); + shmem_close(s); + writelog_free(s); + + return 0; +} + +static void tdlog_queue_read(td_driver_t* driver, td_request_t treq) +{ + td_forward_request(treq); +} + +static void tdlog_queue_write(td_driver_t* driver, td_request_t treq) +{ + struct tdlog_state* s = (struct tdlog_state*)driver->data; + int rc; + + writelog_set(s, treq.sec, treq.secs); + td_forward_request(treq); +} + +static int tdlog_get_parent_id(td_driver_t* driver, td_disk_id_t* id) +{ + return -EINVAL; +} + +static int tdlog_validate_parent(td_driver_t *driver, + td_driver_t *parent, td_flag_t flags) +{ + return 0; +} + +struct tap_disk tapdisk_log = { + .disk_type = "tapdisk_log", + .private_data_size = sizeof(struct tdlog_state), + .flags = 0, + .td_open = tdlog_open, + .td_close = tdlog_close, + .td_queue_read = tdlog_queue_read, + .td_queue_write = tdlog_queue_write, + .td_get_parent_id = tdlog_get_parent_id, + .td_validate_parent = tdlog_validate_parent, +}; diff --git 
a/tools/blktap3/drivers/block-nbd.c b/tools/blktap3/drivers/block-nbd.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/block-nbd.c @@ -0,0 +1,908 @@ +/* + * Copyright (c) 2012, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <errno.h> +#include <stdio.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/types.h> +#include <netdb.h> +#include <arpa/inet.h> +#include <netinet/tcp.h> +#include <netinet/in.h> +#include "tapdisk.h" +#include "tapdisk-server.h" +#include "tapdisk-driver.h" +#include "tapdisk-interface.h" +#include "tapdisk-utils.h" +#include "tapdisk-fdreceiver.h" +#include "tapdisk-nbd.h" + +#define INFO(_f, _a...) tlog_syslog(TLOG_INFO, "nbd: " _f, ##_a) +#define ERROR(_f, _a...) tlog_syslog(TLOG_WARN, "nbd: " _f, ##_a) + +#define N_PASSED_FDS 10 +#define TAPDISK_NBDCLIENT_MAX_PATH_LEN 256 +#define TAPDISK_NBDCLIENT_LISTEN_SOCK_PATH "/var/run/blktap-control/nbdclient" +#define MAX_NBD_REQS TAPDISK_DATA_REQUESTS +#define NBD_TIMEOUT 30 + +/* + * We''ll only ever have one nbdclient fd receiver per tapdisk process, so let''s + * just store it here globally. We''ll also keep track of the passed fds here + * too. + */ + +struct td_fdreceiver *fdreceiver = NULL; + +struct tdnbd_passed_fd { + char id[40]; + struct timeval t; + int fd; +} passed_fds[N_PASSED_FDS]; + +struct nbd_queued_io { + char *buffer; + int len; + int so_far; +}; + +/* + * this creates "struct tqh_td_nbd_request" + */ +TAILQ_HEAD(tqh_td_nbd_request, td_nbd_request); + +struct td_nbd_request { + td_request_t treq; + struct nbd_request nreq; + int timeout_event; + int fake; + struct nbd_queued_io header; + struct nbd_queued_io body; /* in or out, depending on whether + type is read or write. 
*/ + TAILQ_ENTRY(td_nbd_request) queue; /* TODO rename to entry */ +}; + +struct tdnbd_data +{ + int writer_event_id; + struct tqh_td_nbd_request sent_reqs; + struct tqh_td_nbd_request pending_reqs; + struct tqh_td_nbd_request free_reqs; + struct td_nbd_request requests[MAX_NBD_REQS]; + int nr_free_count; + + int reader_event_id; + struct nbd_reply current_reply; + struct nbd_queued_io cur_reply_qio; + struct td_nbd_request *curr_reply_req; + + int socket; + struct sockaddr_in *remote; + char *peer_ip; + int port; + char *name; + + int flags; + int closed; +}; + +int global_id = 0; + +static void disable_write_queue(struct tdnbd_data *prv); + + +/* -- fdreceiver bits and pieces -- */ + +static void +tdnbd_stash_passed_fd(int fd, char *msg, void *data __attribute__((unused))) +{ + int free_index = -1; + int i; + for (i = 0; i < N_PASSED_FDS; i++) + if (passed_fds[i].fd == -1) { + free_index = i; + break; + } + + if (free_index == -1) { + ERROR("Error - more than %d fds passed! cannot stash another", + N_PASSED_FDS); + close(fd); + return; + } + + passed_fds[free_index].fd = fd; + strncpy(passed_fds[free_index].id, msg, + sizeof(passed_fds[free_index].id)); + gettimeofday(&passed_fds[free_index].t, NULL); + +} + +static int +tdnbd_retreive_passed_fd(const char *name) +{ + int fd, i; + + for (i = 0; i < N_PASSED_FDS; i++) { + if (strncmp(name, passed_fds[i].id, + sizeof(passed_fds[i].id)) == 0) { + fd = passed_fds[i].fd; + passed_fds[i].fd = -1; + return fd; + } + } + + ERROR("Couldn''t find the fd named: %s", name); + + return -1; +} + +void +tdnbd_fdreceiver_start(void) +{ + char fdreceiver_path[TAPDISK_NBDCLIENT_MAX_PATH_LEN]; + int i; + + /* initialise the passed fds list */ + for (i = 0; i < N_PASSED_FDS; i++) + passed_fds[i].fd = -1; + + snprintf(fdreceiver_path, TAPDISK_NBDCLIENT_MAX_PATH_LEN, + "%s%d", TAPDISK_NBDCLIENT_LISTEN_SOCK_PATH, getpid()); + + fdreceiver = td_fdreceiver_start(fdreceiver_path, + tdnbd_stash_passed_fd, NULL); + +} + +void 
+tdnbd_fdreceiver_stop(void) +{ + if (fdreceiver) + td_fdreceiver_stop(fdreceiver); +} + +static void +__cancel_req(int i, struct td_nbd_request *pos, int e) +{ + char handle[9]; + memcpy(handle, pos->nreq.handle, 8); + handle[8] = 0; + INFO("Entry %d: handle=''%s'' type=%d -- reporting errno: %d", + i, handle, ntohl(pos->nreq.type), e); + + if (pos->timeout_event >= 0) { + tapdisk_server_unregister_event(pos->timeout_event); + pos->timeout_event = -1; + } + + td_complete_request(pos->treq, e); +} + +static void +tdnbd_disable(struct tdnbd_data *prv, int e) +{ + struct td_nbd_request *pos, *q; + int i = 0; + + INFO("NBD client full-disable"); + + tapdisk_server_unregister_event(prv->writer_event_id); + tapdisk_server_unregister_event(prv->reader_event_id); + + TAILQ_FOREACH_SAFE(pos, &prv->sent_reqs, queue, q) + __cancel_req(i++, pos, e); + + TAILQ_FOREACH_SAFE(pos, &prv->pending_reqs, queue, q) + __cancel_req(i++, pos, e); + + INFO("Setting closed"); + prv->closed = 3; +} + +/* NBD writer queue */ + +/* Return code: how much is left to write, or a negative error code */ +static int +tdnbd_write_some(int fd, struct nbd_queued_io *data) +{ + int left = data->len - data->so_far; + int rc; + char *code; + + while (left > 0) { + rc = send(fd, data->buffer + data->so_far, left, 0); + + if (rc == -1) { + if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) + return left; + + code = strerror(errno); + ERROR("Bad return code %d from send (%s)", rc, + (code == 0 ? 
"unknown" : code)); + return rc; + } + + if (rc == 0) { + ERROR("Server shutdown prematurely in write_some"); + return -1; + } + + left -= rc; + data->so_far += rc; + } + + return left; +} + +static int +tdnbd_read_some(int fd, struct nbd_queued_io *data) +{ + int left = data->len - data->so_far; + int rc; + char *code; + + while (left > 0) { + rc = recv(fd, data->buffer + data->so_far, left, 0); + + if (rc == -1) { + + if ((errno == EAGAIN) || (errno == EWOULDBLOCK)) + return left; + + code = strerror(errno); + ERROR("Bad return code %d from send (%s)", rc, + (code == 0 ? "unknown" : code)); + return rc; + } + + if (rc == 0) { + ERROR("Server shutdown prematurely in read_some"); + return -1; + } + + data->so_far += rc; + left -= rc; + } + + return left; +} + +static void +tdnbd_timeout_cb(event_id_t eb, char mode __attribute__((unused)), void *data) +{ + struct tdnbd_data *prv = data; + ERROR("Timeout!: %d", eb); + tdnbd_disable(prv, ETIMEDOUT); +} + +static void +tdnbd_writer_cb(event_id_t eb __attribute__((unused)), + char mode __attribute__((unused)), void *data) +{ + struct td_nbd_request *pos, *q; + struct tdnbd_data *prv = data; + + TAILQ_FOREACH_SAFE(pos, &prv->pending_reqs, queue, q) { + if (tdnbd_write_some(prv->socket, &pos->header) > 0) + return; + + if (ntohl(pos->nreq.type) == NBD_CMD_WRITE) { + if (tdnbd_write_some(prv->socket, &pos->body) > 0) + return; + } + + if (ntohl(pos->nreq.type) == NBD_CMD_DISC) { + INFO("sent close request"); + /* + * We don''t expect a response from a DISC, so move the + * request back onto the free list + */ + TAILQ_MOVE_HEAD(pos, &prv->pending_reqs, &prv->free_reqs, queue); + prv->nr_free_count++; + prv->closed = 2; + } else + TAILQ_MOVE_HEAD(pos, &prv->pending_reqs, &prv->sent_reqs, queue); + } + + /* If we''re here, we''ve written everything */ + + disable_write_queue(prv); + + if (prv->closed == 2) + tdnbd_disable(prv, EIO); + + return; +} + +static int +enable_write_queue(struct tdnbd_data *prv) +{ + if 
(prv->writer_event_id >= 0) + return 0; + + prv->writer_event_id + tapdisk_server_register_event(SCHEDULER_POLL_WRITE_FD, + prv->socket, + 0, + tdnbd_writer_cb, + prv); + + return prv->writer_event_id; +} + +static void +disable_write_queue(struct tdnbd_data *prv) +{ + if (prv->writer_event_id < 0) + return; + + tapdisk_server_unregister_event(prv->writer_event_id); + + prv->writer_event_id = -1; +} + +static int +tdnbd_queue_request(struct tdnbd_data *prv, int type, uint64_t offset, + char *buffer, uint32_t length, td_request_t treq, int fake) +{ + struct td_nbd_request *req; + int id; + + if (prv->nr_free_count == 0) + return -EBUSY; + + if (prv->closed == 3) { + td_complete_request(treq, -ETIMEDOUT); + return -ETIMEDOUT; + } + + req = TAILQ_FIRST(&prv->free_reqs); + + /* fill in the request */ + + req->treq = treq; + id = global_id++; + snprintf(req->nreq.handle, 8, "td%05x", id % 0xffff); + + /* No response from a disconnect, so no need for a timeout */ + if (type != NBD_CMD_DISC) { + req->timeout_event = tapdisk_server_register_event( + SCHEDULER_POLL_TIMEOUT, + -1, /* dummy */ + NBD_TIMEOUT, + tdnbd_timeout_cb, + prv); + } else { + req->timeout_event = -1; + } + + INFO("request: %s timeout %d", req->nreq.handle, req->timeout_event); + + req->nreq.magic = htonl(NBD_REQUEST_MAGIC); + req->nreq.type = htonl(type); + req->nreq.from = htonll(offset); + req->nreq.len = htonl(length); + req->header.buffer = (char *)&req->nreq; + req->header.len = sizeof(req->nreq); + req->header.so_far = 0; + req->body.buffer = buffer; + req->body.len = length; + req->body.so_far = 0; + req->fake = fake; + + TAILQ_INSERT_TAIL(&prv->pending_reqs, req, queue); + prv->nr_free_count--; + + if (prv->writer_event_id < 0) + enable_write_queue(prv); + + return 0; +} + +/* NBD Reader callback */ + +static void +tdnbd_reader_cb(event_id_t eb __attribute__((unused)), + char mode __attribute__((unused)), void *data) +{ + char handle[9]; + int do_disable = 0; + + /* Check to see if we''re in the 
middle of reading a response already */ + struct tdnbd_data *prv = data; + int rc = tdnbd_read_some(prv->socket, &prv->cur_reply_qio); + + if (rc < 0) { + ERROR("Error reading reply header: %d", rc); + tdnbd_disable(prv, EIO); + return; + } + + if (rc > 0) + return; /* need more data */ + + /* Got a header. */ + if (prv->current_reply.error != 0) { + ERROR("Error in reply: %d", prv->current_reply.error); + tdnbd_disable(prv, EIO); + return; + } + + /* Have we found the request yet? */ + if (prv->curr_reply_req == NULL) { + struct td_nbd_request *pos, *q; + TAILQ_FOREACH_SAFE(pos, &prv->sent_reqs, queue, q) { + if (memcmp(pos->nreq.handle, prv->current_reply.handle, + 8) == 0) { + prv->curr_reply_req = pos; + break; + } + } + + if (prv->curr_reply_req == NULL) { + memcpy(handle, prv->current_reply.handle, 8); + handle[8] = 0; + + ERROR("Couldn''t find request corresponding to reply " + "(reply handle=''%s'')", handle); + tdnbd_disable(prv, EIO); + return; + } + } + + switch(ntohl(prv->curr_reply_req->nreq.type)) { + case NBD_CMD_READ: + rc = tdnbd_read_some(prv->socket, + &prv->curr_reply_req->body); + + if (rc < 0) { + ERROR("Error reading body of request: %d", rc); + tdnbd_disable(prv, EIO); + return; + } + + if (rc > 0) + return; /* need more data */ + + td_complete_request(prv->curr_reply_req->treq, 0); + + break; + case NBD_CMD_WRITE: + td_complete_request(prv->curr_reply_req->treq, 0); + + break; + default: + ERROR("Unhandled request response: %d", + ntohl(prv->curr_reply_req->nreq.type)); + do_disable = 1; + return; + } + + /* remove the state */ + TAILQ_MOVE_HEAD(prv->curr_reply_req, &prv->sent_reqs, &prv->free_reqs, + queue); + prv->nr_free_count++; + + prv->cur_reply_qio.so_far = 0; + if (prv->curr_reply_req->timeout_event >= 0) { + tapdisk_server_unregister_event( + prv->curr_reply_req->timeout_event); + } + + prv->curr_reply_req = NULL; + + /* + * NB: do this here otherwise we cancel the request that has just been + * moved + */ + if (do_disable) + 
tdnbd_disable(prv, EIO); +} + +static int +tdnbd_wait_read(int fd) +{ + struct timeval select_tv; + fd_set socks; + int rc; + + FD_ZERO(&socks); + FD_SET(fd, &socks); + select_tv.tv_sec = 10; + select_tv.tv_usec = 0; + rc = select(fd + 1, &socks, NULL, NULL, &select_tv); + return rc; +} + +static int +tdnbd_nbd_negotiate(struct tdnbd_data *prv, td_driver_t *driver) +{ +#define RECV_BUFFER_SIZE 256 + int rc; + char buffer[RECV_BUFFER_SIZE]; + uint64_t magic; + uint64_t size; + uint32_t flags; + int padbytes = 124; + int sock = prv->socket; + + /* + * NBD negotiation protocol: + * + * Server sends ''NBDMAGIC'' + * then it sends 0x00420281861253L + * then it sends a 64 bit bigendian size + * then it sends a 32 bit bigendian flags + * then it sends 124 bytes of nothing + */ + + /* + * We need to limit the time we spend in this function as we''re still + * using blocking IO at this point + */ + if (tdnbd_wait_read(sock) <= 0) { + ERROR("Timeout in nbd_negotiate"); + close(sock); + return -1; + } + + rc = recv(sock, buffer, 8, 0); + if (rc < 8) { + ERROR("Short read in negotiation(1) (%d)\n", rc); + close(sock); + return -1; + } + + if (memcmp(buffer, "NBDMAGIC", 8) != 0) { + buffer[8] = 0; + ERROR("Error in NBD negotiation: got ''%s''", buffer); + close(sock); + return -1; + } + + if (tdnbd_wait_read(sock) <= 0) { + ERROR("Timeout in nbd_negotiate"); + close(sock); + return -1; + } + + rc = recv(sock, &magic, sizeof(magic), 0); + if (rc < 8) { + ERROR("Short read in negotiation(2) (%d)\n", rc); + + return -1; + } + + if (ntohll(magic) != NBD_NEGOTIATION_MAGIC) { + ERROR("Not enough magic in negotiation(2) (%"PRIu64")\n", + ntohll(magic)); + close(sock); + return -1; + } + + if (tdnbd_wait_read(sock) <= 0) { + ERROR("Timeout in nbd_negotiate"); + close(sock); + return -1; + } + + rc = recv(sock, &size, sizeof(size), 0); + if (rc < sizeof(size)) { + ERROR("Short read in negotiation(3) (%d)\n", rc); + close(sock); + return -1; + } + + INFO("Got size: %"PRIu64"", 
ntohll(size)); + + driver->info.size = ntohll(size) >> SECTOR_SHIFT; + driver->info.sector_size = DEFAULT_SECTOR_SIZE; + driver->info.info = 0; + + if (tdnbd_wait_read(sock) <= 0) { + ERROR("Timeout in nbd_negotiate"); + close(sock); + return -1; + } + + rc = recv(sock, &flags, sizeof(flags), 0); + if (rc < sizeof(flags)) { + ERROR("Short read in negotiation(4) (%d)\n", rc); + close(sock); + return -1; + } + + INFO("Got flags: %"PRIu32"", ntohl(flags)); + + while (padbytes > 0) { + if (tdnbd_wait_read(sock) <= 0) { + ERROR("Timeout in nbd_negotiate"); + close(sock); + return -1; + } + + rc = recv(sock, buffer, padbytes, 0); + if (rc < 0) { + ERROR("Bad read in negotiation(5) (%d)\n", rc); + close(sock); + return -1; + } + padbytes -= rc; + } + + INFO("Successfully connected to NBD server"); + + fcntl(sock, F_SETFL, O_NONBLOCK); + + return 0; +} + +static int +tdnbd_connect_import_session(struct tdnbd_data *prv, td_driver_t* driver) +{ + int sock; + int opt = 1; + int rc; + + sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (sock < 0) { + ERROR("Could not create socket: %s\n", strerror(errno)); + return -1; + } + + rc = setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, (void *)&opt, + sizeof(opt)); + if (rc < 0) { + ERROR("Could not set TCP_NODELAY: %s\n", strerror(errno)); + return -1; + } + + prv->remote = (struct sockaddr_in *)malloc( + sizeof(struct sockaddr_in *)); + if (!prv->remote) { + ERROR("struct sockaddr_in malloc failure\n"); + close(sock); + return -1; + } + prv->remote->sin_family = AF_INET; + rc = inet_pton(AF_INET, prv->peer_ip, &(prv->remote->sin_addr.s_addr)); + if (rc < 0) { + ERROR("Could not create inaddr: %s\n", strerror(errno)); + free(prv->remote); + prv->remote = NULL; + close(sock); + return -1; + } + else if (rc == 0) { + ERROR("inet_pton parse error\n"); + free(prv->remote); + prv->remote = NULL; + close(sock); + return -1; + } + prv->remote->sin_port = htons(prv->port); + + if (connect(sock, (struct sockaddr *)prv->remote, + 
sizeof(struct sockaddr)) < 0) { + ERROR("Could not connect to peer: %s\n", strerror(errno)); + close(sock); + return -1; + } + + prv->socket = sock; + + return tdnbd_nbd_negotiate(prv, driver); +} + +/* -- interface -- */ + +static int tdnbd_close(td_driver_t*, struct tqh_td_image_handle *); + +static int +tdnbd_open(td_driver_t* driver, const char* name, td_flag_t flags) +{ + struct tdnbd_data *prv; + char peer_ip[256]; + int port; + int rc; + int i; + + driver->info.sector_size = 512; + driver->info.info = 0; + + prv = (struct tdnbd_data *)driver->data; + memset(prv, 0, sizeof(struct tdnbd_data)); + + INFO("Opening nbd export to %s (flags=%x)\n", name, flags); + + prv->writer_event_id = -1; + TAILQ_INIT(&prv->sent_reqs); + TAILQ_INIT(&prv->pending_reqs); + TAILQ_INIT(&prv->free_reqs); + for (i = 0; i < MAX_NBD_REQS; i++) { + prv->requests[i].timeout_event = -1; + TAILQ_INSERT_HEAD(&prv->free_reqs, &prv->requests[i], queue); + } + prv->nr_free_count = MAX_NBD_REQS; + prv->cur_reply_qio.buffer = (char *)&prv->current_reply; + prv->cur_reply_qio.len = sizeof(struct nbd_reply); + rc = sscanf(name, "%255[^:]:%d", peer_ip, &port); + if (rc == 2) { + prv->peer_ip = malloc(strlen(peer_ip) + 1); + if (!prv->peer_ip) { + ERROR("Failure to malloc for NBD destination"); + return -1; + } + strcpy(prv->peer_ip, peer_ip); + prv->port = port; + prv->name = NULL; + INFO("Export peer=%s port=%d\n", prv->peer_ip, prv->port); + if (tdnbd_connect_import_session(prv, driver) < 0) + return -1; + + } else { + prv->socket = tdnbd_retreive_passed_fd(name); + if (prv->socket < 0) { + ERROR("Couldn''t find fd named: %s", name); + return -1; + } + INFO("Found passed fd. 
Connecting..."); + prv->remote = NULL; + prv->peer_ip = NULL; + prv->name = strdup(name); + prv->port = -1; + if (tdnbd_nbd_negotiate(prv, driver) < 0) { + ERROR("Failed to negotiate"); + return -1; + } + } + + prv->reader_event_id + tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, + prv->socket, 0, + tdnbd_reader_cb, + (void *)prv); + + prv->flags = flags; + prv->closed = 0; + + if (flags & TD_OPEN_SECONDARY) + INFO("Opening in secondary mode: Read requests will be " + "forwarded"); + + return 0; + +} + +static int +tdnbd_close(td_driver_t* driver, + struct tqh_td_image_handle *head __attribute__((unused))) +{ + struct tdnbd_data *prv = (struct tdnbd_data *)driver->data; + td_request_t treq; + + bzero(&treq, sizeof(treq)); + + if (prv->closed == 3) { + INFO("NBD close: already decided that the connection is dead."); + if (prv->socket >= 0) + close(prv->socket); + prv->socket = -1; + return 0; + } + + /* Send a close packet */ + + INFO("Sending disconnect request"); + tdnbd_queue_request(prv, NBD_CMD_DISC, 0, 0, 0, treq, 0); + + INFO("Switching socket to blocking IO mode"); + fcntl(prv->socket, F_SETFL, fcntl(prv->socket, F_GETFL) & ~O_NONBLOCK); + + INFO("Writing disconnection request"); + tdnbd_writer_cb(0, 0, prv); + + INFO("Written"); + + if (prv->peer_ip) { + free(prv->peer_ip); + prv->peer_ip = NULL; + } + + if (prv->name) { + tdnbd_stash_passed_fd(prv->socket, prv->name, 0); + free(prv->name); + } else { + if (prv->socket >= 0) + close(prv->socket); + prv->socket = -1; + } + + return 0; +} + +static void +tdnbd_queue_read(td_driver_t* driver, td_request_t treq) +{ + struct tdnbd_data *prv = (struct tdnbd_data *)driver->data; + int size = treq.secs * driver->info.sector_size; + uint64_t offset = treq.sec * (uint64_t)driver->info.sector_size; + + if (prv->flags & TD_OPEN_SECONDARY) + td_forward_request(treq); + else + tdnbd_queue_request(prv, NBD_CMD_READ, offset, treq.buf, size, + treq, 0); + +} + +static void +tdnbd_queue_write(td_driver_t* driver, 
td_request_t treq) +{ + struct tdnbd_data *prv = (struct tdnbd_data *)driver->data; + int size = treq.secs * driver->info.sector_size; + uint64_t offset = treq.sec * (uint64_t)driver->info.sector_size; + + tdnbd_queue_request(prv, NBD_CMD_WRITE, + offset, treq.buf, size, treq, 0); +} + +static int +tdnbd_get_parent_id(td_driver_t* driver __attribute__((unused)), + td_disk_id_t* id __attribute__((unused))) +{ + return TD_NO_PARENT; +} + +static int +tdnbd_validate_parent(td_driver_t *driver __attribute__((unused)), + td_driver_t *parent __attribute__((unused)), + td_flag_t flags __attribute__((unused))) +{ + return -EINVAL; +} + +struct tap_disk tapdisk_nbd = { + .disk_type = "tapdisk_nbd", + .private_data_size = sizeof(struct tdnbd_data), + .flags = 0, + .td_open = tdnbd_open, + .td_close = tdnbd_close, + .td_queue_read = tdnbd_queue_read, + .td_queue_write = tdnbd_queue_write, + .td_get_parent_id = tdnbd_get_parent_id, + .td_validate_parent = tdnbd_validate_parent, +}; + diff --git a/tools/blktap3/drivers/block-ram.c b/tools/blktap3/drivers/block-ram.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/block-ram.c @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2007, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/statvfs.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <linux/fs.h> +#include <string.h> + +#include "tapdisk.h" +#include "tapdisk-driver.h" +#include "tapdisk-interface.h" + +void *img; +long int disksector_size; +long int disksize; +long int diskinfo; +static int connections = 0; + +struct tdram_state { + int fd; +}; + +/*Get Image size, secsize*/ +static int get_image_info(int fd, td_disk_info_t *info) +{ + int ret; + struct stat stat; + + ret = fstat(fd, &stat); + if (ret != 0) { + DPRINTF("ERROR: fstat failed, Couldn''t stat image"); + return -EINVAL; + } + + if (S_ISBLK(stat.st_mode)) { + /*Accessing block device directly*/ + info->size = 0; + if (ioctl(fd,BLKGETSIZE,&info->size)!=0) { + DPRINTF("ERR: BLKGETSIZE failed, couldn''t stat image"); + return -EINVAL; + } + + DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " + "sector_shift [%llu]\n", + (long long unsigned)(info->size << SECTOR_SHIFT), + (long long unsigned)info->size); + + /*Get the sector size*/ +#if 
defined(BLKSSZGET) + { + info->sector_size = DEFAULT_SECTOR_SIZE; + ioctl(fd, BLKSSZGET, &info->sector_size); + + if (info->sector_size != DEFAULT_SECTOR_SIZE) + DPRINTF("Note: sector size is %ld (not %d)\n", + info->sector_size, DEFAULT_SECTOR_SIZE); + } +#else + info->sector_size = DEFAULT_SECTOR_SIZE; +#endif + + } else { + /*Local file? try fstat instead*/ + info->size = (stat.st_size >> SECTOR_SHIFT); + info->sector_size = DEFAULT_SECTOR_SIZE; + DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " + "sector_shift [%llu]\n", + (long long unsigned)(info->size << SECTOR_SHIFT), + (long long unsigned)info->size); + } + + if (info->size == 0) { + info->size =((uint64_t) MAX_RAMDISK_SIZE); + info->sector_size = DEFAULT_SECTOR_SIZE; + } + info->info = 0; + + /*Store variables locally*/ + disksector_size = info->sector_size; + disksize = info->size; + diskinfo = info->info; + DPRINTF("Image sector_size: \n\t[%lu]\n", + info->sector_size); + + return 0; +} + +/* Open the disk file and initialize ram state. */ +int tdram_open (td_driver_t *driver, const char *name, td_flag_t flags) +{ + char *p; + uint64_t size; + int i, fd, ret = 0, count = 0, o_flags; + struct tdram_state *prv = (struct tdram_state *)driver->data; + + connections++; + + if (connections > 1) { + driver->info.sector_size = disksector_size; + driver->info.size = disksize; + driver->info.info = diskinfo; + DPRINTF("Image already open, returning parameters:\n"); + DPRINTF("Image size: \n\tpre sector_shift [%llu]\n\tpost " + "sector_shift [%llu]\n", + (long long unsigned)(driver->info.size << SECTOR_SHIFT), + (long long unsigned)driver->info.size); + DPRINTF("Image sector_size: \n\t[%lu]\n", + driver->info.sector_size); + + prv->fd = -1; + goto done; + } + + /* Open the file */ + o_flags = O_DIRECT | O_LARGEFILE | + ((flags == TD_OPEN_RDONLY) ? O_RDONLY : O_RDWR); + fd = open(name, o_flags); + + if ((fd == -1) && (errno == EINVAL)) { + + /* Maybe O_DIRECT isn''t supported. 
*/ + o_flags &= ~O_DIRECT; + fd = open(name, o_flags); + if (fd != -1) DPRINTF("WARNING: Accessing image without" + "O_DIRECT! (%s)\n", name); + + } else if (fd != -1) DPRINTF("open(%s) with O_DIRECT\n", name); + + if (fd == -1) { + DPRINTF("Unable to open [%s]!\n",name); + ret = 0 - errno; + goto done; + } + + prv->fd = fd; + + ret = get_image_info(fd, &driver->info); + size = MAX_RAMDISK_SIZE; + + if (driver->info.size > size) { + DPRINTF("Disk exceeds limit, must be less than [%d]MB", + (MAX_RAMDISK_SIZE<<SECTOR_SHIFT)>>20); + return -ENOMEM; + } + + /*Read the image into memory*/ + if (posix_memalign(&img, DEFAULT_SECTOR_SIZE, + driver->info.size << SECTOR_SHIFT)) { + DPRINTF("Mem malloc failed\n"); + return -errno; + } + p = img; + DPRINTF("Reading %llu bytes.......", + (long long unsigned)driver->info.size << SECTOR_SHIFT); + + for (i = 0; i < driver->info.size; i++) { + ret = read(prv->fd, p, driver->info.sector_size); + if (ret != driver->info.sector_size) { + DPRINTF("ret = %d, errno = %d\n", ret, errno); + ret = 0 - errno; + break; + } else { + count += ret; + p = img + count; + } + } + DPRINTF("[%d]\n",count); + if (count != driver->info.size << SECTOR_SHIFT) { + ret = -1; + } else { + ret = 0; + } + +done: + return ret; +} + +void tdram_queue_read(td_driver_t *driver, td_request_t treq) +{ + int size = treq.secs * driver->info.sector_size; + uint64_t offset = treq.sec * (uint64_t)driver->info.sector_size; + + memcpy(treq.buf, img + offset, size); + + td_complete_request(treq, 0); +} + +void tdram_queue_write(td_driver_t *driver, td_request_t treq) +{ + int size = treq.secs * driver->info.sector_size; + uint64_t offset = treq.sec * (uint64_t)driver->info.sector_size; + + /* We assume that write access is controlled + * at a higher level for multiple disks */ + memcpy(img + offset, treq.buf, size); + + td_complete_request(treq, 0); +} + +int tdram_close(td_driver_t *driver __attribute__((unused)), + struct tqh_td_image_handle *head 
__attribute__((unused))) +{ + connections--; + + return 0; +} + +int tdram_get_parent_id(td_driver_t *driver __attribute__((unused)), + td_disk_id_t *id __attribute__((unused))) +{ + return TD_NO_PARENT; +} + +int tdram_validate_parent(td_driver_t *driver __attribute__((unused)), + td_driver_t *pdriver __attribute__((unused)), + td_flag_t flags __attribute__((unused))) +{ + return -EINVAL; +} + +struct tap_disk tapdisk_ram = { + .disk_type = "tapdisk_ram", + .flags = 0, + .private_data_size = sizeof(struct tdram_state), + .td_open = tdram_open, + .td_close = tdram_close, + .td_queue_read = tdram_queue_read, + .td_queue_write = tdram_queue_write, + .td_get_parent_id = tdram_get_parent_id, + .td_validate_parent = tdram_validate_parent, + .td_debug = NULL, +}; diff --git a/tools/blktap3/drivers/block-valve.c b/tools/blktap3/drivers/block-valve.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/block-valve.c @@ -0,0 +1,703 @@ +/* + * Copyright (c) 2010, Citrix Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <sys/socket.h> +#include <sys/un.h> + +#include "tapdisk.h" +#include "tapdisk-driver.h" +#include "tapdisk-server.h" +#include "tapdisk-interface.h" +#include "block-valve.h" + +typedef struct td_valve td_valve_t; +typedef struct td_valve_request td_valve_request_t; + +TAILQ_HEAD(tqh_td_valve_request, td_valve_request); + +struct td_valve_request { + td_request_t treq; + int secs; + + TAILQ_ENTRY(td_valve_request) entry; + td_valve_t *valve; +}; + +struct td_valve_stats { + unsigned long long stor; + unsigned long long forw; +}; + +struct td_valve { + char *brname; + unsigned long flags; + + int sock; + event_id_t sock_id; + + event_id_t sched_id; + event_id_t retry_id; + + unsigned int cred; + unsigned int need; + unsigned int done; + + struct tqh_td_valve_request stor; + struct tqh_td_valve_request forw; + + td_valve_request_t reqv[MAX_REQUESTS]; + td_valve_request_t *free[MAX_REQUESTS]; + int n_free; + + struct td_valve_stats stats; +}; + +//list_for_each_entry_safe(_req, _next, &(_valve)->stor, entry) +#define td_valve_for_each_stored_request(_req, _next, _valve) \ + TAILQ_FOREACH_SAFE(_req, &(_valve)->stor, entry, _next) + +//list_for_each_entry_safe(_req, _next, &(_valve)->forw, entry) +#define td_valve_for_each_forwarded_request(_req, _next, _valve) \ + TAILQ_FOREACH_SAFE(_req, &(_valve)->forw, entry, _next) 
+ +#define TD_VALVE_CONNECT_INTERVAL 2 /* s */ + +#define TD_VALVE_RDLIMIT (1<<0) +#define TD_VALVE_WRLIMIT (1<<1) +#define TD_VALVE_KILLED (1<<31) + +static void valve_schedule_retry(td_valve_t *); +static void valve_conn_receive(td_valve_t *); +static void valve_conn_request(td_valve_t *, unsigned long); +static void valve_forward_stored_requests(td_valve_t *); +static void valve_kill(td_valve_t *); + +#define DBG(_f, _a...) if (1) { tlog_syslog(TLOG_DBG, _f, ##_a); } +#define INFO(_f, _a...) tlog_syslog(TLOG_INFO, "valve: " _f, ##_a) +#define WARN(_f, _a...) tlog_syslog(TLOG_WARN, "WARNING: "_f " in %s:%d", \ + ##_a, __func__, __LINE__) +#define ERR(_f, _a...) tlog_syslog(TLOG_WARN, "ERROR: " _f " in %s:%d", \ + ##_a, __func__, __LINE__) +#define VERR(_err, _f, _a...) tlog_syslog(TLOG_WARN, \ + "ERROR: err=%d (%s), " _f ".", \ + _err, strerror(-(_err)), ##_a) +#undef PERROR +#define PERROR(_f, _a...) VERR(-errno, _f, ##_a) + +#define BUG() do { \ + ERR("Aborting"); \ + td_panic(); \ + } while (0) + +#define BUG_ON(_cond) \ + if (unlikely(_cond)) { \ + ERR("(%s) = %ld", #_cond, (long)(_cond)); \ + BUG(); \ + } + +#define WARN_ON(_cond) ({ \ + int __cond = _cond; \ + if (unlikely(__cond)) \ + WARN("(%s) = %ld", #_cond, (long)(_cond)); \ + __cond; \ +}) + +#define TREQ_SIZE(_treq) ((unsigned int)(_treq.secs) << 9) + +static td_valve_request_t * +valve_alloc_request(td_valve_t *valve) +{ + td_valve_request_t *req = NULL; + + if (valve->n_free) + req = valve->free[--valve->n_free]; + + return req; +} + +static void +valve_free_request(td_valve_t *valve, td_valve_request_t *req, + struct tqh_td_valve_request *head) +{ + BUG_ON(valve->n_free >= ARRAY_SIZE(valve->free)); + if (head) + TAILQ_REMOVE(head, req, entry); + valve->free[valve->n_free++] = req; +} + +static void +__valve_sock_event(event_id_t id __attribute__((unused)), + char mode __attribute__((unused)), void *private) +{ + td_valve_t *valve = private; + + valve_conn_receive(valve); + + 
valve_forward_stored_requests(valve); +} + +/* Unmask the scheduler event so that pending "done" accounting gets flushed. */ +static void +valve_set_done_pending(td_valve_t *valve) +{ + WARN_ON(valve->done == 0); + tapdisk_server_mask_event(valve->sched_id, 0); +} + +/* Mask the scheduler event again once valve->done has been reset. */ +static void +valve_clear_done_pending(td_valve_t *valve) +{ + WARN_ON(valve->done != 0); + tapdisk_server_mask_event(valve->sched_id, 1); +} + +/* Scheduler callback: report completed work with a zero-sized request. */ +static void +__valve_sched_event(event_id_t id __attribute__((unused)), + char mode __attribute__((unused)), void *private) +{ + td_valve_t *valve = private; + + if (likely(valve->done > 0)) + /* flush valve->done */ + valve_conn_request(valve, 0); +} + +/* Close the socket and unregister the socket and scheduler events; each step is skipped when already released, so this is safe on a partially opened valve. */ +static void +valve_sock_close(td_valve_t *valve) +{ + if (valve->sock >= 0) { + close(valve->sock); + valve->sock = -1; + } + + if (valve->sock_id >= 0) { + tapdisk_server_unregister_event(valve->sock_id); + valve->sock_id = -1; + } + + if (valve->sched_id >= 0) { + tapdisk_server_unregister_event(valve->sched_id); + valve->sched_id = -1; + } +} + +/* Connect to the bridge socket and register the socket and scheduler events. Returns 0 on success or a negative errno value; on failure everything acquired so far is released again. */ +static int +valve_sock_open(td_valve_t *valve) +{ + struct sockaddr_un addr = { .sun_family = AF_UNIX }; + int s, id, err, len; + + s = socket(AF_UNIX, SOCK_STREAM, 0); + if (s < 0) { + /* capture errno before logging has a chance to overwrite it */ + err = -errno; + VERR(err, "socket"); + goto fail; + } + + valve->sock = s; + + /* absolute names are used as given, anything else is looked up under TD_VALVE_SOCKDIR */ + if (valve->brname[0] == '/') + len = snprintf(addr.sun_path, sizeof(addr.sun_path), + "%s", valve->brname); + else + len = snprintf(addr.sun_path, sizeof(addr.sun_path), + "%s/%s", TD_VALVE_SOCKDIR, valve->brname); + + /* a truncated path would name a different socket, so refuse it instead of connecting */ + if (len < 0 || (size_t)len >= sizeof(addr.sun_path)) { + err = -ENAMETOOLONG; + goto fail; + } + + err = connect(valve->sock, (const struct sockaddr *)&addr, sizeof(addr)); + if (err) { + err = -errno; + goto fail; + } + + id = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, + valve->sock, 0, + __valve_sock_event, + valve); + if (id < 0) { + err = id; + goto fail; + } + + valve->sock_id = id; + + id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, + -1, 0, + __valve_sched_event, + valve); + if (id < 0) { + err = id; + goto fail; + } + + valve->sched_id = id; + + INFO("Connected to %s", addr.sun_path); + + /* a fresh connection starts with no credit and nothing outstanding */ + valve->cred = 0; + valve->need = 0; + valve->done = 0; + + valve_clear_done_pending(valve); + + return 
0; + +fail: + valve_sock_close(valve); + return err; +} + +/* Send one message without blocking. Returns 0, a negative errno value, or -EPROTO when the message was only partly sent. */ +static int +valve_sock_send(td_valve_t *valve, const void *msg, size_t size) +{ + ssize_t n; + + n = send(valve->sock, msg, size, MSG_DONTWAIT); + if (n < 0) + return -errno; + /* n is non-negative here, so the conversion is safe */ + if ((size_t)n != size) + return -EPROTO; + + return 0; +} + +/* Receive without blocking. Returns the number of bytes read (0 when the peer closed) or a negative errno value. */ +static int +valve_sock_recv(td_valve_t *valve, void *msg, size_t size) +{ + ssize_t n; + + n = recv(valve->sock, msg, size, MSG_DONTWAIT); + if (n < 0) + return -errno; + + return n; +} + +/* Retry timer callback: try to connect again and drop the timer once that succeeds. */ +static void +__valve_retry_timeout(event_id_t id __attribute__((unused)), + char mode __attribute__((unused)), void *private) +{ + td_valve_t *valve = private; + int err; + + err = valve_sock_open(valve); + if (!err) { + tapdisk_server_unregister_event(valve->retry_id); + /* forget the id so that nobody unregisters it a second time */ + valve->retry_id = -1; + } +} + +/* Arm the reconnect timer; must only be called while disconnected. */ +static void +valve_schedule_retry(td_valve_t *valve) +{ + int id; + + BUG_ON(valve->sock_id >= 0); + + id = tapdisk_server_register_event(SCHEDULER_POLL_TIMEOUT, + -1, TD_VALVE_CONNECT_INTERVAL, + __valve_retry_timeout, + valve); + BUG_ON(id < 0); + + valve->retry_id = id; +} + +/* Connect to the bridge, falling back to periodic retries when that fails. */ +static void +valve_conn_open(td_valve_t *valve) +{ + int err; + + BUG_ON(valve->flags & TD_VALVE_KILLED); + + err = valve_sock_open(valve); + if (err) { + WARN("%s: %s", valve->brname, strerror(-err)); + valve_schedule_retry(valve); + } +} + +/* Tear the connection down. With reset set, every stored request is forwarded unthrottled so that nothing stays stuck waiting for credit. */ +static void +valve_conn_close(td_valve_t *valve, int reset) +{ + td_valve_request_t *req, *next; + + valve_sock_close(valve); + + /* cancel a pending reconnect timer, otherwise it would fire on a valve that has been closed */ + if (valve->retry_id >= 0) { + tapdisk_server_unregister_event(valve->retry_id); + valve->retry_id = -1; + } + + if (reset) + td_valve_for_each_stored_request(req, next, valve) { + td_forward_request(req->treq); + valve->stats.forw++; + valve_free_request(valve, req, &valve->stor); + } + + WARN_ON(!TAILQ_EMPTY(&valve->stor)); +} + +/* Drop the connection, flushing stored requests, and start over. */ +static void +valve_conn_reset(td_valve_t *valve) +{ + valve_conn_close(valve, 1); + valve_conn_open(valve); +} + +void +valve_conn_receive(td_valve_t *valve) +{ + unsigned long buf[32], cred = 0; + ssize_t n; + int i, err; + + n = valve_sock_recv(valve, buf, sizeof(buf)); + if (!n) { + err = -ECONNRESET; + goto reset; + } + + if (n < 0) { + err = n; + if (err != 
-EAGAIN) + goto reset; + } + + for (i = 0; i < n / sizeof(buf[0]); i++) { + err = WARN_ON(buf[i] >= TD_RLB_REQUEST_MAX); + if (err) + goto kill; + + cred += buf[i]; + } + + if (cred > valve->need) { + err = -EINVAL; + goto reset; + } + + valve->cred += cred; + valve->need -= cred; + + return; + +reset: + VERR(err, "resetting connection"); + valve_conn_reset(valve); + return; + +kill: + ERR("Killing valve."); + valve_kill(valve); +} + +static void +valve_conn_request(td_valve_t *valve, unsigned long size) +{ + struct td_valve_req _req; + int err; + + _req.need = size; + _req.done = valve->done; + + valve->need += size; + valve->done = 0; + + valve_clear_done_pending(valve); + + err = valve_sock_send(valve, &_req, sizeof(_req)); + if (!err) + return; + + VERR(err, "resetting connection"); + valve_conn_reset(valve); +} + +static int +valve_expend_request(td_valve_t *valve, const td_request_t treq) +{ + if (valve->flags & TD_VALVE_KILLED) + return 0; + + if (valve->sock < 0) + return 0; + + if (valve->cred < TREQ_SIZE(treq)) + return -EAGAIN; + + valve->cred -= TREQ_SIZE(treq); + + return 0; +} + +static void +__valve_complete_treq(td_request_t treq, int error) +{ + td_valve_request_t *req = treq.cb_data; + td_valve_t *valve = req->valve; + + BUG_ON(req->secs < treq.secs); + req->secs -= treq.secs; + + valve->done += TREQ_SIZE(treq); + valve_set_done_pending(valve); + + if (!req->secs) { + td_complete_request(req->treq, error); + valve_free_request(valve, req, &valve->forw); + } +} + +static void +valve_forward_stored_requests(td_valve_t *valve) +{ + td_valve_request_t *req, *next; + td_request_t clone; + int err; + + td_valve_for_each_stored_request(req, next, valve) { + + err = valve_expend_request(valve, req->treq); + if (err) + break; + + clone = req->treq; + clone.cb = __valve_complete_treq; + clone.cb_data = req; + + td_forward_request(clone); + valve->stats.forw++; + + //list_move(&req->entry, &valve->forw); + TAILQ_MOVE_HEAD(req, &valve->stor, &valve->forw, 
entry); + } +} + /* Park treq on the stor queue and ask the bridge for matching credit. Returns -EBUSY when no request slot is free, 0 otherwise. */ +static int +valve_store_request(td_valve_t *valve, td_request_t treq) +{ + td_valve_request_t *req; + + req = valve_alloc_request(valve); + if (!req) + return -EBUSY; + + valve_conn_request(valve, TREQ_SIZE(treq)); + + req->treq = treq; + req->secs = treq.secs; + + //list_add_tail(&req->entry, &valve->stor); + TAILQ_INSERT_TAIL(&valve->stor, req, entry); + valve->stats.stor++; + + return 0; +} + /* Mark the valve killed and close the connection, forwarding whatever is still stored. */ +static void +valve_kill(td_valve_t *valve) +{ + valve->flags |= TD_VALVE_KILLED; + valve_conn_close(valve, 1); +} + /* Zero the valve, mark the socket and all event ids unused (-1), and put every slot of reqv on the free list. */ +static void +valve_init(td_valve_t *valve, unsigned long flags) +{ + int i; + + memset(valve, 0, sizeof(*valve)); + + TAILQ_INIT(&valve->stor); + TAILQ_INIT(&valve->forw); + + valve->sock = -1; + valve->sock_id = -1; + + valve->retry_id = -1; + valve->sched_id = -1; + + valve->flags = flags; + + for (i = ARRAY_SIZE(valve->reqv) - 1; i >= 0; i--) { + td_valve_request_t *req = &valve->reqv[i]; + + req->valve = valve; + + valve_free_request(valve, req, NULL); + } +} + /* Driver close hook: warn if requests are still queued, close the connection without flushing, and free the bridge name. Always returns 0. */ +static int +td_valve_close(td_driver_t *driver, + struct tqh_td_image_handle *head __attribute__((unused))) +{ + td_valve_t *valve = driver->data; + + WARN_ON(!TAILQ_EMPTY(&valve->stor)); + WARN_ON(!TAILQ_EMPTY(&valve->forw)); + + valve_conn_close(valve, 0); + + if (valve->brname) { + free(valve->brname); + valve->brname = NULL; + } + + return 0; +} + /* Driver open hook: name is the bridge socket name. Only writes are limited (TD_VALVE_WRLIMIT). A failed connect is not an error here, since valve_conn_open falls back to retrying. */ +static int +td_valve_open(td_driver_t *driver, + const char *name, td_flag_t flags __attribute__((unused))) +{ + td_valve_t *valve = driver->data; + int err; + + valve_init(valve, TD_VALVE_WRLIMIT); + + valve->brname = strdup(name); + if (!valve->brname) { + err = -errno; + goto fail; + } + + valve_conn_open(valve); + + return 0; + +fail: + td_valve_close(driver, NULL); + return err; +} + /* Queue hook shared by reads and writes: operations whose limit flag is not set are forwarded directly. */ +static void +td_valve_queue_request(td_driver_t *driver, td_request_t treq) +{ + td_valve_t *valve = driver->data; + int err; + + switch (treq.op) { + + case TD_OP_READ: + if (valve->flags & TD_VALVE_RDLIMIT) + break; + + goto forward; + + case 
TD_OP_WRITE: + if (valve->flags & TD_VALVE_WRLIMIT) + break; + + goto forward; + + default: + BUG(); + } + + /* limited operation: forward it if credit allows, otherwise store it until credit arrives */ + err = valve_expend_request(valve, treq); + if (!err) + goto forward; + + err = valve_store_request(valve, treq); + if (err) + td_complete_request(treq, -EBUSY); + + return; + +forward: + td_forward_request(treq); + valve->stats.forw++; +} + +/* The valve has no parent image of its own. */ +static int +td_valve_get_parent_id(td_driver_t *driver __attribute__((unused)), + td_disk_id_t *id __attribute__((unused))) +{ + return -EINVAL; +} + +static int +td_valve_validate_parent(td_driver_t *driver __attribute__((unused)), + td_driver_t *parent_driver __attribute__((unused)), + td_flag_t flags __attribute__((unused))) +{ + return -EINVAL; +} + +/* Emit the valve state as stats fields: bridge name, flags, credit counters, and the stored and forwarded queues. */ +static void +td_valve_stats(td_driver_t *driver, td_stats_t *st) +{ + td_valve_t *valve = driver->data; + td_valve_request_t *req, *next; + int n_reqs; + + /* brname is a string and flags an unsigned long, so the conversions must say so */ + tapdisk_stats_field(st, "bridge", "s", valve->brname); + tapdisk_stats_field(st, "flags", "#lx", valve->flags); + + tapdisk_stats_field(st, "cred", "d", valve->cred); + tapdisk_stats_field(st, "need", "d", valve->need); + tapdisk_stats_field(st, "done", "d", valve->done); + + /* + * stored is [ waiting, total-waits ] + */ + + n_reqs = 0; + td_valve_for_each_stored_request(req, next, valve) + n_reqs++; + + tapdisk_stats_field(st, "stor", "["); + tapdisk_stats_val(st, "d", n_reqs); + tapdisk_stats_val(st, "llu", valve->stats.stor); + tapdisk_stats_leave(st, ']'); + + /* + * forwarded is [ in-flight, total-requests ] + */ + + n_reqs = 0; + td_valve_for_each_forwarded_request(req, next, valve) + n_reqs++; + + tapdisk_stats_field(st, "forw", "["); + tapdisk_stats_val(st, "d", n_reqs); + tapdisk_stats_val(st, "llu", valve->stats.forw); + tapdisk_stats_leave(st, ']'); +} + +struct tap_disk tapdisk_valve = { + .disk_type = "tapdisk_valve", + .flags = 0, + .private_data_size = sizeof(td_valve_t), + .td_open = td_valve_open, + .td_close = td_valve_close, + .td_queue_read = td_valve_queue_request, + .td_queue_write = 
td_valve_queue_request, + .td_get_parent_id = td_valve_get_parent_id, + .td_validate_parent = td_valve_validate_parent, + .td_stats = td_valve_stats, +}; diff --git a/tools/blktap3/drivers/block-valve.h b/tools/blktap3/drivers/block-valve.h new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/block-valve.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2011, Citrix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _TAPDISK_VALVE_H_ +#define _TAPDISK_VALVE_H_ + +#define TD_VALVE_SOCKDIR "/var/run/blktap/ratelimit" +#define TD_RLB_CONN_MAX 1024 +#define TD_RLB_REQUEST_MAX (8 << 20) + +struct td_valve_req { + unsigned long need; + unsigned long done; +}; + +#endif /* _TAPDISK_VALVE_H_ */ diff --git a/tools/blktap3/drivers/block-vhd.c b/tools/blktap3/drivers/block-vhd.c --- a/tools/blktap3/drivers/block-vhd.c +++ b/tools/blktap3/drivers/block-vhd.c @@ -763,7 +763,8 @@ vhd_log_close(struct vhd_state *s) } static int -_vhd_close(td_driver_t *driver) +_vhd_close(td_driver_t *driver, + struct tqh_td_image_handle *head __attribute__((unused))) { int err; struct vhd_state *s; diff --git a/tools/blktap3/drivers/block-vindex.c b/tools/blktap3/drivers/block-vindex.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/block-vindex.c @@ -0,0 +1,936 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <stdlib.h> + +#include "tapdisk.h" +#include "tapdisk-utils.h" +#include "tapdisk-driver.h" +#include "tapdisk-server.h" +#include "tapdisk-interface.h" + +#include "libvhd.h" +#include "libvhd-index.h" + +#define DBG(_level, _f, _a...) tlog_write(_level, _f, ##_a) +#define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a) +#define WARN(_f, _a...) 
tlog_write(TLOG_WARN, _f, ##_a) + +/* Log the failed condition and panic. Wrapped in do { } while (0) so that it is a single statement inside if/else. */ +#define ASSERT(condition) \ + do { \ + if (!(condition)) { \ + WARN("FAILED ASSERTION: '%s'\n", #condition); \ + td_panic(); \ + } \ + } while (0) + +#define VHD_INDEX_FILE_POOL_SIZE 12 +#define VHD_INDEX_CACHE_SIZE 4 +#define VHD_INDEX_REQUESTS (TAPDISK_DATA_REQUESTS + VHD_INDEX_CACHE_SIZE) + +/* vhd_index_block.state bits */ +#define VHD_INDEX_BLOCK_READ_PENDING 0x0001 +#define VHD_INDEX_BLOCK_VALID 0x0002 + +/* results of vhd_index_read_cache */ +#define VHD_INDEX_BAT_CLEAR 0 +#define VHD_INDEX_BIT_CLEAR 1 +#define VHD_INDEX_BIT_SET 2 +#define VHD_INDEX_CACHE_MISS 3 +#define VHD_INDEX_META_READ_PENDING 4 + +typedef struct vhd_index vhd_index_t; +typedef struct vhd_index_block vhd_index_block_t; +typedef struct vhd_index_request vhd_index_request_t; +typedef struct vhd_index_file_ref vhd_index_file_ref_t; + +TAILQ_HEAD(tqh_vhd_index_request, vhd_index_request); + +/* One request. index is set while the request is a scheduled read and is cleared again when the request is freed; file is the pooled file a data read goes to and stays NULL for requests that are merely queued behind a metadata read. */ +struct vhd_index_request { + off64_t off; + td_request_t treq; + vhd_index_t *index; + struct tiocb tiocb; + TAILQ_ENTRY(vhd_index_request) next; + vhd_index_file_ref_t *file; +}; + +/* One cached index block: blk is the block number, seqno the LRU stamp, state holds the VHD_INDEX_BLOCK_* bits, queue the requests waiting for the block to load, and req the embedded request used for the metadata read itself. */ +struct vhd_index_block { + uint64_t blk; + uint32_t seqno; + td_flag_t state; + vhdi_block_t vhdi_block; + int table_size; + struct tqh_vhd_index_request queue; + vhd_index_request_t req; +}; + +/* One slot of the open-file pool; fd is -1 while the slot is empty, and slots with a non-zero refcnt are never evicted. */ +struct vhd_index_file_ref { + int fd; + vhdi_file_id_t fid; + uint32_t seqno; + uint32_t refcnt; +}; + +/* Driver state: the loaded bat/index/file table, the file pool, the block cache with its free list, and the request pool with its free list. */ +struct vhd_index { + char *name; + + vhdi_bat_t bat; + vhdi_context_t vhdi; + vhdi_file_table_t files; + + vhd_index_file_ref_t fds[VHD_INDEX_FILE_POOL_SIZE]; + + vhd_index_block_t *cache[VHD_INDEX_CACHE_SIZE]; + + int cache_free_cnt; + vhd_index_block_t *cache_free_list[VHD_INDEX_CACHE_SIZE]; + vhd_index_block_t cache_list[VHD_INDEX_CACHE_SIZE]; + + int requests_free_cnt; + vhd_index_request_t *requests_free_list[VHD_INDEX_REQUESTS]; + vhd_index_request_t requests_list[VHD_INDEX_REQUESTS]; + + td_driver_t *driver; +}; + +static void vhd_index_complete_meta_read(void *, struct tiocb *, int); +static void vhd_index_complete_data_read(void *, struct tiocb *, int); + 
+//list_for_each_entry_safe((_req), (_tmp), &(_block)->queue, next) +#define vhd_index_block_for_each_request(_block, _req, _tmp) \ + TAILQ_FOREACH_SAFE((_req), &(_block)->queue, next, (_tmp)) + +/* Reset a request to all zeroes, which also clears its index and file pointers. */ +static inline void +vhd_index_initialize_request(vhd_index_request_t *req) +{ + memset(req, 0, sizeof(vhd_index_request_t)); +} + +/* Reset a cache block for reuse; the table buffer and its size are kept, only the contents are zeroed. */ +static inline void +vhd_index_initialize_block(vhd_index_block_t *block) +{ + block->blk = 0; + block->state = 0; + TAILQ_INIT(&block->queue); + vhd_index_initialize_request(&block->req); + /* during vhd_index_init no table has been allocated yet, and memset must not be handed a null pointer */ + if (block->vhdi_block.table) + memset(block->vhdi_block.table, 0, block->table_size); +} + +/* Zero the index and put every cache block and request on its free list; all file pool slots start out empty (fd == -1). */ +static void +vhd_index_init(vhd_index_t *index) +{ + int i; + + memset(index, 0, sizeof(vhd_index_t)); + + index->cache_free_cnt = VHD_INDEX_CACHE_SIZE; + for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) { + index->cache_free_list[i] = index->cache_list + i; + vhd_index_initialize_block(index->cache_free_list[i]); + } + + index->requests_free_cnt = VHD_INDEX_REQUESTS; + for (i = 0; i < VHD_INDEX_REQUESTS; i++) { + index->requests_free_list[i] = index->requests_list + i; + vhd_index_initialize_request(index->requests_free_list[i]); + } + + for (i = 0; i < VHD_INDEX_FILE_POOL_SIZE; i++) + index->fds[i].fd = -1; +} + +/* Allocate one sector-aligned table per cache block, sized for vhdi.spb entries. Returns 0 or -ENOMEM; on failure every table allocated so far is freed. */ +static int +vhd_index_allocate_cache(vhd_index_t *index) +{ + void *buf; + int i, err; + size_t size; + + size = vhd_bytes_padded(index->vhdi.spb * sizeof(vhdi_entry_t)); + + for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) { + err = posix_memalign(&buf, VHD_SECTOR_SIZE, size); + if (err) + goto fail; + + memset(buf, 0, size); + index->cache_list[i].vhdi_block.table = (vhdi_entry_t *)buf; + index->cache_list[i].vhdi_block.entries = index->vhdi.spb; + index->cache_list[i].table_size = size; + } + + return 0; + +fail: + /* tables that were never allocated are still NULL, which free accepts */ + for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) { + free(index->cache_list[i].vhdi_block.table); + index->cache_list[i].vhdi_block.table = NULL; + } + + return -ENOMEM; +} + +/* Release everything the index owns: cache tables, pooled file descriptors, file table, bat and name. */ +static void +vhd_index_free(vhd_index_t *index) +{ + int i; + + for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) + 
free(index->cache_list[i].vhdi_block.table); + + for (i = 0; i < VHD_INDEX_FILE_POOL_SIZE; i++) + if (index->fds[i].fd != -1) + close(index->fds[i].fd); + + vhdi_file_table_free(&index->files); + free(index->bat.table); + free(index->name); +} + +static int +vhd_index_load(vhd_index_t *index) +{ + int err; + + err = vhdi_bat_load(index->name, &index->bat); + if (err) + return err; + + err = vhdi_open(&index->vhdi, + index->bat.index_path, + O_RDONLY | O_DIRECT | O_LARGEFILE); + if (err) + goto fail; + + err = vhdi_file_table_load(index->bat.file_table_path, &index->files); + if (err) { + vhdi_close(&index->vhdi); + goto fail; + } + + return 0; + +fail: + free(index->bat.table); + memset(&index->bat, 0, sizeof(vhdi_bat_t)); + memset(&index->vhdi, 0, sizeof(vhdi_context_t)); + memset(&index->files, 0, sizeof(vhdi_file_table_t)); + return err; +} + +static int +vhd_index_open(td_driver_t *driver, const char *name, + td_flag_t flags __attribute__((unused))) +{ + int err; + vhd_index_t *index; + + index = (vhd_index_t *)driver->data; + + vhd_index_init(index); + + index->name = strdup(name); + if (!index->name) + return -ENOMEM; + + err = vhd_index_load(index); + if (err) { + free(index->name); + return err; + } + + err = vhd_index_allocate_cache(index); + if (err) { + vhd_index_free(index); + return err; + } + + driver->info.size = index->bat.vhd_blocks * index->bat.vhd_block_size; + driver->info.sector_size = VHD_SECTOR_SIZE; + driver->info.info = 0; + + index->driver = driver; + + DPRINTF("opened vhd index %s\n", name); + + return 0; +} + +static int +vhd_index_close(td_driver_t *driver, + struct tqh_td_image_handle *head __attribute__((unused))) +{ + vhd_index_t *index; + + index = (vhd_index_t *)driver->data; + vhdi_close(&index->vhdi); + + DPRINTF("closed vhd index %s\n", index->name); + + vhd_index_free(index); + + return 0; +} + +static inline void +vhd_index_touch_file_ref(vhd_index_t *index, vhd_index_file_ref_t *ref) +{ + int i; + + if (++ref->seqno == 
0xFFFFFFFF) + for (i = 0; i < VHD_INDEX_FILE_POOL_SIZE; i++) + index->fds[i].seqno >>= 1; +} + /* Take a reference on a pooled file; referenced slots are skipped by the LRU scan. */ +static inline void +vhd_index_get_file_ref(vhd_index_file_ref_t *ref) +{ + ++ref->refcnt; +} + /* Drop a reference taken with vhd_index_get_file_ref. */ +static inline void +vhd_index_put_file_ref(vhd_index_file_ref_t *ref) +{ + --ref->refcnt; +} + /* Pick the unreferenced pool slot with the smallest seqno, or NULL when every candidate is referenced. NOTE(review): the scan starts at slot 1, so slot 0 is never chosen -- confirm whether that is intended. */ +static inline vhd_index_file_ref_t * +vhd_index_find_lru_file_ref(vhd_index_t *index) +{ + int i; + uint32_t min; + vhd_index_file_ref_t *lru; + + lru = NULL; + min = (uint32_t)-1; + + for (i = 1; i < VHD_INDEX_FILE_POOL_SIZE; i++) { + if (index->fds[i].refcnt) + continue; + + if (!lru || index->fds[i].seqno < min) { + min = index->fds[i].seqno; + lru = index->fds + i; + } + } + + return lru; +} + /* Look id up in the file table and open that path read-only into ref. Returns 0, -ENOENT when the id is not in the table, or -errno from open. */ +static inline int +vhd_index_open_file(vhd_index_t *index, + vhdi_file_id_t id, vhd_index_file_ref_t *ref) +{ + int i; + char *path; + + path = NULL; + + for (i = 0; i < index->files.entries; i++) + if (index->files.table[i].file_id == id) { + path = index->files.table[i].path; + break; + } + + if (!path) + return -ENOENT; + + ref->fd = open(path, O_RDONLY | O_DIRECT | O_LARGEFILE); + if (ref->fd == -1) + return -errno; + + ref->fid = id; + ref->refcnt = 0; + + return 0; +} + /* Return a referenced pool entry for file id in *ref: reuse the slot that already holds it, otherwise evict the LRU slot and open the file there. Returns 0, -EBUSY when no slot can be evicted, or the error from opening (the slot is then left empty). */ +static int +vhd_index_get_file(vhd_index_t *index, + vhdi_file_id_t id, vhd_index_file_ref_t **ref) +{ + int i, err; + vhd_index_file_ref_t *lru; + + *ref = NULL; + + for (i = 0; i < VHD_INDEX_FILE_POOL_SIZE; i++) + if (id == index->fds[i].fid) { + *ref = index->fds + i; + vhd_index_touch_file_ref(index, *ref); + vhd_index_get_file_ref(*ref); + return 0; + } + + lru = vhd_index_find_lru_file_ref(index); + if (!lru) + return -EBUSY; + + if (lru->fd != -1) + close(lru->fd); + + err = vhd_index_open_file(index, id, lru); + if (err) + goto fail; + + vhd_index_touch_file_ref(index, lru); + vhd_index_get_file_ref(lru); + *ref = lru; + return 0; + +fail: + lru->fd = -1; + lru->fid = 0; + lru->refcnt = 0; + return err; +} + /* Pop a request off the free list; returns NULL when the pool is exhausted. */ +static inline vhd_index_request_t * +vhd_index_allocate_request(vhd_index_t *index) +{ + 
vhd_index_request_t *req; + + if (index->requests_free_cnt <= 0) + return NULL; + + req = index->requests_free_list[--index->requests_free_cnt]; + ASSERT(!req->index); + + return req; +} + /* Unlink req from head (when head is non-NULL), zero it and push it back on the free list. */ +static inline void +vhd_index_free_request(vhd_index_t *index, vhd_index_request_t *req, + struct tqh_vhd_index_request *head) +{ + if (head) + TAILQ_REMOVE(head, req, next); + vhd_index_initialize_request(req); + index->requests_free_list[index->requests_free_cnt++] = req; +} + /* Non-zero when the block is marked valid and has no read pending. */ +static inline int +vhd_index_block_valid(vhd_index_block_t *block) +{ + return (!td_flag_test(block->state, VHD_INDEX_BLOCK_READ_PENDING) && + td_flag_test(block->state, VHD_INDEX_BLOCK_VALID)); +} + /* Bump the LRU stamp of block; when it is about to wrap, halve every stamp so that the relative order is kept. */ +static inline void +vhd_index_touch_block(vhd_index_t *index, vhd_index_block_t *block) +{ + int i; + + if (++block->seqno == 0xFFFFFFFF) + for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) + index->cache_list[i].seqno >>= 1; +} + /* Remove and return the cached block with the smallest seqno that has no read pending; returns NULL when there is none. */ +static inline vhd_index_block_t * +vhd_index_get_lru_block(vhd_index_t *index) +{ + int i, idx; + uint32_t min; + vhd_index_block_t *block, *lru; + + lru = NULL; + min = (uint32_t)-1; + idx = 0; + + for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) { + block = index->cache[i]; + + if (!block) + continue; + + if (td_flag_test(block->state, VHD_INDEX_BLOCK_READ_PENDING)) + continue; + + if (!lru || block->seqno < min) { + lru = block; + min = block->seqno; + idx = i; + } + } + + if (lru) + index->cache[idx] = NULL; + + return lru; +} + /* Hand out a reset block in *block, taken from the free list or, failing that, evicted from the cache. Returns 0 or -EBUSY when nothing can be evicted. */ +static inline int +vhd_index_allocate_block(vhd_index_t *index, vhd_index_block_t **block) +{ + vhd_index_block_t *b; + + *block = NULL; + + if (index->cache_free_cnt > 0) + b = index->cache_free_list[--index->cache_free_cnt]; + else { + b = vhd_index_get_lru_block(index); + if (!b) + return -EBUSY; + } + + vhd_index_initialize_block(b); + vhd_index_touch_block(index, b); + *block = b; + + return 0; +} + /* Allocate a block for blk and place it in the first empty cache slot; the new block is returned in *block. */ +static int +vhd_index_install_block(vhd_index_t *index, + vhd_index_block_t **block, uint32_t blk) +{ + int i, err; + vhd_index_block_t *b; + + *block = NULL; 
+ + err = vhd_index_allocate_block(index, &b); + if (err) + return err; + + b->blk = blk; + + for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) + if (!index->cache[i]) { + index->cache[i] = b; + break; + } + + ASSERT(i < VHD_INDEX_CACHE_SIZE); + *block = b; + + return 0; +} + /* Find the cached block numbered blk; returns NULL when it is not cached. */ +static inline vhd_index_block_t * +vhd_index_get_block(vhd_index_t *index, uint32_t blk) +{ + int i; + vhd_index_block_t *block; + + for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) { + block = index->cache[i]; + if (!block) + continue; + + if (block->blk == blk) + return block; + } + + return NULL; +} + /* Classify sector against the bat and the block cache. Returns -EINVAL when the sector lies beyond the bat, otherwise one of the VHD_INDEX_* result codes. A cache hit also refreshes the block's LRU stamp. */ +static int +vhd_index_read_cache(vhd_index_t *index, uint64_t sector) +{ + uint32_t blk, sec; + vhd_index_block_t *block; + + blk = sector / index->vhdi.spb; + + if (blk >= index->bat.vhd_blocks) + return -EINVAL; + + if (index->bat.table[blk] == DD_BLK_UNUSED) + return VHD_INDEX_BAT_CLEAR; + + block = vhd_index_get_block(index, blk); + if (!block) + return VHD_INDEX_CACHE_MISS; + + vhd_index_touch_block(index, block); + + if (td_flag_test(block->state, VHD_INDEX_BLOCK_READ_PENDING)) + return VHD_INDEX_META_READ_PENDING; + + sec = sector % index->vhdi.spb; + if (block->vhdi_block.table[sec].offset == DD_BLK_UNUSED) + return VHD_INDEX_BIT_CLEAR; + + return VHD_INDEX_BIT_SET; +} + /* Count how many consecutive sectors starting at sector, at most secs and never past the end of the block, are all present (value == 1) or all absent (value == 0). The block must be cached and valid. */ +static int +vhd_index_read_cache_span(vhd_index_t *index, + uint64_t sector, int secs, int value) +{ + int i; + uint32_t blk, sec; + vhd_index_block_t *block; + + blk = sector / index->vhdi.spb; + sec = sector % index->vhdi.spb; + + ASSERT(blk < index->bat.vhd_blocks); + + block = vhd_index_get_block(index, blk); + ASSERT(block && vhd_index_block_valid(block)); + + for (i = 0; i < secs && i + sec < index->vhdi.spb; i++) + if (value ^ + (block->vhdi_block.table[sec + i].offset != DD_BLK_UNUSED)) + break; + + return i; +} + /* Queue a read of the index block blk into a cache block, installing one first when blk is not cached. Returns 0 or the error from installing the block. */ +static int +vhd_index_schedule_meta_read(vhd_index_t *index, uint32_t blk) +{ + int err; + off64_t offset; + vhd_index_block_t *block; + vhd_index_request_t *req; + + ASSERT(index->bat.table[blk] != 
DD_BLK_UNUSED); + + block = vhd_index_get_block(index, blk); + if (!block) { + err = vhd_index_install_block(index, &block, blk); + if (err) + return err; + } + + offset = vhd_sectors_to_bytes(index->bat.table[blk]); + /* the block's embedded request carries the metadata read */ + req = &block->req; + req->index = index; + req->treq.sec = blk * index->vhdi.spb; + req->treq.secs = block->table_size >> VHD_SECTOR_SHIFT; + + td_prep_read(&req->tiocb, index->vhdi.fd, + (char *)block->vhdi_block.table, block->table_size, + offset, vhd_index_complete_meta_read, req); + td_queue_tiocb(index->driver, &req->tiocb); + + td_flag_set(block->state, VHD_INDEX_BLOCK_READ_PENDING); + + return 0; +} + /* Queue a read of treq's sectors from the file that the index entry of the first sector points to. The block must be cached and valid, and every sector of the span must be present. Returns 0, -EBUSY when no request is free, or the error from vhd_index_get_file. */ +static int +vhd_index_schedule_data_read(vhd_index_t *index, td_request_t treq) +{ + int i, err; + size_t size; + off64_t offset; + uint32_t blk, sec; + vhd_index_block_t *block; + vhd_index_request_t *req; + vhd_index_file_ref_t *file; + + blk = treq.sec / index->vhdi.spb; + sec = treq.sec % index->vhdi.spb; + block = vhd_index_get_block(index, blk); + + ASSERT(block && vhd_index_block_valid(block)); + for (i = 0; i < treq.secs; i++) { + ASSERT(block->vhdi_block.table[sec + i].file_id != 0); + ASSERT(block->vhdi_block.table[sec + i].offset != DD_BLK_UNUSED); + } + + req = vhd_index_allocate_request(index); + if (!req) + return -EBUSY; + + err = vhd_index_get_file(index, + block->vhdi_block.table[sec].file_id, &file); + if (err) { + vhd_index_free_request(index, req, NULL); + return err; + } + + size = vhd_sectors_to_bytes(treq.secs); + offset = vhd_sectors_to_bytes(block->vhdi_block.table[sec].offset); + + req->file = file; + req->treq = treq; + req->index = index; + req->off = offset; + + td_prep_read(&req->tiocb, file->fd, treq.buf, size, offset, + vhd_index_complete_data_read, req); + td_queue_tiocb(index->driver, &req->tiocb); + + return 0; +} + /* Park treq behind the pending metadata read of its block. Returns 0 or -EBUSY when no request is free. */ +static int +vhd_index_queue_request(vhd_index_t *index, td_request_t treq) +{ + vhd_index_block_t *block; + vhd_index_request_t *req; + + req = vhd_index_allocate_request(index); + if (!req) + 
return -EBUSY; + + req->treq = treq; + + block = vhd_index_get_block(index, treq.sec / index->vhdi.spb); + ASSERT(block && td_flag_test(block->state, VHD_INDEX_BLOCK_READ_PENDING)); + + TAILQ_INSERT_TAIL(&block->queue, req, next); + return 0; +} + +static void +vhd_index_queue_read(td_driver_t *driver, td_request_t treq) +{ + vhd_index_t *index; + + index = (vhd_index_t *)driver->data; + + while (treq.secs) { + int err; + td_request_t clone; + + err = 0; + clone = treq; + + switch (vhd_index_read_cache(index, clone.sec)) { + case -EINVAL: + err = -EINVAL; + goto fail; + + case VHD_INDEX_BAT_CLEAR: + clone.secs = MIN(clone.secs, index->vhdi.spb - (clone.sec % index->vhdi.spb)); + td_forward_request(clone); + break; + + case VHD_INDEX_BIT_CLEAR: + clone.secs = vhd_index_read_cache_span(index, clone.sec, clone.secs, 0); + td_forward_request(clone); + break; + + case VHD_INDEX_BIT_SET: + clone.secs = vhd_index_read_cache_span(index, clone.sec, clone.secs, 1); + err = vhd_index_schedule_data_read(index, clone); + if (err) + goto fail; + break; + + case VHD_INDEX_CACHE_MISS: + err = vhd_index_schedule_meta_read(index, clone.sec / index->vhdi.spb); + if (err) + goto fail; + + clone.secs = MIN(clone.secs, index->vhdi.spb - (clone.sec % index->vhdi.spb)); + vhd_index_queue_request(index, clone); + break; + + case VHD_INDEX_META_READ_PENDING: + clone.secs = MIN(clone.secs, index->vhdi.spb - (clone.sec % index->vhdi.spb)); + err = vhd_index_queue_request(index, clone); + if (err) + goto fail; + break; + } + + treq.sec += clone.secs; + treq.secs -= clone.secs; + treq.buf += vhd_sectors_to_bytes(clone.secs); + continue; + + fail: + clone.secs = treq.secs; + td_complete_request(clone, err); + break; + } +} + +static void +vhd_index_queue_write(td_driver_t *driver __attribute__((unused)), + td_request_t treq) +{ + td_complete_request(treq, -EPERM); +} + +static inline void +vhd_index_signal_completion(vhd_index_t *index, + vhd_index_request_t *req, int err, + struct 
tqh_vhd_index_request *head) +{ + td_complete_request(req->treq, err); + vhd_index_put_file_ref(req->file); + vhd_index_free_request(index, req, head); +} + +static void +vhd_index_complete_meta_read(void *arg, + struct tiocb *tiocb __attribute__((unused)), int err) +{ + int i; + uint32_t blk; + td_request_t treq; + vhd_index_t *index; + vhd_index_block_t *block; + vhd_index_request_t *req, *r, *tmp; + + req = (vhd_index_request_t *)arg; + index = req->index; + + blk = req->treq.sec / index->vhdi.spb; + block = vhd_index_get_block(index, blk); + ASSERT(block && td_flag_test(block->state, VHD_INDEX_BLOCK_READ_PENDING)); + td_flag_clear(block->state, VHD_INDEX_BLOCK_READ_PENDING); + + if (err) { + memset(block->vhdi_block.table, 0, block->table_size); + vhd_index_block_for_each_request(block, r, tmp) + vhd_index_signal_completion(index, r, err, &block->queue); + return; + } + + for (i = 0; i < block->vhdi_block.entries; i++) + vhdi_entry_in(block->vhdi_block.table + i); + + td_flag_set(block->state, VHD_INDEX_BLOCK_VALID); + + vhd_index_block_for_each_request(block, r, tmp) { + treq = r->treq; + vhd_index_free_request(index, r, &block->queue); + vhd_index_queue_read(index->driver, treq); + } +} + +static void +vhd_index_complete_data_read(void *arg, + struct tiocb *tiocb __attribute__((unused)), int err) +{ + vhd_index_t *index; + vhd_index_request_t *req; + + req = (vhd_index_request_t *)arg; + index = req->index; + + /* FIXME Can''t tell for sure if req belongs to the queue */ + vhd_index_signal_completion(index, req, err, NULL); +} + +static int +vhd_index_get_parent_id(td_driver_t *driver __attribute__((unused)), + td_disk_id_t *id __attribute__((unused))) +{ + return -EINVAL; +} + +static int +vhd_index_validate_parent(td_driver_t *driver __attribute__((unused)), + td_driver_t *parent __attribute__((unused)), + td_flag_t flags __attribute__((unused))) +{ + return -EINVAL; +} + +static void +vhd_index_debug(td_driver_t *driver) +{ + int i; + vhd_index_t *index; + 
+ index = (vhd_index_t *)driver->data; + + WARN("VHD INDEX %s\n", index->name); + WARN("FILES:\n"); + for (i = 0; i < index->files.entries; i++) { + int j, fd, refcnt; + + fd = -1; + refcnt = 0; + + for (j = 0; j < VHD_INDEX_FILE_POOL_SIZE; j++) + if (index->fds[j].fid == index->files.table[i].file_id) { + fd = index->fds[j].fd; + refcnt = index->fds[j].refcnt; + } + + WARN("%s %u %d %d\n", + index->files.table[i].path, + index->files.table[i].file_id, + fd, refcnt); + } + + WARN("REQUESTS:\n"); + for (i = 0; i < VHD_INDEX_REQUESTS; i++) { + vhd_index_request_t *req; + + req = index->requests_list + i; + + if (!req->index) + continue; + + WARN("%d: buf: %p, sec: 0x%08"PRIx64", secs: 0x%04x, " + "fid: %u, off: 0x%016"PRIx64"\n", i, req->treq.buf, + req->treq.sec, req->treq.secs, req->file->fid, req->off); + } + + WARN("BLOCKS:\n"); + for (i = 0; i < VHD_INDEX_CACHE_SIZE; i++) { + int queued; + vhd_index_block_t *block; + vhd_index_request_t *req, *tmp; + + queued = 0; + block = index->cache[i]; + + if (!block) + continue; + + vhd_index_block_for_each_request(block, req, tmp) + ++queued; + + WARN("%d: blk: 0x%08"PRIx64", state: 0x%08x, queued: %d\n", + i, block->blk, block->state, queued); + } +} + +struct tap_disk tapdisk_vhd_index = { + .disk_type = "tapdisk_vhd_index", + .flags = 0, + .private_data_size = sizeof(vhd_index_t), + .td_open = vhd_index_open, + .td_close = vhd_index_close, + .td_queue_read = vhd_index_queue_read, + .td_queue_write = vhd_index_queue_write, + .td_get_parent_id = vhd_index_get_parent_id, + .td_validate_parent = vhd_index_validate_parent, + .td_debug = vhd_index_debug, +}; diff --git a/tools/blktap3/drivers/io-optimize.h b/tools/blktap3/drivers/io-optimize.h --- a/tools/blktap3/drivers/io-optimize.h +++ b/tools/blktap3/drivers/io-optimize.h @@ -2,28 +2,7 @@ * Copyright (c) 2008, XenSource Inc. * All rights reserved. 
* - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of XenSource Inc. nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * FIXME License missing from blktap2.5. */ #ifndef __IO_OPTIMIZE_H__ diff --git a/tools/blktap3/drivers/profile.h b/tools/blktap3/drivers/profile.h --- a/tools/blktap3/drivers/profile.h +++ b/tools/blktap3/drivers/profile.h @@ -2,28 +2,7 @@ * Copyright (c) 2008, XenSource Inc. * All rights reserved. 
* - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of XenSource Inc. nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * FIXME blktap2.5 has no license */ #ifndef __TAP_PROFILE_H__ diff --git a/tools/blktap3/drivers/tapdisk-blktap.h b/tools/blktap3/drivers/tapdisk-blktap.h deleted file mode 100644 --- a/tools/blktap3/drivers/tapdisk-blktap.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright (c) 2010, Citrix Systems, Inc. - * - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of XenSource Inc. nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER - * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef _TAPDISK_BLKTAP_H_ -#define _TAPDISK_BLKTAP_H_ - -typedef struct td_blktap td_blktap_t; -typedef struct td_blktap_req td_blktap_req_t; - -#include "blktap3.h" -#include "tapdisk-vbd.h" - -#if 0 -struct td_blktap_stats { - struct { - unsigned long long in; - unsigned long long out; - } reqs; - struct { - unsigned long long in; - unsigned long long out; - } kicks; -}; -#endif - -struct td_blktap { - int minor; - //td_vbd_t *vbd; - -#if 0 - int fd; -#endif - -#if 0 - void *vma; - size_t vma_size; - - struct blktap_sring *sring; - unsigned int req_cons; - unsigned int rsp_prod_pvt; -#endif - -#if 0 - int event_id; - void *vstart; - - int n_reqs; - td_blktap_req_t *reqs; - int n_reqs_free; - td_blktap_req_t **reqs_free; -#endif - - //TAILQ_ENTRY(td_blktap) entry; - - //struct td_blktap_stats stats; -}; - -#endif /* _TAPDISK_BLKTAP_H_ */ diff --git a/tools/blktap3/drivers/tapdisk-control.c b/tools/blktap3/drivers/tapdisk-control.c --- a/tools/blktap3/drivers/tapdisk-control.c +++ b/tools/blktap3/drivers/tapdisk-control.c @@ -49,7 +49,7 @@ #include "tapdisk-disktype.h" #include "tapdisk-stats.h" #include "tapdisk-control.h" -#include "sring/td-blkif.h" +#include "tapdisk-nbdserver.h" #define TD_CTL_MAX_CONNECTIONS 10 #define TD_CTL_SOCK_BACKLOG 32 @@ -57,10 +57,11 @@ #define TD_CTL_SEND_TIMEOUT 10 #define TD_CTL_SEND_BUFSZ ((size_t)4096) -#define DBG(_f, _a...) tlog_syslog(LOG_DEBUG, "%s:%d " _f, \ +#define DBG(_f, _a...) tlog_syslog(TLOG_DBG, "%s:%d " _f, \ __FILE__, __LINE__, ##_a) #define ERR(err, _f, _a...) tlog_error(err, "%s:%d " _f, __FILE__, \ __LINE__, ##_a) +#define INFO(_f, _a...) 
tlog_syslog(TLOG_INFO, "control: " _f, ##_a) #define ASSERT(_p) \ if (!(_p)) { \ @@ -96,9 +97,8 @@ struct tapdisk_ctl_conn { struct tapdisk_control_info *info; }; -#define TAPDISK_MSG_REENTER (1<<0) /* non-blocking, idempotent */ -#define TAPDISK_MSG_VERBOSE (1<<1) /* tell syslog about it */ -#define TAPDISK_MSG_VERBOSE_ERROR (1<<2) /* tell syslog about it, with errors */ +#define TAPDISK_MSG_REENTER (1<<0) /* non-blocking, idempotent */ +#define TAPDISK_MSG_VERBOSE (1<<1) /* tell syslog about it */ struct tapdisk_control_info { int (*handler) (struct tapdisk_ctl_conn *, tapdisk_message_t *, @@ -128,7 +128,7 @@ static inline size_t page_align(size_t s static void tapdisk_ctl_conn_uninit(struct tapdisk_ctl_conn *conn) { if (conn->out.buf) { - munmap(conn->out.buf, conn->out.bufsz); + free(conn->out.buf); conn->out.buf = NULL; } } @@ -136,22 +136,18 @@ static void tapdisk_ctl_conn_uninit(stru static int tapdisk_ctl_conn_init(struct tapdisk_ctl_conn *conn, size_t bufsz) { - int prot, flags, err; + int err; memset(conn, 0, sizeof(*conn)); conn->out.event_id = -1; conn->in.event_id = -1; - prot = PROT_READ | PROT_WRITE; - flags = MAP_ANONYMOUS | MAP_PRIVATE; - - conn->out.buf = mmap(NULL, bufsz, prot, flags, -1, 0); - if (conn->out.buf == MAP_FAILED) { - conn->out.buf = NULL; - err = -ENOMEM; - goto fail; - } - conn->out.bufsz = page_align(bufsz); + conn->out.buf = malloc(bufsz); + if (!conn->out.buf) { + err = -ENOMEM; + goto fail; + } + conn->out.bufsz = page_align(bufsz); return 0; @@ -264,8 +260,15 @@ static void tapdisk_ctl_conn_drain(struc fd_set wfds; int n, mode; - ASSERT(conn->out.done); - ASSERT(conn->fd >= 0); + if (!conn->out.done) { + /* we accepted this connection but haven''t received the message + * body yet. Since this tapdisk is on its way out, just drop + * the connection. 
*/ + tapdisk_ctl_conn_close(conn); + return; + } + + ASSERT(conn->fd >= 0); while (tapdisk_ctl_conn_connected(conn)) { FD_ZERO(&wfds); @@ -284,7 +287,6 @@ static void tapdisk_ctl_conn_drain(struc } } - struct tapdisk_ctl_conn *tapdisk_ctl_conn_open(int fd) { struct tapdisk_ctl_conn *conn; @@ -301,9 +303,10 @@ struct tapdisk_ctl_conn *tapdisk_ctl_con if (conn->out.event_id < 0) return NULL; - conn->fd = fd; - conn->out.prod = conn->out.buf; - conn->out.cons = conn->out.buf; + conn->fd = fd; + conn->out.prod = conn->out.buf; + conn->out.cons = conn->out.buf; + conn->out.done = 0; tapdisk_ctl_conn_mask_out(conn); @@ -471,7 +474,7 @@ tapdisk_control_write_message(struct tap { size_t size = sizeof(*message), count; - if (conn->info->flags & TAPDISK_MSG_VERBOSE) + if (conn->info && conn->info->flags & TAPDISK_MSG_VERBOSE) DBG("sending ''%s'' message\n", tapdisk_message_name(message->type)); count = tapdisk_ctl_conn_write(conn, message, size); @@ -612,9 +615,6 @@ tapdisk_control_open_image( goto out; } - /* TODO Add after everything has been initialised? */ - tapdisk_server_add_vbd(vbd); - /* TODO check for unsupported flags */ flags = 0; if (request->u.params.flags & TAPDISK_MESSAGE_FLAG_RDONLY) @@ -655,6 +655,18 @@ tapdisk_control_open_image( goto fail_close; } + /* + * For now, let''s do this automatically on all ''open'' calls In the + * future, we''ll probably want a separate call to start the NBD server + */ + err = tapdisk_vbd_start_nbdserver(vbd); + if (err) { + EPRINTF("failed to start nbdserver: %d\n",err); + goto fail_close; + } + + tapdisk_server_add_vbd(vbd); + err = 0; out: @@ -663,9 +675,7 @@ out: response->u.image.sector_size = info.sector_size; response->u.image.info = info.info; response->type = TAPDISK_MESSAGE_OPEN_RSP; - } else - if (vbd) - tapdisk_server_remove_vbd(vbd); + } return err; @@ -713,9 +723,22 @@ tapdisk_control_close_image(struct tapdi * I assume we have disconnected from the ring before? If yes, then * make sure we check this. 
*/ + if (td_flag_test(vbd->state, TD_VBD_PAUSED)) + EPRINTF("warning: closing paused VBD %s", vbd->name); + + if(vbd->nbdserver) { + tapdisk_nbdserver_pause(vbd->nbdserver); + } + + /* FIXME This was executed if tapdisk_blktap_remove_device returned ENOTTY */ while (!TAILQ_EMPTY(&vbd->pending_requests)) tapdisk_server_iterate(); + if (vbd->nbdserver) { + tapdisk_nbdserver_free(vbd->nbdserver); + vbd->nbdserver = NULL; + } + tapdisk_vbd_close_vdi(vbd); /* NB. vbd->name free should probably belong into close_vdi, @@ -725,6 +748,7 @@ tapdisk_control_close_image(struct tapdi vbd->name = NULL; tapdisk_server_remove_vbd(vbd); + /* FIXME free vbd? */ out: if (!err) { @@ -747,24 +771,23 @@ tapdisk_control_pause_vbd(struct tapdisk assert(request); assert(response); - len = strnlen(request->u.string.text, TAPDISK_MESSAGE_STRING_LENGTH); + len = strnlen(request->u.params.path, TAPDISK_MESSAGE_MAX_PATH_LENGTH); /* TODO boilerplate */ if (len < 1) { err = -EINVAL; goto out; } - if (len >= TAPDISK_MESSAGE_STRING_LENGTH) { + if (len >= TAPDISK_MESSAGE_MAX_PATH_LENGTH) { err = -ENAMETOOLONG; goto out; } response->type = TAPDISK_MESSAGE_PAUSE_RSP; - /* TODO Need to fix this in control/tap-ctl-pause.c */ - vbd = tapdisk_server_get_vbd(request->u.string.text); + vbd = tapdisk_server_get_vbd(request->u.params.path); if (!vbd) { - err = -EINVAL; + err = -ENODEV; goto out; } @@ -798,30 +821,44 @@ tapdisk_control_resume_vbd( assert(request); assert(response); - len = strnlen(request->u.string.text, TAPDISK_MESSAGE_STRING_LENGTH); + len = strnlen(request->u.resume.params1, TAPDISK_MESSAGE_MAX_PATH_LENGTH); /* TODO boilerplate */ if (len < 1) { err = -EINVAL; goto out; } - if (len >= TAPDISK_MESSAGE_STRING_LENGTH) { + if (len >= TAPDISK_MESSAGE_MAX_PATH_LENGTH) { err = -ENAMETOOLONG; goto out; } + /* TODO validate secondary */ + response->type = TAPDISK_MESSAGE_RESUME_RSP; - /* TODO Need to fix this in control/tap-ctl-pause.c */ - vbd = tapdisk_server_get_vbd(request->u.string.text); + 
INFO("Resuming: flags=0x%08x secondary=%p\n", + request->u.resume.flags, request->u.resume.secondary); + + vbd = tapdisk_server_get_vbd(request->u.resume.params1); if (!vbd) { - err = -EINVAL; + err = -ENODEV; goto out; } - /* TODO What''s this path? */ - if (request->u.params.path[0]) - desc = request->u.params.path; + if (request->u.resume.flags & TAPDISK_MESSAGE_FLAG_SECONDARY) { + char *name = strdup(request->u.resume.secondary); + if (!name) { + err = -errno; + goto out; + } + INFO("Resuming with secondary ''%s''\n", name); + vbd->secondary_name = name; + vbd->flags |= TD_OPEN_SECONDARY; + } + + if (request->u.resume.params2[0]) + desc = request->u.resume.params2; err = tapdisk_vbd_resume(vbd, desc); out: @@ -837,27 +874,33 @@ tapdisk_control_stats(struct tapdisk_ctl { td_stats_t _st, *st = &_st; td_vbd_t *vbd; - size_t rv = 0; - int err = 0; - int len; + size_t rv; + void *buf; + int new_size; + size_t len; assert(request); assert(response); - len = strnlen(request->u.string.text, TAPDISK_MESSAGE_STRING_LENGTH); + buf = malloc(TD_CTL_SEND_BUFSZ); + if (!buf) { + rv = -ENOMEM; + goto out; + } - tapdisk_stats_init(st, - conn->out.buf + sizeof(*response), - conn->out.bufsz - sizeof(*response)); + len = strnlen(request->u.params.path, TAPDISK_MESSAGE_MAX_PATH_LENGTH); + + tapdisk_stats_init(st, buf, TD_CTL_SEND_BUFSZ); + if (len > 1) { - if (len >= TAPDISK_MESSAGE_STRING_LENGTH) { - err = -ENAMETOOLONG; + if (len >= TAPDISK_MESSAGE_MAX_PATH_LENGTH) { + rv = -ENAMETOOLONG; goto out; } - vbd = tapdisk_server_get_vbd(request->u.string.text); + vbd = tapdisk_server_get_vbd(request->u.params.path); if (!vbd) { - err = -ENODEV; + rv = -ENODEV; goto out; } @@ -875,8 +918,27 @@ tapdisk_control_stats(struct tapdisk_ctl } rv = tapdisk_stats_length(st); + + if (rv > conn->out.bufsz - sizeof(response)) { + ASSERT(conn->out.prod == conn->out.buf); + ASSERT(conn->out.cons == conn->out.buf); + new_size = rv + sizeof(response); + buf = realloc(conn->out.buf, new_size); + if 
(!buf) { + rv = -ENOMEM; + goto out; + } + conn->out.buf = buf; + conn->out.bufsz = new_size; + conn->out.prod = buf; + conn->out.cons = buf; + } + if (rv > 0) { + memcpy(conn->out.buf + sizeof(response), st->buf, rv); + } out: - if (!err) { + free(st->buf); + if (!rv) { response->type = TAPDISK_MESSAGE_STATS_RSP; response->u.info.length = rv; } @@ -885,7 +947,7 @@ out: if (rv > 0) conn->out.prod += rv; - return err; + return rv; } /** @@ -1063,21 +1125,16 @@ struct tapdisk_control_info message_info [TAPDISK_MESSAGE_XENBLKIF_CONNECT] = { .handler tapdisk_control_xenblkif_connect, - .flags - TAPDISK_MSG_VERBOSE | - TAPDISK_MSG_VERBOSE_ERROR, + .flags = TAPDISK_MSG_VERBOSE }, [TAPDISK_MESSAGE_XENBLKIF_DISCONNECT] = { .handler tapdisk_control_xenblkif_disconnect, .flags = TAPDISK_MSG_VERBOSE - || TAPDISK_MSG_VERBOSE_ERROR, }, [TAPDISK_MESSAGE_DISK_INFO] = { .handler = tapdisk_control_disk_info, - .flags - TAPDISK_MSG_VERBOSE | - TAPDISK_MSG_VERBOSE_ERROR, + .flags = TAPDISK_MSG_VERBOSE }, }; @@ -1089,15 +1146,13 @@ static void tapdisk_control_handle_reque int err, excl; tapdisk_message_t message, response; struct tapdisk_ctl_conn *conn = private; - struct tapdisk_control_info *info; + + conn->info = NULL; err = tapdisk_control_read_message(conn->fd, &message, 2); if (err) goto close; - if (conn->in.busy) - goto busy; - err = tapdisk_control_validate_request(&message); if (err) goto invalid; @@ -1105,16 +1160,19 @@ static void tapdisk_control_handle_reque if (message.type > TAPDISK_MESSAGE_EXIT) goto invalid; - info = &message_infos[message.type]; + conn->info = &message_infos[message.type]; - if (!info->handler) + if (!conn->info->handler) goto invalid; - if (info->flags & TAPDISK_MSG_VERBOSE) + if (conn->info->flags & TAPDISK_MSG_VERBOSE) DBG("received ''%s'' message\n", tapdisk_message_name(message.type)); - excl = !(info->flags & TAPDISK_MSG_REENTER); + if (conn->in.busy) + goto busy; + + excl = !(conn->info->flags & TAPDISK_MSG_REENTER); if (excl) { if 
(td_control.busy) goto busy; @@ -1122,11 +1180,10 @@ static void tapdisk_control_handle_reque td_control.busy = 1; } conn->in.busy = 1; - conn->info = info; memset(&response, 0, sizeof(response)); - err = info->handler(conn, &message, &response); + err = conn->info->handler(conn, &message, &response); if (err) { response.type = TAPDISK_MESSAGE_ERROR; response.u.response.error = -err; @@ -1163,7 +1220,8 @@ static void tapdisk_control_handle_reque goto error; } -static void tapdisk_control_accept(event_id_t id __attribute__((unused)), +static void +tapdisk_control_accept(event_id_t id __attribute__((unused)), char mode __attribute__((unused)), void *private __attribute__((unused))) { diff --git a/tools/blktap3/drivers/tapdisk-diff.c b/tools/blktap3/drivers/tapdisk-diff.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/tapdisk-diff.c @@ -0,0 +1,814 @@ +/* + * Copyright (c) 2009, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include <unistd.h> + +#include "list.h" +#include "scheduler.h" +#include "tapdisk-vbd.h" +#include "tapdisk-server.h" +#include "tapdisk-disktype.h" +#include "libvhd.h" + +#define POLL_READ 0 +#define POLL_WRITE 1 + +#define SPB_SHIFT (VHD_BLOCK_SHIFT - SECTOR_SHIFT) + +/* + * we have to use half the max number of requests because we're using the same + * tapdisk server for both streams and all the parents will be shared. If we + * issue more than MAX_REQUESTS/2 requests, the vhd_state will run out of + * vhd_request's and return EBUSY, which we don't handle here. However, even + * with MAX_REQUESTS/2 we can still run out of vhd_request's because of + * splitting: if some sectors spanned by a segment are in a parent, a segment + * could be split into at most N/2 vhd_request's, where N is the number of + * sectors per segment. Therefore, if we use 11 segments, we need to divide + * MAX_REQUESTS by 11/2=6 on top of that. If we don't, we'd have to handle + * EBUSY by retrying here.
+ */ +#define MAX_SEGMENTS 8 +#define MAX_STREAM_REQUESTS (MAX_REQUESTS / 2 / (MAX_SEGMENTS / 2)) + +struct tapdisk_stream_poll { + int pipe[2]; + int set; +}; + +struct tapdisk_stream_request { + uint64_t sec; + uint32_t secs; + uint64_t seqno; + blkif_request_t blkif_req; + struct list_head next; +}; + +struct tapdisk_stream { + td_vbd_t *vbd; + + unsigned int id; + + int err; + + uint64_t cur; + uint64_t start; + uint64_t end; + + uint64_t started; + uint64_t completed; + + struct tapdisk_stream_poll poll; + event_id_t enqueue_event_id; + + struct list_head free_list; + struct list_head pending_list; + struct list_head completed_list; + + struct tapdisk_stream_request requests[MAX_STREAM_REQUESTS]; +}; + +static unsigned int tapdisk_stream_count; + +static void tapdisk_stream_close_image(struct tapdisk_stream *); + +static char *program; +static struct tapdisk_stream stream1, stream2; +static vhd_context_t vhd1; + +static void +usage(FILE *stream) +{ + printf("usage: %s <-n type:/path/to/image> <-m type:/path/to/image>\n", + program); +} + +static int +open_vhd(const char *path, vhd_context_t *vhd) +{ + int err; + + err = vhd_open(vhd, path, VHD_OPEN_RDONLY); + if (err) { + printf("error opening %s: %d\n", path, err); + return err; + } + + err = vhd_get_bat(vhd); + if (err) + { + printf("error reading BAT for %s: %d\n", path, err); + vhd_close(vhd); + return err; + } + + return 0; +} + +static inline void +tapdisk_stream_poll_initialize(struct tapdisk_stream_poll *p) +{ + p->set = 0; + p->pipe[POLL_READ] = p->pipe[POLL_WRITE] = -1; +} + +static int +tapdisk_stream_poll_open(struct tapdisk_stream_poll *p) +{ + int err; + + tapdisk_stream_poll_initialize(p); + + err = pipe(p->pipe); + if (err) + return -errno; + + err = fcntl(p->pipe[POLL_READ], F_SETFL, O_NONBLOCK); + if (err) + goto out; + + err = fcntl(p->pipe[POLL_WRITE], F_SETFL, O_NONBLOCK); + if (err) + goto out; + + return 0; + +out: + close(p->pipe[POLL_READ]); + close(p->pipe[POLL_WRITE]); + 
tapdisk_stream_poll_initialize(p); + return -errno; +} + +static void +tapdisk_stream_poll_close(struct tapdisk_stream_poll *p) +{ + if (p->pipe[POLL_READ] != -1) + close(p->pipe[POLL_READ]); + if (p->pipe[POLL_WRITE] != -1) + close(p->pipe[POLL_WRITE]); + tapdisk_stream_poll_initialize(p); +} + +static inline void +tapdisk_stream_poll_clear(struct tapdisk_stream_poll *p) +{ + int gcc, dummy; + + gcc = read(p->pipe[POLL_READ], &dummy, sizeof(dummy)); + p->set = 0; +} + +static inline void +tapdisk_stream_poll_set(struct tapdisk_stream_poll *p) +{ + int dummy = 0; + + if (!p->set) { + int gcc = write(p->pipe[POLL_WRITE], &dummy, sizeof(dummy)); + p->set = 1; + } +} + +static inline int +tapdisk_stream_stop(struct tapdisk_stream *s) +{ + return ((s->cur == s->end || s->err) && + list_empty(&s->pending_list) && + list_empty(&s->completed_list)); +} + +static inline void +tapdisk_stream_initialize_request(struct tapdisk_stream_request *req) +{ + memset(req, 0, sizeof(*req)); + INIT_LIST_HEAD(&req->next); +} + +static inline int +tapdisk_stream_request_idx(struct tapdisk_stream *s, + struct tapdisk_stream_request *req) +{ + return (req - s->requests); +} + +static inline struct tapdisk_stream_request * +tapdisk_stream_get_request(struct tapdisk_stream *s) +{ + struct tapdisk_stream_request *req; + + if (list_empty(&s->free_list)) + return NULL; + + req = list_entry(s->free_list.next, + struct tapdisk_stream_request, next); + + list_del_init(&req->next); + tapdisk_stream_initialize_request(req); + + return req; +} + +static inline void +tapdisk_stream_queue_completed(struct tapdisk_stream *s, + struct tapdisk_stream_request *sreq) +{ + struct tapdisk_stream_request *itr; + + list_for_each_entry(itr, &s->completed_list, next) + if (sreq->seqno < itr->seqno) { + list_add_tail(&sreq->next, &itr->next); + return; + } + + list_add_tail(&sreq->next, &s->completed_list); +} + +static int +tapdisk_result_compare(struct tapdisk_stream_request *sreq1, + struct 
tapdisk_stream_request *sreq2) +{ + unsigned long idx1, idx2; + char *buf1, *buf2; + int result; + + assert(sreq1->seqno == sreq2->seqno); + assert(sreq1->secs == sreq2->secs); + idx1 = (unsigned long)tapdisk_stream_request_idx(&stream1, + sreq1); + idx2 = (unsigned long)tapdisk_stream_request_idx(&stream2, + sreq2); + buf1 = (char *)MMAP_VADDR(stream1.vbd->ring.vstart, idx1, 0); + buf2 = (char *)MMAP_VADDR(stream2.vbd->ring.vstart, idx2, 0); + + result = memcmp(buf1, buf2, sreq1->secs << SECTOR_SHIFT); + return result; +} + +static int +tapdisk_stream_process_data(void) +{ + struct tapdisk_stream_request *sreq1, *sreq2, *tmp1, *tmp2; + int advance_both; + int result = 0; + + sreq1 = list_entry(stream1.completed_list.next, + struct tapdisk_stream_request, next); + sreq2 = list_entry(stream2.completed_list.next, + struct tapdisk_stream_request, next); + tmp1 = list_entry(sreq1->next.next, + struct tapdisk_stream_request, next); + tmp2 = list_entry(sreq2->next.next, + struct tapdisk_stream_request, next); + while (result == 0 && + &sreq1->next != &stream1.completed_list && + &sreq2->next != &stream2.completed_list) { + //printf("checking: %llu|%llu\n", sreq1->seqno, sreq2->seqno); + advance_both = 1; + if (sreq1->seqno < sreq2->seqno) { + advance_both = 0; + goto advance1; + } + if (sreq1->seqno > sreq2->seqno) + goto advance2; + + result = tapdisk_result_compare(sreq1, sreq2); + + stream1.completed++; + stream2.completed++; + + list_del_init(&sreq1->next); + list_add_tail(&sreq1->next, &stream1.free_list); + list_del_init(&sreq2->next); + list_add_tail(&sreq2->next, &stream2.free_list); + +advance1: + sreq1 = tmp1; + tmp1 = list_entry(tmp1->next.next, + struct tapdisk_stream_request, next); + if (!advance_both) + continue; +advance2: + sreq2 = tmp2; + tmp2 = list_entry(tmp2->next.next, + struct tapdisk_stream_request, next); + } + + return result; +} + +static void +tapdisk_stream_dequeue(void *arg, blkif_response_t *rsp) +{ + struct tapdisk_stream *s = (struct 
tapdisk_stream *)arg; + struct tapdisk_stream_request *sreq = s->requests + rsp->id; + + list_del_init(&sreq->next); + + if (rsp->status == BLKIF_RSP_OKAY) + tapdisk_stream_queue_completed(s, sreq); + else { + s->err = EIO; + list_add_tail(&sreq->next, &s->free_list); + fprintf(stderr, "error reading sector %llu (stream %d)\n", + sreq->sec, (s == &stream2) + 1); + } + + if (tapdisk_stream_process_data()) { + fprintf(stderr, "mismatch at sector %llu\n", + sreq->sec); + stream1.err = EINVAL; + stream2.err = EINVAL; + } + + tapdisk_stream_poll_set(&stream1.poll); + tapdisk_stream_poll_set(&stream2.poll); +} + +static inline int +tapdisk_stream_enqueue_copy(struct tapdisk_stream *s, + struct tapdisk_stream_request *r) +{ + td_vbd_t *vbd; + blkif_request_t *breq; + td_vbd_request_t *vreq; + struct tapdisk_stream_request *sreq; + int idx; + + vbd = stream2.vbd; + sreq = tapdisk_stream_get_request(s); + if (!sreq) + return 1; + + idx = tapdisk_stream_request_idx(s, sreq); + + sreq->sec = r->sec; + sreq->secs = r->secs; + sreq->seqno = r->seqno; + + breq = &sreq->blkif_req; + breq->id = idx; + breq->nr_segments = r->blkif_req.nr_segments; + breq->sector_number = r->blkif_req.sector_number; + breq->operation = BLKIF_OP_READ; + + for (int i = 0; i < r->blkif_req.nr_segments; i++) { + struct blkif_request_segment *seg = breq->seg + i; + seg->first_sect = r->blkif_req.seg[i].first_sect; + seg->last_sect = r->blkif_req.seg[i].last_sect; + } + s->cur += sreq->secs; + + vreq = vbd->request_list + idx; + assert(list_empty(&vreq->next)); + assert(vreq->secs_pending == 0); + + memcpy(&vreq->req, breq, sizeof(*breq)); + s->started++; + vbd->received++; + vreq->vbd = vbd; + + tapdisk_vbd_move_request(vreq, &vbd->new_requests); + list_add_tail(&sreq->next, &s->pending_list); + + return 0; +} + +static void +tapdisk_stream_enqueue1(void) +{ + td_vbd_t *vbd; + int i, idx, psize, blk; + struct tapdisk_stream *s = &stream1; + + vbd = s->vbd; + psize = getpagesize(); + + while (s->cur < 
s->end && !s->err) { + blkif_request_t *breq; + td_vbd_request_t *vreq; + struct tapdisk_stream_request *sreq; + + /* skip any blocks that are not present in this image */ + blk = s->cur >> SPB_SHIFT; + while (s->cur < s->end && vhd1.bat.bat[blk] == DD_BLK_UNUSED) { + //printf("skipping block %d\n", blk); + blk++; + s->cur = blk << SPB_SHIFT; + } + + if (s->cur >= s->end) + break; + + sreq = tapdisk_stream_get_request(s); + if (!sreq) + break; + + idx = tapdisk_stream_request_idx(s, sreq); + + sreq->sec = s->cur; + sreq->secs = 0; + sreq->seqno = s->started++; + + breq = &sreq->blkif_req; + breq->id = idx; + breq->nr_segments = 0; + breq->sector_number = sreq->sec; + breq->operation = BLKIF_OP_READ; + + for (i = 0; i < MAX_SEGMENTS; i++) { + uint32_t secs; + struct blkif_request_segment *seg = breq->seg + i; + + secs = MIN(s->end - s->cur, psize >> SECTOR_SHIFT); + secs = MIN(((blk + 1) << SPB_SHIFT) - s->cur, secs); + if (!secs) + break; + + sreq->secs += secs; + s->cur += secs; + + seg->first_sect = 0; + seg->last_sect = secs - 1; + breq->nr_segments++; + } + + vreq = vbd->request_list + idx; + + assert(list_empty(&vreq->next)); + assert(vreq->secs_pending == 0); + + memcpy(&vreq->req, breq, sizeof(*breq)); + vbd->received++; + vreq->vbd = vbd; + + tapdisk_vbd_move_request(vreq, &vbd->new_requests); + list_add_tail(&sreq->next, &s->pending_list); + } + + tapdisk_vbd_issue_requests(vbd); +} + +static void +tapdisk_stream_enqueue2(void) +{ + td_vbd_t *vbd; + int i, blk; + struct tapdisk_stream_request *itr; + struct tapdisk_stream *s = &stream2; + + vbd = s->vbd; + + /* issue the same requests that we issued on stream1 */ + list_for_each_entry(itr, &stream1.completed_list, next) { + if (itr->sec < s->cur) + continue; + if (tapdisk_stream_enqueue_copy(s, itr)) + goto done; + } + + list_for_each_entry(itr, &stream1.pending_list, next) { + if (itr->sec < s->cur) + continue; + if (tapdisk_stream_enqueue_copy(s, itr)) + goto done; + } + + stream2.cur = stream1.cur; + 
+done: + tapdisk_vbd_issue_requests(vbd); +} + +static inline int +tapdisk_diff_done(void) +{ + return (tapdisk_stream_stop(&stream1) && tapdisk_stream_stop(&stream2)); +} + +static void +tapdisk_diff_stop(void) +{ + tapdisk_stream_close_image(&stream1); + tapdisk_stream_close_image(&stream2); +} + +static void +tapdisk_stream_enqueue(event_id_t id, char mode, void *arg) +{ + struct tapdisk_stream *s = (struct tapdisk_stream *)arg; + + tapdisk_stream_poll_clear(&s->poll); + + if (tapdisk_diff_done()) { + tapdisk_diff_stop(); + return; + } + + if (s == &stream1) + tapdisk_stream_enqueue1(); + else if (s == &stream2) + tapdisk_stream_enqueue2(); + else + assert(0); + + if (tapdisk_diff_done()) { + // we have to check again for the case when stream1 had no + // blocks at all + tapdisk_diff_stop(); + return; + } +} + +static int +tapdisk_stream_open_image(struct tapdisk_stream *s, const char *name) +{ + int err; + td_disk_info_t info; + + s->id = tapdisk_stream_count++; + + err = tapdisk_vbd_initialize(-1, -1, s->id); + if (err) + goto out; + + s->vbd = tapdisk_server_get_vbd(s->id); + if (!s->vbd) { + err = ENODEV; + goto out; + } + + tapdisk_vbd_set_callback(s->vbd, tapdisk_stream_dequeue, s); + + err = tapdisk_vbd_open_vdi(s->vbd, name, TD_OPEN_RDONLY, -1); + if (err) + goto out; + + err = tapdisk_vbd_get_disk_info(s->vbd, &info); + if (err) { + fprintf(stderr, "failed getting image size: %d\n", err); + return err; + } + + s->start = 0; + s->cur = s->start; + s->end = info.size; + + err = 0; + +out: + if (err) + fprintf(stderr, "failed to open image %s: %d\n", name, err); + return err; +} + +static void +tapdisk_stream_close_image(struct tapdisk_stream *s) +{ + td_vbd_t *vbd; + + vbd = tapdisk_server_get_vbd(s->id); + if (vbd) { + tapdisk_vbd_close_vdi(vbd); + tapdisk_server_remove_vbd(vbd); + free((void *)vbd->ring.vstart); + free(vbd->name); + free(vbd); + s->vbd = NULL; + } +} + +static int +tapdisk_stream_initialize_requests(struct tapdisk_stream *s) +{ + size_t 
size; + td_ring_t *ring; + int err, i, psize; + + ring = &s->vbd->ring; + psize = getpagesize(); + size = psize * BLKTAP_MMAP_REGION_SIZE; + + /* sneaky -- set up ring->vstart so tapdisk_vbd will use our buffers */ + err = posix_memalign((void **)&ring->vstart, psize, size); + if (err) { + fprintf(stderr, "failed to allocate buffers: %d\n", err); + ring->vstart = 0; + return err; + } + + for (i = 0; i < MAX_STREAM_REQUESTS; i++) { + struct tapdisk_stream_request *req = s->requests + i; + tapdisk_stream_initialize_request(req); + list_add_tail(&req->next, &s->free_list); + } + + return 0; +} + +static int +tapdisk_stream_register_enqueue_event(struct tapdisk_stream *s) +{ + int err; + struct tapdisk_stream_poll *p = &s->poll; + + err = tapdisk_stream_poll_open(p); + if (err) + goto out; + + err = tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, + p->pipe[POLL_READ], 0, + tapdisk_stream_enqueue, s); + if (err < 0) + goto out; + + s->enqueue_event_id = err; + err = 0; + +out: + if (err) + fprintf(stderr, "failed to register event: %d\n", err); + return err; +} + +static void +tapdisk_stream_unregister_enqueue_event(struct tapdisk_stream *s) +{ + if (s->enqueue_event_id) { + tapdisk_server_unregister_event(s->enqueue_event_id); + s->enqueue_event_id = 0; + } + tapdisk_stream_poll_close(&s->poll); +} + +static inline void +tapdisk_stream_initialize(struct tapdisk_stream *s) +{ + memset(s, 0, sizeof(*s)); + INIT_LIST_HEAD(&s->free_list); + INIT_LIST_HEAD(&s->pending_list); + INIT_LIST_HEAD(&s->completed_list); +} + +static int +tapdisk_stream_open(struct tapdisk_stream *s, const char *arg) +{ + int err; + + tapdisk_stream_initialize(s); + + err = tapdisk_stream_open_image(s, arg); + if (err) + return err; + + err = tapdisk_stream_initialize_requests(s); + if (err) + return err; + + err = tapdisk_stream_register_enqueue_event(s); + if (err) + return err; + + tapdisk_stream_enqueue(s->enqueue_event_id, + SCHEDULER_POLL_READ_FD, s); + + return 0; +} + +static void 
+tapdisk_stream_release(struct tapdisk_stream *s) +{ + tapdisk_stream_close_image(s); + tapdisk_stream_unregister_enqueue_event(s); +} + +static int +tapdisk_stream_run(struct tapdisk_stream *s) +{ + tapdisk_stream_enqueue(s->enqueue_event_id, SCHEDULER_POLL_READ_FD, s); + tapdisk_server_run(); + return s->err; +} + +int +main(int argc, char *argv[]) +{ + int c, err, type1; + const char *arg1 = NULL, *arg2 = NULL; + const disk_info_t *info; + const char *path1; + + err = 0; + + program = basename(argv[0]); + + while ((c = getopt(argc, argv, "n:m:h")) != -1) { + switch (c) { + case ''n'': + arg1 = optarg; + break; + case ''m'': + arg2 = optarg; + break; + case ''h'': + usage(stdout); + return 0; + default: + goto fail_usage; + } + } + + if (!arg1 || !arg2) + goto fail_usage; + + type1 = tapdisk_disktype_parse_params(arg1, &path1); + if (type1 < 0) + return type1; + + if (type1 != DISK_TYPE_VHD) { + printf("error: first VDI is not VHD\n"); + return EINVAL; + } + + err = open_vhd(path1, &vhd1); + if (err) + return err; + + tapdisk_start_logging("tapdisk-diff", "daemon"); + + err = tapdisk_server_initialize(NULL, NULL); + if (err) + goto out; + + err = tapdisk_stream_open(&stream1, arg1); + if (err) { + fprintf(stderr, "Failed to open %s: %s\n", + arg1, strerror(-err)); + goto out; + } + + err = tapdisk_stream_open(&stream2, arg2); + if (err) { + fprintf(stderr, "Failed to open %s: %s\n", + arg2, strerror(-err)); + goto out1; + } + + if (stream1.end != stream2.end) { + fprintf(stderr, "Image sizes differ: %"PRIu64" != %"PRIu64"\n", + stream1.end, stream2.end); + err = EINVAL; + goto out2; + } + + tapdisk_server_run(); + +out2: + tapdisk_stream_release(&stream2); +out1: + tapdisk_stream_release(&stream1); +out: + vhd_close(&vhd1); + tapdisk_stop_logging(); + + return err ? 
: stream1.err; + +fail_usage: + usage(stderr); + return 1; +} diff --git a/tools/blktap3/drivers/tapdisk-disktype.c b/tools/blktap3/drivers/tapdisk-disktype.c --- a/tools/blktap3/drivers/tapdisk-disktype.c +++ b/tools/blktap3/drivers/tapdisk-disktype.c @@ -126,6 +126,12 @@ static const disk_info_t valve_disk = { DISK_TYPE_FILTER, }; +static const disk_info_t nbd_disk = { + "nbd", + "export to a NBD server", + 0, +}; + const disk_info_t *tapdisk_disk_types[] = { [DISK_TYPE_AIO] = &aio_disk, [DISK_TYPE_SYNC] = &sync_disk, @@ -142,6 +148,7 @@ const disk_info_t *tapdisk_disk_types[] [DISK_TYPE_VALVE] = &valve_disk, [DISK_TYPE_LLPCACHE] = &llpcache_disk, [DISK_TYPE_LLECACHE] = &llecache_disk, + [DISK_TYPE_NBD] = &nbd_disk, 0, }; @@ -155,7 +162,6 @@ extern struct tap_disk tapdisk_sync; extern struct tap_disk tapdisk_vmdk; extern struct tap_disk tapdisk_vhdsync; #endif - extern struct tap_disk tapdisk_vhd; extern struct tap_disk tapdisk_ram; @@ -165,7 +171,7 @@ extern struct tap_disk tapdisk_ram; #if 0 extern struct tap_disk tapdisk_qcow; #endif - +extern struct tap_disk tapdisk_block_cache; extern struct tap_disk tapdisk_vhd_index; /* @@ -174,6 +180,11 @@ extern struct tap_disk tapdisk_vhd_index #if 0 extern struct tap_disk tapdisk_log; #endif +extern struct tap_disk tapdisk_lcache; +extern struct tap_disk tapdisk_llpcache; +extern struct tap_disk tapdisk_llecache; +extern struct tap_disk tapdisk_valve; +extern struct tap_disk tapdisk_nbd; const struct tap_disk * tapdisk_disk_driver_get(const enum disk_type dt) @@ -190,13 +201,7 @@ tapdisk_disk_driver_get(const enum disk_ [DISK_TYPE_VHDSYNC] = &tapdisk_vhdsync_disk #endif [DISK_TYPE_VHD] = &tapdisk_vhd, - - /* - * TODO Commeneted out to simplify the upstreaming process. - */ -#if 0 [DISK_TYPE_RAM] = &tapdisk_ram, -#endif /* * XXX Commented out in blktap2.5. 
@@ -205,13 +210,8 @@ tapdisk_disk_driver_get(const enum disk_ [DISK_TYPE_QCOW] = &tapdisk_qcow, #endif - /* - * TODO Commeneted out to simplify the upstreaming process. - */ -#if 0 [DISK_TYPE_BLOCK_CACHE] = &tapdisk_block_cache, [DISK_TYPE_VINDEX] = &tapdisk_vhd_index, -#endif /* * XXX Commented out in blktap2.5. @@ -220,16 +220,12 @@ tapdisk_disk_driver_get(const enum disk_ [DISK_TYPE_LOG] = &tapdisk_log, #endif - /* - * TODO Commeneted out to simplify the upstreaming process. - */ -#if 0 [DISK_TYPE_LCACHE] = &tapdisk_lcache, [DISK_TYPE_LLPCACHE] = &tapdisk_llpcache, [DISK_TYPE_LLECACHE] = &tapdisk_llecache, [DISK_TYPE_VALVE] = &tapdisk_valve, [DISK_TYPE_NBD] = &tapdisk_nbd, -#endif + 0 }; if (dt < 0 || dt > ARRAY_SIZE(tapdisk_disk_drivers)) diff --git a/tools/blktap3/drivers/tapdisk-disktype.h b/tools/blktap3/drivers/tapdisk-disktype.h --- a/tools/blktap3/drivers/tapdisk-disktype.h +++ b/tools/blktap3/drivers/tapdisk-disktype.h @@ -44,7 +44,8 @@ enum disk_type { DISK_TYPE_LCACHE, DISK_TYPE_LLECACHE, DISK_TYPE_LLPCACHE, - DISK_TYPE_VALVE}; + DISK_TYPE_VALVE, + DISK_TYPE_NBD}; #define DISK_TYPE_NAME_MAX 32 diff --git a/tools/blktap3/drivers/tapdisk-driver.c b/tools/blktap3/drivers/tapdisk-driver.c --- a/tools/blktap3/drivers/tapdisk-driver.c +++ b/tools/blktap3/drivers/tapdisk-driver.c @@ -40,7 +40,7 @@ tapdisk_driver_log_flush(td_driver_t * d td_loglimit_t *rl = &driver->loglimit; if (rl->dropped) { - tlog_syslog(LOG_WARNING, + tlog_syslog(TLOG_WARN, "%s: %s: %d messages suppressed", driver->name, __caller, rl->dropped); rl->dropped = 0; @@ -58,7 +58,7 @@ int tapdisk_driver_log_pass(td_driver_t } if (!dropping) - tlog_syslog(LOG_WARNING, + tlog_syslog(TLOG_WARN, "%s: %s: too many errors, dropped.", driver->name, __caller); diff --git a/tools/blktap3/drivers/tapdisk-fdreceiver.c b/tools/blktap3/drivers/tapdisk-fdreceiver.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/tapdisk-fdreceiver.c @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2012, Citrix 
Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <string.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <netdb.h> +#include <arpa/inet.h> +#include <sys/wait.h> +#include <sys/un.h> + +#include "tapdisk.h" +#include "tapdisk-fdreceiver.h" +#include "tapdisk-server.h" +#include "scheduler.h" + +#define UNIX_BUFFER_SIZE 16384 + +#define INFO(_f, _a...) 
tlog_syslog(TLOG_INFO, "nbd: " _f, ##_a) +#define ERROR(_f, _a...) tlog_syslog(TLOG_WARN, "nbd: " _f, ##_a) + +static void +td_fdreceiver_recv_fd(event_id_t id __attribute__((unused)), + char mode __attribute__((unused)), void *data) +{ + struct td_fdreceiver *fdreceiver = data; + int ret, cv_flags = 0, *fdp, fd = -1; + long numbytes; + char iobuf[UNIX_BUFFER_SIZE]; + char buf[CMSG_SPACE(sizeof(fd))]; + struct sockaddr_un unix_socket_name; + + struct msghdr msg; + struct iovec vec; + struct cmsghdr *cmsg; + + numbytes = UNIX_BUFFER_SIZE; + + bzero(iobuf, numbytes); + + msg.msg_name = &unix_socket_name; + msg.msg_namelen = sizeof(unix_socket_name); + vec.iov_base = iobuf; + vec.iov_len = numbytes; + msg.msg_iov = &vec; + + msg.msg_iovlen = 1; + + msg.msg_control = buf; + msg.msg_controllen = sizeof(buf); + + ret = recvmsg(fdreceiver->client_fd, &msg, cv_flags); + + if (ret == -1) { + ERROR("Failed to receive the message: %d", ret); + return; + } + + if (ret > 0 && msg.msg_controllen > 0) { + cmsg = CMSG_FIRSTHDR(&msg); + if (cmsg->cmsg_level == SOL_SOCKET && + (cmsg->cmsg_type == SCM_RIGHTS)) { + fdp = (int*)CMSG_DATA(cmsg); + fd = *fdp; + } else { + ERROR("Failed to recieve a file descriptor"); + } + } else { + fd = -1; + } + + if (ret < numbytes) + numbytes = ret; + + INFO("Recieved fd with message: %s", iobuf); + + /* + * We''re done with this connection, it was only transiently used to + * connect the client + */ + close(fdreceiver->client_fd); + fdreceiver->client_fd = -1; + + tapdisk_server_unregister_event(fdreceiver->client_event_id); + fdreceiver->client_event_id = -1; + + /* + * It is the responsibility of this callback function to arrange that + * the fd is eventually closed + */ + fdreceiver->callback(fd, iobuf, fdreceiver->callback_data); +} + +static void +td_fdreceiver_accept_fd(event_id_t id __attribute__((unused)), + char mode __attribute__((unused)), void *data) +{ + struct sockaddr_storage their_addr; + socklen_t sin_size = sizeof(their_addr); + 
struct td_fdreceiver *fdreceiver = data; + int new_fd; + + INFO("Unix domain socket is ready to accept"); + + new_fd = accept(fdreceiver->fd, + (struct sockaddr *)&their_addr, &sin_size); + + if (fdreceiver->client_fd != -1) { + ERROR("td_fdreceiver_accept_fd: can only cope with one connec" + "tion at once to the unix domain socket!"); + close(new_fd); + return; + } + + fdreceiver->client_fd = new_fd; + + fdreceiver->client_event_id + tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, + fdreceiver->client_fd, 0, + td_fdreceiver_recv_fd, + fdreceiver); + + if (fdreceiver->client_event_id < 0) { + ERROR("td_fdreceiver_accept_fd: failed to register event " + "(errno=%d)", errno); + close(new_fd); + fdreceiver->client_fd = -1; + } +} + +void +td_fdreceiver_stop(struct td_fdreceiver *fdreceiver) +{ + if (fdreceiver->client_fd >= 0) + close(fdreceiver->client_fd); + + if (fdreceiver->client_event_id >= 0) + tapdisk_server_unregister_event(fdreceiver->client_event_id); + + if (fdreceiver->fd >= 0) + close(fdreceiver->fd); + + if (fdreceiver->fd_event_id >= 0) + tapdisk_server_unregister_event(fdreceiver->fd_event_id); + + if (fdreceiver->path != NULL) { + unlink(fdreceiver->path); + free(fdreceiver->path); + } + + free(fdreceiver); +} + +struct td_fdreceiver * +td_fdreceiver_start(char *path, fd_cb_t callback, void *data) +{ + unsigned int s = -1; + struct sockaddr_un local; + int len; + int err; + struct td_fdreceiver *fdreceiver; + + fdreceiver = malloc(sizeof(struct td_fdreceiver)); + if (!fdreceiver) { + ERROR("td_fdreceiver_start: error allocating memory for " + "fdreceiver (path=%s)", path); + goto error; + } + + fdreceiver->path = strdup(path); + fdreceiver->fd = -1; + fdreceiver->fd_event_id = -1; + fdreceiver->client_fd = -1; + fdreceiver->client_event_id = -1; + fdreceiver->callback = callback; + fdreceiver->callback_data = data; + + snprintf(local.sun_path, sizeof(local.sun_path), "%s", path); + local.sun_family = AF_UNIX; + + /* + * NB: here we unlink 
anything that was there before - be very careful + * with the paths you pass to this function! + */ + unlink(local.sun_path); + len = strlen(local.sun_path) + sizeof(local.sun_family); + + s = socket(AF_UNIX, SOCK_STREAM, 0); + + if (s < 0) { + ERROR("td_fdreceiver_start: error creating socket " + "(path=%s)", path); + goto error; + } + + err = bind(s, (struct sockaddr *)&local, len); + if (err < 0) { + ERROR("td_fdreceiver_start: error binding (path=%s)", path); + goto error; + } + + err = listen(s, 5); + if (err < 0) { + ERROR("td_fdreceiver_start: error listening (path=%s)", path); + goto error; + } + + fdreceiver->fd = s; + + fdreceiver->fd_event_id + tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, + fdreceiver->fd, 0, + td_fdreceiver_accept_fd, + fdreceiver); + + if (fdreceiver->fd_event_id < 0) { + ERROR("td_fdreceiver_start: error registering event " + "(path=%s)", path); + goto error; + } + + INFO("Set up local unix domain socket on path ''%s''", path); + + return fdreceiver; + +error: + free(fdreceiver); + + if (s >= 0) + close(s); + + return NULL; +} diff --git a/tools/blktap3/drivers/tapdisk-fdreceiver.h b/tools/blktap3/drivers/tapdisk-fdreceiver.h new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/tapdisk-fdreceiver.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2012, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. 
nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* Unix domain socket fd receiver */ + +typedef void (*fd_cb_t) (int fd, char *msg, void *data); + +struct td_fdreceiver *td_fdreceiver_start(char *path, fd_cb_t, void *data); +void td_fdreceiver_stop(struct td_fdreceiver *); + +struct td_fdreceiver { + char *path; + + int fd; + int fd_event_id; + + int client_fd; + int client_event_id; + + fd_cb_t callback; + void *callback_data; +}; diff --git a/tools/blktap3/drivers/tapdisk-image.c b/tools/blktap3/drivers/tapdisk-image.c --- a/tools/blktap3/drivers/tapdisk-image.c +++ b/tools/blktap3/drivers/tapdisk-image.c @@ -499,7 +499,6 @@ tapdisk_image_open_chain(const char *par type = tapdisk_disktype_parse_params(params, &name); if (type >= 0) { err = __tapdisk_image_open_chain(type, name, flags, head, prt_path); - BUG_ON(TAILQ_EMPTY(head)); return err; } diff --git a/tools/blktap3/drivers/tapdisk-interface.c b/tools/blktap3/drivers/tapdisk-interface.c --- a/tools/blktap3/drivers/tapdisk-interface.c +++ b/tools/blktap3/drivers/tapdisk-interface.c @@ -121,7 +121,7 @@ 
td_close(td_image_t * image) driver->refcnt--; if (!driver->refcnt && td_flag_test(driver->state, TD_DRIVER_OPEN)) { - driver->ops->td_close(driver); + driver->ops->td_close(driver, NULL); td_flag_clear(driver->state, TD_DRIVER_OPEN); } diff --git a/tools/blktap3/drivers/tapdisk-log.c b/tools/blktap3/drivers/tapdisk-log.c --- a/tools/blktap3/drivers/tapdisk-log.c +++ b/tools/blktap3/drivers/tapdisk-log.c @@ -88,7 +88,7 @@ static void tlog_logfile_save(void) err = tapdisk_logfile_rename(logfile, TLOG_DIR, name, ".log"); - tlog_syslog(LOG_INFO, "logfile saved to %s: %d\n", logfile->path, err); + tlog_syslog(TLOG_INFO, "logfile saved to %s: %d\n", logfile->path, err); } static void tlog_logfile_close(void) @@ -172,6 +172,9 @@ void tlog_vsyslog(int prio, const char * void tlog_syslog(int prio, const char *fmt, ...) { va_list ap; + static const int tlog_to_syslog[3] = {LOG_WARNING, LOG_INFO, LOG_DEBUG}; + + prio = prio >= 0 && prio < 3 ? tlog_to_syslog[prio] : LOG_INFO; va_start(ap, fmt); tlog_vsyslog(prio, fmt, ap); diff --git a/tools/blktap3/drivers/tapdisk-log.h b/tools/blktap3/drivers/tapdisk-log.h --- a/tools/blktap3/drivers/tapdisk-log.h +++ b/tools/blktap3/drivers/tapdisk-log.h @@ -25,6 +25,7 @@ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ + #ifndef _TAPDISK_LOG_H_ #define _TAPDISK_LOG_H_ diff --git a/tools/blktap3/drivers/tapdisk-nbd.h b/tools/blktap3/drivers/tapdisk-nbd.h new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/tapdisk-nbd.h @@ -0,0 +1,89 @@ +/* + * 1999 Copyright (C) Pavel Machek, pavel@ucw.cz. This code is GPL. + * 1999/11/04 Copyright (C) 1999 VMware, Inc. (Regis "HPReg" Duchesne) + * Made nbd_end_request() use the io_request_lock + * 2001 Copyright (C) Steven Whitehouse + * New nbd_end_request() for compatibility with new linux block + * layer code. + * 2003/06/24 Louis D. 
Langholtz <ldl@aros.net> + * Removed unneeded blksize_bits field from nbd_device struct. + * Cleanup PARANOIA usage & code. + * 2004/02/19 Paul Clements + * Removed PARANOIA, plus various cleanup and comments + */ + +#ifndef LINUX_NBD_H +#define LINUX_NBD_H + +//#include <linux/types.h> + +#define NBD_NEGOTIATION_MAGIC 0x00420281861253LL + +#define NBD_SET_SOCK _IO( 0xab, 0 ) +#define NBD_SET_BLKSIZE _IO( 0xab, 1 ) +#define NBD_SET_SIZE _IO( 0xab, 2 ) +#define NBD_DO_IT _IO( 0xab, 3 ) +#define NBD_CLEAR_SOCK _IO( 0xab, 4 ) +#define NBD_CLEAR_QUE _IO( 0xab, 5 ) +#define NBD_PRINT_DEBUG _IO( 0xab, 6 ) +#define NBD_SET_SIZE_BLOCKS _IO( 0xab, 7 ) +#define NBD_DISCONNECT _IO( 0xab, 8 ) +#define NBD_SET_TIMEOUT _IO( 0xab, 9 ) +#define NBD_SET_FLAGS _IO( 0xab, 10 ) + +enum { + NBD_CMD_READ = 0, + NBD_CMD_WRITE = 1, + NBD_CMD_DISC = 2, + NBD_CMD_FLUSH = 3, + NBD_CMD_TRIM = 4 +}; + +#define NBD_CMD_MASK_COMMAND 0x0000ffff +#define NBD_CMD_FLAG_FUA (1<<16) + +/* values for flags field */ +#define NBD_FLAG_HAS_FLAGS (1 << 0) /* Flags are there */ +#define NBD_FLAG_READ_ONLY (1 << 1) /* Device is read-only */ +#define NBD_FLAG_SEND_FLUSH (1 << 2) /* Send FLUSH */ +#define NBD_FLAG_SEND_FUA (1 << 3) /* Send FUA (Force Unit Access) */ +#define NBD_FLAG_ROTATIONAL (1 << 4) /* Use elevator algorithm - + rotational media */ +#define NBD_FLAG_SEND_TRIM (1 << 5) /* Send TRIM (discard) */ + +#define nbd_cmd(req) ((req)->cmd[0]) + +/* userspace doesn''t need the nbd_device structure */ + +/* These are sent over the network in the request/reply magic fields */ + +#define NBD_REQUEST_MAGIC 0x25609513 +#define NBD_REPLY_MAGIC 0x67446698 +/* Do *not* use magics: 0x12560953 0x96744668. */ + +#define __be32 uint32_t +#define __be64 uint64_t + + +/* + * This is the packet used for communication between client and + * server. All data are in network byte order. 
+ */ +struct nbd_request { + __be32 magic; + __be32 type; /* == READ || == WRITE */ + char handle[8]; + __be64 from; + __be32 len; +} __attribute__ ((packed)); + +/* + * This is the reply packet that nbd-server sends back to the client after + * it has completed an I/O request (or an error occurs). + */ +struct nbd_reply { + __be32 magic; + __be32 error; /* 0 = ok, else error */ + char handle[8]; /* handle you got from request */ +}; +#endif diff --git a/tools/blktap3/drivers/tapdisk-nbdserver.c b/tools/blktap3/drivers/tapdisk-nbdserver.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/tapdisk-nbdserver.c @@ -0,0 +1,712 @@ +/* + * Copyright (c) 2012, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <errno.h> +#include <string.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <netdb.h> +#include <arpa/inet.h> +#include <sys/wait.h> +#include <sys/un.h> + +#include "tapdisk.h" +#include "tapdisk-server.h" +#include "tapdisk-driver.h" +#include "tapdisk-interface.h" +#include "tapdisk-utils.h" +#include "tapdisk-nbdserver.h" +#include "tapdisk-fdreceiver.h" + +#include "tapdisk-nbd.h" + +#define NBD_SERVER_NUM_REQS TAPDISK_DATA_REQUESTS + +#define TAPDISK_NBDSERVER_LISTEN_SOCK_PATH "/var/run/blktap-control/nbdserver" +#define TAPDISK_NBDSERVER_MAX_PATH_LEN 256 + +/* + * Server + */ + +#define INFO(_f, _a...) tlog_syslog(TLOG_INFO, "nbd: " _f, ##_a) +#define ERROR(_f, _a...) 
tlog_syslog(TLOG_WARN, "nbd: " _f, ##_a) + +struct td_nbdserver_req { + td_vbd_request_t vreq; + char id[16]; + struct td_iovec iov; +}; + +static void tapdisk_nbdserver_disable_client(td_nbdserver_client_t *client); +static void tapdisk_nbdserver_clientcb(event_id_t id, char mode, void *data); +int tapdisk_nbdserver_setup_listening_socket(td_nbdserver_t *server); +int tapdisk_nbdserver_unpause(td_nbdserver_t *server); + +static td_nbdserver_req_t * +tapdisk_nbdserver_alloc_request(td_nbdserver_client_t *client) +{ + td_nbdserver_req_t *req = NULL; + + if (likely(client->n_reqs_free)) + req = client->reqs_free[--client->n_reqs_free]; + + return req; +} + +static void +tapdisk_nbdserver_free_request(td_nbdserver_client_t *client, + td_nbdserver_req_t *req) +{ + if (client->n_reqs_free >= client->n_reqs) { + ERROR("Error, trying to free a client, but the free list " + "is full! leaking!"); + return; + } + client->reqs_free[client->n_reqs_free++] = req; +} + +static void +tapdisk_nbdserver_reqs_free(td_nbdserver_client_t *client) +{ + if (client->reqs) { + free(client->reqs); + client->reqs = NULL; + } + + if (client->iovecs) { + free(client->iovecs); + client->iovecs = NULL; + } + + if (client->reqs_free) { + free(client->reqs_free); + client->reqs_free = NULL; + } +} + +static int +tapdisk_nbdserver_reqs_init(td_nbdserver_client_t *client, int n_reqs) +{ + int i, err; + + INFO("Reqs init"); + + client->reqs = malloc(n_reqs * sizeof(td_nbdserver_req_t)); + if (!client->reqs) { + err = -errno; + goto fail; + } + client->iovecs = malloc(n_reqs * sizeof(struct td_iovec)); + if (!client->iovecs) { + err = - errno; + goto fail; + } + + client->reqs_free = malloc(n_reqs * sizeof(td_nbdserver_req_t*)); + if (!client->reqs_free) { + err = -errno; + goto fail; + } + + client->n_reqs = n_reqs; + client->n_reqs_free = 0; + + for (i = 0; i < n_reqs; i++) { + client->reqs[i].vreq.iov = &client->iovecs[i]; + tapdisk_nbdserver_free_request(client, &client->reqs[i]); + } + + return 
0; + +fail: + tapdisk_nbdserver_reqs_free(client); + return err; +} + +static td_nbdserver_client_t * +tapdisk_nbdserver_alloc_client(td_nbdserver_t *server) +{ + td_nbdserver_client_t *client = NULL; + int err; + + INFO("Alloc client"); + + client = malloc(sizeof(td_nbdserver_client_t)); + if (!client) { + ERROR("Couldn''t allocate client structure: %s", + strerror(errno)); + goto fail; + } + + bzero(client, sizeof(td_nbdserver_client_t)); + + err = tapdisk_nbdserver_reqs_init(client, NBD_SERVER_NUM_REQS); + if (err < 0) { + ERROR("Couldn''t allocate client reqs: %d", err); + goto fail; + } + + client->client_fd = -1; + client->client_event_id = -1; + client->server = server; + TAILQ_INSERT_HEAD(&server->clients, client, clientlist); + + client->paused = 0; + + return client; + +fail: + if (client) { + free(client); + client = NULL; + } + + return client; +} + +static void +tapdisk_nbdserver_free_client(td_nbdserver_client_t *client) +{ + INFO("Free client"); + + if (!client) { + ERROR("Attempt to free NULL pointer!"); + return; + } + + if (client->client_event_id >= 0) + tapdisk_nbdserver_disable_client(client); + + TAILQ_REMOVE(&client->server->clients, client, clientlist); + tapdisk_nbdserver_reqs_free(client); + free(client); +} + +static int +tapdisk_nbdserver_enable_client(td_nbdserver_client_t *client) +{ + INFO("Enable client"); + + if (client->client_event_id >= 0) { + ERROR("Attempting to enable an already-enabled client"); + return -1; + } + + if (client->client_fd < 0) { + ERROR("Attempting to register events on a closed client"); + return -1; + } + + client->client_event_id = tapdisk_server_register_event( + SCHEDULER_POLL_READ_FD, + client->client_fd, 0, + tapdisk_nbdserver_clientcb, + client); + + if (client->client_event_id < 0) { + ERROR("Error registering events on client: %d", + client->client_event_id); + return client->client_event_id; + } + + return client->client_event_id; +} + +static void 
+tapdisk_nbdserver_disable_client(td_nbdserver_client_t *client) +{ + INFO("Disable client"); + + if (client->client_event_id < 0) { + ERROR("Attempting to disable an already-disabled client"); + return; + } + + tapdisk_server_unregister_event(client->client_event_id); + client->client_event_id = -1; +} + +static void +*get_in_addr(struct sockaddr_storage *ss) +{ + if (ss->ss_family == AF_INET) + return &(((struct sockaddr_in*)ss)->sin_addr); + + return &(((struct sockaddr_in6*)ss)->sin6_addr); +} + +static void +__tapdisk_nbdserver_request_cb(td_vbd_request_t *vreq, int error, + void *token, int final __attribute__((unused))) +{ + td_nbdserver_client_t *client = token; + td_nbdserver_req_t *req = containerof(vreq, td_nbdserver_req_t, vreq); + struct nbd_reply reply; + int tosend = 0; + int sent = 0; + int len = 0; + + reply.magic = htonl(NBD_REPLY_MAGIC); + reply.error = htonl(error); + memcpy(reply.handle, req->id, sizeof(reply.handle)); + + if (client->client_fd < 0) { + ERROR("Finishing request for client that has disappeared"); + goto finish; + } + + send(client->client_fd, &reply, sizeof(reply), 0); + + switch(vreq->op) { + case TD_OP_READ: + tosend = len = vreq->iov->secs << SECTOR_SHIFT; + while (tosend > 0) { + sent = send(client->client_fd, + vreq->iov->base + (len - tosend), + tosend, 0); + if (sent <= 0) { + ERROR("Short send or error in " + "callback: %d", sent); + goto finish; + } + + tosend -= sent; + } + break; + default: + break; + } + +finish: + free(vreq->iov->base); + tapdisk_nbdserver_free_request(client, req); +} + +static void tapdisk_nbdserver_newclient_fd(td_nbdserver_t *server, int new_fd); + +static void +tapdisk_nbdserver_clientcb(event_id_t id __attribute__((unused)), + char mode __attribute__((unused)), void *data) +{ + td_nbdserver_client_t *client = data; + td_nbdserver_t *server = client->server; + int rc; + int len; + int hdrlen; + int n; + int fd = client->client_fd; + char *ptr; + td_vbd_request_t *vreq; + struct nbd_request 
request; + + td_nbdserver_req_t *req = tapdisk_nbdserver_alloc_request(client); + + if (req == NULL) { + ERROR("Couldn''t allocate request in clientcb - killing client"); + tapdisk_nbdserver_free_client(client); + return; + } + + vreq = &req->vreq; + + memset(req, 0, sizeof(td_nbdserver_req_t)); + /* Read the request the client has sent */ + + hdrlen = sizeof(struct nbd_request); + + n = 0; + ptr = (char *) &request; + while (n < hdrlen) { + rc = recv(fd, ptr + n, hdrlen - n, 0); + if (rc == 0 || errno == ECONNRESET) { + INFO("Client closed connection"); + goto fail; + } + if (rc < 0) { + ERROR("Bad return in nbdserver_clientcb. Closing " + "connection: %s\n", strerror(errno)); + goto fail; + } + n += rc; + } + + if (request.magic != htonl(NBD_REQUEST_MAGIC)) { + ERROR("Not enough magic"); + goto fail; + } + + request.from = ntohll(request.from); + request.type = ntohl(request.type); + len = ntohl(request.len); + if (((len & 0x1ff) != 0) || ((request.from & 0x1ff) != 0)) { + ERROR("Non sector-aligned request (%"PRIu64", %d)", + request.from, len); + } + + bzero(req->id, sizeof(req->id)); + memcpy(req->id, request.handle, sizeof(request.handle)); + + rc = posix_memalign(&req->iov.base, 512, len); + if (rc < 0) { + ERROR("posix_memalign failed (%d)", rc); + goto fail; + } + + vreq->sec = request.from >> SECTOR_SHIFT; + vreq->iovcnt = 1; + vreq->iov = &req->iov; + vreq->iov->secs = len >> SECTOR_SHIFT; + vreq->token = client; + vreq->cb = __tapdisk_nbdserver_request_cb; + vreq->name = req->id; + vreq->vbd = server->vbd; + + switch(request.type) { + case NBD_CMD_READ: + vreq->op = TD_OP_READ; + break; + case NBD_CMD_WRITE: + vreq->op = TD_OP_WRITE; + + n = 0; + while (n < len) { + rc = recv(fd, vreq->iov->base + n, (len - n), 0); + if (rc <= 0) { + ERROR("Short send or error in " + "callback: %d", rc); + goto fail; + } + + n += rc; + }; + + break; + case NBD_CMD_DISC: + INFO("Received close message. 
Sending reconnect " + "header"); + tapdisk_nbdserver_free_client(client); + INFO("About to send initial connection message"); + tapdisk_nbdserver_newclient_fd(server, fd); + INFO("Sent"); + return; + + default: + ERROR("Unsupported operation: 0x%x", request.type); + goto fail; + } + + rc = tapdisk_vbd_queue_request(server->vbd, vreq); + if (rc) { + ERROR("tapdisk_vbd_queue_request failed: %d", rc); + goto fail; + } + + return; + +fail: + tapdisk_nbdserver_free_client(client); + return; +} + +static void +tapdisk_nbdserver_newclient_fd(td_nbdserver_t *server, int new_fd) +{ + char buffer[256]; + int rc; + uint64_t tmp64; + uint32_t tmp32; + td_nbdserver_client_t *client; + + INFO("Got a new client!"); + + /* Spit out the NBD connection stuff */ + + memcpy(buffer, "NBDMAGIC", 8); + tmp64 = htonll(NBD_NEGOTIATION_MAGIC); + memcpy(buffer + 8, &tmp64, sizeof(tmp64)); + tmp64 = htonll(server->info.size * server->info.sector_size); + memcpy(buffer + 16, &tmp64, sizeof(tmp64)); + tmp32 = htonl(0); + memcpy(buffer + 24, &tmp32, sizeof(tmp32)); + bzero(buffer + 28, 124); + + rc = send(new_fd, buffer, 152, 0); + + if (rc < 152) { + close(new_fd); + INFO("Short write in negotiation!"); + } + + INFO("About to alloc client"); + client = tapdisk_nbdserver_alloc_client(server); + INFO("Got an allocated client at %p", client); + client->client_fd = new_fd; + INFO("About to enable client"); + + if (tapdisk_nbdserver_enable_client(client) < 0) { + ERROR("Error enabling client"); + tapdisk_nbdserver_free_client(client); + close(new_fd); + return; + } +} + +static void +tapdisk_nbdserver_fdreceiver_cb(int fd, char *msg, void *data) +{ + td_nbdserver_t *server = data; + INFO("Received fd with msg: %s", msg); + tapdisk_nbdserver_newclient_fd(server, fd); +} + +static void +tapdisk_nbdserver_newclient(event_id_t id __attribute__((unused)), + char mode __attribute__((unused)), void *data) +{ + struct sockaddr_storage their_addr; + socklen_t sin_size = sizeof(their_addr); + char 
s[INET6_ADDRSTRLEN]; + int new_fd; + td_nbdserver_t *server = data; + + INFO("About to accept (server->listening_fd = %d)", + server->listening_fd); + new_fd = accept(server->listening_fd, (struct sockaddr *)&their_addr, + &sin_size); + + if (new_fd == -1) { + ERROR("accept (%s)", strerror(errno)); + return; + } + + inet_ntop(their_addr.ss_family, get_in_addr(&their_addr), s, sizeof s); + + INFO("server: got connection from %s\n", s); + + tapdisk_nbdserver_newclient_fd(server, new_fd); +} + +td_nbdserver_t * +tapdisk_nbdserver_alloc(td_vbd_t *vbd, td_disk_info_t info) +{ + td_nbdserver_t *server; + char fdreceiver_path[TAPDISK_NBDSERVER_MAX_PATH_LEN]; + int i; + + server = malloc(sizeof(*server)); + if (!server) { + ERROR("Failed to allocate memory for nbdserver: %s", strerror(errno)); + return NULL; + } + + memset(server, 0, sizeof(*server)); + server->listening_fd = -1; + server->listening_event_id = -1; + TAILQ_INIT(&server->clients); + + server->vbd = vbd; + server->info = info; + + snprintf(fdreceiver_path, TAPDISK_NBDSERVER_MAX_PATH_LEN, "%s%d-%s", + TAPDISK_NBDSERVER_LISTEN_SOCK_PATH, getpid(), + vbd->name); + + /* + * XXX The path we''re supplying will be appended to the socket path, so it + * cannot contain the ''/'' character. We replace all ''/'' with ''-''. + */ + for (i = strlen(TAPDISK_NBDSERVER_LISTEN_SOCK_PATH); + fdreceiver_path[i] != ''\0''; i++) { + if (fdreceiver_path[i] == ''/'') { + fdreceiver_path[i] = ''-''; + } + } + + server->fdreceiver = td_fdreceiver_start(fdreceiver_path, + tapdisk_nbdserver_fdreceiver_cb, server); + + if (!server->fdreceiver) { + ERROR("Error setting up fd receiver"); + /* + * TODO If td_fdreceiver_start failed, we don''t have to call + * tapdisk_server_unregister_event, right? 
+ */ + tapdisk_server_unregister_event(server->listening_event_id); + close(server->listening_fd); + return NULL; + } + + return server; +} + +int +tapdisk_nbdserver_listen(td_nbdserver_t *server, int port) +{ + struct addrinfo hints, *servinfo, *p; + char portstr[10]; + int err; + int yes = 1; + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_PASSIVE; + + snprintf(portstr, 10, "%d", port); + + if ((err = getaddrinfo(NULL, portstr, &hints, &servinfo)) != 0) { + ERROR("Failed to getaddrinfo"); + return -1; + } + + for (p = servinfo; p != NULL; p = p->ai_next) { + if ((server->listening_fd = socket(AF_INET, SOCK_STREAM, 0)) == + -1) { + ERROR("Failed to create socket"); + continue; + } + + if (setsockopt(server->listening_fd, SOL_SOCKET, SO_REUSEADDR, + &yes, sizeof(int)) == -1) { + ERROR("Failed to setsockopt"); + close(server->listening_fd); + return -1; + } + + if (bind(server->listening_fd, p->ai_addr, p->ai_addrlen) == + -1) { + ERROR("Failed to bind"); + close(server->listening_fd); + continue; + } + + break; + } + + if (p == NULL) { + ERROR("Failed to bind"); + close(server->listening_fd); + return -1; + } + + freeaddrinfo(servinfo); + + if (listen(server->listening_fd, 10) == -1) { + ERROR("listen"); + return -1; + } + + tapdisk_nbdserver_unpause(server); + + if (server->listening_event_id < 0) { + err = server->listening_event_id; + close(server->listening_fd); + return -1; + } + + INFO("Successfully started NBD server"); + + return 0; +} + +void +tapdisk_nbdserver_pause(td_nbdserver_t *server) +{ + struct td_nbdserver_client *pos, *q; + + INFO("NBD server pause(%p)", server); + + TAILQ_FOREACH_SAFE(pos, &server->clients, clientlist, q) { + if (pos->paused != 1 && pos->client_event_id >= 0) { + tapdisk_nbdserver_disable_client(pos); + pos->paused = 1; + } + } + + if (server->listening_event_id >= 0) + tapdisk_server_unregister_event(server->listening_event_id); +} + +int 
+tapdisk_nbdserver_unpause(td_nbdserver_t *server) +{ + struct td_nbdserver_client *pos, *q; + + INFO("NBD server unpause(%p) - listening_fd = %d", server, + server->listening_fd); + + TAILQ_FOREACH_SAFE(pos, &server->clients, clientlist, q) { + if (pos->paused == 1) { + tapdisk_nbdserver_enable_client(pos); + pos->paused = 0; + } + } + + if (server->listening_event_id < 0 && server->listening_fd >= 0) { + server->listening_event_id = + tapdisk_server_register_event(SCHEDULER_POLL_READ_FD, + server->listening_fd, 0, + tapdisk_nbdserver_newclient, + server); + INFO("registering for listening_fd"); + } + + return server->listening_event_id; +} + +void +tapdisk_nbdserver_free(td_nbdserver_t *server) +{ + struct td_nbdserver_client *pos, *q; + + INFO("NBD server free(%p)", server); + + TAILQ_FOREACH_SAFE(pos, &server->clients, clientlist, q) + tapdisk_nbdserver_free_client(pos); + + if (server->listening_event_id >= 0) { + tapdisk_server_unregister_event(server->listening_event_id); + server->listening_event_id = -1; + } + + if (server->listening_fd >= 0) { + close(server->listening_fd); + server->listening_fd = -1; + } + + if (server->fdreceiver) + td_fdreceiver_stop(server->fdreceiver); + + free(server); +} diff --git a/tools/blktap3/drivers/tapdisk-nbdserver.h b/tools/blktap3/drivers/tapdisk-nbdserver.h new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/tapdisk-nbdserver.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2012, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef _TAPDISK_NBDSERVER_H_ +#define _TAPDISK_NBDSERVER_H_ + +typedef struct td_nbdserver td_nbdserver_t; +typedef struct td_nbdserver_req td_nbdserver_req_t; +typedef struct td_nbdserver_client td_nbdserver_client_t; + +#include "blktap3.h" +#include "tapdisk-vbd.h" + +TAILQ_HEAD(tqh_td_nbdserver_client, td_nbdserver_client); + +struct td_nbdserver { + td_vbd_t *vbd; + td_disk_info_t info; + + int listening_fd; + int listening_event_id; + + struct td_fdreceiver *fdreceiver; + + /** + * list of td_nbdserver_client + */ + struct tqh_td_nbdserver_client clients; +}; + +struct td_nbdserver_client { + int n_reqs; + td_nbdserver_req_t *reqs; + struct td_iovec *iovecs; + int n_reqs_free; + td_nbdserver_req_t **reqs_free; + + int client_fd; + int client_event_id; + + td_nbdserver_t *server; + + /** + * for linked lists + */ + TAILQ_ENTRY(td_nbdserver_client) clientlist; /* TODO rename to entry */ + + int paused; +}; + +td_nbdserver_t *tapdisk_nbdserver_alloc(td_vbd_t *, td_disk_info_t); +int tapdisk_nbdserver_listen(td_nbdserver_t *, int); +void tapdisk_nbdserver_free(td_nbdserver_t *); +void tapdisk_nbdserver_pause(td_nbdserver_t *); +int tapdisk_nbdserver_unpause(td_nbdserver_t *); + +#endif /* _TAPDISK_NBDSERVER_H_ */ diff --git a/tools/blktap3/drivers/tapdisk-queue.c b/tools/blktap3/drivers/tapdisk-queue.c --- a/tools/blktap3/drivers/tapdisk-queue.c +++ b/tools/blktap3/drivers/tapdisk-queue.c @@ -347,7 +347,7 @@ static int __lio_setup_aio_eventfd(struc static int tapdisk_lio_setup_aio(struct tqueue *queue, int qlen) { struct lio *lio = queue->tio_data; - int err; + int err, old_err = 0; lio->aio_ctx = 0; lio->event_fd = -1; @@ -359,17 +359,20 @@ static int tapdisk_lio_setup_aio(struct err = !tapdisk_lio_check_resfd(); if (!err) - err = __lio_setup_aio_eventfd(queue, qlen); + err = old_err = __lio_setup_aio_eventfd(queue, qlen); if (err) err = __lio_setup_aio_poll(queue, qlen); - if (err == -EAGAIN) + /* __lio_setup_aio_poll seems to always fail with EINVAL 
on newer systems, + * probably because it initializes the output parameter of io_setup to a + * non-zero value and the kernel patch that understands this is missing */ + if (err == -EAGAIN || (err && old_err == -EAGAIN)) goto fail_rsv; fail: return err; fail_rsv: - DPRINTF("Couldn''t setup AIO context. If you are trying to " + EPRINTF("Couldn''t setup AIO context. If you are trying to " "concurrently use a large number of blktap-based disks, you may " "need to increase the system-wide aio request limit. " "(e.g. ''echo 1048576 > /proc/sys/fs/aio-max-nr'')\n"); diff --git a/tools/blktap3/drivers/tapdisk-stats.c b/tools/blktap3/drivers/tapdisk-stats.c --- a/tools/blktap3/drivers/tapdisk-stats.c +++ b/tools/blktap3/drivers/tapdisk-stats.c @@ -29,6 +29,8 @@ #include <stdio.h> #include <stdarg.h> +#include <stdlib.h> +#include <errno.h> #include "tapdisk.h" #include "tapdisk-stats.h" @@ -38,8 +40,28 @@ static void __stats_vsprintf(td_stats_t * st, const char *fmt, va_list ap) { - size_t size = st->buf + st->size - st->pos; - st->pos += vsnprintf(st->pos, size, fmt, ap); + void *buf; + int written, new_size, off; + size_t size = 0; + written = 1; + while (written > size) { + size = st->buf + st->size - st->pos; + written = vsnprintf(st->pos, size, fmt, ap); + if (written <= size) + break; + new_size = st->size * 2; + buf = realloc(st->buf, new_size); + if (!buf) { + st->err = -ENOMEM; + written = size; + break; + } + off = st->pos - st->buf; + st->buf = buf; + st->size = new_size; + st->pos = st->buf + off; + } + st->pos += written; } static void __printf(2, 3) diff --git a/tools/blktap3/drivers/tapdisk-stats.h b/tools/blktap3/drivers/tapdisk-stats.h --- a/tools/blktap3/drivers/tapdisk-stats.h +++ b/tools/blktap3/drivers/tapdisk-stats.h @@ -42,6 +42,7 @@ struct tapdisk_stats_ctx { int n_elem[TD_STATS_MAX_DEPTH]; int depth; + int err; }; typedef struct tapdisk_stats_ctx td_stats_t; @@ -58,6 +59,9 @@ tapdisk_stats_init(td_stats_t * st, char static inline size_t 
tapdisk_stats_length(td_stats_t * st) { + if (st->err) + return st->err; + return st->pos - st->buf; } diff --git a/tools/blktap3/drivers/tapdisk-stream.c b/tools/blktap3/drivers/tapdisk-stream.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/tapdisk-stream.c @@ -0,0 +1,510 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> +#include <unistd.h> +#include <sys/mman.h> + +#include "list.h" +#include "scheduler.h" +#include "tapdisk.h" +#include "tapdisk-server.h" +#include "tapdisk-disktype.h" + +#define POLL_READ 0 +#define POLL_WRITE 1 + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define BUG(_cond) td_panic() +#define BUG_ON(_cond) if (unlikely(_cond)) { td_panic(); } + +#define TD_STREAM_MAX_REQS 16 +#define TD_STREAM_REQ_SIZE (sysconf(_SC_PAGE_SIZE) * 32) + +typedef struct tapdisk_stream_request td_stream_req_t; +typedef struct tapdisk_stream td_stream_t; + +struct tapdisk_stream_request { + void *buf; + struct td_iovec iov; + td_vbd_request_t vreq; + struct list_head entry; +}; + +struct tapdisk_stream { + td_vbd_t *vbd; + + unsigned int id; + int in_fd; + int out_fd; + + int err; + + td_sector_t sec_in; + td_sector_t sec_out; + uint64_t count; + + struct list_head pending_list; + struct list_head completed_list; + + td_stream_req_t reqs[TD_STREAM_MAX_REQS]; + td_stream_req_t *free[TD_STREAM_MAX_REQS]; + int n_free; +}; + +static unsigned int tapdisk_stream_count; + +static void tapdisk_stream_close_image(td_stream_t *); +static void tapdisk_stream_queue_requests(td_stream_t *); + +static void +usage(const char *app, int err) +{ + printf("usage: %s <-n type:/path/to/image> " + "[-c sector count] [-s skip sectors]\n", app); + exit(err); +} + +static inline int +tapdisk_stream_stop(td_stream_t *s) +{ + return (list_empty(&s->pending_list) && (!s->count || s->err)); +} + +static int +tapdisk_stream_req_create(td_stream_req_t *req) +{ + int prot, flags; + + memset(req, 0, sizeof(*req)); + INIT_LIST_HEAD(&req->entry); + + prot = PROT_READ|PROT_WRITE; + flags = MAP_ANONYMOUS|MAP_PRIVATE; + + req->buf = mmap(NULL, TD_STREAM_REQ_SIZE, prot, flags, -1, 0); + if (req->buf == MAP_FAILED) { + req->buf = NULL; + 
return -errno; + } + + return 0; +} + +static void +tapdisk_stream_req_destroy(td_stream_req_t *req) +{ + if (req->buf) { + int err = munmap(req->iov.base, TD_STREAM_REQ_SIZE); + BUG_ON(err); + req->iov.base = NULL; + } +} + +td_stream_req_t * +tapdisk_stream_alloc_req(td_stream_t *s) +{ + td_stream_req_t *req = NULL; + + if (likely(s->n_free)) + req = s->free[--s->n_free]; + + return req; +} + +void +tapdisk_stream_free_req(td_stream_t *s, td_stream_req_t *req) +{ + BUG_ON(s->n_free >= MAX_REQUESTS); + BUG_ON(!list_empty(&req->entry)); + s->free[s->n_free++] = req; +} + +static void +tapdisk_stream_destroy_reqs(td_stream_t *s) +{ + td_stream_req_t *req; + + do { + req = tapdisk_stream_alloc_req(s); + if (!req) + break; + + tapdisk_stream_req_destroy(req); + } while (1); +} + +static int +tapdisk_stream_create_reqs(td_stream_t *s) +{ + int i, err; + + s->n_free = 0; + + for (i = 0; i < TD_STREAM_MAX_REQS; i++) { + td_stream_req_t *req = &s->reqs[i]; + + err = tapdisk_stream_req_create(req); + if (err) + goto fail; + + tapdisk_stream_free_req(s, req); + } + + return 0; + +fail: + tapdisk_stream_destroy_reqs(s); + return err; +} + +static int +tapdisk_stream_print_request(td_stream_t *s, td_stream_req_t *req) +{ + struct td_iovec *iov = &req->iov; + + int gcc = write(s->out_fd, iov->base, iov->secs << SECTOR_SHIFT); + if (gcc) {}; + + return iov->secs; +} + +static void +tapdisk_stream_write_data(td_stream_t *s) +{ + td_stream_req_t *req, *next; + + list_for_each_entry_safe(req, next, &s->completed_list, entry) { + if (req->vreq.sec != s->sec_out) + break; + + s->sec_out += tapdisk_stream_print_request(s, req); + + list_del_init(&req->entry); + tapdisk_stream_free_req(s, req); + } +} + +static inline void +tapdisk_stream_queue_completed(td_stream_t *s, td_stream_req_t *req) +{ + td_stream_req_t *itr; + + list_for_each_entry(itr, &s->completed_list, entry) + if (req->vreq.sec < itr->vreq.sec) + break; + + list_add_tail(&req->entry, &itr->entry); +} + +static void 
+tapdisk_stream_complete_request(td_stream_t *s, td_stream_req_t *req, + int error, int final) +{ + list_del_init(&req->entry); + + if (likely(!error)) + tapdisk_stream_queue_completed(s, req); + else { + s->err = EIO; + tapdisk_stream_free_req(s, req); + fprintf(stderr, "error reading sector 0x%"PRIx64"\n", + req->vreq.sec); + } + + if (!final) + return; + + tapdisk_stream_write_data(s); + + if (tapdisk_stream_stop(s)) { + tapdisk_stream_close_image(s); + return; + } + + tapdisk_stream_queue_requests(s); +} + +static void +__tapdisk_stream_request_cb(td_vbd_request_t *vreq, int error, + void *token, int final) +{ + td_stream_req_t *req = containerof(vreq, td_stream_req_t, vreq); + td_stream_t *s = token; + + tapdisk_stream_complete_request(s, req, error, final); +} + +static void +tapdisk_stream_queue_request(td_stream_t *s, td_stream_req_t *req) +{ + td_vbd_request_t *vreq; + struct td_iovec *iov; + int secs, err; + + iov = &req->iov; + secs = MIN(TD_STREAM_REQ_SIZE >> SECTOR_SHIFT, s->count); + + iov->base = req->buf; + iov->secs = secs; + + vreq = &req->vreq; + vreq->iov = iov; + vreq->iovcnt = 1; + vreq->sec = s->sec_in; + vreq->op = TD_OP_READ; + vreq->name = NULL; + vreq->token = s; + vreq->cb = __tapdisk_stream_request_cb; + + s->count -= secs; + s->sec_in += secs; + + err = tapdisk_vbd_queue_request(s->vbd, vreq); + if (err) + tapdisk_stream_complete_request(s, req, err, 1); + + list_add_tail(&req->entry, &s->pending_list); +} + +static void +tapdisk_stream_queue_requests(td_stream_t *s) +{ + + while (s->count && !s->err) { + td_stream_req_t *req; + + req = tapdisk_stream_alloc_req(s); + if (!req) + break; + + tapdisk_stream_queue_request(s, req); + } +} + +static int +tapdisk_stream_open_image(struct tapdisk_stream *s, const char *name) +{ + int err; + + s->id = tapdisk_stream_count++; + + err = tapdisk_server_initialize(NULL, NULL); + if (err) + goto out; + + err = tapdisk_vbd_initialize(-1, -1, s->id); + if (err) + goto out; + + s->vbd = 
tapdisk_server_get_vbd(s->id); + if (!s->vbd) { + err = ENODEV; + goto out; + } + + err = tapdisk_vbd_open_vdi(s->vbd, name, TD_OPEN_RDONLY, -1); + if (err) + goto out; + + err = 0; + +out: + if (err) + fprintf(stderr, "failed to open %s: %d\n", name, err); + return err; +} + +static void +tapdisk_stream_close_image(td_stream_t *s) +{ + td_vbd_t *vbd; + + vbd = tapdisk_server_get_vbd(s->id); + if (vbd) { + tapdisk_vbd_close_vdi(vbd); + tapdisk_server_remove_vbd(vbd); + free(vbd->name); + free(vbd); + s->vbd = NULL; + } +} + +static int +tapdisk_stream_set_position(td_stream_t *s, + uint64_t count, uint64_t skip) +{ + int err; + td_disk_info_t info; + + err = tapdisk_vbd_get_disk_info(s->vbd, &info); + if (err) { + fprintf(stderr, "failed getting image size: %d\n", err); + return err; + } + + if (count == -1LL) + count = info.size - skip; + + if (count + skip > info.size) { + fprintf(stderr, "0x%"PRIx64" past end of image 0x%"PRIx64"\n", + count + skip, info.size); + return -EINVAL; + } + + s->sec_in = skip; + s->sec_out = skip; + s->count = count; + + return 0; +} + +void +__tapdisk_stream_event_cb(event_id_t id, char mode, void *arg) +{ +} + +static int +tapdisk_stream_open_fds(struct tapdisk_stream *s) +{ + s->out_fd = dup(STDOUT_FILENO); + if (s->out_fd == -1) { + fprintf(stderr, "failed to open output: %d\n", errno); + return errno; + } + + return 0; +} + +static void +tapdisk_stream_close(struct tapdisk_stream *s) +{ + tapdisk_stream_destroy_reqs(s); + + tapdisk_stream_close_image(s); + + if (s->out_fd >= 0) { + close(s->out_fd); + s->out_fd = -1; + } +} + +static int +tapdisk_stream_open(struct tapdisk_stream *s, const char *name, + uint64_t count, uint64_t skip) +{ + int err = 0; + + memset(s, 0, sizeof(*s)); + s->in_fd = s->out_fd = -1; + INIT_LIST_HEAD(&s->pending_list); + INIT_LIST_HEAD(&s->completed_list); + + if (!err) + err = tapdisk_stream_open_fds(s); + if (!err) + err = tapdisk_stream_open_image(s, name); + if (!err) + err = 
tapdisk_stream_set_position(s, count, skip); + if (!err) + err = tapdisk_stream_create_reqs(s); + + if (err) + tapdisk_stream_close(s); + + return err; +} + +static int +tapdisk_stream_run(struct tapdisk_stream *s) +{ + tapdisk_stream_queue_requests(s); + tapdisk_server_run(); + return s->err; +} + +int +main(int argc, char *argv[]) +{ + int c, err; + const char *params; + uint64_t count, skip; + struct tapdisk_stream stream; + + err = 0; + skip = 0; + count = (uint64_t)-1; + params = NULL; + + while ((c = getopt(argc, argv, "n:c:s:h")) != -1) { + switch (c) { + case ''n'': + params = optarg; + break; + case ''c'': + count = strtoull(optarg, NULL, 10); + break; + case ''s'': + skip = strtoull(optarg, NULL, 10); + break; + default: + err = EINVAL; + case ''h'': + usage(argv[0], err); + } + } + + if (!params) + usage(argv[0], EINVAL); + + tapdisk_start_logging("tapdisk-stream", "daemon"); + + err = tapdisk_stream_open(&stream, params, count, skip); + if (err) + goto out; + + err = tapdisk_stream_run(&stream); + if (err) + goto out; + + err = 0; + +out: + tapdisk_stream_close(&stream); + tapdisk_stop_logging(); + return err; +} diff --git a/tools/blktap3/drivers/tapdisk-syslog.c b/tools/blktap3/drivers/tapdisk-syslog.c --- a/tools/blktap3/drivers/tapdisk-syslog.c +++ b/tools/blktap3/drivers/tapdisk-syslog.c @@ -231,7 +231,7 @@ static void tapdisk_syslog_ring_warning( n = log->oom; log->oom = 0; - err = tapdisk_syslog(log, LOG_WARNING, + err = tapdisk_syslog(log, TLOG_WARN, "tapdisk-syslog: %d messages dropped", n); if (err) log->oom = n; diff --git a/tools/blktap3/drivers/tapdisk-utils.c b/tools/blktap3/drivers/tapdisk-utils.c --- a/tools/blktap3/drivers/tapdisk-utils.c +++ b/tools/blktap3/drivers/tapdisk-utils.c @@ -37,6 +37,8 @@ #include <sys/ioctl.h> #include <sys/resource.h> #include <sys/utsname.h> +#include <arpa/inet.h> + #ifdef __linux__ #include <linux/version.h> #endif @@ -181,6 +183,7 @@ tapdisk_namedup(char **dup, const char * return 0; } +/* FIXME Is this 
still used? */ /*Get Image size, secsize*/ int tapdisk_get_image_size(int fd, uint64_t *_sectors, uint32_t *_sector_size) @@ -264,3 +267,19 @@ int tapdisk_linux_version(void) } #endif + +#ifdef WORDS_BIGENDIAN +uint64_t ntohll(uint64_t a) { + return a; +} +#else +uint64_t ntohll(uint64_t a) { + uint32_t lo = a & 0xffffffff; + uint32_t hi = a >> 32U; + lo = ntohl(lo); + hi = ntohl(hi); + return ((uint64_t) lo) << 32U | hi; +} +#endif +#define htonll ntohll + diff --git a/tools/blktap3/drivers/tapdisk-utils.h b/tools/blktap3/drivers/tapdisk-utils.h --- a/tools/blktap3/drivers/tapdisk-utils.h +++ b/tools/blktap3/drivers/tapdisk-utils.h @@ -45,5 +45,7 @@ int tapdisk_namedup(char **, const char int tapdisk_parse_disk_type(const char *, char **, int *); int tapdisk_get_image_size(int, uint64_t *, uint32_t *); int tapdisk_linux_version(void); +uint64_t ntohll(uint64_t); +#define htonll ntohll #endif diff --git a/tools/blktap3/drivers/tapdisk-vbd.c b/tools/blktap3/drivers/tapdisk-vbd.c --- a/tools/blktap3/drivers/tapdisk-vbd.c +++ b/tools/blktap3/drivers/tapdisk-vbd.c @@ -48,10 +48,14 @@ #include "tapdisk-stats.h" #include "sring/td-stats.h" #include "tapdisk-storage.h" +#include "tapdisk-nbdserver.h" #define DBG(_level, _f, _a...) tlog_write(_level, _f, ##_a) #define ERR(_err, _f, _a...) tlog_error(_err, _f, ##_a) +#define INFO(_f, _a...) tlog_syslog(TLOG_INFO, "vbd: " _f, ##_a) +#define ERROR(_f, _a...) 
tlog_syslog(TLOG_WARN, "vbd: " _f, ##_a) + #if 1 #define ASSERT(p) \ do { \ @@ -65,7 +69,6 @@ #define ASSERT(p) ((void)0) #endif - #define TD_VBD_EIO_RETRIES 10 #define TD_VBD_EIO_SLEEP 1 #define TD_VBD_WATCHDOG_TIMEOUT 10 @@ -261,6 +264,14 @@ static int tapdisk_vbd_add_secondary(td_ const char *path; int type, err; + if (strcmp(vbd->secondary_name, "null") == 0) { + DPRINTF("Removing secondary image\n"); + vbd->secondary_mode = TD_VBD_SECONDARY_DISABLED; + vbd->secondary = NULL; + vbd->nbd_mirror_failed = 0; + return 0; + } + DPRINTF("Adding secondary image: %s\n", vbd->secondary_name); type = tapdisk_disktype_parse_params(vbd->secondary_name, &path); @@ -274,8 +285,15 @@ static int tapdisk_vbd_add_secondary(td_ } err = tapdisk_image_open(type, path, leaf->flags, &second); - if (err) + if (err) { + if (type == DISK_TYPE_NBD) + vbd->nbd_mirror_failed = 1; + + vbd->secondary=NULL; + vbd->secondary_mode=TD_VBD_SECONDARY_DISABLED; + goto fail; + } if (second->info.size != leaf->info.size) { EPRINTF("Secondary image size %" PRIu64 " != image size %" PRIu64 @@ -465,8 +483,12 @@ tapdisk_vbd_open_vdi(td_vbd_t * vbd, con if (td_flag_test(vbd->flags, TD_OPEN_SECONDARY)) { err = tapdisk_vbd_add_secondary(vbd); - if (err) + if (err) { + if (vbd->nbd_mirror_failed != 1) goto fail; + INFO("Ignoring failed NBD secondary attach\n"); + err = 0; + } } if (tmp != vbd->name) @@ -631,6 +653,12 @@ int tapdisk_vbd_retry_needed(td_vbd_t * TAILQ_EMPTY(&vbd->new_requests)); } +int +tapdisk_vbd_lock(td_vbd_t *vbd __attribute__((unused))) +{ + return 0; +} + int tapdisk_vbd_quiesce_queue(td_vbd_t * vbd) { if (!TAILQ_EMPTY(&vbd->pending_requests)) { @@ -686,17 +714,20 @@ int tapdisk_vbd_pause(td_vbd_t * vbd) { int err; - DBG(TLOG_DBG, "pause requested\n"); + INFO("pause requested\n"); td_flag_set(vbd->state, TD_VBD_PAUSE_REQUESTED); + if (vbd->nbdserver) + tapdisk_nbdserver_pause(vbd->nbdserver); + err = tapdisk_vbd_quiesce_queue(vbd); if (err) return err; tapdisk_vbd_close_vdi(vbd); - 
DBG(TLOG_DBG, "pause completed\n"); + INFO("pause completed\n"); td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED); td_flag_set(vbd->state, TD_VBD_PAUSED); @@ -716,8 +747,8 @@ int tapdisk_vbd_resume(td_vbd_t * vbd, c } for (i = 0; i < TD_VBD_EIO_RETRIES; i++) { - err = - tapdisk_vbd_open_vdi(vbd, name, vbd->flags | TD_OPEN_STRICT, NULL); + err = tapdisk_vbd_open_vdi(vbd, name, vbd->flags | TD_OPEN_STRICT, + NULL); if (!err) break; @@ -734,6 +765,9 @@ int tapdisk_vbd_resume(td_vbd_t * vbd, c td_flag_clear(vbd->state, TD_VBD_PAUSE_REQUESTED); tapdisk_vbd_check_state(vbd); + if (vbd->nbdserver) + tapdisk_nbdserver_unpause(vbd->nbdserver); + DBG(TLOG_DBG, "state checked\n"); return 0; @@ -902,10 +936,10 @@ static void if (err != -EBUSY) { if (!vreq->error && err != vreq->prev_error) tlog_drv_error(image->driver, err, - "req %s: %s 0x%04x secs @ 0x%08" PRIx64, + "req %s: %s 0x%04x secs @ 0x%08"PRIx64" - %s", vreq->name, (treq.op == TD_OP_WRITE ? "write" : "read"), - treq.secs, treq.sec); + treq.secs, treq.sec, strerror(abs(err))); vbd->errors++; } vreq->error = (vreq->error ? 
: err); @@ -1020,6 +1054,26 @@ void tapdisk_vbd_complete_td_request(td_ } } + if (res != 0) + DPRINTF("Res=%d, image->type=%d\n", res, image->type); + + if (res != 0 && image->type == DISK_TYPE_NBD && + ((image == vbd->secondary) || + (image == vbd->retired))) { + ERROR("Got non-zero res for NBD secondary - disabling " + "mirroring: %s",vreq->name); + vbd->nbd_mirror_failed = 1; + res = 0; /* Pretend the writes have completed successfully */ + + /* It was the secondary that timed out - disable secondary */ + TAILQ_REMOVE(&vbd->images, image, entry); + vbd->retired = image; + if (vbd->secondary_mode != TD_VBD_SECONDARY_DISABLED) { + vbd->secondary = NULL; + vbd->secondary_mode = TD_VBD_SECONDARY_DISABLED; + } + } + DBG(TLOG_DBG, "%s: req %s seg %d sec 0x%08" PRIx64 "secs 0x%04x buf %p op %d res %d\n", image->name, vreq->name, treq.sidx, treq.sec, treq.secs, @@ -1297,6 +1351,27 @@ void tapdisk_vbd_kick(td_vbd_t * vbd) } } +int +tapdisk_vbd_start_nbdserver(td_vbd_t *vbd) +{ + td_disk_info_t info; + int err; + + err = tapdisk_vbd_get_disk_info(vbd, &info); + + if (err) + return err; + + vbd->nbdserver = tapdisk_nbdserver_alloc(vbd, info); + + if (!vbd->nbdserver) { + EPRINTF("Error starting nbd server"); + return -1; + } + + return 0; +} + void tapdisk_vbd_stats(td_vbd_t * vbd, td_stats_t * st) { td_image_t *image, *next; @@ -1324,5 +1399,9 @@ void tapdisk_vbd_stats(td_vbd_t * vbd, t "FIXME_enospc_redirect_count", "llu", vbd->FIXME_enospc_redirect_count); + tapdisk_stats_field(st, + "nbd_mirror_failed", + "d", vbd->nbd_mirror_failed); + tapdisk_stats_leave(st, ''}''); } diff --git a/tools/blktap3/drivers/tapdisk-vbd.h b/tools/blktap3/drivers/tapdisk-vbd.h --- a/tools/blktap3/drivers/tapdisk-vbd.h +++ b/tools/blktap3/drivers/tapdisk-vbd.h @@ -55,6 +55,8 @@ TAILQ_HEAD(tqh_td_vbd_handle, td_vbd_handle); +struct td_nbdserver; + struct td_vbd_handle { /** * type:/path/to/file @@ -80,13 +82,16 @@ struct td_vbd_handle { int FIXME_enospc_redirect_count_enabled; uint64_t 
FIXME_enospc_redirect_count; - /* when we encounter ENOSPC on the primary leaf image in mirror mode, + /** + * when we encounter ENOSPC on the primary leaf image in mirror mode, * we need to remove it from the VBD chain so that writes start going * on the secondary leaf. However, we cannot free the image at that * time since it might still have in-flight treqs referencing it. * Therefore, we move it into ''retired'' until shutdown. */ td_image_t *retired; + int nbd_mirror_failed; + struct tqh_td_vbd_request new_requests; struct tqh_td_vbd_request pending_requests; struct tqh_td_vbd_request failed_requests; @@ -105,6 +110,8 @@ struct td_vbd_handle { uint64_t retries; uint64_t errors; td_sector_count_t secs; + + struct td_nbdserver *nbdserver; }; #define tapdisk_vbd_for_each_request(vreq, tmp, list) \ @@ -217,6 +224,7 @@ void tapdisk_vbd_check_state(td_vbd_t *) int tapdisk_vbd_recheck_state(td_vbd_t *); void tapdisk_vbd_check_progress(td_vbd_t *); void tapdisk_vbd_debug(td_vbd_t *); +int tapdisk_vbd_start_nbdserver(td_vbd_t *); void tapdisk_vbd_stats(td_vbd_t *, td_stats_t *); #endif diff --git a/tools/blktap3/drivers/tapdisk.c b/tools/blktap3/drivers/tapdisk.c --- a/tools/blktap3/drivers/tapdisk.c +++ b/tools/blktap3/drivers/tapdisk.c @@ -37,6 +37,9 @@ #include "tapdisk-server.h" #include "tapdisk-control.h" +void tdnbd_fdreceiver_start(void); +void tdnbd_fdreceiver_stop(void); + static void usage(const char *app, int err) { fprintf(stderr, "usage: %s <-u uuid> <-c control socket>\n", app); @@ -131,9 +134,17 @@ int main(int argc, char *argv[]) fprintf(out, "%s\n", control); fclose(out); + /* + * NB: We''re unconditionally starting the FD receiver here - this is + * for the block-nbd driver. 
In the future we may want to start this as + * a response to a tap-ctl message + */ + tdnbd_fdreceiver_start(); + err = tapdisk_server_run(); out: + tdnbd_fdreceiver_stop(); tapdisk_control_close(); tapdisk_stop_logging(); return -err; diff --git a/tools/blktap3/drivers/tapdisk.h b/tools/blktap3/drivers/tapdisk.h --- a/tools/blktap3/drivers/tapdisk.h +++ b/tools/blktap3/drivers/tapdisk.h @@ -60,11 +60,8 @@ #include <stdint.h> #include <assert.h> -// XXX? -//#include "blktaplib.h" #include "blktap3.h" -// TODO necessary? #include "tapdisk-log.h" #include "tapdisk-utils.h" #include "tapdisk-stats.h" @@ -208,6 +205,8 @@ struct td_request { td_vbd_request_t *vreq; }; +struct tqh_td_image_handle; + /* * Structure describing the interface to a virtual disk implementation. * See note at the top of this file describing this interface. @@ -217,7 +216,7 @@ struct tap_disk { td_flag_t flags; int private_data_size; int (*td_open) (td_driver_t *, const char *, td_flag_t); - int (*td_close) (td_driver_t *); + int (*td_close) (td_driver_t *, struct tqh_td_image_handle *); int (*td_get_parent_id) (td_driver_t *, td_disk_id_t *); int (*td_validate_parent) (td_driver_t *, td_driver_t *, td_flag_t); void (*td_queue_read) (td_driver_t *, td_request_t); diff --git a/tools/blktap3/drivers/td-rated.1.txt b/tools/blktap3/drivers/td-rated.1.txt new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/td-rated.1.txt @@ -0,0 +1,190 @@ + +SYNOPSIS + + td-rated <name> -type {token|leaky|meminfo} -- [options] + +DESCRIPTION + + The td-rated ''bridge'' is a daemon program to which one or a number + of tapdisk processes connect, in order to cooperatively limit the + data rate at which they will issue I/O requests to physical + storage. + + A data rate denotes I/O bandwidth, i.e. an (average) amount of + data over time. A rate limiter is a state machine dispatching an + overall queue of incoming I/O requests, at a desired data rate. 
+ + The td-rated program includes a number of alternative rate + limiting algorithms for various purposes. Rate limiters are + discussed below. + + The standard client implementation in tapdisk is a transparent + filter driver, of type name 'valve'. Valves are typically inserted + at either the top or a certain level of the disk image stack + constituting a VDI, thereby uniformly limiting any I/O issued. + + Every bridge process constitutes a single rate limiter. Arbitrary + numbers of client valves can connect to each bridge. I/O requests + issued by clients are normally aggregated, dividing the available + bandwidth among all active clients. + +OPTIONS + + Token Bucket + + Token bucket is a rate limiter which drains a request queue of + pending I/O requests at a given overall data rate. It is + invoked as follows: + + td-rated -t token -- .. + + --rate <limit> + Bandwidth limit [B/s]. + + --cap <limit> + Burst (aggregated credit) limit [B]. + + Token bucket's main feature over basic constant-rate + algorithms (leaky buckets) is that it allows for I/O + bursts. Bursts are batches of data requests, which are + preferably issued simultaneously to reduce the overall number + of seeks involved on shared rotational media. + + With bursty I/O transfers, bandwidth may transiently exceed + the nominal data rate, but in a controlled fashion. Different + from a constant rate output, the I/O output rate is maintained + as an average over periods of time. + + Internally, bursts issued at any time instant consume + bandwidth credit ('tokens'). Credit gets accumulated, at the + given rate, over time. Once exhausted, credit taken must be + amortized before additional I/O can pass. That is, while the + rate set will limit an output data rate, it does so only + indirectly, by limiting the rate at which new credit is + assigned. + + The cap argument is a limit to accumulated credit. Excess + credit above the given capacity will be discarded. 
Caps limit + the maximum burst size observable. The maximum only becomes + available whenever all clients remained idle for a time + period of cap/rate. + + A token bucket allows for bursts; it does not promote or + enforce them at all. Once configured bandwidth credit is exceeded, + amortization time is applied to client request batches + individually, in the order in which they were issued, and + output will effectively degrade to a constant data rate. + + Leaky Bucket + + Leaky bucket is a simpler constant rate algorithm. Requests + are issued in a round-robin fashion. The given rate is never + exceeded, so requests are never issued in bursts. + + This is presently equivalent to a token bucket with a cap + value of zero, and therefore implemented accordingly. + + td-rated -t leaky -- .. + + --rate <limit> + Bandwidth limit [B/s]. + + Meminfo Driver + + Meminfo is an experimental rate limiting driver aiming + specifically at write bandwidth reduction for tapdisk I/O + modes targeting the host OS buffer cache. It is invoked as + follows: + + td-rated -t meminfo -- .. + + --high <limit> + [% of total memory] + + --low <limit> + [% of total memory] + + [--period <time>] + Memory stats update period [ms] + Default: 100 + + -t <type> ... + Subordinate rate limiter type. + + -- [ subordinate options .. ] + + Where the subordinate type and options typically invoke one + of the basic rate-oriented algorithms described above. + + Memory limits are not bandwidth limits, but cache utilization + bounds aimed to be met. The arguments to --high and --low + options are watermarks setting hysteresis limits on domain OS + cache utilization detected. They are defined in percent of + total memory available to the domain OS. + + The driver periodically scans OS memory statistics to estimate + present host buffer I/O loads. By default a state update is + performed every 100ms. 
+ + The cache is considered underutilized while the amount of + memory either modified, or under writeback, does not exceed + the percentage indicated by --high. In that state, I/O will + pass unrestricted. + + Once the --high limit is exceeded, a congestion mode of + operation is entered, where the output data rate is + reduced. That state prevails until the cache is detected + underutilized again, at a value below or equal to the --low + watermark. + + Meminfo rate limiting is driven by overall domain state, + commonly involving applications not sharing the same domain of + bandwidth arbitration. I/O can therefore only be throttled, + not blocked, or would risk starvation. For that purpose, the + meminfo driver requires a (configurable) subordinate rate + limiter. This may be any of the raw bandwidth-oriented + implementations available. + + Limit Formats + + I/O size and limit values specified at td-rated invocation + time are integers in units of bytes, or integers as multiples + of units given in either SI decimal (K,M,G) or IEC binary + (Ki,Mi,Gi) suffix notation, e.g. 10k (10 * 10^3 B), 128Mi (128 + * 2^20 B), 1Gi (1 * 2^30 B). + +EXAMPLES + + Invocations + + td-rated /var/run/blktap/x.sk -t leaky -- \ + --rate=60M + + Constant-rate output rate limit at 60M/s. Listening for + client connections at /var/run/blktap/x.sk. + + td-rated /var/run/blktap/y.sk -t token -- \ + --rate=80M --cap 10M + + Token bucket rate limiting at 80M/s with a burst limit of 10M. + + td-rated /var/run/blktap/y.sk -t meminfo -- \ + --low=40 --high=60 -t leaky -- --rate=15M + + Buffer I/O rate limiting with a high/low cache utilization + watermark of 60%/40% of host memory. Once the upper limit is + met, constant rate output targeting a limit of 15M/s is + applied. + + Image Chain + + tap-ctl create x-chain:/var/tmp/limit.chain + + /var/tmp/limit.chain: + valve:/var/run/blktap/x.sk + vhd:/dev/vg/image.vhd + +BUGS + + The -t leaky type isn't properly aliased yet. 
+ Use the form -t token -- --cap=0 instead. diff --git a/tools/blktap3/drivers/td-rated.c b/tools/blktap3/drivers/td-rated.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/drivers/td-rated.c @@ -0,0 +1,1722 @@ +/* + * Copyright (c) 2011, Citrix Systems. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdlib.h> +#include <stdio.h> +#include <fcntl.h> +#include <unistd.h> +#include <errno.h> +#include <stdarg.h> +#include <signal.h> +#include <getopt.h> +#include <syslog.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/time.h> + +#include "block-valve.h" +#include "compiler.h" +#include "list.h" + +static void +rlb_vlog_vfprintf(int prio, const char *fmt, va_list ap) +{ + vfprintf(stderr, fmt, ap); fputc(''\n'', stderr); +} + +static void (*rlb_vlog)(int prio, const char *fmt, va_list ap); + +__printf(2, 3) +static void +rlb_log(int prio, const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); rlb_vlog(prio, fmt, ap); va_end(ap); +} + +static int debug = 0; + +#define DBG(_l, _f, _a...) if (debug >= _l) { rlb_log(LOG_DEBUG, _f, ##_a); } +#define INFO(_f, _a...) rlb_log(LOG_INFO, _f, ##_a) +#define WARN(_f, _a...) rlb_log(LOG_WARNING, "WARNING: " _f ", in %s:%d", \ + ##_a, __func__, __LINE__) +#define ERR(_f, _a...) rlb_log(LOG_ERR, "ERROR: " _f ", in %s:%d", \ + ##_a, __func__, __LINE__) +#define PERROR(_f, _a...) rlb_log(LOG_ERR, _f ": %s in %s:%d", \ + ##_a, strerror(errno), __func__, __LINE__) + +#define BUG() do { \ + ERR("Aborting"); \ + abort(); \ + } while (0) + +#define BUG_ON(_cond) \ + if (unlikely(_cond)) { \ + ERR("(%s) = %d", #_cond, _cond); \ + BUG(); \ + } + +#define WARN_ON(_cond) ({ \ + int __cond = _cond; \ + if (unlikely(__cond)) \ + WARN("(%s) = %d", #_cond, _cond); \ + __cond; \ +}) + +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + +#define ARRAY_SIZE(_a) (sizeof(_a)/sizeof((_a)[0])) + +typedef struct ratelimit_bridge td_rlb_t; +typedef struct ratelimit_connection td_rlb_conn_t; + +struct ratelimit_connection { + int sock; + + unsigned long need; /* I/O requested */ + unsigned long gntd; /* I/O granted, pending */ + + struct list_head open; /* connected */ + struct list_head wait; /* need > 0 */ + + struct { + struct timeval since; + struct timeval total; + } wstat; +}; + +#define RLB_CONN_MAX 1024 + +struct ratelimit_ops { + void (*usage)(td_rlb_t *rlb, FILE *stream, void *data); + + int (*create)(td_rlb_t *rlb, int argc, char **argv, void **data); + void (*destroy)(td_rlb_t *rlb, void *data); + + void (*info)(td_rlb_t *rlb, void *data); + + void (*settimeo)(td_rlb_t *rlb, struct timeval **tv, void *data); + void (*timeout)(td_rlb_t *rlb, void *data); + void (*dispatch)(td_rlb_t *rlb, void *data); + void (*reset)(td_rlb_t *rlb, void *data); +}; + +struct ratelimit_bridge { + char *name; + char *ident; + + struct sockaddr_un addr; + char *path; + int sock; + + struct list_head open; /* all connections */ + struct list_head wait; /* all in need */ + + struct timeval ts, now; + + td_rlb_conn_t connv[RLB_CONN_MAX]; + td_rlb_conn_t *free[RLB_CONN_MAX]; + int n_free; + + struct rlb_valve { + struct ratelimit_ops *ops; + void *data; + } valve; +}; + +#define rlb_for_each_conn(_conn, _rlb) \ + list_for_each_entry(_conn, &(_rlb)->open, open) + +#define rlb_for_each_conn_safe(_conn, _next, _rlb) \ + list_for_each_entry_safe(_conn, _next, &(_rlb)->open, open) + +#define rlb_for_each_waiting(_conn, _next, _rlb) \ + list_for_each_entry(_conn, _next, &(_rlb)->wait, wait) + +#define rlb_for_each_waiting_safe(_conn, _next, _rlb) \ + list_for_each_entry_safe(_conn, _next, &(_rlb)->wait, wait) + +#define rlb_conn_entry(_list) \ + list_entry(_list, td_rlb_conn_t, open) + +#define rlb_wait_entry(_list) \ + list_entry(_list, td_rlb_conn_t, wait) + +static struct ratelimit_ops *rlb_find_valve(const 
char *name); + +static int rlb_create_valve(td_rlb_t *, struct rlb_valve *, + const char *name, int argc, char **argv); + +/* + * util + */ + +#define case_G case ''G'': case ''g'' +#define case_M case ''M'': case ''m'' +#define case_K case ''K'': case ''k'' + +static long +rlb_strtol(const char *s) +{ + unsigned long l, u = 1; + char *end, p, q; + + l = strtoul(s, &end, 0); + if (!*end) + return l; + + p = *end++; + + switch (p) { + case_G: case_M: case_K: + + q = *end++; + + switch (q) { + case ''i'': + switch (p) { + case_G: + u *= 1024; + case_M: + u *= 1024; + case_K: + u *= 1024; + } + break; + + case 0: + switch (p) { + case_G: + u *= 1000; + case_M: + u *= 1000; + case_K: + u *= 1000; + } + break; + + default: + goto fail; + } + break; + + case 0: + break; + + default: + goto fail; + } + + return l * u; + +fail: + return -EINVAL; +} + +static char* +vmprintf(const char *fmt, va_list ap) +{ + char *s; + int n; + + n = vasprintf(&s, fmt, ap); + if (n < 0) + s = NULL; + + return s; +} + +__printf(1, 2) +static char* +mprintf(const char *fmt, ...) +{ + va_list ap; + char *s; + + va_start(ap, fmt); + s = vmprintf(fmt, ap); + va_end(ap); + + return s; +} + +static int +sysctl_vscanf(const char *name, const char *fmt, va_list ap) +{ + char *path = NULL; + FILE *s = NULL; + int rv; + + path = mprintf("/proc/sys/%s", name); + if (!path) { + rv = -errno; + goto fail; + } + + s = fopen(path, "r"); + if (!s) { + rv = -errno; + goto fail; + } + + rv = vfscanf(s, fmt, ap); +fail: + if (s) + fclose(s); + + if (path) + free(path); + + return rv; +} + +static int +sysctl_scanf(const char *name, const char *fmt, ...) 
+{ + va_list(ap); + int rv; + + va_start(ap, fmt); + rv = sysctl_vscanf(name, fmt, ap); + va_end(ap); + + return rv; +} + +static long +sysctl_strtoul(const char *name) +{ + unsigned val; + int n; + + n = sysctl_scanf(name, "%lu", &val); + if (n < 0) + return n; + if (n != 1) + return -EINVAL; + + return val; +} + + +static long long +rlb_tv_usec(const struct timeval *tv) +{ + long long us; + + us = tv->tv_sec; + us *= 1000000; + us += tv->tv_usec; + + return us; +} + +static long long +rlb_usec_since(td_rlb_t *rlb, const struct timeval *since) +{ + struct timeval delta; + + timersub(&rlb->now, since, &delta); + + return rlb_tv_usec(&delta); +} + +static inline void +rlb_argv_shift(int *optind, int *argc, char ***argv) +{ + /* reset optind and args after ''--'' */ + + *optind -= 1; + + *argc -= *optind; + *argv += *optind; + + *optind = 1; +} + +/* + * socket I/O + */ + +static void +rlb_sock_close(td_rlb_t *rlb) +{ + if (rlb->path) { + unlink(rlb->path); + rlb->path = NULL; + } + + if (rlb->sock >= 0) { + close(rlb->sock); + rlb->sock = -1; + } +} + +static int +rlb_sock_open(td_rlb_t *rlb) +{ + int s, err; + + rlb->sock = -1; + + s = socket(AF_UNIX, SOCK_STREAM, 0); + if (s < 0) { + PERROR("socket"); + err = -errno; + goto fail; + } + + rlb->sock = s; + + rlb->addr.sun_family = AF_UNIX; + + if (rlb->name[0] == ''/'') + strncpy(rlb->addr.sun_path, rlb->name, + sizeof(rlb->addr.sun_path)); + else + snprintf(rlb->addr.sun_path, sizeof(rlb->addr.sun_path), + "%s/%s", TD_VALVE_SOCKDIR, rlb->name); + + err = bind(rlb->sock, &rlb->addr, sizeof(rlb->addr)); + if (err) { + PERROR("%s", rlb->addr.sun_path); + err = -errno; + goto fail; + } + + rlb->path = rlb->addr.sun_path; + + err = listen(rlb->sock, RLB_CONN_MAX); + if (err) { + PERROR("listen(%s)", rlb->addr.sun_path); + err = -errno; + goto fail; + } + + return 0; + +fail: + rlb_sock_close(rlb); + return err; +} + +static int +rlb_sock_send(td_rlb_t *rlb, td_rlb_conn_t *conn, + const void *msg, size_t size) +{ + 
ssize_t n; + + n = send(conn->sock, msg, size, MSG_DONTWAIT); + if (n < 0) + return -errno; + if (n && n != size) + return -EPROTO; + + return 0; +} + +static int +rlb_sock_recv(td_rlb_t *rlb, td_rlb_conn_t *conn, + void *msg, size_t size) +{ + ssize_t n; + + n = recv(conn->sock, msg, size, MSG_DONTWAIT); + if (n < 0) + return -errno; + + return n; +} + +static td_rlb_conn_t * +rlb_conn_alloc(td_rlb_t *rlb) +{ + td_rlb_conn_t *conn = NULL; + + if (likely(rlb->n_free > 0)) + conn = rlb->free[--rlb->n_free]; + + return conn; +} + +static void +rlb_conn_free(td_rlb_t *rlb, td_rlb_conn_t *conn) +{ + BUG_ON(rlb->n_free >= RLB_CONN_MAX); + + rlb->free[rlb->n_free++] = conn; +} + +static int +rlb_conn_id(td_rlb_t *rlb, td_rlb_conn_t *conn) +{ + return conn - rlb->connv; +} + +static void +rlb_conn_info(td_rlb_t *rlb, td_rlb_conn_t *conn) +{ + long long wtime; + int waits; + + wtime = 0; + waits = !list_empty(&conn->wait); + if (waits) + wtime = rlb_usec_since(rlb, &conn->wstat.since) / 1000; + + WARN_ON(!!conn->need != waits); + + INFO("conn[%d] needs %lu (since %llu ms, total %lu.%06lu s)," + " %lu granted", + rlb_conn_id(rlb, conn), conn->need, wtime, + conn->wstat.total.tv_sec, conn->wstat.total.tv_usec, + conn->gntd); +} + +static void +rlb_conn_infos(td_rlb_t *rlb) +{ + td_rlb_conn_t *conn; + + rlb_for_each_conn(conn, rlb) + rlb_conn_info(rlb, conn); +} + +static void +rlb_conn_close(td_rlb_t *rlb, td_rlb_conn_t *conn) +{ + int s = conn->sock; + + INFO("Connection %d closed.", rlb_conn_id(rlb, conn)); + rlb_conn_info(rlb, conn); + + if (s) { + close(s); + conn->sock = -1; + } + + list_del_init(&conn->wait); + list_del(&conn->open); + + rlb_conn_free(rlb, conn); +} + +static void +rlb_conn_receive(td_rlb_t *rlb, td_rlb_conn_t *conn) +{ + struct td_valve_req buf[32], req = { -1, -1 }; + ssize_t n; + int i, err; + + n = rlb_sock_recv(rlb, conn, buf, sizeof(buf)); + if (!n) + goto close; + + if (n < 0) { + err = n; + if (err != -EAGAIN) + goto fail; + } + + if 
(unlikely(n % sizeof(req))) { + err = -EPROTO; + goto fail; + } + + for (i = 0; i < n / sizeof(buf[0]); i++) { + req = buf[i]; + + if (unlikely(req.need > TD_RLB_REQUEST_MAX)) { + err = -EINVAL; + goto fail; + } + + if (unlikely(req.done > conn->gntd)) { + err = -EINVAL; + goto fail; + } + + conn->need += req.need; + conn->gntd -= req.done; + + DBG(8, "rcv: %lu/%lu need=%lu gntd=%lu", + req.need, req.done, conn->need, conn->gntd); + + if (unlikely(conn->need > TD_RLB_REQUEST_MAX)) { + err = -EINVAL; + goto fail; + } + } + + if (conn->need && list_empty(&conn->wait)) { + list_add_tail(&conn->wait, &rlb->wait); + conn->wstat.since = rlb->now; + } + + return; + +fail: + WARN("err = %d (%s)" + " (need %ld/%ld, %ld/%ld done)," + " closing connection.", + err, strerror(-err), + req.need, conn->need, req.done, conn->gntd); + + rlb_conn_info(rlb, conn); +close: + rlb_conn_close(rlb, conn); +} + +static void +rlb_conn_respond(td_rlb_t *rlb, td_rlb_conn_t *conn, unsigned long need) +{ + int err; + + BUG_ON(need > conn->need); + + err = rlb_sock_send(rlb, conn, &need, sizeof(need)); + if (err) + goto fail; + + conn->need -= need; + conn->gntd += need; + + DBG(8, "snd: %lu need=%lu gntd=%lu", need, conn->need, conn->gntd); + + if (!conn->need) { + struct timeval delta; + + timersub(&rlb->now, &conn->wstat.since, &delta); + timeradd(&conn->wstat.total, &delta, &conn->wstat.total); + + list_del_init(&conn->wait); + } + + return; + +fail: + WARN("err = %d, killing connection.", err); + rlb_conn_close(rlb, conn); +} + +static void +rlb_accept_conn(td_rlb_t *rlb) +{ + td_rlb_conn_t *conn; + int s, err; + + s = accept(rlb->sock, NULL, NULL); + if (!s) { + err = -errno; + goto fail; + } + + conn = rlb_conn_alloc(rlb); + if (!conn) { + err = -ENOMEM; + close(s); + goto fail; + } + + INFO("Accepting connection %td.", conn - rlb->connv); + + memset(conn, 0, sizeof(*conn)); + INIT_LIST_HEAD(&conn->wait); + conn->sock = s; + list_add_tail(&conn->open, &rlb->open); + + return; + +fail: + 
WARN("err = %d", err); +} + +static long long +rlb_pending(td_rlb_t *rlb) +{ + td_rlb_conn_t *conn; + long long pend = 0; + + rlb_for_each_conn(conn, rlb) + pend += conn->gntd; + + return pend; +} + +/* + * token bucket valve + */ + +typedef struct ratelimit_token td_rlb_token_t; + +struct ratelimit_token { + long cred; + long cap; + long rate; + struct timeval timeo; +}; + +static void +rlb_token_settimeo(td_rlb_t *rlb, struct timeval **_tv, void *data) +{ + td_rlb_token_t *token = data; + struct timeval *tv = &token->timeo; + long long us; + + if (list_empty(&rlb->wait)) { + *_tv = NULL; + return; + } + + WARN_ON(token->cred >= 0); + + us = -token->cred; + us *= 1000000; + us /= token->rate; + + tv->tv_sec = us / 1000000; + tv->tv_usec = us % 1000000; + + WARN_ON(!timerisset(tv)); + + *_tv = tv; +} + +static void +rlb_token_refill(td_rlb_t *rlb, td_rlb_token_t *token) +{ + struct timeval tv; + long long cred, max_usec; + + /* max time needed to refill up to cap */ + + max_usec = token->cap - token->cred; + max_usec *= 1000000; + max_usec += token->rate - 1; + max_usec /= token->rate; + + /* actual credit gained */ + + timersub(&rlb->now, &rlb->ts, &tv); + + cred = rlb_tv_usec(&tv); + cred = MIN(cred, max_usec); + cred *= token->rate; + cred /= 1000000; + + /* up to cap */ + + token->cred += cred; + token->cred = MIN(token->cred, token->cap); +} + +static void +rlb_token_dispatch(td_rlb_t *rlb, void *data) +{ + td_rlb_token_t *token = data; + td_rlb_conn_t *conn, *next; + + rlb_token_refill(rlb, token); + + rlb_for_each_waiting_safe(conn, next, rlb) { + if (token->cred < 0) + break; + + token->cred -= conn->need; + + rlb_conn_respond(rlb, conn, conn->need); + } +} + +static void +rlb_token_reset(td_rlb_t *rlb, void *data) +{ + td_rlb_token_t *token = data; + + token->cred = token->cap; +} + +static void +rlb_token_destroy(td_rlb_t *rlb, void *data) +{ + td_rlb_token_t *token = data; + + if (token) + free(token); +} + +static int +rlb_token_create(td_rlb_t *rlb, 
int argc, char **argv, void **data) +{ + td_rlb_token_t *token; + int err; + + token = calloc(1, sizeof(*token)); + if (!token) { + err = -ENOMEM; + goto fail; + } + + token->rate = 0; + token->cap = 0; + + do { + const struct option longopts[] = { + { "rate", 1, NULL, ''r'' }, + { "cap", 1, NULL, ''c'' }, + { NULL, 0, NULL, 0 } + }; + int c; + + c = getopt_long(argc, argv, "r:c:", longopts, NULL); + if (c < 0) + break; + + switch (c) { + case ''r'': + token->rate = rlb_strtol(optarg); + if (token->rate < 0) { + ERR("invalid --rate"); + goto usage; + } + break; + + case ''c'': + token->cap = rlb_strtol(optarg); + if (token->cap < 0) { + ERR("invalid --cap"); + goto usage; + } + break; + + case ''?'': + goto usage; + + default: + BUG(); + } + } while (1); + + if (!token->rate) { + ERR("--rate required"); + goto usage; + } + + rlb_token_reset(rlb, token); + + *data = token; + + return 0; + +fail: + if (token) + free(token); + + return err; + +usage: + err = -EINVAL; + goto fail; +} + +static void +rlb_token_usage(td_rlb_t *rlb, FILE *stream, void *data) +{ + fprintf(stream, + " {-t|--type}=token --" + " {-r|--rate}=<rate [KMG]>" + " {-c|--cap}=<size [KMG]>"); +} + +static void +rlb_token_info(td_rlb_t *rlb, void *data) +{ + td_rlb_token_t *token = data; + + INFO("TOKEN: rate: %ld B/s cap: %ld B cred: %ld B", + token->rate, token->cap, token->cred); +} + +static struct ratelimit_ops rlb_token_ops = { + .usage = rlb_token_usage, + .create = rlb_token_create, + .destroy = rlb_token_destroy, + .info = rlb_token_info, + + .settimeo = rlb_token_settimeo, + .timeout = rlb_token_dispatch, + .dispatch = rlb_token_dispatch, + .reset = rlb_token_reset, +}; + +/* + * meminfo valve + */ + +typedef struct ratelimit_meminfo td_rlb_meminfo_t; + +struct ratelimit_meminfo { + unsigned int period; + struct timeval ts; + + FILE *s; + + unsigned long total; + unsigned long dirty; + unsigned long writeback; + + unsigned int limit_hi; + unsigned int limit_lo; + unsigned int congested; + + 
struct rlb_valve valve; + struct timeval timeo; +}; + +static void +rlb_meminfo_info(td_rlb_t *rlb, void *data) +{ + td_rlb_meminfo_t *m = data; + + INFO("MEMINFO: lo/hi: %u/%u%% period: %u ms", + m->limit_lo, m->limit_hi, m->period); + + INFO("MEMINFO: total %lu kB, dirty/writeback %lu/%lu kB", + m->total, m->dirty, m->writeback); + + m->valve.ops->info(rlb, m->valve.data); +} + +static void +rlb_meminfo_close(td_rlb_meminfo_t *m) +{ + if (m->s) { + fclose(m->s); + m->s = NULL; + } +} + +static int +rlb_meminfo_open(td_rlb_meminfo_t *m) +{ + FILE *s; + int err; + + m->s = NULL; + + s = fopen("/proc/meminfo", "r"); + if (!s) { + err = -errno; + goto fail; + } + + m->s = s; + + return 0; + +fail: + rlb_meminfo_close(m); + return err; +} + +static inline int __test_bit(int n, unsigned long *bitmap) +{ + return !!(*bitmap & (1UL<<n)); +} + +static inline void __clear_bit(int n, unsigned long *bitmap) +{ + *bitmap &= ~(1UL<<n); +} + +static struct ratelimit_meminfo_scan { + const char *format; + ptrdiff_t ptrdiff; +} rlb_meminfo_scanfs[] = { + { "MemTotal: %lu kB", + offsetof(struct ratelimit_meminfo, total) }, + { "Dirty: %lu kB", + offsetof(struct ratelimit_meminfo, dirty) }, + { "Writeback: %lu kB", + offsetof(struct ratelimit_meminfo, writeback) }, +}; + +static int +rlb_meminfo_scan(td_rlb_meminfo_t *m) +{ + const int n_keys = ARRAY_SIZE(rlb_meminfo_scanfs); + unsigned long pending; + int err; + + err = rlb_meminfo_open(m); + if (err) + goto fail; + + pending = (1UL << n_keys) - 1; + + do { + char buf[80], *b; + int i; + + b = fgets(buf, sizeof(buf), m->s); + if (!b) + break; + + for (i = 0; i < n_keys; i++) { + struct ratelimit_meminfo_scan *scan; + unsigned long val, *ptr; + int n; + + if (!__test_bit(i, &pending)) + continue; + + scan = &rlb_meminfo_scanfs[i]; + + n = sscanf(buf, scan->format, &val); + if (n != 1) + continue; + + ptr = (void*)m + scan->ptrdiff; + *ptr = val; + + __clear_bit(i, &pending); + } + + } while (pending); + + if (pending) { + err = 
-ESRCH; + goto fail; + } + + err = 0; +fail: + rlb_meminfo_close(m); + return err; +} + +static void +rlb_meminfo_usage(td_rlb_t *rlb, FILE *stream, void *data) +{ + td_rlb_meminfo_t *m = data; + + fprintf(stream, + " {-t|--type}=meminfo " + " {-H|--high}=<percent> {-L|--low}=<percent>" + " {-p|--period}=<msecs> --"); + + if (m && m->valve.ops) { + m->valve.ops->usage(rlb, stream, m->valve.data); + } else + fprintf(stream, " {-t|--type}={...}"); +} + +static void +rlb_meminfo_destroy(td_rlb_t *rlb, void *data) +{ + td_rlb_meminfo_t *m = data; + + if (m) { + if (m->valve.data) { + m->valve.ops->destroy(rlb, m->valve.data); + m->valve.data = NULL; + } + + free(m); + } +} + +static int +rlb_meminfo_create(td_rlb_t *rlb, int argc, char **argv, void **data) +{ + td_rlb_meminfo_t *m; + const char *type; + long dbr; + int err; + + m = calloc(1, sizeof(*m)); + if (!m) { + PERROR("calloc"); + err = -errno; + goto fail; + } + + type = NULL; + m->period = 100; + + do { + const struct option longopts[] = { + { "period", 1, NULL, ''p'' }, + { "type", 1, NULL, ''t'' }, + { "high", 1, NULL, ''H'' }, + { "low", 1, NULL, ''L'' }, + { NULL, 0, NULL, 0 } + }; + int c; + + c = getopt_long(argc, argv, "p:t:H:L:", longopts, NULL); + if (c < 0) + break; + + switch (c) { + case ''p'': + m->period = rlb_strtol(optarg); + if (m->period < 0) + goto usage; + break; + + case ''H'': + m->limit_hi = strtoul(optarg, NULL, 0); + break; + + case ''L'': + m->limit_lo = strtoul(optarg, NULL, 0); + break; + + case ''t'': + type = optarg; + break; + + case ''?'': + goto usage; + + default: + BUG(); + } + } while (1); + + if (!m->limit_hi || !m->limit_lo) { + ERR("--high/--low required"); + goto usage; + } + + if (m->limit_lo >= m->limit_hi) { + ERR("invalid --high/--low ratio"); + goto usage; + } + + if (!type) { + ERR("(sub) --type required"); + goto usage; + } + + dbr = sysctl_strtoul("vm/dirty_background_ratio"); + if (dbr < 0) { + err = dbr; + ERR("vm/dirty_background_ratio: %d", err); + goto fail; 
+ } + + if (0 && m->limit_lo < dbr) { + ERR("--low %u is less than vm.dirty_background_ratio (= %ld)", + m->limit_lo, dbr); + err = -EINVAL; + goto fail; + } + + *data = m; + + rlb_argv_shift(&optind, &argc, &argv); + + err = rlb_create_valve(rlb, &m->valve, type, argc, argv); + if (err) { + if (err == -EINVAL) + goto usage; + goto fail; + } + + err = rlb_meminfo_scan(m); + if (err) { + PERROR("/proc/meminfo"); + goto fail; + } + + return 0; + +fail: + ERR("err = %d", err); + return err; + +usage: + err = -EINVAL; + return err; +}; + +static void +rlb_meminfo_settimeo(td_rlb_t *rlb, struct timeval **_tv, void *data) +{ + td_rlb_meminfo_t *m = data; + int idle; + + idle = list_empty(&rlb->wait); + BUG_ON(!idle && !m->congested); + + if (m->congested) { + m->valve.ops->settimeo(rlb, _tv, m->valve.data); + return; + } + + *_tv = NULL; +} + +static void +rlb_meminfo_timeout(td_rlb_t *rlb, void *data) +{ + td_rlb_meminfo_t *m = data; + + WARN_ON(!m->congested); + + if (m->congested) + m->valve.ops->timeout(rlb, m->valve.data); +} + +static int +rlb_meminfo_test_high(td_rlb_t *rlb, td_rlb_meminfo_t *m, long long cred) +{ + long long lo; + + if (m->congested) { + /* hysteresis */ + + lo = m->total; + lo *= m->limit_lo; + lo /= 100; + + if (cred >= lo) + return 0; + + } else + if (cred <= 0) { + m->valve.ops->reset(rlb, m->valve.data); + return 1; + } + + return m->congested; +} + +static void +rlb_meminfo_dispatch_low(td_rlb_t *rlb, td_rlb_meminfo_t *m, + long long *_cred) +{ + td_rlb_conn_t *conn, *next; + long long cred = *_cred, grant; + + rlb_for_each_waiting_safe(conn, next, rlb) { + + if (cred <= 0) + break; + + grant = MIN(cred, conn->need); + + rlb_conn_respond(rlb, conn, grant); + + cred -= grant; + } + + *_cred = cred; +} + +static void +rlb_meminfo_dispatch(td_rlb_t *rlb, void *data) +{ + td_rlb_meminfo_t *m = data; + long long us, hi, cred, dirty, pend; + + /* we run only once per m->period */ + + us = rlb_usec_since(rlb, &m->ts); + if (us / 1000 > m->period) 
{ + rlb_meminfo_scan(m); + m->ts = rlb->now; + } + + /* uncongested credit: + memory below hi watermark minus pending I/O */ + + hi = m->total; + hi *= m->limit_hi; + hi /= 100; + + dirty = m->dirty + m->writeback; + + cred = hi - dirty; + cred *= 1000; + + pend = rlb_pending(rlb); + cred -= pend; + + m->congested = rlb_meminfo_test_high(rlb, m, cred); + + DBG(3, "dirty=%lld (%lld) pend=%llu cred=%lld %s", + dirty, dirty * 100 / m->total, pend, cred, + m->congested ? "congested" : ""); + + if (!m->congested) { + rlb_meminfo_dispatch_low(rlb, m, &cred); + + m->congested = rlb_meminfo_test_high(rlb, m, cred); + } + + if (m->congested) + m->valve.ops->dispatch(rlb, m->valve.data); +} + +static struct ratelimit_ops rlb_meminfo_ops = { + .usage = rlb_meminfo_usage, + .create = rlb_meminfo_create, + .destroy = rlb_meminfo_destroy, + .info = rlb_meminfo_info, + + .settimeo = rlb_meminfo_settimeo, + .timeout = rlb_meminfo_timeout, + .dispatch = rlb_meminfo_dispatch, +}; + +/* + * main loop + */ + +static void +rlb_info(td_rlb_t *rlb) +{ + rlb->valve.ops->info(rlb, rlb->valve.data); + + rlb_conn_infos(rlb); +} + +static sigset_t rlb_sigunblock; +static sigset_t rlb_sigpending; + +static void +rlb_sigmark(int signo) +{ + INFO("Caught SIG%d", signo); + sigaddset(&rlb_sigpending, signo); +} + +static int +rlb_siginit(void) +{ + struct sigaction sa_ignore = { .sa_handler = SIG_IGN }; + struct sigaction sa_pending = { .sa_handler = rlb_sigmark }; + sigset_t sigmask; + int err = 0; + + if (!err) + err = sigaction(SIGPIPE, &sa_ignore, NULL); + if (!err) + err = sigaction(SIGINT, &sa_pending, NULL); + if (!err) + err = sigaction(SIGTERM, &sa_pending, NULL); + if (!err) + err = sigaction(SIGUSR1, &sa_pending, NULL); + if (err) { + err = -errno; + goto fail; + } + + sigemptyset(&sigmask); + sigaddset(&sigmask, SIGINT); + sigaddset(&sigmask, SIGTERM); + sigaddset(&sigmask, SIGUSR1); + + err = sigprocmask(SIG_BLOCK, &sigmask, &rlb_sigunblock); + if (err) { + err = -errno; + goto fail; 
+ } + +fail: + return err; +} + +static int +rlb_main_signaled(td_rlb_t *rlb) +{ + if (sigismember(&rlb_sigpending, SIGUSR1)) + rlb_info(rlb); + + if (sigismember(&rlb_sigpending, SIGINT) || + sigismember(&rlb_sigpending, SIGTERM)) + return -EINTR; + + return 0; +} + + +static struct ratelimit_ops * +rlb_find_valve(const char *name) +{ + struct ratelimit_ops *ops = NULL; + + switch (name[0]) { +#if 0 + case ''l'': + if (!strcmp(name, "leaky")) + ops = &rlb_leaky_ops; + break; +#endif + + case ''t'': + if (!strcmp(name, "token")) + ops = &rlb_token_ops; + break; + + case ''m'': + if (!strcmp(name, "meminfo")) + ops = &rlb_meminfo_ops; + break; + } + + return ops; +} + +static int +rlb_main_iterate(td_rlb_t *rlb) +{ + td_rlb_conn_t *conn, *next; + struct timeval *tv; + struct timespec _ts, *ts = &_ts; + int nfds, err; + fd_set rfds; + + FD_ZERO(&rfds); + nfds = 0; + + if (stdin) { + FD_SET(STDIN_FILENO, &rfds); + nfds = MAX(nfds, STDIN_FILENO); + } + + if (rlb->sock >= 0) { + FD_SET(rlb->sock, &rfds); + nfds = MAX(nfds, rlb->sock); + } + + rlb_for_each_conn(conn, rlb) { + FD_SET(conn->sock, &rfds); + nfds = MAX(nfds, conn->sock); + } + + rlb->valve.ops->settimeo(rlb, &tv, rlb->valve.data); + if (tv) { + TIMEVAL_TO_TIMESPEC(tv, ts); + } else + ts = NULL; + + rlb->ts = rlb->now; + + nfds = pselect(nfds + 1, &rfds, NULL, NULL, ts, &rlb_sigunblock); + if (nfds < 0) { + err = -errno; + if (err != -EINTR) + PERROR("select"); + goto fail; + } + + gettimeofday(&rlb->now, NULL); + + if (!nfds) { + BUG_ON(!ts); + rlb->valve.ops->timeout(rlb, rlb->valve.data); + } + + if (nfds) { + rlb_for_each_conn_safe(conn, next, rlb) + if (FD_ISSET(conn->sock, &rfds)) { + rlb_conn_receive(rlb, conn); + if (!--nfds) + break; + } + + rlb->valve.ops->dispatch(rlb, rlb->valve.data); + } + + if (unlikely(nfds)) { + if (FD_ISSET(STDIN_FILENO, &rfds)) { + getc(stdin); + rlb_info(rlb); + nfds--; + } + } + + if (unlikely(nfds)) { + if (FD_ISSET(rlb->sock, &rfds)) { + rlb_accept_conn(rlb); + nfds--; 
+ } + } + + BUG_ON(nfds); + err = 0; +fail: + return err; +} + +static int +rlb_main_run(td_rlb_t *rlb) +{ + int err; + + do { + err = rlb_main_iterate(rlb); + if (err) { + if (err != -EINTR) + break; + + err = rlb_main_signaled(rlb); + if (err) { + err = 0; + break; + } + } + + } while (rlb->sock >= 0 || !list_empty(&rlb->open)); + + return err; +} + +static void +rlb_shutdown(td_rlb_t *rlb) +{ + td_rlb_conn_t *conn, *next; + + rlb_for_each_conn_safe(conn, next, rlb) + rlb_conn_close(rlb, conn); + + rlb_sock_close(rlb); +} + +static void +rlb_usage(td_rlb_t *rlb, const char *prog, FILE *stream) +{ + fprintf(stream, "Usage: %s <name>", prog); + + if (rlb && rlb->valve.ops) + rlb->valve.ops->usage(rlb, stream, rlb->valve.data); + else + fprintf(stream, + " {-t|--type}={token|meminfo}" + " [-h|--help] [-D|--debug=<n>]"); + + fprintf(stream, "\n"); +} + +static void +rlb_destroy(td_rlb_t *rlb) +{ + rlb_shutdown(rlb); + + if (rlb->valve.data) { + rlb->valve.ops->destroy(rlb, rlb->valve.data); + rlb->valve.data = NULL; + } + + if (rlb->name) { + free(rlb->name); + rlb->name = NULL; + } +} + +static int +rlb_create(td_rlb_t *rlb, const char *name) +{ + int i, err; + + memset(rlb, 0, sizeof(*rlb)); + INIT_LIST_HEAD(&rlb->open); + INIT_LIST_HEAD(&rlb->wait); + rlb->sock = -1; + + for (i = RLB_CONN_MAX - 1; i >= 0; i--) + rlb_conn_free(rlb, &rlb->connv[i]); + + rlb->name = strdup(name); + if (!rlb->name) { + err = -errno; + goto fail; + } + + err = rlb_sock_open(rlb); + if (err) + goto fail; + + gettimeofday(&rlb->now, NULL); + + return 0; + +fail: + WARN("err = %d", err); + rlb_destroy(rlb); + return err; +} + +static int +rlb_create_valve(td_rlb_t *rlb, struct rlb_valve *v, + const char *name, int argc, char **argv) +{ + struct ratelimit_ops *ops; + int err; + + ops = rlb_find_valve(name); + if (!ops) { + ERR("No such driver: %s", name); + err = -ESRCH; + goto fail; + } + + v->ops = ops; + + err = v->ops->create(rlb, argc, argv, &v->data); + +fail: + return err; +} + 
+/*
+ * Redirect logging to syslog.
+ *
+ * Builds the ident "<name>[<pid>]" in a static buffer (openlog() keeps
+ * the pointer it is given, so the storage must outlive this call),
+ * opens the log with the given facility and points rlb_vlog at
+ * vsyslog.
+ */
+static void
+rlb_openlog(const char *name, int facility)
+{
+	static char ident[32];
+
+	snprintf(ident, sizeof(ident), "%s[%d]", name, getpid());
+	ident[sizeof(ident)-1] = 0;
+
+	openlog(ident, 0, facility);
+
+	rlb_vlog = vsyslog;
+}
+
+/*
+ * Entry point of the rate-limit bridge.
+ *
+ * Parses the global options (-h, -t <type>, -D <n>), takes the first
+ * non-option argument as the bridge name, installs the signal handling,
+ * creates the bridge and its valve (the remaining arguments are handed
+ * to the valve), daemonizes unless a debug level was given, and runs the
+ * main loop.
+ *
+ * Returns 0 on success, otherwise the positive errno value of the
+ * failure (EINVAL after printing usage for bad arguments).
+ */
+int
+main(int argc, char **argv)
+{
+	td_rlb_t _rlb, *rlb;
+	const char *prog, *type;
+	int err;
+
+	setbuf(stdin, NULL);
+	setlinebuf(stderr);
+
+	rlb = NULL;
+	prog = basename(argv[0]);
+	type = NULL;
+	rlb_vlog = rlb_vlog_vfprintf;
+
+	do {
+		const struct option longopts[] = {
+			{ "help", 0, NULL, 'h' },
+			{ "type", 1, NULL, 't' },
+			/*
+			 * --debug takes an argument, like its short form
+			 * "D:" below: the handler reads optarg, which is
+			 * NULL for a long option declared without one.
+			 */
+			{ "debug", 1, NULL, 'D' },
+			{ NULL, 0, NULL, 0 },
+		};
+		int c;
+
+		c = getopt_long(argc, argv, "ht:D:", longopts, NULL);
+		if (c < 0)
+			break;
+
+		switch (c) {
+		case 'h':
+			rlb_usage(NULL, prog, stdout);
+			return 0;
+
+		case 't':
+			type = optarg;
+			break;
+
+		case 'D':
+			debug = strtoul(optarg, NULL, 0);
+			break;
+
+		case '?':
+			goto usage;
+
+		default:
+			BUG();
+		}
+
+	} while (1);
+
+	if (!type)
+		goto usage;
+
+	/* the bridge name is mandatory */
+	if (argc - optind < 1)
+		goto usage;
+
+	err = rlb_siginit();
+	if (err)
+		goto fail;
+
+	err = rlb_create(&_rlb, argv[optind++]);
+	if (err)
+		goto fail;
+
+	/* only from here on is there a bridge for the exit path to destroy */
+	rlb = &_rlb;
+
+	rlb_argv_shift(&optind, &argc, &argv);
+
+	err = rlb_create_valve(rlb, &rlb->valve, type, argc, argv);
+	if (err) {
+		if (err == -EINVAL)
+			goto usage;
+		goto fail;
+	}
+
+	if (!debug) {
+		err = daemon(0, 0);
+		if (err) {
+			/* daemon() returns -1; report the real cause */
+			err = -errno;
+			PERROR("daemon");
+			goto fail;
+		}
+
+		/* rlb_main_iterate() tests stdin before polling it */
+		stdin = stdout = stderr = NULL;
+		rlb_openlog(prog, LOG_DAEMON);
+	}
+
+	INFO("TD ratelimit bridge: %s, pid %d", rlb->path, getpid());
+
+	rlb_info(rlb);
+
+	err = rlb_main_run(rlb);
+
+	if (err)
+		INFO("Exiting with status %d", -err);
+
+fail:
+	if (rlb)
+		rlb_destroy(rlb);
+
+	return -err;
+
+usage:
+	rlb_usage(rlb, prog, stderr);
+	err = -EINVAL;
+	goto fail;
+}
diff --git a/tools/blktap3/drivers/td.c b/tools/blktap3/drivers/td.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/drivers/td.c
@@ -0,0 +1,697 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/resource.h> +#include <unistd.h> +#include <string.h> + +#include "libvhd.h" +#include "vhd-util.h" +#include "tapdisk-utils.h" + +#if 1 +#define DFPRINTF(_f, _a...) 
fprintf ( stdout, _f , ## _a ) +#else +#define DFPRINTF(_f, _a...) ((void)0) +#endif + +typedef enum { + TD_FIELD_HIDDEN = 0, + TD_FIELD_INVALID = 1 +} td_field_t; + +struct vdi_field { + char *name; + td_field_t id; +}; + +static struct vdi_field td_vdi_fields[TD_FIELD_INVALID] = { + { .id = TD_FIELD_HIDDEN, .name = "hidden" } +}; + +typedef enum { + TD_CMD_CREATE = 0, + TD_CMD_SNAPSHOT, +/* TD_CMD_COALESCE, */ + TD_CMD_QUERY, +/* TD_CMD_RESIZE, */ + TD_CMD_SET, +/* TD_CMD_REPAIR, */ +/* TD_CMD_FILL, */ +/* TD_CMD_READ, */ + TD_CMD_INVALID, +} td_command_t; + +struct command { + td_command_t id; + char *name; + int needs_type; +}; + +struct command commands[TD_CMD_INVALID] = { + { .id = TD_CMD_CREATE, .name = "create", .needs_type = 1 }, + { .id = TD_CMD_SNAPSHOT, .name = "snapshot", .needs_type = 1 }, +/* { .id = TD_CMD_COALESCE, .name = "coalesce", .needs_type = 1 }, */ + { .id = TD_CMD_QUERY, .name = "query", .needs_type = 1 }, +/* { .id = TD_CMD_RESIZE, .name = "resize", .needs_type = 1 }, */ + { .id = TD_CMD_SET, .name = "set", .needs_type = 1 }, +/* { .id = TD_CMD_REPAIR, .name = "repair", .needs_type = 1 }, */ +/* { .id = TD_CMD_FILL, .name = "fill", .needs_type = 1 }, */ +/* { .id = TD_CMD_READ, .name = "read", .needs_type = 1 }, */ +}; + +typedef enum { + TD_TYPE_VHD = 0, + TD_TYPE_AIO, + TD_TYPE_INVALID, +} td_disk_t; + +const char *td_disk_types[TD_TYPE_INVALID] = { + "vhd", + "aio", +}; + +#define print_commands() \ + do { \ + int i; \ + fprintf(stderr, "COMMAND := { "); \ + fprintf(stderr, "%s", commands[0].name); \ + for (i = 1; i < TD_CMD_INVALID; i++) \ + fprintf(stderr, " | %s", commands[i].name); \ + fprintf(stderr, " }\n"); \ + } while (0) + +#define print_disk_types() \ + do { \ + int i; \ + fprintf(stderr, "TYPE := { "); \ + fprintf(stderr, "%s", td_disk_types[0]); \ + for (i = 1; i < TD_TYPE_INVALID; i++) \ + fprintf(stderr, " | %s", td_disk_types[i]); \ + fprintf(stderr, " }\n"); \ + } while (0); + +#define print_field_names() \ + do { \ + 
int i; \ + fprintf(stderr, "FIELD := { "); \ + fprintf(stderr, "%s", td_vdi_fields[0].name); \ + for (i = 1; i < TD_FIELD_INVALID; i++) \ + fprintf(stderr, " | %s", td_vdi_fields[i].name); \ + fprintf(stderr, " }\n"); \ + } while (0) + +void +help(void) +{ + fprintf(stderr, "Tapdisk Utilities: v1.0.0\n"); + fprintf(stderr, "usage: td-util COMMAND [TYPE] [OPTIONS]\n"); + print_commands(); + print_disk_types(); + exit(-1); +} + +struct command * +get_command(char *command) +{ + int i; + + for (i = 0; i < TD_CMD_INVALID; i++) + if (!strcmp(command, commands[i].name)) + return &commands[i]; + + return NULL; +} + +struct vdi_field * +get_field(char *field) +{ + int i; + + for (i = 0; i < TD_FIELD_INVALID; i++) + if (!strcmp(field, td_vdi_fields[i].name)) + return &td_vdi_fields[i]; + + return NULL; +} + +int +get_driver_type(char *type) +{ + int i; + + if (strnlen(type, 25) >= 25) + return -ENAMETOOLONG; + + for (i = 0; i < TD_TYPE_INVALID; i++) + if (!strcmp(type, td_disk_types[i])) + return i; + + return -TD_TYPE_INVALID; +} + +int +td_create(int type, int argc, char *argv[]) +{ + ssize_t mb; + uint64_t size; + char *name, *buf; + int c, i, fd, sparse = 1, fixedsize = 0; + + while ((c = getopt(argc, argv, "hrb")) != -1) { + switch(c) { + case ''r'': + sparse = 0; + break; + case ''b'': + fixedsize = 1; + break; + default: + fprintf(stderr, "Unknown option %c\n", (char)c); + case ''h'': + goto usage; + } + } + + if (optind != (argc - 2)) + goto usage; + + mb = 1 << 20; + size = atoi(argv[optind++]); + size = size << 20; + name = argv[optind]; + + if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) { + fprintf(stderr, "Device name too long\n"); + return ENAMETOOLONG; + } + + if (type == TD_TYPE_VHD) { + int cargc = 0; + char sbuf[32], *cargv[10]; + + size >>= 20; + + memset(cargv, 0, sizeof(cargv)); + snprintf(sbuf, sizeof(sbuf) - 1, "%"PRIu64, size); + cargv[cargc++] = "create"; + cargv[cargc++] = "-n"; + cargv[cargc++] = name; + cargv[cargc++] = "-s"; + cargv[cargc++] = 
sbuf; + if (!sparse) + cargv[cargc++] = "-r"; + if (fixedsize) + cargv[cargc++] = "-b"; + + return vhd_util_create(cargc, cargv); + } + + /* generic create */ + if (sparse) { + fprintf(stderr, "Cannot create sparse %s image\n", + td_disk_types[type]); + return EINVAL; + } + + buf = calloc(1, mb); + if (!buf) + return ENOMEM; + + fd = open(name, O_WRONLY | O_DIRECT | O_CREAT | O_TRUNC, 0644); + if (fd == -1) { + free(buf); + return errno; + } + + size >>= 20; + for (i = 0; i < size; i++) + if (write(fd, buf, mb) != mb) { + close(fd); + unlink(name); + free(buf); + return EIO; + } + + close(fd); + free(buf); + return 0; + + usage: + fprintf(stderr, "usage: td-util create %s [-h help] [-r reserve] " + "[-b file_is_fixed_size] <SIZE(MB)> <FILENAME>\n", + td_disk_types[type]); + return EINVAL; +} + +int +td_snapshot(int type, int argc, char *argv[]) +{ + char *cargv[10]; + int c, err, cargc; + struct stat stats; + char *name, *backing, *limit = NULL; + int fixedsize = 0, rawparent = 0; + + if (type != TD_TYPE_VHD) { + fprintf(stderr, "Cannot create snapshot of %s image type\n", + td_disk_types[type]); + return EINVAL; + } + + while ((c = getopt(argc, argv, "hbml:")) != -1) { + switch(c) { + case ''b'': + fixedsize = 1; + break; + case ''m'': + rawparent = 1; + break; + case ''l'': + limit = optarg; + break; + case ''h'': + err = 0; + goto usage; + default: + err = EINVAL; + goto usage; + } + } + + if (optind != (argc - 2)) { + err = EINVAL; + goto usage; + } + + name = argv[optind++]; + backing = argv[optind++]; + + if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN || + strnlen(backing, MAX_NAME_LEN) == MAX_NAME_LEN) { + fprintf(stderr, "Device name too long\n"); + return ENAMETOOLONG; + } + + if (stat(backing, &stats) == -1) { + fprintf(stderr, "File %s not found\n", backing); + return errno; + } + + cargc = 0; + memset(cargv, 0, sizeof(cargv)); + cargv[cargc++] = "snapshot"; + cargv[cargc++] = "-n"; + cargv[cargc++] = name; + cargv[cargc++] = "-p"; + cargv[cargc++] = 
backing; + if (fixedsize) + cargv[cargc++] = "-b"; + if (rawparent) + cargv[cargc++] = "-m"; + if (limit) { + cargv[cargc++] = "-l"; + cargv[cargc++] = limit; + } + return vhd_util_snapshot(cargc, cargv); + + usage: + fprintf(stderr, "usage: td-util snapshot %s [-h help] [-m parent_raw] " + "[-b file_is_fixed_size] [-l snapshot depth limit] " + "<FILENAME> <BACKING_FILENAME>\n", td_disk_types[type]); + return err; +} + +int +td_coalesce(int type, int argc, char *argv[]) +{ + int c, ret, cargc; + char *name, *cargv[3]; + + if (type != TD_TYPE_VHD) { + fprintf(stderr, "Cannot create snapshot of %s image type\n", + td_disk_types[type]); + return EINVAL; + } + + while ((c = getopt(argc, argv, "h")) != -1) { + switch(c) { + default: + fprintf(stderr, "Unknown option %c\n", (char)c); + case ''h'': + goto usage; + } + } + + if (optind != (argc - 1)) + goto usage; + + name = argv[optind++]; + + if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) { + fprintf(stderr, "Device name too long\n"); + return ENAMETOOLONG; + } + + cargc = 0; + memset(cargv, 0, sizeof(cargv)); + cargv[cargc++] = "coalesce"; + cargv[cargc++] = "-n"; + cargv[cargc++] = name; + ret = vhd_util_coalesce(cargc, cargv); + if (ret) + printf("coalesce failed: %d\n", ret); + + return ret; + + usage: + fprintf(stderr, "usage: td-util coalesce %s [-h help] " + "<FILENAME>\n", td_disk_types[type]); + return EINVAL; +} + +int +td_query(int type, int argc, char *argv[]) +{ + char *name; + int c, size = 0, parent = 0, fields = 0, depth = 0, err = 0; + + while ((c = getopt(argc, argv, "hvpfd")) != -1) { + switch(c) { + case ''v'': + size = 1; + break; + case ''p'': + parent = 1; + break; + case ''f'': + fields = 1; + break; + case ''d'': + depth = 1; + break; + case ''h'': + err = 0; + goto usage; + default: + err = EINVAL; + goto usage; + } + } + + if (optind != (argc - 1)) { + err = EINVAL; + goto usage; + } + + name = argv[optind++]; + + if (strnlen(name, MAX_NAME_LEN) == MAX_NAME_LEN) { + fprintf(stderr, "Device 
name too long\n"); + return ENAMETOOLONG; + } + + if (type == TD_TYPE_VHD) { + vhd_context_t vhd; + + err = vhd_open(&vhd, name, VHD_OPEN_RDONLY); + if (err) { + printf("failed opening %s: %d\n", name, err); + return err; + } + + if (size) + printf("%"PRIu64"\n", vhd.footer.curr_size >> 20); + + if (parent) { + if (vhd.footer.type != HD_TYPE_DIFF) + printf("%s has no parent\n", name); + else { + char *pname; + + err = vhd_parent_locator_get(&vhd, &pname); + if (err) + printf("failed getting parent: %d\n", + err); + else { + printf("%s\n", pname); + free(pname); + } + } + } + + if (fields) { + int ret, hidden; + + ret = vhd_hidden(&vhd, &hidden); + if (ret) { + printf("failed checking ''hidden'' field: %d\n", + ret); + err = (err ? : ret); + } else + printf("%s: %d\n", + td_vdi_fields[TD_FIELD_HIDDEN].name, + hidden); + } + + if (depth) { + int ret, length; + + ret = vhd_chain_depth(&vhd, &length); + if (ret) + printf("error checking chain depth: %d\n", ret); + else + printf("chain depth: %d\n", length); + + err = (err ? 
: ret); + } + + vhd_close(&vhd); + + } else if (type == TD_TYPE_AIO) { + if (size) { + int fd; + uint64_t secs; + uint32_t ssize; + + fd = open(name, O_RDONLY | O_LARGEFILE); + if (fd == -1) { + printf("failed opening %s: %d\n", name, errno); + return -errno; + } + + err = tapdisk_get_image_size(fd, &secs, &ssize); + close(fd); + + if (err) { + printf("failed getting size for %s: %d\n:", + name, err); + return err; + } + + printf("%"PRIu64"\n", secs >> 11); + } + + if (parent) + printf("%s has no parent\n", name); + + if (fields) { + int i; + + for (i = 0; i < TD_FIELD_INVALID; i++) + printf("%s: 0\n", td_vdi_fields[i].name); + } + } + + return err; + + usage: + fprintf(stderr, "usage: td-util query %s [-h help] [-v virtsize] " + "[-p parent] [-f fields] <FILENAME>\n", td_disk_types[type]); + return err; +} + +int +td_set_field(int type, int argc, char *argv[]) +{ + int c, cargc; + struct vdi_field *field; + char *name, *value, *cargv[7]; + + if (type != TD_TYPE_VHD) { + fprintf(stderr, "Cannot set fields of %s images\n", + td_disk_types[type]); + return EINVAL; + } + + while ((c = getopt(argc, argv, "h")) != -1) { + switch(c) { + default: + fprintf(stderr, "Unknown option %c\n", (char)c); + case ''h'': + goto usage; + } + } + + if (optind != (argc - 3)) + goto usage; + + name = argv[optind++]; + + field = get_field(argv[optind]); + if (!field || field->id != TD_FIELD_HIDDEN) { + fprintf(stderr, "Invalid field %s\n", argv[optind]); + goto usage; + } + + value = argv[++optind]; + + cargc = 0; + memset(cargv, 0, sizeof(cargv)); + cargv[cargc++] = "set"; + cargv[cargc++] = "-n"; + cargv[cargc++] = name; + cargv[cargc++] = "-f"; + cargv[cargc++] = field->name; + cargv[cargc++] = "-v"; + cargv[cargc++] = value; + return vhd_util_set_field(cargc, cargv); + + usage: + fprintf(stderr, "usage: td-util set %s [-h help] " + "<FILENAME> <FIELD> <VALUE>\n", td_disk_types[type]); + print_field_names(); + return EINVAL; +} + +int +main(int argc, char *argv[]) +{ + char **cargv; + 
struct command *cmd; + int cargc, i, type = -1, ret = 0; + +#ifdef CORE_DUMP + struct rlimit rlim; + rlim.rlim_cur = RLIM_INFINITY; + rlim.rlim_max = RLIM_INFINITY; + if (setrlimit(RLIMIT_CORE, &rlim) < 0) + fprintf(stderr, "setrlimit failed: %d\n", errno); +#endif + + if (argc < 2) + help(); + + cargc = argc - 1; + cmd = get_command(argv[1]); + if (!cmd) { + fprintf(stderr, "invalid COMMAND %s\n", argv[1]); + help(); + } + + if (cmd->needs_type) { + if (argc < 3) { + fprintf(stderr, "td-util %s requires a TYPE\n", + cmd->name); + print_disk_types(); + exit(-1); + } + + type = get_driver_type(argv[2]); + if (type < 0) { + fprintf(stderr, "invalid TYPE '%s'.\n", argv[2]); + print_disk_types(); + exit(-1); + } + --cargc; + } + + cargv = malloc(sizeof(char *) * cargc); + if (!cargv) + exit(ENOMEM); + + cargv[0] = cmd->name; + for (i = 1; i < cargc; i++) + cargv[i] = argv[i + (argc - cargc)]; + + switch(cmd->id) { + case TD_CMD_CREATE: + ret = td_create(type, cargc, cargv); + break; + case TD_CMD_SNAPSHOT: + ret = td_snapshot(type, cargc, cargv); + break; +/* + case TD_CMD_COALESCE: + ret = td_coalesce(type, cargc, cargv); + break; +*/ + case TD_CMD_QUERY: + ret = td_query(type, cargc, cargv); + break; +/* + case TD_CMD_RESIZE: + ret = td_resize(type, cargc, cargv); + break; +*/ + case TD_CMD_SET: + ret = td_set_field(type, cargc, cargv); + break; +/* + case TD_CMD_REPAIR: + ret = td_repair(type, cargc, cargv); + break; + case TD_CMD_FILL: + ret = td_fill(type, cargc, cargv); + break; + case TD_CMD_READ: + ret = td_read(type, cargc, cargv); + break; +*/ + default: + case TD_CMD_INVALID: + ret = EINVAL; + break; + } + + free(cargv); + + return (ret >= 0 ? ret : -ret); +} diff --git a/tools/blktap3/include/libvhd-index.h b/tools/blktap3/include/libvhd-index.h new file mode 100644 --- /dev/null +++ b/tools/blktap3/include/libvhd-index.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _LIB_VHDI_H_ +#define _LIB_VHDI_H_ + +#include <inttypes.h> +#include <uuid/uuid.h> + +#define VHD_MAX_NAME_LEN 1024 + +typedef struct vhdi_context vhdi_context_t; +typedef struct vhdi_bat vhdi_bat_t; +typedef struct vhdi_block vhdi_block_t; +typedef struct vhdi_entry vhdi_entry_t; +typedef uint32_t vhdi_file_id_t; +typedef struct vhdi_file_ref vhdi_file_ref_t; +typedef struct vhdi_file_table vhdi_file_table_t; + +struct vhdi_context { + int fd; + int spb; + char *name; + uint32_t vhd_block_size; +}; + +struct vhdi_bat { + uint32_t *table; + uint64_t vhd_blocks; + uint32_t vhd_block_size; + char vhd_path[VHD_MAX_NAME_LEN]; + char index_path[VHD_MAX_NAME_LEN]; + char file_table_path[VHD_MAX_NAME_LEN]; +}; + +struct vhdi_entry { + vhdi_file_id_t file_id; + uint32_t offset; +}; + +struct vhdi_block { + int entries; + vhdi_entry_t *table; +}; + +struct vhdi_file_ref { + vhdi_file_id_t file_id; + char *path; + uuid_t vhd_uuid; + uint32_t vhd_timestamp; +}; + +struct vhdi_file_table { + int entries; + vhdi_file_ref_t *table; +}; + +void vhdi_entry_in(vhdi_entry_t *); + +int vhdi_create(const char *, uint32_t); +int vhdi_open(vhdi_context_t *, const char *, int); +void vhdi_close(vhdi_context_t *); +int vhdi_read_block(vhdi_context_t *, vhdi_block_t *, uint32_t); +int vhdi_write_block(vhdi_context_t *, vhdi_block_t *, uint32_t); +int vhdi_append_block(vhdi_context_t *, vhdi_block_t *, uint32_t *); + +int vhdi_bat_create(const char *, const char *, const char *, const char *); +int vhdi_bat_load(const char *, vhdi_bat_t *); +int vhdi_bat_write(const char *, vhdi_bat_t *); + +int vhdi_file_table_create(const char *); +int vhdi_file_table_load(const char *, vhdi_file_table_t *); +int vhdi_file_table_add(const char *, const char *, vhdi_file_id_t *); +void vhdi_file_table_free(vhdi_file_table_t *); + +#endif diff --git a/tools/blktap3/include/libvhd-journal.h b/tools/blktap3/include/libvhd-journal.h new file mode 100644 --- /dev/null +++ 
b/tools/blktap3/include/libvhd-journal.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef _VHD_JOURNAL_H_ +#define _VHD_JOURNAL_H_ + +#include <inttypes.h> + +#include "libvhd.h" + +#define VHD_JOURNAL_METADATA 0x01 +#define VHD_JOURNAL_DATA 0x02 + +#define VHD_JOURNAL_HEADER_COOKIE "vjournal" +#define VHD_JOURNAL_ENTRY_COOKIE 0xaaaa12344321aaaaULL + +typedef struct vhd_journal_header { + char cookie[8]; + uuid_t uuid; + uint64_t vhd_footer_offset; + uint32_t journal_data_entries; + uint32_t journal_metadata_entries; + uint64_t journal_data_offset; + uint64_t journal_metadata_offset; + uint64_t journal_eof; + char pad[448]; +} vhd_journal_header_t; + +typedef struct vhd_journal { + char *jname; + int jfd; + int is_block; /* is jfd a block device */ + vhd_journal_header_t header; + vhd_context_t vhd; +} vhd_journal_t; + +int vhd_journal_create(vhd_journal_t *, const char *file, const char *jfile); +int vhd_journal_open(vhd_journal_t *, const char *file, const char *jfile); +int vhd_journal_add_block(vhd_journal_t *, uint32_t block, char mode); +int vhd_journal_commit(vhd_journal_t *); +int vhd_journal_revert(vhd_journal_t *); +int vhd_journal_close(vhd_journal_t *); +int vhd_journal_remove(vhd_journal_t *); + +#endif diff --git a/tools/blktap3/include/tapdisk-message.h b/tools/blktap3/include/tapdisk-message.h --- a/tools/blktap3/include/tapdisk-message.h +++ b/tools/blktap3/include/tapdisk-message.h @@ -30,6 +30,10 @@ #include <inttypes.h> #include <sys/types.h> +/* TODO Why do we have two of them? */ +/* TODO This is quite small since we don't allow path bigger than 256 chars. If + * we ever increase this, make sure tapdisk_message_t structures are not + * allocated on the stack. */ #define TAPDISK_MESSAGE_MAX_PATH_LENGTH 256 #define TAPDISK_MESSAGE_STRING_LENGTH 256 @@ -145,6 +149,31 @@ struct tapdisk_message_blkif { char params[TAPDISK_MESSAGE_MAX_PATH_LENGTH]; }; +/** + * Contains parameters for resuming a previously paused VBD. 
+ */ +typedef struct tapdisk_message_resume { + /** + * TODO + */ + tapdisk_message_flag_t flags; + + /** + * The VDI (type:/path/to/file) to pause. + */ + char params1[TAPDISK_MESSAGE_MAX_PATH_LENGTH]; + + /** + * A new VDI to use instead of the old one. Optional. + */ + char params2[TAPDISK_MESSAGE_MAX_PATH_LENGTH]; + + /** + * TODO + */ + char secondary[TAPDISK_MESSAGE_MAX_PATH_LENGTH]; +} tapdisk_message_resume_t; + struct tapdisk_message { /** * TAPDISK_MESSAGE_??? @@ -161,6 +190,7 @@ struct tapdisk_message { tapdisk_message_list_t list; tapdisk_message_stat_t info; tapdisk_message_blkif_t blkif; + tapdisk_message_resume_t resume; } u; }; diff --git a/tools/blktap3/include/vhd-util.h b/tools/blktap3/include/vhd-util.h new file mode 100644 --- /dev/null +++ b/tools/blktap3/include/vhd-util.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _VHD_UTIL_H_ +#define _VHD_UTIL_H_ + +int vhd_util_create(int argc, char **argv); +int vhd_util_snapshot(int argc, char **argv); +int vhd_util_query(int argc, char **argv); +int vhd_util_read(int argc, char **argv); +int vhd_util_set_field(int argc, char **argv); +int vhd_util_repair(int argc, char **argv); +int vhd_util_fill(int argc, char **argv); +int vhd_util_resize(int argc, char **argv); +int vhd_util_coalesce(int argc, char **argv); +int vhd_util_modify(int argc, char **argv); +int vhd_util_scan(int argc, char **argv); +int vhd_util_check(int argc, char **argv); +int vhd_util_revert(int argc, char **argv); + +#endif diff --git a/tools/blktap3/lvm/Makefile b/tools/blktap3/lvm/Makefile new file mode 100644 --- /dev/null +++ b/tools/blktap3/lvm/Makefile @@ -0,0 +1,45 @@ +XEN_ROOT = $(CURDIR)/../../.. +BLKTAP_ROOT := .. +include $(XEN_ROOT)/tools/Rules.mk + +ifeq ($(LVM_UTIL_TEST),y) +TEST := lvm-util +endif + +override CFLAGS += \ + -I$(BLKTAP_ROOT)/include \ + -D_GNU_SOURCE \ + -Wall \ + -Wextra \ + -Werror + +# FIXME cause trouble +override CFLAGS += \ + -Wno-sign-compare + +# FIXME Why only on 64-bit? 
+ifeq ($(CONFIG_X86_64),y) +CFLAGS += -fPIC +endif + +LVM-OBJS := lvm-util.o + +all: build liblvm.a + +build: $(TEST) $(LVM-OBJS) + +# FIXME lvm-util not installed somewhere +install: all + +lvm-util: lvm-util.o + $(CC) -DLVM_UTIL $(LDFLAGS) -o lvm-util lvm-util.c + +liblvm.a: $(LVM-OBJS) + $(AR) rc $@ $^ + +clean: + rm -rf *.o *.opic *~ $(DEPS) $(IBIN) + +.PHONY: all build clean install lvm-util + +-include $(DEPS) diff --git a/tools/blktap3/lvm/lvm-util.c b/tools/blktap3/lvm/lvm-util.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/lvm/lvm-util.c @@ -0,0 +1,375 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include <errno.h> +#include <stdlib.h> +#include <string.h> +#include <syslog.h> + +#include "lvm-util.h" + +#define EPRINTF(_f, _a...) \ + do { \ + syslog(LOG_INFO, "%s: " _f, __func__, ##_a); \ + } while (0) + +#define _NAME "%255s" +static char line[1024]; + +static inline int +lvm_read_line(FILE *scan) +{ + memset(line, 0, sizeof(line)); + return (fscanf(scan, "%1023[^\n]", line) != 1); +} + +static inline int +lvm_next_line(FILE *scan) +{ + return (fscanf(scan, "%1023[\n]", line) != 1); +} + +static int +lvm_copy_name(char *dst, const char *src, size_t size) +{ + if (strnlen(src, size) == size) + return -ENAMETOOLONG; + + strcpy(dst, src); + return 0; +} + +static int +lvm_parse_pv(struct vg *vg, const char *name, int pvs, uint64_t start) +{ + int i, err; + struct pv *pv; + + pv = NULL; + + if (!vg->pvs) { + vg->pvs = calloc(pvs, sizeof(struct pv)); + if (!vg->pvs) + return -ENOMEM; + } + + for (i = 0; i < pvs; i++) { + pv = vg->pvs + i; + + if (!pv->name[0]) + break; + + if (!strcmp(pv->name, name)) + return -EEXIST; + } + + if (!pv) + return -ENOENT; + + if (i == pvs) + return -ENOMEM; + + err = lvm_copy_name(pv->name, name, sizeof(pv->name) - 1); + if (err) + return err; + + pv->start = start; + return 0; +} + +static int +lvm_open_vg(const char *vgname, struct vg *vg) +{ + FILE *scan; + int i, err, pvs, lvs; + char *cmd, pvname[256]; + uint64_t 
size, pv_start; + + memset(vg, 0, sizeof(*vg)); + + err = asprintf(&cmd, "/usr/sbin/vgs %s --noheadings --nosuffix --units=b " + "--options=vg_name,vg_extent_size,lv_count,pv_count," + "pv_name,pe_start --unbuffered 2> /dev/null", vgname); + if (err == -1) + return -ENOMEM; + + errno = 0; + scan = popen(cmd, "r"); + if (!scan) { + err = (errno ? -errno : ENOMEM); + goto out; + } + + for (;;) { + if (lvm_read_line(scan)) + break; + + err = -EINVAL; + if (sscanf(line, _NAME" %"PRIu64" %d %d "_NAME" %"PRIu64, vg->name, + &size, &lvs, &pvs, pvname, &pv_start) != 6) { + EPRINTF("sscanf failed on '%s'\n", line); + goto out; + } + + if (strcmp(vg->name, vgname)) { + EPRINTF("VG name '%s' != '%s'\n", vg->name, vgname); + goto out; + } + err = lvm_parse_pv(vg, pvname, pvs, pv_start); + if (err) + goto out; + + if (lvm_next_line(scan)) + break; + } + + err = -EINVAL; + if (strcmp(vg->name, vgname)) { + EPRINTF("VG name '%s' != '%s'\n", vg->name, vgname); + goto out; + } + + for (i = 0; i < pvs; i++) + if (!vg->pvs[i].name[0]) { + EPRINTF("pvs %d name empty\n", i); + goto out; + } + + err = -ENOMEM; + vg->lvs = calloc(lvs, sizeof(struct lv)); + if (!vg->lvs) + goto out; + + err = 0; + vg->lv_cnt = lvs; + vg->pv_cnt = pvs; + vg->extent_size = size; + +out: + if (scan) + pclose(scan); + if (err) + lvm_free_vg(vg); + free(cmd); + return err; +} + +static int +lvm_parse_lv_devices(struct vg *vg, struct lv_segment *seg, char *devices) +{ + int i; + uint64_t start, pe_start; + + for (i = 0; i < strlen(devices); i++) + if (strchr(",()", devices[i])) + devices[i] = ' '; + + if (sscanf(devices, _NAME" %"PRIu64, seg->device, &start) != 2) { + EPRINTF("sscanf failed on '%s'\n", devices); + return -EINVAL; + } + + pe_start = -1; + for (i = 0; i < vg->pv_cnt; i++) + if (!strcmp(vg->pvs[i].name, seg->device)) { + pe_start = vg->pvs[i].start; + break; + } + + if (pe_start == -1) { + EPRINTF("invalid pe_start value\n"); + return -EINVAL; + } + + seg->pe_start = (start * 
vg->extent_size) + pe_start; + return 0; +} + +static int +lvm_scan_lvs(struct vg *vg) +{ + char *cmd; + FILE *scan; + int i, err; + + err = asprintf(&cmd, "/usr/sbin/lvs %s --noheadings --nosuffix --units=b " + "--options=lv_name,lv_size,segtype,seg_count,seg_start," + "seg_size,devices --unbuffered 2> /dev/null", vg->name); + if (err == -1) + return -ENOMEM; + + errno = 0; + scan = popen(cmd, "r"); + if (!scan) { + err = (errno ? -errno : -ENOMEM); + goto out; + } + + for (i = 0;;) { + int segs; + struct lv *lv; + struct lv_segment seg; + unsigned long long size, seg_start; + char type[32], name[256], devices[1024]; + + if (i >= vg->lv_cnt) + break; + + if (lvm_read_line(scan)) { + vg->lv_cnt = i; + break; + } + + err = -EINVAL; + lv = vg->lvs + i; + + if (sscanf(line, _NAME" %llu %31s %u %llu %"PRIu64" %1023s", + name, &size, type, &segs, &seg_start, + &seg.pe_size, devices) != 7) { + EPRINTF("sscanf failed on '%s'\n", line); + goto out; + } + + if (seg_start) + goto next; + + if (!strcmp(type, "linear")) + seg.type = LVM_SEG_TYPE_LINEAR; + else + seg.type = LVM_SEG_TYPE_UNKNOWN; + + if (lvm_parse_lv_devices(vg, &seg, devices)) + goto out; + + i++; + lv->size = size; + lv->segments = segs; + lv->first_segment = seg; + + err = lvm_copy_name(lv->name, name, sizeof(lv->name) - 1); + if (err) + goto out; + err = -EINVAL; + + next: + if (lvm_next_line(scan)) { + if (err) + EPRINTF("fscanf failed\n"); + goto out; + } + } + + err = 0; + +out: + if (scan) + pclose(scan); + free(cmd); + return err; +} + +void +lvm_free_vg(struct vg *vg) +{ + free(vg->lvs); + free(vg->pvs); + memset(vg, 0, sizeof(*vg)); +} + +int +lvm_scan_vg(const char *vg_name, struct vg *vg) +{ + int err; + + memset(vg, 0, sizeof(*vg)); + + err = lvm_open_vg(vg_name, vg); + if (err) + return err; + + err = lvm_scan_lvs(vg); + if (err) { + lvm_free_vg(vg); + return err; + } + + return 0; +} + +#ifdef LVM_UTIL +static int +usage(void) +{ + printf("usage: lvm-util <vgname>\n"); + exit(EINVAL); +} + +int 
+main(int argc, char **argv) +{ + int i, err; + struct vg vg; + struct pv *pv; + struct lv *lv; + struct lv_segment *seg; + + if (argc != 2) + usage(); + + err = lvm_scan_vg(argv[1], &vg); + if (err) { + printf("scan failed: %d\n", err); + return (err >= 0 ? err : -err); + } + + printf("vg %s: extent_size: %"PRIu64", pvs: %d, lvs: %d\n", + vg.name, vg.extent_size, vg.pv_cnt, vg.lv_cnt); + + for (i = 0; i < vg.pv_cnt; i++) { + pv = vg.pvs + i; + printf("pv %s: start %"PRIu64"\n", pv->name, pv->start); + } + + for (i = 0; i < vg.lv_cnt; i++) { + lv = vg.lvs + i; + seg = &lv->first_segment; + printf("lv %s: size: %"PRIu64", segments: %u, type: %u, " + "dev: %s, pe_start: %"PRIu64", pe_size: %"PRIu64"\n", + lv->name, lv->size, lv->segments, seg->type, + seg->device, seg->pe_start, seg->pe_size); + } + + lvm_free_vg(&vg); + return 0; +} +#endif diff --git a/tools/blktap3/lvm/lvm-util.h b/tools/blktap3/lvm/lvm-util.h new file mode 100644 --- /dev/null +++ b/tools/blktap3/lvm/lvm-util.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#ifndef _LVM_UTIL_H_ +#define _LVM_UTIL_H_ + +#include <inttypes.h> + +#define MAX_NAME_SIZE 256 + +#define LVM_SEG_TYPE_LINEAR 1 +#define LVM_SEG_TYPE_UNKNOWN 2 + +struct lv_segment { + uint8_t type; + char device[MAX_NAME_SIZE]; + uint64_t pe_start; + uint64_t pe_size; +}; + +struct lv { + char name[MAX_NAME_SIZE]; + uint64_t size; + uint32_t segments; + struct lv_segment first_segment; +}; + +struct pv { + char name[MAX_NAME_SIZE]; + uint64_t start; +}; + +struct vg { + char name[MAX_NAME_SIZE]; + uint64_t extent_size; + + int pv_cnt; + struct pv *pvs; + + int lv_cnt; + struct lv *lvs; +}; + +int lvm_scan_vg(const char *vg_name, struct vg *vg); +void lvm_free_vg(struct vg *vg); + +#endif diff --git a/tools/blktap3/part/Makefile b/tools/blktap3/part/Makefile new file mode 100644 --- /dev/null +++ b/tools/blktap3/part/Makefile @@ -0,0 +1,34 @@ +XEN_ROOT := $(CURDIR)/../../../ +include $(XEN_ROOT)/tools/Rules.mk + +BLKTAP_ROOT := .. 
+ +IBIN = part-util + +override CFLAGS += \ + -I$(BLKTAP_ROOT)/include \ + $(CFLAGS_xeninclude) \ + -Wall \ + -Wextra \ + -Werror + +# FIXME cause trouble +override CFLAGS += \ + -Wno-sign-compare + +PART-OBJS := partition.o + +all: $(IBIN) vhdpartx + +$(IBIN): $(PART-OBJS) part-util.o + $(CC) -o $@ $^ $(LDFLAGS) + +install: all + $(INSTALL_DIR) -p $(DESTDIR)$(BINDIR) + $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(BINDIR) + $(INSTALL_PROG) vhdpartx $(DESTDIR)$(BINDIR) + +clean: + rm -f *.o .*.o.d $(IBIN) $(PART-OBJS) + +.PHONY: clean diff --git a/tools/blktap3/part/part-util.c b/tools/blktap3/part/part-util.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/part/part-util.c @@ -0,0 +1,369 @@ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <time.h> +#include <stdio.h> +#include <fcntl.h> +#include <errno.h> +#include <endian.h> +#include <unistd.h> +#include <stdlib.h> +#include <string.h> +#include <byteswap.h> +#include <sys/ioctl.h> +#include <linux/fs.h> +#include <linux/hdreg.h> + +#include "partition.h" + +#if BYTE_ORDER == LITTLE_ENDIAN + #define cpu_to_le32(x) (x) + #define cpu_to_le64(x) (x) +#else + #define cpu_to_le32(x) bswap_32(x) + #define cpu_to_le64(x) bswap_64(x) +#endif + +static void +usage(const char *app) +{ + printf("usage: %s <-i image> " + "[-d dump] [-c count] [-f format] " + "[-t type] [-s sig <part>]\n", app); +} + +static void +chs_unpack(struct partition_chs *c, + uint8_t *head, uint8_t *sector, uint16_t *cylinder) +{ + *head = c->chs[0]; + *sector = c->chs[1] & 0x3f; + *cylinder = (c->chs[1] & 0xc0) * 4 + c->chs[2]; +} + +void +partition_table_dump(struct partition_table *pt) +{ + int i; + + printf("disk signature 0x%08x\n", pt->disk_signature); + printf("mbr signature 0x%04x\n", pt->mbr_signature); + printf("\n"); + + for (i = 0; i < 4; i++) { + struct primary_partition *p = pt->partitions + i; + uint8_t head, sector; + uint16_t cylinder; + + printf(" %d status 0x%02x\n", i, p->status); + + chs_unpack(&p->chs_first, 
&head, &sector, &cylinder); + printf(" %d s cylinder 0x%04x\n", i, cylinder); + printf(" %d s sector 0x%01x\n", i, sector); + printf(" %d s head 0x%01x\n", i, head); + + printf(" %d type 0x%01x\n", i, p->type); + + chs_unpack(&p->chs_last, &head, &sector, &cylinder); + printf(" %d e cylinder 0x%04x\n", i, cylinder); + printf(" %d e sector 0x%01x\n", i, sector); + printf(" %d e head 0x%01x\n", i, head); + + printf(" %d lba 0x%08x\n", i, p->lba); + printf(" %d blocks 0x%08x\n", i, p->blocks); + + printf("\n"); + } +} + +static int +dump_partitions(const char *image) +{ + int fd, ret; + struct partition_table pt; + + ret = 1; + fd = -1; + + fd = open(image, O_RDONLY); + if (fd == -1) + goto out; + + if (read(fd, &pt, sizeof(pt)) != sizeof(pt)) { + errno = errno ? : EIO; + goto out; + } + + partition_table_in(&pt); + if (partition_table_validate(&pt)) { + errno = EINVAL; + printf("table invalid\n"); + goto out; + } + + partition_table_dump(&pt); + ret = 0; + +out: + close(fd); + return ret; +} + +static void +__dump_signature(struct partition_table *pt, int part) +{ + if (part < 1 || part > 4) + errno = EINVAL; + else { + uint8_t *p, *s; + uint32_t sig = pt->disk_signature; + uint64_t off = (uint64_t)pt->partitions[part - 1].lba << 9; + + sig = cpu_to_le32(sig); + off = cpu_to_le64(off); + + for (p = s = (uint8_t *)&sig; p - s < sizeof(sig); p++) + printf("%02x", *p); + + for (p = s = (uint8_t *)&off; p - s < sizeof(off); p++) + printf("%02x", *p); + + printf("\n"); + } +} + +static int +dump_signature(const char *image, int part) +{ + int fd, ret; + struct partition_table pt; + + ret = 1; + fd = -1; + + fd = open(image, O_RDONLY); + if (fd == -1) + goto out; + + if (read(fd, &pt, sizeof(pt)) != sizeof(pt)) { + errno = errno ? 
: EIO; + goto out; + } + + partition_table_in(&pt); + if (partition_table_validate(&pt)) { + errno = EINVAL; + printf("table invalid\n"); + goto out; + } + + __dump_signature(&pt, part); + ret = 0; + +out: + close(fd); + return ret; +} + +static int +count_partitions(const char *image, int *count) +{ + int i, fd, ret; + struct partition_table pt; + + ret = 1; + fd = -1; + + fd = open(image, O_RDONLY); + if (fd == -1) + goto out; + + if (read(fd, &pt, sizeof(pt)) != sizeof(pt)) { + errno = errno ? : EIO; + goto out; + } + + partition_table_in(&pt); + if (partition_table_validate(&pt)) { + *count = 0; + goto done; + } + + *count = 0; + for (i = 0; i < 4; i++) + if (pt.partitions[i].type) + (*count)++; + +done: + ret = 0; +out: + close(fd); + return ret; +} + +static int +format_partition(const char *image, int type, struct partition_table *pt) +{ + uint64_t lend; + uint32_t start, end; + int ret, sec_size, fd; + unsigned int cylinders; + struct hd_geometry geo; + struct primary_partition *pp; + struct partition_geometry pgeo; + unsigned long long bytes, llcyls; + + ret = 1; + fd = -1; + + memset(pt, 0, sizeof(*pt)); + pp = pt->partitions; + + srandom(time(NULL)); + + fd = open(image, O_RDWR); + if (fd == -1) + goto out; + + if (ioctl(fd, HDIO_GETGEO, &geo)) + goto out; + + if (ioctl(fd, BLKGETSIZE64, &bytes)) + goto out; + + if (ioctl(fd, BLKSSZGET, &sec_size)) + goto out; + + llcyls = (bytes >> 9) / ((sec_size >> 9) * geo.heads * geo.sectors); + cylinders = llcyls; + if (cylinders != llcyls) + cylinders = ~0; + + pgeo.heads = geo.heads; + pgeo.sectors = geo.sectors; + pgeo.cylinders = cylinders; + + start = pgeo.sectors; + lend = geo.heads * geo.sectors * llcyls - 1; + + end = lend; + if (end != lend) + end = ~0; + + pp->status = PARTITION_BOOTABLE; + pp->type = type; + pp->lba = start; + pp->blocks = end - start + 1; + pp->chs_first = lba_to_chs(&pgeo, start); + pp->chs_last = lba_to_chs(&pgeo, lend); + + pt->mbr_signature = MBR_SIGNATURE; + pt->disk_signature = 
random(); + + partition_table_out(pt); + if (write(fd, pt, sizeof(*pt)) != sizeof(*pt)) { + errno = errno ? : EIO; + goto out; + } + + ret = 0; + +out: + close(fd); + return ret; +} + +int +main(int argc, char *argv[]) +{ + char *image; + struct partition_table pt; + int ret, c, type, count, dump, format, signature; + + ret = 1; + format = 0; + count = 0; + dump = 0; + type = 0; + signature = -1; + image = NULL; + + while ((c = getopt(argc, argv, "i:fdt:cs:h")) != -1) { + switch (c) { + case 'i': + image = optarg; + break; + case 'c': + count = 1; + break; + case 's': + signature = atoi(optarg); + break; + case 'f': + format = 1; + break; + case 't': { + int base = (!strncasecmp(optarg, "0x", 2) ? 16 : 10); + type = strtol(optarg, NULL, base); + break; + } + case 'd': + dump = 1; + break; + case 'h': + usage(argv[0]); + ret = 0; + goto out; + } + } + + if (!image || (!format && !count && !signature && !dump)) { + errno = EINVAL; + usage(argv[0]); + goto out; + } + + if (format) { + if (!type) { + errno = EINVAL; + perror("type required"); + goto out; + } + + if (format_partition(image, type, &pt)) { + perror("formatting partition"); + goto out; + } + + __dump_signature(&pt, 1); + } + + if (count) { + if (count_partitions(image, &count)) { + perror("counting partitions"); + goto out; + } + printf("%d\n", count); + } + + if (signature != -1) { + if (dump_signature(image, signature)) { + perror("dumping signature"); + goto out; + } + } + + if (dump) { + if (dump_partitions(image)) { + perror("dumping partitions"); + goto out; + } + } + + ret = 0; + +out: + return ret; +} diff --git a/tools/blktap3/part/partition.c b/tools/blktap3/part/partition.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/part/partition.c @@ -0,0 +1,112 @@ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <endian.h> +#include <byteswap.h> + +#include "partition.h" + +#if BYTE_ORDER == LITTLE_ENDIAN + #define le16_to_cpu(x) (x) + #define 
le32_to_cpu(x) (x) + #define cpu_to_le16(x) (x) + #define cpu_to_le32(x) (x) +#else + #define le16_to_cpu(x) bswap_16(x) + #define le32_to_cpu(x) bswap_32(x) + #define cpu_to_le16(x) bswap_16(x) + #define cpu_to_le32(x) bswap_32(x) +#endif + +#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a)[0]) + +void +primary_partition_in(struct primary_partition *p) +{ + p->lba = le32_to_cpu(p->lba); + p->blocks = le32_to_cpu(p->blocks); +} + +void +primary_partition_out(struct primary_partition *p) +{ + p->lba = cpu_to_le32(p->lba); + p->blocks = cpu_to_le32(p->blocks); +} + +void +partition_table_in(struct partition_table *pt) +{ + int i; + + pt->disk_signature = le32_to_cpu(pt->disk_signature); + pt->mbr_signature = le16_to_cpu(pt->mbr_signature); + + for (i = 0; i < ARRAY_SIZE(pt->partitions); i++) + primary_partition_in(pt->partitions + i); +} + +void +partition_table_out(struct partition_table *pt) +{ + int i; + + pt->disk_signature = cpu_to_le32(pt->disk_signature); + pt->mbr_signature = cpu_to_le16(pt->mbr_signature); + + for (i = 0; i < ARRAY_SIZE(pt->partitions); i++) + primary_partition_out(pt->partitions + i); +} + +int +primary_partition_validate(struct primary_partition *p) +{ + if (p->status != PARTITION_BOOTABLE && + p->status != PARTITION_NON_BOOTABLE) + return EINVAL; + + return 0; +} + +int +partition_table_validate(struct partition_table *pt) +{ + int i; + + if (pt->mbr_signature != MBR_SIGNATURE) + return EINVAL; + + for (i = 0; i < ARRAY_SIZE(pt->partitions); i++) { + int err = primary_partition_validate(pt->partitions + i); + if (err) + return err; + } + + return 0; +} + +struct partition_chs +lba_to_chs(struct partition_geometry *geo, uint64_t lba) +{ + struct partition_chs c; + + if (lba >= 0x3ff * geo->sectors * geo->heads) { + c.chs[0] = geo->heads - 1; + c.chs[1] = geo->sectors; + lba = 0x3ff; + } else { + c.chs[1] = lba % geo->sectors + 1; + lba /= geo->sectors; + + c.chs[0] = lba % geo->heads; + lba /= geo->heads; + } + + c.chs[2] = lba & 0xff; + 
c.chs[1] |= (lba >> 2) & 0xc0; + + return c; +} diff --git a/tools/blktap3/part/vhdpartx b/tools/blktap3/part/vhdpartx new file mode 100755 --- /dev/null +++ b/tools/blktap3/part/vhdpartx @@ -0,0 +1,109 @@ +#!/bin/sh + +set -e + +PARTUTIL=/usr/sbin/part-util +LIBVHDIO=/usr/lib/libvhdio.so.1.0 + +die() +{ + echo "$@" + exit 1 +} + +usage() +{ + echo "usage: $0 [-a | -d | -l] vhd [lib]" + echo "-a add partition mappings" + echo "-d del partition mappings" + echo "-l list partition mappings" + exit 1 +} + +parse_args() +{ + part_util=$PARTUTIL + + while [ $# -ge 1 ]; do + case $1 in + -a) add="TRUE" && count="1$count";; + -d) del="TRUE" && count="1$count";; + -l) list="TRUE" && count="1$count";; + *) if [ -z "$vhd" ]; then vhd=$1; + elif [ -z "$lib" ]; then lib=$1; + else usage; + fi;; + esac + shift + done + + [[ -z "$lib" ]] && lib=$LIBVHDIO + [[ -z "$vhd" || "$count" != "1" ]] && usage + return 0 +} + +# screen-scraping of fdisk... not used +fdisk_read_partitions() +{ + local data=$(LD_PRELOAD=$lib fdisk -l $vhd) + + local none=$(echo $data | grep "This doesn''t look like a partition table") + [[ -n "$none" ]] && partitions=0 && return 0 + + partitions=4 + while [[ "$partitions" != "0" ]]; do + local hit=$(echo $data | grep "${vhd}$partitions") + [[ -n "$hit" ]] && break + let partitions=$partitions-1 + done +} + +part_util_read_partitions() +{ + partitions=$(LD_PRELOAD=$lib $part_util -c -i $vhd) +} + +list_mappings() +{ + local parts=1 + while [[ $parts -le $partitions ]]; do + echo ${vhd}$parts + let parts=$parts+1 + done +} + +add_mappings() +{ + local parts=1 + local path=$(realpath $vhd) + while [[ $parts -le $partitions ]]; do + [[ -e ${path}${parts} ]] || ln -s $(basename $path) ${path}$parts + let parts=$parts+1 + done +} + +del_mappings() +{ + local parts=1 + while [[ $parts -le $partitions ]]; do + [[ -L ${vhd}$parts ]] && rm -f ${vhd}$parts + let parts=$parts+1 + done +} + +main() +{ + parse_args $@ + [[ -x $part_util ]] || die "can''t find part-util" + 
[[ -r $vhd && -r $lib ]] || die "can''t find vhd or lib" + + part_util_read_partitions + + [[ -n "$add" ]] && add_mappings + [[ -n "$del" ]] && del_mappings + [[ -n "$list" ]] && list_mappings + + return 0 +} + +main $@ diff --git a/tools/blktap3/tapback/Makefile b/tools/blktap3/tapback/Makefile --- a/tools/blktap3/tapback/Makefile +++ b/tools/blktap3/tapback/Makefile @@ -3,8 +3,6 @@ include $(XEN_ROOT)/tools/Rules.mk BLKTAP_ROOT := .. -INST_DIR ?= $(BINDIR) - IBIN = tapback # -D_GNU_SOURCE is required by vasprintf. @@ -39,8 +37,8 @@ all: $(IBIN) $(CC) -o $@ $^ $(TAPBACK-LIBS) $(LDFLAGS) install: all - $(INSTALL_DIR) -p $(DESTDIR)$(INST_DIR) - $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(INST_DIR) + $(INSTALL_DIR) -p $(DESTDIR)$(BINDIR) + $(INSTALL_PROG) $(IBIN) $(DESTDIR)$(BINDIR) clean: rm -f *.o *.o.d .*.o.d $(IBIN) diff --git a/tools/blktap3/vhd/Makefile b/tools/blktap3/vhd/Makefile --- a/tools/blktap3/vhd/Makefile +++ b/tools/blktap3/vhd/Makefile @@ -5,11 +5,40 @@ include $(XEN_ROOT)/tools/Rules.mk SUBDIRS-y : SUBDIRS-y += lib -all: subdirs-all +IBIN = vhd-util3 vhd-index3 vhd-update3 -LIBS_DEPENDS := lib/libvhd.so lib/libvhd.a -$(LIBS_DEPENDS): subdirs-all +override CFLAGS += \ + -fno-strict-aliasing \ + -I$(BLKTAP_ROOT)/include \ + $(CFLAGS_libxenctrl) \ + -D_GNU_SOURCE \ + -DUSE_NFS_LOCKS \ + -Werror \ + -Wall \ + -Wextra + +ifeq ($(VHD_STATIC),y) +CFLAGS += -static +endif + +LIBS := -Llib -lvhd +LIBS += -luuid + +all: subdirs-all build + +build: $(IBIN) + +vhd-util3: vhd-util.o + $(CC) $(CFLAGS) -o vhd-util3 vhd-util.o $(LIBS) + +vhd-index3: vhd-index.o + $(CC) $(CFLAGS) -o vhd-index3 vhd-index.o $(LIBS) + +vhd-update3: vhd-update.o + $(CC) $(CFLAGS) -o vhd-update3 vhd-update.o $(LIBS) + +# FIXME Must install vhd-* binaries install: all $(MAKE) subdirs-install diff --git a/tools/blktap3/vhd/lib/Makefile b/tools/blktap3/vhd/lib/Makefile --- a/tools/blktap3/vhd/lib/Makefile +++ b/tools/blktap3/vhd/lib/Makefile @@ -13,6 +13,7 @@ INST-DIR = $(LIBDIR) override CFLAGS += 
\ -I$(BLKTAP_ROOT)/include \ -I$(BLKTAP_ROOT)/part \ + -I$(BLKTAP_ROOT)/lvm \ -D_GNU_SOURCE \ -fPIC \ $(CFLAGS_xeninclude) \ @@ -34,10 +35,29 @@ LIBS += -liconv endif LIB-SRCS := libvhd.c -# TODO Not in Citrix blktap2, import it. -#LIB-SRCS += vhd-util-uuid.c LIB-SRCS += relative-path.c LIB-SRCS += atomicio.c +LIB-SRCS += libvhd-index.c +LIB-SRCS += libvhd-journal.c +LIB-SRCS += vhd-util-coalesce.c +LIB-SRCS += vhd-util-create.c +LIB-SRCS += vhd-util-fill.c +LIB-SRCS += vhd-util-modify.c +LIB-SRCS += vhd-util-query.c +LIB-SRCS += vhd-util-read.c +LIB-SRCS += vhd-util-repair.c +LIB-SRCS += vhd-util-resize.c +LIB-SRCS += vhd-util-revert.c +LIB-SRCS += vhd-util-set-field.c +LIB-SRCS += vhd-util-snapshot.c +LIB-SRCS += vhd-util-scan.c +LIB-SRCS += vhd-util-check.c + +# FIXME hack, make it a shared lib +LIB-SRCS += $(BLKTAP_ROOT)/lvm/lvm-util.c + +# TODO Not in blktap2.5. +#LIB-SRCS += vhd-util-uuid.c LIB-OBJS = $(patsubst %.c,%.o,$(LIB-SRCS)) diff --git a/tools/blktap3/vhd/lib/atomicio.c b/tools/blktap3/vhd/lib/atomicio.c --- a/tools/blktap3/vhd/lib/atomicio.c +++ b/tools/blktap3/vhd/lib/atomicio.c @@ -40,7 +40,7 @@ atomicio(f, fd, _s, n) { char *s = _s; size_t pos = 0; - size_t res; + ssize_t res; while (n > pos) { res = (f) (fd, s + pos, n - pos); diff --git a/tools/blktap3/vhd/lib/libvhd-journal.c b/tools/blktap3/vhd/lib/libvhd-journal.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/vhd/lib/libvhd-journal.c @@ -0,0 +1,1540 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> + +#include "atomicio.h" +#include "libvhd-journal.h" + +#define VHD_JOURNAL_ENTRY_TYPE_FOOTER_P 1 +#define VHD_JOURNAL_ENTRY_TYPE_FOOTER_C 2 +#define VHD_JOURNAL_ENTRY_TYPE_HEADER 3 +#define VHD_JOURNAL_ENTRY_TYPE_LOCATOR 4 +#define VHD_JOURNAL_ENTRY_TYPE_BAT 5 +#define VHD_JOURNAL_ENTRY_TYPE_BATMAP_H 6 +#define VHD_JOURNAL_ENTRY_TYPE_BATMAP_M 7 +#define VHD_JOURNAL_ENTRY_TYPE_DATA 8 + +typedef struct vhd_journal_entry { + uint64_t cookie; + uint32_t type; + uint32_t size; + uint64_t offset; + uint32_t checksum; +} vhd_journal_entry_t; + +static inline int +vhd_journal_seek(vhd_journal_t *j, off64_t offset, int whence) +{ + off64_t off; + + off = lseek64(j->jfd, offset, whence); + if (off == (off64_t)-1) + return -errno; + + return 0; +} + +static inline off64_t +vhd_journal_position(vhd_journal_t *j) +{ + return lseek64(j->jfd, 0, SEEK_CUR); +} + +static inline int +vhd_journal_read(vhd_journal_t *j, void *buf, size_t size) +{ + ssize_t ret; + + errno = 0; + + ret = atomicio(read, j->jfd, buf, size); + if (ret != size) + return (errno ? -errno : -EIO); + + return 0; +} + +static inline int +vhd_journal_write(vhd_journal_t *j, void *buf, size_t size) +{ + ssize_t ret; + + errno = 0; + + ret = atomicio(vwrite, j->jfd, buf, size); + if (ret != size) + return (errno ? 
-errno : -EIO); + + return 0; +} + +static inline int +vhd_journal_truncate(vhd_journal_t *j, off64_t length) +{ + int err; + + err = ftruncate(j->jfd, length); + if (err == -1) + return -errno; + + return 0; +} + +static inline int +vhd_journal_sync(vhd_journal_t *j) +{ + int err; + + err = fdatasync(j->jfd); + if (err) + return -errno; + + return 0; +} + +static inline void +vhd_journal_header_in(vhd_journal_header_t *header) +{ + BE64_IN(&header->vhd_footer_offset); + BE32_IN(&header->journal_data_entries); + BE32_IN(&header->journal_metadata_entries); + BE64_IN(&header->journal_data_offset); + BE64_IN(&header->journal_metadata_offset); +} + +static inline void +vhd_journal_header_out(vhd_journal_header_t *header) +{ + BE64_OUT(&header->vhd_footer_offset); + BE32_OUT(&header->journal_data_entries); + BE32_OUT(&header->journal_metadata_entries); + BE64_OUT(&header->journal_data_offset); + BE64_OUT(&header->journal_metadata_offset); +} + +static int +vhd_journal_validate_header(vhd_journal_t *j, vhd_journal_header_t *header) +{ + int err; + off64_t eof; + + if (memcmp(header->cookie, + VHD_JOURNAL_HEADER_COOKIE, sizeof(header->cookie))) + return -EINVAL; + + err = vhd_journal_seek(j, j->header.journal_eof, SEEK_SET); + if (err) + return err; + + eof = vhd_journal_position(j); + if (eof == (off64_t)-1) + return -errno; + + if (j->header.journal_data_offset > j->header.journal_eof) + return -EINVAL; + + if (j->header.journal_metadata_offset > j->header.journal_eof) + return -EINVAL; + + return 0; +} + +static int +vhd_journal_read_journal_header(vhd_journal_t *j, vhd_journal_header_t *header) +{ + int err; + size_t size; + + size = sizeof(vhd_journal_header_t); + err = vhd_journal_seek(j, 0, SEEK_SET); + if (err) + return err; + + err = vhd_journal_read(j, header, size); + if (err) + return err; + + vhd_journal_header_in(header); + + return vhd_journal_validate_header(j, header); +} + +static int +vhd_journal_write_header(vhd_journal_t *j, vhd_journal_header_t 
*header) +{ + int err; + size_t size; + vhd_journal_header_t h; + + memcpy(&h, header, sizeof(vhd_journal_header_t)); + + err = vhd_journal_validate_header(j, &h); + if (err) + return err; + + vhd_journal_header_out(&h); + size = sizeof(vhd_journal_header_t); + + err = vhd_journal_seek(j, 0, SEEK_SET); + if (err) + return err; + + err = vhd_journal_write(j, &h, size); + if (err) + return err; + + return 0; +} + +static int +vhd_journal_add_journal_header(vhd_journal_t *j) +{ + int err; + off64_t off; + vhd_context_t *vhd; + + vhd = &j->vhd; + memset(&j->header, 0, sizeof(vhd_journal_header_t)); + + err = vhd_seek(vhd, 0, SEEK_END); + if (err) + return err; + + off = vhd_position(vhd); + if (off == (off64_t)-1) + return -errno; + + err = vhd_get_footer(vhd); + if (err) + return err; + + uuid_copy(j->header.uuid, vhd->footer.uuid); + memcpy(j->header.cookie, + VHD_JOURNAL_HEADER_COOKIE, sizeof(j->header.cookie)); + j->header.vhd_footer_offset = off - sizeof(vhd_footer_t); + j->header.journal_eof = sizeof(vhd_journal_header_t); + + return vhd_journal_write_header(j, &j->header); +} + +static void +vhd_journal_entry_in(vhd_journal_entry_t *entry) +{ + BE32_IN(&entry->type); + BE32_IN(&entry->size); + BE64_IN(&entry->offset); + BE64_IN(&entry->cookie); + BE32_IN(&entry->checksum); +} + +static void +vhd_journal_entry_out(vhd_journal_entry_t *entry) +{ + BE32_OUT(&entry->type); + BE32_OUT(&entry->size); + BE64_OUT(&entry->offset); + BE64_OUT(&entry->cookie); + BE32_OUT(&entry->checksum); +} + +static uint32_t +vhd_journal_checksum_entry(vhd_journal_entry_t *entry, char *buf, size_t size) +{ + int i; + unsigned char *blob; + uint32_t checksum, tmp; + + checksum = 0; + tmp = entry->checksum; + entry->checksum = 0; + + blob = (unsigned char *)entry; + for (i = 0; i < sizeof(vhd_journal_entry_t); i++) + checksum += blob[i]; + + blob = (unsigned char *)buf; + for (i = 0; i < size; i++) + checksum += blob[i]; + + entry->checksum = tmp; + return ~checksum; +} + +static int 
+vhd_journal_validate_entry(vhd_journal_entry_t *entry) +{ + if (entry->size == 0) + return -EINVAL; + + if (entry->size & (VHD_SECTOR_SIZE - 1)) + return -EINVAL; + + if (entry->cookie != VHD_JOURNAL_ENTRY_COOKIE) + return -EINVAL; + + return 0; +} + +static int +vhd_journal_read_entry(vhd_journal_t *j, vhd_journal_entry_t *entry) +{ + int err; + + err = vhd_journal_read(j, entry, sizeof(vhd_journal_entry_t)); + if (err) + return err; + + vhd_journal_entry_in(entry); + return vhd_journal_validate_entry(entry); +} + +static int +vhd_journal_write_entry(vhd_journal_t *j, vhd_journal_entry_t *entry) +{ + int err; + vhd_journal_entry_t e; + + err = vhd_journal_validate_entry(entry); + if (err) + return err; + + memcpy(&e, entry, sizeof(vhd_journal_entry_t)); + vhd_journal_entry_out(&e); + + err = vhd_journal_write(j, &e, sizeof(vhd_journal_entry_t)); + if (err) + return err; + + return 0; +} + +static int +vhd_journal_validate_entry_data(vhd_journal_entry_t *entry, char *buf) +{ + int err; + uint32_t checksum; + + err = 0; + checksum = vhd_journal_checksum_entry(entry, buf, entry->size); + + if (checksum != entry->checksum) + return -EINVAL; + + return err; +} + +static int +vhd_journal_update(vhd_journal_t *j, off64_t offset, + char *buf, size_t size, uint32_t type) +{ + int err; + uint64_t *off, off_bak; + uint32_t *entries; + vhd_journal_entry_t entry; + + entry.type = type; + entry.size = size; + entry.offset = offset; + entry.cookie = VHD_JOURNAL_ENTRY_COOKIE; + entry.checksum = vhd_journal_checksum_entry(&entry, buf, size); + + err = vhd_journal_seek(j, j->header.journal_eof, SEEK_SET); + if (err) + return err; + + err = vhd_journal_write_entry(j, &entry); + if (err) + goto fail; + + err = vhd_journal_write(j, buf, size); + if (err) + goto fail; + + if (type == VHD_JOURNAL_ENTRY_TYPE_DATA) { + off = &j->header.journal_data_offset; + entries = &j->header.journal_data_entries; + } else { + off = &j->header.journal_metadata_offset; + entries = 
&j->header.journal_metadata_entries; + } + + off_bak = *off; + if (!(*entries)++) + *off = j->header.journal_eof; + j->header.journal_eof += (size + sizeof(vhd_journal_entry_t)); + + err = vhd_journal_write_header(j, &j->header); + if (err) { + if (!--(*entries)) + *off = off_bak; + j->header.journal_eof -= (size + sizeof(vhd_journal_entry_t)); + goto fail; + } + + return 0; + +fail: + if (!j->is_block) + vhd_journal_truncate(j, j->header.journal_eof); + return err; +} + +static int +vhd_journal_add_footer(vhd_journal_t *j) +{ + int err; + off64_t off; + vhd_context_t *vhd; + vhd_footer_t footer; + + vhd = &j->vhd; + + err = vhd_seek(vhd, 0, SEEK_END); + if (err) + return err; + + off = vhd_position(vhd); + if (off == (off64_t)-1) + return -errno; + + err = vhd_read_footer_at(vhd, &footer, off - sizeof(vhd_footer_t)); + if (err) + return err; + + vhd_footer_out(&footer); + err = vhd_journal_update(j, off - sizeof(vhd_footer_t), + (char *)&footer, + sizeof(vhd_footer_t), + VHD_JOURNAL_ENTRY_TYPE_FOOTER_P); + if (err) + return err; + + if (!vhd_type_dynamic(vhd)) + return 0; + + err = vhd_read_footer_at(vhd, &footer, 0); + if (err) + return err; + + vhd_footer_out(&footer); + err = vhd_journal_update(j, 0, + (char *)&footer, + sizeof(vhd_footer_t), + VHD_JOURNAL_ENTRY_TYPE_FOOTER_C); + + return err; +} + +static int +vhd_journal_add_header(vhd_journal_t *j) +{ + int err; + off64_t off; + vhd_context_t *vhd; + vhd_header_t header; + + vhd = &j->vhd; + + err = vhd_read_header(vhd, &header); + if (err) + return err; + + off = vhd->footer.data_offset; + + vhd_header_out(&header); + err = vhd_journal_update(j, off, + (char *)&header, + sizeof(vhd_header_t), + VHD_JOURNAL_ENTRY_TYPE_HEADER); + + return err; +} + +static int +vhd_journal_add_locators(vhd_journal_t *j) +{ + int i, n, err; + vhd_context_t *vhd; + + vhd = &j->vhd; + + err = vhd_get_header(vhd); + if (err) + return err; + + n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t); + for (i = 0; i < n; i++) { 
+ void *buf; + off64_t off; + size_t size; + vhd_parent_locator_t *loc; + + loc = vhd->header.loc + i; + err = vhd_validate_platform_code(loc->code); + if (err) + return err; + + if (loc->code == PLAT_CODE_NONE) + continue; + + off = loc->data_offset; + size = vhd_parent_locator_size(loc); + + err = posix_memalign(&buf, VHD_SECTOR_SIZE, size); + if (err) + return -err; + + err = vhd_seek(vhd, off, SEEK_SET); + if (err) + goto end; + + err = vhd_read(vhd, buf, size); + if (err) + goto end; + + err = vhd_journal_update(j, off, buf, size, + VHD_JOURNAL_ENTRY_TYPE_LOCATOR); + if (err) + goto end; + + err = 0; + + end: + free(buf); + if (err) + break; + } + + return err; +} + +static int +vhd_journal_add_bat(vhd_journal_t *j) +{ + int err; + off64_t off; + size_t size; + vhd_bat_t bat; + vhd_context_t *vhd; + + vhd = &j->vhd; + + err = vhd_get_header(vhd); + if (err) + return err; + + err = vhd_read_bat(vhd, &bat); + if (err) + return err; + + off = vhd->header.table_offset; + size = vhd_bytes_padded(bat.entries * sizeof(uint32_t)); + + vhd_bat_out(&bat); + err = vhd_journal_update(j, off, (char *)bat.bat, size, + VHD_JOURNAL_ENTRY_TYPE_BAT); + + free(bat.bat); + return err; +} + +static int +vhd_journal_add_batmap(vhd_journal_t *j) +{ + int err; + off64_t off; + size_t size; + vhd_context_t *vhd; + vhd_batmap_t batmap; + + vhd = &j->vhd; + + err = vhd_batmap_header_offset(vhd, &off); + if (err) + return err; + + err = vhd_read_batmap(vhd, &batmap); + if (err) + return err; + + size = vhd_bytes_padded(sizeof(struct dd_batmap_hdr)); + + vhd_batmap_header_out(&batmap); + err = vhd_journal_update(j, off, (char *)&batmap.header, size, + VHD_JOURNAL_ENTRY_TYPE_BATMAP_H); + if (err) + goto out; + + vhd_batmap_header_in(&batmap); + off = batmap.header.batmap_offset; + size = vhd_sectors_to_bytes(batmap.header.batmap_size); + + err = vhd_journal_update(j, off, batmap.map, size, + VHD_JOURNAL_ENTRY_TYPE_BATMAP_M); + +out: + free(batmap.map); + return err; +} + +static int 
+vhd_journal_add_metadata(vhd_journal_t *j) +{ + int err; + vhd_context_t *vhd; + + vhd = &j->vhd; + + err = vhd_journal_add_footer(j); + if (err) + return err; + + if (!vhd_type_dynamic(vhd)) + return 0; + + err = vhd_journal_add_header(j); + if (err) + return err; + + err = vhd_journal_add_locators(j); + if (err) + return err; + + err = vhd_journal_add_bat(j); + if (err) + return err; + + if (vhd_has_batmap(vhd)) { + err = vhd_journal_add_batmap(j); + if (err) + return err; + } + + j->header.journal_data_offset = j->header.journal_eof; + return vhd_journal_write_header(j, &j->header); +} + +static int +__vhd_journal_read_footer(vhd_journal_t *j, + vhd_footer_t *footer, uint32_t type) +{ + int err; + vhd_journal_entry_t entry; + + err = vhd_journal_read_entry(j, &entry); + if (err) + return err; + + if (entry.type != type) + return -EINVAL; + + if (entry.size != sizeof(vhd_footer_t)) + return -EINVAL; + + err = vhd_journal_read(j, footer, entry.size); + if (err) + return err; + + vhd_footer_in(footer); + return vhd_validate_footer(footer); +} + +static int +vhd_journal_read_footer(vhd_journal_t *j, vhd_footer_t *footer) +{ + return __vhd_journal_read_footer(j, footer, + VHD_JOURNAL_ENTRY_TYPE_FOOTER_P); +} + +static int +vhd_journal_read_footer_copy(vhd_journal_t *j, vhd_footer_t *footer) +{ + return __vhd_journal_read_footer(j, footer, + VHD_JOURNAL_ENTRY_TYPE_FOOTER_C); +} + +static int +vhd_journal_read_header(vhd_journal_t *j, vhd_header_t *header) +{ + int err; + vhd_journal_entry_t entry; + + err = vhd_journal_read_entry(j, &entry); + if (err) + return err; + + if (entry.type != VHD_JOURNAL_ENTRY_TYPE_HEADER) + return -EINVAL; + + if (entry.size != sizeof(vhd_header_t)) + return -EINVAL; + + err = vhd_journal_read(j, header, entry.size); + if (err) + return err; + + vhd_header_in(header); + return vhd_validate_header(header); +} + +static int +vhd_journal_read_locators(vhd_journal_t *j, char ***locators, int *locs) +{ + int err, n, _locs; + char **_locators; 
+ void *buf; + off_t pos; + vhd_journal_entry_t entry; + + _locs = 0; + *locs = 0; + *locators = NULL; + + n = sizeof(j->vhd.header.loc) / sizeof(vhd_parent_locator_t); + _locators = calloc(n, sizeof(char *)); + if (!_locators) + return -ENOMEM; + + for (;;) { + buf = NULL; + + pos = vhd_journal_position(j); + err = vhd_journal_read_entry(j, &entry); + if (err) + goto fail; + + if (entry.type != VHD_JOURNAL_ENTRY_TYPE_LOCATOR) { + err = vhd_journal_seek(j, pos, SEEK_SET); + if (err) + goto fail; + break; + } + + if (_locs >= n) { + err = -EINVAL; + goto fail; + } + + err = posix_memalign(&buf, VHD_SECTOR_SIZE, entry.size); + if (err) { + err = -err; + buf = NULL; + goto fail; + } + + err = vhd_journal_read(j, buf, entry.size); + if (err) + goto fail; + + _locators[_locs++] = buf; + err = 0; + } + + + *locs = _locs; + *locators = _locators; + + return 0; + +fail: + if (_locators) { + for (n = 0; n < _locs; n++) + free(_locators[n]); + free(_locators); + } + return err; +} + +static int +vhd_journal_read_bat(vhd_journal_t *j, vhd_bat_t *bat) +{ + int err; + size_t size; + vhd_context_t *vhd; + vhd_journal_entry_t entry; + void *_bat; + + vhd = &j->vhd; + + size = vhd_bytes_padded(vhd->header.max_bat_size * sizeof(uint32_t)); + + err = vhd_journal_read_entry(j, &entry); + if (err) + return err; + + if (entry.type != VHD_JOURNAL_ENTRY_TYPE_BAT) + return -EINVAL; + + if (entry.size != size) + return -EINVAL; + + if (entry.offset != vhd->header.table_offset) + return -EINVAL; + + err = posix_memalign(&_bat, VHD_SECTOR_SIZE, size); + if (err) + return -err; + bat->bat = _bat; + + err = vhd_journal_read(j, bat->bat, entry.size); + if (err) + goto fail; + + bat->spb = vhd->header.block_size >> VHD_SECTOR_SHIFT; + bat->entries = vhd->header.max_bat_size; + vhd_bat_in(bat); + + return 0; + +fail: + free(bat->bat); + bat->bat = NULL; + return err; +} + +static int +vhd_journal_read_batmap_header(vhd_journal_t *j, vhd_batmap_t *batmap) +{ + int err; + void *buf; + size_t size; 
+ vhd_journal_entry_t entry; + + size = vhd_bytes_padded(sizeof(struct dd_batmap_hdr)); + + err = vhd_journal_read_entry(j, &entry); + if (err) + return err; + + if (entry.type != VHD_JOURNAL_ENTRY_TYPE_BATMAP_H) + return -EINVAL; + + if (entry.size != size) + return -EINVAL; + + err = posix_memalign(&buf, VHD_SECTOR_SIZE, size); + if (err) + return err; + + err = vhd_journal_read(j, buf, entry.size); + if (err) { + free(buf); + return err; + } + + memcpy(&batmap->header, buf, sizeof(batmap->header)); + + vhd_batmap_header_in(batmap); + return vhd_validate_batmap_header(batmap); +} + +static int +vhd_journal_read_batmap_map(vhd_journal_t *j, vhd_batmap_t *batmap) +{ + int err; + vhd_journal_entry_t entry; + void *map; + + err = vhd_journal_read_entry(j, &entry); + if (err) + return err; + + if (entry.type != VHD_JOURNAL_ENTRY_TYPE_BATMAP_M) + return -EINVAL; + + if (entry.size != vhd_sectors_to_bytes(batmap->header.batmap_size)) + return -EINVAL; + + if (entry.offset != batmap->header.batmap_offset) + return -EINVAL; + + err = posix_memalign(&map, VHD_SECTOR_SIZE, entry.size); + if (err) + return -err; + + batmap->map = map; + + err = vhd_journal_read(j, batmap->map, entry.size); + if (err) { + free(batmap->map); + batmap->map = NULL; + return err; + } + + return 0; +} + +static int +vhd_journal_read_batmap(vhd_journal_t *j, vhd_batmap_t *batmap) +{ + int err; + + err = vhd_journal_read_batmap_header(j, batmap); + if (err) + return err; + + err = vhd_journal_read_batmap_map(j, batmap); + if (err) + return err; + + err = vhd_validate_batmap(&j->vhd, batmap); + if (err) { + free(batmap->map); + batmap->map = NULL; + return err; + } + + return 0; +} + +static int +vhd_journal_restore_footer(vhd_journal_t *j, vhd_footer_t *footer) +{ + return vhd_write_footer_at(&j->vhd, footer, + j->header.vhd_footer_offset); +} + +static int +vhd_journal_restore_footer_copy(vhd_journal_t *j, vhd_footer_t *footer) +{ + return vhd_write_footer_at(&j->vhd, footer, 0); +} + +static int 
+vhd_journal_restore_header(vhd_journal_t *j, vhd_header_t *header) +{ + off64_t off; + vhd_context_t *vhd; + + vhd = &j->vhd; + off = vhd->footer.data_offset; + + return vhd_write_header_at(&j->vhd, header, off); +} + +static int +vhd_journal_restore_locators(vhd_journal_t *j, char **locators, int locs) +{ + size_t size; + vhd_context_t *vhd; + int i, n, lidx, err; + vhd_parent_locator_t *loc; + + lidx = 0; + vhd = &j->vhd; + + n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t); + + for (i = 0; i < n && lidx < locs; i++) { + loc = vhd->header.loc + i; + if (loc->code == PLAT_CODE_NONE) + continue; + + err = vhd_seek(vhd, loc->data_offset, SEEK_SET); + if (err) + return err; + + size = vhd_parent_locator_size(loc); + err = vhd_write(vhd, locators[lidx++], size); + if (err) + return err; + } + + return 0; +} + +static int +vhd_journal_restore_bat(vhd_journal_t *j, vhd_bat_t *bat) +{ + return vhd_write_bat(&j->vhd, bat); +} + +static int +vhd_journal_restore_batmap(vhd_journal_t *j, vhd_batmap_t *batmap) +{ + return vhd_write_batmap(&j->vhd, batmap); +} + +static int +vhd_journal_restore_metadata(vhd_journal_t *j) +{ + off64_t off; + char **locators; + vhd_footer_t copy; + vhd_context_t *vhd; + int i, locs, hlocs, err; + + vhd = &j->vhd; + locs = 0; + hlocs = 0; + locators = NULL; + + err = vhd_journal_seek(j, sizeof(vhd_journal_header_t), SEEK_SET); + if (err) + return err; + + err = vhd_journal_read_footer(j, &vhd->footer); + if (err) + return err; + + if (!vhd_type_dynamic(vhd)) + goto restore; + + err = vhd_journal_read_footer_copy(j, ©); + if (err) + return err; + + err = vhd_journal_read_header(j, &vhd->header); + if (err) + return err; + + for (hlocs = 0, i = 0; i < vhd_parent_locator_count(vhd); i++) { + if (vhd_validate_platform_code(vhd->header.loc[i].code)) + return err; + + if (vhd->header.loc[i].code != PLAT_CODE_NONE) + hlocs++; + } + + if (hlocs) { + err = vhd_journal_read_locators(j, &locators, &locs); + if (err) + return err; + + if (hlocs != 
locs) { + err = -EINVAL; + goto out; + } + } + + err = vhd_journal_read_bat(j, &vhd->bat); + if (err) + goto out; + + if (vhd_has_batmap(vhd)) { + err = vhd_journal_read_batmap(j, &vhd->batmap); + if (err) + goto out; + } + +restore: + off = vhd_journal_position(j); + if (off == (off64_t)-1) + return -errno; + + if (j->header.journal_data_offset != off) + return -EINVAL; + + err = vhd_journal_restore_footer(j, &vhd->footer); + if (err) + goto out; + + if (!vhd_type_dynamic(vhd)) + goto out; + + err = vhd_journal_restore_footer_copy(j, ©); + if (err) + goto out; + + err = vhd_journal_restore_header(j, &vhd->header); + if (err) + goto out; + + if (locs) { + err = vhd_journal_restore_locators(j, locators, locs); + if (err) + goto out; + } + + err = vhd_journal_restore_bat(j, &vhd->bat); + if (err) + goto out; + + if (vhd_has_batmap(vhd)) { + err = vhd_journal_restore_batmap(j, &vhd->batmap); + if (err) + goto out; + } + + err = 0; + +out: + if (locators) { + for (i = 0; i < locs; i++) + free(locators[i]); + free(locators); + } + + if (!err && !vhd->is_block) + err = ftruncate(vhd->fd, + j->header.vhd_footer_offset + + sizeof(vhd_footer_t)); + + return err; +} + +static int +vhd_journal_disable_vhd(vhd_journal_t *j) +{ + int err; + vhd_context_t *vhd; + + vhd = &j->vhd; + + err = vhd_get_footer(vhd); + if (err) + return err; + + memcpy(&vhd->footer.cookie, + VHD_POISON_COOKIE, sizeof(vhd->footer.cookie)); + vhd->footer.checksum = vhd_checksum_footer(&vhd->footer); + + err = vhd_write_footer(vhd, &vhd->footer); + if (err) + return err; + + return 0; +} + +static int +vhd_journal_enable_vhd(vhd_journal_t *j) +{ + int err; + vhd_context_t *vhd; + + vhd = &j->vhd; + + err = vhd_get_footer(vhd); + if (err) + return err; + + if (!vhd_disabled(vhd)) + return 0; + + memcpy(&vhd->footer.cookie, HD_COOKIE, sizeof(vhd->footer.cookie)); + vhd->footer.checksum = vhd_checksum_footer(&vhd->footer); + + err = vhd_write_footer(vhd, &vhd->footer); + if (err) + return err; + + return 0; 
+} + +int +vhd_journal_close(vhd_journal_t *j) +{ + if (j->jfd) + close(j->jfd); + + vhd_close(&j->vhd); + free(j->jname); + + return 0; +} + +int +vhd_journal_remove(vhd_journal_t *j) +{ + int err; + + err = vhd_journal_enable_vhd(j); + if (err) + return err; + + if (j->jfd) { + close(j->jfd); + if (!j->is_block) + unlink(j->jname); + } + + vhd_close(&j->vhd); + free(j->jname); + + return 0; +} + +int +vhd_journal_open(vhd_journal_t *j, const char *file, const char *jfile) +{ + int err; + vhd_context_t *vhd; + + memset(j, 0, sizeof(vhd_journal_t)); + + j->jfd = -1; + vhd = &j->vhd; + + j->jname = strdup(jfile); + if (j->jname == NULL) + return -ENOMEM; + + j->jfd = open(j->jname, O_LARGEFILE | O_RDWR); + if (j->jfd == -1) { + err = -errno; + goto fail; + } + + err = vhd_test_file_fixed(j->jname, &j->is_block); + if (err) + goto fail; + + vhd->fd = open(file, O_LARGEFILE | O_RDWR | O_DIRECT); + if (vhd->fd == -1) { + err = -errno; + goto fail; + } + + err = vhd_test_file_fixed(file, &vhd->is_block); + if (err) + goto fail; + + err = vhd_journal_read_journal_header(j, &j->header); + if (err) + goto fail; + + err = vhd_journal_restore_metadata(j); + if (err) + goto fail; + + close(vhd->fd); + free(vhd->bat.bat); + free(vhd->batmap.map); + + err = vhd_open(vhd, file, VHD_OPEN_RDWR); + if (err) + goto fail; + + err = vhd_get_bat(vhd); + if (err) + goto fail; + + if (vhd_has_batmap(vhd)) { + err = vhd_get_batmap(vhd); + if (err) + goto fail; + } + + err = vhd_journal_disable_vhd(j); + if (err) + goto fail; + + return 0; + +fail: + vhd_journal_close(j); + return err; +} + +int +vhd_journal_create(vhd_journal_t *j, const char *file, const char *jfile) +{ + int err; + + memset(j, 0, sizeof(vhd_journal_t)); + j->jfd = -1; + + j->jname = strdup(jfile); + if (j->jname == NULL) { + err = -ENOMEM; + goto fail1; + } + + if (access(j->jname, F_OK) == 0) { + err = vhd_test_file_fixed(j->jname, &j->is_block); + if (err) + goto fail1; + + if (!j->is_block) { + err = -EEXIST; + goto 
fail1; + } + } + + if (j->is_block) + j->jfd = open(j->jname, O_LARGEFILE | O_RDWR, 0644); + else + j->jfd = open(j->jname, + O_CREAT | O_TRUNC | O_LARGEFILE | O_RDWR, 0644); + if (j->jfd == -1) { + err = -errno; + goto fail1; + } + + err = vhd_open(&j->vhd, file, VHD_OPEN_RDWR | VHD_OPEN_STRICT); + if (err) + goto fail1; + + err = vhd_get_bat(&j->vhd); + if (err) + goto fail2; + + if (vhd_has_batmap(&j->vhd)) { + err = vhd_get_batmap(&j->vhd); + if (err) + goto fail2; + } + + err = vhd_journal_add_journal_header(j); + if (err) + goto fail2; + + err = vhd_journal_add_metadata(j); + if (err) + goto fail2; + + err = vhd_journal_disable_vhd(j); + if (err) + goto fail2; + + err = vhd_journal_sync(j); + if (err) + goto fail2; + + return 0; + +fail1: + if (j->jfd != -1) { + close(j->jfd); + if (!j->is_block) + unlink(j->jname); + } + free(j->jname); + memset(j, 0, sizeof(vhd_journal_t)); + + return err; + +fail2: + vhd_journal_remove(j); + return err; +} + +int +vhd_journal_add_block(vhd_journal_t *j, uint32_t block, char mode) +{ + int err; + char *buf; + off64_t off; + size_t size; + uint64_t blk; + vhd_context_t *vhd; + + buf = NULL; + vhd = &j->vhd; + + if (!vhd_type_dynamic(vhd)) + return -EINVAL; + + err = vhd_get_bat(vhd); + if (err) + return err; + + if (block >= vhd->bat.entries) + return -ERANGE; + + blk = vhd->bat.bat[block]; + if (blk == DD_BLK_UNUSED) + return 0; + + off = vhd_sectors_to_bytes(blk); + + if (mode & VHD_JOURNAL_METADATA) { + size = vhd_sectors_to_bytes(vhd->bm_secs); + + err = vhd_read_bitmap(vhd, block, &buf); + if (err) + return err; + + err = vhd_journal_update(j, off, buf, size, + VHD_JOURNAL_ENTRY_TYPE_DATA); + + free(buf); + + if (err) + return err; + } + + if (mode & VHD_JOURNAL_DATA) { + off += vhd_sectors_to_bytes(vhd->bm_secs); + size = vhd_sectors_to_bytes(vhd->spb); + + err = vhd_read_block(vhd, block, &buf); + if (err) + return err; + + err = vhd_journal_update(j, off, buf, size, + VHD_JOURNAL_ENTRY_TYPE_DATA); + free(buf); + + if 
(err) + return err; + } + + return vhd_journal_sync(j); +} + +/* + * commit indicates the transaction completed + * successfully and we can remove the undo log + */ +int +vhd_journal_commit(vhd_journal_t *j) +{ + int err; + + j->header.journal_data_entries = 0; + j->header.journal_metadata_entries = 0; + j->header.journal_data_offset = 0; + j->header.journal_metadata_offset = 0; + + err = vhd_journal_write_header(j, &j->header); + if (err) + return err; + + if (!j->is_block) + err = vhd_journal_truncate(j, sizeof(vhd_journal_header_t)); + if (err) + return -errno; + + return 0; +} + +/* + * revert indicates the transaction failed + * and we should revert any changes via the undo log + */ +int +vhd_journal_revert(vhd_journal_t *j) +{ + int i, err; + char *file; + void *buf; + vhd_context_t *vhd; + vhd_journal_entry_t entry; + + err = 0; + vhd = &j->vhd; + buf = NULL; + + file = strdup(vhd->file); + if (!file) + return -ENOMEM; + + vhd_close(&j->vhd); + j->vhd.fd = open(file, O_RDWR | O_DIRECT | O_LARGEFILE); + if (j->vhd.fd == -1) { + free(file); + return -errno; + } + + err = vhd_test_file_fixed(file, &vhd->is_block); + if (err) { + free(file); + return err; + } + + err = vhd_journal_restore_metadata(j); + if (err) { + free(file); + return err; + } + + close(vhd->fd); + free(vhd->bat.bat); + free(vhd->batmap.map); + + err = vhd_open(vhd, file, VHD_OPEN_RDWR); + free(file); + if (err) + return err; + + err = vhd_journal_seek(j, j->header.journal_data_offset, SEEK_SET); + if (err) + return err; + + for (i = 0; i < j->header.journal_data_entries; i++) { + err = vhd_journal_read_entry(j, &entry); + if (err) + goto end; + + err = posix_memalign(&buf, VHD_SECTOR_SIZE, entry.size); + if (err) { + err = -err; + buf = NULL; + goto end; + } + + err = vhd_journal_read(j, buf, entry.size); + if (err) + goto end; + + err = vhd_journal_validate_entry_data(&entry, buf); + if (err) + goto end; + + err = vhd_seek(vhd, entry.offset, SEEK_SET); + if (err) + goto end; + + err = 
vhd_write(vhd, buf, entry.size); + if (err) + goto end; + + err = 0; + + end: + free(buf); + buf = NULL; + if (err) + break; + } + + if (err) + return err; + + if (!vhd->is_block) { + err = ftruncate(vhd->fd, j->header.vhd_footer_offset + + sizeof(vhd_footer_t)); + if (err) + return -errno; + } + + return vhd_journal_sync(j); +} diff --git a/tools/blktap3/vhd/lib/libvhd.c b/tools/blktap3/vhd/lib/libvhd.c --- a/tools/blktap3/vhd/lib/libvhd.c +++ b/tools/blktap3/vhd/lib/libvhd.c @@ -44,6 +44,10 @@ #include "libvhd.h" #include "relative-path.h" +/* VHD uses an epoch of 12:00AM, Jan 1, 2000. This is the Unix timestamp for + * the start of the VHD epoch. */ +#define VHD_EPOCH_START 946684800 + #define VHD_HEADER_MAX_RETRIES 10 static int libvhd_dbg = 0; @@ -698,19 +702,10 @@ vhd_end_of_data(vhd_context_t *ctx, off6 return 0; } -uint32_t +inline uint32_t vhd_time(time_t time) { - struct tm tm; - time_t micro_epoch; - - memset(&tm, 0, sizeof(struct tm)); - tm.tm_year = 100; - tm.tm_mon = 0; - tm.tm_mday = 1; - micro_epoch = mktime(&tm); - - return (uint32_t)(time - micro_epoch); + return (uint32_t)(time - VHD_EPOCH_START); } /* @@ -721,20 +716,10 @@ size_t vhd_time_to_string(uint32_t timestamp, char *target) { char *cr; - struct tm tm; - time_t t1, t2; - - memset(&tm, 0, sizeof(struct tm)); - - /* VHD uses an epoch of 12:00AM, Jan 1, 2000. */ - /* Need to adjust this to the expected epoch of 1970. */ - tm.tm_year = 100; - tm.tm_mon = 0; - tm.tm_mday = 1; - - t1 = mktime(&tm); - t2 = t1 + (time_t)timestamp; - ctime_r(&t2, target); + time_t unix_timestamp; + + unix_timestamp = (time_t)timestamp + VHD_EPOCH_START; + ctime_r(&unix_timestamp, target); /* handle mad ctime_r newline appending. 
*/ if ((cr = strchr(target, ''\n'')) != NULL) @@ -2808,6 +2793,11 @@ vhd_change_parent(vhd_context_t *child, vhd_context_t parent; char __parent_path[PATH_MAX]; + if (child->footer.type != HD_TYPE_DIFF) { + VHDLOG("would-be child is not a differencing disk\n"); + return -EINVAL; + } + ppath = realpath(parent_path, __parent_path); if (!ppath) { VHDLOG("error resolving parent path %s for %s: %d\n", @@ -3225,22 +3215,29 @@ static int { off64_t off; uint32_t blk, sec; - int err, cnt, map_off; + int err, cnt, map_off, i; char *bitmap, *data, *src; map_off = 0; do { + data = NULL; + bitmap = NULL; + if (sector >= ctx->footer.curr_size >> VHD_SECTOR_SHIFT) { + cnt = secs; + for (i = 0; i < cnt; i++) + set_bit(map, map_off + i); + /* buf has already been zeroed out */ + goto next; + } + blk = sector / ctx->spb; sec = sector % ctx->spb; - off = ctx->bat.bat[blk]; - data = NULL; - bitmap = NULL; - - if (off == DD_BLK_UNUSED) { - cnt = MIN(secs, ctx->spb); + cnt = MIN(secs, ctx->spb - sec); + off = ctx->bat.bat[blk]; + + if (off == DD_BLK_UNUSED) goto next; - } err = vhd_read_bitmap(ctx, blk, &bitmap); if (err) @@ -3252,7 +3249,6 @@ static int return err; } - cnt = MIN(secs, ctx->spb - sec); src = data + vhd_sectors_to_bytes(sec); __vhd_io_dynamic_copy_data(ctx, diff --git a/tools/blktap3/vhd/lib/libvhdio.c b/tools/blktap3/vhd/lib/libvhdio.c --- a/tools/blktap3/vhd/lib/libvhdio.c +++ b/tools/blktap3/vhd/lib/libvhdio.c @@ -1381,17 +1381,25 @@ int __lxstat64(int version, const char * return ret; } -int ioctl(int fd, int request, char *argp) +#ifdef __x86_64__ +#define IOCTL_REQUEST long long +#define IOCTL_REQUEST_FMT "%Lx" +#else +#define IOCTL_REQUEST int +#define IOCTL_REQUEST_FMT "%x" +#endif + +int ioctl(int fd, IOCTL_REQUEST request, char *argp) { vhd_fd_context_t *vhd_fd; - static int (*_std_ioctl) (int, int, char *); + static int (*_std_ioctl) (int, IOCTL_REQUEST, char *); _RESOLVE(_std_ioctl); vhd_fd = _libvhd_io_map_get(fd); if (!vhd_fd) return _std_ioctl(fd, request, 
argp); - LOG("%s 0x%x 0x%x %p\n", __func__, fd, request, argp); + LOG("%s 0x%x 0x" IOCTL_REQUEST_FMT " %p\n", __func__, fd, request, argp); #ifdef BLKGETSIZE64 if (request == BLKGETSIZE64) { diff --git a/tools/blktap3/vhd/lib/test/random-copy.c b/tools/blktap3/vhd/lib/test/random-copy.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/vhd/lib/test/random-copy.c @@ -0,0 +1,226 @@ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <inttypes.h> +#include <sys/stat.h> + +struct range { + off64_t start; + off64_t end; +}; + +struct random_copy_ctx { + int sfd; + int dfd; + int total_chunks; + struct range *chunks; +}; + +static void +usage(const char *app, int err) +{ + printf("usage: %s <src> <dst>\n", app); + exit(err); +} + +static int +random_copy_carve_source(struct random_copy_ctx *ctx) +{ + int err, i, n; + struct stat64 st; + off64_t bytes, start; + + err = fstat64(ctx->sfd, &st); + if (err) { + perror("stat source"); + return errno; + } + + n = 100; + start = 0; + bytes = st.st_size; + + ctx->chunks = calloc(n, sizeof(struct range)); + if (!ctx->chunks) { + printf("calloc failed\n"); + return ENOMEM; + } + + for (i = 0; start < st.st_size; i++) { + int chunk; + off64_t end; + + if (i == n) { + struct range *new; + + n *= 2; + new = realloc(ctx->chunks, n * sizeof(struct range)); + if (!new) { + free(ctx->chunks); + ctx->chunks = NULL; + printf("realloc failed\n"); + return ENOMEM; + } + + ctx->chunks = new; + } + + chunk = (random() % (st.st_size / 10)) + 1; + end = start + chunk; + if (end >= st.st_size) + end = st.st_size - 1; + + ctx->chunks[i].start = start; + ctx->chunks[i].end = end; + + bytes -= (end - start); + start = end + 1; + } + + ctx->total_chunks = i; + + return 0; +} + +static int +random_copy_permute_source(struct random_copy_ctx *ctx) +{ + int i; + + for (i = 0; i < ctx->total_chunks; i++) { + int idx = 
random() % ctx->total_chunks; + struct range tmp = ctx->chunks[idx]; + ctx->chunks[idx] = ctx->chunks[i]; + ctx->chunks[i] = tmp; + } + + return 0; +} + +static int +random_copy_init(struct random_copy_ctx *ctx, const char *src, const char *dst) +{ + int err; + + memset(ctx, 0, sizeof(*ctx)); + ctx->sfd = ctx->dfd = -1; + + ctx->sfd = open(src, O_LARGEFILE | O_RDONLY); + if (ctx->sfd == -1) { + err = errno; + perror("opening source"); + goto fail; + } + + ctx->dfd = open(dst, O_LARGEFILE | O_WRONLY); + if (ctx->dfd == -1) { + err = errno; + perror("opening destination"); + goto fail; + } + + err = random_copy_carve_source(ctx); + if (err) { + printf("failed to carve source: %d\n", err); + goto fail; + } + + err = random_copy_permute_source(ctx); + if (err) { + printf("failed to permute source: %d\n", err); + goto fail; + } + + return 0; + +fail: + close(ctx->sfd); + close(ctx->dfd); + memset(ctx, 0, sizeof(*ctx)); + return err; +} + +static int +random_copy(struct random_copy_ctx *ctx) +{ + char *buf; + int i, err; + + for (i = 0; i < ctx->total_chunks; i++) { + struct range *r = &ctx->chunks[i]; + size_t count = r->end - r->start + 1; + + buf = calloc(1, count); + if (!buf) { + printf("calloc failed\n"); + return ENOMEM; + } + + fprintf(stderr, "copying 0x%zx from 0x%"PRIx64"\n", + count, r->start); + + err = pread(ctx->sfd, buf, count, r->start); + if (err != count) { + printf("pread(0x%zx 0x%"PRIx64") returned 0x%x (%d)\n", + count, r->start, err, errno); + free(buf); + return (errno ? : EIO); + } + + err = pwrite(ctx->dfd, buf, count, r->start); + if (err != count) { + printf("pwrite(0x%zx 0x%"PRIx64") returned 0x%x (%d)\n", + count, r->start, err, errno); + free(buf); + return (errno ? 
: EIO); + } + + free(buf); + } + + return 0; +} + +static void +random_copy_close(struct random_copy_ctx *ctx) +{ + close(ctx->sfd); + close(ctx->dfd); + free(ctx->chunks); +} + +int +main(int argc, char *argv[]) +{ + int err; + char *src, *dst; + struct random_copy_ctx ctx; + + if (argc != 3) + usage(argv[0], EINVAL); + + src = argv[1]; + dst = argv[2]; + + err = random_copy_init(&ctx, src, dst); + if (err) { + printf("failed to init: %d\n", err); + exit(err); + } + + err = random_copy(&ctx); + if (err) + printf("copy failed: %d\n", err); + + random_copy_close(&ctx); + + return err; +} diff --git a/tools/blktap3/vhd/lib/test/test-snapshot.c b/tools/blktap3/vhd/lib/test/test-snapshot.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/vhd/lib/test/test-snapshot.c @@ -0,0 +1,161 @@ +/* + * libvhdio.so supports a simple test hook for validating vhd chains: + * if LIBVHD_IO_TEST is set, libvhdio will handle SIGCONT specially + * by closing, snapshotting, and reopening any vhds it is tracking. + * + * this harness simply forks a test and stops/continues it at a given interval. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include <errno.h> +#include <signal.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/wait.h> + +static void +usage(const char *app, int err) +{ + printf("usage: %s <-i interval> -- <app and args>\n", app); + exit(err); +} + +static void +sighandler(int sig) +{ + fprintf(stderr, "child exited\n"); + exit(0); +} + +static void +stop(pid_t pid) +{ + int status; + + fprintf(stderr, "stopping %d\n", pid); + + if (kill(pid, SIGSTOP)) { + perror("stop child"); + exit(1); + } + + if (waitpid(pid, &status, WUNTRACED) == -1) { + perror("waiting for child to stop"); + exit(1); + } + + if (WIFEXITED(status)) + exit(0); + + if (!WIFSTOPPED(status)) { + perror("child not stopped"); + exit(1); + } +} + +static void +resume(pid_t pid) +{ + int status; + + fprintf(stderr, "resuming %d\n", pid); + + if (kill(pid, SIGCONT)) { + perror("resume child"); + exit(1); + } + + if (waitpid(pid, &status, WCONTINUED) == -1) { + perror("waiting for child to resume"); + exit(1); + } + + if (WIFEXITED(status)) + exit(0); + + if (!WIFCONTINUED(status)) { + perror("child not resumed"); + exit(1); + } +} + +static void +test(pid_t pid, int interval) +{ + for (;;) { + fprintf(stderr, "sleeping\n"); + sleep(interval); + stop(pid); + resume(pid); + } +} + +int +main(int argc, char **argv) +{ + pid_t pid; + sigset_t set; + int c, interval; + struct sigaction act; + + interval = 0; + + while ((c = getopt(argc, argv, "i:h")) != -1) { + switch (c) { + case ''i'': + interval = atoi(optarg); + break; + case ''h'': + usage(argv[0], 0); + break; + default: + usage(argv[0], EINVAL); + break; + } + } + + if (optind == argc || !interval) + usage(argv[0], EINVAL); + + if (sigemptyset(&set)) { + perror("init sigset"); + exit(1); + } + + act = (struct sigaction) { + .sa_handler = sighandler, + .sa_mask = set, + .sa_flags = SA_NOCLDSTOP, + }; + + if (sigaction(SIGCHLD, &act, NULL)) { + perror("register 
sig handler"); + exit(1); + } + + switch ((pid = fork())) { + case 0: + if (putenv("LIBVHD_IO_TEST=y")) { + perror("setting environment"); + exit(errno); + } + + execvp(argv[optind], &argv[optind]); + + perror("exec"); + exit(errno); + case -1: + perror("fork"); + exit(errno); + default: + test(pid, interval); + break; + } + + return 0; +} diff --git a/tools/blktap3/vhd/lib/vhd-util-check.c b/tools/blktap3/vhd/lib/vhd-util-check.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/vhd/lib/vhd-util-check.c @@ -0,0 +1,1272 @@ +/* + * Copyright (c) 2007, XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <time.h> +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> +#include <libgen.h> +#include <inttypes.h> +#include <sys/stat.h> + +#include "libvhd.h" +#include "vhd-util.h" + +// allow the VHD timestamp to be at most this many seconds into the future to +// account for time skew with NFS servers +#define TIMESTAMP_MAX_SLACK 1800 + +struct vhd_util_check_options { + char ignore_footer; + char ignore_parent_uuid; + char ignore_timestamps; + char check_data; + char no_check_bat; + char collect_stats; +}; + +TAILQ_HEAD(tqh_vhd_util_check_stats, vhd_util_check_stats); + +struct vhd_util_check_stats { + char *name; + char *bitmap; + uint64_t secs_total; + uint64_t secs_allocated; + uint64_t secs_written; + TAILQ_ENTRY(vhd_util_check_stats) next; +}; + +struct vhd_util_check_ctx { + struct vhd_util_check_options opts; + struct tqh_vhd_util_check_stats stats; + int primary_footer_missing; +}; + +#define ctx_cur_stats(ctx) \ + TAILQ_FIRST(&(ctx)->stats) + +static inline int +test_bit_u64(volatile char *addr, uint64_t nr) +{ + return ((addr[nr >> 3] << (nr & 7)) & 0x80) != 0; +} + +static inline void +set_bit_u64(volatile char *addr, uint64_t nr) +{ + addr[nr >> 3] |= (0x80 >> (nr & 7)); +} + +static void +vhd_util_check_stats_init(struct vhd_util_check_ctx *ctx) +{ + memset(&ctx->stats, 0, sizeof(ctx->stats)); + TAILQ_INIT(&ctx->stats); +} + +static void 
+vhd_util_check_stats_free_one(struct vhd_util_check_stats *stats) +{ + if (stats) { + free(stats->name); + free(stats->bitmap); + free(stats); + } +} + +static int +vhd_util_check_stats_alloc_one(struct vhd_util_check_ctx *ctx, + vhd_context_t *vhd) +{ + int size; + struct vhd_util_check_stats *stats; + + stats = calloc(1, sizeof(*stats)); + if (!stats) + goto fail; + + stats->name = strdup(vhd->file); + if (!stats->name) + goto fail; + + stats->secs_total = (uint64_t)vhd->spb * vhd->header.max_bat_size; + size = (stats->secs_total + 7) >> 3; + stats->bitmap = calloc(1, size); + if (!stats->bitmap) + goto fail; + + TAILQ_INSERT_HEAD(&ctx->stats, stats, next); + + return 0; + +fail: + vhd_util_check_stats_free_one(stats); + printf("failed to allocate stats for %s\n", vhd->file); + return -ENOMEM; +} + +static void +vhd_util_check_stats_free(struct vhd_util_check_ctx *ctx) +{ + struct vhd_util_check_stats *stats, *tmp; + + TAILQ_FOREACH_SAFE(stats, &ctx->stats, next, tmp) { + TAILQ_REMOVE(&ctx->stats, stats, next); + vhd_util_check_stats_free_one(stats); + } +} + +static inline float +pct(uint64_t num, uint64_t den) +{ + return (!den ? 0.0 : (((float)num / (float)den)) * 100.0); +} + +static inline char * +name(const char *path) +{ + char *p = strrchr(path, ''/''); + if (p && (p - path) == strlen(path)) + p = strrchr(--p, ''/''); + return (char *)(p ? 
++p : path); +} + +static void +vhd_util_check_stats_print(struct vhd_util_check_ctx *ctx) +{ + char *bitmap; + uint64_t secs; + struct vhd_util_check_stats *head, *cur, *prev; + + if (TAILQ_EMPTY(&ctx->stats)) + return; + + head = TAILQ_FIRST(&ctx->stats); + printf("%s: secs allocated: 0x%"PRIx64" secs written: 0x%"PRIx64" (%.2f%%)\n", + name(head->name), head->secs_allocated, head->secs_written, + pct(head->secs_written, head->secs_allocated)); + + if (TAILQ_LAST(&ctx->stats, tqh_vhd_util_check_stats) == head) + return; + + secs = head->secs_total; + + bitmap = malloc((secs + 7) >> 3); + if (!bitmap) { + printf("failed to allocate bitmap\n"); + return; + } + memcpy(bitmap, head->bitmap, ((secs + 7) >> 3)); + + cur = prev = head; + while (TAILQ_LAST(&ctx->stats, tqh_vhd_util_check_stats) != cur) { + uint64_t i, up = 0, uc = 0; + + cur = TAILQ_NEXT(cur, next); + + for (i = 0; i < secs; i++) { + if (test_bit_u64(cur->bitmap, i)) { + if (!test_bit_u64(prev->bitmap, i)) + up++; /* sector is unique wrt parent */ + + if (!test_bit_u64(bitmap, i)) + uc++; /* sector is unique wrt chain */ + + set_bit_u64(bitmap, i); + } + } + + printf("%s: secs allocated: 0x%"PRIx64" secs written: 0x%"PRIx64 + " (%.2f%%) secs not in parent: 0x%"PRIx64" (%.2f%%)" + " secs not in ancestors: 0x%"PRIx64" (%.2f%%)\n", + name(cur->name), cur->secs_allocated, cur->secs_written, + pct(cur->secs_written, cur->secs_allocated), + up, pct(up, cur->secs_written), + uc, pct(uc, cur->secs_written)); + + prev = cur; + } + + free(bitmap); +} + +static int +vhd_util_check_zeros(void *buf, size_t size) +{ + int i; + char *p; + + p = buf; + for (i = 0; i < size; i++) + if (p[i]) + return i; + + return 0; +} + +static char * +vhd_util_check_validate_footer(struct vhd_util_check_ctx *ctx, + vhd_footer_t *footer) +{ + int size; + uint32_t checksum; + + size = sizeof(footer->cookie); + if (memcmp(footer->cookie, HD_COOKIE, size)) + return "invalid cookie"; + + checksum = vhd_checksum_footer(footer); + if 
(checksum != footer->checksum) { + if (footer->hidden && + !strncmp(footer->crtr_app, "tap", 3) && + (footer->crtr_ver == VHD_VERSION(0, 1) || + footer->crtr_ver == VHD_VERSION(1, 1))) { + char tmp = footer->hidden; + footer->hidden = 0; + checksum = vhd_checksum_footer(footer); + footer->hidden = tmp; + + if (checksum == footer->checksum) + goto ok; + } + + return "invalid checksum"; + } + +ok: + if (!(footer->features & HD_RESERVED)) + return "invalid ''reserved'' feature"; + + if (footer->features & ~(HD_TEMPORARY | HD_RESERVED)) + return "invalid extra features"; + + if (footer->ff_version != HD_FF_VERSION) + return "invalid file format version"; + + if (footer->type != HD_TYPE_DYNAMIC && + footer->type != HD_TYPE_DIFF && + footer->data_offset != ~(0ULL)) + return "invalid data offset"; + + if (!ctx->opts.ignore_timestamps) { + uint32_t now = vhd_time(time(NULL)); + if (footer->timestamp > now + TIMESTAMP_MAX_SLACK) + return "creation time in future"; + } + + if (!strncmp(footer->crtr_app, "tap", 3) && + footer->crtr_ver > VHD_CURRENT_VERSION) + return "unsupported tap creator version"; + + if (vhd_chs(footer->curr_size) < footer->geometry) + return "geometry too large"; + + if (footer->type != HD_TYPE_FIXED && + footer->type != HD_TYPE_DYNAMIC && + footer->type != HD_TYPE_DIFF) + return "invalid type"; + + if (footer->saved && footer->saved != 1) + return "invalid ''saved'' state"; + + if (footer->hidden && footer->hidden != 1) + return "invalid ''hidden'' state"; + + if (vhd_util_check_zeros(footer->reserved, + sizeof(footer->reserved))) + return "invalid ''reserved'' bits"; + + return NULL; +} + +static char * +vhd_util_check_validate_header(int fd, vhd_header_t *header) +{ + off64_t eof; + int i, cnt, size; + uint32_t checksum; + + size = sizeof(header->cookie); + if (memcmp(header->cookie, DD_COOKIE, size)) + return "invalid cookie"; + + checksum = vhd_checksum_header(header); + if (checksum != header->checksum) + return "invalid checksum"; + + if 
(header->hdr_ver != 0x00010000) + return "invalid header version"; + + if (header->data_offset != ~(0ULL)) + return "invalid data offset"; + + eof = lseek64(fd, 0, SEEK_END); + if (eof == (off64_t)-1) + return "error finding eof"; + + if (header->table_offset <= 0 || + header->table_offset % 512 || + (header->table_offset + + (header->max_bat_size * sizeof(uint32_t)) > + eof - sizeof(vhd_footer_t))) + return "invalid table offset"; + + for (cnt = 0, i = 0; i < sizeof(header->block_size) * 8; i++) + if ((header->block_size >> i) & 1) + cnt++; + + if (cnt != 1) + return "invalid block size"; + + if (header->res1) + return "invalid reserved bits"; + + if (vhd_util_check_zeros(header->res2, sizeof(header->res2))) + return "invalid reserved bits"; + + return NULL; +} + +static char * +vhd_util_check_validate_differencing_header(struct vhd_util_check_ctx *ctx, + vhd_context_t *vhd) +{ + vhd_header_t *header; + + header = &vhd->header; + + if (vhd->footer.type == HD_TYPE_DIFF) { + char *parent; + + if (!ctx->opts.ignore_timestamps) { + uint32_t now = vhd_time(time(NULL)); + if (header->prt_ts > now + TIMESTAMP_MAX_SLACK) + return "parent creation time in future"; + } + + if (vhd_header_decode_parent(vhd, header, &parent)) + return "invalid parent name"; + + free(parent); + } else { + if (vhd_util_check_zeros(header->prt_name, + sizeof(header->prt_name))) + return "invalid non-null parent name"; + + if (vhd_util_check_zeros(header->loc, sizeof(header->loc))) + return "invalid non-null parent locators"; + + if (!uuid_is_null(header->prt_uuid)) + return "invalid non-null parent uuid"; + + if (header->prt_ts) + return "invalid non-zero parent timestamp"; + } + + return NULL; +} + +static char * +vhd_util_check_validate_batmap(vhd_context_t *vhd, vhd_batmap_t *batmap) +{ + int size; + off64_t eof; + uint32_t checksum; + + size = sizeof(batmap->header.cookie); + if (memcmp(batmap->header.cookie, VHD_BATMAP_COOKIE, size)) + return "invalid cookie"; + + if 
(batmap->header.batmap_version > VHD_BATMAP_CURRENT_VERSION) + return "unsupported batmap version"; + + checksum = vhd_checksum_batmap(vhd, batmap); + if (checksum != batmap->header.checksum) + return "invalid checksum"; + + if (!batmap->header.batmap_size) + return "invalid size zero"; + + if (batmap->header.batmap_size << (VHD_SECTOR_SHIFT + 3) < + vhd->header.max_bat_size) + return "batmap-BAT size mismatch"; + + eof = lseek64(vhd->fd, 0, SEEK_END); + if (eof == (off64_t)-1) + return "error finding eof"; + + if (!batmap->header.batmap_offset || + batmap->header.batmap_offset % 512) + return "invalid batmap offset"; + + if ((batmap->header.batmap_offset + + vhd_sectors_to_bytes(batmap->header.batmap_size)) > + eof - sizeof(vhd_footer_t)) + return "invalid batmap size"; + + return NULL; +} + +static char * +vhd_util_check_validate_parent_locator(vhd_context_t *vhd, + vhd_parent_locator_t *loc) +{ + off64_t eof; + + if (vhd_validate_platform_code(loc->code)) + return "invalid platform code"; + + if (loc->code == PLAT_CODE_NONE) { + if (vhd_util_check_zeros(loc, sizeof(*loc))) + return "non-zero locator"; + + return NULL; + } + + if (!loc->data_offset) + return "invalid data offset"; + + if (!loc->data_space) + return "invalid data space"; + + if (!loc->data_len) + return "invalid data length"; + + eof = lseek64(vhd->fd, 0, SEEK_END); + if (eof == (off64_t)-1) + return "error finding eof"; + + if (loc->data_offset + vhd_parent_locator_size(loc) > + eof - sizeof(vhd_footer_t)) + return "invalid size"; + + if (loc->res) + return "invalid reserved bits"; + + return NULL; +} + +static char * +vhd_util_check_validate_parent(struct vhd_util_check_ctx *ctx, + vhd_context_t *vhd, const char *ppath) +{ + char *msg; + vhd_context_t parent; + + msg = NULL; + + if (vhd_parent_raw(vhd)) + return msg; + + if (ctx->opts.ignore_parent_uuid) + return msg; + + if (vhd_open(&parent, ppath, + VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED)) + return "error opening parent"; + + if 
(uuid_compare(vhd->header.prt_uuid, parent.footer.uuid)) { + msg = "invalid parent uuid"; + goto out; + } + +out: + vhd_close(&parent); + return msg; +} + +static int +vhd_util_check_footer(struct vhd_util_check_ctx *ctx, + int fd, vhd_footer_t *footer) +{ + int err; + size_t size; + char *msg; + void *buf; + off64_t eof, off; + vhd_footer_t primary, backup; + + memset(&primary, 0, sizeof(primary)); + memset(&backup, 0, sizeof(backup)); + + err = posix_memalign(&buf, VHD_SECTOR_SIZE, sizeof(primary)); + if (err) { + printf("error allocating buffer: %d\n", err); + return -err; + } + + memset(buf, 0, sizeof(primary)); + + eof = lseek64(fd, 0, SEEK_END); + if (eof == (off64_t)-1) { + err = -errno; + printf("error calculating end of file: %d\n", err); + goto out; + } + + size = ((eof % 512) ? 511 : 512); + eof = lseek64(fd, eof - size, SEEK_SET); + if (eof == (off64_t)-1) { + err = -errno; + printf("error calculating end of file: %d\n", err); + goto out; + } + + err = read(fd, buf, 512); + if (err != size) { + err = (errno ? -errno : -EIO); + printf("error reading primary footer: %d\n", err); + goto out; + } + + memcpy(&primary, buf, sizeof(primary)); + vhd_footer_in(&primary); + + msg = vhd_util_check_validate_footer(ctx, &primary); + if (msg) { + ctx->primary_footer_missing = 1; + + if (ctx->opts.ignore_footer) + goto check_backup; + + err = -EINVAL; + printf("primary footer invalid: %s\n", msg); + goto out; + } + + if (primary.type == HD_TYPE_FIXED) { + err = 0; + goto out; + } + +check_backup: + off = lseek64(fd, 0, SEEK_SET); + if (off == (off64_t)-1) { + err = -errno; + printf("error seeking to backup footer: %d\n", err); + goto out; + } + + size = 512; + memset(buf, 0, sizeof(primary)); + + err = read(fd, buf, size); + if (err != size) { + err = (errno ? 
-errno : -EIO); + printf("error reading backup footer: %d\n", err); + goto out; + } + + memcpy(&backup, buf, sizeof(backup)); + vhd_footer_in(&backup); + + msg = vhd_util_check_validate_footer(ctx, &backup); + if (msg) { + err = -EINVAL; + printf("backup footer invalid: %s\n", msg); + goto out; + } + + if (memcmp(&primary, &backup, sizeof(primary))) { + if (ctx->opts.ignore_footer) { + memcpy(&primary, &backup, sizeof(primary)); + goto ok; + } + + if (backup.hidden && + !strncmp(backup.crtr_app, "tap", 3) && + (backup.crtr_ver == VHD_VERSION(0, 1) || + backup.crtr_ver == VHD_VERSION(1, 1))) { + char cmp, tmp = backup.hidden; + backup.hidden = 0; + cmp = memcmp(&primary, &backup, sizeof(primary)); + backup.hidden = tmp; + if (!cmp) + goto ok; + } + + err = -EINVAL; + printf("primary and backup footers do not match\n"); + goto out; + } + +ok: + err = 0; + memcpy(footer, &primary, sizeof(primary)); + +out: + free(buf); + return err; +} + +static int +vhd_util_check_header(int fd, vhd_footer_t *footer) +{ + int err; + off64_t off; + char *msg; + void *buf; + vhd_header_t header; + + err = posix_memalign(&buf, VHD_SECTOR_SIZE, sizeof(header)); + if (err) { + printf("error allocating header: %d\n", err); + return err; + } + + off = footer->data_offset; + off = lseek64(fd, off, SEEK_SET); + if (off == (off64_t)-1) { + err = -errno; + printf("error seeking to header: %d\n", err); + goto out; + } + + err = read(fd, buf, sizeof(header)); + if (err != sizeof(header)) { + err = (errno ? 
-errno : -EIO); + printf("error reading header: %d\n", err); + goto out; + } + + memcpy(&header, buf, sizeof(header)); + vhd_header_in(&header); + + msg = vhd_util_check_validate_header(fd, &header); + if (msg) { + err = -EINVAL; + printf("header is invalid: %s\n", msg); + goto out; + } + + err = 0; + +out: + free(buf); + return err; +} + +static int +vhd_util_check_differencing_header(struct vhd_util_check_ctx *ctx, + vhd_context_t *vhd) +{ + char *msg; + + msg = vhd_util_check_validate_differencing_header(ctx, vhd); + if (msg) { + printf("differencing header is invalid: %s\n", msg); + return -EINVAL; + } + + return 0; +} + +static int +vhd_util_check_bitmap(struct vhd_util_check_ctx *ctx, + vhd_context_t *vhd, uint32_t block) +{ + int err, i; + uint64_t sector; + char *bitmap, *data; + + data = NULL; + bitmap = NULL; + sector = (uint64_t)block * vhd->spb; + + err = vhd_read_bitmap(vhd, block, &bitmap); + if (err) { + printf("error reading bitmap 0x%x\n", block); + goto out; + } + + if (ctx->opts.check_data) { + err = vhd_read_block(vhd, block, &data); + if (err) { + printf("error reading data block 0x%x\n", block); + goto out; + } + } + + for (i = 0; i < vhd->spb; i++) { + if (ctx->opts.collect_stats && + vhd_bitmap_test(vhd, bitmap, i)) { + ctx_cur_stats(ctx)->secs_written++; + set_bit_u64(ctx_cur_stats(ctx)->bitmap, sector + i); + } + + if (ctx->opts.check_data) { + char *buf = data + (i << VHD_SECTOR_SHIFT); + int set = vhd_util_check_zeros(buf, VHD_SECTOR_SIZE); + int map = vhd_bitmap_test(vhd, bitmap, i); + + if (set && !map) { + printf("sector 0x%x of block 0x%x has data " + "where bitmap is clear\n", i, block); + err = -EINVAL; + } + } + } + +out: + free(data); + free(bitmap); + return err; +} + +static int +vhd_util_check_bat(struct vhd_util_check_ctx *ctx, vhd_context_t *vhd) +{ + off64_t eof, eoh; + uint64_t vhd_blks; + int i, j, err, block_size; + + if (ctx->opts.collect_stats) { + err = vhd_util_check_stats_alloc_one(ctx, vhd); + if (err) + return 
err; + } + + err = vhd_seek(vhd, 0, SEEK_END); + if (err) { + printf("error calculating eof: %d\n", err); + return err; + } + + eof = vhd_position(vhd); + if (eof == (off64_t)-1) { + printf("error calculating eof: %d\n", -errno); + return -errno; + } + + /* adjust eof for vhds with short footers */ + if (eof % 512) { + if (eof % 512 != 511) { + printf("invalid file size: 0x%"PRIx64"\n", eof); + return -EINVAL; + } + + eof++; + } + + err = vhd_get_bat(vhd); + if (err) { + printf("error reading bat: %d\n", err); + return err; + } + + err = vhd_end_of_headers(vhd, &eoh); + if (err) { + printf("error calculating end of metadata: %d\n", err); + return err; + } + + eof -= sizeof(vhd_footer_t); + eof >>= VHD_SECTOR_SHIFT; + eoh >>= VHD_SECTOR_SHIFT; + block_size = vhd->spb + vhd->bm_secs; + + vhd_blks = vhd->footer.curr_size >> VHD_BLOCK_SHIFT; + if (vhd_blks > vhd->header.max_bat_size) { + printf("VHD size (%"PRIu64" blocks) exceeds BAT size (%u)\n", + vhd_blks, vhd->header.max_bat_size); + return -EINVAL; + } + + for (i = 0; i < vhd_blks; i++) { + uint32_t off = vhd->bat.bat[i]; + if (off == DD_BLK_UNUSED) + continue; + + if (off < eoh) { + printf("block %d (offset 0x%x) clobbers headers\n", + i, off); + return -EINVAL; + } + + if (off + block_size > eof) { + if (!(ctx->primary_footer_missing && + ctx->opts.ignore_footer && + off + block_size == eof + 1)) { + printf("block %d (offset 0x%x) clobbers " + "footer\n", i, off); + return -EINVAL; + } + } + + if (ctx->opts.no_check_bat) + continue; + + for (j = 0; j < vhd_blks; j++) { + uint32_t joff = vhd->bat.bat[j]; + + if (i == j) + continue; + + if (joff == DD_BLK_UNUSED) + continue; + + if (off == joff) + err = -EINVAL; + + if (off > joff && off < joff + block_size) + err = -EINVAL; + + if (off + block_size > joff && + off + block_size < joff + block_size) + err = -EINVAL; + + if (err) { + printf("block %d (offset 0x%x) clobbers " + "block %d (offset 0x%x)\n", + i, off, j, joff); + return err; + } + } + + if 
(ctx->opts.check_data || ctx->opts.collect_stats) { + if (ctx->opts.collect_stats) + ctx_cur_stats(ctx)->secs_allocated += vhd->spb; + + err = vhd_util_check_bitmap(ctx, vhd, i); + if (err) + return err; + } + } + + return 0; +} + +static int +vhd_util_check_batmap(vhd_context_t *vhd) +{ + char *msg; + int i, err; + + err = vhd_get_bat(vhd); + if (err) { + printf("error reading bat: %d\n", err); + return err; + } + + err = vhd_get_batmap(vhd); + if (err) { + printf("error reading batmap: %d\n", err); + return err; + } + + msg = vhd_util_check_validate_batmap(vhd, &vhd->batmap); + if (msg) { + printf("batmap is invalid: %s\n", msg); + return -EINVAL; + } + + for (i = 0; i < vhd->footer.curr_size >> VHD_BLOCK_SHIFT; i++) { + if (!vhd_batmap_test(vhd, &vhd->batmap, i)) + continue; + + if (vhd->bat.bat[i] == DD_BLK_UNUSED) { + printf("batmap shows unallocated block %d full\n", i); + return -EINVAL; + } + } + + return 0; +} + +static int +vhd_util_check_parent_locators(struct vhd_util_check_ctx *ctx, + vhd_context_t *vhd) +{ + int i, n, err; + vhd_parent_locator_t *loc; + char *msg, *file, *ppath, *location, *pname; + int mac, macx, w2ku, w2ru, wi2r, wi2k, found; + + mac = 0; + macx = 0; + w2ku = 0; + w2ru = 0; + wi2r = 0; + wi2k = 0; + found = 0; + pname = NULL; + ppath = NULL; + location = NULL; + + err = vhd_header_decode_parent(vhd, &vhd->header, &pname); + if (err) { + printf("error decoding parent name: %d\n", err); + return err; + } + + n = sizeof(vhd->header.loc) / sizeof(vhd->header.loc[0]); + for (i = 0; i < n; i++) { + ppath = NULL; + location = NULL; + loc = vhd->header.loc + i; + + msg = vhd_util_check_validate_parent_locator(vhd, loc); + if (msg) { + err = -EINVAL; + printf("invalid parent locator %d: %s\n", i, msg); + goto out; + } + + if (loc->code == PLAT_CODE_NONE) + continue; + + switch (loc->code) { + case PLAT_CODE_MACX: + if (macx++) + goto dup; + break; + + case PLAT_CODE_MAC: + if (mac++) + goto dup; + break; + + case PLAT_CODE_W2KU: + if 
(w2ku++) + goto dup; + break; + + case PLAT_CODE_W2RU: + if (w2ru++) + goto dup; + break; + + case PLAT_CODE_WI2R: + if (wi2r++) + goto dup; + break; + + case PLAT_CODE_WI2K: + if (wi2k++) + goto dup; + break; + + default: + err = -EINVAL; + printf("invalid platform code for locator %d\n", i); + goto out; + } + + if (loc->code != PLAT_CODE_MACX && + loc->code != PLAT_CODE_W2RU && + loc->code != PLAT_CODE_W2KU) + continue; + + err = vhd_parent_locator_read(vhd, loc, &ppath); + if (err) { + printf("error reading parent locator %d: %d\n", i, err); + goto out; + } + + file = basename(ppath); + if (strcmp(pname, file)) { + err = -EINVAL; + printf("parent locator %d name (%s) does not match " + "header name (%s)\n", i, file, pname); + goto out; + } + + err = vhd_find_parent(vhd, ppath, &location); + if (err) { + printf("error resolving %s: %d\n", ppath, err); + goto out; + } + + err = access(location, R_OK); + if (err && loc->code == PLAT_CODE_MACX) { + err = -errno; + printf("parent locator %d points to missing file %s " + "(resolved to %s)\n", i, ppath, location); + goto out; + } + + msg = vhd_util_check_validate_parent(ctx, vhd, location); + if (msg) { + err = -EINVAL; + printf("invalid parent %s: %s\n", location, msg); + goto out; + } + + found++; + free(ppath); + free(location); + ppath = NULL; + location = NULL; + + continue; + + dup: + printf("duplicate platform code in locator %d: 0x%x\n", + i, loc->code); + err = -EINVAL; + goto out; + } + + if (!found) { + err = -EINVAL; + printf("could not find parent %s\n", pname); + goto out; + } + + err = 0; + +out: + free(pname); + free(ppath); + free(location); + return err; +} + +static void +vhd_util_dump_headers(const char *name) +{ + char *argv[] = { "read", "-p", "-n", (char *)name }; + int argc = sizeof(argv) / sizeof(argv[0]); + + printf("%s appears invalid; dumping metadata\n", name); + vhd_util_read(argc, argv); +} + +static int +vhd_util_check_vhd(struct vhd_util_check_ctx *ctx, const char *name) +{ + int fd, 
err; + vhd_context_t vhd; + struct stat stats; + vhd_footer_t footer; + + fd = -1; + memset(&vhd, 0, sizeof(vhd)); + memset(&footer, 0, sizeof(footer)); + + err = stat(name, &stats); + if (err == -1) { + printf("cannot stat %s: %d\n", name, errno); + return -errno; + } + + if (!S_ISREG(stats.st_mode) && !S_ISBLK(stats.st_mode)) { + printf("%s is not a regular file or block device\n", name); + return -EINVAL; + } + + fd = open(name, O_RDONLY | O_DIRECT | O_LARGEFILE); + if (fd == -1) { + printf("error opening %s\n", name); + return -errno; + } + + err = vhd_util_check_footer(ctx, fd, &footer); + if (err) + goto out; + + if (footer.type != HD_TYPE_DYNAMIC && footer.type != HD_TYPE_DIFF) + goto out; + + err = vhd_util_check_header(fd, &footer); + if (err) + goto out; + + err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED); + if (err) + goto out; + + err = vhd_util_check_differencing_header(ctx, &vhd); + if (err) + goto out; + + err = vhd_util_check_bat(ctx, &vhd); + if (err) + goto out; + + if (vhd_has_batmap(&vhd)) { + err = vhd_util_check_batmap(&vhd); + if (err) + goto out; + } + + if (vhd.footer.type == HD_TYPE_DIFF) { + err = vhd_util_check_parent_locators(ctx, &vhd); + if (err) + goto out; + } + + err = 0; + + if (!ctx->opts.collect_stats) + printf("%s is valid\n", name); + +out: + if (err) + vhd_util_dump_headers(name); + if (fd != -1) + close(fd); + vhd_close(&vhd); + return err; +} + +static int +vhd_util_check_parents(struct vhd_util_check_ctx *ctx, const char *name) +{ + int err; + vhd_context_t vhd; + char *cur, *parent; + + cur = (char *)name; + + for (;;) { + err = vhd_open(&vhd, cur, + VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED); + if (err) + goto out; + + if (vhd.footer.type != HD_TYPE_DIFF || vhd_parent_raw(&vhd)) { + vhd_close(&vhd); + goto out; + } + + err = vhd_parent_locator_get(&vhd, &parent); + vhd_close(&vhd); + + if (err) { + printf("error getting parent: %d\n", err); + goto out; + } + + if (cur != name) + free(cur); + cur = 
parent; + + err = vhd_util_check_vhd(ctx, cur); + if (err) + goto out; + } + +out: + if (err) + printf("error checking parents: %d\n", err); + if (cur != name) + free(cur); + return err; +} + +int +vhd_util_check(int argc, char **argv) +{ + char *name; + int c, err, parents; + struct vhd_util_check_ctx ctx; + + if (!argc || !argv) { + err = -EINVAL; + goto usage; + } + + name = NULL; + parents = 0; + memset(&ctx, 0, sizeof(ctx)); + vhd_util_check_stats_init(&ctx); + + optind = 0; + while ((c = getopt(argc, argv, "n:iItpbBsh")) != -1) { + switch (c) { + case ''n'': + name = optarg; + break; + case ''i'': + ctx.opts.ignore_footer = 1; + break; + case ''I'': + ctx.opts.ignore_parent_uuid = 1; + break; + case ''t'': + ctx.opts.ignore_timestamps = 1; + break; + case ''p'': + parents = 1; + break; + case ''b'': + ctx.opts.check_data = 1; + break; + case ''B'': + ctx.opts.no_check_bat = 1; + break; + case ''s'': + ctx.opts.collect_stats = 1; + break; + case ''h'': + err = 0; + goto usage; + default: + err = -EINVAL; + goto usage; + } + } + + if (!name || optind != argc) { + err = -EINVAL; + goto usage; + } + + if ((ctx.opts.collect_stats || ctx.opts.check_data) && + ctx.opts.no_check_bat) { + err = -EINVAL; + goto usage; + } + + err = vhd_util_check_vhd(&ctx, name); + if (err) + goto out; + + if (parents) + err = vhd_util_check_parents(&ctx, name); + + if (ctx.opts.collect_stats) + vhd_util_check_stats_print(&ctx); + + vhd_util_check_stats_free(&ctx); + +out: + return err; + +usage: + printf("options: -n <file> [-i ignore missing primary footers] " + "[-I ignore parent uuids] [-t ignore timestamps] " + "[-B do not check BAT for overlapping (precludes -s, -b)] " + "[-p check parents] [-b check bitmaps] [-s stats] [-h help]\n"); + return err; +} diff --git a/tools/blktap3/vhd/lib/vhd-util-coalesce.c b/tools/blktap3/vhd/lib/vhd-util-coalesce.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/vhd/lib/vhd-util-coalesce.c @@ -0,0 +1,708 @@ +/* + * Copyright (c) 2007, 
XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <limits.h> + +#include "libvhd.h" + +static int +__raw_io_write(int fd, char* buf, uint64_t sec, uint32_t secs) +{ + off64_t off; + size_t ret; + + errno = 0; + off = lseek64(fd, vhd_sectors_to_bytes(sec), SEEK_SET); + if (off == (off64_t)-1) { + printf("raw parent: seek(0x%08"PRIx64") failed: %d\n", + vhd_sectors_to_bytes(sec), -errno); + return -errno; + } + + ret = write(fd, buf, vhd_sectors_to_bytes(secs)); + if (ret == vhd_sectors_to_bytes(secs)) + return 0; + + printf("raw parent: write of 0x%"PRIx64" returned %zd, errno: %d\n", + vhd_sectors_to_bytes(secs), ret, -errno); + return (errno ? -errno : -EIO); +} + +/* + * Use ''parent'' if the parent is VHD, and ''parent_fd'' if the parent is raw + */ +static int +vhd_util_coalesce_block(vhd_context_t *vhd, vhd_context_t *parent, + int parent_fd, uint64_t block) +{ + int i, err; + void *buf; + char *map; + uint64_t sec, secs; + + buf = NULL; + map = NULL; + sec = block * vhd->spb; + + if (vhd->bat.bat[block] == DD_BLK_UNUSED) + return 0; + + err = posix_memalign(&buf, 4096, vhd->header.block_size); + if (err) + return -err; + + err = vhd_io_read(vhd, buf, sec, vhd->spb); + if (err) + goto done; + + if (vhd_has_batmap(vhd) && vhd_batmap_test(vhd, &vhd->batmap, block)) { + if (parent->file) + err = vhd_io_write(parent, buf, sec, vhd->spb); + else + err = __raw_io_write(parent_fd, buf, sec, vhd->spb); + goto done; + } + + err = vhd_read_bitmap(vhd, block, &map); + if (err) + goto done; + + for (i = 0; i < vhd->spb; i++) { + if (!vhd_bitmap_test(vhd, map, i)) + continue; + + for (secs = 0; i + secs < vhd->spb; secs++) + if (!vhd_bitmap_test(vhd, map, i + secs)) + break; + + if (parent->file) + err = vhd_io_write(parent, + buf + vhd_sectors_to_bytes(i), + sec + i, secs); + else + err = __raw_io_write(parent_fd, + buf + vhd_sectors_to_bytes(i), + sec + i, secs); + if (err) + goto done; + + i += secs; + 
} + + err = 0; + +done: + free(buf); + free(map); + return err; +} + +static int +vhd_util_coalesce_onto(vhd_context_t *from, + vhd_context_t *to, int to_fd, int progress) +{ + int err; + uint64_t i; + + err = vhd_get_bat(from); + if (err) + goto out; + + if (vhd_has_batmap(from)) { + err = vhd_get_batmap(from); + if (err) + goto out; + } + + for (i = 0; i < from->bat.entries; i++) { + if (progress) { + printf("\r%6.2f%%", + ((float)i / (float)from->bat.entries) * 100.00); + fflush(stdout); + } + err = vhd_util_coalesce_block(from, to, to_fd, i); + if (err) + goto out; + } + + err = 0; + + if (progress) + printf("\r100.00%%\n"); + +out: + return err; +} + +static int +vhd_util_coalesce_parent(const char *name, int sparse, int progress) +{ + char *pname; + int err, parent_fd; + vhd_context_t vhd, parent; + + parent_fd = -1; + parent.file = NULL; + + err = vhd_open(&vhd, name, VHD_OPEN_RDONLY); + if (err) { + printf("error opening %s: %d\n", name, err); + return err; + } + + err = vhd_parent_locator_get(&vhd, &pname); + if (err) { + printf("error finding %s parent: %d\n", name, err); + vhd_close(&vhd); + return err; + } + + if (vhd_parent_raw(&vhd)) { + parent_fd = open(pname, O_RDWR | O_DIRECT | O_LARGEFILE, 0644); + if (parent_fd == -1) { + err = -errno; + printf("failed to open parent %s: %d\n", pname, err); + vhd_close(&vhd); + return err; + } + } else { + int flags = (sparse ? 
VHD_OPEN_IO_WRITE_SPARSE : 0); + if (sparse) printf("opening for sparse writes\n"); + err = vhd_open(&parent, pname, VHD_OPEN_RDWR | flags); + if (err) { + printf("error opening %s: %d\n", pname, err); + free(pname); + vhd_close(&vhd); + return err; + } + } + + err = vhd_util_coalesce_onto(&vhd, &parent, parent_fd, progress); + + free(pname); + vhd_close(&vhd); + if (parent.file) + vhd_close(&parent); + else + close(parent_fd); + return err; +} + +TAILQ_HEAD(tqh_vhd_list_entry, vhd_list_entry); +struct vhd_list_entry { + int raw; + int raw_fd; + vhd_context_t vhd; + TAILQ_ENTRY(vhd_list_entry) next; +}; + +static int +vhd_util_pathcmp(const char *a, const char *b, int *cmp) +{ + int err; + char *apath = NULL, __apath[PATH_MAX]; + char *bpath = NULL, __bpath[PATH_MAX]; + + apath = realpath(a, __apath); + if (!apath) { + err = -errno; + goto out; + } + + bpath = realpath(b, __bpath); + if (!bpath) { + err = -errno; + goto out; + } + + *cmp = strcmp(apath, bpath); + err = 0; + +out: + return err; +} + +static void +vhd_util_coalesce_free_chain(struct tqh_vhd_list_entry *head) +{ + struct vhd_list_entry *entry, *tmp; + + TAILQ_FOREACH_SAFE(entry, head, next, tmp) { + if (entry->raw) + close(entry->raw_fd); + else + vhd_close(&entry->vhd); + TAILQ_REMOVE(head, entry, next); + free(entry); + } + + TAILQ_INIT(head); +} + +static int +vhd_util_coalesce_load_chain(struct tqh_vhd_list_entry *head, + const char *cname, const char *aname, int sparse) +{ + char *next; + vhd_context_t *child; + int err, cmp, vhd_flags; + struct vhd_list_entry *entry; + + next = NULL; + entry = NULL; + TAILQ_INIT(head); + + vhd_flags = VHD_OPEN_RDWR | (sparse ? 
VHD_OPEN_IO_WRITE_SPARSE : 0); + + err = vhd_util_pathcmp(cname, aname, &cmp); + if (err) + goto out; + + if (!cmp) { + err = -EINVAL; + goto out; + } + + entry = calloc(1, sizeof(*entry)); + if (!entry) + goto out; + + err = vhd_open(&entry->vhd, cname, vhd_flags); + if (err) + goto out; + + err = vhd_get_bat(&entry->vhd); + if (err) + goto out; + + if (vhd_has_batmap(&entry->vhd)) { + err = vhd_get_batmap(&entry->vhd); + if (err) + goto out; + } + + child = &entry->vhd; + TAILQ_INSERT_TAIL(head, entry, next); + + for (;;) { + int raw; + + if (entry->raw || entry->vhd.footer.type != HD_TYPE_DIFF) { + err = -ENOENT; + goto out; + } + + if (child->header.block_size != entry->vhd.header.block_size) { + err = -EINVAL; + goto out; + } + + err = vhd_parent_locator_get(&entry->vhd, &next); + if (err) + goto out; + + raw = vhd_parent_raw(&entry->vhd); + + entry = calloc(1, sizeof(*entry)); + if (!entry) + goto out; + + if (raw) { + entry->raw = raw; + entry->raw_fd = open(next, + O_RDWR | O_DIRECT | O_LARGEFILE); + if (entry->raw_fd == -1) { + err = -errno; + goto out; + } + } else { + err = vhd_open(&entry->vhd, next, vhd_flags); + if (err) + goto out; + + err = vhd_get_bat(&entry->vhd); + if (err) + goto out; + + if (vhd_has_batmap(&entry->vhd)) { + err = vhd_get_batmap(&entry->vhd); + if (err) + goto out; + } + } + + TAILQ_INSERT_HEAD(head, entry, next); + + err = vhd_util_pathcmp(next, aname, &cmp); + if (err) + goto out; + + if (!cmp) + goto done; + + free(next); + next = NULL; + } + +done: + err = 0; +out: + if (err) { + if (entry && TAILQ_EMPTY(head)) { + if (entry->vhd.file) + vhd_close(&entry->vhd); + else if (entry->raw) + close(entry->raw_fd); + free(entry); + } + vhd_util_coalesce_free_chain(head); + } + return err; +} + +static int +vhd_util_coalesce_clear_bitmap(vhd_context_t *child, char *cmap, + vhd_context_t *ancestor, const uint64_t block) +{ + char *amap = NULL; + int i, dirty, err; + + if (child->spb != ancestor->spb) { + err = -EINVAL; + goto out; + } 
+ + if (block >= ancestor->bat.entries) + goto done; + + if (ancestor->bat.bat[block] == DD_BLK_UNUSED) + goto done; + + err = vhd_read_bitmap(ancestor, block, &amap); + if (err) + goto out; + + for (i = 0; i < child->spb; i++) { + if (vhd_bitmap_test(child, cmap, i)) { + if (vhd_bitmap_test(ancestor, amap, i)) { + dirty = 1; + vhd_bitmap_clear(ancestor, amap, i); + } + } + } + + if (dirty) { + err = vhd_write_bitmap(ancestor, block, amap); + if (err) + goto out; + if (vhd_has_batmap(ancestor) && + vhd_batmap_test(ancestor, &ancestor->batmap, block)) { + vhd_batmap_clear(ancestor, &ancestor->batmap, block); + err = vhd_write_batmap(ancestor, &ancestor->batmap); + if (err) + goto out; + } + } + +done: + err = 0; +out: + free(amap); + return err; +} + +static int +vhd_util_coalesce_clear_bitmaps(struct tqh_vhd_list_entry *chain, + vhd_context_t *child, vhd_context_t *ancestor, uint64_t block) +{ + int err; + char *map = NULL; + struct vhd_list_entry *entry; + + if (child->bat.bat[block] == DD_BLK_UNUSED) + goto done; + + err = vhd_read_bitmap(child, block, &map); + if (err) + goto out; + + TAILQ_FOREACH(entry, chain, next) { + if (&entry->vhd == child) + continue; + if (&entry->vhd == ancestor) + break; + err = vhd_util_coalesce_clear_bitmap(child, map, + &entry->vhd, block); + if (err) + goto out; + } + +done: + err = 0; +out: + free(map); + return err; +} + +static int +vhd_util_coalesce_ancestor(const char *cname, + const char *aname, int sparse, int progress) +{ + uint64_t i; + int err, raw_fd; + struct tqh_vhd_list_entry chain; + struct vhd_list_entry *entry; + vhd_context_t *child, *ancestor; + + child = NULL; + ancestor = NULL; + + err = vhd_util_coalesce_load_chain(&chain, cname, aname, sparse); + if (err) + goto out; + + TAILQ_FOREACH(entry, &chain, next) { + if (!child) + child = &entry->vhd; + else if (TAILQ_LAST(&chain, tqh_vhd_list_entry) == entry) { + ancestor = &entry->vhd; + raw_fd = entry->raw_fd; + break; + } + } + + if (!ancestor) { + err = 
-EINVAL; + goto out; + } + + err = vhd_util_coalesce_onto(child, ancestor, raw_fd, progress); + if (err) + goto out; + + for (i = 0; i < child->bat.entries; i++) { + err = vhd_util_coalesce_clear_bitmaps(&chain, + child, ancestor, i); + if (err) + goto out; + } + +out: + vhd_util_coalesce_free_chain(&chain); + return err; +} + +static int +vhd_util_coalesce_open_output(vhd_context_t *dst, + vhd_context_t *src, const char *name, int flags) +{ + int err; + + err = access(name, F_OK); + if (!err) { + printf("%s already exists\n", name); + return -EEXIST; + } else if (errno != ENOENT) { + printf("error checking %s: %d\n", name, errno); + return -errno; + } + + err = vhd_create(name, src->footer.curr_size, HD_TYPE_DYNAMIC, 0, 0); + if (err) { + printf("error creating %s: %d\n", name, err); + return err; + } + + err = vhd_open(dst, name, VHD_OPEN_RDWR | flags); + if (err || dst->header.block_size != src->header.block_size) { + printf("error opening %s: %d\n", name, (err ? : EINVAL)); + unlink(name); + return err ? : EINVAL; + } + + return 0; +} + +/* + * read block from @src chain and write it to @dst, unless it is all zeros + */ +static int +vhd_util_coalesce_block_out(vhd_context_t *dst, + vhd_context_t *src, uint64_t block) +{ + int i, err; + uint64_t sec; + void *buf; + char *p; + + buf = NULL; + sec = block * src->spb; + + err = posix_memalign(&buf, 4096, src->header.block_size); + if (err) + return -err; + + err = vhd_io_read(src, buf, sec, src->spb); + if (err) + goto done; + + for (p = buf, i = 0; i < src->header.block_size; i++, p++) { + if (*p) { + err = vhd_io_write(dst, buf, sec, src->spb); + break; + } + } + +done: + free(buf); + return err; +} + +static int +vhd_util_coalesce_out(const char *src_name, const char *dst_name, + int sparse, int progress) +{ + uint64_t i; + int err, flags; + vhd_context_t src, dst; + + err = vhd_open(&src, src_name, VHD_OPEN_RDONLY | VHD_OPEN_CACHED); + if (err) + return err; + + flags = (sparse ? 
VHD_OPEN_IO_WRITE_SPARSE : 0); + err = vhd_util_coalesce_open_output(&dst, &src, dst_name, flags); + if (err) { + vhd_close(&src); + return err; + } + + err = vhd_get_bat(&src); + if (err) + goto done; + + if (vhd_has_batmap(&src)) { + err = vhd_get_batmap(&src); + if (err) + goto done; + } + + for (i = 0; i < src.bat.entries; i++) { + if (progress) { + printf("\r%6.2f%%", + ((float)i / (float)src.bat.entries) * 100.0); + fflush(stdout); + } + err = vhd_util_coalesce_block_out(&dst, &src, i); + if (err) + goto done; + } + + err = 0; + + if (progress) + printf("\r100.00%%\n"); + +done: + if (err) + unlink(dst.file); + vhd_close(&src); + vhd_close(&dst); + return err; +} + +int +vhd_util_coalesce(int argc, char **argv) +{ + char *name, *oname, *ancestor; + int err, c, progress, sparse; + + name = NULL; + oname = NULL; + ancestor = NULL; + sparse = 0; + progress = 0; + + if (!argc || !argv) + goto usage; + + optind = 0; + while ((c = getopt(argc, argv, "n:o:a:sph")) != -1) { + switch (c) { + case ''n'': + name = optarg; + break; + case ''o'': + oname = optarg; + break; + case ''a'': + ancestor = optarg; + break; + case ''s'': + sparse = 1; + break; + case ''p'': + progress = 1; + break; + case ''h'': + default: + goto usage; + } + } + + if (!name || optind != argc) + goto usage; + + if (oname && ancestor) + goto usage; + + if (oname) + err = vhd_util_coalesce_out(name, oname, sparse, progress); + else if (ancestor) + err = vhd_util_coalesce_ancestor(name, ancestor, + sparse, progress); + else + err = vhd_util_coalesce_parent(name, sparse, progress); + + if (err) + printf("error coalescing: %d\n", err); + + return err; + +usage: + printf("options: <-n name> [-a ancestor] " + "[-o output] [-s sparse] [-p progress] [-h help]\n"); + return -EINVAL; +} diff --git a/tools/blktap3/vhd/lib/vhd-util-create.c b/tools/blktap3/vhd/lib/vhd-util-create.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/vhd/lib/vhd-util-create.c @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2007, 
XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include <errno.h> +#include <stdlib.h> +#include <unistd.h> + +#include "libvhd.h" + +int +vhd_util_create(int argc, char **argv) +{ + char *name; + uint64_t size, msize; + int c, sparse, err; + vhd_flag_creat_t flags; + + err = -EINVAL; + size = 0; + msize = 0; + sparse = 1; + name = NULL; + flags = 0; + + if (!argc || !argv) + goto usage; + + optind = 0; + while ((c = getopt(argc, argv, "n:s:S:rh")) != -1) { + switch (c) { + case ''n'': + name = optarg; + break; + case ''s'': + err = 0; + size = strtoull(optarg, NULL, 10); + break; + case ''S'': + err = 0; + msize = strtoull(optarg, NULL, 10); + break; + case ''r'': + sparse = 0; + break; + case ''h'': + default: + goto usage; + } + } + + if (err || !name || optind != argc) + goto usage; + + if (msize && msize < size) { + printf("Error: <-S size> must be greater than <-s size>\n"); + return -EINVAL; + } + + return vhd_create(name, size << 20, + (sparse ? HD_TYPE_DYNAMIC : HD_TYPE_FIXED), + msize << 20, flags); + +usage: + printf("options: <-n name> <-s size (MB)> [-r reserve] [-h help] " + "[<-S size (MB) for metadata preallocation " + "(see vhd-util resize)>]\n"); + return -EINVAL; +} diff --git a/tools/blktap3/vhd/lib/vhd-util-fill.c b/tools/blktap3/vhd/lib/vhd-util-fill.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/vhd/lib/vhd-util-fill.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2007, XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "libvhd.h" + +int +vhd_util_fill(int argc, char **argv) +{ + int err, c; + char *name; + void *buf; + vhd_context_t vhd; + uint64_t i, sec, secs; + + buf = NULL; + name = NULL; + + if (!argc || !argv) + goto usage; + + optind = 0; + while ((c = getopt(argc, argv, "n:h")) != -1) { + switch (c) { + case ''n'': + name = optarg; + break; + case ''h'': + default: + goto usage; + } + } + + if (!name || optind != argc) + goto usage; + + err = vhd_open(&vhd, name, VHD_OPEN_RDWR); + if (err) { + printf("error opening %s: %d\n", name, err); + return err; + } + + err = vhd_get_bat(&vhd); + if (err) + goto done; + + err = posix_memalign(&buf, 4096, vhd.header.block_size); + if (err) { + err = -err; + goto done; + } + + sec = 0; + secs = vhd.header.block_size >> VHD_SECTOR_SHIFT; + + for (i = 0; i < vhd.header.max_bat_size; i++) { + err = vhd_io_read(&vhd, buf, sec, secs); + if (err) + goto done; + + err = vhd_io_write(&vhd, buf, sec, secs); + if (err) + goto done; + + sec += secs; + } + + err = 0; + + done: + free(buf); + vhd_close(&vhd); + return err; + +usage: + printf("options: <-n name> [-h help]\n"); + return -EINVAL; +} diff --git a/tools/blktap3/vhd/lib/vhd-util-modify.c b/tools/blktap3/vhd/lib/vhd-util-modify.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/vhd/lib/vhd-util-modify.c @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "libvhd.h" + +TEST_FAIL_EXTERN_VARS; + +static int +vhd_util_zero_bat(vhd_context_t *vhd) +{ + int err, map_bytes; + uint64_t i; + + err = vhd_get_bat(vhd); + if (err) + return err; + + if (vhd_has_batmap(vhd)) { + err = vhd_get_batmap(vhd); + if (err) + return err; + } + + for (i = 0; i < vhd->bat.entries; i++) + vhd->bat.bat[i] = DD_BLK_UNUSED; + err = vhd_write_bat(vhd, &vhd->bat); + if (err) + return err; + + map_bytes = ((vhd->footer.curr_size >> VHD_SECTOR_SHIFT) / + vhd->spb) >> 3; + map_bytes = vhd_sectors_to_bytes(secs_round_up_no_zero(map_bytes)); + memset(vhd->batmap.map, 0, map_bytes); + return vhd_write_batmap(vhd, &vhd->batmap); +} + +int +vhd_util_modify(int argc, char **argv) +{ + char *name; + vhd_context_t vhd; + int err, c, size, parent, parent_raw, kill_data; + off64_t newsize = 0; + char *newparent = NULL; + + name = NULL; + size = 0; + parent = 0; + parent_raw = 0; + kill_data = 0; + + optind = 0; + while ((c = getopt(argc, argv, "n:s:p:mzh")) != -1) { + switch (c) { + case ''n'': + name = optarg; + break; + case ''s'': + size = 1; + errno = 0; + newsize = strtoll(optarg, NULL, 10); + if (errno) { + fprintf(stderr, "Invalid size ''%s''\n", optarg); + goto usage; + } + break; + case ''p'': + parent = 1; + newparent = optarg; + break; + case ''m'': + parent_raw = 1; + break; + case ''z'': + kill_data = 1; + break; + case ''h'': + default: + goto usage; + } + } + + if (!name || optind != argc) + goto usage; + + err = vhd_open(&vhd, name, VHD_OPEN_RDWR); + if (err) { + printf("error opening %s: %d\n", name, err); + return err; + } + + if (kill_data) { + if (vhd_type_dynamic(&vhd)) + err = vhd_util_zero_bat(&vhd); + else + err = -ENOSYS; + + if (!err && !vhd.is_block) // truncate file-based VHDs + err = vhd_write_footer(&vhd, &vhd.footer); + + if (err) + printf("failed to zero 
VHD: %d\n", err); + } + + if (size) { + err = vhd_set_phys_size(&vhd, newsize); + if (err) + printf("failed to set physical size to %"PRIu64":" + " %d\n", newsize, err); + } + + if (parent) { + TEST_FAIL_AT(FAIL_REPARENT_BEGIN); + err = vhd_change_parent(&vhd, newparent, parent_raw); + if (err) { + printf("failed to set parent to ''%s'': %d\n", + newparent, err); + goto done; + } + TEST_FAIL_AT(FAIL_REPARENT_END); + } + +done: + vhd_close(&vhd); + return err; + +usage: + printf("*** Dangerous operations, use with care ***\n"); + printf("options: <-n name> [-p NEW_PARENT set parent [-m raw]] " + "[-s NEW_SIZE set size] [-z zero (kill data)] " + "[-h help]\n"); + return -EINVAL; +} diff --git a/tools/blktap3/vhd/lib/vhd-util-query.c b/tools/blktap3/vhd/lib/vhd-util-query.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/vhd/lib/vhd-util-query.c @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2007, XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "libvhd.h" + +int +vhd_util_query(int argc, char **argv) +{ + char *name; + vhd_context_t vhd; + off64_t currsize; + int ret, err, c, size, physize, parent, fields, depth, fastresize, marker; + + name = NULL; + size = 0; + physize = 0; + parent = 0; + fields = 0; + depth = 0; + fastresize = 0; + marker = 0; + + if (!argc || !argv) { + err = -EINVAL; + goto usage; + } + + optind = 0; + while ((c = getopt(argc, argv, "n:vspfdSmh")) != -1) { + switch (c) { + case ''n'': + name = optarg; + break; + case ''v'': + size = 1; + break; + case ''s'': + physize = 1; + break; + case ''p'': + parent = 1; + break; + case ''f'': + fields = 1; + break; + case ''d'': + depth = 1; + break; + case ''S'': + fastresize = 1; + break; + case ''m'': + marker = 1; + break; + case ''h'': + err = 0; + goto usage; + default: + err = -EINVAL; + goto usage; + } + } + + if (!name || optind != argc) { + err = -EINVAL; + goto usage; + } + + err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED); + if (err) { + printf("error opening %s: %d\n", name, err); + return err; + } + + if (size) + printf("%"PRIu64"\n", vhd.footer.curr_size >> 20); + + if (physize) { + err = vhd_get_phys_size(&vhd, &currsize); + if (err) + printf("failed to get physical size: %d\n", err); + else + 
printf("%"PRIu64"\n", currsize); + } + + if (parent) { + ret = 0; + + if (vhd.footer.type != HD_TYPE_DIFF) + printf("%s has no parent\n", name); + else { + char *pname; + + ret = vhd_parent_locator_get(&vhd, &pname); + if (ret) + printf("query failed\n"); + else { + printf("%s\n", pname); + free(pname); + } + } + + err = (err ? : ret); + } + + if (fields) { + int hidden; + + ret = vhd_hidden(&vhd, &hidden); + if (ret) + printf("error checking ''hidden'' field: %d\n", ret); + else + printf("hidden: %d\n", hidden); + + err = (err ? : ret); + } + + if (marker) { + char marker; + + ret = vhd_marker(&vhd, &marker); + if (ret) + printf("error checking ''marker'' field: %d\n", ret); + else + printf("marker: %d\n", marker); + + err = (err ? : ret); + } + + if (depth) { + int length; + + ret = vhd_chain_depth(&vhd, &length); + if (ret) + printf("error checking chain depth: %d\n", ret); + else + printf("chain depth: %d\n", length); + + err = (err ? : ret); + } + + if (fastresize) { + uint64_t max_size; + + max_size = vhd.header.max_bat_size << (VHD_BLOCK_SHIFT - 20); + printf("%"PRIu64"\n", max_size); + } + + vhd_close(&vhd); + return err; + +usage: + printf("options: <-n name> [-v print virtual size (in MB)] " + "[-s print physical utilization (bytes)] [-p print parent] " + "[-f print fields] [-m print marker] [-d print chain depth] " + "[-S print max virtual size (MB) for fast resize] [-h help]\n"); + return err; +} diff --git a/tools/blktap3/vhd/lib/vhd-util-read.c b/tools/blktap3/vhd/lib/vhd-util-read.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/vhd/lib/vhd-util-read.c @@ -0,0 +1,937 @@ +/* + * Copyright (c) 2007, XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> +#include <inttypes.h> + +#include "libvhd.h" +#include "vhd-util.h" + +#define nsize 15 +static char nbuf[nsize]; + +static inline char * +__xconv(uint64_t num) +{ + snprintf(nbuf, nsize, "%#" PRIx64 , num); + return nbuf; +} + +static inline char * +__dconv(uint64_t num) +{ + snprintf(nbuf, nsize, "%" PRIu64, num); + return nbuf; +} + +#define conv(hex, num) \ + (hex ? __xconv((uint64_t)num) : __dconv((uint64_t)num)) + +static void +vhd_print_header(vhd_context_t *vhd, vhd_header_t *h, int hex) +{ + int err; + uint32_t cksm; + char uuid[37], time_str[26], cookie[9], *name; + + printf("VHD Header Summary:\n-------------------\n"); + + snprintf(cookie, 9, "%s", h->cookie); + printf("Cookie : %s\n", cookie); + + printf("Data offset (unusd) : %s\n", conv(hex, h->data_offset)); + printf("Table offset : %s\n", conv(hex, h->table_offset)); + printf("Header version : 0x%08x\n", h->hdr_ver); + printf("Max BAT size : %s\n", conv(hex, h->max_bat_size)); + printf("Block size : %s ", conv(hex, h->block_size)); + printf("(%s MB)\n", conv(hex, h->block_size >> 20)); + + err = vhd_header_decode_parent(vhd, h, &name); + printf("Parent name : %s\n", + (err ? "failed to read name" : name)); + free(name); + + uuid_unparse(h->prt_uuid, uuid); + printf("Parent UUID : %s\n", uuid); + + vhd_time_to_string(h->prt_ts, time_str); + printf("Parent timestamp : %s\n", time_str); + + cksm = vhd_checksum_header(h); + printf("Checksum : 0x%x|0x%x (%s)\n", h->checksum, cksm, + h->checksum == cksm ? "Good!" 
: "Bad!"); + printf("\n"); +} + +/* String table for hd.type */ +char *hd_type_str[7] = { + "None", /* 0 */ + "Reserved (deprecated)", /* 1 */ + "Fixed hard disk", /* 2 */ + "Dynamic hard disk", /* 3 */ + "Differencing hard disk", /* 4 */ + "Reserved (deprecated)", /* 5 */ + "Reserved (deprecated)" /* 6 */ +}; + +static void +vhd_print_footer(vhd_footer_t *f, int hex) +{ + uint64_t c, h, s; + uint32_t ff_maj, ff_min, cr_maj, cr_min, cksm; + char time_str[26], creator[5], uuid[37], cookie[9]; + + printf("VHD Footer Summary:\n-------------------\n"); + + snprintf(cookie, 9, "%s", f->cookie); + printf("Cookie : %s\n", cookie); + + printf("Features : (0x%08x) %s%s\n", f->features, + (f->features & HD_TEMPORARY) ? "<TEMP>" : "", + (f->features & HD_RESERVED) ? "<RESV>" : ""); + + ff_maj = f->ff_version >> 16; + ff_min = f->ff_version & 0xffff; + printf("File format version : Major: %d, Minor: %d\n", + ff_maj, ff_min); + + printf("Data offset : %s\n", conv(hex, f->data_offset)); + + vhd_time_to_string(f->timestamp, time_str); + printf("Timestamp : %s\n", time_str); + + memcpy(creator, f->crtr_app, 4); + creator[4] = ''\0''; + printf("Creator Application : ''%s''\n", creator); + + cr_maj = f->crtr_ver >> 16; + cr_min = f->crtr_ver & 0xffff; + printf("Creator version : Major: %d, Minor: %d\n", + cr_maj, cr_min); + + printf("Creator OS : %s\n", + ((f->crtr_os == HD_CR_OS_WINDOWS) ? "Windows" : + ((f->crtr_os == HD_CR_OS_MACINTOSH) ? 
"Macintosh" : + "Unknown!"))); + + printf("Original disk size : %s MB ", conv(hex, f->orig_size >> 20)); + printf("(%s Bytes)\n", conv(hex, f->orig_size)); + + printf("Current disk size : %s MB ", conv(hex, f->curr_size >> 20)); + printf("(%s Bytes)\n", conv(hex, f->curr_size)); + + c = f->geometry >> 16; + h = (f->geometry & 0x0000FF00) >> 8; + s = f->geometry & 0x000000FF; + printf("Geometry : Cyl: %s, ", conv(hex, c)); + printf("Hds: %s, ", conv(hex, h)); + printf("Sctrs: %s\n", conv(hex, s)); + printf(" : = %s MB ", conv(hex, (c * h * s) >> 11)); + printf("(%s Bytes)\n", conv(hex, c * h * s << 9)); + + printf("Disk type : %s\n", + f->type <= HD_TYPE_MAX ? + hd_type_str[f->type] : "Unknown type!\n"); + + cksm = vhd_checksum_footer(f); + printf("Checksum : 0x%x|0x%x (%s)\n", f->checksum, cksm, + f->checksum == cksm ? "Good!" : "Bad!"); + + uuid_unparse(f->uuid, uuid); + printf("UUID : %s\n", uuid); + + printf("Saved state : %s\n", f->saved == 0 ? "No" : "Yes"); + printf("Hidden : %d\n", f->hidden); + printf("\n"); +} + +static inline char * +code_name(uint32_t code) +{ + switch(code) { + case PLAT_CODE_NONE: + return "PLAT_CODE_NONE"; + case PLAT_CODE_WI2R: + return "PLAT_CODE_WI2R"; + case PLAT_CODE_WI2K: + return "PLAT_CODE_WI2K"; + case PLAT_CODE_W2RU: + return "PLAT_CODE_W2RU"; + case PLAT_CODE_W2KU: + return "PLAT_CODE_W2KU"; + case PLAT_CODE_MAC: + return "PLAT_CODE_MAC"; + case PLAT_CODE_MACX: + return "PLAT_CODE_MACX"; + default: + return "UNKOWN"; + } +} + +static void +vhd_print_parent(vhd_context_t *vhd, vhd_parent_locator_t *loc) +{ + int err; + char *buf; + + err = vhd_parent_locator_read(vhd, loc, &buf); + if (err) { + printf("failed to read parent name\n"); + return; + } + + printf(" decoded name : %s\n", buf); +} + +static void +vhd_print_parent_locators(vhd_context_t *vhd, int hex) +{ + int i, n; + vhd_parent_locator_t *loc; + + printf("VHD Parent Locators:\n--------------------\n"); + + n = sizeof(vhd->header.loc) / sizeof(struct prt_loc); + for 
(i = 0; i < n; i++) { + loc = &vhd->header.loc[i]; + + if (loc->code == PLAT_CODE_NONE) + continue; + + printf("locator: : %d\n", i); + printf(" code : %s\n", + code_name(loc->code)); + printf(" data_space : %s\n", + conv(hex, loc->data_space)); + printf(" data_length : %s\n", + conv(hex, loc->data_len)); + printf(" data_offset : %s\n", + conv(hex, loc->data_offset)); + vhd_print_parent(vhd, loc); + printf("\n"); + } +} + +static void +vhd_print_batmap_header(vhd_context_t *vhd, vhd_batmap_t *batmap, int hex) +{ + uint32_t cksm; + + printf("VHD Batmap Summary:\n-------------------\n"); + printf("Batmap offset : %s\n", + conv(hex, batmap->header.batmap_offset)); + printf("Batmap size (secs) : %s\n", + conv(hex, batmap->header.batmap_size)); + printf("Batmap version : 0x%08x\n", + batmap->header.batmap_version); + + cksm = vhd_checksum_batmap(vhd, batmap); + printf("Checksum : 0x%x|0x%x (%s)\n", + batmap->header.checksum, cksm, + (batmap->header.checksum == cksm ? "Good!" : "Bad!")); + printf("\n"); +} + +static inline int +check_block_range(vhd_context_t *vhd, uint64_t block, int hex) +{ + if (block > vhd->header.max_bat_size) { + fprintf(stderr, "block %s past end of file\n", + conv(hex, block)); + return -ERANGE; + } + + return 0; +} + +static int +vhd_print_headers(vhd_context_t *vhd, int hex) +{ + int err; + + vhd_print_footer(&vhd->footer, hex); + + if (vhd_type_dynamic(vhd)) { + vhd_print_header(vhd, &vhd->header, hex); + + if (vhd->footer.type == HD_TYPE_DIFF) + vhd_print_parent_locators(vhd, hex); + + if (vhd_has_batmap(vhd)) { + err = vhd_get_batmap(vhd); + if (err) { + printf("failed to get batmap header\n"); + return err; + } + + vhd_print_batmap_header(vhd, &vhd->batmap, hex); + } + } + + return 0; +} + +static int +vhd_dump_headers(const char *name, int hex) +{ + vhd_context_t vhd; + + libvhd_set_log_level(1); + memset(&vhd, 0, sizeof(vhd)); + + printf("\n%s appears invalid; dumping headers\n\n", name); + + vhd.fd = open(name, O_DIRECT | O_LARGEFILE | 
O_RDONLY); + if (vhd.fd == -1) + return -errno; + + vhd.file = strdup(name); + + vhd_read_footer(&vhd, &vhd.footer); + vhd_read_header(&vhd, &vhd.header); + + vhd_print_footer(&vhd.footer, hex); + vhd_print_header(&vhd, &vhd.header, hex); + + close(vhd.fd); + free(vhd.file); + + return 0; +} + +static int +vhd_print_logical_to_physical(vhd_context_t *vhd, + uint64_t sector, int count, int hex) +{ + int i; + uint32_t blk, lsec; + uint64_t cur, offset; + + if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) { + fprintf(stderr, "sector %s past end of file\n", + conv(hex, sector + count)); + return -ERANGE; + } + + for (i = 0; i < count; i++) { + cur = sector + i; + blk = cur / vhd->spb; + lsec = cur % vhd->spb; + offset = vhd->bat.bat[blk]; + + if (offset != DD_BLK_UNUSED) { + offset += lsec + 1; + offset = vhd_sectors_to_bytes(offset); + } + + printf("logical sector %s: ", conv(hex, cur)); + printf("block number: %s, ", conv(hex, blk)); + printf("sector offset: %s, ", conv(hex, lsec)); + printf("file offset: %s\n", (offset == DD_BLK_UNUSED ? + "not allocated" : conv(hex, offset))); + } + + return 0; +} + +static int +vhd_print_bat(vhd_context_t *vhd, uint64_t block, int count, int hex) +{ + int i; + uint64_t cur, offset; + + if (check_block_range(vhd, block + count, hex)) + return -ERANGE; + + for (i = 0; i < count; i++) { + cur = block + i; + offset = vhd->bat.bat[cur]; + + printf("block: %s: ", conv(hex, cur)); + printf("offset: %s\n", + (offset == DD_BLK_UNUSED ? 
"not allocated" : + conv(hex, vhd_sectors_to_bytes(offset)))); + } + + return 0; +} + +static int +vhd_print_bat_str(vhd_context_t *vhd) +{ + int i, err, total_blocks, bitmap_size; + char *bitmap; + ssize_t n; + + err = 0; + + if (!vhd_type_dynamic(vhd)) + return -EINVAL; + + total_blocks = vhd->footer.curr_size / vhd->header.block_size; + bitmap_size = total_blocks >> 3; + if (bitmap_size << 3 < total_blocks) + bitmap_size++; + + bitmap = malloc(bitmap_size); + if (!bitmap) + return -ENOMEM; + memset(bitmap, 0, bitmap_size); + + for (i = 0; i < total_blocks; i++) { + if (vhd->bat.bat[i] != DD_BLK_UNUSED) + set_bit(bitmap, i); + } + + n = write(STDOUT_FILENO, bitmap, bitmap_size); + if (n < 0) + err = -errno; + + free(bitmap); + + return err; +} + +static int +vhd_print_bitmap(vhd_context_t *vhd, uint64_t block, int count, int hex) +{ + char *buf; + int i, err; + uint64_t cur; + ssize_t n; + + if (check_block_range(vhd, block + count, hex)) + return -ERANGE; + + for (i = 0; i < count; i++) { + cur = block + i; + + if (vhd->bat.bat[cur] == DD_BLK_UNUSED) { + printf("block %s not allocated\n", conv(hex, cur)); + continue; + } + + err = vhd_read_bitmap(vhd, cur, &buf); + if (err) + goto out; + + n = write(STDOUT_FILENO, buf, vhd_sectors_to_bytes(vhd->bm_secs)); + if (n < 0) { + err = -errno; + goto out; + } + + free(buf); + } + + err = 0; +out: + return err; +} + +static int +vhd_test_bitmap(vhd_context_t *vhd, uint64_t sector, int count, int hex) +{ + char *buf; + uint64_t cur; + int i, err, bit; + uint32_t blk, bm_blk, sec; + + if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) { + printf("sector %s past end of file\n", conv(hex, sector)); + return -ERANGE; + } + + bm_blk = -1; + buf = NULL; + + for (i = 0; i < count; i++) { + cur = sector + i; + blk = cur / vhd->spb; + sec = cur % vhd->spb; + + if (blk != bm_blk) { + bm_blk = blk; + free(buf); + buf = NULL; + + if (vhd->bat.bat[blk] != DD_BLK_UNUSED) { + err = vhd_read_bitmap(vhd, blk, &buf); + if 
(err) + goto out; + } + } + + if (vhd->bat.bat[blk] == DD_BLK_UNUSED) + bit = 0; + else + bit = vhd_bitmap_test(vhd, buf, sec); + + printf("block %s: ", conv(hex, blk)); + printf("sec: %s: %d\n", conv(hex, sec), bit); + } + + err = 0; + out: + free(buf); + return err; +} + +static int +vhd_print_bitmap_extents(vhd_context_t *vhd, uint64_t sector, int count, + int hex) +{ + char *buf; + uint64_t cur; + int i, err, bit; + uint32_t blk, bm_blk, sec; + int64_t s, r; + + if (vhd_sectors_to_bytes(sector + count) > vhd->footer.curr_size) { + printf("sector %s past end of file\n", conv(hex, sector)); + return -ERANGE; + } + + bm_blk = -1; + buf = NULL; + s = -1; + r = 0; + + for (i = 0; i < count; i++) { + cur = sector + i; + blk = cur / vhd->spb; + sec = cur % vhd->spb; + + if (blk != bm_blk) { + bm_blk = blk; + free(buf); + buf = NULL; + + if (vhd->bat.bat[blk] != DD_BLK_UNUSED) { + err = vhd_read_bitmap(vhd, blk, &buf); + if (err) + goto out; + } + } + + if (vhd->bat.bat[blk] == DD_BLK_UNUSED) + bit = 0; + else + bit = vhd_bitmap_test(vhd, buf, sec); + + if (bit) { + if (r == 0) + s = cur; + r++; + } else { + if (r > 0) { + printf("%s ", conv(hex, s)); + printf("%s\n", conv(hex, r)); + } + r = 0; + } + } + if (r > 0) { + printf("%s ", conv(hex, s)); + printf("%s\n", conv(hex, r)); + } + + err = 0; + out: + free(buf); + return err; +} + +static int +vhd_print_batmap(vhd_context_t *vhd) +{ + int err, gcc; + size_t size; + + err = vhd_get_batmap(vhd); + if (err) { + printf("failed to read batmap: %d\n", err); + return err; + } + + size = vhd_sectors_to_bytes(vhd->batmap.header.batmap_size); + gcc = write(STDOUT_FILENO, vhd->batmap.map, size); + if (gcc) { + ; + } + + return 0; +} + +static int +vhd_test_batmap(vhd_context_t *vhd, uint64_t block, int count, int hex) +{ + int i, err; + uint64_t cur; + + if (check_block_range(vhd, block + count, hex)) + return -ERANGE; + + err = vhd_get_batmap(vhd); + if (err) { + fprintf(stderr, "failed to get batmap\n"); + return err; + } + 
+ for (i = 0; i < count; i++) { + cur = block + i; + fprintf(stderr, "batmap for block %s: %d\n", conv(hex, cur), + vhd_batmap_test(vhd, &vhd->batmap, cur)); + } + + return 0; +} + +static int +vhd_print_data(vhd_context_t *vhd, uint64_t block, int count, int hex) +{ + char *buf; + int i, err; + uint64_t cur; + + err = 0; + + if (check_block_range(vhd, block + count, hex)) + return -ERANGE; + + for (i = 0; i < count; i++) { + int gcc; + cur = block + i; + + if (vhd->bat.bat[cur] == DD_BLK_UNUSED) { + printf("block %s not allocated\n", conv(hex, cur)); + continue; + } + + err = vhd_read_block(vhd, cur, &buf); + if (err) + break; + + gcc = write(STDOUT_FILENO, buf, vhd->header.block_size); + if (gcc) { + ; + } + free(buf); + } + + return err; +} + +static int +vhd_read_data(vhd_context_t *vhd, uint64_t sec, int count, + int hex __attribute__((unused))) +{ + void *buf; + uint64_t cur; + int err, max, secs; + + if (vhd_sectors_to_bytes(sec + count) > vhd->footer.curr_size) + return -ERANGE; + + max = MIN(vhd_sectors_to_bytes(count), VHD_BLOCK_SIZE); + err = posix_memalign(&buf, VHD_SECTOR_SIZE, max); + if (err) + return -err; + + cur = sec; + while (count) { + int gcc; + + secs = MIN((max >> VHD_SECTOR_SHIFT), count); + err = vhd_io_read(vhd, buf, cur, secs); + if (err) + break; + + gcc = write(STDOUT_FILENO, buf, vhd_sectors_to_bytes(secs)); + if (gcc) { + ; + } + + cur += secs; + count -= secs; + } + + free(buf); + return err; +} + +static int +vhd_read_bytes(vhd_context_t *vhd, uint64_t byte, int count, + int hex __attribute__((unused))) +{ + void *buf; + uint64_t cur; + int err, max, bytes; + + if (byte + count > vhd->footer.curr_size) + return -ERANGE; + + max = MIN(count, VHD_BLOCK_SIZE); + err = posix_memalign(&buf, VHD_SECTOR_SIZE, max); + if (err) + return -err; + + cur = byte; + while (count) { + ssize_t n; + + bytes = MIN(max, count); + err = vhd_io_read_bytes(vhd, buf, bytes, cur); + if (err) + break; + + n = write(STDOUT_FILENO, buf, bytes); + if (n < 0) { 
+ err = -errno; + break; + } + + cur += bytes; + count -= bytes; + } + + free(buf); + return err; +} + +int +vhd_util_read(int argc, char **argv) +{ + char *name; + vhd_context_t vhd; + int c, err, headers, hex, bat_str, cache, flags; + uint64_t bat, bitmap, tbitmap, ebitmap, batmap, tbatmap, data, lsec, count, read; + uint64_t bread; + + err = 0; + hex = 0; + cache = 0; + headers = 0; + bat_str = 0; + count = 1; + bat = -1; + bitmap = -1; + tbitmap = -1; + ebitmap = -1; + batmap = -1; + tbatmap = -1; + data = -1; + lsec = -1; + read = -1; + bread = -1; + name = NULL; + + if (!argc || !argv) + goto usage; + + optind = 0; + while ((c = getopt(argc, argv, "n:pt:b:Bm:i:e:aj:d:c:r:R:xCh")) != -1) { + switch(c) { + case ''n'': + name = optarg; + break; + case ''p'': + headers = 1; + break; + case ''C'': + cache = 1; + break; + case ''B'': + bat_str = 1; + break; + case ''t'': + lsec = strtoul(optarg, NULL, 10); + break; + case ''b'': + bat = strtoull(optarg, NULL, 10); + break; + case ''m'': + bitmap = strtoull(optarg, NULL, 10); + break; + case ''i'': + tbitmap = strtoul(optarg, NULL, 10); + break; + case ''e'': + ebitmap = strtoul(optarg, NULL, 10); + break; + case ''a'': + batmap = 1; + break; + case ''j'': + tbatmap = strtoull(optarg, NULL, 10); + break; + case ''d'': + data = strtoull(optarg, NULL, 10); + break; + case ''r'': + read = strtoull(optarg, NULL, 10); + break; + case ''R'': + bread = strtoull(optarg, NULL, 10); + break; + case ''c'': + count = strtoul(optarg, NULL, 10); + break; + case ''x'': + hex = 1; + break; + case ''h'': + default: + goto usage; + } + } + + if (!name || optind != argc) + goto usage; + + flags = VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED; + if (cache) + flags |= VHD_OPEN_CACHED | VHD_OPEN_FAST; + err = vhd_open(&vhd, name, flags); + if (err) { + printf("Failed to open %s: %d\n", name, err); + vhd_dump_headers(name, hex); + return err; + } + + err = vhd_get_bat(&vhd); + if (err) { + printf("Failed to get bat for %s: %d\n", name, err); 
+ goto out; + } + + if (headers) + vhd_print_headers(&vhd, hex); + + if (lsec != -1) { + err = vhd_print_logical_to_physical(&vhd, lsec, count, hex); + if (err) + goto out; + } + + if (bat != -1) { + err = vhd_print_bat(&vhd, bat, count, hex); + if (err) + goto out; + } + + if (bat_str) { + err = vhd_print_bat_str(&vhd); + if (err) + goto out; + } + + if (bitmap != -1) { + err = vhd_print_bitmap(&vhd, bitmap, count, hex); + if (err) + goto out; + } + + if (tbitmap != -1) { + err = vhd_test_bitmap(&vhd, tbitmap, count, hex); + if (err) + goto out; + } + + if (ebitmap != -1) { + err = vhd_print_bitmap_extents(&vhd, ebitmap, count, hex); + if (err) + goto out; + } + + if (batmap != -1) { + err = vhd_print_batmap(&vhd); + if (err) + goto out; + } + + if (tbatmap != -1) { + err = vhd_test_batmap(&vhd, tbatmap, count, hex); + if (err) + goto out; + } + + if (data != -1) { + err = vhd_print_data(&vhd, data, count, hex); + if (err) + goto out; + } + + if (read != -1) { + err = vhd_read_data(&vhd, read, count, hex); + if (err) + goto out; + } + + if (bread != -1) { + err = vhd_read_bytes(&vhd, bread, count, hex); + if (err) + goto out; + } + + err = 0; + + out: + vhd_close(&vhd); + return err; + + usage: + printf("options:\n" + "-h help\n" + "-n name\n" + "-p print VHD headers\n" + "-t sec translate logical sector to VHD location\n" + "-b blk print bat entry\n" + "-B print entire bat as a bitmap\n" + "-m blk print bitmap\n" + "-i sec test bitmap for logical sector\n" + "-e sec output extent list of allocated logical sectors\n" + "-a print batmap\n" + "-j blk test batmap for block\n" + "-d blk print data\n" + "-c num num units\n" + "-r sec read num sectors at sec\n" + "-R byte read num bytes at byte\n" + "-x print in hex\n"); + return EINVAL; +} diff --git a/tools/blktap3/vhd/lib/vhd-util-repair.c b/tools/blktap3/vhd/lib/vhd-util-repair.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/vhd/lib/vhd-util-repair.c @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2007, XenSource 
Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "libvhd.h" + +int +vhd_util_repair(int argc, char **argv) +{ + char *name; + int err, c; + vhd_context_t vhd; + + name = NULL; + + if (!argc || !argv) + goto usage; + + optind = 0; + while ((c = getopt(argc, argv, "n:h")) != -1) { + switch (c) { + case ''n'': + name = optarg; + break; + case ''h'': + default: + goto usage; + } + } + + if (!name || optind != argc) + goto usage; + + err = vhd_open(&vhd, name, VHD_OPEN_RDWR); + if (err) { + printf("error opening %s: %d\n", name, err); + return err; + } + + err = vhd_write_footer(&vhd, &vhd.footer); + if (err) + printf("error writing footer: %d\n", err); + + vhd_close(&vhd); + return err; + +usage: + printf("options: <-n name> [-h help]\n"); + return -EINVAL; +} diff --git a/tools/blktap3/vhd/lib/vhd-util-resize.c b/tools/blktap3/vhd/lib/vhd-util-resize.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/vhd/lib/vhd-util-resize.c @@ -0,0 +1,1200 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <syslog.h> +#include <inttypes.h> +#include <sys/mman.h> + +#include "libvhd-journal.h" + +#if 1 +#define DFPRINTF(_f, _a...) fprintf(stdout, _f, ##_a) +#else +#define DFPRINTF(_f, _a...) ((void)0) +#endif + +#define EPRINTF(_f, _a...) 
\ + do { \ + syslog(LOG_INFO, "%s: " _f, __func__, ##_a); \ + DFPRINTF(_f, _a); \ + } while (0) + +typedef struct vhd_block { + uint32_t block; + uint32_t offset; +} vhd_block_t; + +TEST_FAIL_EXTERN_VARS; + +static inline uint32_t +secs_to_blocks_down(vhd_context_t *vhd, uint64_t secs) +{ + return secs / vhd->spb; +} + +static uint32_t +secs_to_blocks_up(vhd_context_t *vhd, uint64_t secs) +{ + uint32_t blocks; + + blocks = secs / vhd->spb; + if (secs % vhd->spb) + blocks++; + + return blocks; +} + +static int +vhd_fixed_shrink(vhd_journal_t *journal, uint64_t secs) +{ + int err; + uint64_t new_eof; + vhd_context_t *vhd; + + vhd = &journal->vhd; + + new_eof = vhd->footer.curr_size - vhd_sectors_to_bytes(secs); + if (new_eof <= sizeof(vhd_footer_t)) + return -EINVAL; + + err = ftruncate(vhd->fd, new_eof); + if (err) + return errno; + + vhd->footer.curr_size = new_eof; + return vhd_write_footer(vhd, &vhd->footer); +} + +static int +vhd_write_zeros(vhd_journal_t *journal, off64_t off, uint64_t size) +{ + int err; + char *buf; + vhd_context_t *vhd; + uint64_t bytes, map; + + vhd = &journal->vhd; + map = MIN(size, VHD_BLOCK_SIZE); + + err = vhd_seek(vhd, off, SEEK_SET); + if (err) + return err; + + buf = mmap(0, map, PROT_READ, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (buf == MAP_FAILED) + return -errno; + + do { + bytes = MIN(size, map); + + err = vhd_write(vhd, buf, bytes); + if (err) + break; + + size -= bytes; + } while (size); + + munmap(buf, map); + + return err; +} + +static int +vhd_fixed_grow(vhd_journal_t *journal, uint64_t secs) +{ + int err; + vhd_context_t *vhd; + uint64_t size, eof, new_eof; + + size = vhd_sectors_to_bytes(secs); + vhd = &journal->vhd; + + err = vhd_seek(vhd, 0, SEEK_END); + if (err) + goto out; + + eof = vhd_position(vhd); + if (eof == (off64_t)-1) { + err = -errno; + goto out; + } + + err = vhd_write_zeros(journal, eof - sizeof(vhd_footer_t), size); + if (err) + goto out; + + new_eof = eof + size; + err = vhd_seek(vhd, new_eof, SEEK_SET); 
+ if (err) + goto out; + + vhd->footer.curr_size += size; + err = vhd_write_footer(vhd, &vhd->footer); + if (err) + goto out; + + err = 0; + +out: + return err; +} + +static int +vhd_fixed_resize(vhd_journal_t *journal, uint64_t size) +{ + int err; + vhd_context_t *vhd; + uint64_t cur_secs, new_secs; + + vhd = &journal->vhd; + cur_secs = vhd->footer.curr_size >> VHD_SECTOR_SHIFT; + new_secs = size << (20 - VHD_SECTOR_SHIFT); + + if (cur_secs == new_secs) + return 0; + else if (cur_secs > new_secs) + err = vhd_fixed_shrink(journal, cur_secs - new_secs); + else + err = vhd_fixed_grow(journal, new_secs - cur_secs); + + return err; +} + +static inline void +swap(vhd_block_t *list, int a, int b) +{ + vhd_block_t tmp; + + tmp = list[a]; + list[a] = list[b]; + list[b] = tmp; +} + +static int +partition(vhd_block_t *list, int left, int right, int pidx) +{ + int i, sidx; + long long pval; + + sidx = left; + pval = list[pidx].offset; + swap(list, pidx, right); + + for (i = left; i < right; i++) + if (list[i].offset >= pval) { + swap(list, sidx, i); + ++sidx; + } + + swap(list, right, sidx); + return sidx; +} + +static void +quicksort(vhd_block_t *list, int left, int right) +{ + int pidx, new_pidx; + + if (right < left) + return; + + pidx = left; + new_pidx = partition(list, left, right, pidx); + quicksort(list, left, new_pidx - 1); + quicksort(list, new_pidx + 1, right); +} + +static int +vhd_move_block(vhd_journal_t *journal, uint32_t src, off64_t offset) +{ + int err; + char *buf; + size_t size; + vhd_context_t *vhd; + off64_t off, src_off; + + buf = NULL; + vhd = &journal->vhd; + off = offset; + size = vhd_sectors_to_bytes(vhd->bm_secs); + src_off = vhd->bat.bat[src]; + + if (src_off == DD_BLK_UNUSED) + return -EINVAL; + src_off = vhd_sectors_to_bytes(src_off); + + err = vhd_journal_add_block(journal, src, + VHD_JOURNAL_DATA | VHD_JOURNAL_METADATA); + if (err) + goto out; + + err = vhd_read_bitmap(vhd, src, &buf); + if (err) + goto out; + + err = vhd_seek(vhd, off, 
SEEK_SET); + if (err) + goto out; + + err = vhd_write(vhd, buf, size); + if (err) + goto out; + + free(buf); + buf = NULL; + off += size; + size = vhd_sectors_to_bytes(vhd->spb); + + err = vhd_read_block(vhd, src, &buf); + if (err) + goto out; + + err = vhd_seek(vhd, off, SEEK_SET); + if (err) + goto out; + + err = vhd_write(vhd, buf, size); + if (err) + goto out; + + vhd->bat.bat[src] = offset >> VHD_SECTOR_SHIFT; + + err = vhd_write_zeros(journal, src_off, + vhd_sectors_to_bytes(vhd->bm_secs + vhd->spb)); + +out: + free(buf); + return err; +} + +static int +vhd_clobber_block(vhd_journal_t *journal, uint32_t src, uint32_t dest) +{ + int err; + off64_t off; + vhd_context_t *vhd; + + vhd = &journal->vhd; + off = vhd_sectors_to_bytes(vhd->bat.bat[dest]); + + err = vhd_journal_add_block(journal, dest, + VHD_JOURNAL_DATA | VHD_JOURNAL_METADATA); + if (err) + return err; + + err = vhd_move_block(journal, src, off); + if (err) + return err; + + vhd->bat.bat[dest] = DD_BLK_UNUSED; + + return 0; +} + +/* + * remove a list of blocks from the vhd file + * if a block to be removed: + * - resides at the end of the file: simply clear its bat entry + * - resides elsewhere: move the last block in the file into its position + * and update the bat to reflect this + */ +static int +vhd_defrag_shrink(vhd_journal_t *journal, + vhd_block_t *original_free_list, int free_cnt) +{ + vhd_context_t *vhd; + int i, j, free_idx, err; + vhd_block_t *blocks, *free_list; + + err = 0; + blocks = NULL; + free_list = NULL; + vhd = &journal->vhd; + + blocks = malloc(vhd->bat.entries * sizeof(vhd_block_t)); + if (!blocks) { + err = -ENOMEM; + goto out; + } + + free_list = malloc(free_cnt * sizeof(vhd_block_t)); + if (!free_list) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < vhd->bat.entries; i++) { + blocks[i].block = i; + blocks[i].offset = vhd->bat.bat[i]; + } + + memcpy(free_list, original_free_list, + free_cnt * sizeof(vhd_block_t)); + + /* sort both the to-free list and the bat list + * 
in order of descending file offset */ + quicksort(free_list, 0, free_cnt - 1); + quicksort(blocks, 0, vhd->bat.entries - 1); + + for (i = 0, free_idx = 0; + i < vhd->bat.entries && free_idx < free_cnt; i++) { + vhd_block_t *b = blocks + i; + + if (b->offset == DD_BLK_UNUSED) + continue; + + for (j = free_idx; j < free_cnt; j++) + if (b->block == free_list[j].block) { + /* the last block in the file is in the list of + * blocks to remove; no need to shuffle the + * data -- just clear the bat entry */ + vhd->bat.bat[free_list[j].block] = DD_BLK_UNUSED; + free_idx++; + continue; + } + + err = vhd_clobber_block(journal, b->block, + free_list[free_idx++].block); + if (err) + goto out; + } + + /* clear any bat entries for blocks we did not shuffle */ + for (i = free_idx; i < free_cnt; i++) + vhd->bat.bat[free_list[i].block] = DD_BLK_UNUSED; + +out: + free(blocks); + free(free_list); + + return err; +} + +static int +vhd_clear_bat_entries(vhd_journal_t *journal, uint32_t entries) +{ + int i, err; + vhd_context_t *vhd; + off64_t orig_map_off, new_map_off; + uint32_t orig_entries, new_entries; + + vhd = &journal->vhd; + orig_entries = vhd->header.max_bat_size; + new_entries = orig_entries - entries; + + if (vhd_has_batmap(vhd)) { + err = vhd_batmap_header_offset(vhd, &orig_map_off); + if (err) + return err; + } + + /* update header */ + vhd->header.max_bat_size = new_entries; + err = vhd_write_header(vhd, &vhd->header); + if (err) + return err; + + /* update footer */ + vhd->footer.curr_size = (uint64_t)new_entries * vhd->header.block_size; + vhd->footer.geometry = vhd_chs(vhd->footer.curr_size); + err = vhd_write_footer(vhd, &vhd->footer); + if (err) + return err; + + /* update bat -- we don''t reclaim space, just clear entries */ + for (i = new_entries; i < orig_entries; i++) + vhd->bat.bat[i] = 0; + + err = vhd_write_bat(vhd, &vhd->bat); + if (err) + return err; + + /* update this after write_bat so the end of the bat is zeored */ + vhd->bat.entries = new_entries; + + if 
(!vhd_has_batmap(vhd)) + return 0; + + /* zero out old batmap header if new header has moved */ + err = vhd_batmap_header_offset(vhd, &new_map_off); + if (err) + return err; + + if (orig_map_off != new_map_off) { + size_t size; + + size = vhd_bytes_padded(sizeof(struct dd_batmap_hdr)); + + err = vhd_write_zeros(journal, orig_map_off, size); + if (err) + return err; + } + + /* update batmap -- clear entries for freed blocks */ + for (i = new_entries; i < orig_entries; i++) + vhd_batmap_clear(vhd, &vhd->batmap, i); + + err = vhd_write_batmap(vhd, &vhd->batmap); + if (err) + return err; + + return 0; +} + +static int +vhd_dynamic_shrink(vhd_journal_t *journal, uint64_t secs) +{ + off64_t eof; + uint32_t blocks; + vhd_context_t *vhd; + int i, j, err, free_cnt; + struct vhd_block *free_list; + + printf("dynamic shrink not fully implemented\n"); + return -ENOSYS; + + eof = 0; + free_cnt = 0; + free_list = NULL; + vhd = &journal->vhd; + + blocks = secs_to_blocks_down(vhd, secs); + if (blocks == 0) + return 0; + + if (vhd_has_batmap(vhd)) { + err = vhd_get_batmap(vhd); + if (err) + return err; + } + + free_list = malloc(blocks * sizeof(struct vhd_block)); + if (!free_list) + return -ENOMEM; + + for (i = vhd->bat.entries - 1, j = 0; i >= 0 && j < blocks; i--, j++) { + uint32_t blk = vhd->bat.bat[i]; + + if (blk != DD_BLK_UNUSED) { + free_list[free_cnt].block = i; + free_list[free_cnt].offset = blk; + free_cnt++; + } + } + + if (free_cnt) { + err = vhd_defrag_shrink(journal, free_list, free_cnt); + if (err) + goto out; + } + + err = vhd_clear_bat_entries(journal, blocks); + if (err) + goto out; + + /* remove data beyond footer */ + err = vhd_end_of_data(vhd, &eof); + if (err) + goto out; + + err = ftruncate(vhd->fd, eof + sizeof(vhd_footer_t)); + if (err) { + err = -errno; + goto out; + } + + err = 0; + +out: + free(free_list); + return err; +} + +static inline void +vhd_first_data_block(vhd_context_t *vhd, vhd_block_t *block) +{ + int i; + uint32_t blk; + + memset(block, 0, 
sizeof(vhd_block_t)); + + for (i = 0; i < vhd->bat.entries; i++) { + blk = vhd->bat.bat[i]; + + if (blk != DD_BLK_UNUSED) { + if (!block->offset || blk < block->offset) { + block->block = i; + block->offset = blk; + } + } + } +} + +static inline uint32_t +vhd_next_block_offset(vhd_context_t *vhd) +{ + int i; + uint32_t blk, end, next; + + next = 0; + + for (i = 0; i < vhd->bat.entries; i++) { + blk = vhd->bat.bat[i]; + + if (blk != DD_BLK_UNUSED) { + end = blk + vhd->spb + vhd->bm_secs; + next = MAX(next, end); + } + } + + return next; +} + +static inline int +in_range(off64_t off, off64_t start, off64_t size) +{ + return (start < off && start + size > off); +} + +#define SKIP_HEADER 0x01 +#define SKIP_BAT 0x02 +#define SKIP_BATMAP 0x04 +#define SKIP_PLOC 0x08 +#define SKIP_DATA 0x10 + +static inline int +skip_check(int mode, int type) +{ + return mode & type; +} + +static int +vhd_check_for_clobber(vhd_context_t *vhd, off64_t off, int mode) +{ + int i, n; + char *msg; + size_t size; + vhd_block_t fb; + vhd_parent_locator_t *loc; + + msg = NULL; + + if (!vhd_type_dynamic(vhd)) + return 0; + + if (off < VHD_SECTOR_SIZE) { + msg = "backup footer"; + goto fail; + } + + if (!skip_check(mode, SKIP_HEADER)) + if (in_range(off, + vhd->footer.data_offset, sizeof(vhd_header_t))) { + msg = "header"; + goto fail; + } + + if (!skip_check(mode, SKIP_BAT)) + if (in_range(off, vhd->header.table_offset, + vhd_bytes_padded(vhd->header.max_bat_size * + sizeof(uint32_t)))) { + msg = "bat"; + goto fail; + } + + if (!skip_check(mode, SKIP_BATMAP)) + if (vhd_has_batmap(vhd) && + in_range(off, vhd->batmap.header.batmap_offset, + vhd_bytes_padded(vhd->batmap.header.batmap_size))) { + msg = "batmap"; + goto fail; + } + + if (!skip_check(mode, SKIP_PLOC)) { + n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t); + for (i = 0; i < n; i++) { + loc = vhd->header.loc + i; + if (loc->code == PLAT_CODE_NONE) + continue; + + size = vhd_parent_locator_size(loc); + if (in_range(off, 
loc->data_offset, size)) { + msg = "parent locator"; + goto fail; + } + } + } + + if (!skip_check(mode, SKIP_DATA)) { + vhd_first_data_block(vhd, &fb); + if (fb.offset && in_range(off, + vhd_sectors_to_bytes(fb.offset), + VHD_BLOCK_SIZE)) { + msg = "data block"; + goto fail; + } + } + + return 0; + +fail: + EPRINTF("write to 0x%08"PRIx64" would clobber %s\n", off, msg); + return -EINVAL; +} + +/* + * take any metadata after the bat (@eob) and shift it + */ +static int +vhd_shift_metadata(vhd_journal_t *journal, off64_t eob, + size_t bat_needed, size_t map_needed) +{ + int i, n, err; + vhd_context_t *vhd; + size_t size_needed; + void *buf; + char **locators; + vhd_parent_locator_t *loc; + + vhd = &journal->vhd; + size_needed = bat_needed + map_needed; + + n = sizeof(vhd->header.loc) / sizeof(vhd_parent_locator_t); + + locators = calloc(n, sizeof(char *)); + if (!locators) + return -ENOMEM; + + for (i = 0; i < n; i++) { + size_t size; + + loc = vhd->header.loc + i; + if (loc->code == PLAT_CODE_NONE) + continue; + + if (loc->data_offset < eob) + continue; + + size = vhd_parent_locator_size(loc); + err = posix_memalign(&buf, VHD_SECTOR_SIZE, size); + if (err) { + err = -err; + buf = NULL; + goto out; + } + + err = vhd_seek(vhd, loc->data_offset, SEEK_SET); + if (err) + goto out; + + err = vhd_read(vhd, buf, size); + if (err) + goto out; + + locators[i] = buf; + } + + for (i = 0; i < n; i++) { + off64_t off; + size_t size; + + if (!locators[i]) + continue; + + loc = vhd->header.loc + i; + off = loc->data_offset + size_needed; + size = vhd_parent_locator_size(loc); + + if (vhd_check_for_clobber(vhd, off + size, SKIP_PLOC)) { + EPRINTF("%s: shifting locator %d would clobber data\n", + vhd->file, i); + return -EINVAL; + } + + err = vhd_seek(vhd, off, SEEK_SET); + if (err) + goto out; + + err = vhd_write(vhd, locators[i], size); + if (err) + goto out; + + free(locators[i]); + locators[i] = NULL; + loc->data_offset = off; + + /* write the new header after writing the new bat 
*/ + } + + if (vhd_has_batmap(vhd) && vhd->batmap.header.batmap_offset > eob) { + vhd->batmap.header.batmap_offset += bat_needed; + + /* write the new batmap after writing the new bat */ + } + + err = 0; + +out: + for (i = 0; i < n; i++) + free(locators[i]); + free(locators); + + return err; +} + +static int +vhd_add_bat_entries(vhd_journal_t *journal, int entries) +{ + int i, err; + off64_t off; + vhd_bat_t new_bat; + vhd_context_t *vhd; + uint32_t new_entries; + vhd_batmap_t new_batmap; + uint64_t bat_size, new_bat_size, map_size, new_map_size; + void *bat, *map; + + vhd = &journal->vhd; + new_entries = vhd->header.max_bat_size + entries; + + bat_size = vhd_bytes_padded(vhd->header.max_bat_size * + sizeof(uint32_t)); + new_bat_size = vhd_bytes_padded(new_entries * sizeof(uint32_t)); + + map_size = vhd_bytes_padded((vhd->header.max_bat_size + 7) >> 3); + new_map_size = vhd_bytes_padded((new_entries + 7) >> 3); + + off = vhd->header.table_offset + new_bat_size; + if (vhd_check_for_clobber(vhd, off, SKIP_BAT | SKIP_BATMAP)) { + EPRINTF("%s: writing new bat of 0x%"PRIx64" bytes " + "at 0x%08"PRIx64" would clobber data\n", + vhd->file, new_bat_size, vhd->header.table_offset); + return -EINVAL; + } + + if (vhd_has_batmap(vhd)) { + off = vhd->batmap.header.batmap_offset + new_map_size; + if (vhd_check_for_clobber(vhd, off, 0)) { + EPRINTF("%s: writing new batmap of 0x%"PRIx64" bytes" + " at 0x%08"PRIx64" would clobber data\n", vhd->file, + new_map_size, vhd->batmap.header.batmap_offset); + return -EINVAL; + } + } + + /* update header */ + vhd->header.max_bat_size = new_entries; + err = vhd_write_header(vhd, &vhd->header); + if (err) + return err; + + /* allocate new bat */ + err = posix_memalign(&bat, VHD_SECTOR_SIZE, new_bat_size); + if (err) + return -err; + + new_bat.bat = bat; + new_bat.spb = vhd->bat.spb; + new_bat.entries = new_entries; + memcpy(new_bat.bat, vhd->bat.bat, bat_size); + for (i = vhd->bat.entries; i < new_entries; i++) + new_bat.bat[i] = 
DD_BLK_UNUSED; + + /* write new bat */ + err = vhd_write_bat(vhd, &new_bat); + if (err) { + free(new_bat.bat); + return err; + } + + /* update in-memory bat */ + free(vhd->bat.bat); + vhd->bat = new_bat; + + if (!vhd_has_batmap(vhd)) + return 0; + + /* allocate new batmap */ + err = posix_memalign(&map, VHD_SECTOR_SIZE, new_map_size); + if (err) + return err; + + new_batmap.map = map; + new_batmap.header = vhd->batmap.header; + new_batmap.header.batmap_size = secs_round_up_no_zero(new_map_size); + memcpy(new_batmap.map, vhd->batmap.map, map_size); + memset(new_batmap.map + map_size, 0, new_map_size - map_size); + + /* write new batmap */ + err = vhd_write_batmap(vhd, &new_batmap); + if (err) { + free(new_batmap.map); + return err; + } + + /* update in-memory batmap */ + free(vhd->batmap.map); + vhd->batmap = new_batmap; + + /* update footer */ + vhd->footer.curr_size = (uint64_t)new_entries * vhd->header.block_size; + vhd->footer.geometry = vhd_chs(vhd->footer.curr_size); + vhd->footer.checksum = vhd_checksum_footer(&vhd->footer); + err = vhd_write_footer(vhd, &vhd->footer); + if (err) + return err; + + return 0; +} + +static int +vhd_dynamic_grow(vhd_journal_t *journal, uint64_t secs) +{ + int err; + off64_t eob, eom; + vhd_context_t *vhd; + vhd_block_t first_block; + uint64_t blocks, size_needed; + uint64_t bat_needed, bat_size, bat_avail, bat_bytes, bat_secs; + uint64_t map_needed, map_size, map_avail, map_bytes, map_secs; + + vhd = &journal->vhd; + + size_needed = 0; + bat_needed = 0; + map_needed = 0; + + /* number of vhd blocks to add */ + blocks = secs_to_blocks_up(vhd, secs); + + /* size in bytes needed for new bat entries */ + bat_needed = blocks * sizeof(uint32_t); + map_needed = (blocks >> 3) + 1; + + /* available bytes in current bat */ + bat_bytes = vhd->header.max_bat_size * sizeof(uint32_t); + bat_secs = secs_round_up_no_zero(bat_bytes); + bat_size = vhd_sectors_to_bytes(bat_secs); + bat_avail = bat_size - bat_bytes; + + if (vhd_has_batmap(vhd)) { + 
/* avaliable bytes in current batmap */ + map_bytes = (vhd->header.max_bat_size + 7) >> 3; + map_secs = vhd->batmap.header.batmap_size; + map_size = vhd_sectors_to_bytes(map_secs); + map_avail = map_size - map_bytes; + } else { + map_needed = 0; + map_avail = 0; + } + + /* we have enough space already; just extend the bat */ + if (bat_needed <= bat_avail && map_needed <= map_avail) + goto add_entries; + + /* we need to add new sectors to the bat */ + if (bat_needed > bat_avail) { + bat_needed -= bat_avail; + bat_needed = vhd_bytes_padded(bat_needed); + } else + bat_needed = 0; + + /* we need to add new sectors to the batmap */ + if (map_needed > map_avail) { + map_needed -= map_avail; + map_needed = vhd_bytes_padded(map_needed); + } else + map_needed = 0; + + /* how many additional bytes do we need? */ + size_needed = bat_needed + map_needed; + + /* calculate space between end of headers and beginning of data */ + err = vhd_end_of_headers(vhd, &eom); + if (err) + return err; + + eob = vhd->header.table_offset + vhd_sectors_to_bytes(bat_secs); + vhd_first_data_block(vhd, &first_block); + + /* no blocks allocated; just shift post-bat metadata */ + if (!first_block.offset) + goto shift_metadata; + + /* + * not enough space -- + * move vhd data blocks to the end of the file to make room + */ + do { + off64_t new_off, bm_size, gap_size; + + new_off = vhd_sectors_to_bytes(vhd_next_block_offset(vhd)); + + /* data region of segment should begin on page boundary */ + bm_size = vhd_sectors_to_bytes(vhd->bm_secs); + if ((new_off + bm_size) % 4096) { + gap_size = 4096 - ((new_off + bm_size) % 4096); + + err = vhd_write_zeros(journal, new_off, gap_size); + if (err) + return err; + + new_off += gap_size; + } + + err = vhd_move_block(journal, first_block.block, new_off); + if (err) + return err; + + vhd_first_data_block(vhd, &first_block); + + } while (eom + size_needed >= vhd_sectors_to_bytes(first_block.offset)); + + TEST_FAIL_AT(FAIL_RESIZE_DATA_MOVED); + +shift_metadata: + /* 
shift any metadata after the bat to make room for new bat sectors */ + err = vhd_shift_metadata(journal, eob, bat_needed, map_needed); + if (err) + return err; + + TEST_FAIL_AT(FAIL_RESIZE_METADATA_MOVED); + +add_entries: + return vhd_add_bat_entries(journal, blocks); +} + +static int +vhd_dynamic_resize(vhd_journal_t *journal, uint64_t size) +{ + int err; + vhd_context_t *vhd; + uint64_t cur_secs, new_secs; + + vhd = &journal->vhd; + cur_secs = vhd->footer.curr_size >> VHD_SECTOR_SHIFT; + new_secs = size << (20 - VHD_SECTOR_SHIFT); + + if (cur_secs == new_secs) + return 0; + + err = vhd_get_header(vhd); + if (err) + return err; + + err = vhd_get_bat(vhd); + if (err) + return err; + + if (vhd_has_batmap(vhd)) { + err = vhd_get_batmap(vhd); + if (err) + return err; + } + + if (cur_secs > new_secs) + err = vhd_dynamic_shrink(journal, cur_secs - new_secs); + else + err = vhd_dynamic_grow(journal, new_secs - cur_secs); + + return err; +} + +static int +vhd_util_resize_check_creator(const char *name) +{ + int err; + vhd_context_t vhd; + + err = vhd_open(&vhd, name, VHD_OPEN_RDONLY | VHD_OPEN_STRICT); + if (err) { + printf("error opening %s: %d\n", name, err); + return err; + } + + if (!vhd_creator_tapdisk(&vhd)) { + printf("%s not created by xen; resize not supported\n", name); + err = -EINVAL; + } + + vhd_close(&vhd); + return err; +} + +static int +vhd_dynamic_grow_fast(const char *name, uint64_t bytes) +{ + vhd_context_t vhd; + uint64_t blks, size; + int err; + + err = vhd_open(&vhd, name, VHD_OPEN_RDWR); + if (err) + return err; + + err = vhd_get_bat(&vhd); + if (err) + goto done; + + if (vhd_has_batmap(&vhd)) { + err = vhd_get_batmap(&vhd); + if (err) + goto done; + } + + blks = (bytes + VHD_BLOCK_SIZE - 1) >> VHD_BLOCK_SHIFT; + size = blks << VHD_BLOCK_SHIFT; + if (size < vhd.footer.curr_size) { + printf("%s: size (%"PRIu64") < curr size (%"PRIu64")\n", + name, size, vhd.footer.curr_size); + err = -EINVAL; + goto done; + } + if (size == vhd.footer.curr_size) + 
goto done; + + err = vhd_set_virt_size(&vhd, size); + +done: + vhd_close(&vhd); + return err; +} + +int +vhd_util_resize(int argc, char **argv) +{ + char *name, *jname; + uint64_t size; + int fast, c, err, jerr; + vhd_journal_t journal; + vhd_context_t *vhd; + + err = -EINVAL; + size = 0; + name = NULL; + jname = NULL; + fast = 0; + + optind = 0; + while ((c = getopt(argc, argv, "n:s:j:fh")) != -1) { + switch (c) { + case ''n'': + name = optarg; + break; + case ''j'': + jname = optarg; + break; + case ''f'': + fast = 1; + break; + case ''s'': + err = 0; + size = strtoull(optarg, NULL, 10); + break; + case ''h'': + default: + goto usage; + } + } + + if (err || !name || (!jname && !fast) || argc != optind) + goto usage; + + if (jname && fast) + goto usage; + + err = vhd_util_resize_check_creator(name); + if (err) + return err; + + libvhd_set_log_level(1); + + if (fast) + return vhd_dynamic_grow_fast(name, size << 20); + + err = vhd_journal_create(&journal, name, jname); + if (err) { + printf("creating journal failed: %d\n", err); + return err; + } + + vhd = &journal.vhd; + + err = vhd_get_footer(vhd); + if (err) + goto out; + + TEST_FAIL_AT(FAIL_RESIZE_BEGIN); + + if (vhd_type_dynamic(vhd)) + err = vhd_dynamic_resize(&journal, size); + else + err = vhd_fixed_resize(&journal, size); + + TEST_FAIL_AT(FAIL_RESIZE_END); + +out: + if (err) { + printf("resize failed: %d\n", err); + jerr = vhd_journal_revert(&journal); + } else + jerr = vhd_journal_commit(&journal); + + if (jerr) { + printf("closing journal failed: %d\n", jerr); + vhd_journal_close(&journal); + } else + vhd_journal_remove(&journal); + + return (err ? : jerr); + +usage: + printf("options: <-n name> <-s size (in MB)> (<-j journal>|<-f fast>) " + "[-h help]\n\n" + "The resize operation can only be performed offline " + "and must be journaled because resizing the metadata " + "might require moving data blocks. 
However, if a " + "VHD was created with -S <msize> option (during " + "vhd-util create/snapshot), which preallocates the " + "metadata for growing the VHD up to size <msize>, then " + "resizing such a VHD up to <msize> can be performed " + "online without journaling (-f option).\n"); + return -EINVAL; +} diff --git a/tools/blktap3/vhd/lib/vhd-util-revert.c b/tools/blktap3/vhd/lib/vhd-util-revert.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/vhd/lib/vhd-util-revert.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER
+ * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <errno.h>
+//#include <fcntl.h>
+#include <stdio.h>
+//#include <stdlib.h>
+#include <unistd.h>
+
+#include "libvhd.h"
+#include "libvhd-journal.h"
+
+/*
+ * Command-line handler that opens a VHD journal, reverts it and then
+ * removes it.
+ *
+ * Options: -n <name> and -j <journal> are both required and are handed
+ * to vhd_journal_open() in that order.  -h, an unrecognised option, a
+ * missing required option or a leftover positional argument prints the
+ * usage line instead of doing any work.
+ *
+ * Returns 0 on success, -EINVAL after printing usage, and otherwise the
+ * error code returned by the journal call (open, revert or remove) that
+ * failed.  A message naming the failed step is printed with printf().
+ */
+int
+vhd_util_revert(int argc, char **argv)
+{
+    char *name = NULL;
+    char *jname = NULL;
+    vhd_journal_t journal;
+    int c, err;
+
+    /*
+     * Restart option scanning from scratch.  With glibc, 0 (rather than
+     * the traditional 1) also re-initialises getopt()'s internal state.
+     */
+    optind = 0;
+    while ((c = getopt(argc, argv, "n:j:h")) != -1) {
+        switch (c) {
+        case 'n':
+            name = optarg;
+            break;
+        case 'j':
+            jname = optarg;
+            break;
+        case 'h':
+        default:
+            goto usage;
+        }
+    }
+
+    /* both options are mandatory and no positional arguments are accepted */
+    if (!name || !jname || argc != optind)
+        goto usage;
+
+    libvhd_set_log_level(1);
+
+    err = vhd_journal_open(&journal, name, jname);
+    if (err) {
+        printf("opening journal failed: %d\n", err);
+        return err;
+    }
+
+    err = vhd_journal_revert(&journal);
+    if (err) {
+        printf("reverting journal failed: %d\n", err);
+        goto fail;
+    }
+
+    err = vhd_journal_remove(&journal);
+    if (err) {
+        printf("removing journal failed: %d\n", err);
+        goto fail;
+    }
+
+    return 0;
+
+fail:
+    /* revert or remove failed: close the journal before reporting the error */
+    vhd_journal_close(&journal);
+    return err;
+
+usage:
+    printf("options: <-n name> <-j journal> [-h help]\n");
+    return -EINVAL;
+}
diff --git a/tools/blktap3/vhd/lib/vhd-util-scan.c b/tools/blktap3/vhd/lib/vhd-util-scan.c
new file mode 100644
--- /dev/null
+++ b/tools/blktap3/vhd/lib/vhd-util-scan.c
@@ -0,0 +1,1372 @@
+/*
+ * Copyright (c) 2007, XenSource Inc.
+ * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <glob.h> +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <unistd.h> +#include <fnmatch.h> +#include <limits.h> +#include <libgen.h> +#include <syslog.h> +#include <sys/stat.h> +#include <sys/types.h> + +#include "libvhd.h" +#include "lvm-util.h" + +#define VHD_SCAN_FAST 0x01 +#define VHD_SCAN_PRETTY 0x02 +#define VHD_SCAN_VOLUME 0x04 +#define VHD_SCAN_NOFAIL 0x08 +#define VHD_SCAN_VERBOSE 0x10 +#define VHD_SCAN_PARENTS 0x20 +#define VHD_SCAN_MARKERS 0x40 + +#define VHD_TYPE_RAW_FILE 0x01 +#define VHD_TYPE_VHD_FILE 0x02 +#define VHD_TYPE_RAW_VOLUME 0x04 +#define VHD_TYPE_VHD_VOLUME 0x08 + +#define EPRINTF(_f, _a...) \ + do { \ + syslog(LOG_INFO, "%s: " _f, __func__, ##_a); \ + } while (0) + +static inline int +target_volume(uint8_t type) +{ + return (type == VHD_TYPE_RAW_VOLUME || type == VHD_TYPE_VHD_VOLUME); +} + +static inline int +target_vhd(uint8_t type) +{ + return (type == VHD_TYPE_VHD_FILE || type == VHD_TYPE_VHD_VOLUME); +} + +struct target { + char name[VHD_MAX_NAME_LEN]; + char device[VHD_MAX_NAME_LEN]; + uint64_t size; + uint64_t start; + uint64_t end; + uint8_t type; +}; + +struct iterator { + int cur; + int cur_size; + int max_size; + struct target *targets; +}; + +TAILQ_HEAD(tqh_vhd_image, vhd_image); + +struct vhd_image { + char *name; + char *parent; + uint64_t capacity; + off64_t size; + uint8_t hidden; + char marker; + int error; + char *message; + + struct target *target; + + TAILQ_ENTRY(vhd_image) sibling; + struct tqh_vhd_image children; + struct vhd_image *parent_image; +}; + +struct vhd_scan { + int cur; + int size; + + int lists_cur; + int lists_size; + + struct vhd_image **images; + struct vhd_image **lists; +}; + +static int flags; +static struct vg vg; +static struct vhd_scan scan; + +static int +vhd_util_scan_pretty_allocate_list(int cnt) +{ + int i; + + memset(&scan, 0, sizeof(scan)); + + scan.lists_cur = 1; + scan.lists_size = 10; + + scan.lists = 
calloc(scan.lists_size, sizeof(struct vhd_image *)); + if (!scan.lists) + goto fail; + + scan.lists[0] = calloc(cnt, sizeof(struct vhd_image)); + if (!scan.lists[0]) + goto fail; + + scan.images = calloc(cnt, sizeof(struct vhd_image *)); + if (!scan.images) + goto fail; + + for (i = 0; i < cnt; i++) + scan.images[i] = scan.lists[0] + i; + + scan.cur = 0; + scan.size = cnt; + + return 0; + +fail: + if (scan.lists) { + free(scan.lists[0]); + free(scan.lists); + } + + free(scan.images); + memset(&scan, 0, sizeof(scan)); + return -ENOMEM; +} + +static void +vhd_util_scan_pretty_free_list(void) +{ + int i; + + if (scan.lists) { + for (i = 0; i < scan.lists_cur; i++) + free(scan.lists[i]); + free(scan.lists); + } + + free(scan.images); + memset(&scan, 0, sizeof(scan)); +} + +static int +vhd_util_scan_pretty_add_image(struct vhd_image *image) +{ + int i; + struct vhd_image *img; + + for (i = 0; i < scan.cur; i++) { + img = scan.images[i]; + if (!strcmp(img->name, image->name)) + return 0; + } + + if (scan.cur >= scan.size) { + struct vhd_image *new, **list; + + if (scan.lists_cur >= scan.lists_size) { + list = realloc(scan.lists, scan.lists_size * 2 * + sizeof(struct vhd_image *)); + if (!list) + return -ENOMEM; + + scan.lists_size *= 2; + scan.lists = list; + } + + new = calloc(scan.size, sizeof(struct vhd_image)); + if (!new) + return -ENOMEM; + + scan.lists[scan.lists_cur++] = new; + scan.size *= 2; + + list = realloc(scan.images, scan.size * + sizeof(struct vhd_image *)); + if (!list) + return -ENOMEM; + + scan.images = list; + for (i = 0; i + scan.cur < scan.size; i++) + scan.images[i + scan.cur] = new + i; + } + + img = scan.images[scan.cur]; + TAILQ_INIT(&img->children); + + img->capacity = image->capacity; + img->size = image->size; + img->hidden = image->hidden; + img->marker = image->marker; + img->error = image->error; + img->message = image->message; + + img->name = strdup(image->name); + if (!img->name) + goto fail; + + if (image->parent) { + img->parent = 
strdup(image->parent); + if (!img->parent) + goto fail; + } + + scan.cur++; + return 0; + +fail: + free(img->name); + free(img->parent); + memset(img, 0, sizeof(*img)); + return -ENOMEM; +} + +static int +vhd_util_scan_pretty_image_compare(const void *lhs, const void *rhs) +{ + struct vhd_image *l, *r; + + l = *(struct vhd_image **)lhs; + r = *(struct vhd_image **)rhs; + + return strcmp(l->name, r->name); +} + +static void +vhd_util_scan_print_image_indent(struct vhd_image *image, int tab) +{ + char *pad, *name, *pmsg, *parent; + + pad = (tab ? " " : ""); + name = image->name; + parent = (image->parent ? : "none"); + + if ((flags & VHD_SCAN_PRETTY) && image->parent && !image->parent_image) + pmsg = " (not found in scan)"; + else + pmsg = ""; + + if (!(flags & VHD_SCAN_VERBOSE)) { + name = basename(image->name); + if (image->parent) + parent = basename(image->parent); + } + + if (image->error) + printf("%*svhd=%s scan-error=%d error-message=''%s''\n", + tab, pad, image->name, image->error, image->message); + else if (!(flags & VHD_SCAN_MARKERS)) + printf("%*svhd=%s capacity=%"PRIu64" size=%"PRIu64" hidden=%u " + "parent=%s%s\n", tab, pad, name, image->capacity, + image->size, image->hidden, parent, pmsg); + else + printf("%*svhd=%s capacity=%"PRIu64" size=%"PRIu64" hidden=%u " + "marker=%u parent=%s%s\n", tab, pad, name, + image->capacity, image->size, image->hidden, + (uint8_t)image->marker, parent, pmsg); +} + +static void +vhd_util_scan_pretty_print_tree(struct vhd_image *image, int depth) +{ + struct vhd_image *img, *tmp; + + vhd_util_scan_print_image_indent(image, depth * 3); + + TAILQ_FOREACH_SAFE(img, &image->children, sibling, tmp) + if (!img->hidden) + vhd_util_scan_pretty_print_tree(img, depth + 1); + + TAILQ_FOREACH_SAFE(img, &image->children, sibling, tmp) + if (img->hidden) + vhd_util_scan_pretty_print_tree(img, depth + 1); + + free(image->name); + free(image->parent); + + image->name = NULL; + image->parent = NULL; +} + +static void 
+vhd_util_scan_pretty_print_images(void) +{ + int i; + struct vhd_image *image, **parentp, *parent, *keyp, key; + + qsort(scan.images, scan.cur, sizeof(scan.images[0]), + vhd_util_scan_pretty_image_compare); + + for (i = 0; i < scan.cur; i++) { + image = scan.images[i]; + + if (!image->parent) { + image->parent_image = NULL; + continue; + } + + memset(&key, 0, sizeof(key)); + key.name = image->parent; + keyp = &key; + + parentp = bsearch(&keyp, scan.images, scan.cur, + sizeof(scan.images[0]), + vhd_util_scan_pretty_image_compare); + if (!parentp) { + image->parent_image = NULL; + continue; + } + + parent = *parentp; + image->parent_image = parent; + TAILQ_INSERT_TAIL(&parent->children, image, sibling); + } + + for (i = 0; i < scan.cur; i++) { + image = scan.images[i]; + + if (image->parent_image || !image->hidden) + continue; + + vhd_util_scan_pretty_print_tree(image, 0); + } + + for (i = 0; i < scan.cur; i++) { + image = scan.images[i]; + + if (!image->name || image->parent_image) + continue; + + vhd_util_scan_pretty_print_tree(image, 0); + } + + for (i = 0; i < scan.cur; i++) { + image = scan.images[i]; + + if (!image->name) + continue; + + vhd_util_scan_pretty_print_tree(image, 0); + } +} + +static void +vhd_util_scan_print_image(struct vhd_image *image) +{ + int err; + + if (!image->error && (flags & VHD_SCAN_PRETTY)) { + err = vhd_util_scan_pretty_add_image(image); + if (!err) + return; + + if (!image->error) { + image->error = err; + image->message = "allocating memory"; + } + } + + vhd_util_scan_print_image_indent(image, 0); +} + +static int +vhd_util_scan_error(const char *file, int err) +{ + struct vhd_image image; + + memset(&image, 0, sizeof(image)); + image.name = (char *)file; + image.error = err; + image.message = "failure scanning target"; + + vhd_util_scan_print_image(&image); + + /* + if (flags & VHD_SCAN_NOFAIL) + return 0; + */ + + return err; +} + +static vhd_parent_locator_t * +vhd_util_scan_get_parent_locator(vhd_context_t *vhd) +{ + int i; + 
vhd_parent_locator_t *loc; + + loc = NULL; + + for (i = 0; i < 8; i++) { + if (vhd->header.loc[i].code == PLAT_CODE_MACX) { + loc = vhd->header.loc + i; + break; + } + + if (vhd->header.loc[i].code == PLAT_CODE_W2RU) + loc = vhd->header.loc + i; + + if (!loc && vhd->header.loc[i].code != PLAT_CODE_NONE) + loc = vhd->header.loc + i; + } + + return loc; +} + +static inline int +copy_name(char *dst, const char *src) +{ + if (snprintf(dst, VHD_MAX_NAME_LEN, "%s", src) < VHD_MAX_NAME_LEN) + return 0; + + return -ENAMETOOLONG; +} + +/* + * LVHD stores realpath(parent) in parent locators, so + * /dev/<vol-group>/<lv-name> becomes /dev/mapper/<vol--group>-<lv--name> + */ +static int +vhd_util_scan_extract_volume_name(char *dst, const char *src) +{ + char copy[VHD_MAX_NAME_LEN], *name, *s, *c; + + name = strrchr(src, ''/''); + if (!name) + name = (char *)src; + + /* convert single dashes to slashes, double dashes to single dashes */ + for (c = copy, s = name; *s != ''\0''; s++, c++) { + if (*s == ''-'') { + if (s[1] != ''-'') + *c = ''/''; + else { + s++; + *c = ''-''; + } + } else + *c = *s; + } + + *c = ''\0''; + c = strrchr(copy, ''/''); + if (c == name) { + /* unrecognized format */ + strcpy(dst, src); + return -EINVAL; + } + + strcpy(dst, ++c); + return 0; +} + +static int +vhd_util_scan_get_volume_parent(vhd_context_t *vhd, struct vhd_image *image) +{ + int err; + char name[VHD_MAX_NAME_LEN]; + vhd_parent_locator_t *loc, copy; + + if (flags & VHD_SCAN_FAST) { + err = vhd_header_decode_parent(vhd, + &vhd->header, &image->parent); + if (!err) + goto found; + } + + loc = vhd_util_scan_get_parent_locator(vhd); + if (!loc) + return -EINVAL; + + copy = *loc; + copy.data_offset += image->target->start; + err = vhd_parent_locator_read(vhd, &copy, &image->parent); + if (err) + return err; + +found: + err = vhd_util_scan_extract_volume_name(name, image->parent); + if (!err) + return copy_name(image->parent, name); + + return 0; +} + +static int 
+vhd_util_scan_get_parent(vhd_context_t *vhd, struct vhd_image *image) +{ + int err; + vhd_parent_locator_t *loc; + + if (!target_vhd(image->target->type)) { + image->parent = NULL; + return 0; + } + + loc = NULL; + + if (target_volume(image->target->type)) + return vhd_util_scan_get_volume_parent(vhd, image); + + if (flags & VHD_SCAN_FAST) { + err = vhd_header_decode_parent(vhd, + &vhd->header, &image->parent); + if (!err) + return 0; + } else { + /* + * vhd_parent_locator_get checks for the existence of the + * parent file. if this call succeeds, all is well; if not, + * we''ll try to return whatever string we have before failing + * outright. + */ + err = vhd_parent_locator_get(vhd, &image->parent); + if (!err) + return 0; + } + + loc = vhd_util_scan_get_parent_locator(vhd); + if (!loc) + return -EINVAL; + + return vhd_parent_locator_read(vhd, loc, &image->parent); +} + +static int +vhd_util_scan_get_hidden(vhd_context_t *vhd, struct vhd_image *image) +{ + int err, hidden; + + err = 0; + hidden = 0; + + if (target_vhd(image->target->type)) + err = vhd_hidden(vhd, &hidden); + else + hidden = 1; + + if (err) + return err; + + image->hidden = hidden; + return 0; +} + +static int +vhd_util_scan_get_marker(vhd_context_t *vhd, struct vhd_image *image) +{ + int err; + char marker; + + err = 0; + marker = 0; + + if (target_vhd(image->target->type) && vhd_has_batmap(vhd)) + err = vhd_marker(vhd, &marker); + + image->marker = marker; + return err; +} + +static int +vhd_util_scan_get_size(vhd_context_t *vhd, struct vhd_image *image) +{ + image->size = image->target->size; + + if (target_vhd(image->target->type)) + image->capacity = vhd->footer.curr_size; + else + image->capacity = image->size; + + return 0; +} + +static int +vhd_util_scan_open_file(vhd_context_t *vhd, struct vhd_image *image) +{ + int err, vhd_flags; + + if (!target_vhd(image->target->type)) + return 0; + + vhd_flags = VHD_OPEN_RDONLY | VHD_OPEN_IGNORE_DISABLED; + if (flags & VHD_SCAN_FAST) + vhd_flags |= 
VHD_OPEN_FAST; + + err = vhd_open(vhd, image->name, vhd_flags); + if (err) { + vhd->file = NULL; + image->message = "opening file"; + image->error = err; + return image->error; + } + + return 0; +} + +static int +vhd_util_scan_read_volume_headers(vhd_context_t *vhd, struct vhd_image *image) +{ + int err; + void *buf; + size_t size; + struct target *target; + + buf = NULL; + target = image->target; + size = sizeof(vhd_footer_t) + sizeof(vhd_header_t); + + err = posix_memalign(&buf, VHD_SECTOR_SIZE, size); + if (err) { + buf = NULL; + image->message = "allocating image"; + image->error = -err; + goto out; + } + + err = vhd_seek(vhd, target->start, SEEK_SET); + if (err) { + image->message = "seeking to headers"; + image->error = err; + goto out; + } + + err = vhd_read(vhd, buf, size); + if (err) { + image->message = "reading headers"; + image->error = err; + goto out; + } + + memcpy(&vhd->footer, buf, sizeof(vhd_footer_t)); + vhd_footer_in(&vhd->footer); + err = vhd_validate_footer(&vhd->footer); + if (err) { + image->message = "invalid footer"; + image->error = err; + goto out; + } + + /* lvhd vhds should always be dynamic */ + if (vhd_type_dynamic(vhd)) { + if (vhd->footer.data_offset != sizeof(vhd_footer_t)) + err = vhd_read_header_at(vhd, &vhd->header, + vhd->footer.data_offset + + target->start); + else { + memcpy(&vhd->header, + buf + sizeof(vhd_footer_t), + sizeof(vhd_header_t)); + vhd_header_in(&vhd->header); + err = vhd_validate_header(&vhd->header); + } + + if (err) { + image->message = "reading header"; + image->error = err; + goto out; + } + + vhd->spb = vhd->header.block_size >> VHD_SECTOR_SHIFT; + vhd->bm_secs = secs_round_up_no_zero(vhd->spb >> 3); + } + +out: + free(buf); + return image->error; +} + +static int +vhd_util_scan_open_volume(vhd_context_t *vhd, struct vhd_image *image) +{ + struct target *target; + + target = image->target; + memset(vhd, 0, sizeof(*vhd)); + vhd->oflags = VHD_OPEN_RDONLY | VHD_OPEN_FAST; + + if (target->end - target->start 
< 4096) { + image->message = "device too small"; + image->error = -EINVAL; + return image->error; + } + + vhd->file = strdup(image->name); + if (!vhd->file) { + image->message = "allocating device"; + image->error = -ENOMEM; + return image->error; + } + + vhd->fd = open(target->device, O_RDONLY | O_DIRECT | O_LARGEFILE); + if (vhd->fd == -1) { + free(vhd->file); + vhd->file = NULL; + + image->message = "opening device"; + image->error = -errno; + return image->error; + } + + if (target_vhd(target->type)) + return vhd_util_scan_read_volume_headers(vhd, image); + + return 0; +} + +static int +vhd_util_scan_open(vhd_context_t *vhd, struct vhd_image *image) +{ + struct target *target; + + target = image->target; + + if (target_volume(image->target->type) || !(flags & VHD_SCAN_PRETTY)) + image->name = target->name; + else { + char __image_name[PATH_MAX]; + + image->name = realpath(target->name, __image_name); + if (image->name) + image->name = strdup(__image_name); + if (!image->name) { + image->name = target->name; + image->message = "resolving name"; + image->error = -errno; + return image->error; + } + } + + if (target_volume(target->type)) + return vhd_util_scan_open_volume(vhd, image); + else + return vhd_util_scan_open_file(vhd, image); +} + +static int +vhd_util_scan_init_file_target(struct target *target, + const char *file, uint8_t type) +{ + int err; + struct stat stats; + + err = stat(file, &stats); + if (err == -1) + return -errno; + + err = copy_name(target->name, file); + if (err) + return err; + + err = copy_name(target->device, file); + if (err) + return err; + + target->type = type; + target->start = 0; + target->size = stats.st_size; + target->end = stats.st_size; + + return 0; +} + +static int +vhd_util_scan_init_volume_target(struct target *target, + struct lv *lv, uint8_t type) +{ + int err; + + if (lv->first_segment.type != LVM_SEG_TYPE_LINEAR) + return -ENOSYS; + + err = copy_name(target->name, lv->name); + if (err) { + EPRINTF("copy target name 
failed: ''%s''\n", lv->name); + return err; + } + + err = copy_name(target->device, lv->first_segment.device); + if (err) { + EPRINTF("copy target device failed: ''%s''\n", + lv->first_segment.device); + return err; + } + + target->type = type; + target->size = lv->size; + target->start = lv->first_segment.pe_start; + target->end = target->start + lv->first_segment.pe_size; + + return 0; +} + +static int +iterator_init(struct iterator *itr, int cnt, struct target *targets) +{ + memset(itr, 0, sizeof(*itr)); + + itr->targets = malloc(sizeof(struct target) * cnt); + if (!itr->targets) + return -ENOMEM; + + memcpy(itr->targets, targets, sizeof(struct target) * cnt); + + itr->cur = 0; + itr->cur_size = cnt; + itr->max_size = cnt; + + return 0; +} + +static struct target * +iterator_next(struct iterator *itr) +{ + if (itr->cur == itr->cur_size) + return NULL; + + return itr->targets + itr->cur++; +} + +static int +iterator_add_file(struct iterator *itr, + struct target *target, const char *parent, uint8_t type) +{ + int i; + struct target *t; + char *lname, *rname; + + for (i = 0; i < itr->cur_size; i++) { + t = itr->targets + i; + lname = basename((char *)t->name); + rname = basename((char *)parent); + + if (!strcmp(lname, rname)) + return -EEXIST; + } + + return vhd_util_scan_init_file_target(target, parent, type); +} + +static int +iterator_add_volume(struct iterator *itr, + struct target *target, const char *parent, uint8_t type) +{ + int i, err; + struct lv *lv; + + lv = NULL; + err = -ENOENT; + + for (i = 0; i < itr->cur_size; i++) + if (!strcmp(parent, itr->targets[i].name)) + return -EEXIST; + + for (i = 0; i < vg.lv_cnt; i++) { + err = fnmatch(parent, vg.lvs[i].name, FNM_PATHNAME); + if (err != FNM_NOMATCH) { + lv = vg.lvs + i; + break; + } + } + + if (err && err != FNM_PATHNAME) + return err; + + if (!lv) + return -ENOENT; + + return vhd_util_scan_init_volume_target(target, lv, type); +} + +static int +iterator_add(struct iterator *itr, const char *parent, 
uint8_t type) +{ + int err; + struct target *target; + + if (itr->cur_size == itr->max_size) { + struct target *new; + + new = realloc(itr->targets, + sizeof(struct target) * + itr->max_size * 2); + if (!new) + return -ENOMEM; + + itr->max_size *= 2; + itr->targets = new; + } + + target = itr->targets + itr->cur_size; + + if (target_volume(type)) + err = iterator_add_volume(itr, target, parent, type); + else + err = iterator_add_file(itr, target, parent, type); + + if (err) + memset(target, 0, sizeof(*target)); + else + itr->cur_size++; + + return (err == -EEXIST ? 0 : err); +} + +static void +iterator_free(struct iterator *itr) +{ + free(itr->targets); + memset(itr, 0, sizeof(*itr)); +} + +static void +vhd_util_scan_add_parent(struct iterator *itr, + vhd_context_t *vhd, struct vhd_image *image) +{ + int err; + uint8_t type; + + if (vhd_parent_raw(vhd)) + type = target_volume(image->target->type) ? + VHD_TYPE_RAW_VOLUME : VHD_TYPE_RAW_FILE; + else + type = target_volume(image->target->type) ? 
+ VHD_TYPE_VHD_VOLUME : VHD_TYPE_VHD_FILE; + + err = iterator_add(itr, image->parent, type); + if (err) + vhd_util_scan_error(image->parent, err); +} + +static int +vhd_util_scan_targets(int cnt, struct target *targets) +{ + int ret, err; + vhd_context_t vhd; + struct iterator itr; + struct target *target; + struct vhd_image image; + + ret = 0; + err = 0; + + err = iterator_init(&itr, cnt, targets); + if (err) + return err; + + while ((target = iterator_next(&itr))) { + memset(&vhd, 0, sizeof(vhd)); + memset(&image, 0, sizeof(image)); + + image.target = target; + + err = vhd_util_scan_open(&vhd, &image); + if (err) { + ret = -EAGAIN; + goto end; + } + + err = vhd_util_scan_get_size(&vhd, &image); + if (err) { + ret = -EAGAIN; + image.message = "getting physical size"; + image.error = err; + goto end; + } + + err = vhd_util_scan_get_hidden(&vhd, &image); + if (err) { + ret = -EAGAIN; + image.message = "checking ''hidden'' field"; + image.error = err; + goto end; + } + + if (flags & VHD_SCAN_MARKERS) { + err = vhd_util_scan_get_marker(&vhd, &image); + if (err) { + ret = -EAGAIN; + image.message = "checking marker"; + image.error = err; + goto end; + } + } + + if (vhd.footer.type == HD_TYPE_DIFF) { + err = vhd_util_scan_get_parent(&vhd, &image); + if (err) { + ret = -EAGAIN; + image.message = "getting parent"; + image.error = err; + goto end; + } + } + + end: + vhd_util_scan_print_image(&image); + + if (flags & VHD_SCAN_PARENTS && image.parent) + vhd_util_scan_add_parent(&itr, &vhd, &image); + + if (vhd.file) + vhd_close(&vhd); + if (image.name != target->name) + free(image.name); + free(image.parent); + + if (err && !(flags & VHD_SCAN_NOFAIL)) + break; + } + + iterator_free(&itr); + + if (flags & VHD_SCAN_NOFAIL) + return ret; + + return err; +} + +static int +vhd_util_scan_targets_pretty(int cnt, struct target *targets) +{ + int err; + + err = vhd_util_scan_pretty_allocate_list(cnt); + if (err) { + printf("scan failed: no memory\n"); + return -ENOMEM; + } + + err = 
vhd_util_scan_targets(cnt, targets); + + vhd_util_scan_pretty_print_images(); + vhd_util_scan_pretty_free_list(); + + return ((flags & VHD_SCAN_NOFAIL) ? 0 : err); +} + +static int +vhd_util_scan_find_file_targets(int cnt, char **names, + const char *filter, + struct target **_targets, int *_total) +{ + glob_t g; + struct target *targets; + int i, globs, err, total; + + total = cnt; + globs = 0; + *_total = 0; + *_targets = NULL; + + memset(&g, 0, sizeof(g)); + + if (filter) { + int gflags = ((flags & VHD_SCAN_FAST) ? GLOB_NOSORT : 0); + + errno = 0; + err = glob(filter, gflags, vhd_util_scan_error, &g); + + switch (err) { + case GLOB_NOSPACE: + err = -ENOMEM; + break; + case GLOB_ABORTED: + err = -EIO; + break; + case GLOB_NOMATCH: + err = -errno; + break; + } + + if (err) { + vhd_util_scan_error(filter, err); + return err; + } + + globs = g.gl_pathc; + total += globs; + } + + targets = calloc(total, sizeof(struct target)); + if (!targets) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < g.gl_pathc; i++) { + err = vhd_util_scan_init_file_target(targets + i, + g.gl_pathv[i], + VHD_TYPE_VHD_FILE); + if (err) { + vhd_util_scan_error(g.gl_pathv[i], err); + if (!(flags & VHD_SCAN_NOFAIL)) + goto out; + } + } + + for (i = 0; i + globs < total; i++) { + err = vhd_util_scan_init_file_target(targets + i + globs, + names[i], + VHD_TYPE_VHD_FILE); + if (err) { + vhd_util_scan_error(names[i], err); + if (!(flags & VHD_SCAN_NOFAIL)) + goto out; + } + } + + err = 0; + *_total = total; + *_targets = targets; + +out: + if (err) + free(targets); + if (filter) + globfree(&g); + + return err; +} + +static inline void +swap_volume(struct lv *lvs, int dst, int src) +{ + struct lv copy, *ldst, *lsrc; + + if (dst == src) + return; + + lsrc = lvs + src; + ldst = lvs + dst; + + memcpy(&copy, ldst, sizeof(copy)); + memcpy(ldst, lsrc, sizeof(*ldst)); + memcpy(lsrc, &copy, sizeof(copy)); +} + +static int +vhd_util_scan_sort_volumes(struct lv *lvs, int cnt, + const char *filter, int *_matches) 
+{ + struct lv *lv; + int i, err, matches; + + matches = 0; + *_matches = 0; + + if (!filter) + return 0; + + for (i = 0; i < cnt; i++) { + lv = lvs + i; + + err = fnmatch(filter, lv->name, FNM_PATHNAME); + if (err) { + if (err != FNM_NOMATCH) { + EPRINTF("fnmatch failed: ''%s'', ''%s''\n", + filter, lv->name); + vhd_util_scan_error(lv->name, err); + if (!(flags & VHD_SCAN_NOFAIL)) + return err; + } + + continue; + } + + swap_volume(lvs, matches++, i); + } + + *_matches = matches; + return 0; +} + +static int +vhd_util_scan_find_volume_targets(int cnt, char **names, + const char *volume, const char *filter, + struct target **_targets, int *_total) +{ + struct target *targets; + int i, err, total, matches; + + *_total = 0; + *_targets = NULL; + targets = NULL; + + err = lvm_scan_vg(volume, &vg); + if (err) + return err; + + err = vhd_util_scan_sort_volumes(vg.lvs, vg.lv_cnt, + filter, &matches); + if (err) + goto out; + + total = matches; + for (i = 0; i < cnt; i++) { + err = vhd_util_scan_sort_volumes(vg.lvs + total, + vg.lv_cnt - total, + names[i], &matches); + if (err) + goto out; + + total += matches; + } + + targets = calloc(total, sizeof(struct target)); + if (!targets) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < total; i++) { + err = vhd_util_scan_init_volume_target(targets + i, + vg.lvs + i, + VHD_TYPE_VHD_VOLUME); + if (err) { + vhd_util_scan_error(vg.lvs[i].name, err); + if (!(flags & VHD_SCAN_NOFAIL)) + goto out; + } + } + + err = 0; + *_total = total; + *_targets = targets; + +out: + if (err) + free(targets); + return err; +} + +static int +vhd_util_scan_find_targets(int cnt, char **names, + const char *volume, const char *filter, + struct target **targets, int *total) +{ + if (flags & VHD_SCAN_VOLUME) + return vhd_util_scan_find_volume_targets(cnt, names, + volume, filter, + targets, total); + return vhd_util_scan_find_file_targets(cnt, names, + filter, targets, total); +} + +int +vhd_util_scan(int argc, char **argv) +{ + int c, err, cnt; + 
char *filter, *volume; + struct target *targets; + + cnt = 0; + err = 0; + flags = 0; + filter = NULL; + volume = NULL; + targets = NULL; + + optind = 0; + while ((c = getopt(argc, argv, "m:fcl:pavMh")) != -1) { + switch (c) { + case ''m'': + filter = optarg; + break; + case ''f'': + flags |= VHD_SCAN_FAST; + break; + case ''c'': + flags |= VHD_SCAN_NOFAIL; + break; + case ''l'': + volume = optarg; + flags |= VHD_SCAN_VOLUME; + break; + case ''p'': + flags |= VHD_SCAN_PRETTY; + break; + case ''a'': + flags |= VHD_SCAN_PARENTS; + break; + case ''v'': + flags |= VHD_SCAN_VERBOSE; + break; + case ''M'': + flags |= VHD_SCAN_MARKERS; + break; + case ''h'': + goto usage; + default: + err = -EINVAL; + goto usage; + } + } + + if (!filter && argc - optind == 0) { + err = -EINVAL; + goto usage; + } + + if (flags & VHD_SCAN_PRETTY) + flags &= ~VHD_SCAN_FAST; + + err = vhd_util_scan_find_targets(argc - optind, argv + optind, + volume, filter, &targets, &cnt); + if (err) { + printf("scan failed: %d\n", err); + return err; + } + + if (!cnt) + return 0; + + if (flags & VHD_SCAN_PRETTY) + err = vhd_util_scan_targets_pretty(cnt, targets); + else + err = vhd_util_scan_targets(cnt, targets); + + free(targets); + lvm_free_vg(&vg); + + return ((flags & VHD_SCAN_NOFAIL) ? 0 : err); + +usage: + printf("usage: [OPTIONS] FILES\n" + "options: [-m match filter] [-f fast] [-c continue on failure] " + "[-l LVM volume] [-p pretty print] [-a scan parents] " + "[-v verbose] [-h help] [-M show markers]\n"); + return err; +} diff --git a/tools/blktap3/vhd/lib/vhd-util-set-field.c b/tools/blktap3/vhd/lib/vhd-util-set-field.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/vhd/lib/vhd-util-set-field.c @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2007, XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "libvhd.h" + +int +vhd_util_set_field(int argc, char **argv) +{ + long value; + int err, c; + vhd_context_t vhd; + char *name, *field; + + err = -EINVAL; + value = 0; + name = NULL; + field = NULL; + + if (!argc || !argv) + goto usage; + + optind = 0; + while ((c = getopt(argc, argv, "n:f:v:h")) != -1) { + switch (c) { + case ''n'': + name = optarg; + break; + case ''f'': + field = optarg; + break; + case ''v'': + err = 0; + value = strtol(optarg, NULL, 10); + break; + case ''h'': + default: + goto usage; + } + } + + if (!name || !field || optind != argc || err) + goto usage; + + if (strnlen(field, 25) >= 25) { + printf("invalid field\n"); + goto usage; + } + + if (strcmp(field, "hidden") && strcmp(field, "marker")) { + printf("invalid field %s\n", field); + goto usage; + } + + if (value < 0 || value > 255) { + printf("invalid value %ld\n", value); + goto usage; + } + + err = vhd_open(&vhd, name, VHD_OPEN_RDWR); + if (err) { + printf("error opening %s: %d\n", name, err); + return err; + } + + if (!strcmp(field, "hidden")) { + vhd.footer.hidden = (char)value; + err = vhd_write_footer(&vhd, &vhd.footer); + if (err == -ENOSPC && vhd_type_dynamic(&vhd) && value) + /* if no space to write the primary footer, at least write the + * backup footer so that it''s possible to delete the VDI */ + err = vhd_write_footer_at(&vhd, &vhd.footer, 0); + } else { + err = vhd_set_marker(&vhd, (char)value); + } + + vhd_close(&vhd); + return err; + +usage: + printf("options: <-n name> <-f field> <-v value> [-h help]\n"); + return -EINVAL; +} diff --git a/tools/blktap3/vhd/lib/vhd-util-snapshot.c b/tools/blktap3/vhd/lib/vhd-util-snapshot.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/vhd/lib/vhd-util-snapshot.c @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2007, XenSource Inc. 
+ * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <stdio.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> +#include <limits.h> + +#include "libvhd.h" + +static int +vhd_util_find_snapshot_target(const char *name, char **result, int *parent_raw) +{ + int i, err; + char *target; + vhd_context_t vhd; + + *parent_raw = 0; + *result = NULL; + + target = strdup(name); + if (!target) + return -ENOMEM; + + for (;;) { + err = vhd_open(&vhd, target, VHD_OPEN_RDONLY); + if (err) + return err; + + if (vhd.footer.type != HD_TYPE_DIFF) + goto out; + + err = vhd_get_bat(&vhd); + if (err) + goto out; + + for (i = 0; i < vhd.bat.entries; i++) + if (vhd.bat.bat[i] != DD_BLK_UNUSED) + goto out; + + free(target); + err = vhd_parent_locator_get(&vhd, &target); + if (err) + goto out; + + if (vhd_parent_raw(&vhd)) { + *parent_raw = 1; + goto out; + } + + vhd_close(&vhd); + } + +out: + vhd_close(&vhd); + if (err) + free(target); + else + *result = target; + + return err; +} + +static int +vhd_util_check_depth(const char *name, int *depth) +{ + int err; + vhd_context_t vhd; + + err = vhd_open(&vhd, name, VHD_OPEN_RDONLY); + if (err) + return err; + + err = vhd_chain_depth(&vhd, depth); + vhd_close(&vhd); + + return err; +} + +int +vhd_util_snapshot(int argc, char **argv) +{ + vhd_flag_creat_t flags; + int c, err, prt_raw, limit, empty_check; + char *name, *pname, *backing; + char *ppath, __ppath[PATH_MAX]; + uint64_t size, msize; + vhd_context_t vhd; + + name = NULL; + pname = NULL; + ppath = NULL; + backing = NULL; + size = 0; + msize = 0; + flags = 0; + limit = 0; + empty_check = 1; + + if (!argc || !argv) { + err = -EINVAL; + goto usage; + } + + optind = 0; + while ((c = getopt(argc, argv, "n:p:S:l:meh")) != -1) { + + switch (c) { + case ''n'': + name = optarg; + break; + case ''p'': + pname = optarg; + break; + case ''S'': + msize = strtoull(optarg, NULL, 10); + case ''l'': + limit = strtol(optarg, NULL, 10); + break; + case 
''m'': + vhd_flag_set(flags, VHD_FLAG_CREAT_PARENT_RAW); + break; + case ''e'': + empty_check = 0; + break; + case ''h'': + err = 0; + goto usage; + default: + err = -EINVAL; + goto usage; + } + } + + if (!name || !pname || optind != argc) { + err = -EINVAL; + goto usage; + } + + ppath = realpath(pname, __ppath); + if (!ppath) + return -errno; + + if (vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW) || !empty_check) { + backing = strdup(ppath); + if (!backing) { + err = -ENOMEM; + goto out; + } + } else { + err = vhd_util_find_snapshot_target(ppath, &backing, &prt_raw); + if (err) { + backing = NULL; + goto out; + } + + /* + * if the sizes of the parent chain are non-uniform, we need to + * pick the right size: that of the supplied parent + */ + if (strcmp(ppath, backing)) { + err = vhd_open(&vhd, ppath, VHD_OPEN_RDONLY); + if (err) + goto out; + size = vhd.footer.curr_size; + vhd_close(&vhd); + } + + if (prt_raw) + vhd_flag_set(flags, VHD_FLAG_CREAT_PARENT_RAW); + } + + if (limit && !vhd_flag_test(flags, VHD_FLAG_CREAT_PARENT_RAW)) { + int depth; + + err = vhd_util_check_depth(backing, &depth); + if (err) + printf("error checking snapshot depth: %d\n", err); + else if (depth + 1 > limit) { + err = -ENOSPC; + printf("snapshot depth exceeded: " + "current depth: %d, limit: %d\n", depth, limit); + } + + if (err) + goto out; + } + + err = vhd_snapshot(name, size, backing, msize << 20, flags); + +out: + free(backing); + + return err; + +usage: + printf("options: <-n name> <-p parent name> [-l snapshot depth limit]" + " [-m parent_is_raw] [-S size (MB) for metadata preallocation " + "(see vhd-util resize)] [-e link to supplied parent name even " + "if it''s empty] [-h help]\n"); + return err; +} diff --git a/tools/blktap3/vhd/vhd-index.c b/tools/blktap3/vhd/vhd-index.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/vhd/vhd-index.c @@ -0,0 +1,1012 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. 
+ * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <limits.h> + +#include "libvhd.h" +#include "libvhd-index.h" + +static void +usage(void) +{ + printf("usage: vhd-index <command>\n" + "commands:\n" + "\t index: <-i index name> <-v vhd file>\n" + "\t summary: <-s index name> [-v vhd file [-b block]]\n"); + exit(-EINVAL); +} + +typedef struct vhdi_name vhdi_name_t; + +struct vhdi_name { + char *vhd; + char *bat; + + char *base; + char *index; + char *files; +}; + +static int +vhd_index_get_name(const char *index, const char *vhd, vhdi_name_t *name) +{ + int err, len; + + memset(name, 0, sizeof(vhdi_name_t)); + + len = strnlen(index, VHD_MAX_NAME_LEN); + if (len + 5 >= VHD_MAX_NAME_LEN - 1) + return -ENAMETOOLONG; + + if (vhd) { + len = strnlen(vhd, VHD_MAX_NAME_LEN); + if (len >= VHD_MAX_NAME_LEN - 1) + return -ENAMETOOLONG; + + err = asprintf(&name->vhd, "%s", vhd); + if (err == -1) { + name->vhd = NULL; + goto fail; + } + + err = asprintf(&name->bat, "%s.bat", vhd); + if (err == -1) { + name->bat = NULL; + goto fail; + } + } + + err = asprintf(&name->base, "%s", index); + if (err == -1) { + name->base = NULL; + goto fail; + } + + err = asprintf(&name->index, "%s.index", index); + if (err == -1) { + name->index = NULL; + goto fail; + } + + err = asprintf(&name->files, "%s.files", index); + if (err == -1) { + name->files = NULL; + goto fail; + } + + return 0; + +fail: + free(name->vhd); + free(name->bat); + free(name->base); + free(name->index); + free(name->files); + + return -ENOMEM; +} + +static inline void +vhd_index_free_name(vhdi_name_t *name) +{ + free(name->vhd); + free(name->bat); + free(name->base); + free(name->index); + free(name->files); +} + +static inline int +vhd_index_add_file_table_entry(vhdi_name_t *name, const char *file, + vhdi_file_table_t *files, vhdi_file_id_t *fid) +{ + int err; + + 
vhdi_file_table_free(files); + + err = vhdi_file_table_add(name->files, file, fid); + if (err) + return err; + + return vhdi_file_table_load(name->files, files); +} + +static inline int +vhd_index_get_file_id(vhdi_name_t *name, const char *file, + vhdi_file_table_t *files, vhdi_file_id_t *fid) +{ + char *path, __path[PATH_MAX]; + int i; + + path = realpath(file, __path); + if (!path) + return -errno; + + for (i = 0; i < files->entries; i++) + if (!strcmp(files->table[i].path, path)) { + *fid = files->table[i].file_id; + return 0; + } + + return vhd_index_add_file_table_entry(name, file, files, fid); +} + +static inline int +vhd_index_get_block(vhdi_context_t *vhdi, vhd_context_t *vhd, + uint32_t block, vhdi_block_t *vhdi_block) +{ + int i; + + if (block) + return vhdi_read_block(vhdi, vhdi_block, block); + + vhdi_block->entries = vhd->spb; + vhdi_block->table = calloc(vhd->spb, sizeof(vhdi_entry_t)); + if (!vhdi_block->table) + return -ENOMEM; + + for (i = 0; i < vhdi_block->entries; i++) + vhdi_block->table[i].offset = DD_BLK_UNUSED; + + return 0; +} + +static int +vhd_index_add_bat_entry(vhdi_name_t *name, vhdi_context_t *vhdi, + vhdi_bat_t *bat, vhdi_file_table_t *files, + vhd_context_t *vhd, uint32_t block, char *finished) +{ + char *map; + vhdi_file_id_t fid; + uint32_t i, count, off; + vhdi_block_t vhdi_block; + int err, update, append; + + fid = 0; + count = 0; + update = 0; + append = (bat->table[block] == 0); + + if (vhd->bat.bat[block] == DD_BLK_UNUSED) + return 0; + + err = vhd_index_get_block(vhdi, vhd, bat->table[block], &vhdi_block); + if (err) + return err; + + err = vhd_read_bitmap(vhd, block, &map); + if (err) + goto out; + + err = vhd_index_get_file_id(name, vhd->file, files, &fid); + if (err) + goto out; + + for (i = 0; i < vhd->spb; i++) { + if (vhdi_block.table[i].file_id) { + count++; + continue; + } + + if (!vhd_bitmap_test(vhd, map, i)) + continue; + + err = vhd_offset(vhd, (uint64_t)block * vhd->spb + i, &off); + if (err) + goto out; + + 
vhdi_block.table[i].file_id = fid; + vhdi_block.table[i].offset = off; + count++; + update++; + } + + if (update) { + if (append) { + uint32_t location; + + err = vhdi_append_block(vhdi, &vhdi_block, &location); + if (err) + goto out; + + bat->table[block] = location; + } else { + err = vhdi_write_block(vhdi, &vhdi_block, + bat->table[block]); + if (err) + goto out; + } + } + + if (count == vhd->spb) + *finished = 1; + + err = 0; + +out: + free(vhdi_block.table); + free(map); + + return err; +} + +static int +vhd_index_clone_bat_entry(vhdi_name_t *name, vhdi_context_t *vhdi, + vhdi_bat_t *bat, vhdi_file_table_t *files, + vhd_context_t *vhd, uint32_t block) +{ + char *map; + int err, update; + uint32_t i, off; + vhdi_file_id_t fid; + vhdi_block_t vhdi_block; + + fid = 0; + update = 0; + + if (vhd->bat.bat[block] == DD_BLK_UNUSED) + return 0; + + err = vhd_index_get_block(vhdi, vhd, bat->table[block], &vhdi_block); + if (err) + return err; + + err = vhd_read_bitmap(vhd, block, &map); + if (err) + goto out; + + err = vhd_index_get_file_id(name, vhd->file, files, &fid); + if (err) + goto out; + + for (i = 0; i < vhd->spb; i++) { + if (!vhd_bitmap_test(vhd, map, i)) + continue; + + err = vhd_offset(vhd, (uint64_t)block * vhd->spb + i, &off); + if (err) + goto out; + + vhdi_block.table[i].file_id = fid; + vhdi_block.table[i].offset = off; + update++; + } + + if (update) { + uint32_t location; + + err = vhdi_append_block(vhdi, &vhdi_block, &location); + if (err) + goto out; + + bat->table[block] = location; + } + + err = 0; + +out: + free(vhdi_block.table); + free(map); + + return err; +} + +static int +vhd_index_update_bat_entry(vhdi_name_t *name, vhdi_context_t *vhdi, + vhdi_bat_t *bat, vhdi_file_table_t *files, + vhd_context_t *vhd, uint32_t block) +{ + char *map; + int err, update; + uint32_t i, off; + vhdi_file_id_t fid; + vhdi_block_t vhdi_block; + + fid = 0; + update = 0; + + if (vhd->bat.bat[block] == DD_BLK_UNUSED) + return 0; + + err = vhd_index_get_block(vhdi, 
vhd, bat->table[block], &vhdi_block); + if (err) + return err; + + err = vhd_read_bitmap(vhd, block, &map); + if (err) + goto out; + + err = vhd_index_get_file_id(name, vhd->file, files, &fid); + if (err) + goto out; + + for (i = 0; i < vhd->spb; i++) { + if (!vhd_bitmap_test(vhd, map, i)) + continue; + + err = vhd_offset(vhd, (uint64_t)block * vhd->spb + i, &off); + if (err) + goto out; + + if (vhdi_block.table[i].file_id == fid && + vhdi_block.table[i].offset == off) + continue; + + vhdi_block.table[i].file_id = fid; + vhdi_block.table[i].offset = off; + update++; + } + + if (update) { + uint32_t location; + + err = vhdi_append_block(vhdi, &vhdi_block, &location); + if (err) + goto out; + + bat->table[block] = location; + } + + err = 0; + +out: + free(vhdi_block.table); + free(map); + + return err; +} + +static int +vhd_index_add_bat(vhdi_name_t *name, + uint64_t vhd_blocks, uint32_t vhd_block_size) +{ + int err; + vhdi_bat_t bat; + vhd_context_t vhd; + vhdi_context_t vhdi; + vhdi_file_table_t files; + char *vhd_file, *finished; + uint32_t block, remaining; + + memset(&bat, 0, sizeof(vhdi_bat_t)); + memset(&files, 0, sizeof(vhdi_file_table_t)); + + vhd_file = NULL; + finished = NULL; + bat.vhd_blocks = vhd_blocks; + bat.vhd_block_size = vhd_block_size; + + strcpy(bat.vhd_path, name->vhd); + strcpy(bat.index_path, name->index); + strcpy(bat.file_table_path, name->files); + + err = vhdi_open(&vhdi, name->index, O_RDWR); + if (err) + return err; + + err = vhdi_file_table_load(name->files, &files); + if (err) { + vhdi_close(&vhdi); + return err; + } + + err = vhdi_bat_create(name->bat, name->vhd, name->index, name->files); + if (err) + goto out; + + bat.table = calloc(vhd_blocks, sizeof(uint32_t)); + if (!bat.table) { + err = -ENOMEM; + goto out; + } + + vhd_file = strdup(name->vhd); + if (!vhd_file) + goto out; + + remaining = vhd_blocks; + finished = calloc(remaining, sizeof(char)); + if (!finished) { + err = -ENOMEM; + goto out; + } + + for (;;) { + err = 
vhd_open(&vhd, vhd_file, VHD_OPEN_RDONLY); + if (err) + goto out; + + err = vhd_get_bat(&vhd); + if (err) + goto out_vhd; + + for (block = 0; block < vhd.bat.entries; block++) { + if (finished[block]) + continue; + + err = vhd_index_add_bat_entry(name, &vhdi, &bat, + &files, &vhd, block, + &finished[block]); + if (err) + goto out_bat; + + if (finished[block]) + remaining--; + } + + free(vhd_file); + vhd_file = NULL; + + if (!remaining || vhd.footer.type != HD_TYPE_DIFF) { + vhd_put_bat(&vhd); + vhd_close(&vhd); + break; + } + + err = vhd_parent_locator_get(&vhd, &vhd_file); + if (err) + goto out_bat; + + out_bat: + vhd_put_bat(&vhd); + out_vhd: + vhd_close(&vhd); + if (err) + goto out; + } + + err = vhdi_bat_write(name->bat, &bat); + if (err) + goto out; + + err = 0; + +out: + if (err) + unlink(name->bat); + + vhdi_file_table_free(&files); + vhdi_close(&vhdi); + free(bat.table); + free(finished); + free(vhd_file); + + return err; +} + +static int +vhd_index_clone_bat(vhdi_name_t *name, const char *parent) +{ + int err; + char *pbat; + uint32_t block; + vhdi_bat_t bat; + vhd_context_t vhd; + vhdi_context_t vhdi; + vhdi_file_table_t files; + + memset(&bat, 0, sizeof(vhdi_bat_t)); + memset(&files, 0, sizeof(vhdi_file_table_t)); + + err = asprintf(&pbat, "%s.bat", parent); + if (err == -1) + return -ENOMEM; + + err = access(pbat, R_OK); + if (err == -1) { + free(pbat); + return -errno; + } + + err = vhdi_open(&vhdi, name->index, O_RDWR); + if (err) + goto out; + + err = vhdi_bat_load(pbat, &bat); + if (err) + goto out_vhdi; + + err = vhdi_file_table_load(name->files, &files); + if (err) + goto out_vhdi; + + err = vhdi_bat_create(name->bat, name->vhd, name->index, name->files); + if (err) + goto out_ft; + + err = vhdi_bat_write(name->bat, &bat); + if (err) + goto out_ft; + + err = vhd_open(&vhd, name->vhd, VHD_OPEN_RDONLY); + if (err) + goto out_ft; + + err = vhd_get_bat(&vhd); + if (err) + goto out_vhd; + + for (block = 0; block < vhd.bat.entries; block++) { + err = 
vhd_index_clone_bat_entry(name, &vhdi, &bat, + &files, &vhd, block); + if (err) + goto out_bat; + } + + err = vhdi_bat_write(name->bat, &bat); + if (err) + goto out_bat; + + err = 0; + +out_bat: + vhd_put_bat(&vhd); +out_vhd: + vhd_close(&vhd); +out_ft: + vhdi_file_table_free(&files); +out_vhdi: + vhdi_close(&vhdi); +out: + if (err) + unlink(name->bat); + free(bat.table); + free(pbat); + return err; +} + +static int +vhd_index_update_bat(vhdi_name_t *name) +{ + int err; + uint32_t block; + vhdi_bat_t bat; + vhd_context_t vhd; + vhdi_context_t vhdi; + vhdi_file_table_t files; + + memset(&bat, 0, sizeof(vhdi_bat_t)); + memset(&files, 0, sizeof(vhdi_file_table_t)); + + err = access(name->bat, R_OK); + if (err == -1) + return -errno; + + err = vhdi_open(&vhdi, name->index, O_RDWR); + if (err) + goto out; + + err = vhdi_bat_load(name->bat, &bat); + if (err) + goto out_vhdi; + + err = vhdi_file_table_load(name->files, &files); + if (err) + goto out_vhdi; + + err = vhd_open(&vhd, name->vhd, VHD_OPEN_RDONLY); + if (err) + goto out_ft; + + err = vhd_get_bat(&vhd); + if (err) + goto out_vhd; + + for (block = 0; block < vhd.bat.entries; block++) { + err = vhd_index_update_bat_entry(name, &vhdi, &bat, + &files, &vhd, block); + if (err) + goto out_bat; + } + + err = vhdi_bat_write(name->bat, &bat); + if (err) + goto out_bat; + + err = 0; + +out_bat: + vhd_put_bat(&vhd); +out_vhd: + vhd_close(&vhd); +out_ft: + vhdi_file_table_free(&files); +out_vhdi: + vhdi_close(&vhdi); +out: + free(bat.table); + return err; +} + +static int +vhd_index_create(vhdi_name_t *name) +{ + int err; + vhd_context_t ctx; + uint32_t block_size; + + if (!access(name->index, F_OK) || !access(name->files, F_OK)) + return -EEXIST; + + err = vhd_open(&ctx, name->vhd, VHD_OPEN_RDONLY); + if (err) + return err; + + err = vhd_get_header(&ctx); + if (err) { + vhd_close(&ctx); + return err; + } + + block_size = ctx.header.block_size; + vhd_close(&ctx); + + err = vhdi_create(name->index, block_size); + if (err) + 
goto out; + + err = vhdi_file_table_create(name->files); + if (err) + goto out; + + err = 0; + +out: + if (err) { + unlink(name->index); + unlink(name->files); + } + + return err; +} + +static int +vhd_index(vhdi_name_t *name) +{ + char *parent; + vhd_context_t ctx; + uint64_t vhd_blocks; + uint32_t vhd_block_size; + int err, new_index, new_bat; + + parent = NULL; + new_bat = 0; + new_index = 0; + + /* find vhd''s parent -- we only index read-only vhds */ + err = vhd_open(&ctx, name->vhd, VHD_OPEN_RDONLY); + if (err) + return err; + + err = vhd_parent_locator_get(&ctx, &parent); + vhd_close(&ctx); + + if (err) + return err; + + /* update name to point to parent */ + free(name->vhd); + name->vhd = parent; + parent = NULL; + + free(name->bat); + err = asprintf(&name->bat, "%s.bat", name->vhd); + if (err == -1) { + name->bat = NULL; + return -ENOMEM; + } + + /* create index if it doesn''t already exist */ + err = access(name->index, R_OK | W_OK); + if (err == -1 && errno == ENOENT) { + new_index = 1; + err = vhd_index_create(name); + } + + if (err) + return err; + + /* get basic vhd info */ + err = vhd_open(&ctx, name->vhd, VHD_OPEN_RDONLY); + if (err) + goto out; + + err = vhd_get_header(&ctx); + if (err) { + vhd_close(&ctx); + goto out; + } + + vhd_blocks = ctx.header.max_bat_size; + vhd_block_size = ctx.header.block_size; + + if (vhd_parent_locator_get(&ctx, &parent)) + parent = NULL; + + vhd_close(&ctx); + + /* update existing bat if it exists */ + err = vhd_index_update_bat(name); + if (err != -ENOENT) + goto out; + + new_bat = 1; + + if (parent) { + /* clone parent bat if it exists */ + err = vhd_index_clone_bat(name, parent); + if (err != -ENOENT) + goto out; + } + + /* create new bat from scratch */ + err = vhd_index_add_bat(name, vhd_blocks, vhd_block_size); + if (err) + goto out; + + err = 0; + +out: + if (err) { + if (new_bat) + unlink(name->bat); + if (new_index) { + unlink(name->index); + unlink(name->files); + } + } + free(parent); + return err; +} + 
+static void +vhd_index_print_summary(vhdi_name_t *name, + uint32_t block_size, vhdi_file_table_t *files) +{ + int i; + char time[26], uuid[37]; + + printf("VHD INDEX : %s\n", name->index); + printf("--------------------\n"); + printf("block size : %u\n", block_size); + printf("files : %d\n", files->entries); + + printf("\n"); + for (i = 0; i < files->entries; i++) { + uuid_unparse(files->table[i].vhd_uuid, uuid); + vhd_time_to_string(files->table[i].vhd_timestamp, time); + + printf(" fid 0x%04x : %s, %s, %s\n", + files->table[i].file_id, files->table[i].path, uuid, time); + } + + printf("\n"); +} + +static inline void +vhd_index_print_bat_header(const char *name, vhdi_bat_t *bat) +{ + printf("VHD INDEX BAT : %s\n", name); + printf("--------------------\n"); + printf("blocks : %"PRIu64"\n", bat->vhd_blocks); + printf("block size : %u\n", bat->vhd_block_size); + printf("vhd path : %s\n", bat->vhd_path); + printf("index path : %s\n", bat->index_path); + printf("file table path : %s\n", bat->file_table_path); +} + +static int +vhd_index_print_vhd_summary(vhdi_name_t *name) +{ + int err; + uint32_t i; + vhdi_bat_t bat; + + err = vhdi_bat_load(name->bat, &bat); + if (err) + return err; + + vhd_index_print_bat_header(name->bat, &bat); + + printf("\n"); + for (i = 0; i < bat.vhd_blocks; i++) + printf(" block 0x%04x : offset 0x%08x\n", i, bat.table[i]); + + free(bat.table); + return 0; +} + +static int +vhd_index_print_vhd_block_summary(vhdi_name_t *name, uint32_t block) +{ + int err; + int i; + uint32_t off; + vhdi_bat_t bat; + vhdi_context_t vhdi; + vhdi_block_t vhdi_block; + + err = vhdi_bat_load(name->bat, &bat); + if (err) + return err; + + vhd_index_print_bat_header(name->bat, &bat); + + if (block > bat.vhd_blocks) { + printf("block %u past end of bat (%"PRIu64")\n", + block, bat.vhd_blocks); + err = -EINVAL; + goto out; + } + + off = bat.table[block]; + if (off == DD_BLK_UNUSED) { + printf("block %u is unallocated\n", block); + err = 0; + goto out; + } + + err = 
vhdi_open(&vhdi, name->index, O_RDWR); + if (err) + goto out; + + err = vhdi_read_block(&vhdi, &vhdi_block, off); + vhdi_close(&vhdi); + if (err) + goto out; + + printf("\nBLOCK 0x%08x\n", block); + for (i = 0; i < vhdi_block.entries; i++) + printf(" sec 0x%04x : fid 0x%04x, offset 0x%08x\n", i, + vhdi_block.table[i].file_id, + vhdi_block.table[i].offset); + + free(vhdi_block.table); + err = 0; + +out: + free(bat.table); + return err; +} + +static int +vhd_index_summary(vhdi_name_t *name, uint32_t block) +{ + int err; + uint32_t block_size; + vhdi_context_t vhdi; + vhdi_file_table_t files; + + err = vhdi_open(&vhdi, name->index, O_RDWR); + if (err) + return err; + + block_size = vhdi.vhd_block_size; + vhdi_close(&vhdi); + + err = vhdi_file_table_load(name->files, &files); + if (err) + return err; + + vhd_index_print_summary(name, block_size, &files); + + if (name->vhd) { + if (block == (uint32_t)-1) + err = vhd_index_print_vhd_summary(name); + else + err = vhd_index_print_vhd_block_summary(name, block); + + if (err) + goto out; + } + + err = 0; + +out: + vhdi_file_table_free(&files); + return err; +} + +int +main(int argc, char *argv[]) +{ + int err; + uint32_t block; + vhdi_name_t name; + char *vhd, *index; + int c, update, summary; + + vhd = NULL; + index = NULL; + block = (uint32_t)-1; + update = 0; + summary = 0; + + while ((c = getopt(argc, argv, "i:v:s:b:h")) != -1) { + switch (c) { + case ''i'': + index = optarg; + update = 1; + break; + + case ''v'': + vhd = optarg; + break; + + case ''s'': + index = optarg; + summary = 1; + break; + + case ''b'': + block = strtoul(optarg, NULL, 10); + break; + + default: + usage(); + } + } + + if (optind != argc) + usage(); + + if (!(update ^ summary)) + usage(); + + if (block != (uint32_t)-1 && (!summary || !vhd)) + usage(); + + err = vhd_index_get_name(index, vhd, &name); + if (err) + goto out; + + if (summary) + err = vhd_index_summary(&name, block); + else if (update) { + if (!vhd) + usage(); + + err = 
vhd_index(&name); + } + +out: + vhd_index_free_name(&name); + return -err; +} diff --git a/tools/blktap3/vhd/vhd-update.c b/tools/blktap3/vhd/vhd-update.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/vhd/vhd-update.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2007, XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* +* Before updating a VHD file, we create a journal consisting of: + * - all data at the beginning of the file, up to and including the BAT + * - each allocated bitmap (existing at the same offset in the journal as + * its corresponding bitmap in the original file) + * Updates are performed in place by writing appropriately + * transformed versions of journaled bitmaps to the original file. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <stdio.h> +#include <errno.h> +#include <fcntl.h> +#include <stdlib.h> +#include <unistd.h> +#include <endian.h> +#include <byteswap.h> + +#include "libvhd.h" +#include "libvhd-journal.h" + +static void +usage(void) +{ + printf("usage: vhd-update <-n name> [-j existing journal] [-h]\n"); + exit(EINVAL); +} + +/* + * update vhd creator version to reflect its new bitmap ordering + */ +static inline int +update_creator_version(vhd_journal_t *journal) +{ + journal->vhd.footer.crtr_ver = VHD_VERSION(1, 1); + return vhd_write_footer(&journal->vhd, &journal->vhd.footer); +} + +static int +journal_bitmaps(vhd_journal_t *journal) +{ + unsigned int i; + int err; + + for (i = 0; i < journal->vhd.bat.entries; i++) { + err = vhd_journal_add_block(journal, i, VHD_JOURNAL_METADATA); + if (err) + return err; + } + + return 0; +} + +/* + * older VHD bitmaps were little endian + * and bits within a word were set from right to left + */ +static inline int +old_test_bit(int nr, volatile void * addr) +{ + return (((unsigned long*)addr)[nr/(sizeof(unsigned long)*8)] >> + (nr % (sizeof(unsigned long)*8))) & 1; +} + +/* + * new VHD bitmaps are big endian + * and bits within a word are set from left to right + */ +#define BIT_MASK 0x80 +static inline void +new_set_bit (int nr, volatile char *addr) +{ + addr[nr >> 3] |= (BIT_MASK >> (nr & 7)); +} + +static void +convert_bitmap(char *in, char *out, int bytes) +{ + int i; + + memset(out, 0, bytes); + + for (i = 0; i < bytes << 3; i++) + if (old_test_bit(i, (void *)in)) + 
new_set_bit(i, out); +} + +static int +update_vhd(vhd_journal_t *journal, int rollback) +{ + unsigned int i; + int err; + size_t size; + char *buf; + void *converted; + + buf = NULL; + converted = NULL; + + size = vhd_bytes_padded(journal->vhd.spb / 8); + err = posix_memalign(&converted, 512, size); + if (err) { + converted = NULL; + goto out; + } + + for (i = 0; i < journal->vhd.bat.entries; i++) { + if (journal->vhd.bat.bat[i] == DD_BLK_UNUSED) + continue; + + err = vhd_read_bitmap(&journal->vhd, i, &buf); + if (err) + goto out; + + if (rollback) + memcpy(converted, buf, size); + else + convert_bitmap(buf, converted, size); + + free(buf); + + err = vhd_write_bitmap(&journal->vhd, i, converted); + if (err) + goto out; + } + + err = 0; + out: + free(converted); + return err; +} + +static int +open_journal(vhd_journal_t *journal, const char *file, const char *jfile) +{ + int err; + + err = vhd_journal_create(journal, file, jfile); + if (err) { + printf("error creating journal for %s: %d\n", file, err); + return err; + } + + return 0; +} + +static int +close_journal(vhd_journal_t *journal, int err) +{ + if (err) + err = vhd_journal_revert(journal); + else + err = vhd_journal_commit(journal); + + if (err) + return vhd_journal_close(journal); + else + return vhd_journal_remove(journal); +} + +int +main(int argc, char **argv) +{ + char *file, *jfile; + int c, err, rollback; + vhd_journal_t journal; + + file = NULL; + jfile = NULL; + rollback = 0; + + while ((c = getopt(argc, argv, "n:j:rh")) != -1) { + switch(c) { + case ''n'': + file = optarg; + break; + case ''j'': + jfile = optarg; + err = access(jfile, R_OK); + if (err == -1) { + printf("invalid journal arg %s\n", jfile); + return -errno; + } + break; + case ''r'': + /* add a rollback option for debugging which + * pushes journalled bitmaps to original file + * without transforming them */ + rollback = 1; + break; + default: + usage(); + } + } + + if (!file) + usage(); + + if (rollback && !jfile) { + 
printf("rollback requires a journal argument\n"); + usage(); + } + + err = open_journal(&journal, file, jfile); + if (err) + return err; + + if (!vhd_creator_tapdisk(&journal.vhd) || + journal.vhd.footer.crtr_ver != VHD_VERSION(0, 1) || + journal.vhd.footer.type == HD_TYPE_FIXED) { + err = 0; + goto out; + } + + err = journal_bitmaps(&journal); + if (err) { + /* no changes to vhd file yet, + * so close the journal and bail */ + vhd_journal_close(&journal); + return err; + } + + err = update_vhd(&journal, rollback); + if (err) { + printf("update failed: %d; saving journal\n", err); + goto out; + } + + err = update_creator_version(&journal); + if (err) { + printf("failed to udpate creator version: %d\n", err); + goto out; + } + + err = 0; + +out: + err = close_journal(&journal, err); + return err; +} diff --git a/tools/blktap3/vhd/vhd-util.c b/tools/blktap3/vhd/vhd-util.c new file mode 100644 --- /dev/null +++ b/tools/blktap3/vhd/vhd-util.c @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2008, XenSource Inc. + * Copyright (c) 2010, Citrix Systems, Inc. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of XenSource Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "libvhd.h" +#include "vhd-util.h" + +#if 1 +#define DFPRINTF(_f, _a...) fprintf(stdout, _f , ##_a) +#else +#define DFPRINTF(_f, _a...) 
((void)0) +#endif + +typedef int (*vhd_util_func_t) (int, char **); + +struct command { + char *name; + vhd_util_func_t func; +}; + +struct command commands[] = { + { .name = "create", .func = vhd_util_create }, + { .name = "snapshot", .func = vhd_util_snapshot }, + { .name = "query", .func = vhd_util_query }, + { .name = "read", .func = vhd_util_read }, + { .name = "set", .func = vhd_util_set_field }, + { .name = "repair", .func = vhd_util_repair }, + { .name = "resize", .func = vhd_util_resize }, + { .name = "fill", .func = vhd_util_fill }, + { .name = "coalesce", .func = vhd_util_coalesce }, + { .name = "modify", .func = vhd_util_modify }, + { .name = "scan", .func = vhd_util_scan }, + { .name = "check", .func = vhd_util_check }, + { .name = "revert", .func = vhd_util_revert }, +}; + +#define print_commands() \ + do { \ + int i, n; \ + n = sizeof(commands) / sizeof(struct command); \ + printf("COMMAND := { "); \ + printf("%s", commands[0].name); \ + for (i = 1; i < n; i++) \ + printf(" | %s", commands[i].name); \ + printf(" }\n"); \ + } while (0) + +TEST_FAIL_EXTERN_VARS; + +void +help(void) +{ + printf("usage: vhd-util COMMAND [OPTIONS]\n"); + print_commands(); + exit(0); +} + +struct command * +get_command(char *command) +{ + int i, n; + + if (strnlen(command, 25) >= 25) + return NULL; + + n = sizeof(commands) / sizeof (struct command); + + for (i = 0; i < n; i++) + if (!strcmp(command, commands[i].name)) + return &commands[i]; + + return NULL; +} + +int +main(int argc, char *argv[]) +{ + char **cargv; + struct command *cmd; + int cargc, i, cnt, ret; + +#ifdef CORE_DUMP + #include <sys/resource.h> + struct rlimit rlim; + rlim.rlim_cur = RLIM_INFINITY; + rlim.rlim_max = RLIM_INFINITY; + if (setrlimit(RLIMIT_CORE, &rlim) < 0) + fprintf(stderr, "setrlimit failed: %d\n", errno); +#endif + + ret = 0; + + if (argc < 2) + help(); + + cargc = argc - 1; + cmd = get_command(argv[1]); + if (!cmd) { + fprintf(stderr, "invalid COMMAND %s\n", argv[1]); + help(); + } + + 
cargv = malloc(sizeof(char *) * cargc); + if (!cargv) + exit(ENOMEM); + + cnt = 1; + cargv[0] = cmd->name; + for (i = 1; i < cargc; i++) { + char *arg = argv[i + (argc - cargc)]; + + if (!strcmp(arg, "--debug")) { + libvhd_set_log_level(1); + continue; + } + + cargv[cnt++] = arg; + } + +#ifdef ENABLE_FAILURE_TESTING + for (i = 0; i < NUM_FAIL_TESTS; i++) { + TEST_FAIL[i] = 0; + if (getenv(ENV_VAR_FAIL[i])) + TEST_FAIL[i] = 1; + } +#endif // ENABLE_FAILURE_TESTING + + ret = cmd->func(cnt, cargv); + + free(cargv); + + return (ret >= 0 ? ret : -ret); +}
This patch adds the rest of the blktap3 binaries to the Mercurial ignore list. Signed-off-by: Thanos Makatos <thanos.makatos@citrix.com> diff --git a/.hgignore b/.hgignore --- a/.hgignore +++ b/.hgignore @@ -379,3 +379,7 @@ # blktap3 ^tools/blktap3/tapback/tapback$ ^tools/blktap3/drivers/tapdisk3$ +^tools/blktap3/control/tap3-ctl$ +^tools/blktap3/part/part-util$ +^tools/blktap3/vhd/(vhd-index3|vhd-update3|vhd-util3)$ +
This patch hooks blktap3 into the build system. Signed-off-by: Thanos Makatos <thanos.makatos@citrix.com> diff --git a/tools/Makefile b/tools/Makefile --- a/tools/Makefile +++ b/tools/Makefile @@ -25,6 +25,7 @@ ifeq ($(CONFIG_X86),y) SUBDIRS-$(CONFIG_Linux) += blktap endif SUBDIRS-$(CONFIG_Linux) += blktap2 +SUBDIRS-$(CONFIG_Linux) += blktap3 SUBDIRS-$(CONFIG_NetBSD) += $(SUBDIRS-libaio) SUBDIRS-$(CONFIG_NetBSD) += blktap2 SUBDIRS-$(CONFIG_NetBSD) += xenbackendd