Daniel De Graaf
2011-Aug-19 14:38 UTC
[Xen-devel] [PATCH] libvchan: interdomain communications library
Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> --- This version includes a local copy of gntalloc.h and gntdev.h to allow it to compile when the installed kernel headers do not include gntalloc and to support GNTDEV_SET_UNMAP_NOTIFY when the running kernel does not match the headers. --- tools/Makefile | 1 + tools/libvchan/Makefile | 57 ++++++ tools/libvchan/gntalloc.h | 82 ++++++++ tools/libvchan/gntdev.h | 150 ++++++++++++++ tools/libvchan/init.c | 456 ++++++++++++++++++++++++++++++++++++++++++ tools/libvchan/io.c | 270 +++++++++++++++++++++++++ tools/libvchan/libvchan.h | 141 +++++++++++++ tools/libvchan/node-select.c | 161 +++++++++++++++ tools/libvchan/node.c | 169 ++++++++++++++++ 9 files changed, 1487 insertions(+), 0 deletions(-) create mode 100644 tools/libvchan/Makefile create mode 100644 tools/libvchan/gntalloc.h create mode 100644 tools/libvchan/gntdev.h create mode 100644 tools/libvchan/init.c create mode 100644 tools/libvchan/io.c create mode 100644 tools/libvchan/libvchan.h create mode 100644 tools/libvchan/node-select.c create mode 100644 tools/libvchan/node.c diff --git a/tools/Makefile b/tools/Makefile index df6270c..9389e1f 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -27,6 +27,7 @@ SUBDIRS-$(CONFIG_NetBSD) += blktap2 SUBDIRS-$(CONFIG_NetBSD) += xenbackendd SUBDIRS-y += libfsimage SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen +SUBDIRS-y += libvchan # do not recurse in to a dir we are about to delete ifneq "$(MAKECMDGOALS)" "distclean" diff --git a/tools/libvchan/Makefile b/tools/libvchan/Makefile new file mode 100644 index 0000000..9195f6e --- /dev/null +++ b/tools/libvchan/Makefile @@ -0,0 +1,57 @@ +# +# tools/libvchan/Makefile +# + +XEN_ROOT = $(CURDIR)/../.. +include $(XEN_ROOT)/tools/Rules.mk + +LIBVCHAN_OBJS = init.o io.o +NODE_OBJS = node.o +NODE2_OBJS = node-select.o + +LIBVCHAN_LIBS = $(LDLIBS_libxenstore) +LIBVCHAN_OBJS: CFLAGS += $(CFLAGS_libxenstore) + +MAJOR = 1.0 +MINOR = 0 + +CFLAGS += -I. 
-fPIC + +.PHONY: all +all: libvchan.so vchan-node1 vchan-node2 libvchan.a + +libvchan.so: libvchan.so.$(MAJOR) + ln -sf $< $@ + +libvchan.so.$(MAJOR): libvchan.so.$(MAJOR).$(MINOR) + ln -sf $< $@ + +libvchan.so.$(MAJOR).$(MINOR): $(LIBVCHAN_OBJS) + $(CC) $(LDFLAGS) -Wl,$(SONAME_LDFLAG) -Wl,libvchan.so.$(MAJOR) $(SHLIB_LDFLAGS) -o $@ $^ $(LIBVCHAN_LIBS) + +libvchan.a: $(LIBVCHAN_OBJS) + $(AR) rcs libvchan.a $^ + +vchan-node1: $(NODE_OBJS) libvchan.so + $(CC) $(LDFLAGS) -o $@ $(NODE_OBJS) libvchan.so $(LDLIBS_libvchan) + +vchan-node2: $(NODE2_OBJS) libvchan.so + $(CC) $(LDFLAGS) -o $@ $(NODE2_OBJS) libvchan.so $(LDLIBS_libvchan) + +.PHONY: install +install: all + $(INSTALL_DIR) $(DESTDIR)$(LIBDIR) + $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR) + $(INSTALL_PROG) libvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR) + ln -sf libvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libvchan.so.$(MAJOR) + ln -sf libvchan.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libvchan.so + $(INSTALL_DATA) libvchan.a $(DESTDIR)$(LIBDIR) + $(INSTALL_DATA) libvchan.h $(DESTDIR)$(INCLUDEDIR) + +.PHONY: clean +clean: + $(RM) -f *.o *.so* *.a vchan-node1 vchan-node2 $(DEPS) + +distclean: clean + +-include $(DEPS) diff --git a/tools/libvchan/gntalloc.h b/tools/libvchan/gntalloc.h new file mode 100644 index 0000000..76bd580 --- /dev/null +++ b/tools/libvchan/gntalloc.h @@ -0,0 +1,82 @@ +/****************************************************************************** + * gntalloc.h + * + * Interface to /dev/xen/gntalloc. + * + * Author: Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * This file is in the public domain. + */ + +#ifndef __LINUX_PUBLIC_GNTALLOC_H__ +#define __LINUX_PUBLIC_GNTALLOC_H__ + +/* + * Allocates a new page and creates a new grant reference. + */ +#define IOCTL_GNTALLOC_ALLOC_GREF \ +_IOC(_IOC_NONE, 'G', 5, sizeof(struct ioctl_gntalloc_alloc_gref)) +struct ioctl_gntalloc_alloc_gref { + /* IN parameters */ + /* The ID of the domain to be given access to the grants.
*/ + uint16_t domid; + /* Flags for this mapping */ + uint16_t flags; + /* Number of pages to map */ + uint32_t count; + /* OUT parameters */ + /* The offset to be used on a subsequent call to mmap(). */ + uint64_t index; + /* The grant references of the newly created grant, one per page */ + /* Variable size, depending on count */ + uint32_t gref_ids[1]; +}; + +#define GNTALLOC_FLAG_WRITABLE 1 + +/* + * Deallocates the grant reference, allowing the associated page to be freed if + * no other domains are using it. + */ +#define IOCTL_GNTALLOC_DEALLOC_GREF \ +_IOC(_IOC_NONE, 'G', 6, sizeof(struct ioctl_gntalloc_dealloc_gref)) +struct ioctl_gntalloc_dealloc_gref { + /* IN parameters */ + /* The offset returned in the map operation */ + uint64_t index; + /* Number of references to unmap */ + uint32_t count; +}; + +/* + * Sets up an unmap notification within the page, so that the other side can do + * cleanup if this side crashes. Required to implement cross-domain robust + * mutexes or close notification on communication channels. + * + * Each mapped page only supports one notification; multiple calls referring to + * the same page overwrite the previous notification. You must clear the + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it + * to occur. + */ +#define IOCTL_GNTALLOC_SET_UNMAP_NOTIFY \ +_IOC(_IOC_NONE, 'G', 7, sizeof(struct ioctl_gntalloc_unmap_notify)) +struct ioctl_gntalloc_unmap_notify { + /* IN parameters */ + /* Offset in the file descriptor for a byte within the page (same as + * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to + * be cleared. Otherwise, it can be any byte in the page whose + * notification we are adjusting.
+ */ + uint64_t index; + /* Action(s) to take on unmap */ + uint32_t action; + /* Event channel to notify */ + uint32_t event_channel_port; +}; + +/* Clear (set to zero) the byte specified by index */ +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 +/* Send an interrupt on the indicated event channel */ +#define UNMAP_NOTIFY_SEND_EVENT 0x2 + +#endif /* __LINUX_PUBLIC_GNTALLOC_H__ */ diff --git a/tools/libvchan/gntdev.h b/tools/libvchan/gntdev.h new file mode 100644 index 0000000..5304bd3 --- /dev/null +++ b/tools/libvchan/gntdev.h @@ -0,0 +1,150 @@ +/****************************************************************************** + * gntdev.h + * + * Interface to /dev/xen/gntdev. + * + * Copyright (c) 2007, D G Murray + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __LINUX_PUBLIC_GNTDEV_H__ +#define __LINUX_PUBLIC_GNTDEV_H__ + +struct ioctl_gntdev_grant_ref { + /* The domain ID of the grant to be mapped. */ + uint32_t domid; + /* The grant reference of the grant to be mapped. */ + uint32_t ref; +}; + +/* + * Inserts the grant references into the mapping table of an instance + * of gntdev. N.B. This does not perform the mapping, which is deferred + * until mmap() is called with @index as the offset. + */ +#define IOCTL_GNTDEV_MAP_GRANT_REF \ +_IOC(_IOC_NONE, 'G', 0, sizeof(struct ioctl_gntdev_map_grant_ref)) +struct ioctl_gntdev_map_grant_ref { + /* IN parameters */ + /* The number of grants to be mapped. */ + uint32_t count; + uint32_t pad; + /* OUT parameters */ + /* The offset to be used on a subsequent call to mmap(). */ + uint64_t index; + /* Variable IN parameter. */ + /* Array of grant references, of size @count. */ + struct ioctl_gntdev_grant_ref refs[1]; +}; + +/* + * Removes the grant references from the mapping table of an instance of + * of gntdev. N.B. munmap() must be called on the relevant virtual address(es) + * before this ioctl is called, or an error will result. + */ +#define IOCTL_GNTDEV_UNMAP_GRANT_REF \ +_IOC(_IOC_NONE, 'G', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) +struct ioctl_gntdev_unmap_grant_ref { + /* IN parameters */ + /* The offset was returned by the corresponding map operation. */ + uint64_t index; + /* The number of pages to be unmapped. */ + uint32_t count; + uint32_t pad; +}; + +/* + * Returns the offset in the driver's address space that corresponds + * to @vaddr.
This can be used to perform a munmap(), followed by an + * UNMAP_GRANT_REF ioctl, where no state about the offset is retained by + * the caller. The number of pages that were allocated at the same time as + * @vaddr is returned in @count. + * + * N.B. Where more than one page has been mapped into a contiguous range, the + * supplied @vaddr must correspond to the start of the range; otherwise + * an error will result. It is only possible to munmap() the entire + * contiguously-allocated range at once, and not any subrange thereof. + */ +#define IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR \ +_IOC(_IOC_NONE, 'G', 2, sizeof(struct ioctl_gntdev_get_offset_for_vaddr)) +struct ioctl_gntdev_get_offset_for_vaddr { + /* IN parameters */ + /* The virtual address of the first mapped page in a range. */ + uint64_t vaddr; + /* OUT parameters */ + /* The offset that was used in the initial mmap() operation. */ + uint64_t offset; + /* The number of pages mapped in the VM area that begins at @vaddr. */ + uint32_t count; + uint32_t pad; +}; + +/* + * Sets the maximum number of grants that may mapped at once by this gntdev + * instance. + * + * N.B. This must be called before any other ioctl is performed on the device. + */ +#define IOCTL_GNTDEV_SET_MAX_GRANTS \ +_IOC(_IOC_NONE, 'G', 3, sizeof(struct ioctl_gntdev_set_max_grants)) +struct ioctl_gntdev_set_max_grants { + /* IN parameter */ + /* The maximum number of grants that may be mapped at once. */ + uint32_t count; +}; + +/* + * Sets up an unmap notification within the page, so that the other side can do + * cleanup if this side crashes. Required to implement cross-domain robust + * mutexes or close notification on communication channels. + * + * Each mapped page only supports one notification; multiple calls referring to + * the same page overwrite the previous notification. You must clear the + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it + * to occur.
+ */ +#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \ +_IOC(_IOC_NONE, 'G', 7, sizeof(struct ioctl_gntdev_unmap_notify)) +struct ioctl_gntdev_unmap_notify { + /* IN parameters */ + /* Offset in the file descriptor for a byte within the page (same as + * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to + * be cleared. Otherwise, it can be any byte in the page whose + * notification we are adjusting. + */ + uint64_t index; + /* Action(s) to take on unmap */ + uint32_t action; + /* Event channel to notify */ + uint32_t event_channel_port; +}; + +/* Clear (set to zero) the byte specified by index */ +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 +/* Send an interrupt on the indicated event channel */ +#define UNMAP_NOTIFY_SEND_EVENT 0x2 + +#endif /* __LINUX_PUBLIC_GNTDEV_H__ */ diff --git a/tools/libvchan/init.c b/tools/libvchan/init.c new file mode 100644 index 0000000..0cf00e2 --- /dev/null +++ b/tools/libvchan/init.c @@ -0,0 +1,456 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * @section DESCRIPTION + * + * This file contains the setup code used to establish the ring buffer.
+ */ + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/user.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> + +#include <xs.h> +#include <xen/sys/evtchn.h> + +#include "libvchan.h" +#include "gntalloc.h" +#include "gntdev.h" + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT 12 +#endif + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +#define max(a,b) ((a > b) ? a : b) + +static int init_gnt_srv(struct libvchan *ctrl) +{ + int pages_left = ctrl->read.order >= PAGE_SHIFT ? 1 << (ctrl->read.order - PAGE_SHIFT) : 0; + int pages_right = ctrl->write.order >= PAGE_SHIFT ? 1 << (ctrl->write.order - PAGE_SHIFT) : 0; + + int ring_fd = open("/dev/xen/gntalloc", O_RDWR); + int ring_ref = -1; + if (ring_fd < 0) + return -1; + struct ioctl_gntalloc_alloc_gref *gref_info = malloc( + sizeof(struct ioctl_gntalloc_alloc_gref) + max(pages_left, pages_right)*sizeof(uint32_t)); + + gref_info->domid = ctrl->other_domain_id; + gref_info->flags = GNTALLOC_FLAG_WRITABLE; + gref_info->count = 1; + + int err = ioctl(ring_fd, IOCTL_GNTALLOC_ALLOC_GREF, gref_info); + if (err) + goto out; + + void* ring = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, ring_fd, gref_info->index); + + if (ring == MAP_FAILED) + goto out; + + ctrl->ring = ring; + ring_ref = gref_info->gref_ids[0]; + + memset(ring, 0, PAGE_SIZE); + + ctrl->read.shr = &ctrl->ring->left; + ctrl->write.shr = &ctrl->ring->right; + ctrl->ring->left_order = ctrl->read.order; + ctrl->ring->right_order = ctrl->write.order; + ctrl->ring->cli_live = 2; + ctrl->ring->srv_live = 1; + ctrl->ring->debug = 0xabcd; + +#ifdef IOCTL_GNTALLOC_SET_UNMAP_NOTIFY + { + struct ioctl_gntalloc_unmap_notify arg; + arg.index = gref_info->index + offsetof(struct vchan_interface, srv_live); + arg.action = UNMAP_NOTIFY_CLEAR_BYTE | UNMAP_NOTIFY_SEND_EVENT; + 
arg.event_channel_port = ctrl->event_port; + ioctl(ring_fd, IOCTL_GNTALLOC_SET_UNMAP_NOTIFY, &arg); + } +#endif + + if (ctrl->read.order == 10) { + ctrl->read.buffer = ((void*)ctrl->ring) + 1024; + } else if (ctrl->read.order == 11) { + ctrl->read.buffer = ((void*)ctrl->ring) + 2048; + } else { + gref_info->count = pages_left; + err = ioctl(ring_fd, IOCTL_GNTALLOC_ALLOC_GREF, gref_info); + if (err) + goto out_ring; + void* area = mmap(NULL, pages_left * PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_SHARED, ring_fd, gref_info->index); + if (area == MAP_FAILED) + goto out_ring; + ctrl->read.buffer = area; + memcpy(ctrl->ring->grants, gref_info->gref_ids, pages_left * sizeof(uint32_t)); + } + + if (ctrl->write.order == 10) { + ctrl->write.buffer = ((void*)ctrl->ring) + 1024; + } else if (ctrl->write.order == 11) { + ctrl->write.buffer = ((void*)ctrl->ring) + 2048; + } else { + gref_info->count = pages_right; + err = ioctl(ring_fd, IOCTL_GNTALLOC_ALLOC_GREF, gref_info); + if (err) + goto out_unmap_left; + void* area = mmap(NULL, pages_right * PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_SHARED, ring_fd, gref_info->index); + if (area == MAP_FAILED) + goto out_unmap_left; + ctrl->write.buffer = area; + memcpy(ctrl->ring->grants + (pages_left * sizeof(uint32_t)), + gref_info->gref_ids, pages_right * sizeof(uint32_t)); + } + +out: + close(ring_fd); + return ring_ref; +out_unmap_left: + munmap(ctrl->read.buffer, pages_left * PAGE_SIZE); +out_ring: + munmap(ring, PAGE_SIZE); + ring_ref = -1; + ctrl->ring = NULL; + ctrl->write.order = ctrl->read.order = 0; + goto out; +} + +static void* do_gnt_map(int fd, int domid, uint32_t* pages, size_t npages, uint64_t *index) +{ + int i, rv; + void* area = NULL; + struct ioctl_gntdev_map_grant_ref *gref_info; + gref_info = malloc(sizeof(*gref_info) + npages*sizeof(gref_info->refs[0])); + gref_info->count = npages; + for(i=0; i < npages; i++) { + gref_info->refs[i].domid = domid; + gref_info->refs[i].ref = pages[i]; + } + + rv = ioctl(fd, 
IOCTL_GNTDEV_MAP_GRANT_REF, gref_info); + if (rv) + goto out; + if (index) + *index = gref_info->index; + area = mmap(NULL, PAGE_SIZE * npages, PROT_READ | PROT_WRITE, MAP_SHARED, fd, gref_info->index); + if (area == MAP_FAILED) { + struct ioctl_gntdev_unmap_grant_ref undo = { + .index = gref_info->index, + .count = gref_info->count + }; + ioctl(fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &undo); + area = NULL; + } + out: + free(gref_info); + return area; +} + +static int init_gnt_cli(struct libvchan *ctrl, uint32_t ring_ref) +{ + int ring_fd = open("/dev/xen/gntdev", O_RDWR); + int rv = -1; + uint64_t ring_index; + if (ring_fd < 0) + return -1; + + ctrl->ring = do_gnt_map(ring_fd, ctrl->other_domain_id, &ring_ref, 1, &ring_index); + + if (!ctrl->ring) + goto out; + + ctrl->write.order = ctrl->ring->left_order; + ctrl->read.order = ctrl->ring->right_order; + ctrl->write.shr = &ctrl->ring->left; + ctrl->read.shr = &ctrl->ring->right; + if (ctrl->write.order < 10 || ctrl->write.order > 24) + goto out_unmap_ring; + if (ctrl->read.order < 10 || ctrl->read.order > 24) + goto out_unmap_ring; + if (ctrl->read.order == ctrl->write.order && ctrl->read.order < 12) + goto out_unmap_ring; + + uint32_t* grants = ctrl->ring->grants; + + if (ctrl->write.order == 10) { + ctrl->write.buffer = ((void*)ctrl->ring) + 1024; + } else if (ctrl->write.order == 11) { + ctrl->write.buffer = ((void*)ctrl->ring) + 2048; + } else { + int pages_left = 1 << (ctrl->write.order - PAGE_SHIFT); + ctrl->write.buffer = do_gnt_map(ring_fd, ctrl->other_domain_id, grants, pages_left, NULL); + if (!ctrl->write.buffer) + goto out_unmap_ring; + grants += pages_left; + } + + if (ctrl->read.order == 10) { + ctrl->read.buffer = ((void*)ctrl->ring) + 1024; + } else if (ctrl->read.order == 11) { + ctrl->read.buffer = ((void*)ctrl->ring) + 2048; + } else { + int pages_right = 1 << (ctrl->read.order - PAGE_SHIFT); + ctrl->read.buffer = do_gnt_map(ring_fd, ctrl->other_domain_id, grants, pages_right, NULL); + if 
(!ctrl->read.buffer) + goto out_unmap_left; + } + +#ifdef IOCTL_GNTDEV_SET_UNMAP_NOTIFY + { + struct ioctl_gntdev_unmap_notify arg; + arg.index = ring_index + offsetof(struct vchan_interface, cli_live); + arg.action = UNMAP_NOTIFY_CLEAR_BYTE | UNMAP_NOTIFY_SEND_EVENT; + arg.event_channel_port = ctrl->event_port; + ioctl(ring_fd, IOCTL_GNTDEV_SET_UNMAP_NOTIFY, &arg); + } +#endif + + rv = 0; + out: + close(ring_fd); + return rv; + out_unmap_left: + if (ctrl->write.order >= PAGE_SHIFT) + munmap(ctrl->write.buffer, 1 << ctrl->write.order); + out_unmap_ring: + munmap(ctrl->ring, PAGE_SIZE); + ctrl->ring = 0; + ctrl->write.order = ctrl->read.order = 0; + rv = -1; + goto out; +} + +static int init_evt_srv(struct libvchan *ctrl) +{ + struct ioctl_evtchn_bind_unbound_port bind; + ctrl->event_fd = open("/dev/xen/evtchn", O_RDWR); + if (ctrl->event_fd < 0) + return -1; + bind.remote_domain = ctrl->other_domain_id; + ctrl->event_port = ioctl(ctrl->event_fd, IOCTL_EVTCHN_BIND_UNBOUND_PORT, &bind); + if (ctrl->event_port < 0) + return -1; + write(ctrl->event_fd, &ctrl->event_port, sizeof(ctrl->event_port)); + return 0; +} + +static int init_xs_srv(struct libvchan *ctrl, int ring_ref) +{ + int ret = -1; + struct xs_handle *xs; + struct xs_permissions perms[2]; + char buf[64]; + char ref[16]; + char* domid_str = NULL; + xs = xs_domain_open(); + if (!xs) + goto fail; + domid_str = xs_read(xs, 0, "domid", NULL); + if (!domid_str) + goto fail_xs_open; + + // owner domain is us + perms[0].id = atoi(domid_str); + // permissions for domains not listed = none + perms[0].perms = XS_PERM_NONE; + // other domains + perms[1].id = ctrl->other_domain_id; + perms[1].perms = XS_PERM_READ; + + snprintf(ref, sizeof ref, "%d", ring_ref); + snprintf(buf, sizeof buf, "data/vchan/%d/ring-ref", ctrl->device_number); + if (!xs_write(xs, 0, buf, ref, strlen(ref))) + goto fail_xs_open; + if (!xs_set_permissions(xs, 0, buf, perms, 2)) + goto fail_xs_open; + + snprintf(ref, sizeof ref, "%d", 
ctrl->event_port); + snprintf(buf, sizeof buf, "data/vchan/%d/event-channel", ctrl->device_number); + if (!xs_write(xs, 0, buf, ref, strlen(ref))) + goto fail_xs_open; + if (!xs_set_permissions(xs, 0, buf, perms, 2)) + goto fail_xs_open; + + ret = 0; + fail_xs_open: + free(domid_str); + xs_daemon_close(xs); + fail: + return ret; +} + +static int min_order(size_t siz) +{ + int rv = PAGE_SHIFT; + while (siz > (1 << rv)) + rv++; + return rv; +} + +struct libvchan *libvchan_server_init(int domain, int devno, size_t left_min, size_t right_min) +{ + // if you go over this size, you'll have too many grants to fit in the shared page. + size_t MAX_RING_SIZE = 256 * PAGE_SIZE; + if (left_min > MAX_RING_SIZE || right_min > MAX_RING_SIZE) + return 0; + + struct libvchan *ctrl = malloc(sizeof(struct libvchan)); + if (!ctrl) + return 0; + + ctrl->other_domain_id = domain; + ctrl->device_number = devno; + ctrl->ring = NULL; + ctrl->event_fd = -1; + ctrl->is_server = 1; + ctrl->server_persist = 0; + + ctrl->read.order = min_order(left_min); + ctrl->write.order = min_order(right_min); + + // if we can avoid allocating extra pages by using in-page rings, do so +#define MAX_SMALL_RING 1024 +#define MAX_LARGE_RING 2048 + if (left_min <= MAX_SMALL_RING && right_min <= MAX_LARGE_RING) { + ctrl->read.order = 10; + ctrl->write.order = 11; + } else if (left_min <= MAX_LARGE_RING && right_min <= MAX_SMALL_RING) { + ctrl->read.order = 11; + ctrl->write.order = 10; + } else if (left_min <= MAX_LARGE_RING) { + ctrl->read.order = 11; + } else if (right_min <= MAX_LARGE_RING) { + ctrl->write.order = 11; + } + if (init_evt_srv(ctrl)) + goto out; + int ring_ref = init_gnt_srv(ctrl); + if (ring_ref < 0) + goto out; + if (init_xs_srv(ctrl, ring_ref)) + goto out; + return ctrl; +out: + libvchan_close(ctrl); + return 0; +} + +static int init_evt_cli(struct libvchan *ctrl) +{ + struct ioctl_evtchn_bind_interdomain bind; + ctrl->event_fd = open("/dev/xen/evtchn", O_RDWR); + if (ctrl->event_fd < 0) +
return -1; + + bind.remote_domain = ctrl->other_domain_id; + bind.remote_port = ctrl->event_port; + ctrl->event_port = ioctl(ctrl->event_fd, IOCTL_EVTCHN_BIND_INTERDOMAIN, &bind); + if (ctrl->event_port < 0) + return -1; + return 0; +} + + +struct libvchan *libvchan_client_init(int domain, int devno) +{ + struct libvchan *ctrl = malloc(sizeof(struct libvchan)); + struct xs_handle *xs = NULL; + char buf[64]; + char *ref; + int ring_ref; + unsigned int len; + if (!ctrl) + return 0; + ctrl->other_domain_id = domain; + ctrl->device_number = devno; + ctrl->ring = NULL; + ctrl->event_fd = -1; + ctrl->write.order = ctrl->read.order = 0; + ctrl->is_server = 0; + + xs = xs_daemon_open(); + if (!xs) + xs = xs_domain_open(); + if (!xs) + goto fail; + +// find xenstore entry + snprintf(buf, sizeof buf, "/local/domain/%d/data/vchan/%d/ring-ref", + ctrl->other_domain_id, ctrl->device_number); + ref = xs_read(xs, 0, buf, &len); + if (!ref) + goto fail; + ring_ref = atoi(ref); + free(ref); + if (!ring_ref) + goto fail; + snprintf(buf, sizeof buf, "/local/domain/%d/data/vchan/%d/event-channel", + ctrl->other_domain_id, ctrl->device_number); + ref = xs_read(xs, 0, buf, &len); + if (!ref) + goto fail; + ctrl->event_port = atoi(ref); + free(ref); + if (!ctrl->event_port) + goto fail; + +// set up event channel + if (init_evt_cli(ctrl)) + goto fail; + +// set up shared page(s) + if (init_gnt_cli(ctrl, ring_ref)) + goto fail; + + ctrl->ring->cli_live = 1; + ctrl->ring->debug = 0xabce; + + out: + if (xs) + xs_daemon_close(xs); + return ctrl; + fail: + libvchan_close(ctrl); + ctrl = NULL; + goto out; +} diff --git a/tools/libvchan/io.c b/tools/libvchan/io.c new file mode 100644 index 0000000..f95c7b0 --- /dev/null +++ b/tools/libvchan/io.c @@ -0,0 +1,270 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * 
@section LICENSE + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * @section DESCRIPTION + * + * This file contains the communications interface built on the ring buffer. + */ + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/uio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <xenctrl.h> + +#include "libvchan.h" + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT 12 +#endif + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +// allow vchan data to be easily observed in strace by doing a +// writev() to FD -1 with the data being read/written. 
+#ifndef VCHAN_DEBUG +#define VCHAN_DEBUG 0 +#endif + +static uint32_t rd_prod(struct libvchan *ctrl) +{ + return ctrl->read.shr->prod; +} + +static uint32_t* _rd_cons(struct libvchan *ctrl) +{ + return &ctrl->read.shr->cons; +} +#define rd_cons(x) (*_rd_cons(x)) + +static uint32_t* _wr_prod(struct libvchan *ctrl) +{ + return &ctrl->write.shr->prod; +} +#define wr_prod(x) (*_wr_prod(x)) + +static uint32_t wr_cons(struct libvchan *ctrl) +{ + return ctrl->write.shr->cons; +} + +static const void* rd_ring(struct libvchan *ctrl) +{ + return ctrl->read.buffer; +} + +static void* wr_ring(struct libvchan *ctrl) +{ + return ctrl->write.buffer; +} + +static uint32_t wr_ring_size(struct libvchan *ctrl) +{ + return (1 << ctrl->write.order); +} + +static uint32_t rd_ring_size(struct libvchan *ctrl) +{ + return (1 << ctrl->read.order); +} + +int libvchan_data_ready(struct libvchan *ctrl) +{ + return rd_prod(ctrl) - rd_cons(ctrl); +} + +int libvchan_buffer_space(struct libvchan *ctrl) +{ + return wr_ring_size(ctrl) - (wr_prod(ctrl) - wr_cons(ctrl)); +} + +static int do_notify(struct libvchan *ctrl) +{ + struct ioctl_evtchn_notify notify; + notify.port = ctrl->event_port; + return ioctl(ctrl->event_fd, IOCTL_EVTCHN_NOTIFY, ¬ify); +} + +int libvchan_wait(struct libvchan *ctrl) +{ + int ret; + uint32_t dummy; + ret = read(ctrl->event_fd, &dummy, sizeof(dummy)); + if (ret == -1) + return -1; + write(ctrl->event_fd, &dummy, sizeof(dummy)); + return 0; +} + +/** + * returns -1 on error, or size on success + */ +static int do_send(struct libvchan *ctrl, const void *data, size_t size) +{ + int real_idx = wr_prod(ctrl) & (wr_ring_size(ctrl) - 1); + int avail_contig = wr_ring_size(ctrl) - real_idx; + if (VCHAN_DEBUG) { + char metainfo[32]; + struct iovec iov[2]; + iov[0].iov_base = metainfo; + iov[0].iov_len = snprintf(metainfo, 32, "vchan wr %d/%d", ctrl->other_domain_id, ctrl->device_number); + iov[1].iov_base = (void *)data; + iov[1].iov_len = size; + writev(-1, iov, 2); + } + if 
(avail_contig > size) + avail_contig = size; + memcpy(wr_ring(ctrl) + real_idx, data, avail_contig); + if (avail_contig < size) + { + // we rolled across the end of the ring + memcpy(wr_ring(ctrl), data + avail_contig, size - avail_contig); + } + wr_prod(ctrl) += size; + if (do_notify(ctrl) < 0) + return -1; + return size; +} + +/** + * returns 0 if no buffer space is available, -1 on error, or size on success + */ +int libvchan_send(struct libvchan *ctrl, const void *data, size_t size) +{ + int avail = libvchan_buffer_space(ctrl); + if (!libvchan_is_open(ctrl)) + return -1; + if (size > avail) + return 0; + return do_send(ctrl, data, size); +} + +int libvchan_write(struct libvchan *ctrl, const void *data, size_t size) +{ + int avail = libvchan_buffer_space(ctrl); + if (!libvchan_is_open(ctrl)) + return -1; + if (size > avail) + size = avail; + return do_send(ctrl, data, size); +} + +static int do_recv(struct libvchan *ctrl, void *data, size_t size) +{ + int real_idx = rd_cons(ctrl) & (rd_ring_size(ctrl) - 1); + int avail_contig = rd_ring_size(ctrl) - real_idx; + if (avail_contig > size) + avail_contig = size; + memcpy(data, rd_ring(ctrl) + real_idx, avail_contig); + if (avail_contig < size) + { + // we rolled across the end of the ring + memcpy(data + avail_contig, rd_ring(ctrl), size - avail_contig); + } + rd_cons(ctrl) += size; + if (VCHAN_DEBUG) { + char metainfo[32]; + struct iovec iov[2]; + iov[0].iov_base = metainfo; + iov[0].iov_len = snprintf(metainfo, 32, "vchan rd %d/%d", ctrl->other_domain_id, ctrl->device_number); + iov[1].iov_base = data; + iov[1].iov_len = size; + writev(-1, iov, 2); + } + if (do_notify(ctrl) < 0) + return -1; + return size; +} + +/** + * reads exactly size bytes from the vchan. 
+ * returns 0 if insufficient data is available, -1 on error, or size on success + */ +int libvchan_recv(struct libvchan *ctrl, void *data, size_t size) +{ + int avail = libvchan_data_ready(ctrl); + if (size <= avail) + return do_recv(ctrl, data, size); + else if (libvchan_is_open(ctrl)) + return 0; + else + return -1; +} + +int libvchan_read(struct libvchan *ctrl, void *data, size_t size) +{ + int avail = libvchan_data_ready(ctrl); + if (size > avail) + size = avail; + if (avail) + return do_recv(ctrl, data, size); + else if (libvchan_is_open(ctrl)) + return 0; + else + return -1; +} + +int libvchan_is_open(struct libvchan* ctrl) +{ + if (ctrl->is_server) + return ctrl->server_persist || ctrl->ring->cli_live; + else + return ctrl->ring->srv_live; +} + +/// The fd to use for select() set +int libvchan_fd_for_select(struct libvchan *ctrl) +{ + return ctrl->event_fd; +} + +void libvchan_close(struct libvchan *ctrl) +{ + if (!ctrl) + return; + if (ctrl->ring) { + if (ctrl->is_server) + ctrl->ring->srv_live = 0; + else + ctrl->ring->cli_live = 0; + munmap(ctrl->ring, PAGE_SIZE); + } + if (ctrl->event_fd != -1) { + if (ctrl->event_port > 0 && ctrl->ring) + do_notify(ctrl); + close(ctrl->event_fd); + } + if (ctrl->read.order >= PAGE_SHIFT) + munmap(ctrl->read.buffer, 1 << ctrl->read.order); + if (ctrl->write.order >= PAGE_SHIFT) + munmap(ctrl->write.buffer, 1 << ctrl->write.order); + free(ctrl); +} diff --git a/tools/libvchan/libvchan.h b/tools/libvchan/libvchan.h new file mode 100644 index 0000000..a6c08f4 --- /dev/null +++ b/tools/libvchan/libvchan.h @@ -0,0 +1,141 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free 
Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * @section DESCRIPTION + * + * Originally borrowed from the Qubes OS Project, http://www.qubes-os.org, + * this code has been substantially rewritten to use the gntdev and gntalloc + * devices instead of raw MFN''s and map_foreign_range. + * + * This is a library for inter-domain communication. A standard Xen ring + * buffer is used, with a datagram-based interface built on top. The grant + * reference is expected to be shared through some out-of-band mechanism + * such as XenStore. + */ + +#include <stdint.h> +#include <sys/types.h> +#include <xen/sys/evtchn.h> + +struct ring_shared { + uint32_t cons, prod; +}; + +/// struct vchan_interface is placed in memory shared between domains +struct vchan_interface { + // standard consumer/producer interface, one pair per buffer + // left is client write, server read + // right is client read, server write + struct ring_shared left, right; + // size of the rings, which determines their location + // 10 - at offset 1024 in ring''s page + // 11 - at offset 2048 in ring''s page + // 12+ - uses 2^(N-12) grants to describe the multi-page ring + // These should remain constant once the page is shared. + // Only one of the two orders can be 10 (or 11). + uint16_t left_order, right_order; + // Shutdown detection + uint8_t cli_live, srv_live; + uint16_t debug; + // Grant list: ordering is left, right. Must not extend into actual ring + // or grow beyond the end of the initial shared page. 
+ // These should remain constant once the page is shared, to allow + // for possible remapping by a client that restarts. + uint32_t grants[0]; +}; + +struct libvchan_ring { + // Pointer into the shared page. Offsets into buffer + struct ring_shared* shr; + // ring data + void* buffer; + // size of the ring is (1 << order). + // This is used to constrain offsets to the buffer. + // (we can''t trust the order in the shared page to remain constant) + int order; +}; + +/// struct libvchan is a control structure, passed to all library calls +struct libvchan { + // person we communicate with + int other_domain_id; + // "port" we communicate on (allows multiple vchans to exist in xenstore) + int device_number; + // Shared ring page, mapped using gntdev or gntalloc + // Note that the FD for gntdev or gntalloc has already been closed. + struct vchan_interface *ring; + // event channel interface (needs port for API) + int event_fd; + uint32_t event_port; + // informative flag + int is_server:1; + int server_persist:1; + struct libvchan_ring read, write; +}; + +/** + * Set up a vchan, including granting pages + * @param domain The peer domain that will be connecting + * @param devno A device number, used to identify this vchan in xenstore + * @param send_min The minimum size (in bytes) of the send ring (left) + * @param recv_min The minimum size (in bytes) of the receive ring (right) + * @return The structure, or NULL in case of an error + */ +struct libvchan *libvchan_server_init(int domain, int devno, size_t read_min, size_t write_min); +/** + * Connect to an existing vchan. Note: you can reconnect to an existing vchan + * safely, however no locking is performed, so you must prevent multiple clients + * from connecting to a single server. 
+ * + * @param domain The peer domain to connect to + * @param devno A device number, used to identify this vchan in xenstore + * @return The structure, or NULL in case of an error + */ +struct libvchan *libvchan_client_init(int domain, int devno); +/** + * Close a vchan. This deallocates the vchan and attempts to free its + * resources. The other side is notified of the close, but can still read any + * data pending prior to the close. + */ +void libvchan_close(struct libvchan *ctrl); + +// reads exactly size or aborts +int libvchan_recv(struct libvchan *ctrl, void *data, size_t size); +// reads up to size bytes (including zero) without blocking +int libvchan_read(struct libvchan *ctrl, void *data, size_t size); +// sends entire buffer or aborts +int libvchan_send(struct libvchan *ctrl, const void *data, size_t size); +// sends as much data as possible without blocking +int libvchan_write(struct libvchan *ctrl, const void *data, size_t size); +// waits for reads or writes to unblock, or for a close +int libvchan_wait(struct libvchan *ctrl); +// (only) when this FD is readable, libvchan_wait() will not block +int libvchan_fd_for_select(struct libvchan *ctrl); +// return 0 when one side has called libvchan_close() or crashed +// return 1 when both sides are open +// return 2 [server only] when no client has yet connected +int libvchan_is_open(struct libvchan* ctrl); +int libvchan_data_ready(struct libvchan *ctrl); +int libvchan_buffer_space(struct libvchan *ctrl); diff --git a/tools/libvchan/node-select.c b/tools/libvchan/node-select.c new file mode 100644 index 0000000..a7c614b --- /dev/null +++ b/tools/libvchan/node-select.c @@ -0,0 +1,161 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program is free software; you can redistribute it and/or modify + * 
it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * @section DESCRIPTION + * + * This is a test program for libvchan. Communications are bidirectional, + * with either server (grant offeror) or client able to read and write. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> + +#include "libvchan.h" + +void usage(char** argv) +{ + fprintf(stderr, "usage:\n" + "\t%s [client|server] domainid nodeid [rbufsiz wbufsiz]\n", + argv[0]); + exit(1); +} + +#define BUFSIZE 5000 +char inbuf[BUFSIZE]; +char outbuf[BUFSIZE]; +int insiz = 0; +int outsiz = 0; +struct libvchan *ctrl = 0; + +void vchan_wr() { + if (!insiz) + return; + int ret = libvchan_write(ctrl, inbuf, insiz); + if (ret < 0) { + fprintf(stderr, "vchan write failed\n"); + exit(1); + } + if (ret > 0) { + insiz -= ret; + memmove(inbuf, inbuf + ret, insiz); + } +} + +void stdout_wr() { + if (!outsiz) + return; + int ret = write(1, outbuf, outsiz); + if (ret < 0 && errno != EAGAIN) + exit(1); + if (ret > 0) { + outsiz -= ret; + memmove(outbuf, outbuf + ret, outsiz); + } +} + +/** + Simple libvchan application, both client and server. + Both sides may write and read, both from the libvchan and from + stdin/stdout (just like netcat). 
+*/ + +int main(int argc, char **argv) +{ + int ret; + int libvchan_fd; + if (argc < 4) + usage(argv); + if (!strcmp(argv[1], "server")) { + int rsiz = argc > 4 ? atoi(argv[4]) : 0; + int wsiz = argc > 5 ? atoi(argv[5]) : 0; + ctrl = libvchan_server_init(atoi(argv[2]), atoi(argv[3]), rsiz, wsiz); + } else if (!strcmp(argv[1], "client")) + ctrl = libvchan_client_init(atoi(argv[2]), atoi(argv[3])); + else + usage(argv); + if (!ctrl) { + perror("libvchan_*_init"); + exit(1); + } + + fcntl(0, F_SETFL, O_NONBLOCK); + fcntl(1, F_SETFL, O_NONBLOCK); + + libvchan_fd = libvchan_fd_for_select(ctrl); + for (;;) { + fd_set rfds; + fd_set wfds; + FD_ZERO(&rfds); + FD_ZERO(&wfds); + if (insiz != BUFSIZE) + FD_SET(0, &rfds); + if (outsiz) + FD_SET(1, &wfds); + FD_SET(libvchan_fd, &rfds); + ret = select(libvchan_fd + 1, &rfds, &wfds, NULL, NULL); + if (ret < 0) { + perror("select"); + exit(1); + } + if (FD_ISSET(0, &rfds)) { + ret = read(0, inbuf + insiz, BUFSIZE - insiz); + if (ret < 0 && errno != EAGAIN) + exit(1); + if (ret == 0) { + while (insiz) { + vchan_wr(); + libvchan_wait(ctrl); + } + return 0; + } + if (ret) + insiz += ret; + vchan_wr(); + } + if (FD_ISSET(libvchan_fd, &rfds)) { + libvchan_wait(ctrl); + vchan_wr(); + } + if (FD_ISSET(1, &wfds)) + stdout_wr(); + while (libvchan_data_ready(ctrl) && outsiz < BUFSIZE) { + ret = libvchan_read(ctrl, outbuf + outsiz, BUFSIZE - outsiz); + if (ret < 0) + exit(1); + outsiz += ret; + stdout_wr(); + } + if (!libvchan_is_open(ctrl)) { + fcntl(1, F_SETFL, 0); + while (outsiz) + stdout_wr(); + return 0; + } + } +} diff --git a/tools/libvchan/node.c b/tools/libvchan/node.c new file mode 100644 index 0000000..d00a50f --- /dev/null +++ b/tools/libvchan/node.c @@ -0,0 +1,169 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program 
is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * @section DESCRIPTION + * + * This is a test program for libvchan. Communications are in one direction, + * either server (grant offeror) to client or vice versa. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <time.h> + +#include "libvchan.h" + +int libvchan_write_all(struct libvchan *ctrl, char *buf, int size) +{ + int written = 0; + int ret; + while (written < size) { + ret = libvchan_write(ctrl, buf + written, size - written); + if (ret <= 0) { + perror("write"); + exit(1); + } + written += ret; + } + return size; +} + +int write_all(int fd, char *buf, int size) +{ + int written = 0; + int ret; + while (written < size) { + ret = write(fd, buf + written, size - written); + if (ret <= 0) { + perror("write"); + exit(1); + } + written += ret; + } + return size; +} + +void usage(char** argv) +{ + fprintf(stderr, "usage:\n" + "%s [client|server] [read|write] domid nodeid\n", argv[0]); + exit(1); +} + +#define BUFSIZE 5000 +char buf[BUFSIZE]; +void reader(struct libvchan *ctrl) +{ + int size; + for (;;) { + size = rand() % (BUFSIZE - 1) + 1; + size = libvchan_read(ctrl, buf, size); + fprintf(stderr, "#"); + if (size < 0) { + perror("read vchan"); + libvchan_close(ctrl); + exit(1); + } + if (size == 0) + break; + size = write_all(1, buf, size); + if (size 
< 0) { + perror("stdout write"); + exit(1); + } + if (size == 0) { + perror("write size=0?\n"); + exit(1); + } + } +} + +void writer(struct libvchan *ctrl) +{ + int size; + for (;;) { + size = rand() % (BUFSIZE - 1) + 1; + size = read(0, buf, size); + if (size < 0) { + perror("read stdin"); + libvchan_close(ctrl); + exit(1); + } + if (size == 0) + break; + size = libvchan_write_all(ctrl, buf, size); + fprintf(stderr, "#"); + if (size < 0) { + perror("vchan write"); + exit(1); + } + if (size == 0) { + perror("write size=0?\n"); + exit(1); + } + } +} + + +/** + Simple libvchan application, both client and server. + One side does writing, the other side does reading; both from + standard input/output fds. +*/ +int main(int argc, char **argv) +{ + int seed = time(0); + struct libvchan *ctrl = 0; + int wr; + if (argc < 4) + usage(argv); + if (!strcmp(argv[2], "read")) + wr = 0; + else if (!strcmp(argv[2], "write")) + wr = 1; + else + usage(argv); + if (!strcmp(argv[1], "server")) + ctrl = libvchan_server_init(atoi(argv[3]), atoi(argv[4]), 0, 0); + else if (!strcmp(argv[1], "client")) + ctrl = libvchan_client_init(atoi(argv[3]), atoi(argv[4])); + else + usage(argv); + if (!ctrl) { + perror("libvchan_*_init"); + exit(1); + } + + srand(seed); + fprintf(stderr, "seed=%d\n", seed); + if (wr) + writer(ctrl); + else + reader(ctrl); + libvchan_close(ctrl); + return 0; +} -- 1.7.6 _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Vasiliy G Tolstov
2011-Aug-22 07:40 UTC
Re: [Xen-devel] [PATCH] libvchan: interdomain communications library
On Fri, 19 Aug 2011 10:38:44 -0400, Daniel De Graaf wrote: > Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> > --- > > This version includes a local copy of gntalloc.h and gntdev.h to allow it to compile when the installed kernel headers do not include gntalloc and to support GNTDEV_SET_UNMAP_NOTIFY when the running kernel does not match the headers. Sorry for the off-topic question, but can you provide an example of how to use this library? _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Campbell
2011-Aug-22 09:15 UTC
Re: [Xen-devel] [PATCH] libvchan: interdomain communications library
Hi Daniel, On Fri, 2011-08-19 at 15:38 +0100, Daniel De Graaf wrote: Please could you say a few words about the functionality this new library enables and perhaps the design etc? In particular a protocol spec would be useful for anyone who wanted to reimplement for another guest OS etc. In particular memory barriers are conspicuous in their absence. I didn't review in detail but it seems like you have invented your own ring datastructures rather than using the standard ring.h ones. I think this needs some justification. I think it would be appropriate to add protocol.txt at the same time as checking in the library. In one of the headers it says "The grant reference is expected to be shared through some out-of-band mechanism such as XenStore." but the library appears to implement the xenstore stuff internally. > tools/libvchan/gntalloc.h | 82 ++++++++ > tools/libvchan/gntdev.h | 150 ++++++++++++++ If these define a Linux ioctl interface then they should go in tools/include/xen-sys/Linux/. > tools/libvchan/libvchan.h | 141 +++++++++++++ I presume that somewhere in there a shared datastructure is defined. In which case that might be better off added to xen/include/public/io/ instead. Cheers, Ian. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Aug-24 18:52 UTC
Re: [Xen-devel] [PATCH] libvchan: interdomain communications library
On 08/22/2011 05:15 AM, Ian Campbell wrote: > Hi Daniel, > > On Fri, 2011-08-19 at 15:38 +0100, Daniel De Graaf wrote: > > Please could you say a few words about the functionality this new library enables and perhaps the design etc? In particular a protocol spec would be useful for anyone who wanted to reimplement for another guest OS etc. In particular memory barriers are conspicuous in their absence. Agreed, barriers are needed for a robust implementation. Since this is a shared library and not Xen or kernel code, the existing barrier()/wmb() functions are not available. The only existing implementations in Xen appear to expand to asm("":::"memory") which does not actually implement a memory barrier (i.e. MFENCE or similar opcodes). The __sync_synchronize() compiler intrinsic properly implements these barriers and has been used. The lack of memory fencing in Xen seems to be a topic for another thread. > I didn't review in detail but it seems like you have invented your own ring datastructures rather than using the standard ring.h ones. I think this needs some justification. (This has been added as a comment.) The ring.h macros define an asymmetric interface to a shared data structure that assumes all rings reside in a single contiguous memory space. This is not suitable for vchan because the interface to the ring is symmetric except for the setup. Unlike the producer-consumer rings defined in ring.h, the sizes of the rings used in vchan are determined at execution time instead of compile time, so the macros in ring.h cannot be used to access the rings. > I think it would be appropriate to add protocol.txt at the same time as checking in the library. The comments in the shared header file explain the layout of the shared memory regions; any other parts of the protocol are application-defined. > In one of the headers it says "The grant reference is expected to be shared through some out-of-band mechanism such as XenStore." 
but the library appears to implement the xenstore stuff internally. The comment has been updated to point to the xenstore path used. >> tools/libvchan/gntalloc.h | 82 ++++++++ >> tools/libvchan/gntdev.h | 150 ++++++++++++++ > > If these define a Linux ioctl interface then they should go in tools/include/xen-sys/Linux/. > >> tools/libvchan/libvchan.h | 141 +++++++++++++ > > I presume that somewhere in there a shared datastructure is defined. In which case that might be better off added to xen/include/public/io/ instead. > > Cheers, > Ian. I'm sending an updated patch with these revisions in a separate email. -- Daniel De Graaf National Security Agency _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Aug-24 18:52 UTC
[Xen-devel] [PATCH v2] libvchan: interdomain communications library
This library implements a bidirectional communication interface between applications in different domains, similar to unix sockets. Data can be sent using the byte-oriented libvchan_read/libvchan_write or the packet-oriented libvchan_recv/libvchan_send. Channel setup is done using a client-server model; domain IDs and a port number must be negotiated prior to initialization. The server allocates memory for the shared pages and determines the sizes of the communication rings (which may span multiple pages, although the default places rings and control within a single page). With properly sized rings, testing has shown that this interface provides speed comparable to pipes within a single Linux domain; it is significantly faster than network-based communication. Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> --- tools/Makefile | 1 + tools/include/xen-sys/Linux/gntalloc.h | 82 ++++++ tools/include/xen-sys/Linux/gntdev.h | 33 +++- tools/libvchan/Makefile | 57 ++++ tools/libvchan/init.c | 463 ++++++++++++++++++++++++++++++++ tools/libvchan/io.c | 275 +++++++++++++++++++ tools/libvchan/node-select.c | 161 +++++++++++ tools/libvchan/node.c | 169 ++++++++++++ xen/include/public/io/libvchan.h | 154 +++++++++++ 9 files changed, 1394 insertions(+), 1 deletions(-) create mode 100644 tools/include/xen-sys/Linux/gntalloc.h create mode 100644 tools/libvchan/Makefile create mode 100644 tools/libvchan/init.c create mode 100644 tools/libvchan/io.c create mode 100644 tools/libvchan/node-select.c create mode 100644 tools/libvchan/node.c create mode 100644 xen/include/public/io/libvchan.h diff --git a/tools/Makefile b/tools/Makefile index df6270c..9389e1f 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -27,6 +27,7 @@ SUBDIRS-$(CONFIG_NetBSD) += blktap2 SUBDIRS-$(CONFIG_NetBSD) += xenbackendd SUBDIRS-y += libfsimage SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen +SUBDIRS-y += libvchan # do not recurse in to a dir we are about to delete ifneq "$(MAKECMDGOALS)" "distclean" diff --git 
a/tools/include/xen-sys/Linux/gntalloc.h b/tools/include/xen-sys/Linux/gntalloc.h new file mode 100644 index 0000000..76bd580 --- /dev/null +++ b/tools/include/xen-sys/Linux/gntalloc.h @@ -0,0 +1,82 @@ +/****************************************************************************** + * gntalloc.h + * + * Interface to /dev/xen/gntalloc. + * + * Author: Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * This file is in the public domain. + */ + +#ifndef __LINUX_PUBLIC_GNTALLOC_H__ +#define __LINUX_PUBLIC_GNTALLOC_H__ + +/* + * Allocates a new page and creates a new grant reference. + */ +#define IOCTL_GNTALLOC_ALLOC_GREF \ +_IOC(_IOC_NONE, ''G'', 5, sizeof(struct ioctl_gntalloc_alloc_gref)) +struct ioctl_gntalloc_alloc_gref { + /* IN parameters */ + /* The ID of the domain to be given access to the grants. */ + uint16_t domid; + /* Flags for this mapping */ + uint16_t flags; + /* Number of pages to map */ + uint32_t count; + /* OUT parameters */ + /* The offset to be used on a subsequent call to mmap(). */ + uint64_t index; + /* The grant references of the newly created grant, one per page */ + /* Variable size, depending on count */ + uint32_t gref_ids[1]; +}; + +#define GNTALLOC_FLAG_WRITABLE 1 + +/* + * Deallocates the grant reference, allowing the associated page to be freed if + * no other domains are using it. + */ +#define IOCTL_GNTALLOC_DEALLOC_GREF \ +_IOC(_IOC_NONE, ''G'', 6, sizeof(struct ioctl_gntalloc_dealloc_gref)) +struct ioctl_gntalloc_dealloc_gref { + /* IN parameters */ + /* The offset returned in the map operation */ + uint64_t index; + /* Number of references to unmap */ + uint32_t count; +}; + +/* + * Sets up an unmap notification within the page, so that the other side can do + * cleanup if this side crashes. Required to implement cross-domain robust + * mutexes or close notification on communication channels. + * + * Each mapped page only supports one notification; multiple calls referring to + * the same page overwrite the previous notification. 
You must clear the + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it + * to occur. + */ +#define IOCTL_GNTALLOC_SET_UNMAP_NOTIFY \ +_IOC(_IOC_NONE, ''G'', 7, sizeof(struct ioctl_gntalloc_unmap_notify)) +struct ioctl_gntalloc_unmap_notify { + /* IN parameters */ + /* Offset in the file descriptor for a byte within the page (same as + * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to + * be cleared. Otherwise, it can be any byte in the page whose + * notification we are adjusting. + */ + uint64_t index; + /* Action(s) to take on unmap */ + uint32_t action; + /* Event channel to notify */ + uint32_t event_channel_port; +}; + +/* Clear (set to zero) the byte specified by index */ +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 +/* Send an interrupt on the indicated event channel */ +#define UNMAP_NOTIFY_SEND_EVENT 0x2 + +#endif /* __LINUX_PUBLIC_GNTALLOC_H__ */ diff --git a/tools/include/xen-sys/Linux/gntdev.h b/tools/include/xen-sys/Linux/gntdev.h index 8bd1467..5304bd3 100644 --- a/tools/include/xen-sys/Linux/gntdev.h +++ b/tools/include/xen-sys/Linux/gntdev.h @@ -66,7 +66,7 @@ struct ioctl_gntdev_map_grant_ref { * before this ioctl is called, or an error will result. */ #define IOCTL_GNTDEV_UNMAP_GRANT_REF \ -_IOC(_IOC_NONE, ''G'', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) +_IOC(_IOC_NONE, ''G'', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) struct ioctl_gntdev_unmap_grant_ref { /* IN parameters */ /* The offset was returned by the corresponding map operation. */ @@ -116,4 +116,35 @@ struct ioctl_gntdev_set_max_grants { uint32_t count; }; +/* + * Sets up an unmap notification within the page, so that the other side can do + * cleanup if this side crashes. Required to implement cross-domain robust + * mutexes or close notification on communication channels. + * + * Each mapped page only supports one notification; multiple calls referring to + * the same page overwrite the previous notification. 
You must clear the + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it + * to occur. + */ +#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \ +_IOC(_IOC_NONE, ''G'', 7, sizeof(struct ioctl_gntdev_unmap_notify)) +struct ioctl_gntdev_unmap_notify { + /* IN parameters */ + /* Offset in the file descriptor for a byte within the page (same as + * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to + * be cleared. Otherwise, it can be any byte in the page whose + * notification we are adjusting. + */ + uint64_t index; + /* Action(s) to take on unmap */ + uint32_t action; + /* Event channel to notify */ + uint32_t event_channel_port; +}; + +/* Clear (set to zero) the byte specified by index */ +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 +/* Send an interrupt on the indicated event channel */ +#define UNMAP_NOTIFY_SEND_EVENT 0x2 + #endif /* __LINUX_PUBLIC_GNTDEV_H__ */ diff --git a/tools/libvchan/Makefile b/tools/libvchan/Makefile new file mode 100644 index 0000000..b4991ea --- /dev/null +++ b/tools/libvchan/Makefile @@ -0,0 +1,57 @@ +# +# tools/libvchan/Makefile +# + +XEN_ROOT = $(CURDIR)/../.. +include $(XEN_ROOT)/tools/Rules.mk + +LIBVCHAN_OBJS = init.o io.o +NODE_OBJS = node.o +NODE2_OBJS = node-select.o + +LIBVCHAN_LIBS = $(LDLIBS_libxenstore) +LIBVCHAN_OBJS: CFLAGS += $(CFLAGS_libxenstore) + +MAJOR = 1.0 +MINOR = 0 + +CFLAGS += -I../include -I. 
-fPIC + +.PHONY: all +all: libvchan.so vchan-node1 vchan-node2 libvchan.a + +libvchan.so: libvchan.so.$(MAJOR) + ln -sf $< $@ + +libvchan.so.$(MAJOR): libvchan.so.$(MAJOR).$(MINOR) + ln -sf $< $@ + +libvchan.so.$(MAJOR).$(MINOR): $(LIBVCHAN_OBJS) + $(CC) $(LDFLAGS) -Wl,$(SONAME_LDFLAG) -Wl,libvchan.so.$(MAJOR) $(SHLIB_LDFLAGS) -o $@ $^ $(LIBVCHAN_LIBS) + +libvchan.a: $(LIBVCHAN_OBJS) + $(AR) rcs libvchan.a $^ + +vchan-node1: $(NODE_OBJS) libvchan.so + $(CC) $(LDFLAGS) -o $@ $(NODE_OBJS) libvchan.so $(LDLIBS_libvchan) + +vchan-node2: $(NODE2_OBJS) libvchan.so + $(CC) $(LDFLAGS) -o $@ $(NODE2_OBJS) libvchan.so $(LDLIBS_libvchan) + +.PHONY: install +install: all + $(INSTALL_DIR) $(DESTDIR)$(LIBDIR) + $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR) + $(INSTALL_PROG) libvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR) + ln -sf libvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libvchan.so.$(MAJOR) + ln -sf libvchan.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libvchan.so + $(INSTALL_DATA) libvchan.a $(DESTDIR)$(LIBDIR) + $(INSTALL_DATA) libvchan.h $(DESTDIR)$(INCLUDEDIR) + +.PHONY: clean +clean: + $(RM) -f *.o *.so* *.a vchan-node1 vchan-node2 $(DEPS) + +distclean: clean + +-include $(DEPS) diff --git a/tools/libvchan/init.c b/tools/libvchan/init.c new file mode 100644 index 0000000..dedb071 --- /dev/null +++ b/tools/libvchan/init.c @@ -0,0 +1,463 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * @section DESCRIPTION + * + * This file contains the setup code used to establish the ring buffer. + */ + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/user.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> + +#include <xs.h> +#include <xen/sys/evtchn.h> +#include <xen/sys/gntalloc.h> +#include <xen/sys/gntdev.h> +#include <xen/io/libvchan.h> + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT 12 +#endif + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +#define max(a,b) ((a > b) ? a : b) + +static int init_gnt_srv(struct libvchan *ctrl) +{ + int pages_left = ctrl->read.order >= PAGE_SHIFT ? 1 << (ctrl->read.order - PAGE_SHIFT) : 0; + int pages_right = ctrl->write.order >= PAGE_SHIFT ? 
1 << (ctrl->write.order - PAGE_SHIFT) : 0; + struct ioctl_gntalloc_alloc_gref *gref_info = NULL; + int ring_fd = open("/dev/xen/gntalloc", O_RDWR); + int ring_ref = -1; + int err; + void *ring, *area; + + if (ring_fd < 0) + return -1; + + gref_info = malloc(sizeof(*gref_info) + max(pages_left, pages_right)*sizeof(uint32_t)); + + gref_info->domid = ctrl->other_domain_id; + gref_info->flags = GNTALLOC_FLAG_WRITABLE; + gref_info->count = 1; + + err = ioctl(ring_fd, IOCTL_GNTALLOC_ALLOC_GREF, gref_info); + if (err) + goto out; + + ring = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, ring_fd, gref_info->index); + + if (ring == MAP_FAILED) + goto out; + + ctrl->ring = ring; + ring_ref = gref_info->gref_ids[0]; + + memset(ring, 0, PAGE_SIZE); + + ctrl->read.shr = &ctrl->ring->left; + ctrl->write.shr = &ctrl->ring->right; + ctrl->ring->left_order = ctrl->read.order; + ctrl->ring->right_order = ctrl->write.order; + ctrl->ring->cli_live = 2; + ctrl->ring->srv_live = 1; + ctrl->ring->debug = 0xabcd; + +#ifdef IOCTL_GNTALLOC_SET_UNMAP_NOTIFY + { + struct ioctl_gntalloc_unmap_notify arg; + arg.index = gref_info->index + offsetof(struct vchan_interface, srv_live); + arg.action = UNMAP_NOTIFY_CLEAR_BYTE | UNMAP_NOTIFY_SEND_EVENT; + arg.event_channel_port = ctrl->event_port; + ioctl(ring_fd, IOCTL_GNTALLOC_SET_UNMAP_NOTIFY, &arg); + } +#endif + + if (ctrl->read.order == 10) { + ctrl->read.buffer = ((void*)ctrl->ring) + 1024; + } else if (ctrl->read.order == 11) { + ctrl->read.buffer = ((void*)ctrl->ring) + 2048; + } else { + gref_info->count = pages_left; + err = ioctl(ring_fd, IOCTL_GNTALLOC_ALLOC_GREF, gref_info); + if (err) + goto out_ring; + area = mmap(NULL, pages_left * PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_SHARED, ring_fd, gref_info->index); + if (area == MAP_FAILED) + goto out_ring; + ctrl->read.buffer = area; + memcpy(ctrl->ring->grants, gref_info->gref_ids, pages_left * sizeof(uint32_t)); + } + + if (ctrl->write.order == 10) { + ctrl->write.buffer = 
((void*)ctrl->ring) + 1024; + } else if (ctrl->write.order == 11) { + ctrl->write.buffer = ((void*)ctrl->ring) + 2048; + } else { + gref_info->count = pages_right; + err = ioctl(ring_fd, IOCTL_GNTALLOC_ALLOC_GREF, gref_info); + if (err) + goto out_unmap_left; + area = mmap(NULL, pages_right * PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_SHARED, ring_fd, gref_info->index); + if (area == MAP_FAILED) + goto out_unmap_left; + ctrl->write.buffer = area; + memcpy(ctrl->ring->grants + (pages_left * sizeof(uint32_t)), + gref_info->gref_ids, pages_right * sizeof(uint32_t)); + } + +out: + close(ring_fd); + free(gref_info); + return ring_ref; +out_unmap_left: + if (ctrl->read.order > 11) + munmap(ctrl->read.buffer, pages_left * PAGE_SIZE); +out_ring: + munmap(ring, PAGE_SIZE); + ring_ref = -1; + ctrl->ring = NULL; + ctrl->write.order = ctrl->read.order = 0; + goto out; +} + +static void* do_gnt_map(int fd, int domid, uint32_t* pages, size_t npages, uint64_t *index) +{ + int i, rv; + void* area = NULL; + struct ioctl_gntdev_map_grant_ref *gref_info; + gref_info = malloc(sizeof(*gref_info) + npages*sizeof(gref_info->refs[0])); + gref_info->count = npages; + for(i=0; i < npages; i++) { + gref_info->refs[i].domid = domid; + gref_info->refs[i].ref = pages[i]; + } + + rv = ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, gref_info); + if (rv) + goto out; + if (index) + *index = gref_info->index; + area = mmap(NULL, PAGE_SIZE * npages, PROT_READ | PROT_WRITE, MAP_SHARED, fd, gref_info->index); + if (area == MAP_FAILED) { + struct ioctl_gntdev_unmap_grant_ref undo = { + .index = gref_info->index, + .count = gref_info->count + }; + ioctl(fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &undo); + area = NULL; + } + out: + free(gref_info); + return area; +} + +static int init_gnt_cli(struct libvchan *ctrl, uint32_t ring_ref) +{ + int ring_fd = open("/dev/xen/gntdev", O_RDWR); + int rv = -1; + uint64_t ring_index; + uint32_t *grants; + if (ring_fd < 0) + return -1; + + ctrl->ring = do_gnt_map(ring_fd, 
ctrl->other_domain_id, &ring_ref, 1, &ring_index); + + if (!ctrl->ring) + goto out; + + ctrl->write.order = ctrl->ring->left_order; + ctrl->read.order = ctrl->ring->right_order; + ctrl->write.shr = &ctrl->ring->left; + ctrl->read.shr = &ctrl->ring->right; + if (ctrl->write.order < 10 || ctrl->write.order > 24) + goto out_unmap_ring; + if (ctrl->read.order < 10 || ctrl->read.order > 24) + goto out_unmap_ring; + if (ctrl->read.order == ctrl->write.order && ctrl->read.order < 12) + goto out_unmap_ring; + + grants = ctrl->ring->grants; + + if (ctrl->write.order == 10) { + ctrl->write.buffer = ((void*)ctrl->ring) + 1024; + } else if (ctrl->write.order == 11) { + ctrl->write.buffer = ((void*)ctrl->ring) + 2048; + } else { + int pages_left = 1 << (ctrl->write.order - PAGE_SHIFT); + ctrl->write.buffer = do_gnt_map(ring_fd, ctrl->other_domain_id, grants, pages_left, NULL); + if (!ctrl->write.buffer) + goto out_unmap_ring; + grants += pages_left; + } + + if (ctrl->read.order == 10) { + ctrl->read.buffer = ((void*)ctrl->ring) + 1024; + } else if (ctrl->read.order == 11) { + ctrl->read.buffer = ((void*)ctrl->ring) + 2048; + } else { + int pages_right = 1 << (ctrl->read.order - PAGE_SHIFT); + ctrl->read.buffer = do_gnt_map(ring_fd, ctrl->other_domain_id, grants, pages_right, NULL); + if (!ctrl->read.buffer) + goto out_unmap_left; + } + +#ifdef IOCTL_GNTDEV_SET_UNMAP_NOTIFY + { + struct ioctl_gntdev_unmap_notify arg; + arg.index = ring_index + offsetof(struct vchan_interface, cli_live); + arg.action = UNMAP_NOTIFY_CLEAR_BYTE | UNMAP_NOTIFY_SEND_EVENT; + arg.event_channel_port = ctrl->event_port; + ioctl(ring_fd, IOCTL_GNTDEV_SET_UNMAP_NOTIFY, &arg); + } +#endif + + rv = 0; + out: + close(ring_fd); + return rv; + out_unmap_left: + if (ctrl->write.order >= PAGE_SHIFT) + munmap(ctrl->write.buffer, 1 << ctrl->write.order); + out_unmap_ring: + munmap(ctrl->ring, PAGE_SIZE); + ctrl->ring = 0; + ctrl->write.order = ctrl->read.order = 0; + rv = -1; + goto out; +} + +static int 
init_evt_srv(struct libvchan *ctrl) +{ + struct ioctl_evtchn_bind_unbound_port bind; + ctrl->event_fd = open("/dev/xen/evtchn", O_RDWR); + if (ctrl->event_fd < 0) + return -1; + bind.remote_domain = ctrl->other_domain_id; + ctrl->event_port = ioctl(ctrl->event_fd, IOCTL_EVTCHN_BIND_UNBOUND_PORT, &bind); + if (ctrl->event_port < 0) + return -1; + write(ctrl->event_fd, &ctrl->event_port, sizeof(ctrl->event_port)); + return 0; +} + +static int init_xs_srv(struct libvchan *ctrl, int ring_ref) +{ + int ret = -1; + struct xs_handle *xs; + struct xs_permissions perms[2]; + char buf[64]; + char ref[16]; + char* domid_str = NULL; + xs = xs_domain_open(); + if (!xs) + goto fail; + domid_str = xs_read(xs, 0, "domid", NULL); + if (!domid_str) + goto fail_xs_open; + + // owner domain is us + perms[0].id = atoi(domid_str); + // permissions for domains not listed = none + perms[0].perms = XS_PERM_NONE; + // other domains + perms[1].id = ctrl->other_domain_id; + perms[1].perms = XS_PERM_READ; + + snprintf(ref, sizeof ref, "%d", ring_ref); + snprintf(buf, sizeof buf, "data/vchan/%d/ring-ref", ctrl->device_number); + if (!xs_write(xs, 0, buf, ref, strlen(ref))) + goto fail_xs_open; + if (!xs_set_permissions(xs, 0, buf, perms, 2)) + goto fail_xs_open; + + snprintf(ref, sizeof ref, "%d", ctrl->event_port); + snprintf(buf, sizeof buf, "data/vchan/%d/event-channel", ctrl->device_number); + if (!xs_write(xs, 0, buf, ref, strlen(ref))) + goto fail_xs_open; + if (!xs_set_permissions(xs, 0, buf, perms, 2)) + goto fail_xs_open; + + ret = 0; + fail_xs_open: + free(domid_str); + xs_daemon_close(xs); + fail: + return ret; +} + +static int min_order(size_t siz) +{ + int rv = PAGE_SHIFT; + while (siz > (1 << rv)) + rv++; + return rv; +} + +struct libvchan *libvchan_server_init(int domain, int devno, size_t left_min, size_t right_min) +{ + // if you go over this size, you'll have too many grants to fit in the shared page.
+ size_t MAX_RING_SIZE = 256 * PAGE_SIZE; + struct libvchan *ctrl; + int ring_ref; + if (left_min > MAX_RING_SIZE || right_min > MAX_RING_SIZE) + return 0; + + ctrl = malloc(sizeof(*ctrl)); + if (!ctrl) + return 0; + + ctrl->other_domain_id = domain; + ctrl->device_number = devno; + ctrl->ring = NULL; + ctrl->event_fd = -1; + ctrl->is_server = 1; + ctrl->server_persist = 0; + + ctrl->read.order = min_order(left_min); + ctrl->write.order = min_order(right_min); + + // if we can avoid allocating extra pages by using in-page rings, do so +#define MAX_SMALL_RING 1024 +#define MAX_LARGE_RING 2048 + if (left_min <= MAX_SMALL_RING && right_min <= MAX_LARGE_RING) { + ctrl->read.order = 10; + ctrl->write.order = 11; + } else if (left_min <= MAX_LARGE_RING && right_min <= MAX_SMALL_RING) { + ctrl->read.order = 11; + ctrl->write.order = 10; + } else if (left_min <= MAX_LARGE_RING) { + ctrl->read.order = 11; + } else if (right_min <= MAX_LARGE_RING) { + ctrl->write.order = 11; + } + if (init_evt_srv(ctrl)) + goto out; + ring_ref = init_gnt_srv(ctrl); + if (ring_ref < 0) + goto out; + if (init_xs_srv(ctrl, ring_ref)) + goto out; + return ctrl; +out: + libvchan_close(ctrl); + return 0; +} + +static int init_evt_cli(struct libvchan *ctrl) +{ + struct ioctl_evtchn_bind_interdomain bind; + ctrl->event_fd = open("/dev/xen/evtchn", O_RDWR); + if (ctrl->event_fd < 0) + return -1; + + bind.remote_domain = ctrl->other_domain_id; + bind.remote_port = ctrl->event_port; + ctrl->event_port = ioctl(ctrl->event_fd, IOCTL_EVTCHN_BIND_INTERDOMAIN, &bind); + if (ctrl->event_port < 0) + return -1; + return 0; +} + + +struct libvchan *libvchan_client_init(int domain, int devno) +{ + struct libvchan *ctrl = malloc(sizeof(struct libvchan)); + struct xs_handle *xs = NULL; + char buf[64]; + char *ref; + int ring_ref; + unsigned int len; + if (!ctrl) + return 0; + ctrl->other_domain_id = domain; + ctrl->device_number = devno; + ctrl->ring = NULL; + ctrl->event_fd = -1; + ctrl->write.order = 
ctrl->read.order = 0; + ctrl->is_server = 0; + + xs = xs_daemon_open(); + if (!xs) + xs = xs_domain_open(); + if (!xs) + goto fail; + +// find xenstore entry + snprintf(buf, sizeof buf, "/local/domain/%d/data/vchan/%d/ring-ref", + ctrl->other_domain_id, ctrl->device_number); + ref = xs_read(xs, 0, buf, &len); + if (!ref) + goto fail; + ring_ref = atoi(ref); + free(ref); + if (!ring_ref) + goto fail; + snprintf(buf, sizeof buf, "/local/domain/%d/data/vchan/%d/event-channel", + ctrl->other_domain_id, ctrl->device_number); + ref = xs_read(xs, 0, buf, &len); + if (!ref) + goto fail; + ctrl->event_port = atoi(ref); + free(ref); + if (!ctrl->event_port) + goto fail; + +// set up event channel + if (init_evt_cli(ctrl)) + goto fail; + +// set up shared page(s) + if (init_gnt_cli(ctrl, ring_ref)) + goto fail; + + ctrl->ring->cli_live = 1; + ctrl->ring->debug = 0xabce; + + out: + if (xs) + xs_daemon_close(xs); + return ctrl; + fail: + libvchan_close(ctrl); + ctrl = NULL; + goto out; +} diff --git a/tools/libvchan/io.c b/tools/libvchan/io.c new file mode 100644 index 0000000..9f60a80 --- /dev/null +++ b/tools/libvchan/io.c @@ -0,0 +1,275 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * @section DESCRIPTION + * + * This file contains the communications interface built on the ring buffer. + */ + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/uio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <xenctrl.h> +#include <xen/io/libvchan.h> + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT 12 +#endif + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +// allow vchan data to be easily observed in strace by doing a +// writev() to FD -1 with the data being read/written. +#ifndef VCHAN_DEBUG +#define VCHAN_DEBUG 0 +#endif + +#define barrier() __sync_synchronize() + +static uint32_t rd_prod(struct libvchan *ctrl) +{ + return ctrl->read.shr->prod; +} + +static uint32_t* _rd_cons(struct libvchan *ctrl) +{ + return &ctrl->read.shr->cons; +} +#define rd_cons(x) (*_rd_cons(x)) + +static uint32_t* _wr_prod(struct libvchan *ctrl) +{ + return &ctrl->write.shr->prod; +} +#define wr_prod(x) (*_wr_prod(x)) + +static uint32_t wr_cons(struct libvchan *ctrl) +{ + return ctrl->write.shr->cons; +} + +static const void* rd_ring(struct libvchan *ctrl) +{ + return ctrl->read.buffer; +} + +static void* wr_ring(struct libvchan *ctrl) +{ + return ctrl->write.buffer; +} + +static uint32_t wr_ring_size(struct libvchan *ctrl) +{ + return (1 << ctrl->write.order); +} + +static uint32_t rd_ring_size(struct libvchan *ctrl) +{ + return (1 << ctrl->read.order); +} + +int libvchan_data_ready(struct libvchan *ctrl) +{ + return rd_prod(ctrl) - rd_cons(ctrl); +} + +int libvchan_buffer_space(struct libvchan *ctrl) +{ + return wr_ring_size(ctrl) - (wr_prod(ctrl) - wr_cons(ctrl)); +} + +static int do_notify(struct libvchan *ctrl) +{ + struct ioctl_evtchn_notify notify; + notify.port = ctrl->event_port; + 
return ioctl(ctrl->event_fd, IOCTL_EVTCHN_NOTIFY, &notify); +} + +int libvchan_wait(struct libvchan *ctrl) +{ + int ret; + uint32_t dummy; + ret = read(ctrl->event_fd, &dummy, sizeof(dummy)); + if (ret == -1) + return -1; + write(ctrl->event_fd, &dummy, sizeof(dummy)); + return 0; +} + +/** + * returns -1 on error, or size on success + */ +static int do_send(struct libvchan *ctrl, const void *data, size_t size) +{ + int real_idx = wr_prod(ctrl) & (wr_ring_size(ctrl) - 1); + int avail_contig = wr_ring_size(ctrl) - real_idx; + if (VCHAN_DEBUG) { + char metainfo[32]; + struct iovec iov[2]; + iov[0].iov_base = metainfo; + iov[0].iov_len = snprintf(metainfo, 32, "vchan wr %d/%d", ctrl->other_domain_id, ctrl->device_number); + iov[1].iov_base = (void *)data; + iov[1].iov_len = size; + writev(-1, iov, 2); + } + if (avail_contig > size) + avail_contig = size; + memcpy(wr_ring(ctrl) + real_idx, data, avail_contig); + if (avail_contig < size) + { + // we rolled across the end of the ring + memcpy(wr_ring(ctrl), data + avail_contig, size - avail_contig); + } + barrier(); // data must be in the ring prior to increment + wr_prod(ctrl) += size; + barrier(); // increment must happen prior to notify + if (do_notify(ctrl) < 0) + return -1; + return size; +} + +/** + * returns 0 if no buffer space is available, -1 on error, or size on success + */ +int libvchan_send(struct libvchan *ctrl, const void *data, size_t size) +{ + int avail = libvchan_buffer_space(ctrl); + if (!libvchan_is_open(ctrl)) + return -1; + if (size > avail) + return 0; + return do_send(ctrl, data, size); +} + +int libvchan_write(struct libvchan *ctrl, const void *data, size_t size) +{ + int avail = libvchan_buffer_space(ctrl); + if (!libvchan_is_open(ctrl)) + return -1; + if (size > avail) + size = avail; + return do_send(ctrl, data, size); +} + +static int do_recv(struct libvchan *ctrl, void *data, size_t size) +{ + int real_idx = rd_cons(ctrl) & (rd_ring_size(ctrl) - 1); + int avail_contig = rd_ring_size(ctrl) -
real_idx; + if (avail_contig > size) + avail_contig = size; + barrier(); // data read must happen after rd_cons read + memcpy(data, rd_ring(ctrl) + real_idx, avail_contig); + if (avail_contig < size) + { + // we rolled across the end of the ring + memcpy(data + avail_contig, rd_ring(ctrl), size - avail_contig); + } + rd_cons(ctrl) += size; + if (VCHAN_DEBUG) { + char metainfo[32]; + struct iovec iov[2]; + iov[0].iov_base = metainfo; + iov[0].iov_len = snprintf(metainfo, 32, "vchan rd %d/%d", ctrl->other_domain_id, ctrl->device_number); + iov[1].iov_base = data; + iov[1].iov_len = size; + writev(-1, iov, 2); + } + barrier(); // consumption must happen prior to notify of newly freed space + if (do_notify(ctrl) < 0) + return -1; + return size; +} + +/** + * reads exactly size bytes from the vchan. + * returns 0 if insufficient data is available, -1 on error, or size on success + */ +int libvchan_recv(struct libvchan *ctrl, void *data, size_t size) +{ + int avail = libvchan_data_ready(ctrl); + if (size <= avail) + return do_recv(ctrl, data, size); + else if (libvchan_is_open(ctrl)) + return 0; + else + return -1; +} + +int libvchan_read(struct libvchan *ctrl, void *data, size_t size) +{ + int avail = libvchan_data_ready(ctrl); + if (size > avail) + size = avail; + if (avail) + return do_recv(ctrl, data, size); + else if (libvchan_is_open(ctrl)) + return 0; + else + return -1; +} + +int libvchan_is_open(struct libvchan* ctrl) +{ + if (ctrl->is_server) + return ctrl->server_persist || ctrl->ring->cli_live; + else + return ctrl->ring->srv_live; +} + +/// The fd to use for select() set +int libvchan_fd_for_select(struct libvchan *ctrl) +{ + return ctrl->event_fd; +} + +void libvchan_close(struct libvchan *ctrl) +{ + if (!ctrl) + return; + if (ctrl->ring) { + if (ctrl->is_server) + ctrl->ring->srv_live = 0; + else + ctrl->ring->cli_live = 0; + munmap(ctrl->ring, PAGE_SIZE); + } + if (ctrl->event_fd != -1) { + if (ctrl->event_port > 0 && ctrl->ring) + do_notify(ctrl); + 
close(ctrl->event_fd); + } + if (ctrl->read.order >= PAGE_SHIFT) + munmap(ctrl->read.buffer, 1 << ctrl->read.order); + if (ctrl->write.order >= PAGE_SHIFT) + munmap(ctrl->write.buffer, 1 << ctrl->write.order); + free(ctrl); +} diff --git a/tools/libvchan/node-select.c b/tools/libvchan/node-select.c new file mode 100644 index 0000000..0000bc8 --- /dev/null +++ b/tools/libvchan/node-select.c @@ -0,0 +1,161 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * @section DESCRIPTION + * + * This is a test program for libvchan. Communications are bidirectional, + * with either server (grant offeror) or client able to read and write. 
+ */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> + +#include <xen/io/libvchan.h> + +void usage(char** argv) +{ + fprintf(stderr, "usage:\n" + "\t%s [client|server] domainid nodeid [rbufsiz wbufsiz]\n", + argv[0]); + exit(1); +} + +#define BUFSIZE 5000 +char inbuf[BUFSIZE]; +char outbuf[BUFSIZE]; +int insiz = 0; +int outsiz = 0; +struct libvchan *ctrl = 0; + +void vchan_wr() { + if (!insiz) + return; + int ret = libvchan_write(ctrl, inbuf, insiz); + if (ret < 0) { + fprintf(stderr, "vchan write failed\n"); + exit(1); + } + if (ret > 0) { + insiz -= ret; + memmove(inbuf, inbuf + ret, insiz); + } +} + +void stdout_wr() { + if (!outsiz) + return; + int ret = write(1, outbuf, outsiz); + if (ret < 0 && errno != EAGAIN) + exit(1); + if (ret > 0) { + outsiz -= ret; + memmove(outbuf, outbuf + ret, outsiz); + } +} + +/** + Simple libvchan application, both client and server. + Both sides may write and read, both from the libvchan and from + stdin/stdout (just like netcat). +*/ + +int main(int argc, char **argv) +{ + int ret; + int libvchan_fd; + if (argc < 4) + usage(argv); + if (!strcmp(argv[1], "server")) { + int rsiz = argc > 4 ? atoi(argv[4]) : 0; + int wsiz = argc > 5 ? 
atoi(argv[5]) : 0; + ctrl = libvchan_server_init(atoi(argv[2]), atoi(argv[3]), rsiz, wsiz); + } else if (!strcmp(argv[1], "client")) + ctrl = libvchan_client_init(atoi(argv[2]), atoi(argv[3])); + else + usage(argv); + if (!ctrl) { + perror("libvchan_*_init"); + exit(1); + } + + fcntl(0, F_SETFL, O_NONBLOCK); + fcntl(1, F_SETFL, O_NONBLOCK); + + libvchan_fd = libvchan_fd_for_select(ctrl); + for (;;) { + fd_set rfds; + fd_set wfds; + FD_ZERO(&rfds); + FD_ZERO(&wfds); + if (insiz != BUFSIZE) + FD_SET(0, &rfds); + if (outsiz) + FD_SET(1, &wfds); + FD_SET(libvchan_fd, &rfds); + ret = select(libvchan_fd + 1, &rfds, &wfds, NULL, NULL); + if (ret < 0) { + perror("select"); + exit(1); + } + if (FD_ISSET(0, &rfds)) { + ret = read(0, inbuf + insiz, BUFSIZE - insiz); + if (ret < 0 && errno != EAGAIN) + exit(1); + if (ret == 0) { + while (insiz) { + vchan_wr(); + libvchan_wait(ctrl); + } + return 0; + } + if (ret) + insiz += ret; + vchan_wr(); + } + if (FD_ISSET(libvchan_fd, &rfds)) { + libvchan_wait(ctrl); + vchan_wr(); + } + if (FD_ISSET(1, &wfds)) + stdout_wr(); + while (libvchan_data_ready(ctrl) && outsiz < BUFSIZE) { + ret = libvchan_read(ctrl, outbuf + outsiz, BUFSIZE - outsiz); + if (ret < 0) + exit(1); + outsiz += ret; + stdout_wr(); + } + if (!libvchan_is_open(ctrl)) { + fcntl(1, F_SETFL, 0); + while (outsiz) + stdout_wr(); + return 0; + } + } +} diff --git a/tools/libvchan/node.c b/tools/libvchan/node.c new file mode 100644 index 0000000..fa670b3 --- /dev/null +++ b/tools/libvchan/node.c @@ -0,0 +1,169 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * @section DESCRIPTION + * + * This is a test program for libvchan. Communications are in one direction, + * either server (grant offeror) to client or vice versa. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <time.h> + +#include <xen/io/libvchan.h> + +int libvchan_write_all(struct libvchan *ctrl, char *buf, int size) +{ + int written = 0; + int ret; + while (written < size) { + ret = libvchan_write(ctrl, buf + written, size - written); + if (ret <= 0) { + perror("write"); + exit(1); + } + written += ret; + } + return size; +} + +int write_all(int fd, char *buf, int size) +{ + int written = 0; + int ret; + while (written < size) { + ret = write(fd, buf + written, size - written); + if (ret <= 0) { + perror("write"); + exit(1); + } + written += ret; + } + return size; +} + +void usage(char** argv) +{ + fprintf(stderr, "usage:\n" + "%s [client|server] [read|write] domid nodeid\n", argv[0]); + exit(1); +} + +#define BUFSIZE 5000 +char buf[BUFSIZE]; +void reader(struct libvchan *ctrl) +{ + int size; + for (;;) { + size = rand() % (BUFSIZE - 1) + 1; + size = libvchan_read(ctrl, buf, size); + fprintf(stderr, "#"); + if (size < 0) { + perror("read vchan"); + libvchan_close(ctrl); + exit(1); + } + if (size == 0) + break; + size = write_all(1, buf, size); + if (size < 0) { + perror("stdout write"); + exit(1); + } + if (size == 0) { + perror("write size=0?\n"); + exit(1); + } + } +} + +void writer(struct libvchan *ctrl) +{ + int size; + for (;;) { + 
size = rand() % (BUFSIZE - 1) + 1; + size = read(0, buf, size); + if (size < 0) { + perror("read stdin"); + libvchan_close(ctrl); + exit(1); + } + if (size == 0) + break; + size = libvchan_write_all(ctrl, buf, size); + fprintf(stderr, "#"); + if (size < 0) { + perror("vchan write"); + exit(1); + } + if (size == 0) { + perror("write size=0?\n"); + exit(1); + } + } +} + + +/** + Simple libvchan application, both client and server. + One side does writing, the other side does reading; both from + standard input/output fds. +*/ +int main(int argc, char **argv) +{ + int seed = time(0); + struct libvchan *ctrl = 0; + int wr; + if (argc < 4) + usage(argv); + if (!strcmp(argv[2], "read")) + wr = 0; + else if (!strcmp(argv[2], "write")) + wr = 1; + else + usage(argv); + if (!strcmp(argv[1], "server")) + ctrl = libvchan_server_init(atoi(argv[3]), atoi(argv[4]), 0, 0); + else if (!strcmp(argv[1], "client")) + ctrl = libvchan_client_init(atoi(argv[3]), atoi(argv[4])); + else + usage(argv); + if (!ctrl) { + perror("libvchan_*_init"); + exit(1); + } + + srand(seed); + fprintf(stderr, "seed=%d\n", seed); + if (wr) + writer(ctrl); + else + reader(ctrl); + libvchan_close(ctrl); + return 0; +} diff --git a/xen/include/public/io/libvchan.h b/xen/include/public/io/libvchan.h new file mode 100644 index 0000000..45a9b85 --- /dev/null +++ b/xen/include/public/io/libvchan.h @@ -0,0 +1,154 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * @section DESCRIPTION + * + * Originally borrowed from the Qubes OS Project, http://www.qubes-os.org, + * this code has been substantially rewritten to use the gntdev and gntalloc + * devices instead of raw MFNs and map_foreign_range. + * + * This is a library for inter-domain communication. A standard Xen ring + * buffer is used, with a datagram-based interface built on top. The grant + * reference and event channels are shared in XenStore under the path + * /local/domain/<domid>/data/vchan/<port>/{ring-ref,event-channel} + * + * The ring.h macros define an asymmetric interface to a shared data structure + * that assumes all rings reside in a single contiguous memory space. This is + * not suitable for vchan because the interface to the ring is symmetric except + * for the setup. Unlike the producer-consumer rings defined in ring.h, the + * size of the rings used in vchan are determined at execution time instead of + * compile time, so the macros in ring.h cannot be used to access the rings. 
+ */ + +#include <stdint.h> +#include <sys/types.h> +#include <xen/sys/evtchn.h> + +struct ring_shared { + uint32_t cons, prod; +}; + +/// struct vchan_interface is placed in memory shared between domains +struct vchan_interface { + // standard consumer/producer interface, one pair per buffer + // left is client write, server read + // right is client read, server write + struct ring_shared left, right; + // size of the rings, which determines their location + // 10 - at offset 1024 in ring's page + // 11 - at offset 2048 in ring's page + // 12+ - uses 2^(N-12) grants to describe the multi-page ring + // These should remain constant once the page is shared. + // Only one of the two orders can be 10 (or 11). + uint16_t left_order, right_order; + // Shutdown detection: + // 0: client (or server) has exited + // 1: client (or server) is connected + // 2: client has not yet connected + uint8_t cli_live, srv_live; + // structure padding; magic values depending on setup stage + uint16_t debug; + // Grant list: ordering is left, right. Must not extend into actual ring + // or grow beyond the end of the initial shared page. + // These should remain constant once the page is shared, to allow + // for possible remapping by a client that restarts. + uint32_t grants[0]; +}; + +struct libvchan_ring { + // Pointer into the shared page. Offsets into buffer + struct ring_shared* shr; + // ring data + void* buffer; + // size of the ring is (1 << order). + // This is used to constrain offsets to the buffer. + // (we can't trust the order in the shared page to remain constant) + int order; +}; + +/// struct libvchan is a control structure, passed to all library calls +struct libvchan { + // person we communicate with + int other_domain_id; + // "port" we communicate on (allows multiple vchans to exist in xenstore) + int device_number; + // Shared ring page, mapped using gntdev or gntalloc + // Note that the FD for gntdev or gntalloc has already been closed.
+ struct vchan_interface *ring; + // event channel interface (needs port for API) + int event_fd; + uint32_t event_port; + // informative flags: are we acting as server? + int is_server:1; + // true if server remains active when client closes (allows reconnection) + int server_persist:1; + // communication rings + struct libvchan_ring read, write; +}; + +/** + * Set up a vchan, including granting pages + * @param domain The peer domain that will be connecting + * @param devno A device number, used to identify this vchan in xenstore + * @param send_min The minimum size (in bytes) of the send ring (left) + * @param recv_min The minimum size (in bytes) of the receive ring (right) + * @return The structure, or NULL in case of an error + */ +struct libvchan *libvchan_server_init(int domain, int devno, size_t read_min, size_t write_min); +/** + * Connect to an existing vchan. Note: you can reconnect to an existing vchan + * safely, however no locking is performed, so you must prevent multiple clients + * from connecting to a single server. + * + * @param domain The peer domain to connect to + * @param devno A device number, used to identify this vchan in xenstore + * @return The structure, or NULL in case of an error + */ +struct libvchan *libvchan_client_init(int domain, int devno); +/** + * Close a vchan. This deallocates the vchan and attempts to free its + * resources. The other side is notified of the close, but can still read any + * data pending prior to the close. 
+ */ +void libvchan_close(struct libvchan *ctrl); + +// reads exactly size or returns -1 +int libvchan_recv(struct libvchan *ctrl, void *data, size_t size); +// reads up to size bytes (including zero) without blocking or returns -1 +int libvchan_read(struct libvchan *ctrl, void *data, size_t size); +// sends entire buffer or returns -1 +int libvchan_send(struct libvchan *ctrl, const void *data, size_t size); +// sends as much data as possible without blocking +int libvchan_write(struct libvchan *ctrl, const void *data, size_t size); +// waits for reads or writes to unblock, or for a close +int libvchan_wait(struct libvchan *ctrl); +// (only) when this FD is readable, libvchan_wait() will not block +int libvchan_fd_for_select(struct libvchan *ctrl); +// return 0 when one side has called libvchan_close() or crashed +// return 1 when both sides are open +// return 2 [server only] when no client has yet connected +int libvchan_is_open(struct libvchan* ctrl); +int libvchan_data_ready(struct libvchan *ctrl); +int libvchan_buffer_space(struct libvchan *ctrl); -- 1.7.6 _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Konrad Rzeszutek Wilk
2011-Aug-24 19:28 UTC
Re: [Xen-devel] [PATCH] libvchan: interdomain communications library
On Mon, Aug 22, 2011 at 11:40:42AM +0400, Vasiliy G Tolstov wrote: > On Fri, 19 Aug 2011 10:38:44 -0400, Daniel De Graaf wrote: > >Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> > >--- > > > >This version includes a local copy of gntalloc.h and gntdev.h to > >allow > >it to compile when the installed kernel headers do not include > >gntalloc > >and to support GNTDEV_SET_UNMAP_NOTIFY when the running kernel > >does not > >match the headers. > > > > Sorry for offtopic, can you provide example of usage this library? There is a linpicker and then hopefully the Xen PV audio can transition to it (it uses the same devices that this library introduces). > > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xensource.com > http://lists.xensource.com/xen-devel _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Tim Deegan
2011-Aug-25 10:27 UTC
Re: [Xen-devel] [PATCH] libvchan: interdomain communications library
At 14:52 -0400 on 24 Aug (1314197572), Daniel De Graaf wrote: > Agreed, barriers are needed for a robust implementation. Since this is > a shared library and not xen or kernel code, the existing barrier()/wmb() > functions are not available. The only existing implementations in Xen > appear to expand to asm("":::"memory") which does not actually implement > a memory barrier (i.e. MFENCE or similar opcodes). AIUI on x86_64, writes are guaranteed to be seen in order so the only thing that barrier has to protect against is the compiler reordering the writes. Tim. -- Tim Deegan <tim@xen.org> Principal Software Engineer, Xen Platform Team Citrix Systems UK Ltd. (Company #02937203, SL9 0BG) _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Jeremy Fitzhardinge
2011-Aug-25 18:36 UTC
Re: [Xen-devel] [PATCH] libvchan: interdomain communications library
On 08/25/2011 03:27 AM, Tim Deegan wrote: > At 14:52 -0400 on 24 Aug (1314197572), Daniel De Graaf wrote: >> Agreed, barriers are needed for a robust implementation. Since this is >> a shared library and not xen or kernel code, the existing barrier()/wmb() >> functions are not available. The only existing implementations in Xen >> appear to expand to asm("":::"memory") which does not actually implement >> a memory barrier (i.e. MFENCE or similar opcodes). > AIUI on x86_64, writes are guaranteed to be seen in order so the only > thing that barrier has to protect against is the compiler reordering the > writes. Yeah, x86 is pretty sane about that stuff. The main pitfall is that reads are not necessarily ordered WRT unlocked writes to different memory locations, which can bite when you're doing things like: shared->locked = 0; if (shared->need_wake) wake_other(); since "need_wake" can be read before the other side has observed the "locked = 0". You can fix it by putting mfence in there, but a sneakier fix is to make the read overlap the written location, so that the CPU will force ordering (and ignore the parts you read that you don't want) - this is much cheaper than an explicit fence. There are some off-brand x86 implementations which do have out of order store rules, but nobody has been silly enough to implement SMP systems with them, and I doubt Xen supports them anyway. Oh, and there's a PPro bug which can result in misordered stores, but I think we can overlook that pretty safely too. Also I think the various stores which can jump cache levels have weaker ordering rules, but that shouldn't matter here. J _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Jackson
2011-Aug-26 14:01 UTC
Re: [Xen-devel] [PATCH v2] libvchan: interdomain communications library
Daniel De Graaf writes ("[Xen-devel] [PATCH v2] libvchan: interdomain communications library"): > This library implements a bidirectional communication interface between > applications in different domains, similar to unix sockets. Data can be > sent using the byte-oriented libvchan_read/libvchan_write or the > packet-oriented libvchan_recv/libvchan_send. > > Channel setup is done using a client-server model; domain IDs and a port > number must be negotiated prior to initialization. The server allocates > memory for the shared pages and determines the sizes of the > communication rings (which may span multiple pages, although the default > places rings and control within a single page). > > With properly sized rings, testing has shown that this interface > provides speed comparable to pipes within a single Linux domain; it is > significantly faster than network-based communication. This doesn't build for me. Ian. make[3]: Entering directory `/u/iwj/work/xen-unstable-tools.hg/tools/libvchan' gcc -O1 -fno-omit-frame-pointer -m32 -march=i686 -g -fno-strict-aliasing -std=gnu99 -Wall -Wstrict-prototypes -Wno-unused-value -Wdeclaration-after-statement -D__XEN_TOOLS__ -MMD -MF .init.o.d -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -D_LARGEFILE64_SOURCE -fno-optimize-sibling-calls -mno-tls-direct-seg-refs -I../include -I. 
-fPIC -c -o init.o init.c init.c:41:16: warning: xs.h: No such file or directory In file included from init.c:44: ../include/xen/sys/gntalloc.h:22: error: expected specifier-qualifier-list before 'uint16_t' ../include/xen/sys/gntalloc.h:46: error: expected specifier-qualifier-list before 'uint64_t' ../include/xen/sys/gntalloc.h:70: error: expected specifier-qualifier-list before 'uint64_t' In file included from init.c:45: ../include/xen/sys/gntdev.h:38: error: expected specifier-qualifier-list before 'uint32_t' ../include/xen/sys/gntdev.h:53: error: expected specifier-qualifier-list before 'uint32_t' ../include/xen/sys/gntdev.h:73: error: expected specifier-qualifier-list before 'uint64_t' ../include/xen/sys/gntdev.h:96: error: expected specifier-qualifier-list before 'uint64_t' ../include/xen/sys/gntdev.h:116: error: expected specifier-qualifier-list before 'uint32_t' ../include/xen/sys/gntdev.h:138: error: expected specifier-qualifier-list before 'uint64_t' init.c: In function 'init_gnt_srv': init.c:76: error: 'struct ioctl_gntalloc_alloc_gref' has no member named 'domid' init.c:77: error: 'struct ioctl_gntalloc_alloc_gref' has no member named 'flags' init.c:78: error: 'struct ioctl_gntalloc_alloc_gref' has no member named 'count' init.c:84: error: 'struct ioctl_gntalloc_alloc_gref' has no member named 'index' init.c:90: error: 'struct ioctl_gntalloc_alloc_gref' has no member named 'gref_ids' init.c:105: error: 'struct ioctl_gntalloc_unmap_notify' has no member named 'index' init.c:105: error: 'struct ioctl_gntalloc_alloc_gref' has no member named 'index' init.c:106: error: 'struct ioctl_gntalloc_unmap_notify' has no member named 'action' init.c:107: error: 'struct ioctl_gntalloc_unmap_notify' has no member named 'event_channel_port' init.c:117: error: 'struct ioctl_gntalloc_alloc_gref' has no member named 'count' init.c:122: error: 'struct ioctl_gntalloc_alloc_gref' has no member named 'index' 
init.c:126: error: 'struct ioctl_gntalloc_alloc_gref' has no member named 'gref_ids' init.c:134: error: 'struct ioctl_gntalloc_alloc_gref' has no member named 'count' init.c:139: error: 'struct ioctl_gntalloc_alloc_gref' has no member named 'index' init.c:144: error: 'struct ioctl_gntalloc_alloc_gref' has no member named 'gref_ids' init.c: In function 'do_gnt_map': init.c:167: error: 'struct ioctl_gntdev_map_grant_ref' has no member named 'refs' init.c:168: error: 'struct ioctl_gntdev_map_grant_ref' has no member named 'count' init.c:170: error: 'struct ioctl_gntdev_map_grant_ref' has no member named 'refs' init.c:171: error: 'struct ioctl_gntdev_map_grant_ref' has no member named 'refs' init.c:178: error: 'struct ioctl_gntdev_map_grant_ref' has no member named 'index' init.c:179: error: 'struct ioctl_gntdev_map_grant_ref' has no member named 'index' init.c:182: error: unknown field 'index' specified in initializer init.c:182: error: 'struct ioctl_gntdev_map_grant_ref' has no member named 'index' init.c:182: warning: excess elements in struct initializer init.c:182: warning: (near initialization for 'undo') init.c:183: error: unknown field 'count' specified in initializer init.c:183: error: 'struct ioctl_gntdev_map_grant_ref' has no member named 'count' init.c:184: warning: excess elements in struct initializer init.c:184: warning: (near initialization for 'undo') init.c: In function 'init_gnt_cli': init.c:246: error: 'struct ioctl_gntdev_unmap_notify' has no member named 'index' init.c:247: error: 'struct ioctl_gntdev_unmap_notify' has no member named 'action' init.c:248: error: 'struct ioctl_gntdev_unmap_notify' has no member named 'event_channel_port' init.c: In function 'init_xs_srv': init.c:286: error: array type has incomplete element type init.c:290: warning: implicit declaration of function 'xs_domain_open' init.c:290: warning: assignment makes pointer from integer without a cast 
init.c:293: warning: implicit declaration of function 'xs_read' init.c:293: warning: assignment makes pointer from integer without a cast init.c:300: error: 'XS_PERM_NONE' undeclared (first use in this function) init.c:300: error: (Each undeclared identifier is reported only once init.c:300: error: for each function it appears in.) init.c:303: error: 'XS_PERM_READ' undeclared (first use in this function) init.c:307: warning: implicit declaration of function 'xs_write' init.c:309: warning: implicit declaration of function 'xs_set_permissions' init.c:322: warning: implicit declaration of function 'xs_daemon_close' init.c:286: warning: unused variable 'perms' init.c: In function 'libvchan_client_init': init.c:418: warning: implicit declaration of function 'xs_daemon_open' init.c:418: warning: assignment makes pointer from integer without a cast init.c:420: warning: assignment makes pointer from integer without a cast init.c:427: warning: assignment makes pointer from integer without a cast init.c:436: warning: assignment makes pointer from integer without a cast make[3]: *** [init.o] Error 1 make[3]: Leaving directory `/u/iwj/work/xen-unstable-tools.hg/tools/libvchan' make[2]: *** [subdir-install-libvchan] Error 2 make[2]: Leaving directory `/u/iwj/work/xen-unstable-tools.hg/tools' make[1]: *** [subdirs-install] Error 2 make[1]: Leaving directory `/u/iwj/work/xen-unstable-tools.hg/tools' make: *** [install-tools] Error 2 mariner:xen-unstable-tools.hg> hg status M tools/Makefile M tools/include/xen-sys/Linux/gntdev.h ? tools/include/xen-sys/Linux/gntalloc.h ? tools/libvchan/Makefile ? tools/libvchan/init.c ? tools/libvchan/io.c ? tools/libvchan/node-select.c ? tools/libvchan/node.c ? xen/include/headers.chk.new ? xen/include/public/io/libvchan.h mariner:xen-unstable-tools.hg> _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Aug-29 18:48 UTC
[Xen-devel] [PATCH v3] libvchan: interdomain communications library
This library implements a bidirectional communication interface between applications in different domains, similar to unix sockets. Data can be sent using the byte-oriented libvchan_read/libvchan_write or the packet-oriented libvchan_recv/libvchan_send. Channel setup is done using a client-server model; domain IDs and a port number must be negotiated prior to initialization. The server allocates memory for the shared pages and determines the sizes of the communication rings (which may span multiple pages, although the default places rings and control within a single page). With properly sized rings, testing has shown that this interface provides speed comparable to pipes within a single Linux domain; it is significantly faster than network-based communication. Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> --- Changes since v2: - Use asm() barriers instead of __sync_synchronize() - Fix build when xs.h has not yet been installed - Remove // coments in header to fix strict build - Add (non)blocking flag and change node1 example to use it --- tools/Makefile | 1 + tools/include/xen-sys/Linux/gntalloc.h | 82 ++++++ tools/include/xen-sys/Linux/gntdev.h | 33 +++- tools/libvchan/Makefile | 56 ++++ tools/libvchan/init.c | 464 ++++++++++++++++++++++++++++++++ tools/libvchan/io.c | 314 +++++++++++++++++++++ tools/libvchan/node-select.c | 161 +++++++++++ tools/libvchan/node.c | 168 ++++++++++++ xen/include/public/io/libvchan.h | 209 ++++++++++++++ 9 files changed, 1487 insertions(+), 1 deletions(-) create mode 100644 tools/include/xen-sys/Linux/gntalloc.h create mode 100644 tools/libvchan/Makefile create mode 100644 tools/libvchan/init.c create mode 100644 tools/libvchan/io.c create mode 100644 tools/libvchan/node-select.c create mode 100644 tools/libvchan/node.c create mode 100644 xen/include/public/io/libvchan.h diff --git a/tools/Makefile b/tools/Makefile index df6270c..9389e1f 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -27,6 +27,7 @@ 
SUBDIRS-$(CONFIG_NetBSD) += blktap2 SUBDIRS-$(CONFIG_NetBSD) += xenbackendd SUBDIRS-y += libfsimage SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen +SUBDIRS-y += libvchan # do not recurse in to a dir we are about to delete ifneq "$(MAKECMDGOALS)" "distclean" diff --git a/tools/include/xen-sys/Linux/gntalloc.h b/tools/include/xen-sys/Linux/gntalloc.h new file mode 100644 index 0000000..76bd580 --- /dev/null +++ b/tools/include/xen-sys/Linux/gntalloc.h @@ -0,0 +1,82 @@ +/****************************************************************************** + * gntalloc.h + * + * Interface to /dev/xen/gntalloc. + * + * Author: Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * This file is in the public domain. + */ + +#ifndef __LINUX_PUBLIC_GNTALLOC_H__ +#define __LINUX_PUBLIC_GNTALLOC_H__ + +/* + * Allocates a new page and creates a new grant reference. + */ +#define IOCTL_GNTALLOC_ALLOC_GREF \ +_IOC(_IOC_NONE, ''G'', 5, sizeof(struct ioctl_gntalloc_alloc_gref)) +struct ioctl_gntalloc_alloc_gref { + /* IN parameters */ + /* The ID of the domain to be given access to the grants. */ + uint16_t domid; + /* Flags for this mapping */ + uint16_t flags; + /* Number of pages to map */ + uint32_t count; + /* OUT parameters */ + /* The offset to be used on a subsequent call to mmap(). */ + uint64_t index; + /* The grant references of the newly created grant, one per page */ + /* Variable size, depending on count */ + uint32_t gref_ids[1]; +}; + +#define GNTALLOC_FLAG_WRITABLE 1 + +/* + * Deallocates the grant reference, allowing the associated page to be freed if + * no other domains are using it. 
+ */ +#define IOCTL_GNTALLOC_DEALLOC_GREF \ +_IOC(_IOC_NONE, ''G'', 6, sizeof(struct ioctl_gntalloc_dealloc_gref)) +struct ioctl_gntalloc_dealloc_gref { + /* IN parameters */ + /* The offset returned in the map operation */ + uint64_t index; + /* Number of references to unmap */ + uint32_t count; +}; + +/* + * Sets up an unmap notification within the page, so that the other side can do + * cleanup if this side crashes. Required to implement cross-domain robust + * mutexes or close notification on communication channels. + * + * Each mapped page only supports one notification; multiple calls referring to + * the same page overwrite the previous notification. You must clear the + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it + * to occur. + */ +#define IOCTL_GNTALLOC_SET_UNMAP_NOTIFY \ +_IOC(_IOC_NONE, ''G'', 7, sizeof(struct ioctl_gntalloc_unmap_notify)) +struct ioctl_gntalloc_unmap_notify { + /* IN parameters */ + /* Offset in the file descriptor for a byte within the page (same as + * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to + * be cleared. Otherwise, it can be any byte in the page whose + * notification we are adjusting. + */ + uint64_t index; + /* Action(s) to take on unmap */ + uint32_t action; + /* Event channel to notify */ + uint32_t event_channel_port; +}; + +/* Clear (set to zero) the byte specified by index */ +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 +/* Send an interrupt on the indicated event channel */ +#define UNMAP_NOTIFY_SEND_EVENT 0x2 + +#endif /* __LINUX_PUBLIC_GNTALLOC_H__ */ diff --git a/tools/include/xen-sys/Linux/gntdev.h b/tools/include/xen-sys/Linux/gntdev.h index 8bd1467..5304bd3 100644 --- a/tools/include/xen-sys/Linux/gntdev.h +++ b/tools/include/xen-sys/Linux/gntdev.h @@ -66,7 +66,7 @@ struct ioctl_gntdev_map_grant_ref { * before this ioctl is called, or an error will result. 
*/ #define IOCTL_GNTDEV_UNMAP_GRANT_REF \ -_IOC(_IOC_NONE, ''G'', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) +_IOC(_IOC_NONE, ''G'', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) struct ioctl_gntdev_unmap_grant_ref { /* IN parameters */ /* The offset was returned by the corresponding map operation. */ @@ -116,4 +116,35 @@ struct ioctl_gntdev_set_max_grants { uint32_t count; }; +/* + * Sets up an unmap notification within the page, so that the other side can do + * cleanup if this side crashes. Required to implement cross-domain robust + * mutexes or close notification on communication channels. + * + * Each mapped page only supports one notification; multiple calls referring to + * the same page overwrite the previous notification. You must clear the + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it + * to occur. + */ +#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \ +_IOC(_IOC_NONE, ''G'', 7, sizeof(struct ioctl_gntdev_unmap_notify)) +struct ioctl_gntdev_unmap_notify { + /* IN parameters */ + /* Offset in the file descriptor for a byte within the page (same as + * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to + * be cleared. Otherwise, it can be any byte in the page whose + * notification we are adjusting. + */ + uint64_t index; + /* Action(s) to take on unmap */ + uint32_t action; + /* Event channel to notify */ + uint32_t event_channel_port; +}; + +/* Clear (set to zero) the byte specified by index */ +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 +/* Send an interrupt on the indicated event channel */ +#define UNMAP_NOTIFY_SEND_EVENT 0x2 + #endif /* __LINUX_PUBLIC_GNTDEV_H__ */ diff --git a/tools/libvchan/Makefile b/tools/libvchan/Makefile new file mode 100644 index 0000000..6cccf3e --- /dev/null +++ b/tools/libvchan/Makefile @@ -0,0 +1,56 @@ +# +# tools/libvchan/Makefile +# + +XEN_ROOT = $(CURDIR)/../.. 
+include $(XEN_ROOT)/tools/Rules.mk + +LIBVCHAN_OBJS = init.o io.o +NODE_OBJS = node.o +NODE2_OBJS = node-select.o + +LIBVCHAN_LIBS = $(LDLIBS_libxenstore) +$(LIBVCHAN_OBJS): CFLAGS += $(CFLAGS_libxenstore) + +MAJOR = 1.0 +MINOR = 0 + +CFLAGS += -I../include -I. -fPIC + +.PHONY: all +all: libvchan.so vchan-node1 vchan-node2 libvchan.a + +libvchan.so: libvchan.so.$(MAJOR) + ln -sf $< $@ + +libvchan.so.$(MAJOR): libvchan.so.$(MAJOR).$(MINOR) + ln -sf $< $@ + +libvchan.so.$(MAJOR).$(MINOR): $(LIBVCHAN_OBJS) + $(CC) $(LDFLAGS) -Wl,$(SONAME_LDFLAG) -Wl,libvchan.so.$(MAJOR) $(SHLIB_LDFLAGS) -o $@ $^ $(LIBVCHAN_LIBS) + +libvchan.a: $(LIBVCHAN_OBJS) + $(AR) rcs libvchan.a $^ + +vchan-node1: $(NODE_OBJS) libvchan.so + $(CC) $(LDFLAGS) -o $@ $(NODE_OBJS) libvchan.so $(LDLIBS_libvchan) + +vchan-node2: $(NODE2_OBJS) libvchan.so + $(CC) $(LDFLAGS) -o $@ $(NODE2_OBJS) libvchan.so $(LDLIBS_libvchan) + +.PHONY: install +install: all + $(INSTALL_DIR) $(DESTDIR)$(LIBDIR) + $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR) + $(INSTALL_PROG) libvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR) + ln -sf libvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libvchan.so.$(MAJOR) + ln -sf libvchan.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libvchan.so + $(INSTALL_DATA) libvchan.a $(DESTDIR)$(LIBDIR) + +.PHONY: clean +clean: + $(RM) -f *.o *.so* *.a vchan-node1 vchan-node2 $(DEPS) + +distclean: clean + +-include $(DEPS) diff --git a/tools/libvchan/init.c b/tools/libvchan/init.c new file mode 100644 index 0000000..b267ca7 --- /dev/null +++ b/tools/libvchan/init.c @@ -0,0 +1,464 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 
of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * @section DESCRIPTION + * + * This file contains the setup code used to establish the ring buffer. + */ + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/user.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> + +#include <xs.h> +#include <xen/sys/evtchn.h> +#include <xen/sys/gntalloc.h> +#include <xen/sys/gntdev.h> +#include <xen/io/libvchan.h> + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT 12 +#endif + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +#define max(a,b) ((a > b) ? a : b) + +static int init_gnt_srv(struct libvchan *ctrl) +{ + int pages_left = ctrl->read.order >= PAGE_SHIFT ? 1 << (ctrl->read.order - PAGE_SHIFT) : 0; + int pages_right = ctrl->write.order >= PAGE_SHIFT ? 
1 << (ctrl->write.order - PAGE_SHIFT) : 0; + struct ioctl_gntalloc_alloc_gref *gref_info = NULL; + int ring_fd = open("/dev/xen/gntalloc", O_RDWR); + int ring_ref = -1; + int err; + void *ring, *area; + + if (ring_fd < 0) + return -1; + + gref_info = malloc(sizeof(*gref_info) + max(pages_left, pages_right)*sizeof(uint32_t)); + + gref_info->domid = ctrl->other_domain_id; + gref_info->flags = GNTALLOC_FLAG_WRITABLE; + gref_info->count = 1; + + err = ioctl(ring_fd, IOCTL_GNTALLOC_ALLOC_GREF, gref_info); + if (err) + goto out; + + ring = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, ring_fd, gref_info->index); + + if (ring == MAP_FAILED) + goto out; + + ctrl->ring = ring; + ring_ref = gref_info->gref_ids[0]; + + memset(ring, 0, PAGE_SIZE); + + ctrl->read.shr = &ctrl->ring->left; + ctrl->write.shr = &ctrl->ring->right; + ctrl->ring->left_order = ctrl->read.order; + ctrl->ring->right_order = ctrl->write.order; + ctrl->ring->cli_live = 2; + ctrl->ring->srv_live = 1; + ctrl->ring->debug = 0xabcd; + +#ifdef IOCTL_GNTALLOC_SET_UNMAP_NOTIFY + { + struct ioctl_gntalloc_unmap_notify arg; + arg.index = gref_info->index + offsetof(struct vchan_interface, srv_live); + arg.action = UNMAP_NOTIFY_CLEAR_BYTE | UNMAP_NOTIFY_SEND_EVENT; + arg.event_channel_port = ctrl->event_port; + ioctl(ring_fd, IOCTL_GNTALLOC_SET_UNMAP_NOTIFY, &arg); + } +#endif + + if (ctrl->read.order == 10) { + ctrl->read.buffer = ((void*)ctrl->ring) + 1024; + } else if (ctrl->read.order == 11) { + ctrl->read.buffer = ((void*)ctrl->ring) + 2048; + } else { + gref_info->count = pages_left; + err = ioctl(ring_fd, IOCTL_GNTALLOC_ALLOC_GREF, gref_info); + if (err) + goto out_ring; + area = mmap(NULL, pages_left * PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_SHARED, ring_fd, gref_info->index); + if (area == MAP_FAILED) + goto out_ring; + ctrl->read.buffer = area; + memcpy(ctrl->ring->grants, gref_info->gref_ids, pages_left * sizeof(uint32_t)); + } + + if (ctrl->write.order == 10) { + ctrl->write.buffer = 
((void*)ctrl->ring) + 1024; + } else if (ctrl->write.order == 11) { + ctrl->write.buffer = ((void*)ctrl->ring) + 2048; + } else { + gref_info->count = pages_right; + err = ioctl(ring_fd, IOCTL_GNTALLOC_ALLOC_GREF, gref_info); + if (err) + goto out_unmap_left; + area = mmap(NULL, pages_right * PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_SHARED, ring_fd, gref_info->index); + if (area == MAP_FAILED) + goto out_unmap_left; + ctrl->write.buffer = area; + memcpy(ctrl->ring->grants + (pages_left * sizeof(uint32_t)), + gref_info->gref_ids, pages_right * sizeof(uint32_t)); + } + +out: + close(ring_fd); + free(gref_info); + return ring_ref; +out_unmap_left: + if (ctrl->read.order > 11) + munmap(ctrl->read.buffer, pages_left * PAGE_SIZE); +out_ring: + munmap(ring, PAGE_SIZE); + ring_ref = -1; + ctrl->ring = NULL; + ctrl->write.order = ctrl->read.order = 0; + goto out; +} + +static void* do_gnt_map(int fd, int domid, uint32_t* pages, size_t npages, uint64_t *index) +{ + int i, rv; + void* area = NULL; + struct ioctl_gntdev_map_grant_ref *gref_info; + gref_info = malloc(sizeof(*gref_info) + npages*sizeof(gref_info->refs[0])); + gref_info->count = npages; + for(i=0; i < npages; i++) { + gref_info->refs[i].domid = domid; + gref_info->refs[i].ref = pages[i]; + } + + rv = ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, gref_info); + if (rv) + goto out; + if (index) + *index = gref_info->index; + area = mmap(NULL, PAGE_SIZE * npages, PROT_READ | PROT_WRITE, MAP_SHARED, fd, gref_info->index); + if (area == MAP_FAILED) { + struct ioctl_gntdev_unmap_grant_ref undo = { + .index = gref_info->index, + .count = gref_info->count + }; + ioctl(fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &undo); + area = NULL; + } + out: + free(gref_info); + return area; +} + +static int init_gnt_cli(struct libvchan *ctrl, uint32_t ring_ref) +{ + int ring_fd = open("/dev/xen/gntdev", O_RDWR); + int rv = -1; + uint64_t ring_index; + uint32_t *grants; + if (ring_fd < 0) + return -1; + + ctrl->ring = do_gnt_map(ring_fd, 
ctrl->other_domain_id, &ring_ref, 1, &ring_index); + + if (!ctrl->ring) + goto out; + + ctrl->write.order = ctrl->ring->left_order; + ctrl->read.order = ctrl->ring->right_order; + ctrl->write.shr = &ctrl->ring->left; + ctrl->read.shr = &ctrl->ring->right; + if (ctrl->write.order < 10 || ctrl->write.order > 24) + goto out_unmap_ring; + if (ctrl->read.order < 10 || ctrl->read.order > 24) + goto out_unmap_ring; + if (ctrl->read.order == ctrl->write.order && ctrl->read.order < 12) + goto out_unmap_ring; + + grants = ctrl->ring->grants; + + if (ctrl->write.order == 10) { + ctrl->write.buffer = ((void*)ctrl->ring) + 1024; + } else if (ctrl->write.order == 11) { + ctrl->write.buffer = ((void*)ctrl->ring) + 2048; + } else { + int pages_left = 1 << (ctrl->write.order - PAGE_SHIFT); + ctrl->write.buffer = do_gnt_map(ring_fd, ctrl->other_domain_id, grants, pages_left, NULL); + if (!ctrl->write.buffer) + goto out_unmap_ring; + grants += pages_left; + } + + if (ctrl->read.order == 10) { + ctrl->read.buffer = ((void*)ctrl->ring) + 1024; + } else if (ctrl->read.order == 11) { + ctrl->read.buffer = ((void*)ctrl->ring) + 2048; + } else { + int pages_right = 1 << (ctrl->read.order - PAGE_SHIFT); + ctrl->read.buffer = do_gnt_map(ring_fd, ctrl->other_domain_id, grants, pages_right, NULL); + if (!ctrl->read.buffer) + goto out_unmap_left; + } + +#ifdef IOCTL_GNTDEV_SET_UNMAP_NOTIFY + { + struct ioctl_gntdev_unmap_notify arg; + arg.index = ring_index + offsetof(struct vchan_interface, cli_live); + arg.action = UNMAP_NOTIFY_CLEAR_BYTE | UNMAP_NOTIFY_SEND_EVENT; + arg.event_channel_port = ctrl->event_port; + ioctl(ring_fd, IOCTL_GNTDEV_SET_UNMAP_NOTIFY, &arg); + } +#endif + + rv = 0; + out: + close(ring_fd); + return rv; + out_unmap_left: + if (ctrl->write.order >= PAGE_SHIFT) + munmap(ctrl->write.buffer, 1 << ctrl->write.order); + out_unmap_ring: + munmap(ctrl->ring, PAGE_SIZE); + ctrl->ring = 0; + ctrl->write.order = ctrl->read.order = 0; + rv = -1; + goto out; +} + +static int 
init_evt_srv(struct libvchan *ctrl) +{ + struct ioctl_evtchn_bind_unbound_port bind; + ctrl->event_fd = open("/dev/xen/evtchn", O_RDWR); + if (ctrl->event_fd < 0) + return -1; + bind.remote_domain = ctrl->other_domain_id; + ctrl->event_port = ioctl(ctrl->event_fd, IOCTL_EVTCHN_BIND_UNBOUND_PORT, &bind); + if (ctrl->event_port < 0) + return -1; + write(ctrl->event_fd, &ctrl->event_port, sizeof(ctrl->event_port)); + return 0; +} + +static int init_xs_srv(struct libvchan *ctrl, int ring_ref) +{ + int ret = -1; + struct xs_handle *xs; + struct xs_permissions perms[2]; + char buf[64]; + char ref[16]; + char* domid_str = NULL; + xs = xs_domain_open(); + if (!xs) + goto fail; + domid_str = xs_read(xs, 0, "domid", NULL); + if (!domid_str) + goto fail_xs_open; + + // owner domain is us + perms[0].id = atoi(domid_str); + // permissions for domains not listed = none + perms[0].perms = XS_PERM_NONE; + // other domains + perms[1].id = ctrl->other_domain_id; + perms[1].perms = XS_PERM_READ; + + snprintf(ref, sizeof ref, "%d", ring_ref); + snprintf(buf, sizeof buf, "data/vchan/%d/ring-ref", ctrl->device_number); + if (!xs_write(xs, 0, buf, ref, strlen(ref))) + goto fail_xs_open; + if (!xs_set_permissions(xs, 0, buf, perms, 2)) + goto fail_xs_open; + + snprintf(ref, sizeof ref, "%d", ctrl->event_port); + snprintf(buf, sizeof buf, "data/vchan/%d/event-channel", ctrl->device_number); + if (!xs_write(xs, 0, buf, ref, strlen(ref))) + goto fail_xs_open; + if (!xs_set_permissions(xs, 0, buf, perms, 2)) + goto fail_xs_open; + + ret = 0; + fail_xs_open: + free(domid_str); + xs_daemon_close(xs); + fail: + return ret; +} + +static int min_order(size_t siz) +{ + int rv = PAGE_SHIFT; + while (siz > (1 << rv)) + rv++; + return rv; +} + +struct libvchan *libvchan_server_init(int domain, int devno, size_t left_min, size_t right_min) +{ + // if you go over this size, you''ll have too many grants to fit in the shared page. 
+ size_t MAX_RING_SIZE = 256 * PAGE_SIZE; + struct libvchan *ctrl; + int ring_ref; + if (left_min > MAX_RING_SIZE || right_min > MAX_RING_SIZE) + return 0; + + ctrl = malloc(sizeof(*ctrl)); + if (!ctrl) + return 0; + + ctrl->other_domain_id = domain; + ctrl->device_number = devno; + ctrl->ring = NULL; + ctrl->event_fd = -1; + ctrl->is_server = 1; + ctrl->server_persist = 0; + + ctrl->read.order = min_order(left_min); + ctrl->write.order = min_order(right_min); + + // if we can avoid allocating extra pages by using in-page rings, do so +#define MAX_SMALL_RING 1024 +#define MAX_LARGE_RING 2048 + if (left_min <= MAX_SMALL_RING && right_min <= MAX_LARGE_RING) { + ctrl->read.order = 10; + ctrl->write.order = 11; + } else if (left_min <= MAX_LARGE_RING && right_min <= MAX_SMALL_RING) { + ctrl->read.order = 11; + ctrl->write.order = 10; + } else if (left_min <= MAX_LARGE_RING) { + ctrl->read.order = 11; + } else if (right_min <= MAX_LARGE_RING) { + ctrl->write.order = 11; + } + if (init_evt_srv(ctrl)) + goto out; + ring_ref = init_gnt_srv(ctrl); + if (ring_ref < 0) + goto out; + if (init_xs_srv(ctrl, ring_ref)) + goto out; + return ctrl; +out: + libvchan_close(ctrl); + return 0; +} + +static int init_evt_cli(struct libvchan *ctrl) +{ + struct ioctl_evtchn_bind_interdomain bind; + ctrl->event_fd = open("/dev/xen/evtchn", O_RDWR); + if (ctrl->event_fd < 0) + return -1; + + bind.remote_domain = ctrl->other_domain_id; + bind.remote_port = ctrl->event_port; + ctrl->event_port = ioctl(ctrl->event_fd, IOCTL_EVTCHN_BIND_INTERDOMAIN, &bind); + if (ctrl->event_port < 0) + return -1; + return 0; +} + + +struct libvchan *libvchan_client_init(int domain, int devno) +{ + struct libvchan *ctrl = malloc(sizeof(struct libvchan)); + struct xs_handle *xs = NULL; + char buf[64]; + char *ref; + int ring_ref; + unsigned int len; + if (!ctrl) + return 0; + ctrl->other_domain_id = domain; + ctrl->device_number = devno; + ctrl->ring = NULL; + ctrl->event_fd = -1; + ctrl->write.order = 
ctrl->read.order = 0; + ctrl->is_server = 0; + + xs = xs_daemon_open(); + if (!xs) + xs = xs_domain_open(); + if (!xs) + goto fail; + +// find xenstore entry + snprintf(buf, sizeof buf, "/local/domain/%d/data/vchan/%d/ring-ref", + ctrl->other_domain_id, ctrl->device_number); + ref = xs_read(xs, 0, buf, &len); + if (!ref) + goto fail; + ring_ref = atoi(ref); + free(ref); + if (!ring_ref) + goto fail; + snprintf(buf, sizeof buf, "/local/domain/%d/data/vchan/%d/event-channel", + ctrl->other_domain_id, ctrl->device_number); + ref = xs_read(xs, 0, buf, &len); + if (!ref) + goto fail; + ctrl->event_port = atoi(ref); + free(ref); + if (!ctrl->event_port) + goto fail; + +// set up event channel + if (init_evt_cli(ctrl)) + goto fail; + +// set up shared page(s) + if (init_gnt_cli(ctrl, ring_ref)) + goto fail; + + ctrl->ring->cli_live = 1; + ctrl->ring->debug = 0xabce; + + out: + if (xs) + xs_daemon_close(xs); + return ctrl; + fail: + libvchan_close(ctrl); + ctrl = NULL; + goto out; +} diff --git a/tools/libvchan/io.c b/tools/libvchan/io.c new file mode 100644 index 0000000..584b169 --- /dev/null +++ b/tools/libvchan/io.c @@ -0,0 +1,314 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * @section DESCRIPTION + * + * This file contains the communications interface built on the ring buffer. + */ + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/uio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <unistd.h> + +#include <xenctrl.h> +#include <xen/io/libvchan.h> + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT 12 +#endif + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +// allow vchan data to be easily observed in strace by doing a +// writev() to FD -1 with the data being read/written. +#ifndef VCHAN_DEBUG +#define VCHAN_DEBUG 0 +#endif + +#define barrier() asm volatile("" ::: "memory") + +static uint32_t rd_prod(struct libvchan *ctrl) +{ + return ctrl->read.shr->prod; +} + +static uint32_t* _rd_cons(struct libvchan *ctrl) +{ + return &ctrl->read.shr->cons; +} +#define rd_cons(x) (*_rd_cons(x)) + +static uint32_t* _wr_prod(struct libvchan *ctrl) +{ + return &ctrl->write.shr->prod; +} +#define wr_prod(x) (*_wr_prod(x)) + +static uint32_t wr_cons(struct libvchan *ctrl) +{ + return ctrl->write.shr->cons; +} + +static const void* rd_ring(struct libvchan *ctrl) +{ + return ctrl->read.buffer; +} + +static void* wr_ring(struct libvchan *ctrl) +{ + return ctrl->write.buffer; +} + +static uint32_t wr_ring_size(struct libvchan *ctrl) +{ + return (1 << ctrl->write.order); +} + +static uint32_t rd_ring_size(struct libvchan *ctrl) +{ + return (1 << ctrl->read.order); +} + +int libvchan_data_ready(struct libvchan *ctrl) +{ + return rd_prod(ctrl) - rd_cons(ctrl); +} + +int libvchan_buffer_space(struct libvchan *ctrl) +{ + return wr_ring_size(ctrl) - (wr_prod(ctrl) - wr_cons(ctrl)); +} + +static int do_notify(struct libvchan *ctrl) +{ + struct ioctl_evtchn_notify notify; + 
notify.port = ctrl->event_port; + return ioctl(ctrl->event_fd, IOCTL_EVTCHN_NOTIFY, &notify); +} + +int libvchan_wait(struct libvchan *ctrl) +{ + int ret; + uint32_t dummy; + ret = read(ctrl->event_fd, &dummy, sizeof(dummy)); + if (ret == -1) + return -1; + write(ctrl->event_fd, &dummy, sizeof(dummy)); + return 0; +} + +/** + * returns -1 on error, or size on success + */ +static int do_send(struct libvchan *ctrl, const void *data, size_t size) +{ + int real_idx = wr_prod(ctrl) & (wr_ring_size(ctrl) - 1); + int avail_contig = wr_ring_size(ctrl) - real_idx; + if (VCHAN_DEBUG) { + char metainfo[32]; + struct iovec iov[2]; + iov[0].iov_base = metainfo; + iov[0].iov_len = snprintf(metainfo, 32, "vchan wr %d/%d", ctrl->other_domain_id, ctrl->device_number); + iov[1].iov_base = (void *)data; + iov[1].iov_len = size; + writev(-1, iov, 2); + } + if (avail_contig > size) + avail_contig = size; + memcpy(wr_ring(ctrl) + real_idx, data, avail_contig); + if (avail_contig < size) + { + // we rolled across the end of the ring + memcpy(wr_ring(ctrl), data + avail_contig, size - avail_contig); + } + barrier(); // data must be in the ring prior to increment + wr_prod(ctrl) += size; + barrier(); // increment must happen prior to notify + if (do_notify(ctrl) < 0) + return -1; + return size; +} + +/** + * returns 0 if no buffer space is available, -1 on error, or size on success + */ +int libvchan_send(struct libvchan *ctrl, const void *data, size_t size) +{ + int avail; + while (1) { + if (!libvchan_is_open(ctrl)) + return -1; + avail = libvchan_buffer_space(ctrl); + if (size <= avail) + return do_send(ctrl, data, size); + if (!ctrl->blocking) + return 0; + if (size > wr_ring_size(ctrl)) + return -1; + if (libvchan_wait(ctrl)) + return -1; + } +} + +int libvchan_write(struct libvchan *ctrl, const void *data, size_t size) +{ + int avail; + if (!libvchan_is_open(ctrl)) + return -1; + if (ctrl->blocking) { + size_t pos = 0; + while (1) { + avail = libvchan_buffer_space(ctrl); + if (pos + 
avail > size) + avail = size - pos; + if (avail) + pos += do_send(ctrl, data + pos, avail); + if (pos == size) + return pos; + if (libvchan_wait(ctrl)) + return -1; + if (!libvchan_is_open(ctrl)) + return -1; + } + } else { + avail = libvchan_buffer_space(ctrl); + if (size > avail) + size = avail; + if (size == 0) + return 0; + return do_send(ctrl, data, size); + } +} + +static int do_recv(struct libvchan *ctrl, void *data, size_t size) +{ + int real_idx = rd_cons(ctrl) & (rd_ring_size(ctrl) - 1); + int avail_contig = rd_ring_size(ctrl) - real_idx; + if (avail_contig > size) + avail_contig = size; + barrier(); // data read must happen after rd_cons read + memcpy(data, rd_ring(ctrl) + real_idx, avail_contig); + if (avail_contig < size) + { + // we rolled across the end of the ring + memcpy(data + avail_contig, rd_ring(ctrl), size - avail_contig); + } + rd_cons(ctrl) += size; + if (VCHAN_DEBUG) { + char metainfo[32]; + struct iovec iov[2]; + iov[0].iov_base = metainfo; + iov[0].iov_len = snprintf(metainfo, 32, "vchan rd %d/%d", ctrl->other_domain_id, ctrl->device_number); + iov[1].iov_base = data; + iov[1].iov_len = size; + writev(-1, iov, 2); + } + barrier(); // consumption must happen prior to notify of newly freed space + if (do_notify(ctrl) < 0) + return -1; + return size; +} + +/** + * reads exactly size bytes from the vchan. 
+ * returns 0 if insufficient data is available, -1 on error, or size on success + */ +int libvchan_recv(struct libvchan *ctrl, void *data, size_t size) +{ + while (1) { + int avail = libvchan_data_ready(ctrl); + if (size <= avail) + return do_recv(ctrl, data, size); + if (!libvchan_is_open(ctrl)) + return -1; + if (!ctrl->blocking) + return 0; + if (size > rd_ring_size(ctrl)) + return -1; + if (libvchan_wait(ctrl)) + return -1; + } +} + +int libvchan_read(struct libvchan *ctrl, void *data, size_t size) +{ + while (1) { + int avail = libvchan_data_ready(ctrl); + if (avail && size > avail) + size = avail; + if (avail) + return do_recv(ctrl, data, size); + if (!libvchan_is_open(ctrl)) + return -1; + if (!ctrl->blocking) + return 0; + if (libvchan_wait(ctrl)) + return -1; + } +} + +int libvchan_is_open(struct libvchan* ctrl) +{ + if (ctrl->is_server) + return ctrl->server_persist || ctrl->ring->cli_live; + else + return ctrl->ring->srv_live; +} + +/// The fd to use for select() set +int libvchan_fd_for_select(struct libvchan *ctrl) +{ + return ctrl->event_fd; +} + +void libvchan_close(struct libvchan *ctrl) +{ + if (!ctrl) + return; + if (ctrl->ring) { + if (ctrl->is_server) + ctrl->ring->srv_live = 0; + else + ctrl->ring->cli_live = 0; + munmap(ctrl->ring, PAGE_SIZE); + } + if (ctrl->event_fd != -1) { + if (ctrl->event_port > 0 && ctrl->ring) + do_notify(ctrl); + close(ctrl->event_fd); + } + if (ctrl->read.order >= PAGE_SHIFT) + munmap(ctrl->read.buffer, 1 << ctrl->read.order); + if (ctrl->write.order >= PAGE_SHIFT) + munmap(ctrl->write.buffer, 1 << ctrl->write.order); + free(ctrl); +} diff --git a/tools/libvchan/node-select.c b/tools/libvchan/node-select.c new file mode 100644 index 0000000..0000bc8 --- /dev/null +++ b/tools/libvchan/node-select.c @@ -0,0 +1,161 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf 
<dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * @section DESCRIPTION + * + * This is a test program for libvchan. Communications are bidirectional, + * with either server (grant offeror) or client able to read and write. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> + +#include <xen/io/libvchan.h> + +void usage(char** argv) +{ + fprintf(stderr, "usage:\n" + "\t%s [client|server] domainid nodeid [rbufsiz wbufsiz]\n", + argv[0]); + exit(1); +} + +#define BUFSIZE 5000 +char inbuf[BUFSIZE]; +char outbuf[BUFSIZE]; +int insiz = 0; +int outsiz = 0; +struct libvchan *ctrl = 0; + +void vchan_wr() { + if (!insiz) + return; + int ret = libvchan_write(ctrl, inbuf, insiz); + if (ret < 0) { + fprintf(stderr, "vchan write failed\n"); + exit(1); + } + if (ret > 0) { + insiz -= ret; + memmove(inbuf, inbuf + ret, insiz); + } +} + +void stdout_wr() { + if (!outsiz) + return; + int ret = write(1, outbuf, outsiz); + if (ret < 0 && errno != EAGAIN) + exit(1); + if (ret > 0) { + outsiz -= ret; + memmove(outbuf, outbuf + ret, outsiz); + } +} + +/** + Simple libvchan application, both client and server. + Both sides may write and read, both from the libvchan and from + stdin/stdout (just like netcat). 
+*/ + +int main(int argc, char **argv) +{ + int ret; + int libvchan_fd; + if (argc < 4) + usage(argv); + if (!strcmp(argv[1], "server")) { + int rsiz = argc > 4 ? atoi(argv[4]) : 0; + int wsiz = argc > 5 ? atoi(argv[5]) : 0; + ctrl = libvchan_server_init(atoi(argv[2]), atoi(argv[3]), rsiz, wsiz); + } else if (!strcmp(argv[1], "client")) + ctrl = libvchan_client_init(atoi(argv[2]), atoi(argv[3])); + else + usage(argv); + if (!ctrl) { + perror("libvchan_*_init"); + exit(1); + } + + fcntl(0, F_SETFL, O_NONBLOCK); + fcntl(1, F_SETFL, O_NONBLOCK); + + libvchan_fd = libvchan_fd_for_select(ctrl); + for (;;) { + fd_set rfds; + fd_set wfds; + FD_ZERO(&rfds); + FD_ZERO(&wfds); + if (insiz != BUFSIZE) + FD_SET(0, &rfds); + if (outsiz) + FD_SET(1, &wfds); + FD_SET(libvchan_fd, &rfds); + ret = select(libvchan_fd + 1, &rfds, &wfds, NULL, NULL); + if (ret < 0) { + perror("select"); + exit(1); + } + if (FD_ISSET(0, &rfds)) { + ret = read(0, inbuf + insiz, BUFSIZE - insiz); + if (ret < 0 && errno != EAGAIN) + exit(1); + if (ret == 0) { + while (insiz) { + vchan_wr(); + libvchan_wait(ctrl); + } + return 0; + } + if (ret) + insiz += ret; + vchan_wr(); + } + if (FD_ISSET(libvchan_fd, &rfds)) { + libvchan_wait(ctrl); + vchan_wr(); + } + if (FD_ISSET(1, &wfds)) + stdout_wr(); + while (libvchan_data_ready(ctrl) && outsiz < BUFSIZE) { + ret = libvchan_read(ctrl, outbuf + outsiz, BUFSIZE - outsiz); + if (ret < 0) + exit(1); + outsiz += ret; + stdout_wr(); + } + if (!libvchan_is_open(ctrl)) { + fcntl(1, F_SETFL, 0); + while (outsiz) + stdout_wr(); + return 0; + } + } +} diff --git a/tools/libvchan/node.c b/tools/libvchan/node.c new file mode 100644 index 0000000..219a6ef --- /dev/null +++ b/tools/libvchan/node.c @@ -0,0 +1,168 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program 
is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * @section DESCRIPTION + * + * This is a test program for libvchan. Communications are in one direction, + * either server (grant offeror) to client or vice versa. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <time.h> + +#include <xen/io/libvchan.h> + +int libvchan_write_all(struct libvchan *ctrl, char *buf, int size) +{ + int written = 0; + int ret; + while (written < size) { + ret = libvchan_write(ctrl, buf + written, size - written); + if (ret <= 0) { + perror("write"); + exit(1); + } + written += ret; + } + return size; +} + +int write_all(int fd, char *buf, int size) +{ + int written = 0; + int ret; + while (written < size) { + ret = write(fd, buf + written, size - written); + if (ret <= 0) { + perror("write"); + exit(1); + } + written += ret; + } + return size; +} + +void usage(char** argv) +{ + fprintf(stderr, "usage:\n" + "%s [client|server] [read|write] domid nodeid\n", argv[0]); + exit(1); +} + +#define BUFSIZE 5000 +char buf[BUFSIZE]; +void reader(struct libvchan *ctrl) +{ + int size; + for (;;) { + size = rand() % (BUFSIZE - 1) + 1; + size = libvchan_read(ctrl, buf, size); + fprintf(stderr, "#"); + if (size < 0) { + perror("read vchan"); + libvchan_close(ctrl); + exit(1); + } + size = write_all(1, buf, size); + if (size < 0) { + 
perror("stdout write"); + exit(1); + } + if (size == 0) { + perror("write size=0?\n"); + exit(1); + } + } +} + +void writer(struct libvchan *ctrl) +{ + int size; + for (;;) { + size = rand() % (BUFSIZE - 1) + 1; + size = read(0, buf, size); + if (size < 0) { + perror("read stdin"); + libvchan_close(ctrl); + exit(1); + } + if (size == 0) + break; + size = libvchan_write_all(ctrl, buf, size); + fprintf(stderr, "#"); + if (size < 0) { + perror("vchan write"); + exit(1); + } + if (size == 0) { + perror("write size=0?\n"); + exit(1); + } + } +} + + +/** + Simple libvchan application, both client and server. + One side does writing, the other side does reading; both from + standard input/output fds. +*/ +int main(int argc, char **argv) +{ + int seed = time(0); + struct libvchan *ctrl = 0; + int wr = 0; + if (argc < 4) + usage(argv); + if (!strcmp(argv[2], "read")) + wr = 0; + else if (!strcmp(argv[2], "write")) + wr = 1; + else + usage(argv); + if (!strcmp(argv[1], "server")) + ctrl = libvchan_server_init(atoi(argv[3]), atoi(argv[4]), 0, 0); + else if (!strcmp(argv[1], "client")) + ctrl = libvchan_client_init(atoi(argv[3]), atoi(argv[4])); + else + usage(argv); + if (!ctrl) { + perror("libvchan_*_init"); + exit(1); + } + ctrl->blocking = 1; + + srand(seed); + fprintf(stderr, "seed=%d\n", seed); + if (wr) + writer(ctrl); + else + reader(ctrl); + libvchan_close(ctrl); + return 0; +} diff --git a/xen/include/public/io/libvchan.h b/xen/include/public/io/libvchan.h new file mode 100644 index 0000000..81db0e2 --- /dev/null +++ b/xen/include/public/io/libvchan.h @@ -0,0 +1,209 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the 
Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * + * @section DESCRIPTION + * + * Originally borrowed from the Qubes OS Project, http://www.qubes-os.org, + * this code has been substantially rewritten to use the gntdev and gntalloc + * devices instead of raw MFNs and map_foreign_range. + * + * This is a library for inter-domain communication. A standard Xen ring + * buffer is used, with a datagram-based interface built on top. The grant + * reference and event channels are shared in XenStore under the path + * /local/domain/<domid>/data/vchan/<port>/{ring-ref,event-channel} + * + * The ring.h macros define an asymmetric interface to a shared data structure + * that assumes all rings reside in a single contiguous memory space. This is + * not suitable for vchan because the interface to the ring is symmetric except + * for the setup. Unlike the producer-consumer rings defined in ring.h, the + * size of the rings used in vchan are determined at execution time instead of + * compile time, so the macros in ring.h cannot be used to access the rings. 
+ */ + +#include <stdint.h> +#include <sys/types.h> +#include <xen/sys/evtchn.h> + +struct ring_shared { + uint32_t cons, prod; +}; + +/** + * vchan_interface: primary shared data structure + */ +struct vchan_interface { + /** + * Standard consumer/producer interface, one pair per buffer + * left is client write, server read + * right is client read, server write + */ + struct ring_shared left, right; + /** + * size of the rings, which determines their location + * 10 - at offset 1024 in ring''s page + * 11 - at offset 2048 in ring''s page + * 12+ - uses 2^(N-12) grants to describe the multi-page ring + * These should remain constant once the page is shared. + * Only one of the two orders can be 10 (or 11). + */ + uint16_t left_order, right_order; + /** + * Shutdown detection: + * 0: client (or server) has exited + * 1: client (or server) is connected + * 2: client has not yet connected + */ + uint8_t cli_live, srv_live; + /** + * structure padding; magic values depending on setup stage + */ + uint16_t debug; + /** + * Grant list: ordering is left, right. Must not extend into actual ring + * or grow beyond the end of the initial shared page. + * These should remain constant once the page is shared, to allow + * for possible remapping by a client that restarts. + */ + uint32_t grants[0]; +}; + +struct libvchan_ring { + /* Pointer into the shared page. Offsets into buffer. */ + struct ring_shared* shr; + /* ring data; may be its own shared page(s) depending on order */ + void* buffer; + /** + * The size of the ring is (1 << order); offsets wrap around when they + * exceed this. This copy is required because we can''t trust the order + * in the shared page to remain constant. 
+ */ + int order; +}; + +/** + * struct libvchan: control structure passed to all library calls + */ +struct libvchan { + /* person we communicate with */ + int other_domain_id; + /* "port" we communicate on (allows multiple vchans to exist in xenstore) */ + int device_number; + /* Shared ring page, mapped using gntdev or gntalloc */ + /* Note that the FD for gntdev or gntalloc has already been closed. */ + struct vchan_interface *ring; + /* event channel interface (needs port for API) */ + int event_fd; + uint32_t event_port; + /* informative flags: are we acting as server? */ + int is_server:1; + /* true if server remains active when client closes (allows reconnection) */ + int server_persist:1; + /* true if operations should block instead of returning 0 */ + int blocking:1; + /* communication rings */ + struct libvchan_ring read, write; +}; + +/** + * Set up a vchan, including granting pages + * @param domain The peer domain that will be connecting + * @param devno A device number, used to identify this vchan in xenstore + * @param send_min The minimum size (in bytes) of the send ring (left) + * @param recv_min The minimum size (in bytes) of the receive ring (right) + * @return The structure, or NULL in case of an error + */ +struct libvchan *libvchan_server_init(int domain, int devno, size_t read_min, size_t write_min); +/** + * Connect to an existing vchan. Note: you can reconnect to an existing vchan + * safely, however no locking is performed, so you must prevent multiple clients + * from connecting to a single server. + * + * @param domain The peer domain to connect to + * @param devno A device number, used to identify this vchan in xenstore + * @return The structure, or NULL in case of an error + */ +struct libvchan *libvchan_client_init(int domain, int devno); +/** + * Close a vchan. This deallocates the vchan and attempts to free its + * resources. The other side is notified of the close, but can still read any + * data pending prior to the close. 
+ */ +void libvchan_close(struct libvchan *ctrl); + +/** + * Packet-based receive: always reads exactly $size bytes. + * @param ctrl The vchan control structure + * @param data Buffer for data that was read + * @param size Size of the buffer and amount of data to read + * @return -1 on error, 0 if nonblocking and insufficient data is available, or $size + */ +int libvchan_recv(struct libvchan *ctrl, void *data, size_t size); +/** + * Stream-based receive: reads as much data as possible. + * @param ctrl The vchan control structure + * @param data Buffer for data that was read + * @param size Size of the buffer + * @return -1 on error, otherwise the amount of data read (which may be zero if + * the vchan is nonblocking) + */ +int libvchan_read(struct libvchan *ctrl, void *data, size_t size); +/** + * Packet-based send: send entire buffer if possible + * @param ctrl The vchan control structure + * @param data Buffer for data to send + * @param size Size of the buffer and amount of data to send + * @return -1 on error, 0 if nonblocking and insufficient space is available, or $size + */ +int libvchan_send(struct libvchan *ctrl, const void *data, size_t size); +/** + * Stream-based send: send as much data as possible. + * @param ctrl The vchan control structure + * @param data Buffer for data to send + * @param size Size of the buffer + * @return -1 on error, otherwise the amount of data sent (which may be zero if + * the vchan is nonblocking) + */ +int libvchan_write(struct libvchan *ctrl, const void *data, size_t size); +/** + * Waits for reads or writes to unblock, or for a close + */ +int libvchan_wait(struct libvchan *ctrl); +/** + * Returns the event file descriptor for this vchan. When this FD is readable, + * libvchan_wait() will not block, and the state of the vchan has changed since + * the last invocation of libvchan_wait(). 
+ */ +int libvchan_fd_for_select(struct libvchan *ctrl); +/** + * Query the state of the vchan shared page: + * return 0 when one side has called libvchan_close() or crashed + * return 1 when both sides are open + * return 2 [server only] when no client has yet connected + */ +int libvchan_is_open(struct libvchan* ctrl); +/** Amount of data ready to read, in bytes */ +int libvchan_data_ready(struct libvchan *ctrl); +/** Amount of data it is possible to send without blocking */ +int libvchan_buffer_space(struct libvchan *ctrl); -- 1.7.6 _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Campbell
2011-Aug-30 10:32 UTC
[Xen-devel] Re: [PATCH v3] libvchan: interdomain communications library
On Mon, 2011-08-29 at 19:48 +0100, Daniel De Graaf wrote:> diff --git a/tools/libvchan/Makefile b/tools/libvchan/Makefile > new file mode 100644 > index 0000000..6cccf3e > --- /dev/null > +++ b/tools/libvchan/Makefile > @@ -0,0 +1,56 @@ > +# > +# tools/libvchan/Makefile > +# > + > +XEN_ROOT = $(CURDIR)/../.. > +include $(XEN_ROOT)/tools/Rules.mk > + > +LIBVCHAN_OBJS = init.o io.o > +NODE_OBJS = node.o > +NODE2_OBJS = node-select.o > + > +LIBVCHAN_LIBS = $(LDLIBS_libxenstore) > +$(LIBVCHAN_OBJS): CFLAGS += $(CFLAGS_libxenstore) > + > +MAJOR = 1.0 > +MINOR = 0 > + > +CFLAGS += -I../include -I. -fPICCan you use foo.opic in your $(*_OBJS) instead of -fPIC? I think that''s how this is intended to work.> + > +.PHONY: all > +all: libvchan.so vchan-node1 vchan-node2 libvchan.a > + > +libvchan.so: libvchan.so.$(MAJOR) > + ln -sf $< $@ > + > +libvchan.so.$(MAJOR): libvchan.so.$(MAJOR).$(MINOR) > + ln -sf $< $@ > + > +libvchan.so.$(MAJOR).$(MINOR): $(LIBVCHAN_OBJS) > + $(CC) $(LDFLAGS) -Wl,$(SONAME_LDFLAG) -Wl,libvchan.so.$(MAJOR) $(SHLIB_LDFLAGS) -o $@ $^ $(LIBVCHAN_LIBS) > + > +libvchan.a: $(LIBVCHAN_OBJS) > + $(AR) rcs libvchan.a $^ > + > +vchan-node1: $(NODE_OBJS) libvchan.so > + $(CC) $(LDFLAGS) -o $@ $(NODE_OBJS) libvchan.so $(LDLIBS_libvchan) > + > +vchan-node2: $(NODE2_OBJS) libvchan.so > + $(CC) $(LDFLAGS) -o $@ $(NODE2_OBJS) libvchan.so $(LDLIBS_libvchan) > + > +.PHONY: install > +install: all > + $(INSTALL_DIR) $(DESTDIR)$(LIBDIR) > + $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR) > + $(INSTALL_PROG) libvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)Perhaps the library should be libxenvchan since it is going in /usr/lib and vchan is a bit generic?> diff --git a/tools/libvchan/init.c b/tools/libvchan/init.c > new file mode 100644 > index 0000000..b267ca7 > --- /dev/null > +++ b/tools/libvchan/init.c > @@ -0,0 +1,464 @@ > +/** > + * @file > + * @section AUTHORS > + * > + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> > + * > + * Authors: > + * Rafal 
Wojtczuk <rafal@invisiblethingslab.com> > + * Daniel De Graaf <dgdegra@tycho.nsa.gov> > + * > + * @section LICENSE > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; under version 2 of the License. I don't have a problem with GPL rather than LGPL myself, but I suppose we should consider whether it meets the needs of the potential users now, while the number of people who have touched the library is small enough that we can ask them if they are happy to relicense. [...] > +static int init_gnt_srv(struct libvchan *ctrl) > +{ > + int pages_left = ctrl->read.order >= PAGE_SHIFT ? 1 << (ctrl->read.order - PAGE_SHIFT) : 0; > + int pages_right = ctrl->write.order >= PAGE_SHIFT ? 1 << (ctrl->write.order - PAGE_SHIFT) : 0; > + struct ioctl_gntalloc_alloc_gref *gref_info = NULL; > + int ring_fd = open("/dev/xen/gntalloc", O_RDWR); > + int ring_ref = -1; > + int err; > + void *ring, *area; > + > + if (ring_fd < 0) > + return -1; > + > + gref_info = malloc(sizeof(*gref_info) + max(pages_left, pages_right)*sizeof(uint32_t)); > + > + gref_info->domid = ctrl->other_domain_id; > + gref_info->flags = GNTALLOC_FLAG_WRITABLE; > + gref_info->count = 1; > + > + err = ioctl(ring_fd, IOCTL_GNTALLOC_ALLOC_GREF, gref_info); Unless libvchan is going to be the only user of this interface we should add helpful wrappers to libxc, like we do for gntdev and evtchn. > + if (err) > + goto out; > + > + ring = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, ring_fd, gref_info->index); > + > + if (ring == MAP_FAILED) > + goto out; > + > + ctrl->ring = ring; > + ring_ref = gref_info->gref_ids[0]; > + > + memset(ring, 0, PAGE_SIZE); > + > + ctrl->read.shr = &ctrl->ring->left; > + ctrl->write.shr = &ctrl->ring->right; > + ctrl->ring->left_order = ctrl->read.order; > + ctrl->ring->right_order = ctrl->write.order; > + ctrl->ring->cli_live = 2; > + 
ctrl->ring->srv_live = 1; > + ctrl->ring->debug = 0xabcd; Makes a change from deadbeef I guess ;-) > +#ifdef IOCTL_GNTALLOC_SET_UNMAP_NOTIFY > + { > + struct ioctl_gntalloc_unmap_notify arg; > + arg.index = gref_info->index + offsetof(struct vchan_interface, srv_live); > + arg.action = UNMAP_NOTIFY_CLEAR_BYTE | UNMAP_NOTIFY_SEND_EVENT; > + arg.event_channel_port = ctrl->event_port; > + ioctl(ring_fd, IOCTL_GNTALLOC_SET_UNMAP_NOTIFY, &arg); > + } > +#endif What is the fallback if this isn't available? [...] > static int init_xs_srv(struct libvchan *ctrl, int ring_ref) > +{ > + int ret = -1; > + struct xs_handle *xs; > + struct xs_permissions perms[2]; > + char buf[64]; > + char ref[16]; > + char* domid_str = NULL; > + xs = xs_domain_open(); > + if (!xs) > + goto fail; > + domid_str = xs_read(xs, 0, "domid", NULL); > + if (!domid_str) > + goto fail_xs_open; > + > + // owner domain is us > + perms[0].id = atoi(domid_str); It sucks a bit that xenstore doesn't appear to allow DOMID_SELF here but oh well. > + // permissions for domains not listed = none > + perms[0].perms = XS_PERM_NONE; > + // other domains > + perms[1].id = ctrl->other_domain_id; > + perms[1].perms = XS_PERM_READ; > + > + snprintf(ref, sizeof ref, "%d", ring_ref); > + snprintf(buf, sizeof buf, "data/vchan/%d/ring-ref", ctrl->device_number); > + if (!xs_write(xs, 0, buf, ref, strlen(ref))) > + goto fail_xs_open; > + if (!xs_set_permissions(xs, 0, buf, perms, 2)) > + goto fail_xs_open; > + > + snprintf(ref, sizeof ref, "%d", ctrl->event_port); > + snprintf(buf, sizeof buf, "data/vchan/%d/event-channel", ctrl->device_number); > + if (!xs_write(xs, 0, buf, ref, strlen(ref))) > + goto fail_xs_open; > + if (!xs_set_permissions(xs, 0, buf, perms, 2)) > + goto fail_xs_open; Am I right that the intended usage model is that two domains can decide to set up a connection without admin or toolstack involvement? 
Do we need to arrange on the toolstack side that a suitable vchan-specific directory (or directories) in xenstore exists with suitable permissions to allow this to happen exists or do we think data is an appropriate location? [...]> +static int init_evt_cli(struct libvchan *ctrl) > +{ > + struct ioctl_evtchn_bind_interdomain bind; > + ctrl->event_fd = open("/dev/xen/evtchn", O_RDWR); > + if (ctrl->event_fd < 0) > + return -1; > + > + bind.remote_domain = ctrl->other_domain_id; > + bind.remote_port = ctrl->event_port; > + ctrl->event_port = ioctl(ctrl->event_fd, IOCTL_EVTCHN_BIND_INTERDOMAIN, &bind); > + if (ctrl->event_port < 0) > + return -1; > + return 0; > +}This appears to be reimplementing xc_evtchn_bind_interdomain. It should use the provided infrastructure libraries instead.> diff --git a/xen/include/public/io/libvchan.h b/xen/include/public/io/libvchan.h > new file mode 100644 > index 0000000..81db0e2 > --- /dev/null > +++ b/xen/include/public/io/libvchan.h > @@ -0,0 +1,209 @@ > +/** > + * @file > + * @section AUTHORS > + * > + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> > + * > + * Authors: > + * Rafal Wojtczuk <rafal@invisiblethingslab.com> > + * Daniel De Graaf <dgdegra@tycho.nsa.gov> > + * > + * @section LICENSE > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; under version 2 of the License. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License along > + * with this program; if not, write to the Free Software Foundation, Inc., > + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
> + * > + * @section DESCRIPTION > + * > + * Originally borrowed from the Qubes OS Project, http://www.qubes-os.org, > + * this code has been substantially rewritten to use the gntdev and gntalloc > + * devices instead of raw MFNs and map_foreign_range. > + * > + * This is a library for inter-domain communication. A standard Xen ring > + * buffer is used, with a datagram-based interface built on top. The grant > + * reference and event channels are shared in XenStore under the path > + * /local/domain/<domid>/data/vchan/<port>/{ring-ref,event-channel}Is the peer''s domid just implicit in port and/or the out of band setup? Ian. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Aug-31 19:17 UTC
[Xen-devel] Re: [PATCH v3] libvchan: interdomain communications library
On 08/30/2011 06:32 AM, Ian Campbell wrote: > On Mon, 2011-08-29 at 19:48 +0100, Daniel De Graaf wrote: >> diff --git a/tools/libvchan/Makefile b/tools/libvchan/Makefile >> new file mode 100644 >> index 0000000..6cccf3e >> --- /dev/null >> +++ b/tools/libvchan/Makefile >> @@ -0,0 +1,56 @@ >> +# >> +# tools/libvchan/Makefile >> +# >> + >> +XEN_ROOT = $(CURDIR)/../.. >> +include $(XEN_ROOT)/tools/Rules.mk >> + >> +LIBVCHAN_OBJS = init.o io.o >> +NODE_OBJS = node.o >> +NODE2_OBJS = node-select.o >> + >> +LIBVCHAN_LIBS = $(LDLIBS_libxenstore) >> +$(LIBVCHAN_OBJS): CFLAGS += $(CFLAGS_libxenstore) >> + >> +MAJOR = 1.0 >> +MINOR = 0 >> + >> +CFLAGS += -I../include -I. -fPIC > > Can you use foo.opic in your $(*_OBJS) instead of -fPIC? I think that's > how this is intended to work. I was copying libxl's Makefile which doesn't use .opic; using .opic for the .so and not the .a looks like a good idea. >> + >> +.PHONY: all >> +all: libvchan.so vchan-node1 vchan-node2 libvchan.a >> + >> +libvchan.so: libvchan.so.$(MAJOR) >> + ln -sf $< $@ >> + >> +libvchan.so.$(MAJOR): libvchan.so.$(MAJOR).$(MINOR) >> + ln -sf $< $@ >> + >> +libvchan.so.$(MAJOR).$(MINOR): $(LIBVCHAN_OBJS) >> + $(CC) $(LDFLAGS) -Wl,$(SONAME_LDFLAG) -Wl,libvchan.so.$(MAJOR) $(SHLIB_LDFLAGS) -o $@ $^ $(LIBVCHAN_LIBS) >> + >> +libvchan.a: $(LIBVCHAN_OBJS) >> + $(AR) rcs libvchan.a $^ >> + >> +vchan-node1: $(NODE_OBJS) libvchan.so >> + $(CC) $(LDFLAGS) -o $@ $(NODE_OBJS) libvchan.so $(LDLIBS_libvchan) >> + >> +vchan-node2: $(NODE2_OBJS) libvchan.so >> + $(CC) $(LDFLAGS) -o $@ $(NODE2_OBJS) libvchan.so $(LDLIBS_libvchan) >> + >> +.PHONY: install >> +install: all >> + $(INSTALL_DIR) $(DESTDIR)$(LIBDIR) >> + $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR) >> + $(INSTALL_PROG) libvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR) > > Perhaps the library should be libxenvchan since it is going in /usr/lib > and vchan is a bit generic? That seems a sensible name change. >> diff --git a/tools/libvchan/init.c b/tools/libvchan/init.c >> new
file mode 100644 >> index 0000000..b267ca7 >> --- /dev/null >> +++ b/tools/libvchan/init.c >> @@ -0,0 +1,464 @@ >> +/** >> + * @file >> + * @section AUTHORS >> + * >> + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> >> + * >> + * Authors: >> + * Rafal Wojtczuk <rafal@invisiblethingslab.com> >> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> >> + * >> + * @section LICENSE >> + * >> + * This program is free software; you can redistribute it and/or modify >> + * it under the terms of the GNU General Public License as published by >> + * the Free Software Foundation; under version 2 of the License. > > I don't have a problem with GPL rather than LGPL myself but I suppose we > should consider if it meets the needs of the potential users now while > the number of people who have touched the library is small enough that > we can ask them if they are happy to relicense. > I have agreement from Rafal Wojtczuk to relicense under the LGPL, so that will be done in the next version. > [...] >> +static int init_gnt_srv(struct libvchan *ctrl) >> +{ >> + int pages_left = ctrl->read.order >= PAGE_SHIFT ? 1 << (ctrl->read.order - PAGE_SHIFT) : 0; >> + int pages_right = ctrl->write.order >= PAGE_SHIFT ?
1 << (ctrl->write.order - PAGE_SHIFT) : 0; >> + struct ioctl_gntalloc_alloc_gref *gref_info = NULL; >> + int ring_fd = open("/dev/xen/gntalloc", O_RDWR); >> + int ring_ref = -1; >> + int err; >> + void *ring, *area; >> + >> + if (ring_fd < 0) >> + return -1; >> + >> + gref_info = malloc(sizeof(*gref_info) + max(pages_left, pages_right)*sizeof(uint32_t)); >> + >> + gref_info->domid = ctrl->other_domain_id; >> + gref_info->flags = GNTALLOC_FLAG_WRITABLE; >> + gref_info->count = 1; >> + >> + err = ioctl(ring_fd, IOCTL_GNTALLOC_ALLOC_GREF, gref_info); > > Unless libvchan is going to be the only user of this interface we should > add helpful wrappers to libxc, like we do for gntdev and evtchn.Adding the wrappers made the library more complex with no other gains when it was out-of-tree; for upstreaming, this does make sense. This will result in a vchan consuming two file descriptors while it is active because the libxc API does not expose the ability to close descriptors without unmapping memory. 
Since that ability is likely to be linux-specific, it's reasonable to stop relying on it for portability reasons. >> + if (err) >> + goto out; >> + >> + ring = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, ring_fd, gref_info->index); >> + >> + if (ring == MAP_FAILED) >> + goto out; >> + >> + ctrl->ring = ring; >> + ring_ref = gref_info->gref_ids[0]; >> + >> + memset(ring, 0, PAGE_SIZE); >> + >> + ctrl->read.shr = &ctrl->ring->left; >> + ctrl->write.shr = &ctrl->ring->right; >> + ctrl->ring->left_order = ctrl->read.order; >> + ctrl->ring->right_order = ctrl->write.order; >> + ctrl->ring->cli_live = 2; >> + ctrl->ring->srv_live = 1; >> + ctrl->ring->debug = 0xabcd; > > Makes a change from deafbeef I guess ;-) > >> +#ifdef IOCTL_GNTALLOC_SET_UNMAP_NOTIFY >> + { >> + struct ioctl_gntalloc_unmap_notify arg; >> + arg.index = gref_info->index + offsetof(struct vchan_interface, srv_live); >> + arg.action = UNMAP_NOTIFY_CLEAR_BYTE | UNMAP_NOTIFY_SEND_EVENT; >> + arg.event_channel_port = ctrl->event_port; >> + ioctl(ring_fd, IOCTL_GNTALLOC_SET_UNMAP_NOTIFY, &arg); >> + } >> +#endif > > What is the fallback if this isn't available? The fallback is that the notify is not sent, and the peer cannot detect when its peer crashes or is killed instead of executing a graceful shutdown. Adding this functionality to libxc requires yet another wrapper on the grant mapping functionality. Instead of attempting to pass back the index as is done in the current version, I am considering adding the functions xc_gnttab_map_grant_ref_notify(xcg, domid, ref, notify_offset, notify_port) and xc_gntshr_share_page_notify(xcs, domid, &ref, notify_offset, notify_port); these would fall back to xc_gnttab_map_grant_ref if notify is not present. > [...]
>> static int init_xs_srv(struct libvchan *ctrl, int ring_ref) >> +{ >> + int ret = -1; >> + struct xs_handle *xs; >> + struct xs_permissions perms[2]; >> + char buf[64]; >> + char ref[16]; >> + char* domid_str = NULL; >> + xs = xs_domain_open(); >> + if (!xs) >> + goto fail; >> + domid_str = xs_read(xs, 0, "domid", NULL); >> + if (!domid_str) >> + goto fail_xs_open; >> + >> + // owner domain is us >> + perms[0].id = atoi(domid_str); > > It sucks a bit that xenstore doesn''t appear to allow DOMNID_SELF here > but oh well.On the client side, we need to look up our own domid to find the path (if the changes to follow usual xenstore convention are made) so it''s required either way.>> + // permissions for domains not listed = none >> + perms[0].perms = XS_PERM_NONE; >> + // other domains >> + perms[1].id = ctrl->other_domain_id; >> + perms[1].perms = XS_PERM_READ; >> + >> + snprintf(ref, sizeof ref, "%d", ring_ref); >> + snprintf(buf, sizeof buf, "data/vchan/%d/ring-ref", ctrl->device_number); >> + if (!xs_write(xs, 0, buf, ref, strlen(ref))) >> + goto fail_xs_open; >> + if (!xs_set_permissions(xs, 0, buf, perms, 2)) >> + goto fail_xs_open; >> + >> + snprintf(ref, sizeof ref, "%d", ctrl->event_port); >> + snprintf(buf, sizeof buf, "data/vchan/%d/event-channel", ctrl->device_number); >> + if (!xs_write(xs, 0, buf, ref, strlen(ref))) >> + goto fail_xs_open; >> + if (!xs_set_permissions(xs, 0, buf, perms, 2)) >> + goto fail_xs_open; > > Am I right that the intended usage model is that two domains can decide > to setup a connection without admin or toolstack involvement? > > Do we need to arrange on the toolstack side that a suitable > vchan-specific directory (or directories) in xenstore exists with > suitable permissions to allow this to happen exists or do we think data > is an appropriate location?Yes, the intended use is to avoid needing to have management tools involved in the setup. 
Of course, that doesn't mean that vchan can't have help from management tools - but since this help isn't required, adding an unneeded dependency was pointless and might also imply a level of control that is not actually present (i.e. restricting the management tools does not actually restrict the ability to set up a vchan; that requires something like an XSM policy blocking the grant or event channels). I picked data because it does not require toolstack modification to use, and no other location jumped out at me - vchan isn't really a device. > [...] >> +static int init_evt_cli(struct libvchan *ctrl) >> +{ >> + struct ioctl_evtchn_bind_interdomain bind; >> + ctrl->event_fd = open("/dev/xen/evtchn", O_RDWR); >> + if (ctrl->event_fd < 0) >> + return -1; >> + >> + bind.remote_domain = ctrl->other_domain_id; >> + bind.remote_port = ctrl->event_port; >> + ctrl->event_port = ioctl(ctrl->event_fd, IOCTL_EVTCHN_BIND_INTERDOMAIN, &bind); >> + if (ctrl->event_port < 0) >> + return -1; >> + return 0; >> +} > > This appears to be reimplementing xc_evtchn_bind_interdomain. It should > use the provided infrastructure libraries instead. At the time of writing, the infrastructure libraries did not work well in domUs; this appears to be fixed now, so using the libraries should make porting easier.
This will require adding libxc logging to the API. >> diff --git a/xen/include/public/io/libvchan.h b/xen/include/public/io/libvchan.h >> new file mode 100644 >> index 0000000..81db0e2 >> --- /dev/null >> +++ b/xen/include/public/io/libvchan.h >> @@ -0,0 +1,209 @@ >> +/** >> + * @file >> + * @section AUTHORS >> + * >> + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> >> + * >> + * Authors: >> + * Rafal Wojtczuk <rafal@invisiblethingslab.com> >> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> >> + * >> + * @section LICENSE >> + * >> + * This program is free software; you can redistribute it and/or modify >> + * it under the terms of the GNU General Public License as published by >> + * the Free Software Foundation; under version 2 of the License. >> + * >> + * This program is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> + * GNU General Public License for more details. >> + * >> + * You should have received a copy of the GNU General Public License along >> + * with this program; if not, write to the Free Software Foundation, Inc., >> + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. >> + * >> + * @section DESCRIPTION >> + * >> + * Originally borrowed from the Qubes OS Project, http://www.qubes-os.org, >> + * this code has been substantially rewritten to use the gntdev and gntalloc >> + * devices instead of raw MFNs and map_foreign_range. >> + * >> + * This is a library for inter-domain communication. A standard Xen ring >> + * buffer is used, with a datagram-based interface built on top. The grant >> + * reference and event channels are shared in XenStore under the path >> + * /local/domain/<domid>/data/vchan/<port>/{ring-ref,event-channel} > > Is the peer's domid just implicit in port and/or the out of band setup? > > Ian. > Yes.
Since the server already needs to query its own domid, the client can also add such a query and use /local/domain/<srv-d>/data/vchan/<cli-id>/<port>/* which matches the usual xenstore conventions. -- Daniel De Graaf National Security Agency _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Sep-01 16:22 UTC
[Xen-devel] [PATCH v4 0/3] libvchan: interdomain communications library
Changes since v3: - libxc integration - use of .opic for .so build instead of forcing -fPIC - change library name to "libxenvchan" - relicense to LGPL - change xenstore path to include both client and server domid - avoid sending unneeded event channel notifications [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify [PATCH 2/3] libxc: add xc_gntshr_* functions [PATCH 3/3] libvchan: interdomain communications library _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Sep-01 16:22 UTC
[Xen-devel] [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify
Normally, when a userspace process mapping a grant crashes, the domain providing the reference receives no indication that its peer has crashed, possibly leading to unexpected freezes or timeouts. This function provides a notification of the unmap by signalling an event channel and/or clearing a specific byte in the page. Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> --- tools/libxc/xc_gnttab.c | 15 ++++++++++++ tools/libxc/xc_linux_osdep.c | 52 ++++++++++++++++++++++++++++++++++++++++++ tools/libxc/xenctrl.h | 21 +++++++++++++++++ tools/libxc/xenctrlosdep.h | 5 ++++ 4 files changed, 93 insertions(+), 0 deletions(-) diff --git a/tools/libxc/xc_gnttab.c b/tools/libxc/xc_gnttab.c index 4f55fce..dc7aa0c 100644 --- a/tools/libxc/xc_gnttab.c +++ b/tools/libxc/xc_gnttab.c @@ -174,6 +174,21 @@ void *xc_gnttab_map_domain_grant_refs(xc_gnttab *xcg, count, domid, refs, prot); } +void *xc_gnttab_map_grant_ref_notify(xc_gnttab *xcg, + uint32_t domid, + uint32_t ref, + uint32_t notify_offset, + evtchn_port_t notify_port) +{ + if (xcg->ops->u.gnttab.map_grant_ref_notify) + return xcg->ops->u.gnttab.map_grant_ref_notify(xcg, xcg->ops_handle, + domid, ref, notify_offset, notify_port); + else + return xcg->ops->u.gnttab.map_grant_ref(xcg, xcg->ops_handle, + domid, ref, PROT_READ|PROT_WRITE); +} + + int xc_gnttab_munmap(xc_gnttab *xcg, void *start_address, uint32_t count) diff --git a/tools/libxc/xc_linux_osdep.c b/tools/libxc/xc_linux_osdep.c index dca6718..8f7718f 100644 --- a/tools/libxc/xc_linux_osdep.c +++ b/tools/libxc/xc_linux_osdep.c @@ -613,6 +613,57 @@ static void *linux_gnttab_map_domain_grant_refs(xc_gnttab *xcg, xc_osdep_handle return do_gnttab_map_grant_refs(xcg, h, count, &domid, 0, refs, prot); } +static void *linux_gnttab_map_grant_ref_notify(xc_gnttab *xch, xc_osdep_handle h, + uint32_t domid, uint32_t ref, + uint32_t notify_offset, + evtchn_port_t notify_port) +{ + int fd = (int)h; + struct ioctl_gntdev_map_grant_ref map; + struct 
ioctl_gntdev_unmap_notify notify; + void *addr; + + map.count = 1; + map.refs[0].domid = domid; + map.refs[0].ref = ref; + + if ( ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &map) ) { + PERROR("xc_gnttab_map_grant_ref: ioctl MAP_GRANT_REF failed"); + return NULL; + } + + addr = mmap(NULL, XC_PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, map.index); + if ( addr == MAP_FAILED ) + { + int saved_errno = errno; + struct ioctl_gntdev_unmap_grant_ref unmap_grant; + + PERROR("xc_gnttab_map_grant_ref: mmap failed"); + unmap_grant.index = map.index; + unmap_grant.count = 1; + ioctl(fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &unmap_grant); + errno = saved_errno; + return NULL; + } + + notify.index = map.index; + notify.action = 0; + if (notify_offset >= 0) { + notify.index += notify_offset; + notify.action |= UNMAP_NOTIFY_CLEAR_BYTE; + } + if (notify_port >= 0) { + notify.event_channel_port = notify_port; + notify.action |= UNMAP_NOTIFY_SEND_EVENT; + } + if (notify.action && ioctl(fd, IOCTL_GNTDEV_SET_UNMAP_NOTIFY, &notify)) { + PERROR("linux_gnttab_map_grant_ref_notify: ioctl SET_UNMAP_NOTIFY failed"); + } + + return addr; +} + + static int linux_gnttab_munmap(xc_gnttab *xcg, xc_osdep_handle h, void *start_address, uint32_t count) { @@ -662,6 +713,7 @@ static struct xc_osdep_ops linux_gnttab_ops = { .map_grant_ref = &linux_gnttab_map_grant_ref, .map_grant_refs = &linux_gnttab_map_grant_refs, .map_domain_grant_refs = &linux_gnttab_map_domain_grant_refs, + .map_grant_ref_notify = &linux_gnttab_map_grant_ref_notify, .munmap = &linux_gnttab_munmap, }, }; diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h index 1b82ee0..7859571 100644 --- a/tools/libxc/xenctrl.h +++ b/tools/libxc/xenctrl.h @@ -1349,6 +1349,27 @@ void *xc_gnttab_map_domain_grant_refs(xc_gnttab *xcg, int prot); /* + * Memory maps a grant reference from one domain to a local address range. + * Mappings should be unmapped with xc_gnttab_munmap. Logs errors.
+ * This version always maps writable pages, and will attempt to set up + * an unmap notification at the given offset and event channel. When the + * page is unmapped, the byte at the given offset will be zeroed and a + * wakeup will be sent to the given event channel. + * + * @parm xcg a handle on an open grant table interface + * @parm domid the domain to map memory from + * @parm ref the grant reference ID to map + * @parm notify_offset The byte offset in the page to use for unmap + * notification; -1 for none. + * @parm notify_port The event channel port to use for unmap notify, or -1 + */ +void *xc_gnttab_map_grant_ref_notify(xc_gnttab *xcg, + uint32_t domid, + uint32_t ref, + uint32_t notify_offset, + evtchn_port_t notify_port); + +/* * Unmaps the @count pages starting at @start_address, which were mapped by a * call to xc_gnttab_map_grant_ref or xc_gnttab_map_grant_refs. Never logs. */ diff --git a/tools/libxc/xenctrlosdep.h b/tools/libxc/xenctrlosdep.h index bfe46e0..01969c5 100644 --- a/tools/libxc/xenctrlosdep.h +++ b/tools/libxc/xenctrlosdep.h @@ -119,6 +119,11 @@ struct xc_osdep_ops uint32_t domid, uint32_t *refs, int prot); + void *(*map_grant_ref_notify)(xc_gnttab *xcg, xc_osdep_handle h, + uint32_t domid, + uint32_t ref, + uint32_t notify_offset, + evtchn_port_t notify_port); int (*munmap)(xc_gnttab *xcg, xc_osdep_handle h, void *start_address, uint32_t count); -- 1.7.6 _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Sep-01 16:22 UTC
[Xen-devel] [PATCH 2/3] libxc: add xc_gntshr_* functions
These functions and the xc_gntshr device (/dev/xen/gntalloc on linux) allow applications to create pages shared with other domains. Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> --- tools/libxc/xc_gnttab.c | 27 +++++++++ tools/libxc/xc_linux_osdep.c | 121 ++++++++++++++++++++++++++++++++++++++++++ tools/libxc/xc_private.c | 13 +++++ tools/libxc/xenctrl.h | 48 +++++++++++++++++ tools/libxc/xenctrlosdep.h | 13 +++++ 5 files changed, 222 insertions(+), 0 deletions(-) diff --git a/tools/libxc/xc_gnttab.c b/tools/libxc/xc_gnttab.c index dc7aa0c..ffa3550 100644 --- a/tools/libxc/xc_gnttab.c +++ b/tools/libxc/xc_gnttab.c @@ -204,6 +204,33 @@ int xc_gnttab_set_max_grants(xc_gnttab *xcg, uint32_t count) return xcg->ops->u.gnttab.set_max_grants(xcg, xcg->ops_handle, count); } +void *xc_gntshr_share_pages(xc_gntshr *xcg, uint32_t domid, + int count, uint32_t *refs, int writable) +{ + return xcg->ops->u.gntshr.share_pages(xcg, xcg->ops_handle, domid, + count, refs, writable); +} + +void *xc_gntshr_share_page_notify(xc_gntshr *xcg, uint32_t domid, + uint32_t *ref, int writable, + uint32_t notify_offset, + evtchn_port_t notify_port) +{ + return xcg->ops->u.gntshr.share_page_notify(xcg, xcg->ops_handle, + domid, ref, writable, notify_offset, notify_port); +} + +/* + * Unmaps the @count pages starting at @start_address, which were mapped by a + * call to xc_gntshr_share_*. Never logs. 
+ */ +int xc_gntshr_munmap(xc_gntshr *xcg, void *start_address, uint32_t count) +{ + return xcg->ops->u.gntshr.munmap(xcg, xcg->ops_handle, + start_address, count); +} + + /* * Local variables: * mode: C diff --git a/tools/libxc/xc_linux_osdep.c b/tools/libxc/xc_linux_osdep.c index 8f7718f..871d37c 100644 --- a/tools/libxc/xc_linux_osdep.c +++ b/tools/libxc/xc_linux_osdep.c @@ -34,6 +34,7 @@ #include <xen/memory.h> #include <xen/sys/evtchn.h> #include <xen/sys/gntdev.h> +#include <xen/sys/gntalloc.h> #include "xenctrl.h" #include "xenctrlosdep.h" @@ -718,6 +719,124 @@ static struct xc_osdep_ops linux_gnttab_ops = { }, }; +static xc_osdep_handle linux_gntshr_open(xc_gntshr *xcg) +{ + int fd = open(DEVXEN "gntalloc", O_RDWR); + + if ( fd == -1 ) + return XC_OSDEP_OPEN_ERROR; + + return (xc_osdep_handle)fd; +} + +static int linux_gntshr_close(xc_gntshr *xcg, xc_osdep_handle h) +{ + int fd = (int)h; + return close(fd); +} + +static void *linux_gntshr_share_pages(xc_gntshr *xch, xc_osdep_handle h, + uint32_t domid, int count, + uint32_t *refs, int writable) +{ + struct ioctl_gntalloc_alloc_gref *gref_info = NULL; + int err; + void *area = NULL; + gref_info = malloc(sizeof(*gref_info) + count * sizeof(uint32_t)); + if (!gref_info) + return NULL; + gref_info->domid = domid; + gref_info->flags = writable ? 
GNTALLOC_FLAG_WRITABLE : 0; + gref_info->count = count; + + err = ioctl((int)h, IOCTL_GNTALLOC_ALLOC_GREF, gref_info); + if (err) { + PERROR("linux_gntshr_share_pages: ioctl failed"); + goto out; + } + + area = mmap(NULL, count * XC_PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_SHARED, (int)h, gref_info->index); + + if (area == MAP_FAILED) { + area = NULL; + PERROR("linux_gntshr_share_pages: mmap failed"); + goto out; + } + + memcpy(refs, gref_info->gref_ids, count * sizeof(uint32_t)); + out: + free(gref_info); + return area; +} + +static void *linux_gntshr_share_page_notify(xc_gntshr *xch, xc_osdep_handle h, + uint32_t domid, uint32_t *ref, + int writable, uint32_t notify_offset, + evtchn_port_t notify_port) +{ + struct ioctl_gntalloc_alloc_gref gref_info; + struct ioctl_gntalloc_unmap_notify notify; + int err; + int fd = (int)h; + void *area = NULL; + gref_info.domid = domid; + gref_info.flags = writable ? GNTALLOC_FLAG_WRITABLE : 0; + gref_info.count = 1; + + err = ioctl(fd, IOCTL_GNTALLOC_ALLOC_GREF, &gref_info); + if (err) { + PERROR("linux_gntshr_share_page_notify: ioctl failed"); + goto out; + } + + area = mmap(NULL, XC_PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, gref_info.index); + + if (area == MAP_FAILED) { + PERROR("linux_gntshr_share_page_notify: mmap failed"); + area = NULL; + goto out; + } + + notify.index = gref_info.index; + notify.action = 0; + if (notify_offset >= 0) { + notify.index += notify_offset; + notify.action |= UNMAP_NOTIFY_CLEAR_BYTE; + } + if (notify_port >= 0) { + notify.event_channel_port = notify_port; + notify.action |= UNMAP_NOTIFY_SEND_EVENT; + } + if (notify.action && ioctl(fd, IOCTL_GNTALLOC_SET_UNMAP_NOTIFY, &notify)) { + PERROR("linux_gntshr_share_page_notify: ioctl SET_UNMAP_NOTIFY failed"); + } + + *ref = gref_info.gref_ids[0]; + out: + return area; +} + + +static int linux_gntshr_munmap(xc_gntshr *xcg, xc_osdep_handle h, + void *start_address, uint32_t count) +{ + return munmap(start_address, count); +} + +static struct
xc_osdep_ops linux_gntshr_ops = { + .open = &linux_gntshr_open, + .close = &linux_gntshr_close, + + .u.gntshr = { + .share_pages = &linux_gntshr_share_pages, + .share_page_notify = &linux_gntshr_share_page_notify, + .munmap = &linux_gntshr_munmap, + }, +}; + + static struct xc_osdep_ops *linux_osdep_init(xc_interface *xch, enum xc_osdep_type type) { switch ( type ) @@ -728,6 +847,8 @@ static struct xc_osdep_ops *linux_osdep_init(xc_interface *xch, enum xc_osdep_ty return &linux_evtchn_ops; case XC_OSDEP_GNTTAB: return &linux_gnttab_ops; + case XC_OSDEP_GNTSHR: + return &linux_gntshr_ops; default: return NULL; } diff --git a/tools/libxc/xc_private.c b/tools/libxc/xc_private.c index 09c8f23..09a91e7 100644 --- a/tools/libxc/xc_private.c +++ b/tools/libxc/xc_private.c @@ -258,6 +258,19 @@ int xc_gnttab_close(xc_gnttab *xcg) return xc_interface_close_common(xcg); } +xc_gntshr *xc_gntshr_open(xentoollog_logger *logger, + unsigned open_flags) +{ + return xc_interface_open_common(logger, NULL, open_flags, + XC_OSDEP_GNTSHR); +} + +int xc_gntshr_close(xc_gntshr *xcg) +{ + return xc_interface_close_common(xcg); +} + + static pthread_key_t errbuf_pkey; static pthread_once_t errbuf_pkey_once = PTHREAD_ONCE_INIT; diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h index 7859571..374c705 100644 --- a/tools/libxc/xenctrl.h +++ b/tools/libxc/xenctrl.h @@ -115,6 +115,7 @@ typedef struct xc_interface_core xc_interface; typedef struct xc_interface_core xc_evtchn; typedef struct xc_interface_core xc_gnttab; +typedef struct xc_interface_core xc_gntshr; typedef enum xc_error_code xc_error_code; @@ -1400,6 +1401,53 @@ grant_entry_v1_t *xc_gnttab_map_table_v1(xc_interface *xch, int domid, int *gnt_ grant_entry_v2_t *xc_gnttab_map_table_v2(xc_interface *xch, int domid, int *gnt_num); /* Sometimes these don't set errno [fixme], and sometimes they don't log. */ +/* + * Return an fd onto the grant sharing driver. Logs errors.
+ */ +xc_gntshr *xc_gntshr_open(xentoollog_logger *logger, + unsigned open_flags); + +/* + * Close a handle previously allocated with xc_gntshr_open(). + * Never logs errors. + */ +int xc_gntshr_close(xc_gntshr *xcg); + +/* + * Creates and shares pages with another domain. + * + * @parm xcg a handle to an open grant sharing instance + * @parm domid the domain to share memory with + * @parm count the number of pages to share + * @parm refs the grant references of the pages (output) + * @parm writable true if the other domain can write to the pages + * @return local mapping of the pages + */ +void *xc_gntshr_share_pages(xc_gntshr *xcg, uint32_t domid, + int count, uint32_t *refs, int writable); + +/* + * Creates and shares a page with another domain, with unmap notification. + * + * @parm xcg a handle to an open grant sharing instance + * @parm domid the domain to share memory with + * @parm refs the grant reference of the pages (output) + * @parm writable true if the other domain can write to the page + * @parm notify_offset The byte offset in the page to use for unmap + * notification; -1 for none. + * @parm notify_port The event channel port to use for unmap notify, or -1 + * @return local mapping of the page + */ +void *xc_gntshr_share_page_notify(xc_gntshr *xcg, uint32_t domid, + uint32_t *ref, int writable, + uint32_t notify_offset, + evtchn_port_t notify_port); +/* + * Unmaps the @count pages starting at @start_address, which were mapped by a + * call to xc_gntshr_share_*. Never logs. 
+ */ +int xc_gntshr_munmap(xc_gntshr *xcg, void *start_address, uint32_t count); + int xc_physdev_map_pirq(xc_interface *xch, int domid, int index, diff --git a/tools/libxc/xenctrlosdep.h b/tools/libxc/xenctrlosdep.h index 01969c5..e1c1ba5 100644 --- a/tools/libxc/xenctrlosdep.h +++ b/tools/libxc/xenctrlosdep.h @@ -54,6 +54,7 @@ enum xc_osdep_type { XC_OSDEP_PRIVCMD, XC_OSDEP_EVTCHN, XC_OSDEP_GNTTAB, + XC_OSDEP_GNTSHR, }; /* Opaque handle internal to the backend */ @@ -129,6 +130,18 @@ struct xc_osdep_ops uint32_t count); int (*set_max_grants)(xc_gnttab *xcg, xc_osdep_handle h, uint32_t count); } gnttab; + struct { + void *(*share_pages)(xc_gntshr *xcg, xc_osdep_handle h, + uint32_t domid, int count, + uint32_t *refs, int writable); + void *(*share_page_notify)(xc_gntshr *xcg, xc_osdep_handle h, + uint32_t domid, + uint32_t *ref, int writable, + uint32_t notify_offset, + evtchn_port_t notify_port); + int (*munmap)(xc_gntshr *xcg, xc_osdep_handle h, + void *start_address, uint32_t count); + } gntshr; } u; }; typedef struct xc_osdep_ops xc_osdep_ops; -- 1.7.6 _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Sep-01 16:22 UTC
[Xen-devel] [PATCH 3/3] libvchan: interdomain communications library
This library implements a bidirectional communication interface between applications in different domains, similar to unix sockets. Data can be sent using the byte-oriented libvchan_read/libvchan_write or the packet-oriented libvchan_recv/libvchan_send. Channel setup is done using a client-server model; domain IDs and a port number must be negotiated prior to initialization. The server allocates memory for the shared pages and determines the sizes of the communication rings (which may span multiple pages, although the default places rings and control within a single page). With properly sized rings, testing has shown that this interface provides speed comparable to pipes within a single Linux domain; it is significantly faster than network-based communication. Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> --- tools/Makefile | 1 + tools/include/xen-sys/Linux/gntalloc.h | 82 +++++++ tools/include/xen-sys/Linux/gntdev.h | 33 +++- tools/libvchan/Makefile | 59 +++++ tools/libvchan/init.c | 396 ++++++++++++++++++++++++++++++++ tools/libvchan/io.c | 375 ++++++++++++++++++++++++++++++ tools/libvchan/libxenvchan.h | 173 ++++++++++++++ tools/libvchan/node-select.c | 162 +++++++++++++ tools/libvchan/node.c | 169 ++++++++++++++ xen/include/public/io/libvchan.h | 97 ++++++++ 10 files changed, 1546 insertions(+), 1 deletions(-) create mode 100644 tools/include/xen-sys/Linux/gntalloc.h create mode 100644 tools/libvchan/Makefile create mode 100644 tools/libvchan/init.c create mode 100644 tools/libvchan/io.c create mode 100644 tools/libvchan/libxenvchan.h create mode 100644 tools/libvchan/node-select.c create mode 100644 tools/libvchan/node.c create mode 100644 xen/include/public/io/libvchan.h diff --git a/tools/Makefile b/tools/Makefile index df6270c..9389e1f 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -27,6 +27,7 @@ SUBDIRS-$(CONFIG_NetBSD) += blktap2 SUBDIRS-$(CONFIG_NetBSD) += xenbackendd SUBDIRS-y += libfsimage SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen 
+SUBDIRS-y += libvchan # do not recurse in to a dir we are about to delete ifneq "$(MAKECMDGOALS)" "distclean" diff --git a/tools/include/xen-sys/Linux/gntalloc.h b/tools/include/xen-sys/Linux/gntalloc.h new file mode 100644 index 0000000..76bd580 --- /dev/null +++ b/tools/include/xen-sys/Linux/gntalloc.h @@ -0,0 +1,82 @@ +/****************************************************************************** + * gntalloc.h + * + * Interface to /dev/xen/gntalloc. + * + * Author: Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * This file is in the public domain. + */ + +#ifndef __LINUX_PUBLIC_GNTALLOC_H__ +#define __LINUX_PUBLIC_GNTALLOC_H__ + +/* + * Allocates a new page and creates a new grant reference. + */ +#define IOCTL_GNTALLOC_ALLOC_GREF \ +_IOC(_IOC_NONE, ''G'', 5, sizeof(struct ioctl_gntalloc_alloc_gref)) +struct ioctl_gntalloc_alloc_gref { + /* IN parameters */ + /* The ID of the domain to be given access to the grants. */ + uint16_t domid; + /* Flags for this mapping */ + uint16_t flags; + /* Number of pages to map */ + uint32_t count; + /* OUT parameters */ + /* The offset to be used on a subsequent call to mmap(). */ + uint64_t index; + /* The grant references of the newly created grant, one per page */ + /* Variable size, depending on count */ + uint32_t gref_ids[1]; +}; + +#define GNTALLOC_FLAG_WRITABLE 1 + +/* + * Deallocates the grant reference, allowing the associated page to be freed if + * no other domains are using it. + */ +#define IOCTL_GNTALLOC_DEALLOC_GREF \ +_IOC(_IOC_NONE, ''G'', 6, sizeof(struct ioctl_gntalloc_dealloc_gref)) +struct ioctl_gntalloc_dealloc_gref { + /* IN parameters */ + /* The offset returned in the map operation */ + uint64_t index; + /* Number of references to unmap */ + uint32_t count; +}; + +/* + * Sets up an unmap notification within the page, so that the other side can do + * cleanup if this side crashes. Required to implement cross-domain robust + * mutexes or close notification on communication channels. 
+ * + * Each mapped page only supports one notification; multiple calls referring to + * the same page overwrite the previous notification. You must clear the + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it + * to occur. + */ +#define IOCTL_GNTALLOC_SET_UNMAP_NOTIFY \ +_IOC(_IOC_NONE, ''G'', 7, sizeof(struct ioctl_gntalloc_unmap_notify)) +struct ioctl_gntalloc_unmap_notify { + /* IN parameters */ + /* Offset in the file descriptor for a byte within the page (same as + * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to + * be cleared. Otherwise, it can be any byte in the page whose + * notification we are adjusting. + */ + uint64_t index; + /* Action(s) to take on unmap */ + uint32_t action; + /* Event channel to notify */ + uint32_t event_channel_port; +}; + +/* Clear (set to zero) the byte specified by index */ +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 +/* Send an interrupt on the indicated event channel */ +#define UNMAP_NOTIFY_SEND_EVENT 0x2 + +#endif /* __LINUX_PUBLIC_GNTALLOC_H__ */ diff --git a/tools/include/xen-sys/Linux/gntdev.h b/tools/include/xen-sys/Linux/gntdev.h index 8bd1467..5304bd3 100644 --- a/tools/include/xen-sys/Linux/gntdev.h +++ b/tools/include/xen-sys/Linux/gntdev.h @@ -66,7 +66,7 @@ struct ioctl_gntdev_map_grant_ref { * before this ioctl is called, or an error will result. */ #define IOCTL_GNTDEV_UNMAP_GRANT_REF \ -_IOC(_IOC_NONE, ''G'', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) +_IOC(_IOC_NONE, ''G'', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) struct ioctl_gntdev_unmap_grant_ref { /* IN parameters */ /* The offset was returned by the corresponding map operation. */ @@ -116,4 +116,35 @@ struct ioctl_gntdev_set_max_grants { uint32_t count; }; +/* + * Sets up an unmap notification within the page, so that the other side can do + * cleanup if this side crashes. Required to implement cross-domain robust + * mutexes or close notification on communication channels. 
+ * + * Each mapped page only supports one notification; multiple calls referring to + * the same page overwrite the previous notification. You must clear the + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it + * to occur. + */ +#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \ +_IOC(_IOC_NONE, ''G'', 7, sizeof(struct ioctl_gntdev_unmap_notify)) +struct ioctl_gntdev_unmap_notify { + /* IN parameters */ + /* Offset in the file descriptor for a byte within the page (same as + * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to + * be cleared. Otherwise, it can be any byte in the page whose + * notification we are adjusting. + */ + uint64_t index; + /* Action(s) to take on unmap */ + uint32_t action; + /* Event channel to notify */ + uint32_t event_channel_port; +}; + +/* Clear (set to zero) the byte specified by index */ +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 +/* Send an interrupt on the indicated event channel */ +#define UNMAP_NOTIFY_SEND_EVENT 0x2 + #endif /* __LINUX_PUBLIC_GNTDEV_H__ */ diff --git a/tools/libvchan/Makefile b/tools/libvchan/Makefile new file mode 100644 index 0000000..528eaed --- /dev/null +++ b/tools/libvchan/Makefile @@ -0,0 +1,59 @@ +# +# tools/libvchan/Makefile +# + +XEN_ROOT = $(CURDIR)/../.. +include $(XEN_ROOT)/tools/Rules.mk + +LIBVCHAN_OBJS = init.o io.o +NODE_OBJS = node.o +NODE2_OBJS = node-select.o + +LIBVCHAN_PIC_OBJS = $(patsubst %.o,%.opic,$(LIBVCHAN_OBJS)) +LIBVCHAN_LIBS = $(LDLIBS_libxenstore) $(LDLIBS_libxenctrl) +$(LIBVCHAN_OBJS) $(LIBVCHAN_PIC_OBJS): CFLAGS += $(CFLAGS_libxenstore) $(CFLAGS_libxenctrl) +$(NODE_OBJS) $(NODE2_OBJS): CFLAGS += $(CFLAGS_libxenctrl) + +MAJOR = 1.0 +MINOR = 0 + +CFLAGS += -I../include -I. 
+ +.PHONY: all +all: libxenvchan.so vchan-node1 vchan-node2 libxenvchan.a + +libxenvchan.so: libxenvchan.so.$(MAJOR) + ln -sf $< $@ + +libxenvchan.so.$(MAJOR): libxenvchan.so.$(MAJOR).$(MINOR) + ln -sf $< $@ + +libxenvchan.so.$(MAJOR).$(MINOR): $(LIBVCHAN_PIC_OBJS) + $(CC) $(LDFLAGS) -Wl,$(SONAME_LDFLAG) -Wl,libxenvchan.so.$(MAJOR) $(SHLIB_LDFLAGS) -o $@ $^ $(LIBVCHAN_LIBS) + +libxenvchan.a: $(LIBVCHAN_OBJS) + $(AR) rcs libxenvchan.a $^ + +vchan-node1: $(NODE_OBJS) libxenvchan.so + $(CC) $(LDFLAGS) -o $@ $(NODE_OBJS) libxenvchan.so $(LDLIBS_libvchan) + +vchan-node2: $(NODE2_OBJS) libxenvchan.so + $(CC) $(LDFLAGS) -o $@ $(NODE2_OBJS) libxenvchan.so $(LDLIBS_libvchan) + +.PHONY: install +install: all + $(INSTALL_DIR) $(DESTDIR)$(LIBDIR) + $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR) + $(INSTALL_PROG) libxenvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR) + ln -sf libxenvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libxenvchan.so.$(MAJOR) + ln -sf libxenvchan.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libxenvchan.so + $(INSTALL_DATA) libxenvchan.h $(DESTDIR)$(INCLUDEDIR) + $(INSTALL_DATA) libxenvchan.a $(DESTDIR)$(LIBDIR) + +.PHONY: clean +clean: + $(RM) -f *.o *.so* *.a vchan-node1 vchan-node2 $(DEPS) + +distclean: clean + +-include $(DEPS) diff --git a/tools/libvchan/init.c b/tools/libvchan/init.c new file mode 100644 index 0000000..9b98104 --- /dev/null +++ b/tools/libvchan/init.c @@ -0,0 +1,396 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @section DESCRIPTION + * + * This file contains the setup code used to establish the ring buffer. + */ + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/user.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> + +#include <xs.h> +#include <xen/sys/evtchn.h> +#include <xen/sys/gntalloc.h> +#include <xen/sys/gntdev.h> +#include <libxenvchan.h> + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT 12 +#endif + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +#define max(a,b) ((a > b) ? a : b) + +static int init_gnt_srv(struct libvchan *ctrl) +{ + int pages_left = ctrl->read.order >= PAGE_SHIFT ? 1 << (ctrl->read.order - PAGE_SHIFT) : 0; + int pages_right = ctrl->write.order >= PAGE_SHIFT ? 
1 << (ctrl->write.order - PAGE_SHIFT) : 0; + uint32_t ring_ref = -1; + void *ring; + + ring = xc_gntshr_share_page_notify(ctrl->gntshr, ctrl->other_domain_id, + &ring_ref, 1, offsetof(struct vchan_interface, srv_live), + ctrl->event_port); + + if (!ring) + goto out; + + memset(ring, 0, PAGE_SIZE); + + ctrl->ring = ring; + ctrl->read.shr = &ctrl->ring->left; + ctrl->write.shr = &ctrl->ring->right; + ctrl->ring->left_order = ctrl->read.order; + ctrl->ring->right_order = ctrl->write.order; + ctrl->ring->cli_live = 2; + ctrl->ring->srv_live = 1; + ctrl->ring->cli_notify = VCHAN_NOTIFY_WRITE; + + if (ctrl->read.order == 10) { + ctrl->read.buffer = ((void*)ctrl->ring) + 1024; + } else if (ctrl->read.order == 11) { + ctrl->read.buffer = ((void*)ctrl->ring) + 2048; + } else { + ctrl->read.buffer = xc_gntshr_share_pages(ctrl->gntshr, ctrl->other_domain_id, + pages_left, ctrl->ring->grants, 1); + if (!ctrl->read.buffer) + goto out_ring; + } + + if (ctrl->write.order == 10) { + ctrl->write.buffer = ((void*)ctrl->ring) + 1024; + } else if (ctrl->write.order == 11) { + ctrl->write.buffer = ((void*)ctrl->ring) + 2048; + } else { + ctrl->write.buffer = xc_gntshr_share_pages(ctrl->gntshr, ctrl->other_domain_id, + pages_right, ctrl->ring->grants + pages_left, 1); + if (!ctrl->write.buffer) + goto out_unmap_left; + } + +out: + return ring_ref; +out_unmap_left: + if (ctrl->read.order > 11) + xc_gntshr_munmap(ctrl->gntshr, ctrl->read.buffer, pages_left * PAGE_SIZE); +out_ring: + xc_gntshr_munmap(ctrl->gntshr, ring, PAGE_SIZE); + ring_ref = -1; + ctrl->ring = NULL; + ctrl->write.order = ctrl->read.order = 0; + goto out; +} + +static int init_gnt_cli(struct libvchan *ctrl, uint32_t ring_ref) +{ + int rv = -1; + uint32_t *grants; + + ctrl->ring = xc_gnttab_map_grant_ref_notify(ctrl->gnttab, + ctrl->other_domain_id, ring_ref, + offsetof(struct vchan_interface, cli_live), ctrl->event_port); + + if (!ctrl->ring) + goto out; + + ctrl->write.order = ctrl->ring->left_order; + ctrl->read.order 
= ctrl->ring->right_order; + ctrl->write.shr = &ctrl->ring->left; + ctrl->read.shr = &ctrl->ring->right; + if (ctrl->write.order < 10 || ctrl->write.order > 24) + goto out_unmap_ring; + if (ctrl->read.order < 10 || ctrl->read.order > 24) + goto out_unmap_ring; + if (ctrl->read.order == ctrl->write.order && ctrl->read.order < 12) + goto out_unmap_ring; + + grants = ctrl->ring->grants; + + if (ctrl->write.order == 10) { + ctrl->write.buffer = ((void*)ctrl->ring) + 1024; + } else if (ctrl->write.order == 11) { + ctrl->write.buffer = ((void*)ctrl->ring) + 2048; + } else { + int pages_left = 1 << (ctrl->write.order - PAGE_SHIFT); + ctrl->write.buffer = xc_gnttab_map_domain_grant_refs(ctrl->gnttab, + pages_left, ctrl->other_domain_id, grants, PROT_READ|PROT_WRITE); + if (!ctrl->write.buffer) + goto out_unmap_ring; + grants += pages_left; + } + + if (ctrl->read.order == 10) { + ctrl->read.buffer = ((void*)ctrl->ring) + 1024; + } else if (ctrl->read.order == 11) { + ctrl->read.buffer = ((void*)ctrl->ring) + 2048; + } else { + int pages_right = 1 << (ctrl->read.order - PAGE_SHIFT); + ctrl->read.buffer = xc_gnttab_map_domain_grant_refs(ctrl->gnttab, + pages_right, ctrl->other_domain_id, grants, PROT_READ); + if (!ctrl->read.buffer) + goto out_unmap_left; + } + + rv = 0; + out: + return rv; + out_unmap_left: + if (ctrl->write.order >= PAGE_SHIFT) + xc_gnttab_munmap(ctrl->gnttab, ctrl->write.buffer, + 1 << ctrl->write.order); + out_unmap_ring: + xc_gnttab_munmap(ctrl->gnttab, ctrl->ring, PAGE_SIZE); + ctrl->ring = 0; + ctrl->write.order = ctrl->read.order = 0; + rv = -1; + goto out; +} + +static int init_evt_srv(struct libvchan *ctrl, xentoollog_logger *logger) +{ + ctrl->event = xc_evtchn_open(logger, 0); + if (!ctrl->event) + return -1; + ctrl->event_port = xc_evtchn_bind_unbound_port(ctrl->event, ctrl->other_domain_id); + if (ctrl->event_port < 0) + return -1; + if (xc_evtchn_unmask(ctrl->event, ctrl->event_port)) + return -1; + return 0; +} + +static int init_xs_srv(struct 
libvchan *ctrl, int ring_ref) +{ + int ret = -1; + struct xs_handle *xs; + struct xs_permissions perms[2]; + char buf[64]; + char ref[16]; + char* domid_str = NULL; + xs = xs_domain_open(); + if (!xs) + goto fail; + domid_str = xs_read(xs, 0, "domid", NULL); + if (!domid_str) + goto fail_xs_open; + + // owner domain is us + perms[0].id = atoi(domid_str); + // permissions for domains not listed = none + perms[0].perms = XS_PERM_NONE; + // other domains + perms[1].id = ctrl->other_domain_id; + perms[1].perms = XS_PERM_READ; + + snprintf(ref, sizeof ref, "%d", ring_ref); + snprintf(buf, sizeof buf, "data/vchan/%d/%d/ring-ref", ctrl->other_domain_id, ctrl->device_number); + if (!xs_write(xs, 0, buf, ref, strlen(ref))) + goto fail_xs_open; + if (!xs_set_permissions(xs, 0, buf, perms, 2)) + goto fail_xs_open; + + snprintf(ref, sizeof ref, "%d", ctrl->event_port); + snprintf(buf, sizeof buf, "data/vchan/%d/%d/event-channel", ctrl->other_domain_id, ctrl->device_number); + if (!xs_write(xs, 0, buf, ref, strlen(ref))) + goto fail_xs_open; + if (!xs_set_permissions(xs, 0, buf, perms, 2)) + goto fail_xs_open; + + ret = 0; + fail_xs_open: + free(domid_str); + xs_daemon_close(xs); + fail: + return ret; +} + +static int min_order(size_t siz) +{ + int rv = PAGE_SHIFT; + while (siz > (1 << rv)) + rv++; + return rv; +} + +struct libvchan *libvchan_server_init(xentoollog_logger *logger, int domain, int devno, size_t left_min, size_t right_min) +{ + // if you go over this size, you''ll have too many grants to fit in the shared page. 
+ size_t MAX_RING_SIZE = 256 * PAGE_SIZE; + struct libvchan *ctrl; + int ring_ref; + if (left_min > MAX_RING_SIZE || right_min > MAX_RING_SIZE) + return 0; + + ctrl = malloc(sizeof(*ctrl)); + if (!ctrl) + return 0; + + ctrl->other_domain_id = domain; + ctrl->device_number = devno; + ctrl->ring = NULL; + ctrl->event = NULL; + ctrl->is_server = 1; + ctrl->server_persist = 0; + + ctrl->read.order = min_order(left_min); + ctrl->write.order = min_order(right_min); + + // if we can avoid allocating extra pages by using in-page rings, do so +#define MAX_SMALL_RING 1024 +#define MAX_LARGE_RING 2048 + if (left_min <= MAX_SMALL_RING && right_min <= MAX_LARGE_RING) { + ctrl->read.order = 10; + ctrl->write.order = 11; + } else if (left_min <= MAX_LARGE_RING && right_min <= MAX_SMALL_RING) { + ctrl->read.order = 11; + ctrl->write.order = 10; + } else if (left_min <= MAX_LARGE_RING) { + ctrl->read.order = 11; + } else if (right_min <= MAX_LARGE_RING) { + ctrl->write.order = 11; + } + + ctrl->gntshr = xc_gntshr_open(logger, 0); + if (!ctrl->gntshr) + goto out; + + if (init_evt_srv(ctrl, logger)) + goto out; + ring_ref = init_gnt_srv(ctrl); + if (ring_ref < 0) + goto out; + if (init_xs_srv(ctrl, ring_ref)) + goto out; + return ctrl; +out: + libvchan_close(ctrl); + return 0; +} + +static int init_evt_cli(struct libvchan *ctrl, xentoollog_logger *logger) +{ + ctrl->event = xc_evtchn_open(logger, 0); + if (!ctrl->event) + return -1; + ctrl->event_port = xc_evtchn_bind_interdomain(ctrl->event, + ctrl->other_domain_id, ctrl->event_port); + if (ctrl->event_port < 0) + return -1; + xc_evtchn_unmask(ctrl->event, ctrl->event_port); + return 0; +} + + +struct libvchan *libvchan_client_init(xentoollog_logger *logger, int domain, int devno) +{ + struct libvchan *ctrl = malloc(sizeof(struct libvchan)); + struct xs_handle *xs = NULL; + char buf[64]; + char *ref; + int ring_ref; + unsigned int len; + char* domid_str = NULL; + + if (!ctrl) + return 0; + ctrl->other_domain_id = domain; + 
ctrl->device_number = devno; + ctrl->ring = NULL; + ctrl->event = NULL; + ctrl->write.order = ctrl->read.order = 0; + ctrl->is_server = 0; + + xs = xs_daemon_open(); + if (!xs) + xs = xs_domain_open(); + if (!xs) + goto fail; + + domid_str = xs_read(xs, 0, "domid", NULL); + if (!domid_str) + goto fail; + +// find xenstore entry + snprintf(buf, sizeof buf, "/local/domain/%d/data/vchan/%s/%d/ring-ref", + ctrl->other_domain_id, domid_str, ctrl->device_number); + ref = xs_read(xs, 0, buf, &len); + if (!ref) + goto fail; + ring_ref = atoi(ref); + free(ref); + if (!ring_ref) + goto fail; + snprintf(buf, sizeof buf, "/local/domain/%d/data/vchan/%s/%d/event-channel", + ctrl->other_domain_id, domid_str, ctrl->device_number); + ref = xs_read(xs, 0, buf, &len); + if (!ref) + goto fail; + ctrl->event_port = atoi(ref); + free(ref); + if (!ctrl->event_port) + goto fail; + + ctrl->gnttab = xc_gnttab_open(logger, 0); + if (!ctrl->gnttab) + goto out; + +// set up event channel + if (init_evt_cli(ctrl, logger)) + goto fail; + +// set up shared page(s) + if (init_gnt_cli(ctrl, ring_ref)) + goto fail; + + ctrl->ring->cli_live = 1; + ctrl->ring->srv_notify = VCHAN_NOTIFY_WRITE; + + out: + free(domid_str); + if (xs) + xs_daemon_close(xs); + return ctrl; + fail: + libvchan_close(ctrl); + ctrl = NULL; + goto out; +} diff --git a/tools/libvchan/io.c b/tools/libvchan/io.c new file mode 100644 index 0000000..08d5dcf --- /dev/null +++ b/tools/libvchan/io.c @@ -0,0 +1,375 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @section DESCRIPTION + * + * This file contains the communications interface built on the ring buffer. + */ + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/uio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <unistd.h> + +#include <xenctrl.h> +#include <libxenvchan.h> + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT 12 +#endif + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +// allow vchan data to be easily observed in strace by doing a +// writev() to FD -1 with the data being read/written. 
+#ifndef VCHAN_DEBUG +#define VCHAN_DEBUG 0 +#endif + +#define barrier() asm volatile("" ::: "memory") + + +static inline uint32_t rd_prod(struct libvchan *ctrl) +{ + return ctrl->read.shr->prod; +} + +static inline uint32_t* _rd_cons(struct libvchan *ctrl) +{ + return &ctrl->read.shr->cons; +} +#define rd_cons(x) (*_rd_cons(x)) + +static inline uint32_t* _wr_prod(struct libvchan *ctrl) +{ + return &ctrl->write.shr->prod; +} +#define wr_prod(x) (*_wr_prod(x)) + +static inline uint32_t wr_cons(struct libvchan *ctrl) +{ + return ctrl->write.shr->cons; +} + +static inline const void* rd_ring(struct libvchan *ctrl) +{ + return ctrl->read.buffer; +} + +static inline void* wr_ring(struct libvchan *ctrl) +{ + return ctrl->write.buffer; +} + +static inline uint32_t wr_ring_size(struct libvchan *ctrl) +{ + return (1 << ctrl->write.order); +} + +static inline uint32_t rd_ring_size(struct libvchan *ctrl) +{ + return (1 << ctrl->read.order); +} + +static inline void request_notify(struct libvchan *ctrl, uint8_t bit) +{ + uint8_t *notify = ctrl->is_server ? &ctrl->ring->cli_notify : &ctrl->ring->srv_notify; + __sync_or_and_fetch(notify, bit); +} + +static inline int send_notify(struct libvchan *ctrl, uint8_t bit) +{ + uint8_t *notify = ctrl->is_server ? &ctrl->ring->srv_notify : &ctrl->ring->cli_notify; + uint8_t prev = __sync_fetch_and_and(notify, ~bit); + if (prev & bit) + return xc_evtchn_notify(ctrl->event, ctrl->event_port); + else + return 0; +} + +/** + * Get the amount of data ready to be read and enable notifications if needed. 
+ */ +static inline int fast_get_data_ready(struct libvchan *ctrl, size_t request) +{ + int ready = rd_prod(ctrl) - rd_cons(ctrl); + if (ready >= request) + return ready; + /* We plan to consume all data; please tell us if you send more */ + request_notify(ctrl, VCHAN_NOTIFY_WRITE); + /* + * If the writer moved rd_prod after our read but before request, we + * will not get notified even though the actual amount of data ready is + * above request. Reread rd_prod to cover this case. + */ + return rd_prod(ctrl) - rd_cons(ctrl); +} + +int libvchan_data_ready(struct libvchan *ctrl) +{ + /* Since this value is being used outside libvchan, request notification + * when it changes + */ + request_notify(ctrl, VCHAN_NOTIFY_WRITE); + return rd_prod(ctrl) - rd_cons(ctrl); +} + +/** + * Get the amount of buffer space available and enable notifications if needed. + */ +static inline int fast_get_buffer_space(struct libvchan *ctrl, size_t request) +{ + int ready = wr_ring_size(ctrl) - (wr_prod(ctrl) - wr_cons(ctrl)); + if (ready >= request) + return ready; + /* We plan to fill the buffer; please tell us when you've read it */ + request_notify(ctrl, VCHAN_NOTIFY_READ); + /* + * If the reader moved wr_cons after our read but before request, we + * will not get notified even though the actual amount of buffer space + * is above request. Reread wr_cons to cover this case. 
+ */ + return wr_ring_size(ctrl) - (wr_prod(ctrl) - wr_cons(ctrl)); +} + +int libvchan_buffer_space(struct libvchan *ctrl) +{ + /* Since this value is being used outside libvchan, request notification + * when it changes + */ + request_notify(ctrl, VCHAN_NOTIFY_READ); + return wr_ring_size(ctrl) - (wr_prod(ctrl) - wr_cons(ctrl)); +} + +int libvchan_wait(struct libvchan *ctrl) +{ + int ret = xc_evtchn_pending(ctrl->event); + if (ret < 0) + return -1; + xc_evtchn_unmask(ctrl->event, ret); + return 0; +} + +/** + * returns -1 on error, or size on success + */ +static int do_send(struct libvchan *ctrl, const void *data, size_t size) +{ + int real_idx = wr_prod(ctrl) & (wr_ring_size(ctrl) - 1); + int avail_contig = wr_ring_size(ctrl) - real_idx; + if (VCHAN_DEBUG) { + char metainfo[32]; + struct iovec iov[2]; + iov[0].iov_base = metainfo; + iov[0].iov_len = snprintf(metainfo, 32, "vchan wr %d/%d", ctrl->other_domain_id, ctrl->device_number); + iov[1].iov_base = (void *)data; + iov[1].iov_len = size; + writev(-1, iov, 2); + } + if (avail_contig > size) + avail_contig = size; + memcpy(wr_ring(ctrl) + real_idx, data, avail_contig); + if (avail_contig < size) + { + // we rolled across the end of the ring + memcpy(wr_ring(ctrl), data + avail_contig, size - avail_contig); + } + barrier(); // data must be in the ring prior to increment + wr_prod(ctrl) += size; + barrier(); // increment must happen prior to notify + if (send_notify(ctrl, VCHAN_NOTIFY_WRITE)) + return -1; + return size; +} + +/** + * returns 0 if no buffer space is available, -1 on error, or size on success + */ +int libvchan_send(struct libvchan *ctrl, const void *data, size_t size) +{ + int avail; + while (1) { + if (!libvchan_is_open(ctrl)) + return -1; + avail = fast_get_buffer_space(ctrl, size); + if (size <= avail) + return do_send(ctrl, data, size); + if (!ctrl->blocking) + return 0; + if (size > wr_ring_size(ctrl)) + return -1; + if (libvchan_wait(ctrl)) + return -1; + } +} + +int libvchan_write(struct 
libvchan *ctrl, const void *data, size_t size) +{ + int avail; + if (!libvchan_is_open(ctrl)) + return -1; + if (ctrl->blocking) { + size_t pos = 0; + while (1) { + avail = fast_get_buffer_space(ctrl, size - pos); + if (pos + avail > size) + avail = size - pos; + if (avail) + pos += do_send(ctrl, data + pos, avail); + if (pos == size) + return pos; + if (libvchan_wait(ctrl)) + return -1; + if (!libvchan_is_open(ctrl)) + return -1; + } + } else { + avail = fast_get_buffer_space(ctrl, size); + if (size > avail) + size = avail; + if (size == 0) + return 0; + return do_send(ctrl, data, size); + } +} + +static int do_recv(struct libvchan *ctrl, void *data, size_t size) +{ + int real_idx = rd_cons(ctrl) & (rd_ring_size(ctrl) - 1); + int avail_contig = rd_ring_size(ctrl) - real_idx; + if (avail_contig > size) + avail_contig = size; + barrier(); // data read must happen after rd_cons read + memcpy(data, rd_ring(ctrl) + real_idx, avail_contig); + if (avail_contig < size) + { + // we rolled across the end of the ring + memcpy(data + avail_contig, rd_ring(ctrl), size - avail_contig); + } + rd_cons(ctrl) += size; + if (VCHAN_DEBUG) { + char metainfo[32]; + struct iovec iov[2]; + iov[0].iov_base = metainfo; + iov[0].iov_len = snprintf(metainfo, 32, "vchan rd %d/%d", ctrl->other_domain_id, ctrl->device_number); + iov[1].iov_base = data; + iov[1].iov_len = size; + writev(-1, iov, 2); + } + barrier(); // consumption must happen prior to notify of newly freed space + if (send_notify(ctrl, VCHAN_NOTIFY_READ)) + return -1; + return size; +} + +/** + * reads exactly size bytes from the vchan. 
+ * returns 0 if insufficient data is available, -1 on error, or size on success + */ +int libvchan_recv(struct libvchan *ctrl, void *data, size_t size) +{ + while (1) { + int avail = fast_get_data_ready(ctrl, size); + if (size <= avail) + return do_recv(ctrl, data, size); + if (!libvchan_is_open(ctrl)) + return -1; + if (!ctrl->blocking) + return 0; + if (size > rd_ring_size(ctrl)) + return -1; + if (libvchan_wait(ctrl)) + return -1; + } +} + +int libvchan_read(struct libvchan *ctrl, void *data, size_t size) +{ + while (1) { + int avail = fast_get_data_ready(ctrl, size); + if (avail && size > avail) + size = avail; + if (avail) + return do_recv(ctrl, data, size); + if (!libvchan_is_open(ctrl)) + return -1; + if (!ctrl->blocking) + return 0; + if (libvchan_wait(ctrl)) + return -1; + } +} + +int libvchan_is_open(struct libvchan* ctrl) +{ + if (ctrl->is_server) + return ctrl->server_persist ? 1 : ctrl->ring->cli_live; + else + return ctrl->ring->srv_live; +} + +int libvchan_fd_for_select(struct libvchan *ctrl) +{ + return xc_evtchn_fd(ctrl->event); +} + +void libvchan_close(struct libvchan *ctrl) +{ + if (!ctrl) + return; + if (ctrl->read.order >= PAGE_SHIFT) + munmap(ctrl->read.buffer, 1 << ctrl->read.order); + if (ctrl->write.order >= PAGE_SHIFT) + munmap(ctrl->write.buffer, 1 << ctrl->write.order); + if (ctrl->ring) { + if (ctrl->is_server) { + ctrl->ring->srv_live = 0; + xc_gntshr_munmap(ctrl->gntshr, ctrl->ring, PAGE_SIZE); + } else { + ctrl->ring->cli_live = 0; + xc_gnttab_munmap(ctrl->gnttab, ctrl->ring, PAGE_SIZE); + } + } + if (ctrl->event) { + if (ctrl->event_port >= 0 && ctrl->ring) + xc_evtchn_notify(ctrl->event, ctrl->event_port); + xc_evtchn_close(ctrl->event); + } + if (ctrl->is_server) { + if (ctrl->gntshr) + xc_gntshr_close(ctrl->gntshr); + } else { + if (ctrl->gnttab) + xc_gnttab_close(ctrl->gnttab); + } + free(ctrl); +} diff --git a/tools/libvchan/libxenvchan.h b/tools/libvchan/libxenvchan.h new file mode 100644 index 0000000..c4a3ab9 --- /dev/null 
+++ b/tools/libvchan/libxenvchan.h @@ -0,0 +1,173 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @section DESCRIPTION + * + * Originally borrowed from the Qubes OS Project, http://www.qubes-os.org, + * this code has been substantially rewritten to use the gntdev and gntalloc + * devices instead of raw MFNs and map_foreign_range. + * + * This is a library for inter-domain communication. A standard Xen ring + * buffer is used, with a datagram-based interface built on top. The grant + * reference and event channels are shared in XenStore under the path + * /local/domain/<srv-id>/data/vchan/<cli-id>/<port>/{ring-ref,event-channel} + * + * The ring.h macros define an asymmetric interface to a shared data structure + * that assumes all rings reside in a single contiguous memory space. This is + * not suitable for vchan because the interface to the ring is symmetric except + * for the setup. 
Unlike the producer-consumer rings defined in ring.h, the + * sizes of the rings used in vchan are determined at execution time instead of + * compile time, so the macros in ring.h cannot be used to access the rings. + */ + +#include <xen/io/libvchan.h> +#include <xen/sys/evtchn.h> +#include <xenctrl.h> + +struct libvchan_ring { + /* Pointer into the shared page. Offsets into buffer. */ + struct ring_shared* shr; + /* ring data; may be its own shared page(s) depending on order */ + void* buffer; + /** + * The size of the ring is (1 << order); offsets wrap around when they + * exceed this. This copy is required because we can't trust the order + * in the shared page to remain constant. + */ + int order; +}; + +/** + * struct libvchan: control structure passed to all library calls + */ +struct libvchan { + /* person we communicate with */ + int other_domain_id; + /* "port" we communicate on (allows multiple vchans to exist in xenstore) */ + int device_number; + /* Mapping handle for shared ring page */ + union { + xc_gntshr *gntshr; /* for server */ + xc_gnttab *gnttab; /* for client */ + }; + /* Pointer to shared ring page */ + struct vchan_interface *ring; + /* event channel interface */ + xc_evtchn *event; + uint32_t event_port; + /* informative flags: are we acting as server? 
*/ + int is_server:1; + /* true if server remains active when client closes (allows reconnection) */ + int server_persist:1; + /* true if operations should block instead of returning 0 */ + int blocking:1; + /* communication rings */ + struct libvchan_ring read, write; +}; + +/** + * Set up a vchan, including granting pages + * @param logger Logger for libxc errors + * @param domain The peer domain that will be connecting + * @param devno A device number, used to identify this vchan in xenstore + * @param read_min The minimum size (in bytes) of the ring the server reads from (left) + * @param write_min The minimum size (in bytes) of the ring the server writes to (right) + * @return The structure, or NULL in case of an error + */ +struct libvchan *libvchan_server_init(xentoollog_logger *logger, int domain, int devno, size_t read_min, size_t write_min); +/** + * Connect to an existing vchan. Note: you can reconnect to an existing vchan + * safely, however no locking is performed, so you must prevent multiple clients + * from connecting to a single server. + * + * @param logger Logger for libxc errors + * @param domain The peer domain to connect to + * @param devno A device number, used to identify this vchan in xenstore + * @return The structure, or NULL in case of an error + */ +struct libvchan *libvchan_client_init(xentoollog_logger *logger, int domain, int devno); +/** + * Close a vchan. This deallocates the vchan and attempts to free its + * resources. The other side is notified of the close, but can still read any + * data pending prior to the close. + */ +void libvchan_close(struct libvchan *ctrl); + +/** + * Packet-based receive: always reads exactly $size bytes. 
+ * @param ctrl The vchan control structure + * @param data Buffer for data that was read + * @param size Size of the buffer and amount of data to read + * @return -1 on error, 0 if nonblocking and insufficient data is available, or $size + */ +int libvchan_recv(struct libvchan *ctrl, void *data, size_t size); +/** + * Stream-based receive: reads as much data as possible. + * @param ctrl The vchan control structure + * @param data Buffer for data that was read + * @param size Size of the buffer + * @return -1 on error, otherwise the amount of data read (which may be zero if + * the vchan is nonblocking) + */ +int libvchan_read(struct libvchan *ctrl, void *data, size_t size); +/** + * Packet-based send: send entire buffer if possible + * @param ctrl The vchan control structure + * @param data Buffer for data to send + * @param size Size of the buffer and amount of data to send + * @return -1 on error, 0 if nonblocking and insufficient space is available, or $size + */ +int libvchan_send(struct libvchan *ctrl, const void *data, size_t size); +/** + * Stream-based send: send as much data as possible. + * @param ctrl The vchan control structure + * @param data Buffer for data to send + * @param size Size of the buffer + * @return -1 on error, otherwise the amount of data sent (which may be zero if + * the vchan is nonblocking) + */ +int libvchan_write(struct libvchan *ctrl, const void *data, size_t size); +/** + * Waits for reads or writes to unblock, or for a close + */ +int libvchan_wait(struct libvchan *ctrl); +/** + * Returns the event file descriptor for this vchan. When this FD is readable, + * libvchan_wait() will not block, and the state of the vchan has changed since + * the last invocation of libvchan_wait(). 
+ */ +int libvchan_fd_for_select(struct libvchan *ctrl); +/** + * Query the state of the vchan shared page: + * return 0 when one side has called libvchan_close() or crashed + * return 1 when both sides are open + * return 2 [server only] when no client has yet connected + */ +int libvchan_is_open(struct libvchan* ctrl); +/** Amount of data ready to read, in bytes */ +int libvchan_data_ready(struct libvchan *ctrl); +/** Amount of data it is possible to send without blocking */ +int libvchan_buffer_space(struct libvchan *ctrl); diff --git a/tools/libvchan/node-select.c b/tools/libvchan/node-select.c new file mode 100644 index 0000000..ea1bfc6 --- /dev/null +++ b/tools/libvchan/node-select.c @@ -0,0 +1,162 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @section DESCRIPTION + * + * This is a test program for libvchan. Communications are bidirectional, + * with either server (grant offeror) or client able to read and write. 
+ */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> + +#include <libxenvchan.h> + +void usage(char** argv) +{ + fprintf(stderr, "usage:\n" + "\t%s [client|server] domainid nodeid [rbufsiz wbufsiz]\n", + argv[0]); + exit(1); +} + +#define BUFSIZE 5000 +char inbuf[BUFSIZE]; +char outbuf[BUFSIZE]; +int insiz = 0; +int outsiz = 0; +struct libvchan *ctrl = 0; + +void vchan_wr() { + if (!insiz) + return; + int ret = libvchan_write(ctrl, inbuf, insiz); + if (ret < 0) { + fprintf(stderr, "vchan write failed\n"); + exit(1); + } + if (ret > 0) { + insiz -= ret; + memmove(inbuf, inbuf + ret, insiz); + } +} + +void stdout_wr() { + if (!outsiz) + return; + int ret = write(1, outbuf, outsiz); + if (ret < 0 && errno != EAGAIN) + exit(1); + if (ret > 0) { + outsiz -= ret; + memmove(outbuf, outbuf + ret, outsiz); + } +} + +/** + Simple libvchan application, both client and server. + Both sides may write and read, both from the libvchan and from + stdin/stdout (just like netcat). +*/ + +int main(int argc, char **argv) +{ + int ret; + int libvchan_fd; + if (argc < 4) + usage(argv); + if (!strcmp(argv[1], "server")) { + int rsiz = argc > 4 ? atoi(argv[4]) : 0; + int wsiz = argc > 5 ? 
atoi(argv[5]) : 0; + ctrl = libvchan_server_init(NULL, atoi(argv[2]), atoi(argv[3]), rsiz, wsiz); + } else if (!strcmp(argv[1], "client")) + ctrl = libvchan_client_init(NULL, atoi(argv[2]), atoi(argv[3])); + else + usage(argv); + if (!ctrl) { + perror("libvchan_*_init"); + exit(1); + } + + fcntl(0, F_SETFL, O_NONBLOCK); + fcntl(1, F_SETFL, O_NONBLOCK); + + libvchan_fd = libvchan_fd_for_select(ctrl); + for (;;) { + fd_set rfds; + fd_set wfds; + FD_ZERO(&rfds); + FD_ZERO(&wfds); + if (insiz != BUFSIZE) + FD_SET(0, &rfds); + if (outsiz) + FD_SET(1, &wfds); + FD_SET(libvchan_fd, &rfds); + ret = select(libvchan_fd + 1, &rfds, &wfds, NULL, NULL); + if (ret < 0) { + perror("select"); + exit(1); + } + if (FD_ISSET(0, &rfds)) { + ret = read(0, inbuf + insiz, BUFSIZE - insiz); + if (ret < 0 && errno != EAGAIN) + exit(1); + if (ret == 0) { + while (insiz) { + vchan_wr(); + libvchan_wait(ctrl); + } + return 0; + } + if (ret) + insiz += ret; + vchan_wr(); + } + if (FD_ISSET(libvchan_fd, &rfds)) { + libvchan_wait(ctrl); + vchan_wr(); + } + if (FD_ISSET(1, &wfds)) + stdout_wr(); + while (libvchan_data_ready(ctrl) && outsiz < BUFSIZE) { + ret = libvchan_read(ctrl, outbuf + outsiz, BUFSIZE - outsiz); + if (ret < 0) + exit(1); + outsiz += ret; + stdout_wr(); + } + if (!libvchan_is_open(ctrl)) { + fcntl(1, F_SETFL, 0); + while (outsiz) + stdout_wr(); + return 0; + } + } +} diff --git a/tools/libvchan/node.c b/tools/libvchan/node.c new file mode 100644 index 0000000..6a9204c --- /dev/null +++ b/tools/libvchan/node.c @@ -0,0 +1,169 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of 
the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @section DESCRIPTION + * + * This is a test program for libvchan. Communications are in one direction, + * either server (grant offeror) to client or vice versa. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <time.h> + +#include <libxenvchan.h> + +int libvchan_write_all(struct libvchan *ctrl, char *buf, int size) +{ + int written = 0; + int ret; + while (written < size) { + ret = libvchan_write(ctrl, buf + written, size - written); + if (ret <= 0) { + perror("write"); + exit(1); + } + written += ret; + } + return size; +} + +int write_all(int fd, char *buf, int size) +{ + int written = 0; + int ret; + while (written < size) { + ret = write(fd, buf + written, size - written); + if (ret <= 0) { + perror("write"); + exit(1); + } + written += ret; + } + return size; +} + +void usage(char** argv) +{ + fprintf(stderr, "usage:\n" + "%s [client|server] [read|write] domid nodeid\n", argv[0]); + exit(1); +} + +#define BUFSIZE 5000 +char buf[BUFSIZE]; +void reader(struct libvchan *ctrl) +{ + int size; + for (;;) { + size = rand() % (BUFSIZE - 1) + 1; + size = libvchan_read(ctrl, buf, size); + fprintf(stderr, "#"); + if (size < 0) { + perror("read vchan"); + libvchan_close(ctrl); + exit(1); + } + size = write_all(1, buf, size); + if (size < 0) { + perror("stdout write"); + exit(1); + } + if (size == 0) { + perror("write size=0?\n"); + exit(1); + } + } +} + +void writer(struct libvchan 
*ctrl) +{ + int size; + for (;;) { + size = rand() % (BUFSIZE - 1) + 1; + size = read(0, buf, size); + if (size < 0) { + perror("read stdin"); + libvchan_close(ctrl); + exit(1); + } + if (size == 0) + break; + size = libvchan_write_all(ctrl, buf, size); + fprintf(stderr, "#"); + if (size < 0) { + perror("vchan write"); + exit(1); + } + if (size == 0) { + perror("write size=0?\n"); + exit(1); + } + } +} + + +/** + Simple libvchan application, both client and server. + One side does writing, the other side does reading; both from + standard input/output fds. +*/ +int main(int argc, char **argv) +{ + int seed = time(0); + struct libvchan *ctrl = 0; + int wr = 0; + if (argc < 4) + usage(argv); + if (!strcmp(argv[2], "read")) + wr = 0; + else if (!strcmp(argv[2], "write")) + wr = 1; + else + usage(argv); + if (!strcmp(argv[1], "server")) + ctrl = libvchan_server_init(NULL, atoi(argv[3]), atoi(argv[4]), 0, 0); + else if (!strcmp(argv[1], "client")) + ctrl = libvchan_client_init(NULL, atoi(argv[3]), atoi(argv[4])); + else + usage(argv); + if (!ctrl) { + perror("libvchan_*_init"); + exit(1); + } + ctrl->blocking = 1; + + srand(seed); + fprintf(stderr, "seed=%d\n", seed); + if (wr) + writer(ctrl); + else + reader(ctrl); + libvchan_close(ctrl); + return 0; +} diff --git a/xen/include/public/io/libvchan.h b/xen/include/public/io/libvchan.h new file mode 100644 index 0000000..a3bf7cd --- /dev/null +++ b/xen/include/public/io/libvchan.h @@ -0,0 +1,97 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @section DESCRIPTION + * + * Originally borrowed from the Qubes OS Project, http://www.qubes-os.org, + * this code has been substantially rewritten to use the gntdev and gntalloc + * devices instead of raw MFNs and map_foreign_range. + * + * This is a library for inter-domain communication. A standard Xen ring + * buffer is used, with a datagram-based interface built on top. The grant + * reference and event channels are shared in XenStore under the path + * /local/domain/<srv-id>/data/vchan/<cli-id>/<port>/{ring-ref,event-channel} + * + * The ring.h macros define an asymmetric interface to a shared data structure + * that assumes all rings reside in a single contiguous memory space. This is + * not suitable for vchan because the interface to the ring is symmetric except + * for the setup. Unlike the producer-consumer rings defined in ring.h, the + * size of the rings used in vchan are determined at execution time instead of + * compile time, so the macros in ring.h cannot be used to access the rings. 
+ */ + +#include <stdint.h> +#include <sys/types.h> + +struct ring_shared { + uint32_t cons, prod; +}; + +#define VCHAN_NOTIFY_WRITE 0x1 +#define VCHAN_NOTIFY_READ 0x2 + +/** + * vchan_interface: primary shared data structure + */ +struct vchan_interface { + /** + * Standard consumer/producer interface, one pair per buffer + * left is client write, server read + * right is client read, server write + */ + struct ring_shared left, right; + /** + * size of the rings, which determines their location + * 10 - at offset 1024 in ring''s page + * 11 - at offset 2048 in ring''s page + * 12+ - uses 2^(N-12) grants to describe the multi-page ring + * These should remain constant once the page is shared. + * Only one of the two orders can be 10 (or 11). + */ + uint16_t left_order, right_order; + /** + * Shutdown detection: + * 0: client (or server) has exited + * 1: client (or server) is connected + * 2: client has not yet connected + */ + uint8_t cli_live, srv_live; + /** + * Notification bits: + * VCHAN_NOTIFY_WRITE: send notify when data is written + * VCHAN_NOTIFY_READ: send notify when data is read (consumed) + * cli_notify is used for the client to inform the server of its action + */ + uint8_t cli_notify, srv_notify; + /** + * Grant list: ordering is left, right. Must not extend into actual ring + * or grow beyond the end of the initial shared page. + * These should remain constant once the page is shared, to allow + * for possible remapping by a client that restarts. + */ + uint32_t grants[0]; +}; + -- 1.7.6 _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Campbell
2011-Sep-01 16:28 UTC
[Xen-devel] Re: [PATCH v3] libvchan: interdomain communications library
On Wed, 2011-08-31 at 20:17 +0100, Daniel De Graaf wrote: > > [...] > >> +static int init_gnt_srv(struct libvchan *ctrl) > >> +{ > >> + int pages_left = ctrl->read.order >= PAGE_SHIFT ? 1 << (ctrl->read.order - PAGE_SHIFT) : 0; > >> + int pages_right = ctrl->write.order >= PAGE_SHIFT ? 1 << (ctrl->write.order - PAGE_SHIFT) : 0; > >> + struct ioctl_gntalloc_alloc_gref *gref_info = NULL; > >> + int ring_fd = open("/dev/xen/gntalloc", O_RDWR); > >> + int ring_ref = -1; > >> + int err; > >> + void *ring, *area; > >> + > >> + if (ring_fd < 0) > >> + return -1; > >> + > >> + gref_info = malloc(sizeof(*gref_info) + max(pages_left, pages_right)*sizeof(uint32_t)); > >> + > >> + gref_info->domid = ctrl->other_domain_id; > >> + gref_info->flags = GNTALLOC_FLAG_WRITABLE; > >> + gref_info->count = 1; > >> + > >> + err = ioctl(ring_fd, IOCTL_GNTALLOC_ALLOC_GREF, gref_info); > > > > Unless libvchan is going to be the only user of this interface we should > > add helpful wrappers to libxc, like we do for gntdev and evtchn. > > Adding the wrappers made the library more complex with no other gains when > it was out-of-tree; for upstreaming, this does make sense. This will result > in a vchan consuming two file descriptors while it is active because the libxc > API does not expose the ability to close descriptors without unmapping memory. > Since that ability is likely to be linux-specific, it's reasonable to stop > relying on it for portability reasons. I'm not sure I understand (wouldn't you just add a gntalloc fd to libvchan and reuse it everywhere?)
but if you need a new variant of a particular libxc interface with different semantics feel free to add it (or convert an existing function to it if that seems more appropriate). > >> +#ifdef IOCTL_GNTALLOC_SET_UNMAP_NOTIFY > >> + { > >> + struct ioctl_gntalloc_unmap_notify arg; > >> + arg.index = gref_info->index + offsetof(struct vchan_interface, srv_live); > >> + arg.action = UNMAP_NOTIFY_CLEAR_BYTE | UNMAP_NOTIFY_SEND_EVENT; > >> + arg.event_channel_port = ctrl->event_port; > >> + ioctl(ring_fd, IOCTL_GNTALLOC_SET_UNMAP_NOTIFY, &arg); > >> + } > >> +#endif > > > > What is the fallback if this isn't available? > > The fallback is that the notify is not sent, and the peer cannot detect when > its peer crashes or is killed instead of executing a graceful shutdown. > > Adding this functionality to libxc requires yet another wrapper on the grant > mapping functionality. Instead of attempting to pass back the index as is > done in the current version, I am considering adding the functions > xc_gnttab_map_grant_ref_notify(xcg, domid, ref, notify_offset, notify_port) and > xc_gntshr_share_page_notify(xcs, domid, &ref, notify_offset, notify_port); > these would fall back to xc_gnttab_map_grant_ref if notify is not present. You can't just add the xc_gnttab_notify() as a function which just registers the notify and use xc_gnttab_map_grant_ref + that new function? > > [...] > >> static int init_xs_srv(struct libvchan *ctrl, int ring_ref) > >> +{ > >> + int ret = -1; > >> + struct xs_handle *xs; > >> + struct xs_permissions perms[2]; > >> + char buf[64]; > >> + char ref[16]; > >> + char* domid_str = NULL; > >> + xs = xs_domain_open(); > >> + if (!xs) > >> + goto fail; > >> + domid_str = xs_read(xs, 0, "domid", NULL); > >> + if (!domid_str) > >> + goto fail_xs_open; > >> + > >> + // owner domain is us > >> + perms[0].id = atoi(domid_str); > > > > It sucks a bit that xenstore doesn't appear to allow DOMID_SELF here > > but oh well.
> > On the client side, we need to look up our own domid to find the path > (if the changes to follow usual xenstore convention are made) so it's > required either way. How do you mean? Relative xenstore accesses are relative to /local/domain/<domid> so you shouldn't need to know domid to access e.g. /local/domain/<domid>/foo/bar -- just access foo/bar instead. > >> + // permissions for domains not listed = none > >> + perms[0].perms = XS_PERM_NONE; > >> + // other domains > >> + perms[1].id = ctrl->other_domain_id; > >> + perms[1].perms = XS_PERM_READ; > >> + > >> + snprintf(ref, sizeof ref, "%d", ring_ref); > >> + snprintf(buf, sizeof buf, "data/vchan/%d/ring-ref", ctrl->device_number); > >> + if (!xs_write(xs, 0, buf, ref, strlen(ref))) > >> + goto fail_xs_open; > >> + if (!xs_set_permissions(xs, 0, buf, perms, 2)) > >> + goto fail_xs_open; > >> + > >> + snprintf(ref, sizeof ref, "%d", ctrl->event_port); > >> + snprintf(buf, sizeof buf, "data/vchan/%d/event-channel", ctrl->device_number); > >> + if (!xs_write(xs, 0, buf, ref, strlen(ref))) > >> + goto fail_xs_open; > >> + if (!xs_set_permissions(xs, 0, buf, perms, 2)) > >> + goto fail_xs_open; > > > > Am I right that the intended usage model is that two domains can decide > > to setup a connection without admin or toolstack involvement? > > > > Do we need to arrange on the toolstack side that a suitable > > vchan-specific directory (or directories) in xenstore exists with > > suitable permissions to allow this to happen exists or do we think data > > is an appropriate location? > > Yes, the intended use is to avoid needing to have management tools involved > in the setup. Of course, that doesn't mean that vchan can't have help from > management tools - but since this help isn't required, adding an unneeded > dependency was pointless and might also imply a level of control that is not > actually present (i.e.
restricting the management tools does not actually > restrict the ability to set up a vchan; that requires something like an XSM > policy blocking the grant or event channels). I picked data because it does > not require toolstack modification to use, and no other location jumped out > at me - vchan isn't really a device. OK. I'm a bit fearful that data may become a bit of a dumping ground (I'm not sure what its intended use/semantics actually are) but that's not the fault of this patch. Ian. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Sep-01 16:47 UTC
[Xen-devel] Re: [PATCH v3] libvchan: interdomain communications library
On 09/01/2011 12:28 PM, Ian Campbell wrote:> On Wed, 2011-08-31 at 20:17 +0100, Daniel De Graaf wrote: >>> [...] >>>> +static int init_gnt_srv(struct libvchan *ctrl) >>>> +{ >>>> + int pages_left = ctrl->read.order >= PAGE_SHIFT ? 1 << (ctrl->read.order - PAGE_SHIFT) : 0; >>>> + int pages_right = ctrl->write.order >= PAGE_SHIFT ? 1 << (ctrl->write.order - PAGE_SHIFT) : 0; >>>> + struct ioctl_gntalloc_alloc_gref *gref_info = NULL; >>>> + int ring_fd = open("/dev/xen/gntalloc", O_RDWR); >>>> + int ring_ref = -1; >>>> + int err; >>>> + void *ring, *area; >>>> + >>>> + if (ring_fd < 0) >>>> + return -1; >>>> + >>>> + gref_info = malloc(sizeof(*gref_info) + max(pages_left, pages_right)*sizeof(uint32_t)); >>>> + >>>> + gref_info->domid = ctrl->other_domain_id; >>>> + gref_info->flags = GNTALLOC_FLAG_WRITABLE; >>>> + gref_info->count = 1; >>>> + >>>> + err = ioctl(ring_fd, IOCTL_GNTALLOC_ALLOC_GREF, gref_info); >>> >>> Unless libvchan is going to be the only user of this interface we should >>> add helpful wrappers to libxc, like we do for gntdev and evtchn. >> >> Adding the wrappers made the library more complex with no other gains when >> it was out-of-tree; for upstreaming, this does make sense. This will result >> in a vchan consuming two file descriptors while it is active because the libxc >> API does not expose the ability to close descriptors without unmapping memory. >> Since that ability is likely to be linux-specific, it''s reasonable to stop >> relying on it for portability reasons. > > I''m not sure I understand (wouldn''t you just add a gntalloc fd to > libvchan and reuse it everywhere?) but if you need a new variant of a > particular libxc interface with different semantics feel free to add it > (or convert an existing function to it if that seems more appropriate).The previous version of libvchan closed the gntalloc file descriptor during the initialization. This is unlikely to be portable when abstracted to close the entire gntshr interface. 
Making this change has exposed an interesting ordering dependency in the notify API under Linux: the file descriptor for gntdev or gntalloc must be less than the file descriptor for evtchn in order for the event channel to still be active when the unmap occurs on a crash. The init functions of libvchan do open the files in the proper order for this to happen.>>>> +#ifdef IOCTL_GNTALLOC_SET_UNMAP_NOTIFY >>>> + { >>>> + struct ioctl_gntalloc_unmap_notify arg; >>>> + arg.index = gref_info->index + offsetof(struct vchan_interface, srv_live); >>>> + arg.action = UNMAP_NOTIFY_CLEAR_BYTE | UNMAP_NOTIFY_SEND_EVENT; >>>> + arg.event_channel_port = ctrl->event_port; >>>> + ioctl(ring_fd, IOCTL_GNTALLOC_SET_UNMAP_NOTIFY, &arg); >>>> + } >>>> +#endif >>> >>> What is the fallback if this isn''t available? >> >> The fallback is that the notify is not sent, and the peer cannot detect when >> its peer crashes or is killed instead of executing a graceful shutdown. >> >> Adding this functionality to libxc requires yet another wrapper on the grant >> mapping functionality. Instead of attempting to pass back the index as is >> done in the current version, I am considering adding the functions >> xc_gnttab_map_grant_ref_notify(xcg, domid, ref, notify_offset, notify_port) and >> xc_gntshr_share_page_notify(xcs, domid, &ref, notify_offset, notify_port); >> these would fall back to xc_gnttab_map_grant_ref if notify is not present. > > You can''t just add the xc_gnttab_notify() as a function which just > registers the notify and use xc_gnttab_map_grant_ref + that new > function?This is possible, but you would need to pass back the index used to mmap or keep metadata within the file descriptor to allow this to be determined. Since the current xc_* mapping interfaces do not expose this index, it would require a larger change to expose this mostly-useless index just for the purpose of passing it to the notify call.>>> [...] 
>>>> static int init_xs_srv(struct libvchan *ctrl, int ring_ref) >>>> +{ >>>> + int ret = -1; >>>> + struct xs_handle *xs; >>>> + struct xs_permissions perms[2]; >>>> + char buf[64]; >>>> + char ref[16]; >>>> + char* domid_str = NULL; >>>> + xs = xs_domain_open(); >>>> + if (!xs) >>>> + goto fail; >>>> + domid_str = xs_read(xs, 0, "domid", NULL); >>>> + if (!domid_str) >>>> + goto fail_xs_open; >>>> + >>>> + // owner domain is us >>>> + perms[0].id = atoi(domid_str); >>> >>> It sucks a bit that xenstore doesn''t appear to allow DOMNID_SELF here >>> but oh well. >> >> On the client side, we need to look up our own domid to find the path >> (if the changes to follow usual xenstore convention are made) so it''s >> required either way. > > How do you mean? relative xenstore accesses are relative > to /local/domain/<domid> so you shouldn''t need to know domid to access > e.g. /local/domain/<domid>/foo/bar -- just access foo/bar instead. >Yes, but the client doesn''t use a path relative to its own domid. It uses /local/domain/<server-id>/data/vchan/<client-id>/<vchan-id>/... Devices work around this problem by having xl or xm fill in paths under both /local/domain/<client-id> and /local/domain/<server-id> pointing to each other; using this style of path is not possible without some side knowing its own domain ID. Is reading "domid" the best method for determining the domain ID of the local domain? 
I noticed in testing that it may need to be set for dom0 if only the xl tools are used in domain creation.>>>> + // permissions for domains not listed = none >>>> + perms[0].perms = XS_PERM_NONE; >>>> + // other domains >>>> + perms[1].id = ctrl->other_domain_id; >>>> + perms[1].perms = XS_PERM_READ; >>>> + >>>> + snprintf(ref, sizeof ref, "%d", ring_ref); >>>> + snprintf(buf, sizeof buf, "data/vchan/%d/ring-ref", ctrl->device_number); >>>> + if (!xs_write(xs, 0, buf, ref, strlen(ref))) >>>> + goto fail_xs_open; >>>> + if (!xs_set_permissions(xs, 0, buf, perms, 2)) >>>> + goto fail_xs_open; >>>> + >>>> + snprintf(ref, sizeof ref, "%d", ctrl->event_port); >>>> + snprintf(buf, sizeof buf, "data/vchan/%d/event-channel", ctrl->device_number); >>>> + if (!xs_write(xs, 0, buf, ref, strlen(ref))) >>>> + goto fail_xs_open; >>>> + if (!xs_set_permissions(xs, 0, buf, perms, 2)) >>>> + goto fail_xs_open; >>> >>> Am I right that the intended usage model is that two domains can decide >>> to setup a connection without admin or toolstack involvement? >>> >>> Do we need to arrange on the toolstack side that a suitable >>> vchan-specific directory (or directories) in xenstore exists with >>> suitable permissions to allow this to happen exists or do we think data >>> is an appropriate location? >> >> Yes, the intended use is to avoid needing to have management tools involved >> in the setup. Of course, that doesn''t mean that vchan can''t have help from >> management tools - but since this help isn''t required, adding an unneeded >> dependency was pointless and might also imply a level of control that is not >> actually present (i.e. restricting the management tools does not actually >> restrict the ability to set up a vchan; that requires something like an XSM >> policy blocking the grant or event channels). I picked data because it does >> not require toolstack modification to use, and no other location jumped out >> at me - vchan isn''t really a device. > > OK. 
I'm a bit fearful that data may become a bit of a dumping ground > (I'm not sure what its intended use/semantics actually are) but that's > not the fault of this patch. > > Ian. > -- Daniel De Graaf National Security Agency _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Jackson
2011-Sep-01 16:56 UTC
[Xen-devel] Re: [PATCH v3] libvchan: interdomain communications library
Daniel De Graaf writes ("Re: [PATCH v3] libvchan: interdomain communications library"): > Making this change has exposed an interesting ordering dependency in the > notify API under Linux: the file descriptor for gntdev or gntalloc must be > less than the file descriptor for evtchn in order for the event channel to > still be active when the unmap occurs on a crash. The init functions of > libvchan do open the files in the proper order for this to happen. Wow, that's pretty crazy. Surely the gnt* fd should have an internal reference to the evtchn fd ? Ian. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Sep-01 17:46 UTC
[Xen-devel] Re: [PATCH v3] libvchan: interdomain communications library
On 09/01/2011 12:56 PM, Ian Jackson wrote: > Daniel De Graaf writes ("Re: [PATCH v3] libvchan: interdomain communications library"): >> Making this change has exposed an interesting ordering dependency in the >> notify API under Linux: the file descriptor for gntdev or gntalloc must be >> less than the file descriptor for evtchn in order for the event channel to >> still be active when the unmap occurs on a crash. The init functions of >> libvchan do open the files in the proper order for this to happen. > > Wow, that's pretty crazy. Surely the gnt* fd should have an internal > reference to the evtchn fd ? > > Ian. > The gnt* drivers will need to be changed to both find and take such a reference; currently, they only refer to the port. This will probably add a dependency from the gnt* module on evtchn; I'll look at what is actually required to hold the event channel open when I make the patch. -- Daniel De Graaf National Security Agency _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Konrad Rzeszutek Wilk
2011-Sep-01 19:24 UTC
[Xen-devel] Re: [PATCH 2/3] libxc: add xc_gntshr_* functions
On Thu, Sep 01, 2011 at 12:22:17PM -0400, Daniel De Graaf wrote:> These functions and the xc_gntshr device (/dev/xen/gntalloc on linux) > allow applications to create pages shared with other domains. > > Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> > --- > tools/libxc/xc_gnttab.c | 27 +++++++++ > tools/libxc/xc_linux_osdep.c | 121 ++++++++++++++++++++++++++++++++++++++++++ > tools/libxc/xc_private.c | 13 +++++ > tools/libxc/xenctrl.h | 48 +++++++++++++++++ > tools/libxc/xenctrlosdep.h | 13 +++++ > 5 files changed, 222 insertions(+), 0 deletions(-) > > diff --git a/tools/libxc/xc_gnttab.c b/tools/libxc/xc_gnttab.c > index dc7aa0c..ffa3550 100644 > --- a/tools/libxc/xc_gnttab.c > +++ b/tools/libxc/xc_gnttab.c > @@ -204,6 +204,33 @@ int xc_gnttab_set_max_grants(xc_gnttab *xcg, uint32_t count) > return xcg->ops->u.gnttab.set_max_grants(xcg, xcg->ops_handle, count); > } > > +void *xc_gntshr_share_pages(xc_gntshr *xcg, uint32_t domid, > + int count, uint32_t *refs, int writable) > +{ > + return xcg->ops->u.gntshr.share_pages(xcg, xcg->ops_handle, domid, > + count, refs, writable); > +} > + > +void *xc_gntshr_share_page_notify(xc_gntshr *xcg, uint32_t domid, > + uint32_t *ref, int writable, > + uint32_t notify_offset, > + evtchn_port_t notify_port) > +{ > + return xcg->ops->u.gntshr.share_page_notify(xcg, xcg->ops_handle, > + domid, ref, writable, notify_offset, notify_port); > +} > + > +/* > + * Unmaps the @count pages starting at @start_address, which were mapped by a > + * call to xc_gntshr_share_*. Never logs. 
> + */ > +int xc_gntshr_munmap(xc_gntshr *xcg, void *start_address, uint32_t count) > +{ > + return xcg->ops->u.gntshr.munmap(xcg, xcg->ops_handle, > + start_address, count); > +} > + > + > /* > * Local variables: > * mode: C > diff --git a/tools/libxc/xc_linux_osdep.c b/tools/libxc/xc_linux_osdep.c > index 8f7718f..871d37c 100644 > --- a/tools/libxc/xc_linux_osdep.c > +++ b/tools/libxc/xc_linux_osdep.c > @@ -34,6 +34,7 @@ > #include <xen/memory.h> > #include <xen/sys/evtchn.h> > #include <xen/sys/gntdev.h> > +#include <xen/sys/gntalloc.h> > > #include "xenctrl.h" > #include "xenctrlosdep.h" > @@ -718,6 +719,124 @@ static struct xc_osdep_ops linux_gnttab_ops = { > }, > }; > > +static xc_osdep_handle linux_gntshr_open(xc_gntshr *xcg) > +{ > + int fd = open(DEVXEN "gntalloc", O_RDWR); > + > + if ( fd == -1 ) > + return XC_OSDEP_OPEN_ERROR; > + > + return (xc_osdep_handle)fd; > +} > + > +static int linux_gntshr_close(xc_gntshr *xcg, xc_osdep_handle h) > +{ > + int fd = (int)h; > + return close(fd); > +} > + > +static void *linux_gntshr_share_pages(xc_gntshr *xch, xc_osdep_handle h, > + uint32_t domid, int count, > + uint32_t *refs, int writable) > +{ > + struct ioctl_gntalloc_alloc_gref *gref_info = NULL; > + int err; > + void *area = NULL; > + gref_info = malloc(sizeof(*gref_info) + count * sizeof(uint32_t)); > + if (!gref_info) > + return NULL; > + gref_info->domid = domid; > + gref_info->flags = writable ? 
GNTALLOC_FLAG_WRITABLE : 0; > + gref_info->count = count; > + > + err = ioctl((int)h, IOCTL_GNTALLOC_ALLOC_GREF, gref_info); > + if (err) { > + PERROR("linux_gntshr_share_pages: ioctl failed"); > + goto out; > + } > + > + area = mmap(NULL, count * XC_PAGE_SIZE, PROT_READ | PROT_WRITE, > + MAP_SHARED, (int)h, gref_info->index); > + > + if (area == MAP_FAILED) { > + area = NULL; > + PERROR("linux_gntshr_share_pages: mmap failed"); > + goto out; > + } > + > + memcpy(refs, gref_info->gref_ids, count * sizeof(uint32_t)); > + out: > + free(gref_info); > + return area; > +} > + > +static void *linux_gntshr_share_page_notify(xc_gntshr *xch, xc_osdep_handle h, > + uint32_t domid, uint32_t *ref, > + int writable, uint32_t notify_offset, > + evtchn_port_t notify_port) > +{ > + struct ioctl_gntalloc_alloc_gref gref_info; > + struct ioctl_gntalloc_unmap_notify notify; > + int err; > + int fd = (int)h; > + void *area = NULL; > + gref_info.domid = domid; > + gref_info.flags = writable ? GNTALLOC_FLAG_WRITABLE : 0; > + gref_info.count = 1; > + > + err = ioctl(fd, IOCTL_GNTALLOC_ALLOC_GREF, &gref_info); > + if (err) { > + PERROR("linux_gntshr_share_page_notify: ioctl failed"); > + goto out; > + } > + > + area = mmap(NULL, XC_PAGE_SIZE, PROT_READ | PROT_WRITE, > + MAP_SHARED, fd, gref_info.index); > + > + if (area == MAP_FAILED) { > + PERROR("linux_gntshr_share_page_notify: mmap failed"); > + area = NULL; > + goto out; > + } > + > + notify.index = gref_info.index; > + notify.action = 0; > + if (notify_offset >= 0) { > + notify.index += notify_offset; > + notify.action |= UNMAP_NOTIFY_CLEAR_BYTE; > + } > + if (notify_port >= 0) { > + notify.event_channel_port = notify_port; > + notify.action |= UNMAP_NOTIFY_SEND_EVENT; > + } > + if (notify.action && ioctl(fd, IOCTL_GNTALLOC_SET_UNMAP_NOTIFY, &notify)) { > + PERROR("linux_gntshr_share_page_notify: ioctl SET_UNMAP_NOTIFY failed"); Should we report to the caller that we can't set it up? 
Say by reporting via a bool (as in through the arguments )?> + } > + > + *ref = gref_info.gref_ids[0]; > + out: > + return area; > +} > + > + > +static int linux_gntshr_munmap(xc_gntshr *xcg, xc_osdep_handle h, > + void *start_address, uint32_t count) > +{ > + return munmap(start_address, count); > +} > + > +static struct xc_osdep_ops linux_gntshr_ops = { > + .open = &linux_gntshr_open, > + .close = &linux_gntshr_close, > + > + .u.gntshr = { > + .share_pages = &linux_gntshr_share_pages, > + .share_page_notify = &linux_gntshr_share_page_notify, > + .munmap = &linux_gntshr_munmap, > + }, > +}; > + > + > static struct xc_osdep_ops *linux_osdep_init(xc_interface *xch, enum xc_osdep_type type) > { > switch ( type ) > @@ -728,6 +847,8 @@ static struct xc_osdep_ops *linux_osdep_init(xc_interface *xch, enum xc_osdep_ty > return &linux_evtchn_ops; > case XC_OSDEP_GNTTAB: > return &linux_gnttab_ops; > + case XC_OSDEP_GNTSHR: > + return &linux_gntshr_ops; > default: > return NULL; > } > diff --git a/tools/libxc/xc_private.c b/tools/libxc/xc_private.c > index 09c8f23..09a91e7 100644 > --- a/tools/libxc/xc_private.c > +++ b/tools/libxc/xc_private.c > @@ -258,6 +258,19 @@ int xc_gnttab_close(xc_gnttab *xcg) > return xc_interface_close_common(xcg); > } > > +xc_gntshr *xc_gntshr_open(xentoollog_logger *logger, > + unsigned open_flags) > +{ > + return xc_interface_open_common(logger, NULL, open_flags, > + XC_OSDEP_GNTSHR); > +} > + > +int xc_gntshr_close(xc_gntshr *xcg) > +{ > + return xc_interface_close_common(xcg); > +} > + > + > static pthread_key_t errbuf_pkey; > static pthread_once_t errbuf_pkey_once = PTHREAD_ONCE_INIT; > > diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h > index 7859571..374c705 100644 > --- a/tools/libxc/xenctrl.h > +++ b/tools/libxc/xenctrl.h > @@ -115,6 +115,7 @@ > typedef struct xc_interface_core xc_interface; > typedef struct xc_interface_core xc_evtchn; > typedef struct xc_interface_core xc_gnttab; > +typedef struct xc_interface_core 
xc_gntshr; > typedef enum xc_error_code xc_error_code; > > > @@ -1400,6 +1401,53 @@ grant_entry_v1_t *xc_gnttab_map_table_v1(xc_interface *xch, int domid, int *gnt_ > grant_entry_v2_t *xc_gnttab_map_table_v2(xc_interface *xch, int domid, int *gnt_num); > /* Sometimes these don''t set errno [fixme], and sometimes they don''t log. */ > > +/* > + * Return an fd onto the grant sharing driver. Logs errors. > + */ > +xc_gntshr *xc_gntshr_open(xentoollog_logger *logger, > + unsigned open_flags); > + > +/* > + * Close a handle previously allocated with xc_gntshr_open(). > + * Never logs errors. > + */ > +int xc_gntshr_close(xc_gntshr *xcg); > + > +/* > + * Creates and shares pages with another domain. > + * > + * @parm xcg a handle to an open grant sharing instance > + * @parm domid the domain to share memory with > + * @parm count the number of pages to share > + * @parm refs the grant references of the pages (output) > + * @parm writable true if the other domain can write to the pages > + * @return local mapping of the pages > + */ > +void *xc_gntshr_share_pages(xc_gntshr *xcg, uint32_t domid, > + int count, uint32_t *refs, int writable); > + > +/* > + * Creates and shares a page with another domain, with unmap notification. > + * > + * @parm xcg a handle to an open grant sharing instance > + * @parm domid the domain to share memory with > + * @parm refs the grant reference of the pages (output) > + * @parm writable true if the other domain can write to the page > + * @parm notify_offset The byte offset in the page to use for unmap > + * notification; -1 for none. > + * @parm notify_port The event channel port to use for unmap notify, or -1 > + * @return local mapping of the page > + */ > +void *xc_gntshr_share_page_notify(xc_gntshr *xcg, uint32_t domid, > + uint32_t *ref, int writable, > + uint32_t notify_offset, > + evtchn_port_t notify_port); > +/* > + * Unmaps the @count pages starting at @start_address, which were mapped by a > + * call to xc_gntshr_share_*. 
Never logs. > + */ > +int xc_gntshr_munmap(xc_gntshr *xcg, void *start_address, uint32_t count); > + > int xc_physdev_map_pirq(xc_interface *xch, > int domid, > int index, > diff --git a/tools/libxc/xenctrlosdep.h b/tools/libxc/xenctrlosdep.h > index 01969c5..e1c1ba5 100644 > --- a/tools/libxc/xenctrlosdep.h > +++ b/tools/libxc/xenctrlosdep.h > @@ -54,6 +54,7 @@ enum xc_osdep_type { > XC_OSDEP_PRIVCMD, > XC_OSDEP_EVTCHN, > XC_OSDEP_GNTTAB, > + XC_OSDEP_GNTSHR, > }; > > /* Opaque handle internal to the backend */ > @@ -129,6 +130,18 @@ struct xc_osdep_ops > uint32_t count); > int (*set_max_grants)(xc_gnttab *xcg, xc_osdep_handle h, uint32_t count); > } gnttab; > + struct { > + void *(*share_pages)(xc_gntshr *xcg, xc_osdep_handle h, > + uint32_t domid, int count, > + uint32_t *refs, int writable); > + void *(*share_page_notify)(xc_gntshr *xcg, xc_osdep_handle h, > + uint32_t domid, > + uint32_t *ref, int writable, > + uint32_t notify_offset, > + evtchn_port_t notify_port); > + int (*munmap)(xc_gntshr *xcg, xc_osdep_handle h, > + void *start_address, uint32_t count); > + } gntshr; > } u; > }; > typedef struct xc_osdep_ops xc_osdep_ops; > -- > 1.7.6_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Konrad Rzeszutek Wilk
2011-Sep-01 19:29 UTC
[Xen-devel] Re: [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify
On Thu, Sep 01, 2011 at 12:22:16PM -0400, Daniel De Graaf wrote:> Normally, when a userspace process mapping a grant crashes, the domain > providing the reference receives no indication that its peer has > crashed, possibly leading to unexpected freezes or timeouts. This > function provides a notification of the unmap by signalling an event > channel and/or clearing a specific byte in the page. > > Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> > --- > tools/libxc/xc_gnttab.c | 15 ++++++++++++ > tools/libxc/xc_linux_osdep.c | 52 ++++++++++++++++++++++++++++++++++++++++++ > tools/libxc/xenctrl.h | 21 +++++++++++++++++ > tools/libxc/xenctrlosdep.h | 5 ++++ > 4 files changed, 93 insertions(+), 0 deletions(-) > > diff --git a/tools/libxc/xc_gnttab.c b/tools/libxc/xc_gnttab.c > index 4f55fce..dc7aa0c 100644 > --- a/tools/libxc/xc_gnttab.c > +++ b/tools/libxc/xc_gnttab.c > @@ -174,6 +174,21 @@ void *xc_gnttab_map_domain_grant_refs(xc_gnttab *xcg, > count, domid, refs, prot); > } > > +void *xc_gnttab_map_grant_ref_notify(xc_gnttab *xcg, > + uint32_t domid, > + uint32_t ref, > + uint32_t notify_offset, > + evtchn_port_t notify_port) > +{ > + if (xcg->ops->u.gnttab.map_grant_ref_notify) > + return xcg->ops->u.gnttab.map_grant_ref_notify(xcg, xcg->ops_handle, > + domid, ref, notify_offset, notify_port); > + else > + return xcg->ops->u.gnttab.map_grant_ref(xcg, xcg->ops_handle, > + domid, ref, PROT_READ|PROT_WRITE); > +} > + > + > int xc_gnttab_munmap(xc_gnttab *xcg, > void *start_address, > uint32_t count) > diff --git a/tools/libxc/xc_linux_osdep.c b/tools/libxc/xc_linux_osdep.c > index dca6718..8f7718f 100644 > --- a/tools/libxc/xc_linux_osdep.c > +++ b/tools/libxc/xc_linux_osdep.c > @@ -613,6 +613,57 @@ static void *linux_gnttab_map_domain_grant_refs(xc_gnttab *xcg, xc_osdep_handle > return do_gnttab_map_grant_refs(xcg, h, count, &domid, 0, refs, prot); > } > > +static void *linux_gnttab_map_grant_ref_notify(xc_gnttab *xch, xc_osdep_handle h, > + uint32_t domid, 
uint32_t ref, > + uint32_t notify_offset, > + evtchn_port_t notify_port) > +{ > + int fd = (int)h; > + struct ioctl_gntdev_map_grant_ref map; > + struct ioctl_gntdev_unmap_notify notify; That looks a bit odd. Like the formatting is off? > + void *addr; > + > + map.count = 1; > + map.refs[0].domid = domid; > + map.refs[0].ref = ref; > + > + if ( ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &map) ) { > + PERROR("xc_gnttab_map_grant_ref: ioctl MAP_GRANT_REF failed"); > + return NULL; > + } > + > + addr = mmap(NULL, XC_PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, map.index); > + if ( addr == MAP_FAILED ) > + { > + int saved_errno = errno; > + struct ioctl_gntdev_unmap_grant_ref unmap_grant; > + > + PERROR("xc_gnttab_map_grant_ref: mmap failed"); > + unmap_grant.index = map.index; > + unmap_grant.count = 1; > + ioctl(fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &unmap_grant); > + errno = saved_errno; > + return NULL; > + } > + > + notify.index = map.index; > + notify.action = 0; > + if (notify_offset >= 0) { > + notify.index += notify_offset; > + notify.action |= UNMAP_NOTIFY_CLEAR_BYTE; > + } > + if (notify_port >= 0) { > + notify.event_channel_port = notify_port; > + notify.action |= UNMAP_NOTIFY_SEND_EVENT; > + } > + if (notify.action && ioctl(fd, IOCTL_GNTDEV_SET_UNMAP_NOTIFY, &notify)) { > + PERROR("linux_gnttab_map_grant_ref_notify: ioctl SET_UNMAP_NOTIFY failed"); Perhaps reporting via an argument that we failed at doing the notify would be useful? 
That way at least you know you need to do polling.> + } > + > + return addr; > +} > + > + > static int linux_gnttab_munmap(xc_gnttab *xcg, xc_osdep_handle h, > void *start_address, uint32_t count) > { > @@ -662,6 +713,7 @@ static struct xc_osdep_ops linux_gnttab_ops = { > .map_grant_ref = &linux_gnttab_map_grant_ref, > .map_grant_refs = &linux_gnttab_map_grant_refs, > .map_domain_grant_refs = &linux_gnttab_map_domain_grant_refs, > + .map_grant_ref_notify = &linux_gnttab_map_grant_ref_notify, > .munmap = &linux_gnttab_munmap, > }, > }; > diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h > index 1b82ee0..7859571 100644 > --- a/tools/libxc/xenctrl.h > +++ b/tools/libxc/xenctrl.h > @@ -1349,6 +1349,27 @@ void *xc_gnttab_map_domain_grant_refs(xc_gnttab *xcg, > int prot); > > /* > + * Memory maps a grant reference from one domain to a local address range. > + * Mappings should be unmapped with xc_gnttab_munmap. Logs errors.^^^^^^^^^^^^ .. that looks odd?> + * This version always maps writable pages, and will attempt to set up > + * an unmap notification at the given offset and event channel. When the > + * page is unmapped, the byte at the given offset will be zeroed and a > + * wakeup will be sent to the given event channel. > + * > + * @parm xcg a handle on an open grant table interface > + * @parm domid the domain to map memory from > + * @parm ref the grant reference ID to map > + * @parm notify_offset The byte offset in the page to use for unmap > + * notification; -1 for none. > + * @parm notify_port The event channel port to use for unmap notify, or -1 > + */ > +void *xc_gnttab_map_grant_ref_notify(xc_gnttab *xcg, > + uint32_t domid, > + uint32_t ref, > + uint32_t notify_offset, > + evtchn_port_t notify_port); > + > +/* > * Unmaps the @count pages starting at @start_address, which were mapped by a > * call to xc_gnttab_map_grant_ref or xc_gnttab_map_grant_refs. Never logs. 
> */ > diff --git a/tools/libxc/xenctrlosdep.h b/tools/libxc/xenctrlosdep.h > index bfe46e0..01969c5 100644 > --- a/tools/libxc/xenctrlosdep.h > +++ b/tools/libxc/xenctrlosdep.h > @@ -119,6 +119,11 @@ struct xc_osdep_ops > uint32_t domid, > uint32_t *refs, > int prot); > + void *(*map_grant_ref_notify)(xc_gnttab *xcg, xc_osdep_handle h, > + uint32_t domid, > + uint32_t ref, > + uint32_t notify_offset, > + evtchn_port_t notify_port); > int (*munmap)(xc_gnttab *xcg, xc_osdep_handle h, > void *start_address, > uint32_t count); > -- > 1.7.6_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Changes since v4: - Notify application of failure to set up unmap notification - Add shared library -rpath to fix build on clean systems - Add Linux system headers in the patches that use them - Formatting cleanups [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify [PATCH 2/3] libxc: add xc_gntshr_* functions [PATCH 3/3] libvchan: interdomain communications library _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Sep-19 22:43 UTC
[Xen-devel] [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify
Normally, when a userspace process mapping a grant crashes, the domain providing the reference receives no indication that its peer has crashed, possibly leading to unexpected freezes or timeouts. This function provides a notification of the unmap by signalling an event channel and/or clearing a specific byte in the page. Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> --- tools/include/xen-sys/Linux/gntdev.h | 33 +++++++++++++++++++- tools/libxc/xc_gnttab.c | 23 +++++++++++++ tools/libxc/xc_linux_osdep.c | 57 ++++++++++++++++++++++++++++++++++ tools/libxc/xenctrl.h | 24 ++++++++++++++ tools/libxc/xenctrlosdep.h | 6 +++ 5 files changed, 142 insertions(+), 1 deletions(-) diff --git a/tools/include/xen-sys/Linux/gntdev.h b/tools/include/xen-sys/Linux/gntdev.h index 8bd1467..5304bd3 100644 --- a/tools/include/xen-sys/Linux/gntdev.h +++ b/tools/include/xen-sys/Linux/gntdev.h @@ -66,7 +66,7 @@ struct ioctl_gntdev_map_grant_ref { * before this ioctl is called, or an error will result. */ #define IOCTL_GNTDEV_UNMAP_GRANT_REF \ -_IOC(_IOC_NONE, ''G'', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) +_IOC(_IOC_NONE, ''G'', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) struct ioctl_gntdev_unmap_grant_ref { /* IN parameters */ /* The offset was returned by the corresponding map operation. */ @@ -116,4 +116,35 @@ struct ioctl_gntdev_set_max_grants { uint32_t count; }; +/* + * Sets up an unmap notification within the page, so that the other side can do + * cleanup if this side crashes. Required to implement cross-domain robust + * mutexes or close notification on communication channels. + * + * Each mapped page only supports one notification; multiple calls referring to + * the same page overwrite the previous notification. You must clear the + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it + * to occur. 
+ */ +#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \ +_IOC(_IOC_NONE, ''G'', 7, sizeof(struct ioctl_gntdev_unmap_notify)) +struct ioctl_gntdev_unmap_notify { + /* IN parameters */ + /* Offset in the file descriptor for a byte within the page (same as + * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to + * be cleared. Otherwise, it can be any byte in the page whose + * notification we are adjusting. + */ + uint64_t index; + /* Action(s) to take on unmap */ + uint32_t action; + /* Event channel to notify */ + uint32_t event_channel_port; +}; + +/* Clear (set to zero) the byte specified by index */ +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 +/* Send an interrupt on the indicated event channel */ +#define UNMAP_NOTIFY_SEND_EVENT 0x2 + #endif /* __LINUX_PUBLIC_GNTDEV_H__ */ diff --git a/tools/libxc/xc_gnttab.c b/tools/libxc/xc_gnttab.c index 4f55fce..3d3c63b 100644 --- a/tools/libxc/xc_gnttab.c +++ b/tools/libxc/xc_gnttab.c @@ -18,6 +18,7 @@ */ #include "xc_private.h" +#include <errno.h> int xc_gnttab_op(xc_interface *xch, int cmd, void * op, int op_size, int count) { @@ -174,6 +175,28 @@ void *xc_gnttab_map_domain_grant_refs(xc_gnttab *xcg, count, domid, refs, prot); } +void *xc_gnttab_map_grant_ref_notify(xc_gnttab *xcg, + uint32_t domid, + uint32_t ref, + uint32_t notify_offset, + evtchn_port_t notify_port, + int *notify_result) +{ + if (xcg->ops->u.gnttab.map_grant_ref_notify) + return xcg->ops->u.gnttab.map_grant_ref_notify(xcg, xcg->ops_handle, + domid, ref, notify_offset, notify_port, notify_result); + else { + void* area = xcg->ops->u.gnttab.map_grant_ref(xcg, xcg->ops_handle, + domid, ref, PROT_READ|PROT_WRITE); + if (area && notify_result) { + *notify_result = -1; + errno = ENOSYS; + } + return area; + } +} + + int xc_gnttab_munmap(xc_gnttab *xcg, void *start_address, uint32_t count) diff --git a/tools/libxc/xc_linux_osdep.c b/tools/libxc/xc_linux_osdep.c index dca6718..3040cb6 100644 --- a/tools/libxc/xc_linux_osdep.c +++ 
b/tools/libxc/xc_linux_osdep.c @@ -613,6 +613,62 @@ static void *linux_gnttab_map_domain_grant_refs(xc_gnttab *xcg, xc_osdep_handle return do_gnttab_map_grant_refs(xcg, h, count, &domid, 0, refs, prot); } +static void *linux_gnttab_map_grant_ref_notify(xc_gnttab *xch, xc_osdep_handle h, + uint32_t domid, uint32_t ref, + uint32_t notify_offset, + evtchn_port_t notify_port, + int *notify_result) +{ + int fd = (int)h; + int rv = 0; + struct ioctl_gntdev_map_grant_ref map; + struct ioctl_gntdev_unmap_notify notify; + void *addr; + + map.count = 1; + map.refs[0].domid = domid; + map.refs[0].ref = ref; + + if ( ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &map) ) { + PERROR("xc_gnttab_map_grant_ref: ioctl MAP_GRANT_REF failed"); + return NULL; + } + + addr = mmap(NULL, XC_PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, map.index); + if ( addr == MAP_FAILED ) + { + int saved_errno = errno; + struct ioctl_gntdev_unmap_grant_ref unmap_grant; + + PERROR("xc_gnttab_map_grant_ref: mmap failed"); + unmap_grant.index = map.index; + unmap_grant.count = 1; + ioctl(fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &unmap_grant); + errno = saved_errno; + return NULL; + } + + notify.index = map.index; + notify.action = 0; + if (notify_offset >= 0) { + notify.index += notify_offset; + notify.action |= UNMAP_NOTIFY_CLEAR_BYTE; + } + if (notify_port >= 0) { + notify.event_channel_port = notify_port; + notify.action |= UNMAP_NOTIFY_SEND_EVENT; + } + if (notify.action) + rv = ioctl(fd, IOCTL_GNTDEV_SET_UNMAP_NOTIFY, &notify); + if (rv) + PERROR("linux_gnttab_map_grant_ref_notify: ioctl SET_UNMAP_NOTIFY failed"); + if (notify_result) + *notify_result = rv; + + return addr; +} + + static int linux_gnttab_munmap(xc_gnttab *xcg, xc_osdep_handle h, void *start_address, uint32_t count) { @@ -662,6 +718,7 @@ static struct xc_osdep_ops linux_gnttab_ops = { .map_grant_ref = &linux_gnttab_map_grant_ref, .map_grant_refs = &linux_gnttab_map_grant_refs, .map_domain_grant_refs = &linux_gnttab_map_domain_grant_refs, + 
.map_grant_ref_notify = &linux_gnttab_map_grant_ref_notify, .munmap = &linux_gnttab_munmap, }, }; diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h index 1b82ee0..02c10fa 100644 --- a/tools/libxc/xenctrl.h +++ b/tools/libxc/xenctrl.h @@ -1349,6 +1349,30 @@ void *xc_gnttab_map_domain_grant_refs(xc_gnttab *xcg, int prot); /* + * Memory maps a grant reference from one domain to a local address range. + * Mappings should be unmapped with xc_gnttab_munmap. This version always maps + * writable pages, and will attempt to set up an unmap notification at the given + * offset and event channel. When the page is unmapped, the byte at the given + * offset will be zeroed and a wakeup will be sent to the given event channel. + * Logs errors. + * + * @parm xcg a handle on an open grant table interface + * @parm domid the domain to map memory from + * @parm ref the grant reference ID to map + * @parm notify_offset The byte offset in the page to use for unmap + * notification; -1 for none. + * @parm notify_port The event channel port to use for unmap notify, or -1 + * @parm notify_result If nonnull, set to 0 if the notify setup succeeded + * or an errno value if not. + */ +void *xc_gnttab_map_grant_ref_notify(xc_gnttab *xcg, + uint32_t domid, + uint32_t ref, + uint32_t notify_offset, + evtchn_port_t notify_port, + int *notify_result); + +/* * Unmaps the @count pages starting at @start_address, which were mapped by a * call to xc_gnttab_map_grant_ref or xc_gnttab_map_grant_refs. Never logs. 
*/ diff --git a/tools/libxc/xenctrlosdep.h b/tools/libxc/xenctrlosdep.h index bfe46e0..bf81538 100644 --- a/tools/libxc/xenctrlosdep.h +++ b/tools/libxc/xenctrlosdep.h @@ -119,6 +119,12 @@ struct xc_osdep_ops uint32_t domid, uint32_t *refs, int prot); + void *(*map_grant_ref_notify)(xc_gnttab *xcg, xc_osdep_handle h, + uint32_t domid, + uint32_t ref, + uint32_t notify_offset, + evtchn_port_t notify_port, + int *notify_result); int (*munmap)(xc_gnttab *xcg, xc_osdep_handle h, void *start_address, uint32_t count); -- 1.7.6.2 _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Sep-19 22:43 UTC
[Xen-devel] [PATCH 2/3] libxc: add xc_gntshr_* functions
These functions and the xc_gntshr device (/dev/xen/gntalloc on linux) allow applications to create pages shared with other domains. Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> --- tools/include/xen-sys/Linux/gntalloc.h | 82 ++++++++++++++++++ tools/libxc/xc_gnttab.c | 29 +++++++ tools/libxc/xc_linux_osdep.c | 142 ++++++++++++++++++++++++++++++++ tools/libxc/xc_private.c | 13 +++ tools/libxc/xenctrl.h | 53 ++++++++++++- tools/libxc/xenctrlosdep.h | 14 +++ 6 files changed, 332 insertions(+), 1 deletions(-) create mode 100644 tools/include/xen-sys/Linux/gntalloc.h diff --git a/tools/include/xen-sys/Linux/gntalloc.h b/tools/include/xen-sys/Linux/gntalloc.h new file mode 100644 index 0000000..76bd580 --- /dev/null +++ b/tools/include/xen-sys/Linux/gntalloc.h @@ -0,0 +1,82 @@ +/****************************************************************************** + * gntalloc.h + * + * Interface to /dev/xen/gntalloc. + * + * Author: Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * This file is in the public domain. + */ + +#ifndef __LINUX_PUBLIC_GNTALLOC_H__ +#define __LINUX_PUBLIC_GNTALLOC_H__ + +/* + * Allocates a new page and creates a new grant reference. + */ +#define IOCTL_GNTALLOC_ALLOC_GREF \ +_IOC(_IOC_NONE, ''G'', 5, sizeof(struct ioctl_gntalloc_alloc_gref)) +struct ioctl_gntalloc_alloc_gref { + /* IN parameters */ + /* The ID of the domain to be given access to the grants. */ + uint16_t domid; + /* Flags for this mapping */ + uint16_t flags; + /* Number of pages to map */ + uint32_t count; + /* OUT parameters */ + /* The offset to be used on a subsequent call to mmap(). */ + uint64_t index; + /* The grant references of the newly created grant, one per page */ + /* Variable size, depending on count */ + uint32_t gref_ids[1]; +}; + +#define GNTALLOC_FLAG_WRITABLE 1 + +/* + * Deallocates the grant reference, allowing the associated page to be freed if + * no other domains are using it. 
+ */ +#define IOCTL_GNTALLOC_DEALLOC_GREF \ +_IOC(_IOC_NONE, ''G'', 6, sizeof(struct ioctl_gntalloc_dealloc_gref)) +struct ioctl_gntalloc_dealloc_gref { + /* IN parameters */ + /* The offset returned in the map operation */ + uint64_t index; + /* Number of references to unmap */ + uint32_t count; +}; + +/* + * Sets up an unmap notification within the page, so that the other side can do + * cleanup if this side crashes. Required to implement cross-domain robust + * mutexes or close notification on communication channels. + * + * Each mapped page only supports one notification; multiple calls referring to + * the same page overwrite the previous notification. You must clear the + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it + * to occur. + */ +#define IOCTL_GNTALLOC_SET_UNMAP_NOTIFY \ +_IOC(_IOC_NONE, ''G'', 7, sizeof(struct ioctl_gntalloc_unmap_notify)) +struct ioctl_gntalloc_unmap_notify { + /* IN parameters */ + /* Offset in the file descriptor for a byte within the page (same as + * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to + * be cleared. Otherwise, it can be any byte in the page whose + * notification we are adjusting. 
+ */ + uint64_t index; + /* Action(s) to take on unmap */ + uint32_t action; + /* Event channel to notify */ + uint32_t event_channel_port; +}; + +/* Clear (set to zero) the byte specified by index */ +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 +/* Send an interrupt on the indicated event channel */ +#define UNMAP_NOTIFY_SEND_EVENT 0x2 + +#endif /* __LINUX_PUBLIC_GNTALLOC_H__ */ diff --git a/tools/libxc/xc_gnttab.c b/tools/libxc/xc_gnttab.c index 3d3c63b..ea8f76f 100644 --- a/tools/libxc/xc_gnttab.c +++ b/tools/libxc/xc_gnttab.c @@ -212,6 +212,35 @@ int xc_gnttab_set_max_grants(xc_gnttab *xcg, uint32_t count) return xcg->ops->u.gnttab.set_max_grants(xcg, xcg->ops_handle, count); } +void *xc_gntshr_share_pages(xc_gntshr *xcg, uint32_t domid, + int count, uint32_t *refs, int writable) +{ + return xcg->ops->u.gntshr.share_pages(xcg, xcg->ops_handle, domid, + count, refs, writable); +} + +void *xc_gntshr_share_page_notify(xc_gntshr *xcg, uint32_t domid, + uint32_t *ref, int writable, + uint32_t notify_offset, + evtchn_port_t notify_port, + int *notify_result) +{ + return xcg->ops->u.gntshr.share_page_notify(xcg, xcg->ops_handle, + domid, ref, writable, notify_offset, notify_port, + notify_result); +} + +/* + * Unmaps the @count pages starting at @start_address, which were mapped by a + * call to xc_gntshr_share_*. Never logs. 
+ */ +int xc_gntshr_munmap(xc_gntshr *xcg, void *start_address, uint32_t count) +{ + return xcg->ops->u.gntshr.munmap(xcg, xcg->ops_handle, + start_address, count); +} + + /* * Local variables: * mode: C diff --git a/tools/libxc/xc_linux_osdep.c b/tools/libxc/xc_linux_osdep.c index 3040cb6..6616357 100644 --- a/tools/libxc/xc_linux_osdep.c +++ b/tools/libxc/xc_linux_osdep.c @@ -34,6 +34,7 @@ #include <xen/memory.h> #include <xen/sys/evtchn.h> #include <xen/sys/gntdev.h> +#include <xen/sys/gntalloc.h> #include "xenctrl.h" #include "xenctrlosdep.h" @@ -723,6 +724,145 @@ static struct xc_osdep_ops linux_gnttab_ops = { }, }; +static xc_osdep_handle linux_gntshr_open(xc_gntshr *xcg) +{ + int fd = open(DEVXEN "gntalloc", O_RDWR); + + if ( fd == -1 ) + return XC_OSDEP_OPEN_ERROR; + + return (xc_osdep_handle)fd; +} + +static int linux_gntshr_close(xc_gntshr *xcg, xc_osdep_handle h) +{ + int fd = (int)h; + return close(fd); +} + +static void *linux_gntshr_share_pages(xc_gntshr *xch, xc_osdep_handle h, + uint32_t domid, int count, + uint32_t *refs, int writable) +{ + struct ioctl_gntalloc_alloc_gref *gref_info = NULL; + struct ioctl_gntalloc_dealloc_gref gref_drop; + int err; + void *area = NULL; + gref_info = malloc(sizeof(*gref_info) + count * sizeof(uint32_t)); + if (!gref_info) + return NULL; + gref_info->domid = domid; + gref_info->flags = writable ? 
GNTALLOC_FLAG_WRITABLE : 0; + gref_info->count = count; + + err = ioctl((int)h, IOCTL_GNTALLOC_ALLOC_GREF, gref_info); + if (err) { + PERROR("linux_gntshr_share_pages: ioctl failed"); + goto out; + } + + area = mmap(NULL, count * XC_PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_SHARED, (int)h, gref_info->index); + + if (area == MAP_FAILED) { + area = NULL; + PERROR("linux_gntshr_share_pages: mmap failed"); + goto out_remove_fdmap; + } + + memcpy(refs, gref_info->gref_ids, count * sizeof(uint32_t)); + + out_remove_fdmap: + /* Removing the mapping from the file descriptor does not cause the pages to + * be deallocated until the mapping is removed. + */ + gref_drop.index = gref_info->index; + gref_drop.count = count; + ioctl((int)h, IOCTL_GNTALLOC_DEALLOC_GREF, &gref_drop); + out: + free(gref_info); + return area; +} + +static void *linux_gntshr_share_page_notify(xc_gntshr *xch, xc_osdep_handle h, + uint32_t domid, uint32_t *ref, + int writable, uint32_t notify_offset, + evtchn_port_t notify_port, + int *notify_result) +{ + struct ioctl_gntalloc_alloc_gref gref_info; + struct ioctl_gntalloc_unmap_notify notify; + struct ioctl_gntalloc_dealloc_gref gref_drop; + int err; + int fd = (int)h; + void *area = NULL; + gref_info.domid = domid; + gref_info.flags = writable ? 
GNTALLOC_FLAG_WRITABLE : 0; + gref_info.count = 1; + + err = ioctl(fd, IOCTL_GNTALLOC_ALLOC_GREF, &gref_info); + if (err) { + PERROR("linux_gntshr_share_page_notify: ioctl failed"); + goto out; + } + + area = mmap(NULL, XC_PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, + fd, gref_info.index); + + if (area == MAP_FAILED) { + PERROR("linux_gntshr_share_page_notify: mmap failed"); + area = NULL; + goto out_remove_fdmap; + } + + notify.index = gref_info.index; + notify.action = 0; + if (notify_offset >= 0) { + notify.index += notify_offset; + notify.action |= UNMAP_NOTIFY_CLEAR_BYTE; + } + if (notify_port >= 0) { + notify.event_channel_port = notify_port; + notify.action |= UNMAP_NOTIFY_SEND_EVENT; + } + if (notify.action) + err = ioctl(fd, IOCTL_GNTALLOC_SET_UNMAP_NOTIFY, &notify); + if (err) + PERROR("linux_gntshr_share_page_notify: ioctl SET_UNMAP_NOTIFY failed"); + if (notify_result) + *notify_result = err; + + *ref = gref_info.gref_ids[0]; + out_remove_fdmap: + /* Removing the mapping from the file descriptor does not cause the pages to + * be deallocated until the mapping is removed. 
+ */ + gref_drop.index = gref_info.index; + gref_drop.count = 1; + ioctl((int)h, IOCTL_GNTALLOC_DEALLOC_GREF, &gref_drop); + out: + return area; +} + + +static int linux_gntshr_munmap(xc_gntshr *xcg, xc_osdep_handle h, + void *start_address, uint32_t count) +{ + return munmap(start_address, count); +} + +static struct xc_osdep_ops linux_gntshr_ops = { + .open = &linux_gntshr_open, + .close = &linux_gntshr_close, + + .u.gntshr = { + .share_pages = &linux_gntshr_share_pages, + .share_page_notify = &linux_gntshr_share_page_notify, + .munmap = &linux_gntshr_munmap, + }, +}; + + static struct xc_osdep_ops *linux_osdep_init(xc_interface *xch, enum xc_osdep_type type) { switch ( type ) @@ -733,6 +873,8 @@ static struct xc_osdep_ops *linux_osdep_init(xc_interface *xch, enum xc_osdep_ty return &linux_evtchn_ops; case XC_OSDEP_GNTTAB: return &linux_gnttab_ops; + case XC_OSDEP_GNTSHR: + return &linux_gntshr_ops; default: return NULL; } diff --git a/tools/libxc/xc_private.c b/tools/libxc/xc_private.c index 09c8f23..09a91e7 100644 --- a/tools/libxc/xc_private.c +++ b/tools/libxc/xc_private.c @@ -258,6 +258,19 @@ int xc_gnttab_close(xc_gnttab *xcg) return xc_interface_close_common(xcg); } +xc_gntshr *xc_gntshr_open(xentoollog_logger *logger, + unsigned open_flags) +{ + return xc_interface_open_common(logger, NULL, open_flags, + XC_OSDEP_GNTSHR); +} + +int xc_gntshr_close(xc_gntshr *xcg) +{ + return xc_interface_close_common(xcg); +} + + static pthread_key_t errbuf_pkey; static pthread_once_t errbuf_pkey_once = PTHREAD_ONCE_INIT; diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h index 02c10fa..7fefa67 100644 --- a/tools/libxc/xenctrl.h +++ b/tools/libxc/xenctrl.h @@ -115,6 +115,7 @@ typedef struct xc_interface_core xc_interface; typedef struct xc_interface_core xc_evtchn; typedef struct xc_interface_core xc_gnttab; +typedef struct xc_interface_core xc_gntshr; typedef enum xc_error_code xc_error_code; @@ -1363,7 +1364,7 @@ void *xc_gnttab_map_domain_grant_refs(xc_gnttab 
*xcg, * notification; -1 for none. * @parm notify_port The event channel port to use for unmap notify, or -1 * @parm notify_result If nonnull, set to 0 if the notify setup succeeded - * or an errno value if not. + * or -1 if not. */ void *xc_gnttab_map_grant_ref_notify(xc_gnttab *xcg, uint32_t domid, @@ -1403,6 +1404,56 @@ grant_entry_v1_t *xc_gnttab_map_table_v1(xc_interface *xch, int domid, int *gnt_ grant_entry_v2_t *xc_gnttab_map_table_v2(xc_interface *xch, int domid, int *gnt_num); /* Sometimes these don't set errno [fixme], and sometimes they don't log. */ +/* + * Return an fd onto the grant sharing driver. Logs errors. + */ +xc_gntshr *xc_gntshr_open(xentoollog_logger *logger, + unsigned open_flags); + +/* + * Close a handle previously allocated with xc_gntshr_open(). + * Never logs errors. + */ +int xc_gntshr_close(xc_gntshr *xcg); + +/* + * Creates and shares pages with another domain. + * + * @parm xcg a handle to an open grant sharing instance + * @parm domid the domain to share memory with + * @parm count the number of pages to share + * @parm refs the grant references of the pages (output) + * @parm writable true if the other domain can write to the pages + * @return local mapping of the pages + */ +void *xc_gntshr_share_pages(xc_gntshr *xcg, uint32_t domid, + int count, uint32_t *refs, int writable); + +/* + * Creates and shares a page with another domain, with unmap notification. + * + * @parm xcg a handle to an open grant sharing instance + * @parm domid the domain to share memory with + * @parm refs the grant reference of the pages (output) + * @parm writable true if the other domain can write to the page + * @parm notify_offset The byte offset in the page to use for unmap + * notification; -1 for none. + * @parm notify_port The event channel port to use for unmap notify, or -1 + * @parm notify_result If nonnull, set to 0 if the notify setup succeeded + * or -1 if not. 
+ * @return local mapping of the page + */ +void *xc_gntshr_share_page_notify(xc_gntshr *xcg, uint32_t domid, + uint32_t *ref, int writable, + uint32_t notify_offset, + evtchn_port_t notify_port, + int *notify_result); +/* + * Unmaps the @count pages starting at @start_address, which were mapped by a + * call to xc_gntshr_share_*. Never logs. + */ +int xc_gntshr_munmap(xc_gntshr *xcg, void *start_address, uint32_t count); + int xc_physdev_map_pirq(xc_interface *xch, int domid, int index, diff --git a/tools/libxc/xenctrlosdep.h b/tools/libxc/xenctrlosdep.h index bf81538..17edc39 100644 --- a/tools/libxc/xenctrlosdep.h +++ b/tools/libxc/xenctrlosdep.h @@ -54,6 +54,7 @@ enum xc_osdep_type { XC_OSDEP_PRIVCMD, XC_OSDEP_EVTCHN, XC_OSDEP_GNTTAB, + XC_OSDEP_GNTSHR, }; /* Opaque handle internal to the backend */ @@ -130,6 +131,19 @@ struct xc_osdep_ops uint32_t count); int (*set_max_grants)(xc_gnttab *xcg, xc_osdep_handle h, uint32_t count); } gnttab; + struct { + void *(*share_pages)(xc_gntshr *xcg, xc_osdep_handle h, + uint32_t domid, int count, + uint32_t *refs, int writable); + void *(*share_page_notify)(xc_gntshr *xcg, xc_osdep_handle h, + uint32_t domid, + uint32_t *ref, int writable, + uint32_t notify_offset, + evtchn_port_t notify_port, + int *notify_result); + int (*munmap)(xc_gntshr *xcg, xc_osdep_handle h, + void *start_address, uint32_t count); + } gntshr; } u; }; typedef struct xc_osdep_ops xc_osdep_ops; -- 1.7.6.2 _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Sep-19 22:43 UTC
[Xen-devel] [PATCH 3/3] libvchan: interdomain communications library
This library implements a bidirectional communication interface between applications in different domains, similar to unix sockets. Data can be sent using the byte-oriented libvchan_read/libvchan_write or the packet-oriented libvchan_recv/libvchan_send. Channel setup is done using a client-server model; domain IDs and a port number must be negotiated prior to initialization. The server allocates memory for the shared pages and determines the sizes of the communication rings (which may span multiple pages, although the default places rings and control within a single page). With properly sized rings, testing has shown that this interface provides speed comparable to pipes within a single Linux domain; it is significantly faster than network-based communication. Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> --- tools/Makefile | 1 + tools/Rules.mk | 5 + tools/libvchan/Makefile | 59 ++++++ tools/libvchan/init.c | 397 ++++++++++++++++++++++++++++++++++++++ tools/libvchan/io.c | 375 +++++++++++++++++++++++++++++++++++ tools/libvchan/libxenvchan.h | 173 +++++++++++++++++ tools/libvchan/node-select.c | 162 ++++++++++++++++ tools/libvchan/node.c | 169 ++++++++++++++++ xen/include/public/io/libvchan.h | 97 +++++++++ 9 files changed, 1438 insertions(+), 0 deletions(-) create mode 100644 tools/libvchan/Makefile create mode 100644 tools/libvchan/init.c create mode 100644 tools/libvchan/io.c create mode 100644 tools/libvchan/libxenvchan.h create mode 100644 tools/libvchan/node-select.c create mode 100644 tools/libvchan/node.c create mode 100644 xen/include/public/io/libvchan.h diff --git a/tools/Makefile b/tools/Makefile index df6270c..9389e1f 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -27,6 +27,7 @@ SUBDIRS-$(CONFIG_NetBSD) += blktap2 SUBDIRS-$(CONFIG_NetBSD) += xenbackendd SUBDIRS-y += libfsimage SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen +SUBDIRS-y += libvchan # do not recurse in to a dir we are about to delete ifneq "$(MAKECMDGOALS)" "distclean" diff --git 
a/tools/Rules.mk b/tools/Rules.mk index 0d048af..49125f5 100644 --- a/tools/Rules.mk +++ b/tools/Rules.mk @@ -14,6 +14,7 @@ XEN_XENLIGHT = $(XEN_ROOT)/tools/libxl XEN_XENSTORE = $(XEN_ROOT)/tools/xenstore XEN_LIBXENSTAT = $(XEN_ROOT)/tools/xenstat/libxenstat/src XEN_BLKTAP2 = $(XEN_ROOT)/tools/blktap2 +XEN_LIBVCHAN = $(XEN_ROOT)/tools/libvchan CFLAGS_xeninclude = -I$(XEN_INCLUDE) @@ -33,6 +34,10 @@ CFLAGS_libxenstat = -I$(XEN_LIBXENSTAT) LDLIBS_libxenstat = $(SHLIB_libxenctrl) $(SHLIB_libxenstore) -L$(XEN_LIBXENSTAT) -lxenstat SHLIB_libxenstat = -Wl,-rpath-link=$(XEN_LIBXENSTAT) +CFLAGS_libxenvchan = -I$(XEN_LIBVCHAN) +LDLIBS_libxenvchan = $(SHLIB_libxenctrl) $(SHLIB_libxenstore) -L$(XEN_LIBVCHAN) -lxenvchan +SHLIB_libxenvchan = -Wl,-rpath-link=$(XEN_LIBVCHAN) + ifeq ($(CONFIG_Linux),y) LIBXL_BLKTAP = y else diff --git a/tools/libvchan/Makefile b/tools/libvchan/Makefile new file mode 100644 index 0000000..daf3593 --- /dev/null +++ b/tools/libvchan/Makefile @@ -0,0 +1,59 @@ +# +# tools/libvchan/Makefile +# + +XEN_ROOT = $(CURDIR)/../.. +include $(XEN_ROOT)/tools/Rules.mk + +LIBVCHAN_OBJS = init.o io.o +NODE_OBJS = node.o +NODE2_OBJS = node-select.o + +LIBVCHAN_PIC_OBJS = $(patsubst %.o,%.opic,$(LIBVCHAN_OBJS)) +LIBVCHAN_LIBS = $(LDLIBS_libxenstore) $(LDLIBS_libxenctrl) +$(LIBVCHAN_OBJS) $(LIBVCHAN_PIC_OBJS): CFLAGS += $(CFLAGS_libxenstore) $(CFLAGS_libxenctrl) +$(NODE_OBJS) $(NODE2_OBJS): CFLAGS += $(CFLAGS_libxenctrl) + +MAJOR = 1.0 +MINOR = 0 + +CFLAGS += -I../include -I. 
+ +.PHONY: all +all: libxenvchan.so vchan-node1 vchan-node2 libxenvchan.a + +libxenvchan.so: libxenvchan.so.$(MAJOR) + ln -sf $< $@ + +libxenvchan.so.$(MAJOR): libxenvchan.so.$(MAJOR).$(MINOR) + ln -sf $< $@ + +libxenvchan.so.$(MAJOR).$(MINOR): $(LIBVCHAN_PIC_OBJS) + $(CC) $(LDFLAGS) -Wl,$(SONAME_LDFLAG) -Wl,libxenvchan.so.$(MAJOR) $(SHLIB_LDFLAGS) -o $@ $^ $(LIBVCHAN_LIBS) + +libxenvchan.a: $(LIBVCHAN_OBJS) + $(AR) rcs libxenvchan.a $^ + +vchan-node1: $(NODE_OBJS) libxenvchan.so + $(CC) $(LDFLAGS) -o $@ $(NODE_OBJS) $(LDLIBS_libxenvchan) + +vchan-node2: $(NODE2_OBJS) libxenvchan.so + $(CC) $(LDFLAGS) -o $@ $(NODE2_OBJS) $(LDLIBS_libxenvchan) + +.PHONY: install +install: all + $(INSTALL_DIR) $(DESTDIR)$(LIBDIR) + $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR) + $(INSTALL_PROG) libxenvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR) + ln -sf libxenvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libxenvchan.so.$(MAJOR) + ln -sf libxenvchan.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libxenvchan.so + $(INSTALL_DATA) libxenvchan.h $(DESTDIR)$(INCLUDEDIR) + $(INSTALL_DATA) libxenvchan.a $(DESTDIR)$(LIBDIR) + +.PHONY: clean +clean: + $(RM) -f *.o *.so* *.a vchan-node1 vchan-node2 $(DEPS) + +distclean: clean + +-include $(DEPS) diff --git a/tools/libvchan/init.c b/tools/libvchan/init.c new file mode 100644 index 0000000..c9ad883 --- /dev/null +++ b/tools/libvchan/init.c @@ -0,0 +1,397 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @section DESCRIPTION + * + * This file contains the setup code used to establish the ring buffer. + */ + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/user.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> + +#include <xs.h> +#include <xen/sys/evtchn.h> +#include <xen/sys/gntalloc.h> +#include <xen/sys/gntdev.h> +#include <libxenvchan.h> + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT 12 +#endif + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +#define max(a,b) ((a > b) ? a : b) + +static int init_gnt_srv(struct libvchan *ctrl) +{ + int pages_left = ctrl->read.order >= PAGE_SHIFT ? 1 << (ctrl->read.order - PAGE_SHIFT) : 0; + int pages_right = ctrl->write.order >= PAGE_SHIFT ? 
1 << (ctrl->write.order - PAGE_SHIFT) : 0; + uint32_t ring_ref = -1; + void *ring; + + ring = xc_gntshr_share_page_notify(ctrl->gntshr, ctrl->other_domain_id, + &ring_ref, 1, offsetof(struct vchan_interface, srv_live), + ctrl->event_port, NULL); + + if (!ring) + goto out; + + memset(ring, 0, PAGE_SIZE); + + ctrl->ring = ring; + ctrl->read.shr = &ctrl->ring->left; + ctrl->write.shr = &ctrl->ring->right; + ctrl->ring->left_order = ctrl->read.order; + ctrl->ring->right_order = ctrl->write.order; + ctrl->ring->cli_live = 2; + ctrl->ring->srv_live = 1; + ctrl->ring->cli_notify = VCHAN_NOTIFY_WRITE; + + if (ctrl->read.order == 10) { + ctrl->read.buffer = ((void*)ctrl->ring) + 1024; + } else if (ctrl->read.order == 11) { + ctrl->read.buffer = ((void*)ctrl->ring) + 2048; + } else { + ctrl->read.buffer = xc_gntshr_share_pages(ctrl->gntshr, ctrl->other_domain_id, + pages_left, ctrl->ring->grants, 1); + if (!ctrl->read.buffer) + goto out_ring; + } + + if (ctrl->write.order == 10) { + ctrl->write.buffer = ((void*)ctrl->ring) + 1024; + } else if (ctrl->write.order == 11) { + ctrl->write.buffer = ((void*)ctrl->ring) + 2048; + } else { + ctrl->write.buffer = xc_gntshr_share_pages(ctrl->gntshr, ctrl->other_domain_id, + pages_right, ctrl->ring->grants + pages_left, 1); + if (!ctrl->write.buffer) + goto out_unmap_left; + } + +out: + return ring_ref; +out_unmap_left: + if (ctrl->read.order > 11) + xc_gntshr_munmap(ctrl->gntshr, ctrl->read.buffer, pages_left * PAGE_SIZE); +out_ring: + xc_gntshr_munmap(ctrl->gntshr, ring, PAGE_SIZE); + ring_ref = -1; + ctrl->ring = NULL; + ctrl->write.order = ctrl->read.order = 0; + goto out; +} + +static int init_gnt_cli(struct libvchan *ctrl, uint32_t ring_ref) +{ + int rv = -1; + uint32_t *grants; + + ctrl->ring = xc_gnttab_map_grant_ref_notify(ctrl->gnttab, + ctrl->other_domain_id, ring_ref, + offsetof(struct vchan_interface, cli_live), ctrl->event_port, + NULL); + + if (!ctrl->ring) + goto out; + + ctrl->write.order = ctrl->ring->left_order; + 
ctrl->read.order = ctrl->ring->right_order; + ctrl->write.shr = &ctrl->ring->left; + ctrl->read.shr = &ctrl->ring->right; + if (ctrl->write.order < 10 || ctrl->write.order > 24) + goto out_unmap_ring; + if (ctrl->read.order < 10 || ctrl->read.order > 24) + goto out_unmap_ring; + if (ctrl->read.order == ctrl->write.order && ctrl->read.order < 12) + goto out_unmap_ring; + + grants = ctrl->ring->grants; + + if (ctrl->write.order == 10) { + ctrl->write.buffer = ((void*)ctrl->ring) + 1024; + } else if (ctrl->write.order == 11) { + ctrl->write.buffer = ((void*)ctrl->ring) + 2048; + } else { + int pages_left = 1 << (ctrl->write.order - PAGE_SHIFT); + ctrl->write.buffer = xc_gnttab_map_domain_grant_refs(ctrl->gnttab, + pages_left, ctrl->other_domain_id, grants, PROT_READ|PROT_WRITE); + if (!ctrl->write.buffer) + goto out_unmap_ring; + grants += pages_left; + } + + if (ctrl->read.order == 10) { + ctrl->read.buffer = ((void*)ctrl->ring) + 1024; + } else if (ctrl->read.order == 11) { + ctrl->read.buffer = ((void*)ctrl->ring) + 2048; + } else { + int pages_right = 1 << (ctrl->read.order - PAGE_SHIFT); + ctrl->read.buffer = xc_gnttab_map_domain_grant_refs(ctrl->gnttab, + pages_right, ctrl->other_domain_id, grants, PROT_READ); + if (!ctrl->read.buffer) + goto out_unmap_left; + } + + rv = 0; + out: + return rv; + out_unmap_left: + if (ctrl->write.order >= PAGE_SHIFT) + xc_gnttab_munmap(ctrl->gnttab, ctrl->write.buffer, + 1 << ctrl->write.order); + out_unmap_ring: + xc_gnttab_munmap(ctrl->gnttab, ctrl->ring, PAGE_SIZE); + ctrl->ring = 0; + ctrl->write.order = ctrl->read.order = 0; + rv = -1; + goto out; +} + +static int init_evt_srv(struct libvchan *ctrl, xentoollog_logger *logger) +{ + ctrl->event = xc_evtchn_open(logger, 0); + if (!ctrl->event) + return -1; + ctrl->event_port = xc_evtchn_bind_unbound_port(ctrl->event, ctrl->other_domain_id); + if (ctrl->event_port < 0) + return -1; + if (xc_evtchn_unmask(ctrl->event, ctrl->event_port)) + return -1; + return 0; +} + +static int 
init_xs_srv(struct libvchan *ctrl, int ring_ref) +{ + int ret = -1; + struct xs_handle *xs; + struct xs_permissions perms[2]; + char buf[64]; + char ref[16]; + char* domid_str = NULL; + xs = xs_domain_open(); + if (!xs) + goto fail; + domid_str = xs_read(xs, 0, "domid", NULL); + if (!domid_str) + goto fail_xs_open; + + // owner domain is us + perms[0].id = atoi(domid_str); + // permissions for domains not listed = none + perms[0].perms = XS_PERM_NONE; + // other domains + perms[1].id = ctrl->other_domain_id; + perms[1].perms = XS_PERM_READ; + + snprintf(ref, sizeof ref, "%d", ring_ref); + snprintf(buf, sizeof buf, "data/vchan/%d/%d/ring-ref", ctrl->other_domain_id, ctrl->device_number); + if (!xs_write(xs, 0, buf, ref, strlen(ref))) + goto fail_xs_open; + if (!xs_set_permissions(xs, 0, buf, perms, 2)) + goto fail_xs_open; + + snprintf(ref, sizeof ref, "%d", ctrl->event_port); + snprintf(buf, sizeof buf, "data/vchan/%d/%d/event-channel", ctrl->other_domain_id, ctrl->device_number); + if (!xs_write(xs, 0, buf, ref, strlen(ref))) + goto fail_xs_open; + if (!xs_set_permissions(xs, 0, buf, perms, 2)) + goto fail_xs_open; + + ret = 0; + fail_xs_open: + free(domid_str); + xs_daemon_close(xs); + fail: + return ret; +} + +static int min_order(size_t siz) +{ + int rv = PAGE_SHIFT; + while (siz > (1 << rv)) + rv++; + return rv; +} + +struct libvchan *libvchan_server_init(xentoollog_logger *logger, int domain, int devno, size_t left_min, size_t right_min) +{ + // if you go over this size, you'll have too many grants to fit in the shared page. 
+ size_t MAX_RING_SIZE = 256 * PAGE_SIZE; + struct libvchan *ctrl; + int ring_ref; + if (left_min > MAX_RING_SIZE || right_min > MAX_RING_SIZE) + return 0; + + ctrl = malloc(sizeof(*ctrl)); + if (!ctrl) + return 0; + + ctrl->other_domain_id = domain; + ctrl->device_number = devno; + ctrl->ring = NULL; + ctrl->event = NULL; + ctrl->is_server = 1; + ctrl->server_persist = 0; + + ctrl->read.order = min_order(left_min); + ctrl->write.order = min_order(right_min); + + // if we can avoid allocating extra pages by using in-page rings, do so +#define MAX_SMALL_RING 1024 +#define MAX_LARGE_RING 2048 + if (left_min <= MAX_SMALL_RING && right_min <= MAX_LARGE_RING) { + ctrl->read.order = 10; + ctrl->write.order = 11; + } else if (left_min <= MAX_LARGE_RING && right_min <= MAX_SMALL_RING) { + ctrl->read.order = 11; + ctrl->write.order = 10; + } else if (left_min <= MAX_LARGE_RING) { + ctrl->read.order = 11; + } else if (right_min <= MAX_LARGE_RING) { + ctrl->write.order = 11; + } + + ctrl->gntshr = xc_gntshr_open(logger, 0); + if (!ctrl->gntshr) + goto out; + + if (init_evt_srv(ctrl, logger)) + goto out; + ring_ref = init_gnt_srv(ctrl); + if (ring_ref < 0) + goto out; + if (init_xs_srv(ctrl, ring_ref)) + goto out; + return ctrl; +out: + libvchan_close(ctrl); + return 0; +} + +static int init_evt_cli(struct libvchan *ctrl, xentoollog_logger *logger) +{ + ctrl->event = xc_evtchn_open(logger, 0); + if (!ctrl->event) + return -1; + ctrl->event_port = xc_evtchn_bind_interdomain(ctrl->event, + ctrl->other_domain_id, ctrl->event_port); + if (ctrl->event_port < 0) + return -1; + xc_evtchn_unmask(ctrl->event, ctrl->event_port); + return 0; +} + + +struct libvchan *libvchan_client_init(xentoollog_logger *logger, int domain, int devno) +{ + struct libvchan *ctrl = malloc(sizeof(struct libvchan)); + struct xs_handle *xs = NULL; + char buf[64]; + char *ref; + int ring_ref; + unsigned int len; + char* domid_str = NULL; + + if (!ctrl) + return 0; + ctrl->other_domain_id = domain; + 
ctrl->device_number = devno; + ctrl->ring = NULL; + ctrl->event = NULL; + ctrl->write.order = ctrl->read.order = 0; + ctrl->is_server = 0; + + xs = xs_daemon_open(); + if (!xs) + xs = xs_domain_open(); + if (!xs) + goto fail; + + domid_str = xs_read(xs, 0, "domid", NULL); + if (!domid_str) + goto fail; + +// find xenstore entry + snprintf(buf, sizeof buf, "/local/domain/%d/data/vchan/%s/%d/ring-ref", + ctrl->other_domain_id, domid_str, ctrl->device_number); + ref = xs_read(xs, 0, buf, &len); + if (!ref) + goto fail; + ring_ref = atoi(ref); + free(ref); + if (!ring_ref) + goto fail; + snprintf(buf, sizeof buf, "/local/domain/%d/data/vchan/%s/%d/event-channel", + ctrl->other_domain_id, domid_str, ctrl->device_number); + ref = xs_read(xs, 0, buf, &len); + if (!ref) + goto fail; + ctrl->event_port = atoi(ref); + free(ref); + if (!ctrl->event_port) + goto fail; + + ctrl->gnttab = xc_gnttab_open(logger, 0); + if (!ctrl->gnttab) + goto out; + +// set up event channel + if (init_evt_cli(ctrl, logger)) + goto fail; + +// set up shared page(s) + if (init_gnt_cli(ctrl, ring_ref)) + goto fail; + + ctrl->ring->cli_live = 1; + ctrl->ring->srv_notify = VCHAN_NOTIFY_WRITE; + + out: + free(domid_str); + if (xs) + xs_daemon_close(xs); + return ctrl; + fail: + libvchan_close(ctrl); + ctrl = NULL; + goto out; +} diff --git a/tools/libvchan/io.c b/tools/libvchan/io.c new file mode 100644 index 0000000..08d5dcf --- /dev/null +++ b/tools/libvchan/io.c @@ -0,0 +1,375 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @section DESCRIPTION + * + * This file contains the communications interface built on the ring buffer. + */ + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/uio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <unistd.h> + +#include <xenctrl.h> +#include <libxenvchan.h> + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT 12 +#endif + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +// allow vchan data to be easily observed in strace by doing a +// writev() to FD -1 with the data being read/written. 
+#ifndef VCHAN_DEBUG +#define VCHAN_DEBUG 0 +#endif + +#define barrier() asm volatile("" ::: "memory") + + +static inline uint32_t rd_prod(struct libvchan *ctrl) +{ + return ctrl->read.shr->prod; +} + +static inline uint32_t* _rd_cons(struct libvchan *ctrl) +{ + return &ctrl->read.shr->cons; +} +#define rd_cons(x) (*_rd_cons(x)) + +static inline uint32_t* _wr_prod(struct libvchan *ctrl) +{ + return &ctrl->write.shr->prod; +} +#define wr_prod(x) (*_wr_prod(x)) + +static inline uint32_t wr_cons(struct libvchan *ctrl) +{ + return ctrl->write.shr->cons; +} + +static inline const void* rd_ring(struct libvchan *ctrl) +{ + return ctrl->read.buffer; +} + +static inline void* wr_ring(struct libvchan *ctrl) +{ + return ctrl->write.buffer; +} + +static inline uint32_t wr_ring_size(struct libvchan *ctrl) +{ + return (1 << ctrl->write.order); +} + +static inline uint32_t rd_ring_size(struct libvchan *ctrl) +{ + return (1 << ctrl->read.order); +} + +static inline void request_notify(struct libvchan *ctrl, uint8_t bit) +{ + uint8_t *notify = ctrl->is_server ? &ctrl->ring->cli_notify : &ctrl->ring->srv_notify; + __sync_or_and_fetch(notify, bit); +} + +static inline int send_notify(struct libvchan *ctrl, uint8_t bit) +{ + uint8_t *notify = ctrl->is_server ? &ctrl->ring->srv_notify : &ctrl->ring->cli_notify; + uint8_t prev = __sync_fetch_and_and(notify, ~bit); + if (prev & bit) + return xc_evtchn_notify(ctrl->event, ctrl->event_port); + else + return 0; +} + +/** + * Get the amount of data ready to be read and enable notifications if needed. 
+ */ +static inline int fast_get_data_ready(struct libvchan *ctrl, size_t request) +{ + int ready = rd_prod(ctrl) - rd_cons(ctrl); + if (ready >= request) + return ready; + /* We plan to consume all data; please tell us if you send more */ + request_notify(ctrl, VCHAN_NOTIFY_WRITE); + /* + * If the writer moved rd_prod after our read but before request, we + * will not get notified even though the actual amount of data ready is + * above request. Reread rd_prod to cover this case. + */ + return rd_prod(ctrl) - rd_cons(ctrl); +} + +int libvchan_data_ready(struct libvchan *ctrl) +{ + /* Since this value is being used outside libvchan, request notification + * when it changes + */ + request_notify(ctrl, VCHAN_NOTIFY_WRITE); + return rd_prod(ctrl) - rd_cons(ctrl); +} + +/** + * Get the amount of buffer space available and enable notifications if needed. + */ +static inline int fast_get_buffer_space(struct libvchan *ctrl, size_t request) +{ + int ready = wr_ring_size(ctrl) - (wr_prod(ctrl) - wr_cons(ctrl)); + if (ready >= request) + return ready; + /* We plan to fill the buffer; please tell us when you've read it */ + request_notify(ctrl, VCHAN_NOTIFY_READ); + /* + * If the reader moved wr_cons after our read but before request, we + * will not get notified even though the actual amount of buffer space + * is above request. Reread wr_cons to cover this case. 
+ */ + return wr_ring_size(ctrl) - (wr_prod(ctrl) - wr_cons(ctrl)); +} + +int libvchan_buffer_space(struct libvchan *ctrl) +{ + /* Since this value is being used outside libvchan, request notification + * when it changes + */ + request_notify(ctrl, VCHAN_NOTIFY_READ); + return wr_ring_size(ctrl) - (wr_prod(ctrl) - wr_cons(ctrl)); +} + +int libvchan_wait(struct libvchan *ctrl) +{ + int ret = xc_evtchn_pending(ctrl->event); + if (ret < 0) + return -1; + xc_evtchn_unmask(ctrl->event, ret); + return 0; +} + +/** + * returns -1 on error, or size on success + */ +static int do_send(struct libvchan *ctrl, const void *data, size_t size) +{ + int real_idx = wr_prod(ctrl) & (wr_ring_size(ctrl) - 1); + int avail_contig = wr_ring_size(ctrl) - real_idx; + if (VCHAN_DEBUG) { + char metainfo[32]; + struct iovec iov[2]; + iov[0].iov_base = metainfo; + iov[0].iov_len = snprintf(metainfo, 32, "vchan wr %d/%d", ctrl->other_domain_id, ctrl->device_number); + iov[1].iov_base = (void *)data; + iov[1].iov_len = size; + writev(-1, iov, 2); + } + if (avail_contig > size) + avail_contig = size; + memcpy(wr_ring(ctrl) + real_idx, data, avail_contig); + if (avail_contig < size) + { + // we rolled across the end of the ring + memcpy(wr_ring(ctrl), data + avail_contig, size - avail_contig); + } + barrier(); // data must be in the ring prior to increment + wr_prod(ctrl) += size; + barrier(); // increment must happen prior to notify + if (send_notify(ctrl, VCHAN_NOTIFY_WRITE)) + return -1; + return size; +} + +/** + * returns 0 if no buffer space is available, -1 on error, or size on success + */ +int libvchan_send(struct libvchan *ctrl, const void *data, size_t size) +{ + int avail; + while (1) { + if (!libvchan_is_open(ctrl)) + return -1; + avail = fast_get_buffer_space(ctrl, size); + if (size <= avail) + return do_send(ctrl, data, size); + if (!ctrl->blocking) + return 0; + if (size > wr_ring_size(ctrl)) + return -1; + if (libvchan_wait(ctrl)) + return -1; + } +} + +int libvchan_write(struct 
libvchan *ctrl, const void *data, size_t size) +{ + int avail; + if (!libvchan_is_open(ctrl)) + return -1; + if (ctrl->blocking) { + size_t pos = 0; + while (1) { + avail = fast_get_buffer_space(ctrl, size - pos); + if (pos + avail > size) + avail = size - pos; + if (avail) + pos += do_send(ctrl, data + pos, avail); + if (pos == size) + return pos; + if (libvchan_wait(ctrl)) + return -1; + if (!libvchan_is_open(ctrl)) + return -1; + } + } else { + avail = fast_get_buffer_space(ctrl, size); + if (size > avail) + size = avail; + if (size == 0) + return 0; + return do_send(ctrl, data, size); + } +} + +static int do_recv(struct libvchan *ctrl, void *data, size_t size) +{ + int real_idx = rd_cons(ctrl) & (rd_ring_size(ctrl) - 1); + int avail_contig = rd_ring_size(ctrl) - real_idx; + if (avail_contig > size) + avail_contig = size; + barrier(); // data read must happen after rd_cons read + memcpy(data, rd_ring(ctrl) + real_idx, avail_contig); + if (avail_contig < size) + { + // we rolled across the end of the ring + memcpy(data + avail_contig, rd_ring(ctrl), size - avail_contig); + } + rd_cons(ctrl) += size; + if (VCHAN_DEBUG) { + char metainfo[32]; + struct iovec iov[2]; + iov[0].iov_base = metainfo; + iov[0].iov_len = snprintf(metainfo, 32, "vchan rd %d/%d", ctrl->other_domain_id, ctrl->device_number); + iov[1].iov_base = data; + iov[1].iov_len = size; + writev(-1, iov, 2); + } + barrier(); // consumption must happen prior to notify of newly freed space + if (send_notify(ctrl, VCHAN_NOTIFY_READ)) + return -1; + return size; +} + +/** + * reads exactly size bytes from the vchan. 
+ * returns 0 if insufficient data is available, -1 on error, or size on success + */ +int libvchan_recv(struct libvchan *ctrl, void *data, size_t size) +{ + while (1) { + int avail = fast_get_data_ready(ctrl, size); + if (size <= avail) + return do_recv(ctrl, data, size); + if (!libvchan_is_open(ctrl)) + return -1; + if (!ctrl->blocking) + return 0; + if (size > rd_ring_size(ctrl)) + return -1; + if (libvchan_wait(ctrl)) + return -1; + } +} + +int libvchan_read(struct libvchan *ctrl, void *data, size_t size) +{ + while (1) { + int avail = fast_get_data_ready(ctrl, size); + if (avail && size > avail) + size = avail; + if (avail) + return do_recv(ctrl, data, size); + if (!libvchan_is_open(ctrl)) + return -1; + if (!ctrl->blocking) + return 0; + if (libvchan_wait(ctrl)) + return -1; + } +} + +int libvchan_is_open(struct libvchan* ctrl) +{ + if (ctrl->is_server) + return ctrl->server_persist ? 1 : ctrl->ring->cli_live; + else + return ctrl->ring->srv_live; +} + +int libvchan_fd_for_select(struct libvchan *ctrl) +{ + return xc_evtchn_fd(ctrl->event); +} + +void libvchan_close(struct libvchan *ctrl) +{ + if (!ctrl) + return; + if (ctrl->read.order >= PAGE_SHIFT) + munmap(ctrl->read.buffer, 1 << ctrl->read.order); + if (ctrl->write.order >= PAGE_SHIFT) + munmap(ctrl->write.buffer, 1 << ctrl->write.order); + if (ctrl->ring) { + if (ctrl->is_server) { + ctrl->ring->srv_live = 0; + xc_gntshr_munmap(ctrl->gntshr, ctrl->ring, PAGE_SIZE); + } else { + ctrl->ring->cli_live = 0; + xc_gnttab_munmap(ctrl->gnttab, ctrl->ring, PAGE_SIZE); + } + } + if (ctrl->event) { + if (ctrl->event_port >= 0 && ctrl->ring) + xc_evtchn_notify(ctrl->event, ctrl->event_port); + xc_evtchn_close(ctrl->event); + } + if (ctrl->is_server) { + if (ctrl->gntshr) + xc_gntshr_close(ctrl->gntshr); + } else { + if (ctrl->gnttab) + xc_gnttab_close(ctrl->gnttab); + } + free(ctrl); +} diff --git a/tools/libvchan/libxenvchan.h b/tools/libvchan/libxenvchan.h new file mode 100644 index 0000000..c4a3ab9 --- /dev/null 
+++ b/tools/libvchan/libxenvchan.h @@ -0,0 +1,173 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @section DESCRIPTION + * + * Originally borrowed from the Qubes OS Project, http://www.qubes-os.org, + * this code has been substantially rewritten to use the gntdev and gntalloc + * devices instead of raw MFNs and map_foreign_range. + * + * This is a library for inter-domain communication. A standard Xen ring + * buffer is used, with a datagram-based interface built on top. The grant + * reference and event channels are shared in XenStore under the path + * /local/domain/<srv-id>/data/vchan/<cli-id>/<port>/{ring-ref,event-channel} + * + * The ring.h macros define an asymmetric interface to a shared data structure + * that assumes all rings reside in a single contiguous memory space. This is + * not suitable for vchan because the interface to the ring is symmetric except + * for the setup. 
Unlike the producer-consumer rings defined in ring.h, the + * sizes of the rings used in vchan are determined at execution time instead of + * compile time, so the macros in ring.h cannot be used to access the rings. + */ + +#include <xen/io/libvchan.h> +#include <xen/sys/evtchn.h> +#include <xenctrl.h> + +struct libvchan_ring { + /* Pointer into the shared page. Offsets into buffer. */ + struct ring_shared* shr; + /* ring data; may be its own shared page(s) depending on order */ + void* buffer; + /** + * The size of the ring is (1 << order); offsets wrap around when they + * exceed this. This copy is required because we can't trust the order + * in the shared page to remain constant. + */ + int order; +}; + +/** + * struct libvchan: control structure passed to all library calls + */ +struct libvchan { + /* person we communicate with */ + int other_domain_id; + /* "port" we communicate on (allows multiple vchans to exist in xenstore) */ + int device_number; + /* Mapping handle for shared ring page */ + union { + xc_gntshr *gntshr; /* for server */ + xc_gnttab *gnttab; /* for client */ + }; + /* Pointer to shared ring page */ + struct vchan_interface *ring; + /* event channel interface */ + xc_evtchn *event; + uint32_t event_port; + /* informative flags: are we acting as server? 
*/ + int is_server:1; + /* true if server remains active when client closes (allows reconnection) */ + int server_persist:1; + /* true if operations should block instead of returning 0 */ + int blocking:1; + /* communication rings */ + struct libvchan_ring read, write; +}; + +/** + * Set up a vchan, including granting pages + * @param logger Logger for libxc errors + * @param domain The peer domain that will be connecting + * @param devno A device number, used to identify this vchan in xenstore + * @param send_min The minimum size (in bytes) of the send ring (left) + * @param recv_min The minimum size (in bytes) of the receive ring (right) + * @return The structure, or NULL in case of an error + */ +struct libvchan *libvchan_server_init(xentoollog_logger *logger, int domain, int devno, size_t read_min, size_t write_min); +/** + * Connect to an existing vchan. Note: you can reconnect to an existing vchan + * safely, however no locking is performed, so you must prevent multiple clients + * from connecting to a single server. + * + * @param logger Logger for libxc errors + * @param domain The peer domain to connect to + * @param devno A device number, used to identify this vchan in xenstore + * @return The structure, or NULL in case of an error + */ +struct libvchan *libvchan_client_init(xentoollog_logger *logger, int domain, int devno); +/** + * Close a vchan. This deallocates the vchan and attempts to free its + * resources. The other side is notified of the close, but can still read any + * data pending prior to the close. + */ +void libvchan_close(struct libvchan *ctrl); + +/** + * Packet-based receive: always reads exactly $size bytes. 
+ * @param ctrl The vchan control structure + * @param data Buffer for data that was read + * @param size Size of the buffer and amount of data to read + * @return -1 on error, 0 if nonblocking and insufficient data is available, or $size + */ +int libvchan_recv(struct libvchan *ctrl, void *data, size_t size); +/** + * Stream-based receive: reads as much data as possible. + * @param ctrl The vchan control structure + * @param data Buffer for data that was read + * @param size Size of the buffer + * @return -1 on error, otherwise the amount of data read (which may be zero if + * the vchan is nonblocking) + */ +int libvchan_read(struct libvchan *ctrl, void *data, size_t size); +/** + * Packet-based send: send entire buffer if possible + * @param ctrl The vchan control structure + * @param data Buffer for data to send + * @param size Size of the buffer and amount of data to send + * @return -1 on error, 0 if nonblocking and insufficient space is available, or $size + */ +int libvchan_send(struct libvchan *ctrl, const void *data, size_t size); +/** + * Stream-based send: send as much data as possible. + * @param ctrl The vchan control structure + * @param data Buffer for data to send + * @param size Size of the buffer + * @return -1 on error, otherwise the amount of data sent (which may be zero if + * the vchan is nonblocking) + */ +int libvchan_write(struct libvchan *ctrl, const void *data, size_t size); +/** + * Waits for reads or writes to unblock, or for a close + */ +int libvchan_wait(struct libvchan *ctrl); +/** + * Returns the event file descriptor for this vchan. When this FD is readable, + * libvchan_wait() will not block, and the state of the vchan has changed since + * the last invocation of libvchan_wait(). 
+ */ +int libvchan_fd_for_select(struct libvchan *ctrl); +/** + * Query the state of the vchan shared page: + * return 0 when one side has called libvchan_close() or crashed + * return 1 when both sides are open + * return 2 [server only] when no client has yet connected + */ +int libvchan_is_open(struct libvchan* ctrl); +/** Amount of data ready to read, in bytes */ +int libvchan_data_ready(struct libvchan *ctrl); +/** Amount of data it is possible to send without blocking */ +int libvchan_buffer_space(struct libvchan *ctrl); diff --git a/tools/libvchan/node-select.c b/tools/libvchan/node-select.c new file mode 100644 index 0000000..ea1bfc6 --- /dev/null +++ b/tools/libvchan/node-select.c @@ -0,0 +1,162 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @section DESCRIPTION + * + * This is a test program for libvchan. Communications are bidirectional, + * with either server (grant offeror) or client able to read and write. 
+ */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> + +#include <libxenvchan.h> + +void usage(char** argv) +{ + fprintf(stderr, "usage:\n" + "\t%s [client|server] domainid nodeid [rbufsiz wbufsiz]\n", + argv[0]); + exit(1); +} + +#define BUFSIZE 5000 +char inbuf[BUFSIZE]; +char outbuf[BUFSIZE]; +int insiz = 0; +int outsiz = 0; +struct libvchan *ctrl = 0; + +void vchan_wr() { + if (!insiz) + return; + int ret = libvchan_write(ctrl, inbuf, insiz); + if (ret < 0) { + fprintf(stderr, "vchan write failed\n"); + exit(1); + } + if (ret > 0) { + insiz -= ret; + memmove(inbuf, inbuf + ret, insiz); + } +} + +void stdout_wr() { + if (!outsiz) + return; + int ret = write(1, outbuf, outsiz); + if (ret < 0 && errno != EAGAIN) + exit(1); + if (ret > 0) { + outsiz -= ret; + memmove(outbuf, outbuf + ret, outsiz); + } +} + +/** + Simple libvchan application, both client and server. + Both sides may write and read, both from the libvchan and from + stdin/stdout (just like netcat). +*/ + +int main(int argc, char **argv) +{ + int ret; + int libvchan_fd; + if (argc < 4) + usage(argv); + if (!strcmp(argv[1], "server")) { + int rsiz = argc > 4 ? atoi(argv[4]) : 0; + int wsiz = argc > 5 ? 
atoi(argv[5]) : 0; + ctrl = libvchan_server_init(NULL, atoi(argv[2]), atoi(argv[3]), rsiz, wsiz); + } else if (!strcmp(argv[1], "client")) + ctrl = libvchan_client_init(NULL, atoi(argv[2]), atoi(argv[3])); + else + usage(argv); + if (!ctrl) { + perror("libvchan_*_init"); + exit(1); + } + + fcntl(0, F_SETFL, O_NONBLOCK); + fcntl(1, F_SETFL, O_NONBLOCK); + + libvchan_fd = libvchan_fd_for_select(ctrl); + for (;;) { + fd_set rfds; + fd_set wfds; + FD_ZERO(&rfds); + FD_ZERO(&wfds); + if (insiz != BUFSIZE) + FD_SET(0, &rfds); + if (outsiz) + FD_SET(1, &wfds); + FD_SET(libvchan_fd, &rfds); + ret = select(libvchan_fd + 1, &rfds, &wfds, NULL, NULL); + if (ret < 0) { + perror("select"); + exit(1); + } + if (FD_ISSET(0, &rfds)) { + ret = read(0, inbuf + insiz, BUFSIZE - insiz); + if (ret < 0 && errno != EAGAIN) + exit(1); + if (ret == 0) { + while (insiz) { + vchan_wr(); + libvchan_wait(ctrl); + } + return 0; + } + if (ret) + insiz += ret; + vchan_wr(); + } + if (FD_ISSET(libvchan_fd, &rfds)) { + libvchan_wait(ctrl); + vchan_wr(); + } + if (FD_ISSET(1, &wfds)) + stdout_wr(); + while (libvchan_data_ready(ctrl) && outsiz < BUFSIZE) { + ret = libvchan_read(ctrl, outbuf + outsiz, BUFSIZE - outsiz); + if (ret < 0) + exit(1); + outsiz += ret; + stdout_wr(); + } + if (!libvchan_is_open(ctrl)) { + fcntl(1, F_SETFL, 0); + while (outsiz) + stdout_wr(); + return 0; + } + } +} diff --git a/tools/libvchan/node.c b/tools/libvchan/node.c new file mode 100644 index 0000000..6a9204c --- /dev/null +++ b/tools/libvchan/node.c @@ -0,0 +1,169 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of 
the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @section DESCRIPTION + * + * This is a test program for libvchan. Communications are in one direction, + * either server (grant offeror) to client or vice versa. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <time.h> + +#include <libxenvchan.h> + +int libvchan_write_all(struct libvchan *ctrl, char *buf, int size) +{ + int written = 0; + int ret; + while (written < size) { + ret = libvchan_write(ctrl, buf + written, size - written); + if (ret <= 0) { + perror("write"); + exit(1); + } + written += ret; + } + return size; +} + +int write_all(int fd, char *buf, int size) +{ + int written = 0; + int ret; + while (written < size) { + ret = write(fd, buf + written, size - written); + if (ret <= 0) { + perror("write"); + exit(1); + } + written += ret; + } + return size; +} + +void usage(char** argv) +{ + fprintf(stderr, "usage:\n" + "%s [client|server] [read|write] domid nodeid\n", argv[0]); + exit(1); +} + +#define BUFSIZE 5000 +char buf[BUFSIZE]; +void reader(struct libvchan *ctrl) +{ + int size; + for (;;) { + size = rand() % (BUFSIZE - 1) + 1; + size = libvchan_read(ctrl, buf, size); + fprintf(stderr, "#"); + if (size < 0) { + perror("read vchan"); + libvchan_close(ctrl); + exit(1); + } + size = write_all(1, buf, size); + if (size < 0) { + perror("stdout write"); + exit(1); + } + if (size == 0) { + perror("write size=0?\n"); + exit(1); + } + } +} + +void writer(struct libvchan 
*ctrl) +{ + int size; + for (;;) { + size = rand() % (BUFSIZE - 1) + 1; + size = read(0, buf, size); + if (size < 0) { + perror("read stdin"); + libvchan_close(ctrl); + exit(1); + } + if (size == 0) + break; + size = libvchan_write_all(ctrl, buf, size); + fprintf(stderr, "#"); + if (size < 0) { + perror("vchan write"); + exit(1); + } + if (size == 0) { + perror("write size=0?\n"); + exit(1); + } + } +} + + +/** + Simple libvchan application, both client and server. + One side does writing, the other side does reading; both from + standard input/output fds. +*/ +int main(int argc, char **argv) +{ + int seed = time(0); + struct libvchan *ctrl = 0; + int wr = 0; + if (argc < 4) + usage(argv); + if (!strcmp(argv[2], "read")) + wr = 0; + else if (!strcmp(argv[2], "write")) + wr = 1; + else + usage(argv); + if (!strcmp(argv[1], "server")) + ctrl = libvchan_server_init(NULL, atoi(argv[3]), atoi(argv[4]), 0, 0); + else if (!strcmp(argv[1], "client")) + ctrl = libvchan_client_init(NULL, atoi(argv[3]), atoi(argv[4])); + else + usage(argv); + if (!ctrl) { + perror("libvchan_*_init"); + exit(1); + } + ctrl->blocking = 1; + + srand(seed); + fprintf(stderr, "seed=%d\n", seed); + if (wr) + writer(ctrl); + else + reader(ctrl); + libvchan_close(ctrl); + return 0; +} diff --git a/xen/include/public/io/libvchan.h b/xen/include/public/io/libvchan.h new file mode 100644 index 0000000..a3bf7cd --- /dev/null +++ b/xen/include/public/io/libvchan.h @@ -0,0 +1,97 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @section DESCRIPTION + * + * Originally borrowed from the Qubes OS Project, http://www.qubes-os.org, + * this code has been substantially rewritten to use the gntdev and gntalloc + * devices instead of raw MFNs and map_foreign_range. + * + * This is a library for inter-domain communication. A standard Xen ring + * buffer is used, with a datagram-based interface built on top. The grant + * reference and event channels are shared in XenStore under the path + * /local/domain/<srv-id>/data/vchan/<cli-id>/<port>/{ring-ref,event-channel} + * + * The ring.h macros define an asymmetric interface to a shared data structure + * that assumes all rings reside in a single contiguous memory space. This is + * not suitable for vchan because the interface to the ring is symmetric except + * for the setup. Unlike the producer-consumer rings defined in ring.h, the + * size of the rings used in vchan are determined at execution time instead of + * compile time, so the macros in ring.h cannot be used to access the rings. 
+ */ + +#include <stdint.h> +#include <sys/types.h> + +struct ring_shared { + uint32_t cons, prod; +}; + +#define VCHAN_NOTIFY_WRITE 0x1 +#define VCHAN_NOTIFY_READ 0x2 + +/** + * vchan_interface: primary shared data structure + */ +struct vchan_interface { + /** + * Standard consumer/producer interface, one pair per buffer + * left is client write, server read + * right is client read, server write + */ + struct ring_shared left, right; + /** + * size of the rings, which determines their location + * 10 - at offset 1024 in ring''s page + * 11 - at offset 2048 in ring''s page + * 12+ - uses 2^(N-12) grants to describe the multi-page ring + * These should remain constant once the page is shared. + * Only one of the two orders can be 10 (or 11). + */ + uint16_t left_order, right_order; + /** + * Shutdown detection: + * 0: client (or server) has exited + * 1: client (or server) is connected + * 2: client has not yet connected + */ + uint8_t cli_live, srv_live; + /** + * Notification bits: + * VCHAN_NOTIFY_WRITE: send notify when data is written + * VCHAN_NOTIFY_READ: send notify when data is read (consumed) + * cli_notify is used for the client to inform the server of its action + */ + uint8_t cli_notify, srv_notify; + /** + * Grant list: ordering is left, right. Must not extend into actual ring + * or grow beyond the end of the initial shared page. + * These should remain constant once the page is shared, to allow + * for possible remapping by a client that restarts. + */ + uint32_t grants[0]; +}; + -- 1.7.6.2 _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Campbell
2011-Sep-21 10:03 UTC
[Xen-devel] Re: [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify
On Mon, 2011-09-19 at 23:43 +0100, Daniel De Graaf wrote: > @@ -116,4 +116,35 @@ struct ioctl_gntdev_set_max_grants { > uint32_t count; > }; > > +/* > + * Sets up an unmap notification within the page, so that the other side can do > + * cleanup if this side crashes. Required to implement cross-domain robust > + * mutexes or close notification on communication channels. > + * > + * Each mapped page only supports one notification; multiple calls referring to > + * the same page overwrite the previous notification. You must clear the > + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it > + * to occur. > + */ > +#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \ > +_IOC(_IOC_NONE, 'G', 7, sizeof(struct ioctl_gntdev_unmap_notify)) > +struct ioctl_gntdev_unmap_notify { > + /* IN parameters */ > + /* Offset in the file descriptor for a byte within the page (same as > + * used in mmap). I'm probably being thick, but I don't understand what this means, i.e. what this thing is relative to. > If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to > + * be cleared. Otherwise, it can be any byte in the page whose > + * notification we are adjusting. 
> + */ > + uint64_t index; > + /* Action(s) to take on unmap */ > + uint32_t action; > + /* Event channel to notify */ > + uint32_t event_channel_port; evtchn_port_t? > +}; > + > +/* Clear (set to zero) the byte specified by index */ > +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 > +/* Send an interrupt on the indicated event channel */ > +#define UNMAP_NOTIFY_SEND_EVENT 0x2 > + > #endif /* __LINUX_PUBLIC_GNTDEV_H__ */ > diff --git a/tools/libxc/xc_gnttab.c b/tools/libxc/xc_gnttab.c > index 4f55fce..3d3c63b 100644 > --- a/tools/libxc/xc_gnttab.c > +++ b/tools/libxc/xc_gnttab.c > @@ -18,6 +18,7 @@ > */ > > #include "xc_private.h" > +#include <errno.h> > > int xc_gnttab_op(xc_interface *xch, int cmd, void * op, int op_size, int count) > { > @@ -174,6 +175,28 @@ void *xc_gnttab_map_domain_grant_refs(xc_gnttab *xcg, > count, domid, refs, prot); > } > > +void *xc_gnttab_map_grant_ref_notify(xc_gnttab *xcg, > + uint32_t domid, > + uint32_t ref, > + uint32_t notify_offset, > + evtchn_port_t notify_port, > + int *notify_result) > +{ > + if (xcg->ops->u.gnttab.map_grant_ref_notify) > + return xcg->ops->u.gnttab.map_grant_ref_notify(xcg, xcg->ops_handle, > + domid, ref, notify_offset, notify_port, notify_result); > + else { > + void* area = xcg->ops->u.gnttab.map_grant_ref(xcg, xcg->ops_handle, > + domid, ref, PROT_READ|PROT_WRITE); > + if (area && notify_result) { > + *notify_result = -1; > + errno = ENOSYS; > + } > + return area; > + } I think the new public interface is fine but do we really need a new internal interface here? I think you can just add the notify_* arguments to the existing OSDEP function and have those OS backends which don't implement that feature return ENOSYS if notify_offset != 0 (or ~0 or whatever invalid value works). Why doesn't the *_notify variant take a prot argument? I'd be tempted to do away with notify_result too -- if the caller asked for notification and we fail to give that then we can clean up and return an error. 
If they want to try again without the notification then that's up to them. > +} > + > + > int xc_gnttab_munmap(xc_gnttab *xcg, > void *start_address, > uint32_t count) > diff --git a/tools/libxc/xc_linux_osdep.c b/tools/libxc/xc_linux_osdep.c > index dca6718..3040cb6 100644 > --- a/tools/libxc/xc_linux_osdep.c > +++ b/tools/libxc/xc_linux_osdep.c > @@ -613,6 +613,62 @@ static void *linux_gnttab_map_domain_grant_refs(xc_gnttab *xcg, xc_osdep_handle > return do_gnttab_map_grant_refs(xcg, h, count, &domid, 0, refs, prot); > } > > +static void *linux_gnttab_map_grant_ref_notify(xc_gnttab *xch, xc_osdep_handle h, > + uint32_t domid, uint32_t ref, > + uint32_t notify_offset, > + evtchn_port_t notify_port, > + int *notify_result) > +{ > + int fd = (int)h; > + int rv = 0; > + struct ioctl_gntdev_map_grant_ref map; > + struct ioctl_gntdev_unmap_notify notify; > + void *addr; > + > + map.count = 1; > + map.refs[0].domid = domid; > + map.refs[0].ref = ref; > + > + if ( ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &map) ) { > + PERROR("xc_gnttab_map_grant_ref: ioctl MAP_GRANT_REF failed"); > + return NULL; > + } > + > + addr = mmap(NULL, XC_PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, map.index); > + if ( addr == MAP_FAILED ) > + { > + int saved_errno = errno; > + struct ioctl_gntdev_unmap_grant_ref unmap_grant; > + > + PERROR("xc_gnttab_map_grant_ref: mmap failed"); > + unmap_grant.index = map.index; > + unmap_grant.count = 1; > + ioctl(fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &unmap_grant); > + errno = saved_errno; > + return NULL; > + } The non-notify variant handles EAGAIN; why doesn't this one need to do so? > + > + notify.index = map.index; > + notify.action = 0; > + if (notify_offset >= 0) { > + notify.index += notify_offset; > + notify.action |= UNMAP_NOTIFY_CLEAR_BYTE; > + } > + if (notify_port >= 0) { > + notify.event_channel_port = notify_port; > + notify.action |= UNMAP_NOTIFY_SEND_EVENT; > + } > + if (notify.action) > + rv = ioctl(fd, IOCTL_GNTDEV_SET_UNMAP_NOTIFY, &notify); Is
there a race if the other end (or this process) dies between the MAP ioctl and here? Ian. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Campbell
2011-Sep-21 10:13 UTC
[Xen-devel] Re: [PATCH 2/3] libxc: add xc_gntshr_* functions
On Mon, 2011-09-19 at 23:43 +0100, Daniel De Graaf wrote:> +static void *linux_gntshr_share_pages(xc_gntshr *xch, xc_osdep_handle h, > + uint32_t domid, int count, > + uint32_t *refs, int writable) > [...]> +static void *linux_gntshr_share_page_notify(xc_gntshr *xch, xc_osdep_handle h, > + uint32_t domid, uint32_t *ref, > + int writable, uint32_t notify_offset, > + evtchn_port_t notify_port, > + int *notify_result)I have the same opinion of the need for both of these as for the gntmap ones. They are identical apart from the notify block in the middle, right? Ian. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Campbell
2011-Sep-21 10:53 UTC
[Xen-devel] Re: [PATCH 3/3] libvchan: interdomain communications library
On Mon, 2011-09-19 at 23:43 +0100, Daniel De Graaf wrote: > This library implements a bidirectional communication interface between > applications in different domains, similar to unix sockets. Data can be > sent using the byte-oriented libvchan_read/libvchan_write or the > packet-oriented libvchan_recv/libvchan_send. > > Channel setup is done using a client-server model; domain IDs and a port > number must be negotiated prior to initialization. The server allocates > memory for the shared pages and determines the sizes of the > communication rings (which may span multiple pages, although the default > places rings and control within a single page). > > With properly sized rings, testing has shown that this interface > provides speed comparable to pipes within a single Linux domain; it is > significantly faster than network-based communication. > > Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> I only skimmed this one. I had a few minor thoughts below, but really I'm pretty much OK for it to go in (modulo any fallout from comments on patches 1+2). Definite Bonus Points for the doxygen/kernel doc commentary in the headers; which tool parses them? (A few comments in the code itself seem to have the "/**" marker but not the rest of the syntax.) You changed the library name to libxenvchan but not the path to the source nor the API names? > +static int init_gnt_srv(struct libvchan *ctrl) > +{ > + int pages_left = ctrl->read.order >= PAGE_SHIFT ? 1 << (ctrl->read.order - PAGE_SHIFT) : 0; Here you do >= PAGE_SHIFT but on the out_unmap_left path you do > 11. 
(Am I right that left == server and right == client in the libvchan terminology?) > + if (ctrl->read.order == 10) { > + ctrl->read.buffer = ((void*)ctrl->ring) + 1024; > + } else if (ctrl->read.order == 11) { > + ctrl->read.buffer = ((void*)ctrl->ring) + 2048; > + } else { > + ctrl->read.buffer = xc_gntshr_share_pages(ctrl->gntshr, ctrl->other_domain_id, > + pages_left, ctrl->ring->grants, 1); > + if (!ctrl->read.buffer) > + goto out_ring; > + } switch (...read.order)? In other places you have MAX_LARGE_RING/MAX_SMALL_RING etc.; I think using SMALL/LARGE_RING_ORDER instead of 10 and 11 seems like a good idea. Similarly, using LARGE/SMALL_RING_OFFSET instead of 1024/2048 would help clarity. > + if (ctrl->write.order < 10 || ctrl->write.order > 24) > + goto out_unmap_ring; What is the significance of 2^24? > + > +// find xenstore entry > + snprintf(buf, sizeof buf, "/local/domain/%d/data/vchan/%s/%d/ring-ref", > + ctrl->other_domain_id, domid_str, ctrl->device_number); I wonder if the base of this path (up to and including "%s/%d"?) ought to be caller-provided? My thinking is that the rendezvous between client and server is out of band and the path is really an element (or even the total encoding) of that OOB communication. It would also push the selection of the xs location up into the application, which also defines the protocol. For example, I might want to build a pv protocol with this library which is supported by the toolstack and therefore want to put my stuff under devices etc. or in any other protocol-specific xs location. The wart I previously mentioned wrt using the "data" directory would then be an application wart (which I think is ok) rather than baked into the libraries. Ian. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Campbell
2011-Sep-21 13:44 UTC
[Xen-devel] Re: [PATCH 3/3] libvchan: interdomain communications library
On Mon, 2011-09-19 at 23:43 +0100, Daniel De Graaf wrote: > + * This library is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2 of the License, or (at your option) any later version. LGPL v2 isn't all that widely used (it had some flaw or other which I don't recall and have failed to google up). Most of our libraries are LGPL v2.1, not v2, as well. Since you have the "or later version" clause I think it should be trivial to uprev? (By the same token it perhaps doesn't matter, but fewer licenses in use at once seems useful.) Ian. > + * > + * This library is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with this library; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + * > + * @section DESCRIPTION > + * > + * This file contains the setup code used to establish the ring buffer. > + */ > + > +#include <sys/types.h> > +#include <sys/mman.h> > +#include <sys/ioctl.h> > +#include <sys/user.h> > +#include <stdlib.h> > +#include <stdio.h> > +#include <stdint.h> > +#include <string.h> > +#include <unistd.h> > +#include <fcntl.h> > + > +#include <xs.h> > +#include <xen/sys/evtchn.h> > +#include <xen/sys/gntalloc.h> > +#include <xen/sys/gntdev.h> > +#include <libxenvchan.h> > + > +#ifndef PAGE_SHIFT > +#define PAGE_SHIFT 12 > +#endif > + > +#ifndef PAGE_SIZE > +#define PAGE_SIZE 4096 > +#endif > + > +#ifndef offsetof > +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) > +#endif > + > +#define max(a,b) ((a > b) ? 
a : b) > + > +static int init_gnt_srv(struct libvchan *ctrl) > +{ > + int pages_left = ctrl->read.order >= PAGE_SHIFT ? 1 << (ctrl->read.order - PAGE_SHIFT) : 0; > + int pages_right = ctrl->write.order >= PAGE_SHIFT ? 1 << (ctrl->write.order - PAGE_SHIFT) : 0; > + uint32_t ring_ref = -1; > + void *ring; > + > + ring = xc_gntshr_share_page_notify(ctrl->gntshr, ctrl->other_domain_id, > + &ring_ref, 1, offsetof(struct vchan_interface, srv_live), > + ctrl->event_port, NULL); > + > + if (!ring) > + goto out; > + > + memset(ring, 0, PAGE_SIZE); > + > + ctrl->ring = ring; > + ctrl->read.shr = &ctrl->ring->left; > + ctrl->write.shr = &ctrl->ring->right; > + ctrl->ring->left_order = ctrl->read.order; > + ctrl->ring->right_order = ctrl->write.order; > + ctrl->ring->cli_live = 2; > + ctrl->ring->srv_live = 1; > + ctrl->ring->cli_notify = VCHAN_NOTIFY_WRITE; > + > + if (ctrl->read.order == 10) { > + ctrl->read.buffer = ((void*)ctrl->ring) + 1024; > + } else if (ctrl->read.order == 11) { > + ctrl->read.buffer = ((void*)ctrl->ring) + 2048; > + } else { > + ctrl->read.buffer = xc_gntshr_share_pages(ctrl->gntshr, ctrl->other_domain_id, > + pages_left, ctrl->ring->grants, 1); > + if (!ctrl->read.buffer) > + goto out_ring; > + } > + > + if (ctrl->write.order == 10) { > + ctrl->write.buffer = ((void*)ctrl->ring) + 1024; > + } else if (ctrl->write.order == 11) { > + ctrl->write.buffer = ((void*)ctrl->ring) + 2048; > + } else { > + ctrl->write.buffer = xc_gntshr_share_pages(ctrl->gntshr, ctrl->other_domain_id, > + pages_right, ctrl->ring->grants + pages_left, 1); > + if (!ctrl->write.buffer) > + goto out_unmap_left; > + } > + > +out: > + return ring_ref; > +out_unmap_left: > + if (ctrl->read.order > 11) > + xc_gntshr_munmap(ctrl->gntshr, ctrl->read.buffer, pages_left * PAGE_SIZE); > +out_ring: > + xc_gntshr_munmap(ctrl->gntshr, ring, PAGE_SIZE); > + ring_ref = -1; > + ctrl->ring = NULL; > + ctrl->write.order = ctrl->read.order = 0; > + goto out; > +} > + > +static int 
init_gnt_cli(struct libvchan *ctrl, uint32_t ring_ref) > +{ > + int rv = -1; > + uint32_t *grants; > + > + ctrl->ring = xc_gnttab_map_grant_ref_notify(ctrl->gnttab, > + ctrl->other_domain_id, ring_ref, > + offsetof(struct vchan_interface, cli_live), ctrl->event_port, > + NULL); > + > + if (!ctrl->ring) > + goto out; > + > + ctrl->write.order = ctrl->ring->left_order; > + ctrl->read.order = ctrl->ring->right_order; > + ctrl->write.shr = &ctrl->ring->left; > + ctrl->read.shr = &ctrl->ring->right; > + if (ctrl->write.order < 10 || ctrl->write.order > 24) > + goto out_unmap_ring; > + if (ctrl->read.order < 10 || ctrl->read.order > 24) > + goto out_unmap_ring; > + if (ctrl->read.order == ctrl->write.order && ctrl->read.order < 12) > + goto out_unmap_ring; > + > + grants = ctrl->ring->grants; > + > + if (ctrl->write.order == 10) { > + ctrl->write.buffer = ((void*)ctrl->ring) + 1024; > + } else if (ctrl->write.order == 11) { > + ctrl->write.buffer = ((void*)ctrl->ring) + 2048; > + } else { > + int pages_left = 1 << (ctrl->write.order - PAGE_SHIFT); > + ctrl->write.buffer = xc_gnttab_map_domain_grant_refs(ctrl->gnttab, > + pages_left, ctrl->other_domain_id, grants, PROT_READ|PROT_WRITE); > + if (!ctrl->write.buffer) > + goto out_unmap_ring; > + grants += pages_left; > + } > + > + if (ctrl->read.order == 10) { > + ctrl->read.buffer = ((void*)ctrl->ring) + 1024; > + } else if (ctrl->read.order == 11) { > + ctrl->read.buffer = ((void*)ctrl->ring) + 2048; > + } else { > + int pages_right = 1 << (ctrl->read.order - PAGE_SHIFT); > + ctrl->read.buffer = xc_gnttab_map_domain_grant_refs(ctrl->gnttab, > + pages_right, ctrl->other_domain_id, grants, PROT_READ); > + if (!ctrl->read.buffer) > + goto out_unmap_left; > + } > + > + rv = 0; > + out: > + return rv; > + out_unmap_left: > + if (ctrl->write.order >= PAGE_SHIFT) > + xc_gnttab_munmap(ctrl->gnttab, ctrl->write.buffer, > + 1 << ctrl->write.order); > + out_unmap_ring: > + xc_gnttab_munmap(ctrl->gnttab, ctrl->ring, PAGE_SIZE); > + 
ctrl->ring = 0; > + ctrl->write.order = ctrl->read.order = 0; > + rv = -1; > + goto out; > +} > + > +static int init_evt_srv(struct libvchan *ctrl, xentoollog_logger *logger) > +{ > + ctrl->event = xc_evtchn_open(logger, 0); > + if (!ctrl->event) > + return -1; > + ctrl->event_port = xc_evtchn_bind_unbound_port(ctrl->event, ctrl->other_domain_id); > + if (ctrl->event_port < 0) > + return -1; > + if (xc_evtchn_unmask(ctrl->event, ctrl->event_port)) > + return -1; > + return 0; > +} > + > +static int init_xs_srv(struct libvchan *ctrl, int ring_ref) > +{ > + int ret = -1; > + struct xs_handle *xs; > + struct xs_permissions perms[2]; > + char buf[64]; > + char ref[16]; > + char* domid_str = NULL; > + xs = xs_domain_open(); > + if (!xs) > + goto fail; > + domid_str = xs_read(xs, 0, "domid", NULL); > + if (!domid_str) > + goto fail_xs_open; > + > + // owner domain is us > + perms[0].id = atoi(domid_str); > + // permissions for domains not listed = none > + perms[0].perms = XS_PERM_NONE; > + // other domains > + perms[1].id = ctrl->other_domain_id; > + perms[1].perms = XS_PERM_READ; > + > + snprintf(ref, sizeof ref, "%d", ring_ref); > + snprintf(buf, sizeof buf, "data/vchan/%d/%d/ring-ref", ctrl->other_domain_id, ctrl->device_number); > + if (!xs_write(xs, 0, buf, ref, strlen(ref))) > + goto fail_xs_open; > + if (!xs_set_permissions(xs, 0, buf, perms, 2)) > + goto fail_xs_open; > + > + snprintf(ref, sizeof ref, "%d", ctrl->event_port); > + snprintf(buf, sizeof buf, "data/vchan/%d/%d/event-channel", ctrl->other_domain_id, ctrl->device_number); > + if (!xs_write(xs, 0, buf, ref, strlen(ref))) > + goto fail_xs_open; > + if (!xs_set_permissions(xs, 0, buf, perms, 2)) > + goto fail_xs_open; > + > + ret = 0; > + fail_xs_open: > + free(domid_str); > + xs_daemon_close(xs); > + fail: > + return ret; > +} > + > +static int min_order(size_t siz) > +{ > + int rv = PAGE_SHIFT; > + while (siz > (1 << rv)) > + rv++; > + return rv; > +} > + > +struct libvchan 
*libvchan_server_init(xentoollog_logger *logger, int domain, int devno, size_t left_min, size_t right_min) > +{ > + // if you go over this size, you'll have too many grants to fit in the shared page. > + size_t MAX_RING_SIZE = 256 * PAGE_SIZE; > + struct libvchan *ctrl; > + int ring_ref; > + if (left_min > MAX_RING_SIZE || right_min > MAX_RING_SIZE) > + return 0; > + > + ctrl = malloc(sizeof(*ctrl)); > + if (!ctrl) > + return 0; > + > + ctrl->other_domain_id = domain; > + ctrl->device_number = devno; > + ctrl->ring = NULL; > + ctrl->event = NULL; > + ctrl->is_server = 1; > + ctrl->server_persist = 0; > + > + ctrl->read.order = min_order(left_min); > + ctrl->write.order = min_order(right_min); > + > + // if we can avoid allocating extra pages by using in-page rings, do so > +#define MAX_SMALL_RING 1024 > +#define MAX_LARGE_RING 2048 > + if (left_min <= MAX_SMALL_RING && right_min <= MAX_LARGE_RING) { > + ctrl->read.order = 10; > + ctrl->write.order = 11; > + } else if (left_min <= MAX_LARGE_RING && right_min <= MAX_SMALL_RING) { > + ctrl->read.order = 11; > + ctrl->write.order = 10; > + } else if (left_min <= MAX_LARGE_RING) { > + ctrl->read.order = 11; > + } else if (right_min <= MAX_LARGE_RING) { > + ctrl->write.order = 11; > + } > + > + ctrl->gntshr = xc_gntshr_open(logger, 0); > + if (!ctrl->gntshr) > + goto out; > + > + if (init_evt_srv(ctrl, logger)) > + goto out; > + ring_ref = init_gnt_srv(ctrl); > + if (ring_ref < 0) > + goto out; > + if (init_xs_srv(ctrl, ring_ref)) > + goto out; > + return ctrl; > +out: > + libvchan_close(ctrl); > + return 0; > +} > + > +static int init_evt_cli(struct libvchan *ctrl, xentoollog_logger *logger) > +{ > + ctrl->event = xc_evtchn_open(logger, 0); > + if (!ctrl->event) > + return -1; > + ctrl->event_port = xc_evtchn_bind_interdomain(ctrl->event, > + ctrl->other_domain_id, ctrl->event_port); > + if (ctrl->event_port < 0) > + return -1; > + xc_evtchn_unmask(ctrl->event, ctrl->event_port); > + return 0; > +} > + > + > +struct 
libvchan *libvchan_client_init(xentoollog_logger *logger, int domain, int devno) > +{ > + struct libvchan *ctrl = malloc(sizeof(struct libvchan)); > + struct xs_handle *xs = NULL; > + char buf[64]; > + char *ref; > + int ring_ref; > + unsigned int len; > + char* domid_str = NULL; > + > + if (!ctrl) > + return 0; > + ctrl->other_domain_id = domain; > + ctrl->device_number = devno; > + ctrl->ring = NULL; > + ctrl->event = NULL; > + ctrl->write.order = ctrl->read.order = 0; > + ctrl->is_server = 0; > + > + xs = xs_daemon_open(); > + if (!xs) > + xs = xs_domain_open(); > + if (!xs) > + goto fail; > + > + domid_str = xs_read(xs, 0, "domid", NULL); > + if (!domid_str) > + goto fail; > + > +// find xenstore entry > + snprintf(buf, sizeof buf, "/local/domain/%d/data/vchan/%s/%d/ring-ref", > + ctrl->other_domain_id, domid_str, ctrl->device_number); > + ref = xs_read(xs, 0, buf, &len); > + if (!ref) > + goto fail; > + ring_ref = atoi(ref); > + free(ref); > + if (!ring_ref) > + goto fail; > + snprintf(buf, sizeof buf, "/local/domain/%d/data/vchan/%s/%d/event-channel", > + ctrl->other_domain_id, domid_str, ctrl->device_number); > + ref = xs_read(xs, 0, buf, &len); > + if (!ref) > + goto fail; > + ctrl->event_port = atoi(ref); > + free(ref); > + if (!ctrl->event_port) > + goto fail; > + > + ctrl->gnttab = xc_gnttab_open(logger, 0); > + if (!ctrl->gnttab) > + goto out; > + > +// set up event channel > + if (init_evt_cli(ctrl, logger)) > + goto fail; > + > +// set up shared page(s) > + if (init_gnt_cli(ctrl, ring_ref)) > + goto fail; > + > + ctrl->ring->cli_live = 1; > + ctrl->ring->srv_notify = VCHAN_NOTIFY_WRITE; > + > + out: > + free(domid_str); > + if (xs) > + xs_daemon_close(xs); > + return ctrl; > + fail: > + libvchan_close(ctrl); > + ctrl = NULL; > + goto out; > +} > diff --git a/tools/libvchan/io.c b/tools/libvchan/io.c > new file mode 100644 > index 0000000..08d5dcf > --- /dev/null > +++ b/tools/libvchan/io.c > @@ -0,0 +1,375 @@ > +/** > + * @file > + * @section AUTHORS 
> + * > + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> > + * > + * Authors: > + * Rafal Wojtczuk <rafal@invisiblethingslab.com> > + * Daniel De Graaf <dgdegra@tycho.nsa.gov> > + * > + * @section LICENSE > + * > + * This library is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2 of the License, or (at your option) any later version. > + * > + * This library is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with this library; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + * > + * @section DESCRIPTION > + * > + * This file contains the communications interface built on the ring buffer. > + */ > + > +#include <sys/types.h> > +#include <sys/mman.h> > +#include <sys/ioctl.h> > +#include <sys/uio.h> > +#include <stdlib.h> > +#include <stdint.h> > +#include <string.h> > +#include <unistd.h> > + > +#include <xenctrl.h> > +#include <libxenvchan.h> > + > +#ifndef PAGE_SHIFT > +#define PAGE_SHIFT 12 > +#endif > + > +#ifndef PAGE_SIZE > +#define PAGE_SIZE 4096 > +#endif > + > +// allow vchan data to be easily observed in strace by doing a > +// writev() to FD -1 with the data being read/written. 
> +#ifndef VCHAN_DEBUG > +#define VCHAN_DEBUG 0 > +#endif > + > +#define barrier() asm volatile("" ::: "memory") > + > + > +static inline uint32_t rd_prod(struct libvchan *ctrl) > +{ > + return ctrl->read.shr->prod; > +} > + > +static inline uint32_t* _rd_cons(struct libvchan *ctrl) > +{ > + return &ctrl->read.shr->cons; > +} > +#define rd_cons(x) (*_rd_cons(x)) > + > +static inline uint32_t* _wr_prod(struct libvchan *ctrl) > +{ > + return &ctrl->write.shr->prod; > +} > +#define wr_prod(x) (*_wr_prod(x)) > + > +static inline uint32_t wr_cons(struct libvchan *ctrl) > +{ > + return ctrl->write.shr->cons; > +} > + > +static inline const void* rd_ring(struct libvchan *ctrl) > +{ > + return ctrl->read.buffer; > +} > + > +static inline void* wr_ring(struct libvchan *ctrl) > +{ > + return ctrl->write.buffer; > +} > + > +static inline uint32_t wr_ring_size(struct libvchan *ctrl) > +{ > + return (1 << ctrl->write.order); > +} > + > +static inline uint32_t rd_ring_size(struct libvchan *ctrl) > +{ > + return (1 << ctrl->read.order); > +} > + > +static inline void request_notify(struct libvchan *ctrl, uint8_t bit) > +{ > + uint8_t *notify = ctrl->is_server ? &ctrl->ring->cli_notify : &ctrl->ring->srv_notify; > + __sync_or_and_fetch(notify, bit); > +} > + > +static inline int send_notify(struct libvchan *ctrl, uint8_t bit) > +{ > + uint8_t *notify = ctrl->is_server ? &ctrl->ring->srv_notify : &ctrl->ring->cli_notify; > + uint8_t prev = __sync_fetch_and_and(notify, ~bit); > + if (prev & bit) > + return xc_evtchn_notify(ctrl->event, ctrl->event_port); > + else > + return 0; > +} > + > +/** > + * Get the amount of data ready to be read and enable notifications if needed. 
> + */ > +static inline int fast_get_data_ready(struct libvchan *ctrl, size_t request) > +{ > + int ready = rd_prod(ctrl) - rd_cons(ctrl); > + if (ready >= request) > + return ready; > + /* We plan to consume all data; please tell us if you send more */ > + request_notify(ctrl, VCHAN_NOTIFY_WRITE); > + /* > + * If the writer moved rd_prod after our read but before request, we > + * will not get notified even though the actual amount of data ready is > + * above request. Reread rd_prod to cover this case. > + */ > + return rd_prod(ctrl) - rd_cons(ctrl); > +} > + > +int libvchan_data_ready(struct libvchan *ctrl) > +{ > + /* Since this value is being used outside libvchan, request notification > + * when it changes > + */ > + request_notify(ctrl, VCHAN_NOTIFY_WRITE); > + return rd_prod(ctrl) - rd_cons(ctrl); > +} > + > +/** > + * Get the amount of buffer space available and enable notifications if needed. > + */ > +static inline int fast_get_buffer_space(struct libvchan *ctrl, size_t request) > +{ > + int ready = wr_ring_size(ctrl) - (wr_prod(ctrl) - wr_cons(ctrl)); > + if (ready >= request) > + return ready; > + /* We plan to fill the buffer; please tell us when you''ve read it */ > + request_notify(ctrl, VCHAN_NOTIFY_READ); > + /* > + * If the reader moved wr_cons after our read but before request, we > + * will not get notified even though the actual amount of buffer space > + * is above request. Reread wr_cons to cover this case. 
> + */ > + return wr_ring_size(ctrl) - (wr_prod(ctrl) - wr_cons(ctrl)); > +} > + > +int libvchan_buffer_space(struct libvchan *ctrl) > +{ > + /* Since this value is being used outside libvchan, request notification > + * when it changes > + */ > + request_notify(ctrl, VCHAN_NOTIFY_READ); > + return wr_ring_size(ctrl) - (wr_prod(ctrl) - wr_cons(ctrl)); > +} > + > +int libvchan_wait(struct libvchan *ctrl) > +{ > + int ret = xc_evtchn_pending(ctrl->event); > + if (ret < 0) > + return -1; > + xc_evtchn_unmask(ctrl->event, ret); > + return 0; > +} > + > +/** > + * returns -1 on error, or size on success > + */ > +static int do_send(struct libvchan *ctrl, const void *data, size_t size) > +{ > + int real_idx = wr_prod(ctrl) & (wr_ring_size(ctrl) - 1); > + int avail_contig = wr_ring_size(ctrl) - real_idx; > + if (VCHAN_DEBUG) { > + char metainfo[32]; > + struct iovec iov[2]; > + iov[0].iov_base = metainfo; > + iov[0].iov_len = snprintf(metainfo, 32, "vchan wr %d/%d", ctrl->other_domain_id, ctrl->device_number); > + iov[1].iov_base = (void *)data; > + iov[1].iov_len = size; > + writev(-1, iov, 2); > + } > + if (avail_contig > size) > + avail_contig = size; > + memcpy(wr_ring(ctrl) + real_idx, data, avail_contig); > + if (avail_contig < size) > + { > + // we rolled across the end of the ring > + memcpy(wr_ring(ctrl), data + avail_contig, size - avail_contig); > + } > + barrier(); // data must be in the ring prior to increment > + wr_prod(ctrl) += size; > + barrier(); // increment must happen prior to notify > + if (send_notify(ctrl, VCHAN_NOTIFY_WRITE)) > + return -1; > + return size; > +} > + > +/** > + * returns 0 if no buffer space is available, -1 on error, or size on success > + */ > +int libvchan_send(struct libvchan *ctrl, const void *data, size_t size) > +{ > + int avail; > + while (1) { > + if (!libvchan_is_open(ctrl)) > + return -1; > + avail = fast_get_buffer_space(ctrl, size); > + if (size <= avail) > + return do_send(ctrl, data, size); > + if (!ctrl->blocking) > 
+ return 0; > + if (size > wr_ring_size(ctrl)) > + return -1; > + if (libvchan_wait(ctrl)) > + return -1; > + } > +} > + > +int libvchan_write(struct libvchan *ctrl, const void *data, size_t size) > +{ > + int avail; > + if (!libvchan_is_open(ctrl)) > + return -1; > + if (ctrl->blocking) { > + size_t pos = 0; > + while (1) { > + avail = fast_get_buffer_space(ctrl, size - pos); > + if (pos + avail > size) > + avail = size - pos; > + if (avail) > + pos += do_send(ctrl, data + pos, avail); > + if (pos == size) > + return pos; > + if (libvchan_wait(ctrl)) > + return -1; > + if (!libvchan_is_open(ctrl)) > + return -1; > + } > + } else { > + avail = fast_get_buffer_space(ctrl, size); > + if (size > avail) > + size = avail; > + if (size == 0) > + return 0; > + return do_send(ctrl, data, size); > + } > +} > + > +static int do_recv(struct libvchan *ctrl, void *data, size_t size) > +{ > + int real_idx = rd_cons(ctrl) & (rd_ring_size(ctrl) - 1); > + int avail_contig = rd_ring_size(ctrl) - real_idx; > + if (avail_contig > size) > + avail_contig = size; > + barrier(); // data read must happen after rd_cons read > + memcpy(data, rd_ring(ctrl) + real_idx, avail_contig); > + if (avail_contig < size) > + { > + // we rolled across the end of the ring > + memcpy(data + avail_contig, rd_ring(ctrl), size - avail_contig); > + } > + rd_cons(ctrl) += size; > + if (VCHAN_DEBUG) { > + char metainfo[32]; > + struct iovec iov[2]; > + iov[0].iov_base = metainfo; > + iov[0].iov_len = snprintf(metainfo, 32, "vchan rd %d/%d", ctrl->other_domain_id, ctrl->device_number); > + iov[1].iov_base = data; > + iov[1].iov_len = size; > + writev(-1, iov, 2); > + } > + barrier(); // consumption must happen prior to notify of newly freed space > + if (send_notify(ctrl, VCHAN_NOTIFY_READ)) > + return -1; > + return size; > +} > + > +/** > + * reads exactly size bytes from the vchan. 
> + * returns 0 if insufficient data is available, -1 on error, or size on success > + */ > +int libvchan_recv(struct libvchan *ctrl, void *data, size_t size) > +{ > + while (1) { > + int avail = fast_get_data_ready(ctrl, size); > + if (size <= avail) > + return do_recv(ctrl, data, size); > + if (!libvchan_is_open(ctrl)) > + return -1; > + if (!ctrl->blocking) > + return 0; > + if (size > rd_ring_size(ctrl)) > + return -1; > + if (libvchan_wait(ctrl)) > + return -1; > + } > +} > + > +int libvchan_read(struct libvchan *ctrl, void *data, size_t size) > +{ > + while (1) { > + int avail = fast_get_data_ready(ctrl, size); > + if (avail && size > avail) > + size = avail; > + if (avail) > + return do_recv(ctrl, data, size); > + if (!libvchan_is_open(ctrl)) > + return -1; > + if (!ctrl->blocking) > + return 0; > + if (libvchan_wait(ctrl)) > + return -1; > + } > +} > + > +int libvchan_is_open(struct libvchan* ctrl) > +{ > + if (ctrl->is_server) > + return ctrl->server_persist ? 1 : ctrl->ring->cli_live; > + else > + return ctrl->ring->srv_live; > +} > + > +int libvchan_fd_for_select(struct libvchan *ctrl) > +{ > + return xc_evtchn_fd(ctrl->event); > +} > + > +void libvchan_close(struct libvchan *ctrl) > +{ > + if (!ctrl) > + return; > + if (ctrl->read.order >= PAGE_SHIFT) > + munmap(ctrl->read.buffer, 1 << ctrl->read.order); > + if (ctrl->write.order >= PAGE_SHIFT) > + munmap(ctrl->write.buffer, 1 << ctrl->write.order); > + if (ctrl->ring) { > + if (ctrl->is_server) { > + ctrl->ring->srv_live = 0; > + xc_gntshr_munmap(ctrl->gntshr, ctrl->ring, PAGE_SIZE); > + } else { > + ctrl->ring->cli_live = 0; > + xc_gnttab_munmap(ctrl->gnttab, ctrl->ring, PAGE_SIZE); > + } > + } > + if (ctrl->event) { > + if (ctrl->event_port >= 0 && ctrl->ring) > + xc_evtchn_notify(ctrl->event, ctrl->event_port); > + xc_evtchn_close(ctrl->event); > + } > + if (ctrl->is_server) { > + if (ctrl->gntshr) > + xc_gntshr_close(ctrl->gntshr); > + } else { > + if (ctrl->gnttab) > + 
xc_gnttab_close(ctrl->gnttab); > + } > + free(ctrl); > +} > diff --git a/tools/libvchan/libxenvchan.h b/tools/libvchan/libxenvchan.h > new file mode 100644 > index 0000000..c4a3ab9 > --- /dev/null > +++ b/tools/libvchan/libxenvchan.h > @@ -0,0 +1,173 @@ > +/** > + * @file > + * @section AUTHORS > + * > + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> > + * > + * Authors: > + * Rafal Wojtczuk <rafal@invisiblethingslab.com> > + * Daniel De Graaf <dgdegra@tycho.nsa.gov> > + * > + * @section LICENSE > + * > + * This library is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2 of the License, or (at your option) any later version. > + * > + * This library is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with this library; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + * > + * @section DESCRIPTION > + * > + * Originally borrowed from the Qubes OS Project, http://www.qubes-os.org, > + * this code has been substantially rewritten to use the gntdev and gntalloc > + * devices instead of raw MFNs and map_foreign_range. > + * > + * This is a library for inter-domain communication. A standard Xen ring > + * buffer is used, with a datagram-based interface built on top. 
The grant > + * reference and event channels are shared in XenStore under the path > + * /local/domain/<srv-id>/data/vchan/<cli-id>/<port>/{ring-ref,event-channel} > + * > + * The ring.h macros define an asymmetric interface to a shared data structure > + * that assumes all rings reside in a single contiguous memory space. This is > + * not suitable for vchan because the interface to the ring is symmetric except > + * for the setup. Unlike the producer-consumer rings defined in ring.h, the > + * sizes of the rings used in vchan are determined at execution time instead of > + * compile time, so the macros in ring.h cannot be used to access the rings. > + */ > + > +#include <xen/io/libvchan.h> > +#include <xen/sys/evtchn.h> > +#include <xenctrl.h> > + > +struct libvchan_ring { > + /* Pointer into the shared page. Offsets into buffer. */ > + struct ring_shared* shr; > + /* ring data; may be its own shared page(s) depending on order */ > + void* buffer; > + /** > + * The size of the ring is (1 << order); offsets wrap around when they > + * exceed this. This copy is required because we can't trust the order > + * in the shared page to remain constant. > + */ > + int order; > +}; > + > +/** > + * struct libvchan: control structure passed to all library calls > + */ > +struct libvchan { > + /* ID of the peer domain we communicate with */ > + int other_domain_id; > + /* "port" we communicate on (allows multiple vchans to exist in xenstore) */ > + int device_number; > + /* Mapping handle for shared ring page */ > + union { > + xc_gntshr *gntshr; /* for server */ > + xc_gnttab *gnttab; /* for client */ > + }; > + /* Pointer to shared ring page */ > + struct vchan_interface *ring; > + /* event channel interface */ > + xc_evtchn *event; > + uint32_t event_port; > + /* informative flags: are we acting as server? 
*/ > + int is_server:1; > + /* true if server remains active when client closes (allows reconnection) */ > + int server_persist:1; > + /* true if operations should block instead of returning 0 */ > + int blocking:1; > + /* communication rings */ > + struct libvchan_ring read, write; > +}; > + > +/** > + * Set up a vchan, including granting pages > + * @param logger Logger for libxc errors > + * @param domain The peer domain that will be connecting > + * @param devno A device number, used to identify this vchan in xenstore > + * @param read_min The minimum size (in bytes) of the server's read ring (left) > + * @param write_min The minimum size (in bytes) of the server's write ring (right) > + * @return The structure, or NULL in case of an error > + */ > +struct libvchan *libvchan_server_init(xentoollog_logger *logger, int domain, int devno, size_t read_min, size_t write_min); > +/** > + * Connect to an existing vchan. Note: you can reconnect to an existing vchan > + * safely; however, no locking is performed, so you must prevent multiple clients > + * from connecting to a single server. > + * > + * @param logger Logger for libxc errors > + * @param domain The peer domain to connect to > + * @param devno A device number, used to identify this vchan in xenstore > + * @return The structure, or NULL in case of an error > + */ > +struct libvchan *libvchan_client_init(xentoollog_logger *logger, int domain, int devno); > +/** > + * Close a vchan. This deallocates the vchan and attempts to free its > + * resources. The other side is notified of the close, but can still read any > + * data pending prior to the close. > + */ > +void libvchan_close(struct libvchan *ctrl); > + > +/** > + * Packet-based receive: always reads exactly $size bytes. 
> + * @param ctrl The vchan control structure > + * @param data Buffer for data that was read > + * @param size Size of the buffer and amount of data to read > + * @return -1 on error, 0 if nonblocking and insufficient data is available, or $size > + */ > +int libvchan_recv(struct libvchan *ctrl, void *data, size_t size); > +/** > + * Stream-based receive: reads as much data as possible. > + * @param ctrl The vchan control structure > + * @param data Buffer for data that was read > + * @param size Size of the buffer > + * @return -1 on error, otherwise the amount of data read (which may be zero if > + * the vchan is nonblocking) > + */ > +int libvchan_read(struct libvchan *ctrl, void *data, size_t size); > +/** > + * Packet-based send: send entire buffer if possible > + * @param ctrl The vchan control structure > + * @param data Buffer for data to send > + * @param size Size of the buffer and amount of data to send > + * @return -1 on error, 0 if nonblocking and insufficient space is available, or $size > + */ > +int libvchan_send(struct libvchan *ctrl, const void *data, size_t size); > +/** > + * Stream-based send: send as much data as possible. > + * @param ctrl The vchan control structure > + * @param data Buffer for data to send > + * @param size Size of the buffer > + * @return -1 on error, otherwise the amount of data sent (which may be zero if > + * the vchan is nonblocking) > + */ > +int libvchan_write(struct libvchan *ctrl, const void *data, size_t size); > +/** > + * Waits for reads or writes to unblock, or for a close > + */ > +int libvchan_wait(struct libvchan *ctrl); > +/** > + * Returns the event file descriptor for this vchan. When this FD is readable, > + * libvchan_wait() will not block, and the state of the vchan has changed since > + * the last invocation of libvchan_wait(). 
> + */ > +int libvchan_fd_for_select(struct libvchan *ctrl); > +/** > + * Query the state of the vchan shared page: > + * return 0 when one side has called libvchan_close() or crashed > + * return 1 when both sides are open > + * return 2 [server only] when no client has yet connected > + */ > +int libvchan_is_open(struct libvchan* ctrl); > +/** Amount of data ready to read, in bytes */ > +int libvchan_data_ready(struct libvchan *ctrl); > +/** Amount of data it is possible to send without blocking */ > +int libvchan_buffer_space(struct libvchan *ctrl); > diff --git a/tools/libvchan/node-select.c b/tools/libvchan/node-select.c > new file mode 100644 > index 0000000..ea1bfc6 > --- /dev/null > +++ b/tools/libvchan/node-select.c > @@ -0,0 +1,162 @@ > +/** > + * @file > + * @section AUTHORS > + * > + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> > + * > + * Authors: > + * Rafal Wojtczuk <rafal@invisiblethingslab.com> > + * Daniel De Graaf <dgdegra@tycho.nsa.gov> > + * > + * @section LICENSE > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2 of the License, or (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with this program; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + * > + * @section DESCRIPTION > + * > + * This is a test program for libvchan. Communications are bidirectional, > + * with either server (grant offeror) or client able to read and write. 
> + */ > + > +#include <stdlib.h> > +#include <stdio.h> > +#include <string.h> > +#include <unistd.h> > +#include <fcntl.h> > +#include <errno.h> > + > +#include <libxenvchan.h> > + > +void usage(char** argv) > +{ > + fprintf(stderr, "usage:\n" > + "\t%s [client|server] domainid nodeid [rbufsiz wbufsiz]\n", > + argv[0]); > + exit(1); > +} > + > +#define BUFSIZE 5000 > +char inbuf[BUFSIZE]; > +char outbuf[BUFSIZE]; > +int insiz = 0; > +int outsiz = 0; > +struct libvchan *ctrl = 0; > + > +void vchan_wr() { > + if (!insiz) > + return; > + int ret = libvchan_write(ctrl, inbuf, insiz); > + if (ret < 0) { > + fprintf(stderr, "vchan write failed\n"); > + exit(1); > + } > + if (ret > 0) { > + insiz -= ret; > + memmove(inbuf, inbuf + ret, insiz); > + } > +} > + > +void stdout_wr() { > + if (!outsiz) > + return; > + int ret = write(1, outbuf, outsiz); > + if (ret < 0 && errno != EAGAIN) > + exit(1); > + if (ret > 0) { > + outsiz -= ret; > + memmove(outbuf, outbuf + ret, outsiz); > + } > +} > + > +/** > + Simple libvchan application, both client and server. > + Both sides may write and read, both from the libvchan and from > + stdin/stdout (just like netcat). > +*/ > + > +int main(int argc, char **argv) > +{ > + int ret; > + int libvchan_fd; > + if (argc < 4) > + usage(argv); > + if (!strcmp(argv[1], "server")) { > + int rsiz = argc > 4 ? atoi(argv[4]) : 0; > + int wsiz = argc > 5 ? 
atoi(argv[5]) : 0; > + ctrl = libvchan_server_init(NULL, atoi(argv[2]), atoi(argv[3]), rsiz, wsiz); > + } else if (!strcmp(argv[1], "client")) > + ctrl = libvchan_client_init(NULL, atoi(argv[2]), atoi(argv[3])); > + else > + usage(argv); > + if (!ctrl) { > + perror("libvchan_*_init"); > + exit(1); > + } > + > + fcntl(0, F_SETFL, O_NONBLOCK); > + fcntl(1, F_SETFL, O_NONBLOCK); > + > + libvchan_fd = libvchan_fd_for_select(ctrl); > + for (;;) { > + fd_set rfds; > + fd_set wfds; > + FD_ZERO(&rfds); > + FD_ZERO(&wfds); > + if (insiz != BUFSIZE) > + FD_SET(0, &rfds); > + if (outsiz) > + FD_SET(1, &wfds); > + FD_SET(libvchan_fd, &rfds); > + ret = select(libvchan_fd + 1, &rfds, &wfds, NULL, NULL); > + if (ret < 0) { > + perror("select"); > + exit(1); > + } > + if (FD_ISSET(0, &rfds)) { > + ret = read(0, inbuf + insiz, BUFSIZE - insiz); > + if (ret < 0 && errno != EAGAIN) > + exit(1); > + if (ret == 0) { > + while (insiz) { > + vchan_wr(); > + libvchan_wait(ctrl); > + } > + return 0; > + } > + if (ret) > + insiz += ret; > + vchan_wr(); > + } > + if (FD_ISSET(libvchan_fd, &rfds)) { > + libvchan_wait(ctrl); > + vchan_wr(); > + } > + if (FD_ISSET(1, &wfds)) > + stdout_wr(); > + while (libvchan_data_ready(ctrl) && outsiz < BUFSIZE) { > + ret = libvchan_read(ctrl, outbuf + outsiz, BUFSIZE - outsiz); > + if (ret < 0) > + exit(1); > + outsiz += ret; > + stdout_wr(); > + } > + if (!libvchan_is_open(ctrl)) { > + fcntl(1, F_SETFL, 0); > + while (outsiz) > + stdout_wr(); > + return 0; > + } > + } > +} > diff --git a/tools/libvchan/node.c b/tools/libvchan/node.c > new file mode 100644 > index 0000000..6a9204c > --- /dev/null > +++ b/tools/libvchan/node.c > @@ -0,0 +1,169 @@ > +/** > + * @file > + * @section AUTHORS > + * > + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> > + * > + * Authors: > + * Rafal Wojtczuk <rafal@invisiblethingslab.com> > + * Daniel De Graaf <dgdegra@tycho.nsa.gov> > + * > + * @section LICENSE > + * > + * This program is free software; you can 
redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2 of the License, or (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with this program; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + * > + * @section DESCRIPTION > + * > + * This is a test program for libvchan. Communications are in one direction, > + * either server (grant offeror) to client or vice versa. > + */ > + > +#include <stdlib.h> > +#include <stdio.h> > +#include <string.h> > +#include <unistd.h> > +#include <time.h> > + > +#include <libxenvchan.h> > + > +int libvchan_write_all(struct libvchan *ctrl, char *buf, int size) > +{ > + int written = 0; > + int ret; > + while (written < size) { > + ret = libvchan_write(ctrl, buf + written, size - written); > + if (ret <= 0) { > + perror("write"); > + exit(1); > + } > + written += ret; > + } > + return size; > +} > + > +int write_all(int fd, char *buf, int size) > +{ > + int written = 0; > + int ret; > + while (written < size) { > + ret = write(fd, buf + written, size - written); > + if (ret <= 0) { > + perror("write"); > + exit(1); > + } > + written += ret; > + } > + return size; > +} > + > +void usage(char** argv) > +{ > + fprintf(stderr, "usage:\n" > + "%s [client|server] [read|write] domid nodeid\n", argv[0]); > + exit(1); > +} > + > +#define BUFSIZE 5000 > +char buf[BUFSIZE]; > +void reader(struct libvchan *ctrl) > +{ > + int size; > + for (;;) { > + size = rand() % (BUFSIZE - 1) + 1; > + size = libvchan_read(ctrl, 
buf, size); > + fprintf(stderr, "#"); > + if (size < 0) { > + perror("read vchan"); > + libvchan_close(ctrl); > + exit(1); > + } > + size = write_all(1, buf, size); > + if (size < 0) { > + perror("stdout write"); > + exit(1); > + } > + if (size == 0) { > + perror("write size=0?\n"); > + exit(1); > + } > + } > +} > + > +void writer(struct libvchan *ctrl) > +{ > + int size; > + for (;;) { > + size = rand() % (BUFSIZE - 1) + 1; > + size = read(0, buf, size); > + if (size < 0) { > + perror("read stdin"); > + libvchan_close(ctrl); > + exit(1); > + } > + if (size == 0) > + break; > + size = libvchan_write_all(ctrl, buf, size); > + fprintf(stderr, "#"); > + if (size < 0) { > + perror("vchan write"); > + exit(1); > + } > + if (size == 0) { > + perror("write size=0?\n"); > + exit(1); > + } > + } > +} > + > + > +/** > + Simple libvchan application, both client and server. > + One side does writing, the other side does reading; both from > + standard input/output fds. > +*/ > +int main(int argc, char **argv) > +{ > + int seed = time(0); > + struct libvchan *ctrl = 0; > + int wr = 0; > + if (argc < 4) > + usage(argv); > + if (!strcmp(argv[2], "read")) > + wr = 0; > + else if (!strcmp(argv[2], "write")) > + wr = 1; > + else > + usage(argv); > + if (!strcmp(argv[1], "server")) > + ctrl = libvchan_server_init(NULL, atoi(argv[3]), atoi(argv[4]), 0, 0); > + else if (!strcmp(argv[1], "client")) > + ctrl = libvchan_client_init(NULL, atoi(argv[3]), atoi(argv[4])); > + else > + usage(argv); > + if (!ctrl) { > + perror("libvchan_*_init"); > + exit(1); > + } > + ctrl->blocking = 1; > + > + srand(seed); > + fprintf(stderr, "seed=%d\n", seed); > + if (wr) > + writer(ctrl); > + else > + reader(ctrl); > + libvchan_close(ctrl); > + return 0; > +} > diff --git a/xen/include/public/io/libvchan.h b/xen/include/public/io/libvchan.h > new file mode 100644 > index 0000000..a3bf7cd > --- /dev/null > +++ b/xen/include/public/io/libvchan.h > @@ -0,0 +1,97 @@ > +/** > + * @file > + * @section AUTHORS > 
+ * > + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> > + * > + * Authors: > + * Rafal Wojtczuk <rafal@invisiblethingslab.com> > + * Daniel De Graaf <dgdegra@tycho.nsa.gov> > + * > + * @section LICENSE > + * > + * This library is free software; you can redistribute it and/or > + * modify it under the terms of the GNU Lesser General Public > + * License as published by the Free Software Foundation; either > + * version 2 of the License, or (at your option) any later version. > + * > + * This library is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * Lesser General Public License for more details. > + * > + * You should have received a copy of the GNU Lesser General Public > + * License along with this library; if not, write to the Free Software > + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA > + * > + * @section DESCRIPTION > + * > + * Originally borrowed from the Qubes OS Project, http://www.qubes-os.org, > + * this code has been substantially rewritten to use the gntdev and gntalloc > + * devices instead of raw MFNs and map_foreign_range. > + * > + * This is a library for inter-domain communication. A standard Xen ring > + * buffer is used, with a datagram-based interface built on top. The grant > + * reference and event channels are shared in XenStore under the path > + * /local/domain/<srv-id>/data/vchan/<cli-id>/<port>/{ring-ref,event-channel} > + * > + * The ring.h macros define an asymmetric interface to a shared data structure > + * that assumes all rings reside in a single contiguous memory space. This is > + * not suitable for vchan because the interface to the ring is symmetric except > + * for the setup. 
Unlike the producer-consumer rings defined in ring.h, the > + * size of the rings used in vchan are determined at execution time instead of > + * compile time, so the macros in ring.h cannot be used to access the rings. > + */ > + > +#include <stdint.h> > +#include <sys/types.h> > + > +struct ring_shared { > + uint32_t cons, prod; > +}; > + > +#define VCHAN_NOTIFY_WRITE 0x1 > +#define VCHAN_NOTIFY_READ 0x2 > + > +/** > + * vchan_interface: primary shared data structure > + */ > +struct vchan_interface { > + /** > + * Standard consumer/producer interface, one pair per buffer > + * left is client write, server read > + * right is client read, server write > + */ > + struct ring_shared left, right; > + /** > + * size of the rings, which determines their location > + * 10 - at offset 1024 in ring''s page > + * 11 - at offset 2048 in ring''s page > + * 12+ - uses 2^(N-12) grants to describe the multi-page ring > + * These should remain constant once the page is shared. > + * Only one of the two orders can be 10 (or 11). > + */ > + uint16_t left_order, right_order; > + /** > + * Shutdown detection: > + * 0: client (or server) has exited > + * 1: client (or server) is connected > + * 2: client has not yet connected > + */ > + uint8_t cli_live, srv_live; > + /** > + * Notification bits: > + * VCHAN_NOTIFY_WRITE: send notify when data is written > + * VCHAN_NOTIFY_READ: send notify when data is read (consumed) > + * cli_notify is used for the client to inform the server of its action > + */ > + uint8_t cli_notify, srv_notify; > + /** > + * Grant list: ordering is left, right. Must not extend into actual ring > + * or grow beyond the end of the initial shared page. > + * These should remain constant once the page is shared, to allow > + * for possible remapping by a client that restarts. 
> + */ > + uint32_t grants[0]; > +}; > + > -- > 1.7.6.2 >_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Sep-21 15:02 UTC
[Xen-devel] Re: [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify
On 09/21/2011 06:03 AM, Ian Campbell wrote:> On Mon, 2011-09-19 at 23:43 +0100, Daniel De Graaf wrote: >> @@ -116,4 +116,35 @@ struct ioctl_gntdev_set_max_grants { >> uint32_t count; >> }; >> >> +/* >> + * Sets up an unmap notification within the page, so that the other side can do >> + * cleanup if this side crashes. Required to implement cross-domain robust >> + * mutexes or close notification on communication channels. >> + * >> + * Each mapped page only supports one notification; multiple calls referring to >> + * the same page overwrite the previous notification. You must clear the >> + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it >> + * to occur. >> + */ >> +#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \ >> +_IOC(_IOC_NONE, ''G'', 7, sizeof(struct ioctl_gntdev_unmap_notify)) >> +struct ioctl_gntdev_unmap_notify { >> + /* IN parameters */ >> + /* Offset in the file descriptor for a byte within the page (same as >> + * used in mmap). > > I''m probably being thick but I don''t understand what this means, i.e. > what this thing is relative to.This is an offset that acts like a byte offset in the /dev/xen/gntdev device. Conceptually, if /dev/xen/evtchn implemented pwrite() then this offset would be the offset you would pass to modify your target byte. For example, if you use gntdev to map two pages, the first will be at offset 0 and the second at 4096; you would pass 4098 here to indicate the third byte of the second page. The offsets (0, 4096) are returned by the map-grant ioctls.>> If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to >> + * be cleared. Otherwise, it can be any byte in the page whose >> + * notification we are adjusting. >> + */ >> + uint64_t index; >> + /* Action(s) to take on unmap */ >> + uint32_t action; >> + /* Event channel to notify */ >> + uint32_t event_channel_port; > > evtchn_port_t ?Using that would require an include dependency on event_channel.h which is not exposed as a userspace-visible header. 
Linux''s evtchn.h also does not use evtchn_port_t (it uses unsigned int). Since this is just a direct copy of the header from the linux source tree, any changes really need to happen there first.>> +}; >> + >> +/* Clear (set to zero) the byte specified by index */ >> +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 >> +/* Send an interrupt on the indicated event channel */ >> +#define UNMAP_NOTIFY_SEND_EVENT 0x2 >> + >> #endif /* __LINUX_PUBLIC_GNTDEV_H__ */ >> diff --git a/tools/libxc/xc_gnttab.c b/tools/libxc/xc_gnttab.c >> index 4f55fce..3d3c63b 100644 >> --- a/tools/libxc/xc_gnttab.c >> +++ b/tools/libxc/xc_gnttab.c >> @@ -18,6 +18,7 @@ >> */ >> >> #include "xc_private.h" >> +#include <errno.h> >> >> int xc_gnttab_op(xc_interface *xch, int cmd, void * op, int op_size, int count) >> { >> @@ -174,6 +175,28 @@ void *xc_gnttab_map_domain_grant_refs(xc_gnttab *xcg, >> count, domid, refs, prot); >> } >> >> +void *xc_gnttab_map_grant_ref_notify(xc_gnttab *xcg, >> + uint32_t domid, >> + uint32_t ref, >> + uint32_t notify_offset, >> + evtchn_port_t notify_port, >> + int *notify_result) >> +{ >> + if (xcg->ops->u.gnttab.map_grant_ref_notify) >> + return xcg->ops->u.gnttab.map_grant_ref_notify(xcg, xcg->ops_handle, >> + domid, ref, notify_offset, notify_port, notify_result); >> + else { >> + void* area = xcg->ops->u.gnttab.map_grant_ref(xcg, xcg->ops_handle, >> + domid, ref, PROT_READ|PROT_WRITE); >> + if (area && notify_result) { >> + *notify_result = -1; >> + errno = ENOSYS; >> + } >> + return area; >> + } > > I think the new public interface is fine but do we really need a new > internal interface here? > > I think you can just add the notify_* arguments to the existing OSDEP > function and have those OS backends which don''t implement that feature > return ENOSYS if notify_offset != 0 (or ~0 or whatever invalid value > works). 
> > Why doesn't the *_notify variant take a prot argument? At least for the byte-clear portion of the notify, the page must be writable or the notify will not work. I suppose an event channel alone could be used for a read-only mapping, but normally the unmapping of a read-only mapping is not an important event - although I guess you could contrive a use for this type of notification, so there's no reason not to allow it. > I'd be tempted to do away with notify_result too -- if the caller asked > for notification and we fail to give that then we can cleanup and return > an error. If they want to try again without the notification then that's > up to them. The source of the error might be unclear, but this would make the interface cleaner. > >> +} >> + >> + >> int xc_gnttab_munmap(xc_gnttab *xcg, >> void *start_address, >> uint32_t count) >> diff --git a/tools/libxc/xc_linux_osdep.c b/tools/libxc/xc_linux_osdep.c >> index dca6718..3040cb6 100644 >> --- a/tools/libxc/xc_linux_osdep.c >> +++ b/tools/libxc/xc_linux_osdep.c >> @@ -613,6 +613,62 @@ static void *linux_gnttab_map_domain_grant_refs(xc_gnttab *xcg, xc_osdep_handle >> return do_gnttab_map_grant_refs(xcg, h, count, &domid, 0, refs, prot); >> } >> >> +static void *linux_gnttab_map_grant_ref_notify(xc_gnttab *xch, xc_osdep_handle h, >> + uint32_t domid, uint32_t ref, >> + uint32_t notify_offset, >> + evtchn_port_t notify_port, >> + int *notify_result) >> +{ >> + int fd = (int)h; >> + int rv = 0; >> + struct ioctl_gntdev_map_grant_ref map; >> + struct ioctl_gntdev_unmap_notify notify; >> + void *addr; >> + >> + map.count = 1; >> + map.refs[0].domid = domid; >> + map.refs[0].ref = ref; >> + >> + if ( ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &map) ) { >> + PERROR("xc_gnttab_map_grant_ref: ioctl MAP_GRANT_REF failed"); >> + return NULL; >> + } >> + >> + addr = mmap(NULL, XC_PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, map.index); >> + if ( addr == MAP_FAILED ) >> + { >> + int saved_errno = errno; >> + struct
ioctl_gntdev_unmap_grant_ref unmap_grant; >> + >> + PERROR("xc_gnttab_map_grant_ref: mmap failed"); >> + unmap_grant.index = map.index; >> + unmap_grant.count = 1; >> + ioctl(fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &unmap_grant); >> + errno = saved_errno; >> + return NULL; >> + } > > The non-notify variant handles EAGAIN, why doesn't this one need to do > so? The non-notify variant doesn't need to handle EAGAIN anymore (with modern kernels, at least... perhaps it should remain for older kernels). Also, do_gnttab_map_grant_refs does not handle EAGAIN. >> + >> + notify.index = map.index; >> + notify.action = 0; >> + if (notify_offset >= 0) { >> + notify.index += notify_offset; >> + notify.action |= UNMAP_NOTIFY_CLEAR_BYTE; >> + } >> + if (notify_port >= 0) { >> + notify.event_channel_port = notify_port; >> + notify.action |= UNMAP_NOTIFY_SEND_EVENT; >> + } >> + if (notify.action) >> + rv = ioctl(fd, IOCTL_GNTDEV_SET_UNMAP_NOTIFY, &notify); > > Is there a race if the other end (or this process) dies between the MAP > ioctl and here? > > Ian. > Technically it's a race, but it doesn't impact any reasonable use of the notification. The local process can't actually be using the shared page at this point, and the other side will not be certain that the map has actually succeeded until after the function returns (and it is notified in some way - libvchan changes the notify byte from 2->1 at this point). If the domain whose memory we are mapping crashes, this ioctl will succeed unless the event channel it refers to has already been invalidated - but either way, the notifications are now irrelevant as there is nobody to listen to them. -- Daniel De Graaf National Security Agency _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Campbell
2011-Sep-21 15:25 UTC
[Xen-devel] Re: [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify
On Wed, 2011-09-21 at 16:02 +0100, Daniel De Graaf wrote:> On 09/21/2011 06:03 AM, Ian Campbell wrote: > > On Mon, 2011-09-19 at 23:43 +0100, Daniel De Graaf wrote: > >> @@ -116,4 +116,35 @@ struct ioctl_gntdev_set_max_grants { > >> uint32_t count; > >> }; > >> > >> +/* > >> + * Sets up an unmap notification within the page, so that the other side can do > >> + * cleanup if this side crashes. Required to implement cross-domain robust > >> + * mutexes or close notification on communication channels. > >> + * > >> + * Each mapped page only supports one notification; multiple calls referring to > >> + * the same page overwrite the previous notification. You must clear the > >> + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it > >> + * to occur. > >> + */ > >> +#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \ > >> +_IOC(_IOC_NONE, ''G'', 7, sizeof(struct ioctl_gntdev_unmap_notify)) > >> +struct ioctl_gntdev_unmap_notify { > >> + /* IN parameters */ > >> + /* Offset in the file descriptor for a byte within the page (same as > >> + * used in mmap). > > > > I''m probably being thick but I don''t understand what this means, i.e. > > what this thing is relative to. > > This is an offset that acts like a byte offset in the /dev/xen/gntdev device. > Conceptually, if /dev/xen/evtchn implemented pwrite() then this offset would > be the offset you would pass to modify your target byte. > > For example, if you use gntdev to map two pages, the first will be at offset > 0 and the second at 4096; you would pass 4098 here to indicate the third byte > of the second page. The offsets (0, 4096) are returned by the map-grant ioctls.Hmm. I think I was confused because it was an offset into the file rather than, say, a virtual address. When I map a page how do I know what the offset of it is wrt the file descriptor? DO I just have to remember how many pages I mapped an *4096?> > >> If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to > >> + * be cleared. 
Otherwise, it can be any byte in the page whose > >> + * notification we are adjusting. > >> + */ > >> + uint64_t index; > >> + /* Action(s) to take on unmap */ > >> + uint32_t action; > >> + /* Event channel to notify */ > >> + uint32_t event_channel_port; > > > > evtchn_port_t ? > > Using that would require an include dependency on event_channel.h which is > not exposed as a userspace-visible header. Linux''s evtchn.h also does not > use evtchn_port_t (it uses unsigned int). > > Since this is just a direct copy of the header from the linux source tree, > any changes really need to happen there first.OK, that''s fine as it is then.> > >> +}; > >> + > >> +/* Clear (set to zero) the byte specified by index */ > >> +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 > >> +/* Send an interrupt on the indicated event channel */ > >> +#define UNMAP_NOTIFY_SEND_EVENT 0x2 > >> + > >> #endif /* __LINUX_PUBLIC_GNTDEV_H__ */ > >> diff --git a/tools/libxc/xc_gnttab.c b/tools/libxc/xc_gnttab.c > >> index 4f55fce..3d3c63b 100644 > >> --- a/tools/libxc/xc_gnttab.c > >> +++ b/tools/libxc/xc_gnttab.c > >> @@ -18,6 +18,7 @@ > >> */ > >> > >> #include "xc_private.h" > >> +#include <errno.h> > >> > >> int xc_gnttab_op(xc_interface *xch, int cmd, void * op, int op_size, int count) > >> { > >> @@ -174,6 +175,28 @@ void *xc_gnttab_map_domain_grant_refs(xc_gnttab *xcg, > >> count, domid, refs, prot); > >> } > >> > >> +void *xc_gnttab_map_grant_ref_notify(xc_gnttab *xcg, > >> + uint32_t domid, > >> + uint32_t ref, > >> + uint32_t notify_offset, > >> + evtchn_port_t notify_port, > >> + int *notify_result) > >> +{ > >> + if (xcg->ops->u.gnttab.map_grant_ref_notify) > >> + return xcg->ops->u.gnttab.map_grant_ref_notify(xcg, xcg->ops_handle, > >> + domid, ref, notify_offset, notify_port, notify_result); > >> + else { > >> + void* area = xcg->ops->u.gnttab.map_grant_ref(xcg, xcg->ops_handle, > >> + domid, ref, PROT_READ|PROT_WRITE); > >> + if (area && notify_result) { > >> + *notify_result = -1; > >> + errno 
= ENOSYS; > >> + } > >> + return area; > >> + } > > > > I think the new public interface is fine but do we really need a new > > internal interface here? > > > > I think you can just add the notify_* arguments to the existing OSDEP > > function and have those OS backends which don''t implement that feature > > return ENOSYS if notify_offset != 0 (or ~0 or whatever invalid value > > works). > > > > Why doesn''t the *_notify variant take a prot argument? > > At least for the byte-clear portion of the notify, the page must be writable > or the notify will not work. I suppose an event channel alone could be used > for a read-only mapping, but normally the unmapping of a read-only mapping is > not an important event - although I guess you could contrive a use for this > type of notificaiton, so there''s no reason not to allow it.If you combine this the two functions then returning EINVAL for attempts to map without PROT_WRITE (or whatever perm is necessary) would be reasonable IMHO.> > I''d be tempted to do away with notify_result too -- if the caller asked > > for notification and we fail to give that then we can cleanup and return > > an error. If they want to try again without the notification then that''s > > up to them. > > The source of the error might be unclear, but this would make the interface > cleaner. 
> > > > >> +} > >> + > >> + > >> int xc_gnttab_munmap(xc_gnttab *xcg, > >> void *start_address, > >> uint32_t count) > >> diff --git a/tools/libxc/xc_linux_osdep.c b/tools/libxc/xc_linux_osdep.c > >> index dca6718..3040cb6 100644 > >> --- a/tools/libxc/xc_linux_osdep.c > >> +++ b/tools/libxc/xc_linux_osdep.c > >> @@ -613,6 +613,62 @@ static void *linux_gnttab_map_domain_grant_refs(xc_gnttab *xcg, xc_osdep_handle > >> return do_gnttab_map_grant_refs(xcg, h, count, &domid, 0, refs, prot); > >> } > >> > >> +static void *linux_gnttab_map_grant_ref_notify(xc_gnttab *xch, xc_osdep_handle h, > >> + uint32_t domid, uint32_t ref, > >> + uint32_t notify_offset, > >> + evtchn_port_t notify_port, > >> + int *notify_result) > >> +{ > >> + int fd = (int)h; > >> + int rv = 0; > >> + struct ioctl_gntdev_map_grant_ref map; > >> + struct ioctl_gntdev_unmap_notify notify; > >> + void *addr; > >> + > >> + map.count = 1; > >> + map.refs[0].domid = domid; > >> + map.refs[0].ref = ref; > >> + > >> + if ( ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &map) ) { > >> + PERROR("xc_gnttab_map_grant_ref: ioctl MAP_GRANT_REF failed"); > >> + return NULL; > >> + } > >> + > >> + addr = mmap(NULL, XC_PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, map.index); > >> + if ( addr == MAP_FAILED ) > >> + { > >> + int saved_errno = errno; > >> + struct ioctl_gntdev_unmap_grant_ref unmap_grant; > >> + > >> + PERROR("xc_gnttab_map_grant_ref: mmap failed"); > >> + unmap_grant.index = map.index; > >> + unmap_grant.count = 1; > >> + ioctl(fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &unmap_grant); > >> + errno = saved_errno; > >> + return NULL; > >> + } > > > > The non-notify variant handles EAGAIN, why doesn''t this one need to do > > so? > > The non-notify variant doesn''t need to handle EAGAIN anymore (with modern > kernels, at least... perhaps it should remain for older kernels). Also, > do_gnttab_map_grant_refs does not handle EAGAIN.OK I guess that is fine (although if you combine them as I suggest it comes back?) 
I hadn't noticed that we have both map_grant_ref and map_grant_refs, that seems like an area which could be cleaned up. (not saying you should, just noticing it) > > >> + > >> + notify.index = map.index; > >> + notify.action = 0; > >> + if (notify_offset >= 0) { > >> + notify.index += notify_offset; > >> + notify.action |= UNMAP_NOTIFY_CLEAR_BYTE; > >> + } > >> + if (notify_port >= 0) { > >> + notify.event_channel_port = notify_port; > >> + notify.action |= UNMAP_NOTIFY_SEND_EVENT; > >> + } > >> + if (notify.action) > >> + rv = ioctl(fd, IOCTL_GNTDEV_SET_UNMAP_NOTIFY, &notify); > > > > Is there a race if the other end (or this process) dies between the MAP > > ioctl and here? > > > > Ian. > > > > Technically it's a race, but it doesn't impact any reasonable use of the > notification. The local process can't actually be using the shared page > at this point, and the other side will not be certain that the map has > actually succeeded until after the function returns (and it is notified > in some way - libvchan changes the notify byte from 2->1 at this point). > > If the domain whose memory we are mapping crashes, this ioctl will succeed > unless the event channel it refers to has already been invalidated - but > either way, the notifications are now irrelevant as there is nobody to > listen to them. OK. Thanks, Ian. >_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Sep-21 16:31 UTC
[Xen-devel] Re: [PATCH 3/3] libvchan: interdomain communications library
On 09/21/2011 06:53 AM, Ian Campbell wrote: > On Mon, 2011-09-19 at 23:43 +0100, Daniel De Graaf wrote: >> This library implements a bidirectional communication interface between >> applications in different domains, similar to unix sockets. Data can be >> sent using the byte-oriented libvchan_read/libvchan_write or the >> packet-oriented libvchan_recv/libvchan_send. >> >> Channel setup is done using a client-server model; domain IDs and a port >> number must be negotiated prior to initialization. The server allocates >> memory for the shared pages and determines the sizes of the >> communication rings (which may span multiple pages, although the default >> places rings and control within a single page). >> >> With properly sized rings, testing has shown that this interface >> provides speed comparable to pipes within a single Linux domain; it is >> significantly faster than network-based communication. >> >> Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> > > I only skimmed this one I had a few minor thoughts below but really I'm > pretty much OK for it to go in (modulo any fallout from comments on > patches 1+2). > > Definite Bonus Points for the doxygen/kernel doc commentary in the > headers, which tool parses them? (a few comments in the code itself seem > to have the "/**" marker but not the rest of the syntax). I think doxygen parses them, but I haven't personally run doxygen to verify that it works as expected. > You changed the library name to libxenvchan but not the path to the > source nor the API names? I suppose backwards compatibility with the existing API has already been killed, so there's no reason not to change the names - it does make everything more consistent (and easier to grep for). >> +static int init_gnt_srv(struct libvchan *ctrl) >> +{ >> + int pages_left = ctrl->read.order >= PAGE_SHIFT ? 1 << (ctrl->read.order - PAGE_SHIFT) : 0; > > Here you do >= PAGE_SHIFT but on the out_unmap_left path you do > 11.
> > (am I right that left == server and right == client in the libvchan > terminology?) > From the public/io/libvchan.h header: * Standard consumer/producer interface, one pair per buffer * left is client write, server read * right is client read, server write So the client is on the left, if you assume the writer owns the buffer. >> + if (ctrl->read.order == 10) { >> + ctrl->read.buffer = ((void*)ctrl->ring) + 1024; >> + } else if (ctrl->read.order == 11) { >> + ctrl->read.buffer = ((void*)ctrl->ring) + 2048; >> + } else { >> + ctrl->read.buffer = xc_gntshr_share_pages(ctrl->gntshr, ctrl->other_domain_id, >> + pages_left, ctrl->ring->grants, 1); >> + if (!ctrl->read.buffer) >> + goto out_ring; >> + } > > switch (...read.order)? > > In other places you have MAX_LARGE_RING/MAX_SMALL_RING etc, I think > using SMALL/LARGE_RING_ORDER instead of 10 and 11 seems like a good > idea. > > Similarly using LARGE/SMALL_RING_OFFSET instead of 1024/2048 would help > clarity. > >> + if (ctrl->write.order < 10 || ctrl->write.order > 24) >> + goto out_unmap_ring; > > What is the significance of 2^24? > Actually, this should be 20 to match MAX_RING_SIZE in init.c; that number is derived from 1024 bytes of grant data. An order of 22 will always cause the list of grants to overrun the primary shared page; an order of 21 on both sides will also cause this, and can also cause the grant list to overlap the LARGE_RING area. From testing, the performance gain from larger rings begins to drop off before 2^20 (although this may depend on the size of your L2/L3 cache). Also, gntalloc is restricted to 1024 pages by default (this can be adjusted via sysfs or a module parameter). >> + >> +// find xenstore entry >> + snprintf(buf, sizeof buf, "/local/domain/%d/data/vchan/%s/%d/ring-ref", >> + ctrl->other_domain_id, domid_str, ctrl->device_number); > > I wonder if the base of this path (up to and including "%s/%d"?) ought > to be caller provided?
My thinking is that the rendezvous between client > and server is out of band and the path is really an element (or even the > total encoding) of that OOB communication. > > It would also push the selection of xs location to be pushed up into the > application which also defines the protocol. For example I might want to > build a pv protocol with this library which is supported by the > toolstack and therefore want to put my stuff under devices etc or in any > other protocol specific xs location. The wart I previously mentioned wrt > using the "data" directory would then be an application wart (which I > think is ok) rather than baked into the libraries. > > Ian. >Allowing the caller to specify the xenstore path would make the interface more flexible, and also removes the arbitrary port numbers which already seem likely to collide. -- Daniel De Graaf National Security Agency _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Sep-21 17:07 UTC
[Xen-devel] Re: [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify
On 09/21/2011 11:25 AM, Ian Campbell wrote:> On Wed, 2011-09-21 at 16:02 +0100, Daniel De Graaf wrote: >> On 09/21/2011 06:03 AM, Ian Campbell wrote: >>> On Mon, 2011-09-19 at 23:43 +0100, Daniel De Graaf wrote: >>>> @@ -116,4 +116,35 @@ struct ioctl_gntdev_set_max_grants { >>>> uint32_t count; >>>> }; >>>> >>>> +/* >>>> + * Sets up an unmap notification within the page, so that the other side can do >>>> + * cleanup if this side crashes. Required to implement cross-domain robust >>>> + * mutexes or close notification on communication channels. >>>> + * >>>> + * Each mapped page only supports one notification; multiple calls referring to >>>> + * the same page overwrite the previous notification. You must clear the >>>> + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it >>>> + * to occur. >>>> + */ >>>> +#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \ >>>> +_IOC(_IOC_NONE, ''G'', 7, sizeof(struct ioctl_gntdev_unmap_notify)) >>>> +struct ioctl_gntdev_unmap_notify { >>>> + /* IN parameters */ >>>> + /* Offset in the file descriptor for a byte within the page (same as >>>> + * used in mmap). >>> >>> I''m probably being thick but I don''t understand what this means, i.e. >>> what this thing is relative to. >> >> This is an offset that acts like a byte offset in the /dev/xen/gntdev device. >> Conceptually, if /dev/xen/evtchn implemented pwrite() then this offset would >> be the offset you would pass to modify your target byte. >> >> For example, if you use gntdev to map two pages, the first will be at offset >> 0 and the second at 4096; you would pass 4098 here to indicate the third byte >> of the second page. The offsets (0, 4096) are returned by the map-grant ioctls. > > Hmm. I think I was confused because it was an offset into the file > rather than, say, a virtual address. > > When I map a page how do I know what the offset of it is wrt the file > descriptor? 
DO I just have to remember how many pages I mapped an *> 4096?You had the offset at the time you mapped it, because that''s what you passed in the offset parameter to mmap(). Just don''t lose the number :) The xen interfaces do forget the number: for gntdev, they recover it via IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR and for gntalloc, it is not needed because the pages are unhooked as part of the map process so the offset is no longer valid. This technique is now possible for gntdev, but older kernels will fail the unmap ioctl and may crash if you close anyway.>> >>>> If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to >>>> + * be cleared. Otherwise, it can be any byte in the page whose >>>> + * notification we are adjusting. >>>> + */ >>>> + uint64_t index; >>>> + /* Action(s) to take on unmap */ >>>> + uint32_t action; >>>> + /* Event channel to notify */ >>>> + uint32_t event_channel_port; >>> >>> evtchn_port_t ? >> >> Using that would require an include dependency on event_channel.h which is >> not exposed as a userspace-visible header. Linux''s evtchn.h also does not >> use evtchn_port_t (it uses unsigned int). >> >> Since this is just a direct copy of the header from the linux source tree, >> any changes really need to happen there first. > > OK, that''s fine as it is then. 
> >> >>>> +}; >>>> + >>>> +/* Clear (set to zero) the byte specified by index */ >>>> +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 >>>> +/* Send an interrupt on the indicated event channel */ >>>> +#define UNMAP_NOTIFY_SEND_EVENT 0x2 >>>> + >>>> #endif /* __LINUX_PUBLIC_GNTDEV_H__ */ >>>> diff --git a/tools/libxc/xc_gnttab.c b/tools/libxc/xc_gnttab.c >>>> index 4f55fce..3d3c63b 100644 >>>> --- a/tools/libxc/xc_gnttab.c >>>> +++ b/tools/libxc/xc_gnttab.c >>>> @@ -18,6 +18,7 @@ >>>> */ >>>> >>>> #include "xc_private.h" >>>> +#include <errno.h> >>>> >>>> int xc_gnttab_op(xc_interface *xch, int cmd, void * op, int op_size, int count) >>>> { >>>> @@ -174,6 +175,28 @@ void *xc_gnttab_map_domain_grant_refs(xc_gnttab *xcg, >>>> count, domid, refs, prot); >>>> } >>>> >>>> +void *xc_gnttab_map_grant_ref_notify(xc_gnttab *xcg, >>>> + uint32_t domid, >>>> + uint32_t ref, >>>> + uint32_t notify_offset, >>>> + evtchn_port_t notify_port, >>>> + int *notify_result) >>>> +{ >>>> + if (xcg->ops->u.gnttab.map_grant_ref_notify) >>>> + return xcg->ops->u.gnttab.map_grant_ref_notify(xcg, xcg->ops_handle, >>>> + domid, ref, notify_offset, notify_port, notify_result); >>>> + else { >>>> + void* area = xcg->ops->u.gnttab.map_grant_ref(xcg, xcg->ops_handle, >>>> + domid, ref, PROT_READ|PROT_WRITE); >>>> + if (area && notify_result) { >>>> + *notify_result = -1; >>>> + errno = ENOSYS; >>>> + } >>>> + return area; >>>> + } >>> >>> I think the new public interface is fine but do we really need a new >>> internal interface here? >>> >>> I think you can just add the notify_* arguments to the existing OSDEP >>> function and have those OS backends which don''t implement that feature >>> return ENOSYS if notify_offset != 0 (or ~0 or whatever invalid value >>> works). >>> >>> Why doesn''t the *_notify variant take a prot argument? >> >> At least for the byte-clear portion of the notify, the page must be writable >> or the notify will not work. 
I suppose an event channel alone could be used >> for a read-only mapping, but normally the unmapping of a read-only mapping is >> not an important event - although I guess you could contrive a use for this >> type of notificaiton, so there''s no reason not to allow it. > > If you combine this the two functions then returning EINVAL for attempts > to map without PROT_WRITE (or whatever perm is necessary) would be > reasonable IMHO. >The ioctl already prevents you from requesting the impossible, so this should just work.>>> I''d be tempted to do away with notify_result too -- if the caller asked >>> for notification and we fail to give that then we can cleanup and return >>> an error. If they want to try again without the notification then that''s >>> up to them. >> >> The source of the error might be unclear, but this would make the interface >> cleaner. >> >>> >>>> +} >>>> + >>>> + >>>> int xc_gnttab_munmap(xc_gnttab *xcg, >>>> void *start_address, >>>> uint32_t count) >>>> diff --git a/tools/libxc/xc_linux_osdep.c b/tools/libxc/xc_linux_osdep.c >>>> index dca6718..3040cb6 100644 >>>> --- a/tools/libxc/xc_linux_osdep.c >>>> +++ b/tools/libxc/xc_linux_osdep.c >>>> @@ -613,6 +613,62 @@ static void *linux_gnttab_map_domain_grant_refs(xc_gnttab *xcg, xc_osdep_handle >>>> return do_gnttab_map_grant_refs(xcg, h, count, &domid, 0, refs, prot); >>>> } >>>> >>>> +static void *linux_gnttab_map_grant_ref_notify(xc_gnttab *xch, xc_osdep_handle h, >>>> + uint32_t domid, uint32_t ref, >>>> + uint32_t notify_offset, >>>> + evtchn_port_t notify_port, >>>> + int *notify_result) >>>> +{ >>>> + int fd = (int)h; >>>> + int rv = 0; >>>> + struct ioctl_gntdev_map_grant_ref map; >>>> + struct ioctl_gntdev_unmap_notify notify; >>>> + void *addr; >>>> + >>>> + map.count = 1; >>>> + map.refs[0].domid = domid; >>>> + map.refs[0].ref = ref; >>>> + >>>> + if ( ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &map) ) { >>>> + PERROR("xc_gnttab_map_grant_ref: ioctl MAP_GRANT_REF failed"); >>>> + return 
NULL; >>>> + } >>>> + >>>> + addr = mmap(NULL, XC_PAGE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, map.index); >>>> + if ( addr == MAP_FAILED ) >>>> + { >>>> + int saved_errno = errno; >>>> + struct ioctl_gntdev_unmap_grant_ref unmap_grant; >>>> + >>>> + PERROR("xc_gnttab_map_grant_ref: mmap failed"); >>>> + unmap_grant.index = map.index; >>>> + unmap_grant.count = 1; >>>> + ioctl(fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &unmap_grant); >>>> + errno = saved_errno; >>>> + return NULL; >>>> + } >>> >>> The non-notify variant handles EAGAIN, why doesn''t this one need to do >>> so? >> >> The non-notify variant doesn''t need to handle EAGAIN anymore (with modern >> kernels, at least... perhaps it should remain for older kernels). Also, >> do_gnttab_map_grant_refs does not handle EAGAIN. > > OK I guess that is fine (although if you combine them as I suggest it > comes back?) > > I hadn''t noticed that we have both map_gratn_ref and map_grant_refs, > that seems like an area which could be cleaned up. (not saying you > should, just noticing it)Since I''m already rewriting the osdep layer functions, I think I can replace all 3-4 existing map functions with a single function. It looks like the current 2.6.32.x xen kernels also aren''t returning EAGAIN, so I''m unsure as to why this support was added. The commit in question is 20689:fe42b16855aa by Grzegorz Milos (committed by Keir Fraser), but I don''t see any discussion on xen-devel for it. 
It's also unclear why repeating the request every millisecond in an infinite loop is better than letting the caller handle an -EAGAIN. >> >>>> + >>>> + notify.index = map.index; >>>> + notify.action = 0; >>>> + if (notify_offset >= 0) { >>>> + notify.index += notify_offset; >>>> + notify.action |= UNMAP_NOTIFY_CLEAR_BYTE; >>>> + } >>>> + if (notify_port >= 0) { >>>> + notify.event_channel_port = notify_port; >>>> + notify.action |= UNMAP_NOTIFY_SEND_EVENT; >>>> + } >>>> + if (notify.action) >>>> + rv = ioctl(fd, IOCTL_GNTDEV_SET_UNMAP_NOTIFY, &notify); >>> >>> Is there a race if the other end (or this process) dies between the MAP >>> ioctl and here? >>> >>> Ian. >>> >> >> Technically it's a race, but it doesn't impact any reasonable use of the >> notification. The local process can't actually be using the shared page >> at this point, and the other side will not be certain that the map has >> actually succeeded until after the function returns (and it is notified >> in some way - libvchan changes the notify byte from 2->1 at this point). >> >> If the domain whose memory we are mapping crashes, this ioctl will succeed >> unless the event channel it refers to has already been invalidated - but >> either way, the notifications are now irrelevant as there is nobody to >> listen to them. > > OK. > > Thanks, > Ian. > >> > > >-- Daniel De Graaf National Security Agency _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Campbell
2011-Sep-22 08:18 UTC
[Xen-devel] Re: [PATCH 3/3] libvchan: interdomain communications library
On Wed, 2011-09-21 at 17:31 +0100, Daniel De Graaf wrote: > On 09/21/2011 06:53 AM, Ian Campbell wrote: > > On Mon, 2011-09-19 at 23:43 +0100, Daniel De Graaf wrote: > >> This library implements a bidirectional communication interface between > >> applications in different domains, similar to unix sockets. Data can be > >> sent using the byte-oriented libvchan_read/libvchan_write or the > >> packet-oriented libvchan_recv/libvchan_send. > >> > >> Channel setup is done using a client-server model; domain IDs and a port > >> number must be negotiated prior to initialization. The server allocates > >> memory for the shared pages and determines the sizes of the > >> communication rings (which may span multiple pages, although the default > >> places rings and control within a single page). > >> > >> With properly sized rings, testing has shown that this interface > >> provides speed comparable to pipes within a single Linux domain; it is > >> significantly faster than network-based communication. > >> > >> Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> > > > > I only skimmed this one I had a few minor thoughts below but really I'm > > pretty much OK for it to go in (modulo any fallout from comments on > > patches 1+2). > > > > Definite Bonus Points for the doxygen/kernel doc commentary in the > > headers, which tool parses them? (a few comments in the code itself seem > > to have the "/**" marker but not the rest of the syntax). > > I think doxygen parses them, but I haven't personally run doxygen to > verify that it works as expected. That's ok, just having the comments at all is much appreciated! > >> +static int init_gnt_srv(struct libvchan *ctrl) > >> +{ > >> + int pages_left = ctrl->read.order >= PAGE_SHIFT ? 1 << (ctrl->read.order - PAGE_SHIFT) : 0; > > > > Here you do >= PAGE_SHIFT but on the out_unmap_left path you do > 11. > > > > (am I right that left == server and right == client in the libvchan > > terminology?) 
> > > > From the public/io/libvchan.h header: > * Standard consumer/producer interface, one pair per buffer > * left is client write, server read > * right is client read, server write > > So the client is on the left, if you assume the writer owns the buffer. Heh, I guess having praised the docs I should read them ;-) > > What is the significance of 2^24? > > > > Actually, this should be 20 to match MAX_RING_SIZE in init.c; OK, then I think MAX_RING_SIZE should be in a header and reused here instead of a hard-coded 20 or 24. > that number > is derived from 1024 bytes of grant data. An order of 22 will always cause > the list of grants to overrun the primary shared page; an order of 21 on > both sides will also cause this, and can also cause the grant list to overlap > the LARGE_RING area. From testing, the performance gain from larger rings > begins to drop off before 2^20 (although this may depend on the size of > your L2/L3 cache). Also, gntalloc is restricted to 1024 pages by default > (this can be adjusted via sysfs or a module parameter). Makes sense. [...] > Allowing the caller to specify the xenstore path would make the interface > more flexible, and also removes the arbitrary port numbers which already > seem likely to collide. Agreed. Ian. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Campbell
2011-Sep-22 08:32 UTC
[Xen-devel] Re: [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify
On Wed, 2011-09-21 at 18:07 +0100, Daniel De Graaf wrote:> On 09/21/2011 11:25 AM, Ian Campbell wrote:> > When I map a page how do I know what the offset of it is wrt the file > > descriptor? DO I just have to remember how many pages I mapped an *> > 4096? > > You had the offset at the time you mapped it, because that''s what you > passed in the offset parameter to mmap(). Just don''t lose the number :)So I guess my followup question is where does the number I pass to mmap come from... /me scrobbles in the code. Aha, so it is an output from the gntdev/gntalloc ioctls. So how about: /* IN parameters */ /* * Offset in the file descriptor for a byte within the page. This offset * is the result of the IOCTL_GNTDEV_MAP_GRANT_REF and is the same as * is used with mmap(). */> > > >> > >>>> +}; > >>>> + > >>>> +/* Clear (set to zero) the byte specified by index */ > >>>> +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 > >>>> +/* Send an interrupt on the indicated event channel */ > >>>> +#define UNMAP_NOTIFY_SEND_EVENT 0x2 > >>>> + > >>>> #endif /* __LINUX_PUBLIC_GNTDEV_H__ */ > >>>> diff --git a/tools/libxc/xc_gnttab.c b/tools/libxc/xc_gnttab.c > >>>> index 4f55fce..3d3c63b 100644 > >>>> --- a/tools/libxc/xc_gnttab.c > >>>> +++ b/tools/libxc/xc_gnttab.c > >>>> @@ -18,6 +18,7 @@ > >>>> */ > >>>> > >>>> #include "xc_private.h" > >>>> +#include <errno.h> > >>>> > >>>> int xc_gnttab_op(xc_interface *xch, int cmd, void * op, int op_size, int count) > >>>> { > >>>> @@ -174,6 +175,28 @@ void *xc_gnttab_map_domain_grant_refs(xc_gnttab *xcg, > >>>> count, domid, refs, prot); > >>>> } > >>>> > >>>> +void *xc_gnttab_map_grant_ref_notify(xc_gnttab *xcg, > >>>> + uint32_t domid, > >>>> + uint32_t ref, > >>>> + uint32_t notify_offset, > >>>> + evtchn_port_t notify_port, > >>>> + int *notify_result) > >>>> +{ > >>>> + if (xcg->ops->u.gnttab.map_grant_ref_notify) > >>>> + return xcg->ops->u.gnttab.map_grant_ref_notify(xcg, xcg->ops_handle, > >>>> + domid, ref, notify_offset, notify_port, 
notify_result); > >>>> + else { > >>>> + void* area = xcg->ops->u.gnttab.map_grant_ref(xcg, xcg->ops_handle, > >>>> + domid, ref, PROT_READ|PROT_WRITE); > >>>> + if (area && notify_result) { > >>>> + *notify_result = -1; > >>>> + errno = ENOSYS; > >>>> + } > >>>> + return area; > >>>> + } > >>> > >>> I think the new public interface is fine but do we really need a new > >>> internal interface here? > >>> > >>> I think you can just add the notify_* arguments to the existing OSDEP > >>> function and have those OS backends which don''t implement that feature > >>> return ENOSYS if notify_offset != 0 (or ~0 or whatever invalid value > >>> works). > >>> > >>> Why doesn''t the *_notify variant take a prot argument? > >> > >> At least for the byte-clear portion of the notify, the page must be writable > >> or the notify will not work. I suppose an event channel alone could be used > >> for a read-only mapping, but normally the unmapping of a read-only mapping is > >> not an important event - although I guess you could contrive a use for this > >> type of notificaiton, so there''s no reason not to allow it. > > > > If you combine this the two functions then returning EINVAL for attempts > > to map without PROT_WRITE (or whatever perm is necessary) would be > > reasonable IMHO. > > > > The ioctl already prevents you from requesting the impossible, so this should > just work.Even better. I see the check in the gntdev driver but not in the gntalloc one, is that right?> > I hadn''t noticed that we have both map_gratn_ref and map_grant_refs, > > that seems like an area which could be cleaned up. (not saying you > > should, just noticing it) > > Since I''m already rewriting the osdep layer functions, I think I can replace > all 3-4 existing map functions with a single function.Awesome, thanks!> It looks like the current 2.6.32.x xen kernels also aren''t returning EAGAIN, > so I''m unsure as to why this support was added. 
The commit in question is > 20689:fe42b16855aa by Grzegorz Milos (committed by Keir Fraser), but I don't > see any discussion on xen-devel for it. IIRC it was related to the page sharing patches which can cause grant hypercalls to return EAGAIN if the granted page is swapped or shared. I think this can only happen to backend/dom0 type operations. I think the reason it needs to return to the guest is that the paging daemon may be in the same domain and having the only vcpu block in the hypercall would deadlock. > It's also unclear why repeating the > request every millisecond in an infinite loop is better than letting the > caller handle an -EAGAIN. Yeah, the millisecond thing is pretty gross, something like sched_yield() might be a bit more palatable (if it's portable enough, although sleep could be a fallback if not) I suppose that hiding the EAGAIN handling in the library was just thought to be convenient, compared with changing all the existing users. Ian. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Sep-22 18:09 UTC
[Xen-devel] Re: [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify
On 09/22/2011 04:32 AM, Ian Campbell wrote:> On Wed, 2011-09-21 at 18:07 +0100, Daniel De Graaf wrote: >> On 09/21/2011 11:25 AM, Ian Campbell wrote: > >>> When I map a page how do I know what the offset of it is wrt the file >>> descriptor? DO I just have to remember how many pages I mapped an *>>> 4096? >> >> You had the offset at the time you mapped it, because that''s what you >> passed in the offset parameter to mmap(). Just don''t lose the number :) > > So I guess my followup question is where does the number I pass to mmap > come from... > > /me scrobbles in the code. > > Aha, so it is an output from the gntdev/gntalloc ioctls. So how about: > > /* IN parameters */ > /* > * Offset in the file descriptor for a byte within the page. This offset > * is the result of the IOCTL_GNTDEV_MAP_GRANT_REF and is the same as > * is used with mmap(). > */ >Sounds good.>>>>>> +}; >>>>>> + >>>>>> +/* Clear (set to zero) the byte specified by index */ >>>>>> +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 >>>>>> +/* Send an interrupt on the indicated event channel */ >>>>>> +#define UNMAP_NOTIFY_SEND_EVENT 0x2 >>>>>> + >>>>>> #endif /* __LINUX_PUBLIC_GNTDEV_H__ */ >>>>>> diff --git a/tools/libxc/xc_gnttab.c b/tools/libxc/xc_gnttab.c >>>>>> index 4f55fce..3d3c63b 100644 >>>>>> --- a/tools/libxc/xc_gnttab.c >>>>>> +++ b/tools/libxc/xc_gnttab.c >>>>>> @@ -18,6 +18,7 @@ >>>>>> */ >>>>>> >>>>>> #include "xc_private.h" >>>>>> +#include <errno.h> >>>>>> >>>>>> int xc_gnttab_op(xc_interface *xch, int cmd, void * op, int op_size, int count) >>>>>> { >>>>>> @@ -174,6 +175,28 @@ void *xc_gnttab_map_domain_grant_refs(xc_gnttab *xcg, >>>>>> count, domid, refs, prot); >>>>>> } >>>>>> >>>>>> +void *xc_gnttab_map_grant_ref_notify(xc_gnttab *xcg, >>>>>> + uint32_t domid, >>>>>> + uint32_t ref, >>>>>> + uint32_t notify_offset, >>>>>> + evtchn_port_t notify_port, >>>>>> + int *notify_result) >>>>>> +{ >>>>>> + if (xcg->ops->u.gnttab.map_grant_ref_notify) >>>>>> + return 
xcg->ops->u.gnttab.map_grant_ref_notify(xcg, xcg->ops_handle, >>>>>> + domid, ref, notify_offset, notify_port, notify_result); >>>>>> + else { >>>>>> + void* area = xcg->ops->u.gnttab.map_grant_ref(xcg, xcg->ops_handle, >>>>>> + domid, ref, PROT_READ|PROT_WRITE); >>>>>> + if (area && notify_result) { >>>>>> + *notify_result = -1; >>>>>> + errno = ENOSYS; >>>>>> + } >>>>>> + return area; >>>>>> + } >>>>> >>>>> I think the new public interface is fine but do we really need a new >>>>> internal interface here? >>>>> >>>>> I think you can just add the notify_* arguments to the existing OSDEP >>>>> function and have those OS backends which don''t implement that feature >>>>> return ENOSYS if notify_offset != 0 (or ~0 or whatever invalid value >>>>> works). >>>>> >>>>> Why doesn''t the *_notify variant take a prot argument? >>>> >>>> At least for the byte-clear portion of the notify, the page must be writable >>>> or the notify will not work. I suppose an event channel alone could be used >>>> for a read-only mapping, but normally the unmapping of a read-only mapping is >>>> not an important event - although I guess you could contrive a use for this >>>> type of notificaiton, so there''s no reason not to allow it. >>> >>> If you combine this the two functions then returning EINVAL for attempts >>> to map without PROT_WRITE (or whatever perm is necessary) would be >>> reasonable IMHO. >>> >> >> The ioctl already prevents you from requesting the impossible, so this should >> just work. > > Even better. I see the check in the gntdev driver but not in the > gntalloc one, is that right? >Yes. The unmap notification will always work in gntalloc because the shared page is owned locally, and is always writable there; read-only refers to the mapping domain''s ability to write.> >>> I hadn''t noticed that we have both map_gratn_ref and map_grant_refs, >>> that seems like an area which could be cleaned up. 
(not saying you >>> should, just noticing it) >> >> Since I''m already rewriting the osdep layer functions, I think I can replace >> all 3-4 existing map functions with a single function. > > Awesome, thanks! > >> It looks like the current 2.6.32.x xen kernels also aren''t returning EAGAIN, >> so I''m unsure as to why this support was added. The commit in question is >> 20689:fe42b16855aa by Grzegorz Milos (committed by Keir Fraser), but I don''t >> see any discussion on xen-devel for it. > > IIRC it was related to the page sharing patches which can cause grant > hypercalls to return EAGAIN if the granted page is swapped or shared. I > think this can only happen to backend/dom0 type operations. > > I think the reason it needs to return to the guest is that the paging > daemon may be in the same domain and having the only vcpu block in the > hypercall would deadlock. > >> It''s also unclear why repeating the >> request every millisecond in an infinite loop is better than letting the >> caller handle an -EAGAIN. > > Yeah, the millisecond thing is pretty gross, something like > sched_yield() might be a bit more palatable (if it''s portable enough, > although sleep could be a fallback if not) > > I suppose that hiding the EAGAIN handling in the library was just > thought to be convenient, compared with changing all the existing users. > > Ian. >It sounds to me like this is best solved in the kernel, although it would still have to invoke some kind of yield operation since I assume the kernel can''t tell when the hypercall will not block (ideally you would be able to put the invoking process to sleep pending the cross-domain page fault). For now, it sounds like the best solution is to keep the usleep-based loop in for all gntdev invocations. Using sched_yield will cause the CPU to spin while the page is pending from disk or possibly while waiting for dom0 to be scheduled (assuming a domU-domU vchan). 
-- Daniel De Graaf National Security Agency _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Sep-22 22:14 UTC
[Xen-devel] [PATCH v6 0/3] libxenvchan: interdomain communications library
Changes since v5: - Unify gntdev osdep interface - Eliminate notify_result and revert mapping if notify ioctl fails - Rename functions and structures to libxenvchan - Use application-specified xenstore path for ring/event data - Enforce maximum ring size of 2^20 bytes on client - Change to LGPL 2.1 [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify [PATCH 2/3] libxc: add xc_gntshr_* functions [PATCH 3/3] libvchan: interdomain communications library _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Sep-22 22:14 UTC
[Xen-devel] [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify
Normally, when a userspace process mapping a grant crashes, the domain providing the reference receives no indication that its peer has crashed, possibly leading to unexpected freezes or timeouts. This function provides a notification of the unmap by signalling an event channel and/or clearing a specific byte in the page. This also unifies the 3 very similar grant-mapping osdep interfaces into a single function instead of introducing yet another minor variation. Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> --- tools/include/xen-sys/Linux/gntdev.h | 33 ++++++++++- tools/libxc/xc_gnttab.c | 26 ++++++-- tools/libxc/xc_linux_osdep.c | 112 +++++++++++++++------------------ tools/libxc/xc_minios.c | 54 +++++------------ tools/libxc/xenctrl.h | 23 +++++++ tools/libxc/xenctrlosdep.h | 20 ++---- tools/libxl/libxlu_cfg_l.c | 102 ++++++++++++++----------------- tools/libxl/libxlu_cfg_l.h | 32 +++------ 8 files changed, 204 insertions(+), 198 deletions(-) diff --git a/tools/include/xen-sys/Linux/gntdev.h b/tools/include/xen-sys/Linux/gntdev.h index 8bd1467..caf6fb4 100644 --- a/tools/include/xen-sys/Linux/gntdev.h +++ b/tools/include/xen-sys/Linux/gntdev.h @@ -66,7 +66,7 @@ struct ioctl_gntdev_map_grant_ref { * before this ioctl is called, or an error will result. */ #define IOCTL_GNTDEV_UNMAP_GRANT_REF \ -_IOC(_IOC_NONE, ''G'', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) +_IOC(_IOC_NONE, ''G'', 1, sizeof(struct ioctl_gntdev_unmap_grant_ref)) struct ioctl_gntdev_unmap_grant_ref { /* IN parameters */ /* The offset was returned by the corresponding map operation. */ @@ -116,4 +116,35 @@ struct ioctl_gntdev_set_max_grants { uint32_t count; }; +/* + * Sets up an unmap notification within the page, so that the other side can do + * cleanup if this side crashes. Required to implement cross-domain robust + * mutexes or close notification on communication channels. 
+ * + * Each mapped page only supports one notification; multiple calls referring to + * the same page overwrite the previous notification. You must clear the + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it + * to occur. + */ +#define IOCTL_GNTDEV_SET_UNMAP_NOTIFY \ +_IOC(_IOC_NONE, ''G'', 7, sizeof(struct ioctl_gntdev_unmap_notify)) +struct ioctl_gntdev_unmap_notify { + /* IN parameters */ + /* Offset in the file descriptor for a byte within the page. This offset + * is the result of the IOCTL_GNTDEV_MAP_GRANT_REF and is the same as + * is used with mmap(). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte + * within the page to be cleared. + */ + uint64_t index; + /* Action(s) to take on unmap */ + uint32_t action; + /* Event channel to notify */ + uint32_t event_channel_port; +}; + +/* Clear (set to zero) the byte specified by index */ +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 +/* Send an interrupt on the indicated event channel */ +#define UNMAP_NOTIFY_SEND_EVENT 0x2 + #endif /* __LINUX_PUBLIC_GNTDEV_H__ */ diff --git a/tools/libxc/xc_gnttab.c b/tools/libxc/xc_gnttab.c index 4f55fce..033cc5c 100644 --- a/tools/libxc/xc_gnttab.c +++ b/tools/libxc/xc_gnttab.c @@ -18,6 +18,7 @@ */ #include "xc_private.h" +#include <errno.h> int xc_gnttab_op(xc_interface *xch, int cmd, void * op, int op_size, int count) { @@ -150,8 +151,8 @@ void *xc_gnttab_map_grant_ref(xc_gnttab *xcg, uint32_t ref, int prot) { - return xcg->ops->u.gnttab.map_grant_ref(xcg, xcg->ops_handle, - domid, ref, prot); + return xcg->ops->u.gnttab.grant_map(xcg, xcg->ops_handle, 1, 0, prot, + &domid, &ref, -1, -1); } void *xc_gnttab_map_grant_refs(xc_gnttab *xcg, @@ -160,8 +161,8 @@ void *xc_gnttab_map_grant_refs(xc_gnttab *xcg, uint32_t *refs, int prot) { - return xcg->ops->u.gnttab.map_grant_refs(xcg, xcg->ops_handle, - count, domids, refs, prot); + return xcg->ops->u.gnttab.grant_map(xcg, xcg->ops_handle, count, 0, + prot, domids, refs, -1, -1); } void 
*xc_gnttab_map_domain_grant_refs(xc_gnttab *xcg, @@ -170,10 +171,23 @@ void *xc_gnttab_map_domain_grant_refs(xc_gnttab *xcg, uint32_t *refs, int prot) { - return xcg->ops->u.gnttab.map_domain_grant_refs(xcg, xcg->ops_handle, - count, domid, refs, prot); + return xcg->ops->u.gnttab.grant_map(xcg, xcg->ops_handle, count, + XC_GRANT_MAP_SINGLE_DOMAIN, + prot, &domid, refs, -1, -1); } +void *xc_gnttab_map_grant_ref_notify(xc_gnttab *xcg, + uint32_t domid, + uint32_t ref, + int prot, + uint32_t notify_offset, + evtchn_port_t notify_port) +{ + return xcg->ops->u.gnttab.grant_map(xcg, xcg->ops_handle, 1, 0, prot, + &domid, &ref, notify_offset, notify_port); +} + + int xc_gnttab_munmap(xc_gnttab *xcg, void *start_address, uint32_t count) diff --git a/tools/libxc/xc_linux_osdep.c b/tools/libxc/xc_linux_osdep.c index dca6718..f760421 100644 --- a/tools/libxc/xc_linux_osdep.c +++ b/tools/libxc/xc_linux_osdep.c @@ -509,56 +509,21 @@ static int linux_gnttab_close(xc_gnttab *xcg, xc_osdep_handle h) return close(fd); } -static void *linux_gnttab_map_grant_ref(xc_gnttab *xch, xc_osdep_handle h, - uint32_t domid, uint32_t ref, int prot) -{ - int fd = (int)h; - struct ioctl_gntdev_map_grant_ref map; - void *addr; - - map.count = 1; - map.refs[0].domid = domid; - map.refs[0].ref = ref; - - if ( ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &map) ) { - PERROR("xc_gnttab_map_grant_ref: ioctl MAP_GRANT_REF failed"); - return NULL; - } - -mmap_again: - addr = mmap(NULL, XC_PAGE_SIZE, prot, MAP_SHARED, fd, map.index); - if ( addr == MAP_FAILED ) - { - int saved_errno = errno; - struct ioctl_gntdev_unmap_grant_ref unmap_grant; - - if(saved_errno == EAGAIN) - { - usleep(1000); - goto mmap_again; - } - /* Unmap the driver slots used to store the grant information. 
*/ - PERROR("xc_gnttab_map_grant_ref: mmap failed"); - unmap_grant.index = map.index; - unmap_grant.count = 1; - ioctl(fd, IOCTL_GNTDEV_UNMAP_GRANT_REF, &unmap_grant); - errno = saved_errno; - return NULL; - } - - return addr; -} - -static void *do_gnttab_map_grant_refs(xc_gnttab *xch, xc_osdep_handle h, - uint32_t count, - uint32_t *domids, int domids_stride, - uint32_t *refs, int prot) +static void *linux_gnttab_grant_map(xc_gnttab *xch, xc_osdep_handle h, + uint32_t count, int flags, int prot, + uint32_t *domids, uint32_t *refs, + uint32_t notify_offset, + evtchn_port_t notify_port) { int fd = (int)h; struct ioctl_gntdev_map_grant_ref *map; void *addr = NULL; + int domids_stride = 1; int i; + if (flags & XC_GRANT_MAP_SINGLE_DOMAIN) + domids_stride = 0; + map = malloc(sizeof(*map) + (count - 1) * sizeof(struct ioctl_gntdev_map_grant_ref)); if ( map == NULL ) @@ -573,13 +538,52 @@ static void *do_gnttab_map_grant_refs(xc_gnttab *xch, xc_osdep_handle h, map->count = count; if ( ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, map) ) { - PERROR("xc_gnttab_map_grant_refs: ioctl MAP_GRANT_REF failed"); + PERROR("linux_gnttab_grant_map: ioctl MAP_GRANT_REF failed"); goto out; } + retry: addr = mmap(NULL, XC_PAGE_SIZE * count, prot, MAP_SHARED, fd, map->index); - if ( addr == MAP_FAILED ) + + if (addr == MAP_FAILED && errno == EAGAIN) + { + /* + * The grant hypercall can return EAGAIN if the granted page is + * swapped out. Since the paging daemon may be in the same domain, the + * hypercall cannot block without causing a deadlock. + * + * Because there are no notificaitons when the page is swapped in, wait + * a bit before retrying, and hope that the page will arrive eventually. 
+ */ + usleep(1000); + goto retry; + } + + if (addr != MAP_FAILED) + { + int rv = 0; + struct ioctl_gntdev_unmap_notify notify; + notify.index = map->index; + notify.action = 0; + if (notify_offset >= 0 && notify_offset < XC_PAGE_SIZE * count) { + notify.index += notify_offset; + notify.action |= UNMAP_NOTIFY_CLEAR_BYTE; + } + if (notify_port != -1) { + notify.event_channel_port = notify_port; + notify.action |= UNMAP_NOTIFY_SEND_EVENT; + } + if (notify.action) + rv = ioctl(fd, IOCTL_GNTDEV_SET_UNMAP_NOTIFY, ¬ify); + if (rv) { + PERROR("linux_gnttab_grant_map: ioctl SET_UNMAP_NOTIFY failed"); + munmap(addr, count * XC_PAGE_SIZE); + addr = MAP_FAILED; + } + } + + if (addr == MAP_FAILED) { int saved_errno = errno; struct ioctl_gntdev_unmap_grant_ref unmap_grant; @@ -599,19 +603,7 @@ static void *do_gnttab_map_grant_refs(xc_gnttab *xch, xc_osdep_handle h, return addr; } -static void *linux_gnttab_map_grant_refs(xc_gnttab *xcg, xc_osdep_handle h, - uint32_t count, uint32_t *domids, - uint32_t *refs, int prot) -{ - return do_gnttab_map_grant_refs(xcg, h, count, domids, 1, refs, prot); -} -static void *linux_gnttab_map_domain_grant_refs(xc_gnttab *xcg, xc_osdep_handle h, - uint32_t count, - uint32_t domid, uint32_t *refs, int prot) -{ - return do_gnttab_map_grant_refs(xcg, h, count, &domid, 0, refs, prot); -} static int linux_gnttab_munmap(xc_gnttab *xcg, xc_osdep_handle h, void *start_address, uint32_t count) @@ -659,9 +651,7 @@ static struct xc_osdep_ops linux_gnttab_ops = { .close = &linux_gnttab_close, .u.gnttab = { - .map_grant_ref = &linux_gnttab_map_grant_ref, - .map_grant_refs = &linux_gnttab_map_grant_refs, - .map_domain_grant_refs = &linux_gnttab_map_domain_grant_refs, + .grant_map = &linux_gnttab_grant_map, .munmap = &linux_gnttab_munmap, }, }; diff --git a/tools/libxc/xc_minios.c b/tools/libxc/xc_minios.c index 3b366eb..ff9c0d8 100644 --- a/tools/libxc/xc_minios.c +++ b/tools/libxc/xc_minios.c @@ -458,45 +458,23 @@ void minios_gnttab_close_fd(int fd) 
files[fd].type = FTYPE_NONE; } -static void *minios_gnttab_map_grant_ref(xc_gnttab *xcg, xc_osdep_handle h, - uint32_t domid, - uint32_t ref, - int prot) -{ - int fd = (int)h; - return gntmap_map_grant_refs(&files[fd].gntmap, - 1, - &domid, 0, - &ref, - prot & PROT_WRITE); -} - -static void *minios_gnttab_map_grant_refs(xc_gnttab *xcg, xc_osdep_handle h, - uint32_t count, - uint32_t *domids, - uint32_t *refs, - int prot) -{ - int fd = (int)h; - return gntmap_map_grant_refs(&files[fd].gntmap, - count, - domids, 1, - refs, - prot & PROT_WRITE); -} - -static void *minios_gnttab_map_domain_grant_refs(xc_gnttab *xcg, xc_osdep_handle h, - uint32_t count, - uint32_t domid, - uint32_t *refs, - int prot) +static void *minios_gnttab_grant_map(xc_gnttab *xcg, xc_osdep_handle h, + uint32_t count, int flags, int prot, + uint32_t *domids, uint32_t *refs, + uint32_t notify_offset, + evtchn_port_t notify_port) { int fd = (int)h; + int stride = 1; + if (flags & XC_GRANT_MAP_SINGLE_DOMAIN) + stride = 0; + if (notify_offset != -1 || notify_port != -1) { + errno = ENOSYS; + return NULL; + } return gntmap_map_grant_refs(&files[fd].gntmap, - count, - &domid, 0, - refs, - prot & PROT_WRITE); + count, domids, stride, + refs, prot & PROT_WRITE); } static int minios_gnttab_munmap(xc_gnttab *xcg, xc_osdep_handle h, @@ -534,9 +512,7 @@ static struct xc_osdep_ops minios_gnttab_ops = { .close = &minios_gnttab_close, .u.gnttab = { - .map_grant_ref = &minios_gnttab_map_grant_ref, - .map_grant_refs = &minios_gnttab_map_grant_refs, - .map_domain_grant_refs = &minios_gnttab_map_domain_grant_refs, + .grant_map = &minios_gnttab_grant_map, .munmap = &minios_gnttab_munmap, .set_max_grants = &minios_gnttab_set_max_grants, }, diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h index 1b82ee0..6f3165d 100644 --- a/tools/libxc/xenctrl.h +++ b/tools/libxc/xenctrl.h @@ -1348,6 +1348,29 @@ void *xc_gnttab_map_domain_grant_refs(xc_gnttab *xcg, uint32_t *refs, int prot); +/** + * Memory maps a grant 
reference from one domain to a local address range. + * Mappings should be unmapped with xc_gnttab_munmap. If notify_offset or + * notify_port are not -1, this version will attempt to set up an unmap + * notification at the given offset and event channel. When the page is + * unmapped, the byte at the given offset will be zeroed and a wakeup will be + * sent to the given event channel. Logs errors. + * + * @parm xcg a handle on an open grant table interface + * @parm domid the domain to map memory from + * @parm ref the grant reference ID to map + * @parm prot same flag as in mmap() + * @parm notify_offset The byte offset in the page to use for unmap + * notification; -1 for none. + * @parm notify_port The event channel port to use for unmap notify, or -1 + */ +void *xc_gnttab_map_grant_ref_notify(xc_gnttab *xcg, + uint32_t domid, + uint32_t ref, + int prot, + uint32_t notify_offset, + evtchn_port_t notify_port); + /* * Unmaps the @count pages starting at @start_address, which were mapped by a * call to xc_gnttab_map_grant_ref or xc_gnttab_map_grant_refs. Never logs. 
diff --git a/tools/libxc/xenctrlosdep.h b/tools/libxc/xenctrlosdep.h index bfe46e0..1c6317e 100644 --- a/tools/libxc/xenctrlosdep.h +++ b/tools/libxc/xenctrlosdep.h @@ -105,20 +105,12 @@ struct xc_osdep_ops int (*unmask)(xc_evtchn *xce, xc_osdep_handle h, evtchn_port_t port); } evtchn; struct { - void *(*map_grant_ref)(xc_gnttab *xcg, xc_osdep_handle h, - uint32_t domid, - uint32_t ref, - int prot); - void *(*map_grant_refs)(xc_gnttab *xcg, xc_osdep_handle h, - uint32_t count, - uint32_t *domids, - uint32_t *refs, - int prot); - void *(*map_domain_grant_refs)(xc_gnttab *xcg, xc_osdep_handle h, - uint32_t count, - uint32_t domid, - uint32_t *refs, - int prot); +#define XC_GRANT_MAP_SINGLE_DOMAIN 0x1 + void *(*grant_map)(xc_gnttab *xcg, xc_osdep_handle h, + uint32_t count, int flags, int prot, + uint32_t *domids, uint32_t *refs, + uint32_t notify_offset, + evtchn_port_t notify_port); int (*munmap)(xc_gnttab *xcg, xc_osdep_handle h, void *start_address, uint32_t count); diff --git a/tools/libxl/libxlu_cfg_l.c b/tools/libxl/libxlu_cfg_l.c index 8448bef..2ffb958 100644 --- a/tools/libxl/libxlu_cfg_l.c +++ b/tools/libxl/libxlu_cfg_l.c @@ -34,7 +34,7 @@ #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, - * if you want the limit (max/min) macros for int types. + * if you want the limit (max/min) macros for int types. */ #ifndef __STDC_LIMIT_MACROS #define __STDC_LIMIT_MACROS 1 @@ -51,9 +51,10 @@ typedef uint32_t flex_uint32_t; typedef signed char flex_int8_t; typedef short int flex_int16_t; typedef int flex_int32_t; -typedef unsigned char flex_uint8_t; +typedef unsigned char flex_uint8_t; typedef unsigned short int flex_uint16_t; typedef unsigned int flex_uint32_t; +#endif /* ! C99 */ /* Limits of integral types. */ #ifndef INT8_MIN @@ -84,8 +85,6 @@ typedef unsigned int flex_uint32_t; #define UINT32_MAX (4294967295U) #endif -#endif /* ! C99 */ - #endif /* ! 
FLEXINT_H */ #ifdef __cplusplus @@ -159,15 +158,7 @@ typedef void* yyscan_t; /* Size of default input buffer. */ #ifndef YY_BUF_SIZE -#ifdef __ia64__ -/* On IA-64, the buffer size is 16k, not 8k. - * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. - * Ditto for the __ia64__ case accordingly. - */ -#define YY_BUF_SIZE 32768 -#else #define YY_BUF_SIZE 16384 -#endif /* __ia64__ */ #endif /* The state buf must be large enough to hold one state per character in the main buffer. @@ -185,7 +176,7 @@ typedef struct yy_buffer_state *YY_BUFFER_STATE; /* Note: We specifically omit the test for yy_rule_can_match_eol because it requires * access to the local variable yy_act. Since yyless() is a macro, it would break - * existing scanners that call yyless() from OUTSIDE xlu__cfg_yylex. + * existing scanners that call yyless() from OUTSIDE xlu__cfg_yylex. * One obvious solution it to make yy_act a global. I tried that, and saw * a 5% performance hit in a non-yylineno scanner, because yy_act is * normally declared as a register variable-- so it is not worth it. @@ -197,7 +188,7 @@ typedef struct yy_buffer_state *YY_BUFFER_STATE; if ( yytext[yyl] == ''\n'' )\ --yylineno;\ }while(0) - + /* Return all but the first "n" matched characters back to the input stream. */ #define yyless(n) \ do \ @@ -259,7 +250,7 @@ struct yy_buffer_state int yy_bs_lineno; /**< The line count. */ int yy_bs_column; /**< The column count. */ - + /* Whether to try to fill the input buffer when we reach the * end of it. 
*/ @@ -514,7 +505,7 @@ int xlu__cfg_yyget_column(yyscan_t yyscanner); void xlu__cfg_yyset_column(int column_no, yyscan_t yyscanner); -#line 518 "libxlu_cfg_l.c" +#line 509 "libxlu_cfg_l.c" #define INITIAL 0 #define lexerr 1 @@ -574,9 +565,9 @@ static int yy_init_globals (yyscan_t yyscanner ); /* This must go here because YYSTYPE and YYLTYPE are included * from bison output in section 1.*/ # define yylval yyg->yylval_r - + # define yylloc yyg->yylloc_r - + int xlu__cfg_yylex_init (yyscan_t* scanner); int xlu__cfg_yylex_init_extra (YY_EXTRA_TYPE user_defined,yyscan_t* scanner); @@ -610,14 +601,18 @@ int xlu__cfg_yyget_lineno (yyscan_t yyscanner ); void xlu__cfg_yyset_lineno (int line_number ,yyscan_t yyscanner ); +int xlu__cfg_yyget_column (yyscan_t yyscanner ); + +void xlu__cfg_yyset_column (int column_no ,yyscan_t yyscanner ); + YYSTYPE * xlu__cfg_yyget_lval (yyscan_t yyscanner ); void xlu__cfg_yyset_lval (YYSTYPE * yylval_param ,yyscan_t yyscanner ); YYLTYPE *xlu__cfg_yyget_lloc (yyscan_t yyscanner ); - + void xlu__cfg_yyset_lloc (YYLTYPE * yylloc_param ,yyscan_t yyscanner ); - + /* Macros after this point can all be overridden by user definitions in * section 1. */ @@ -650,12 +645,7 @@ static int input (yyscan_t yyscanner ); /* Amount of stuff to slurp up with each read. */ #ifndef YY_READ_BUF_SIZE -#ifdef __ia64__ -/* On IA-64, the buffer size is 16k, not 8k */ -#define YY_READ_BUF_SIZE 16384 -#else #define YY_READ_BUF_SIZE 8192 -#endif /* __ia64__ */ #endif /* Copy whatever the last rule matched to the standard output. 
*/ @@ -674,7 +664,7 @@ static int input (yyscan_t yyscanner ); if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \ { \ int c = ''*''; \ - size_t n; \ + unsigned n; \ for ( n = 0; n < max_size && \ (c = getc( yyin )) != EOF && c != ''\n''; ++n ) \ buf[n] = (char) c; \ @@ -762,7 +752,7 @@ YY_DECL #line 53 "libxlu_cfg_l.l" -#line 766 "libxlu_cfg_l.c" +#line 756 "libxlu_cfg_l.c" yylval = yylval_param; @@ -845,7 +835,7 @@ yy_find_action: int yyl; for ( yyl = yyg->yy_more_len; yyl < yyleng; ++yyl ) if ( yytext[yyl] == ''\n'' ) - + do{ yylineno++; yycolumn=0; }while(0) @@ -971,7 +961,7 @@ YY_RULE_SETUP #line 104 "libxlu_cfg_l.l" YY_FATAL_ERROR( "flex scanner jammed" ); YY_BREAK -#line 975 "libxlu_cfg_l.c" +#line 965 "libxlu_cfg_l.c" case YY_STATE_EOF(INITIAL): case YY_STATE_EOF(lexerr): yyterminate(); @@ -1377,7 +1367,7 @@ static int yy_get_next_buffer (yyscan_t yyscanner) yyg->yy_hold_char = *++yyg->yy_c_buf_p; if ( c == ''\n'' ) - + do{ yylineno++; yycolumn=0; }while(0) @@ -1460,7 +1450,7 @@ static void xlu__cfg_yy_load_buffer_state (yyscan_t yyscanner) YY_BUFFER_STATE xlu__cfg_yy_create_buffer (FILE * file, int size , yyscan_t yyscanner) { YY_BUFFER_STATE b; - + b = (YY_BUFFER_STATE) xlu__cfg_yyalloc(sizeof( struct yy_buffer_state ) ,yyscanner ); if ( ! b ) YY_FATAL_ERROR( "out of dynamic memory in xlu__cfg_yy_create_buffer()" ); @@ -1504,7 +1494,7 @@ static void xlu__cfg_yy_load_buffer_state (yyscan_t yyscanner) #ifndef __cplusplus extern int isatty (int ); #endif /* __cplusplus */ - + /* Initializes or reinitializes a buffer. * This function is sometimes called more than once on the same buffer, * such as during a xlu__cfg_yyrestart() or at EOF. @@ -1530,7 +1520,7 @@ extern int isatty (int ); } b->yy_is_interactive = file ? (isatty( fileno(file) ) > 0) : 0; - + errno = oerrno; } @@ -1636,9 +1626,9 @@ static void xlu__cfg_yyensure_buffer_stack (yyscan_t yyscanner) , yyscanner); if ( ! 
yyg->yy_buffer_stack ) YY_FATAL_ERROR( "out of dynamic memory in xlu__cfg_yyensure_buffer_stack()" ); - + memset(yyg->yy_buffer_stack, 0, num_to_alloc * sizeof(struct yy_buffer_state*)); - + yyg->yy_buffer_stack_max = num_to_alloc; yyg->yy_buffer_stack_top = 0; return; @@ -1667,12 +1657,12 @@ static void xlu__cfg_yyensure_buffer_stack (yyscan_t yyscanner) * @param base the character buffer * @param size the size in bytes of the character buffer * @param yyscanner The scanner object. - * @return the newly allocated buffer state object. + * @return the newly allocated buffer state object. */ YY_BUFFER_STATE xlu__cfg_yy_scan_buffer (char * base, yy_size_t size , yyscan_t yyscanner) { YY_BUFFER_STATE b; - + if ( size < 2 || base[size-2] != YY_END_OF_BUFFER_CHAR || base[size-1] != YY_END_OF_BUFFER_CHAR ) @@ -1708,14 +1698,14 @@ YY_BUFFER_STATE xlu__cfg_yy_scan_buffer (char * base, yy_size_t size , yyscan_ */ YY_BUFFER_STATE xlu__cfg_yy_scan_string (yyconst char * yystr , yyscan_t yyscanner) { - + return xlu__cfg_yy_scan_bytes(yystr,strlen(yystr) ,yyscanner); } /** Setup the input buffer state to scan the given bytes. The next call to xlu__cfg_yylex() will * scan from a @e copy of @a bytes. - * @param yybytes the byte buffer to scan - * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes. + * @param bytes the byte buffer to scan + * @param len the number of bytes in the buffer pointed to by @a bytes. * @param yyscanner The scanner object. * @return the newly allocated buffer state object. */ @@ -1725,7 +1715,7 @@ YY_BUFFER_STATE xlu__cfg_yy_scan_bytes (yyconst char * yybytes, int _yybytes_l char *buf; yy_size_t n; int i; - + /* Get memory for full buffer, including space for trailing EOB''s. 
*/ n = _yybytes_len + 2; buf = (char *) xlu__cfg_yyalloc(n ,yyscanner ); @@ -1793,10 +1783,10 @@ YY_EXTRA_TYPE xlu__cfg_yyget_extra (yyscan_t yyscanner) int xlu__cfg_yyget_lineno (yyscan_t yyscanner) { struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; - + if (! YY_CURRENT_BUFFER) return 0; - + return yylineno; } @@ -1806,10 +1796,10 @@ int xlu__cfg_yyget_lineno (yyscan_t yyscanner) int xlu__cfg_yyget_column (yyscan_t yyscanner) { struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; - + if (! YY_CURRENT_BUFFER) return 0; - + return yycolumn; } @@ -1870,8 +1860,8 @@ void xlu__cfg_yyset_lineno (int line_number , yyscan_t yyscanner) /* lineno is only valid if an input buffer exists. */ if (! YY_CURRENT_BUFFER ) - yy_fatal_error( "xlu__cfg_yyset_lineno called with no buffer" , yyscanner); - + yy_fatal_error( "xlu__cfg_yyset_lineno called with no buffer" , yyscanner); + yylineno = line_number; } @@ -1885,8 +1875,8 @@ void xlu__cfg_yyset_column (int column_no , yyscan_t yyscanner) /* column is only valid if an input buffer exists. */ if (! 
YY_CURRENT_BUFFER ) - yy_fatal_error( "xlu__cfg_yyset_column called with no buffer" , yyscanner); - + yy_fatal_error( "xlu__cfg_yyset_column called with no buffer" , yyscanner); + yycolumn = column_no; } @@ -1939,13 +1929,13 @@ YYLTYPE *xlu__cfg_yyget_lloc (yyscan_t yyscanner) struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; return yylloc; } - + void xlu__cfg_yyset_lloc (YYLTYPE * yylloc_param , yyscan_t yyscanner) { struct yyguts_t * yyg = (struct yyguts_t*)yyscanner; yylloc = yylloc_param; } - + /* User-visible API */ /* xlu__cfg_yylex_init is special because it creates the scanner itself, so it is @@ -1993,20 +1983,20 @@ int xlu__cfg_yylex_init_extra(YY_EXTRA_TYPE yy_user_defined,yyscan_t* ptr_yy_glo errno = EINVAL; return 1; } - + *ptr_yy_globals = (yyscan_t) xlu__cfg_yyalloc ( sizeof( struct yyguts_t ), &dummy_yyguts ); - + if (*ptr_yy_globals == NULL){ errno = ENOMEM; return 1; } - + /* By setting to 0xAA, we expose bugs in yy_init_globals. Leave at 0x00 for releases. */ memset(*ptr_yy_globals,0x00,sizeof(struct yyguts_t)); - + xlu__cfg_yyset_extra (yy_user_defined, *ptr_yy_globals); - + return yy_init_globals ( *ptr_yy_globals ); } diff --git a/tools/libxl/libxlu_cfg_l.h b/tools/libxl/libxlu_cfg_l.h index 327c3a4..2066764 100644 --- a/tools/libxl/libxlu_cfg_l.h +++ b/tools/libxl/libxlu_cfg_l.h @@ -38,7 +38,7 @@ #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, - * if you want the limit (max/min) macros for int types. + * if you want the limit (max/min) macros for int types. */ #ifndef __STDC_LIMIT_MACROS #define __STDC_LIMIT_MACROS 1 @@ -55,9 +55,10 @@ typedef uint32_t flex_uint32_t; typedef signed char flex_int8_t; typedef short int flex_int16_t; typedef int flex_int32_t; -typedef unsigned char flex_uint8_t; +typedef unsigned char flex_uint8_t; typedef unsigned short int flex_uint16_t; typedef unsigned int flex_uint32_t; +#endif /* ! C99 */ /* Limits of integral types. 
*/ #ifndef INT8_MIN @@ -88,8 +89,6 @@ typedef unsigned int flex_uint32_t; #define UINT32_MAX (4294967295U) #endif -#endif /* ! C99 */ - #endif /* ! FLEXINT_H */ #ifdef __cplusplus @@ -132,15 +131,7 @@ typedef void* yyscan_t; /* Size of default input buffer. */ #ifndef YY_BUF_SIZE -#ifdef __ia64__ -/* On IA-64, the buffer size is 16k, not 8k. - * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. - * Ditto for the __ia64__ case accordingly. - */ -#define YY_BUF_SIZE 32768 -#else #define YY_BUF_SIZE 16384 -#endif /* __ia64__ */ #endif #ifndef YY_TYPEDEF_YY_BUFFER_STATE @@ -193,7 +184,7 @@ struct yy_buffer_state int yy_bs_lineno; /**< The line count. */ int yy_bs_column; /**< The column count. */ - + /* Whether to try to fill the input buffer when we reach the * end of it. */ @@ -276,14 +267,18 @@ int xlu__cfg_yyget_lineno (yyscan_t yyscanner ); void xlu__cfg_yyset_lineno (int line_number ,yyscan_t yyscanner ); +int xlu__cfg_yyget_column (yyscan_t yyscanner ); + +void xlu__cfg_yyset_column (int column_no ,yyscan_t yyscanner ); + YYSTYPE * xlu__cfg_yyget_lval (yyscan_t yyscanner ); void xlu__cfg_yyset_lval (YYSTYPE * yylval_param ,yyscan_t yyscanner ); YYLTYPE *xlu__cfg_yyget_lloc (yyscan_t yyscanner ); - + void xlu__cfg_yyset_lloc (YYLTYPE * yylloc_param ,yyscan_t yyscanner ); - + /* Macros after this point can all be overridden by user definitions in * section 1. */ @@ -310,12 +305,7 @@ static int yy_flex_strlen (yyconst char * ,yyscan_t yyscanner); /* Amount of stuff to slurp up with each read. */ #ifndef YY_READ_BUF_SIZE -#ifdef __ia64__ -/* On IA-64, the buffer size is 16k, not 8k */ -#define YY_READ_BUF_SIZE 16384 -#else #define YY_READ_BUF_SIZE 8192 -#endif /* __ia64__ */ #endif /* Number of entries by which start-condition stack grows. 
*/ @@ -352,6 +342,6 @@ extern int xlu__cfg_yylex \ #line 104 "libxlu_cfg_l.l" -#line 356 "libxlu_cfg_l.h" +#line 346 "libxlu_cfg_l.h" #undef xlu__cfg_yyIN_HEADER #endif /* xlu__cfg_yyHEADER_H */ -- 1.7.6.2 _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Sep-22 22:14 UTC
[Xen-devel] [PATCH 2/3] libxc: add xc_gntshr_* functions
These functions and the xc_gntshr device (/dev/xen/gntalloc on linux) allow applications to create pages shared with other domains. Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> --- tools/include/xen-sys/Linux/gntalloc.h | 82 +++++++++++++++++++++++++ tools/libxc/xc_gnttab.c | 27 ++++++++ tools/libxc/xc_linux_osdep.c | 102 ++++++++++++++++++++++++++++++++ tools/libxc/xc_private.c | 13 ++++ tools/libxc/xenctrl.h | 48 +++++++++++++++ tools/libxc/xenctrlosdep.h | 10 +++ 6 files changed, 282 insertions(+), 0 deletions(-) create mode 100644 tools/include/xen-sys/Linux/gntalloc.h diff --git a/tools/include/xen-sys/Linux/gntalloc.h b/tools/include/xen-sys/Linux/gntalloc.h new file mode 100644 index 0000000..76bd580 --- /dev/null +++ b/tools/include/xen-sys/Linux/gntalloc.h @@ -0,0 +1,82 @@ +/****************************************************************************** + * gntalloc.h + * + * Interface to /dev/xen/gntalloc. + * + * Author: Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * This file is in the public domain. + */ + +#ifndef __LINUX_PUBLIC_GNTALLOC_H__ +#define __LINUX_PUBLIC_GNTALLOC_H__ + +/* + * Allocates a new page and creates a new grant reference. + */ +#define IOCTL_GNTALLOC_ALLOC_GREF \ +_IOC(_IOC_NONE, ''G'', 5, sizeof(struct ioctl_gntalloc_alloc_gref)) +struct ioctl_gntalloc_alloc_gref { + /* IN parameters */ + /* The ID of the domain to be given access to the grants. */ + uint16_t domid; + /* Flags for this mapping */ + uint16_t flags; + /* Number of pages to map */ + uint32_t count; + /* OUT parameters */ + /* The offset to be used on a subsequent call to mmap(). */ + uint64_t index; + /* The grant references of the newly created grant, one per page */ + /* Variable size, depending on count */ + uint32_t gref_ids[1]; +}; + +#define GNTALLOC_FLAG_WRITABLE 1 + +/* + * Deallocates the grant reference, allowing the associated page to be freed if + * no other domains are using it. 
+ */ +#define IOCTL_GNTALLOC_DEALLOC_GREF \ +_IOC(_IOC_NONE, ''G'', 6, sizeof(struct ioctl_gntalloc_dealloc_gref)) +struct ioctl_gntalloc_dealloc_gref { + /* IN parameters */ + /* The offset returned in the map operation */ + uint64_t index; + /* Number of references to unmap */ + uint32_t count; +}; + +/* + * Sets up an unmap notification within the page, so that the other side can do + * cleanup if this side crashes. Required to implement cross-domain robust + * mutexes or close notification on communication channels. + * + * Each mapped page only supports one notification; multiple calls referring to + * the same page overwrite the previous notification. You must clear the + * notification prior to the IOCTL_GNTALLOC_DEALLOC_GREF if you do not want it + * to occur. + */ +#define IOCTL_GNTALLOC_SET_UNMAP_NOTIFY \ +_IOC(_IOC_NONE, ''G'', 7, sizeof(struct ioctl_gntalloc_unmap_notify)) +struct ioctl_gntalloc_unmap_notify { + /* IN parameters */ + /* Offset in the file descriptor for a byte within the page (same as + * used in mmap). If using UNMAP_NOTIFY_CLEAR_BYTE, this is the byte to + * be cleared. Otherwise, it can be any byte in the page whose + * notification we are adjusting. 
+ */ + uint64_t index; + /* Action(s) to take on unmap */ + uint32_t action; + /* Event channel to notify */ + uint32_t event_channel_port; +}; + +/* Clear (set to zero) the byte specified by index */ +#define UNMAP_NOTIFY_CLEAR_BYTE 0x1 +/* Send an interrupt on the indicated event channel */ +#define UNMAP_NOTIFY_SEND_EVENT 0x2 + +#endif /* __LINUX_PUBLIC_GNTALLOC_H__ */ diff --git a/tools/libxc/xc_gnttab.c b/tools/libxc/xc_gnttab.c index 033cc5c..cb5995f 100644 --- a/tools/libxc/xc_gnttab.c +++ b/tools/libxc/xc_gnttab.c @@ -203,6 +203,33 @@ int xc_gnttab_set_max_grants(xc_gnttab *xcg, uint32_t count) return xcg->ops->u.gnttab.set_max_grants(xcg, xcg->ops_handle, count); } +void *xc_gntshr_share_pages(xc_gntshr *xcg, uint32_t domid, + int count, uint32_t *refs, int writable) +{ + return xcg->ops->u.gntshr.share_pages(xcg, xcg->ops_handle, domid, + count, refs, writable, -1, -1); +} + +void *xc_gntshr_share_page_notify(xc_gntshr *xcg, uint32_t domid, + uint32_t *ref, int writable, + uint32_t notify_offset, + evtchn_port_t notify_port) +{ + return xcg->ops->u.gntshr.share_pages(xcg, xcg->ops_handle, + domid, 1, ref, writable, notify_offset, notify_port); +} + +/* + * Unmaps the @count pages starting at @start_address, which were mapped by a + * call to xc_gntshr_share_*. Never logs. 
+ */ +int xc_gntshr_munmap(xc_gntshr *xcg, void *start_address, uint32_t count) +{ + return xcg->ops->u.gntshr.munmap(xcg, xcg->ops_handle, + start_address, count); +} + + /* * Local variables: * mode: C diff --git a/tools/libxc/xc_linux_osdep.c b/tools/libxc/xc_linux_osdep.c index f760421..04059b8 100644 --- a/tools/libxc/xc_linux_osdep.c +++ b/tools/libxc/xc_linux_osdep.c @@ -34,6 +34,7 @@ #include <xen/memory.h> #include <xen/sys/evtchn.h> #include <xen/sys/gntdev.h> +#include <xen/sys/gntalloc.h> #include "xenctrl.h" #include "xenctrlosdep.h" @@ -656,6 +657,105 @@ static struct xc_osdep_ops linux_gnttab_ops = { }, }; +static xc_osdep_handle linux_gntshr_open(xc_gntshr *xcg) +{ + int fd = open(DEVXEN "gntalloc", O_RDWR); + + if ( fd == -1 ) + return XC_OSDEP_OPEN_ERROR; + + return (xc_osdep_handle)fd; +} + +static int linux_gntshr_close(xc_gntshr *xcg, xc_osdep_handle h) +{ + int fd = (int)h; + return close(fd); +} + +static void *linux_gntshr_share_pages(xc_gntshr *xch, xc_osdep_handle h, + uint32_t domid, int count, + uint32_t *refs, int writable, + uint32_t notify_offset, + evtchn_port_t notify_port) +{ + struct ioctl_gntalloc_alloc_gref *gref_info = NULL; + struct ioctl_gntalloc_unmap_notify notify; + struct ioctl_gntalloc_dealloc_gref gref_drop; + int fd = (int)h; + int err; + void *area = NULL; + gref_info = malloc(sizeof(*gref_info) + count * sizeof(uint32_t)); + if (!gref_info) + return NULL; + gref_info->domid = domid; + gref_info->flags = writable ? 
GNTALLOC_FLAG_WRITABLE : 0; + gref_info->count = count; + + err = ioctl(fd, IOCTL_GNTALLOC_ALLOC_GREF, gref_info); + if (err) { + PERROR("linux_gntshr_share_pages: ioctl failed"); + goto out; + } + + area = mmap(NULL, count * XC_PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, gref_info->index); + + if (area == MAP_FAILED) { + area = NULL; + PERROR("linux_gntshr_share_pages: mmap failed"); + goto out_remove_fdmap; + } + + notify.index = gref_info->index; + notify.action = 0; + if (notify_offset >= 0) { + notify.index += notify_offset; + notify.action |= UNMAP_NOTIFY_CLEAR_BYTE; + } + if (notify_port >= 0) { + notify.event_channel_port = notify_port; + notify.action |= UNMAP_NOTIFY_SEND_EVENT; + } + if (notify.action) + err = ioctl(fd, IOCTL_GNTALLOC_SET_UNMAP_NOTIFY, &notify); + if (err) { + PERROR("linux_gntshr_share_page_notify: ioctl SET_UNMAP_NOTIFY failed"); + munmap(area, count * XC_PAGE_SIZE); + area = NULL; + } + + memcpy(refs, gref_info->gref_ids, count * sizeof(uint32_t)); + + out_remove_fdmap: + /* Removing the mapping from the file descriptor does not cause the pages to + * be deallocated until the mapping is removed.
+ */ + gref_drop.index = gref_info->index; + gref_drop.count = count; + ioctl(fd, IOCTL_GNTALLOC_DEALLOC_GREF, &gref_drop); + out: + free(gref_info); + return area; +} + +static int linux_gntshr_munmap(xc_gntshr *xcg, xc_osdep_handle h, + void *start_address, uint32_t count) +{ + return munmap(start_address, count); +} + +static struct xc_osdep_ops linux_gntshr_ops = { + .open = &linux_gntshr_open, + .close = &linux_gntshr_close, + + .u.gntshr = { + .share_pages = &linux_gntshr_share_pages, + .munmap = &linux_gntshr_munmap, + }, +}; + + static struct xc_osdep_ops *linux_osdep_init(xc_interface *xch, enum xc_osdep_type type) { switch ( type ) @@ -666,6 +766,8 @@ static struct xc_osdep_ops *linux_osdep_init(xc_interface *xch, enum xc_osdep_ty return &linux_evtchn_ops; case XC_OSDEP_GNTTAB: return &linux_gnttab_ops; + case XC_OSDEP_GNTSHR: + return &linux_gntshr_ops; default: return NULL; } diff --git a/tools/libxc/xc_private.c b/tools/libxc/xc_private.c index 09c8f23..09a91e7 100644 --- a/tools/libxc/xc_private.c +++ b/tools/libxc/xc_private.c @@ -258,6 +258,19 @@ int xc_gnttab_close(xc_gnttab *xcg) return xc_interface_close_common(xcg); } +xc_gntshr *xc_gntshr_open(xentoollog_logger *logger, + unsigned open_flags) +{ + return xc_interface_open_common(logger, NULL, open_flags, + XC_OSDEP_GNTSHR); +} + +int xc_gntshr_close(xc_gntshr *xcg) +{ + return xc_interface_close_common(xcg); +} + + static pthread_key_t errbuf_pkey; static pthread_once_t errbuf_pkey_once = PTHREAD_ONCE_INIT; diff --git a/tools/libxc/xenctrl.h b/tools/libxc/xenctrl.h index 6f3165d..72a7f2d 100644 --- a/tools/libxc/xenctrl.h +++ b/tools/libxc/xenctrl.h @@ -115,6 +115,7 @@ typedef struct xc_interface_core xc_interface; typedef struct xc_interface_core xc_evtchn; typedef struct xc_interface_core xc_gnttab; +typedef struct xc_interface_core xc_gntshr; typedef enum xc_error_code xc_error_code; @@ -1402,6 +1403,53 @@ grant_entry_v1_t *xc_gnttab_map_table_v1(xc_interface *xch, int domid, int *gnt_ 
grant_entry_v2_t *xc_gnttab_map_table_v2(xc_interface *xch, int domid, int *gnt_num); /* Sometimes these don''t set errno [fixme], and sometimes they don''t log. */ +/* + * Return an fd onto the grant sharing driver. Logs errors. + */ +xc_gntshr *xc_gntshr_open(xentoollog_logger *logger, + unsigned open_flags); + +/* + * Close a handle previously allocated with xc_gntshr_open(). + * Never logs errors. + */ +int xc_gntshr_close(xc_gntshr *xcg); + +/* + * Creates and shares pages with another domain. + * + * @parm xcg a handle to an open grant sharing instance + * @parm domid the domain to share memory with + * @parm count the number of pages to share + * @parm refs the grant references of the pages (output) + * @parm writable true if the other domain can write to the pages + * @return local mapping of the pages + */ +void *xc_gntshr_share_pages(xc_gntshr *xcg, uint32_t domid, + int count, uint32_t *refs, int writable); + +/* + * Creates and shares a page with another domain, with unmap notification. + * + * @parm xcg a handle to an open grant sharing instance + * @parm domid the domain to share memory with + * @parm refs the grant reference of the pages (output) + * @parm writable true if the other domain can write to the page + * @parm notify_offset The byte offset in the page to use for unmap + * notification; -1 for none. + * @parm notify_port The event channel port to use for unmap notify, or -1 + * @return local mapping of the page + */ +void *xc_gntshr_share_page_notify(xc_gntshr *xcg, uint32_t domid, + uint32_t *ref, int writable, + uint32_t notify_offset, + evtchn_port_t notify_port); +/* + * Unmaps the @count pages starting at @start_address, which were mapped by a + * call to xc_gntshr_share_*. Never logs. 
+ */ +int xc_gntshr_munmap(xc_gntshr *xcg, void *start_address, uint32_t count); + int xc_physdev_map_pirq(xc_interface *xch, int domid, int index, diff --git a/tools/libxc/xenctrlosdep.h b/tools/libxc/xenctrlosdep.h index 1c6317e..a36c4aa 100644 --- a/tools/libxc/xenctrlosdep.h +++ b/tools/libxc/xenctrlosdep.h @@ -54,6 +54,7 @@ enum xc_osdep_type { XC_OSDEP_PRIVCMD, XC_OSDEP_EVTCHN, XC_OSDEP_GNTTAB, + XC_OSDEP_GNTSHR, }; /* Opaque handle internal to the backend */ @@ -116,6 +117,15 @@ struct xc_osdep_ops uint32_t count); int (*set_max_grants)(xc_gnttab *xcg, xc_osdep_handle h, uint32_t count); } gnttab; + struct { + void *(*share_pages)(xc_gntshr *xcg, xc_osdep_handle h, + uint32_t domid, int count, + uint32_t *refs, int writable, + uint32_t notify_offset, + evtchn_port_t notify_port); + int (*munmap)(xc_gntshr *xcg, xc_osdep_handle h, + void *start_address, uint32_t count); + } gntshr; } u; }; typedef struct xc_osdep_ops xc_osdep_ops; -- 1.7.6.2 _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Sep-22 22:14 UTC
[Xen-devel] [PATCH 3/3] libvchan: interdomain communications library
This library implements a bidirectional communication interface between applications in different domains, similar to unix sockets. Data can be sent using the byte-oriented libvchan_read/libvchan_write or the packet-oriented libvchan_recv/libvchan_send. Channel setup is done using a client-server model; domain IDs and a port number must be negotiated prior to initialization. The server allocates memory for the shared pages and determines the sizes of the communication rings (which may span multiple pages, although the default places rings and control within a single page). With properly sized rings, testing has shown that this interface provides speed comparable to pipes within a single Linux domain; it is significantly faster than network-based communication. Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> --- tools/Makefile | 1 + tools/Rules.mk | 5 + tools/libvchan/Makefile | 59 +++++ tools/libvchan/init.c | 409 +++++++++++++++++++++++++++++++++++ tools/libvchan/io.c | 375 ++++++++++++++++++++++++++++++++ tools/libvchan/libxenvchan.h | 169 +++++++++++++++ tools/libvchan/node-select.c | 162 ++++++++++++++ tools/libvchan/node.c | 169 +++++++++++++++ xen/include/public/io/libxenvchan.h | 97 +++++++++ 9 files changed, 1446 insertions(+), 0 deletions(-) create mode 100644 tools/libvchan/Makefile create mode 100644 tools/libvchan/init.c create mode 100644 tools/libvchan/io.c create mode 100644 tools/libvchan/libxenvchan.h create mode 100644 tools/libvchan/node-select.c create mode 100644 tools/libvchan/node.c create mode 100644 xen/include/public/io/libxenvchan.h diff --git a/tools/Makefile b/tools/Makefile index df6270c..9389e1f 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -27,6 +27,7 @@ SUBDIRS-$(CONFIG_NetBSD) += blktap2 SUBDIRS-$(CONFIG_NetBSD) += xenbackendd SUBDIRS-y += libfsimage SUBDIRS-$(LIBXENAPI_BINDINGS) += libxen +SUBDIRS-y += libvchan # do not recurse in to a dir we are about to delete ifneq "$(MAKECMDGOALS)" "distclean" diff --git 
a/tools/Rules.mk b/tools/Rules.mk index 0d048af..49125f5 100644 --- a/tools/Rules.mk +++ b/tools/Rules.mk @@ -14,6 +14,7 @@ XEN_XENLIGHT = $(XEN_ROOT)/tools/libxl XEN_XENSTORE = $(XEN_ROOT)/tools/xenstore XEN_LIBXENSTAT = $(XEN_ROOT)/tools/xenstat/libxenstat/src XEN_BLKTAP2 = $(XEN_ROOT)/tools/blktap2 +XEN_LIBVCHAN = $(XEN_ROOT)/tools/libvchan CFLAGS_xeninclude = -I$(XEN_INCLUDE) @@ -33,6 +34,10 @@ CFLAGS_libxenstat = -I$(XEN_LIBXENSTAT) LDLIBS_libxenstat = $(SHLIB_libxenctrl) $(SHLIB_libxenstore) -L$(XEN_LIBXENSTAT) -lxenstat SHLIB_libxenstat = -Wl,-rpath-link=$(XEN_LIBXENSTAT) +CFLAGS_libxenvchan = -I$(XEN_LIBVCHAN) +LDLIBS_libxenvchan = $(SHLIB_libxenctrl) $(SHLIB_libxenstore) -L$(XEN_LIBVCHAN) -lxenvchan +SHLIB_libxenvchan = -Wl,-rpath-link=$(XEN_LIBVCHAN) + ifeq ($(CONFIG_Linux),y) LIBXL_BLKTAP = y else diff --git a/tools/libvchan/Makefile b/tools/libvchan/Makefile new file mode 100644 index 0000000..daf3593 --- /dev/null +++ b/tools/libvchan/Makefile @@ -0,0 +1,59 @@ +# +# tools/libvchan/Makefile +# + +XEN_ROOT = $(CURDIR)/../.. +include $(XEN_ROOT)/tools/Rules.mk + +LIBVCHAN_OBJS = init.o io.o +NODE_OBJS = node.o +NODE2_OBJS = node-select.o + +LIBVCHAN_PIC_OBJS = $(patsubst %.o,%.opic,$(LIBVCHAN_OBJS)) +LIBVCHAN_LIBS = $(LDLIBS_libxenstore) $(LDLIBS_libxenctrl) +$(LIBVCHAN_OBJS) $(LIBVCHAN_PIC_OBJS): CFLAGS += $(CFLAGS_libxenstore) $(CFLAGS_libxenctrl) +$(NODE_OBJS) $(NODE2_OBJS): CFLAGS += $(CFLAGS_libxenctrl) + +MAJOR = 1.0 +MINOR = 0 + +CFLAGS += -I../include -I. 
+ +.PHONY: all +all: libxenvchan.so vchan-node1 vchan-node2 libxenvchan.a + +libxenvchan.so: libxenvchan.so.$(MAJOR) + ln -sf $< $@ + +libxenvchan.so.$(MAJOR): libxenvchan.so.$(MAJOR).$(MINOR) + ln -sf $< $@ + +libxenvchan.so.$(MAJOR).$(MINOR): $(LIBVCHAN_PIC_OBJS) + $(CC) $(LDFLAGS) -Wl,$(SONAME_LDFLAG) -Wl,libxenvchan.so.$(MAJOR) $(SHLIB_LDFLAGS) -o $@ $^ $(LIBVCHAN_LIBS) + +libxenvchan.a: $(LIBVCHAN_OBJS) + $(AR) rcs libxenvchan.a $^ + +vchan-node1: $(NODE_OBJS) libxenvchan.so + $(CC) $(LDFLAGS) -o $@ $(NODE_OBJS) $(LDLIBS_libxenvchan) + +vchan-node2: $(NODE2_OBJS) libxenvchan.so + $(CC) $(LDFLAGS) -o $@ $(NODE2_OBJS) $(LDLIBS_libxenvchan) + +.PHONY: install +install: all + $(INSTALL_DIR) $(DESTDIR)$(LIBDIR) + $(INSTALL_DIR) $(DESTDIR)$(INCLUDEDIR) + $(INSTALL_PROG) libxenvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR) + ln -sf libxenvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(LIBDIR)/libxenvchan.so.$(MAJOR) + ln -sf libxenvchan.so.$(MAJOR) $(DESTDIR)$(LIBDIR)/libxenvchan.so + $(INSTALL_DATA) libxenvchan.h $(DESTDIR)$(INCLUDEDIR) + $(INSTALL_DATA) libxenvchan.a $(DESTDIR)$(LIBDIR) + +.PHONY: clean +clean: + $(RM) -f *.o *.so* *.a vchan-node1 vchan-node2 $(DEPS) + +distclean: clean + +-include $(DEPS) diff --git a/tools/libvchan/init.c b/tools/libvchan/init.c new file mode 100644 index 0000000..becf71d --- /dev/null +++ b/tools/libvchan/init.c @@ -0,0 +1,409 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @section DESCRIPTION + * + * This file contains the setup code used to establish the ring buffer. + */ + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/user.h> +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> + +#include <xs.h> +#include <xen/sys/evtchn.h> +#include <xen/sys/gntalloc.h> +#include <xen/sys/gntdev.h> +#include <libxenvchan.h> + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT 12 +#endif + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +#define SMALL_RING_SHIFT 10 +#define LARGE_RING_SHIFT 11 + +#define MAX_SMALL_RING (1 << SMALL_RING_SHIFT) +#define SMALL_RING_OFFSET 1024 +#define MAX_LARGE_RING (1 << LARGE_RING_SHIFT) +#define LARGE_RING_OFFSET 2048 + +// if you go over this size, you''ll have too many grants to fit in the shared page. +#define MAX_RING_SHIFT 20 +#define MAX_RING_SIZE (1 << MAX_RING_SHIFT) + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +#define max(a,b) ((a > b) ? a : b) + +static int init_gnt_srv(struct libxenvchan *ctrl, int domain) +{ + int pages_left = ctrl->read.order >= PAGE_SHIFT ? 1 << (ctrl->read.order - PAGE_SHIFT) : 0; + int pages_right = ctrl->write.order >= PAGE_SHIFT ? 
1 << (ctrl->write.order - PAGE_SHIFT) : 0; + uint32_t ring_ref = -1; + void *ring; + + ring = xc_gntshr_share_page_notify(ctrl->gntshr, domain, + &ring_ref, 1, offsetof(struct vchan_interface, srv_live), + ctrl->event_port); + + if (!ring) + goto out; + + memset(ring, 0, PAGE_SIZE); + + ctrl->ring = ring; + ctrl->read.shr = &ctrl->ring->left; + ctrl->write.shr = &ctrl->ring->right; + ctrl->ring->left_order = ctrl->read.order; + ctrl->ring->right_order = ctrl->write.order; + ctrl->ring->cli_live = 2; + ctrl->ring->srv_live = 1; + ctrl->ring->cli_notify = VCHAN_NOTIFY_WRITE; + + switch (ctrl->read.order) { + case SMALL_RING_SHIFT: + ctrl->read.buffer = ((void*)ctrl->ring) + SMALL_RING_OFFSET; + break; + case LARGE_RING_SHIFT: + ctrl->read.buffer = ((void*)ctrl->ring) + LARGE_RING_OFFSET; + break; + default: + ctrl->read.buffer = xc_gntshr_share_pages(ctrl->gntshr, domain, + pages_left, ctrl->ring->grants, 1); + if (!ctrl->read.buffer) + goto out_ring; + } + + switch (ctrl->write.order) { + case SMALL_RING_SHIFT: + ctrl->write.buffer = ((void*)ctrl->ring) + SMALL_RING_OFFSET; + break; + case LARGE_RING_SHIFT: + ctrl->write.buffer = ((void*)ctrl->ring) + LARGE_RING_OFFSET; + break; + default: + ctrl->write.buffer = xc_gntshr_share_pages(ctrl->gntshr, domain, + pages_right, ctrl->ring->grants + pages_left, 1); + if (!ctrl->write.buffer) + goto out_unmap_left; + } + +out: + return ring_ref; +out_unmap_left: + if (pages_left) + xc_gntshr_munmap(ctrl->gntshr, ctrl->read.buffer, pages_left * PAGE_SIZE); +out_ring: + xc_gntshr_munmap(ctrl->gntshr, ring, PAGE_SIZE); + ring_ref = -1; + ctrl->ring = NULL; + ctrl->write.order = ctrl->read.order = 0; + goto out; +} + +static int init_gnt_cli(struct libxenvchan *ctrl, int domain, uint32_t ring_ref) +{ + int rv = -1; + uint32_t *grants; + + ctrl->ring = xc_gnttab_map_grant_ref_notify(ctrl->gnttab, + domain, ring_ref, PROT_READ|PROT_WRITE, + offsetof(struct vchan_interface, cli_live), ctrl->event_port); + + if (!ctrl->ring) + goto 
out; + + ctrl->write.order = ctrl->ring->left_order; + ctrl->read.order = ctrl->ring->right_order; + ctrl->write.shr = &ctrl->ring->left; + ctrl->read.shr = &ctrl->ring->right; + if (ctrl->write.order < SMALL_RING_SHIFT || ctrl->write.order > MAX_RING_SHIFT) + goto out_unmap_ring; + if (ctrl->read.order < SMALL_RING_SHIFT || ctrl->read.order > MAX_RING_SHIFT) + goto out_unmap_ring; + if (ctrl->read.order == ctrl->write.order && ctrl->read.order < PAGE_SHIFT) + goto out_unmap_ring; + + grants = ctrl->ring->grants; + + switch (ctrl->write.order) { + case SMALL_RING_SHIFT: + ctrl->write.buffer = ((void*)ctrl->ring) + SMALL_RING_OFFSET; + break; + case LARGE_RING_SHIFT: + ctrl->write.buffer = ((void*)ctrl->ring) + LARGE_RING_OFFSET; + break; + default: + { + int pages_left = 1 << (ctrl->write.order - PAGE_SHIFT); + ctrl->write.buffer = xc_gnttab_map_domain_grant_refs(ctrl->gnttab, + pages_left, domain, grants, PROT_READ|PROT_WRITE); + if (!ctrl->write.buffer) + goto out_unmap_ring; + grants += pages_left; + } + } + + switch (ctrl->read.order) { + case SMALL_RING_SHIFT: + ctrl->read.buffer = ((void*)ctrl->ring) + SMALL_RING_OFFSET; + break; + case LARGE_RING_SHIFT: + ctrl->read.buffer = ((void*)ctrl->ring) + LARGE_RING_OFFSET; + break; + default: + { + int pages_right = 1 << (ctrl->read.order - PAGE_SHIFT); + ctrl->read.buffer = xc_gnttab_map_domain_grant_refs(ctrl->gnttab, + pages_right, domain, grants, PROT_READ); + if (!ctrl->read.buffer) + goto out_unmap_left; + } + } + + rv = 0; + out: + return rv; + out_unmap_left: + if (ctrl->write.order >= PAGE_SHIFT) + xc_gnttab_munmap(ctrl->gnttab, ctrl->write.buffer, + 1 << ctrl->write.order); + out_unmap_ring: + xc_gnttab_munmap(ctrl->gnttab, ctrl->ring, PAGE_SIZE); + ctrl->ring = 0; + ctrl->write.order = ctrl->read.order = 0; + rv = -1; + goto out; +} + +static int init_evt_srv(struct libxenvchan *ctrl, int domain, xentoollog_logger *logger) +{ + ctrl->event = xc_evtchn_open(logger, 0); + if (!ctrl->event) + return -1; + 
ctrl->event_port = xc_evtchn_bind_unbound_port(ctrl->event, domain); + if (ctrl->event_port < 0) + return -1; + if (xc_evtchn_unmask(ctrl->event, ctrl->event_port)) + return -1; + return 0; +} + +static int init_xs_srv(struct libxenvchan *ctrl, int domain, const char* xs_base, int ring_ref) +{ + int ret = -1; + struct xs_handle *xs; + struct xs_permissions perms[2]; + char buf[64]; + char ref[16]; + char* domid_str = NULL; + xs = xs_domain_open(); + if (!xs) + goto fail; + domid_str = xs_read(xs, 0, "domid", NULL); + if (!domid_str) + goto fail_xs_open; + + // owner domain is us + perms[0].id = atoi(domid_str); + // permissions for domains not listed = none + perms[0].perms = XS_PERM_NONE; + // other domains + perms[1].id = domain; + perms[1].perms = XS_PERM_READ; + + snprintf(ref, sizeof ref, "%d", ring_ref); + snprintf(buf, sizeof buf, "%s/ring-ref", xs_base); + if (!xs_write(xs, 0, buf, ref, strlen(ref))) + goto fail_xs_open; + if (!xs_set_permissions(xs, 0, buf, perms, 2)) + goto fail_xs_open; + + snprintf(ref, sizeof ref, "%d", ctrl->event_port); + snprintf(buf, sizeof buf, "%s/event-channel", xs_base); + if (!xs_write(xs, 0, buf, ref, strlen(ref))) + goto fail_xs_open; + if (!xs_set_permissions(xs, 0, buf, perms, 2)) + goto fail_xs_open; + + ret = 0; + fail_xs_open: + free(domid_str); + xs_daemon_close(xs); + fail: + return ret; +} + +static int min_order(size_t siz) +{ + int rv = PAGE_SHIFT; + while (siz > (1 << rv)) + rv++; + return rv; +} + +struct libxenvchan *libxenvchan_server_init(xentoollog_logger *logger, int domain, const char* xs_path, size_t left_min, size_t right_min) +{ + struct libxenvchan *ctrl; + int ring_ref; + if (left_min > MAX_RING_SIZE || right_min > MAX_RING_SIZE) + return 0; + + ctrl = malloc(sizeof(*ctrl)); + if (!ctrl) + return 0; + + ctrl->ring = NULL; + ctrl->event = NULL; + ctrl->is_server = 1; + ctrl->server_persist = 0; + + ctrl->read.order = min_order(left_min); + ctrl->write.order = min_order(right_min); + + // if we can avoid 
allocating extra pages by using in-page rings, do so + if (left_min <= MAX_SMALL_RING && right_min <= MAX_LARGE_RING) { + ctrl->read.order = SMALL_RING_SHIFT; + ctrl->write.order = LARGE_RING_SHIFT; + } else if (left_min <= MAX_LARGE_RING && right_min <= MAX_SMALL_RING) { + ctrl->read.order = LARGE_RING_SHIFT; + ctrl->write.order = SMALL_RING_SHIFT; + } else if (left_min <= MAX_LARGE_RING) { + ctrl->read.order = LARGE_RING_SHIFT; + } else if (right_min <= MAX_LARGE_RING) { + ctrl->write.order = LARGE_RING_SHIFT; + } + + ctrl->gntshr = xc_gntshr_open(logger, 0); + if (!ctrl->gntshr) + goto out; + + if (init_evt_srv(ctrl, domain, logger)) + goto out; + ring_ref = init_gnt_srv(ctrl, domain); + if (ring_ref < 0) + goto out; + if (init_xs_srv(ctrl, domain, xs_path, ring_ref)) + goto out; + return ctrl; +out: + libxenvchan_close(ctrl); + return 0; +} + +static int init_evt_cli(struct libxenvchan *ctrl, int domain, xentoollog_logger *logger) +{ + ctrl->event = xc_evtchn_open(logger, 0); + if (!ctrl->event) + return -1; + ctrl->event_port = xc_evtchn_bind_interdomain(ctrl->event, + domain, ctrl->event_port); + if (ctrl->event_port < 0) + return -1; + xc_evtchn_unmask(ctrl->event, ctrl->event_port); + return 0; +} + + +struct libxenvchan *libxenvchan_client_init(xentoollog_logger *logger, int domain, const char* xs_path) +{ + struct libxenvchan *ctrl = malloc(sizeof(struct libxenvchan)); + struct xs_handle *xs = NULL; + char buf[64]; + char *ref; + int ring_ref; + unsigned int len; + + if (!ctrl) + return 0; + ctrl->ring = NULL; + ctrl->event = NULL; + ctrl->gnttab = NULL; + ctrl->write.order = ctrl->read.order = 0; + ctrl->is_server = 0; + + xs = xs_daemon_open(); + if (!xs) + xs = xs_domain_open(); + if (!xs) + goto fail; + +// find xenstore entry + snprintf(buf, sizeof buf, "%s/ring-ref", xs_path); + ref = xs_read(xs, 0, buf, &len); + if (!ref) + goto fail; + ring_ref = atoi(ref); + free(ref); + if (!ring_ref) + goto fail; + snprintf(buf, sizeof buf, "%s/event-channel", 
xs_path); + ref = xs_read(xs, 0, buf, &len); + if (!ref) + goto fail; + ctrl->event_port = atoi(ref); + free(ref); + if (!ctrl->event_port) + goto fail; + + ctrl->gnttab = xc_gnttab_open(logger, 0); + if (!ctrl->gnttab) + goto out; + +// set up event channel + if (init_evt_cli(ctrl, domain, logger)) + goto fail; + +// set up shared page(s) + if (init_gnt_cli(ctrl, domain, ring_ref)) + goto fail; + + ctrl->ring->cli_live = 1; + ctrl->ring->srv_notify = VCHAN_NOTIFY_WRITE; + + out: + if (xs) + xs_daemon_close(xs); + return ctrl; + fail: + libxenvchan_close(ctrl); + ctrl = NULL; + goto out; +} diff --git a/tools/libvchan/io.c b/tools/libvchan/io.c new file mode 100644 index 0000000..7023add --- /dev/null +++ b/tools/libvchan/io.c @@ -0,0 +1,375 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @section DESCRIPTION + * + * This file contains the communications interface built on the ring buffer. 
+ */ + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/uio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <unistd.h> + +#include <xenctrl.h> +#include <libxenvchan.h> + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT 12 +#endif + +#ifndef PAGE_SIZE +#define PAGE_SIZE 4096 +#endif + +// allow vchan data to be easily observed in strace by doing a +// writev() to FD -1 with the data being read/written. +#ifndef VCHAN_DEBUG +#define VCHAN_DEBUG 0 +#endif + +#define barrier() asm volatile("" ::: "memory") + + +static inline uint32_t rd_prod(struct libxenvchan *ctrl) +{ + return ctrl->read.shr->prod; +} + +static inline uint32_t* _rd_cons(struct libxenvchan *ctrl) +{ + return &ctrl->read.shr->cons; +} +#define rd_cons(x) (*_rd_cons(x)) + +static inline uint32_t* _wr_prod(struct libxenvchan *ctrl) +{ + return &ctrl->write.shr->prod; +} +#define wr_prod(x) (*_wr_prod(x)) + +static inline uint32_t wr_cons(struct libxenvchan *ctrl) +{ + return ctrl->write.shr->cons; +} + +static inline const void* rd_ring(struct libxenvchan *ctrl) +{ + return ctrl->read.buffer; +} + +static inline void* wr_ring(struct libxenvchan *ctrl) +{ + return ctrl->write.buffer; +} + +static inline uint32_t wr_ring_size(struct libxenvchan *ctrl) +{ + return (1 << ctrl->write.order); +} + +static inline uint32_t rd_ring_size(struct libxenvchan *ctrl) +{ + return (1 << ctrl->read.order); +} + +static inline void request_notify(struct libxenvchan *ctrl, uint8_t bit) +{ + uint8_t *notify = ctrl->is_server ? &ctrl->ring->cli_notify : &ctrl->ring->srv_notify; + __sync_or_and_fetch(notify, bit); +} + +static inline int send_notify(struct libxenvchan *ctrl, uint8_t bit) +{ + uint8_t *notify = ctrl->is_server ? 
&ctrl->ring->srv_notify : &ctrl->ring->cli_notify; + uint8_t prev = __sync_fetch_and_and(notify, ~bit); + if (prev & bit) + return xc_evtchn_notify(ctrl->event, ctrl->event_port); + else + return 0; +} + +/** + * Get the amount of buffer space available and enable notifications if needed. + */ +static inline int fast_get_data_ready(struct libxenvchan *ctrl, size_t request) +{ + int ready = rd_prod(ctrl) - rd_cons(ctrl); + if (ready >= request) + return ready; + /* We plan to consume all data; please tell us if you send more */ + request_notify(ctrl, VCHAN_NOTIFY_WRITE); + /* + * If the writer moved rd_prod after our read but before request, we + * will not get notified even though the actual amount of data ready is + * above request. Reread rd_prod to cover this case. + */ + return rd_prod(ctrl) - rd_cons(ctrl); +} + +int libxenvchan_data_ready(struct libxenvchan *ctrl) +{ + /* Since this value is being used outside libxenvchan, request notification + * when it changes + */ + request_notify(ctrl, VCHAN_NOTIFY_WRITE); + return rd_prod(ctrl) - rd_cons(ctrl); +} + +/** + * Get the amount of buffer space available and enable notifications if needed. + */ +static inline int fast_get_buffer_space(struct libxenvchan *ctrl, size_t request) +{ + int ready = wr_ring_size(ctrl) - (wr_prod(ctrl) - wr_cons(ctrl)); + if (ready >= request) + return ready; + /* We plan to fill the buffer; please tell us when you''ve read it */ + request_notify(ctrl, VCHAN_NOTIFY_READ); + /* + * If the reader moved wr_cons after our read but before request, we + * will not get notified even though the actual amount of buffer space + * is above request. Reread wr_cons to cover this case. 
+ */ + return wr_ring_size(ctrl) - (wr_prod(ctrl) - wr_cons(ctrl)); +} + +int libxenvchan_buffer_space(struct libxenvchan *ctrl) +{ + /* Since this value is being used outside libxenvchan, request notification + * when it changes + */ + request_notify(ctrl, VCHAN_NOTIFY_READ); + return wr_ring_size(ctrl) - (wr_prod(ctrl) - wr_cons(ctrl)); +} + +int libxenvchan_wait(struct libxenvchan *ctrl) +{ + int ret = xc_evtchn_pending(ctrl->event); + if (ret < 0) + return -1; + xc_evtchn_unmask(ctrl->event, ret); + return 0; +} + +/** + * returns -1 on error, or size on success + */ +static int do_send(struct libxenvchan *ctrl, const void *data, size_t size) +{ + int real_idx = wr_prod(ctrl) & (wr_ring_size(ctrl) - 1); + int avail_contig = wr_ring_size(ctrl) - real_idx; + if (VCHAN_DEBUG) { + char metainfo[32]; + struct iovec iov[2]; + iov[0].iov_base = metainfo; + iov[0].iov_len = snprintf(metainfo, 32, "vchan@%p wr", ctrl); + iov[1].iov_base = (void *)data; + iov[1].iov_len = size; + writev(-1, iov, 2); + } + if (avail_contig > size) + avail_contig = size; + memcpy(wr_ring(ctrl) + real_idx, data, avail_contig); + if (avail_contig < size) + { + // we rolled across the end of the ring + memcpy(wr_ring(ctrl), data + avail_contig, size - avail_contig); + } + barrier(); // data must be in the ring prior to increment + wr_prod(ctrl) += size; + barrier(); // increment must happen prior to notify + if (send_notify(ctrl, VCHAN_NOTIFY_WRITE)) + return -1; + return size; +} + +/** + * returns 0 if no buffer space is available, -1 on error, or size on success + */ +int libxenvchan_send(struct libxenvchan *ctrl, const void *data, size_t size) +{ + int avail; + while (1) { + if (!libxenvchan_is_open(ctrl)) + return -1; + avail = fast_get_buffer_space(ctrl, size); + if (size <= avail) + return do_send(ctrl, data, size); + if (!ctrl->blocking) + return 0; + if (size > wr_ring_size(ctrl)) + return -1; + if (libxenvchan_wait(ctrl)) + return -1; + } +} + +int libxenvchan_write(struct 
libxenvchan *ctrl, const void *data, size_t size) +{ + int avail; + if (!libxenvchan_is_open(ctrl)) + return -1; + if (ctrl->blocking) { + size_t pos = 0; + while (1) { + avail = fast_get_buffer_space(ctrl, size - pos); + if (pos + avail > size) + avail = size - pos; + if (avail) + pos += do_send(ctrl, data + pos, avail); + if (pos == size) + return pos; + if (libxenvchan_wait(ctrl)) + return -1; + if (!libxenvchan_is_open(ctrl)) + return -1; + } + } else { + avail = fast_get_buffer_space(ctrl, size); + if (size > avail) + size = avail; + if (size == 0) + return 0; + return do_send(ctrl, data, size); + } +} + +static int do_recv(struct libxenvchan *ctrl, void *data, size_t size) +{ + int real_idx = rd_cons(ctrl) & (rd_ring_size(ctrl) - 1); + int avail_contig = rd_ring_size(ctrl) - real_idx; + if (avail_contig > size) + avail_contig = size; + barrier(); // data read must happen after rd_cons read + memcpy(data, rd_ring(ctrl) + real_idx, avail_contig); + if (avail_contig < size) + { + // we rolled across the end of the ring + memcpy(data + avail_contig, rd_ring(ctrl), size - avail_contig); + } + rd_cons(ctrl) += size; + if (VCHAN_DEBUG) { + char metainfo[32]; + struct iovec iov[2]; + iov[0].iov_base = metainfo; + iov[0].iov_len = snprintf(metainfo, 32, "vchan@%p rd", ctrl); + iov[1].iov_base = data; + iov[1].iov_len = size; + writev(-1, iov, 2); + } + barrier(); // consumption must happen prior to notify of newly freed space + if (send_notify(ctrl, VCHAN_NOTIFY_READ)) + return -1; + return size; +} + +/** + * reads exactly size bytes from the vchan. 
+ * returns 0 if insufficient data is available, -1 on error, or size on success + */ +int libxenvchan_recv(struct libxenvchan *ctrl, void *data, size_t size) +{ + while (1) { + int avail = fast_get_data_ready(ctrl, size); + if (size <= avail) + return do_recv(ctrl, data, size); + if (!libxenvchan_is_open(ctrl)) + return -1; + if (!ctrl->blocking) + return 0; + if (size > rd_ring_size(ctrl)) + return -1; + if (libxenvchan_wait(ctrl)) + return -1; + } +} + +int libxenvchan_read(struct libxenvchan *ctrl, void *data, size_t size) +{ + while (1) { + int avail = fast_get_data_ready(ctrl, size); + if (avail && size > avail) + size = avail; + if (avail) + return do_recv(ctrl, data, size); + if (!libxenvchan_is_open(ctrl)) + return -1; + if (!ctrl->blocking) + return 0; + if (libxenvchan_wait(ctrl)) + return -1; + } +} + +int libxenvchan_is_open(struct libxenvchan* ctrl) +{ + if (ctrl->is_server) + return ctrl->server_persist ? 1 : ctrl->ring->cli_live; + else + return ctrl->ring->srv_live; +} + +int libxenvchan_fd_for_select(struct libxenvchan *ctrl) +{ + return xc_evtchn_fd(ctrl->event); +} + +void libxenvchan_close(struct libxenvchan *ctrl) +{ + if (!ctrl) + return; + if (ctrl->read.order >= PAGE_SHIFT) + munmap(ctrl->read.buffer, 1 << ctrl->read.order); + if (ctrl->write.order >= PAGE_SHIFT) + munmap(ctrl->write.buffer, 1 << ctrl->write.order); + if (ctrl->ring) { + if (ctrl->is_server) { + ctrl->ring->srv_live = 0; + xc_gntshr_munmap(ctrl->gntshr, ctrl->ring, PAGE_SIZE); + } else { + ctrl->ring->cli_live = 0; + xc_gnttab_munmap(ctrl->gnttab, ctrl->ring, PAGE_SIZE); + } + } + if (ctrl->event) { + if (ctrl->event_port >= 0 && ctrl->ring) + xc_evtchn_notify(ctrl->event, ctrl->event_port); + xc_evtchn_close(ctrl->event); + } + if (ctrl->is_server) { + if (ctrl->gntshr) + xc_gntshr_close(ctrl->gntshr); + } else { + if (ctrl->gnttab) + xc_gnttab_close(ctrl->gnttab); + } + free(ctrl); +} diff --git a/tools/libvchan/libxenvchan.h b/tools/libvchan/libxenvchan.h new file mode 
100644 index 0000000..6365d36 --- /dev/null +++ b/tools/libvchan/libxenvchan.h @@ -0,0 +1,169 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @section DESCRIPTION + * + * Originally borrowed from the Qubes OS Project, http://www.qubes-os.org, + * this code has been substantially rewritten to use the gntdev and gntalloc + * devices instead of raw MFNs and map_foreign_range. + * + * This is a library for inter-domain communication. A standard Xen ring + * buffer is used, with a datagram-based interface built on top. The grant + * reference and event channels are shared in XenStore under the path + * /local/domain/<srv-id>/data/vchan/<cli-id>/<port>/{ring-ref,event-channel} + * + * The ring.h macros define an asymmetric interface to a shared data structure + * that assumes all rings reside in a single contiguous memory space. This is + * not suitable for vchan because the interface to the ring is symmetric except + * for the setup. 
Unlike the producer-consumer rings defined in ring.h, the + * size of the rings used in vchan are determined at execution time instead of + * compile time, so the macros in ring.h cannot be used to access the rings. + */ + +#include <xen/io/libxenvchan.h> +#include <xen/sys/evtchn.h> +#include <xenctrl.h> + +struct libxenvchan_ring { + /* Pointer into the shared page. Offsets into buffer. */ + struct ring_shared* shr; + /* ring data; may be its own shared page(s) depending on order */ + void* buffer; + /** + * The size of the ring is (1 << order); offsets wrap around when they + * exceed this. This copy is required because we can''t trust the order + * in the shared page to remain constant. + */ + int order; +}; + +/** + * struct libxenvchan: control structure passed to all library calls + */ +struct libxenvchan { + /* Mapping handle for shared ring page */ + union { + xc_gntshr *gntshr; /* for server */ + xc_gnttab *gnttab; /* for client */ + }; + /* Pointer to shared ring page */ + struct vchan_interface *ring; + /* event channel interface */ + xc_evtchn *event; + uint32_t event_port; + /* informative flags: are we acting as server? 
*/ + int is_server:1; + /* true if server remains active when client closes (allows reconnection) */ + int server_persist:1; + /* true if operations should block instead of returning 0 */ + int blocking:1; + /* communication rings */ + struct libxenvchan_ring read, write; +}; + +/** + * Set up a vchan, including granting pages + * @param logger Logger for libxc errors + * @param domain The peer domain that will be connecting + * @param xs_path Base xenstore path for storing ring/event data + * @param send_min The minimum size (in bytes) of the send ring (left) + * @param recv_min The minimum size (in bytes) of the receive ring (right) + * @return The structure, or NULL in case of an error + */ +struct libxenvchan *libxenvchan_server_init(xentoollog_logger *logger, int domain, const char* xs_path, size_t read_min, size_t write_min); +/** + * Connect to an existing vchan. Note: you can reconnect to an existing vchan + * safely, however no locking is performed, so you must prevent multiple clients + * from connecting to a single server. + * + * @param logger Logger for libxc errors + * @param domain The peer domain to connect to + * @param xs_path Base xenstore path for storing ring/event data + * @return The structure, or NULL in case of an error + */ +struct libxenvchan *libxenvchan_client_init(xentoollog_logger *logger, int domain, const char* xs_path); +/** + * Close a vchan. This deallocates the vchan and attempts to free its + * resources. The other side is notified of the close, but can still read any + * data pending prior to the close. + */ +void libxenvchan_close(struct libxenvchan *ctrl); + +/** + * Packet-based receive: always reads exactly $size bytes. 
+ * @param ctrl The vchan control structure + * @param data Buffer for data that was read + * @param size Size of the buffer and amount of data to read + * @return -1 on error, 0 if nonblocking and insufficient data is available, or $size + */ +int libxenvchan_recv(struct libxenvchan *ctrl, void *data, size_t size); +/** + * Stream-based receive: reads as much data as possible. + * @param ctrl The vchan control structure + * @param data Buffer for data that was read + * @param size Size of the buffer + * @return -1 on error, otherwise the amount of data read (which may be zero if + * the vchan is nonblocking) + */ +int libxenvchan_read(struct libxenvchan *ctrl, void *data, size_t size); +/** + * Packet-based send: send entire buffer if possible + * @param ctrl The vchan control structure + * @param data Buffer for data to send + * @param size Size of the buffer and amount of data to send + * @return -1 on error, 0 if nonblocking and insufficient space is available, or $size + */ +int libxenvchan_send(struct libxenvchan *ctrl, const void *data, size_t size); +/** + * Stream-based send: send as much data as possible. + * @param ctrl The vchan control structure + * @param data Buffer for data to send + * @param size Size of the buffer + * @return -1 on error, otherwise the amount of data sent (which may be zero if + * the vchan is nonblocking) + */ +int libxenvchan_write(struct libxenvchan *ctrl, const void *data, size_t size); +/** + * Waits for reads or writes to unblock, or for a close + */ +int libxenvchan_wait(struct libxenvchan *ctrl); +/** + * Returns the event file descriptor for this vchan. When this FD is readable, + * libxenvchan_wait() will not block, and the state of the vchan has changed since + * the last invocation of libxenvchan_wait(). 
+ */ +int libxenvchan_fd_for_select(struct libxenvchan *ctrl); +/** + * Query the state of the vchan shared page: + * return 0 when one side has called libxenvchan_close() or crashed + * return 1 when both sides are open + * return 2 [server only] when no client has yet connected + */ +int libxenvchan_is_open(struct libxenvchan* ctrl); +/** Amount of data ready to read, in bytes */ +int libxenvchan_data_ready(struct libxenvchan *ctrl); +/** Amount of data it is possible to send without blocking */ +int libxenvchan_buffer_space(struct libxenvchan *ctrl); diff --git a/tools/libvchan/node-select.c b/tools/libvchan/node-select.c new file mode 100644 index 0000000..6c6c19e --- /dev/null +++ b/tools/libvchan/node-select.c @@ -0,0 +1,162 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @section DESCRIPTION + * + * This is a test program for libxenvchan. Communications are bidirectional, + * with either server (grant offeror) or client able to read and write. 
+ */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <errno.h> + +#include <libxenvchan.h> + +void usage(char** argv) +{ + fprintf(stderr, "usage:\n" + "\t%s [client|server] domainid nodepath [rbufsiz wbufsiz]\n", + argv[0]); + exit(1); +} + +#define BUFSIZE 5000 +char inbuf[BUFSIZE]; +char outbuf[BUFSIZE]; +int insiz = 0; +int outsiz = 0; +struct libxenvchan *ctrl = 0; + +void vchan_wr() { + if (!insiz) + return; + int ret = libxenvchan_write(ctrl, inbuf, insiz); + if (ret < 0) { + fprintf(stderr, "vchan write failed\n"); + exit(1); + } + if (ret > 0) { + insiz -= ret; + memmove(inbuf, inbuf + ret, insiz); + } +} + +void stdout_wr() { + if (!outsiz) + return; + int ret = write(1, outbuf, outsiz); + if (ret < 0 && errno != EAGAIN) + exit(1); + if (ret > 0) { + outsiz -= ret; + memmove(outbuf, outbuf + ret, outsiz); + } +} + +/** + Simple libxenvchan application, both client and server. + Both sides may write and read, both from the libxenvchan and from + stdin/stdout (just like netcat). +*/ + +int main(int argc, char **argv) +{ + int ret; + int libxenvchan_fd; + if (argc < 4 || argv[3][0] != ''/'') + usage(argv); + if (!strcmp(argv[1], "server")) { + int rsiz = argc > 4 ? atoi(argv[4]) : 0; + int wsiz = argc > 5 ? 
atoi(argv[5]) : 0; + ctrl = libxenvchan_server_init(NULL, atoi(argv[2]), argv[3], rsiz, wsiz); + } else if (!strcmp(argv[1], "client")) + ctrl = libxenvchan_client_init(NULL, atoi(argv[2]), argv[3]); + else + usage(argv); + if (!ctrl) { + perror("libxenvchan_*_init"); + exit(1); + } + + fcntl(0, F_SETFL, O_NONBLOCK); + fcntl(1, F_SETFL, O_NONBLOCK); + + libxenvchan_fd = libxenvchan_fd_for_select(ctrl); + for (;;) { + fd_set rfds; + fd_set wfds; + FD_ZERO(&rfds); + FD_ZERO(&wfds); + if (insiz != BUFSIZE) + FD_SET(0, &rfds); + if (outsiz) + FD_SET(1, &wfds); + FD_SET(libxenvchan_fd, &rfds); + ret = select(libxenvchan_fd + 1, &rfds, &wfds, NULL, NULL); + if (ret < 0) { + perror("select"); + exit(1); + } + if (FD_ISSET(0, &rfds)) { + ret = read(0, inbuf + insiz, BUFSIZE - insiz); + if (ret < 0 && errno != EAGAIN) + exit(1); + if (ret == 0) { + while (insiz) { + vchan_wr(); + libxenvchan_wait(ctrl); + } + return 0; + } + if (ret) + insiz += ret; + vchan_wr(); + } + if (FD_ISSET(libxenvchan_fd, &rfds)) { + libxenvchan_wait(ctrl); + vchan_wr(); + } + if (FD_ISSET(1, &wfds)) + stdout_wr(); + while (libxenvchan_data_ready(ctrl) && outsiz < BUFSIZE) { + ret = libxenvchan_read(ctrl, outbuf + outsiz, BUFSIZE - outsiz); + if (ret < 0) + exit(1); + outsiz += ret; + stdout_wr(); + } + if (!libxenvchan_is_open(ctrl)) { + fcntl(1, F_SETFL, 0); + while (outsiz) + stdout_wr(); + return 0; + } + } +} diff --git a/tools/libvchan/node.c b/tools/libvchan/node.c new file mode 100644 index 0000000..cab8368 --- /dev/null +++ b/tools/libvchan/node.c @@ -0,0 +1,169 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software 
Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @section DESCRIPTION + * + * This is a test program for libxenvchan. Communications are in one direction, + * either server (grant offeror) to client or vice versa. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <time.h> + +#include <libxenvchan.h> + +int libxenvchan_write_all(struct libxenvchan *ctrl, char *buf, int size) +{ + int written = 0; + int ret; + while (written < size) { + ret = libxenvchan_write(ctrl, buf + written, size - written); + if (ret <= 0) { + perror("write"); + exit(1); + } + written += ret; + } + return size; +} + +int write_all(int fd, char *buf, int size) +{ + int written = 0; + int ret; + while (written < size) { + ret = write(fd, buf + written, size - written); + if (ret <= 0) { + perror("write"); + exit(1); + } + written += ret; + } + return size; +} + +void usage(char** argv) +{ + fprintf(stderr, "usage:\n" + "%s [client|server] [read|write] domid nodepath\n", argv[0]); + exit(1); +} + +#define BUFSIZE 5000 +char buf[BUFSIZE]; +void reader(struct libxenvchan *ctrl) +{ + int size; + for (;;) { + size = rand() % (BUFSIZE - 1) + 1; + size = libxenvchan_read(ctrl, buf, size); + fprintf(stderr, "#"); + if (size < 0) { + perror("read vchan"); + libxenvchan_close(ctrl); + exit(1); + } + size = write_all(1, buf, size); + if (size < 0) { + perror("stdout write"); + exit(1); + } + if (size == 0) { + perror("write 
size=0?\n"); + exit(1); + } + } +} + +void writer(struct libxenvchan *ctrl) +{ + int size; + for (;;) { + size = rand() % (BUFSIZE - 1) + 1; + size = read(0, buf, size); + if (size < 0) { + perror("read stdin"); + libxenvchan_close(ctrl); + exit(1); + } + if (size == 0) + break; + size = libxenvchan_write_all(ctrl, buf, size); + fprintf(stderr, "#"); + if (size < 0) { + perror("vchan write"); + exit(1); + } + if (size == 0) { + perror("write size=0?\n"); + exit(1); + } + } +} + + +/** + Simple libxenvchan application, both client and server. + One side does writing, the other side does reading; both from + standard input/output fds. +*/ +int main(int argc, char **argv) +{ + int seed = time(0); + struct libxenvchan *ctrl = 0; + int wr = 0; + if (argc < 4) + usage(argv); + if (!strcmp(argv[2], "read")) + wr = 0; + else if (!strcmp(argv[2], "write")) + wr = 1; + else + usage(argv); + if (!strcmp(argv[1], "server")) + ctrl = libxenvchan_server_init(NULL, atoi(argv[3]), argv[4], 0, 0); + else if (!strcmp(argv[1], "client")) + ctrl = libxenvchan_client_init(NULL, atoi(argv[3]), argv[4]); + else + usage(argv); + if (!ctrl) { + perror("libxenvchan_*_init"); + exit(1); + } + ctrl->blocking = 1; + + srand(seed); + fprintf(stderr, "seed=%d\n", seed); + if (wr) + writer(ctrl); + else + reader(ctrl); + libxenvchan_close(ctrl); + return 0; +} diff --git a/xen/include/public/io/libxenvchan.h b/xen/include/public/io/libxenvchan.h new file mode 100644 index 0000000..5c3d3d4 --- /dev/null +++ b/xen/include/public/io/libxenvchan.h @@ -0,0 +1,97 @@ +/** + * @file + * @section AUTHORS + * + * Copyright (C) 2010 Rafal Wojtczuk <rafal@invisiblethingslab.com> + * + * Authors: + * Rafal Wojtczuk <rafal@invisiblethingslab.com> + * Daniel De Graaf <dgdegra@tycho.nsa.gov> + * + * @section LICENSE + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + 
* version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * @section DESCRIPTION + * + * Originally borrowed from the Qubes OS Project, http://www.qubes-os.org, + * this code has been substantially rewritten to use the gntdev and gntalloc + * devices instead of raw MFNs and map_foreign_range. + * + * This is a library for inter-domain communication. A standard Xen ring + * buffer is used, with a datagram-based interface built on top. The grant + * reference and event channels are shared in XenStore under a user-specified + * path. + * + * The ring.h macros define an asymmetric interface to a shared data structure + * that assumes all rings reside in a single contiguous memory space. This is + * not suitable for vchan because the interface to the ring is symmetric except + * for the setup. Unlike the producer-consumer rings defined in ring.h, the + * size of the rings used in vchan are determined at execution time instead of + * compile time, so the macros in ring.h cannot be used to access the rings. 
+ */ + +#include <stdint.h> +#include <sys/types.h> + +struct ring_shared { + uint32_t cons, prod; +}; + +#define VCHAN_NOTIFY_WRITE 0x1 +#define VCHAN_NOTIFY_READ 0x2 + +/** + * vchan_interface: primary shared data structure + */ +struct vchan_interface { + /** + * Standard consumer/producer interface, one pair per buffer + * left is client write, server read + * right is client read, server write + */ + struct ring_shared left, right; + /** + * size of the rings, which determines their location + * 10 - at offset 1024 in ring''s page + * 11 - at offset 2048 in ring''s page + * 12+ - uses 2^(N-12) grants to describe the multi-page ring + * These should remain constant once the page is shared. + * Only one of the two orders can be 10 (or 11). + */ + uint16_t left_order, right_order; + /** + * Shutdown detection: + * 0: client (or server) has exited + * 1: client (or server) is connected + * 2: client has not yet connected + */ + uint8_t cli_live, srv_live; + /** + * Notification bits: + * VCHAN_NOTIFY_WRITE: send notify when data is written + * VCHAN_NOTIFY_READ: send notify when data is read (consumed) + * cli_notify is used for the client to inform the server of its action + */ + uint8_t cli_notify, srv_notify; + /** + * Grant list: ordering is left, right. Must not extend into actual ring + * or grow beyond the end of the initial shared page. + * These should remain constant once the page is shared, to allow + * for possible remapping by a client that restarts. + */ + uint32_t grants[0]; +}; + -- 1.7.6.2 _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Vasiliy Tolstov
2011-Sep-30 07:51 UTC
Re: [Xen-devel] [PATCH v6 0/3] libxenvchan: interdomain communications library
2011/9/23 Daniel De Graaf <dgdegra@tycho.nsa.gov>> Changes since v5: > - Unify gntdev osdep interface > - Eliminate notify_result and revert mapping if notify ioctl fails > - Rename functions and structures to libxenvchan > - Use application-specified xenstore path for ring/event data > - Enforce maximum ring size of 2^20 bytes on client > - Change to LGPL 2.1 > > [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify > [PATCH 2/3] libxc: add xc_gntshr_* functions > [PATCH 3/3] libvchan: interdomain communications library > > >Hello. Sorry for bumping.. What version of xen kernel i need to use this library? Now i have 2.6.32.26 in many domUs. And what i need in dom0 for that, if i want to communicate via libxenvchan from domU to dom0? -- Vasiliy Tolstov, Clodo.ru e-mail: v.tolstov@selfip.ru jabber: vase@selfip.ru _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Vasiliy Tolstov
2011-Sep-30 08:28 UTC
Re: [Xen-devel] [PATCH v6 0/3] libxenvchan: interdomain communications library
2011/9/30 Vasiliy Tolstov <v.tolstov@selfip.ru>> > > 2011/9/23 Daniel De Graaf <dgdegra@tycho.nsa.gov> > >> Changes since v5: >> - Unify gntdev osdep interface >> - Eliminate notify_result and revert mapping if notify ioctl fails >> - Rename functions and structures to libxenvchan >> - Use application-specified xenstore path for ring/event data >> - Enforce maximum ring size of 2^20 bytes on client >> - Change to LGPL 2.1 >> >> [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify >> [PATCH 2/3] libxc: add xc_gntshr_* functions >> [PATCH 3/3] libvchan: interdomain communications library >> >> >> > Hello. Sorry for bumping.. > What version of xen kernel i need to use this library? Now i have 2.6.32.26 > in many domUs. > And what i need in dom0 for that, if i want to communicate via libxenvchan > from domU to dom0? > > >And where i can find latest version of the patch? -- Vasiliy Tolstov, Clodo.ru e-mail: v.tolstov@selfip.ru jabber: vase@selfip.ru _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Campbell
2011-Sep-30 08:34 UTC
[Xen-devel] Re: [PATCH v6 0/3] libxenvchan: interdomain communications library
On Thu, 2011-09-22 at 23:14 +0100, Daniel De Graaf wrote: > Changes since v5: > - Unify gntdev osdep interface > - Eliminate notify_result and revert mapping if notify ioctl fails > - Rename functions and structures to libxenvchan > - Use application-specified xenstore path for ring/event data > - Enforce maximum ring size of 2^20 bytes on client > - Change to LGPL 2.1 > > [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify > [PATCH 2/3] libxc: add xc_gntshr_* functions > [PATCH 3/3] libvchan: interdomain communications library
I meant to say this before but, modulo the spurious changes to tools/libxl/libxlu_cfg_l.[ch] in the first patch, the whole lot are: Acked-by: Ian Campbell <ian.campbell@citrix.com> The error reporting for missing XC_OSDEP_GNTSHR is missing in xc_netbsd.c but I think that should be pulled out into common code -- I'll send out a patch. Ian. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Campbell
2011-Sep-30 08:37 UTC
[Xen-devel] [PATCH] libxc: osdep: report missing backends in common code
# HG changeset patch # User Ian Campbell <ian.campbell@citrix.com> # Date 1317371726 -3600 # Node ID 19d4d6b854a6e54612a4a3a993af0716668e6627 # Parent e50da6b98e3d5933b9c98e8f43096fd3ebbae00d libxc: osdep: report missing backends in common code Backends were inconsistent about reporting and it''s a pain to edit them all when adding a new class of osdep. Signed-off-by: Ian Campbell <Ian.campbell@citrix.com> --- Requires Daniel De Graaf''s "libxc: add xc_gntshr_* functions" diff -r e50da6b98e3d -r 19d4d6b854a6 tools/libxc/xc_netbsd.c --- a/tools/libxc/xc_netbsd.c Thu Sep 29 17:21:32 2011 +0100 +++ b/tools/libxc/xc_netbsd.c Fri Sep 30 09:35:26 2011 +0100 @@ -386,9 +386,6 @@ static struct xc_osdep_ops *netbsd_osdep return &netbsd_privcmd_ops; case XC_OSDEP_EVTCHN: return &netbsd_evtchn_ops; - case XC_OSDEP_GNTTAB: - ERROR("GNTTAB interface not supported on this platform"); - return NULL; default: return NULL; } diff -r e50da6b98e3d -r 19d4d6b854a6 tools/libxc/xc_private.c --- a/tools/libxc/xc_private.c Thu Sep 29 17:21:32 2011 +0100 +++ b/tools/libxc/xc_private.c Fri Sep 30 09:35:26 2011 +0100 @@ -111,6 +111,18 @@ static void xc_osdep_put(xc_osdep_info_t #endif } +static const char *xc_osdep_type_name(enum xc_osdep_type type) +{ + switch ( type ) + { + case XC_OSDEP_PRIVCMD: return "privcmd"; + case XC_OSDEP_EVTCHN: return "evtchn"; + case XC_OSDEP_GNTTAB: return "gnttab"; + case XC_OSDEP_GNTSHR: return "gntshr"; + } + return "unknown"; +} + static struct xc_interface_core *xc_interface_open_common(xentoollog_logger *logger, xentoollog_logger *dombuild_logger, unsigned open_flags, @@ -161,7 +173,11 @@ static struct xc_interface_core *xc_inte xch->ops = xch->osdep.init(xch, type); if ( xch->ops == NULL ) + { + ERROR("OSDEP: interface %d (%s) not supported on this platform", + type, xc_osdep_type_name(type)); goto err_put_iface; + } xch->ops_handle = xch->ops->open(xch); if (xch->ops_handle == XC_OSDEP_OPEN_ERROR) diff -r e50da6b98e3d -r 19d4d6b854a6 
tools/libxc/xc_solaris.c --- a/tools/libxc/xc_solaris.c Thu Sep 29 17:21:32 2011 +0100 +++ b/tools/libxc/xc_solaris.c Fri Sep 30 09:35:26 2011 +0100 @@ -322,9 +322,6 @@ static struct xc_osdep_ops *solaris_osde return &solaris_privcmd_ops; case XC_OSDEP_EVTCHN: return &solaris_evtchn_ops; - case XC_OSDEP_GNTTAB: - ERROR("GNTTAB interface not supported on this platform"); - return NULL; default: return NULL; } _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Campbell
2011-Sep-30 09:16 UTC
[Xen-devel] Re: [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify
On Thu, 2011-09-22 at 23:14 +0100, Daniel De Graaf wrote:> diff --git a/tools/libxc/xenctrlosdep.h b/tools/libxc/xenctrlosdep.h > index bfe46e0..1c6317e 100644 > --- a/tools/libxc/xenctrlosdep.h > +++ b/tools/libxc/xenctrlosdep.h > @@ -105,20 +105,12 @@ struct xc_osdep_ops > int (*unmask)(xc_evtchn *xce, xc_osdep_handle h, evtchn_port_t port); > } evtchn; > struct { > - void *(*map_grant_ref)(xc_gnttab *xcg, xc_osdep_handle h, > - uint32_t domid, > - uint32_t ref, > - int prot); > - void *(*map_grant_refs)(xc_gnttab *xcg, xc_osdep_handle h, > - uint32_t count, > - uint32_t *domids, > - uint32_t *refs, > - int prot); > - void *(*map_domain_grant_refs)(xc_gnttab *xcg, xc_osdep_handle h, > - uint32_t count, > - uint32_t domid, > - uint32_t *refs, > - int prot); > +#define XC_GRANT_MAP_SINGLE_DOMAIN 0x1 > + void *(*grant_map)(xc_gnttab *xcg, xc_osdep_handle h, > + uint32_t count, int flags, int prot, > + uint32_t *domids, uint32_t *refs, > + uint32_t notify_offset, > + evtchn_port_t notify_port); > int (*munmap)(xc_gnttab *xcg, xc_osdep_handle h, > void *start_address, > uint32_t count);Not specifically to do with this patch but I wonder if we should try and figure a way to version these shared libraries somehow, otherwise changes like this lead to segfault at best and unexpected non-crashing behaviour at worst (for out of tree backends that is). Something as simple as checksumming the xenctrlosdep.h header and including the value in the .so to be checked at load time would do the trick. Ian. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Jackson
2011-Sep-30 14:12 UTC
[Xen-devel] Re: [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify
Ian Campbell writes ("Re: [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify"):
> Not specifically to do with this patch but I wonder if we should try and > figure a way to version these shared libraries somehow, otherwise > changes like this lead to segfault at best and unexpected non-crashing > behaviour at worst (for out of tree backends that is).
Our approach is to change the library major number at some point between releases. People using the development tree can expect instability.
> Something as simple as checksumming the xenctrlosdep.h header and > including the value in the .so to be checked at load time would do the > trick.
Urgh. Ian. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Campbell
2011-Sep-30 14:17 UTC
[Xen-devel] Re: [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify
On Fri, 2011-09-30 at 15:12 +0100, Ian Jackson wrote: > Ian Campbell writes ("Re: [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify"): > > Not specifically to do with this patch but I wonder if we should try and > > figure a way to version these shared libraries somehow, otherwise > > changes like this lead to segfault at best and unexpected non-crashing > > behaviour at worst (for out of tree backends that is). > > Our approach is to change the library major number at some point > between releases. People using the development tree can expect > instability.
These are "plugins" so they don't have an SONAME. Perhaps they should, but I'm not sure what would check it when opening with dlopen() or how one goes about doing it manually. Ian. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Daniel De Graaf
2011-Sep-30 14:40 UTC
Re: [Xen-devel] [PATCH v6 0/3] libxenvchan: interdomain communications library
On 09/30/2011 04:28 AM, Vasiliy Tolstov wrote:> 2011/9/30 Vasiliy Tolstov <v.tolstov@selfip.ru> > >> >> >> 2011/9/23 Daniel De Graaf <dgdegra@tycho.nsa.gov> >> >>> Changes since v5: >>> - Unify gntdev osdep interface >>> - Eliminate notify_result and revert mapping if notify ioctl fails >>> - Rename functions and structures to libxenvchan >>> - Use application-specified xenstore path for ring/event data >>> - Enforce maximum ring size of 2^20 bytes on client >>> - Change to LGPL 2.1 >>> >>> [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify >>> [PATCH 2/3] libxc: add xc_gntshr_* functions >>> [PATCH 3/3] libvchan: interdomain communications library >>> >>> >>> >> Hello. Sorry for bumping.. >> What version of xen kernel i need to use this library? Now i have 2.6.32.26 >> in many domUs. >> And what i need in dom0 for that, if i want to communicate via libxenvchan >> from domU to dom0? >> >> >> > And where i can find latest version of the patch? >This library depends on gntdev for the client and gntalloc for the server; these were merged in 2.6.39. The client can be used with 2.6.32.x kernels however it is not possible to detect when a peer crashes or exits without calling libxenvchan_close() on that kernel. This library does not require that either peer is dom0; it can be used for both domU-domU and domU-dom0 communication. The latest version of the patch can be found in the xen-devel archives. -- Daniel De Graaf National Security Agency _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Jackson
2011-Oct-06 18:44 UTC
[Xen-devel] Re: [PATCH v6 0/3] libxenvchan: interdomain communications library
Ian Campbell writes ("[Xen-devel] Re: [PATCH v6 0/3] libxenvchan: interdomain communications library"):> I meant to say this before but, modulo the spurious changes to > tools/libxl/libxlu_cfg_l.[ch] in the first patch, the whole lot are: > > Acked-by: Ian Campbell <ian.campbell@citrix.com>Thanks, I have applied all three minus those changes to libxlu_cfg_l.[ch], which I will post separately. Ian. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Roger Pau Monné
2011-Oct-07 08:41 UTC
Re: [Xen-devel] Re: [PATCH v6 0/3] libxenvchan: interdomain communications library
Why are the binary files tools/libvchan/vchan-node1 and tools/libvchan/vchan-node2 added to the repository? Now after compilation I get: M tools/libvchan/vchan-node1 M tools/libvchan/vchan-node2 Because the compilation process changes them, and I cannot navigate through my patch list because of these local changes. Regards, Roger. _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Keir Fraser
2011-Oct-07 09:15 UTC
Re: [Xen-devel] Re: [PATCH v6 0/3] libxenvchan: interdomain communications library
On 07/10/2011 09:41, "Roger Pau Monné" <roger.pau@entel.upc.edu> wrote:> Why are the binary files tools/libvchan/vchan-node1 and > tools/libvchan/vchan-node2 added to the repository? > > Now after compilation I get: > > M tools/libvchan/vchan-node1 > M tools/libvchan/vchan-node2 > > Because the compilation process changes them, and I cannot navigate > trough my patch list because of this local changes.Probably added in error after a test build. The problem is that they should be listed in .hgignore. -- Keir> Regards, Roger. > > _______________________________________________ > Xen-devel mailing list > Xen-devel@lists.xensource.com > http://lists.xensource.com/xen-devel_______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Ian Jackson
2011-Oct-07 09:48 UTC
Re: [Xen-devel] Re: [PATCH v6 0/3] libxenvchan: interdomain communications library
Keir Fraser writes ("Re: [Xen-devel] Re: [PATCH v6 0/3] libxenvchan: interdomain communications library"):> On 07/10/2011 09:41, "Roger Pau Monné" <roger.pau@entel.upc.edu> wrote: > > Why are the binary files tools/libvchan/vchan-node1 and > > tools/libvchan/vchan-node2 added to the repository? > > > > Now after compilation I get: > > > > M tools/libvchan/vchan-node1 > > M tools/libvchan/vchan-node2 > > > > Because the compilation process changes them, and I cannot navigate > > trough my patch list because of this local changes.Sorry about this.> Probably added in error after a test build. The problem is that they should > be listed in .hgignore.Yes, now fixed in staging and will be in the tested tree later today. # HG changeset patch # User Ian Jackson <Ian.Jackson@eu.citrix.com> # Date 1317980824 -3600 # Node ID eb8637351535e9c76779d6dd78a1d5b4a434c082 # Parent 5b5394a55813dfa778a8c944bc0e6a0956629f64 libvchan: remove erroneously-committed libvchan-node[12]; add them to .hgignore Signed-off-by: Ian Jackson <ian.jackson@eu.citrix.com> Committed-by: Ian Jackson <ian.jackson@eu.citrix.com> diff -r 5b5394a55813 -r eb8637351535 .hgignore --- a/.hgignore Fri Oct 07 09:35:31 2011 +0100 +++ b/.hgignore Fri Oct 07 10:47:04 2011 +0100 @@ -189,6 +189,7 @@ ^tools/libxl/xl$ ^tools/libxl/testidl$ ^tools/libxl/testidl\.c$ +^tools/libvchan/vchan-node[12]$ ^tools/libaio/src/.*\.ol$ ^tools/libaio/src/.*\.os$ ^tools/misc/cpuperf/cpuperf-perfcntr$ diff -r 5b5394a55813 -r eb8637351535 tools/libvchan/vchan-node1 Binary file tools/libvchan/vchan-node1 has changed diff -r 5b5394a55813 -r eb8637351535 tools/libvchan/vchan-node2 Binary file tools/libvchan/vchan-node2 has changed _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Roger Pau Monné
2011-Oct-07 10:22 UTC
Re: [Xen-devel] Re: [PATCH v6 0/3] libxenvchan: interdomain communications library
2011/10/7 Ian Jackson <Ian.Jackson@eu.citrix.com>: > Keir Fraser writes ("Re: [Xen-devel] Re: [PATCH v6 0/3] libxenvchan: interdomain communications library"): >> On 07/10/2011 09:41, "Roger Pau Monné" <roger.pau@entel.upc.edu> wrote: >> > Why are the binary files tools/libvchan/vchan-node1 and >> > tools/libvchan/vchan-node2 added to the repository? >> > >> > Now after compilation I get: >> > >> > M tools/libvchan/vchan-node1 >> > M tools/libvchan/vchan-node2 >> > >> > Because the compilation process changes them, and I cannot navigate >> > trough my patch list because of this local changes. > > Sorry about this.
NP, it's just that I didn't know if it had anything to do with building Xen on NetBSD. Thanks! _______________________________________________ Xen-devel mailing list Xen-devel@lists.xensource.com http://lists.xensource.com/xen-devel
Anil Madhavapeddy
2011-Nov-24 20:02 UTC
Re: [PATCH v6 0/3] libxenvchan: interdomain communications library
On Fri, Sep 30, 2011 at 10:40:11AM -0400, Daniel De Graaf wrote: > >> > >> 2011/9/23 Daniel De Graaf <dgdegra@tycho.nsa.gov> > >> > >>> Changes since v5: > >>> - Unify gntdev osdep interface > >>> - Eliminate notify_result and revert mapping if notify ioctl fails > >>> - Rename functions and structures to libxenvchan > >>> - Use application-specified xenstore path for ring/event data > >>> - Enforce maximum ring size of 2^20 bytes on client > >>> - Change to LGPL 2.1 > >>> > >>> [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify > >>> [PATCH 2/3] libxc: add xc_gntshr_* functions > >>> [PATCH 3/3] libvchan: interdomain communications library > >>>
<snip>
> This library depends on gntdev for the client and gntalloc for the server; > these were merged in 2.6.39. The client can be used with 2.6.32.x kernels > however it is not possible to detect when a peer crashes or exits without > calling libxenvchan_close() on that kernel.
I'm trying this with your most recent dom0 patches all included, and xen-unstable. In order to get decent performance, I tried the vchan-node examples with a cranked up buffer size, which fails immediately on 64-bit VMs. The patch below fixes the xc_gntshr_share_pages call (which sets notify_offset to -1 and without this bounds check will send a big offset to the unmap notify ioctl). With this, communication does work with the big buffers, but it never frees them, and so I can get an ENOSPC error by calling vchan-node a few times. What's the intended cleanup path for the grants in normal use of the vchan library? Something's holding onto them, but I don't have a serial console on my current laptop in order to drop into Xen's serial and find out more.
diff -r 614e9f371209 tools/libxc/xc_linux_osdep.c --- a/tools/libxc/xc_linux_osdep.c Thu Nov 24 18:53:09 2011 +0000 +++ b/tools/libxc/xc_linux_osdep.c Thu Nov 24 20:00:21 2011 +0000 @@ -709,7 +709,7 @@ notify.index = gref_info->index; notify.action = 0; - if (notify_offset >= 0) { + if (notify_offset >= 0 && notify_offset < XC_PAGE_SIZE * count) { notify.index += notify_offset; notify.action |= UNMAP_NOTIFY_CLEAR_BYTE; }
Daniel De Graaf
2011-Nov-25 16:53 UTC
Re: [PATCH v6 0/3] libxenvchan: interdomain communications library
On 11/24/2011 03:02 PM, Anil Madhavapeddy wrote: > On Fri, Sep 30, 2011 at 10:40:11AM -0400, Daniel De Graaf wrote: >>>> >>>> 2011/9/23 Daniel De Graaf <dgdegra@tycho.nsa.gov> >>>> >>>>> Changes since v5: >>>>> - Unify gntdev osdep interface >>>>> - Eliminate notify_result and revert mapping if notify ioctl fails >>>>> - Rename functions and structures to libxenvchan >>>>> - Use application-specified xenstore path for ring/event data >>>>> - Enforce maximum ring size of 2^20 bytes on client >>>>> - Change to LGPL 2.1 >>>>> >>>>> [PATCH 1/3] libxc: add xc_gnttab_map_grant_ref_notify >>>>> [PATCH 2/3] libxc: add xc_gntshr_* functions >>>>> [PATCH 3/3] libvchan: interdomain communications library >>>>> > <snip> >> This library depends on gntdev for the client and gntalloc for the server; >> these were merged in 2.6.39. The client can be used with 2.6.32.x kernels >> however it is not possible to detect when a peer crashes or exits without >> calling libxenvchan_close() on that kernel. > > I'm trying this with your most recent dom0 patches all included, and > xen-unstable. In order to get decent performance, I tried the vchan-node > examples with a cranked up buffer size, which fails immediately on 64-bit > VMs. This below patch fixes the xc_gntshr_share_pages call (which sets > notify_offset to -1 and without this bounds check will send a big offset > to the unmap notify ioctl). > > With this, communication does work with the big buffers, but it never > frees them, and so I can get an ENOSPACE error by calling vchan-node a few > times. What's the intended cleanup path for the grants in normal use of > the vchan library? Something's holding onto them, but I don't have a > serial console on my current laptop in order to drop into Xen's serial and > find out more. >
As long as things aren't crashing, you should be able to use "xl debug-key g" and "xl dmesg" together to view the grant table debug output.
The ENOSPC bug is due to a missing gnttab_free_grant_reference call; the patch to fix it will be sent as a reply. The libxc patch fixing notify_offset is incomplete; a more complete version will also be sent as a reply. It is possible to observe this ENOSPC error even with proper cleanup if the client refuses to release the grants it has mapped. Xen does not allow domains to force unmapping of pages they have granted, so the best we can do is try to remove any hanging grants on later gntalloc calls.
The notify offset and event channels are both unsigned variables, so testing for >= 0 will not correctly detect the use of -1 to indicate the field is unused. Remove the useless comparison and replace with correct range checks or comparisons to -1. Reported-by: Anil Madhavapeddy <anil@recoil.org> Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> --- tools/libxc/xc_linux_osdep.c | 6 +++--- 1 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/libxc/xc_linux_osdep.c b/tools/libxc/xc_linux_osdep.c index 04059b8..2a4dd94 100644 --- a/tools/libxc/xc_linux_osdep.c +++ b/tools/libxc/xc_linux_osdep.c @@ -567,7 +567,7 @@ static void *linux_gnttab_grant_map(xc_gnttab *xch, xc_osdep_handle h, struct ioctl_gntdev_unmap_notify notify; notify.index = map->index; notify.action = 0; - if (notify_offset >= 0 && notify_offset < XC_PAGE_SIZE * count) { + if (notify_offset < XC_PAGE_SIZE * count) { notify.index += notify_offset; notify.action |= UNMAP_NOTIFY_CLEAR_BYTE; } @@ -709,11 +709,11 @@ static void *linux_gntshr_share_pages(xc_gntshr *xch, xc_osdep_handle h, notify.index = gref_info->index; notify.action = 0; - if (notify_offset >= 0) { + if (notify_offset < XC_PAGE_SIZE * count) { notify.index += notify_offset; notify.action |= UNMAP_NOTIFY_CLEAR_BYTE; } - if (notify_port >= 0) { + if (notify_port != -1) { notify.event_channel_port = notify_port; notify.action |= UNMAP_NOTIFY_SEND_EVENT; } -- 1.7.7.3
Daniel De Graaf
2011-Nov-25 16:56 UTC
[PATCH 1/2] xen/events: prevent calling evtchn_get on invalid channels
The event channel number provided to evtchn_get can be provided by userspace, so needs to be checked against the maximum number of event channels prior to using it to index into evtchn_to_irq. Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> --- drivers/xen/events.c | 3 +++ 1 files changed, 3 insertions(+), 0 deletions(-) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index a3bcd61..e5e5812 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -1104,6 +1104,9 @@ int evtchn_get(unsigned int evtchn) struct irq_info *info; int err = -ENOENT; + if (evtchn >= NR_EVENT_CHANNELS) + return -EINVAL; + mutex_lock(&irq_mapping_update_lock); irq = evtchn_to_irq[evtchn]; -- 1.7.7.3
Daniel De Graaf
2011-Nov-25 16:56 UTC
[PATCH 2/2] xen/gntalloc: release grant references on page free
gnttab_end_foreign_access_ref does not return the grant reference it is passed to the free list; gnttab_free_grant_reference needs to be explicitly called. While gnttab_end_foreign_access provides a wrapper for this, it is unsuitable because it does not return errors. Reported-by: Anil Madhavapeddy <anil@recoil.org> Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> --- drivers/xen/gntalloc.c | 2 ++ 1 files changed, 2 insertions(+), 0 deletions(-) diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 60eee4e..7b936cc 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -191,6 +191,8 @@ static void __del_gref(struct gntalloc_gref *gref) if (!gnttab_end_foreign_access_ref(gref->gref_id, 0)) return; + + gnttab_free_grant_reference(gref->gref_id); } gref_size--; -- 1.7.7.3
Daniel De Graaf
2011-Nov-25 18:37 UTC
[PATCH] xen/gntalloc: fix reference counts on multi-page mappings
When a multi-page mapping of gntalloc is created, the reference counts of all pages in the vma are incremented. However, the vma open/close operations only adjusted the reference count of the first page in the mapping, leaking the other pages. Store a struct in the vm_private_data to track the original page count to properly free the pages when the last reference to the vma is closed. Reported-by: Anil Madhavapeddy <anil@recoil.org> Signed-off-by: Daniel De Graaf <dgdegra@tycho.nsa.gov> --- drivers/xen/gntalloc.c | 56 ++++++++++++++++++++++++++++++++++++----------- 1 files changed, 43 insertions(+), 13 deletions(-) diff --git a/drivers/xen/gntalloc.c b/drivers/xen/gntalloc.c index 7b936cc..e2400c8 100644 --- a/drivers/xen/gntalloc.c +++ b/drivers/xen/gntalloc.c @@ -99,6 +99,12 @@ struct gntalloc_file_private_data { uint64_t index; }; +struct gntalloc_vma_private_data { + struct gntalloc_gref *gref; + int users; + int count; +}; + static void __del_gref(struct gntalloc_gref *gref); static void do_cleanup(void) @@ -451,25 +457,39 @@ static long gntalloc_ioctl(struct file *filp, unsigned int cmd, static void gntalloc_vma_open(struct vm_area_struct *vma) { - struct gntalloc_gref *gref = vma->vm_private_data; - if (!gref) + struct gntalloc_vma_private_data *priv = vma->vm_private_data; + + if (!priv) return; mutex_lock(&gref_mutex); - gref->users++; + priv->users++; mutex_unlock(&gref_mutex); } static void gntalloc_vma_close(struct vm_area_struct *vma) { - struct gntalloc_gref *gref = vma->vm_private_data; - if (!gref) + struct gntalloc_vma_private_data *priv = vma->vm_private_data; + struct gntalloc_gref *gref, *next; + int i; + + if (!priv) return; mutex_lock(&gref_mutex); - gref->users--; - if (gref->users == 0) - __del_gref(gref); + priv->users--; + if (priv->users == 0) { + gref = priv->gref; + for (i = 0; i < priv->count; i++) { + gref->users--; + next = list_entry(gref->next_gref.next, + struct gntalloc_gref, next_gref); + if (gref->users == 0) + 
__del_gref(gref); + gref = next; + } + kfree(priv); + } mutex_unlock(&gref_mutex); } @@ -481,19 +501,25 @@ static struct vm_operations_struct gntalloc_vmops = { static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) { struct gntalloc_file_private_data *priv = filp->private_data; + struct gntalloc_vma_private_data *vm_priv; struct gntalloc_gref *gref; int count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; int rv, i; - pr_debug("%s: priv %p, page %lu+%d\n", __func__, - priv, vma->vm_pgoff, count); - if (!(vma->vm_flags & VM_SHARED)) { printk(KERN_ERR "%s: Mapping must be shared.\n", __func__); return -EINVAL; } + vm_priv = kmalloc(sizeof(*vm_priv), GFP_KERNEL); + if (!vm_priv) + return -ENOMEM; + mutex_lock(&gref_mutex); + + pr_debug("%s: priv %p,%p, page %lu+%d\n", __func__, + priv, vm_priv, vma->vm_pgoff, count); + gref = find_grefs(priv, vma->vm_pgoff << PAGE_SHIFT, count); if (gref == NULL) { rv = -ENOENT; @@ -502,9 +528,13 @@ static int gntalloc_mmap(struct file *filp, struct vm_area_struct *vma) goto out_unlock; } - vma->vm_private_data = gref; + vm_priv->gref = gref; + vm_priv->users = 1; + vm_priv->count = count; + + vma->vm_private_data = vm_priv; - vma->vm_flags |= VM_RESERVED; + vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND; vma->vm_ops = &gntalloc_vmops; -- 1.7.7.3
Daniel De Graaf writes ("[Xen-devel] [PATCH] libxc: Fix checks on grant notify arguments"):
> The notify offset and event channels are both unsigned variables, so > testing for >= 0 will not correctly detect the use of -1 to indicate > the field is unused. Remove the useless comparison and replace with > correct range checks or comparisons to -1.
Committed-by: Ian Jackson <ian.jackson@eu.citrix.com>