rshriram@cs.ubc.ca
2011-Nov-03 20:05 UTC
[Xen-devel] [PATCH 0 of 2 V5] libxc: checkpoint compression
This patch series adds checkpoint compression functionality, while running
under Remus. Tested under xen-4.2-unstable/pvops dom0 (x86_64) #ubuntu-10.04.

A simple benchmark: SpecJBB (Java Benchmark) - 4 minute Remus run, with PV
domU (2.6.32 xenolinux kernel), 1G memory and 2 VCPUs
 - Data sent without compression: ~70GB.
 - Data sent with compression:    ~8GB.

Changes since last version:
 1. use posix_memalign only on Linux platforms and switch to normal malloc
    for the rest. stubdom compiles successfully.

Shriram
rshriram@cs.ubc.ca
2011-Nov-03 20:05 UTC
[Xen-devel] [PATCH 1 of 2 V5] tools/libxc: Remus Checkpoint Compression
# HG changeset patch
# User Shriram Rajagopalan <rshriram@cs.ubc.ca>
# Date 1320348671 25200
# Node ID d27072263a483fbf66456722fbd84967fe606602
# Parent  4b0907c6a08c348962bd976c2976257b412408be
tools/libxc: Remus Checkpoint Compression

Instead of sending dirty pages of guest memory as-is, use a simple
compression algorithm that sends an RLE-encoded XOR of the page against
its last sent copy. A small LRU cache is used to hold recently dirtied
pages. Pagetable pages are sent as-is, as they are canonicalized at
sender side and uncanonicalized at receiver.

Signed-off-by: Shriram Rajagopalan <rshriram@cs.ubc.ca>

diff -r 4b0907c6a08c -r d27072263a48 tools/libxc/Makefile
--- a/tools/libxc/Makefile      Tue Oct 11 12:02:58 2011 +0100
+++ b/tools/libxc/Makefile      Thu Nov 03 12:31:11 2011 -0700
@@ -42,7 +42,7 @@ GUEST_SRCS-y :
 GUEST_SRCS-y += xg_private.c xc_suspend.c
 GUEST_SRCS-$(CONFIG_MIGRATE) += xc_domain_restore.c xc_domain_save.c
-GUEST_SRCS-$(CONFIG_MIGRATE) += xc_offline_page.c
+GUEST_SRCS-$(CONFIG_MIGRATE) += xc_offline_page.c xc_compression.c
 GUEST_SRCS-$(CONFIG_HVM) += xc_hvm_build.c
 
 vpath %.c ../../xen/common/libelf
diff -r 4b0907c6a08c -r d27072263a48 tools/libxc/xc_compression.c
--- /dev/null   Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/libxc/xc_compression.c      Thu Nov 03 12:31:11 2011 -0700
@@ -0,0 +1,567 @@
+/******************************************************************************
+ * xc_compression.c
+ *
+ * Checkpoint Compression using Page Delta Algorithm.
+ * - A LRU cache of recently dirtied guest pages is maintained.
+ * - For each dirty guest page in the checkpoint, if a previous version of
+ *   the page exists in the cache, XOR both pages and send the non-zero
+ *   sections to the receiver. The cache is then updated with the newer copy
+ *   of the guest page.
+ * - The receiver will XOR the non-zero sections against its copy of the
+ *   guest page, thereby bringing the guest page up-to-date with the sender
+ *   side.
+ *
+ * Copyright (c) 2011 Shriram Rajagopalan (rshriram@cs.ubc.ca).
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <inttypes.h>
+#include <errno.h>
+#include "xenctrl.h"
+#include "xg_save_restore.h"
+#include "xg_private.h"
+#include "xc_dom.h"
+
+/* Page Cache for Delta Compression */
+#define DELTA_CACHE_SIZE (XC_PAGE_SIZE * 8192)
+
+/* Internal page buffer to hold dirty pages of a checkpoint,
+ * to be compressed after the domain is resumed for execution.
+ */
+#define PAGE_BUFFER_SIZE (XC_PAGE_SIZE * 8192)
+
+struct cache_page
+{
+    char *page;
+    xen_pfn_t pfn;
+    struct cache_page *next;
+    struct cache_page *prev;
+};
+
+struct compression_ctx
+{
+    /* compression buffer - holds compressed data */
+    char *compbuf;
+    unsigned long compbuf_size;
+    unsigned long compbuf_pos;
+
+    /* Page buffer to hold pages to be compressed */
+    char *inputbuf;
+    /* pfns of pages to be compressed */
+    xen_pfn_t *sendbuf_pfns;
+    unsigned int pfns_len;
+    unsigned int pfns_index;
+
+    /* Compression Cache (LRU) */
+    char *cache_base;
+    struct cache_page **pfn2cache;
+    struct cache_page *cache;
+    struct cache_page *page_list_head;
+    struct cache_page *page_list_tail;
+    unsigned long dom_pfnlist_size;
+};
+
+#define RUNFLAG 0
+#define SKIPFLAG ((char)128)
+#define FLAGMASK SKIPFLAG
+#define LENMASK ((char)127)
+
+/*
+ * see xg_save_restore.h for details on the compressed stream format.
+ * delta size = 4 bytes.
+ * run header = 1 byte (1 bit for runtype, 7 bits for run length).
+ * i.e. maximum size of a run = 127 * 4 = 508 bytes.
+ * Worst case compression: Entire page has changed. In the worst case,
+ * the size of the compressed page is
+ *  8 runs of 508 bytes + 1 run of 32 bytes + 9 run headers
+ *  = 4105 bytes.
+ * We could detect this worst case and send the entire page with a
+ * FULL_PAGE marker, reducing the total size to 4097 bytes. The cost
+ * of this size reduction is an additional memcpy, on top of two previous
+ * memcpy (to the compressed stream and the cache page in the for loop).
+ *
+ * We might as well sacrifice an extra 8 bytes instead of a memcpy.
+ */
+#define WORST_COMP_PAGE_SIZE (XC_PAGE_SIZE + 9)
+
+/*
+ * A zero length skip indicates a full page.
+ */
+#define EMPTY_PAGE 0
+#define FULL_PAGE SKIPFLAG
+#define FULL_PAGE_SIZE (XC_PAGE_SIZE + 1)
+#define MAX_DELTAS (XC_PAGE_SIZE/sizeof(uint32_t))
+
+/*
+ * Add a pagetable page or a new page (uncached)
+ * if srcpage is a pagetable page, cache_page is null.
+ * if srcpage is a page that was not previously in the cache,
+ * cache_page points to a free page slot in the cache where
+ * this new page can be copied to.
+ */
+static int add_full_page(comp_ctx *ctx, char *srcpage, char *cache_page)
+{
+    char *dest = (ctx->compbuf + ctx->compbuf_pos);
+
+    if ( (ctx->compbuf_pos + FULL_PAGE_SIZE) > ctx->compbuf_size )
+        return -1;
+
+    if (cache_page)
+        memcpy(cache_page, srcpage, XC_PAGE_SIZE);
+    dest[0] = FULL_PAGE;
+    memcpy(&dest[1], srcpage, XC_PAGE_SIZE);
+    ctx->compbuf_pos += FULL_PAGE_SIZE;
+
+    return FULL_PAGE_SIZE;
+}
+
+static int compress_page(comp_ctx *ctx, char *srcpage, char *cache_page)
+{
+    char *dest = (ctx->compbuf + ctx->compbuf_pos);
+    uint32_t *new, *old;
+
+    int off, runptr = 0;
+    int wascopying = 0, copying = 0, bytes_skipped = 0;
+    int complen = 0, pageoff = 0, runbytes = 0;
+
+    char runlen = 0;
+
+    if ( (ctx->compbuf_pos + WORST_COMP_PAGE_SIZE) > ctx->compbuf_size )
+        return -1;
+
+    /*
+     * There are no alignment issues here since srcpage is
+     * domU's page passed from xc_domain_save and cache_page is
+     * a ptr to cache page (cache is page aligned).
+     */
+    new = (uint32_t*)srcpage;
+    old = (uint32_t*)cache_page;
+
+    for (off = 0; off <= MAX_DELTAS; off++)
+    {
+        /*
+         * At (off == MAX_DELTAS), we are processing the last run
+         * in the page. Since there is no XORing, make wascopying != copying
+         * to satisfy the if-block below.
+         */
+        copying = ((off < MAX_DELTAS) ? (old[off] != new[off]) : !wascopying);
+
+        if (runlen)
+        {
+            /* switching between run types or current run is full */
+            if ( (wascopying != copying) || (runlen == LENMASK) )
+            {
+                runbytes = runlen * sizeof(uint32_t);
+                runlen |= (wascopying ? RUNFLAG : SKIPFLAG);
+                dest[complen++] = runlen;
+
+                if (wascopying) /* RUNFLAG */
+                {
+                    pageoff = runptr * sizeof(uint32_t);
+                    memcpy(dest + complen, srcpage + pageoff, runbytes);
+                    memcpy(cache_page + pageoff, srcpage + pageoff, runbytes);
+                    complen += runbytes;
+                }
+                else /* SKIPFLAG */
+                {
+                    bytes_skipped += runbytes;
+                }
+
+                runlen = 0;
+                runptr = off;
+            }
+        }
+        runlen++;
+        wascopying = copying;
+    }
+
+    /*
+     * Check for empty page.
+     */
+    if (bytes_skipped == XC_PAGE_SIZE)
+    {
+        complen = 1;
+        dest[0] = EMPTY_PAGE;
+    }
+    ctx->compbuf_pos += complen;
+
+    return complen;
+}
+
+static
+char *get_cache_page(comp_ctx *ctx, xen_pfn_t pfn,
+                     int *israw)
+{
+    struct cache_page *item = NULL;
+
+    item = ctx->pfn2cache[pfn];
+
+    if (!item)
+    {
+        *israw = 1;
+
+        /* If the list is full, evict a page from the tail end. */
+        item = ctx->page_list_tail;
+        if (item->pfn != INVALID_P2M_ENTRY)
+            ctx->pfn2cache[item->pfn] = NULL;
+
+        item->pfn = pfn;
+        ctx->pfn2cache[pfn] = item;
+    }
+
+    /* if requested item is in cache move to head of list */
+    if (item != ctx->page_list_head)
+    {
+        if (item == ctx->page_list_tail)
+        {
+            /* item at tail of list. */
+            ctx->page_list_tail = item->prev;
+            (ctx->page_list_tail)->next = NULL;
+        }
+        else
+        {
+            /* item in middle of list */
+            item->prev->next = item->next;
+            item->next->prev = item->prev;
+        }
+
+        item->prev = NULL;
+        item->next = ctx->page_list_head;
+        (ctx->page_list_head)->prev = item;
+        ctx->page_list_head = item;
+    }
+
+    return (ctx->page_list_head)->page;
+}
+
+/* Remove pagetable pages from cache and move to tail, as free pages */
+static
+void invalidate_cache_page(comp_ctx *ctx, xen_pfn_t pfn)
+{
+    struct cache_page *item = NULL;
+
+    item = ctx->pfn2cache[pfn];
+    if (item)
+    {
+        if (item != ctx->page_list_tail)
+        {
+            /* item at head of list */
+            if (item == ctx->page_list_head)
+            {
+                ctx->page_list_head = (ctx->page_list_head)->next;
+                (ctx->page_list_head)->prev = NULL;
+            }
+            else /* item in middle of list */
+            {
+                item->prev->next = item->next;
+                item->next->prev = item->prev;
+            }
+
+            item->next = NULL;
+            item->prev = ctx->page_list_tail;
+            (ctx->page_list_tail)->next = item;
+            ctx->page_list_tail = item;
+        }
+        ctx->pfn2cache[pfn] = NULL;
+        (ctx->page_list_tail)->pfn = INVALID_P2M_ENTRY;
+    }
+}
+
+int xc_compression_add_page(xc_interface *xch, comp_ctx *ctx,
+                            char *page, xen_pfn_t pfn, int israw)
+{
+    if (pfn > ctx->dom_pfnlist_size)
+    {
+        ERROR("Invalid pfn passed into "
+              "xc_compression_add_page %" PRIpfn "\n", pfn);
+        return -2;
+    }
+
+    /* pagetable page */
+    if (israw)
+        invalidate_cache_page(ctx, pfn);
+    ctx->sendbuf_pfns[ctx->pfns_len] = israw ? INVALID_P2M_ENTRY : pfn;
+    memcpy(ctx->inputbuf + ctx->pfns_len * XC_PAGE_SIZE, page, XC_PAGE_SIZE);
+    ctx->pfns_len++;
+
+    /* check if we have run out of space. If so,
+     * we need to synchronously compress the pages and flush them out
+     */
+    if (ctx->pfns_len == NRPAGES(PAGE_BUFFER_SIZE))
+        return -1;
+    return 0;
+}
+
+int xc_compression_compress_pages(xc_interface *xch, comp_ctx *ctx,
+                                  char *compbuf, unsigned long compbuf_size,
+                                  unsigned long *compbuf_len)
+{
+    char *cache_copy = NULL, *current_page = NULL;
+    int israw, rc = 1;
+
+    if (!ctx->pfns_len || (ctx->pfns_index == ctx->pfns_len)) {
+        ctx->pfns_len = ctx->pfns_index = 0;
+        return 0;
+    }
+
+    ctx->compbuf_pos = 0;
+    ctx->compbuf = compbuf;
+    ctx->compbuf_size = compbuf_size;
+
+    for (; ctx->pfns_index < ctx->pfns_len; ctx->pfns_index++)
+    {
+        israw = 0;
+        cache_copy = NULL;
+        current_page = ctx->inputbuf + ctx->pfns_index * XC_PAGE_SIZE;
+
+        if (ctx->sendbuf_pfns[ctx->pfns_index] == INVALID_P2M_ENTRY)
+            israw = 1;
+        else
+            cache_copy = get_cache_page(ctx,
+                                        ctx->sendbuf_pfns[ctx->pfns_index],
+                                        &israw);
+
+        if (israw)
+            rc = (add_full_page(ctx, current_page, cache_copy) >= 0);
+        else
+            rc = (compress_page(ctx, current_page, cache_copy) >= 0);
+
+        if ( !rc )
+        {
+            /* Out of space in outbuf! flush and come back */
+            rc = -1;
+            break;
+        }
+    }
+    if (compbuf_len)
+        *compbuf_len = ctx->compbuf_pos;
+
+    return rc;
+}
+
+inline
+void xc_compression_reset_pagebuf(xc_interface *xch, comp_ctx *ctx)
+{
+    ctx->pfns_index = ctx->pfns_len = 0;
+}
+
+int xc_compression_uncompress_page(xc_interface *xch, char *compbuf,
+                                   unsigned long compbuf_size,
+                                   unsigned long *compbuf_pos, char *destpage)
+{
+    unsigned long pos;
+    unsigned int len = 0, pagepos = 0;
+    char flag;
+
+    pos = *compbuf_pos;
+    if (pos >= compbuf_size)
+    {
+        ERROR("Out of bounds exception in compression buffer (a):"
+              "read ptr:%lu, bufsize = %lu\n",
+              *compbuf_pos, compbuf_size);
+        return -1;
+    }
+
+    switch (compbuf[pos])
+    {
+    case EMPTY_PAGE:
+        pos++;
+        break;
+
+    case FULL_PAGE:
+        {
+            /* Check if the input buffer has 4KB of data */
+            if ((pos + FULL_PAGE_SIZE) > compbuf_size)
+            {
+                ERROR("Out of bounds exception in compression buffer (b):"
+                      "read ptr = %lu, bufsize = %lu\n",
+                      *compbuf_pos, compbuf_size);
+                return -1;
+            }
+            memcpy(destpage, &compbuf[pos + 1], XC_PAGE_SIZE);
+            pos += FULL_PAGE_SIZE;
+        }
+        break;
+
+    default: /* Normal page with one or more runs */
+        {
+            do
+            {
+                flag = compbuf[pos] & FLAGMASK;
+                len = (compbuf[pos] & LENMASK) * sizeof(uint32_t);
+                /* Sanity Check: Zero-length runs are allowed only for
+                 * FULL_PAGE and EMPTY_PAGE.
+                 */
+                if (!len)
+                {
+                    ERROR("Zero length run encountered for normal page: "
+                          "buffer (d):read ptr = %lu, flag = %u, "
+                          "bufsize = %lu, pagepos = %u\n",
+                          pos, (unsigned int)flag, compbuf_size, pagepos);
+                    return -1;
+                }
+
+                pos++;
+                if (flag == RUNFLAG)
+                {
+                    /* Check if the input buffer has len bytes of data
+                     * and whether it would fit in the destination page.
+                     */
+                    if (((pos + len) > compbuf_size)
+                        || ((pagepos + len) > XC_PAGE_SIZE))
+                    {
+                        ERROR("Out of bounds exception in compression "
+                              "buffer (c):read ptr = %lu, runlen = %u, "
+                              "bufsize = %lu, pagepos = %u\n",
+                              pos, len, compbuf_size, pagepos);
+                        return -1;
+                    }
+                    memcpy(&destpage[pagepos], &compbuf[pos], len);
+                    pos += len;
+                }
+                pagepos += len;
+            } while ((pagepos < XC_PAGE_SIZE) && (pos < compbuf_size));
+
+            /* Make sure we have copied/skipped 4KB worth of data */
+            if (pagepos != XC_PAGE_SIZE)
+            {
+                ERROR("Invalid data in compression buffer:"
+                      "read ptr = %lu, bufsize = %lu, pagepos = %u\n",
+                      pos, compbuf_size, pagepos);
+                return -1;
+            }
+        }
+    }
+    *compbuf_pos = pos;
+    return 0;
+}
+
+void xc_compression_free_context(xc_interface *xch, comp_ctx *ctx)
+{
+    if (!ctx) return;
+
+    if (ctx->inputbuf)
+        free(ctx->inputbuf);
+    if (ctx->sendbuf_pfns)
+        free(ctx->sendbuf_pfns);
+    if (ctx->cache_base)
+        free(ctx->cache_base);
+    if (ctx->pfn2cache)
+        free(ctx->pfn2cache);
+    if (ctx->cache)
+        free(ctx->cache);
+    free(ctx);
+}
+
+comp_ctx *xc_compression_create_context(xc_interface *xch,
+                                        unsigned long p2m_size)
+{
+    unsigned long i;
+    comp_ctx *ctx = NULL;
+    unsigned long num_cache_pages = DELTA_CACHE_SIZE/XC_PAGE_SIZE;
+
+    ctx = (comp_ctx *)malloc(sizeof(comp_ctx));
+    if (!ctx)
+    {
+        ERROR("Failed to allocate compression_ctx\n");
+        goto error;
+    }
+    memset(ctx, 0, sizeof(comp_ctx));
+
+#ifdef __linux__
+    if (posix_memalign((void **)&ctx->inputbuf,
+                       XC_PAGE_SIZE, PAGE_BUFFER_SIZE))
+    {
+        ERROR("Failed to allocate page buffer\n");
+        goto error;
+    }
+
+    if (posix_memalign((void **)&ctx->cache_base,
+                       XC_PAGE_SIZE, DELTA_CACHE_SIZE))
+    {
+        ERROR("Failed to allocate delta cache\n");
+        goto error;
+    }
+#else
+    ctx->inputbuf = malloc(PAGE_BUFFER_SIZE);
+    if (!ctx->inputbuf)
+    {
+        ERROR("Failed to allocate page buffer\n");
+        goto error;
+    }
+
+    ctx->cache_base = malloc(DELTA_CACHE_SIZE);
+    if (!ctx->cache_base)
+    {
+        ERROR("Failed to allocate delta cache\n");
+        goto error;
+    }
+#endif
+
+    ctx->sendbuf_pfns = malloc(NRPAGES(PAGE_BUFFER_SIZE) *
+                               sizeof(xen_pfn_t));
+    if (!ctx->sendbuf_pfns)
+    {
+        ERROR("Could not alloc sendbuf_pfns\n");
+        goto error;
+    }
+    memset(ctx->sendbuf_pfns, -1,
+           NRPAGES(PAGE_BUFFER_SIZE) * sizeof(xen_pfn_t));
+
+    ctx->pfn2cache = calloc(p2m_size, sizeof(struct cache_page *));
+    if (!ctx->pfn2cache)
+    {
+        ERROR("Could not alloc pfn2cache map\n");
+        goto error;
+    }
+
+    ctx->cache = malloc(num_cache_pages * sizeof(struct cache_page));
+    if (!ctx->cache)
+    {
+        ERROR("Could not alloc compression cache\n");
+        goto error;
+    }
+
+    for (i = 0; i < num_cache_pages; i++)
+    {
+        ctx->cache[i].pfn = INVALID_P2M_ENTRY;
+        ctx->cache[i].page = ctx->cache_base + i * XC_PAGE_SIZE;
+        ctx->cache[i].prev = (i == 0) ? NULL : &(ctx->cache[i - 1]);
+        ctx->cache[i].next = ((i+1) == num_cache_pages) ? NULL :
+                             &(ctx->cache[i + 1]);
+    }
+    ctx->page_list_head = &(ctx->cache[0]);
+    ctx->page_list_tail = &(ctx->cache[num_cache_pages - 1]);
+    ctx->dom_pfnlist_size = p2m_size;
+
+    return ctx;
+error:
+    xc_compression_free_context(xch, ctx);
+    return NULL;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff -r 4b0907c6a08c -r d27072263a48 tools/libxc/xc_domain_restore.c
--- a/tools/libxc/xc_domain_restore.c   Tue Oct 11 12:02:58 2011 +0100
+++ b/tools/libxc/xc_domain_restore.c   Thu Nov 03 12:31:11 2011 -0700
@@ -43,6 +43,7 @@
     xen_pfn_t *p2m_batch; /* A table of P2M mappings in the current region. */
     int completed; /* Set when a consistent image is available */
     int last_checkpoint; /* Set when we should commit to the current
                             checkpoint when it completes. */
+    int compressing; /* Set when sender signals that pages would be sent
+                        compressed (for Remus) */
     struct domain_info_context dinfo;
 };
 
@@ -663,6 +664,10 @@
     /* pages is of length nr_physpages, pfn_types is of length nr_pages */
     unsigned int nr_physpages, nr_pages;
 
+    /* checkpoint compression state */
+    int compressing;
+    unsigned long compbuf_pos, compbuf_size;
+
     /* Types of the pfns in the current region */
     unsigned long* pfn_types;
 
@@ -700,6 +705,7 @@
 {
     int count, countpages, oldcount, i;
     void* ptmp;
+    unsigned long compbuf_size;
 
     if ( RDEXACT(fd, &count, sizeof(count)) )
     {
@@ -809,6 +815,40 @@
         }
         return pagebuf_get_one(xch, ctx, buf, fd, dom);
 
+    case XC_SAVE_ID_ENABLE_COMPRESSION:
+        /* We cannot set compression flag directly in pagebuf structure,
+         * since this pagebuf still has uncompressed pages that are yet to
+         * be applied. We enable the compression field in pagebuf structure
+         * after receiving the first tailbuf.
+         */
+        ctx->compressing = 1;
+        // DPRINTF("compression flag received");
+        return pagebuf_get_one(xch, ctx, buf, fd, dom);
+
+    case XC_SAVE_ID_COMPRESSED_DATA:
+
+        /* read the length of compressed chunk coming in */
+        if ( RDEXACT(fd, &compbuf_size, sizeof(unsigned long)) )
+        {
+            PERROR("Error when reading compbuf_size");
+            return -1;
+        }
+        if (!compbuf_size) return 1;
+
+        buf->compbuf_size += compbuf_size;
+        if (!(ptmp = realloc(buf->pages, buf->compbuf_size))) {
+            ERROR("Could not (re)allocate compression buffer");
+            return -1;
+        }
+        buf->pages = ptmp;
+
+        if ( RDEXACT(fd, buf->pages + (buf->compbuf_size - compbuf_size),
+                     compbuf_size) ) {
+            PERROR("Error when reading compression buffer");
+            return -1;
+        }
+        return compbuf_size;
+
     default:
         if ( (count > MAX_BATCH_SIZE) || (count < 0) ) {
             ERROR("Max batch size exceeded (%d). Giving up.", count);
@@ -846,6 +886,13 @@
     if (!countpages)
         return count;
 
+    /* If Remus Checkpoint Compression is turned on, we will only be
+     * receiving the pfn lists now. The compressed pages will come in later,
+     * following a <XC_SAVE_ID_COMPRESSED_DATA, compressedChunkSize> tuple.
+     */
+    if (buf->compressing)
+        return pagebuf_get_one(xch, ctx, buf, fd, dom);
+
     oldcount = buf->nr_physpages;
     buf->nr_physpages += countpages;
     if (!buf->pages) {
@@ -874,6 +921,7 @@
     int rc;
 
     buf->nr_physpages = buf->nr_pages = 0;
+    buf->compbuf_pos = buf->compbuf_size = 0;
 
     do {
         rc = pagebuf_get_one(xch, ctx, buf, fd, dom);
@@ -1091,7 +1139,21 @@
         /* In verify mode, we use a copy; otherwise we work in place */
         page = pagebuf->verify ? (void *)buf : (region_base + i*PAGE_SIZE);
 
-        memcpy(page, pagebuf->pages + (curpage + curbatch) * PAGE_SIZE, PAGE_SIZE);
+        /* Remus - page decompression */
+        if (pagebuf->compressing)
+        {
+            if (xc_compression_uncompress_page(xch, pagebuf->pages,
+                                               pagebuf->compbuf_size,
+                                               &pagebuf->compbuf_pos,
+                                               (char *)page))
+            {
+                ERROR("Failed to uncompress page (pfn=%lx)\n", pfn);
+                goto err_mapped;
+            }
+        }
+        else
+            memcpy(page, pagebuf->pages + (curpage + curbatch) * PAGE_SIZE,
+                   PAGE_SIZE);
 
         pagetype &= XEN_DOMCTL_PFINFO_LTABTYPE_MASK;
 
@@ -1353,6 +1415,7 @@
     if ( !ctx->completed ) {
 
         pagebuf.nr_physpages = pagebuf.nr_pages = 0;
+        pagebuf.compbuf_pos = pagebuf.compbuf_size = 0;
 
         if ( pagebuf_get_one(xch, ctx, &pagebuf, io_fd, dom) < 0 ) {
             PERROR("Error when reading batch");
             goto out;
@@ -1395,6 +1458,7 @@
         }
 
         pagebuf.nr_physpages = pagebuf.nr_pages = 0;
+        pagebuf.compbuf_pos = pagebuf.compbuf_size = 0;
 
         n += j; /* crude stats */
 
@@ -1438,6 +1502,13 @@
          */
         if ( !ctx->last_checkpoint )
             fcntl(io_fd, F_SETFL, orig_io_fd_flags | O_NONBLOCK);
+
+        /*
+         * If sender had sent enable compression flag, switch to compressed
+         * checkpoints mode once the first checkpoint is received.
+         */
+        if (ctx->compressing)
+            pagebuf.compressing = 1;
     }
 
     if (pagebuf.acpi_ioport_location == 1) {
diff -r 4b0907c6a08c -r d27072263a48 tools/libxc/xc_domain_save.c
--- a/tools/libxc/xc_domain_save.c      Tue Oct 11 12:02:58 2011 +0100
+++ b/tools/libxc/xc_domain_save.c      Thu Nov 03 12:31:11 2011 -0700
@@ -218,6 +218,56 @@
     return noncached_write(xch, ob, fd, buf, len);
 }
 
+static int write_compressed(xc_interface *xch, comp_ctx *compress_ctx,
+                            int dobuf, struct outbuf* ob, int fd)
+{
+    int rc = 0;
+    int header = sizeof(int) + sizeof(unsigned long);
+    int marker = XC_SAVE_ID_COMPRESSED_DATA;
+    unsigned long compbuf_len = 0;
+
+    do
+    {
+        /* check for available space (at least 8k) */
+        if ((ob->pos + header + XC_PAGE_SIZE * 2) > ob->size)
+        {
+            if (outbuf_flush(xch, ob, fd) < 0)
+            {
+                ERROR("Error when flushing outbuf intermediate");
+                return -1;
+            }
+        }
+
+        rc = xc_compression_compress_pages(xch, compress_ctx,
+                                           ob->buf + ob->pos + header,
+                                           ob->size - ob->pos - header,
+                                           &compbuf_len);
+        if (!rc)
+            return 0;
+
+        if (outbuf_hardwrite(xch, ob, fd, &marker, sizeof(marker)) < 0)
+        {
+            PERROR("Error when writing marker (errno %d)", errno);
+            return -1;
+        }
+
+        if (outbuf_hardwrite(xch, ob, fd, &compbuf_len, sizeof(compbuf_len)) < 0)
+        {
+            PERROR("Error when writing compbuf_len (errno %d)", errno);
+            return -1;
+        }
+
+        ob->pos += (size_t) compbuf_len;
+        if (!dobuf && outbuf_flush(xch, ob, fd) < 0)
+        {
+            ERROR("Error when writing compressed chunk");
+            return -1;
+        }
+    } while (rc != 0);
+
+    return 0;
+}
+
 struct time_stats {
     struct timeval wall;
     long long d0_cpu, d1_cpu;
@@ -815,11 +865,35 @@
 
     unsigned long mfn;
 
-    struct outbuf ob;
+    /* Without checkpoint compression, the dirty pages, pfn arrays
+     * and tailbuf (vcpu ctx, shared info page, etc.) are written
+     * directly to outbuf. All of this is done while the domain is
+     * suspended.
+     *
+     * When checkpoint compression is enabled, the dirty pages are
+     * buffered, compressed "after" the domain is resumed and then
+     * written to outbuf. Since tailbuf data are collected while a
+     * domain is suspended, they cannot be directly written to the
+     * outbuf as there is no dirty page data preceding tailbuf.
+     *
+     * So, two output buffers are maintained. Tailbuf data goes into
+     * ob_tailbuf. The dirty pages are compressed after resuming the
+     * domain and written to ob_pagebuf. ob_tailbuf is then appended
+     * to ob_pagebuf and finally flushed out.
+     */
+    struct outbuf ob_pagebuf, ob_tailbuf, *ob = NULL;
     struct save_ctx _ctx;
     struct save_ctx *ctx = &_ctx;
     struct domain_info_context *dinfo = &ctx->dinfo;
 
+    /* Compression context */
+    comp_ctx *compress_ctx = NULL;
+    /* Even if XCFLAGS_CHECKPOINT_COMPRESS is set, we enable compression only
+     * after sending XC_SAVE_ID_ENABLE_COMPRESSION and the tailbuf for
+     * first time.
+     */
+    int compressing = 0;
+
     int completed = 0;
 
     if ( hvm && !callbacks->switch_qemu_logdirty )
@@ -829,7 +903,7 @@
         return 1;
     }
 
-    outbuf_init(xch, &ob, OUTBUF_SIZE);
+    outbuf_init(xch, &ob_pagebuf, OUTBUF_SIZE);
 
     memset(ctx, 0, sizeof(*ctx));
 
@@ -917,6 +991,16 @@
         }
     }
 
+    if ( flags & XCFLAGS_CHECKPOINT_COMPRESS )
+    {
+        if (!(compress_ctx = xc_compression_create_context(xch, dinfo->p2m_size)))
+        {
+            ERROR("Failed to create compression context");
+            goto out;
+        }
+        outbuf_init(xch, &ob_tailbuf, OUTBUF_SIZE/4);
+    }
+
     last_iter = !live;
 
     /* pretend we sent all the pages last iteration */
@@ -1025,9 +1109,11 @@
     }
 
   copypages:
-#define wrexact(fd, buf, len) write_buffer(xch, last_iter, &ob, (fd), (buf), (len))
-#define wruncached(fd, live, buf, len) write_uncached(xch, last_iter, &ob, (fd), (buf), (len))
+#define wrexact(fd, buf, len) write_buffer(xch, last_iter, ob, (fd), (buf), (len))
+#define wruncached(fd, live, buf, len) write_uncached(xch, last_iter, ob, (fd), (buf), (len))
+#define wrcompressed(fd) write_compressed(xch, compress_ctx, last_iter, ob, (fd))
 
+    ob = &ob_pagebuf; /* Holds pfn_types, pages/compressed pages */
     /* Now write out each data page, canonicalising page tables as we go... */
     for ( ; ; )
     {
@@ -1270,7 +1356,7 @@
                 {
                     /* If the page is not a normal data page, write out any
                        run of pages we may have previously accumulated */
-                    if ( run )
+                    if ( !compressing && run )
                     {
                         if ( wruncached(io_fd, live,
                                        (char*)region_base+(PAGE_SIZE*(j-run)),
@@ -1305,7 +1391,41 @@
                         goto out;
                     }
 
-                    if ( wruncached(io_fd, live, page, PAGE_SIZE) != PAGE_SIZE )
+                    if (compressing)
+                    {
+                        int c_err;
+                        /* Mark pagetable page to be sent uncompressed */
+                        c_err = xc_compression_add_page(xch, compress_ctx, page,
+                                                        pfn, 1 /* raw page */);
+                        if (c_err == -2) /* OOB PFN */
+                        {
+                            ERROR("Could not add pagetable page "
+                                  "(pfn:%" PRIpfn "to page buffer\n", pfn);
+                            goto out;
+                        }
+
+                        if (c_err == -1)
+                        {
+                            /*
+                             * We are out of buffer space to hold dirty
+                             * pages. Compress and flush the current buffer
+                             * to make space. This is a corner case, that
+                             * slows down checkpointing as the compression
+                             * happens while domain is suspended. Happens
+                             * seldom and if you find this occurring
+                             * frequently, increase the PAGE_BUFFER_SIZE
+                             * in xc_compression.c.
+                             */
+                            if (wrcompressed(io_fd) < 0)
+                            {
+                                ERROR("Error when writing compressed"
+                                      " data (4b)\n");
+                                goto out;
+                            }
+                        }
+                    }
+                    else if ( wruncached(io_fd, live, page,
+                                         PAGE_SIZE) != PAGE_SIZE )
                     {
                         PERROR("Error when writing to state file (4b)"
                                " (errno %d)", errno);
@@ -1315,7 +1435,34 @@
                 else
                 {
                     /* We have a normal page: accumulate it for writing. */
-                    run++;
+                    if (compressing)
+                    {
+                        int c_err;
+                        /* For checkpoint compression, accumulate the page in
+                         * the page buffer, to be compressed later.
+                         */
+                        c_err = xc_compression_add_page(xch, compress_ctx, spage,
+                                                        pfn, 0 /* not raw page */);
+
+                        if (c_err == -2) /* OOB PFN */
+                        {
+                            ERROR("Could not add page "
+                                  "(pfn:%" PRIpfn "to page buffer\n", pfn);
+                            goto out;
+                        }
+
+                        if (c_err == -1)
+                        {
+                            if (wrcompressed(io_fd) < 0)
+                            {
+                                ERROR("Error when writing compressed"
+                                      " data (4c)\n");
+                                goto out;
+                            }
+                        }
+                    }
+                    else
+                        run++;
                 }
             } /* end of the write out for this batch */
@@ -1423,6 +1570,15 @@
 
     DPRINTF("All memory is saved\n");
 
+    /* After last_iter, buffer the rest of pagebuf & tailbuf data into a
+     * separate output buffer and flush it after the compressed page chunks.
+     */
+    if (compressing)
+    {
+        ob = &ob_tailbuf;
+        ob->pos = 0;
+    }
+
     {
         struct {
             int id;
@@ -1522,6 +1678,25 @@
         }
     }
 
+    /* Enable compression logic on both sides by sending this
+     * one time marker.
+     * NOTE: We could have simplified this procedure by sending
+     * the enable/disable compression flag before the beginning of
+     * the main for loop. But this would break compatibility for
+     * live migration code, with older versions of xen. So we have
+     * to enable it after the last_iter, when the XC_SAVE_ID_*
+     * elements are sent.
+     */
+    if (!compressing && (flags & XCFLAGS_CHECKPOINT_COMPRESS))
+    {
+        i = XC_SAVE_ID_ENABLE_COMPRESSION;
+        if ( wrexact(io_fd, &i, sizeof(int)) )
+        {
+            PERROR("Error when writing enable_compression marker");
+            goto out;
+        }
+    }
+
     /* Zero terminate */
     i = 0;
     if ( wrexact(io_fd, &i, sizeof(int)) )
@@ -1766,14 +1941,38 @@
     if ( !rc && callbacks->postcopy )
         callbacks->postcopy(callbacks->data);
 
+    /* guest has been resumed. Now we can compress data
+     * at our own pace.
+     */
+    if (!rc && compressing)
+    {
+        ob = &ob_pagebuf;
+        if (wrcompressed(io_fd) < 0)
+        {
+            ERROR("Error when writing compressed data, after postcopy\n");
+            rc = 1;
+            goto out;
+        }
+        /* Append the tailbuf data to the main outbuf */
+        if ( wrexact(io_fd, ob_tailbuf.buf, ob_tailbuf.pos) )
+        {
+            rc = 1;
+            PERROR("Error when copying tailbuf into outbuf");
+            goto out;
+        }
+    }
+
     /* Flush last write and discard cache for file. */
-    if ( outbuf_flush(xch, &ob, io_fd) < 0 ) {
+    if ( outbuf_flush(xch, ob, io_fd) < 0 ) {
         PERROR("Error when flushing output buffer");
         rc = 1;
     }
 
     discard_file_cache(xch, io_fd, 1 /* flush */);
 
+    /* Enable compression now, finally */
+    compressing = (flags & XCFLAGS_CHECKPOINT_COMPRESS);
+
     /* checkpoint_cb can spend arbitrarily long in between rounds */
     if (!rc && callbacks->checkpoint &&
         callbacks->checkpoint(callbacks->data) > 0)
@@ -1815,6 +2014,9 @@
             DPRINTF("Warning - couldn't disable qemu log-dirty mode");
     }
 
+    if (compress_ctx)
+        xc_compression_free_context(xch, compress_ctx);
+
     if ( live_shinfo )
         munmap(live_shinfo, PAGE_SIZE);
 
diff -r 4b0907c6a08c -r d27072263a48 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h     Tue Oct 11 12:02:58 2011 +0100
+++ b/tools/libxc/xenctrl.h     Thu Nov 03 12:31:11 2011 -0700
@@ -1906,4 +1906,64 @@
                     int verbose);
 /* Useful for callers who also use libelf. */
 
+/**
+ * Checkpoint Compression
+ */
+typedef struct compression_ctx comp_ctx;
+comp_ctx *xc_compression_create_context(xc_interface *xch,
+                                        unsigned long p2m_size);
+void xc_compression_free_context(xc_interface *xch, comp_ctx *ctx);
+
+/**
+ * Add a page to compression page buffer, to be compressed later.
+ *
+ * returns 0 if the page was successfully added to the page buffer
+ *
+ * returns -1 if there is no space in buffer. In this case, the
+ *  application should call xc_compression_compress_pages to compress
+ *  the buffer (or at least part of it), thereby freeing some space in
+ *  the page buffer.
+ *
+ * returns -2 if the pfn is out of bounds, where the bound is p2m_size
+ *  parameter passed during xc_compression_create_context.
+ */
+int xc_compression_add_page(xc_interface *xch, comp_ctx *ctx, char *page,
+                            unsigned long pfn, int israw);
+
+/**
+ * Delta compress pages in the compression buffer and inserts the
+ * compressed data into the supplied compression buffer compbuf, whose
+ * size is compbuf_size.
+ * After compression, the pages are copied to the internal LRU cache.
+ *
+ * This function compresses as many pages as possible into the
+ * supplied compression buffer. It maintains an internal iterator to
+ * keep track of pages in the input buffer that are yet to be compressed.
+ *
+ * returns -1 if the compression buffer has run out of space.
+ * returns 1 on success.
+ * returns 0 if no more pages are left to be compressed.
+ * When the return value is non-zero, compbuf_len indicates the actual
+ *  amount of data present in compbuf (<=compbuf_size).
+ */
+int xc_compression_compress_pages(xc_interface *xch, comp_ctx *ctx,
+                                  char *compbuf, unsigned long compbuf_size,
+                                  unsigned long *compbuf_len);
+
+/**
+ * Resets the internal page buffer that holds dirty pages before compression.
+ * Also resets the iterators.
+ */
+void xc_compression_reset_pagebuf(xc_interface *xch, comp_ctx *ctx);
+
+/**
+ * Caller must supply the compression buffer (compbuf),
+ * its size (compbuf_size) and a reference to index variable (compbuf_pos)
+ * that is used internally. Each call pulls out one page from the compressed
+ * chunk and copies it to dest.
+ */
+int xc_compression_uncompress_page(xc_interface *xch, char *compbuf,
+                                   unsigned long compbuf_size,
+                                   unsigned long *compbuf_pos, char *dest);
+
 #endif /* XENCTRL_H */
diff -r 4b0907c6a08c -r d27072263a48 tools/libxc/xenguest.h
--- a/tools/libxc/xenguest.h    Tue Oct 11 12:02:58 2011 +0100
+++ b/tools/libxc/xenguest.h    Thu Nov 03 12:31:11 2011 -0700
@@ -27,6 +27,7 @@
 #define XCFLAGS_DEBUG     2
 #define XCFLAGS_HVM       4
 #define XCFLAGS_STDVGA    8
+#define XCFLAGS_CHECKPOINT_COMPRESS 16
 
 #define X86_64_B_SIZE   64
 #define X86_32_B_SIZE   32
diff -r 4b0907c6a08c -r d27072263a48 tools/libxc/xg_save_restore.h
--- a/tools/libxc/xg_save_restore.h     Tue Oct 11 12:02:58 2011 +0100
+++ b/tools/libxc/xg_save_restore.h     Thu Nov 03 12:31:11 2011 -0700
@@ -67,7 +67,7 @@
  *
  * consists of p2m_size bytes comprising an array of xen_pfn_t sized entries.
  *
- * BODY PHASE
+ * BODY PHASE - Format A (for live migration or Remus without compression)
  * ----------
  *
  * A series of chunks with a common header:
@@ -87,6 +87,122 @@
  *
  * If chunk type is 0 then body phase is complete.
  *
+ *
+ * BODY PHASE - Format B (for Remus with compression)
+ * ----------
+ *
+ * A series of chunks with a common header:
+ *   int             : chunk type
+ *
+ * If the chunk type is +ve then chunk contains array of PFNs corresponding
+ * to guest memory and type contains the number of PFNs in the batch:
+ *
+ *     unsigned long[] : PFN array, length == number of pages in batch
+ *                       Each entry consists of XEN_DOMCTL_PFINFO_*
+ *                       in bits 31-28 and the PFN number in bits 27-0.
+ *
+ * If the chunk type is -ve then chunk consists of one of a number of
+ * metadata types. See definitions of XC_SAVE_ID_* below.
+ *
+ * If the chunk type is -ve and equals XC_SAVE_ID_COMPRESSED_DATA, then the
+ * chunk consists of compressed page data, in the following format:
+ *
+ *     unsigned long   : Size of the compressed chunk to follow
+ *     compressed data : variable length data of size indicated above.
+ *                       This chunk consists of compressed page data.
+ *                       The number of pages in one chunk depends on
+ *                       the amount of space available in the sender's
+ *                       output buffer.
+ *
+ * Format of compressed data:
+ *   compressed_data = <deltas>*
+ *   delta           = <marker, run*>
+ *   marker          = (RUNFLAG|SKIPFLAG) bitwise-or RUNLEN [1 byte marker]
+ *   RUNFLAG         = 0
+ *   SKIPFLAG        = 1 << 7
+ *   RUNLEN          = 7-bit unsigned value indicating number of WORDS in
+ *                     the run
+ *   run             = string of bytes of length sizeof(WORD) * RUNLEN
+ *
+ * If marker contains RUNFLAG, then RUNLEN * sizeof(WORD) bytes of data
+ * following the marker is copied into the target page at the appropriate
+ * offset indicated by the offset_ptr.
+ * If marker contains SKIPFLAG, then the offset_ptr is advanced
+ * by RUNLEN * sizeof(WORD).
+ *
+ * If chunk type is 0 then body phase is complete.
+ *
+ * There can be one or more chunks with type XC_SAVE_ID_COMPRESSED_DATA,
+ * containing compressed pages. The compressed chunks are collated to form
+ * one single compressed chunk for the entire iteration. The number of pages
+ * present in this final compressed chunk will be equal to the total number
+ * of valid PFNs specified by the +ve chunks.
+ *
+ * At the sender side, compressed pages are inserted into the output stream
+ * in the same order as they would have been if compression logic was absent.
+ *
+ * Until the last iteration, the BODY is sent in Format A, to maintain live
+ * migration compatibility with receivers of older Xen versions.
+ * At the last iteration, if Remus compression was enabled, the sender sends
+ * a trigger, XC_SAVE_ID_ENABLE_COMPRESSION, to tell the receiver to parse
+ * the BODY in Format B from the next iteration onwards.
+ *
+ * An example sequence of chunks received in Format B:
+ *     +16                        +ve chunk
+ *     unsigned long[16]          PFN array
+ *     +100                       +ve chunk
+ *     unsigned long[100]         PFN array
+ *     +50                        +ve chunk
+ *     unsigned long[50]          PFN array
+ *
+ *     XC_SAVE_ID_COMPRESSED_DATA TAG
+ *       N                        Length of compressed data
+ *       N bytes of DATA          Decompresses to 166 pages
+ *
+ *     XC_SAVE_ID_*               other xc save chunks
+ *     0                          END BODY TAG
+ *
+ * Corner case with checkpoint compression:
+ * At sender side, after pausing the domain, dirty pages are usually
+ * copied out to a temporary buffer. After the domain is resumed,
+ * compression is done and the compressed chunk(s) are sent, followed by
+ * other XC_SAVE_ID_* chunks.
+ * If the temporary buffer gets full while scanning for dirty pages,
+ * the sender stops buffering of dirty pages, compresses the temporary
+ * buffer and sends the compressed data with XC_SAVE_ID_COMPRESSED_DATA.
+ * The sender then resumes the buffering of dirty pages and continues
+ * scanning for the dirty pages.
+ * For example, assume that the temporary buffer can hold 4096 pages and
+ * there are 5000 dirty pages. The following is the sequence of chunks
+ * that the receiver will see:
+ *
+ *     +1024                      +ve chunk
+ *     unsigned long[1024]        PFN array
+ *     +1024                      +ve chunk
+ *     unsigned long[1024]        PFN array
+ *     +1024                      +ve chunk
+ *     unsigned long[1024]        PFN array
+ *     +1024                      +ve chunk
+ *     unsigned long[1024]        PFN array
+ *
+ *     XC_SAVE_ID_COMPRESSED_DATA TAG
+ *       N                        Length of compressed data
+ *       N bytes of DATA          Decompresses to 4096 pages
+ *
+ *     +4                         +ve chunk
+ *     unsigned long[4]           PFN array
+ *
+ *     XC_SAVE_ID_COMPRESSED_DATA TAG
+ *       M                        Length of compressed data
+ *       M bytes of DATA          Decompresses to 4 pages
+ *
+ *     XC_SAVE_ID_*               other xc save chunks
+ *     0                          END BODY TAG
+ *
+ * In other words, XC_SAVE_ID_COMPRESSED_DATA can be interleaved with
+ * +ve chunks arbitrarily. But at the receiver end, the following condition
+ * always holds true until the end of BODY PHASE:
+ *     num(PFN entries +ve chunks) >= num(pages received in compressed form)
+ *
  * TAIL PHASE
  * ----------
  *
@@ -134,6 +250,8 @@
 #define XC_SAVE_ID_HVM_CONSOLE_PFN    -8 /* (HVM-only) */
 #define XC_SAVE_ID_LAST_CHECKPOINT    -9 /* Commit to restoring after completion of current iteration. */
 #define XC_SAVE_ID_HVM_ACPI_IOPORTS_LOCATION -10
+#define XC_SAVE_ID_COMPRESSED_DATA    -11 /* Marker to indicate arrival of compressed data */
+#define XC_SAVE_ID_ENABLE_COMPRESSION -12 /* Marker to enable compression logic at receiver side */
 
 /*
 ** We process save/restore/migrate in batches of pages; the below
rshriram@cs.ubc.ca
2011-Nov-03 20:05 UTC
[Xen-devel] [PATCH 2 of 2 V5] remus: command line switch to enable/disable checkpoint compression
# HG changeset patch
# User Shriram Rajagopalan <rshriram@cs.ubc.ca>
# Date 1320348758 25200
# Node ID b6ed8e28ae3fb14aeb978fdf626b682ed24957ba
# Parent  d27072263a483fbf66456722fbd84967fe606602
remus: command line switch to enable/disable checkpoint compression

Add a command line switch to the remus script that allows the user to
enable or disable checkpoint compression in the libxc code.

Signed-off-by: Shriram Rajagopalan <rshriram@cs.ubc.ca>

diff -r d27072263a48 -r b6ed8e28ae3f tools/python/xen/lowlevel/checkpoint/checkpoint.c
--- a/tools/python/xen/lowlevel/checkpoint/checkpoint.c Thu Nov 03 12:31:11 2011 -0700
+++ b/tools/python/xen/lowlevel/checkpoint/checkpoint.c Thu Nov 03 12:32:38 2011 -0700
@@ -104,13 +104,14 @@
   PyObject* postcopy_cb = NULL;
   PyObject* checkpoint_cb = NULL;
   unsigned int interval = 0;
+  unsigned int flags = 0;
 
   int fd;
   struct save_callbacks callbacks;
   int rc;
 
-  if (!PyArg_ParseTuple(args, "O|OOOI", &iofile, &suspend_cb, &postcopy_cb,
-                        &checkpoint_cb, &interval))
+  if (!PyArg_ParseTuple(args, "O|OOOII", &iofile, &suspend_cb, &postcopy_cb,
+                        &checkpoint_cb, &interval, &flags))
     return NULL;
 
   self->interval = interval;
@@ -160,7 +161,7 @@
   callbacks.data = self;
 
   self->threadstate = PyEval_SaveThread();
-  rc = checkpoint_start(&self->cps, fd, &callbacks);
+  rc = checkpoint_start(&self->cps, fd, &callbacks, flags);
   PyEval_RestoreThread(self->threadstate);
 
   if (rc < 0) {
diff -r d27072263a48 -r b6ed8e28ae3f tools/python/xen/lowlevel/checkpoint/checkpoint.h
--- a/tools/python/xen/lowlevel/checkpoint/checkpoint.h Thu Nov 03 12:31:11 2011 -0700
+++ b/tools/python/xen/lowlevel/checkpoint/checkpoint.h Thu Nov 03 12:32:38 2011 -0700
@@ -40,13 +40,15 @@
   timer_t timer;
 } checkpoint_state;
 
+#define CHECKPOINT_FLAGS_COMPRESSION 1
 char* checkpoint_error(checkpoint_state* s);
 
 void checkpoint_init(checkpoint_state* s);
 int checkpoint_open(checkpoint_state* s, unsigned int domid);
 void checkpoint_close(checkpoint_state* s);
 int checkpoint_start(checkpoint_state* s, int fd,
-                     struct save_callbacks* callbacks);
+                     struct save_callbacks* callbacks,
+                     unsigned int remus_flags);
 int checkpoint_suspend(checkpoint_state* s);
 int checkpoint_resume(checkpoint_state* s);
 int checkpoint_postflush(checkpoint_state* s);
diff -r d27072263a48 -r b6ed8e28ae3f tools/python/xen/lowlevel/checkpoint/libcheckpoint.c
--- a/tools/python/xen/lowlevel/checkpoint/libcheckpoint.c      Thu Nov 03 12:31:11 2011 -0700
+++ b/tools/python/xen/lowlevel/checkpoint/libcheckpoint.c      Thu Nov 03 12:32:38 2011 -0700
@@ -170,7 +170,8 @@
 }
 
 int checkpoint_start(checkpoint_state* s, int fd,
-                     struct save_callbacks* callbacks)
+                     struct save_callbacks* callbacks,
+                     unsigned int remus_flags)
 {
   int hvm, rc;
   int flags = XCFLAGS_LIVE;
@@ -188,6 +189,8 @@
     if (switch_qemu_logdirty(s, 1))
       return -1;
   }
+  if (remus_flags & CHECKPOINT_FLAGS_COMPRESSION)
+    flags |= XCFLAGS_CHECKPOINT_COMPRESS;
 
   callbacks->switch_qemu_logdirty = noop_switch_logdirty;
 
diff -r d27072263a48 -r b6ed8e28ae3f tools/python/xen/remus/save.py
--- a/tools/python/xen/remus/save.py    Thu Nov 03 12:31:11 2011 -0700
+++ b/tools/python/xen/remus/save.py    Thu Nov 03 12:32:38 2011 -0700
@@ -133,7 +133,7 @@
 
 class Saver(object):
     def __init__(self, domid, fd, suspendcb=None, resumecb=None,
-                 checkpointcb=None, interval=0):
+                 checkpointcb=None, interval=0, flags=0):
         """Create a Saver object for taking guest checkpoints.
         domid:        name, number or UUID of a running domain
         fd:           a stream to which checkpoint data will be written.
@@ -141,12 +141,14 @@
         resumecb:     callback invoked before guest resumes
         checkpointcb: callback invoked when a checkpoint is complete. Return
                       True to take another checkpoint, or False to stop.
+        flags:        Remus flags to be passed to xc_domain_save
         """
         self.fd = fd
         self.suspendcb = suspendcb
         self.resumecb = resumecb
         self.checkpointcb = checkpointcb
         self.interval = interval
+        self.flags = flags
 
         self.vm = vm.VM(domid)
 
@@ -164,7 +166,8 @@
         try:
             self.checkpointer.open(self.vm.domid)
             self.checkpointer.start(self.fd, self.suspendcb, self.resumecb,
-                                    self.checkpointcb, self.interval)
+                                    self.checkpointcb, self.interval,
+                                    self.flags)
         except xen.lowlevel.checkpoint.error, e:
             raise CheckpointError(e)
         finally:
diff -r d27072263a48 -r b6ed8e28ae3f tools/remus/remus
--- a/tools/remus/remus Thu Nov 03 12:31:11 2011 -0700
+++ b/tools/remus/remus Thu Nov 03 12:32:38 2011 -0700
@@ -16,6 +16,9 @@
 class CfgException(Exception): pass
 
 class Cfg(object):
+
+    REMUS_FLAGS_COMPRESSION = 1
+
     def __init__(self):
         # must be set
         self.domid = 0
@@ -25,6 +28,7 @@
         self.port = XendOptions.instance().get_xend_relocation_port()
         self.interval = 200
         self.netbuffer = True
+        self.flags = self.REMUS_FLAGS_COMPRESSION
         self.timer = False
 
         parser = optparse.OptionParser()
@@ -38,6 +42,8 @@
                           help='replicate to /dev/null (no disk checkpoints, only memory & net buffering)')
         parser.add_option('', '--no-net', dest='nonet', action='store_true',
                           help='run without net buffering (benchmark option)')
+        parser.add_option('', '--no-compression', dest='nocompress', action='store_true',
+                          help='run without checkpoint compression')
         parser.add_option('', '--timer', dest='timer', action='store_true',
                           help='force pause at checkpoint interval (experimental)')
         self.parser = parser
@@ -56,6 +62,8 @@
             self.nullremus = True
         if opts.nonet:
             self.netbuffer = False
+        if opts.nocompress:
+            self.flags &= ~self.REMUS_FLAGS_COMPRESSION
         if opts.timer:
             self.timer = True
 
@@ -190,7 +198,7 @@
     rc = 0
 
     checkpointer = save.Saver(cfg.domid, fd, postsuspend, preresume, commit,
                               interval)
-                              interval)
+                              interval, cfg.flags)
 
     try:
         checkpointer.start()
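[Editorial aside -- not part of the patch. With this change applied,
checkpoint compression is on by default and the new switch turns it off.
Assuming the usual remus invocation of a domain plus a backup host (the
names below are placeholders):

    remus --no-compression mydomain backup-host

The other options shown in the diff, such as --no-net and --timer, compose
with it as before.]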
Ian Jackson
2011-Nov-04 12:14 UTC
[Xen-devel] Re: [PATCH 0 of 2 V5] libxc: checkpoint compression
rshriram@cs.ubc.ca writes ("[PATCH 0 of 2 V5] libxc: checkpoint compression"):
> This patch series adds checkpoint compression functionality, while
> running under Remus.
...
> Changes since last version:
> 1. use posix_memalign only on linux platforms and switch to normal
>    malloc for the rest. stubdom compiles successfully.

Looking at this in more detail, I don't understand why you're using
posix_memalign rather than just malloc, anyway. If it's necessary to
use posix_memalign on Linux, why is it OK to use malloc on other
platforms?

Also this #ifdef is quite ugly.

Ian.
Shriram Rajagopalan
2011-Nov-04 19:21 UTC
[Xen-devel] Re: [PATCH 0 of 2 V5] libxc: checkpoint compression
Why posix_memalign?

The compression code involves a lot of memcpys at 4K granularity (dirty
pages copied from domU's memory to internal cache/page buffers, etc.). I
would like to keep these memcpys page aligned for purposes of speed. The
source pages (from domU) are already aligned; the destination pages
allocated by the compression code need to be page aligned.

Correct me if I am wrong: mallocing a huge buffer for this purpose is not
optimal. malloc aligns allocations on 16-byte (or 8-byte) granularity, but
if a 4K region straddles two physical memory frames, then the memcpy is
going to be suboptimal. OTOH, memalign ensures that we are dealing with
just 2 memory frames, as opposed to 3 (possible) frames with malloc.

A simple 8MB memcpy test shows an average of 500us overhead for malloc
based allocation compared to posix_memalign based allocation. While this
might seem low, the checkpoints are being taken at high frequency (every
20ms, for instance).

It is not okay to use malloc on other platforms. I simply don't have
access to other platforms to test their equivalent versions, short of
using something like the qemu_memalign function.

I am open to suggestions :)

shriram

On Fri, Nov 4, 2011 at 5:14 AM, Ian Jackson <Ian.Jackson@eu.citrix.com> wrote:
> rshriram@cs.ubc.ca writes ("[PATCH 0 of 2 V5] libxc: checkpoint compression"):
> > This patch series adds checkpoint compression functionality, while
> > running under Remus.
> [...]
>
> Looking at this in more detail, I don't understand why you're using
> posix_memalign rather than just malloc, anyway. If it's necessary to
> use posix_memalign on Linux, why is it OK to use malloc on other
> platforms?
>
> Also this #ifdef is quite ugly.
>
> Ian.
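[Editorial aside -- a rough, Linux-only sketch of the kind of 8MB memcpy
comparison described above; it is not the exact benchmark behind the 500us
figure, and a real measurement would repeat the copies and average. All
names here are illustrative.]

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#define SZ   (8 << 20)   /* 8MB */
#define PAGE 4096

static long long usecs(void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec * 1000000LL + tv.tv_usec;
}

int main(void)
{
    char *src = malloc(SZ), *dst_m = malloc(SZ), *dst_a = NULL;
    long long t0;

    if (!src || !dst_m || posix_memalign((void **)&dst_a, PAGE, SZ))
        return 1;
    memset(src, 0xaa, SZ);  /* touch pages so both copies are warm */

    t0 = usecs();
    memcpy(dst_m, src, SZ);         /* destination only 8/16-byte aligned */
    printf("malloc dest:         %lld us\n", usecs() - t0);

    t0 = usecs();
    memcpy(dst_a, src, SZ);         /* page-aligned destination */
    printf("posix_memalign dest: %lld us\n", usecs() - t0);

    free(src); free(dst_m); free(dst_a);
    return 0;
}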
Shriram Rajagopalan
2011-Nov-08 16:51 UTC
[Xen-devel] Re: [PATCH 0 of 2 V5] libxc: checkpoint compression
On Fri, Nov 4, 2011 at 12:21 PM, Shriram Rajagopalan <rshriram@cs.ubc.ca> wrote:
> Why posix_memalign?
>
> The compression code involves a lot of memcpys at 4K granularity (dirty
> pages copied from domU's memory to internal cache/page buffers, etc.). I
> would like to keep these memcpys page aligned for purposes of speed. The
> source pages (from domU) are already aligned; the destination pages
> allocated by the compression code need to be page aligned.
>
> [...]
>
> It is not okay to use malloc on other platforms. I simply don't have
> access to other platforms to test their equivalent versions, short of
> using something like the qemu_memalign function.
>
> I am open to suggestions :)
>
> shriram

Ping.
Ian Campbell
2011-Nov-08 17:02 UTC
Re: [Xen-devel] Re: [PATCH 0 of 2 V5] libxc: checkpoint compression
On Tue, 2011-11-08 at 16:51 +0000, Shriram Rajagopalan wrote:
> Why posix_memalign?
>
> [...]
>
> It is not okay to use malloc on other platforms. I simply don't have
> access to other platforms to test their equivalent versions, short of
> using something like the qemu_memalign function.
>
> I am open to suggestions :)

This is due to minios (aka stubdoms) not having posix_memalign, right?

minios (or rather newlib) does appear to have memalign though, which if
true would also work, right? You could potentially also implement
posix_memalign in terms of memalign on minios and avoid the ifdef.

Ian.
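[Editorial aside -- one possible shape for the shim Ian suggests: a
posix_memalign built on memalign. This is an untested sketch; whether
newlib/minios exposes memalign via <malloc.h> and the exact error-code
semantics are assumptions, not verified against the minios tree.]

#include <errno.h>
#include <stddef.h>
#include <malloc.h>   /* memalign, assumed available on minios/newlib */

int posix_memalign(void **memptr, size_t alignment, size_t size)
{
    void *p;

    /* alignment must be a power of two and a multiple of sizeof(void *) */
    if ((alignment % sizeof(void *)) || (alignment & (alignment - 1)))
        return EINVAL;

    p = memalign(alignment, size);
    if (!p)
        return ENOMEM;

    *memptr = p;
    return 0;
}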
Shriram Rajagopalan
2011-Nov-08 17:13 UTC
Re: [Xen-devel] Re: [PATCH 0 of 2 V5] libxc: checkpoint compression
On Tue, Nov 8, 2011 at 9:02 AM, Ian Campbell <Ian.Campbell@citrix.com> wrote:
> This is due to minios (aka stubdoms) not having posix_memalign, right?
>
> minios (or rather newlib) does appear to have memalign though, which if
> true would also work, right? You could potentially also implement
> posix_memalign in terms of memalign on minios and avoid the ifdef.

Sounds good. In that case, can I just post a patch to minios implementing
posix_memalign, and will you then directly take the previous version V4 of
this patch series (the one without #ifdefs)?

thanks
shriram
Ian Campbell
2011-Nov-08 17:16 UTC
Re: [Xen-devel] Re: [PATCH 0 of 2 V5] libxc: checkpoint compression
On Tue, 2011-11-08 at 17:13 +0000, Shriram Rajagopalan wrote:
> Sounds good. In that case, can I just post a patch to minios
> implementing posix_memalign, and will you then directly take the
> previous version V4 of this patch series (the one without #ifdefs)?

Well, *I* won't be taking any version of the patch, but that sounds like
a sane plan to me, assuming V4 builds after your minios patch.
Shriram Rajagopalan
2011-Nov-08 17:20 UTC
Re: [Xen-devel] Re: [PATCH 0 of 2 V5] libxc: checkpoint compression
On Tue, Nov 8, 2011 at 9:16 AM, Ian Campbell <Ian.Campbell@citrix.com> wrote:
> On Tue, 2011-11-08 at 17:13 +0000, Shriram Rajagopalan wrote:
> > Sounds good. In that case, can I just post a patch to minios
> > implementing posix_memalign? Will you then directly take the
> > previous version V4 of this patch series (the one without #ifdefs)?
>
> Well, *I* won't be taking any version of the patch but that sounds
> like a sane plan to me, assuming V4 builds after your minios patch.

Oops, sorry. I was referring to IanJ.
Shriram Rajagopalan
2011-Nov-08 19:41 UTC
Re: [Xen-devel] Re: [PATCH 0 of 2 V5] libxc: checkpoint compression
On Tue, Nov 8, 2011 at 9:20 AM, Shriram Rajagopalan <rshriram@cs.ubc.ca> wrote:
> On Tue, Nov 8, 2011 at 9:16 AM, Ian Campbell <Ian.Campbell@citrix.com> wrote:
> > Well, *I* won't be taking any version of the patch but that sounds
> > like a sane plan to me, assuming V4 builds after your minios patch.
>
> Oops, sorry. I was referring to IanJ.

Just realized I forgot to state why I had to use the __linux__ #ifdef:

a. minios lacks posix_memalign.

b. I looked it up online: Solaris has no posix_memalign. I am not sure
   about NetBSD.

c. In tools/libxc/:
   - xc_solaris.c uses memalign
   - xc_netbsd.c uses valloc
   - xc_minios.c uses memalign
   - xc_linux_osdep.c uses posix_memalign!

Further, the posix_memalign manpage states that "posix_memalign()
verifies that alignment matches the requirements detailed above.
memalign() may not check that the boundary argument is correct."

This is reinforced by newlib-1.16.0's comments in mallocr.c
(newlib-1.16.0/newlib/libc/stdlib/): "The alignment argument must be a
power of two. This property is not checked by memalign, so misuse may
result in random runtime errors."
Judging by all this mess, I thought I was better off doing an #ifdef
__linux__ and resorting to plain malloc on the other platforms.

One alternative would be to add back just the xc_memalign function that
was removed by c/s 22520:

-void *xc_memalign(size_t alignment, size_t size)
-{
-#if defined(_POSIX_C_SOURCE) && !defined(__sun__)
-    int ret;
-    void *ptr;
-    ret = posix_memalign(&ptr, alignment, size);
-    if (ret != 0)
-        return NULL;
-    return ptr;
-#elif defined(__NetBSD__) || defined(__OpenBSD__)
-    return valloc(size);
-#else
-    return memalign(alignment, size);
-#endif
-}
-

shriram
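[Editorial aside: if xc_memalign were restored, the compression code's
buffer allocation would presumably collapse to a single call, roughly as
below. This is a hypothetical call site, reusing the XC_PAGE_SIZE and
DELTA_CACHE_SIZE constants from the patch; the real xc_compression.c code
may differ.]

/* Hypothetical call site: allocating the page-aligned delta cache
 * through a restored xc_memalign instead of an #ifdef'd allocator. */
char *cache = xc_memalign(XC_PAGE_SIZE, DELTA_CACHE_SIZE);
if (cache == NULL)
    return NULL;  /* allocation failure propagated to the caller */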
Ian Campbell
2011-Nov-08 19:56 UTC
Re: [Xen-devel] Re: [PATCH 0 of 2 V5] libxc: checkpoint compression
On Tue, 2011-11-08 at 19:41 +0000, Shriram Rajagopalan wrote:
> One alternative would be to add back just the xc_memalign function
> that was removed by c/s 22520.

22520 is 22520:6df91a11dcb0 "libxc: remove comment obsoleted by
addition of hypercall bounce buffer." here. Did you mean
22312:9fad5e5e2fc1? (Remember that the cset number is not globally
unique/stable; only the longer node hash is.)

I think putting xc_memalign back would be fine, better than the other
options we've discussed even. I'd be tempted to do it as separate
functions in tools/libxc/xc_{minios,netbsd,linux}.c rather than using
#ifdef though.

(For my money you can ignore Solaris; it's been unmaintained for long
enough that I bet it doesn't even build now, and there's no one we can
ask to even build-test it.)
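[Editorial aside: sketched out, the per-file arrangement suggested above
might look as follows. The function bodies mirror the removed
xc_memalign; the file placement is illustrative, not an actual
changeset.]

/* tools/libxc/xc_linux.c (sketch) */
#include <stdlib.h>

void *xc_memalign(size_t alignment, size_t size)
{
    void *ptr;

    if (posix_memalign(&ptr, alignment, size) != 0)
        return NULL;
    return ptr;
}

/* tools/libxc/xc_netbsd.c (sketch) */
#include <stdlib.h>

void *xc_memalign(size_t alignment, size_t size)
{
    return valloc(size);  /* valloc returns page-aligned memory */
}

/* tools/libxc/xc_minios.c (sketch) */
#include <malloc.h>

void *xc_memalign(size_t alignment, size_t size)
{
    return memalign(alignment, size);  /* newlib: alignment unchecked */
}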