Richard W.M. Jones
2016-May-16  18:20 UTC
[Libguestfs] [PATCH] launch: direct: Add DAX root filesystem support.
Allow the appliance / root filesystem to be placed on a virtual NVDIMM
and accessed directly by the guest kernel (DAX).
This requires corresponding changes in supermin.
---
 src/guestfs-internal.h |  1 +
 src/launch-direct.c    | 68 ++++++++++++++++++++++++++++++++++++++++----------
 src/launch.c           |  8 +++++-
 3 files changed, 63 insertions(+), 14 deletions(-)
diff --git a/src/guestfs-internal.h b/src/guestfs-internal.h
index d325f50..3655219 100644
--- a/src/guestfs-internal.h
+++ b/src/guestfs-internal.h
@@ -782,6 +782,7 @@ extern int64_t guestfs_int_timeval_diff (const struct timeval *x, const struct t
 extern void guestfs_int_launch_send_progress (guestfs_h *g, int perdozen);
 extern char *guestfs_int_appliance_command_line (guestfs_h *g, const char *appliance_dev, int flags);
 #define APPLIANCE_COMMAND_LINE_IS_TCG 1
+#define APPLIANCE_COMMAND_LINE_USE_ACPI 2
 const char *guestfs_int_get_cpu_model (int kvm);
 int guestfs_int_create_socketname (guestfs_h *g, const char *filename, char (*sockname)[UNIX_PATH_MAX]);
 extern void guestfs_int_register_backend (const char *name, const struct backend_ops *);
diff --git a/src/launch-direct.c b/src/launch-direct.c
index 01b7e07..a005bda 100644
--- a/src/launch-direct.c
+++ b/src/launch-direct.c
@@ -234,6 +234,7 @@ launch_direct (guestfs_h *g, void *datav, const char *arg)
   struct hv_param *hp;
   bool has_kvm;
   int force_tcg;
+  bool dax;
   const char *cpu_model;
 
   /* At present you must add drives before starting the appliance.  In
@@ -371,15 +372,29 @@ launch_direct (guestfs_h *g, void *datav, const char *arg)
     warning (g, "qemu debugging is enabled, connect gdb to tcp::1234 to begin");
   }
 
+  /* Can we use DAX? */
+#ifdef __x86_64__
+  dax = guestfs_int_qemu_version_ge (data->qemu_data, 2, 6) &&
+    guestfs_int_qemu_supports_device (g, data->qemu_data, "nvdimm");
+#else
+  dax = false;
+#endif
+
   ADD_CMDLINE ("-machine");
   ADD_CMDLINE_PRINTF (
 #ifdef MACHINE_TYPE
                       MACHINE_TYPE ","
+                      "%s"
+#elif __x86_64__
+                      "pc,%s"
+#else
+                      "%s"
 #endif
 #ifdef __aarch64__
                       "gic-version=host,"
 #endif
                       "accel=%s",
+                      dax ? "nvdimm," : "",
                       !force_tcg ? "kvm:tcg" : "tcg");
 
   cpu_model = guestfs_int_get_cpu_model (has_kvm && !force_tcg);
@@ -394,7 +409,10 @@ launch_direct (guestfs_h *g, void *datav, const char *arg)
   }
 
   ADD_CMDLINE ("-m");
-  ADD_CMDLINE_PRINTF ("%d", g->memsize);
+  if (dax)
+    ADD_CMDLINE_PRINTF ("%d,maxmem=32G,slots=32", g->memsize);
+  else
+    ADD_CMDLINE_PRINTF ("%d", g->memsize);
 
   /* Force exit instead of reboot on panic */
   ADD_CMDLINE ("-no-reboot");
@@ -541,21 +559,43 @@ launch_direct (guestfs_h *g, void *datav, const char *arg)
 
   /* Add the ext2 appliance drive (after all the drives). */
   if (has_appliance_drive) {
-    ADD_CMDLINE ("-drive");
-    ADD_CMDLINE_PRINTF ("file=%s,snapshot=on,id=appliance,"
-                        "cache=unsafe,if=none,format=raw",
-                        appliance);
+    if (dax) {
+      struct stat statbuf;
 
-    if (virtio_scsi) {
-      ADD_CMDLINE ("-device");
-      ADD_CMDLINE ("scsi-hd,drive=appliance");
-    }
-    else {
+      if (stat (appliance, &statbuf) == -1) {
+        perrorf (g, "stat: %s", appliance);
+        goto cleanup0;
+      }
+
+      ADD_CMDLINE ("-object");
+      /* share=off corresponds to mmap MAP_PRIVATE inside qemu, so
+       * this should not affect the underlying file.  IOW parallel
+       * access should be fine.
+       */
+      ADD_CMDLINE_PRINTF ("memory-backend-file,id=mem1,share=off,"
+                          "mem-path=%s,size=%" PRIu64 "b",
+                          appliance, (uint64_t) statbuf.st_size);
       ADD_CMDLINE ("-device");
-      ADD_CMDLINE (VIRTIO_BLK ",drive=appliance");
-    }
+      ADD_CMDLINE ("nvdimm,memdev=mem1,id=nv1");
 
-    appliance_dev = make_appliance_dev (g, virtio_scsi);
+      appliance_dev = safe_strdup (g, "/dev/pmem0");
+    } else {
+      ADD_CMDLINE ("-drive");
+      ADD_CMDLINE_PRINTF ("file=%s,snapshot=on,id=appliance,"
+                          "cache=unsafe,if=none,format=raw",
+                          appliance);
+
+      if (virtio_scsi) {
+        ADD_CMDLINE ("-device");
+        ADD_CMDLINE ("scsi-hd,drive=appliance");
+      }
+      else {
+        ADD_CMDLINE ("-device");
+        ADD_CMDLINE (VIRTIO_BLK ",drive=appliance");
+      }
+
+      appliance_dev = make_appliance_dev (g, virtio_scsi);
+    }
   }
 
   /* Create the virtio serial bus. */
@@ -597,6 +637,8 @@ launch_direct (guestfs_h *g, void *datav, const char *arg)
   flags = 0;
   if (!has_kvm || force_tcg)
     flags |= APPLIANCE_COMMAND_LINE_IS_TCG;
+  if (dax)
+    flags |= APPLIANCE_COMMAND_LINE_USE_ACPI;
   ADD_CMDLINE_STRING_NODUP
     (guestfs_int_appliance_command_line (g, appliance_dev, flags));
 
diff --git a/src/launch.c b/src/launch.c
index 72a8b29..49f0455 100644
--- a/src/launch.c
+++ b/src/launch.c
@@ -318,6 +318,10 @@ guestfs_impl_config (guestfs_h *g,
  * If we are launching a qemu TCG guest (ie. KVM is known to be
  * disabled or unavailable).  If you don't know, don't pass this flag.
  *
+ * =item C<APPLIANCE_COMMAND_LINE_USE_ACPI>
+ *
+ * Use ACPI in the appliance.  Normally disabled because it is slow.
+ *
  * =back
  *
  * Note that this function returns a newly allocated buffer which must
@@ -331,6 +335,7 @@ guestfs_int_appliance_command_line (guestfs_h *g, const char *appliance_dev,
   char *term = getenv ("TERM");
   char *ret;
   bool tcg = flags & APPLIANCE_COMMAND_LINE_IS_TCG;
+  bool use_acpi = flags & APPLIANCE_COMMAND_LINE_USE_ACPI;
   char lpj_s[64] = "";
 
   if (appliance_dev)
@@ -367,7 +372,7 @@ guestfs_int_appliance_command_line (guestfs_h *g, const char *appliance_dev,
      " udev.event-timeout=6000" /* for newer udevd */
      " no_timer_check"  /* fix for RHBZ#502058 */
      "%s"               /* lpj */
-     " acpi=off"        /* ACPI is slow - 150-200ms extra on my laptop */
+     "%s"               /* acpi=off: ACPI is slow, 150-200ms on my laptop */
      " printk.time=1"   /* display timestamp before kernel messages */
      " cgroup_disable=memory"   /* saves us about 5 MB of RAM */
      " usbcore.nousb"           /* disable USB, only saves about 1ms */
@@ -386,6 +391,7 @@ guestfs_int_appliance_command_line (guestfs_h *g, const char *appliance_dev,
      g->memsize,
 #endif
      lpj_s,
+     !use_acpi ? " acpi=off" : "",
      root,
      g->selinux ? "selinux=1 enforcing=0" : "selinux=0",
      g->verbose ? "guestfs_verbose=1" : "quiet",
-- 
2.7.4
Seemingly Similar Threads
- [PATCH] launch: rework handling of --enable-valgrind-daemon
- [PATCH] lib: direct: Remove support for virtio-blk as the default.
- [PATCH 1/2] launch: direct: Use a single -machine [type, ]accel=... option.
- [PATCH 1/2] launch: Rationalize how we construct the Linux kernel command line.
- [PATCH v2 1/9] build: Remove ./configure --enable-valgrind-daemon.
